[Devel] Re: [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids
Oren Laadan
orenl at cs.columbia.edu
Wed Nov 25 10:46:31 PST 2009
Ok, will add this to user-cr (v19-rc2).
BTW, where is the original nsexec source maintained?
Oren.
Serge E. Hallyn wrote:
> One of the concerns with clone-with-pids is whether the
> stack handling is all correct and robust enough to withstand
> real usage. Little testcases playing with pid values are
> also necessary, but can't replace really using clone-with-pids
> to start a shell from which to keep working.
>
> This patch tweaks the old ns_exec.c namespace manipulation
> program to add a -z option to specify a pid. So you can:
>
> nsexeccwp -cmp /bin/bash # start a shell in a new pidns+mntns
> mount -t proc proc /proc # mount private /proc
> echo $$
> 1
> nsexeccwp -z 999 /bin/bash # start a shell with pid 999
> echo $$
> 999
>
> Signed-off-by: Serge E. Hallyn <serue at us.ibm.com>
> ---
> Makefile | 5 +-
> clone.h | 54 +++++++++
> nsexeccwp.c | 352 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 410 insertions(+), 1 deletions(-)
> create mode 100644 clone.h
> create mode 100644 nsexeccwp.c
>
> diff --git a/Makefile b/Makefile
> index 181cc1c..32a6893 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -20,7 +20,7 @@ CFLAGS += -g $(WARNS) $(CKPT_INCLUDE) $(DEBUG)
> # install dir
> INSTALL_DIR = /bin
>
> -PROGS = checkpoint restart ckptinfo
> +PROGS = checkpoint restart ckptinfo nsexeccwp
>
> # other cleanup
> OTHER = ckptinfo_types.c
> @@ -39,11 +39,14 @@ restart: CFLAGS += -D__REENTRANT -pthread
> ifneq ($(SUBARCH),)
> restart: clone_$(SUBARCH).o
> restart: CFLAGS += -DARCH_HAS_CLONE_WITH_PID
> +nsexeccwp: clone_$(SUBARCH).o
> +nsexeccwp: CFLAGS += -DARCH_HAS_CLONE_WITH_PID
> endif
>
> # on powerpc, need also assembly file
> ifeq ($(SUBARCH),ppc)
> restart: clone_$(SUBARCH)_.o
> +nsexeccwp: clone_$(SUBARCH)_.o
> endif
>
> # ckptinfo dependencies
> diff --git a/clone.h b/clone.h
> new file mode 100644
> index 0000000..3569a45
> --- /dev/null
> +++ b/clone.h
> @@ -0,0 +1,54 @@
> +#ifndef CLONE_H
> +#define CLONE_H
> +/*
> + * Copyright (C) 2007 IBM Corporation
> + *
> + * Author: Cedric Le Goater <clg at fr.ibm.com>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License as
> + * published by the Free Software Foundation, version 2 of the
> + * License.
> + *
> + */
> +#include <sys/syscall.h>
> +
> +#ifndef HAVE_UNSHARE
> +
> +#if __i386__
> +# define __NR_unshare 310
> +#elif __x86_64__
> +# define __NR_unshare 272
> +#elif __ia64__
> +# define __NR_unshare 1296
> +#elif __s390x__
> +# define __NR_unshare 303
> +#elif __powerpc__
> +# define __NR_unshare 282
> +#else
> +# error "Architecture not supported"
> +#endif
> +
> +#endif /* HAVE_UNSHARE */
> +
> +#ifndef CLONE_NEWUTS
> +#define CLONE_NEWUTS 0x04000000
> +#endif
> +
> +#ifndef CLONE_NEWIPC
> +#define CLONE_NEWIPC 0x08000000
> +#endif
> +
> +#ifndef CLONE_NEWUSER
> +#define CLONE_NEWUSER 0x10000000
> +#endif
> +
> +#ifndef CLONE_NEWPID
> +#define CLONE_NEWPID 0x20000000
> +#endif
> +
> +#ifndef CLONE_NEWNET
> +#define CLONE_NEWNET 0x40000000
> +#endif
> +
> +#endif /* CLONE_H */
> diff --git a/nsexeccwp.c b/nsexeccwp.c
> new file mode 100644
> index 0000000..f14b8b0
> --- /dev/null
> +++ b/nsexeccwp.c
> @@ -0,0 +1,352 @@
> +/*
> + * Copyright 2008,2009 IBM Corp.
> + */
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <sched.h>
> +#include <sys/syscall.h>
> +#include <unistd.h>
> +#include <signal.h>
> +#include <string.h>
> +#include <errno.h>
> +#include <libgen.h>
> +#include <fcntl.h>
> +#include <sys/stat.h>
> +#include <sys/types.h>
> +#include <sys/wait.h>
> +
> +#include "clone.h"
> +
> +struct pid_set {
> + int num_pids;
> + pid_t *pids;
> +};
> +
> +typedef unsigned long long u64;
> +typedef unsigned int u32;
> +typedef int pid_t;
> +struct clone_args {
> + u64 clone_flags_high;
> +
> + u64 child_stack_base;
> + u64 child_stack_size;
> +
> + u64 parent_tid_ptr;
> + u64 child_tid_ptr;
> +
> + u32 nr_pids;
> +
> + u32 reserved0;
> + u64 reserved1;
> +};
> +extern int clone_with_pids(int (*fn)(void *), void *child_stack,
> + unsigned long stack_size, unsigned long flags,
> + struct pid_set *target_pids, void *arg);
> +
> +extern pid_t getpgid(pid_t pid);
> +extern pid_t getsid(pid_t pid);
> +
> +static const char* procname;
> +
> +static void usage(const char *name)
> +{
> + printf("usage: %s [-h] [-c] [-mnuUip] [-P <pid-file>]"
> + "[command [arg ..]]\n", name);
> + printf("\n");
> + printf(" -h this message\n");
> + printf("\n");
> + printf(" -z <pid> use clone_with_pids and specify chosen pid\n");
> + printf(" Note that -z and -p are not compatible\n");
> + printf(" -c use 'clone' rather than 'unshare' system call\n");
> + printf(" -g launch in new cgroup\n");
> + printf(" -m mount namespace\n");
> + printf(" -n network namespace\n");
> + printf(" -u utsname namespace\n");
> + printf(" -U userid namespace\n");
> + printf(" -i ipc namespace\n");
> + printf(" -P <pid-file> File in which to write global pid of cinit\n");
> + printf(" -p pid namespace\n");
> + printf(" -f <flag> extra clone flags\n");
> + printf("\n");
> + printf("(C) Copyright IBM Corp. 2006\n");
> + printf("\n");
> + exit(1);
> +}
> +
> +static int string_to_ul(const char *str, unsigned long int *res)
> +{
> + char *tail;
> + long long int r;
> +
> + if (!*str)
> + return -1;
> +
> + errno = 0;
> +
> + r = strtol(str, &tail, 16);
> +
> + /*
> + * according to strtol(3), if errno is set or tail does not point
> + * to the ending '\0', the conversion failed.
> + */
> + if (errno || *tail)
> + return -1;
> +
> + *res = r;
> + return 0;
> +}
> +
> +/*
> + * Copied following opentty() from Fedora's util-linux rpm
> + * I just changed the "FATAL" message below from syslog()
> + * to printf
> + */
> +static void
> +opentty(const char * tty) {
> + int i, fd, flags;
> +
> + fd = open(tty, O_RDWR | O_NONBLOCK);
> + if (fd == -1) {
> + printf("FATAL: can't reopen tty: %s", strerror(errno));
> + sleep(1);
> + exit(1);
> + }
> +
> + flags = fcntl(fd, F_GETFL);
> + flags &= ~O_NONBLOCK;
> + fcntl(fd, F_SETFL, flags);
> +
> + for (i = 0; i < fd; i++)
> + close(i);
> + for (i = 0; i < 3; i++)
> + if (fd != i)
> + dup2(fd, i);
> + if (fd >= 3)
> + close(fd);
> +}
> +// Code copy end
> +
> +int do_newcgrp = 0;
> +
> +int load_cgroup_dir(char *dest, int len)
> +{
> + FILE *f = fopen("/proc/mounts", "r");
> + char buf[200];
> + char *name, *path, *fsname, *options, *p1, *p2, *s;
> + if (!f)
> + return 0;
> + while (fgets(buf, 200, f)) {
> + name = strtok_r(buf, " ", &p1);
> + path = strtok_r(NULL, " ", &p1);
> + fsname = strtok_r(NULL, " ", &p1);
> + options = strtok_r(NULL, " ", &p1);
> + if (strcmp(fsname, "cgroup") != 0)
> + continue;
> +
> + /* make sure the freezer subsystem is listed in the mount options */
> + s = strtok_r(options, ",", &p2);
> + while (s && strcmp(s, "freezer") != 0)
> + s = strtok_r(NULL, ",", &p2);
> + if (!s)
> + continue;
> + strncpy(dest, path, len);
> + fclose(f);
> + return 1;
> + }
> + fclose(f);
> + printf("Freezer not mounted\n");
> + return 0;
> +}
> +
> +int move_to_new_cgroup(int newcgroup)
> +{
> + char cgroupname[150], cgroupbase[100], tasksfname[200];
> + FILE *fout;
> + int ret;
> +
> + if (!load_cgroup_dir(cgroupbase, 100))
> + return 0;
> +
> + snprintf(cgroupname, 150, "%s/%d", cgroupbase, newcgroup);
> + ret = mkdir(cgroupname, 0755);
> + if (ret)
> + return 0;
> + snprintf(tasksfname, 200, "%s/tasks", cgroupname);
> + fout = fopen(tasksfname, "w");
> + if (!fout)
> + return 0;
> + fprintf(fout, "%d\n", getpid());
> + fclose(fout);
> + return 1;
> +}
> +
> +int pipefd[2];
> +
> +/* gah. opentty will close the pipefd */
> +int check_newcgrp(void)
> +{
> + int ret, newgroup;
> + char buf[20];
> +
> + if (!do_newcgrp)
> + return 0;
> +
> + close(pipefd[1]);
> + ret = read(pipefd[0], buf, 20);
> + close(pipefd[0]);
> + if (ret == -1) {
> + perror("read");
> + return 1;
> + }
> + newgroup = atoi(buf);
> + if (!move_to_new_cgroup(newgroup))
> + return 1;
> + do_newcgrp = 0;
> + return 0;
> +}
> +
> +int do_child(void *vargv)
> +{
> + char **argv = (char **)vargv;
> +
> + if (check_newcgrp())
> + return 1;
> +
> + execve(argv[0], argv, __environ);
> + perror("execve");
> + return 1;
> +}
> +
> +void write_pid(char *pid_file, int pid)
> +{
> + FILE *fp;
> +
> + if (!pid_file)
> + return;
> +
> + fp = fopen(pid_file, "w");
> + if (!fp) {
> + perror("fopen, pid_file");
> + exit(1);
> + }
> + fprintf(fp, "%d", pid);
> + fflush(fp);
> + fclose(fp);
> +}
> +
> +int main(int argc, char *argv[])
> +{
> + int c;
> + unsigned long flags = 0, eflags = 0;
> + char ttyname[256];
> + int status;
> + int ret, use_clone = 0;
> + int pid;
> + char *pid_file = NULL;
> + struct pid_set pid_set;
> + int chosen_pid = 0;
> +
> + pid_set.num_pids = 1;
> + pid_set.pids = &chosen_pid;
> +
> + procname = basename(argv[0]);
> +
> + memset(ttyname, '\0', sizeof(ttyname));
> + readlink("/proc/self/fd/0", ttyname, sizeof(ttyname));
> +
> + while ((c = getopt(argc, argv, "+mguUiphz:cnf:P:")) != EOF) {
> + switch (c) {
> + case 'g': do_newcgrp = getpid(); break;
> + case 'm': flags |= CLONE_NEWNS; break;
> + case 'c': use_clone = 1; break;
> + case 'P': pid_file = optarg; break;
> + case 'u': flags |= CLONE_NEWUTS; break;
> + case 'i': flags |= CLONE_NEWIPC; break;
> + case 'U': flags |= CLONE_NEWUSER; break;
> + case 'n': flags |= CLONE_NEWNET; break;
> + case 'p': flags |= CLONE_NEWNS|CLONE_NEWPID; break;
> + case 'z': chosen_pid = atoi(optarg); break;
> + case 'f': if (!string_to_ul(optarg, &eflags)) {
> + flags |= eflags;
> + break;
> + }
> + case 'h':
> + default:
> + usage(procname);
> + }
> + };
> +
> + if (chosen_pid) {
> + use_clone = 1;
> + if (flags & CLONE_NEWPID) {
> + printf("Error: can't use CLONE_NEWPID and pick a pid\n");
> + exit(1);
> + }
> + }
> + argv = &argv[optind];
> + argc = argc - optind;
> +
> + if (do_newcgrp) {
> + ret = pipe(pipefd);
> + if (ret) {
> + perror("pipe");
> + return -1;
> + }
> + do_newcgrp = pipefd[0];
> + }
> +
> + if (use_clone) {
> + int stacksize = 4*getpagesize();
> + void *stack = malloc(stacksize);
> +
> + if (!stack) {
> + perror("malloc");
> + return -1;
> + }
> +
> + printf("about to clone with %lx\n", flags);
> + if (chosen_pid)
> + printf("Will choose pid %d\n", chosen_pid);
> + flags |= SIGCHLD;
> + pid = clone_with_pids(do_child, stack, stacksize, flags,
> + &pid_set, (void *)argv);
> + if (pid == -1) {
> + perror("clone");
> + return -1;
> + }
> + } else {
> + if ((pid = fork()) == 0) {
> + // Child.
> + //print_my_info(procname, ttyname);
> +
> + if (check_newcgrp())
> + return 1;
> + opentty(ttyname);
> +
> + printf("about to unshare with %lx\n", flags);
> + ret = unshare(flags);
> + if (ret < 0) {
> + perror("unshare");
> + return 1;
> + }
> +
> + return do_child((void*)argv);
> + }
> +
> + }
> + if (pid != -1 && do_newcgrp) {
> + char buf[20];
> + snprintf(buf, 20, "%d", pid);
> + close(pipefd[0]);
> + write(pipefd[1], buf, strlen(buf)+1);
> + close(pipefd[1]);
> + }
> +
> + write_pid(pid_file, pid);
> +
> + if ((ret = waitpid(pid, &status, __WALL)) < 0)
> + printf("waitpid() returns %d, errno %d\n", ret, errno);
> +
> + exit(0);
> +}
_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
More information about the Devel
mailing list