[Devel] Re: [PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids
Nathan Lynch
ntl at pobox.com
Mon Nov 16 15:18:42 PST 2009
On Mon, 2009-11-16 at 12:26 -0600, Serge E. Hallyn wrote:
> Quoting Nathan Lynch (ntl at pobox.com):
> > On Mon, 2009-11-16 at 05:12 -0600, Serge E. Hallyn wrote:
> > > Quoting Nathan Lynch (ntl at pobox.com):
> > > > On Thu, 2009-11-12 at 23:24 -0600, serue at us.ibm.com wrote:
> > > > > + if (use_clone) {
> > > > > + int stacksize = 4*getpagesize();
> > > > > + void *stack = malloc(stacksize);
> > > > > +
> > > > > + if (!stack) {
> > > > > + perror("malloc");
> > > > > + return -1;
> > > > > + }
> > > > > +
> > > > > + printf("about to clone with %lx\n", flags);
> > > > > + if (chosen_pid)
> > > > > + printf("Will choose pid %d\n", chosen_pid);
> > > > > + flags |= SIGCHLD;
> > > > > + pid = clone_with_pids(do_child, stack, flags, &pid_set,
> > > > > + (void *)argv);
> > > >
> > > > The stack argument should be adjusted with the usual stack += stacksize
> > > > - 1 or similar, right?
> > >
> > > the clone_with_pids() helper in user-cr/clone_s390x.c (and IIRC the
> > > x86 one by Suka also) does this implicitly, by doing:
> > >
> > > s = child_stack;
> > > *--s = arg;
> > > *--s = fn;
> > > child_stack -= 16
> >
> > That's setting up arguments for the function to run in the child, and
> > afaict that code assumes the value of child_stack is the _end_ of the
> > stack region.
>
> Yes.
>
> > The code I quoted above is passing the beginning of the
> > region (the return value from malloc).
>
> Holy cow, that was a snafu in my switching to sending (stack_base,stack_size)
> for the previous version, and then back again. It was meant to send
> stack_base+stack_size now.
Okay, here's the violence I've committed against your code to get eclone
working on powerpc (tested 32-bit userspace against 64-bit kernel).
./nsexeccwp -z 300 /bin/bash -c 'echo $$'
[debugging cruft elided]
300
This is meant not for inclusion but for discussion at this point. I
made some changes that will certainly break the builds for other
architectures.
Note that I have generic code initializing clone_args with the true
stack base and size and passing that to the architecture code. The
architecture code (e.g. clone_ppc.c) is responsible for calculating the
stack pointer to pass to the kernel. The architecture code is also
responsible for clearing clone_args.child_stack_size and updating
clone_args.child_stack, adjusting for alignment and arguments if
appropriate. In this way, we can accommodate ia64 and parisc and keep
platform details in platform-specific code.
clone_ppc.c | 54 +++++++++++++++++++++++++++++++++++
clone_ppc_.S | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
eclone.h | 25 ++++++++++++++++
nsexeccwp.c | 42 ++++++++++++----------------
4 files changed, 182 insertions(+), 27 deletions(-)
diff --git a/clone_ppc.c b/clone_ppc.c
index 49797fd..9e19fae 100644
--- a/clone_ppc.c
+++ b/clone_ppc.c
@@ -10,14 +10,25 @@
#define _GNU_SOURCE
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <asm/unistd.h>
+#include "eclone.h"
+
struct target_pid_set;
+struct pid_set {
+ size_t nr_pids;
+ pid_t *pids;
+};
+
+
extern int __clone_with_pids(int (*fn)(void *arg),
void *child_stack ,
int flags,
@@ -56,3 +67,46 @@ int clone_with_pids(int (*fn)(void *), void *child_stack, int flags,
}
#endif
+
+extern int __eclone(int (*fn)(void *arg),
+ void *child_sp,
+ int flags,
+ void *fn_arg,
+ struct clone_args *args,
+ size_t args_size,
+ pid_t *pids);
+
+int eclone(int (*fn)(void *), void *fn_arg, int clone_flags_low,
+ struct clone_args *clone_args, pid_t *pids)
+{
+ struct clone_args my_args;
+ unsigned long child_sp;
+ int newpid;
+
+ if (clone_args->child_stack)
+ child_sp = clone_args->child_stack +
+ clone_args->child_stack_size - 1;
+ else
+ child_sp = 0;
+
+ my_args = *clone_args;
+ my_args.child_stack = child_sp;
+ my_args.child_stack_size = 0;
+
+ printf("%s: child_sp = %p\n", __func__, (void *)child_sp);
+
+ newpid = __eclone(fn,
+ (void *)child_sp,
+ clone_flags_low,
+ fn_arg,
+ &my_args,
+ sizeof(my_args),
+ pids);
+
+ if (newpid < 0) {
+ errno = -newpid;
+ newpid = -1;
+ }
+
+ return newpid;
+}
diff --git a/clone_ppc_.S b/clone_ppc_.S
index cb3e053..b777b2d 100644
--- a/clone_ppc_.S
+++ b/clone_ppc_.S
@@ -11,6 +11,14 @@
#include <asm/unistd.h>
#include "powerpc_asm.h"
+#ifndef __NR_clone_with_pids
+#define __NR_clone_with_pids 325
+#endif
+
+#ifndef __NR_eclone
+#define __NR_eclone 323
+#endif
+
/* int [r3] clone_with_pids(int (*fn)(void *arg) [r3],
* void *child_stack [r4],
* int flags [r5],
@@ -29,10 +37,10 @@
.globl __clone_with_pids
__clone_with_pids:
-/* No argument validation. */
+ /* No argument validation. */
-/* Set up parent's stack frame. */
-stwu r1,-32(r1)
+ /* Set up parent's stack frame. */
+ stwu r1,-32(r1)
/* Save non-volatiles (r28-r31) which we plan to use. */
stmw r28,16(r1)
@@ -88,3 +96,77 @@ parent:
neg r3,r3
blr
+/* int [r3] eclone(int (*fn)(void *arg) [r3],
+ * void *child_sp [r4],
+ * int flags [r5],
+ * void *fn_arg [r6],
+ * struct clone_args *args [r7],
+ * size_t args_size [r8],
+ * pid_t *pids [r9]);
+ * Creates a child task with the pids specified by pids.
+ * Returns to parent only, child execution and exit is handled here.
+ * On error, returns negated errno. On success, returns the pid of the child
+ * created.
+ */
+
+.globl __eclone
+__eclone:
+
+ /* No argument validation. */
+
+ /* Set up parent's stack frame. */
+ stwu r1,-32(r1)
+
+ /* Save non-volatiles (r28-r31) which we plan to use. */
+ stmw r28,16(r1)
+
+ /* Set up child's stack frame. */
+ clrrwi r4,r4,4
+ li r0,0
+ stw r0,-16(r4)
+
+ /* Save fn, stack pointer, flags, and fn_arg across system call. */
+ mr r28,r3
+ mr r29,r4
+ mr r30,r5
+ mr r31,r6
+
+ /* Set up arguments for system call. */
+ mr r3,r5 /* flags */
+ mr r4,r7 /* clone_args */
+ mr r5,r8 /* clone_args' size */
+ mr r6,r9 /* pids */
+
+ /* Do the system call */
+ li r0,__NR_eclone
+ sc
+
+ /* Parent or child? */
+ cmpwi cr1,r3,0
+ crandc 4*cr1+eq,4*cr1+eq,4*cr0+so
+ bne cr1,eclone_parent
+
+ /* Child. Call fn. */
+ mtctr r28
+ mr r3,r31
+ bctrl
+
+ /* Assume result of fn in r3 and exit. */
+ li r0,__NR_exit
+ sc
+
+eclone_parent:
+ /* Restore non-volatiles. */
+ lmw r28,16(r1)
+
+ addi r1,r1,32
+
+ /* Return to caller on success. */
+ bnslr
+
+ /* Handle error. Negate the return value to signal an error
+ * to the caller, which must set errno.
+ */
+ neg r3,r3
+ blr
+
diff --git a/eclone.h b/eclone.h
new file mode 100644
index 0000000..601a621
--- /dev/null
+++ b/eclone.h
@@ -0,0 +1,25 @@
+#ifndef _ECLONE_H_
+#define _ECLONE_H_
+
+#include <stdint.h>
+
+struct clone_args {
+ uint64_t clone_flags_high;
+ uint64_t child_stack;
+ uint64_t child_stack_size;
+ uint64_t parent_tid_ptr;
+ uint64_t child_tid_ptr;
+
+ uint32_t nr_pids;
+
+ uint32_t reserved0;
+ uint64_t reserved1;
+};
+
+/* arch-dependent code implements this interface */
+extern int eclone(int (*fn)(void *), void *fn_arg,
+ int clone_flags_low,
+ struct clone_args *clone_args,
+ pid_t *pids);
+
+#endif
diff --git a/nsexeccwp.c b/nsexeccwp.c
index a71d9a4..b80b78e 100644
--- a/nsexeccwp.c
+++ b/nsexeccwp.c
@@ -17,29 +17,13 @@
#include <sys/wait.h>
#include "clone.h"
+#include "eclone.h"
struct pid_set {
int num_pids;
pid_t *pids;
};
-typedef unsigned long long u64;
-typedef unsigned int u32;
-typedef int pid_t;
-struct clone_args {
- u64 clone_flags_high;
-
- u64 child_stack_base;
- u64 child_stack_size;
-
- u64 parent_tid_ptr;
- u64 child_tid_ptr;
-
- u32 nr_pids;
-
- u32 reserved0;
- u64 reserved1;
-};
/* (until it's supported by libc) from clone_ARCH.c */
extern int clone_with_pids(int (*fn)(void *), void *child_stack, int flags,
struct pid_set *target_pids, void *arg);
@@ -210,6 +194,9 @@ int do_child(void *vargv)
{
char **argv = (char **)vargv;
+ printf("%s(%p)/%lu\n", __func__, vargv, (unsigned long)getpid());
+ fflush(NULL);
+
if (check_newcgrp())
return 1;
@@ -237,6 +224,7 @@ void write_pid(char *pid_file, int pid)
int main(int argc, char *argv[])
{
+ int i;
int c;
unsigned long flags = 0, eflags = 0;
char ttyname[256];
@@ -244,11 +232,8 @@ int main(int argc, char *argv[])
int ret, use_clone = 0;
int pid;
char *pid_file = NULL;
- struct pid_set pid_set;
- int chosen_pid = 0;
-
- pid_set.num_pids = 1;
- pid_set.pids = &chosen_pid;
+ size_t nr_pids = 1;
+ pid_t chosen_pid = 0;
procname = basename(argv[0]);
@@ -287,6 +272,9 @@ int main(int argc, char *argv[])
argv = &argv[optind];
argc = argc - optind;
+ for (i = 0; i < argc; i++)
+ printf("argv[%d] = '%s'\n", i, argv[i]);
+
if (do_newcgrp) {
ret = pipe(pipefd);
if (ret) {
@@ -297,6 +285,7 @@ int main(int argc, char *argv[])
}
if (use_clone) {
+ struct clone_args clone_args;
int stacksize = 4*getpagesize();
void *stack = malloc(stacksize);
@@ -305,12 +294,17 @@ int main(int argc, char *argv[])
return -1;
}
+ memset(&clone_args, 0, sizeof(clone_args));
+ clone_args.child_stack = (unsigned long)stack;
+ clone_args.child_stack_size = stacksize;
+ clone_args.nr_pids = nr_pids;
+
printf("about to clone with %lx\n", flags);
if (chosen_pid)
printf("Will choose pid %d\n", chosen_pid);
+ printf("argv = %p\n", argv);
flags |= SIGCHLD;
- pid = clone_with_pids(do_child, stack, flags, &pid_set,
- (void *)argv);
+ pid = eclone(do_child, argv, flags, &clone_args, &chosen_pid);
if (pid == -1) {
perror("clone");
return -1;
_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
More information about the Devel
mailing list