[Devel] Re: [RFC PATCH 3/5] use next syscall data to predefine process ids
Nadia Derbey
Nadia.Derbey at bull.net
Mon Jul 7 22:44:22 PDT 2008
Serge E. Hallyn wrote:
> Quoting Nadia.Derbey at bull.net (Nadia.Derbey at bull.net):
>
>>[PATCH 03/05]
>>
>>This patch uses the value written into the next_syscall_data proc file
>>as a target upid nr for the next process to be created.
>>The following syscalls have a new behavior if next_syscall_data is set:
>>. fork()
>>. vfork()
>>. clone()
>>
>>In the current version, if the process belongs to nested namespaces, only
>>the upper namespace level upid nr is allowed to be predefined, since there
>>is not yet a way to take a snapshot of upid nrs at all namespaces levels.
>>
>>But this can easily be extended in the future.
>
>
> Good point, we will want to discuss the right way to dump that data. Do
> we add a new file under /proc/<pid>, use /proc/pid/status, or find some
> other way?
>
>
>>Signed-off-by: Nadia Derbey <Nadia.Derbey at bull.net>
>>
>>---
>> include/linux/next_syscall_data.h | 2
>> include/linux/pid.h | 2
>> kernel/fork.c | 3 -
>> kernel/pid.c | 111 ++++++++++++++++++++++++++++++++------
>> 4 files changed, 98 insertions(+), 20 deletions(-)
>>
>>Index: linux-2.6.26-rc5-mm3/kernel/pid.c
>>===================================================================
>>--- linux-2.6.26-rc5-mm3.orig/kernel/pid.c 2008-07-01 10:25:46.000000000 +0200
>>+++ linux-2.6.26-rc5-mm3/kernel/pid.c 2008-07-01 11:25:38.000000000 +0200
>>@@ -122,6 +122,26 @@ static void free_pidmap(struct upid *upi
>> atomic_inc(&map->nr_free);
>> }
>>
>>+static inline int alloc_pidmap_page(struct pidmap *map)
>>+{
>>+ if (unlikely(!map->page)) {
>>+ void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
>>+ /*
>>+ * Free the page if someone raced with us
>>+ * installing it:
>>+ */
>>+ spin_lock_irq(&pidmap_lock);
>>+ if (map->page)
>>+ kfree(page);
>>+ else
>>+ map->page = page;
>>+ spin_unlock_irq(&pidmap_lock);
>>+ if (unlikely(!map->page))
>>+ return -1;
>>+ }
>>+ return 0;
>>+}
>>+
>> static int alloc_pidmap(struct pid_namespace *pid_ns)
>> {
>> int i, offset, max_scan, pid, last = pid_ns->last_pid;
>>@@ -134,21 +154,8 @@ static int alloc_pidmap(struct pid_names
>> map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
>> max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
>> for (i = 0; i <= max_scan; ++i) {
>>- if (unlikely(!map->page)) {
>>- void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
>>- /*
>>- * Free the page if someone raced with us
>>- * installing it:
>>- */
>>- spin_lock_irq(&pidmap_lock);
>>- if (map->page)
>>- kfree(page);
>>- else
>>- map->page = page;
>>- spin_unlock_irq(&pidmap_lock);
>>- if (unlikely(!map->page))
>>- break;
>>- }
>>+ if (unlikely(alloc_pidmap_page(map)))
>>+ break;
>> if (likely(atomic_read(&map->nr_free))) {
>> do {
>> if (!test_and_set_bit(offset, map->page)) {
>>@@ -182,6 +189,33 @@ static int alloc_pidmap(struct pid_names
>> return -1;
>> }
>>
>>+/*
>>+ * Return 0 if successful (i.e. next_nr could be assigned as a upid nr).
>>+ * -errno else
>>+ */
>>+static int alloc_fixed_pidmap(struct pid_namespace *pid_ns, int next_nr)
>>+{
>>+ int offset;
>>+ struct pidmap *map;
>>+
>>+ if (next_nr < RESERVED_PIDS || next_nr >= pid_max)
>>+ return -EINVAL;
>>+
>>+ map = &pid_ns->pidmap[next_nr / BITS_PER_PAGE];
>>+
>>+ if (unlikely(alloc_pidmap_page(map)))
>>+ return -ENOMEM;
>>+
>>+ offset = next_nr & BITS_PER_PAGE_MASK;
>>+ if (test_and_set_bit(offset, map->page))
>>+ return -EBUSY;
>>+
>>+ atomic_dec(&map->nr_free);
>>+ pid_ns->last_pid = max(pid_ns->last_pid, next_nr);
>>+
>>+ return 0;
>>+}
>>+
>> int next_pidmap(struct pid_namespace *pid_ns, int last)
>> {
>> int offset;
>>@@ -239,7 +273,25 @@ void free_pid(struct pid *pid)
>> call_rcu(&pid->rcu, delayed_put_pid);
>> }
>>
>>-struct pid *alloc_pid(struct pid_namespace *ns)
>>+/*
>>+ * Sets a predefined upid nr for the process' upper namespace level
>>+ */
>>+static int set_predefined_pid(struct pid_namespace *ns, struct pid *pid,
>>+ int next_nr)
>>+{
>>+ int i = ns->level;
>>+ int rc;
>>+
>>+ rc = alloc_fixed_pidmap(ns, next_nr);
>>+ if (rc < 0)
>>+ return rc;
>>+
>>+ pid->numbers[i].nr = next_nr;
>>+ pid->numbers[i].ns = ns;
>>+ return 0;
>>+}
>>+
>>+struct pid *alloc_pid(struct pid_namespace *ns, int *retval)
>
>
> Is there a reason why you can't return retval using
No, except that I wanted that I wanted to change the fewest things. Now,
adding 1 argument vs adding 1 or 2 instructions, I don't mind: I'll
change that.
> return ERR_PTR(retval);
> instead of using an additional argument? Then at copy_process,
> after the call, do
>
> if (IS_ERR(pid))
> retval = PTR_ERR(pid);
>
> ?
>
>
>> {
>> struct pid *pid;
>> enum pid_type type;
>>@@ -247,12 +299,37 @@ struct pid *alloc_pid(struct pid_namespa
>> struct pid_namespace *tmp;
>> struct upid *upid;
>>
>>+ *retval = -ENOMEM;
>> pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
>> if (!pid)
>> goto out;
>>
>> tmp = ns;
>>- for (i = ns->level; i >= 0; i--) {
>>+ i = ns->level;
>>+ if (next_data_set(current)) {
>>+ /*
>>+ * There is a upid nr specified, use it instead of letting
>>+ * the kernel chose it for us.
>>+ */
>>+ int next_nr = get_next_data(current);
>>+ int rc;
>>+
>>+ rc = set_predefined_pid(tmp, pid, next_nr);
>>+ if (rc < 0) {
>>+ *retval = rc;
>>+ goto out_free;
>
>
> Again, I'd argue for resetting the syscall data on failure.
>
>
>>+ }
>>+ /* Go up one level */
>>+ tmp = tmp->parent;
>>+ i--;
>>+ reset_next_syscall_data(current);
>>+ }
>>+
>>+ /*
>>+ * Let the lower levels upid nrs be automatically allocated
>>+ */
>>+ *retval = -ENOMEM;
>>+ for ( ; i >= 0; i--) {
>> nr = alloc_pidmap(tmp);
>> if (nr < 0)
>> goto out_free;
>>Index: linux-2.6.26-rc5-mm3/include/linux/pid.h
>>===================================================================
>>--- linux-2.6.26-rc5-mm3.orig/include/linux/pid.h 2008-07-01 10:25:46.000000000 +0200
>>+++ linux-2.6.26-rc5-mm3/include/linux/pid.h 2008-07-01 10:49:07.000000000 +0200
>>@@ -121,7 +121,7 @@ extern struct pid *find_get_pid(int nr);
>> extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
>> int next_pidmap(struct pid_namespace *pid_ns, int last);
>>
>>-extern struct pid *alloc_pid(struct pid_namespace *ns);
>>+extern struct pid *alloc_pid(struct pid_namespace *, int *);
>> extern void free_pid(struct pid *pid);
>>
>> /*
>>Index: linux-2.6.26-rc5-mm3/kernel/fork.c
>>===================================================================
>>--- linux-2.6.26-rc5-mm3.orig/kernel/fork.c 2008-07-01 10:25:46.000000000 +0200
>>+++ linux-2.6.26-rc5-mm3/kernel/fork.c 2008-07-01 10:49:07.000000000 +0200
>>@@ -1110,8 +1110,7 @@ static struct task_struct *copy_process(
>> goto bad_fork_cleanup_io;
>>
>> if (pid != &init_struct_pid) {
>>- retval = -ENOMEM;
>>- pid = alloc_pid(task_active_pid_ns(p));
>>+ pid = alloc_pid(task_active_pid_ns(p), &retval);
>> if (!pid)
>> goto bad_fork_cleanup_io;
>>
>>Index: linux-2.6.26-rc5-mm3/include/linux/next_syscall_data.h
>>===================================================================
>>--- linux-2.6.26-rc5-mm3.orig/include/linux/next_syscall_data.h 2008-07-01 10:41:36.000000000 +0200
>>+++ linux-2.6.26-rc5-mm3/include/linux/next_syscall_data.h 2008-07-01 11:09:35.000000000 +0200
>>@@ -5,6 +5,7 @@
>> * following is supported today:
>> * . object creation with a predefined id
>> * . for a sysv ipc object
>>+ * . for a process
>> *
>> */
>>
>>@@ -19,6 +20,7 @@
>> * For example, it can be used to pre-set the id of the object to be created
>> * by next syscall. The following syscalls support this feature:
>> * . msgget(), semget(), shmget()
>>+ * . fork(), vfork(), clone()
>> */
>> struct next_syscall_data {
>> int ndata;
>>
_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
More information about the Devel
mailing list