[Devel] Re: How much of a mess does OpenVZ make? ; ) Was: What can OpenVZ do?
Cedric Le Goater
legoater at free.fr
Fri Mar 13 08:27:28 PDT 2009
Ying Han wrote:
> Hi Serge:
> I made a patch based on Oren's tree recently which implement a new
> syscall clone_with_pid. I tested with checkpoint/restart process tree
> and it works as expected.
> This patch has some hack in it which i made a copy of libc's clone and
> made modifications of passing one more argument(pid number). I will
> try to clean up the code and do more testing.
ok. 2 patches would also be interesting. one for the syscall and one
for the kernel support (which might be acceptable)
> New syscall clone_with_pid
> Implement a new syscall which clone a thread with a preselected pid number.
yes this definitely needed to restart a task/thread. we maintain an ugly
hack which stores a pid in the current task that will be used by the next
clone() call.
> clone_with_pid(child_func, child_stack + CHILD_STACK - 16,
> CLONE_WITH_PID|SIGCHLD, pid, NULL);
since you're introducing a new syscall, I don't see why you need a
CLONE_WITH_PID flag ?
(FYI, attached is my old attempt of clone_with_pid but that's too old)
[ ... ]
> +#define DEBUG
> #include <linux/slab.h>
> #include <linux/init.h>
> #include <linux/unistd.h>
> @@ -959,10 +959,19 @@ static struct task_struct *copy_process(unsigned long cl
> int retval;
> struct task_struct *p;
> int cgroup_callbacks_done = 0;
> + pid_t clone_pid = stack_size;
>
> if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
> return ERR_PTR(-EINVAL);
>
> + /* We only allow the clone_with_pid when a new pid namespace is
> + * created. FIXME: how to restrict it.
> + */
> + if ((clone_flags & CLONE_NEWPID) && (clone_flags & CLONE_WITH_PID))
> + return ERR_PTR(-EINVAL);
> + if ((clone_flags & CLONE_WITH_PID) && (clone_pid <= 1))
> + return ERR_PTR(-EINVAL);
I would let alloc_pid() handle the error.
> /*
> * Thread groups must share signals as well, and detached threads
> * can only be started up within the thread group.
> @@ -1135,7 +1144,10 @@ static struct task_struct *copy_process(unsigned long c
>
> if (pid != &init_struct_pid) {
> retval = -ENOMEM;
> - pid = alloc_pid(task_active_pid_ns(p));
> + if (clone_flags & CLONE_WITH_PID)
> + pid = alloc_pid(task_active_pid_ns(p), clone_pid);
> + else
> + pid = alloc_pid(task_active_pid_ns(p), 0);
this is overkill IMO.
[ ... ]
> -static int alloc_pidmap(struct pid_namespace *pid_ns)
> +static int alloc_pidmap(struct pid_namespace *pid_ns, pid_t pid_nr)
> {
> int i, offset, max_scan, pid, last = pid_ns->last_pid;
> struct pidmap *map;
>
> - pid = last + 1;
> + if (pid_nr)
> + pid = pid_nr;
> + else
> + pid = last + 1;
>
> if (pid >= pid_max)
> pid = RESERVED_PIDS;
> offset = pid & BITS_PER_PAGE_MASK;
> @@ -153,9 +156,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
> do {
> if (!test_and_set_bit(offset, map->page)) {
> atomic_dec(&map->nr_free);
> - pid_ns->last_pid = pid;
> + if (!pid_nr)
> + pid_ns->last_pid = pid;
> return pid;
> }
> + if (pid_nr)
> + return -1;
> offset = find_next_offset(map, offset);
> pid = mk_pid(pid_ns, map, offset);
> /*
> @@ -239,21 +245,25 @@ void free_pid(struct pid *pid)
> call_rcu(&pid->rcu, delayed_put_pid);
> }
>
> -struct pid *alloc_pid(struct pid_namespace *ns)
> +struct pid *alloc_pid(struct pid_namespace *ns, pid_t pid_nr)
> {
> struct pid *pid;
> enum pid_type type;
> int i, nr;
> struct pid_namespace *tmp;
> struct upid *upid;
> + int level = ns->level;
> +
> + if (pid_nr >= pid_max)
> + return NULL;
let alloc_pidmap() handle it ?
>
> pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
> if (!pid)
> goto out;
>
> - tmp = ns;
> - for (i = ns->level; i >= 0; i--) {
> - nr = alloc_pidmap(tmp);
> + tmp = ns->parent;
> + for (i = level-1; i >= 0; i--) {
> + nr = alloc_pidmap(tmp, 0);
> if (nr < 0)
> goto out_free;
>
> @@ -262,6 +272,14 @@ struct pid *alloc_pid(struct pid_namespace *ns)
> tmp = tmp->parent;
> }
>
> + nr = alloc_pidmap(ns, pid_nr);
> + if (nr < 0)
> + goto out_free;
> + pid->numbers[level].nr = nr;
> + pid->numbers[level].ns = ns;
> + if (nr == pid_nr)
> + pr_debug("nr == pid_nr == %d\n", nr);
> +
> get_pid_ns(ns);
> pid->level = ns->level;
> atomic_set(&pid->count, 1);
>
>
>
>
>
>
>
>
> On Thu, Mar 12, 2009 at 2:21 PM, Serge E. Hallyn <serue at us.ibm.com> wrote:
>> Quoting Greg Kurz (gkurz at fr.ibm.com):
>>> On Thu, 2009-03-12 at 09:53 -0500, Serge E. Hallyn wrote:
>>>> Or are you suggesting that you'll do a dummy clone of (5594,2) so that
>>>> the next clone(CLONE_NEWPID) will be expected to be (5594,3,1)?
>>>>
>>> Of course not
>> Ok - someone *did* argue that at some point I think...
>>
>>> but one should be able to tell clone() to pick a specific
>>> pid.
>> Can you explain exactly how? I must be missing something clever.
>>
>> -serge
>>
>> --
>> To unsubscribe, send a message with 'unsubscribe linux-mm' in
>> the body to majordomo at kvack.org. For more info on Linux MM,
>> see: http://www.linux-mm.org/ .
>> Don't email: <a href=mailto:"dont at kvack.org"> email at kvack.org </a>
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: clone_with_pid.patch
URL: <http://lists.openvz.org/pipermail/devel/attachments/20090313/d8f8a756/attachment-0001.ksh>
-------------- next part --------------
_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
More information about the Devel
mailing list