[CRIU] [PATCH 3/3] forking: Use last_pid_mutex for synchronization during clone()

Kirill Tkhai ktkhai at virtuozzo.com
Mon May 15 07:33:45 PDT 2017



On 13.05.2017 00:24, Andrei Vagin wrote:
> On Thu, May 11, 2017 at 05:55:21PM +0300, Kirill Tkhai wrote:
>> Before this patch we used flock() to order task creation,
>> but that approach is not good: it took 5 syscalls to
>> synchronize the creation of a single child:
>>
>> 1) open()
>> 2) flock(LOCK_EX)
>> 3) flock(LOCK_UN)
>> 4) close() in parent
>> 5) close() in child
>>
>> This patch introduces a more efficient way to synchronize,
>> which needs only 2 syscalls. We use last_pid_mutex instead,
>> and that syscall count is definitely better.
>>
>> In order to synchronize with a parallel CRIU instance, we still
>> take the flock on ns_last_pid. After talking to @xemul, we keep
>> the ns_last_pid file locked for the whole restore; this is done
>> once by the criu task. Later, if we need to relax this, we may
>> take the flock() globally only during the CR_STATE_FORKING stage
>> and synchronize thread creation with a local flock(), but for
>> now we don't need that.
>>
>> Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
>> ---
>>  criu/cr-restore.c   |   41 +++++++++++++++++++++--------------------
>>  criu/namespaces.c   |   23 +++++------------------
>>  criu/pie/restorer.c |   36 +++++++++++++++---------------------
>>  3 files changed, 41 insertions(+), 59 deletions(-)
>>
>> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
>> index ec80c54b4..e4c7fad36 100644
>> --- a/criu/cr-restore.c
>> +++ b/criu/cr-restore.c
>> @@ -1169,7 +1169,6 @@ static int restore_one_task(int pid, CoreEntry *core)
>>  struct cr_clone_arg {
>>  	struct pstree_item *item;
>>  	unsigned long clone_flags;
>> -	int fd;
>>  
>>  	CoreEntry *core;
>>  };
>> @@ -1378,19 +1377,12 @@ static inline int fork_with_pid(struct pstree_item *item)
>>  
>>  	pr_info("Forking task with %d pid (flags 0x%lx)\n", pid, ca.clone_flags);
>>  
>> -	ca.fd = open_proc_rw(PROC_GEN, LAST_PID_PATH);
>> -	if (ca.fd < 0)
>> -		goto err;
>> -
>>  	wait_pid_ns_helper_prepared(pid_ns, item->pid);
>>  
>>  	if (set_pid_ns_for_children(pid_ns, item->pid) < 0)
>> -		goto err_close;
>> +		goto err;
>>  
>> -	if (flock(ca.fd, LOCK_EX)) {
>> -		pr_perror("%d: Can't lock %s", pid, LAST_PID_PATH);
>> -		goto err_close;
>> -	}
>> +	lock_last_pid();
>>  
>>  	ret = do_fork_with_pid(item, pid_ns, &ca);
>>  	if (ret < 0) {
>> @@ -1399,10 +1391,7 @@ static inline int fork_with_pid(struct pstree_item *item)
>>  	}
>>  
>>  err_unlock:
>> -	if (flock(ca.fd, LOCK_UN))
>> -		pr_perror("%d: Can't unlock %s", pid, LAST_PID_PATH);
>> -err_close:
>> -	close(ca.fd);
>> +	unlock_last_pid();
>>  err:
>>  	if (ca.core)
>>  		core_entry__free_unpacked(ca.core, NULL);
>> @@ -1702,9 +1691,6 @@ static int restore_task_with_children(void *_arg)
>>  			goto err;
>>  	}
>>  
>> -	if ( !(ca->clone_flags & CLONE_FILES))
>> -		close_safe(&ca->fd);
>> -
>>  	ret = clone_service_fd(rsti(current)->service_fd_id);
>>  	if (ret)
>>  		goto err;
>> @@ -2080,7 +2066,7 @@ static int write_restored_pid(void)
>>  static int restore_root_task(struct pstree_item *init)
>>  {
>>  	enum trace_flags flag = TRACE_ALL;
>> -	int ret, fd, mnt_ns_fd = -1;
>> +	int ret, fd, mnt_ns_fd = -1, lock_fd = -1;
>>  	int root_seized = 0;
>>  	struct pstree_item *item;
>>  
>> @@ -2130,11 +2116,19 @@ static int restore_root_task(struct pstree_item *init)
>>  	if (prepare_namespace_before_tasks())
>>  		return -1;
>>  
>> +	lock_fd = open_proc_rw(PROC_GEN, LAST_PID_PATH);
>> +	if (lock_fd < 0)
>> +		goto out;
>> +	if (flock(lock_fd, LOCK_EX)) {
>> +		pr_perror("Can't flock ns_last_pid");
>> +		goto out_close;
>> +	}
> 
> We need to lock ns_last_pid only for the case when processes are restored
> in the current pid namespace.

I've sent a new series ([PATCH 00/14] Refactor pid_ns helpers creation),
but now I'm not sure: isn't this synchronization needed to run tests in parallel?
In that case, it's needed independently of CLONE_NEWPID.
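
If it turns out the global lock really is only needed when we restore into
the existing pid namespace, a minimal sketch of what that could look like
(a hypothetical helper; it assumes root_ns_mask is visible here, as it is
elsewhere in cr-restore.c) would be:

/* Sketch only: take the global ns_last_pid flock in restore_root_task()
 * only when restoring into an existing pid namespace. With CLONE_NEWPID
 * the new namespace has its own ns_last_pid, so a CRIU instance running
 * outside of it can't race with us on that file. */
static int maybe_lock_ns_last_pid(int *lock_fd)
{
	*lock_fd = -1;

	if (root_ns_mask & CLONE_NEWPID)
		return 0;

	*lock_fd = open_proc_rw(PROC_GEN, LAST_PID_PATH);
	if (*lock_fd < 0)
		return -1;

	if (flock(*lock_fd, LOCK_EX)) {
		pr_perror("Can't flock ns_last_pid");
		close_safe(lock_fd);
		return -1;
	}

	return 0;
}

That said, if the lock is there to let the tests run several restores in
parallel in the host pid namespace, the unconditional variant from the
patch looks like the safer choice.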
 
> 
>> +
>>  	__restore_switch_stage_nw(CR_STATE_ROOT_TASK);
>>  
>>  	ret = fork_with_pid(init);
>>  	if (ret < 0)
>> -		goto out;
>> +		goto out_unlock;
>>  
>>  	restore_origin_ns_hook();
>>  
>> @@ -2286,6 +2280,9 @@ static int restore_root_task(struct pstree_item *init)
>>  	ret = restore_switch_stage(CR_STATE_RESTORE_CREDS);
>>  	BUG_ON(ret);
>>  
>> +	if (flock(lock_fd, LOCK_UN) || close_safe(&lock_fd))
>> +		pr_perror("Can't unlock or close ns_last_pid");
>> +
>>  	timing_stop(TIME_RESTORE);
>>  
>>  	ret = catch_tasks(root_seized, &flag);
>> @@ -2354,7 +2351,11 @@ static int restore_root_task(struct pstree_item *init)
>>  			if (vpid(pi) > 0)
>>  				kill(vpid(pi), SIGKILL);
>>  	}
>> -
>> +out_unlock:
>> +	if (lock_fd >= 0 && flock(lock_fd, LOCK_UN))
>> +		pr_perror("Can't unlock ns_last_pid");
>> +out_close:
>> +	close_safe(&lock_fd);
>>  out:
>>  	fini_cgroup();
>>  	depopulate_roots_yard(mnt_ns_fd, true);
>> diff --git a/criu/namespaces.c b/criu/namespaces.c
>> index 7a38dfd4a..2a4e1327c 100644
>> --- a/criu/namespaces.c
>> +++ b/criu/namespaces.c
>> @@ -2620,7 +2620,7 @@ static int pid_ns_helper(struct ns_id *ns, int sk)
>>  
>>  static int do_create_pid_ns_helper(void *arg, int sk, pid_t unused_pid)
>>  {
>> -	int pid_ns_fd, mnt_ns_fd, fd, i, lock_fd, transport_fd;
>> +	int pid_ns_fd, mnt_ns_fd, fd, i, transport_fd;
>>  	struct ns_id *ns, *tmp;
>>  	struct pid *pid;
>>  	pid_t child;
>> @@ -2660,21 +2660,12 @@ static int do_create_pid_ns_helper(void *arg, int sk, pid_t unused_pid)
>>  		goto err;
>>  	}
>>  
>> -	lock_fd = open("/proc/" LAST_PID_PATH, O_RDONLY);
>> -	if (lock_fd < 0) {
>> -		pr_perror("Unable to open /proc/" LAST_PID_PATH);
>> -		goto err;
>> -	}
>> -
>>  	if (restore_ns(mnt_ns_fd, &mnt_ns_desc) < 0) {
>>  		pr_err("Can't restore ns\n");
>>  		goto err;
>>  	}
>>  
>> -	if (flock(lock_fd, LOCK_EX)) {
>> -		close(lock_fd);
>> -		pr_perror("Can't lock %s", LAST_PID_PATH);
>> -	}
>> +	lock_last_pid();
>>  
>>  	transport_fd = get_service_fd(TRANSPORT_FD_OFF);
>>  	/*
>> @@ -2684,18 +2675,14 @@ static int do_create_pid_ns_helper(void *arg, int sk, pid_t unused_pid)
>>  	 */
>>  	for (i = pid->level - 2, tmp = ns->parent; i >= 0; i--, tmp = tmp->parent)
>>  		if (request_set_next_pid(tmp->id, pid->ns[i].virt, transport_fd)) {
>> +			unlock_last_pid();
>>  			pr_err("Can't set next pid using helper\n");
>> -			flock(lock_fd, LOCK_UN);
>> -			close(lock_fd);
>>  			goto err;
>>  		}
>>  	child = fork();
>> -	if (!child) {
>> -		close(lock_fd);
>> +	if (!child)
>>  		exit(pid_ns_helper(ns, sk));
>> -	}
>> -	flock(lock_fd, LOCK_UN);
>> -	close(lock_fd);
>> +	unlock_last_pid();
>>  	if (child < 0) {
>>  		pr_perror("Can't fork");
>>  		goto err;
>> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
>> index 6e2f7f4d7..eee683bac 100644
>> --- a/criu/pie/restorer.c
>> +++ b/criu/pie/restorer.c
>> @@ -1440,20 +1440,18 @@ long __export_restore_task(struct task_restore_args *args)
>>  				   CLONE_THREAD | CLONE_SYSVSEM | CLONE_FS;
>>  		long last_pid_len;
>>  		long parent_tid;
>> -		int i, fd;
>> -
>> -		fd = sys_openat(args->proc_fd, LAST_PID_PATH, O_RDWR, 0);
>> -		if (fd < 0) {
>> -			pr_err("can't open last pid fd %d\n", fd);
>> -			goto core_restore_end;
>> +		int i, fd = -1;
>> +
>> +		if (thread_args[0].pid[1] == 0) {
>> +			/* One level pid ns hierarhy */
>> +			fd = sys_openat(args->proc_fd, LAST_PID_PATH, O_RDWR, 0);
>> +			if (fd < 0) {
>> +				pr_err("can't open last pid fd %d\n", fd);
>> +				goto core_restore_end;
>> +			}
>>  		}
>>  
>> -		ret = sys_flock(fd, LOCK_EX);
>> -		if (ret) {
>> -			pr_err("Can't lock last_pid %d\n", fd);
>> -			sys_close(fd);
>> -			goto core_restore_end;
>> -		}
>> +		mutex_lock(&task_entries_local->last_pid_mutex);
>>  
>>  		for (i = 0; i < args->nr_threads; i++) {
>>  			char last_pid_buf[16], *s;
>> @@ -1461,13 +1459,14 @@ long __export_restore_task(struct task_restore_args *args)
>>  			if (thread_args[i].pid[0] == args->t->pid[0])
>>  				continue;
>>  
>> -			if (thread_args[i].pid[1] == 0) {
>> +			if (fd >= 0) {
>>  				/* One level pid ns hierarhy */
>>  				last_pid_len = std_vprint_num(last_pid_buf, sizeof(last_pid_buf), thread_args[i].pid[0] - 1, &s);
>>  				sys_lseek(fd, 0, SEEK_SET);
>>  				ret = sys_write(fd, s, last_pid_len);
>>  				if (ret < 0) {
>>  					pr_err("Can't set last_pid %ld/%s\n", ret, last_pid_buf);
>> +					mutex_unlock(&task_entries_local->last_pid_mutex);
>>  					sys_close(fd);
>>  					goto core_restore_end;
>>  				}
>> @@ -1477,7 +1476,7 @@ long __export_restore_task(struct task_restore_args *args)
>>  						break;
>>  					if (request_set_next_pid(args->pid_ns_id[k], thread_args[i].pid[k], args->transport_fd) < 0) {
>>  						pr_err("Can't request to set pid\n");
>> -						sys_close(fd);
>> +						mutex_unlock(&task_entries_local->last_pid_mutex);
>>  						goto core_restore_end;
>>  					}
>>  				}
>> @@ -1494,14 +1493,9 @@ long __export_restore_task(struct task_restore_args *args)
>>  			RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, thread_args, args->clone_restore_fn);
>>  		}
>>  
>> -		ret = sys_flock(fd, LOCK_UN);
>> -		if (ret) {
>> -			pr_err("Can't unlock last_pid %ld\n", ret);
>> +		mutex_unlock(&task_entries_local->last_pid_mutex);
>> +		if (fd >= 0)
>>  			sys_close(fd);
>> -			goto core_restore_end;
>> -		}
>> -
>> -		sys_close(fd);
>>  	}
>>  
>>  	restore_rlims(args);
>>
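
For reference, the lock_last_pid()/unlock_last_pid() helpers used above come
from an earlier patch of this series and are not shown here. Roughly, they are
expected to be thin wrappers around the futex-based mutex_t that the series
places into the shared task_entries area (the same last_pid_mutex the restorer
blob takes via task_entries_local), along these lines (a sketch, not the exact
code):

void lock_last_pid(void)
{
	/* last_pid_mutex lives in the shared task_entries area, so the
	 * criu tasks and the PIE restorer serialize on the same futex. */
	mutex_lock(&task_entries->last_pid_mutex);
}

void unlock_last_pid(void)
{
	mutex_unlock(&task_entries->last_pid_mutex);
}

The mutex itself presumably gets mutex_init()'ed where the shared task_entries
area is prepared. This is also where the "2 syscalls" in the changelog comes
from: in the worst case the lock path does one futex() wait and the unlock
path one futex() wake, and an uncontended lock needs no syscall at all.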

