[CRIU] [PATCH 4/5] restore: restore pocesses which share one fdtable (v3)

Pavel Emelyanov xemul at parallels.com
Wed Dec 26 04:26:49 EST 2012


On 12/25/2012 05:10 PM, Andrey Vagin wrote:
> Currenly crtools supports a case when a child shared a fd table
> with parent.
> 
> Here is only two interesting things.
> * Service descriptors should be cloned for each process
>   who shared one fd table.
> * One task should restore files and other tasks should sleep in this
> * time.
> 
> v2: * allocate fdt_lock from shared memory
>     * don't wait a child, if it doesn't share fdtable
> v3: * don't move ids on the pstree image
> 
> Signed-off-by: Andrey Vagin <avagin at openvz.org>
> ---
>  cr-dump.c         | 29 +++++++++++++++++------------
>  cr-restore.c      | 44 ++++++++++++++++++++++++++++++++++++++++----
>  files.c           |  7 +------
>  include/crtools.h |  9 +++++++++
>  include/files.h   |  1 +
>  include/pstree.h  | 10 ++++++++++
>  pstree.c          | 11 +++++++++++
>  7 files changed, 89 insertions(+), 22 deletions(-)
> 
> diff --git a/cr-dump.c b/cr-dump.c
> index 00af534..7d58248 100644
> --- a/cr-dump.c
> +++ b/cr-dump.c
> @@ -781,10 +781,11 @@ static DECLARE_KCMP_TREE(fs_tree, KCMP_FS);
>  static DECLARE_KCMP_TREE(files_tree, KCMP_FILES);
>  static DECLARE_KCMP_TREE(sighand_tree, KCMP_SIGHAND);
>  
> -static int dump_task_kobj_ids(pid_t pid, CoreEntry *core)
> +static int dump_task_kobj_ids(struct pstree_item *item, CoreEntry *core)
>  {
>  	int new;
>  	struct kid_elem elem;
> +	int pid = item->pid.real;
>  
>  	elem.pid = pid;
>  	elem.idx = 0; /* really 0 for all */
> @@ -806,7 +807,7 @@ static int dump_task_kobj_ids(pid_t pid, CoreEntry *core)
>  
>  	new = 0;
>  	core->ids->files_id = kid_generate_gen(&files_tree, &elem, &new);
> -	if (!core->ids->files_id || !new) {
> +	if (!core->ids->files_id || (!new && !shared_fdtable(item))) {

Will it work _properly_ if task share fdtable with its grand-father?

>  		pr_err("Can't make FILES id for %d\n", pid);
>  		return -1;
>  	}
> @@ -937,12 +938,13 @@ err:
>  	return NULL;
>  }
>  
> -static int dump_task_core_all(pid_t pid, const struct proc_pid_stat *stat,
> +static int dump_task_core_all(struct pstree_item *item, const struct proc_pid_stat *stat,
>  		const struct parasite_dump_misc *misc, const struct parasite_ctl *ctl,
>  		const struct cr_fdset *cr_fdset,
>  		struct list_head *vma_area_list)
>  {
>  	int fd_core = fdset_fd(cr_fdset, CR_FD_CORE);
> +	pid_t pid = item->pid.real;
>  	CoreEntry *core;
>  	int ret = -1;
>  
> @@ -954,7 +956,9 @@ static int dump_task_core_all(pid_t pid, const struct proc_pid_stat *stat,
>  	pr_info("Dumping core (pid: %d)\n", pid);
>  	pr_info("----------------------------------------\n");
>  
> -	ret = dump_task_kobj_ids(pid, core);
> +	item->core = core;
> +
> +	ret = dump_task_kobj_ids(item, core);
>  	if (ret)
>  		goto err_free;
>  
> @@ -993,7 +997,6 @@ static int dump_task_core_all(pid_t pid, const struct proc_pid_stat *stat,
>  		goto err_free;
>  
>  err_free:
> -	core_entry_free(core);
>  	pr_info("----------------------------------------\n");
>  
>  	return ret;
> @@ -1606,12 +1609,6 @@ static int dump_one_task(struct pstree_item *item)
>  	if (!cr_fdset)
>  		goto err_cure;
>  
> -	ret = dump_task_files_seized(parasite_ctl, cr_fdset, dfds);
> -	if (ret) {
> -		pr_err("Dump files (pid: %d) failed with %d\n", pid, ret);
> -		goto err_cure;
> -	}
> -
>  	ret = parasite_dump_pages_seized(parasite_ctl, &vma_area_list, cr_fdset);
>  	if (ret) {
>  		pr_err("Can't dump pages (pid: %d) with parasite\n", pid);
> @@ -1630,13 +1627,21 @@ static int dump_one_task(struct pstree_item *item)
>  		goto err_cure;
>  	}
>  
> -	ret = dump_task_core_all(pid, &pps_buf, &misc,
> +	ret = dump_task_core_all(item, &pps_buf, &misc,
>  					parasite_ctl, cr_fdset, &vma_area_list);
>  	if (ret) {
>  		pr_err("Dump core (pid: %d) failed with %d\n", pid, ret);
>  		goto err_cure;
>  	}
>  
> +	if (!shared_fdtable(item)) {
> +		ret = dump_task_files_seized(parasite_ctl, cr_fdset, dfds);
> +		if (ret) {
> +			pr_err("Dump files (pid: %d) failed with %d\n", pid, ret);
> +			goto err_cure;
> +		}
> +	}
> +
>  	ret = dump_task_threads(parasite_ctl, item);
>  	if (ret) {
>  		pr_err("Can't dump threads\n");
> diff --git a/cr-restore.c b/cr-restore.c
> index b3e9572..6b34e83 100644
> --- a/cr-restore.c
> +++ b/cr-restore.c
> @@ -585,13 +585,34 @@ static int pstree_wait_helpers()
>  
>  static int restore_one_alive_task(int pid, CoreEntry *core)
>  {
> +	struct pstree_item *child;
> +
>  	pr_info("Restoring resources\n");
>  
>  	if (pstree_wait_helpers())
>  		return -1;
>  
> -	if (prepare_fds(current))
> -		return -1;
> +	/*
> +	 * Wait all children, who share a current fd table.
> +	 * We should be sure, that children don't use any file
> +	 * descriptor while fdtable is being restored.
> +	 */
> +	list_for_each_entry(child, &current->children, sibling) {
> +		if (!shared_fdtable(child))
> +			continue;
> +		futex_wait_until(child->rst->fdt_lock, FDT_LOCK_SYNC);
> +	}
> +
> +	if (!shared_fdtable(current)) {
> +		if (prepare_fds(current))
> +			return -1;
> +	} else {
> +		/* Notify a parent, that a current is ready for restoring fdtable */
> +		futex_set_and_wake(current->rst->fdt_lock, FDT_LOCK_SYNC);
> +		futex_wait_until(current->parent->rst->fdt_lock, FDT_LOCK_DONE);
> +	}
> +
> +	futex_set_and_wake(current->rst->fdt_lock, FDT_LOCK_DONE);

Move all the futex dances into prepare_fds().

>  
>  	if (prepare_fs(pid))
>  		return -1;
> @@ -671,6 +692,7 @@ static int restore_one_zombie(int pid, int exit_code)
>  {
>  	pr_info("Restoring zombie with %d code\n", exit_code);
>  
> +	futex_set_and_wake(current->rst->fdt_lock, FDT_LOCK_DONE);
>  	if (task_entries != NULL) {
>  		restore_finish_stage(CR_STATE_RESTORE);
>  		zombie_prepare_signals();
> @@ -792,6 +814,9 @@ static inline int fork_with_pid(struct pstree_item *item, unsigned long ns_clone
>  	ca.item = item;
>  	ca.clone_flags = ns_clone_flags;
>  
> +	if (shared_fdtable(item))
> +		ca.clone_flags |= CLONE_FILES;
> +
>  	if (!(ca.clone_flags & CLONE_NEWPID)) {
>  		char buf[32];
>  
> @@ -989,10 +1014,18 @@ static int restore_task_with_children(void *_arg)
>  	int ret;
>  	sigset_t blockmask;
>  
> -	close_safe(&ca->fd);
> -
>  	current = ca->item;
>  
> +	if (ca->clone_flags & CLONE_FILES)
> +		ret = clone_service_fd(false);
> +	else {
> +		close_safe(&ca->fd);

Why is this close only in this branch?

> +		ret = clone_service_fd(true);
> +	}
> +
> +	if (ret != 0)
> +		return -1;
> +
>  	pid = getpid();
>  	if (current->pid.virt != pid) {
>  		pr_err("Pid %d do not match expected %d\n", pid, current->pid.virt);
> @@ -1023,6 +1056,9 @@ static int restore_task_with_children(void *_arg)
>  			exit(-1);
>  	}
>  
> +	if (!shared_fdtable(current))
> +		close_old_fds(current);
> +
>  	/*
>  	 * The block mask will be restored in sigresturn.
>  	 *
> diff --git a/files.c b/files.c
> index 5bf4ffa..9ebef20 100644
> --- a/files.c
> +++ b/files.c
> @@ -496,7 +496,7 @@ static int open_fdinfos(int pid, struct list_head *list, int state)
>  	return ret;
>  }
>  
> -static int close_old_fds(struct pstree_item *me)
> +int close_old_fds(struct pstree_item *me)
>  {
>  	DIR *dir;
>  	struct dirent *de;
> @@ -531,10 +531,6 @@ int prepare_fds(struct pstree_item *me)
>  	u32 ret;
>  	int state;
>  
> -	ret = close_old_fds(me);
> -	if (ret)
> -		goto err;
> -
>  	pr_info("Opening fdinfo-s\n");
>  
>  	for (state = 0; state < ARRAY_SIZE(states); state++) {
> @@ -560,7 +556,6 @@ int prepare_fds(struct pstree_item *me)
>  			break;
>  	}
>  
> -err:
>  	tty_fini_fds();
>  	return ret;
>  }
> diff --git a/include/crtools.h b/include/crtools.h
> index 9d94fd7..815b574 100644
> --- a/include/crtools.h
> +++ b/include/crtools.h
> @@ -8,6 +8,7 @@
>  #include "list.h"
>  #include "util.h"
>  #include "image.h"
> +#include "lock.h"
>  
>  #include "../protobuf/vma.pb-c.h"
>  
> @@ -226,6 +227,12 @@ struct vma_area {
>  #define vma_area_is(vma_area, s)	vma_entry_is(&((vma_area)->vma), s)
>  #define vma_area_len(vma_area)		vma_entry_len(&((vma_area)->vma))
>  
> +enum {
> +	FDT_LOCK_INIT = 0,
> +	FDT_LOCK_SYNC,
> +	FDT_LOCK_DONE,
> +};
> +
>  struct rst_info {
>  	struct list_head	fds;
>  	struct list_head	eventpoll;
> @@ -233,6 +240,8 @@ struct rst_info {
>  
>  	void			*premmapped_addr;
>  	unsigned long		premmapped_len;
> +
> +	futex_t			*fdt_lock;
>  };
>  
>  static inline int in_vma_area(struct vma_area *vma, unsigned long addr)
> diff --git a/include/files.h b/include/files.h
> index fcd9f31..370e3f3 100644
> --- a/include/files.h
> +++ b/include/files.h
> @@ -94,6 +94,7 @@ extern int get_filemap_fd(int pid, VmaEntry *vma_entry);
>  extern int prepare_fs(int pid);
>  extern int set_fd_flags(int fd, int flags);
>  
> +extern int close_old_fds(struct pstree_item *me);
>  #ifndef AT_EMPTY_PATH
>  #define AT_EMPTY_PATH 0x1000
>  #endif
> diff --git a/include/pstree.h b/include/pstree.h
> index e9e01ce..70b990b 100644
> --- a/include/pstree.h
> +++ b/include/pstree.h
> @@ -46,6 +46,16 @@ struct pstree_item {
>  	struct rst_info		rst[0];
>  };
>  
> +static inline int shared_fdtable(struct pstree_item *item) {
> +	return (item->parent && item->parent->state != TASK_HELPER &&
> +		item->core &&
> +		item->core->ids &&
> +		item->parent->core &&
> +		item->parent->core->ids &&
> +		item->core->ids->files_id &&
> +		item->core->ids->files_id == item->parent->core->ids->files_id);

This is awful :( Each pstree should have a bit on it, meaning, that
the fdtable is shared with parent.

> +}
> +
>  extern void free_pstree(struct pstree_item *root_item);
>  extern struct pstree_item *__alloc_pstree_item(bool rst);
>  #define alloc_pstree_item() __alloc_pstree_item(false)
> diff --git a/pstree.c b/pstree.c
> index c9a402e..c5908b6 100644
> --- a/pstree.c
> +++ b/pstree.c
> @@ -5,6 +5,7 @@
>  #include "pstree.h"
>  #include "restorer.h"
>  #include "util.h"
> +#include "lock.h"
>  
>  #include "protobuf.h"
>  #include "protobuf/pstree.pb-c.h"
> @@ -37,6 +38,16 @@ struct pstree_item *__alloc_pstree_item(bool rst)
>  	if (!item)
>  		return NULL;
>  
> +	if (rst) {
> +		item->rst->fdt_lock = shmalloc(sizeof(*item->rst->fdt_lock));
> +		if (item->rst->fdt_lock == NULL) {
> +			xfree(item);
> +			return NULL;
> +		}
> +
> +		futex_init(item->rst->fdt_lock);
> +	}
> +
>  	INIT_LIST_HEAD(&item->children);
>  	INIT_LIST_HEAD(&item->sibling);
>  
> 




More information about the CRIU mailing list