[CRIU] [PATCH v4 2/3] mount: Forced mount unmounted binfmt_misc to do not lost its content

Kirill Tkhai ktkhai at virtuozzo.com
Tue Aug 2 06:25:10 PDT 2016


On 02.08.2016 16:00, Pavel Emelyanov wrote:
> On 07/20/2016 01:13 PM, Kirill Tkhai wrote:
>> Umount does not remove binfmt_misc content. If it's mounted once again,
>> the same entries remain registered.
>>
>> Criu does not dump content of umounted binfmt_misc. So, after C/R we
>> lose it at all.
>>
>> This patch forces mounting of unmounted binfmt_misc after we collected
>> mountpoints. If it's unmounted, we mount it back and add this mount
>> to the tree of collected mounted mountpoints. Further, binfmt_misc
>> content is dumped in usual way with the only difference, that mount
>> point itself is not dumped.
>>
>> v2: Print error in case of umount() fail.
>>     Move add_forced_mount() to another patch.
>> v3: Close binfmt_misc dir before its umount().
>> v4: Do not dump forced mounted mountpoint.
>>
>> Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
>> ---
>>  criu/cr-dump.c       |    1 
>>  criu/include/mount.h |    1 
>>  criu/mount.c         |  155 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>  3 files changed, 156 insertions(+), 1 deletion(-)
>>
>> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
>> index febef16..b53e45f 100644
>> --- a/criu/cr-dump.c
>> +++ b/criu/cr-dump.c
>> @@ -1616,6 +1616,7 @@ static int cr_dump_finish(int ret)
>>  	if (ret || post_dump_ret || opts.final_state == TASK_ALIVE) {
>>  		network_unlock();
>>  		delete_link_remaps();
>> +		clean_cr_time_mounts();
>>  	}
>>  	pstree_switch_state(root_item,
>>  			    (ret || post_dump_ret) ?
>> diff --git a/criu/include/mount.h b/criu/include/mount.h
>> index c7992ac..c8668c9 100644
>> --- a/criu/include/mount.h
>> +++ b/criu/include/mount.h
>> @@ -127,5 +127,6 @@ extern int ext_mount_add(char *key, char *val);
>>  extern int mntns_maybe_create_roots(void);
>>  extern int read_mnt_ns_img(void);
>>  extern void cleanup_mnt_ns(void);
>> +extern void clean_cr_time_mounts(void);
>>  
>>  #endif /* __CR_MOUNT_H__ */
>> diff --git a/criu/mount.c b/criu/mount.c
>> index 6512f51..d583429 100644
>> --- a/criu/mount.c
>> +++ b/criu/mount.c
>> @@ -40,6 +40,7 @@
>>  #undef	LOG_PREFIX
>>  #define LOG_PREFIX "mnt: "
>>  
>> +#define BINFMT_MISC_HOME "/proc/sys/fs/binfmt_misc"
>>  static struct fstype fstypes[];
>>  
>>  int ext_mount_add(char *key, char *val)
>> @@ -1135,6 +1136,53 @@ static int attach_option(struct mount_info *pm, char *opt)
>>  	return pm->options ? 0 : -1;
>>  }
>>  
>> +static int add_cr_time_mount(struct mount_info *root, char *fsname, const char *path, unsigned int s_dev)
>> +{
>> +	struct mount_info *mi, *t, *parent;
>> +
>> +	mi = mnt_entry_alloc();
>> +	if (!mi)
>> +		return -1;
>> +	mi->mountpoint = xmalloc(strlen(path) + 2);
>> +	if (!mi->mountpoint)
>> +		return -1;
>> +	mi->ns_mountpoint = mi->mountpoint;
>> +	sprintf(mi->mountpoint, ".%s", path);
>> +	mi->mnt_id = mi->flags = mi->sb_flags = 0;
>> +	mi->root = xstrdup("/");
>> +	mi->fsname = xstrdup(fsname);
>> +	mi->source = xstrdup(fsname);
>> +	mi->options = xstrdup("");
>> +	if (!mi->root || !mi->fsname || !mi->source || !mi->options)
>> +		return -1;
>> +	mi->fstype = find_fstype_by_name(fsname);
>> +
>> +	mi->s_dev = mi->s_dev_rt = s_dev;
>> +
>> +	parent = root;
>> +	while (1) {
>> +		list_for_each_entry(t, &parent->children, siblings) {
>> +			if (strstr(mi->mountpoint, t->mountpoint) == mi->mountpoint) {
> 
> The strstr search for substring, you need to check whether one string
> _starts_ with another, don't you? We have strstartswith() helper for it.

OK

> 
>> +				parent = t;
>> +				break;
>> +			}
>> +		}
>> +		if (&t->siblings == &parent->children)
>> +			break;
>> +	}
>> +
>> +	mi->nsid = parent->nsid;
>> +	mi->parent = parent;
>> +	mi->parent_mnt_id = parent->mnt_id;
>> +	mi->next = parent->next;
>> +	parent->next = mi;
>> +	list_add(&mi->siblings, &parent->children);
>> +	pr_info("Add cr-time mountpoint %s with parent %s(%u)\n",
>> +		mi->mountpoint, parent->mountpoint, parent->mnt_id);
>> +	return 0;
>> +}
>> +
>> +
>>  /* Is it mounted w or w/o the newinstance option */
>>  static int devpts_parse(struct mount_info *pm)
>>  {
>> @@ -1272,6 +1320,65 @@ static int devtmpfs_restore(struct mount_info *pm)
>>  	return ret;
>>  }
>>  
>> +static int try_mount_binfmt_misc(struct ns_id *ns, unsigned int *s_dev)
>> +{
>> +	int num, mnt_fd, ret, exit_code = -1;
>> +	struct dirent *de;
>> +	struct stat st;
>> +	DIR *dir;
>> +
>> +	ret = switch_ns(ns->ns_pid, &mnt_ns_desc, &mnt_fd);
>> +	if (ret < 0) {
>> +		pr_err("Can't switch mnt_ns\n");
>> +		goto out;
>> +	}
>> +
>> +	ret = mount("binfmt_misc", BINFMT_MISC_HOME, "binfmt_misc", 0, NULL);
>> +	if (ret < 0) {
>> +		if (errno == EPERM) {
>> +			pr_info("Can't mount binfmt_misc: EPERM. Running in user_ns?\n");
>> +			exit_code = 0;
>> +			goto restore_ns;
>> +		}
>> +		if (errno != EBUSY && errno != ENODEV && errno != ENOENT) {
>> +			pr_perror("Can't mount binfmt_misc");
>> +			goto restore_ns;
>> +		}
>> +		pr_info("Prepare binfmt_misc: skipping(%d)\n", errno);
>> +	} else {
>> +		dir = opendir(BINFMT_MISC_HOME);
>> +		if (!dir) {
>> +			pr_perror("Can't read binfmt_misc dir");
>> +			goto restore_ns;
>> +		}
>> +
>> +		num = 0;
>> +		/* ".", "..", "register", "status" */
>> +		while (num <= 4 && (de = readdir(dir)) != NULL)
>> +			num++;
>> +		closedir(dir);
>> +		if (num <= 4) {
>> +			/* No entries */
>> +			if (umount(BINFMT_MISC_HOME) < 0)
>> +				pr_perror("Can't umount "BINFMT_MISC_HOME"\n");
>> +		} else {
>> +			if (stat(BINFMT_MISC_HOME, &st) < 0)
>> +				pr_perror("Can't stat on binfmt_misc path\n");
>> +			else {
>> +				*s_dev = st.st_dev;
>> +				exit_code = 1;
>> +			}
>> +			goto restore_ns;
>> +		}
> 
> What does this else branch do? Why can't we just add the entry and
> let ->dump callback do whatever is appropriate?

This branch searches for dev of newly mounted binfmt_misc, because
->dump callback checks for mount is virtual or not. I can pass 0 as
dev here if you want. Do you?

>> +	}
>> +
>> +	exit_code = 0;
>> +restore_ns:
>> +	ret = restore_ns(mnt_fd, &mnt_ns_desc);
>> +out:
>> +	return ret ? -1 : exit_code;
>> +}
>> +
>>  static int binfmt_misc_virtual(struct mount_info *pm)
>>  {
>>  	return kerndat_fs_virtualized(KERNDAT_FS_STAT_BINFMT_MISC, pm->s_dev);
>> @@ -1873,6 +1980,11 @@ static int dump_one_mountpoint(struct mount_info *pm, struct cr_img *img)
>>  	if (!pm->dumped && dump_one_fs(pm))
>>  		return -1;
>>  
>> +	if (pm->mnt_id == 0) {
>> +		pr_info("Skip dumping cr-time mountpoint: %s\n", pm->mountpoint);
>> +		return 0;
>> +	}
>> +
>>  	me.mnt_id		= pm->mnt_id;
>>  	me.root_dev		= pm->s_dev;
>>  	me.parent_mnt_id	= pm->parent_mnt_id;
>> @@ -1931,7 +2043,8 @@ static void free_mntinfo(struct mount_info *pms)
>>  
>>  struct mount_info *collect_mntinfo(struct ns_id *ns, bool for_dump)
>>  {
>> -	struct mount_info *pm;
>> +	struct mount_info *pm, *mi;
>> +	int ret;
>>  
>>  	pm = parse_mountinfo(ns->ns_pid, ns, for_dump);
>>  	if (!pm) {
>> @@ -1943,6 +2056,22 @@ struct mount_info *collect_mntinfo(struct ns_id *ns, bool for_dump)
>>  	if (ns->mnt.mntinfo_tree == NULL)
>>  		goto err;
>>  
>> +	if (for_dump && ns->type == NS_ROOT) {
>> +		for (mi = pm; mi != NULL; mi = mi->next) {
>> +			if (strcmp(mi->fstype->name, "binfmt_misc") == 0)
>> +				break;
>> +		}
> 
> You don't have to search for the entry, just declare the ->parse callback
> and set the boolean flag. Or add flag (or count) on fstype struct and count
> for mountpoints of desired fstype.

What is "just declare the ->parse callback"? 
 
>> +		if (!mi) {
>> +			unsigned int s_dev = 0;
>> +			ret = try_mount_binfmt_misc(ns, &s_dev);
>> +			if (ret < 0)
>> +				return NULL;
>> +			if (ret > 0 && add_cr_time_mount(ns->mnt.mntinfo_tree, "binfmt_misc",
>> +							 BINFMT_MISC_HOME, s_dev) < 0)
>> +				return NULL;
>> +		}
>> +	}
>> +
>>  	ns->mnt.mntinfo_list = pm;
>>  	return pm;
>>  err:
>> @@ -3683,4 +3812,28 @@ int dump_mnt_namespaces(void)
>>  	return 0;
>>  }
>>  
>> +void clean_cr_time_mounts(void)
>> +{
>> +	struct mount_info *mi;
>> +	int mnt_fd, ret;
>> +
>> +	for (mi = mntinfo; mi; mi = mi->next) {
>> +		if (mi->mnt_id)
>> +			continue;
>> +		ret = switch_ns(mi->nsid->ns_pid, &mnt_ns_desc, &mnt_fd);
>> +		if (ret) {
>> +			pr_err("Can't switch to pid's %u mnt_ns\n", mi->nsid->ns_pid);
>> +			continue;
>> +		}
>> +
>> +		if (umount(mi->mountpoint) < 0)
>> +			pr_perror("Can't umount forced mount %s\n", mi->mountpoint);
>> +
>> +		if (restore_ns(mnt_fd, &mnt_ns_desc)) {
> 
> Can we restore it once outside the loop?

How can we restore hypothetical several forced mounts without a loop?
 
>> +			pr_err("cleanup_forced_mounts exiting with wrong mnt_ns\n");
>> +			return;
>> +		}
>> +	}
>> +}
>> +
>>  struct ns_desc mnt_ns_desc = NS_DESC_ENTRY(CLONE_NEWNS, "mnt");
>>
>> _______________________________________________
>> CRIU mailing list
>> CRIU at openvz.org
>> https://lists.openvz.org/mailman/listinfo/criu
>> .
>>
> 


More information about the CRIU mailing list