[CRIU] OverlayFS Patch

Pavel Emelyanov xemul at parallels.com
Wed Aug 5 04:00:16 PDT 2015


Gabriel,

Please, find my comments inline.

> diff --git a/crtools.c b/crtools.c
> index 6af6080..521d0ff 100644
> --- a/crtools.c
> +++ b/crtools.c
> @@ -235,6 +235,7 @@ int main(int argc, char *argv[], char *envp[])
>  		{ "enable-fs",			required_argument,	0, 1065 },
>  		{ "enable-external-sharing", 	no_argument, 		0, 1066 },
>  		{ "enable-external-masters", 	no_argument, 		0, 1067 },
> +		{ "overlayfs",			no_argument,		0, 1068 },

With AUFS workaround we ended up with auto-detection scheme -- when
meeting an AUFS mount the opts.aufs is set to true. Can we do the
same with overlayfs?

>  		{ },
>  	};
>  
> @@ -465,6 +466,9 @@ int main(int argc, char *argv[], char *envp[])
>  		case 1067:
>  			opts.enable_external_masters = true;
>  			break;
> +		case 1068:
> +			opts.overlayfs = true;
> +			break;
>  		case 'M':
>  			{
>  				char *aux;
> diff --git a/files.c b/files.c
> index c82920d..2c6cca0 100644
> --- a/files.c
> +++ b/files.c
> @@ -30,6 +30,7 @@
>  #include "eventfd.h"
>  #include "eventpoll.h"
>  #include "fsnotify.h"
> +#include "mount.h"
>  #include "signalfd.h"
>  #include "namespaces.h"
>  #include "tun.h"
> @@ -157,6 +158,45 @@ void show_saved_files(void)
>  }
>  
>  /*
> + * Workaround for the OverlayFS bug present before Kernel 4.2
> + *
> + * When a process has a file open in an OverlayFS directory,
> + * the information in /proc/<pid>/fd/<fd> and /proc/<pid>/fdinfo/<fd>
> + * is wrong, so, if --overlayfs is specified, we grab that information
> + * from the mountinfo table instead.
> + *
> + * This is done every time fill_fdlink is called. See lookup_overlayfs
> + * for more details.
> + */
> +static int fixup_overlayfs(struct fd_parms *p, struct fd_link *link)
> +{
> +	struct mount_info *m;
> +
> +	if (!link)
> +		return 0;
> +
> +	m = lookup_overlayfs(link->name, p->stat.st_dev, p->stat.st_ino, p->mnt_id);
> +	if (!m)
> +		return 0;

NULL here means both -- error occurred and nothing-to-do. Can you
check for errors using the ERR_PTR notation?

> +
> +	p->mnt_id = m->mnt_id;
> +
> +	/* Stores the correct file path in p->link->name */

Can you extend this comment specifying what the incorrectness is.

> +	if (strcmp("./", m->mountpoint) != 0) {
> +		char buf[PATH_MAX];
> +		int n;
> +
> +		strncpy(buf, link->name, PATH_MAX);
> +		n = snprintf(link->name, PATH_MAX, "%s/%s", m->mountpoint, buf + 2);
> +		if (n >= PATH_MAX) {
> +			pr_err("Not enough space to replace %s\n", buf);
> +			return -1;
> +		}
> +	}
> +	return 0;
> +}
> +
> +/*
>   * The gen_id thing is used to optimize the comparison of shared files.
>   * If two files have different gen_ids, then they are different for sure.
>   * If it matches, we don't know it and have to call sys_kcmp().
> @@ -206,6 +246,10 @@ int fill_fdlink(int lfd, const struct fd_parms *p, struct fd_link *link)
>  	}
>  
>  	link->len = len + 1;
> +
> +	if (opts.overlayfs)
> +		if (fixup_overlayfs((struct fd_parms *)p, link) < 0)
> +			return -1;
>  	return 0;
>  }
>  
> diff --git a/include/cr_options.h b/include/cr_options.h
> index 19c2f77..011349c 100644
> --- a/include/cr_options.h
> +++ b/include/cr_options.h
> @@ -79,6 +79,7 @@ struct cr_options {
>  	bool			enable_external_sharing;
>  	bool			enable_external_masters;
>  	bool			aufs;		/* auto-deteced, not via cli */
> +	bool			overlayfs;
>  };
>  
>  extern struct cr_options opts;
> diff --git a/include/mount.h b/include/mount.h
> index 0d5fc7f..01da3f5 100644
> --- a/include/mount.h
> +++ b/include/mount.h
> @@ -22,6 +22,8 @@ extern int prepare_mnt_ns(void);
>  extern int pivot_root(const char *new_root, const char *put_old);
>  
>  struct mount_info;
> +struct mount_info *lookup_overlayfs(char *rpath, unsigned int s_dev,
> +				unsigned int st_ino, unsigned int mnt_id);
>  extern struct mount_info *lookup_mnt_id(unsigned int id);
>  extern struct mount_info *lookup_mnt_sdev(unsigned int s_dev);
>  
> diff --git a/mount.c b/mount.c
> index acd74f0..6072050 100644
> --- a/mount.c
> +++ b/mount.c
> @@ -127,6 +127,95 @@ static inline int fsroot_mounted(struct mount_info *mi)
>  	return is_root(mi->root);
>  }
>  
> +static struct mount_info *__lookup_overlayfs(struct mount_info *list, char *rpath,
> +						unsigned int st_dev, unsigned int st_ino,
> +						unsigned int mnt_id)
> +{
> +	/*
> +	 * Goes through all entries in the mountinfo table
> +	 * looking for a mount point that contains the file specified
> +	 * in rpath. Uses the device number st_dev and the inode number st_ino
> +	 * to make sure the file is correct.
> +	 */
> +	struct mount_info *mi_ret = NULL;
> +	struct mount_info *m;
> +
> +	for (m = list; m != NULL; m = m->next) {
> +		/* Case 1: If mnt_id is correct, don't do anything */
> +		if (st_dev == m->s_dev && mnt_id == m->mnt_id)
> +			return NULL;

Shouldn't this if () be under the next one, so that everything that happens
here applies to overlayfs only?

> +
> +		/*
> +		 * Case 2: If overlayFS we try to find the correct mnt_id.
> +		 *
> +		 * Note that even if we do find something, we want to keep looking for
> +		 * Case 1, which takes precedence.
> +		 */
> +		if (m->fstype->code == FSTYPE__OVERLAYFS) {
> +			char _abs_path[PATH_MAX];
> +			struct stat f_stat;
> +			int ret_stat;
> +
> +			/* Concatenates m->mountpoint with rpath and attempts to stat the resulting path */
> +			if (strcmp("./", m->mountpoint) == 0)
> +				ret_stat = stat(rpath + 1, &f_stat);
> +			else {
> +				int n = snprintf(_abs_path, PATH_MAX, "%s/%s", m->mountpoint + 1, rpath);
> +
> +				if (n >= PATH_MAX) {
> +					pr_err("Not enough space to concatenate %s and %s\n", m->mountpoint, rpath);
> +					return NULL;
> +				}
> +				ret_stat = stat(_abs_path, &f_stat);
> +			}
> +
> +			if (ret_stat == 0 && st_dev == f_stat.st_dev && st_ino == f_stat.st_ino)
> +				mi_ret = m;
> +		}
> +	}
> +
> +	return mi_ret;
> +}
> +
> +/*
> + * Looks up the mnt_id and path of a file in an overlayFS directory.
> + *
> + * This is useful in order to fix the OverlayFS bug present in the
> + * Linux Kernel before version 4.2. See fixup_overlayfs for details.
> + *
> + * We first enter the mount namespace of the process and check to see
> + * if there are any overlayFS mounted directories in the mountinfo table.
> + * If so, we concatenate the mountpoint with the name of the file, and
> + * stat the resulting path to check if we found the correct device id
> + * and node number. If that is the case, we update the mount id and
> + * link variables with the correct values.
> + *
> + * This update is skipped if there is an entry in the
> + * mountinfo table with the same known device number and
> + * mount id, signifying that we already have the correct id.
> + *
> + * At the end of this procedure, we exit the mount namespace
> + * of the process, back to the original one.
> + */
> +struct mount_info *lookup_overlayfs(char *rpath, unsigned int st_dev,
> +					unsigned int st_ino, unsigned int mnt_id)
> +{
> +	int ns_old = -1;
> +	struct mount_info *mi;
> +
> +	if (switch_ns(root_item->pid.real, &mnt_ns_desc, &ns_old) < 0) {

The switch_ns call is quite slow. If you only need to stat the file by path
in proper mount namespace, you can get the namespace's root fd with the
mntns_get_root_fd() call and then do fstatat(mntns_root, path, ...);

> +		pr_err("Failed to switch into mount namespace of PID %d\n", root_item->pid.real);
> +		return NULL;
> +	}
> +	mi = __lookup_overlayfs(mntinfo, rpath, st_dev, st_ino, mnt_id);
> +
> +	if (restore_ns(ns_old, &mnt_ns_desc)) {
> +		pr_err("Failed to restore to old mount namespace %d\n", ns_old);
> +		return NULL;
> +	}
> +	return mi;
> +}
> +
>  static struct mount_info *__lookup_mnt_id(struct mount_info *list, int id)
>  {
>  	struct mount_info *m;
> @@ -1365,6 +1454,9 @@ static struct fstype fstypes[32] = {
>  		.code = FSTYPE__FUSE,
>  		.dump = always_fail,
>  		.restore = always_fail,
> +	}, {
> +		.name = "overlay",
> +		.code = FSTYPE__OVERLAYFS,
>  	},
>  };
>  
> diff --git a/protobuf/mnt.proto b/protobuf/mnt.proto
> index 6f8e7d1..6e58e1d 100644
> --- a/protobuf/mnt.proto
> +++ b/protobuf/mnt.proto
> @@ -18,6 +18,7 @@ enum fstype {
>  	MQUEUE			= 14;
>  	FUSE			= 15;
>  	AUTO			= 16;
> +	OVERLAYFS		= 17;
>  };
>  
>  message mnt_entry {

-- Pavel


More information about the CRIU mailing list