[CRIU] OverlayFS Patch
Pavel Emelyanov
xemul at parallels.com
Wed Aug 5 04:00:16 PDT 2015
Gabriel,
Please, find my comments inline.
> diff --git a/crtools.c b/crtools.c
> index 6af6080..521d0ff 100644
> --- a/crtools.c
> +++ b/crtools.c
> @@ -235,6 +235,7 @@ int main(int argc, char *argv[], char *envp[])
> { "enable-fs", required_argument, 0, 1065 },
> { "enable-external-sharing", no_argument, 0, 1066 },
> { "enable-external-masters", no_argument, 0, 1067 },
> + { "overlayfs", no_argument, 0, 1068 },
With AUFS workaround we ended up with auto-detection scheme -- when
meeting an AUFS mount the opts.aufs is set to true. Can we do the
same with overlayfs?
> { },
> };
>
> @@ -465,6 +466,9 @@ int main(int argc, char *argv[], char *envp[])
> case 1067:
> opts.enable_external_masters = true;
> break;
> + case 1068:
> + opts.overlayfs = true;
> + break;
> case 'M':
> {
> char *aux;
> diff --git a/files.c b/files.c
> index c82920d..2c6cca0 100644
> --- a/files.c
> +++ b/files.c
> @@ -30,6 +30,7 @@
> #include "eventfd.h"
> #include "eventpoll.h"
> #include "fsnotify.h"
> +#include "mount.h"
> #include "signalfd.h"
> #include "namespaces.h"
> #include "tun.h"
> @@ -157,6 +158,45 @@ void show_saved_files(void)
> }
>
> /*
> + * Workaround for the OverlayFS bug present before Kernel 4.2
> + *
> + * When a process has a file open in an OverlayFS directory,
> + * the information in /proc/<pid>/fd/<fd> and /proc/<pid>/fdinfo/<fd>
> + * is wrong, so, if --overlayfs is specified, we grab that information
> + * from the mountinfo table instead.
> + *
> + * This is done every time fill_fdlink is called. See lookup_overlayfs
> + * for more details.
> + */
> +static int fixup_overlayfs(struct fd_parms *p, struct fd_link *link)
> +{
> + struct mount_info *m;
> +
> + if (!link)
> + return 0;
> +
> + m = lookup_overlayfs(link->name, p->stat.st_dev, p->stat.st_ino, p->mnt_id);
> + if (!m)
> + return 0;
NULL here means both -- error occurred and nothing-to-do. Can you
check for errors using the ERR_PTR notation?
> +
> + p->mnt_id = m->mnt_id;
> +
> + /* Stores the correct file path in p->link->name */
Can you extend this comment specifying what the incorrectness is.
> + if (strcmp("./", m->mountpoint) != 0) {
> + char buf[PATH_MAX];
> + int n;
> +
> + strncpy(buf, link->name, PATH_MAX);
> + n = snprintf(link->name, PATH_MAX, "%s/%s", m->mountpoint, buf + 2);
> + if (n >= PATH_MAX) {
> + pr_err("Not enough space to replace %s\n", buf);
> + return -1;
> + }
> + }
> + return 0;
> +}
> +
> +/*
> * The gen_id thing is used to optimize the comparison of shared files.
> * If two files have different gen_ids, then they are different for sure.
> * If it matches, we don't know it and have to call sys_kcmp().
> @@ -206,6 +246,10 @@ int fill_fdlink(int lfd, const struct fd_parms *p, struct fd_link *link)
> }
>
> link->len = len + 1;
> +
> + if (opts.overlayfs)
> + if (fixup_overlayfs((struct fd_parms *)p, link) < 0)
> + return -1;
> return 0;
> }
>
> diff --git a/include/cr_options.h b/include/cr_options.h
> index 19c2f77..011349c 100644
> --- a/include/cr_options.h
> +++ b/include/cr_options.h
> @@ -79,6 +79,7 @@ struct cr_options {
> bool enable_external_sharing;
> bool enable_external_masters;
> bool aufs; /* auto-deteced, not via cli */
> + bool overlayfs;
> };
>
> extern struct cr_options opts;
> diff --git a/include/mount.h b/include/mount.h
> index 0d5fc7f..01da3f5 100644
> --- a/include/mount.h
> +++ b/include/mount.h
> @@ -22,6 +22,8 @@ extern int prepare_mnt_ns(void);
> extern int pivot_root(const char *new_root, const char *put_old);
>
> struct mount_info;
> +struct mount_info *lookup_overlayfs(char *rpath, unsigned int s_dev,
> + unsigned int st_ino, unsigned int mnt_id);
> extern struct mount_info *lookup_mnt_id(unsigned int id);
> extern struct mount_info *lookup_mnt_sdev(unsigned int s_dev);
>
> diff --git a/mount.c b/mount.c
> index acd74f0..6072050 100644
> --- a/mount.c
> +++ b/mount.c
> @@ -127,6 +127,95 @@ static inline int fsroot_mounted(struct mount_info *mi)
> return is_root(mi->root);
> }
>
> +static struct mount_info *__lookup_overlayfs(struct mount_info *list, char *rpath,
> + unsigned int st_dev, unsigned int st_ino,
> + unsigned int mnt_id)
> +{
> + /*
> + * Goes through all entries in the mountinfo table
> + * looking for a mount point that contains the file specified
> + * in rpath. Uses the device number st_dev and the inode number st_ino
> + * to make sure the file is correct.
> + */
> + struct mount_info *mi_ret = NULL;
> + struct mount_info *m;
> +
> + for (m = list; m != NULL; m = m->next) {
> + /* Case 1: If mnt_id is correct, don't do anything */
> + if (st_dev == m->s_dev && mnt_id == m->mnt_id)
> + return NULL;
Shouldn't this if () be under the next one, so that everything that happens
here applies to overlayfs only?
> +
> + /*
> + * Case 2: If overlayFS we try to find the correct mnt_id.
> + *
> + * Note that even if we do find something, we want to keep looking for
> + * Case 1, which takes precedence.
> + */
> + if (m->fstype->code == FSTYPE__OVERLAYFS) {
> + char _abs_path[PATH_MAX];
> + struct stat f_stat;
> + int ret_stat;
> +
> + /* Concatenates m->mountpoint with rpath and attempts to stat the resulting path */
> + if (strcmp("./", m->mountpoint) == 0)
> + ret_stat = stat(rpath + 1, &f_stat);
> + else {
> + int n = snprintf(_abs_path, PATH_MAX, "%s/%s", m->mountpoint + 1, rpath);
> +
> + if (n >= PATH_MAX) {
> + pr_err("Not enough space to concatenate %s and %s\n", m->mountpoint, rpath);
> + return NULL;
> + }
> + ret_stat = stat(_abs_path, &f_stat);
> + }
> +
> + if (ret_stat == 0 && st_dev == f_stat.st_dev && st_ino == f_stat.st_ino)
> + mi_ret = m;
> + }
> + }
> +
> + return mi_ret;
> +}
> +
> +/*
> + * Looks up the mnt_id and path of a file in an overlayFS directory.
> + *
> + * This is useful in order to fix the OverlayFS bug present in the
> + * Linux Kernel before version 4.2. See fixup_overlayfs for details.
> + *
> + * We first enter the mount namespace of the process and check to see
> + * if there are any overlayFS mounted directories in the mountinfo table.
> + * If so, we concatenate the mountpoint with the name of the file, and
> + * stat the resulting path to check if we found the correct device id
> + * and node number. If that is the case, we update the mount id and
> + * link variables with the correct values.
> + *
> + * This update is skipped if there is an entry in the
> + * mountinfo table with the same known device number and
> + * mount id, signifying that we already have the correct id.
> + *
> + * At the end of this procedure, we exit the mount namespace
> + * of the process, back to the original one.
> + */
> +struct mount_info *lookup_overlayfs(char *rpath, unsigned int st_dev,
> + unsigned int st_ino, unsigned int mnt_id)
> +{
> + int ns_old = -1;
> + struct mount_info *mi;
> +
> + if (switch_ns(root_item->pid.real, &mnt_ns_desc, &ns_old) < 0) {
The switch_ns call is quite slow. If you only need to stat the file by path
in proper mount namespace, you can get the namespace's root fd with the
mntns_get_root_fd() call and then do fstatat(mntns_root, path, ...);
> + pr_err("Failed to switch into mount namespace of PID %d\n", root_item->pid.real);
> + return NULL;
> + }
> + mi = __lookup_overlayfs(mntinfo, rpath, st_dev, st_ino, mnt_id);
> +
> + if (restore_ns(ns_old, &mnt_ns_desc)) {
> + pr_err("Failed to restore to old mount namespace %d\n", ns_old);
> + return NULL;
> + }
> + return mi;
> +}
> +
> static struct mount_info *__lookup_mnt_id(struct mount_info *list, int id)
> {
> struct mount_info *m;
> @@ -1365,6 +1454,9 @@ static struct fstype fstypes[32] = {
> .code = FSTYPE__FUSE,
> .dump = always_fail,
> .restore = always_fail,
> + }, {
> + .name = "overlay",
> + .code = FSTYPE__OVERLAYFS,
> },
> };
>
> diff --git a/protobuf/mnt.proto b/protobuf/mnt.proto
> index 6f8e7d1..6e58e1d 100644
> --- a/protobuf/mnt.proto
> +++ b/protobuf/mnt.proto
> @@ -18,6 +18,7 @@ enum fstype {
> MQUEUE = 14;
> FUSE = 15;
> AUTO = 16;
> + OVERLAYFS = 17;
> };
>
> message mnt_entry {
-- Pavel
More information about the CRIU
mailing list