[CRIU] OverlayFS Patch

Gabriel Guimaraes gabriellimaguimaraes at gmail.com
Wed Aug 5 12:28:11 PDT 2015


Here is the new post review patch!

2015-08-05 10:51 GMT-07:00 Gabriel Guimaraes <guimaraesg at google.com>:

> Hi Pavel,
>
> Thanks for the review.
>
> On Wed, Aug 5, 2015 at 4:00 AM, Pavel Emelyanov <xemul at parallels.com>
> wrote:
>
>> Gabriel,
>>
>> Please, find my comments inline.
>>
>> > diff --git a/crtools.c b/crtools.c
>> > index 6af6080..521d0ff 100644
>> > --- a/crtools.c
>> > +++ b/crtools.c
>> > @@ -235,6 +235,7 @@ int main(int argc, char *argv[], char *envp[])
>> >               { "enable-fs",                  required_argument,
>> 0, 1065 },
>> >               { "enable-external-sharing",    no_argument,
>> 0, 1066 },
>> >               { "enable-external-masters",    no_argument,
>> 0, 1067 },
>> > +             { "overlayfs",                  no_argument,
>> 0, 1068 },
>>
>> With AUFS workaround we ended up with auto-detection scheme -- when
>> meeting an AUFS mount the opts.aufs is set to true. Can we do the
>> same with overlayfs?
>>
>
> Technically we could always enable overlayfs by default, since we only do
> something if there is an overlayfs mounted entry in the mountinfo table.
> Should I enable it always?
>
>
>>
>> >               { },
>> >       };
>> >
>> > @@ -465,6 +466,9 @@ int main(int argc, char *argv[], char *envp[])
>> >               case 1067:
>> >                       opts.enable_external_masters = true;
>> >                       break;
>> > +             case 1068:
>> > +                     opts.overlayfs = true;
>> > +                     break;
>> >               case 'M':
>> >                       {
>> >                               char *aux;
>> > diff --git a/files.c b/files.c
>> > index c82920d..2c6cca0 100644
>> > --- a/files.c
>> > +++ b/files.c
>> > @@ -30,6 +30,7 @@
>> >  #include "eventfd.h"
>> >  #include "eventpoll.h"
>> >  #include "fsnotify.h"
>> > +#include "mount.h"
>> >  #include "signalfd.h"
>> >  #include "namespaces.h"
>> >  #include "tun.h"
>> > @@ -157,6 +158,45 @@ void show_saved_files(void)
>> >  }
>> >
>> >  /*
>> > + * Workaround for the OverlayFS bug present before Kernel 4.2
>> > + *
>> > + * When a process has a file open in an OverlayFS directory,
>> > + * the information in /proc/<pid>/fd/<fd> and /proc/<pid>/fdinfo/<fd>
>> > + * is wrong, so, if --overlayfs is specified, we grab that information
>> > + * from the mountinfo table instead.
>> > + *
>> > + * This is done every time fill_fdlink is called. See lookup_overlayfs
>> > + * for more details.
>> > + */
>> > +static int fixup_overlayfs(struct fd_parms *p, struct fd_link *link)
>> > +{
>> > +     struct mount_info *m;
>> > +
>> > +     if (!link)
>> > +             return 0;
>> > +
>> > +     m = lookup_overlayfs(link->name, p->stat.st_dev, p->stat.st_ino,
>> p->mnt_id);
>> > +     if (!m)
>> > +             return 0;
>>
>> NULL here means both -- error occurred and nothing-to-do. Can you
>> check for errors using the ERR_PTR notation?
>>
>> > +
>> > +     p->mnt_id = m->mnt_id;
>> > +
>> > +     /* Stores the correct file path in p->link->name */
>>
>> Can you extend this comment specifying what the incorrectness is.
>>
> Sure.
>
>
>>
>> > +     if (strcmp("./", m->mountpoint) != 0) {
>> > +             char buf[PATH_MAX];
>> > +             int n;
>> > +
>> > +             strncpy(buf, link->name, PATH_MAX);
>> > +             n = snprintf(link->name, PATH_MAX, "%s/%s",
>> m->mountpoint, buf + 2);
>> > +             if (n >= PATH_MAX) {
>> > +                     pr_err("Not enough space to replace %s\n", buf);
>> > +                     return -1;
>> > +             }
>> > +     }
>> > +     return 0;
>> > +}
>> > +
>> > +/*
>> >   * The gen_id thing is used to optimize the comparison of shared files.
>> >   * If two files have different gen_ids, then they are different for
>> sure.
>> >   * If it matches, we don't know it and have to call sys_kcmp().
>> > @@ -206,6 +246,10 @@ int fill_fdlink(int lfd, const struct fd_parms *p,
>> struct fd_link *link)
>> >       }
>> >
>> >       link->len = len + 1;
>> > +
>> > +     if (opts.overlayfs)
>> > +             if (fixup_overlayfs((struct fd_parms *)p, link) < 0)
>> > +                     return -1;
>> >       return 0;
>> >  }
>> >
>> > diff --git a/include/cr_options.h b/include/cr_options.h
>> > index 19c2f77..011349c 100644
>> > --- a/include/cr_options.h
>> > +++ b/include/cr_options.h
>> > @@ -79,6 +79,7 @@ struct cr_options {
>> >       bool                    enable_external_sharing;
>> >       bool                    enable_external_masters;
>> >       bool                    aufs;           /* auto-deteced, not via
>> cli */
>> > +     bool                    overlayfs;
>> >  };
>> >
>> >  extern struct cr_options opts;
>> > diff --git a/include/mount.h b/include/mount.h
>> > index 0d5fc7f..01da3f5 100644
>> > --- a/include/mount.h
>> > +++ b/include/mount.h
>> > @@ -22,6 +22,8 @@ extern int prepare_mnt_ns(void);
>> >  extern int pivot_root(const char *new_root, const char *put_old);
>> >
>> >  struct mount_info;
>> > +struct mount_info *lookup_overlayfs(char *rpath, unsigned int s_dev,
>> > +                             unsigned int st_ino, unsigned int mnt_id);
>> >  extern struct mount_info *lookup_mnt_id(unsigned int id);
>> >  extern struct mount_info *lookup_mnt_sdev(unsigned int s_dev);
>> >
>> > diff --git a/mount.c b/mount.c
>> > index acd74f0..6072050 100644
>> > --- a/mount.c
>> > +++ b/mount.c
>> > @@ -127,6 +127,95 @@ static inline int fsroot_mounted(struct mount_info
>> *mi)
>> >       return is_root(mi->root);
>> >  }
>> >
>> > +static struct mount_info *__lookup_overlayfs(struct mount_info *list,
>> char *rpath,
>> > +                                             unsigned int st_dev,
>> unsigned int st_ino,
>> > +                                             unsigned int mnt_id)
>> > +{
>> > +     /*
>> > +      * Goes through all entries in the mountinfo table
>> > +      * looking for a mount point that contains the file specified
>> > +      * in rpath. Uses the device number st_dev and the inode number
>> st_ino
>> > +      * to make sure the file is correct.
>> > +      */
>> > +     struct mount_info *mi_ret = NULL;
>> > +     struct mount_info *m;
>> > +
>> > +     for (m = list; m != NULL; m = m->next) {
>> > +             /* Case 1: If mnt_id is correct, don't do anything */
>> > +             if (st_dev == m->s_dev && mnt_id == m->mnt_id)
>> > +                     return NULL;
>>
>> Shouldn't this if () be under the next one, so that everything that
>> happens
>> here applies to overlayfs only?
>
>
> This if () is basically a check to see if the bug exists. We look for all
> instances of the device st_dev and see if any of them match our mnt_id. If
> that's not the case, the bug exists and we want to proceed to try to fix
> it.
>
> I could have left this as a separate for loop to make this more clear, but
> this way we only use one for loop for both. I believe it's better if we
> separate it into two for loops since we prevent some unnecessary stating. I
> can also move this part to lookup_overlayfs, before dealing with
> namespaces. What do you think?
>
>
>>
>> > +
>> > +             /*
>> > +              * Case 2: If overlayFS we try to find the correct mnt_id.
>> > +              *
>> > +              * Note that even if we do find something, we want to
>> keep looking for
>> > +              * Case 1, which takes precedence.
>> > +              */
>> > +             if (m->fstype->code == FSTYPE__OVERLAYFS) {
>> > +                     char _abs_path[PATH_MAX];
>> > +                     struct stat f_stat;
>> > +                     int ret_stat;
>> > +
>> > +                     /* Concatenates m->mountpoint with rpath and
>> attempts to stat the resulting path */
>> > +                     if (strcmp("./", m->mountpoint) == 0)
>> > +                             ret_stat = stat(rpath + 1, &f_stat);
>> > +                     else {
>> > +                             int n = snprintf(_abs_path, PATH_MAX,
>> "%s/%s", m->mountpoint + 1, rpath);
>> > +
>> > +                             if (n >= PATH_MAX) {
>> > +                                     pr_err("Not enough space to
>> concatenate %s and %s\n", m->mountpoint, rpath);
>> > +                                     return NULL;
>> > +                             }
>> > +                             ret_stat = stat(_abs_path, &f_stat);
>> > +                     }
>> > +
>> > +                     if (ret_stat == 0 && st_dev == f_stat.st_dev &&
>> st_ino == f_stat.st_ino)
>> > +                             mi_ret = m;
>> > +             }
>> > +     }
>> > +
>> > +     return mi_ret;
>> > +}
>> > +
>> > +/*
>> > + * Looks up the mnt_id and path of a file in an overlayFS directory.
>> > + *
>> > + * This is useful in order to fix the OverlayFS bug present in the
>> > + * Linux Kernel before version 4.2. See fixup_overlayfs for details.
>> > + *
>> > + * We first enter the mount namespace of the process and check to see
>> > + * if there are any overlayFS mounted directories in the mountinfo
>> table.
>> > + * If so, we concatenate the mountpoint with the name of the file, and
>> > + * stat the resulting path to check if we found the correct device id
>> > + * and node number. If that is the case, we update the mount id and
>> > + * link variables with the correct values.
>> > + *
>> > + * This update is skipped if there is an entry in the
>> > + * mountinfo table with the same known device number and
>> > + * mount id, signifying that we already have the correct id.
>> > + *
>> > + * At the end of this procedure, we exit the mount namespace
>> > + * of the process, back to the original one.
>> > + */
>> > +struct mount_info *lookup_overlayfs(char *rpath, unsigned int st_dev,
>> > +                                     unsigned int st_ino, unsigned int
>> mnt_id)
>> > +{
>> > +     int ns_old = -1;
>> > +     struct mount_info *mi;
>> > +
>> > +     if (switch_ns(root_item->pid.real, &mnt_ns_desc, &ns_old) < 0) {
>>
>> The switch_ns call is quite slow. If you only need to stat the file by
>> path
>> in proper mount namespace, you can get the namespace's root fd with the
>> mntns_get_root_fd() call and then do fstatat(mntns_root, path, ...);
>
>
> I'll look into this!
>
>
>>
>>
> > +             pr_err("Failed to switch into mount namespace of PID
>> %d\n", root_item->pid.real);
>> > +             return NULL;
>> > +     }
>> > +     mi = __lookup_overlayfs(mntinfo, rpath, st_dev, st_ino, mnt_id);
>> > +
>> > +     if (restore_ns(ns_old, &mnt_ns_desc)) {
>> > +             pr_err("Failed to restore to old mount namespace %d\n",
>> ns_old);
>> > +             return NULL;
>> > +     }
>> > +     return mi;
>> > +}
>> > +
>> >  static struct mount_info *__lookup_mnt_id(struct mount_info *list, int
>> id)
>> >  {
>> >       struct mount_info *m;
>> > @@ -1365,6 +1454,9 @@ static struct fstype fstypes[32] = {
>> >               .code = FSTYPE__FUSE,
>> >               .dump = always_fail,
>> >               .restore = always_fail,
>> > +     }, {
>> > +             .name = "overlay",
>> > +             .code = FSTYPE__OVERLAYFS,
>> >       },
>> >  };
>> >
>> > diff --git a/protobuf/mnt.proto b/protobuf/mnt.proto
>> > index 6f8e7d1..6e58e1d 100644
>> > --- a/protobuf/mnt.proto
>> > +++ b/protobuf/mnt.proto
>> > @@ -18,6 +18,7 @@ enum fstype {
>> >       MQUEUE                  = 14;
>> >       FUSE                    = 15;
>> >       AUTO                    = 16;
>> > +     OVERLAYFS               = 17;
>> >  };
>> >
>> >  message mnt_entry {
>>
>> -- Pavel
>>
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openvz.org/pipermail/criu/attachments/20150805/048b3439/attachment-0001.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: overlayfs.patch
Type: text/x-patch
Size: 8586 bytes
Desc: not available
URL: <http://lists.openvz.org/pipermail/criu/attachments/20150805/048b3439/attachment-0001.bin>


More information about the CRIU mailing list