[CRIU] OverlayFS Patch
Gabriel Guimaraes
gabriellimaguimaraes at gmail.com
Wed Aug 5 12:28:11 PDT 2015
Here is the new post review patch!
2015-08-05 10:51 GMT-07:00 Gabriel Guimaraes <guimaraesg at google.com>:
> Hi Pavel,
>
> Thanks for the review.
>
> On Wed, Aug 5, 2015 at 4:00 AM, Pavel Emelyanov <xemul at parallels.com>
> wrote:
>
>> Gabriel,
>>
>> Please, find my comments inline.
>>
>> > diff --git a/crtools.c b/crtools.c
>> > index 6af6080..521d0ff 100644
>> > --- a/crtools.c
>> > +++ b/crtools.c
>> > @@ -235,6 +235,7 @@ int main(int argc, char *argv[], char *envp[])
>> > { "enable-fs", required_argument,
>> 0, 1065 },
>> > { "enable-external-sharing", no_argument,
>> 0, 1066 },
>> > { "enable-external-masters", no_argument,
>> 0, 1067 },
>> > + { "overlayfs", no_argument,
>> 0, 1068 },
>>
>> With AUFS workaround we ended up with auto-detection scheme -- when
>> meeting an AUFS mount the opts.aufs is set to true. Can we do the
>> same with overlayfs?
>>
>
> Technically we could always enable overlayfs by default, since we only do
> something if there is an overlayfs mounted entry in the mountinfo table.
> Should I enable it always?
>
>
>>
>> > { },
>> > };
>> >
>> > @@ -465,6 +466,9 @@ int main(int argc, char *argv[], char *envp[])
>> > case 1067:
>> > opts.enable_external_masters = true;
>> > break;
>> > + case 1068:
>> > + opts.overlayfs = true;
>> > + break;
>> > case 'M':
>> > {
>> > char *aux;
>> > diff --git a/files.c b/files.c
>> > index c82920d..2c6cca0 100644
>> > --- a/files.c
>> > +++ b/files.c
>> > @@ -30,6 +30,7 @@
>> > #include "eventfd.h"
>> > #include "eventpoll.h"
>> > #include "fsnotify.h"
>> > +#include "mount.h"
>> > #include "signalfd.h"
>> > #include "namespaces.h"
>> > #include "tun.h"
>> > @@ -157,6 +158,45 @@ void show_saved_files(void)
>> > }
>> >
>> > /*
>> > + * Workaround for the OverlayFS bug present before Kernel 4.2
>> > + *
>> > + * When a process has a file open in an OverlayFS directory,
>> > + * the information in /proc/<pid>/fd/<fd> and /proc/<pid>/fdinfo/<fd>
>> > + * is wrong, so, if --overlayfs is specified, we grab that information
>> > + * from the mountinfo table instead.
>> > + *
>> > + * This is done every time fill_fdlink is called. See lookup_overlayfs
>> > + * for more details.
>> > + */
>> > +static int fixup_overlayfs(struct fd_parms *p, struct fd_link *link)
>> > +{
>> > + struct mount_info *m;
>> > +
>> > + if (!link)
>> > + return 0;
>> > +
>> > + m = lookup_overlayfs(link->name, p->stat.st_dev, p->stat.st_ino,
>> p->mnt_id);
>> > + if (!m)
>> > + return 0;
>>
>> NULL here means both -- error occurred and nothing-to-do. Can you
>> check for errors using the ERR_PTR notation?
>>
>> > +
>> > + p->mnt_id = m->mnt_id;
>> > +
>> > + /* Stores the correct file path in p->link->name */
>>
>> Can you extend this comment specifying what the incorrectness is.
>>
> Sure.
>
>
>>
>> > + if (strcmp("./", m->mountpoint) != 0) {
>> > + char buf[PATH_MAX];
>> > + int n;
>> > +
>> > + strncpy(buf, link->name, PATH_MAX);
>> > + n = snprintf(link->name, PATH_MAX, "%s/%s",
>> m->mountpoint, buf + 2);
>> > + if (n >= PATH_MAX) {
>> > + pr_err("Not enough space to replace %s\n", buf);
>> > + return -1;
>> > + }
>> > + }
>> > + return 0;
>> > +}
>> > +
>> > +/*
>> > * The gen_id thing is used to optimize the comparison of shared files.
>> > * If two files have different gen_ids, then they are different for
>> sure.
>> > * If it matches, we don't know it and have to call sys_kcmp().
>> > @@ -206,6 +246,10 @@ int fill_fdlink(int lfd, const struct fd_parms *p,
>> struct fd_link *link)
>> > }
>> >
>> > link->len = len + 1;
>> > +
>> > + if (opts.overlayfs)
>> > + if (fixup_overlayfs((struct fd_parms *)p, link) < 0)
>> > + return -1;
>> > return 0;
>> > }
>> >
>> > diff --git a/include/cr_options.h b/include/cr_options.h
>> > index 19c2f77..011349c 100644
>> > --- a/include/cr_options.h
>> > +++ b/include/cr_options.h
>> > @@ -79,6 +79,7 @@ struct cr_options {
>> > bool enable_external_sharing;
>> > bool enable_external_masters;
>> > bool aufs; /* auto-deteced, not via
>> cli */
>> > + bool overlayfs;
>> > };
>> >
>> > extern struct cr_options opts;
>> > diff --git a/include/mount.h b/include/mount.h
>> > index 0d5fc7f..01da3f5 100644
>> > --- a/include/mount.h
>> > +++ b/include/mount.h
>> > @@ -22,6 +22,8 @@ extern int prepare_mnt_ns(void);
>> > extern int pivot_root(const char *new_root, const char *put_old);
>> >
>> > struct mount_info;
>> > +struct mount_info *lookup_overlayfs(char *rpath, unsigned int s_dev,
>> > + unsigned int st_ino, unsigned int mnt_id);
>> > extern struct mount_info *lookup_mnt_id(unsigned int id);
>> > extern struct mount_info *lookup_mnt_sdev(unsigned int s_dev);
>> >
>> > diff --git a/mount.c b/mount.c
>> > index acd74f0..6072050 100644
>> > --- a/mount.c
>> > +++ b/mount.c
>> > @@ -127,6 +127,95 @@ static inline int fsroot_mounted(struct mount_info
>> *mi)
>> > return is_root(mi->root);
>> > }
>> >
>> > +static struct mount_info *__lookup_overlayfs(struct mount_info *list,
>> char *rpath,
>> > + unsigned int st_dev,
>> unsigned int st_ino,
>> > + unsigned int mnt_id)
>> > +{
>> > + /*
>> > + * Goes through all entries in the mountinfo table
>> > + * looking for a mount point that contains the file specified
>> > + * in rpath. Uses the device number st_dev and the inode number
>> st_ino
>> > + * to make sure the file is correct.
>> > + */
>> > + struct mount_info *mi_ret = NULL;
>> > + struct mount_info *m;
>> > +
>> > + for (m = list; m != NULL; m = m->next) {
>> > + /* Case 1: If mnt_id is correct, don't do anything */
>> > + if (st_dev == m->s_dev && mnt_id == m->mnt_id)
>> > + return NULL;
>>
>> Shouldn't this if () be under the next one, so that everything that
>> happens
>> here applies to overlayfs only?
>
>
> This if () is basically a check to see if the bug exists. We look for all
> instances of the device st_dev and see if any of them match our mnt_id. If
> that's not the case, the bug exists and we want to proceed to try to fix
> it.
>
> I could have left this as a separate for loop to make this more clear, but
> this way we only use one for loop for both. I believe it's better if we
> separate it into two for loops since we prevent some unnecessary stating. I
> can also move this part to lookup_overlayfs, before dealing with
> namespaces. What do you think?
>
>
>>
>> > +
>> > + /*
>> > + * Case 2: If overlayFS we try to find the correct mnt_id.
>> > + *
>> > + * Note that even if we do find something, we want to
>> keep looking for
>> > + * Case 1, which takes precedence.
>> > + */
>> > + if (m->fstype->code == FSTYPE__OVERLAYFS) {
>> > + char _abs_path[PATH_MAX];
>> > + struct stat f_stat;
>> > + int ret_stat;
>> > +
>> > + /* Concatenates m->mountpoint with rpath and
>> attempts to stat the resulting path */
>> > + if (strcmp("./", m->mountpoint) == 0)
>> > + ret_stat = stat(rpath + 1, &f_stat);
>> > + else {
>> > + int n = snprintf(_abs_path, PATH_MAX,
>> "%s/%s", m->mountpoint + 1, rpath);
>> > +
>> > + if (n >= PATH_MAX) {
>> > + pr_err("Not enough space to
>> concatenate %s and %s\n", m->mountpoint, rpath);
>> > + return NULL;
>> > + }
>> > + ret_stat = stat(_abs_path, &f_stat);
>> > + }
>> > +
>> > + if (ret_stat == 0 && st_dev == f_stat.st_dev &&
>> st_ino == f_stat.st_ino)
>> > + mi_ret = m;
>> > + }
>> > + }
>> > +
>> > + return mi_ret;
>> > +}
>> > +
>> > +/*
>> > + * Looks up the mnt_id and path of a file in an overlayFS directory.
>> > + *
>> > + * This is useful in order to fix the OverlayFS bug present in the
>> > + * Linux Kernel before version 4.2. See fixup_overlayfs for details.
>> > + *
>> > + * We first enter the mount namespace of the process and check to see
>> > + * if there are any overlayFS mounted directories in the mountinfo
>> table.
>> > + * If so, we concatenate the mountpoint with the name of the file, and
>> > + * stat the resulting path to check if we found the correct device id
>> > + * and node number. If that is the case, we update the mount id and
>> > + * link variables with the correct values.
>> > + *
>> > + * This update is skipped if there is an entry in the
>> > + * mountinfo table with the same known device number and
>> > + * mount id, signifying that we already have the correct id.
>> > + *
>> > + * At the end of this procedure, we exit the mount namespace
>> > + * of the process, back to the original one.
>> > + */
>> > +struct mount_info *lookup_overlayfs(char *rpath, unsigned int st_dev,
>> > + unsigned int st_ino, unsigned int
>> mnt_id)
>> > +{
>> > + int ns_old = -1;
>> > + struct mount_info *mi;
>> > +
>> > + if (switch_ns(root_item->pid.real, &mnt_ns_desc, &ns_old) < 0) {
>>
>> The switch_ns call is quite slow. If you only need to stat the file by
>> path
>> in proper mount namespace, you can get the namespace's root fd with the
>> mntns_get_root_fd() call and then do fstatat(mntns_root, path, ...);
>
>
> I'll look into this!
>
>
>>
>>
> > + pr_err("Failed to switch into mount namespace of PID
>> %d\n", root_item->pid.real);
>> > + return NULL;
>> > + }
>> > + mi = __lookup_overlayfs(mntinfo, rpath, st_dev, st_ino, mnt_id);
>> > +
>> > + if (restore_ns(ns_old, &mnt_ns_desc)) {
>> > + pr_err("Failed to restore to old mount namespace %d\n",
>> ns_old);
>> > + return NULL;
>> > + }
>> > + return mi;
>> > +}
>> > +
>> > static struct mount_info *__lookup_mnt_id(struct mount_info *list, int
>> id)
>> > {
>> > struct mount_info *m;
>> > @@ -1365,6 +1454,9 @@ static struct fstype fstypes[32] = {
>> > .code = FSTYPE__FUSE,
>> > .dump = always_fail,
>> > .restore = always_fail,
>> > + }, {
>> > + .name = "overlay",
>> > + .code = FSTYPE__OVERLAYFS,
>> > },
>> > };
>> >
>> > diff --git a/protobuf/mnt.proto b/protobuf/mnt.proto
>> > index 6f8e7d1..6e58e1d 100644
>> > --- a/protobuf/mnt.proto
>> > +++ b/protobuf/mnt.proto
>> > @@ -18,6 +18,7 @@ enum fstype {
>> > MQUEUE = 14;
>> > FUSE = 15;
>> > AUTO = 16;
>> > + OVERLAYFS = 17;
>> > };
>> >
>> > message mnt_entry {
>>
>> -- Pavel
>>
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openvz.org/pipermail/criu/attachments/20150805/048b3439/attachment-0001.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: overlayfs.patch
Type: text/x-patch
Size: 8586 bytes
Desc: not available
URL: <http://lists.openvz.org/pipermail/criu/attachments/20150805/048b3439/attachment-0001.bin>
More information about the CRIU
mailing list