[CRIU] [RFC] fsnotify: Guess the path to open if multiple mount namespaces are present

Cyrill Gorcunov gorcunov at openvz.org
Sun Oct 11 15:15:02 PDT 2015


In a debian-8 container we faced a problem -- systemd creates nested
mount namespaces and inotify watchees are resolved into a path which
is inaccessible on restore (because we're operating in the task's mount
namespace).

To resolve this situation we

 1) Save the mount id for the handle unconditionally.
 2) On restore look up this mount and walk over all
    mountpoints whose device matches the one where the
    handle had been opened initially, attempting to reopen
    it (because the superblock must exist and we're putting
    the mark on the inode itself, it doesn't matter via which
    path we hook it).

Typical output looks like

 | (02.313335)      1: fsnotify:   Restore 0x3 wd for 0x00000000
 | (02.313372)      1: fsnotify:           Restore with path hint 137:root/systemd
 | (02.313465)      1: fsnotify:           Can't open gonna do a guess
 | (02.313501)      1: fsnotify:             handle mnt-id 137 best ns_mountpoint @root for path root/systemd
 | (02.313698)      1: fsnotify:             trying open path run/systemd: 13
 | (02.313839)      1: fsnotify:           Restore inotify watch for 0x0000002b:0x0000000000006608 (via /proc/self/fd/13 -> /run/systemd)

https://jira.sw.ru/browse/PSBM-39957

Signed-off-by: Cyrill Gorcunov <gorcunov at openvz.org>
---

Please don't commit it: I've not yet tested all the details,
I only ran it with a debian container. I need to run it on other
containers first and pass the testsuite as well.

So any ideas are welcome. I still think the best way of handling
this would be patching the kernel to print out @mnt_id for the watchee
in fdinfo output (but we have to support old kernels anyway).

 fsnotify.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 95 insertions(+), 4 deletions(-)

diff --git a/fsnotify.c b/fsnotify.c
index 76d73980d854..7e19acd5ddd3 100644
--- a/fsnotify.c
+++ b/fsnotify.c
@@ -154,6 +154,9 @@ int check_open_handle(unsigned int s_dev, unsigned long i_ino,
 			goto err;
 		}
 
+		f_handle->has_mnt_id = true;
+		f_handle->mnt_id = mi->mnt_id;
+
 		/*
 		 * Inode numbers are not restored for tmpfs content, but we can
 		 * get file names, becasue tmpfs cache is not pruned.
@@ -169,9 +172,6 @@ int check_open_handle(unsigned int s_dev, unsigned long i_ino,
 			if (path == NULL)
 				goto err;
 
-			f_handle->has_mnt_id = true;
-			f_handle->mnt_id = mi->mnt_id;
-
 			goto out;
 		}
 
@@ -413,6 +413,92 @@ const struct fdtype_ops fanotify_dump_ops = {
 	.pre_dump	= pre_dump_one_fanotify,
 };
 
+/*
+ * The saved path to a watchee might be inaccessible on restore
+ * (re-mounted and living in another mount namespace), so walk
+ * over all mounts on device @s_dev and return an O_PATH fd for
+ * the first candidate path which can be opened, or -ENOENT if
+ * none fits. Since an inotify mark is bound to the inode itself
+ * it doesn't matter via which path we access it.
+ */
+static int path_fit_first(const char *path, int mnt_id, unsigned int s_dev)
+{
+	/*
+	 * For example the watchee is fetched as
+	 *
+	 * s_dev:	43
+	 * path:	/root/systemd/ask-password
+	 * mnt_id:	137
+	 *
+	 * while in reality it was opened by the application
+	 * in another namespace as /run/systemd/ask-password
+	 * and this path gets remapped on restore.
+	 *
+	 * Also we have a set of mountpoints from images
+	 *
+	 * mnt_id:	137
+	 * root:	/systemd/inaccessible
+	 * mountpoint:	/root
+	 *
+	 * mnt_id:	60
+	 * root_dev	43
+	 * root		/
+	 * mountpoint	/run
+	 *
+	 *
+	 * So we take @mnt_id from the handle and find
+	 * its @mountpoint, then cut it off from the @path
+	 * and start iterating over the mount entries
+	 * whose device equals the @s_dev, constructing
+	 * the path and trying to open it.
+	 *
+	 * IOW, in the case above it is going to be a try
+	 * to open
+	 *
+	 * 	/root/systemd/ask-password
+	 * 	/run/systemd/ask-password
+	 */
+
+	struct mount_info *handle_m, *m;
+	char new[PATH_MAX];
+	size_t len;
+
+	handle_m = lookup_mnt_id(mnt_id);
+	if (!handle_m) {
+		pr_err("Can't find mountinfo for mnt_id %d\n", mnt_id);
+		return -ENOENT;
+	}
+
+	len = strlen(&handle_m->ns_mountpoint[1]);
+	pr_debug("\t\t  handle mnt-id %d best ns_mountpoint @%s for path %s\n",
+		 mnt_id, &handle_m->ns_mountpoint[1], path);
+
+	if (strncmp(path, &handle_m->ns_mountpoint[1], len)) {
+		pr_err("Handle mountpoint mismatch (%d @%s %s)\n",
+		 mnt_id, &handle_m->ns_mountpoint[1], path);
+		return -ENOENT;
+	}
+
+	new[PATH_MAX - 1] = '\0';
+	for (m = mntinfo; m; m = m->next) {
+		int fd, mntns_root;
+
+		if (m->s_dev != s_dev)
+			continue;
+
+		snprintf(new, sizeof(new) - 1, "%s%s",
+			 &m->ns_mountpoint[1], &path[len]);
+
+		mntns_root = mntns_get_root_by_mnt_id(m->mnt_id);
+		fd = openat(mntns_root, new, O_PATH);
+		pr_debug("\t\t  trying open path %s: %d\n", new, fd);
+		if (fd >= 0)
+			return fd;
+	}
+
+	return -ENOENT;
+}
+
 static char *get_mark_path(const char *who, struct file_remap *remap,
 			   FhEntry *f_handle, unsigned long i_ino,
 			   unsigned int s_dev, char *buf, int *target)
@@ -430,7 +516,7 @@ static char *get_mark_path(const char *who, struct file_remap *remap,
 	} else if (f_handle->path) {
 		int  mntns_root;
 		char *path = ".";
-		uint32_t mnt_id = f_handle->has_mnt_id ? f_handle->mnt_id : -1;
+		int mnt_id = f_handle->has_mnt_id ? f_handle->mnt_id : -1;
 
 
 		/* irmap cache is collected in the root namespaces. */
@@ -442,6 +528,11 @@ static char *get_mark_path(const char *who, struct file_remap *remap,
 
 		pr_debug("\t\tRestore with path hint %d:%s\n", mnt_id, path);
 		*target = openat(mntns_root, path, O_PATH);
+
+		if (*target < 0 && mnt_id > -1) {
+			pr_debug("\t\tCan't open gonna do a guess\n");
+			*target = path_fit_first(path, mnt_id, s_dev);
+		}
 	} else
 		*target = open_handle(s_dev, i_ino, f_handle);
 
-- 
2.4.3



More information about the CRIU mailing list