[CRIU] [PATCH 4/6] inotify: Add checkpoint/restore v2

Cyrill Gorcunov gorcunov at openvz.org
Tue Apr 17 09:22:29 EDT 2012


To checkpoint inotify notifiers we need help from the kernel side,
thus the FS_INOTIFY_GET_MARK ioctl code is added.

Other than that

 - the inotify descriptors are collected into global inotify.img file
 - the inotify marks are collected into global inotify-wd.img
 - at restore time they are read and restored with the help
   of inotify_init1 and inotify_add_watch calls
 - file owners are not yet supported, will be addressed in another patch

Signed-off-by: Cyrill Gorcunov <gorcunov at openvz.org>
---
 Makefile          |    1 +
 cr-dump.c         |   14 +++
 cr-restore.c      |    8 ++
 cr-show.c         |   42 ++++++++
 crtools.c         |   14 +++
 include/crtools.h |    6 +
 include/image.h   |   17 +++
 include/inotify.h |   33 ++++++
 include/mount.h   |    2 +
 include/types.h   |    7 ++
 inotify.c         |  304 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 mount.c           |   25 +++++
 12 files changed, 473 insertions(+), 0 deletions(-)
 create mode 100644 include/inotify.h
 create mode 100644 inotify.c

diff --git a/Makefile b/Makefile
index 652c71a..39e3636 100644
--- a/Makefile
+++ b/Makefile
@@ -49,6 +49,7 @@ OBJS		+= namespaces.o
 OBJS		+= uts_ns.o
 OBJS		+= ipc_ns.o
 OBJS		+= mount.o
+OBJS		+= inotify.o
 
 OBJS-BLOB	+= parasite.o
 SRCS-BLOB	+= $(patsubst %.o,%.c,$(OBJS-BLOB))
diff --git a/cr-dump.c b/cr-dump.c
index b3e6fff..8c83fe8 100644
--- a/cr-dump.c
+++ b/cr-dump.c
@@ -34,6 +34,7 @@
 #include "parasite.h"
 #include "parasite-syscall.h"
 #include "files.h"
+#include "inotify.h"
 
 #ifndef CONFIG_X86_64
 # error No x86-32 support yet
@@ -427,6 +428,9 @@ static int do_dump_gen_file(const struct fd_parms *p, int lfd,
 		case FDINFO_PIPE:
 			ret = dump_one_pipe(lfd, e.id, p);
 			break;
+		case FDINFO_INOTIFY:
+			ret = dump_one_inotify(lfd, e.id, p);
+			break;
 		default:
 			ret = dump_one_reg_file(lfd, e.id, p);
 			break;
@@ -559,6 +563,13 @@ static int dump_chrdev(struct fd_parms *p, int lfd, const struct cr_fdset *set)
 	return dump_unsupp_fd(p);
 }
 
+static int dump_inotify(struct fd_parms *p, int lfd, const struct cr_fdset *set)
+{
+	p->type = FDINFO_INOTIFY;	/* do_dump_gen_file() dispatches on this */
+	p->id = MAKE_FD_GENID(p->stat.st_dev, p->stat.st_ino, p->pos);
+	return do_dump_gen_file(p, lfd, set);	/* shared fd dump path */
+}
+
 static int dump_one_file(pid_t pid, int fd, int lfd, char fd_flags,
 		       const struct cr_fdset *cr_fdset)
 {
@@ -575,6 +586,9 @@ static int dump_one_file(pid_t pid, int fd, int lfd, char fd_flags,
 	if (S_ISCHR(p.stat.st_mode))
 		return dump_chrdev(&p, lfd, cr_fdset);
 
+	if (is_inotify(lfd))
+		return dump_inotify(&p, lfd, cr_fdset);
+
 	if (S_ISREG(p.stat.st_mode) ||
             S_ISDIR(p.stat.st_mode) ||
             S_ISFIFO(p.stat.st_mode))
diff --git a/cr-restore.c b/cr-restore.c
index 1cf475c..669cc07 100644
--- a/cr-restore.c
+++ b/cr-restore.c
@@ -37,6 +37,8 @@
 #include "restorer-blob.h"
 #include "crtools.h"
 #include "namespaces.h"
+#include "mount.h"
+#include "inotify.h"
 
 static struct task_entries *task_entries;
 
@@ -287,6 +289,12 @@ static int prepare_shared(void)
 	if (collect_unix_sockets())
 		return -1;
 
+	if (collect_mount_info())
+		return -1;
+
+	if (collect_inotify())
+		return -1;
+
 	list_for_each_entry(pi, &tasks, list) {
 		ret = prepare_shmem_pid(pi->pid);
 		if (ret < 0)
diff --git a/cr-show.c b/cr-show.c
index cf6471c..ea92cb6 100644
--- a/cr-show.c
+++ b/cr-show.c
@@ -62,6 +62,7 @@ static char *fdtype2s(u8 type)
 		[FDINFO_INETSK] = "isk",
 		[FDINFO_PIPE] = "pipe",
 		[FDINFO_UNIXSK] = "usk",
+		[FDINFO_INOTIFY] = "ify",
 	};
 
 	if (type > FDINFO_UND && type < FD_INFO_MAX)
@@ -171,6 +172,47 @@ void show_ghost_file(int fd, struct cr_options *o)
 	pr_img_tail(CR_FD_GHOST_FILE);
 }
 
+/*
+ * Print every inotify_wd_entry stored in the @fd_inotify_wd image,
+ * between the image head and tail markers.  Reading stops as soon as
+ * read_img_eof() returns a non-positive value; @o is not used.
+ */
+void show_inotify_wd(int fd_inotify_wd, struct cr_options *o)
+{
+	struct inotify_wd_entry e;
+
+	pr_img_head(CR_FD_INOTIFY_WD);
+
+	while (read_img_eof(fd_inotify_wd, &e) > 0) {
+		/* wd carries its own label so it can be told apart from id */
+		pr_msg("inotify-wd: id 0x%08x wd 0x%08x s_dev 0x%08x i_ino 0x%016lx mask 0x%08x "
+		       "[fhandle] 0x%08x 0x%08x 0x%016lx:0x%016lx ...\n",
+		       e.id, e.wd, e.s_dev, e.i_ino, e.mask,
+		       e.f_handle.bytes, e.f_handle.type,
+		       e.f_handle.__handle[0],
+		       e.f_handle.__handle[1]);
+	}
+	pr_img_tail(CR_FD_INOTIFY_WD);
+}
+
+/*
+ * Print each inotify_file_entry found in the @fd_inotify image between
+ * the image head and tail markers.  Reading stops at the first
+ * non-positive read_img_eof() result; @o is not consulted.
+ */
+void show_inotify(int fd_inotify, struct cr_options *o)
+{
+	struct inotify_file_entry entry;
+
+	pr_img_head(CR_FD_INOTIFY);
+	for (;;) {
+		if (read_img_eof(fd_inotify, &entry) <= 0)
+			break;
+		pr_msg("inotify: id 0x%08x flags 0x%08x\n", entry.id, entry.flags);
+	}
+	pr_img_tail(CR_FD_INOTIFY);
+}
+
 void show_pipes_data(int fd_pipes, struct cr_options *o)
 {
 	struct pipe_data_entry e;
diff --git a/crtools.c b/crtools.c
index 3a2c9c6..0c4bd0e 100644
--- a/crtools.c
+++ b/crtools.c
@@ -61,6 +61,20 @@ struct cr_fd_desc_tmpl fdset_template[CR_FD_MAX] = {
 		.show	= show_reg_files,
 	},
 
+	/* inotify descriptors */
+	[CR_FD_INOTIFY] = {
+		.fmt	= FMT_FNAME_INOTIFY,
+		.magic	= INOTIFY_MAGIC,
+		.show	= show_inotify,
+	},
+
+	/* inotify watch descriptors (marks) */
+	[CR_FD_INOTIFY_WD] = {
+		.fmt	= FMT_FNAME_INOTIFY_WD,
+		.magic	= INOTIFY_WMAGIC,
+		.show	= show_inotify_wd,
+	},
+
 	/* core data, such as regs and vmas and such */
 	[CR_FD_CORE] = {
 		.fmt	= FMT_FNAME_CORE,
diff --git a/include/crtools.h b/include/crtools.h
index a5903fb..d79878f 100644
--- a/include/crtools.h
+++ b/include/crtools.h
@@ -54,6 +54,8 @@ enum {
 	CR_FD_PIPES,
 	CR_FD_PIPES_DATA,
 	CR_FD_REMAP_FPATH,
+	CR_FD_INOTIFY,
+	CR_FD_INOTIFY_WD,
 	_CR_FD_GLOB_TO,
 
 	CR_FD_MAX
@@ -86,6 +88,8 @@ struct cr_fd_desc_tmpl {
 void show_files(int fd_files, struct cr_options *o);
 void show_pages(int fd_pages, struct cr_options *o);
 void show_reg_files(int fd_reg_files, struct cr_options *o);
+void show_inotify(int fd_inotify, struct cr_options *o);
+void show_inotify_wd(int fd_inotify_wd, struct cr_options *o);
 void show_core(int fd_core, struct cr_options *o);
 void show_mm(int fd_mm, struct cr_options *o);
 void show_vmas(int fd_vma, struct cr_options *o);
@@ -107,6 +111,8 @@ extern struct cr_fd_desc_tmpl fdset_template[CR_FD_MAX];
 #define FMT_FNAME_PAGES		"pages-%d.img"
 #define FMT_FNAME_SHMEM_PAGES	"pages-shmem-%ld.img"
 #define FMT_FNAME_REG_FILES	"reg-files.img"
+#define FMT_FNAME_INOTIFY	"inotify.img"
+#define FMT_FNAME_INOTIFY_WD	"inotify-wd.img"
 #define FMT_FNAME_CORE		"core-%d.img"
 #define FMT_FNAME_MM		"mm-%d.img"
 #define FMT_FNAME_VMAS		"vmas-%d.img"
diff --git a/include/image.h b/include/image.h
index 995e9c1..9c90358 100644
--- a/include/image.h
+++ b/include/image.h
@@ -32,6 +32,8 @@
 #define MM_MAGIC		0x57492820 /* Pskov */
 #define REMAP_FPATH_MAGIC	0x59133954 /* Vologda */
 #define GHOST_FILE_MAGIC	0x52583605 /* Oryol */
+#define INOTIFY_MAGIC		0x48424431 /* Volgograd */
+#define INOTIFY_WMAGIC		0x54562009 /* Svetlogorsk (Rauschen) */
 
 #define PIPEFS_MAGIC	0x50495045
 
@@ -41,6 +43,7 @@ enum fd_types {
 	FDINFO_PIPE,
 	FDINFO_INETSK,
 	FDINFO_UNIXSK,
+	FDINFO_INOTIFY,
 
 	FD_INFO_MAX
 };
@@ -83,6 +86,20 @@ struct ghost_file_entry {
 	u32	mode;
 } __packed;
 
+struct inotify_wd_entry {
+	u32	id;		/* id of the inotify this mark belongs to */
+	u64	i_ino;		/* inode the mask refers to */
+	u32	mask;		/* event mask */
+	u32	s_dev;		/* device the inode lies on */
+	u32	wd;		/* watch descriptor */
+	fh_t	f_handle;	/* encoded file handle, used to reopen the target on restore */
+} __packed;
+
+struct inotify_file_entry {
+	u32	id;		/* id the descriptor was dumped with; marks refer to it */
+	u16	flags;		/* file flags, handed to inotify_init1() on restore */
+} __packed;
+
 struct fdinfo_entry {
 	u32	fd;
 	u8	type;
diff --git a/include/inotify.h b/include/inotify.h
new file mode 100644
index 0000000..68962bc
--- /dev/null
+++ b/include/inotify.h
@@ -0,0 +1,33 @@
+#ifndef INOTIFY_H__
+#define INOTIFY_H__
+
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "compiler.h"
+#include "types.h"
+#include "files.h"
+#include "crtools.h"
+
+#define FS_INOTIFY_GET_MARK		0xc0044912
+
+struct inotify_user_mark {
+	u64				i_ino;	/* inode the mask refers to */
+	u32				mask;	/* event mask */
+	u32				s_dev;	/* device the inode lies on */
+	u32				r_dev;	/* device the inode lies on, if special */
+	u32				wd;	/* watch descriptor */
+	fh_t				f_handle;/* encoded file handle */
+};
+
+struct inotify_mark_req {
+	u32				index;	/* index of the mark we're interested in */
+	u32				size;	/* size of the buffer @mark points to */
+	struct inotify_user_mark	*mark;	/* buffer the FS_INOTIFY_GET_MARK ioctl reports into */
+};
+
+extern int is_inotify(int lfd);
+extern int dump_one_inotify(int lfd, u32 id, const struct fd_parms *p);
+extern int collect_inotify(void);
+
+#endif /* INOTIFY_H__ */
diff --git a/include/mount.h b/include/mount.h
index 608c448..68793bd 100644
--- a/include/mount.h
+++ b/include/mount.h
@@ -4,5 +4,7 @@
 struct proc_mountinfo;
 
 extern int open_mnt_root(unsigned int s_dev, struct proc_mountinfo *mntinfo, int nr_mntinfo);
+extern int open_mount(unsigned int s_dev);
+extern int collect_mount_info(void);
 
 #endif /* MOUNT_H__ */
diff --git a/include/types.h b/include/types.h
index 36d5a8f..fe528ce 100644
--- a/include/types.h
+++ b/include/types.h
@@ -204,4 +204,11 @@ enum kcmp_type {
 # define SCM_MAX_FD	253
 #endif
 
+/* File handle; passed as-is to sys_open_by_handle_at() on inotify restore */
+typedef struct {
+	u32 bytes;		/* NOTE(review): presumably the handle size in bytes -- confirm */
+	u32 type;		/* NOTE(review): presumably the handle type -- confirm */
+	u64 __handle[16];	/* handle payload */
+} fh_t;
+
 #endif /* CR_TYPES_H_ */
diff --git a/inotify.c b/inotify.c
new file mode 100644
index 0000000..1b0d100
--- /dev/null
+++ b/inotify.c
@@ -0,0 +1,304 @@
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <string.h>
+#include <utime.h>
+#include <dirent.h>
+#include <limits.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/inotify.h>
+#include <sys/vfs.h>
+#include <sys/wait.h>
+#include <sys/poll.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <aio.h>
+
+#include "compiler.h"
+#include "types.h"
+#include "inotify.h"
+
+#include "syscall.h"
+#include "crtools.h"
+#include "mount.h"
+#include "image.h"
+#include "util.h"
+#include "files.h"
+#include "file-ids.h"
+#include "log.h"
+#include "list.h"
+#include "lock.h"
+
+struct inotify_wd_info {
+	struct list_head		list;	/* link in inotify_file_info::marks */
+	struct inotify_wd_entry		iwe;	/* mark record read from the inotify-wd image */
+};
+
+struct inotify_file_info {
+	struct list_head		list;	/* link in info_head */
+	struct inotify_file_entry	ife;	/* record read from the inotify image */
+	struct list_head		marks;	/* inotify_wd_info entries sharing ife.id */
+	struct file_desc		d;	/* handed to file_desc_add() by collect_inotify() */
+};
+
+#ifndef ANON_INODE_FS_MAGIC
+#define ANON_INODE_FS_MAGIC 0x09041934
+#endif
+
+static LIST_HEAD(info_head);
+
+/*
+ * Checks if file descriptor @lfd is inotify: returns 1 when it lives on
+ * anon inode fs and links to "anon_inode:inotify", 0 otherwise or on error.
+ */
+int is_inotify(int lfd)
+{
+	char link[PATH_MAX], path[32];
+	struct statfs statfs;
+	ssize_t ret;
+
+	if (fstatfs(lfd, &statfs)) {
+		pr_perror("Can't obtain statfs on fd %d", lfd);
+		return 0;
+	}
+	if (statfs.f_type != ANON_INODE_FS_MAGIC)
+		return 0;
+
+	snprintf(path, sizeof(path), "/proc/self/fd/%d", lfd);
+	/* readlink() doesn't terminate the string, keep one byte for that */
+	ret = readlink(path, link, sizeof(link) - 1);
+	if (ret < 0) {
+		pr_perror("Can't read link of fd %d", lfd);
+		return 0;
+	}
+	link[ret] = 0;
+	return !strcmp(link, "anon_inode:inotify");
+}
+
+/*
+ * Dump the inotify behind @lfd under @id: its inotify_file_entry goes to
+ * the inotify image, then each mark fetched with FS_INOTIFY_GET_MARK goes
+ * to the inotify-wd image.  The entry flags are taken from @p.
+ * Returns 0 on success, -1 when an image write or the ioctl fails.
+ */
+int dump_one_inotify(int lfd, u32 id, const struct fd_parms *p)
+{
+	struct inotify_file_entry ie;
+	struct inotify_wd_entry we;
+	struct inotify_user_mark m;
+	struct inotify_mark_req r;
+	int image_fd, image_wd;
+
+	image_fd = fdset_fd(glob_fdset, CR_FD_INOTIFY);
+	image_wd = fdset_fd(glob_fdset, CR_FD_INOTIFY_WD);
+
+	pr_info("Dumping inotify %d with id 0x%08x\n", lfd, id);
+
+	ie.id	= id;
+	ie.flags= p->flags;
+
+	if (write_img(image_fd, &ie))
+		return -1;
+
+	memzero(&r, sizeof(r));
+	r.index	= -1U;	/* bumped before every ioctl, so the walk starts at 0 */
+	r.size	= sizeof(m);
+	r.mark	= &m;
+
+	while (1) {
+		r.index++;
+		memzero(&m, sizeof(m));
+		if (ioctl(lfd, FS_INOTIFY_GET_MARK, &r)) {
+			if (errno == ENODATA)	/* skip this index */
+				continue;
+			if (errno == ENOENT)	/* ends the walk */
+				break;
+			pr_perror("Fetching inotify failed %d", -errno);
+			return -1;
+		}
+
+		we.id		= id;
+		we.mask		= m.mask;
+		we.i_ino	= m.i_ino;
+		we.s_dev	= m.s_dev;
+		we.wd		= m.wd;
+		we.f_handle	= m.f_handle;
+
+		pr_info("inotify: id 0x%08x flags 0x%08x wd 0x%08x s_dev 0x%08x i_ino 0x%016lx mask 0x%08x\n",
+			ie.id, ie.flags, we.wd, we.s_dev, we.i_ino, we.mask);
+		pr_info("\t[fhandle] bytes 0x%08x type 0x%08x __handle 0x%016lx:0x%016lx\n",
+			we.f_handle.bytes, we.f_handle.type,
+			we.f_handle.__handle[0], we.f_handle.__handle[1]);
+
+		/* the ioctl succeeded here, so fail explicitly on a write error */
+		if (write_img(image_wd, &we))
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Re-add the watch @iwe describes to @inotify_fd under its original wd,
+ * reopening the target by file handle.  Returns 0 on success, -1 on failure.
+ */
+static int restore_one_inotify(int inotify_fd, struct inotify_wd_entry *iwe)
+{
+	char path[32];
+	int mntfd, target, wd, ret = -1;
+
+	mntfd = open_mount(iwe->s_dev);
+	if (mntfd < 0) {
+		pr_err("Mount root for 0x%08x not found\n", iwe->s_dev);
+		return -1;
+	}
+
+	target = sys_open_by_handle_at(mntfd, (void *)&iwe->f_handle, 0);
+	if (target < 0) {
+		pr_perror("Can't open file handle for 0x%08x:0x%016lx",
+			  iwe->s_dev, iwe->i_ino);
+		goto out_mnt;	/* the mount root fd must not leak */
+	}
+
+	snprintf(path, sizeof(path), "/proc/self/fd/%d", target);
+	pr_debug("\t\tRestore watch for 0x%08x:0x%016lx\n", iwe->s_dev, iwe->i_ino);
+
+	/*
+	 * FIXME The kernel allocates wd-s sequentially,
+	 * this is suboptimal, but the kernel doesn't
+	 * provide an API for this yet :(
+	 */
+	for (;;) {
+		wd = inotify_add_watch(inotify_fd, path, iwe->mask);
+		if (wd < 0) {
+			pr_err("Can't add watch for %d with %d\n", inotify_fd, iwe->wd);
+			break;
+		} else if (wd == iwe->wd) {
+			ret = 0;
+			break;
+		} else if (wd > iwe->wd) {
+			pr_err("Usorted watch found for %d with %d\n", inotify_fd, iwe->wd);
+			break;
+		}
+		pr_debug("\t\tWatch got %d but %d expected\n", wd, iwe->wd);
+		inotify_rm_watch(inotify_fd, wd);
+	}
+	close(target);
+out_mnt:
+	close(mntfd);
+	return ret;
+}
+
+/* ->open callback: create the inotify, re-add its marks; returns the fd or -1 */
+static int open_inotify_fd(struct file_desc *d)
+{
+	struct inotify_file_info *info;
+	struct inotify_wd_info *wd_info;
+	int tmp;
+
+	info = container_of(d, struct inotify_file_info, d);
+
+	tmp = inotify_init1(info->ife.flags);
+	if (tmp < 0) {
+		pr_perror("Can't create inotify for 0x%08x", info->ife.id);
+		return -1;
+	}
+
+	list_for_each_entry(wd_info, &info->marks, list) {
+		pr_info("\tRestore inotify for 0x%08x\n", wd_info->iwe.id);
+		if (restore_one_inotify(tmp, &wd_info->iwe)) {
+			close_safe(&tmp);
+			return -1;	/* don't depend on what close_safe() left in tmp */
+		}
+	}
+
+	return tmp;
+}
+
+static struct file_desc_ops desc_ops = {
+	.open = open_inotify_fd,	/* passed to file_desc_add() for FDINFO_INOTIFY entries */
+};
+
+/* Hook @mark onto the inotify whose id it carries; -1 if there is none */
+static int collect_mark(struct inotify_wd_info *mark)
+{
+	struct inotify_file_info *owner;
+
+	list_for_each_entry(owner, &info_head, list) {
+		if (owner->ife.id != mark->iwe.id)
+			continue;
+		list_add(&mark->list, &owner->marks); /* NOTE(review): restore expects ascending wd, confirm order */
+		return 0;
+	}
+	return -1;
+}
+
+/*
+ * Read the inotify and inotify-wd images at restore time: every
+ * inotify_file_entry becomes an inotify_file_info, every mark is hooked
+ * onto the inotify sharing its id, and each inotify is then registered
+ * with file_desc_add().  Returns 0 on success, -1 on failure.
+ */
+int collect_inotify(void)
+{
+	struct inotify_file_info *info;
+	struct inotify_wd_info *mark;
+	struct inotify_file_entry ife;
+	struct inotify_wd_entry iwe;
+	int image_fd, image_wd = -1, ret;
+
+	image_fd = open_image_ro(CR_FD_INOTIFY);
+	if (image_fd < 0)
+		return -1;
+
+	while ((ret = read_img_eof(image_fd, &ife)) > 0) {
+		ret = -1;	/* the positive read result must not leak out on failure */
+		info = xmalloc(sizeof(*info));
+		if (!info)
+			goto err;
+
+		info->ife = ife;
+		INIT_LIST_HEAD(&info->list);
+		INIT_LIST_HEAD(&info->marks);
+		list_add(&info->list, &info_head);
+	}
+	if (ret < 0)
+		goto err;
+
+	ret = -1;	/* EOF left 0 behind, an open failure must not return it */
+	image_wd = open_image_ro(CR_FD_INOTIFY_WD);
+	if (image_wd < 0)
+		goto err;
+
+	/* read into a local first, so no allocated mark is left over at EOF */
+	while ((ret = read_img_eof(image_wd, &iwe)) > 0) {
+		ret = -1;
+		mark = xmalloc(sizeof(*mark));
+		if (!mark)
+			goto err;
+
+		mark->iwe = iwe;
+		if (collect_mark(mark)) {
+			pr_err("Can't find inotify with id 0x%08x\n", mark->iwe.id);
+			goto err;
+		}
+	}
+	if (ret < 0)
+		goto err;
+
+	list_for_each_entry(info, &info_head, list) {
+		pr_info("Collected inotify: id 0x%08x flags 0x%08x\n", info->ife.id, info->ife.flags);
+		file_desc_add(&info->d, FDINFO_INOTIFY, info->ife.id, &desc_ops);
+	}
+	ret = 0;
+err:
+	close_safe(&image_wd);
+	close_safe(&image_fd);
+
+	return ret;
+}
diff --git a/mount.c b/mount.c
index 3db7d34..310ad98 100644
--- a/mount.c
+++ b/mount.c
@@ -10,9 +10,13 @@
 
 #include "types.h"
 #include "util.h"
+#include "log.h"
 #include "mount.h"
 #include "proc_parse.h"
 
+static struct proc_mountinfo *mntinfo;
+static int nr_mntinfo;
+
 /*
  * Returns path for mount device @s_dev
  *
@@ -40,3 +44,24 @@ again:
 
 	return -ENOENT;
 }
+
+int open_mount(unsigned int s_dev)
+{
+	return open_mnt_root(s_dev, mntinfo, nr_mntinfo);	/* uses the table collect_mount_info() filled */
+}
+
+/* Fill the file-wide mntinfo table (64 slots) for this process; 0 or -1 */
+int collect_mount_info(void)
+{
+	const int self = getpid();
+
+	nr_mntinfo = 64;
+	mntinfo = xmalloc(nr_mntinfo * sizeof(*mntinfo));
+	if (!mntinfo)
+		return -1;
+	nr_mntinfo = parse_mountinfo(self, mntinfo, nr_mntinfo);
+	if (nr_mntinfo >= 1)
+		return 0;
+	pr_err("Parsing mountinfo %d failed\n", self);
+	return -1;
+}
-- 
1.7.7.6



More information about the CRIU mailing list