From 543baa6c4d81429441cb47828768bda1122da049 Mon Sep 17 00:00:00 2001 From: Saied Kazemi Date: Tue, 19 Aug 2014 22:31:07 -0700 Subject: [PATCH] Added AUFS support. The AUFS support code handles the "bad" information that we get from the kernel in /proc/<pid>/map_files and /proc/<pid>/mountinfo files. For details see comments in sysfs_parse.c. The main motivation for this work was dumping and restoring Docker containers which by default use the AUFS graph driver. For dump, --aufs-root should be added to the command line options. For restore, there is no need for AUFS-specific command line options but the container's AUFS filesystem should already be set up before calling criu restore. Signed-off-by: Saied Kazemi --- Makefile.crtools | 1 + cr-dump.c | 18 +++ crtools.c | 7 ++ include/cr_options.h | 2 + include/fs-magic.h | 4 + include/proc_parse.h | 3 + include/sysfs_parse.h | 14 +++ include/vma.h | 10 +- mount.c | 7 +- proc_parse.c | 50 +++++++- protobuf/mnt.proto | 1 + sysfs_parse.c | 316 ++++++++++++++++++++++++++++++++++++++++++++++++++ 12 files changed, 428 insertions(+), 5 deletions(-) create mode 100644 include/sysfs_parse.h create mode 100644 sysfs_parse.c diff --git a/Makefile.crtools b/Makefile.crtools index 6033b2c..ef30b05 100644 --- a/Makefile.crtools +++ b/Makefile.crtools @@ -9,6 +9,7 @@ obj-y += image-desc.o obj-y += net.o obj-y += tun.o obj-y += proc_parse.o +obj-y += sysfs_parse.o obj-y += cr-dump.o obj-y += cr-show.o obj-y += cr-check.o diff --git a/cr-dump.c b/cr-dump.c index 110dc62..cec579e 100644 --- a/cr-dump.c +++ b/cr-dump.c @@ -71,6 +71,7 @@ #include "cr-service.h" #include "plugin.h" #include "irmap.h" +#include "sysfs_parse.h" #include "asm/dump.h" @@ -370,6 +371,22 @@ static int dump_filemap(pid_t pid, struct vma_area *vma_area, BUG_ON(!vma_area->st); p.stat = *vma_area->st; + /* + * AUFS support to compensate for the kernel bug + * exposing branch pathnames in map_files. 
+ * + * If the link found in vma_get_mapfile() pointed + * inside a branch, we should use the pathname + * from root that was saved in vma_area->aufs_rpath. + */ + if (vma_area->aufs_rpath) { + struct fd_link aufs_link; + + strcpy(aufs_link.name, vma_area->aufs_rpath); + aufs_link.len = strlen(aufs_link.name); + p.link = &aufs_link; + } + if (get_fd_mntid(vma_area->vm_file_fd, &p.mnt_id)) return -1; @@ -1927,6 +1944,7 @@ err: free_pstree(root_item); free_file_locks(); free_link_remaps(); + free_aufs_branches(); close_service_fd(CR_PROC_FD_OFF); diff --git a/crtools.c b/crtools.c index da87069..27164c4 100644 --- a/crtools.c +++ b/crtools.c @@ -172,6 +172,7 @@ int main(int argc, char *argv[]) { "exec-cmd", no_argument, 0, 59}, { "manage-cgroups", no_argument, 0, 60}, { "cgroup-root", required_argument, 0, 61}, + { "aufs-root", required_argument, 0, 62}, { }, }; @@ -397,6 +398,9 @@ int main(int argc, char *argv[]) case 'h': usage_error = false; goto usage; + case 62: + opts.aufs_root = optarg; + break; default: goto usage; } @@ -549,6 +553,9 @@ usage: " --exec-cmd execute the command specified after '--' on successful\n" " restore making it the parent of the restored process\n" "\n" +"* AUFS support (applicable to dump only):\n" +" --aufs-root root pathname of the AUFS filesystem\n" +"\n" "* Special resources support:\n" " -x|--" USK_EXT_PARAM " allow external unix connections\n" " --" SK_EST_PARAM " checkpoint/restore established TCP connections\n" diff --git a/include/cr_options.h b/include/cr_options.h index 73ec50c..48e4c93 100644 --- a/include/cr_options.h +++ b/include/cr_options.h @@ -60,6 +60,8 @@ struct cr_options { bool manage_cgroups; char *new_global_cg_root; struct list_head new_cgroup_roots; + bool aufs; /* auto-deteced, not via cli */ + char *aufs_root; }; extern struct cr_options opts; diff --git a/include/fs-magic.h b/include/fs-magic.h index 12bb982..13c4961 100644 --- a/include/fs-magic.h +++ b/include/fs-magic.h @@ -37,4 +37,8 @@ #define 
BTRFS_SUPER_MAGIC 0x9123683E #endif +#ifndef AUFS_SUPER_MAGIC +#define AUFS_SUPER_MAGIC 0x61756673 +#endif + #endif /* __CR_FS_MAGIC_H__ */ diff --git a/include/proc_parse.h b/include/proc_parse.h index 35d4292..3ac25b2 100644 --- a/include/proc_parse.h +++ b/include/proc_parse.h @@ -214,4 +214,7 @@ extern void put_ctls(struct list_head *); int parse_cgroups(struct list_head *cgroups, unsigned int *n_cgroups); +/* callback for AUFS support */ +extern int aufs_parse(struct mount_info *mi); + #endif /* __CR_PROC_PARSE_H__ */ diff --git a/include/sysfs_parse.h b/include/sysfs_parse.h new file mode 100644 index 0000000..4d74c4e --- /dev/null +++ b/include/sysfs_parse.h @@ -0,0 +1,14 @@ +#ifndef __CR_SYSFS_PARSE_H__ +#define __CR_SYSFS_PARSE_H__ + +#define SYSFS_AUFS "/sys/fs/aufs/" +#define SBINFO_LEN (3 + 16 + 1) /* si_%lx */ +#define SBINFO_PATH_LEN (sizeof SYSFS_AUFS + SBINFO_LEN) /* /sys/fs/aufs/<sbinfo> */ +#define AUFSBR_PATH_LEN (SBINFO_PATH_LEN + 6 + 1) /* /sys/fs/aufs/<sbinfo>/br%3d */ + +extern int parse_aufs_branches(struct mount_info *mi); +extern int fixup_aufs_vma_fd(struct vma_area *vma); +extern void free_aufs_branches(void); + +#endif /* __CR_SYSFS_PARSE_H__ */ + diff --git a/include/vma.h b/include/vma.h index c0bd80b..80d228a 100644 --- a/include/vma.h +++ b/include/vma.h @@ -32,8 +32,14 @@ struct vma_area { int vm_socket_id; struct file_desc *fd; }; - unsigned long *page_bitmap; /* existent pages */ - unsigned long *ppage_bitmap; /* parent's existent pages */ + union { + unsigned long *page_bitmap; /* existent pages, restore only */ + char *aufs_rpath; /* path from aufs root, dump only */ + }; + union { + unsigned long *ppage_bitmap; /* parent's existent pages */ + char *aufs_fpath; /* full path from global root, dump only */ + }; unsigned long premmaped_addr; diff --git a/mount.c b/mount.c index 9a147ab..459dfee 100644 --- a/mount.c +++ b/mount.c @@ -26,6 +26,7 @@ #include "protobuf.h" #include "kerndat.h" #include "fs-magic.h" +#include "sysfs_parse.h" #include 
"protobuf/mnt.pb-c.h" @@ -861,7 +862,11 @@ static struct fstype fstypes[] = { }, { .name = "cgroup", .code = FSTYPE__CGROUP, - } + }, { + .name = "aufs", + .code = FSTYPE__AUFS, + .parse = aufs_parse, + }, }; struct fstype *find_fstype_by_name(char *fst) diff --git a/proc_parse.c b/proc_parse.c index 271d46b..768d4f5 100644 --- a/proc_parse.c +++ b/proc_parse.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "asm/types.h" @@ -24,8 +25,11 @@ #include "vma.h" #include "proc_parse.h" +#include "cr_options.h" +#include "sysfs_parse.h" #include "protobuf.h" #include "protobuf/fdinfo.pb-c.h" +#include "protobuf/mnt.pb-c.h" #include @@ -240,7 +244,8 @@ static int vma_get_mapfile(struct vma_area *vma, DIR *mfd, vma->vm_socket_id = buf.st_ino; } else if (errno != ENOENT) return -1; - } + } else if (opts.aufs && fixup_aufs_vma_fd(vma) < 0) + return -1; return 0; } @@ -455,7 +460,19 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, bool use_map_file if (!st_buf) goto err; - if (fstat(vma_area->vm_file_fd, st_buf) < 0) { + /* + * For AUFS support, we cannot fstat() a file descriptor that + * is a symbolic link to a branch (it would return different + * dev/ino than the real file). Instead, we stat() using the + * full pathname that we saved before. + */ + if (vma_area->aufs_fpath) { + if (stat(vma_area->aufs_fpath, st_buf) < 0) { + pr_perror("Failed stat on %d's map %lu (%s)", + pid, start, vma_area->aufs_fpath); + goto err; + } + } else if (fstat(vma_area->vm_file_fd, st_buf) < 0) { pr_perror("Failed fstat on %d's map %lu", pid, start); goto err; } @@ -1718,3 +1735,32 @@ out: fclose(f); return ret; } + +/* + * AUFS callback function to "fix up" the root pathname. + * See sysfs_parse.c for details. 
+ */ +int aufs_parse(struct mount_info *new) +{ + char *cp; + + opts.aufs = true; + + if (!opts.aufs_root || strcmp(new->mountpoint, "./")) + return 0; + + cp = malloc(strlen(opts.aufs_root) + 1); + if (!cp) { + pr_err("Cannot allocate memory for %s\n", opts.aufs_root); + return -1; + } + strcpy(cp, opts.aufs_root); + + pr_debug("Replacing %s with %s\n", new->root, opts.aufs_root); + free(new->root); + new->root = cp; + + parse_aufs_branches(new); + + return 0; +} diff --git a/protobuf/mnt.proto b/protobuf/mnt.proto index 603bb37..2e39976 100644 --- a/protobuf/mnt.proto +++ b/protobuf/mnt.proto @@ -12,6 +12,7 @@ enum fstype { FUSECTL = 10; DEBUGFS = 11; CGROUP = 12; + AUFS = 13; }; message mnt_entry { diff --git a/sysfs_parse.c b/sysfs_parse.c new file mode 100644 index 0000000..dcfaa4f --- /dev/null +++ b/sysfs_parse.c @@ -0,0 +1,316 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cr_options.h" +#include "criu-log.h" +#include "xmalloc.h" +#include "files.h" +#include "proc_parse.h" +#include "util.h" +#include "sysfs_parse.h" +#include "namespaces.h" + +/* + * Currently, there are two kernel problems dealing with AUFS + * filesystems. Until these problems are fixed in the kernel, + * we have AUFS support in CRIU to handle the following issues: + * + * 1) /proc/<pid>/mountinfo: The problem is that for AUFS the root field + * of the root entry is missing the pathname (it's only /). For example: + * + * 90 61 0:33 / / rw,relatime - aufs none rw,si=4476a910a24617e6 + * + * To handle this issue, the user has to specify the root of the AUFS + * filesystem with the --aufs-root command line option. + * + * 2) /proc/<pid>/map_files: The symlinks are absolute pathnames of the + * corresponding *physical* files in the branch they exist. 
For example, + * for a Docker container using AUFS, a symlink would look like: + * 400000-489000 -> /var/lib/docker/aufs/diff/<LAYER_ID>/bin/<cmd> + * + * Therefore, when we use the link file descriptor vm_file_fd in + * dump_one_reg_file() to read the link, we get the file's physical + * absolute pathname which does not exist relative to the root of the + * mount namespace and even if we used its relative pathname, the dev:ino + * values would be different from the physical file's dev:ino causing the + * dump to fail. + * + * To handle this issue, we figure out the "correct" paths when parsing + * map_files and save it for later use. See fixup_aufs_vma_fd() for + * details. + */ + +struct ns_id *aufs_nsid; +static char **aufs_branches; + +/* + * Parse out and save the AUFS superblock info in the + * given buffer. + */ +static int parse_aufs_sbinfo(struct mount_info *mi, char *sbinfo, int len) +{ + char *cp; + int n; + + cp = strstr(mi->options, "si="); + if (!cp) { + pr_err("Cannot find sbinfo in option string %s\n", mi->options); + return -1; + } + + /* all ok, copy */ + if (len < 4) { /* 4 for "si_" */ + pr_err("Buffer of %d bytes too small for sbinfo\n", len); + return -1; + } + strcpy(sbinfo, "si_"); + n = 3; + sbinfo += n; + cp += n; + while (isxdigit(*cp) && n < len) { + *sbinfo++ = *cp++; + n++; + } + if (n >= len) { + pr_err("Sbinfo in options string %s too long\n", mi->options); + return -1; + } + *sbinfo = '\0'; + return 0; +} + +/* + * If the specified path is in a branch, replace it + * with pathname from root. 
+ */ +static int fixup_aufs_path(char *path, int size) +{ + char rpath[PATH_MAX]; + int n; + int blen; + + if (aufs_branches == NULL) { + pr_err("No aufs branches to search for %s\n", path); + return -1; + } + + for (n = 0; aufs_branches[n] != NULL; n++) { + blen = strlen(aufs_branches[n]); + if (!strncmp(path, aufs_branches[n], blen)) + break; + } + + if (aufs_branches[n] == NULL) + return 0; /* not in a branch */ + + n = snprintf(rpath, PATH_MAX, "%s", &path[blen]); + if (n >= min(PATH_MAX, size)) { + pr_err("Not enough space to replace %s\n", path); + return -1; + } + + pr_debug("Replacing %s with %s\n", path, rpath); + strcpy(path, rpath); + return n; +} + +/* + * Kernel stores pathnames to AUFS branches in the br<n> files in + * the /sys/fs/aufs/si_<sbinfo> directory where <n> denotes a branch + * number and <sbinfo> is a hexadecimal number in %lx format. For + * example: + * + * $ cat /sys/fs/aufs/si_f598876b087ed883/br0 + * /path/to/branch0/directory=rw + * + * This function sets up an array of pointers to branch pathnames. + */ +int parse_aufs_branches(struct mount_info *mi) +{ + char path[AUFSBR_PATH_LEN]; + char *cp; + int n; + int ret; + unsigned int br_num; + unsigned int br_max; + DIR *dp; + FILE *fp; + struct dirent *de; + + pr_info("Collecting AUFS branch pathnames ...\n"); + + if (mi->nsid == 0) { + pr_err("No nsid to parse its aufs branches\n"); + return -1; + } + + if (mi->nsid == aufs_nsid) { + pr_debug("Using cached aufs branch paths for nsid %p\n", aufs_nsid); + return 0; + } + + if (aufs_nsid) + free_aufs_branches(); + + strcpy(path, SYSFS_AUFS); /* /sys/fs/aufs/ */ + if (parse_aufs_sbinfo(mi, &path[sizeof SYSFS_AUFS - 1], SBINFO_LEN) < 0) + return -1; + if ((dp = opendir(path)) == NULL) { + pr_perror("Cannot opendir %s", path); + return -1; + } + + /* + * Find out how many branches we have. 
+ */ + br_max = 0; + ret = 0; + while (1) { + errno = 0; + if ((de = readdir(dp)) == NULL) { + if (errno) { + pr_perror("Cannot readdir %s", path); + ret = -1; + } + break; + } + + ret = sscanf(de->d_name, "br%d", &br_num); + if (ret == 1 && br_num > br_max) + br_max = br_num; + } + closedir(dp); + if (ret == -1) + return -1; + + /* + * Default AUFS maximum is 127, so 1000 should be plenty. + * If you increase the maximum to more than 3 digits, + * make sure to change AUFSBR_PATH_LEN accordingly. + */ + if (br_max > 999) { + pr_err("Too many branches %d\n", br_max); + return -1; + } + + /* + * Allocate an array of pointers to branch pathnames to be read. + * Branches are indexed from 0 and we need a NULL pointer at the end. + */ + aufs_branches = xzalloc((br_max + 2) * sizeof (char *)); + if (!aufs_branches) + return -1; + + /* + * Now read branch pathnames from the branch files. + */ + n = strlen(path); + fp = NULL; + for (br_num = 0; br_num <= br_max; br_num++) { + ret = snprintf(&path[n], sizeof path - n, "/br%d", br_num); + if (ret >= sizeof path - n) { + pr_err("Buffer overrun creating path for branch %d\n", br_num); + goto err; + } + + if ((fp = fopen(path, "r")) == NULL) { + pr_perror("Cannot fopen %s", path); + goto err; + } + + if (fscanf(fp, "%ms=", &aufs_branches[br_num]) != 1 || + aufs_branches[br_num] == NULL) { + pr_perror("Parse error reading %s", path); + goto err; + } + + /* chop off the trailing "=..." stuff */ + if ((cp = strchr(aufs_branches[br_num], '=')) == NULL) { + pr_err("Bad format in branch pathname %s\n", aufs_branches[br_num]); + goto err; + } + *cp = '\0'; + + fclose(fp); + /* + * Log branch information for external utilities that + * want to recreate the process's AUFS filesystem + * before calling criu restore. + * + * DO NOT CHANGE this format! 
+ */ + pr_info("%s : %s\n", path, aufs_branches[br_num]); + } + + aufs_nsid = mi->nsid; + return 0; + +err: + if (fp) + fclose(fp); + free_aufs_branches(); + return -1; +} + +/* + * AUFS support to compensate for the kernel bug + * exposing branch pathnames in map_files. + * + * If the link points inside a branch, save the + * relative pathname from the root of the mount + * namespace as well as the full pathname from + * global root (/) for later use in dump_filemap() + * and parse_smaps(). + */ +int fixup_aufs_vma_fd(struct vma_area *vma) +{ + char path[PATH_MAX]; + int len; + + path[0] = '.'; + len = read_fd_link(vma->vm_file_fd, &path[1], sizeof path - 1); + if (len < 0) + return -1; + + len = fixup_aufs_path(&path[1], sizeof path - 1); + if (len < 0) + return -1; + + if (len > 0) { + vma->aufs_rpath = xmalloc(len + 2); + if (!vma->aufs_rpath) + return -1; + strcpy(vma->aufs_rpath, path); + if (opts.aufs_root) { + vma->aufs_fpath = xmalloc(strlen(opts.aufs_root) + 1 + len + 1); + if (!vma->aufs_fpath) + return -1; + /* skip ./ in path */ + sprintf(vma->aufs_fpath, "%s/%s", opts.aufs_root, &path[2]); + } + pr_debug("Saved AUFS paths %s and %s\n", vma->aufs_rpath, vma->aufs_fpath); + } + + return 0; +} + +void free_aufs_branches(void) +{ + int n; + + if (aufs_branches) { + for (n = 0; aufs_branches[n] != NULL; n++) + xfree(aufs_branches[n]); + + xfree(aufs_branches); + aufs_branches = NULL; + } + + aufs_nsid = NULL; +} -- 1.9.1