From 811d21a1a12996f7d56999c00b39256170d6c31f Mon Sep 17 00:00:00 2001 From: Saied Kazemi Date: Tue, 19 Aug 2014 22:31:07 -0700 Subject: [PATCH] Added AUFS support. The AUFS support code handles the "bad" information that we get from the kernel in /proc/<pid>/map_files and /proc/<pid>/mountinfo files. For details see comments in sysfs_parse.c and proc_parse.c The main motivation for this work was dumping and restoring Docker containers. For dump, --aufs and --aufs-root should be specified along with other typical options. For restore, there is no need to specify either of these options but the container's filesystem should already be set up before calling criu restore. Signed-off-by: Saied Kazemi --- Makefile.crtools | 1 + cr-dump.c | 19 ++++ crtools.c | 12 +++ include/cr_options.h | 2 + include/fs-magic.h | 4 + include/proc_parse.h | 3 + include/sysfs_parse.h | 14 +++ include/vma.h | 5 +- mount.c | 7 +- proc_parse.c | 70 ++++++++++++- protobuf/mnt.proto | 1 + sysfs_parse.c | 284 ++++++++++++++++++++++++++++++++++++++++++++++++++ 12 files changed, 419 insertions(+), 3 deletions(-) create mode 100644 include/sysfs_parse.h create mode 100644 sysfs_parse.c diff --git a/Makefile.crtools b/Makefile.crtools index 6033b2c..ef30b05 100644 --- a/Makefile.crtools +++ b/Makefile.crtools @@ -9,6 +9,7 @@ obj-y += image-desc.o obj-y += net.o obj-y += tun.o obj-y += proc_parse.o +obj-y += sysfs_parse.o obj-y += cr-dump.o obj-y += cr-show.o obj-y += cr-check.o diff --git a/cr-dump.c b/cr-dump.c index 110dc62..c965eba 100644 --- a/cr-dump.c +++ b/cr-dump.c @@ -71,6 +71,7 @@ #include "cr-service.h" #include "plugin.h" #include "irmap.h" +#include "sysfs_parse.h" #include "asm/dump.h" @@ -370,6 +371,22 @@ static int dump_filemap(pid_t pid, struct vma_area *vma_area, BUG_ON(!vma_area->st); p.stat = *vma_area->st; + /* + * AUFS support to compensate for the kernel bug + * exposing branch pathnames in map_files. 
+ * + * If the link found in vma_get_mapfile() pointed + * inside a branch, we should use the pathname + * from root that was saved in vma_area->aufs_rpath. + */ + if (opts.aufs && vma_area->aufs_rpath) { + struct fd_link aufs_link; + + strcpy(aufs_link.name, vma_area->aufs_rpath); + aufs_link.len = strlen(aufs_link.name); + p.link = &aufs_link; + } + if (get_fd_mntid(vma_area->vm_file_fd, &p.mnt_id)) return -1; @@ -1927,6 +1944,8 @@ err: free_pstree(root_item); free_file_locks(); free_link_remaps(); + if (opts.aufs) + free_aufs_branches(); close_service_fd(CR_PROC_FD_OFF); diff --git a/crtools.c b/crtools.c index da87069..b202d52 100644 --- a/crtools.c +++ b/crtools.c @@ -172,6 +172,8 @@ int main(int argc, char *argv[]) { "exec-cmd", no_argument, 0, 59}, { "manage-cgroups", no_argument, 0, 60}, { "cgroup-root", required_argument, 0, 61}, + { "aufs", no_argument, 0, 62}, + { "aufs-root", required_argument, 0, 63}, { }, }; @@ -397,6 +399,12 @@ int main(int argc, char *argv[]) case 'h': usage_error = false; goto usage; + case 62: + opts.aufs = true; + break; + case 63: + opts.aufs_root = optarg; + break; default: goto usage; } @@ -549,6 +557,10 @@ usage: " --exec-cmd execute the command specified after '--' on successful\n" " restore making it the parent of the restored process\n" "\n" +"* AUFS support (applicable to dump only):\n" +" --aufs process running in AUFS filesystem\n" +" --aufs-root path to the root of the AUFS filesystem\n" +"\n" "* Special resources support:\n" " -x|--" USK_EXT_PARAM " allow external unix connections\n" " --" SK_EST_PARAM " checkpoint/restore established TCP connections\n" diff --git a/include/cr_options.h b/include/cr_options.h index 73ec50c..dcef59e 100644 --- a/include/cr_options.h +++ b/include/cr_options.h @@ -60,6 +60,8 @@ struct cr_options { bool manage_cgroups; char *new_global_cg_root; struct list_head new_cgroup_roots; + bool aufs; + char *aufs_root; }; extern struct cr_options opts; diff --git a/include/fs-magic.h 
b/include/fs-magic.h index 12bb982..13c4961 100644 --- a/include/fs-magic.h +++ b/include/fs-magic.h @@ -37,4 +37,8 @@ #define BTRFS_SUPER_MAGIC 0x9123683E #endif +#ifndef AUFS_SUPER_MAGIC +#define AUFS_SUPER_MAGIC 0x61756673 +#endif + #endif /* __CR_FS_MAGIC_H__ */ diff --git a/include/proc_parse.h b/include/proc_parse.h index 35d4292..3ac25b2 100644 --- a/include/proc_parse.h +++ b/include/proc_parse.h @@ -214,4 +214,7 @@ extern void put_ctls(struct list_head *); int parse_cgroups(struct list_head *cgroups, unsigned int *n_cgroups); +/* callback for AUFS support */ +extern int aufs_parse(struct mount_info *mi); + #endif /* __CR_PROC_PARSE_H__ */ diff --git a/include/sysfs_parse.h b/include/sysfs_parse.h new file mode 100644 index 0000000..fcd06da --- /dev/null +++ b/include/sysfs_parse.h @@ -0,0 +1,14 @@ +#ifndef __CR_SYSFS_PARSE_H__ +#define __CR_SYSFS_PARSE_H__ + +#define SBINFO_LEN (3 + 16 + 1) /* si_%lx */ +#define SBINFO_PATH_LEN (13 + SBINFO_LEN + 1) /* /sys/fs/aufs/ */ +#define AUFSBR_PATH_LEN (SBINFO_PATH_LEN + 6 + 1) /* /sys/fs/aufs//br%d */ + +extern int parse_aufs_branches(struct mount_info *mi); +extern void free_aufs_branches(void); +extern char *fixup_aufs_fd_path(int lfd); +extern int fixup_aufs_path(char *path, int size, bool chop); + +#endif /* __CR_SYSFS_PARSE_H__ */ + diff --git a/include/vma.h b/include/vma.h index c0bd80b..beb909a 100644 --- a/include/vma.h +++ b/include/vma.h @@ -32,7 +32,10 @@ struct vma_area { int vm_socket_id; struct file_desc *fd; }; - unsigned long *page_bitmap; /* existent pages */ + union { + unsigned long *page_bitmap; /* existent pages, restore only */ + char *aufs_rpath; /* path from aufs root, dump only */ + }; unsigned long *ppage_bitmap; /* parent's existent pages */ unsigned long premmaped_addr; diff --git a/mount.c b/mount.c index 9a147ab..459dfee 100644 --- a/mount.c +++ b/mount.c @@ -26,6 +26,7 @@ #include "protobuf.h" #include "kerndat.h" #include "fs-magic.h" +#include "sysfs_parse.h" #include 
"protobuf/mnt.pb-c.h" @@ -861,7 +862,11 @@ static struct fstype fstypes[] = { }, { .name = "cgroup", .code = FSTYPE__CGROUP, - } + }, { + .name = "aufs", + .code = FSTYPE__AUFS, + .parse = aufs_parse, + }, }; struct fstype *find_fstype_by_name(char *fst) diff --git a/proc_parse.c b/proc_parse.c index 271d46b..9d593dd 100644 --- a/proc_parse.c +++ b/proc_parse.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "asm/types.h" @@ -24,8 +25,11 @@ #include "vma.h" #include "proc_parse.h" +#include "cr_options.h" +#include "sysfs_parse.h" #include "protobuf.h" #include "protobuf/fdinfo.pb-c.h" +#include "protobuf/mnt.pb-c.h" #include @@ -240,6 +244,31 @@ static int vma_get_mapfile(struct vma_area *vma, DIR *mfd, vma->vm_socket_id = buf.st_ino; } else if (errno != ENOENT) return -1; + } else if (opts.aufs) { + /* + * AUFS support to compensate for the kernel bug + * exposing branch pathnames in map_files. + * + * If the link points inside a branch, replace it + * with a pathname from the root for later use in + * dump_filemap(). + */ + char p[PATH_MAX]; + int n; + + p[0] = '.'; + n = read_fd_link(vma->vm_file_fd, &p[1], sizeof p - 1); + if (n < 0) + return -1; + n = fixup_aufs_path(&p[1], sizeof p - 1, true); + if (n < 0) + return -1; + if (n > 0) { + vma->aufs_rpath = xmalloc(n + 2); + if (!vma->aufs_rpath) + return -1; + strcpy(vma->aufs_rpath, p); + } } return 0; @@ -450,12 +479,24 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, bool use_map_file vma_area->st = prev->st; } else if (vma_area->vm_file_fd >= 0) { struct stat *st_buf; + char *f; st_buf = vma_area->st = xmalloc(sizeof(*st_buf)); if (!st_buf) goto err; - if (fstat(vma_area->vm_file_fd, st_buf) < 0) { + /* + * For AUFS support, we cannot fstat() a file descriptor that + * is a symbolic link to a branch. Instead, we obtain the + * pathname of the file from the root and use stat(). 
+ */ + if (opts.aufs && (f = fixup_aufs_fd_path(vma_area->vm_file_fd))) { + if (stat(f, st_buf) < 0) { + pr_perror("Failed stat on %d's map %lu (%s)", + pid, start, f); + goto err; + } + } else if (fstat(vma_area->vm_file_fd, st_buf) < 0) { pr_perror("Failed fstat on %d's map %lu", pid, start); goto err; } @@ -1718,3 +1759,30 @@ out: fclose(f); return ret; }
+
+/*
+ * See comments in sysfs_parse.c for why we need
+ * this callback (called from parse_mountinfo()).
+ *
+ * For the "./" AUFS mount, when --aufs-root was given, the root of the
+ * mount_info is replaced with a copy of opts.aufs_root and the branch
+ * pathnames are collected.  Any other mount is left untouched.
+ *
+ * Returns 0 on success and -1 if memory cannot be allocated or the
+ * branches cannot be parsed.
+ */
+int aufs_parse(struct mount_info *new)
+{
+	char *cp;
+
+	if (!opts.aufs_root || strcmp(new->mountpoint, "./"))
+		return 0;
+
+	cp = malloc(strlen(opts.aufs_root) + 1);
+	if (!cp) {
+		pr_err("Cannot allocate memory for %s\n", opts.aufs_root);
+		return -1;
+	}
+	strcpy(cp, opts.aufs_root);
+
+	pr_debug("Replacing %s with %s\n", new->root, opts.aufs_root);
+	free(new->root);
+	new->root = cp;
+
+	/* do not hide a branch parsing failure from the caller */
+	return parse_aufs_branches(new);
+}
diff --git a/protobuf/mnt.proto b/protobuf/mnt.proto index 603bb37..2e39976 100644 --- a/protobuf/mnt.proto +++ b/protobuf/mnt.proto @@ -12,6 +12,7 @@ enum fstype { FUSECTL = 10; DEBUGFS = 11; CGROUP = 12; + AUFS = 13; }; message mnt_entry { diff --git a/sysfs_parse.c b/sysfs_parse.c new file mode 100644 index 0000000..e38ed13 --- /dev/null +++ b/sysfs_parse.c @@ -0,0 +1,284 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cr_options.h" +#include "criu-log.h" +#include "xmalloc.h" +#include "files.h" +#include "proc_parse.h" +#include "util.h" +#include "sysfs_parse.h" +#include "namespaces.h" + +/* + * If the process's root filesystem is AUFS, the symlinks + * in /proc/<pid>/map_files directory are absolute pathnames + * of the corresponding *physical* files in the branch they + * exist. 
+ * For example, for a Docker container using AUFS,
+ * a symlink in /proc/<pid>/map_files would look like:
+ *
+ *	400000-489000 -> /var/lib/docker/aufs/diff/<LAYER_ID>/bin/<cmd>
+ *
+ * Therefore, when we use vm_file_fd (as argument lfd) in
+ * dump_one_reg_file() to read the link, we get the file's
+ * physical absolute pathname which:
+ *
+ *	1) does not exist relative to the root of the mount
+ *	   namespace
+ *	2) and even if we used its relative pathname, the dev:ino
+ *	   would be different from the physical file's dev:ino
+ *
+ * causing the function (and dump) to fail.
+ *
+ * To prevent this failure, we must replace absolute pathnames
+ * to files in AUFS branches with relative pathnames from the root.
+ * Since the mountinfo file does not have the AUFS root pathname,
+ * we have to provide it with the --aufs-root option.
+ */
+
+struct ns_id *aufs_nsid;
+static char **aufs_branches;
+
+/*
+ * Find the "si=" mount option of mi and store it in sbinfo, a buffer
+ * of len bytes, as the NUL-terminated string "si_" followed by the
+ * hexadecimal digits found after "si=".
+ *
+ * Returns 0 on success, -1 if the option is missing or the result
+ * does not fit in sbinfo.
+ */
+static int parse_aufs_sbinfo(struct mount_info *mi, char *sbinfo, int len)
+{
+	char *cp;
+	int n;
+
+	cp = strstr(mi->options, "si=");
+	if (!cp) {
+		pr_err("Cannot find sbinfo in option string %s\n", mi->options);
+		return -1;
+	}
+
+	/* all ok, copy */
+	if (len < 4) {		/* 4 for "si_" */
+		pr_err("Buffer of %d bytes too small for sbinfo\n", len);
+		return -1;
+	}
+	strcpy(sbinfo, "si_");
+	n = 3;
+	sbinfo += n;
+	cp += n;
+	/* isxdigit() is undefined for negative values of plain char */
+	while (isxdigit((unsigned char)*cp) && n < len) {
+		*sbinfo++ = *cp++;
+		n++;
+	}
+	if (n >= len) {
+		pr_err("Sbinfo in options string %s too long\n", mi->options);
+		return -1;
+	}
+	*sbinfo = '\0';
+	return 0;
+}
+
+/*
+ * Kernel stores pathnames to AUFS branches in the br<n> files in
+ * the /sys/fs/aufs/si_<sbinfo> directory where <n> denotes a branch
+ * number and <sbinfo> is a hexadecimal number in %lx format.  For
+ * example:
+ *
+ *	$ cat /sys/fs/aufs/si_f598876b087ed883/br0
+ *	/path/to/branch0/directory=rw
+ *
+ * This function sets up an array of pointers to branch pathnames.
+ *
+ * Returns 0 on success (also when the branches of mi->nsid are
+ * already cached) and -1 on error.
+ */
+int parse_aufs_branches(struct mount_info *mi)
+{
+	char path[AUFSBR_PATH_LEN];
+	char *cp;
+	int n;
+	int ret;
+	int rd_err;
+	unsigned int br_num;
+	unsigned int br_max;
+	DIR *dp;
+	FILE *fp;
+	struct dirent *de;
+
+	pr_info("Collecting AUFS branch pathnames ...\n");
+
+	if (mi->nsid == NULL) {
+		pr_err("No nsid to parse its aufs branches\n");
+		return -1;
+	}
+
+	if (mi->nsid == aufs_nsid) {
+		pr_debug("Using cached aufs branch paths for nsid %p\n", aufs_nsid);
+		return 0;
+	}
+
+	if (aufs_nsid)
+		free_aufs_branches();
+
+	strcpy(path, "/sys/fs/aufs/");
+	if (parse_aufs_sbinfo(mi, &path[13], SBINFO_LEN) < 0)
+		return -1;
+	if ((dp = opendir(path)) == NULL) {
+		pr_perror("Cannot opendir %s", path);
+		return -1;
+	}
+
+	/*
+	 * Find out how many branches we have.
+	 */
+	br_max = 0;
+	rd_err = 0;
+	while (1) {
+		errno = 0;
+		if ((de = readdir(dp)) == NULL) {
+			if (errno) {
+				pr_perror("Cannot readdir %s", path);
+				rd_err = 1;
+			}
+			break;
+		}
+
+		/*
+		 * sscanf() returns EOF (-1) for a name that ends before
+		 * the number (e.g. "br"), so its result must not double
+		 * as the readdir() error indicator.
+		 */
+		if (sscanf(de->d_name, "br%u", &br_num) == 1 && br_num > br_max)
+			br_max = br_num;
+	}
+	closedir(dp);
+	if (rd_err)
+		return -1;
+
+	/* default maximum is 127, so 1000 should be plenty */
+	if (br_max >= 1000) {
+		pr_err("Too many branches %u\n", br_max);
+		return -1;
+	}
+
+	/*
+	 * Allocate an array of pointers to branch pathnames to be read.
+	 * Branches are indexed from 0 and we need a NULL pointer at the end.
+	 */
+	aufs_branches = xzalloc((br_max + 2) * sizeof (char *));
+	if (!aufs_branches)
+		return -1;
+
+	/*
+	 * Now read branch pathnames from the branch files.
+	 */
+	n = strlen(path);
+	fp = NULL;
+	for (br_num = 0; br_num <= br_max; br_num++) {
+		ret = snprintf(&path[n], sizeof path - n, "/br%u", br_num);
+		if (ret < 0 || ret >= sizeof path - n) {
+			pr_err("Buffer overrun creating path for branch %u\n", br_num);
+			goto err;
+		}
+
+		if ((fp = fopen(path, "r")) == NULL) {
+			pr_perror("Cannot fopen %s", path);
+			goto err;
+		}
+
+		if (fscanf(fp, "%ms=", &aufs_branches[br_num]) != 1 ||
+		    aufs_branches[br_num] == NULL) {
+			pr_perror("Parse error reading %s", path);
+			goto err;
+		}
+
+		/*
+		 * Chop off the trailing "=..." stuff.  Search from the
+		 * end so that a '=' inside the pathname itself is kept.
+		 */
+		if ((cp = strrchr(aufs_branches[br_num], '=')) == NULL) {
+			pr_err("Bad format in branch pathname %s\n", aufs_branches[br_num]);
+			goto err;
+		}
+		*cp = '\0';
+
+		fclose(fp);
+		fp = NULL;	/* the err path must not close it again */
+		/*
+		 * Log branch information for external utilities that
+		 * want to recreate the process's AUFS filesystem
+		 * before calling criu restore.
+		 *
+		 * DO NOT CHANGE this format!
+		 */
+		pr_info("%s : %s\n", path, aufs_branches[br_num]);
+	}
+
+	aufs_nsid = mi->nsid;
+	return 0;
+
+err:
+	if (fp)
+		fclose(fp);
+	free_aufs_branches();
+	return -1;
+}
+
+/*
+ * Release the cached branch pathnames and forget which mount
+ * namespace they belong to.  Safe to call when nothing is cached.
+ */
+void free_aufs_branches(void)
+{
+	int n;
+
+	if (aufs_branches) {
+		for (n = 0; aufs_branches[n] != NULL; n++)
+			xfree(aufs_branches[n]);
+
+		xfree(aufs_branches);
+		aufs_branches = NULL;
+	}
+
+	aufs_nsid = NULL;
+}
+
+/*
+ * If the argument fd is a symbolic link to a file in an AUFS branch,
+ * return its pathname with the branch prefix replaced by opts.aufs_root.
+ *
+ * The result lives in a static buffer that is overwritten by the next
+ * call.  NULL is returned when the link cannot be read, when it does
+ * not point inside a branch, or when the fixup fails.
+ */
+char *fixup_aufs_fd_path(int lfd)
+{
+	static char linkpath[PATH_MAX];
+
+	if (read_fd_link(lfd, linkpath, sizeof linkpath) < 0)
+		return NULL;
+
+	/* -1 is an error and 0 means "not in a branch": only > 0 is a hit */
+	if (fixup_aufs_path(linkpath, sizeof linkpath, false) > 0)
+		return linkpath;
+
+	return NULL;
+}
+
+/*
+ * If the specified path is in a branch pathname, replace it
+ * with either the relative or the full pathname from the root
+ * depending on chop.
+ *
+ * With chop set the branch prefix is simply removed, otherwise it is
+ * replaced with opts.aufs_root.  The result overwrites path, which is
+ * a buffer of size bytes.
+ *
+ * Returns the length of the resulting pathname, 0 if path is not
+ * inside any branch, and -1 on error.
+ */
+int fixup_aufs_path(char *path, int size, bool chop)
+{
+	char rpath[PATH_MAX];
+	int n;
+	int blen;
+
+	if (aufs_branches == NULL) {
+		pr_err("No aufs branches to search for %s\n", path);
+		return -1;
+	}
+
+	for (n = 0; aufs_branches[n] != NULL; n++) {
+		blen = strlen(aufs_branches[n]);
+		if (blen == 0 || strncmp(path, aufs_branches[n], blen))
+			continue;
+
+		/*
+		 * A plain prefix match is not enough: the branch must end
+		 * on a path component boundary, otherwise branch
+		 * ".../diff/abc" would also claim ".../diff/abc-init/file".
+		 */
+		if (aufs_branches[n][blen - 1] == '/' ||
+		    path[blen] == '/' || path[blen] == '\0')
+			break;
+	}
+
+	if (aufs_branches[n] == NULL)
+		return 0;	/* not in a branch */
+
+	if (chop)
+		n = snprintf(rpath, PATH_MAX, "%s", &path[blen]);
+	else
+		n = snprintf(rpath, PATH_MAX, "%s%s", opts.aufs_root ? : "",
+			&path[blen]);
+	if (n >= min(PATH_MAX, size)) {
+		pr_err("Not enough space to replace %s\n", path);
+		return -1;
+	}
+
+	pr_debug("Replacing %s with %s\n", path, rpath);
+	strcpy(path, rpath);
+	return n;
+}
-- 
1.9.1