diff --git a/Makefile.crtools b/Makefile.crtools index 6033b2c..ef30b05 100644 --- a/Makefile.crtools +++ b/Makefile.crtools @@ -9,6 +9,7 @@ obj-y += image-desc.o obj-y += net.o obj-y += tun.o obj-y += proc_parse.o +obj-y += sysfs_parse.o obj-y += cr-dump.o obj-y += cr-show.o obj-y += cr-check.o diff --git a/cgroup.c b/cgroup.c index 8308413..80fc63e 100644 --- a/cgroup.c +++ b/cgroup.c @@ -32,8 +32,6 @@ static const char *cpu_props[] = { "cpu.shares", "cpu.cfs_period_us", "cpu.cfs_quota_us", - "cpu.rt_period_us", - "cpu.rt_runtime_us", "notify_on_release", NULL }; @@ -41,7 +39,6 @@ static const char *cpu_props[] = { static const char *memory_props[] = { /* limit_in_bytes and memsw.limit_in_bytes must be set in this order */ "memory.limit_in_bytes", - "memory.memsw.limit_in_bytes", "memory.use_hierarchy", "notify_on_release", NULL diff --git a/cr-dump.c b/cr-dump.c index 1700d9d..b6b86ac 100644 --- a/cr-dump.c +++ b/cr-dump.c @@ -71,6 +71,7 @@ #include "cr-service.h" #include "plugin.h" #include "irmap.h" +#include "sysfs_parse.h" #include "asm/dump.h" @@ -1461,6 +1462,18 @@ static int dump_one_task(struct pstree_item *item) */ return 0; + /* + * For AUFS support, we need to find out branch pathnames. + * See comments in parse_aufs_branches() for details. + */ + if (opts.aufs) { + pr_info("Collecting AUFS branch pathnames ...\n"); + if (parse_aufs_branches(pid) < 0) { + pr_err("Cannot parse AUFS branches (pid %d)\n", pid); + return -1; + } + } + pr_info("Obtaining task stat ... 
"); ret = parse_pid_stat(pid, &pps_buf); if (ret < 0) @@ -1643,6 +1656,8 @@ err: close_pid_proc(); free_mappings(&vmas); xfree(dfds); + if (opts.aufs) + free_aufs_branches(); return ret; err_cure: diff --git a/crtools.c b/crtools.c index 6caea08..4d31e56 100644 --- a/crtools.c +++ b/crtools.c @@ -169,6 +169,9 @@ int main(int argc, char *argv[]) { "ext-mount-map", required_argument, 0, 'M'}, { "exec-cmd", no_argument, 0, 59}, { "manage-cgroups", no_argument, 0, 60}, + { "aufs", no_argument, 0, 61}, + { "aufs-root", required_argument, 0, 62}, + { "aufs-ref", required_argument, 0, 63}, { }, }; @@ -379,6 +382,15 @@ int main(int argc, char *argv[]) case 'h': usage_error = false; goto usage; + case 61: + opts.aufs = true; + break; + case 62: + opts.aufs_root = optarg; + break; + case 63: + opts.aufs_ref = optarg; + break; default: goto usage; } @@ -531,6 +543,11 @@ usage: " --exec-cmd execute the command specified after '--' on successful\n" " restore making it the parent of the restored process\n" "\n" +"* AUFS support (applicable to dump only):\n" +" --aufs process running in AUFS filesystem\n" +" --aufs-root path to the root of the AUFS filesystem\n" +" --aufs-ref reference mountpoint for fixing up AUFS root\n" +"\n" "* Special resources support:\n" " -x|--" USK_EXT_PARAM " allow external unix connections\n" " --" SK_EST_PARAM " checkpoint/restore established TCP connections\n" diff --git a/files.c b/files.c index 68c06c7..c1c0237 100644 --- a/files.c +++ b/files.c @@ -36,6 +36,8 @@ #include "fdset.h" #include "fs-magic.h" #include "proc_parse.h" +#include "cr_options.h" +#include "sysfs_parse.h" #include "parasite.h" #include "parasite-syscall.h" @@ -197,6 +199,19 @@ int fill_fdlink(int lfd, const struct fd_parms *p, struct fd_link *link) return -1; } + /* + * For AUFS support, we need to replace absolute + * branch pathnames with relative pathnames from root. 
+ */ + if (opts.aufs) { + int n = sizeof link->name - 1; + n = fixup_aufs_path(&link->name[1], n, true); + if (n < 0) + return -1; + if (n > 0) + len = n; + } + link->len = len + 1; return 0; } diff --git a/include/cr_options.h b/include/cr_options.h index 0d11aa7..9ac52f0 100644 --- a/include/cr_options.h +++ b/include/cr_options.h @@ -52,6 +52,9 @@ struct cr_options { bool force_irmap; char **exec_cmd; bool manage_cgroups; + bool aufs; + char *aufs_root; + char *aufs_ref; }; extern struct cr_options opts; diff --git a/include/proc_parse.h b/include/proc_parse.h index 35d4292..892142c 100644 --- a/include/proc_parse.h +++ b/include/proc_parse.h @@ -214,4 +214,7 @@ extern void put_ctls(struct list_head *); int parse_cgroups(struct list_head *cgroups, unsigned int *n_cgroups); +extern int get_mountinfo_by_mountpoint(pid_t pid, char *pathname, char *line, int linelen); +extern int parse_mountinfo_aufs_sbinfo(pid_t pid, char *sbinfo, int len); + #endif /* __CR_PROC_PARSE_H__ */ diff --git a/include/sysfs_parse.h b/include/sysfs_parse.h new file mode 100644 index 0000000..7c2ffe1 --- /dev/null +++ b/include/sysfs_parse.h @@ -0,0 +1,22 @@ +#ifndef __CR_SYSFS_PARSE_H__ +#define __CR_SYSFS_PARSE_H__ + +#define SBINFO_LEN (3 + 16 + 1) /* "si_" + up to 16 hex digits (%lx) + NUL */ +#define SBINFO_PATH_LEN (13 + SBINFO_LEN + 1) /* 13 is strlen("/sys/fs/aufs/"), followed by the sbinfo name */ +#define AUFSBR_PATH_LEN (SBINFO_PATH_LEN + 6 + 1) /* /sys/fs/aufs/si_<sbinfo>/br%d, branch numbers below 1000 */ + +/* defined in proc_parse.c (also declared in proc_parse.h) */ +extern int get_mountinfo_by_mountpoint(pid_t pid, char *mntpoint, char *line, int linelen); + +/* defined in sysfs_parse.c */ +extern int parse_aufs_reference(pid_t pid, char *mntpoint); +extern int fixup_root(char **root); +extern int fixup_dev(unsigned int *majp, unsigned int *minp); +extern int fixup_src_opt(char **srcp, char **optp); +extern int parse_aufs_branches(pid_t pid); +extern void free_aufs_branches(void); +extern int get_aufs_ref_item(char *item, char **cpp); +extern char *fixup_aufs_fd_path(int lfd); +extern int fixup_aufs_path(char *path, int size, bool chop); + 
+#endif /* __CR_SYSFS_PARSE_H__ */ diff --git a/mount.c b/mount.c index 9a147ab..80cede9 100644 --- a/mount.c +++ b/mount.c @@ -26,6 +26,7 @@ #include "protobuf.h" #include "kerndat.h" #include "fs-magic.h" +#include "sysfs_parse.h" #include "protobuf/mnt.pb-c.h" @@ -979,6 +980,18 @@ struct mount_info *collect_mntinfo(struct ns_id *ns) { struct mount_info *pm; + /* + * If we are looking at a target process that runs in AUFS, + * we need to parse the reference mountpoint specified on + * the command line *before* calling parse_mountinfo() to + * have the information for fixing up the root entry. See + * comments in sysfs_parse.c for details. + */ + if (opts.aufs_ref && ns->pid != getpid()) { + if (parse_aufs_reference(ns->pid, opts.aufs_ref) < 0) + return NULL; + } + pm = parse_mountinfo(ns->pid, ns); if (!pm) { pr_err("Can't parse %d's mountinfo\n", ns->pid); diff --git a/proc_parse.c b/proc_parse.c index 271d46b..2bab2d3 100644 --- a/proc_parse.c +++ b/proc_parse.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "asm/types.h" @@ -22,8 +23,10 @@ #include "kerndat.h" #include "vdso.h" #include "vma.h" +#include "cr_options.h" #include "proc_parse.h" +#include "sysfs_parse.h" #include "protobuf.h" #include "protobuf/fdinfo.pb-c.h" @@ -450,12 +453,24 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, bool use_map_file vma_area->st = prev->st; } else if (vma_area->vm_file_fd >= 0) { struct stat *st_buf; + char *f; st_buf = vma_area->st = xmalloc(sizeof(*st_buf)); if (!st_buf) goto err; - if (fstat(vma_area->vm_file_fd, st_buf) < 0) { + /* + * For AUFS support, we cannot fstat() a file descriptor that + * is a symbolic link to a branch. Instead, we obtain the + * pathname of the file from the root and use stat(). 
+ */ + if (opts.aufs && (f = fixup_aufs_fd_path(vma_area->vm_file_fd))) { + if (stat(f, st_buf) < 0) { + pr_perror("Failed fstat on %d's map %lu (%s)", + pid, start, f); + goto err; + } + } else if (fstat(vma_area->vm_file_fd, st_buf) < 0) { pr_perror("Failed fstat on %d's map %lu", pid, start); goto err; } @@ -903,6 +918,12 @@ static int parse_mountinfo_ent(char *str, struct mount_info *new) return -1; } + /* AUFS root entry ("./"): fix up its root and device numbers; see comments in sysfs_parse.c */ + if (opts.aufs && !strcmp(new->mountpoint, "./")) { + if (fixup_root(&new->root) < 0 || fixup_dev(&kmaj, &kmin) < 0) + return -1; + } + new->mountpoint = xrealloc(new->mountpoint, strlen(new->mountpoint) + 1); new->s_dev = MKKDEV(kmaj, kmin); @@ -921,6 +942,16 @@ static int parse_mountinfo_ent(char *str, struct mount_info *new) if (ret != 3) return -1; + /* AUFS root entry ("./"): fix up its source and options; see comments in sysfs_parse.c */ + if (opts.aufs && !strcmp(new->mountpoint, "./")) { + if (strcmp(fstype, "aufs")) { + pr_err("Expected fstype aufs got %s\n", fstype); + return -1; + } + if (fixup_src_opt(&new->source, &opt) < 0) + return -1; + } + ret = -1; new->fstype = find_fstype_by_name(fstype); @@ -1718,3 +1749,124 @@ out: fclose(f); return ret; } + +/* + * Copy the line in process's mountinfo file that corresponds + * to the mountpoint specified by the mntpoint argument. Return + * the number of characters parsed in the line, or -1 on error. 
+ *
+ * The value returned on success is the offset in line of the first
+ * field after the " - " separator (the filesystem type), so callers
+ * can continue parsing at &line[offset].  Optional fields (such as
+ * shared:N) that may precede the separator are skipped.
+ *
+ * line must point to a buffer of at least linelen bytes.  A mountinfo
+ * line that does not fit in the buffer is treated as an error.
+ */
+int get_mountinfo_by_mountpoint(pid_t pid, char *mntpoint, char *line, int linelen)
+{
+	char path[PATH_MAX];
+	size_t mntlen = strlen(mntpoint);
+	int ret = -1;
+	FILE *f;
+
+	/* the long-line detection below needs line[linelen - 2] to exist */
+	if (linelen < 2) {
+		pr_err("Line buffer of %d bytes is too small\n", linelen);
+		return -1;
+	}
+
+	snprintf(path, sizeof path, "/proc/%d/mountinfo", pid);
+	f = fopen(path, "r");
+	if (!f) {
+		pr_perror("Cannot fopen %s", path);
+		return -1;
+	}
+
+	for (;;) {
+		int last = linelen - 2;
+		int start = -1;
+		int end = -1;
+		char *sep;
+
+		line[last] = '\0';	/* detect long input */
+		if (fgets(line, linelen, f) == NULL) {
+			pr_err("Did not find %s in mountinfo\n", mntpoint);
+			break;
+		}
+		if (line[last] && line[last] != '\n') {
+			pr_err("Line in mountinfo too long\n");
+			break;
+		}
+
+		/*
+		 * Locate the mountpoint field in place instead of copying
+		 * it.  Nothing is assigned, so the return value of sscanf()
+		 * carries no information; end stays -1 unless the whole
+		 * prefix up to and including the mountpoint was matched.
+		 */
+		sscanf(line, "%*i %*i %*u:%*u %*s %n%*s%n", &start, &end);
+		if (end < 0) {
+			pr_err("Cannot parse mountpoint (%s)\n", line);
+			break;
+		}
+
+		if ((size_t)(end - start) != mntlen ||
+		    strncmp(&line[start], mntpoint, mntlen))
+			continue;
+
+		/*
+		 * The mount options and zero or more optional fields
+		 * follow; the lone hyphen terminates them.
+		 */
+		sep = strstr(&line[end], " - ");
+		if (!sep) {
+			pr_err("Cannot find separator in mountinfo line (%s)\n", line);
+			break;
+		}
+
+		ret = (int)(sep + 3 - line);
+		break;
+	}
+
+	fclose(f);
+	return ret;
+}
+
+/*
+ * Parse process's mountinfo to find the AUFS root filesystem entry
+ * and copy its superblock info into the buffer passed in. The sbinfo
+ * is a string in the form si=%lx. When copying, we replace si= with
+ * si_ to match the directory name in /sys/fs/aufs. 
+ *
+ * A typical entry in mountinfo would look like:
+ * 91 62 0:33 / / rw,relatime - aufs none rw,si=f598876b087ed883
+ *
+ * sbinfo must have room for len bytes: "si_", the hexadecimal digits
+ * and the terminating NUL.  Returns 0 on success and -1 on error (the
+ * root is not AUFS, no si= option was found, or the buffer is too
+ * small).
+ */
+int parse_mountinfo_aufs_sbinfo(pid_t pid, char *sbinfo, int len)
+{
+	char line[PATH_MAX];
+	char *fstype = NULL;
+	char *opt = NULL;
+	char *cp;
+	int n;
+	int ret = -1;
+
+	n = get_mountinfo_by_mountpoint(pid, "/", line, sizeof line);
+	if (n < 0)
+		return -1;
+
+	/* &line[n] starts at the fstype field; the source field is skipped */
+	if (sscanf(&line[n], "%ms %*s %ms", &fstype, &opt) != 2) {
+		pr_err("Cannot parse fstype and opt\n");
+		goto out;
+	}
+
+	if (strcmp(fstype, "aufs")) {
+		pr_err("Expected aufs but parsed %s for /\n", fstype);
+		goto out;
+	}
+
+	/*
+	 * Accept "si=" only at the start of an option, i.e. at the very
+	 * beginning of the string or right after a comma, so that an
+	 * option merely ending in "si" is not mistaken for it.
+	 */
+	for (cp = strstr(opt, "si="); cp; cp = strstr(cp + 1, "si="))
+		if (cp == opt || cp[-1] == ',')
+			break;
+	if (!cp) {
+		pr_err("Cannot find sbinfo in option string %s\n", opt);
+		goto out;
+	}
+
+	/* all ok, copy */
+	if (len < 4) {		/* 4 for "si_" */
+		pr_err("Buffer of %d bytes too small for sbinfo\n", len);
+		goto out;
+	}
+	strcpy(sbinfo, "si_");
+	n = 3;
+	sbinfo += n;
+	cp += n;
+	/* <ctype.h> functions take an unsigned char value */
+	while (isxdigit((unsigned char)*cp) && n < len) {
+		*sbinfo++ = *cp++;
+		n++;
+	}
+	if (n >= len) {
+		pr_err("Sbinfo in options string %s too long\n", opt);
+		goto out;
+	}
+	*sbinfo = '\0';
+	ret = 0;
+
+out:
+	free(fstype);
+	free(opt);
+	return ret;
+}
diff --git a/sysfs_parse.c b/sysfs_parse.c
new file mode 100644
index 0000000..5b13f6c
--- /dev/null
+++ b/sysfs_parse.c
@@ -0,0 +1,398 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <limits.h>
+#include <errno.h>
+#include <dirent.h>
+#include <sys/types.h>
+
+#include "cr_options.h"
+#include "criu-log.h"
+#include "xmalloc.h"
+#include "files.h"
+#include "proc_parse.h"
+#include "util.h"
+#include "sysfs_parse.h"
+
+/*
+ * If the process's root filesystem is AUFS, the symlinks
+ * in /proc/<pid>/map_files directory are absolute pathnames
+ * of the corresponding *physical* files in the branch they
+ * exist. 
For example, for a Docker container using AUFS, + * a symlink in /proc/<pid>/map_files would look like: + * + * 400000-489000 -> /var/lib/docker/aufs/diff/<layer>/bin/<file> + * + * Therefore, when we use vm_file_fd (as argument lfd) in + * dump_one_reg_file() to read the link, we get the file's + * physical absolute pathname which: + * + * 1) does not exist relative to the root of the mount + * namespace + * 2) and even if we used its relative pathname, the dev:ino + * would be different from the physical file's dev:ino + * + * causing the function (and dump) to fail. + * + * To prevent this failure, we must replace absolute pathnames + * to files in AUFS branches with relative pathnames from the root. + * Since the mountinfo file does not have the AUFS root pathname, + * we have to provide it with the --aufs-root option. + * + * In addition to fixing up the pathnames, we also need to fix up + * the following items of the root entry in mountinfo so that when + * in collect_shared() we call mounts_equal() to build the mnt_bind + * list, entries for hosts, hostname, etc., would be added to the list. + * + * The "reference" values are obtained from the mountinfo line that + * corresponds to the mountpoint file specified by --aufs-ref option. + * Below are example values for Docker. + * + * ITEM AUFS REPLACE WITH EXAMPLE + * ---- ---- ------- ---- ------ + * kmaj 0 dev major # 8 for /dev/sda1 + * kmin 33 dev minor # 1 for /dev/sda1 + * source none underlying dev /dev/disk/by-uuid/<uuid> + * options rw,si=<sbinfo> underlying opts rw,errors=remount-ro,data=ordered + * + * Note that we don't have to change fstype from "aufs" to, say, "ext4" + * because both are "unsupported" (see find_fstype_by_name()). + */ + +static int aufs_pid; +static char **aufs_branches; + +static char *ref_maj; +static char *ref_min; +static char *ref_src; +static char *ref_opt; + +/* + * Replace the string that *item points to with the + * string pointed to by option. 
+ *
+ * The old string is freed and *item is set to a newly allocated copy
+ * of option.  Returns 0 on success, or -1 (leaving *item untouched)
+ * when option is NULL or memory cannot be allocated.
+ */
+static int fixup_item(char *option, char **item)
+{
+	char *copy;
+
+	if (!option) {
+		pr_err("Null option passed for item %s\n", *item);
+		return -1;
+	}
+
+	copy = strdup(option);
+	if (!copy) {
+		pr_err("Cannot allocate %d bytes of memory\n",
+		       (int)strlen(option) + 1);
+		return -1;
+	}
+
+	pr_debug("Replacing %s with %s\n", *item, option);
+	free(*item);
+	*item = copy;
+	return 0;
+}
+
+/*
+ * Convert a decimal device number string into *val.
+ * Returns 0 on success and -1 if str is not a valid number.
+ */
+static int parse_dev_num(const char *str, unsigned int *val)
+{
+	unsigned long num;
+	char *end;
+
+	errno = 0;
+	num = strtoul(str, &end, 10);
+	if (errno || end == str || *end != '\0' || num != (unsigned int)num) {
+		pr_err("Bad reference device number %s\n", str);
+		return -1;
+	}
+
+	*val = (unsigned int)num;
+	return 0;
+}
+
+/*
+ * Get the device, source, and options values of the specified
+ * mountpoint to use as reference for fixing up the root entry.
+ *
+ * The values are kept in ref_maj, ref_min, ref_src and ref_opt.  They
+ * are replaced only when the whole line was parsed, so a failure
+ * neither leaks memory nor leaves a partially updated reference.
+ * Returns 0 on success and -1 on error.
+ */
+int parse_aufs_reference(pid_t pid, char *mntpoint)
+{
+	char line[PATH_MAX];
+	char *maj = NULL;
+	char *mnr = NULL;
+	char *src = NULL;
+	char *opt = NULL;
+	int off, ret, n;
+
+	off = get_mountinfo_by_mountpoint(pid, mntpoint, line, sizeof line);
+	if (off < 0)
+		return -1;
+
+	/* the device numbers are the third field of the line ... */
+	ret = sscanf(line, "%*i %*i %m[^:]:%ms", &maj, &mnr);
+	if (ret == 2) {
+		/*
+		 * ... while source and options follow the fstype field
+		 * that the offset returned above points at.
+		 */
+		n = sscanf(&line[off], "%*s %ms %ms", &src, &opt);
+		if (n > 0)
+			ret += n;
+	}
+	if (ret != 4) {
+		pr_err("Cannot scan reference attributes (ret %d)\n", ret);
+		free(maj);
+		free(mnr);
+		free(src);
+		free(opt);
+		return -1;
+	}
+
+	/* drop the values of a previous call, if any */
+	free(ref_maj);
+	free(ref_min);
+	free(ref_src);
+	free(ref_opt);
+	ref_maj = maj;
+	ref_min = mnr;
+	ref_src = src;
+	ref_opt = opt;
+
+	pr_debug("Will fix up root entry with %s %s %s %s\n",
+		 ref_maj, ref_min, ref_src, ref_opt);
+	return 0;
+}
+
+/*
+ * Replace *root with the pathname given by --aufs-root.  Without
+ * that option *root is left alone.  Returns 0 on success, -1 on error.
+ */
+int fixup_root(char **root)
+{
+	if (!opts.aufs_root)
+		return 0;
+
+	return fixup_item(opts.aufs_root, root);
+}
+
+/*
+ * Replace the device numbers with those of the reference mountpoint
+ * (--aufs-ref).  A reference value that is not available leaves the
+ * corresponding number unchanged.  Returns 0 on success, -1 on error.
+ */
+int fixup_dev(unsigned int *majp, unsigned int *minp)
+{
+	char *cp;
+
+	if (!opts.aufs_ref)
+		return 0;
+
+	if (get_aufs_ref_item("major", &cp) < 0)
+		return -1;
+	if (cp && parse_dev_num(cp, majp) < 0)
+		return -1;
+
+	if (get_aufs_ref_item("minor", &cp) < 0)
+		return -1;
+	if (cp && parse_dev_num(cp, minp) < 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Replace the source and options strings with those of the reference
+ * mountpoint (--aufs-ref).  A reference value that is not available
+ * leaves the corresponding string unchanged.  Returns 0 on success,
+ * -1 on error.
+ */
+int fixup_src_opt(char **srcp, char **optp)
+{
+	char *cp;
+
+	if (!opts.aufs_ref)
+		return 0;
+
+	if (get_aufs_ref_item("source", &cp) < 0)
+		return -1;
+	if (cp && fixup_item(cp, srcp) < 0)
+		return -1;
+
+	if (get_aufs_ref_item("options", &cp) < 0)
+		return -1;
+	if (cp && fixup_item(cp, optp) < 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Kernel stores pathnames to AUFS branches in the br<n> files in
+ * the /sys/fs/aufs/si_<sbinfo> directory where <n> denotes a branch
+ * number and <sbinfo> is a 
hexadecimal number in %lx format. For
+ * example:
+ *
+ * $ cat /sys/fs/aufs/si_f598876b087ed883/br0
+ * /path/to/branch0/directory=rw
+ *
+ * parse_aufs_branches() below sets up an array of pointers to branch
+ * pathnames from these files.
+ */
+
+/*
+ * Scan the open sbinfo directory dp for br<n> entries and store the
+ * highest branch number in *br_max (0 when there is none).  path is
+ * used for error reporting only.  Returns 0 on success and -1 when
+ * readdir() fails.
+ */
+static int find_max_aufs_branch(DIR *dp, const char *path, unsigned int *br_max)
+{
+	struct dirent *de;
+	unsigned int br_num;
+
+	*br_max = 0;
+	for (;;) {
+		/* readdir() returns NULL both at the end and on error */
+		errno = 0;
+		de = readdir(dp);
+		if (de == NULL) {
+			if (errno) {
+				pr_perror("Cannot readdir %s", path);
+				return -1;
+			}
+			return 0;
+		}
+
+		if (sscanf(de->d_name, "br%u", &br_num) == 1 && br_num > *br_max)
+			*br_max = br_num;
+	}
+}
+
+/*
+ * Read the branch pathname from the br file at path.  The file holds
+ * one "<pathname>=<permissions>" line; everything from the last '='
+ * on is chopped off.  Returns an allocated string that the caller
+ * must free, or NULL on error.
+ */
+static char *read_aufs_branch(const char *path)
+{
+	char *branch = NULL;
+	char *cp;
+	FILE *fp;
+
+	fp = fopen(path, "r");
+	if (fp == NULL) {
+		pr_perror("Cannot fopen %s", path);
+		return NULL;
+	}
+
+	/* read the whole line: a pathname may contain blanks */
+	if (fscanf(fp, "%m[^\n]", &branch) != 1 || branch == NULL) {
+		pr_perror("Parse error reading %s", path);
+		branch = NULL;
+		goto out;
+	}
+
+	/* chop off the trailing "=..." stuff */
+	cp = strrchr(branch, '=');
+	if (cp == NULL) {
+		pr_err("Bad format in branch pathname %s\n", branch);
+		free(branch);
+		branch = NULL;
+		goto out;
+	}
+	*cp = '\0';
+
+out:
+	fclose(fp);
+	return branch;
+}
+
+/*
+ * Collect the branch pathnames of the AUFS root filesystem of pid in
+ * the NULL-terminated aufs_branches array.  The result is cached: a
+ * repeated call for the same pid returns right away, a call for
+ * another pid releases the old array first.  Returns 0 on success
+ * and -1 on error.
+ */
+int parse_aufs_branches(pid_t pid)
+{
+	char path[AUFSBR_PATH_LEN];
+	unsigned int br_num;
+	unsigned int br_max;
+	int n;
+	int ret;
+	DIR *dp;
+
+	if (pid == 0) {
+		pr_err("No pid to parse its aufs branches\n");
+		return -1;
+	}
+
+	if (pid == aufs_pid) {
+		pr_debug("Using cached aufs branch paths for pid %d\n", pid);
+		return 0;
+	}
+
+	if (aufs_pid)
+		free_aufs_branches();
+
+	/*
+	 * Parse out the sbinfo value from /proc/<pid>/mountinfo
+	 * and open its corresponding directory in /sys/fs/aufs.
+	 */
+	strcpy(path, "/sys/fs/aufs/");
+	if (parse_mountinfo_aufs_sbinfo(pid, &path[13], SBINFO_LEN) < 0)
+		return -1;
+	dp = opendir(path);
+	if (dp == NULL) {
+		pr_perror("Cannot opendir %s", path);
+		return -1;
+	}
+
+	/*
+	 * Find out how many branches we have.
+	 */
+	ret = find_max_aufs_branch(dp, path, &br_max);
+	closedir(dp);
+	if (ret < 0)
+		return -1;
+
+	/* default maximum is 127, so 1000 should be plenty */
+	if (br_max >= 1000) {
+		pr_err("Too many branches %u\n", br_max);
+		return -1;
+	}
+
+	/*
+	 * Allocate an array of pointers to branch pathnames to be read.
+	 * Branches are indexed from 0 and we need a NULL pointer at the end.
+	 */
+	aufs_branches = xzalloc((br_max + 2) * sizeof (char *));
+	if (!aufs_branches)
+		return -1;
+
+	/*
+	 * Now read branch pathnames from the branch files.
+	 */
+	n = strlen(path);
+	for (br_num = 0; br_num <= br_max; br_num++) {
+		ret = snprintf(&path[n], sizeof path - n, "/br%u", br_num);
+		if (ret < 0 || (size_t)ret >= sizeof path - n) {
+			pr_err("Buffer overrun creating path for branch %u\n", br_num);
+			goto err;
+		}
+
+		aufs_branches[br_num] = read_aufs_branch(path);
+		if (aufs_branches[br_num] == NULL)
+			goto err;
+
+		pr_debug("%s : %s\n", path, aufs_branches[br_num]);
+	}
+
+	aufs_pid = pid;
+	return 0;
+
+err:
+	free_aufs_branches();
+	return -1;
+}
+
+/*
+ * Release the reference values and the branch pathname array, and
+ * forget which pid the array was built for so that the next call of
+ * parse_aufs_branches() reads the branches again.
+ *
+ * NOTE(review): the reference values set by parse_aufs_reference()
+ * are dropped here as well, as before; confirm that no caller needs
+ * them after the first call of this function.
+ */
+void free_aufs_branches(void)
+{
+	int n;
+
+	free(ref_maj);
+	ref_maj = NULL;
+	free(ref_min);
+	ref_min = NULL;
+	free(ref_src);
+	ref_src = NULL;
+	free(ref_opt);
+	ref_opt = NULL;
+
+	if (aufs_branches) {
+		for (n = 0; aufs_branches[n] != NULL; n++)
+			xfree(aufs_branches[n]);
+		xfree(aufs_branches);
+		aufs_branches = NULL;
+	}
+
+	/* the cache is gone, do not report a hit for the old pid */
+	aufs_pid = 0;
+}
+
+/*
+ * Look up a reference value by name: "major", "minor", "source" or
+ * "options".  On success 0 is returned and *cpp points to the stored
+ * string, which may be NULL when no reference has been parsed.  An
+ * unknown name returns -1.  The string is not copied and must not be
+ * freed by the caller.
+ */
+int get_aufs_ref_item(char *item, char **cpp)
+{
+	const struct {
+		const char *name;
+		char *value;
+	} refs[] = {
+		{ "major",   ref_maj },
+		{ "minor",   ref_min },
+		{ "source",  ref_src },
+		{ "options", ref_opt },
+	};
+	size_t i;
+
+	for (i = 0; i < sizeof refs / sizeof refs[0]; i++) {
+		if (strcmp(item, refs[i].name))
+			continue;
+
+		if (refs[i].value == NULL)
+			pr_warn("Reference item %s has NULL value\n", item);
+
+		*cpp = refs[i].value;
+		return 0;
+	}
+
+	pr_err("No such reference item %s\n", item);
+	return -1;
+}
+
+/*
+ * If the argument fd is a symbolic link to a file in an AUFS branch,
+ * return the pathname of that file under the AUFS root (--aufs-root). 
+ *
+ * The result lives in a static buffer that the next call overwrites.
+ * NULL is returned when the link cannot be read, when the file is not
+ * in a branch, or when the pathname cannot be fixed up.
+ */
+char *fixup_aufs_fd_path(int lfd)
+{
+	static char linkpath[PATH_MAX];
+
+	if (read_fd_link(lfd, linkpath, sizeof linkpath) < 0)
+		return NULL;
+
+	/* 0 is "not in a branch" and -1 an error: nothing was fixed up */
+	if (fixup_aufs_path(linkpath, sizeof linkpath, false) <= 0)
+		return NULL;
+
+	return linkpath;
+}
+
+/*
+ * If the specified path is in a branch pathname, replace it
+ * with either the relative or the full pathname from the root
+ * depending on chop.
+ *
+ * With chop set the branch prefix is simply removed; otherwise the
+ * prefix is replaced with the --aufs-root pathname, if one was given.
+ * A branch matches only on a pathname component boundary.  The
+ * branch directory itself becomes "/" when nothing else would be
+ * left.
+ *
+ * path is a buffer of size bytes that is updated in place.  Returns
+ * the length of the new pathname, 0 if path is not in any branch
+ * (path is unchanged), or -1 on error.
+ */
+int fixup_aufs_path(char *path, int size, bool chop)
+{
+	char rpath[PATH_MAX];
+	const char *root = "";
+	const char *rest = NULL;
+	int n;
+
+	if (aufs_branches == NULL) {
+		pr_err("No aufs branches to search for %s\n", path);
+		return -1;
+	}
+
+	for (n = 0; aufs_branches[n] != NULL; n++) {
+		size_t blen = strlen(aufs_branches[n]);
+
+		/* ignore trailing slashes of the branch pathname */
+		while (blen > 1 && aufs_branches[n][blen - 1] == '/')
+			blen--;
+		if (blen == 0 || strncmp(path, aufs_branches[n], blen))
+			continue;
+
+		/* "/a/b" is a prefix of "/a/bc" but not its parent */
+		if (path[blen] == '/' || path[blen] == '\0') {
+			rest = &path[blen];
+			break;
+		}
+	}
+
+	if (rest == NULL)
+		return 0;	/* not in a branch */
+
+	if (!chop && opts.aufs_root)
+		root = opts.aufs_root;
+	if (*root == '\0' && *rest == '\0')
+		rest = "/";	/* path is the branch directory itself */
+
+	n = snprintf(rpath, PATH_MAX, "%s%s", root, rest);
+	if (n < 0 || n >= min(PATH_MAX, size)) {
+		pr_err("Not enough space to replace %s\n", path);
+		return -1;
+	}
+
+	pr_debug("Replacing %s with %s\n", path, rpath);
+	strcpy(path, rpath);
+	return n;
+}