[CRIU] [PATCH 2/9] lsm: support checkpoint/restore of stacked apparmor profiles

Tycho Andersen tycho.andersen at canonical.com
Mon Oct 3 11:13:41 PDT 2016


Support for apparmor namespaces and stacking is coming to Ubuntu kernels in
16.10, and should hopefully be upstreamed Soon (TM) :).

The basic idea is similar to how cgroups are done: we can restore the
apparmor namespace and profile blobs independently of the tasks, and then
at the end we can just set the task's label appropriately. This means the
code that moves tasks under a label stays the same, and the only new code
is the part that dumps and restores the policy blobs which the container
loaded into its namespace.

Signed-off-by: Tycho Andersen <tycho.andersen at canonical.com>
---
 criu/Makefile.crtools        |   1 +
 criu/apparmor.c              | 478 +++++++++++++++++++++++++++++++++++++++++++
 criu/cr-dump.c               |   4 +
 criu/cr-restore.c            |   4 +
 criu/image-desc.c            |   1 +
 criu/include/apparmor.h      |  12 ++
 criu/include/image-desc.h    |   1 +
 criu/include/magic.h         |   1 +
 criu/include/protobuf-desc.h |   1 +
 criu/lsm.c                   |   7 +
 criu/protobuf-desc.c         |   1 +
 images/Makefile              |   1 +
 images/apparmor.proto        |  16 ++
 images/creds.proto           |   3 +-
 lib/py/images/images.py      |   1 +
 15 files changed, 531 insertions(+), 1 deletion(-)
 create mode 100644 criu/apparmor.c
 create mode 100644 criu/include/apparmor.h
 create mode 100644 images/apparmor.proto

diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools
index c435748..a10b605 100644
--- a/criu/Makefile.crtools
+++ b/criu/Makefile.crtools
@@ -6,6 +6,7 @@ ccflags-y		+= -iquote compel/arch/$(ARCH)/plugins/std
 obj-y			+= action-scripts.o
 obj-y			+= external.o
 obj-y			+= aio.o
+obj-y			+= apparmor.o
 obj-y			+= bfd.o
 obj-y			+= bitmap.o
 obj-y			+= cgroup.o
diff --git a/criu/apparmor.c b/criu/apparmor.c
new file mode 100644
index 0000000..f49cfeb
--- /dev/null
+++ b/criu/apparmor.c
@@ -0,0 +1,478 @@
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <ftw.h>
+
+#include "config.h"
+#include "imgset.h"
+#include "pstree.h"
+#include "util.h"
+#include "lsm.h"
+#include "cr_options.h"
+
+#include "protobuf.h"
+#include "images/inventory.pb-c.h"
+#include "images/apparmor.pb-c.h"
+
+/*
+ * Apparmor stacked profile checkpoint restore. Previously, we just saved the
+ * profile that was in use by the task, and we expected it to be present on the
+ * target host. Now with stacking, containers are able to load their own
+ * profiles, so we can't rely on this.
+ *
+ * The basic idea here is that there is some (collection) of (potentially
+ * nested) namespaces that a container uses. We don't collect everything on the
+ * host level, but we *do* collect everything inside the namespace; a container
+ * could have loaded a profile but not yet used it when we start to checkpoint.
+ *
+ * Thus, the old code that saves and restores AA profiles is still relevant, we
+ * just need to add the new code in this file to walk the namespace and dump
+ * any blobs in that AA namespace, and then restore these blobs on restore so
+ * that the profiles the old code tries to use are actually present.
+ */
+
+/*
+ * Namespaces collected so far during dump. new_namespace() appends to this
+ * array and dump_aa_namespaces() hands it to the image writer and resets it.
+ */
+static AaNamespace **namespaces = NULL;
+static int n_namespaces = 0;
+
+/*
+ * Whether AA namespace dumping is attempted at all; collect_aa_namespace()
+ * only warns and returns when this is false. The LSM probing code in
+ * criu/lsm.c assigns it the result of check_aa_ns_dumping().
+ */
+bool ns_dumping_enabled = false;
+
+/*
+ * Allocate an AaNamespace called @name and register it with its owner.
+ *
+ * A namespace with a @parent is linked only into parent->namespaces; a
+ * namespace without one is appended to the global top-level list. The two
+ * lists are kept disjoint because dump_aa_namespaces() writes the top-level
+ * list and then releases it with apparmor_entry__free_unpacked(): a child that
+ * was also present at the top level would be written and freed twice. The
+ * error paths of walk_namespace() and collect_aa_namespace() rely on the same
+ * invariant when they undo a registration.
+ *
+ * Returns the new namespace, or NULL on allocation failure, in which case
+ * nothing is left registered.
+ */
+static AaNamespace *new_namespace(char *name, AaNamespace *parent)
+{
+	void *m;
+	AaNamespace *ret;
+
+	ret = xmalloc(sizeof(*ret));
+	if (!ret)
+		return NULL;
+	aa_namespace__init(ret);
+
+	ret->name = xstrdup(name);
+	if (!ret->name) {
+		xfree(ret);
+		return NULL;
+	}
+
+	if (parent) {
+		m = xrealloc(parent->namespaces, sizeof(*parent->namespaces) * (parent->n_namespaces + 1));
+		if (!m)
+			goto err;
+
+		parent->namespaces = m;
+		parent->namespaces[parent->n_namespaces++] = ret;
+		return ret;
+	}
+
+	m = xrealloc(namespaces, sizeof(*namespaces) * (n_namespaces + 1));
+	if (!m)
+		goto err;
+
+	namespaces = m;
+	namespaces[n_namespaces++] = ret;
+	return ret;
+
+err:
+	xfree(ret->name);
+	xfree(ret);
+	return NULL;
+}
+
+/*
+ * Read one loaded profile blob from securityfs and append it to @ns.
+ *
+ * @path: buffer holding the ".../profiles/" directory path; @name and
+ *        "/raw_data" are appended to it in place. The callers in this file
+ *        pass a PATH_MAX-sized buffer, which the bound check below assumes.
+ * @name: directory entry name of the profile; it must contain a '.', the
+ *        kernel-added ".N" suffix.
+ * @ns:   namespace that receives the new AaPolicy.
+ *
+ * Returns 0 on success and -1 on failure; on failure nothing is added to @ns
+ * and everything allocated here is released.
+ */
+static int collect_profile(char *path, char *name, AaNamespace *ns)
+{
+	AaPolicy *cur;
+	char *c;
+	int fd;
+	struct stat sb;
+	ssize_t n;
+	void *m;
+
+	/* sizeof() counts the terminating NUL of the final string */
+	if (strlen(path) + strlen(name) + sizeof("/raw_data") > PATH_MAX) {
+		pr_err("apparmor profile path too long: %s%s\n", path, name);
+		return -1;
+	}
+
+	strcat(path, name);
+	strcat(path, "/raw_data");
+
+	/* the apparmor kernel stuff puts an extra .N on the profile name that
+	 * is the directory name which is annoying. We strip it off here. */
+	c = strrchr(name, '.');
+	if (!c) {
+		pr_err("malformed apparmor profile name %s\n", name);
+		return -1;
+	}
+
+	pr_info("dumping profile %s\n", path);
+
+	cur = xmalloc(sizeof(*cur));
+	if (!cur)
+		return -1;
+	aa_policy__init(cur);
+
+	cur->name = xstrdup(name);
+	if (!cur->name) {
+		xfree(cur);
+		return -1;
+	}
+	/* cut the ".N" suffix from our copy only; @name belongs to the caller */
+	cur->name[c - name] = '\0';
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0) {
+		pr_perror("failed to open aa policy %s", path);
+		goto err;
+	}
+
+	if (fstat(fd, &sb) < 0) {
+		pr_perror("failed to stat %s", path);
+		goto close;
+	}
+
+	cur->blob.len = sb.st_size;
+	cur->blob.data = xmalloc(sb.st_size);
+	if (!cur->blob.data)
+		goto close;
+
+	n = read(fd, cur->blob.data, sb.st_size);
+	if (n < 0) {
+		pr_perror("failed to read %s", path);
+		goto free_blob;
+	}
+
+	if (n != sb.st_size) {
+		pr_err("didn't read all of %s\n", path);
+		goto free_blob;
+	}
+
+	close(fd);
+
+	m = xrealloc(ns->policies, sizeof(*ns->policies) * (ns->n_policies + 1));
+	if (!m) {
+		/* fd is already closed, so skip the close label */
+		xfree(cur->blob.data);
+		goto err;
+	}
+	ns->policies = m;
+	ns->policies[ns->n_policies++] = cur;
+
+	return 0;
+
+free_blob:
+	xfree(cur->blob.data);
+
+close:
+	close(fd);
+
+err:
+	xfree(cur->name);
+	xfree(cur);
+	return -1;
+}
+
+/*
+ * Recursively collect the AA namespace whose securityfs directory is in @path.
+ *
+ * @path:   PATH_MAX-sized scratch buffer holding the namespace directory; it
+ *          is extended and truncated in place while walking.
+ * @offset: length of the namespace directory path currently stored in @path.
+ * @ns:     namespace object receiving the child namespaces and the profiles.
+ *
+ * Children found under "<path>/namespaces/" are walked first, then every
+ * entry under "<path>/profiles/" is handed to collect_profile().
+ *
+ * Returns 0 on success, -1 on any failure.
+ */
+static int walk_namespace(char *path, size_t offset, AaNamespace *ns)
+{
+	DIR *dir = NULL;
+	struct dirent *de;
+	int ret = -1, len;
+	size_t my_offset;
+
+	if (offset >= PATH_MAX) {
+		pr_err("apparmor namespace path too long\n");
+		goto out;
+	}
+
+	/* collect all the child namespaces */
+	len = snprintf(path + offset, PATH_MAX - offset, "/namespaces/");
+	if (len < 0 || (size_t)len >= PATH_MAX - offset) {
+		pr_err("apparmor namespace path too long: %s\n", path);
+		goto out;
+	}
+	my_offset = offset + len;
+
+	dir = opendir(path);
+	if (!dir) {
+		pr_perror("failed to open %s", path);
+		goto out;
+	}
+
+	while ((de = readdir(dir))) {
+		AaNamespace *cur;
+
+		if (dir_dots(de))
+			continue;
+
+		/* overwrite whatever the previous iteration appended */
+		len = snprintf(path + my_offset, PATH_MAX - my_offset, "%s", de->d_name);
+		if (len < 0 || (size_t)len >= PATH_MAX - my_offset) {
+			pr_err("apparmor namespace path too long: %s\n", de->d_name);
+			goto out;
+		}
+
+		cur = new_namespace(de->d_name, ns);
+		if (!cur)
+			goto out;
+
+		if (walk_namespace(path, my_offset + len, cur) < 0) {
+			/* unlink the half-collected child from @ns again */
+			aa_namespace__free_unpacked(cur, NULL);
+			ns->n_namespaces--;
+			goto out;
+		}
+	}
+
+	closedir(dir);
+	dir = NULL;
+
+	/* now collect the profiles for this namespace */
+	len = snprintf(path + offset, PATH_MAX - offset, "/profiles/");
+	if (len < 0 || (size_t)len >= PATH_MAX - offset) {
+		pr_err("apparmor namespace path too long: %s\n", path);
+		goto out;
+	}
+	my_offset = offset + len;
+
+	dir = opendir(path);
+	if (!dir) {
+		pr_perror("failed to open %s", path);
+		goto out;
+	}
+
+	while ((de = readdir(dir))) {
+		if (dir_dots(de))
+			continue;
+
+		/* collect_profile() appends to @path, so cut it back first */
+		path[my_offset] = '\0';
+		if (collect_profile(path, de->d_name, ns) < 0)
+			goto out;
+	}
+
+	ret = 0;
+out:
+	if (dir)
+		closedir(dir);
+	return ret;
+}
+
+/*
+ * Collect the AA namespace named in a task's label, unless it has been
+ * collected already.
+ *
+ * @profile: the task's LSM label, or NULL. The text between the first two ':'
+ *           characters is taken as the namespace name; the string is
+ *           temporarily NUL-terminated in place and restored before returning.
+ *
+ * Returns 0 when there is nothing to do (no label, no namespace in the label,
+ * dumping disabled, namespace already collected) or when the namespace was
+ * collected; -1 on a malformed label or a collection failure.
+ */
+int collect_aa_namespace(char *profile)
+{
+	char path[PATH_MAX], *namespace, *end;
+	int ret, i;
+	AaNamespace *ns;
+
+	if (!profile)
+		return 0;
+
+	namespace = strchr(profile, ':');
+	if (!namespace)
+		return 0;  /* no namespace to dump */
+	namespace++;
+
+	if (!ns_dumping_enabled) {
+		pr_warn("Apparmor namespace present but dumping not enabled\n");
+		return 0;
+	}
+
+	/* XXX: this is not strictly correct; if something is using namespace
+	 * views, extra //s can indicate a namespace separation. However, I
+	 * think only the apparmor developers use this feature :)
+	 */
+	end = strchr(namespace, ':');
+	if (!end) {
+		pr_err("couldn't find AA namespace end in: %s\n", namespace);
+		return -1;
+	}
+
+	*end = '\0';
+
+	for (i = 0; i < n_namespaces; i++) {
+		/* did we already dump this namespace? */
+		if (!strcmp(namespaces[i]->name, namespace)) {
+			*end = ':';
+			return 0;
+		}
+	}
+
+	pr_info("dumping AA namespace %s\n", namespace);
+
+	ns = new_namespace(namespace, NULL);
+	*end = ':';
+	if (!ns)
+		return -1;
+
+	ret = snprintf(path, sizeof(path), AA_SECURITYFS_PATH "/policy/namespaces/%s", ns->name);
+	if (ret < 0 || (size_t)ret >= sizeof(path)) {
+		pr_err("snprintf failed?\n");
+		goto err;
+	}
+
+	if (walk_namespace(path, ret, ns) < 0) {
+		pr_err("walking AA namespace %s failed\n", ns->name);
+		goto err;
+	}
+
+	return 0;
+
+err:
+	/* drop the entry that new_namespace() appended to the global list */
+	aa_namespace__free_unpacked(ns, NULL);
+	n_namespaces--;
+	return -1;
+}
+
+/*
+ * Write every collected AA namespace to the CR_FD_APPARMOR image.
+ *
+ * The global namespace array is handed over to a temporary ApparmorEntry and
+ * released together with it, so the collection is empty again afterwards.
+ *
+ * Returns 0 when nothing was collected, otherwise the result of
+ * pb_write_one(); -1 if the entry could not be allocated.
+ */
+int dump_aa_namespaces(void)
+{
+	ApparmorEntry *ae = NULL;
+	int ret;
+
+	if (n_namespaces == 0)
+		return 0;
+
+	ae = xmalloc(sizeof(*ae));
+	if (!ae)
+		return -1;
+	apparmor_entry__init(ae);
+
+	ae->n_namespaces = n_namespaces;
+	ae->namespaces = namespaces;
+
+	ret = pb_write_one(img_from_set(glob_imgset, CR_FD_APPARMOR), ae, PB_APPARMOR);
+
+	apparmor_entry__free_unpacked(ae, NULL);
+	/*
+	 * Reset to a valid empty state: a count of -1 would make the next
+	 * new_namespace() store at index -1 and a second call to this function
+	 * emit an entry with a bogus length.
+	 */
+	n_namespaces = 0;
+	namespaces = NULL;
+
+	return ret;
+}
+
+/*
+ * Probe securityfs for AppArmor stacking support.
+ *
+ * Returns true only if "features/domain/stack" reads "yes" and
+ * "features/domain/version" reports at least version 1.2; false if either
+ * file is missing, unparsable, or reports anything else.
+ */
+bool check_aa_ns_dumping(void)
+{
+	char contents[48];
+	int major, minor, ret;
+	FILE *f;
+
+	f = fopen(AA_SECURITYFS_PATH "/features/domain/stack", "r");
+	if (!f)
+		return false;
+
+	/* the %s width excludes the terminating NUL, so leave one byte for it */
+	ret = fscanf(f, "%47s", contents);
+	fclose(f);
+	if (ret != 1) {
+		pr_err("scanning aa stack feature failed\n");
+		return false;
+	}
+
+	if (strcmp("yes", contents)) {
+		pr_warn("aa stack featured disabled: %s\n", contents);
+		return false;
+	}
+
+	f = fopen(AA_SECURITYFS_PATH "/features/domain/version", "r");
+	if (!f)
+		return false;
+
+	ret = fscanf(f, "%d.%d", &major, &minor);
+	fclose(f);
+	if (ret != 2) {
+		pr_err("scanning aa stack version failed\n");
+		return false;
+	}
+
+	/* compare as a (major, minor) pair so that e.g. 2.0 also qualifies */
+	return major > 1 || (major == 1 && minor >= 2);
+}
+
+/*
+ * Recreate AA namespace @ns, its child namespaces, and load its saved policy
+ * blobs.
+ *
+ * @ns:     namespace to restore.
+ * @path:   PATH_MAX-sized buffer holding the securityfs policy directory of
+ *          the enclosing namespace; "/namespaces/<name>" is appended to it.
+ * @offset: length of the string currently in @path.
+ *
+ * The work is done in a forked worker: it creates the namespace directory,
+ * writes "changeprofile :<name>:" to its own attr/current proc file, restores
+ * each child namespace, and finally writes every policy blob to the
+ * securityfs ".load" file. The caller only waits for the worker.
+ *
+ * Returns 0 if the worker exited with status 0, -1 otherwise.
+ */
+static int restore_aa_namespace(AaNamespace *ns, char *path, int offset)
+{
+	pid_t pid;
+	int status;
+
+	pid = fork();
+	if (pid < 0) {
+		pr_perror("fork failed");
+		return -1;
+	}
+
+	if (!pid) {
+		int i, my_offset, ret, fd;
+		char buf[1024];
+
+		ret = snprintf(buf, sizeof(buf), "changeprofile :%s:", ns->name);
+		if (ret < 0 || ret >= sizeof(buf)) {
+			pr_err("profile %s too big\n", ns->name);
+			exit(1);
+		}
+
+		my_offset = snprintf(path+offset, PATH_MAX-offset, "/namespaces/%s", ns->name);
+		if (my_offset < 0 || my_offset >= PATH_MAX-offset) {
+			pr_err("snprintf'd too many characters\n");
+			exit(1);
+		}
+
+		if (mkdir(path, 0755) < 0) {
+			if (errno == EEXIST) {
+				pr_warn("apparmor namespace %s already exists, restoring into it\n", path);
+			} else {
+				pr_perror("failed to create namespace %s", path);
+				exit(1);
+			}
+		}
+
+		fd = open_proc_rw(PROC_SELF, "attr/current");
+		if (fd < 0) {
+			pr_perror("couldn't open attr/current");
+			goto fail;
+		}
+
+		/* clear errno so a short write is not reported with a stale error */
+		errno = 0;
+		ret = write(fd, buf, strlen(buf));
+		close(fd);
+		if (ret != strlen(buf)) {
+			pr_perror("failed to change aa namespace");
+			goto fail;
+		}
+
+		/*
+		 * Descend into each child, not into @ns itself: passing @ns
+		 * here would recurse (and fork) without bound.
+		 */
+		for (i = 0; i < ns->n_namespaces; i++) {
+			if (restore_aa_namespace(ns->namespaces[i], path, offset + my_offset) < 0)
+				goto fail;
+		}
+
+		for (i = 0; i < ns->n_policies; i++) {
+			int load_fd, n;
+			AaPolicy *p = ns->policies[i];
+
+			load_fd = open(AA_SECURITYFS_PATH "/.load", O_WRONLY);
+			if (load_fd < 0) {
+				pr_perror("couldn't open apparmor load file");
+				goto fail;
+			}
+
+			n = write(load_fd, p->blob.data, p->blob.len);
+			close(load_fd);
+			if (n != p->blob.len) {
+				pr_perror("write AA policy failed");
+				goto fail;
+			}
+		}
+
+		exit(0);
+fail:
+		/* remove the namespace directory created (or found) above */
+		rmdir(path);
+		exit(1);
+	}
+
+	if (waitpid(pid, &status, 0) < 0) {
+		pr_perror("waitpid failed");
+		return -1;
+	}
+
+	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
+		return 0;
+
+	pr_err("failed to restore aa namespace, worker exited: %d\n", status);
+	return -1;
+}
+
+/*
+ * Restore every AA namespace recorded in the CR_FD_APPARMOR image.
+ *
+ * Returns 0 when the image holds no entry or all namespaces were restored,
+ * -1 if the image cannot be opened or read, or if any namespace fails to
+ * restore.
+ */
+int prepare_apparmor_namespaces(void)
+{
+	struct cr_img *img;
+	int ret, i;
+	ApparmorEntry *ae;
+
+	img = open_image(CR_FD_APPARMOR, O_RSTR);
+	if (!img)
+		return -1;
+
+	ret = pb_read_one_eof(img, &ae, PB_APPARMOR);
+	close_image(img);
+	/*
+	 * NOTE(review): assumes pb_read_one_eof() returns <0 on a read/parse
+	 * error and 0 on a clean EOF -- confirm against its definition. An
+	 * error must not be mistaken for "nothing to restore".
+	 */
+	if (ret < 0) {
+		pr_err("failed to read apparmor image\n");
+		return -1;
+	}
+	if (ret == 0)
+		return 0; /* there was no AA namespace entry */
+
+	BUG_ON(!ae);
+
+	/* no real reason we couldn't do this in parallel, but usually we
+	 * expect one namespace so there's probably not a lot to be gained.
+	 */
+	for (i = 0; i < ae->n_namespaces; i++) {
+		/* fresh buffer per namespace: restore appends to it in place */
+		char path[PATH_MAX] = AA_SECURITYFS_PATH "/policy";
+
+		if (restore_aa_namespace(ae->namespaces[i], path, strlen(path)) < 0) {
+			ret = -1;
+			goto out;
+		}
+	}
+
+	ret = 0;
+out:
+	apparmor_entry__free_unpacked(ae, NULL);
+	return ret;
+}
diff --git a/criu/cr-dump.c b/criu/cr-dump.c
index 1269999..e90ee52 100644
--- a/criu/cr-dump.c
+++ b/criu/cr-dump.c
@@ -81,6 +81,7 @@
 #include "seccomp.h"
 #include "seize.h"
 #include "fault-injection.h"
+#include "apparmor.h"
 
 #include "asm/dump.h"
 
@@ -1812,6 +1813,9 @@ int cr_dump_tasks(pid_t pid)
 		if (dump_namespaces(root_item, root_ns_mask) < 0)
 			goto err;
 
+	if (dump_aa_namespaces() < 0)
+		goto err;
+
 	ret = dump_cgroups();
 	if (ret)
 		goto err;
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index a323df9..3a157f2 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -77,6 +77,7 @@
 #include "seccomp.h"
 #include "fault-injection.h"
 #include "sk-queue.h"
+#include "apparmor.h"
 
 #include "parasite-syscall.h"
 #include "files-reg.h"
@@ -149,6 +150,9 @@ static int crtools_prepare_shared(void)
 	if (prepare_cgroup())
 		return -1;
 
+	if (prepare_apparmor_namespaces())
+		return -1;
+
 	return 0;
 }
 
diff --git a/criu/image-desc.c b/criu/image-desc.c
index bac7ca2..3728aa3 100644
--- a/criu/image-desc.c
+++ b/criu/image-desc.c
@@ -99,6 +99,7 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = {
 	FD_ENTRY(USERNS,	"userns-%d"),
 	FD_ENTRY(NETNF_CT,	"netns-ct-%d"),
 	FD_ENTRY(NETNF_EXP,	"netns-exp-%d"),
+	FD_ENTRY(APPARMOR,	"apparmor"),
 
 	[CR_FD_STATS] = {
 		.fmt	= "stats-%s",
diff --git a/criu/include/apparmor.h b/criu/include/apparmor.h
new file mode 100644
index 0000000..447e015
--- /dev/null
+++ b/criu/include/apparmor.h
@@ -0,0 +1,12 @@
+#ifndef __CR_APPARMOR_H__
+#define __CR_APPARMOR_H__
+
+/* for bool, so this header can be included on its own */
+#include <stdbool.h>
+
+/*
+ * Dump side. collect_aa_namespace() takes a task's LSM label (may be NULL)
+ * and collects the AA namespace named in it, if any and if not collected
+ * yet; dump_aa_namespaces() writes everything collected to the apparmor
+ * image. Both return 0 on success and a negative value on failure.
+ */
+int collect_aa_namespace(char *profile);
+int dump_aa_namespaces(void);
+
+/*
+ * ns_dumping_enabled gates collect_aa_namespace(); check_aa_ns_dumping()
+ * probes securityfs for stacking support and returns whether it is usable.
+ */
+extern bool ns_dumping_enabled;
+bool check_aa_ns_dumping(void);
+
+/*
+ * Restore side: recreate the namespaces stored in the apparmor image and
+ * load their policy blobs. Returns 0 on success, -1 on failure.
+ */
+int prepare_apparmor_namespaces(void);
+
+#endif /* __CR_APPARMOR_H__ */
diff --git a/criu/include/image-desc.h b/criu/include/image-desc.h
index bb69616..b0d14a3 100644
--- a/criu/include/image-desc.h
+++ b/criu/include/image-desc.h
@@ -81,6 +81,7 @@ enum {
 	CR_FD_TIMERFD,
 	CR_FD_FILE_LOCKS,
 	CR_FD_SECCOMP,
+	CR_FD_APPARMOR,
 	_CR_FD_GLOB_TO,
 
 	CR_FD_TMPFS_IMG,
diff --git a/criu/include/magic.h b/criu/include/magic.h
index deb54b1..9eff82b 100644
--- a/criu/include/magic.h
+++ b/criu/include/magic.h
@@ -93,6 +93,7 @@
 #define SECCOMP_MAGIC		0x64413049 /* Kostomuksha */
 #define BINFMT_MISC_MAGIC	0x67343323 /* Apatity */
 #define AUTOFS_MAGIC		0x49353943 /* Sochi */
+#define APPARMOR_MAGIC		0x59423047 /* Sablino */
 
 #define IFADDR_MAGIC		RAW_IMAGE_MAGIC
 #define ROUTE_MAGIC		RAW_IMAGE_MAGIC
diff --git a/criu/include/protobuf-desc.h b/criu/include/protobuf-desc.h
index 6c76b49..9c3a9c7 100644
--- a/criu/include/protobuf-desc.h
+++ b/criu/include/protobuf-desc.h
@@ -59,6 +59,7 @@ enum {
 	PB_BINFMT_MISC,		/* 50 */
 	PB_TTY_DATA,
 	PB_AUTOFS,
+	PB_APPARMOR,
 
 	/* PB_AUTOGEN_STOP */
 
diff --git a/criu/lsm.c b/criu/lsm.c
index 27ca004..cd3ef6d 100644
--- a/criu/lsm.c
+++ b/criu/lsm.c
@@ -10,6 +10,7 @@
 #include "util.h"
 #include "cr_options.h"
 #include "lsm.h"
+#include "apparmor.h"
 
 #include "protobuf.h"
 #include "images/inventory.pb-c.h"
@@ -121,6 +122,7 @@ void kerndat_lsm(void)
 		get_label = apparmor_get_label;
 		lsmtype = LSMTYPE__APPARMOR;
 		name = "apparmor";
+		ns_dumping_enabled = check_aa_ns_dumping();
 		return;
 	}
 
@@ -158,6 +160,11 @@ int collect_lsm_profile(pid_t pid, CredsEntry *ce)
 	if (get_label(pid, &ce->lsm_profile) < 0)
 		return -1;
 
+	if (lsmtype == LSMTYPE__APPARMOR && collect_aa_namespace(ce->lsm_profile) < 0) {
+		pr_err("failed to collect AA namespace\n");
+		return -1;
+	}
+
 	if (ce->lsm_profile)
 		pr_info("%d has lsm profile %s\n", pid, ce->lsm_profile);
 
diff --git a/criu/protobuf-desc.c b/criu/protobuf-desc.c
index 9352a76..1b2bcc7 100644
--- a/criu/protobuf-desc.c
+++ b/criu/protobuf-desc.c
@@ -64,6 +64,7 @@
 #include "images/seccomp.pb-c.h"
 #include "images/binfmt-misc.pb-c.h"
 #include "images/autofs.pb-c.h"
+#include "images/apparmor.pb-c.h"
 
 struct cr_pb_message_desc cr_pb_descs[PB_MAX];
 
diff --git a/images/Makefile b/images/Makefile
index cf50794..eb700ac 100644
--- a/images/Makefile
+++ b/images/Makefile
@@ -60,6 +60,7 @@ proto-obj-y	+= binfmt-misc.o
 proto-obj-y	+= time.o
 proto-obj-y	+= sysctl.o
 proto-obj-y	+= autofs.o
+proto-obj-y	+= apparmor.o
 
 CFLAGS		+= -iquote $(obj)/
 
diff --git a/images/apparmor.proto b/images/apparmor.proto
new file mode 100644
index 0000000..0c84f80
--- /dev/null
+++ b/images/apparmor.proto
@@ -0,0 +1,16 @@
+syntax = "proto2";
+
+// One loaded AppArmor profile: its name (taken from the profile's securityfs
+// directory entry) and the contents of that profile's raw_data file.
+message aa_policy {
+	required string		name	= 1;
+	required bytes		blob	= 2;
+}
+
+// One AppArmor policy namespace: the profiles loaded in it and the child
+// namespaces nested below it.
+message aa_namespace {
+	required string		name			= 1;
+	repeated aa_policy	policies		= 2;
+	repeated aa_namespace	namespaces		= 3;
+}
+
+// Top-level record of the apparmor image: every namespace collected at dump.
+message apparmor_entry {
+	repeated aa_namespace	namespaces		= 1;
+}
diff --git a/images/creds.proto b/images/creds.proto
index 29fb865..467a810 100644
--- a/images/creds.proto
+++ b/images/creds.proto
@@ -19,5 +19,6 @@ message creds_entry {
 
 	repeated uint32	groups	= 14;
 
-	optional string lsm_profile = 15;
+	optional string lsm_profile 	= 15;
+	optional bytes	apparmor_data	= 16;
 }
diff --git a/lib/py/images/images.py b/lib/py/images/images.py
index c593a3b..127f2b7 100644
--- a/lib/py/images/images.py
+++ b/lib/py/images/images.py
@@ -454,6 +454,7 @@ handlers = {
 	'USERNS'		: entry_handler(userns_entry),
 	'SECCOMP'		: entry_handler(seccomp_entry),
 	'AUTOFS'		: entry_handler(autofs_entry),
+	'APPARMOR'		: entry_handler(apparmor_entry),
 	}
 
 def __rhandler(f):
-- 
2.9.3



More information about the CRIU mailing list