[CRIU] [PATCH v2 5/6] net: add support for macvlan link types

Tycho Andersen tycho.andersen at canonical.com
Thu Oct 13 08:15:44 PDT 2016


While this is in principle similar to how veths are handled, we have to do
things in two different ways depending on whether or not there is a user
namespace involved, because there is no way to ask the kernel to attach a
macvlan NIC to a device in a net ns that we don't have CAP_NET_ADMIN in.

So we do it in two ways:

a. If we are in a user namespace, we create the device in usernsd and use
   IFLA_NET_NS_PID to set the netns which it should be created in (saving
   us a "move into this netns" step).

b. If we aren't in a user namespace, we could still be in a net namespace,
   so we use IFLA_LINK_NETNSID to set the namespace that the i/o device will be
   in. Then we open a netlink socket from criu's netns and use
   IFLA_NET_NS_FD to tell the kernel to create the macvlan device in the
   target's namespace.

v2: * s/CLONE_NEWNET/CLONE_NEWUSER
    * Don't bother to dump IFLA_LINK and IFLA_LINK_NETNSID. Although we
      need to provide these on restore, there's no kernel interface that
      persists these. To populate IFLA_LINK, we require users pass
      --macvlan-pair, and we create a NETNSID relation as needed and pass
      that in for macvlan links (although this infrastructure could be used
      elsewhere for links that need it in the future, since it is in the
      hoisted populate_newlink_req()).
    * use new external command instead of creating a --macvlan-pair option

Signed-off-by: Tycho Andersen <tycho.andersen at canonical.com>
---
 Documentation/criu.txt |   4 +
 criu/crtools.c         |   1 +
 criu/external.c        |   9 ++
 criu/include/net.h     |   8 ++
 criu/net.c             | 278 +++++++++++++++++++++++++++++++++++++++++++++++--
 images/Makefile        |   1 +
 images/macvlan.proto   |   4 +
 images/netdev.proto    |   4 +
 8 files changed, 299 insertions(+), 10 deletions(-)
 create mode 100644 images/macvlan.proto

diff --git a/Documentation/criu.txt b/Documentation/criu.txt
index 5ee5995..8cc6087 100644
--- a/Documentation/criu.txt
+++ b/Documentation/criu.txt
@@ -381,6 +381,10 @@ during *dump*, and images are migrated to a less capable CPU and are
 to be restored. By default, *criu* shows an error that CPU capabilities
 are not adequate, but this can be suppressed by using *--cpu-cap=none*.
 
+*--macvlan-pair* '<IN>'*=*'<OUT>'::
+    Similar to *--veth-pair*, except '<OUT>' is the interface to attach the
+    macvlan device to.
+
 *check*
 ~~~~~~~
 Checks whether the kernel supports the features needed by *criu* to
diff --git a/criu/crtools.c b/criu/crtools.c
index 8b5ec5d..8deb20a 100644
--- a/criu/crtools.c
+++ b/criu/crtools.c
@@ -842,6 +842,7 @@ usage:
 "                        Formats of RES on restore:\n"
 "                            dev[VAL]:DEVPATH\n"
 "                            veth[IFNAME]:OUTNAME{@BRIDGE}\n"
+"                            macvlan[IFNAME]:OUTNAME\n"
 "\n"
 "* Special resources support:\n"
 "     --" SK_EST_PARAM "  checkpoint/restore established TCP connections\n"
diff --git a/criu/external.c b/criu/external.c
index bc6c6d4..d8fee21 100644
--- a/criu/external.c
+++ b/criu/external.c
@@ -3,6 +3,9 @@
 #include "cr_options.h"
 #include "xmalloc.h"
 #include "external.h"
+#include "util.h"
+
+#include "net.h"
 
 int add_external(char *key)
 {
@@ -12,6 +15,12 @@ int add_external(char *key)
 	if (!ext)
 		return -1;
 	ext->id = key;
+
+	if (strstartswith(key, "macvlan") && macvlan_ext_add(ext) < 0) {
+		xfree(ext);
+		return -1;
+	}
+
 	list_add(&ext->node, &opts.external);
 
 	return 0;
diff --git a/criu/include/net.h b/criu/include/net.h
index b4a6e99..f94d838 100644
--- a/criu/include/net.h
+++ b/criu/include/net.h
@@ -4,6 +4,7 @@
 #include <linux/netlink.h>
 
 #include "list.h"
+#include "external.h"
 
 struct cr_imgset;
 extern int dump_net_ns(int ns_id);
@@ -17,6 +18,12 @@ struct veth_pair {
 	char *bridge;
 };
 
+/*
+ * One macvlan inside/outside pairing, linkable into a list via @node.
+ *
+ * NOTE(review): nothing in the visible part of this patch references this
+ * struct (the parent ifindex is carried in external->data instead);
+ * presumably a leftover from the earlier --macvlan-pair approach -- confirm
+ * whether it is still needed.
+ */
+struct macvlan_pair {
+	struct list_head node;
+	char *inside;		/* presumably the macvlan device name inside the netns */
+	int ifi_outside;	/* presumably the ifindex of the outside (parent) device */
+};
+
 extern int collect_net_namespaces(bool for_dump);
 
 extern int network_lock(void);
@@ -30,6 +37,7 @@ extern int read_ns_sys_file(char *path, char *buf, int len);
 extern int restore_link_parms(NetDeviceEntry *nde, int nlsk);
 
 extern int veth_pair_add(char *in, char *out);
+extern int macvlan_ext_add(struct external *ext);
 extern int move_veth_to_bridge(void);
 
 #endif /* __CR_NET_H__ */
diff --git a/criu/net.c b/criu/net.c
index b5f9818..4dfde03 100644
--- a/criu/net.c
+++ b/criu/net.c
@@ -2,6 +2,7 @@
 #include <sys/socket.h>
 #include <linux/netlink.h>
 #include <linux/rtnetlink.h>
+#include <linux/net_namespace.h>
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_conntrack.h>
 #include <linux/netfilter/nf_conntrack_tcp.h>
@@ -11,6 +12,7 @@
 #include <sched.h>
 #include <sys/mount.h>
 #include <net/if.h>
+#include <linux/if.h>
 #include <linux/sockios.h>
 #include <libnl3/netlink/msg.h>
 
@@ -354,6 +356,7 @@ static int dump_one_netdev(int type, struct ifinfomsg *ifi,
 	SysctlEntry *confs6 = NULL;
 	int size6 = ARRAY_SIZE(devconfs6);
 	char stable_secret[MAX_STR_CONF_LEN + 1] = {};
+	struct nlattr *info[IFLA_INFO_MAX], **arg = NULL;
 
 	if (!tb[IFLA_IFNAME]) {
 		pr_err("No name for link %d\n", ifi->ifi_index);
@@ -421,7 +424,16 @@ static int dump_one_netdev(int type, struct ifinfomsg *ifi,
 	if (!dump)
 		dump = write_netdev_img;
 
-	ret = dump(&netdev, fds, tb);
+	if (tb[IFLA_LINKINFO]) {
+		ret = nla_parse_nested(info, IFLA_INFO_MAX, tb[IFLA_LINKINFO], NULL);
+		if (ret < 0) {
+			pr_err("failed to parse nested linkinfo\n");
+			return -1;
+		}
+		arg = info;
+	}
+
+	ret = dump(&netdev, fds, arg);
 err_free:
 	xfree(netdev.conf4);
 	xfree(confs4);
@@ -498,6 +510,37 @@ static int dump_bridge(NetDeviceEntry *nde, struct cr_imgset *imgset, struct nla
 	return write_netdev_img(nde, imgset, data);
 }
 
+/*
+ * Dump callback for macvlan devices.
+ *
+ * Pulls the macvlan mode (mandatory) and flags (optional) out of the nested
+ * IFLA_INFO_DATA attribute, attaches them to @nde as nde->macvlan and writes
+ * the netdev image entry.
+ *
+ * @info is the parsed IFLA_LINKINFO table; it must be non-NULL and contain
+ * IFLA_INFO_DATA. Returns -1 if the data is missing or cannot be parsed,
+ * otherwise the result of write_netdev_img().
+ */
+static int dump_macvlan(NetDeviceEntry *nde, struct cr_imgset *imgset, struct nlattr **info)
+{
+	MacvlanLinkEntry macvlan = MACVLAN_LINK_ENTRY__INIT;
+	int ret;
+	/* nla_parse_nested() fills slots 0..maxtype, so maxtype + 1 are needed */
+	struct nlattr *data[IFLA_MACVLAN_MAX + 1];
+
+	if (!info || !info[IFLA_INFO_DATA]) {
+		pr_err("no data for macvlan\n");
+		return -1;
+	}
+
+	ret = nla_parse_nested(data, IFLA_MACVLAN_MAX, info[IFLA_INFO_DATA], NULL);
+	if (ret < 0) {
+		pr_err("failed to parse macvlan data\n");
+		return -1;
+	}
+
+	if (!data[IFLA_MACVLAN_MODE]) {
+		pr_err("macvlan mode required for %s\n", nde->name);
+		return -1;
+	}
+
+	macvlan.mode = *((u32 *)RTA_DATA(data[IFLA_MACVLAN_MODE]));
+
+	if (data[IFLA_MACVLAN_FLAGS]) {
+		/*
+		 * flags is an optional field: it is only serialized (and only
+		 * honoured by macvlan_link_info() on restore) when has_flags
+		 * is set.
+		 */
+		macvlan.has_flags = true;
+		macvlan.flags = *((u16 *) RTA_DATA(data[IFLA_MACVLAN_FLAGS]));
+	}
+
+	/* macvlan lives on this stack frame; nde->macvlan is valid only for the write below */
+	nde->macvlan = &macvlan;
+	return write_netdev_img(nde, imgset, data);
+}
+
 static int dump_one_ethernet(struct ifinfomsg *ifi, char *kind,
 		struct nlattr **tb, struct cr_imgset *fds)
 {
@@ -530,6 +573,8 @@ static int dump_one_ethernet(struct ifinfomsg *ifi, char *kind,
 
 		pr_warn("GRE tap device %s not supported natively\n", name);
 	}
+	if (!strcmp(kind, "macvlan"))
+		return dump_one_netdev(ND_TYPE__MACVLAN, ifi, tb, fds, dump_macvlan);
 
 	return dump_unknown_device(ifi, kind, tb, fds);
 }
@@ -831,8 +876,14 @@ struct newlink_req {
 	char buf[1024];
 };
 
+/*
+ * Optional extra attributes for populate_newlink_req(). A negative value
+ * means "do not add this attribute".
+ */
+struct newlink_extras {
+	int netns_id;		/* sent as IFLA_LINK_NETNSID when >= 0 */
+	int link;		/* sent as IFLA_LINK when >= 0 */
+	int target_netns;	/* sent as IFLA_NET_NS_FD when >= 0 */
+};
+
 static int populate_newlink_req(struct newlink_req *req, int msg_type, NetDeviceEntry *nde,
-		int (*link_info)(NetDeviceEntry *, struct newlink_req *))
+		int (*link_info)(NetDeviceEntry *, struct newlink_req *), struct newlink_extras *extras)
 {
 	memset(req, 0, sizeof(*req));
 
@@ -850,6 +901,18 @@ static int populate_newlink_req(struct newlink_req *req, int msg_type, NetDevice
 		req->i.ifi_index = nde->ifindex;
 	req->i.ifi_flags = nde->flags;
 
+	if (extras) {
+		if (extras->netns_id >= 0)
+			addattr_l(&req->h, sizeof(*req), IFLA_LINK_NETNSID, &extras->netns_id, sizeof(extras->netns_id));
+
+		if (extras->link >= 0)
+			addattr_l(&req->h, sizeof(*req), IFLA_LINK, &extras->link, sizeof(extras->link));
+
+		if (extras->target_netns >= 0) {
+			addattr_l(&req->h, sizeof(*req), IFLA_NET_NS_FD, &extras->target_netns, sizeof(extras->target_netns));
+		}
+
+	}
 
 	addattr_l(&req->h, sizeof(*req), IFLA_IFNAME, nde->name, strlen(nde->name));
 	addattr_l(&req->h, sizeof(*req), IFLA_MTU, &nde->mtu, sizeof(nde->mtu));
@@ -879,11 +942,12 @@ static int populate_newlink_req(struct newlink_req *req, int msg_type, NetDevice
 }
 
 static int do_rtm_link_req(int msg_type, NetDeviceEntry *nde, int nlsk,
-		int (*link_info)(NetDeviceEntry *, struct newlink_req *))
+		int (*link_info)(NetDeviceEntry *, struct newlink_req *),
+		struct newlink_extras *extras)
 {
 	struct newlink_req req;
 
-	if (populate_newlink_req(&req, msg_type, nde, link_info) < 0)
+	if (populate_newlink_req(&req, msg_type, nde, link_info, extras) < 0)
 		return -1;
 
 	return do_rtnl_req(nlsk, &req, req.h.nlmsg_len, restore_link_cb, NULL, NULL);
@@ -891,14 +955,15 @@ static int do_rtm_link_req(int msg_type, NetDeviceEntry *nde, int nlsk,
 
 int restore_link_parms(NetDeviceEntry *nde, int nlsk)
 {
-	return do_rtm_link_req(RTM_SETLINK, nde, nlsk, NULL);
+	return do_rtm_link_req(RTM_SETLINK, nde, nlsk, NULL, NULL);
 }
 
 static int restore_one_link(NetDeviceEntry *nde, int nlsk,
-		int (*link_info)(NetDeviceEntry *, struct newlink_req *))
+		int (*link_info)(NetDeviceEntry *, struct newlink_req *),
+		struct newlink_extras *extras)
 {
 	pr_info("Restoring netdev %s idx %d\n", nde->name, nde->ifindex);
-	return do_rtm_link_req(RTM_NEWLINK, nde, nlsk, link_info);
+	return do_rtm_link_req(RTM_NEWLINK, nde, nlsk, link_info, extras);
 }
 
 #ifndef VETH_INFO_MAX
@@ -979,6 +1044,109 @@ static int bridge_link_info(NetDeviceEntry *nde, struct newlink_req *req)
 	return 0;
 }
 
+/*
+ * link_info callback for macvlan devices.
+ *
+ * Appends IFLA_INFO_KIND "macvlan" followed by a nested IFLA_INFO_DATA that
+ * carries the mode and, when the image has them, the flags.
+ *
+ * Returns 0 on success, -1 when @nde has no macvlan entry.
+ */
+static int macvlan_link_info(NetDeviceEntry *nde, struct newlink_req *req)
+{
+	struct rtattr *macvlan_data;
+	MacvlanLinkEntry *macvlan = nde->macvlan;
+
+	if (!macvlan) {
+		pr_err("Missing macvlan link entry %d\n", nde->ifindex);
+		return -1;
+	}
+
+	addattr_l(&req->h, sizeof(*req), IFLA_INFO_KIND, "macvlan", 7);
+
+	/* Open the nested attribute; its length is patched once the children are in. */
+	macvlan_data = NLMSG_TAIL(&req->h);
+	addattr_l(&req->h, sizeof(*req), IFLA_INFO_DATA, NULL, 0);
+
+	addattr_l(&req->h, sizeof(*req), IFLA_MACVLAN_MODE, &macvlan->mode, sizeof(macvlan->mode));
+
+	if (macvlan->has_flags) {
+		/*
+		 * The flags attribute is 16 bits wide (dump_macvlan() reads it
+		 * as u16), while the image field is 32 bits. Send a real u16 so
+		 * the payload has the right length and the value does not
+		 * depend on host byte order.
+		 */
+		u16 flags = macvlan->flags;
+
+		addattr_l(&req->h, sizeof(*req), IFLA_MACVLAN_FLAGS, &flags, sizeof(flags));
+	}
+
+	macvlan_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)macvlan_data;
+
+	return 0;
+}
+
+/*
+ * userns_call() handler: sends an already populated newlink request from a
+ * freshly opened rtnetlink socket, after appending IFLA_NET_NS_PID = @pid to
+ * it.
+ *
+ * @arg is the struct newlink_req to send; @fd is not used.
+ * Returns -1 if the socket cannot be created, otherwise the result of
+ * do_rtnl_req().
+ */
+static int userns_restore_one_link(void *arg, int fd, pid_t pid)
+{
+	struct newlink_req *request = arg;
+	int sk, err;
+
+	sk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+	if (sk < 0) {
+		pr_perror("Can't create nlk socket");
+		return -1;
+	}
+
+	/* Ask for the device to be created in the netns of @pid. */
+	addattr_l(&request->h, sizeof(*request), IFLA_NET_NS_PID, &pid, sizeof(pid));
+	err = do_rtnl_req(sk, request, request->h.nlmsg_len, restore_link_cb, NULL, NULL);
+
+	close(sk);
+	return err;
+}
+
+/*
+ * Issue an RTM_SETLINK over @nlsk that applies nde->flags to the device
+ * identified by name (nde->name). Returns the result of do_rtnl_req().
+ */
+static int changeflags(NetDeviceEntry *nde, int nlsk)
+{
+	struct newlink_req msg;
+
+	memzero(&msg, sizeof(msg));
+
+	/* ifinfomsg payload: only the family and the flags are filled in */
+	msg.i.ifi_family = AF_PACKET;
+	msg.i.ifi_flags = nde->flags;
+
+	/* netlink header */
+	msg.h.nlmsg_type = RTM_SETLINK;
+	msg.h.nlmsg_flags = NLM_F_REQUEST|NLM_F_ACK;
+	msg.h.nlmsg_seq = CR_NLMSG_SEQ;
+	msg.h.nlmsg_len = NLMSG_LENGTH(sizeof(msg.i));
+
+	/* the target device is selected by name */
+	addattr_l(&msg.h, sizeof(msg), IFLA_IFNAME, nde->name, strlen(nde->name));
+
+	return do_rtnl_req(nlsk, &msg, msg.h.nlmsg_len, NULL, NULL, NULL);
+}
+
+/*
+ * Return a netns id which, as seen through @nlsk, refers to the namespace
+ * behind the NS_FD_OFF service fd.
+ *
+ * The id is allocated with RTM_NEWNSID, trying ids 0..9 in turn, and is
+ * cached in a static variable so that later calls return it without
+ * talking to the kernel again.
+ *
+ * Returns the id (>= 0) on success, -1 on failure.
+ *
+ * NOTE(review): -EEXIST is treated as "this id is unavailable, try the
+ * next one"; presumably the kernel also reports it when the namespace
+ * already has an id assigned, in which case all ten attempts would fail --
+ * confirm.
+ */
+static int get_criu_netnsid(int nlsk)
+{
+	static int netnsid = -1;
+	struct {
+		struct nlmsghdr n;
+		struct rtgenmsg g;
+		char buf[1024];
+	} req;
+	int ns_fd = get_service_fd(NS_FD_OFF), i;
+
+	/* 0 is a valid id (it is the first one tried below), so test >= 0 */
+	if (netnsid >= 0)
+		return netnsid;
+
+	for (i = 0; i < 10; i++) {
+		int ret;
+
+		memset(&req, 0, sizeof(req));
+
+		req.n.nlmsg_len = NLMSG_LENGTH(sizeof(req.g));
+		req.n.nlmsg_flags = NLM_F_REQUEST|NLM_F_ACK;
+		req.n.nlmsg_type = RTM_NEWNSID;
+		req.n.nlmsg_seq = CR_NLMSG_SEQ;
+
+		addattr_l(&req.n, sizeof(req), NETNSA_FD, &ns_fd, sizeof(ns_fd));
+		addattr_l(&req.n, sizeof(req), NETNSA_NSID, &i, sizeof(i));
+
+		ret = do_rtnl_req(nlsk, &req, req.n.nlmsg_len, NULL, NULL, NULL);
+		if (ret == -EEXIST)
+			continue;
+		if (ret < 0) {
+			/* Any other error is fatal: do not cache or return a bogus id. */
+			errno = -ret;
+			pr_perror("couldn't create new netnsid");
+			return -1;
+		}
+
+		netnsid = i;
+		return netnsid;
+	}
+
+	pr_err("tried to create too many netnsids\n");
+	return -1;
+}
+
 static int restore_link(NetDeviceEntry *nde, int nlsk)
 {
 	pr_info("Restoring link %s type %d\n", nde->name, nde->type);
@@ -988,14 +1156,93 @@ static int restore_link(NetDeviceEntry *nde, int nlsk)
 	case ND_TYPE__EXTLINK:  /* see comment in images/netdev.proto */
 		return restore_link_parms(nde, nlsk);
 	case ND_TYPE__VENET:
-		return restore_one_link(nde, nlsk, venet_link_info);
+		return restore_one_link(nde, nlsk, venet_link_info, NULL);
 	case ND_TYPE__VETH:
-		return restore_one_link(nde, nlsk, veth_link_info);
+		return restore_one_link(nde, nlsk, veth_link_info, NULL);
 	case ND_TYPE__TUN:
 		return restore_one_tun(nde, nlsk);
 	case ND_TYPE__BRIDGE:
-		return restore_one_link(nde, nlsk, bridge_link_info);
+		return restore_one_link(nde, nlsk, bridge_link_info, NULL);
+	case ND_TYPE__MACVLAN: {
+		struct newlink_extras extras = {
+			.netns_id = -1,
+			.link = -1,
+			.target_netns = -1,
+		};
+		char key[100], *val;
+
+		snprintf(key, sizeof(key), "macvlan[%s]", nde->name);
+		val = external_lookup_data(key);
+		if (IS_ERR_OR_NULL(val)) {
+			pr_err("a macvlan parent for %s is required\n", nde->name);
+			return -1;
+		}
+
+		extras.link = (int) (unsigned long) val;
+
+		extras.netns_id = get_criu_netnsid(nlsk);
+		if (extras.netns_id < 0) {
+			pr_err("failed to get criu's netnsid\n");
+			return -1;
+		}
+
+		if (root_ns_mask & CLONE_NEWUSER) {
+			struct newlink_req req;
 
+			if (populate_newlink_req(&req, RTM_NEWLINK, nde, macvlan_link_info, &extras) < 0)
+				return -1;
+
+			if (userns_call(userns_restore_one_link, 0, &req, sizeof(req), -1) < 0) {
+				pr_err("couldn't restore macvlan interface %s via usernsd\n", nde->name);
+				return -1;
+			}
+		} else {
+			int my_netns, ret, root_nlsk;
+			int ns_fd = get_service_fd(NS_FD_OFF);
+
+			my_netns = open_proc(PROC_SELF, "ns/net");
+			if (my_netns < 0) {
+				pr_perror("couldn't get my netns");
+				return -1;
+			}
+
+			extras.target_netns = my_netns;
+
+			if (setns(ns_fd, CLONE_NEWNET) < 0) {
+				close(my_netns);
+				pr_perror("couldn't setns to parent ns");
+				return -1;
+			}
+
+			root_nlsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+			if (root_nlsk < 0) {
+				close(my_netns);
+				pr_perror("Can't create nlk socket");
+				return -1;
+			}
+
+			if (setns(my_netns, CLONE_NEWNET) < 0) {
+				close(my_netns);
+				close(root_nlsk);
+				pr_perror("couldn't setns back to my ns");
+				return -1;
+			}
+
+			ret = restore_one_link(nde, root_nlsk, macvlan_link_info, &extras);
+			close(my_netns);
+			close(root_nlsk);
+			if (ret < 0)
+				return -1;
+		}
+
+		/* We have to change the flags of the NDE manually here because
+		 * we used IFLA_LINK_NETNSID to restore it, which creates the
+		 * device and then shuts it down when it changes the device's
+		 * namespace, but doesn't start it back up when it goes to the
+		 * other namespace. So, we restore its state here.
+		 */
+		return changeflags(nde, nlsk);
+	}
 	default:
 		pr_err("Unsupported link type %d\n", nde->type);
 		break;
@@ -1648,6 +1895,17 @@ int veth_pair_add(char *in, char *out)
 	return add_external(e_str);
 }
 
+/*
+ * Resolve the value part of a macvlan external to an interface index and
+ * stash it in ext->data (cast to a pointer).
+ *
+ * Returns 0 on success, -1 if the index resolves to 0.
+ */
+int macvlan_ext_add(struct external *ext)
+{
+	unsigned long ifindex = if_nametoindex(external_val(ext));
+
+	ext->data = (void *) ifindex;
+	if (!ext->data) {
+		pr_perror("can't get ifindex of %s", ext->id);
+		return -1;
+	}
+
+	return 0;
+}
+
 /*
  * The setns() syscall (called by switch_ns()) can be extremely
  * slow. If we call it two or more times from the same task the
diff --git a/images/Makefile b/images/Makefile
index cf50794..eb18526 100644
--- a/images/Makefile
+++ b/images/Makefile
@@ -60,6 +60,7 @@ proto-obj-y	+= binfmt-misc.o
 proto-obj-y	+= time.o
 proto-obj-y	+= sysctl.o
 proto-obj-y	+= autofs.o
+proto-obj-y	+= macvlan.o
 
 CFLAGS		+= -iquote $(obj)/
 
diff --git a/images/macvlan.proto b/images/macvlan.proto
new file mode 100644
index 0000000..c9c9045
--- /dev/null
+++ b/images/macvlan.proto
@@ -0,0 +1,4 @@
+// Macvlan-specific link parameters stored with a net_device_entry.
+message macvlan_link_entry {
+	// Value of the IFLA_MACVLAN_MODE attribute at dump time.
+	required uint32	mode	= 1;
+	// Value of the IFLA_MACVLAN_FLAGS attribute, when the kernel reported one.
+	optional uint32 flags	= 2;
+}
diff --git a/images/netdev.proto b/images/netdev.proto
index 19b501c..2f2f3d1 100644
--- a/images/netdev.proto
+++ b/images/netdev.proto
@@ -1,5 +1,6 @@
 syntax = "proto2";
 
+import "macvlan.proto";
 import "opts.proto";
 import "tun.proto";
 import "sysctl.proto";
@@ -20,6 +21,7 @@ enum nd_type {
 	 */
 	VENET		= 5;
 	BRIDGE		= 6;
+	MACVLAN		= 7;
 }
 
 message net_device_entry {
@@ -38,6 +40,8 @@ message net_device_entry {
 	repeated sysctl_entry conf4	= 9;
 
 	repeated sysctl_entry conf6	= 10;
+
+	optional macvlan_link_entry	macvlan		= 11;
 }
 
 message netns_entry {
-- 
2.7.4



More information about the CRIU mailing list