[CRIU] [PATCH v10 08/11] net: add support for macvlan link types

Tycho Andersen tycho.andersen at canonical.com
Thu Oct 20 09:25:28 PDT 2016


While this is in principle similar to how veths are handled, we have to do
things in two different ways depending on whether or not there is a user
namespace involved, because there is no way to ask the kernel to attach a
macvlan NIC to a device in a net ns that we don't have CAP_NET_ADMIN in.

So we do it in two ways:

a. If we are in a user namespace, we create the device in usernsd and use
   IFLA_NET_NS_FD to set the netns which it should be created in (saving
   us a "move into this netns" step).

b. If we aren't in a user namespace, we could still be in a net namespace,
   so we use IFLA_LINK_NETNSID to set the namespace that the i/o device will be
   in. Then we open a netlink socket from criu's netns and use
   IFLA_NET_NS_FD to tell the kernel to create the macvlan device in the
   target's namespace.

v2: * s/CLONE_NEWNET/CLONE_NEWUSER
    * Don't bother to dump IFLA_LINK and IFLA_LINK_NETNSID. Although we
      need to provide these on restore, there's no kernel interface that
      persists these. To populate IFLA_LINK, we require users pass
      --macvlan-pair, and we create a NETNSID relation as needed and pass
      that in for macvlan links (although this infrastructure could be used
      elsewhere for links that need it in the future, since it is in the
      hoisted populate_newlink_req()).
    * use new external command instead of creating a --macvlan-pair option

v3: add a feature check for linux/net_namespace.h, since not every arch in
    travis has this (new-ish) header

v4: * include sys/types.h instead of linux/if.h to get IFF_UP flag
    * remove old doc addition about --macvlan-pair option

v5: define IFLA_LINK_NETNSID and RTM_NEWNSID if they don't exist

v6: define IFLA_MACVLAN_FLAGS and bump the size of IFLA_MACVLAN_MAX when
    necessary

v7: * remove unused struct macvlan_pair
    * split feature test for linux/net_namespace.h into separate patch
    * move IFLA_INFO_MAX testing in dump_one_netdev to the right patch
    * add documentation for newlink_extras fields
    * split changeflags into separate patch
    * use existing netnsid if we get EEXIST
    * move macvlan code to a helper function
    * use netnsid to restore in userns case, and not pid

v8: * define RTM_GETNSID since we use that too now :)
    * don't bother with IFLA_MACVLAN_MAX; we only understand things up to
      IFLA_MACVLAN_FLAGS, so let's just use that as our max instead. The
      problem with using macros here, is that IFLA_MACVLAN_MAX is defined as
      a macro with an enum expansion in it, so we get bitten by the enum
      not being available at preprocessing time, and implicit zero coercion
      when testing against its value for stuff. Yeesh.

v10: * add some comments about when we set up NET_NS_FD and why we use
       IFLA_LINK and IFLA_NET_NS_ID
     * use the socket opened in restore_links() instead of opening one in
       restore_one_macvlan()
     * split the new argument to restore_one_link into its own patch

Signed-off-by: Tycho Andersen <tycho.andersen at canonical.com>
---
 criu/crtools.c            |   1 +
 criu/external.c           |   9 ++
 criu/include/libnetlink.h |   4 +
 criu/include/net.h        |   2 +
 criu/net.c                | 264 ++++++++++++++++++++++++++++++++++++++++++++++
 images/Makefile           |   1 +
 images/macvlan.proto      |   4 +
 images/netdev.proto       |   4 +
 8 files changed, 289 insertions(+)
 create mode 100644 images/macvlan.proto

diff --git a/criu/crtools.c b/criu/crtools.c
index 29b7e57..933c124 100644
--- a/criu/crtools.c
+++ b/criu/crtools.c
@@ -851,6 +851,7 @@ usage:
 "                        Formats of RES on restore:\n"
 "                            dev[VAL]:DEVPATH\n"
 "                            veth[IFNAME]:OUTNAME{@BRIDGE}\n"
+"                            macvlan[IFNAME]:OUTNAME\n"
 "\n"
 "* Special resources support:\n"
 "     --" SK_EST_PARAM "  checkpoint/restore established TCP connections\n"
diff --git a/criu/external.c b/criu/external.c
index bc6c6d4..d8fee21 100644
--- a/criu/external.c
+++ b/criu/external.c
@@ -3,6 +3,9 @@
 #include "cr_options.h"
 #include "xmalloc.h"
 #include "external.h"
+#include "util.h"
+
+#include "net.h"
 
 int add_external(char *key)
 {
@@ -12,6 +15,12 @@ int add_external(char *key)
 	if (!ext)
 		return -1;
 	ext->id = key;
+
+	if (strstartswith(key, "macvlan") && macvlan_ext_add(ext) < 0) {
+		xfree(ext);
+		return -1;
+	}
+
 	list_add(&ext->node, &opts.external);
 
 	return 0;
diff --git a/criu/include/libnetlink.h b/criu/include/libnetlink.h
index 591af0e..0549ef9 100644
--- a/criu/include/libnetlink.h
+++ b/criu/include/libnetlink.h
@@ -13,5 +13,9 @@ extern int addattr_l(struct nlmsghdr *n, int maxlen, int type,
 #define NLMSG_TAIL(nmsg) \
 	((struct rtattr *) (((void *) (nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len)))
 
+/*
+ * Pointer to the first rtattr that follows a struct rtgenmsg header located
+ * at @r (the header size is rounded up with NLMSG_ALIGN). Only defined here
+ * when the included headers do not already provide NETNS_RTA.
+ */
+#ifndef NETNS_RTA
+#define NETNS_RTA(r) \
+	((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct rtgenmsg))))
+#endif
 
 #endif /* __CR_LIBNETLINK_H__ */
diff --git a/criu/include/net.h b/criu/include/net.h
index b367e34..d621da1 100644
--- a/criu/include/net.h
+++ b/criu/include/net.h
@@ -4,6 +4,7 @@
 #include <linux/netlink.h>
 
 #include "list.h"
+#include "external.h"
 
 struct cr_imgset;
 extern int dump_net_ns(int ns_id);
@@ -30,6 +31,7 @@ extern int read_ns_sys_file(char *path, char *buf, int len);
 extern int restore_link_parms(NetDeviceEntry *nde, int nlsk);
 
 extern int veth_pair_add(char *in, char *out);
+extern int macvlan_ext_add(struct external *ext);
 extern int move_veth_to_bridge(void);
 
 #endif /* __CR_NET_H__ */
diff --git a/criu/net.c b/criu/net.c
index f7d586e..2f68e8e 100644
--- a/criu/net.c
+++ b/criu/net.c
@@ -1,6 +1,7 @@
 #include <unistd.h>
 #include <sys/socket.h>
 #include <linux/netlink.h>
+#include <linux/net_namespace.h>
 #include <linux/rtnetlink.h>
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_conntrack.h>
@@ -10,6 +11,7 @@
 #include <sys/wait.h>
 #include <sched.h>
 #include <sys/mount.h>
+#include <sys/types.h>
 #include <net/if.h>
 #include <linux/sockios.h>
 #include <libnl3/netlink/msg.h>
@@ -34,6 +36,22 @@
 #include "protobuf.h"
 #include "images/netdev.pb-c.h"
 
+/*
+ * Fallback definitions for builds whose system headers do not provide these
+ * netlink constants. Each one is only defined when it is not already a macro.
+ * NOTE(review): the numeric values are presumably copied from the kernel
+ * uapi headers -- confirm against linux/if_link.h and linux/rtnetlink.h
+ * before changing them.
+ */
+#ifndef IFLA_LINK_NETNSID
+#define IFLA_LINK_NETNSID	37
+#endif
+
+#ifndef RTM_NEWNSID
+#define RTM_NEWNSID		88
+#endif
+
+#ifndef RTM_GETNSID
+#define RTM_GETNSID		90
+#endif
+
+#ifndef IFLA_MACVLAN_FLAGS
+#define IFLA_MACVLAN_FLAGS 2
+#endif
+
 static int ns_sysfs_fd = -1;
 
 int read_ns_sys_file(char *path, char *buf, int len)
@@ -508,6 +526,37 @@ static int dump_bridge(NetDeviceEntry *nde, struct cr_imgset *imgset, struct nla
 	return write_netdev_img(nde, imgset, info);
 }
 
+/*
+ * Dump the macvlan-specific attributes of a link into the netdev image.
+ *
+ * @nde:    netdev entry being dumped; its ->macvlan member is pointed at a
+ *          stack-local entry for the write_netdev_img() call
+ * @imgset: image set handed through to write_netdev_img()
+ * @info:   attribute table indexed by IFLA_INFO_*; IFLA_INFO_DATA must be set
+ *
+ * Returns the result of write_netdev_img(), or -1 when the macvlan data is
+ * missing, cannot be parsed, or carries no IFLA_MACVLAN_MODE.
+ */
+static int dump_macvlan(NetDeviceEntry *nde, struct cr_imgset *imgset, struct nlattr **info)
+{
+	MacvlanLinkEntry macvlan = MACVLAN_LINK_ENTRY__INIT;
+	int ret;
+	struct nlattr *data[IFLA_MACVLAN_FLAGS+1];
+
+	if (!info || !info[IFLA_INFO_DATA]) {
+		pr_err("no data for macvlan\n");
+		return -1;
+	}
+
+	ret = nla_parse_nested(data, IFLA_MACVLAN_FLAGS, info[IFLA_INFO_DATA], NULL);
+	if (ret < 0) {
+		pr_err("failed to parse macvlan data\n");
+		return -1;
+	}
+
+	if (!data[IFLA_MACVLAN_MODE]) {
+		pr_err("macvlan mode required for %s\n", nde->name);
+		return -1;
+	}
+
+	macvlan.mode = *((u32 *)RTA_DATA(data[IFLA_MACVLAN_MODE]));
+
+	if (data[IFLA_MACVLAN_FLAGS]) {
+		/*
+		 * flags is an optional field: macvlan_link_info() only sends
+		 * it back on restore when has_flags is set, so mark it as
+		 * present here or the dumped value is lost.
+		 */
+		macvlan.has_flags = 1;
+		macvlan.flags = *((u16 *) RTA_DATA(data[IFLA_MACVLAN_FLAGS]));
+	}
+
+	nde->macvlan = &macvlan;
+	return write_netdev_img(nde, imgset, info);
+}
+
 static int dump_one_ethernet(struct ifinfomsg *ifi, char *kind,
 		struct nlattr **tb, struct cr_imgset *fds)
 {
@@ -540,6 +589,8 @@ static int dump_one_ethernet(struct ifinfomsg *ifi, char *kind,
 
 		pr_warn("GRE tap device %s not supported natively\n", name);
 	}
+	if (!strcmp(kind, "macvlan"))
+		return dump_one_netdev(ND_TYPE__MACVLAN, ifi, tb, fds, dump_macvlan);
 
 	return dump_unknown_device(ifi, kind, tb, fds);
 }
@@ -1026,6 +1077,206 @@ static int changeflags(int s, char *name, short flags)
 	return 0;
 }
 
+/*
+ * Append the macvlan IFLA_INFO_KIND and IFLA_INFO_DATA attributes for @nde
+ * to the newlink request @req.
+ *
+ * Returns 0 on success, -1 when @nde carries no macvlan entry or an
+ * attribute could not be added to the request.
+ * NOTE(review): assumes addattr_l() reports failure with a negative return.
+ */
+static int macvlan_link_info(NetDeviceEntry *nde, struct newlink_req *req)
+{
+	struct rtattr *macvlan_data;
+	MacvlanLinkEntry *macvlan = nde->macvlan;
+
+	if (!macvlan) {
+		pr_err("Missing macvlan link entry %d\n", nde->ifindex);
+		return -1;
+	}
+
+	if (addattr_l(&req->h, sizeof(*req), IFLA_INFO_KIND, "macvlan", 7) < 0)
+		goto err;
+
+	/* Open the nested IFLA_INFO_DATA attribute; its length is fixed up below. */
+	macvlan_data = NLMSG_TAIL(&req->h);
+	if (addattr_l(&req->h, sizeof(*req), IFLA_INFO_DATA, NULL, 0) < 0)
+		goto err;
+
+	if (addattr_l(&req->h, sizeof(*req), IFLA_MACVLAN_MODE, &macvlan->mode, sizeof(macvlan->mode)) < 0)
+		goto err;
+
+	if (macvlan->has_flags) {
+		/*
+		 * dump_macvlan() reads this attribute as a u16, so send it
+		 * back with the same width instead of the image's 32 bits.
+		 */
+		u16 flags = macvlan->flags;
+
+		if (addattr_l(&req->h, sizeof(*req), IFLA_MACVLAN_FLAGS, &flags, sizeof(flags)) < 0)
+			goto err;
+	}
+
+	macvlan_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)macvlan_data;
+
+	return 0;
+
+err:
+	pr_err("Can't add macvlan attributes for %s\n", nde->name);
+	return -1;
+}
+
+/*
+ * Callback handed to userns_call(): complete a prepared newlink request and
+ * send it over a freshly opened rtnetlink socket.
+ *
+ * @arg: the struct newlink_req built by the caller
+ * @fd:  descriptor received with the call; attached as IFLA_NET_NS_FD
+ * @pid: unused
+ *
+ * Returns the result of do_rtnl_req(), or -1 when the attribute cannot be
+ * added or the socket cannot be created.
+ * NOTE(review): assumes addattr_l() reports failure with a negative return.
+ */
+static int userns_restore_one_link(void *arg, int fd, pid_t pid)
+{
+	int nlsk, ret;
+	struct newlink_req *req = arg;
+
+	/*
+	 * IFLA_NET_NS_FD is what selects the namespace the link is created
+	 * in, so never send the request without it.
+	 */
+	if (addattr_l(&req->h, sizeof(*req), IFLA_NET_NS_FD, &fd, sizeof(fd)) < 0) {
+		pr_err("Can't add IFLA_NET_NS_FD to the link request\n");
+		return -1;
+	}
+
+	nlsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+	if (nlsk < 0) {
+		pr_perror("Can't create nlk socket");
+		return -1;
+	}
+
+	ret = do_rtnl_req(nlsk, req, req->h.nlmsg_len, restore_link_cb, NULL, NULL);
+	close(nlsk);
+	return ret;
+}
+
+/*
+ * do_rtnl_req() response callback used for RTM_GETNSID: walk the attributes
+ * that follow the rtgenmsg header and store the NETNSA_NSID value.
+ *
+ * @nlh: netlink message to parse
+ * @arg: points to the int that receives the nsid; it is left untouched when
+ *       the message carries no NETNSA_NSID attribute
+ *
+ * Returns 0 when *arg holds a non-negative nsid afterwards, -1 otherwise.
+ */
+static int get_nsid_cb(struct nlmsghdr *nlh, void *arg)
+{
+	struct rtgenmsg *rthdr;
+	struct rtattr *rta;
+	int len, *netnsid = arg;
+
+	rthdr = NLMSG_DATA(nlh);
+	len = nlh->nlmsg_len - NLMSG_SPACE(sizeof(*rthdr));
+
+	if (len < 0)
+		return -1;
+
+	rta = NETNS_RTA(rthdr);
+
+	while (RTA_OK(rta, len)) {
+		if (rta->rta_type == NETNSA_NSID)
+			*netnsid = *((int *) RTA_DATA(rta));
+		rta = RTA_NEXT(rta, len);
+	}
+
+	/* Check the value that was stored, not the pointer it was stored through. */
+	if (*netnsid < 0) {
+		pr_err("Didn't get a netnsid back from netlink?\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Return a netnsid for the namespace behind the NS_FD_OFF service fd, as
+ * seen through the netlink socket @nlsk. The id is requested with
+ * RTM_NEWNSID; if that fails with EEXIST the existing id is fetched with
+ * RTM_GETNSID. The result is kept in a function-local static and reused by
+ * later calls.
+ *
+ * Returns the nsid (>= 0) on success, -1 on error.
+ */
+static int get_criu_netnsid(int nlsk)
+{
+	static int netnsid = -1;
+	struct {
+		struct nlmsghdr n;
+		struct rtgenmsg g;
+		char buf[1024];
+	} req;
+	int ns_fd = get_service_fd(NS_FD_OFF), i;
+
+	/* 0 is a valid id (it is the first one requested below), so cache it too. */
+	if (netnsid >= 0)
+		return netnsid;
+
+	/*
+	 * NOTE(review): every path through the first iteration returns, so
+	 * only id 0 is ever requested and the message after the loop cannot
+	 * be reached. Confirm whether retrying with the next id was intended.
+	 */
+	for (i = 0; i < 10; i++) {
+		int ret;
+
+		memset(&req, 0, sizeof(req));
+
+		req.n.nlmsg_len = NLMSG_LENGTH(sizeof(req.g));
+		req.n.nlmsg_flags = NLM_F_REQUEST|NLM_F_ACK;
+		req.n.nlmsg_type = RTM_NEWNSID;
+		req.n.nlmsg_seq = CR_NLMSG_SEQ;
+
+		addattr_l(&req.n, sizeof(req), NETNSA_FD, &ns_fd, sizeof(ns_fd));
+		addattr_l(&req.n, sizeof(req), NETNSA_NSID, &i, sizeof(i));
+
+		ret = do_rtnl_req(nlsk, &req, req.n.nlmsg_len, NULL, NULL, NULL);
+		if (ret < 0) {
+			if (ret == -EEXIST) {
+				/* Reuse the same request to ask for the id that already exists. */
+				req.n.nlmsg_type = RTM_GETNSID;
+				ret = do_rtnl_req(nlsk, &req, req.n.nlmsg_len, get_nsid_cb, NULL, &netnsid);
+				if (ret < 0) {
+					pr_err("Couldn't get netnsid: %d\n", ret);
+					return -1;
+				}
+
+				return netnsid;
+			}
+			errno = -ret;
+			pr_perror("couldn't create new netnsid");
+			return -1;
+		}
+
+		netnsid = i;
+		return netnsid;
+	}
+
+	pr_err("tried to create too many netnsids\n");
+	return -1;
+}
+
+/*
+ * Restore the macvlan device described by @nde.
+ *
+ * @nde:       netdev entry to restore; nde->name selects the "macvlan[NAME]"
+ *             external whose data is used as the parent link
+ * @nlsk:      netlink socket passed to get_criu_netnsid()
+ * @criu_nlsk: netlink socket used to create the link in the non-userns case
+ *
+ * Returns the result of changeflags() once the link has been created, or -1
+ * on any earlier failure.
+ */
+static int restore_one_macvlan(NetDeviceEntry *nde, int nlsk, int criu_nlsk)
+{
+	struct newlink_extras extras = {
+		.netns_id = -1,
+		.link = -1,
+		.target_netns = -1,
+	};
+	char key[100], *val;
+	int my_netns = -1, ret = -1, s;
+
+	snprintf(key, sizeof(key), "macvlan[%s]", nde->name);
+	val = external_lookup_data(key);
+	if (IS_ERR_OR_NULL(val)) {
+		pr_err("a macvlan parent for %s is required\n", nde->name);
+		return -1;
+	}
+
+	/* link and netns_id are used to identify the master device to plug our
+	 * macvlan slave into. We identify the destination via setting
+	 * IFLA_NET_NS_FD to my_netns, but we have to do that in two different
+	 * ways: in the userns case, we send the fd across to usernsd and set
+	 * it there, whereas in the non-userns case we can just set it here,
+	 * since we can just use a socket from criu's net ns given to us by
+	 * restore_links(). We need to do this two different ways because
+	 * CAP_NET_ADMIN is required in both namespaces, which we don't have in
+	 * the userns case, and usernsd doesn't exist in the non-userns case.
+	 */
+	extras.link = (int) (unsigned long) val;
+
+	extras.netns_id = get_criu_netnsid(nlsk);
+	if (extras.netns_id < 0) {
+		pr_err("failed to get criu's netnsid\n");
+		return -1;
+	}
+
+	my_netns = open_proc(PROC_SELF, "ns/net");
+	if (my_netns < 0) {
+		pr_perror("couldn't get my netns");
+		return -1;
+	}
+
+	/* From here on every exit goes through "out" so my_netns is closed. */
+	if (root_ns_mask & CLONE_NEWUSER) {
+		struct newlink_req req;
+
+		if (populate_newlink_req(&req, RTM_NEWLINK, nde, macvlan_link_info, &extras) < 0)
+			goto out;
+
+		if (userns_call(userns_restore_one_link, 0, &req, sizeof(req), my_netns) < 0) {
+			pr_err("couldn't restore macvlan interface %s via usernsd\n", nde->name);
+			goto out;
+		}
+	} else {
+		extras.target_netns = my_netns;
+		if (restore_one_link(nde, criu_nlsk, macvlan_link_info, &extras) < 0) {
+			ret = -1;
+			goto out;
+		}
+	}
+
+	/* We have to change the flags of the NDE manually here because
+	 * we used IFLA_LINK_NETNSID to restore it, which creates the
+	 * device and then shuts it down when it changes the device's
+	 * namespace, but doesn't start it back up when it goes to the
+	 * other namespace. So, we restore its state here.
+	 */
+	s = socket(AF_LOCAL, SOCK_STREAM, 0);
+	if (s < 0) {
+		pr_perror("couldn't open socket for flag changing");
+		ret = -1;
+		goto out;
+	}
+	ret = changeflags(s, nde->name, nde->flags);
+	close(s);
+
+out:
+	if (my_netns >= 0)
+		close(my_netns);
+	return ret;
+}
+
 static int restore_link(NetDeviceEntry *nde, int nlsk, int criu_nlsk)
 {
 	pr_info("Restoring link %s type %d\n", nde->name, nde->type);
@@ -1042,6 +1293,8 @@ static int restore_link(NetDeviceEntry *nde, int nlsk, int criu_nlsk)
 		return restore_one_tun(nde, nlsk);
 	case ND_TYPE__BRIDGE:
 		return restore_one_link(nde, nlsk, bridge_link_info, NULL);
+	case ND_TYPE__MACVLAN:
+		return restore_one_macvlan(nde, nlsk, criu_nlsk);
 	default:
 		pr_err("Unsupported link type %d\n", nde->type);
 		break;
@@ -1729,6 +1982,17 @@ int veth_pair_add(char *in, char *out)
 	return add_external(e_str);
 }
 
+/*
+ * Resolve the interface named by the value of external @ext to its ifindex
+ * and keep that index in ext->data.
+ *
+ * Returns 0 on success, -1 when if_nametoindex() yields 0.
+ */
+int macvlan_ext_add(struct external *ext)
+{
+	unsigned long ifindex = if_nametoindex(external_val(ext));
+
+	/* The index is carried around packed into the data pointer. */
+	ext->data = (void *) ifindex;
+	if (!ifindex) {
+		pr_perror("can't get ifindex of %s", ext->id);
+		return -1;
+	}
+
+	return 0;
+}
+
 /*
  * The setns() syscall (called by switch_ns()) can be extremely
  * slow. If we call it two or more times from the same task the
diff --git a/images/Makefile b/images/Makefile
index cf50794..eb18526 100644
--- a/images/Makefile
+++ b/images/Makefile
@@ -60,6 +60,7 @@ proto-obj-y	+= binfmt-misc.o
 proto-obj-y	+= time.o
 proto-obj-y	+= sysctl.o
 proto-obj-y	+= autofs.o
+proto-obj-y	+= macvlan.o
 
 CFLAGS		+= -iquote $(obj)/
 
diff --git a/images/macvlan.proto b/images/macvlan.proto
new file mode 100644
index 0000000..c9c9045
--- /dev/null
+++ b/images/macvlan.proto
@@ -0,0 +1,4 @@
+/*
+ * Macvlan-specific link attributes, referenced from net_device_entry.
+ */
+message macvlan_link_entry {
+	/* IFLA_MACVLAN_MODE value read when the link was dumped */
+	required uint32	mode	= 1;
+	/* IFLA_MACVLAN_FLAGS value, for links that reported that attribute */
+	optional uint32 flags	= 2;
+}
diff --git a/images/netdev.proto b/images/netdev.proto
index 19b501c..2f2f3d1 100644
--- a/images/netdev.proto
+++ b/images/netdev.proto
@@ -1,5 +1,6 @@
 syntax = "proto2";
 
+import "macvlan.proto";
 import "opts.proto";
 import "tun.proto";
 import "sysctl.proto";
@@ -20,6 +21,7 @@ enum nd_type {
 	 */
 	VENET		= 5;
 	BRIDGE		= 6;
+	MACVLAN		= 7;
 }
 
 message net_device_entry {
@@ -38,6 +40,8 @@ message net_device_entry {
 	repeated sysctl_entry conf4	= 9;
 
 	repeated sysctl_entry conf6	= 10;
+
+	optional macvlan_link_entry	macvlan		= 11;
 }
 
 message netns_entry {
-- 
2.7.4



More information about the CRIU mailing list