[CRIU] [PATCH 3/5] net: add support for macvlan link types

Tycho Andersen tycho.andersen at canonical.com
Wed Jun 15 07:40:34 PDT 2016


On Wed, Jun 15, 2016 at 03:00:37PM +0300, Pavel Emelyanov wrote:
> On 06/14/2016 07:45 PM, Tycho Andersen wrote:
> > While this is in principle similar to how veths are handled, we have to do
> > things in two different ways depending on whether or not there is a user
> > namespace involved, because there is no way to ask the kernel to attach a
> > macvlan NIC to a device in a net ns that we don't have CAP_NET_ADMIN in.
> > 
> > So we do it in two ways:
> > 
> > a. If we are in a user namespace, we create the device in usernsd and use
> >    IFLA_NET_NS_PID to set the netns which it should be created in (saving
> >    us a "move into this netns" step).
> > 
> > b. If we aren't in a user namespace, we could still be in a net namespace,
> >    so we use IFLA_LINK_NETNSID to set namespace that the i/o device will be
> >    in. This is sort of a hack, since we have to do it via NSID, which we
> >    know is 0 because we don't allow namespace nesting.
> 
> Can we use IFLA_LINK_NETNSID even for the case "a"?

No, because it requires that you have CAP_NET_ADMIN in the namespace
that you are attaching the device to, which we don't in the userns
case :(

> (more comments inline)
> 
> > Signed-off-by: Tycho Andersen <tycho.andersen at canonical.com>
> > ---
> >  criu/net.c           | 132 +++++++++++++++++++++++++++++++++++++++++++++++++++
> >  images/Makefile      |   1 +
> >  images/macvlan.proto |   4 ++
> >  images/netdev.proto  |   6 +++
> >  4 files changed, 143 insertions(+)
> >  create mode 100644 images/macvlan.proto
> > 
> > diff --git a/criu/net.c b/criu/net.c
> > index 9954270..793b625 100644
> > --- a/criu/net.c
> > +++ b/criu/net.c
> > @@ -476,6 +476,61 @@ static int dump_bridge(NetDeviceEntry *nde, struct cr_imgset *imgset, struct nla
> >  	return write_netdev_img(nde, imgset, data);
> >  }
> >  
> > +static int dump_macvlan(NetDeviceEntry *nde, struct cr_imgset *imgset, struct nlattr **tb)
> > +{
> > +	MacvlanLinkEntry macvlan = MACVLAN_LINK_ENTRY__INIT;
> > +	int ret;
> > +	struct nlattr *info[IFLA_INFO_MAX], *data[IFLA_MACVLAN_MAX];
> > +
> > +	if (!tb || !tb[IFLA_LINKINFO]) {
> > +		pr_err("no for macvlan\n");
> > +		return -1;
> > +	}
> > +
> > +	if (tb[IFLA_LINK]) {
> > +		nde->has_link = true;
> > +		nde->link = *(int *)RTA_DATA(tb[IFLA_LINK]);
> > +	}
> > +
> > +	if (tb[IFLA_LINK_NETNSID]) {
> > +		nde->has_netns_id = true;
> > +		nde->netns_id = *(int *)RTA_DATA(tb[IFLA_LINK_NETNSID]);
> > +	}
> 
> This seems to be generic enough to be placed in dump_one_netdev.

I initially thought this too, except that the kernel sends it with
veth interfaces, and then gets confused when you send it back because
veths already have their own pseudo mechanism for doing this kind of
thing.

> > +
> > +	ret = nla_parse_nested(info, IFLA_INFO_MAX, tb[IFLA_LINKINFO], NULL);
> > +	if (ret < 0) {
> > +		pr_err("failed to parse nested linkinfo\n");
> > +		return -1;
> > +	}
> > +
> > +	if (!info[IFLA_INFO_DATA]) {
> > +		pr_err("no link info data for macvlan\n");
> > +		return -1;
> > +	}
> > +
> > +	ret = nla_parse_nested(data, IFLA_MACVLAN_MAX, info[IFLA_INFO_DATA], NULL);
> > +	if (ret < 0) {
> > +		pr_err("failed ot parse macvlan data\n");
> > +		return -1;
> > +	}
> > +
> > +	if (!data[IFLA_MACVLAN_MODE]) {
> > +		pr_err("macvlan mode required for %s\n", nde->name);
> > +		return -1;
> > +	}
> > +
> > +	macvlan.mode = *((u32 *)RTA_DATA(data[IFLA_MACVLAN_MODE]));
> > +
> > +	if (data[IFLA_MACVLAN_FLAGS])
> > +		macvlan.flags = *((u16 *) RTA_DATA(data[IFLA_MACVLAN_FLAGS]));
> > +
> > +	nde->macvlan = &macvlan;
> > +	ret = write_netdev_img(nde, imgset, data);
> > +
> > +	nde->macvlan = NULL;
> > +	return ret;
> 
> Just doing 'return write_netdev_img()' should be enough.

Sounds good :)

> > +}
> > +
> >  static int dump_one_ethernet(struct ifinfomsg *ifi, char *kind,
> >  		struct nlattr **tb, struct cr_imgset *fds)
> >  {
> > @@ -508,6 +563,8 @@ static int dump_one_ethernet(struct ifinfomsg *ifi, char *kind,
> >  
> >  		pr_warn("GRE tap device %s not supported natively\n", name);
> >  	}
> > +	if (!strcmp(kind, "macvlan"))
> > +		return dump_one_netdev(ND_TYPE__MACVLAN, ifi, tb, fds, dump_macvlan);
> >  
> >  	return dump_unknown_device(ifi, kind, tb, fds);
> >  }
> > @@ -828,6 +885,22 @@ static int populate_newlink_req(struct newlink_req *req, int msg_type, NetDevice
> >  		req->i.ifi_index = nde->ifindex;
> >  	req->i.ifi_flags = nde->flags;
> >  
> > +	/* Note that this id isn't preserved anywhere, but since we don't
> > +	 * support nested namespaces, right now there is only one peer
> > +	 * namespace, the parent NS with an id of 0, so this works. In the
> > +	 * future, we'll need to be more careful about munging this ID to be
> > +	 * correct (or restoring namespaces in such a way that they get the
> > +	 * same ID).
> > +	 */
> > +	if (nde->has_netns_id)
> > +		addattr_l(&req->h, sizeof(*req), IFLA_LINK_NETNSID, &nde->netns_id, sizeof(nde->netns_id));
> > +
> > +	/* Like netns_id, this is not preserved across hosts (indeed, a link
> > +	 * with this ifindex may not even exist). We add support for rewriting
> > +	 * it in a later patch.
> > +	 */
> > +	if (nde->has_link)
> > +		addattr_l(&req->h, sizeof(*req), IFLA_LINK, &nde->link, sizeof(nde->link));
> 
> Can you make these attrs setting and getting be symmetrincal -- either all
> are get/set in macvlan dump/restore code, or all are get/set in generic code,
> or one (netnsid?) is get/set in generic and the other in macvlan?

We can, but we'd need a bigger patch, since right now the callbacks on
restore only allow you to set things in IFLA_LINK_INFO, and these are
at that same level.

This is what I was thinking about in the cover letter, that we might
need some better way to populate these different pieces, sort of like
we have with fstype. I like that each function doesn't have to do the
math about how big IFLA_LINK_INFO is, so it would be nice to keep that
code generic, but then also allow us to add other top-level elements.

> >  	addattr_l(&req->h, sizeof(*req), IFLA_IFNAME, nde->name, strlen(nde->name));
> >  	addattr_l(&req->h, sizeof(*req), IFLA_MTU, &nde->mtu, sizeof(nde->mtu));
> > @@ -949,6 +1022,49 @@ static int bridge_link_info(NetDeviceEntry *nde, struct newlink_req *req)
> >  	return 0;
> >  }
> >  
> > +static int macvlan_link_info(NetDeviceEntry *nde, struct newlink_req *req)
> > +{
> > +	struct rtattr *macvlan_data;
> > +	MacvlanLinkEntry *macvlan = nde->macvlan;
> > +
> > +	if (!macvlan) {
> > +		pr_err("Missing macvlan link entry %d\n", nde->ifindex);
> > +		return -1;
> > +	}
> > +
> > +	addattr_l(&req->h, sizeof(*req), IFLA_INFO_KIND, "macvlan", 7);
> > +
> > +	macvlan_data = NLMSG_TAIL(&req->h);
> > +	addattr_l(&req->h, sizeof(*req), IFLA_INFO_DATA, NULL, 0);
> > +
> > +	addattr_l(&req->h, sizeof(*req), IFLA_MACVLAN_MODE, &macvlan->mode, sizeof(macvlan->mode));
> > +
> > +	if (macvlan->has_flags)
> > +		addattr_l(&req->h, sizeof(*req), IFLA_MACVLAN_FLAGS, &macvlan->flags, sizeof(macvlan->flags));
> > +
> > +	macvlan_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)macvlan_data;
> > +
> > +	return 0;
> > +}
> > +
> > +static int userns_restore_one_link(void *arg, int fd, pid_t pid)
> > +{
> > +	int nlsk, ret;
> > +	struct newlink_req *req = arg;
> > +
> > +	nlsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
> > +	if (nlsk < 0) {
> > +		pr_perror("Can't create nlk socket");
> > +		return -1;
> > +	}
> > +
> > +	addattr_l(&req->h, sizeof(*req), IFLA_NET_NS_PID, &pid, sizeof(pid));
> > +
> > +	ret = do_rtnl_req(nlsk, req, req->h.nlmsg_len, restore_link_cb, NULL, NULL);
> > +	close(nlsk);
> > +	return ret;
> > +}
> > +
> >  static int restore_link(NetDeviceEntry *nde, int nlsk)
> >  {
> >  	pr_info("Restoring link %s type %d\n", nde->name, nde->type);
> > @@ -965,7 +1081,23 @@ static int restore_link(NetDeviceEntry *nde, int nlsk)
> >  		return restore_one_tun(nde, nlsk);
> >  	case ND_TYPE__BRIDGE:
> >  		return restore_one_link(nde, nlsk, bridge_link_info);
> > +	case ND_TYPE__MACVLAN: {
> > +		if (root_ns_mask & CLONE_NEWNET) {
> 
> Shouldn't this be CLONE_NEWUSER instead?

Hmm, yes. And that makes me wonder about the test, since it should
have failed to restore in this case I think :(

Tycho

> > +			struct newlink_req req;
> >  
> > +			if (populate_newlink_req(&req, RTM_NEWLINK, nde, macvlan_link_info) < 0)
> > +				return -1;
> > +
> > +			if (userns_call(userns_restore_one_link, 0, &req, sizeof(req), -1) < 0) {
> > +				pr_err("couldn't restore macvlan interface %s via usernsd\n", nde->name);
> > +				return -1;
> > +			}
> > +
> > +			return 0;
> > +		}
> > +
> > +		return restore_one_link(nde, nlsk, macvlan_link_info);
> > +	}
> >  	default:
> >  		pr_err("Unsupported link type %d\n", nde->type);
> >  		break;
> > diff --git a/images/Makefile b/images/Makefile
> > index cf50794..eb18526 100644
> > --- a/images/Makefile
> > +++ b/images/Makefile
> > @@ -60,6 +60,7 @@ proto-obj-y	+= binfmt-misc.o
> >  proto-obj-y	+= time.o
> >  proto-obj-y	+= sysctl.o
> >  proto-obj-y	+= autofs.o
> > +proto-obj-y	+= macvlan.o
> >  
> >  CFLAGS		+= -iquote $(obj)/
> >  
> > diff --git a/images/macvlan.proto b/images/macvlan.proto
> > new file mode 100644
> > index 0000000..c9c9045
> > --- /dev/null
> > +++ b/images/macvlan.proto
> > @@ -0,0 +1,4 @@
> > +message macvlan_link_entry {
> > +	required uint32	mode	= 1;
> > +	optional uint32 flags	= 2;
> > +}
> > diff --git a/images/netdev.proto b/images/netdev.proto
> > index 37cafb3..746db16 100644
> > --- a/images/netdev.proto
> > +++ b/images/netdev.proto
> > @@ -1,3 +1,4 @@
> > +import "macvlan.proto";
> >  import "opts.proto";
> >  import "tun.proto";
> >  import "sysctl.proto";
> > @@ -18,6 +19,7 @@ enum nd_type {
> >  	 */
> >  	VENET		= 5;
> >  	BRIDGE		= 6;
> > +	MACVLAN		= 7;
> >  }
> >  
> >  message net_device_entry {
> > @@ -36,6 +38,10 @@ message net_device_entry {
> >  	repeated sysctl_entry conf4	= 9;
> >  
> >  	repeated sysctl_entry conf6	= 10;
> > +
> > +	optional int32			link		= 11;
> > +	optional int32			netns_id	= 12;
> > +	optional macvlan_link_entry	macvlan		= 13;
> >  }
> >  
> >  message netns_entry {
> > 
> 


More information about the CRIU mailing list