[CRIU] [PATCH 1/2] sk-inet: Add initial support for raw sockets

Pavel Emelyanov xemul at virtuozzo.com
Wed Nov 16 09:03:50 PST 2016


On 11/16/2016 01:59 PM, Cyrill Gorcunov wrote:
> For raw sockets we need DIAG module extension, 

What's the status of respective kernel patches?

> so if
> we fail while collecting sockets, don't exit with an
> error but warn the user; if we really meet a raw socket
> we will exit later, at the socket lookup stage.
> 
> Strictly speaking we can use procfs parsing instead, but
> this is going to be way more complex than the well-known diag
> approach, and taking into account that raw sockets are
> not that widely used, let's support them only when the diag
> module is present in the system.
> 
> In this patch the initial raw sockets support is added,
> complete enough to handle the SO_IP_SET request from the
> ipset tool (needed by modern containers). But the
> code might need extension/fixes in the future.
> 
> Signed-off-by: Cyrill Gorcunov <gorcunov at openvz.org>
> ---
>  criu/include/sk-inet.h | 12 +++++++++++
>  criu/sk-inet.c         | 58 ++++++++++++++++++++++++++++++++++++--------------
>  criu/sockets.c         | 35 ++++++++++++++++++++++++++++++
>  images/sk-inet.proto   |  2 ++
>  4 files changed, 91 insertions(+), 16 deletions(-)
> 
> diff --git a/criu/include/sk-inet.h b/criu/include/sk-inet.h
> index a06a8ac161da..58c05887e04d 100644
> --- a/criu/include/sk-inet.h
> +++ b/criu/include/sk-inet.h
> @@ -16,6 +16,18 @@
>  #define TCP_REPAIR_OPTIONS	22
>  #endif
>  
> +#ifndef IP_HDRINCL
> +# define IP_HDRINCL		3
> +#endif
> +
> +#ifndef IP_NODEFRAG
> +# define IP_NODEFRAG		22
> +#endif
> +
> +#ifndef IPV6_HDRINCL
> +# define IPV6_HDRINCL		36
> +#endif
> +
>  struct inet_sk_desc {
>  	struct socket_desc	sd;
>  	unsigned int		type;
> diff --git a/criu/sk-inet.c b/criu/sk-inet.c
> index 924cf8c09d60..42134d4d5b3b 100644
> --- a/criu/sk-inet.c
> +++ b/criu/sk-inet.c
> @@ -94,7 +94,7 @@ static void show_one_inet_img(const char *act, const InetSkEntry *e)
>  		e->state, src_addr);
>  }
>  
> -static int can_dump_ipproto(int ino, int proto)
> +static int can_dump_ipproto(int ino, int proto, int type)
>  {
>  	/* Make sure it's a proto we support */
>  	switch (proto) {
> @@ -104,8 +104,12 @@ static int can_dump_ipproto(int ino, int proto)
>  	case IPPROTO_UDPLITE:
>  		break;
>  	default:
> -		pr_err("Unsupported proto %d for socket %x\n", proto, ino);
> -		return 0;
> +		/* Raw sockets may have any protocol inside */
> +		if (type != SOCK_RAW) {
> +			pr_err("Unsupported proto %d (type %d) for socket %x\n",
> +			       proto, type, ino);
> +			return 0;
> +		}
>  	}
>  
>  	return 1;
> @@ -135,9 +139,9 @@ static int can_dump_inet_sk(const struct inet_sk_desc *sk)
>  		return 1;
>  	}
>  
> -	if (sk->type != SOCK_STREAM) {
> +	if (sk->type != SOCK_STREAM && sk->type != SOCK_RAW) {
>  		pr_err("Can't dump %d inet socket %x. "
> -				"Only can stream and dgram.\n",
> +				"Only can stream, dgram and raw.\n",
>  				sk->type, sk->sd.ino);
>  		return 0;
>  	}
> @@ -240,12 +244,24 @@ err:
>  	return NULL;
>  }
>  
> -static int dump_ip_opts(int sk, IpOptsEntry *ioe)
> +
> +static int dump_ip_opts(int family, int type, int sk, IpOptsEntry *ioe)
>  {
>  	int ret = 0;
>  
> -	ret |= dump_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind);
> -	ioe->has_freebind = ioe->freebind;
> +	if (type == SOCK_RAW) {
> +		if (family == AF_INET6) {
> +			ret |= dump_opt(sk, SOL_IPV6, IPV6_HDRINCL, &ioe->hdrincl);
> +		} else {
> +			ret |= dump_opt(sk, SOL_IP, IP_HDRINCL, &ioe->hdrincl);
> +			ret |= dump_opt(sk, SOL_IP, IP_NODEFRAG, &ioe->nodefrag);
> +			ioe->has_nodefrag = ioe->nodefrag;
> +		}
> +		ioe->has_hdrincl = ioe->hdrincl;
> +	} else {
> +		ret |= dump_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind);
> +		ioe->has_freebind = ioe->freebind;
> +	}
>  
>  	return ret;
>  }
> @@ -275,14 +291,18 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa
>  	InetSkEntry ie = INET_SK_ENTRY__INIT;
>  	IpOptsEntry ipopts = IP_OPTS_ENTRY__INIT;
>  	SkOptsEntry skopts = SK_OPTS_ENTRY__INIT;
> -	int ret = -1, err = -1, proto;
> +	int ret = -1, err = -1, proto, type;
>  
>  	ret = do_dump_opt(lfd, SOL_SOCKET, SO_PROTOCOL,
>  					&proto, sizeof(proto));
>  	if (ret)
>  		goto err;
> +	ret = do_dump_opt(lfd, SOL_SOCKET, SO_TYPE,
> +			  &type, sizeof(type));
> +	if (ret)
> +		goto err;
>  
> -	if (!can_dump_ipproto(p->stat.st_ino, proto))
> +	if (!can_dump_ipproto(p->stat.st_ino, proto, type))
>  		goto err;
>  
>  	sk = (struct inet_sk_desc *)lookup_socket(p->stat.st_ino, family, proto);
> @@ -359,7 +379,7 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa
>  	memcpy(ie.src_addr, sk->src_addr, pb_repeated_size(&ie, src_addr));
>  	memcpy(ie.dst_addr, sk->dst_addr, pb_repeated_size(&ie, dst_addr));
>  
> -	if (dump_ip_opts(lfd, &ipopts))
> +	if (dump_ip_opts(family, sk->type, lfd, &ipopts))
>  		goto err;
>  
>  	if (dump_socket_opts(lfd, &skopts))
> @@ -376,7 +396,7 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa
>  
>  	switch (proto) {
>  	case IPPROTO_TCP:
> -		err = dump_one_tcp(lfd, sk);
> +		err = (sk->type != SOCK_RAW) ? dump_one_tcp(lfd, sk) : 0;
>  		break;
>  	default:
>  		err = 0;
> @@ -540,12 +560,18 @@ static int post_open_inet_sk(struct file_desc *d, int sk)
>  	return 0;
>  }
>  
> -int restore_ip_opts(int sk, IpOptsEntry *ioe)
> +int restore_ip_opts(int family, int sk, IpOptsEntry *ioe)
>  {
>  	int ret = 0;
>  
>  	if (ioe->has_freebind)
>  		ret |= restore_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind);
> +	if (ioe->has_nodefrag)
> +		ret |= restore_opt(sk, SOL_IP, IP_NODEFRAG, &ioe->nodefrag);
> +	if (ioe->has_hdrincl)
> +		ret |= restore_opt(sk, family == AF_INET6 ? SOL_IPV6 : SOL_IP,
> +				   family == AF_INET6 ? IPV6_HDRINCL : IP_HDRINCL,
> +				   &ioe->hdrincl);
>  
>  	return ret;
>  }
> @@ -565,7 +591,7 @@ static int open_inet_sk(struct file_desc *d)
>  		return -1;
>  	}
>  
> -	if ((ie->type != SOCK_STREAM) && (ie->type != SOCK_DGRAM)) {
> +	if ((ie->type != SOCK_STREAM) && (ie->type != SOCK_DGRAM) && (ie->type != SOCK_RAW)) {
>  		pr_err("Unsupported socket type: %d\n", ie->type);
>  		return -1;
>  	}
> @@ -641,7 +667,7 @@ done:
>  	if (rst_file_params(sk, ie->fown, ie->flags))
>  		goto err;
>  
> -	if (ie->ip_opts && restore_ip_opts(sk, ie->ip_opts))
> +	if (ie->ip_opts && restore_ip_opts(ie->family, sk, ie->ip_opts))
>  		goto err;
>  
>  	if (restore_socket_opts(sk, ie->opts))
> @@ -713,7 +739,7 @@ int inet_bind(int sk, struct inet_sk_info *ii)
>  	 * sockets could not be bound to them in this moment
>  	 * without setting IP_FREEBIND.
>  	 */
> -	if (ii->ie->family == AF_INET6) {
> +	if (ii->ie->family == AF_INET6 && ii->ie->proto != IPPROTO_RAW) {
>  		int yes = 1;
>  
>  		if (restore_opt(sk, SOL_IP, IP_FREEBIND, &yes))
> diff --git a/criu/sockets.c b/criu/sockets.c
> index b5c03fdf2984..57fa4c16a646 100644
> --- a/criu/sockets.c
> +++ b/criu/sockets.c
> @@ -57,9 +57,11 @@ enum socket_cl_bits
>  	INET_TCP_CL_BIT,
>  	INET_UDP_CL_BIT,
>  	INET_UDPLITE_CL_BIT,
> +	INET_RAW_CL_BIT,
>  	INET6_TCP_CL_BIT,
>  	INET6_UDP_CL_BIT,
>  	INET6_UDPLITE_CL_BIT,
> +	INET6_RAW_CL_BIT,
>  	UNIX_CL_BIT,
>  	PACKET_CL_BIT,
>  	_MAX_CL_BIT,
> @@ -85,6 +87,8 @@ enum socket_cl_bits get_collect_bit_nr(unsigned int family, unsigned int proto)
>  			return INET_UDP_CL_BIT;
>  		if (proto == IPPROTO_UDPLITE)
>  			return INET_UDPLITE_CL_BIT;
> +		if (proto == IPPROTO_RAW)
> +			return INET_RAW_CL_BIT;
>  	}
>  	if (family == AF_INET6) {
>  		if (proto == IPPROTO_TCP)
> @@ -93,6 +97,8 @@ enum socket_cl_bits get_collect_bit_nr(unsigned int family, unsigned int proto)
>  			return INET6_UDP_CL_BIT;
>  		if (proto == IPPROTO_UDPLITE)
>  			return INET6_UDPLITE_CL_BIT;
> +		if (proto == IPPROTO_RAW)
> +			return INET6_RAW_CL_BIT;
>  	}
>  
>  	pr_err("Unknown pair family %d proto %d\n", family, proto);
> @@ -593,6 +599,9 @@ static int inet_receive_one(struct nlmsghdr *h, void *arg)
>  	case IPPROTO_TCP:
>  		type = SOCK_STREAM;
>  		break;
> +	case IPPROTO_RAW:
> +		type = SOCK_RAW;
> +		break;
>  	case IPPROTO_UDP:
>  	case IPPROTO_UDPLITE:
>  		type = SOCK_DGRAM;
> @@ -614,6 +623,14 @@ static int do_collect_req(int nl, struct sock_diag_req *req, int size,
>  
>  	if (tmp == 0)
>  		set_collect_bit(req->r.n.sdiag_family, req->r.n.sdiag_protocol);
> +	else if (tmp == -ENOENT &&
> +		 ((req->r.n.sdiag_family == AF_INET ||
> +		   req->r.n.sdiag_family == AF_INET6) &&
> +		  req->r.n.sdiag_protocol == IPPROTO_RAW)) {
> +		pr_warn("No support for DIAG module on family %s with protocol IPPROTO_RAW, may fail later\n",
> +			req->r.n.sdiag_family == AF_INET ? "IPv4" : "IPv6");
> +		tmp = 0;
> +	}
>  
>  	return tmp;
>  }
> @@ -668,6 +685,15 @@ int collect_sockets(struct ns_id *ns)
>  	if (tmp)
>  		err = tmp;
>  
> +	/* Collect IPv4 RAW sockets */
> +	req.r.i.sdiag_family	= AF_INET;
> +	req.r.i.sdiag_protocol	= IPPROTO_RAW;
> +	req.r.i.idiag_ext	= 0;
> +	req.r.i.idiag_states	= -1; /* All */
> +	tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, &req.r.i);
> +	if (tmp)
> +		err = tmp;
> +
>  	/* Collect IPv6 TCP sockets */
>  	req.r.i.sdiag_family	= AF_INET6;
>  	req.r.i.sdiag_protocol	= IPPROTO_TCP;
> @@ -696,6 +722,15 @@ int collect_sockets(struct ns_id *ns)
>  	if (tmp)
>  		err = tmp;
>  
> +	/* Collect IPv6 RAW sockets */
> +	req.r.i.sdiag_family	= AF_INET6;
> +	req.r.i.sdiag_protocol	= IPPROTO_RAW;
> +	req.r.i.idiag_ext	= 0;
> +	req.r.i.idiag_states	= -1; /* All */
> +	tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, &req.r.i);
> +	if (tmp)
> +		err = tmp;
> +
>  	req.r.p.sdiag_family	= AF_PACKET;
>  	req.r.p.sdiag_protocol	= 0;
>  	req.r.p.pdiag_show	= PACKET_SHOW_INFO | PACKET_SHOW_MCLIST |
> diff --git a/images/sk-inet.proto b/images/sk-inet.proto
> index 01dda875a247..6c5b8df585e7 100644
> --- a/images/sk-inet.proto
> +++ b/images/sk-inet.proto
> @@ -6,6 +6,8 @@ import "sk-opts.proto";
>  
>  message ip_opts_entry {
>  	optional bool		freebind	= 1;
> +	optional bool		hdrincl		= 2;
> +	optional bool		nodefrag	= 3;
>  }
>  
>  message inet_sk_entry {
> 



More information about the CRIU mailing list