[CRIU] [PATCH 09/21] soccr: add support for half-closed sockets

Andrei Vagin avagin at openvz.org
Thu Dec 1 00:32:27 PST 2016


From: Andrei Vagin <avagin at virtuozzo.com>

A socket is in one of the half-closed states if it has sent a fin packet
or has received a fin packet.

CRIU plays with fin packets to restore half-closed states too.

When we need to send a fin packet from a socket, we can call
shutdown(SHUT_WR). When a fin packet has to be restored in
a receive queue, criu generates a fin packet and sends it via
a raw ip socket.

The raw packet is sent with the SOCCR_MARK mark so that it can be
exempted from blocking.

v2: remove the SOCCR_FLAGS_ACKED_FIN flag
    introduce sets of bits for different actions with fin packets

Signed-off-by: Andrei Vagin <avagin at virtuozzo.com>
---
 criu/Makefile.packages |   2 +-
 soccr/soccr.c          | 237 ++++++++++++++++++++++++++++++++++++++++++++++++-
 soccr/soccr.h          |   3 +
 3 files changed, 238 insertions(+), 4 deletions(-)

diff --git a/criu/Makefile.packages b/criu/Makefile.packages
index 53fbdae..886394f 100644
--- a/criu/Makefile.packages
+++ b/criu/Makefile.packages
@@ -19,7 +19,7 @@ REQ-DEB-PKG-NAMES	+= libcap-dev
 
 REQ-DEB-PKG-TEST-NAMES  += libaio-dev
 
-export LIBS		+= -lrt -lpthread -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/
+export LIBS		+= -lrt -lpthread -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/ -lnet
 
 check-packages-failed:
 	$(warning Can not find some of the required libraries)
diff --git a/soccr/soccr.c b/soccr/soccr.c
index 4b6c227..dce3151 100644
--- a/soccr/soccr.c
+++ b/soccr/soccr.c
@@ -4,6 +4,9 @@
 #include <sys/ioctl.h>
 #include <errno.h>
 #include <linux/sockios.h>
+#include <libnet.h>
+#include <assert.h>
+
 #include "soccr.h"
 
 #ifndef SIOCOUTQNSD
@@ -11,6 +14,57 @@
 #define SIOCOUTQNSD     0x894B
 #endif
 
+enum {
+        TCPF_ESTABLISHED = (1 << 1),
+        TCPF_SYN_SENT    = (1 << 2),
+        TCPF_SYN_RECV    = (1 << 3),
+        TCPF_FIN_WAIT1   = (1 << 4),
+        TCPF_FIN_WAIT2   = (1 << 5),
+        TCPF_TIME_WAIT   = (1 << 6),
+        TCPF_CLOSE       = (1 << 7),
+        TCPF_CLOSE_WAIT  = (1 << 8),
+        TCPF_LAST_ACK    = (1 << 9),
+        TCPF_LISTEN      = (1 << 10),
+        TCPF_CLOSING     = (1 << 11),
+};
+
+/*
+ * The TCP transition diagram for half closed connections
+ *
+ * ------------
+ * FIN_WAIT1	\ FIN
+ *			---------
+ *		/ ACK   CLOSE_WAIT
+ * -----------
+ * FIN_WAIT2
+ *			----------
+ *		/ FIN   LAST_ACK
+ * -----------
+ * TIME_WAIT	\ ACK
+ *			----------
+ *			CLOSED
+ *
+ * How to get the TCP_CLOSING state
+ *
+ * -----------		----------
+ * FIN_WAIT1	\/ FIN	FIN_WAIT1
+ * -----------		----------
+ *  CLOSING		CLOSING
+ *		\/ ACK
+ * -----------		----------
+ *  TIME_WAIT		TIME_WAIT
+ */
+
+/* Restore a fin packet in a send queue first */
+#define SNDQ_FIRST_FIN	(TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING)
+/* Restore a fin in the send queue after restoring a fin in the receive queue. */
+#define SNDQ_SECOND_FIN (TCPF_LAST_ACK | TCPF_CLOSE)
+#define SNDQ_FIN_ACKED	(TCPF_FIN_WAIT2 | TCPF_CLOSE)
+
+#define RCVQ_FIRST_FIN	(TCPF_CLOSE_WAIT | TCPF_LAST_ACK | TCPF_CLOSE)
+#define RCVQ_SECOND_FIN (TCPF_CLOSING)
+#define RCVQ_FIN_ACKED	(TCPF_CLOSE)
+
 static void (*log)(unsigned int loglevel, const char *format, ...)
 	__attribute__ ((__format__ (__printf__, 2, 3)));
 static unsigned int log_level = 0;
@@ -90,6 +144,11 @@ static int refresh_sk(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, str
 
 	switch (ti->tcpi_state) {
 	case TCP_ESTABLISHED:
+	case TCP_FIN_WAIT1:
+	case TCP_FIN_WAIT2:
+	case TCP_LAST_ACK:
+	case TCP_CLOSE_WAIT:
+	case TCP_CLOSING:
 	case TCP_CLOSE:
 		break;
 	default:
@@ -97,7 +156,7 @@ static int refresh_sk(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, str
 		return -1;
 	}
 
-	data->state = TCP_ESTABLISHED;
+	data->state = ti->tcpi_state;
 
 	if (ioctl(sk->fd, SIOCOUTQ, &size) == -1) {
 		logerr("Unable to get size of snd queue");
@@ -113,6 +172,13 @@ static int refresh_sk(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, str
 
 	data->unsq_len = size;
 
+	/* Don't account for the fin packet. It doesn't contain real data. */
+	if ((1 << data->state) & (SNDQ_FIRST_FIN | SNDQ_SECOND_FIN)) {
+		if (data->outq_len)
+			data->outq_len--;
+		data->unsq_len = data->unsq_len ? data->unsq_len - 1 : 0;
+	}
+
 	if (ioctl(sk->fd, SIOCINQ, &size) == -1) {
 		logerr("Unable to get size of recv queue");
 		return -1;
@@ -330,6 +396,7 @@ static int set_queue_seq(struct libsoccr_sk *sk, int queue, __u32 seq)
 int libsoccr_set_sk_data_noq(struct libsoccr_sk *sk,
 		struct libsoccr_sk_data *data, unsigned data_size)
 {
+	int mstate = 1 << data->state;
 	struct tcp_repair_opt opts[4];
 	int addr_size;
 	int onr = 0;
@@ -337,9 +404,16 @@ int libsoccr_set_sk_data_noq(struct libsoccr_sk *sk,
 	if (!data || data_size < SOCR_DATA_MIN_SIZE)
 		return -1;
 
-	if (data->state != TCP_ESTABLISHED)
+	if (data->state == TCP_LISTEN)
 		return -1;
 
+	if (mstate & (RCVQ_FIRST_FIN | RCVQ_SECOND_FIN))
+		data->inq_seq--;
+
+	/* outq_seq is adjusted because the fin packet is not accounted for */
+	if (mstate & (SNDQ_FIRST_FIN | SNDQ_SECOND_FIN))
+		data->outq_seq--;
+
 	if (set_queue_seq(sk, TCP_RECV_QUEUE,
 				data->inq_seq - data->inq_len))
 		return -2;
@@ -403,9 +477,135 @@ int libsoccr_set_sk_data_noq(struct libsoccr_sk *sk,
 	return 0;
 }
 
+static int send_fin(struct libsoccr_sk_data *data, unsigned data_size, uint8_t flags)
+{
+	int ret, exit_code = -1;
+	char errbuf[LIBNET_ERRBUF_SIZE];
+	int mark = SOCCR_MARK;;
+	int libnet_type;
+	libnet_t *l;
+
+	if (data->dst_addr.sa.sa_family == AF_INET6)
+		libnet_type = LIBNET_RAW6;
+	else
+		libnet_type = LIBNET_RAW4;
+
+	l = libnet_init(
+		libnet_type,                            /* injection type */
+		NULL,                                   /* network interface */
+		errbuf);                                /* errbuf */
+	if (l == NULL)
+		return -1;
+
+	if (setsockopt(l->fd, SOL_SOCKET, SO_MARK, &mark, sizeof(mark)))
+		goto err;
+
+	ret = libnet_build_tcp(
+		ntohs(data->dst_addr.v4.sin_port),		/* source port */
+		ntohs(data->src_addr.v4.sin_port),		/* destination port */
+		data->inq_seq,			/* sequence number */
+		data->outq_seq - data->outq_len,	/* acknowledgement num */
+		flags,				/* control flags */
+		data->rcv_wnd,			/* window size */
+		0,				/* checksum */
+		10,				/* urgent pointer */
+		LIBNET_TCP_H + 20,		/* TCP packet size */
+		NULL,				/* payload */
+		0,				/* payload size */
+		l,				/* libnet handle */
+		0);				/* libnet id */
+	if (ret == -1) {
+		loge("Can't build TCP header: %s\n", libnet_geterror(l));
+		goto err;
+	}
+
+	if (data->dst_addr.sa.sa_family == AF_INET6) {
+		struct libnet_in6_addr src, dst;
+
+		memcpy(&dst, &data->dst_addr.v6.sin6_addr, sizeof(dst));
+		memcpy(&src, &data->src_addr.v6.sin6_addr, sizeof(src));
+
+		ret = libnet_build_ipv6(
+			0, 0,
+			LIBNET_TCP_H,	/* length */
+			IPPROTO_TCP,	/* protocol */
+			64,		/* hop limit */
+			dst,		/* source IP */
+			src,		/* destination IP */
+			NULL,		/* payload */
+			0,		/* payload size */
+			l,		/* libnet handle */
+			0);		/* libnet id */
+	} else if (data->dst_addr.sa.sa_family == AF_INET)
+		ret = libnet_build_ipv4(
+			LIBNET_IPV4_H + LIBNET_TCP_H + 20,	/* length */
+			0,			/* TOS */
+			242,			/* IP ID */
+			0,			/* IP Frag */
+			64,			/* TTL */
+			IPPROTO_TCP,		/* protocol */
+			0,			/* checksum */
+			data->dst_addr.v4.sin_addr.s_addr,	/* source IP */
+			data->src_addr.v4.sin_addr.s_addr,	/* destination IP */
+			NULL,			/* payload */
+			0,			/* payload size */
+			l,			/* libnet handle */
+			0);			/* libnet id */
+	else {
+		loge("Unknown socket family");
+		goto err;
+	}
+	if (ret == -1) {
+		loge("Can't build IP header: %s\n", libnet_geterror(l));
+		goto err;
+	}
+
+	ret = libnet_write(l);
+	if (ret == -1) {
+		loge("Unable to send a fin packet: %s", libnet_geterror(l));
+		goto err;
+	}
+
+	exit_code = 0;
+err:
+	libnet_destroy(l);
+	return exit_code;
+}
+
+static int restore_fin_in_snd_queue(int sk, int acked)
+{
+	int queue = TCP_SEND_QUEUE;
+	int ret;
+
+	/*
+	 * If TCP_SEND_QUEUE is set, a fin packet will be
+	 * restored as a sent packet.
+	 */
+	if (acked &&
+	    setsockopt(sk, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue)) < 0) {
+		logerr("Can't set repair queue");
+		return -1;
+	}
+
+	ret = shutdown(sk, SHUT_WR);
+	if (ret < 0)
+		logerr("Unable to shut down a socket");
+
+	queue = TCP_NO_QUEUE;
+	if (acked &&
+	    setsockopt(sk, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue)) < 0) {
+		logerr("Can't set repair queue");
+		return -1;
+	}
+
+	return ret;
+}
+
 int libsoccr_set_sk_data(struct libsoccr_sk *sk,
 		struct libsoccr_sk_data *data, unsigned data_size)
 {
+	int mstate = 1 << data->state;
+
 	if (data->flags & SOCCR_FLAGS_WINDOW) {
 		struct tcp_repair_window wopt = {
 			.snd_wl1 = data->snd_wl1,
@@ -414,13 +614,44 @@ int libsoccr_set_sk_data(struct libsoccr_sk *sk,
 			.rcv_wnd = data->rcv_wnd,
 			.rcv_wup = data->rcv_wup,
 		};
-	
+
+		if (mstate & (RCVQ_FIRST_FIN | RCVQ_SECOND_FIN)) {
+			wopt.rcv_wup--;
+			wopt.rcv_wnd++;
+		}
+
 		if (setsockopt(sk->fd, SOL_TCP, TCP_REPAIR_WINDOW, &wopt, sizeof(wopt))) {
 			logerr("Unable to set window parameters");
 			return -1;
 		}
 	}
 
+	/*
+	 * To restore a half-closed socket, fin packets have to be restored in
+	 * the recv and send queues. Here shutdown() is used to restore a fin
+	 * packet in the send queue, and a fake fin packet is sent to restore it
+	 * in the recv queue.
+	 */
+	if (mstate & SNDQ_FIRST_FIN)
+		restore_fin_in_snd_queue(sk->fd, mstate & SNDQ_FIN_ACKED);
+
+	/* Send a fin packet to the socket to restore it in a receive queue. */
+	if (mstate & (RCVQ_FIRST_FIN | RCVQ_SECOND_FIN))
+		if (send_fin(data, data_size, TH_ACK | TH_FIN) < 0)
+			return -1;
+
+	if (mstate & SNDQ_SECOND_FIN)
+		restore_fin_in_snd_queue(sk->fd, mstate & SNDQ_FIN_ACKED);
+
+	if (mstate & RCVQ_FIN_ACKED)
+		data->inq_seq++;
+
+	if (mstate & SNDQ_FIN_ACKED) {
+		data->outq_seq++;
+		if (send_fin(data, data_size, TH_ACK) < 0)
+			return -1;
+	}
+
 	return 0;
 }
 
diff --git a/soccr/soccr.h b/soccr/soccr.h
index 789f514..ba12052 100644
--- a/soccr/soccr.h
+++ b/soccr/soccr.h
@@ -6,6 +6,9 @@
 
 #include "config.h"
 
+/* Packets carrying this mark must not be blocked. */
+#define SOCCR_MARK 0xC114
+
 #ifndef CONFIG_HAS_TCP_REPAIR_WINDOW
 struct tcp_repair_window {
 	uint32_t   snd_wl1;
-- 
2.7.4



More information about the CRIU mailing list