[CRIU] [PATCH 09/21] soccr: add support for half-closed sockets
Pavel Emelyanov
xemul at virtuozzo.com
Wed Nov 30 01:24:52 PST 2016
On 11/28/2016 10:27 PM, Andrei Vagin wrote:
> From: Andrei Vagin <avagin at virtuozzo.com>
>
> A socket is in one of the half-closed states if it has sent a fin packet
> or has received a fin packet.
>
> CRIU plays with fin packets to restore half-closed states too.
>
> When we need to send a fin packet from a socket, we can call
> shutdown(SHUT_WR). When a fin packet has to be restored in
> a receive queue, criu generates a fin packet and sends it via
> a raw ip socket.
>
> A raw packet is sent with the SOCCR_MARK mark so that it is
> possible to avoid blocking it.
>
> Signed-off-by: Andrei Vagin <avagin at virtuozzo.com>
> ---
> criu/Makefile.packages | 2 +-
> soccr/soccr.c | 177 ++++++++++++++++++++++++++++++++++++++++++++++++-
> soccr/soccr.h | 11 ++-
> 3 files changed, 185 insertions(+), 5 deletions(-)
>
> diff --git a/criu/Makefile.packages b/criu/Makefile.packages
> index 53fbdae..886394f 100644
> --- a/criu/Makefile.packages
> +++ b/criu/Makefile.packages
> @@ -19,7 +19,7 @@ REQ-DEB-PKG-NAMES += libcap-dev
>
> REQ-DEB-PKG-TEST-NAMES += libaio-dev
>
> -export LIBS += -lrt -lpthread -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/
> +export LIBS += -lrt -lpthread -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/ -lnet
>
> check-packages-failed:
> $(warning Can not find some of the required libraries)
> diff --git a/soccr/soccr.c b/soccr/soccr.c
> index 31cb870..211b9e6 100644
> --- a/soccr/soccr.c
> +++ b/soccr/soccr.c
> @@ -4,6 +4,9 @@
> #include <sys/ioctl.h>
> #include <errno.h>
> #include <linux/sockios.h>
> +#include <libnet.h>
> +#include <assert.h>
> +
> #include "soccr.h"
>
> #ifndef SIOCOUTQNSD
> @@ -11,6 +14,20 @@
> #define SIOCOUTQNSD 0x894B
> #endif
>
> +enum {
> + TCPF_ESTABLISHED = (1 << 1),
> + TCPF_SYN_SENT = (1 << 2),
> + TCPF_SYN_RECV = (1 << 3),
> + TCPF_FIN_WAIT1 = (1 << 4),
> + TCPF_FIN_WAIT2 = (1 << 5),
> + TCPF_TIME_WAIT = (1 << 6),
> + TCPF_CLOSE = (1 << 7),
> + TCPF_CLOSE_WAIT = (1 << 8),
> + TCPF_LAST_ACK = (1 << 9),
> + TCPF_LISTEN = (1 << 10),
> + TCPF_CLOSING = (1 << 11),
> +};
> +
> static void (*log)(unsigned int loglevel, const char *format, ...)
> __attribute__ ((__format__ (__printf__, 2, 3)));
> static unsigned int log_level = 0;
> @@ -90,6 +107,11 @@ static int refresh_sk(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, str
>
> switch (ti->tcpi_state) {
> case TCP_ESTABLISHED:
> + case TCP_FIN_WAIT1:
> + case TCP_FIN_WAIT2:
> + case TCP_LAST_ACK:
> + case TCP_CLOSE_WAIT:
> + case TCP_CLOSING:
> case TCP_CLOSE:
> break;
> default:
> @@ -97,7 +119,7 @@ static int refresh_sk(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, str
> return -1;
> }
>
> - data->state = TCP_ESTABLISHED;
> + data->state = ti->tcpi_state;
>
> if (ioctl(sk->fd, SIOCOUTQ, &size) == -1) {
> logerr("Unable to get size of snd queue");
> @@ -113,6 +135,17 @@ static int refresh_sk(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, str
>
> data->unsq_len = size;
>
> + /* Don't account the fin packet. It doesn't countain real data. */
> + if ((1 << data->state) & (TCPF_FIN_WAIT1 | TCPF_LAST_ACK |
> + TCPF_CLOSING | TCPF_CLOSE)) {
> + if (data->outq_len)
> + data->outq_len--;
> + else
> + data->flags |= SOCCR_FLAGS_ACKED_FIN;
> + data->unsq_len = data->unsq_len ? data->unsq_len - 1 : 0;
> + }
> +
> +
> if (ioctl(sk->fd, SIOCINQ, &size) == -1) {
> logerr("Unable to get size of recv queue");
> return -1;
> @@ -330,6 +363,7 @@ static int set_queue_seq(struct libsoccr_sk *sk, int queue, __u32 seq)
> int libsoccr_set_sk_data_noq(struct libsoccr_sk *sk,
> struct libsoccr_sk_data *data, unsigned data_size)
> {
> + int mstate = 1 << data->state;
> struct tcp_repair_opt opts[4];
> int addr_size;
> int onr = 0;
> @@ -337,9 +371,17 @@ int libsoccr_set_sk_data_noq(struct libsoccr_sk *sk,
> if (!data || data_size < SOCR_DATA_MIN_SIZE)
> return -1;
>
> - if (data->state != TCP_ESTABLISHED)
> + if (data->state == TCP_LISTEN)
> return -1;
>
> + if (mstate & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK | TCPF_CLOSE | TCPF_CLOSING))
I believe it's worth introducing sets of bits for particular actions
and using them in the corresponding places on dump and restore.
> + data->inq_seq--;
> +
> + /* outq_seq is adjusted due to not accointing the fin packet */
> + if (mstate & (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 |
> + TCPF_LAST_ACK | TCPF_CLOSING | TCPF_CLOSE))
> + data->outq_seq--;
> +
> if (set_queue_seq(sk, TCP_RECV_QUEUE,
> data->inq_seq - data->inq_len))
> return -2;
> @@ -403,9 +445,106 @@ int libsoccr_set_sk_data_noq(struct libsoccr_sk *sk,
> return 0;
> }
>
> +static int send_fin(int sk, struct libsoccr_sk_data *data, unsigned data_size, uint8_t flags)
> +{
> + int ret, exit_code = -1;
> + char errbuf[LIBNET_ERRBUF_SIZE];
> + int mark = SOCCR_MARK;;
> + int libnet_type;
> + libnet_t *l;
> +
> + if (data->dst_addr.sa.sa_family == AF_INET6)
> + libnet_type = LIBNET_RAW6;
> + else
> + libnet_type = LIBNET_RAW4;
> +
> + l = libnet_init(
> + libnet_type, /* injection type */
> + NULL, /* network interface */
> + errbuf); /* errbuf */
> + if (l == NULL)
> + return -1;
> +
> + if (setsockopt(l->fd, SOL_SOCKET, SO_MARK, &mark, sizeof(mark)))
> + goto err;
> +
> + ret = libnet_build_tcp(
> + ntohs(data->dst_addr.sin.sin_port), /* source port */
> + ntohs(data->src_addr.sin.sin_port), /* destination port */
> + data->inq_seq, /* sequence number */
> + data->outq_seq - data->outq_len, /* acknowledgement num */
> + flags, /* control flags */
> + data->rcv_wnd, /* window size */
> + 0, /* checksum */
> + 10, /* urgent pointer */
> + LIBNET_TCP_H + 20, /* TCP packet size */
> + NULL, /* payload */
> + 0, /* payload size */
> + l, /* libnet handle */
> + 0); /* libnet id */
> + if (ret == -1) {
> + loge("Can't build TCP header: %s\n", libnet_geterror(l));
> + goto err;
> + }
> +
> + if (data->dst_addr.sa.sa_family == AF_INET6) {
> + struct libnet_in6_addr src, dst;
> +
> + memcpy(&dst, &data->dst_addr.sin6.sin6_addr, sizeof(dst));
> + memcpy(&src, &data->src_addr.sin6.sin6_addr, sizeof(src));
> +
> + ret = libnet_build_ipv6(
> + 0, 0,
> + LIBNET_TCP_H, /* length */
> + IPPROTO_TCP, /* protocol */
> + 64, /* hop limit */
> + dst, /* source IP */
> + src, /* destination IP */
> + NULL, /* payload */
> + 0, /* payload size */
> + l, /* libnet handle */
> + 0); /* libnet id */
> + } else if (data->dst_addr.sa.sa_family == AF_INET)
> + ret = libnet_build_ipv4(
> + LIBNET_IPV4_H + LIBNET_TCP_H + 20, /* length */
> + 0, /* TOS */
> + 242, /* IP ID */
> + 0, /* IP Frag */
> + 64, /* TTL */
> + IPPROTO_TCP, /* protocol */
> + 0, /* checksum */
> + data->dst_addr.sin.sin_addr.s_addr, /* source IP */
> + data->src_addr.sin.sin_addr.s_addr, /* destination IP */
> + NULL, /* payload */
> + 0, /* payload size */
> + l, /* libnet handle */
> + 0); /* libnet id */
> + else {
> + loge("Unknown socket family");
> + goto err;
> + }
> + if (ret == -1) {
> + loge("Can't build IP header: %s\n", libnet_geterror(l));
> + goto err;
> + }
> +
> + ret = libnet_write(l);
> + if (ret == -1) {
> + loge("Unable to send a fin packet: %s", libnet_geterror(l));
> + goto err;
> + }
> +
> + exit_code = 0;
> +err:
> + libnet_destroy(l);
> + return exit_code;
> +}
> +
> int libsoccr_set_sk_data(struct libsoccr_sk *sk,
> struct libsoccr_sk_data *data, unsigned data_size)
> {
> + int mstate = 1 << data->state;
> +
> if (data->flags & SOCCR_FLAGS_WINDOW) {
> struct tcp_repair_window wopt = {
> .snd_wl1 = data->snd_wl1,
> @@ -414,13 +553,45 @@ int libsoccr_set_sk_data(struct libsoccr_sk *sk,
> .rcv_wnd = data->rcv_wnd,
> .rcv_wup = data->rcv_wup,
> };
> -
> +
> + if (mstate & (TCPF_CLOSE_WAIT |
> + TCPF_LAST_ACK | TCPF_CLOSE | TCPF_CLOSING)) {
> + wopt.rcv_wup--;
> + wopt.rcv_wnd++;
> + }
> +
> if (setsockopt(sk->fd, SOL_TCP, TCP_REPAIR_WINDOW, &wopt, sizeof(wopt))) {
> logerr("Unable to set window parameters");
> return -1;
> }
> }
>
> + /*
> + * To restore a half closed sockets, fin packets has to be restored in
> + * recv and send queues. Here shutdown() is used to restore a fin
> + * packet in the send queue and a fake fin packet is send to restore it
> + * in the recv queue.
> + */
> + if (data->state == TCP_CLOSING)
Can you make it symmetrical to the checks below and test mstate & TCPF_CLOSING?
> + shutdown(sk->fd, SHUT_WR);
> +
> + /* Send a fin packet to the socket to restore it in a receive queue. */
> + if (mstate & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK | TCPF_CLOSE | TCPF_CLOSING)) {
> + if (send_fin(sk->fd, data, data_size, TH_ACK | TH_FIN) < 0)
> + return -1;
> + data->inq_seq++;
> + }
> +
> + if (mstate & (TCPF_LAST_ACK | TCPF_FIN_WAIT1 |
> + TCPF_FIN_WAIT2 | TCPF_CLOSE))
> + shutdown(sk->fd, SHUT_WR);
> +
> + if ((mstate & (TCPF_CLOSE)) && (data->flags & SOCCR_FLAGS_ACKED_FIN)) {
> + data->outq_seq++;
> + if (send_fin(sk->fd, data, data_size, TH_ACK) < 0)
> + return -1;
> + }
> +
> return 0;
> }
>
> diff --git a/soccr/soccr.h b/soccr/soccr.h
> index 8bd0377..ca0d0b3 100644
> --- a/soccr/soccr.h
> +++ b/soccr/soccr.h
> @@ -6,6 +6,9 @@
>
> #include "config.h"
>
> +/* All packets with this mark have not to be blocked. */
> +#define SOCCR_MARK 0xC114
> +
> #ifndef CONFIG_HAS_TCP_REPAIR_WINDOW
> struct tcp_repair_window {
> uint32_t snd_wl1;
> @@ -82,7 +85,8 @@ struct libsoccr_sk_data {
> __u32 timestamp;
>
> __u32 flags; /* SOCCR_FLAGS_... below */
> - __u32 snd_wl1;
> +
> + __u32 snd_wl1; /* SOCCR_FLAGS_WINDOW */
> __u32 snd_wnd;
> __u32 max_window;
> __u32 rcv_wnd;
> @@ -109,6 +113,11 @@ struct libsoccr_sk_data {
> #define SOCCR_FLAGS_WINDOW 0x1
>
> /*
> + * The outgoing fin was acked.
> + */
> +#define SOCCR_FLAGS_ACKED_FIN 0x2
The flags are meant to describe which parts of libsoccr_sk_data are present
and which are not, not to describe details of the fields' contents.
Can we replace this flag with the state check on restore?
> +
> +/*
> * These two calls pause and resume the socket for and after C/R
> * The first one returns an opaque handle that is to be used by all
> * the subsequent calls.
>
More information about the CRIU
mailing list