[Devel] [PATCH 3/3] Add AF_INET c/r support (v2)
Dan Smith
danms at us.ibm.com
Wed Jul 1 11:20:14 PDT 2009
This patch adds AF_INET c/r support based on the framework established in
my AF_UNIX patch. I've tested it by checkpointing a single app with a
pair of sockets connected over loopback.
A couple points about the operation:
1. In order to properly hook up the established sockets with the matching
listening parent socket, I added a new list to the ckpt_ctx and run the
parent attachment in the deferqueue at the end of the restart process.
2. I don't do anything to redirect or freeze traffic flowing to or from the
remote system (to prevent a RST from breaking things). I expect that
userspace will bring down a veth device or freeze traffic to the remote
system to handle this case.
Changes in v2:
- Check for data in the TCP out-of-order queue and fail if present
- Fix a logic issue in sock_add_parent()
- Add comment about holding a reference to sock for parent list
- Write error messages to checkpoint stream where appropriate
- Fix up checking of some return values in restart phase
- Fix up restart logic to restore socket info for all states
- Avoid running sk_proto->hash() for non-TCP sockets
- Fix calling bind() for unconnected (i.e. DGRAM) sockets
- Change 'in' to 'inet' in structure and function names
Cc: Oren Laaden <orenl at cs.columbia.edu>
Cc: Alexey Dobriyan <adobriyan at gmail.com>
Signed-off-by: Dan Smith <danms at us.ibm.com>
---
checkpoint/sys.c | 2 +
include/linux/checkpoint_hdr.h | 1 +
include/linux/checkpoint_types.h | 2 +
include/linux/socket.h | 95 +++++++++++
net/checkpoint.c | 319 +++++++++++++++++++++++++++++++++++++-
5 files changed, 414 insertions(+), 5 deletions(-)
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 38a5299..b6f18ea 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -242,6 +242,8 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
INIT_LIST_HEAD(&ctx->pgarr_pool);
init_waitqueue_head(&ctx->waitq);
+ INIT_LIST_HEAD(&ctx->listen_sockets);
+
err = -EBADF;
ctx->file = fget(fd);
if (!ctx->file)
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index f59b071..16e21ee 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -93,6 +93,7 @@ enum {
CKPT_HDR_SOCKET_BUFFERS,
CKPT_HDR_SOCKET_BUFFER,
CKPT_HDR_SOCKET_UNIX,
+ CKPT_HDR_SOCKET_INET,
CKPT_HDR_TAIL = 9001,
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index 27fbe26..d7db190 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -60,6 +60,8 @@ struct ckpt_ctx {
struct list_head pgarr_list; /* page array to dump VMA contents */
struct list_head pgarr_pool; /* pool of empty page arrays chain */
+ struct list_head listen_sockets;/* listening parent sockets */
+
/* [multi-process checkpoint] */
struct task_struct **tasks_arr; /* array of all tasks [checkpoint] */
int nr_tasks; /* size of tasks array */
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 6480c47..4fe5102 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -332,6 +332,101 @@ struct ckpt_hdr_socket_unix {
__u8 linked;
} __attribute__ ((aligned(8)));
+struct ckpt_hdr_socket_inet {
+ struct ckpt_hdr h;
+
+ __u32 daddr;
+ __u32 rcv_saddr;
+ __u32 saddr;
+ __u16 dport;
+ __u16 num;
+ __u16 sport;
+ __s16 uc_ttl;
+ __u16 cmsg_flags;
+ __u16 __pad;
+
+ struct {
+ __u64 timeout;
+ __u32 ato;
+ __u32 lrcvtime;
+ __u16 last_seg_size;
+ __u16 rcv_mss;
+ __u8 pending;
+ __u8 quick;
+ __u8 pingpong;
+ __u8 blocked;
+ } icsk_ack __attribute__ ((aligned(8)));
+
+ /* FIXME: Skipped opt, tos, multicast, cork settings */
+
+ struct {
+ __u64 last_synq_overflow;
+
+ __u32 rcv_nxt;
+ __u32 copied_seq;
+ __u32 rcv_wup;
+ __u32 snd_nxt;
+ __u32 snd_una;
+ __u32 snd_sml;
+ __u32 rcv_tstamp;
+ __u32 lsndtime;
+
+ __u32 snd_wl1;
+ __u32 snd_wnd;
+ __u32 max_window;
+ __u32 mss_cache;
+ __u32 window_clamp;
+ __u32 rcv_ssthresh;
+ __u32 frto_highmark;
+
+ __u32 srtt;
+ __u32 mdev;
+ __u32 mdev_max;
+ __u32 rttvar;
+ __u32 rtt_seq;
+
+ __u32 packets_out;
+ __u32 retrans_out;
+
+ __u32 snd_up;
+ __u32 rcv_wnd;
+ __u32 write_seq;
+ __u32 pushed_seq;
+ __u32 lost_out;
+ __u32 sacked_out;
+ __u32 fackets_out;
+ __u32 tso_deferred;
+ __u32 bytes_acked;
+
+ __s32 lost_cnt_hint;
+ __u32 retransmit_high;
+
+ __u32 lost_retrans_low;
+
+ __u32 prior_ssthresh;
+ __u32 high_seq;
+
+ __u32 retrans_stamp;
+ __u32 undo_marker;
+ __s32 undo_retrans;
+ __u32 total_retrans;
+
+ __u32 urg_seq;
+ __u32 keepalive_time;
+ __u32 keepalive_intvl;
+
+ __u16 urg_data;
+ __u16 advmss;
+ __u8 frto_counter;
+ __u8 nonagle;
+
+ __u8 ecn_flags;
+ __u8 reordering;
+
+ __u8 keepalive_probes;
+ } tcp __attribute__ ((aligned(8)));
+} __attribute__ ((aligned(8)));
+
struct ckpt_hdr_socket {
struct ckpt_hdr h;
diff --git a/net/checkpoint.c b/net/checkpoint.c
index 701d26c..b3fb66d 100644
--- a/net/checkpoint.c
+++ b/net/checkpoint.c
@@ -14,11 +14,64 @@
#include <linux/file.h>
#include <net/af_unix.h>
+#include <net/tcp.h>
#include <net/tcp_states.h>
+#include <linux/tcp.h>
+#include <linux/in.h>
#include <linux/checkpoint.h>
#include <linux/checkpoint_hdr.h>
#include <linux/namei.h>
+#include <linux/deferqueue.h>
+
+struct ckpt_parent_sock {
+ struct sock *sock;
+ __u32 oref;
+ struct list_head list;
+};
+
+static int sock_add_parent(struct ckpt_ctx *ctx, struct sock *sock)
+{
+ struct ckpt_parent_sock *parent;
+ __u32 objref;
+ int new;
+
+ objref = ckpt_obj_lookup_add(ctx, sock, CKPT_OBJ_SOCK, &new);
+ if (objref < 0)
+ return objref;
+ else if (!new)
+ return 0;
+
+ parent = kmalloc(sizeof(*parent), GFP_KERNEL);
+ if (!parent)
+ return -ENOMEM;
+
+ /* The deferqueue is processed before the objhash is free()'d, thus
+ * the objhash holds a reference to sock for us
+ */
+ parent->sock = sock;
+ parent->oref = objref;
+ INIT_LIST_HEAD(&parent->list);
+
+ list_add(&parent->list, &ctx->listen_sockets);
+
+ return 0;
+}
+
+static struct sock *sock_get_parent(struct ckpt_ctx *ctx, struct sock *sock)
+{
+ struct ckpt_parent_sock *parent;
+ struct inet_sock *c = inet_sk(sock);
+
+ list_for_each_entry(parent, &ctx->listen_sockets, list) {
+ struct inet_sock *p = inet_sk(parent->sock);
+
+ if (c->sport == p->sport)
+ return parent->sock;
+ }
+
+ return NULL;
+}
/* Size of an empty struct sockaddr_un */
#define UNIX_LEN_EMPTY 2
@@ -47,17 +100,23 @@ static int sock_copy_buffers(struct sk_buff_head *from, struct sk_buff_head *to)
}
static int __sock_write_buffers(struct ckpt_ctx *ctx,
+ uint16_t family,
struct sk_buff_head *queue)
{
struct sk_buff *skb;
int ret = 0;
skb_queue_walk(queue, skb) {
- if (UNIXCB(skb).fp) {
+ if ((family == AF_UNIX) && UNIXCB(skb).fp) {
ckpt_write_err(ctx, "fd-passing is not supported");
return -EBUSY;
}
+ if (skb_shinfo(skb)->nr_frags) {
+ ckpt_write_err(ctx, "socket has fragments in-flight");
+ return -EBUSY;
+ }
+
ret = ckpt_write_obj_type(ctx, skb->data, skb->len,
CKPT_HDR_SOCKET_BUFFER);
if (ret)
@@ -67,7 +126,9 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
return 0;
}
-static int sock_write_buffers(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
+static int sock_write_buffers(struct ckpt_ctx *ctx,
+ uint16_t family,
+ struct sk_buff_head *queue)
{
struct ckpt_hdr_socket_buffer *h;
struct sk_buff_head tmpq;
@@ -87,7 +148,7 @@ static int sock_write_buffers(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
if (!ret)
- ret = __sock_write_buffers(ctx, &tmpq);
+ ret = __sock_write_buffers(ctx, family, &tmpq);
out:
ckpt_hdr_put(ctx, h);
@@ -228,6 +289,152 @@ static int sock_cptrst(struct ckpt_ctx *ctx,
return 0;
}
+static int sock_inet_tcp_cptrst(struct ckpt_ctx *ctx,
+ struct tcp_sock *sk,
+ struct ckpt_hdr_socket_inet *hh,
+ int op)
+{
+ CKPT_COPY(op, hh->tcp.rcv_nxt, sk->rcv_nxt);
+ CKPT_COPY(op, hh->tcp.copied_seq, sk->copied_seq);
+ CKPT_COPY(op, hh->tcp.rcv_wup, sk->rcv_wup);
+ CKPT_COPY(op, hh->tcp.snd_nxt, sk->snd_nxt);
+ CKPT_COPY(op, hh->tcp.snd_una, sk->snd_una);
+ CKPT_COPY(op, hh->tcp.snd_sml, sk->snd_sml);
+ CKPT_COPY(op, hh->tcp.rcv_tstamp, sk->rcv_tstamp);
+ CKPT_COPY(op, hh->tcp.lsndtime, sk->lsndtime);
+
+ CKPT_COPY(op, hh->tcp.snd_wl1, sk->snd_wl1);
+ CKPT_COPY(op, hh->tcp.snd_wnd, sk->snd_wnd);
+ CKPT_COPY(op, hh->tcp.max_window, sk->max_window);
+ CKPT_COPY(op, hh->tcp.mss_cache, sk->mss_cache);
+ CKPT_COPY(op, hh->tcp.window_clamp, sk->window_clamp);
+ CKPT_COPY(op, hh->tcp.rcv_ssthresh, sk->rcv_ssthresh);
+ CKPT_COPY(op, hh->tcp.frto_highmark, sk->frto_highmark);
+ CKPT_COPY(op, hh->tcp.advmss, sk->advmss);
+ CKPT_COPY(op, hh->tcp.frto_counter, sk->frto_counter);
+ CKPT_COPY(op, hh->tcp.nonagle, sk->nonagle);
+
+ CKPT_COPY(op, hh->tcp.srtt, sk->srtt);
+ CKPT_COPY(op, hh->tcp.mdev, sk->mdev);
+ CKPT_COPY(op, hh->tcp.mdev_max, sk->mdev_max);
+ CKPT_COPY(op, hh->tcp.rttvar, sk->rttvar);
+ CKPT_COPY(op, hh->tcp.rtt_seq, sk->rtt_seq);
+
+ CKPT_COPY(op, hh->tcp.packets_out, sk->packets_out);
+ CKPT_COPY(op, hh->tcp.retrans_out, sk->retrans_out);
+
+ CKPT_COPY(op, hh->tcp.urg_data, sk->urg_data);
+ CKPT_COPY(op, hh->tcp.ecn_flags, sk->ecn_flags);
+ CKPT_COPY(op, hh->tcp.reordering, sk->reordering);
+ CKPT_COPY(op, hh->tcp.snd_up, sk->snd_up);
+
+ CKPT_COPY(op, hh->tcp.keepalive_probes, sk->keepalive_probes);
+
+ CKPT_COPY(op, hh->tcp.rcv_wnd, sk->rcv_wnd);
+ CKPT_COPY(op, hh->tcp.write_seq, sk->write_seq);
+ CKPT_COPY(op, hh->tcp.pushed_seq, sk->pushed_seq);
+ CKPT_COPY(op, hh->tcp.lost_out, sk->lost_out);
+ CKPT_COPY(op, hh->tcp.sacked_out, sk->sacked_out);
+ CKPT_COPY(op, hh->tcp.fackets_out, sk->fackets_out);
+ CKPT_COPY(op, hh->tcp.tso_deferred, sk->tso_deferred);
+ CKPT_COPY(op, hh->tcp.bytes_acked, sk->bytes_acked);
+
+ CKPT_COPY(op, hh->tcp.lost_cnt_hint, sk->lost_cnt_hint);
+ CKPT_COPY(op, hh->tcp.retransmit_high, sk->retransmit_high);
+
+ CKPT_COPY(op, hh->tcp.lost_retrans_low, sk->lost_retrans_low);
+
+ CKPT_COPY(op, hh->tcp.prior_ssthresh, sk->prior_ssthresh);
+ CKPT_COPY(op, hh->tcp.high_seq, sk->high_seq);
+
+ CKPT_COPY(op, hh->tcp.retrans_stamp, sk->retrans_stamp);
+ CKPT_COPY(op, hh->tcp.undo_marker, sk->undo_marker);
+ CKPT_COPY(op, hh->tcp.undo_retrans, sk->undo_retrans);
+ CKPT_COPY(op, hh->tcp.total_retrans, sk->total_retrans);
+
+ CKPT_COPY(op, hh->tcp.urg_seq, sk->urg_seq);
+ CKPT_COPY(op, hh->tcp.keepalive_time, sk->keepalive_time);
+ CKPT_COPY(op, hh->tcp.keepalive_intvl, sk->keepalive_intvl);
+
+ CKPT_COPY(op, hh->tcp.last_synq_overflow, sk->last_synq_overflow);
+
+ return 0;
+}
+
+static int sock_inet_cptrst(struct ckpt_ctx *ctx,
+ struct sock *sock,
+ struct ckpt_hdr_socket_inet *hh,
+ int op)
+{
+ struct inet_sock *sk = inet_sk(sock);
+ struct inet_connection_sock *icsk = inet_csk(sock);
+ int ret;
+
+ CKPT_COPY(op, hh->daddr, sk->daddr);
+ CKPT_COPY(op, hh->rcv_saddr, sk->rcv_saddr);
+ CKPT_COPY(op, hh->dport, sk->dport);
+ CKPT_COPY(op, hh->num, sk->num);
+ CKPT_COPY(op, hh->saddr, sk->saddr);
+ CKPT_COPY(op, hh->sport, sk->sport);
+ CKPT_COPY(op, hh->uc_ttl, sk->uc_ttl);
+ CKPT_COPY(op, hh->cmsg_flags, sk->cmsg_flags);
+
+ CKPT_COPY(op, hh->icsk_ack.pending, icsk->icsk_ack.pending);
+ CKPT_COPY(op, hh->icsk_ack.quick, icsk->icsk_ack.quick);
+ CKPT_COPY(op, hh->icsk_ack.pingpong, icsk->icsk_ack.pingpong);
+ CKPT_COPY(op, hh->icsk_ack.blocked, icsk->icsk_ack.blocked);
+ CKPT_COPY(op, hh->icsk_ack.ato, icsk->icsk_ack.ato);
+ CKPT_COPY(op, hh->icsk_ack.timeout, icsk->icsk_ack.timeout);
+ CKPT_COPY(op, hh->icsk_ack.lrcvtime, icsk->icsk_ack.lrcvtime);
+ CKPT_COPY(op,
+ hh->icsk_ack.last_seg_size, icsk->icsk_ack.last_seg_size);
+ CKPT_COPY(op, hh->icsk_ack.rcv_mss, icsk->icsk_ack.rcv_mss);
+
+ if (sock->sk_protocol == IPPROTO_TCP)
+ ret = sock_inet_tcp_cptrst(ctx, tcp_sk(sock), hh, op);
+ else if (sock->sk_protocol == IPPROTO_UDP)
+ ret = 0;
+ else {
+ ckpt_write_err(ctx, "unknown socket protocol %d",
+ sock->sk_protocol);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static int sock_inet_checkpoint(struct ckpt_ctx *ctx,
+ struct sock *sock,
+ struct ckpt_hdr_socket *h)
+{
+ int ret = -EINVAL;
+ struct ckpt_hdr_socket_inet *in;
+
+ if (sock->sk_protocol == IPPROTO_TCP) {
+ struct tcp_sock *tsock = tcp_sk(sock);
+ if (!skb_queue_empty(&tsock->out_of_order_queue)) {
+ ckpt_write_err(ctx, "TCP socket has out-of-order data");
+ return -EBUSY;
+ }
+ }
+
+ in = ckpt_hdr_get_type(ctx, sizeof(*in), CKPT_HDR_SOCKET_INET);
+ if (!in)
+ goto out;
+
+ ret = sock_inet_cptrst(ctx, sock, in, CKPT_CPT);
+ if (ret < 0)
+ goto out;
+
+ ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+ if (ret < 0)
+ goto out;
+
+ ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) in);
+ out:
+ return ret;
+}
+
int do_sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
{
struct socket *socket = file->private_data;
@@ -266,6 +473,10 @@ int do_sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
ret = sock_unix_checkpoint(ctx, sock, h);
if (ret)
goto out;
+ } else if (sock->sk_family == AF_INET) {
+ ret = sock_inet_checkpoint(ctx, sock, h);
+ if (ret)
+ goto out;
} else {
ckpt_write_err(ctx, "unsupported socket family %i",
sock->sk_family);
@@ -274,11 +485,13 @@ int do_sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
}
if (sock->sk_state != TCP_LISTEN) {
- ret = sock_write_buffers(ctx, &sock->sk_receive_queue);
+ uint16_t family = sock->sk_family;
+
+ ret = sock_write_buffers(ctx, family, &sock->sk_receive_queue);
if (ret)
goto out;
- ret = sock_write_buffers(ctx, &sock->sk_write_queue);
+ ret = sock_write_buffers(ctx, family, &sock->sk_write_queue);
if (ret)
goto out;
}
@@ -497,6 +710,99 @@ static int sock_unix_restart(struct ckpt_ctx *ctx,
return ret;
}
+struct dq_sock {
+ struct sock *sock;
+ struct ckpt_ctx *ctx;
+};
+
+static int __sock_hash_parent(void *data)
+{
+ struct dq_sock *dq = (struct dq_sock *)data;
+ struct sock *parent;
+
+ dq->sock->sk_prot->hash(dq->sock);
+
+ parent = sock_get_parent(dq->ctx, dq->sock);
+ if (parent) {
+ inet_sk(dq->sock)->num = ntohs(inet_sk(dq->sock)->sport);
+ local_bh_disable();
+ __inet_inherit_port(parent, dq->sock);
+ local_bh_enable();
+ } else {
+ inet_sk(dq->sock)->num = 0;
+ inet_hash_connect(&tcp_death_row, dq->sock);
+ inet_sk(dq->sock)->num = ntohs(inet_sk(dq->sock)->sport);
+ }
+
+ return 0;
+}
+
+static int sock_defer_hash(struct ckpt_ctx *ctx, struct sock *sock)
+{
+ struct dq_sock dq;
+
+ dq.sock = sock;
+ dq.ctx = ctx;
+
+ return deferqueue_add(ctx->deferqueue, &dq, sizeof(dq),
+ __sock_hash_parent, __sock_hash_parent);
+}
+
+static int sock_inet_restart(struct ckpt_ctx *ctx,
+ struct ckpt_hdr_socket *h,
+ struct socket *socket)
+{
+ int ret;
+ struct ckpt_hdr_socket_inet *in;
+ struct sockaddr_in *l = (struct sockaddr_in *)&h->laddr;
+
+ in = ckpt_read_obj_type(ctx, sizeof(*in), CKPT_HDR_SOCKET_INET);
+ if (IS_ERR(in))
+ return PTR_ERR(in);
+
+ /* Listening sockets and those that are closed but have a local
+ * address need to call bind()
+ */
+ if ((h->sock.state == TCP_LISTEN) ||
+ ((h->sock.state == TCP_CLOSE) && (h->laddr_len > 0))) {
+ socket->sk->sk_reuse = 2;
+ inet_sk(socket->sk)->freebind = 1;
+ ret = socket->ops->bind(socket,
+ (struct sockaddr *)l,
+ h->laddr_len);
+ if (ret < 0)
+ goto out;
+
+ if (h->sock.state == TCP_LISTEN) {
+ ret = socket->ops->listen(socket, h->sock.backlog);
+ if (ret < 0)
+ goto out;
+
+ ret = sock_add_parent(ctx, socket->sk);
+ if (ret < 0)
+ goto out;
+ }
+ } else {
+ ret = sock_cptrst(ctx, socket->sk, h, CKPT_RST);
+ if (ret)
+ goto out;
+
+ ret = sock_inet_cptrst(ctx, socket->sk, in, CKPT_RST);
+ if (ret)
+ goto out;
+
+ if ((h->sock.state == TCP_ESTABLISHED) &&
+ (h->sock.protocol == IPPROTO_TCP))
+ /* Delay hashing this sock until the end so we can
+ * hook it up with its parent (if appropriate)
+ */
+ ret = sock_defer_hash(ctx, socket->sk);
+ }
+
+ out:
+ return ret;
+ }
+
struct socket *do_sock_file_restore(struct ckpt_ctx *ctx,
struct ckpt_hdr_socket *h)
{
@@ -510,6 +816,9 @@ struct socket *do_sock_file_restore(struct ckpt_ctx *ctx,
if (h->sock_common.family == AF_UNIX) {
ret = sock_unix_restart(ctx, h, socket);
ckpt_debug("sock_unix_restart: %i\n", ret);
+ } else if (h->sock_common.family == AF_INET) {
+ ret = sock_inet_restart(ctx, h, socket);
+ ckpt_debug("sock_inet_restart: %i\n", ret);
} else {
ckpt_write_err(ctx, "unsupported family %i\n",
h->sock_common.family);
--
1.6.2.2
_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
More information about the Devel
mailing list