[Devel] [PATCH 3/7] tcp: Limit orphan sockets per-cg

Pavel Emelyanov xemul at parallels.com
Fri May 29 07:18:16 PDT 2015


The kernel limits the total number of orphan TCP sockets in the
system. One container can consume this entire limit and make other
containers suffer from TCP state machine breakage.

Thus here's a per-CT limit on the number of orphans that
doesn't affect the others. The limit is set to 1/4 of the
system limit, which should be OK for hosting workloads.

Signed-off-by: Pavel Emelyanov <xemul at parallels.com>

---
 include/net/tcp.h               | 18 +++++++++++
 include/net/tcp_memcontrol.h    |  6 ++++
 net/dccp/proto.c                |  2 +-
 net/ipv4/inet_connection_sock.c |  7 ++--
 net/ipv4/tcp.c                  |  2 +-
 net/ipv4/tcp_memcontrol.c       | 72 +++++++++++++++++++++++++++++++++++++++++
 6 files changed, 102 insertions(+), 5 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index f89f146..65af9f6 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -46,6 +46,7 @@
 
 #include <linux/seq_file.h>
 #include <linux/memcontrol.h>
+#include <net/tcp_memcontrol.h>
 
 #define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
 #define TCP_OFF(sk)	(sk->sk_sndmsg_off)
@@ -330,11 +331,28 @@ static inline bool tcp_out_of_memory(struct sock *sk)
 	return false;
 }
 
+/*
+ * Account a newly orphaned socket.  When memcg socket accounting is
+ * enabled and the socket belongs to a cgroup, charge the per-cgroup
+ * orphan counter(s) in addition to the global per-protocol counter.
+ */
+static inline void orphan_count_inc(struct sock *sk)
+{
+	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
+		cg_orphan_count_inc(sk);
+	percpu_counter_inc(sk->sk_prot->orphan_count);
+}
+
+/*
+ * Un-account an orphaned socket: drop the global per-protocol counter
+ * and, when cgroup accounting applies, the per-cgroup counter(s).
+ * Note the operations run in the reverse order of orphan_count_inc().
+ */
+static inline void orphan_count_dec(struct sock *sk)
+{
+	percpu_counter_dec(sk->sk_prot->orphan_count);
+	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
+		cg_orphan_count_dec(sk);
+}
+
 static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
 {
 	struct percpu_counter *ocp = sk->sk_prot->orphan_count;
 	int orphans = percpu_counter_read_positive(ocp);
 
+	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
+		return cg_too_many_orphans(sk, shift);
+
 	if (orphans << shift > sysctl_tcp_max_orphans) {
 		orphans = percpu_counter_sum_positive(ocp);
 		if (orphans << shift > sysctl_tcp_max_orphans)
diff --git a/include/net/tcp_memcontrol.h b/include/net/tcp_memcontrol.h
index 7df18bc..46f05c2 100644
--- a/include/net/tcp_memcontrol.h
+++ b/include/net/tcp_memcontrol.h
@@ -6,8 +6,10 @@ struct tcp_memcontrol {
 	/* per-cgroup tcp memory pressure knobs */
 	struct res_counter tcp_memory_allocated;
 	struct percpu_counter tcp_sockets_allocated;
+	struct percpu_counter tcp_orphan_count;
 	/* those two are read-mostly, leave them at the end */
 	long tcp_prot_mem[3];
+	int tcp_max_orphans;
 	int tcp_memory_pressure;
 };
 
@@ -16,4 +18,8 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss);
 void tcp_destroy_cgroup(struct mem_cgroup *memcg);
 unsigned long long tcp_max_memory(const struct mem_cgroup *memcg);
 void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx);
+
+void cg_orphan_count_inc(struct sock *sk);
+void cg_orphan_count_dec(struct sock *sk);
+bool cg_too_many_orphans(struct sock *sk, int shift);
 #endif /* _TCP_MEMCG_H */
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 391511f..6d484c2 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1056,7 +1056,7 @@ adjudge_to_death:
 	bh_lock_sock(sk);
 	WARN_ON(sock_owned_by_user(sk));
 
-	percpu_counter_inc(sk->sk_prot->orphan_count);
+	orphan_count_inc(sk);
 
 	/* Have we already been destroyed by a softirq or backlog? */
 	if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED)
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 07622a4..3d428d6 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -23,6 +23,7 @@
 #include <net/route.h>
 #include <net/tcp_states.h>
 #include <net/xfrm.h>
+#include <net/tcp.h>
 
 #ifdef INET_CSK_DEBUG
 const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
@@ -726,7 +727,7 @@ void inet_csk_destroy_sock(struct sock *sk)
 
 	sk_refcnt_debug_release(sk);
 
-	percpu_counter_dec(sk->sk_prot->orphan_count);
+	orphan_count_dec(sk);
 	sock_put(sk);
 }
 EXPORT_SYMBOL(inet_csk_destroy_sock);
@@ -743,7 +744,7 @@ void inet_csk_prepare_forced_close(struct sock *sk)
 
 	/* The below has to be done to allow calling inet_csk_destroy_sock */
 	sock_set_flag(sk, SOCK_DEAD);
-	percpu_counter_inc(sk->sk_prot->orphan_count);
+	orphan_count_inc(sk);
 	inet_sk(sk)->inet_num = 0;
 }
 EXPORT_SYMBOL(inet_csk_prepare_forced_close);
@@ -822,7 +823,7 @@ void inet_csk_listen_stop(struct sock *sk)
 
 		sock_orphan(child);
 
-		percpu_counter_inc(sk->sk_prot->orphan_count);
+		orphan_count_inc(sk);
 
 		if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->listener) {
 			BUG_ON(tcp_sk(child)->fastopen_rsk != req);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8cbf0f5..074fd3b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2200,7 +2200,7 @@ adjudge_to_death:
 	bh_lock_sock(sk);
 	WARN_ON(sock_owned_by_user(sk));
 
-	percpu_counter_inc(sk->sk_prot->orphan_count);
+	orphan_count_inc(sk);
 
 	/* Have we already been destroyed by a softirq or backlog? */
 	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index da14436..0323647 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -6,6 +6,8 @@
 #include <linux/memcontrol.h>
 #include <linux/module.h>
 
+#define RES_ORPHANS	1024
+
 static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto)
 {
 	return container_of(cg_proto, struct tcp_memcontrol, cg_proto);
@@ -18,6 +20,66 @@ static void memcg_tcp_enter_memory_pressure(struct sock *sk)
 }
 EXPORT_SYMBOL(memcg_tcp_enter_memory_pressure);
 
+/*
+ * Charge one orphan to the socket's cgroup and to every ancestor
+ * cgroup up the hierarchy (via parent_cg_proto), so that a parent's
+ * orphan limit also covers its children's orphans.
+ */
+void cg_orphan_count_inc(struct sock *sk)
+{
+	struct cg_proto *cg;
+
+	for (cg = sk->sk_cgrp; cg; cg = parent_cg_proto(sk->sk_prot, cg)) {
+		struct tcp_memcontrol *tcp;
+
+		tcp = tcp_from_cgproto(cg);
+		percpu_counter_inc(&tcp->tcp_orphan_count);
+	}
+}
+
+/*
+ * Undo cg_orphan_count_inc(): uncharge one orphan from the socket's
+ * cgroup and from every ancestor cgroup up the hierarchy.
+ */
+void cg_orphan_count_dec(struct sock *sk)
+{
+	struct cg_proto *cg;
+
+	for (cg = sk->sk_cgrp; cg; cg = parent_cg_proto(sk->sk_prot, cg)) {
+		struct tcp_memcontrol *tcp;
+
+		tcp = tcp_from_cgproto(cg);
+		percpu_counter_dec(&tcp->tcp_orphan_count);
+	}
+}
+
+/*
+ * Return true if the socket's cgroup, or any of its ancestors, is over
+ * its per-cgroup orphan limit.  Mirrors the global check in
+ * tcp_too_many_orphans(): first try the cheap approximate per-cpu
+ * read, and only when that already looks over the limit fall back to
+ * the exact (more expensive) percpu_counter sum before declaring a
+ * breach.
+ */
+bool cg_too_many_orphans(struct sock *sk, int shift)
+{
+	struct cg_proto *cg;
+
+	for (cg = sk->sk_cgrp; cg; cg = parent_cg_proto(sk->sk_prot, cg)) {
+		struct tcp_memcontrol *tcp;
+		struct percpu_counter *ocp;
+		int orphans;
+
+		tcp = tcp_from_cgproto(cg);
+		ocp = &tcp->tcp_orphan_count;
+		orphans = percpu_counter_read_positive(ocp);
+
+		/* Approximate read first; re-check with the exact sum. */
+		if (orphans << shift > tcp->tcp_max_orphans) {
+			orphans = percpu_counter_sum_positive(ocp);
+			if (orphans << shift > tcp->tcp_max_orphans)
+				return true;
+		}
+	}
+
+	return false;
+}
+
+/*
+ * Read handler backing the kmem.tcp.orphans cgroup file (RES_ORPHANS):
+ * report the exact (summed) orphan count for this memcg, or 0 when the
+ * memcg has no tcp cgroup state attached.
+ */
+static u64 tcp_read_orphans(struct mem_cgroup *mem)
+{
+	struct tcp_memcontrol *tcp;
+	struct cg_proto *cg_proto;
+
+	cg_proto = tcp_prot.proto_cgroup(mem);
+	if (!cg_proto)
+		return 0;
+
+	tcp = tcp_from_cgproto(cg_proto);
+	return percpu_counter_sum_positive(&tcp->tcp_orphan_count);
+}
+
 int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
 	/*
@@ -40,6 +102,7 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 	tcp->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0];
 	tcp->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1];
 	tcp->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2];
+	tcp->tcp_max_orphans = sysctl_tcp_max_orphans >> 2;
 	tcp->tcp_memory_pressure = 0;
 
 	parent_cg = tcp_prot.proto_cgroup(parent);
@@ -48,6 +111,7 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 
 	res_counter_init(&tcp->tcp_memory_allocated, res_parent);
 	percpu_counter_init(&tcp->tcp_sockets_allocated, 0);
+	percpu_counter_init(&tcp->tcp_orphan_count, 0);
 
 	cg_proto->enter_memory_pressure = memcg_tcp_enter_memory_pressure;
 	cg_proto->memory_pressure = &tcp->tcp_memory_pressure;
@@ -196,6 +260,9 @@ static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft)
 	case RES_MAX_USAGE:
 		val = tcp_read_stat(memcg, cft->private, 0);
 		break;
+	case RES_ORPHANS:
+		val = tcp_read_orphans(memcg);
+		break;
 	default:
 		BUG();
 	}
@@ -277,6 +344,11 @@ static struct cftype tcp_files[] = {
 		.trigger = tcp_cgroup_reset,
 		.read_u64 = tcp_cgroup_read,
 	},
+	{
+		.name = "kmem.tcp.orphans",
+		.private = RES_ORPHANS,
+		.read_u64 = tcp_cgroup_read, /* XXX add configuration knob */
+	},
 	{ }	/* terminate */
 };
 
-- 
1.8.3.1





More information about the Devel mailing list