[Devel] [PATCH RHEL7 COMMIT] tcp: Limit orphan sockets per-cg

Konstantin Khorenko khorenko at virtuozzo.com
Fri Jun 5 12:55:43 PDT 2015


The commit is pushed to "branch-rh7-3.10.0-123.1.2-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.10
------>
commit e695351ac3ec21541f42081530b7fe04a82d9742
Author: Pavel Emelyanov <xemul at parallels.com>
Date:   Fri Jun 5 23:55:43 2015 +0400

    tcp: Limit orphan sockets per-cg
    
    The kernel limits the total number of orphan TCP sockets in the
    system. One container can consume this entire limit and make other
    containers suffer from TCP state machine breakage.
    
    Thus, introduce a per-CT limit on the number of orphans, so that
    one container cannot affect the others. The limit is set to 1/4 of
    the system limit, which should be OK for hosters' loads.
    
    https://jira.sw.ru/browse/PSBM-33584
    
    khorenko@ changes:
    - exported cg_orphan_count_inc(): it is used in the dccp module
    
    Signed-off-by: Pavel Emelyanov <xemul at parallels.com>
---
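Note (not part of the commit): the counter added below is exposed through the
"kmem.tcp.orphans" cftype, so with the memory controller mounted it should
appear as memory.kmem.tcp.orphans inside each container's memory cgroup.
The "memory." prefix and the /sys/fs/cgroup/memory mount point used below are
assumptions, not guaranteed by this patch; a minimal monitoring sketch in C:

/*
 * Sketch: print the per-cgroup TCP orphan count for one container.
 * Assumes the memory controller is mounted at /sys/fs/cgroup/memory and
 * that the new cftype shows up with the usual "memory." prefix.
 */
#include <stdio.h>

int main(int argc, char **argv)
{
	char path[512];
	unsigned long long orphans;
	FILE *f;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <cgroup-name>\n", argv[0]);
		return 1;
	}

	snprintf(path, sizeof(path),
		 "/sys/fs/cgroup/memory/%s/memory.kmem.tcp.orphans", argv[1]);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%llu", &orphans) == 1)
		printf("%s: %llu orphaned TCP sockets\n", argv[1], orphans);
	fclose(f);
	return 0;
}
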
 include/net/tcp.h               | 18 ++++++++++
 include/net/tcp_memcontrol.h    |  6 ++++
 net/dccp/proto.c                |  2 +-
 net/ipv4/inet_connection_sock.c |  7 ++--
 net/ipv4/tcp.c                  |  2 +-
 net/ipv4/tcp_memcontrol.c       | 73 +++++++++++++++++++++++++++++++++++++++++
 6 files changed, 103 insertions(+), 5 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index cf81c63..f4e704f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -46,6 +46,7 @@
 
 #include <linux/seq_file.h>
 #include <linux/memcontrol.h>
+#include <net/tcp_memcontrol.h>
 
 #define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
 #define TCP_OFF(sk)	(sk->sk_sndmsg_off)
@@ -330,11 +331,28 @@ static inline bool tcp_out_of_memory(struct sock *sk)
 	return false;
 }
 
+static inline void orphan_count_inc(struct sock *sk)
+{
+	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
+		cg_orphan_count_inc(sk);
+	percpu_counter_inc(sk->sk_prot->orphan_count);
+}
+
+static inline void orphan_count_dec(struct sock *sk)
+{
+	percpu_counter_dec(sk->sk_prot->orphan_count);
+	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
+		cg_orphan_count_dec(sk);
+}
+
 static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
 {
 	struct percpu_counter *ocp = sk->sk_prot->orphan_count;
 	int orphans = percpu_counter_read_positive(ocp);
 
+	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
+		return cg_too_many_orphans(sk, shift);
+
 	if (orphans << shift > sysctl_tcp_max_orphans) {
 		orphans = percpu_counter_sum_positive(ocp);
 		if (orphans << shift > sysctl_tcp_max_orphans)
diff --git a/include/net/tcp_memcontrol.h b/include/net/tcp_memcontrol.h
index 7df18bc..46f05c2 100644
--- a/include/net/tcp_memcontrol.h
+++ b/include/net/tcp_memcontrol.h
@@ -6,8 +6,10 @@ struct tcp_memcontrol {
 	/* per-cgroup tcp memory pressure knobs */
 	struct res_counter tcp_memory_allocated;
 	struct percpu_counter tcp_sockets_allocated;
+	struct percpu_counter tcp_orphan_count;
 	/* those two are read-mostly, leave them at the end */
 	long tcp_prot_mem[3];
+	int tcp_max_orphans;
 	int tcp_memory_pressure;
 };
 
@@ -16,4 +18,8 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss);
 void tcp_destroy_cgroup(struct mem_cgroup *memcg);
 unsigned long long tcp_max_memory(const struct mem_cgroup *memcg);
 void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx);
+
+void cg_orphan_count_inc(struct sock *sk);
+void cg_orphan_count_dec(struct sock *sk);
+bool cg_too_many_orphans(struct sock *sk, int shift);
 #endif /* _TCP_MEMCG_H */
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 391511f..6d484c2 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1056,7 +1056,7 @@ adjudge_to_death:
 	bh_lock_sock(sk);
 	WARN_ON(sock_owned_by_user(sk));
 
-	percpu_counter_inc(sk->sk_prot->orphan_count);
+	orphan_count_inc(sk);
 
 	/* Have we already been destroyed by a softirq or backlog? */
 	if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED)
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 07622a4..3d428d6 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -23,6 +23,7 @@
 #include <net/route.h>
 #include <net/tcp_states.h>
 #include <net/xfrm.h>
+#include <net/tcp.h>
 
 #ifdef INET_CSK_DEBUG
 const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
@@ -726,7 +727,7 @@ void inet_csk_destroy_sock(struct sock *sk)
 
 	sk_refcnt_debug_release(sk);
 
-	percpu_counter_dec(sk->sk_prot->orphan_count);
+	orphan_count_dec(sk);
 	sock_put(sk);
 }
 EXPORT_SYMBOL(inet_csk_destroy_sock);
@@ -743,7 +744,7 @@ void inet_csk_prepare_forced_close(struct sock *sk)
 
 	/* The below has to be done to allow calling inet_csk_destroy_sock */
 	sock_set_flag(sk, SOCK_DEAD);
-	percpu_counter_inc(sk->sk_prot->orphan_count);
+	orphan_count_inc(sk);
 	inet_sk(sk)->inet_num = 0;
 }
 EXPORT_SYMBOL(inet_csk_prepare_forced_close);
@@ -822,7 +823,7 @@ void inet_csk_listen_stop(struct sock *sk)
 
 		sock_orphan(child);
 
-		percpu_counter_inc(sk->sk_prot->orphan_count);
+		orphan_count_inc(sk);
 
 		if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->listener) {
 			BUG_ON(tcp_sk(child)->fastopen_rsk != req);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8cbf0f5..074fd3b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2200,7 +2200,7 @@ adjudge_to_death:
 	bh_lock_sock(sk);
 	WARN_ON(sock_owned_by_user(sk));
 
-	percpu_counter_inc(sk->sk_prot->orphan_count);
+	orphan_count_inc(sk);
 
 	/* Have we already been destroyed by a softirq or backlog? */
 	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index da14436..3636f9d 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -6,6 +6,8 @@
 #include <linux/memcontrol.h>
 #include <linux/module.h>
 
+#define RES_ORPHANS	1024
+
 static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto)
 {
 	return container_of(cg_proto, struct tcp_memcontrol, cg_proto);
@@ -18,6 +20,67 @@ static void memcg_tcp_enter_memory_pressure(struct sock *sk)
 }
 EXPORT_SYMBOL(memcg_tcp_enter_memory_pressure);
 
+void cg_orphan_count_inc(struct sock *sk)
+{
+	struct cg_proto *cg;
+
+	for (cg = sk->sk_cgrp; cg; cg = parent_cg_proto(sk->sk_prot, cg)) {
+		struct tcp_memcontrol *tcp;
+
+		tcp = tcp_from_cgproto(cg);
+		percpu_counter_inc(&tcp->tcp_orphan_count);
+	}
+}
+EXPORT_SYMBOL(cg_orphan_count_inc);
+
+void cg_orphan_count_dec(struct sock *sk)
+{
+	struct cg_proto *cg;
+
+	for (cg = sk->sk_cgrp; cg; cg = parent_cg_proto(sk->sk_prot, cg)) {
+		struct tcp_memcontrol *tcp;
+
+		tcp = tcp_from_cgproto(cg);
+		percpu_counter_dec(&tcp->tcp_orphan_count);
+	}
+}
+
+bool cg_too_many_orphans(struct sock *sk, int shift)
+{
+	struct cg_proto *cg;
+
+	for (cg = sk->sk_cgrp; cg; cg = parent_cg_proto(sk->sk_prot, cg)) {
+		struct tcp_memcontrol *tcp;
+		struct percpu_counter *ocp;
+		int orphans;
+
+		tcp = tcp_from_cgproto(cg);
+		ocp = &tcp->tcp_orphan_count;
+		orphans = percpu_counter_read_positive(ocp);
+
+		if (orphans << shift > tcp->tcp_max_orphans) {
+			orphans = percpu_counter_sum_positive(ocp);
+			if (orphans << shift > tcp->tcp_max_orphans)
+				return true;
+		}
+	}
+
+	return false;
+}
+
+static u64 tcp_read_orphans(struct mem_cgroup *mem)
+{
+	struct tcp_memcontrol *tcp;
+	struct cg_proto *cg_proto;
+
+	cg_proto = tcp_prot.proto_cgroup(mem);
+	if (!cg_proto)
+		return 0;
+
+	tcp = tcp_from_cgproto(cg_proto);
+	return percpu_counter_sum_positive(&tcp->tcp_orphan_count);
+}
+
 int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
 	/*
@@ -40,6 +103,7 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 	tcp->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0];
 	tcp->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1];
 	tcp->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2];
+	tcp->tcp_max_orphans = sysctl_tcp_max_orphans >> 2;
 	tcp->tcp_memory_pressure = 0;
 
 	parent_cg = tcp_prot.proto_cgroup(parent);
@@ -48,6 +112,7 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 
 	res_counter_init(&tcp->tcp_memory_allocated, res_parent);
 	percpu_counter_init(&tcp->tcp_sockets_allocated, 0);
+	percpu_counter_init(&tcp->tcp_orphan_count, 0);
 
 	cg_proto->enter_memory_pressure = memcg_tcp_enter_memory_pressure;
 	cg_proto->memory_pressure = &tcp->tcp_memory_pressure;
@@ -196,6 +261,9 @@ static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft)
 	case RES_MAX_USAGE:
 		val = tcp_read_stat(memcg, cft->private, 0);
 		break;
+	case RES_ORPHANS:
+		val = tcp_read_orphans(memcg);
+		break;
 	default:
 		BUG();
 	}
@@ -277,6 +345,11 @@ static struct cftype tcp_files[] = {
 		.trigger = tcp_cgroup_reset,
 		.read_u64 = tcp_cgroup_read,
 	},
+	{
+		.name = "kmem.tcp.orphans",
+		.private = RES_ORPHANS,
+		.read_u64 = tcp_cgroup_read, /* XXX add configuration knob */
+	},
 	{ }	/* terminate */
 };
 

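Note (not part of the commit): the new kmem.tcp.orphans file is read-only for
now; the "XXX add configuration knob" comment above marks the missing write
side, so the per-cgroup limit stays fixed at sysctl_tcp_max_orphans / 4.
A rough sketch of what such a knob might look like, assuming the 3.10 cftype
.write_u64 callback and the mem_cgroup_from_cont() / tcp_prot.proto_cgroup()
helpers already used in tcp_memcontrol.c (handler name and checks are
illustrative only, not part of this patch):

/* Hypothetical .write_u64 handler making the per-cgroup orphan limit tunable. */
static int tcp_write_orphans(struct cgroup *cont, struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
	struct cg_proto *cg_proto;
	struct tcp_memcontrol *tcp;

	cg_proto = tcp_prot.proto_cgroup(memcg);
	if (!cg_proto)
		return -EINVAL;

	if (val > INT_MAX)
		return -EINVAL;

	tcp = tcp_from_cgproto(cg_proto);
	tcp->tcp_max_orphans = val;
	return 0;
}

/* wired up in tcp_files[] as:
 *	.write_u64 = tcp_write_orphans,
 * next to the existing .read_u64 = tcp_cgroup_read entry.
 */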

