[Devel] [PATCH RHEL9 COMMIT] net/unix: batching unix socket writes
    Konstantin Khorenko 
    khorenko at virtuozzo.com
       
    Thu Jan 23 20:40:33 MSK 2025
    
    
  
The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git
after rh9-5.14.0-427.44.1.vz9.80.4
------>
commit f22a247d94c2e09cbf62af51755d647f9342c6f3
Author: Alexey Kuznetsov <kuznet at virtuozzo.com>
Date:   Sat Jan 18 02:08:46 2025 +0800
    net/unix: batching unix socket writes
    
    Optimization. When we send a message with MSG_MORE via a unix socket,
    we are telling the kernel that the message is incomplete, so the peer
    does not have to be woken up immediately.
    
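    The optimization is questionable, so leave a way to disable it:
    the option "use_unix_mitigation". Batching is applied only when the
    option is non-zero; with 0 the peer is woken up on every write.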
    
    Why it is questionable: when the receiver is on another CPU, we might
    want to wake it up as early as possible, so that it can copy data in
    parallel with the sender sending the tail of the message, which
    increases throughput. So in microtests the change will look like a
    disadvantage (throughput will drop), yet on larger systems this is
    just a test artifact and total throughput will increase.
    
    Signed-off-by: Alexey Kuznetsov <kuznet at virtuozzo.com>
    
    Feature: net: batching unix socket writes
---
 net/unix/af_unix.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index d419173732f2..3a1506f8fea2 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -119,6 +119,9 @@
 
 #include "scm.h"
 
+int use_unix_mitigation;
+module_param(use_unix_mitigation, int, 0644);
+
 spinlock_t unix_table_locks[2 * UNIX_HASH_SIZE];
 EXPORT_SYMBOL_GPL(unix_table_locks);
 struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
@@ -2060,6 +2063,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
 	struct scm_cookie scm;
 	bool fds_sent = false;
 	int data_len;
+	int flags;
 
 	wait_for_unix_gc();
 	err = scm_send(sock, msg, &scm, false);
@@ -2067,7 +2071,8 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
 		return err;
 
 	err = -EOPNOTSUPP;
-	if (msg->msg_flags&MSG_OOB)
+	flags = msg->msg_flags;
+	if (flags & MSG_OOB)
 		goto out_err;
 
 	if (msg->msg_namelen) {
@@ -2083,7 +2088,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
 	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
 		goto pipe_err;
 
-	if ((msg->msg_flags & MSG_ZEROCOPY) && len && sock_flag(sk, SOCK_ZEROCOPY)) {
+	if ((flags & MSG_ZEROCOPY) && len && sock_flag(sk, SOCK_ZEROCOPY)) {
 		uarg = msg_zerocopy_alloc(sk, len);
 		if (!uarg) {
 			err = -ENOBUFS;
@@ -2106,12 +2111,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
 			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
 
 			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
-						   msg->msg_flags & MSG_DONTWAIT, &err,
+						   flags & MSG_DONTWAIT, &err,
 						   get_order(UNIX_SKB_FRAGS_SZ));
 		} else {
 			size = min_t(int, size, sk->sk_sndbuf);
 			skb = sock_alloc_send_pskb(sk, 0, 0,
-						   msg->msg_flags & MSG_DONTWAIT, &err, 0);
+						   flags & MSG_DONTWAIT, &err, 0);
 		}
 
 		if (!skb)
@@ -2152,7 +2157,9 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
 		scm_stat_add(other, skb);
 		skb_queue_tail(&other->sk_receive_queue, skb);
 		unix_state_unlock(other);
-		other->sk_data_ready(other);
+		if (!use_unix_mitigation || !(flags & MSG_MORE) || (flags & MSG_EOR) ||
+		    (refcount_read(&sk->sk_wmem_alloc) << 2) > sk->sk_sndbuf)
+			other->sk_data_ready(other);
 		sent += size;
 	}
 
@@ -2165,7 +2172,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
 	unix_state_unlock(other);
 	kfree_skb(skb);
 pipe_err:
-	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
+	if (sent == 0 && !(flags & MSG_NOSIGNAL))
 		send_sig(SIGPIPE, current, 0);
 	err = -EPIPE;
 out_err:
@@ -2277,7 +2284,9 @@ static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
 	unix_state_unlock(other);
 	mutex_unlock(&unix_sk(other)->iolock);
 
-	other->sk_data_ready(other);
+	if (!use_unix_mitigation || !(flags & MSG_MORE) || (flags & MSG_EOR) ||
+	    (refcount_read(&sk->sk_wmem_alloc) << 2) > sk->sk_sndbuf)
+		other->sk_data_ready(other);
 	scm_destroy(&scm);
 	return size;
 
    
    
More information about the Devel
mailing list