]> git.baikalelectronics.ru Git - kernel.git/commitdiff
net/smc: Fix setsockopt and sysctl to specify same buffer size again
authorGerd Bayer <gbayer@linux.ibm.com>
Fri, 4 Aug 2023 17:06:23 +0000 (19:06 +0200)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 23 Aug 2023 15:52:18 +0000 (17:52 +0200)
[ Upstream commit 833bac7ec392bf75053c8a4fa4c36d4148dac77d ]

Commit 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock
and make them tunable") introduced the net.smc.rmem and net.smc.wmem
sysctls to specify the size of buffers to be used for SMC type
connections. This created a regression for users that specified the
buffer size via setsockopt() as the effective buffer size was now
doubled.

Re-introduce the division by 2 in the SMC buffer create code and level
this out by duplicating the net.smc.[rw]mem values used for initializing
sk_rcvbuf/sk_sndbuf at socket creation time. This gives users of both
methods (setsockopt or sysctl) the effective buffer size that they
expect.

Initialize net.smc.[rw]mem from its own constant of 64kB, respectively.
Internal performance tests show that this value is a good compromise
between throughput/latency and memory consumption. Also, this decouples
it from any tuning that was done to net.ipv4.tcp_[rw]mem[1] before the
module for SMC protocol was loaded. Check that no more than INT_MAX / 2
is assigned to net.smc.[rw]mem, in order to avoid any overflow condition
when that is doubled for use in sk_sndbuf or sk_rcvbuf.

While at it, drop the confusing sk_buf_size variable from
__smc_buf_create and name "compressed" buffer size variables more
consistently.

Background:

Before the commit mentioned above, SMC's buffer allocator in
__smc_buf_create() always used half of the sockets' sk_rcvbuf/sk_sndbuf
value as initial value to search for appropriate buffers. If the search
resorted to using a bigger buffer when all buffers of the specified
size were busy, the duplicate of the used effective buffer size is
stored back to sk_rcvbuf/sk_sndbuf.

When available, buffers of exactly the size that a user had specified as
input to setsockopt() were used, despite setsockopt()'s documentation in
"man 7 socket" talking of a mandatory duplication:

[...]
       SO_SNDBUF
              Sets  or  gets the maximum socket send buffer in bytes.
              The kernel doubles this value (to allow space for book‐
              keeping  overhead)  when it is set using setsockopt(2),
              and this doubled value is  returned  by  getsockopt(2).
              The     default     value     is     set     by     the
              /proc/sys/net/core/wmem_default file  and  the  maximum
              allowed value is set by the /proc/sys/net/core/wmem_max
              file.  The minimum (doubled) value for this  option  is
              2048.
[...]

Fixes: 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
Co-developed-by: Jan Karcher <jaka@linux.ibm.com>
Signed-off-by: Jan Karcher <jaka@linux.ibm.com>
Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Signed-off-by: Gerd Bayer <gbayer@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
net/smc/af_smc.c
net/smc/smc.h
net/smc/smc_clc.c
net/smc/smc_core.c
net/smc/smc_sysctl.c

index 5712a5297bd0188389c35e65073b2aac8923d1b7..84219c5121bc270c05d0e9abdbdcb957980449c6 100644 (file)
@@ -379,8 +379,8 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
        sk->sk_state = SMC_INIT;
        sk->sk_destruct = smc_destruct;
        sk->sk_protocol = protocol;
-       WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem));
-       WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem));
+       WRITE_ONCE(sk->sk_sndbuf, 2 * READ_ONCE(net->smc.sysctl_wmem));
+       WRITE_ONCE(sk->sk_rcvbuf, 2 * READ_ONCE(net->smc.sysctl_rmem));
        smc = smc_sk(sk);
        INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
        INIT_WORK(&smc->connect_work, smc_connect_work);
index 5ed765ea0c731a7f0095cd6a99a0e42d227eaca9..1d36720fc019c5d04deca7219fce43cc7b50abd2 100644 (file)
@@ -161,7 +161,7 @@ struct smc_connection {
 
        struct smc_buf_desc     *sndbuf_desc;   /* send buffer descriptor */
        struct smc_buf_desc     *rmb_desc;      /* RMBE descriptor */
-       int                     rmbe_size_short;/* compressed notation */
+       int                     rmbe_size_comp; /* compressed notation */
        int                     rmbe_update_limit;
                                                /* lower limit for consumer
                                                 * cursor update
index dfb9797f7bc63981eaa3dba55fd3d18e89e80ab7..9b8999e2afca56345b9316742f39fe396a49db20 100644 (file)
@@ -1002,7 +1002,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc,
                clc->hdr.typev1 = SMC_TYPE_D;
                clc->d0.gid = conn->lgr->smcd->local_gid;
                clc->d0.token = conn->rmb_desc->token;
-               clc->d0.dmbe_size = conn->rmbe_size_short;
+               clc->d0.dmbe_size = conn->rmbe_size_comp;
                clc->d0.dmbe_idx = 0;
                memcpy(&clc->d0.linkid, conn->lgr->id, SMC_LGR_ID_SIZE);
                if (version == SMC_V1) {
@@ -1045,7 +1045,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc,
                        clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu);
                        break;
                }
-               clc->r0.rmbe_size = conn->rmbe_size_short;
+               clc->r0.rmbe_size = conn->rmbe_size_comp;
                clc->r0.rmb_dma_addr = conn->rmb_desc->is_vm ?
                        cpu_to_be64((uintptr_t)conn->rmb_desc->cpu_addr) :
                        cpu_to_be64((u64)sg_dma_address
index f793101e0ecc3b2cef3a3d0d5f11c3e125b71c2a..c676d92af7b7dd409240866190ee02a073008794 100644 (file)
@@ -2305,31 +2305,30 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
        struct smc_connection *conn = &smc->conn;
        struct smc_link_group *lgr = conn->lgr;
        struct list_head *buf_list;
-       int bufsize, bufsize_short;
+       int bufsize, bufsize_comp;
        struct rw_semaphore *lock;      /* lock buffer list */
        bool is_dgraded = false;
-       int sk_buf_size;
 
        if (is_rmb)
                /* use socket recv buffer size (w/o overhead) as start value */
-               sk_buf_size = smc->sk.sk_rcvbuf;
+               bufsize = smc->sk.sk_rcvbuf / 2;
        else
                /* use socket send buffer size (w/o overhead) as start value */
-               sk_buf_size = smc->sk.sk_sndbuf;
+               bufsize = smc->sk.sk_sndbuf / 2;
 
-       for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb);
-            bufsize_short >= 0; bufsize_short--) {
+       for (bufsize_comp = smc_compress_bufsize(bufsize, is_smcd, is_rmb);
+            bufsize_comp >= 0; bufsize_comp--) {
                if (is_rmb) {
                        lock = &lgr->rmbs_lock;
-                       buf_list = &lgr->rmbs[bufsize_short];
+                       buf_list = &lgr->rmbs[bufsize_comp];
                } else {
                        lock = &lgr->sndbufs_lock;
-                       buf_list = &lgr->sndbufs[bufsize_short];
+                       buf_list = &lgr->sndbufs[bufsize_comp];
                }
-               bufsize = smc_uncompress_bufsize(bufsize_short);
+               bufsize = smc_uncompress_bufsize(bufsize_comp);
 
                /* check for reusable slot in the link group */
-               buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list);
+               buf_desc = smc_buf_get_slot(bufsize_comp, lock, buf_list);
                if (buf_desc) {
                        buf_desc->is_dma_need_sync = 0;
                        SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize);
@@ -2373,8 +2372,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
 
        if (is_rmb) {
                conn->rmb_desc = buf_desc;
-               conn->rmbe_size_short = bufsize_short;
-               smc->sk.sk_rcvbuf = bufsize;
+               conn->rmbe_size_comp = bufsize_comp;
+               smc->sk.sk_rcvbuf = bufsize * 2;
                atomic_set(&conn->bytes_to_rcv, 0);
                conn->rmbe_update_limit =
                        smc_rmb_wnd_update_limit(buf_desc->len);
@@ -2382,7 +2381,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
                        smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
        } else {
                conn->sndbuf_desc = buf_desc;
-               smc->sk.sk_sndbuf = bufsize;
+               smc->sk.sk_sndbuf = bufsize * 2;
                atomic_set(&conn->sndbuf_space, bufsize);
        }
        return 0;
index b6f79fabb9d3f2256af847aff3ff7cf57ec8f4bb..0b2a957ca5f5f09408cc94460fde5e53b5fa4b5d 100644 (file)
 
 static int min_sndbuf = SMC_BUF_MIN_SIZE;
 static int min_rcvbuf = SMC_BUF_MIN_SIZE;
+static int max_sndbuf = INT_MAX / 2;
+static int max_rcvbuf = INT_MAX / 2;
+static const int net_smc_wmem_init = (64 * 1024);
+static const int net_smc_rmem_init = (64 * 1024);
 
 static struct ctl_table smc_table[] = {
        {
@@ -53,6 +57,7 @@ static struct ctl_table smc_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &min_sndbuf,
+               .extra2         = &max_sndbuf,
        },
        {
                .procname       = "rmem",
@@ -61,6 +66,7 @@ static struct ctl_table smc_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &min_rcvbuf,
+               .extra2         = &max_rcvbuf,
        },
        {  }
 };
@@ -88,8 +94,8 @@ int __net_init smc_sysctl_net_init(struct net *net)
        net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE;
        net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS;
        net->smc.sysctl_smcr_testlink_time = SMC_LLC_TESTLINK_DEFAULT_TIME;
-       WRITE_ONCE(net->smc.sysctl_wmem, READ_ONCE(net->ipv4.sysctl_tcp_wmem[1]));
-       WRITE_ONCE(net->smc.sysctl_rmem, READ_ONCE(net->ipv4.sysctl_tcp_rmem[1]));
+       WRITE_ONCE(net->smc.sysctl_wmem, net_smc_wmem_init);
+       WRITE_ONCE(net->smc.sysctl_rmem, net_smc_rmem_init);
 
        return 0;