iperf3 UDP throughput test drops to 0 bits/sec

最近在进行路由器网络性能调优，发现在跑PC到路由器方向的UDP吞吐量测试时，跑大约一秒后速率就变成0了；跑路由器到PC方向则没有问题。

/ # taskset 1 iperf3 -c 192.168.3.100 -u -b 1000M -t 1000 -l 64000 -R

warning: UDP block size 64000 exceeds TCP MSS 1448, may result in fragmentation / drops

Connecting to host 192.168.3.100, port 5201

Reverse mode, remote host 192.168.3.100 is sending

[ 5] local 192.168.3.1 port 60208 connected to 192.168.3.100 port 5201

[ ID] Interval Transfer Bitrate Jitter Lost/Total Datagrams

[ 5] 0.00-1.00 sec 57.1 MBytes 478 Mbits/sec 0.055 ms 66/1001 (6.6%)

[ 5] 1.00-2.00 sec 0.00 Bytes 0.00 bits/sec 0.055 ms 0/0 (0%)

[ 5] 2.00-3.00 sec 0.00 Bytes 0.00 bits/sec 0.055 ms 0/0 (0%)

[ 5] 3.00-4.00 sec 0.00 Bytes 0.00 bits/sec 0.055 ms 0/0 (0%)

[ 5] 4.00-5.00 sec 0.00 Bytes 0.00 bits/sec 0.055 ms 0/0 (0%)

[ 5] 5.00-6.00 sec 0.00 Bytes 0.00 bits/sec 0.055 ms 0/0 (0%)

[ 5] 6.00-7.00 sec 0.00 Bytes 0.00 bits/sec 0.055 ms 0/0 (0%)

[ 5] 7.00-8.00 sec 0.00 Bytes 0.00 bits/sec 0.055 ms 0/0 (0%)

[ 5] 8.00-9.00 sec 0.00 Bytes 0.00 bits/sec 0.055 ms 0/0 (0%)

[ 5] 9.00-10.00 sec 0.00 Bytes 0.00 bits/sec 0.055 ms 0/0 (0%)

[ 5] 10.00-11.00 sec 0.00 Bytes 0.00 bits/sec 0.055 ms 0/0 (0%)

[ 5] 11.00-12.00 sec 0.00 Bytes 0.00 bits/sec 0.055 ms 0/0 (0%)

开始怀疑是网卡驱动的问题。尝试将网卡驱动的收包路径改成不走GRO（即下面代码中的 #else 分支），还是不行。

#if defined(NSS_DP_ENABLE_NAPI_GRO)

napi_gro_receive(&rx_info->napi_rx, rx_skb);

#else

netif_receive_skb(rx_skb);

#endif

使用 strace 跟踪 iperf3，发现运行一小段时间后就不再有 recv 调用了，只剩 pselect。这说明内核没有再把UDP数据报交付给套接字。

如果把 -l 64000 选项去掉，按默认长度（1460）跑，就没问题；使用 -l 3000 也有问题。3000 和 64000 都大于 MTU，因此会导致 IP 分片。

怀疑是网卡驱动在checksum方面有问题，重组后checksum不对导致丢失。

在路由器上抓1024个包，使用wireshark分析，发现checksum都是对的。

后面尝试看看内核统计数据，是哪里丢了包。阅读内核源代码[2]，发现内核协议层统计数据在/proc/net/snmp文件。

跑流的时候使用delta[1]命令去观察这个文件的统计数据变化。

Ip: Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates

Ip: 0 0 5 0 0 2 0 0 3 6 0 0 0 117933 0 117933 0 0 0

Icmp: InMsgs InErrors InCsumErrors InDestUnreachs InTimeExcds InParmProbs InSrcQuenchs InRedirects InEchos InEchoReps InTimestamps InTimestampReps InAddrMasks InAddrMaskReps OutMsgs OutErrors OutDestUnreachs OutTimeExcds OutParmProbs OutSrcQuenchs OutRedirects OutEchos OutEchoReps OutTimestamps OutTimestampReps OutAddrMasks OutAddrMaskReps

Icmp: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0

IcmpMsg: InType0 OutType0 OutType0 OutType0 OutType 0

IcmpMsg: 0 0 1 0 0

Tcp: RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts InCsumErrors

Tcp: 0 0 0 -0 0 0 0 0 0 0 0 0 0 0 0

Udp: InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti MemErrors

Udp: 2 1 0 3 0 0 0 0 0

UdpLite: InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti MemE^C

可以发现 ReasmReqds 和 ReasmFails 统计数据非常高，而且两者相等（都是 117933），ReasmOKs 为 0，也就是说所有需要重组的分片都重组失败了。

查看内核源代码，ReasmReqds是需要重组的skb个数，ReasmFails是重组失败的skb个数。

内核有几个重组失败的地方，在这些地方加上打印看看是哪个地方失败：

diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index fad803d..99820ad 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -239,8 +239,10 @@ static int ip_frag_too_far(struct ipq *qp)
  
        rc = qp->q.fragments_tail && (end - start) > max;
  
-       if (rc)
+       if (rc) {
+               pr_warn_ratelimited("too far\n");
                __IP_INC_STATS(qp->q.fqdir->net, IPSTATS_MIB_REASMFAILS);
+       }
  
        return rc;
 }
@@ -388,6 +390,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
        __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
 discard_qp:
        inet_frag_kill(&qp->q);
+       pr_warn_ratelimited("discard qp\n");
        __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
 err:
        kfree_skb(skb);
@@ -467,6 +470,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
 out_oversize:
        net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr);
 out_fail:
+       pr_warn_ratelimited("out fail\n");
        __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
        return err;
 }
@@ -495,6 +499,7 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
                return ret;
        }
  
+       pr_warn_ratelimited("defreg\n");
        __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
        kfree_skb(skb);
        return -ENOMEM;

重新编译内核测试，发现是 ip_defrag 失败了，它调用

qp = ip_find(net, ip_hdr(skb), user, vif);

ip_find调用

q = inet_frag_find(net->ipv4.fqdir, &key);

失败了。

inet_frag_find()定义在net/ipv4/inet_fragment.c。有一个可疑的地方加上打印：

/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
/*
 * inet_frag_find - find, or create, the fragment queue matching @key.
 * @fqdir: fragment queue directory; supplies the hash table that is searched
 *         and the high_thresh memory limit that is checked first.
 * @key:   lookup key passed to the hash table (and to inet_frag_create()).
 *
 * Returns NULL without touching the hash table when high_thresh is zero or
 * when frag_mem_limit(@fqdir) already exceeds it. Otherwise the queue is
 * looked up under rcu_read_lock(); when no entry exists inet_frag_create()
 * is called. If a valid queue pointer ends up in 'prev', a reference is
 * taken on it with refcount_inc_not_zero() and it is returned; if that
 * reference cannot be taken, NULL is returned.
 */
struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key)
{
        /* This pairs with WRITE_ONCE() in fqdir_pre_exit(). */
        long high_thresh = READ_ONCE(fqdir->high_thresh);
        struct inet_frag_queue *fq = NULL, *prev;
 
        /*
         * Refuse to look up or create a queue once fragment memory usage is
         * above the limit (or the limit is zero): the caller gets NULL.
         */
        if (!high_thresh || frag_mem_limit(fqdir) > high_thresh) {
                /* Debug print added while tracing the reassembly failures. */
                pr_warn_ratelimited("high thresh\n");
                return NULL;
        }
 
        rcu_read_lock();
 
        prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params);
        /* Nothing found: try to create a queue; 'prev' is passed by address
         * so inet_frag_create() can update it.
         */
        if (!prev)
                fq = inet_frag_create(fqdir, key, &prev);
        /* 'prev' holds a usable queue pointer: return it instead, but only
         * if a reference can still be taken on it.
         */
        if (!IS_ERR_OR_NULL(prev)) {
                fq = prev;
                if (!refcount_inc_not_zero(&fq->refcnt))
                        fq = NULL;
        }   
        rcu_read_unlock();
        return fq; 
}
EXPORT_SYMBOL(inet_frag_find);

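再次编译内核，测试发现 high thresh 打印了。这说明分片队列占用的内存（frag_mem_limit）超过了 high_thresh，inet_frag_find() 直接返回 NULL，后续分片全部被丢弃。也就是说 high_thresh 太小了。它对应一个 sysctl 选项 ipfrag_high_thresh。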

/proc/sys/net/ipv4 # cat ipfrag_high_thresh

4194304

发现它默认只有 4194304 字节，即 4M。

尝试将其修改为50M（例如 echo 52428800 > /proc/sys/net/ipv4/ipfrag_high_thresh）。再次测试，问题解决：

/proc/sys/net/ipv4 # taskset 1 iperf3 -c 192.168.3.100 -u -b 1000M -l 65000 -R

warning: UDP block size 65000 exceeds TCP MSS 1448, may result in fragmentation / drops

Connecting to host 192.168.3.100, port 5201

Reverse mode, remote host 192.168.3.100 is sending

[ 5] local 192.168.3.1 port 54658 connected to 192.168.3.100 port 5201

[ ID] Interval Transfer Bitrate Jitter Lost/Total Datagrams

[ 5] 0.00-1.00 sec 98.0 MBytes 822 Mbits/sec 0.110 ms 273/1854 (15%)

[ 5] 1.00-2.00 sec 102 MBytes 858 Mbits/sec 0.050 ms 201/1850 (11%)

[ 5] 2.00-3.00 sec 110 MBytes 924 Mbits/sec 0.099 ms 73/1851 (3.9%)

[ 5] 3.00-4.00 sec 112 MBytes 936 Mbits/sec 0.121 ms 50/1850 (2.7%)

[ 5] 4.00-5.00 sec 112 MBytes 942 Mbits/sec 0.034 ms 40/1851 (2.2%)

[ 5] 5.00-6.00 sec 113 MBytes 947 Mbits/sec 0.044 ms 28/1850 (1.5%)

[ 5] 6.00-7.00 sec 114 MBytes 953 Mbits/sec 0.143 ms 18/1851 (0.97%)

[ 5] 7.00-8.00 sec 114 MBytes 953 Mbits/sec 0.086 ms 17/1850 (0.92%)

[ 5] 8.00-9.00 sec 114 MBytes 955 Mbits/sec 0.105 ms 14/1850 (0.76%)

[ 5] 9.00-10.00 sec 114 MBytes 954 Mbits/sec 0.055 ms 16/1851 (0.86%)

- - - - - - - - - - - - - - - - - - - - - - - - -

[ ID] Interval Transfer Bitrate Jitter Lost/Total Datagrams

[ 5] 0.00-10.85 sec 1.22 GBytes 962 Mbits/sec 0.000 ms 0/20071 (0%) sender

[ 5] 0.00-10.00 sec 1.08 GBytes 924 Mbits/sec 0.055 ms 730/18508 (3.9%) receiver

参考：

[1] delta command, https://insidelinuxdev.net/~yuanjianpeng/project/delta

[2] udp6 local in path, https://insidelinuxdev.net/article/a0ds1p.html

ILD