Skip site navigation (1)Skip section navigation (2)
Date:      Fri, 26 Aug 2016 18:17:29 +0300
From:      Slawa Olhovchenkov <slw@zxy.spb.ru>
To:        freebsd-net@freebsd.org
Subject:   Network stack profiling/optimisation
Message-ID:  <20160826151729.GG88122@zxy.spb.ru>

next in thread | raw e-mail | index | archive | help
I am collecting some data related to network stack performance under heavy load.

This is data collected on dual E5-2620, under 20Gbit load.

At peak network traffic (more than 25K connections, about 20Gbit
total traffic), half of the cores are fully utilised by the network stack.

This is a flamegraph from one core: http://zxy.spb.ru/cpu10.svg
This is the same, but with the stack cut off at ixgbe_rxeof for a more unified
tcp/ip stack view: http://zxy.spb.ru/cpu10u.svg

The top 3 most used lines are:

7036 0xffffffff804bf02d atomic_cmpset_long /usr/obj/usr/src/sys/VSTREAM/./machine/atomic.h:163

/*
 * Atomic compare-and-set on an unsigned long.
 *
 * cmpxchgq compares *dst with the value held in %rax (expect, bound there
 * by the "+a" constraint).  If they are equal, src is stored into *dst and
 * ZF is set; otherwise ZF is cleared and the current *dst is loaded into
 * %rax.  sete then copies ZF into res.
 *
 * dst:    location to update.
 * expect: value *dst must currently hold for the store to happen.
 * src:    value written to *dst on a match.
 *
 * Returns 1 if the store happened, 0 otherwise.
 *
 * NOTE(review): MPLOCKED is defined outside this excerpt; presumably it
 * expands to the x86 "lock" prefix on SMP kernels -- confirm in atomic.h.
 */
static __inline int
atomic_cmpset_long(volatile u_long *dst, u_long expect, u_long src)
{
        u_char res;     /* receives ZF via sete */

        /*
         * expect is an in/out operand because cmpxchgq overwrites %rax on a
         * mismatch.  It is a by-value parameter, so that updated copy is
         * discarded; only res is reported to the caller.
         */
        __asm __volatile(
        "       " MPLOCKED "            "
>       "       cmpxchgq %3,%1 ;        "
        "       sete    %0 ;            "
        "# atomic_cmpset_long"
        : "=q" (res),                   /* 0 */
          "+m" (*dst),                  /* 1 */
          "+a" (expect)                 /* 2 */
        : "r" (src)                     /* 3 */
        : "memory", "cc");
        return (res);
}

6099 0xffffffff81171963 ?? ??:0

0xffffffff81171940 <ixgbe_rxeof+1168>:  mov    0x10(%r15),%rax
0xffffffff81171944 <ixgbe_rxeof+1172>:  add    $0x8,%rax
0xffffffff81171948 <ixgbe_rxeof+1176>:  mov    -0x4c(%rbp),%ecx
0xffffffff8117194b <ixgbe_rxeof+1179>:  test   %cx,%cx
0xffffffff8117194e <ixgbe_rxeof+1182>:  mov    %rax,0x10(%r15)
0xffffffff81171952 <ixgbe_rxeof+1186>:  je     0xffffffff8117198d <ixgbe_rxeof+1245>
0xffffffff81171954 <ixgbe_rxeof+1188>:  mov    0x10(%rdi),%rcx
0xffffffff81171958 <ixgbe_rxeof+1192>:  mov    -0x4c(%rbp),%edx
0xffffffff8117195b <ixgbe_rxeof+1195>:  nopl   0x0(%rax,%rax,1)
0xffffffff81171960 <ixgbe_rxeof+1200>:  mov    (%rcx),%rsi
0xffffffff81171963 <ixgbe_rxeof+1203>:  mov    %rsi,(%rax)
0xffffffff81171966 <ixgbe_rxeof+1206>:  mov    0x8(%rcx),%rsi
0xffffffff8117196a <ixgbe_rxeof+1210>:  mov    %rsi,0x8(%rax)
0xffffffff8117196e <ixgbe_rxeof+1214>:  mov    0x10(%rcx),%rsi
0xffffffff81171972 <ixgbe_rxeof+1218>:  mov    %rsi,0x10(%rax)
0xffffffff81171976 <ixgbe_rxeof+1222>:  mov    0x18(%rcx),%rsi
0xffffffff8117197a <ixgbe_rxeof+1226>:  mov    %rsi,0x18(%rax)
0xffffffff8117197e <ixgbe_rxeof+1230>:  add    $0xffffffffffffffe0,%edx
0xffffffff81171981 <ixgbe_rxeof+1233>:  add    $0x20,%rcx
0xffffffff81171985 <ixgbe_rxeof+1237>:  add    $0x20,%rax
0xffffffff81171989 <ixgbe_rxeof+1241>:  test   %edx,%edx

5594 0xffffffff8053395a mb_free_ext /usr/src/sys/kern/uipc_mbuf.c:301

        if (*(m->m_ext.ref_cnt) == 1 ||


I am able to collect and process more measurements to help
improve the FreeBSD network stack.

Does anyone have any ideas about this?
I don't see any evident and simple points of optimisation :(



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20160826151729.GG88122>