From owner-freebsd-net@FreeBSD.ORG Wed Jul 9 05:30:24 2008 Return-Path: Delivered-To: freebsd-net@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id C848B106566B for ; Wed, 9 Jul 2008 05:30:24 +0000 (UTC) (envelope-from brde@optusnet.com.au) Received: from mail02.syd.optusnet.com.au (mail02.syd.optusnet.com.au [211.29.132.183]) by mx1.freebsd.org (Postfix) with ESMTP id 5037E8FC12 for ; Wed, 9 Jul 2008 05:30:24 +0000 (UTC) (envelope-from brde@optusnet.com.au) Received: from c220-239-252-11.carlnfd3.nsw.optusnet.com.au (c220-239-252-11.carlnfd3.nsw.optusnet.com.au [220.239.252.11]) by mail02.syd.optusnet.com.au (8.13.1/8.13.1) with ESMTP id m695UETn030620 (version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-SHA bits=256 verify=NO); Wed, 9 Jul 2008 15:30:16 +1000 Date: Wed, 9 Jul 2008 15:30:14 +1000 (EST) From: Bruce Evans X-X-Sender: bde@delplex.bde.org To: Peter Jeremy In-Reply-To: <20080707221257.GH62764@server.vk2pj.dyndns.org> Message-ID: <20080709142008.H26105@delplex.bde.org> References: <2d3001c8def1$f4309b90$020b000a@bartwrkstxp> <486FFF70.3090402@gtcomm.net> <48701921.7090107@gtcomm.net> <4871E618.1080500@freebsd.org> <20080708002228.G680@besplex.bde.org> <48724238.2020103@freebsd.org> <20080708034304.R21502@delplex.bde.org> <20080708045135.V1022@besplex.bde.org> <48727BA9.6020702@elischer.org> <20080707221257.GH62764@server.vk2pj.dyndns.org> MIME-Version: 1.0 Content-Type: TEXT/PLAIN; charset=US-ASCII; format=flowed Cc: FreeBSD Net , Julian Elischer Subject: Re: Freebsd IP Forwarding performance (question, and some info) [7-stable, current, em, smp] X-BeenThere: freebsd-net@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: Networking and TCP/IP with FreeBSD List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 09 Jul 2008 05:30:24 -0000 On Tue, 8 Jul 2008, Peter Jeremy wrote: > On 2008-Jul-07 13:25:13 -0700, Julian Elischer wrote: 
>> what you need is a speculative prefetch where you can tell the >> processor "We will probably need the following address so start >> getting it while we go do other stuff". > > This looks like the PREFETCH instructions that exist in at least amd64 > and SPARC. Unfortunately, their optimal use is very implementation- > dependent and the AMD documentation suggests that incorrect use can > degrade performance. I use the following hacks to test these in my version of bge in ~5.2: % Index: dev/bge/if_bge.c % =================================================================== % RCS file: /home/ncvs/src/sys/dev/bge/if_bge.c,v % retrieving revision 1.84 % diff -u -2 -r1.84 if_bge.c % --- dev/bge/if_bge.c 12 Mar 2005 06:51:25 -0000 1.84 % +++ dev/bge/if_bge.c 8 Jul 2008 04:49:12 -0000 % @@ -2690,4 +2845,11 @@ % */ % % +int bge_prefetch = 1; % +int bge_nprefetchnta = 0; % +int bge_nprefetch = 0x40; % +int bge_nprefetchw = 0; % +int bge_nprefetch0 = 0; % +int bge_nprefetch1 = 0; % +int bge_nprefetch2 = 0; % static void % bge_rxeof(sc) % @@ -2789,4 +2960,35 @@ % #endif % eh = mtod(m, struct ether_header *); % + if (bge_prefetch) { % + struct cl { % + char cl_data[64]; /* XXX */ % + } *clp; % + int i, j; % + % + /* XXX misalignment is likely. 
*/ % + clp = mtod(m, struct cl *); % +#ifdef __i386__ /* XXX actually 3dnow */ % + for (i = 0, j = 0; i < bge_nprefetchnta; % + i += sizeof(*clp), j++) % + __asm("prefetchnta %0" : : "m" (clp[j])); % + for (i = 0, j = 0; i < bge_nprefetch; % + i += sizeof(*clp), j++) % + __asm("prefetch %0" : : "m" (clp[j])); % + for (i = 0, j = 0; i < bge_nprefetchw; % + i += sizeof(*clp), j++) % + __asm("prefetchw %0" : : "m" (clp[j])); % +#endif % +#ifdef __amd64__ % + for (i = 0, j = 0; i < bge_nprefetch0; % + i += sizeof(*clp), j++) % + __asm("prefetch0 %0" : : "m" (clp[j])); % + for (i = 0, j = 0; i < bge_nprefetch1; % + i += sizeof(*clp), j++) % + __asm("prefetch1 %0" : : "m" (clp[j])); % + for (i = 0, j = 0; i < bge_nprefetch2; % + i += sizeof(*clp), j++) % + __asm("prefetch2 %0" : : "m" (clp[j])); % +#endif % + } % m->m_pkthdr.len = m->m_len = cur_rx->bge_len - ETHER_CRC_LEN; % m->m_pkthdr.rcvif = ifp; % Index: net/if_ethersubr.c % =================================================================== % RCS file: /home/ncvs/src/sys/net/if_ethersubr.c,v % retrieving revision 1.174 % diff -u -2 -r1.174 if_ethersubr.c % --- net/if_ethersubr.c 24 Jun 2004 12:31:44 -0000 1.174 % +++ net/if_ethersubr.c 7 Jul 2008 18:31:13 -0000 % @@ -479,4 +479,5 @@ % * mbuf chain m with the ethernet header at the front. % */ % +int monearly = 0; % static void % ether_input(struct ifnet *ifp, struct mbuf *m) % @@ -485,4 +486,12 @@ % u_short etype; % % + if (monearly && ifp->if_flags & IFF_MONITOR) { % + /* % + * Interface marked for monitoring; discard packet. % + */ % + m_freem(m); % + return; % + } % + % /* % * Do consistency checks to verify assumptions The results were underwhelming and contrary to Andre's assertion that the primary bottleneck (apart from PCI32) is hardware-related cache misses (I think it is software-related cache misses). I previously reported that fixing monitor mode avoids 1 cache miss and thus saves 5% CPU. 
Plain prefetch forces this cache miss (but no other hardware-related ones, since there are no other hardware-related ones in upper layers) to occur asynchronously and to always occur. However, it only saves 2% in unfixed normal mode and in unfixed monitor mode (in fixed monitor mode, it makes little difference except to not avoid the cache miss -- since the cache miss is asynchronous it doesn't affect %CPU much). Even 5% is a relatively uninteresting savings, since the non-hardware related CPU overhead is 10 times as much as that. I'm testing only receive of UDP packets with a payload of 5 bytes (padded), so the whole packet fits in 64 bytes and there is only 1 hardware-related cache miss per packet to avoid or prefetch. The precise size is 60 (64 - CRC_size I think). m->m_data is always misaligned at an offset of 2 bytes from a 64-byte cache line boundary, so prefetching 64 bytes at this address is not quite right, but since the 60 bytes all fit in 1 cache line, the prefetch fetches enough. prefetchnta as in Andre's old patch (16 Dec 2004) didn't seem to work. I also prefetch as soon as possible in the driver interrupt handler, whereas Andre's old patch prefetches in ether_input(), where this is almost certainly too late. The difference between the 5% and the 2% savings may be due to it also being too late in the driver interrupt handler. Someone mentioned not caring about latency. Doing something else to wait for all the prefetches made by the interrupt handler to complete might help here, but only if you could find something useful to do (hard), and I think latency would just increase the slowness in most cases since significant latency would require long queues and the long queues would bust caches (starting with discarding all the prefetches). Andre's old patch uses a hard-coded prefetch size of 74 (76 after source alignment and 128 after rounding up), whereas mine uses a parameter of 64 (66 after virtual source alignment and 64 after rounding down). 
This would cause an unnecessary extra cache miss for small packets. It too only tries to prefetch the packet header, but allows for TCP and TCP options so a small packet's headers alone are larger than 64 bytes. The extra cache miss for never-accessed data shouldn't cost much since it uses prefetchnta. (All of my tests are on an Athlon64 where prefetchnta actually works, unlike on AthlonXP. But actually working might be responsible for it not being very effective here. To work, it must not be too aggressive or it will cost too much for never-accessed data.) Timings (some repeated), all for ttcp receiving on bge0 at 397 kpps: -monitor: 35% idle (8.0-CURRENT) 14 cm/p monitor: 83% idle (8.0-CURRENT) 6 cm/p +monitor: 85% idle (8.0-CURRENT) 5 cm/p -monitor: 17% idle (~5.2) 19 cm/p 17-19 monitor: 66% idle (~5.2) 8 cm/p 66-68 +monitor: 71% idle (~5.2) 7 cm/p 70-75 cm/p = k8-dc-misses (bge0 system) +monitor is monitor mode with the exit moved to the top of ether_input(). Patch for ~5.2 now included. Results with prefetch are not actually shown above since I forgot half of the details. cm/p was unchanged, except that for +monitor it is increased (by the unused prefetch). %idle decreased by 1-2% (less in -current where there is less slop) except for +monitor. Note that -current has many improvements over ~5.2 in both %CPU and cache misses for receiving. But for sending, -current gives a 10% lower rate for the same CPU (100%) though it reduces cache misses. 
Simplified or improved patches for -current: % diff -c2 ./dev/bge/if_bge.c~ ./dev/bge/if_bge.c % *** ./dev/bge/if_bge.c~ Fri May 16 16:39:01 2008 % --- ./dev/bge/if_bge.c Tue Jul 8 07:58:52 2008 % *************** % *** 3017,3020 **** % --- 3133,3137 ---- % */ % % + int bge_prefetch = 1; % static void % bge_rxeof(struct bge_softc *sc) % *************** % *** 3126,3129 **** % --- 3252,3257 ---- % m->m_pkthdr.len = m->m_len = cur_rx->bge_len - ETHER_CRC_LEN; % m->m_pkthdr.rcvif = ifp; % + if (bge_prefetch) % + __asm("prefetch %0" : : "m" (*mtod(m, char *))); % % if (ifp->if_capenable & IFCAP_RXCSUM) { % diff -c2 ./net/if_ethersubr.c~ ./net/if_ethersubr.c % *** ./net/if_ethersubr.c~ Fri May 16 16:41:45 2008 % --- ./net/if_ethersubr.c Tue Jul 8 07:55:14 2008 % *************** % *** 509,512 **** % --- 507,511 ---- % * mbuf chain m with the ethernet header at the front. % */ % + int broken_monitor = 0; % static void % ether_input(struct ifnet *ifp, struct mbuf *m) % *************** % *** 546,550 **** % } % eh = mtod(m, struct ether_header *); % - etype = ntohs(eh->ether_type); % if (m->m_pkthdr.rcvif == NULL) { % if_printf(ifp, "discard frame w/o interface pointer\n"); % --- 545,548 ---- % *************** % *** 560,564 **** % #endif % % ! if (ETHER_IS_MULTICAST(eh->ether_dhost)) { % if (ETHER_IS_BROADCAST(eh->ether_dhost)) % m->m_flags |= M_BCAST; % --- 558,564 ---- % #endif % % ! if (((ifp->if_flags & IFF_MONITOR) == 0 || broken_monitor) && % ! ETHER_IS_MULTICAST(eh->ether_dhost)) { % ! /* XXX bpf might need this even in monitor mode. */ % if (ETHER_IS_BROADCAST(eh->ether_dhost)) % m->m_flags |= M_BCAST; % *************** % *** 616,619 **** % --- 616,620 ---- % * TODO: Deal with Q-in-Q frames, but not arbitrary nesting levels. % */ % + etype = ntohs(eh->ether_type); % if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_VLAN) { % struct ether_vlan_header *evl; Bruce