From owner-freebsd-net@FreeBSD.ORG  Wed Jul  9 05:30:24 2008
Return-Path: <owner-freebsd-net@FreeBSD.ORG>
Delivered-To: freebsd-net@freebsd.org
Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34])
	by hub.freebsd.org (Postfix) with ESMTP id C848B106566B
	for <freebsd-net@freebsd.org>; Wed,  9 Jul 2008 05:30:24 +0000 (UTC)
	(envelope-from brde@optusnet.com.au)
Received: from mail02.syd.optusnet.com.au (mail02.syd.optusnet.com.au
	[211.29.132.183])
	by mx1.freebsd.org (Postfix) with ESMTP id 5037E8FC12
	for <freebsd-net@freebsd.org>; Wed,  9 Jul 2008 05:30:24 +0000 (UTC)
	(envelope-from brde@optusnet.com.au)
Received: from c220-239-252-11.carlnfd3.nsw.optusnet.com.au
	(c220-239-252-11.carlnfd3.nsw.optusnet.com.au [220.239.252.11])
	by mail02.syd.optusnet.com.au (8.13.1/8.13.1) with ESMTP id
	m695UETn030620
	(version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-SHA bits=256 verify=NO);
	Wed, 9 Jul 2008 15:30:16 +1000
Date: Wed, 9 Jul 2008 15:30:14 +1000 (EST)
From: Bruce Evans <brde@optusnet.com.au>
X-X-Sender: bde@delplex.bde.org
To: Peter Jeremy <peterjeremy@optushome.com.au>
In-Reply-To: <20080707221257.GH62764@server.vk2pj.dyndns.org>
Message-ID: <20080709142008.H26105@delplex.bde.org>
References: <2d3001c8def1$f4309b90$020b000a@bartwrkstxp>
	<486FFF70.3090402@gtcomm.net>
	<48701921.7090107@gtcomm.net> <4871E618.1080500@freebsd.org>
	<20080708002228.G680@besplex.bde.org> <48724238.2020103@freebsd.org>
	<20080708034304.R21502@delplex.bde.org>
	<20080708045135.V1022@besplex.bde.org>
	<ed91d4a80807071227q5d9ca283g59270a1ab92c80a9@mail.gmail.com>
	<48727BA9.6020702@elischer.org>
	<20080707221257.GH62764@server.vk2pj.dyndns.org>
MIME-Version: 1.0
Content-Type: TEXT/PLAIN; charset=US-ASCII; format=flowed
Cc: FreeBSD Net <freebsd-net@freebsd.org>,
	Julian Elischer <julian@elischer.org>
Subject: Re: Freebsd IP Forwarding performance (question, and some info)
 [7-stable, current, em, smp]
X-BeenThere: freebsd-net@freebsd.org
X-Mailman-Version: 2.1.5
Precedence: list
List-Id: Networking and TCP/IP with FreeBSD <freebsd-net.freebsd.org>
List-Unsubscribe: <http://lists.freebsd.org/mailman/listinfo/freebsd-net>,
	<mailto:freebsd-net-request@freebsd.org?subject=unsubscribe>
List-Archive: <http://lists.freebsd.org/pipermail/freebsd-net>
List-Post: <mailto:freebsd-net@freebsd.org>
List-Help: <mailto:freebsd-net-request@freebsd.org?subject=help>
List-Subscribe: <http://lists.freebsd.org/mailman/listinfo/freebsd-net>,
	<mailto:freebsd-net-request@freebsd.org?subject=subscribe>
X-List-Received-Date: Wed, 09 Jul 2008 05:30:24 -0000

On Tue, 8 Jul 2008, Peter Jeremy wrote:

> On 2008-Jul-07 13:25:13 -0700, Julian Elischer <julian@elischer.org> wrote:
>> what you need is a speculative prefetch where you can tell the
>> processor "We will probably need the following address so start
>> getting it while we go do other stuff".
>
> This looks like the PREFETCH instructions that exist in at least amd64
> and SPARC.  Unfortunately, their optimal use is very implementation-
> dependent and the AMD documentation suggests that incorrect use can
> degrade performance.

I use the following hacks to test these in my version of bge in ~5.2:

% Index: dev/bge/if_bge.c
% ===================================================================
% RCS file: /home/ncvs/src/sys/dev/bge/if_bge.c,v
% retrieving revision 1.84
% diff -u -2 -r1.84 if_bge.c
% --- dev/bge/if_bge.c	12 Mar 2005 06:51:25 -0000	1.84
% +++ dev/bge/if_bge.c	8 Jul 2008 04:49:12 -0000
% @@ -2690,4 +2845,11 @@
%   */
% 
% +int bge_prefetch = 1;
% +int bge_nprefetchnta = 0;
% +int bge_nprefetch = 0x40;
% +int bge_nprefetchw = 0;
% +int bge_nprefetch0 = 0;
% +int bge_nprefetch1 = 0;
% +int bge_nprefetch2 = 0;
%  static void
%  bge_rxeof(sc)
% @@ -2789,4 +2960,35 @@
%  #endif
%  		eh = mtod(m, struct ether_header *);
% +		if (bge_prefetch) {
% +			struct cl {
% +				char	cl_data[64];	/* XXX */
% +			} *clp;
% +			int i, j;
% +
% +			/* XXX misalignment is likely. */
% +			clp = mtod(m, struct cl *);
% +#ifdef __i386__ /* XXX actually 3dnow */
% +			for (i = 0, j = 0; i < bge_nprefetchnta;
% +			    i += sizeof(*clp), j++)
% +				__asm("prefetchnta %0" : : "m" (clp[j]));
% +			for (i = 0, j = 0; i < bge_nprefetch;
% +			    i += sizeof(*clp), j++)
% +				__asm("prefetch %0" : : "m" (clp[j]));
% +			for (i = 0, j = 0; i < bge_nprefetchw;
% +			    i += sizeof(*clp), j++)
% +				__asm("prefetchw %0" : : "m" (clp[j]));
% +#endif
% +#ifdef __amd64__
% +			for (i = 0, j = 0; i < bge_nprefetch0;
% +			    i += sizeof(*clp), j++)
% +				__asm("prefetch0 %0" : : "m" (clp[j]));
% +			for (i = 0, j = 0; i < bge_nprefetch1;
% +			    i += sizeof(*clp), j++)
% +				__asm("prefetch1 %0" : : "m" (clp[j]));
% +			for (i = 0, j = 0; i < bge_nprefetch2;
% +			    i += sizeof(*clp), j++)
% +				__asm("prefetch2 %0" : : "m" (clp[j]));
% +#endif
% +		}
%  		m->m_pkthdr.len = m->m_len = cur_rx->bge_len - ETHER_CRC_LEN;
%  		m->m_pkthdr.rcvif = ifp;
% Index: net/if_ethersubr.c
% ===================================================================
% RCS file: /home/ncvs/src/sys/net/if_ethersubr.c,v
% retrieving revision 1.174
% diff -u -2 -r1.174 if_ethersubr.c
% --- net/if_ethersubr.c	24 Jun 2004 12:31:44 -0000	1.174
% +++ net/if_ethersubr.c	7 Jul 2008 18:31:13 -0000
% @@ -479,4 +479,5 @@
%   * mbuf chain m with the ethernet header at the front.
%   */
% +int monearly = 0;
%  static void
%  ether_input(struct ifnet *ifp, struct mbuf *m)
% @@ -485,4 +486,12 @@
%  	u_short etype;
% 
% +	if (monearly && ifp->if_flags & IFF_MONITOR) {
% +		/*
% +		 * Interface marked for monitoring; discard packet.
% +		 */
% +		m_freem(m);
% +		return;
% +	}
% +
%  	/*
%  	 * Do consistency checks to verify assumptions

The results were underwhelming and contrary to Andre's assertion that the
primary bottleneck (apart from PCI32) is hardware-related cache misses
(I think it is software-related cache misses).

I previously reported that fixing monitor mode avoids 1 cache miss and
thus saves 5% CPU.  Plain prefetch forces this cache miss (but no other
hardware-related ones, since there are no other hardware-related ones
in upper layers) to occur asynchronously and always occur.  However,
it only saves 2% in unfixed normal mode and in unfixed monitor mode
(in fixed monitor mode, it makes little difference except to not avoid
the cache miss -- since the cache miss is asynchronous it doesn't
affect %CPU much).  Even 5% is a relatively uninteresting savings, since
the non-hardware related CPU overhead is 10 times as much as that.  I'm
testing only receive of udp packets with a payload of 5 bytes (padded),
so the whole packet fits in 64 bytes and there is only 1 hardware-related
cache miss per packet to avoid or prefetch.  The precise size is 60
(64 - CRC_size I think).  m->m_data is always misaligned at an offset of
2 bytes from a 64-byte cache line boundary, so prefetching 64 bytes at this
address is not quite right, but since the 60 bytes all fit in 1 cache
line, the prefetch fetches enough.

prefetchnta as in Andre's old patch (16 Dec 2004) didn't seem to work.
I also prefetch as soon as possible in the driver interrupt handler
where Andre's old patch prefetches in ether_input() where this is
almost certainly too late.  The difference between the 5% and the 2%
savings may be due to it also being too late in the driver interrupt
handler.  Someone mentioned not caring about latency.  Doing something
else to wait for all the prefetches made by the interrupt handler to
complete might help here, but only if you could find something useful
to do (hard), and I think latency would just increase the slowness in
most cases since significant latency would require long queues and the
long queues would bust caches (starting with discarding all the
prefetches).

Andre's old patch uses a hard-coded prefetch size of 74 (76 after
source alignment and 128 after rounding up) where mine uses a parameter
of 64 (66 after virtual source alignment and 64 after rounding down).
This would cause an unnecessary extra cache miss for small packets.
It too only tries to prefetch the packet header, but allows for tcp
and tcp options so a small packet's headers alone are larger than 64
bytes.  The extra cache miss for never-accessed data shouldn't cost
much since it uses prefetchnta.  (All of my tests are on an Athlon64
where prefetchnta actually works, unlike on AthlonXP.  But actually
working might be responsible for it not being very effective here.
To work, it must not be too aggressive or it will cost too much for
never-accessed data.)

Timings (some repeated), all for ttcp receiving on bge0 at 397 kpps:

-monitor: 35% idle (8.0-CURRENT)  14 cm/p
  monitor: 83% idle (8.0-CURRENT)   6 cm/p
+monitor: 85% idle (8.0-CURRENT)   5 cm/p
-monitor: 17% idle (~5.2)         19 cm/p
           17-19
  monitor: 66% idle (~5.2)          8 cm/p
 	  66-68
+monitor: 71% idle (~5.2)          7 cm/p
           70-75

cm/p = k8-dc-misses (bge0 system)
+monitor is monitor mode with the exit moved to the top of ether_input().
Patch for ~5.2 now included.

Results with prefetch are not actually shown above since I forgot half of the
details.  cm/p was unchanged except for +monitor, where it increased (by the
unused prefetch).  %idle decreased by 1-2% (less in -current where there
is less slop) except for +monitor.

Note that -current has many improvements over ~5.2 in both %CPU and cache
misses for receiving.  But for sending, -current gives a 10% lower rate
for the same CPU (100%) though it reduces cache misses.

Simplified or improved patches for -current:

% diff -c2 ./dev/bge/if_bge.c~ ./dev/bge/if_bge.c
% *** ./dev/bge/if_bge.c~	Fri May 16 16:39:01 2008
% --- ./dev/bge/if_bge.c	Tue Jul  8 07:58:52 2008
% ***************
% *** 3017,3020 ****
% --- 3133,3137 ----
%    */
% 
% + int bge_prefetch = 1;
%   static void
%   bge_rxeof(struct bge_softc *sc)
% ***************
% *** 3126,3129 ****
% --- 3252,3257 ----
%   		m->m_pkthdr.len = m->m_len = cur_rx->bge_len - ETHER_CRC_LEN;
%   		m->m_pkthdr.rcvif = ifp;
% + 		if (bge_prefetch)
% + 			__asm("prefetch %0" : : "m" (*mtod(m, char *)));
% 
%   		if (ifp->if_capenable & IFCAP_RXCSUM) {
% diff -c2 ./net/if_ethersubr.c~ ./net/if_ethersubr.c
% *** ./net/if_ethersubr.c~	Fri May 16 16:41:45 2008
% --- ./net/if_ethersubr.c	Tue Jul  8 07:55:14 2008
% ***************
% *** 509,512 ****
% --- 507,511 ----
%    * mbuf chain m with the ethernet header at the front.
%    */
% + int broken_monitor = 0;
%   static void
%   ether_input(struct ifnet *ifp, struct mbuf *m)
% ***************
% *** 546,550 ****
%   	}
%   	eh = mtod(m, struct ether_header *);
% - 	etype = ntohs(eh->ether_type);
%   	if (m->m_pkthdr.rcvif == NULL) {
%   		if_printf(ifp, "discard frame w/o interface pointer\n");
% --- 545,548 ----
% ***************
% *** 560,564 ****
%   #endif
% 
% ! 	if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
%   		if (ETHER_IS_BROADCAST(eh->ether_dhost))
%   			m->m_flags |= M_BCAST;
% --- 558,564 ----
%   #endif
% 
% ! 	if (((ifp->if_flags & IFF_MONITOR) == 0 || broken_monitor) &&
% ! 	    ETHER_IS_MULTICAST(eh->ether_dhost)) {
% ! 		/* XXX bpf might need this even in monitor mode. */
%   		if (ETHER_IS_BROADCAST(eh->ether_dhost))
%   			m->m_flags |= M_BCAST;
% ***************
% *** 616,619 ****
% --- 616,620 ----
%   	 * TODO: Deal with Q-in-Q frames, but not arbitrary nesting levels.
%   	 */
% + 	etype = ntohs(eh->ether_type);
%   	if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_VLAN) {
%   		struct ether_vlan_header *evl;

Bruce