Date: Mon, 18 May 2009 06:46:34 +0000 (UTC) From: Kip Macy <kmacy@FreeBSD.org> To: src-committers@freebsd.org, svn-src-user@freebsd.org Subject: svn commit: r192295 - in user/kmacy/releng_7_2_fcs/sys: amd64/conf conf dev/e1000 i386/conf net netinet Message-ID: <200905180646.n4I6kYrv052564@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: kmacy Date: Mon May 18 06:46:34 2009 New Revision: 192295 URL: http://svn.freebsd.org/changeset/base/192295 Log: Import changes from HEAD 191038 191154 add utility routine for updating an struct llentry * 191158 191159 191160 191161 191162 191221 191255 191257 191258 191259 191324 191440 191441 191442 191603 191611 191612 Added: user/kmacy/releng_7_2_fcs/sys/net/flowtable.c user/kmacy/releng_7_2_fcs/sys/net/flowtable.h Modified: user/kmacy/releng_7_2_fcs/sys/amd64/conf/DEFAULTS user/kmacy/releng_7_2_fcs/sys/conf/NOTES user/kmacy/releng_7_2_fcs/sys/conf/files user/kmacy/releng_7_2_fcs/sys/conf/options user/kmacy/releng_7_2_fcs/sys/dev/e1000/if_em.c user/kmacy/releng_7_2_fcs/sys/dev/e1000/if_em.h user/kmacy/releng_7_2_fcs/sys/i386/conf/DEFAULTS user/kmacy/releng_7_2_fcs/sys/net/if.c user/kmacy/releng_7_2_fcs/sys/net/if_bridge.c user/kmacy/releng_7_2_fcs/sys/net/if_llatbl.c user/kmacy/releng_7_2_fcs/sys/net/if_llatbl.h user/kmacy/releng_7_2_fcs/sys/net/if_var.h user/kmacy/releng_7_2_fcs/sys/netinet/in_pcb.h user/kmacy/releng_7_2_fcs/sys/netinet/ip_input.c user/kmacy/releng_7_2_fcs/sys/netinet/ip_output.c user/kmacy/releng_7_2_fcs/sys/netinet/vinet.h Modified: user/kmacy/releng_7_2_fcs/sys/amd64/conf/DEFAULTS ============================================================================== --- user/kmacy/releng_7_2_fcs/sys/amd64/conf/DEFAULTS Mon May 18 06:32:38 2009 (r192294) +++ user/kmacy/releng_7_2_fcs/sys/amd64/conf/DEFAULTS Mon May 18 06:46:34 2009 (r192295) @@ -16,9 +16,9 @@ device io # I/O device device uart_ns8250 # Default partitioning schemes -options GEOM_BSD -options GEOM_MBR - -# KSE support went from being default to a kernel option -options KSE options VIMAGE_GLOBALS +options GEOM_PART_BSD +options GEOM_PART_MBR + +options FLOWTABLE + Modified: user/kmacy/releng_7_2_fcs/sys/conf/NOTES ============================================================================== --- user/kmacy/releng_7_2_fcs/sys/conf/NOTES Mon May 18 06:32:38 2009 (r192294) +++ user/kmacy/releng_7_2_fcs/sys/conf/NOTES Mon May 18 06:46:34 2009 (r192295) @@ -549,6 +549,9 @@ options LIBMCHAIN # libalias library, performing NAT options LIBALIAS +# flowtable cache +options FLOWTABLE + # # SCTP is a NEW transport protocol defined by # RFC2960 updated by RFC3309 and RFC3758.. and Modified: user/kmacy/releng_7_2_fcs/sys/conf/files ============================================================================== --- user/kmacy/releng_7_2_fcs/sys/conf/files Mon May 18 06:32:38 2009 (r192294) +++ user/kmacy/releng_7_2_fcs/sys/conf/files Mon May 18 06:46:34 2009 (r192295) @@ -1806,6 +1806,7 @@ net/bpf_filter.c optional bpf | netgrap net/bpf_zerocopy.c optional bpf net/bridgestp.c optional bridge | if_bridge net/bsd_comp.c optional ppp_bsdcomp +net/flowtable.c optional flowtable net/ieee8023ad_lacp.c optional lagg net/if.c standard net/if_arcsubr.c optional arcnet Modified: user/kmacy/releng_7_2_fcs/sys/conf/options ============================================================================== --- user/kmacy/releng_7_2_fcs/sys/conf/options Mon May 18 06:32:38 2009 (r192294) +++ user/kmacy/releng_7_2_fcs/sys/conf/options Mon May 18 06:46:34 2009 (r192295) @@ -405,6 +405,7 @@ VLAN_ARRAY opt_vlan.h XBONEHACK KRPC NFSLOCKD +FLOWTABLE opt_route.h # # SCTP Modified: user/kmacy/releng_7_2_fcs/sys/dev/e1000/if_em.c ============================================================================== --- user/kmacy/releng_7_2_fcs/sys/dev/e1000/if_em.c Mon May 18 06:32:38 2009 (r192294) +++ user/kmacy/releng_7_2_fcs/sys/dev/e1000/if_em.c Mon May 18 06:46:34 2009 (r192295) @@ -893,6 +893,7 @@ em_detach(device_t dev) if_free(ifp); drbr_free(adapter->br, M_DEVBUF); + drbr_free(adapter->br, M_DEVBUF); em_free_transmit_structures(adapter); em_free_receive_structures(adapter); @@ -987,7 +988,7 @@ em_resume(device_t dev) * the packet is requeued. **********************************************************************/ -#ifdef IFNET_MULTIQUEUE +#ifdef IFNET_BUF_RING static int em_transmit_locked(struct ifnet *ifp, struct mbuf *m) { @@ -1000,68 +1001,63 @@ em_transmit_locked(struct ifnet *ifp, st || (!adapter->link_active)) { error = drbr_enqueue(ifp, adapter->br, m); return (error); - } - - if (buf_ring_empty(adapter->br) && + } else if (ADAPTER_RING_EMPTY(adapter) && (adapter->num_tx_desc_avail > EM_TX_OP_THRESHOLD)) { if (em_xmit(adapter, &m)) { - if (m && (error = drbr_enqueue(ifp, adapter->br, m)) != 0) { + if (m && (error = drbr_enqueue(ifp, adapter->br, m)) != 0) return (error); - } - } else{ - /* Send a copy of the frame to the BPF listener */ + } else { + /* + * We've bypassed the buf ring so we need to update + * ifp directly + */ + drbr_stats_update(ifp, m->m_pkthdr.len, m->m_flags); + /* + ** Send a copy of the frame to the BPF + ** listener and set the watchdog on. + */ ETHER_BPF_MTAP(ifp, m); } } else if ((error = drbr_enqueue(ifp, adapter->br, m)) != 0) return (error); - if (!buf_ring_empty(adapter->br)) + if (!ADAPTER_RING_EMPTY(adapter)) em_start_locked(ifp); return (0); } -static void -em_start_locked(struct ifnet *ifp) +static int +em_transmit(struct ifnet *ifp, struct mbuf *m) { - struct adapter *adapter = ifp->if_softc; - struct mbuf *m_head; - - EM_TX_LOCK_ASSERT(adapter); - - if ((ifp->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) != - IFF_DRV_RUNNING) - return; - if (!adapter->link_active) - return; - - while ((adapter->num_tx_desc_avail > EM_TX_OP_THRESHOLD) - && (!buf_ring_empty(adapter->br))) { + + struct adapter *adapter = ifp->if_softc; + int error = 0; - m_head = buf_ring_dequeue_sc(adapter->br); - if (m_head == NULL) - break; - /* - * Encapsulation can modify our pointer, and or make it - * NULL on failure. In that event, we can't requeue. - */ - if (em_xmit(adapter, &m_head)) { - if (m_head == NULL) - break; - break; - } + if(EM_TX_TRYLOCK(adapter)) { + if (ifp->if_drv_flags & IFF_DRV_RUNNING) + error = em_transmit_locked(ifp, m); + EM_TX_UNLOCK(adapter); + } else + error = drbr_enqueue(ifp, adapter->br, m); - /* Send a copy of the frame to the BPF listener */ - ETHER_BPF_MTAP(ifp, m_head); + return (error); +} - /* Set timeout in case hardware has problems transmitting. */ - adapter->watchdog_timer = EM_TX_TIMEOUT; - } - if ((adapter->num_tx_desc_avail <= EM_TX_OP_THRESHOLD)) - ifp->if_drv_flags |= IFF_DRV_OACTIVE; +static void +em_qflush(struct ifnet *ifp) +{ + struct mbuf *m; + struct adapter *adapter = (struct adapter *)ifp->if_softc; + EM_TX_LOCK(adapter); + while ((m = buf_ring_dequeue_sc(adapter->br)) != NULL) + m_freem(m); + if_qflush(ifp); + EM_TX_UNLOCK(adapter); } -#else +#endif + static void em_start_locked(struct ifnet *ifp) { @@ -1076,9 +1072,10 @@ em_start_locked(struct ifnet *ifp) if (!adapter->link_active) return; - while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { + while ((adapter->num_tx_desc_avail > EM_TX_OP_THRESHOLD) + && (!ADAPTER_RING_EMPTY(adapter))) { - IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); + m_head = em_dequeue(ifp, adapter->br); if (m_head == NULL) break; /* @@ -1088,8 +1085,10 @@ em_start_locked(struct ifnet *ifp) if (em_xmit(adapter, &m_head)) { if (m_head == NULL) break; +#ifndef IFNET_BUF_RING ifp->if_drv_flags |= IFF_DRV_OACTIVE; IFQ_DRV_PREPEND(&ifp->if_snd, m_head); +#endif break; } @@ -1099,8 +1098,10 @@ em_start_locked(struct ifnet *ifp) /* Set timeout in case hardware has problems transmitting. */ adapter->watchdog_timer = EM_TX_TIMEOUT; } + if ((adapter->num_tx_desc_avail <= EM_TX_OP_THRESHOLD)) + ifp->if_drv_flags |= IFF_DRV_OACTIVE; + } -#endif static void em_start(struct ifnet *ifp) @@ -1113,23 +1114,6 @@ em_start(struct ifnet *ifp) EM_TX_UNLOCK(adapter); } -static int -em_transmit(struct ifnet *ifp, struct mbuf *m) -{ - - struct adapter *adapter = ifp->if_softc; - int error = 0; - - if(EM_TX_TRYLOCK(adapter)) { - if (ifp->if_drv_flags & IFF_DRV_RUNNING) - error = em_transmit_locked(ifp, m); - EM_TX_UNLOCK(adapter); - } else - error = drbr_enqueue(ifp, adapter->br, m); - - return (error); -} - /********************************************************************* * Ioctl entry point * @@ -1693,11 +1677,7 @@ em_poll(struct ifnet *ifp, enum poll_cmd EM_TX_LOCK(adapter); em_txeof(adapter); -#ifdef IFNET_MULTIQUEUE - if (!buf_ring_empty(adapter->br)) -#else - if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) -#endif + if (!ADAPTER_RING_EMPTY(adapter)) em_start_locked(ifp); EM_TX_UNLOCK(adapter); } @@ -1767,13 +1747,7 @@ em_intr(void *arg) if (ifp->if_drv_flags & IFF_DRV_RUNNING && -#ifdef IFNET_MULTIQUEUE - !buf_ring_empty(adapter->br) -#else - !IFQ_DRV_IS_EMPTY(&ifp->if_snd) -#endif - ) - + !ADAPTER_RING_EMPTY(adapter)) em_start(ifp); } @@ -1812,11 +1786,7 @@ em_handle_rxtx(void *context, int pendin EM_TX_LOCK(adapter); em_txeof(adapter); -#ifdef IFNET_MULTIQUEUE - if (!buf_ring_empty(adapter->br)) -#else - if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) -#endif + if (!ADAPTER_RING_EMPTY(adapter)) em_start_locked(ifp); EM_TX_UNLOCK(adapter); } @@ -1824,43 +1794,6 @@ em_handle_rxtx(void *context, int pendin em_enable_intr(adapter); } -static void -em_handle_rx(void *context, int pending) -{ - struct adapter *adapter = context; - struct ifnet *ifp = adapter->ifp; - - if ((ifp->if_drv_flags & IFF_DRV_RUNNING) && - (em_rxeof(adapter, adapter->rx_process_limit) != 0)) - taskqueue_enqueue(adapter->tq, &adapter->rx_task); - -} - -static void -em_handle_tx(void *context, int pending) -{ - struct adapter *adapter = context; - struct ifnet *ifp = adapter->ifp; - - if (ifp->if_drv_flags & IFF_DRV_RUNNING) { -#ifdef IFNET_MULTIQUEUE - if (!EM_TX_TRYLOCK(adapter)) - return; -#else - EM_TX_LOCK(adapter); -#endif - - em_txeof(adapter); -#ifdef IFNET_MULTIQUEUE - if (!buf_ring_empty(adapter->br)) -#else - if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) -#endif - em_start_locked(ifp); - EM_TX_UNLOCK(adapter); - } -} - /********************************************************************* * * Fast Legacy/MSI Combined Interrupt Service routine @@ -1989,6 +1922,35 @@ em_msix_link(void *arg) EM_MSIX_LINK | E1000_IMS_LSC); return; } + +static void +em_handle_rx(void *context, int pending) +{ + struct adapter *adapter = context; + struct ifnet *ifp = adapter->ifp; + + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) && + (em_rxeof(adapter, adapter->rx_process_limit) != 0)) + taskqueue_enqueue(adapter->tq, &adapter->rx_task); + +} + +static void +em_handle_tx(void *context, int pending) +{ + struct adapter *adapter = context; + struct ifnet *ifp = adapter->ifp; + + if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + if (!EM_TX_TRYLOCK(adapter)) + return; + + em_txeof(adapter); + if (!ADAPTER_RING_EMPTY(adapter)) + em_start_locked(ifp); + EM_TX_UNLOCK(adapter); + } +} #endif /* EM_FAST_IRQ */ /********************************************************************* @@ -2646,6 +2608,8 @@ em_local_timer(void *arg) EM_CORE_LOCK_ASSERT(adapter); taskqueue_enqueue(adapter->tq, &adapter->rxtx_task); + taskqueue_enqueue(adapter->tq, + &adapter->rxtx_task); em_update_link_status(adapter); em_update_stats_counters(adapter); @@ -2990,6 +2954,11 @@ em_allocate_msix(struct adapter *adapter */ TASK_INIT(&adapter->rx_task, 0, em_handle_rx, adapter); TASK_INIT(&adapter->tx_task, 0, em_handle_tx, adapter); + /* + * Handle compatibility for msi case for deferral due to + * trylock failure + */ + TASK_INIT(&adapter->rxtx_task, 0, em_handle_tx, adapter); TASK_INIT(&adapter->link_task, 0, em_handle_link, adapter); adapter->tq = taskqueue_create_fast("em_taskq", M_NOWAIT, taskqueue_thread_enqueue, &adapter->tq); @@ -3244,6 +3213,11 @@ em_setup_interface(device_t dev, struct adapter->br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK, &adapter->tx_mtx); #endif +#ifdef IFNET_BUF_RING + ifp->if_transmit = em_transmit; + ifp->if_qflush = em_qflush; + adapter->br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK, &adapter->tx_mtx); +#endif if (adapter->hw.mac.type >= e1000_82543) { int version_cap; #if __FreeBSD_version < 700000 Modified: user/kmacy/releng_7_2_fcs/sys/dev/e1000/if_em.h ============================================================================== --- user/kmacy/releng_7_2_fcs/sys/dev/e1000/if_em.h Mon May 18 06:32:38 2009 (r192294) +++ user/kmacy/releng_7_2_fcs/sys/dev/e1000/if_em.h Mon May 18 06:46:34 2009 (r192295) @@ -36,7 +36,7 @@ #ifndef _EM_H_DEFINED_ #define _EM_H_DEFINED_ -#define IFNET_MULTIQUEUE +#define IFNET_BUF_RING /* Tunables */ /* @@ -302,8 +302,10 @@ struct em_dma_alloc { /* Our adapter structure */ struct adapter { struct ifnet *ifp; -#ifdef IFNET_MULTIQUEUE +#ifdef IFNET_BUF_RING struct buf_ring *br; +#else + void *br; #endif struct e1000_hw hw; @@ -494,4 +496,27 @@ typedef struct _DESCRIPTOR_PAIR #define EM_CORE_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->core_mtx, MA_OWNED) #define EM_TX_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->tx_mtx, MA_OWNED) +#ifdef IFNET_BUF_RING +#define ADAPTER_RING_EMPTY(adapter) drbr_empty((adapter)->ifp, (adapter)->br) +#define em_dequeue drbr_dequeue + +#else +#define ADAPTER_RING_EMPTY(adapter) IFQ_DRV_IS_EMPTY(&((adapter)->ifp->if_snd)) +#define drbr_free(br, type) +static __inline struct mbuf * +em_dequeue(struct ifnet *ifp, struct buf_ring *br) +{ + struct mbuf *m; + + IFQ_DRV_DEQUEUE(&ifp->if_snd, m); + return (m); +} +#ifdef BUF_RING_UNDEFINED + +struct buf_ring { +}; + +#endif +#endif + #endif /* _EM_H_DEFINED_ */ Modified: user/kmacy/releng_7_2_fcs/sys/i386/conf/DEFAULTS ============================================================================== --- user/kmacy/releng_7_2_fcs/sys/i386/conf/DEFAULTS Mon May 18 06:32:38 2009 (r192294) +++ user/kmacy/releng_7_2_fcs/sys/i386/conf/DEFAULTS Mon May 18 06:46:34 2009 (r192295) @@ -23,6 +23,9 @@ device uart_ns8250 options GEOM_BSD options GEOM_MBR -# KSE support went from being default to a kernel option -options KSE options VIMAGE_GLOBALS +# enable support for native hardware +options NATIVE +device atpic + +options FLOWTABLE Added: user/kmacy/releng_7_2_fcs/sys/net/flowtable.c ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ user/kmacy/releng_7_2_fcs/sys/net/flowtable.c Mon May 18 06:46:34 2009 (r192295) @@ -0,0 +1,1076 @@ +/************************************************************************** + +Copyright (c) 2008-2009, BitGravity Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the BitGravity Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ + +#include "opt_route.h" +#include "opt_mpath.h" + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/bitstring.h> +#include <sys/callout.h> +#include <sys/kernel.h> +#include <sys/kthread.h> +#include <sys/limits.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/proc.h> +#include <sys/sched.h> +#include <sys/smp.h> +#include <sys/socket.h> +#include <sys/syslog.h> +#include <sys/sysctl.h> +#include <sys/vimage.h> + +#include <net/if.h> +#include <net/if_llatbl.h> +#include <net/if_var.h> +#include <net/route.h> +#include <net/vnet.h> +#include <net/flowtable.h> + + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> +#include <netinet/if_ether.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> +#include <netinet/udp.h> +#include <netinet/sctp.h> + +/* + * Taken from http://burtleburtle.net/bob/c/lookup3.c + */ + +#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k)))) + +/* +------------------------------------------------------------------------------- +mix -- mix 3 32-bit values reversibly. + +This is reversible, so any information in (a,b,c) before mix() is +still in (a,b,c) after mix(). + +If four pairs of (a,b,c) inputs are run through mix(), or through +mix() in reverse, there are at least 32 bits of the output that +are sometimes the same for one pair and different for another pair. +This was tested for: +* pairs that differed by one bit, by two bits, in any combination + of top bits of (a,b,c), or in any combination of bottom bits of + (a,b,c). +* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed + the output delta to a Gray code (a^(a>>1)) so a string of 1's (as + is commonly produced by subtraction) look like a single 1-bit + difference. +* the base values were pseudorandom, all zero but one bit set, or + all zero plus a counter that starts at zero. + +Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that +satisfy this are + 4 6 8 16 19 4 + 9 15 3 18 27 15 + 14 9 3 7 17 3 +Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing +for "differ" defined as + with a one-bit base and a two-bit delta. I +used http://burtleburtle.net/bob/hash/avalanche.html to choose +the operations, constants, and arrangements of the variables. + +This does not achieve avalanche. There are input bits of (a,b,c) +that fail to affect some output bits of (a,b,c), especially of a. The +most thoroughly mixed value is c, but it doesn't really even achieve +avalanche in c. + +This allows some parallelism. Read-after-writes are good at doubling +the number of bits affected, so the goal of mixing pulls in the opposite +direction as the goal of parallelism. I did what I could. Rotates +seem to cost as much as shifts on every machine I could lay my hands +on, and rotates are much kinder to the top and bottom bits, so I used +rotates. +------------------------------------------------------------------------------- +*/ +#define mix(a,b,c) \ +{ \ + a -= c; a ^= rot(c, 4); c += b; \ + b -= a; b ^= rot(a, 6); a += c; \ + c -= b; c ^= rot(b, 8); b += a; \ + a -= c; a ^= rot(c,16); c += b; \ + b -= a; b ^= rot(a,19); a += c; \ + c -= b; c ^= rot(b, 4); b += a; \ +} + +/* +------------------------------------------------------------------------------- +final -- final mixing of 3 32-bit values (a,b,c) into c + +Pairs of (a,b,c) values differing in only a few bits will usually +produce values of c that look totally different. This was tested for +* pairs that differed by one bit, by two bits, in any combination + of top bits of (a,b,c), or in any combination of bottom bits of + (a,b,c). +* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed + the output delta to a Gray code (a^(a>>1)) so a string of 1's (as + is commonly produced by subtraction) look like a single 1-bit + difference. +* the base values were pseudorandom, all zero but one bit set, or + all zero plus a counter that starts at zero. + +These constants passed: + 14 11 25 16 4 14 24 + 12 14 25 16 4 14 24 +and these came close: + 4 8 15 26 3 22 24 + 10 8 15 26 3 22 24 + 11 8 15 26 3 22 24 +------------------------------------------------------------------------------- +*/ +#define final(a,b,c) \ +{ \ + c ^= b; c -= rot(b,14); \ + a ^= c; a -= rot(c,11); \ + b ^= a; b -= rot(a,25); \ + c ^= b; c -= rot(b,16); \ + a ^= c; a -= rot(c,4); \ + b ^= a; b -= rot(a,14); \ + c ^= b; c -= rot(b,24); \ +} + +/* +-------------------------------------------------------------------- + This works on all machines. To be useful, it requires + -- that the key be an array of uint32_t's, and + -- that the length be the number of uint32_t's in the key + + The function hashword() is identical to hashlittle() on little-endian + machines, and identical to hashbig() on big-endian machines, + except that the length has to be measured in uint32_ts rather than in + bytes. hashlittle() is more complicated than hashword() only because + hashlittle() has to dance around fitting the key bytes into registers. +-------------------------------------------------------------------- +*/ +static uint32_t hashword( +const uint32_t *k, /* the key, an array of uint32_t values */ +size_t length, /* the length of the key, in uint32_ts */ +uint32_t initval) /* the previous hash, or an arbitrary value */ +{ + uint32_t a,b,c; + + /* Set up the internal state */ + a = b = c = 0xdeadbeef + (((uint32_t)length)<<2) + initval; + + /*------------------------------------------------- handle most of the key */ + while (length > 3) + { + a += k[0]; + b += k[1]; + c += k[2]; + mix(a,b,c); + length -= 3; + k += 3; + } + + /*------------------------------------------- handle the last 3 uint32_t's */ + switch(length) /* all the case statements fall through */ + { + case 3 : c+=k[2]; + case 2 : b+=k[1]; + case 1 : a+=k[0]; + final(a,b,c); + case 0: /* case 0: nothing left to add */ + break; + } + /*------------------------------------------------------ report the result */ + return c; +} + + +struct ipv4_tuple { + uint16_t ip_sport; /* source port */ + uint16_t ip_dport; /* destination port */ + in_addr_t ip_saddr; /* source address */ + in_addr_t ip_daddr; /* destination address */ +}; + +union ipv4_flow { + struct ipv4_tuple ipf_ipt; + uint32_t ipf_key[3]; +}; + +struct ipv6_tuple { + uint16_t ip_sport; /* source port */ + uint16_t ip_dport; /* destination port */ + struct in6_addr ip_saddr; /* source address */ + struct in6_addr ip_daddr; /* destination address */ +}; + +union ipv6_flow { + struct ipv6_tuple ipf_ipt; + uint32_t ipf_key[9]; +}; + +struct flentry { + volatile uint32_t f_fhash; /* hash flowing forward */ + uint16_t f_flags; /* flow flags */ + uint8_t f_pad; /* alignment */ + uint8_t f_proto; /* protocol */ + uint32_t f_uptime; /* uptime at last access */ + struct flentry *f_next; /* pointer to collision entry */ + volatile struct rtentry *f_rt; /* rtentry for flow */ + volatile struct llentry *f_lle; /* llentry for flow */ +}; + +struct flentry_v4 { + struct flentry fl_entry; + union ipv4_flow fl_flow; +}; + +struct flentry_v6 { + struct flentry fl_entry; + union ipv6_flow fl_flow; +}; + +#define fl_fhash fl_entry.fl_fhash +#define fl_flags fl_entry.fl_flags +#define fl_proto fl_entry.fl_proto +#define fl_uptime fl_entry.fl_uptime +#define fl_rt fl_entry.fl_rt +#define fl_lle fl_entry.fl_lle + +#define SECS_PER_HOUR 3600 +#define SECS_PER_DAY (24*SECS_PER_HOUR) + +#define SYN_IDLE 300 +#define UDP_IDLE 300 +#define FIN_WAIT_IDLE 600 +#define TCP_IDLE SECS_PER_DAY + + +typedef void fl_lock_t(struct flowtable *, uint32_t); +typedef void fl_rtalloc_t(struct route *, uint32_t, u_int); + +union flentryp { + struct flentry **global; + struct flentry **pcpu[MAXCPU]; +}; + +struct flowtable { + int ft_size; + int ft_lock_count; + uint32_t ft_flags; + uint32_t ft_collisions; + uint32_t ft_allocated; + uint32_t ft_misses; + uint64_t ft_hits; + + uint32_t ft_udp_idle; + uint32_t ft_fin_wait_idle; + uint32_t ft_syn_idle; + uint32_t ft_tcp_idle; + + fl_lock_t *ft_lock; + fl_lock_t *ft_unlock; + fl_rtalloc_t *ft_rtalloc; + struct mtx *ft_locks; + + + union flentryp ft_table; + bitstr_t *ft_masks[MAXCPU]; + bitstr_t *ft_tmpmask; + struct flowtable *ft_next; +}; + +static struct proc *flowcleanerproc; +static struct flowtable *flow_list_head; +static uint32_t hashjitter; +static uma_zone_t ipv4_zone; +static uma_zone_t ipv6_zone; + +/* + * TODO: + * - Make flowtable stats per-cpu, aggregated at sysctl call time, + * to avoid extra cache evictions caused by incrementing a shared + * counter + * - add IPv6 support to flow lookup + * - add sysctls to resize && flush flow tables + * - Add per flowtable sysctls for statistics and configuring timeouts + * - add saturation counter to rtentry to support per-packet load-balancing + * add flag to indicate round-robin flow, add list lookup from head + for flows + * - add sysctl / device node / syscall to support exporting and importing + * of flows with flag to indicate that a flow was imported so should + * not be considered for auto-cleaning + * - support explicit connection state (currently only ad-hoc for DSR) + */ +SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable"); +int flowtable_enable = 0; +SYSCTL_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW, + &flowtable_enable, 0, "enable flowtable caching."); +static int flowtable_hits = 0; +SYSCTL_INT(_net_inet_flowtable, OID_AUTO, hits, CTLFLAG_RD, + &flowtable_hits, 0, "# flowtable hits."); +static int flowtable_lookups = 0; +SYSCTL_INT(_net_inet_flowtable, OID_AUTO, lookups, CTLFLAG_RD, + &flowtable_lookups, 0, "# flowtable lookups."); +static int flowtable_misses = 0; +SYSCTL_INT(_net_inet_flowtable, OID_AUTO, misses, CTLFLAG_RD, + &flowtable_misses, 0, "#flowtable misses."); +static int flowtable_frees = 0; +SYSCTL_INT(_net_inet_flowtable, OID_AUTO, frees, CTLFLAG_RD, + &flowtable_frees, 0, "#flows freed."); +static int flowtable_free_checks = 0; +SYSCTL_INT(_net_inet_flowtable, OID_AUTO, free_checks, CTLFLAG_RD, + &flowtable_free_checks, 0, "#flows free checks."); +static int flowtable_max_depth = 0; +SYSCTL_INT(_net_inet_flowtable, OID_AUTO, max_depth, CTLFLAG_RD, + &flowtable_max_depth, 0, "max collision list length."); +static int flowtable_collisions = 0; +SYSCTL_INT(_net_inet_flowtable, OID_AUTO, collisions, CTLFLAG_RD, + &flowtable_collisions, 0, "#flowtable collisions."); + +/* + * XXX This does not end up updating timeouts at runtime + * and only reflects the value for the last table added :-/ + */ +static int flowtable_syn_expire = SYN_IDLE; +SYSCTL_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW, + &flowtable_syn_expire, 0, "seconds after which to remove syn allocated flow."); +static int flowtable_udp_expire = UDP_IDLE; +SYSCTL_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW, + &flowtable_udp_expire, 0, "seconds after which to remove flow allocated to UDP."); +static int flowtable_fin_wait_expire = FIN_WAIT_IDLE; +SYSCTL_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW, + &flowtable_fin_wait_expire, 0, "seconds after which to remove a flow in FIN_WAIT."); +static int flowtable_tcp_expire = TCP_IDLE; +SYSCTL_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW, + &flowtable_tcp_expire, 0, "seconds after which to remove flow allocated to a TCP connection."); + + +/* + * Maximum number of flows that can be allocated of a given type. + * + * The table is allocated at boot time (for the pure caching case + * there is no reason why this could not be changed at runtime) + * and thus (currently) needs to be set with a tunable. + */ +static int nmbflows = 4096; + +static int +sysctl_nmbflows(SYSCTL_HANDLER_ARGS) +{ + int error, newnmbflows; + + newnmbflows = nmbflows; + error = sysctl_handle_int(oidp, &newnmbflows, 0, req); + if (error == 0 && req->newptr) { + if (newnmbflows > nmbflows) { + nmbflows = newnmbflows; + uma_zone_set_max(ipv4_zone, nmbflows); + uma_zone_set_max(ipv6_zone, nmbflows); + } else + error = EINVAL; + } + return (error); +} +SYSCTL_PROC(_net_inet_flowtable, OID_AUTO, nmbflows, CTLTYPE_INT|CTLFLAG_RW, + &nmbflows, 0, sysctl_nmbflows, "IU", "Maximum number of flows allowed"); + +#ifndef RADIX_MPATH +static void +in_rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fib) +{ + + in_rtalloc_ign(ro, 0, fib); +} +#endif + +static void +flowtable_global_lock(struct flowtable *table, uint32_t hash) +{ + int lock_index = (hash)&(table->ft_lock_count - 1); + + mtx_lock(&table->ft_locks[lock_index]); +} + +static void +flowtable_global_unlock(struct flowtable *table, uint32_t hash) +{ + int lock_index = (hash)&(table->ft_lock_count - 1); + + mtx_unlock(&table->ft_locks[lock_index]); +} + +static void +flowtable_pcpu_lock(struct flowtable *table, uint32_t hash) +{ + + critical_enter(); +} + +static void +flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash) +{ + + mb(); + critical_exit(); +} + +#define FL_ENTRY_INDEX(table, hash)((hash) % (table)->ft_size) +#define FL_ENTRY(table, hash) *flowtable_entry((table), (hash)) +#define FL_ENTRY_LOCK(table, hash) (table)->ft_lock((table), (hash)) +#define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash)) + +#define FL_STALE (1<<8) +#define FL_IPV6 (1<<9) + +static uint32_t +ipv4_flow_lookup_hash_internal(struct mbuf *m, struct route *ro, + uint32_t *key, uint16_t *flags, uint8_t *protop) +{ + uint16_t sport = 0, dport = 0; + struct ip *ip = NULL; + uint8_t proto = 0; + int iphlen; + uint32_t hash; + struct sockaddr_in *sin; + struct tcphdr *th; + struct udphdr *uh; + struct sctphdr *sh; + + if (flowtable_enable == 0) + return (0); + + key[1] = key[0] = 0; + sin = (struct sockaddr_in *)&ro->ro_dst; + if (m != NULL) { + ip = mtod(m, struct ip *); + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_addr = ip->ip_dst; + } else + *flags &= ~FL_HASH_PORTS; + + key[2] = sin->sin_addr.s_addr; + + if ((*flags & FL_HASH_PORTS) == 0) + goto skipports; + + proto = ip->ip_p; + iphlen = ip->ip_hl << 2; /* XXX options? */ + key[1] = ip->ip_src.s_addr; + + switch (proto) { + case IPPROTO_TCP: + th = (struct tcphdr *)((caddr_t)ip + iphlen); + sport = ntohs(th->th_sport); + dport = ntohs(th->th_dport); + *flags |= th->th_flags; + if (*flags & TH_RST) + *flags |= FL_STALE; + break; + case IPPROTO_UDP: + uh = (struct udphdr *)((caddr_t)ip + iphlen); + sport = uh->uh_sport; + dport = uh->uh_dport; + break; + case IPPROTO_SCTP: + sh = (struct sctphdr *)((caddr_t)ip + iphlen); + sport = sh->src_port; + dport = sh->dest_port; + break; + default: + if (*flags & FL_HASH_PORTS) + goto noop; + /* no port - hence not a protocol we care about */ + break;; + + } + *protop = proto; + + /* + * If this is a transmit route cache then + * hash all flows to a given destination to + * the same bucket + */ + if ((*flags & FL_HASH_PORTS) == 0) + proto = sport = dport = 0; + + ((uint16_t *)key)[0] = sport; + ((uint16_t *)key)[1] = dport; + +skipports: + hash = hashword(key, 3, hashjitter + proto); + if (m != NULL && (m->m_flags & M_FLOWID) == 0) { + m->m_flags |= M_FLOWID; + m->m_pkthdr.flowid = hash; + } + + return (hash); +noop: + *protop = proto; + return (0); +} + +static bitstr_t * +flowtable_mask(struct flowtable *ft) +{ *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200905180646.n4I6kYrv052564>