Date: Tue, 30 Dec 2008 04:20:06 +0000 (UTC) From: Kip Macy <kmacy@FreeBSD.org> To: src-committers@freebsd.org, svn-src-user@freebsd.org Subject: svn commit: r186583 - in user/kmacy/HEAD_fast_net/sys: conf net netinet Message-ID: <200812300420.mBU4K6bC075639@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: kmacy Date: Tue Dec 30 04:20:06 2008 New Revision: 186583 URL: http://svn.freebsd.org/changeset/base/186583 Log: import updated flowtable for use by forwarding and unconnected sockets Added: user/kmacy/HEAD_fast_net/sys/net/flowtable.c (contents, props changed) user/kmacy/HEAD_fast_net/sys/net/flowtable.h (contents, props changed) Modified: user/kmacy/HEAD_fast_net/sys/conf/files user/kmacy/HEAD_fast_net/sys/netinet/ip_input.c user/kmacy/HEAD_fast_net/sys/netinet/ip_output.c user/kmacy/HEAD_fast_net/sys/netinet/vinet.h Modified: user/kmacy/HEAD_fast_net/sys/conf/files ============================================================================== --- user/kmacy/HEAD_fast_net/sys/conf/files Tue Dec 30 01:33:15 2008 (r186582) +++ user/kmacy/HEAD_fast_net/sys/conf/files Tue Dec 30 04:20:06 2008 (r186583) @@ -2154,6 +2154,7 @@ net/bpf_filter.c optional bpf | netgrap net/bpf_zerocopy.c optional bpf net/bridgestp.c optional bridge | if_bridge net/bsd_comp.c optional ppp_bsdcomp +net/flowtable.c standard net/ieee8023ad_lacp.c optional lagg net/if.c standard net/if_arcsubr.c optional arcnet Added: user/kmacy/HEAD_fast_net/sys/net/flowtable.c ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ user/kmacy/HEAD_fast_net/sys/net/flowtable.c Tue Dec 30 04:20:06 2008 (r186583) @@ -0,0 +1,747 @@ +#include "opt_mpath.h" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/limits.h> +#include <sys/kernel.h> +#include <sys/bitstring.h> +#include <sys/vimage.h> +#include <sys/sysctl.h> + + +#include <sys/callout.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/proc.h> +#include <sys/smp.h> +#include <sys/socket.h> +#include <sys/syslog.h> + +#include <net/route.h> +#include <net/vnet.h> +#include <net/flowtable.h> +#include <net/if.h> +#include <net/if_llatbl.h> +#include <net/if_var.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include 
<netinet/in_var.h> +#include <netinet/if_ether.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> +#include <netinet/udp.h> +#include <netinet/sctp.h> + +#define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO) + +/* + * Taken from http://burtleburtle.net/bob/c/lookup3.c + */ + +#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k)))) + +/* +------------------------------------------------------------------------------- +mix -- mix 3 32-bit values reversibly. + +This is reversible, so any information in (a,b,c) before mix() is +still in (a,b,c) after mix(). + +If four pairs of (a,b,c) inputs are run through mix(), or through +mix() in reverse, there are at least 32 bits of the output that +are sometimes the same for one pair and different for another pair. +This was tested for: +* pairs that differed by one bit, by two bits, in any combination + of top bits of (a,b,c), or in any combination of bottom bits of + (a,b,c). +* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed + the output delta to a Gray code (a^(a>>1)) so a string of 1's (as + is commonly produced by subtraction) look like a single 1-bit + difference. +* the base values were pseudorandom, all zero but one bit set, or + all zero plus a counter that starts at zero. + +Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that +satisfy this are + 4 6 8 16 19 4 + 9 15 3 18 27 15 + 14 9 3 7 17 3 +Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing +for "differ" defined as + with a one-bit base and a two-bit delta. I +used http://burtleburtle.net/bob/hash/avalanche.html to choose +the operations, constants, and arrangements of the variables. + +This does not achieve avalanche. There are input bits of (a,b,c) +that fail to affect some output bits of (a,b,c), especially of a. The +most thoroughly mixed value is c, but it doesn't really even achieve +avalanche in c. + +This allows some parallelism. 
Read-after-writes are good at doubling +the number of bits affected, so the goal of mixing pulls in the opposite +direction as the goal of parallelism. I did what I could. Rotates +seem to cost as much as shifts on every machine I could lay my hands +on, and rotates are much kinder to the top and bottom bits, so I used +rotates. +------------------------------------------------------------------------------- +*/ +#define mix(a,b,c) \ +{ \ + a -= c; a ^= rot(c, 4); c += b; \ + b -= a; b ^= rot(a, 6); a += c; \ + c -= b; c ^= rot(b, 8); b += a; \ + a -= c; a ^= rot(c,16); c += b; \ + b -= a; b ^= rot(a,19); a += c; \ + c -= b; c ^= rot(b, 4); b += a; \ +} + +/* +------------------------------------------------------------------------------- +final -- final mixing of 3 32-bit values (a,b,c) into c + +Pairs of (a,b,c) values differing in only a few bits will usually +produce values of c that look totally different. This was tested for +* pairs that differed by one bit, by two bits, in any combination + of top bits of (a,b,c), or in any combination of bottom bits of + (a,b,c). +* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed + the output delta to a Gray code (a^(a>>1)) so a string of 1's (as + is commonly produced by subtraction) look like a single 1-bit + difference. +* the base values were pseudorandom, all zero but one bit set, or + all zero plus a counter that starts at zero. + +These constants passed: + 14 11 25 16 4 14 24 + 12 14 25 16 4 14 24 +and these came close: + 4 8 15 26 3 22 24 + 10 8 15 26 3 22 24 + 11 8 15 26 3 22 24 +------------------------------------------------------------------------------- +*/ +#define final(a,b,c) \ +{ \ + c ^= b; c -= rot(b,14); \ + a ^= c; a -= rot(c,11); \ + b ^= a; b -= rot(a,25); \ + c ^= b; c -= rot(b,16); \ + a ^= c; a -= rot(c,4); \ + b ^= a; b -= rot(a,14); \ + c ^= b; c -= rot(b,24); \ +} + +/* +-------------------------------------------------------------------- + This works on all machines. 
To be useful, it requires + -- that the key be an array of uint32_t's, and + -- that the length be the number of uint32_t's in the key + + The function hashword() is identical to hashlittle() on little-endian + machines, and identical to hashbig() on big-endian machines, + except that the length has to be measured in uint32_ts rather than in + bytes. hashlittle() is more complicated than hashword() only because + hashlittle() has to dance around fitting the key bytes into registers. +-------------------------------------------------------------------- +*/ +static uint32_t hashword( +const uint32_t *k, /* the key, an array of uint32_t values */ +size_t length, /* the length of the key, in uint32_ts */ +uint32_t initval) /* the previous hash, or an arbitrary value */ +{ + uint32_t a,b,c; + + /* Set up the internal state */ + a = b = c = 0xdeadbeef + (((uint32_t)length)<<2) + initval; + + /*------------------------------------------------- handle most of the key */ + while (length > 3) + { + a += k[0]; + b += k[1]; + c += k[2]; + mix(a,b,c); + length -= 3; + k += 3; + } + + /*------------------------------------------- handle the last 3 uint32_t's */ + switch(length) /* all the case statements fall through */ + { + case 3 : c+=k[2]; + case 2 : b+=k[1]; + case 1 : a+=k[0]; + final(a,b,c); + case 0: /* case 0: nothing left to add */ + break; + } + /*------------------------------------------------------ report the result */ + return c; +} + + +struct ipv4_tuple { + uint16_t ip_sport; /* source port */ + uint16_t ip_dport; /* destination port */ + in_addr_t ip_saddr; /* source address */ + in_addr_t ip_daddr; /* destination address */ +}; + +union ipv4_flow { + struct ipv4_tuple ipf_ipt; + uint32_t ipf_key[3]; +}; + +struct ipv6_tuple { + uint16_t ip_sport; /* source port */ + uint16_t ip_dport; /* destination port */ + struct in6_addr ip_saddr; /* source address */ + struct in6_addr ip_daddr; /* destination address */ +}; + +union ipv6_flow { + struct ipv6_tuple 
ipf_ipt; + uint32_t ipf_key[9]; +}; + +struct flentry { + volatile uint32_t f_fhash; /* hash flowing forward */ + uint16_t f_flags; /* flow flags */ + uint8_t f_pad; + uint8_t f_proto; /* protocol */ + uint32_t f_uptime; + volatile struct rtentry *f_rt; /* rtentry for flow */ + volatile struct llentry *f_lle; /* llentry for flow */ +}; + +struct flentry_v4 { + struct flentry fl_entry; + union ipv4_flow fl_flow; +}; + +struct flentry_v6 { + struct flentry fl_entry; + union ipv6_flow fl_flow; +}; + +#define fl_fhash fl_entry.fl_fhash +#define fl_flags fl_entry.fl_flags +#define fl_proto fl_entry.fl_proto +#define fl_uptime fl_entry.fl_uptime +#define fl_rt fl_entry.fl_rt +#define fl_lle fl_entry.fl_lle + +#define SECS_PER_HOUR 3600 +#define SECS_PER_DAY (24*SECS_PER_HOUR) + +#define SYN_IDLE 300 +#define UDP_IDLE 300 +#define FIN_WAIT_IDLE 600 +#define TCP_IDLE SECS_PER_DAY + + +typedef void fl_lock_t(struct flowtable *, uint32_t); +typedef void fl_rtalloc_t(struct route *, uint32_t, u_int); + +union flentryp { + struct flentry_v4 *v4; + struct flentry_v6 *v6; + struct flentry_v4 *v4_pcpu[MAXCPU]; + struct flentry_v6 *v6_pcpu[MAXCPU]; +}; + +struct flowtable { + union flentryp ft_table; + int ft_size; + bitstr_t *ft_masks[MAXCPU]; + struct mtx *ft_locks; + int ft_lock_count; + uint32_t ft_flags; + uint32_t ft_collisions; + uint32_t ft_allocated; + uint64_t ft_hits; + + uint32_t ft_udp_idle; + uint32_t ft_fin_wait_idle; + uint32_t ft_syn_idle; + uint32_t ft_tcp_idle; + + fl_lock_t *ft_lock; + fl_lock_t *ft_unlock; + fl_rtalloc_t *ft_rtalloc; + +}; + +static uint32_t hashjitter; + +SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable"); + +int flowtable_enable = 1; +SYSCTL_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW, + &flowtable_enable, 0, "enable flowtable caching."); + + +#ifndef RADIX_MPATH +static void +in_rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fib) +{ + + in_rtalloc_ign(ro, 0, fib); +} +#endif + +static void 
+flowtable_global_lock(struct flowtable *table, uint32_t hash) +{ + int lock_index = (hash)&(table->ft_lock_count - 1); + + mtx_lock(&table->ft_locks[lock_index]); +} + +static void +flowtable_global_unlock(struct flowtable *table, uint32_t hash) +{ + int lock_index = (hash)&(table->ft_lock_count - 1); + + mtx_unlock(&table->ft_locks[lock_index]); +} + +static void +flowtable_pcpu_lock(struct flowtable *table, uint32_t hash) +{ + + critical_enter(); +} + +static void +flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash) +{ + + mb(); + critical_exit(); +} + +#define FL_ENTRY_INDEX(table, hash)((hash) % (table)->ft_size) +#define FL_ENTRY(table, hash) flowtable_entry((table), (hash)) +#define FL_ENTRY_LOCK(table, hash) (table)->ft_lock((table), (hash)) +#define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash)) + +#define FL_STALE (1<<8) + +static uint32_t +ipv4_flow_lookup_hash_internal(struct mbuf *m, struct route *ro, + uint32_t *key, uint16_t *flags, uint8_t *protop) +{ + uint16_t sport = 0, dport = 0; + struct ip *ip = mtod(m, struct ip *); + uint8_t proto = ip->ip_p; + int iphlen = ip->ip_hl << 2; + uint32_t hash; + struct sockaddr_in *sin; + struct tcphdr *th; + struct udphdr *uh; + struct sctphdr *sh; + + key[0] = 0; + key[1] = ip->ip_src.s_addr; + key[2] = ip->ip_dst.s_addr; + + sin = (struct sockaddr_in *)&ro->ro_dst; + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_addr = ip->ip_dst; + + if (flowtable_enable == 0) + return (0); + + switch (proto) { + case IPPROTO_TCP: + th = (struct tcphdr *)((caddr_t)ip + iphlen); + sport = ntohs(th->th_sport); + dport = ntohs(th->th_dport); + *flags |= th->th_flags; + if (*flags & TH_RST) + *flags |= FL_STALE; + break; + case IPPROTO_UDP: + uh = (struct udphdr *)((caddr_t)ip + iphlen); + sport = uh->uh_sport; + dport = uh->uh_dport; + break; + case IPPROTO_SCTP: + sh = (struct sctphdr *)((caddr_t)ip + iphlen); + sport = sh->src_port; + dport = sh->dest_port; + break; + 
default: + if (*flags & FL_HASH_PORTS) + goto noop; + /* no port - hence not a protocol we care about */ + break;; + + } + *protop = proto; + + /* + * If this is a transmit route cache then + * hash all flows to a given destination to + * the same bucket + */ + if ((*flags & FL_HASH_PORTS) == 0) + proto = sport = dport = 0; + + ((uint16_t *)key)[0] = sport; + ((uint16_t *)key)[1] = dport; + + hash = hashword(key, 3, hashjitter + proto); + if (m->m_pkthdr.flowid == 0) + m->m_pkthdr.flowid = hash; + + CTR5(KTR_SPARE3, "proto=%d hash=%x key[0]=%x sport=%d dport=%d\n", proto, hash, key[0], sport, dport); + + return (hash); +noop: + *protop = proto; + return (0); +} + +static bitstr_t * +flowtable_mask(struct flowtable *ft) +{ + bitstr_t *mask; + + if (ft->ft_flags & FL_PCPU) + mask = ft->ft_masks[curcpu]; + else + mask = ft->ft_masks[0]; + + return (mask); +} + +static struct flentry * +flowtable_entry(struct flowtable *ft, uint32_t hash) +{ + struct flentry *fle; + int index = (hash % ft->ft_size); + + + if ((ft->ft_flags & FL_IPV6) == 0) { + if (ft->ft_flags & FL_PCPU) { + fle = (struct flentry *) + &ft->ft_table.v4_pcpu[curcpu][index]; + } else + fle = (struct flentry *)&ft->ft_table.v4[index]; + } else { + if (ft->ft_flags & FL_PCPU) + fle = (struct flentry *) + &ft->ft_table.v6_pcpu[curcpu][index]; + else + fle = (struct flentry *)&ft->ft_table.v6[index]; + } + + return (fle); +} + +static int +flow_stale(struct flowtable *ft, struct flentry *fle) +{ + time_t idle_time; + + if ((fle->f_fhash == 0) + || ((fle->f_rt->rt_flags & RTF_HOST) && + ((fle->f_rt->rt_flags & (RTF_UP)) + != (RTF_UP))) + || (fle->f_rt->rt_ifp == NULL)) + return (1); + + idle_time = time_uptime - fle->f_uptime; + + if ((fle->f_flags & FL_STALE) || + ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0 + && (idle_time > ft->ft_udp_idle)) || + ((fle->f_flags & TH_FIN) + && (idle_time > ft->ft_fin_wait_idle)) || + ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN + && (idle_time > ft->ft_syn_idle)) || + 
((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK) + && (idle_time > ft->ft_tcp_idle)) || + ((fle->f_rt->rt_flags & RTF_UP) == 0 || + (fle->f_rt->rt_ifp == NULL))) + return (1); + + return (0); +} + +static void +flowtable_set_hashkey(struct flowtable *ft, struct flentry *fle, uint32_t *key) +{ + uint32_t *hashkey; + int i, nwords; + + if (ft->ft_flags & FL_IPV6) { + nwords = 9; + hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key; + } else { + nwords = 3; + hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key; + } + + for (i = 0; i < nwords; i++) + hashkey[i] = key[i]; +} + +static int +flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key, + uint8_t proto, struct route *ro, uint16_t flags) +{ + struct flentry *fle; + volatile struct rtentry *rt0 = NULL; + struct rtentry *rt1; + int stale; + bitstr_t *mask; + +retry: + FL_ENTRY_LOCK(ft, hash); + mask = flowtable_mask(ft); + fle = flowtable_entry(ft, hash); + if (fle->f_fhash) { + if ((stale = flow_stale(ft, fle)) != 0) { + fle->f_fhash = 0; + rt0 = fle->f_rt; + fle->f_rt = NULL; + bit_clear(mask, FL_ENTRY_INDEX(ft, hash)); + } + FL_ENTRY_UNLOCK(ft, hash); + if (!stale) + return (ENOSPC); + + rt1 = __DEVOLATILE(struct rtentry *, rt0); + RTFREE(rt1); + /* + * We might end up on a different cpu + */ + goto retry; + + } + flowtable_set_hashkey(ft, fle, key); + bit_set(mask, FL_ENTRY_INDEX(ft, hash)); + + fle->f_proto = proto; + fle->f_rt = ro->ro_rt; + fle->f_lle = ro->ro_lle; + fle->f_fhash = hash; + fle->f_uptime = time_uptime; + FL_ENTRY_UNLOCK(ft, hash); + return (0); +} + +static int +flowtable_key_equal(struct flentry *fle, uint32_t *key, int flags) +{ + uint32_t *hashkey; + int i, nwords; + + if (flags & FL_IPV6) { + nwords = 9; + hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key; + } else { + nwords = 3; + hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key; + } + + for (i = 0; i < nwords; i++) + if (hashkey[i] != key[i]) + return (0); + + return (1); +} + +int 
+flowtable_lookup(struct flowtable *ft, struct mbuf *m, struct route *ro) +{ + uint32_t key[9], hash; + struct flentry *fle; + uint16_t flags; + uint8_t proto = 0; + int cache = 1, error = 0; + struct rtentry *rt; + struct llentry *lle; + + flags = ft ? ft->ft_flags : 0; + ro->ro_rt = NULL; + ro->ro_lle = NULL; + + /* + * The internal hash lookup is the only IPv4 specific bit + * remaining + */ + hash = ipv4_flow_lookup_hash_internal(m, ro, key, + &flags, &proto); + + /* + * Ports are zero and this isn't a transmit cache + * - thus not a protocol for which we need to keep + * state + * FL_HASH_PORTS => key[0] != 0 for TCP || UDP || SCTP + */ + if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_PORTS))) { + cache = 0; + goto uncached; + } + FL_ENTRY_LOCK(ft, hash); + fle = FL_ENTRY(ft, hash); + rt = __DEVOLATILE(struct rtentry *, fle->f_rt); + lle = __DEVOLATILE(struct llentry *, fle->f_lle); + if ((rt != NULL) + && fle->f_fhash == hash + && flowtable_key_equal(fle, key, flags) + && (proto == fle->f_proto) + && (rt->rt_flags & RTF_UP) + && (rt->rt_ifp != NULL)) { + fle->f_uptime = time_uptime; + fle->f_flags |= flags; + ro->ro_rt = rt; + ro->ro_lle = lle; + FL_ENTRY_UNLOCK(ft, hash); + return (0); + } + FL_ENTRY_UNLOCK(ft, hash); + +uncached: + /* + * This bit of code ends up locking the + * same route 3 times (just like ip_output + ether_output) + * - at lookup + * - in rt_check when called by arpresolve + * - dropping the refcount for the rtentry + * + * This could be consolidated to one if we wrote a variant + * of arpresolve with an rt_check variant that expected to + * receive the route locked + */ + ft->ft_rtalloc(ro, hash, M_GETFIB(m)); + if (ro->ro_rt == NULL) + error = ENETUNREACH; + else { + int finsert; + struct llentry *lle = NULL; + struct sockaddr *l3addr; + struct rtentry *rt = ro->ro_rt; + struct ifnet *ifp = rt->rt_ifp; + + if (rt->rt_flags & RTF_GATEWAY) + l3addr = rt->rt_gateway; + else + l3addr = &ro->ro_dst; + IF_AFDATA_RLOCK(ifp); + lle 
= lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, l3addr); + IF_AFDATA_RUNLOCK(ifp); + if ((lle == NULL) && + (ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0) { + IF_AFDATA_WLOCK(ifp); + lle = lla_lookup(LLTABLE(ifp), + (LLE_CREATE | LLE_EXCLUSIVE), l3addr); + IF_AFDATA_WUNLOCK(ifp); + } + if (lle != NULL) { + LLE_ADDREF(lle); + LLE_WUNLOCK(lle); + } + ro->ro_lle = lle; + finsert = ((lle != NULL) && cache); + if (finsert) + error = flowtable_insert(ft, hash, key, proto, + ro, flags); + + if (error || !finsert) { + RTFREE(rt); + if (lle != NULL) + LLE_FREE(lle); + } + error = 0; + } + + return (error); +} + +#ifdef notyet +static __inline int +bit_fns(bitstr_t *name, int nbits, int lastbit) +{ + int lastbit_start = lastbit & ~0x7; + bitstr_t *bitstr_start = &name[lastbit_start]; + int value = 0; + + while (value <= lastbit && value != 1) + bit_ffs(bitstr_start, nbits, &value); + + return (value); +} +#endif + +struct flowtable * +flowtable_alloc(int nentry, int flags) +{ + struct flowtable *ft; + int i; + + if (hashjitter == 0) + hashjitter = arc4random(); + + ft = malloc(sizeof(struct flowtable), + M_RTABLE, M_WAITOK | M_ZERO); + + ft->ft_flags = flags; + ft->ft_size = nentry; +#ifdef RADIX_MPATH + ft->ft_rtalloc = rtalloc_mpath_fib; +#else + ft->ft_rtalloc = in_rtalloc_ign_wrapper; +#endif + if (flags & FL_PCPU) { + ft->ft_lock = flowtable_pcpu_lock; + ft->ft_unlock = flowtable_pcpu_unlock; + + for (i = 0; i < mp_ncpus; i++) { + ft->ft_table.v4_pcpu[i] = + malloc(nentry*sizeof(struct flentry_v4), + M_RTABLE, M_WAITOK | M_ZERO); + ft->ft_masks[i] = bit_alloc(nentry); + } + } else { + ft->ft_lock_count = 2*(powerof2(mp_ncpus) ? 
mp_ncpus : + (fls(mp_ncpus) << 1)); + + ft->ft_lock = flowtable_global_lock; + ft->ft_unlock = flowtable_global_unlock; + ft->ft_table.v4 = + malloc(nentry*sizeof(struct flentry_v4), + M_RTABLE, M_WAITOK | M_ZERO); + ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx), + M_RTABLE, M_WAITOK | M_ZERO); + for (i = 0; i < ft->ft_lock_count; i++) + mtx_init(&ft->ft_locks[i], "flow", NULL, MTX_DEF|MTX_DUPOK); + + ft->ft_masks[0] = bit_alloc(nentry); + } + + /* + * In the local transmit case the table truly is + * just a cache - so everything is eligible for + * replacement after 30s of non-use + */ + if (flags & FL_HASH_PORTS) { + ft->ft_udp_idle = UDP_IDLE; + ft->ft_syn_idle = SYN_IDLE; + ft->ft_fin_wait_idle = FIN_WAIT_IDLE; + ft->ft_tcp_idle = TCP_IDLE; + } else { + ft->ft_udp_idle = ft->ft_fin_wait_idle = + ft->ft_syn_idle = ft->ft_tcp_idle = 30; + + } + + + return (ft); +} + Added: user/kmacy/HEAD_fast_net/sys/net/flowtable.h ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ user/kmacy/HEAD_fast_net/sys/net/flowtable.h Tue Dec 30 04:20:06 2008 (r186583) @@ -0,0 +1,24 @@ +#ifndef _NET_FLOWTABLE_H_ +#define _NET_FLOWTABLE_H_ + +#ifdef _KERNEL +#include <net/ethernet.h> +#include <netinet/in.h> + +#define FL_HASH_PORTS (1<<0) /* hash 4-tuple + protocol */ +#define FL_PCPU (1<<1) /* pcpu cache */ +#define FL_IPV6 (1<<2) /* IPv6 table */ + +struct flowtable; +struct flowtable *flowtable_alloc(int nentry, int flags); + +/* + * Given a flow table, look up the L3 and L2 information and + * return it in the route + * + */ +int flowtable_lookup(struct flowtable *ft, struct mbuf *m, + struct route *ro); + +#endif +#endif Modified: user/kmacy/HEAD_fast_net/sys/netinet/ip_input.c ============================================================================== --- user/kmacy/HEAD_fast_net/sys/netinet/ip_input.c Tue Dec 30 01:33:15 2008 (r186582) +++ 
user/kmacy/HEAD_fast_net/sys/netinet/ip_input.c Tue Dec 30 04:20:06 2008 (r186583) @@ -56,6 +56,7 @@ __FBSDID("$FreeBSD$"); #include <sys/vimage.h> #include <net/pfil.h> +#include <net/flowtable.h> #include <net/if.h> #include <net/if_types.h> #include <net/if_var.h> @@ -210,6 +211,21 @@ SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, m SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW, ipstealth, 0, "IP stealth mode, no TTL decrementation on forwarding"); #endif +static int ip_pcpu_flowtable_size = 2048; +TUNABLE_INT("net.inet.ip.pcpu_flowtable_size", &ip_pcpu_flowtable_size); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, pcpu_flowtable_size, + CTLFLAG_RDTUN, ip_pcpu_flowtable_size, 0, + "number of entries in the per cpu flow caches"); + +#ifdef RADIX_MPATH +static int ip_global_flowtable_size = 128*1024; +#else +static int ip_global_flowtable_size = 16*1024; +#endif +TUNABLE_INT("net.inet.ip.global_flowtable_size", &ip_global_flowtable_size); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, global_flowtable_size, + CTLFLAG_RDTUN, ip_global_flowtable_size, 0, + "number of entries in the global flow cache"); /* * ipfw_ether and ipfw_bridge hooks. 
@@ -220,6 +236,8 @@ ip_dn_io_t *ip_dn_io_ptr = NULL; #ifdef VIMAGE_GLOBALS int fw_one_pass; #endif +struct flowtable *ipv4_ft; +struct flowtable *ipv4_forward_ft; static void ip_freef(struct ipqhead *, struct ipq *); @@ -319,6 +337,9 @@ ip_init(void) ipintrq.ifq_maxlen = ipqmaxlen; mtx_init(&ipintrq.ifq_mtx, "ip_inq", NULL, MTX_DEF); netisr_register(NETISR_IP, ip_input, &ipintrq, 0); + + ipv4_ft = flowtable_alloc(ip_pcpu_flowtable_size, FL_PCPU); + ipv4_forward_ft = flowtable_alloc(ip_global_flowtable_size, FL_HASH_PORTS); } void Modified: user/kmacy/HEAD_fast_net/sys/netinet/ip_output.c ============================================================================== --- user/kmacy/HEAD_fast_net/sys/netinet/ip_output.c Tue Dec 30 01:33:15 2008 (r186582) +++ user/kmacy/HEAD_fast_net/sys/netinet/ip_output.c Tue Dec 30 04:20:06 2008 (r186583) @@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$"); #include <sys/ucred.h> #include <sys/vimage.h> +#include <net/flowtable.h> #include <net/if.h> #include <net/if_llatbl.h> #include <net/netisr.h> @@ -101,6 +102,7 @@ static void ip_mloopback extern struct protosw inetsw[]; +extern struct flowtable *ipv4_ft; /* * IP output. 
The packet in mbuf chain m contains a skeletal IP @@ -122,7 +124,7 @@ ip_output(struct mbuf *m, struct mbuf *o int hlen = sizeof (struct ip); int mtu; int len, error = 0; - int neednewroute = 0, neednewlle = 0; + int neednewroute = 0, neednewlle = 0, nortfree = 0; struct sockaddr_in *dst = NULL; /* keep compiler happy */ struct in_ifaddr *ia = NULL; int isbroadcast, sw_csum; @@ -159,6 +161,11 @@ ip_output(struct mbuf *m, struct mbuf *o neednewlle = 1; } } + if ((ro == &iproute) && (ro->ro_rt == NULL) && (ro->ro_lle == NULL)) { + if (flowtable_lookup(ipv4_ft, m, ro) == 0) + nortfree = 1; + } + if (opt) { len = 0; @@ -200,7 +207,8 @@ again: if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || dst->sin_family != AF_INET || dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { - if (inp == NULL || (ro->ro_rt != inp->inp_rt)) + if ((nortfree == 0) && + (inp == NULL || (ro->ro_rt != inp->inp_rt))) RTFREE(ro->ro_rt); ro->ro_rt = (struct rtentry *)NULL; } @@ -640,12 +648,12 @@ done: return (error); } - if (inp == NULL || (inp->inp_vflag & INP_RT_VALID) == 0) + if ((nortfree == 0) && + (inp == NULL || (inp->inp_vflag & INP_RT_VALID) == 0)) RTFREE(ro->ro_rt); else if (neednewroute && ro->ro_rt != inp->inp_rt) { RTFREE(inp->inp_rt); inp->inp_rt = ro->ro_rt; - } if (neednewlle) { IF_AFDATA_RLOCK(ifp); Modified: user/kmacy/HEAD_fast_net/sys/netinet/vinet.h ============================================================================== --- user/kmacy/HEAD_fast_net/sys/netinet/vinet.h Tue Dec 30 01:33:15 2008 (r186582) +++ user/kmacy/HEAD_fast_net/sys/netinet/vinet.h Tue Dec 30 04:20:06 2008 (r186583) @@ -75,6 +75,8 @@ struct vnet_inet { int _ip_sendsourcequench; int _ip_do_randomid; int _ip_checkinterface; + int _ip_pcpu_flowtable_size; + int _ip_global_flowtable_size; u_short _ip_id; uma_zone_t _ipq_zone;
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200812300420.mBU4K6bC075639>