From owner-svn-src-head@FreeBSD.ORG Thu Feb 13 04:59:19 2014 Return-Path: Delivered-To: svn-src-head@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [8.8.178.115]) (using TLSv1 with cipher ADH-AES256-SHA (256/256 bits)) (No client certificate requested) by hub.freebsd.org (Postfix) with ESMTPS id 82993A95; Thu, 13 Feb 2014 04:59:19 +0000 (UTC) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:1900:2254:2068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by mx1.freebsd.org (Postfix) with ESMTPS id 6D3D8175B; Thu, 13 Feb 2014 04:59:19 +0000 (UTC) Received: from svn.freebsd.org ([127.0.1.70]) by svn.freebsd.org (8.14.8/8.14.8) with ESMTP id s1D4xJXo074582; Thu, 13 Feb 2014 04:59:19 GMT (envelope-from glebius@svn.freebsd.org) Received: (from glebius@localhost) by svn.freebsd.org (8.14.8/8.14.8/Submit) id s1D4xJvQ074579; Thu, 13 Feb 2014 04:59:19 GMT (envelope-from glebius@svn.freebsd.org) Message-Id: <201402130459.s1D4xJvQ074579@svn.freebsd.org> From: Gleb Smirnoff Date: Thu, 13 Feb 2014 04:59:19 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r261823 - head/sys/net X-SVN-Group: head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-head@freebsd.org X-Mailman-Version: 2.1.17 Precedence: list List-Id: SVN commit messages for the src tree for head/-current List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Thu, 13 Feb 2014 04:59:19 -0000 Author: glebius Date: Thu Feb 13 04:59:18 2014 New Revision: 261823 URL: http://svnweb.freebsd.org/changeset/base/261823 Log: o Axe non-pcpu flowtable implementation. It wasn't enabled or used, and probably is a leftover from first prototyping by Kip. The non-pcpu implementation used mutexes, so it doubtfully worked better than simple routing lookup. 
o Use UMA_ZONE_PCPU zone for pointers instead of [MAXCPU] arrays, use zpcpu_get() to access data in there. o Substitute own single list implementation with SLIST(). This has two functional side effects: - new flows go into head of a list, before they went to tail. - a bug when incorrect flow was deleted in flow cleaner is fixed. o Due to cache line alignment, there is no reason to keep different zones for IPv4 and IPv6 flows. Both consume one cache line, real size of allocation is equal. o Rely on the fact that f_fhash, f_rt, f_lle are stable during fle lifetime, remove useless volatile qualifiers. o More INET/INET6 splitting. Reviewed by: adrian Sponsored by: Netflix Sponsored by: Nginx, Inc. Modified: head/sys/net/flowtable.c head/sys/net/flowtable.h Modified: head/sys/net/flowtable.c ============================================================================== --- head/sys/net/flowtable.c Thu Feb 13 04:55:46 2014 (r261822) +++ head/sys/net/flowtable.c Thu Feb 13 04:59:18 2014 (r261823) @@ -47,13 +47,16 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include +#include #include #include #include #include #include #include +#include #include #include @@ -76,6 +79,7 @@ __FBSDID("$FreeBSD$"); #include +#ifdef INET struct ipv4_tuple { uint16_t ip_sport; /* source port */ uint16_t ip_dport; /* destination port */ @@ -87,7 +91,9 @@ union ipv4_flow { struct ipv4_tuple ipf_ipt; uint32_t ipf_key[3]; }; +#endif +#ifdef INET6 struct ipv6_tuple { uint16_t ip_sport; /* source port */ uint16_t ip_dport; /* destination port */ @@ -99,28 +105,44 @@ union ipv6_flow { struct ipv6_tuple ipf_ipt; uint32_t ipf_key[9]; }; +#endif struct flentry { - volatile uint32_t f_fhash; /* hash flowing forward */ + uint32_t f_fhash; /* hash flowing forward */ uint16_t f_flags; /* flow flags */ uint8_t f_pad; uint8_t f_proto; /* protocol */ uint32_t f_fibnum; /* fib index */ uint32_t f_uptime; /* uptime at last access */ - struct flentry *f_next; /* pointer to collision entry */ - volatile 
struct rtentry *f_rt; /* rtentry for flow */ - volatile struct llentry *f_lle; /* llentry for flow */ + SLIST_ENTRY(flentry) f_next; /* pointer to collision entry */ + struct rtentry *f_rt; /* rtentry for flow */ + struct llentry *f_lle; /* llentry for flow */ + union { +#ifdef INET + union ipv4_flow v4; +#endif +#ifdef INET6 + union ipv6_flow v6; +#endif + } f_flow; +#define f_flow4 f_flow.v4 +#define f_flow6 f_flow.v6 }; +#define KEYLEN(flags) ((((flags) & FL_IPV6) ? 9 : 3) * 4) -struct flentry_v4 { - struct flentry fl_entry; - union ipv4_flow fl_flow; -}; +/* Make sure f_flow begins with key. */ +#ifdef INET +CTASSERT(offsetof(struct flentry, f_flow) == + offsetof(struct flentry, f_flow4.ipf_key)); +#endif +#ifdef INET6 +CTASSERT(offsetof(struct flentry, f_flow) == + offsetof(struct flentry, f_flow6.ipf_key)); +#endif -struct flentry_v6 { - struct flentry fl_entry; - union ipv6_flow fl_flow; -}; +SLIST_HEAD(flist, flentry); +/* Make sure we can use pcpu_zone_ptr for struct flist. */ +CTASSERT(sizeof(struct flist) == sizeof(void *)); #define SECS_PER_HOUR 3600 #define SECS_PER_DAY (24*SECS_PER_HOUR) @@ -130,37 +152,28 @@ struct flentry_v6 { #define FIN_WAIT_IDLE 600 #define TCP_IDLE SECS_PER_DAY - -typedef void fl_lock_t(struct flowtable *, uint32_t); - -union flentryp { - struct flentry **global; - struct flentry **pcpu[MAXCPU]; -}; - struct flowtable { counter_u64_t *ft_stat; - uma_zone_t ft_zone; int ft_size; - int ft_lock_count; uint32_t ft_flags; uint32_t ft_max_depth; - fl_lock_t *ft_lock; - fl_lock_t *ft_unlock; + /* - * XXX need to pad out + * ft_table is a malloc(9)ed array of pointers. Pointers point to + * memory from UMA_ZONE_PCPU zone. + * ft_masks is per-cpu pointer itself. Each instance points + * to a malloc(9)ed bitset, that is private to corresponding CPU. 
*/ - struct mtx *ft_locks; - union flentryp ft_table; - bitstr_t *ft_masks[MAXCPU]; + struct flist **ft_table; + bitstr_t **ft_masks; bitstr_t *ft_tmpmask; - uint32_t ft_udp_idle __aligned(CACHE_LINE_SIZE); + uint32_t ft_udp_idle; uint32_t ft_fin_wait_idle; uint32_t ft_syn_idle; uint32_t ft_tcp_idle; boolean_t ft_full; -} __aligned(CACHE_LINE_SIZE); +}; #define FLOWSTAT_ADD(ft, name, v) \ counter_u64_add((ft)->ft_stat[offsetof(struct flowtable_stat, name) / sizeof(uint64_t)], (v)) @@ -190,15 +203,15 @@ static uint32_t flowclean_freq; */ #ifdef INET static VNET_DEFINE(struct flowtable, ip4_ft); -#define V_ip4_ft VNET(ip4_ft) -static uma_zone_t flow_ipv4_zone; +#define V_ip4_ft VNET(ip4_ft) #endif #ifdef INET6 static VNET_DEFINE(struct flowtable, ip6_ft); #define V_ip6_ft VNET(ip6_ft) -static uma_zone_t flow_ipv6_zone; #endif +static uma_zone_t flow_zone; + static VNET_DEFINE(int, flowtable_enable) = 1; static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE; static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE; @@ -215,6 +228,8 @@ static SYSCTL_NODE(_net, OID_AUTO, flowt "flowtable"); SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, enable, CTLFLAG_RW, &VNET_NAME(flowtable_enable), 0, "enable flowtable caching."); +SYSCTL_UMA_MAX(_net_flowtable, OID_AUTO, maxflows, CTLFLAG_RW, + &flow_zone, "Maximum number of flows allowed"); /* * XXX This does not end up updating timeouts at runtime @@ -233,43 +248,10 @@ SYSCTL_VNET_INT(_net_flowtable, OID_AUTO &VNET_NAME(flowtable_tcp_expire), 0, "seconds after which to remove flow allocated to a TCP connection."); -static void -flowtable_global_lock(struct flowtable *table, uint32_t hash) -{ - int lock_index = (hash)&(table->ft_lock_count - 1); - - mtx_lock(&table->ft_locks[lock_index]); -} - -static void -flowtable_global_unlock(struct flowtable *table, uint32_t hash) -{ - int lock_index = (hash)&(table->ft_lock_count - 1); - - mtx_unlock(&table->ft_locks[lock_index]); -} - -static void -flowtable_pcpu_lock(struct flowtable 
*table, uint32_t hash) -{ - - critical_enter(); -} - -static void -flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash) -{ - - critical_exit(); -} - -#define FL_ENTRY_INDEX(table, hash)((hash) % (table)->ft_size) -#define FL_ENTRY(table, hash) *flowtable_entry((table), (hash)) -#define FL_ENTRY_LOCK(table, hash) (table)->ft_lock((table), (hash)) -#define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash)) - #define FL_STALE (1<<8) +static MALLOC_DEFINE(M_FTABLE, "flowtable", "flowtable hashes and bitstrings"); + static struct flentry *flowtable_lookup_common(struct flowtable *, struct sockaddr_storage *, struct sockaddr_storage *, struct mbuf *, int); @@ -320,27 +302,6 @@ flags_to_proto(int flags) } #ifdef INET -#ifdef FLOWTABLE_DEBUG -static void -ipv4_flow_print_tuple(int flags, int proto, struct sockaddr_in *ssin, - struct sockaddr_in *dsin) -{ - char saddr[4*sizeof "123"], daddr[4*sizeof "123"]; - - if (flags & FL_HASH_ALL) { - inet_ntoa_r(ssin->sin_addr, saddr); - inet_ntoa_r(dsin->sin_addr, daddr); - printf("proto=%d %s:%d->%s:%d\n", - proto, saddr, ntohs(ssin->sin_port), daddr, - ntohs(dsin->sin_port)); - } else { - inet_ntoa_r(*(struct in_addr *) &dsin->sin_addr, daddr); - printf("proto=%d %s\n", proto, daddr); - } - -} -#endif - static int ipv4_mbuf_demarshal(struct mbuf *m, struct sockaddr_in *ssin, struct sockaddr_in *dsin, uint16_t *flags) @@ -456,10 +417,10 @@ flow_to_route(struct flentry *fle, struc sin = (struct sockaddr_in *)&ro->ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); - hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key; + hashkey = fle->f_flow4.ipf_key; sin->sin_addr.s_addr = hashkey[2]; - ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt); - ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle); + ro->ro_rt = fle->f_rt; + ro->ro_lle = fle->f_lle; ro->ro_flags |= RT_NORTREF; } #endif /* INET */ @@ -661,10 +622,10 @@ flow_to_route_in6(struct flentry *fle, s sin6->sin6_family = AF_INET6; 
sin6->sin6_len = sizeof(*sin6); - hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key; + hashkey = fle->f_flow6.ipf_key; memcpy(&sin6->sin6_addr, &hashkey[5], sizeof (struct in6_addr)); - ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt); - ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle); + ro->ro_rt = fle->f_rt; + ro->ro_lle = fle->f_lle; ro->ro_flags |= RT_NORTREF; } #endif /* INET6 */ @@ -672,31 +633,24 @@ flow_to_route_in6(struct flentry *fle, s static bitstr_t * flowtable_mask(struct flowtable *ft) { - bitstr_t *mask; - if (ft->ft_flags & FL_PCPU) - mask = ft->ft_masks[curcpu]; - else - mask = ft->ft_masks[0]; + /* + * flowtable_free_stale() calls w/o critical section, but + * with sched_bind(). Since pointer is stable throughout + * ft lifetime, it is safe, otherwise... + * + * CRITICAL_ASSERT(curthread); + */ - return (mask); + return (*(bitstr_t **)zpcpu_get(ft->ft_masks)); } -static struct flentry ** -flowtable_entry(struct flowtable *ft, uint32_t hash) +static struct flist * +flowtable_list(struct flowtable *ft, uint32_t hash) { - struct flentry **fle; - int index = (hash % ft->ft_size); - if (ft->ft_flags & FL_PCPU) { - KASSERT(&ft->ft_table.pcpu[curcpu][0] != NULL, ("pcpu not set")); - fle = &ft->ft_table.pcpu[curcpu][index]; - } else { - KASSERT(&ft->ft_table.global[0] != NULL, ("global not set")); - fle = &ft->ft_table.global[index]; - } - - return (fle); + CRITICAL_ASSERT(curthread); + return (zpcpu_get(ft->ft_table[hash % ft->ft_size])); } static int @@ -730,24 +684,6 @@ flow_stale(struct flowtable *ft, struct return (0); } -static void -flowtable_set_hashkey(struct flentry *fle, uint32_t *key) -{ - uint32_t *hashkey; - int i, nwords; - - if (fle->f_flags & FL_IPV6) { - nwords = 9; - hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key; - } else { - nwords = 3; - hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key; - } - - for (i = 0; i < nwords; i++) - hashkey[i] = key[i]; -} - static int flow_full(struct flowtable *ft) { @@ -755,8 
+691,8 @@ flow_full(struct flowtable *ft) int count, max; full = ft->ft_full; - count = uma_zone_get_cur(ft->ft_zone); - max = uma_zone_get_max(ft->ft_zone); + count = uma_zone_get_cur(flow_zone); + max = uma_zone_get_max(flow_zone); if (full && (count < (max - (max >> 3)))) ft->ft_full = FALSE; @@ -783,26 +719,31 @@ static int flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key, uint32_t fibnum, struct route *ro, uint16_t flags) { - struct flentry *fle, *fletail, *newfle, **flep; + struct flist *flist; + struct flentry *fle, *iter; int depth; bitstr_t *mask; - uint8_t proto; - newfle = uma_zalloc(ft->ft_zone, M_NOWAIT | M_ZERO); - if (newfle == NULL) + fle = uma_zalloc(flow_zone, M_NOWAIT | M_ZERO); + if (fle == NULL) return (ENOMEM); - newfle->f_flags |= (flags & FL_IPV6); - proto = flags_to_proto(flags); + bcopy(key, &fle->f_flow, KEYLEN(flags)); + fle->f_flags |= (flags & FL_IPV6); + fle->f_proto = flags_to_proto(flags); + fle->f_rt = ro->ro_rt; + fle->f_lle = ro->ro_lle; + fle->f_fhash = hash; + fle->f_fibnum = fibnum; + fle->f_uptime = time_uptime; - FL_ENTRY_LOCK(ft, hash); + critical_enter(); mask = flowtable_mask(ft); - flep = flowtable_entry(ft, hash); - fletail = fle = *flep; + flist = flowtable_list(ft, hash); - if (fle == NULL) { - bit_set(mask, FL_ENTRY_INDEX(ft, hash)); - *flep = fle = newfle; + if (SLIST_EMPTY(flist)) { + bit_set(mask, (hash % ft->ft_size)); + SLIST_INSERT_HEAD(flist, fle, f_next); goto skip; } @@ -812,65 +753,30 @@ flowtable_insert(struct flowtable *ft, u * find end of list and make sure that we were not * preempted by another thread handling this flow */ - while (fle != NULL) { - if (fle->f_fhash == hash && !flow_stale(ft, fle)) { + SLIST_FOREACH(iter, flist, f_next) { + if (iter->f_fhash == hash && !flow_stale(ft, iter)) { /* * there was either a hash collision * or we lost a race to insert */ - FL_ENTRY_UNLOCK(ft, hash); - uma_zfree(ft->ft_zone, newfle); + critical_exit(); + uma_zfree(flow_zone, fle); return 
(EEXIST); } - /* - * re-visit this double condition XXX - */ - if (fletail->f_next != NULL) - fletail = fle->f_next; - depth++; - fle = fle->f_next; } if (depth > ft->ft_max_depth) ft->ft_max_depth = depth; - fletail->f_next = newfle; - fle = newfle; + + SLIST_INSERT_HEAD(flist, fle, f_next); skip: - flowtable_set_hashkey(fle, key); + critical_exit(); - fle->f_proto = proto; - fle->f_rt = ro->ro_rt; - fle->f_lle = ro->ro_lle; - fle->f_fhash = hash; - fle->f_fibnum = fibnum; - fle->f_uptime = time_uptime; - FL_ENTRY_UNLOCK(ft, hash); return (0); } -static int -flowtable_key_equal(struct flentry *fle, uint32_t *key) -{ - uint32_t *hashkey; - int i, nwords; - - if (fle->f_flags & FL_IPV6) { - nwords = 9; - hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key; - } else { - nwords = 3; - hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key; - } - - for (i = 0; i < nwords; i++) - if (hashkey[i] != key[i]) - return (0); - - return (1); -} - struct flentry * flowtable_lookup(sa_family_t sa, struct mbuf *m) { @@ -895,6 +801,7 @@ flowtable_lookup_common(struct flowtable { struct route_in6 sro6; struct route sro, *ro; + struct flist *flist; struct flentry *fle; struct rtentry *rt; struct llentry *lle; @@ -974,34 +881,24 @@ flowtable_lookup_common(struct flowtable return (NULL); FLOWSTAT_INC(ft, ft_lookups); - FL_ENTRY_LOCK(ft, hash); - if ((fle = FL_ENTRY(ft, hash)) == NULL) { - FL_ENTRY_UNLOCK(ft, hash); - goto uncached; - } -keycheck: - rt = __DEVOLATILE(struct rtentry *, fle->f_rt); - lle = __DEVOLATILE(struct llentry *, fle->f_lle); - if ((rt != NULL) - && lle != NULL - && fle->f_fhash == hash - && flowtable_key_equal(fle, key) - && (proto == fle->f_proto) - && (fibnum == fle->f_fibnum) - && (rt->rt_flags & RTF_UP) - && (rt->rt_ifp != NULL) - && (lle->la_flags & LLE_VALID)) { - FLOWSTAT_INC(ft, ft_hits); - fle->f_uptime = time_uptime; - fle->f_flags |= flags; - FL_ENTRY_UNLOCK(ft, hash); - goto success; - } else if (fle->f_next != NULL) { - fle = fle->f_next; - goto 
keycheck; - } - FL_ENTRY_UNLOCK(ft, hash); -uncached: + + critical_enter(); + flist = flowtable_list(ft, hash); + SLIST_FOREACH(fle, flist, f_next) + if (fle->f_fhash == hash && bcmp(&fle->f_flow, key, + KEYLEN(fle->f_flags)) == 0 && + proto == fle->f_proto && fibnum == fle->f_fibnum && + (fle->f_rt->rt_flags & RTF_UP) && + fle->f_rt->rt_ifp != NULL && + (fle->f_lle->la_flags & LLE_VALID)) { + fle->f_uptime = time_uptime; + fle->f_flags |= flags; + critical_exit(); + FLOWSTAT_INC(ft, ft_hits); + goto success; + } + critical_exit(); + if (flags & FL_NOAUTO || flow_full(ft)) return (NULL); @@ -1088,38 +985,22 @@ success: /* * used by the bit_alloc macro */ -#define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO) - +#define calloc(count, size) malloc((count)*(size), M_FTABLE, M_WAITOK | M_ZERO) static void flowtable_alloc(struct flowtable *ft) { - if (ft->ft_flags & FL_PCPU) { - ft->ft_lock = flowtable_pcpu_lock; - ft->ft_unlock = flowtable_pcpu_unlock; - - for (int i = 0; i <= mp_maxid; i++) { - ft->ft_table.pcpu[i] = - malloc(ft->ft_size * sizeof(struct flentry *), - M_RTABLE, M_WAITOK | M_ZERO); - ft->ft_masks[i] = bit_alloc(ft->ft_size); - } - } else { - ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? 
(mp_maxid + 1): - (fls(mp_maxid + 1) << 1)); - - ft->ft_lock = flowtable_global_lock; - ft->ft_unlock = flowtable_global_unlock; - ft->ft_table.global = - malloc(ft->ft_size * sizeof(struct flentry *), - M_RTABLE, M_WAITOK | M_ZERO); - ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx), - M_RTABLE, M_WAITOK | M_ZERO); - for (int i = 0; i < ft->ft_lock_count; i++) - mtx_init(&ft->ft_locks[i], "flow", NULL, - MTX_DEF | MTX_DUPOK); + ft->ft_table = malloc(ft->ft_size * sizeof(struct flist), + M_FTABLE, M_WAITOK); + for (int i = 0; i < ft->ft_size; i++) + ft->ft_table[i] = uma_zalloc(pcpu_zone_ptr, M_WAITOK | M_ZERO); + + ft->ft_masks = uma_zalloc(pcpu_zone_ptr, M_WAITOK); + for (int i = 0; i < mp_ncpus; i++) { + bitstr_t **b; - ft->ft_masks[0] = bit_alloc(ft->ft_size); + b = zpcpu_get_cpu(ft->ft_masks, i); + *b = bit_alloc(ft->ft_size); } ft->ft_tmpmask = bit_alloc(ft->ft_size); @@ -1139,41 +1020,22 @@ flowtable_alloc(struct flowtable *ft) } } - -/* - * The rest of the code is devoted to garbage collection of expired entries. - * It is a new additon made necessary by the switch to dynamically allocating - * flow tables. 
- * - */ -static void -fle_free(struct flentry *fle, struct flowtable *ft) -{ - struct rtentry *rt; - struct llentry *lle; - - rt = __DEVOLATILE(struct rtentry *, fle->f_rt); - lle = __DEVOLATILE(struct llentry *, fle->f_lle); - if (rt != NULL) - RTFREE(rt); - if (lle != NULL) - LLE_FREE(lle); - uma_zfree(ft->ft_zone, fle); -} +#undef calloc static void flowtable_free_stale(struct flowtable *ft, struct rtentry *rt) { - int curbit = 0, tmpsize; - struct flentry *fle, **flehead, *fleprev; - struct flentry *flefreehead, *flefreetail, *fletmp; + struct flist *flist, freelist; + struct flentry *fle, *fle1, *fleprev; bitstr_t *mask, *tmpmask; + int curbit, tmpsize; - flefreehead = flefreetail = NULL; + SLIST_INIT(&freelist); mask = flowtable_mask(ft); tmpmask = ft->ft_tmpmask; tmpsize = ft->ft_size; memcpy(tmpmask, mask, ft->ft_size/8); + curbit = 0; /* * XXX Note to self, bit_ffs operates at the byte level * and thus adds gratuitous overhead @@ -1187,69 +1049,72 @@ flowtable_free_stale(struct flowtable *f break; } - FL_ENTRY_LOCK(ft, curbit); - flehead = flowtable_entry(ft, curbit); - fle = fleprev = *flehead; - FLOWSTAT_INC(ft, ft_free_checks); + + critical_enter(); + flist = flowtable_list(ft, curbit); #ifdef DIAGNOSTIC - if (fle == NULL && curbit > 0) { + if (SLIST_EMPTY(flist) && curbit > 0) { log(LOG_ALERT, "warning bit=%d set, but no fle found\n", curbit); } #endif - while (fle != NULL) { - if (rt != NULL) { - if (__DEVOLATILE(struct rtentry *, fle->f_rt) != rt) { - fleprev = fle; - fle = fle->f_next; - continue; - } - } else if (!flow_stale(ft, fle)) { + SLIST_FOREACH_SAFE(fle, flist, f_next, fle1) { + if (rt != NULL && fle->f_rt != rt) { fleprev = fle; - fle = fle->f_next; continue; } - /* - * delete head of the list - */ - if (fleprev == *flehead) { - fletmp = fleprev; - if (fle == fleprev) { - fleprev = *flehead = fle->f_next; - } else - fleprev = *flehead = fle; - fle = fle->f_next; - } else { - /* - * don't advance fleprev - */ - fletmp = fle; - 
fleprev->f_next = fle->f_next; - fle = fleprev->f_next; + if (!flow_stale(ft, fle)) { + fleprev = fle; + continue; } - if (flefreehead == NULL) - flefreehead = flefreetail = fletmp; - else { - flefreetail->f_next = fletmp; - flefreetail = fletmp; - } - fletmp->f_next = NULL; + if (fle == SLIST_FIRST(flist)) + SLIST_REMOVE_HEAD(flist, f_next); + else + SLIST_REMOVE_AFTER(fleprev, f_next); + SLIST_INSERT_HEAD(&freelist, fle, f_next); } - if (*flehead == NULL) + if (SLIST_EMPTY(flist)) bit_clear(mask, curbit); - FL_ENTRY_UNLOCK(ft, curbit); + critical_exit(); + bit_clear(tmpmask, curbit); tmpmask += (curbit / 8); tmpsize -= (curbit / 8) * 8; bit_ffs(tmpmask, tmpsize, &curbit); } - while ((fle = flefreehead) != NULL) { - flefreehead = fle->f_next; + + SLIST_FOREACH_SAFE(fle, &freelist, f_next, fle1) { FLOWSTAT_INC(ft, ft_frees); - fle_free(fle, ft); + if (fle->f_rt != NULL) + RTFREE(fle->f_rt); + if (fle->f_lle != NULL) + LLE_FREE(fle->f_lle); + uma_zfree(flow_zone, fle); + } +} + +static void +flowtable_clean_vnet(struct flowtable *ft, struct rtentry *rt) +{ + int i; + + CPU_FOREACH(i) { + if (smp_started == 1) { + thread_lock(curthread); + sched_bind(curthread, i); + thread_unlock(curthread); + } + + flowtable_free_stale(ft, rt); + + if (smp_started == 1) { + thread_lock(curthread); + sched_unbind(curthread); + thread_unlock(curthread); + } } } @@ -1257,7 +1122,6 @@ void flowtable_route_flush(sa_family_t sa, struct rtentry *rt) { struct flowtable *ft; - int i; switch (sa) { #ifdef INET @@ -1274,51 +1138,7 @@ flowtable_route_flush(sa_family_t sa, st panic("%s: sa %d", __func__, sa); } - if (ft->ft_flags & FL_PCPU) { - CPU_FOREACH(i) { - if (smp_started == 1) { - thread_lock(curthread); - sched_bind(curthread, i); - thread_unlock(curthread); - } - - flowtable_free_stale(ft, rt); - - if (smp_started == 1) { - thread_lock(curthread); - sched_unbind(curthread); - thread_unlock(curthread); - } - } - } else { - flowtable_free_stale(ft, rt); - } -} - -static void 
-flowtable_clean_vnet(struct flowtable *ft) -{ - - if (ft->ft_flags & FL_PCPU) { - int i; - - CPU_FOREACH(i) { - if (smp_started == 1) { - thread_lock(curthread); - sched_bind(curthread, i); - thread_unlock(curthread); - } - - flowtable_free_stale(ft, NULL); - - if (smp_started == 1) { - thread_lock(curthread); - sched_unbind(curthread); - thread_unlock(curthread); - } - } - } else - flowtable_free_stale(ft, NULL); + flowtable_clean_vnet(ft, rt); } static void @@ -1335,10 +1155,10 @@ flowtable_cleaner(void) VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); #ifdef INET - flowtable_clean_vnet(&V_ip4_ft); + flowtable_clean_vnet(&V_ip4_ft, NULL); #endif #ifdef INET6 - flowtable_clean_vnet(&V_ip6_ft); + flowtable_clean_vnet(&V_ip6_ft, NULL); #endif CURVNET_RESTORE(); } @@ -1408,16 +1228,9 @@ flowtable_init(const void *unused __unus flow_hashjitter = arc4random(); -#ifdef INET - flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4), + flow_zone = uma_zcreate("flows", sizeof(struct flentry), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, UMA_ZONE_MAXBUCKET); - uma_zone_set_max(flow_ipv4_zone, 1024 + maxusers * 64 * mp_ncpus); -#endif -#ifdef INET6 - flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6), - NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, UMA_ZONE_MAXBUCKET); - uma_zone_set_max(flow_ipv6_zone, 1024 + maxusers * 64 * mp_ncpus); -#endif + uma_zone_set_max(flow_zone, 1024 + maxusers * 64 * mp_ncpus); cv_init(&flowclean_c_cv, "c_flowcleanwait"); cv_init(&flowclean_f_cv, "f_flowcleanwait"); @@ -1432,8 +1245,6 @@ SYSINIT(flowtable_init, SI_SUB_PROTO_BEG #ifdef INET static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip4, CTLFLAG_RD, NULL, "Flowtable for IPv4"); -SYSCTL_UMA_MAX(_net_flowtable_ip4, OID_AUTO, maxflows, CTLFLAG_RW, - &flow_ipv4_zone, "Maximum number of IPv4 flows allowed"); static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip4_ftstat); VNET_PCPUSTAT_SYSINIT(ip4_ftstat); @@ -1446,9 +1257,7 @@ static void flowtable_init_vnet_v4(const void *unused 
__unused) { - V_ip4_ft.ft_zone = flow_ipv4_zone; V_ip4_ft.ft_size = flowtable_get_size("net.flowtable.ip4.size"); - V_ip4_ft.ft_flags = FL_PCPU; V_ip4_ft.ft_stat = VNET(ip4_ftstat); flowtable_alloc(&V_ip4_ft); } @@ -1459,8 +1268,6 @@ VNET_SYSINIT(ft_vnet_v4, SI_SUB_PROTO_IF #ifdef INET6 static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip6, CTLFLAG_RD, NULL, "Flowtable for IPv6"); -SYSCTL_UMA_MAX(_net_flowtable_ip6, OID_AUTO, maxflows, CTLFLAG_RW, - &flow_ipv6_zone, "Maximum number of IPv6 flows allowed"); static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip6_ftstat); VNET_PCPUSTAT_SYSINIT(ip6_ftstat); @@ -1473,9 +1280,7 @@ static void flowtable_init_vnet_v6(const void *unused __unused) { - V_ip6_ft.ft_zone = flow_ipv6_zone; V_ip6_ft.ft_size = flowtable_get_size("net.flowtable.ip6.size"); - V_ip6_ft.ft_flags = FL_PCPU; V_ip6_ft.ft_stat = VNET(ip6_ftstat); flowtable_alloc(&V_ip6_ft); } @@ -1484,45 +1289,18 @@ VNET_SYSINIT(flowtable_init_vnet_v6, SI_ #endif /* INET6 */ #ifdef DDB -static uint32_t * -flowtable_get_hashkey(struct flentry *fle) -{ - uint32_t *hashkey; - - if (fle->f_flags & FL_IPV6) - hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key; - else - hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key; - - return (hashkey); -} - static bitstr_t * flowtable_mask_pcpu(struct flowtable *ft, int cpuid) { - bitstr_t *mask; - if (ft->ft_flags & FL_PCPU) - mask = ft->ft_masks[cpuid]; - else - mask = ft->ft_masks[0]; - - return (mask); + return (zpcpu_get_cpu(*ft->ft_masks, cpuid)); } -static struct flentry ** -flowtable_entry_pcpu(struct flowtable *ft, uint32_t hash, int cpuid) +static struct flist * +flowtable_list_pcpu(struct flowtable *ft, uint32_t hash, int cpuid) { - struct flentry **fle; - int index = (hash % ft->ft_size); - - if (ft->ft_flags & FL_PCPU) { - fle = &ft->ft_table.pcpu[cpuid][index]; - } else { - fle = &ft->ft_table.global[index]; - } - return (fle); + return (zpcpu_get_cpu(&ft->ft_table[hash % ft->ft_size], cpuid)); } static void @@ -1542,7 
+1320,7 @@ flow_show(struct flowtable *ft, struct f if (rt_valid) ifp = rt->rt_ifp; ifp_valid = ifp != NULL; - hashkey = flowtable_get_hashkey(fle); + hashkey = (uint32_t *)&fle->f_flow; if (fle->f_flags & FL_IPV6) goto skipaddr; @@ -1594,7 +1372,6 @@ static void flowtable_show(struct flowtable *ft, int cpuid) { int curbit = 0; - struct flentry *fle, **flehead; bitstr_t *mask, *tmpmask; if (cpuid != -1) @@ -1608,20 +1385,19 @@ flowtable_show(struct flowtable *ft, int */ bit_ffs(tmpmask, ft->ft_size, &curbit); while (curbit != -1) { + struct flist *flist; + struct flentry *fle; + if (curbit >= ft->ft_size || curbit < -1) { db_printf("warning: bad curbit value %d \n", curbit); break; } - flehead = flowtable_entry_pcpu(ft, curbit, cpuid); - fle = *flehead; + flist = flowtable_list_pcpu(ft, curbit, cpuid); - while (fle != NULL) { + SLIST_FOREACH(fle, flist, f_next) flow_show(ft, fle); - fle = fle->f_next; - continue; - } bit_clear(tmpmask, curbit); bit_ffs(tmpmask, ft->ft_size, &curbit); } @@ -1631,14 +1407,10 @@ static void flowtable_show_vnet(struct flowtable *ft) { - if (ft->ft_flags & FL_PCPU) { - int i; + int i; - CPU_FOREACH(i) { - flowtable_show(ft, i); - } - } else - flowtable_show(ft, -1); + CPU_FOREACH(i) + flowtable_show(ft, i); } DB_SHOW_COMMAND(flowtables, db_show_flowtables) Modified: head/sys/net/flowtable.h ============================================================================== --- head/sys/net/flowtable.h Thu Feb 13 04:55:46 2014 (r261822) +++ head/sys/net/flowtable.h Thu Feb 13 04:59:18 2014 (r261823) @@ -44,7 +44,6 @@ struct flowtable_stat { #ifdef _KERNEL #define FL_HASH_ALL (1<<0) /* hash 4-tuple + protocol */ -#define FL_PCPU (1<<1) /* pcpu cache */ #define FL_NOAUTO (1<<2) /* don't automatically add flentry on miss */ #define FL_IPV6 (1<<9)