Date: Tue, 5 Aug 2003 16:18:26 -0700 (PDT) From: Sam Leffler <sam@FreeBSD.org> To: Perforce Change Reviews <perforce@freebsd.org> Subject: PERFORCE change 35571 for review Message-ID: <200308052318.h75NIQ14073824@repoman.freebsd.org>
next in thread | raw e-mail | index | archive | help
http://perforce.freebsd.org/chv.cgi?CH=35571 Change 35571 by sam@sam_ebb on 2003/08/05 16:18:18 Checkpoint fast forwarding locking: we use a mutex per hash bucket. Need to revisit fast path processing to see if we can come up with a lock free scheme. Note this code is untested. Affected files ... .. //depot/projects/netperf/sys/netinet/ip_flow.c#2 edit .. //depot/projects/netperf/sys/netinet/ip_flow.h#2 edit Differences ... ==== //depot/projects/netperf/sys/netinet/ip_flow.c#2 (text+ko) ==== @@ -38,8 +38,10 @@ #include <sys/param.h> #include <sys/systm.h> +#include <sys/lock.h> #include <sys/malloc.h> #include <sys/mbuf.h> +#include <sys/mutex.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/kernel.h> @@ -59,10 +61,31 @@ #define IPFLOW_TIMER (5 * PR_SLOWHZ) #define IPFLOW_HASHBITS 6 /* should not be a multiple of 8 */ #define IPFLOW_HASHSIZE (1 << IPFLOW_HASHBITS) -static LIST_HEAD(ipflowhead, ipflow) ipflows[IPFLOW_HASHSIZE]; +#if IPFLOW_HASHSIZE > 255 +#error "make ipf_hash larger" +#endif +static struct ipflow_head ipflows[IPFLOW_HASHSIZE]; static int ipflow_inuse; #define IPFLOW_MAX 256 +/* + * Each flow list has a lock that guards updates to the list and to + * all entries on the list. Flow entries hold the hash index for + * finding the head of the list so the lock can be found quickly. + * + * ipflow_inuse holds a count of the number of flow entries present. + * This is used to bound the size of the table. When IPFLOW_MAX entries + * are present and an additional entry is needed one is chosen for + * replacement. We could use atomic ops for this counter but having it + * inconsistent doesn't appear to be a problem. + */ +#define IPFLOW_HEAD_LOCK(_ipfh) mtx_lock(&(_ipfh)->ipfh_mtx) +#define IPFLOW_HEAD_UNLOCK(_ipfh) mtx_unlock(&(_ipfh)->ipfh_mtx) +#define IPFLOW_LOCK(_ipf) \ + IPFLOW_HEAD_LOCK(&ipflows[(_ipf)->ipf_hash]) +#define IPFLOW_UNLOCK(_ipf) \ + IPFLOW_HEAD_UNLOCK(&ipflows[(_ipf)->ipf_hash]) + static int ipflow_active = 0; SYSCTL_INT(_net_inet_ip, IPCTL_FASTFORWARDING, fastforwarding, CTLFLAG_RW, &ipflow_active, 0, "Enable flow-based IP forwarding"); @@ -70,10 +93,7 @@ static MALLOC_DEFINE(M_IPFLOW, "ip_flow", "IP flow"); static unsigned -ipflow_hash( - struct in_addr dst, - struct in_addr src, - unsigned tos) +ipflow_hash(struct in_addr dst, struct in_addr src, unsigned tos) { unsigned hash = tos; int idx; @@ -83,28 +103,30 @@ } static struct ipflow * -ipflow_lookup( - const struct ip *ip) +ipflow_lookup(const struct ip *ip) { unsigned hash; + struct ipflow_head *head; struct ipflow *ipf; hash = ipflow_hash(ip->ip_dst, ip->ip_src, ip->ip_tos); + head = &ipflows[hash]; - ipf = LIST_FIRST(&ipflows[hash]); - while (ipf != NULL) { + IPFLOW_HEAD_LOCK(head); + LIST_FOREACH(ipf, &head->ipfh_head, ipf_next) { if (ip->ip_dst.s_addr == ipf->ipf_dst.s_addr && ip->ip_src.s_addr == ipf->ipf_src.s_addr - && ip->ip_tos == ipf->ipf_tos) - break; - ipf = LIST_NEXT(ipf, ipf_next); + && ip->ip_tos == ipf->ipf_tos) { + /* NB: return head locked */ + return ipf; + } } - return ipf; + IPFLOW_HEAD_UNLOCK(head); + return NULL; } int -ipflow_fastforward( - struct mbuf *m) +ipflow_fastforward(struct mbuf *m) { struct ip *ip; struct ipflow *ipf; @@ -134,14 +156,18 @@ * Route and interface still up? */ rt = ipf->ipf_ro.ro_rt; - if ((rt->rt_flags & RTF_UP) == 0 || (rt->rt_ifp->if_flags & IFF_UP) == 0) + if ((rt->rt_flags & RTF_UP) == 0 || (rt->rt_ifp->if_flags & IFF_UP) == 0) { + IPFLOW_UNLOCK(ipf); return 0; + } /* * Packet size OK? TTL? */ - if (m->m_pkthdr.len > rt->rt_ifp->if_mtu || ip->ip_ttl <= IPTTLDEC) + if (m->m_pkthdr.len > rt->rt_ifp->if_mtu || ip->ip_ttl <= IPTTLDEC) { + IPFLOW_UNLOCK(ipf); return 0; + } /* * Everything checks out and so we can forward this packet. @@ -170,12 +196,12 @@ else ipf->ipf_errors++; } + IPFLOW_UNLOCK(ipf); return 1; } - + static void -ipflow_addstats( - struct ipflow *ipf) +ipflow_addstats(struct ipflow *ipf) { ipf->ipf_ro.ro_rt->rt_use += ipf->ipf_uses; ipstat.ips_cantforward += ipf->ipf_errors + ipf->ipf_dropped; @@ -183,36 +209,21 @@ ipstat.ips_fastforward += ipf->ipf_uses; } -static void -ipflow_free( - struct ipflow *ipf) -{ - int s; - /* - * Remove the flow from the hash table (at elevated IPL). - * Once it's off the list, we can deal with it at normal - * network IPL. - */ - s = splimp(); - LIST_REMOVE(ipf, ipf_next); - splx(s); - ipflow_addstats(ipf); - RTFREE(ipf->ipf_ro.ro_rt); - ipflow_inuse--; - free(ipf, M_IPFLOW); -} - +/* + * XXX the locking here makes reaping an entry very expensive... + */ static struct ipflow * -ipflow_reap( - void) +ipflow_reap(void) { - struct ipflow *ipf, *maybe_ipf = NULL; + struct ipflow *victim = NULL; + struct ipflow *ipf; int idx; - int s; for (idx = 0; idx < IPFLOW_HASHSIZE; idx++) { - ipf = LIST_FIRST(&ipflows[idx]); - while (ipf != NULL) { + struct ipflow_head *head = &ipflows[idx]; + + IPFLOW_HEAD_LOCK(head); + LIST_FOREACH(ipf, &head->ipfh_head, ipf_next) { /* * If this no longer points to a valid route * reclaim it. @@ -224,38 +235,58 @@ * or has had the least uses in the last 1.5 * intervals. */ - if (maybe_ipf == NULL - || ipf->ipf_timer < maybe_ipf->ipf_timer - || (ipf->ipf_timer == maybe_ipf->ipf_timer + if (victim == NULL) + victim = ipf; + else if (ipf->ipf_timer < victim->ipf_timer + || (ipf->ipf_timer == victim->ipf_timer && ipf->ipf_last_uses + ipf->ipf_uses < - maybe_ipf->ipf_last_uses + - maybe_ipf->ipf_uses)) - maybe_ipf = ipf; - ipf = LIST_NEXT(ipf, ipf_next); + victim->ipf_last_uses + victim->ipf_uses)) { + if (victim->ipf_hash != ipf->ipf_hash) + IPFLOW_UNLOCK(victim); + victim = ipf; + } } + if (victim && victim->ipf_hash != idx) + IPFLOW_HEAD_UNLOCK(head); } - ipf = maybe_ipf; + ipf = victim; done: /* * Remove the entry from the flow table. */ - s = splimp(); LIST_REMOVE(ipf, ipf_next); - splx(s); + IPFLOW_UNLOCK(ipf); + ipflow_addstats(ipf); RTFREE(ipf->ipf_ro.ro_rt); return ipf; } +static void +ipflow_free(struct ipflow *ipf) +{ + /* + * Remove the flow from the hash table. + */ + LIST_REMOVE(ipf, ipf_next); + + ipflow_addstats(ipf); + RTFREE(ipf->ipf_ro.ro_rt); + ipflow_inuse--; + free(ipf, M_IPFLOW); +} + void -ipflow_slowtimo( - void) +ipflow_slowtimo(void) { struct ipflow *ipf; int idx; for (idx = 0; idx < IPFLOW_HASHSIZE; idx++) { - ipf = LIST_FIRST(&ipflows[idx]); + struct ipflow_head *head = &ipflows[idx]; + + IPFLOW_HEAD_LOCK(head); + ipf = LIST_FIRST(&head->ipfh_head); while (ipf != NULL) { struct ipflow *next_ipf = LIST_NEXT(ipf, ipf_next); if (--ipf->ipf_timer == 0) { @@ -269,18 +300,15 @@ } ipf = next_ipf; } + IPFLOW_HEAD_UNLOCK(head); } } void -ipflow_create( - const struct route *ro, - struct mbuf *m) +ipflow_create(const struct route *ro, struct mbuf *m) { const struct ip *const ip = mtod(m, struct ip *); struct ipflow *ipf; - unsigned hash; - int s; /* * Don't create cache entries for ICMP messages. @@ -304,12 +332,18 @@ ipflow_inuse++; } bzero((caddr_t) ipf, sizeof(*ipf)); + + ipf->ipf_hash = ipflow_hash(ip->ip_dst, ip->ip_src, ip->ip_tos); + ipf->ipf_dst = ip->ip_dst; + ipf->ipf_src = ip->ip_src; + ipf->ipf_tos = ip->ip_tos; + + IPFLOW_LOCK(ipf); } else { - s = splimp(); LIST_REMOVE(ipf, ipf_next); - splx(s); - ipflow_addstats(ipf); - RTFREE(ipf->ipf_ro.ro_rt); + + ipflow_addstats(ipf); /* add stats to old route */ + RTFREE(ipf->ipf_ro.ro_rt); /* clear reference */ ipf->ipf_uses = ipf->ipf_last_uses = 0; ipf->ipf_errors = ipf->ipf_dropped = 0; } @@ -318,16 +352,26 @@ * Fill in the updated information. */ ipf->ipf_ro = *ro; + RT_LOCK(ro->ro_rt); ro->ro_rt->rt_refcnt++; - ipf->ipf_dst = ip->ip_dst; - ipf->ipf_src = ip->ip_src; - ipf->ipf_tos = ip->ip_tos; + RT_UNLOCK(ro->ro_rt); ipf->ipf_timer = IPFLOW_TIMER; /* * Insert into the approriate bucket of the flow table. */ - hash = ipflow_hash(ip->ip_dst, ip->ip_src, ip->ip_tos); - s = splimp(); - LIST_INSERT_HEAD(&ipflows[hash], ipf, ipf_next); - splx(s); + LIST_INSERT_HEAD(&ipflows[ipf->ipf_hash].ipfh_head, ipf, ipf_next); + IPFLOW_UNLOCK(ipf); +} + +static void +ipflow_init(void) +{ + int idx; + + for (idx = 0; idx < IPFLOW_HASHSIZE; idx++) { + struct ipflow_head *head = &ipflows[idx]; + LIST_INIT(&head->ipfh_head); + mtx_init(&head->ipfh_mtx, "ipflow list head", NULL, MTX_DEF); + } } +SYSINIT(ipflow, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, ipflow_init, 0); ==== //depot/projects/netperf/sys/netinet/ip_flow.h#2 (text+ko) ==== @@ -44,6 +44,8 @@ struct in_addr ipf_dst; /* destination address */ struct in_addr ipf_src; /* source address */ + /* NB: this assumes the size of the list head hash table is <=256 */ + u_int8_t ipf_hash; /* index in list head table */ u_int8_t ipf_tos; /* type-of-service */ struct route ipf_ro; /* associated route entry */ u_long ipf_uses; /* number of uses in this period */ @@ -54,4 +56,9 @@ u_long ipf_last_uses; /* number of uses in last period */ }; +struct ipflow_head { + LIST_HEAD(ipflowhead, ipflow) ipfh_head; + struct mtx ipfh_mtx; +}; + #endif
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200308052318.h75NIQ14073824>