Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 6 Jun 2011 12:55:03 +0000 (UTC)
From:      Robert Watson <rwatson@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r222748 - in head/sys: conf netinet netinet/ipfw netinet6
Message-ID:  <201106061255.p56Ct3qN031795@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: rwatson
Date: Mon Jun  6 12:55:02 2011
New Revision: 222748
URL: http://svn.freebsd.org/changeset/base/222748

Log:
  Implement a CPU-affine TCP and UDP connection lookup data structure,
  struct inpcbgroup.  pcbgroups, or "connection groups", supplement the
  existing inpcbinfo connection hash table, which when pcbgroups are
  enabled, might now be thought of more usefully as a per-protocol
  4-tuple reservation table.
  
  Connections are assigned to connection groups based on a hash of their
  4-tuple; wildcard sockets require special handling, and are members
  of all connection groups.  During a connection lookup, a
  per-connection group lock is employed rather than the global pcbinfo
  lock.  By aligning connection groups with input path processing,
  connection groups take on an effective CPU affinity, especially when
  aligned with RSS work placement (see a forthcoming commit for
  details).  This eliminates cache line migration associated with
  global, protocol-layer data structures in steady state TCP and UDP
  processing (with the exception of protocol-layer statistics; further
  commit to follow).
  
  Elements of this approach were inspired by Willman, Rixner, and Cox's
  2006 USENIX paper, "An Evaluation of Network Stack Parallelization
  Strategies in Modern Operating Systems".  However, there are also
  significant differences: we maintain the inpcb lock, rather than using
  the connection group lock for per-connection state.
  
  Likewise, the focus of this implementation is alignment with NIC
  packet distribution strategies such as RSS, rather than pure software
  strategies.  Despite that focus, software distribution is supported
  through the parallel netisr implementation, and works well in
  configurations where the number of hardware threads is greater than
  the number of NIC input queues, such as in the RMI XLR threaded MIPS
  architecture.
  
  Another important difference is the continued maintenance of existing
  hash tables as "reservation tables" -- these are useful both to
  distinguish the resource allocation aspect of protocol name management
  and the more common-case lookup aspect.  In configurations where
  connection tables are aligned with hardware hashes, it is desirable to
  use the traditional lookup tables for loopback or encapsulated traffic
  rather than take the expense of hardware hashes that are hard to
  implement efficiently in software (such as RSS Toeplitz).
  
  Connection group support is enabled by compiling "options PCBGROUP"
  into your kernel configuration; for the time being, this is an
  experimental feature, and hence is not enabled by default.
  
  Subject to the limited MFCability of change dependencies in inpcb,
  and its change to the inpcbinfo init function signature, this change
  in principle could be merged to FreeBSD 8.x.
  
  Reviewed by:    bz
  Sponsored by:   Juniper Networks, Inc.

Added:
  head/sys/netinet/in_pcbgroup.c   (contents, props changed)
  head/sys/netinet6/in6_pcbgroup.c   (contents, props changed)
Modified:
  head/sys/conf/files
  head/sys/conf/options
  head/sys/netinet/in_pcb.c
  head/sys/netinet/in_pcb.h
  head/sys/netinet/ip_divert.c
  head/sys/netinet/ipfw/ip_fw2.c
  head/sys/netinet/raw_ip.c
  head/sys/netinet/tcp_subr.c
  head/sys/netinet/tcp_syncache.c
  head/sys/netinet/udp_usrreq.c
  head/sys/netinet6/in6_pcb.c
  head/sys/netinet6/in6_pcb.h

Modified: head/sys/conf/files
==============================================================================
--- head/sys/conf/files	Mon Jun  6 12:21:42 2011	(r222747)
+++ head/sys/conf/files	Mon Jun  6 12:55:02 2011	(r222748)
@@ -2748,6 +2748,7 @@ netinet/ip_gre.c		optional gre inet
 netinet/ip_id.c			optional inet
 netinet/in_mcast.c		optional inet
 netinet/in_pcb.c		optional inet | inet6
+netinet/in_pcbgroup.c		optional inet pcbgroup | inet6 pcbgroup
 netinet/in_proto.c		optional inet | inet6 \
 	compile-with "${NORMAL_C} -I$S/contrib/pf"
 netinet/in_rmx.c		optional inet
@@ -2825,6 +2826,7 @@ netinet6/in6_gif.c		optional gif inet6 |
 netinet6/in6_ifattach.c		optional inet6
 netinet6/in6_mcast.c		optional inet6
 netinet6/in6_pcb.c		optional inet6
+netinet6/in6_pcbgroup.c		optional inet6 pcbgroup
 netinet6/in6_proto.c		optional inet6
 netinet6/in6_rmx.c		optional inet6
 netinet6/in6_src.c		optional inet6

Modified: head/sys/conf/options
==============================================================================
--- head/sys/conf/options	Mon Jun  6 12:21:42 2011	(r222747)
+++ head/sys/conf/options	Mon Jun  6 12:55:02 2011	(r222748)
@@ -419,6 +419,7 @@ MROUTING		opt_mrouting.h
 NCP
 NETATALK		opt_atalk.h
 NFSLOCKD
+PCBGROUP		opt_pcbgroup.h
 RADIX_MPATH		opt_mpath.h
 ROUTETABLES		opt_route.h
 SLIP_IFF_OPTS		opt_slip.h

Modified: head/sys/netinet/in_pcb.c
==============================================================================
--- head/sys/netinet/in_pcb.c	Mon Jun  6 12:21:42 2011	(r222747)
+++ head/sys/netinet/in_pcb.c	Mon Jun  6 12:55:02 2011	(r222748)
@@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$");
 #include "opt_ipsec.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
+#include "opt_pcbgroup.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -212,7 +213,7 @@ void
 in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
     struct inpcbhead *listhead, int hash_nelements, int porthash_nelements,
     char *inpcbzone_name, uma_init inpcbzone_init, uma_fini inpcbzone_fini,
-    uint32_t inpcbzone_flags)
+    uint32_t inpcbzone_flags, u_int hashfields)
 {
 
 	INP_INFO_LOCK_INIT(pcbinfo, name);
@@ -227,6 +228,9 @@ in_pcbinfo_init(struct inpcbinfo *pcbinf
 	    &pcbinfo->ipi_hashmask);
 	pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
 	    &pcbinfo->ipi_porthashmask);
+#ifdef PCBGROUP
+	in_pcbgroup_init(pcbinfo, hashfields, hash_nelements);
+#endif
 	pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb),
 	    NULL, NULL, inpcbzone_init, inpcbzone_fini, UMA_ALIGN_PTR,
 	    inpcbzone_flags);
@@ -246,6 +250,9 @@ in_pcbinfo_destroy(struct inpcbinfo *pcb
 	hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
 	hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
 	    pcbinfo->ipi_porthashmask);
+#ifdef PCBGROUP
+	in_pcbgroup_destroy(pcbinfo);
+#endif
 	uma_zdestroy(pcbinfo->ipi_zone);
 	INP_HASH_LOCK_DESTROY(pcbinfo);
 	INP_INFO_LOCK_DESTROY(pcbinfo);
@@ -1053,7 +1060,8 @@ in_pcbdetach(struct inpcb *inp)
  * in_pcbref() bumps the reference count on an inpcb in order to maintain
  * stability of an inpcb pointer despite the inpcb lock being released.  This
  * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
- * but where the inpcb lock is already held.
+ * but where the inpcb lock may already be held, or when acquiring a
+ * reference via a pcbgroup.
  *
  * in_pcbref() should be used only to provide brief memory stability, and
  * must always be followed by a call to INP_WLOCK() and in_pcbrele() to
@@ -1223,6 +1231,9 @@ in_pcbdrop(struct inpcb *inp)
 		}
 		INP_HASH_WUNLOCK(inp->inp_pcbinfo);
 		inp->inp_flags &= ~INP_INHASHLIST;
+#ifdef PCBGROUP
+		in_pcbgroup_remove(inp);
+#endif
 	}
 }
 
@@ -1472,6 +1483,148 @@ in_pcblookup_local(struct inpcbinfo *pcb
 }
 #undef INP_LOOKUP_MAPPED_PCB_COST
 
+#ifdef PCBGROUP
+/*
+ * Lookup PCB in hash list, using pcbgroup tables.
+ */
+static struct inpcb *
+in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
+    struct in_addr faddr, u_int fport_arg, struct in_addr laddr,
+    u_int lport_arg, int lookupflags, struct ifnet *ifp)
+{
+	struct inpcbhead *head;
+	struct inpcb *inp, *tmpinp;
+	u_short fport = fport_arg, lport = lport_arg;
+
+	/*
+	 * First look for an exact match.
+	 */
+	tmpinp = NULL;
+	INP_GROUP_LOCK(pcbgroup);
+	head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
+	    pcbgroup->ipg_hashmask)];
+	LIST_FOREACH(inp, head, inp_pcbgrouphash) {
+#ifdef INET6
+		/* XXX inp locking */
+		if ((inp->inp_vflag & INP_IPV4) == 0)
+			continue;
+#endif
+		if (inp->inp_faddr.s_addr == faddr.s_addr &&
+		    inp->inp_laddr.s_addr == laddr.s_addr &&
+		    inp->inp_fport == fport &&
+		    inp->inp_lport == lport) {
+			/*
+			 * XXX We should be able to directly return
+			 * the inp here, without any checks.
+			 * Well unless both bound with SO_REUSEPORT?
+			 */
+			if (prison_flag(inp->inp_cred, PR_IP4))
+				goto found;
+			if (tmpinp == NULL)
+				tmpinp = inp;
+		}
+	}
+	if (tmpinp != NULL) {
+		inp = tmpinp;
+		goto found;
+	}
+
+	/*
+	 * Then look for a wildcard match, if requested.
+	 */
+	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
+		struct inpcb *local_wild = NULL, *local_exact = NULL;
+#ifdef INET6
+		struct inpcb *local_wild_mapped = NULL;
+#endif
+		struct inpcb *jail_wild = NULL;
+		struct inpcbhead *head;
+		int injail;
+
+		/*
+		 * Order of socket selection - we always prefer jails.
+		 *      1. jailed, non-wild.
+		 *      2. jailed, wild.
+		 *      3. non-jailed, non-wild.
+		 *      4. non-jailed, wild.
+		 */
+		head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport,
+		    0, pcbinfo->ipi_wildmask)];
+		LIST_FOREACH(inp, head, inp_pcbgroup_wild) {
+#ifdef INET6
+			/* XXX inp locking */
+			if ((inp->inp_vflag & INP_IPV4) == 0)
+				continue;
+#endif
+			if (inp->inp_faddr.s_addr != INADDR_ANY ||
+			    inp->inp_lport != lport)
+				continue;
+
+			/* XXX inp locking */
+			if (ifp && ifp->if_type == IFT_FAITH &&
+			    (inp->inp_flags & INP_FAITH) == 0)
+				continue;
+
+			injail = prison_flag(inp->inp_cred, PR_IP4);
+			if (injail) {
+				if (prison_check_ip4(inp->inp_cred,
+				    &laddr) != 0)
+					continue;
+			} else {
+				if (local_exact != NULL)
+					continue;
+			}
+
+			if (inp->inp_laddr.s_addr == laddr.s_addr) {
+				if (injail)
+					goto found;
+				else
+					local_exact = inp;
+			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
+#ifdef INET6
+				/* XXX inp locking, NULL check */
+				if (inp->inp_vflag & INP_IPV6PROTO)
+					local_wild_mapped = inp;
+				else
+#endif /* INET6 */
+					if (injail)
+						jail_wild = inp;
+					else
+						local_wild = inp;
+			}
+		} /* LIST_FOREACH */
+		inp = jail_wild;
+		if (inp == NULL)
+			inp = local_exact;
+		if (inp == NULL)
+			inp = local_wild;
+#ifdef INET6
+		if (inp == NULL)
+			inp = local_wild_mapped;
+#endif /* defined(INET6) */
+		if (inp != NULL)
+			goto found;
+	} /* if (lookupflags & INPLOOKUP_WILDCARD) */
+	INP_GROUP_UNLOCK(pcbgroup);
+	return (NULL);
+
+found:
+	in_pcbref(inp);
+	INP_GROUP_UNLOCK(pcbgroup);
+	if (lookupflags & INPLOOKUP_WLOCKPCB) {
+		INP_WLOCK(inp);
+		if (in_pcbrele_wlocked(inp))
+			return (NULL);
+	} else if (lookupflags & INPLOOKUP_RLOCKPCB) {
+		INP_RLOCK(inp);
+		if (in_pcbrele_rlocked(inp))
+			return (NULL);
+	} else
+		panic("%s: locking bug", __func__);
+	return (inp);
+}
+#endif /* PCBGROUP */
+
 /*
  * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
  * that the caller has locked the hash list, and will not perform any further
@@ -1636,17 +1789,30 @@ in_pcblookup_hash(struct inpcbinfo *pcbi
 /*
  * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
  * from which a pre-calculated hash value may be extracted.
+ *
+ * Possibly more of this logic should be in in_pcbgroup.c.
  */
 struct inpcb *
 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
     struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp)
 {
+#if defined(PCBGROUP)
+	struct inpcbgroup *pcbgroup;
+#endif
 
 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 	    ("%s: LOCKPCB not set", __func__));
 
+#if defined(PCBGROUP)
+	if (in_pcbgroup_enabled(pcbinfo)) {
+		pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
+		    fport);
+		return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
+		    laddr, lport, lookupflags, ifp));
+	}
+#endif
 	return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags, ifp));
 }
@@ -1656,12 +1822,28 @@ in_pcblookup_mbuf(struct inpcbinfo *pcbi
     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
     struct ifnet *ifp, struct mbuf *m)
 {
+#ifdef PCBGROUP
+	struct inpcbgroup *pcbgroup;
+#endif
 
 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 	    ("%s: LOCKPCB not set", __func__));
 
+#ifdef PCBGROUP
+	if (in_pcbgroup_enabled(pcbinfo)) {
+		pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
+		    m->m_pkthdr.flowid);
+		if (pcbgroup != NULL)
+			return (in_pcblookup_group(pcbinfo, pcbgroup, faddr,
+			    fport, laddr, lport, lookupflags, ifp));
+		pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
+		    fport);
+		return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
+		    laddr, lport, lookupflags, ifp));
+	}
+#endif
 	return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags, ifp));
 }
@@ -1670,8 +1852,8 @@ in_pcblookup_mbuf(struct inpcbinfo *pcbi
 /*
  * Insert PCB onto various hash lists.
  */
-int
-in_pcbinshash(struct inpcb *inp)
+static int
+in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update)
 {
 	struct inpcbhead *pcbhash;
 	struct inpcbporthead *pcbporthash;
@@ -1721,10 +1903,39 @@ in_pcbinshash(struct inpcb *inp)
 	LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
 	LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
 	inp->inp_flags |= INP_INHASHLIST;
+#ifdef PCBGROUP
+	if (do_pcbgroup_update)
+		in_pcbgroup_update(inp);
+#endif
 	return (0);
 }
 
 /*
+ * For now, there are two public interfaces to insert an inpcb into the hash
+ * lists -- one that does update pcbgroups, and one that doesn't.  The latter
+ * is used only in the TCP syncache, where in_pcbinshash is called before the
+ * full 4-tuple is set for the inpcb, and we don't want to install in the
+ * pcbgroup until later.
+ *
+ * XXXRW: This seems like a misfeature.  in_pcbinshash should always update
+ * connection groups, and partially initialised inpcbs should not be exposed
+ * to either reservation hash tables or pcbgroups.
+ */
+int
+in_pcbinshash(struct inpcb *inp)
+{
+
+	return (in_pcbinshash_internal(inp, 1));
+}
+
+int
+in_pcbinshash_nopcbgroup(struct inpcb *inp)
+{
+
+	return (in_pcbinshash_internal(inp, 0));
+}
+
+/*
  * Move PCB to the proper hash bucket when { faddr, fport } have  been
  * changed. NOTE: This does not handle the case of the lport changing (the
  * hashed port list would have to be updated as well), so the lport must
@@ -1755,6 +1966,13 @@ in_pcbrehash_mbuf(struct inpcb *inp, str
 
 	LIST_REMOVE(inp, inp_hash);
 	LIST_INSERT_HEAD(head, inp, inp_hash);
+
+#ifdef PCBGROUP
+	if (m != NULL)
+		in_pcbgroup_update_mbuf(inp, m);
+	else
+		in_pcbgroup_update(inp);
+#endif
 }
 
 void
@@ -1791,6 +2009,9 @@ in_pcbremlists(struct inpcb *inp)
 	}
 	LIST_REMOVE(inp, inp_list);
 	pcbinfo->ipi_count--;
+#ifdef PCBGROUP
+	in_pcbgroup_remove(inp);
+#endif
 }
 
 /*

Modified: head/sys/netinet/in_pcb.h
==============================================================================
--- head/sys/netinet/in_pcb.h	Mon Jun  6 12:21:42 2011	(r222747)
+++ head/sys/netinet/in_pcb.h	Mon Jun  6 12:55:02 2011	(r222748)
@@ -141,6 +141,7 @@ struct	icmp6_filter;
  *
  * Key:
  * (c) - Constant after initialization
+ * (g) - Protected by the pcbgroup lock
  * (i) - Protected by the inpcb lock
  * (p) - Protected by the pcbinfo lock for the inpcb
  * (s) - Protected by another subsystem's locks
@@ -160,9 +161,12 @@ struct	icmp6_filter;
  */
 struct inpcb {
 	LIST_ENTRY(inpcb) inp_hash;	/* (i/p) hash list */
+	LIST_ENTRY(inpcb) inp_pcbgrouphash;	/* (g/i) hash list */
 	LIST_ENTRY(inpcb) inp_list;	/* (i/p) list for all PCBs for proto */
 	void	*inp_ppcb;		/* (i) pointer to per-protocol pcb */
 	struct	inpcbinfo *inp_pcbinfo;	/* (c) PCB list info */
+	struct	inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */
+	LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/p) group wildcard entry */
 	struct	socket *inp_socket;	/* (i) back pointer to socket */
 	struct	ucred	*inp_cred;	/* (c) cache of socket cred */
 	u_int32_t inp_flow;		/* (i) IPv6 flow information */
@@ -272,13 +276,14 @@ struct inpcbport {
  * the former covering mutable global fields (such as the global pcb list),
  * and the latter covering the hashed lookup tables.  The lock order is:
  *
- *    ipi_lock (before) inpcb locks (before) ipi_hash_lock
+ *    ipi_lock (before) inpcb locks (before) {ipi_hash_lock, pcbgroup locks}
  *
  * Locking key:
  *
  * (c) Constant or nearly constant after initialisation
  * (g) Locked by ipi_lock
- * (h) Read using either ipi_hash_lock or inpcb lock; write requires both.
+ * (h) Read using either ipi_hash_lock or inpcb lock; write requires both
+ * (p) Protected by one or more pcbgroup locks
  * (x) Synchronisation properties poorly defined
  */
 struct inpcbinfo {
@@ -312,7 +317,16 @@ struct inpcbinfo {
 	struct	uma_zone	*ipi_zone;		/* (c) */
 
 	/*
-	 * Global lock protecting hash lookup tables.
+	 * Connection groups associated with this protocol.  These fields are
+	 * constant, but pcbgroup structures themselves are protected by
+	 * per-pcbgroup locks.
+	 */
+	struct inpcbgroup	*ipi_pcbgroups;		/* (c) */
+	u_int			 ipi_npcbgroups;	/* (c) */
+	u_int			 ipi_hashfields;	/* (c) */
+
+	/*
+	 * Global lock protecting non-pcbgroup hash lookup tables.
 	 */
 	struct rwlock		 ipi_hash_lock;
 
@@ -330,6 +344,14 @@ struct inpcbinfo {
 	u_long			 ipi_porthashmask;	/* (h) */
 
 	/*
+	 * List of wildcard inpcbs for use with pcbgroups.  In the past, was
+	 * per-pcbgroup but is now global.  All pcbgroup locks must be held
+	 * to modify the list, so any is sufficient to read it.
+	 */
+	struct inpcbhead	*ipi_wildbase;		/* (p) */
+	u_long			 ipi_wildmask;		/* (p) */
+
+	/*
 	 * Pointer to network stack instance
 	 */
 	struct vnet		*ipi_vnet;		/* (c) */
@@ -340,6 +362,31 @@ struct inpcbinfo {
 	void 			*ipi_pspare[2];
 };
 
+/*
+ * Connection groups hold sets of connections that have similar CPU/thread
+ * affinity.  Each connection belongs to exactly one connection group.
+ */
+struct inpcbgroup {
+	/*
+	 * Per-connection group hash of inpcbs, hashed by local and foreign
+	 * addresses and port numbers.
+	 */
+	struct inpcbhead	*ipg_hashbase;		/* (c) */
+	u_long			 ipg_hashmask;		/* (c) */
+
+	/*
+	 * Notional affinity of this pcbgroup.
+	 */
+	u_int			 ipg_cpu;		/* (p) */
+
+	/*
+	 * Per-connection group lock, not to be confused with ipi_lock.
+	 * Protects the hash table hung off the group, but also the global
+	 * wildcard list in inpcbinfo.
+	 */
+	struct mtx		 ipg_lock;
+} __aligned(CACHE_LINE_SIZE);
+
 #define INP_LOCK_INIT(inp, d, t) \
 	rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE |  RW_DUPOK)
 #define INP_LOCK_DESTROY(inp)	rw_destroy(&(inp)->inp_lock)
@@ -423,6 +470,14 @@ void 	inp_4tuple_get(struct inpcb *inp, 
 #define	INP_HASH_WLOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_hash_lock, \
 					    RA_WLOCKED)
 
+#define	INP_GROUP_LOCK_INIT(ipg, d)	mtx_init(&(ipg)->ipg_lock, (d), NULL, \
+					    MTX_DEF | MTX_DUPOK)
+#define	INP_GROUP_LOCK_DESTROY(ipg)	mtx_destroy(&(ipg)->ipg_lock)
+
+#define	INP_GROUP_LOCK(ipg)		mtx_lock(&(ipg)->ipg_lock)
+#define	INP_GROUP_LOCK_ASSERT(ipg)	mtx_assert(&(ipg)->ipg_lock, MA_OWNED)
+#define	INP_GROUP_UNLOCK(ipg)		mtx_unlock(&(ipg)->ipg_lock)
+
 #define INP_PCBHASH(faddr, lport, fport, mask) \
 	(((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask))
 #define INP_PCBPORTHASH(lport, mask) \
@@ -482,6 +537,7 @@ void 	inp_4tuple_get(struct inpcb *inp, 
  */
 #define	INP_LLE_VALID		0x00000001 /* cached lle is valid */	
 #define	INP_RT_VALID		0x00000002 /* cached rtentry is valid */
+#define	INP_PCBGROUPWILD	0x00000004 /* in pcbgroup wildcard list */
 
 /*
  * Flags passed to in_pcblookup*() functions.
@@ -500,6 +556,13 @@ void 	inp_4tuple_get(struct inpcb *inp, 
 
 #define	INP_CHECK_SOCKAF(so, af)	(INP_SOCKAF(so) == af)
 
+/*
+ * Constants for pcbinfo.ipi_hashfields.
+ */
+#define	IPI_HASHFIELDS_NONE	0
+#define	IPI_HASHFIELDS_2TUPLE	1
+#define	IPI_HASHFIELDS_4TUPLE	2
+
 #ifdef _KERNEL
 VNET_DECLARE(int, ipport_reservedhigh);
 VNET_DECLARE(int, ipport_reservedlow);
@@ -531,7 +594,21 @@ VNET_DECLARE(int, ipport_tcpallocs);
 
 void	in_pcbinfo_destroy(struct inpcbinfo *);
 void	in_pcbinfo_init(struct inpcbinfo *, const char *, struct inpcbhead *,
-	    int, int, char *, uma_init, uma_fini, uint32_t);
+	    int, int, char *, uma_init, uma_fini, uint32_t, u_int);
+
+struct inpcbgroup *
+	in_pcbgroup_byhash(struct inpcbinfo *, u_int, uint32_t);
+struct inpcbgroup *
+	in_pcbgroup_byinpcb(struct inpcb *);
+struct inpcbgroup *
+	in_pcbgroup_bytuple(struct inpcbinfo *, struct in_addr, u_short,
+	    struct in_addr, u_short);
+void	in_pcbgroup_destroy(struct inpcbinfo *);
+int	in_pcbgroup_enabled(struct inpcbinfo *);
+void	in_pcbgroup_init(struct inpcbinfo *, u_int, int);
+void	in_pcbgroup_remove(struct inpcb *);
+void	in_pcbgroup_update(struct inpcb *);
+void	in_pcbgroup_update_mbuf(struct inpcb *, struct mbuf *);
 
 void	in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *);
 int	in_pcballoc(struct socket *, struct inpcbinfo *);
@@ -551,6 +628,7 @@ void	in_pcbdisconnect(struct inpcb *);
 void	in_pcbdrop(struct inpcb *);
 void	in_pcbfree(struct inpcb *);
 int	in_pcbinshash(struct inpcb *);
+int	in_pcbinshash_nopcbgroup(struct inpcb *);
 struct inpcb *
 	in_pcblookup_local(struct inpcbinfo *,
 	    struct in_addr, u_short, int, struct ucred *);

Added: head/sys/netinet/in_pcbgroup.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/netinet/in_pcbgroup.c	Mon Jun  6 12:55:02 2011	(r222748)
@@ -0,0 +1,457 @@
+/*-
+ * Copyright (c) 2010-2011 Juniper Networks, Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson under contract
+ * to Juniper Networks, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/smp.h>
+#include <sys/socketvar.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#ifdef INET6
+#include <netinet6/in6_pcb.h>
+#endif /* INET6 */
+
+/*
+ * pcbgroups, or "connection groups" are based on Willman, Rixner, and Cox's
+ * 2006 USENIX paper, "An Evaluation of Network Stack Parallelization
+ * Strategies in Modern Operating Systems".  This implementation differs
+ * significantly from that described in the paper, in that it attempts to
+ * introduce not just notions of affinity for connections and distribute work
+ * so as to reduce lock contention, but also align those notions with
+ * hardware work distribution strategies such as RSS.  In this construction,
+ * connection groups supplement, rather than replace, existing reservation
+ * tables for protocol 4-tuples, offering CPU-affine lookup tables with
+ * minimal cache line migration and lock contention during steady state
+ * operation.
+ *
+ * Internet protocols, such as UDP and TCP, register to use connection groups
+ * by providing an ipi_hashfields value other than IPI_HASHFIELDS_NONE; this
+ * indicates to the connection group code whether a 2-tuple or 4-tuple is
+ * used as an argument to hashes that assign a connection to a particular
+ * group.  This must be aligned with any hardware offloaded distribution
+ * model, such as RSS or similar approaches taken in embedded network boards.
+ * Wildcard sockets require special handling, as in Willman 2006, and are
+ * shared between connection groups -- while being protected by group-local
+ * locks.  This means that connection establishment and teardown can be
+ * significantly more expensive than without connection groups, but that
+ * steady-state processing can be significantly faster.
+ *
+ * Most of the implementation of connection groups is in this file; however,
+ * connection group lookup is implemented in in_pcb.c alongside reservation
+ * table lookups -- see in_pcblookup_group().
+ *
+ * TODO:
+ *
+ * Implement dynamic rebalancing of buckets with connection groups; when
+ * load is unevenly distributed, search for more optimal balancing on
+ * demand.  This might require scaling up the number of connection groups
+ * by <<1.
+ *
+ * Provide an IP 2-tuple or 4-tuple netisr m2cpu handler based on connection
+ * groups for ip_input and ip6_input, allowing non-offloaded work
+ * distribution.
+ *
+ * Expose effective CPU affinity of connections to userspace using socket
+ * options.
+ *
+ * Investigate per-connection affinity overrides based on socket options; an
+ * option could be set, certainly resulting in work being distributed
+ * differently in software, and possibly propagated to supporting hardware
+ * with TCAMs or hardware hash tables.  This might require connections to
+ * exist in more than one connection group at a time.
+ *
+ * Hook netisr thread reconfiguration events, and propagate those to RSS so
+ * that rebalancing can occur when the thread pool grows or shrinks.
+ *
+ * Expose per-pcbgroup statistics to userspace monitoring tools such as
+ * netstat, in order to allow better debugging and profiling.
+ */
+
+void
+in_pcbgroup_init(struct inpcbinfo *pcbinfo, u_int hashfields,
+    int hash_nelements)
+{
+	struct inpcbgroup *pcbgroup;
+	u_int numpcbgroups, pgn;
+
+	/*
+	 * Only enable connection groups for a protocol if it has been
+	 * specifically requested.
+	 */
+	if (hashfields == IPI_HASHFIELDS_NONE)
+		return;
+
+	/*
+	 * Connection groups are about multi-processor load distribution,
+	 * lock contention, and connection CPU affinity.  As such, no point
+	 * in turning them on for a uniprocessor machine, it only wastes
+	 * memory.
+	 */
+	if (mp_ncpus == 1)
+		return;
+
+	/*
+	 * Use one group per CPU for now.  If we decide to do dynamic
+	 * rebalancing a la RSS, we'll need to shift left by at least 1.
+	 */
+	numpcbgroups = mp_ncpus;
+
+	pcbinfo->ipi_hashfields = hashfields;
+	pcbinfo->ipi_pcbgroups = malloc(numpcbgroups *
+	    sizeof(*pcbinfo->ipi_pcbgroups), M_PCB, M_WAITOK | M_ZERO);
+	pcbinfo->ipi_npcbgroups = numpcbgroups;
+	pcbinfo->ipi_wildbase = hashinit(hash_nelements, M_PCB,
+	    &pcbinfo->ipi_wildmask);
+	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) {
+		pcbgroup = &pcbinfo->ipi_pcbgroups[pgn];
+		pcbgroup->ipg_hashbase = hashinit(hash_nelements, M_PCB,
+		    &pcbgroup->ipg_hashmask);
+		INP_GROUP_LOCK_INIT(pcbgroup, "pcbgroup");
+
+		/*
+		 * Initialise notional affinity of the pcbgroup -- for RSS,
+		 * we want the same notion of affinity as NICs to be used.
+		 * Just round robin for the time being.
+		 */
+		pcbgroup->ipg_cpu = (pgn % mp_ncpus);
+	}
+}
+
+void
+in_pcbgroup_destroy(struct inpcbinfo *pcbinfo)
+{
+	struct inpcbgroup *pcbgroup;
+	u_int pgn;
+
+	if (pcbinfo->ipi_npcbgroups == 0)
+		return;
+
+	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) {
+		pcbgroup = &pcbinfo->ipi_pcbgroups[pgn];
+		KASSERT(LIST_EMPTY(pcbinfo->ipi_listhead),
+		    ("in_pcbinfo_destroy: listhead not empty"));
+		INP_GROUP_LOCK_DESTROY(pcbgroup);
+		hashdestroy(pcbgroup->ipg_hashbase, M_PCB,
+		    pcbgroup->ipg_hashmask);
+	}
+	hashdestroy(pcbinfo->ipi_wildbase, M_PCB, pcbinfo->ipi_wildmask);
+	free(pcbinfo->ipi_pcbgroups, M_PCB);
+	pcbinfo->ipi_pcbgroups = NULL;
+	pcbinfo->ipi_npcbgroups = 0;
+	pcbinfo->ipi_hashfields = 0;
+}
+
+/*
+ * Given a hash of whatever the covered tuple might be, return a pcbgroup
+ * index.
+ */
+static __inline u_int
+in_pcbgroup_getbucket(struct inpcbinfo *pcbinfo, uint32_t hash)
+{
+
+	return (hash % pcbinfo->ipi_npcbgroups);
+}
+
+/*
+ * Map a (hashtype, hash) tuple into a connection group, or NULL if the hash
+ * information is insufficient to identify the pcbgroup.
+ */
+struct inpcbgroup *
+in_pcbgroup_byhash(struct inpcbinfo *pcbinfo, u_int hashtype, uint32_t hash)
+{
+
+	return (NULL);
+}
+
+static struct inpcbgroup *
+in_pcbgroup_bymbuf(struct inpcbinfo *pcbinfo, struct mbuf *m)
+{
+
+	return (in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
+	    m->m_pkthdr.flowid));
+}
+
+struct inpcbgroup *
+in_pcbgroup_bytuple(struct inpcbinfo *pcbinfo, struct in_addr laddr,
+    u_short lport, struct in_addr faddr, u_short fport)
+{
+	uint32_t hash;
+
+	switch (pcbinfo->ipi_hashfields) {
+	case IPI_HASHFIELDS_4TUPLE:
+		hash = faddr.s_addr ^ fport;
+		break;
+
+	case IPI_HASHFIELDS_2TUPLE:
+		hash = faddr.s_addr ^ laddr.s_addr;
+		break;
+
+	default:
+		hash = 0;
+	}
+	return (&pcbinfo->ipi_pcbgroups[in_pcbgroup_getbucket(pcbinfo,
+	    hash)]);
+}
+
+struct inpcbgroup *
+in_pcbgroup_byinpcb(struct inpcb *inp)
+{
+
+	return (in_pcbgroup_bytuple(inp->inp_pcbinfo, inp->inp_laddr,
+	    inp->inp_lport, inp->inp_faddr, inp->inp_fport));
+}
+
+static void
+in_pcbwild_add(struct inpcb *inp)
+{
+	struct inpcbinfo *pcbinfo;
+	struct inpcbhead *head;
+	u_int pgn;
+
+	INP_WLOCK_ASSERT(inp);
+	KASSERT(!(inp->inp_flags2 & INP_PCBGROUPWILD),
+	    ("%s: is wild",__func__));
+
+	pcbinfo = inp->inp_pcbinfo;
+	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
+		INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]);
+	head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, inp->inp_lport,
+	    0, pcbinfo->ipi_wildmask)];
+	LIST_INSERT_HEAD(head, inp, inp_pcbgroup_wild);
+	inp->inp_flags2 |= INP_PCBGROUPWILD;
+	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
+		INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]);
+}
+
+static void
+in_pcbwild_remove(struct inpcb *inp)
+{
+	struct inpcbinfo *pcbinfo;
+	u_int pgn;
+
+	INP_WLOCK_ASSERT(inp);
+	KASSERT((inp->inp_flags2 & INP_PCBGROUPWILD),
+	    ("%s: not wild", __func__));
+
+	pcbinfo = inp->inp_pcbinfo;
+	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
+		INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]);
+	LIST_REMOVE(inp, inp_pcbgroup_wild);
+	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
+		INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]);
+	inp->inp_flags2 &= ~INP_PCBGROUPWILD;
+}
+
+/*
+ * Report whether an inpcb has an unspecified foreign address: non-zero if
+ * so, zero otherwise.  With INET6 compiled in, an inpcb flagged INP_IPV6 is
+ * judged by its IPv6 foreign address; all others by the IPv4 one.
+ */
+static __inline int
+in_pcbwild_needed(struct inpcb *inp)
+{
+
+#ifdef INET6
+	if ((inp->inp_vflag & INP_IPV6) != 0)
+		return (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr));
+#endif
+	return (inp->inp_faddr.s_addr == htonl(INADDR_ANY));
+}
+
+/*
+ * Bring the wildcard table membership of an inpcb in line with what
+ * in_pcbwild_needed() reports: add the entry if it is wanted but absent,
+ * remove it if it is present but no longer wanted, otherwise do nothing.
+ */
+static void
+in_pcbwild_update_internal(struct inpcb *inp)
+{
+	int is_wild, want_wild;
+
+	want_wild = in_pcbwild_needed(inp);
+	is_wild = (inp->inp_flags2 & INP_PCBGROUPWILD) != 0;
+
+	if (want_wild && !is_wild)
+		in_pcbwild_add(inp);
+	else if (!want_wild && is_wild)
+		in_pcbwild_remove(inp);
+}
+
+/*
+ * Move an inpcb between connection groups: unlink it from the group it is
+ * currently in (if any) and link it into newpcbgroup (if non-NULL).  Nothing
+ * is changed when the inpcb is already in newpcbgroup.  Wildcard table
+ * membership is not handled here; the closing assertion only checks that no
+ * group is being installed for an inpcb that qualifies as a wildcard.  The
+ * pcbinfo argument is not referenced by this function.
+ */
+static void
+in_pcbgroup_update_internal(struct inpcbinfo *pcbinfo,
+    struct inpcbgroup *newpcbgroup, struct inpcb *inp)
+{
+	struct inpcbgroup *curpcbgroup;
+	struct inpcbhead *bucket;
+	uint32_t faddr_key;
+
+	INP_WLOCK_ASSERT(inp);
+
+	curpcbgroup = inp->inp_pcbgroup;
+	if (curpcbgroup != newpcbgroup) {
+		/* Leave the previous group, under that group's lock. */
+		if (curpcbgroup != NULL) {
+			INP_GROUP_LOCK(curpcbgroup);
+			LIST_REMOVE(inp, inp_pcbgrouphash);
+			inp->inp_pcbgroup = NULL;
+			INP_GROUP_UNLOCK(curpcbgroup);
+		}
+
+		/* Join the new group, under the new group's lock. */
+		if (newpcbgroup != NULL) {
+#ifdef INET6
+			if ((inp->inp_vflag & INP_IPV6) != 0) {
+				/* XXX: only the last 32-bit word is used. */
+				faddr_key = inp->in6p_faddr.s6_addr32[3];
+			} else
+#endif
+				faddr_key = inp->inp_faddr.s_addr;
+
+			INP_GROUP_LOCK(newpcbgroup);
+			bucket = &newpcbgroup->ipg_hashbase[
+			    INP_PCBHASH(faddr_key, inp->inp_lport,
+			    inp->inp_fport, newpcbgroup->ipg_hashmask)];
+			LIST_INSERT_HEAD(bucket, inp, inp_pcbgrouphash);
+			inp->inp_pcbgroup = newpcbgroup;
+			INP_GROUP_UNLOCK(newpcbgroup);
+		}
+	}
+
+	KASSERT(newpcbgroup == NULL || !in_pcbwild_needed(inp),
+	    ("%s: pcbgroup and wildcard!", __func__));
+}
+
+/*
+ * Two update paths: one in which the 4-tuple on an inpcb has been updated
+ * and therefore connection groups may need to change (or a wildcard entry
+ * may need to be installed), and another in which the 4-tuple has been
+ * set as a result of a packet received, in which case we may be able to use
+ * the hash on the mbuf to avoid doing a software hash calculation for RSS.
+ *
+ * In each case: first, let the wildcard code have a go at placing it as a
+ * wildcard socket.  If it was a wildcard, or if the connection has been
+ * dropped, then no pcbgroup is required (so potentially clear it);
+ * otherwise, calculate and update the pcbgroup for the inpcb.
+ */
+void
+in_pcbgroup_update(struct inpcb *inp)
+{
+	struct inpcbinfo *pcbinfo;
+	struct inpcbgroup *target;
+
+	INP_WLOCK_ASSERT(inp);
+
+	pcbinfo = inp->inp_pcbinfo;
+	if (!in_pcbgroup_enabled(pcbinfo))
+		return;
+
+	/* Settle wildcard table membership before choosing a group. */
+	in_pcbwild_update_internal(inp);
+
+	/*
+	 * Wildcard and dropped inpcbs get no connection group; everything
+	 * else has its group computed from the inpcb's own tuple.
+	 */
+	target = NULL;
+	if ((inp->inp_flags2 & INP_PCBGROUPWILD) == 0 &&
+	    (inp->inp_flags & INP_DROPPED) == 0) {
+#ifdef INET6
+		if ((inp->inp_vflag & INP_IPV6) != 0)
+			target = in6_pcbgroup_byinpcb(inp);
+		else
+#endif
+			target = in_pcbgroup_byinpcb(inp);
+	}
+	in_pcbgroup_update_internal(pcbinfo, target, inp);
+}
+
+/*
+ * Variant of in_pcbgroup_update() that first asks in_pcbgroup_bymbuf() for
+ * the connection group, and computes it from the inpcb's own tuple only when
+ * that lookup returns NULL.
+ */
+void
+in_pcbgroup_update_mbuf(struct inpcb *inp, struct mbuf *m)
+{
+	struct inpcbinfo *pcbinfo;
+	struct inpcbgroup *target;
+
+	INP_WLOCK_ASSERT(inp);
+
+	pcbinfo = inp->inp_pcbinfo;
+	if (!in_pcbgroup_enabled(pcbinfo))
+		return;
+
+	/*
+	 * XXX: it may be more appropriate to assert !INP_PCBGROUPWILD than
+	 * to test for it, as this function is presumably only ever called
+	 * for non-wildcard sockets.
+	 */
+	in_pcbwild_update_internal(inp);
+
+	target = NULL;
+	if ((inp->inp_flags2 & INP_PCBGROUPWILD) == 0 &&
+	    (inp->inp_flags & INP_DROPPED) == 0) {
+		target = in_pcbgroup_bymbuf(pcbinfo, m);
+		if (target == NULL) {
+			/* No group from the mbuf; derive it from the inpcb. */
+#ifdef INET6
+			if ((inp->inp_vflag & INP_IPV6) != 0)
+				target = in6_pcbgroup_byinpcb(inp);
+			else
+#endif
+				target = in_pcbgroup_byinpcb(inp);
+		}
+	}
+	in_pcbgroup_update_internal(pcbinfo, target, inp);
+}
+
+/*
+ * Detach an inpcb from the pcbgroup machinery: drop its wildcard table entry
+ * if it has one, then unlink it from its connection group if it is in one.
+ * Does nothing when pcbgroups are not enabled for the inpcb's pcbinfo.
+ */
+void
+in_pcbgroup_remove(struct inpcb *inp)
+{
+	struct inpcbgroup *pcbgroup;
+
+	INP_WLOCK_ASSERT(inp);
+
+	if (!in_pcbgroup_enabled(inp->inp_pcbinfo))
+		return;
+
+	if ((inp->inp_flags2 & INP_PCBGROUPWILD) != 0)
+		in_pcbwild_remove(inp);
+
+	pcbgroup = inp->inp_pcbgroup;
+	if (pcbgroup == NULL)
+		return;
+
+	/* Unlink under the lock of the group the inpcb is hashed into. */
+	INP_GROUP_LOCK(pcbgroup);
+	LIST_REMOVE(inp, inp_pcbgrouphash);
+	inp->inp_pcbgroup = NULL;
+	INP_GROUP_UNLOCK(pcbgroup);
+}
+
+/*
+ * Query whether or not it is appropriate to use pcbgroups to look up inpcbs
+ * for a protocol.
+ */
+int
+in_pcbgroup_enabled(struct inpcbinfo *pcbinfo)

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201106061255.p56Ct3qN031795>