Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 10 Jul 2014 03:10:57 +0000 (UTC)
From:      Adrian Chadd <adrian@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r268479 - head/sys/netinet
Message-ID:  <201407100310.s6A3AvQ5093684@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: adrian
Date: Thu Jul 10 03:10:56 2014
New Revision: 268479
URL: http://svnweb.freebsd.org/changeset/base/268479

Log:
  Implement the first stage of multi-bind listen sockets and RSS socket
  awareness.
  
  * Introduce IP_BINDMULTI - indicating that it's okay to bind multiple
    sockets on the same bind details.
  
    Although the PCB code has been taught about this (see below) this patch
    doesn't introduce the rest of the PCB changes necessary to distribute
    lookups among multiple PCB entries in the global wildcard table.
  
  * Introduce IP_RSS_LISTEN_BUCKET - placing a listen socket into the
    given RSS bucket (and thus a single PCBGROUP hash.)
  
  * Modify the PCB add path to be aware of IP_BINDMULTI:
    + Only allow further PCB entries to be added if the owner credentials
      match and IP_BINDMULTI has been specified.  I.e., only allow further
      IP_BINDMULTI sockets to appear if the first bind() was IP_BINDMULTI.
  
  * Teach the PCBGROUP code about IP_RSS_LISTEN_BUCKET marked PCB entries.
    Instead of using the wildcard logic and hashing, these sockets are
    simply placed into the PCBGROUP and _not_ in the wildcard hash.
  
  * When doing a PCBGROUP lookup, also do a wildcard match.
    This allows for an RSS bucket PCB entry to appear in a PCBGROUP
    rather than having to exist in the wildcard list.
  
  Tested:
  
  * TCP IPv4 server testing with igb(4)
  * TCP IPv4 server testing with ix(4)
  
  TODO:
  
  * The pcbgroup lookup code duplicated the wildcard and wildcard-PCB
    logic.  This could be refactored into a single function.
  
  * This doesn't yet work for IPv6 (The PCBGROUP code in netinet6/ doesn't
    yet know about this); nor does it yet fully work for UDP.

Modified:
  head/sys/netinet/in.h
  head/sys/netinet/in_pcb.c
  head/sys/netinet/in_pcb.h
  head/sys/netinet/in_pcbgroup.c
  head/sys/netinet/ip_output.c

Modified: head/sys/netinet/in.h
==============================================================================
--- head/sys/netinet/in.h	Thu Jul 10 02:15:16 2014	(r268478)
+++ head/sys/netinet/in.h	Thu Jul 10 03:10:56 2014	(r268479)
@@ -432,6 +432,8 @@ __END_DECLS
 
 #define	IP_ONESBCAST		23   /* bool: send all-ones broadcast */
 #define	IP_BINDANY		24   /* bool: allow bind to any address */
+#define	IP_BINDMULTI		25   /* bool: allow multiple listeners on a tuple */
+#define	IP_RSS_LISTEN_BUCKET	26   /* int; set RSS listen bucket */
 
 /*
  * Options for controlling the firewall and dummynet.

Modified: head/sys/netinet/in_pcb.c
==============================================================================
--- head/sys/netinet/in_pcb.c	Thu Jul 10 02:15:16 2014	(r268478)
+++ head/sys/netinet/in_pcb.c	Thu Jul 10 03:10:56 2014	(r268479)
@@ -488,6 +488,36 @@ inp_so_options(const struct inpcb *inp)
 
 #ifdef INET
 /*
+ * Check if a new BINDMULTI socket is allowed to be created.
+ *
+ * ni points to the new inp.
+ * oi points to the existing inp.
+ *
+ * Returns 1 if the bind may proceed: either ni lacks INP_BINDMULTI, or
+ * oi also has INP_BINDMULTI and both share a cr_uid.  Returns 0 otherwise.
+ */
+static int
+in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
+{
+	/* A BINDMULTI ni must be owned by the same uid as the existing inp */
+	if ((ni->inp_flags2 & INP_BINDMULTI) &&
+	    (ni->inp_cred->cr_uid !=
+	    oi->inp_cred->cr_uid))
+		return (0);
+
+	/* A BINDMULTI ni also requires the existing inp to have BINDMULTI */
+	if ((ni->inp_flags2 & INP_BINDMULTI) &&
+	    ((oi->inp_flags2 & INP_BINDMULTI) == 0))
+		return (0);
+
+	/*
+	 * We're okay - either INP_BINDMULTI isn't set on ni, or
+	 * it is and both checks above passed.
+	 */
+	return (1);
+}
+
+/*
  * Set up a bind operation on a PCB, performing port allocation
  * as required, but do not actually modify the PCB. Callers can
  * either complete the bind by setting inp_laddr/inp_lport and
@@ -589,6 +619,7 @@ in_pcbbind_setup(struct inpcb *inp, stru
 	 * This entire block sorely needs a rewrite.
 	 */
 				if (t &&
+				    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
 				    ((t->inp_flags & INP_TIMEWAIT) == 0) &&
 				    (so->so_type != SOCK_STREAM ||
 				     ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
@@ -598,6 +629,15 @@ in_pcbbind_setup(struct inpcb *inp, stru
 				    (inp->inp_cred->cr_uid !=
 				     t->inp_cred->cr_uid))
 					return (EADDRINUSE);
+
+				/*
+				 * If the socket is a BINDMULTI socket, then
+				 * the credentials need to match and the
+				 * original socket also has to have been bound
+				 * with BINDMULTI.
+				 */
+				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
+					return (EADDRINUSE);
 			}
 			t = in_pcblookup_local(pcbinfo, sin->sin_addr,
 			    lport, lookupflags, cred);
@@ -612,7 +652,9 @@ in_pcbbind_setup(struct inpcb *inp, stru
 				if (tw == NULL ||
 				    (reuseport & tw->tw_so_options) == 0)
 					return (EADDRINUSE);
-			} else if (t && (reuseport & inp_so_options(t)) == 0) {
+			} else if (t &&
+			    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
+			    (reuseport & inp_so_options(t)) == 0) {
 #ifdef INET6
 				if (ntohl(sin->sin_addr.s_addr) !=
 				    INADDR_ANY ||
@@ -622,6 +664,8 @@ in_pcbbind_setup(struct inpcb *inp, stru
 				    (t->inp_vflag & INP_IPV6PROTO) == 0)
 #endif
 				return (EADDRINUSE);
+				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
+					return (EADDRINUSE);
 			}
 		}
 	}
@@ -1556,6 +1600,88 @@ in_pcblookup_group(struct inpcbinfo *pcb
 		goto found;
 	}
 
+#ifdef	RSS
+	/*
+	 * For incoming connections, we may wish to do a wildcard
+	 * match for an RSS-local socket.
+	 */
+	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
+		struct inpcb *local_wild = NULL, *local_exact = NULL;
+#ifdef INET6
+		struct inpcb *local_wild_mapped = NULL;
+#endif
+		struct inpcb *jail_wild = NULL;
+		struct inpcbhead *head;
+		int injail;
+
+		/*
+		 * Order of socket selection - we always prefer jails.
+		 *      1. jailed, non-wild.
+		 *      2. jailed, wild.
+		 *      3. non-jailed, non-wild.
+		 *      4. non-jailed, wild.
+		 */
+
+		head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY,
+		    lport, 0, pcbgroup->ipg_hashmask)];
+		LIST_FOREACH(inp, head, inp_pcbgrouphash) {
+#ifdef INET6
+			/* XXX inp locking */
+			if ((inp->inp_vflag & INP_IPV4) == 0)
+				continue;
+#endif
+			if (inp->inp_faddr.s_addr != INADDR_ANY ||
+			    inp->inp_lport != lport)
+				continue;
+
+			/* XXX inp locking */
+			if (ifp && ifp->if_type == IFT_FAITH &&
+			    (inp->inp_flags & INP_FAITH) == 0)
+				continue;
+
+			injail = prison_flag(inp->inp_cred, PR_IP4);
+			if (injail) {
+				if (prison_check_ip4(inp->inp_cred,
+				    &laddr) != 0)
+					continue;
+			} else {
+				if (local_exact != NULL)
+					continue;
+			}
+
+			if (inp->inp_laddr.s_addr == laddr.s_addr) {
+				if (injail)
+					goto found;
+				else
+					local_exact = inp;
+			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
+#ifdef INET6
+				/* XXX inp locking, NULL check */
+				if (inp->inp_vflag & INP_IPV6PROTO)
+					local_wild_mapped = inp;
+				else
+#endif
+					if (injail)
+						jail_wild = inp;
+					else
+						local_wild = inp;
+			}
+		} /* LIST_FOREACH */
+
+		inp = jail_wild;
+		if (inp == NULL)
+			inp = local_exact;
+		if (inp == NULL)
+			inp = local_wild;
+#ifdef INET6
+		if (inp == NULL)
+			inp = local_wild_mapped;
+#endif
+		if (inp != NULL)
+			goto found;
+	}
+#endif
+
 	/*
 	 * Then look for a wildcard match, if requested.
 	 */

Modified: head/sys/netinet/in_pcb.h
==============================================================================
--- head/sys/netinet/in_pcb.h	Thu Jul 10 02:15:16 2014	(r268478)
+++ head/sys/netinet/in_pcb.h	Thu Jul 10 03:10:56 2014	(r268479)
@@ -181,7 +181,8 @@ struct inpcb {
 	u_int	inp_refcount;		/* (i) refcount */
 	void	*inp_pspare[5];		/* (x) route caching / general use */
 	uint32_t inp_flowtype;		/* (x) M_HASHTYPE value */
-	u_int	inp_ispare[5];		/* (x) route caching / user cookie /
+	uint32_t inp_rss_listen_bucket;	/* (x) overridden RSS listen bucket */
+	u_int	inp_ispare[4];		/* (x) route caching / user cookie /
 					 *     general use */
 
 	/* Local and foreign ports, local and foreign addr. */
@@ -546,6 +547,8 @@ short	inp_so_options(const struct inpcb 
 #define	INP_REUSEPORT		0x00000008 /* SO_REUSEPORT option is set */
 #define	INP_FREED		0x00000010 /* inp itself is not valid */
 #define	INP_REUSEADDR		0x00000020 /* SO_REUSEADDR option is set */
+#define	INP_BINDMULTI		0x00000040 /* IP_BINDMULTI option is set */
+#define	INP_RSS_BUCKET_SET	0x00000080 /* IP_RSS_LISTEN_BUCKET is set */
 
 /*
  * Flags passed to in_pcblookup*() functions.

Modified: head/sys/netinet/in_pcbgroup.c
==============================================================================
--- head/sys/netinet/in_pcbgroup.c	Thu Jul 10 02:15:16 2014	(r268478)
+++ head/sys/netinet/in_pcbgroup.c	Thu Jul 10 03:10:56 2014	(r268479)
@@ -297,6 +297,18 @@ in_pcbgroup_bytuple(struct inpcbinfo *pc
 struct inpcbgroup *
 in_pcbgroup_byinpcb(struct inpcb *inp)
 {
+#ifdef	RSS
+	/*
+	 * Listen sockets with INP_RSS_BUCKET_SET set have a pre-determined
+	 * RSS bucket and thus we should use this pcbgroup, rather than
+	 * using a tuple or hash.
+	 *
+	 * XXX should verify that there's actually pcbgroups and inp_rss_listen_bucket
+	 * fits in that!
+	 */
+	if (inp->inp_flags2 & INP_RSS_BUCKET_SET)
+		return (&inp->inp_pcbinfo->ipi_pcbgroups[inp->inp_rss_listen_bucket]);
+#endif
 
 	return (in_pcbgroup_bytuple(inp->inp_pcbinfo, inp->inp_laddr,
 	    inp->inp_lport, inp->inp_faddr, inp->inp_fport));
@@ -346,6 +358,15 @@ in_pcbwild_remove(struct inpcb *inp)
 static __inline int
 in_pcbwild_needed(struct inpcb *inp)
 {
+#ifdef	RSS
+	/*
+	 * If it's a listen socket and INP_RSS_BUCKET_SET is set,
+	 * it's a wildcard socket _but_ it's in a specific pcbgroup.
+	 * Thus we don't treat it as a pcbwild inp.
+	 */
+	if (inp->inp_flags2 & INP_RSS_BUCKET_SET)
+		return (0);
+#endif
 
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6)
@@ -398,9 +419,24 @@ in_pcbgroup_update_internal(struct inpcb
 #endif
 			hashkey_faddr = inp->inp_faddr.s_addr;
 		INP_GROUP_LOCK(newpcbgroup);
-		pcbhash = &newpcbgroup->ipg_hashbase[
-		    INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport,
-		    newpcbgroup->ipg_hashmask)];
+		/*
+		 * If the inp is an RSS bucket wildcard entry, ensure
+		 * that the PCB hash is calculated correctly.
+		 *
+		 * The wildcard hash calculation differs from the
+		 * non-wildcard definition.  The source address is
+		 * INADDR_ANY and the far port is 0.
+		 */
+		if (inp->inp_flags2 & INP_RSS_BUCKET_SET) {
+			pcbhash = &newpcbgroup->ipg_hashbase[
+			    INP_PCBHASH(INADDR_ANY, inp->inp_lport, 0,
+			    newpcbgroup->ipg_hashmask)];
+		} else {
+			pcbhash = &newpcbgroup->ipg_hashbase[
+			    INP_PCBHASH(hashkey_faddr, inp->inp_lport,
+			    inp->inp_fport,
+			    newpcbgroup->ipg_hashmask)];
+		}
 		LIST_INSERT_HEAD(pcbhash, inp, inp_pcbgrouphash);
 		inp->inp_pcbgroup = newpcbgroup;
 		INP_GROUP_UNLOCK(newpcbgroup);

Modified: head/sys/netinet/ip_output.c
==============================================================================
--- head/sys/netinet/ip_output.c	Thu Jul 10 02:15:16 2014	(r268478)
+++ head/sys/netinet/ip_output.c	Thu Jul 10 03:10:56 2014	(r268479)
@@ -1000,6 +1000,10 @@ ip_ctloutput(struct socket *so, struct s
 					break;
 			}
 			/* FALLTHROUGH */
+		case IP_BINDMULTI:
+#ifdef	RSS
+		case IP_RSS_LISTEN_BUCKET:
+#endif
 		case IP_TOS:
 		case IP_TTL:
 		case IP_MINTTL:
@@ -1042,6 +1046,15 @@ ip_ctloutput(struct socket *so, struct s
 	INP_WUNLOCK(inp);						\
 } while (0)
 
+#define	OPTSET2(bit, val) do {						\
+	INP_WLOCK(inp);							\
+	if (val)							\
+		inp->inp_flags2 |= bit;					\
+	else								\
+		inp->inp_flags2 &= ~bit;				\
+	INP_WUNLOCK(inp);						\
+} while (0)
+
 			case IP_RECVOPTS:
 				OPTSET(INP_RECVOPTS);
 				break;
@@ -1078,9 +1091,24 @@ ip_ctloutput(struct socket *so, struct s
 			case IP_RECVTOS:
 				OPTSET(INP_RECVTOS);
 				break;
+			case IP_BINDMULTI:
+				OPTSET2(INP_BINDMULTI, optval);
+				break;
+#ifdef	RSS
+			case IP_RSS_LISTEN_BUCKET:
+				if ((optval >= 0) &&
+				    (optval < rss_getnumbuckets())) {
+					inp->inp_rss_listen_bucket = optval;
+					OPTSET2(INP_RSS_BUCKET_SET, 1);
+				} else {
+					error = EINVAL;
+				}
+				break;
+#endif
 			}
 			break;
 #undef OPTSET
+#undef OPTSET2
 
 		/*
 		 * Multicast socket options are processed by the in_mcast
@@ -1188,8 +1216,12 @@ ip_ctloutput(struct socket *so, struct s
 		case IP_DONTFRAG:
 		case IP_BINDANY:
 		case IP_RECVTOS:
+		case IP_BINDMULTI:
 		case IP_FLOWID:
 		case IP_FLOWTYPE:
+#ifdef	RSS
+		case IP_RSSBUCKETID:
+#endif
 			switch (sopt->sopt_name) {
 
 			case IP_TOS:
@@ -1205,6 +1237,7 @@ ip_ctloutput(struct socket *so, struct s
 				break;
 
 #define	OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)
+#define	OPTBIT2(bit)	(inp->inp_flags2 & bit ? 1 : 0)
 
 			case IP_RECVOPTS:
 				optval = OPTBIT(INP_RECVOPTS);
@@ -1268,6 +1301,9 @@ ip_ctloutput(struct socket *so, struct s
 					error = EINVAL;
 				break;
 #endif
+			case IP_BINDMULTI:
+				optval = OPTBIT2(INP_BINDMULTI);
+				break;
 			}
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201407100310.s6A3AvQ5093684>