Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 20 Apr 2023 16:13:50 GMT
From:      Mark Johnston <markj@FreeBSD.org>
To:        src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org
Subject:   git: fdb987bebddf - main - inpcb: Split PCB hash tables
Message-ID:  <202304201613.33KGDov0045387@gitrepo.freebsd.org>

next in thread | raw e-mail | index | archive | help
The branch main has been updated by markj:

URL: https://cgit.FreeBSD.org/src/commit/?id=fdb987bebddf05e15a5af840379c7715a94aec1c

commit fdb987bebddf05e15a5af840379c7715a94aec1c
Author:     Mark Johnston <markj@FreeBSD.org>
AuthorDate: 2023-04-20 15:48:01 +0000
Commit:     Mark Johnston <markj@FreeBSD.org>
CommitDate: 2023-04-20 16:13:06 +0000

    inpcb: Split PCB hash tables
    
    Currently we use a single hash table per PCB database for connected and
    bound PCBs.  Since we started using net_epoch to synchronize hash table
    lookups, there's been a bug, noted in a comment above in_pcbrehash():
    connecting a socket can cause an inpcb to move between hash chains, and
    this can cause a concurrent lookup to follow the wrong linkage pointers.
    I believe this could cause rare, spurious ECONNREFUSED errors in the
    worse case.
    
    Address the problem by introducing a second hash table and adding more
    linkage pointers to struct inpcb.  Now the database has one table each
    for connected and unconnected sockets.
    
    When inserting an inpcb into the hash table, in_pcbinhash() now looks at
    the foreign address of the inpcb to figure out which table to use.  This
    ensures that queue linkage pointers are stable until the socket is
    disconnected, so the problem described above goes away.  There is also a
    small benefit in that in_pcblookup_*() can now search just one of the
    two possible hash buckets.
    
    I also made the "rehash" parameter of in(6)_pcbconnect() unused.  This
    parameter seems confusing and it is simpler to let the inpcb code figure
    out what to do using the existing INP_INHASHLIST flag.
    
    UDP sockets pose a special problem since they can be connected and
    disconnected multiple times during their lifecycle.  To handle this, the
    patch plugs a hole in the inpcb structure and uses it to store an SMR
    sequence number.  When an inpcb is disconnected - an operation which
    requires the global PCB database hash lock - the write sequence number
    is advanced, and in order to reconnect, the connecting thread must wait
    for readers to drain before reusing the inpcb's hash chain linkage
    pointers.
    
    raw_ip (ab)uses the hash table without using the corresponding
    accessors.  Since there are now two hash tables, it arbitrarily uses the
    "connected" table for all of its PCBs.  This will be addressed in some
    way in the future.
    
    inp interators which specify a hash bucket will only visit connected
    PCBs.  This is not really correct, but nothing in the tree uses that
    functionality except raw_ip, which as mentioned above places all of its
    PCBs in the "connected" table and so is unaffected.
    
    Discussed with: glebius
    Tested by:      glebius
    Sponsored by:   Klara, Inc.
    Sponsored by:   Modirum MDPay
    Differential Revision:  https://reviews.freebsd.org/D38569
---
 sys/netinet/in_pcb.c   | 179 ++++++++++++++++++++++++++++++++++---------------
 sys/netinet/in_pcb.h   |  16 +++--
 sys/netinet/raw_ip.c   |   6 +-
 sys/netinet6/in6_pcb.c |  40 +++++------
 4 files changed, 159 insertions(+), 82 deletions(-)

diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index 3b8931a90262..c6c5ca4d2281 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -58,8 +58,10 @@ __FBSDID("$FreeBSD$");
 #include <sys/mbuf.h>
 #include <sys/eventhandler.h>
 #include <sys/domain.h>
+#include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/smp.h>
+#include <sys/smr.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
@@ -510,7 +512,9 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor,
 #endif
 	CK_LIST_INIT(&pcbinfo->ipi_listhead);
 	pcbinfo->ipi_count = 0;
-	pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
+	pcbinfo->ipi_hash_exact = hashinit(hash_nelements, M_PCB,
+	    &pcbinfo->ipi_hashmask);
+	pcbinfo->ipi_hash_wild = hashinit(hash_nelements, M_PCB,
 	    &pcbinfo->ipi_hashmask);
 	porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
 	pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
@@ -532,7 +536,8 @@ in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
 	KASSERT(pcbinfo->ipi_count == 0,
 	    ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));
 
-	hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
+	hashdestroy(pcbinfo->ipi_hash_exact, M_PCB, pcbinfo->ipi_hashmask);
+	hashdestroy(pcbinfo->ipi_hash_wild, M_PCB, pcbinfo->ipi_hashmask);
 	hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
 	    pcbinfo->ipi_porthashmask);
 	hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
@@ -630,6 +635,8 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
 #ifdef INET
 		inp->inp_vflag |= INP_IPV4;
 #endif
+	inp->inp_smr = SMR_SEQ_INVALID;
+
 	/*
 	 * Routes in inpcb's can cache L2 as well; they are guaranteed
 	 * to be cleaned up.
@@ -1010,7 +1017,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr_in *sin, in_addr_t *laddrp,
  */
 int
 in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred,
-    bool rehash)
+    bool rehash __unused)
 {
 	u_short lport, fport;
 	in_addr_t laddr, faddr;
@@ -1018,6 +1025,8 @@ in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred,
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
+	KASSERT(in_nullhost(inp->inp_faddr),
+	    ("%s: inp is already connected", __func__));
 
 	lport = inp->inp_lport;
 	laddr = inp->inp_laddr.s_addr;
@@ -1027,28 +1036,26 @@ in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred,
 	if (error)
 		return (error);
 
+	inp->inp_faddr.s_addr = faddr;
+	inp->inp_fport = fport;
+
 	/* Do the initial binding of the local address if required. */
 	if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
-		KASSERT(rehash == true,
-		    ("Rehashing required for unbound inps"));
 		inp->inp_lport = lport;
 		inp->inp_laddr.s_addr = laddr;
 		if (in_pcbinshash(inp) != 0) {
-			inp->inp_laddr.s_addr = INADDR_ANY;
-			inp->inp_lport = 0;
+			inp->inp_laddr.s_addr = inp->inp_faddr.s_addr =
+			    INADDR_ANY;
+			inp->inp_lport = inp->inp_fport = 0;
 			return (EAGAIN);
 		}
-	}
-
-	/* Commit the remaining changes. */
-	inp->inp_lport = lport;
-	inp->inp_laddr.s_addr = laddr;
-	inp->inp_faddr.s_addr = faddr;
-	inp->inp_fport = fport;
-	if (rehash) {
-		in_pcbrehash(inp);
 	} else {
-		in_pcbinshash(inp);
+		inp->inp_lport = lport;
+		inp->inp_laddr.s_addr = laddr;
+		if ((inp->inp_flags & INP_INHASHLIST) != 0)
+			in_pcbrehash(inp);
+		else
+			in_pcbinshash(inp);
 	}
 
 	if (anonport)
@@ -1402,11 +1409,16 @@ in_pcbdisconnect(struct inpcb *inp)
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
+	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
+	    ("%s: inp %p was already disconnected", __func__, inp));
+
+	in_pcbremhash_locked(inp);
 
+	/* See the comment in in_pcbinshash(). */
+	inp->inp_smr = smr_advance(inp->inp_pcbinfo->ipi_smr);
 	inp->inp_laddr.s_addr = INADDR_ANY;
 	inp->inp_faddr.s_addr = INADDR_ANY;
 	inp->inp_fport = 0;
-	in_pcbrehash(inp);
 }
 #endif /* INET */
 
@@ -1551,11 +1563,11 @@ inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
 #define	II_LIST_FIRST(ipi, hash)					\
 		(((hash) == INP_ALL_LIST) ?				\
 		    CK_LIST_FIRST(&(ipi)->ipi_listhead) :		\
-		    CK_LIST_FIRST(&(ipi)->ipi_hashbase[(hash)]))
+		    CK_LIST_FIRST(&(ipi)->ipi_hash_exact[(hash)]))
 #define	II_LIST_NEXT(inp, hash)						\
 		(((hash) == INP_ALL_LIST) ?				\
 		    CK_LIST_NEXT((inp), inp_list) :			\
-		    CK_LIST_NEXT((inp), inp_hash))
+		    CK_LIST_NEXT((inp), inp_hash_exact))
 #define	II_LOCK_ASSERT(inp, lock)					\
 		rw_assert(&(inp)->inp_lock,				\
 		    (lock) == INPLOOKUP_RLOCKPCB ?  RA_RLOCKED : RA_WLOCKED )
@@ -1996,9 +2008,9 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
 		 * Look for an unconnected (wildcard foreign addr) PCB that
 		 * matches the local address and port we're looking for.
 		 */
-		head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport,
+		head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
 		    pcbinfo->ipi_hashmask)];
-		CK_LIST_FOREACH(inp, head, inp_hash) {
+		CK_LIST_FOREACH(inp, head, inp_hash_wild) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
@@ -2178,9 +2190,9 @@ in_pcblookup_hash_exact(struct inpcbinfo *pcbinfo, struct in_addr faddr,
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	match = NULL;
-	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&faddr, lport, fport,
+	head = &pcbinfo->ipi_hash_exact[INP_PCBHASH(&faddr, lport, fport,
 	    pcbinfo->ipi_hashmask)];
-	CK_LIST_FOREACH(inp, head, inp_hash) {
+	CK_LIST_FOREACH(inp, head, inp_hash_exact) {
 #ifdef INET6
 		/* XXX inp locking */
 		if ((inp->inp_vflag & INP_IPV4) == 0)
@@ -2214,13 +2226,13 @@ in_pcblookup_hash_wild_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
 	 *      3. non-jailed, non-wild.
 	 *      4. non-jailed, wild.
 	 */
-	head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport,
+	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
 	    pcbinfo->ipi_hashmask)];
 	local_wild = local_exact = jail_wild = NULL;
 #ifdef INET6
 	local_wild_mapped = NULL;
 #endif
-	CK_LIST_FOREACH(inp, head, inp_hash) {
+	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
 		bool injail;
 
 #ifdef INET6
@@ -2368,21 +2380,31 @@ in_pcbinshash(struct inpcb *inp)
 	struct inpcbporthead *pcbporthash;
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct inpcbport *phd;
+	uint32_t hash;
+	bool connected;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
-
 	KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
 	    ("in_pcbinshash: INP_INHASHLIST"));
 
 #ifdef INET6
-	if (inp->inp_vflag & INP_IPV6)
-		pcbhash = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr,
-		    inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
-	else
+	if (inp->inp_vflag & INP_IPV6) {
+		hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport,
+		    inp->inp_fport, pcbinfo->ipi_hashmask);
+		connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr);
+	} else
 #endif
-		pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr,
-		    inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
+	{
+		hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport,
+		    inp->inp_fport, pcbinfo->ipi_hashmask);
+		connected = !in_nullhost(inp->inp_faddr);
+	}
+
+	if (connected)
+		pcbhash = &pcbinfo->ipi_hash_exact[hash];
+	else
+		pcbhash = &pcbinfo->ipi_hash_wild[hash];
 
 	pcbporthash = &pcbinfo->ipi_porthashbase[
 	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
@@ -2421,66 +2443,117 @@ in_pcbinshash(struct inpcb *inp)
 	}
 	inp->inp_phd = phd;
 	CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
-	CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
+
+	/*
+	 * The PCB may have been disconnected in the past.  Before we can safely
+	 * make it visible in the hash table, we must wait for all readers which
+	 * may be traversing this PCB to finish.
+	 */
+	if (inp->inp_smr != SMR_SEQ_INVALID) {
+		smr_wait(pcbinfo->ipi_smr, inp->inp_smr);
+		inp->inp_smr = SMR_SEQ_INVALID;
+	}
+
+	if (connected)
+		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact);
+	else
+		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
 	inp->inp_flags |= INP_INHASHLIST;
 
 	return (0);
 }
 
-static void
-in_pcbremhash(struct inpcb *inp)
+void
+in_pcbremhash_locked(struct inpcb *inp)
 {
 	struct inpcbport *phd = inp->inp_phd;
 
 	INP_WLOCK_ASSERT(inp);
+	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 	MPASS(inp->inp_flags & INP_INHASHLIST);
 
-	INP_HASH_WLOCK(inp->inp_pcbinfo);
 	if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
 		in_pcbremlbgrouphash(inp);
-	CK_LIST_REMOVE(inp, inp_hash);
+#ifdef INET6
+	if (inp->inp_vflag & INP_IPV6) {
+		if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))
+			CK_LIST_REMOVE(inp, inp_hash_wild);
+		else
+			CK_LIST_REMOVE(inp, inp_hash_exact);
+	} else
+#endif
+	{
+		if (in_nullhost(inp->inp_faddr))
+			CK_LIST_REMOVE(inp, inp_hash_wild);
+		else
+			CK_LIST_REMOVE(inp, inp_hash_exact);
+	}
 	CK_LIST_REMOVE(inp, inp_portlist);
 	if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
 		CK_LIST_REMOVE(phd, phd_hash);
 		uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd);
 	}
-	INP_HASH_WUNLOCK(inp->inp_pcbinfo);
 	inp->inp_flags &= ~INP_INHASHLIST;
 }
 
+static void
+in_pcbremhash(struct inpcb *inp)
+{
+	INP_HASH_WLOCK(inp->inp_pcbinfo);
+	in_pcbremhash_locked(inp);
+	INP_HASH_WUNLOCK(inp->inp_pcbinfo);
+}
+
 /*
  * Move PCB to the proper hash bucket when { faddr, fport } have  been
  * changed. NOTE: This does not handle the case of the lport changing (the
  * hashed port list would have to be updated as well), so the lport must
  * not change after in_pcbinshash() has been called.
- *
- * XXXGL: a race between this function and SMR-protected hash iterator
- * will lead to iterator traversing a possibly wrong hash list. However,
- * this race should have been here since change from rwlock to epoch.
  */
 void
 in_pcbrehash(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct inpcbhead *head;
+	uint32_t hash;
+	bool connected;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
-
 	KASSERT(inp->inp_flags & INP_INHASHLIST,
-	    ("in_pcbrehash: !INP_INHASHLIST"));
+	    ("%s: !INP_INHASHLIST", __func__));
+	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
+	    ("%s: inp was disconnected", __func__));
 
 #ifdef INET6
-	if (inp->inp_vflag & INP_IPV6)
-		head = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr,
-		    inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
-	else
+	if (inp->inp_vflag & INP_IPV6) {
+		hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport,
+		    inp->inp_fport, pcbinfo->ipi_hashmask);
+		connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr);
+	} else
 #endif
-		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr,
-		    inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
+	{
+		hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport,
+		    inp->inp_fport, pcbinfo->ipi_hashmask);
+		connected = !in_nullhost(inp->inp_faddr);
+	}
 
-	CK_LIST_REMOVE(inp, inp_hash);
-	CK_LIST_INSERT_HEAD(head, inp, inp_hash);
+	/*
+	 * When rehashing, the caller must ensure that either the new or the old
+	 * foreign address was unspecified.
+	 */
+	if (connected)
+		CK_LIST_REMOVE(inp, inp_hash_wild);
+	else
+		CK_LIST_REMOVE(inp, inp_hash_exact);
+
+	if (connected) {
+		head = &pcbinfo->ipi_hash_exact[hash];
+		CK_LIST_INSERT_HEAD(head, inp, inp_hash_exact);
+	} else {
+		head = &pcbinfo->ipi_hash_wild[hash];
+		CK_LIST_INSERT_HEAD(head, inp, inp_hash_wild);
+	}
 }
 
 /*
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
index f8a2a311cf4e..179d706381a7 100644
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -45,13 +45,13 @@
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 #include <sys/_rwlock.h>
+#include <sys/_smr.h>
 #include <net/route.h>
 
 #ifdef _KERNEL
 #include <sys/lock.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
-#include <sys/smr.h>
 #include <sys/sysctl.h>
 #include <net/vnet.h>
 #include <vm/uma.h>
@@ -215,7 +215,8 @@ struct inpcbpolicy;
 struct m_snd_tag;
 struct inpcb {
 	/* Cache line #1 (amd64) */
-	CK_LIST_ENTRY(inpcb) inp_hash;	/* (w:h/r:e)  hash list */
+	CK_LIST_ENTRY(inpcb) inp_hash_exact;	/* hash table linkage */
+	CK_LIST_ENTRY(inpcb) inp_hash_wild;	/* hash table linkage */
 	struct rwlock	inp_lock;
 	/* Cache line #2 (amd64) */
 #define	inp_start_zero	inp_hpts
@@ -261,11 +262,12 @@ struct inpcb {
 	u_char	inp_ip_p;		/* (c) protocol proto */
 	u_char	inp_ip_minttl;		/* (i) minimum TTL or drop */
 	uint32_t inp_flowid;		/* (x) flow id / queue id */
+	smr_seq_t inp_smr;		/* (i) sequence number at disconnect */
 	struct m_snd_tag *inp_snd_tag;	/* (i) send tag for outgoing mbufs */
 	uint32_t inp_flowtype;		/* (x) M_HASHTYPE value */
 
 	/* Local and foreign ports, local and foreign addr. */
-	struct	in_conninfo inp_inc;	/* (i) list for PCB's local port */
+	struct	in_conninfo inp_inc;	/* (i,h) list for PCB's local port */
 
 	/* MAC and IPSEC policy information. */
 	struct	label *inp_label;	/* (i) MAC label */
@@ -430,10 +432,12 @@ struct inpcbinfo {
 
 	/*
 	 * Global hash of inpcbs, hashed by local and foreign addresses and
-	 * port numbers.
+	 * port numbers.  The "exact" hash holds PCBs connected to a foreign
+	 * address, and "wild" holds the rest.
 	 */
 	struct mtx		 ipi_hash_lock;
-	struct inpcbhead 	*ipi_hashbase;		/* (r:e/w:h) */
+	struct inpcbhead 	*ipi_hash_exact;	/* (r:e/w:h) */
+	struct inpcbhead 	*ipi_hash_wild;		/* (r:e/w:h) */
 	u_long			 ipi_hashmask;		/* (c) */
 
 	/*
@@ -643,7 +647,6 @@ int	inp_so_options(const struct inpcb *inp);
 #define	IN6P_RTHDRDSTOPTS	0x00200000 /* receive dstoptions before rthdr */
 #define	IN6P_TCLASS		0x00400000 /* receive traffic class value */
 #define	IN6P_AUTOFLOWLABEL	0x00800000 /* attach flowlabel automatically */
-/* was	INP_TIMEWAIT		0x01000000 */
 #define	INP_ONESBCAST		0x02000000 /* send all-ones broadcast */
 #define	INP_DROPPED		0x04000000 /* protocol drop flag */
 #define	INP_SOCKREF		0x08000000 /* strong socket reference */
@@ -760,6 +763,7 @@ void	in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr,
 	    int, struct inpcb *(*)(struct inpcb *, int));
 void	in_pcbref(struct inpcb *);
 void	in_pcbrehash(struct inpcb *);
+void	in_pcbremhash_locked(struct inpcb *);
 bool	in_pcbrele_rlocked(struct inpcb *);
 bool	in_pcbrele_wlocked(struct inpcb *);
 
diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c
index 39f40fcebff1..7fc5ca2ec712 100644
--- a/sys/netinet/raw_ip.c
+++ b/sys/netinet/raw_ip.c
@@ -166,8 +166,8 @@ rip_inshash(struct inpcb *inp)
 		    inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask);
 	} else
 		hash = 0;
-	pcbhash = &pcbinfo->ipi_hashbase[hash];
-	CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
+	pcbhash = &pcbinfo->ipi_hash_exact[hash];
+	CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact);
 }
 
 static void
@@ -177,7 +177,7 @@ rip_delhash(struct inpcb *inp)
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
-	CK_LIST_REMOVE(inp, inp_hash);
+	CK_LIST_REMOVE(inp, inp_hash_exact);
 }
 #endif /* INET */
 
diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c
index 81a3fd49a93d..6a5fbe93b58a 100644
--- a/sys/netinet6/in6_pcb.c
+++ b/sys/netinet6/in6_pcb.c
@@ -81,7 +81,9 @@ __FBSDID("$FreeBSD$");
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/domain.h>
+#include <sys/proc.h>
 #include <sys/protosw.h>
+#include <sys/smr.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
@@ -398,7 +400,7 @@ in6_pcbladdr(struct inpcb *inp, struct sockaddr_in6 *sin6,
  */
 int
 in6_pcbconnect(struct inpcb *inp, struct sockaddr_in6 *sin6, struct ucred *cred,
-    bool rehash)
+    bool rehash __unused)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct sockaddr_in6 laddr6;
@@ -411,6 +413,8 @@ in6_pcbconnect(struct inpcb *inp, struct sockaddr_in6 *sin6, struct ucred *cred,
 	    ("%s: invalid address family for %p", __func__, sin6));
 	KASSERT(sin6->sin6_len == sizeof(*sin6),
 	    ("%s: invalid address length for %p", __func__, sin6));
+	KASSERT(IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr),
+	    ("%s: inp is already connected", __func__));
 
 	bzero(&laddr6, sizeof(laddr6));
 	laddr6.sin6_family = AF_INET6;
@@ -440,17 +444,6 @@ in6_pcbconnect(struct inpcb *inp, struct sockaddr_in6 *sin6, struct ucred *cred,
 		return (EADDRINUSE);
 	if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
 		if (inp->inp_lport == 0) {
-			/*
-			 * rehash was required to be true in the past for
-			 * this case; retain that convention.  However,
-			 * we now call in_pcb_lport_dest rather than
-			 * in6_pcbbind; the former does not insert into
-			 * the hash table, the latter does.  Change rehash
-			 * to false to do the in_pcbinshash below.
-			 */
-			KASSERT(rehash == true,
-			    ("Rehashing required for unbound inps"));
-			rehash = false;
 			error = in_pcb_lport_dest(inp,
 			    (struct sockaddr *) &laddr6, &inp->inp_lport,
 			    (struct sockaddr *) sin6, sin6->sin6_port, cred,
@@ -468,7 +461,7 @@ in6_pcbconnect(struct inpcb *inp, struct sockaddr_in6 *sin6, struct ucred *cred,
 		inp->inp_flow |=
 		    (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
 
-	if (rehash) {
+	if ((inp->inp_flags & INP_INHASHLIST) != 0) {
 		in_pcbrehash(inp);
 	} else {
 		in_pcbinshash(inp);
@@ -483,13 +476,20 @@ in6_pcbdisconnect(struct inpcb *inp)
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
+	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
+	    ("%s: inp %p was already disconnected", __func__, inp));
+
+	in_pcbremhash_locked(inp);
+
+	/* See the comment in in_pcbinshash(). */
+	inp->inp_smr = smr_advance(inp->inp_pcbinfo->ipi_smr);
 
+	/* XXX-MJ torn writes are visible to SMR lookup */
 	memset(&inp->in6p_laddr, 0, sizeof(inp->in6p_laddr));
 	memset(&inp->in6p_faddr, 0, sizeof(inp->in6p_faddr));
 	inp->inp_fport = 0;
 	/* clear flowinfo - draft-itojun-ipv6-flowlabel-api-00 */
 	inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
-	in_pcbrehash(inp);
 }
 
 struct sockaddr *
@@ -712,9 +712,9 @@ in6_pcblookup_local(struct inpcbinfo *pcbinfo, struct in6_addr *laddr,
 		 * Look for an unconnected (wildcard foreign addr) PCB that
 		 * matches the local address and port we're looking for.
 		 */
-		head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport,
+		head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
 		    pcbinfo->ipi_hashmask)];
-		CK_LIST_FOREACH(inp, head, inp_hash) {
+		CK_LIST_FOREACH(inp, head, inp_hash_wild) {
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV6) == 0)
 				continue;
@@ -952,9 +952,9 @@ in6_pcblookup_hash_exact(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
 	 * First look for an exact match.
 	 */
 	match = NULL;
-	head = &pcbinfo->ipi_hashbase[INP6_PCBHASH(faddr, lport, fport,
+	head = &pcbinfo->ipi_hash_exact[INP6_PCBHASH(faddr, lport, fport,
 	    pcbinfo->ipi_hashmask)];
-	CK_LIST_FOREACH(inp, head, inp_hash) {
+	CK_LIST_FOREACH(inp, head, inp_hash_exact) {
 		/* XXX inp locking */
 		if ((inp->inp_vflag & INP_IPV6) == 0)
 			continue;
@@ -982,10 +982,10 @@ in6_pcblookup_hash_wild_locked(struct inpcbinfo *pcbinfo,
 	 *      3. non-jailed, non-wild.
 	 *      4. non-jailed, wild.
 	 */
-	head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport,
+	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
 	    pcbinfo->ipi_hashmask)];
 	local_wild = local_exact = jail_wild = NULL;
-	CK_LIST_FOREACH(inp, head, inp_hash) {
+	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
 		bool injail;
 
 		/* XXX inp locking */



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202304201613.33KGDov0045387>