Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 19 Jun 2018 01:54:00 +0000 (UTC)
From:      Matt Macy <mmacy@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r335356 - in head/sys: kern netinet netinet/tcp_stacks
Message-ID:  <201806190154.w5J1s0Nd029642@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: mmacy
Date: Tue Jun 19 01:54:00 2018
New Revision: 335356
URL: https://svnweb.freebsd.org/changeset/base/335356

Log:
  convert inpcbinfo hash and info rwlocks to epoch + mutex
  
  - Convert inpcbinfo info & hash locks to epoch for read and mutex for write
  - Garbage collect code that handled INP_INFO_TRY_RLOCK failures as
    INP_INFO_RLOCK which can no longer fail
  
  When running 64 netperfs sending minimal sized packets on a 2x8x2 reduces
  unhalted core cycles samples in rwlock rlock/runlock in udp_send from 51% to
  3%.
  
  Overall packet throughput rate limited by CPU affinity and NIC driver design
  choices.
  
  On the receiver unhalted core cycles samples in in_pcblookup_hash went from
  13% to to 1.6%
  
  Tested by LLNW and pho@
  
  Reviewed by: jtl
  Sponsored by: Limelight Networks
  Differential Revision: https://reviews.freebsd.org/D15686

Modified:
  head/sys/kern/subr_witness.c
  head/sys/netinet/in_pcb.h
  head/sys/netinet/tcp_hpts.c
  head/sys/netinet/tcp_input.c
  head/sys/netinet/tcp_stacks/rack.c
  head/sys/netinet/tcp_timewait.c

Modified: head/sys/kern/subr_witness.c
==============================================================================
--- head/sys/kern/subr_witness.c	Tue Jun 19 01:33:03 2018	(r335355)
+++ head/sys/kern/subr_witness.c	Tue Jun 19 01:54:00 2018	(r335356)
@@ -561,14 +561,14 @@ static struct witness_order_list_entry order_lists[] =
 	/*
 	 * UDP/IP
 	 */
-	{ "udp", &lock_class_rw },
+	{ "udp", &lock_class_mtx_sleep },
 	{ "udpinp", &lock_class_rw },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * TCP/IP
 	 */
-	{ "tcp", &lock_class_rw },
+	{ "tcp", &lock_class_mtx_sleep },
 	{ "tcpinp", &lock_class_rw },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ NULL, NULL },

Modified: head/sys/netinet/in_pcb.h
==============================================================================
--- head/sys/netinet/in_pcb.h	Tue Jun 19 01:33:03 2018	(r335355)
+++ head/sys/netinet/in_pcb.h	Tue Jun 19 01:54:00 2018	(r335356)
@@ -51,6 +51,8 @@
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <net/vnet.h>
+#include <net/if.h>
+#include <net/if_var.h>
 #include <vm/uma.h>
 #endif
 #include <sys/ck.h>
@@ -157,6 +159,7 @@ struct in_conninfo {
  * Key:
  * (b) - Protected by the hpts lock.
  * (c) - Constant after initialization
+ * (e) - Protected by the net_epoch_prempt epoch
  * (g) - Protected by the pcbgroup lock
  * (i) - Protected by the inpcb lock
  * (p) - Protected by the pcbinfo lock for the inpcb
@@ -231,7 +234,7 @@ struct inpcbpolicy;
 struct m_snd_tag;
 struct inpcb {
 	/* Cache line #1 (amd64) */
-	CK_LIST_ENTRY(inpcb) inp_hash;	/* (h/i) hash list */
+	CK_LIST_ENTRY(inpcb) inp_hash;	/* [w](h/i) [r](e/i)  hash list */
 	CK_LIST_ENTRY(inpcb) inp_pcbgrouphash;	/* (g/i) hash list */
 	struct rwlock	inp_lock;
 	/* Cache line #2 (amd64) */
@@ -324,8 +327,8 @@ struct inpcb {
 		struct route_in6 inp_route6;
 	};
 	CK_LIST_ENTRY(inpcb) inp_list;	/* (p/l) list for all PCBs for proto */
-	                                /* (p[w]) for list iteration */
-	                                /* (p[r]/l) for addition/removal */
+	                                /* (e[r]) for list iteration */
+	                                /* (p[w]/l) for addition/removal */
 	struct epoch_context inp_epoch_ctx;
 };
 #endif	/* _KERNEL */
@@ -436,22 +439,23 @@ struct in_pcblist {
  * Locking key:
  *
  * (c) Constant or nearly constant after initialisation
+ * (e) - Protected by the net_epoch_prempt epoch
  * (g) Locked by ipi_lock
  * (l) Locked by ipi_list_lock
- * (h) Read using either ipi_hash_lock or inpcb lock; write requires both
+ * (h) Read using either net_epoch_preempt or inpcb lock; write requires both ipi_hash_lock and inpcb lock
  * (p) Protected by one or more pcbgroup locks
  * (x) Synchronisation properties poorly defined
  */
 struct inpcbinfo {
 	/*
-	 * Global lock protecting full inpcb list traversal
+	 * Global lock protecting inpcb list modification
 	 */
-	struct rwlock		 ipi_lock;
+	struct mtx		 ipi_lock;
 
 	/*
 	 * Global list of inpcbs on the protocol.
 	 */
-	struct inpcbhead	*ipi_listhead;		/* (g/l) */
+	struct inpcbhead	*ipi_listhead;		/* [r](e) [w](g/l) */
 	u_int			 ipi_count;		/* (l) */
 
 	/*
@@ -482,9 +486,9 @@ struct inpcbinfo {
 	u_int			 ipi_hashfields;	/* (c) */
 
 	/*
-	 * Global lock protecting non-pcbgroup hash lookup tables.
+	 * Global lock protecting modification non-pcbgroup hash lookup tables.
 	 */
-	struct rwlock		 ipi_hash_lock;
+	struct mtx		 ipi_hash_lock;
 
 	/*
 	 * Global hash of inpcbs, hashed by local and foreign addresses and
@@ -626,20 +630,18 @@ int	inp_so_options(const struct inpcb *inp);
 #endif /* _KERNEL */
 
 #define INP_INFO_LOCK_INIT(ipi, d) \
-	rw_init_flags(&(ipi)->ipi_lock, (d), RW_RECURSE)
-#define INP_INFO_LOCK_DESTROY(ipi)  rw_destroy(&(ipi)->ipi_lock)
-#define INP_INFO_RLOCK(ipi)	rw_rlock(&(ipi)->ipi_lock)
-#define INP_INFO_WLOCK(ipi)	rw_wlock(&(ipi)->ipi_lock)
-#define INP_INFO_TRY_RLOCK(ipi)	rw_try_rlock(&(ipi)->ipi_lock)
-#define INP_INFO_TRY_WLOCK(ipi)	rw_try_wlock(&(ipi)->ipi_lock)
-#define INP_INFO_TRY_UPGRADE(ipi)	rw_try_upgrade(&(ipi)->ipi_lock)
-#define INP_INFO_WLOCKED(ipi)	rw_wowned(&(ipi)->ipi_lock)
-#define INP_INFO_RUNLOCK(ipi)	rw_runlock(&(ipi)->ipi_lock)
-#define INP_INFO_WUNLOCK(ipi)	rw_wunlock(&(ipi)->ipi_lock)
-#define	INP_INFO_LOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_lock, RA_LOCKED)
-#define INP_INFO_RLOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_lock, RA_RLOCKED)
-#define INP_INFO_WLOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_lock, RA_WLOCKED)
-#define INP_INFO_UNLOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED)
+	mtx_init(&(ipi)->ipi_lock, (d), NULL, MTX_DEF| MTX_RECURSE)
+#define INP_INFO_LOCK_DESTROY(ipi)  mtx_destroy(&(ipi)->ipi_lock)
+#define INP_INFO_RLOCK(ipi)	NET_EPOCH_ENTER()
+#define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_lock)
+#define INP_INFO_TRY_WLOCK(ipi)	mtx_trylock(&(ipi)->ipi_lock)
+#define INP_INFO_WLOCKED(ipi)	mtx_owned(&(ipi)->ipi_lock)
+#define INP_INFO_RUNLOCK(ipi)	NET_EPOCH_EXIT()
+#define INP_INFO_WUNLOCK(ipi)	mtx_unlock(&(ipi)->ipi_lock)
+#define	INP_INFO_LOCK_ASSERT(ipi)	MPASS(in_epoch() || mtx_owned(&(ipi)->ipi_lock))
+#define INP_INFO_RLOCK_ASSERT(ipi)	MPASS(in_epoch())
+#define INP_INFO_WLOCK_ASSERT(ipi)	mtx_assert(&(ipi)->ipi_lock, MA_OWNED)
+#define INP_INFO_UNLOCK_ASSERT(ipi)	MPASS(!in_epoch() && !mtx_owned(&(ipi)->ipi_lock))
 
 #define INP_LIST_LOCK_INIT(ipi, d) \
         rw_init_flags(&(ipi)->ipi_list_lock, (d), 0)
@@ -660,17 +662,14 @@ int	inp_so_options(const struct inpcb *inp);
 #define INP_LIST_UNLOCK_ASSERT(ipi) \
 	rw_assert(&(ipi)->ipi_list_lock, RA_UNLOCKED)
 
-#define	INP_HASH_LOCK_INIT(ipi, d) \
-	rw_init_flags(&(ipi)->ipi_hash_lock, (d), 0)
-#define	INP_HASH_LOCK_DESTROY(ipi)	rw_destroy(&(ipi)->ipi_hash_lock)
-#define	INP_HASH_RLOCK(ipi)		rw_rlock(&(ipi)->ipi_hash_lock)
-#define	INP_HASH_WLOCK(ipi)		rw_wlock(&(ipi)->ipi_hash_lock)
-#define	INP_HASH_RUNLOCK(ipi)		rw_runlock(&(ipi)->ipi_hash_lock)
-#define	INP_HASH_WUNLOCK(ipi)		rw_wunlock(&(ipi)->ipi_hash_lock)
-#define	INP_HASH_LOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_hash_lock, \
-					    RA_LOCKED)
-#define	INP_HASH_WLOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_hash_lock, \
-					    RA_WLOCKED)
+#define	INP_HASH_LOCK_INIT(ipi, d) mtx_init(&(ipi)->ipi_hash_lock, (d), NULL, MTX_DEF)
+#define	INP_HASH_LOCK_DESTROY(ipi)	mtx_destroy(&(ipi)->ipi_hash_lock)
+#define	INP_HASH_RLOCK(ipi)		NET_EPOCH_ENTER()
+#define	INP_HASH_WLOCK(ipi)		mtx_lock(&(ipi)->ipi_hash_lock)
+#define	INP_HASH_RUNLOCK(ipi)		NET_EPOCH_EXIT()
+#define	INP_HASH_WUNLOCK(ipi)		mtx_unlock(&(ipi)->ipi_hash_lock)
+#define	INP_HASH_LOCK_ASSERT(ipi)	MPASS(in_epoch() || mtx_owned(&(ipi)->ipi_hash_lock))
+#define	INP_HASH_WLOCK_ASSERT(ipi)	mtx_assert(&(ipi)->ipi_hash_lock, MA_OWNED);
 
 #define	INP_GROUP_LOCK_INIT(ipg, d)	mtx_init(&(ipg)->ipg_lock, (d), NULL, \
 					    MTX_DEF | MTX_DUPOK)

Modified: head/sys/netinet/tcp_hpts.c
==============================================================================
--- head/sys/netinet/tcp_hpts.c	Tue Jun 19 01:33:03 2018	(r335355)
+++ head/sys/netinet/tcp_hpts.c	Tue Jun 19 01:54:00 2018	(r335356)
@@ -184,9 +184,6 @@ TUNABLE_INT("net.inet.tcp.hpts_logging_sz", &tcp_hpts_
 
 static struct tcp_hptsi tcp_pace;
 
-static int
-tcp_hptsi_lock_inpinfo(struct inpcb *inp,
-    struct tcpcb **tp);
 static void tcp_wakehpts(struct tcp_hpts_entry *p);
 static void tcp_wakeinput(struct tcp_hpts_entry *p);
 static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv);
@@ -498,59 +495,6 @@ SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, log, CTLTYPE
     0, 0, sysctl_tcp_hpts_log, "A", "tcp hptsi log");
 
 
-/*
- * Try to get the INP_INFO lock.
- *
- * This function always succeeds in getting the lock. It will clear
- * *tpp and return (1) if something critical changed while the inpcb
- * was unlocked. Otherwise, it will leave *tpp unchanged and return (0).
- *
- * This function relies on the fact that the hpts always holds a
- * reference on the inpcb while the segment is on the hptsi wheel and
- * in the input queue.
- *
- */
-static int
-tcp_hptsi_lock_inpinfo(struct inpcb *inp, struct tcpcb **tpp)
-{
-	struct tcp_function_block *tfb;
-	struct tcpcb *tp;
-	void *ptr;
-
-	/* Try the easy way. */
-	if (INP_INFO_TRY_RLOCK(&V_tcbinfo))
-		return (0);
-
-	/*
-	 * OK, let's try the hard way. We'll save the function pointer block
-	 * to make sure that doesn't change while we aren't holding the
-	 * lock.
-	 */
-	tp = *tpp;
-	tfb = tp->t_fb;
-	ptr = tp->t_fb_ptr;
-	INP_WUNLOCK(inp);
-	INP_INFO_RLOCK(&V_tcbinfo);
-	INP_WLOCK(inp);
-	/* If the session went away, return an error. */
-	if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
-	    (inp->inp_flags2 & INP_FREED)) {
-		*tpp = NULL;
-		return (1);
-	}
-	/*
-	 * If the function block or stack-specific data block changed,
-	 * report an error.
-	 */
-	tp = intotcpcb(inp);
-	if ((tp->t_fb != tfb) && (tp->t_fb_ptr != ptr)) {
-		*tpp = NULL;
-		return (1);
-	}
-	return (0);
-}
-
-
 static void
 tcp_wakehpts(struct tcp_hpts_entry *hpts)
 {
@@ -1290,10 +1234,7 @@ out:
 		    (m->m_pkthdr.pace_lock == TI_RLOCKED ||
 		    tp->t_state != TCPS_ESTABLISHED)) {
 			ti_locked = TI_RLOCKED;
-			if (tcp_hptsi_lock_inpinfo(inp, &tp)) {
-				CURVNET_RESTORE();
-				goto out;
-			}
+			INP_INFO_RLOCK(&V_tcbinfo);
 			m = tp->t_in_pkt;
 		}
 		if (in_newts_every_tcb) {
@@ -1360,7 +1301,6 @@ out:
 				 */
 				if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
 				    (inp->inp_flags2 & INP_FREED)) {
-			out_free:
 					while (m) {
 						m_freem(m);
 						m = n;
@@ -1376,8 +1316,7 @@ out:
 				if (ti_locked == TI_UNLOCKED &&
 				    (tp->t_state != TCPS_ESTABLISHED)) {
 					ti_locked = TI_RLOCKED;
-					if (tcp_hptsi_lock_inpinfo(inp, &tp))
-						goto out_free;
+					INP_INFO_RLOCK(&V_tcbinfo);
 				}
 			}	/** end while(m) */
 		}		/** end if ((m != NULL)  && (m == tp->t_in_pkt)) */

Modified: head/sys/netinet/tcp_input.c
==============================================================================
--- head/sys/netinet/tcp_input.c	Tue Jun 19 01:33:03 2018	(r335355)
+++ head/sys/netinet/tcp_input.c	Tue Jun 19 01:54:00 2018	(r335356)
@@ -960,25 +960,10 @@ findpcb:
 	 *
 	 * XXXRW: It may be time to rethink timewait locking.
 	 */
-relocked:
 	if (inp->inp_flags & INP_TIMEWAIT) {
 		if (ti_locked == TI_UNLOCKED) {
-			if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) {
-				in_pcbref(inp);
-				INP_WUNLOCK(inp);
-				INP_INFO_RLOCK(&V_tcbinfo);
-				ti_locked = TI_RLOCKED;
-				INP_WLOCK(inp);
-				if (in_pcbrele_wlocked(inp)) {
-					inp = NULL;
-					goto findpcb;
-				} else if (inp->inp_flags & INP_DROPPED) {
-					INP_WUNLOCK(inp);
-					inp = NULL;
-					goto findpcb;
-				}
-			} else
-				ti_locked = TI_RLOCKED;
+			INP_INFO_RLOCK();
+			ti_locked = TI_RLOCKED;
 		}
 		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 
@@ -1026,23 +1011,8 @@ relocked:
 	      (tp->t_state == TCPS_LISTEN && (thflags & TH_SYN) &&
 	       !IS_FASTOPEN(tp->t_flags)))) {
 		if (ti_locked == TI_UNLOCKED) {
-			if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) {
-				in_pcbref(inp);
-				INP_WUNLOCK(inp);
-				INP_INFO_RLOCK(&V_tcbinfo);
-				ti_locked = TI_RLOCKED;
-				INP_WLOCK(inp);
-				if (in_pcbrele_wlocked(inp)) {
-					inp = NULL;
-					goto findpcb;
-				} else if (inp->inp_flags & INP_DROPPED) {
-					INP_WUNLOCK(inp);
-					inp = NULL;
-					goto findpcb;
-				}
-				goto relocked;
-			} else
-				ti_locked = TI_RLOCKED;
+			INP_INFO_RLOCK();
+			ti_locked = TI_RLOCKED;
 		}
 		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 	}

Modified: head/sys/netinet/tcp_stacks/rack.c
==============================================================================
--- head/sys/netinet/tcp_stacks/rack.c	Tue Jun 19 01:33:03 2018	(r335355)
+++ head/sys/netinet/tcp_stacks/rack.c	Tue Jun 19 01:54:00 2018	(r335356)
@@ -6837,34 +6837,8 @@ rack_do_segment(struct mbuf *m, struct tcphdr *th, str
 		 * Initial input (ACK to SYN-ACK etc)lets go ahead and get
 		 * it processed
 		 */
-		if (ti_locked != TI_RLOCKED && INP_INFO_TRY_RLOCK(&V_tcbinfo))
-			ti_locked = TI_RLOCKED;
-		if (ti_locked != TI_RLOCKED) {
-			inp = tp->t_inpcb;
-			tfb = tp->t_fb;
-			in_pcbref(inp);
-			INP_WUNLOCK(inp);
-			INP_INFO_RLOCK(&V_tcbinfo);
-			ti_locked = TI_RLOCKED;
-			INP_WLOCK(inp);
-			if (in_pcbrele_wlocked(inp))
-				inp = NULL;
-			if (inp == NULL || (inp->inp_flags2 & INP_FREED) ||
-			    (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED))) {
-				/* The TCPCB went away. Free the packet. */
-				INP_INFO_RUNLOCK(&V_tcbinfo);
-				if (inp)
-					INP_WUNLOCK(inp);
-				m_freem(m);
-				return;
-			}
-			/* If the stack changed, call the correct stack. */
-			if (tp->t_fb != tfb) {
-				tp->t_fb->tfb_tcp_do_segment(m, th, so, tp,
-				    drop_hdrlen, tlen, iptos, ti_locked);
-				return;
-			}
-		}
+		INP_INFO_RLOCK();
+		ti_locked = TI_RLOCKED;
 		tcp_get_usecs(&tv);
 		rack_hpts_do_segment(m, th, so, tp, drop_hdrlen,
 		    tlen, iptos, ti_locked, 0, &tv);

Modified: head/sys/netinet/tcp_timewait.c
==============================================================================
--- head/sys/netinet/tcp_timewait.c	Tue Jun 19 01:33:03 2018	(r335355)
+++ head/sys/netinet/tcp_timewait.c	Tue Jun 19 01:54:00 2018	(r335356)
@@ -707,54 +707,46 @@ tcp_tw_2msl_scan(int reuse)
 		in_pcbref(inp);
 		TW_RUNLOCK(V_tw_lock);
 
-		if (INP_INFO_TRY_RLOCK(&V_tcbinfo)) {
-
-			INP_WLOCK(inp);
-			tw = intotw(inp);
-			if (in_pcbrele_wlocked(inp)) {
-				if (__predict_true(tw == NULL)) {
-					INP_INFO_RUNLOCK(&V_tcbinfo);
-					continue;
-				} else {
-					/* This should not happen as in TIMEWAIT
-					 * state the inp should not be destroyed
-					 * before its tcptw. If INVARIANTS is
-					 * defined panic.
-					 */
+		INP_INFO_RLOCK(&V_tcbinfo);
+		INP_WLOCK(inp);
+		tw = intotw(inp);
+		if (in_pcbrele_wlocked(inp)) {
+			if (__predict_true(tw == NULL)) {
+				INP_INFO_RUNLOCK(&V_tcbinfo);
+				continue;
+			} else {
+				/* This should not happen as in TIMEWAIT
+				 * state the inp should not be destroyed
+				 * before its tcptw. If INVARIANTS is
+				 * defined panic.
+				 */
 #ifdef INVARIANTS
-					panic("%s: Panic before an infinite "
-					    "loop: INP_TIMEWAIT && (INP_FREED "
-					    "|| inp last reference) && tw != "
-					    "NULL", __func__);
+				panic("%s: Panic before an infinite "
+					  "loop: INP_TIMEWAIT && (INP_FREED "
+					  "|| inp last reference) && tw != "
+					  "NULL", __func__);
 #else
-					log(LOG_ERR, "%s: Avoid an infinite "
-					    "loop: INP_TIMEWAIT && (INP_FREED "
-					    "|| inp last reference) && tw != "
-					    "NULL", __func__);
+				log(LOG_ERR, "%s: Avoid an infinite "
+					"loop: INP_TIMEWAIT && (INP_FREED "
+					"|| inp last reference) && tw != "
+					"NULL", __func__);
 #endif
-					INP_INFO_RUNLOCK(&V_tcbinfo);
-					break;
-				}
-			}
-
-			if (tw == NULL) {
-				/* tcp_twclose() has already been called */
-				INP_WUNLOCK(inp);
 				INP_INFO_RUNLOCK(&V_tcbinfo);
-				continue;
+				break;
 			}
+		}
 
-			tcp_twclose(tw, reuse);
+		if (tw == NULL) {
+			/* tcp_twclose() has already been called */
+			INP_WUNLOCK(inp);
 			INP_INFO_RUNLOCK(&V_tcbinfo);
-			if (reuse)
-			    return tw;
-		} else {
-			/* INP_INFO lock is busy, continue later. */
-			INP_WLOCK(inp);
-			if (!in_pcbrele_wlocked(inp))
-				INP_WUNLOCK(inp);
-			break;
+			continue;
 		}
+
+		tcp_twclose(tw, reuse);
+		INP_INFO_RUNLOCK(&V_tcbinfo);
+		if (reuse)
+			return tw;
 	}
 
 	return NULL;



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201806190154.w5J1s0Nd029642>