Skip site navigation (1) | Skip section navigation (2)
Date:      Thu, 2 Dec 2021 18:49:10 GMT
From:      Gleb Smirnoff <glebius@FreeBSD.org>
To:        src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org
Subject:   git: f971e791391d - main - tcp_hpts: rename input queue to drop queue and trim dead code
Message-ID:  <202112021849.1B2InAPE069003@gitrepo.freebsd.org>

next in thread | raw e-mail | index | archive | help
The branch main has been updated by glebius:

URL: https://cgit.FreeBSD.org/src/commit/?id=f971e791391d9f7ff6bfd7d7e0bed93267216329

commit f971e791391d9f7ff6bfd7d7e0bed93267216329
Author:     Gleb Smirnoff <glebius@FreeBSD.org>
AuthorDate: 2021-12-02 18:48:48 +0000
Commit:     Gleb Smirnoff <glebius@FreeBSD.org>
CommitDate: 2021-12-02 18:48:48 +0000

    tcp_hpts: rename input queue to drop queue and trim dead code
    
    The HPTS input queue is in reality used only for "delayed drops".
    When a TCP stack decides to drop a connection on the output path
    it can't do so because of the locking protocol between the main
    tcp_output() and the stacks.  So, rack/bbr utilize HPTS to drop the
    connection in a different context.
    
    In the past the queue could also process input packets in the
    context of the HPTS thread, but now no stack uses this, so remove
    this functionality.
    
    Reviewed by:            rrs
    Differential revision:  https://reviews.freebsd.org/D33025
---
 sys/netinet/in_pcb.c          |   6 +-
 sys/netinet/in_pcb.h          |  18 +--
 sys/netinet/tcp_hpts.c        | 342 ++++++++++++++++--------------------------
 sys/netinet/tcp_hpts.h        |   7 +-
 sys/netinet/tcp_lro.c         |   2 +-
 sys/netinet/tcp_stacks/bbr.c  |   2 +-
 sys/netinet/tcp_stacks/rack.c |  30 ++--
 sys/netinet/tcp_subr.c        |   2 +-
 8 files changed, 163 insertions(+), 246 deletions(-)

diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index ac8c0d3e368a..081d204f559c 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -629,7 +629,7 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
 	 * If using hpts lets drop a random number in so
 	 * not all new connections fall on the same CPU.
 	 */
-	inp->inp_hpts_cpu = inp->inp_input_cpu = hpts_random_cpu(inp);
+	inp->inp_hpts_cpu = inp->inp_dropq_cpu = hpts_random_cpu(inp);
 #endif
 	refcount_init(&inp->inp_refcount, 1);   /* Reference from socket. */
 	INP_WLOCK(inp);
@@ -1760,7 +1760,7 @@ in_pcbrele_rlocked(struct inpcb *inp)
 	MPASS(inp->inp_flags & INP_FREED);
 	MPASS(inp->inp_socket == NULL);
 	MPASS(inp->inp_in_hpts == 0);
-	MPASS(inp->inp_in_input == 0);
+	MPASS(inp->inp_in_dropq == 0);
 	INP_RUNLOCK(inp);
 	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
 	return (true);
@@ -1778,7 +1778,7 @@ in_pcbrele_wlocked(struct inpcb *inp)
 	MPASS(inp->inp_flags & INP_FREED);
 	MPASS(inp->inp_socket == NULL);
 	MPASS(inp->inp_in_hpts == 0);
-	MPASS(inp->inp_in_input == 0);
+	MPASS(inp->inp_in_dropq == 0);
 	INP_WUNLOCK(inp);
 	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
 	return (true);
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
index 305356914d14..3e89ba9ee90f 100644
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -234,22 +234,21 @@ struct inpcb {
 	 * fields can *not* be collapsed into a signal bit field.
 	 */
 #if defined(__amd64__) || defined(__i386__)
-	volatile uint8_t inp_in_hpts; /* on output hpts (lock b) */
-	volatile uint8_t inp_in_input; /* on input hpts (lock b) */
+	uint8_t inp_in_hpts; /* on output hpts (lock b) */
+	uint8_t inp_in_dropq; /* on input hpts (lock b) */
 #else
-	volatile uint32_t inp_in_hpts; /* on output hpts (lock b) */
-	volatile uint32_t inp_in_input; /* on input hpts (lock b) */
+	uint32_t inp_in_hpts; /* on output hpts (lock b) */
+	uint32_t inp_in_dropq; /* on input hpts (lock b) */
 #endif
 	volatile uint16_t  inp_hpts_cpu; /* Lock (i) */
 	volatile uint16_t  inp_irq_cpu;	/* Set by LRO in behalf of or the driver */
 	u_int	inp_refcount;		/* (i) refcount */
 	int	inp_flags;		/* (i) generic IP/datagram flags */
 	int	inp_flags2;		/* (i) generic IP/datagram flags #2*/
-	volatile uint16_t  inp_input_cpu; /* Lock (i) */
-	volatile uint8_t inp_hpts_cpu_set :1,  /* on output hpts (i) */
-			 inp_input_cpu_set : 1,	/* on input hpts (i) */
+	uint16_t  inp_dropq_cpu; /* Lock (i) */
+	uint8_t inp_hpts_cpu_set :1,  /* on output hpts (i) */
+			 inp_dropq_cpu_set : 1,	/* on input hpts (i) */
 			 inp_hpts_calls :1,	/* (i) from output hpts */
-			 inp_input_calls :1,	/* (i) from input hpts */
 			 inp_irq_cpu_set :1,	/* (i) from LRO/Driver */
 			 inp_spare_bits2 : 3;
 	uint8_t inp_numa_domain;	/* numa domain */
@@ -257,7 +256,8 @@ struct inpcb {
 	struct	socket *inp_socket;	/* (i) back pointer to socket */
 	uint32_t 	 inp_hptsslot;	/* Hpts wheel slot this tcb is Lock(i&b) */
 	uint32_t         inp_hpts_drop_reas;	/* reason we are dropping the PCB (lock i&b) */
-	TAILQ_ENTRY(inpcb) inp_input;	/* pacing in  queue next lock(b) */
+	uint32_t	inp_dropq_gencnt;
+	TAILQ_ENTRY(inpcb) inp_dropq;	/* hpts drop queue next lock(b) */
 	struct	inpcbinfo *inp_pcbinfo;	/* (c) PCB list info */
 	struct	ucred	*inp_cred;	/* (c) cache of socket cred */
 	u_int32_t inp_flow;		/* (i) IPv6 flow information */
diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c
index 237452932ca3..a620be4b3e30 100644
--- a/sys/netinet/tcp_hpts.c
+++ b/sys/netinet/tcp_hpts.c
@@ -62,15 +62,7 @@ __FBSDID("$FreeBSD$");
  * Of course this is a bare bones example and the stack will probably
  * have more consideration then just the above.
  *
- * Now the second function (actually two functions I guess :D)
- * the tcp_hpts system provides is the  ability to either abort
- * a connection (later) or process input on a connection.
- * Why would you want to do this? To keep processor locality
- * and or not have to worry about untangling any recursive
- * locks. The input function now is hooked to the new LRO
- * system as well.
- *
- * In order to use the input redirection function the
+ * In order to run input queued segments from the HPTS context the
  * tcp stack must define an input function for
  * tfb_do_queued_segments(). This function understands
  * how to dequeue a array of packets that were input and
@@ -109,6 +101,10 @@ __FBSDID("$FreeBSD$");
  * you have defined the tfb_do_segment_nounlock() as
  * described above.
  *
+ * Now the second function the tcp_hpts system provides is the ability
+ * to abort a connection later. Why would you want to do this?
+ * To not have to worry about untangling any recursive locks.
+ *
  * The second feature of the input side of hpts is the
  * dropping of a connection. This is due to the way that
  * locking may have occured on the INP_WLOCK. So if
@@ -202,6 +198,8 @@ __FBSDID("$FreeBSD$");
 
 /* Each hpts has its own p_mtx which is used for locking */
 #define	HPTS_MTX_ASSERT(hpts)	mtx_assert(&(hpts)->p_mtx, MA_OWNED)
+#define	HPTS_LOCK(hpts)		mtx_lock(&(hpts)->p_mtx)
+#define	HPTS_UNLOCK(hpts)	mtx_unlock(&(hpts)->p_mtx)
 TAILQ_HEAD(hptsh, inpcb);
 struct tcp_hpts_entry {
 	/* Cache line 0x00 */
@@ -226,10 +224,11 @@ struct tcp_hpts_entry {
 	uint8_t p_fill[3];	  /* Fill to 32 bits */
 	/* Cache line 0x40 */
 	void *p_inp;
-	struct hptsh p_input;	/* For the tcp-input runner */
+	TAILQ_HEAD(, inpcb) p_dropq;	/* Delayed drop queue */
 	/* Hptsi wheel */
 	struct hptsh *p_hptss;
-	int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */
+	uint32_t p_dropq_cnt;		/* Count on drop queue */
+	uint32_t p_dropq_gencnt;
 	uint32_t p_hpts_sleep_time;	/* Current sleep interval having a max
 					 * of 255ms */
 	uint32_t overidden_sleep;	/* what was overrided by min-sleep for logging */
@@ -270,7 +269,6 @@ static int hpts_does_tp_logging = 0;
 static int hpts_use_assigned_cpu = 1;
 static int32_t hpts_uses_oldest = OLDEST_THRESHOLD;
 
-static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv);
 static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout);
 static void tcp_hpts_thread(void *ctx);
 static void tcp_init_hptsi(void *st);
@@ -558,41 +556,6 @@ hpts_sane_pace_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hpt
 	}
 }
 
-static inline void
-hpts_sane_input_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, int clear)
-{
-	HPTS_MTX_ASSERT(hpts);
-	KASSERT(hpts->p_cpu == inp->inp_hpts_cpu,
-		("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp));
-	KASSERT(inp->inp_in_input != 0,
-		("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp));
-	TAILQ_REMOVE(&hpts->p_input, inp, inp_input);
-	hpts->p_on_inqueue_cnt--;
-	KASSERT(hpts->p_on_inqueue_cnt >= 0,
-		("Hpts in goes negative inp:%p hpts:%p",
-		 inp, hpts));
-	KASSERT((((TAILQ_EMPTY(&hpts->p_input) != 0) && (hpts->p_on_inqueue_cnt == 0)) ||
-		 ((TAILQ_EMPTY(&hpts->p_input) == 0) && (hpts->p_on_inqueue_cnt > 0))),
-		("%s hpts:%p input cnt (p_on_inqueue):%d and queue state mismatch",
-		 __FUNCTION__, hpts, hpts->p_on_inqueue_cnt));
-	if (clear)
-		inp->inp_in_input = 0;
-}
-
-static inline void
-hpts_sane_input_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, int line)
-{
-	HPTS_MTX_ASSERT(hpts);
-	KASSERT(hpts->p_cpu == inp->inp_hpts_cpu,
-		("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp));
-	KASSERT(inp->inp_in_input == 0,
-		("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp));
-	TAILQ_INSERT_TAIL(&hpts->p_input, inp, inp_input);
-	inp->inp_in_input = 1;
-	hpts->p_on_inqueue_cnt++;
-	in_pcbref(inp);
-}
-
 static struct tcp_hpts_entry *
 tcp_hpts_lock(struct inpcb *inp)
 {
@@ -614,19 +577,19 @@ again:
 }
 
 static struct tcp_hpts_entry *
-tcp_input_lock(struct inpcb *inp)
+tcp_dropq_lock(struct inpcb *inp)
 {
 	struct tcp_hpts_entry *hpts;
 	int32_t hpts_num;
 
 again:
-	hpts_num = inp->inp_input_cpu;
+	hpts_num = inp->inp_dropq_cpu;
 	hpts = tcp_pace.rp_ent[hpts_num];
 	KASSERT(mtx_owned(&hpts->p_mtx) == 0,
 		("Hpts:%p owns mtx prior-to lock line:%d",
 		hpts, __LINE__));
 	mtx_lock(&hpts->p_mtx);
-	if (hpts_num != inp->inp_input_cpu) {
+	if (hpts_num != inp->inp_dropq_cpu) {
 		mtx_unlock(&hpts->p_mtx);
 		goto again;
 	}
@@ -652,13 +615,38 @@ tcp_hpts_remove_locked_output(struct tcp_hpts_entry *hpts, struct inpcb *inp, in
 }
 
 static void
-tcp_hpts_remove_locked_input(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
+tcp_dropq_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp)
 {
+	bool released __diagused;
+
 	HPTS_MTX_ASSERT(hpts);
-	if (inp->inp_in_input) {
-		hpts_sane_input_remove(hpts, inp, 1);
-		tcp_remove_hpts_ref(inp, hpts, line);
+	INP_WLOCK_ASSERT(inp);
+
+	if (inp->inp_in_dropq != IHPTS_ONQUEUE)
+		return;
+
+	MPASS(hpts->p_cpu == inp->inp_dropq_cpu);
+	if (__predict_true(inp->inp_dropq_gencnt == hpts->p_dropq_gencnt)) {
+		TAILQ_REMOVE(&hpts->p_dropq, inp, inp_dropq);
+		MPASS(hpts->p_dropq_cnt > 0);
+		hpts->p_dropq_cnt--;
+		inp->inp_in_dropq = IHPTS_NONE;
+		released = in_pcbrele_wlocked(inp);
+		MPASS(released == false);
+	} else {
+		/*
+		 * tcp_delayed_drop() now owns the TAILQ head of this inp.
+		 * Can't TAILQ_REMOVE, just mark it.
+		 */
+#ifdef INVARIANTS
+		struct inpcb *tmp;
+
+		TAILQ_FOREACH(tmp, &hpts->p_dropq, inp_dropq)
+			MPASS(tmp != inp);
+#endif
+		inp->inp_in_dropq = IHPTS_MOVING;
 	}
+
 }
 
 /*
@@ -669,7 +657,7 @@ tcp_hpts_remove_locked_input(struct tcp_hpts_entry *hpts, struct inpcb *inp, int
  *
  * Valid values in the flags are
  * HPTS_REMOVE_OUTPUT - remove from the output of the hpts.
- * HPTS_REMOVE_INPUT - remove from the input of the hpts.
+ * HPTS_REMOVE_DROPQ - remove from the drop queue of the hpts.
  * Note that you can use one or both values together
  * and get two actions.
  */
@@ -684,9 +672,9 @@ __tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line)
 		tcp_hpts_remove_locked_output(hpts, inp, flags, line);
 		mtx_unlock(&hpts->p_mtx);
 	}
-	if (flags & HPTS_REMOVE_INPUT) {
-		hpts = tcp_input_lock(inp);
-		tcp_hpts_remove_locked_input(hpts, inp, flags, line);
+	if (flags & HPTS_REMOVE_DROPQ) {
+		hpts = tcp_dropq_lock(inp);
+		tcp_dropq_remove(hpts, inp);
 		mtx_unlock(&hpts->p_mtx);
 	}
 }
@@ -1097,31 +1085,29 @@ __tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line){
 }
 
 void
-__tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line)
+tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason)
 {
 	struct tcp_hpts_entry *hpts;
-	struct tcpcb *tp;
+	struct tcpcb *tp = intotcpcb(inp);
 
-	tp = intotcpcb(inp);
-	hpts = tcp_input_lock(tp->t_inpcb);
-	if (inp->inp_in_input == 0) {
-		/* Ok we need to set it on the hpts in the current slot */
-		hpts_sane_input_insert(hpts, inp, line);
-		if ((hpts->p_hpts_active == 0) &&
-		    (hpts->p_on_min_sleep == 0)){
-			/*
-			 * Activate the hpts if it is sleeping.
-			 */
-			hpts->p_direct_wake = 1;
-			tcp_wakehpts(hpts);
-		}
-	} else if ((hpts->p_hpts_active == 0) &&
-		   (hpts->p_on_min_sleep == 0)){
+	INP_WLOCK_ASSERT(inp);
+	inp->inp_hpts_drop_reas = reason;
+	if (inp->inp_in_dropq != IHPTS_NONE)
+		return;
+	hpts = tcp_dropq_lock(tp->t_inpcb);
+	MPASS(hpts->p_cpu == inp->inp_dropq_cpu);
+
+	TAILQ_INSERT_TAIL(&hpts->p_dropq, inp, inp_dropq);
+	inp->inp_in_dropq = IHPTS_ONQUEUE;
+	inp->inp_dropq_gencnt = hpts->p_dropq_gencnt;
+	hpts->p_dropq_cnt++;
+	in_pcbref(inp);
+
+	if ((hpts->p_hpts_active == 0) && (hpts->p_on_min_sleep == 0)){
 		hpts->p_direct_wake = 1;
 		tcp_wakehpts(hpts);
 	}
-	inp->inp_hpts_drop_reas = reason;
-	mtx_unlock(&hpts->p_mtx);
+	HPTS_UNLOCK(hpts);
 }
 
 static uint16_t
@@ -1136,8 +1122,8 @@ hpts_random_cpu(struct inpcb *inp){
 	 * If one has been set use it i.e. we want both in and out on the
 	 * same hpts.
 	 */
-	if (inp->inp_input_cpu_set) {
-		return (inp->inp_input_cpu);
+	if (inp->inp_dropq_cpu_set) {
+		return (inp->inp_dropq_cpu);
 	} else if (inp->inp_hpts_cpu_set) {
 		return (inp->inp_hpts_cpu);
 	}
@@ -1160,8 +1146,8 @@ hpts_cpuid(struct inpcb *inp, int *failed)
 	 * If one has been set use it i.e. we want both in and out on the
 	 * same hpts.
 	 */
-	if (inp->inp_input_cpu_set) {
-		return (inp->inp_input_cpu);
+	if (inp->inp_dropq_cpu_set) {
+		return (inp->inp_dropq_cpu);
 	} else if (inp->inp_hpts_cpu_set) {
 		return (inp->inp_hpts_cpu);
 	}
@@ -1249,117 +1235,50 @@ tcp_drop_in_pkts(struct tcpcb *tp)
  * list.
  */
 static void
-tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv)
+tcp_delayed_drop(struct tcp_hpts_entry *hpts)
 {
+	TAILQ_HEAD(, inpcb) head = TAILQ_HEAD_INITIALIZER(head);
+	struct inpcb *inp, *tmp;
 	struct tcpcb *tp;
-	struct inpcb *inp;
-	uint16_t drop_reason;
-	int16_t set_cpu;
-	uint32_t did_prefetch = 0;
-	int dropped;
 
 	HPTS_MTX_ASSERT(hpts);
 	NET_EPOCH_ASSERT();
 
-	while ((inp = TAILQ_FIRST(&hpts->p_input)) != NULL) {
-		HPTS_MTX_ASSERT(hpts);
-		hpts_sane_input_remove(hpts, inp, 0);
-		if (inp->inp_input_cpu_set == 0) {
-			set_cpu = 1;
-		} else {
-			set_cpu = 0;
-		}
-		hpts->p_inp = inp;
-		drop_reason = inp->inp_hpts_drop_reas;
-		inp->inp_in_input = 0;
-		mtx_unlock(&hpts->p_mtx);
+	TAILQ_SWAP(&head, &hpts->p_dropq, inpcb, inp_dropq);
+	hpts->p_dropq_cnt = 0;
+	hpts->p_dropq_gencnt++;
+	HPTS_UNLOCK(hpts);
+
+	TAILQ_FOREACH_SAFE(inp, &head, inp_dropq, tmp) {
 		INP_WLOCK(inp);
-#ifdef VIMAGE
-		CURVNET_SET(inp->inp_vnet);
-#endif
-		if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED))) {
-out:
-			hpts->p_inp = NULL;
-			if (in_pcbrele_wlocked(inp) == 0) {
+		MPASS(inp->inp_hpts_drop_reas != 0);
+		if (__predict_false(inp->inp_in_dropq == IHPTS_MOVING)) {
+			inp->inp_in_dropq = IHPTS_NONE;
+			if (in_pcbrele_wlocked(inp) == false)
 				INP_WUNLOCK(inp);
-			}
-#ifdef VIMAGE
-			CURVNET_RESTORE();
-#endif
-			mtx_lock(&hpts->p_mtx);
 			continue;
 		}
-		tp = intotcpcb(inp);
-		if ((tp == NULL) || (tp->t_inpcb == NULL)) {
-			goto out;
-		}
-		if (drop_reason) {
-			/* This tcb is being destroyed for drop_reason */
-			tcp_drop_in_pkts(tp);
-			tp = tcp_drop(tp, drop_reason);
-			if (tp == NULL) {
-				INP_WLOCK(inp);
-			}
-			if (in_pcbrele_wlocked(inp) == 0)
+		MPASS(inp->inp_in_dropq == IHPTS_ONQUEUE);
+		inp->inp_in_dropq = IHPTS_NONE;
+		if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED))) {
+			if (in_pcbrele_wlocked(inp) == false)
 				INP_WUNLOCK(inp);
-#ifdef VIMAGE
-			CURVNET_RESTORE();
-#endif
-			mtx_lock(&hpts->p_mtx);
 			continue;
 		}
-		if (set_cpu) {
-			/*
-			 * Setup so the next time we will move to the right
-			 * CPU. This should be a rare event. It will
-			 * sometimes happens when we are the client side
-			 * (usually not the server). Somehow tcp_output()
-			 * gets called before the tcp_do_segment() sets the
-			 * intial state. This means the r_cpu and r_hpts_cpu
-			 * is 0. We get on the hpts, and then tcp_input()
-			 * gets called setting up the r_cpu to the correct
-			 * value. The hpts goes off and sees the mis-match.
-			 * We simply correct it here and the CPU will switch
-			 * to the new hpts nextime the tcb gets added to the
-			 * the hpts (not this time) :-)
-			 */
-			tcp_set_hpts(inp);
-		}
-		if (tp->t_fb_ptr != NULL) {
-			kern_prefetch(tp->t_fb_ptr, &did_prefetch);
-			did_prefetch = 1;
-		}
-		if ((tp->t_fb->tfb_do_queued_segments != NULL) && tp->t_in_pkt) {
-			if (inp->inp_in_input)
-				tcp_hpts_remove(inp, HPTS_REMOVE_INPUT);
-			dropped = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0);
-			if (dropped) {
-				/* Re-acquire the wlock so we can release the reference */
-				INP_WLOCK(inp);
-			}
-		} else if (tp->t_in_pkt) {
-			/*
-			 * We reach here only if we had a
-			 * stack that supported INP_SUPPORTS_MBUFQ
-			 * and then somehow switched to a stack that
-			 * does not. The packets are basically stranded
-			 * and would hang with the connection until
-			 * cleanup without this code. Its not the
-			 * best way but I know of no other way to
-			 * handle it since the stack needs functions
-			 * it does not have to handle queued packets.
-			 */
+		CURVNET_SET(inp->inp_vnet);
+		if (__predict_true((tp = intotcpcb(inp)) != NULL)) {
+			MPASS(tp->t_inpcb == inp);
 			tcp_drop_in_pkts(tp);
+			tp = tcp_drop(tp, inp->inp_hpts_drop_reas);
+			if (tp == NULL)
+				INP_WLOCK(inp);
 		}
-		if (in_pcbrele_wlocked(inp) == 0)
+		if (in_pcbrele_wlocked(inp) == false)
 			INP_WUNLOCK(inp);
-		INP_UNLOCK_ASSERT(inp);
-#ifdef VIMAGE
 		CURVNET_RESTORE();
-#endif
-		mtx_lock(&hpts->p_mtx);
-		hpts->p_inp = NULL;
 	}
+
+	mtx_lock(&hpts->p_mtx); /* XXXGL */
 }
 
 static void
@@ -1489,10 +1408,10 @@ again:
 		hpts->p_nxt_slot = hpts->p_prev_slot;
 		hpts->p_runningslot = hpts_slot(hpts->p_prev_slot, 1);
 	}
-	KASSERT((((TAILQ_EMPTY(&hpts->p_input) != 0) && (hpts->p_on_inqueue_cnt == 0)) ||
-		 ((TAILQ_EMPTY(&hpts->p_input) == 0) && (hpts->p_on_inqueue_cnt > 0))),
+	KASSERT((((TAILQ_EMPTY(&hpts->p_dropq) != 0) && (hpts->p_dropq_cnt == 0)) ||
+		 ((TAILQ_EMPTY(&hpts->p_dropq) == 0) && (hpts->p_dropq_cnt > 0))),
 		("%s hpts:%p in_hpts cnt:%d and queue state mismatch",
-		 __FUNCTION__, hpts, hpts->p_on_inqueue_cnt));
+		 __FUNCTION__, hpts, hpts->p_dropq_cnt));
 	HPTS_MTX_ASSERT(hpts);
 	if (hpts->p_on_queue_cnt == 0) {
 		goto no_one;
@@ -1716,10 +1635,10 @@ no_one:
 	 * Check to see if we took an excess amount of time and need to run
 	 * more ticks (if we did not hit eno-bufs).
 	 */
-	KASSERT((((TAILQ_EMPTY(&hpts->p_input) != 0) && (hpts->p_on_inqueue_cnt == 0)) ||
-		 ((TAILQ_EMPTY(&hpts->p_input) == 0) && (hpts->p_on_inqueue_cnt > 0))),
+	KASSERT((((TAILQ_EMPTY(&hpts->p_dropq) != 0) && (hpts->p_dropq_cnt == 0)) ||
+		 ((TAILQ_EMPTY(&hpts->p_dropq) == 0) && (hpts->p_dropq_cnt > 0))),
 		("%s hpts:%p in_hpts cnt:%d queue state mismatch",
-		 __FUNCTION__, hpts, hpts->p_on_inqueue_cnt));
+		 __FUNCTION__, hpts, hpts->p_dropq_cnt));
 	hpts->p_prev_slot = hpts->p_cur_slot;
 	hpts->p_lasttick = hpts->p_curtick;
 	if ((from_callout == 0) || (loop_cnt > max_pacer_loops)) {
@@ -1765,31 +1684,30 @@ no_run:
 	 * Run any input that may be there not covered
 	 * in running data.
 	 */
-	if (!TAILQ_EMPTY(&hpts->p_input)) {
-		tcp_input_data(hpts, &tv);
-		/*
-		 * Now did we spend too long running input and need to run more ticks?
-		 * Note that if wrap_loop_cnt < 2 then we should have the conditions
-		 * in the KASSERT's true. But if the wheel is behind i.e. wrap_loop_cnt
-		 * is greater than 2, then the condtion most likely are *not* true. Also
-		 * if we are called not from the callout, we don't run the wheel multiple
-		 * times so the slots may not align either.
-		 */
-		KASSERT(((hpts->p_prev_slot == hpts->p_cur_slot) ||
-			 (wrap_loop_cnt >= 2) || (from_callout == 0)),
-			("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts,
-			 hpts->p_prev_slot, hpts->p_cur_slot));
-		KASSERT(((hpts->p_lasttick == hpts->p_curtick)
-			 || (wrap_loop_cnt >= 2) || (from_callout == 0)),
-			("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts,
-			 hpts->p_lasttick, hpts->p_curtick));
-		if (from_callout && (hpts->p_lasttick != hpts->p_curtick)) {
-			hpts->p_curtick = tcp_gethptstick(&tv);
-			counter_u64_add(hpts_loops, 1);
-			hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
-			goto again;
-		}
+	tcp_delayed_drop(hpts);
+	/*
+	 * Now did we spend too long running input and need to run more ticks?
+	 * Note that if wrap_loop_cnt < 2 then we should have the conditions
+	 * in the KASSERT's true. But if the wheel is behind i.e. wrap_loop_cnt
+	 * is greater than 2, then the condtion most likely are *not* true.
+	 * Also if we are called not from the callout, we don't run the wheel
+	 * multiple times so the slots may not align either.
+	 */
+	KASSERT(((hpts->p_prev_slot == hpts->p_cur_slot) ||
+		 (wrap_loop_cnt >= 2) || (from_callout == 0)),
+		("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts,
+		 hpts->p_prev_slot, hpts->p_cur_slot));
+	KASSERT(((hpts->p_lasttick == hpts->p_curtick)
+		 || (wrap_loop_cnt >= 2) || (from_callout == 0)),
+		("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts,
+		 hpts->p_lasttick, hpts->p_curtick));
+	if (from_callout && (hpts->p_lasttick != hpts->p_curtick)) {
+		hpts->p_curtick = tcp_gethptstick(&tv);
+		counter_u64_add(hpts_loops, 1);
+		hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
+		goto again;
 	}
+
 	if (from_callout){
 		tcp_hpts_set_max_sleep(hpts, wrap_loop_cnt);
 	}
@@ -1814,12 +1732,12 @@ __tcp_set_hpts(struct inpcb *inp, int32_t line)
 			inp->inp_hpts_cpu_set = 1;
 	}
 	mtx_unlock(&hpts->p_mtx);
-	hpts = tcp_input_lock(inp);
-	if ((inp->inp_input_cpu_set == 0) &&
-	    (inp->inp_in_input == 0)) {
-		inp->inp_input_cpu = hpts_cpuid(inp, &failed);
+	hpts = tcp_dropq_lock(inp);
+	if ((inp->inp_dropq_cpu_set == 0) &&
+	    (inp->inp_in_dropq == 0)) {
+		inp->inp_dropq_cpu = hpts_cpuid(inp, &failed);
 		if (failed == 0)
-			inp->inp_input_cpu_set = 1;
+			inp->inp_dropq_cpu_set = 1;
 	}
 	mtx_unlock(&hpts->p_mtx);
 }
@@ -2140,7 +2058,7 @@ tcp_init_hptsi(void *st)
 		 */
 		mtx_init(&hpts->p_mtx, "tcp_hpts_lck",
 		    "hpts", MTX_DEF | MTX_DUPOK);
-		TAILQ_INIT(&hpts->p_input);
+		TAILQ_INIT(&hpts->p_dropq);
 		for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) {
 			TAILQ_INIT(&hpts->p_hptss[j]);
 		}
@@ -2155,8 +2073,8 @@ tcp_init_hptsi(void *st)
 		SYSCTL_ADD_INT(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "in_qcnt", CTLFLAG_RD,
-		    &hpts->p_on_inqueue_cnt, 0,
-		    "Count TCB's awaiting input processing");
+		    &hpts->p_dropq_cnt, 0,
+		    "Count TCB's awaiting delayed drop");
 		SYSCTL_ADD_INT(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "out_qcnt", CTLFLAG_RD,
diff --git a/sys/netinet/tcp_hpts.h b/sys/netinet/tcp_hpts.h
index 4b0fca566c3f..2f3cffe0b798 100644
--- a/sys/netinet/tcp_hpts.h
+++ b/sys/netinet/tcp_hpts.h
@@ -116,9 +116,9 @@ struct hpts_diag {
 #ifdef _KERNEL
 #define tcp_hpts_remove(a, b) __tcp_hpts_remove(a, b, __LINE__)
 void __tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line);
-#define HPTS_REMOVE_INPUT  0x01
+#define HPTS_REMOVE_DROPQ  0x01
 #define HPTS_REMOVE_OUTPUT 0x02
-#define HPTS_REMOVE_ALL    (HPTS_REMOVE_INPUT | HPTS_REMOVE_OUTPUT)
+#define HPTS_REMOVE_ALL    (HPTS_REMOVE_DROPQ | HPTS_REMOVE_OUTPUT)
 
 static inline bool
 tcp_in_hpts(struct inpcb *inp)
@@ -160,8 +160,7 @@ tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts
 void __tcp_set_hpts(struct inpcb *inp, int32_t line);
 #define tcp_set_hpts(a) __tcp_set_hpts(a, __LINE__)
 
-void __tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line);
-#define tcp_set_inp_to_drop(a, b) __tcp_set_inp_to_drop(a, b, __LINE__)
+void tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason);
 
 void tcp_run_hpts(void);
 
diff --git a/sys/netinet/tcp_lro.c b/sys/netinet/tcp_lro.c
index 0357056da1b1..215b9097a4fd 100644
--- a/sys/netinet/tcp_lro.c
+++ b/sys/netinet/tcp_lro.c
@@ -1354,7 +1354,7 @@ tcp_lro_flush_tcphpts(struct lro_ctrl *lc, struct lro_entry *le)
 	if (le->m_head != NULL) {
 		counter_u64_add(tcp_inp_lro_direct_queue, 1);
 		tcp_lro_log(tp, lc, le, NULL, 22, 1,
-			    inp->inp_flags2, inp->inp_in_input, 1);
+			    inp->inp_flags2, inp->inp_in_dropq, 1);
 		tcp_queue_pkts(inp, tp, le);
 	}
 	if (should_wake) {
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
index 74a9fada9174..24d238bbd04e 100644
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -1884,7 +1884,7 @@ bbr_fill_in_logging_data(struct tcp_bbr *bbr, struct tcp_log_bbr *l, uint32_t ct
 	l->pacing_gain = bbr->r_ctl.rc_bbr_hptsi_gain;
 	l->cwnd_gain = bbr->r_ctl.rc_bbr_cwnd_gain;
 	l->inhpts = tcp_in_hpts(bbr->rc_inp);
-	l->ininput = bbr->rc_inp->inp_in_input;
+	l->ininput = bbr->rc_inp->inp_in_dropq;
 	l->use_lt_bw = bbr->rc_lt_use_bw;
 	l->pkts_out = bbr->r_ctl.rc_flight_at_input;
 	l->pkt_epoch = bbr->r_ctl.rc_pkt_epoch;
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index 5a4849566cf9..7391734a9786 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -2295,7 +2295,7 @@ rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t
 		log.u_bbr.flex6 = rsm->r_end;
 		log.u_bbr.flex8 = mod;
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
-		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+		log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
 		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 		log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
@@ -2330,7 +2330,7 @@ rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot
 		else
 			log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
-		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+		log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
 		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 		log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
@@ -2355,7 +2355,7 @@ rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rs
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
-		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+		log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
 		log.u_bbr.flex8 = to_num;
 		log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
 		log.u_bbr.flex2 = rack->rc_rack_rtt;
@@ -2394,7 +2394,7 @@ rack_log_map_chg(struct tcpcb *tp, struct tcp_rack *rack,
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.flex8 = flag;
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
-		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+		log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
 		log.u_bbr.cur_del_rate = (uint64_t)prev;
 		log.u_bbr.delRate = (uint64_t)rsm;
 		log.u_bbr.rttProp = (uint64_t)next;
@@ -2439,7 +2439,7 @@ rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t l
 		struct timeval tv;
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
-		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+		log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
 		log.u_bbr.flex1 = t;
 		log.u_bbr.flex2 = len;
 		log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt;
@@ -2589,7 +2589,7 @@ rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
-		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+		log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
 		log.u_bbr.flex1 = line;
 		log.u_bbr.flex2 = tick;
 		log.u_bbr.flex3 = tp->t_maxunacktime;
@@ -2616,7 +2616,7 @@ rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
-		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+		log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
 		log.u_bbr.flex1 = slot;
 		if (rack->rack_no_prr)
 			log.u_bbr.flex2 = 0;
@@ -2718,7 +2718,7 @@ rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, ui
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
-		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+		log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
 		log.u_bbr.flex1 = slot;
 		log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
 		log.u_bbr.flex4 = reason;
@@ -2751,7 +2751,7 @@ rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
-		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+		log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
 		log.u_bbr.flex1 = line;
 		log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to;
 		log.u_bbr.flex3 = flags_on_entry;
@@ -13329,7 +13329,7 @@ rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent
 #endif
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
-		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+		log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
 		if (rack->rack_no_prr == 0)
 			log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
 		else
@@ -14321,7 +14321,7 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
 #endif
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
-		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+		log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
 		if (rack->rack_no_prr == 0)
 			log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
 		else
@@ -15612,7 +15612,7 @@ rack_log_fsb(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
-		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+		log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
 		log.u_bbr.flex1 = error;
 		log.u_bbr.flex2 = flags;
 		log.u_bbr.flex3 = rsm_is_null;
@@ -16128,7 +16128,7 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
-		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+		log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
 		if (rack->rack_no_prr)
 			log.u_bbr.flex1 = 0;
 		else
@@ -16629,7 +16629,7 @@ again:
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
-		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+		log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
 		if (rack->rack_no_prr)
 			log.u_bbr.flex1 = 0;
 		else
@@ -18801,7 +18801,7 @@ send:
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
-		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+		log.u_bbr.ininput = rack->rc_inp->inp_in_dropq;
 		if (rack->rack_no_prr)
 			log.u_bbr.flex1 = 0;
 		else
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 47fa8656a51d..20591e4006b9 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -2096,7 +2096,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
 
 			memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 			log.u_bbr.inhpts = tp->t_inpcb->inp_in_hpts;
-			log.u_bbr.ininput = tp->t_inpcb->inp_in_input;
+			log.u_bbr.ininput = tp->t_inpcb->inp_in_dropq;
 			log.u_bbr.flex8 = 4;
 			log.u_bbr.pkts_out = tp->t_maxseg;
 			log.u_bbr.timeStamp = tcp_get_usecs(&tv);



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202112021849.1B2InAPE069003>