Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 20 Apr 2021 17:06:36 GMT
From:      Gleb Smirnoff <glebius@FreeBSD.org>
To:        src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org
Subject:   git: 1db08fbe3ffa - main - tcp_input: always request read-locking of PCB for any pure SYN segment.
Message-ID:  <202104201706.13KH6aMh039585@gitrepo.freebsd.org>

next in thread | raw e-mail | index | archive | help
The branch main has been updated by glebius:

URL: https://cgit.FreeBSD.org/src/commit/?id=1db08fbe3ffad1be59c9449628ee867b50f6be6f

commit 1db08fbe3ffad1be59c9449628ee867b50f6be6f
Author:     Gleb Smirnoff <glebius@FreeBSD.org>
AuthorDate: 2021-04-16 21:56:14 +0000
Commit:     Gleb Smirnoff <glebius@FreeBSD.org>
CommitDate: 2021-04-20 17:02:20 +0000

    tcp_input: always request read-locking of PCB for any pure SYN segment.
    
    This is further rework of 08d9c920275.  Now we carry the knowledge of
    lock type all the way through tcp_input() and also into tcp_twcheck().
    Ideally the rlocking for pure SYNs should propagate all the way into
    the alternative TCP stacks, but not yet today.
    
    This should close a race when socket is bind(2)-ed but not yet
    listen(2)-ed and a SYN-packet arrives racing with listen(2), discovered
    recently by pho@.
---
 sys/netinet/in_pcb.c       | 14 ++++----------
 sys/netinet/in_pcb.h       |  3 +--
 sys/netinet/tcp_input.c    | 16 +++++++++++++---
 sys/netinet/tcp_timewait.c | 29 ++++++++++++++++++++++++-----
 sys/netinet6/in6_pcb.c     | 13 +++----------
 5 files changed, 45 insertions(+), 30 deletions(-)

diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index 43cd469e0fcf..43eb31c4e31d 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -2524,12 +2524,6 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
 			INP_WLOCK(inp);
 		} else if (lookupflags & INPLOOKUP_RLOCKPCB) {
 			INP_RLOCK(inp);
-		} else if (lookupflags & INPLOOKUP_RLOCKLISTEN) {
-			if (inp->inp_socket != NULL &&
-			    SOLISTENING(inp->inp_socket))
-				INP_RLOCK(inp);
-			else
-				INP_WLOCK(inp);
 		} else
 			panic("%s: locking bug", __func__);
 		if (__predict_false(inp->inp_flags2 & INP_FREED)) {
@@ -2557,8 +2551,8 @@ in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
 
 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
-	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB |
-	    INPLOOKUP_RLOCKLISTEN)) != 0, ("%s: LOCKPCB not set", __func__));
+	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
+	    ("%s: LOCKPCB not set", __func__));
 
 	/*
 	 * When not using RSS, use connection groups in preference to the
@@ -2593,8 +2587,8 @@ in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
 
 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
-	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB |
-	    INPLOOKUP_RLOCKLISTEN)) != 0, ("%s: LOCKPCB not set", __func__));
+	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
+	    ("%s: LOCKPCB not set", __func__));
 
 #ifdef PCBGROUP
 	/*
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
index 4959fc238dfb..923c5ba4edff 100644
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -761,10 +761,9 @@ int	inp_so_options(const struct inpcb *inp);
 #define	INPLOOKUP_WILDCARD	0x00000001	/* Allow wildcard sockets. */
 #define	INPLOOKUP_RLOCKPCB	0x00000002	/* Return inpcb read-locked. */
 #define	INPLOOKUP_WLOCKPCB	0x00000004	/* Return inpcb write-locked. */
-#define	INPLOOKUP_RLOCKLISTEN	0x00000008	/* Rlock if listening, W if !.*/
 
 #define	INPLOOKUP_MASK	(INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB | \
-	    INPLOOKUP_WLOCKPCB | INPLOOKUP_RLOCKLISTEN)
+	    INPLOOKUP_WLOCKPCB)
 
 #define	sotoinpcb(so)	((struct inpcb *)(so)->so_pcb)
 
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index cac67024705e..9d45c5ab42dc 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -834,11 +834,12 @@ tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
 		fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
 
 	/*
-	 * For initial SYN packets arriving on listening socket,
-	 * we don't need write lock.
+	 * For initial SYN packets we don't need write lock on matching
+	 * PCB, be it a listening one or a synchronized one.  The packet
+	 * shall not modify its state.
 	 */
 	lookupflag = (thflags & (TH_ACK|TH_SYN)) == TH_SYN ?
-	    INPLOOKUP_RLOCKLISTEN : INPLOOKUP_WLOCKPCB;
+	    INPLOOKUP_RLOCKPCB : INPLOOKUP_WLOCKPCB;
 findpcb:
 #ifdef INET6
 	if (isipv6 && fwd_tag != NULL) {
@@ -1380,7 +1381,16 @@ tfo_socket_result:
 	 * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later
 	 * state.  tcp_do_segment() always consumes the mbuf chain, unlocks
 	 * the inpcb, and unlocks pcbinfo.
+	 *
+	 * XXXGL: in case of a pure SYN arriving on existing connection
+	 * TCP stacks won't need to modify the PCB, they would either drop
+	 * the segment silently, or send a challenge ACK.  However, we try
+	 * to upgrade the lock, because calling convention for stacks is
+	 * write-lock on PCB.  If upgrade fails, drop the SYN.
 	 */
+	if (lookupflag == INPLOOKUP_RLOCKPCB && INP_TRY_UPGRADE(inp) == 0)
+		goto dropunlock;
+
 	tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos);
 	return (IPPROTO_DONE);
 
diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c
index b62386ddca05..695ec4413ac9 100644
--- a/sys/netinet/tcp_timewait.c
+++ b/sys/netinet/tcp_timewait.c
@@ -377,7 +377,9 @@ tcp_twstart(struct tcpcb *tp)
 /*
  * Returns 1 if the TIME_WAIT state was killed and we should start over,
  * looking for a pcb in the listen state.  Returns 0 otherwise.
- * It be called with to == NULL only for pure SYN-segments.
+ *
+ * For pure SYN-segments the PCB shall be read-locked and the tcpopt pointer
+ * may be NULL.  For the rest write-lock and valid tcpopt.
  */
 int
 tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
@@ -388,7 +390,7 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
 	tcp_seq seq;
 
 	NET_EPOCH_ASSERT();
-	INP_WLOCK_ASSERT(inp);
+	INP_LOCK_ASSERT(inp);
 
 	/*
 	 * XXXRW: Time wait state for inpcb has been recycled, but inpcb is
@@ -401,8 +403,16 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
 		goto drop;
 
 	thflags = th->th_flags;
-	KASSERT(to != NULL || (thflags & (TH_SYN | TH_ACK)) == TH_SYN,
-	        ("tcp_twcheck: called without options on a non-SYN segment"));
+#ifdef INVARIANTS
+	if ((thflags & (TH_SYN | TH_ACK)) == TH_SYN)
+		INP_RLOCK_ASSERT(inp);
+	else {
+		INP_WLOCK_ASSERT(inp);
+		KASSERT(to != NULL,
+		    ("%s: called without options on a non-SYN segment",
+		    __func__));
+	}
+#endif
 
 	/*
 	 * NOTE: for FIN_WAIT_2 (to be added later),
@@ -442,6 +452,13 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
 	 * Allow UDP port number changes in this case.
 	 */
 	if ((thflags & TH_SYN) && SEQ_GT(th->th_seq, tw->rcv_nxt)) {
+		/*
+		 * In case we can't upgrade our lock just pretend we have
+		 * lost this packet.
+		 */
+		if (((thflags & (TH_SYN | TH_ACK)) == TH_SYN) &&
+		    INP_TRY_UPGRADE(inp) == 0)
+			goto drop;
 		tcp_twclose(tw, 0);
 		return (1);
 	}
@@ -471,6 +488,8 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
 	if ((thflags & TH_ACK) == 0)
 		goto drop;
 
+	INP_WLOCK_ASSERT(inp);
+
 	/*
 	 * If timestamps were negotiated during SYN/ACK and a
 	 * segment without a timestamp is received, silently drop
@@ -503,7 +522,7 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
 drop:
 	TCP_PROBE5(receive, NULL, NULL, m, NULL, th);
 dropnoprobe:
-	INP_WUNLOCK(inp);
+	INP_UNLOCK(inp);
 	m_freem(m);
 	return (0);
 }
diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c
index 96be795d5757..4cdd2a8f152f 100644
--- a/sys/netinet6/in6_pcb.c
+++ b/sys/netinet6/in6_pcb.c
@@ -1299,12 +1299,6 @@ in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
 			INP_WLOCK(inp);
 		} else if (lookupflags & INPLOOKUP_RLOCKPCB) {
 			INP_RLOCK(inp);
-		} else if (lookupflags & INPLOOKUP_RLOCKLISTEN) {
-			if (inp->inp_socket != NULL &&
-			    SOLISTENING(inp->inp_socket))
-				INP_RLOCK(inp);
-			else
-				INP_WLOCK(inp);
 		} else
 			panic("%s: locking bug", __func__);
 		if (__predict_false(inp->inp_flags2 & INP_FREED)) {
@@ -1331,8 +1325,8 @@ in6_pcblookup(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport,
 
 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
-	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB |
-	    INPLOOKUP_RLOCKLISTEN)) != 0, ("%s: LOCKPCB not set", __func__));
+	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
+	    ("%s: LOCKPCB not set", __func__));
 
 	/*
 	 * When not using RSS, use connection groups in preference to the
@@ -1367,8 +1361,7 @@ in6_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
 
 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
-	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB |
-	    INPLOOKUP_RLOCKLISTEN)) != 0,
+	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 	    ("%s: LOCKPCB not set", __func__));
 
 #ifdef PCBGROUP



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202104201706.13KH6aMh039585>