Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 6 Feb 2025 16:27:27 GMT
From:      Mark Johnston <markj@FreeBSD.org>
To:        src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org
Subject:   git: 5dc99e9bb985 - main - tcp: Add a sysctl to modify listening socket FIB inheritance
Message-ID:  <202502061627.516GRRZb095203@gitrepo.freebsd.org>

next in thread | raw e-mail | index | archive | help
The branch main has been updated by markj:

URL: https://cgit.FreeBSD.org/src/commit/?id=5dc99e9bb985dce58e8fc85c09ef4e49bf051971

commit 5dc99e9bb985dce58e8fc85c09ef4e49bf051971
Author:     Mark Johnston <markj@FreeBSD.org>
AuthorDate: 2025-02-06 14:14:49 +0000
Commit:     Mark Johnston <markj@FreeBSD.org>
CommitDate: 2025-02-06 14:14:49 +0000

    tcp: Add a sysctl to modify listening socket FIB inheritance
    
    Introduce the net.inet.tcp.bind_all_fibs tunable, set to 1 by default
    for compatibility with current behaviour.  When set to 0, all TCP
    listening sockets are private to their FIB.  Inbound connection requests
    will only succeed if a matching inpcb is bound to the same FIB as the
    request.
    
    No functional change intended, as the new behaviour is not enabled by
    default.
    
    Reviewed by:    glebius
    MFC after:      2 weeks
    Sponsored by:   Klara, Inc.
    Sponsored by:   Stormshield
    Differential Revision:  https://reviews.freebsd.org/D48663
---
 share/man/man4/tcp.4     | 32 +++++++++++++++++++++++++++++++-
 sys/netinet/tcp_input.c  |  8 +++++++-
 sys/netinet/tcp_usrreq.c | 12 ++++++++----
 sys/netinet/tcp_var.h    |  2 ++
 4 files changed, 48 insertions(+), 6 deletions(-)

diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4
index 192fa90a1635..70b50a3b197f 100644
--- a/share/man/man4/tcp.4
+++ b/share/man/man4/tcp.4
@@ -31,7 +31,7 @@
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
-.Dd August 3, 2024
+.Dd January 10, 2025
 .Dt TCP 4
 .Os
 .Sh NAME
@@ -200,6 +200,35 @@ The alternate TCP stack must already be loaded in the kernel.
 To list the available TCP stacks, see
 .Va functions_available
 in the
+.Ss FIB support
+TCP sockets are FIB-aware.
+They inherit the FIB of the process which created the socket, or that of the
+listening socket for sockets created by
+.Xr accept 2 .
+In particular, the FIB is not inherited from that of the interface where the
+initiating SYN packet was received.
+When an incoming connection request arrives at a listening socket, the initial
+handshake also occurs in the FIB of the listening socket, not that of the
+received packet.
+.Pp
+By default, a TCP listening socket can accept connections originating from any
+FIB.
+If the
+.Va net.inet.tcp.bind_all_fibs
+tunable is set to 0, a listening socket will only accept connections
+originating
+from the listening socket's FIB.
+Connection requests from other FIBs will be treated as though there is no
+listening socket for the destination address and port.
+In this mode, multiple listening sockets owned by the same user can listen on
+the same address and port so long as they belong to different FIBs, similar to
+the behavior of the
+.Dv SO_REUSEPORT
+socket option.
+If the tunable is set to 0, all sockets added to a load-balancing group created
+with the
+.Dv SO_REUSEPORT_LB
+socket option must belong to the same FIB.
 .Sx MIB (sysctl) Variables
 section further down.
 To list the default TCP stack, see
@@ -1041,6 +1070,7 @@ when trying to use a TCP function block that is not available;
 .El
 .Sh SEE ALSO
 .Xr getsockopt 2 ,
+.Xr setfib 2 ,
 .Xr socket 2 ,
 .Xr stats 3 ,
 .Xr sysctl 3 ,
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 647bcd17f7bc..12dc4670f531 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -135,6 +135,11 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_log_in_vain), 0,
     "Log all incoming TCP segments to closed ports");
 
+VNET_DEFINE(int, tcp_bind_all_fibs) = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, bind_all_fibs, CTLFLAG_VNET | CTLFLAG_RDTUN,
+    &VNET_NAME(tcp_bind_all_fibs), 0,
+    "Bound sockets receive traffic from all FIBs");
+
 VNET_DEFINE(int, blackhole) = 0;
 #define	V_blackhole		VNET(blackhole)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW,
@@ -833,7 +838,8 @@ tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
 	 */
 	lookupflag = INPLOOKUP_WILDCARD |
 	    ((thflags & (TH_ACK|TH_SYN)) == TH_SYN ?
-	    INPLOOKUP_RLOCKPCB : INPLOOKUP_WLOCKPCB);
+	    INPLOOKUP_RLOCKPCB : INPLOOKUP_WLOCKPCB) |
+	    (V_tcp_bind_all_fibs ? 0 : INPLOOKUP_FIB);
 findpcb:
 	tp = NULL;
 #ifdef INET6
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index 776ebe5db83b..0b4a93390f1b 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -262,7 +262,8 @@ tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 		goto out;
 	}
 	INP_HASH_WLOCK(&V_tcbinfo);
-	error = in_pcbbind(inp, sinp, 0, td->td_ucred);
+	error = in_pcbbind(inp, sinp, V_tcp_bind_all_fibs ? 0 : INPBIND_FIB,
+	    td->td_ucred);
 	INP_HASH_WUNLOCK(&V_tcbinfo);
 out:
 	tcp_bblog_pru(tp, PRU_BIND, error);
@@ -336,7 +337,8 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 		}
 	}
 #endif
-	error = in6_pcbbind(inp, sin6, 0, td->td_ucred);
+	error = in6_pcbbind(inp, sin6, V_tcp_bind_all_fibs ? 0 : INPBIND_FIB,
+	    td->td_ucred);
 	INP_HASH_WUNLOCK(&V_tcbinfo);
 out:
 	if (error != 0)
@@ -378,7 +380,8 @@ tcp_usr_listen(struct socket *so, int backlog, struct thread *td)
 	}
 	if (inp->inp_lport == 0) {
 		INP_HASH_WLOCK(&V_tcbinfo);
-		error = in_pcbbind(inp, NULL, 0, td->td_ucred);
+		error = in_pcbbind(inp, NULL,
+		    V_tcp_bind_all_fibs ? 0 : INPBIND_FIB, td->td_ucred);
 		INP_HASH_WUNLOCK(&V_tcbinfo);
 	}
 	if (error == 0) {
@@ -441,7 +444,8 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td)
 		inp->inp_vflag &= ~INP_IPV4;
 		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
 			inp->inp_vflag |= INP_IPV4;
-		error = in6_pcbbind(inp, NULL, 0, td->td_ucred);
+		error = in6_pcbbind(inp, NULL,
+		    V_tcp_bind_all_fibs ? 0 : INPBIND_FIB, td->td_ucred);
 	}
 	INP_HASH_WUNLOCK(&V_tcbinfo);
 	if (error == 0) {
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 466b263854b7..e65e44840bd8 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -1271,6 +1271,7 @@ VNET_DECLARE(uint32_t, tcp_ack_war_time_window);
 VNET_DECLARE(int, tcp_autorcvbuf_max);
 VNET_DECLARE(int, tcp_autosndbuf_inc);
 VNET_DECLARE(int, tcp_autosndbuf_max);
+VNET_DECLARE(int, tcp_bind_all_fibs);
 VNET_DECLARE(int, tcp_delack_enabled);
 VNET_DECLARE(int, tcp_do_autorcvbuf);
 VNET_DECLARE(int, tcp_do_autosndbuf);
@@ -1324,6 +1325,7 @@ VNET_DECLARE(struct inpcbinfo, tcbinfo);
 #define	V_tcp_autorcvbuf_max		VNET(tcp_autorcvbuf_max)
 #define	V_tcp_autosndbuf_inc		VNET(tcp_autosndbuf_inc)
 #define	V_tcp_autosndbuf_max		VNET(tcp_autosndbuf_max)
+#define	V_tcp_bind_all_fibs		VNET(tcp_bind_all_fibs)
 #define	V_tcp_delack_enabled		VNET(tcp_delack_enabled)
 #define	V_tcp_do_autorcvbuf		VNET(tcp_do_autorcvbuf)
 #define	V_tcp_do_autosndbuf		VNET(tcp_do_autosndbuf)



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202502061627.516GRRZb095203>