Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 9 Jun 2025 21:06:52 GMT
From:      Kristof Provost <kp@FreeBSD.org>
To:        src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org
Subject:   git: defc181278cc - main - pf: reorganise fragment reassembly
Message-ID:  <202506092106.559L6qCG089724@gitrepo.freebsd.org>

next in thread | raw e-mail | index | archive | help
The branch main has been updated by kp:

URL: https://cgit.FreeBSD.org/src/commit/?id=defc181278cc8e04369cf439d18f1de184532a34

commit defc181278cc8e04369cf439d18f1de184532a34
Author:     Kristof Provost <kp@FreeBSD.org>
AuthorDate: 2025-05-29 11:27:17 +0000
Commit:     Kristof Provost <kp@FreeBSD.org>
CommitDate: 2025-06-09 19:37:56 +0000

    pf: reorganise fragment reassembly
    
    To avoid packet loss due to reuse of the 16 bit IPv4 fragment id,
    we need suitable data structures.  Organize the pf fragments with
    two red-black trees.  One is holding the address and protocol
    information and the other has only the fragment id.  This will allow
    to drop fragemts for specific connections more aggressively.
    from markus@; OK sashan@
    
    Obtained from:  OpenBSD, bluhm <bluhm@openbsd.org>, 09228e5ff0
    Sponsored by:   Rubicon Communications, LLC ("Netgate")
    Differential Revision:  https://reviews.freebsd.org/D50723
---
 sys/netpfil/pf/pf_norm.c | 169 ++++++++++++++++++++++++++++++++---------------
 1 file changed, 114 insertions(+), 55 deletions(-)

diff --git a/sys/netpfil/pf/pf_norm.c b/sys/netpfil/pf/pf_norm.c
index ed9aed157993..45f4415d084b 100644
--- a/sys/netpfil/pf/pf_norm.c
+++ b/sys/netpfil/pf/pf_norm.c
@@ -76,21 +76,20 @@ struct pf_frent {
 	uint16_t	fe_mff;		/* more fragment flag */
 };
 
-struct pf_fragment_cmp {
-	struct pf_addr	frc_src;
-	struct pf_addr	frc_dst;
-	uint32_t	frc_id;
-	sa_family_t	frc_af;
-	uint8_t		frc_proto;
+RB_HEAD(pf_frag_tree, pf_fragment);
+struct pf_frnode {
+	struct pf_addr		fn_src;		/* ip source address */
+	struct pf_addr		fn_dst;		/* ip destination address */
+	sa_family_t		fn_af;		/* address family */
+	u_int8_t		fn_proto;	/* protocol for fragments in fn_tree */
+	u_int32_t		fn_fragments;	/* number of entries in fn_tree */
+
+	RB_ENTRY(pf_frnode)	fn_entry;
+	struct pf_frag_tree	fn_tree;	/* matching fragments, lookup by id */
 };
 
 struct pf_fragment {
-	struct pf_fragment_cmp	fr_key;
-#define fr_src	fr_key.frc_src
-#define fr_dst	fr_key.frc_dst
-#define fr_id	fr_key.frc_id
-#define fr_af	fr_key.frc_af
-#define fr_proto	fr_key.frc_proto
+	uint32_t	fr_id;	/* fragment id for reassemble */
 
 	/* pointers to queue element */
 	struct pf_frent	*fr_firstoff[PF_FRAG_ENTRY_POINTS];
@@ -102,6 +101,7 @@ struct pf_fragment {
 	TAILQ_HEAD(pf_fragq, pf_frent) fr_queue;
 	uint16_t	fr_maxlen;	/* maximum length of single fragment */
 	u_int16_t	fr_holes;	/* number of holes in the queue */
+	struct pf_frnode *fr_node;	/* ip src/dst/proto/af for fragments */
 };
 
 VNET_DEFINE_STATIC(struct mtx, pf_frag_mtx);
@@ -114,16 +114,25 @@ VNET_DEFINE(uma_zone_t, pf_state_scrub_z);	/* XXX: shared with pfsync */
 
 VNET_DEFINE_STATIC(uma_zone_t, pf_frent_z);
 #define	V_pf_frent_z	VNET(pf_frent_z)
+VNET_DEFINE_STATIC(uma_zone_t, pf_frnode_z);
+#define	V_pf_frnode_z	VNET(pf_frnode_z)
 VNET_DEFINE_STATIC(uma_zone_t, pf_frag_z);
 #define	V_pf_frag_z	VNET(pf_frag_z)
 
 TAILQ_HEAD(pf_fragqueue, pf_fragment);
 TAILQ_HEAD(pf_cachequeue, pf_fragment);
+RB_HEAD(pf_frnode_tree, pf_frnode);
 VNET_DEFINE_STATIC(struct pf_fragqueue,	pf_fragqueue);
 #define	V_pf_fragqueue			VNET(pf_fragqueue)
-RB_HEAD(pf_frag_tree, pf_fragment);
 VNET_DEFINE_STATIC(struct pf_frag_tree,	pf_frag_tree);
 #define	V_pf_frag_tree			VNET(pf_frag_tree)
+static __inline int	pf_frnode_compare(struct pf_frnode *,
+			    struct pf_frnode *);
+VNET_DEFINE_STATIC(struct pf_frnode_tree, pf_frnode_tree);
+#define	V_pf_frnode_tree		VNET(pf_frnode_tree)
+RB_PROTOTYPE(pf_frnode_tree, pf_frnode, fn_entry, pf_frnode_compare);
+RB_GENERATE(pf_frnode_tree, pf_frnode, fn_entry, pf_frnode_compare);
+
 static int		 pf_frag_compare(struct pf_fragment *,
 			    struct pf_fragment *);
 static RB_PROTOTYPE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare);
@@ -134,8 +143,7 @@ static void	pf_free_fragment(struct pf_fragment *);
 
 static struct pf_frent *pf_create_fragment(u_short *);
 static int	pf_frent_holes(struct pf_frent *frent);
-static struct pf_fragment *pf_find_fragment(struct pf_fragment_cmp *key,
-		    struct pf_frag_tree *tree);
+static struct pf_fragment	*pf_find_fragment(struct pf_frnode *, u_int32_t);
 static inline int	pf_frent_index(struct pf_frent *);
 static int	pf_frent_insert(struct pf_fragment *,
 			    struct pf_frent *, struct pf_frent *);
@@ -143,7 +151,7 @@ void			pf_frent_remove(struct pf_fragment *,
 			    struct pf_frent *);
 struct pf_frent		*pf_frent_previous(struct pf_fragment *,
 			    struct pf_frent *);
-static struct pf_fragment *pf_fillup_fragment(struct pf_fragment_cmp *,
+static struct pf_fragment *pf_fillup_fragment(struct pf_frnode *, u_int32_t,
 		    struct pf_frent *, u_short *);
 static struct mbuf *pf_join_fragment(struct pf_fragment *);
 #ifdef INET
@@ -163,14 +171,13 @@ static int	pf_reassemble6(struct mbuf **,
 
 #ifdef INET
 static void
-pf_ip2key(struct ip *ip, struct pf_fragment_cmp *key)
+pf_ip2key(struct ip *ip, struct pf_frnode *key)
 {
 
-	key->frc_src.v4 = ip->ip_src;
-	key->frc_dst.v4 = ip->ip_dst;
-	key->frc_af = AF_INET;
-	key->frc_proto = ip->ip_p;
-	key->frc_id = ip->ip_id;
+	key->fn_src.v4 = ip->ip_src;
+	key->fn_dst.v4 = ip->ip_dst;
+	key->fn_af = AF_INET;
+	key->fn_proto = ip->ip_p;
 }
 #endif	/* INET */
 
@@ -180,6 +187,9 @@ pf_normalize_init(void)
 
 	V_pf_frag_z = uma_zcreate("pf frags", sizeof(struct pf_fragment),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+	V_pf_frnode_z = uma_zcreate("pf fragment node",
+	    sizeof(struct pf_frnode), NULL, NULL, NULL, NULL,
+	    UMA_ALIGN_PTR, 0);
 	V_pf_frent_z = uma_zcreate("pf frag entries", sizeof(struct pf_frent),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	V_pf_state_scrub_z = uma_zcreate("pf state scrubs",
@@ -202,26 +212,36 @@ pf_normalize_cleanup(void)
 
 	uma_zdestroy(V_pf_state_scrub_z);
 	uma_zdestroy(V_pf_frent_z);
+	uma_zdestroy(V_pf_frnode_z);
 	uma_zdestroy(V_pf_frag_z);
 
 	mtx_destroy(&V_pf_frag_mtx);
 }
 
 static int
-pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b)
+pf_frnode_compare(struct pf_frnode *a, struct pf_frnode *b)
 {
 	int	diff;
 
-	if ((diff = a->fr_id - b->fr_id) != 0)
+	if ((diff = a->fn_proto - b->fn_proto) != 0)
 		return (diff);
-	if ((diff = a->fr_proto - b->fr_proto) != 0)
+	if ((diff = a->fn_af - b->fn_af) != 0)
 		return (diff);
-	if ((diff = a->fr_af - b->fr_af) != 0)
+	if ((diff = pf_addr_cmp(&a->fn_src, &b->fn_src, a->fn_af)) != 0)
 		return (diff);
-	if ((diff = pf_addr_cmp(&a->fr_src, &b->fr_src, a->fr_af)) != 0)
+	if ((diff = pf_addr_cmp(&a->fn_dst, &b->fn_dst, a->fn_af)) != 0)
 		return (diff);
-	if ((diff = pf_addr_cmp(&a->fr_dst, &b->fr_dst, a->fr_af)) != 0)
+	return (0);
+}
+
+static __inline int
+pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b)
+{
+	int	diff;
+
+	if ((diff = a->fr_id - b->fr_id) != 0)
 		return (diff);
+
 	return (0);
 }
 
@@ -281,10 +301,20 @@ static void
 pf_free_fragment(struct pf_fragment *frag)
 {
 	struct pf_frent		*frent;
+	struct pf_frnode	*frnode;
 
 	PF_FRAG_ASSERT();
 
-	RB_REMOVE(pf_frag_tree, &V_pf_frag_tree, frag);
+	frnode = frag->fr_node;
+	RB_REMOVE(pf_frag_tree, &frnode->fn_tree, frag);
+	MPASS(frnode->fn_fragments >= 1);
+	frnode->fn_fragments--;
+	if (frnode->fn_fragments == 0) {
+		MPASS(RB_EMPTY(&frnode->fn_tree));
+		RB_REMOVE(pf_frnode_tree, &V_pf_frnode_tree, frnode);
+		uma_zfree(V_pf_frnode_z, frnode);
+	}
+
 	TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next);
 
 	/* Free all fragment entries */
@@ -299,17 +329,23 @@ pf_free_fragment(struct pf_fragment *frag)
 }
 
 static struct pf_fragment *
-pf_find_fragment(struct pf_fragment_cmp *key, struct pf_frag_tree *tree)
+pf_find_fragment(struct pf_frnode *key, uint32_t id)
 {
-	struct pf_fragment	*frag;
+	struct pf_fragment	*frag, idkey;
+	struct pf_frnode	*frnode;
 
 	PF_FRAG_ASSERT();
 
-	frag = RB_FIND(pf_frag_tree, tree, (struct pf_fragment *)key);
-	if (frag != NULL) {
-		TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next);
-		TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next);
-	}
+	frnode = RB_FIND(pf_frnode_tree, &V_pf_frnode_tree, key);
+	if (frnode == NULL)
+		return (NULL);
+	MPASS(frnode->fn_fragments >= 1);
+	idkey.fr_id = id;
+	frag = RB_FIND(pf_frag_tree, &frnode->fn_tree, &idkey);
+	if (frag == NULL)
+		return (NULL);
+	TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next);
+	TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next);
 
 	return (frag);
 }
@@ -527,11 +563,12 @@ pf_frent_previous(struct pf_fragment *frag, struct pf_frent *frent)
 }
 
 static struct pf_fragment *
-pf_fillup_fragment(struct pf_fragment_cmp *key, struct pf_frent *frent,
-    u_short *reason)
+pf_fillup_fragment(struct pf_frnode *key, uint32_t id,
+    struct pf_frent *frent, u_short *reason)
 {
 	struct pf_frent		*after, *next, *prev;
 	struct pf_fragment	*frag;
+	struct pf_frnode	*frnode;
 	uint16_t		total;
 
 	PF_FRAG_ASSERT();
@@ -555,12 +592,12 @@ pf_fillup_fragment(struct pf_fragment_cmp *key, struct pf_frent *frent,
 		goto bad_fragment;
 	}
 
-	DPFPRINTF((key->frc_af == AF_INET ?
+	DPFPRINTF((key->fn_af == AF_INET ?
 	    "reass frag %d @ %d-%d\n" : "reass frag %#08x @ %d-%d\n",
-	    key->frc_id, frent->fe_off, frent->fe_off + frent->fe_len));
+	    id, frent->fe_off, frent->fe_off + frent->fe_len));
 
 	/* Fully buffer all of the fragments in this fragment queue. */
-	frag = pf_find_fragment(key, &V_pf_frag_tree);
+	frag = pf_find_fragment(key, id);
 
 	/* Create a new reassembly queue for this packet. */
 	if (frag == NULL) {
@@ -574,7 +611,22 @@ pf_fillup_fragment(struct pf_fragment_cmp *key, struct pf_frent *frent,
 			}
 		}
 
-		*(struct pf_fragment_cmp *)frag = *key;
+		frnode = RB_FIND(pf_frnode_tree, &V_pf_frnode_tree, key);
+		if (frnode == NULL) {
+			frnode = uma_zalloc(V_pf_frnode_z, M_NOWAIT);
+			if (frnode == NULL) {
+				pf_flush_fragments();
+				frnode = uma_zalloc(V_pf_frnode_z, M_NOWAIT);
+				if (frnode == NULL) {
+					REASON_SET(reason, PFRES_MEMORY);
+					uma_zfree(V_pf_frag_z, frag);
+					goto drop_fragment;
+				}
+			}
+			*frnode = *key;
+			RB_INIT(&frnode->fn_tree);
+			frnode->fn_fragments = 0;
+		}
 		memset(frag->fr_firstoff, 0, sizeof(frag->fr_firstoff));
 		memset(frag->fr_entries, 0, sizeof(frag->fr_entries));
 		frag->fr_timeout = time_uptime;
@@ -582,7 +634,14 @@ pf_fillup_fragment(struct pf_fragment_cmp *key, struct pf_frent *frent,
 		frag->fr_maxlen = frent->fe_len;
 		frag->fr_holes = 1;
 
-		RB_INSERT(pf_frag_tree, &V_pf_frag_tree, frag);
+		frag->fr_id = id;
+		frag->fr_node = frnode;
+		/* RB_INSERT cannot fail as pf_find_fragment() found nothing */
+		RB_INSERT(pf_frag_tree, &frnode->fn_tree, frag);
+		frnode->fn_fragments++;
+		if (frnode->fn_fragments == 1)
+			RB_INSERT(pf_frnode_tree, &V_pf_frnode_tree, frnode);
+
 		TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next);
 
 		/* We do not have a previous fragment, cannot fail. */
@@ -592,6 +651,7 @@ pf_fillup_fragment(struct pf_fragment_cmp *key, struct pf_frent *frent,
 	}
 
 	KASSERT(!TAILQ_EMPTY(&frag->fr_queue), ("!TAILQ_EMPTY()->fr_queue"));
+	MPASS(frag->fr_node);
 
 	/* Remember maximum fragment len for refragmentation. */
 	if (frent->fe_len > frag->fr_maxlen)
@@ -627,7 +687,7 @@ pf_fillup_fragment(struct pf_fragment_cmp *key, struct pf_frent *frent,
 	if (prev != NULL && prev->fe_off + prev->fe_len > frent->fe_off) {
 		uint16_t precut;
 
-		if (frag->fr_af == AF_INET6)
+		if (frag->fr_node->fn_af == AF_INET6)
 			goto free_fragment;
 
 		precut = prev->fe_off + prev->fe_len - frent->fe_off;
@@ -681,7 +741,7 @@ pf_fillup_fragment(struct pf_fragment_cmp *key, struct pf_frent *frent,
 	return (frag);
 
 free_ipv6_fragment:
-	if (frag->fr_af == AF_INET)
+	if (frag->fr_node->fn_af == AF_INET)
 		goto bad_fragment;
 free_fragment:
 	/*
@@ -743,8 +803,8 @@ pf_reassemble(struct mbuf **m0, u_short *reason)
 	struct pf_fragment	*frag;
 	struct m_tag		*mtag;
 	struct pf_fragment_tag	*ftag;
-	struct pf_fragment_cmp	key;
-	uint16_t		total, hdrlen;
+	struct pf_frnode	 key;
+	uint16_t		 total, hdrlen;
 	uint32_t		 frag_id;
 	uint16_t		 maxlen;
 
@@ -761,7 +821,7 @@ pf_reassemble(struct mbuf **m0, u_short *reason)
 
 	pf_ip2key(ip, &key);
 
-	if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL)
+	if ((frag = pf_fillup_fragment(&key, ip->ip_id, frent, reason)) == NULL)
 		return (PF_DROP);
 
 	/* The mbuf is part of the fragment entry, no direct free or access */
@@ -835,7 +895,7 @@ pf_reassemble6(struct mbuf **m0, struct ip6_frag *fraghdr,
 	struct ip6_hdr		*ip6 = mtod(m, struct ip6_hdr *);
 	struct pf_frent		*frent;
 	struct pf_fragment	*frag;
-	struct pf_fragment_cmp	 key;
+	struct pf_frnode	 key;
 	struct m_tag		*mtag;
 	struct pf_fragment_tag	*ftag;
 	int			 off;
@@ -858,14 +918,13 @@ pf_reassemble6(struct mbuf **m0, struct ip6_frag *fraghdr,
 	frent->fe_off = ntohs(fraghdr->ip6f_offlg & IP6F_OFF_MASK);
 	frent->fe_mff = fraghdr->ip6f_offlg & IP6F_MORE_FRAG;
 
-	key.frc_src.v6 = ip6->ip6_src;
-	key.frc_dst.v6 = ip6->ip6_dst;
-	key.frc_af = AF_INET6;
+	key.fn_src.v6 = ip6->ip6_src;
+	key.fn_dst.v6 = ip6->ip6_dst;
+	key.fn_af = AF_INET6;
 	/* Only the first fragment's protocol is relevant. */
-	key.frc_proto = 0;
-	key.frc_id = fraghdr->ip6f_ident;
+	key.fn_proto = 0;
 
-	if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL) {
+	if ((frag = pf_fillup_fragment(&key, fraghdr->ip6f_ident, frent, reason)) == NULL) {
 		PF_FRAG_UNLOCK();
 		return (PF_DROP);
 	}



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202506092106.559L6qCG089724>