Date: Sun, 12 Apr 2026 18:35:32 +0000 From: Gleb Smirnoff <glebius@FreeBSD.org> To: src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org Subject: git: ac5b9628002c - main - inpcb: retire the inpcb global list Message-ID: <69dbe5f4.395af.58134a93@gitrepo.freebsd.org>
index | next in thread | raw e-mail
The branch main has been updated by glebius: URL: https://cgit.FreeBSD.org/src/commit/?id=ac5b9628002c7c97929984eb578918077d564be4 commit ac5b9628002c7c97929984eb578918077d564be4 Author: Gleb Smirnoff <glebius@FreeBSD.org> AuthorDate: 2026-04-12 18:31:09 +0000 Commit: Gleb Smirnoff <glebius@FreeBSD.org> CommitDate: 2026-04-12 18:31:09 +0000 inpcb: retire the inpcb global list The iteration over all pcbs is possible without the global list. The newborn inpcbs are put on a global list of unconnected inpcbs, then after connect(2) or bind(2) they move to respective hash slot list. This adds a bit of complexity to inp_next(), but the storage scheme is actually simplified. One potential problem before this change was that a couple of pcbs fall into the same hash slot and are linked A->B there, but they also sit next to each other in the global list, linked as B->A. This can deadlock of course. The problem was never observed in the wild, but I was able to instrument it with lots of effort: just few pcbs in the system, hash size reduced down to 2 and a lot of repetitive calls into two kinds of iterators. However the main motivation is not the above problem, but make a step towards splitting the big hash lock into per-slot locks. Differential Revision: https://reviews.freebsd.org/D55967 --- sys/netinet/in_pcb.c | 125 ++++++++++++++++++++++++++++++----------------- sys/netinet/in_pcb.h | 49 ++++++++++++------- sys/netinet/in_pcb_var.h | 2 +- sys/netinet/tcp_usrreq.c | 29 ++++++----- sys/netinet6/in6_pcb.c | 4 +- 5 files changed, 134 insertions(+), 75 deletions(-) diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index d60c75ab45b5..cb5b7fa274f6 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -240,8 +240,6 @@ SYSCTL_INT(_net_inet_ip, OID_AUTO, connect_inaddr_wild, "Allow connecting to INADDR_ANY or INADDR_BROADCAST for connect(2)"); #endif -static void in_pcbremhash(struct inpcb *); - /* * in_pcb.c: manage the Protocol Control Blocks. * @@ -563,7 +561,7 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor, #ifdef VIMAGE pcbinfo->ipi_vnet = curvnet; #endif - CK_LIST_INIT(&pcbinfo->ipi_listhead); + CK_LIST_INIT(&pcbinfo->ipi_list_unconn); pcbinfo->ipi_count = 0; ha.size = hash_nelements; @@ -698,7 +696,7 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) INP_HASH_WLOCK(pcbinfo); pcbinfo->ipi_count++; inp->inp_gencnt = ++pcbinfo->ipi_gencnt; - CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list); + CK_LIST_INSERT_HEAD(&pcbinfo->ipi_list_unconn, inp, inp_unconn_list); INP_HASH_WUNLOCK(pcbinfo); so->so_pcb = inp; @@ -1429,7 +1427,9 @@ in_pcbdisconnect(struct inpcb *inp) KASSERT(inp->inp_smr == SMR_SEQ_INVALID, ("%s: inp %p was already disconnected", __func__, inp)); - in_pcbremhash_locked(inp); + in_pcbremhash(inp); + CK_LIST_INSERT_HEAD(&inp->inp_pcbinfo->ipi_list_unconn, inp, + inp_unconn_list); if ((inp->inp_socket->so_proto->pr_flags & PR_CONNREQUIRED) == 0) { /* See the comment in in_pcbinshash(). */ @@ -1552,8 +1552,17 @@ inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock) * * - Iterator can have either write-lock or read-lock semantics, that can not * be changed later. - * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through - * a single hash slot. Note: only rip_input() does the latter. + * - Iterator has three modes of operation, defined by value of .hash member + * on the first call: + * - .hash = INP_ALL_LIST: the iterator will go through the unconnected + * list, then all wildcard hash slots and then all exact hash slots. + * - .hash = INP_UNCONN_LIST: the iterator will go through the list of + * unconnected pcbs only. + * - .hash initialized with an arbitrary positive value: iterator will go + * through this exact hash slot only. + * Note: only rip_input() and sysctl_setsockopt() use the latter. + * The interface may be extended for iteration over single wildcard hash + * slot, but there is no use case for that today. * - Iterator may have optional bool matching function. The matching function * will be executed for each inpcb in the SMR context, so it can not acquire * locks and can safely access only immutable fields of inpcb. @@ -1571,49 +1580,72 @@ inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock) * - Removed entries won't stop traversal as long as they are not added to * a different list. This is violated by in_pcbrehash(). */ -#define II_LIST_FIRST(ipi, hash) \ - (((hash) == INP_ALL_LIST) ? \ - CK_LIST_FIRST(&(ipi)->ipi_listhead) : \ - CK_LIST_FIRST(&(ipi)->ipi_hash_exact[(hash)])) -#define II_LIST_NEXT(inp, hash) \ - (((hash) == INP_ALL_LIST) ? \ - CK_LIST_NEXT((inp), inp_list) : \ - CK_LIST_NEXT((inp), inp_hash_exact)) -#define II_LOCK_ASSERT(inp, lock) \ - rw_assert(&(inp)->inp_lock, \ - (lock) == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED ) +static inline struct inpcb * +ii_list_first(const struct inpcb_iterator *ii) +{ + const struct inpcbinfo *ipi = ii->ipi; + const int hash = ii->hash; + + if (hash < 0) + return (CK_LIST_FIRST(&ipi->ipi_list_unconn)); + else if (hash <= ipi->ipi_hashmask) + return (CK_LIST_FIRST(&ipi->ipi_hash_wild[hash])); + else + return (CK_LIST_FIRST( + &ipi->ipi_hash_exact[hash - ipi->ipi_hashmask - 1])); +} + +static inline struct inpcb * +ii_list_next(const struct inpcb_iterator *ii, struct inpcb *inp) +{ + if (ii->hash < 0) + return (CK_LIST_NEXT(inp, inp_unconn_list)); + else if (ii->hash <= ii->ipi->ipi_hashmask) + return (CK_LIST_NEXT(inp, inp_hash_wild)); + else + return (CK_LIST_NEXT(inp, inp_hash_exact)); +} + struct inpcb * inp_next(struct inpcb_iterator *ii) { const struct inpcbinfo *ipi = ii->ipi; + const int hashmax = (ipi->ipi_hashmask + 1) * 2; inp_match_t *match = ii->match; void *ctx = ii->ctx; inp_lookup_t lock = ii->lock; - int hash = ii->hash; struct inpcb *inp; if (ii->inp == NULL) { /* First call. */ + if ((ii->hash = ii->mode) >= 0) { + /* Targeted iterators support only the exact hash. */ + MPASS(ii->hash <= ipi->ipi_hashmask); + ii->hash += ipi->ipi_hashmask + 1; + } smr_enter(ipi->ipi_smr); - /* This is unrolled CK_LIST_FOREACH(). */ - for (inp = II_LIST_FIRST(ipi, hash); +next_first: + /* This is unrolled CK_LIST_FOREACH() over different headers. */ + for (inp = ii_list_first(ii); inp != NULL; - inp = II_LIST_NEXT(inp, hash)) { + inp = ii_list_next(ii, inp)) { if (match != NULL && (match)(inp, ctx) == false) continue; if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED))) break; else { smr_enter(ipi->ipi_smr); - MPASS(inp != II_LIST_FIRST(ipi, hash)); - inp = II_LIST_FIRST(ipi, hash); + MPASS(inp != ii_list_first(ii)); + inp = ii_list_first(ii); if (inp == NULL) break; } } - if (inp == NULL) + if (inp == NULL) { + if (ii->mode == INP_ALL_LIST && ++ii->hash < hashmax) + goto next_first; smr_exit(ipi->ipi_smr); - else + } else ii->inp = inp; return (inp); @@ -1623,10 +1655,16 @@ inp_next(struct inpcb_iterator *ii) smr_enter(ipi->ipi_smr); restart: inp = ii->inp; - II_LOCK_ASSERT(inp, lock); + rw_assert(&inp->inp_lock, + lock == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED); next: - inp = II_LIST_NEXT(inp, hash); + inp = ii_list_next(ii, inp); if (inp == NULL) { + if (ii->mode == INP_ALL_LIST && ++ii->hash < hashmax) { + inp_unlock(ii->inp, lock); + ii->inp = NULL; + goto next_first; + } smr_exit(ipi->ipi_smr); goto found; } @@ -1799,10 +1837,11 @@ in_pcbfree(struct inpcb *inp) */ INP_HASH_WLOCK(pcbinfo); if (inp->inp_flags & INP_INHASHLIST) - in_pcbremhash_locked(inp); + in_pcbremhash(inp); + else + CK_LIST_REMOVE(inp, inp_unconn_list); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; pcbinfo->ipi_count--; - CK_LIST_REMOVE(inp, inp_list); INP_HASH_WUNLOCK(pcbinfo); #ifdef RATELIMIT @@ -1882,8 +1921,13 @@ in_pcbdrop(struct inpcb *inp) INP_WLOCK_ASSERT(inp); inp->inp_flags |= INP_DROPPED; - if (inp->inp_flags & INP_INHASHLIST) + if (inp->inp_flags & INP_INHASHLIST) { + INP_HASH_WLOCK(inp->inp_pcbinfo); in_pcbremhash(inp); + CK_LIST_INSERT_HEAD(&inp->inp_pcbinfo->ipi_list_unconn, inp, + inp_unconn_list); + INP_HASH_WUNLOCK(inp->inp_pcbinfo); + } } #ifdef INET @@ -2693,6 +2737,8 @@ in_pcbinshash(struct inpcb *inp) inp->inp_smr = SMR_SEQ_INVALID; } + CK_LIST_REMOVE(inp, inp_unconn_list); + if (connected) CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact); else { @@ -2710,7 +2756,7 @@ in_pcbinshash(struct inpcb *inp) } void -in_pcbremhash_locked(struct inpcb *inp) +in_pcbremhash(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); @@ -2737,14 +2783,6 @@ in_pcbremhash_locked(struct inpcb *inp) inp->inp_flags &= ~INP_INHASHLIST; } -static void -in_pcbremhash(struct inpcb *inp) -{ - INP_HASH_WLOCK(inp->inp_pcbinfo); - in_pcbremhash_locked(inp); - INP_HASH_WUNLOCK(inp->inp_pcbinfo); -} - /* * Move PCB to the proper hash bucket when { faddr, fport } have been * changed. NOTE: This does not handle the case of the lport changing (the @@ -2787,15 +2825,12 @@ in_pcbrehash(struct inpcb *inp) * When rehashing, the caller must ensure that either the new or the old * foreign address was unspecified. */ - if (connected) - CK_LIST_REMOVE(inp, inp_hash_wild); - else - CK_LIST_REMOVE(inp, inp_hash_exact); - if (connected) { + CK_LIST_REMOVE(inp, inp_hash_wild); head = &pcbinfo->ipi_hash_exact[hash]; CK_LIST_INSERT_HEAD(head, inp, inp_hash_exact); } else { + CK_LIST_REMOVE(inp, inp_hash_exact); head = &pcbinfo->ipi_hash_wild[hash]; CK_LIST_INSERT_HEAD(head, inp, inp_hash_wild); } diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index e76e3fa8e382..592d951c018f 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -321,10 +321,15 @@ CK_LIST_HEAD(inpcblbgrouphead, inpcblbgroup); * Almost all fields of struct inpcb are static after creation or protected by * a per-inpcb rwlock, inp_lock. * - * A inpcb database is indexed by addresses/ports hash as well as list of - * all pcbs that belong to a certain proto. Database lookups or list traversals - * are be performed inside SMR section. Once desired PCB is found its own - * lock is to be obtained and SMR section exited. + * A inpcb database consists of two hash tables: one for connected pcbs and the + * other for wildcard-bound pcbs. The newborn pcbs reside on the unconnected + * list. Although a pcb can be on either of these three lists, we can't share + * the linkage pointers because unlocked readers might be iterating over them. + * The only thing that can be unionized is the load-balance table and exact + * hash, as a pcb can never participate in both tables through its entire life + * time. Database lookups or list traversals are to be performed inside SMR + * section. Once desired PCB is found its own lock is to be obtained and SMR + * section exited. * * Key: * (c) - Constant after initialization @@ -350,14 +355,13 @@ struct icmp6_filter; struct inpcbpolicy; struct m_snd_tag; struct inpcb { - /* Cache line #1 (amd64) */ union { - CK_LIST_ENTRY(inpcb) inp_hash_exact; /* hash table linkage */ - LIST_ENTRY(inpcb) inp_lbgroup_list; /* lb group list */ + CK_LIST_ENTRY(inpcb) inp_hash_exact; + LIST_ENTRY(inpcb) inp_lbgroup_list; }; - CK_LIST_ENTRY(inpcb) inp_hash_wild; /* hash table linkage */ + CK_LIST_ENTRY(inpcb) inp_hash_wild; + CK_LIST_ENTRY(inpcb) inp_unconn_list; struct rwlock inp_lock; - /* Cache line #2 (amd64) */ #define inp_start_zero inp_refcount #define inp_zero_size (sizeof(struct inpcb) - \ offsetof(struct inpcb, inp_start_zero)) @@ -412,7 +416,6 @@ struct inpcb { struct route inp_route; struct route_in6 inp_route6; }; - CK_LIST_ENTRY(inpcb) inp_list; /* (r:e/w:p) all PCBs for proto */ }; #define inp_vnet inp_pcbinfo->ipi_vnet @@ -431,7 +434,6 @@ struct inpcb { * (h) Locked by ipi_hash_lock */ struct inpcbinfo { - struct inpcbhead ipi_listhead; /* (r:e/w:h) */ u_int ipi_count; /* (h) */ /* @@ -460,6 +462,7 @@ struct inpcbinfo { * address, and "wild" holds the rest. */ struct mtx ipi_hash_lock; + struct inpcbhead ipi_list_unconn; /* (r:e/w:h) */ struct inpcbhead *ipi_hash_exact; /* (r:e/w:h) */ struct inpcbhead *ipi_hash_wild; /* (r:e/w:h) */ u_long ipi_hashmask; /* (c) */ @@ -670,14 +673,26 @@ int sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo, int (*ctloutput_set)(struct inpcb *, struct sockopt *)); #endif +/* + * struct inpcb_iterator is located on the stack of a function that uses + * inp_next(). The caller shall initialize the const members before first + * invocation of inp_next(). After that, until the iterator finishes the + * caller is supposed to only read 'inp' until it reads NULL. Some members + * have constness commented out for convenience of callers, that may reuse + * the iterator after it finishes. + * (c) - caller + * (n) - inp_next() + */ typedef bool inp_match_t(const struct inpcb *, void *); struct inpcb_iterator { const struct inpcbinfo *ipi; - struct inpcb *inp; - inp_match_t *match; - void *ctx; - int hash; + struct inpcb *inp; /* c:r, n:rw */ + /* const */ inp_match_t *match; + /* const */ void *ctx; + int hash; /* n:rw */ + /* const */ int mode; #define INP_ALL_LIST -1 +#define INP_UNCONN_LIST -2 const inp_lookup_t lock; }; @@ -686,7 +701,7 @@ struct inpcb_iterator { { \ .ipi = (_ipi), \ .lock = (_lock), \ - .hash = INP_ALL_LIST, \ + .mode = INP_ALL_LIST, \ .match = (_match), \ .ctx = (_ctx), \ } @@ -694,7 +709,7 @@ struct inpcb_iterator { { \ .ipi = (_ipi), \ .lock = (_lock), \ - .hash = INP_ALL_LIST, \ + .mode = INP_ALL_LIST, \ } struct inpcb *inp_next(struct inpcb_iterator *); diff --git a/sys/netinet/in_pcb_var.h b/sys/netinet/in_pcb_var.h index 7e8a1626ab40..7a5c489f26d7 100644 --- a/sys/netinet/in_pcb_var.h +++ b/sys/netinet/in_pcb_var.h @@ -57,7 +57,7 @@ struct inpcb *in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_short, int, int, struct ucred *); int in_pcbinshash(struct inpcb *); void in_pcbrehash(struct inpcb *); -void in_pcbremhash_locked(struct inpcb *); +void in_pcbremhash(struct inpcb *); /* * Load balance groups used for the SO_REUSEPORT_LB socket option. Each group diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index b0a75127b124..07c436a1f2e0 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -2841,9 +2841,12 @@ db_print_bblog_state(int state) static void db_print_tcpcb(struct tcpcb *tp, const char *name, int indent, bool show_bblog, - bool show_inpcb) + bool show_inpcb, bool only_locked) { + if (only_locked && tp->t_inpcb.inp_lock.rw_lock == RW_UNLOCKED) + return; + db_print_indent(indent); db_printf("%s at %p\n", name, tp); @@ -2987,14 +2990,13 @@ DB_SHOW_COMMAND(tcpcb, db_show_tcpcb) show_bblog = strchr(modif, 'b') != NULL; show_inpcb = strchr(modif, 'i') != NULL; tp = (struct tcpcb *)addr; - db_print_tcpcb(tp, "tcpcb", 0, show_bblog, show_inpcb); + db_print_tcpcb(tp, "tcpcb", 0, show_bblog, show_inpcb, false); } DB_SHOW_ALL_COMMAND(tcpcbs, db_show_all_tcpcbs) { VNET_ITERATOR_DECL(vnet_iter); struct inpcb *inp; - struct tcpcb *tp; bool only_locked, show_bblog, show_inpcb; only_locked = strchr(modif, 'l') != NULL; @@ -3002,18 +3004,23 @@ DB_SHOW_ALL_COMMAND(tcpcbs, db_show_all_tcpcbs) show_inpcb = strchr(modif, 'i') != NULL; VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); - CK_LIST_FOREACH(inp, &V_tcbinfo.ipi_listhead, inp_list) { - if (only_locked && - inp->inp_lock.rw_lock == RW_UNLOCKED) - continue; - tp = intotcpcb(inp); - db_print_tcpcb(tp, "tcpcb", 0, show_bblog, show_inpcb); + for (u_int i = 0; i <= V_tcbinfo.ipi_porthashmask; i++) + CK_LIST_FOREACH(inp, &V_tcbinfo.ipi_porthashbase[i], + inp_portlist) { + db_print_tcpcb(intotcpcb(inp), "tcpcb", 0, + show_bblog, show_inpcb, only_locked); + if (db_pager_quit) + goto break_hash; + } +break_hash: + CK_LIST_FOREACH(inp, &V_tcbinfo.ipi_list_unconn, + inp_unconn_list) { + db_print_tcpcb(intotcpcb(inp), "tcpcb", 0, + show_bblog, show_inpcb, only_locked); if (db_pager_quit) break; } CURVNET_RESTORE(); - if (db_pager_quit) - break; } } #endif diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c index 2d6e860a72ba..8a644f96f72e 100644 --- a/sys/netinet6/in6_pcb.c +++ b/sys/netinet6/in6_pcb.c @@ -508,7 +508,9 @@ in6_pcbdisconnect(struct inpcb *inp) KASSERT(inp->inp_smr == SMR_SEQ_INVALID, ("%s: inp %p was already disconnected", __func__, inp)); - in_pcbremhash_locked(inp); + in_pcbremhash(inp); + CK_LIST_INSERT_HEAD(&inp->inp_pcbinfo->ipi_list_unconn, inp, + inp_unconn_list); if ((inp->inp_socket->so_proto->pr_flags & PR_CONNREQUIRED) == 0) { /* See the comment in in_pcbinshash(). */home | help
Want to link to this message? Use this
URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?69dbe5f4.395af.58134a93>
