From owner-svn-src-head@freebsd.org Tue Jun 19 01:54:02 2018 Return-Path: Delivered-To: svn-src-head@mailman.ysv.freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2610:1c1:1:606c::19:1]) by mailman.ysv.freebsd.org (Postfix) with ESMTP id 4537510118E0; Tue, 19 Jun 2018 01:54:02 +0000 (UTC) (envelope-from mmacy@FreeBSD.org) Received: from mxrelay.nyi.freebsd.org (mxrelay.nyi.freebsd.org [IPv6:2610:1c1:1:606c::19:3]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client CN "mxrelay.nyi.freebsd.org", Issuer "Let's Encrypt Authority X3" (verified OK)) by mx1.freebsd.org (Postfix) with ESMTPS id 0ECC781B58; Tue, 19 Jun 2018 01:54:02 +0000 (UTC) (envelope-from mmacy@FreeBSD.org) Received: from repo.freebsd.org (repo.freebsd.org [IPv6:2610:1c1:1:6068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mxrelay.nyi.freebsd.org (Postfix) with ESMTPS id E52D14D8F; Tue, 19 Jun 2018 01:54:01 +0000 (UTC) (envelope-from mmacy@FreeBSD.org) Received: from repo.freebsd.org ([127.0.1.37]) by repo.freebsd.org (8.15.2/8.15.2) with ESMTP id w5J1s14m029648; Tue, 19 Jun 2018 01:54:01 GMT (envelope-from mmacy@FreeBSD.org) Received: (from mmacy@localhost) by repo.freebsd.org (8.15.2/8.15.2/Submit) id w5J1s0Nd029642; Tue, 19 Jun 2018 01:54:00 GMT (envelope-from mmacy@FreeBSD.org) Message-Id: <201806190154.w5J1s0Nd029642@repo.freebsd.org> X-Authentication-Warning: repo.freebsd.org: mmacy set sender to mmacy@FreeBSD.org using -f From: Matt Macy Date: Tue, 19 Jun 2018 01:54:00 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r335356 - in head/sys: kern netinet netinet/tcp_stacks X-SVN-Group: head X-SVN-Commit-Author: mmacy X-SVN-Commit-Paths: in head/sys: kern netinet netinet/tcp_stacks X-SVN-Commit-Revision: 335356 X-SVN-Commit-Repository: base MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-head@freebsd.org X-Mailman-Version: 2.1.26 Precedence: list List-Id: SVN commit messages for the src tree for head/-current List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 19 Jun 2018 01:54:02 -0000 Author: mmacy Date: Tue Jun 19 01:54:00 2018 New Revision: 335356 URL: https://svnweb.freebsd.org/changeset/base/335356 Log: convert inpcbinfo hash and info rwlocks to epoch + mutex - Convert inpcbinfo info & hash locks to epoch for read and mutex for write - Garbage collect code that handled INP_INFO_TRY_RLOCK failures as INP_INFO_RLOCK which can no longer fail When running 64 netperfs sending minimal sized packets on a 2x8x2 reduces unhalted core cycles samples in rwlock rlock/runlock in udp_send from 51% to 3%. Overall packet throughput rate limited by CPU affinity and NIC driver design choices. On the receiver unhalted core cycles samples in in_pcblookup_hash went from 13% to to 1.6% Tested by LLNW and pho@ Reviewed by: jtl Sponsored by: Limelight Networks Differential Revision: https://reviews.freebsd.org/D15686 Modified: head/sys/kern/subr_witness.c head/sys/netinet/in_pcb.h head/sys/netinet/tcp_hpts.c head/sys/netinet/tcp_input.c head/sys/netinet/tcp_stacks/rack.c head/sys/netinet/tcp_timewait.c Modified: head/sys/kern/subr_witness.c ============================================================================== --- head/sys/kern/subr_witness.c Tue Jun 19 01:33:03 2018 (r335355) +++ head/sys/kern/subr_witness.c Tue Jun 19 01:54:00 2018 (r335356) @@ -561,14 +561,14 @@ static struct witness_order_list_entry order_lists[] = /* * UDP/IP */ - { "udp", &lock_class_rw }, + { "udp", &lock_class_mtx_sleep }, { "udpinp", &lock_class_rw }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * TCP/IP */ - { "tcp", &lock_class_rw }, + { "tcp", &lock_class_mtx_sleep }, { "tcpinp", &lock_class_rw }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, Modified: head/sys/netinet/in_pcb.h ============================================================================== --- head/sys/netinet/in_pcb.h Tue Jun 19 01:33:03 2018 (r335355) +++ head/sys/netinet/in_pcb.h Tue Jun 19 01:54:00 2018 (r335356) @@ -51,6 +51,8 @@ #include #include #include +#include +#include #include #endif #include @@ -157,6 +159,7 @@ struct in_conninfo { * Key: * (b) - Protected by the hpts lock. * (c) - Constant after initialization + * (e) - Protected by the net_epoch_prempt epoch * (g) - Protected by the pcbgroup lock * (i) - Protected by the inpcb lock * (p) - Protected by the pcbinfo lock for the inpcb @@ -231,7 +234,7 @@ struct inpcbpolicy; struct m_snd_tag; struct inpcb { /* Cache line #1 (amd64) */ - CK_LIST_ENTRY(inpcb) inp_hash; /* (h/i) hash list */ + CK_LIST_ENTRY(inpcb) inp_hash; /* [w](h/i) [r](e/i) hash list */ CK_LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */ struct rwlock inp_lock; /* Cache line #2 (amd64) */ @@ -324,8 +327,8 @@ struct inpcb { struct route_in6 inp_route6; }; CK_LIST_ENTRY(inpcb) inp_list; /* (p/l) list for all PCBs for proto */ - /* (p[w]) for list iteration */ - /* (p[r]/l) for addition/removal */ + /* (e[r]) for list iteration */ + /* (p[w]/l) for addition/removal */ struct epoch_context inp_epoch_ctx; }; #endif /* _KERNEL */ @@ -436,22 +439,23 @@ struct in_pcblist { * Locking key: * * (c) Constant or nearly constant after initialisation + * (e) - Protected by the net_epoch_prempt epoch * (g) Locked by ipi_lock * (l) Locked by ipi_list_lock - * (h) Read using either ipi_hash_lock or inpcb lock; write requires both + * (h) Read using either net_epoch_preempt or inpcb lock; write requires both ipi_hash_lock and inpcb lock * (p) Protected by one or more pcbgroup locks * (x) Synchronisation properties poorly defined */ struct inpcbinfo { /* - * Global lock protecting full inpcb list traversal + * Global lock protecting inpcb list modification */ - struct rwlock ipi_lock; + struct mtx ipi_lock; /* * Global list of inpcbs on the protocol. */ - struct inpcbhead *ipi_listhead; /* (g/l) */ + struct inpcbhead *ipi_listhead; /* [r](e) [w](g/l) */ u_int ipi_count; /* (l) */ /* @@ -482,9 +486,9 @@ struct inpcbinfo { u_int ipi_hashfields; /* (c) */ /* - * Global lock protecting non-pcbgroup hash lookup tables. + * Global lock protecting modification non-pcbgroup hash lookup tables. */ - struct rwlock ipi_hash_lock; + struct mtx ipi_hash_lock; /* * Global hash of inpcbs, hashed by local and foreign addresses and @@ -626,20 +630,18 @@ int inp_so_options(const struct inpcb *inp); #endif /* _KERNEL */ #define INP_INFO_LOCK_INIT(ipi, d) \ - rw_init_flags(&(ipi)->ipi_lock, (d), RW_RECURSE) -#define INP_INFO_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_lock) -#define INP_INFO_RLOCK(ipi) rw_rlock(&(ipi)->ipi_lock) -#define INP_INFO_WLOCK(ipi) rw_wlock(&(ipi)->ipi_lock) -#define INP_INFO_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_lock) -#define INP_INFO_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_lock) -#define INP_INFO_TRY_UPGRADE(ipi) rw_try_upgrade(&(ipi)->ipi_lock) -#define INP_INFO_WLOCKED(ipi) rw_wowned(&(ipi)->ipi_lock) -#define INP_INFO_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_lock) -#define INP_INFO_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_lock) -#define INP_INFO_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_LOCKED) -#define INP_INFO_RLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_RLOCKED) -#define INP_INFO_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_WLOCKED) -#define INP_INFO_UNLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED) + mtx_init(&(ipi)->ipi_lock, (d), NULL, MTX_DEF| MTX_RECURSE) +#define INP_INFO_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_lock) +#define INP_INFO_RLOCK(ipi) NET_EPOCH_ENTER() +#define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_lock) +#define INP_INFO_TRY_WLOCK(ipi) mtx_trylock(&(ipi)->ipi_lock) +#define INP_INFO_WLOCKED(ipi) mtx_owned(&(ipi)->ipi_lock) +#define INP_INFO_RUNLOCK(ipi) NET_EPOCH_EXIT() +#define INP_INFO_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_lock) +#define INP_INFO_LOCK_ASSERT(ipi) MPASS(in_epoch() || mtx_owned(&(ipi)->ipi_lock)) +#define INP_INFO_RLOCK_ASSERT(ipi) MPASS(in_epoch()) +#define INP_INFO_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_lock, MA_OWNED) +#define INP_INFO_UNLOCK_ASSERT(ipi) MPASS(!in_epoch() && !mtx_owned(&(ipi)->ipi_lock)) #define INP_LIST_LOCK_INIT(ipi, d) \ rw_init_flags(&(ipi)->ipi_list_lock, (d), 0) @@ -660,17 +662,14 @@ int inp_so_options(const struct inpcb *inp); #define INP_LIST_UNLOCK_ASSERT(ipi) \ rw_assert(&(ipi)->ipi_list_lock, RA_UNLOCKED) -#define INP_HASH_LOCK_INIT(ipi, d) \ - rw_init_flags(&(ipi)->ipi_hash_lock, (d), 0) -#define INP_HASH_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_hash_lock) -#define INP_HASH_RLOCK(ipi) rw_rlock(&(ipi)->ipi_hash_lock) -#define INP_HASH_WLOCK(ipi) rw_wlock(&(ipi)->ipi_hash_lock) -#define INP_HASH_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_hash_lock) -#define INP_HASH_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_hash_lock) -#define INP_HASH_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_hash_lock, \ - RA_LOCKED) -#define INP_HASH_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_hash_lock, \ - RA_WLOCKED) +#define INP_HASH_LOCK_INIT(ipi, d) mtx_init(&(ipi)->ipi_hash_lock, (d), NULL, MTX_DEF) +#define INP_HASH_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_hash_lock) +#define INP_HASH_RLOCK(ipi) NET_EPOCH_ENTER() +#define INP_HASH_WLOCK(ipi) mtx_lock(&(ipi)->ipi_hash_lock) +#define INP_HASH_RUNLOCK(ipi) NET_EPOCH_EXIT() +#define INP_HASH_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_hash_lock) +#define INP_HASH_LOCK_ASSERT(ipi) MPASS(in_epoch() || mtx_owned(&(ipi)->ipi_hash_lock)) +#define INP_HASH_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_hash_lock, MA_OWNED); #define INP_GROUP_LOCK_INIT(ipg, d) mtx_init(&(ipg)->ipg_lock, (d), NULL, \ MTX_DEF | MTX_DUPOK) Modified: head/sys/netinet/tcp_hpts.c ============================================================================== --- head/sys/netinet/tcp_hpts.c Tue Jun 19 01:33:03 2018 (r335355) +++ head/sys/netinet/tcp_hpts.c Tue Jun 19 01:54:00 2018 (r335356) @@ -184,9 +184,6 @@ TUNABLE_INT("net.inet.tcp.hpts_logging_sz", &tcp_hpts_ static struct tcp_hptsi tcp_pace; -static int -tcp_hptsi_lock_inpinfo(struct inpcb *inp, - struct tcpcb **tp); static void tcp_wakehpts(struct tcp_hpts_entry *p); static void tcp_wakeinput(struct tcp_hpts_entry *p); static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv); @@ -498,59 +495,6 @@ SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, log, CTLTYPE 0, 0, sysctl_tcp_hpts_log, "A", "tcp hptsi log"); -/* - * Try to get the INP_INFO lock. - * - * This function always succeeds in getting the lock. It will clear - * *tpp and return (1) if something critical changed while the inpcb - * was unlocked. Otherwise, it will leave *tpp unchanged and return (0). - * - * This function relies on the fact that the hpts always holds a - * reference on the inpcb while the segment is on the hptsi wheel and - * in the input queue. - * - */ -static int -tcp_hptsi_lock_inpinfo(struct inpcb *inp, struct tcpcb **tpp) -{ - struct tcp_function_block *tfb; - struct tcpcb *tp; - void *ptr; - - /* Try the easy way. */ - if (INP_INFO_TRY_RLOCK(&V_tcbinfo)) - return (0); - - /* - * OK, let's try the hard way. We'll save the function pointer block - * to make sure that doesn't change while we aren't holding the - * lock. - */ - tp = *tpp; - tfb = tp->t_fb; - ptr = tp->t_fb_ptr; - INP_WUNLOCK(inp); - INP_INFO_RLOCK(&V_tcbinfo); - INP_WLOCK(inp); - /* If the session went away, return an error. */ - if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || - (inp->inp_flags2 & INP_FREED)) { - *tpp = NULL; - return (1); - } - /* - * If the function block or stack-specific data block changed, - * report an error. - */ - tp = intotcpcb(inp); - if ((tp->t_fb != tfb) && (tp->t_fb_ptr != ptr)) { - *tpp = NULL; - return (1); - } - return (0); -} - - static void tcp_wakehpts(struct tcp_hpts_entry *hpts) { @@ -1290,10 +1234,7 @@ out: (m->m_pkthdr.pace_lock == TI_RLOCKED || tp->t_state != TCPS_ESTABLISHED)) { ti_locked = TI_RLOCKED; - if (tcp_hptsi_lock_inpinfo(inp, &tp)) { - CURVNET_RESTORE(); - goto out; - } + INP_INFO_RLOCK(&V_tcbinfo); m = tp->t_in_pkt; } if (in_newts_every_tcb) { @@ -1360,7 +1301,6 @@ out: */ if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || (inp->inp_flags2 & INP_FREED)) { - out_free: while (m) { m_freem(m); m = n; @@ -1376,8 +1316,7 @@ out: if (ti_locked == TI_UNLOCKED && (tp->t_state != TCPS_ESTABLISHED)) { ti_locked = TI_RLOCKED; - if (tcp_hptsi_lock_inpinfo(inp, &tp)) - goto out_free; + INP_INFO_RLOCK(&V_tcbinfo); } } /** end while(m) */ } /** end if ((m != NULL) && (m == tp->t_in_pkt)) */ Modified: head/sys/netinet/tcp_input.c ============================================================================== --- head/sys/netinet/tcp_input.c Tue Jun 19 01:33:03 2018 (r335355) +++ head/sys/netinet/tcp_input.c Tue Jun 19 01:54:00 2018 (r335356) @@ -960,25 +960,10 @@ findpcb: * * XXXRW: It may be time to rethink timewait locking. */ -relocked: if (inp->inp_flags & INP_TIMEWAIT) { if (ti_locked == TI_UNLOCKED) { - if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) { - in_pcbref(inp); - INP_WUNLOCK(inp); - INP_INFO_RLOCK(&V_tcbinfo); - ti_locked = TI_RLOCKED; - INP_WLOCK(inp); - if (in_pcbrele_wlocked(inp)) { - inp = NULL; - goto findpcb; - } else if (inp->inp_flags & INP_DROPPED) { - INP_WUNLOCK(inp); - inp = NULL; - goto findpcb; - } - } else - ti_locked = TI_RLOCKED; + INP_INFO_RLOCK(); + ti_locked = TI_RLOCKED; } INP_INFO_RLOCK_ASSERT(&V_tcbinfo); @@ -1026,23 +1011,8 @@ relocked: (tp->t_state == TCPS_LISTEN && (thflags & TH_SYN) && !IS_FASTOPEN(tp->t_flags)))) { if (ti_locked == TI_UNLOCKED) { - if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) { - in_pcbref(inp); - INP_WUNLOCK(inp); - INP_INFO_RLOCK(&V_tcbinfo); - ti_locked = TI_RLOCKED; - INP_WLOCK(inp); - if (in_pcbrele_wlocked(inp)) { - inp = NULL; - goto findpcb; - } else if (inp->inp_flags & INP_DROPPED) { - INP_WUNLOCK(inp); - inp = NULL; - goto findpcb; - } - goto relocked; - } else - ti_locked = TI_RLOCKED; + INP_INFO_RLOCK(); + ti_locked = TI_RLOCKED; } INP_INFO_RLOCK_ASSERT(&V_tcbinfo); } Modified: head/sys/netinet/tcp_stacks/rack.c ============================================================================== --- head/sys/netinet/tcp_stacks/rack.c Tue Jun 19 01:33:03 2018 (r335355) +++ head/sys/netinet/tcp_stacks/rack.c Tue Jun 19 01:54:00 2018 (r335356) @@ -6837,34 +6837,8 @@ rack_do_segment(struct mbuf *m, struct tcphdr *th, str * Initial input (ACK to SYN-ACK etc)lets go ahead and get * it processed */ - if (ti_locked != TI_RLOCKED && INP_INFO_TRY_RLOCK(&V_tcbinfo)) - ti_locked = TI_RLOCKED; - if (ti_locked != TI_RLOCKED) { - inp = tp->t_inpcb; - tfb = tp->t_fb; - in_pcbref(inp); - INP_WUNLOCK(inp); - INP_INFO_RLOCK(&V_tcbinfo); - ti_locked = TI_RLOCKED; - INP_WLOCK(inp); - if (in_pcbrele_wlocked(inp)) - inp = NULL; - if (inp == NULL || (inp->inp_flags2 & INP_FREED) || - (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED))) { - /* The TCPCB went away. Free the packet. */ - INP_INFO_RUNLOCK(&V_tcbinfo); - if (inp) - INP_WUNLOCK(inp); - m_freem(m); - return; - } - /* If the stack changed, call the correct stack. */ - if (tp->t_fb != tfb) { - tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, - drop_hdrlen, tlen, iptos, ti_locked); - return; - } - } + INP_INFO_RLOCK(); + ti_locked = TI_RLOCKED; tcp_get_usecs(&tv); rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked, 0, &tv); Modified: head/sys/netinet/tcp_timewait.c ============================================================================== --- head/sys/netinet/tcp_timewait.c Tue Jun 19 01:33:03 2018 (r335355) +++ head/sys/netinet/tcp_timewait.c Tue Jun 19 01:54:00 2018 (r335356) @@ -707,54 +707,46 @@ tcp_tw_2msl_scan(int reuse) in_pcbref(inp); TW_RUNLOCK(V_tw_lock); - if (INP_INFO_TRY_RLOCK(&V_tcbinfo)) { - - INP_WLOCK(inp); - tw = intotw(inp); - if (in_pcbrele_wlocked(inp)) { - if (__predict_true(tw == NULL)) { - INP_INFO_RUNLOCK(&V_tcbinfo); - continue; - } else { - /* This should not happen as in TIMEWAIT - * state the inp should not be destroyed - * before its tcptw. If INVARIANTS is - * defined panic. - */ + INP_INFO_RLOCK(&V_tcbinfo); + INP_WLOCK(inp); + tw = intotw(inp); + if (in_pcbrele_wlocked(inp)) { + if (__predict_true(tw == NULL)) { + INP_INFO_RUNLOCK(&V_tcbinfo); + continue; + } else { + /* This should not happen as in TIMEWAIT + * state the inp should not be destroyed + * before its tcptw. If INVARIANTS is + * defined panic. + */ #ifdef INVARIANTS - panic("%s: Panic before an infinite " - "loop: INP_TIMEWAIT && (INP_FREED " - "|| inp last reference) && tw != " - "NULL", __func__); + panic("%s: Panic before an infinite " + "loop: INP_TIMEWAIT && (INP_FREED " + "|| inp last reference) && tw != " + "NULL", __func__); #else - log(LOG_ERR, "%s: Avoid an infinite " - "loop: INP_TIMEWAIT && (INP_FREED " - "|| inp last reference) && tw != " - "NULL", __func__); + log(LOG_ERR, "%s: Avoid an infinite " + "loop: INP_TIMEWAIT && (INP_FREED " + "|| inp last reference) && tw != " + "NULL", __func__); #endif - INP_INFO_RUNLOCK(&V_tcbinfo); - break; - } - } - - if (tw == NULL) { - /* tcp_twclose() has already been called */ - INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); - continue; + break; } + } - tcp_twclose(tw, reuse); + if (tw == NULL) { + /* tcp_twclose() has already been called */ + INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); - if (reuse) - return tw; - } else { - /* INP_INFO lock is busy, continue later. */ - INP_WLOCK(inp); - if (!in_pcbrele_wlocked(inp)) - INP_WUNLOCK(inp); - break; + continue; } + + tcp_twclose(tw, reuse); + INP_INFO_RUNLOCK(&V_tcbinfo); + if (reuse) + return tw; } return NULL;