Date: Mon, 7 Oct 2019 16:24:58 +0200 From: Mateusz Guzik <mjguzik@gmail.com> To: Cy Schubert <Cy.Schubert@cschubert.com> Cc: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: Re: svn commit: r353149 - head/sys/amd64/amd64 Message-ID: <CAGudoHFbuUgCrBa1JS6S5ynB_obpwBJdyRP66bd-WuhHO8ZP7A@mail.gmail.com> In-Reply-To: <201910070419.x974JOkQ020574@slippy.cwsent.com> References: <201910062213.x96MDZv3085523@repo.freebsd.org> <201910070406.x9746N0U009068@slippy.cwsent.com> <201910070419.x974JOkQ020574@slippy.cwsent.com>
next in thread | previous in thread | raw e-mail | index | archive | help
Can you show: sysctl vm.phys_segs and from the crashdump: p pv_table On 10/7/19, Cy Schubert <Cy.Schubert@cschubert.com> wrote: > In message <201910070406.x9746N0U009068@slippy.cwsent.com>, Cy Schubert > writes: >> In message <201910062213.x96MDZv3085523@repo.freebsd.org>, Mateusz Guzik >> writes >> : >> > Author: mjg >> > Date: Sun Oct 6 22:13:35 2019 >> > New Revision: 353149 >> > URL: https://svnweb.freebsd.org/changeset/base/353149 >> > >> > Log: >> > amd64 pmap: implement per-superpage locks >> > >> > The current 256-lock sized array is a problem in the following ways: >> > - it's way too small >> > - there are 2 locks per cacheline >> > - it is not NUMA-aware >> > >> > Solve these issues by introducing per-superpage locks backed by pages >> > allocated from respective domains. >> > >> > This significantly reduces contention e.g. during poudriere -j 104. >> > See the review for results. >> > >> > Reviewed by: kib >> > Discussed with: jeff >> > Sponsored by: The FreeBSD Foundation >> > Differential Revision: https://reviews.freebsd.org/D21833 >> > >> > Modified: >> > head/sys/amd64/amd64/pmap.c >> > >> > Modified: head/sys/amd64/amd64/pmap.c >> > =========================================================================== >> == >> > = >> > --- head/sys/amd64/amd64/pmap.c Sun Oct 6 20:36:25 2019 (r35314 >> > 8) >> > +++ head/sys/amd64/amd64/pmap.c Sun Oct 6 22:13:35 2019 (r35314 >> > 9) >> > @@ -316,13 +316,25 @@ pmap_pku_mask_bit(pmap_t pmap) >> > #define PV_STAT(x) do { } while (0) >> > #endif >> > >> > -#define pa_index(pa) ((pa) >> PDRSHIFT) >> > +#undef pa_index >> > +#define pa_index(pa) ({ \ >> > + KASSERT((pa) <= vm_phys_segs[vm_phys_nsegs - 1].end, \ >> > + ("address %lx beyond the last segment", (pa))); \ >> > + (pa) >> PDRSHIFT; \ >> > +}) >> > +#if VM_NRESERVLEVEL > 0 >> > +#define pa_to_pmdp(pa) (&pv_table[pa_index(pa)]) >> > +#define pa_to_pvh(pa) (&(pa_to_pmdp(pa)->pv_page)) >> > +#define PHYS_TO_PV_LIST_LOCK(pa) \ >> > + (&(pa_to_pmdp(pa)->pv_lock)) >> > +#else >> > #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) >> > >> > #define NPV_LIST_LOCKS MAXCPU >> > >> > #define PHYS_TO_PV_LIST_LOCK(pa) \ >> > (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) >> > +#endif >> > >> > #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ >> > struct rwlock **_lockp = (lockp); \ >> > @@ -400,14 +412,22 @@ static int pmap_initialized; >> > >> > /* >> > * Data for the pv entry allocation mechanism. >> > - * Updates to pv_invl_gen are protected by the pv_list_locks[] >> > - * elements, but reads are not. >> > + * Updates to pv_invl_gen are protected by the pv list lock but reads >> > are >> no >> > t. >> > */ >> > static TAILQ_HEAD(pch, pv_chunk) pv_chunks = >> > TAILQ_HEAD_INITIALIZER(pv_chu >> nk >> > s); >> > static struct mtx __exclusive_cache_line pv_chunks_mutex; >> > +#if VM_NRESERVLEVEL > 0 >> > +struct pmap_large_md_page { >> > + struct rwlock pv_lock; >> > + struct md_page pv_page; >> > + u_long pv_invl_gen; >> > +}; >> > +static struct pmap_large_md_page *pv_table; >> > +#else >> > static struct rwlock __exclusive_cache_line >> > pv_list_locks[NPV_LIST_LOCKS]; >> > static u_long pv_invl_gen[NPV_LIST_LOCKS]; >> > static struct md_page *pv_table; >> > +#endif >> > static struct md_page pv_dummy; >> > >> > /* >> > @@ -918,12 +938,21 @@ SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait_slow, >> > CTLFL >> A >> > "Number of slow invalidation waits for lockless DI"); >> > #endif >> > >> > +#if VM_NRESERVLEVEL > 0 >> > static u_long * >> > pmap_delayed_invl_genp(vm_page_t m) >> > { >> > >> > + return (&pa_to_pmdp(VM_PAGE_TO_PHYS(m))->pv_invl_gen); >> > +} >> > +#else >> > +static u_long * >> > +pmap_delayed_invl_genp(vm_page_t m) >> > +{ >> > + >> > return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]); >> > } >> > +#endif >> > >> > static void >> > pmap_delayed_invl_callout_func(void *arg __unused) >> > @@ -1803,6 +1832,112 @@ pmap_page_init(vm_page_t m) >> > m->md.pat_mode = PAT_WRITE_BACK; >> > } >> > >> > +#if VM_NRESERVLEVEL > 0 >> > +static void >> > +pmap_init_pv_table(void) >> > +{ >> > + struct pmap_large_md_page *pvd; >> > + vm_size_t s; >> > + long start, end, highest, pv_npg; >> > + int domain, i, j, pages; >> > + >> > + /* >> > + * We strongly depend on the size being a power of two, so the assert >> > + * is overzealous. However, should the struct be resized to a >> > + * different power of two, the code below needs to be revisited. >> > + */ >> > + CTASSERT((sizeof(*pvd) == 64)); >> > + >> > + /* >> > + * Calculate the size of the array. >> > + */ >> > + pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR); >> > + s = (vm_size_t)pv_npg * sizeof(struct pmap_large_md_page); >> > + s = round_page(s); >> > + pv_table = (struct pmap_large_md_page *)kva_alloc(s); >> > + if (pv_table == NULL) >> > + panic("%s: kva_alloc failed\n", __func__); >> > + >> > + /* >> > + * Iterate physical segments to allocate space for respective pages. >> > + */ >> > + highest = -1; >> > + s = 0; >> > + for (i = 0; i < vm_phys_nsegs; i++) { >> > + start = vm_phys_segs[i].start / NBPDR; >> > + end = vm_phys_segs[i].end / NBPDR; >> > + domain = vm_phys_segs[i].domain; >> > + >> > + if (highest >= end) >> > + continue; >> > + >> > + if (start < highest) { >> > + start = highest + 1; >> > + pvd = &pv_table[start]; >> > + } else { >> > + /* >> > + * The lowest address may land somewhere in the middle >> > + * of our page. Simplify the code by pretending it is >> > + * at the beginning. >> > + */ >> > + pvd = pa_to_pmdp(vm_phys_segs[i].start); >> > + pvd = (struct pmap_large_md_page *)trunc_page(pvd); >> > + start = pvd - pv_table; >> > + } >> > + >> > + pages = end - start + 1; >> > + s = round_page(pages * sizeof(*pvd)); >> > + highest = start + (s / sizeof(*pvd)) - 1; >> > + >> > + for (j = 0; j < s; j += PAGE_SIZE) { >> > + vm_page_t m = vm_page_alloc_domain(NULL, 0, >> > + domain, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ); >> > + if (m == NULL) >> > + panic("vm_page_alloc_domain failed for %lx\n", >> > (vm_offset_t)pvd + j); >> > + pmap_qenter((vm_offset_t)pvd + j, &m, 1); >> > + } >> > + >> > + for (j = 0; j < s / sizeof(*pvd); j++) { >> > + rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW); >> > + TAILQ_INIT(&pvd->pv_page.pv_list); >> > + pvd->pv_page.pv_gen = 0; >> > + pvd->pv_page.pat_mode = 0; >> > + pvd->pv_invl_gen = 0; >> > + pvd++; >> > + } >> > + } >> > + TAILQ_INIT(&pv_dummy.pv_list); >> > +} >> > +#else >> > +static void >> > +pmap_init_pv_table(void) >> > +{ >> > + vm_size_t s; >> > + long i, pv_npg; >> > + >> > + /* >> > + * Initialize the pool of pv list locks. >> > + */ >> > + for (i = 0; i < NPV_LIST_LOCKS; i++) >> > + rw_init(&pv_list_locks[i], "pmap pv list"); >> > + >> > + /* >> > + * Calculate the size of the pv head table for superpages. >> > + */ >> > + pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR); >> > + >> > + /* >> > + * Allocate memory for the pv head table for superpages. >> > + */ >> > + s = (vm_size_t)pv_npg * sizeof(struct md_page); >> > + s = round_page(s); >> > + pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); >> > + for (i = 0; i < pv_npg; i++) >> > + TAILQ_INIT(&pv_table[i].pv_list); >> > + TAILQ_INIT(&pv_dummy.pv_list); >> > +} >> > +#endif >> > + >> > /* >> > * Initialize the pmap module. >> > * Called by vm_init, to initialize any structures that the pmap >> > @@ -1813,8 +1948,7 @@ pmap_init(void) >> > { >> > struct pmap_preinit_mapping *ppim; >> > vm_page_t m, mpte; >> > - vm_size_t s; >> > - int error, i, pv_npg, ret, skz63; >> > + int error, i, ret, skz63; >> > >> > /* L1TF, reserve page @0 unconditionally */ >> > vm_page_blacklist_add(0, bootverbose); >> > @@ -1902,26 +2036,7 @@ pmap_init(void) >> > */ >> > mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); >> > >> > - /* >> > - * Initialize the pool of pv list locks. >> > - */ >> > - for (i = 0; i < NPV_LIST_LOCKS; i++) >> > - rw_init(&pv_list_locks[i], "pmap pv list"); >> > - >> > - /* >> > - * Calculate the size of the pv head table for superpages. >> > - */ >> > - pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR); >> > - >> > - /* >> > - * Allocate memory for the pv head table for superpages. >> > - */ >> > - s = (vm_size_t)(pv_npg * sizeof(struct md_page)); >> > - s = round_page(s); >> > - pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); >> > - for (i = 0; i < pv_npg; i++) >> > - TAILQ_INIT(&pv_table[i].pv_list); >> > - TAILQ_INIT(&pv_dummy.pv_list); >> > + pmap_init_pv_table(); >> > >> > pmap_initialized = 1; >> > for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { >> > >> >> This causes a page fault during X (xdm) startup, which loads >> drm-current-kmod. >> >> db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame >> 0xfffffe0093e9c260 >> vpanic() at vpanic+0x19d/frame 0xfffffe0093e9c2b0 >> panic() at panic+0x43/frame 0xfffffe0093e9c310 >> vm_fault() at vm_fault+0x2126/frame 0xfffffe0093e9c460 >> vm_fault_trap() at vm_fault_trap+0x73/frame 0xfffffe0093e9c4b0 >> trap_pfault() at trap_pfault+0x1b6/frame 0xfffffe0093e9c510 >> trap() at trap+0x2a1/frame 0xfffffe0093e9c620 >> calltrap() at calltrap+0x8/frame 0xfffffe0093e9c620 >> --- trap 0xc, rip = 0xffffffff80a054b1, rsp = 0xfffffe0093e9c6f0, rbp = >> 0xfffffe0093e9c7a0 --- >> pmap_enter() at pmap_enter+0x861/frame 0xfffffe0093e9c7a0 >> vm_fault() at vm_fault+0x1c69/frame 0xfffffe0093e9c8f0 >> vm_fault_trap() at vm_fault_trap+0x73/frame 0xfffffe0093e9c940 >> trap_pfault() at trap_pfault+0x1b6/frame 0xfffffe0093e9c9a0 >> trap() at trap+0x438/frame 0xfffffe0093e9cab0 >> calltrap() at calltrap+0x8/frame 0xfffffe0093e9cab0 >> --- trap 0xc, rip = 0x30e2a9c3, rsp = 0x7fffffffea50, rbp = 0x7fffffffeaa0 >> >> --- >> Uptime: 3m33s >> Dumping 945 out of 7974 >> MB:..2%..11%..21%..31%..41%..51%..61%..72%..82%..92% >> >> (kgdb) bt >> #0 doadump (textdump=1) at pcpu_aux.h:55 >> #1 0xffffffff8068c5ed in kern_reboot (howto=260) >> at /opt/src/svn-current/sys/kern/kern_shutdown.c:479 >> #2 0xffffffff8068caa9 in vpanic (fmt=<value optimized out>, >> ap=<value optimized out>) >> at /opt/src/svn-current/sys/kern/kern_shutdown.c:908 >> #3 0xffffffff8068c8a3 in panic (fmt=<value optimized out>) >> at /opt/src/svn-current/sys/kern/kern_shutdown.c:835 >> #4 0xffffffff8098c966 in vm_fault (map=<value optimized out>, >> vaddr=<value optimized out>, fault_type=<value optimized out>, >> fault_flags=<value optimized out>, m_hold=<value optimized out>) >> at /opt/src/svn-current/sys/vm/vm_fault.c:672 >> #5 0xffffffff8098a723 in vm_fault_trap (map=0xfffff80002001000, >> vaddr=<value optimized out>, fault_type=2 '\002', >> fault_flags=<value optimized out>, signo=0x0, ucode=0x0) >> at /opt/src/svn-current/sys/vm/vm_fault.c:568 >> #6 0xffffffff80a18326 in trap_pfault (frame=0xfffffe0093e9c630, >> signo=<value optimized out>, ucode=<value optimized out>) >> at /opt/src/svn-current/sys/amd64/amd64/trap.c:828 >> #7 0xffffffff80a177f1 in trap (frame=0xfffffe0093e9c630) >> at /opt/src/svn-current/sys/amd64/amd64/trap.c:407 >> #8 0xffffffff809f1aac in calltrap () >> at /opt/src/svn-current/sys/amd64/amd64/exception.S:289 >> ---Type <return> to continue, or q <return> to quit--- >> #9 0xffffffff80a054b1 in pmap_enter (pmap=<value optimized out>, >> va=851443712, m=0xfffffe0005b25ce8, prot=<value optimized out>, >> flags=2677542912, psind=<value optimized out>) at atomic.h:221 >> #10 0xffffffff8098c4a9 in vm_fault (map=<value optimized out>, >> vaddr=<value optimized out>, fault_type=232 '\ufffd', >> fault_flags=<value optimized out>, m_hold=0x0) >> at /opt/src/svn-current/sys/vm/vm_fault.c:489 >> #11 0xffffffff8098a723 in vm_fault_trap (map=0xfffff80173eb5000, >> vaddr=<value optimized out>, fault_type=2 '\002', >> fault_flags=<value optimized out>, signo=0xfffffe0093e9ca84, >> ucode=0xfffffe0093e9ca80) at >> /opt/src/svn-current/sys/vm/vm_fault.c:568 >> #12 0xffffffff80a18326 in trap_pfault (frame=0xfffffe0093e9cac0, >> signo=<value optimized out>, ucode=<value optimized out>) >> at /opt/src/svn-current/sys/amd64/amd64/trap.c:828 >> #13 0xffffffff80a17988 in trap (frame=0xfffffe0093e9cac0) >> at /opt/src/svn-current/sys/amd64/amd64/trap.c:347 >> #14 0xffffffff809f1aac in calltrap () >> at /opt/src/svn-current/sys/amd64/amd64/exception.S:289 >> #15 0x0000000030e2a9c3 in ?? () >> Previous frame inner to this frame (corrupt stack?) >> Current language: auto; currently minimal >> (kgdb) frame 9 >> #9 0xffffffff80a054b1 in pmap_enter (pmap=<value optimized out>, >> va=851443712, m=0xfffffe0005b25ce8, prot=<value optimized out>, >> flags=2677542912, psind=<value optimized out>) at atomic.h:221 >> 221 ATOMIC_CMPSET(long); >> (kgdb) l >> 216 } >> 217 >> 218 ATOMIC_CMPSET(char); >> 219 ATOMIC_CMPSET(short); >> 220 ATOMIC_CMPSET(int); >> 221 ATOMIC_CMPSET(long); >> 222 >> 223 /* >> 224 * Atomically add the value of v to the integer pointed to by p and >> return >> 225 * the previous value of *p. >> (kgdb) > > I should use kgdb from ports instead of /usr/libexec version. Similar > result. > > <4>WARN_ON(!mutex_is_locked(&fbc->lock))WARN_ON(!mutex_is_locked(&fbc-> > lock)) > panic: vm_fault: fault on nofault entry, addr: 0xfffffe000e01c000 > cpuid = 1 > time = 1570417211 > KDB: stack backtrace: > db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame > 0xfffffe0093e9c260 > vpanic() at vpanic+0x19d/frame 0xfffffe0093e9c2b0 > panic() at panic+0x43/frame 0xfffffe0093e9c310 > vm_fault() at vm_fault+0x2126/frame 0xfffffe0093e9c460 > vm_fault_trap() at vm_fault_trap+0x73/frame 0xfffffe0093e9c4b0 > trap_pfault() at trap_pfault+0x1b6/frame 0xfffffe0093e9c510 > trap() at trap+0x2a1/frame 0xfffffe0093e9c620 > calltrap() at calltrap+0x8/frame 0xfffffe0093e9c620 > --- trap 0xc, rip = 0xffffffff80a054b1, rsp = 0xfffffe0093e9c6f0, rbp = > 0xfffffe0093e9c7a0 --- > pmap_enter() at pmap_enter+0x861/frame 0xfffffe0093e9c7a0 > vm_fault() at vm_fault+0x1c69/frame 0xfffffe0093e9c8f0 > vm_fault_trap() at vm_fault_trap+0x73/frame 0xfffffe0093e9c940 > trap_pfault() at trap_pfault+0x1b6/frame 0xfffffe0093e9c9a0 > trap() at trap+0x438/frame 0xfffffe0093e9cab0 > calltrap() at calltrap+0x8/frame 0xfffffe0093e9cab0 > --- trap 0xc, rip = 0x30e2a9c3, rsp = 0x7fffffffea50, rbp = 0x7fffffffeaa0 > --- > Uptime: 3m33s > Dumping 945 out of 7974 > MB:..2%..11%..21%..31%..41%..51%..61%..72%..82%..92% > > __curthread () at /opt/src/svn-current/sys/amd64/include/pcpu_aux.h:55 > 55 __asm("movq %%gs:%P1,%0" : "=r" (td) : "n" (offsetof(struct pcpu, > (kgdb) > > Backtrace stopped: Cannot access memory at address 0x7fffffffea50 > (kgdb) frame 10 > #10 0xffffffff80a054b1 in atomic_fcmpset_long (dst=<optimized out>, > src=<optimized out>, expect=<optimized out>) > at /opt/src/svn-current/sys/amd64/include/atomic.h:221 > 221 ATOMIC_CMPSET(long); > (kgdb) l > 216 } > 217 > 218 ATOMIC_CMPSET(char); > 219 ATOMIC_CMPSET(short); > 220 ATOMIC_CMPSET(int); > 221 ATOMIC_CMPSET(long); > 222 > 223 /* > 224 * Atomically add the value of v to the integer pointed to by p and > return > 225 * the previous value of *p. > (kgdb) > > > > -- > Cheers, > Cy Schubert <Cy.Schubert@cschubert.com> > FreeBSD UNIX: <cy@FreeBSD.org> Web: http://www.FreeBSD.org > > The need of the many outweighs the greed of the few. > > > -- Mateusz Guzik <mjguzik gmail.com>
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?CAGudoHFbuUgCrBa1JS6S5ynB_obpwBJdyRP66bd-WuhHO8ZP7A>