Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 7 Oct 2019 16:24:58 +0200
From:      Mateusz Guzik <mjguzik@gmail.com>
To:        Cy Schubert <Cy.Schubert@cschubert.com>
Cc:        src-committers@freebsd.org, svn-src-all@freebsd.org,  svn-src-head@freebsd.org
Subject:   Re: svn commit: r353149 - head/sys/amd64/amd64
Message-ID:  <CAGudoHFbuUgCrBa1JS6S5ynB_obpwBJdyRP66bd-WuhHO8ZP7A@mail.gmail.com>
In-Reply-To: <201910070419.x974JOkQ020574@slippy.cwsent.com>
References:  <201910062213.x96MDZv3085523@repo.freebsd.org> <201910070406.x9746N0U009068@slippy.cwsent.com> <201910070419.x974JOkQ020574@slippy.cwsent.com>

next in thread | previous in thread | raw e-mail | index | archive | help
Can you show:

sysctl vm.phys_segs

and from the crashdump:
p pv_table

On 10/7/19, Cy Schubert <Cy.Schubert@cschubert.com> wrote:
> In message <201910070406.x9746N0U009068@slippy.cwsent.com>, Cy Schubert
> writes:
>> In message <201910062213.x96MDZv3085523@repo.freebsd.org>, Mateusz Guzik
>> writes
>> :
>> > Author: mjg
>> > Date: Sun Oct  6 22:13:35 2019
>> > New Revision: 353149
>> > URL: https://svnweb.freebsd.org/changeset/base/353149
>> >
>> > Log:
>> >   amd64 pmap: implement per-superpage locks
>> >
>> >   The current 256-lock sized array is a problem in the following ways:
>> >   - it's way too small
>> >   - there are 2 locks per cacheline
>> >   - it is not NUMA-aware
>> >
>> >   Solve these issues by introducing per-superpage locks backed by pages
>> >   allocated from respective domains.
>> >
>> >   This significantly reduces contention e.g. during poudriere -j 104.
>> >   See the review for results.
>> >
>> >   Reviewed by:	kib
>> >   Discussed with:	jeff
>> >   Sponsored by:	The FreeBSD Foundation
>> >   Differential Revision:	https://reviews.freebsd.org/D21833
>> >
>> > Modified:
>> >   head/sys/amd64/amd64/pmap.c
>> >
>> > Modified: head/sys/amd64/amd64/pmap.c
>> > ===========================================================================
>> ==
>> > =
>> > --- head/sys/amd64/amd64/pmap.c	Sun Oct  6 20:36:25 2019	(r35314
>> > 8)
>> > +++ head/sys/amd64/amd64/pmap.c	Sun Oct  6 22:13:35 2019	(r35314
>> > 9)
>> > @@ -316,13 +316,25 @@ pmap_pku_mask_bit(pmap_t pmap)
>> >  #define PV_STAT(x)	do { } while (0)
>> >  #endif
>> >
>> > -#define	pa_index(pa)	((pa) >> PDRSHIFT)
>> > +#undef pa_index
>> > +#define	pa_index(pa)	({					\
>> > +	KASSERT((pa) <= vm_phys_segs[vm_phys_nsegs - 1].end,	\
>> > +	    ("address %lx beyond the last segment", (pa)));	\
>> > +	(pa) >> PDRSHIFT;					\
>> > +})
>> > +#if VM_NRESERVLEVEL > 0
>> > +#define	pa_to_pmdp(pa)	(&pv_table[pa_index(pa)])
>> > +#define	pa_to_pvh(pa)	(&(pa_to_pmdp(pa)->pv_page))
>> > +#define	PHYS_TO_PV_LIST_LOCK(pa)	\
>> > +			(&(pa_to_pmdp(pa)->pv_lock))
>> > +#else
>> >  #define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
>> >
>> >  #define	NPV_LIST_LOCKS	MAXCPU
>> >
>> >  #define	PHYS_TO_PV_LIST_LOCK(pa)	\
>> >  			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
>> > +#endif
>> >
>> >  #define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
>> >  	struct rwlock **_lockp = (lockp);		\
>> > @@ -400,14 +412,22 @@ static int pmap_initialized;
>> >
>> >  /*
>> >   * Data for the pv entry allocation mechanism.
>> > - * Updates to pv_invl_gen are protected by the pv_list_locks[]
>> > - * elements, but reads are not.
>> > + * Updates to pv_invl_gen are protected by the pv list lock but reads
>> > are
>> no
>> > t.
>> >   */
>> >  static TAILQ_HEAD(pch, pv_chunk) pv_chunks =
>> > TAILQ_HEAD_INITIALIZER(pv_chu
>> nk
>> > s);
>> >  static struct mtx __exclusive_cache_line pv_chunks_mutex;
>> > +#if VM_NRESERVLEVEL > 0
>> > +struct pmap_large_md_page {
>> > +	struct rwlock   pv_lock;
>> > +	struct md_page  pv_page;
>> > +	u_long pv_invl_gen;
>> > +};
>> > +static struct pmap_large_md_page *pv_table;
>> > +#else
>> >  static struct rwlock __exclusive_cache_line
>> > pv_list_locks[NPV_LIST_LOCKS];
>> >  static u_long pv_invl_gen[NPV_LIST_LOCKS];
>> >  static struct md_page *pv_table;
>> > +#endif
>> >  static struct md_page pv_dummy;
>> >
>> >  /*
>> > @@ -918,12 +938,21 @@ SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait_slow,
>> > CTLFL
>> A
>> >      "Number of slow invalidation waits for lockless DI");
>> >  #endif
>> >
>> > +#if VM_NRESERVLEVEL > 0
>> >  static u_long *
>> >  pmap_delayed_invl_genp(vm_page_t m)
>> >  {
>> >
>> > +	return (&pa_to_pmdp(VM_PAGE_TO_PHYS(m))->pv_invl_gen);
>> > +}
>> > +#else
>> > +static u_long *
>> > +pmap_delayed_invl_genp(vm_page_t m)
>> > +{
>> > +
>> >  	return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]);
>> >  }
>> > +#endif
>> >
>> >  static void
>> >  pmap_delayed_invl_callout_func(void *arg __unused)
>> > @@ -1803,6 +1832,112 @@ pmap_page_init(vm_page_t m)
>> >  	m->md.pat_mode = PAT_WRITE_BACK;
>> >  }
>> >
>> > +#if VM_NRESERVLEVEL > 0
>> > +static void
>> > +pmap_init_pv_table(void)
>> > +{
>> > +	struct pmap_large_md_page *pvd;
>> > +	vm_size_t s;
>> > +	long start, end, highest, pv_npg;
>> > +	int domain, i, j, pages;
>> > +
>> > +	/*
>> > +	 * We strongly depend on the size being a power of two, so the assert
>> > +	 * is overzealous. However, should the struct be resized to a
>> > +	 * different power of two, the code below needs to be revisited.
>> > +	 */
>> > +	CTASSERT((sizeof(*pvd) == 64));
>> > +
>> > +	/*
>> > +	 * Calculate the size of the array.
>> > +	 */
>> > +	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);
>> > +	s = (vm_size_t)pv_npg * sizeof(struct pmap_large_md_page);
>> > +	s = round_page(s);
>> > +	pv_table = (struct pmap_large_md_page *)kva_alloc(s);
>> > +	if (pv_table == NULL)
>> > +		panic("%s: kva_alloc failed\n", __func__);
>> > +
>> > +	/*
>> > +	 * Iterate physical segments to allocate space for respective pages.
>> > +	 */
>> > +	highest = -1;
>> > +	s = 0;
>> > +	for (i = 0; i < vm_phys_nsegs; i++) {
>> > +		start = vm_phys_segs[i].start / NBPDR;
>> > +		end = vm_phys_segs[i].end / NBPDR;
>> > +		domain = vm_phys_segs[i].domain;
>> > +
>> > +		if (highest >= end)
>> > +			continue;
>> > +
>> > +		if (start < highest) {
>> > +			start = highest + 1;
>> > +			pvd = &pv_table[start];
>> > +		} else {
>> > +			/*
>> > +			 * The lowest address may land somewhere in the middle
>> > +			 * of our page. Simplify the code by pretending it is
>> > +			 * at the beginning.
>> > +			 */
>> > +			pvd = pa_to_pmdp(vm_phys_segs[i].start);
>> > +			pvd = (struct pmap_large_md_page *)trunc_page(pvd);
>> > +			start = pvd - pv_table;
>> > +		}
>> > +
>> > +		pages = end - start + 1;
>> > +		s = round_page(pages * sizeof(*pvd));
>> > +		highest = start + (s / sizeof(*pvd)) - 1;
>> > +
>> > +		for (j = 0; j < s; j += PAGE_SIZE) {
>> > +			vm_page_t m = vm_page_alloc_domain(NULL, 0,
>> > +			    domain, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ);
>> > +			if (m == NULL)
>> > +				panic("vm_page_alloc_domain failed for %lx\n",
>> > (vm_offset_t)pvd + j);
>> > +			pmap_qenter((vm_offset_t)pvd + j, &m, 1);
>> > +		}
>> > +
>> > +		for (j = 0; j < s / sizeof(*pvd); j++) {
>> > +			rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW);
>> > +			TAILQ_INIT(&pvd->pv_page.pv_list);
>> > +			pvd->pv_page.pv_gen = 0;
>> > +			pvd->pv_page.pat_mode = 0;
>> > +			pvd->pv_invl_gen = 0;
>> > +			pvd++;
>> > +		}
>> > +	}
>> > +	TAILQ_INIT(&pv_dummy.pv_list);
>> > +}
>> > +#else
>> > +static void
>> > +pmap_init_pv_table(void)
>> > +{
>> > +	vm_size_t s;
>> > +	long i, pv_npg;
>> > +
>> > +	/*
>> > +	 * Initialize the pool of pv list locks.
>> > +	 */
>> > +	for (i = 0; i < NPV_LIST_LOCKS; i++)
>> > +		rw_init(&pv_list_locks[i], "pmap pv list");
>> > +
>> > +	/*
>> > +	 * Calculate the size of the pv head table for superpages.
>> > +	 */
>> > +	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);
>> > +
>> > +	/*
>> > +	 * Allocate memory for the pv head table for superpages.
>> > +	 */
>> > +	s = (vm_size_t)pv_npg * sizeof(struct md_page);
>> > +	s = round_page(s);
>> > +	pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
>> > +	for (i = 0; i < pv_npg; i++)
>> > +		TAILQ_INIT(&pv_table[i].pv_list);
>> > +	TAILQ_INIT(&pv_dummy.pv_list);
>> > +}
>> > +#endif
>> > +
>> >  /*
>> >   *	Initialize the pmap module.
>> >   *	Called by vm_init, to initialize any structures that the pmap
>> > @@ -1813,8 +1948,7 @@ pmap_init(void)
>> >  {
>> >  	struct pmap_preinit_mapping *ppim;
>> >  	vm_page_t m, mpte;
>> > -	vm_size_t s;
>> > -	int error, i, pv_npg, ret, skz63;
>> > +	int error, i, ret, skz63;
>> >
>> >  	/* L1TF, reserve page @0 unconditionally */
>> >  	vm_page_blacklist_add(0, bootverbose);
>> > @@ -1902,26 +2036,7 @@ pmap_init(void)
>> >  	 */
>> >  	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
>> >
>> > -	/*
>> > -	 * Initialize the pool of pv list locks.
>> > -	 */
>> > -	for (i = 0; i < NPV_LIST_LOCKS; i++)
>> > -		rw_init(&pv_list_locks[i], "pmap pv list");
>> > -
>> > -	/*
>> > -	 * Calculate the size of the pv head table for superpages.
>> > -	 */
>> > -	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);
>> > -
>> > -	/*
>> > -	 * Allocate memory for the pv head table for superpages.
>> > -	 */
>> > -	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
>> > -	s = round_page(s);
>> > -	pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
>> > -	for (i = 0; i < pv_npg; i++)
>> > -		TAILQ_INIT(&pv_table[i].pv_list);
>> > -	TAILQ_INIT(&pv_dummy.pv_list);
>> > +	pmap_init_pv_table();
>> >
>> >  	pmap_initialized = 1;
>> >  	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
>> >
>>
>> This causes a page fault during X (xdm) startup, which loads
>> drm-current-kmod.
>>
>> db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame
>> 0xfffffe0093e9c260
>> vpanic() at vpanic+0x19d/frame 0xfffffe0093e9c2b0
>> panic() at panic+0x43/frame 0xfffffe0093e9c310
>> vm_fault() at vm_fault+0x2126/frame 0xfffffe0093e9c460
>> vm_fault_trap() at vm_fault_trap+0x73/frame 0xfffffe0093e9c4b0
>> trap_pfault() at trap_pfault+0x1b6/frame 0xfffffe0093e9c510
>> trap() at trap+0x2a1/frame 0xfffffe0093e9c620
>> calltrap() at calltrap+0x8/frame 0xfffffe0093e9c620
>> --- trap 0xc, rip = 0xffffffff80a054b1, rsp = 0xfffffe0093e9c6f0, rbp =
>> 0xfffffe0093e9c7a0 ---
>> pmap_enter() at pmap_enter+0x861/frame 0xfffffe0093e9c7a0
>> vm_fault() at vm_fault+0x1c69/frame 0xfffffe0093e9c8f0
>> vm_fault_trap() at vm_fault_trap+0x73/frame 0xfffffe0093e9c940
>> trap_pfault() at trap_pfault+0x1b6/frame 0xfffffe0093e9c9a0
>> trap() at trap+0x438/frame 0xfffffe0093e9cab0
>> calltrap() at calltrap+0x8/frame 0xfffffe0093e9cab0
>> --- trap 0xc, rip = 0x30e2a9c3, rsp = 0x7fffffffea50, rbp = 0x7fffffffeaa0
>>
>> ---
>> Uptime: 3m33s
>> Dumping 945 out of 7974
>> MB:..2%..11%..21%..31%..41%..51%..61%..72%..82%..92%
>>
>> (kgdb) bt
>> #0  doadump (textdump=1) at pcpu_aux.h:55
>> #1  0xffffffff8068c5ed in kern_reboot (howto=260)
>>     at /opt/src/svn-current/sys/kern/kern_shutdown.c:479
>> #2  0xffffffff8068caa9 in vpanic (fmt=<value optimized out>,
>>     ap=<value optimized out>)
>>     at /opt/src/svn-current/sys/kern/kern_shutdown.c:908
>> #3  0xffffffff8068c8a3 in panic (fmt=<value optimized out>)
>>     at /opt/src/svn-current/sys/kern/kern_shutdown.c:835
>> #4  0xffffffff8098c966 in vm_fault (map=<value optimized out>,
>>     vaddr=<value optimized out>, fault_type=<value optimized out>,
>>     fault_flags=<value optimized out>, m_hold=<value optimized out>)
>>     at /opt/src/svn-current/sys/vm/vm_fault.c:672
>> #5  0xffffffff8098a723 in vm_fault_trap (map=0xfffff80002001000,
>>     vaddr=<value optimized out>, fault_type=2 '\002',
>>     fault_flags=<value optimized out>, signo=0x0, ucode=0x0)
>>     at /opt/src/svn-current/sys/vm/vm_fault.c:568
>> #6  0xffffffff80a18326 in trap_pfault (frame=0xfffffe0093e9c630,
>>     signo=<value optimized out>, ucode=<value optimized out>)
>>     at /opt/src/svn-current/sys/amd64/amd64/trap.c:828
>> #7  0xffffffff80a177f1 in trap (frame=0xfffffe0093e9c630)
>>     at /opt/src/svn-current/sys/amd64/amd64/trap.c:407
>> #8  0xffffffff809f1aac in calltrap ()
>>     at /opt/src/svn-current/sys/amd64/amd64/exception.S:289
>> ---Type <return> to continue, or q <return> to quit---
>> #9  0xffffffff80a054b1 in pmap_enter (pmap=<value optimized out>,
>>     va=851443712, m=0xfffffe0005b25ce8, prot=<value optimized out>,
>>     flags=2677542912, psind=<value optimized out>) at atomic.h:221
>> #10 0xffffffff8098c4a9 in vm_fault (map=<value optimized out>,
>>     vaddr=<value optimized out>, fault_type=232 '\ufffd',
>>     fault_flags=<value optimized out>, m_hold=0x0)
>>     at /opt/src/svn-current/sys/vm/vm_fault.c:489
>> #11 0xffffffff8098a723 in vm_fault_trap (map=0xfffff80173eb5000,
>>     vaddr=<value optimized out>, fault_type=2 '\002',
>>     fault_flags=<value optimized out>, signo=0xfffffe0093e9ca84,
>>     ucode=0xfffffe0093e9ca80) at
>> /opt/src/svn-current/sys/vm/vm_fault.c:568
>> #12 0xffffffff80a18326 in trap_pfault (frame=0xfffffe0093e9cac0,
>>     signo=<value optimized out>, ucode=<value optimized out>)
>>     at /opt/src/svn-current/sys/amd64/amd64/trap.c:828
>> #13 0xffffffff80a17988 in trap (frame=0xfffffe0093e9cac0)
>>     at /opt/src/svn-current/sys/amd64/amd64/trap.c:347
>> #14 0xffffffff809f1aac in calltrap ()
>>     at /opt/src/svn-current/sys/amd64/amd64/exception.S:289
>> #15 0x0000000030e2a9c3 in ?? ()
>> Previous frame inner to this frame (corrupt stack?)
>> Current language:  auto; currently minimal
>> (kgdb) frame 9
>> #9  0xffffffff80a054b1 in pmap_enter (pmap=<value optimized out>,
>>     va=851443712, m=0xfffffe0005b25ce8, prot=<value optimized out>,
>>     flags=2677542912, psind=<value optimized out>) at atomic.h:221
>> 221	ATOMIC_CMPSET(long);
>> (kgdb) l
>> 216	}
>> 217	
>> 218	ATOMIC_CMPSET(char);
>> 219	ATOMIC_CMPSET(short);
>> 220	ATOMIC_CMPSET(int);
>> 221	ATOMIC_CMPSET(long);
>> 222	
>> 223	/*
>> 224	 * Atomically add the value of v to the integer pointed to by p and
>> return
>> 225	 * the previous value of *p.
>> (kgdb)
>
> I should use kgdb from ports instead of /usr/libexec version. Similar
> result.
>
> <4>WARN_ON(!mutex_is_locked(&fbc->lock))WARN_ON(!mutex_is_locked(&fbc->
> lock))
> panic: vm_fault: fault on nofault entry, addr: 0xfffffe000e01c000
> cpuid = 1
> time = 1570417211
> KDB: stack backtrace:
> db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame
> 0xfffffe0093e9c260
> vpanic() at vpanic+0x19d/frame 0xfffffe0093e9c2b0
> panic() at panic+0x43/frame 0xfffffe0093e9c310
> vm_fault() at vm_fault+0x2126/frame 0xfffffe0093e9c460
> vm_fault_trap() at vm_fault_trap+0x73/frame 0xfffffe0093e9c4b0
> trap_pfault() at trap_pfault+0x1b6/frame 0xfffffe0093e9c510
> trap() at trap+0x2a1/frame 0xfffffe0093e9c620
> calltrap() at calltrap+0x8/frame 0xfffffe0093e9c620
> --- trap 0xc, rip = 0xffffffff80a054b1, rsp = 0xfffffe0093e9c6f0, rbp =
> 0xfffffe0093e9c7a0 ---
> pmap_enter() at pmap_enter+0x861/frame 0xfffffe0093e9c7a0
> vm_fault() at vm_fault+0x1c69/frame 0xfffffe0093e9c8f0
> vm_fault_trap() at vm_fault_trap+0x73/frame 0xfffffe0093e9c940
> trap_pfault() at trap_pfault+0x1b6/frame 0xfffffe0093e9c9a0
> trap() at trap+0x438/frame 0xfffffe0093e9cab0
> calltrap() at calltrap+0x8/frame 0xfffffe0093e9cab0
> --- trap 0xc, rip = 0x30e2a9c3, rsp = 0x7fffffffea50, rbp = 0x7fffffffeaa0
> ---
> Uptime: 3m33s
> Dumping 945 out of 7974
> MB:..2%..11%..21%..31%..41%..51%..61%..72%..82%..92%
>
> __curthread () at /opt/src/svn-current/sys/amd64/include/pcpu_aux.h:55
> 55		__asm("movq %%gs:%P1,%0" : "=r" (td) : "n" (offsetof(struct pcpu,
> (kgdb)
>
> Backtrace stopped: Cannot access memory at address 0x7fffffffea50
> (kgdb) frame 10
> #10 0xffffffff80a054b1 in atomic_fcmpset_long (dst=<optimized out>,
>     src=<optimized out>, expect=<optimized out>)
>     at /opt/src/svn-current/sys/amd64/include/atomic.h:221
> 221	ATOMIC_CMPSET(long);
> (kgdb) l
> 216	}
> 217	
> 218	ATOMIC_CMPSET(char);
> 219	ATOMIC_CMPSET(short);
> 220	ATOMIC_CMPSET(int);
> 221	ATOMIC_CMPSET(long);
> 222	
> 223	/*
> 224	 * Atomically add the value of v to the integer pointed to by p and
> return
> 225	 * the previous value of *p.
> (kgdb)
>
>
>
> --
> Cheers,
> Cy Schubert <Cy.Schubert@cschubert.com>
> FreeBSD UNIX:  <cy@FreeBSD.org>   Web:  http://www.FreeBSD.org
>
> 	The need of the many outweighs the greed of the few.
>
>
>


-- 
Mateusz Guzik <mjguzik gmail.com>



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?CAGudoHFbuUgCrBa1JS6S5ynB_obpwBJdyRP66bd-WuhHO8ZP7A>