Date:      Sun, 6 Oct 2019 22:13:35 +0000 (UTC)
From:      Mateusz Guzik <mjg@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r353149 - head/sys/amd64/amd64
Message-ID:  <201910062213.x96MDZv3085523@repo.freebsd.org>

Author: mjg
Date: Sun Oct  6 22:13:35 2019
New Revision: 353149
URL: https://svnweb.freebsd.org/changeset/base/353149

Log:
  amd64 pmap: implement per-superpage locks
  
  The current fixed array of 256 locks is a problem in the following ways:
  - it is far too small, so many unrelated superpages hash onto the same lock
  - two locks share each cacheline, causing false sharing
  - it is not NUMA-aware
  
  Solve these issues by introducing per-superpage locks backed by pages
  allocated from the respective NUMA domains.
  
  This significantly reduces lock contention, e.g., during poudriere -j 104.
  See the review for results.
  
  Reviewed by:	kib
  Discussed with:	jeff
  Sponsored by:	The FreeBSD Foundation
  Differential Revision:	https://reviews.freebsd.org/D21833
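
For illustration, a minimal sketch of the lookup change (simplified, not the
committed code; see the patch below for the real definitions):

	/* Old scheme: every superpage hashes onto a fixed pool of locks. */
	#define	NPV_LIST_LOCKS	MAXCPU
	static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
	#define	PHYS_TO_PV_LIST_LOCK(pa)	\
		(&pv_list_locks[((pa) >> PDRSHIFT) % NPV_LIST_LOCKS])

	/*
	 * New scheme: one 64-byte entry per 2MB superpage, giving each
	 * superpage a private lock on its own cacheline, with the entry
	 * backed by memory local to the page's NUMA domain.
	 */
	struct pmap_large_md_page {
		struct rwlock	pv_lock;
		struct md_page	pv_page;
		u_long		pv_invl_gen;
	};
	static struct pmap_large_md_page *pv_table;
	#define	PHYS_TO_PV_LIST_LOCK(pa)	\
		(&pv_table[(pa) >> PDRSHIFT].pv_lock)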

Modified:
  head/sys/amd64/amd64/pmap.c

Modified: head/sys/amd64/amd64/pmap.c
==============================================================================
--- head/sys/amd64/amd64/pmap.c	Sun Oct  6 20:36:25 2019	(r353148)
+++ head/sys/amd64/amd64/pmap.c	Sun Oct  6 22:13:35 2019	(r353149)
@@ -316,13 +316,25 @@ pmap_pku_mask_bit(pmap_t pmap)
 #define PV_STAT(x)	do { } while (0)
 #endif
 
-#define	pa_index(pa)	((pa) >> PDRSHIFT)
+#undef pa_index
+#define	pa_index(pa)	({					\
+	KASSERT((pa) <= vm_phys_segs[vm_phys_nsegs - 1].end,	\
+	    ("address %lx beyond the last segment", (pa)));	\
+	(pa) >> PDRSHIFT;					\
+})
+#if VM_NRESERVLEVEL > 0
+#define	pa_to_pmdp(pa)	(&pv_table[pa_index(pa)])
+#define	pa_to_pvh(pa)	(&(pa_to_pmdp(pa)->pv_page))
+#define	PHYS_TO_PV_LIST_LOCK(pa)	\
+			(&(pa_to_pmdp(pa)->pv_lock))
+#else
 #define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
 
 #define	NPV_LIST_LOCKS	MAXCPU
 
 #define	PHYS_TO_PV_LIST_LOCK(pa)	\
 			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
+#endif
 
 #define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
 	struct rwlock **_lockp = (lockp);		\
@@ -400,14 +412,22 @@ static int pmap_initialized;
 
 /*
  * Data for the pv entry allocation mechanism.
- * Updates to pv_invl_gen are protected by the pv_list_locks[]
- * elements, but reads are not.
+ * Updates to pv_invl_gen are protected by the pv list lock but reads are not.
  */
 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
 static struct mtx __exclusive_cache_line pv_chunks_mutex;
+#if VM_NRESERVLEVEL > 0
+struct pmap_large_md_page {
+	struct rwlock   pv_lock;
+	struct md_page  pv_page;
+	u_long pv_invl_gen;
+};
+static struct pmap_large_md_page *pv_table;
+#else
 static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS];
 static u_long pv_invl_gen[NPV_LIST_LOCKS];
 static struct md_page *pv_table;
+#endif
 static struct md_page pv_dummy;
 
 /*
@@ -918,12 +938,21 @@ SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait_slow, CTLFLA
     "Number of slow invalidation waits for lockless DI");
 #endif
 
+#if VM_NRESERVLEVEL > 0
 static u_long *
 pmap_delayed_invl_genp(vm_page_t m)
 {
 
+	return (&pa_to_pmdp(VM_PAGE_TO_PHYS(m))->pv_invl_gen);
+}
+#else
+static u_long *
+pmap_delayed_invl_genp(vm_page_t m)
+{
+
 	return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]);
 }
+#endif
 
 static void
 pmap_delayed_invl_callout_func(void *arg __unused)
@@ -1803,6 +1832,112 @@ pmap_page_init(vm_page_t m)
 	m->md.pat_mode = PAT_WRITE_BACK;
 }
 
+#if VM_NRESERVLEVEL > 0
+static void
+pmap_init_pv_table(void)
+{
+	struct pmap_large_md_page *pvd;
+	vm_size_t s;
+	long start, end, highest, pv_npg;
+	int domain, i, j, pages;
+
+	/*
+	 * We strongly depend on the size being a power of two, so the assert
+	 * is overzealous. However, should the struct be resized to a
+	 * different power of two, the code below needs to be revisited.
+	 */
+	CTASSERT((sizeof(*pvd) == 64));
+
+	/*
+	 * Calculate the size of the array.
+	 */
+	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);
+	s = (vm_size_t)pv_npg * sizeof(struct pmap_large_md_page);
+	s = round_page(s);
+	pv_table = (struct pmap_large_md_page *)kva_alloc(s);
+	if (pv_table == NULL)
+		panic("%s: kva_alloc failed\n", __func__);
+
+	/*
+	 * Iterate physical segments to allocate space for respective pages.
+	 */
+	highest = -1;
+	s = 0;
+	for (i = 0; i < vm_phys_nsegs; i++) {
+		start = vm_phys_segs[i].start / NBPDR;
+		end = vm_phys_segs[i].end / NBPDR;
+		domain = vm_phys_segs[i].domain;
+
+		if (highest >= end)
+			continue;
+
+		if (start < highest) {
+			start = highest + 1;
+			pvd = &pv_table[start];
+		} else {
+			/*
+			 * The lowest address may land somewhere in the middle
+			 * of our page. Simplify the code by pretending it is
+			 * at the beginning.
+			 */
+			pvd = pa_to_pmdp(vm_phys_segs[i].start);
+			pvd = (struct pmap_large_md_page *)trunc_page(pvd);
+			start = pvd - pv_table;
+		}
+
+		pages = end - start + 1;
+		s = round_page(pages * sizeof(*pvd));
+		highest = start + (s / sizeof(*pvd)) - 1;
+
+		for (j = 0; j < s; j += PAGE_SIZE) {
+			vm_page_t m = vm_page_alloc_domain(NULL, 0,
+			    domain, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ);
+			if (m == NULL)
+				panic("vm_page_alloc_domain failed for %lx\n", (vm_offset_t)pvd + j);
+			pmap_qenter((vm_offset_t)pvd + j, &m, 1);
+		}
+
+		for (j = 0; j < s / sizeof(*pvd); j++) {
+			rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW);
+			TAILQ_INIT(&pvd->pv_page.pv_list);
+			pvd->pv_page.pv_gen = 0;
+			pvd->pv_page.pat_mode = 0;
+			pvd->pv_invl_gen = 0;
+			pvd++;
+		}
+	}
+	TAILQ_INIT(&pv_dummy.pv_list);
+}
+#else
+static void
+pmap_init_pv_table(void)
+{
+	vm_size_t s;
+	long i, pv_npg;
+
+	/*
+	 * Initialize the pool of pv list locks.
+	 */
+	for (i = 0; i < NPV_LIST_LOCKS; i++)
+		rw_init(&pv_list_locks[i], "pmap pv list");
+
+	/*
+	 * Calculate the size of the pv head table for superpages.
+	 */
+	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);
+
+	/*
+	 * Allocate memory for the pv head table for superpages.
+	 */
+	s = (vm_size_t)pv_npg * sizeof(struct md_page);
+	s = round_page(s);
+	pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
+	for (i = 0; i < pv_npg; i++)
+		TAILQ_INIT(&pv_table[i].pv_list);
+	TAILQ_INIT(&pv_dummy.pv_list);
+}
+#endif
+
 /*
  *	Initialize the pmap module.
  *	Called by vm_init, to initialize any structures that the pmap
@@ -1813,8 +1948,7 @@ pmap_init(void)
 {
 	struct pmap_preinit_mapping *ppim;
 	vm_page_t m, mpte;
-	vm_size_t s;
-	int error, i, pv_npg, ret, skz63;
+	int error, i, ret, skz63;
 
 	/* L1TF, reserve page @0 unconditionally */
 	vm_page_blacklist_add(0, bootverbose);
@@ -1902,26 +2036,7 @@ pmap_init(void)
 	 */
 	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
 
-	/*
-	 * Initialize the pool of pv list locks.
-	 */
-	for (i = 0; i < NPV_LIST_LOCKS; i++)
-		rw_init(&pv_list_locks[i], "pmap pv list");
-
-	/*
-	 * Calculate the size of the pv head table for superpages.
-	 */
-	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);
-
-	/*
-	 * Allocate memory for the pv head table for superpages.
-	 */
-	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
-	s = round_page(s);
-	pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
-	for (i = 0; i < pv_npg; i++)
-		TAILQ_INIT(&pv_table[i].pv_list);
-	TAILQ_INIT(&pv_dummy.pv_list);
+	pmap_init_pv_table();
 
 	pmap_initialized = 1;
 	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
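
For a rough sense of scale (back-of-the-envelope arithmetic, not figures from
the commit; the 256GB machine is only an example), a standalone sketch of the
table sizing implied by the 64-byte entry asserted above:

	#include <stdio.h>

	int
	main(void)
	{
		unsigned long nbpdr = 2UL * 1024 * 1024;	/* NBPDR: 2MB superpage */
		unsigned long ram = 256UL * 1024 * 1024 * 1024;	/* example machine */
		unsigned long npg = ram / nbpdr;		/* number of superpages */
		unsigned long bytes = npg * 64;			/* pv_table footprint */

		/* Prints: 131072 superpages, 8388608 bytes of pv_table. */
		printf("%lu superpages, %lu bytes of pv_table\n", npg, bytes);
		return (0);
	}

All of those superpages previously shared just NPV_LIST_LOCKS (MAXCPU) rwlocks;
now each gets its own, and pmap_init_pv_table() backs each slice of the table
with pages taken from the owning segment's domain via vm_page_alloc_domain()
and pmap_qenter().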


