Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 26 Jun 2013 10:11:44 -0600
From:      Chris Torek <torek@torek.net>
To:        freebsd-hackers@freebsd.org
Subject:   expanding amd64 past the 1TB limit
Message-ID:  <201306261611.r5QGBie4025519@elf.torek.net>

next in thread | raw e-mail | index | archive | help
(Note: Last week I asked about this on the freebsd-current list.
It turned out slightly harder than I thought, as the 512GB kernel
virtual area is based on what fits into a single L4 page table
entry.)

I was asked to expand the kernel limits for amd64 systems.  While
I do not have a system with enough RAM to test this for real, the
changes below seem to boot and run OK.

I went just a little bit wild in create_pagetables(). :-)  The
lines with the casts got long (and hard to read) so I shortened
them (but I still needed the map I drew of the page tables...).
If using ptoa() like this is OK, probably there should be a few
more of those, e.g., in the changes to pmap_pinit().

Anyway, I wonder if some form of this patch (perhaps even without
the #ifdefs) might be accepted back.  I'm not sure about the KPML4BASE
name, but it clearly needs to be different from KPML4I.  (At first
I was considering moving KERNBASE too but the branch offsets seem
to be the real limiting factor here.)

Possibly dumb question: around the comment "this replaces some of
the KPTphys entries above", would it be possible to reclaim a few
pages by calculating in advance where the 2MB page mappings obviate
the need for the underlying KPTphys pages, and just offset things?

Another note: one could get rid of the "power of 2" requirement
for NDMPML4E.  It arises from the translation between direct
mapped virtual and physical addresses (being |= and &=~), but the
same result can be achieved by adding and subtracting an offset,
which would allow the base and limit to be arbitrary, rather than
a power of two.  (Still, it did not seem worth doing here.)

Chris

diff --git a/amd64/amd64/pmap.c b/amd64/amd64/pmap.c
index 272158d..acf5af2 100644
--- a/amd64/amd64/pmap.c
+++ b/amd64/amd64/pmap.c
@@ -534,6 +534,10 @@ static void
 create_pagetables(vm_paddr_t *firstaddr)
 {
 	int i, j, ndm1g, nkpdpe;
+	pt_entry_t *pt_p;
+	pd_entry_t *pd_p;
+	pdp_entry_t *pdp_p;
+	pml4_entry_t *p4_p;
 
 	/* Allocate page table pages for the direct map */
 	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
@@ -556,6 +560,10 @@ create_pagetables(vm_paddr_t *firstaddr)
 	 * bootstrap.  We defer this until after all memory-size dependent
 	 * allocations are done (e.g. direct map), so that we don't have to
 	 * build in too much slop in our estimate.
+	 *
+	 * Note that when NKPML4E > 1, we have an empty page underneath
+	 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
+	 * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
 	 */
 	nkpt_init(*firstaddr);
 	nkpdpe = NKPDPE(nkpt);
@@ -564,32 +572,26 @@ create_pagetables(vm_paddr_t *firstaddr)
 	KPDphys = allocpages(firstaddr, nkpdpe);
 
 	/* Fill in the underlying page table pages */
-	/* Read-only from zero to physfree */
+	/* Nominally read-only (but really R/W) from zero to physfree */
 	/* XXX not fully used, underneath 2M pages */
-	for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
-		((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
-		((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G;
-	}
+	pt_p = (pt_entry_t *)KPTphys;
+	for (i = 0; ptoa(i) < *firstaddr; i++)
+		pt_p[i] = ptoa(i) | PG_RW | PG_V | PG_G;
 
 	/* Now map the page tables at their location within PTmap */
-	for (i = 0; i < nkpt; i++) {
-		((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
-		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
-	}
+	pd_p = (pd_entry_t *)KPDphys;
+	for (i = 0; i < nkpt; i++)
+		pd_p[i] = (KPTphys + ptoa(i)) | PG_RW | PG_V;
 
 	/* Map from zero to end of allocations under 2M pages */
 	/* This replaces some of the KPTphys entries above */
-	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
-		((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT;
-		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G;
-	}
+	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
+		pd_p[i] = (i << PDRSHIFT) | PG_RW | PG_V | PG_PS | PG_G;
 
-	/* And connect up the PD to the PDP */
-	for (i = 0; i < nkpdpe; i++) {
-		((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys +
-		    (i << PAGE_SHIFT);
-		((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U;
-	}
+	/* And connect up the PD to the PDP (leaving room for L4 pages) */
+	pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
+	for (i = 0; i < nkpdpe; i++)
+		pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | PG_RW | PG_V | PG_U;
 
 	/*
 	 * Now, set up the direct map region using 2MB and/or 1GB pages.  If
@@ -599,37 +601,41 @@ create_pagetables(vm_paddr_t *firstaddr)
 	 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
 	 * that are partially used. 
 	 */
+	pd_p = (pd_entry_t *)DMPDphys;
 	for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
-		((pd_entry_t *)DMPDphys)[j] = (vm_paddr_t)i << PDRSHIFT;
+		pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
 		/* Preset PG_M and PG_A because demotion expects it. */
-		((pd_entry_t *)DMPDphys)[j] |= PG_RW | PG_V | PG_PS | PG_G |
+		pd_p[j] |= PG_RW | PG_V | PG_PS | PG_G |
 		    PG_M | PG_A;
 	}
+	pdp_p = (pdp_entry_t *)DMPDPphys;
 	for (i = 0; i < ndm1g; i++) {
-		((pdp_entry_t *)DMPDPphys)[i] = (vm_paddr_t)i << PDPSHIFT;
+		pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
 		/* Preset PG_M and PG_A because demotion expects it. */
-		((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_PS | PG_G |
+		pdp_p[i] |= PG_RW | PG_V | PG_PS | PG_G |
 		    PG_M | PG_A;
 	}
 	for (j = 0; i < ndmpdp; i++, j++) {
-		((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (j << PAGE_SHIFT);
-		((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
+		pdp_p[i] = DMPDphys + ptoa(j);
+		pdp_p[i] |= PG_RW | PG_V | PG_U;
 	}
 
 	/* And recursively map PML4 to itself in order to get PTmap */
-	((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
-	((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;
+	p4_p = (pml4_entry_t *)KPML4phys;
+	p4_p[PML4PML4I] = KPML4phys;
+	p4_p[PML4PML4I] |= PG_RW | PG_V | PG_U;
 
 	/* Connect the Direct Map slot(s) up to the PML4. */
 	for (i = 0; i < NDMPML4E; i++) {
-		((pdp_entry_t *)KPML4phys)[DMPML4I + i] = DMPDPphys +
-		    (i << PAGE_SHIFT);
-		((pdp_entry_t *)KPML4phys)[DMPML4I + i] |= PG_RW | PG_V | PG_U;
+		p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
+		p4_p[DMPML4I + i] |= PG_RW | PG_V | PG_U;
 	}
 
-	/* Connect the KVA slot up to the PML4 */
-	((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
-	((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
+	/* Connect the KVA slots up to the PML4 */
+	for (i = 0; i < NKPML4E; i++) {
+		p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
+		p4_p[KPML4BASE + i] |= PG_RW | PG_V | PG_U;
+	}
 }
 
 /*
@@ -1688,7 +1694,10 @@ pmap_pinit(pmap_t pmap)
 		pagezero(pmap->pm_pml4);
 
 	/* Wire in kernel global address entries. */
-	pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
+	for (i = 0; i < NKPML4E; i++) {
+		pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + (i << PAGE_SHIFT)) |
+		    PG_RW | PG_V | PG_U;
+	}
 	for (i = 0; i < NDMPML4E; i++) {
 		pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + (i << PAGE_SHIFT)) |
 		    PG_RW | PG_V | PG_U;
@@ -1944,7 +1953,8 @@ pmap_release(pmap_t pmap)
 
 	m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME);
 
-	pmap->pm_pml4[KPML4I] = 0;	/* KVA */
+	for (i = 0; i < NKPML4E; i++)	/* KVA */
+		pmap->pm_pml4[KPML4BASE + i] = 0;
 	for (i = 0; i < NDMPML4E; i++)	/* Direct Map */
 		pmap->pm_pml4[DMPML4I + i] = 0;
 	pmap->pm_pml4[PML4PML4I] = 0;	/* Recursive Mapping */
diff --git a/amd64/include/pmap.h b/amd64/include/pmap.h
index 6d76ec3..58d1c9d 100644
--- a/amd64/include/pmap.h
+++ b/amd64/include/pmap.h
@@ -113,7 +113,17 @@
 	((unsigned long)(l2) << PDRSHIFT) | \
 	((unsigned long)(l1) << PAGE_SHIFT))
 
-#define NKPML4E		1		/* number of kernel PML4 slots */
+/*
+ * Number of kernel PML4 slots.  Can be anywhere from 1 to 64 or so,
+ * but setting it larger than NDMPML4E makes no sense.
+ *
+ * Each slot provides .5 TB of kernel virtual space.
+ */
+#ifdef AMD64_HUGE
+#define NKPML4E		16
+#else
+#define NKPML4E		1
+#endif
 
 #define	NUPML4E		(NPML4EPG/2)	/* number of userland PML4 pages */
 #define	NUPDPE		(NUPML4E*NPDPEPG)/* number of userland PDP pages */
@@ -121,20 +131,39 @@
 
 /*
  * NDMPML4E is the number of PML4 entries that are used to implement the
- * direct map.  It must be a power of two.
+ * direct map.  It must be a power of two, and should generally exceed
+ * NKPML4E.  The maximum possible value is 64; using 128 will make the
+ * direct map intrude into the recursive page table map.
  */
+#ifdef AMD64_HUGE
+#define	NDMPML4E	32
+#else
 #define	NDMPML4E	2
+#endif
 
 /*
- * The *PDI values control the layout of virtual memory.  The starting address
+ * These values control the layout of virtual memory.  The starting address
  * of the direct map, which is controlled by DMPML4I, must be a multiple of
  * its size.  (See the PHYS_TO_DMAP() and DMAP_TO_PHYS() macros.)
+ *
+ * Note: KPML4I is the index of the (single) level 4 page that maps
+ * the KVA that holds KERNBASE, while KPML4BASE is the index of the
+ * first level 4 page that maps VM_MIN_KERNEL_ADDRESS.  If NKPML4E
+ * is 1, these are the same, otherwise KPML4BASE < KPML4I and extra
+ * level 4 PDEs are needed to map from VM_MIN_KERNEL_ADDRESS up to
+ * KERNBASE.  Similarly, if KMPL4I < NKPML4E, extra level 4 PDEs are
+ * needed to map from somewhere-above-KERNBASE to VM_MAX_KERNEL_ADDRESS.
+ *
+ * (KPML4I combines with KPDPI to choose where KERNBASE starts.
+ * Or, in other words, KPML4I provides bits 39..46 of KERNBASE,
+ * and KPDPI provides bits 30..38.)
  */
 #define	PML4PML4I	(NPML4EPG/2)	/* Index of recursive pml4 mapping */
 
-#define	KPML4I		(NPML4EPG-1)	/* Top 512GB for KVM */
-#define	DMPML4I		rounddown(KPML4I - NDMPML4E, NDMPML4E) /* Below KVM */
+#define	KPML4BASE	(NPML4EPG-NKPML4E) /* KVM at highest addresses */
+#define	DMPML4I		rounddown(KPML4BASE-NDMPML4E, NDMPML4E) /* Below KVM */
 
+#define	KPML4I		(NPML4EPG-1)
 #define	KPDPI		(NPDPEPG-2)	/* kernbase at -2GB */
 
 /*
diff --git a/amd64/include/vmparam.h b/amd64/include/vmparam.h
index 33f62bd..47a8ef8 100644
--- a/amd64/include/vmparam.h
+++ b/amd64/include/vmparam.h
@@ -145,18 +145,26 @@
  * 0x0000000000000000 - 0x00007fffffffffff   user map
  * 0x0000800000000000 - 0xffff7fffffffffff   does not exist (hole)
  * 0xffff800000000000 - 0xffff804020100fff   recursive page table (512GB slot)
+#ifdef AMD64_HUGE
+ * 0xffff804020101000 - 0xffffdfffffffffff   unused
+ * 0xffffe00000000000 - 0xffffefffffffffff   16TB direct map
+ * 0xfffff00000000000 - 0xfffff7ffffffffff   unused
+ * 0xfffff80000000000 - 0xffffffffffffffff   8TB kernel map
+#else
  * 0xffff804020101000 - 0xfffffdffffffffff   unused
  * 0xfffffe0000000000 - 0xfffffeffffffffff   1TB direct map
  * 0xffffff0000000000 - 0xffffff7fffffffff   unused
  * 0xffffff8000000000 - 0xffffffffffffffff   512GB kernel map
+#endif
  *
  * Within the kernel map:
  *
  * 0xffffffff80000000                        KERNBASE
  */
 
-#define	VM_MAX_KERNEL_ADDRESS	KVADDR(KPML4I, NPDPEPG-1, NPDEPG-1, NPTEPG-1)
-#define	VM_MIN_KERNEL_ADDRESS	KVADDR(KPML4I, NPDPEPG-512, 0, 0)
+#define	VM_MIN_KERNEL_ADDRESS	KVADDR(KPML4BASE, 0, 0, 0)
+#define	VM_MAX_KERNEL_ADDRESS	KVADDR(KPML4BASE + NKPML4E - 1, \
+					NPDPEPG-1, NPDEPG-1, NPTEPG-1)
 
 #define	DMAP_MIN_ADDRESS	KVADDR(DMPML4I, 0, 0, 0)
 #define	DMAP_MAX_ADDRESS	KVADDR(DMPML4I + NDMPML4E, 0, 0, 0)
diff --git a/conf/options.amd64 b/conf/options.amd64
index 90348b7..f3ce505 100644
--- a/conf/options.amd64
+++ b/conf/options.amd64
@@ -1,6 +1,7 @@
 # $FreeBSD$
 # Options specific to AMD64 platform kernels
 
+AMD64_HUGE		opt_global.h
 AUTO_EOI_1		opt_auto_eoi.h
 AUTO_EOI_2		opt_auto_eoi.h
 COUNT_XINVLTLB_HITS	opt_smp.h



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201306261611.r5QGBie4025519>