Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 11 Jul 2013 00:41:38 -0700
From:      Neel Natu <neelnatu@gmail.com>
To:        Chris Torek <torek@torek.net>
Cc:        Konstantin Belousov <kostikbel@gmail.com>, freebsd-hackers@freebsd.org, kib@freebsd.org, alc@freebsd.org
Subject:   Re: expanding amd64 past the 1TB limit
Message-ID:  <CAFgRE9HycyTL-Mch9KF08Df6qFzzCTb-aU0AkKQhi7F5gtGevA@mail.gmail.com>
In-Reply-To: <201307080642.r686gbDQ089570@elf.torek.net>
References:  <201306282033.r5SKXtYK053022@elf.torek.net> <201307080642.r686gbDQ089570@elf.torek.net>

next in thread | previous in thread | raw e-mail | index | archive | help
Hi Chris,

On Sun, Jul 7, 2013 at 11:42 PM, Chris Torek <torek@torek.net> wrote:
> Here is a final (I hope) version of the patch.  I dropped the
> config option, but I added code to limit the "real" size of the
> direct map PDEs.  The end result is that on small systems, this
> ties up 14 more pages (15 from increasing NKPML4E, but one
> regained because the new static variable ndmpdpphys is 1 instead
> of 2).
>

The patch looks good. I have a couple of comments inline:

> (I fixed the comment errors I spotted earlier, too.)
>
> Chris
>
>  amd64/amd64/pmap.c      | 100 +++++++++++++++++++++++++++++-------------------
>  amd64/include/pmap.h    |  36 +++++++++++++----
>  amd64/include/vmparam.h |  13 ++++---
>  3 files changed, 97 insertions(+), 52 deletions(-)
>
> Author: Chris Torek <chris.torek@gmail.com>
> Date:   Thu Jun 27 18:49:29 2013 -0600
>
>     increase physical and virtual memory limits
>
>     Increase kernel VM space: go from .5 TB of KVA and 1 TB of direct
>     map, to 8 TB of KVA and 16 TB of direct map.  However, we allocate
>     less direct map space for small physical-memory systems.  Also, if
>     Maxmem is so large that there is not enough direct map space,
>     reduce Maxmem to fit, so that the system can boot unassisted.
>
> diff --git a/amd64/amd64/pmap.c b/amd64/amd64/pmap.c
> index 8dcf232..7368c96 100644
> --- a/amd64/amd64/pmap.c
> +++ b/amd64/amd64/pmap.c
> @@ -232,6 +232,7 @@ u_int64_t           KPML4phys;      /* phys addr of kernel level 4 */
>
>  static u_int64_t       DMPDphys;       /* phys addr of direct mapped level 2 */
>  static u_int64_t       DMPDPphys;      /* phys addr of direct mapped level 3 */
> +static int             ndmpdpphys;     /* number of DMPDPphys pages */
>
>  static struct rwlock_padalign pvh_global_lock;
>
> @@ -531,12 +532,27 @@ static void
>  create_pagetables(vm_paddr_t *firstaddr)
>  {
>         int i, j, ndm1g, nkpdpe;
> +       pt_entry_t *pt_p;
> +       pd_entry_t *pd_p;
> +       pdp_entry_t *pdp_p;
> +       pml4_entry_t *p4_p;

The changes associated with pt_p, pd_p and p4_p are cosmetic and IMHO
detract from the meat of the change.

My preference would be for the cosmetic changes to be committed
separately from the changes that rearrange the KVA.

>
>         /* Allocate page table pages for the direct map */
>         ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
>         if (ndmpdp < 4)         /* Minimum 4GB of dirmap */
>                 ndmpdp = 4;
> -       DMPDPphys = allocpages(firstaddr, NDMPML4E);
> +       ndmpdpphys = howmany(ndmpdp, NPML4EPG);

NPDPEPG should be used here instead of NPML4EPG even though they are
numerically identical.

> +       if (ndmpdpphys > NDMPML4E) {
> +               /*
> +                * Each NDMPML4E allows 512 GB, so limit to that,
> +                * and then readjust ndmpdp and ndmpdpphys.
> +                */
> +               printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
> +               Maxmem = atop(NDMPML4E * NBPML4);
> +               ndmpdpphys = NDMPML4E;
> +               ndmpdp = NDMPML4E * NPDEPG;
> +       }
> +       DMPDPphys = allocpages(firstaddr, ndmpdpphys);
>         ndm1g = 0;
>         if ((amd_feature & AMDID_PAGE1GB) != 0)
>                 ndm1g = ptoa(Maxmem) >> PDPSHIFT;
> @@ -553,6 +569,10 @@ create_pagetables(vm_paddr_t *firstaddr)
>          * bootstrap.  We defer this until after all memory-size dependent
>          * allocations are done (e.g. direct map), so that we don't have to
>          * build in too much slop in our estimate.
> +        *
> +        * Note that when NKPML4E > 1, we have an empty page underneath
> +        * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
> +        * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
>          */
>         nkpt_init(*firstaddr);
>         nkpdpe = NKPDPE(nkpt);
> @@ -561,32 +581,26 @@ create_pagetables(vm_paddr_t *firstaddr)
>         KPDphys = allocpages(firstaddr, nkpdpe);
>
>         /* Fill in the underlying page table pages */
> -       /* Read-only from zero to physfree */
> +       /* Nominally read-only (but really R/W) from zero to physfree */
>         /* XXX not fully used, underneath 2M pages */
> -       for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
> -               ((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
> -               ((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G;
> -       }
> +       pt_p = (pt_entry_t *)KPTphys;
> +       for (i = 0; ptoa(i) < *firstaddr; i++)
> +               pt_p[i] = ptoa(i) | PG_RW | PG_V | PG_G;
>
>         /* Now map the page tables at their location within PTmap */
> -       for (i = 0; i < nkpt; i++) {
> -               ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
> -               ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
> -       }
> +       pd_p = (pd_entry_t *)KPDphys;
> +       for (i = 0; i < nkpt; i++)
> +               pd_p[i] = (KPTphys + ptoa(i)) | PG_RW | PG_V;
>
>         /* Map from zero to end of allocations under 2M pages */
>         /* This replaces some of the KPTphys entries above */
> -       for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
> -               ((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT;
> -               ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G;
> -       }
> +       for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
> +               pd_p[i] = (i << PDRSHIFT) | PG_RW | PG_V | PG_PS | PG_G;
>
> -       /* And connect up the PD to the PDP */
> -       for (i = 0; i < nkpdpe; i++) {
> -               ((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys +
> -                   (i << PAGE_SHIFT);
> -               ((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U;
> -       }
> +       /* And connect up the PD to the PDP (leaving room for L4 pages) */
> +       pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
> +       for (i = 0; i < nkpdpe; i++)
> +               pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | PG_RW | PG_V | PG_U;
>
>         /*
>          * Now, set up the direct map region using 2MB and/or 1GB pages.  If
> @@ -596,37 +610,41 @@ create_pagetables(vm_paddr_t *firstaddr)
>          * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
>          * that are partially used.
>          */
> +       pd_p = (pd_entry_t *)DMPDphys;
>         for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
> -               ((pd_entry_t *)DMPDphys)[j] = (vm_paddr_t)i << PDRSHIFT;
> +               pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
>                 /* Preset PG_M and PG_A because demotion expects it. */
> -               ((pd_entry_t *)DMPDphys)[j] |= PG_RW | PG_V | PG_PS | PG_G |
> +               pd_p[j] |= PG_RW | PG_V | PG_PS | PG_G |
>                     PG_M | PG_A;
>         }
> +       pdp_p = (pdp_entry_t *)DMPDPphys;
>         for (i = 0; i < ndm1g; i++) {
> -               ((pdp_entry_t *)DMPDPphys)[i] = (vm_paddr_t)i << PDPSHIFT;
> +               pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
>                 /* Preset PG_M and PG_A because demotion expects it. */
> -               ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_PS | PG_G |
> +               pdp_p[i] |= PG_RW | PG_V | PG_PS | PG_G |
>                     PG_M | PG_A;
>         }
>         for (j = 0; i < ndmpdp; i++, j++) {
> -               ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (j << PAGE_SHIFT);
> -               ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
> +               pdp_p[i] = DMPDphys + ptoa(j);
> +               pdp_p[i] |= PG_RW | PG_V | PG_U;
>         }
>
>         /* And recursively map PML4 to itself in order to get PTmap */
> -       ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
> -       ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;
> +       p4_p = (pml4_entry_t *)KPML4phys;
> +       p4_p[PML4PML4I] = KPML4phys;
> +       p4_p[PML4PML4I] |= PG_RW | PG_V | PG_U;
>
>         /* Connect the Direct Map slot(s) up to the PML4. */
> -       for (i = 0; i < NDMPML4E; i++) {
> -               ((pdp_entry_t *)KPML4phys)[DMPML4I + i] = DMPDPphys +
> -                   (i << PAGE_SHIFT);
> -               ((pdp_entry_t *)KPML4phys)[DMPML4I + i] |= PG_RW | PG_V | PG_U;
> +       for (i = 0; i < ndmpdpphys; i++) {
> +               p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
> +               p4_p[DMPML4I + i] |= PG_RW | PG_V | PG_U;
>         }
>
> -       /* Connect the KVA slot up to the PML4 */
> -       ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
> -       ((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
> +       /* Connect the KVA slots up to the PML4 */
> +       for (i = 0; i < NKPML4E; i++) {
> +               p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
> +               p4_p[KPML4BASE + i] |= PG_RW | PG_V | PG_U;
> +       }
>  }
>
>  /*
> @@ -1685,8 +1703,11 @@ pmap_pinit(pmap_t pmap)
>                 pagezero(pmap->pm_pml4);
>
>         /* Wire in kernel global address entries. */
> -       pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
> -       for (i = 0; i < NDMPML4E; i++) {
> +       for (i = 0; i < NKPML4E; i++) {
> +               pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + (i << PAGE_SHIFT)) |
> +                   PG_RW | PG_V | PG_U;
> +       }
> +       for (i = 0; i < ndmpdpphys; i++) {
>                 pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + (i << PAGE_SHIFT)) |
>                     PG_RW | PG_V | PG_U;
>         }
> @@ -1941,8 +1962,9 @@ pmap_release(pmap_t pmap)
>
>         m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME);
>
> -       pmap->pm_pml4[KPML4I] = 0;      /* KVA */
> -       for (i = 0; i < NDMPML4E; i++)  /* Direct Map */
> +       for (i = 0; i < NKPML4E; i++)   /* KVA */
> +               pmap->pm_pml4[KPML4BASE + i] = 0;
> +       for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
>                 pmap->pm_pml4[DMPML4I + i] = 0;
>         pmap->pm_pml4[PML4PML4I] = 0;   /* Recursive Mapping */
>
> diff --git a/amd64/include/pmap.h b/amd64/include/pmap.h
> index dc02e49..da80241 100644
> --- a/amd64/include/pmap.h
> +++ b/amd64/include/pmap.h
> @@ -113,28 +113,50 @@
>         ((unsigned long)(l2) << PDRSHIFT) | \
>         ((unsigned long)(l1) << PAGE_SHIFT))
>
> -#define NKPML4E                1               /* number of kernel PML4 slots */
> +/*
> + * Number of kernel PML4 slots.  Can be anywhere from 1 to 64 or so,
> + * but setting it larger than NDMPML4E makes no sense.
> + *
> + * Each slot provides .5 TB of kernel virtual space.
> + */
> +#define NKPML4E                16
>
>  #define        NUPML4E         (NPML4EPG/2)    /* number of userland PML4 pages */
>  #define        NUPDPE          (NUPML4E*NPDPEPG)/* number of userland PDP pages */
>  #define        NUPDE           (NUPDPE*NPDEPG) /* number of userland PD entries */
>
>  /*
> - * NDMPML4E is the number of PML4 entries that are used to implement the
> - * direct map.  It must be a power of two.
> + * NDMPML4E is the maximum number of PML4 entries that will be
> + * used to implement the direct map.  It must be a power of two,
> + * and should generally exceed NKPML4E.  The maximum possible
> + * value is 64; using 128 will make the direct map intrude into
> + * the recursive page table map.
>   */
> -#define        NDMPML4E        2
> +#define        NDMPML4E        32
>
>  /*
> - * The *PDI values control the layout of virtual memory.  The starting address
> + * These values control the layout of virtual memory.  The starting address
>   * of the direct map, which is controlled by DMPML4I, must be a multiple of
>   * its size.  (See the PHYS_TO_DMAP() and DMAP_TO_PHYS() macros.)
> + *
> + * Note: KPML4I is the index of the (single) level 4 page that maps
> + * the KVA that holds KERNBASE, while KPML4BASE is the index of the
> + * first level 4 page that maps VM_MIN_KERNEL_ADDRESS.  If NKPML4E
> + * is 1, these are the same, otherwise KPML4BASE < KPML4I and extra
> + * level 4 PDEs are needed to map from VM_MIN_KERNEL_ADDRESS up to
> + * KERNBASE.  Similarly, if KMPL4I < (base+N), extra level 4 PDEs are

level 2 PDE's, right?

> + * needed to map from somewhere-above-KERNBASE to VM_MAX_KERNEL_ADDRESS.
> + *
> + * (KPML4I combines with KPDPI to choose where KERNBASE starts.
> + * Or, in other words, KPML4I provides bits 39..46 of KERNBASE,
> + * and KPDPI provides bits 30..38.)
>   */
>  #define        PML4PML4I       (NPML4EPG/2)    /* Index of recursive pml4 mapping */
>
> -#define        KPML4I          (NPML4EPG-1)    /* Top 512GB for KVM */
> -#define        DMPML4I         rounddown(KPML4I - NDMPML4E, NDMPML4E) /* Below KVM */
> +#define        KPML4BASE       (NPML4EPG-NKPML4E) /* KVM at highest addresses */
> +#define        DMPML4I         rounddown(KPML4BASE-NDMPML4E, NDMPML4E) /* Below KVM */
>
> +#define        KPML4I          (NPML4EPG-1)
>  #define        KPDPI           (NPDPEPG-2)     /* kernbase at -2GB */
>
>  /*
> diff --git a/amd64/include/vmparam.h b/amd64/include/vmparam.h
> index 33f62bd..cff2558 100644
> --- a/amd64/include/vmparam.h
> +++ b/amd64/include/vmparam.h
> @@ -145,18 +145,19 @@
>   * 0x0000000000000000 - 0x00007fffffffffff   user map
>   * 0x0000800000000000 - 0xffff7fffffffffff   does not exist (hole)
>   * 0xffff800000000000 - 0xffff804020100fff   recursive page table (512GB slot)
> - * 0xffff804020101000 - 0xfffffdffffffffff   unused
> - * 0xfffffe0000000000 - 0xfffffeffffffffff   1TB direct map
> - * 0xffffff0000000000 - 0xffffff7fffffffff   unused
> - * 0xffffff8000000000 - 0xffffffffffffffff   512GB kernel map
> + * 0xffff804020101000 - 0xffffdfffffffffff   unused
> + * 0xffffe00000000000 - 0xffffefffffffffff   16TB direct map
> + * 0xfffff00000000000 - 0xfffff7ffffffffff   unused
> + * 0xfffff80000000000 - 0xffffffffffffffff   8TB kernel map
>   *
>   * Within the kernel map:
>   *
>   * 0xffffffff80000000                        KERNBASE
>   */
>
> -#define        VM_MAX_KERNEL_ADDRESS   KVADDR(KPML4I, NPDPEPG-1, NPDEPG-1, NPTEPG-1)
> -#define        VM_MIN_KERNEL_ADDRESS   KVADDR(KPML4I, NPDPEPG-512, 0, 0)
> +#define        VM_MIN_KERNEL_ADDRESS   KVADDR(KPML4BASE, 0, 0, 0)
> +#define        VM_MAX_KERNEL_ADDRESS   KVADDR(KPML4BASE + NKPML4E - 1, \
> +                                       NPDPEPG-1, NPDEPG-1, NPTEPG-1)
>
>  #define        DMAP_MIN_ADDRESS        KVADDR(DMPML4I, 0, 0, 0)
>  #define        DMAP_MAX_ADDRESS        KVADDR(DMPML4I + NDMPML4E, 0, 0, 0)

best
Neel



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?CAFgRE9HycyTL-Mch9KF08Df6qFzzCTb-aU0AkKQhi7F5gtGevA>