Date: Wed, 24 Apr 2013 08:51:15 +0000 (UTC) From: Attilio Rao <attilio@FreeBSD.org> To: src-committers@freebsd.org, svn-src-user@freebsd.org Subject: svn commit: r249834 - in user/attilio/jeff-numa/sys: sys vm Message-ID: <201304240851.r3O8pF36048457@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: attilio Date: Wed Apr 24 08:51:15 2013 New Revision: 249834 URL: http://svnweb.freebsd.org/changeset/base/249834 Log: o Add accessor functions to add and remove pages from a specific freelist. o Split the pool of free_queues really by domain and not relying on VM_RAW_NFREELIST definition. o For MAXDOMAIN > 1, wrap the RR allocation logic into a specific function that is called when calculating the allocation domain. The RR counter is kept per-thread. In the future it is expected that such function is going to handle different types of policies based on specific information retrieved by curthread and backing vm_objects. o Add the concept of "probed domains" under the form of vm_ndomains (it is similar in concept to mp_ncpus but it does refer to mem domains). Right now there are no probed domains for any architecture. It is the responsibility of architecture maintainers to add the proper bits to do domain probing. Please note that vm_ndomains and td_dom_rr_idx are both int because segs store domains as int. u_int would have made much more sense. Probably we should clean them up altogether in the future. o Apply RR domain selection also to vm_phys_zero_pages_idle(). Sponsored by: EMC / Isilon storage division Partly obtained from: jeff Modified: user/attilio/jeff-numa/sys/sys/proc.h user/attilio/jeff-numa/sys/vm/vm_phys.c user/attilio/jeff-numa/sys/vm/vm_phys.h Modified: user/attilio/jeff-numa/sys/sys/proc.h ============================================================================== --- user/attilio/jeff-numa/sys/sys/proc.h Wed Apr 24 06:40:48 2013 (r249833) +++ user/attilio/jeff-numa/sys/sys/proc.h Wed Apr 24 08:51:15 2013 (r249834) @@ -274,6 +274,7 @@ struct thread { pid_t td_dbg_forked; /* (c) Child pid for debugger. */ u_int td_vp_reserv; /* (k) Count of reserved vnodes. */ int td_no_sleeping; /* (k) Sleeping disabled count. */ + int td_dom_rr_idx; /* (k) RR Numa domain selection. 
*/ #define td_endzero td_sigmask /* Copied during fork1() or create_thread(). */ Modified: user/attilio/jeff-numa/sys/vm/vm_phys.c ============================================================================== --- user/attilio/jeff-numa/sys/vm/vm_phys.c Wed Apr 24 06:40:48 2013 (r249833) +++ user/attilio/jeff-numa/sys/vm/vm_phys.c Wed Apr 24 08:51:15 2013 (r249834) @@ -48,6 +48,9 @@ __FBSDID("$FreeBSD$"); #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/mutex.h> +#if MAXDOMAIN > 1 +#include <sys/proc.h> +#endif #include <sys/queue.h> #include <sys/sbuf.h> #include <sys/sysctl.h> @@ -62,13 +65,6 @@ __FBSDID("$FreeBSD$"); #include <vm/vm_page.h> #include <vm/vm_phys.h> -/* - * VM_FREELIST_DEFAULT is split into MAXDOMAIN lists, one for each - * domain. These extra lists are stored at the end of the regular - * free lists starting with VM_NFREELIST. - */ -#define VM_RAW_NFREELIST (VM_NFREELIST + MAXDOMAIN - 1) - struct vm_freelist { struct pglist pl; int lcnt; @@ -84,6 +80,8 @@ struct vm_phys_seg { struct mem_affinity *mem_affinity; +int vm_ndomains = 1; + static struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX]; static int vm_phys_nsegs; @@ -98,9 +96,7 @@ static struct mtx vm_phys_fictitious_reg MALLOC_DEFINE(M_FICT_PAGES, "", ""); static struct vm_freelist - vm_phys_free_queues[VM_RAW_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER]; -static struct vm_freelist -(*vm_phys_lookup_lists[MAXDOMAIN][VM_RAW_NFREELIST])[VM_NFREEPOOL][VM_NFREEORDER]; + vm_phys_free_queues[MAXDOMAIN][VM_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER]; static int vm_nfreelists = VM_FREELIST_DEFAULT + 1; @@ -116,11 +112,8 @@ static int sysctl_vm_phys_segs(SYSCTL_HA SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info"); -#if MAXDOMAIN > 1 -static int sysctl_vm_phys_lookup_lists(SYSCTL_HANDLER_ARGS); -SYSCTL_OID(_vm, OID_AUTO, phys_lookup_lists, CTLTYPE_STRING | CTLFLAG_RD, - NULL, 0, sysctl_vm_phys_lookup_lists, "A", "Phys Lookup Lists"); 
-#endif +SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD, + &vm_ndomains, 0, "Number of physical memory domains available."); static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind, int domain); @@ -129,6 +122,22 @@ static int vm_phys_paddr_to_segind(vm_pa static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order); +static __inline int +vm_rr_selectdomain(void) +{ +#if MAXDOMAIN > 1 + struct thread *td; + + td = curthread; + + td->td_dom_rr_idx++; + td->td_dom_rr_idx %= vm_ndomains; + return (td->td_dom_rr_idx); +#else + return (0); +#endif +} + /* * Outputs the state of the physical memory allocator, specifically, * the amount of physical memory in each free list. @@ -137,8 +146,7 @@ static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; - struct vm_freelist *fl; - int error, flind, oind, pind; + int dom, error, flind, lcnt, oind, pind; error = sysctl_wire_old_buffer(req, 0); if (error != 0) @@ -158,8 +166,10 @@ sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS) sbuf_printf(&sbuf, " %2d (%6dK)", oind, 1 << (PAGE_SHIFT - 10 + oind)); for (pind = 0; pind < VM_NFREEPOOL; pind++) { - fl = vm_phys_free_queues[flind][pind]; - sbuf_printf(&sbuf, " | %6d", fl[oind].lcnt); + lcnt = 0; + for (dom = 0; dom < vm_ndomains; dom++) + lcnt += vm_phys_free_queues[dom][flind][pind][oind].lcnt; + sbuf_printf(&sbuf, " | %6d", lcnt); } sbuf_printf(&sbuf, "\n"); } @@ -198,33 +208,27 @@ sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS) return (error); } -#if MAXDOMAIN > 1 -/* - * Outputs the set of free list lookup lists. 
- */ -static int -sysctl_vm_phys_lookup_lists(SYSCTL_HANDLER_ARGS) +static void +vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail) { - struct sbuf sbuf; - int domain, error, flind, ndomains; - error = sysctl_wire_old_buffer(req, 0); - if (error != 0) - return (error); - sbuf_new_for_sysctl(&sbuf, NULL, 128, req); - ndomains = vm_nfreelists - VM_NFREELIST + 1; - for (domain = 0; domain < ndomains; domain++) { - sbuf_printf(&sbuf, "\nDOMAIN %d:\n\n", domain); - for (flind = 0; flind < vm_nfreelists; flind++) - sbuf_printf(&sbuf, " [%d]:\t%p\n", flind, - vm_phys_lookup_lists[domain][flind]); - } - error = sbuf_finish(&sbuf); - sbuf_delete(&sbuf); - return (error); + m->order = order; + if (tail) + TAILQ_INSERT_TAIL(&fl[order].pl, m, pageq); + else + TAILQ_INSERT_HEAD(&fl[order].pl, m, pageq); + fl[order].lcnt++; } -#endif - + +static void +vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order) +{ + + TAILQ_REMOVE(&fl[order].pl, m, pageq); + fl[order].lcnt--; + m->order = VM_NFREEORDER; +} + /* * Create a physical memory segment. 
*/ @@ -253,14 +257,7 @@ _vm_phys_create_seg(vm_paddr_t start, vm #else seg->first_page = PHYS_TO_VM_PAGE(start); #endif -#if MAXDOMAIN > 1 - if (flind == VM_FREELIST_DEFAULT && domain != 0) { - flind = VM_NFREELIST + (domain - 1); - if (flind >= vm_nfreelists) - vm_nfreelists = flind + 1; - } -#endif - seg->free_queues = &vm_phys_free_queues[flind]; + seg->free_queues = &vm_phys_free_queues[domain][flind]; } static void @@ -299,10 +296,7 @@ void vm_phys_init(void) { struct vm_freelist *fl; - int flind, i, oind, pind; -#if MAXDOMAIN > 1 - int ndomains, j; -#endif + int dom, flind, i, oind, pind; for (i = 0; phys_avail[i + 1] != 0; i += 2) { #ifdef VM_FREELIST_ISADMA @@ -338,45 +332,15 @@ vm_phys_init(void) vm_phys_create_seg(phys_avail[i], phys_avail[i + 1], VM_FREELIST_DEFAULT); } - for (flind = 0; flind < vm_nfreelists; flind++) { - for (pind = 0; pind < VM_NFREEPOOL; pind++) { - fl = vm_phys_free_queues[flind][pind]; - for (oind = 0; oind < VM_NFREEORDER; oind++) - TAILQ_INIT(&fl[oind].pl); + for (dom = 0; dom < vm_ndomains; dom++) { + for (flind = 0; flind < vm_nfreelists; flind++) { + for (pind = 0; pind < VM_NFREEPOOL; pind++) { + fl = vm_phys_free_queues[dom][flind][pind]; + for (oind = 0; oind < VM_NFREEORDER; oind++) + TAILQ_INIT(&fl[oind].pl); + } } } -#if MAXDOMAIN > 1 - /* - * Build a free list lookup list for each domain. All of the - * memory domain lists are inserted at the VM_FREELIST_DEFAULT - * index in a round-robin order starting with the current - * domain. 
- */ - ndomains = vm_nfreelists - VM_NFREELIST + 1; - for (flind = 0; flind < VM_FREELIST_DEFAULT; flind++) - for (i = 0; i < ndomains; i++) - vm_phys_lookup_lists[i][flind] = - &vm_phys_free_queues[flind]; - for (i = 0; i < ndomains; i++) - for (j = 0; j < ndomains; j++) { - flind = (i + j) % ndomains; - if (flind == 0) - flind = VM_FREELIST_DEFAULT; - else - flind += VM_NFREELIST - 1; - vm_phys_lookup_lists[i][VM_FREELIST_DEFAULT + j] = - &vm_phys_free_queues[flind]; - } - for (flind = VM_FREELIST_DEFAULT + 1; flind < VM_NFREELIST; - flind++) - for (i = 0; i < ndomains; i++) - vm_phys_lookup_lists[i][flind + ndomains - 1] = - &vm_phys_free_queues[flind]; -#else - for (flind = 0; flind < vm_nfreelists; flind++) - vm_phys_lookup_lists[0][flind] = &vm_phys_free_queues[flind]; -#endif - mtx_init(&vm_phys_fictitious_reg_mtx, "vmfctr", NULL, MTX_DEF); } @@ -394,9 +358,7 @@ vm_phys_split_pages(vm_page_t m, int oin KASSERT(m_buddy->order == VM_NFREEORDER, ("vm_phys_split_pages: page %p has unexpected order %d", m_buddy, m_buddy->order)); - m_buddy->order = oind; - TAILQ_INSERT_HEAD(&fl[oind].pl, m_buddy, pageq); - fl[oind].lcnt++; + vm_freelist_add(fl, m_buddy, oind, 0); } } @@ -426,35 +388,15 @@ vm_phys_add_page(vm_paddr_t pa) } /* - * Allocate a contiguous, power of two-sized set of physical pages - * from the free lists. - * - * The free page queues must be locked. 
- */ -vm_page_t -vm_phys_alloc_pages(int pool, int order) -{ - vm_page_t m; - int flind; - - for (flind = 0; flind < vm_nfreelists; flind++) { - m = vm_phys_alloc_freelist_pages(flind, pool, order); - if (m != NULL) - return (m); - } - return (NULL); -} - -/* * Find and dequeue a free page on the given free list, with the * specified pool and order */ -vm_page_t -vm_phys_alloc_freelist_pages(int flind, int pool, int order) +static vm_page_t +vm_phys_alloc_freelist_pages_domain(int domain, int flind, int pool, int order) { struct vm_freelist *fl; struct vm_freelist *alt; - int domain, oind, pind; + int oind, pind; vm_page_t m; KASSERT(flind < VM_NFREELIST, @@ -464,19 +406,12 @@ vm_phys_alloc_freelist_pages(int flind, KASSERT(order < VM_NFREEORDER, ("vm_phys_alloc_freelist_pages: order %d is out of range", order)); -#if MAXDOMAIN > 1 - domain = PCPU_GET(domain); -#else - domain = 0; -#endif mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); - fl = (*vm_phys_lookup_lists[domain][flind])[pool]; + fl = &vm_phys_free_queues[domain][flind][pool][0]; for (oind = order; oind < VM_NFREEORDER; oind++) { m = TAILQ_FIRST(&fl[oind].pl); if (m != NULL) { - TAILQ_REMOVE(&fl[oind].pl, m, pageq); - fl[oind].lcnt--; - m->order = VM_NFREEORDER; + vm_freelist_rem(fl, m, oind); vm_phys_split_pages(m, oind, fl, order); return (m); } @@ -490,12 +425,10 @@ vm_phys_alloc_freelist_pages(int flind, */ for (oind = VM_NFREEORDER - 1; oind >= order; oind--) { for (pind = 0; pind < VM_NFREEPOOL; pind++) { - alt = (*vm_phys_lookup_lists[domain][flind])[pind]; + alt = &vm_phys_free_queues[domain][flind][pind][0]; m = TAILQ_FIRST(&alt[oind].pl); if (m != NULL) { - TAILQ_REMOVE(&alt[oind].pl, m, pageq); - alt[oind].lcnt--; - m->order = VM_NFREEORDER; + vm_freelist_rem(alt, m, oind); vm_phys_set_pool(pool, m, oind); vm_phys_split_pages(m, oind, fl, order); return (m); @@ -506,6 +439,40 @@ vm_phys_alloc_freelist_pages(int flind, } /* + * See the comments for vm_phys_alloc_freelist_pages_domain(). 
+ * When MAXDOMAIN is bumped picks up a domain in round-robin fashion. + */ +vm_page_t +vm_phys_alloc_freelist_pages(int flind, int pool, int order) +{ + + return (vm_phys_alloc_freelist_pages_domain(vm_rr_selectdomain(), + flind, pool, order)); +} + +/* + * Allocate a contiguous, power of two-sized set of physical pages + * from the free lists. + * + * The free page queues must be locked. + */ +vm_page_t +vm_phys_alloc_pages(int pool, int order) +{ + vm_page_t m; + int domain, flind; + + domain = vm_rr_selectdomain(); + for (flind = 0; flind < vm_nfreelists; flind++) { + m = vm_phys_alloc_freelist_pages_domain(domain, flind, pool, + order); + if (m != NULL) + return (m); + } + return (NULL); +} + +/* * Find the vm_page corresponding to the given physical address. */ vm_page_t @@ -679,9 +646,7 @@ vm_phys_free_pages(vm_page_t m, int orde if (m_buddy->order != order) break; fl = (*seg->free_queues)[m_buddy->pool]; - TAILQ_REMOVE(&fl[order].pl, m_buddy, pageq); - fl[order].lcnt--; - m_buddy->order = VM_NFREEORDER; + vm_freelist_rem(fl, m_buddy, order); if (m_buddy->pool != m->pool) vm_phys_set_pool(m->pool, m_buddy, order); order++; @@ -689,10 +654,8 @@ vm_phys_free_pages(vm_page_t m, int orde m = &seg->first_page[atop(pa - seg->start)]; } while (order < VM_NFREEORDER - 1); } - m->order = order; fl = (*seg->free_queues)[m->pool]; - TAILQ_INSERT_TAIL(&fl[order].pl, m, pageq); - fl[order].lcnt++; + vm_freelist_add(fl, m, order, 1); } /* @@ -797,9 +760,7 @@ vm_phys_unfree_page(vm_page_t m) */ fl = (*seg->free_queues)[m_set->pool]; order = m_set->order; - TAILQ_REMOVE(&fl[order].pl, m_set, pageq); - fl[order].lcnt--; - m_set->order = VM_NFREEORDER; + vm_freelist_rem(fl, m_set, order); while (order > 0) { order--; pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order)); @@ -809,9 +770,7 @@ vm_phys_unfree_page(vm_page_t m) m_tmp = m_set; m_set = &seg->first_page[atop(pa_half - seg->start)]; } - m_tmp->order = order; - TAILQ_INSERT_HEAD(&fl[order].pl, m_tmp, pageq); - 
fl[order].lcnt++; + vm_freelist_add(fl, m_tmp, order, 0); } KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency")); return (TRUE); @@ -823,10 +782,13 @@ vm_phys_unfree_page(vm_page_t m) boolean_t vm_phys_zero_pages_idle(void) { - static struct vm_freelist *fl = vm_phys_free_queues[0][0]; + static struct vm_freelist *fl; static int flind, oind, pind; vm_page_t m, m_tmp; + int domain; + domain = vm_rr_selectdomain(); + fl = vm_phys_free_queues[domain][0][0]; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); for (;;) { TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, pageq) { @@ -856,7 +818,7 @@ vm_phys_zero_pages_idle(void) if (flind == vm_nfreelists) flind = 0; } - fl = vm_phys_free_queues[flind][pind]; + fl = vm_phys_free_queues[domain][flind][pind]; } } } @@ -883,12 +845,8 @@ vm_phys_alloc_contig(u_long npages, vm_p int domain, flind, oind, order, pind; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); -#if MAXDOMAIN > 1 - domain = PCPU_GET(domain); -#else - domain = 0; -#endif size = npages << PAGE_SHIFT; + domain = vm_rr_selectdomain(); KASSERT(size != 0, ("vm_phys_alloc_contig: size must not be 0")); KASSERT((alignment & (alignment - 1)) == 0, @@ -900,8 +858,7 @@ vm_phys_alloc_contig(u_long npages, vm_p for (flind = 0; flind < vm_nfreelists; flind++) { for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) { for (pind = 0; pind < VM_NFREEPOOL; pind++) { - fl = (*vm_phys_lookup_lists[domain][flind]) - [pind]; + fl = &vm_phys_free_queues[domain][flind][pind][0]; TAILQ_FOREACH(m_ret, &fl[oind].pl, pageq) { /* * A free list may contain physical pages @@ -959,9 +916,7 @@ vm_phys_alloc_contig(u_long npages, vm_p done: for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) { fl = (*seg->free_queues)[m->pool]; - TAILQ_REMOVE(&fl[m->order].pl, m, pageq); - fl[m->order].lcnt--; - m->order = VM_NFREEORDER; + vm_freelist_rem(fl, m, m->order); } if (m_ret->pool != VM_FREEPOOL_DEFAULT) vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m_ret, oind); @@ -981,28 
+936,30 @@ done: DB_SHOW_COMMAND(freepages, db_show_freepages) { struct vm_freelist *fl; - int flind, oind, pind; + int flind, oind, pind, dom; - for (flind = 0; flind < vm_nfreelists; flind++) { - db_printf("FREE LIST %d:\n" - "\n ORDER (SIZE) | NUMBER" - "\n ", flind); - for (pind = 0; pind < VM_NFREEPOOL; pind++) - db_printf(" | POOL %d", pind); - db_printf("\n-- "); - for (pind = 0; pind < VM_NFREEPOOL; pind++) - db_printf("-- -- "); - db_printf("--\n"); - for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { - db_printf(" %2.2d (%6.6dK)", oind, - 1 << (PAGE_SHIFT - 10 + oind)); - for (pind = 0; pind < VM_NFREEPOOL; pind++) { - fl = vm_phys_free_queues[flind][pind]; - db_printf(" | %6.6d", fl[oind].lcnt); + for (dom = 0; dom < vm_ndomains; dom++) { + for (flind = 0; flind < vm_nfreelists; flind++) { + db_printf("DOMAIN %d FREE LIST %d:\n" + "\n ORDER (SIZE) | NUMBER" + "\n ", dom, flind); + for (pind = 0; pind < VM_NFREEPOOL; pind++) + db_printf(" | POOL %d", pind); + db_printf("\n-- "); + for (pind = 0; pind < VM_NFREEPOOL; pind++) + db_printf("-- -- "); + db_printf("--\n"); + for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { + db_printf(" %2.2d (%6.6dK)", oind, + 1 << (PAGE_SHIFT - 10 + oind)); + for (pind = 0; pind < VM_NFREEPOOL; pind++) { + fl = vm_phys_free_queues[dom][flind][pind]; + db_printf(" | %6.6d", fl[oind].lcnt); + } + db_printf("\n"); } db_printf("\n"); } - db_printf("\n"); } } #endif Modified: user/attilio/jeff-numa/sys/vm/vm_phys.h ============================================================================== --- user/attilio/jeff-numa/sys/vm/vm_phys.h Wed Apr 24 06:40:48 2013 (r249833) +++ user/attilio/jeff-numa/sys/vm/vm_phys.h Wed Apr 24 08:51:15 2013 (r249834) @@ -48,6 +48,7 @@ struct mem_affinity { }; extern struct mem_affinity *mem_affinity; +extern int vm_ndomains; /* * The following functions are only to be used by the virtual memory system.
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201304240851.r3O8pF36048457>