Date: Thu, 31 May 2018 21:19:58 +1000 (EST)
From: Bruce Evans <brde@optusnet.com.au>
To: Mateusz Guzik <mjg@freebsd.org>
Cc: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject: Re: svn commit: r334419 - head/sys/amd64/amd64
Message-ID: <20180531201225.L2478@besplex.bde.org>
In-Reply-To: <201805310956.w4V9u2rL084194@repo.freebsd.org>
References: <201805310956.w4V9u2rL084194@repo.freebsd.org>
On Thu, 31 May 2018, Mateusz Guzik wrote:

> Log:
>   amd64: switch pagecopy from non-temporal stores to rep movsq

As for pagezero, this pessimizes for machines with slow movsq and/or
caches (mostly older machines).

>   The copied data is accessed in part soon after and it results with additional
>   cache misses during a -j 1 buildkernel WITHOUT_CTF=yes KERNFAST=1, as measured
>   with pmc stat.

Of course it causes more cache misses later, but for large data going
through slow caches is much slower, so the cache misses later cost less.

> Modified: head/sys/amd64/amd64/support.S
> ==============================================================================
> --- head/sys/amd64/amd64/support.S	Thu May 31 09:11:21 2018	(r334418)
> +++ head/sys/amd64/amd64/support.S	Thu May 31 09:56:02 2018	(r334419)
> @@ -281,26 +281,12 @@ END(memset)
>   */
>  ENTRY(pagecopy)
>  	PUSH_FRAME_POINTER
> -	movq	$-PAGE_SIZE,%rax
> -	movq	%rax,%rdx
> -	subq	%rax,%rdi
> -	subq	%rax,%rsi
> -1:
> -	prefetchnta (%rdi,%rax)
> -	addq	$64,%rax
> -	jne	1b
> -2:
> -	movq	(%rdi,%rdx),%rax
> -	movnti	%rax,(%rsi,%rdx)
> -	movq	8(%rdi,%rdx),%rax
> -	movnti	%rax,8(%rsi,%rdx)
> -	movq	16(%rdi,%rdx),%rax
> -	movnti	%rax,16(%rsi,%rdx)
> -	movq	24(%rdi,%rdx),%rax
> -	movnti	%rax,24(%rsi,%rdx)
> -	addq	$32,%rdx
> -	jne	2b
> -	sfence
> +	movq	$PAGE_SIZE/8,%rcx
> +	movq	%rdi,%r9
> +	movq	%rsi,%rdi
> +	movq	%r9,%rsi
> +	rep
> +	movsq
> 	POP_FRAME_POINTER
> 	ret
>  END(pagecopy)

It is negatively useful to write this in asm.  This is now just memcpy(),
and the asm version of that is fast enough, though movsq takes too long to
start up.  This memcpy() might be inlined, and then it would be
insignificantly faster than the function call.  __builtin_memcpy() won't
actually inline it, since its size is large and compilers know that they
don't understand memory.  This doesn't even pessimize for i386.

I have used many versions of this, with sysctls to control it.  Newer
machines normally select memcpy and older machines normally select
pagecopy.  This change breaks the selection for older machines.  They
actually want nontemporal pagecopy.
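Concretely, the committed routine does nothing that a one-line C function
would not do.  A minimal sketch (the name pagecopy_c and the fixed
4096-byte PAGE_SIZE are assumed here for a self-contained example; this is
not committed code):

	#include <string.h>

	#define	PAGE_SIZE	4096	/* assumed for the example */

	static void
	pagecopy_c(void *dst, const void *src)
	{
		/*
		 * A compiler may expand this inline or emit a call to
		 * memcpy(); for a copy this large it normally emits the
		 * call, so an asm wrapper around rep movsq buys nothing
		 * over plain memcpy().
		 */
		memcpy(dst, src, PAGE_SIZE);
	}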
My current version is:

XX Index: amd64/amd64/pmap.c
XX ===================================================================
XX --- amd64/amd64/pmap.c	(revision 332488)
XX +++ amd64/amd64/pmap.c	(working copy)
XX @@ -360,6 +360,14 @@
XX  SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
XX      &pg_ps_enabled, 0, "Are large page mappings enabled?");
XX  
XX +static int pagecopy_memcpy;
XX +SYSCTL_INT(_vm_pmap, OID_AUTO, pagecopy_memcpy, CTLFLAG_RW,
XX +    &pagecopy_memcpy, 0, "Use memcpy for pagecopy?");
XX +
XX +static int pagezero_bzero;
XX +SYSCTL_INT(_vm_pmap, OID_AUTO, pagezero_bzero, CTLFLAG_RW,
XX +    &pagezero_bzero, 0, "Use bzero for pagezero?");
XX +
XX  #define	PAT_INDEX_SIZE	8
XX  static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
XX  
XX @@ -5638,7 +5646,10 @@
XX  {
XX  	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
XX  
XX -	pagezero((void *)va);
XX +	if (pagezero_bzero || (cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0)
XX +		pagezero((void *)va);
XX +	else
XX +		sse2_pagezero((void *)va);
XX  }
XX  
XX  /*
XX @@ -5665,7 +5676,10 @@
XX  	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
XX  	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
XX  
XX -	pagecopy((void *)src, (void *)dst);
XX +	if (pagecopy_memcpy || (cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0)
XX +		memcpy((void *)dst, (void *)src, PAGE_SIZE);
XX +	else
XX +		pagecopy((void *)src, (void *)dst);
XX  }
XX  
XX  int unmapped_buf_allowed = 1;
XX Index: i386/i386/pmap.c
XX ===================================================================
XX --- i386/i386/pmap.c	(revision 332488)
XX +++ i386/i386/pmap.c	(working copy)
XX @@ -219,6 +219,10 @@
XX  SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
XX      &pg_ps_enabled, 0, "Are large page mappings enabled?");
XX  
XX +static int pagezero_bzero;
XX +SYSCTL_INT(_vm_pmap, OID_AUTO, pagezero_bzero, CTLFLAG_RW,
XX +    &pagezero_bzero, 0, "Use bzero for pagezero?");
XX +
XX  #define	PAT_INDEX_SIZE	8
XX  static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
XX  
XX @@ -4212,6 +4217,10 @@
XX  static __inline void
XX  pagezero(void *page)
XX  {
XX +	if (pagezero_bzero || (cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0) {
XX +		bzero(page, PAGE_SIZE);
XX +		return;
XX +	}
XX  #if defined(I686_CPU)
XX  	if (cpu_class == CPUCLASS_686) {
XX  		if (cpu_feature & CPUID_SSE2)

i386 doesn't have a nontemporal pagecopy() and this patch doesn't bother
to add one, but in other versions I use:
- nontemporal pagecopy through SSE1 if available (requires a slow FPU
  switch but is otherwise better than movnti, starting with not needing
  SSE2)
- nontemporal pagezero through SSE1 if available
- nontemporal pagecopy through SSE2 (see the next set of patches).

This actually defaults to using memcpy() or bzero() according to
(cpu_stdext_feature & CPUID_STDEXT_ERMS).  This is not quite the right
test, but it distinguishes correctly between all of my slow and fast
machines.  Newer machines tend to have CPUID_STDEXT_ERMS.  Cache speed is
more important than fast-strings, but newer machines tend to have both.

The default and sysctl recover from the loss of nontemporal pagezero on
amd64 and keep i386 with the same functionality.  I recovered nontemporal
pagezero as sse2_pagezero on amd64, but it is unreachable without the
above sysctl.  kib didn't like the fast-strings heuristic and I haven't
found or needed a better one.
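For reference, here is a minimal userland sketch of the nontemporal
approach using SSE2 intrinsics, roughly what the movnti-based pagecopy
variants in the patch below do, but 16 bytes per store.  The name
sse2_pagecopy_sketch and the fixed 4096-byte PAGE_SIZE are assumptions for
illustration; the kernel asm uses integer movnti instead of SSE registers,
which avoids the slow FPU switch mentioned above:

	#include <emmintrin.h>	/* SSE2: _mm_stream_si128() and friends */
	#include <stddef.h>

	#define	PAGE_SIZE	4096	/* assumed page size */

	/* dst and src are assumed 16-byte (page) aligned, as kernel pages are. */
	static void
	sse2_pagecopy_sketch(void *dst, const void *src)
	{
		const __m128i *s = src;
		__m128i *d = dst;
		size_t i;

		for (i = 0; i < PAGE_SIZE / sizeof(__m128i); i++) {
			/* Ordinary (cached) load; the store bypasses the cache. */
			__m128i v = _mm_load_si128(&s[i]);
			_mm_stream_si128(&d[i], v);
		}
		_mm_sfence();	/* order the nontemporal stores before later stores */
	}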
Here is the more elaborate version that I used 2 years ago.

Testing with hundreds of makeworlds showed that nothing with nontemporal
moves was any good on Haswell.  Nontemporal pagecopy was better for
Turion2, but pagecopy is used so rarely that nontemporal pagecopy is not
worth adding for i386 now.

Much older tests showed that nontemporal stores are not worth using for
almost anything, even for machines older than Turion2 where they look
useful.  They are useful for data larger than all caches, or large enough
to go out of caches before it is used, but the former is rare and the
latter is hard to predict.  SunOS used nontemporal copies for not very
large sizes in bcopy() and/or copyin/out().  I tried that, and couldn't
find any threshold where nontemporal copies worked better.  Nontemporal
copies are just worse for small data, and for large data it makes little
difference.  pagecopy() is more predictable but still hard to predict.
A single page is too small for nontemporal copying.  There might be a
large process with a few GB of page faults all at once, but that is rare,
especially with the non-bloatware that I use.

YY diff -c2 ./amd64/amd64/pmap.c~ ./amd64/amd64/pmap.c
YY *** ./amd64/amd64/pmap.c~	Sat Jan 23 05:12:31 2016
YY --- ./amd64/amd64/pmap.c	Sun Feb 14 17:22:07 2016
YY ***************
YY *** 352,355 ****
YY --- 352,395 ----
YY       &pg_ps_enabled, 0, "Are large page mappings enabled?");
YY   
YY + static u_long pagecopy_calls;
YY + SYSCTL_ULONG(_vm_pmap, OID_AUTO, pagecopy_calls, CTLFLAG_RW,
YY +     &pagecopy_calls, 0, "Number of calls to pmap_copy_page");
YY + 
YY + static int pagecopy_memcpy;
YY + SYSCTL_INT(_vm_pmap, OID_AUTO, pagecopy_memcpy, CTLFLAG_RW,
YY +     &pagecopy_memcpy, 0, "Use memcpy for pagecopy?");
YY + 
YY + static int pagecopy_pagecopy1;
YY + SYSCTL_INT(_vm_pmap, OID_AUTO, pagecopy_pagecopy1, CTLFLAG_RW,
YY +     &pagecopy_pagecopy1, 0, "Use pagecopy1 for pagecopy?");
YY + 
YY + static int pagecopy_pagecopy8;
YY + SYSCTL_INT(_vm_pmap, OID_AUTO, pagecopy_pagecopy8, CTLFLAG_RW,
YY +     &pagecopy_pagecopy8, 0, "Use pagecopy8 for pagecopy?");
YY + 
YY + static int pagecopy_pagecopy81;
YY + SYSCTL_INT(_vm_pmap, OID_AUTO, pagecopy_pagecopy81, CTLFLAG_RW,
YY +     &pagecopy_pagecopy81, 0, "Use pagecopy81 for pagecopy?");
YY + 
YY + static u_long pagezero_calls;
YY + SYSCTL_ULONG(_vm_pmap, OID_AUTO, pagezero_calls, CTLFLAG_RW,
YY +     &pagezero_calls, 0, "Number of calls to pagezero");
YY + 
YY + static int pagezero_pagezero1;
YY + SYSCTL_INT(_vm_pmap, OID_AUTO, pagezero_pagezero1, CTLFLAG_RW,
YY +     &pagezero_pagezero1, 0, "Use pagezero1 for pagezero?");
YY + 
YY + static int pagezero_pagezero2;
YY + SYSCTL_INT(_vm_pmap, OID_AUTO, pagezero_pagezero2, CTLFLAG_RW,
YY +     &pagezero_pagezero2, 0, "Use pagezero2 for pagezero?");
YY + 
YY + static int pagezero_pagezero8;
YY + SYSCTL_INT(_vm_pmap, OID_AUTO, pagezero_pagezero8, CTLFLAG_RW,
YY +     &pagezero_pagezero8, 0, "Use pagezero8 for pagezero?");
YY + 
YY + static int pagezero_bzero;
YY + SYSCTL_INT(_vm_pmap, OID_AUTO, pagezero_bzero, CTLFLAG_RW,
YY +     &pagezero_bzero, 0, "Use bzero for pagezero?");
YY + 
YY   #define	PAT_INDEX_SIZE	8
YY   static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
YY ***************
YY *** 4989,4995 ****
YY   }
YY   
YY   /*
YY !  * pmap_zero_page zeros the specified hardware page by mapping
YY !  * the page into KVM and using bzero to clear its contents.
YY    */
YY   void
YY --- 5029,5038 ----
YY   }
YY   
YY + void pagezero1(void *);
YY + void pagezero2(void *);
YY + void pagezero8(void *);
YY + 
YY   /*
YY !  * Zero the specified hardware page.
YY    */
YY   void
YY ***************
YY *** 4998,5009 ****
YY   	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
YY   
YY ! 	pagezero((void *)va);
YY   }
YY   
YY   /*
YY !  * pmap_zero_page_area zeros the specified hardware page by mapping
YY !  * the page into KVM and using bzero to clear its contents.
YY !  *
YY !  * off and size may not cover an area beyond a single hardware page.
YY    */
YY   void
YY --- 5041,5060 ----
YY   	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
YY   
YY ! 	pagezero_calls++;
YY ! 	if (pagezero_pagezero1)
YY ! 		pagezero1((void *)va);
YY ! 	else if (pagezero_pagezero2)
YY ! 		pagezero2((void *)va);
YY ! 	else if (pagezero_pagezero8)
YY ! 		pagezero8((void *)va);
YY ! 	else if (pagezero_bzero)
YY ! 		bzero((void *)va, PAGE_SIZE);
YY ! 	else
YY ! 		pagezero((void *)va);
YY   }
YY   
YY   /*
YY !  * Zero an area within a single hardware page.  off and size must not
YY !  * cover an area beyond a single hardware page.
YY    */
YY   void
YY ***************
YY *** 5032,5035 ****
YY --- 5083,5090 ----
YY   }
YY   
YY + void pagecopy1(void *, void *);
YY + void pagecopy8(void *, void *);
YY + void pagecopy81(void *, void *);
YY + 
YY   /*
YY    * pmap_copy_page copies the specified (machine independent)
YY ***************
YY *** 5044,5048 ****
YY   	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
YY   
YY ! 	pagecopy((void *)src, (void *)dst);
YY   }
YY   
YY --- 5099,5113 ----
YY   	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
YY   
YY ! 	pagecopy_calls++;
YY ! 	if (pagecopy_pagecopy1)
YY ! 		pagecopy1((void *)src, (void *)dst);
YY ! 	else if (pagecopy_pagecopy8)
YY ! 		pagecopy8((void *)src, (void *)dst);
YY ! 	else if (pagecopy_pagecopy81)
YY ! 		pagecopy81((void *)src, (void *)dst);
YY ! 	else if (pagecopy_memcpy)
YY ! 		memcpy((void *)dst, (void *)src, PAGE_SIZE);
YY ! 	else
YY ! 		pagecopy((void *)src, (void *)dst);
YY   }
YY   
YY diff -c2 ./amd64/amd64/support.S~ ./amd64/amd64/support.S
YY *** ./amd64/amd64/support.S~	Wed Feb 24 22:35:30 2016
YY --- ./amd64/amd64/support.S	Wed Feb 24 22:47:10 2016
YY ***************
YY *** 80,83 ****
YY --- 80,133 ----
YY   END(pagezero)
YY   
YY + ENTRY(pagezero1)
YY + 	PUSH_FRAME_POINTER
YY + 	movq	$-PAGE_SIZE,%rdx
YY + 	subq	%rdx,%rdi
YY + 	xorl	%eax,%eax
YY + 1:
YY + 	movnti	%rax,(%rdi,%rdx)
YY + 	addq	$8,%rdx
YY + 	jne	1b
YY + 	sfence
YY + 	POP_FRAME_POINTER
YY + 	ret
YY + END(pagezero1)
YY + 
YY + ENTRY(pagezero2)
YY + 	PUSH_FRAME_POINTER
YY + 	movq	$-PAGE_SIZE,%rdx
YY + 	subq	%rdx,%rdi
YY + 	xorl	%eax,%eax
YY + 1:
YY + 	movnti	%rax,(%rdi,%rdx)
YY + 	movnti	%rax,8(%rdi,%rdx)
YY + 	addq	$16,%rdx
YY + 	jne	1b
YY + 	sfence
YY + 	POP_FRAME_POINTER
YY + 	ret
YY + END(pagezero2)
YY + 
YY + ENTRY(pagezero8)
YY + 	PUSH_FRAME_POINTER
YY + 	movq	$-PAGE_SIZE,%rdx
YY + 	subq	%rdx,%rdi
YY + 	xorl	%eax,%eax
YY + 1:
YY + 	movnti	%rax,(%rdi,%rdx)
YY + 	movnti	%rax,8(%rdi,%rdx)
YY + 	movnti	%rax,16(%rdi,%rdx)
YY + 	movnti	%rax,24(%rdi,%rdx)
YY + 	movnti	%rax,32(%rdi,%rdx)
YY + 	movnti	%rax,40(%rdi,%rdx)
YY + 	movnti	%rax,48(%rdi,%rdx)
YY + 	movnti	%rax,56(%rdi,%rdx)
YY + 	addq	$64,%rdx
YY + 	jne	1b
YY + 	sfence
YY + 	POP_FRAME_POINTER
YY + 	ret
YY + END(pagezero8)
YY + 
YY   ENTRY(bcmp)
YY   	PUSH_FRAME_POINTER
YY ***************
YY *** 195,198 ****
YY --- 245,334 ----
YY   END(pagecopy)
YY   
YY + ENTRY(pagecopy1)
YY + 	PUSH_FRAME_POINTER
YY + 	movq	$-PAGE_SIZE,%rax
YY + 	movq	%rax,%rdx
YY + 	subq	%rax,%rdi
YY + 	subq	%rax,%rsi
YY + 2:
YY + 	movq	(%rdi,%rdx),%rax
YY + 	movnti	%rax,(%rsi,%rdx)
YY + 	movq	8(%rdi,%rdx),%rax
YY + 	movnti	%rax,8(%rsi,%rdx)
YY + 	movq	16(%rdi,%rdx),%rax
YY + 	movnti	%rax,16(%rsi,%rdx)
YY + 	movq	24(%rdi,%rdx),%rax
YY + 	movnti	%rax,24(%rsi,%rdx)
YY + 	addq	$32,%rdx
YY + 	jne	2b
YY + 	sfence
YY + 	POP_FRAME_POINTER
YY + 	ret
YY + END(pagecopy1)
YY + 
YY + ENTRY(pagecopy8)
YY + 	PUSH_FRAME_POINTER
YY + 	movq	$-PAGE_SIZE,%rax
YY + 	movq	%rax,%rdx
YY + 	subq	%rax,%rdi
YY + 	subq	%rax,%rsi
YY + 1:
YY + 	prefetchnta (%rdi,%rax)
YY + 	addq	$64,%rax
YY + 	jne	1b
YY + 2:
YY + 	movq	(%rdi,%rdx),%rax
YY + 	movnti	%rax,(%rsi,%rdx)
YY + 	movq	8(%rdi,%rdx),%rax
YY + 	movnti	%rax,8(%rsi,%rdx)
YY + 	movq	16(%rdi,%rdx),%rax
YY + 	movnti	%rax,16(%rsi,%rdx)
YY + 	movq	24(%rdi,%rdx),%rax
YY + 	movnti	%rax,24(%rsi,%rdx)
YY + 	movq	32(%rdi,%rdx),%rax
YY + 	movnti	%rax,32(%rsi,%rdx)
YY + 	movq	40(%rdi,%rdx),%rax
YY + 	movnti	%rax,40(%rsi,%rdx)
YY + 	movq	48(%rdi,%rdx),%rax
YY + 	movnti	%rax,48(%rsi,%rdx)
YY + 	movq	56(%rdi,%rdx),%rax
YY + 	movnti	%rax,56(%rsi,%rdx)
YY + 	addq	$64,%rdx
YY + 	jne	2b
YY + 	sfence
YY + 	POP_FRAME_POINTER
YY + 	ret
YY + END(pagecopy8)
YY + 
YY + ENTRY(pagecopy81)
YY + 	PUSH_FRAME_POINTER
YY + 	movq	$-PAGE_SIZE,%rax
YY + 	movq	%rax,%rdx
YY + 	subq	%rax,%rdi
YY + 	subq	%rax,%rsi
YY + 2:
YY + 	movq	(%rdi,%rdx),%rax
YY + 	movnti	%rax,(%rsi,%rdx)
YY + 	movq	8(%rdi,%rdx),%rax
YY + 	movnti	%rax,8(%rsi,%rdx)
YY + 	movq	16(%rdi,%rdx),%rax
YY + 	movnti	%rax,16(%rsi,%rdx)
YY + 	movq	24(%rdi,%rdx),%rax
YY + 	movnti	%rax,24(%rsi,%rdx)
YY + 	movq	32(%rdi,%rdx),%rax
YY + 	movnti	%rax,32(%rsi,%rdx)
YY + 	movq	40(%rdi,%rdx),%rax
YY + 	movnti	%rax,40(%rsi,%rdx)
YY + 	movq	48(%rdi,%rdx),%rax
YY + 	movnti	%rax,48(%rsi,%rdx)
YY + 	movq	56(%rdi,%rdx),%rax
YY + 	movnti	%rax,56(%rsi,%rdx)
YY + 	addq	$64,%rdx
YY + 	jne	2b
YY + 	sfence
YY + 	POP_FRAME_POINTER
YY + 	ret
YY + END(pagecopy81)
YY + 
YY   /* fillw(pat, base, cnt) */
YY   /*       %rdi,%rsi, %rdx */

Bruce