Date:      Thu, 31 May 2018 21:19:58 +1000 (EST)
From:      Bruce Evans <brde@optusnet.com.au>
To:        Mateusz Guzik <mjg@freebsd.org>
Cc:        src-committers@freebsd.org, svn-src-all@freebsd.org,  svn-src-head@freebsd.org
Subject:   Re: svn commit: r334419 - head/sys/amd64/amd64
Message-ID:  <20180531201225.L2478@besplex.bde.org>
In-Reply-To: <201805310956.w4V9u2rL084194@repo.freebsd.org>
References:  <201805310956.w4V9u2rL084194@repo.freebsd.org>

On Thu, 31 May 2018, Mateusz Guzik wrote:

> Log:
>  amd64: switch pagecopy from non-temporal stores to rep movsq

As for pagezero, this pessimizes for machines with slow movsq and/or caches
(mostly older machines).

>  The copied data is accessed in part soon after and it results with additional
>  cache misses during a -j 1 buildkernel WITHOUT_CTF=yes KERNFAST=1, as measured
>  with pmc stat.

Of course it causes more cache misses later, but for large data, going
through slow caches is much slower, so the cache misses later cost less.

> Modified: head/sys/amd64/amd64/support.S
> ==============================================================================
> --- head/sys/amd64/amd64/support.S	Thu May 31 09:11:21 2018	(r334418)
> +++ head/sys/amd64/amd64/support.S	Thu May 31 09:56:02 2018	(r334419)
> @@ -281,26 +281,12 @@ END(memset)
>  */
> ENTRY(pagecopy)
> 	PUSH_FRAME_POINTER
> -	movq	$-PAGE_SIZE,%rax
> -	movq	%rax,%rdx
> -	subq	%rax,%rdi
> -	subq	%rax,%rsi
> -1:
> -	prefetchnta (%rdi,%rax)
> -	addq	$64,%rax
> -	jne	1b
> -2:
> -	movq	(%rdi,%rdx),%rax
> -	movnti	%rax,(%rsi,%rdx)
> -	movq	8(%rdi,%rdx),%rax
> -	movnti	%rax,8(%rsi,%rdx)
> -	movq	16(%rdi,%rdx),%rax
> -	movnti	%rax,16(%rsi,%rdx)
> -	movq	24(%rdi,%rdx),%rax
> -	movnti	%rax,24(%rsi,%rdx)
> -	addq	$32,%rdx
> -	jne	2b
> -	sfence
> +	movq	$PAGE_SIZE/8,%rcx
> +	movq	%rdi,%r9
> +	movq	%rsi,%rdi
> +	movq	%r9,%rsi
> +	rep
> +	movsq
> 	POP_FRAME_POINTER
> 	ret
> END(pagecopy)

It is negatively useful to write this in asm.  This is now just memcpy()
and the asm version of that is fast enough, though movsq takes too long
to start up.  This memcpy() might be inlined and then it would be
insignificantly faster than the function call.  __builtin_memcpy() won't
actually inline it, since its size is large and compilers know that they
don't understand memory.

This doesn't even pessimize for i386.

I have used many versions of this, with sysctls to control it.  Newer
machines normally select memcpy and older machines normally select
pagecopy.  This change breaks the selection for older machines.  They
actually want nontemporal pagecopy.

My current version is:

XX Index: amd64/amd64/pmap.c
XX ===================================================================
XX --- amd64/amd64/pmap.c	(revision 332488)
XX +++ amd64/amd64/pmap.c	(working copy)
XX @@ -360,6 +360,14 @@
XX  SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
XX      &pg_ps_enabled, 0, "Are large page mappings enabled?");
XX 
XX +static int pagecopy_memcpy;
XX +SYSCTL_INT(_vm_pmap, OID_AUTO, pagecopy_memcpy, CTLFLAG_RW,
XX +    &pagecopy_memcpy, 0, "Use memcpy for pagecopy?");
XX +
XX +static int pagezero_bzero;
XX +SYSCTL_INT(_vm_pmap, OID_AUTO, pagezero_bzero, CTLFLAG_RW,
XX +    &pagezero_bzero, 0, "Use bzero for pagezero?");
XX +
XX  #define	PAT_INDEX_SIZE	8
XX  static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
XX 
XX @@ -5638,7 +5646,10 @@
XX  {
XX  	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
XX 
XX -	pagezero((void *)va);
XX +	if (pagezero_bzero || (cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0)
XX +		pagezero((void *)va);
XX +	else
XX +		sse2_pagezero((void *)va);
XX  }
XX 
XX  /*
XX @@ -5665,7 +5676,10 @@
XX  	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
XX  	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
XX 
XX -	pagecopy((void *)src, (void *)dst);
XX +	if (pagecopy_memcpy || (cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0)
XX +		memcpy((void *)dst, (void *)src, PAGE_SIZE);
XX +	else
XX +		pagecopy((void *)src, (void *)dst);
XX  }
XX 
XX  int unmapped_buf_allowed = 1;
XX Index: i386/i386/pmap.c
XX ===================================================================
XX --- i386/i386/pmap.c	(revision 332488)
XX +++ i386/i386/pmap.c	(working copy)
XX @@ -219,6 +219,10 @@
XX  SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
XX      &pg_ps_enabled, 0, "Are large page mappings enabled?");
XX 
XX +static int pagezero_bzero;
XX +SYSCTL_INT(_vm_pmap, OID_AUTO, pagezero_bzero, CTLFLAG_RW,
XX +    &pagezero_bzero, 0, "Use bzero for pagezero?");
XX +
XX  #define	PAT_INDEX_SIZE	8
XX  static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
XX 
XX @@ -4212,6 +4217,10 @@
XX  static __inline void
XX  pagezero(void *page)
XX  {
XX +	if (pagezero_bzero || (cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0) {
XX +		bzero(page, PAGE_SIZE);
XX +		return;
XX +	}
XX  #if defined(I686_CPU)
XX  	if (cpu_class == CPUCLASS_686) {
XX  		if (cpu_feature & CPUID_SSE2)

i386 doesn't have a nontemporal pagecopy() and this patch doesn't bother
to add one, but in other versions I use:
- nontemporal pagecopy through SSE1 if available (requires slow FPU switch
   but is otherwise better than movnti, starting with not needing SSE2)
- nontemporal pagezero through SSE1 if available (see the sketch below).
- nontemporal pagecopy through SSE2 (see next set of patches)
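
To illustrate the SSE1 variant, here is a minimal userland sketch with
intrinsics (the kernel version must also switch the FPU context, which
is the slow part mentioned above):

	#include <xmmintrin.h>			/* SSE1 intrinsics */

	#define	PAGE_SIZE	4096

	static void
	sse1_pagezero(void *page)
	{
		__m128 zero = _mm_setzero_ps();
		float *p = page;
		int i;

		for (i = 0; i < PAGE_SIZE / 16; i++, p += 4)
			_mm_stream_ps(p, zero);	/* movntps: nontemporal 16-byte store */
		_mm_sfence();			/* order the nontemporal stores */
	}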

This actually defaults to using memcpy() or bzero() according to
(cpu_stdext_feature & CPUID_STDEXT_ERMS).  This is not quite the right
test, but it distinguishes right between all of my slow and fast machines.
Newer machines tend to have CPUID_STDEXT_ERMS.  Cache speed is more important
than fast-strings but newer machines tend to have both.
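
(CPUID_STDEXT_ERMS is bit 9 of %ebx from cpuid leaf 7, subleaf 0, the
"enhanced rep movsb/stosb" feature bit.  The same test outside the
kernel, as a sketch:)

	#include <cpuid.h>		/* gcc/clang cpuid helpers */

	static int
	has_erms(void)
	{
		unsigned int eax, ebx, ecx, edx;

		if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) == 0)
			return (0);
		return ((ebx & (1u << 9)) != 0);	/* CPUID.07H:EBX.ERMS */
	}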

The default and sysctl recover from the loss of nontemporal pagezero on
amd64 and keep the same functionality on i386.  I recovered
nontemporal pagezero as sse2_pagezero on amd64, but it is unreachable
without the above sysctl.  kib didn't like the fast-strings heuristic and
I haven't found or needed a better one.
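
With the patch applied, the heuristic can be overridden at run time,
e.g. to force the plain string functions on a machine without ERMS:

	sysctl vm.pmap.pagecopy_memcpy=1
	sysctl vm.pmap.pagezero_bzero=1

(There is no knob to force the nontemporal path on a machine with ERMS;
the feature bit always wins there.)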

Here is the more elaborate version that I used 2 years ago.  Testing with
hundreds of makeworlds showed that nothing with nontemporal moves was any
good on Haswell.  Nontemporal pagecopy was better for Turion2, but pagecopy
is used so rarely that nontemporal pagecopy is not worth adding for i386
now.

Much older tests showed that nontemporal stores are not worth using for
almost anything, even for older machines than Turion2 where they look useful.
They are useful for data larger than all caches or large enough to go out
of caches before it is used, but the former is rare and the latter is hard
to predict.

SunOS used nontemporal copies for not very large sizes in bcopy()
and/or copyin/out().  I tried that, and couldn't find any threshold
where nontemporal copies worked better.  Nontemporal copies are just
worse for small data, and for large data the choice makes little difference.
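
The dispatch I tried was of this form (a sketch; bcopy_nt() and
nt_threshold are hypothetical names, and no threshold value I tested
was a win):

	#include <emmintrin.h>		/* SSE2 intrinsics */
	#include <string.h>

	static size_t nt_threshold = 64 * 1024;	/* hypothetical tunable */

	/*
	 * movntdq-based copy; assumes len is a multiple of 16 and that both
	 * pointers are 16-byte aligned, as page-sized copies are.
	 */
	static void
	copy_nontemporal(const void *src, void *dst, size_t len)
	{
		const __m128i *s = src;
		__m128i *d = dst;
		size_t i;

		for (i = 0; i < len / 16; i++)
			_mm_stream_si128(&d[i], _mm_load_si128(&s[i]));
		_mm_sfence();
	}

	static void
	bcopy_nt(const void *src, void *dst, size_t len)
	{

		if (len >= nt_threshold)
			copy_nontemporal(src, dst, len);
		else
			memcpy(dst, src, len);
	}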

pagecopy() is more predictable but still hard to predict.  A single page
is too small for nontemporal copying.  There might be a large process
with a few GB of page faults all at once, but that is rare, especially
with the non-bloatware that I use.

YY diff -c2 ./amd64/amd64/pmap.c~ ./amd64/amd64/pmap.c
YY *** ./amd64/amd64/pmap.c~	Sat Jan 23 05:12:31 2016
YY --- ./amd64/amd64/pmap.c	Sun Feb 14 17:22:07 2016
YY ***************
YY *** 352,355 ****
YY --- 352,395 ----
YY       &pg_ps_enabled, 0, "Are large page mappings enabled?");
YY 
YY + static u_long pagecopy_calls;
YY + SYSCTL_ULONG(_vm_pmap, OID_AUTO, pagecopy_calls, CTLFLAG_RW,
YY +     &pagecopy_calls, 0, "Number of calls to pmap_copy_page");
YY + 
YY + static int pagecopy_memcpy;
YY + SYSCTL_INT(_vm_pmap, OID_AUTO, pagecopy_memcpy, CTLFLAG_RW,
YY +     &pagecopy_memcpy, 0, "Use memcpy for pagecopy?");
YY + 
YY + static int pagecopy_pagecopy1;
YY + SYSCTL_INT(_vm_pmap, OID_AUTO, pagecopy_pagecopy1, CTLFLAG_RW,
YY +     &pagecopy_pagecopy1, 0, "Use pagecopy1 for pagecopy?");
YY + 
YY + static int pagecopy_pagecopy8;
YY + SYSCTL_INT(_vm_pmap, OID_AUTO, pagecopy_pagecopy8, CTLFLAG_RW,
YY +     &pagecopy_pagecopy8, 0, "Use pagecopy8 for pagecopy?");
YY + 
YY + static int pagecopy_pagecopy81;
YY + SYSCTL_INT(_vm_pmap, OID_AUTO, pagecopy_pagecopy81, CTLFLAG_RW,
YY +     &pagecopy_pagecopy81, 0, "Use pagecopy81 for pagecopy?");
YY + 
YY + static u_long pagezero_calls;
YY + SYSCTL_ULONG(_vm_pmap, OID_AUTO, pagezero_calls, CTLFLAG_RW,
YY +     &pagezero_calls, 0, "Number of calls to pagezero");
YY + 
YY + static int pagezero_pagezero1;
YY + SYSCTL_INT(_vm_pmap, OID_AUTO, pagezero_pagezero1, CTLFLAG_RW,
YY +     &pagezero_pagezero1, 0, "Use pagezero1 for pagezero?");
YY + 
YY + static int pagezero_pagezero2;
YY + SYSCTL_INT(_vm_pmap, OID_AUTO, pagezero_pagezero2, CTLFLAG_RW,
YY +     &pagezero_pagezero2, 0, "Use pagezero2 for pagezero?");
YY + 
YY + static int pagezero_pagezero8;
YY + SYSCTL_INT(_vm_pmap, OID_AUTO, pagezero_pagezero8, CTLFLAG_RW,
YY +     &pagezero_pagezero8, 0, "Use pagezero8 for pagezero?");
YY + 
YY + static int pagezero_bzero;
YY + SYSCTL_INT(_vm_pmap, OID_AUTO, pagezero_bzero, CTLFLAG_RW,
YY +     &pagezero_bzero, 0, "Use bzero for pagezero?");
YY + 
YY   #define	PAT_INDEX_SIZE	8
YY   static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
YY ***************
YY *** 4989,4995 ****
YY   }
YY 
YY   /*
YY !  *	pmap_zero_page zeros the specified hardware page by mapping
YY !  *	the page into KVM and using bzero to clear its contents.
YY    */
YY   void
YY --- 5029,5038 ----
YY   }
YY 
YY + void pagezero1(void *);
YY + void pagezero2(void *);
YY + void pagezero8(void *);
YY + 
YY   /*
YY !  * Zero the specified hardware page.
YY    */
YY   void
YY ***************
YY *** 4998,5009 ****
YY   	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
YY 
YY ! 	pagezero((void *)va);
YY   }
YY 
YY   /*
YY !  *	pmap_zero_page_area zeros the specified hardware page by mapping 
YY !  *	the page into KVM and using bzero to clear its contents.
YY !  *
YY !  *	off and size may not cover an area beyond a single hardware page.
YY    */
YY   void
YY --- 5041,5060 ----
YY   	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
YY 
YY ! 	pagezero_calls++;
YY ! 	if (pagezero_pagezero1)
YY ! 		pagezero1((void *)va);
YY ! 	else if (pagezero_pagezero2)
YY ! 		pagezero2((void *)va);
YY ! 	else if (pagezero_pagezero8)
YY ! 		pagezero8((void *)va);
YY ! 	else if (pagezero_bzero)
YY ! 		bzero((void *)va, PAGE_SIZE);
YY ! 	else
YY ! 		pagezero((void *)va);
YY   }
YY 
YY   /*
YY !  * Zero an an area within a single hardware page.  off and size must not
YY !  * cover an area beyond a single hardware page.
YY    */
YY   void
YY ***************
YY *** 5032,5035 ****
YY --- 5083,5090 ----
YY   }
YY 
YY + void pagecopy1(void *, void *);
YY + void pagecopy8(void *, void *);
YY + void pagecopy81(void *, void *);
YY + 
YY   /*
YY    *	pmap_copy_page copies the specified (machine independent)
YY ***************
YY *** 5044,5048 ****
YY   	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
YY 
YY ! 	pagecopy((void *)src, (void *)dst);
YY   }
YY 
YY --- 5099,5113 ----
YY   	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
YY 
YY ! 	pagecopy_calls++;
YY ! 	if (pagecopy_pagecopy1)
YY ! 		pagecopy1((void *)src, (void *)dst);
YY ! 	else if (pagecopy_pagecopy8)
YY ! 		pagecopy8((void *)src, (void *)dst);
YY ! 	else if (pagecopy_pagecopy81)
YY ! 		pagecopy81((void *)src, (void *)dst);
YY ! 	else if (pagecopy_memcpy)
YY ! 		memcpy((void *)dst, (void *)src, PAGE_SIZE);
YY ! 	else
YY ! 		pagecopy((void *)src, (void *)dst);
YY   }
YY 
YY diff -c2 ./amd64/amd64/support.S~ ./amd64/amd64/support.S
YY *** ./amd64/amd64/support.S~	Wed Feb 24 22:35:30 2016
YY --- ./amd64/amd64/support.S	Wed Feb 24 22:47:10 2016
YY ***************
YY *** 80,83 ****
YY --- 80,133 ----
YY   END(pagezero)
YY 
YY + ENTRY(pagezero1)
YY + 	PUSH_FRAME_POINTER
YY + 	movq	$-PAGE_SIZE,%rdx
YY + 	subq	%rdx,%rdi
YY + 	xorl	%eax,%eax
YY + 1:
YY + 	movnti	%rax,(%rdi,%rdx)
YY + 	addq	$8,%rdx
YY + 	jne	1b
YY + 	sfence
YY + 	POP_FRAME_POINTER
YY + 	ret
YY + END(pagezero1)
YY + 
YY + ENTRY(pagezero2)
YY + 	PUSH_FRAME_POINTER
YY + 	movq	$-PAGE_SIZE,%rdx
YY + 	subq	%rdx,%rdi
YY + 	xorl	%eax,%eax
YY + 1:
YY + 	movnti	%rax,(%rdi,%rdx)
YY + 	movnti	%rax,8(%rdi,%rdx)
YY + 	addq	$16,%rdx
YY + 	jne	1b
YY + 	sfence
YY + 	POP_FRAME_POINTER
YY + 	ret
YY + END(pagezero2)
YY + 
YY + ENTRY(pagezero8)
YY + 	PUSH_FRAME_POINTER
YY + 	movq	$-PAGE_SIZE,%rdx
YY + 	subq	%rdx,%rdi
YY + 	xorl	%eax,%eax
YY + 1:
YY + 	movnti	%rax,(%rdi,%rdx)
YY + 	movnti	%rax,8(%rdi,%rdx)
YY + 	movnti	%rax,16(%rdi,%rdx)
YY + 	movnti	%rax,24(%rdi,%rdx)
YY + 	movnti	%rax,32(%rdi,%rdx)
YY + 	movnti	%rax,40(%rdi,%rdx)
YY + 	movnti	%rax,48(%rdi,%rdx)
YY + 	movnti	%rax,56(%rdi,%rdx)
YY + 	addq	$64,%rdx
YY + 	jne	1b
YY + 	sfence
YY + 	POP_FRAME_POINTER
YY + 	ret
YY + END(pagezero8)
YY + 
YY   ENTRY(bcmp)
YY   	PUSH_FRAME_POINTER
YY ***************
YY *** 195,198 ****
YY --- 245,334 ----
YY   END(pagecopy)
YY 
YY + ENTRY(pagecopy1)
YY + 	PUSH_FRAME_POINTER
YY + 	movq	$-PAGE_SIZE,%rax
YY + 	movq	%rax,%rdx
YY + 	subq	%rax,%rdi
YY + 	subq	%rax,%rsi
YY + 2:
YY + 	movq	(%rdi,%rdx),%rax
YY + 	movnti	%rax,(%rsi,%rdx)
YY + 	movq	8(%rdi,%rdx),%rax
YY + 	movnti	%rax,8(%rsi,%rdx)
YY + 	movq	16(%rdi,%rdx),%rax
YY + 	movnti	%rax,16(%rsi,%rdx)
YY + 	movq	24(%rdi,%rdx),%rax
YY + 	movnti	%rax,24(%rsi,%rdx)
YY + 	addq	$32,%rdx
YY + 	jne	2b
YY + 	sfence
YY + 	POP_FRAME_POINTER
YY + 	ret
YY + END(pagecopy1)
YY + 
YY + ENTRY(pagecopy8)
YY + 	PUSH_FRAME_POINTER
YY + 	movq	$-PAGE_SIZE,%rax
YY + 	movq	%rax,%rdx
YY + 	subq	%rax,%rdi
YY + 	subq	%rax,%rsi
YY + 1:
YY + 	prefetchnta (%rdi,%rax)
YY + 	addq	$64,%rax
YY + 	jne	1b
YY + 2:
YY + 	movq	(%rdi,%rdx),%rax
YY + 	movnti	%rax,(%rsi,%rdx)
YY + 	movq	8(%rdi,%rdx),%rax
YY + 	movnti	%rax,8(%rsi,%rdx)
YY + 	movq	16(%rdi,%rdx),%rax
YY + 	movnti	%rax,16(%rsi,%rdx)
YY + 	movq	24(%rdi,%rdx),%rax
YY + 	movnti	%rax,24(%rsi,%rdx)
YY + 	movq	32(%rdi,%rdx),%rax
YY + 	movnti	%rax,32(%rsi,%rdx)
YY + 	movq	40(%rdi,%rdx),%rax
YY + 	movnti	%rax,40(%rsi,%rdx)
YY + 	movq	48(%rdi,%rdx),%rax
YY + 	movnti	%rax,48(%rsi,%rdx)
YY + 	movq	56(%rdi,%rdx),%rax
YY + 	movnti	%rax,56(%rsi,%rdx)
YY + 	addq	$64,%rdx
YY + 	jne	2b
YY + 	sfence
YY + 	POP_FRAME_POINTER
YY + 	ret
YY + END(pagecopy8)
YY + 
YY + ENTRY(pagecopy81)
YY + 	PUSH_FRAME_POINTER
YY + 	movq	$-PAGE_SIZE,%rax
YY + 	movq	%rax,%rdx
YY + 	subq	%rax,%rdi
YY + 	subq	%rax,%rsi
YY + 2:
YY + 	movq	(%rdi,%rdx),%rax
YY + 	movnti	%rax,(%rsi,%rdx)
YY + 	movq	8(%rdi,%rdx),%rax
YY + 	movnti	%rax,8(%rsi,%rdx)
YY + 	movq	16(%rdi,%rdx),%rax
YY + 	movnti	%rax,16(%rsi,%rdx)
YY + 	movq	24(%rdi,%rdx),%rax
YY + 	movnti	%rax,24(%rsi,%rdx)
YY + 	movq	32(%rdi,%rdx),%rax
YY + 	movnti	%rax,32(%rsi,%rdx)
YY + 	movq	40(%rdi,%rdx),%rax
YY + 	movnti	%rax,40(%rsi,%rdx)
YY + 	movq	48(%rdi,%rdx),%rax
YY + 	movnti	%rax,48(%rsi,%rdx)
YY + 	movq	56(%rdi,%rdx),%rax
YY + 	movnti	%rax,56(%rsi,%rdx)
YY + 	addq	$64,%rdx
YY + 	jne	2b
YY + 	sfence
YY + 	POP_FRAME_POINTER
YY + 	ret
YY + END(pagecopy81)
YY + 
YY   /* fillw(pat, base, cnt) */
YY   /*       %rdi,%rsi, %rdx */

Bruce


