From owner-svn-src-all@freebsd.org  Thu Oct 11 23:37:58 2018
From: Mateusz Guzik <mjg@FreeBSD.org>
Date: Thu, 11 Oct 2018 23:37:58 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject: svn commit: r339322 - head/sys/amd64/amd64
Message-Id: <201810112337.w9BNbwP8004024@repo.freebsd.org>
X-SVN-Group: head
X-SVN-Commit-Author: mjg
X-SVN-Commit-Paths: head/sys/amd64/amd64
X-SVN-Commit-Revision: 339322
X-SVN-Commit-Repository: base

Author: mjg
Date: Thu Oct 11 23:37:57 2018
New Revision: 339322
URL: https://svnweb.freebsd.org/changeset/base/339322

Log:
  amd64: make memmove and memcpy less slow with mov

  The reasoning is the same as with the memset change, see r339205

  Reviewed by:	kib (previous version)
  Approved by:	re (gjb)
  Sponsored by:	The FreeBSD Foundation
  Differential Revision:	https://reviews.freebsd.org/D17441

Modified:
  head/sys/amd64/amd64/support.S

Modified: head/sys/amd64/amd64/support.S
==============================================================================
--- head/sys/amd64/amd64/support.S	Thu Oct 11 23:28:04 2018	(r339321)
+++ head/sys/amd64/amd64/support.S	Thu Oct 11 23:37:57 2018	(r339322)
@@ -200,82 +200,236 @@ END(memcmp)
  * Adapted from bcopy written by:
  *  ws@tools.de     (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
  */
-ENTRY(memmove_std)
-	PUSH_FRAME_POINTER
-	movq	%rdi,%rax
-	movq	%rdx,%rcx
+/*
+ * Register state at entry is supposed to be as follows:
+ * rdi - destination
+ * rsi - source
+ * rdx - count
+ *
+ * The macro possibly clobbers the above and: rcx, r8.
+ * It does not clobber rax, r10 nor r11.
+ */
+.macro MEMMOVE erms overlap begin end
+	\begin
+.if \overlap == 1
 	movq	%rdi,%r8
 	subq	%rsi,%r8
-	cmpq	%rcx,%r8			/* overlapping && src < dst? */
+	cmpq	%rcx,%r8	/* overlapping && src < dst? */
 	jb	2f
+.endif
 
-	cmpq	$15,%rcx
-	jbe	1f
-	shrq	$3,%rcx				/* copy by 64-bit words */
-	rep
-	movsq
-	movq	%rdx,%rcx
-	andq	$7,%rcx				/* any bytes left? */
-	jne	1f
-	POP_FRAME_POINTER
+	cmpq	$32,%rcx
+	jb	1016f
+
+	cmpq	$256,%rcx
+	ja	1256f
+
+1032:
+	movq	(%rsi),%rdx
+	movq	%rdx,(%rdi)
+	movq	8(%rsi),%rdx
+	movq	%rdx,8(%rdi)
+	movq	16(%rsi),%rdx
+	movq	%rdx,16(%rdi)
+	movq	24(%rsi),%rdx
+	movq	%rdx,24(%rdi)
+	leaq	32(%rsi),%rsi
+	leaq	32(%rdi),%rdi
+	subq	$32,%rcx
+	cmpq	$32,%rcx
+	jae	1032b
+	cmpb	$0,%cl
+	jne	1016f
+	\end
 	ret
 	ALIGN_TEXT
-1:
+1016:
+	cmpb	$16,%cl
+	jl	1008f
+	movq	(%rsi),%rdx
+	movq	%rdx,(%rdi)
+	movq	8(%rsi),%rdx
+	movq	%rdx,8(%rdi)
+	subb	$16,%cl
+	jz	1000f
+	leaq	16(%rsi),%rsi
+	leaq	16(%rdi),%rdi
+1008:
+	cmpb	$8,%cl
+	jl	1004f
+	movq	(%rsi),%rdx
+	movq	%rdx,(%rdi)
+	subb	$8,%cl
+	jz	1000f
+	leaq	8(%rsi),%rsi
+	leaq	8(%rdi),%rdi
+1004:
+	cmpb	$4,%cl
+	jl	1002f
+	movl	(%rsi),%edx
+	movl	%edx,(%rdi)
+	subb	$4,%cl
+	jz	1000f
+	leaq	4(%rsi),%rsi
+	leaq	4(%rdi),%rdi
+1002:
+	cmpb	$2,%cl
+	jl	1001f
+	movw	(%rsi),%dx
+	movw	%dx,(%rdi)
+	subb	$2,%cl
+	jz	1000f
+	leaq	2(%rsi),%rsi
+	leaq	2(%rdi),%rdi
+1001:
+	cmpb	$1,%cl
+	jl	1000f
+	movb	(%rsi),%dl
+	movb	%dl,(%rdi)
+1000:
+	\end
+	ret
+
+	ALIGN_TEXT
+1256:
+.if \erms == 1
 	rep
 	movsb
-	POP_FRAME_POINTER
+.else
+	shrq	$3,%rcx				/* copy by 64-bit words */
+	rep
+	movsq
+	movq	%rdx,%rcx
+	andb	$7,%cl				/* any bytes left? */
+	jne	1004b
+.endif
+	\end
 	ret
 
-	/* ALIGN_TEXT */
+.if \overlap == 1
+	/*
+	 * Copy backwards.
+	 */
+	ALIGN_TEXT
 2:
-	addq	%rcx,%rdi			/* copy backwards */
+	addq	%rcx,%rdi
 	addq	%rcx,%rsi
+
+	cmpq	$32,%rcx
+	jb	2016f
+
+	cmpq	$256,%rcx
+	ja	2256f
+
+2032:
+	movq	-8(%rsi),%rdx
+	movq	%rdx,-8(%rdi)
+	movq	-16(%rsi),%rdx
+	movq	%rdx,-16(%rdi)
+	movq	-24(%rsi),%rdx
+	movq	%rdx,-24(%rdi)
+	movq	-32(%rsi),%rdx
+	movq	%rdx,-32(%rdi)
+	leaq	-32(%rsi),%rsi
+	leaq	-32(%rdi),%rdi
+	subq	$32,%rcx
+	cmpq	$32,%rcx
+	jae	2032b
+	cmpb	$0,%cl
+	jne	2016f
+	\end
+	ret
+	ALIGN_TEXT
+2016:
+	cmpb	$16,%cl
+	jl	2008f
+	movq	-8(%rsi),%rdx
+	movq	%rdx,-8(%rdi)
+	movq	-16(%rsi),%rdx
+	movq	%rdx,-16(%rdi)
+	subb	$16,%cl
+	jz	2000f
+	leaq	-16(%rsi),%rsi
+	leaq	-16(%rdi),%rdi
+2008:
+	cmpb	$8,%cl
+	jl	2004f
+	movq	-8(%rsi),%rdx
+	movq	%rdx,-8(%rdi)
+	subb	$8,%cl
+	jz	2000f
+	leaq	-8(%rsi),%rsi
+	leaq	-8(%rdi),%rdi
+2004:
+	cmpb	$4,%cl
+	jl	2002f
+	movl	-4(%rsi),%edx
+	movl	%edx,-4(%rdi)
+	subb	$4,%cl
+	jz	2000f
+	leaq	-4(%rsi),%rsi
+	leaq	-4(%rdi),%rdi
+2002:
+	cmpb	$2,%cl
+	jl	2001f
+	movw	-2(%rsi),%dx
+	movw	%dx,-2(%rdi)
+	subb	$2,%cl
+	jz	2000f
+	leaq	-2(%rsi),%rsi
+	leaq	-2(%rdi),%rdi
+2001:
+	cmpb	$1,%cl
+	jl	2000f
+	movb	-1(%rsi),%dl
+	movb	%dl,-1(%rdi)
+2000:
+	\end
+	ret
+	ALIGN_TEXT
+2256:
 	decq	%rdi
 	decq	%rsi
 	std
-	andq	$7,%rcx				/* any fractional bytes? */
+.if \erms == 1
+	rep
+	movsb
+.else
+	andq	$7,%rcx				/* any fractional bytes? */
 	je	3f
 	rep
 	movsb
 3:
-	movq	%rdx,%rcx			/* copy remainder by 32-bit words */
+	movq	%rdx,%rcx	/* copy remainder by 32-bit words */
 	shrq	$3,%rcx
 	subq	$7,%rsi
 	subq	$7,%rdi
 	rep
 	movsq
+.endif
 	cld
-	POP_FRAME_POINTER
+	\end
 	ret
-END(memmove_std)
+.endif
+.endm
 
-ENTRY(memmove_erms)
+.macro MEMMOVE_BEGIN
 	PUSH_FRAME_POINTER
 	movq	%rdi,%rax
 	movq	%rdx,%rcx
+.endm
 
-	movq	%rdi,%r8
-	subq	%rsi,%r8
-	cmpq	%rcx,%r8			/* overlapping && src < dst? */
-	jb	1f
-
-	rep
-	movsb
+.macro MEMMOVE_END
 	POP_FRAME_POINTER
-	ret
+.endm
 
-1:
-	addq	%rcx,%rdi			/* copy backwards */
-	addq	%rcx,%rsi
-	decq	%rdi
-	decq	%rsi
-	std
-	rep
-	movsb
-	cld
-	POP_FRAME_POINTER
-	ret
+ENTRY(memmove_std)
+	MEMMOVE erms=0 overlap=1 begin=MEMMOVE_BEGIN end=MEMMOVE_END
+END(memmove_std)
+
+ENTRY(memmove_erms)
+	MEMMOVE erms=1 overlap=1 begin=MEMMOVE_BEGIN end=MEMMOVE_END
 END(memmove_erms)
 
 /*
@@ -285,35 +439,11 @@ END(memmove_erms)
  * Note: memcpy does not support overlapping copies
  */
 ENTRY(memcpy_std)
-	PUSH_FRAME_POINTER
-	movq	%rdi,%rax
-	movq	%rdx,%rcx
-	cmpq	$15,%rcx
-	jbe	1f
-	shrq	$3,%rcx				/* copy by 64-bit words */
-	rep
-	movsq
-	movq	%rdx,%rcx
-	andq	$7,%rcx				/* any bytes left? */
-	jne	1f
-	POP_FRAME_POINTER
-	ret
-	ALIGN_TEXT
-1:
-	rep
-	movsb
-	POP_FRAME_POINTER
-	ret
+	MEMMOVE erms=0 overlap=0 begin=MEMMOVE_BEGIN end=MEMMOVE_END
 END(memcpy_std)
 
 ENTRY(memcpy_erms)
-	PUSH_FRAME_POINTER
-	movq	%rdi,%rax
-	movq	%rdx,%rcx
-	rep
-	movsb
-	POP_FRAME_POINTER
-	ret
+	MEMMOVE erms=1 overlap=0 begin=MEMMOVE_BEGIN end=MEMMOVE_END
 END(memcpy_erms)
 
 /*
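
For readers who do not want to trace the assembly, the new MEMMOVE macro boils
down to a size-tiered copy: below 32 bytes a 16/8/4/2/1 ladder of plain moves,
32-256 bytes an unrolled 32-byte loop, and above 256 bytes a rep movsb (erms)
or rep movsq (std) path, with an equivalent backward walk when the buffers
overlap and src < dst.  The following C sketch is illustrative only, not
kernel code; sketch_memmove and its erms flag are invented stand-ins for the
*_erms/*_std entry points the kernel selects between at boot:

/*
 * Illustrative sketch of the size-tiered strategy; NOT the kernel's code.
 */
#include <stddef.h>
#include <stdint.h>

void *
sketch_memmove(void *dst, const void *src, size_t len, int erms)
{
	unsigned char *d = dst;
	const unsigned char *s = src;
	size_t i;

	/* Unsigned "dst - src < len" test, as the macro emits for overlap=1. */
	if ((uintptr_t)d - (uintptr_t)s < len) {
		while (len--)			/* label 2: copy backwards */
			d[len] = s[len];
		return (dst);
	}

	if (len >= 32) {
		if (len > 256) {
			/*
			 * Large copies: label 1256; rep movsb with ERMS,
			 * otherwise rep movsq plus a byte tail.  A plain
			 * byte loop stands in for both here.
			 */
			(void)erms;
			for (i = 0; i < len; i++)
				d[i] = s[i];
			return (dst);
		}
		/* 32..256 bytes: the unrolled 32-byte loop at label 1032. */
		while (len >= 32) {
			for (i = 0; i < 32; i++)
				d[i] = s[i];
			d += 32;
			s += 32;
			len -= 32;
		}
	}
	/* Sub-32-byte tail: the 16/8/4/2/1 ladder at labels 1016..1001. */
	for (i = 0; i < len; i++)
		d[i] = s[i];
	return (dst);
}

The cut-offs presumably follow the same reasoning referenced from r339205:
rep movs has a noticeable startup cost, so short copies come out ahead with
plain mov instructions, while copies past a few hundred bytes amortize that
cost and can rely on rep movsb on ERMS-capable CPUs (rep movsq otherwise).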