From owner-svn-src-all@freebsd.org  Fri Oct  5 19:25:10 2018
Return-Path: <owner-svn-src-all@freebsd.org>
Delivered-To: svn-src-all@mailman.ysv.freebsd.org
Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2610:1c1:1:606c::19:1])
 by mailman.ysv.freebsd.org (Postfix) with ESMTP id 2EBA110B4788;
 Fri,  5 Oct 2018 19:25:10 +0000 (UTC) (envelope-from mjg@FreeBSD.org)
Received: from mxrelay.nyi.freebsd.org (mxrelay.nyi.freebsd.org
 [IPv6:2610:1c1:1:606c::19:3])
 (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
 (Client CN "mxrelay.nyi.freebsd.org",
 Issuer "Let's Encrypt Authority X3" (verified OK))
 by mx1.freebsd.org (Postfix) with ESMTPS id D92168D6D1;
 Fri,  5 Oct 2018 19:25:09 +0000 (UTC) (envelope-from mjg@FreeBSD.org)
Received: from repo.freebsd.org (repo.freebsd.org
 [IPv6:2610:1c1:1:6068::e6a:0])
 (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
 (Client did not present a certificate)
 by mxrelay.nyi.freebsd.org (Postfix) with ESMTPS id D03081984B;
 Fri,  5 Oct 2018 19:25:09 +0000 (UTC) (envelope-from mjg@FreeBSD.org)
Received: from repo.freebsd.org ([127.0.1.37])
 by repo.freebsd.org (8.15.2/8.15.2) with ESMTP id w95JP9Cf055768;
 Fri, 5 Oct 2018 19:25:09 GMT (envelope-from mjg@FreeBSD.org)
Received: (from mjg@localhost)
 by repo.freebsd.org (8.15.2/8.15.2/Submit) id w95JP9hG055767;
 Fri, 5 Oct 2018 19:25:09 GMT (envelope-from mjg@FreeBSD.org)
Message-Id: <201810051925.w95JP9hG055767@repo.freebsd.org>
X-Authentication-Warning: repo.freebsd.org: mjg set sender to mjg@FreeBSD.org
 using -f
From: Mateusz Guzik <mjg@FreeBSD.org>
Date: Fri, 5 Oct 2018 19:25:09 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-all@freebsd.org,
 svn-src-head@freebsd.org
Subject: svn commit: r339205 - head/sys/amd64/amd64
X-SVN-Group: head
X-SVN-Commit-Author: mjg
X-SVN-Commit-Paths: head/sys/amd64/amd64
X-SVN-Commit-Revision: 339205
X-SVN-Commit-Repository: base
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
X-BeenThere: svn-src-all@freebsd.org
X-Mailman-Version: 2.1.27
Precedence: list
List-Id: "SVN commit messages for the entire src tree \(except for
 \"user\" and \"projects\"\)" <svn-src-all.freebsd.org>
List-Unsubscribe: <https://lists.freebsd.org/mailman/options/svn-src-all>,
 <mailto:svn-src-all-request@freebsd.org?subject=unsubscribe>
List-Archive: <http://lists.freebsd.org/pipermail/svn-src-all/>
List-Post: <mailto:svn-src-all@freebsd.org>
List-Help: <mailto:svn-src-all-request@freebsd.org?subject=help>
List-Subscribe: <https://lists.freebsd.org/mailman/listinfo/svn-src-all>,
 <mailto:svn-src-all-request@freebsd.org?subject=subscribe>
X-List-Received-Date: Fri, 05 Oct 2018 19:25:10 -0000

Author: mjg
Date: Fri Oct  5 19:25:09 2018
New Revision: 339205
URL: https://svnweb.freebsd.org/changeset/base/339205

Log:
  amd64: make memset less slow with mov
  
  rep stos has a high startup time even on modern microarchitectures like
  Skylake. Intel's optimization manuals discuss how, for small sizes, it is
  beneficial to use streaming stores. Since those cannot be used in the
  kernel without an extra penalty, I investigated the performance impact of
  plain movs instead.
  
  The patch below implements a very simple scheme: a 32-byte loop followed
  by filling in the remainder of at most 31 bytes. Sizes above 256 bytes
  fall back to rep stos. It provides a significant win over the current
  primitive on several machines I tested (both Intel and AMD). A 64-byte
  loop did not provide any benefit, even for sizes that are multiples of
  64.
  
  See the review for benchmark data.
  
  Reviewed by:	kib
  Approved by:	re (gjb)
  Sponsored by:	The FreeBSD Foundation
  Differential Revision:	https://reviews.freebsd.org/D17398

Modified:
  head/sys/amd64/amd64/support.S

Modified: head/sys/amd64/amd64/support.S
==============================================================================
--- head/sys/amd64/amd64/support.S	Fri Oct  5 18:15:44 2018	(r339204)
+++ head/sys/amd64/amd64/support.S	Fri Oct  5 19:25:09 2018	(r339205)
@@ -320,43 +320,92 @@ END(memcpy_erms)
  * memset(dst, c,   len)
  *        rdi, rsi, rdx
  */
-ENTRY(memset_std)
+.macro MEMSET erms
 	PUSH_FRAME_POINTER
 	movq	%rdi,%r9
 	movq	%rdx,%rcx
 	movzbq	%sil,%r8
 	movabs	$0x0101010101010101,%rax
 	imulq	%r8,%rax
-	cmpq	$15,%rcx
-	jbe	1f
-	shrq	$3,%rcx
-	rep
-	stosq
-	movq	%rdx,%rcx
-	andq	$7,%rcx
-	jne	1f
+
+	cmpq	$32,%rcx
+	jb	1016f
+
+	cmpq	$256,%rcx
+	ja	1256f
+
+1032:
+	movq	%rax,(%rdi)
+	movq	%rax,8(%rdi)
+	movq	%rax,16(%rdi)
+	movq	%rax,24(%rdi)
+	leaq	32(%rdi),%rdi
+	subq	$32,%rcx
+	cmpq	$32,%rcx
+	jae	1032b
+	cmpb	$0,%cl
+	je	1000f
+1016:
+	cmpb	$16,%cl
+	jl	1008f
+	movq	%rax,(%rdi)
+	movq	%rax,8(%rdi)
+	subb	$16,%cl
+	jz	1000f
+	leaq	16(%rdi),%rdi
+1008:
+	cmpb	$8,%cl
+	jl	1004f
+	movq	%rax,(%rdi)
+	subb	$8,%cl
+	jz	1000f
+	leaq	8(%rdi),%rdi
+1004:
+	cmpb	$4,%cl
+	jl	1002f
+	movl	%eax,(%rdi)
+	subb	$4,%cl
+	jz	1000f
+	leaq	4(%rdi),%rdi
+1002:
+	cmpb	$2,%cl
+	jl	1001f
+	movw	%ax,(%rdi)
+	subb	$2,%cl
+	jz	1000f
+	leaq	2(%rdi),%rdi
+1001:
+	cmpb	$1,%cl
+	jl	1000f
+	movb	%al,(%rdi)
+1000:
 	movq	%r9,%rax
 	POP_FRAME_POINTER
 	ret
 	ALIGN_TEXT
-1:
+1256:
+.if \erms == 1
 	rep
 	stosb
+.else
+	shrq	$3,%rcx
+	rep
+	stosq
+	movq	%rdx,%rcx
+	andb	$7,%cl
+	jne	1004b
+.endif
 	movq	%r9,%rax
 	POP_FRAME_POINTER
 	ret
+.endm
+
+ENTRY(memset_std)
+	MEMSET erms=0
 END(memset_std)
 
 ENTRY(memset_erms)
-	PUSH_FRAME_POINTER
-	movq	%rdi,%r9
-	movq	%rdx,%rcx
-	movb	%sil,%al
-	rep
-	stosb
-	movq	%r9,%rax
-	POP_FRAME_POINTER
-	ret
+	MEMSET erms=1
 END(memset_erms)
 
 /* fillw(pat, base, cnt) */