Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 10 Apr 2021 13:58:33 GMT
From:      Mateusz Guzik <mjg@FreeBSD.org>
To:        src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-branches@FreeBSD.org
Subject:   git: 9535440569ca - stable/13 - amd64: implement strlen in assembly, take 2
Message-ID:  <202104101358.13ADwXro041180@gitrepo.freebsd.org>

next in thread | raw e-mail | index | archive | help
The branch stable/13 has been updated by mjg:

URL: https://cgit.FreeBSD.org/src/commit/?id=9535440569ca468b12030142d6631704b658ece6

commit 9535440569ca468b12030142d6631704b658ece6
Author:     Mateusz Guzik <mjg@FreeBSD.org>
AuthorDate: 2021-04-10 13:52:49 +0000
Commit:     Mateusz Guzik <mjg@FreeBSD.org>
CommitDate: 2021-04-10 13:53:46 +0000

    amd64: implement strlen in assembly, take 2
    
    Tested with glibc test suite.
    
    The C variant in libkern performs excessive branching to find the zero
    byte instead of using the bsfq instruction. The same code patched to use
    it is still slower than the routine implemented here as the compiler
    keeps neglecting to perform certain optimizations (like using leaq).
    
    On top of that the routine can be used as a starting point for copyinstr
    which operates on words instead of bytes.
    
    The previous attempt had an instance of swapped operands to andq when
    dealing with the fully aligned case, which had the side effect of
    breaking the code for certain corner cases. Noted by jrtc27.
    
    Sample results:
    
    $(perl -e "print 'A' x 3"):
    stock:  211198039
    patched:338626619
    asm:    465609618
    
    $(perl -e "print 'A' x 100"):
    stock:   83151997
    patched: 98285919
    asm:    120719888
    
    Reviewed by:    jhb, kib
    Differential Revision:  https://reviews.freebsd.org/D28779
    
    (cherry picked from commit 5fa12fe0cd203efcbb2ac21e7c3e3fb9b2f801ae)
---
 sys/amd64/amd64/support.S | 66 +++++++++++++++++++++++++++++++++++++++++++++++
 sys/conf/files            |  1 -
 sys/conf/files.arm        |  1 +
 sys/conf/files.i386       |  1 +
 sys/conf/files.mips       |  1 +
 sys/conf/files.powerpc    |  1 +
 sys/conf/files.riscv      |  1 +
 7 files changed, 71 insertions(+), 1 deletion(-)

diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S
index b623fba277db..4c0f7da87ef8 100644
--- a/sys/amd64/amd64/support.S
+++ b/sys/amd64/amd64/support.S
@@ -697,6 +697,72 @@ ENTRY(fillw)
 	ret
 END(fillw)
 
+/*
+ * strlen(string)
+ *	  %rdi
+ *
+ * Uses the ((x - 0x01....01) & ~x & 0x80....80) trick.
+ *
+ * 0x01....01 is replaced with 0x0 - 0x01....01 so that it can be added
+ * with leaq.
+ *
+ * For a description see either:
+ * - "Hacker's Delight" by Henry S. Warren, Jr.
+ * - "Optimizing subroutines in assembly language: An optimization guide for x86 platforms"
+ *   by Agner Fog
+ *
+ * The latter contains a 32-bit variant of the same algorithm coded in assembly for i386.
+ */
+ENTRY(strlen)
+	PUSH_FRAME_POINTER
+	movabsq	$0xfefefefefefefeff,%r8		/* 0 - 0x01....01, so leaq can "subtract" it */
+	movabsq	$0x8080808080808080,%r9		/* top bit of every byte */
+
+	movq	%rdi,%r10			/* keep the original pointer for the final subtraction */
+	movq	%rdi,%rcx			/* copy; its low bits become the shift count below */
+	testb	$7,%dil
+	jz	2f				/* already 8-byte aligned: go straight to the main loop */
+
+	/*
+	 * Handle misaligned reads: align to 8 and fill
+	 * the spurious bytes.
+	 */
+	andq	$~7,%rdi			/* round the pointer down to an 8-byte boundary */
+	movq	(%rdi),%r11			/* aligned word that contains the string start */
+	shlq	$3,%rcx				/* address * 8 */
+	movq	$-1,%rdx
+	shlq	%cl,%rdx			/* 64-bit shifts use only the low 6 bits of %cl: 8 * (string & 7) */
+	notq	%rdx				/* ones in the bytes that precede the string start */
+	orq	%rdx,%r11			/* make those bytes 0xff so they cannot look like a terminator */
+
+	leaq	(%r11,%r8),%rcx			/* x - 0x01....01 */
+	notq	%r11				/* ~x */
+	andq	%r11,%rcx			/* (x - 0x01....01) & ~x */
+	andq	%r9,%rcx			/* ... & 0x80....80: non-zero if the word has a zero byte */
+	jnz	3f				/* terminator is in the first word */
+
+	/*
+	 * Main loop.
+	 */
+	ALIGN_TEXT
+1:
+	leaq	8(%rdi),%rdi			/* advance to the next word */
+2:
+	movq	(%rdi),%r11			/* x = next 8 bytes */
+	leaq	(%r11,%r8),%rcx			/* x - 0x01....01 */
+	notq	%r11				/* ~x */
+	andq	%r11,%rcx			/* (x - 0x01....01) & ~x */
+	andq	%r9,%rcx			/* ... & 0x80....80 */
+	jz	1b				/* no zero byte in this word: keep scanning */
+3:
+	bsfq	%rcx,%rcx			/* bit index of the lowest set marker bit */
+	shrq	$3,%rcx				/* bit index -> byte index within the word */
+	leaq	(%rcx,%rdi),%rax		/* address of the terminating zero byte */
+	subq	%r10,%rax			/* length = terminator address - original pointer */
+	POP_FRAME_POINTER
+	ret
+END(strlen)
+
 /*****************************************************************************/
 /* copyout and fubyte family                                                 */
 /*****************************************************************************/
diff --git a/sys/conf/files b/sys/conf/files
index e68aa2118791..9ec7292a741b 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -4086,7 +4086,6 @@ libkern/strdup.c		standard
 libkern/strndup.c		standard
 libkern/strlcat.c		standard
 libkern/strlcpy.c		standard
-libkern/strlen.c		standard
 libkern/strncat.c		standard
 libkern/strncmp.c		standard
 libkern/strncpy.c		standard
diff --git a/sys/conf/files.arm b/sys/conf/files.arm
index eb3a23b5fc21..69986585bdf6 100644
--- a/sys/conf/files.arm
+++ b/sys/conf/files.arm
@@ -127,6 +127,7 @@ libkern/lshrdi3.c		standard
 libkern/memcmp.c		standard
 libkern/moddi3.c		standard
 libkern/qdivrem.c		standard
+libkern/strlen.c		standard
 libkern/ucmpdi2.c		standard
 libkern/udivdi3.c		standard
 libkern/umoddi3.c		standard
diff --git a/sys/conf/files.i386 b/sys/conf/files.i386
index b5192e47a738..de759a9f7c83 100644
--- a/sys/conf/files.i386
+++ b/sys/conf/files.i386
@@ -219,6 +219,7 @@ libkern/memcmp.c		standard
 libkern/memset.c		standard
 libkern/moddi3.c		standard
 libkern/qdivrem.c		standard
+libkern/strlen.c		standard
 libkern/ucmpdi2.c		standard
 libkern/udivdi3.c		standard
 libkern/umoddi3.c		standard
diff --git a/sys/conf/files.mips b/sys/conf/files.mips
index c18f0a5c69be..7ee5b0019bd7 100644
--- a/sys/conf/files.mips
+++ b/sys/conf/files.mips
@@ -66,6 +66,7 @@ libkern/ucmpdi2.c			optional	mips | mipshf | mipsel | mipselhf
 libkern/ashldi3.c			standard
 libkern/ashrdi3.c			standard
 libkern/memcmp.c			standard
+libkern/strlen.c			standard
 
 # cfe support
 dev/cfe/cfe_api.c			optional	cfe
diff --git a/sys/conf/files.powerpc b/sys/conf/files.powerpc
index 3022fd6f6e39..347abee153d2 100644
--- a/sys/conf/files.powerpc
+++ b/sys/conf/files.powerpc
@@ -129,6 +129,7 @@ libkern/memcmp.c		standard
 libkern/memset.c		standard
 libkern/moddi3.c		optional	powerpc | powerpcspe
 libkern/qdivrem.c		optional	powerpc | powerpcspe
+libkern/strlen.c		standard
 libkern/ucmpdi2.c		optional	powerpc | powerpcspe
 libkern/udivdi3.c		optional	powerpc | powerpcspe
 libkern/umoddi3.c		optional	powerpc | powerpcspe
diff --git a/sys/conf/files.riscv b/sys/conf/files.riscv
index 3969528db07e..7ecea016b9a3 100644
--- a/sys/conf/files.riscv
+++ b/sys/conf/files.riscv
@@ -29,6 +29,7 @@ libkern/flsl.c			standard
 libkern/flsll.c			standard
 libkern/memcmp.c		standard
 libkern/memset.c		standard
+libkern/strlen.c		standard
 riscv/riscv/autoconf.c		standard
 riscv/riscv/bus_machdep.c	standard
 riscv/riscv/bus_space_asm.S	standard



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202104101358.13ADwXro041180>