Date:      Fri, 8 Sep 2023 21:23:27 GMT
From:      Robert Clausecker <fuz@FreeBSD.org>
To:        src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org
Subject:   git: 474408bb7933 - main - lib/libc/amd64/string: add strcspn(3) scalar, x86-64-v2 implementation
Message-ID:  <202309082123.388LNRtE057265@gitrepo.freebsd.org>

The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=474408bb7933f0383a0da2b01e717bfe683ae77c

commit 474408bb7933f0383a0da2b01e717bfe683ae77c
Author:     Robert Clausecker <fuz@FreeBSD.org>
AuthorDate: 2023-08-13 17:35:01 +0000
Commit:     Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2023-09-08 21:20:19 +0000

    lib/libc/amd64/string: add strcspn(3) scalar, x86-64-v2 implementation
    
    This changeset adds both a scalar and an x86-64-v2 implementation
    of the strcspn(3) function to libc. A baseline (plain SSE2)
    implementation does not appear to be feasible given the
    requirements of the function.
    
    The scalar implementation is similar to the generic libc implementation,
    but expands the bit set into a byte set to reduce latency, improving
    performance. This approach could probably be backported to the generic
    C version to benefit other platforms.
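
    As a rough illustration, the byte-set idea looks like this in
    portable C (a sketch only, not the committed code; the name
    strcspn_byteset_sketch is made up for this example):

        #include <string.h>

        size_t
        strcspn_byteset_sketch(const char *s, const char *charset)
        {
                unsigned char table[256];
                const char *p;

                /* expand the set: one byte per possible character */
                memset(table, 0, sizeof(table));
                table[0] = 1;           /* NUL always ends the scan */
                while (*charset != '\0')
                        table[(unsigned char)*charset++] = 1;

                /* inner loop: a single dependent load per input byte */
                for (p = s; !table[(unsigned char)*p]; p++)
                        ;
                return ((size_t)(p - s));
        }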
    
    The x86-64-v2 implementation is built around the infamous pcmpistri
    instruction. An alternative implementation based on the Muła/Langdale
    algorithm [1] was prototyped, but performed worse than the pcmpistri
    approach except for sets of more than 16 characters with long input
    strings.
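
    For reference, the core of the pcmpistri approach can be sketched
    with SSE4.2 intrinsics as follows (illustrative only: the name
    strcspn_pcmpistri_sketch is made up, the set is assumed to fit in
    16 bytes, and 16-byte over-reads of both buffers are assumed to be
    safe; the committed assembly removes both assumptions):

        #include <nmmintrin.h>  /* SSE4.2; compile with -msse4.2 */
        #include <string.h>

        #define MODE    (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | \
            _SIDD_LEAST_SIGNIFICANT)

        size_t
        strcspn_pcmpistri_sketch(const char *s, const char *set)
        {
                __m128i needles = _mm_loadu_si128((const __m128i *)set);
                size_t off = 0;

                for (;;) {
                        __m128i chunk =
                            _mm_loadu_si128((const __m128i *)(s + off));

                        /* CF set: some chunk byte is in the set */
                        if (_mm_cmpistrc(needles, chunk, MODE))
                                return (off +
                                    _mm_cmpistri(needles, chunk, MODE));
                        /* ZF set: chunk holds the NUL, no match */
                        if (_mm_cmpistrz(needles, chunk, MODE))
                                return (off + strlen(s + off));
                        off += 16;
                }
        }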
    
    All implementations provide special cases for the empty set (reduces to
    strlen) as well as for single-character sets (reduces to strchr). The
    x86-64-v2 kernel falls back to the scalar implementation for sets of
    more than 32 characters. This limit could be raised in further
    multiples of 16 by adding more pcmpistri code paths, but I consider
    this case too rare to be of importance.
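
    In rough C pseudocode, the dispatch performed by the x86-64-v2
    kernel amounts to the following (strcspn_scalar and strcspn_v2_core
    are hypothetical stand-ins for the assembly code paths; strchrnul
    is the libc extension the assembly itself calls):

        #include <string.h>

        extern size_t strcspn_scalar(const char *, const char *);
        extern size_t strcspn_v2_core(const char *, const char *, size_t);

        size_t
        strcspn_v2_dispatch(const char *s, const char *set)
        {
                size_t setlen;

                if (set[0] == '\0')     /* empty set: scan whole string */
                        return (strlen(s));
                if (set[1] == '\0')     /* one-character set */
                        return ((size_t)(strchrnul(s, set[0]) - s));

                setlen = strlen(set);
                if (setlen > 32)        /* too large for two pcmpistri */
                        return (strcspn_scalar(s, set));
                return (strcspn_v2_core(s, set, setlen));
        }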
    
    [1]: http://0x80.pl/articles/simd-byte-lookup.html
    
    Sponsored by:   The FreeBSD Foundation
    Approved by:    mjg
    MFC after:      1 week
    MFC to:         stable/14
    Differential Revision:  https://reviews.freebsd.org/D41557
---
 lib/libc/amd64/string/Makefile.inc |   1 +
 lib/libc/amd64/string/strcspn.S    | 368 +++++++++++++++++++++++++++++++++++++
 2 files changed, 369 insertions(+)

diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc
index 4df4ff8f1417..f01a52d9ebea 100644
--- a/lib/libc/amd64/string/Makefile.inc
+++ b/lib/libc/amd64/string/Makefile.inc
@@ -10,5 +10,6 @@ MDSRCS+= \
 	strcat.S \
 	strchrnul.S \
 	strcmp.S \
+	strcspn.S \
 	strlen.S \
 	strcpy.c
diff --git a/lib/libc/amd64/string/strcspn.S b/lib/libc/amd64/string/strcspn.S
new file mode 100644
index 000000000000..de409db6d472
--- /dev/null
+++ b/lib/libc/amd64/string/strcspn.S
@@ -0,0 +1,368 @@
+/*
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/asm.h>
+#include <machine/param.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT	.p2align 4,0x90 /* 16-byte alignment, nop filled */
+
+ARCHFUNCS(strcspn)
+	ARCHFUNC(strcspn, scalar)
+	NOARCHFUNC
+	ARCHFUNC(strcspn, x86_64_v2)
+ENDARCHFUNCS(strcspn)
+
+ARCHENTRY(strcspn, scalar)
+	push	%rbp			# align stack to enable function call
+	mov	%rsp, %rbp
+	sub	$256, %rsp		# allocate space for lookup table
+
+	/* check for special cases */
+	movzbl	(%rsi), %eax		# first character in the set
+	test	%eax, %eax
+	jz	.Lstrlen
+
+	movzbl	1(%rsi), %edx		# second character in the set
+	test	%edx, %edx
+	jz	.Lstrchr
+
+	/* no special case matches -- prepare lookup table */
+	xor	%r8d, %r8d
+	mov	$28, %ecx
+0:	mov	%r8, (%rsp, %rcx, 8)
+	mov	%r8, 8(%rsp, %rcx, 8)
+	mov	%r8, 16(%rsp, %rcx, 8)
+	mov	%r8, 24(%rsp, %rcx, 8)
+	sub	$4, %ecx
+	jnc	0b
+
+	add	$2, %rsi
+	movb	$1, (%rsp, %rax, 1)	# register first chars in set
+	movb	$1, (%rsp, %rdx, 1)
+	mov	%rdi, %rax		# a copy of the source to iterate over
+
+	/* process remaining chars in set */
+	ALIGN_TEXT
+0:	movzbl	(%rsi), %ecx
+	movb	$1, (%rsp, %rcx, 1)
+	test	%ecx, %ecx
+	jz	1f
+
+	movzbl	1(%rsi), %ecx
+	movb	$1, (%rsp, %rcx, 1)
+	test	%ecx, %ecx
+	jz	1f
+
+	add	$2, %rsi
+	jmp	0b
+
+	/* find match */
+	ALIGN_TEXT
+1:	movzbl	(%rax), %ecx
+	cmpb	$0, (%rsp, %rcx, 1)
+	jne	2f
+
+	movzbl	1(%rax), %ecx
+	cmpb	$0, (%rsp, %rcx, 1)
+	jne	3f
+
+	movzbl	2(%rax), %ecx
+	cmpb	$0, (%rsp, %rcx, 1)
+	jne	4f
+
+	movzbl	3(%rax), %ecx
+	add	$4, %rax
+	cmpb	$0, (%rsp, %rcx, 1)
+	je	1b
+
+	sub	$3, %rax
+4:	dec	%rdi
+3:	inc	%rax
+2:	sub	%rdi, %rax		# number of characters preceding match
+	leave
+	ret
+
+	/* set is empty, degrades to strlen */
+.Lstrlen:
+	leave
+	jmp	CNAME(strlen)
+
+	/* just one character in set, degrades to strchr */
+.Lstrchr:
+	mov	%rdi, (%rsp)		# stash a copy of the string
+	mov	%eax, %esi		# find the character in the set
+	call	CNAME(strchrnul)
+	sub	(%rsp), %rax		# length of prefix before match
+	leave
+	ret
+ARCHEND(strcspn, scalar)
+
+	/*
+	 * This kernel uses pcmpistri to do the heavy lifting.
+	 * We provide five code paths, depending on set size:
+	 *
+	 *      0: call strlen()
+	 *      1: call strchr()
+	 *  2--16: one pcmpistri per 16 bytes of input
+	 * 17--32: two pcmpistri per 16 bytes of input
+	 *   >=33: fall back to lookup table
+	 */
+ARCHENTRY(strcspn, x86_64_v2)
+	push		%rbp
+	mov		%rsp, %rbp
+	sub		$256, %rsp
+
+	/* check for special cases */
+	movzbl		(%rsi), %eax
+	test		%eax, %eax		# empty set?
+	jz		.Lstrlenv2
+
+	cmpb		$0, 1(%rsi)		# single-character set?
+	jz		.Lstrchrv2
+
+	/* find set size and copy up to 32 bytes to (%rsp) */
+	mov		%esi, %ecx
+	and		$~0xf, %rsi		# align set pointer
+	movdqa		(%rsi), %xmm0
+	pxor		%xmm1, %xmm1
+	and		$0xf, %ecx		# number of bytes rsi is past alignment
+	xor		%edx, %edx
+	pcmpeqb		%xmm0, %xmm1		# end of string reached?
+	movdqa		%xmm0, 32(%rsp)		# transfer head of set to stack
+	pmovmskb	%xmm1, %eax
+	shr		%cl, %eax		# clear out junk before the set
+	test		%eax, %eax		# end of set reached?
+	jnz		0f
+
+	movdqa		16(%rsi), %xmm0		# second chunk of the set
+	mov		$16, %edx
+	sub		%ecx, %edx		# length of set preceding xmm0
+	pxor		%xmm1, %xmm1
+	pcmpeqb		%xmm0, %xmm1
+	movdqa		%xmm0, 48(%rsp)
+	movdqu		32(%rsp, %rcx, 1), %xmm2 # head of set
+	pmovmskb	%xmm1, %eax
+	test		%eax, %eax
+	jnz		1f
+
+	movdqa		32(%rsi), %xmm0		# third chunk
+	add		$16, %edx
+	pxor		%xmm1, %xmm1
+	pcmpeqb		%xmm0, %xmm1
+	movdqa		%xmm0, 64(%rsp)
+	pmovmskb	%xmm1, %eax
+	test		%eax, %eax		# still not done?
+	jz		.Lgt32v2
+
+0:	movdqu		32(%rsp, %rcx, 1), %xmm2 # head of set
+1:	tzcnt		%eax, %eax
+	add		%eax, %edx		# length of set (excluding NUL byte)
+	cmp		$32, %edx		# above 32 bytes?
+	ja		.Lgt32v2
+
+	/*
+	 * At this point we know that we want to use pcmpistri.
+	 * One last problem remains: the head of the string is not
+	 * aligned and may cross a page boundary.  If this is the case,
+	 * we take the part before the page boundary and repeat the
+	 * last byte to fill up the xmm register.
+	 */
+	mov		%rdi, %rax		# save original string pointer
+	lea		15(%rdi), %esi		# last byte of the head
+	xor		%edi, %esi
+	test		$PAGE_SIZE, %esi	# does the head cross a page?
+	jz		0f
+
+	/* head crosses page: copy to stack to fix up */
+	and		$~0xf, %rax		# align head pointer temporarily
+	movzbl		15(%rax), %esi		# last head byte on the page
+	movdqa		(%rax), %xmm0
+	movabs		$0x0101010101010101, %r8
+	imul		%r8, %rsi		# repeated 8 times
+	movdqa		%xmm0, (%rsp)		# head word on stack
+	mov		%rsi, 16(%rsp)		# followed by filler (last byte x8)
+	mov		%rsi, 24(%rsp)
+	mov		%edi, %eax
+	and		$0xf, %eax		# offset of head from alignment
+	add		%rsp, %rax		# pointer to fake head
+
+0:	movdqu		(%rax), %xmm0		# load head (fake or real)
+	lea		16(%rdi), %rax
+	and		$~0xf, %rax		# second 16 bytes of string (aligned)
+1:	cmp		$16, %edx		# 16--32 bytes?
+	ja		.Lgt16v2
+
+
+	/* set is 2--16 bytes in size */
+
+	/* _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT */
+	pcmpistri	$0, %xmm0, %xmm2	# match in head?
+	jbe		.Lheadmatchv2
+
+	ALIGN_TEXT
+0:	pcmpistri	$0, (%rax), %xmm2
+	jbe		1f			# match or end of string?
+	pcmpistri	$0, 16(%rax), %xmm2
+	lea		32(%rax), %rax
+	ja		0b			# match or end of string?
+
+3:	lea		-16(%rax), %rax		# go back to second half
+1:	jc		2f			# jump if match found
+	movdqa		(%rax), %xmm0		# reload string piece
+	pxor		%xmm1, %xmm1
+	pcmpeqb		%xmm1, %xmm0		# where is the NUL byte?
+	pmovmskb	%xmm0, %ecx
+	tzcnt		%ecx, %ecx		# location of NUL byte in (%rax)
+2:	sub		%rdi, %rax		# offset of %xmm0 from beginning of string
+	add		%rcx, %rax		# prefix length before match/NUL
+	leave
+	ret
+
+.Lheadmatchv2:
+	jc		2f			# jump if match found
+	pxor		%xmm1, %xmm1
+	pcmpeqb		%xmm1, %xmm0
+	pmovmskb	%xmm0, %ecx
+	tzcnt		%ecx, %ecx		# location of NUL byte
+2:	mov		%ecx, %eax		# prefix length before match/NUL
+	leave
+	ret
+
+.Lgt16v2:
+	movdqu		48(%rsp, %rcx, 1), %xmm3 # second part of set
+
+	/* set is 17--32 bytes in size */
+	pcmpistri	$0, %xmm0, %xmm2	# match in head?
+	jbe		.Lheadmatchv2
+	pcmpistri	$0, %xmm0, %xmm3	# ZF=1 not possible here
+	jb		.Lheadmatchv2
+
+	ALIGN_TEXT
+0:	movdqa		(%rax), %xmm0
+	pcmpistri	$0, %xmm0, %xmm2
+	jbe		1b
+	pcmpistri	$0, %xmm0, %xmm3
+	jb		1f			# ZF=1 not possible here
+	movdqa		16(%rax), %xmm0
+	add		$32, %rax
+	pcmpistri	$0, %xmm0, %xmm2
+	jbe		3b
+	pcmpistri	$0, %xmm0, %xmm3
+	jae		0b			# ZF=1 not possible here
+
+	sub		$16, %rax		# go back to second half
+1:	add		%rcx, %rax
+	sub		%rdi, %rax
+	leave
+	ret
+
+	/* set is empty, degrades to strlen */
+.Lstrlenv2:
+	leave
+	jmp	CNAME(strlen)
+
+	/* just one character in set, degrades to strchr */
+.Lstrchrv2:
+	mov	%rdi, (%rsp)		# stash a copy of the string
+	mov	%eax, %esi		# find this character
+	call	CNAME(strchrnul)
+	sub	(%rsp), %rax		# length of prefix before match
+	leave
+	ret
+
+	/* set is >=33 bytes in size */
+.Lgt32v2:
+	xorps	%xmm0, %xmm0
+	mov	$256-64, %edx
+
+	/* clear out lookup table */
+0:	movaps	%xmm0, (%rsp, %rdx, 1)
+	movaps	%xmm0, 16(%rsp, %rdx, 1)
+	movaps	%xmm0, 32(%rsp, %rdx, 1)
+	movaps	%xmm0, 48(%rsp, %rdx, 1)
+	sub	$64, %edx
+	jnc	0b
+
+	add	%rcx, %rsi		# restore set pointer
+	mov	%rdi, %rax		# keep a copy of the string
+
+	/* initialise lookup table */
+	ALIGN_TEXT
+0:	movzbl	(%rsi), %ecx
+	movb	$1, (%rsp, %rcx, 1)
+	test	%ecx, %ecx
+	jz	1f
+
+	movzbl	1(%rsi), %ecx
+	movb	$1, (%rsp, %rcx, 1)
+	test	%ecx, %ecx
+	jz	1f
+
+	movzbl	2(%rsi), %ecx
+	movb	$1, (%rsp, %rcx, 1)
+	test	%ecx, %ecx
+	jz	1f
+
+	movzbl	3(%rsi), %ecx
+	movb	$1, (%rsp, %rcx, 1)
+	test	%ecx, %ecx
+	jz	1f
+
+	add	$4, %rsi
+	jmp	0b
+
+	/* find match */
+	ALIGN_TEXT
+1:	movzbl	(%rax), %ecx
+	cmpb	$0, (%rsp, %rcx, 1)
+	jne	2f
+
+	movzbl	1(%rax), %ecx
+	cmpb	$0, (%rsp, %rcx, 1)
+	jne	3f
+
+	movzbl	2(%rax), %ecx
+	cmpb	$0, (%rsp, %rcx, 1)
+	jne	4f
+
+	movzbl	3(%rax), %ecx
+	add	$4, %rax
+	cmpb	$0, (%rsp, %rcx, 1)
+	je	1b
+
+	sub	$3, %rax
+4:	dec	%rdi
+3:	inc	%rax
+2:	sub	%rdi, %rax		# number of characters preceding match
+	leave
+	ret
+ARCHEND(strcspn, x86_64_v2)
+
+	.section .note.GNU-stack,"",%progbits
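
For readers following the x86-64-v2 kernel above, the page-crossing
fix-up for the string head can be modelled in C roughly as follows
(illustrative only: safe_head16 is a hypothetical name, and the
out-of-bounds aligned read is justified by the argument the assembly
relies on, namely that an aligned 16-byte load cannot cross a page,
not by portable C rules):

    #include <stdint.h>
    #include <string.h>

    #define PAGE_SIZE       4096

    /* return a pointer from which 16 head bytes can be read safely */
    static const unsigned char *
    safe_head16(const char *s, unsigned char buf[32])
    {
            uintptr_t p = (uintptr_t)s;

            /* does the 16-byte window [p, p+15] cross a page? */
            if (((p ^ (p + 15)) & PAGE_SIZE) == 0)
                    return ((const unsigned char *)s);

            /* the aligned block holding p stays on the first page */
            const unsigned char *aligned =
                (const unsigned char *)(p & ~(uintptr_t)0xf);
            memcpy(buf, aligned, 16);
            memset(buf + 16, aligned[15], 16);  /* repeat last in-page byte */
            return (buf + (p & 0xf));           /* fake head inside buf */
    }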
