Date: Fri, 10 Jan 2025 15:03:53 GMT From: Robert Clausecker <fuz@FreeBSD.org> To: src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org Subject: git: 756b7fc80837 - main - lib/libc/aarch64/string: add strlcpy SIMD implementation Message-ID: <202501101503.50AF3rqI057146@gitrepo.freebsd.org>
next in thread | raw e-mail | index | archive | help
The branch main has been updated by fuz: URL: https://cgit.FreeBSD.org/src/commit/?id=756b7fc80837567d114a3c93e9bb987e219a1b23 commit 756b7fc80837567d114a3c93e9bb987e219a1b23 Author: Getz Mikalsen <getz@FreeBSD.org> AuthorDate: 2024-08-26 18:14:31 +0000 Commit: Robert Clausecker <fuz@FreeBSD.org> CommitDate: 2025-01-10 15:02:40 +0000 lib/libc/aarch64/string: add strlcpy SIMD implementation This changeset includes a port of the SIMD implementation of strlcpy for amd64 to Aarch64. It is based on memccpy (D46170) with some minor differences. Performance is significantly better than the scalar implementation. Benchmark results are as usual generated by the strperf utility written by fuz. See the DR for benchmark results. Tested by: fuz (exprun) Reviewed by: fuz, emaste Sponsored by: Google LLC (GSoC 2024) PR: 281175 Differential Revision: https://reviews.freebsd.org/D46243 --- lib/libc/aarch64/string/Makefile.inc | 3 +- lib/libc/aarch64/string/strlcpy.S | 316 +++++++++++++++++++++++++++++++++++ 2 files changed, 318 insertions(+), 1 deletion(-) diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc index 0b2974947389..34a84bcfe133 100644 --- a/lib/libc/aarch64/string/Makefile.inc +++ b/lib/libc/aarch64/string/Makefile.inc @@ -26,7 +26,8 @@ MDSRCS+= \ strcspn.S \ strpbrk.c \ strsep.c \ - strcat.c + strcat.c \ + strlcpy.S # # Add the above functions. 
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
*/

/*
 * File: lib/libc/aarch64/string/strlcpy.S (new in commit 756b7fc80837)
 *
 * size_t strlcpy(char *dst, const char *src, size_t dstsize)
 *
 * SIMD (Advanced SIMD / NEON) implementation for AArch64.
 *
 * In:    x0 = dst, x1 = src, x2 = dstsize
 * Out:   x0 = length of src (the tail call to strlen or the locally
 *             computed index of the terminating NUL)
 * Clobb: x1, x2, x4-x12, x16, x17, v1-v3, flags
 *        (x18, the platform register, is intentionally not touched)
 *
 * Register roles after the prologue:
 *   x2  = dstsize - 1, i.e. the maximum number of non-NUL bytes to store;
 *         it is subsequently used as a running "bytes left" counter
 *   x9  = original dst pointer
 *   x10 = src rounded down to a 16-byte boundary (current src chunk)
 *   x11 = src & 0xf (offset of src within its first aligned chunk)
 *
 * NUL detection idiom used throughout:
 *   cmeq  vN.16b, vN.16b, #0   // 0xff in every byte lane that is NUL
 *   shrn  vN.8b,  vN.8h,  #4   // narrow to 64 bits: one nibble per byte
 *   fmov  xM, dN               // nibble i == 0xf  <=>  byte i is NUL
 * followed by rbit + clz (count trailing zeros) and "lsr #2" to turn the
 * bit index of the first set nibble into a byte index.
 *
 * The string is always scanned with 16-byte aligned loads from x10, so the
 * first load may read bytes located before src; those are masked out.
 */

#include <machine/asm.h>

	.weak	strlcpy
	.set	strlcpy, __strlcpy
	.text

ENTRY(__strlcpy)
	subs	x2, x2, #1		// x2 = dstsize - 1
	b.lo	.L0			// dstsize was 0: only compute strlen

	mov	x9, x0			// stash copy of dst pointer
	bic	x10, x1, #0xf		// src aligned
	and	x11, x1, #0xf		// src offset

	ldr	q1, [x10]
	cmeq	v1.16b, v1.16b, #0	// NUL found in head?

	mov	x8, #-1			// fill register with 0xfff..fff
	lsl	x12, x11, #2		// 4 mask bits per byte
	lsl	x8, x8, x12		// mask of bytes in the string

	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1

	ands	x5, x5, x8		// ignore bytes in front of src
	b.ne	.Lhead_nul

	ldr	q3, [x10, #16]		// load second string chunk
	ldr	q2, [x1]		// load true head
	mov	x8, #32
	sub	x8, x8, x11		// x8 = string bytes in first two chunks

	cmeq	v1.16b, v3.16b, #0	// NUL found in second chunk?

	subs	x2, x2, x8
	b.ls	.Lhead_buf_end		// buffer ends within first two chunks

	/* process second chunk */
	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1
	cbnz	x5, .Lsecond_nul

	/* string didn't end in second chunk and neither did buffer */
	ldr	q1, [x10, #32]		// load next string chunk
	str	q2, [x0]		// deposit head into buffer
	sub	x0, x0, x11		// adjust x0 so it tracks x10
	str	q3, [x0, #16]		// deposit second chunk
	add	x10, x10, #32		// advance src
	add	x0, x0, #32		// advance dst
	subs	x2, x2, #16		// enough left for another round?
	b.ls	1f

	/* main loop unrolled twice */
	.p2align 4
0:
	cmeq	v2.16b, v1.16b, #0	// NUL found in current chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2

	cbnz	x5, 3f

	str	q1, [x0]
	ldr	q1, [x10, #16]		// load next chunk

	cmp	x2, #16			// more than a full chunk left?
	b.ls	2f

	add	x10, x10, #32		// advance pointers
	add	x0, x0, #32

	cmeq	v2.16b, v1.16b, #0	// NUL found in current chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2
	cbnz	x5, 4f			// process chunk if match

	str	q1, [x0, #-16]
	ldr	q1, [x10]		// load next chunk

	subs	x2, x2, #32
	b.hi	0b

1:
	sub	x10, x10, #16		// undo second advancement
	add	x2, x2, #16
	sub	x0, x0, #16

	/* 1--16 bytes left in the buffer but string has not ended yet */
2:
	cmeq	v2.16b, v1.16b, #0	// NUL found in final chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x4, d2

	mov	x6, #0xf
	mov	x7, x4			// keep the real NUL mask for strlen

	lsl	x5, x2, #2		// shift 0xf to the limits position
	lsl	x5, x6, x5
	cmp	x2, #16			// don't induce match if limit >= 16
	csel	x5, x5, xzr, lo
	orr	x8, x4, x5		// treat limit as if terminator present

	rbit	x8, x8			// simulate x86 tzcnt
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2

	add	x0, x0, x8

	ldr	q1, [x10, x8]		// load tail
	str	q1, [x0]		// store tail
	strb	wzr, [x0, #16]		// terminate dst

	/* continue to find the end of the string */
	cbnz	x7, 1f			// NUL already seen in this chunk

	/* we opt for a simpler strlen than the one in libc as the
	 * cmeq, shrn approach is faster for shorter strings.
	 */
	.p2align 4
0:
	ldr	q1, [x10, #32]
	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	cbnz	x7, 2f

	ldr	q1, [x10, #48]
	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	add	x10, x10, #32
	cbz	x7, 0b

1:	sub	x10, x10, #16
2:	rbit	x8, x7
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2

	sub	x10, x10, x1		// chunk offset relative to src
	add	x0, x10, #32
	add	x0, x0, x8		// x0 = length of src

	ret

4:
	sub	x10, x10, #16		// undo second advancement
	sub	x0, x0, #16		// undo second advancement

	/* string has ended but buffer has not */
3:
	rbit	x8, x5
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2

	add	x0, x0, x8		// dst position of the terminator
	add	x10, x10, x8		// src position of the terminator

	ldr	q1, [x10, #-15]		// copy the 16 bytes ending in the NUL
	str	q1, [x0, #-15]
	sub	x0, x10, x1		// x0 = length of src

	ret

	/* buffer ends within the first two chunks, no NUL in the head */
.Lhead_buf_end:
	shrn	v1.8b, v1.8h, #4
	fmov	x8, d1			// NUL mask of second chunk

	add	x2, x2, #32		// restore limit (relative to x10)

	mov	x7, x8			// keep the NUL mask for strlen

	cmp	x2, #16			// should we induce a match or not
	b.lo	0f

	rbit	x8, x8
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2
	add	x8, x8, #16		// second chunk starts at offset 16

	cmp	x8, x2
	csel	x8, x8, x2, lo		// copy min(buflen, srclen) bytes
	b	1f
0:
	mov	x8, x2			// limit lies in the head: copy up to it
1:

	sub	x8, x8, x11		// x8 = number of bytes to copy
	strb	wzr, [x9, x8]		// terminate dst

	/* continue to find the end of the string */
	cbnz	x7, 1f			// NUL already seen in second chunk

	/* we opt for a simpler strlen than the one in libc as the
	 * cmeq, shrn approach is faster for shorter strings.
	 */
	.p2align 4
0:
	ldr	q1, [x10, #32]
	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	cbnz	x7, 2f

	ldr	q1, [x10, #48]
	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	add	x10, x10, #32
	cbz	x7, 0b

1:	sub	x10, x10, #16
2:	rbit	x6, x7
	clz	x6, x6			// index of mismatch
	lsr	x6, x6, #2

	sub	x10, x10, x1		// chunk offset relative to src
	add	x0, x10, #32
	add	x0, x0, x6		// x0 = length of src

	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt

	b	.L1732

	/* NUL found in second chunk, buffer extends beyond it */
.Lsecond_nul:
	add	x2, x2, x8		// restore limit (dstsize - 1)

	rbit	x8, x5
	clz	x8, x8			// index of mismatch
	lsr	x5, x8, #2

	sub	x8, x11, #16
	sub	x0, x5, x8		// string length

	cmp	x0, x2			// did we match or hit limit first?
	csel	x8, x2, x0, hi		// x8 = min(length, limit)

	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt

	strb	wzr, [x4]		// terminate dst

	/*
	 * Short copies.  On entry to each of the labels below:
	 *   x8 = cnt (bytes to copy), x1 = src, x9 = dst,
	 *   x5 = src + cnt, x4 = dst + cnt, x0 = return value.
	 * Each size class copies two possibly overlapping pieces: one from
	 * the start and one ending at cnt.
	 */

	/* copy 16-32 bytes */
.L1732:
	cmp	x8, #16
	b.lo	.L0816
	ldp	x16, x17, [x1]
	ldp	x12, x1, [x5, #-16]
	stp	x16, x17, [x9]
	stp	x12, x1, [x4, #-16]
	ret

	/* NUL found in the head chunk */
.Lhead_nul:
	rbit	x8, x5
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2

	sub	x0, x8, x11		// string length
	cmp	x0, x2
	csel	x8, x2, x0, hi		// x8 = min(length, limit)

	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt
	strb	wzr, [x4]		// terminate dst

	/* Copy 8-15 bytes */
.L0816:
	tbz	x8, #3, .L0407
	ldr	x16, [x1]
	ldr	x17, [x5, #-8]
	str	x16, [x9]
	str	x17, [x4, #-8]
	ret

	/* Copy 4-7 bytes */
	.p2align 4
.L0407:
	cmp	x8, #3
	b.ls	.L0203
	ldr	w16, [x1]
	ldr	w17, [x5, #-4]		// w17, not w18: x18 is the platform register
	str	w16, [x9]
	str	w17, [x4, #-4]
	ret

	/* Copy 2-3 bytes */
.L0203:
	tbz	x8, 1, .L0001
	ldrh	w16, [x1]
	ldrh	w17, [x5, #-2]
	strh	w16, [x9]
	strh	w17, [x4, #-2]
	ret

	/* Copy 0-1 bytes; the NUL store wins when cnt == 0 */
.L0001:
	ldrb	w16, [x1]
	strb	w16, [x9]
	strb	wzr, [x4]
	ret

	/* dstsize == 0: store nothing, return strlen(src) via tail call */
.L0:
	mov	x0, x1
	b	strlen
END(__strlcpy)
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202501101503.50AF3rqI057146>