Date: Sun, 30 Nov 2025 01:45:54 +0000
From: Robert Clausecker <fuz@FreeBSD.org>
To: src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-branches@FreeBSD.org
Cc: Strahinja Stanišić <strajabot@FreeBSD.org>
Subject: git: a86afcd3f33c - stable/15 - libc: scalar memcpy() in RISC-V assembly
Message-ID: <692ba1d2.2a52c.657d441c@gitrepo.freebsd.org>
The branch stable/15 has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=a86afcd3f33c4b9a7879faa421367bb8a4423002

commit a86afcd3f33c4b9a7879faa421367bb8a4423002
Author:     Strahinja Stanišić <strajabot@FreeBSD.org>
AuthorDate: 2024-07-24 23:33:30 +0000
Commit:     Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2025-11-30 00:43:05 +0000

    libc: scalar memcpy() in RISC-V assembly

    Optimized assembly implementation of memcpy() for the RISC-V
    architecture. The implementation has two paths:

    - an aligned path, taken when (dst - src) % 8 == 0, which runs faster
    - an unaligned path, taken when (dst - src) % 8 != 0, which runs slower

    os: FreeBSD
    arch: riscv
                 │ memcpy_baseline │            memcpy_scalar             │
                 │     sec/op      │    sec/op     vs base                │
    64Align8          851.6µ ± 1%    488.9µ ± 1%  -42.59% (p=0.000 n=12)
    4kAlign8          681.5µ ± 1%    255.1µ ± 2%  -62.57% (p=0.000 n=12)
    256kAlign8        273.0µ ± 2%    230.7µ ± 2%  -15.50% (p=0.000 n=12)
    16mAlign8         98.07m ± 0%    95.29m ± 0%   -2.84% (p=0.000 n=12)
    64UAlign          887.5µ ± 1%    531.6µ ± 1%  -40.10% (p=0.000 n=12)
    4kUAlign          725.6µ ± 1%    262.2µ ± 1%  -63.87% (p=0.000 n=12)
    256kUAlign        844.1µ ± 2%    322.8µ ± 0%  -61.76% (p=0.000 n=12)
    16mUAlign         134.9m ± 0%    101.2m ± 0%  -24.97% (p=0.000 n=20)
    geomean           2.410m         1.371m       -43.12%

                 │ memcpy_baseline │           memcpy_scalar             │
                 │      MiB/s      │    MiB/s     vs base                │
    64Align8           293.6 ± 1%    511.3 ± 1%   +74.18% (p=0.000 n=12)
    4kAlign8           366.8 ± 1%    980.0 ± 2%  +167.15% (p=0.000 n=12)
    256kAlign8         915.8 ± 2%   1083.7 ± 2%   +18.34% (p=0.000 n=12)
    16mAlign8          163.1 ± 0%    167.9 ± 0%    +2.92% (p=0.000 n=12)
    64UAlign           281.7 ± 1%    470.3 ± 1%   +66.94% (p=0.000 n=12)
    4kUAlign           344.5 ± 1%    953.6 ± 1%  +176.77% (p=0.000 n=12)
    256kUAlign         296.2 ± 2%    774.5 ± 0%  +161.49% (p=0.000 n=12)
    16mUAlign          118.6 ± 0%    158.1 ± 0%   +33.28% (p=0.000 n=20)
    geomean            293.4         515.8        +75.81%

    MFC after:      1 month
    MFC to:         stable/15
    Approved by:    mhorne, markj (mentor)
    Reviewed by:    fuz
    Sponsored by:   Google LLC (GSoC 2024)
    Differential Revision:  https://reviews.freebsd.org/D46139

    (cherry picked from commit 25fdd86a4c92b5bdab82db289f3bcd57756778e7)
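
The core of the unaligned path below is a classic shift-and-merge loop: the
source is read only with aligned 8-byte loads, and each destination dword is
assembled from two consecutive loads shifted to compensate for the
misalignment. A rough C sketch of the idea (an illustration reusing the names
from the register comments in memcpy.S, not code from the commit; it assumes
little-endian byte order, and note that the aligned over-read before src is
safe for the assembly within one dword but is not strictly portable C):

    #include <stddef.h>
    #include <stdint.h>

    /*
     * Sketch of the unaligned path (.Llmain).  dst is already 8-byte
     * aligned and src is misaligned on this path, so right_shift is
     * never 0 and neither shift reaches 64.  The trailing len % 8
     * bytes are copied separately, as in the assembly.
     */
    static void
    copy_shift_merge(uint64_t *dst, const unsigned char *src, size_t len)
    {
            const uint64_t *ptr =
                (const uint64_t *)((uintptr_t)src & ~(uintptr_t)7);
            unsigned right_shift = ((uintptr_t)src & 7) * 8;  /* 8..56 */
            unsigned left_shift = 64 - right_shift;
            size_t whole = len / 8;   /* number of dword stores */
            uint64_t low, high;

            /* bytes of the first dword at/after src */
            low = *ptr++ >> right_shift;
            while (whole-- > 0) {
                    high = *ptr++;
                    /* merge two aligned loads into one store */
                    *dst++ = (high << left_shift) | low;
                    low = high >> right_shift;
            }
    }
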
---
 lib/libc/riscv/string/Makefile.inc |   1 +
 lib/libc/riscv/string/memcpy.S     | 217 +++++++++++++++++++++++++++++++++++++
 2 files changed, 218 insertions(+)

diff --git a/lib/libc/riscv/string/Makefile.inc b/lib/libc/riscv/string/Makefile.inc
index 5853ea114277..ebea8d1d3412 100644
--- a/lib/libc/riscv/string/Makefile.inc
+++ b/lib/libc/riscv/string/Makefile.inc
@@ -1,5 +1,6 @@
 MDSRCS+= \
 	memchr.S \
+	memcpy.S \
 	memset.S \
 	strlen.S \
 	strrchr.S
diff --git a/lib/libc/riscv/string/memcpy.S b/lib/libc/riscv/string/memcpy.S
new file mode 100644
index 000000000000..7536514df777
--- /dev/null
+++ b/lib/libc/riscv/string/memcpy.S
@@ -0,0 +1,217 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org>
+ */
+
+#include <machine/asm.h>
+
+/*
+ * a0 - void* dst
+ * a1 - const void* src
+ * a2 - size_t len
+ */
+ENTRY(memcpy)
+	beqz a2, .Lreturn
+
+	/* diff = (dstv - srcv) & 0b111 */
+	sub t0, a0, a1
+	andi t0, t0, 0b111
+
+	sltiu t1, a2, 8
+
+	/* we never change a0, because memcpy returns the original dst */
+	mv a3, a0
+
+	/* len < 8 */
+	bnez t1, .Lend
+
+	/* t1 = (-dst) & 0b111 */
+	neg t1, a0
+	andi t1, t1, 0b111
+
+	sub a2, a2, t1
+
+	la t2, .Lduff_start
+	slli t3, t1, 3
+	sub t2, t2, t3
+	jr t2
+	lb t3, 6(a1)
+	sb t3, 6(a3)
+	lb t3, 5(a1)
+	sb t3, 5(a3)
+	lb t3, 4(a1)
+	sb t3, 4(a3)
+	lb t3, 3(a1)
+	sb t3, 3(a3)
+	lb t3, 2(a1)
+	sb t3, 2(a3)
+	lb t3, 1(a1)
+	sb t3, 1(a3)
+	lb t3, 0(a1)
+	sb t3, 0(a3)
+.Lduff_start:
+
+	add a1, a1, t1
+	add a3, a3, t1
+
+	beqz a2, .Lreturn
+
+	beqz t0, .Lmemcpy8
+
+	/*
+	 * a4 - size_t right_shift
+	 * a5 - size_t left_shift
+	 * a6 - size_t whole (number of dword stores)
+	 */
+
+	/* right_shift = (src & 0b111) * 8 */
+	andi a4, a1, 0b111
+	slli a4, a4, 3
+
+	/* left_shift = 64 - right_shift */
+	neg a5, a4
+
+	/* whole = len / 8 */
+	srli a6, a2, 3
+
+	/* len = len % 8 */
+	andi a2, a2, 0b111
+
+	/* t0 - uint64_t* ptr */
+
+	/* ptr = src & ~0b111 */
+	andi t0, a1, ~0b111
+
+	/* src += whole * 8 */
+	slli t1, a6, 3
+	add a1, a1, t1
+
+	/*
+	 * t1 - uint64_t low
+	 * t2 - uint64_t high
+	 */
+
+	/* low = *ptr++ */
+	ld t1, (t0)
+	addi t0, t0, 8
+
+	/* low >>= right_shift */
+	srl t1, t1, a4
+
+	beqz a6, .Llmain_skip
+.Llmain:
+	/* high = *ptr++ */
+	ld t2, (t0)
+	addi t0, t0, 8
+
+	/* whole-- */
+	addi a6, a6, -1
+
+	/* temp = (high << left_shift) | low */
+	sll t3, t2, a5
+	or t3, t3, t1
+
+	/* low = high >> right_shift */
+	srl t1, t2, a4
+
+	/* *dst++ = temp */
+	sd t3, (a3)
+	addi a3, a3, 8
+
+	bnez a6, .Llmain
+
+.Llmain_skip:
+
+.Lend:
+	la t1, .Lduff_end
+	slli t2, a2, 3
+	sub t1, t1, t2
+	jr t1
+	lb t2, 6(a1)
+	sb t2, 6(a3)
+	lb t2, 5(a1)
+	sb t2, 5(a3)
+	lb t2, 4(a1)
+	sb t2, 4(a3)
+	lb t2, 3(a1)
+	sb t2, 3(a3)
+	lb t2, 2(a1)
+	sb t2, 2(a3)
+	lb t2, 1(a1)
+	sb t2, 1(a3)
+	lb t2, 0(a1)
+	sb t2, 0(a3)
+.Lduff_end:
+
+.Lreturn:
+	ret
+
+/* executed when dst - src is a multiple of 8
+ * a0 - void* dst
+ * a1 - const void* src
+ * a2 - size_t len
+ */
+.Lmemcpy8:
+
+	beqz a2, .Lreturn
+
+	slti t0, a2, 128
+	bnez t0, .Llmain8_64_skip
+
+	/* a4 - uint64_t* end_unroll */
+
+	/* end_unroll = dst + len / 64 * 64 */
+	andi t0, a2, ~0b111111
+	add a4, a3, t0
+
+	/* len = len % 64 */
+	andi a2, a2, 0b111111
+
+.Llmain8_64:
+	ld t0, 0(a1)
+	ld t1, 8(a1)
+	ld t2, 16(a1)
+	ld t3, 24(a1)
+	sd t0, 0(a3)
+	sd t1, 8(a3)
+	sd t2, 16(a3)
+	sd t3, 24(a3)
+	ld t0, 32(a1)
+	ld t1, 40(a1)
+	ld t2, 48(a1)
+	ld t3, 56(a1)
+	sd t0, 32(a3)
+	sd t1, 40(a3)
+	sd t2, 48(a3)
+	sd t3, 56(a3)
+	addi a3, a3, 64
+	addi a1, a1, 64
+	bne a3, a4, .Llmain8_64
+.Llmain8_64_skip:
+
+	beqz a2, .Lreturn
+
+	/* a4 - uint64_t* end_align */
+
+	/* end_align = (dst + len) & ~0b111 */
+	add a4, a3, a2
+	andi a4, a4, ~0b111
+
+	/* len = len % 8 */
+	andi a2, a2, 0b111
+
+	beq a3, a4, .Llmain8_skip
+.Llmain8:
+	ld t0, (a1)
+	sd t0, (a3)
+	addi a3, a3, 8
+	addi a1, a1, 8
+	bne a3, a4, .Llmain8
+.Llmain8_skip:
+
+	la t1, .Lduff_end
+	slli t2, a2, 3
+	sub t1, t1, t2
+	jr t1
+END(memcpy)
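
The computed jumps at .Lduff_start and .Lduff_end are a Duff's-device-style
jump table: each lb/sb pair encodes as two 4-byte instructions, so jumping
len * 8 bytes before the label executes exactly the last len byte copies. A
hypothetical C equivalent of the tail copy, written with switch fallthrough
(an illustrative sketch, not code from the commit):

    #include <stddef.h>

    /*
     * C analogue of the .Lduff_end tail: at most seven byte copies,
     * entered at an offset that depends on the remaining length.
     */
    static void
    copy_tail(unsigned char *dst, const unsigned char *src, size_t len)
    {
            switch (len) {           /* len is 0..7 here */
            case 7: dst[6] = src[6]; /* FALLTHROUGH */
            case 6: dst[5] = src[5]; /* FALLTHROUGH */
            case 5: dst[4] = src[4]; /* FALLTHROUGH */
            case 4: dst[3] = src[3]; /* FALLTHROUGH */
            case 3: dst[2] = src[2]; /* FALLTHROUGH */
            case 2: dst[1] = src[1]; /* FALLTHROUGH */
            case 1: dst[0] = src[0]; /* FALLTHROUGH */
            case 0: break;
            }
    }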
