Date: Wed, 21 Sep 2022 09:46:48 GMT
From: Andrew Turner <andrew@FreeBSD.org>
To: src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-branches@FreeBSD.org
Subject: git: 3af87126f68e - stable/13 - Import an optimized arm64 memcmp into the kernel
Message-ID: <202209210946.28L9kmIl076201@gitrepo.freebsd.org>
The branch stable/13 has been updated by andrew:

URL: https://cgit.FreeBSD.org/src/commit/?id=3af87126f68e539453dc530925d7e297ee261c7f

commit 3af87126f68e539453dc530925d7e297ee261c7f
Author:     Andrew Turner <andrew@FreeBSD.org>
AuthorDate: 2022-09-07 11:12:30 +0000
Commit:     Andrew Turner <andrew@FreeBSD.org>
CommitDate: 2022-09-21 09:45:53 +0000

    Import an optimized arm64 memcmp into the kernel

    Bring in a version of the Arm Optimized Routines memcmp from before
    the VFP registers were used.

    Imported with modification from:
    https://github.com/ARM-software/optimized-routines/blob/e823e3abf5f89ecb/string/aarch64/memcmp.S

    Sponsored by:   The FreeBSD Foundation

    (cherry picked from commit 51a1bf7ba7eb79c760161a2054c113978dce38cb)
---
 sys/arm64/arm64/memcmp.S | 136 +++++++++++++++++++++++++++++++++++++++++++++++
 sys/conf/files.arm64     |   3 +-
 2 files changed, 137 insertions(+), 2 deletions(-)

diff --git a/sys/arm64/arm64/memcmp.S b/sys/arm64/arm64/memcmp.S
new file mode 100644
index 000000000000..8517a181f3f3
--- /dev/null
+++ b/sys/arm64/arm64/memcmp.S
@@ -0,0 +1,136 @@
+/* memcmp - compare memory
+ *
+ * Copyright (c) 2013-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ */
+
+#include <machine/asm.h>
+
+#define L(l) .L ## l
+
+/* Parameters and result.  */
+#define src1		x0
+#define src2		x1
+#define limit		x2
+#define result		w0
+
+/* Internal variables.  */
+#define data1		x3
+#define data1w		w3
+#define data1h		x4
+#define data2		x5
+#define data2w		w5
+#define data2h		x6
+#define tmp1		x7
+#define tmp2		x8
+
+ENTRY (memcmp)
+	subs	limit, limit, 8
+	b.lo	L(less8)
+
+	ldr	data1, [src1], 8
+	ldr	data2, [src2], 8
+	cmp	data1, data2
+	b.ne	L(return)
+
+	subs	limit, limit, 8
+	b.gt	L(more16)
+
+	ldr	data1, [src1, limit]
+	ldr	data2, [src2, limit]
+	b	L(return)
+
+L(more16):
+	ldr	data1, [src1], 8
+	ldr	data2, [src2], 8
+	cmp	data1, data2
+	bne	L(return)
+
+	/* Jump directly to comparing the last 16 bytes for 32 byte (or less)
+	   strings.  */
+	subs	limit, limit, 16
+	b.ls	L(last_bytes)
+
+	/* We overlap loads between 0-32 bytes at either side of SRC1 when we
+	   try to align, so limit it only to strings larger than 128 bytes.  */
+	cmp	limit, 96
+	b.ls	L(loop16)
+
+	/* Align src1 and adjust src2 with bytes not yet done.  */
+	and	tmp1, src1, 15
+	add	limit, limit, tmp1
+	sub	src1, src1, tmp1
+	sub	src2, src2, tmp1
+
+	/* Loop performing 16 bytes per iteration using aligned src1.
+	   Limit is pre-decremented by 16 and must be larger than zero.
+	   Exit if <= 16 bytes left to do or if the data is not equal.  */
+	.p2align 4
+L(loop16):
+	ldp	data1, data1h, [src1], 16
+	ldp	data2, data2h, [src2], 16
+	subs	limit, limit, 16
+	ccmp	data1, data2, 0, hi
+	ccmp	data1h, data2h, 0, eq
+	b.eq	L(loop16)
+
+	cmp	data1, data2
+	bne	L(return)
+	mov	data1, data1h
+	mov	data2, data2h
+	cmp	data1, data2
+	bne	L(return)
+
+	/* Compare last 1-16 bytes using unaligned access.  */
+L(last_bytes):
+	add	src1, src1, limit
+	add	src2, src2, limit
+	ldp	data1, data1h, [src1]
+	ldp	data2, data2h, [src2]
+	cmp	data1, data2
+	bne	L(return)
+	mov	data1, data1h
+	mov	data2, data2h
+	cmp	data1, data2
+
+	/* Compare data bytes and set return value to 0, -1 or 1.  */
+L(return):
+#ifndef __AARCH64EB__
+	rev	data1, data1
+	rev	data2, data2
+#endif
+	cmp	data1, data2
+L(ret_eq):
+	cset	result, ne
+	cneg	result, result, lo
+	ret
+
+	.p2align 4
+	/* Compare up to 8 bytes.  Limit is [-8..-1].  */
+L(less8):
+	adds	limit, limit, 4
+	b.lo	L(less4)
+	ldr	data1w, [src1], 4
+	ldr	data2w, [src2], 4
+	cmp	data1w, data2w
+	b.ne	L(return)
+	sub	limit, limit, 4
+L(less4):
+	adds	limit, limit, 4
+	beq	L(ret_eq)
+L(byte_loop):
+	ldrb	data1w, [src1], 1
+	ldrb	data2w, [src2], 1
+	subs	limit, limit, 1
+	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
+	b.eq	L(byte_loop)
+	sub	result, data1w, data2w
+	ret
+
+END (memcmp)
+
diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64
index 86ada6e4c924..963ee0aef8f0 100644
--- a/sys/conf/files.arm64
+++ b/sys/conf/files.arm64
@@ -10,8 +10,6 @@ kern/subr_devmap.c	standard
 kern/subr_intr.c	optional	intrng
 kern/subr_physmem.c	standard
 libkern/bcmp.c	standard
-libkern/memcmp.c	standard \
-	compile-with "${NORMAL_C:N-fsanitize*}"
 libkern/memset.c	standard \
 	compile-with "${NORMAL_C:N-fsanitize*}"
 libkern/strlen.c	standard
@@ -60,6 +58,7 @@ arm64/arm64/locore.S	standard	no-obj
 arm64/arm64/machdep.c	standard
 arm64/arm64/machdep_boot.c	standard
 arm64/arm64/mem.c	standard
+arm64/arm64/memcmp.S	standard
 arm64/arm64/memcpy.S	standard
 arm64/arm64/minidump_machdep.c	standard
 arm64/arm64/mp_machdep.c	optional	smp
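
For anyone skimming the assembly, the heart of L(return) is an endianness
trick: on little-endian targets both 8-byte chunks are byte-reversed with
rev, so the first differing byte lands in the most significant position
and a single unsigned compare orders the inputs. A rough C sketch of that
sequence, assuming little-endian AArch64 (cmp_word and the use of
__builtin_bswap64 are illustrative only, not part of the commit):

    /*
     * Illustrative sketch only -- not from the commit.  Mirrors the
     * L(return) sequence: byte-reverse both 8-byte chunks so the first
     * differing byte becomes most significant, then a single unsigned
     * compare yields memcmp's negative/zero/positive ordering.
     */
    #include <stdint.h>
    #include <string.h>

    static int
    cmp_word(const void *a, const void *b)
    {
    	uint64_t data1, data2;

    	memcpy(&data1, a, 8);	/* unaligned-safe 8-byte loads */
    	memcpy(&data2, b, 8);

    	/* Little-endian only, matching the rev instructions guarded
    	   by #ifndef __AARCH64EB__ in the assembly. */
    	data1 = __builtin_bswap64(data1);
    	data2 = __builtin_bswap64(data2);

    	if (data1 == data2)
    		return (0);
    	/* cset/cneg in the assembly produce exactly this -1 or 1. */
    	return (data1 < data2 ? -1 : 1);
    }

For example, cmp_word("aZZZZZZZ", "bAAAAAAA") returns -1 because 'a' < 'b'
in the first byte; without the byte reversal the little-endian loads would
let the last bytes ('Z' vs 'A') dominate and the result would come out
wrong.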