Date: Thu, 8 Sep 2022 13:32:25 GMT From: Andrew Turner <andrew@FreeBSD.org> To: src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org Subject: git: 51a1bf7ba7eb - main - Import an optimized arm64 memcmp into the kernel Message-ID: <202209081332.288DWPYX072237@gitrepo.freebsd.org>
next in thread | raw e-mail | index | archive | help
The branch main has been updated by andrew: URL: https://cgit.FreeBSD.org/src/commit/?id=51a1bf7ba7eb79c760161a2054c113978dce38cb commit 51a1bf7ba7eb79c760161a2054c113978dce38cb Author: Andrew Turner <andrew@FreeBSD.org> AuthorDate: 2022-09-07 11:12:30 +0000 Commit: Andrew Turner <andrew@FreeBSD.org> CommitDate: 2022-09-08 13:29:37 +0000 Import an optimized arm64 memcmp into the kernel Bring in a version of the Arm Optimized Routines memcpy from before the VFP registers were used. Imported with modification from: https://github.com/ARM-software/optimized-routines/blob/e823e3abf5f89ecb/string/aarch64/memcmp.S Sponsored by: The FreeBSD Foundation --- sys/arm64/arm64/memcmp.S | 136 +++++++++++++++++++++++++++++++++++++++++++++++ sys/conf/files.arm64 | 3 +- 2 files changed, 137 insertions(+), 2 deletions(-) diff --git a/sys/arm64/arm64/memcmp.S b/sys/arm64/arm64/memcmp.S new file mode 100644 index 000000000000..8517a181f3f3 --- /dev/null +++ b/sys/arm64/arm64/memcmp.S @@ -0,0 +1,136 @@ +/* memcmp - compare memory + * + * Copyright (c) 2013-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, unaligned accesses. + */ + +#include <machine/asm.h> + +#define L(l) .L ## l + +/* Parameters and result. */ +#define src1 x0 +#define src2 x1 +#define limit x2 +#define result w0 + +/* Internal variables. */ +#define data1 x3 +#define data1w w3 +#define data1h x4 +#define data2 x5 +#define data2w w5 +#define data2h x6 +#define tmp1 x7 +#define tmp2 x8 + +ENTRY (memcmp) + subs limit, limit, 8 + b.lo L(less8) + + ldr data1, [src1], 8 + ldr data2, [src2], 8 + cmp data1, data2 + b.ne L(return) + + subs limit, limit, 8 + b.gt L(more16) + + ldr data1, [src1, limit] + ldr data2, [src2, limit] + b L(return) + +L(more16): + ldr data1, [src1], 8 + ldr data2, [src2], 8 + cmp data1, data2 + bne L(return) + + /* Jump directly to comparing the last 16 bytes for 32 byte (or less) + strings. */ + subs limit, limit, 16 + b.ls L(last_bytes) + + /* We overlap loads between 0-32 bytes at either side of SRC1 when we + try to align, so limit it only to strings larger than 128 bytes. */ + cmp limit, 96 + b.ls L(loop16) + + /* Align src1 and adjust src2 with bytes not yet done. */ + and tmp1, src1, 15 + add limit, limit, tmp1 + sub src1, src1, tmp1 + sub src2, src2, tmp1 + + /* Loop performing 16 bytes per iteration using aligned src1. + Limit is pre-decremented by 16 and must be larger than zero. + Exit if <= 16 bytes left to do or if the data is not equal. */ + .p2align 4 +L(loop16): + ldp data1, data1h, [src1], 16 + ldp data2, data2h, [src2], 16 + subs limit, limit, 16 + ccmp data1, data2, 0, hi + ccmp data1h, data2h, 0, eq + b.eq L(loop16) + + cmp data1, data2 + bne L(return) + mov data1, data1h + mov data2, data2h + cmp data1, data2 + bne L(return) + + /* Compare last 1-16 bytes using unaligned access. */ +L(last_bytes): + add src1, src1, limit + add src2, src2, limit + ldp data1, data1h, [src1] + ldp data2, data2h, [src2] + cmp data1, data2 + bne L(return) + mov data1, data1h + mov data2, data2h + cmp data1, data2 + + /* Compare data bytes and set return value to 0, -1 or 1. */ +L(return): +#ifndef __AARCH64EB__ + rev data1, data1 + rev data2, data2 +#endif + cmp data1, data2 +L(ret_eq): + cset result, ne + cneg result, result, lo + ret + + .p2align 4 + /* Compare up to 8 bytes. Limit is [-8..-1]. */ +L(less8): + adds limit, limit, 4 + b.lo L(less4) + ldr data1w, [src1], 4 + ldr data2w, [src2], 4 + cmp data1w, data2w + b.ne L(return) + sub limit, limit, 4 +L(less4): + adds limit, limit, 4 + beq L(ret_eq) +L(byte_loop): + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + subs limit, limit, 1 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ + b.eq L(byte_loop) + sub result, data1w, data2w + ret + +END (memcmp) + diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64 index a647d4e32230..d01b3f674e9a 100644 --- a/sys/conf/files.arm64 +++ b/sys/conf/files.arm64 @@ -9,8 +9,6 @@ kern/pic_if.m optional intrng kern/subr_devmap.c standard kern/subr_intr.c optional intrng kern/subr_physmem.c standard -libkern/memcmp.c standard \ - compile-with "${NORMAL_C:N-fsanitize*}" libkern/memset.c standard \ compile-with "${NORMAL_C:N-fsanitize*}" libkern/strlen.c standard @@ -60,6 +58,7 @@ arm64/arm64/locore.S standard no-obj arm64/arm64/machdep.c standard arm64/arm64/machdep_boot.c standard arm64/arm64/mem.c standard +arm64/arm64/memcmp.S standard arm64/arm64/memcpy.S standard arm64/arm64/minidump_machdep.c standard arm64/arm64/mp_machdep.c optional smp
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202209081332.288DWPYX072237>