Date: Wed, 14 May 2025 23:40:39 GMT From: Robert Clausecker <fuz@FreeBSD.org> To: src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org Subject: git: f6210541f9e3 - main - lib/libmd: add optimised SHA1 implementations for aarch64 Message-ID: <202505142340.54ENedfA000200@gitrepo.freebsd.org>
next in thread | raw e-mail | index | archive | help
The branch main has been updated by fuz: URL: https://cgit.FreeBSD.org/src/commit/?id=f6210541f9e3c6cfda321e0ad98f277fb98a625b commit f6210541f9e3c6cfda321e0ad98f277fb98a625b Author: Robert Clausecker <fuz@FreeBSD.org> AuthorDate: 2025-05-14 19:18:12 +0000 Commit: Robert Clausecker <fuz@FreeBSD.org> CommitDate: 2025-05-14 23:39:58 +0000 lib/libmd: add optimised SHA1 implementations for aarch64 This provides a scalar implementation and one using the SHA1 instruction set extensions. For the scalar implementation, the w array is kept in registers, speeding up the whole operation. For a 10 GiB file on my Windows 2023 Dev Kit (ARM Cortex A78C / ARM Cortex X1C): Performance core: pre 43.1s (238 MB/s) generic 41.3s (247 MB/s) scalar 35.0s (293 MB/s) sha1 12.8s (800 MB/s) Efficiency core: pre 54.2s (189 MB/s) generic 55.9s (183 MB/s) scalar 43.0s (238 MB/s) sha1 16.2s (632 MB/s) Reviewed by: getz Differential Revision: https://reviews.freebsd.org/D45444 --- lib/libmd/aarch64/sha1block.S | 490 +++++++++++++++++++++++++++++++++++++++ lib/libmd/aarch64/sha1dispatch.c | 24 ++ 2 files changed, 514 insertions(+) diff --git a/lib/libmd/aarch64/sha1block.S b/lib/libmd/aarch64/sha1block.S new file mode 100644 index 000000000000..56a0297efadd --- /dev/null +++ b/lib/libmd/aarch64/sha1block.S @@ -0,0 +1,490 @@ +/*- + * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org> + * + * SPDX-License-Identifier: BSD-2-Clause + * + * sha1block_sha1 implementation based on sha1-arm.c, + * written and placed in public domain by Jeffrey Walton + * based on code from ARM, and by Johannes Schneiders, Skip + * Hovsmith and Barry O'Rourke for the mbedTLS project. + */ + +#include <machine/asm.h> + +/* + * Scalar SHA1 implementation. + * + * Due to the ample register file available on AArch64, the w array is + * kept entirely in registers. The saved a-e variables are instead kept + * in memory as we don't have that many registers.
+ */
+
+	// sha1block(SHA1_CTX, buf, len): len is in bytes, only whole 64-byte blocks are hashed
+ENTRY(_libmd_sha1block_scalar)
+ctx	.req	x0		// state words a-e are read/written at offsets 0..16
+buf	.req	x1		// input pointer, advanced by 64 per block
+len	.req	x2		// remaining byte count, rounded down to a multiple of 64
+w	.req	sp		// alias for sp; the code below addresses the stack via sp directly
+a	.req	w3
+b	.req	w4
+c	.req	w5
+d	.req	w6
+e	.req	w7
+k	.req	w8		// current round constant
+f	.req	w9		// output of the current round function
+tmp	.req	w10		// scratch
+w_0	.req	w11		// w_0 .. w_15: the 16 live message schedule words
+w_1	.req	w12
+w_2	.req	w13
+w_3	.req	w14
+w_4	.req	w15
+w_5	.req	w16
+w_6	.req	w17
+// w18 is the platform register
+w_7	.req	w19		// x19-x27 are saved in the prologue and restored in the epilogue
+w_8	.req	w20
+w_9	.req	w21
+w_10	.req	w22
+w_11	.req	w23
+w_12	.req	w24
+w_13	.req	w25
+w_14	.req	w26
+w_15	.req	w27
+
+.macro	shuffle	w_i, w_i3, w_i8, w_i14
+	eor	\w_i, \w_i, \w_i3	// message schedule step, result replaces w[i-16] in place
+	eor	tmp, \w_i8, \w_i14
+	eor	\w_i, \w_i, tmp		// w[i-16] ^ w[i-14] ^ w[i-8] ^ w[i-3]
+	ror	\w_i, \w_i, #31		// w[i] = ... ror #31
+.endm
+
+.macro	func1	a, b, c, d, e
+	and	f, \c, \b		// b & c
+	bic	tmp, \d, \b		// d & ~b
+	orr	f, f, tmp		// f = (b & c) | (~b & d)
+.endm
+
+.macro	func2	a, b, c, d, e
+	eor	f, \b, \c
+	eor	f, f, \d		// f = b ^ c ^ d
+.endm
+
+.macro	func3	a, b, c, d, e
+	eor	tmp, \b, \c
+	and	f, \b, \c
+	and	tmp, tmp, \d
+	orr	f, f, tmp		// f = (b & c) | ((b ^ c) & d)
+.endm
+
+.macro	func4	a, b, c, d, e
+	func2	\a, \b, \c, \d, \e	// same function as func2
+.endm
+
+.macro	mix	a, b, c, d, e, w_i
+	ror	\b, \b, #2		// rotate b in place; f was computed from the old b
+	ror	tmp, \a, #27
+	add	\e, \e, \w_i
+	add	tmp, tmp, k
+	add	\e, \e, f
+	add	\e, \e, tmp		// (a ror 27) + e + f + k + w[i]
+.endm
+
+.macro	round1	a, b, c, d, e, w_i
+	func1	\a, \b, \c, \d, \e
+	rev	\w_i, \w_i		// byte-reverse the freshly loaded message word
+	mix	\a, \b, \c, \d, \e, \w_i
+.endm
+
+.macro	round	func, a, b, c, d, e, w_i, w_i3, w_i8, w_i14
+	shuffle	\w_i, \w_i3, \w_i8, \w_i14
+	\func	\a, \b, \c, \d, \e
+	mix	\a, \b, \c, \d, \e, \w_i
+.endm
+
+.macro	round1x	a, b, c, d, e, w_i, w_i3, w_i8, w_i14
+	round	func1, \a, \b, \c, \d, \e, \w_i, \w_i3, \w_i8, \w_i14
+.endm
+
+.macro	round2	a, b, c, d, e, w_i, w_i3, w_i8, w_i14
+	round	func2, \a, \b, \c, \d, \e, \w_i, \w_i3, \w_i8, \w_i14
+.endm
+
+.macro	round3	a, b, c, d, e, w_i, w_i3, w_i8, w_i14
+	round	func3, \a, \b, \c, \d, \e, \w_i, \w_i3, \w_i8, \w_i14
+.endm
+
+.macro	round4	a, b, c, d, e, w_i, w_i3, w_i8, w_i14
+	round	func4, \a, \b, \c, \d, \e, \w_i, \w_i3, \w_i8, \w_i14
+.endm
+
+	ands	len, len, #~63		// take length in multiples of block length
+	beq	1f			// bail out if input empty
+
+	sub	sp, sp, #24+9*8		// allocate stack space: 24 bytes for saved a-e, 9*8 for x19-x27
+	str	x19, [sp, #24+0*8]
+	stp	x20, x21, [sp, #24+1*8]
+	stp	x22, x23, [sp, #24+3*8]
+	stp	x24, x25, [sp, #24+5*8]
+	stp	x26, x27, [sp, #24+7*8]
+
+	ldp	a, b, [ctx, #0]		// load SHA1 state from context
+	ldp	c, d, [ctx, #8]
+	ldr	e, [ctx, #16]
+
+0:	stp	a, b, [sp, #0]		// save old SHA1 state
+	stp	c, d, [sp, #8]
+	str	e, [sp, #16]
+
+	movz	k, #0x7999		// round constant 1
+	movk	k, #0x5a82, lsl #16
+
+	ldp	w_0, w_1, [buf, #0*4]	// rounds 0-15 take their words straight from the input block
+	round1	a, b, c, d, e, w_0
+	round1	e, a, b, c, d, w_1
+
+	ldp	w_2, w_3, [buf, #2*4]
+	round1	d, e, a, b, c, w_2
+	round1	c, d, e, a, b, w_3
+
+	ldp	w_4, w_5, [buf, #4*4]
+	round1	b, c, d, e, a, w_4
+	round1	a, b, c, d, e, w_5
+
+	ldp	w_6, w_7, [buf, #6*4]
+	round1	e, a, b, c, d, w_6
+	round1	d, e, a, b, c, w_7
+
+	ldp	w_8, w_9, [buf, #8*4]
+	round1	c, d, e, a, b, w_8
+	round1	b, c, d, e, a, w_9
+
+	ldp	w_10, w_11, [buf, #10*4]
+	round1	a, b, c, d, e, w_10
+	round1	e, a, b, c, d, w_11
+
+	ldp	w_12, w_13, [buf, #12*4]
+	round1	d, e, a, b, c, w_12
+	round1	c, d, e, a, b, w_13
+
+	ldp	w_14, w_15, [buf, #14*4]
+	round1	b, c, d, e, a, w_14
+	round1	a, b, c, d, e, w_15
+
+	round1x	e, a, b, c, d, w_0, w_13, w_8, w_2	// rounds 16-19: func1 with scheduled words
+	round1x	d, e, a, b, c, w_1, w_14, w_9, w_3
+	round1x	c, d, e, a, b, w_2, w_15, w_10, w_4
+	round1x	b, c, d, e, a, w_3, w_0, w_11, w_5
+
+	movz	k, #0xeba1		// round constant 2
+	movk	k, #0x6ed9, lsl #16
+
+	round2	a, b, c, d, e, w_4, w_1, w_12, w_6
+	round2	e, a, b, c, d, w_5, w_2, w_13, w_7
+	round2	d, e, a, b, c, w_6, w_3, w_14, w_8
+	round2	c, d, e, a, b, w_7, w_4, w_15, w_9
+	round2	b, c, d, e, a, w_8, w_5, w_0, w_10
+
+	round2	a, b, c, d, e, w_9, w_6, w_1, w_11
+	round2	e, a, b, c, d, w_10, w_7, w_2, w_12
+	round2	d, e, a, b, c, w_11, w_8, w_3, w_13
+	round2	c, d, e, a, b, w_12, w_9, w_4, w_14
+	round2	b, c, d, e, a, w_13, w_10, w_5, w_15
+
+	round2	a, b, c, d, e, w_14, w_11, w_6, w_0
+	round2	e, a, b, c, d, w_15, w_12, w_7, w_1
+	round2	d, e, a, b, c, w_0, w_13, w_8, w_2
+	round2	c, d, e, a, b, w_1, w_14, w_9, w_3
+	round2	b, c, d, e, a, w_2, w_15, w_10, w_4
+
+	round2	a, b, c, d, e, w_3, w_0, w_11, w_5
+	round2	e, a, b, c, d, w_4, w_1, w_12, w_6
+	round2	d, e, a, b, c, w_5, w_2, w_13, w_7
+	round2	c, d, e, a, b, w_6, w_3, w_14, w_8
+	round2	b, c, d, e, a, w_7, w_4, w_15, w_9
+
+	movz	k, #0xbcdc		// round constant 3
+	movk	k, #0x8f1b, lsl #16
+
+	round3	a, b, c, d, e, w_8, w_5, w_0, w_10
+	round3	e, a, b, c, d, w_9, w_6, w_1, w_11
+	round3	d, e, a, b, c, w_10, w_7, w_2, w_12
+	round3	c, d, e, a, b, w_11, w_8, w_3, w_13
+	round3	b, c, d, e, a, w_12, w_9, w_4, w_14
+
+	round3	a, b, c, d, e, w_13, w_10, w_5, w_15
+	round3	e, a, b, c, d, w_14, w_11, w_6, w_0
+	round3	d, e, a, b, c, w_15, w_12, w_7, w_1
+	round3	c, d, e, a, b, w_0, w_13, w_8, w_2
+	round3	b, c, d, e, a, w_1, w_14, w_9, w_3
+
+	round3	a, b, c, d, e, w_2, w_15, w_10, w_4
+	round3	e, a, b, c, d, w_3, w_0, w_11, w_5
+	round3	d, e, a, b, c, w_4, w_1, w_12, w_6
+	round3	c, d, e, a, b, w_5, w_2, w_13, w_7
+	round3	b, c, d, e, a, w_6, w_3, w_14, w_8
+
+	round3	a, b, c, d, e, w_7, w_4, w_15, w_9
+	round3	e, a, b, c, d, w_8, w_5, w_0, w_10
+	round3	d, e, a, b, c, w_9, w_6, w_1, w_11
+	round3	c, d, e, a, b, w_10, w_7, w_2, w_12
+	round3	b, c, d, e, a, w_11, w_8, w_3, w_13
+
+	movz	k, #0xc1d6		// round constant 4
+	movk	k, #0xca62, lsl #16
+
+	round4	a, b, c, d, e, w_12, w_9, w_4, w_14
+	round4	e, a, b, c, d, w_13, w_10, w_5, w_15
+	round4	d, e, a, b, c, w_14, w_11, w_6, w_0
+	round4	c, d, e, a, b, w_15, w_12, w_7, w_1
+	round4	b, c, d, e, a, w_0, w_13, w_8, w_2
+
+	round4	a, b, c, d, e, w_1, w_14, w_9, w_3
+	round4	e, a, b, c, d, w_2, w_15, w_10, w_4
+	round4	d, e, a, b, c, w_3, w_0, w_11, w_5
+	round4	c, d, e, a, b, w_4, w_1, w_12, w_6
+	round4	b, c, d, e, a, w_5, w_2, w_13, w_7
+
+	round4	a, b, c, d, e, w_6, w_3, w_14, w_8
+	round4	e, a, b, c, d, w_7, w_4, w_15, w_9
+	round4	d, e, a, b, c, w_8, w_5, w_0, w_10
+	round4	c, d, e, a, b, w_9, w_6, w_1, w_11
+	round4	b, c, d, e, a, w_10, w_7, w_2, w_12
+
+	round4	a, b, c, d, e, w_11, w_8, w_3, w_13
+	round4	e, a, b, c, d, w_12, w_9, w_4, w_14
+	round4	d, e, a, b, c, w_13, w_10, w_5, w_15
+	round4	c, d, e, a, b, w_14, w_11, w_6, w_0
+	round4	b, c, d, e, a, w_15, w_12, w_7, w_1
+
+	ldp	w_0, w_1, [sp, #0]	// reload saved SHA1 state
+	ldp	w_2, w_3, [sp, #8]
+	ldr	w_4, [sp, #16]
+
+	add	a, a, w_0		// add the pre-block state to the new a-e
+	add	b, b, w_1
+	add	c, c, w_2
+	add	d, d, w_3
+	add	e, e, w_4
+
+	add	buf, buf, #64
+	subs	len, len, #64
+	bhi	0b			// loop while input bytes remain
+
+	stp	a, b, [ctx, #0]		// write updated SHA1 state
+	stp	c, d, [ctx, #8]
+	str	e, [ctx, #16]
+
+	ldr	x19, [sp, #24+0*8]
+	ldp	x20, x21, [sp, #24+1*8]
+	ldp	x22, x23, [sp, #24+3*8]
+	ldp	x24, x25, [sp, #24+5*8]
+	ldp	x26, x27, [sp, #24+7*8]
+	add	sp, sp, #24+9*8
+
+1:	ret
+END(_libmd_sha1block_scalar)
+
+/*
+ * SHA1 implementation using the SHA1 instruction set extension.
+ */
+
+	.arch_extension sha2	// NOTE(review): presumably the extension name that also enables the sha1* mnemonics - confirm
+
+	// sha1block(SHA1_CTX, buf, len): len is in bytes, only whole 64-byte blocks are hashed
+ENTRY(_libmd_sha1block_sha1)
+	/* ctx, buf, len: same as for sha1block_scalar */
+kaddr	.req	x3		// address of the k1234 constant table
+abcd	.req	v0
+abcd_q	.req	q0		// alias for use with scalar instructions
+abcd_s	.req	s0
+e0	.req	s1
+e0_v	.req	v1		// vector view of e0
+e1	.req	s2
+abcd_saved .req	v3
+e0_saved .req	v4
+tmp0	.req	v5
+tmp1	.req	v6
+msg0	.req	v16
+msg1	.req	v17
+msg2	.req	v18
+msg3	.req	v19
+k0	.req	v20
+k1	.req	v21
+k2	.req	v22
+k3	.req	v23
+
+	ands	len, len, #~63		// take length in multiples of block length
+	beq	1f			// bail out if input empty
+
+	ldr	abcd_q, [ctx, #0]	// state words a-d
+	ldr	e0, [ctx, #16]		// state word e
+
+	adrp	kaddr, k1234
+	add	kaddr, kaddr, #:lo12:k1234
+	ld4r	{k0.4s, k1.4s, k2.4s, k3.4s}, [kaddr]	// replicate each of the four constants across its register
+
+0:	mov	abcd_saved.16b, abcd.16b	// keep the pre-block state for the final addition
+	mov	e0_saved.16b, e0_v.16b
+
+	ld1	{msg0.4s, msg1.4s, msg2.4s, msg3.4s}, [buf], #64	// load one block, post-incrementing buf
+	rev32	msg0.16b, msg0.16b	// byte-reverse every 32-bit message word
+	rev32	msg1.16b, msg1.16b
+	rev32	msg2.16b, msg2.16b
+	rev32	msg3.16b, msg3.16b
+
+	add	tmp0.4s, msg0.4s, k0.4s	// tmp0/tmp1 hold w + k, prepared two round groups ahead
+	add	tmp1.4s, msg1.4s, k0.4s
+
+	/* rounds 0--3 */
+	sha1h	e1, abcd_s		// e for the following group (SHA1 fixed rotate of a)
+	sha1c	abcd_q, e0, tmp0.4s	// four rounds with the choose function
+	add	tmp0.4s, msg2.4s, k0.4s
+	sha1su0	msg0.4s, msg1.4s, msg2.4s	// message schedule update, first half
+
+	/* rounds 4--7 */
+	sha1h	e0, abcd_s
+	sha1c	abcd_q, e1, tmp1.4s
+	add	tmp1.4s, msg3.4s, k0.4s
+	sha1su1	msg0.4s, msg3.4s	// message schedule update, second half
+	sha1su0	msg1.4s, msg2.4s, msg3.4s
+
+	/* rounds 8--11 */
+	sha1h	e1, abcd_s
+	sha1c	abcd_q, e0, tmp0.4s
+	add	tmp0.4s, msg0.4s, k0.4s
+	sha1su1	msg1.4s, msg0.4s
+	sha1su0	msg2.4s, msg3.4s, msg0.4s
+
+	/* rounds 12--15 */
+	sha1h	e0, abcd_s
+	sha1c	abcd_q, e1, tmp1.4s
+	add	tmp1.4s, msg1.4s, k1.4s	// consumed in rounds 20--23, hence k1 already
+	sha1su1	msg2.4s, msg1.4s
+	sha1su0	msg3.4s, msg0.4s, msg1.4s
+
+	/* rounds 16--19 */
+	sha1h	e1, abcd_s
+	sha1c	abcd_q, e0, tmp0.4s
+	add	tmp0.4s, msg2.4s, k1.4s
+	sha1su1	msg3.4s, msg2.4s
+	sha1su0	msg0.4s, msg1.4s, msg2.4s
+
+	/* rounds 20--23 */
+	sha1h	e0, abcd_s
+	sha1p	abcd_q, e1, tmp1.4s	// four rounds with the parity function
+	add	tmp1.4s, msg3.4s, k1.4s
+	sha1su1	msg0.4s, msg3.4s
+	sha1su0	msg1.4s, msg2.4s, msg3.4s
+
+	/* rounds 24--27 */
+	sha1h	e1, abcd_s
+	sha1p	abcd_q, e0, tmp0.4s
+	add	tmp0.4s, msg0.4s, k1.4s
+	sha1su1	msg1.4s, msg0.4s
+	sha1su0	msg2.4s, msg3.4s, msg0.4s
+
+	/* rounds 28--31 */
+	sha1h	e0, abcd_s
+	sha1p	abcd_q, e1, tmp1.4s
+	add	tmp1.4s, msg1.4s, k1.4s
+	sha1su1	msg2.4s, msg1.4s
+	sha1su0	msg3.4s, msg0.4s, msg1.4s
+
+	/* rounds 32--35 */
+	sha1h	e1, abcd_s
+	sha1p	abcd_q, e0, tmp0.4s
+	add	tmp0.4s, msg2.4s, k2.4s
+	sha1su1	msg3.4s, msg2.4s
+	sha1su0	msg0.4s, msg1.4s, msg2.4s
+
+	/* rounds 36--39 */
+	sha1h	e0, abcd_s
+	sha1p	abcd_q, e1, tmp1.4s
+	add	tmp1.4s, msg3.4s, k2.4s
+	sha1su1	msg0.4s, msg3.4s
+	sha1su0	msg1.4s, msg2.4s, msg3.4s
+
+	/* rounds 40--43 */
+	sha1h	e1, abcd_s
+	sha1m	abcd_q, e0, tmp0.4s	// four rounds with the majority function
+	add	tmp0.4s, msg0.4s, k2.4s
+	sha1su1	msg1.4s, msg0.4s
+	sha1su0	msg2.4s, msg3.4s, msg0.4s
+
+	/* rounds 44--47 */
+	sha1h	e0, abcd_s
+	sha1m	abcd_q, e1, tmp1.4s
+	add	tmp1.4s, msg1.4s, k2.4s
+	sha1su1	msg2.4s, msg1.4s
+	sha1su0	msg3.4s, msg0.4s, msg1.4s
+
+	/* rounds 48--51 */
+	sha1h	e1, abcd_s
+	sha1m	abcd_q, e0, tmp0.4s
+	add	tmp0.4s, msg2.4s, k2.4s
+	sha1su1	msg3.4s, msg2.4s
+	sha1su0	msg0.4s, msg1.4s, msg2.4s
+
+	/* rounds 52--55 */
+	sha1h	e0, abcd_s
+	sha1m	abcd_q, e1, tmp1.4s
+	add	tmp1.4s, msg3.4s, k3.4s
+	sha1su1	msg0.4s, msg3.4s
+	sha1su0	msg1.4s, msg2.4s, msg3.4s
+
+	/* rounds 56--59 */
+	sha1h	e1, abcd_s
+	sha1m	abcd_q, e0, tmp0.4s
+	add	tmp0.4s, msg0.4s, k3.4s
+	sha1su1	msg1.4s, msg0.4s
+	sha1su0	msg2.4s, msg3.4s, msg0.4s
+
+	/* rounds 60--63 */
+	sha1h	e0, abcd_s
+	sha1p	abcd_q, e1, tmp1.4s
+	add	tmp1.4s, msg1.4s, k3.4s
+	sha1su1	msg2.4s, msg1.4s
+	sha1su0	msg3.4s, msg0.4s, msg1.4s
+
+	/* rounds 64--67 */
+	sha1h	e1, abcd_s
+	sha1p	abcd_q, e0, tmp0.4s
+	add	tmp0.4s, msg2.4s, k3.4s
+	sha1su1	msg3.4s, msg2.4s
+	sha1su0	msg0.4s, msg1.4s, msg2.4s
+
+	/* rounds 68--71 */
+	sha1h	e0, abcd_s
+	sha1p	abcd_q, e1, tmp1.4s
+	add	tmp1.4s, msg3.4s, k3.4s
+	sha1su1	msg0.4s, msg3.4s
+
+	/* rounds 72--75 */
+	sha1h	e1, abcd_s
+	sha1p	abcd_q, e0, tmp0.4s
+
+	/* rounds 76--79 */
+	sha1h	e0, abcd_s
+	sha1p	abcd_q, e1, tmp1.4s
+
+	add	e0_v.4s, e0_v.4s, e0_saved.4s	// add the pre-block state; only lane 0 (e0) is stored below
+	add	abcd.4s, abcd.4s, abcd_saved.4s
+
+	subs	len, len, #64
+	bhi	0b			// loop while input bytes remain
+
+	str	abcd_q, [ctx, #0]	// write updated state a-d
+	str	e0, [ctx, #16]		// write updated state e
+
+1:	ret
+END(_libmd_sha1block_sha1)
+
+	.section .rodata
+	.balign	16
+k1234:	.4byte	0x5a827999	// same four values as round constants 1-4 of the scalar code
+	.4byte	0x6ed9eba1
+	.4byte	0x8f1bbcdc
+	.4byte	0xca62c1d6
+	.size	k1234, .-k1234
+
+	.section .note.GNU-stack,"",%progbits
diff --git a/lib/libmd/aarch64/sha1dispatch.c b/lib/libmd/aarch64/sha1dispatch.c
new file mode 100644
index 000000000000..e34bf0a1a344
--- /dev/null
+++ b/lib/libmd/aarch64/sha1dispatch.c
@@ -0,0 +1,24 @@
+/*-
+ * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <machine/ifunc.h>
+#include <sha.h>
+#include <sys/auxv.h>
+
+extern void _libmd_sha1block_scalar(SHA1_CTX *, const void *, size_t);
+extern void _libmd_sha1block_sha1(SHA1_CTX *, const void *, size_t);
+
+DEFINE_IFUNC(, void, sha1_block, (SHA1_CTX *, const void *, size_t))	/* resolver: returns the implementation to bind sha1_block to */
+{
+	unsigned long hwcap = 0;	/* stays 0 if elf_aux_info() does not fill it in, which selects the scalar code */
+
+	elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));	/* return value deliberately not checked */
+
+	if (hwcap & HWCAP_SHA1)
+		return (_libmd_sha1block_sha1);	/* SHA1 instructions reported present */
+	else
+		return (_libmd_sha1block_scalar);
+}
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202505142340.54ENedfA000200>