Date: Wed, 14 May 2025 23:40:38 GMT
From: Robert Clausecker <fuz@FreeBSD.org>
To: src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org
Subject: git: 8b4684afcde3 - main - lib/libmd: add optimised SHA1 implementations for amd64
Message-ID: <202505142340.54ENecbI000166@gitrepo.freebsd.org>
The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=8b4684afcde3930eb49490f0b8431c4cb2ad9a46

commit 8b4684afcde3930eb49490f0b8431c4cb2ad9a46
Author:     Robert Clausecker <fuz@FreeBSD.org>
AuthorDate: 2024-05-28 15:20:41 +0000
Commit:     Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2025-05-14 23:39:58 +0000

    lib/libmd: add optimised SHA1 implementations for amd64

    Three implementations are provided: one using just scalar instructions,
    one using AVX2, and one using the SHA instructions (SHANI).  The AVX2
    version uses a complicated multi-block carry scheme described in an
    Intel whitepaper; the code was carefully transcribed from the
    implementation shipped with the Go runtime.

    The performance is quite good.  From my Tiger Lake based NUC:

    old:    16.7s ( 613 MB/s)
    scalar: 14.5s ( 706 MB/s)
    avx2:   10.5s ( 975 MB/s)
    shani:   5.6s (1829 MB/s)

    Reviewed by:    getz
    Obtained from:  https://github.com/golang/go/blob/b0dfcb74651b82123746273bbf6bb9988cd96e18/src/crypto/sha1/sha1block_amd64.s
    Differential Revision:  https://reviews.freebsd.org/D45444
---
 lib/libmd/Makefile             |    3 +
 lib/libmd/amd64/sha1block.S    | 1851 ++++++++++++++++++++++++++++++++++++++++
 lib/libmd/amd64/sha1dispatch.c |   77 ++
 3 files changed, 1931 insertions(+)

diff --git a/lib/libmd/Makefile b/lib/libmd/Makefile
index 427da5b9d68f..547a134fc440 100644
--- a/lib/libmd/Makefile
+++ b/lib/libmd/Makefile
@@ -120,6 +120,9 @@ USE_ASM_SOURCES:=0
 .if exists(${MACHINE_ARCH}/sha1block.S)
 SRCS+= sha1block.S
 CFLAGS+= -DSHA1_ASM
+.if exists(${MACHINE_ARCH}/sha1dispatch.c)
+SRCS+= sha1dispatch.c
+.endif
 .endif
 .if exists(${MACHINE_ARCH}/rmd160.S)
 SRCS+= rmd160.S
diff --git a/lib/libmd/amd64/sha1block.S b/lib/libmd/amd64/sha1block.S
new file mode 100644
index 000000000000..0307dcdece32
--- /dev/null
+++ b/lib/libmd/amd64/sha1block.S
@@ -0,0 +1,1851 @@
+/*-
+ * Copyright (c) 2013 The Go Authors. All rights reserved.
+ * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org>
+ *
+ * Adapted from Go's crypto/sha1/sha1block_amd64.s.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *   * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */ + +#include <machine/asm.h> + +/* + * SHA-1 block routine. See sha1c.c for C equivalent. + * + * There are 80 rounds of 4 types: + * - rounds 0-15 are type 1 and load data (round1 macro). + * - rounds 16-19 are type 1 and do not load data (round1x macro). + * - rounds 20-39 are type 2 and do not load data (round2 macro). + * - rounds 40-59 are type 3 and do not load data (round3 macro). + * - rounds 60-79 are type 4 and do not load data (round4 macro). + * + * Each round loads or shuffles the data, then computes a per-round + * function of b, c, d, and then mixes the result into and rotates the + * five registers a, b, c, d, e holding the intermediate results. + * + * The register rotation is implemented by rotating the arguments to + * the round macros instead of by explicit move instructions. + */ +.macro load index + mov (\index)*4(%rsi), %r10d + bswap %r10d + mov %r10d, (\index)*4(%rsp) +.endm + +.macro shuffle index + mov ((\index )&0xf)*4(%rsp), %r10d + xor ((\index- 3)&0xf)*4(%rsp), %r10d + xor ((\index- 8)&0xf)*4(%rsp), %r10d + xor ((\index-14)&0xf)*4(%rsp), %r10d + rol $1, %r10d + mov %r10d, ((\index)&0xf)*4(%rsp) +.endm + +.macro func1 a, b, c, d, e + mov \d, %r9d + xor \c, %r9d + and \b, %r9d + xor \d, %r9d +.endm + +.macro func2 a, b, c, d, e + mov \b, %r9d + xor \c, %r9d + xor \d, %r9d +.endm + +.macro func3 a, b, c, d, e + mov \b, %r8d + or \c, %r8d + and \d, %r8d + mov \b, %r9d + and \c, %r9d + or %r8d, %r9d +.endm + +.macro func4 a, b, c, d, e + func2 \a, \b, \c, \d, \e +.endm + +.macro mix a, b, c, d, e, const + rol $30, \b + add %r9d, \e + mov \a, %r8d + rol $5, %r8d + lea \const(\e, %r10d, 1), \e + add %r8d, \e +.endm + +.macro round1 a, b, c, d, e, index + load \index + func1 \a, \b, \c, \d, \e + mix \a, \b, \c, \d, \e, 0x5a827999 +.endm + +.macro round1x a, b, c, d, e, index + shuffle \index + func1 \a, \b, \c, \d, \e + mix \a, \b, \c, \d, \e, 0x5a827999 +.endm + +.macro round2 a, b, c, d, e, index + shuffle \index + func2 \a, \b, \c, \d, \e + mix \a, \b, \c, \d, \e, 0x6ed9eba1 +.endm + +.macro round3 a, b, c, d, e, index + shuffle \index + func3 \a, \b, \c, \d, \e + mix \a, \b, \c, \d, \e, 0x8f1bbcdc +.endm + +.macro round4 a, b, c, d, e, index + shuffle \index + func4 \a, \b, \c, \d, \e + mix \a, \b, \c, \d, \e, 0xca62c1d6 +.endm + + // sha1block(SHA1_CTX, buf, len) +ENTRY(_libmd_sha1block_scalar) + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + push %rdi // rdi: SHA1_CTX + sub $64+8, %rsp // 64 bytes for round keys + // plus alignment + + mov %rdi, %rbp + // rsi: buf + and $~63, %rdx // rdx: length in blocks + lea (%rsi, %rdx, 1), %rdi // rdi: end pointer + mov (%rbp), %eax // c->h0 + mov 4(%rbp), %ebx // c->h1 + mov 8(%rbp), %ecx // c->h2 + mov 12(%rbp), %edx // c->h3 + mov 16(%rbp), %ebp // c->h4 + + cmp %rsi, %rdi // any data to process? 
+ je .Lend + +.Lloop: mov %eax, %r11d + mov %ebx, %r12d + mov %ecx, %r13d + mov %edx, %r14d + mov %ebp, %r15d + + round1 %eax, %ebx, %ecx, %edx, %ebp, 0 + round1 %ebp, %eax, %ebx, %ecx, %edx, 1 + round1 %edx, %ebp, %eax, %ebx, %ecx, 2 + round1 %ecx, %edx, %ebp, %eax, %ebx, 3 + round1 %ebx, %ecx, %edx, %ebp, %eax, 4 + + round1 %eax, %ebx, %ecx, %edx, %ebp, 5 + round1 %ebp, %eax, %ebx, %ecx, %edx, 6 + round1 %edx, %ebp, %eax, %ebx, %ecx, 7 + round1 %ecx, %edx, %ebp, %eax, %ebx, 8 + round1 %ebx, %ecx, %edx, %ebp, %eax, 9 + + round1 %eax, %ebx, %ecx, %edx, %ebp, 10 + round1 %ebp, %eax, %ebx, %ecx, %edx, 11 + round1 %edx, %ebp, %eax, %ebx, %ecx, 12 + round1 %ecx, %edx, %ebp, %eax, %ebx, 13 + round1 %ebx, %ecx, %edx, %ebp, %eax, 14 + + round1 %eax, %ebx, %ecx, %edx, %ebp, 15 + round1x %ebp, %eax, %ebx, %ecx, %edx, 16 + round1x %edx, %ebp, %eax, %ebx, %ecx, 17 + round1x %ecx, %edx, %ebp, %eax, %ebx, 18 + round1x %ebx, %ecx, %edx, %ebp, %eax, 19 + + round2 %eax, %ebx, %ecx, %edx, %ebp, 20 + round2 %ebp, %eax, %ebx, %ecx, %edx, 21 + round2 %edx, %ebp, %eax, %ebx, %ecx, 22 + round2 %ecx, %edx, %ebp, %eax, %ebx, 23 + round2 %ebx, %ecx, %edx, %ebp, %eax, 24 + + round2 %eax, %ebx, %ecx, %edx, %ebp, 25 + round2 %ebp, %eax, %ebx, %ecx, %edx, 26 + round2 %edx, %ebp, %eax, %ebx, %ecx, 27 + round2 %ecx, %edx, %ebp, %eax, %ebx, 28 + round2 %ebx, %ecx, %edx, %ebp, %eax, 29 + + round2 %eax, %ebx, %ecx, %edx, %ebp, 30 + round2 %ebp, %eax, %ebx, %ecx, %edx, 31 + round2 %edx, %ebp, %eax, %ebx, %ecx, 32 + round2 %ecx, %edx, %ebp, %eax, %ebx, 33 + round2 %ebx, %ecx, %edx, %ebp, %eax, 34 + + round2 %eax, %ebx, %ecx, %edx, %ebp, 35 + round2 %ebp, %eax, %ebx, %ecx, %edx, 36 + round2 %edx, %ebp, %eax, %ebx, %ecx, 37 + round2 %ecx, %edx, %ebp, %eax, %ebx, 38 + round2 %ebx, %ecx, %edx, %ebp, %eax, 39 + + round3 %eax, %ebx, %ecx, %edx, %ebp, 40 + round3 %ebp, %eax, %ebx, %ecx, %edx, 41 + round3 %edx, %ebp, %eax, %ebx, %ecx, 42 + round3 %ecx, %edx, %ebp, %eax, %ebx, 43 + round3 %ebx, %ecx, %edx, %ebp, %eax, 44 + + round3 %eax, %ebx, %ecx, %edx, %ebp, 45 + round3 %ebp, %eax, %ebx, %ecx, %edx, 46 + round3 %edx, %ebp, %eax, %ebx, %ecx, 47 + round3 %ecx, %edx, %ebp, %eax, %ebx, 48 + round3 %ebx, %ecx, %edx, %ebp, %eax, 49 + + round3 %eax, %ebx, %ecx, %edx, %ebp, 50 + round3 %ebp, %eax, %ebx, %ecx, %edx, 51 + round3 %edx, %ebp, %eax, %ebx, %ecx, 52 + round3 %ecx, %edx, %ebp, %eax, %ebx, 53 + round3 %ebx, %ecx, %edx, %ebp, %eax, 54 + + round3 %eax, %ebx, %ecx, %edx, %ebp, 55 + round3 %ebp, %eax, %ebx, %ecx, %edx, 56 + round3 %edx, %ebp, %eax, %ebx, %ecx, 57 + round3 %ecx, %edx, %ebp, %eax, %ebx, 58 + round3 %ebx, %ecx, %edx, %ebp, %eax, 59 + + round4 %eax, %ebx, %ecx, %edx, %ebp, 60 + round4 %ebp, %eax, %ebx, %ecx, %edx, 61 + round4 %edx, %ebp, %eax, %ebx, %ecx, 62 + round4 %ecx, %edx, %ebp, %eax, %ebx, 63 + round4 %ebx, %ecx, %edx, %ebp, %eax, 64 + + round4 %eax, %ebx, %ecx, %edx, %ebp, 65 + round4 %ebp, %eax, %ebx, %ecx, %edx, 66 + round4 %edx, %ebp, %eax, %ebx, %ecx, 67 + round4 %ecx, %edx, %ebp, %eax, %ebx, 68 + round4 %ebx, %ecx, %edx, %ebp, %eax, 69 + + round4 %eax, %ebx, %ecx, %edx, %ebp, 70 + round4 %ebp, %eax, %ebx, %ecx, %edx, 71 + round4 %edx, %ebp, %eax, %ebx, %ecx, 72 + round4 %ecx, %edx, %ebp, %eax, %ebx, 73 + round4 %ebx, %ecx, %edx, %ebp, %eax, 74 + + round4 %eax, %ebx, %ecx, %edx, %ebp, 75 + round4 %ebp, %eax, %ebx, %ecx, %edx, 76 + round4 %edx, %ebp, %eax, %ebx, %ecx, 77 + round4 %ecx, %edx, %ebp, %eax, %ebx, 78 + round4 %ebx, %ecx, %edx, %ebp, %eax, 79 + + add %r11d, %eax + add %r12d, %ebx + add %r13d, %ecx + 
add %r14d, %edx + add %r15d, %ebp + + add $64, %rsi + cmp %rdi, %rsi + jb .Lloop + +.Lend: add $64+8, %rsp + pop %rdi // SHA1_CTX + mov %eax, (%rdi) + mov %ebx, 4(%rdi) + mov %ecx, 8(%rdi) + mov %edx, 12(%rdi) + mov %ebp, 16(%rdi) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + pop %rbp + ret +END(_libmd_sha1block_scalar) + +/* + * This is the implementation using AVX2, BMI1 and BMI2. It is based on: + * "SHA-1 implementation with Intel(R) AVX2 instruction set extensions" + * From http://software.intel.com/en-us/articles + * (look for improving-the-performance-of-the-secure-hash-algorithm-1) + * This implementation is 2x unrolled, and interleaves vector instructions, + * used to precompute W, with scalar computation of current round + * for optimal scheduling. + */ + + /* trivial helper macros */ +.macro update_hash a, tb, c, d, e + add (%r9), \a + mov \a, (%r9) + add 4(%r9), \tb + mov \tb, 4(%r9) + add 8(%r9), \c + mov \c, 8(%r9) + add 12(%r9), \d + mov \d, 12(%r9) + add 16(%r9), \e + mov \e, 16(%r9) +.endm + + /* help macros for recalc, which does precomputations */ +.macro precalc0 offset + vmovdqu \offset(%r10), %xmm0 +.endm + +.macro precalc1 offset + vinserti128 $1, \offset(%r13), %ymm0, %ymm0 +.endm + +.macro precalc2 yreg + vpshufb %ymm10, %ymm0, \yreg +.endm + +.macro precalc4 yreg, k_offset + vpaddd \k_offset(%r8), \yreg, %ymm0 +.endm + +.macro precalc7 offset + vmovdqu %ymm0, (\offset)*2(%r14) +.endm + +/* + * Message scheduling pre-compute for rounds 0-15 + * r13 is a pointer to the even 64-byte block + * r10 is a pointer to the odd 64-byte block + * r14 is a pointer to the temp buffer + * xmm0 is used as a temp register + * yreg is clobbered as part of the computation + * offset chooses a 16 byte chunk within a block + * r8 is a pointer to the constants block + * k_offset chooses K constants relevant to this round + * xmm10 holds the swap mask + */ +.macro precalc00_15 offset, yreg + precalc0 \offset + precalc1 \offset + precalc2 \yreg + precalc4 \yreg, 0 + precalc7 \offset +.endm + + /* helper macros for precalc16_31 */ +.macro precalc16 reg_sub16, reg_sub12, reg_sub4, reg + vpalignr $8, \reg_sub16, \reg_sub12, \reg // w[i - 14] + vpsrldq $4, \reg_sub4, %ymm0 // w[i - 3] +.endm + +.macro precalc17 reg_sub16, reg_sub8, reg + vpxor \reg_sub8, \reg, \reg + vpxor \reg_sub16, %ymm0, %ymm0 +.endm + +.macro precalc18 reg + vpxor %ymm0, \reg, \reg + vpslldq $12, \reg, %ymm9 +.endm + +.macro precalc19 reg + vpslld $1, \reg, %ymm0 + vpsrld $31, \reg, \reg + .endm + +.macro precalc20 reg + vpor \reg, %ymm0, %ymm0 + vpslld $2, %ymm9, \reg +.endm + +.macro precalc21 reg + vpsrld $30, %ymm9, %ymm9 + vpxor \reg, %ymm0, %ymm0 +.endm + +.macro precalc23 reg, k_offset, offset + vpxor %ymm9, %ymm0, \reg + vpaddd \k_offset(%r8), \reg, %ymm0 + vmovdqu %ymm0, (\offset)(%r14) +.endm + +/* + * Message scheduling pre-compute for rounds 16-31 + * calculating last 32 w[i] values in 8 XMM registers + * pre-calculate K+w[i] values and store to mem + * for later load by ALU add instruction. + * "brute force" vectorization for rounds 16-31 only + * due to w[i]->w[i-3] dependency. 
+ + clobbers 5 input ymm registers REG_SUB* + * uses xmm0 and xmm9 as temp registers + * As always, r8 is a pointer to constants block + * and r14 is a pointer to temp buffer + */ +.macro precalc16_31 reg, reg_sub4, reg_sub8, reg_sub12, reg_sub16, k_offset, offset + precalc16 \reg_sub16, \reg_sub12, \reg_sub4, \reg + precalc17 \reg_sub16, \reg_sub8, \reg + precalc18 \reg + precalc19 \reg + precalc20 \reg + precalc21 \reg + precalc23 \reg, \k_offset, \offset +.endm + + /* helper macros for precalc_32_79 */ +.macro precalc32 reg_sub8, reg_sub4 + vpalignr $8, \reg_sub8, \reg_sub4, %ymm0 +.endm + +.macro precalc33 reg_sub28, reg + vpxor \reg_sub28, \reg, \reg +.endm + +.macro precalc34 reg_sub16 + vpxor \reg_sub16, %ymm0, %ymm0 +.endm + +.macro precalc35 reg + vpxor %ymm0, \reg, \reg +.endm + +.macro precalc36 reg + vpslld $2, \reg, %ymm0 +.endm + +.macro precalc37 reg + vpsrld $30, \reg, \reg + vpor \reg, %ymm0, \reg +.endm + +.macro precalc39 reg, k_offset, offset + vpaddd \k_offset(%r8), \reg, %ymm0 + vmovdqu %ymm0, \offset(%r14) +.endm + +.macro precalc32_79 reg, reg_sub4, reg_sub8, reg_sub16, reg_sub28, k_offset, offset + precalc32 \reg_sub8, \reg_sub4 + precalc33 \reg_sub28, \reg + precalc34 \reg_sub16 + precalc35 \reg + precalc36 \reg + precalc37 \reg + precalc39 \reg, \k_offset, \offset +.endm + +.macro precalc + precalc00_15 0x00, %ymm15 + precalc00_15 0x10, %ymm14 + precalc00_15 0x20, %ymm13 + precalc00_15 0x30, %ymm12 + precalc16_31 %ymm8, %ymm12, %ymm13, %ymm14, %ymm15, 0x00, 0x080 + precalc16_31 %ymm7, %ymm8, %ymm12, %ymm13, %ymm14, 0x20, 0x0a0 + precalc16_31 %ymm5, %ymm7, %ymm8, %ymm12, %ymm13, 0x20, 0x0c0 + precalc16_31 %ymm3, %ymm5, %ymm7, %ymm8, %ymm12, 0x20, 0x0e0 + precalc32_79 %ymm15, %ymm3, %ymm5, %ymm8, %ymm14, 0x20, 0x100 + precalc32_79 %ymm14, %ymm15, %ymm3, %ymm7, %ymm13, 0x20, 0x120 + precalc32_79 %ymm13, %ymm14, %ymm15, %ymm5, %ymm12, 0x40, 0x140 + precalc32_79 %ymm12, %ymm13, %ymm14, %ymm3, %ymm8, 0x40, 0x160 + precalc32_79 %ymm8, %ymm12, %ymm13, %ymm15, %ymm7, 0x40, 0x180 + precalc32_79 %ymm7, %ymm8, %ymm12, %ymm14, %ymm5, 0x40, 0x1a0 + precalc32_79 %ymm5, %ymm7, %ymm8, %ymm13, %ymm3, 0x40, 0x1c0 + precalc32_79 %ymm3, %ymm5, %ymm7, %ymm12, %ymm15, 0x60, 0x1e0 + precalc32_79 %ymm15, %ymm3, %ymm5, %ymm8, %ymm14, 0x60, 0x200 + precalc32_79 %ymm14, %ymm15, %ymm3, %ymm7, %ymm13, 0x60, 0x220 + precalc32_79 %ymm13, %ymm14, %ymm15, %ymm5, %ymm12, 0x60, 0x240 + precalc32_79 %ymm12, %ymm13, %ymm14, %ymm3, %ymm8, 0x60, 0x260 +.endm + +/* + * Macros calculating individual rounds have general form + * calc_round_pre + precalc_round + calc_round_post + * calc_round_{pre,post} macros follow + */ +.macro calc_f1_pre offset, reg_a, reg_b, reg_c, reg_e + add \offset(%r15), \reg_e + andn \reg_c, \reg_a, %ebp + add \reg_b, \reg_e // add F from the previous round + rorx $0x1b, \reg_a, %r12d + rorx $2, \reg_a, \reg_b // for the next round +.endm + +/* + * Calculate F for the next round + */ +.macro calc_f1_post reg_a, reg_b, reg_e + and \reg_b, \reg_a // b & c + xor %ebp, \reg_a // F1 = (b&c) ^ (~b&d) + add %r12d, \reg_e +.endm + +/* + * Registers are cyclically rotated: + * edx -> eax -> edi -> esi -> ebx -> ecx + */ +.macro calc0 + mov %esi, %ebx // precalculate first round + rorx $2, %esi, %esi + andn %eax, %ebx, %ebp + and %edi, %ebx + xor %ebp, %ebx + calc_f1_pre 0x0, %ecx, %ebx, %edi, %edx + precalc0 0x80 + calc_f1_post %ecx, %esi, %edx +.endm + +.macro calc1 + calc_f1_pre 0x4, %edx, %ecx, %esi, %eax + precalc1 0x80 + calc_f1_post %edx, %ebx, %eax +.endm + +.macro calc2 + 
calc_f1_pre 0x8, %eax, %edx, %ebx, %edi + precalc2 %ymm15 + calc_f1_post %eax, %ecx, %edi +.endm + +.macro calc3 + calc_f1_pre 0xc, %edi, %eax, %ecx, %esi + calc_f1_post %edi, %edx, %esi +.endm + +.macro calc4 + calc_f1_pre 0x20, %esi, %edi, %edx, %ebx + precalc4 %ymm15, 0x0 + calc_f1_post %esi, %eax, %ebx +.endm + +.macro calc5 + calc_f1_pre 0x24, %ebx, %esi, %eax, %ecx + calc_f1_post %ebx, %edi, %ecx +.endm + +.macro calc6 + calc_f1_pre 0x28, %ecx, %ebx, %edi, %edx + calc_f1_post %ecx, %esi, %edx +.endm + +.macro calc7 + calc_f1_pre 0x2c, %edx, %ecx, %esi, %eax + precalc7 0x0 + calc_f1_post %edx, %ebx, %eax +.endm + +.macro calc8 + calc_f1_pre 0x40, %eax, %edx, %ebx, %edi + precalc0 0x90 + calc_f1_post %eax, %ecx, %edi +.endm + +.macro calc9 + calc_f1_pre 0x44, %edi, %eax, %ecx, %esi + precalc1 0x90 + calc_f1_post %edi, %edx, %esi +.endm + +.macro calc10 + calc_f1_pre 0x48, %esi, %edi, %edx, %ebx + precalc2 %ymm14 + calc_f1_post %esi, %eax, %ebx +.endm + +.macro calc11 + calc_f1_pre 0x4c, %ebx, %esi, %eax, %ecx + calc_f1_post %ebx, %edi, %ecx +.endm + +.macro calc12 + calc_f1_pre 0x60, %ecx, %ebx, %edi, %edx + precalc4 %ymm14, 0 + calc_f1_post %ecx, %esi, %edx +.endm + +.macro calc13 + calc_f1_pre 0x64, %edx, %ecx, %esi, %eax + calc_f1_post %edx, %ebx, %eax +.endm + +.macro calc14 + calc_f1_pre 0x68, %eax, %edx, %ebx, %edi + calc_f1_post %eax, %ecx, %edi +.endm + +.macro calc15 + calc_f1_pre 0x6c, %edi, %eax, %ecx, %esi + precalc7 0x10 + calc_f1_post %edi, %edx, %esi +.endm + +.macro calc16 + calc_f1_pre 0x80, %esi, %edi, %edx, %ebx + precalc0 0xa0 + calc_f1_post %esi, %eax, %ebx +.endm + +.macro calc17 + calc_f1_pre 0x84, %ebx, %esi, %eax, %ecx + precalc1 0xa0 + calc_f1_post %ebx, %edi, %ecx +.endm + +.macro calc18 + calc_f1_pre 0x88, %ecx, %ebx, %edi, %edx + precalc2 %ymm13 + calc_f1_post %ecx, %esi, %edx +.endm + +.macro calc_f2_pre offset, reg_a, reg_b, reg_e + add \offset(%r15), \reg_e + add \reg_b, \reg_e // add F from the previous round + rorx $0x1b, \reg_a, %r12d + rorx $2, \reg_a, \reg_b // for next round +.endm + +.macro calc_f2_post reg_a, reg_b, reg_c, reg_e + xor \reg_b, \reg_a + add %r12d, \reg_e + xor \reg_c, \reg_a +.endm + +.macro calc19 + calc_f2_pre 0x8c, %edx, %ecx, %eax + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc20 + calc_f2_pre 0xa0, %eax, %edx, %edi + precalc4 %ymm13, 0x0 + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc21 + calc_f2_pre 0xa4, %edi, %eax, %esi + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc22 + calc_f2_pre 0xa8, %esi, %edi, %ebx + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc23 + calc_f2_pre 0xac, %ebx, %esi, %ecx + precalc7 0x20 + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc24 + calc_f2_pre 0xc0, %ecx, %ebx, %edx + precalc0 0xb0 + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc25 + calc_f2_pre 0xc4, %edx, %ecx, %eax + precalc1 0xb0 + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc26 + calc_f2_pre 0xc8, %eax, %edx, %edi + precalc2 %ymm12 + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc27 + calc_f2_pre 0xcc, %edi, %eax, %esi + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc28 + calc_f2_pre 0xe0, %esi, %edi, %ebx + precalc4 %ymm12, 0x0 + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc29 + calc_f2_pre 0xe4, %ebx, %esi, %ecx + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc30 + calc_f2_pre 0xe8, %ecx, %ebx, %edx + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc31 + calc_f2_pre 0xec, %edx, %ecx, %eax + precalc7 0x30 + 
calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc32 + calc_f2_pre 0x100, %eax, %edx, %edi + precalc16 %ymm15, %ymm14, %ymm12, %ymm8 + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc33 + calc_f2_pre 0x104, %edi, %eax, %esi + precalc17 %ymm15, %ymm13, %ymm8 + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc34 + calc_f2_pre 0x108, %esi, %edi, %ebx + precalc18 %ymm8 + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc35 + calc_f2_pre 0x10c, %ebx, %esi, %ecx + precalc19 %ymm8 + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc36 + calc_f2_pre 0x120, %ecx, %ebx, %edx + precalc20 %ymm8 + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc37 + calc_f2_pre 0x124, %edx, %ecx, %eax + precalc21 %ymm8 + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc38 + calc_f2_pre 0x128, %eax, %edx, %edi + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc_f3_pre offset, reg_e + add \offset(%r15), \reg_e +.endm + +.macro calc_f3_post reg_a, reg_b, reg_c, reg_e, reg_tb + add \reg_tb, \reg_e // add F from the previous round + mov \reg_b, %ebp + or \reg_a, %ebp + rorx $0x1b, \reg_a, %r12d + rorx $2, \reg_a, \reg_tb + and \reg_c, %ebp // calculate F for the next round + and \reg_b, \reg_a + or %ebp, \reg_a + add %r12d, \reg_e +.endm + +.macro calc39 + calc_f3_pre 0x12c, %esi + precalc23 %ymm8, 0x0, 0x80 + calc_f3_post %edi, %edx, %ecx, %esi, %eax +.endm + +.macro calc40 + calc_f3_pre 0x140, %ebx + precalc16 %ymm14, %ymm13, %ymm8, %ymm7 + calc_f3_post %esi, %eax, %edx, %ebx, %edi +.endm + +.macro calc41 + calc_f3_pre 0x144, %ecx + precalc17 %ymm14, %ymm12, %ymm7 + calc_f3_post %ebx, %edi, %eax, %ecx, %esi +.endm + +.macro calc42 + calc_f3_pre 0x148, %edx + precalc18 %ymm7 + calc_f3_post %ecx, %esi, %edi, %edx, %ebx +.endm + +.macro calc43 + calc_f3_pre 0x14c, %eax + precalc19 %ymm7 + calc_f3_post %edx, %ebx, %esi, %eax, %ecx +.endm + +.macro calc44 + calc_f3_pre 0x160, %edi + precalc20 %ymm7 + calc_f3_post %eax, %ecx, %ebx, %edi, %edx +.endm + +.macro calc45 + calc_f3_pre 0x164, %esi + precalc21 %ymm7 + calc_f3_post %edi, %edx, %ecx, %esi, %eax +.endm + +.macro calc46 + calc_f3_pre 0x168, %ebx + calc_f3_post %esi, %eax, %edx, %ebx, %edi +.endm + +.macro calc47 + calc_f3_pre 0x16c, %ecx + vpxor %ymm9, %ymm0, %ymm7 + vpaddd 0x20(%r8), %ymm7, %ymm0 + vmovdqu %ymm0, 0xa0(%r14) + calc_f3_post %ebx, %edi, %eax, %ecx, %esi +.endm + +.macro calc48 + calc_f3_pre 0x180, %edx + precalc16 %ymm13, %ymm12, %ymm7, %ymm5 + calc_f3_post %ecx, %esi, %edi, %edx, %ebx +.endm + +.macro calc49 + calc_f3_pre 0x184, %eax + precalc17 %ymm13, %ymm8, %ymm5 + calc_f3_post %edx, %ebx, %esi, %eax, %ecx +.endm + +.macro calc50 + calc_f3_pre 0x188, %edi + precalc18 %ymm5 + calc_f3_post %eax, %ecx, %ebx, %edi, %edx +.endm + +.macro calc51 + calc_f3_pre 0x18c, %esi + precalc19 %ymm5 + calc_f3_post %edi, %edx, %ecx, %esi, %eax +.endm + +.macro calc52 + calc_f3_pre 0x1a0, %ebx + precalc20 %ymm5 + calc_f3_post %esi, %eax, %edx, %ebx, %edi +.endm + +.macro calc53 + calc_f3_pre 0x1a4, %ecx + precalc21 %ymm5 + calc_f3_post %ebx, %edi, %eax, %ecx, %esi +.endm + +.macro calc54 + calc_f3_pre 0x1a8, %edx + calc_f3_post %ecx, %esi, %edi, %edx, %ebx +.endm + +.macro calc55 + calc_f3_pre 0x1ac, %eax + precalc23 %ymm5, 0x20, 0xc0 + calc_f3_post %edx, %ebx, %esi, %eax, %ecx +.endm + +.macro calc56 + calc_f3_pre 0x1c0, %edi + precalc16 %ymm12, %ymm8, %ymm5, %ymm3 + calc_f3_post %eax, %ecx, %ebx, %edi, %edx +.endm + +.macro calc57 + calc_f3_pre 0x1c4, %esi + precalc17 %ymm12, %ymm7, %ymm3 + calc_f3_post 
%edi, %edx, %ecx, %esi, %eax +.endm + +.macro calc58 + calc_f3_pre 0x1c8, %ebx + precalc18 %ymm3 + calc_f3_post %esi, %eax, %edx, %ebx, %edi +.endm + +.macro calc59 + calc_f2_pre 0x1cc, %ebx, %esi, %ecx + precalc19 %ymm3 + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc60 + calc_f2_pre 0x1e0, %ecx, %ebx, %edx + precalc20 %ymm3 + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc61 + calc_f2_pre 0x1e4, %edx, %ecx, %eax + precalc21 %ymm3 + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc62 + calc_f2_pre 0x1e8, %eax, %edx, %edi + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc63 + calc_f2_pre 0x1ec, %edi, %eax, %esi + precalc23 %ymm3, 0x20, 0xe0 + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc64 + calc_f2_pre 0x200, %esi, %edi, %ebx + precalc32 %ymm5, %ymm3 + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc65 + calc_f2_pre 0x204, %ebx, %esi, %ecx + precalc33 %ymm14, %ymm15 + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc66 + calc_f2_pre 0x208, %ecx, %ebx, %edx + precalc34 %ymm8 + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc67 + calc_f2_pre 0x20c, %edx, %ecx, %eax + precalc35 %ymm15 + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc68 + calc_f2_pre 0x220, %eax, %edx, %edi + precalc36 %ymm15 + calc_f2_post %eax, %ecx, %ebx, %edi *** 1002 LINES SKIPPED ***
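
For readers without the full file at hand: the scalar routine unrolls the
textbook SHA-1 block function described in the comment near the top of
sha1block.S (80 rounds, four round types, register rotation done by rotating
the round macros' arguments).  A rough C equivalent of one pass might look as
follows; sha1block_c_sketch is a made-up name and this is not the sha1c.c
code the comment refers to, just an illustration of the same structure:

#include <stddef.h>
#include <stdint.h>

static uint32_t
rol32(uint32_t x, int n)
{
	return (x << n) | (x >> (32 - n));
}

void
sha1block_c_sketch(uint32_t h[5], const unsigned char *p, size_t len)
{
	uint32_t w[16];

	for (; len >= 64; p += 64, len -= 64) {
		uint32_t a = h[0], b = h[1], c = h[2], d = h[3], e = h[4];
		int i;

		for (i = 0; i < 80; i++) {
			uint32_t f, k, wi, t;

			if (i < 16)		/* round1: load and byte-swap */
				wi = (uint32_t)p[4 * i] << 24 |
				    (uint32_t)p[4 * i + 1] << 16 |
				    (uint32_t)p[4 * i + 2] << 8 |
				    (uint32_t)p[4 * i + 3];
			else			/* round1x..round4: shuffle */
				wi = rol32(w[(i - 3) & 0xf] ^ w[(i - 8) & 0xf] ^
				    w[(i - 14) & 0xf] ^ w[i & 0xf], 1);
			w[i & 0xf] = wi;

			if (i < 20) {		/* func1: Ch(b, c, d) */
				f = (b & (c ^ d)) ^ d;
				k = 0x5a827999;
			} else if (i < 40) {	/* func2: parity */
				f = b ^ c ^ d;
				k = 0x6ed9eba1;
			} else if (i < 60) {	/* func3: Maj(b, c, d) */
				f = (b & c) | ((b | c) & d);
				k = 0x8f1bbcdc;
			} else {		/* func4: parity again */
				f = b ^ c ^ d;
				k = 0xca62c1d6;
			}

			/* mix: the assembly avoids these five moves by
			 * rotating the macro arguments between rounds */
			t = rol32(a, 5) + f + e + wi + k;
			e = d; d = c; c = rol32(b, 30); b = a; a = t;
		}

		h[0] += a; h[1] += b; h[2] += c; h[3] += d; h[4] += e;
	}
}

The speed of the scalar assembly comes from unrolling all 80 rounds and
rotating the roles of the five registers from one round macro to the next,
so the e = d; d = c; ... shuffle above never turns into move instructions.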
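The sha1dispatch.c glue falls in the part of the diff the archive skipped,
so the following is only a sketch of what selecting between the three entry
points at run time could look like.  The _libmd_sha1block_avx2 and
_libmd_sha1block_shani names and the exact prototype are assumptions here
(only _libmd_sha1block_scalar and the "sha1block(SHA1_CTX, buf, len)"
comment are visible in the quoted diff):

#include <cpuid.h>
#include <stddef.h>

#include <sha.h>			/* SHA1_CTX from libmd */

extern void _libmd_sha1block_scalar(SHA1_CTX *, const void *, size_t);
extern void _libmd_sha1block_avx2(SHA1_CTX *, const void *, size_t);	/* assumed name */
extern void _libmd_sha1block_shani(SHA1_CTX *, const void *, size_t);	/* assumed name */

void (*sha1block_ptr)(SHA1_CTX *, const void *, size_t) =
    _libmd_sha1block_scalar;

void
sha1block_choose(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* structured extended feature flags: CPUID leaf 7, subleaf 0 */
	if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) == 0)
		return;

	if (ebx & (1u << 29))				/* SHA extensions */
		sha1block_ptr = _libmd_sha1block_shani;
	else if ((ebx & (1u << 5)) &&			/* AVX2 */
	    (ebx & (1u << 3)) && (ebx & (1u << 8)))	/* BMI1 and BMI2 */
		/* a real dispatcher should also check OSXSAVE/XCR0
		 * to confirm the OS saves YMM state */
		sha1block_ptr = _libmd_sha1block_avx2;
}

The actual file may well wire this up differently (for instance through an
ifunc resolver); the point is only the preference order: SHANI when the CPU
has it, otherwise AVX2 together with BMI1/BMI2, otherwise the scalar code.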