Date: Wed, 25 Oct 2023 20:06:29 GMT From: Ed Maste <emaste@FreeBSD.org> To: src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-branches@FreeBSD.org Subject: git: 390ab2a4df60 - releng/14.0 - libcrypto: Switch back to the generated assembly in sys/crypto/openssl Message-ID: <202310252006.39PK6Tqu040031@gitrepo.freebsd.org>
next in thread | raw e-mail | index | archive | help
The branch releng/14.0 has been updated by emaste: URL: https://cgit.FreeBSD.org/src/commit/?id=390ab2a4df608f1d645f9399a0c4e8e4830843fb commit 390ab2a4df608f1d645f9399a0c4e8e4830843fb Author: John Baldwin <jhb@FreeBSD.org> AuthorDate: 2023-08-29 21:46:44 +0000 Commit: Ed Maste <emaste@FreeBSD.org> CommitDate: 2023-10-25 19:56:33 +0000 libcrypto: Switch back to the generated assembly in sys/crypto/openssl Reviewed by: markj Differential Revision: https://reviews.freebsd.org/D41569 (cherry picked from commit 47d997021fbc7b662e9507deec1897d514d1224c) (cherry picked from commit bf5069fb6a3fc8fbf08ed23a4fd958af48cf902f) Approved by: re (gjb) --- secure/lib/libcrypto/Makefile | 4 +- .../lib/libcrypto/arch/aarch64/aes-gcm-armv8_64.S | 6390 --------- secure/lib/libcrypto/arch/aarch64/aesv8-armx.S | 3181 ----- secure/lib/libcrypto/arch/aarch64/arm64cpuid.S | 130 - secure/lib/libcrypto/arch/aarch64/armv8-mont.S | 2125 --- secure/lib/libcrypto/arch/aarch64/chacha-armv8.S | 2035 --- .../libcrypto/arch/aarch64/ecp_nistz256-armv8.S | 4243 ------ secure/lib/libcrypto/arch/aarch64/ghashv8-armx.S | 553 - .../lib/libcrypto/arch/aarch64/keccak1600-armv8.S | 1010 -- secure/lib/libcrypto/arch/aarch64/poly1305-armv8.S | 864 -- secure/lib/libcrypto/arch/aarch64/sha1-armv8.S | 1212 -- secure/lib/libcrypto/arch/aarch64/sha256-armv8.S | 2052 --- secure/lib/libcrypto/arch/aarch64/sha512-armv8.S | 1607 --- secure/lib/libcrypto/arch/aarch64/vpaes-armv8.S | 1197 -- secure/lib/libcrypto/arch/amd64/aes-x86_64.S | 2680 ---- secure/lib/libcrypto/arch/amd64/aesni-gcm-x86_64.S | 811 -- secure/lib/libcrypto/arch/amd64/aesni-mb-x86_64.S | 1610 --- .../lib/libcrypto/arch/amd64/aesni-sha1-x86_64.S | 3057 ----- .../lib/libcrypto/arch/amd64/aesni-sha256-x86_64.S | 4457 ------ secure/lib/libcrypto/arch/amd64/aesni-x86_64.S | 4507 ------ secure/lib/libcrypto/arch/amd64/bsaes-x86_64.S | 2619 ---- secure/lib/libcrypto/arch/amd64/chacha-x86_64.S | 2215 --- secure/lib/libcrypto/arch/amd64/cmll-x86_64.S | 1947 --- secure/lib/libcrypto/arch/amd64/e_padlock-x86_64.S | 1059 -- .../lib/libcrypto/arch/amd64/ecp_nistz256-x86_64.S | 7365 ---------- secure/lib/libcrypto/arch/amd64/ghash-x86_64.S | 1875 --- .../lib/libcrypto/arch/amd64/keccak1600-x86_64.S | 546 - secure/lib/libcrypto/arch/amd64/md5-x86_64.S | 705 - secure/lib/libcrypto/arch/amd64/poly1305-x86_64.S | 2090 --- secure/lib/libcrypto/arch/amd64/rc4-md5-x86_64.S | 1303 -- secure/lib/libcrypto/arch/amd64/rc4-x86_64.S | 657 - secure/lib/libcrypto/arch/amd64/rsaz-avx2.S | 1766 --- secure/lib/libcrypto/arch/amd64/rsaz-avx512.S | 902 -- secure/lib/libcrypto/arch/amd64/rsaz-x86_64.S | 2037 --- secure/lib/libcrypto/arch/amd64/sha1-mb-x86_64.S | 7325 ---------- secure/lib/libcrypto/arch/amd64/sha1-x86_64.S | 5472 -------- secure/lib/libcrypto/arch/amd64/sha256-mb-x86_64.S | 8006 ----------- secure/lib/libcrypto/arch/amd64/sha256-x86_64.S | 5478 -------- secure/lib/libcrypto/arch/amd64/sha512-x86_64.S | 5483 -------- secure/lib/libcrypto/arch/amd64/vpaes-x86_64.S | 880 -- secure/lib/libcrypto/arch/amd64/wp-x86_64.S | 901 -- secure/lib/libcrypto/arch/amd64/x25519-x86_64.S | 824 -- secure/lib/libcrypto/arch/amd64/x86_64-gf2m.S | 333 - secure/lib/libcrypto/arch/amd64/x86_64-mont.S | 1261 -- secure/lib/libcrypto/arch/amd64/x86_64-mont5.S | 3625 ----- secure/lib/libcrypto/arch/amd64/x86_64cpuid.S | 513 - secure/lib/libcrypto/arch/arm/aes-armv4.S | 1198 -- secure/lib/libcrypto/arch/arm/aesv8-armx.S | 1088 -- secure/lib/libcrypto/arch/arm/armv4-gf2m.S | 236 - secure/lib/libcrypto/arch/arm/armv4-mont.S | 961 -- secure/lib/libcrypto/arch/arm/armv4cpuid.S | 273 - secure/lib/libcrypto/arch/arm/bsaes-armv7.S | 2561 ---- secure/lib/libcrypto/arch/arm/chacha-armv4.S | 1478 -- secure/lib/libcrypto/arch/arm/ecp_nistz256-armv4.S | 4430 ------ secure/lib/libcrypto/arch/arm/ghash-armv4.S | 565 - secure/lib/libcrypto/arch/arm/ghashv8-armx.S | 244 - secure/lib/libcrypto/arch/arm/keccak1600-armv4.S | 2694 ---- secure/lib/libcrypto/arch/arm/poly1305-armv4.S | 1169 -- secure/lib/libcrypto/arch/arm/sha1-armv4-large.S | 1499 -- secure/lib/libcrypto/arch/arm/sha256-armv4.S | 2823 ---- secure/lib/libcrypto/arch/arm/sha512-armv4.S | 1877 --- secure/lib/libcrypto/arch/i386/aes-586.S | 6644 --------- secure/lib/libcrypto/arch/i386/aesni-x86.S | 6732 --------- secure/lib/libcrypto/arch/i386/bf-586.S | 1928 --- secure/lib/libcrypto/arch/i386/bn-586.S | 3157 ----- secure/lib/libcrypto/arch/i386/cast-586.S | 2002 --- secure/lib/libcrypto/arch/i386/chacha-x86.S | 2084 --- secure/lib/libcrypto/arch/i386/cmll-x86.S | 4896 ------- secure/lib/libcrypto/arch/i386/co-586.S | 2584 ---- secure/lib/libcrypto/arch/i386/crypt586.S | 1800 --- secure/lib/libcrypto/arch/i386/des-586.S | 3932 ------ secure/lib/libcrypto/arch/i386/e_padlock-x86.S | 2300 ---- secure/lib/libcrypto/arch/i386/ecp_nistz256-x86.S | 10584 -------------- secure/lib/libcrypto/arch/i386/ghash-x86.S | 2636 ---- secure/lib/libcrypto/arch/i386/md5-586.S | 1404 -- secure/lib/libcrypto/arch/i386/poly1305-x86.S | 3938 ------ secure/lib/libcrypto/arch/i386/rc4-586.S | 819 -- secure/lib/libcrypto/arch/i386/rc5-586.S | 1264 -- secure/lib/libcrypto/arch/i386/rmd-586.S | 3976 ------ secure/lib/libcrypto/arch/i386/sha1-586.S | 8016 ----------- secure/lib/libcrypto/arch/i386/sha256-586.S | 13612 ------------------- secure/lib/libcrypto/arch/i386/sha512-586.S | 5704 -------- secure/lib/libcrypto/arch/i386/vpaes-x86.S | 1488 -- secure/lib/libcrypto/arch/i386/wp-mmx.S | 2260 --- secure/lib/libcrypto/arch/i386/x86-gf2m.S | 755 - secure/lib/libcrypto/arch/i386/x86-mont.S | 995 -- secure/lib/libcrypto/arch/i386/x86cpuid.S | 1217 -- secure/lib/libcrypto/arch/powerpc/aes-ppc.S | 1561 --- secure/lib/libcrypto/arch/powerpc/aesp8-ppc.S | 3642 ----- secure/lib/libcrypto/arch/powerpc/bn-ppc.S | 1855 --- secure/lib/libcrypto/arch/powerpc/chacha-ppc.S | 1492 -- secure/lib/libcrypto/arch/powerpc/ghashp8-ppc.S | 569 - secure/lib/libcrypto/arch/powerpc/poly1305-ppc.S | 1301 -- secure/lib/libcrypto/arch/powerpc/poly1305-ppcfp.S | 586 - secure/lib/libcrypto/arch/powerpc/ppc-mont.S | 1787 --- secure/lib/libcrypto/arch/powerpc/ppc.S | 1855 --- secure/lib/libcrypto/arch/powerpc/ppccpuid.S | 356 - secure/lib/libcrypto/arch/powerpc/sha1-ppc.S | 1118 -- secure/lib/libcrypto/arch/powerpc/sha256-ppc.S | 1321 -- secure/lib/libcrypto/arch/powerpc/sha256p8-ppc.S | 735 - secure/lib/libcrypto/arch/powerpc/sha512-ppc.S | 3071 ----- secure/lib/libcrypto/arch/powerpc/sha512p8-ppc.S | 833 -- secure/lib/libcrypto/arch/powerpc/vpaes-ppc.S | 1468 -- secure/lib/libcrypto/arch/powerpc64/aes-ppc.S | 1533 --- secure/lib/libcrypto/arch/powerpc64/aesp8-ppc.S | 3659 ----- secure/lib/libcrypto/arch/powerpc64/bn-ppc.S | 1876 --- secure/lib/libcrypto/arch/powerpc64/chacha-ppc.S | 1499 -- .../libcrypto/arch/powerpc64/ecp_nistp521-ppc64.S | 354 - .../libcrypto/arch/powerpc64/ecp_nistz256-ppc64.S | 4854 ------- secure/lib/libcrypto/arch/powerpc64/ghashp8-ppc.S | 576 - .../libcrypto/arch/powerpc64/keccak1600-ppc64.S | 670 - secure/lib/libcrypto/arch/powerpc64/poly1305-ppc.S | 1142 -- .../lib/libcrypto/arch/powerpc64/poly1305-ppcfp.S | 596 - secure/lib/libcrypto/arch/powerpc64/ppc-mont.S | 1790 --- secure/lib/libcrypto/arch/powerpc64/ppc.S | 1876 --- secure/lib/libcrypto/arch/powerpc64/ppccpuid.S | 387 - secure/lib/libcrypto/arch/powerpc64/sha1-ppc.S | 1121 -- secure/lib/libcrypto/arch/powerpc64/sha256-ppc.S | 1324 -- secure/lib/libcrypto/arch/powerpc64/sha256p8-ppc.S | 738 - secure/lib/libcrypto/arch/powerpc64/sha512-ppc.S | 1420 -- secure/lib/libcrypto/arch/powerpc64/sha512p8-ppc.S | 836 -- secure/lib/libcrypto/arch/powerpc64/vpaes-ppc.S | 1479 -- secure/lib/libcrypto/arch/powerpc64/x25519-ppc64.S | 349 - secure/lib/libcrypto/arch/powerpc64le/aes-ppc.S | 1581 --- secure/lib/libcrypto/arch/powerpc64le/aesp8-ppc.S | 3659 ----- secure/lib/libcrypto/arch/powerpc64le/bn-ppc.S | 1876 --- secure/lib/libcrypto/arch/powerpc64le/chacha-ppc.S | 1371 -- .../arch/powerpc64le/ecp_nistp521-ppc64.S | 354 - .../arch/powerpc64le/ecp_nistz256-ppc64.S | 4854 ------- .../lib/libcrypto/arch/powerpc64le/ghashp8-ppc.S | 576 - .../libcrypto/arch/powerpc64le/keccak1600-ppc64.S | 670 - .../lib/libcrypto/arch/powerpc64le/poly1305-ppc.S | 1128 -- .../libcrypto/arch/powerpc64le/poly1305-ppcfp.S | 591 - secure/lib/libcrypto/arch/powerpc64le/ppc-mont.S | 1790 --- secure/lib/libcrypto/arch/powerpc64le/ppc.S | 1876 --- secure/lib/libcrypto/arch/powerpc64le/ppccpuid.S | 387 - secure/lib/libcrypto/arch/powerpc64le/sha1-ppc.S | 1169 -- secure/lib/libcrypto/arch/powerpc64le/sha256-ppc.S | 1372 -- .../lib/libcrypto/arch/powerpc64le/sha256p8-ppc.S | 746 - secure/lib/libcrypto/arch/powerpc64le/sha512-ppc.S | 1516 --- .../lib/libcrypto/arch/powerpc64le/sha512p8-ppc.S | 848 -- secure/lib/libcrypto/arch/powerpc64le/vpaes-ppc.S | 1479 -- .../lib/libcrypto/arch/powerpc64le/x25519-ppc64.S | 349 - secure/lib/libcrypto/engines/padlock/Makefile | 2 +- secure/lib/libcrypto/modules/fips/Makefile | 4 +- 145 files changed, 5 insertions(+), 310557 deletions(-) diff --git a/secure/lib/libcrypto/Makefile b/secure/lib/libcrypto/Makefile index acf615ac2a8f..dc701d90451e 100644 --- a/secure/lib/libcrypto/Makefile +++ b/secure/lib/libcrypto/Makefile @@ -620,12 +620,12 @@ buildasm cleanasm: PICFLAG+= -DOPENSSL_PIC .if defined(ASM_${MACHINE_CPUARCH}) -.PATH: ${SRCTOP}/secure/lib/libcrypto/arch/${MACHINE_CPUARCH} +.PATH: ${SRCTOP}/sys/crypto/openssl/${MACHINE_CPUARCH} .if defined(ASM_amd64) .PATH: ${LCRYPTO_SRC}/crypto/bn/asm .endif .elif defined(ASM_${MACHINE_ARCH}) -.PATH: ${SRCTOP}/secure/lib/libcrypto/arch/${MACHINE_ARCH} +.PATH: ${SRCTOP}/sys/crypto/openssl/${MACHINE_ARCH} .endif .PATH: ${LCRYPTO_SRC}/crypto \ diff --git a/secure/lib/libcrypto/arch/aarch64/aes-gcm-armv8_64.S b/secure/lib/libcrypto/arch/aarch64/aes-gcm-armv8_64.S deleted file mode 100644 index eb85dbc9f996..000000000000 --- a/secure/lib/libcrypto/arch/aarch64/aes-gcm-armv8_64.S +++ /dev/null @@ -1,6390 +0,0 @@ -/* Do not modify. This file is auto-generated from aes-gcm-armv8_64.pl. */ -#include "arm_arch.h" - -#if __ARM_MAX_ARCH__>=8 -.arch armv8-a+crypto -.text -.globl aes_gcm_enc_128_kernel -.type aes_gcm_enc_128_kernel,%function -.align 4 -aes_gcm_enc_128_kernel: - cbz x1, .L128_enc_ret - stp x19, x20, [sp, #-112]! - mov x16, x4 - mov x8, x5 - stp x21, x22, [sp, #16] - stp x23, x24, [sp, #32] - stp d8, d9, [sp, #48] - stp d10, d11, [sp, #64] - stp d12, d13, [sp, #80] - stp d14, d15, [sp, #96] - - ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 -#ifdef __AARCH64EB__ - rev x10, x10 - rev x11, x11 -#endif - ldp x13, x14, [x8, #160] //load rk10 -#ifdef __AARCH64EB__ - ror x13, x13, #32 - ror x14, x14, #32 -#endif - ld1 {v11.16b}, [x3] - ext v11.16b, v11.16b, v11.16b, #8 - rev64 v11.16b, v11.16b - lsr x5, x1, #3 //byte_len - mov x15, x5 - - ld1 {v18.4s}, [x8], #16 //load rk0 - add x4, x0, x1, lsr #3 //end_input_ptr - sub x5, x5, #1 //byte_len - 1 - - lsr x12, x11, #32 - ldr q15, [x3, #112] //load h4l | h4h -#ifndef __AARCH64EB__ - ext v15.16b, v15.16b, v15.16b, #8 -#endif - fmov d1, x10 //CTR block 1 - rev w12, w12 //rev_ctr32 - - add w12, w12, #1 //increment rev_ctr32 - orr w11, w11, w11 - ld1 {v19.4s}, [x8], #16 //load rk1 - - rev w9, w12 //CTR block 1 - add w12, w12, #1 //CTR block 1 - fmov d3, x10 //CTR block 3 - - orr x9, x11, x9, lsl #32 //CTR block 1 - ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible - - fmov v1.d[1], x9 //CTR block 1 - rev w9, w12 //CTR block 2 - - fmov d2, x10 //CTR block 2 - orr x9, x11, x9, lsl #32 //CTR block 2 - add w12, w12, #1 //CTR block 2 - - fmov v2.d[1], x9 //CTR block 2 - rev w9, w12 //CTR block 3 - - orr x9, x11, x9, lsl #32 //CTR block 3 - ld1 {v20.4s}, [x8], #16 //load rk2 - - add w12, w12, #1 //CTR block 3 - fmov v3.d[1], x9 //CTR block 3 - - ldr q14, [x3, #80] //load h3l | h3h -#ifndef __AARCH64EB__ - ext v14.16b, v14.16b, v14.16b, #8 -#endif - aese v1.16b, v18.16b - aesmc v1.16b, v1.16b //AES block 1 - round 0 - ld1 {v21.4s}, [x8], #16 //load rk3 - - aese v2.16b, v18.16b - aesmc v2.16b, v2.16b //AES block 2 - round 0 - ldr q12, [x3, #32] //load h1l | h1h -#ifndef __AARCH64EB__ - ext v12.16b, v12.16b, v12.16b, #8 -#endif - - aese v0.16b, v18.16b - aesmc v0.16b, v0.16b //AES block 0 - round 0 - ld1 {v22.4s}, [x8], #16 //load rk4 - - aese v3.16b, v18.16b - aesmc v3.16b, v3.16b //AES block 3 - round 0 - ld1 {v23.4s}, [x8], #16 //load rk5 - - aese v2.16b, v19.16b - aesmc v2.16b, v2.16b //AES block 2 - round 1 - trn2 v17.2d, v14.2d, v15.2d //h4l | h3l - - aese v0.16b, v19.16b - aesmc v0.16b, v0.16b //AES block 0 - round 1 - ld1 {v24.4s}, [x8], #16 //load rk6 - - aese v1.16b, v19.16b - aesmc v1.16b, v1.16b //AES block 1 - round 1 - ld1 {v25.4s}, [x8], #16 //load rk7 - - aese v3.16b, v19.16b - aesmc v3.16b, v3.16b //AES block 3 - round 1 - trn1 v9.2d, v14.2d, v15.2d //h4h | h3h - - aese v0.16b, v20.16b - aesmc v0.16b, v0.16b //AES block 0 - round 2 - ld1 {v26.4s}, [x8], #16 //load rk8 - - aese v1.16b, v20.16b - aesmc v1.16b, v1.16b //AES block 1 - round 2 - ldr q13, [x3, #64] //load h2l | h2h -#ifndef __AARCH64EB__ - ext v13.16b, v13.16b, v13.16b, #8 -#endif - - aese v3.16b, v20.16b - aesmc v3.16b, v3.16b //AES block 3 - round 2 - - aese v2.16b, v20.16b - aesmc v2.16b, v2.16b //AES block 2 - round 2 - eor v17.16b, v17.16b, v9.16b //h4k | h3k - - aese v0.16b, v21.16b - aesmc v0.16b, v0.16b //AES block 0 - round 3 - - aese v1.16b, v21.16b - aesmc v1.16b, v1.16b //AES block 1 - round 3 - - aese v2.16b, v21.16b - aesmc v2.16b, v2.16b //AES block 2 - round 3 - ld1 {v27.4s}, [x8], #16 //load rk9 - - aese v3.16b, v21.16b - aesmc v3.16b, v3.16b //AES block 3 - round 3 - - and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) - trn2 v16.2d, v12.2d, v13.2d //h2l | h1l - - aese v3.16b, v22.16b - aesmc v3.16b, v3.16b //AES block 3 - round 4 - add x5, x5, x0 - - aese v2.16b, v22.16b - aesmc v2.16b, v2.16b //AES block 2 - round 4 - cmp x0, x5 //check if we have <= 4 blocks - - aese v0.16b, v22.16b - aesmc v0.16b, v0.16b //AES block 0 - round 4 - - aese v3.16b, v23.16b - aesmc v3.16b, v3.16b //AES block 3 - round 5 - - aese v2.16b, v23.16b - aesmc v2.16b, v2.16b //AES block 2 - round 5 - - aese v0.16b, v23.16b - aesmc v0.16b, v0.16b //AES block 0 - round 5 - - aese v3.16b, v24.16b - aesmc v3.16b, v3.16b //AES block 3 - round 6 - - aese v1.16b, v22.16b - aesmc v1.16b, v1.16b //AES block 1 - round 4 - - aese v2.16b, v24.16b - aesmc v2.16b, v2.16b //AES block 2 - round 6 - trn1 v8.2d, v12.2d, v13.2d //h2h | h1h - - aese v0.16b, v24.16b - aesmc v0.16b, v0.16b //AES block 0 - round 6 - - aese v1.16b, v23.16b - aesmc v1.16b, v1.16b //AES block 1 - round 5 - - aese v3.16b, v25.16b - aesmc v3.16b, v3.16b //AES block 3 - round 7 - - aese v0.16b, v25.16b - aesmc v0.16b, v0.16b //AES block 0 - round 7 - - aese v1.16b, v24.16b - aesmc v1.16b, v1.16b //AES block 1 - round 6 - - aese v2.16b, v25.16b - aesmc v2.16b, v2.16b //AES block 2 - round 7 - - aese v0.16b, v26.16b - aesmc v0.16b, v0.16b //AES block 0 - round 8 - - aese v1.16b, v25.16b - aesmc v1.16b, v1.16b //AES block 1 - round 7 - - aese v2.16b, v26.16b - aesmc v2.16b, v2.16b //AES block 2 - round 8 - - aese v3.16b, v26.16b - aesmc v3.16b, v3.16b //AES block 3 - round 8 - - aese v1.16b, v26.16b - aesmc v1.16b, v1.16b //AES block 1 - round 8 - - aese v2.16b, v27.16b //AES block 2 - round 9 - - aese v0.16b, v27.16b //AES block 0 - round 9 - - eor v16.16b, v16.16b, v8.16b //h2k | h1k - - aese v1.16b, v27.16b //AES block 1 - round 9 - - aese v3.16b, v27.16b //AES block 3 - round 9 - b.ge .L128_enc_tail //handle tail - - ldp x6, x7, [x0, #0] //AES block 0 - load plaintext -#ifdef __AARCH64EB__ - rev x6, x6 - rev x7, x7 -#endif - ldp x21, x22, [x0, #32] //AES block 2 - load plaintext -#ifdef __AARCH64EB__ - rev x21, x21 - rev x22, x22 -#endif - ldp x19, x20, [x0, #16] //AES block 1 - load plaintext -#ifdef __AARCH64EB__ - rev x19, x19 - rev x20, x20 -#endif - ldp x23, x24, [x0, #48] //AES block 3 - load plaintext -#ifdef __AARCH64EB__ - rev x23, x23 - rev x24, x24 -#endif - eor x6, x6, x13 //AES block 0 - round 10 low - eor x7, x7, x14 //AES block 0 - round 10 high - - eor x21, x21, x13 //AES block 2 - round 10 low - fmov d4, x6 //AES block 0 - mov low - - eor x19, x19, x13 //AES block 1 - round 10 low - eor x22, x22, x14 //AES block 2 - round 10 high - fmov v4.d[1], x7 //AES block 0 - mov high - - fmov d5, x19 //AES block 1 - mov low - eor x20, x20, x14 //AES block 1 - round 10 high - - eor x23, x23, x13 //AES block 3 - round 10 low - fmov v5.d[1], x20 //AES block 1 - mov high - - fmov d6, x21 //AES block 2 - mov low - eor x24, x24, x14 //AES block 3 - round 10 high - rev w9, w12 //CTR block 4 - - fmov v6.d[1], x22 //AES block 2 - mov high - orr x9, x11, x9, lsl #32 //CTR block 4 - - eor v4.16b, v4.16b, v0.16b //AES block 0 - result - fmov d0, x10 //CTR block 4 - add w12, w12, #1 //CTR block 4 - - fmov v0.d[1], x9 //CTR block 4 - rev w9, w12 //CTR block 5 - - eor v5.16b, v5.16b, v1.16b //AES block 1 - result - fmov d1, x10 //CTR block 5 - orr x9, x11, x9, lsl #32 //CTR block 5 - - add w12, w12, #1 //CTR block 5 - add x0, x0, #64 //AES input_ptr update - fmov v1.d[1], x9 //CTR block 5 - - fmov d7, x23 //AES block 3 - mov low - rev w9, w12 //CTR block 6 - st1 { v4.16b}, [x2], #16 //AES block 0 - store result - - fmov v7.d[1], x24 //AES block 3 - mov high - orr x9, x11, x9, lsl #32 //CTR block 6 - - add w12, w12, #1 //CTR block 6 - eor v6.16b, v6.16b, v2.16b //AES block 2 - result - st1 { v5.16b}, [x2], #16 //AES block 1 - store result - - fmov d2, x10 //CTR block 6 - cmp x0, x5 //check if we have <= 8 blocks - - fmov v2.d[1], x9 //CTR block 6 - rev w9, w12 //CTR block 7 - st1 { v6.16b}, [x2], #16 //AES block 2 - store result - - orr x9, x11, x9, lsl #32 //CTR block 7 - - eor v7.16b, v7.16b, v3.16b //AES block 3 - result - st1 { v7.16b}, [x2], #16 //AES block 3 - store result - b.ge .L128_enc_prepretail //do prepretail - -.L128_enc_main_loop: //main loop start - ldp x23, x24, [x0, #48] //AES block 4k+3 - load plaintext -#ifdef __AARCH64EB__ - rev x23, x23 - rev x24, x24 -#endif - rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) - rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) - - aese v2.16b, v18.16b - aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 - fmov d3, x10 //CTR block 4k+3 - - ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 - rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free) - - aese v1.16b, v18.16b - aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 - add w12, w12, #1 //CTR block 4k+3 - fmov v3.d[1], x9 //CTR block 4k+3 - - aese v0.16b, v18.16b - aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 - mov d31, v6.d[1] //GHASH block 4k+2 - mid - - aese v2.16b, v19.16b - aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 - mov d30, v5.d[1] //GHASH block 4k+1 - mid - - aese v1.16b, v19.16b - aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 - eor v4.16b, v4.16b, v11.16b //PRE 1 - - aese v3.16b, v18.16b - aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 - eor x24, x24, x14 //AES block 4k+3 - round 10 high - - pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high - eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid - ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext -#ifdef __AARCH64EB__ - rev x6, x6 - rev x7, x7 -#endif - aese v0.16b, v19.16b - aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 - rev w9, w12 //CTR block 4k+8 - - eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid - mov d8, v4.d[1] //GHASH block 4k - mid - orr x9, x11, x9, lsl #32 //CTR block 4k+8 - - pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high - add w12, w12, #1 //CTR block 4k+8 - mov d10, v17.d[1] //GHASH block 4k - mid - - aese v0.16b, v20.16b - aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 - - pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low - eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid - - aese v1.16b, v20.16b - aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 - - aese v0.16b, v21.16b - aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 - eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high - - pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low - - pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid - rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) - - pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid - - pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low - ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid - - pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high - eor x7, x7, x14 //AES block 4k+4 - round 10 high - - eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid - mov d30, v7.d[1] //GHASH block 4k+3 - mid - - aese v3.16b, v19.16b - aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 - eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low - - aese v2.16b, v20.16b - aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 - eor x6, x6, x13 //AES block 4k+4 - round 10 low - - aese v1.16b, v21.16b - aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 - eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid - - pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high - - aese v2.16b, v21.16b - aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 - eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high - - pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid - - pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low - movi v8.8b, #0xc2 - - pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid - eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low - - aese v1.16b, v22.16b - aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 - - aese v3.16b, v20.16b - aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 - shl d8, d8, #56 //mod_constant - - aese v0.16b, v22.16b - aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 - eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high - - aese v1.16b, v23.16b - aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 - ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext -#ifdef __AARCH64EB__ - rev x19, x19 - rev x20, x20 -#endif - aese v3.16b, v21.16b - aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 - eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid - - aese v0.16b, v23.16b - aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 - ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext -#ifdef __AARCH64EB__ - rev x21, x21 - rev x22, x22 -#endif - pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid - eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low - - aese v2.16b, v22.16b - aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 - eor x19, x19, x13 //AES block 4k+5 - round 10 low - - aese v3.16b, v22.16b - aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 - eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid - - aese v1.16b, v24.16b - aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 - eor x23, x23, x13 //AES block 4k+3 - round 10 low - - aese v2.16b, v23.16b - aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 - eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up - - fmov d4, x6 //AES block 4k+4 - mov low - aese v0.16b, v24.16b - aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 - fmov v4.d[1], x7 //AES block 4k+4 - mov high - - add x0, x0, #64 //AES input_ptr update - fmov d7, x23 //AES block 4k+3 - mov low - ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment - - aese v3.16b, v23.16b - aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 - fmov d5, x19 //AES block 4k+5 - mov low - - aese v0.16b, v25.16b - aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 - eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up - - aese v2.16b, v24.16b - aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 - eor x20, x20, x14 //AES block 4k+5 - round 10 high - - aese v1.16b, v25.16b - aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 - fmov v5.d[1], x20 //AES block 4k+5 - mov high - - aese v0.16b, v26.16b - aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 - fmov v7.d[1], x24 //AES block 4k+3 - mov high - - aese v3.16b, v24.16b - aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 - cmp x0, x5 //.LOOP CONTROL - - aese v1.16b, v26.16b - aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 - eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid - - aese v0.16b, v27.16b //AES block 4k+4 - round 9 - eor x21, x21, x13 //AES block 4k+6 - round 10 low - eor x22, x22, x14 //AES block 4k+6 - round 10 high - - aese v3.16b, v25.16b - aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 - fmov d6, x21 //AES block 4k+6 - mov low - - aese v1.16b, v27.16b //AES block 4k+5 - round 9 - fmov v6.d[1], x22 //AES block 4k+6 - mov high - - aese v2.16b, v25.16b - aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 - eor v4.16b, v4.16b, v0.16b //AES block 4k+4 - result - - fmov d0, x10 //CTR block 4k+8 - aese v3.16b, v26.16b - aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 - - fmov v0.d[1], x9 //CTR block 4k+8 - rev w9, w12 //CTR block 4k+9 - eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid - - aese v2.16b, v26.16b - aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 - eor v5.16b, v5.16b, v1.16b //AES block 4k+5 - result - - add w12, w12, #1 //CTR block 4k+9 - orr x9, x11, x9, lsl #32 //CTR block 4k+9 - fmov d1, x10 //CTR block 4k+9 - - pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low - fmov v1.d[1], x9 //CTR block 4k+9 - rev w9, w12 //CTR block 4k+10 - - aese v2.16b, v27.16b //AES block 4k+6 - round 9 - st1 { v4.16b}, [x2], #16 //AES block 4k+4 - store result - eor v6.16b, v6.16b, v2.16b //AES block 4k+6 - result - orr x9, x11, x9, lsl #32 //CTR block 4k+10 - - aese v3.16b, v27.16b //AES block 4k+7 - round 9 - add w12, w12, #1 //CTR block 4k+10 - ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment - fmov d2, x10 //CTR block 4k+10 - - eor v11.16b, v11.16b, v9.16b //MODULO - fold into low - st1 { v5.16b}, [x2], #16 //AES block 4k+5 - store result - - fmov v2.d[1], x9 //CTR block 4k+10 - st1 { v6.16b}, [x2], #16 //AES block 4k+6 - store result - rev w9, w12 //CTR block 4k+11 - - orr x9, x11, x9, lsl #32 //CTR block 4k+11 - eor v7.16b, v7.16b, v3.16b //AES block 4k+3 - result - - eor v11.16b, v11.16b, v10.16b //MODULO - fold into low - st1 { v7.16b}, [x2], #16 //AES block 4k+3 - store result - b.lt .L128_enc_main_loop - -.L128_enc_prepretail: //PREPRETAIL - rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) - fmov d3, x10 //CTR block 4k+3 - rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free) - - ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 - add w12, w12, #1 //CTR block 4k+3 - fmov v3.d[1], x9 //CTR block 4k+3 - - aese v1.16b, v18.16b - aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 - rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) - - pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low - - rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) - eor v4.16b, v4.16b, v11.16b //PRE 1 - - pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high - - aese v3.16b, v18.16b - aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 - mov d30, v5.d[1] //GHASH block 4k+1 - mid - - pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low - mov d8, v4.d[1] //GHASH block 4k - mid - - mov d31, v6.d[1] //GHASH block 4k+2 - mid - mov d10, v17.d[1] //GHASH block 4k - mid - - aese v1.16b, v19.16b - aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 - eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid - - eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid - - pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high - eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid - - aese v3.16b, v19.16b - aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 - - pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid - eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low - - pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid - - aese v0.16b, v18.16b - aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 - ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid - - aese v2.16b, v18.16b - aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 - - eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid - mov d30, v7.d[1] //GHASH block 4k+3 - mid - - aese v0.16b, v19.16b - aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 - eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high - - pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid - - pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high - eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid - - pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high - - pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low - - aese v2.16b, v19.16b - aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 - eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high - - aese v0.16b, v20.16b - aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 - - pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low - movi v8.8b, #0xc2 - - aese v2.16b, v20.16b - aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 - eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low - - aese v3.16b, v20.16b - aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 - - pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid - eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid - - aese v2.16b, v21.16b - aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 - - aese v1.16b, v20.16b - aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 - eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high - - aese v0.16b, v21.16b - aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 - - eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid - shl d8, d8, #56 //mod_constant - - aese v1.16b, v21.16b - aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 - eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low - - aese v0.16b, v22.16b - aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 - - pmull v28.1q, v9.1d, v8.1d - eor v10.16b, v10.16b, v9.16b //karatsuba tidy up - - aese v1.16b, v22.16b - aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 - - aese v0.16b, v23.16b - aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 - ext v9.16b, v9.16b, v9.16b, #8 - - aese v3.16b, v21.16b - aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 - - aese v2.16b, v22.16b - aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 - eor v10.16b, v10.16b, v11.16b - - aese v0.16b, v24.16b - aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 - - aese v3.16b, v22.16b - aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 - - aese v1.16b, v23.16b - aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 - - aese v2.16b, v23.16b - aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 - eor v10.16b, v10.16b, v28.16b - - aese v3.16b, v23.16b - aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 - - aese v1.16b, v24.16b - aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 - - aese v2.16b, v24.16b - aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 - - aese v3.16b, v24.16b - aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 - eor v10.16b, v10.16b, v9.16b - - aese v0.16b, v25.16b - aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 - - aese v2.16b, v25.16b - aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 - - aese v3.16b, v25.16b - aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 - - pmull v28.1q, v10.1d, v8.1d - - aese v1.16b, v25.16b - aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 - ext v10.16b, v10.16b, v10.16b, #8 - - aese v3.16b, v26.16b - aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 - - aese v0.16b, v26.16b - aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 - eor v11.16b, v11.16b, v28.16b - - aese v1.16b, v26.16b - aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 - - aese v3.16b, v27.16b //AES block 4k+7 - round 9 - - aese v2.16b, v26.16b - aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 - - aese v0.16b, v27.16b //AES block 4k+4 - round 9 - - aese v1.16b, v27.16b //AES block 4k+5 - round 9 - eor v11.16b, v11.16b, v10.16b - - aese v2.16b, v27.16b //AES block 4k+6 - round 9 -.L128_enc_tail: //TAIL - - sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process - ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext -#ifdef __AARCH64EB__ - rev x6, x6 - rev x7, x7 -#endif - cmp x5, #48 - - ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag - eor x6, x6, x13 //AES block 4k+4 - round 10 low - eor x7, x7, x14 //AES block 4k+4 - round 10 high - - fmov d4, x6 //AES block 4k+4 - mov low - - fmov v4.d[1], x7 //AES block 4k+4 - mov high - - eor v5.16b, v4.16b, v0.16b //AES block 4k+4 - result - - b.gt .L128_enc_blocks_more_than_3 - - sub w12, w12, #1 - movi v11.8b, #0 - mov v3.16b, v2.16b - - cmp x5, #32 - mov v2.16b, v1.16b - movi v9.8b, #0 - - movi v10.8b, #0 - b.gt .L128_enc_blocks_more_than_2 - - mov v3.16b, v1.16b - cmp x5, #16 - - sub w12, w12, #1 - b.gt .L128_enc_blocks_more_than_1 - - sub w12, w12, #1 - b .L128_enc_blocks_less_than_1 -.L128_enc_blocks_more_than_3: //blocks left > 3 - st1 { v5.16b}, [x2], #16 //AES final-3 block - store result - - ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high *** 310635 LINES SKIPPED ***
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202310252006.39PK6Tqu040031>