Date: Wed, 25 Oct 2023 20:06:28 GMT From: Ed Maste <emaste@FreeBSD.org> To: src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-branches@FreeBSD.org Subject: git: 4f3a6a07112b - releng/14.0 - ossl: Update the generated assembly files from OpenSSL 3.0. Message-ID: <202310252006.39PK6SwI039977@gitrepo.freebsd.org>
next in thread | raw e-mail | index | archive | help
The branch releng/14.0 has been updated by emaste: URL: https://cgit.FreeBSD.org/src/commit/?id=4f3a6a07112b4f4f3a04ee9a7de01598c0b1d30f commit 4f3a6a07112b4f4f3a04ee9a7de01598c0b1d30f Author: John Baldwin <jhb@FreeBSD.org> AuthorDate: 2023-08-29 21:44:15 +0000 Commit: Ed Maste <emaste@FreeBSD.org> CommitDate: 2023-10-25 19:56:23 +0000 ossl: Update the generated assembly files from OpenSSL 3.0. Tested with: cryptocheck -d ossl0 -a all -z on amd64 Reviewed by: markj Differential Revision: https://reviews.freebsd.org/D41568 (cherry picked from commit c0855eaa3ee9614804b6bd6a255aa9f71e095f43) (cherry picked from commit f0d83d53c3be75ffc7711ba8171af9b934459810) Approved by: re (gjb) --- sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S | 6390 +++++++++++++++++++ sys/crypto/openssl/aarch64/aesv8-armx.S | 3014 ++++++++- sys/crypto/openssl/aarch64/arm64cpuid.S | 7 + sys/crypto/openssl/aarch64/armv8-mont.S | 732 ++- sys/crypto/openssl/aarch64/chacha-armv8.S | 1553 ++--- sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S | 8 +- sys/crypto/openssl/aarch64/ghashv8-armx.S | 1 + sys/crypto/openssl/aarch64/keccak1600-armv8.S | 190 +- sys/crypto/openssl/aarch64/poly1305-armv8.S | 31 +- sys/crypto/openssl/aarch64/sha1-armv8.S | 54 +- sys/crypto/openssl/aarch64/sha256-armv8.S | 28 +- sys/crypto/openssl/aarch64/sha512-armv8.S | 28 +- sys/crypto/openssl/aarch64/vpaes-armv8.S | 276 +- sys/crypto/openssl/amd64/aes-x86_64.S | 2680 ++++++++ sys/crypto/openssl/amd64/aesni-gcm-x86_64.S | 21 + sys/crypto/openssl/amd64/aesni-mb-x86_64.S | 102 + sys/crypto/openssl/amd64/aesni-sha1-x86_64.S | 21 + sys/crypto/openssl/amd64/aesni-sha256-x86_64.S | 21 + sys/crypto/openssl/amd64/aesni-x86_64.S | 32 + sys/crypto/openssl/amd64/bsaes-x86_64.S | 2619 ++++++++ sys/crypto/openssl/amd64/chacha-x86_64.S | 21 + sys/crypto/openssl/amd64/cmll-x86_64.S | 22 + sys/crypto/openssl/amd64/e_padlock-x86_64.S | 21 + sys/crypto/openssl/amd64/ecp_nistz256-x86_64.S | 21 + sys/crypto/openssl/amd64/ghash-x86_64.S | 27 + sys/crypto/openssl/amd64/keccak1600-x86_64.S | 21 + sys/crypto/openssl/amd64/md5-x86_64.S | 29 +- sys/crypto/openssl/amd64/poly1305-x86_64.S | 21 + sys/crypto/openssl/amd64/rc4-md5-x86_64.S | 21 + sys/crypto/openssl/amd64/rc4-x86_64.S | 24 + sys/crypto/openssl/amd64/rsaz-avx2.S | 21 + sys/crypto/openssl/amd64/rsaz-avx512.S | 902 +++ sys/crypto/openssl/amd64/rsaz-x86_64.S | 21 + sys/crypto/openssl/amd64/sha1-mb-x86_64.S | 57 + sys/crypto/openssl/amd64/sha1-x86_64.S | 21 + sys/crypto/openssl/amd64/sha256-mb-x86_64.S | 57 + sys/crypto/openssl/amd64/sha256-x86_64.S | 21 + sys/crypto/openssl/amd64/sha512-x86_64.S | 21 + sys/crypto/openssl/amd64/vpaes-x86_64.S | 26 + sys/crypto/openssl/amd64/wp-x86_64.S | 21 + sys/crypto/openssl/amd64/x25519-x86_64.S | 21 + sys/crypto/openssl/amd64/x86_64-gf2m.S | 21 + sys/crypto/openssl/amd64/x86_64-mont.S | 21 + sys/crypto/openssl/amd64/x86_64-mont5.S | 21 + sys/crypto/openssl/amd64/x86_64cpuid.S | 49 + sys/crypto/openssl/arm/aes-armv4.S | 7 +- sys/crypto/openssl/arm/aesv8-armx.S | 776 ++- sys/crypto/openssl/arm/armv4-gf2m.S | 13 +- sys/crypto/openssl/arm/armv4-mont.S | 17 +- sys/crypto/openssl/arm/armv4cpuid.S | 3 +- sys/crypto/openssl/arm/bsaes-armv7.S | 47 +- sys/crypto/openssl/arm/chacha-armv4.S | 11 +- sys/crypto/openssl/arm/ecp_nistz256-armv4.S | 4 +- sys/crypto/openssl/arm/ghash-armv4.S | 3 +- sys/crypto/openssl/arm/ghashv8-armx.S | 64 +- sys/crypto/openssl/arm/keccak1600-armv4.S | 34 +- sys/crypto/openssl/arm/poly1305-armv4.S | 37 +- sys/crypto/openssl/arm/sha1-armv4-large.S | 15 +- sys/crypto/openssl/arm/sha256-armv4.S | 17 +- sys/crypto/openssl/arm/sha512-armv4.S | 15 +- sys/crypto/openssl/i386/aes-586.S | 6644 ++++++++++++++++++++ sys/crypto/openssl/i386/aesni-x86.S | 254 + sys/crypto/openssl/i386/bf-586.S | 134 + sys/crypto/openssl/i386/bn-586.S | 104 + sys/crypto/openssl/i386/cast-586.S | 134 + sys/crypto/openssl/i386/chacha-x86.S | 64 + sys/crypto/openssl/i386/cmll-x86.S | 144 + sys/crypto/openssl/i386/co-586.S | 74 + sys/crypto/openssl/i386/crypt586.S | 44 + sys/crypto/openssl/i386/des-586.S | 254 + sys/crypto/openssl/i386/e_padlock-x86.S | 214 + sys/crypto/openssl/i386/ecp_nistz256-x86.S | 254 + sys/crypto/openssl/i386/ghash-x86.S | 104 + sys/crypto/openssl/i386/md5-586.S | 64 +- sys/crypto/openssl/i386/poly1305-x86.S | 114 + sys/crypto/openssl/i386/rc4-586.S | 64 + sys/crypto/openssl/i386/rc5-586.S | 134 + sys/crypto/openssl/i386/rmd-586.S | 44 + sys/crypto/openssl/i386/sha1-586.S | 74 + sys/crypto/openssl/i386/sha256-586.S | 44 + sys/crypto/openssl/i386/sha512-586.S | 44 + sys/crypto/openssl/i386/vpaes-x86.S | 164 + sys/crypto/openssl/i386/wp-mmx.S | 44 + sys/crypto/openssl/i386/x86-gf2m.S | 64 + sys/crypto/openssl/i386/x86-mont.S | 44 + sys/crypto/openssl/i386/x86cpuid.S | 154 + sys/crypto/openssl/powerpc/bn-ppc.S | 1855 ++++++ sys/crypto/openssl/powerpc/poly1305-ppc.S | 1091 +++- sys/crypto/openssl/powerpc/vpaes-ppc.S | 14 +- sys/crypto/openssl/powerpc64/bn-ppc.S | 1876 ++++++ sys/crypto/openssl/powerpc64/ecp_nistp521-ppc64.S | 354 ++ sys/crypto/openssl/powerpc64/keccak1600-ppc64.S | 32 +- sys/crypto/openssl/powerpc64/poly1305-ppc.S | 1011 ++- sys/crypto/openssl/powerpc64/vpaes-ppc.S | 14 +- sys/crypto/openssl/powerpc64le/bn-ppc.S | 1876 ++++++ .../openssl/powerpc64le/ecp_nistp521-ppc64.S | 354 ++ sys/crypto/openssl/powerpc64le/keccak1600-ppc64.S | 32 +- sys/crypto/openssl/powerpc64le/poly1305-ppc.S | 1002 ++- sys/crypto/openssl/powerpc64le/vpaes-ppc.S | 14 +- 99 files changed, 37489 insertions(+), 1910 deletions(-) diff --git a/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S b/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S new file mode 100644 index 000000000000..eb85dbc9f996 --- /dev/null +++ b/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S @@ -0,0 +1,6390 @@ +/* Do not modify. This file is auto-generated from aes-gcm-armv8_64.pl. */ +#include "arm_arch.h" + +#if __ARM_MAX_ARCH__>=8 +.arch armv8-a+crypto +.text +.globl aes_gcm_enc_128_kernel +.type aes_gcm_enc_128_kernel,%function +.align 4 +aes_gcm_enc_128_kernel: + cbz x1, .L128_enc_ret + stp x19, x20, [sp, #-112]! + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp d8, d9, [sp, #48] + stp d10, d11, [sp, #64] + stp d12, d13, [sp, #80] + stp d14, d15, [sp, #96] + + ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 +#ifdef __AARCH64EB__ + rev x10, x10 + rev x11, x11 +#endif + ldp x13, x14, [x8, #160] //load rk10 +#ifdef __AARCH64EB__ + ror x13, x13, #32 + ror x14, x14, #32 +#endif + ld1 {v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + lsr x5, x1, #3 //byte_len + mov x15, x5 + + ld1 {v18.4s}, [x8], #16 //load rk0 + add x4, x0, x1, lsr #3 //end_input_ptr + sub x5, x5, #1 //byte_len - 1 + + lsr x12, x11, #32 + ldr q15, [x3, #112] //load h4l | h4h +#ifndef __AARCH64EB__ + ext v15.16b, v15.16b, v15.16b, #8 +#endif + fmov d1, x10 //CTR block 1 + rev w12, w12 //rev_ctr32 + + add w12, w12, #1 //increment rev_ctr32 + orr w11, w11, w11 + ld1 {v19.4s}, [x8], #16 //load rk1 + + rev w9, w12 //CTR block 1 + add w12, w12, #1 //CTR block 1 + fmov d3, x10 //CTR block 3 + + orr x9, x11, x9, lsl #32 //CTR block 1 + ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible + + fmov v1.d[1], x9 //CTR block 1 + rev w9, w12 //CTR block 2 + + fmov d2, x10 //CTR block 2 + orr x9, x11, x9, lsl #32 //CTR block 2 + add w12, w12, #1 //CTR block 2 + + fmov v2.d[1], x9 //CTR block 2 + rev w9, w12 //CTR block 3 + + orr x9, x11, x9, lsl #32 //CTR block 3 + ld1 {v20.4s}, [x8], #16 //load rk2 + + add w12, w12, #1 //CTR block 3 + fmov v3.d[1], x9 //CTR block 3 + + ldr q14, [x3, #80] //load h3l | h3h +#ifndef __AARCH64EB__ + ext v14.16b, v14.16b, v14.16b, #8 +#endif + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 1 - round 0 + ld1 {v21.4s}, [x8], #16 //load rk3 + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 2 - round 0 + ldr q12, [x3, #32] //load h1l | h1h +#ifndef __AARCH64EB__ + ext v12.16b, v12.16b, v12.16b, #8 +#endif + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 0 - round 0 + ld1 {v22.4s}, [x8], #16 //load rk4 + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 3 - round 0 + ld1 {v23.4s}, [x8], #16 //load rk5 + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 2 - round 1 + trn2 v17.2d, v14.2d, v15.2d //h4l | h3l + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 0 - round 1 + ld1 {v24.4s}, [x8], #16 //load rk6 + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 1 - round 1 + ld1 {v25.4s}, [x8], #16 //load rk7 + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 3 - round 1 + trn1 v9.2d, v14.2d, v15.2d //h4h | h3h + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 0 - round 2 + ld1 {v26.4s}, [x8], #16 //load rk8 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 1 - round 2 + ldr q13, [x3, #64] //load h2l | h2h +#ifndef __AARCH64EB__ + ext v13.16b, v13.16b, v13.16b, #8 +#endif + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 3 - round 2 + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 2 - round 2 + eor v17.16b, v17.16b, v9.16b //h4k | h3k + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 0 - round 3 + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 1 - round 3 + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 2 - round 3 + ld1 {v27.4s}, [x8], #16 //load rk9 + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 3 - round 3 + + and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + trn2 v16.2d, v12.2d, v13.2d //h2l | h1l + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 3 - round 4 + add x5, x5, x0 + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 2 - round 4 + cmp x0, x5 //check if we have <= 4 blocks + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 0 - round 4 + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 3 - round 5 + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 2 - round 5 + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 0 - round 5 + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 3 - round 6 + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 1 - round 4 + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 2 - round 6 + trn1 v8.2d, v12.2d, v13.2d //h2h | h1h + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 0 - round 6 + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 1 - round 5 + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 3 - round 7 + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 0 - round 7 + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 1 - round 6 + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 2 - round 7 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 8 + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 1 - round 7 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 8 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 8 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 8 + + aese v2.16b, v27.16b //AES block 2 - round 9 + + aese v0.16b, v27.16b //AES block 0 - round 9 + + eor v16.16b, v16.16b, v8.16b //h2k | h1k + + aese v1.16b, v27.16b //AES block 1 - round 9 + + aese v3.16b, v27.16b //AES block 3 - round 9 + b.ge .L128_enc_tail //handle tail + + ldp x6, x7, [x0, #0] //AES block 0 - load plaintext +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + ldp x21, x22, [x0, #32] //AES block 2 - load plaintext +#ifdef __AARCH64EB__ + rev x21, x21 + rev x22, x22 +#endif + ldp x19, x20, [x0, #16] //AES block 1 - load plaintext +#ifdef __AARCH64EB__ + rev x19, x19 + rev x20, x20 +#endif + ldp x23, x24, [x0, #48] //AES block 3 - load plaintext +#ifdef __AARCH64EB__ + rev x23, x23 + rev x24, x24 +#endif + eor x6, x6, x13 //AES block 0 - round 10 low + eor x7, x7, x14 //AES block 0 - round 10 high + + eor x21, x21, x13 //AES block 2 - round 10 low + fmov d4, x6 //AES block 0 - mov low + + eor x19, x19, x13 //AES block 1 - round 10 low + eor x22, x22, x14 //AES block 2 - round 10 high + fmov v4.d[1], x7 //AES block 0 - mov high + + fmov d5, x19 //AES block 1 - mov low + eor x20, x20, x14 //AES block 1 - round 10 high + + eor x23, x23, x13 //AES block 3 - round 10 low + fmov v5.d[1], x20 //AES block 1 - mov high + + fmov d6, x21 //AES block 2 - mov low + eor x24, x24, x14 //AES block 3 - round 10 high + rev w9, w12 //CTR block 4 + + fmov v6.d[1], x22 //AES block 2 - mov high + orr x9, x11, x9, lsl #32 //CTR block 4 + + eor v4.16b, v4.16b, v0.16b //AES block 0 - result + fmov d0, x10 //CTR block 4 + add w12, w12, #1 //CTR block 4 + + fmov v0.d[1], x9 //CTR block 4 + rev w9, w12 //CTR block 5 + + eor v5.16b, v5.16b, v1.16b //AES block 1 - result + fmov d1, x10 //CTR block 5 + orr x9, x11, x9, lsl #32 //CTR block 5 + + add w12, w12, #1 //CTR block 5 + add x0, x0, #64 //AES input_ptr update + fmov v1.d[1], x9 //CTR block 5 + + fmov d7, x23 //AES block 3 - mov low + rev w9, w12 //CTR block 6 + st1 { v4.16b}, [x2], #16 //AES block 0 - store result + + fmov v7.d[1], x24 //AES block 3 - mov high + orr x9, x11, x9, lsl #32 //CTR block 6 + + add w12, w12, #1 //CTR block 6 + eor v6.16b, v6.16b, v2.16b //AES block 2 - result + st1 { v5.16b}, [x2], #16 //AES block 1 - store result + + fmov d2, x10 //CTR block 6 + cmp x0, x5 //check if we have <= 8 blocks + + fmov v2.d[1], x9 //CTR block 6 + rev w9, w12 //CTR block 7 + st1 { v6.16b}, [x2], #16 //AES block 2 - store result + + orr x9, x11, x9, lsl #32 //CTR block 7 + + eor v7.16b, v7.16b, v3.16b //AES block 3 - result + st1 { v7.16b}, [x2], #16 //AES block 3 - store result + b.ge .L128_enc_prepretail //do prepretail + +.L128_enc_main_loop: //main loop start + ldp x23, x24, [x0, #48] //AES block 4k+3 - load plaintext +#ifdef __AARCH64EB__ + rev x23, x23 + rev x24, x24 +#endif + rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) + rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 + fmov d3, x10 //CTR block 4k+3 + + ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 + rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free) + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 + add w12, w12, #1 //CTR block 4k+3 + fmov v3.d[1], x9 //CTR block 4k+3 + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 + mov d31, v6.d[1] //GHASH block 4k+2 - mid + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 + mov d30, v5.d[1] //GHASH block 4k+1 - mid + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 + eor v4.16b, v4.16b, v11.16b //PRE 1 + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 + eor x24, x24, x14 //AES block 4k+3 - round 10 high + + pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high + eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid + ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 + rev w9, w12 //CTR block 4k+8 + + eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid + mov d8, v4.d[1] //GHASH block 4k - mid + orr x9, x11, x9, lsl #32 //CTR block 4k+8 + + pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high + add w12, w12, #1 //CTR block 4k+8 + mov d10, v17.d[1] //GHASH block 4k - mid + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 + + pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low + eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 + eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high + + pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low + + pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid + rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) + + pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid + + pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low + ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid + + pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high + eor x7, x7, x14 //AES block 4k+4 - round 10 high + + eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid + mov d30, v7.d[1] //GHASH block 4k+3 - mid + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 + eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 + eor x6, x6, x13 //AES block 4k+4 - round 10 low + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 + eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid + + pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 + eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high + + pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid + + pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low + movi v8.8b, #0xc2 + + pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid + eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 + shl d8, d8, #56 //mod_constant + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 + eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 + ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext +#ifdef __AARCH64EB__ + rev x19, x19 + rev x20, x20 +#endif + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 + eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 + ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext +#ifdef __AARCH64EB__ + rev x21, x21 + rev x22, x22 +#endif + pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 + eor x19, x19, x13 //AES block 4k+5 - round 10 low + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 + eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 + eor x23, x23, x13 //AES block 4k+3 - round 10 low + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 + eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + fmov d4, x6 //AES block 4k+4 - mov low + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 + fmov v4.d[1], x7 //AES block 4k+4 - mov high + + add x0, x0, #64 //AES input_ptr update + fmov d7, x23 //AES block 4k+3 - mov low + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 + fmov d5, x19 //AES block 4k+5 - mov low + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 + eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 + eor x20, x20, x14 //AES block 4k+5 - round 10 high + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 + fmov v5.d[1], x20 //AES block 4k+5 - mov high + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 + fmov v7.d[1], x24 //AES block 4k+3 - mov high + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 + cmp x0, x5 //.LOOP CONTROL + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 + eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid + + aese v0.16b, v27.16b //AES block 4k+4 - round 9 + eor x21, x21, x13 //AES block 4k+6 - round 10 low + eor x22, x22, x14 //AES block 4k+6 - round 10 high + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 + fmov d6, x21 //AES block 4k+6 - mov low + + aese v1.16b, v27.16b //AES block 4k+5 - round 9 + fmov v6.d[1], x22 //AES block 4k+6 - mov high + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 + eor v4.16b, v4.16b, v0.16b //AES block 4k+4 - result + + fmov d0, x10 //CTR block 4k+8 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 + + fmov v0.d[1], x9 //CTR block 4k+8 + rev w9, w12 //CTR block 4k+9 + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 + eor v5.16b, v5.16b, v1.16b //AES block 4k+5 - result + + add w12, w12, #1 //CTR block 4k+9 + orr x9, x11, x9, lsl #32 //CTR block 4k+9 + fmov d1, x10 //CTR block 4k+9 + + pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + fmov v1.d[1], x9 //CTR block 4k+9 + rev w9, w12 //CTR block 4k+10 + + aese v2.16b, v27.16b //AES block 4k+6 - round 9 + st1 { v4.16b}, [x2], #16 //AES block 4k+4 - store result + eor v6.16b, v6.16b, v2.16b //AES block 4k+6 - result + orr x9, x11, x9, lsl #32 //CTR block 4k+10 + + aese v3.16b, v27.16b //AES block 4k+7 - round 9 + add w12, w12, #1 //CTR block 4k+10 + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + fmov d2, x10 //CTR block 4k+10 + + eor v11.16b, v11.16b, v9.16b //MODULO - fold into low + st1 { v5.16b}, [x2], #16 //AES block 4k+5 - store result + + fmov v2.d[1], x9 //CTR block 4k+10 + st1 { v6.16b}, [x2], #16 //AES block 4k+6 - store result + rev w9, w12 //CTR block 4k+11 + + orr x9, x11, x9, lsl #32 //CTR block 4k+11 + eor v7.16b, v7.16b, v3.16b //AES block 4k+3 - result + + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low + st1 { v7.16b}, [x2], #16 //AES block 4k+3 - store result + b.lt .L128_enc_main_loop + +.L128_enc_prepretail: //PREPRETAIL + rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) + fmov d3, x10 //CTR block 4k+3 + rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free) + + ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 + add w12, w12, #1 //CTR block 4k+3 + fmov v3.d[1], x9 //CTR block 4k+3 + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 + rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) + + pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low + + rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) + eor v4.16b, v4.16b, v11.16b //PRE 1 + + pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 + mov d30, v5.d[1] //GHASH block 4k+1 - mid + + pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low + mov d8, v4.d[1] //GHASH block 4k - mid + + mov d31, v6.d[1] //GHASH block 4k+2 - mid + mov d10, v17.d[1] //GHASH block 4k - mid + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 + eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid + + eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid + + pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high + eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 + + pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid + eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low + + pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 + ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 + + eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid + mov d30, v7.d[1] //GHASH block 4k+3 - mid + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 + eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high + + pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid + + pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high + eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid + + pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high + + pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 + eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 + + pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low + movi v8.8b, #0xc2 + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 + eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 + + pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid + eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 + eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 + + eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid + shl d8, d8, #56 //mod_constant + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 + eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 + + pmull v28.1q, v9.1d, v8.1d + eor v10.16b, v10.16b, v9.16b //karatsuba tidy up + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 + ext v9.16b, v9.16b, v9.16b, #8 + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 + eor v10.16b, v10.16b, v11.16b + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 + eor v10.16b, v10.16b, v28.16b + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 + eor v10.16b, v10.16b, v9.16b + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 + + pmull v28.1q, v10.1d, v8.1d + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 + ext v10.16b, v10.16b, v10.16b, #8 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 + eor v11.16b, v11.16b, v28.16b + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 + + aese v3.16b, v27.16b //AES block 4k+7 - round 9 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 + + aese v0.16b, v27.16b //AES block 4k+4 - round 9 + + aese v1.16b, v27.16b //AES block 4k+5 - round 9 + eor v11.16b, v11.16b, v10.16b + + aese v2.16b, v27.16b //AES block 4k+6 - round 9 +.L128_enc_tail: //TAIL + + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + cmp x5, #48 + + ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag + eor x6, x6, x13 //AES block 4k+4 - round 10 low + eor x7, x7, x14 //AES block 4k+4 - round 10 high + + fmov d4, x6 //AES block 4k+4 - mov low + + fmov v4.d[1], x7 //AES block 4k+4 - mov high + + eor v5.16b, v4.16b, v0.16b //AES block 4k+4 - result + + b.gt .L128_enc_blocks_more_than_3 + + sub w12, w12, #1 + movi v11.8b, #0 + mov v3.16b, v2.16b + + cmp x5, #32 + mov v2.16b, v1.16b + movi v9.8b, #0 + + movi v10.8b, #0 + b.gt .L128_enc_blocks_more_than_2 + + mov v3.16b, v1.16b + cmp x5, #16 + + sub w12, w12, #1 + b.gt .L128_enc_blocks_more_than_1 + + sub w12, w12, #1 + b .L128_enc_blocks_less_than_1 +.L128_enc_blocks_more_than_3: //blocks left > 3 + st1 { v5.16b}, [x2], #16 //AES final-3 block - store result + + ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + rev64 v4.16b, v5.16b //GHASH final-3 block + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + eor x7, x7, x14 //AES final-2 block - round 10 high + eor x6, x6, x13 //AES final-2 block - round 10 low + + fmov d5, x6 //AES final-2 block - mov low + + movi v8.8b, #0 //suppress further partial tag feed in + fmov v5.d[1], x7 //AES final-2 block - mov high + + pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low + mov d22, v4.d[1] //GHASH final-3 block - mid + + pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high + + mov d10, v17.d[1] //GHASH final-3 block - mid + + eor v5.16b, v5.16b, v1.16b //AES final-2 block - result + eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid + + pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid +.L128_enc_blocks_more_than_2: //blocks left > 2 + + st1 { v5.16b}, [x2], #16 //AES final-2 block - store result + + rev64 v4.16b, v5.16b //GHASH final-2 block + ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + eor x6, x6, x13 //AES final-1 block - round 10 low + + fmov d5, x6 //AES final-1 block - mov low + eor x7, x7, x14 //AES final-1 block - round 10 high + + pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high + fmov v5.d[1], x7 //AES final-1 block - mov high + + mov d22, v4.d[1] //GHASH final-2 block - mid + + pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low + + eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high + + eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid + + eor v5.16b, v5.16b, v2.16b //AES final-1 block - result + + eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low + + pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid + + movi v8.8b, #0 //suppress further partial tag feed in + + eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid +.L128_enc_blocks_more_than_1: //blocks left > 1 *** 45317 LINES SKIPPED ***
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202310252006.39PK6SwI039977>