Date: Sat, 22 Sep 2018 02:23:03 +0000 (UTC)
From: Jung-uk Kim <jkim@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-projects@freebsd.org
Subject: svn commit: r338875 - in projects/openssl111/secure/lib/libcrypto: . aarch64
Message-ID: <201809220223.w8M2N3d6062796@repo.freebsd.org>
Author: jkim
Date: Sat Sep 22 02:23:03 2018
New Revision: 338875
URL: https://svnweb.freebsd.org/changeset/base/338875

Log:
  Regen assembly files for aarch64.

Added:
  projects/openssl111/secure/lib/libcrypto/aarch64/armv8-mont.S   (contents, props changed)
  projects/openssl111/secure/lib/libcrypto/aarch64/chacha-armv8.S   (contents, props changed)
  projects/openssl111/secure/lib/libcrypto/aarch64/ecp_nistz256-armv8.S   (contents, props changed)
  projects/openssl111/secure/lib/libcrypto/aarch64/keccak1600-armv8.S   (contents, props changed)
  projects/openssl111/secure/lib/libcrypto/aarch64/poly1305-armv8.S   (contents, props changed)
  projects/openssl111/secure/lib/libcrypto/aarch64/vpaes-armv8.S   (contents, props changed)
Modified:
  projects/openssl111/secure/lib/libcrypto/Makefile.asm
  projects/openssl111/secure/lib/libcrypto/aarch64/aesv8-armx.S
  projects/openssl111/secure/lib/libcrypto/aarch64/ghashv8-armx.S
  projects/openssl111/secure/lib/libcrypto/aarch64/sha1-armv8.S
  projects/openssl111/secure/lib/libcrypto/aarch64/sha256-armv8.S
  projects/openssl111/secure/lib/libcrypto/aarch64/sha512-armv8.S

Modified: projects/openssl111/secure/lib/libcrypto/Makefile.asm
==============================================================================
--- projects/openssl111/secure/lib/libcrypto/Makefile.asm	Sat Sep 22 01:24:30 2018	(r338874)
+++ projects/openssl111/secure/lib/libcrypto/Makefile.asm	Sat Sep 22 02:23:03 2018	(r338875)
@@ -10,19 +10,35 @@
 .PATH:	${LCRYPTO_SRC}/crypto \
 	${LCRYPTO_SRC}/crypto/aes/asm \
+	${LCRYPTO_SRC}/crypto/bn/asm \
+	${LCRYPTO_SRC}/crypto/chacha/asm \
+	${LCRYPTO_SRC}/crypto/ec/asm \
 	${LCRYPTO_SRC}/crypto/modes/asm \
+	${LCRYPTO_SRC}/crypto/poly1305/asm \
 	${LCRYPTO_SRC}/crypto/sha/asm
 
 PERLPATH=	-I${LCRYPTO_SRC}/crypto/perlasm
 
 # aes
-SRCS=	aesv8-armx.pl
+SRCS=	aesv8-armx.pl vpaes-armv8.pl
 
+# bn
+SRCS+=	armv8-mont.pl
+
+# chacha
+SRCS+=	chacha-armv8.pl
+
+# ec
+SRCS+=	ecp_nistz256-armv8.pl
+
 # modes
 SRCS+=	ghashv8-armx.pl
 
+# poly1305
+SRCS+=	poly1305-armv8.pl
+
 # sha
-SRCS+=	sha1-armv8.pl sha512-armv8.pl
+SRCS+=	keccak1600-armv8.pl sha1-armv8.pl sha512-armv8.pl
 
 ASM=	${SRCS:R:S/$/.S/} sha256-armv8.S
@@ -32,13 +48,13 @@ CLEANFILES=	${ASM} ${SRCS:R:S/$/.s/} sha256-armv8.s
 .SUFFIXES:	.pl
 
 sha256-armv8.S:	sha512-armv8.pl
-	env CC=cc perl ${.ALLSRC} 64 ${.TARGET:R:S/$/.s/}
+	env CC=cc perl ${.ALLSRC} linux64 ${.TARGET:R:S/$/.s/}
 	( echo '/* $$'FreeBSD'$$ */' ;\
 	echo '/* Do not modify. This file is auto-generated from ${.ALLSRC:T:R:S/$/.pl/}. */' ;\
 	cat ${.TARGET:R:S/$/.s/}) > ${.TARGET}
 
 .pl.S:
-	env CC=cc perl ${.IMPSRC} 64 ${.TARGET:R:S/$/.s/}
+	env CC=cc perl ${.IMPSRC} linux64 ${.TARGET:R:S/$/.s/}
 	( echo '/* $$'FreeBSD'$$ */' ;\
 	echo '/* Do not modify. This file is auto-generated from ${.IMPSRC:T:R:S/$/.pl/}. */' ;\
 	cat ${.TARGET:R:S/$/.s/}) > ${.TARGET}
@@ -160,10 +176,10 @@ CLEANFILES=	${ASM} ${SRCS:R:S/$/.s/}
 aes-armv4.S:	aes-armv4.pl
 	( echo '/* $$'FreeBSD'$$ */' ;\
 	echo '/* Do not modify. This file is auto-generated from ${.ALLSRC:T}. */' ;\
-	env CC=cc perl ${.ALLSRC} elf ) > ${.TARGET}
+	env CC=cc perl ${.ALLSRC} linux32 ) > ${.TARGET}
 
 .pl.S:
-	env CC=cc perl ${.IMPSRC} elf ${.TARGET:R:S/$/.s/}
+	env CC=cc perl ${.IMPSRC} linux32 ${.TARGET:R:S/$/.s/}
 	( echo '/* $$'FreeBSD'$$ */' ;\
 	echo '/* Do not modify. This file is auto-generated from ${.IMPSRC:T:R:S/$/.pl/}.
*/' ;\ cat ${.TARGET:R:S/$/.s/}) > ${.TARGET} Modified: projects/openssl111/secure/lib/libcrypto/aarch64/aesv8-armx.S ============================================================================== --- projects/openssl111/secure/lib/libcrypto/aarch64/aesv8-armx.S Sat Sep 22 01:24:30 2018 (r338874) +++ projects/openssl111/secure/lib/libcrypto/aarch64/aesv8-armx.S Sat Sep 22 02:23:03 2018 (r338875) @@ -5,7 +5,7 @@ #if __ARM_MAX_ARCH__>=7 .text .align 5 -rcon: +.Lrcon: .long 0x01,0x01,0x01,0x01 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat .long 0x1b,0x1b,0x1b,0x1b @@ -30,7 +30,7 @@ aes_v8_set_encrypt_key: tst w1,#0x3f b.ne .Lenc_key_abort - adr x3,rcon + adr x3,.Lrcon cmp w1,#192 eor v0.16b,v0.16b,v0.16b @@ -54,7 +54,7 @@ aes_v8_set_encrypt_key: ext v5.16b,v0.16b,v5.16b,#12 eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 - eor v6.16b,v6.16b,v1.16b + eor v6.16b,v6.16b,v1.16b eor v3.16b,v3.16b,v5.16b shl v1.16b,v1.16b,#1 eor v3.16b,v3.16b,v6.16b @@ -71,7 +71,7 @@ aes_v8_set_encrypt_key: ext v5.16b,v0.16b,v5.16b,#12 eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 - eor v6.16b,v6.16b,v1.16b + eor v6.16b,v6.16b,v1.16b eor v3.16b,v3.16b,v5.16b shl v1.16b,v1.16b,#1 eor v3.16b,v3.16b,v6.16b @@ -85,7 +85,7 @@ aes_v8_set_encrypt_key: ext v5.16b,v0.16b,v5.16b,#12 eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 - eor v6.16b,v6.16b,v1.16b + eor v6.16b,v6.16b,v1.16b eor v3.16b,v3.16b,v5.16b eor v3.16b,v3.16b,v6.16b st1 {v3.4s},[x2] @@ -116,7 +116,7 @@ aes_v8_set_encrypt_key: dup v5.4s,v3.s[3] eor v5.16b,v5.16b,v4.16b - eor v6.16b,v6.16b,v1.16b + eor v6.16b,v6.16b,v1.16b ext v4.16b,v0.16b,v4.16b,#12 shl v1.16b,v1.16b,#1 eor v4.16b,v4.16b,v5.16b @@ -147,7 +147,7 @@ aes_v8_set_encrypt_key: ext v5.16b,v0.16b,v5.16b,#12 eor v3.16b,v3.16b,v5.16b ext v5.16b,v0.16b,v5.16b,#12 - eor v6.16b,v6.16b,v1.16b + eor v6.16b,v6.16b,v1.16b eor v3.16b,v3.16b,v5.16b shl v1.16b,v1.16b,#1 eor v3.16b,v3.16b,v6.16b @@ -291,13 +291,13 @@ aes_v8_cbc_encrypt: ld1 {v6.16b},[x4] ld1 {v0.16b},[x0],x8 - ld1 {v16.4s-v17.4s},[x3] // load key schedule... + ld1 {v16.4s,v17.4s},[x3] // load key schedule... 
sub w5,w5,#6 add x7,x3,x5,lsl#4 // pointer to last 7 round keys sub w5,w5,#2 - ld1 {v18.4s-v19.4s},[x7],#32 - ld1 {v20.4s-v21.4s},[x7],#32 - ld1 {v22.4s-v23.4s},[x7],#32 + ld1 {v18.4s,v19.4s},[x7],#32 + ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 ld1 {v7.4s},[x7] add x7,x3,#32 @@ -309,7 +309,7 @@ aes_v8_cbc_encrypt: eor v5.16b,v16.16b,v7.16b b.eq .Lcbc_enc128 - ld1 {v2.4s-v3.4s},[x7] + ld1 {v2.4s,v3.4s},[x7] add x7,x3,#16 add x6,x3,#16*4 add x12,x3,#16*5 @@ -323,7 +323,7 @@ aes_v8_cbc_encrypt: .Loop_cbc_enc: aese v0.16b,v16.16b aesmc v0.16b,v0.16b - st1 {v6.16b},[x1],#16 + st1 {v6.16b},[x1],#16 .Lenter_cbc_enc: aese v0.16b,v17.16b aesmc v0.16b,v0.16b @@ -347,21 +347,21 @@ aes_v8_cbc_encrypt: .Lcbc_enc192: aese v0.16b,v16.16b aesmc v0.16b,v0.16b - subs x2,x2,#16 + subs x2,x2,#16 aese v0.16b,v17.16b aesmc v0.16b,v0.16b - csel x8,xzr,x8,eq + csel x8,xzr,x8,eq aese v0.16b,v18.16b aesmc v0.16b,v0.16b aese v0.16b,v19.16b aesmc v0.16b,v0.16b - ld1 {v16.16b},[x0],x8 + ld1 {v16.16b},[x0],x8 aese v0.16b,v20.16b aesmc v0.16b,v0.16b - eor v16.16b,v16.16b,v5.16b + eor v16.16b,v16.16b,v5.16b aese v0.16b,v21.16b aesmc v0.16b,v0.16b - ld1 {v17.4s},[x7] // re-pre-load rndkey[1] + ld1 {v17.4s},[x7] // re-pre-load rndkey[1] aese v0.16b,v22.16b aesmc v0.16b,v0.16b aese v0.16b,v23.16b @@ -373,35 +373,35 @@ aes_v8_cbc_encrypt: .align 5 .Lcbc_enc128: - ld1 {v2.4s-v3.4s},[x7] + ld1 {v2.4s,v3.4s},[x7] aese v0.16b,v16.16b aesmc v0.16b,v0.16b b .Lenter_cbc_enc128 .Loop_cbc_enc128: aese v0.16b,v16.16b aesmc v0.16b,v0.16b - st1 {v6.16b},[x1],#16 + st1 {v6.16b},[x1],#16 .Lenter_cbc_enc128: aese v0.16b,v17.16b aesmc v0.16b,v0.16b - subs x2,x2,#16 + subs x2,x2,#16 aese v0.16b,v2.16b aesmc v0.16b,v0.16b - csel x8,xzr,x8,eq + csel x8,xzr,x8,eq aese v0.16b,v3.16b aesmc v0.16b,v0.16b aese v0.16b,v18.16b aesmc v0.16b,v0.16b aese v0.16b,v19.16b aesmc v0.16b,v0.16b - ld1 {v16.16b},[x0],x8 + ld1 {v16.16b},[x0],x8 aese v0.16b,v20.16b aesmc v0.16b,v0.16b aese v0.16b,v21.16b aesmc v0.16b,v0.16b aese v0.16b,v22.16b aesmc v0.16b,v0.16b - eor v16.16b,v16.16b,v5.16b + eor v16.16b,v16.16b,v5.16b aese v0.16b,v23.16b eor v6.16b,v0.16b,v7.16b b.hs .Loop_cbc_enc128 @@ -448,58 +448,58 @@ aes_v8_cbc_encrypt: aesimc v1.16b,v1.16b aesd v18.16b,v16.16b aesimc v18.16b,v18.16b - eor v4.16b,v6.16b,v7.16b - subs x2,x2,#0x30 - eor v5.16b,v2.16b,v7.16b - csel x6,x2,x6,lo // x6, w6, is zero at this point + eor v4.16b,v6.16b,v7.16b + subs x2,x2,#0x30 + eor v5.16b,v2.16b,v7.16b + csel x6,x2,x6,lo // x6, w6, is zero at this point aesd v0.16b,v17.16b aesimc v0.16b,v0.16b aesd v1.16b,v17.16b aesimc v1.16b,v1.16b aesd v18.16b,v17.16b aesimc v18.16b,v18.16b - eor v17.16b,v3.16b,v7.16b - add x0,x0,x6 // x0 is adjusted in such way that + eor v17.16b,v3.16b,v7.16b + add x0,x0,x6 // x0 is adjusted in such way that // at exit from the loop v1.16b-v18.16b // are loaded with last "words" - orr v6.16b,v19.16b,v19.16b - mov x7,x3 + orr v6.16b,v19.16b,v19.16b + mov x7,x3 aesd v0.16b,v20.16b aesimc v0.16b,v0.16b aesd v1.16b,v20.16b aesimc v1.16b,v1.16b aesd v18.16b,v20.16b aesimc v18.16b,v18.16b - ld1 {v2.16b},[x0],#16 + ld1 {v2.16b},[x0],#16 aesd v0.16b,v21.16b aesimc v0.16b,v0.16b aesd v1.16b,v21.16b aesimc v1.16b,v1.16b aesd v18.16b,v21.16b aesimc v18.16b,v18.16b - ld1 {v3.16b},[x0],#16 + ld1 {v3.16b},[x0],#16 aesd v0.16b,v22.16b aesimc v0.16b,v0.16b aesd v1.16b,v22.16b aesimc v1.16b,v1.16b aesd v18.16b,v22.16b aesimc v18.16b,v18.16b - ld1 {v19.16b},[x0],#16 + ld1 {v19.16b},[x0],#16 aesd v0.16b,v23.16b aesd v1.16b,v23.16b aesd v18.16b,v23.16b - ld1 
{v16.4s},[x7],#16 // re-pre-load rndkey[0] - add w6,w5,#2 + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + add w6,w5,#2 eor v4.16b,v4.16b,v0.16b eor v5.16b,v5.16b,v1.16b eor v18.16b,v18.16b,v17.16b - ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] st1 {v4.16b},[x1],#16 - orr v0.16b,v2.16b,v2.16b + orr v0.16b,v2.16b,v2.16b st1 {v5.16b},[x1],#16 - orr v1.16b,v3.16b,v3.16b + orr v1.16b,v3.16b,v3.16b st1 {v18.16b},[x1],#16 - orr v18.16b,v19.16b,v19.16b + orr v18.16b,v19.16b,v19.16b b.hs .Loop3x_cbc_dec cmn x2,#0x30 @@ -532,30 +532,30 @@ aes_v8_cbc_encrypt: aesimc v1.16b,v1.16b aesd v18.16b,v20.16b aesimc v18.16b,v18.16b - cmn x2,#0x20 + cmn x2,#0x20 aesd v1.16b,v21.16b aesimc v1.16b,v1.16b aesd v18.16b,v21.16b aesimc v18.16b,v18.16b - eor v5.16b,v6.16b,v7.16b + eor v5.16b,v6.16b,v7.16b aesd v1.16b,v22.16b aesimc v1.16b,v1.16b aesd v18.16b,v22.16b aesimc v18.16b,v18.16b - eor v17.16b,v3.16b,v7.16b + eor v17.16b,v3.16b,v7.16b aesd v1.16b,v23.16b aesd v18.16b,v23.16b b.eq .Lcbc_dec_one eor v5.16b,v5.16b,v1.16b eor v17.16b,v17.16b,v18.16b - orr v6.16b,v19.16b,v19.16b + orr v6.16b,v19.16b,v19.16b st1 {v5.16b},[x1],#16 st1 {v17.16b},[x1],#16 b .Lcbc_done .Lcbc_dec_one: eor v5.16b,v5.16b,v18.16b - orr v6.16b,v19.16b,v19.16b + orr v6.16b,v19.16b,v19.16b st1 {v5.16b},[x1],#16 .Lcbc_done: @@ -568,181 +568,181 @@ aes_v8_cbc_encrypt: .type aes_v8_ctr32_encrypt_blocks,%function .align 5 aes_v8_ctr32_encrypt_blocks: - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - ldr w5,[x3,#240] + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + ldr w5,[x3,#240] - ldr w8, [x4, #12] - ld1 {v0.4s},[x4] + ldr w8, [x4, #12] + ld1 {v0.4s},[x4] - ld1 {v16.4s-v17.4s},[x3] // load key schedule... - sub w5,w5,#4 - mov x12,#16 - cmp x2,#2 - add x7,x3,x5,lsl#4 // pointer to last 5 round keys - sub w5,w5,#2 - ld1 {v20.4s-v21.4s},[x7],#32 - ld1 {v22.4s-v23.4s},[x7],#32 - ld1 {v7.4s},[x7] - add x7,x3,#32 - mov w6,w5 + ld1 {v16.4s,v17.4s},[x3] // load key schedule... 
+ sub w5,w5,#4 + mov x12,#16 + cmp x2,#2 + add x7,x3,x5,lsl#4 // pointer to last 5 round keys + sub w5,w5,#2 + ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + add x7,x3,#32 + mov w6,w5 csel x12,xzr,x12,lo #ifndef __ARMEB__ - rev w8, w8 + rev w8, w8 #endif - orr v1.16b,v0.16b,v0.16b - add w10, w8, #1 - orr v18.16b,v0.16b,v0.16b - add w8, w8, #2 - orr v6.16b,v0.16b,v0.16b - rev w10, w10 - mov v1.s[3],w10 - b.ls .Lctr32_tail - rev w12, w8 - sub x2,x2,#3 // bias - mov v18.s[3],w12 - b .Loop3x_ctr32 + orr v1.16b,v0.16b,v0.16b + add w10, w8, #1 + orr v18.16b,v0.16b,v0.16b + add w8, w8, #2 + orr v6.16b,v0.16b,v0.16b + rev w10, w10 + mov v1.s[3],w10 + b.ls .Lctr32_tail + rev w12, w8 + sub x2,x2,#3 // bias + mov v18.s[3],w12 + b .Loop3x_ctr32 .align 4 .Loop3x_ctr32: - aese v0.16b,v16.16b - aesmc v0.16b,v0.16b - aese v1.16b,v16.16b - aesmc v1.16b,v1.16b - aese v18.16b,v16.16b - aesmc v18.16b,v18.16b - ld1 {v16.4s},[x7],#16 - subs w6,w6,#2 - aese v0.16b,v17.16b - aesmc v0.16b,v0.16b - aese v1.16b,v17.16b - aesmc v1.16b,v1.16b - aese v18.16b,v17.16b - aesmc v18.16b,v18.16b - ld1 {v17.4s},[x7],#16 - b.gt .Loop3x_ctr32 + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v18.16b,v17.16b + aesmc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt .Loop3x_ctr32 - aese v0.16b,v16.16b - aesmc v4.16b,v0.16b - aese v1.16b,v16.16b - aesmc v5.16b,v1.16b - ld1 {v2.16b},[x0],#16 - orr v0.16b,v6.16b,v6.16b - aese v18.16b,v16.16b - aesmc v18.16b,v18.16b - ld1 {v3.16b},[x0],#16 - orr v1.16b,v6.16b,v6.16b - aese v4.16b,v17.16b - aesmc v4.16b,v4.16b - aese v5.16b,v17.16b - aesmc v5.16b,v5.16b - ld1 {v19.16b},[x0],#16 - mov x7,x3 - aese v18.16b,v17.16b - aesmc v17.16b,v18.16b - orr v18.16b,v6.16b,v6.16b - add w9,w8,#1 - aese v4.16b,v20.16b - aesmc v4.16b,v4.16b - aese v5.16b,v20.16b - aesmc v5.16b,v5.16b - eor v2.16b,v2.16b,v7.16b - add w10,w8,#2 - aese v17.16b,v20.16b - aesmc v17.16b,v17.16b - eor v3.16b,v3.16b,v7.16b - add w8,w8,#3 - aese v4.16b,v21.16b - aesmc v4.16b,v4.16b - aese v5.16b,v21.16b - aesmc v5.16b,v5.16b - eor v19.16b,v19.16b,v7.16b - rev w9,w9 - aese v17.16b,v21.16b - aesmc v17.16b,v17.16b - mov v0.s[3], w9 - rev w10,w10 - aese v4.16b,v22.16b - aesmc v4.16b,v4.16b - aese v5.16b,v22.16b - aesmc v5.16b,v5.16b - mov v1.s[3], w10 - rev w12,w8 - aese v17.16b,v22.16b - aesmc v17.16b,v17.16b - mov v18.s[3], w12 - subs x2,x2,#3 - aese v4.16b,v23.16b - aese v5.16b,v23.16b - aese v17.16b,v23.16b + aese v0.16b,v16.16b + aesmc v4.16b,v0.16b + aese v1.16b,v16.16b + aesmc v5.16b,v1.16b + ld1 {v2.16b},[x0],#16 + orr v0.16b,v6.16b,v6.16b + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v3.16b},[x0],#16 + orr v1.16b,v6.16b,v6.16b + aese v4.16b,v17.16b + aesmc v4.16b,v4.16b + aese v5.16b,v17.16b + aesmc v5.16b,v5.16b + ld1 {v19.16b},[x0],#16 + mov x7,x3 + aese v18.16b,v17.16b + aesmc v17.16b,v18.16b + orr v18.16b,v6.16b,v6.16b + add w9,w8,#1 + aese v4.16b,v20.16b + aesmc v4.16b,v4.16b + aese v5.16b,v20.16b + aesmc v5.16b,v5.16b + eor v2.16b,v2.16b,v7.16b + add w10,w8,#2 + aese v17.16b,v20.16b + aesmc v17.16b,v17.16b + eor v3.16b,v3.16b,v7.16b + add w8,w8,#3 + aese v4.16b,v21.16b + aesmc v4.16b,v4.16b + aese v5.16b,v21.16b + aesmc v5.16b,v5.16b + eor v19.16b,v19.16b,v7.16b + rev w9,w9 + aese v17.16b,v21.16b + aesmc v17.16b,v17.16b + mov v0.s[3], w9 + rev w10,w10 + aese 
v4.16b,v22.16b + aesmc v4.16b,v4.16b + aese v5.16b,v22.16b + aesmc v5.16b,v5.16b + mov v1.s[3], w10 + rev w12,w8 + aese v17.16b,v22.16b + aesmc v17.16b,v17.16b + mov v18.s[3], w12 + subs x2,x2,#3 + aese v4.16b,v23.16b + aese v5.16b,v23.16b + aese v17.16b,v23.16b - eor v2.16b,v2.16b,v4.16b - ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] - st1 {v2.16b},[x1],#16 - eor v3.16b,v3.16b,v5.16b - mov w6,w5 - st1 {v3.16b},[x1],#16 - eor v19.16b,v19.16b,v17.16b - ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] - st1 {v19.16b},[x1],#16 - b.hs .Loop3x_ctr32 + eor v2.16b,v2.16b,v4.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + st1 {v2.16b},[x1],#16 + eor v3.16b,v3.16b,v5.16b + mov w6,w5 + st1 {v3.16b},[x1],#16 + eor v19.16b,v19.16b,v17.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v19.16b},[x1],#16 + b.hs .Loop3x_ctr32 - adds x2,x2,#3 - b.eq .Lctr32_done - cmp x2,#1 - mov x12,#16 + adds x2,x2,#3 + b.eq .Lctr32_done + cmp x2,#1 + mov x12,#16 csel x12,xzr,x12,eq .Lctr32_tail: - aese v0.16b,v16.16b - aesmc v0.16b,v0.16b - aese v1.16b,v16.16b - aesmc v1.16b,v1.16b - ld1 {v16.4s},[x7],#16 - subs w6,w6,#2 - aese v0.16b,v17.16b - aesmc v0.16b,v0.16b - aese v1.16b,v17.16b - aesmc v1.16b,v1.16b - ld1 {v17.4s},[x7],#16 - b.gt .Lctr32_tail + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v17.4s},[x7],#16 + b.gt .Lctr32_tail - aese v0.16b,v16.16b - aesmc v0.16b,v0.16b - aese v1.16b,v16.16b - aesmc v1.16b,v1.16b - aese v0.16b,v17.16b - aesmc v0.16b,v0.16b - aese v1.16b,v17.16b - aesmc v1.16b,v1.16b - ld1 {v2.16b},[x0],x12 - aese v0.16b,v20.16b - aesmc v0.16b,v0.16b - aese v1.16b,v20.16b - aesmc v1.16b,v1.16b - ld1 {v3.16b},[x0] - aese v0.16b,v21.16b - aesmc v0.16b,v0.16b - aese v1.16b,v21.16b - aesmc v1.16b,v1.16b - eor v2.16b,v2.16b,v7.16b - aese v0.16b,v22.16b - aesmc v0.16b,v0.16b - aese v1.16b,v22.16b - aesmc v1.16b,v1.16b - eor v3.16b,v3.16b,v7.16b - aese v0.16b,v23.16b - aese v1.16b,v23.16b + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v2.16b},[x0],x12 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v1.16b,v20.16b + aesmc v1.16b,v1.16b + ld1 {v3.16b},[x0] + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v1.16b,v21.16b + aesmc v1.16b,v1.16b + eor v2.16b,v2.16b,v7.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v1.16b,v22.16b + aesmc v1.16b,v1.16b + eor v3.16b,v3.16b,v7.16b + aese v0.16b,v23.16b + aese v1.16b,v23.16b - cmp x2,#1 - eor v2.16b,v2.16b,v0.16b - eor v3.16b,v3.16b,v1.16b - st1 {v2.16b},[x1],#16 - b.eq .Lctr32_done - st1 {v3.16b},[x1] + cmp x2,#1 + eor v2.16b,v2.16b,v0.16b + eor v3.16b,v3.16b,v1.16b + st1 {v2.16b},[x1],#16 + b.eq .Lctr32_done + st1 {v3.16b},[x1] .Lctr32_done: - ldr x29,[sp],#16 + ldr x29,[sp],#16 ret .size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks #endif Added: projects/openssl111/secure/lib/libcrypto/aarch64/armv8-mont.S ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ projects/openssl111/secure/lib/libcrypto/aarch64/armv8-mont.S Sat Sep 22 02:23:03 2018 (r338875) @@ -0,0 +1,1406 @@ +/* $FreeBSD$ */ +/* Do not modify. This file is auto-generated from armv8-mont.pl. 
*/ +.text + +.globl bn_mul_mont +.type bn_mul_mont,%function +.align 5 +bn_mul_mont: + tst x5,#7 + b.eq __bn_sqr8x_mont + tst x5,#3 + b.eq __bn_mul4x_mont +.Lmul_mont: + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldr x9,[x2],#8 // bp[0] + sub x22,sp,x5,lsl#3 + ldp x7,x8,[x1],#16 // ap[0..1] + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + and x22,x22,#-16 // ABI says so + ldp x13,x14,[x3],#16 // np[0..1] + + mul x6,x7,x9 // ap[0]*bp[0] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + mul x10,x8,x9 // ap[1]*bp[0] + umulh x11,x8,x9 + + mul x15,x6,x4 // "tp[0]"*n0 + mov sp,x22 // alloca + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 // discarded + // (*) As for removal of first multiplication and addition + // instructions. The outcome of first addition is + // guaranteed to be zero, which leaves two computationally + // significant outcomes: it either carries or not. Then + // question is when does it carry? Is there alternative + // way to deduce it? If you follow operations, you can + // observe that condition for carry is quite simple: + // x6 being non-zero. So that carry can be calculated + // by adding -1 to x6. That's what next instruction does. + subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + adc x13,x13,xzr + cbz x21,.L1st_skip + +.L1st: + ldr x8,[x1],#8 + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + ldr x14,[x3],#8 + adds x12,x16,x13 + mul x10,x8,x9 // ap[j]*bp[0] + adc x13,x17,xzr + umulh x11,x8,x9 + + adds x12,x12,x6 + mul x16,x14,x15 // np[j]*m1 + adc x13,x13,xzr + umulh x17,x14,x15 + str x12,[x22],#8 // tp[j-1] + cbnz x21,.L1st + +.L1st_skip: + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adc x13,x17,xzr + + adds x12,x12,x6 + sub x20,x5,#8 // i=num-1 + adcs x13,x13,x7 + + adc x19,xzr,xzr // upmost overflow bit + stp x12,x13,[x22] + +.Louter: + ldr x9,[x2],#8 // bp[i] + ldp x7,x8,[x1],#16 + ldr x23,[sp] // tp[0] + add x22,sp,#8 + + mul x6,x7,x9 // ap[0]*bp[i] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + ldp x13,x14,[x3],#16 + mul x10,x8,x9 // ap[1]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x15,x6,x4 + sub x20,x20,#8 // i-- + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 + subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + cbz x21,.Linner_skip + +.Linner: + ldr x8,[x1],#8 + adc x13,x13,xzr + ldr x23,[x22],#8 // tp[j] + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + adds x12,x16,x13 + ldr x14,[x3],#8 + adc x13,x17,xzr + + mul x10,x8,x9 // ap[j]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x16,x14,x15 // np[j]*m1 + adds x12,x12,x6 + umulh x17,x14,x15 + str x12,[x22,#-16] // tp[j-1] + cbnz x21,.Linner + +.Linner_skip: + ldr x23,[x22],#8 // tp[j] + adc x13,x13,xzr + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adcs x13,x17,x19 + adc x19,xzr,xzr + + adds x6,x6,x23 + adc x7,x7,xzr + + adds x12,x12,x6 + adcs x13,x13,x7 + adc x19,x19,xzr // upmost overflow bit + stp x12,x13,[x22,#-16] + + cbnz x20,.Louter + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. 
+ ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x14,[x3],#8 // np[0] + subs x21,x5,#8 // j=num-1 and clear borrow + mov x1,x0 +.Lsub: + sbcs x8,x23,x14 // tp[j]-np[j] + ldr x23,[x22],#8 + sub x21,x21,#8 // j-- + ldr x14,[x3],#8 + str x8,[x1],#8 // rp[j]=tp[j]-np[j] + cbnz x21,.Lsub + + sbcs x8,x23,x14 + sbcs x19,x19,xzr // did it borrow? + str x8,[x1],#8 // rp[num-1] + + ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x8,[x0],#8 // rp[0] + sub x5,x5,#8 // num-- + nop +.Lcond_copy: + sub x5,x5,#8 // num-- + csel x14,x23,x8,lo // did it borrow? + ldr x23,[x22],#8 + ldr x8,[x0],#8 + str xzr,[x22,#-16] // wipe tp + str x14,[x0,#-16] + cbnz x5,.Lcond_copy + + csel x14,x23,x8,lo + str xzr,[x22,#-8] // wipe tp + str x14,[x0,#-8] + + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + ret +.size bn_mul_mont,.-bn_mul_mont +.type __bn_sqr8x_mont,%function +.align 5 +__bn_sqr8x_mont: + cmp x1,x2 + b.ne __bn_mul4x_mont +.Lsqr8x_mont: + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x0,x3,[sp,#96] // offload rp and np + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + ldp x12,x13,[x1,#8*6] + + sub x2,sp,x5,lsl#4 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + mov sp,x2 // alloca + sub x27,x5,#8*8 + b .Lsqr8x_zero_start + +.Lsqr8x_zero: + sub x27,x27,#8*8 + stp xzr,xzr,[x2,#8*0] + stp xzr,xzr,[x2,#8*2] + stp xzr,xzr,[x2,#8*4] + stp xzr,xzr,[x2,#8*6] +.Lsqr8x_zero_start: + stp xzr,xzr,[x2,#8*8] + stp xzr,xzr,[x2,#8*10] + stp xzr,xzr,[x2,#8*12] + stp xzr,xzr,[x2,#8*14] + add x2,x2,#8*16 + cbnz x27,.Lsqr8x_zero + + add x3,x1,x5 + add x1,x1,#8*8 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + mov x23,xzr + mov x24,xzr + mov x25,xzr + mov x26,xzr + mov x2,sp + str x4,[x29,#112] // offload n0 + + // Multiply everything but a[i]*a[i] +.align 4 +.Lsqr8x_outer_loop: + // a[1]a[0] (i) + // a[2]a[0] + // a[3]a[0] + // a[4]a[0] + // a[5]a[0] + // a[6]a[0] + // a[7]a[0] + // a[2]a[1] (ii) + // a[3]a[1] + // a[4]a[1] + // a[5]a[1] + // a[6]a[1] + // a[7]a[1] + // a[3]a[2] (iii) + // a[4]a[2] + // a[5]a[2] + // a[6]a[2] + // a[7]a[2] + // a[4]a[3] (iv) + // a[5]a[3] + // a[6]a[3] + // a[7]a[3] + // a[5]a[4] (v) + // a[6]a[4] + // a[7]a[4] + // a[6]a[5] (vi) + // a[7]a[5] + // a[7]a[6] (vii) + + mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) + mul x15,x8,x6 + mul x16,x9,x6 + mul x17,x10,x6 + adds x20,x20,x14 // t[1]+lo(a[1]*a[0]) + mul x14,x11,x6 + adcs x21,x21,x15 + mul x15,x12,x6 + adcs x22,x22,x16 + mul x16,x13,x6 + adcs x23,x23,x17 + umulh x17,x7,x6 // hi(a[1..7]*a[0]) + adcs x24,x24,x14 + umulh x14,x8,x6 *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
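
[Editorial note] The comment block near the end of the new armv8-mont.S above ("Final step. We see if result is larger than modulus ... we subtract modulus, see if it borrowed, and conditionally copy original value") describes the usual branch-free ending of a Montgomery multiplication. Below is a minimal C sketch of that idea only, mirroring the .Lsub/.Lcond_copy loops; the function name, argument layout, and the explicit mask are illustrative assumptions for this note, not OpenSSL's bn API.

	#include <stddef.h>
	#include <stdint.h>

	/*
	 * Sketch of the "final step": the Montgomery result is an overflow
	 * bit plus num words in tp[].  Instead of comparing against the
	 * modulus np[] first, subtract unconditionally and let the final
	 * borrow decide which value to keep (cf. .Lsub / .Lcond_copy).
	 */
	static void mont_reduce_tail(uint64_t *rp, uint64_t *tp, const uint64_t *np,
	                             size_t num, uint64_t overflow)
	{
	    uint64_t borrow = 0;

	    /* rp = tp - np, word by word, propagating the borrow (.Lsub). */
	    for (size_t j = 0; j < num; j++) {
	        uint64_t t = tp[j], n = np[j];
	        rp[j] = t - n - borrow;
	        borrow = (t < n) || (t == n && borrow);
	    }

	    /* Borrowing past the overflow bit means tp was already < np, so
	     * keep tp; otherwise keep the difference.  A mask keeps the
	     * selection branch-free (.Lcond_copy uses csel the same way). */
	    uint64_t keep_tp = (uint64_t)0 - (uint64_t)(borrow > overflow);

	    for (size_t j = 0; j < num; j++) {
	        rp[j] = (tp[j] & keep_tp) | (rp[j] & ~keep_tp);
	        tp[j] = 0;              /* wipe the temporary, as the asm does */
	    }
	}

Subtracting unconditionally and selecting with a mask (or csel in the assembly) avoids a branch whose direction would depend on secret data, which is why the generated code prefers it over an explicit compare-and-jump.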
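[Editorial note] armv8-mont.S also carries a comment explaining why the first multiplication and addition of each reduction pass can be dropped: the low word of tp[0] + lo(m1*np[0]) is zero by construction of m1, so only the carry matters, and that carry is simply (tp[0] != 0), which the assembly recovers with "subs xzr,x6,#1". A small, self-contained C check of that claim follows; the variable names and the sample modulus word are illustrative only.

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	/* -n^{-1} mod 2^64 for odd n (Hensel lifting), the role of *n0. */
	static uint64_t neg_inv64(uint64_t n)
	{
	    uint64_t x = n;                 /* correct to 3 bits for odd n  */
	    for (int i = 0; i < 5; i++)
	        x *= 2 - n * x;             /* doubles the correct bit count */
	    return (uint64_t)0 - x;         /* n * (-x) == -1 mod 2^64       */
	}

	int main(void)
	{
	    uint64_t np0 = 0xd8775a9f36c0a6ebULL;   /* any odd modulus word */
	    uint64_t n0  = neg_inv64(np0);

	    for (uint64_t tp0 = 0; tp0 < 1000; tp0 += 37) {
	        uint64_t m1  = tp0 * n0;    /* "tp[0]"*n0, as in the asm     */
	        uint64_t lo  = m1 * np0;    /* low word of np[0]*m1          */
	        uint64_t sum = lo + tp0;    /* the addition the asm discards */

	        assert(sum == 0);               /* low word is always zero   */
	        assert((sum < lo) == (tp0 != 0));   /* carry iff tp0 != 0    */
	    }
	    puts("carry == (tp[0] != 0) holds");
	    return 0;
	}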