From owner-svn-src-all@freebsd.org Sat Jun 6 00:35:42 2020 Return-Path: Delivered-To: svn-src-all@mailman.nyi.freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2610:1c1:1:606c::19:1]) by mailman.nyi.freebsd.org (Postfix) with ESMTP id 75D4D33C5B1; Sat, 6 Jun 2020 00:35:42 +0000 (UTC) (envelope-from emaste@FreeBSD.org) Received: from mxrelay.nyi.freebsd.org (mxrelay.nyi.freebsd.org [IPv6:2610:1c1:1:606c::19:3]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (4096 bits) server-digest SHA256 client-signature RSA-PSS (4096 bits) client-digest SHA256) (Client CN "mxrelay.nyi.freebsd.org", Issuer "Let's Encrypt Authority X3" (verified OK)) by mx1.freebsd.org (Postfix) with ESMTPS id 49f0t62pQ8z4HSW; Sat, 6 Jun 2020 00:35:42 +0000 (UTC) (envelope-from emaste@FreeBSD.org) Received: from repo.freebsd.org (repo.freebsd.org [IPv6:2610:1c1:1:6068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mxrelay.nyi.freebsd.org (Postfix) with ESMTPS id 5B4D019B86; Sat, 6 Jun 2020 00:35:42 +0000 (UTC) (envelope-from emaste@FreeBSD.org) Received: from repo.freebsd.org ([127.0.1.37]) by repo.freebsd.org (8.15.2/8.15.2) with ESMTP id 0560Zg4T021205; Sat, 6 Jun 2020 00:35:42 GMT (envelope-from emaste@FreeBSD.org) Received: (from emaste@localhost) by repo.freebsd.org (8.15.2/8.15.2/Submit) id 0560Zg32021203; Sat, 6 Jun 2020 00:35:42 GMT (envelope-from emaste@FreeBSD.org) Message-Id: <202006060035.0560Zg32021203@repo.freebsd.org> X-Authentication-Warning: repo.freebsd.org: emaste set sender to emaste@FreeBSD.org using -f From: Ed Maste Date: Sat, 6 Jun 2020 00:35:42 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r361853 - in head: lib/libmd sys/crypto/skein/amd64 sys/modules/crypto X-SVN-Group: head X-SVN-Commit-Author: emaste X-SVN-Commit-Paths: in head: lib/libmd sys/crypto/skein/amd64 sys/modules/crypto X-SVN-Commit-Revision: 361853 X-SVN-Commit-Repository: base MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-all@freebsd.org X-Mailman-Version: 2.1.33 Precedence: list List-Id: "SVN commit messages for the entire src tree \(except for " user" and " projects" \)" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Sat, 06 Jun 2020 00:35:42 -0000 Author: emaste Date: Sat Jun 6 00:35:41 2020 New Revision: 361853 URL: https://svnweb.freebsd.org/changeset/base/361853 Log: Rename skein_block_asm.s to .S and assemble using Clang IAS Comparing the object files produced by GNU as 2.17.50 and Clang IAS shows many immaterial changes in strtab etc., and one material change in .text: 1bac: 4c 8b 4f 18 mov 0x18(%rdi),%r9 1bb0: eb 0e jmp 1bc0 - 1bb2: 66 66 2e 0f 1f 84 00 data16 nopw %cs:0x0(%rax,%rax,1) - 1bb9: 00 00 00 00 - 1bbd: 0f 1f 00 nopl (%rax) + 1bb2: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1) + 1bb9: 00 00 00 + 1bbc: 0f 1f 40 00 nopl 0x0(%rax) 0000000000001bc0 : Skein1024_block_loop(): 1bc0: 4c 8b 47 10 mov 0x10(%rdi),%r8 1bc4: 4c 03 85 c0 00 00 00 add 0xc0(%rbp),%r8 That is, GNU as and Clang's integrated assembler use different multi- byte NOPs for alignment (GNU as emits an 11 byte NOP + a 3 byte NOP, while Clang IAS emits a 10 byte NOP + a 4 byte NOP). Dependency cleanup hacks are not required, because we do not create .depend files from GNU as. Reviewed by: allanjude, arichardson, cem, tsoome Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D8434 Added: head/sys/crypto/skein/amd64/skein_block_asm.S - copied unchanged from r361852, head/sys/crypto/skein/amd64/skein_block_asm.s Deleted: head/sys/crypto/skein/amd64/skein_block_asm.s Modified: head/lib/libmd/Makefile head/sys/modules/crypto/Makefile Modified: head/lib/libmd/Makefile ============================================================================== --- head/lib/libmd/Makefile Sat Jun 6 00:02:50 2020 (r361852) +++ head/lib/libmd/Makefile Sat Jun 6 00:35:41 2020 (r361853) @@ -116,18 +116,15 @@ CFLAGS+= -DSHA1_ASM SRCS+= rmd160.S CFLAGS+= -DRMD160_ASM .endif -.if exists(${MACHINE_ARCH}/skein_block_asm.s) -.if defined(XAS) || ${MK_BINUTILS_BOOTSTRAP} != "no" -AFLAGS += --strip-local-absolute +.if exists(${MACHINE_ARCH}/skein_block_asm.S) # Fully unroll all loops in the assembly optimized version -AFLAGS+= --defsym SKEIN_LOOP=0 --defsym SKEIN_USE_ASM=1792 -SRCS+= skein_block_asm.s +ACFLAGS+= -DSKEIN_LOOP=0 +SRCS+= skein_block_asm.S CFLAGS+= -DSKEIN_ASM -DSKEIN_USE_ASM=1792 # list of block functions to replace with assembly: 256+512+1024 = 1792 .else .warning as not available: not using optimized Skein asm .endif -.endif -.if exists(${MACHINE_ARCH}/sha.S) || exists(${MACHINE_ARCH}/rmd160.S) || exists(${MACHINE_ARCH}/skein_block_asm.s) +.if exists(${MACHINE_ARCH}/sha.S) || exists(${MACHINE_ARCH}/rmd160.S) || exists(${MACHINE_ARCH}/skein_block_asm.S) ACFLAGS+= -DELF -Wa,--noexecstack .endif .endif # ${USE_ASM_SOURCES} != 0 Copied: head/sys/crypto/skein/amd64/skein_block_asm.S (from r361852, head/sys/crypto/skein/amd64/skein_block_asm.s) ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ head/sys/crypto/skein/amd64/skein_block_asm.S Sat Jun 6 00:35:41 2020 (r361853, copy of r361852, head/sys/crypto/skein/amd64/skein_block_asm.s) @@ -0,0 +1,1333 @@ +# +#---------------------------------------------------------------- +# 64-bit x86 assembler code (gnu as) for Skein block functions +# +# Author: Doug Whiting, Hifn/Exar +# +# This code is released to the public domain. +#---------------------------------------------------------------- +# $FreeBSD$ +# + .text + .altmacro +#ifndef __clang__ + .psize 0,128 #list file has no page boundaries +#endif +# +_MASK_ALL_ = (256+512+1024) #all three algorithm bits +_MAX_FRAME_ = 240 +# +################# +#ifndef SKEIN_USE_ASM +_USE_ASM_ = _MASK_ALL_ +#else +_USE_ASM_ = SKEIN_USE_ASM +#endif +################# +#configure loop unrolling +#ifndef SKEIN_LOOP +_SKEIN_LOOP = 2 #default is fully unrolled for 256/512, twice for 1024 +#else +_SKEIN_LOOP = SKEIN_LOOP + .irp _NN_,%_SKEIN_LOOP #only display loop unrolling if default changed on command line +#.print "+++ SKEIN_LOOP = \_NN_" + .endr +#endif +# the unroll counts (0 --> fully unrolled) +SKEIN_UNROLL_256 = (_SKEIN_LOOP / 100) % 10 +SKEIN_UNROLL_512 = (_SKEIN_LOOP / 10) % 10 +SKEIN_UNROLL_1024 = (_SKEIN_LOOP ) % 10 +# +SKEIN_ASM_UNROLL = 0 + .irp _NN_,256,512,1024 + .if (SKEIN_UNROLL_\_NN_) == 0 +SKEIN_ASM_UNROLL = SKEIN_ASM_UNROLL + \_NN_ + .endif + .endr +################# +# +.ifndef SKEIN_ROUNDS +ROUNDS_256 = 72 +ROUNDS_512 = 72 +ROUNDS_1024 = 80 +.else +ROUNDS_256 = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5) +ROUNDS_512 = 8*((((SKEIN_ROUNDS / 10) + 5) % 10) + 5) +ROUNDS_1024 = 8*((((SKEIN_ROUNDS ) + 5) % 10) + 5) +# only display rounds if default size is changed on command line +.irp _NN_,256,512,1024 + .if _USE_ASM_ && \_NN_ + .irp _RR_,%(ROUNDS_\_NN_) + .if _NN_ < 1024 +.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_" + .else +.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_" + .endif + .endr + .endif +.endr +.endif +################# +# +.ifdef SKEIN_CODE_SIZE +_SKEIN_CODE_SIZE = (1) +.else +.ifdef SKEIN_PERF #use code size if SKEIN_PERF is defined +_SKEIN_CODE_SIZE = (1) +.else +_SKEIN_CODE_SIZE = (0) +.endif +.endif +# +################# +# +.ifndef SKEIN_DEBUG +_SKEIN_DEBUG = 0 +.else +_SKEIN_DEBUG = 1 +.endif +################# +# +# define offsets of fields in hash context structure +# +HASH_BITS = 0 #bits of hash output +BCNT = 8 + HASH_BITS #number of bytes in BUFFER[] +TWEAK = 8 + BCNT #tweak values[0..1] +X_VARS = 16 + TWEAK #chaining vars +# +#(Note: buffer[] in context structure is NOT needed here :-) +# +KW_PARITY = 0x1BD11BDAA9FC1A22 #overall parity of key schedule words +FIRST_MASK = ~ (1 << 6) +FIRST_MASK64= ~ (1 << 62) +# +# rotation constants for Skein +# +RC_256_0_0 = 14 +RC_256_0_1 = 16 + +RC_256_1_0 = 52 +RC_256_1_1 = 57 + +RC_256_2_0 = 23 +RC_256_2_1 = 40 + +RC_256_3_0 = 5 +RC_256_3_1 = 37 + +RC_256_4_0 = 25 +RC_256_4_1 = 33 + +RC_256_5_0 = 46 +RC_256_5_1 = 12 + +RC_256_6_0 = 58 +RC_256_6_1 = 22 + +RC_256_7_0 = 32 +RC_256_7_1 = 32 + +RC_512_0_0 = 46 +RC_512_0_1 = 36 +RC_512_0_2 = 19 +RC_512_0_3 = 37 + +RC_512_1_0 = 33 +RC_512_1_1 = 27 +RC_512_1_2 = 14 +RC_512_1_3 = 42 + +RC_512_2_0 = 17 +RC_512_2_1 = 49 +RC_512_2_2 = 36 +RC_512_2_3 = 39 + +RC_512_3_0 = 44 +RC_512_3_1 = 9 +RC_512_3_2 = 54 +RC_512_3_3 = 56 + +RC_512_4_0 = 39 +RC_512_4_1 = 30 +RC_512_4_2 = 34 +RC_512_4_3 = 24 + +RC_512_5_0 = 13 +RC_512_5_1 = 50 +RC_512_5_2 = 10 +RC_512_5_3 = 17 + +RC_512_6_0 = 25 +RC_512_6_1 = 29 +RC_512_6_2 = 39 +RC_512_6_3 = 43 + +RC_512_7_0 = 8 +RC_512_7_1 = 35 +RC_512_7_2 = 56 +RC_512_7_3 = 22 + +RC_1024_0_0 = 24 +RC_1024_0_1 = 13 +RC_1024_0_2 = 8 +RC_1024_0_3 = 47 +RC_1024_0_4 = 8 +RC_1024_0_5 = 17 +RC_1024_0_6 = 22 +RC_1024_0_7 = 37 + +RC_1024_1_0 = 38 +RC_1024_1_1 = 19 +RC_1024_1_2 = 10 +RC_1024_1_3 = 55 +RC_1024_1_4 = 49 +RC_1024_1_5 = 18 +RC_1024_1_6 = 23 +RC_1024_1_7 = 52 + +RC_1024_2_0 = 33 +RC_1024_2_1 = 4 +RC_1024_2_2 = 51 +RC_1024_2_3 = 13 +RC_1024_2_4 = 34 +RC_1024_2_5 = 41 +RC_1024_2_6 = 59 +RC_1024_2_7 = 17 + +RC_1024_3_0 = 5 +RC_1024_3_1 = 20 +RC_1024_3_2 = 48 +RC_1024_3_3 = 41 +RC_1024_3_4 = 47 +RC_1024_3_5 = 28 +RC_1024_3_6 = 16 +RC_1024_3_7 = 25 + +RC_1024_4_0 = 41 +RC_1024_4_1 = 9 +RC_1024_4_2 = 37 +RC_1024_4_3 = 31 +RC_1024_4_4 = 12 +RC_1024_4_5 = 47 +RC_1024_4_6 = 44 +RC_1024_4_7 = 30 + +RC_1024_5_0 = 16 +RC_1024_5_1 = 34 +RC_1024_5_2 = 56 +RC_1024_5_3 = 51 +RC_1024_5_4 = 4 +RC_1024_5_5 = 53 +RC_1024_5_6 = 42 +RC_1024_5_7 = 41 + +RC_1024_6_0 = 31 +RC_1024_6_1 = 44 +RC_1024_6_2 = 47 +RC_1024_6_3 = 46 +RC_1024_6_4 = 19 +RC_1024_6_5 = 42 +RC_1024_6_6 = 44 +RC_1024_6_7 = 25 + +RC_1024_7_0 = 9 +RC_1024_7_1 = 48 +RC_1024_7_2 = 35 +RC_1024_7_3 = 52 +RC_1024_7_4 = 23 +RC_1024_7_5 = 31 +RC_1024_7_6 = 37 +RC_1024_7_7 = 20 +# +# Input: reg +# Output: <<< RC_BlkSize_roundNum_mixNum, BlkSize=256/512/1024 +# +.macro RotL64 reg,BLK_SIZE,ROUND_NUM,MIX_NUM + .if RC_\BLK_SIZE\()_\ROUND_NUM\()_\MIX_NUM #is there anything to do? + rolq $RC_\BLK_SIZE\()_\ROUND_NUM\()_\MIX_NUM,%\reg + .endif +.endm +# +#---------------------------------------------------------------- +# +# MACROS: define local vars and configure stack +# +#---------------------------------------------------------------- +# declare allocated space on the stack +.macro StackVar localName,localSize +\localName = _STK_OFFS_ +_STK_OFFS_ = _STK_OFFS_+(\localSize) +.endm #StackVar +# +#---------------------------------------------------------------- +# +# MACRO: Configure stack frame, allocate local vars +# +.macro Setup_Stack BLK_BITS,KS_CNT,debugCnt + WCNT = (\BLK_BITS)/64 +# +_PushCnt_ = 0 #save nonvolatile regs on stack + .irp _reg_,rbp,rbx,r12,r13,r14,r15 + pushq %\_reg_ +_PushCnt_ = _PushCnt_ + 1 #track count to keep alignment + .endr +# +_STK_OFFS_ = 0 #starting offset from rsp + #---- local variables #<-- rsp + StackVar X_stk ,8*(WCNT) #local context vars + StackVar ksTwk ,8*3 #key schedule: tweak words + StackVar ksKey ,8*(WCNT)+8 #key schedule: key words + .if (SKEIN_ASM_UNROLL && (\BLK_BITS)) == 0 + StackVar ksRot ,16*(\KS_CNT) #leave space for "rotation" to happen + .endif + StackVar Wcopy ,8*(WCNT) #copy of input block + .if _SKEIN_DEBUG + .if \debugCnt + 0 #temp location for debug X[] info + StackVar xDebug_\BLK_BITS ,8*(\debugCnt) + .endif + .endif + .if ((8*_PushCnt_ + _STK_OFFS_) % 8) == 0 + StackVar align16,8 #keep 16-byte aligned (adjust for retAddr?) +tmpStk_\BLK_BITS = align16 #use this + .endif + #---- saved caller parameters (from regs rdi, rsi, rdx, rcx) + StackVar ctxPtr ,8 #context ptr + StackVar blkPtr ,8 #pointer to block data + StackVar blkCnt ,8 #number of full blocks to process + StackVar bitAdd ,8 #bit count to add to tweak +LOCAL_SIZE = _STK_OFFS_ #size of "local" vars + #---- + StackVar savRegs,8*_PushCnt_ #saved registers + StackVar retAddr,8 #return address + #---- caller's stack frame (aligned mod 16) +# +# set up the stack frame pointer (rbp) +# +FRAME_OFFS = ksTwk + 128 #allow short (negative) offset to ksTwk, kwKey + .if FRAME_OFFS > _STK_OFFS_ #keep rbp in the "locals" range +FRAME_OFFS = _STK_OFFS_ + .endif +F_O = -FRAME_OFFS +# + #put some useful defines in the .lst file (for grep) +__STK_LCL_SIZE_\BLK_BITS = LOCAL_SIZE +__STK_TOT_SIZE_\BLK_BITS = _STK_OFFS_ +__STK_FRM_OFFS_\BLK_BITS = FRAME_OFFS +# +# Notes on stack frame setup: +# * the most frequently used variable is X_stk[], based at [rsp+0] +# * the next most used is the key schedule arrays, ksKey and ksTwk +# so rbp is "centered" there, allowing short offsets to the key +# schedule even in 1024-bit Skein case +# * the Wcopy variables are infrequently accessed, but they have long +# offsets from both rsp and rbp only in the 1024-bit case. +# * all other local vars and calling parameters can be accessed +# with short offsets, except in the 1024-bit case +# + subq $LOCAL_SIZE,%rsp #make room for the locals + leaq FRAME_OFFS(%rsp),%rbp #maximize use of short offsets + movq %rdi, ctxPtr+F_O(%rbp) #save caller's parameters on the stack + movq %rsi, blkPtr+F_O(%rbp) + movq %rdx, blkCnt+F_O(%rbp) + movq %rcx, bitAdd+F_O(%rbp) +# +.endm #Setup_Stack +# +#---------------------------------------------------------------- +# +.macro Reset_Stack + addq $LOCAL_SIZE,%rsp #get rid of locals (wipe?) + .irp _reg_,r15,r14,r13,r12,rbx,rbp + popq %\_reg_ #restore caller's regs +_PushCnt_ = _PushCnt_ - 1 + .endr + .if _PushCnt_ + .error "Mismatched push/pops?" + .endif +.endm # Reset_Stack +# +#---------------------------------------------------------------- +# macros to help debug internals +# +.if _SKEIN_DEBUG + .extern Skein_Show_Block #calls to C routines + .extern Skein_Show_Round +# +SKEIN_RND_SPECIAL = 1000 +SKEIN_RND_KEY_INITIAL = SKEIN_RND_SPECIAL+0 +SKEIN_RND_KEY_INJECT = SKEIN_RND_SPECIAL+1 +SKEIN_RND_FEED_FWD = SKEIN_RND_SPECIAL+2 +# +.macro Skein_Debug_Block BLK_BITS +# +#void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X, +# const u08b_t *blkPtr, const u64b_t *wPtr, +# const u64b_t *ksPtr,const u64b_t *tsPtr) +# +_NN_ = 0 + .irp _reg_,rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11 + pushq %\_reg_ #save all volatile regs on tack before the call +_NN_ = _NN_ + 1 + .endr + # get and push call parameters + movq $\BLK_BITS ,%rdi #bits + movq ctxPtr+F_O(%rbp),%rsi #h (pointer) + leaq X_VARS (%rsi),%rdx #X (pointer) + movq blkPtr+F_O(%rbp),%rcx #blkPtr + leaq Wcopy +F_O(%rbp),%r8 #wPtr + leaq ksKey +F_O(%rbp),%r9 #key pointer + leaq ksTwk +F_O(%rbp),%rax #tweak pointer + pushq %rax # (pass on the stack) + call Skein_Show_Block #call external debug handler + addq $8*1,%rsp #discard parameters on stack + .if (_NN_ % 2 ) == 0 #check stack alignment + .error "Stack misalignment problem in Skein_Debug_Block_\_BLK_BITS" + .endif + .irp _reg_,r11,r10,r9,r8,rdi,rsi,rdx,rcx,rax + popq %\_reg_ #restore regs +_NN_ = _NN_ - 1 + .endr + .if _NN_ + .error "Push/pop mismatch problem in Skein_Debug_Block_\_BLK_BITS" + .endif +.endm # Skein_Debug_Block +# +# the macro to "call" to debug a round +# +.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp + # call the appropriate (local) debug "function" + pushq %rdx #save rdx, so we can use it for round "number" + .if (SKEIN_ASM_UNROLL && \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL) + movq $\R,%rdx + .else #compute round number using edi +_rOffs_ = \RDI_OFFS + 0 + .if \BLK_BITS == 1024 + movq rIdx_offs+8(%rsp),%rdx #get rIdx off the stack (adjust for pushq rdx above) + leaq 1+(((\R)-1) && 3)+_rOffs_(,%rdx,4),%rdx + .else + leaq 1+(((\R)-1) && 3)+_rOffs_(,%rdi,4),%rdx + .endif + .endif + call Skein_Debug_Round_\BLK_BITS + popq %rdx #restore origianl rdx value +# + afterOp +.endm # Skein_Debug_Round +.else #------- _SKEIN_DEBUG (dummy macros if debug not enabled) +.macro Skein_Debug_Block BLK_BITS +.endm +# +.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp +.endm +# +.endif # _SKEIN_DEBUG +# +#---------------------------------------------------------------- +# +.macro addReg dstReg,srcReg_A,srcReg_B,useAddOp,immOffs + .if \immOffs + 0 + leaq \immOffs(%\srcReg_A\srcReg_B,%\dstReg),%\dstReg + .elseif ((\useAddOp + 0) == 0) + .ifndef ASM_NO_LEA #lea seems to be faster on Core 2 Duo CPUs! + leaq (%\srcReg_A\srcReg_B,%\dstReg),%\dstReg + .else + addq %\srcReg_A\srcReg_B,%\dstReg + .endif + .else + addq %\srcReg_A\srcReg_B,%\dstReg + .endif +.endm + +# keep Intel-style ordering here, to match addReg +.macro xorReg dstReg,srcReg_A,srcReg_B + xorq %\srcReg_A\srcReg_B,%\dstReg +.endm +# +#---------------------------------------------------------------- +# +.macro C_label lName + \lName: #use both "genders" to work across linkage conventions +_\lName: + .global \lName + .global _\lName +.endm +# +#=================================== Skein_256 ============================================= +# +.if _USE_ASM_ & 256 +# +# void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)# +# +################# +# +# code +# +C_label Skein_256_Process_Block + Setup_Stack 256,((ROUNDS_256/8)+1) + movq TWEAK+8(%rdi),%r14 + jmp Skein_256_block_loop + .p2align 4 + # main hash loop for Skein_256 +Skein_256_block_loop: + # + # general register usage: + # RAX..RDX = X0..X3 + # R08..R12 = ks[0..4] + # R13..R15 = ts[0..2] + # RSP, RBP = stack/frame pointers + # RDI = round counter or context pointer + # RSI = temp + # + movq TWEAK+0(%rdi) ,%r13 + addq bitAdd+F_O(%rbp) ,%r13 #computed updated tweak value T0 + movq %r14 ,%r15 + xorq %r13 ,%r15 #now %r13.%r15 is set as the tweak + + movq $KW_PARITY ,%r12 + movq X_VARS+ 0(%rdi),%r8 + movq X_VARS+ 8(%rdi),%r9 + movq X_VARS+16(%rdi),%r10 + movq X_VARS+24(%rdi),%r11 + movq %r13,TWEAK+0(%rdi) #save updated tweak value ctx->h.T[0] + xorq %r8 ,%r12 #start accumulating overall parity + + movq blkPtr +F_O(%rbp) ,%rsi #esi --> input block + xorq %r9 ,%r12 + movq 0(%rsi) ,%rax #get X[0..3] + xorq %r10 ,%r12 + movq 8(%rsi) ,%rbx + xorq %r11 ,%r12 + movq 16(%rsi) ,%rcx + movq 24(%rsi) ,%rdx + + movq %rax,Wcopy+ 0+F_O(%rbp) #save copy of input block + movq %rbx,Wcopy+ 8+F_O(%rbp) + movq %rcx,Wcopy+16+F_O(%rbp) + movq %rdx,Wcopy+24+F_O(%rbp) + + addq %r8 ,%rax #initial key injection + addq %r9 ,%rbx + addq %r10,%rcx + addq %r11,%rdx + addq %r13,%rbx + addq %r14,%rcx + +.if _SKEIN_DEBUG + movq %r14,TWEAK+ 8(%rdi) #save updated tweak T[1] (start bit cleared?) + movq %r8 ,ksKey+ 0+F_O(%rbp) #save key schedule on stack for Skein_Debug_Block + movq %r9 ,ksKey+ 8+F_O(%rbp) + movq %r10,ksKey+16+F_O(%rbp) + movq %r11,ksKey+24+F_O(%rbp) + movq %r12,ksKey+32+F_O(%rbp) + + movq %r13,ksTwk+ 0+F_O(%rbp) + movq %r14,ksTwk+ 8+F_O(%rbp) + movq %r15,ksTwk+16+F_O(%rbp) + + movq %rax,X_stk + 0(%rsp) #save X[] on stack for Skein_Debug_Block + movq %rbx,X_stk + 8(%rsp) + movq %rcx,X_stk +16(%rsp) + movq %rdx,X_stk +24(%rsp) + + Skein_Debug_Block 256 #debug dump + Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL +.endif +# +.if ((SKEIN_ASM_UNROLL & 256) == 0) + movq %r8 ,ksKey+40+F_O(%rbp) #save key schedule on stack for looping code + movq %r9 ,ksKey+ 8+F_O(%rbp) + movq %r10,ksKey+16+F_O(%rbp) + movq %r11,ksKey+24+F_O(%rbp) + movq %r12,ksKey+32+F_O(%rbp) + + movq %r13,ksTwk+24+F_O(%rbp) + movq %r14,ksTwk+ 8+F_O(%rbp) + movq %r15,ksTwk+16+F_O(%rbp) +.endif + addq $WCNT*8,%rsi #skip the block + movq %rsi,blkPtr +F_O(%rbp) #update block pointer + # + # now the key schedule is computed. Start the rounds + # +.if SKEIN_ASM_UNROLL & 256 +_UNROLL_CNT = ROUNDS_256/8 +.else +_UNROLL_CNT = SKEIN_UNROLL_256 + .if ((ROUNDS_256/8) % _UNROLL_CNT) + .error "Invalid SKEIN_UNROLL_256" + .endif + xorq %rdi,%rdi #rdi = iteration count +Skein_256_round_loop: +.endif +_Rbase_ = 0 +.rept _UNROLL_CNT*2 + # all X and ks vars in regs # (ops to "rotate" ks vars, via mem, if not unrolled) + # round 4*_RBase_ + 0 + addReg rax, rbx + RotL64 rbx, 256,%((4*_Rbase_+0) % 8),0 + addReg rcx, rdx + .if (SKEIN_ASM_UNROLL & 256) == 0 + movq ksKey+8*1+F_O(%rbp,%rdi,8),%r8 + .endif + xorReg rbx, rax + RotL64 rdx, 256,%((4*_Rbase_+0) % 8),1 + xorReg rdx, rcx + .if SKEIN_ASM_UNROLL & 256 + .irp _r0_,%( 8+(_Rbase_+3) % 5) + .irp _r1_,%(13+(_Rbase_+2) % 3) + leaq (%r\_r0_,%r\_r1_),%rdi #precompute key injection value for %rcx + .endr + .endr + .endif + .if (SKEIN_ASM_UNROLL & 256) == 0 + movq ksTwk+8*1+F_O(%rbp,%rdi,8),%r13 + .endif + Skein_Debug_Round 256,%(4*_Rbase_+1) + + # round 4*_Rbase_ + 1 + addReg rax, rdx + RotL64 rdx, 256,%((4*_Rbase_+1) % 8),0 + xorReg rdx, rax + .if (SKEIN_ASM_UNROLL & 256) == 0 + movq ksKey+8*2+F_O(%rbp,%rdi,8),%r9 + .endif + addReg rcx, rbx + RotL64 rbx, 256,%((4*_Rbase_+1) % 8),1 + xorReg rbx, rcx + .if (SKEIN_ASM_UNROLL & 256) == 0 + movq ksKey+8*4+F_O(%rbp,%rdi,8),%r11 + .endif + Skein_Debug_Round 256,%(4*_Rbase_+2) + .if SKEIN_ASM_UNROLL & 256 + .irp _r0_,%( 8+(_Rbase_+2) % 5) + .irp _r1_,%(13+(_Rbase_+1) % 3) + leaq (%r\_r0_,%r\_r1_),%rsi #precompute key injection value for %rbx + .endr + .endr + .endif + # round 4*_Rbase_ + 2 + addReg rax, rbx + RotL64 rbx, 256,%((4*_Rbase_+2) % 8),0 + addReg rcx, rdx + .if (SKEIN_ASM_UNROLL & 256) == 0 + movq ksKey+8*3+F_O(%rbp,%rdi,8),%r10 + .endif + xorReg rbx, rax + RotL64 rdx, 256,%((4*_Rbase_+2) % 8),1 + xorReg rdx, rcx + .if (SKEIN_ASM_UNROLL & 256) == 0 + movq %r8,ksKey+8*6+F_O(%rbp,%rdi,8) #"rotate" the key + leaq 1(%r11,%rdi),%r11 #precompute key + tweak + .endif + Skein_Debug_Round 256,%(4*_Rbase_+3) + # round 4*_Rbase_ + 3 + addReg rax, rdx + RotL64 rdx, 256,%((4*_Rbase_+3) % 8),0 + addReg rcx, rbx + .if (SKEIN_ASM_UNROLL & 256) == 0 + addq ksTwk+8*2+F_O(%rbp,%rdi,8),%r10 #precompute key + tweak + movq %r13,ksTwk+8*4+F_O(%rbp,%rdi,8) #"rotate" the tweak + .endif + xorReg rdx, rax + RotL64 rbx, 256,%((4*_Rbase_+3) % 8),1 + xorReg rbx, rcx + Skein_Debug_Round 256,%(4*_Rbase_+4) + .if (SKEIN_ASM_UNROLL & 256) == 0 + addReg r9 ,r13 #precompute key+tweak + .endif + #inject key schedule words +_Rbase_ = _Rbase_+1 + .if SKEIN_ASM_UNROLL & 256 + addReg rax,r,%(8+((_Rbase_+0) % 5)) + addReg rbx,rsi + addReg rcx,rdi + addReg rdx,r,%(8+((_Rbase_+3) % 5)),,_Rbase_ + .else + incq %rdi + addReg rax,r8 + addReg rcx,r10 + addReg rbx,r9 + addReg rdx,r11 + .endif + Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT +.endr #rept _UNROLL_CNT +# +.if (SKEIN_ASM_UNROLL & 256) == 0 + cmpq $2*(ROUNDS_256/8),%rdi + jb Skein_256_round_loop +.endif # (SKEIN_ASM_UNROLL & 256) == 0 + movq ctxPtr +F_O(%rbp),%rdi #restore rdi --> context + + #---------------------------- + # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..3} + movq $FIRST_MASK64 ,%r14 + xorq Wcopy + 0+F_O (%rbp),%rax + xorq Wcopy + 8+F_O (%rbp),%rbx + xorq Wcopy +16+F_O (%rbp),%rcx + xorq Wcopy +24+F_O (%rbp),%rdx + andq TWEAK + 8 (%rdi),%r14 + movq %rax,X_VARS+ 0(%rdi) #store final result + movq %rbx,X_VARS+ 8(%rdi) + movq %rcx,X_VARS+16(%rdi) + movq %rdx,X_VARS+24(%rdi) + + Skein_Debug_Round 256,SKEIN_RND_FEED_FWD + + # go back for more blocks, if needed + decq blkCnt+F_O(%rbp) + jnz Skein_256_block_loop + movq %r14,TWEAK + 8(%rdi) + Reset_Stack + ret +Skein_256_Process_Block_End: + + .if _SKEIN_DEBUG +Skein_Debug_Round_256: #here with rdx == round "number" from macro + pushq %rsi #save two regs for BLK_BITS-specific parms + pushq %rdi + movq 24(%rsp),%rdi #get back original rdx (pushed on stack in macro call) to rdi + movq %rax,X_stk+ 0+F_O(%rbp) #save X[] state on stack so debug routines can access it + movq %rbx,X_stk+ 8+F_O(%rbp) #(use FP_ since rsp has changed!) + movq %rcx,X_stk+16+F_O(%rbp) + movq %rdi,X_stk+24+F_O(%rbp) + + movq ctxPtr+F_O(%rbp),%rsi #ctx_hdr_ptr + movq $256,%rdi #now are set for the call + jmp Skein_Debug_Round_Common + .endif +# +.if _SKEIN_CODE_SIZE +C_label Skein_256_Process_Block_CodeSize + movq $(Skein_256_Process_Block_End-Skein_256_Process_Block),%rax + ret +# +C_label Skein_256_Unroll_Cnt + .if _UNROLL_CNT <> ROUNDS_256/8 + movq $_UNROLL_CNT,%rax + .else + xorq %rax,%rax + .endif + ret +.endif +# +.endif #_USE_ASM_ & 256 +# +#=================================== Skein_512 ============================================= +# +.if _USE_ASM_ & 512 +# +# void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd) +# +# X[i] == %r[8+i] #register assignments for X[] values during rounds (i=0..7) +# +################# +# MACRO: one round for 512-bit blocks +# +.macro R_512_OneRound rn0,rn1,rn2,rn3,rn4,rn5,rn6,rn7,_Rn_,op1,op2,op3,op4 +# + addReg r\rn0, r\rn1 + RotL64 r\rn1, 512,%((\_Rn_) % 8),0 + xorReg r\rn1, r\rn0 + \op1 + addReg r\rn2, r\rn3 + RotL64 r\rn3, 512,%((\_Rn_) % 8),1 + xorReg r\rn3, r\rn2 + \op2 + addReg r\rn4, r\rn5 + RotL64 r\rn5, 512,%((\_Rn_) % 8),2 + xorReg r\rn5, r\rn4 + \op3 + addReg r\rn6, r\rn7 + RotL64 r\rn7, 512,%((\_Rn_) % 8),3 + xorReg r\rn7, r\rn6 + \op4 + Skein_Debug_Round 512,%(\_Rn_+1),-4 +# +.endm #R_512_OneRound +# +################# +# MACRO: eight rounds for 512-bit blocks +# +.macro R_512_FourRounds _RR_ #RR = base round number (0 % 8) + .if (SKEIN_ASM_UNROLL && 512) + # here for fully unrolled case. + _II_ = ((\_RR_)/4) + 1 #key injection counter + R_512_OneRound 8, 9,10,11,12,13,14,15,%((\_RR_)+0),,, + R_512_OneRound 10, 9,12,15,14,13, 8,11,%((\_RR_)+1),,, + R_512_OneRound 12, 9,14,11, 8,13,10,15,%((\_RR_)+2),,, + R_512_OneRound 14, 9, 8,15,10,13,12,11,%((\_RR_)+3),, + # inject the key schedule + addq ksKey+8*(((_II_)+0)%9)+F_O(%rbp),%r8 + addReg r11, rax + addq ksKey+8*(((_II_)+1)%9)+F_O(%rbp),%r9 + addReg r12, rbx + addq ksKey+8*(((_II_)+2)%9)+F_O(%rbp),%r10 + addReg r13, rcx + addReg r14, rdx + addReg r15, rsi,,,(_II_) + .else + # here for looping case #"rotate" key/tweak schedule (move up on stack) + incq %rdi #bump key injection counter + R_512_OneRound 8, 9,10,11,12,13,14,15,%((\_RR_)+0),,, + R_512_OneRound 10, 9,12,15,14,13, 8,11,%((\_RR_)+1),,, + R_512_OneRound 12, 9,14,11, 8,13,10,15,%((\_RR_)+2),,, + R_512_OneRound 14, 9, 8,15,10,13,12,11,%((\_RR_)+3),, + # inject the key schedule + addq ksKey+8*0+F_O(%rbp,%rdi,8),%r8 + addReg r11, rax + addReg r12, rbx + addq ksKey+8*1+F_O(%rbp,%rdi,8),%r9 + addReg r13, rcx + addReg r14, rdx + addq ksKey+8*2+F_O(%rbp,%rdi,8),%r10 + addReg r15, rsi + addReg r15, rdi #inject the round number + .endif + + #show the result of the key injection + Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT +.endm #R_512_EightRounds +# +################# +# instantiated code +# +C_label Skein_512_Process_Block + Setup_Stack 512,ROUNDS_512/8 + movq TWEAK+ 8(%rdi),%rbx + jmp Skein_512_block_loop + .p2align 4 + # main hash loop for Skein_512 +Skein_512_block_loop: + # general register usage: + # RAX..RDX = temps for key schedule pre-loads + # R8 ..R15 = X0..X7 + # RSP, RBP = stack/frame pointers + # RDI = round counter or context pointer + # RSI = temp + # + movq TWEAK + 0(%rdi),%rax + addq bitAdd+F_O(%rbp),%rax #computed updated tweak value T0 + movq %rbx,%rcx + xorq %rax,%rcx #%rax/%rbx/%rcx = tweak schedule + movq %rax,TWEAK+ 0 (%rdi) #save updated tweak value ctx->h.T[0] + movq %rax,ksTwk+ 0+F_O(%rbp) + movq $KW_PARITY,%rdx + movq blkPtr +F_O(%rbp),%rsi #%rsi --> input block + movq %rbx,ksTwk+ 8+F_O(%rbp) + movq %rcx,ksTwk+16+F_O(%rbp) + .irp _Rn_,8,9,10,11,12,13,14,15 + movq X_VARS+8*(\_Rn_-8)(%rdi),%r\_Rn_ + xorq %r\_Rn_,%rdx #compute overall parity + movq %r\_Rn_,ksKey+8*(\_Rn_-8)+F_O(%rbp) + .endr #load state into %r8 ..%r15, compute parity + movq %rdx,ksKey+8*(8)+F_O(%rbp)#save key schedule parity + + addReg r13,rax #precompute key injection for tweak + addReg r14, rbx +.if _SKEIN_DEBUG + movq %rbx,TWEAK+ 8(%rdi) #save updated tweak value ctx->h.T[1] for Skein_Debug_Block below +.endif + movq 0(%rsi),%rax #load input block + movq 8(%rsi),%rbx + movq 16(%rsi),%rcx + movq 24(%rsi),%rdx + addReg r8 , rax #do initial key injection + addReg r9 , rbx + movq %rax,Wcopy+ 0+F_O(%rbp) #keep local copy for feedforward + movq %rbx,Wcopy+ 8+F_O(%rbp) + addReg r10, rcx + addReg r11, rdx + movq %rcx,Wcopy+16+F_O(%rbp) + movq %rdx,Wcopy+24+F_O(%rbp) + + movq 32(%rsi),%rax + movq 40(%rsi),%rbx + movq 48(%rsi),%rcx + movq 56(%rsi),%rdx + addReg r12, rax + addReg r13, rbx + addReg r14, rcx + addReg r15, rdx + movq %rax,Wcopy+32+F_O(%rbp) + movq %rbx,Wcopy+40+F_O(%rbp) + movq %rcx,Wcopy+48+F_O(%rbp) + movq %rdx,Wcopy+56+F_O(%rbp) + +.if _SKEIN_DEBUG + .irp _Rn_,8,9,10,11,12,13,14,15 #save values on stack for debug output + movq %r\_Rn_,X_stk+8*(\_Rn_-8)(%rsp) + .endr + + Skein_Debug_Block 512 #debug dump + Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL +.endif + addq $8*WCNT,%rsi #skip the block + movq %rsi,blkPtr+F_O(%rbp) #update block pointer + # + ################# + # now the key schedule is computed. Start the rounds + # +.if SKEIN_ASM_UNROLL & 512 +_UNROLL_CNT = ROUNDS_512/8 +.else +_UNROLL_CNT = SKEIN_UNROLL_512 + .if ((ROUNDS_512/8) % _UNROLL_CNT) + .error "Invalid SKEIN_UNROLL_512" + .endif + xorq %rdi,%rdi #rdi = round counter +Skein_512_round_loop: +.endif +# +_Rbase_ = 0 +.rept _UNROLL_CNT*2 + R_512_FourRounds %(4*_Rbase_+00) +_Rbase_ = _Rbase_+1 +.endr #rept _UNROLL_CNT +# +.if (SKEIN_ASM_UNROLL & 512) == 0 + cmpq $2*(ROUNDS_512/8),%rdi + jb Skein_512_round_loop + movq ctxPtr +F_O(%rbp),%rdi #restore rdi --> context +.endif + # end of rounds + ################# + # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..7} + .irp _Rn_,8,9,10,11,12,13,14,15 + .if (\_Rn_ == 8) + movq $FIRST_MASK64,%rbx + .endif + xorq Wcopy+8*(\_Rn_-8)+F_O(%rbp),%r\_Rn_ #feedforward XOR + movq %r\_Rn_,X_VARS+8*(\_Rn_-8)(%rdi) #and store result + .if (\_Rn_ == 14) + andq TWEAK+ 8(%rdi),%rbx + .endif + .endr + Skein_Debug_Round 512,SKEIN_RND_FEED_FWD + + # go back for more blocks, if needed + decq blkCnt+F_O(%rbp) + jnz Skein_512_block_loop + movq %rbx,TWEAK + 8(%rdi) + + Reset_Stack + ret +Skein_512_Process_Block_End: +# + .if _SKEIN_DEBUG +# call here with rdx = "round number" +Skein_Debug_Round_512: + pushq %rsi #save two regs for BLK_BITS-specific parms + pushq %rdi + .irp _Rn_,8,9,10,11,12,13,14,15 #save X[] state on stack so debug routines can access it + movq %r\_Rn_,X_stk+8*(\_Rn_-8)+F_O(%rbp) + .endr + movq ctxPtr+F_O(%rbp),%rsi #ctx_hdr_ptr + movq $512,%rdi #now are set for the call + jmp Skein_Debug_Round_Common + .endif +# +.if _SKEIN_CODE_SIZE +C_label Skein_512_Process_Block_CodeSize + movq $(Skein_512_Process_Block_End-Skein_512_Process_Block),%rax + ret +# +C_label Skein_512_Unroll_Cnt + .if _UNROLL_CNT <> (ROUNDS_512/8) + movq $_UNROLL_CNT,%rax + .else + xorq %rax,%rax + .endif + ret +.endif +# +.endif # _USE_ASM_ & 512 +# +#=================================== Skein1024 ============================================= +.if _USE_ASM_ & 1024 +# +# void Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)# +# +################# +# use details of permutation to make register assignments +# +o1K_rdi = 0 #offsets in X[] associated with each register +o1K_rsi = 1 +o1K_rbp = 2 +o1K_rax = 3 +o1K_rcx = 4 #rcx is "shared" with X6, since X4/X6 alternate +o1K_rbx = 5 +o1K_rdx = 7 +o1K_r8 = 8 +o1K_r9 = 9 +o1K_r10 = 10 +o1K_r11 = 11 +o1K_r12 = 12 +o1K_r13 = 13 +o1K_r14 = 14 +o1K_r15 = 15 +# +rIdx_offs = tmpStk_1024 +# +.macro r1024_Mix w0,w1,reg0,reg1,_RN0_,_Rn1_,op1 + addReg \reg0 , \reg1 #perform the MIX *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***