From owner-svn-src-all@freebsd.org  Sat Jun  6 00:35:42 2020
Return-Path: <owner-svn-src-all@freebsd.org>
Delivered-To: svn-src-all@mailman.nyi.freebsd.org
Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2610:1c1:1:606c::19:1])
 by mailman.nyi.freebsd.org (Postfix) with ESMTP id 75D4D33C5B1;
 Sat,  6 Jun 2020 00:35:42 +0000 (UTC)
 (envelope-from emaste@FreeBSD.org)
Received: from mxrelay.nyi.freebsd.org (mxrelay.nyi.freebsd.org
 [IPv6:2610:1c1:1:606c::19:3])
 (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
 key-exchange X25519 server-signature RSA-PSS (4096 bits) server-digest SHA256
 client-signature RSA-PSS (4096 bits) client-digest SHA256)
 (Client CN "mxrelay.nyi.freebsd.org",
 Issuer "Let's Encrypt Authority X3" (verified OK))
 by mx1.freebsd.org (Postfix) with ESMTPS id 49f0t62pQ8z4HSW;
 Sat,  6 Jun 2020 00:35:42 +0000 (UTC)
 (envelope-from emaste@FreeBSD.org)
Received: from repo.freebsd.org (repo.freebsd.org
 [IPv6:2610:1c1:1:6068::e6a:0])
 (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
 (Client did not present a certificate)
 by mxrelay.nyi.freebsd.org (Postfix) with ESMTPS id 5B4D019B86;
 Sat,  6 Jun 2020 00:35:42 +0000 (UTC)
 (envelope-from emaste@FreeBSD.org)
Received: from repo.freebsd.org ([127.0.1.37])
 by repo.freebsd.org (8.15.2/8.15.2) with ESMTP id 0560Zg4T021205;
 Sat, 6 Jun 2020 00:35:42 GMT (envelope-from emaste@FreeBSD.org)
Received: (from emaste@localhost)
 by repo.freebsd.org (8.15.2/8.15.2/Submit) id 0560Zg32021203;
 Sat, 6 Jun 2020 00:35:42 GMT (envelope-from emaste@FreeBSD.org)
Message-Id: <202006060035.0560Zg32021203@repo.freebsd.org>
X-Authentication-Warning: repo.freebsd.org: emaste set sender to
 emaste@FreeBSD.org using -f
From: Ed Maste <emaste@FreeBSD.org>
Date: Sat, 6 Jun 2020 00:35:42 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-all@freebsd.org,
 svn-src-head@freebsd.org
Subject: svn commit: r361853 - in head: lib/libmd sys/crypto/skein/amd64
 sys/modules/crypto
X-SVN-Group: head
X-SVN-Commit-Author: emaste
X-SVN-Commit-Paths: in head: lib/libmd sys/crypto/skein/amd64
 sys/modules/crypto
X-SVN-Commit-Revision: 361853
X-SVN-Commit-Repository: base
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
X-BeenThere: svn-src-all@freebsd.org
X-Mailman-Version: 2.1.33
Precedence: list
List-Id: "SVN commit messages for the entire src tree \(except for &quot;
 user&quot; and &quot; projects&quot; \)" <svn-src-all.freebsd.org>
List-Unsubscribe: <https://lists.freebsd.org/mailman/options/svn-src-all>,
 <mailto:svn-src-all-request@freebsd.org?subject=unsubscribe>
List-Archive: <http://lists.freebsd.org/pipermail/svn-src-all/>
List-Post: <mailto:svn-src-all@freebsd.org>
List-Help: <mailto:svn-src-all-request@freebsd.org?subject=help>
List-Subscribe: <https://lists.freebsd.org/mailman/listinfo/svn-src-all>,
 <mailto:svn-src-all-request@freebsd.org?subject=subscribe>
X-List-Received-Date: Sat, 06 Jun 2020 00:35:42 -0000

Author: emaste
Date: Sat Jun  6 00:35:41 2020
New Revision: 361853
URL: https://svnweb.freebsd.org/changeset/base/361853

Log:
  Rename skein_block_asm.s to .S and assemble using Clang IAS
  
  Comparing the object files produced by GNU as 2.17.50 and Clang IAS
  shows many immaterial changes in strtab etc., and one material change
  in .text:
  
     1bac:  4c 8b 4f 18             mov    0x18(%rdi),%r9
     1bb0:  eb 0e                   jmp    1bc0 <Skein1024_block_loop>
  -  1bb2:  66 66 2e 0f 1f 84 00    data16 nopw %cs:0x0(%rax,%rax,1)
  -  1bb9:  00 00 00 00
  -  1bbd:  0f 1f 00                nopl   (%rax)
  +  1bb2:  66 2e 0f 1f 84 00 00    nopw   %cs:0x0(%rax,%rax,1)
  +  1bb9:  00 00 00
  +  1bbc:  0f 1f 40 00             nopl   0x0(%rax)
  
   0000000000001bc0 <Skein1024_block_loop>:
   Skein1024_block_loop():
     1bc0:  4c 8b 47 10             mov    0x10(%rdi),%r8
     1bc4:  4c 03 85 c0 00 00 00    add    0xc0(%rbp),%r8
  
  That is, GNU as and Clang's integrated assembler use different multi-
  byte NOPs for alignment (GNU as emits an 11 byte NOP + a 3 byte NOP,
  while Clang IAS emits a 10 byte NOP + a 4 byte NOP).
  
  Dependency cleanup hacks are not required, because we do not create
  .depend files from GNU as.
  
  Reviewed by:	allanjude, arichardson, cem, tsoome
  Sponsored by:	The FreeBSD Foundation
  Differential Revision:	https://reviews.freebsd.org/D8434

Added:
  head/sys/crypto/skein/amd64/skein_block_asm.S
     - copied unchanged from r361852, head/sys/crypto/skein/amd64/skein_block_asm.s
Deleted:
  head/sys/crypto/skein/amd64/skein_block_asm.s
Modified:
  head/lib/libmd/Makefile
  head/sys/modules/crypto/Makefile

Modified: head/lib/libmd/Makefile
==============================================================================
--- head/lib/libmd/Makefile	Sat Jun  6 00:02:50 2020	(r361852)
+++ head/lib/libmd/Makefile	Sat Jun  6 00:35:41 2020	(r361853)
@@ -116,18 +116,15 @@ CFLAGS+= -DSHA1_ASM
 SRCS+=	rmd160.S
 CFLAGS+= -DRMD160_ASM
 .endif
-.if exists(${MACHINE_ARCH}/skein_block_asm.s)
-.if defined(XAS) || ${MK_BINUTILS_BOOTSTRAP} != "no"
-AFLAGS += --strip-local-absolute
+.if exists(${MACHINE_ARCH}/skein_block_asm.S)
 # Fully unroll all loops in the assembly optimized version
-AFLAGS+= --defsym SKEIN_LOOP=0 --defsym SKEIN_USE_ASM=1792
-SRCS+= skein_block_asm.s
+ACFLAGS+= -DSKEIN_LOOP=0
+SRCS+= skein_block_asm.S
 CFLAGS+= -DSKEIN_ASM -DSKEIN_USE_ASM=1792 # list of block functions to replace with assembly: 256+512+1024 = 1792
 .else
 .warning as not available: not using optimized Skein asm
 .endif
-.endif
-.if exists(${MACHINE_ARCH}/sha.S) || exists(${MACHINE_ARCH}/rmd160.S) || exists(${MACHINE_ARCH}/skein_block_asm.s)
+.if exists(${MACHINE_ARCH}/sha.S) || exists(${MACHINE_ARCH}/rmd160.S) || exists(${MACHINE_ARCH}/skein_block_asm.S)
 ACFLAGS+= -DELF -Wa,--noexecstack
 .endif
 .endif # ${USE_ASM_SOURCES} != 0

Copied: head/sys/crypto/skein/amd64/skein_block_asm.S (from r361852, head/sys/crypto/skein/amd64/skein_block_asm.s)
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/crypto/skein/amd64/skein_block_asm.S	Sat Jun  6 00:35:41 2020	(r361853, copy of r361852, head/sys/crypto/skein/amd64/skein_block_asm.s)
@@ -0,0 +1,1333 @@
+#
+#----------------------------------------------------------------
+# 64-bit x86 assembler code (gnu as) for Skein block functions
+#
+# Author: Doug Whiting, Hifn/Exar
+#
+# This code is released to the public domain.
+#----------------------------------------------------------------
+# $FreeBSD$
+#
+    .text
+    .altmacro                               #altmacro mode: macro calls below use %(expr) and <...> arguments
+#ifndef __clang__
+    .psize 0,128                            #list file has no page boundaries
+#endif
+#
+_MASK_ALL_  =  (256+512+1024)               #all three algorithm bits
+_MAX_FRAME_ =  240                          #(not referenced in the part of the file visible here)
+#
+#################
+# _USE_ASM_ is a bit mask (256 | 512 | 1024) selecting which block
+# functions get assembled; SKEIN_USE_ASM is tested by cpp, so it has to be
+# supplied as a preprocessor symbol.
+#ifndef SKEIN_USE_ASM
+_USE_ASM_         = _MASK_ALL_
+#else
+_USE_ASM_         = SKEIN_USE_ASM
+#endif
+#################
+#configure loop unrolling
+#ifndef SKEIN_LOOP
+_SKEIN_LOOP       =   2                     #default is fully unrolled for 256/512, twice for 1024
+#else
+_SKEIN_LOOP       = SKEIN_LOOP
+  .irp _NN_,%_SKEIN_LOOP                #only display loop unrolling if default changed on command line
+#.print  "+++ SKEIN_LOOP = \_NN_"
+  .endr
+#endif
+# the unroll counts (0 --> fully unrolled)
+# one decimal digit of _SKEIN_LOOP per block size:
+#   hundreds digit -> 256, tens digit -> 512, ones digit -> 1024
+SKEIN_UNROLL_256  = (_SKEIN_LOOP / 100) % 10
+SKEIN_UNROLL_512  = (_SKEIN_LOOP /  10) % 10
+SKEIN_UNROLL_1024 = (_SKEIN_LOOP      ) % 10
+#
+# SKEIN_ASM_UNROLL accumulates the block sizes whose unroll digit is 0,
+# i.e. it is a bit mask of the fully unrolled variants
+SKEIN_ASM_UNROLL  = 0
+  .irp _NN_,256,512,1024
+    .if (SKEIN_UNROLL_\_NN_) == 0
+SKEIN_ASM_UNROLL  = SKEIN_ASM_UNROLL + \_NN_
+    .endif
+  .endr
+#################
+#################
+#
+# Number of rounds per block size.  When the assembler symbol SKEIN_ROUNDS
+# is set, each of its decimal digits d (hundreds -> 256, tens -> 512,
+# ones -> 1024) selects 8*(((d+5) % 10)+5) rounds, e.g. d=9 gives 72 and
+# d=0 gives 80.
+# NOTE(review): SKEIN_ROUNDS, SKEIN_CODE_SIZE, SKEIN_PERF and SKEIN_DEBUG
+# are tested with assembler-level .ifdef/.ifndef, not by cpp; presumably
+# they must be set as assembler symbols (confirm against the build).
+.ifndef SKEIN_ROUNDS
+ROUNDS_256  =   72
+ROUNDS_512  =   72
+ROUNDS_1024 =   80
+.else
+ROUNDS_256  = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5)
+ROUNDS_512  = 8*((((SKEIN_ROUNDS /  10) + 5) % 10) + 5)
+ROUNDS_1024 = 8*((((SKEIN_ROUNDS      ) + 5) % 10) + 5)
+# only display rounds if default size is changed on command line
+# (the two .print variants differ only in padding so the output lines up)
+.irp _NN_,256,512,1024
+  .if _USE_ASM_ && \_NN_
+    .irp _RR_,%(ROUNDS_\_NN_)
+      .if _NN_ < 1024
+.print  "+++ SKEIN_ROUNDS_\_NN_  = \_RR_"
+      .else
+.print  "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
+      .endif
+    .endr
+  .endif
+.endr
+.endif
+#################
+#
+# _SKEIN_CODE_SIZE = 1 assembles the small helper routines that report
+# code size and unroll count; enabled by SKEIN_CODE_SIZE or SKEIN_PERF
+.ifdef SKEIN_CODE_SIZE
+_SKEIN_CODE_SIZE = (1)
+.else
+.ifdef  SKEIN_PERF                           #use code size if SKEIN_PERF is defined
+_SKEIN_CODE_SIZE = (1)
+.else
+_SKEIN_CODE_SIZE = (0)
+.endif
+.endif
+#
+#################
+#
+# _SKEIN_DEBUG = 1 assembles the calls to the external debug display routines
+.ifndef SKEIN_DEBUG
+_SKEIN_DEBUG      = 0
+.else
+_SKEIN_DEBUG      = 1
+.endif
+#################
+#
+# Offsets of fields in the hash context structure.
+# NB: this .S file is run through cpp first, so no comment line may begin
+# with a directive keyword directly after the hash sign: cpp would act on
+# it instead of ignoring it.
+#
+HASH_BITS   =   0                   #bits of hash output          (offset  0)
+BCNT        =   8 + HASH_BITS       #number of bytes in BUFFER[]  (offset  8)
+TWEAK       =   8 + BCNT            #tweak values[0..1]           (offset 16)
+X_VARS      =  16 + TWEAK           #chaining vars                (offset 32)
+#
+#(Note: buffer[] in context structure is NOT needed here :-)
+#
+KW_PARITY   =   0x1BD11BDAA9FC1A22  #overall parity of key schedule words (XOR seed)
+FIRST_MASK  =   ~ (1 <<  6)         #all ones except bit 6 (not referenced in the code visible here)
+FIRST_MASK64=   ~ (1 << 62)         #all ones except bit 62; ANDed with tweak word T[1] after each block
+#
+# rotation constants for Skein
+#
+# Naming: RC_<block bits>_<round number mod 8>_<mix index>.
+# The RotL64 macro builds these symbol names from its arguments, so the
+# names must keep exactly this form.
+#
+# 256-bit blocks: two constants per round
+RC_256_0_0  = 14
+RC_256_0_1  = 16
+
+RC_256_1_0  = 52
+RC_256_1_1  = 57
+
+RC_256_2_0  = 23
+RC_256_2_1  = 40
+
+RC_256_3_0  =  5
+RC_256_3_1  = 37
+
+RC_256_4_0  = 25
+RC_256_4_1  = 33
+
+RC_256_5_0  = 46
+RC_256_5_1  = 12
+
+RC_256_6_0  = 58
+RC_256_6_1  = 22
+
+RC_256_7_0  = 32
+RC_256_7_1  = 32
+
+# 512-bit blocks: four constants per round
+RC_512_0_0  = 46
+RC_512_0_1  = 36
+RC_512_0_2  = 19
+RC_512_0_3  = 37
+
+RC_512_1_0  = 33
+RC_512_1_1  = 27
+RC_512_1_2  = 14
+RC_512_1_3  = 42
+
+RC_512_2_0  = 17
+RC_512_2_1  = 49
+RC_512_2_2  = 36
+RC_512_2_3  = 39
+
+RC_512_3_0  = 44
+RC_512_3_1  =  9
+RC_512_3_2  = 54
+RC_512_3_3  = 56
+
+RC_512_4_0  = 39
+RC_512_4_1  = 30
+RC_512_4_2  = 34
+RC_512_4_3  = 24
+
+RC_512_5_0  = 13
+RC_512_5_1  = 50
+RC_512_5_2  = 10
+RC_512_5_3  = 17
+
+RC_512_6_0  = 25
+RC_512_6_1  = 29
+RC_512_6_2  = 39
+RC_512_6_3  = 43
+
+RC_512_7_0  =  8
+RC_512_7_1  = 35
+RC_512_7_2  = 56
+RC_512_7_3  = 22
+
+# 1024-bit blocks: eight constants per round
+RC_1024_0_0 = 24
+RC_1024_0_1 = 13
+RC_1024_0_2 =  8
+RC_1024_0_3 = 47
+RC_1024_0_4 =  8
+RC_1024_0_5 = 17
+RC_1024_0_6 = 22
+RC_1024_0_7 = 37
+
+RC_1024_1_0 = 38
+RC_1024_1_1 = 19
+RC_1024_1_2 = 10
+RC_1024_1_3 = 55
+RC_1024_1_4 = 49
+RC_1024_1_5 = 18
+RC_1024_1_6 = 23
+RC_1024_1_7 = 52
+
+RC_1024_2_0 = 33
+RC_1024_2_1 =  4
+RC_1024_2_2 = 51
+RC_1024_2_3 = 13
+RC_1024_2_4 = 34
+RC_1024_2_5 = 41
+RC_1024_2_6 = 59
+RC_1024_2_7 = 17
+
+RC_1024_3_0 =  5
+RC_1024_3_1 = 20
+RC_1024_3_2 = 48
+RC_1024_3_3 = 41
+RC_1024_3_4 = 47
+RC_1024_3_5 = 28
+RC_1024_3_6 = 16
+RC_1024_3_7 = 25
+
+RC_1024_4_0 = 41
+RC_1024_4_1 =  9
+RC_1024_4_2 = 37
+RC_1024_4_3 = 31
+RC_1024_4_4 = 12
+RC_1024_4_5 = 47
+RC_1024_4_6 = 44
+RC_1024_4_7 = 30
+
+RC_1024_5_0 = 16
+RC_1024_5_1 = 34
+RC_1024_5_2 = 56
+RC_1024_5_3 = 51
+RC_1024_5_4 =  4
+RC_1024_5_5 = 53
+RC_1024_5_6 = 42
+RC_1024_5_7 = 41
+
+RC_1024_6_0 = 31
+RC_1024_6_1 = 44
+RC_1024_6_2 = 47
+RC_1024_6_3 = 46
+RC_1024_6_4 = 19
+RC_1024_6_5 = 42
+RC_1024_6_6 = 44
+RC_1024_6_7 = 25
+
+RC_1024_7_0 =  9
+RC_1024_7_1 = 48
+RC_1024_7_2 = 35
+RC_1024_7_3 = 52
+RC_1024_7_4 = 23
+RC_1024_7_5 = 31
+RC_1024_7_6 = 37
+RC_1024_7_7 = 20
+#
+# MACRO RotL64: rotate a 64-bit register left by a Skein rotation constant
+#  Input:  reg       = register name without the percent sign
+#          BLK_SIZE  = 256/512/1024
+#          ROUND_NUM = round number, already reduced mod 8 by the caller
+#          MIX_NUM   = index of the mix operation within the round
+# Output: <reg> <<< RC_BlkSize_roundNum_mixNum, BlkSize=256/512/1024
+#         (no instruction is emitted when the selected constant is zero)
+#
+.macro RotL64   reg,BLK_SIZE,ROUND_NUM,MIX_NUM
+  .if RC_\BLK_SIZE\()_\ROUND_NUM\()_\MIX_NUM  #is there anything to do?
+    rolq    $RC_\BLK_SIZE\()_\ROUND_NUM\()_\MIX_NUM,%\reg
+  .endif
+.endm
+#
+#----------------------------------------------------------------
+#
+# MACROS: define local vars and configure stack
+#
+#----------------------------------------------------------------
+# declare allocated space on the stack
+#   localName = symbol that receives the current running offset _STK_OFFS_
+#   localSize = number of bytes to reserve; _STK_OFFS_ is advanced by it
+# Emits no instructions; only assembler symbols are updated.
+.macro StackVar localName,localSize
+\localName  =   _STK_OFFS_
+_STK_OFFS_  =   _STK_OFFS_+(\localSize)
+.endm #StackVar
+#
+#----------------------------------------------------------------
+#
+# MACRO: Configure stack frame, allocate local vars
+#   BLK_BITS = block size in bits (sets WCNT = BLK_BITS/64)
+#   KS_CNT   = number of 16-byte ksRot slots, reserved only when this
+#              block size is not fully unrolled
+#   debugCnt = optional count of 8-byte xDebug slots (debug builds only)
+# Emitted code: pushes rbp,rbx,r12..r15, reserves LOCAL_SIZE bytes, points
+# rbp at rsp+FRAME_OFFS and stores rdi,rsi,rdx,rcx into ctxPtr, blkPtr,
+# blkCnt and bitAdd.  Because F_O = -FRAME_OFFS, an operand written as
+# var+F_O(%rbp) addresses the same location as var(%rsp).
+# Symbols set here (WCNT, LOCAL_SIZE, FRAME_OFFS, F_O and the variable
+# offsets) are plain assignments and are overwritten by the next use.
+#
+.macro Setup_Stack BLK_BITS,KS_CNT,debugCnt
+    WCNT    =    (\BLK_BITS)/64
+#
+_PushCnt_   =   0                   #save nonvolatile regs on stack
+  .irp _reg_,rbp,rbx,r12,r13,r14,r15
+       pushq    %\_reg_
+_PushCnt_ = _PushCnt_ + 1           #track count to keep alignment
+  .endr
+#
+_STK_OFFS_  =   0                   #starting offset from rsp
+    #---- local  variables         #<-- rsp
+    StackVar    X_stk  ,8*(WCNT)    #local context vars
+    StackVar    ksTwk  ,8*3         #key schedule: tweak words
+    StackVar    ksKey  ,8*(WCNT)+8  #key schedule: key   words
+  .if (SKEIN_ASM_UNROLL && (\BLK_BITS)) == 0
+    StackVar    ksRot ,16*(\KS_CNT) #leave space for "rotation" to happen
+  .endif
+    StackVar    Wcopy  ,8*(WCNT)    #copy of input block
+  .if _SKEIN_DEBUG
+  .if \debugCnt + 0                 #temp location for debug X[] info
+    StackVar    xDebug_\BLK_BITS ,8*(\debugCnt)
+  .endif
+  .endif
+    #NOTE(review): both terms below are multiples of 8, so this test looks
+    #always true; confirm whether a modulo-16 test was intended
+  .if ((8*_PushCnt_ + _STK_OFFS_) % 8) == 0
+    StackVar    align16,8           #keep 16-byte aligned (adjust for retAddr?)
+tmpStk_\BLK_BITS = align16          #use this
+  .endif
+    #---- saved caller parameters (from regs rdi, rsi, rdx, rcx)
+    StackVar    ctxPtr ,8           #context ptr
+    StackVar    blkPtr ,8           #pointer to block data
+    StackVar    blkCnt ,8           #number of full blocks to process
+    StackVar    bitAdd ,8           #bit count to add to tweak
+LOCAL_SIZE  =   _STK_OFFS_          #size of "local" vars
+    #---- 
+    StackVar    savRegs,8*_PushCnt_ #saved registers
+    StackVar    retAddr,8           #return address
+    #---- stack frame of the caller (aligned mod 16)
+#
+# set up the stack frame pointer (rbp)
+#
+FRAME_OFFS  =   ksTwk + 128         #allow short (negative) offset to ksTwk, ksKey
+  .if FRAME_OFFS > _STK_OFFS_       #keep rbp in the "locals" range
+FRAME_OFFS  =      _STK_OFFS_
+  .endif
+F_O         =   -FRAME_OFFS
+#
+  #put some useful defines in the .lst file (for grep)
+__STK_LCL_SIZE_\BLK_BITS = LOCAL_SIZE
+__STK_TOT_SIZE_\BLK_BITS = _STK_OFFS_
+__STK_FRM_OFFS_\BLK_BITS = FRAME_OFFS
+#
+# Notes on stack frame setup:
+#   * the most frequently used variable is X_stk[], based at [rsp+0]
+#   * the next most used is the key schedule arrays, ksKey and ksTwk
+#       so rbp is "centered" there, allowing short offsets to the key 
+#       schedule even in 1024-bit Skein case
+#   * the Wcopy variables are infrequently accessed, but they have long 
+#       offsets from both rsp and rbp only in the 1024-bit case.
+#   * all other local vars and calling parameters can be accessed 
+#       with short offsets, except in the 1024-bit case
+#
+    subq    $LOCAL_SIZE,%rsp        #make room for the locals
+    leaq    FRAME_OFFS(%rsp),%rbp   #maximize use of short offsets
+    movq    %rdi, ctxPtr+F_O(%rbp)  #save the four incoming parameters on the stack
+    movq    %rsi, blkPtr+F_O(%rbp)
+    movq    %rdx, blkCnt+F_O(%rbp)
+    movq    %rcx, bitAdd+F_O(%rbp)
+#
+.endm #Setup_Stack
+#
+#----------------------------------------------------------------
+#
+# MACRO Reset_Stack: undo Setup_Stack.  Releases LOCAL_SIZE bytes and pops
+# the six saved registers in the reverse of the push order.  _PushCnt_ is
+# counted back down and an assembly error is raised if it does not end at
+# zero.  The macro does not emit the ret itself.
+.macro Reset_Stack
+    addq    $LOCAL_SIZE,%rsp        #get rid of locals (wipe?)
+  .irp _reg_,r15,r14,r13,r12,rbx,rbp
+    popq    %\_reg_                 #restore the saved nonvolatile regs
+_PushCnt_ = _PushCnt_ - 1
+  .endr
+  .if _PushCnt_
+    .error  "Mismatched push/pops?"
+  .endif
+.endm # Reset_Stack
+#
+#----------------------------------------------------------------
+# macros to help debug internals
+#
+.if _SKEIN_DEBUG
+    .extern  Skein_Show_Block     #calls to C routines
+    .extern  Skein_Show_Round
+#
+# pseudo round numbers (>= SKEIN_RND_SPECIAL) passed to the round debug
+# hook to mark events that are not ordinary rounds
+SKEIN_RND_SPECIAL       =   1000
+SKEIN_RND_KEY_INITIAL   =   SKEIN_RND_SPECIAL+0
+SKEIN_RND_KEY_INJECT    =   SKEIN_RND_SPECIAL+1
+SKEIN_RND_FEED_FWD      =   SKEIN_RND_SPECIAL+2
+#
+# MACRO Skein_Debug_Block: dump the state at the start of a block by calling
+# the external C routine Skein_Show_Block.
+#   BLK_BITS = block size in bits, passed through as the first argument
+# Saves and restores rax,rcx,rdx,rsi,rdi,r8..r11 around the call.  Reads
+# ctxPtr and blkPtr from the frame and passes the addresses of Wcopy,
+# ksKey and ksTwk, so those must have been stored before this is used.
+.macro Skein_Debug_Block BLK_BITS
+#
+#void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
+#                     const u08b_t *blkPtr, const u64b_t *wPtr, 
+#                     const u64b_t *ksPtr,const u64b_t *tsPtr)
+#
+_NN_ = 0
+  .irp _reg_,rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11
+    pushq   %\_reg_                 #save all volatile regs on stack before the call
+_NN_ = _NN_ + 1
+  .endr
+    # get and push call parameters
+    movq    $\BLK_BITS      ,%rdi   #bits
+    movq    ctxPtr+F_O(%rbp),%rsi   #h (pointer)
+    leaq    X_VARS    (%rsi),%rdx   #X (pointer)
+    movq    blkPtr+F_O(%rbp),%rcx   #blkPtr
+    leaq    Wcopy +F_O(%rbp),%r8    #wPtr
+    leaq    ksKey +F_O(%rbp),%r9    #key pointer
+    leaq    ksTwk +F_O(%rbp),%rax   #tweak pointer
+    pushq   %rax                    #   (pass on the stack)
+    call    Skein_Show_Block        #call external debug handler
+    addq    $8*1,%rsp               #discard parameters on stack
+    #an odd number of saved regs plus the one stack argument pushes an
+    #even number of qwords in total
+  .if (_NN_ % 2 ) == 0              #check stack alignment
+    .error "Stack misalignment problem in Skein_Debug_Block_\BLK_BITS"
+  .endif
+  .irp _reg_,r11,r10,r9,r8,rdi,rsi,rdx,rcx,rax
+    popq    %\_reg_                 #restore regs
+_NN_ = _NN_ - 1
+  .endr
+  .if _NN_
+    .error "Push/pop mismatch problem in Skein_Debug_Block_\BLK_BITS"
+  .endif
+.endm # Skein_Debug_Block
+#
+# the macro to "call" to debug a round
+#   BLK_BITS = block size; selects the local stub Skein_Debug_Round_<bits>
+#   R        = round number, or one of the SKEIN_RND_* special values
+#   RDI_OFFS = optional constant added to the computed round number in
+#              the looping (not fully unrolled) case
+#   afterOp  = optional statement emitted after the call
+# rdx is saved and restored around the call; the stub receives the round
+# number in rdx.  afterOp is expanded with an explicit backslash so the
+# macro does not depend on bare-name parameter substitution, which not
+# every assembler implements in altmacro mode.
+#
+.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
+    # call the appropriate (local) debug "function"
+    pushq   %rdx                    #save rdx, so we can use it for round "number"
+  .if (SKEIN_ASM_UNROLL && \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL)
+    movq    $\R,%rdx
+  .else                             #compute round number using rdi
+_rOffs_ = \RDI_OFFS + 0
+   .if \BLK_BITS == 1024
+    movq    rIdx_offs+8(%rsp),%rdx  #get rIdx off the stack (adjust for pushq rdx above)
+    leaq    1+(((\R)-1) && 3)+_rOffs_(,%rdx,4),%rdx
+   .else
+    leaq    1+(((\R)-1) && 3)+_rOffs_(,%rdi,4),%rdx
+   .endif
+  .endif
+    call    Skein_Debug_Round_\BLK_BITS
+    popq    %rdx                    #restore original rdx value
+#
+    \afterOp
+.endm  #  Skein_Debug_Round
+.else  #------- _SKEIN_DEBUG (dummy macros if debug not enabled)
+# non-debug build: same macro names and parameters, but nothing is emitted,
+# so the call sites need no conditionals of their own
+.macro Skein_Debug_Block BLK_BITS
+.endm
+#
+.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
+.endm
+#
+.endif # _SKEIN_DEBUG
+#
+#----------------------------------------------------------------
+#
+# MACRO addReg: dstReg += source register (+ optional constant)
+#   dstReg             = destination register name (no percent sign)
+#   srcReg_A, srcReg_B = concatenated to form the source register name,
+#                        so a caller may pass a prefix plus a computed
+#                        number; srcReg_B may be empty
+#   useAddOp           = nonzero selects addq instead of leaq
+#   immOffs            = optional constant; when nonzero it is added too,
+#                        always via leaq
+# The "+ 0" in the tests lets an omitted argument evaluate as zero.
+# The leaq forms leave the flags unchanged, the addq forms do not.
+.macro  addReg dstReg,srcReg_A,srcReg_B,useAddOp,immOffs
+  .if \immOffs + 0
+       leaq    \immOffs(%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
+  .elseif ((\useAddOp + 0) == 0)
+    .ifndef ASM_NO_LEA  #lea seems to be faster on Core 2 Duo CPUs!
+       leaq   (%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
+    .else
+       addq    %\srcReg_A\srcReg_B,%\dstReg
+    .endif
+  .else
+       addq    %\srcReg_A\srcReg_B,%\dstReg
+  .endif
+.endm
+
+# MACRO xorReg: dstReg ^= source register
+# keep Intel-style ordering here (destination first), to match addReg;
+# srcReg_A and srcReg_B are concatenated to form the source register name
+.macro  xorReg dstReg,srcReg_A,srcReg_B
+        xorq   %\srcReg_A\srcReg_B,%\dstReg
+.endm
+#
+#----------------------------------------------------------------
+#
+# MACRO C_label: define an exported entry point at the current location
+# under two names, lName and _lName (with a leading underscore), and mark
+# both global
+.macro C_label lName
+ \lName:        #use both "genders" to work across linkage conventions
+_\lName:
+    .global  \lName
+    .global _\lName
+.endm
+#
+#=================================== Skein_256 =============================================
+#
+.if _USE_ASM_ & 256
+#
+# void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)
+#
+#################
+#
+# code
+#
+#----------------------------------------------------------------
+# Skein_256_Process_Block
+#   In:  rdi = context pointer, rsi = input block pointer,
+#        rdx = block count,     rcx = byte count added to tweak T[0] per block
+#   Per block: T[0] += bitAdd (written back to the context), the four
+#   input words get the initial key injection, ROUNDS_256 rounds are run
+#   with a key injection after every fourth round, and the result XORed
+#   with the saved input words is stored to the chaining variables.
+#   T[1] with bit 62 cleared is written back once, after the last block.
+#   Nonvolatile registers are saved and restored by Setup_Stack/Reset_Stack.
+#   NOTE(review): the block count is decremented and tested only after a
+#   block has been processed, so callers presumably pass blkCnt >= 1.
+#----------------------------------------------------------------
+C_label Skein_256_Process_Block
+    Setup_Stack 256,((ROUNDS_256/8)+1)
+    movq    TWEAK+8(%rdi),%r14       #r14 = T[1], loaded once before the loop
+    jmp     Skein_256_block_loop
+    .p2align 4
+    # main hash loop for Skein_256
+Skein_256_block_loop:
+    #
+    # general register usage:
+    #   RAX..RDX        = X0..X3    
+    #   R08..R12        = ks[0..4]
+    #   R13..R15        = ts[0..2]
+    #   RSP, RBP        = stack/frame pointers
+    #   RDI             = round counter or context pointer
+    #   RSI             = temp
+    #
+    movq    TWEAK+0(%rdi)     ,%r13
+    addq    bitAdd+F_O(%rbp)  ,%r13  #computed updated tweak value T0
+    movq    %r14              ,%r15
+    xorq    %r13              ,%r15  #now %r13..%r15 hold the tweak schedule (r15 = r13 ^ r14)
+
+    movq    $KW_PARITY        ,%r12
+    movq       X_VARS+ 0(%rdi),%r8
+    movq       X_VARS+ 8(%rdi),%r9 
+    movq       X_VARS+16(%rdi),%r10
+    movq       X_VARS+24(%rdi),%r11
+    movq    %r13,TWEAK+0(%rdi)       #save updated tweak value ctx->h.T[0]
+    xorq    %r8               ,%r12  #start accumulating overall parity
+
+    movq    blkPtr +F_O(%rbp) ,%rsi  #rsi --> input block
+    xorq    %r9               ,%r12
+    movq     0(%rsi)          ,%rax  #get X[0..3]
+    xorq    %r10              ,%r12
+    movq     8(%rsi)          ,%rbx
+    xorq    %r11              ,%r12
+    movq    16(%rsi)          ,%rcx
+    movq    24(%rsi)          ,%rdx
+
+    movq    %rax,Wcopy+ 0+F_O(%rbp)  #save copy of input block
+    movq    %rbx,Wcopy+ 8+F_O(%rbp)    
+    movq    %rcx,Wcopy+16+F_O(%rbp)    
+    movq    %rdx,Wcopy+24+F_O(%rbp)    
+
+    addq    %r8 ,%rax                #initial key injection
+    addq    %r9 ,%rbx 
+    addq    %r10,%rcx
+    addq    %r11,%rdx
+    addq    %r13,%rbx                #tweak words go into X1 and X2
+    addq    %r14,%rcx
+
+.if _SKEIN_DEBUG
+    movq    %r14,TWEAK+ 8(%rdi)      #save updated tweak T[1] (start bit cleared?)
+    movq    %r8 ,ksKey+ 0+F_O(%rbp)  #save key schedule on stack for Skein_Debug_Block
+    movq    %r9 ,ksKey+ 8+F_O(%rbp)    
+    movq    %r10,ksKey+16+F_O(%rbp)    
+    movq    %r11,ksKey+24+F_O(%rbp)    
+    movq    %r12,ksKey+32+F_O(%rbp)    
+                                       
+    movq    %r13,ksTwk+ 0+F_O(%rbp)    
+    movq    %r14,ksTwk+ 8+F_O(%rbp)    
+    movq    %r15,ksTwk+16+F_O(%rbp)    
+                                       
+    movq    %rax,X_stk + 0(%rsp)     #save X[] on stack for Skein_Debug_Block
+    movq    %rbx,X_stk + 8(%rsp)       
+    movq    %rcx,X_stk +16(%rsp)       
+    movq    %rdx,X_stk +24(%rsp)       
+
+    Skein_Debug_Block 256            #debug dump
+    Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL
+.endif
+#
+.if ((SKEIN_ASM_UNROLL & 256) == 0)
+    #looping build: ks[0] is written to key slot 5 and ts[0] to tweak
+    #slot 3 rather than to slot 0
+    #NOTE(review): presumably these are the slots the words rotate into
+    movq    %r8 ,ksKey+40+F_O(%rbp)  #save key schedule on stack for looping code
+    movq    %r9 ,ksKey+ 8+F_O(%rbp)    
+    movq    %r10,ksKey+16+F_O(%rbp)    
+    movq    %r11,ksKey+24+F_O(%rbp)    
+    movq    %r12,ksKey+32+F_O(%rbp)    
+                                       
+    movq    %r13,ksTwk+24+F_O(%rbp)    
+    movq    %r14,ksTwk+ 8+F_O(%rbp)    
+    movq    %r15,ksTwk+16+F_O(%rbp)    
+.endif
+    addq    $WCNT*8,%rsi             #skip the block
+    movq    %rsi,blkPtr  +F_O(%rbp)  #update block pointer
+    #
+    # now the key schedule is computed. Start the rounds
+    #
+    # Each pass of the .rept body below is four rounds plus one key
+    # injection.  Fully unrolled: ROUNDS_256/4 passes are emitted in line.
+    # Otherwise 2*SKEIN_UNROLL_256 passes are emitted inside a loop that
+    # runs until rdi reaches 2*(ROUNDS_256/8).
+    #
+.if SKEIN_ASM_UNROLL & 256
+_UNROLL_CNT =   ROUNDS_256/8
+.else
+_UNROLL_CNT =   SKEIN_UNROLL_256
+  .if ((ROUNDS_256/8) % _UNROLL_CNT)
+    .error "Invalid SKEIN_UNROLL_256"
+  .endif
+    xorq    %rdi,%rdi                #rdi = iteration count
+Skein_256_round_loop:
+.endif
+_Rbase_ = 0
+.rept _UNROLL_CNT*2
+    # all X and ks vars in regs      # (ops to "rotate" ks vars, via mem, if not unrolled)
+    # round 4*_Rbase_ + 0
+    addReg  rax, rbx
+    RotL64  rbx, 256,%((4*_Rbase_+0) % 8),0
+    addReg  rcx, rdx
+                .if (SKEIN_ASM_UNROLL & 256) == 0
+                    movq ksKey+8*1+F_O(%rbp,%rdi,8),%r8
+                .endif
+    xorReg  rbx, rax
+    RotL64  rdx, 256,%((4*_Rbase_+0) % 8),1
+    xorReg  rdx, rcx
+  .if SKEIN_ASM_UNROLL & 256
+    .irp _r0_,%( 8+(_Rbase_+3) % 5)
+    .irp _r1_,%(13+(_Rbase_+2) % 3)
+      leaq   (%r\_r0_,%r\_r1_),%rdi    #precompute key injection value for %rcx (overwrites the context pointer)
+    .endr
+    .endr
+  .endif
+                .if (SKEIN_ASM_UNROLL & 256) == 0
+                    movq ksTwk+8*1+F_O(%rbp,%rdi,8),%r13
+                .endif
+    Skein_Debug_Round 256,%(4*_Rbase_+1)
+
+    # round 4*_Rbase_ + 1
+    addReg  rax, rdx
+    RotL64  rdx, 256,%((4*_Rbase_+1) % 8),0
+    xorReg  rdx, rax
+                .if (SKEIN_ASM_UNROLL & 256) == 0
+                    movq ksKey+8*2+F_O(%rbp,%rdi,8),%r9
+                .endif
+    addReg  rcx, rbx
+    RotL64  rbx, 256,%((4*_Rbase_+1) % 8),1
+    xorReg  rbx, rcx
+                .if (SKEIN_ASM_UNROLL & 256) == 0
+                    movq ksKey+8*4+F_O(%rbp,%rdi,8),%r11
+                .endif
+    Skein_Debug_Round 256,%(4*_Rbase_+2)
+ .if SKEIN_ASM_UNROLL & 256
+    .irp _r0_,%( 8+(_Rbase_+2) % 5)
+    .irp _r1_,%(13+(_Rbase_+1) % 3)
+      leaq   (%r\_r0_,%r\_r1_),%rsi     #precompute key injection value for %rbx
+    .endr
+    .endr
+ .endif
+    # round 4*_Rbase_ + 2
+    addReg  rax, rbx
+    RotL64  rbx, 256,%((4*_Rbase_+2) % 8),0
+    addReg  rcx, rdx
+                .if (SKEIN_ASM_UNROLL & 256) == 0
+                    movq ksKey+8*3+F_O(%rbp,%rdi,8),%r10
+                .endif
+    xorReg  rbx, rax
+    RotL64  rdx, 256,%((4*_Rbase_+2) % 8),1
+    xorReg  rdx, rcx
+                .if (SKEIN_ASM_UNROLL & 256) == 0
+                    movq %r8,ksKey+8*6+F_O(%rbp,%rdi,8)  #"rotate" the key
+                    leaq 1(%r11,%rdi),%r11               #precompute key + round number
+                .endif
+    Skein_Debug_Round 256,%(4*_Rbase_+3)
+    # round 4*_Rbase_ + 3
+    addReg  rax, rdx
+    RotL64  rdx, 256,%((4*_Rbase_+3) % 8),0
+    addReg  rcx, rbx
+                .if (SKEIN_ASM_UNROLL & 256) == 0
+                    addq      ksTwk+8*2+F_O(%rbp,%rdi,8),%r10  #precompute key + tweak
+                    movq %r13,ksTwk+8*4+F_O(%rbp,%rdi,8)       #"rotate" the tweak
+                .endif
+    xorReg  rdx, rax
+    RotL64  rbx, 256,%((4*_Rbase_+3) % 8),1
+    xorReg  rbx, rcx
+    Skein_Debug_Round 256,%(4*_Rbase_+4)
+                .if (SKEIN_ASM_UNROLL & 256) == 0
+                    addReg r9 ,r13           #precompute key+tweak
+                .endif
+      #inject key schedule words
+_Rbase_ = _Rbase_+1
+  .if SKEIN_ASM_UNROLL & 256
+    addReg    rax,r,%(8+((_Rbase_+0) % 5))
+    addReg    rbx,rsi
+    addReg    rcx,rdi
+    addReg    rdx,r,%(8+((_Rbase_+3) % 5)),,_Rbase_
+  .else
+    incq      %rdi
+    addReg    rax,r8 
+    addReg    rcx,r10
+    addReg    rbx,r9 
+    addReg    rdx,r11
+  .endif
+    Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT
+.endr #rept _UNROLL_CNT*2
+#
+.if (SKEIN_ASM_UNROLL & 256) == 0
+    cmpq    $2*(ROUNDS_256/8),%rdi
+    jb      Skein_256_round_loop
+.endif # (SKEIN_ASM_UNROLL & 256) == 0
+    movq    ctxPtr +F_O(%rbp),%rdi           #restore rdi --> context
+
+    #----------------------------
+    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..3}
+    movq    $FIRST_MASK64 ,%r14
+    xorq    Wcopy + 0+F_O (%rbp),%rax
+    xorq    Wcopy + 8+F_O (%rbp),%rbx
+    xorq    Wcopy +16+F_O (%rbp),%rcx
+    xorq    Wcopy +24+F_O (%rbp),%rdx
+    andq    TWEAK + 8     (%rdi),%r14        #r14 = T[1] with bit 62 cleared, used for the next block
+    movq    %rax,X_VARS+ 0(%rdi)             #store final result
+    movq    %rbx,X_VARS+ 8(%rdi)        
+    movq    %rcx,X_VARS+16(%rdi)        
+    movq    %rdx,X_VARS+24(%rdi)        
+
+    Skein_Debug_Round 256,SKEIN_RND_FEED_FWD
+
+    # go back for more blocks, if needed
+    decq    blkCnt+F_O(%rbp)
+    jnz     Skein_256_block_loop
+    movq    %r14,TWEAK + 8(%rdi)             #write the masked T[1] back to the context
+    Reset_Stack
+    ret
+Skein_256_Process_Block_End:
+
+  .if _SKEIN_DEBUG
+# Local stub reached by "call" from the Skein_Debug_Round macro, which has
+# already pushed the original rdx.  After the two pushes below the stack
+# holds: 0(%rsp) = saved rdi, 8(%rsp) = saved rsi, 16(%rsp) = return
+# address, 24(%rsp) = original rdx (the X3 value).
+# Stores X0..X3 to X_stk, loads rsi = context pointer and rdi = 256, then
+# jumps to Skein_Debug_Round_Common (defined outside the lines shown here).
+Skein_Debug_Round_256:               #here with rdx == round "number" from macro
+    pushq   %rsi                     #save two regs for BLK_BITS-specific parms
+    pushq   %rdi
+    movq    24(%rsp),%rdi            #get back original rdx (pushed on stack in macro call) to rdi
+    movq    %rax,X_stk+ 0+F_O(%rbp)  #save X[] state on stack so debug routines can access it
+    movq    %rbx,X_stk+ 8+F_O(%rbp)  #(address via rbp, since rsp has changed!)
+    movq    %rcx,X_stk+16+F_O(%rbp)
+    movq    %rdi,X_stk+24+F_O(%rbp)
+
+    movq    ctxPtr+F_O(%rbp),%rsi    #ctx_hdr_ptr
+    movq    $256,%rdi                #now <rdi,rsi,rdx> are set for the call
+    jmp     Skein_Debug_Round_Common
+  .endif
+#
+.if _SKEIN_CODE_SIZE
+# Skein_256_Process_Block_CodeSize: returns in rax the byte length of
+# Skein_256_Process_Block (distance from its entry label to its end label)
+C_label  Skein_256_Process_Block_CodeSize
+    movq    $(Skein_256_Process_Block_End-Skein_256_Process_Block),%rax
+    ret
+#
+# Skein_256_Unroll_Cnt: returns in rax the unroll count used for the
+# 256-bit code, or 0 when that count equals ROUNDS_256/8 (fully unrolled)
+C_label Skein_256_Unroll_Cnt
+  .if _UNROLL_CNT <> ROUNDS_256/8
+    movq    $_UNROLL_CNT,%rax
+  .else
+    xorq    %rax,%rax
+  .endif
+    ret
+.endif
+#
+.endif #_USE_ASM_ & 256
+#
+#=================================== Skein_512 =============================================
+#
+.if _USE_ASM_ & 512
+#
+# void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)
+#
+# X[i] == %r[8+i]          #register assignments for X[] values during rounds (i=0..7)
+#
+#################
+# MACRO: one round for 512-bit blocks
+#   rn0..rn7 = register numbers (used as r<n>) of the four word pairs that
+#              are mixed: (rn0,rn1), (rn2,rn3), (rn4,rn5), (rn6,rn7).
+#              For each pair: first += second, second is rotated left by
+#              the round constant, then second ^= first.
+#   _Rn_     = round number; reduced mod 8 to select the rotation constants
+#   op1..op4 = optional statements emitted after mix 1..4, used by the
+#              caller to interleave key schedule loads with the mixing
+#
+.macro R_512_OneRound rn0,rn1,rn2,rn3,rn4,rn5,rn6,rn7,_Rn_,op1,op2,op3,op4
+#
+    addReg      r\rn0, r\rn1
+    RotL64      r\rn1, 512,%((\_Rn_) % 8),0
+    xorReg      r\rn1, r\rn0
+            \op1
+    addReg      r\rn2, r\rn3
+    RotL64      r\rn3, 512,%((\_Rn_) % 8),1
+    xorReg      r\rn3, r\rn2
+            \op2
+    addReg      r\rn4, r\rn5
+    RotL64      r\rn5, 512,%((\_Rn_) % 8),2
+    xorReg      r\rn5, r\rn4
+            \op3
+    addReg      r\rn6, r\rn7
+    RotL64      r\rn7, 512,%((\_Rn_) % 8),3
+    xorReg      r\rn7, r\rn6
+            \op4
+    Skein_Debug_Round 512,%(\_Rn_+1),-4
+#
+.endm #R_512_OneRound
+#
+#################
+# MACRO: four rounds plus one key injection for 512-bit blocks
+#   _RR_ = base round number of the group
+# Fully unrolled case: the key injection counter _II_ is the assembly-time
+#   constant (_RR_/4)+1.  Key words are preloaded into rax,rbx,rcx,rdx,rsi
+#   between the mix steps, two tweak words are added into rcx and rdx, and
+#   the injection then adds key words to r8..r15 plus _II_ to r15.
+# Looping case: rdi is the run-time injection counter; it is incremented
+#   first and indexes the key/tweak schedule on the stack, which is also
+#   "rotated" by copying words to higher slots.
+#
+.macro R_512_FourRounds _RR_    #RR = base round number (0 % 8)
+  .if (SKEIN_ASM_UNROLL && 512)
+    # here for fully unrolled case.
+    _II_ = ((\_RR_)/4) + 1       #key injection counter
+    R_512_OneRound  8, 9,10,11,12,13,14,15,%((\_RR_)+0),<movq ksKey+8*(((_II_)+3) % 9)+F_O(%rbp),%rax>,,<movq ksKey+8*(((_II_)+4) % 9)+F_O(%rbp),%rbx>
+    R_512_OneRound 10, 9,12,15,14,13, 8,11,%((\_RR_)+1),<movq ksKey+8*(((_II_)+5) % 9)+F_O(%rbp),%rcx>,,<movq ksKey+8*(((_II_)+6) % 9)+F_O(%rbp),%rdx>
+    R_512_OneRound 12, 9,14,11, 8,13,10,15,%((\_RR_)+2),<movq ksKey+8*(((_II_)+7) % 9)+F_O(%rbp),%rsi>,,<addq ksTwk+8*(((_II_)+0) % 3)+F_O(%rbp),%rcx>
+    R_512_OneRound 14, 9, 8,15,10,13,12,11,%((\_RR_)+3),<addq ksTwk+8*(((_II_)+1) % 3)+F_O(%rbp),%rdx>,
+    # inject the key schedule
+    addq    ksKey+8*(((_II_)+0)%9)+F_O(%rbp),%r8
+    addReg   r11, rax
+    addq    ksKey+8*(((_II_)+1)%9)+F_O(%rbp),%r9
+    addReg   r12, rbx
+    addq    ksKey+8*(((_II_)+2)%9)+F_O(%rbp),%r10
+    addReg   r13, rcx
+    addReg   r14, rdx
+    addReg   r15, rsi,,,(_II_)
+  .else
+    # here for looping case                                                    #"rotate" key/tweak schedule (move up on stack)
+    incq    %rdi                 #bump key injection counter
+    R_512_OneRound  8, 9,10,11,12,13,14,15,%((\_RR_)+0),<movq ksKey+8*6+F_O(%rbp,%rdi,8),%rdx>,<movq      ksTwk-8*1+F_O(%rbp,%rdi,8),%rax>,<movq      ksKey-8*1+F_O(%rbp,%rdi,8),%rsi>
+    R_512_OneRound 10, 9,12,15,14,13, 8,11,%((\_RR_)+1),<movq ksKey+8*5+F_O(%rbp,%rdi,8),%rcx>,<movq %rax,ksTwk+8*2+F_O(%rbp,%rdi,8)     >,<movq %rsi,ksKey+8*8+F_O(%rbp,%rdi,8)>
+    R_512_OneRound 12, 9,14,11, 8,13,10,15,%((\_RR_)+2),<movq ksKey+8*4+F_O(%rbp,%rdi,8),%rbx>,<addq      ksTwk+8*1+F_O(%rbp,%rdi,8),%rdx>,<movq      ksKey+8*7+F_O(%rbp,%rdi,8),%rsi>
+    R_512_OneRound 14, 9, 8,15,10,13,12,11,%((\_RR_)+3),<movq ksKey+8*3+F_O(%rbp,%rdi,8),%rax>,<addq      ksTwk+8*0+F_O(%rbp,%rdi,8),%rcx>
+    # inject the key schedule
+    addq    ksKey+8*0+F_O(%rbp,%rdi,8),%r8
+    addReg   r11, rax
+    addReg   r12, rbx
+    addq    ksKey+8*1+F_O(%rbp,%rdi,8),%r9
+    addReg   r13, rcx
+    addReg   r14, rdx
+    addq    ksKey+8*2+F_O(%rbp,%rdi,8),%r10
+    addReg   r15, rsi
+    addReg   r15, rdi              #inject the round number
+  .endif
+
+    #show the result of the key injection
+    Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT
+.endm #R_512_FourRounds
+#
+#################
+# instantiated code: Skein_512_Process_Block runs the 512-bit block function over blkCnt input blocks
+# (per block: tweak update, key schedule build, initial key injection, rounds, feedforward into the context)
+C_label Skein_512_Process_Block
+    Setup_Stack 512,ROUNDS_512/8      #stack frame setup macro, defined outside this section
+    movq    TWEAK+ 8(%rdi),%rbx       #%rbx = ctx->h.T[1]; kept in %rbx from one block iteration to the next
+    jmp     Skein_512_block_loop      #jump over the alignment padding
+    .p2align 4
+    # main hash loop for Skein_512: one iteration per input block
+Skein_512_block_loop:
+    # general register usage:
+    #   RAX..RDX       = temps for key schedule pre-loads
+    #   R8 ..R15       = X0..X7
+    #   RSP, RBP       = stack/frame pointers
+    #   RDI            = round counter or context pointer
+    #   RSI            = temp (input block pointer during block setup)
+    #
+    movq    TWEAK +  0(%rdi),%rax     #%rax = ctx->h.T[0]
+    addq    bitAdd+F_O(%rbp),%rax     #computed updated tweak value T0
+    movq    %rbx,%rcx
+    xorq    %rax,%rcx                 #%rcx = T0^T1, so %rax/%rbx/%rcx = tweak schedule
+    movq    %rax,TWEAK+ 0    (%rdi)   #save updated tweak value ctx->h.T[0]
+    movq    %rax,ksTwk+ 0+F_O(%rbp)   #ksTwk[0] = T0
+    movq    $KW_PARITY,%rdx           #parity accumulator starts at KW_PARITY
+    movq    blkPtr +F_O(%rbp),%rsi    #%rsi --> input block
+    movq    %rbx,ksTwk+ 8+F_O(%rbp)   #ksTwk[1] = T1
+    movq    %rcx,ksTwk+16+F_O(%rbp)   #ksTwk[2] = T0^T1
+    .irp _Rn_,8,9,10,11,12,13,14,15
+      movq  X_VARS+8*(\_Rn_-8)(%rdi),%r\_Rn_
+      xorq  %r\_Rn_,%rdx              #compute overall parity
+      movq  %r\_Rn_,ksKey+8*(\_Rn_-8)+F_O(%rbp)
+    .endr                             #load state into %r8 ..%r15, compute parity
+      movq  %rdx,ksKey+8*(8)+F_O(%rbp)#save key schedule parity
+
+    addReg   r13,rax                  #precompute key injection for tweak: T0 into %r13 (addReg is defined outside this section; presumably dst += src)
+    addReg   r14, rbx                 #likewise T1 into %r14
+.if _SKEIN_DEBUG
+    movq    %rbx,TWEAK+ 8(%rdi)       #save updated tweak value ctx->h.T[1] for Skein_Debug_Block below
+.endif
+    movq     0(%rsi),%rax             #load input block
+    movq     8(%rsi),%rbx #from here on %rbx no longer holds T1; it is rebuilt in the feedforward below
+    movq    16(%rsi),%rcx 
+    movq    24(%rsi),%rdx 
+    addReg   r8 , rax                 #do initial key injection
+    addReg   r9 , rbx
+    movq    %rax,Wcopy+ 0+F_O(%rbp)   #keep local copy for feedforward
+    movq    %rbx,Wcopy+ 8+F_O(%rbp)
+    addReg   r10, rcx
+    addReg   r11, rdx
+    movq    %rcx,Wcopy+16+F_O(%rbp)
+    movq    %rdx,Wcopy+24+F_O(%rbp)
+
+    movq    32(%rsi),%rax             #second half of the input block, same treatment
+    movq    40(%rsi),%rbx 
+    movq    48(%rsi),%rcx 
+    movq    56(%rsi),%rdx
+    addReg   r12, rax
+    addReg   r13, rbx
+    addReg   r14, rcx
+    addReg   r15, rdx
+    movq    %rax,Wcopy+32+F_O(%rbp)    
+    movq    %rbx,Wcopy+40+F_O(%rbp)    
+    movq    %rcx,Wcopy+48+F_O(%rbp)    
+    movq    %rdx,Wcopy+56+F_O(%rbp)    
+
+.if _SKEIN_DEBUG
+    .irp _Rn_,8,9,10,11,12,13,14,15   #save values on stack for debug output
+      movq  %r\_Rn_,X_stk+8*(\_Rn_-8)(%rsp)
+    .endr
+
+    Skein_Debug_Block 512             #debug dump
+    Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL
+.endif
+    addq    $8*WCNT,%rsi              #skip the block
+    movq    %rsi,blkPtr+F_O(%rbp)     #update block pointer
+    #
+    #################
+    # now the key schedule is computed. Start the rounds
+    #
+.if SKEIN_ASM_UNROLL & 512            #unrolled build: all rounds emitted inline, no run-time round loop
+_UNROLL_CNT =   ROUNDS_512/8
+.else
+_UNROLL_CNT =   SKEIN_UNROLL_512
+  .if ((ROUNDS_512/8) % _UNROLL_CNT)  #the unroll count must divide ROUNDS_512/8 evenly
+    .error "Invalid SKEIN_UNROLL_512"
+  .endif
+    xorq    %rdi,%rdi                 #rdi = round counter
+Skein_512_round_loop:
+.endif
+# emit _UNROLL_CNT*2 copies of R_512_FourRounds, each invoked with round base 4*_Rbase_
+_Rbase_ = 0
+.rept _UNROLL_CNT*2
+      R_512_FourRounds %(4*_Rbase_+00)
+_Rbase_ = _Rbase_+1
+.endr #rept _UNROLL_CNT*2
+#
+.if (SKEIN_ASM_UNROLL & 512) == 0
+    cmpq    $2*(ROUNDS_512/8),%rdi    #%rdi is the counter zeroed above; repeat the emitted rounds until it reaches 2*(ROUNDS_512/8)
+    jb      Skein_512_round_loop
+    movq    ctxPtr +F_O(%rbp),%rdi           #restore rdi --> context
+.endif
+    # end of rounds
+    #################
+    # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..7}
+    .irp _Rn_,8,9,10,11,12,13,14,15
+  .if (\_Rn_ == 8)
+    movq    $FIRST_MASK64,%rbx        #emitted once, on the first iteration: mask to apply to T[1]
+  .endif
+      xorq  Wcopy+8*(\_Rn_-8)+F_O(%rbp),%r\_Rn_  #feedforward XOR
+      movq  %r\_Rn_,X_VARS+8*(\_Rn_-8)(%rdi)     #and store result
+  .if (\_Rn_ == 14)
+    andq    TWEAK+ 8(%rdi),%rbx       #%rbx = ctx->h.T[1] & FIRST_MASK64, used as T1 by the next block
+  .endif
+    .endr
+    Skein_Debug_Round 512,SKEIN_RND_FEED_FWD
+
+    # go back for more blocks, if needed
+    decq    blkCnt+F_O(%rbp)
+    jnz     Skein_512_block_loop
+    movq    %rbx,TWEAK + 8(%rdi)      #all blocks done: write the masked T1 back to ctx->h.T[1]
+
+    Reset_Stack                       #stack teardown macro, defined outside this section
+    ret
+Skein_512_Process_Block_End:          #end marker; the CodeSize routine subtracts the entry label from it
+#
+  .if _SKEIN_DEBUG
+# debug entry for the 512-bit code: call here with rdx  = "round number"
+Skein_Debug_Round_512:
+    pushq   %rsi                     #save two regs for BLK_BITS-specific parms
+    pushq   %rdi                     #(both are overwritten with call parameters below)
+  .irp _Rn_,8,9,10,11,12,13,14,15    #save X[] state on stack so debug routines can access it
+    movq    %r\_Rn_,X_stk+8*(\_Rn_-8)+F_O(%rbp)
+  .endr
+    movq    ctxPtr+F_O(%rbp),%rsi    #ctx_hdr_ptr
+    movq    $512,%rdi                #now <rdi,rsi,rdx> are set for the call
+    jmp     Skein_Debug_Round_Common #shared continuation, defined outside this section; entered with %rdi and %rsi still pushed
+  .endif
+#
+.if _SKEIN_CODE_SIZE
+C_label Skein_512_Process_Block_CodeSize    #returns in %rax the distance in bytes from Skein_512_Process_Block to its End label
+    movq    $(Skein_512_Process_Block_End-Skein_512_Process_Block),%rax
+    ret
+# Skein_512_Unroll_Cnt: returns _UNROLL_CNT in %rax, or 0 when _UNROLL_CNT equals ROUNDS_512/8
+C_label Skein_512_Unroll_Cnt
+  .if _UNROLL_CNT <> (ROUNDS_512/8)
+    movq    $_UNROLL_CNT,%rax        #unroll count differs from ROUNDS_512/8: report it
+  .else
+    xorq    %rax,%rax                #unroll count equals ROUNDS_512/8: report 0
+  .endif
+    ret
+.endif
+#
+.endif # _USE_ASM_ & 512
+#
+#=================================== Skein1024 =============================================
+.if _USE_ASM_ & 1024
+#
+# void Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
+#
+#################
+# use details of permutation to make register assignments
+# 
+o1K_rdi =  0        #offsets in X[] associated with each register (o1K_<reg> = X[] index assigned to <reg>)
+o1K_rsi =  1 
+o1K_rbp =  2 
+o1K_rax =  3 
+o1K_rcx =  4        #rcx is "shared" with X6, since X4/X6 alternate
+o1K_rbx =  5 
+o1K_rdx =  7 #note: index 6 has no entry of its own in this table (see the rcx note above)
+o1K_r8  =  8  
+o1K_r9  =  9  
+o1K_r10 = 10
+o1K_r11 = 11
+o1K_r12 = 12
+o1K_r13 = 13
+o1K_r14 = 14
+o1K_r15 = 15
+#
+rIdx_offs = tmpStk_1024    #alias for tmpStk_1024, which is defined outside this section
+#
+.macro r1024_Mix w0,w1,reg0,reg1,_RN0_,_Rn1_,op1
+    addReg      \reg0 , \reg1                      #perform the MIX

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***