From owner-svn-src-projects@FreeBSD.ORG Fri Aug 22 14:27:42 2014
From: Randall Stewart <rrs@FreeBSD.org>
Date: Fri, 22 Aug 2014 14:27:41 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-projects@freebsd.org
Subject: svn commit: r270330 - in projects/rrs_socrypto_tls/sys: crypto/aesni libkern opencrypto

Author: rrs
Date: Fri Aug 22 14:27:41 2014
New Revision: 270330

URL: http://svnweb.freebsd.org/changeset/base/270330

Log:
  We need to have these added too.

  Obtained from:  jmg
  MFC after:      never

  AM    sys/crypto/aesni/aesni_ghash.c
  AM    sys/libkern/explicit_bzero.c
  AM    sys/opencrypto/gfmult.c
  AM    sys/opencrypto/gfmult.h
  AM    sys/opencrypto/gmac.c
  AM    sys/opencrypto/gmac.h

Added:
  projects/rrs_socrypto_tls/sys/crypto/aesni/aesni_ghash.c   (contents, props changed)
  projects/rrs_socrypto_tls/sys/libkern/explicit_bzero.c   (contents, props changed)
  projects/rrs_socrypto_tls/sys/opencrypto/gfmult.c   (contents, props changed)
  projects/rrs_socrypto_tls/sys/opencrypto/gfmult.h   (contents, props changed)
  projects/rrs_socrypto_tls/sys/opencrypto/gmac.c   (contents, props changed)
  projects/rrs_socrypto_tls/sys/opencrypto/gmac.h   (contents, props changed)
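[Editor's note - context for the diffs below: these files implement or support
GHASH, the GF(2^128) universal hash that authenticates AES-GCM.  gfmult.c
computes the field multiplication four bits at a time from precomputed tables;
aesni_ghash.c computes it with the PCLMULQDQ carry-less multiply instruction.
As a point of reference only (this sketch is not part of the commit), the same
multiplication written bit by bit, in the bit-reflected representation both
files share, looks like this:]

#include <stdint.h>

/*
 * Reference GF(2^128) multiply in GCM's bit-reflected representation
 * (editorial sketch, not part of r270330).  v[0] holds bits 0-63 and the
 * reduction polynomial appears as 0xe1 in the top byte of v[0], matching
 * the convention of gfmult.c below.
 */
static void
gf128_mul_ref(const uint64_t x[2], const uint64_t y[2], uint64_t r[2])
{
        uint64_t z[2] = { 0, 0 };          /* accumulator */
        uint64_t v[2] = { y[0], y[1] };    /* v = y * alpha^i */
        uint64_t carry;
        int i;

        for (i = 0; i < 128; i++) {
                /* bit i of x, numbered from the MSB of x[0] (GCM bit order) */
                if ((x[i / 64] >> (63 - (i % 64))) & 1) {
                        z[0] ^= v[0];
                        z[1] ^= v[1];
                }
                /* v *= alpha: shift right one bit, reduce by the polynomial */
                carry = v[1] & 1;
                v[1] = (v[1] >> 1) | (v[0] << 63);
                v[0] >>= 1;
                if (carry)
                        v[0] ^= 0xe100000000000000ULL;
        }
        r[0] = z[0];
        r[1] = z[1];
}

[gf128_mulalpha() in gfmult.c below is exactly the "v *= alpha" step above,
with the branch replaced by a mask.]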
Added: projects/rrs_socrypto_tls/sys/crypto/aesni/aesni_ghash.c
==============================================================================
--- /dev/null   00:00:00 1970   (empty, because file is newly added)
+++ projects/rrs_socrypto_tls/sys/crypto/aesni/aesni_ghash.c   Fri Aug 22 14:27:41 2014   (r270330)
@@ -0,0 +1,767 @@
+/*-
+ * Copyright (c) 2014 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by John-Mark Gurney under
+ * the sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ *
+ */
+
+/*
+ * Figures 5, 8 and 12 are copied from the Intel white paper:
+ * Intel® Carry-Less Multiplication Instruction and its Usage for
+ * Computing the GCM Mode
+ *
+ * and as such are:
+ * Copyright © 2010 Intel Corporation.  All rights reserved.
+ *
+ * Please see the white paper for the complete license.
+ */
+
+#ifdef _KERNEL
+#include <crypto/aesni/aesni.h>
+#else
+#include <stdint.h>
+#endif
+
+#include <wmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+static inline int
+m128icmp(__m128i a, __m128i b)
+{
+        __m128i cmp;
+
+        cmp = _mm_cmpeq_epi32(a, b);
+
+        return _mm_movemask_epi8(cmp) == 0xffff;
+}
+
+/* some code from carry-less-multiplication-instruction-in-gcm-mode-paper.pdf */
+
+/* Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
+static void
+gfmul(__m128i a, __m128i b, __m128i *res)
+{
+        __m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
+
+        tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
+        tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
+        tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
+        tmp6 = _mm_clmulepi64_si128(a, b, 0x11);
+
+        tmp4 = _mm_xor_si128(tmp4, tmp5);
+        tmp5 = _mm_slli_si128(tmp4, 8);
+        tmp4 = _mm_srli_si128(tmp4, 8);
+        tmp3 = _mm_xor_si128(tmp3, tmp5);
+        tmp6 = _mm_xor_si128(tmp6, tmp4);
+
+        tmp7 = _mm_srli_epi32(tmp3, 31);
+        tmp8 = _mm_srli_epi32(tmp6, 31);
+        tmp3 = _mm_slli_epi32(tmp3, 1);
+        tmp6 = _mm_slli_epi32(tmp6, 1);
+
+        tmp9 = _mm_srli_si128(tmp7, 12);
+        tmp8 = _mm_slli_si128(tmp8, 4);
+        tmp7 = _mm_slli_si128(tmp7, 4);
+        tmp3 = _mm_or_si128(tmp3, tmp7);
+        tmp6 = _mm_or_si128(tmp6, tmp8);
+        tmp6 = _mm_or_si128(tmp6, tmp9);
+
+        tmp7 = _mm_slli_epi32(tmp3, 31);
+        tmp8 = _mm_slli_epi32(tmp3, 30);
+        tmp9 = _mm_slli_epi32(tmp3, 25);
+
+        tmp7 = _mm_xor_si128(tmp7, tmp8);
+        tmp7 = _mm_xor_si128(tmp7, tmp9);
+        tmp8 = _mm_srli_si128(tmp7, 4);
+        tmp7 = _mm_slli_si128(tmp7, 12);
+        tmp3 = _mm_xor_si128(tmp3, tmp7);
+
+        tmp2 = _mm_srli_epi32(tmp3, 1);
+        tmp4 = _mm_srli_epi32(tmp3, 2);
+        tmp5 = _mm_srli_epi32(tmp3, 7);
+        tmp2 = _mm_xor_si128(tmp2, tmp4);
+        tmp2 = _mm_xor_si128(tmp2, tmp5);
+        tmp2 = _mm_xor_si128(tmp2, tmp8);
+        tmp3 = _mm_xor_si128(tmp3, tmp2);
+        tmp6 = _mm_xor_si128(tmp6, tmp3);
+
+        *res = tmp6;
+}
+
+/*
+ * Figure 8.  Code Sample - Performing Ghash Using an Aggregated Reduction
+ * Method
+ */
+static void
+reduce4(__m128i H1, __m128i H2, __m128i H3, __m128i H4,
+    __m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res)
+{
+        /* algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */
+        __m128i H1_X1_lo, H1_X1_hi, H2_X2_lo, H2_X2_hi, H3_X3_lo,
+            H3_X3_hi, H4_X4_lo, H4_X4_hi, lo, hi;
+        __m128i tmp0, tmp1, tmp2, tmp3;
+        __m128i tmp4, tmp5, tmp6, tmp7;
+        __m128i tmp8, tmp9;
+
+        H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00);
+        H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00);
+        H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00);
+        H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);
+
+        lo = _mm_xor_si128(H1_X1_lo, H2_X2_lo);
+        lo = _mm_xor_si128(lo, H3_X3_lo);
+        lo = _mm_xor_si128(lo, H4_X4_lo);
+
+        H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11);
+        H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11);
+        H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11);
+        H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);
+
+        hi = _mm_xor_si128(H1_X1_hi, H2_X2_hi);
+        hi = _mm_xor_si128(hi, H3_X3_hi);
+        hi = _mm_xor_si128(hi, H4_X4_hi);
+
+        tmp0 = _mm_shuffle_epi32(H1, 78);
+        tmp4 = _mm_shuffle_epi32(X1, 78);
+        tmp0 = _mm_xor_si128(tmp0, H1);
+        tmp4 = _mm_xor_si128(tmp4, X1);
+        tmp1 = _mm_shuffle_epi32(H2, 78);
+        tmp5 = _mm_shuffle_epi32(X2, 78);
+        tmp1 = _mm_xor_si128(tmp1, H2);
+        tmp5 = _mm_xor_si128(tmp5, X2);
+        tmp2 = _mm_shuffle_epi32(H3, 78);
+        tmp6 = _mm_shuffle_epi32(X3, 78);
+        tmp2 = _mm_xor_si128(tmp2, H3);
+        tmp6 = _mm_xor_si128(tmp6, X3);
+        tmp3 = _mm_shuffle_epi32(H4, 78);
+        tmp7 = _mm_shuffle_epi32(X4, 78);
+        tmp3 = _mm_xor_si128(tmp3, H4);
+        tmp7 = _mm_xor_si128(tmp7, X4);
+
+        tmp0 = _mm_clmulepi64_si128(tmp0, tmp4, 0x00);
+        tmp1 = _mm_clmulepi64_si128(tmp1, tmp5, 0x00);
+        tmp2 = _mm_clmulepi64_si128(tmp2, tmp6, 0x00);
+        tmp3 = _mm_clmulepi64_si128(tmp3, tmp7, 0x00);
+
+        tmp0 = _mm_xor_si128(tmp0, lo);
+        tmp0 = _mm_xor_si128(tmp0, hi);
+        tmp0 = _mm_xor_si128(tmp1, tmp0);
+        tmp0 = _mm_xor_si128(tmp2, tmp0);
+        tmp0 = _mm_xor_si128(tmp3, tmp0);
+
+        tmp4 = _mm_slli_si128(tmp0, 8);
+        tmp0 = _mm_srli_si128(tmp0, 8);
+
+        lo = _mm_xor_si128(tmp4, lo);
+        hi = _mm_xor_si128(tmp0, hi);
+
+        tmp3 = lo;
+        tmp6 = hi;
+
+        tmp7 = _mm_srli_epi32(tmp3, 31);
+        tmp8 = _mm_srli_epi32(tmp6, 31);
+        tmp3 = _mm_slli_epi32(tmp3, 1);
+        tmp6 = _mm_slli_epi32(tmp6, 1);
+
+        tmp9 = _mm_srli_si128(tmp7, 12);
+        tmp8 = _mm_slli_si128(tmp8, 4);
+        tmp7 = _mm_slli_si128(tmp7, 4);
+        tmp3 = _mm_or_si128(tmp3, tmp7);
+        tmp6 = _mm_or_si128(tmp6, tmp8);
+        tmp6 = _mm_or_si128(tmp6, tmp9);
+
+        tmp7 = _mm_slli_epi32(tmp3, 31);
+        tmp8 = _mm_slli_epi32(tmp3, 30);
+        tmp9 = _mm_slli_epi32(tmp3, 25);
+
+        tmp7 = _mm_xor_si128(tmp7, tmp8);
+        tmp7 = _mm_xor_si128(tmp7, tmp9);
+        tmp8 = _mm_srli_si128(tmp7, 4);
+        tmp7 = _mm_slli_si128(tmp7, 12);
+        tmp3 = _mm_xor_si128(tmp3, tmp7);
+
+        tmp2 = _mm_srli_epi32(tmp3, 1);
+        tmp4 = _mm_srli_epi32(tmp3, 2);
+        tmp5 = _mm_srli_epi32(tmp3, 7);
+        tmp2 = _mm_xor_si128(tmp2, tmp4);
+        tmp2 = _mm_xor_si128(tmp2, tmp5);
+        tmp2 = _mm_xor_si128(tmp2, tmp8);
+        tmp3 = _mm_xor_si128(tmp3, tmp2);
+        tmp6 = _mm_xor_si128(tmp6, tmp3);
+
+        *res = tmp6;
+}
+
+/*
+ * Figure 12.  AES-GCM: Processing Four Blocks in Parallel with Aggregated
+ * Every Four Blocks
+ */
+/*
+ * per NIST SP-800-38D, 5.2.1.1, len(p) <= 2^39-256 (in bits), or
+ * 2^32-256*8*16 bytes.
+ */
+void
+AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
+    const unsigned char *addt, const unsigned char *ivec,
+    unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
+    const unsigned char *key, int nr)
+{
+        int i, j, k;
+        __m128i tmp1, tmp2, tmp3, tmp4;
+        __m128i tmp5, tmp6, tmp7, tmp8;
+        __m128i H, H2, H3, H4, Y, T;
+        __m128i *KEY = (__m128i*)key;
+        __m128i ctr1, ctr2, ctr3, ctr4;
+        __m128i ctr5, ctr6, ctr7, ctr8;
+        __m128i last_block = _mm_setzero_si128();
+        __m128i ONE = _mm_set_epi32(0, 1, 0, 0);
+        __m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
+        __m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
+            7);
+        __m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
+            15);
+        __m128i X = _mm_setzero_si128();
+
+        if (ibytes == 96/8) {
+                Y = _mm_loadu_si128((__m128i*)ivec);
+                Y = _mm_insert_epi32(Y, 0x1000000, 3);
+                /* Compute E[ZERO, KS] and E[Y0, KS] together */
+                tmp1 = _mm_xor_si128(X, KEY[0]);
+                tmp2 = _mm_xor_si128(Y, KEY[0]);
+                for (j=1; j < nr-1; j+=2) {
+                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
+                        tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
+
+                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
+                        tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
+                }
+                tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
+                tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);
+
+                H = _mm_aesenclast_si128(tmp1, KEY[nr]);
+                T = _mm_aesenclast_si128(tmp2, KEY[nr]);
+
+                H = _mm_shuffle_epi8(H, BSWAP_MASK);
+        } else {
+                tmp1 = _mm_xor_si128(X, KEY[0]);
+                for (j=1; j

*** INTERVENING DIFF OUTPUT MISSING FROM THE ARCHIVED COPY ***

Added: projects/rrs_socrypto_tls/sys/libkern/explicit_bzero.c
==============================================================================
+#include <sys/types.h>
+
+/*
+ * explicit_bzero - don't let the compiler optimize away bzero
+ */
+void
+explicit_bzero(void *p, size_t n)
+{
+        bzero(p, n);
+}

Added: projects/rrs_socrypto_tls/sys/opencrypto/gfmult.c
==============================================================================
--- /dev/null   00:00:00 1970   (empty, because file is newly added)
+++ projects/rrs_socrypto_tls/sys/opencrypto/gfmult.c   Fri Aug 22 14:27:41 2014   (r270330)
@@ -0,0 +1,274 @@
+/*-
+ * Copyright (c) 2014 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by John-Mark Gurney under
+ * the sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ */
+
+#include "gfmult.h"
+
+#define REV_POLY_REDUCT 0xe1    /* 0x87 bit reversed */
+
+/* reverse the bits of a nibble */
+static const uint8_t nib_rev[] = {
+        0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe,
+        0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf,
+};
+
+/* calculate v * 2 */
+static inline struct gf128
+gf128_mulalpha(struct gf128 v)
+{
+        uint64_t mask;
+
+        mask = !!(v.v[1] & 1);
+        mask = ~(mask - 1);
+        v.v[1] = (v.v[1] >> 1) | ((v.v[0] & 1) << 63);
+        v.v[0] = (v.v[0] >> 1) ^ ((mask & REV_POLY_REDUCT) << 56);
+
+        return v;
+}
+
+/*
+ * Generate a table for 0-15 * h.  Store the results in the table w/ indexes
+ * bit reversed, and the words striped across the values.
+ */
+void
+gf128_genmultable(struct gf128 h, struct gf128table *t)
+{
+        struct gf128 tbl[16];
+        int i;
+
+        tbl[0] = MAKE_GF128(0, 0);
+        tbl[1] = h;
+
+        for (i = 2; i < 16; i += 2) {
+                tbl[i] = gf128_mulalpha(tbl[i / 2]);
+                tbl[i + 1] = gf128_add(tbl[i], h);
+        }
+
+        for (i = 0; i < 16; i++) {
+                t->a[nib_rev[i]] = tbl[i].v[0] >> 32;
+                t->b[nib_rev[i]] = tbl[i].v[0];
+                t->c[nib_rev[i]] = tbl[i].v[1] >> 32;
+                t->d[nib_rev[i]] = tbl[i].v[1];
+        }
+}
+
+/*
+ * Generate tables containing h, h^2, h^3 and h^4, starting at 0.
+ */
+void
+gf128_genmultable4(struct gf128 h, struct gf128table4 *t)
+{
+        struct gf128 h2, h3, h4;
+
+        gf128_genmultable(h, &t->tbls[0]);
+
+        h2 = gf128_mul(h, &t->tbls[0]);
+
+        gf128_genmultable(h2, &t->tbls[1]);
+
+        h3 = gf128_mul(h, &t->tbls[1]);
+        gf128_genmultable(h3, &t->tbls[2]);
+
+        h4 = gf128_mul(h2, &t->tbls[1]);
+        gf128_genmultable(h4, &t->tbls[3]);
+}
+
+/*
+ * Read a row from the table.
+ */
+static inline struct gf128
+readrow(struct gf128table *tbl, unsigned bits)
+{
+        struct gf128 r;
+
+        bits = bits % 16;
+
+        r.v[0] = ((uint64_t)tbl->a[bits] << 32) | tbl->b[bits];
+        r.v[1] = ((uint64_t)tbl->c[bits] << 32) | tbl->d[bits];
+
+        return r;
+}
+
+/*
+ * These are the reduction values.  Since we are dealing with the bit
+ * reversed version, the values need to be bit reversed, AND the indexes
+ * are also bit reversed to make lookups quicker.
+ */
+static uint16_t reduction[] = {
+        0x0000, 0x1c20, 0x3840, 0x2460, 0x7080, 0x6ca0, 0x48c0, 0x54e0,
+        0xe100, 0xfd20, 0xd940, 0xc560, 0x9180, 0x8da0, 0xa9c0, 0xb5e0,
+};
+
+/*
+ * Calculate:
+ * (x*2^4 + word[3,0]*h) *
+ * 2^4 + word[7,4]*h) *
+ * ...
+ * 2^4 + word[63,60]*h
+ */
+static struct gf128
+gfmultword(uint64_t word, struct gf128 x, struct gf128table *tbl)
+{
+        struct gf128 row;
+        unsigned bits;
+        unsigned redbits;
+        int i;
+
+        for (i = 0; i < 64; i += 4) {
+                bits = word % 16;
+
+                /* fetch row */
+                row = readrow(tbl, bits);
+
+                /* x * 2^4 */
+                redbits = x.v[1] % 16;
+                x.v[1] = (x.v[1] >> 4) | (x.v[0] % 16) << 60;
+                x.v[0] >>= 4;
+                x.v[0] ^= (uint64_t)reduction[redbits] << (64 - 16);
+
+                word >>= 4;
+
+                x = gf128_add(x, row);
+        }
+
+        return x;
+}
+
+/*
+ * Calculate
+ * (x*2^4 + worda[3,0]*h^4+wordb[3,0]*h^3+...+wordd[3,0]*h) *
+ * ...
+ * 2^4 + worda[63,60]*h^4+ ... + wordd[63,60]*h
+ *
+ * Passing/returning the struct is .5% faster than passing it in via pointer
+ * on amd64.
+ */
+static struct gf128
+gfmultword4(uint64_t worda, uint64_t wordb, uint64_t wordc, uint64_t wordd,
+    struct gf128 x, struct gf128table4 *tbl)
+{
+        struct gf128 rowa, rowb, rowc, rowd;
+        unsigned bitsa, bitsb, bitsc, bitsd;
+        unsigned redbits;
+        int i;
+
+        /*
+         * XXX - nibble reverse the words to save a shift?  Probably not, as
probably not as + * nibble reverse would take 20 ops (5 * 4) verse 16 + */ + + for (i = 0; i < 64; i += 4) { + bitsa = worda % 16; + bitsb = wordb % 16; + bitsc = wordc % 16; + bitsd = wordd % 16; + + /* fetch row */ + rowa = readrow(&tbl->tbls[3], bitsa); + rowb = readrow(&tbl->tbls[2], bitsb); + rowc = readrow(&tbl->tbls[1], bitsc); + rowd = readrow(&tbl->tbls[0], bitsd); + + /* x * 2^4 */ + redbits = x.v[1] % 16; + x.v[1] = (x.v[1] >> 4) | (x.v[0] % 16) << 60; + x.v[0] >>= 4; + x.v[0] ^= (uint64_t)reduction[redbits] << (64 - 16); + + worda >>= 4; + wordb >>= 4; + wordc >>= 4; *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***