From owner-svn-src-stable-11@freebsd.org Wed Apr 19 16:16:43 2017 Return-Path: Delivered-To: svn-src-stable-11@mailman.ysv.freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:1900:2254:206a::19:1]) by mailman.ysv.freebsd.org (Postfix) with ESMTP id 52E3DD458C1; Wed, 19 Apr 2017 16:16:43 +0000 (UTC) (envelope-from markj@FreeBSD.org) Received: from repo.freebsd.org (repo.freebsd.org [IPv6:2610:1c1:1:6068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mx1.freebsd.org (Postfix) with ESMTPS id 21BED1967; Wed, 19 Apr 2017 16:16:43 +0000 (UTC) (envelope-from markj@FreeBSD.org) Received: from repo.freebsd.org ([127.0.1.37]) by repo.freebsd.org (8.15.2/8.15.2) with ESMTP id v3JGGgYD082809; Wed, 19 Apr 2017 16:16:42 GMT (envelope-from markj@FreeBSD.org) Received: (from markj@localhost) by repo.freebsd.org (8.15.2/8.15.2/Submit) id v3JGGfun082801; Wed, 19 Apr 2017 16:16:41 GMT (envelope-from markj@FreeBSD.org) Message-Id: <201704191616.v3JGGfun082801@repo.freebsd.org> X-Authentication-Warning: repo.freebsd.org: markj set sender to markj@FreeBSD.org using -f From: Mark Johnston Date: Wed, 19 Apr 2017 16:16:41 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-11@freebsd.org Subject: svn commit: r317149 - in stable/11: sys/conf sys/libkern sys/libkern/x86 sys/sys tests/sys/kern X-SVN-Group: stable-11 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-stable-11@freebsd.org X-Mailman-Version: 2.1.23 Precedence: list List-Id: SVN commit messages for only the 11-stable src tree List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 19 Apr 2017 16:16:43 -0000 Author: markj Date: Wed Apr 19 16:16:41 2017 New Revision: 317149 URL: https://svnweb.freebsd.org/changeset/base/317149 Log: MFC r313006 (by cem), r315983 (by bde): Add an SSE4.2 implementation of crc32 for x86. Added: stable/11/sys/libkern/x86/ - copied from r313006, head/sys/libkern/x86/ stable/11/tests/sys/kern/libkern_crc32.c - copied unchanged from r313006, head/tests/sys/kern/libkern_crc32.c Modified: stable/11/sys/conf/files.amd64 stable/11/sys/conf/files.i386 stable/11/sys/libkern/crc32.c stable/11/sys/libkern/x86/crc32_sse42.c stable/11/sys/sys/libkern.h stable/11/tests/sys/kern/Makefile Directory Properties: stable/11/ (props changed) Modified: stable/11/sys/conf/files.amd64 ============================================================================== --- stable/11/sys/conf/files.amd64 Wed Apr 19 16:12:02 2017 (r317148) +++ stable/11/sys/conf/files.amd64 Wed Apr 19 16:16:41 2017 (r317149) @@ -552,6 +552,9 @@ isa/syscons_isa.c optional sc isa/vga_isa.c optional vga kern/kern_clocksource.c standard kern/link_elf_obj.c standard +libkern/x86/crc32_sse42.c standard +libkern/memmove.c standard +libkern/memset.c standard # # IA32 binary support # @@ -609,9 +612,6 @@ compat/ndis/subr_pe.c optional ndisapi compat/ndis/subr_usbd.c optional ndisapi pci compat/ndis/winx64_wrap.S optional ndisapi pci # -libkern/memmove.c standard -libkern/memset.c standard -# # x86 real mode BIOS emulator, required by dpms/pci/vesa # compat/x86bios/x86bios.c optional x86bios | dpms | pci | vesa Modified: stable/11/sys/conf/files.i386 ============================================================================== --- stable/11/sys/conf/files.i386 Wed Apr 19 16:12:02 2017 (r317148) +++ stable/11/sys/conf/files.i386 Wed Apr 19 16:16:41 2017 (r317149) @@ -564,6 +564,7 @@ libkern/qdivrem.c standard libkern/ucmpdi2.c standard libkern/udivdi3.c standard libkern/umoddi3.c standard +libkern/x86/crc32_sse42.c standard i386/xbox/xbox.c optional xbox i386/xbox/xboxfb.c optional xboxfb dev/fb/boot_font.c optional xboxfb Modified: stable/11/sys/libkern/crc32.c ============================================================================== --- stable/11/sys/libkern/crc32.c Wed Apr 19 16:12:02 2017 (r317148) +++ stable/11/sys/libkern/crc32.c Wed Apr 19 16:16:41 2017 (r317149) @@ -46,8 +46,14 @@ __FBSDID("$FreeBSD$"); #include +#include #include +#if defined(__amd64__) || defined(__i386__) +#include +#include +#endif + const uint32_t crc32_tab[] = { 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, @@ -749,6 +755,11 @@ calculate_crc32c(uint32_t crc32c, const unsigned char *buffer, unsigned int length) { +#if defined(__amd64__) || defined(__i386__) + if ((cpu_feature2 & CPUID2_SSE42) != 0) { + return (sse42_crc32c(crc32c, buffer, length)); + } else +#endif if (length < 4) { return (singletable_crc32c(crc32c, buffer, length)); } else { Modified: stable/11/sys/libkern/x86/crc32_sse42.c ============================================================================== --- head/sys/libkern/x86/crc32_sse42.c Tue Jan 31 03:26:32 2017 (r313006) +++ stable/11/sys/libkern/x86/crc32_sse42.c Wed Apr 19 16:16:41 2017 (r317149) @@ -31,14 +31,40 @@ __FBSDID("$FreeBSD$"); */ #ifdef USERSPACE_TESTING #include +#include #else #include -#include -#include #include +#include #endif -#include +static __inline uint32_t +_mm_crc32_u8(uint32_t x, uint8_t y) +{ + /* + * clang (at least 3.9.[0-1]) pessimizes "rm" (y) and "m" (y) + * significantly and "r" (y) a lot by copying y to a different + * local variable (on the stack or in a register), so only use + * the latter. This costs a register and an instruction but + * not a uop. + */ + __asm("crc32b %1,%0" : "+r" (x) : "r" (y)); + return (x); +} + +static __inline uint32_t +_mm_crc32_u32(uint32_t x, uint32_t y) +{ + __asm("crc32l %1,%0" : "+r" (x) : "r" (y)); + return (x); +} + +static __inline uint64_t +_mm_crc32_u64(uint64_t x, uint64_t y) +{ + __asm("crc32q %1,%0" : "+r" (x) : "r" (y)); + return (x); +} /* CRC-32C (iSCSI) polynomial in reversed bit order. */ #define POLY 0x82f63b78 @@ -47,12 +73,18 @@ __FBSDID("$FreeBSD$"); * Block sizes for three-way parallel crc computation. LONG and SHORT must * both be powers of two. */ -#define LONG 8192 -#define SHORT 256 +#define LONG 128 +#define SHORT 64 -/* Tables for hardware crc that shift a crc by LONG and SHORT zeros. */ +/* + * Tables for updating a crc for LONG, 2 * LONG, SHORT and 2 * SHORT bytes + * of value 0 later in the input stream, in the same way that the hardware + * would, but in software without calculating intermediate steps. + */ static uint32_t crc32c_long[4][256]; +static uint32_t crc32c_2long[4][256]; static uint32_t crc32c_short[4][256]; +static uint32_t crc32c_2short[4][256]; /* * Multiply a matrix times a vector over the Galois field of two elements, @@ -171,7 +203,9 @@ __attribute__((__constructor__)) crc32c_init_hw(void) { crc32c_zeros(crc32c_long, LONG); + crc32c_zeros(crc32c_2long, 2 * LONG); crc32c_zeros(crc32c_short, SHORT); + crc32c_zeros(crc32c_2short, 2 * SHORT); } #ifdef _KERNEL SYSINIT(crc32c_sse42, SI_SUB_LOCK, SI_ORDER_ANY, crc32c_init_hw, NULL); @@ -190,7 +224,11 @@ sse42_crc32c(uint32_t crc, const unsigne const size_t align = 4; #endif const unsigned char *next, *end; - uint64_t crc0, crc1, crc2; /* need to be 64 bits for crc32q */ +#ifdef __amd64__ + uint64_t crc0, crc1, crc2; +#else + uint32_t crc0, crc1, crc2; +#endif next = buf; crc0 = crc; @@ -202,6 +240,7 @@ sse42_crc32c(uint32_t crc, const unsigne len--; } +#if LONG > SHORT /* * Compute the crc on sets of LONG*3 bytes, executing three independent * crc instructions, each on LONG bytes -- this is optimized for the @@ -209,6 +248,7 @@ sse42_crc32c(uint32_t crc, const unsigne * have a throughput of one crc per cycle, but a latency of three * cycles. */ + crc = 0; while (len >= LONG * 3) { crc1 = 0; crc2 = 0; @@ -229,16 +269,64 @@ sse42_crc32c(uint32_t crc, const unsigne #endif next += align; } while (next < end); - crc0 = crc32c_shift(crc32c_long, crc0) ^ crc1; - crc0 = crc32c_shift(crc32c_long, crc0) ^ crc2; + /*- + * Update the crc. Try to do it in parallel with the inner + * loop. 'crc' is used to accumulate crc0 and crc1 + * produced by the inner loop so that the next iteration + * of the loop doesn't depend on anything except crc2. + * + * The full expression for the update is: + * crc = S*S*S*crc + S*S*crc0 + S*crc1 + * where the terms are polynomials modulo the CRC polynomial. + * We regroup this subtly as: + * crc = S*S * (S*crc + crc0) + S*crc1. + * This has an extra dependency which reduces possible + * parallelism for the expression, but it turns out to be + * best to intentionally delay evaluation of this expression + * so that it competes less with the inner loop. + * + * We also intentionally reduce parallelism by feedng back + * crc2 to the inner loop as crc0 instead of accumulating + * it in crc. This synchronizes the loop with crc update. + * CPU and/or compiler schedulers produced bad order without + * this. + * + * Shifts take about 12 cycles each, so 3 here with 2 + * parallelizable take about 24 cycles and the crc update + * takes slightly longer. 8 dependent crc32 instructions + * can run in 24 cycles, so the 3-way blocking is worse + * than useless for sizes less than 8 * = 64 + * on amd64. In practice, SHORT = 32 confirms these + * timing calculations by giving a small improvement + * starting at size 96. Then the inner loop takes about + * 12 cycles and the crc update about 24, but these are + * partly in parallel so the total time is less than the + * 36 cycles that 12 dependent crc32 instructions would + * take. + * + * To have a chance of completely hiding the overhead for + * the crc update, the inner loop must take considerably + * longer than 24 cycles. LONG = 64 makes the inner loop + * take about 24 cycles, so is not quite large enough. + * LONG = 128 works OK. Unhideable overheads are about + * 12 cycles per inner loop. All assuming timing like + * Haswell. + */ + crc = crc32c_shift(crc32c_long, crc) ^ crc0; + crc1 = crc32c_shift(crc32c_long, crc1); + crc = crc32c_shift(crc32c_2long, crc) ^ crc1; + crc0 = crc2; next += LONG * 2; len -= LONG * 3; } + crc0 ^= crc; +#endif /* LONG > SHORT */ /* * Do the same thing, but now on SHORT*3 blocks for the remaining data * less than a LONG*3 block */ + crc = 0; while (len >= SHORT * 3) { crc1 = 0; crc2 = 0; @@ -259,11 +347,14 @@ sse42_crc32c(uint32_t crc, const unsigne #endif next += align; } while (next < end); - crc0 = crc32c_shift(crc32c_short, crc0) ^ crc1; - crc0 = crc32c_shift(crc32c_short, crc0) ^ crc2; + crc = crc32c_shift(crc32c_short, crc) ^ crc0; + crc1 = crc32c_shift(crc32c_short, crc1); + crc = crc32c_shift(crc32c_2short, crc) ^ crc1; + crc0 = crc2; next += SHORT * 2; len -= SHORT * 3; } + crc0 ^= crc; /* Compute the crc on the remaining bytes at native word size. */ end = next + (len - (len & (align - 1))); Modified: stable/11/sys/sys/libkern.h ============================================================================== --- stable/11/sys/sys/libkern.h Wed Apr 19 16:12:02 2017 (r317148) +++ stable/11/sys/sys/libkern.h Wed Apr 19 16:16:41 2017 (r317149) @@ -178,6 +178,11 @@ crc32(const void *buf, size_t size) uint32_t calculate_crc32c(uint32_t crc32c, const unsigned char *buffer, unsigned int length); +#ifdef _KERNEL +#if defined(__amd64__) || defined(__i386__) +uint32_t sse42_crc32c(uint32_t, const unsigned char *, unsigned); +#endif +#endif LIBKERN_INLINE void *memset(void *, int, size_t); Modified: stable/11/tests/sys/kern/Makefile ============================================================================== --- stable/11/tests/sys/kern/Makefile Wed Apr 19 16:12:02 2017 (r317148) +++ stable/11/tests/sys/kern/Makefile Wed Apr 19 16:16:41 2017 (r317149) @@ -25,13 +25,19 @@ NETBSD_ATF_TESTS_C+= mqueue_test CFLAGS.mqueue_test+= -I${SRCTOP}/tests LIBADD.mqueue_test+= rt +.if ${MACHINE_ARCH} == "amd64" || ${MACHINE_ARCH} == "i386" +ATF_TESTS_C+= libkern_crc32 +CFLAGS.libkern_crc32+= -msse4 -DUSERSPACE_TESTING +LDADD.libkern_crc32+= ${SRCTOP}/sys/libkern/x86/crc32_sse42.c +.endif + # subr_unit.c contains functions whose prototypes lie in headers that cannot be # included in userland. But as far as subr_unit_test goes, they're effectively # static. So it's ok to disable -Wmissing-prototypes for this program. CFLAGS.subr_unit.c+= -Wno-missing-prototypes SRCS.subr_unit_test+= subr_unit.c -WARNS?= 5 +WARNS?= 3 TESTS_SUBDIRS+= acct TESTS_SUBDIRS+= execve Copied: stable/11/tests/sys/kern/libkern_crc32.c (from r313006, head/tests/sys/kern/libkern_crc32.c) ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ stable/11/tests/sys/kern/libkern_crc32.c Wed Apr 19 16:16:41 2017 (r317149, copy of r313006, head/tests/sys/kern/libkern_crc32.c) @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2017 Conrad Meyer + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include + +#include + +#include + +extern uint32_t sse42_crc32c(uint32_t, const unsigned char *, unsigned); + +ATF_TC_WITHOUT_HEAD(crc32c_basic_correctness); +ATF_TC_BODY(crc32c_basic_correctness, tc) +{ + const uint64_t inputs[] = { + 0xf408c634b3a9142, + 0x80539e8c7c352e2b, + 0x62e9121db6e4d649, + 0x899345850ed0a286, + 0x2302df11b4a43b15, + 0xe943de7b3d35d70, + 0xdf1ff2bf41abf56b, + 0x9bc138abae315de2, + 0x31cc82e56234f0ff, + 0xce63c0cd6988e847, + 0x3e42f6b78ee352fa, + 0xfa4085436078cfa6, + 0x53349558bf670a4b, + 0x2714e10e7d722c61, + 0xc0d3261addfc6908, + 0xd1567c3181d3a1bf, + }; + const uint32_t results[] = { + 0x2ce33ede, + 0xc49cc573, + 0xb8683c96, + 0x6918660d, + 0xa904e522, + 0x52dbc42c, + 0x98863c22, + 0x894d5d2c, + 0xb003745d, + 0xfc496dbd, + 0x97d2fbb5, + 0x3c062ef1, + 0xcc2eff18, + 0x6a9b09f6, + 0x420242c1, + 0xfd562dc3, + }; + size_t i; + uint32_t act; + + ATF_REQUIRE(nitems(inputs) == nitems(results)); + + for (i = 0; i < nitems(inputs); i++) { + act = sse42_crc32c(~0, (const void *)&inputs[i], + sizeof(inputs[0])); + ATF_REQUIRE_MSG(act == results[i], + "crc32c(0x%jx) = 0x%08x, got 0x%08x", (uintmax_t)inputs[i], + results[i], act); + } +} + +ATF_TC_WITHOUT_HEAD(crc32c_alignment); +ATF_TC_BODY(crc32c_alignment, tc) +{ + const uint64_t input = 0xf408c634b3a9142; + const uint32_t result = 0x2ce33ede; + unsigned char buf[15]; + size_t i; + uint32_t act; + + + for (i = 1; i < 8; i++) { + memcpy(&buf[i], &input, sizeof(input)); + + act = sse42_crc32c(~0, (const void *)&buf[i], sizeof(input)); + ATF_REQUIRE_MSG(act == result, + "crc32c(0x%jx) = 0x%08x, got 0x%08x", (uintmax_t)input, + result, act); + } +} + +ATF_TC_WITHOUT_HEAD(crc32c_trailing_bytes); +ATF_TC_BODY(crc32c_trailing_bytes, tc) +{ + const unsigned char input[] = { + 0x87, 0x54, 0x74, 0xd2, 0xb, 0x9b, 0xdd, 0xf6, 0x68, 0x37, + 0xd4, 0x4, 0x5e, 0xa9, 0xb3 + }; + const uint32_t result = 0xec638d62; + uint32_t act; + + act = sse42_crc32c(~0, input, sizeof(input)); + ATF_REQUIRE_MSG(act == result, "expected 0x%08x, got 0x%08x", result, + act); +} + +ATF_TP_ADD_TCS(tp) +{ + + ATF_TP_ADD_TC(tp, crc32c_basic_correctness); + ATF_TP_ADD_TC(tp, crc32c_alignment); + ATF_TP_ADD_TC(tp, crc32c_trailing_bytes); + return (atf_no_error()); +}