Skip site navigation (1) | Skip section navigation (2)
Date:      Thu, 27 Apr 2023 00:47:20 GMT
From:      Kyle Evans <kevans@FreeBSD.org>
To:        src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org
Subject:   git: 42f0ac5f1bd2 - main - Fix BLAKE3 aarch64 assembly for FreeBSD and macOS
Message-ID:  <202304270047.33R0lKKf025428@gitrepo.freebsd.org>

next in thread | raw e-mail | index | archive | help
The branch main has been updated by kevans:

URL: https://cgit.FreeBSD.org/src/commit/?id=42f0ac5f1bd2bb5c779ce51c369a0e47c62cbf9b

commit 42f0ac5f1bd2bb5c779ce51c369a0e47c62cbf9b
Author:     Tino Reichardt <milky-zfs@mcmilk.de>
AuthorDate: 2023-04-26 19:40:26 +0000
Commit:     Kyle Evans <kevans@FreeBSD.org>
CommitDate: 2023-04-27 00:46:47 +0000

    Fix BLAKE3 aarch64 assembly for FreeBSD and macOS
    
    The x18 register isn't usable within FreeBSD kernel space, so we
    have to fix the BLAKE3 aarch64 assembly so that it does not use it.
    
    The source files are here: https://github.com/mcmilk/BLAKE3-tests
    
    Reviewed-by: Kyle Evans <kevans@FreeBSD.org>
    Signed-off-by: Tino Reichardt <milky-zfs@mcmilk.de>
    Closes #14728
---
 .../icp/asm-aarch64/blake3/b3_aarch64_sse2.S       | 4163 +++++++++---------
 .../icp/asm-aarch64/blake3/b3_aarch64_sse41.S      | 4447 ++++++++++----------
 2 files changed, 4078 insertions(+), 4532 deletions(-)

diff --git a/sys/contrib/openzfs/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S b/sys/contrib/openzfs/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S
index 8237f0eb5a4e..dc2719d142db 100644
--- a/sys/contrib/openzfs/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S
+++ b/sys/contrib/openzfs/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S
@@ -22,480 +22,61 @@
 /*
  * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
  * Copyright (c) 2019-2022 Samuel Neves and Matthew Krupcale
- * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ * Copyright (c) 2022-2023 Tino Reichardt <milky-zfs@mcmilk.de>
  *
  * This is converted assembly: SSE2 -> ARMv8-A
  * Used tools: SIMDe https://github.com/simd-everywhere/simde
+ *
+ * Should work on FreeBSD, Linux and macOS
+ * see: https://github.com/mcmilk/BLAKE3-tests/blob/master/contrib/simde.sh
  */
 
 #if defined(__aarch64__)
 	.text
-	.section	.rodata.cst16,"aM",@progbits,16
-	.p2align	4
-.LCPI0_0:
-	.word	1779033703
-	.word	3144134277
-	.word	1013904242
-	.word	2773480762
-.LCPI0_1:
-	.xword	0
-	.xword	-4294967296
-.LCPI0_2:
-	.xword	-1
-	.xword	4294967295
+	.section	.note.gnu.property,"a",@note
+	.p2align	3
+	.word	4
+	.word	16
+	.word	5
+	.asciz	"GNU"
+	.word	3221225472
+	.word	4
+	.word	3
+	.word	0
+.Lsec_end0:
 	.text
 	.globl	zfs_blake3_compress_in_place_sse2
 	.p2align	2
 	.type	zfs_blake3_compress_in_place_sse2,@function
 zfs_blake3_compress_in_place_sse2:
 	.cfi_startproc
-	ldp	q3, q2, [x0]
-	ldp	q5, q6, [x1]
-	add	x10, x1, #32
-	lsr	x11, x3, #32
-	fmov	s4, w3
-	ld2	{ v17.4s, v18.4s }, [x10]
-	adrp	x10, .LCPI0_2
-	and	w8, w2, #0xff
-	mov	v4.s[1], w11
-	ldr	q1, [x10, :lo12:.LCPI0_2]
-	and	w9, w4, #0xff
-	adrp	x12, .LCPI0_0
-	mov	v4.s[2], w8
-	uzp1	v19.4s, v5.4s, v6.4s
-	add	v3.4s, v2.4s, v3.4s
-	ldr	q7, [x12, :lo12:.LCPI0_0]
-	mov	v4.s[3], w9
-	add	v3.4s, v3.4s, v19.4s
-	uzp2	v5.4s, v5.4s, v6.4s
-	ext	v21.16b, v18.16b, v18.16b, #12
-	uzp1	v6.4s, v19.4s, v19.4s
-	ext	v22.16b, v19.16b, v19.16b, #12
-	eor	v4.16b, v3.16b, v4.16b
-	ext	v20.16b, v17.16b, v17.16b, #12
-	ext	v6.16b, v6.16b, v19.16b, #8
-	ext	v19.16b, v19.16b, v22.16b, #12
-	zip1	v22.2d, v21.2d, v5.2d
-	rev32	v24.8h, v4.8h
-	mov	v4.16b, v1.16b
-	zip2	v23.4s, v5.4s, v21.4s
-	uzp2	v6.4s, v6.4s, v5.4s
-	bsl	v4.16b, v22.16b, v20.16b
-	add	v3.4s, v3.4s, v5.4s
-	zip1	v5.4s, v23.4s, v20.4s
-	zip1	v22.4s, v20.4s, v23.4s
-	add	v23.4s, v24.4s, v7.4s
-	ext	v7.16b, v6.16b, v6.16b, #4
-	ext	v25.16b, v4.16b, v4.16b, #12
-	ext	v5.16b, v22.16b, v5.16b, #8
-	eor	v2.16b, v23.16b, v2.16b
-	uzp1	v4.4s, v4.4s, v25.4s
-	uzp1	v22.4s, v7.4s, v7.4s
-	ext	v25.16b, v7.16b, v7.16b, #12
-	ext	v22.16b, v22.16b, v7.16b, #8
-	ext	v7.16b, v7.16b, v25.16b, #12
-	ushr	v25.4s, v2.4s, #12
-	shl	v2.4s, v2.4s, #20
-	orr	v2.16b, v2.16b, v25.16b
-	add	v3.4s, v3.4s, v2.4s
-	eor	v24.16b, v3.16b, v24.16b
-	add	v3.4s, v3.4s, v17.4s
-	ushr	v17.4s, v24.4s, #8
-	shl	v18.4s, v24.4s, #24
-	orr	v17.16b, v18.16b, v17.16b
-	add	v18.4s, v17.4s, v23.4s
-	eor	v2.16b, v18.16b, v2.16b
-	ushr	v23.4s, v2.4s, #7
-	shl	v2.4s, v2.4s, #25
-	ext	v3.16b, v3.16b, v3.16b, #12
-	orr	v2.16b, v2.16b, v23.16b
-	ext	v17.16b, v17.16b, v17.16b, #8
-	add	v3.4s, v2.4s, v3.4s
-	adrp	x11, .LCPI0_1
-	eor	v17.16b, v3.16b, v17.16b
-	ldr	q16, [x11, :lo12:.LCPI0_1]
-	ext	v18.16b, v18.16b, v18.16b, #4
-	rev32	v24.8h, v17.8h
-	movi	v0.2d, #0xffffffff00000000
-	add	v23.4s, v3.4s, v21.4s
-	mov	v21.s[1], v20.s[2]
-	add	v20.4s, v18.4s, v24.4s
-	bit	v19.16b, v21.16b, v0.16b
-	eor	v3.16b, v20.16b, v2.16b
-	uzp2	v2.4s, v22.4s, v19.4s
-	zip1	v17.2d, v5.2d, v19.2d
-	zip2	v18.4s, v19.4s, v5.4s
-	ushr	v21.4s, v3.4s, #12
-	shl	v3.4s, v3.4s, #20
-	ext	v22.16b, v2.16b, v2.16b, #4
-	bsl	v16.16b, v4.16b, v17.16b
-	zip1	v17.4s, v18.4s, v4.4s
-	zip1	v18.4s, v4.4s, v18.4s
-	orr	v21.16b, v3.16b, v21.16b
-	ext	v25.16b, v16.16b, v16.16b, #12
-	ext	v3.16b, v18.16b, v17.16b, #8
-	uzp1	v18.4s, v22.4s, v22.4s
-	ext	v26.16b, v22.16b, v22.16b, #12
-	add	v23.4s, v23.4s, v21.4s
-	uzp1	v17.4s, v16.4s, v25.4s
-	ext	v16.16b, v18.16b, v22.16b, #8
-	ext	v18.16b, v22.16b, v26.16b, #12
-	eor	v22.16b, v23.16b, v24.16b
-	add	v6.4s, v23.4s, v6.4s
-	ushr	v23.4s, v22.4s, #8
-	shl	v22.4s, v22.4s, #24
-	orr	v22.16b, v22.16b, v23.16b
-	add	v20.4s, v22.4s, v20.4s
-	eor	v21.16b, v20.16b, v21.16b
-	ushr	v23.4s, v21.4s, #7
-	shl	v21.4s, v21.4s, #25
-	ext	v6.16b, v6.16b, v6.16b, #4
-	orr	v21.16b, v21.16b, v23.16b
-	ext	v22.16b, v22.16b, v22.16b, #8
-	add	v6.4s, v21.4s, v6.4s
-	eor	v22.16b, v6.16b, v22.16b
-	ext	v20.16b, v20.16b, v20.16b, #12
-	add	v6.4s, v6.4s, v19.4s
-	rev32	v19.8h, v22.8h
-	add	v20.4s, v20.4s, v19.4s
-	eor	v21.16b, v20.16b, v21.16b
-	ushr	v22.4s, v21.4s, #12
-	shl	v21.4s, v21.4s, #20
-	orr	v21.16b, v21.16b, v22.16b
-	add	v6.4s, v6.4s, v21.4s
-	eor	v19.16b, v6.16b, v19.16b
-	ushr	v22.4s, v19.4s, #8
-	shl	v19.4s, v19.4s, #24
-	orr	v19.16b, v19.16b, v22.16b
-	add	v20.4s, v19.4s, v20.4s
-	eor	v21.16b, v20.16b, v21.16b
-	ext	v6.16b, v6.16b, v6.16b, #12
-	ushr	v22.4s, v21.4s, #7
-	shl	v21.4s, v21.4s, #25
-	add	v6.4s, v6.4s, v4.4s
-	orr	v21.16b, v21.16b, v22.16b
-	ext	v19.16b, v19.16b, v19.16b, #8
-	add	v6.4s, v6.4s, v21.4s
-	eor	v19.16b, v6.16b, v19.16b
-	ext	v20.16b, v20.16b, v20.16b, #4
-	rev32	v19.8h, v19.8h
-	add	v20.4s, v20.4s, v19.4s
-	add	v6.4s, v6.4s, v5.4s
-	mov	v5.s[1], v4.s[2]
-	eor	v4.16b, v20.16b, v21.16b
-	ushr	v21.4s, v4.4s, #12
-	shl	v4.4s, v4.4s, #20
-	orr	v21.16b, v4.16b, v21.16b
-	add	v6.4s, v6.4s, v21.4s
-	eor	v19.16b, v6.16b, v19.16b
-	add	v2.4s, v6.4s, v2.4s
-	ushr	v6.4s, v19.4s, #8
-	shl	v19.4s, v19.4s, #24
-	orr	v6.16b, v19.16b, v6.16b
-	add	v19.4s, v6.4s, v20.4s
-	eor	v20.16b, v19.16b, v21.16b
-	ushr	v21.4s, v20.4s, #7
-	shl	v20.4s, v20.4s, #25
-	ext	v2.16b, v2.16b, v2.16b, #4
-	orr	v20.16b, v20.16b, v21.16b
-	ext	v6.16b, v6.16b, v6.16b, #8
-	add	v2.4s, v20.4s, v2.4s
-	eor	v6.16b, v2.16b, v6.16b
-	ext	v19.16b, v19.16b, v19.16b, #12
-	rev32	v6.8h, v6.8h
-	add	v19.4s, v19.4s, v6.4s
-	mov	v22.16b, v0.16b
-	eor	v20.16b, v19.16b, v20.16b
-	bsl	v22.16b, v5.16b, v7.16b
-	ushr	v21.4s, v20.4s, #12
-	shl	v20.4s, v20.4s, #20
-	add	v2.4s, v2.4s, v22.4s
-	orr	v20.16b, v20.16b, v21.16b
-	add	v2.4s, v2.4s, v20.4s
-	eor	v6.16b, v2.16b, v6.16b
-	ushr	v21.4s, v6.4s, #8
-	shl	v6.4s, v6.4s, #24
-	orr	v6.16b, v6.16b, v21.16b
-	add	v19.4s, v6.4s, v19.4s
-	eor	v20.16b, v19.16b, v20.16b
-	ext	v2.16b, v2.16b, v2.16b, #12
-	ushr	v21.4s, v20.4s, #7
-	shl	v20.4s, v20.4s, #25
-	add	v2.4s, v2.4s, v17.4s
-	orr	v20.16b, v20.16b, v21.16b
-	ext	v6.16b, v6.16b, v6.16b, #8
-	add	v2.4s, v2.4s, v20.4s
-	eor	v6.16b, v2.16b, v6.16b
-	uzp2	v5.4s, v16.4s, v22.4s
-	zip1	v7.2d, v3.2d, v22.2d
-	zip2	v16.4s, v22.4s, v3.4s
-	ext	v19.16b, v19.16b, v19.16b, #4
-	rev32	v22.8h, v6.8h
-	ext	v23.16b, v5.16b, v5.16b, #4
-	bif	v7.16b, v17.16b, v1.16b
-	zip1	v24.4s, v16.4s, v17.4s
-	zip1	v16.4s, v17.4s, v16.4s
-	add	v21.4s, v2.4s, v3.4s
-	mov	v3.s[1], v17.s[2]
-	add	v17.4s, v19.4s, v22.4s
-	mov	v19.16b, v0.16b
-	ext	v25.16b, v7.16b, v7.16b, #12
-	ext	v4.16b, v16.16b, v24.16b, #8
-	uzp1	v16.4s, v23.4s, v23.4s
-	bsl	v19.16b, v3.16b, v18.16b
-	eor	v2.16b, v17.16b, v20.16b
-	uzp1	v7.4s, v7.4s, v25.4s
-	ext	v25.16b, v16.16b, v23.16b, #8
-	zip1	v3.2d, v4.2d, v19.2d
-	ushr	v20.4s, v2.4s, #12
-	shl	v2.4s, v2.4s, #20
-	ext	v24.16b, v23.16b, v23.16b, #12
-	uzp2	v6.4s, v25.4s, v19.4s
-	zip2	v18.4s, v19.4s, v4.4s
-	bif	v3.16b, v7.16b, v1.16b
-	orr	v20.16b, v2.16b, v20.16b
-	ext	v16.16b, v23.16b, v24.16b, #12
-	ext	v23.16b, v6.16b, v6.16b, #4
-	zip1	v24.4s, v18.4s, v7.4s
-	zip1	v18.4s, v7.4s, v18.4s
-	ext	v25.16b, v3.16b, v3.16b, #12
-	add	v21.4s, v21.4s, v20.4s
-	ext	v2.16b, v18.16b, v24.16b, #8
-	uzp1	v18.4s, v23.4s, v23.4s
-	ext	v24.16b, v23.16b, v23.16b, #12
-	uzp1	v3.4s, v3.4s, v25.4s
-	eor	v22.16b, v21.16b, v22.16b
-	ext	v25.16b, v18.16b, v23.16b, #8
-	dup	v18.4s, v2.s[3]
-	ext	v23.16b, v23.16b, v24.16b, #12
-	add	v5.4s, v21.4s, v5.4s
-	trn1	v21.4s, v3.4s, v3.4s
-	ushr	v24.4s, v22.4s, #8
-	shl	v22.4s, v22.4s, #24
-	ext	v18.16b, v21.16b, v18.16b, #8
-	orr	v21.16b, v22.16b, v24.16b
-	add	v17.4s, v21.4s, v17.4s
-	eor	v20.16b, v17.16b, v20.16b
-	ushr	v22.4s, v20.4s, #7
-	shl	v20.4s, v20.4s, #25
-	ext	v5.16b, v5.16b, v5.16b, #4
-	orr	v20.16b, v20.16b, v22.16b
-	ext	v21.16b, v21.16b, v21.16b, #8
-	add	v5.4s, v20.4s, v5.4s
-	eor	v21.16b, v5.16b, v21.16b
-	ext	v17.16b, v17.16b, v17.16b, #12
-	add	v5.4s, v5.4s, v19.4s
-	rev32	v19.8h, v21.8h
-	add	v17.4s, v17.4s, v19.4s
-	eor	v20.16b, v17.16b, v20.16b
-	ushr	v21.4s, v20.4s, #12
-	shl	v20.4s, v20.4s, #20
-	orr	v20.16b, v20.16b, v21.16b
-	add	v5.4s, v5.4s, v20.4s
-	eor	v19.16b, v5.16b, v19.16b
-	ushr	v21.4s, v19.4s, #8
-	shl	v19.4s, v19.4s, #24
-	orr	v19.16b, v19.16b, v21.16b
-	add	v17.4s, v19.4s, v17.4s
-	eor	v20.16b, v17.16b, v20.16b
-	ext	v5.16b, v5.16b, v5.16b, #12
-	ushr	v21.4s, v20.4s, #7
-	shl	v20.4s, v20.4s, #25
-	add	v5.4s, v5.4s, v7.4s
-	orr	v20.16b, v20.16b, v21.16b
-	ext	v19.16b, v19.16b, v19.16b, #8
-	add	v5.4s, v5.4s, v20.4s
-	eor	v19.16b, v5.16b, v19.16b
-	ext	v17.16b, v17.16b, v17.16b, #4
-	rev32	v22.8h, v19.8h
-	add	v21.4s, v5.4s, v4.4s
-	mov	v4.s[1], v7.s[2]
-	add	v19.4s, v17.4s, v22.4s
-	bit	v16.16b, v4.16b, v0.16b
-	eor	v5.16b, v19.16b, v20.16b
-	uzp2	v4.4s, v25.4s, v16.4s
-	zip1	v7.2d, v2.2d, v16.2d
-	zip2	v17.4s, v16.4s, v2.4s
-	ushr	v20.4s, v5.4s, #12
-	shl	v5.4s, v5.4s, #20
-	ext	v24.16b, v4.16b, v4.16b, #4
-	bif	v7.16b, v3.16b, v1.16b
-	zip1	v25.4s, v17.4s, v3.4s
-	zip1	v17.4s, v3.4s, v17.4s
-	orr	v20.16b, v5.16b, v20.16b
-	ext	v26.16b, v7.16b, v7.16b, #12
-	ext	v5.16b, v17.16b, v25.16b, #8
-	uzp1	v17.4s, v24.4s, v24.4s
-	ext	v25.16b, v24.16b, v24.16b, #12
-	bit	v23.16b, v18.16b, v0.16b
-	add	v21.4s, v21.4s, v20.4s
-	uzp1	v7.4s, v7.4s, v26.4s
-	ext	v26.16b, v17.16b, v24.16b, #8
-	ext	v17.16b, v24.16b, v25.16b, #12
-	eor	v22.16b, v21.16b, v22.16b
-	add	v6.4s, v21.4s, v6.4s
-	zip1	v21.2d, v5.2d, v23.2d
-	zip2	v24.4s, v23.4s, v5.4s
-	bif	v21.16b, v7.16b, v1.16b
-	zip1	v1.4s, v24.4s, v7.4s
-	zip1	v24.4s, v7.4s, v24.4s
-	ext	v1.16b, v24.16b, v1.16b, #8
-	ushr	v24.4s, v22.4s, #8
-	shl	v22.4s, v22.4s, #24
-	orr	v22.16b, v22.16b, v24.16b
-	add	v19.4s, v22.4s, v19.4s
-	ext	v24.16b, v21.16b, v21.16b, #12
-	eor	v20.16b, v19.16b, v20.16b
-	uzp1	v21.4s, v21.4s, v24.4s
-	ushr	v24.4s, v20.4s, #7
-	shl	v20.4s, v20.4s, #25
-	orr	v20.16b, v20.16b, v24.16b
-	ext	v6.16b, v6.16b, v6.16b, #4
-	ext	v22.16b, v22.16b, v22.16b, #8
-	add	v6.4s, v20.4s, v6.4s
-	eor	v22.16b, v6.16b, v22.16b
-	ext	v19.16b, v19.16b, v19.16b, #12
-	add	v6.4s, v6.4s, v16.4s
-	rev32	v16.8h, v22.8h
-	add	v19.4s, v19.4s, v16.4s
-	eor	v20.16b, v19.16b, v20.16b
-	ushr	v22.4s, v20.4s, #12
-	shl	v20.4s, v20.4s, #20
-	orr	v20.16b, v20.16b, v22.16b
-	add	v6.4s, v6.4s, v20.4s
-	eor	v16.16b, v6.16b, v16.16b
-	ext	v6.16b, v6.16b, v6.16b, #12
-	add	v3.4s, v6.4s, v3.4s
-	ushr	v6.4s, v16.4s, #8
-	shl	v16.4s, v16.4s, #24
-	orr	v6.16b, v16.16b, v6.16b
-	add	v16.4s, v6.4s, v19.4s
-	eor	v19.16b, v16.16b, v20.16b
-	ushr	v20.4s, v19.4s, #7
-	shl	v19.4s, v19.4s, #25
-	orr	v19.16b, v19.16b, v20.16b
-	ext	v6.16b, v6.16b, v6.16b, #8
-	add	v3.4s, v3.4s, v19.4s
-	eor	v6.16b, v3.16b, v6.16b
-	ext	v16.16b, v16.16b, v16.16b, #4
-	add	v2.4s, v3.4s, v2.4s
-	rev32	v3.8h, v6.8h
-	add	v6.4s, v16.4s, v3.4s
-	eor	v16.16b, v6.16b, v19.16b
-	ushr	v19.4s, v16.4s, #12
-	shl	v16.4s, v16.4s, #20
-	orr	v16.16b, v16.16b, v19.16b
-	add	v2.4s, v2.4s, v16.4s
-	eor	v3.16b, v2.16b, v3.16b
-	add	v2.4s, v2.4s, v4.4s
-	ushr	v4.4s, v3.4s, #8
-	shl	v3.4s, v3.4s, #24
-	orr	v3.16b, v3.16b, v4.16b
-	add	v4.4s, v3.4s, v6.4s
-	eor	v6.16b, v4.16b, v16.16b
-	ushr	v16.4s, v6.4s, #7
-	shl	v6.4s, v6.4s, #25
-	ext	v2.16b, v2.16b, v2.16b, #4
-	orr	v6.16b, v6.16b, v16.16b
-	ext	v3.16b, v3.16b, v3.16b, #8
-	add	v2.4s, v6.4s, v2.4s
-	eor	v3.16b, v2.16b, v3.16b
-	ext	v4.16b, v4.16b, v4.16b, #12
-	rev32	v3.8h, v3.8h
-	add	v4.4s, v4.4s, v3.4s
-	eor	v6.16b, v4.16b, v6.16b
-	ushr	v16.4s, v6.4s, #12
-	shl	v6.4s, v6.4s, #20
-	add	v2.4s, v2.4s, v23.4s
-	orr	v6.16b, v6.16b, v16.16b
-	add	v2.4s, v2.4s, v6.4s
-	eor	v3.16b, v2.16b, v3.16b
-	ushr	v16.4s, v3.4s, #8
-	shl	v3.4s, v3.4s, #24
-	orr	v3.16b, v3.16b, v16.16b
-	add	v4.4s, v3.4s, v4.4s
-	eor	v6.16b, v4.16b, v6.16b
-	ext	v2.16b, v2.16b, v2.16b, #12
-	ushr	v16.4s, v6.4s, #7
-	shl	v6.4s, v6.4s, #25
-	add	v2.4s, v2.4s, v7.4s
-	orr	v6.16b, v6.16b, v16.16b
-	ext	v3.16b, v3.16b, v3.16b, #8
-	add	v2.4s, v2.4s, v6.4s
-	eor	v3.16b, v2.16b, v3.16b
-	ext	v4.16b, v4.16b, v4.16b, #4
-	rev32	v3.8h, v3.8h
-	add	v2.4s, v2.4s, v5.4s
-	mov	v5.s[1], v7.s[2]
-	add	v4.4s, v4.4s, v3.4s
-	bsl	v0.16b, v5.16b, v17.16b
-	eor	v5.16b, v4.16b, v6.16b
-	ushr	v6.4s, v5.4s, #12
-	shl	v5.4s, v5.4s, #20
-	orr	v5.16b, v5.16b, v6.16b
-	add	v2.4s, v2.4s, v5.4s
-	eor	v3.16b, v2.16b, v3.16b
-	ushr	v6.4s, v3.4s, #8
-	shl	v3.4s, v3.4s, #24
-	orr	v3.16b, v3.16b, v6.16b
-	add	v4.4s, v3.4s, v4.4s
-	uzp2	v18.4s, v26.4s, v18.4s
-	eor	v5.16b, v4.16b, v5.16b
-	add	v2.4s, v2.4s, v18.4s
-	ushr	v6.4s, v5.4s, #7
-	shl	v5.4s, v5.4s, #25
-	ext	v2.16b, v2.16b, v2.16b, #4
-	orr	v5.16b, v5.16b, v6.16b
-	ext	v3.16b, v3.16b, v3.16b, #8
-	add	v2.4s, v5.4s, v2.4s
-	eor	v3.16b, v2.16b, v3.16b
-	ext	v4.16b, v4.16b, v4.16b, #12
-	add	v0.4s, v2.4s, v0.4s
-	rev32	v2.8h, v3.8h
-	add	v3.4s, v4.4s, v2.4s
-	eor	v4.16b, v3.16b, v5.16b
-	ushr	v5.4s, v4.4s, #12
-	shl	v4.4s, v4.4s, #20
-	orr	v4.16b, v4.16b, v5.16b
-	add	v0.4s, v0.4s, v4.4s
-	eor	v2.16b, v0.16b, v2.16b
-	ushr	v5.4s, v2.4s, #8
-	shl	v2.4s, v2.4s, #24
-	orr	v2.16b, v2.16b, v5.16b
-	add	v3.4s, v2.4s, v3.4s
-	eor	v4.16b, v3.16b, v4.16b
-	ext	v0.16b, v0.16b, v0.16b, #12
-	ushr	v5.4s, v4.4s, #7
-	shl	v4.4s, v4.4s, #25
-	add	v0.4s, v0.4s, v21.4s
-	orr	v4.16b, v4.16b, v5.16b
-	ext	v2.16b, v2.16b, v2.16b, #8
-	add	v0.4s, v0.4s, v4.4s
-	eor	v2.16b, v0.16b, v2.16b
-	ext	v3.16b, v3.16b, v3.16b, #4
-	add	v0.4s, v0.4s, v1.4s
-	rev32	v1.8h, v2.8h
-	add	v2.4s, v3.4s, v1.4s
-	eor	v3.16b, v2.16b, v4.16b
-	ushr	v4.4s, v3.4s, #12
-	shl	v3.4s, v3.4s, #20
-	orr	v3.16b, v3.16b, v4.16b
-	add	v0.4s, v0.4s, v3.4s
-	eor	v1.16b, v0.16b, v1.16b
-	ushr	v4.4s, v1.4s, #8
-	shl	v1.4s, v1.4s, #24
-	orr	v1.16b, v1.16b, v4.16b
-	add	v2.4s, v1.4s, v2.4s
-	eor	v3.16b, v2.16b, v3.16b
-	ext	v0.16b, v0.16b, v0.16b, #4
-	ext	v2.16b, v2.16b, v2.16b, #12
-	ushr	v4.4s, v3.4s, #7
-	shl	v3.4s, v3.4s, #25
-	ext	v1.16b, v1.16b, v1.16b, #8
+	hint	#25
+	.cfi_negate_ra_state
+	sub	sp, sp, #96
+	stp	x29, x30, [sp, #64]
+	add	x29, sp, #64
+	str	x19, [sp, #80]
+	.cfi_def_cfa w29, 32
+	.cfi_offset w19, -16
+	.cfi_offset w30, -24
+	.cfi_offset w29, -32
+	mov	x19, x0
+	mov	w5, w4
+	mov	x4, x3
+	mov	w3, w2
+	mov	x2, x1
+	mov	x0, sp
+	mov	x1, x19
+	bl	compress_pre
+	ldp	q0, q1, [sp]
+	ldp	q2, q3, [sp, #32]
 	eor	v0.16b, v2.16b, v0.16b
-	orr	v2.16b, v3.16b, v4.16b
-	eor	v1.16b, v2.16b, v1.16b
-	stp	q0, q1, [x0]
+	eor	v1.16b, v3.16b, v1.16b
+	ldp	x29, x30, [sp, #64]
+	stp	q0, q1, [x19]
+	ldr	x19, [sp, #80]
+	add	sp, sp, #96
+	hint	#29
 	ret
 .Lfunc_end0:
 	.size	zfs_blake3_compress_in_place_sse2, .Lfunc_end0-zfs_blake3_compress_in_place_sse2
@@ -504,483 +85,518 @@ zfs_blake3_compress_in_place_sse2:
 	.section	.rodata.cst16,"aM",@progbits,16
 	.p2align	4
 .LCPI1_0:
-	.word	1779033703
-	.word	3144134277
-	.word	1013904242
-	.word	2773480762
-.LCPI1_1:
-	.xword	0
-	.xword	-4294967296
-.LCPI1_2:
-	.xword	-1
-	.xword	4294967295
+	.xword	-4942790177982912921
+	.xword	-6534734903820487822
 	.text
-	.globl	zfs_blake3_compress_xof_sse2
 	.p2align	2
-	.type	zfs_blake3_compress_xof_sse2,@function
-zfs_blake3_compress_xof_sse2:
+	.type	compress_pre,@function
+compress_pre:
 	.cfi_startproc
-	ldp	q3, q2, [x0]
-	ldp	q5, q6, [x1]
-	add	x10, x1, #32
-	lsr	x11, x3, #32
-	fmov	s4, w3
-	ld2	{ v17.4s, v18.4s }, [x10]
-	adrp	x10, .LCPI1_2
-	and	w8, w2, #0xff
-	mov	v4.s[1], w11
-	ldr	q1, [x10, :lo12:.LCPI1_2]
-	and	w9, w4, #0xff
-	adrp	x12, .LCPI1_0
-	mov	v4.s[2], w8
-	uzp1	v19.4s, v5.4s, v6.4s
-	add	v3.4s, v2.4s, v3.4s
-	ldr	q7, [x12, :lo12:.LCPI1_0]
-	mov	v4.s[3], w9
-	add	v3.4s, v3.4s, v19.4s
-	uzp2	v5.4s, v5.4s, v6.4s
-	ext	v21.16b, v18.16b, v18.16b, #12
-	uzp1	v6.4s, v19.4s, v19.4s
-	ext	v22.16b, v19.16b, v19.16b, #12
-	eor	v4.16b, v3.16b, v4.16b
-	ext	v20.16b, v17.16b, v17.16b, #12
-	ext	v6.16b, v6.16b, v19.16b, #8
-	ext	v19.16b, v19.16b, v22.16b, #12
-	zip1	v22.2d, v21.2d, v5.2d
-	rev32	v24.8h, v4.8h
-	mov	v4.16b, v1.16b
-	zip2	v23.4s, v5.4s, v21.4s
-	uzp2	v6.4s, v6.4s, v5.4s
-	bsl	v4.16b, v22.16b, v20.16b
-	add	v3.4s, v3.4s, v5.4s
-	zip1	v5.4s, v23.4s, v20.4s
-	zip1	v22.4s, v20.4s, v23.4s
-	add	v23.4s, v24.4s, v7.4s
-	ext	v7.16b, v6.16b, v6.16b, #4
-	ext	v25.16b, v4.16b, v4.16b, #12
-	ext	v5.16b, v22.16b, v5.16b, #8
-	eor	v2.16b, v23.16b, v2.16b
-	uzp1	v4.4s, v4.4s, v25.4s
-	uzp1	v22.4s, v7.4s, v7.4s
-	ext	v25.16b, v7.16b, v7.16b, #12
-	ext	v22.16b, v22.16b, v7.16b, #8
-	ext	v7.16b, v7.16b, v25.16b, #12
-	ushr	v25.4s, v2.4s, #12
-	shl	v2.4s, v2.4s, #20
-	orr	v2.16b, v2.16b, v25.16b
-	add	v3.4s, v3.4s, v2.4s
-	eor	v24.16b, v3.16b, v24.16b
-	add	v3.4s, v3.4s, v17.4s
-	ushr	v17.4s, v24.4s, #8
-	shl	v18.4s, v24.4s, #24
-	orr	v17.16b, v18.16b, v17.16b
-	add	v18.4s, v17.4s, v23.4s
-	eor	v2.16b, v18.16b, v2.16b
-	ushr	v23.4s, v2.4s, #7
-	shl	v2.4s, v2.4s, #25
-	ext	v3.16b, v3.16b, v3.16b, #12
-	orr	v2.16b, v2.16b, v23.16b
-	ext	v17.16b, v17.16b, v17.16b, #8
-	add	v3.4s, v2.4s, v3.4s
-	adrp	x11, .LCPI1_1
-	eor	v17.16b, v3.16b, v17.16b
-	ldr	q16, [x11, :lo12:.LCPI1_1]
-	ext	v18.16b, v18.16b, v18.16b, #4
-	rev32	v24.8h, v17.8h
-	movi	v0.2d, #0xffffffff00000000
-	add	v23.4s, v3.4s, v21.4s
-	mov	v21.s[1], v20.s[2]
-	add	v20.4s, v18.4s, v24.4s
-	bit	v19.16b, v21.16b, v0.16b
-	eor	v3.16b, v20.16b, v2.16b
-	uzp2	v2.4s, v22.4s, v19.4s
-	zip1	v17.2d, v5.2d, v19.2d
-	zip2	v18.4s, v19.4s, v5.4s
-	ushr	v21.4s, v3.4s, #12
-	shl	v3.4s, v3.4s, #20
-	ext	v22.16b, v2.16b, v2.16b, #4
-	bsl	v16.16b, v4.16b, v17.16b
-	zip1	v17.4s, v18.4s, v4.4s
-	zip1	v18.4s, v4.4s, v18.4s
-	orr	v21.16b, v3.16b, v21.16b
-	ext	v25.16b, v16.16b, v16.16b, #12
-	ext	v3.16b, v18.16b, v17.16b, #8
-	uzp1	v18.4s, v22.4s, v22.4s
-	ext	v26.16b, v22.16b, v22.16b, #12
-	add	v23.4s, v23.4s, v21.4s
-	uzp1	v17.4s, v16.4s, v25.4s
-	ext	v16.16b, v18.16b, v22.16b, #8
-	ext	v18.16b, v22.16b, v26.16b, #12
-	eor	v22.16b, v23.16b, v24.16b
-	add	v6.4s, v23.4s, v6.4s
-	ushr	v23.4s, v22.4s, #8
-	shl	v22.4s, v22.4s, #24
-	orr	v22.16b, v22.16b, v23.16b
-	add	v20.4s, v22.4s, v20.4s
-	eor	v21.16b, v20.16b, v21.16b
-	ushr	v23.4s, v21.4s, #7
-	shl	v21.4s, v21.4s, #25
-	ext	v6.16b, v6.16b, v6.16b, #4
-	orr	v21.16b, v21.16b, v23.16b
-	ext	v22.16b, v22.16b, v22.16b, #8
-	add	v6.4s, v21.4s, v6.4s
-	eor	v22.16b, v6.16b, v22.16b
-	ext	v20.16b, v20.16b, v20.16b, #12
-	add	v6.4s, v6.4s, v19.4s
-	rev32	v19.8h, v22.8h
-	add	v20.4s, v20.4s, v19.4s
-	eor	v21.16b, v20.16b, v21.16b
-	ushr	v22.4s, v21.4s, #12
-	shl	v21.4s, v21.4s, #20
-	orr	v21.16b, v21.16b, v22.16b
-	add	v6.4s, v6.4s, v21.4s
-	eor	v19.16b, v6.16b, v19.16b
-	ushr	v22.4s, v19.4s, #8
-	shl	v19.4s, v19.4s, #24
-	orr	v19.16b, v19.16b, v22.16b
-	add	v20.4s, v19.4s, v20.4s
-	eor	v21.16b, v20.16b, v21.16b
-	ext	v6.16b, v6.16b, v6.16b, #12
-	ushr	v22.4s, v21.4s, #7
-	shl	v21.4s, v21.4s, #25
-	add	v6.4s, v6.4s, v4.4s
-	orr	v21.16b, v21.16b, v22.16b
-	ext	v19.16b, v19.16b, v19.16b, #8
-	add	v6.4s, v6.4s, v21.4s
-	eor	v19.16b, v6.16b, v19.16b
-	ext	v20.16b, v20.16b, v20.16b, #4
-	rev32	v19.8h, v19.8h
-	add	v20.4s, v20.4s, v19.4s
-	add	v6.4s, v6.4s, v5.4s
-	mov	v5.s[1], v4.s[2]
-	eor	v4.16b, v20.16b, v21.16b
-	ushr	v21.4s, v4.4s, #12
-	shl	v4.4s, v4.4s, #20
-	orr	v21.16b, v4.16b, v21.16b
-	add	v6.4s, v6.4s, v21.4s
-	eor	v19.16b, v6.16b, v19.16b
-	add	v2.4s, v6.4s, v2.4s
-	ushr	v6.4s, v19.4s, #8
-	shl	v19.4s, v19.4s, #24
-	orr	v6.16b, v19.16b, v6.16b
-	add	v19.4s, v6.4s, v20.4s
-	eor	v20.16b, v19.16b, v21.16b
-	ushr	v21.4s, v20.4s, #7
-	shl	v20.4s, v20.4s, #25
-	ext	v2.16b, v2.16b, v2.16b, #4
-	orr	v20.16b, v20.16b, v21.16b
-	ext	v6.16b, v6.16b, v6.16b, #8
-	add	v2.4s, v20.4s, v2.4s
-	eor	v6.16b, v2.16b, v6.16b
-	ext	v19.16b, v19.16b, v19.16b, #12
-	rev32	v6.8h, v6.8h
-	add	v19.4s, v19.4s, v6.4s
-	mov	v22.16b, v0.16b
-	eor	v20.16b, v19.16b, v20.16b
-	bsl	v22.16b, v5.16b, v7.16b
-	ushr	v21.4s, v20.4s, #12
-	shl	v20.4s, v20.4s, #20
-	add	v2.4s, v2.4s, v22.4s
-	orr	v20.16b, v20.16b, v21.16b
-	add	v2.4s, v2.4s, v20.4s
-	eor	v6.16b, v2.16b, v6.16b
-	ushr	v21.4s, v6.4s, #8
-	shl	v6.4s, v6.4s, #24
-	orr	v6.16b, v6.16b, v21.16b
-	add	v19.4s, v6.4s, v19.4s
-	eor	v20.16b, v19.16b, v20.16b
-	ext	v2.16b, v2.16b, v2.16b, #12
-	ushr	v21.4s, v20.4s, #7
-	shl	v20.4s, v20.4s, #25
-	add	v2.4s, v2.4s, v17.4s
-	orr	v20.16b, v20.16b, v21.16b
-	ext	v6.16b, v6.16b, v6.16b, #8
-	add	v2.4s, v2.4s, v20.4s
-	eor	v6.16b, v2.16b, v6.16b
-	uzp2	v5.4s, v16.4s, v22.4s
-	zip1	v7.2d, v3.2d, v22.2d
-	zip2	v16.4s, v22.4s, v3.4s
-	ext	v19.16b, v19.16b, v19.16b, #4
-	rev32	v22.8h, v6.8h
-	ext	v23.16b, v5.16b, v5.16b, #4
-	bif	v7.16b, v17.16b, v1.16b
-	zip1	v24.4s, v16.4s, v17.4s
-	zip1	v16.4s, v17.4s, v16.4s
-	add	v21.4s, v2.4s, v3.4s
-	mov	v3.s[1], v17.s[2]
-	add	v17.4s, v19.4s, v22.4s
-	mov	v19.16b, v0.16b
-	ext	v25.16b, v7.16b, v7.16b, #12
-	ext	v4.16b, v16.16b, v24.16b, #8
-	uzp1	v16.4s, v23.4s, v23.4s
-	bsl	v19.16b, v3.16b, v18.16b
-	eor	v2.16b, v17.16b, v20.16b
-	uzp1	v7.4s, v7.4s, v25.4s
-	ext	v25.16b, v16.16b, v23.16b, #8
-	zip1	v3.2d, v4.2d, v19.2d
-	ushr	v20.4s, v2.4s, #12
-	shl	v2.4s, v2.4s, #20
-	ext	v24.16b, v23.16b, v23.16b, #12
-	uzp2	v6.4s, v25.4s, v19.4s
-	zip2	v18.4s, v19.4s, v4.4s
-	bif	v3.16b, v7.16b, v1.16b
-	orr	v20.16b, v2.16b, v20.16b
-	ext	v16.16b, v23.16b, v24.16b, #12
-	ext	v23.16b, v6.16b, v6.16b, #4
-	zip1	v24.4s, v18.4s, v7.4s
-	zip1	v18.4s, v7.4s, v18.4s
-	ext	v25.16b, v3.16b, v3.16b, #12
-	add	v21.4s, v21.4s, v20.4s
-	ext	v2.16b, v18.16b, v24.16b, #8
-	uzp1	v18.4s, v23.4s, v23.4s
-	ext	v24.16b, v23.16b, v23.16b, #12
-	uzp1	v3.4s, v3.4s, v25.4s
-	eor	v22.16b, v21.16b, v22.16b
-	ext	v25.16b, v18.16b, v23.16b, #8
-	dup	v18.4s, v2.s[3]
-	ext	v23.16b, v23.16b, v24.16b, #12
-	add	v5.4s, v21.4s, v5.4s
-	trn1	v21.4s, v3.4s, v3.4s
-	ushr	v24.4s, v22.4s, #8
-	shl	v22.4s, v22.4s, #24
-	ext	v18.16b, v21.16b, v18.16b, #8
-	orr	v21.16b, v22.16b, v24.16b
-	add	v17.4s, v21.4s, v17.4s
-	eor	v20.16b, v17.16b, v20.16b
-	ushr	v22.4s, v20.4s, #7
-	shl	v20.4s, v20.4s, #25
-	ext	v5.16b, v5.16b, v5.16b, #4
-	orr	v20.16b, v20.16b, v22.16b
-	ext	v21.16b, v21.16b, v21.16b, #8
-	add	v5.4s, v20.4s, v5.4s
-	eor	v21.16b, v5.16b, v21.16b
-	ext	v17.16b, v17.16b, v17.16b, #12
-	add	v5.4s, v5.4s, v19.4s
-	rev32	v19.8h, v21.8h
-	add	v17.4s, v17.4s, v19.4s
-	eor	v20.16b, v17.16b, v20.16b
-	ushr	v21.4s, v20.4s, #12
-	shl	v20.4s, v20.4s, #20
-	orr	v20.16b, v20.16b, v21.16b
-	add	v5.4s, v5.4s, v20.4s
-	eor	v19.16b, v5.16b, v19.16b
-	ushr	v21.4s, v19.4s, #8
-	shl	v19.4s, v19.4s, #24
-	orr	v19.16b, v19.16b, v21.16b
-	add	v17.4s, v19.4s, v17.4s
-	eor	v20.16b, v17.16b, v20.16b
-	ext	v5.16b, v5.16b, v5.16b, #12
-	ushr	v21.4s, v20.4s, #7
-	shl	v20.4s, v20.4s, #25
-	add	v5.4s, v5.4s, v7.4s
-	orr	v20.16b, v20.16b, v21.16b
-	ext	v19.16b, v19.16b, v19.16b, #8
-	add	v5.4s, v5.4s, v20.4s
-	eor	v19.16b, v5.16b, v19.16b
-	ext	v17.16b, v17.16b, v17.16b, #4
-	rev32	v22.8h, v19.8h
-	add	v21.4s, v5.4s, v4.4s
-	mov	v4.s[1], v7.s[2]
-	add	v19.4s, v17.4s, v22.4s
-	bit	v16.16b, v4.16b, v0.16b
-	eor	v5.16b, v19.16b, v20.16b
-	uzp2	v4.4s, v25.4s, v16.4s
-	zip1	v7.2d, v2.2d, v16.2d
-	zip2	v17.4s, v16.4s, v2.4s
-	ushr	v20.4s, v5.4s, #12
-	shl	v5.4s, v5.4s, #20
-	ext	v24.16b, v4.16b, v4.16b, #4
-	bif	v7.16b, v3.16b, v1.16b
-	zip1	v25.4s, v17.4s, v3.4s
-	zip1	v17.4s, v3.4s, v17.4s
-	orr	v20.16b, v5.16b, v20.16b
-	ext	v26.16b, v7.16b, v7.16b, #12
-	ext	v5.16b, v17.16b, v25.16b, #8
-	uzp1	v17.4s, v24.4s, v24.4s
-	ext	v25.16b, v24.16b, v24.16b, #12
-	bit	v23.16b, v18.16b, v0.16b
-	add	v21.4s, v21.4s, v20.4s
-	uzp1	v7.4s, v7.4s, v26.4s
-	ext	v26.16b, v17.16b, v24.16b, #8
-	ext	v17.16b, v24.16b, v25.16b, #12
-	eor	v22.16b, v21.16b, v22.16b
-	add	v6.4s, v21.4s, v6.4s
-	zip1	v21.2d, v5.2d, v23.2d
-	zip2	v24.4s, v23.4s, v5.4s
-	bif	v21.16b, v7.16b, v1.16b
-	zip1	v1.4s, v24.4s, v7.4s
-	zip1	v24.4s, v7.4s, v24.4s
-	ext	v1.16b, v24.16b, v1.16b, #8
-	ushr	v24.4s, v22.4s, #8
-	shl	v22.4s, v22.4s, #24
-	orr	v22.16b, v22.16b, v24.16b
-	add	v19.4s, v22.4s, v19.4s
-	ext	v24.16b, v21.16b, v21.16b, #12
-	eor	v20.16b, v19.16b, v20.16b
-	uzp1	v21.4s, v21.4s, v24.4s
-	ushr	v24.4s, v20.4s, #7
-	shl	v20.4s, v20.4s, #25
-	orr	v20.16b, v20.16b, v24.16b
-	ext	v6.16b, v6.16b, v6.16b, #4
-	ext	v22.16b, v22.16b, v22.16b, #8
-	add	v6.4s, v20.4s, v6.4s
-	eor	v22.16b, v6.16b, v22.16b
-	ext	v19.16b, v19.16b, v19.16b, #12
-	add	v6.4s, v6.4s, v16.4s
-	rev32	v16.8h, v22.8h
-	add	v19.4s, v19.4s, v16.4s
-	eor	v20.16b, v19.16b, v20.16b
-	ushr	v22.4s, v20.4s, #12
-	shl	v20.4s, v20.4s, #20
-	orr	v20.16b, v20.16b, v22.16b
-	add	v6.4s, v6.4s, v20.4s
-	eor	v16.16b, v6.16b, v16.16b
-	ext	v6.16b, v6.16b, v6.16b, #12
-	add	v3.4s, v6.4s, v3.4s
-	ushr	v6.4s, v16.4s, #8
-	shl	v16.4s, v16.4s, #24
-	orr	v6.16b, v16.16b, v6.16b
-	add	v16.4s, v6.4s, v19.4s
-	eor	v19.16b, v16.16b, v20.16b
-	ushr	v20.4s, v19.4s, #7
-	shl	v19.4s, v19.4s, #25
-	orr	v19.16b, v19.16b, v20.16b
-	ext	v6.16b, v6.16b, v6.16b, #8
-	add	v3.4s, v3.4s, v19.4s
-	eor	v6.16b, v3.16b, v6.16b
-	ext	v16.16b, v16.16b, v16.16b, #4
-	add	v2.4s, v3.4s, v2.4s
-	rev32	v3.8h, v6.8h
-	add	v6.4s, v16.4s, v3.4s
-	eor	v16.16b, v6.16b, v19.16b
-	ushr	v19.4s, v16.4s, #12
-	shl	v16.4s, v16.4s, #20
-	orr	v16.16b, v16.16b, v19.16b
-	add	v2.4s, v2.4s, v16.4s
-	eor	v3.16b, v2.16b, v3.16b
-	add	v2.4s, v2.4s, v4.4s
-	ushr	v4.4s, v3.4s, #8
-	shl	v3.4s, v3.4s, #24
-	orr	v3.16b, v3.16b, v4.16b
-	add	v4.4s, v3.4s, v6.4s
-	eor	v6.16b, v4.16b, v16.16b
-	ushr	v16.4s, v6.4s, #7
-	shl	v6.4s, v6.4s, #25
-	ext	v2.16b, v2.16b, v2.16b, #4
-	orr	v6.16b, v6.16b, v16.16b
-	ext	v3.16b, v3.16b, v3.16b, #8
-	add	v2.4s, v6.4s, v2.4s
+	hint	#34
+	fmov	s1, w3
+	movi	d0, #0x0000ff000000ff
+	ldr	q2, [x1]
+	fmov	d3, x4
+	adrp	x8, .LCPI1_0
+	mov	v1.s[1], w5
+	str	q2, [x0]
+	ldr	q4, [x8, :lo12:.LCPI1_0]
+	add	x8, x2, #32
+	ldr	q5, [x1, #16]
+	and	v0.8b, v1.8b, v0.8b
+	stp	q5, q4, [x0, #16]
+	mov	v3.d[1], v0.d[0]
+	str	q3, [x0, #48]
+	ldp	q0, q6, [x2]
+	uzp1	v1.4s, v0.4s, v6.4s
+	uzp2	v0.4s, v0.4s, v6.4s
+	add	v2.4s, v2.4s, v1.4s
+	uzp1	v18.4s, v1.4s, v1.4s
+	add	v2.4s, v2.4s, v5.4s
 	eor	v3.16b, v2.16b, v3.16b
-	ext	v4.16b, v4.16b, v4.16b, #12
+	add	v2.4s, v2.4s, v0.4s
 	rev32	v3.8h, v3.8h
-	add	v4.4s, v4.4s, v3.4s
-	eor	v6.16b, v4.16b, v6.16b
-	ushr	v16.4s, v6.4s, #12
-	shl	v6.4s, v6.4s, #20
-	add	v2.4s, v2.4s, v23.4s
-	orr	v6.16b, v6.16b, v16.16b
-	add	v2.4s, v2.4s, v6.4s
+	add	v4.4s, v3.4s, v4.4s
+	eor	v5.16b, v4.16b, v5.16b
+	ushr	v6.4s, v5.4s, #12
+	shl	v5.4s, v5.4s, #20
+	orr	v5.16b, v5.16b, v6.16b
+	add	v2.4s, v2.4s, v5.4s
 	eor	v3.16b, v2.16b, v3.16b
-	ushr	v16.4s, v3.4s, #8
+	ushr	v6.4s, v3.4s, #8
 	shl	v3.4s, v3.4s, #24
-	orr	v3.16b, v3.16b, v16.16b
+	orr	v3.16b, v3.16b, v6.16b
+	ld2	{ v6.4s, v7.4s }, [x8]
 	add	v4.4s, v3.4s, v4.4s
-	eor	v6.16b, v4.16b, v6.16b
-	ext	v2.16b, v2.16b, v2.16b, #12
-	ushr	v16.4s, v6.4s, #7
-	shl	v6.4s, v6.4s, #25
-	add	v2.4s, v2.4s, v7.4s
-	orr	v6.16b, v6.16b, v16.16b
 	ext	v3.16b, v3.16b, v3.16b, #8
 	add	v2.4s, v2.4s, v6.4s
-	eor	v3.16b, v2.16b, v3.16b
+	eor	v5.16b, v4.16b, v5.16b
 	ext	v4.16b, v4.16b, v4.16b, #4
-	rev32	v3.8h, v3.8h
+	ext	v6.16b, v6.16b, v6.16b, #12
+	ext	v2.16b, v2.16b, v2.16b, #12
+	ushr	v16.4s, v5.4s, #7
+	shl	v5.4s, v5.4s, #25
*** 7941 LINES SKIPPED ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202304270047.33R0lKKf025428>