Skip site navigation (1) Skip section navigation (2)
Date:      Sat, 25 Jul 2020 00:24:11 +0000 (UTC)
From:      Mateusz Guzik <mjg@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-12@freebsd.org
Subject:   svn commit: r363505 - in stable/12: lib/libc/amd64/string sys/amd64/amd64
Message-ID:  <202007250024.06P0OBOp053636@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: mjg
Date: Sat Jul 25 00:24:11 2020
New Revision: 363505
URL: https://svnweb.freebsd.org/changeset/base/363505

Log:
  MFC r357208,r357309,r357239,r357310
  
      amd64: revamp memcmp
      amd64: speed up failing case for memcmp
      amd64: sync up libc memcmp with the kernel version (r357208)
      amd64: sync up libc memcmp with the kernel version (r357309)

Modified:
  stable/12/lib/libc/amd64/string/memcmp.S
  stable/12/sys/amd64/amd64/support.S
Directory Properties:
  stable/12/   (props changed)

Modified: stable/12/lib/libc/amd64/string/memcmp.S
==============================================================================
--- stable/12/lib/libc/amd64/string/memcmp.S	Sat Jul 25 00:03:23 2020	(r363504)
+++ stable/12/lib/libc/amd64/string/memcmp.S	Sat Jul 25 00:24:11 2020	(r363505)
@@ -31,91 +31,176 @@
 #include <machine/asm.h>
 __FBSDID("$FreeBSD$");
 
+#define ALIGN_TEXT      .p2align 4,0x90 /* 16-byte alignment, nop filled */
+
 ENTRY(memcmp)
-	cmpq	$16,%rdx
-	jae	5f
-1:
-	testq	%rdx,%rdx
-	je	3f
-	xorl	%ecx,%ecx
-2:
-	movzbl	(%rdi,%rcx,1),%eax
-	movzbl	(%rsi,%rcx,1),%r8d
-	cmpb	%r8b,%al
-	jne	4f
-	addq	$1,%rcx
-	cmpq	%rcx,%rdx
-	jz	3f
-	movzbl	(%rdi,%rcx,1),%eax
-	movzbl	(%rsi,%rcx,1),%r8d
-	cmpb	%r8b,%al
-	jne	4f
-	addq	$1,%rcx
-	cmpq	%rcx,%rdx
-	jz	3f
-	movzbl	(%rdi,%rcx,1),%eax
-	movzbl	(%rsi,%rcx,1),%r8d
-	cmpb	%r8b,%al
-	jne	4f
-	addq	$1,%rcx
-	cmpq	%rcx,%rdx
-	jz	3f
-	movzbl	(%rdi,%rcx,1),%eax
-	movzbl	(%rsi,%rcx,1),%r8d
-	cmpb	%r8b,%al
-	jne	4f
-	addq	$1,%rcx
-	cmpq	%rcx,%rdx
-	jne	2b
-3:
 	xorl	%eax,%eax
+10:
+	cmpq	$16,%rdx
+	ja	101632f
+
+100816:
+	cmpb	$8,%dl
+	jl	100408f
+	movq	(%rdi),%r8
+	movq	(%rsi),%r9
+	cmpq	%r8,%r9
+	jne	80f
+	movq	-8(%rdi,%rdx),%r8
+	movq	-8(%rsi,%rdx),%r9
+	cmpq	%r8,%r9
+	jne	10081608f
 	ret
-4:
+100408:
+	cmpb	$4,%dl
+	jl	100204f
+	movl	(%rdi),%r8d
+	movl	(%rsi),%r9d
+	cmpl	%r8d,%r9d
+	jne	80f
+	movl	-4(%rdi,%rdx),%r8d
+	movl	-4(%rsi,%rdx),%r9d
+	cmpl	%r8d,%r9d
+	jne	10040804f
+	ret
+100204:
+	cmpb	$2,%dl
+	jl	100001f
+	movzwl	(%rdi),%r8d
+	movzwl	(%rsi),%r9d
+	cmpl	%r8d,%r9d
+	jne	1f
+	movzwl	-2(%rdi,%rdx),%r8d
+	movzwl	-2(%rsi,%rdx),%r9d
+	cmpl	%r8d,%r9d
+	jne	1f
+	ret
+100001:
+	cmpb	$1,%dl
+	jl	100000f
+	movzbl	(%rdi),%eax
+	movzbl	(%rsi),%r8d
 	subl	%r8d,%eax
+100000:
 	ret
-5:
+ALIGN_TEXT
+101632:
 	cmpq	$32,%rdx
-	jae	7f
-6:
-	/*
-	 * 8 bytes
-	 */
+	ja	103200f
 	movq	(%rdi),%r8
 	movq	(%rsi),%r9
 	cmpq	%r8,%r9
-	jne	1b
-	leaq	8(%rdi),%rdi
-	leaq	8(%rsi),%rsi
-	subq	$8,%rdx
-	cmpq	$8,%rdx
-	jae	6b
-	jl	1b
-	jmp	3b
-7:
-	/*
-	 * 32 bytes
-	 */
-	movq	(%rsi),%r8
+	jne	80f
+	movq	8(%rdi),%r8
 	movq	8(%rsi),%r9
-	subq	(%rdi),%r8
-	subq	8(%rdi),%r9
-	or	%r8,%r9
-	jnz	1b
+	cmpq	%r8,%r9
+	jne	10163208f
+	movq	-16(%rdi,%rdx),%r8
+	movq	-16(%rsi,%rdx),%r9
+	cmpq	%r8,%r9
+	jne	10163216f
+	movq	-8(%rdi,%rdx),%r8
+	movq	-8(%rsi,%rdx),%r9
+	cmpq	%r8,%r9
+	jne	10163224f
+	ret
+ALIGN_TEXT
+103200:
+	movq	(%rdi),%r8
+	movq	8(%rdi),%r9
+	subq	(%rsi),%r8
+	subq	8(%rsi),%r9
+	orq	%r8,%r9
+	jnz	10320000f
 
-	movq	16(%rsi),%r8
-	movq	24(%rsi),%r9
-	subq	16(%rdi),%r8
-	subq	24(%rdi),%r9
-	or	%r8,%r9
-	jnz	1b
+	movq    16(%rdi),%r8
+	movq    24(%rdi),%r9
+	subq    16(%rsi),%r8
+	subq    24(%rsi),%r9
+	orq	%r8,%r9
+	jnz     10320016f
 
 	leaq	32(%rdi),%rdi
 	leaq	32(%rsi),%rsi
 	subq	$32,%rdx
 	cmpq	$32,%rdx
-	jae	7b
-	jnz	1b
-	jmp	3b
+	jae	103200b
+	cmpb	$0,%dl
+	jne	10b
+	ret
+
+/*
+ * Mismatch was found.
+ *
+ * Before we compute it we narrow down the range (16 -> 8 -> 4 bytes).
+ */
+ALIGN_TEXT
+10320016:
+	leaq	16(%rdi),%rdi
+	leaq	16(%rsi),%rsi
+10320000:
+	movq	(%rdi),%r8
+	movq	(%rsi),%r9
+	cmpq	%r8,%r9
+	jne	80f
+	leaq	8(%rdi),%rdi
+	leaq	8(%rsi),%rsi
+	jmp	80f
+ALIGN_TEXT
+10081608:
+10163224:
+	leaq	-8(%rdi,%rdx),%rdi
+	leaq	-8(%rsi,%rdx),%rsi
+	jmp	80f
+ALIGN_TEXT
+10163216:
+	leaq	-16(%rdi,%rdx),%rdi
+	leaq	-16(%rsi,%rdx),%rsi
+	jmp	80f
+ALIGN_TEXT
+10163208:
+	leaq	8(%rdi),%rdi
+	leaq	8(%rsi),%rsi
+	jmp	80f
+ALIGN_TEXT
+10040804:
+	leaq	-4(%rdi,%rdx),%rdi
+	leaq	-4(%rsi,%rdx),%rsi
+	jmp	1f
+
+ALIGN_TEXT
+80:
+	movl	(%rdi),%r8d
+	movl	(%rsi),%r9d
+	cmpl	%r8d,%r9d
+	jne	1f
+	leaq	4(%rdi),%rdi
+	leaq	4(%rsi),%rsi
+
+/*
+ * We have up to 4 bytes to inspect.
+ */
+1:
+	movzbl	(%rdi),%eax
+	movzbl	(%rsi),%r8d
+	cmpb	%r8b,%al
+	jne	2f
+
+	movzbl	1(%rdi),%eax
+	movzbl	1(%rsi),%r8d
+	cmpb	%r8b,%al
+	jne	2f
+
+	movzbl	2(%rdi),%eax
+	movzbl	2(%rsi),%r8d
+	cmpb	%r8b,%al
+	jne	2f
+
+	movzbl	3(%rdi),%eax
+	movzbl	3(%rsi),%r8d
+2:
+	subl	%r8d,%eax
+	ret
 END(memcmp)
 
 	.section .note.GNU-stack,"",%progbits

Modified: stable/12/sys/amd64/amd64/support.S
==============================================================================
--- stable/12/sys/amd64/amd64/support.S	Sat Jul 25 00:03:23 2020	(r363504)
+++ stable/12/sys/amd64/amd64/support.S	Sat Jul 25 00:24:11 2020	(r363505)
@@ -107,96 +107,185 @@ END(sse2_pagezero)
 
 /*
  * memcmpy(b1, b2, len)
- *	   rdi,rsi,len
+ *	   rdi,rsi,rdx
  */
 ENTRY(memcmp)
 	PUSH_FRAME_POINTER
-	cmpq	$16,%rdx
-	jae	5f
-1:
-	testq	%rdx,%rdx
-	je	3f
-	xorl	%ecx,%ecx
-2:
-	movzbl	(%rdi,%rcx,1),%eax
-	movzbl	(%rsi,%rcx,1),%r8d
-	cmpb	%r8b,%al
-	jne	4f
-	addq    $1,%rcx
-	cmpq    %rcx,%rdx
-	jz	3f
-	movzbl	(%rdi,%rcx,1),%eax
-	movzbl	(%rsi,%rcx,1),%r8d
-	cmpb	%r8b,%al
-	jne	4f
-	addq	$1,%rcx
-	cmpq	%rcx,%rdx
-	jz	3f
-	movzbl	(%rdi,%rcx,1),%eax
-	movzbl	(%rsi,%rcx,1),%r8d
-	cmpb	%r8b,%al
-	jne	4f
-	addq	$1,%rcx
-	cmpq	%rcx,%rdx
-	jz	3f
-	movzbl	(%rdi,%rcx,1),%eax
-	movzbl	(%rsi,%rcx,1),%r8d
-	cmpb	%r8b,%al
-	jne	4f
-	addq	$1,%rcx
-	cmpq	%rcx,%rdx
-	jne	2b
-3:
+
 	xorl	%eax,%eax
+10:
+	cmpq	$16,%rdx
+	ja	101632f
+
+100816:
+	cmpb	$8,%dl
+	jl	100408f
+	movq	(%rdi),%r8
+	movq	(%rsi),%r9
+	cmpq	%r8,%r9
+	jne	80f
+	movq	-8(%rdi,%rdx),%r8
+	movq	-8(%rsi,%rdx),%r9
+	cmpq	%r8,%r9
+	jne	10081608f
 	POP_FRAME_POINTER
 	ret
-4:
+100408:
+	cmpb	$4,%dl
+	jl	100204f
+	movl	(%rdi),%r8d
+	movl	(%rsi),%r9d
+	cmpl	%r8d,%r9d
+	jne	80f
+	movl	-4(%rdi,%rdx),%r8d
+	movl	-4(%rsi,%rdx),%r9d
+	cmpl	%r8d,%r9d
+	jne	10040804f
+	POP_FRAME_POINTER
+	ret
+100204:
+	cmpb	$2,%dl
+	jl	100001f
+	movzwl	(%rdi),%r8d
+	movzwl	(%rsi),%r9d
+	cmpl	%r8d,%r9d
+	jne	1f
+	movzwl	-2(%rdi,%rdx),%r8d
+	movzwl	-2(%rsi,%rdx),%r9d
+	cmpl	%r8d,%r9d
+	jne	1f
+	POP_FRAME_POINTER
+	ret
+100001:
+	cmpb	$1,%dl
+	jl	100000f
+	movzbl	(%rdi),%eax
+	movzbl	(%rsi),%r8d
 	subl	%r8d,%eax
+100000:
 	POP_FRAME_POINTER
 	ret
-5:
+ALIGN_TEXT
+101632:
 	cmpq	$32,%rdx
-	jae	7f
-6:
-	/*
-	 * 8 bytes
-	 */
-	movq    (%rdi),%r8
-	movq    (%rsi),%r9
-	cmpq    %r8,%r9
-	jne	1b
+	ja	103200f
+	movq	(%rdi),%r8
+	movq	(%rsi),%r9
+	cmpq	%r8,%r9
+	jne	80f
+	movq	8(%rdi),%r8
+	movq	8(%rsi),%r9
+	cmpq	%r8,%r9
+	jne	10163208f
+	movq	-16(%rdi,%rdx),%r8
+	movq	-16(%rsi,%rdx),%r9
+	cmpq	%r8,%r9
+	jne	10163216f
+	movq	-8(%rdi,%rdx),%r8
+	movq	-8(%rsi,%rdx),%r9
+	cmpq	%r8,%r9
+	jne	10163224f
+	POP_FRAME_POINTER
+	ret
+ALIGN_TEXT
+103200:
+	movq	(%rdi),%r8
+	movq	8(%rdi),%r9
+	subq	(%rsi),%r8
+	subq	8(%rsi),%r9
+	orq	%r8,%r9
+	jnz	10320000f
+
+	movq    16(%rdi),%r8
+	movq    24(%rdi),%r9
+	subq    16(%rsi),%r8
+	subq    24(%rsi),%r9
+	orq	%r8,%r9
+	jnz     10320016f
+
+	leaq	32(%rdi),%rdi
+	leaq	32(%rsi),%rsi
+	subq	$32,%rdx
+	cmpq	$32,%rdx
+	jae	103200b
+	cmpb	$0,%dl
+	jne	10b
+	POP_FRAME_POINTER
+	ret
+
+/*
+ * Mismatch was found.
+ *
+ * Before we compute it we narrow down the range (16 -> 8 -> 4 bytes).
+ */
+ALIGN_TEXT
+10320016:
+	leaq	16(%rdi),%rdi
+	leaq	16(%rsi),%rsi
+10320000:
+	movq	(%rdi),%r8
+	movq	(%rsi),%r9
+	cmpq	%r8,%r9
+	jne	80f
 	leaq	8(%rdi),%rdi
 	leaq	8(%rsi),%rsi
-	subq	$8,%rdx
-	cmpq	$8,%rdx
-	jae	6b
-	jl	1b
-	jmp	3b
-7:
-	/*
-	 * 32 bytes
-	 */
-	movq    (%rsi),%r8
-	movq    8(%rsi),%r9
-	subq    (%rdi),%r8
-	subq    8(%rdi),%r9
-	or	%r8,%r9
-	jnz	1b
+	jmp	80f
+ALIGN_TEXT
+10081608:
+10163224:
+	leaq	-8(%rdi,%rdx),%rdi
+	leaq	-8(%rsi,%rdx),%rsi
+	jmp	80f
+ALIGN_TEXT
+10163216:
+	leaq	-16(%rdi,%rdx),%rdi
+	leaq	-16(%rsi,%rdx),%rsi
+	jmp	80f
+ALIGN_TEXT
+10163208:
+	leaq	8(%rdi),%rdi
+	leaq	8(%rsi),%rsi
+	jmp	80f
+ALIGN_TEXT
+10040804:
+	leaq	-4(%rdi,%rdx),%rdi
+	leaq	-4(%rsi,%rdx),%rsi
+	jmp	1f
 
-	movq    16(%rsi),%r8
-	movq    24(%rsi),%r9
-	subq    16(%rdi),%r8
-	subq    24(%rdi),%r9
-	or	%r8,%r9
-	jnz	1b
+ALIGN_TEXT
+80:
+	movl	(%rdi),%r8d
+	movl	(%rsi),%r9d
+	cmpl	%r8d,%r9d
+	jne	1f
+	leaq	4(%rdi),%rdi
+	leaq	4(%rsi),%rsi
 
-	leaq    32(%rdi),%rdi
-	leaq    32(%rsi),%rsi
-	subq    $32,%rdx
-	cmpq    $32,%rdx
-	jae	7b
-	jnz	1b
-	jmp	3b
+/*
+ * We have up to 4 bytes to inspect.
+ */
+1:
+	movzbl	(%rdi),%eax
+	movzbl	(%rsi),%r8d
+	cmpb	%r8b,%al
+	jne	2f
+
+	movzbl	1(%rdi),%eax
+	movzbl	1(%rsi),%r8d
+	cmpb	%r8b,%al
+	jne	2f
+
+	movzbl	2(%rdi),%eax
+	movzbl	2(%rsi),%r8d
+	cmpb	%r8b,%al
+	jne	2f
+
+	movzbl	3(%rdi),%eax
+	movzbl	3(%rsi),%r8d
+2:
+	subl	%r8d,%eax
+	POP_FRAME_POINTER
+	ret
 END(memcmp)
 
 /*



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202007250024.06P0OBOp053636>