From owner-svn-src-all@freebsd.org  Thu Oct 11 23:37:58 2018
From: Mateusz Guzik <mjg@FreeBSD.org>
Date: Thu, 11 Oct 2018 23:37:58 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject: svn commit: r339322 - head/sys/amd64/amd64
Message-Id: <201810112337.w9BNbwP8004024@repo.freebsd.org>
X-SVN-Group: head
X-SVN-Commit-Author: mjg
X-SVN-Commit-Paths: head/sys/amd64/amd64
X-SVN-Commit-Revision: 339322
X-SVN-Commit-Repository: base

Author: mjg
Date: Thu Oct 11 23:37:57 2018
New Revision: 339322
URL: https://svnweb.freebsd.org/changeset/base/339322

Log:
  amd64: make memmove and memcpy less slow with mov

  The reasoning is the same as with the memset change, see r339205

  Reviewed by:	kib (previous version)
  Approved by:	re (gjb)
  Sponsored by:	The FreeBSD Foundation
  Differential Revision:	https://reviews.freebsd.org/D17441

Modified:
  head/sys/amd64/amd64/support.S

Modified: head/sys/amd64/amd64/support.S
==============================================================================
--- head/sys/amd64/amd64/support.S	Thu Oct 11 23:28:04 2018	(r339321)
+++ head/sys/amd64/amd64/support.S	Thu Oct 11 23:37:57 2018	(r339322)
@@ -200,82 +200,236 @@ END(memcmp)
  * Adapted from bcopy written by:
  *  ws@tools.de     (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
  */
-ENTRY(memmove_std)
-	PUSH_FRAME_POINTER
-	movq	%rdi,%rax
-	movq	%rdx,%rcx
+/*
+ * Register state at entry is supposed to be as follows:
+ * rdi - destination
+ * rsi - source
+ * rdx - count
+ *
+ * The macro possibly clobbers the above and: rcx, r8.
+ * It does not clobber rax, r10 nor r11.
+ */
+.macro MEMMOVE erms overlap begin end
+	\begin
+.if \overlap == 1
 	movq	%rdi,%r8
 	subq	%rsi,%r8
-	cmpq	%rcx,%r8			/* overlapping && src < dst? */
+	cmpq	%rcx,%r8	/* overlapping && src < dst? */
 	jb	2f
+.endif
 
-	cmpq	$15,%rcx
-	jbe	1f
-	shrq	$3,%rcx				/* copy by 64-bit words */
-	rep
-	movsq
-	movq	%rdx,%rcx
-	andq	$7,%rcx				/* any bytes left? */
-	jne	1f
-	POP_FRAME_POINTER
+	cmpq	$32,%rcx
+	jb	1016f
+
+	cmpq	$256,%rcx
+	ja	1256f
+
+1032:
+	movq	(%rsi),%rdx
+	movq	%rdx,(%rdi)
+	movq	8(%rsi),%rdx
+	movq	%rdx,8(%rdi)
+	movq	16(%rsi),%rdx
+	movq	%rdx,16(%rdi)
+	movq	24(%rsi),%rdx
+	movq	%rdx,24(%rdi)
+	leaq	32(%rsi),%rsi
+	leaq	32(%rdi),%rdi
+	subq	$32,%rcx
+	cmpq	$32,%rcx
+	jae	1032b
+	cmpb	$0,%cl
+	jne	1016f
+	\end
 	ret
 	ALIGN_TEXT
-1:
+1016:
+	cmpb	$16,%cl
+	jl	1008f
+	movq	(%rsi),%rdx
+	movq	%rdx,(%rdi)
+	movq	8(%rsi),%rdx
+	movq	%rdx,8(%rdi)
+	subb	$16,%cl
+	jz	1000f
+	leaq	16(%rsi),%rsi
+	leaq	16(%rdi),%rdi
+1008:
+	cmpb	$8,%cl
+	jl	1004f
+	movq	(%rsi),%rdx
+	movq	%rdx,(%rdi)
+	subb	$8,%cl
+	jz	1000f
+	leaq	8(%rsi),%rsi
+	leaq	8(%rdi),%rdi
+1004:
+	cmpb	$4,%cl
+	jl	1002f
+	movl	(%rsi),%edx
+	movl	%edx,(%rdi)
+	subb	$4,%cl
+	jz	1000f
+	leaq	4(%rsi),%rsi
+	leaq	4(%rdi),%rdi
+1002:
+	cmpb	$2,%cl
+	jl	1001f
+	movw	(%rsi),%dx
+	movw	%dx,(%rdi)
+	subb	$2,%cl
+	jz	1000f
+	leaq	2(%rsi),%rsi
+	leaq	2(%rdi),%rdi
+1001:
+	cmpb	$1,%cl
+	jl	1000f
+	movb	(%rsi),%dl
+	movb	%dl,(%rdi)
+1000:
+	\end
+	ret
+
+	ALIGN_TEXT
+1256:
+.if \erms == 1
 	rep
 	movsb
-	POP_FRAME_POINTER
+.else
+	shrq	$3,%rcx				/* copy by 64-bit words */
+	rep
+	movsq
+	movq	%rdx,%rcx
+	andb	$7,%cl				/* any bytes left? */
+	jne	1004b
+.endif
+	\end
 	ret
 
-	/* ALIGN_TEXT */
+.if \overlap == 1
+	/*
+	 * Copy backwards.
+	 */
+	ALIGN_TEXT
 2:
-	addq	%rcx,%rdi			/* copy backwards */
+	addq	%rcx,%rdi
 	addq	%rcx,%rsi
+
+	cmpq	$32,%rcx
+	jb	2016f
+
+	cmpq	$256,%rcx
+	ja	2256f
+
+2032:
+	movq	-8(%rsi),%rdx
+	movq	%rdx,-8(%rdi)
+	movq	-16(%rsi),%rdx
+	movq	%rdx,-16(%rdi)
+	movq	-24(%rsi),%rdx
+	movq	%rdx,-24(%rdi)
+	movq	-32(%rsi),%rdx
+	movq	%rdx,-32(%rdi)
+	leaq	-32(%rsi),%rsi
+	leaq	-32(%rdi),%rdi
+	subq	$32,%rcx
+	cmpq	$32,%rcx
+	jae	2032b
+	cmpb	$0,%cl
+	jne	2016f
+	\end
+	ret
+	ALIGN_TEXT
+2016:
+	cmpb	$16,%cl
+	jl	2008f
+	movq	-8(%rsi),%rdx
+	movq	%rdx,-8(%rdi)
+	movq	-16(%rsi),%rdx
+	movq	%rdx,-16(%rdi)
+	subb	$16,%cl
+	jz	2000f
+	leaq	-16(%rsi),%rsi
+	leaq	-16(%rdi),%rdi
+2008:
+	cmpb	$8,%cl
+	jl	2004f
+	movq	-8(%rsi),%rdx
+	movq	%rdx,-8(%rdi)
+	subb	$8,%cl
+	jz	2000f
+	leaq	-8(%rsi),%rsi
+	leaq	-8(%rdi),%rdi
+2004:
+	cmpb	$4,%cl
+	jl	2002f
+	movl	-4(%rsi),%edx
+	movl	%edx,-4(%rdi)
+	subb	$4,%cl
+	jz	2000f
+	leaq	-4(%rsi),%rsi
+	leaq	-4(%rdi),%rdi
+2002:
+	cmpb	$2,%cl
+	jl	2001f
+	movw	-2(%rsi),%dx
+	movw	%dx,-2(%rdi)
+	subb	$2,%cl
+	jz	2000f
+	leaq	-2(%rsi),%rsi
+	leaq	-2(%rdi),%rdi
+2001:
+	cmpb	$1,%cl
+	jl	2000f
+	movb	-1(%rsi),%dl
+	movb	%dl,-1(%rdi)
+2000:
+	\end
+	ret
+	ALIGN_TEXT
+2256:
 	decq	%rdi
 	decq	%rsi
 	std
-	andq	$7,%rcx				/* any fractional bytes? */
+.if \erms == 1
+	rep
+	movsb
+.else
+	andq	$7,%rcx				/* any fractional bytes? */
 	je	3f
 	rep
 	movsb
 3:
-	movq	%rdx,%rcx			/* copy remainder by 32-bit words */
+	movq	%rdx,%rcx	/* copy remainder by 32-bit words */
 	shrq	$3,%rcx
 	subq	$7,%rsi
 	subq	$7,%rdi
 	rep
 	movsq
+.endif
 	cld
-	POP_FRAME_POINTER
+	\end
 	ret
-END(memmove_std)
+.endif
+.endm
 
-ENTRY(memmove_erms)
+.macro MEMMOVE_BEGIN
 	PUSH_FRAME_POINTER
 	movq	%rdi,%rax
 	movq	%rdx,%rcx
+.endm
 
-	movq	%rdi,%r8
-	subq	%rsi,%r8
-	cmpq	%rcx,%r8			/* overlapping && src < dst? */
-	jb	1f
-
-	rep
-	movsb
+.macro MEMMOVE_END
 	POP_FRAME_POINTER
-	ret
+.endm
 
-1:
-	addq	%rcx,%rdi			/* copy backwards */
-	addq	%rcx,%rsi
-	decq	%rdi
-	decq	%rsi
-	std
-	rep
-	movsb
-	cld
-	POP_FRAME_POINTER
-	ret
+ENTRY(memmove_std)
+	MEMMOVE erms=0 overlap=1 begin=MEMMOVE_BEGIN end=MEMMOVE_END
+END(memmove_std)
+
+ENTRY(memmove_erms)
+	MEMMOVE erms=1 overlap=1 begin=MEMMOVE_BEGIN end=MEMMOVE_END
 END(memmove_erms)
 
 /*
@@ -285,35 +439,11 @@ END(memmove_erms)
  * Note: memcpy does not support overlapping copies
  */
 ENTRY(memcpy_std)
-	PUSH_FRAME_POINTER
-	movq	%rdi,%rax
-	movq	%rdx,%rcx
-	cmpq	$15,%rcx
-	jbe	1f
-	shrq	$3,%rcx				/* copy by 64-bit words */
-	rep
-	movsq
-	movq	%rdx,%rcx
-	andq	$7,%rcx				/* any bytes left? */
-	jne	1f
-	POP_FRAME_POINTER
-	ret
-	ALIGN_TEXT
-1:
-	rep
-	movsb
-	POP_FRAME_POINTER
-	ret
+	MEMMOVE erms=0 overlap=0 begin=MEMMOVE_BEGIN end=MEMMOVE_END
 END(memcpy_std)
 
 ENTRY(memcpy_erms)
-	PUSH_FRAME_POINTER
-	movq	%rdi,%rax
-	movq	%rdx,%rcx
-	rep
-	movsb
-	POP_FRAME_POINTER
-	ret
+	MEMMOVE erms=1 overlap=0 begin=MEMMOVE_BEGIN end=MEMMOVE_END
 END(memcpy_erms)
 
 /*
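
For readers who do not want to trace the assembly, the new MEMMOVE macro boils
down to a size-tiered copy: below 32 bytes a 16/8/4/2/1 ladder of plain moves,
32-256 bytes an unrolled 32-byte loop, and above 256 bytes a rep movsb (erms)
or rep movsq (std) path, with an equivalent backward walk when the buffers
overlap and src < dst.  The following C sketch is illustrative only, not
kernel code; sketch_memmove and its erms flag are invented stand-ins for the
*_erms/*_std entry points the kernel selects between at boot:

/*
 * Illustrative sketch of the size-tiered strategy; NOT the kernel's code.
 */
#include <stddef.h>
#include <stdint.h>

void *
sketch_memmove(void *dst, const void *src, size_t len, int erms)
{
	unsigned char *d = dst;
	const unsigned char *s = src;
	size_t i;

	/* Unsigned "dst - src < len" test, as the macro emits for overlap=1. */
	if ((uintptr_t)d - (uintptr_t)s < len) {
		while (len--)			/* label 2: copy backwards */
			d[len] = s[len];
		return (dst);
	}

	if (len >= 32) {
		if (len > 256) {
			/*
			 * Large copies: label 1256; rep movsb with ERMS,
			 * otherwise rep movsq plus a byte tail.  A plain
			 * byte loop stands in for both here.
			 */
			(void)erms;
			for (i = 0; i < len; i++)
				d[i] = s[i];
			return (dst);
		}
		/* 32..256 bytes: the unrolled 32-byte loop at label 1032. */
		while (len >= 32) {
			for (i = 0; i < 32; i++)
				d[i] = s[i];
			d += 32;
			s += 32;
			len -= 32;
		}
	}
	/* Sub-32-byte tail: the 16/8/4/2/1 ladder at labels 1016..1001. */
	for (i = 0; i < len; i++)
		d[i] = s[i];
	return (dst);
}

The cut-offs presumably follow the same reasoning referenced from r339205:
rep movs has a noticeable startup cost, so short copies come out ahead with
plain mov instructions, while copies past a few hundred bytes amortize that
cost and can rely on rep movsb on ERMS-capable CPUs (rep movsq otherwise).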