Date: Tue, 2 Aug 2005 02:25:18 +0800 From: Xin LI <delphij@frontfree.net> To: freebsd-arch@FreeBSD.org, freebsd-amd64@FreeBSD.org Cc: obrien@FreeBSD.org Subject: [RFC] Port of NetBSD's optimized amd64 string code Message-ID: <20050801182518.GA85423@frontfree.net>
next in thread | raw e-mail | index | archive | help
--EuxKj2iCbKjpUGkD Content-Type: multipart/mixed; boundary="vtzGhvizbBRQ85DL" Content-Disposition: inline --vtzGhvizbBRQ85DL Content-Type: text/plain; charset=us-ascii Content-Disposition: inline Content-Transfer-Encoding: quoted-printable Hi, Guys, Here is a patchset that I have produced to make our libc aware of the NetBSD assembly implementation of the string related operations. Cheers, --=20 Xin LI <delphij frontfree net> http://www.delphij.net/ See complete headers for GPG key and other information. --vtzGhvizbBRQ85DL Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename="patch-libc::amd64-string" Content-Transfer-Encoding: quoted-printable Index: Makefile.inc =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D RCS file: /home/ncvs/src/lib/libc/amd64/string/Makefile.inc,v retrieving revision 1.5 diff -u -r1.5 Makefile.inc --- Makefile.inc 10 Apr 2005 18:58:49 -0000 1.5 +++ Makefile.inc 1 Aug 2005 18:18:29 -0000 @@ -1,4 +1,5 @@ # $FreeBSD: src/lib/libc/amd64/string/Makefile.inc,v 1.5 2005/04/10 18:58:= 49 alc Exp $ =20 -MDSRCS+=3D bcmp.S bcopy.S bzero.S memcmp.S memcpy.S memmove.S memset.S \ - strcat.S strcmp.S strcpy.S +MDSRCS+=3D bcmp.S bcopy.S bzero.S ffs.S index.S memchr.S memcmp.S memcpy.S= \ + memmove.S memset.S rindex.S strcat.S strchr.S strcmp.S strcpy.S \ + strlen.S strncmp.S strrchr.S swab.S Index: ffs.S =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D RCS file: ffs.S diff -N ffs.S --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ ffs.S 1 Aug 2005 17:54:04 -0000 @@ -0,0 +1,22 @@ +/* + * Written by J.T. Conklin <jtc@NetBSD.org>. + * Public domain. 
+ * Adapted for NetBSD/x86_64 by Frank van der Linden <fvdl@wasabisystems.c= om> + */ + +#include <machine/asm.h> +__FBSDID("$FreeBSD$"); + +#if 0 + RCSID("$NetBSD: ffs.S,v 1.2 2003/07/26 19:24:38 salo Exp $") +#endif + +ENTRY(ffs) + bsfl %edi,%eax + jz L1 /* ZF is set if all bits are 0 */ + incl %eax /* bits numbered from 1, not 0 */ + ret + + .align 4 +L1: xorl %eax,%eax /* clear result */ + ret Index: index.S =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D RCS file: index.S diff -N index.S --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ index.S 1 Aug 2005 18:08:21 -0000 @@ -0,0 +1,5 @@ +/* $NetBSD: index.S,v 1.3 2004/07/19 20:04:41 drochner Exp $ */ +/* $FreeBSD$ */ + +#define INDEX +#include "strchr.S" Index: memchr.S =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D RCS file: memchr.S diff -N memchr.S --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ memchr.S 1 Aug 2005 18:09:44 -0000 @@ -0,0 +1,112 @@ +/* + * Written by J.T. Conklin <jtc@acorntoolworks.com> + * Public domain. + */ + +#include <machine/asm.h> +__FBSDID("$FreeBSD$"); + +#if 0 + RCSID("$NetBSD: memchr.S,v 1.3 2004/07/19 20:04:41 drochner Exp $") +#endif + +ENTRY(memchr) + movzbq %sil,%rcx + + /* + * Align to word boundry + * Consider unrolling loop? + */ + testq %rdx,%rdx /* nbytes =3D=3D 0? 
*/ + je .Lzero +.Lalign: + testb $7,%dil + je .Lword_aligned + movq %rdi,%rax + cmpb (%rdi),%cl + je .Ldone + incq %rdi + decq %rdx + jnz .Lalign + jmp .Lzero + +.Lword_aligned: + /* copy char to all bytes in word */ + movb %cl,%ch + movq %rcx,%rsi + salq $16,%rcx + orq %rsi,%rcx + movq %rcx,%rsi + salq $32,%rcx + orq %rsi,%rcx + + movabsq $0x0101010101010101,%r8 + movabsq $0x8080808080808080,%r9 + + .align 4 +.Lloop: + cmpq $7,%rdx /* nbytes > 7 */ + jbe .Lbyte + movq (%rdi),%rsi + addq $8,%rdi + xorq %rcx,%rsi + subq $8,%rdx + subq %r8,%rsi + testq %r9,%rsi + je .Lloop + + /* + * In rare cases, the above loop may exit prematurely. We must + * return to the loop if none of the bytes in the word are + * equal to ch. + */ + + leaq -8(%rdi),%rax + cmpb -8(%rdi),%cl /* 1st byte =3D=3D ch? */ + je .Ldone + + leaq -7(%rdi),%rax + cmpb -7(%rdi),%cl /* 2nd byte =3D=3D ch? */ + je .Ldone + + leaq -6(%rdi),%rax + cmpb -6(%rdi),%cl /* 3rd byte =3D=3D ch? */ + je .Ldone + + leaq -5(%rdi),%rax + cmpb -5(%rdi),%cl /* 4th byte =3D=3D ch? */ + je .Ldone + + leaq -4(%rdi),%rax + cmpb -4(%rdi),%cl /* 5th byte =3D=3D ch? */ + je .Ldone + + leaq -3(%rdi),%rax + cmpb -3(%rdi),%cl /* 6th byte =3D=3D ch? */ + je .Ldone + + leaq -2(%rdi),%rax + cmpb -2(%rdi),%cl /* 7th byte =3D=3D ch? */ + je .Ldone + + leaq -1(%rdi),%rax + cmpb -1(%rdi),%cl /* 8th byte =3D=3D ch? 
*/ + jne .Lloop + ret + +.Lbyte: + testq %rdx,%rdx + je .Lzero +.Lbyte_loop: + movq %rdi,%rax + cmpb (%rdi),%cl + je .Ldone + incq %rdi + decq %rdx + jnz .Lbyte_loop + +.Lzero: + xorq %rax,%rax + +.Ldone: + ret Index: rindex.S =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D RCS file: rindex.S diff -N rindex.S --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ rindex.S 1 Aug 2005 18:10:36 -0000 @@ -0,0 +1,5 @@ +/* $NetBSD: rindex.S,v 1.3 2004/07/19 20:04:41 drochner Exp $ */ +/* $FreeBSD$ */ + +#define RINDEX +#include "strrchr.S" Index: strchr.S =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D RCS file: strchr.S diff -N strchr.S --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ strchr.S 1 Aug 2005 18:11:51 -0000 @@ -0,0 +1,137 @@ +/* + * Written by J.T. Conklin <jtc@acorntoolworks.com> + * Public domain. + */ + +#include <machine/asm.h> +__FBSDID("$FreeBSD$"); + +#if 0 + RCSID("$NetBSD: strchr.S,v 1.2 2004/07/19 20:04:41 drochner Exp $") +#endif + +#ifdef INDEX +ENTRY(index) +#else +ENTRY(strchr) +#endif + movzbq %sil,%rcx + + /* + * Align to word boundary. + * Consider unrolling loop? + */ +.Lalign: + testb $7,%dil + je .Lword_aligned + movb (%rdi),%dl + cmpb %cl,%dl + je .Ldone + incq %rdi + testb %dl,%dl + jne .Lalign + jmp .Lzero + +.Lword_aligned: + /* copy char to all bytes in word */ + movb %cl,%ch + movq %rcx,%rdx + salq $16,%rcx + orq %rdx,%rcx + movq %rcx,%rdx + salq $32,%rcx + orq %rdx,%rcx + + movabsq $0x0101010101010101,%r8 + movabsq $0x8080808080808080,%r9 + + /* Check whether any byte in the word is equal to ch or 0. 
*/ + .align 4 +.Lloop: + movq (%rdi),%rdx + addq $8,%rdi + movq %rdx,%rsi + subq %r8,%rdx + xorq %rcx,%rsi + subq %r8,%rsi + orq %rsi,%rdx + testq %r9,%rdx + je .Lloop + + /* + * In rare cases, the above loop may exit prematurely. We must + * return to the loop if none of the bytes in the word match + * ch or are equal to 0. + */ + + movb -8(%rdi),%dl + cmpb %cl,%dl /* 1st byte =3D=3D ch? */ + jne 1f + subq $8,%rdi + jmp .Ldone +1: testb %dl,%dl /* 1st byte =3D=3D 0? */ + je .Lzero + + movb -7(%rdi),%dl + cmpb %cl,%dl /* 2nd byte =3D=3D ch? */ + jne 1f + subq $7,%rdi + jmp .Ldone +1: testb %dl,%dl /* 2nd byte =3D=3D 0? */ + je .Lzero + + movb -6(%rdi),%dl + cmpb %cl,%dl /* 3rd byte =3D=3D ch? */ + jne 1f + subq $6,%rdi + jmp .Ldone +1: testb %dl,%dl /* 3rd byte =3D=3D 0? */ + je .Lzero + + movb -5(%rdi),%dl + cmpb %cl,%dl /* 4th byte =3D=3D ch? */ + jne 1f + subq $5,%rdi + jmp .Ldone +1: testb %dl,%dl /* 4th byte =3D=3D 0? */ + je .Lzero + + movb -4(%rdi),%dl + cmpb %cl,%dl /* 5th byte =3D=3D ch? */ + jne 1f + subq $4,%rdi + jmp .Ldone +1: testb %dl,%dl /* 5th byte =3D=3D 0? */ + je .Lzero + + movb -3(%rdi),%dl + cmpb %cl,%dl /* 6th byte =3D=3D ch? */ + jne 1f + subq $3,%rdi + jmp .Ldone +1: testb %dl,%dl /* 6th byte =3D=3D 0? */ + je .Lzero + + movb -2(%rdi),%dl + cmpb %cl,%dl /* 7th byte =3D=3D ch? */ + jne 1f + subq $2,%rdi + jmp .Ldone +1: testb %dl,%dl /* 7th byte =3D=3D 0? */ + je .Lzero + + movb -1(%rdi),%dl + cmpb %cl,%dl /* 8th byte =3D=3D ch? */ + jne 1f + subq $1,%rdi + jmp .Ldone +1: testb %dl,%dl /* 8th byte =3D=3D 0? */ + jne .Lloop + +.Lzero: + /* If a ch wasn't found, return 0. 
*/ + xorq %rdi,%rdi + +.Ldone: + movq %rdi,%rax + ret Index: strlen.S =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D RCS file: strlen.S diff -N strlen.S --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ strlen.S 1 Aug 2005 18:12:48 -0000 @@ -0,0 +1,157 @@ +/* + * Written by J.T. Conklin <jtc@acorntoolworks.com> + * Public domain. + */ + +#include <machine/asm.h> +__FBSDID("$FreeBSD$"); + +#if 0 + RCSID("$NetBSD: strlen.S,v 1.3 2004/07/19 20:04:41 drochner Exp $") +#endif + +ENTRY(strlen) + movq %rdi,%rax + negq %rdi + +.Lalign: + /* Consider unrolling loop? */ + testb $7,%al + je .Lword_aligned + cmpb $0,(%rax) + jne 1f + leaq (%rdi,%rax),%rax + ret +1: incq %rax + jmp .Lalign + + /* + * There are many well known branch-free sequences which are used + * for determining whether a zero-byte is contained within a word. + * These sequences are generally much more efficient than loading + * and comparing each byte individually. + * + * The expression [1,2]: + * + * (1) ~(((x & 0x7f....7f) + 0x7f....7f) | (x | 0x7f....7f)) + * + * evaluates to a non-zero value if any of the bytes in the + * original word is zero. + * + * It also has the useful property that bytes in the result word + * that correspond to non-zero bytes in the original word have + * the value 0x00, while bytes corresponding to zero bytes have + * the value 0x80. This allows calculation of the first (and + * last) occurrence of a zero byte within the word (useful for C's + * str* primitives) by counting the number of leading (or + * trailing) zeros and dividing the result by 8. On machines + * without (or with slow) clz() / ctz() instructions, testing + * each byte in the result word for zero is necessary. + * + * This typically takes 4 instructions (5 on machines without + * "not-or") not including those needed to load the constant. 
+ * + * + * The expression: + * + * (2) ((x - 0x01....01) & ~x & 0x80....80) + * + * evaluates to a non-zero value if any of the bytes in the + * original word is zero. + * + * On little endian machines, the first byte in the result word + * that corresponds to a zero byte in the original word is 0x80, + * so clz() can be used as above. On big endian machines, and + * little endian machines without (or with a slow) clz() insn, + * testing each byte in the original for zero is necessary. + * + * This typically takes 3 instructions (4 on machines without + * "and with complement") not including those needed to load + * constants. + * + * + * The expression: + * + * (3) ((x - 0x01....01) & 0x80....80) + * + * always evaluates to a non-zero value if any of the bytes in + * the original word is zero. However, in rare cases, it also + * evaluates to a non-zero value when none of the bytes in the + * original word is zero. + * + * To account for possible false positives, each byte of the + * original word must be checked when the expression evaluates to + * a non-zero value. However, because it is simpler than those + * presented above, code that uses it will be faster as long as + * the rate of false positives is low. + * + * This is likely, because the false positive can only occur + * if the most significant bit of a byte within the word is set. + * The expression will never fail for typical 7-bit ASCII strings. + * + * This typically takes 2 instructions not including those needed + * to load constants. + * + * + * [1] Henry S. Warren Jr., "Hacker's Delight", Addison-Wesley 2003 + * + * [2] International Business Machines, "The PowerPC Compiler Writer's + * Guide", Warthman Associates, 1996 + */ + + .align 4 +.Lword_aligned: + movabsq $0x0101010101010101,%r8 + movabsq $0x8080808080808080,%r9 +.Lloop: + movq (%rax),%rdx + addq $8,%rax + subq %r8,%rdx + testq %r9,%rdx + je .Lloop + + /* + * In rare cases, the above loop may exit prematurely. 
We must + * return to the loop if none of the bytes in the word equal 0. + */ + cmpb $0,-8(%rax) /* 1st byte =3D=3D 0? */ + je .Lsub8 + cmpb $0,-7(%rax) /* 2nd byte =3D=3D 0? */ + je .Lsub7 + cmpb $0,-6(%rax) /* 3rd byte =3D=3D 0? */ + je .Lsub6 + cmpb $0,-5(%rax) /* 4th byte =3D=3D 0? */ + je .Lsub5 + cmpb $0,-4(%rax) /* 5th byte =3D=3D 0? */ + je .Lsub4 + cmpb $0,-3(%rax) /* 6th byte =3D=3D 0? */ + je .Lsub3 + cmpb $0,-2(%rax) /* 7th byte =3D=3D 0? */ + je .Lsub2 + cmpb $0,-1(%rax) /* 8th byte =3D=3D 0? */ + jne .Lloop + +.Lsub1: + leaq -1(%rdi,%rax),%rax + ret +.Lsub2: + leaq -2(%rdi,%rax),%rax + ret +.Lsub3: + leaq -3(%rdi,%rax),%rax + ret +.Lsub4: + leaq -4(%rdi,%rax),%rax + ret +.Lsub5: + leaq -5(%rdi,%rax),%rax + ret +.Lsub6: + leaq -6(%rdi,%rax),%rax + ret +.Lsub7: + leaq -7(%rdi,%rax),%rax + ret +.Lsub8: + leaq -8(%rdi,%rax),%rax + ret Index: strncmp.S =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D RCS file: strncmp.S diff -N strncmp.S --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ strncmp.S 1 Aug 2005 18:13:51 -0000 @@ -0,0 +1,108 @@ +/* + * Written by J.T. Conklin <jtc@NetBSD.org>. + * Public domain. + */ + +#include <machine/asm.h> +__FBSDID("$FreeBSD$"); + +#if 0 + RCSID("$NetBSD: strncmp.S,v 1.2 2003/07/26 19:24:40 salo Exp $") +#endif + +/* + * NOTE: I've unrolled the loop eight times: large enough to make a + * significant difference, and small enough not to totally trash the + * cache. + */ + +ENTRY(strncmp) + testq %rdx,%rdx + jmp L2 /* Jump into the loop! 
*/ + +L1: incq %rdi + incq %rsi + decq %rdx +L2: jz L4 /* strings are equal */ + movb (%rdi),%al + testb %al,%al + jz L3 + cmpb %al,(%rsi) + jne L3 + + incq %rdi + incq %rsi + decq %rdx + jz L4 + movb (%rdi),%al + testb %al,%al + jz L3 + cmpb %al,(%rsi) + jne L3 + + incq %rdi + incq %rsi + decq %rdx + jz L4 + movb (%rdi),%al + testb %al,%al + jz L3 + cmpb %al,(%rsi) + jne L3 + + incq %rdi + incq %rsi + decq %rdx + jz L4 + movb (%rdi),%al + testb %al,%al + jz L3 + cmpb %al,(%rsi) + jne L3 + + incq %rdi + incq %rsi + decq %rdx + jz L4 + movb (%rdi),%al + testb %al,%al + jz L3 + cmpb %al,(%rsi) + jne L3 + + incq %rdi + incq %rsi + decq %rdx + jz L4 + movb (%rdi),%al + testb %al,%al + jz L3 + cmpb %al,(%rsi) + jne L3 + + incq %rdi + incq %rsi + decq %rdx + jz L4 + movb (%rdi),%al + testb %al,%al + jz L3 + cmpb %al,(%rsi) + jne L3 + + incq %rdi + incq %rsi + decq %rdx + jz L4 + movb (%rdi),%al + testb %al,%al + jz L3 + cmpb %al,(%rsi) + je L1 + +L3: movzbl (%rdi),%eax /* unsigned comparision */ + movzbl (%rsi),%ecx + subl %ecx,%eax + ret +L4: xorl %eax,%eax + ret Index: strrchr.S =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D RCS file: strrchr.S diff -N strrchr.S --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ strrchr.S 1 Aug 2005 18:15:07 -0000 @@ -0,0 +1,127 @@ +/* + * Written by J.T. Conklin <jtc@acorntoolworks.com> + * Public domain. + */ + +#include <machine/asm.h> +__FBSDID("$FreeBSD$"); + +#if 0 + RCSID("$NetBSD: strrchr.S,v 1.2 2004/07/19 20:04:41 drochner Exp $") +#endif + +#ifdef RINDEX +ENTRY(rindex) +#else +ENTRY(strrchr) +#endif + movzbq %sil,%rcx + + /* zero return value */ + xorq %rax,%rax + + /* + * Align to word boundary. + * Consider unrolling loop? 
+ */ +.Lalign: + testb $7,%dil + je .Lword_aligned + movb (%rdi),%dl + cmpb %cl,%dl + cmoveq %rdi,%rax + incq %rdi + testb %dl,%dl + jne .Lalign + jmp .Ldone + +.Lword_aligned: + /* copy char to all bytes in word */ + movb %cl,%ch + movq %rcx,%rdx + salq $16,%rcx + orq %rdx,%rcx + movq %rcx,%rdx + salq $32,%rcx + orq %rdx,%rcx + + movabsq $0x0101010101010101,%r8 + movabsq $0x8080808080808080,%r9 + + /* Check whether any byte in the word is equal to ch or 0. */ + .align 4 +.Lloop: + movq (%rdi),%rdx + addq $8,%rdi + movq %rdx,%rsi + subq %r8,%rdx + xorq %rcx,%rsi + subq %r8,%rsi + orq %rsi,%rdx + testq %r9,%rdx + je .Lloop + + /* + * In rare cases, the above loop may exit prematurely. We must + * return to the loop if none of the bytes in the word match + * ch or are equal to 0. + */ + + movb -8(%rdi),%dl + cmpb %cl,%dl /* 1st byte =3D=3D ch? */ + jne 1f + leaq -8(%rdi),%rax +1: testb %dl,%dl /* 1st byte =3D=3D 0? */ + je .Ldone + + movb -7(%rdi),%dl + cmpb %cl,%dl /* 2nd byte =3D=3D ch? */ + jne 1f + leaq -7(%rdi),%rax +1: testb %dl,%dl /* 2nd byte =3D=3D 0? */ + je .Ldone + + movb -6(%rdi),%dl + cmpb %cl,%dl /* 3rd byte =3D=3D ch? */ + jne 1f + leaq -6(%rdi),%rax +1: testb %dl,%dl /* 3rd byte =3D=3D 0? */ + je .Ldone + + movb -5(%rdi),%dl + cmpb %cl,%dl /* 4th byte =3D=3D ch? */ + jne 1f + leaq -5(%rdi),%rax +1: testb %dl,%dl /* 4th byte =3D=3D 0? */ + je .Ldone + + movb -4(%rdi),%dl + cmpb %cl,%dl /* 5th byte =3D=3D ch? */ + jne 1f + leaq -4(%rdi),%rax +1: testb %dl,%dl /* 5th byte =3D=3D 0? */ + je .Ldone + + movb -3(%rdi),%dl + cmpb %cl,%dl /* 6th byte =3D=3D ch? */ + jne 1f + leaq -3(%rdi),%rax +1: testb %dl,%dl /* 6th byte =3D=3D 0? */ + je .Ldone + + movb -2(%rdi),%dl + cmpb %cl,%dl /* 7th byte =3D=3D ch? */ + jne 1f + leaq -2(%rdi),%rax +1: testb %dl,%dl /* 7th byte =3D=3D 0? */ + je .Ldone + + movb -1(%rdi),%dl + cmpb %cl,%dl /* 8th byte =3D=3D ch? */ + jne 1f + leaq -1(%rdi),%rax +1: testb %dl,%dl /* 8th byte =3D=3D 0? 
*/ + jne .Lloop + +.Ldone: + ret Index: swab.S =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D RCS file: swab.S diff -N swab.S --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ swab.S 1 Aug 2005 18:18:17 -0000 @@ -0,0 +1,47 @@ +/* + * Written by J.T. Conklin <jtc@NetBSD.org>. + * Public domain. + */ + +#include <machine/asm.h> +__FBSDID("$FreeBSD$"); + +#if 0 + RCSID("$NetBSD: swab.S,v 1.2 2003/07/26 19:24:40 salo Exp $") +#endif + +#define LOAD_SWAP_STORE_WORD \ + lodsw ; \ + xchgb %al,%ah ; \ + stosw + +ENTRY(swab) + xchgq %rdi,%rsi + cld # set direction forward + + shrq $1,%rdx + testq $7,%rdx # copy first group of 1 to 7 words + jz L2 # while swaping alternate bytes. +L1: lodsw + rorw $8,%ax + stosw + decq %rdx + testq $7,%rdx + jnz L1 + +L2: shrq $3,%rdx # copy remainder 8 words at a time + jz L4 # while swapping alternate bytes. +L3: + LOAD_SWAP_STORE_WORD + LOAD_SWAP_STORE_WORD + LOAD_SWAP_STORE_WORD + LOAD_SWAP_STORE_WORD + LOAD_SWAP_STORE_WORD + LOAD_SWAP_STORE_WORD + LOAD_SWAP_STORE_WORD + LOAD_SWAP_STORE_WORD + + decq %rdx + jnz L3 +L4: + ret --vtzGhvizbBRQ85DL-- --EuxKj2iCbKjpUGkD Content-Type: application/pgp-signature Content-Disposition: inline -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.4.1 (FreeBSD) iD8DBQFC7mkO/cVsHxFZiIoRArcuAJ9AF9F0+YFYsQpLVPvnd3hGmKNXBgCdFAIS mrMJ3TeaXKrzkBqS3vxeQGI= =TJe9 -----END PGP SIGNATURE----- --EuxKj2iCbKjpUGkD--
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20050801182518.GA85423>