Date: Sun, 08 Sep 2019 05:55:51 -0000
From: "Stefan Kanthak" <stefan.kanthak@nexgo.de>
To: <freebsd-numerics@freebsd.org>, <freebsd-hackers@freebsd.org>
Cc: <das@FreeBSD.ORG>
Subject: Shorter releng/12.0/lib/msun/i387/s_remquo.S, releng/12.0/lib/msun/amd64/s_remquo.S, ...
Message-ID: <174BDDD122964DA9AD32D77663AB863D@H270>
Hi,

here's a patch to shave 4 instructions (and about 25% of the code size) from

http://sources.freebsd.org/releng/12.0/lib/msun/i387/s_remquo.S
http://sources.freebsd.org/releng/12.0/lib/msun/i387/s_remquof.S
http://sources.freebsd.org/releng/12.0/lib/msun/i387/s_remquol.S
http://sources.freebsd.org/releng/12.0/lib/msun/amd64/s_remquo.S
http://sources.freebsd.org/releng/12.0/lib/msun/amd64/s_remquof.S
http://sources.freebsd.org/releng/12.0/lib/msun/amd64/s_remquol.S

Especially the negation is rather clumsy:

1. the two shifts by 16 that propagate the sign bit to all bits can be
   replaced with a single shift by 31, or with CLTD (alias CDQ), which
   is 2 bytes shorter;

2. the conversion of the mask from -1 to +1 via AND and the subsequent
   addition can be replaced by subtracting the mask itself (see the
   sketch after this list).
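A minimal C sketch of the identity behind point 2, assuming
two's-complement arithmetic (the function name negate_if is invented
for illustration):

    #include <stdio.h>

    /* mask is 0 or -1 (the sign of x*y spread to all bits).
     * old: (q ^ mask) + (mask & 1)   -- AND converts -1 to +1, then add
     * new: (q ^ mask) - mask         -- subtract the mask directly
     * both yield: mask == -1 ? -q : q
     */
    static int negate_if(int q, int mask)
    {
        return (q ^ mask) - mask;
    }

    int main(void)
    {
        printf("%d %d\n", negate_if(5, 0), negate_if(5, -1));  /* 5 -5 */
        return 0;
    }

In the patched code below, CLTD materializes exactly this mask in %edx
from the XORed sign words in %eax, and the XOR/SUB pair applies it to
the quotient bits in %ecx.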
The minor differences between the code for the float, double and long
double variants, as well as between the i387 and amd64 implementations,
are intentional; pick the variant you like best. I prefer and recommend
the variant with 3 ADC and 2 SHL instructions used for the i387
double-precision function
http://sources.freebsd.org/releng/12.0/lib/msun/i387/s_remquo.S,
which comes first.

stay tuned
Stefan Kanthak

PS: if you ever need to run these functions on a CPU without a barrel
shifter, replace the first SHL or ROR with BT $14,%eax and the second
SHL or ROL with BT $9,%eax ... and hope that BT doesn't use a slow
shift under the hood.

--- -/releng/12.0/lib/msun/i387/s_remquo.S
+++ +/releng/12.0/lib/msun/i387/s_remquo.S
@@ -34,1 +34,2 @@
 ENTRY(remquo)
+	xorl	%ecx,%ecx
@@ -42,22 +43,17 @@
 /* Extract the three low-order bits of the quotient from C0,C3,C1. */
-	shrl	$6,%eax
-	movl	%eax,%ecx
-	andl	$0x108,%eax
-	rorl	$7,%eax
-	orl	%eax,%ecx
-	roll	$4,%eax
-	orl	%ecx,%eax
-	andl	$7,%eax
+	adcl	%ecx,%ecx
+	shll	$18,%eax
+	adcl	%ecx,%ecx
+	shll	$5,%eax
+	adcl	%ecx,%ecx
 /* Negate the quotient bits if x*y<0. Avoid using an unpredictable branch. */
-	movl	16(%esp),%ecx
-	xorl	8(%esp),%ecx
-	sarl	$16,%ecx
-	sarl	$16,%ecx
-	xorl	%ecx,%eax
-	andl	$1,%ecx
-	addl	%ecx,%eax
+	movl	16(%esp),%eax
+	xorl	8(%esp),%eax
+	cltd
+	xorl	%edx,%ecx
+	subl	%edx,%ecx
 /* Store the quotient and return. */
-	movl	20(%esp),%ecx
-	movl	%eax,(%ecx)
+	movl	20(%esp),%eax
+	movl	%ecx,(%eax)
 	ret
 END(remquo)

--- -/releng/12.0/lib/msun/i387/s_remquof.S
+++ +/releng/12.0/lib/msun/i387/s_remquof.S
@@ -42,22 +42,18 @@
 /* Extract the three low-order bits of the quotient from C0,C3,C1. */
-	shrl	$6,%eax
-	movl	%eax,%ecx
-	andl	$0x108,%eax
-	rorl	$7,%eax
-	orl	%eax,%ecx
-	roll	$4,%eax
-	orl	%ecx,%eax
-	andl	$7,%eax
+	sbbl	%ecx,%ecx
+	negl	%ecx
+	shll	$18,%eax
+	adcl	%ecx,%ecx
+	shll	$5,%eax
+	adcl	%ecx,%ecx
 /* Negate the quotient bits if x*y<0. Avoid using an unpredictable branch. */
-	movl	8(%esp),%ecx
-	xorl	4(%esp),%ecx
-	sarl	$16,%ecx
-	sarl	$16,%ecx
-	xorl	%ecx,%eax
-	andl	$1,%ecx
-	addl	%ecx,%eax
+	movl	8(%esp),%eax
+	xorl	4(%esp),%eax
+	cltd
+	xorl	%edx,%ecx
+	subl	%edx,%ecx
 /* Store the quotient and return. */
-	movl	12(%esp),%ecx
-	movl	%eax,(%ecx)
+	movl	12(%esp),%eax
+	movl	%ecx,(%eax)
 	ret
 END(remquof)

--- -/releng/12.0/lib/msun/i387/s_remquol.S
+++ +/releng/12.0/lib/msun/i387/s_remquol.S
@@ -42,22 +42,19 @@
 /* Extract the three low-order bits of the quotient from C0,C3,C1. */
-	shrl	$6,%eax
-	movl	%eax,%ecx
-	andl	$0x108,%eax
-	rorl	$7,%eax
-	orl	%eax,%ecx
-	roll	$4,%eax
-	orl	%ecx,%eax
-	andl	$7,%eax
+	setc	%cl
+	movzbl	%cl,%ecx
+	shll	$18,%eax
+	adcl	%ecx,%ecx
+	shll	$5,%eax
+	adcl	%ecx,%ecx
 /* Negate the quotient bits if x*y<0. Avoid using an unpredictable branch. */
-	movl	24(%esp),%ecx
-	xorl	12(%esp),%ecx
-	movsx	%cx,%ecx
-	sarl	$16,%ecx
-	sarl	$16,%ecx
-	xorl	%ecx,%eax
-	andl	$1,%ecx
-	addl	%ecx,%eax
+	movl	24(%esp),%eax
+	xorl	12(%esp),%eax
+	cwtl
+	cltd
+	xorl	%edx,%ecx
+	subl	%edx,%ecx
 /* Store the quotient and return. */
-	movl	28(%esp),%ecx
-	movl	%eax,(%ecx)
+	movl	28(%esp),%eax
+	movl	%ecx,(%eax)
 	ret
+END(remquol)

--- -/releng/12.0/lib/msun/amd64/s_remquo.S
+++ +/releng/12.0/lib/msun/amd64/s_remquo.S
@@ -34,1 +35,2 @@
 ENTRY(remquo)
+	xorl	%ecx,%ecx
@@ -44,19 +45,14 @@
 /* Extract the three low-order bits of the quotient from C0,C3,C1. */
-	shrl	$6,%eax
-	movl	%eax,%ecx
-	andl	$0x108,%eax
-	rorl	$7,%eax
-	orl	%eax,%ecx
-	roll	$4,%eax
-	orl	%ecx,%eax
-	andl	$7,%eax
+	adcl	%ecx,%ecx
+	rorl	$15,%eax
+	adcl	%ecx,%ecx
+	roll	$6,%eax
+	adcl	%ecx,%ecx
 /* Negate the quotient bits if x*y<0. Avoid using an unpredictable branch. */
-	movl	-12(%rsp),%ecx
-	xorl	-4(%rsp),%ecx
-	sarl	$16,%ecx
-	sarl	$16,%ecx
-	xorl	%ecx,%eax
-	andl	$1,%ecx
-	addl	%ecx,%eax
+	movl	-12(%rsp),%eax
+	xorl	-4(%rsp),%eax
+	cltd
+	xorl	%edx,%ecx
+	subl	%edx,%ecx
 /* Store the quotient and return. */
-	movl	%eax,(%rdi)
+	movl	%ecx,(%rdi)

--- -/releng/12.0/lib/msun/amd64/s_remquof.S
+++ +/releng/12.0/lib/msun/amd64/s_remquof.S
@@ -44,19 +44,15 @@
 /* Extract the three low-order bits of the quotient from C0,C3,C1. */
-	shrl	$6,%eax
-	movl	%eax,%ecx
-	andl	$0x108,%eax
-	rorl	$7,%eax
-	orl	%eax,%ecx
-	roll	$4,%eax
-	orl	%ecx,%eax
-	andl	$7,%eax
+	sbbl	%ecx,%ecx
+	negl	%ecx
+	rorl	$15,%eax
+	adcl	%ecx,%ecx
+	roll	$6,%eax
+	adcl	%ecx,%ecx
 /* Negate the quotient bits if x*y<0. Avoid using an unpredictable branch. */
-	movl	-8(%rsp),%ecx
-	xorl	-4(%rsp),%ecx
-	sarl	$16,%ecx
-	sarl	$16,%ecx
-	xorl	%ecx,%eax
-	andl	$1,%ecx
-	addl	%ecx,%eax
+	movl	-8(%rsp),%eax
+	xorl	-4(%rsp),%eax
+	cltd
+	xorl	%edx,%ecx
+	subl	%edx,%ecx
 /* Store the quotient and return. */
-	movl	%eax,(%rdi)
+	movl	%ecx,(%rdi)

--- -/releng/12.0/lib/msun/amd64/s_remquol.S
+++ +/releng/12.0/lib/msun/amd64/s_remquol.S
@@ -42,21 +42,18 @@
 /* Extract the three low-order bits of the quotient from C0,C3,C1. */
-	shrl	$6,%eax
-	movl	%eax,%ecx
-	andl	$0x108,%eax
-	rorl	$7,%eax
-	orl	%eax,%ecx
-	roll	$4,%eax
-	orl	%ecx,%eax
-	andl	$7,%eax
+	setc	%cl
+	movzbl	%cl,%ecx
+	rorl	$15,%eax
+	adcl	%ecx,%ecx
+	roll	$6,%eax
+	adcl	%ecx,%ecx
 /* Negate the quotient bits if x*y<0. Avoid using an unpredictable branch. */
-	movl	32(%rsp),%ecx
-	xorl	16(%rsp),%ecx
-	movsx	%cx,%ecx
-	sarl	$16,%ecx
-	sarl	$16,%ecx
-	xorl	%ecx,%eax
-	andl	$1,%ecx
-	addl	%ecx,%eax
+	movl	32(%rsp),%eax
+	xorl	16(%rsp),%eax
+	cwtl
+	cltd
+	xorl	%edx,%ecx
+	subl	%edx,%ecx
 /* Store the quotient and return. */
-	movl	%eax,(%rdi)
+	movl	%ecx,(%rdi)
 	ret
+END(remquol)
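PPS: for anyone verifying the bit-twiddling, here is a minimal C sketch
of what both the old and the new extraction sequences compute (the
helper name quotient_bits is invented for illustration). After FPREM,
the x87 status word holds the three low-order quotient bits as
Q2 = C0 (bit 8), Q1 = C3 (bit 14) and Q0 = C1 (bit 9):

    #include <stdio.h>

    /* Assemble the 3-bit partial quotient from an x87 status word:
     * Q2 = C0 (bit 8), Q1 = C3 (bit 14), Q0 = C1 (bit 9). */
    static unsigned quotient_bits(unsigned sw)
    {
        return ((sw >> 6) & 4)    /* C0 (bit 8)  -> bit 2 = Q2 */
             | ((sw >> 13) & 2)   /* C3 (bit 14) -> bit 1 = Q1 */
             | ((sw >> 9) & 1);   /* C1 (bit 9)  -> bit 0 = Q0 */
    }

    int main(void)
    {
        /* C0 and C1 set, C3 clear: quotient bits 101b */
        printf("%u\n", quotient_bits((1u << 8) | (1u << 9)));  /* 5 */
        return 0;
    }

In the new sequences, SHL $18 / SHL $5 (or ROR $15 / ROL $6) move bits
14 and 9 into the carry flag for the second and third ADC; the first
quotient bit comes from the carry left by the preceding status-word
check.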