FreeBSD Mail Archives

Date:      Wed, 9 Feb 2005 05:20:05 GMT
From:      David Schultz <das@FreeBSD.ORG>
To:        freebsd-i386@FreeBSD.org
Subject:   Re: i386/67469: src/lib/msun/i387/s_tan.S gives incorrect results for large inputs
Message-ID:  <200502090520.j195K5sR097911@freefall.freebsd.org>

next in thread | raw e-mail | index | archive | help

The following reply was made to PR i386/67469; it has been noted by GNATS.

From: David Schultz <das@FreeBSD.ORG>
To: Bruce Evans <bde@zeta.org.au>
Cc: FreeBSD-gnats-submit@FreeBSD.ORG, freebsd-i386@FreeBSD.ORG,
	bde@FreeBSD.ORG
Subject: Re: i386/67469: src/lib/msun/i387/s_tan.S gives incorrect results for large inputs
Date: Wed, 9 Feb 2005 00:14:01 -0500

 I ran some careful performance comparisons between the version of
 i387 tan() I posted earlier and the fdlibm tan().  Executive
 summary: the fdlibm tan() is faster for virtually all inputs on a
 Pentium 4.  Pentium 3s seem to have lower-latency FPUs, but fdlibm
 still beats the fptan instruction for the important cases where
 fptan actually gets the right answer.  I used the following sets
 of inputs:
 
 tbl1: small numbers
 -7.910377e-286
 -5.142120e-78
 -8.305257e-262
 -2.089491e-228
 9.625342e-237
 -5.867161e-64
 9.000611e-34
 5.192603e-280
 -1.255581e-153
 1.275985e-153
 
 tbl2: numbers on [-8pi,8pi] greater in magnitude than 2^-18
 -2.410568e-05
 -6.482504e-01
 -1.971384e-03
 -1.686721e-04
 -3.629842e-04
 1.036818e-03
 2.987541e-04
 6.186103e+00
 2.165271e-02
 1.032138e-04
 
 tbl3: large numbers
 2.379610e+172
 3.238483e+204
 -4.680033e+194
 -1.442727e+225
 3.090948e+48
 1.778800e+185
 5.177174e+295
 -1.237869e+204
 -4.577895e+223
 -6.735385e+171
 
 tbl4: special cases
 nan
 inf
 -inf
 -0.0
 +0.0
 
 The results below are divided into four columns.  The first is the
 average number of clock cycles taken by the fdlibm tan() for the
 corresponding table input above on a Pentium 4, the second is the
 clock cycles for the assembly tan(), the third is the difference,
 and the fourth is the percentage difference relative to column 1.
 
 das@VARK:/home/t/freebsd> paste perf1 perf1md | awk '{printf("%f\t%f\t%f\t%+.0f%%\n", $1, $2, $2-$1, ($2-$1)*100/$1);}'
 1259.000000     1697.000000     438.000000      +35%
 1162.000000     1488.000000     326.000000      +28%
 1060.000000     1445.000000     385.000000      +36%
 1059.000000     1453.000000     394.000000      +37%
 1065.000000     1459.000000     394.000000      +37%
 1059.000000     1458.000000     399.000000      +38%
 1031.000000     1461.000000     430.000000      +42%
 1054.000000     1436.000000     382.000000      +36%
 1056.000000     1455.000000     399.000000      +38%
 1073.000000     1459.000000     386.000000      +36%
 das@VARK:/home/t/freebsd> paste perf2 perf2md | awk '{printf("%f\t%f\t%f\t%+.0f%%\n", $1, $2, $2-$1, ($2-$1)*100/$1);}'
 2018.000000     1985.000000     -33.000000      -2%
 1713.000000     1821.000000     108.000000      +6%
 1694.000000     1730.000000     36.000000       +2%
 1708.000000     1737.000000     29.000000       +2%
 1703.000000     1745.000000     42.000000       +2%
 1696.000000     1762.000000     66.000000       +4%
 1718.000000     1774.000000     56.000000       +3%
 1890.000000     2108.000000     218.000000      +12%
 1693.000000     1744.000000     51.000000       +3%
 1702.000000     1714.000000     12.000000       +1%
 das@VARK:/home/t/freebsd> paste perf3 perf3md | awk '{printf("%f\t%f\t%f\t%+.0f%%\n", $1, $2, $2-$1, ($2-$1)*100/$1);}'
 5737.000000     6078.000000     341.000000      +6%
 5110.000000     5200.000000     90.000000       +2%
 6726.000000     7030.000000     304.000000      +5%
 5370.000000     5403.000000     33.000000       +1%
 5032.000000     5206.000000     174.000000      +3%
 5229.000000     5199.000000     -30.000000      -1%
 6313.000000     6523.000000     210.000000      +3%
 5201.000000     5583.000000     382.000000      +7%
 6443.000000     6560.000000     117.000000      +2%
 5172.000000     5356.000000     184.000000      +4%
 das@VARK:/home/t/freebsd> paste perf4 perf4md | awk '{printf("%f\t%f\t%f\t%+.0f%%\n", $1, $2, $2-$1, ($2-$1)*100/$1);}'
 4726.000000     3234.000000     -1492.000000    -32%
 3181.000000     2850.000000     -331.000000     -10%
 3149.000000     2842.000000     -307.000000     -10%
 1045.000000     1091.000000     46.000000       +4%
 1076.000000     1114.000000     38.000000       +4%
 
 (P.S.: Oops, forgot to compile s_sin.c with -O.)
 
 I also ran the first three tests on freefall (Pentium III, using
 the old reduction code), and got results that aren't as favorable
 for the fdlibm version:
 
 das@freefall:~> paste perf1 perf1md | awk '{printf("%f\t%f\t%f\t%+.0f%%\n", $1, $2, $2-$1, ($2-$1)*100/$1);}'
 1384.000000     442.000000      -942.000000     -68%
 584.000000      440.000000      -144.000000     -25%
 589.000000      441.000000      -148.000000     -25%
 584.000000      441.000000      -143.000000     -24%
 585.000000      441.000000      -144.000000     -25%
 585.000000      440.000000      -145.000000     -25%
 585.000000      441.000000      -144.000000     -25%
 584.000000      440.000000      -144.000000     -25%
 584.000000      441.000000      -143.000000     -24%
 584.000000      441.000000      -143.000000     -24%
 das@freefall:~> paste perf2 perf2md | awk '{printf("%f\t%f\t%f\t%+.0f%%\n", $1, $2, $2-$1, ($2-$1)*100/$1);}'
 639.000000      656.000000      17.000000       +3%
 820.000000      648.000000      -172.000000     -21%
 640.000000      654.000000      14.000000       +2%
 639.000000      702.000000      63.000000       +10%
 638.000000      654.000000      16.000000       +3%
 639.000000      658.000000      19.000000       +3%
 638.000000      655.000000      17.000000       +3%
 1789.000000     654.000000      -1135.000000    -63%
 639.000000      654.000000      15.000000       +2%
 638.000000      656.000000      18.000000       +3%
 das@freefall:~> paste perf3 perf3md | awk '{printf("%f\t%f\t%f\t%+.0f%%\n", $1, $2, $2-$1, ($2-$1)*100/$1);}'
 5751.000000     1918.000000     -3833.000000    -67%
 4383.000000     2351.000000     -2032.000000    -46%
 6241.000000     2150.000000     -4091.000000    -66%
 4401.000000     2483.000000     -1918.000000    -44%
 4387.000000     1182.000000     -3205.000000    -73%
 4276.000000     2004.000000     -2272.000000    -53%
 5814.000000     2735.000000     -3079.000000    -53%
 4389.000000     2180.000000     -2209.000000    -50%
 5779.000000     2270.000000     -3509.000000    -61%
 4379.000000     2020.000000     -2359.000000    -54%
 
 Here, fdlibm usually wins for tbl2, which is the most important
 class of inputs.  It is slower for the two inputs in tbl2 that are
 close to multiples of 2pi and for large inputs, but in all
 fairness, the i387 gets the wrong answer in those cases---hence,
 this PR.  The i387 legitimately beats fdlibm for the small inputs,
 for which tan(x) == x, so a special case for those earlier in
 fdlibm would probably be beneficial.
 
 Conclusion: We should toss out the assembly versions of tan() and
 tanf(), and possibly special-case small inputs in fdlibm tan().
 
 I found similar results for sin(), so we should probably do the
 same for that, too.  Again, this is on my Pentium 4; YMMV.
 
 das@VARK:/home/t/freebsd> paste perf1 perf1md | awk '{printf("%f\t%f\t%f\t%+.0f%%\n", $1, $2, $2-$1, ($2-$1)*100/$1);}'
 1254.000000     1981.000000     727.000000      +58%
 1109.000000     1603.000000     494.000000      +45%
 1056.000000     1579.000000     523.000000      +50%
 1051.000000     1565.000000     514.000000      +49%
 1043.000000     1569.000000     526.000000      +50%
 1077.000000     1572.000000     495.000000      +46%
 1050.000000     1570.000000     520.000000      +50%
 1051.000000     1566.000000     515.000000      +49%
 1042.000000     1569.000000     527.000000      +51%
 1048.000000     1568.000000     520.000000      +50%
 das@VARK:/home/t/freebsd> paste perf2 perf2md | awk '{printf("%f\t%f\t%f\t%+.0f%%\n", $1, $2, $2-$1, ($2-$1)*100/$1);}'
 1404.000000     1656.000000     252.000000      +18%
 1222.000000     1563.000000     341.000000      +28%
 1205.000000     1614.000000     409.000000      +34%
 1210.000000     1603.000000     393.000000      +32%
 1206.000000     1616.000000     410.000000      +34%
 1218.000000     1636.000000     418.000000      +34%
 1207.000000     1645.000000     438.000000      +36%
 1772.000000     1610.000000     -162.000000     -9%
 1213.000000     1615.000000     402.000000      +33%
 1208.000000     1617.000000     409.000000      +34%
 das@VARK:/home/t/freebsd> paste perf3 perf3md | awk '{printf("%f\t%f\t%f\t%+.0f%%\n", $1, $2, $2-$1, ($2-$1)*100/$1);}'
 5057.000000     5260.000000     203.000000      +4%
 5076.000000     6708.000000     1632.000000     +32%
 6528.000000     5978.000000     -550.000000     -8%
 5074.000000     6922.000000     1848.000000     +36%
 5116.000000     2889.000000     -2227.000000    -44%
 4702.000000     5190.000000     488.000000      +10%
 5884.000000     7223.000000     1339.000000     +23%
 5138.000000     5927.000000     789.000000      +15%
 5834.000000     5923.000000     89.000000       +2%
 5051.000000     5618.000000     567.000000      +11%
 das@VARK:/home/t/freebsd> paste perf4 perf4md | awk '{printf("%f\t%f\t%f\t%+.0f%%\n", $1, $2, $2-$1, ($2-$1)*100/$1);}'
 3536.000000     3576.000000     40.000000       +1%
 3190.000000     2705.000000     -485.000000     -15%
 3189.000000     2698.000000     -491.000000     -15%
 1053.000000     1171.000000     118.000000      +11%
 1059.000000     1174.000000     115.000000      +11%
 
 The above data was generated using the program below, executed as
 follows:
 	./a.out < tblN | grep avg | awk '{print $2}' > perfN
 When compiling the program, it is necessary to add
 -Dfunc=tan or -Dfunc=itan.
 
 #include <limits.h>
 #include <math.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 
 #define	ITER		10000
 
 #define	rdtsc(rv)	__asm __volatile("xor %%ax,%%ax\n\tcpuid\n\trdtsc" \
 					 : "=A" (*(rv)) : : "ebx", "ecx")
 
 double itan(double);
 
 static void
 runtest(double d)
 {
 	volatile double result;
 	double avg, sd;
 	uint64_t start, end;
 	int64_t total;
 	int t[ITER];
 	int i, n;
 	int tmax, tmin;
 
 	printf("%a\n", d);
 	total = 0;
 	for (i = 0; i < ITER; i++) {
 		rdtsc(&start);
 		result = func(d);
 		rdtsc(&end);
 
 		t[i] = end - start;
 		total += t[i];
 	}
 
 	/* compute initial avg and sd */
 	avg = (double)total / ITER;
 	sd = 0;
 	for (i = 0; i < ITER; i++)
 		sd += (avg - t[i]) * (avg - t[i]);
 	sd = sqrt(sd / ITER);
 
 	/* recompute avg and sd with outliers removed, find max and min */
 	n = 0;
 	tmax = 0;
 	tmin = INT_MAX;
 	for (i = 0; i < ITER; i++) {
 		if (fabs(avg - t[i]) <= sd * 2) {
 			total += t[i];
 			n++;
 			if (t[i] > tmax)
 				tmax = t[i];
 			if (t[i] < tmin)
 				tmin = t[i];
 		} else {
 			t[i] = -1;
 		}
 	}
 	avg = (double)total / n;
 	sd = 0;
 	for (i = 0; i < ITER; i++) {
 		if (t[i] >= 0)
 			sd += (avg - t[i]) * (avg - t[i]);
 	}
 	sd = sqrt(sd / n);
 
 	printf("avg:\t%.0f\nsd:\t%.0f\nmin:\t%d\nmax:\t%d\nout:\t%.02f%%\n\n",
 	    avg, sd, tmin, tmax, (double)(ITER - n) * 100 / ITER);
 }
 
 int
 main(int argc, char *argv[])
 {
 	double d;
 
 	while (scanf("%lf", &d) > 0)
 		runtest(d);
 
 	return (0);
 }

Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200502090520.j195K5sR097911>

Header And Logo

Peripheral Links

Site Navigation

Header And Logo

Peripheral Links

Search

Site Navigation