From owner-svn-src-all@FreeBSD.ORG Sat Oct 15 05:23:28 2011 Return-Path: Delivered-To: svn-src-all@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id C82F2106566B; Sat, 15 Oct 2011 05:23:28 +0000 (UTC) (envelope-from das@FreeBSD.org) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:4f8:fff6::2c]) by mx1.freebsd.org (Postfix) with ESMTP id B68958FC08; Sat, 15 Oct 2011 05:23:28 +0000 (UTC) Received: from svn.freebsd.org (localhost [127.0.0.1]) by svn.freebsd.org (8.14.4/8.14.4) with ESMTP id p9F5NS5B096727; Sat, 15 Oct 2011 05:23:28 GMT (envelope-from das@svn.freebsd.org) Received: (from das@localhost) by svn.freebsd.org (8.14.4/8.14.4/Submit) id p9F5NSGP096720; Sat, 15 Oct 2011 05:23:28 GMT (envelope-from das@svn.freebsd.org) Message-Id: <201110150523.p9F5NSGP096720@svn.freebsd.org> From: David Schultz Date: Sat, 15 Oct 2011 05:23:28 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org X-SVN-Group: head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: Subject: svn commit: r226376 - head/lib/msun/src X-BeenThere: svn-src-all@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: "SVN commit messages for the entire src tree \(except for " user" and " projects" \)" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Sat, 15 Oct 2011 05:23:28 -0000 Author: das Date: Sat Oct 15 05:23:28 2011 New Revision: 226376 URL: http://svn.freebsd.org/changeset/base/226376 Log: Various changes to improve the accuracy and speed of log{2,10}{,f}. - Rename __kernel_log() to k_log1p(). - Move some of the work that was previously done in the kernel log into the callers. This enables further refactoring to improve accuracy or speed, although I don't recall the details. - Use extra precision when adding the final scaling term, which improves accuracy. - Describe and work around compiler problems that break some of the multiprecision calculations. A fix for a small bug is also included: - Add a special case for log*(1). This is needed to ensure that log*(1) == +0 instead of -0, even when the rounding mode is FE_DOWNWARD. Submitted by: bde Modified: head/lib/msun/src/e_log10.c head/lib/msun/src/e_log10f.c head/lib/msun/src/e_log2.c head/lib/msun/src/e_log2f.c head/lib/msun/src/k_log.h head/lib/msun/src/k_logf.h Modified: head/lib/msun/src/e_log10.c ============================================================================== --- head/lib/msun/src/e_log10.c Sat Oct 15 05:00:56 2011 (r226375) +++ head/lib/msun/src/e_log10.c Sat Oct 15 05:23:28 2011 (r226376) @@ -17,6 +17,9 @@ __FBSDID("$FreeBSD$"); /* * Return the base 10 logarithm of x. See e_log.c and k_log.h for most * comments. + * + * log10(x) = (f - 0.5*f*f + k_log1p(f)) / ln10 + k * log10(2) + * in not-quite-routine extra precision. */ #include "math.h" @@ -35,7 +38,7 @@ static const double zero = 0.0; double __ieee754_log10(double x) { - double f,hi,lo,y,z; + double f,hfsq,hi,lo,r,val_hi,val_lo,w,y,y2; int32_t i,k,hx; u_int32_t lx; @@ -50,16 +53,35 @@ __ieee754_log10(double x) GET_HIGH_WORD(hx,x); } if (hx >= 0x7ff00000) return x+x; + if (hx == 0x3ff00000 && lx == 0) + return zero; /* log(1) = +0 */ k += (hx>>20)-1023; hx &= 0x000fffff; i = (hx+0x95f64)&0x100000; SET_HIGH_WORD(x,hx|(i^0x3ff00000)); /* normalize x or x/2 */ k += (i>>20); y = (double)k; - f = __kernel_log(x); - hi = x = x - 1; + f = x - 1.0; + hfsq = 0.5*f*f; + r = k_log1p(f); + + /* See e_log2.c for most details. */ + hi = f - hfsq; SET_LOW_WORD(hi,0); - lo = x - hi; - z = y*log10_2lo + (x+f)*ivln10lo + (lo+f)*ivln10hi + hi*ivln10hi; - return z+y*log10_2hi; + lo = (f - hi) - hfsq + r; + val_hi = hi*ivln10hi; + y2 = y*log10_2hi; + val_lo = y*log10_2lo + (lo+hi)*ivln10lo + lo*ivln10hi; + + /* + * Extra precision in for adding y*log10_2hi is not strictly needed + * since there is no very large cancellation near x = sqrt(2) or + * x = 1/sqrt(2), but we do it anyway since it costs little on CPUs + * with some parallelism and it reduces the error for many args. + */ + w = y2 + val_hi; + val_lo += (y2 - w) + val_hi; + val_hi = w; + + return val_lo + val_hi; } Modified: head/lib/msun/src/e_log10f.c ============================================================================== --- head/lib/msun/src/e_log10f.c Sat Oct 15 05:00:56 2011 (r226375) +++ head/lib/msun/src/e_log10f.c Sat Oct 15 05:23:28 2011 (r226376) @@ -32,7 +32,7 @@ static const float zero = 0.0; float __ieee754_log10f(float x) { - float f,hi,lo,y,z; + float f,hfsq,hi,lo,r,y,y2; int32_t i,k,hx; GET_FLOAT_WORD(hx,x); @@ -46,17 +46,26 @@ __ieee754_log10f(float x) GET_FLOAT_WORD(hx,x); } if (hx >= 0x7f800000) return x+x; + if (hx == 0x3f800000) + return zero; /* log(1) = +0 */ k += (hx>>23)-127; hx &= 0x007fffff; i = (hx+(0x4afb0d))&0x800000; SET_FLOAT_WORD(x,hx|(i^0x3f800000)); /* normalize x or x/2 */ k += (i>>23); y = (float)k; - f = __kernel_logf(x); - x = x - (float)1.0; - GET_FLOAT_WORD(hx,x); + f = x - (float)1.0; + hfsq = (float)0.5*f*f; + r = k_log1pf(f); + + /* See e_log2f.c and e_log2.c for details. */ + if (sizeof(float_t) > sizeof(float)) + return (r - hfsq + f) * ((float_t)ivln10lo + ivln10hi) + + y * ((float_t)log10_2lo + log10_2hi); + hi = f - hfsq; + GET_FLOAT_WORD(hx,hi); SET_FLOAT_WORD(hi,hx&0xfffff000); - lo = x - hi; - z = y*log10_2lo + (x+f)*ivln10lo + (lo+f)*ivln10hi + hi*ivln10hi; - return z+y*log10_2hi; + lo = (f - hi) - hfsq + r; + return y*log10_2lo + (lo+hi)*ivln10lo + lo*ivln10hi + hi*ivln10hi + + y*log10_2hi; } Modified: head/lib/msun/src/e_log2.c ============================================================================== --- head/lib/msun/src/e_log2.c Sat Oct 15 05:00:56 2011 (r226375) +++ head/lib/msun/src/e_log2.c Sat Oct 15 05:23:28 2011 (r226376) @@ -17,6 +17,11 @@ __FBSDID("$FreeBSD$"); /* * Return the base 2 logarithm of x. See e_log.c and k_log.h for most * comments. + * + * This reduces x to {k, 1+f} exactly as in e_log.c, then calls the kernel, + * then does the combining and scaling steps + * log2(x) = (f - 0.5*f*f + k_log1p(f)) / ln2 + k + * in not-quite-routine extra precision. */ #include "math.h" @@ -33,7 +38,7 @@ static const double zero = 0.0; double __ieee754_log2(double x) { - double f,hi,lo; + double f,hfsq,hi,lo,r,val_hi,val_lo,w,y; int32_t i,k,hx; u_int32_t lx; @@ -48,14 +53,58 @@ __ieee754_log2(double x) GET_HIGH_WORD(hx,x); } if (hx >= 0x7ff00000) return x+x; + if (hx == 0x3ff00000 && lx == 0) + return zero; /* log(1) = +0 */ k += (hx>>20)-1023; hx &= 0x000fffff; i = (hx+0x95f64)&0x100000; SET_HIGH_WORD(x,hx|(i^0x3ff00000)); /* normalize x or x/2 */ k += (i>>20); - f = __kernel_log(x); - hi = x = x - 1; + y = (double)k; + f = x - 1.0; + hfsq = 0.5*f*f; + r = k_log1p(f); + + /* + * f-hfsq must (for args near 1) be evaluated in extra precision + * to avoid a large cancellation when x is near sqrt(2) or 1/sqrt(2). + * This is fairly efficient since f-hfsq only depends on f, so can + * be evaluated in parallel with R. Not combining hfsq with R also + * keeps R small (though not as small as a true `lo' term would be), + * so that extra precision is not needed for terms involving R. + * + * Compiler bugs involving extra precision used to break Dekker's + * theorem for spitting f-hfsq as hi+lo, unless double_t was used + * or the multi-precision calculations were avoided when double_t + * has extra precision. These problems are now automatically + * avoided as a side effect of the optimization of combining the + * Dekker splitting step with the clear-low-bits step. + * + * y must (for args near sqrt(2) and 1/sqrt(2)) be added in extra + * precision to avoid a very large cancellation when x is very near + * these values. Unlike the above cancellations, this problem is + * specific to base 2. It is strange that adding +-1 is so much + * harder than adding +-ln2 or +-log10_2. + * + * This uses Dekker's theorem to normalize y+val_hi, so the + * compiler bugs are back in some configurations, sigh. And I + * don't want to used double_t to avoid them, since that gives a + * pessimization and the support for avoiding the pessimization + * is not yet available. + * + * The multi-precision calculations for the multiplications are + * routine. + */ + hi = f - hfsq; SET_LOW_WORD(hi,0); - lo = x - hi; - return (x+f)*ivln2lo + (lo+f)*ivln2hi + hi*ivln2hi + k; + lo = (f - hi) - hfsq + r; + val_hi = hi*ivln2hi; + val_lo = (lo+hi)*ivln2lo + lo*ivln2hi; + + /* spadd(val_hi, val_lo, y), except for not using double_t: */ + w = y + val_hi; + val_lo += (y - w) + val_hi; + val_hi = w; + + return val_lo + val_hi; } Modified: head/lib/msun/src/e_log2f.c ============================================================================== --- head/lib/msun/src/e_log2f.c Sat Oct 15 05:00:56 2011 (r226375) +++ head/lib/msun/src/e_log2f.c Sat Oct 15 05:23:28 2011 (r226376) @@ -30,7 +30,7 @@ static const float zero = 0.0; float __ieee754_log2f(float x) { - float f,hi,lo; + float f,hfsq,hi,lo,r,y; int32_t i,k,hx; GET_FLOAT_WORD(hx,x); @@ -44,15 +44,38 @@ __ieee754_log2f(float x) GET_FLOAT_WORD(hx,x); } if (hx >= 0x7f800000) return x+x; + if (hx == 0x3f800000) + return zero; /* log(1) = +0 */ k += (hx>>23)-127; hx &= 0x007fffff; i = (hx+(0x4afb0d))&0x800000; SET_FLOAT_WORD(x,hx|(i^0x3f800000)); /* normalize x or x/2 */ k += (i>>23); - f = __kernel_logf(x); - x = x - (float)1.0; - GET_FLOAT_WORD(hx,x); + y = (float)k; + f = x - (float)1.0; + hfsq = (float)0.5*f*f; + r = k_log1pf(f); + + /* + * We no longer need to avoid falling into the multi-precision + * calculations due to compiler bugs breaking Dekker's theorem. + * Keep avoiding this as an optimization. See e_log2.c for more + * details (some details are here only because the optimization + * is not yet available in double precision). + * + * Another compiler bug turned up. With gcc on i386, + * (ivln2lo + ivln2hi) would be evaluated in float precision + * despite runtime evaluations using double precision. So we + * must cast one of its terms to float_t. This makes the whole + * expression have type float_t, so return is forced to waste + * time clobbering its extra precision. + */ + if (sizeof(float_t) > sizeof(float)) + return (r - hfsq + f) * ((float_t)ivln2lo + ivln2hi) + y; + + hi = f - hfsq; + GET_FLOAT_WORD(hx,hi); SET_FLOAT_WORD(hi,hx&0xfffff000); - lo = x - hi; - return (x+f)*ivln2lo + (lo+f)*ivln2hi + hi*ivln2hi + k; + lo = (f - hi) - hfsq + r; + return (lo+hi)*ivln2lo + lo*ivln2hi + hi*ivln2hi + y; } Modified: head/lib/msun/src/k_log.h ============================================================================== --- head/lib/msun/src/k_log.h Sat Oct 15 05:00:56 2011 (r226375) +++ head/lib/msun/src/k_log.h Sat Oct 15 05:23:28 2011 (r226376) @@ -14,8 +14,9 @@ #include __FBSDID("$FreeBSD$"); -/* __kernel_log(x) - * Return log(x) - (x-1) for x in ~[sqrt(2)/2, sqrt(2)]. +/* + * k_log1p(f): + * Return log(1+f) - f for 1+f in ~[sqrt(2)/2, sqrt(2)]. * * The following describes the overall strategy for computing * logarithms in base e. The argument reduction and adding the final @@ -80,37 +81,20 @@ Lg6 = 1.531383769920937332e-01, /* 3FC3 Lg7 = 1.479819860511658591e-01; /* 3FC2F112 DF3E5244 */ /* - * We always inline __kernel_log(), since doing so produces a + * We always inline k_log1p(), since doing so produces a * substantial performance improvement (~40% on amd64). */ static inline double -__kernel_log(double x) +k_log1p(double f) { - double hfsq,f,s,z,R,w,t1,t2; - int32_t hx,i,j; - u_int32_t lx; - - EXTRACT_WORDS(hx,lx,x); + double hfsq,s,z,R,w,t1,t2; - f = x-1.0; - if((0x000fffff&(2+hx))<3) { /* -2**-20 <= f < 2**-20 */ - if(f==0.0) return 0.0; - return f*f*(0.33333333333333333*f-0.5); - } - s = f/(2.0+f); + s = f/(2.0+f); z = s*s; - hx &= 0x000fffff; - i = hx-0x6147a; w = z*z; - j = 0x6b851-hx; - t1= w*(Lg2+w*(Lg4+w*Lg6)); - t2= z*(Lg1+w*(Lg3+w*(Lg5+w*Lg7))); - i |= j; + t1= w*(Lg2+w*(Lg4+w*Lg6)); + t2= z*(Lg1+w*(Lg3+w*(Lg5+w*Lg7))); R = t2+t1; - if (i>0) { - hfsq=0.5*f*f; - return s*(hfsq+R) - hfsq; - } else { - return s*(R-f); - } + hfsq=0.5*f*f; + return s*(hfsq+R); } Modified: head/lib/msun/src/k_logf.h ============================================================================== --- head/lib/msun/src/k_logf.h Sat Oct 15 05:00:56 2011 (r226375) +++ head/lib/msun/src/k_logf.h Sat Oct 15 05:23:28 2011 (r226376) @@ -13,7 +13,7 @@ __FBSDID("$FreeBSD$"); /* - * float version of __kernel_log(x). See k_log.c for details. + * Float version of k_log.h. See the latter for most comments. */ static const float @@ -24,32 +24,16 @@ Lg3 = 0x91e9ee.0p-25, /* 0.28498786 Lg4 = 0xf89e26.0p-26; /* 0.24279078841 */ static inline float -__kernel_logf(float x) +k_log1pf(float f) { - float hfsq,f,s,z,R,w,t1,t2; - int32_t ix,i,j; + float hfsq,s,z,R,w,t1,t2; - GET_FLOAT_WORD(ix,x); - - f = x-(float)1.0; - if((0x007fffff&(0x8000+ix))<0xc000) { /* -2**-9 <= f < 2**-9 */ - if(f==0.0f) return 0.0f; - return f*f*((float)0.33333333333333333*f-(float)0.5); - } s = f/((float)2.0+f); z = s*s; - ix &= 0x007fffff; - i = ix-(0x6147a<<3); w = z*z; - j = (0x6b851<<3)-ix; t1= w*(Lg2+w*Lg4); t2= z*(Lg1+w*Lg3); - i |= j; R = t2+t1; - if(i>0) { - hfsq=(float)0.5*f*f; - return s*(hfsq+R) - hfsq; - } else { - return s*(R-f); - } + hfsq=(float)0.5*f*f; + return s*(hfsq+R); }