Date: Sat, 15 Sep 2012 23:45:47 +1000 (EST) From: Bruce Evans <brde@optusnet.com.au> To: Stephen Montgomery-Smith <stephen@missouri.edu> Cc: freebsd-numerics@freebsd.org Subject: Re: Complex arg-trig functions Message-ID: <20120915231032.C2669@besplex.bde.org> In-Reply-To: <50538E28.6050400@missouri.edu> References: <5017111E.6060003@missouri.edu> <20120807205725.GA10572@server.rulingia.com> <20120809025220.N4114@besplex.bde.org> <5027F07E.9060409@missouri.edu> <20120814003614.H3692@besplex.bde.org> <50295F5C.6010800@missouri.edu> <20120814072946.S5260@besplex.bde.org> <50297CA5.5010900@missouri.edu> <50297E43.7090309@missouri.edu> <20120814201105.T934@besplex.bde.org> <502A780B.2010106@missouri.edu> <20120815223631.N1751@besplex.bde.org> <502C0CF8.8040003@missouri.edu> <20120906221028.O1542@besplex.bde.org> <5048D00B.8010401@missouri.edu> <504D3CCD.2050006@missouri.edu> <504FF726.9060001@missouri.edu> <20120912191556.F1078@besplex.bde.org> <20120912225847.J1771@besplex.bde.org> <50511B40.3070009@missouri.edu> <20120913204808.T1964@besplex.bde.org> <5051F59C.6000603@missouri.edu> <20120914014208.I2862@besplex.bde.org> <50526050.2070303@missouri.edu> <20120914212403.H1983@besplex.bde.org> <50538E28.6050400@missouri.edu>
next in thread | previous in thread | raw e-mail | index | archive | help
On Fri, 14 Sep 2012, Stephen Montgomery-Smith wrote: > OK, I think I put in all your latest changes. I tried some simple optimizations but got nowhere, except for long doubles, repeating the earlier optimization for floats works well. It also works slightly better than the more special version of it in double precision. Float precision is still strangely slower than double precision, but only on i386. catrigf.c is now faster on amd64, but catrigf_ver2.c is still faster for most functions on i386. I can't see any reason for this in the code, and suspect it is data-dependent, due to something like float precision generating more exceptions (denormal and underflow?) when catrigf.c is used and the exceptions only being slower on (only Intel?) i386 (the final results should still have the exceptions, but intermediate caclulations usually won't). I don't plan to do much more optimization. The next step is to fix style bugs. The main style bugs are multiple statements per line and missing spaces around +-. indent(1) fixes these well. % diff -u2 catrig.c~ catrig.c % --- catrig.c~ 2012-09-14 17:04:45.000000000 +0000 % +++ catrig.c 2012-09-15 10:55:22.271430000 +0000 % @@ -32,17 +32,16 @@ % % #undef isinf % -#define isinf(bx) (((((bx).parts.msw & 0x7fffffff) - 0x7ff00000) | \ % - (bx).parts.lsw) == 0) % +#define isinf(x) (fabs(x) == INFINITY) % #undef isnan % -#define isnan(x) ((x)!=(x)) % +#define isnan(x) ((x) != (x)) % #undef signbit % -#define signbit(bx) (((bx).parts.msw & 0x80000000) != 0) % +#define signbit(x) (__builtin_signbit(x)) Use the same method as in catrigf.c instead of a more specialized method. Replacing these macros is just working around bugs in <math.h>. __builtin_inf() could be used too, but I don't use it because gcc-4.2 turns it into a libcall. The libcall is to isinf(), which only exists for historical reasons. clang does have a real bultin for it, and this does exactly the same as the above. __builtin_isnan() could be used too, but I don't use it because gcc-3.3 turns it into a libcall. % % /* We need that DBL_EPSILON^2 is larger than FOUR_SQRT_MIN. */ % static const double % A_crossover = 10, /* Hull et al suggest 1.5, but 10 works better */ % -B_crossover = 0.6417, /* suggested by Hull et al */ % +B_crossover = 0.6417, /* suggested by Hull et al */ % FOUR_SQRT_MIN = 0x1p-509, /* >= 4 * sqrt(DBL_MIN) */ % -QUARTER_SQRT_MAX = 0x1p509, /* <= 0.25 * sqrt(DBL_MAX) */ % +QUARTER_SQRT_MAX = 0x1p509, /* <= sqrt(DBL_MAX) / 4 */ Old style fixes. % m_e = 2.7182818284590452e0, /* 0x15bf0a8b145769.0p-51 */ % m_ln2 = 6.9314718055994531e-1, /* 0x162e42fefa39ef.0p-53 */ % @@ -271,5 +270,4 @@ % { % double x, y, ax, ay, rx, ry, B, sqrt_A2my2, new_y; % - ieee_double_shape_type bx, by; /* The "bits" of x and y. */ % int B_is_usable; % double complex w; % @@ -277,6 +275,4 @@ % x = creal(z); % y = cimag(z); % - bx.value = x; % - by.value = y; % ax = fabs(x); % ay = fabs(y); Removing the specail isfoo()'s gives some nice simplifications. I checked the object code, and saw that things like isinf() were being optimized well, partly because we take fabs() anyway and don't clobber the result. gcc uses the fabs() calculated above to feed to isinf() and doesn't take fabs() again. % @@ -284,8 +280,8 @@ % if (isnan(x) || isnan(y)) { % /* casinh(+-Inf + I*NaN) = +-Inf + I*NaN */ % - if (isinf(bx)) % + if (isinf(x)) % return (cpack(x, y+y)); % - /* casinh(NaN + I*+-Inf) = opt(-)Inf + I*NaN */ % - if (isinf(by)) % + /* casinh(NaN + I*+-Inf) = opt(+-)Inf + I*NaN */ % + if (isinf(y)) % return (cpack(y, x+x)); % /* casinh(NaN + I*0) = NaN + I*0 */ % @@ -301,6 +297,6 @@ % % if (ax > RECIP_EPSILON || ay > RECIP_EPSILON) % - if (isinf(bx) || isinf(by) || (int)(1+tiny)==1) { /* raise inexact */ % - if (signbit(bx) == 0) % + if (isinf(x) || isinf(y) || (int)(1+tiny)==1) { /* raise inexact */ % + if (signbit(x) == 0) % w = clog_for_large_values(z) + m_ln2; % else % @@ -348,5 +344,4 @@ % { % double x, y, ax, ay, rx, ry, B, sqrt_A2mx2, new_x; % - ieee_double_shape_type bx, by; /* The "bits" of x and y. */ % int sx, sy; % int B_is_usable; % @@ -355,8 +350,6 @@ % x = creal(z); % y = cimag(z); % - bx.value = x; % - by.value = y; % - sx = signbit(bx); % - sy = signbit(by); % + sx = signbit(x); % + sy = signbit(y); % ax = fabs(x); % ay = fabs(y); % @@ -364,8 +357,8 @@ % if (isnan(x) || isnan(y)) { % /* cacos(+-Inf + I*NaN) = NaN + I*opt(-)Inf */ % - if (isinf(bx)) % + if (isinf(x)) % return (cpack(y+y, -INFINITY)); % /* cacos(NaN + I*+-Inf) = NaN + I*-+Inf */ % - if (isinf(by)) % + if (isinf(y)) % return (cpack(x+x, -y)); % /* cacos(0 + I*NaN) = PI/2 + I*NaN */ % @@ -380,5 +373,5 @@ % % if (ax > RECIP_EPSILON || ay > RECIP_EPSILON) % - if (isinf(bx) || isinf(by) || (int)(1+tiny)==1) { /* raise inexact */ % + if (isinf(x) || isinf(y) || (int)(1+tiny)==1) { /* raise inexact */ % w = clog_for_large_values(z); % rx = fabs(cimag(w)); % @@ -548,10 +541,7 @@ % { % double x, y, ax, ay, rx, ry; % - ieee_double_shape_type bx, by; /* The "bits" of x and y. */ % % x = creal(z); % y = cimag(z); % - bx.value = x; % - by.value = y; % ax = fabs(x); % ay = fabs(y); % @@ -559,8 +549,8 @@ % if (isnan(x) || isnan(y)) { % /* catanh(+-Inf + I*NaN) = +-0 + I*NaN */ % - if (isinf(bx)) % + if (isinf(x)) % return (cpack(copysign(0, x), y+y)); % /* catanh(NaN + I*+-Inf) = sign(NaN)0 + I*+-PI/2 */ % - if (isinf(by)) % + if (isinf(y)) % return (cpack(copysign(0, x), copysign(m_pi_2, y))); % /* catanh(+-0 + I*NaN) = +-0 + I*NaN */ % @@ -576,5 +566,5 @@ % % /* If x or y is inf, then catanh(x + I*y) = 0 + I*sign(y)*PI/2 */ % - if (isinf(bx) || isinf(by)) % + if (isinf(x) || isinf(y)) % return (cpack(copysign(0, x), copysign(m_pi_2, y))); % The isfoo() changes in double precision give a tiny optimization. % diff -u2 catrigf.c~ catrigf.c % --- catrigf.c~ 2012-09-14 17:16:16.000000000 +0000 % +++ catrigf.c 2012-09-15 10:55:32.594098000 +0000 % @@ -44,7 +44,7 @@ % #define isinf(x) (fabsf(x) == INFINITY) % #undef isnan % -#define isnan(x) ((x)!=(x)) % +#define isnan(x) ((x) != (x)) Old style fix. % #undef signbit % -#define signbit(x) (__builtin_signbit(x)) % +#define signbit(x) (__builtin_signbitf(x)) % % static const float This builtin is not type-generic, so we should use a type suffix in it. Without the type suffix, it works. __builtin_isnan() would work too. But __builtin_isinf() would be broken. Without a type suffix or with a wrong one, larger code is generated but this is only a minor pessimization. % @@ -55,5 +55,5 @@ % m_e = 2.7182818285e0, /* 0xadf854.0p-22 */ % m_ln2 = 6.9314718056e-1, /* 0xb17218.0p-24 */ % -m_pi_2 = 1.5707963268e0, /* 0xc90fdb.0p-23 */ % +m_pi_2 = 1.5707963268e0, /* 0xc90fdb.0p-23 */ % RECIP_EPSILON = 1/FLT_EPSILON, % SQRT_MIN = 0x1p-63; Old style fix. % diff -u2 catrigl.c~ catrigl.c % --- catrigl.c~ 2012-09-14 17:12:54.000000000 +0000 % +++ catrigl.c 2012-09-15 10:56:19.985657000 +0000 % @@ -42,4 +42,11 @@ % #include "math_private.h" % % +#undef isinf % +#define isinf(x) (fabsl(x) == INFINITY) % +#undef isnan % +#define isnan(x) ((x) != (x)) % +#undef signbit % +#define signbit(x) (__builtin_signbitl(x)) % + % static const long double % A_crossover = 10, Now it is a fairly large optimization to avoid using extern functions for these. % @@ -65,5 +72,5 @@ % #else % #error "Unsupported long double format" % -#endif % +#endif % % static const volatile long double Example of routine style fixes: @ --- catrig.c.BAK 2012-09-15 13:38:59.482263000 +0000 @ +++ catrig.c 2012-09-15 13:38:59.485259000 +0000 @ @@ -125,7 +125,9 @@ @ f(double a, double b, double hypot_a_b) @ { @ - if (b < 0) return ((hypot_a_b - b) / 2); @ - if (b == 0) return (a/2); @ - return (a*a / (hypot_a_b + b) / 2); @ + if (b < 0) @ + return ((hypot_a_b - b) / 2); @ + if (b == 0) @ + return (a / 2); @ + return (a * a / (hypot_a_b + b) / 2); @ } indent(1) produces a 424-line patch with about 300 lines correct and looking like the above. I don't mind a*a, but insist on the multiple statements per line being fixed, and mixed styles are hard to maintain. Also function calls in initializers and long lines. indent(1) doesn't understand the last 2. Bruce
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20120915231032.C2669>