From owner-freebsd-numerics@FreeBSD.ORG  Fri Sep 14 12:06:35 2012
Return-Path: <owner-freebsd-numerics@FreeBSD.ORG>
Delivered-To: freebsd-numerics@freebsd.org
Received: from mx1.freebsd.org (mx1.freebsd.org [69.147.83.52])
	by hub.freebsd.org (Postfix) with ESMTP id 4DC74106564A
	for <freebsd-numerics@freebsd.org>;
	Fri, 14 Sep 2012 12:06:35 +0000 (UTC)
	(envelope-from brde@optusnet.com.au)
Received: from mail02.syd.optusnet.com.au (mail02.syd.optusnet.com.au
	[211.29.132.183])
	by mx1.freebsd.org (Postfix) with ESMTP id 91B2D8FC14
	for <freebsd-numerics@freebsd.org>;
	Fri, 14 Sep 2012 12:06:34 +0000 (UTC)
Received: from c122-106-157-84.carlnfd1.nsw.optusnet.com.au
	(c122-106-157-84.carlnfd1.nsw.optusnet.com.au [122.106.157.84])
	by mail02.syd.optusnet.com.au (8.13.1/8.13.1) with ESMTP id
	q8EC6Uvc007136
	(version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-SHA bits=256 verify=NO);
	Fri, 14 Sep 2012 22:06:32 +1000
Date: Fri, 14 Sep 2012 22:06:30 +1000 (EST)
From: Bruce Evans <brde@optusnet.com.au>
X-X-Sender: bde@besplex.bde.org
To: Stephen Montgomery-Smith <stephen@missouri.edu>
In-Reply-To: <50526050.2070303@missouri.edu>
Message-ID: <20120914212403.H1983@besplex.bde.org>
References: <5017111E.6060003@missouri.edu> <501EC015.3000808@missouri.edu>
	<20120805191954.GA50379@troutmask.apl.washington.edu>
	<20120807205725.GA10572@server.rulingia.com>
	<20120809025220.N4114@besplex.bde.org>
	<5027F07E.9060409@missouri.edu> <20120814003614.H3692@besplex.bde.org>
	<50295F5C.6010800@missouri.edu> <20120814072946.S5260@besplex.bde.org>
	<50297CA5.5010900@missouri.edu> <50297E43.7090309@missouri.edu>
	<20120814201105.T934@besplex.bde.org> <502A780B.2010106@missouri.edu>
	<20120815223631.N1751@besplex.bde.org> <502C0CF8.8040003@missouri.edu>
	<20120906221028.O1542@besplex.bde.org> <5048D00B.8010401@missouri.edu>
	<504D3CCD.2050006@missouri.edu> <504FF726.9060001@missouri.edu>
	<20120912191556.F1078@besplex.bde.org>
	<20120912225847.J1771@besplex.bde.org>
	<50511B40.3070009@missouri.edu> <20120913204808.T1964@besplex.bde.org>
	<5051F59C.6000603@missouri.edu> <20120914014208.I2862@besplex.bde.org>
	<50526050.2070303@missouri.edu>
MIME-Version: 1.0
Content-Type: TEXT/PLAIN; charset=US-ASCII; format=flowed
Cc: freebsd-numerics@freebsd.org
Subject: Re: Complex arg-trig functions
X-BeenThere: freebsd-numerics@freebsd.org
X-Mailman-Version: 2.1.5
Precedence: list
List-Id: "Discussions of high quality implementation of libm functions."
	<freebsd-numerics.freebsd.org>
List-Unsubscribe: <http://lists.freebsd.org/mailman/listinfo/freebsd-numerics>, 
	<mailto:freebsd-numerics-request@freebsd.org?subject=unsubscribe>
List-Archive: <http://lists.freebsd.org/pipermail/freebsd-numerics>
List-Post: <mailto:freebsd-numerics@freebsd.org>
List-Help: <mailto:freebsd-numerics-request@freebsd.org?subject=help>
List-Subscribe: <http://lists.freebsd.org/mailman/listinfo/freebsd-numerics>, 
	<mailto:freebsd-numerics-request@freebsd.org?subject=subscribe>
X-List-Received-Date: Fri, 14 Sep 2012 12:06:35 -0000

On Thu, 13 Sep 2012, Stephen Montgomery-Smith wrote:

> OK, I think I have implemented all your suggestions now.  Tell me what you 
> think.  (Today I am a little more spacey than usual, but I think I got it 
> all.)

Not quite.  After merging them, I get:

% diff -u2 catrig.c~ catrig.c
% --- catrig.c~	2012-09-14 01:49:04.000000000 +0000
% +++ catrig.c	2012-09-14 11:15:42.305034000 +0000
% @@ -27,5 +27,6 @@
%  #include <complex.h>
%  #include <float.h>
% -#include <math.h>
% +
% +#include "math.h"
%  #include "math_private.h"
%

Start cleaning up includes.

% @@ -37,13 +38,14 @@
%  /* We need that DBL_EPSILON^2 is larger than FOUR_SQRT_MIN. */
%  static const double
% -A_crossover =		10, /* Hull et al suggest 1.5, but 10 works better. */
% -B_crossover =		0.6417, /* suggested by Hull et al. */
% +A_crossover =		10, /* Hull et al suggest 1.5, but 10 works better */
% +B_crossover =		0.6417,		/* suggested by Hull et al */

Old changes.

%  FOUR_SQRT_MIN =		0x1p-509,	/* greater than 4 * sqrt(DBL_MIN) */
% -QUARTER_SQRT_MAX =	0x1p509,	/* less than 0.25 * sqrt(DBL_MAX) */
% +					/* XXX this equal, not greater */
% +QUARTER_SQRT_MAX =	0x1p509,	/* less than sqrt(DBL_MAX) / 4 */

Change halfs and quarters to division by 2 and 4 in comments too.  This
gives large diffs.

QUARTER_SQRT_MAX satisfies the fixed comment (it is smaller by a factor
of about sqrt(2), since DBL_MAX is close to 2**1023 and not close to
2**1022), but FOUR_SQRT_MIN doesn't satisify the fixed comment (it is
exact since DBL_MIN is exactly 2**-1022 and I calculated the sqrt()
to accurately by halving the exponent).  This seems to be harmless,
but the comment should agree.

%  m_e =			2.7182818284590452e0,	/*  0x15bf0a8b145769.0p-51 */
%  m_ln2 =			6.9314718055994531e-1,	/*  0x162e42fefa39ef.0p-53 */
%  m_pi_2 =		1.5707963267948966e0,	/*  0x1921fb54442d18.0p-52 */
%  RECIP_EPSILON =		1/DBL_EPSILON,
% -SQRT_MIN =		0x1p-511;	/* greater than sqrt(DBL_MIN) */
% +SQRT_MIN =		0x1p-511;	/* >= sqrt(DBL_MIN) */

For all uses of this, having a precise threshold is good, so I changed the
comment.  There was only 1 use, but I added another.

% 
%  static const volatile double
% @@ -82,6 +84,6 @@
%   * casinh(z) = sign(x)*log(A+sqrt(A*A-1)) + I*asin(B)
%   * where
% - * A = 0.5 * (|z+I| + |z-I|)
% - * B = 0.5 * (|z+I| - |z-I|) = y/A
% + * A = (|z+I| + |z-I|) / 2
% + * B = (|z+I| - |z-I|) / 2 = y/A
%   *
%   * These formulas become numerically unstable:
% @@ -93,5 +95,5 @@
%   *
%   * These numerical problems are overcome by defining
% - * f(a, b) = 0.5 * (hypot(a, b) - b) = 0.5 * a*a / (hypot(a, b) + b)
% + * f(a, b) = (hypot(a, b) - b) / 2 = a*a / (hypot(a, b) + b) / 2
%   * Then if A < A_crossover, we use
%   *   log(A + sqrt(A*A-1)) = log1p((A-1) + sqrt((A-1)*(A+1)))
% @@ -116,5 +118,5 @@
% 
%  /*
% - * Function f(a, b, hypot_a_b) = 0.5 * (hypot(a, b) - b).
% + * Function f(a, b, hypot_a_b) = (hypot(a, b) - b) / 2.
%   * Pass hypot(a, b) as the third argument.
%   */
% @@ -146,5 +148,5 @@
%  	S = hypot(x, y-1); /* |z-I| */
% 
% -	/* A = 0.5 * (|z+I| + |z-I|) */
% +	/* A = (|z+I| + |z-I|) / 2 */
%  	A = (R + S) / 2;
%  	/*
% @@ -170,5 +172,5 @@
%  		} else if (y == 1) {
%  			/*
% -			 * fp is of order x^2, and fm = 0.5*x.
% +			 * fp is of order x^2, and fm = x/2.
%  			 * A = 1 (inexactly).
%  			 */
% @@ -177,5 +179,5 @@
%  		} else if (y < 1) {
%  			/*
% -			 * fp = 0.25*x*x/(1+y), fm = 0.25*x*x/(1-y), and
% +			 * fp = x*x/(1+y)/4, fm = x*x/(1-y)/4, and
%  			 * A = 1 (inexactly).
%  			 */
% @@ -208,5 +210,5 @@
%  	}
% 
% -	/* B = 0.5 * (|z+I| - |z-I|) = y/A */
% +	/* B = (|z+I| - |z-I|) / 2 = y/A */
%  	*B = y/A;
%  	*B_is_usable = 1;
% @@ -228,12 +230,12 @@
%  		} else if (y == 1) {
%  			/*
% -			 * fp is of order x^2, and fm = 0.5*x.
% +			 * fp is of order x^2, and fm = x/2.
%  			 * A = 1 (inexactly).
%  			 */
%  			if ((int)x==0) /* raise inexact */
% -				*sqrt_A2my2 = sqrt(x)*sqrt(0.5*(A+y));
% +				*sqrt_A2my2 = sqrt(x)*sqrt((A+y)/2);

Stray 0.5 in the code too.

%  		} else if (y > 1) {
%  			/*
% -			 * fp = 0.25*x*x/(y+1), fm = 0.25*x*x/(y-1), and
% +			 * fp = x*x/(y+1)/4, fm = x*x/(y-1)/4, and
%  			 * A = y (inexactly).
%  			 *
% @@ -282,7 +284,7 @@
%  		if (ISINF(bx))
%  			return (cpack(x, y+y));
% -		/* casinh(NaN + I*+-Inf) = +-Inf + I*NaN (sign optional) */
% +		/* casinh(NaN + I*+-Inf) = opt(+-)Inf + I*NaN */
%  		if (ISINF(by))
% -			return (cpack(INFINITY, x+x));
% +			return (cpack(y, x+x));
%  		/* casinh(NaN + I*0) = NaN + I*0 */
%  		if (y == 0) return (cpack(x+x, y));

You put the optional choice in the wrong function (cacos() instead of
casin()).  My choice makes casin() odd for this infinity.

Start using the notation opt(whatever) for the choice.

% @@ -359,7 +361,7 @@
% 
%  	if (ISNAN(x) || ISNAN(y)) {
% -		/* cacos(+-Inf + I*NaN) = NaN + I*-+Inf */
% +		/* cacos(+-Inf + I*NaN) = NaN + I*opt(-)Inf */
%  		if (ISINF(bx))
% -			return (cpack(y+y, -x));
% +			return (cpack(y+y, -INFINITY));
%  		/* cacos(NaN + I*+-Inf) = NaN + I*-+Inf */
%  		if (ISINF(by))

Now my choice is a fixed sign (minus, selected from the signs visible
in the description of the input parameters).  My choice makes cacos()
even for this infinitity, and hopefully as continuous as possible
when x -> +-Inf.

% @@ -416,8 +418,10 @@
%  {
%  	double complex w;
% -	double x, y, rx, ry;
% +	double rx, ry;
% +
%  	w = cacos(z);
%  	rx = creal(w);
%  	ry = cimag(w);
% +
%  	/* cacosh(NaN + I*NaN) = NaN + I*NaN */
%  	if (ISNAN(rx) && ISNAN(ry))
% @@ -434,5 +438,5 @@
% 
%  /*
% - * Optimized version of clog() for |z| finite and larger than ~1/DBL_EPSILON.
% + * Optimized version of clog() for |z| finite and larger than ~RECIP_EPSILON.
%   */
%  static double complex
% @@ -453,4 +457,5 @@
% 
%  /*
% + * XXX this comment is now misindented.
%   * Avoid overflow in hypot() when x and y are both very large.
%   * Divide x and y by E, and then add 1 to the logarithm.  This depends on E
% @@ -462,9 +467,9 @@
%  		return (cpack(log(hypot(x / m_e, y / m_e)) + 1, atan2(y, x)));
% 
% -/*
% - * Avoid overflow when x or y is large.  Avoid underflow when x or
% - * y is small.
% - */
% -	if (ay < FOUR_SQRT_MIN || ax > QUARTER_SQRT_MAX)
% +	/*
% +	 * Avoid overflow when x or y is large.  Avoid underflow when x or
% +	 * y is small.
% +	 */
% +	if (ax > QUARTER_SQRT_MAX || ay < SQRT_MIN)
%  		return (cpack(log(hypot(x, y)), atan2(y, x)));
%

SQRT_MIN is the correct threshold, so use it now that it is available.
This is just an optimization.  ax > EPSILON, so we could discard y*y
below a much larger threshold, but by using the smallest possible
threshold we get the fast code in all possible cases.

Old change to sort the comparisons in the code to match the comment.

% @@ -527,8 +532,8 @@
%  }
% 
% -/*
% - * catanh(z) = 0.5 * log((1+z)/(1-z))
% - *           = 0.25 * log1p(4*x / |z-1|^2)
% - *             + 0.5 * I * atan2(2y, (1-x)*(1+x)-y*y)
% +/* 
% + * catanh(z) = log((1+z)/(1-z)) / 2
% + *           = log1p(4*x / |z-1|^2) / 4
% + *             + I * atan2(2y, (1-x)*(1+x)-y*y) / 2
%   *
%   * catanh(z) = z + O(1/|z|^3)   as z -> 0
% @@ -556,8 +561,8 @@
%  		if (ISINF(bx))
%  			return (cpack(copysign(0, x), y+y));
% -		/* catanh(NaN + I*+-Inf) = 0 + I*+-PI/2 */
% +		/* catanh(NaN + I*+-Inf) = sign(NaN)0 + I*+-PI/2 */
%  		if (ISINF(by))
%  			return (cpack(copysign(0, x), copysign(m_pi_2, y)));
% -		/* catanh(0 + I*NaN) = 0 + I*NaN */
% +		/* catanh(+-0 + I*NaN) = +-0 + I*NaN */
%  		if (x == 0)
%  			return (cpack(x, y+y));

You made some subtle changes near here, and I noticed these.

The first change documents copying the sign of the NaN to a signed 0.
I think copying the sign of a NaN shouldn't be done, especually when
it takes extra code as here, but that's what the code does.

The sccond change documents that the result is a signed 0, with the
sign coming from the arg 0.  Perhaps plain 0 should have these semantics,
with the sign only mentioned when it is not there on either side, but
that seems backwards.

% diff -u2 catrigf.c~ catrigf.c
% --- catrigf.c~	2012-09-14 01:49:19.000000000 +0000
% +++ catrigf.c	2012-09-14 10:49:46.482399000 +0000
% @@ -37,17 +37,25 @@
%  #include <complex.h>
%  #include <float.h>
% -#include <math.h>
% +
% +#include "math.h"
%  #include "math_private.h"
% 
% +#undef isinf
% +#define isinf(x)	(fabsf(x) == INFINITY)
% +#undef isnan
% +#define isnan(x)	((x) != (x))
% +#undef signbit
% +#define signbit(x)	(__builtin_signbit(x))
% +

Local hacks to try to speed this up.  It still seems slower than double
precision.

%  static const float
%  A_crossover =		10,
%  B_crossover =		0.6417,
%  FOUR_SQRT_MIN =		0x1p-61,

This may be too small.

% -QUARTER_SQRT_MAX =	0x1p60,
% +QUARTER_SQRT_MAX =	0x1p61,

0x1p61 was already an underestimate, so it doesn't need to be reduced
more.

%  m_e =			2.7182818285e0,		/*  0xadf854.0p-22 */
%  m_ln2 =			6.9314718056e-1,	/*  0xb17218.0p-24 */
% -m_pi_2 =		1.5707963268e0,		/*  0xc90fdb.0p-23 */ 
% +m_pi_2 =		1.5707963268e0,		/*  0xc90fdb.0p-23 */
%  RECIP_EPSILON =		1/FLT_EPSILON,
% -SQRT_MIN =		0x1p-511;
% +SQRT_MIN =		0x1p-63;

This was broken (underflowed to 0).

% 
%  static const volatile float
% @@ -113,5 +121,5 @@
%  		} else if (y == 1) {
%  			if ((int)x==0)
% -				*sqrt_A2my2 = sqrtf(x)*sqrtf(0.5*(A+y));
% +				*sqrt_A2my2 = sqrtf(x)*sqrtf((A+y)/2);
%  		} else if (y > 1) {
%  			if ((int)x==0)
% @@ -141,5 +149,5 @@
%  			return (cpackf(x, y+y));
%  		if (isinf(y))
% -			return (cpackf(INFINITY, x+x));
% +			return (cpackf(y, x+x));
%  		if (y == 0) return (cpackf(x+x, y));
%  		return (cpackf(x+0.0L+(y+0), x+0.0L+(y+0)));
% @@ -191,5 +199,5 @@
%  	if (isnan(x) || isnan(y)) {
%  		if (isinf(x))
% -			return (cpackf(y+y, -x));
% +			return (cpackf(y+y, -INFINITY));
%  		if (isinf(y))
%  			return (cpackf(x+x, -y));
% @@ -235,5 +243,6 @@
%  {
%  	float complex w;
% -	float x, y, rx, ry;
% +	float rx, ry;
% +
%  	w = cacosf(z);
%  	rx = crealf(w);
% @@ -267,5 +276,5 @@
%  		return (cpackf(logf(hypotf(x / m_e, y / m_e)) + 1, atan2f(y, x)));
% 
% -	if (ay < FOUR_SQRT_MIN || ax > QUARTER_SQRT_MAX)
% +	if (ax > QUARTER_SQRT_MAX || ay < SQRT_MIN)
%  		return (cpackf(logf(hypotf(x, y)), atan2f(y, x)));
% 
% diff -u2 catrigl.c~ catrigl.c
% --- catrigl.c~	2012-09-14 01:49:21.000000000 +0000
% +++ catrigl.c	2012-09-14 10:15:19.216734000 +0000
% @@ -37,15 +37,16 @@
%  #include <complex.h>
%  #include <float.h>
% -#include <math.h>
% +
% +#include "fpmath.h"
% +#include "math.h"
%  #include "math_private.h"
% -#include <_fpmath.h>

The normal _fpmath.h is libc/i386/_fpmath.h; it is supposed to be
included by fpmath.h, which is libc/include/fpmath.h.  The
non-modularity of these can cause problems, but I have include paths
to them set up in dozens of scripts and makefiles, where the script
or makefile has complications to figure out a path to an out-of-tree
libc, so I only notice the problems when the source tree layout changes.
Another -I path turns <math.h> into the source tree math.h, so that
<math.h> instead of "math.h" is just a style bug.

% 
%  static const long double
% -FOUR_SQRT_MIN =		0x1p-8000L,
% -QUARTER_SQRT_MAX =	0x1p8000L,
%  A_crossover =		10,
%  B_crossover =		0.6417,
% +FOUR_SQRT_MIN =		0x1p-8189L,
% +QUARTER_SQRT_MAX =	0x1p8189L,

Use my possibly-too-exact constents and sort them.

%  RECIP_EPSILON =		1/LDBL_EPSILON,
% -SQRT_MIN =		0x1p-8191;
% +SQRT_MIN =		0x1p-8191L;

This was broken (underflowed to 0 in a different way).

% 
%  #if LDBL_MANT_DIG == 64
% @@ -64,8 +65,8 @@
%  #else
%  #error "Unsupported long double format"
% -#endif 
% +#endif
% 
%  static const volatile long double
% -tiny =			0x1p-16000L;
% +tiny =			0x1p-10000L;

Use the normal magic number.

% 
%  static long double complex clog_for_large_values(long double complex z);
% @@ -128,5 +129,5 @@
%  		} else if (y == 1) {
%  			if ((int)x==0)
% -				*sqrt_A2my2 = sqrtl(x)*sqrtl(0.5*(A+y));
% +				*sqrt_A2my2 = sqrtl(x)*sqrtl((A+y)/2);
%  		} else if (y > 1) {
%  			if ((int)x==0)
% @@ -156,5 +157,5 @@
%  			return (cpackl(x, y+y));
%  		if (isinf(y))
% -			return (cpackl(INFINITY, x+x));
% +			return (cpackl(y, x+x));
%  		if (y == 0) return (cpackl(x+x, y));
%  		return (cpackl(x+0.0L+(y+0), x+0.0L+(y+0)));
% @@ -206,5 +207,5 @@
%  	if (isnan(x) || isnan(y)) {
%  		if (isinf(x))
% -			return (cpackl(y+y, -x));
% +			return (cpackl(y+y, -INFINITY));
%  		if (isinf(y))
%  			return (cpackl(x+x, -y));
% @@ -250,5 +251,6 @@
%  {
%  	long double complex w;
% -	long double x, y, rx, ry;
% +	long double rx, ry;
% +
%  	w = cacosl(z);
%  	rx = creall(w);
% @@ -282,5 +284,5 @@
%  		return (cpackl(logl(hypotl(x / m_e, y / m_e)) + 1, atan2l(y, x)));
% 
% -	if (ay < FOUR_SQRT_MIN || ax > QUARTER_SQRT_MAX)
% +	if (ax > QUARTER_SQRT_MAX || ay < SQRT_MIN)
%  		return (cpackl(logl(hypotl(x, y)), atan2l(y, x)));
%

Bruce