Date: Mon, 21 Jul 2003 17:32:28 -0700 (PDT) From: Peter Wemm <peter@FreeBSD.org> To: Perforce Change Reviews <perforce@freebsd.org> Subject: PERFORCE change 34799 for review Message-ID: <200307220032.h6M0WSSJ023138@repoman.freebsd.org>
next in thread | raw e-mail | index | archive | help
http://perforce.freebsd.org/chv.cgi?CH=34799

Change 34799 by peter@peter_hammer on 2003/07/21 17:31:45

	initial shot at fleshing out SSE support in the fp*() API

Affected files ...

.. //depot/projects/hammer/sys/amd64/include/ieeefp.h#2 edit

Differences ...

==== //depot/projects/hammer/sys/amd64/include/ieeefp.h#2 (text+ko) ====

@@ -90,6 +90,13 @@
 #define	FP_RND_FLD	0xc00	/* round control field */
 #define	FP_STKY_FLD	0x3f	/* sticky flags field */
 
+#define	SSE_STKY_FLD	0x3f	/* exception flags */
+#define	SSE_DAZ_FLD	0x40	/* Denormals are zero */
+#define	SSE_MSKS_FLD	0x1f80	/* exception masks field */
+#define	SSE_RND_FLD	0x6000	/* rounding control */
+#define	SSE_FZ_FLD	0x8000	/* flush to zero on underflow */
+
+
 /*
  * FP register bit field offsets
  */
@@ -98,84 +105,164 @@
 #define	FP_RND_OFF	10	/* round control offset */
 #define	FP_STKY_OFF	0	/* sticky flags offset */
 
+#define	SSE_STKY_OFF	0	/* exception flags offset */
+#define	SSE_DAZ_OFF	6	/* denormals are zero offset */
+#define	SSE_MSKS_OFF	7	/* other exception masks offset */
+#define	SSE_RND_OFF	13	/* rounding control offset */
+#define	SSE_FZ_OFF	15	/* flush to zero offset */
+
 #ifdef __GNUC__
 
 #define	__fldenv(addr)	__asm __volatile("fldenv %0" : : "m" (*(addr)))
 #define	__fnstenv(addr)	__asm __volatile("fnstenv %0" : "=m" (*(addr)))
+#define	__fldcw(addr)	__asm __volatile("fldcw %0" : : "m" (*(addr)))
 #define	__fnstcw(addr)	__asm __volatile("fnstcw %0" : "=m" (*(addr)))
 #define	__fnstsw(addr)	__asm __volatile("fnstsw %0" : "=m" (*(addr)))
+#define	__ldmxcsr(addr)	__asm __volatile("ldmxcsr %0" : : "m" (*(addr)))
+#define	__stmxcsr(addr)	__asm __volatile("stmxcsr %0" : "=m" (*(addr)))
 
 /*
- * return the contents of a FP register
+ * General notes about conflicting SSE vs FP status bits.
+ * This code assumes that software will not fiddle with the control
+ * bits of the SSE and x87 in such a way to get them out of sync and
+ * still expect this to work.  Break this at your peril.
+ * Because I based this on the i386 port, the x87 state is used for
+ * the fpget*() functions, and is shadowed into the SSE state for
+ * the fpset*() functions.  For dual source fpget*() functions, I
+ * merge the two together.  I think.
+ */
+
+/* Get and set rounding control */
+static __inline__ fp_rnd_t
+fpgetround(void)
+{
+	unsigned short _cw;
+
+	__fnstcw(&_cw);
+	return ((_cw & FP_RND_FLD) >> FP_RND_OFF);
+}
+
+static __inline__ fp_rnd_t
+fpsetround(fp_rnd_t _m)
+{
+	unsigned short _cw;
+	unsigned int _mxcsr;
+	fp_rnd_t _p;
+
+	__fnstcw(&_cw);
+	_p = (_cw & FP_RND_FLD) >> FP_RND_OFF;
+	_cw &= ~FP_RND_FLD;
+	_cw |= (_m << FP_RND_OFF) & FP_RND_FLD;
+	__fldcw(&_cw);
+	__stmxcsr(&_mxcsr);
+	_mxcsr &= ~SSE_RND_FLD;
+	_mxcsr |= (_m << SSE_RND_OFF) & SSE_RND_FLD;
+	__ldmxcsr(&_mxcsr);
+	return (_p);
+}
+
+/*
+ * Set precision for fadd/fsub/fsqrt etc x87 instructions
+ * There is no equivalent SSE mode or control.  It always runs
+ * in 64 bit precision mode for SSE2 calculations.
  */
-static __inline__ int
-__fpgetreg(int _reg)
+static __inline__ fp_prec_t
+fpgetprec(void)
+{
+	unsigned short _cw;
+
+	__fnstcw(&_cw);
+	return ((_cw & FP_PRC_FLD) >> FP_PRC_OFF);
+}
+
+static __inline__ fp_prec_t
+fpsetprec(fp_prec_t _m)
 {
-	unsigned short _mem;
+	unsigned short _cw;
+	fp_prec_t _p;
 
-	/*-
-	 * This is more efficient than it looks.  The switch gets optimized
-	 * away if _reg is constant.
-	 *
-	 * The default case only supports _reg == 0.  We could handle more
-	 * registers (e.g., tags) using fnstenv, but the interface doesn't
-	 * support more.
-	 */
-	switch(_reg) {
-	default:
-		__fnstcw(&_mem);
-		break;
-	case FP_STKY_REG:
-		__fnstsw(&_mem);
-		break;
-	}
-	return _mem;
+	__fnstcw(&_cw);
+	_p = (_cw & FP_PRC_FLD) >> FP_PRC_OFF;
+	_cw &= ~FP_PRC_FLD;
+	_cw |= (_m << FP_PRC_OFF) & FP_PRC_FLD;
+	__fldcw(&_cw);
+	return (_p);
 }
 
 /*
- * set a FP mode; return previous mode
+ * Look at the exception masks
+ * Note that x87 masks are inverse of the fp*() functions
+ * API.  ie: mask = 1 means disable for x87 and SSE, but
+ * for the fp*() api, mask = 1 means enabled.
  */
-static __inline__ int
-__fpsetreg(int _m, int _reg, int _fld, int _off)
+static __inline__ fp_except_t
+fpgetmask(void)
+{
+	unsigned short _cw;
+
+	__fnstcw(&_cw);
+	return ((~_cw) & FP_MSKS_FLD);
+}
+
+static __inline__ fp_except_t
+fpsetmask(fp_except_t _m)
+{
+	unsigned short _cw;
+	unsigned int _mxcsr;
+	fp_except_t _p;
+
+	__fnstcw(&_cw);
+	_p = (~_cw) & FP_MSKS_FLD;
+	_cw &= ~FP_MSKS_FLD;
+	_cw |= (~_m) & FP_MSKS_FLD;
+	__fldcw(&_cw);
+	__stmxcsr(&_mxcsr);
+	/* XXX should clear non-ieee SSE_DAZ_FLD and SSE_FZ_FLD */
+	_mxcsr &= ~SSE_MSKS_FLD;
+	_mxcsr |= ((~_m) << SSE_MSKS_OFF) & SSE_MSKS_FLD;
+	__ldmxcsr(&_mxcsr);
+	return (_p);
+}
+
+/* See which sticky exceptions are pending, and reset them */
+static __inline__ fp_except_t
+fpgetsticky(void)
+{
+	unsigned short _sw;
+	unsigned int _mxcsr;
+	fp_except_t _ex;
+
+	__fnstsw(&_sw);
+	_ex = _sw & FP_STKY_FLD;
+	__stmxcsr(&_mxcsr);
+	_ex |= _mxcsr & SSE_STKY_FLD;
+	return (_ex);
+}
+
+static __inline__ fp_except_t
+fpresetsticky(fp_except_t _m)
 {
 	unsigned _env[7];
-	unsigned _p;
+	unsigned int _mxcsr;
+	fp_except_t _p;
 
-	/*
-	 * _reg == 0 could be handled better using fnstcw/fldcw.
-	 */
 	__fnstenv(_env);
-	_p =  (_env[_reg] & _fld) >> _off;
-	_env[_reg] = (_env[_reg] & ~_fld) | (_m << _off & _fld);
+	_p = _env[FP_STKY_REG] & _m;
+	__stmxcsr(&_mxcsr);
+	_p |= _mxcsr & SSE_STKY_FLD;
+	_env[FP_STKY_REG] &= ~_m;
 	__fldenv(_env);
-	return _p;
+	_mxcsr &= ~_m;
+	__ldmxcsr(&_mxcsr);
+	return (_p);
 }
 
-#endif /* __GNUC__ */
-
-/*
- * SysV/386 FP control interface
- */
-#define	fpgetround()	((fp_rnd_t)					\
-	((__fpgetreg(FP_RND_REG) & FP_RND_FLD) >> FP_RND_OFF))
-#define	fpsetround(m)	((fp_rnd_t)					\
-	__fpsetreg((m), FP_RND_REG, FP_RND_FLD, FP_RND_OFF))
-#define	fpgetprec()	((fp_prec_t)					\
-	((__fpgetreg(FP_PRC_REG) & FP_PRC_FLD) >> FP_PRC_OFF))
-#define	fpsetprec(m)	((fp_prec_t)					\
-	__fpsetreg((m), FP_PRC_REG, FP_PRC_FLD, FP_PRC_OFF))
-#define	fpgetmask()	((fp_except_t)					\
-	((~__fpgetreg(FP_MSKS_REG) & FP_MSKS_FLD) >> FP_MSKS_OFF))
-#define	fpsetmask(m)	((fp_except_t)					\
-	(~__fpsetreg(~(m), FP_MSKS_REG, FP_MSKS_FLD, FP_MSKS_OFF)) &	\
-	    (FP_MSKS_FLD >> FP_MSKS_OFF))
-#define	fpgetsticky()	((fp_except_t)					\
-	((__fpgetreg(FP_STKY_REG) & FP_STKY_FLD) >> FP_STKY_OFF))
-#define	fpresetsticky(m) ((fp_except_t)					\
-	__fpsetreg(0, FP_STKY_REG, (m), FP_STKY_OFF))
+/* It is called fpsetsticky(), but is really a reset function */
 #define	fpsetsticky(m)	fpresetsticky(m)
 
 /* Suppress prototypes in the MI header. */
 #define	_IEEEFP_INLINED_	1
 
+#endif /* __GNUC__ */
+
 #endif /* !_MACHINE_IEEEFP_H_ */
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200307220032.h6M0WSSJ023138>