Date: Sun, 16 Sep 2007 23:29:24 +0400 From: Andrey Chernov <ache@nagual.pp.ru> To: current@freebsd.org, i18n@freebsd.org Cc: perky@freebsd.org, petr.hroudny@gmail.com Subject: Ctype patch for review Message-ID: <20070916192924.GA12678@nagual.pp.ru>
next in thread | raw e-mail | index | archive | help
--fdj2RfSjLxBAspz7 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline The problem is: currently our single-byte ctype functions are broken for wide-character locales in the argument range >= 0x80 - they may return false positives. For example, for a UTF-8 locale we currently have: iswspace(0xA0)==1 and isspace(0xA0)==1 (because iswspace() and isspace() are the same code), but we must have isspace(0xA0)==0 (because in wide-character locales there is no such single-byte character, nor any other in the range 0x80..0xff; such locales keep only ASCII in the single-byte range because our internal wchar_t representation is UCS-4). The attached patch addresses this issue and also fixes iswascii() (currently iswascii() is broken for arguments > 0xFF). This patch is 100% binary compatible with old binaries; their (broken) behaviour is not changed. I would like to hear some comments. -- http://ache.pp.ru/ --fdj2RfSjLxBAspz7 Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename="ctype.patch" --- _ctype.h.old 2007-09-16 21:13:59.000000000 +0400 +++ _ctype.h 2007-09-16 23:00:38.000000000 +0400 @@ -63,6 +63,7 @@ #define _CTYPE_I 0x00080000L /* Ideogram */ #define _CTYPE_T 0x00100000L /* Special */ #define _CTYPE_Q 0x00200000L /* Phonogram */ +#define _CTYPE_WID 0x10000000L /* wide character function */ #define _CTYPE_SW0 0x20000000L /* 0 width character */ #define _CTYPE_SW1 0x40000000L /* 1 width character */ #define _CTYPE_SW2 0x80000000L /* 2 width character */ @@ -87,6 +88,8 @@ #define __inline #endif +extern int __mb_cur_max; + /* * Use inline functions if we are allowed to and the compiler supports them. */ @@ -98,8 +101,11 @@ static __inline int __maskrune(__ct_rune_t _c, unsigned long _f) { - return ((_c < 0 || _c >= _CACHED_RUNES) ? ___runetype(_c) : + return __mb_cur_max > 1 && !(_f & _CTYPE_WID) && (_c >= 0x80) ? 0 : + ((_c < 0 || _c >= _CACHED_RUNES) ? 
___runetype(_c) : _CurrentRuneLocale->__runetype[_c]) & _f; + /* We never set _CTYPE_WID in the locale data, */ + /* so can skip ... & (_f & ~_CTYPE_WID). */ } static __inline int @@ -111,8 +117,11 @@ static __inline int __isctype(__ct_rune_t _c, unsigned long _f) { - return (_c < 0 || _c >= _CACHED_RUNES) ? 0 : + return __mb_cur_max > 1 && !(_f & _CTYPE_WID) && (_c >= 0x80) ? 0 : + (_c < 0 || _c >= _CACHED_RUNES) ? 0 : !!(_DefaultRuneLocale.__runetype[_c] & _f); + /* We never set _CTYPE_WID in the locale data, */ + /* so can skip ... & (_f & ~_CTYPE_WID). */ } static __inline __ct_rune_t @@ -129,6 +138,22 @@ _CurrentRuneLocale->__maplower[_c]; } +static __inline __ct_rune_t +__tosupper(__ct_rune_t _c) +{ + return __mb_cur_max > 1 && (_c >= 0x80) ? _c : + (_c < 0 || _c >= _CACHED_RUNES) ? ___toupper(_c) : + _CurrentRuneLocale->__mapupper[_c]; +} + +static __inline __ct_rune_t +__toslower(__ct_rune_t _c) +{ + return __mb_cur_max > 1 && (_c >= 0x80) ? _c : + (_c < 0 || _c >= _CACHED_RUNES) ? ___tolower(_c) : + _CurrentRuneLocale->__maplower[_c]; +} + static __inline int __wcwidth(__ct_rune_t _c) { @@ -150,6 +175,8 @@ int __isctype(__ct_rune_t, unsigned long); __ct_rune_t __toupper(__ct_rune_t); __ct_rune_t __tolower(__ct_rune_t); +__ct_rune_t __tosupper(__ct_rune_t); +__ct_rune_t __toslower(__ct_rune_t); int __wcwidth(__ct_rune_t); __END_DECLS #endif /* using inlines */ --- ctype.h.old 2007-09-16 22:03:55.000000000 +0400 +++ ctype.h 2007-09-16 22:56:10.000000000 +0400 @@ -97,8 +97,8 @@ #define isspace(c) __istype((c), _CTYPE_S) #define isupper(c) __istype((c), _CTYPE_U) #define isxdigit(c) __isctype((c), _CTYPE_X) /* ANSI -- locale independent */ -#define tolower(c) __tolower(c) -#define toupper(c) __toupper(c) +#define tolower(c) __toslower(c) +#define toupper(c) __tosupper(c) #if __XSI_VISIBLE /* @@ -112,8 +112,8 @@ * * XXX isascii() and toascii() should similarly be undocumented. 
*/ -#define _tolower(c) __tolower(c) -#define _toupper(c) __toupper(c) +#define _tolower(c) __toslower(c) +#define _toupper(c) __tosupper(c) #define isascii(c) (((c) & ~0x7F) == 0) #define toascii(c) ((c) & 0x7F) #endif @@ -128,7 +128,7 @@ #define isideogram(c) __istype((c), _CTYPE_I) #define isnumber(c) __istype((c), _CTYPE_D) #define isphonogram(c) __istype((c), _CTYPE_Q) -#define isrune(c) __istype((c), 0xFFFFFF00L) +#define isrune(c) __istype((c), 0xFFFFFF00L & ~_CTYPE_WID) #define isspecial(c) __istype((c), _CTYPE_T) #endif --- wctype.h.old 2007-09-16 21:59:37.000000000 +0400 +++ wctype.h 2007-09-16 22:56:44.000000000 +0400 @@ -89,30 +89,30 @@ #endif __END_DECLS -#define iswalnum(wc) __istype((wc), _CTYPE_A|_CTYPE_D) -#define iswalpha(wc) __istype((wc), _CTYPE_A) -#define iswblank(wc) __istype((wc), _CTYPE_B) -#define iswcntrl(wc) __istype((wc), _CTYPE_C) -#define iswctype(wc, charclass) __istype((wc), (charclass)) -#define iswdigit(wc) __isctype((wc), _CTYPE_D) -#define iswgraph(wc) __istype((wc), _CTYPE_G) -#define iswlower(wc) __istype((wc), _CTYPE_L) -#define iswprint(wc) __istype((wc), _CTYPE_R) -#define iswpunct(wc) __istype((wc), _CTYPE_P) -#define iswspace(wc) __istype((wc), _CTYPE_S) -#define iswupper(wc) __istype((wc), _CTYPE_U) -#define iswxdigit(wc) __isctype((wc), _CTYPE_X) +#define iswalnum(wc) __istype((wc), _CTYPE_A|_CTYPE_D|_CTYPE_WID) +#define iswalpha(wc) __istype((wc), _CTYPE_A|_CTYPE_WID) +#define iswblank(wc) __istype((wc), _CTYPE_B|_CTYPE_WID) +#define iswcntrl(wc) __istype((wc), _CTYPE_C|_CTYPE_WID) +#define iswctype(wc, charclass) __istype((wc), (charclass)|_CTYPE_WID) +#define iswdigit(wc) __isctype((wc), _CTYPE_D|_CTYPE_WID) +#define iswgraph(wc) __istype((wc), _CTYPE_G|_CTYPE_WID) +#define iswlower(wc) __istype((wc), _CTYPE_L|_CTYPE_WID) +#define iswprint(wc) __istype((wc), _CTYPE_R|_CTYPE_WID) +#define iswpunct(wc) __istype((wc), _CTYPE_P|_CTYPE_WID) +#define iswspace(wc) __istype((wc), _CTYPE_S|_CTYPE_WID) +#define iswupper(wc) 
__istype((wc), _CTYPE_U|_CTYPE_WID) +#define iswxdigit(wc) __isctype((wc), _CTYPE_X|_CTYPE_WID) #define towlower(wc) __tolower(wc) #define towupper(wc) __toupper(wc) #if __BSD_VISIBLE -#define iswascii(wc) (((wc) & ~0x7F) == 0) -#define iswhexnumber(wc) __istype((wc), _CTYPE_X) -#define iswideogram(wc) __istype((wc), _CTYPE_I) -#define iswnumber(wc) __istype((wc), _CTYPE_D) -#define iswphonogram(wc) __istype((wc), _CTYPE_Q) -#define iswrune(wc) __istype((wc), 0xFFFFFF00L) -#define iswspecial(wc) __istype((wc), _CTYPE_T) +#define iswascii(wc) ((wc) < 0x80) +#define iswhexnumber(wc) __istype((wc), _CTYPE_X|_CTYPE_WID) +#define iswideogram(wc) __istype((wc), _CTYPE_I|_CTYPE_WID) +#define iswnumber(wc) __istype((wc), _CTYPE_D|_CTYPE_WID) +#define iswphonogram(wc) __istype((wc), _CTYPE_Q|_CTYPE_WID) +#define iswrune(wc) __istype((wc), 0xFFFFFF00L) /* already have _CTYPE_WID */ +#define iswspecial(wc) __istype((wc), _CTYPE_T|_CTYPE_WID) #endif #endif /* _WCTYPE_H_ */ --- isctype.c.old 2007-09-16 22:31:26.000000000 +0400 +++ isctype.c 2007-09-16 22:37:54.000000000 +0400 @@ -168,7 +168,7 @@ isrune(c) int c; { - return (__istype(c, 0xFFFFFF00L)); + return (__istype(c, 0xFFFFFF00L & ~_CTYPE_WID)); } #undef isspace @@ -216,7 +216,7 @@ tolower(c) int c; { - return (__tolower(c)); + return (__toslower(c)); } #undef toupper @@ -224,6 +224,6 @@ toupper(c) int c; { - return (__toupper(c)); + return (__tosupper(c)); } --- iswctype.c.old 2007-09-16 22:31:30.000000000 +0400 +++ iswctype.c 2007-09-16 22:41:39.000000000 +0400 @@ -45,7 +45,7 @@ iswalnum(wc) wint_t wc; { - return (__istype(wc, _CTYPE_A|_CTYPE_D)); + return (__istype(wc, _CTYPE_A|_CTYPE_D|_CTYPE_WID)); } #undef iswalpha @@ -53,7 +53,7 @@ iswalpha(wc) wint_t wc; { - return (__istype(wc, _CTYPE_A)); + return (__istype(wc, _CTYPE_A|_CTYPE_WID))); } #undef iswascii @@ -61,7 +61,7 @@ iswascii(wc) wint_t wc; { - return ((wc & ~0x7F) == 0); + return (wc < 0x80); } #undef iswblank @@ -69,7 +69,7 @@ iswblank(wc) wint_t wc; { - 
return (__istype(wc, _CTYPE_B)); + return (__istype(wc, _CTYPE_B|_CTYPE_WID))); } #undef iswcntrl @@ -77,7 +77,7 @@ iswcntrl(wc) wint_t wc; { - return (__istype(wc, _CTYPE_C)); + return (__istype(wc, _CTYPE_C|_CTYPE_WID))); } #undef iswdigit @@ -85,7 +85,7 @@ iswdigit(wc) wint_t wc; { - return (__isctype(wc, _CTYPE_D)); + return (__isctype(wc, _CTYPE_D|_CTYPE_WID))); } #undef iswgraph @@ -93,7 +93,7 @@ iswgraph(wc) wint_t wc; { - return (__istype(wc, _CTYPE_G)); + return (__istype(wc, _CTYPE_G|_CTYPE_WID))); } #undef iswhexnumber @@ -101,7 +101,7 @@ iswhexnumber(wc) wint_t wc; { - return (__istype(wc, _CTYPE_X)); + return (__istype(wc, _CTYPE_X|_CTYPE_WID))); } #undef iswideogram @@ -109,7 +109,7 @@ iswideogram(wc) wint_t wc; { - return (__istype(wc, _CTYPE_I)); + return (__istype(wc, _CTYPE_I|_CTYPE_WID))); } #undef iswlower @@ -117,7 +117,7 @@ iswlower(wc) wint_t wc; { - return (__istype(wc, _CTYPE_L)); + return (__istype(wc, _CTYPE_L|_CTYPE_WID))); } #undef iswnumber @@ -125,7 +125,7 @@ iswnumber(wc) wint_t wc; { - return (__istype(wc, _CTYPE_D)); + return (__istype(wc, _CTYPE_D|_CTYPE_WID))); } #undef iswphonogram @@ -133,7 +133,7 @@ iswphonogram(wc) wint_t wc; { - return (__istype(wc, _CTYPE_Q)); + return (__istype(wc, _CTYPE_Q|_CTYPE_WID))); } #undef iswprint @@ -141,7 +141,7 @@ iswprint(wc) wint_t wc; { - return (__istype(wc, _CTYPE_R)); + return (__istype(wc, _CTYPE_R|_CTYPE_WID))); } #undef iswpunct @@ -149,7 +149,7 @@ iswpunct(wc) wint_t wc; { - return (__istype(wc, _CTYPE_P)); + return (__istype(wc, _CTYPE_P|_CTYPE_WID))); } #undef iswrune @@ -157,7 +157,7 @@ iswrune(wc) wint_t wc; { - return (__istype(wc, 0xFFFFFF00L)); + return (__istype(wc, 0xFFFFFF00L)); /* already have _CTYPE_WID */ } #undef iswspace @@ -165,7 +165,7 @@ iswspace(wc) wint_t wc; { - return (__istype(wc, _CTYPE_S)); + return (__istype(wc, _CTYPE_S|_CTYPE_WID))); } #undef iswspecial @@ -173,7 +173,7 @@ iswspecial(wc) wint_t wc; { - return (__istype(wc, _CTYPE_T)); + return (__istype(wc, 
_CTYPE_T|_CTYPE_WID))); } #undef iswupper @@ -181,7 +181,7 @@ iswupper(wc) wint_t wc; { - return (__istype(wc, _CTYPE_U)); + return (__istype(wc, _CTYPE_U|_CTYPE_WID))); } #undef iswxdigit @@ -189,7 +189,7 @@ iswxdigit(wc) wint_t wc; { - return (__isctype(wc, _CTYPE_X)); + return (__isctype(wc, _CTYPE_X|_CTYPE_WID))); } #undef towlower --fdj2RfSjLxBAspz7--
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20070916192924.GA12678>