From owner-freebsd-current@FreeBSD.ORG Sun Sep 16 19:40:05 2007 Return-Path: Delivered-To: current@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id C216616A41B for ; Sun, 16 Sep 2007 19:40:05 +0000 (UTC) (envelope-from ache@nagual.pp.ru) Received: from nagual.pp.ru (nagual.pp.ru [194.87.13.69]) by mx1.freebsd.org (Postfix) with ESMTP id D2D2013C465 for ; Sun, 16 Sep 2007 19:40:04 +0000 (UTC) (envelope-from ache@nagual.pp.ru) Received: from nagual.pp.ru (ache@localhost [127.0.0.1]) by nagual.pp.ru (8.14.1/8.14.1) with ESMTP id l8GJTPZa013006; Sun, 16 Sep 2007 23:29:25 +0400 (MSD) (envelope-from ache@nagual.pp.ru) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=nagual.pp.ru; s=default; t=1189970965; bh=xuhy8++5NY6qoZt8MIxebItAWrZBnE7mFCv2Rka on1g=; l=10963; h=Date:From:To:Cc:Subject:Message-ID: Mail-Followup-To:MIME-Version:Content-Type:Content-Disposition: User-Agent; b=qaNBqyFWQDZ8EJq15uj0yVc1T3hZ3Z4QfqeySKzoJe/TqTeDy6J/ a8tzhTCfrGFiQqSP+fL3E/nYCuA1WJ4Cl8CXYQ3lk3M8FRtVJDLA21coK7m7h7Ervdc CqnOUlJTpt8JtWuXGATshJreMEoa6im//AghpjzW0jJb83ZzCzAs= Received: (from ache@localhost) by nagual.pp.ru (8.14.1/8.14.1/Submit) id l8GJTOF9013005; Sun, 16 Sep 2007 23:29:24 +0400 (MSD) (envelope-from ache) Date: Sun, 16 Sep 2007 23:29:24 +0400 From: Andrey Chernov To: current@freebsd.org, i18n@freebsd.org Message-ID: <20070916192924.GA12678@nagual.pp.ru> Mail-Followup-To: Andrey Chernov , current@freebsd.org, i18n@freebsd.org, perky@FreeBSD.org, petr.hroudny@gmail.com MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="fdj2RfSjLxBAspz7" Content-Disposition: inline User-Agent: Mutt/1.5.16 (2007-06-09) Cc: perky@freebsd.org, petr.hroudny@gmail.com Subject: Ctype patch for review X-BeenThere: freebsd-current@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: Discussions about the use of FreeBSD-current List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , 
X-List-Received-Date: Sun, 16 Sep 2007 19:40:05 -0000 --fdj2RfSjLxBAspz7 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline The problem is: currently our single-byte ctype functions are broken for wide character locales in the argument range >= 0x80 - they may return false positives. For example, for a UTF-8 locale we currently have: iswspace(0xA0)==1 and isspace(0xA0)==1 (because iswspace() and isspace() are the same code) but must have isspace(0xA0)==0 (because in wide character locales no single-byte character exists at 0xA0 or anywhere else in the range 0x80..0xff; such locales keep only ASCII in the single-byte range because our internal wchar_t representation is UCS-4). The attached patch addresses this issue and also fixes iswascii() (currently iswascii() is broken for arguments > 0xFF). This patch is 100% binary compatible with old binaries; their (broken) behaviour is not changed. I would like to hear some comments. -- http://ache.pp.ru/ --fdj2RfSjLxBAspz7 Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename="ctype.patch" --- _ctype.h.old 2007-09-16 21:13:59.000000000 +0400 +++ _ctype.h 2007-09-16 23:00:38.000000000 +0400 @@ -63,6 +63,7 @@ #define _CTYPE_I 0x00080000L /* Ideogram */ #define _CTYPE_T 0x00100000L /* Special */ #define _CTYPE_Q 0x00200000L /* Phonogram */ +#define _CTYPE_WID 0x10000000L /* wide character function */ #define _CTYPE_SW0 0x20000000L /* 0 width character */ #define _CTYPE_SW1 0x40000000L /* 1 width character */ #define _CTYPE_SW2 0x80000000L /* 2 width character */ @@ -87,6 +88,8 @@ #define __inline #endif +extern int __mb_cur_max; + /* * Use inline functions if we are allowed to and the compiler supports them. */ @@ -98,8 +101,11 @@ static __inline int __maskrune(__ct_rune_t _c, unsigned long _f) { - return ((_c < 0 || _c >= _CACHED_RUNES) ? ___runetype(_c) : + return __mb_cur_max > 1 && !(_f & _CTYPE_WID) && (_c >= 0x80) ? 0 : + ((_c < 0 || _c >= _CACHED_RUNES) ? 
___runetype(_c) : _CurrentRuneLocale->__runetype[_c]) & _f; + /* We never set _CTYPE_WID in the locale data, */ + /* so can skip ... & (_f & ~_CTYPE_WID). */ } static __inline int @@ -111,8 +117,11 @@ static __inline int __isctype(__ct_rune_t _c, unsigned long _f) { - return (_c < 0 || _c >= _CACHED_RUNES) ? 0 : + return __mb_cur_max > 1 && !(_f & _CTYPE_WID) && (_c >= 0x80) ? 0 : + (_c < 0 || _c >= _CACHED_RUNES) ? 0 : !!(_DefaultRuneLocale.__runetype[_c] & _f); + /* We never set _CTYPE_WID in the locale data, */ + /* so can skip ... & (_f & ~_CTYPE_WID). */ } static __inline __ct_rune_t @@ -129,6 +138,22 @@ _CurrentRuneLocale->__maplower[_c]; } +static __inline __ct_rune_t +__tosupper(__ct_rune_t _c) +{ + return __mb_cur_max > 1 && (_c >= 0x80) ? _c : + (_c < 0 || _c >= _CACHED_RUNES) ? ___toupper(_c) : + _CurrentRuneLocale->__mapupper[_c]; +} + +static __inline __ct_rune_t +__toslower(__ct_rune_t _c) +{ + return __mb_cur_max > 1 && (_c >= 0x80) ? _c : + (_c < 0 || _c >= _CACHED_RUNES) ? ___tolower(_c) : + _CurrentRuneLocale->__maplower[_c]; +} + static __inline int __wcwidth(__ct_rune_t _c) { @@ -150,6 +175,8 @@ int __isctype(__ct_rune_t, unsigned long); __ct_rune_t __toupper(__ct_rune_t); __ct_rune_t __tolower(__ct_rune_t); +__ct_rune_t __tosupper(__ct_rune_t); +__ct_rune_t __toslower(__ct_rune_t); int __wcwidth(__ct_rune_t); __END_DECLS #endif /* using inlines */ --- ctype.h.old 2007-09-16 22:03:55.000000000 +0400 +++ ctype.h 2007-09-16 22:56:10.000000000 +0400 @@ -97,8 +97,8 @@ #define isspace(c) __istype((c), _CTYPE_S) #define isupper(c) __istype((c), _CTYPE_U) #define isxdigit(c) __isctype((c), _CTYPE_X) /* ANSI -- locale independent */ -#define tolower(c) __tolower(c) -#define toupper(c) __toupper(c) +#define tolower(c) __toslower(c) +#define toupper(c) __tosupper(c) #if __XSI_VISIBLE /* @@ -112,8 +112,8 @@ * * XXX isascii() and toascii() should similarly be undocumented. 
*/ -#define _tolower(c) __tolower(c) -#define _toupper(c) __toupper(c) +#define _tolower(c) __toslower(c) +#define _toupper(c) __tosupper(c) #define isascii(c) (((c) & ~0x7F) == 0) #define toascii(c) ((c) & 0x7F) #endif @@ -128,7 +128,7 @@ #define isideogram(c) __istype((c), _CTYPE_I) #define isnumber(c) __istype((c), _CTYPE_D) #define isphonogram(c) __istype((c), _CTYPE_Q) -#define isrune(c) __istype((c), 0xFFFFFF00L) +#define isrune(c) __istype((c), 0xFFFFFF00L & ~_CTYPE_WID) #define isspecial(c) __istype((c), _CTYPE_T) #endif --- wctype.h.old 2007-09-16 21:59:37.000000000 +0400 +++ wctype.h 2007-09-16 22:56:44.000000000 +0400 @@ -89,30 +89,30 @@ #endif __END_DECLS -#define iswalnum(wc) __istype((wc), _CTYPE_A|_CTYPE_D) -#define iswalpha(wc) __istype((wc), _CTYPE_A) -#define iswblank(wc) __istype((wc), _CTYPE_B) -#define iswcntrl(wc) __istype((wc), _CTYPE_C) -#define iswctype(wc, charclass) __istype((wc), (charclass)) -#define iswdigit(wc) __isctype((wc), _CTYPE_D) -#define iswgraph(wc) __istype((wc), _CTYPE_G) -#define iswlower(wc) __istype((wc), _CTYPE_L) -#define iswprint(wc) __istype((wc), _CTYPE_R) -#define iswpunct(wc) __istype((wc), _CTYPE_P) -#define iswspace(wc) __istype((wc), _CTYPE_S) -#define iswupper(wc) __istype((wc), _CTYPE_U) -#define iswxdigit(wc) __isctype((wc), _CTYPE_X) +#define iswalnum(wc) __istype((wc), _CTYPE_A|_CTYPE_D|_CTYPE_WID) +#define iswalpha(wc) __istype((wc), _CTYPE_A|_CTYPE_WID) +#define iswblank(wc) __istype((wc), _CTYPE_B|_CTYPE_WID) +#define iswcntrl(wc) __istype((wc), _CTYPE_C|_CTYPE_WID) +#define iswctype(wc, charclass) __istype((wc), (charclass)|_CTYPE_WID) +#define iswdigit(wc) __isctype((wc), _CTYPE_D|_CTYPE_WID) +#define iswgraph(wc) __istype((wc), _CTYPE_G|_CTYPE_WID) +#define iswlower(wc) __istype((wc), _CTYPE_L|_CTYPE_WID) +#define iswprint(wc) __istype((wc), _CTYPE_R|_CTYPE_WID) +#define iswpunct(wc) __istype((wc), _CTYPE_P|_CTYPE_WID) +#define iswspace(wc) __istype((wc), _CTYPE_S|_CTYPE_WID) +#define iswupper(wc) 
__istype((wc), _CTYPE_U|_CTYPE_WID) +#define iswxdigit(wc) __isctype((wc), _CTYPE_X|_CTYPE_WID) #define towlower(wc) __tolower(wc) #define towupper(wc) __toupper(wc) #if __BSD_VISIBLE -#define iswascii(wc) (((wc) & ~0x7F) == 0) -#define iswhexnumber(wc) __istype((wc), _CTYPE_X) -#define iswideogram(wc) __istype((wc), _CTYPE_I) -#define iswnumber(wc) __istype((wc), _CTYPE_D) -#define iswphonogram(wc) __istype((wc), _CTYPE_Q) -#define iswrune(wc) __istype((wc), 0xFFFFFF00L) -#define iswspecial(wc) __istype((wc), _CTYPE_T) +#define iswascii(wc) ((wc) < 0x80) +#define iswhexnumber(wc) __istype((wc), _CTYPE_X|_CTYPE_WID) +#define iswideogram(wc) __istype((wc), _CTYPE_I|_CTYPE_WID) +#define iswnumber(wc) __istype((wc), _CTYPE_D|_CTYPE_WID) +#define iswphonogram(wc) __istype((wc), _CTYPE_Q|_CTYPE_WID) +#define iswrune(wc) __istype((wc), 0xFFFFFF00L) /* already have _CTYPE_WID */ +#define iswspecial(wc) __istype((wc), _CTYPE_T|_CTYPE_WID) #endif #endif /* _WCTYPE_H_ */ --- isctype.c.old 2007-09-16 22:31:26.000000000 +0400 +++ isctype.c 2007-09-16 22:37:54.000000000 +0400 @@ -168,7 +168,7 @@ isrune(c) int c; { - return (__istype(c, 0xFFFFFF00L)); + return (__istype(c, 0xFFFFFF00L & ~_CTYPE_WID)); } #undef isspace @@ -216,7 +216,7 @@ tolower(c) int c; { - return (__tolower(c)); + return (__toslower(c)); } #undef toupper @@ -224,6 +224,6 @@ toupper(c) int c; { - return (__toupper(c)); + return (__tosupper(c)); } --- iswctype.c.old 2007-09-16 22:31:30.000000000 +0400 +++ iswctype.c 2007-09-16 22:41:39.000000000 +0400 @@ -45,7 +45,7 @@ iswalnum(wc) wint_t wc; { - return (__istype(wc, _CTYPE_A|_CTYPE_D)); + return (__istype(wc, _CTYPE_A|_CTYPE_D|_CTYPE_WID)); } #undef iswalpha @@ -53,7 +53,7 @@ iswalpha(wc) wint_t wc; { - return (__istype(wc, _CTYPE_A)); + return (__istype(wc, _CTYPE_A|_CTYPE_WID))); } #undef iswascii @@ -61,7 +61,7 @@ iswascii(wc) wint_t wc; { - return ((wc & ~0x7F) == 0); + return (wc < 0x80); } #undef iswblank @@ -69,7 +69,7 @@ iswblank(wc) wint_t wc; { - 
return (__istype(wc, _CTYPE_B)); + return (__istype(wc, _CTYPE_B|_CTYPE_WID))); } #undef iswcntrl @@ -77,7 +77,7 @@ iswcntrl(wc) wint_t wc; { - return (__istype(wc, _CTYPE_C)); + return (__istype(wc, _CTYPE_C|_CTYPE_WID))); } #undef iswdigit @@ -85,7 +85,7 @@ iswdigit(wc) wint_t wc; { - return (__isctype(wc, _CTYPE_D)); + return (__isctype(wc, _CTYPE_D|_CTYPE_WID))); } #undef iswgraph @@ -93,7 +93,7 @@ iswgraph(wc) wint_t wc; { - return (__istype(wc, _CTYPE_G)); + return (__istype(wc, _CTYPE_G|_CTYPE_WID))); } #undef iswhexnumber @@ -101,7 +101,7 @@ iswhexnumber(wc) wint_t wc; { - return (__istype(wc, _CTYPE_X)); + return (__istype(wc, _CTYPE_X|_CTYPE_WID))); } #undef iswideogram @@ -109,7 +109,7 @@ iswideogram(wc) wint_t wc; { - return (__istype(wc, _CTYPE_I)); + return (__istype(wc, _CTYPE_I|_CTYPE_WID))); } #undef iswlower @@ -117,7 +117,7 @@ iswlower(wc) wint_t wc; { - return (__istype(wc, _CTYPE_L)); + return (__istype(wc, _CTYPE_L|_CTYPE_WID))); } #undef iswnumber @@ -125,7 +125,7 @@ iswnumber(wc) wint_t wc; { - return (__istype(wc, _CTYPE_D)); + return (__istype(wc, _CTYPE_D|_CTYPE_WID))); } #undef iswphonogram @@ -133,7 +133,7 @@ iswphonogram(wc) wint_t wc; { - return (__istype(wc, _CTYPE_Q)); + return (__istype(wc, _CTYPE_Q|_CTYPE_WID))); } #undef iswprint @@ -141,7 +141,7 @@ iswprint(wc) wint_t wc; { - return (__istype(wc, _CTYPE_R)); + return (__istype(wc, _CTYPE_R|_CTYPE_WID))); } #undef iswpunct @@ -149,7 +149,7 @@ iswpunct(wc) wint_t wc; { - return (__istype(wc, _CTYPE_P)); + return (__istype(wc, _CTYPE_P|_CTYPE_WID))); } #undef iswrune @@ -157,7 +157,7 @@ iswrune(wc) wint_t wc; { - return (__istype(wc, 0xFFFFFF00L)); + return (__istype(wc, 0xFFFFFF00L)); /* already have _CTYPE_WID */ } #undef iswspace @@ -165,7 +165,7 @@ iswspace(wc) wint_t wc; { - return (__istype(wc, _CTYPE_S)); + return (__istype(wc, _CTYPE_S|_CTYPE_WID))); } #undef iswspecial @@ -173,7 +173,7 @@ iswspecial(wc) wint_t wc; { - return (__istype(wc, _CTYPE_T)); + return (__istype(wc, 
_CTYPE_T|_CTYPE_WID))); } #undef iswupper @@ -181,7 +181,7 @@ iswupper(wc) wint_t wc; { - return (__istype(wc, _CTYPE_U)); + return (__istype(wc, _CTYPE_U|_CTYPE_WID))); } #undef iswxdigit @@ -189,7 +189,7 @@ iswxdigit(wc) wint_t wc; { - return (__isctype(wc, _CTYPE_X)); + return (__isctype(wc, _CTYPE_X|_CTYPE_WID))); } #undef towlower --fdj2RfSjLxBAspz7--