From owner-svn-src-head@freebsd.org Mon Jul 11 21:23:51 2016 Return-Path: Delivered-To: svn-src-head@mailman.ysv.freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:1900:2254:206a::19:1]) by mailman.ysv.freebsd.org (Postfix) with ESMTP id BD83BB92715; Mon, 11 Jul 2016 21:23:51 +0000 (UTC) (envelope-from ache@FreeBSD.org) Received: from repo.freebsd.org (repo.freebsd.org [IPv6:2610:1c1:1:6068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mx1.freebsd.org (Postfix) with ESMTPS id 912111BBC; Mon, 11 Jul 2016 21:23:51 +0000 (UTC) (envelope-from ache@FreeBSD.org) Received: from repo.freebsd.org ([127.0.1.37]) by repo.freebsd.org (8.15.2/8.15.2) with ESMTP id u6BLNoqr084239; Mon, 11 Jul 2016 21:23:50 GMT (envelope-from ache@FreeBSD.org) Received: (from ache@localhost) by repo.freebsd.org (8.15.2/8.15.2/Submit) id u6BLNoT7084236; Mon, 11 Jul 2016 21:23:50 GMT (envelope-from ache@FreeBSD.org) Message-Id: <201607112123.u6BLNoT7084236@repo.freebsd.org> X-Authentication-Warning: repo.freebsd.org: ache set sender to ache@FreeBSD.org using -f From: "Andrey A. Chernov" Date: Mon, 11 Jul 2016 21:23:50 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r302594 - head/usr.bin/tr X-SVN-Group: head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-head@freebsd.org X-Mailman-Version: 2.1.22 Precedence: list List-Id: SVN commit messages for the src tree for head/-current List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 11 Jul 2016 21:23:51 -0000 Author: ache Date: Mon Jul 11 21:23:50 2016 New Revision: 302594 URL: https://svnweb.freebsd.org/changeset/base/302594 Log: 1) Following r302512 (remove collation support for [a-z]-ranges in libc) remove collation support for a-z ranges here too. 
It was implemented for single byte locales only in any case. 2) Reduce [Cc]flag loop to WCHAR_MAX; WINT_MAX here includes WEOF, which is not a character. 3) Optimize [Cc]flag case: don't repeatedly add the last character of string2 to squeeze cset when string2 reaches its EOS state. 4) Reflect in the manpage that [=equiv=] is implemented for single byte locales only. Modified: head/usr.bin/tr/str.c head/usr.bin/tr/tr.1 head/usr.bin/tr/tr.c Modified: head/usr.bin/tr/str.c ============================================================================== --- head/usr.bin/tr/str.c Mon Jul 11 20:15:46 2016 (r302593) +++ head/usr.bin/tr/str.c Mon Jul 11 21:23:50 2016 (r302594) @@ -53,7 +53,7 @@ static int backslash(STR *, int *); static int bracket(STR *); static void genclass(STR *); static void genequiv(STR *); -static int genrange(STR *, int); +static int genrange(STR *); static void genseq(STR *); wint_t @@ -93,7 +93,7 @@ next(STR *s) } /* We can start a range at any time. */ - if (s->str[0] == '-' && genrange(s, is_octal)) + if (s->str[0] == '-' && genrange(s)) return (next(s)); return (1); case RANGE: @@ -237,18 +237,16 @@ genequiv(STR *s) } static int -genrange(STR *s, int was_octal) +genrange(STR *s) { - int stopval, octal; + int stopval; char *savestart; - int n, cnt, *p; size_t clen; wchar_t wc; - octal = 0; savestart = s->str; if (*++s->str == '\\') - stopval = backslash(s, &octal); + stopval = backslash(s, NULL); else { clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL); if (clen == (size_t)-1 || clen == (size_t)-2) @@ -256,37 +254,13 @@ genrange(STR *s, int was_octal) stopval = wc; s->str += clen; } - /* - * XXX Characters are not ordered according to collating sequence in - * multibyte locales. 
- */ - if (octal || was_octal || MB_CUR_MAX > 1) { - if (stopval < s->lastch) { - s->str = savestart; - return (0); - } - s->cnt = stopval - s->lastch + 1; - s->state = RANGE; - --s->lastch; - return (1); - } - if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) { + if (stopval < s->lastch) { s->str = savestart; return (0); } - if ((s->set = p = malloc((NCHARS_SB + 1) * sizeof(int))) == NULL) - err(1, "genrange() malloc"); - for (cnt = 0; cnt < NCHARS_SB; cnt++) - if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 && - charcoll((const void *)&cnt, (const void *)&stopval) <= 0) - *p++ = cnt; - *p = OOBCH; - n = p - s->set; - - s->cnt = 0; - s->state = SET; - if (n > 1) - mergesort(s->set, n, sizeof(*(s->set)), charcoll); + s->cnt = stopval - s->lastch + 1; + s->state = RANGE; + --s->lastch; return (1); } Modified: head/usr.bin/tr/tr.1 ============================================================================== --- head/usr.bin/tr/tr.1 Mon Jul 11 20:15:46 2016 (r302593) +++ head/usr.bin/tr/tr.1 Mon Jul 11 21:23:50 2016 (r302594) @@ -164,14 +164,6 @@ as defined by the collation sequence. If either or both of the range endpoints are octal sequences, it represents the range of specific coded values between the range endpoints, inclusive. -.Pp -.Bf Em -See the -.Sx COMPATIBILITY -section below for an important note regarding -differences in the way the current -implementation interprets range expressions differently from -previous implementations. .Ef .It [:class:] Represents all characters belonging to the defined character class. 
@@ -307,22 +299,16 @@ Remove diacritical marks from all accent .Pp .Dl "tr \*q[=e=]\*q \*qe\*q" .Sh COMPATIBILITY -Previous .Fx implementations of .Nm did not order characters in range expressions according to the current -locale's collation order, making it possible to convert unaccented Latin -characters (esp.\& as found in English text) from upper to lower case using +locale's collation order, making it possible to convert accented Latin +characters from upper to lower case using the traditional .Ux idiom of .Dq Li "tr A-Z a-z" . -Since -.Nm -now obeys the locale's collation order, this idiom may not produce -correct results when there is not a 1:1 mapping between lower and -upper case, or when the order of characters within the two cases differs. As noted in the .Sx EXAMPLES section above, the character class expressions @@ -334,6 +320,9 @@ should be used instead of explicit chara and .Dq Li A-Z . .Pp +.Dq Li [=equiv=] +expression is implemented for single byte locales only. +.Pp System V has historically implemented character ranges using the syntax .Dq Li [c-c] instead of the Modified: head/usr.bin/tr/tr.c ============================================================================== --- head/usr.bin/tr/tr.c Mon Jul 11 20:15:46 2016 (r302593) +++ head/usr.bin/tr/tr.c Mon Jul 11 21:23:50 2016 (r302594) @@ -68,10 +68,8 @@ static void usage(void); int main(int argc, char **argv) { - static int carray[NCHARS_SB]; struct cmap *map; struct cset *delete, *squeeze; - int n, *p; int Cflag, cflag, dflag, sflag, isstring2; wint_t ch, cnt, lastch; @@ -254,7 +252,7 @@ main(int argc, char **argv) (void)next(&s2); } endloop: - if (cflag || (Cflag && MB_CUR_MAX > 1)) { + if (cflag || Cflag) { /* * This is somewhat tricky: since the character set is * potentially huge, we need to avoid allocating a map @@ -268,14 +266,15 @@ endloop: */ s2.str = argv[1]; s2.state = NORMAL; - for (cnt = 0; cnt < WINT_MAX; cnt++) { + for (cnt = 0; cnt <= WCHAR_MAX; cnt++) { if (Cflag && 
!iswrune(cnt)) continue; if (cmap_lookup(map, cnt) == OOBCH) { - if (next(&s2)) + if (next(&s2)) { cmap_add(map, cnt, s2.lastch); - if (sflag) - cset_add(squeeze, s2.lastch); + if (sflag) + cset_add(squeeze, s2.lastch); + } } else cmap_add(map, cnt, cnt); if ((s2.state == EOS || s2.state == INFINITE) && @@ -283,30 +282,6 @@ endloop: break; } cmap_default(map, s2.lastch); - } else if (Cflag) { - for (p = carray, cnt = 0; cnt < NCHARS_SB; cnt++) { - if (cmap_lookup(map, cnt) == OOBCH && iswrune(cnt)) - *p++ = cnt; - else - cmap_add(map, cnt, cnt); - } - n = p - carray; - if (Cflag && n > 1) - (void)mergesort(carray, n, sizeof(*carray), charcoll); - - s2.str = argv[1]; - s2.state = NORMAL; - for (cnt = 0; cnt < n; cnt++) { - (void)next(&s2); - cmap_add(map, carray[cnt], s2.lastch); - /* - * Chars taken from s2 can be different this time - * due to lack of complex upper/lower processing, - * so fill string2 again to not miss some. - */ - if (sflag) - cset_add(squeeze, s2.lastch); - } } cset_cache(squeeze); @@ -351,16 +326,6 @@ setup(char *arg, STR *str, int cflag, in return (cs); } -int -charcoll(const void *a, const void *b) -{ - static char sa[2], sb[2]; - - sa[0] = *(const int *)a; - sb[0] = *(const int *)b; - return (strcoll(sa, sb)); -} - static void usage(void) {