From owner-freebsd-current@FreeBSD.ORG Thu Jul 31 17:44:13 2003 Return-Path: Delivered-To: freebsd-current@freebsd.org Received: from mx1.FreeBSD.org (mx1.freebsd.org [216.136.204.125]) by hub.freebsd.org (Postfix) with ESMTP id 3DE2F37B401; Thu, 31 Jul 2003 17:44:13 -0700 (PDT) Received: from nagual.pp.ru (pobrecita.freebsd.ru [194.87.13.42]) by mx1.FreeBSD.org (Postfix) with ESMTP id 40BC943F3F; Thu, 31 Jul 2003 17:44:12 -0700 (PDT) (envelope-from ache@pobrecita.freebsd.ru) Received: from pobrecita.freebsd.ru (ache@localhost [127.0.0.1]) by nagual.pp.ru (8.12.9/8.12.9) with ESMTP id h710i8Yk022101; Fri, 1 Aug 2003 04:44:11 +0400 (MSD) (envelope-from ache@pobrecita.freebsd.ru) Received: (from ache@localhost) by pobrecita.freebsd.ru (8.12.9/8.12.9/Submit) id h710i8Ag022100; Fri, 1 Aug 2003 04:44:08 +0400 (MSD) Date: Fri, 1 Aug 2003 04:44:08 +0400 From: Andrey Chernov To: current@freebsd.org Message-ID: <20030801004408.GA22054@nagual.pp.ru> Mime-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline User-Agent: Mutt/1.5.4i cc: i18n@freebsd.org Subject: Serious 'tr' bug, patch for review included X-BeenThere: freebsd-current@freebsd.org X-Mailman-Version: 2.1.1 Precedence: list List-Id: Discussions about the use of FreeBSD-current List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Fri, 01 Aug 2003 00:44:13 -0000 This patch addresses two problems. The first is relatively minor: according to our own manpage, the upper and lower classes must be sorted, but currently they are not. The second is serious: tr '[:lower:]' '[:upper:]' (and vice versa) currently works only if the upper and lower classes have exactly the same number of elements. When that is not true, as in many ISO8859-x locales, which have a larger number of lowercase letters, tr may do nasty things. 
The patch is complex, because the whole conversion string needs to be processed each time an l->u or u->l conversion occurs, not a single character at a time, as in the previous variant. See this page http://www.opengroup.org/onlinepubs/007908799/xcu/tr.html for a detailed description of the desired tr behaviour in such cases. Please test this patch on your system & locale and report any strange things to me. diff -u ./extern.h /usr/src/usr.bin/tr/extern.h --- ./extern.h Fri Jun 14 19:56:52 2002 +++ /usr/src/usr.bin/tr/extern.h Fri Aug 1 04:19:36 2003 @@ -40,7 +40,8 @@ typedef struct { enum { STRING1, STRING2 } which; - enum { EOS, INFINITE, NORMAL, RANGE, SEQUENCE, SET } state; + enum { EOS, INFINITE, NORMAL, RANGE, SEQUENCE, + SET, SET_UPPER, SET_LOWER } state; int cnt; /* character count */ int lastch; /* last character */ int equiv[NCHARS]; /* equivalence set */ @@ -49,3 +50,5 @@ } STR; int next(STR *); +int charcoll(const void *, const void *); + diff -u ./str.c /usr/src/usr.bin/tr/str.c --- ./str.c Fri Jul 5 13:28:13 2002 +++ /usr/src/usr.bin/tr/str.c Fri Aug 1 04:22:11 2003 @@ -106,6 +106,8 @@ } return (1); case SET: + case SET_UPPER: + case SET_LOWER: if ((s->lastch = s->set[s->cnt++]) == OOBCH) { s->state = NORMAL; return (next(s)); @@ -194,7 +196,7 @@ { int cnt, (*func)(int); CLASS *cp, tmp; - int *p; + int *p, n; tmp.name = s->str; if ((cp = (CLASS *)bsearch(&tmp, classes, sizeof(classes) / @@ -208,10 +210,18 @@ if ((func)(cnt)) *p++ = cnt; *p = OOBCH; + n = p - cp->set; s->cnt = 0; - s->state = SET; s->set = cp->set; + if (strcmp(s->str, "upper") == 0) + s->state = SET_UPPER; + else if (strcmp(s->str, "lower") == 0) { + s->state = SET_LOWER; + } else + s->state = SET; + if ((s->state == SET_LOWER || s->state == SET_UPPER) && n > 1) + mergesort(s->set, n, sizeof(*(s->set)), charcoll); } static int diff -u ./tr.c /usr/src/usr.bin/tr/tr.c --- ./tr.c Thu Sep 5 03:29:07 2002 +++ /usr/src/usr.bin/tr/tr.c Fri Aug 1 04:32:01 2003 @@ -101,8 +101,9 @@ STR s1 = { STRING1, NORMAL, 0, 
OOBCH, { 0, OOBCH }, NULL, NULL }; STR s2 = { STRING2, NORMAL, 0, OOBCH, { 0, OOBCH }, NULL, NULL }; -static int charcoll(const void *, const void *); static void setup(int *, char *, STR *, int, int); +static void process_upper(int); +static void process_lower(int); static void usage(void); int @@ -110,7 +111,7 @@ { static int collorder[NCHARS], tmpmap[NCHARS]; int ch, cnt, lastch, *p; - int Cflag, cflag, dflag, sflag, isstring2; + int Cflag, cflag, dflag, sflag, isstring2, do_upper, do_lower; (void)setlocale(LC_ALL, ""); @@ -224,19 +225,67 @@ if (!next(&s2)) errx(1, "empty string2"); - ch = s2.lastch; + do_upper = do_lower = 0; /* If string2 runs out of characters, use the last one specified. */ - if (sflag) - while (next(&s1)) { - string1[s1.lastch] = ch = s2.lastch; - string2[ch] = 1; - (void)next(&s2); - } - else - while (next(&s1)) { - string1[s1.lastch] = ch = s2.lastch; - (void)next(&s2); + while (next(&s1)) { + if (s1.state == SET_LOWER && + s2.state == SET_UPPER) { + if (do_lower) { + process_lower(sflag); + do_lower = 0; + } + do_upper = 1; + } else if (s1.state == SET_UPPER && + s2.state == SET_LOWER) { + if (do_upper) { + process_upper(sflag); + do_upper = 0; + } + do_lower = 1; + } else { + if (do_lower) { + /* Skip until aligned */ + if (s1.state == SET_UPPER) { + do { + if (!next(&s1)) + goto endloop; + } while (s1.state == SET_UPPER); + } else if (s2.state == SET_LOWER) { + do { + if (!next(&s2)) + break; + } while (s2.state == SET_LOWER); + } + process_lower(sflag); + do_lower = 0; + } else if (do_upper) { + /* Skip until aligned */ + if (s1.state == SET_LOWER) { + do { + if (!next(&s1)) + goto endloop; + } while (s1.state == SET_LOWER); + } else if (s2.state == SET_UPPER) { + do { + if (!next(&s2)) + break; + } while (s2.state == SET_UPPER); + } + process_upper(sflag); + do_upper = 0; + } + string1[s1.lastch] = s2.lastch; + if (sflag) + string2[s2.lastch] = 1; } + (void)next(&s2); + } +endloop: + if (do_lower) + process_lower(sflag); + else if 
(do_upper) + process_upper(sflag); + /* End of upper & lower special processing */ if (cflag || Cflag) { s2.str = argv[1]; @@ -294,15 +343,55 @@ string[cnt] = !string[cnt] && ISCHAR(cnt); } -static int +int charcoll(const void *a, const void *b) { - char sa[2], sb[2]; + static char sa[2], sb[2]; sa[0] = *(const int *)a; sb[0] = *(const int *)b; - sa[1] = sb[1] = '\0'; return (strcoll(sa, sb)); +} + + +/* + * For -s result will contain only those characters defined + * as the second characters in each of the toupper or tolower + * pairs. + */ + +static void +process_upper(int sflag) +{ + int cnt, ch; + + for (cnt = 0; cnt < NCHARS; cnt++) { + ch = string1[cnt]; + if (ch == OOBCH) /* [Cc]flag */ + ch = cnt; + if (islower(ch)) { + string1[cnt] = ch = toupper(ch); + if (sflag && isupper(ch)) + string2[ch] = 1; + } + } +} + +static void +process_lower(int sflag) +{ + int cnt, ch; + + for (cnt = 0; cnt < NCHARS; cnt++) { + ch = string1[cnt]; + if (ch == OOBCH) /* [Cc]flag */ + ch = cnt; + if (isupper(ch)) { + string1[cnt] = ch = tolower(ch); + if (sflag && islower(ch)) + string2[ch] = 1; + } + } } static void