Date: Wed, 20 Nov 2002 14:54:12 +0200 From: Ruslan Ermilov <ru@FreeBSD.org> To: Tim Robbins <tjr@FreeBSD.org> Cc: "Andrey A. Chernov" <ache@nagual.pp.ru>, "David O'Brien" <obrien@FreeBSD.org>, current@FreeBSD.org Subject: Re: awk(1) is locale unaware (was: Re: buildworld breakage during "make depend" at usr.bin/kdump) Message-ID: <20021120125412.GB48212@sunbay.com> In-Reply-To: <20021120142753.A11292@dilbert.robbins.dropbear.id.au> References: <20011101220836.A76061@nagual.pp.ru> <91693.1004642592@axl.seasidesoftware.co.za> <20011101114213.F79520@dragon.nuxi.com> <20011102044411.A81844@nagual.pp.ru> <20011101175808.A82798@dragon.nuxi.com> <20021119125202.GA37987@sunbay.com> <20021120013838.GB19233@nagual.pp.ru> <20021120142753.A11292@dilbert.robbins.dropbear.id.au>
next in thread | previous in thread | raw e-mail | index | archive | help
--WplhKdTI2c8ulnbP Content-Type: multipart/mixed; boundary="+pHx0qQiF2pBVqBT" Content-Disposition: inline --+pHx0qQiF2pBVqBT Content-Type: text/plain; charset=us-ascii Content-Disposition: inline Content-Transfer-Encoding: quoted-printable On Wed, Nov 20, 2002 at 02:27:53PM +1100, Tim Robbins wrote: > On Wed, Nov 20, 2002 at 04:38:38AM +0300, Andrey A. Chernov wrote: >=20 > > On Tue, Nov 19, 2002 at 14:52:02 +0200, Ruslan Ermilov wrote: > > > It seems that this patch has never been committed. This is a critical > > > bug that should be fixed before 5.0-RELEASE is out. > >=20 > > I agree. There is no locale yet and I never see that patch. >=20 > This patch seems to work, I used the logic from regcomp.c in libc. > Long lines make it ugly, but it was like that when I got here ;) > Index: src/usr.bin/awk/Makefile > =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D > RCS file: /x/freebsd/src/usr.bin/awk/Makefile,v > retrieving revision 1.9 > diff -u -r1.9 Makefile > --- src/usr.bin/awk/Makefile 10 May 2002 20:36:21 -0000 1.9 > +++ src/usr.bin/awk/Makefile 20 Nov 2002 03:13:50 -0000 > @@ -6,7 +6,7 @@ > PROG=3D nawk > SRCS=3D awkgram.y b.c lex.c lib.c main.c parse.c proctab.c run.c tran.c = ytab.h > =20 > -CFLAGS+=3D -I. -I${AWKSRC} > +CFLAGS+=3D -I. -I${AWKSRC} -I${.CURDIR}/../../lib/libc/locale > =20 Ouch. > DPADD=3D ${LIBM} > LDADD=3D -lm > Index: src/contrib/one-true-awk/b.c > =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D > RCS file: /x/freebsd/src/contrib/one-true-awk/b.c,v > retrieving revision 1.1.1.2 > diff -u -r1.1.1.2 b.c > --- src/contrib/one-true-awk/b.c 19 Feb 2002 09:35:24 -0000 1.1.1.2 > +++ src/contrib/one-true-awk/b.c 20 Nov 2002 03:16:10 -0000 > @@ -32,6 +32,7 @@ > #include <stdlib.h> > #include "awk.h" > #include "ytab.h" > +#include "collate.h" > =20 > #define HAT (NCHARS-2) /* matches ^ in regular expr */ > /* NCHARS is 2**n */ > @@ -284,7 +285,7 @@ > =20 > char *cclenter(char *argp) /* add a character class */ > { > - int i, c, c2; > + int i, j, c, c2; > uschar *p =3D (uschar *) argp; > uschar *op, *bp; > static uschar *buf =3D 0; > @@ -308,12 +309,24 @@ > i--; > continue; > } > - while (c < c2) { > - if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, 0= )) > - FATAL("out of space for character class [%.10s...] 2", p); > - *bp++ =3D ++c; > - i++; > - } > + if (__collate_load_error) { > + while (c < c2) { > + if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, = 0)) > + FATAL("out of space for character class [%.10s...] 2", p); > + *bp++ =3D ++c; > + i++; > + } > + } else { > + for (j =3D CHAR_MIN; j <=3D CHAR_MAX; j++) { > + if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, = 0)) > + FATAL("out of space for character class [%.10s...] 2", p); > + if (__collate_range_cmp(c, j) <=3D 0 > + && __collate_range_cmp(j, c2) <=3D 0) { > + *bp++ =3D j; > + i++; > + } > + } > + } > continue; > } > } There are a number of problems here: 1. The "empty range" check preceding this block should be made locale-aware too. 2. CHAR_MAX evaluates to 127 here. Here's my version of the above fix plus [[:class:]] fixes Andrey mentioned. I gave it only light testing. The collate_range_cmp() was stolen from the old awk(1). Cheers, --=20 Ruslan Ermilov Sysadmin and DBA, ru@sunbay.com Sunbay Software AG, ru@FreeBSD.org FreeBSD committer, +380.652.512.251 Simferopol, Ukraine http://www.FreeBSD.org The Power To Serve http://www.oracle.com Enabling The Information Age --+pHx0qQiF2pBVqBT Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename=p Content-Transfer-Encoding: quoted-printable Index: b.c =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D RCS file: /home/ncvs/src/contrib/one-true-awk/b.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 b.c --- b.c 19 Feb 2002 09:35:24 -0000 1.1.1.2 +++ b.c 20 Nov 2002 12:51:10 -0000 @@ -282,9 +282,25 @@ int quoted(char **pp) /* pick up next th return c; } =20 +static int collate_range_cmp (a, b) + int a, b; +{ + int r; + static char s[2][2]; + + if ((unsigned char)a =3D=3D (unsigned char)b) + return 0; + s[0][0] =3D a; + s[1][0] =3D b; + if ((r =3D strcoll(s[0], s[1])) =3D=3D 0) + r =3D (unsigned char)a - (unsigned char)b; + return r; +} + char *cclenter(char *argp) /* add a character class */ { int i, c, c2; + int j; uschar *p =3D (uschar *) argp; uschar *op, *bp; static uschar *buf =3D 0; @@ -303,15 +319,18 @@ char *cclenter(char *argp) /* add a char c2 =3D *p++; if (c2 =3D=3D '\\') c2 =3D quoted((char **) &p); - if (c > c2) { /* empty; ignore */ + if (collate_range_cmp(c, c2) > 0) { /* empty; ignore */ bp--; i--; continue; } - while (c < c2) { + for (j =3D 0; j < NCHARS; j++) { + if ((collate_range_cmp(c, j) > 0) || + collate_range_cmp(j, c2) > 0) + continue; if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, 0)) FATAL("out of space for character class [%.10s...] 2", p); - *bp++ =3D ++c; + *bp++ =3D j; i++; } continue; @@ -696,20 +715,20 @@ Node *unary(Node *np) struct charclass { const char *cc_name; int cc_namelen; - const char *cc_expand; + int (*cc_func)(int); } charclasses[] =3D { - { "alnum", 5, "0-9A-Za-z" }, - { "alpha", 5, "A-Za-z" }, - { "blank", 5, " \t" }, - { "cntrl", 5, "\000-\037\177" }, - { "digit", 5, "0-9" }, - { "graph", 5, "\041-\176" }, - { "lower", 5, "a-z" }, - { "print", 5, " \041-\176" }, - { "punct", 5, "\041-\057\072-\100\133-\140\173-\176" }, - { "space", 5, " \f\n\r\t\v" }, - { "upper", 5, "A-Z" }, - { "xdigit", 6, "0-9A-Fa-f" }, + { "alnum", 5, isalnum }, + { "alpha", 5, isalpha }, + { "blank", 5, isblank }, + { "cntrl", 5, iscntrl }, + { "digit", 5, isdigit }, + { "graph", 5, isgraph }, + { "lower", 5, islower }, + { "print", 5, isprint }, + { "punct", 5, ispunct }, + { "space", 5, isspace }, + { "upper", 5, isupper }, + { "xdigit", 6, isxdigit }, { NULL, 0, NULL }, }; =20 @@ -722,7 +741,7 @@ int relex(void) /* lexical analyzer for static int bufsz =3D 100; uschar *bp; struct charclass *cc; - const uschar *p; + int i; =20 switch (c =3D *prestr++) { case '|': return OR; @@ -771,8 +790,14 @@ int relex(void) /* lexical analyzer for if (cc->cc_name !=3D NULL && prestr[1 + cc->cc_namelen] =3D=3D ':' && prestr[2 + cc->cc_namelen] =3D=3D ']') { prestr +=3D cc->cc_namelen + 3; - for (p =3D (const uschar *) cc->cc_expand; *p; p++) - *bp++ =3D *p; + for (i =3D 0; i < NCHARS; i++) { + if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, 0)) + FATAL("out of space for reg expr %.10s...", lastre); + if (cc->cc_func(i)) { + *bp++ =3D i; + n++; + } + } } else *bp++ =3D c; } else if (c =3D=3D '\0') { Index: main.c =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D RCS file: /home/ncvs/src/contrib/one-true-awk/main.c,v retrieving revision 1.1.1.3 diff -u -p -r1.1.1.3 main.c --- main.c 16 Mar 2002 16:50:56 -0000 1.1.1.3 +++ main.c 20 Nov 2002 12:51:10 -0000 @@ -27,6 +27,7 @@ char *version =3D "version 20020210"; #define DEBUG #include <stdio.h> #include <ctype.h> +#include <locale.h> #include <stdlib.h> #include <string.h> #include <signal.h> @@ -54,6 +55,7 @@ int main(int argc, char *argv[]) { char *fs =3D NULL; =20 + setlocale(LC_ALL, ""); cmdname =3D argv[0]; if (argc =3D=3D 1) { fprintf(stderr, "Usage: %s [-f programfile | 'program'] [-Ffieldsep] [-v= var=3Dvalue] [files]\n", cmdname); Index: run.c =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D RCS file: /home/ncvs/src/contrib/one-true-awk/run.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 run.c --- run.c 19 Feb 2002 09:35:25 -0000 1.1.1.2 +++ run.c 20 Nov 2002 12:51:10 -0000 @@ -1504,11 +1504,11 @@ Cell *bltin(Node **a, int n) /* builtin=20 if (t =3D=3D FTOUPPER) { for (p =3D buf; *p; p++) if (islower((uschar) *p)) - *p =3D toupper(*p); + *p =3D toupper((uschar)*p); } else { for (p =3D buf; *p; p++) if (isupper((uschar) *p)) - *p =3D tolower(*p); + *p =3D tolower((uschar)*p); } tempfree(x); x =3D gettemp(); --+pHx0qQiF2pBVqBT-- --WplhKdTI2c8ulnbP Content-Type: application/pgp-signature Content-Disposition: inline -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.2.1 (FreeBSD) iD8DBQE924X0Ukv4P6juNwoRAqnOAJ9MTa20RannZfhJTgGArLI02uQ44gCfQVol E0j0tCVYkgoDjwhyreSmeIY= =Wp/D -----END PGP SIGNATURE----- --WplhKdTI2c8ulnbP-- To Unsubscribe: send mail to majordomo@FreeBSD.org with "unsubscribe freebsd-current" in the body of the message
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20021120125412.GB48212>