Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 20 Nov 2002 14:54:12 +0200
From:      Ruslan Ermilov <ru@FreeBSD.org>
To:        Tim Robbins <tjr@FreeBSD.org>
Cc:        "Andrey A. Chernov" <ache@nagual.pp.ru>, "David O'Brien" <obrien@FreeBSD.org>, current@FreeBSD.org
Subject:   Re: awk(1) is locale unaware (was: Re: buildworld breakage during "make depend" at usr.bin/kdump)
Message-ID:  <20021120125412.GB48212@sunbay.com>
In-Reply-To: <20021120142753.A11292@dilbert.robbins.dropbear.id.au>
References:  <20011101220836.A76061@nagual.pp.ru> <91693.1004642592@axl.seasidesoftware.co.za> <20011101114213.F79520@dragon.nuxi.com> <20011102044411.A81844@nagual.pp.ru> <20011101175808.A82798@dragon.nuxi.com> <20021119125202.GA37987@sunbay.com> <20021120013838.GB19233@nagual.pp.ru> <20021120142753.A11292@dilbert.robbins.dropbear.id.au>

next in thread | previous in thread | raw e-mail | index | archive | help

--WplhKdTI2c8ulnbP
Content-Type: multipart/mixed; boundary="+pHx0qQiF2pBVqBT"
Content-Disposition: inline


--+pHx0qQiF2pBVqBT
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
Content-Transfer-Encoding: quoted-printable

On Wed, Nov 20, 2002 at 02:27:53PM +1100, Tim Robbins wrote:
> On Wed, Nov 20, 2002 at 04:38:38AM +0300, Andrey A. Chernov wrote:
>
> > On Tue, Nov 19, 2002 at 14:52:02 +0200, Ruslan Ermilov wrote:
> > > It seems that this patch has never been committed.  This is a critical
> > > bug that should be fixed before 5.0-RELEASE is out.
> >
> > I agree. There is no locale yet and I never see that patch.
>
> This patch seems to work, I used the logic from regcomp.c in libc.
> Long lines make it ugly, but it was like that when I got here ;)

> Index: src/usr.bin/awk/Makefile
> =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
> RCS file: /x/freebsd/src/usr.bin/awk/Makefile,v
> retrieving revision 1.9
> diff -u -r1.9 Makefile
> --- src/usr.bin/awk/Makefile	10 May 2002 20:36:21 -0000	1.9
> +++ src/usr.bin/awk/Makefile	20 Nov 2002 03:13:50 -0000
> @@ -6,7 +6,7 @@
>  PROG=3D	nawk
>  SRCS=3D	awkgram.y b.c lex.c lib.c main.c parse.c proctab.c run.c tran.c =
ytab.h
> =20
> -CFLAGS+=3D -I. -I${AWKSRC}
> +CFLAGS+=3D -I. -I${AWKSRC} -I${.CURDIR}/../../lib/libc/locale
> =20
Ouch.

>  DPADD=3D	${LIBM}
>  LDADD=3D	-lm
> Index: src/contrib/one-true-awk/b.c
> =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
> RCS file: /x/freebsd/src/contrib/one-true-awk/b.c,v
> retrieving revision 1.1.1.2
> diff -u -r1.1.1.2 b.c
> --- src/contrib/one-true-awk/b.c	19 Feb 2002 09:35:24 -0000	1.1.1.2
> +++ src/contrib/one-true-awk/b.c	20 Nov 2002 03:16:10 -0000
> @@ -32,6 +32,7 @@
>  #include <stdlib.h>
>  #include "awk.h"
>  #include "ytab.h"
> +#include "collate.h"
> =20
>  #define	HAT	(NCHARS-2)	/* matches ^ in regular expr */
>  				/* NCHARS is 2**n */
> @@ -284,7 +285,7 @@
> =20
>  char *cclenter(char *argp)	/* add a character class */
>  {
> -	int i, c, c2;
> +	int i, j, c, c2;
>  	uschar *p =3D (uschar *) argp;
>  	uschar *op, *bp;
>  	static uschar *buf =3D 0;
> @@ -308,12 +309,24 @@
>  					i--;
>  					continue;
>  				}
> -				while (c < c2) {
> -					if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, 0=
))
> -						FATAL("out of space for character class [%.10s...] 2", p);
> -					*bp++ =3D ++c;
> -					i++;
> -				}
> +				if (__collate_load_error) {
> +					while (c < c2) {
> +						if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, =
0))
> +							FATAL("out of space for character class [%.10s...] 2", p);
> +						*bp++ =3D ++c;
> +						i++;
> +					}
> +				} else {
> +					for (j =3D CHAR_MIN; j <=3D CHAR_MAX; j++) {
> +						if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, =
0))
> +							FATAL("out of space for character class [%.10s...] 2", p);
> +						if (__collate_range_cmp(c, j) <=3D 0
> +						    && __collate_range_cmp(j, c2) <=3D 0) {
> +							*bp++ =3D j;
> +							i++;
> +						}
> +					}
> +                                }
>  				continue;
>  			}
>  		}

There are a number of problems here:

1.  The "empty range" check preceding this block should be made
    locale-aware too.

2.  CHAR_MAX evaluates to 127 here.

Here's my version of the above fix plus [[:class:]] fixes Andrey mentioned.
I gave it only light testing.

The collate_range_cmp() was stolen from the old awk(1).


Cheers,
-- 
Ruslan Ermilov		Sysadmin and DBA,
ru@sunbay.com		Sunbay Software AG,
ru@FreeBSD.org		FreeBSD committer,
+380.652.512.251	Simferopol, Ukraine

http://www.FreeBSD.org	The Power To Serve
http://www.oracle.com	Enabling The Information Age

--+pHx0qQiF2pBVqBT
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename=p
Content-Transfer-Encoding: quoted-printable

Index: b.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /home/ncvs/src/contrib/one-true-awk/b.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 b.c
--- b.c	19 Feb 2002 09:35:24 -0000	1.1.1.2
+++ b.c	20 Nov 2002 12:51:10 -0000
@@ -282,9 +282,25 @@ int quoted(char **pp)	/* pick up next th
 	return c;
 }
=20
+static int collate_range_cmp (a, b)
+	int a, b;
+{
+	int r;
+	static char s[2][2];
+
+	if ((unsigned char)a =3D=3D (unsigned char)b)
+		return 0;
+	s[0][0] =3D a;
+	s[1][0] =3D b;
+	if ((r =3D strcoll(s[0], s[1])) =3D=3D 0)
+		r =3D (unsigned char)a - (unsigned char)b;
+	return r;
+}
+
 char *cclenter(char *argp)	/* add a character class */
 {
 	int i, c, c2;
+	int j;
 	uschar *p =3D (uschar *) argp;
 	uschar *op, *bp;
 	static uschar *buf =3D 0;
@@ -303,15 +319,18 @@ char *cclenter(char *argp)	/* add a char
 				c2 =3D *p++;
 				if (c2 =3D=3D '\\')
 					c2 =3D quoted((char **) &p);
-				if (c > c2) {	/* empty; ignore */
+				if (collate_range_cmp(c, c2) > 0) {	/* empty; ignore */
 					bp--;
 					i--;
 					continue;
 				}
-				while (c < c2) {
+				for (j =3D 0; j < NCHARS; j++) {
+					if ((collate_range_cmp(c, j) > 0) ||
+					    collate_range_cmp(j, c2) > 0)
+						continue;
 					if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, 0))
 						FATAL("out of space for character class [%.10s...] 2", p);
-					*bp++ =3D ++c;
+					*bp++ =3D j;
 					i++;
 				}
 				continue;
@@ -696,20 +715,20 @@ Node *unary(Node *np)
 struct charclass {
 	const char *cc_name;
 	int cc_namelen;
-	const char *cc_expand;
+	int (*cc_func)(int);
 } charclasses[] =3D {
-	{ "alnum",	5,	"0-9A-Za-z" },
-	{ "alpha",	5,	"A-Za-z" },
-	{ "blank",	5,	" \t" },
-	{ "cntrl",	5,	"\000-\037\177" },
-	{ "digit",	5,	"0-9" },
-	{ "graph",	5,	"\041-\176" },
-	{ "lower",	5,	"a-z" },
-	{ "print",	5,	" \041-\176" },
-	{ "punct",	5,	"\041-\057\072-\100\133-\140\173-\176" },
-	{ "space",	5,	" \f\n\r\t\v" },
-	{ "upper",	5,	"A-Z" },
-	{ "xdigit",	6,	"0-9A-Fa-f" },
+	{ "alnum",	5,	isalnum },
+	{ "alpha",	5,	isalpha },
+	{ "blank",	5,	isblank },
+	{ "cntrl",	5,	iscntrl },
+	{ "digit",	5,	isdigit },
+	{ "graph",	5,	isgraph },
+	{ "lower",	5,	islower },
+	{ "print",	5,	isprint },
+	{ "punct",	5,	ispunct },
+	{ "space",	5,	isspace },
+	{ "upper",	5,	isupper },
+	{ "xdigit",	6,	isxdigit },
 	{ NULL,		0,	NULL },
 };
=20
@@ -722,7 +741,7 @@ int relex(void)		/* lexical analyzer for
 	static int bufsz =3D 100;
 	uschar *bp;
 	struct charclass *cc;
-	const uschar *p;
+	int i;
=20
 	switch (c =3D *prestr++) {
 	case '|': return OR;
@@ -771,8 +790,14 @@ int relex(void)		/* lexical analyzer for
 				if (cc->cc_name !=3D NULL && prestr[1 + cc->cc_namelen] =3D=3D ':' &&
 				    prestr[2 + cc->cc_namelen] =3D=3D ']') {
 					prestr +=3D cc->cc_namelen + 3;
-					for (p =3D (const uschar *) cc->cc_expand; *p; p++)
-						*bp++ =3D *p;
+					for (i =3D 0; i < NCHARS; i++) {
+						if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, 0))
+						    FATAL("out of space for reg expr %.10s...", lastre);
+						if (cc->cc_func(i)) {
+							*bp++ =3D i;
+							n++;
+						}
+					}
 				} else
 					*bp++ =3D c;
 			} else if (c =3D=3D '\0') {
Index: main.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /home/ncvs/src/contrib/one-true-awk/main.c,v
retrieving revision 1.1.1.3
diff -u -p -r1.1.1.3 main.c
--- main.c	16 Mar 2002 16:50:56 -0000	1.1.1.3
+++ main.c	20 Nov 2002 12:51:10 -0000
@@ -27,6 +27,7 @@ char	*version =3D "version 20020210";
 #define DEBUG
 #include <stdio.h>
 #include <ctype.h>
+#include <locale.h>
 #include <stdlib.h>
 #include <string.h>
 #include <signal.h>
@@ -54,6 +55,7 @@ int main(int argc, char *argv[])
 {
 	char *fs =3D NULL;
=20
+	setlocale(LC_ALL, "");
 	cmdname =3D argv[0];
 	if (argc =3D=3D 1) {
 		fprintf(stderr, "Usage: %s [-f programfile | 'program'] [-Ffieldsep] [-v=
 var=3Dvalue] [files]\n", cmdname);
Index: run.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /home/ncvs/src/contrib/one-true-awk/run.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 run.c
--- run.c	19 Feb 2002 09:35:25 -0000	1.1.1.2
+++ run.c	20 Nov 2002 12:51:10 -0000
@@ -1504,11 +1504,11 @@ Cell *bltin(Node **a, int n)	/* builtin=20
 		if (t =3D=3D FTOUPPER) {
 			for (p =3D buf; *p; p++)
 				if (islower((uschar) *p))
-					*p =3D toupper(*p);
+					*p =3D toupper((uschar)*p);
 		} else {
 			for (p =3D buf; *p; p++)
 				if (isupper((uschar) *p))
-					*p =3D tolower(*p);
+					*p =3D tolower((uschar)*p);
 		}
 		tempfree(x);
 		x =3D gettemp();

--+pHx0qQiF2pBVqBT--

--WplhKdTI2c8ulnbP
Content-Type: application/pgp-signature
Content-Disposition: inline

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.1 (FreeBSD)

iD8DBQE924X0Ukv4P6juNwoRAqnOAJ9MTa20RannZfhJTgGArLI02uQ44gCfQVol
E0j0tCVYkgoDjwhyreSmeIY=
=Wp/D
-----END PGP SIGNATURE-----

--WplhKdTI2c8ulnbP--

To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-current" in the body of the message




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20021120125412.GB48212>