Date: Sat, 9 Feb 2002 02:43:19 +1100 From: Tim Robbins <tim@robbins.dropbear.id.au> To: freebsd-standards@FreeBSD.ORG Subject: tr equivalence class support Message-ID: <20020209024319.A42708@descent.robbins.dropbear.id.au>
next in thread | raw e-mail | index | archive | help
This patch adds equivalence class support to tr, and an example to the manual page to show what it can be used for. It could be better implemented if some of the data structures or functions from libc/locale/collate.c were exported; right now it relies on the fact that strxfrm() writes the primary collation value to the output string, which may not be true forever. Index: tr/extern.h =================================================================== RCS file: /home/ncvs/src/usr.bin/tr/extern.h,v retrieving revision 1.2 diff -u -r1.2 extern.h --- tr/extern.h 1997/08/18 07:24:54 1.2 +++ tr/extern.h 2002/02/08 15:39:15 @@ -33,18 +33,18 @@ * @(#)extern.h 8.1 (Berkeley) 6/6/93 */ +#include <limits.h> +#define NCHARS (UCHAR_MAX + 1) /* Number of possible characters. */ +#define OOBCH (UCHAR_MAX + 1) /* Out of band character value. */ + typedef struct { enum { STRING1, STRING2 } which; enum { EOS, INFINITE, NORMAL, RANGE, SEQUENCE, SET } state; int cnt; /* character count */ int lastch; /* last character */ - int equiv[2]; /* equivalence set */ + int equiv[NCHARS]; /* equivalence set */ int *set; /* set of characters */ char *str; /* user's string */ } STR; - -#include <limits.h> -#define NCHARS (UCHAR_MAX + 1) /* Number of possible characters. */ -#define OOBCH (UCHAR_MAX + 1) /* Out of band character value. */ int next __P((STR *)); Index: tr/str.c =================================================================== RCS file: /home/ncvs/src/usr.bin/tr/str.c,v retrieving revision 1.11 diff -u -r1.11 str.c --- tr/str.c 2001/12/11 23:36:25 1.11 +++ tr/str.c 2002/02/08 15:39:15 @@ -54,6 +54,7 @@ static int backslash __P((STR *)); static int bracket __P((STR *)); static int c_class __P((const void *, const void *)); +static u_long collval __P((char)); static void genclass __P((STR *)); static void genequiv __P((STR *)); static int genrange __P((STR *)); @@ -217,19 +218,43 @@ } /* - * English doesn't have any equivalence classes, so for now - * we just syntax check and grab the character. + * Get the primary collation value for a character. This is a hack, + * needed until libc can do this for us. */ +static u_long +collval(c) + char c; +{ + char buf[2], xbuf[32]; + u_long v; + int i, n; + + buf[0] = c; + buf[1] = '\0'; + if ((n = strxfrm(xbuf, buf, sizeof(xbuf))) >= (int)sizeof(xbuf)) + n = sizeof(xbuf); + for (v = 0, i = 0; i < n; i++) { + v <<= 8; + v |= (unsigned char)xbuf[i]; + } + + return v; +} + static void genequiv(s) STR *s; { + u_long cvc; + int i, *p; + char c; + if (*s->str == '\\') { - s->equiv[0] = backslash(s); + c = backslash(s); if (*s->str != '=') errx(1, "misplaced equivalence equals sign"); } else { - s->equiv[0] = s->str[0]; + c = s->str[0]; if (s->str[1] != '=') errx(1, "misplaced equivalence equals sign"); } @@ -237,6 +262,12 @@ s->cnt = 0; s->state = SET; s->set = s->equiv; + + cvc = collval(c); + for (p = s->equiv, i = 1; i < NCHARS; i++) + if (collval((char)i) == cvc) + *p++ = i; + *p = OOBCH; } static int Index: tr/tr.1 =================================================================== RCS file: /home/ncvs/src/usr.bin/tr/tr.1,v retrieving revision 1.14 diff -u -r1.14 tr.1 --- tr/tr.1 2001/08/15 09:09:44 1.14 +++ tr/tr.1 2002/02/08 15:39:16 @@ -203,10 +203,6 @@ Represents all characters or collating (sorting) elements belonging to the same equivalence class as .Ar equiv . -If -there is a secondary ordering within the equivalence class, the characters -are ordered in ascending sequence. -Otherwise, they are ordered after their encoded values. An example of an equivalence class might be ``c'' and ``ch'' in Spanish; English has no equivalence classes. .It [#*n] @@ -245,6 +241,10 @@ Strip out non-printable characters from file1. .Pp .D1 Li "tr -cd \*q[:print:]\*q < file1" +.Pp +Strip diacritical marks from accented variants of the ``e'' character. +.Pp +.D1 Li "tr \*q[=e=]\*q \*qe\*q < file1" .Sh COMPATIBILITY System V has historically implemented character ranges using the syntax ``[c-c]'' instead of the ``c-c'' used by historic Index: tr/tr.c =================================================================== RCS file: /home/ncvs/src/usr.bin/tr/tr.c,v retrieving revision 1.9 diff -u -r1.9 tr.c --- tr/tr.c 2001/12/11 23:36:25 1.9 +++ tr/tr.c 2002/02/08 15:39:17 @@ -105,6 +105,7 @@ int ch, cnt, lastch, *p; int cflag, dflag, sflag, isstring2; + (void) setlocale(LC_COLLATE, ""); (void) setlocale(LC_CTYPE, ""); cflag = dflag = sflag = 0; To Unsubscribe: send mail to majordomo@FreeBSD.org with "unsubscribe freebsd-standards" in the body of the message
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20020209024319.A42708>