Skip site navigation (1) Skip section navigation (2)
Date:      Sat, 9 Feb 2002 02:43:19 +1100
From:      Tim Robbins <tim@robbins.dropbear.id.au>
To:        freebsd-standards@FreeBSD.ORG
Subject:   tr equivalence class support
Message-ID:  <20020209024319.A42708@descent.robbins.dropbear.id.au>

next in thread | raw e-mail | index | archive | help
This patch adds equivalence class support to tr, and an example to the
manual page to show what it can be used for.

It could be better implemented if some of the data structures or functions
from libc/locale/collate.c were exported; right now it relies on the fact
that strxfrm() writes the primary collation value to the output string,
which may not be true forever.


Index: tr/extern.h
===================================================================
RCS file: /home/ncvs/src/usr.bin/tr/extern.h,v
retrieving revision 1.2
diff -u -r1.2 extern.h
--- tr/extern.h	1997/08/18 07:24:54	1.2
+++ tr/extern.h	2002/02/08 15:39:15
@@ -33,18 +33,18 @@
  *	@(#)extern.h	8.1 (Berkeley) 6/6/93
  */
 
+#include <limits.h>
+#define	NCHARS	(UCHAR_MAX + 1)		/* Number of possible characters. */
+#define	OOBCH	(UCHAR_MAX + 1)		/* Out of band character value. */
+
 typedef struct {
 	enum { STRING1, STRING2 } which;
 	enum { EOS, INFINITE, NORMAL, RANGE, SEQUENCE, SET } state;
 	int	 cnt;			/* character count */
 	int	 lastch;		/* last character */
-	int	equiv[2];		/* equivalence set */
+	int	equiv[NCHARS];		/* equivalence set */
 	int	*set;			/* set of characters */
 	char	*str;			/* user's string */
 } STR;
-
-#include <limits.h>
-#define	NCHARS	(UCHAR_MAX + 1)		/* Number of possible characters. */
-#define	OOBCH	(UCHAR_MAX + 1)		/* Out of band character value. */
 
 int	 next __P((STR *));
Index: tr/str.c
===================================================================
RCS file: /home/ncvs/src/usr.bin/tr/str.c,v
retrieving revision 1.11
diff -u -r1.11 str.c
--- tr/str.c	2001/12/11 23:36:25	1.11
+++ tr/str.c	2002/02/08 15:39:15
@@ -54,6 +54,7 @@
 static int	backslash __P((STR *));
 static int	bracket __P((STR *));
 static int	c_class __P((const void *, const void *));
+static u_long	collval __P((char));
 static void	genclass __P((STR *));
 static void	genequiv __P((STR *));
 static int	genrange __P((STR *));
@@ -217,19 +218,43 @@
 }
 
 /*
- * English doesn't have any equivalence classes, so for now
- * we just syntax check and grab the character.
+ * Get the primary collation value for a character. This is a hack,
+ * needed until libc can do this for us.
  */
+static u_long
+collval(c)
+	char c;				/* character to evaluate */
+{
+	char buf[2], xbuf[32];		/* input string; strxfrm() output */
+	u_long v;
+	int i, n;
+
+	buf[0] = c;
+	buf[1] = '\0';
+	if ((n = strxfrm(xbuf, buf, sizeof(xbuf))) >= (int)sizeof(xbuf))
+		n = sizeof(xbuf);	/* truncated; xbuf is indeterminate */
+	for (v = 0, i = 0; i < n; i++) {	/* first byte ends up highest */
+		v <<= 8;		/* high bytes drop if n > sizeof(v) */
+		v |= (unsigned char)xbuf[i];
+	}
+
+	return v;
+}
+
 static void
 genequiv(s)
 	STR *s;
 {
+	u_long cvc;
+	int i, *p;
+	char c;
+
 	if (*s->str == '\\') {
-		s->equiv[0] = backslash(s);
+		c = backslash(s);
 		if (*s->str != '=')
 			errx(1, "misplaced equivalence equals sign");
 	} else {
-		s->equiv[0] = s->str[0];
+		c = s->str[0];
 		if (s->str[1] != '=')
 			errx(1, "misplaced equivalence equals sign");
 	}
@@ -237,6 +262,12 @@
 	s->cnt = 0;
 	s->state = SET;
 	s->set = s->equiv;
+
+	cvc = collval(c);
+	for (p = s->equiv, i = 1; i < NCHARS; i++)
+		if (collval((char)i) == cvc)
+			*p++ = i;
+	*p = OOBCH;
 }
 
 static int
Index: tr/tr.1
===================================================================
RCS file: /home/ncvs/src/usr.bin/tr/tr.1,v
retrieving revision 1.14
diff -u -r1.14 tr.1
--- tr/tr.1	2001/08/15 09:09:44	1.14
+++ tr/tr.1	2002/02/08 15:39:16
@@ -203,10 +203,6 @@
 Represents all characters or collating (sorting) elements belonging to
 the same equivalence class as
 .Ar equiv .
-If
-there is a secondary ordering within the equivalence class, the characters
-are ordered in ascending sequence.
-Otherwise, they are ordered after their encoded values.
 An example of an equivalence class might be ``c'' and ``ch'' in Spanish;
 English has no equivalence classes.
 .It [#*n]
@@ -245,6 +241,10 @@
 Strip out non-printable characters from file1.
 .Pp
 .D1 Li "tr -cd \*q[:print:]\*q < file1"
+.Pp
+Strip diacritical marks from accented variants of the ``e'' character.
+.Pp
+.D1 Li "tr \*q[=e=]\*q \*qe\*q < file1"
 .Sh COMPATIBILITY
 System V has historically implemented character ranges using the syntax
 ``[c-c]'' instead of the ``c-c'' used by historic
Index: tr/tr.c
===================================================================
RCS file: /home/ncvs/src/usr.bin/tr/tr.c,v
retrieving revision 1.9
diff -u -r1.9 tr.c
--- tr/tr.c	2001/12/11 23:36:25	1.9
+++ tr/tr.c	2002/02/08 15:39:17
@@ -105,6 +105,7 @@
 	int ch, cnt, lastch, *p;
 	int cflag, dflag, sflag, isstring2;
 
+	(void) setlocale(LC_COLLATE, "");
 	(void) setlocale(LC_CTYPE, "");
 
 	cflag = dflag = sflag = 0;

To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-standards" in the body of the message




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20020209024319.A42708>