Skip site navigation (1) | Skip section navigation (2)
Date:      Sun, 3 Feb 2002 23:07:58 +1100
From:      "Tim J. Robbins" <tim@robbins.dropbear.id.au>
To:        freebsd-standards@FreeBSD.ORG
Subject:   wc -m option
Message-ID:  <20020203230758.A19532@descent.robbins.dropbear.id.au>

next in thread | raw e-mail | index | archive | help

--envbJBWh7q8WU6mo
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

This patch adds the SUSv2 -m option to wc to handle multibyte characters.
Since libc is missing iswspace() from <wctype.h>, I've had to use isspace(),
which doesn't detect some wide space characters. For example, consider the
Japanese locale given in euc(4):

     /*
      * Code Set 1
      */
[snip]
     SPACE           0x09 - 0x0d 0x20
[snip]
     /*
      * Code Set 2
      */

     SPACE           0xa1a1
[snip]

The space characters in Code Set 1 can be used to separate words, but the
one in Code Set 2 (0xa1a1) will not be detected as a space.


Tim

--envbJBWh7q8WU6mo
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="wc.diff"

Index: wc/wc.1
===================================================================
RCS file: /home/ncvs/src/usr.bin/wc/wc.1,v
retrieving revision 1.10
diff -u -r1.10 wc.1
--- wc/wc.1	2001/08/15 09:09:45	1.10
+++ wc/wc.1	2002/02/03 11:40:21
@@ -43,7 +43,8 @@
 .Nd word, line, and byte count
 .Sh SYNOPSIS
 .Nm
-.Op Fl clw
+.Op Fl c | m
+.Op Fl lw
 .Op Ar
 .Sh DESCRIPTION
 The
@@ -71,6 +72,9 @@
 .It Fl l
 The number of lines in each input file
 is written to the standard output.
+.It Fl m
+The number of characters in each input file
+is written to the standard output.
 .It Fl w
 The number of words in each input file
 is written to the standard output.
@@ -79,7 +83,12 @@
 When an option is specified,
 .Nm
 only reports the information requested by that option.
-The default action is equivalent to specifying all of the flags.
+The default action is equivalent to specifying the
+.Fl c ,
+.Fl l
+and
+.Fl w
+flags.
 .Pp
 If no files are specified, the standard input is used and no
 file name is displayed.
@@ -108,9 +117,11 @@
 The
 .Nm
 function conforms to
-.St -p1003.2 .
+.St -susv2 .
 .Sh HISTORY
 A
 .Nm
 command appeared in
 .At v1 .
+.Sh BUGS
+Some multibyte space characters are not detected correctly.
Index: wc/wc.c
===================================================================
RCS file: /home/ncvs/src/usr.bin/wc/wc.c,v
retrieving revision 1.12
diff -u -r1.12 wc.c
--- wc/wc.c	2001/12/11 22:23:53	1.12
+++ wc/wc.c	2002/02/03 11:40:22
@@ -51,14 +51,22 @@
 #include <ctype.h>
 #include <err.h>
 #include <fcntl.h>
+#include <limits.h>
 #include <locale.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <wchar.h>
+#include <wctype.h>
 
+/* XXX iswspace() is not yet implemented by the C library. */
+#ifndef _WCTYPE_H_
+#define iswspace(c) ((c) <= UCHAR_MAX && isspace(c))
+#endif
+
 u_quad_t tlinect, twordct, tcharct;
-int doline, doword, dochar;
+int doline, doword, dochar, multi;
 
 int cnt __P((const char *));
 void usage __P((void));
@@ -72,17 +80,20 @@
 
 	(void) setlocale(LC_CTYPE, "");
 
-	while ((ch = getopt(argc, argv, "lwc")) != -1)
-		switch((char)ch) {
+	while ((ch = getopt(argc, argv, "clmw")) != -1)
+		switch(ch) {
+		case 'm':
+			multi = 1;
+			/*FALLTHROUGH*/
+		case 'c':
+			dochar = 1;
+			break;
 		case 'l':
 			doline = 1;
 			break;
 		case 'w':
 			doword = 1;
 			break;
-		case 'c':
-			dochar = 1;
-			break;
 		case '?':
 		default:
 			usage();
@@ -128,10 +139,11 @@
 {
 	struct stat sb;
 	u_quad_t linect, wordct, charct;
-	int fd, len;
+	int fd, len, mbpos, n;
 	short gotsp;
 	u_char *p;
-	u_char buf[MAXBSIZE], ch;
+	u_char buf[MAXBSIZE], mbbuf[MB_LEN_MAX+1];
+	wchar_t ch;
 
 	linect = wordct = charct = 0;
 	if (file == NULL) {
@@ -149,7 +161,7 @@
 		 * lines than to get words, since the word count requires some
 		 * logic.
 		 */
-		if (doline) {
+		if (doline && !multi) {
 			while ((len = read(fd, buf, MAXBSIZE))) {
 				if (len == -1) {
 					warn("%s: read", file);
@@ -174,7 +186,7 @@
 		 * If all we need is the number of characters and it's a
 		 * regular or linked file, just stat the puppy.
 		 */
-		if (dochar) {
+		if (dochar && !multi) {
 			if (fstat(fd, &sb)) {
 				warn("%s: fstat", file);
 				(void)close(fd);
@@ -190,23 +202,34 @@
 	}
 
 	/* Do it the hard way... */
-word:	for (gotsp = 1; (len = read(fd, buf, MAXBSIZE));) {
+word:	mbpos = 0;
+	mbtowc(NULL, NULL, 0);
+	for (gotsp = 1; (len = read(fd, buf, MAXBSIZE));) {
 		if (len == -1) {
 			warn("%s: read", file);
 			(void)close(fd);
 			return (1);
 		}
-		/*
-		 * This loses in the presence of multi-byte characters.
-		 * To do it right would require a function to return a
-		 * character while knowing how many bytes it consumed.
-		 */
-		charct += len;
 		for (p = buf; len--;) {
-			ch = *p++;
-			if (ch == '\n')
+			if (!multi)
+				ch = (wchar_t)*p++;
+			else {
+				mbbuf[mbpos++] = *p++;
+				if (mbpos > MB_LEN_MAX) {
+					/* Multibyte sequence too long. */
+					mbtowc(NULL, NULL, 0);
+					mbpos = 0;
+					continue;
+				}
+				if ((n = mbtowc(&ch, mbbuf, mbpos)) <= 0)
+					/* Multibyte sequence not done yet. */
+					continue;
+				mbpos = 0;
+			}
+			charct++;
+			if (ch == (wchar_t)'\n')
 				++linect;
-			if (isspace(ch))
+			if (iswspace(ch))
 				gotsp = 1;
 			else if (gotsp) {
 				gotsp = 0;
@@ -233,6 +256,6 @@
 void
 usage()
 {
-	(void)fprintf(stderr, "usage: wc [-clw] [file ...]\n");
+	(void)fprintf(stderr, "usage: wc [-c|-m] [-lw] [file ...]\n");
 	exit(1);
 }

--envbJBWh7q8WU6mo--

To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-standards" in the body of the message




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20020203230758.A19532>