From owner-freebsd-standards Sun Feb 3 4: 9:31 2002 Delivered-To: freebsd-standards@freebsd.org Received: from descent.robbins.dropbear.id.au (060.a.011.mel.iprimus.net.au [210.50.216.60]) by hub.freebsd.org (Postfix) with ESMTP id A2BBB37B402 for ; Sun, 3 Feb 2002 04:08:30 -0800 (PST) Received: (from tim@localhost) by descent.robbins.dropbear.id.au (8.11.6/8.11.6) id g13C82I19702 for freebsd-standards@FreeBSD.ORG; Sun, 3 Feb 2002 23:08:02 +1100 (EST) (envelope-from tim) Date: Sun, 3 Feb 2002 23:07:58 +1100 From: "Tim J. Robbins" To: freebsd-standards@FreeBSD.ORG Subject: wc -m option Message-ID: <20020203230758.A19532@descent.robbins.dropbear.id.au> Mime-Version: 1.0 Content-Type: multipart/mixed; boundary="envbJBWh7q8WU6mo" Content-Disposition: inline User-Agent: Mutt/1.2.5.1i Sender: owner-freebsd-standards@FreeBSD.ORG Precedence: bulk List-ID: List-Archive: (Web Archive) List-Help: (List Instructions) List-Subscribe: List-Unsubscribe: X-Loop: FreeBSD.ORG --envbJBWh7q8WU6mo Content-Type: text/plain; charset=us-ascii Content-Disposition: inline This patch adds the SUSv2 -m option to wc to handle multibyte characters. Since libc is missing iswspace() from <wctype.h>, I've had to use isspace(), which doesn't detect some wide space characters. For example, the Japanese locale given in euc(4): /* * Code Set 1 */ [snip] SPACE 0x09 - 0x0d 0x20 [snip] /* * Code Set 2 */ SPACE 0xa1a1 [snip] The space characters in Code Set 1 can be used to separate words, but the one in Code Set 2 will not be detected. 
Tim --envbJBWh7q8WU6mo Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename="wc.diff" Index: wc/wc.1 =================================================================== RCS file: /home/ncvs/src/usr.bin/wc/wc.1,v retrieving revision 1.10 diff -u -r1.10 wc.1 --- wc/wc.1 2001/08/15 09:09:45 1.10 +++ wc/wc.1 2002/02/03 11:40:21 @@ -43,7 +43,8 @@ .Nd word, line, and byte count .Sh SYNOPSIS .Nm -.Op Fl clw +.Op Fl c | m +.Op Fl lw .Op Ar .Sh DESCRIPTION The @@ -71,6 +72,9 @@ .It Fl l The number of lines in each input file is written to the standard output. +.It Fl m +The number of characters in each input file +is written to the standard output. .It Fl w The number of words in each input file is written to the standard output. @@ -79,7 +83,12 @@ When an option is specified, .Nm only reports the information requested by that option. -The default action is equivalent to specifying all of the flags. +The default action is equivalent to specifying the +.Fl c , +.Fl l +and +.Fl w +flags. .Pp If no files are specified, the standard input is used and no file name is displayed. @@ -108,9 +117,11 @@ The .Nm function conforms to -.St -p1003.2 . +.St -susv2 . .Sh HISTORY A .Nm command appeared in .At v1 . +.Sh BUGS +Some multibyte space characters are not detected correctly. Index: wc/wc.c =================================================================== RCS file: /home/ncvs/src/usr.bin/wc/wc.c,v retrieving revision 1.12 diff -u -r1.12 wc.c --- wc/wc.c 2001/12/11 22:23:53 1.12 +++ wc/wc.c 2002/02/03 11:40:22 @@ -51,14 +51,22 @@ #include #include #include +#include #include #include #include #include #include +#include +#include +/* XXX iswspace() is not yet implemented by the C library. 
*/ +#ifndef _WCTYPE_H_ +#define iswspace(c) ((c) <= UCHAR_MAX && isspace(c)) +#endif + u_quad_t tlinect, twordct, tcharct; -int doline, doword, dochar; +int doline, doword, dochar, multi; int cnt __P((const char *)); void usage __P((void)); @@ -72,17 +80,20 @@ (void) setlocale(LC_CTYPE, ""); - while ((ch = getopt(argc, argv, "lwc")) != -1) - switch((char)ch) { + while ((ch = getopt(argc, argv, "clmw")) != -1) + switch(ch) { + case 'm': + multi = 1; + /*FALLTHROUGH*/ + case 'c': + dochar = 1; + break; case 'l': doline = 1; break; case 'w': doword = 1; break; - case 'c': - dochar = 1; - break; case '?': default: usage(); @@ -128,10 +139,11 @@ { struct stat sb; u_quad_t linect, wordct, charct; - int fd, len; + int fd, len, mbpos, n; short gotsp; u_char *p; - u_char buf[MAXBSIZE], ch; + u_char buf[MAXBSIZE], mbbuf[MB_LEN_MAX+1]; + wchar_t ch; linect = wordct = charct = 0; if (file == NULL) { @@ -149,7 +161,7 @@ * lines than to get words, since the word count requires some * logic. */ - if (doline) { + if (doline && !multi) { while ((len = read(fd, buf, MAXBSIZE))) { if (len == -1) { warn("%s: read", file); @@ -174,7 +186,7 @@ * If all we need is the number of characters and it's a * regular or linked file, just stat the puppy. */ - if (dochar) { + if (dochar && !multi) { if (fstat(fd, &sb)) { warn("%s: fstat", file); (void)close(fd); @@ -190,23 +202,34 @@ } /* Do it the hard way... */ -word: for (gotsp = 1; (len = read(fd, buf, MAXBSIZE));) { +word: mbpos = 0; + mbtowc(NULL, NULL, 0); + for (gotsp = 1; (len = read(fd, buf, MAXBSIZE));) { if (len == -1) { warn("%s: read", file); (void)close(fd); return (1); } - /* - * This loses in the presence of multi-byte characters. - * To do it right would require a function to return a - * character while knowing how many bytes it consumed. 
- */ - charct += len; for (p = buf; len--;) { - ch = *p++; - if (ch == '\n') + if (!multi) + ch = (wchar_t)*p++; + else { + mbbuf[mbpos++] = *p++; + if (mbpos > MB_LEN_MAX) { + /* Multibyte sequence too long. */ + mbtowc(NULL, NULL, 0); + mbpos = 0; + continue; + } + if ((n = mbtowc(&ch, mbbuf, mbpos)) <= 0) + /* Multibyte sequence not done yet. */ + continue; + mbpos = 0; + } + charct++; + if (ch == (wchar_t)'\n') ++linect; - if (isspace(ch)) + if (iswspace(ch)) gotsp = 1; else if (gotsp) { gotsp = 0; @@ -233,6 +256,6 @@ void usage() { - (void)fprintf(stderr, "usage: wc [-clw] [file ...]\n"); + (void)fprintf(stderr, "usage: wc [-c|-m] [-lw] [file ...]\n"); exit(1); } --envbJBWh7q8WU6mo-- To Unsubscribe: send mail to majordomo@FreeBSD.org with "unsubscribe freebsd-standards" in the body of the message