Date: Sun, 3 Feb 2002 23:07:58 +1100 From: "Tim J. Robbins" <tim@robbins.dropbear.id.au> To: freebsd-standards@FreeBSD.ORG Subject: wc -m option Message-ID: <20020203230758.A19532@descent.robbins.dropbear.id.au>
index | next in thread | raw e-mail
[-- Attachment #1 --]
This patch adds the SUSv2 -m option to wc to handle multibyte characters.
Since libc is missing iswspace() from <wctype.h>, I've had to use isspace(),
which doesn't detect some wide space characters. For example, consider the
Japanese locale given in euc(4):
/*
* Code Set 1
*/
[snip]
SPACE 0x09 - 0x0d 0x20
[snip]
/*
* Code Set 2
*/
SPACE 0xa1a1
[snip]
The space characters in Code Set 1 can be used to separate words, but the
one in Code Set 2 (0xa1a1) will not be detected as a word separator, because
the isspace() fallback only recognises single-byte values.
Tim
[-- Attachment #2 --]
Index: wc/wc.1
===================================================================
RCS file: /home/ncvs/src/usr.bin/wc/wc.1,v
retrieving revision 1.10
diff -u -r1.10 wc.1
--- wc/wc.1 2001/08/15 09:09:45 1.10
+++ wc/wc.1 2002/02/03 11:40:21
@@ -43,7 +43,8 @@
.Nd word, line, and byte count
.Sh SYNOPSIS
.Nm
-.Op Fl clw
+.Op Fl c | m
+.Op Fl lw
.Op Ar
.Sh DESCRIPTION
The
@@ -71,6 +72,9 @@
.It Fl l
The number of lines in each input file
is written to the standard output.
+.It Fl m
+The number of characters in each input file
+is written to the standard output.
.It Fl w
The number of words in each input file
is written to the standard output.
@@ -79,7 +83,12 @@
When an option is specified,
.Nm
only reports the information requested by that option.
-The default action is equivalent to specifying all of the flags.
+The default action is equivalent to specifying the
+.Fl c ,
+.Fl l
+and
+.Fl w
+flags.
.Pp
If no files are specified, the standard input is used and no
file name is displayed.
@@ -108,9 +117,11 @@
The
.Nm
function conforms to
-.St -p1003.2 .
+.St -susv2 .
.Sh HISTORY
A
.Nm
command appeared in
.At v1 .
+.Sh BUGS
+Some multibyte space characters are not detected correctly.
Index: wc/wc.c
===================================================================
RCS file: /home/ncvs/src/usr.bin/wc/wc.c,v
retrieving revision 1.12
diff -u -r1.12 wc.c
--- wc/wc.c 2001/12/11 22:23:53 1.12
+++ wc/wc.c 2002/02/03 11:40:22
@@ -51,14 +51,22 @@
#include <ctype.h>
#include <err.h>
#include <fcntl.h>
+#include <limits.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
+#include <wchar.h>
+#include <wctype.h>
+/* XXX iswspace() is not yet implemented by the C library. */
+#ifndef _WCTYPE_H_
+#define iswspace(c) ((c) <= UCHAR_MAX && isspace(c))
+#endif
+
u_quad_t tlinect, twordct, tcharct;
-int doline, doword, dochar;
+int doline, doword, dochar, multi;
int cnt __P((const char *));
void usage __P((void));
@@ -72,17 +80,20 @@
(void) setlocale(LC_CTYPE, "");
- while ((ch = getopt(argc, argv, "lwc")) != -1)
- switch((char)ch) {
+ while ((ch = getopt(argc, argv, "clmw")) != -1)
+ switch(ch) {
+ case 'm':
+ multi = 1;
+ /*FALLTHROUGH*/
+ case 'c':
+ dochar = 1;
+ break;
case 'l':
doline = 1;
break;
case 'w':
doword = 1;
break;
- case 'c':
- dochar = 1;
- break;
case '?':
default:
usage();
@@ -128,10 +139,11 @@
{
struct stat sb;
u_quad_t linect, wordct, charct;
- int fd, len;
+ int fd, len, mbpos, n;
short gotsp;
u_char *p;
- u_char buf[MAXBSIZE], ch;
+ u_char buf[MAXBSIZE], mbbuf[MB_LEN_MAX+1];
+ wchar_t ch;
linect = wordct = charct = 0;
if (file == NULL) {
@@ -149,7 +161,7 @@
* lines than to get words, since the word count requires some
* logic.
*/
- if (doline) {
+ if (doline && !multi) {
while ((len = read(fd, buf, MAXBSIZE))) {
if (len == -1) {
warn("%s: read", file);
@@ -174,7 +186,7 @@
* If all we need is the number of characters and it's a
* regular or linked file, just stat the puppy.
*/
- if (dochar) {
+ if (dochar && !multi) {
if (fstat(fd, &sb)) {
warn("%s: fstat", file);
(void)close(fd);
@@ -190,23 +202,34 @@
}
/* Do it the hard way... */
-word: for (gotsp = 1; (len = read(fd, buf, MAXBSIZE));) {
+word: mbpos = 0;
+ mbtowc(NULL, NULL, 0);
+ for (gotsp = 1; (len = read(fd, buf, MAXBSIZE));) {
if (len == -1) {
warn("%s: read", file);
(void)close(fd);
return (1);
}
- /*
- * This loses in the presence of multi-byte characters.
- * To do it right would require a function to return a
- * character while knowing how many bytes it consumed.
- */
- charct += len;
for (p = buf; len--;) {
- ch = *p++;
- if (ch == '\n')
+ if (!multi)
+ ch = (wchar_t)*p++;
+ else {
+ mbbuf[mbpos++] = *p++;
+ if (mbpos > MB_LEN_MAX) {
+ /* Multibyte sequence too long. */
+ mbtowc(NULL, NULL, 0);
+ mbpos = 0;
+ continue;
+ }
+ if ((n = mbtowc(&ch, mbbuf, mbpos)) <= 0)
+ /* Multibyte sequence not done yet. */
+ continue;
+ mbpos = 0;
+ }
+ charct++;
+ if (ch == (wchar_t)'\n')
++linect;
- if (isspace(ch))
+ if (iswspace(ch))
gotsp = 1;
else if (gotsp) {
gotsp = 0;
@@ -233,6 +256,6 @@
void
usage()
{
- (void)fprintf(stderr, "usage: wc [-clw] [file ...]\n");
+ (void)fprintf(stderr, "usage: wc [-c|-m] [-lw] [file ...]\n");
exit(1);
}
help
Want to link to this message? Use this
URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20020203230758.A19532>
