Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 6 Mar 2010 19:21:57 +0000 (UTC)
From:      "Andrey A. Chernov" <ache@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r204803 - head/usr.bin/uniq
Message-ID:  <201003061921.o26JLv36014114@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: ache
Date: Sat Mar  6 19:21:57 2010
New Revision: 204803
URL: http://svn.freebsd.org/changeset/base/204803

Log:
  1) Rewrite input processing to not exit with an error on the first EILSEQ
  found in the input data but fall back to a "binary equal" check instead.
  
  POSIX says: "The input file shall be a text file", nothing more,
  so a text file with an illegal sequence is valid input.
  BTW, GNU sort does not fail on EILSEQ either.
  
  2) Speed up input processing a bit in complex cases like skipping fields,
  chars or ignoring case.
  
  3) Enforce the implied LINE_MAX limit (from the POSIX definition of "text
  file" and the POSIX uniq(1) description).

Modified:
  head/usr.bin/uniq/uniq.c

Modified: head/usr.bin/uniq/uniq.c
==============================================================================
--- head/usr.bin/uniq/uniq.c	Sat Mar  6 17:31:09 2010	(r204802)
+++ head/usr.bin/uniq/uniq.c	Sat Mar  6 19:21:57 2010	(r204803)
@@ -60,31 +60,25 @@ static const char rcsid[] =
 #include <wchar.h>
 #include <wctype.h>
 
-#define	INITLINELEN	(LINE_MAX + 1)
-#define	MAXLINELEN	((SIZE_MAX / sizeof(wchar_t)) / 2)
-
-int cflag, dflag, uflag;
+int cflag, dflag, uflag, iflag;
 int numchars, numfields, repeats;
 
 FILE	*file(const char *, const char *);
-wchar_t	*getline(wchar_t *, size_t *, FILE *);
-void	 show(FILE *, wchar_t *);
+wchar_t	*convert(wchar_t *, const char *);
+char	*getlinemax(char *, FILE *);
+void	 show(FILE *, const char *);
 wchar_t	*skip(wchar_t *);
 void	 obsolete(char *[]);
 static void	 usage(void);
-int      wcsicoll(wchar_t *, wchar_t *);
 
 int
 main (int argc, char *argv[])
 {
-	wchar_t *t1, *t2;
+	wchar_t *tprev, *tthis, *wprev, *wthis, *wp;
 	FILE *ifp, *ofp;
-	int ch, b1;
-	size_t prevbuflen, thisbuflen;
-	wchar_t *prevline, *thisline;
-	char *p;
+	int ch, comp;
+	char *prevline, *thisline, *p;
 	const char *ifn;
-	int iflag = 0, comp;
 
 	(void) setlocale(LC_ALL, "");
 
@@ -139,48 +133,47 @@ main (int argc, char *argv[])
 	if (argc > 1)
 		ofp = file(argv[1], "w");
 
- 	prevbuflen = INITLINELEN;
- 	thisbuflen = INITLINELEN;
- 	prevline = malloc(prevbuflen * sizeof(*prevline));
- 	thisline = malloc(thisbuflen * sizeof(*thisline));
-	if (prevline == NULL || thisline == NULL)
+	prevline = malloc(LINE_MAX);
+	thisline = malloc(LINE_MAX);
+	wprev = malloc(LINE_MAX * sizeof(*wprev));
+	wthis = malloc(LINE_MAX * sizeof(*wthis));
+	if (prevline == NULL || thisline == NULL ||
+	    wprev == NULL || wthis == NULL)
 		err(1, "malloc");
 
-	if ((prevline = getline(prevline, &prevbuflen, ifp)) == NULL) {
+	if ((prevline = getlinemax(prevline, ifp)) == NULL) {
 		if (ferror(ifp))
 			err(1, "%s", ifn);
 		exit(0);
 	}
+	tprev = convert(wprev, prevline);
+
 	if (!cflag && uflag && dflag)
 		show(ofp, prevline);
 
-	while ((thisline = getline(thisline, &thisbuflen, ifp)) != NULL) {
-		/* If requested get the chosen fields + character offsets. */
-		if (numfields || numchars) {
-			t1 = skip(thisline);
-			t2 = skip(prevline);
-		} else {
-			t1 = thisline;
-			t2 = prevline;
-		}
+	while ((thisline = getlinemax(thisline, ifp)) != NULL) {
+		tthis = convert(wthis, thisline);
 
-		/* If different, print; set previous to new value. */
-		if (iflag)
-			comp = wcsicoll(t1, t2);
+		if (tthis == NULL && tprev == NULL)
+			comp = strcmp(thisline, prevline);
+		else if (tthis == NULL || tprev == NULL)
+			comp = 1;
 		else
-			comp = wcscoll(t1, t2);
+			comp = wcscoll(tthis, tprev);
 
 		if (comp) {
+			/* If different, print; set previous to new value. */
 			if (cflag || !dflag || !uflag)
 				show(ofp, prevline);
-			t1 = prevline;
-			b1 = prevbuflen;
+			p = prevline;
+			wp = wprev;
 			prevline = thisline;
-			prevbuflen = thisbuflen;
+			wprev = wthis;
+			tprev = tthis;
 			if (!cflag && uflag && dflag)
 				show(ofp, prevline);
-			thisline = t1;
-			thisbuflen = b1;
+			thisline = p;
+			wthis = wp;
 			repeats = 0;
 		} else
 			++repeats;
@@ -192,44 +185,61 @@ main (int argc, char *argv[])
 	exit(0);
 }
 
-wchar_t *
-getline(wchar_t *buf, size_t *buflen, FILE *fp)
+char *
+getlinemax(char *buf, FILE *fp)
 {
 	size_t bufpos;
-	wint_t ch;
+	int ch;
 
 	bufpos = 0;
-	while ((ch = getwc(fp)) != WEOF && ch != '\n') {
-		if (bufpos + 1 >= *buflen) {
-			*buflen = *buflen * 2;
-			if (*buflen > MAXLINELEN)
-				errx(1,
-				    "Maximum line buffer length (%zu) exceeded",
-				    MAXLINELEN);
-			buf = reallocf(buf, *buflen * sizeof(*buf));
-			if (buf == NULL)
-				err(1, "reallocf");
-		}
+	while ((ch = getc(fp)) != EOF && ch != '\n') {
 		buf[bufpos++] = ch;
+		if (bufpos >= LINE_MAX)
+			errx(1, "Maximum line length (%zu) exceeded",
+			     LINE_MAX);
 	}
 	buf[bufpos] = '\0';
 
 	return (bufpos != 0 || ch == '\n' ? buf : NULL);
 }
 
+wchar_t *
+convert(wchar_t *buf, const char *str)
+{
+	size_t n;
+	wchar_t *p, *ret;
+
+	if ((n = mbstowcs(buf, str, LINE_MAX)) == LINE_MAX)
+		errx(1, "Maximum line length (%zu) exceeded", LINE_MAX);
+	else if (n != (size_t)-1) {
+		/* If requested get the chosen fields + character offsets. */
+		if (numfields || numchars)
+			ret = skip(buf);
+		else
+			ret = buf;
+		if (iflag) {
+			for (p = ret; *p != L'\0'; p++)
+				*p = towlower(*p);
+		}
+	} else
+		ret = NULL;
+
+	return (ret);
+}
+
 /*
  * show --
  *	Output a line depending on the flags and number of repetitions
  *	of the line.
  */
 void
-show(FILE *ofp, wchar_t *str)
+show(FILE *ofp, const char *str)
 {
 
 	if (cflag)
-		(void)fprintf(ofp, "%4d %ls\n", repeats + 1, str);
+		(void)fprintf(ofp, "%4d %s\n", repeats + 1, str);
 	if ((dflag && repeats) || (uflag && !repeats))
-		(void)fprintf(ofp, "%ls\n", str);
+		(void)fprintf(ofp, "%s\n", str);
 }
 
 wchar_t *
@@ -237,13 +247,14 @@ skip(wchar_t *str)
 {
 	int nchars, nfields;
 
-	for (nfields = 0; *str != '\0' && nfields++ != numfields; ) {
+	for (nfields = 0; *str != L'\0' && nfields++ != numfields; ) {
 		while (iswblank(*str))
 			str++;
-		while (*str != '\0' && !iswblank(*str))
+		while (*str != L'\0' && !iswblank(*str))
 			str++;
 	}
-	for (nchars = numchars; nchars-- && *str; ++str);
+	for (nchars = numchars; nchars-- && *str != L'\0'; ++str)
+		;
 	return(str);
 }
 
@@ -293,52 +304,3 @@ usage(void)
 "usage: uniq [-c | -d | -u] [-i] [-f fields] [-s chars] [input [output]]\n");
 	exit(1);
 }
-
-static size_t wcsicoll_l1_buflen = 0, wcsicoll_l2_buflen = 0;
-static wchar_t *wcsicoll_l1_buf = NULL, *wcsicoll_l2_buf = NULL;
-
-int
-wcsicoll(wchar_t *s1, wchar_t *s2)
-{
-	wchar_t *p;
-	size_t l1, l2;
-	size_t new_l1_buflen, new_l2_buflen;
-
-	l1 = wcslen(s1) + 1;
-	l2 = wcslen(s2) + 1;
-	new_l1_buflen = wcsicoll_l1_buflen;
-	new_l2_buflen = wcsicoll_l2_buflen;
-	while (new_l1_buflen < l1) {
-		if (new_l1_buflen == 0)
-			new_l1_buflen = INITLINELEN;
-		else
-			new_l1_buflen *= 2;
-	}
-	while (new_l2_buflen < l2) {
-		if (new_l2_buflen == 0)
-			new_l2_buflen = INITLINELEN;
-		else
-			new_l2_buflen *= 2;
-	}
-	if (new_l1_buflen > wcsicoll_l1_buflen) {
-		wcsicoll_l1_buf = reallocf(wcsicoll_l1_buf, new_l1_buflen * sizeof(*wcsicoll_l1_buf));
-		if (wcsicoll_l1_buf == NULL)
-                	err(1, "reallocf");
-		wcsicoll_l1_buflen = new_l1_buflen;
-	}
-	if (new_l2_buflen > wcsicoll_l2_buflen) {
-		wcsicoll_l2_buf = reallocf(wcsicoll_l2_buf, new_l2_buflen * sizeof(*wcsicoll_l2_buf));
-		if (wcsicoll_l2_buf == NULL)
-                	err(1, "reallocf");
-		wcsicoll_l2_buflen = new_l2_buflen;
-	}
-
-	for (p = wcsicoll_l1_buf; *s1; s1++)
-		*p++ = towlower(*s1);
-	*p = '\0';
-	for (p = wcsicoll_l2_buf; *s2; s2++)
-		*p++ = towlower(*s2);
-	*p = '\0';
-
-	return (wcscoll(wcsicoll_l1_buf, wcsicoll_l2_buf));
-}



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201003061921.o26JLv36014114>