Skip site navigation (1)Skip section navigation (2)
Date:      Fri, 18 Jan 2013 22:50:01 GMT
From:      Brooks Davis <brooks@FreeBSD.org>
To:        freebsd-bugs@FreeBSD.org
Subject:   Re: bin/175418: update vis(3) and vis(1) to support multibyte characters
Message-ID:  <201301182250.r0IMo10K040592@freefall.freebsd.org>

next in thread | raw e-mail | index | archive | help
The following reply was made to PR bin/175418; it has been noted by GNATS.

From: Brooks Davis <brooks@FreeBSD.org>
To: "J.R. Oldroyd" <fbsd@opal.com>
Cc: FreeBSD-gnats-submit@FreeBSD.org
Subject: Re: bin/175418: update vis(3) and vis(1) to support multibyte
 characters
Date: Fri, 18 Jan 2013 16:40:16 -0600

 On Fri, Jan 18, 2013 at 03:12:30PM -0500, J.R. Oldroyd wrote:
 > The vis(3) library calls and the vis(1) program do not support multibyte
 > character sets.  As a result many printable characters are not displayed
 > properly and vice-versa.  This patch enhances vis(3) to support multibyte
 > characters according to the setting of LC_CTYPE and also adjusts vis(1)
 > so that it reads input in multibyte aware manner.
 
 Thank you for your submission.  In a case of lousy timing, I merged the
 replaceming of our vis(3) implementation with NetBSD's to stable/9 four
 days ago.  Any changes now need to go through NetBSD.  The good news is
 that we share a common heritage so much of your patch may still apply (I
 haven't tried).
 
 -- Brooks
 
 > 
 > Since vis(3) is also used by ps(1), this patch fixes ps(1) so that wide
 > characters in command arguments are displayed properly.
 > >How-To-Repeat:
 > n/a
 > >Fix:
 > --- lib/libc/gen/vis.c.orig	2013-01-02 19:26:41.000000000 -0500
 > +++ lib/libc/gen/vis.c	2013-01-17 14:45:55.000000000 -0500
 > @@ -35,167 +35,233 @@
 >  
 >  #include <sys/types.h>
 >  #include <limits.h>
 > +#include <stdlib.h>
 > +#include <wchar.h>
 > +#include <wctype.h>
 > +#include <string.h>
 >  #include <ctype.h>
 >  #include <stdio.h>
 >  #include <vis.h>
 >  
 > -#define	isoctal(c)	(((u_char)(c)) >= '0' && ((u_char)(c)) <= '7')
 > +#define	iswoctal(c)	(((u_char)(c)) >= L'0' && ((u_char)(c)) <= L'7')
 >  
 >  /*
 > - * vis - visually encode characters
 > + * _vis - visually encode wide characters
 >   */
 > -char *
 > -vis(dst, c, flag, nextc)
 > -	char *dst;
 > -	int c, nextc;
 > +wchar_t *
 > +_vis(dst, c, flag, nextc)
 > +	wchar_t *dst;
 > +	wint_t c, nextc;
 >  	int flag;
 >  {
 > -	c = (unsigned char)c;
 > -
 >  	if (flag & VIS_HTTPSTYLE) {
 >  		/* Described in RFC 1808 */
 > -		if (!(isalnum(c) /* alpha-numeric */
 > +		if (!(iswalnum(c) /* alpha-numeric */
 >  		    /* safe */
 > -		    || c == '$' || c == '-' || c == '_' || c == '.' || c == '+'
 > +		    || c == L'$' || c == L'-' || c == L'_' || c == L'.' || c == L'+'
 >  		    /* extra */
 > -		    || c == '!' || c == '*' || c == '\'' || c == '('
 > -		    || c == ')' || c == ',')) {
 > -			*dst++ = '%';
 > -			snprintf(dst, 4, (c < 16 ? "0%X" : "%X"), c);
 > +		    || c == L'!' || c == L'*' || c == L'\'' || c == L'('
 > +		    || c == L')' || c == L',')) {
 > +			*dst++ = L'%';
 > +			swprintf(dst, 4, (c < 16 ? L"0%X" : L"%X"), c);
 >  			dst += 2;
 >  			goto done;
 >  		}
 >  	}
 >  
 >  	if ((flag & VIS_GLOB) &&
 > -	    (c == '*' || c == '?' || c == '[' || c == '#'))
 > +	    (c == L'*' || c == L'?' || c == L'[' || c == L'#'))
 >  		;
 > -	else if (isgraph(c) ||
 > -	   ((flag & VIS_SP) == 0 && c == ' ') ||
 > -	   ((flag & VIS_TAB) == 0 && c == '\t') ||
 > -	   ((flag & VIS_NL) == 0 && c == '\n') ||
 > -	   ((flag & VIS_SAFE) && (c == '\b' || c == '\007' || c == '\r'))) {
 > +	else if (iswgraph(c) ||
 > +	   ((flag & VIS_SP) == 0 && c == L' ') ||
 > +	   ((flag & VIS_TAB) == 0 && c == L'\t') ||
 > +	   ((flag & VIS_NL) == 0 && c == L'\n') ||
 > +	   ((flag & VIS_SAFE) && (c == L'\b' || c == L'\007' || c == L'\r'))) {
 >  		*dst++ = c;
 > -		if (c == '\\' && (flag & VIS_NOSLASH) == 0)
 > -			*dst++ = '\\';
 > -		*dst = '\0';
 > -		return (dst);
 > +		if (c == L'\\' && (flag & VIS_NOSLASH) == 0)
 > +			*dst++ = L'\\';
 > +		goto done;
 >  	}
 >  
 >  	if (flag & VIS_CSTYLE) {
 >  		switch(c) {
 > -		case '\n':
 > -			*dst++ = '\\';
 > -			*dst++ = 'n';
 > -			goto done;
 > -		case '\r':
 > -			*dst++ = '\\';
 > -			*dst++ = 'r';
 > -			goto done;
 > -		case '\b':
 > -			*dst++ = '\\';
 > -			*dst++ = 'b';
 > -			goto done;
 > -		case '\a':
 > -			*dst++ = '\\';
 > -			*dst++ = 'a';
 > -			goto done;
 > -		case '\v':
 > -			*dst++ = '\\';
 > -			*dst++ = 'v';
 > -			goto done;
 > -		case '\t':
 > -			*dst++ = '\\';
 > -			*dst++ = 't';
 > -			goto done;
 > -		case '\f':
 > -			*dst++ = '\\';
 > -			*dst++ = 'f';
 > -			goto done;
 > -		case ' ':
 > -			*dst++ = '\\';
 > -			*dst++ = 's';
 > -			goto done;
 > -		case '\0':
 > -			*dst++ = '\\';
 > -			*dst++ = '0';
 > -			if (isoctal(nextc)) {
 > -				*dst++ = '0';
 > -				*dst++ = '0';
 > +		case L'\n':
 > +			*dst++ = L'\\';
 > +			*dst++ = L'n';
 > +			goto done;
 > +		case L'\r':
 > +			*dst++ = L'\\';
 > +			*dst++ = L'r';
 > +			goto done;
 > +		case L'\b':
 > +			*dst++ = L'\\';
 > +			*dst++ = L'b';
 > +			goto done;
 > +		case L'\a':
 > +			*dst++ = L'\\';
 > +			*dst++ = L'a';
 > +			goto done;
 > +		case L'\v':
 > +			*dst++ = L'\\';
 > +			*dst++ = L'v';
 > +			goto done;
 > +		case L'\t':
 > +			*dst++ = L'\\';
 > +			*dst++ = L't';
 > +			goto done;
 > +		case L'\f':
 > +			*dst++ = L'\\';
 > +			*dst++ = L'f';
 > +			goto done;
 > +		case L' ':
 > +			*dst++ = L'\\';
 > +			*dst++ = L's';
 > +			goto done;
 > +		case L'\0':
 > +			*dst++ = L'\\';
 > +			*dst++ = L'0';
 > +			if (iswoctal(nextc)) {
 > +				*dst++ = L'0';
 > +				*dst++ = L'0';
 >  			}
 >  			goto done;
 >  		}
 >  	}
 > -	if (((c & 0177) == ' ') || isgraph(c) || (flag & VIS_OCTAL)) {
 > -		*dst++ = '\\';
 > -		*dst++ = ((u_char)c >> 6 & 07) + '0';
 > -		*dst++ = ((u_char)c >> 3 & 07) + '0';
 > -		*dst++ = ((u_char)c & 07) + '0';
 > +	if (((c & 0177) == L' ') || (flag & VIS_OCTAL)) {
 > +		*dst++ = L'\\';
 > +		*dst++ = ((u_char)c >> 6 & 07) + L'0';
 > +		*dst++ = ((u_char)c >> 3 & 07) + L'0';
 > +		*dst++ = ((u_char)c & 07) + L'0';
 >  		goto done;
 >  	}
 >  	if ((flag & VIS_NOSLASH) == 0)
 > -		*dst++ = '\\';
 > +		*dst++ = L'\\';
 >  	if (c & 0200) {
 >  		c &= 0177;
 > -		*dst++ = 'M';
 > +		*dst++ = L'M';
 >  	}
 > -	if (iscntrl(c)) {
 > -		*dst++ = '^';
 > +	if (iswcntrl(c)) {
 > +		*dst++ = L'^';
 >  		if (c == 0177)
 > -			*dst++ = '?';
 > +			*dst++ = L'?';
 >  		else
 > -			*dst++ = c + '@';
 > +			*dst++ = c + L'@';
 >  	} else {
 > -		*dst++ = '-';
 > +		*dst++ = L'-';
 >  		*dst++ = c;
 >  	}
 >  done:
 > -	*dst = '\0';
 > +	*dst = L'\0';
 >  	return (dst);
 >  }
 >  
 >  /*
 > + * vis - visually encode characters
 > + */
 > +char *
 > +vis(dst, c, flag, nextc)
 > +	char *dst;
 > +	int c, nextc;
 > +	int flag;
 > +{
 > +	/*
 > +	 * Output may be up to 4 times the size of input plus
 > +	 * 1 for the NUL.
 > +	 */
 > +	wchar_t res[5];
 > +
 > +	_vis(res, (wint_t) c, flag, (wint_t) nextc);
 > +	wcstombs(dst, res, wcslen(res)+sizeof(wchar_t));
 > +	return (dst + strlen(dst));
 > +}
 > +
 > +/*
 >   * strvis, strvisx - visually encode characters from src into dst
 >   *
 >   *	Dst must be 4 times the size of src to account for possible
 >   *	expansion.  The length of dst, not including the trailing NUL,
 >   *	is returned.
 >   *
 > - *	Strvisx encodes exactly len bytes from src into dst.
 > + *	Strvisx encodes exactly len characters from src into dst.
 >   *	This is useful for encoding a block of data.
 >   */
 >  int
 > -strvis(dst, src, flag)
 > -	char *dst;
 > -	const char *src;
 > +strvis(mbdst, mbsrc, flag)
 > +	char *mbdst;
 > +	const char *mbsrc;
 >  	int flag;
 >  {
 > -	char c;
 > -	char *start;
 > +	wchar_t *dst, *src;
 > +	wchar_t *pdst, *psrc;
 > +	wchar_t c;
 > +	wchar_t *start;
 > +
 > +	if ((psrc = (wchar_t *) calloc((strlen(mbsrc) + 1),
 > +	    sizeof(wchar_t))) == NULL)
 > +		return -1;
 > +	if ((pdst = (wchar_t *) calloc(((4 * strlen(mbsrc)) + 1),
 > +	    sizeof(wchar_t))) == NULL) {
 > +		free((void *) psrc);
 > +		return -1;
 > +	}
 > +
 > +	dst = pdst;
 > +	src = psrc;
 > +
 > +	mbstowcs(src, mbsrc, strlen(mbsrc) + 1);
 >  
 >  	for (start = dst; (c = *src); )
 > -		dst = vis(dst, c, flag, *++src);
 > -	*dst = '\0';
 > +		dst = _vis(dst, c, flag, *++src);
 > +
 > +	wcstombs(mbdst, start, dst - start + sizeof(wchar_t));
 > +
 > +	free((void *) pdst);
 > +	free((void *) psrc);
 > +
 >  	return (dst - start);
 >  }
 >  
 >  int
 > -strvisx(dst, src, len, flag)
 > -	char *dst;
 > -	const char *src;
 > -	size_t len;
 > +strvisx(mbdst, mbsrc, mblen, flag)
 > +	char *mbdst;
 > +	const char *mbsrc;
 > +	size_t mblen;
 >  	int flag;
 >  {
 > -	int c;
 > -	char *start;
 > +	wchar_t *dst, *src;
 > +	wchar_t *pdst, *psrc;
 > +	wchar_t c;
 > +	wchar_t *start;
 > +	size_t len;
 > +
 > +	if ((psrc = (wchar_t *) calloc((strlen(mbsrc) + 1),
 > +	    sizeof(wchar_t))) == NULL)
 > +		return -1;
 > +	if ((pdst = (wchar_t *) calloc(((4 * strlen(mbsrc)) + 1),
 > +	    sizeof(wchar_t))) == NULL) {
 > +		free((void *) psrc);
 > +		return -1;
 > +	}
 > +
 > +	dst = pdst;
 > +	src = psrc;
 >  
 > -	for (start = dst; len > 1; len--) {
 > +	len = mbstowcs(src, mbsrc, strlen(mbsrc) + 1);
 > +
 > +	if (len < mblen)
 > +		mblen = len;
 > +
 > +	for (start = dst; mblen > 1; mblen--) {
 >  		c = *src;
 > -		dst = vis(dst, c, flag, *++src);
 > +		dst = _vis(dst, c, flag, *++src);
 >  	}
 > -	if (len)
 > -		dst = vis(dst, *src, flag, '\0');
 > -	*dst = '\0';
 > +	if (mblen)
 > +		dst = _vis(dst, *src, flag, L'\0');
 > +
 > +	wcstombs(mbdst, start, dst - start + sizeof(wchar_t));
 > +
 > +	free((void *) pdst);
 > +	free((void *) psrc);
 >  
 >  	return (dst - start);
 >  }
 > --- lib/libc/gen/vis.3.orig	2013-01-02 19:26:40.000000000 -0500
 > +++ lib/libc/gen/vis.3	2013-01-17 14:28:02.000000000 -0500
 > @@ -300,9 +300,6 @@
 >  .Sh HISTORY
 >  These functions first appeared in
 >  .Bx 4.4 .
 > -.Sh BUGS
 > -The
 > -.Nm
 > -family of functions do not recognize multibyte characters, and thus
 > -may consider them to be non-printable when they are in fact printable
 > -(and vice versa.)
 > +.Pp
 > +The functions were augmented to add multibyte character support in
 > +.Fx 9.1 .
 > --- usr.bin/vis/vis.c.orig	2013-01-02 19:15:19.000000000 -0500
 > +++ usr.bin/vis/vis.c	2013-01-16 20:21:54.000000000 -0500
 > @@ -45,6 +45,7 @@
 >  #include <locale.h>
 >  #include <stdio.h>
 >  #include <stdlib.h>
 > +#include <wchar.h>
 >  #include <unistd.h>
 >  #include <vis.h>
 >  
 > @@ -139,12 +140,12 @@
 >  	static int col = 0;
 >  	static char dummy[] = "\0";
 >  	char *cp = dummy+1; /* so *(cp-1) starts out != '\n' */
 > -	int c, rachar;
 > +	wint_t c, rachar;
 >  	char buff[5];
 >  
 > -	c = getc(fp);
 > +	c = getwc(fp);
 >  	while (c != EOF) {
 > -		rachar = getc(fp);
 > +		rachar = getwc(fp);
 >  		if (none) {
 >  			cp = buff;
 >  			*cp++ = c;
 > @@ -159,7 +160,7 @@
 >  			*cp++ = '\n';
 >  			*cp = '\0';
 >  		} else
 > -			(void) vis(buff, (char)c, eflags, (char)rachar);
 > +			(void) vis(buff, c, eflags, rachar);
 >  
 >  		cp = buff;
 >  		if (fold) {
 > --- usr.bin/vis/vis.1.orig	2013-01-02 19:15:19.000000000 -0500
 > +++ usr.bin/vis/vis.1	2013-01-17 14:34:16.000000000 -0500
 > @@ -128,11 +128,11 @@
 >  .Nm
 >  command appeared in
 >  .Bx 4.4 .
 > -.Sh BUGS
 > -Due to limitations in the underlying
 > +.Pp
 > +The underlying
 >  .Xr vis 3
 > -function, the
 > +function was augmented to add multibyte character support in
 > +.Fx 9.1
 > +at which point the
 >  .Nm
 > -utility
 > -does not recognize multibyte characters, and thus may consider them to be
 > -non-printable when they are in fact printable (and vice versa).
 > +utility was also updated to be multibyte character aware.
 > >Release-Note:
 > >Audit-Trail:
 > >Unformatted:
 > _______________________________________________
 > freebsd-bugs@freebsd.org mailing list
 > http://lists.freebsd.org/mailman/listinfo/freebsd-bugs
 > To unsubscribe, send any mail to "freebsd-bugs-unsubscribe@freebsd.org"
 > 



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201301182250.r0IMo10K040592>