Date: Fri, 18 Jan 2013 22:50:01 GMT From: Brooks Davis <brooks@FreeBSD.org> To: freebsd-bugs@FreeBSD.org Subject: Re: bin/175418: update vis(3) and vis(1) to support multibyte characters Message-ID: <201301182250.r0IMo10K040592@freefall.freebsd.org>
next in thread | raw e-mail | index | archive | help
The following reply was made to PR bin/175418; it has been noted by GNATS. From: Brooks Davis <brooks@FreeBSD.org> To: "J.R. Oldroyd" <fbsd@opal.com> Cc: FreeBSD-gnats-submit@FreeBSD.org Subject: Re: bin/175418: update vis(3) and vis(1) to support multibyte characters Date: Fri, 18 Jan 2013 16:40:16 -0600 On Fri, Jan 18, 2013 at 03:12:30PM -0500, J.R. Oldroyd wrote: > The vis(3) library calls and the vis(1) program do not support multibyte > character sets. As a result many printable characters are not displayed > properly and vice-versa. This patch enhances vis(3) to support multibyte > characters according to the setting of LC_CTYPE and also adjusts vis(1) > so that it reads input in multibyte aware manner. Thank you for your submission. In a case of lousy timing, I merged the replaceming of our vis(3) implementation with NetBSD's to stable/9 four days ago. Any changes now need to go through NetBSD. The good news is that we share a common heritage so much of your patch may still apply (I haven't tried). -- Brooks > > Since vis(3) is also used by ps(1), this patch fixes ps(1) so that wide > characters in command arguments are displayed properly. > >How-To-Repeat: > n/a > >Fix: > --- lib/libc/gen/vis.c.orig 2013-01-02 19:26:41.000000000 -0500 > +++ lib/libc/gen/vis.c 2013-01-17 14:45:55.000000000 -0500 > @@ -35,167 +35,233 @@ > > #include <sys/types.h> > #include <limits.h> > +#include <stdlib.h> > +#include <wchar.h> > +#include <wctype.h> > +#include <string.h> > #include <ctype.h> > #include <stdio.h> > #include <vis.h> > > -#define isoctal(c) (((u_char)(c)) >= '0' && ((u_char)(c)) <= '7') > +#define iswoctal(c) (((u_char)(c)) >= L'0' && ((u_char)(c)) <= L'7') > > /* > - * vis - visually encode characters > + * _vis - visually encode wide characters > */ > -char * > -vis(dst, c, flag, nextc) > - char *dst; > - int c, nextc; > +wchar_t * > +_vis(dst, c, flag, nextc) > + wchar_t *dst; > + wint_t c, nextc; > int flag; > { > - c = (unsigned char)c; > - > if (flag & VIS_HTTPSTYLE) { > /* Described in RFC 1808 */ > - if (!(isalnum(c) /* alpha-numeric */ > + if (!(iswalnum(c) /* alpha-numeric */ > /* safe */ > - || c == '$' || c == '-' || c == '_' || c == '.' || c == '+' > + || c == L'$' || c == L'-' || c == L'_' || c == L'.' || c == L'+' > /* extra */ > - || c == '!' || c == '*' || c == '\'' || c == '(' > - || c == ')' || c == ',')) { > - *dst++ = '%'; > - snprintf(dst, 4, (c < 16 ? "0%X" : "%X"), c); > + || c == L'!' || c == L'*' || c == L'\'' || c == L'(' > + || c == L')' || c == L',')) { > + *dst++ = L'%'; > + swprintf(dst, 4, (c < 16 ? L"0%X" : L"%X"), c); > dst += 2; > goto done; > } > } > > if ((flag & VIS_GLOB) && > - (c == '*' || c == '?' || c == '[' || c == '#')) > + (c == L'*' || c == L'?' || c == L'[' || c == L'#')) > ; > - else if (isgraph(c) || > - ((flag & VIS_SP) == 0 && c == ' ') || > - ((flag & VIS_TAB) == 0 && c == '\t') || > - ((flag & VIS_NL) == 0 && c == '\n') || > - ((flag & VIS_SAFE) && (c == '\b' || c == '\007' || c == '\r'))) { > + else if (iswgraph(c) || > + ((flag & VIS_SP) == 0 && c == L' ') || > + ((flag & VIS_TAB) == 0 && c == L'\t') || > + ((flag & VIS_NL) == 0 && c == L'\n') || > + ((flag & VIS_SAFE) && (c == L'\b' || c == L'\007' || c == L'\r'))) { > *dst++ = c; > - if (c == '\\' && (flag & VIS_NOSLASH) == 0) > - *dst++ = '\\'; > - *dst = '\0'; > - return (dst); > + if (c == L'\\' && (flag & VIS_NOSLASH) == 0) > + *dst++ = L'\\'; > + goto done; > } > > if (flag & VIS_CSTYLE) { > switch(c) { > - case '\n': > - *dst++ = '\\'; > - *dst++ = 'n'; > - goto done; > - case '\r': > - *dst++ = '\\'; > - *dst++ = 'r'; > - goto done; > - case '\b': > - *dst++ = '\\'; > - *dst++ = 'b'; > - goto done; > - case '\a': > - *dst++ = '\\'; > - *dst++ = 'a'; > - goto done; > - case '\v': > - *dst++ = '\\'; > - *dst++ = 'v'; > - goto done; > - case '\t': > - *dst++ = '\\'; > - *dst++ = 't'; > - goto done; > - case '\f': > - *dst++ = '\\'; > - *dst++ = 'f'; > - goto done; > - case ' ': > - *dst++ = '\\'; > - *dst++ = 's'; > - goto done; > - case '\0': > - *dst++ = '\\'; > - *dst++ = '0'; > - if (isoctal(nextc)) { > - *dst++ = '0'; > - *dst++ = '0'; > + case L'\n': > + *dst++ = L'\\'; > + *dst++ = L'n'; > + goto done; > + case L'\r': > + *dst++ = L'\\'; > + *dst++ = L'r'; > + goto done; > + case L'\b': > + *dst++ = L'\\'; > + *dst++ = L'b'; > + goto done; > + case L'\a': > + *dst++ = L'\\'; > + *dst++ = L'a'; > + goto done; > + case L'\v': > + *dst++ = L'\\'; > + *dst++ = L'v'; > + goto done; > + case L'\t': > + *dst++ = L'\\'; > + *dst++ = L't'; > + goto done; > + case L'\f': > + *dst++ = L'\\'; > + *dst++ = L'f'; > + goto done; > + case L' ': > + *dst++ = L'\\'; > + *dst++ = L's'; > + goto done; > + case L'\0': > + *dst++ = L'\\'; > + *dst++ = L'0'; > + if (iswoctal(nextc)) { > + *dst++ = L'0'; > + *dst++ = L'0'; > } > goto done; > } > } > - if (((c & 0177) == ' ') || isgraph(c) || (flag & VIS_OCTAL)) { > - *dst++ = '\\'; > - *dst++ = ((u_char)c >> 6 & 07) + '0'; > - *dst++ = ((u_char)c >> 3 & 07) + '0'; > - *dst++ = ((u_char)c & 07) + '0'; > + if (((c & 0177) == L' ') || (flag & VIS_OCTAL)) { > + *dst++ = L'\\'; > + *dst++ = ((u_char)c >> 6 & 07) + L'0'; > + *dst++ = ((u_char)c >> 3 & 07) + L'0'; > + *dst++ = ((u_char)c & 07) + L'0'; > goto done; > } > if ((flag & VIS_NOSLASH) == 0) > - *dst++ = '\\'; > + *dst++ = L'\\'; > if (c & 0200) { > c &= 0177; > - *dst++ = 'M'; > + *dst++ = L'M'; > } > - if (iscntrl(c)) { > - *dst++ = '^'; > + if (iswcntrl(c)) { > + *dst++ = L'^'; > if (c == 0177) > - *dst++ = '?'; > + *dst++ = L'?'; > else > - *dst++ = c + '@'; > + *dst++ = c + L'@'; > } else { > - *dst++ = '-'; > + *dst++ = L'-'; > *dst++ = c; > } > done: > - *dst = '\0'; > + *dst = L'\0'; > return (dst); > } > > /* > + * vis - visually encode characters > + */ > +char * > +vis(dst, c, flag, nextc) > + char *dst; > + int c, nextc; > + int flag; > +{ > + /* > + * Output may be up to 4 times the size of input plus > + * 1 for the NUL. > + */ > + wchar_t res[5]; > + > + _vis(res, (wint_t) c, flag, (wint_t) nextc); > + wcstombs(dst, res, wcslen(res)+sizeof(wchar_t)); > + return (dst + strlen(dst)); > +} > + > +/* > * strvis, strvisx - visually encode characters from src into dst > * > * Dst must be 4 times the size of src to account for possible > * expansion. The length of dst, not including the trailing NUL, > * is returned. > * > - * Strvisx encodes exactly len bytes from src into dst. > + * Strvisx encodes exactly len characters from src into dst. > * This is useful for encoding a block of data. > */ > int > -strvis(dst, src, flag) > - char *dst; > - const char *src; > +strvis(mbdst, mbsrc, flag) > + char *mbdst; > + const char *mbsrc; > int flag; > { > - char c; > - char *start; > + wchar_t *dst, *src; > + wchar_t *pdst, *psrc; > + wchar_t c; > + wchar_t *start; > + > + if ((psrc = (wchar_t *) calloc((strlen(mbsrc) + 1), > + sizeof(wchar_t))) == NULL) > + return -1; > + if ((pdst = (wchar_t *) calloc(((4 * strlen(mbsrc)) + 1), > + sizeof(wchar_t))) == NULL) { > + free((void *) psrc); > + return -1; > + } > + > + dst = pdst; > + src = psrc; > + > + mbstowcs(src, mbsrc, strlen(mbsrc) + 1); > > for (start = dst; (c = *src); ) > - dst = vis(dst, c, flag, *++src); > - *dst = '\0'; > + dst = _vis(dst, c, flag, *++src); > + > + wcstombs(mbdst, start, dst - start + sizeof(wchar_t)); > + > + free((void *) pdst); > + free((void *) psrc); > + > return (dst - start); > } > > int > -strvisx(dst, src, len, flag) > - char *dst; > - const char *src; > - size_t len; > +strvisx(mbdst, mbsrc, mblen, flag) > + char *mbdst; > + const char *mbsrc; > + size_t mblen; > int flag; > { > - int c; > - char *start; > + wchar_t *dst, *src; > + wchar_t *pdst, *psrc; > + wchar_t c; > + wchar_t *start; > + size_t len; > + > + if ((psrc = (wchar_t *) calloc((strlen(mbsrc) + 1), > + sizeof(wchar_t))) == NULL) > + return -1; > + if ((pdst = (wchar_t *) calloc(((4 * strlen(mbsrc)) + 1), > + sizeof(wchar_t))) == NULL) { > + free((void *) psrc); > + return -1; > + } > + > + dst = pdst; > + src = psrc; > > - for (start = dst; len > 1; len--) { > + len = mbstowcs(src, mbsrc, strlen(mbsrc) + 1); > + > + if (len < mblen) > + mblen = len; > + > + for (start = dst; mblen > 1; mblen--) { > c = *src; > - dst = vis(dst, c, flag, *++src); > + dst = _vis(dst, c, flag, *++src); > } > - if (len) > - dst = vis(dst, *src, flag, '\0'); > - *dst = '\0'; > + if (mblen) > + dst = _vis(dst, *src, flag, L'\0'); > + > + wcstombs(mbdst, start, dst - start + sizeof(wchar_t)); > + > + free((void *) pdst); > + free((void *) psrc); > > return (dst - start); > } > --- lib/libc/gen/vis.3.orig 2013-01-02 19:26:40.000000000 -0500 > +++ lib/libc/gen/vis.3 2013-01-17 14:28:02.000000000 -0500 > @@ -300,9 +300,6 @@ > .Sh HISTORY > These functions first appeared in > .Bx 4.4 . > -.Sh BUGS > -The > -.Nm > -family of functions do not recognize multibyte characters, and thus > -may consider them to be non-printable when they are in fact printable > -(and vice versa.) > +.Pp > +The functions were augmented to add multibyte character support in > +.Fx 9.1 . > --- usr.bin/vis/vis.c.orig 2013-01-02 19:15:19.000000000 -0500 > +++ usr.bin/vis/vis.c 2013-01-16 20:21:54.000000000 -0500 > @@ -45,6 +45,7 @@ > #include <locale.h> > #include <stdio.h> > #include <stdlib.h> > +#include <wchar.h> > #include <unistd.h> > #include <vis.h> > > @@ -139,12 +140,12 @@ > static int col = 0; > static char dummy[] = "\0"; > char *cp = dummy+1; /* so *(cp-1) starts out != '\n' */ > - int c, rachar; > + wint_t c, rachar; > char buff[5]; > > - c = getc(fp); > + c = getwc(fp); > while (c != EOF) { > - rachar = getc(fp); > + rachar = getwc(fp); > if (none) { > cp = buff; > *cp++ = c; > @@ -159,7 +160,7 @@ > *cp++ = '\n'; > *cp = '\0'; > } else > - (void) vis(buff, (char)c, eflags, (char)rachar); > + (void) vis(buff, c, eflags, rachar); > > cp = buff; > if (fold) { > --- usr.bin/vis/vis.1.orig 2013-01-02 19:15:19.000000000 -0500 > +++ usr.bin/vis/vis.1 2013-01-17 14:34:16.000000000 -0500 > @@ -128,11 +128,11 @@ > .Nm > command appeared in > .Bx 4.4 . > -.Sh BUGS > -Due to limitations in the underlying > +.Pp > +The underlying > .Xr vis 3 > -function, the > +function was augmented to add multibyte character support in > +.Fx 9.1 > +at which point the > .Nm > -utility > -does not recognize multibyte characters, and thus may consider them to be > -non-printable when they are in fact printable (and vice versa). > +utility was also updated to be multibyte character aware. > >Release-Note: > >Audit-Trail: > >Unformatted: > _______________________________________________ > freebsd-bugs@freebsd.org mailing list > http://lists.freebsd.org/mailman/listinfo/freebsd-bugs > To unsubscribe, send any mail to "freebsd-bugs-unsubscribe@freebsd.org" >
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201301182250.r0IMo10K040592>