Date: Fri, 18 Jan 2013 15:12:30 -0500 (EST) From: "J.R. Oldroyd" <fbsd@opal.com> To: FreeBSD-gnats-submit@freebsd.org Subject: bin/175418: update vis(3) and vis(1) to support multibyte characters Message-ID: <201301182012.r0IKCUiE077301@shibato.opal.com> Resent-Message-ID: <201301182020.r0IKK0pf014238@freefall.freebsd.org>
next in thread | raw e-mail | index | archive | help
>Number: 175418 >Category: bin >Synopsis: update vis(3) and vis(1) to support multibyte characters >Confidential: no >Severity: non-critical >Priority: medium >Responsible: freebsd-bugs >State: open >Quarter: >Keywords: >Date-Required: >Class: sw-bug >Submitter-Id: current-users >Arrival-Date: Fri Jan 18 20:20:00 UTC 2013 >Closed-Date: >Last-Modified: >Originator: J.R. Oldroyd >Release: FreeBSD 9.1-RELEASE amd64 >Organization: >Environment: System: FreeBSD xx.opal.com 9.1-RELEASE FreeBSD 9.1-RELEASE #0 r244985: Tue Jan 8 10:51:13 EST 2013 xx@shibato.opal.com:/usr/src/sys/amd64/compile/GENERIC amd64 >Description: The vis(3) library calls and the vis(1) program do not support multibyte character sets. As a result, many printable multibyte characters are treated as non-printable (and vice versa), so they are not displayed properly. This patch enhances vis(3) to support multibyte characters according to the setting of LC_CTYPE, and also adjusts vis(1) so that it reads its input in a multibyte-aware manner. Since vis(3) is also used by ps(1), this patch fixes ps(1) so that wide characters in command arguments are displayed properly. 
>How-To-Repeat: n/a >Fix: --- lib/libc/gen/vis.c.orig 2013-01-02 19:26:41.000000000 -0500 +++ lib/libc/gen/vis.c 2013-01-17 14:45:55.000000000 -0500 @@ -35,167 +35,233 @@ #include <sys/types.h> #include <limits.h> +#include <stdlib.h> +#include <wchar.h> +#include <wctype.h> +#include <string.h> #include <ctype.h> #include <stdio.h> #include <vis.h> -#define isoctal(c) (((u_char)(c)) >= '0' && ((u_char)(c)) <= '7') +#define iswoctal(c) (((u_char)(c)) >= L'0' && ((u_char)(c)) <= L'7') /* - * vis - visually encode characters + * _vis - visually encode wide characters */ -char * -vis(dst, c, flag, nextc) - char *dst; - int c, nextc; +wchar_t * +_vis(dst, c, flag, nextc) + wchar_t *dst; + wint_t c, nextc; int flag; { - c = (unsigned char)c; - if (flag & VIS_HTTPSTYLE) { /* Described in RFC 1808 */ - if (!(isalnum(c) /* alpha-numeric */ + if (!(iswalnum(c) /* alpha-numeric */ /* safe */ - || c == '$' || c == '-' || c == '_' || c == '.' || c == '+' + || c == L'$' || c == L'-' || c == L'_' || c == L'.' || c == L'+' /* extra */ - || c == '!' || c == '*' || c == '\'' || c == '(' - || c == ')' || c == ',')) { - *dst++ = '%'; - snprintf(dst, 4, (c < 16 ? "0%X" : "%X"), c); + || c == L'!' || c == L'*' || c == L'\'' || c == L'(' + || c == L')' || c == L',')) { + *dst++ = L'%'; + swprintf(dst, 4, (c < 16 ? L"0%X" : L"%X"), c); dst += 2; goto done; } } if ((flag & VIS_GLOB) && - (c == '*' || c == '?' || c == '[' || c == '#')) + (c == L'*' || c == L'?' 
|| c == L'[' || c == L'#')) ; - else if (isgraph(c) || - ((flag & VIS_SP) == 0 && c == ' ') || - ((flag & VIS_TAB) == 0 && c == '\t') || - ((flag & VIS_NL) == 0 && c == '\n') || - ((flag & VIS_SAFE) && (c == '\b' || c == '\007' || c == '\r'))) { + else if (iswgraph(c) || + ((flag & VIS_SP) == 0 && c == L' ') || + ((flag & VIS_TAB) == 0 && c == L'\t') || + ((flag & VIS_NL) == 0 && c == L'\n') || + ((flag & VIS_SAFE) && (c == L'\b' || c == L'\007' || c == L'\r'))) { *dst++ = c; - if (c == '\\' && (flag & VIS_NOSLASH) == 0) - *dst++ = '\\'; - *dst = '\0'; - return (dst); + if (c == L'\\' && (flag & VIS_NOSLASH) == 0) + *dst++ = L'\\'; + goto done; } if (flag & VIS_CSTYLE) { switch(c) { - case '\n': - *dst++ = '\\'; - *dst++ = 'n'; - goto done; - case '\r': - *dst++ = '\\'; - *dst++ = 'r'; - goto done; - case '\b': - *dst++ = '\\'; - *dst++ = 'b'; - goto done; - case '\a': - *dst++ = '\\'; - *dst++ = 'a'; - goto done; - case '\v': - *dst++ = '\\'; - *dst++ = 'v'; - goto done; - case '\t': - *dst++ = '\\'; - *dst++ = 't'; - goto done; - case '\f': - *dst++ = '\\'; - *dst++ = 'f'; - goto done; - case ' ': - *dst++ = '\\'; - *dst++ = 's'; - goto done; - case '\0': - *dst++ = '\\'; - *dst++ = '0'; - if (isoctal(nextc)) { - *dst++ = '0'; - *dst++ = '0'; + case L'\n': + *dst++ = L'\\'; + *dst++ = L'n'; + goto done; + case L'\r': + *dst++ = L'\\'; + *dst++ = L'r'; + goto done; + case L'\b': + *dst++ = L'\\'; + *dst++ = L'b'; + goto done; + case L'\a': + *dst++ = L'\\'; + *dst++ = L'a'; + goto done; + case L'\v': + *dst++ = L'\\'; + *dst++ = L'v'; + goto done; + case L'\t': + *dst++ = L'\\'; + *dst++ = L't'; + goto done; + case L'\f': + *dst++ = L'\\'; + *dst++ = L'f'; + goto done; + case L' ': + *dst++ = L'\\'; + *dst++ = L's'; + goto done; + case L'\0': + *dst++ = L'\\'; + *dst++ = L'0'; + if (iswoctal(nextc)) { + *dst++ = L'0'; + *dst++ = L'0'; } goto done; } } - if (((c & 0177) == ' ') || isgraph(c) || (flag & VIS_OCTAL)) { - *dst++ = '\\'; - *dst++ = ((u_char)c >> 6 & 07) 
+ '0'; - *dst++ = ((u_char)c >> 3 & 07) + '0'; - *dst++ = ((u_char)c & 07) + '0'; + if (((c & 0177) == L' ') || (flag & VIS_OCTAL)) { + *dst++ = L'\\'; + *dst++ = ((u_char)c >> 6 & 07) + L'0'; + *dst++ = ((u_char)c >> 3 & 07) + L'0'; + *dst++ = ((u_char)c & 07) + L'0'; goto done; } if ((flag & VIS_NOSLASH) == 0) - *dst++ = '\\'; + *dst++ = L'\\'; if (c & 0200) { c &= 0177; - *dst++ = 'M'; + *dst++ = L'M'; } - if (iscntrl(c)) { - *dst++ = '^'; + if (iswcntrl(c)) { + *dst++ = L'^'; if (c == 0177) - *dst++ = '?'; + *dst++ = L'?'; else - *dst++ = c + '@'; + *dst++ = c + L'@'; } else { - *dst++ = '-'; + *dst++ = L'-'; *dst++ = c; } done: - *dst = '\0'; + *dst = L'\0'; return (dst); } /* + * vis - visually encode characters + */ +char * +vis(dst, c, flag, nextc) + char *dst; + int c, nextc; + int flag; +{ + /* + * Output may be up to 4 times the size of input plus + * 1 for the NUL. + */ + wchar_t res[5]; + + _vis(res, (wint_t) c, flag, (wint_t) nextc); + wcstombs(dst, res, wcslen(res)+sizeof(wchar_t)); + return (dst + strlen(dst)); +} + +/* * strvis, strvisx - visually encode characters from src into dst * * Dst must be 4 times the size of src to account for possible * expansion. The length of dst, not including the trailing NUL, * is returned. * - * Strvisx encodes exactly len bytes from src into dst. + * Strvisx encodes exactly len characters from src into dst. * This is useful for encoding a block of data. 
*/ int -strvis(dst, src, flag) - char *dst; - const char *src; +strvis(mbdst, mbsrc, flag) + char *mbdst; + const char *mbsrc; int flag; { - char c; - char *start; + wchar_t *dst, *src; + wchar_t *pdst, *psrc; + wchar_t c; + wchar_t *start; + + if ((psrc = (wchar_t *) calloc((strlen(mbsrc) + 1), + sizeof(wchar_t))) == NULL) + return -1; + if ((pdst = (wchar_t *) calloc(((4 * strlen(mbsrc)) + 1), + sizeof(wchar_t))) == NULL) { + free((void *) psrc); + return -1; + } + + dst = pdst; + src = psrc; + + mbstowcs(src, mbsrc, strlen(mbsrc) + 1); for (start = dst; (c = *src); ) - dst = vis(dst, c, flag, *++src); - *dst = '\0'; + dst = _vis(dst, c, flag, *++src); + + wcstombs(mbdst, start, dst - start + sizeof(wchar_t)); + + free((void *) pdst); + free((void *) psrc); + return (dst - start); } int -strvisx(dst, src, len, flag) - char *dst; - const char *src; - size_t len; +strvisx(mbdst, mbsrc, mblen, flag) + char *mbdst; + const char *mbsrc; + size_t mblen; int flag; { - int c; - char *start; + wchar_t *dst, *src; + wchar_t *pdst, *psrc; + wchar_t c; + wchar_t *start; + size_t len; + + if ((psrc = (wchar_t *) calloc((strlen(mbsrc) + 1), + sizeof(wchar_t))) == NULL) + return -1; + if ((pdst = (wchar_t *) calloc(((4 * strlen(mbsrc)) + 1), + sizeof(wchar_t))) == NULL) { + free((void *) psrc); + return -1; + } + + dst = pdst; + src = psrc; - for (start = dst; len > 1; len--) { + len = mbstowcs(src, mbsrc, strlen(mbsrc) + 1); + + if (len < mblen) + mblen = len; + + for (start = dst; mblen > 1; mblen--) { c = *src; - dst = vis(dst, c, flag, *++src); + dst = _vis(dst, c, flag, *++src); } - if (len) - dst = vis(dst, *src, flag, '\0'); - *dst = '\0'; + if (mblen) + dst = _vis(dst, *src, flag, L'\0'); + + wcstombs(mbdst, start, dst - start + sizeof(wchar_t)); + + free((void *) pdst); + free((void *) psrc); return (dst - start); } --- lib/libc/gen/vis.3.orig 2013-01-02 19:26:40.000000000 -0500 +++ lib/libc/gen/vis.3 2013-01-17 14:28:02.000000000 -0500 @@ -300,9 +300,6 @@ .Sh HISTORY 
These functions first appeared in .Bx 4.4 . -.Sh BUGS -The -.Nm -family of functions do not recognize multibyte characters, and thus -may consider them to be non-printable when they are in fact printable -(and vice versa.) +.Pp +The functions were augmented to add multibyte character support in +.Fx 9.1 . --- usr.bin/vis/vis.c.orig 2013-01-02 19:15:19.000000000 -0500 +++ usr.bin/vis/vis.c 2013-01-16 20:21:54.000000000 -0500 @@ -45,6 +45,7 @@ #include <locale.h> #include <stdio.h> #include <stdlib.h> +#include <wchar.h> #include <unistd.h> #include <vis.h> @@ -139,12 +140,12 @@ static int col = 0; static char dummy[] = "\0"; char *cp = dummy+1; /* so *(cp-1) starts out != '\n' */ - int c, rachar; + wint_t c, rachar; char buff[5]; - c = getc(fp); + c = getwc(fp); while (c != EOF) { - rachar = getc(fp); + rachar = getwc(fp); if (none) { cp = buff; *cp++ = c; @@ -159,7 +160,7 @@ *cp++ = '\n'; *cp = '\0'; } else - (void) vis(buff, (char)c, eflags, (char)rachar); + (void) vis(buff, c, eflags, rachar); cp = buff; if (fold) { --- usr.bin/vis/vis.1.orig 2013-01-02 19:15:19.000000000 -0500 +++ usr.bin/vis/vis.1 2013-01-17 14:34:16.000000000 -0500 @@ -128,11 +128,11 @@ .Nm command appeared in .Bx 4.4 . -.Sh BUGS -Due to limitations in the underlying +.Pp +The underlying .Xr vis 3 -function, the +function was augmented to add multibyte character support in +.Fx 9.1 +at which point the .Nm -utility -does not recognize multibyte characters, and thus may consider them to be -non-printable when they are in fact printable (and vice versa). +utility was also updated to be multibyte character aware. >Release-Note: >Audit-Trail: >Unformatted:
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201301182012.r0IKCUiE077301>