Date: Fri, 25 Feb 2011 15:53:11 +0100 From: Jilles Tjoelker <jilles@stack.nl> To: freebsd-hackers@freebsd.org, freebsd-i18n@freebsd.org Subject: Basic UTF-8 support for sh(1) Message-ID: <20110225145311.GA4423@stack.nl>
next in thread | raw e-mail | index | archive | help
--ikeVEW9yuYc//A+q Content-Type: text/plain; charset=us-ascii Content-Disposition: inline Here is a patch that adds basic UTF-8 support to sh(1). This is enabled if the locale is set appropriately. Features: * ${#var} counts codepoints. (Really, it counts bytes with (b & 0xc0) != 0x80, i.e. bytes that are not UTF-8 continuation bytes.) * ?, [...] patterns match codepoints instead of bytes. They do not match invalid sequences. This is so that ${var#?} removes the first codepoint, not the first byte. However, * continues to match any string and an invalid sequence matches an identical invalid sequence. (This differs from fnmatch(3).) Internal: * CTL* bytes are moved to bytes that cannot occur in UTF-8 so that mbrtowc(3) can be used directly. The new locations do occur in iso-8859-* encodings. Limitations: * Only UTF-8 support is added, not support for any other multibyte encoding. I do not want to bloat up sh with mbrtowc(3) and similar everywhere. * Invalid sequences may not be handled as desired. It seems aborting on invalid UTF-8 sequences would break things, so they are let through. This also avoids bloating the code up with checking everywhere. * There is no special treatment for combining characters, so accented letters may match ? or ?? or even more, depending on normalization form. This matches other code in FreeBSD and is usually good enough because normalization forms that use as few codepoints as possible tend to be used. * IFS remains byte-based as in ksh93 (but unlike bash and zsh). * Our version of libedit does not support UTF-8 so sh will still be rather unpleasant to use interactively with characters not in us-ascii. Is this useful and worth the (small) bloat? A somewhat related feature is support for \uNNNN and \UNNNNNNNN sequences in $'...' (this will be added to POSIX, see http://austingroupbugs.net/view.php?id=249 ; I plan to add it to sh as well). 
Ideally, these are converted using iconv(3) but as long as it is not unconditionally available in base or if it is not supposed to be used, the codepoints can be encoded in UTF-8 for UTF-8 locales, leaving other locales with question marks. -- Jilles Tjoelker --ikeVEW9yuYc//A+q Content-Type: text/x-diff; charset=us-ascii Content-Disposition: attachment; filename="sh-utf8.patch" Index: parser.h =================================================================== --- parser.h (revision 218371) +++ parser.h (working copy) @@ -34,16 +34,16 @@ */ /* control characters in argument strings */ -#define CTLESC '\201' -#define CTLVAR '\202' -#define CTLENDVAR '\203' -#define CTLBACKQ '\204' +#define CTLESC '\300' +#define CTLVAR '\301' +#define CTLENDVAR '\371' +#define CTLBACKQ '\372' #define CTLQUOTE 01 /* ored with CTLBACKQ code if in quotes */ /* CTLBACKQ | CTLQUOTE == '\205' */ -#define CTLARI '\206' -#define CTLENDARI '\207' -#define CTLQUOTEMARK '\210' -#define CTLQUOTEEND '\211' /* only for ${v+-...} */ +#define CTLARI '\374' +#define CTLENDARI '\375' +#define CTLQUOTEMARK '\376' +#define CTLQUOTEEND '\377' /* only for ${v+-...} */ /* variable substitution byte (follows CTLVAR) */ #define VSTYPE 0x0f /* type of variable substitution */ Index: sh.1 =================================================================== --- sh.1 (revision 218467) +++ sh.1 (working copy) @@ -2510,4 +2510,7 @@ was originally written by .Sh BUGS The .Nm -utility does not recognize multibyte characters. +utility does not recognize multibyte characters other than UTF-8. +The line editing library +.Xr editline 3 +does not recognize multibyte characters. Index: expand.c =================================================================== --- expand.c (revision 218371) +++ expand.c (working copy) @@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$"); #include <stdlib.h> #include <string.h> #include <unistd.h> +#include <wchar.h> /* * Routines to expand arguments to commands. 
We have to deal with @@ -111,16 +112,16 @@ static void addfname(char *); static struct strlist *expsort(struct strlist *); static struct strlist *msort(struct strlist *, int); static char *cvtnum(int, char *); -static int collate_range_cmp(int, int); +static int collate_range_cmp(wchar_t, wchar_t); static int -collate_range_cmp(int c1, int c2) +collate_range_cmp(wchar_t c1, wchar_t c2) { - static char s1[2], s2[2]; + static wchar_t s1[2], s2[2]; s1[0] = c1; s2[0] = c2; - return (strcoll(s1, s2)); + return (wcscoll(s1, s2)); } /* @@ -665,6 +666,7 @@ evalvar(char *p, int flag) int special; int startloc; int varlen; + int varlenb; int easy; int quotes = flag & (EXP_FULL | EXP_CASE | EXP_REDIR); @@ -712,8 +714,15 @@ again: /* jump here after setting a variable with if (special) { varvalue(var, varflags & VSQUOTE, subtype, flag); if (subtype == VSLENGTH) { - varlen = expdest - stackblock() - startloc; - STADJUST(-varlen, expdest); + varlenb = expdest - stackblock() - startloc; + varlen = varlenb; + if (localeisutf8) { + val = stackblock() + startloc; + for (;val != expdest; val++) + if ((*val & 0xC0) == 0x80) + varlen--; + } + STADJUST(-varlenb, expdest); } } else { char const *syntax = (varflags & VSQUOTE) ? DQSYNTAX @@ -721,7 +730,9 @@ again: /* jump here after setting a variable with if (subtype == VSLENGTH) { for (;*val; val++) - varlen++; + if (!localeisutf8 || + (*val & 0xC0) != 0x80) + varlen++; } else { if (quotes) @@ -1367,6 +1378,23 @@ msort(struct strlist *list, int len) +static wchar_t +get_wc(const char **p) +{ + wchar_t c; + int chrlen; + + chrlen = mbtowc(&c, *p, 4); + if (chrlen == 0) + return 0; + else if (chrlen == -1) + c = *(*p)++; + else + *p += chrlen; + return c; +} + + /* * Returns true if the pattern matches the string. 
*/ @@ -1376,6 +1404,7 @@ patmatch(const char *pattern, const char *string, { const char *p, *q; char c; + wchar_t wc, wc2; p = pattern; q = string; @@ -1394,7 +1423,11 @@ patmatch(const char *pattern, const char *string, case '?': if (squoted && *q == CTLESC) q++; - if (*q++ == '\0') + if (localeisutf8) + wc = get_wc(&q); + else + wc = *q++; + if (wc == '\0') return 0; break; case '*': @@ -1424,7 +1457,7 @@ patmatch(const char *pattern, const char *string, case '[': { const char *endp; int invert, found; - char chr; + wchar_t chr; endp = p; if (*endp == '!' || *endp == '^') @@ -1445,8 +1478,11 @@ patmatch(const char *pattern, const char *string, p++; } found = 0; - chr = *q++; - if (squoted && chr == CTLESC) + if (squoted && *q == CTLESC) + q++; + if (localeisutf8) + chr = get_wc(&q); + else chr = *q++; if (chr == '\0') return 0; @@ -1456,19 +1492,27 @@ patmatch(const char *pattern, const char *string, continue; if (c == CTLESC) c = *p++; + if (localeisutf8 && c & 0x80) { + p--; + wc = get_wc(&p); + } else + wc = c; if (*p == '-' && p[1] != ']') { p++; while (*p == CTLQUOTEMARK) p++; if (*p == CTLESC) p++; - if ( collate_range_cmp(chr, c) >= 0 - && collate_range_cmp(chr, *p) <= 0 + if (localeisutf8) + wc2 = get_wc(&p); + else + wc2 = *p++; + if ( collate_range_cmp(chr, wc) >= 0 + && collate_range_cmp(chr, wc2) <= 0 ) found = 1; - p++; } else { - if (chr == c) + if (chr == wc) found = 1; } } while ((c = *p++) != ']'); Index: main.c =================================================================== --- main.c (revision 218371) +++ main.c (working copy) @@ -76,6 +76,7 @@ __FBSDID("$FreeBSD$"); int rootpid; int rootshell; struct jmploc main_handler; +int localeisutf8; static void read_profile(const char *); static char *find_dot_file(char *); @@ -96,6 +97,7 @@ main(int argc, char *argv[]) char *shinit; (void) setlocale(LC_ALL, ""); + updatecharset(); state = 0; if (setjmp(main_handler.loc)) { switch (exception) { Index: var.c 
=================================================================== --- var.c (revision 218371) +++ var.c (working copy) @@ -47,6 +47,7 @@ __FBSDID("$FreeBSD$"); */ #include <locale.h> +#include <langinfo.h> #include "shell.h" #include "output.h" @@ -361,6 +362,7 @@ setvareq(char *s, int flags) if ((vp->flags & VEXPORT) && localevar(s)) { change_env(s, 1); (void) setlocale(LC_ALL, ""); + updatecharset(); } INTON; return; @@ -379,6 +381,7 @@ setvareq(char *s, int flags) if ((vp->flags & VEXPORT) && localevar(s)) { change_env(s, 1); (void) setlocale(LC_ALL, ""); + updatecharset(); } INTON; } @@ -480,6 +483,7 @@ bltinsetlocale(void) if (loc != NULL) { setlocale(LC_ALL, loc); INTON; + updatecharset(); return; } locdef = bltinlookup("LANG", 0); @@ -491,6 +495,7 @@ bltinsetlocale(void) setlocale(locale_categories[i], loc); } INTON; + updatecharset(); } /* @@ -505,13 +510,25 @@ bltinunsetlocale(void) for (lp = cmdenviron ; lp ; lp = lp->next) { if (localevar(lp->text)) { setlocale(LC_ALL, ""); + updatecharset(); return; } } INTON; } +/* + * Update the localeisutf8 flag. + */ +void +updatecharset(void) +{ + char *charset; + charset = nl_langinfo(CODESET); + localeisutf8 = !strcmp(charset, "UTF-8"); +} + /* * Generate a list of exported variables. This routine is used to construct * the third argument to execve when executing a program. 
@@ -656,6 +673,7 @@ exportcmd(int argc, char **argv) if ((vp->flags & VEXPORT) && localevar(vp->text)) { change_env(vp->text, 1); (void) setlocale(LC_ALL, ""); + updatecharset(); } goto found; } @@ -850,6 +868,7 @@ unsetvar(const char *s) if ((vp->flags & VEXPORT) && localevar(vp->text)) { change_env(s, 0); setlocale(LC_ALL, ""); + updatecharset(); } vp->flags &= ~VEXPORT; vp->flags |= VUNSET; Index: var.h =================================================================== --- var.h (revision 218371) +++ var.h (working copy) @@ -81,6 +81,8 @@ extern struct var vhistsize; extern struct var vterm; #endif +extern int localeisutf8; + /* * The following macros access the values of the above variables. * They have to skip over the name. They return the null string @@ -112,6 +114,7 @@ char *lookupvar(const char *); char *bltinlookup(const char *, int); void bltinsetlocale(void); void bltinunsetlocale(void); +void updatecharset(void); char **environment(void); int showvarscmd(int, char **); int exportcmd(int, char **); --ikeVEW9yuYc//A+q--
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20110225145311.GA4423>