Date: Fri, 25 Feb 2011 15:53:11 +0100
From: Jilles Tjoelker <jilles@stack.nl>
To: freebsd-hackers@freebsd.org, freebsd-i18n@freebsd.org
Subject: Basic UTF-8 support for sh(1)
Message-ID: <20110225145311.GA4423@stack.nl>
index | next in thread | raw e-mail
[-- Attachment #1 --]
Here is a patch that adds basic UTF-8 support to sh(1). This is enabled
if the locale is set appropriately.
Features:
* ${#var} counts codepoints. (Really, it counts bytes that are not UTF-8
continuation bytes, i.e. bytes with (b & 0xc0) != 0x80.)
* ?, [...] patterns match codepoints instead of bytes. They do not match
invalid sequences. This is so that ${var#?} removes the first
codepoint, not the first byte. However, * continues to match any
string and an invalid sequence matches an identical invalid sequence.
(This differs from fnmatch(3).)
Internal:
* CTL* bytes are moved to bytes that cannot occur in UTF-8 so that
mbrtowc(3) can be used directly. The new locations do occur in
iso-8859-* encodings.
Limitations:
* Only UTF-8 support is added, not any other multibyte encodings. I do
not want to bloat up sh with mbrtowc(3) and similar everywhere.
* Invalid sequences may not be handled as desired. It seems aborting on
invalid UTF-8 sequences would break things, so they are let through.
This also avoids bloating the code up with checking everywhere.
* There is no special treatment for combining characters: an accented
letter may match ? or ?? or even more, depending on the normalization
form. This matches other code in FreeBSD and is usually good enough
because normalization forms that use as few codepoints as possible
tend to be used.
* IFS remains byte-based as in ksh93 (but unlike bash and zsh).
* Our version of libedit does not support UTF-8, so sh will still be
rather unpleasant to use interactively with characters outside
US-ASCII.
Is this useful and worth the (small) bloat?
A somewhat related feature is support for \uNNNN and \UNNNNNNNN
sequences in $'...'. This will be added to POSIX (see
http://austingroupbugs.net/view.php?id=249), and I plan to add it to sh.
Ideally, these sequences would be converted using iconv(3), but as long
as iconv is not unconditionally available in base, or is not supposed to
be used there, the codepoints can be encoded as UTF-8 in UTF-8 locales,
leaving other locales with question marks.
--
Jilles Tjoelker
[-- Attachment #2 --]
Index: parser.h
===================================================================
--- parser.h (revision 218371)
+++ parser.h (working copy)
@@ -34,16 +34,16 @@
*/
/* control characters in argument strings */
-#define CTLESC '\201'
-#define CTLVAR '\202'
-#define CTLENDVAR '\203'
-#define CTLBACKQ '\204'
+#define CTLESC '\300'
+#define CTLVAR '\301'
+#define CTLENDVAR '\371'
+#define CTLBACKQ '\372'
#define CTLQUOTE 01 /* ored with CTLBACKQ code if in quotes */
/* CTLBACKQ | CTLQUOTE == '\205' */
-#define CTLARI '\206'
-#define CTLENDARI '\207'
-#define CTLQUOTEMARK '\210'
-#define CTLQUOTEEND '\211' /* only for ${v+-...} */
+#define CTLARI '\374'
+#define CTLENDARI '\375'
+#define CTLQUOTEMARK '\376'
+#define CTLQUOTEEND '\377' /* only for ${v+-...} */
/* variable substitution byte (follows CTLVAR) */
#define VSTYPE 0x0f /* type of variable substitution */
Index: sh.1
===================================================================
--- sh.1 (revision 218467)
+++ sh.1 (working copy)
@@ -2510,4 +2510,7 @@ was originally written by
.Sh BUGS
The
.Nm
-utility does not recognize multibyte characters.
+utility does not recognize multibyte characters other than UTF-8.
+The line editing library
+.Xr editline 3
+does not recognize multibyte characters.
Index: expand.c
===================================================================
--- expand.c (revision 218371)
+++ expand.c (working copy)
@@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$");
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
+#include <wchar.h>
/*
* Routines to expand arguments to commands. We have to deal with
@@ -111,16 +112,16 @@ static void addfname(char *);
static struct strlist *expsort(struct strlist *);
static struct strlist *msort(struct strlist *, int);
static char *cvtnum(int, char *);
-static int collate_range_cmp(int, int);
+static int collate_range_cmp(wchar_t, wchar_t);
static int
-collate_range_cmp(int c1, int c2)
+collate_range_cmp(wchar_t c1, wchar_t c2)
{
- static char s1[2], s2[2];
+ static wchar_t s1[2], s2[2];
s1[0] = c1;
s2[0] = c2;
- return (strcoll(s1, s2));
+ return (wcscoll(s1, s2));
}
/*
@@ -665,6 +666,7 @@ evalvar(char *p, int flag)
int special;
int startloc;
int varlen;
+ int varlenb;
int easy;
int quotes = flag & (EXP_FULL | EXP_CASE | EXP_REDIR);
@@ -712,8 +714,15 @@ again: /* jump here after setting a variable with
if (special) {
varvalue(var, varflags & VSQUOTE, subtype, flag);
if (subtype == VSLENGTH) {
- varlen = expdest - stackblock() - startloc;
- STADJUST(-varlen, expdest);
+ varlenb = expdest - stackblock() - startloc;
+ varlen = varlenb;
+ if (localeisutf8) {
+ val = stackblock() + startloc;
+ for (;val != expdest; val++)
+ if ((*val & 0xC0) == 0x80)
+ varlen--;
+ }
+ STADJUST(-varlenb, expdest);
}
} else {
char const *syntax = (varflags & VSQUOTE) ? DQSYNTAX
@@ -721,7 +730,9 @@ again: /* jump here after setting a variable with
if (subtype == VSLENGTH) {
for (;*val; val++)
- varlen++;
+ if (!localeisutf8 ||
+ (*val & 0xC0) != 0x80)
+ varlen++;
}
else {
if (quotes)
@@ -1367,6 +1378,23 @@ msort(struct strlist *list, int len)
+static wchar_t
+get_wc(const char **p)
+{
+ wchar_t c;
+ int chrlen;
+
+ chrlen = mbtowc(&c, *p, 4);
+ if (chrlen == 0)
+ return 0;
+ else if (chrlen == -1)
+ c = *(*p)++;
+ else
+ *p += chrlen;
+ return c;
+}
+
+
/*
* Returns true if the pattern matches the string.
*/
@@ -1376,6 +1404,7 @@ patmatch(const char *pattern, const char *string,
{
const char *p, *q;
char c;
+ wchar_t wc, wc2;
p = pattern;
q = string;
@@ -1394,7 +1423,11 @@ patmatch(const char *pattern, const char *string,
case '?':
if (squoted && *q == CTLESC)
q++;
- if (*q++ == '\0')
+ if (localeisutf8)
+ wc = get_wc(&q);
+ else
+ wc = *q++;
+ if (wc == '\0')
return 0;
break;
case '*':
@@ -1424,7 +1457,7 @@ patmatch(const char *pattern, const char *string,
case '[': {
const char *endp;
int invert, found;
- char chr;
+ wchar_t chr;
endp = p;
if (*endp == '!' || *endp == '^')
@@ -1445,8 +1478,11 @@ patmatch(const char *pattern, const char *string,
p++;
}
found = 0;
- chr = *q++;
- if (squoted && chr == CTLESC)
+ if (squoted && *q == CTLESC)
+ q++;
+ if (localeisutf8)
+ chr = get_wc(&q);
+ else
chr = *q++;
if (chr == '\0')
return 0;
@@ -1456,19 +1492,27 @@ patmatch(const char *pattern, const char *string,
continue;
if (c == CTLESC)
c = *p++;
+ if (localeisutf8 && c & 0x80) {
+ p--;
+ wc = get_wc(&p);
+ } else
+ wc = c;
if (*p == '-' && p[1] != ']') {
p++;
while (*p == CTLQUOTEMARK)
p++;
if (*p == CTLESC)
p++;
- if ( collate_range_cmp(chr, c) >= 0
- && collate_range_cmp(chr, *p) <= 0
+ if (localeisutf8)
+ wc2 = get_wc(&p);
+ else
+ wc2 = *p++;
+ if ( collate_range_cmp(chr, wc) >= 0
+ && collate_range_cmp(chr, wc2) <= 0
)
found = 1;
- p++;
} else {
- if (chr == c)
+ if (chr == wc)
found = 1;
}
} while ((c = *p++) != ']');
Index: main.c
===================================================================
--- main.c (revision 218371)
+++ main.c (working copy)
@@ -76,6 +76,7 @@ __FBSDID("$FreeBSD$");
int rootpid;
int rootshell;
struct jmploc main_handler;
+int localeisutf8;
static void read_profile(const char *);
static char *find_dot_file(char *);
@@ -96,6 +97,7 @@ main(int argc, char *argv[])
char *shinit;
(void) setlocale(LC_ALL, "");
+ updatecharset();
state = 0;
if (setjmp(main_handler.loc)) {
switch (exception) {
Index: var.c
===================================================================
--- var.c (revision 218371)
+++ var.c (working copy)
@@ -47,6 +47,7 @@ __FBSDID("$FreeBSD$");
*/
#include <locale.h>
+#include <langinfo.h>
#include "shell.h"
#include "output.h"
@@ -361,6 +362,7 @@ setvareq(char *s, int flags)
if ((vp->flags & VEXPORT) && localevar(s)) {
change_env(s, 1);
(void) setlocale(LC_ALL, "");
+ updatecharset();
}
INTON;
return;
@@ -379,6 +381,7 @@ setvareq(char *s, int flags)
if ((vp->flags & VEXPORT) && localevar(s)) {
change_env(s, 1);
(void) setlocale(LC_ALL, "");
+ updatecharset();
}
INTON;
}
@@ -480,6 +483,7 @@ bltinsetlocale(void)
if (loc != NULL) {
setlocale(LC_ALL, loc);
INTON;
+ updatecharset();
return;
}
locdef = bltinlookup("LANG", 0);
@@ -491,6 +495,7 @@ bltinsetlocale(void)
setlocale(locale_categories[i], loc);
}
INTON;
+ updatecharset();
}
/*
@@ -505,13 +510,25 @@ bltinunsetlocale(void)
for (lp = cmdenviron ; lp ; lp = lp->next) {
if (localevar(lp->text)) {
setlocale(LC_ALL, "");
+ updatecharset();
return;
}
}
INTON;
}
+/*
+ * Update the localeisutf8 flag.
+ */
+void
+updatecharset(void)
+{
+ char *charset;
+ charset = nl_langinfo(CODESET);
+ localeisutf8 = !strcmp(charset, "UTF-8");
+}
+
/*
* Generate a list of exported variables. This routine is used to construct
* the third argument to execve when executing a program.
@@ -656,6 +673,7 @@ exportcmd(int argc, char **argv)
if ((vp->flags & VEXPORT) && localevar(vp->text)) {
change_env(vp->text, 1);
(void) setlocale(LC_ALL, "");
+ updatecharset();
}
goto found;
}
@@ -850,6 +868,7 @@ unsetvar(const char *s)
if ((vp->flags & VEXPORT) && localevar(vp->text)) {
change_env(s, 0);
setlocale(LC_ALL, "");
+ updatecharset();
}
vp->flags &= ~VEXPORT;
vp->flags |= VUNSET;
Index: var.h
===================================================================
--- var.h (revision 218371)
+++ var.h (working copy)
@@ -81,6 +81,8 @@ extern struct var vhistsize;
extern struct var vterm;
#endif
+extern int localeisutf8;
+
/*
* The following macros access the values of the above variables.
* They have to skip over the name. They return the null string
@@ -112,6 +114,7 @@ char *lookupvar(const char *);
char *bltinlookup(const char *, int);
void bltinsetlocale(void);
void bltinunsetlocale(void);
+void updatecharset(void);
char **environment(void);
int showvarscmd(int, char **);
int exportcmd(int, char **);
help
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20110225145311.GA4423>
