Date: Tue, 17 Nov 2009 07:21:28 +0000 (UTC) From: Edwin Groothuis <edwin@FreeBSD.org> To: src-committers@freebsd.org, svn-src-user@freebsd.org Subject: svn commit: r199350 - user/edwin/locale/usr.bin/unicodename2utf8 Message-ID: <200911170721.nAH7LS5t094878@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: edwin Date: Tue Nov 17 07:21:27 2009 New Revision: 199350 URL: http://svn.freebsd.org/changeset/base/199350 Log: Add the utf82unicode feature as requested by gabor@ Modified: user/edwin/locale/usr.bin/unicodename2utf8/Makefile user/edwin/locale/usr.bin/unicodename2utf8/unicodename2utf8.1 user/edwin/locale/usr.bin/unicodename2utf8/unicodename2utf8.c Modified: user/edwin/locale/usr.bin/unicodename2utf8/Makefile ============================================================================== --- user/edwin/locale/usr.bin/unicodename2utf8/Makefile Tue Nov 17 07:06:41 2009 (r199349) +++ user/edwin/locale/usr.bin/unicodename2utf8/Makefile Tue Nov 17 07:21:27 2009 (r199350) @@ -3,8 +3,11 @@ PROG= unicodename2utf8 SRCS= unicodename2utf8.c -NO_MAN= yes WARNS?= 6 +MAN= unicodename2utf8.1 + +LINKS= ${BINDIR}/unicodename2utf8 ${BINDIR}/utf82unicodename +MLINKS= unicodename2utf8.1 utf82unicodename.1 test: ./unicodename2utf8 \ Modified: user/edwin/locale/usr.bin/unicodename2utf8/unicodename2utf8.1 ============================================================================== --- user/edwin/locale/usr.bin/unicodename2utf8/unicodename2utf8.1 Tue Nov 17 07:06:41 2009 (r199349) +++ user/edwin/locale/usr.bin/unicodename2utf8/unicodename2utf8.1 Tue Nov 17 07:21:27 2009 (r199350) @@ -28,22 +28,31 @@ .Dt unicode2utf8 1 .Os .Sh NAME -.Nm unicode2utf8 -.Nd converts a file with Unicode name definitions into UTF-8 character -definitions. +.Nm unicodename2utf8 , +.Nm utf82unicodename +.Nd convert a file with Unicode name definitions into UTF-8 character and +vice-versa. 
.Sh SYNOPSIS .Nm +.Fl -cldr Ar directory +.Op Fl -input Ar filename +.Op Fl -output Ar filename +.Op Fl -reverse +.Nm utf82unicodename .Fl -cldr Ar directory -.Fl -input Ar filename -.Fl -output Ar filename +.Op Fl -input Ar filename +.Op Fl -output Ar filename +.Op Fl -reverse .Sh DESCRIPTION The .Nm -utility is made to convert the Unicode encoded strings in the -contents of the specified input file with the corresponding UTF-8 -character definitions. -.Pp -Lines starting with a # are copied as-is. +utility converts the Unicode encoded strings in the contents of the +specified input file with the corresponding UTF-8 character +definitions. +The +.Nm utf82unicodename +utility converts the UTF-8 encoded strings in the contents of the +specified input file with the corresponding Unicode names. .Pp The Unicode encoded strings are specified between a '<' and a '>' sign. @@ -66,15 +75,16 @@ By default this should point to but for maintainers of the FreeBSD locale database this could point to their own extracted copy of the CLDR database. .It Fl -input Ar filename -The source file with the Unicode encoded strings. +The source file. +If not specified, stdin will be used. .It Fl -output Ar filename -The destination file with the Unicode encoded strings replaced with -their UTF-8 equivalents. +The output file. +If not specified, stdout will be used. +.It Fl -reverse +If specified, do the reverse conversions. .El .Sh EXIT STATUS -The -.Nm -utility exits 0 on success, and >0 if an error occurs. +The utilities exit with 0 on success, and >0 if an error occurs. .Sh SEE ALSO .Xr iconv 1 , .Xr bsdiconv 1 @@ -85,7 +95,5 @@ the maintainers of the file .Pa /usr/share/misc/UTF-8.cm .El .Sh AUTHORS -The -.Nm -utility and this manual page were written by +The utilities and this manual page were written by .An Edwin Groothuis Aq edwin@FreeBSD.org . 
Modified: user/edwin/locale/usr.bin/unicodename2utf8/unicodename2utf8.c ============================================================================== --- user/edwin/locale/usr.bin/unicodename2utf8/unicodename2utf8.c Tue Nov 17 07:06:41 2009 (r199349) +++ user/edwin/locale/usr.bin/unicodename2utf8/unicodename2utf8.c Tue Nov 17 07:21:27 2009 (r199350) @@ -37,21 +37,25 @@ __FBSDID("$FreeBSD$"); #include <string.h> #include <sysexits.h> -#define MAXBUF 512 +#define MAXBUF 4098 struct utf8map { - char *uniname; + char *unicodename; char *utf8char; int utf8len; - struct utf8map *next; + struct utf8map *next_utf8; + struct utf8map *next_unicodename; }; -struct utf8map *utf8map_head[256]; +struct utf8map *head_utf8[256]; +struct utf8map *head_unicodename[256]; void usage(void); -struct utf8map *get_utf8map(char *dir); -struct utf8map *find_utf8map(char *unidata); -void translate(char *file_in, char *file_out); +struct utf8map *get_mappings(char *dir); +struct utf8map *find_utf8map(char *unicodename); +struct utf8map *find_unicodemap(char *utf8data, int len); +void translate_into_utf8(char *file_in, char *file_out); +void translate_into_unicodename(char *file_in, char *file_out); int debug = 0; @@ -60,12 +64,17 @@ main(int argc, char *argv[]) { char *cldr = NULL, *file_in = NULL, *file_out = NULL; char ch; + int reverse = 0; + + if (strcmp(argv[0], "utf82unicode") == 0) + reverse = 1; static struct option longopts[] = { { "cldr", required_argument, NULL, 1 }, + { "debug", no_argument, NULL, 4 }, { "input", required_argument, NULL, 2 }, { "output", required_argument, NULL, 3 }, - { "debug", no_argument, NULL, 4 }, + { "reverse", no_argument, NULL, 5 }, { NULL, 0, NULL, 0 } }; @@ -83,6 +92,9 @@ main(int argc, char *argv[]) case 4: debug++; break; + case 5: + reverse = !reverse; + break; default: usage(); } @@ -90,32 +102,110 @@ main(int argc, char *argv[]) argc -= optind; argv += optind; - if (cldr == NULL || file_in == NULL || file_out == NULL) + if (cldr == NULL) usage(); 
- get_utf8map(cldr); - translate(file_in, file_out); + get_mappings(cldr); + if (!reverse) + translate_into_utf8(file_in, file_out); + else + translate_into_unicodename(file_in, file_out); +} + +void +translate_into_unicodename(char *file_in, char *file_out) +{ + struct utf8map *map; + FILE *fin, *fout; + unsigned char *p, line[MAXBUF]; + int len; + + if (file_in == NULL) + fin = stdin; + else + if ((fin = fopen(file_in, "r")) == NULL) + errx(EX_DATAERR, "Cannot open %s for reading.", + file_in); + if (file_out == NULL) + fout = stdout; + else + if ((fout = fopen(file_out, "w")) == NULL) + errx(EX_DATAERR, "Cannot open %s for writing.", + file_out); + + fprintf(fout, +"#\n" +"# Do not edit this file, it is created automatically by the utf82unicodename\n" +"# utility. All changes to this file will be lost.\n" +"# The source of this file was %s\n" +"#\n", + file_in == NULL ? "read from stdin" : file_in); + + while (!feof(fin)) { + if (fgets(line, sizeof(line), fin) != NULL) { + if (line[0] == '#') { + fprintf(fout, "%s", line); + continue; + } + + p = line; + while (*p != '\0') { + if (*p == 0x0a) { + fwrite("\n", 1, 1, fout); + p++; + continue; + } + if ((*p > 0x7F && *p < 0xC2) + || (*p > 0xDF && *p < 0xE0) + || (*p > 0xEF)) + errx(EX_DATAERR, + "Invalid UTF-8 character '%c'", + *p); + + len = *p <= 0x7F ? 1 : *p <= 0xDF ? 
2 : 3; + if ((map = find_unicodemap(p, len)) == NULL) { + errx(EX_DATAERR, + "Cannot find translation for '%s'", + p + 1); + } + fprintf(fout, "<%s>", map->unicodename); + p += len; + } + + } + } + + fclose(fin); + fclose(fout); } void -translate(char *file_in, char *file_out) +translate_into_utf8(char *file_in, char *file_out) { struct utf8map *map; FILE *fin, *fout; char *p, *q1, *q2, line[MAXBUF]; - if ((fin = fopen(file_in, "r")) == NULL) - errx(EX_DATAERR, "Cannot open %s for reading.", file_in); - if ((fout = fopen(file_out, "w")) == NULL) - errx(EX_DATAERR, "Cannot open %s for writing.", file_out); + if (file_in == NULL) + fin = stdin; + else + if ((fin = fopen(file_in, "r")) == NULL) + errx(EX_DATAERR, "Cannot open %s for reading.", + file_in); + if (file_out == NULL) + fout = stdout; + else + if ((fout = fopen(file_out, "w")) == NULL) + errx(EX_DATAERR, "Cannot open %s for writing.", + file_out); fprintf(fout, "#\n" -"# Do not edit this file, it is created automatically by the unicode2utf8\n" +"# Do not edit this file, it is created automatically by the unicodename2utf8\n" "# utility. All changes to this file will be lost.\n" "# The source of this file was %s\n" "#\n", - file_in); + file_in == NULL ? 
"read from stdin" : file_in); while (!feof(fin)) { if (fgets(line, sizeof(line), fin) != NULL) { @@ -156,29 +246,47 @@ translate(char *file_in, char *file_out) } struct utf8map * +find_unicodemap(char *utf8, int len) +{ + struct utf8map *p; + int hashindex = utf8[len - 1]; + + p = head_utf8[hashindex]; + while (p != NULL) { + if (debug) + printf("'%s' - '%s'\n", p->utf8char, utf8); + if (strncmp(p->utf8char, utf8, len) == 0) + return p; + p = p->next_utf8; + } + + return NULL; +} + +struct utf8map * find_utf8map(char *uniname) { struct utf8map *p; int hashindex = uniname[strlen(uniname) - 1]; - p = utf8map_head[hashindex]; + p = head_unicodename[hashindex]; while (p != NULL) { - if (strcmp(p->uniname, uniname) == 0) - return p; if (debug) - printf("'%s' - '%s'\n", p->uniname, uniname); - p = p->next; + printf("'%s' - '%s'\n", p->unicodename, uniname); + if (strcmp(p->unicodename, uniname) == 0) + return p; + p = p->next_unicodename; } return NULL; } struct utf8map * -get_utf8map(char *dir) +get_mappings(char *dir) { - struct utf8map *new; + struct utf8map *new, *prev = NULL; FILE *fin; - int len, i, hashindex; + int len, i, hashindex_utf8, hashindex_unicodename; char filename[MAXPATHLEN], uniname[MAXBUF], utf8char[MAXBUF], *p; sprintf(filename, "%s/posix/UTF-8.cm", dir); @@ -207,7 +315,10 @@ get_utf8map(char *dir) if ((p = strchr(uniname, '>')) == NULL) errx(EX_DATAERR, "No trailing '>' for %s", uniname); - hashindex = p[-1]; + + /* Use the last character in the for hashing */ + hashindex_unicodename = p[-1]; + *p = '\0'; if (uniname[0] != '<') errx(EX_DATAERR, "No leading '<' for %s", @@ -236,15 +347,30 @@ get_utf8map(char *dir) utf8char[len] = 0; } + /* use the last character in the utf8data for hashing */ + hashindex_utf8 = utf8char[len - 1]; + if (debug) printf("-%s-%s-\n", uniname, utf8char); new = (struct utf8map *)malloc(sizeof(struct utf8map)); - new->next = utf8map_head[hashindex]; - new->uniname = strdup(uniname + 1); + new->next_utf8 = 
head_utf8[hashindex_utf8]; + new->next_unicodename = + head_unicodename[hashindex_unicodename]; + new->unicodename = strdup(uniname + 1); new->utf8char = strdup(utf8char); new->utf8len = len; - utf8map_head[hashindex] = new; + head_unicodename[hashindex_unicodename] = new; + + /* + * If the previous UTF-8 character has the same name as + * this one, then don't put it in the hash_utf8 array. + * For example: <DIGIT ONE> and <one> + */ + if (prev == NULL + || strncmp(prev->utf8char, utf8char, len) != 0) + head_utf8[hashindex_utf8] = new; + prev = new; } } @@ -259,7 +385,10 @@ void usage(void) { - printf("Usage: unicode2utf8 --cldr=dir --input=file --output=file\n"); + printf( +"Usage: unicodename2utf8 --cldr=dir [--input=file] [--output=file] [--reverse]\n" +"Usage: utf82unicodename --cldr=dir [--input=file] [--output=file] [--reverse]\n" +); exit(EX_USAGE); }
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200911170721.nAH7LS5t094878>