Date: Thu, 11 Apr 2002 10:30:25 +0800 (CST) From: Christopher Hall <hsw@acm.org> To: FreeBSD-gnats-submit@FreeBSD.org Subject: kern/36983: CD9660 unicode to utf-8 [hack] Message-ID: <200204110230.g3B2UPVA005049@x4.tucheng.generalresources.com>
next in thread | raw e-mail | index | archive | help
>Number: 36983 >Category: kern >Synopsis: CD9660 unicode to utf-8 [hack] >Confidential: no >Severity: non-critical >Priority: low >Responsible: freebsd-bugs >State: open >Quarter: >Keywords: >Date-Required: >Class: sw-bug >Submitter-Id: current-users >Arrival-Date: Wed Apr 10 19:40:01 PDT 2002 >Closed-Date: >Last-Modified: >Originator: Christopher Hall >Release: FreeBSD 4.5-STABLE i386 >Organization: >Environment: System: FreeBSD 4.5-STABLE #4: Wed Apr 10 18:00:12 CST 2002 root@:/usr/obj/usr/src/sys/GENERIC i386 >Description: The cd9660 filing system cannot handle most unicode characters in file/directory names (especially Chinese/Japanese). This patch was a quick fix so I could retrieve files from such a CDROM, but it might be useful for someone who needs to read this kind of CDROM. >How-To-Repeat: A CDROM burned by Nero Burning ROM (Chinese file names) mount -t cd9660 -o ro /dev/cd0a /cdrom ls -l /cdrom See all files as ????? (various numbers of '?'); it is only possible to access one file/directory from each set that has the same number of question marks. >Fix: Difficult since the isochar routine in sys/isofs/cd9660/cd9660_util.c assumes a 1:1 correspondence between the unicode characters and the native single-byte character set. Most unicode chars are converted to '?'. Here is a quick workaround for anyone who has a similar problem: The following hack adds a state machine to the isochar routine to fool the caller of isochar into getting multiple bytes per unicode char. The resulting bytes are utf-8 except for '?', '/', '\', '%', space, and control chars, which are each converted to '%' followed by two hex digits. 
Apply the patches in sys/isofs/cd9660/ to: cd9660_rrip.c cd9660_util.c iso.h --- cd9660_rrip.c.orig Sat Aug 28 08:46:06 1999 +++ cd9660_rrip.c Tue Apr 9 10:28:19 2002 @@ -508,7 +508,7 @@ pwhead = isodir->name + isonum_711(isodir->name_len); if (!(isonum_711(isodir->name_len)&1)) pwhead++; - isochar(isodir->name, pwhead, ana->imp->joliet_level, &c); + isochar(isodir->name, pwhead, ana->imp->joliet_level, &c, NULL); /* If it's not the '.' entry of the root dir obey SP field */ if (c != 0 || isonum_733(isodir->extent) != ana->imp->root_extent) @@ -645,7 +645,7 @@ *outlen = 0; isochar(isodir->name, isodir->name + isonum_711(isodir->name_len), - imp->joliet_level, &c); + imp->joliet_level, &c, NULL); tab = rrip_table_getname; if (c == 0 || c == 1) { cd9660_rrip_defname(isodir,&analyze); --- cd9660_util.c.orig Fri Mar 2 15:17:36 2001 +++ cd9660_util.c Wed Apr 10 17:54:12 2002 @@ -60,11 +60,12 @@ * Return number of bytes consumed */ int -isochar(isofn, isoend, joliet_level, c) +isochar(isofn, isoend, joliet_level, c, utf_state) u_char *isofn; u_char *isoend; int joliet_level; u_char *c; + int *utf_state; { *c = *isofn++; if (joliet_level == 0 || isofn == isoend) @@ -81,10 +82,58 @@ break; } /* XXX: if Unicode conversion routine is loaded then use it */ - if (cd9660_wchar2char != NULL) + if (cd9660_wchar2char != NULL) { *c = cd9660_wchar2char((*(isofn - 1) << 8) | *isofn); + *utf_state = 0; + } else if (utf_state != NULL) { /* XXX: convert to UTF-8 */ + static const u_char hex[16] = "0123456789abcdef"; + int unichar = (*(isofn - 1) << 8) | *isofn; + if (unichar == '?' 
|| unichar == '%' || unichar <= ' ' + || unichar == '\\' || unichar == '/') { + switch (*utf_state) { + case 0: + *c = '%'; + *utf_state = 1; + break; + case 1: + *c = (u_char)(hex[(unichar >> 4) & 0x0f]); + *utf_state = 2; + break; + default: + *c = (u_char)(hex[unichar & 0x0f]); + *utf_state = 0; + break; + } + } else if (unichar < 128) { + *c = (u_char)(unichar); + *utf_state = 0; + } else if ((unichar > 127) && (unichar < 2048)) { + if (*utf_state == 0) { + *c = (u_char)((unichar >> 6) | 192); + *utf_state = 1; + } else { + *c = (u_char)((unichar & 63) | 128); + *utf_state = 0; + } + } else { + switch (*utf_state) { + case 0: + *c = (u_char)((unichar >> 12) | 224); + *utf_state = 1; + break; + case 1: + *c = (u_char)(((unichar >> 6) & 63) | 128); + *utf_state = 2; + break; + default: + *c = (u_char)((unichar & 63) | 128); + *utf_state = 0; + break; + } + } + } - return 2; + return (utf_state == NULL || *utf_state == 0) ? 2 : 0; } /* @@ -101,12 +150,13 @@ int joliet_level; { int i, j; + int utf_state = 0; u_char c, *fnend = fn + fnlen, *isoend = isofn + isolen; for (; fn != fnend; fn++) { if (isofn == isoend) return *fn; - isofn += isochar(isofn, isoend, joliet_level, &c); + isofn += isochar(isofn, isoend, joliet_level, &c, &utf_state); if (c == ';') { if (*fn++ != ';') return fn[-1]; @@ -117,7 +167,7 @@ } for (j = 0; isofn != isoend; j = j * 10 + c - '0') isofn += isochar(isofn, isoend, - joliet_level, &c); + joliet_level, &c, &utf_state); return i - j; } if (c != *fn) { @@ -133,13 +183,13 @@ } } if (isofn != isoend) { - isofn += isochar(isofn, isoend, joliet_level, &c); + isofn += isochar(isofn, isoend, joliet_level, &c, &utf_state); switch (c) { default: return -c; case '.': if (isofn != isoend) { - isochar(isofn, isoend, joliet_level, &c); + isochar(isofn, isoend, joliet_level, &c, &utf_state); if (c == ';') return 0; } @@ -165,6 +215,7 @@ int joliet_level; { int fnidx = 0; + int utf_state = 0; u_char c, d = '\0', *infnend = infn + infnlen; if (assoc) { @@ 
-172,7 +223,7 @@ fnidx++; } for (; infn != infnend; fnidx++) { - infn += isochar(infn, infnend, joliet_level, &c); + infn += isochar(infn, infnend, joliet_level, &c, &utf_state); if (!original && !joliet_level && c >= 'A' && c <= 'Z') *outfn++ = c + ('a' - 'A'); --- iso.h.orig Wed Apr 10 10:30:10 2002 +++ iso.h Tue Apr 9 11:41:14 2002 @@ -266,7 +266,7 @@ extern vop_t **cd9660_specop_p; extern vop_t **cd9660_fifoop_p; -int isochar __P((u_char *, u_char *, int, u_char *)); +int isochar __P((u_char *, u_char *, int, u_char *, int *)); int isofncmp __P((u_char *, int, u_char *, int, int)); void isofntrans __P((u_char *, int, u_char *, u_short *, int, int, int)); ino_t isodirino __P((struct iso_directory_record *, struct iso_mnt *)); >Release-Note: >Audit-Trail: >Unformatted: To Unsubscribe: send mail to majordomo@FreeBSD.org with "unsubscribe freebsd-bugs" in the body of the message
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200204110230.g3B2UPVA005049>