Date: Thu, 11 Apr 2002 10:30:25 +0800 (CST) From: Christopher Hall <hsw@acm.org> To: FreeBSD-gnats-submit@FreeBSD.org Subject: kern/36983: CD9660 unicode to utf-8 [hack] Message-ID: <200204110230.g3B2UPVA005049@x4.tucheng.generalresources.com>
next in thread | raw e-mail | index | archive | help
>Number: 36983
>Category: kern
>Synopsis: CD9660 unicode to utf-8 [hack]
>Confidential: no
>Severity: non-critical
>Priority: low
>Responsible: freebsd-bugs
>State: open
>Quarter:
>Keywords:
>Date-Required:
>Class: sw-bug
>Submitter-Id: current-users
>Arrival-Date: Wed Apr 10 19:40:01 PDT 2002
>Closed-Date:
>Last-Modified:
>Originator: Christopher Hall
>Release: FreeBSD 4.5-STABLE i386
>Organization:
>Environment:
System: FreeBSD 4.5-STABLE #4: Wed Apr 10 18:00:12 CST 2002 root@:/usr/obj/usr/src/sys/GENERIC i386
>Description:
The cd9660 file system cannot handle most Unicode characters
in file/directory names (especially Chinese/Japanese).
This patch was a quick fix so that I could retrieve files from
such a CDROM, but it might be useful for someone else who needs to
read this kind of CDROM.
>How-To-Repeat:
A CDROM burned by Nero Burning ROM (Chinese file names)
mount -t cd9660 -o ro /dev/cd0a /cdrom
ls -l /cdrom
All files are listed as ????? (with varying numbers of '?').
It is only possible to access one file/directory from each set
of names that has the same number of question marks.
>Fix:
This is difficult to fix properly, since the isochar routine in
sys/isofs/cd9660/cd9660_util.c assumes a 1:1 correspondence between
Unicode characters and the native single-byte character set. Most
Unicode chars are converted to '?'.
Here is a quick workaround for anyone who has a similar problem:
The following hack adds a state machine to the isochar routine
to fool the caller of isochar into getting multiple bytes per
unicode char.
The resulting bytes are UTF-8, except for '?' '/' '\' '%' space
and control chars, which are converted to '%' followed by two hex digits.
Apply the patches in sys/isofs/cd9660/ to:
cd9660_rrip.c
cd9660_util.c
iso.h
--- cd9660_rrip.c.orig Sat Aug 28 08:46:06 1999
+++ cd9660_rrip.c Tue Apr 9 10:28:19 2002
@@ -508,7 +508,7 @@
pwhead = isodir->name + isonum_711(isodir->name_len);
if (!(isonum_711(isodir->name_len)&1))
pwhead++;
- isochar(isodir->name, pwhead, ana->imp->joliet_level, &c);
+ isochar(isodir->name, pwhead, ana->imp->joliet_level, &c, NULL);
/* If it's not the '.' entry of the root dir obey SP field */
if (c != 0 || isonum_733(isodir->extent) != ana->imp->root_extent)
@@ -645,7 +645,7 @@
*outlen = 0;
isochar(isodir->name, isodir->name + isonum_711(isodir->name_len),
- imp->joliet_level, &c);
+ imp->joliet_level, &c, NULL);
tab = rrip_table_getname;
if (c == 0 || c == 1) {
cd9660_rrip_defname(isodir,&analyze);
--- cd9660_util.c.orig Fri Mar 2 15:17:36 2001
+++ cd9660_util.c Wed Apr 10 17:54:12 2002
@@ -60,11 +60,12 @@
* Return number of bytes consumed
*/
int
-isochar(isofn, isoend, joliet_level, c)
+isochar(isofn, isoend, joliet_level, c, utf_state)
u_char *isofn;
u_char *isoend;
int joliet_level;
u_char *c;
+ int *utf_state;
{
*c = *isofn++;
if (joliet_level == 0 || isofn == isoend)
@@ -81,10 +82,58 @@
break;
}
/* XXX: if Unicode conversion routine is loaded then use it */
- if (cd9660_wchar2char != NULL)
+ if (cd9660_wchar2char != NULL) {
*c = cd9660_wchar2char((*(isofn - 1) << 8) | *isofn);
+ *utf_state = 0;
+ } else if (utf_state != NULL) { /* XXX: convert to UTF-8 */
+ static const u_char hex[16] = "0123456789abcdef";
+ int unichar = (*(isofn - 1) << 8) | *isofn;
+ if (unichar == '?' || unichar == '%' || unichar <= ' '
+ || unichar == '\\' || unichar == '/') {
+ switch (*utf_state) {
+ case 0:
+ *c = '%';
+ *utf_state = 1;
+ break;
+ case 1:
+ *c = (u_char)(hex[(unichar >> 4) & 0x0f]);
+ *utf_state = 2;
+ break;
+ default:
+ *c = (u_char)(hex[unichar & 0x0f]);
+ *utf_state = 0;
+ break;
+ }
+ } else if (unichar < 128) {
+ *c = (u_char)(unichar);
+ *utf_state = 0;
+ } else if ((unichar > 127) && (unichar < 2048)) {
+ if (*utf_state == 0) {
+ *c = (u_char)((unichar >> 6) | 192);
+ *utf_state = 1;
+ } else {
+ *c = (u_char)((unichar & 63) | 128);
+ *utf_state = 0;
+ }
+ } else {
+ switch (*utf_state) {
+ case 0:
+ *c = (u_char)((unichar >> 12) | 224);
+ *utf_state = 1;
+ break;
+ case 1:
+ *c = (u_char)(((unichar >> 6) & 63) | 128);
+ *utf_state = 2;
+ break;
+ default:
+ *c = (u_char)((unichar & 63) | 128);
+ *utf_state = 0;
+ break;
+ }
+ }
+ }
- return 2;
+ return (utf_state == NULL || *utf_state == 0) ? 2 : 0;
}
/*
@@ -101,12 +150,13 @@
int joliet_level;
{
int i, j;
+ int utf_state = 0;
u_char c, *fnend = fn + fnlen, *isoend = isofn + isolen;
for (; fn != fnend; fn++) {
if (isofn == isoend)
return *fn;
- isofn += isochar(isofn, isoend, joliet_level, &c);
+ isofn += isochar(isofn, isoend, joliet_level, &c, &utf_state);
if (c == ';') {
if (*fn++ != ';')
return fn[-1];
@@ -117,7 +167,7 @@
}
for (j = 0; isofn != isoend; j = j * 10 + c - '0')
isofn += isochar(isofn, isoend,
- joliet_level, &c);
+ joliet_level, &c, &utf_state);
return i - j;
}
if (c != *fn) {
@@ -133,13 +183,13 @@
}
}
if (isofn != isoend) {
- isofn += isochar(isofn, isoend, joliet_level, &c);
+ isofn += isochar(isofn, isoend, joliet_level, &c, &utf_state);
switch (c) {
default:
return -c;
case '.':
if (isofn != isoend) {
- isochar(isofn, isoend, joliet_level, &c);
+ isochar(isofn, isoend, joliet_level, &c, &utf_state);
if (c == ';')
return 0;
}
@@ -165,6 +215,7 @@
int joliet_level;
{
int fnidx = 0;
+ int utf_state = 0;
u_char c, d = '\0', *infnend = infn + infnlen;
if (assoc) {
@@ -172,7 +223,7 @@
fnidx++;
}
for (; infn != infnend; fnidx++) {
- infn += isochar(infn, infnend, joliet_level, &c);
+ infn += isochar(infn, infnend, joliet_level, &c, &utf_state);
if (!original && !joliet_level && c >= 'A' && c <= 'Z')
*outfn++ = c + ('a' - 'A');
--- iso.h.orig Wed Apr 10 10:30:10 2002
+++ iso.h Tue Apr 9 11:41:14 2002
@@ -266,7 +266,7 @@
extern vop_t **cd9660_specop_p;
extern vop_t **cd9660_fifoop_p;
-int isochar __P((u_char *, u_char *, int, u_char *));
+int isochar __P((u_char *, u_char *, int, u_char *, int *));
int isofncmp __P((u_char *, int, u_char *, int, int));
void isofntrans __P((u_char *, int, u_char *, u_short *, int, int, int));
ino_t isodirino __P((struct iso_directory_record *, struct iso_mnt *));
>Release-Note:
>Audit-Trail:
>Unformatted:
To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-bugs" in the body of the message
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200204110230.g3B2UPVA005049>
