Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 11 Apr 2002 10:30:25 +0800 (CST)
From:      Christopher Hall <hsw@acm.org>
To:        FreeBSD-gnats-submit@FreeBSD.org
Subject:   kern/36983: CD9660 unicode to utf-8 [hack]
Message-ID:  <200204110230.g3B2UPVA005049@x4.tucheng.generalresources.com>

next in thread | raw e-mail | index | archive | help

>Number:         36983
>Category:       kern
>Synopsis:       CD9660 unicode to utf-8 [hack]
>Confidential:   no
>Severity:       non-critical
>Priority:       low
>Responsible:    freebsd-bugs
>State:          open
>Quarter:        
>Keywords:       
>Date-Required:
>Class:          sw-bug
>Submitter-Id:   current-users
>Arrival-Date:   Wed Apr 10 19:40:01 PDT 2002
>Closed-Date:
>Last-Modified:
>Originator:     Christopher Hall
>Release:        FreeBSD 4.5-STABLE i386
>Organization:
>Environment:
System: FreeBSD 4.5-STABLE #4: Wed Apr 10 18:00:12 CST 2002 root@:/usr/obj/usr/src/sys/GENERIC i386

>Description:

        The cd9660 filing system cannot handle most unicode characters
	in file/directory names (especially Chinese/Japanese).

	This patch was a quick fix so I could retrieve files from
	such a CDROM, but it might be useful for someone who needs to
	read this kind of CDROM.

>How-To-Repeat:
	A CDROM burned by Nero Burning ROM (Chinese file names)

        mount -t cd9660 -o ro /dev/cd0a /cdrom
	ls -l /cdrom

        All files appear as ????? (varying numbers of '?'), and
        it is only possible to access one file/directory from each set
        of names that has the same number of question marks.

>Fix:

	Difficult since the isochar routine in sys/isofs/cd9660/cd9660_util.c
	assumes a 1:1 correspondence between the unicode characters and the
	native single byte character set.  Most unicode chars are converted
	to '?'.

	Here is a quick workaround for anyone who has a similar problem:
	The following hack adds a state machine to the isochar routine
	to fool the caller of isochar into getting multiple bytes per
	Unicode char.

	The resulting bytes are UTF-8, except for '?', '/', '\', '%', space,
	and control chars, which are converted to '%' followed by two hex digits.

	Apply the patches in sys/isofs/cd9660/ to:
	  cd9660_rrip.c
	  cd9660_util.c
	  iso.h


--- cd9660_rrip.c.orig	Sat Aug 28 08:46:06 1999
+++ cd9660_rrip.c	Tue Apr  9 10:28:19 2002
@@ -508,7 +508,7 @@
 	pwhead = isodir->name + isonum_711(isodir->name_len);
 	if (!(isonum_711(isodir->name_len)&1))
 		pwhead++;
-	isochar(isodir->name, pwhead, ana->imp->joliet_level, &c);
+	isochar(isodir->name, pwhead, ana->imp->joliet_level, &c, NULL);

 	/* If it's not the '.' entry of the root dir obey SP field */
 	if (c != 0 || isonum_733(isodir->extent) != ana->imp->root_extent)
@@ -645,7 +645,7 @@
 	*outlen = 0;

 	isochar(isodir->name, isodir->name + isonum_711(isodir->name_len),
-		imp->joliet_level, &c);
+		imp->joliet_level, &c, NULL);
 	tab = rrip_table_getname;
 	if (c == 0 || c == 1) {
 		cd9660_rrip_defname(isodir,&analyze);



--- cd9660_util.c.orig	Fri Mar  2 15:17:36 2001
+++ cd9660_util.c	Wed Apr 10 17:54:12 2002
@@ -60,11 +60,12 @@
  * Return number of bytes consumed
  */
 int
-isochar(isofn, isoend, joliet_level, c)
+isochar(isofn, isoend, joliet_level, c, utf_state)
       u_char *isofn;
       u_char *isoend;
       int joliet_level;
       u_char *c;
+      int *utf_state;
 {
       *c = *isofn++;
       if (joliet_level == 0 || isofn == isoend)
@@ -81,10 +82,58 @@
               break;
       }
       /* XXX: if Unicode conversion routine is loaded then use it */
-      if (cd9660_wchar2char != NULL)
+      if (cd9660_wchar2char != NULL) {
             *c = cd9660_wchar2char((*(isofn - 1) << 8) | *isofn);
+            *utf_state = 0;
+      } else if (utf_state != NULL) { /* XXX: convert to UTF-8 */
+	    static const u_char hex[16] = "0123456789abcdef";
+            int unichar  = (*(isofn - 1) << 8) | *isofn;
+            if (unichar == '?' || unichar == '%' || unichar <= ' '
+		|| unichar == '\\' || unichar == '/') {
+	          switch (*utf_state) {
+                  case 0:
+                        *c = '%';
+                        *utf_state = 1;
+                        break;
+                  case 1:
+                        *c = (u_char)(hex[(unichar >> 4) & 0x0f]);
+                        *utf_state = 2;
+                        break;
+                  default:
+                        *c = (u_char)(hex[unichar & 0x0f]);
+                        *utf_state = 0;
+                        break;
+                  }
+	    } else if (unichar < 128) {
+                  *c = (u_char)(unichar);
+                  *utf_state = 0;
+	    } else if ((unichar > 127) && (unichar < 2048)) {
+	          if (*utf_state == 0) {
+                        *c = (u_char)((unichar >> 6) | 192);
+                        *utf_state = 1;
+                  } else {
+                        *c = (u_char)((unichar & 63) | 128);
+                        *utf_state = 0;
+		  }
+            } else {
+	          switch (*utf_state) {
+                  case 0:
+                        *c = (u_char)((unichar >> 12) | 224);
+                        *utf_state = 1;
+                        break;
+                  case 1:
+                        *c = (u_char)(((unichar >> 6) & 63) | 128);
+                        *utf_state = 2;
+                        break;
+                  default:
+                        *c = (u_char)((unichar & 63) | 128);
+                        *utf_state = 0;
+                        break;
+                  }
+            }
+      }

-      return 2;
+      return (utf_state == NULL || *utf_state == 0) ? 2 : 0;
 }

 /*
@@ -101,12 +150,13 @@
 	int joliet_level;
 {
 	int i, j;
+        int utf_state = 0;
 	u_char c, *fnend = fn + fnlen, *isoend = isofn + isolen;

 	for (; fn != fnend; fn++) {
 		if (isofn == isoend)
 			return *fn;
-		isofn += isochar(isofn, isoend, joliet_level, &c);
+		isofn += isochar(isofn, isoend, joliet_level, &c, &utf_state);
 		if (c == ';') {
 			if (*fn++ != ';')
 				return fn[-1];
@@ -117,7 +167,7 @@
 			}
 			for (j = 0; isofn != isoend; j = j * 10 + c - '0')
 				isofn += isochar(isofn, isoend,
-						 joliet_level, &c);
+						 joliet_level, &c, &utf_state);
 			return i - j;
 		}
 		if (c != *fn) {
@@ -133,13 +183,13 @@
 		}
 	}
 	if (isofn != isoend) {
-		isofn += isochar(isofn, isoend, joliet_level, &c);
+		isofn += isochar(isofn, isoend, joliet_level, &c, &utf_state);
 		switch (c) {
 		default:
 			return -c;
 		case '.':
 			if (isofn != isoend) {
-				isochar(isofn, isoend, joliet_level, &c);
+				isochar(isofn, isoend, joliet_level, &c, &utf_state);
 				if (c == ';')
 					return 0;
 			}
@@ -165,6 +215,7 @@
 	int joliet_level;
 {
 	int fnidx = 0;
+	int utf_state = 0;
 	u_char c, d = '\0', *infnend = infn + infnlen;

 	if (assoc) {
@@ -172,7 +223,7 @@
 		fnidx++;
 	}
 	for (; infn != infnend; fnidx++) {
-		infn += isochar(infn, infnend, joliet_level, &c);
+		infn += isochar(infn, infnend, joliet_level, &c, &utf_state);

 		if (!original && !joliet_level && c >= 'A' && c <= 'Z')
 			*outfn++ = c + ('a' - 'A');



--- iso.h.orig	Wed Apr 10 10:30:10 2002
+++ iso.h	Tue Apr  9 11:41:14 2002
@@ -266,7 +266,7 @@
 extern vop_t **cd9660_specop_p;
 extern vop_t **cd9660_fifoop_p;

-int isochar __P((u_char *, u_char *, int, u_char *));
+int isochar __P((u_char *, u_char *, int, u_char *, int *));
 int isofncmp __P((u_char *, int, u_char *, int, int));
 void isofntrans __P((u_char *, int, u_char *, u_short *, int, int, int));
 ino_t isodirino __P((struct iso_directory_record *, struct iso_mnt *));
>Release-Note:
>Audit-Trail:
>Unformatted:

To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-bugs" in the body of the message




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200204110230.g3B2UPVA005049>