From owner-svn-src-user@FreeBSD.ORG Mon Aug 3 21:41:30 2009 Return-Path: Delivered-To: svn-src-user@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id 602D0106564A; Mon, 3 Aug 2009 21:41:30 +0000 (UTC) (envelope-from edwin@FreeBSD.org) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:4f8:fff6::2c]) by mx1.freebsd.org (Postfix) with ESMTP id 4E03F8FC0C; Mon, 3 Aug 2009 21:41:30 +0000 (UTC) (envelope-from edwin@FreeBSD.org) Received: from svn.freebsd.org (localhost [127.0.0.1]) by svn.freebsd.org (8.14.3/8.14.3) with ESMTP id n73LfUgI098153; Mon, 3 Aug 2009 21:41:30 GMT (envelope-from edwin@svn.freebsd.org) Received: (from edwin@localhost) by svn.freebsd.org (8.14.3/8.14.3/Submit) id n73LfUUa098150; Mon, 3 Aug 2009 21:41:30 GMT (envelope-from edwin@svn.freebsd.org) Message-Id: <200908032141.n73LfUUa098150@svn.freebsd.org> From: Edwin Groothuis Date: Mon, 3 Aug 2009 21:41:30 +0000 (UTC) To: src-committers@freebsd.org, svn-src-user@freebsd.org X-SVN-Group: user MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: Subject: svn commit: r196060 - in user/edwin/locale: . tools X-BeenThere: svn-src-user@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: "SVN commit messages for the experimental " user" src tree" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 03 Aug 2009 21:41:30 -0000 Author: edwin Date: Mon Aug 3 21:41:30 2009 New Revision: 196060 URL: http://svn.freebsd.org/changeset/base/196060 Log: Instead of using Text::Iconv, use private conversion routines based on the unicode.org UNIDATA and CLDR data. 
Added: user/edwin/locale/Makefile Modified: user/edwin/locale/tools/charmaps.pm user/edwin/locale/tools/cldr2def.pl Added: user/edwin/locale/Makefile ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ user/edwin/locale/Makefile Mon Aug 3 21:41:30 2009 (r196060) @@ -0,0 +1,41 @@ +# +# $FreeBSD$ +# + +UNICODEDIR?= /home/edwin/unicode/ +CLDRVERSION?= 1.7.0 +CLDRDIR?= ${UNICODEDIR}/cldr/${CLDRVERSION}/ +UNIDATAVERSION?=5.1.0 +UNIDATADIR?= ${UNICODEDIR}/UNIDATA/${UNIDATAVERSION}/ + +XMLDIR?= /home/edwin/svn/edwin/locale/tools/ +XMLFILE?= charmaps.xml + +TYPES?= monetdef numericdef msgdef timedef + +all: +.for t in ${TYPES} + test -d ${t} || mkdir ${t} + make build-${t} +.endfor + @echo "" + @find . -name *failed + +install: +.for t in ${TYPES} + cd ${t} && make + cd ${t} && sudo DESTDIR=/home/edwin/locale/new make install +.endfor + +.for t in ${TYPES} +build-${t}: + test -d ${t} || mkdir ${t} + perl -I tools tools/cldr2def.pl ${CLDRDIR} ${UNIDATADIR} ${XMLDIR} ${XMLDIR}/${XMLFILE} ${t} ${LC} +.endfor + +clean: +.for t in ${TYPES} + -rm ${t}/* + -rmdir ${t} +.endfor + Modified: user/edwin/locale/tools/charmaps.pm ============================================================================== --- user/edwin/locale/tools/charmaps.pm Mon Aug 3 21:39:04 2009 (r196059) +++ user/edwin/locale/tools/charmaps.pm Mon Aug 3 21:41:30 2009 (r196060) @@ -64,6 +64,7 @@ sub h_start { if (defined $encoding) { foreach my $e (split(" ", $encoding)) { $d{L}{$name}{$f}{data}{$c}{$e} = undef; + $d{E}{$e} = 0; # not read } } $d{L}{$name}{$f}{data}{$c}{"UTF-8"} = undef; @@ -74,19 +75,25 @@ sub h_start { if ($index == 2 && $data{element}{1} eq "translations" && $element eq "translation") { - if (defined $attrs{hex}) { - my $k = "<" . $attrs{cldr} . 
">"; - my $hs = $attrs{hex}; - $d{T}{$attrs{encoding}}{$k} = ""; - while ($hs ne "") { - $d{T}{$attrs{encoding}}{$k} .= - chr(hex(substr($hs, 0, 2))); - $hs = substr($hs, 2); + foreach my $e (split(" ", $attrs{encoding})) { + if (defined $attrs{hex}) { + my $k = $attrs{cldr}; + my $hs = $attrs{hex}; + $d{T}{$e}{$k}{hex} = $hs; + } + if (defined $attrs{string}) { + my $s = ""; + for (my $i = 0; $i < length($attrs{string}); $i++) { + $s .= sprintf("%02x", + ord(substr($attrs{string}, $i, 1))); + } + $d{T}{$e}{$attrs{cldr}}{hex} = $s; + } + if (defined $attrs{unicode}) { + my $k = $attrs{cldr}; + my $uc = $attrs{unicode}; + $d{T}{$e}{$k}{unicode} = $uc; } - } - if (defined $attrs{string}) { - $d{T}{$attrs{encoding}}{"<" . $attrs{cldr} . ">"} = - $attrs{string}; } return; } @@ -115,7 +122,8 @@ sub h_end { foreach my $c (split(/,/, $data{fields}{countries})) { my $m = $data{fields}{text}; - $m =~ s/[\t ]//g; + $m =~ s/^[\t ]//g; + $m =~ s/[\t ]$//g; $d{AM}{$data{fields}{name}}{$c} = $m; } $data{fields} = (); Modified: user/edwin/locale/tools/cldr2def.pl ============================================================================== --- user/edwin/locale/tools/cldr2def.pl Mon Aug 3 21:39:04 2009 (r196059) +++ user/edwin/locale/tools/cldr2def.pl Mon Aug 3 21:41:30 2009 (r196060) @@ -6,19 +6,20 @@ use strict; use XML::Parser; -use Text::Iconv; use Tie::IxHash; use Data::Dumper; use Digest::SHA qw(sha1_hex); require "charmaps.pm"; if ($#ARGV < 2) { - print "Usage: $0 [la_CC]\n"; + print "Usage: $0 [la_CC]\n"; exit(1); } my $DEFENCODING = "UTF-8"; -my $DIR = shift(@ARGV); +my $CLDRDIR = shift(@ARGV); +my $UNIDATADIR = shift(@ARGV); +my $XMLDIR = shift(@ARGV); my $CHARMAPS = shift(@ARGV); my $TYPE = shift(@ARGV); my $doonly = shift(@ARGV); @@ -26,15 +27,20 @@ my @filter = (); my %convertors = (); +my %ucd = (); my %values = (); my %hashtable = (); my %languages = (); my %translations = (); +my %encodings = (); my %alternativemonths = (); get_languages(); -my %cm = (); 
-get_utf8map(); +my %utf8map = (); +my %utf8aliases = (); +get_unidata($UNIDATADIR); +get_utf8map("$CLDRDIR/posix/$DEFENCODING.cm"); +get_encodings("$XMLDIR/charmaps"); my %keys = (); tie(%keys, "Tie::IxHash"); @@ -50,7 +56,7 @@ my %FILESNAMES = ( my %callback = ( mdorder => \&callback_mdorder, altmon => \&callback_altmon, - data => (), + data => undef, ); my %DESC = ( @@ -189,27 +195,83 @@ sub callback_altmon { ############################ +sub get_unidata { + my $directory = shift; + + open(FIN, "$directory/UnicodeData.txt"); + my @lines = <FIN>; + chomp(@lines); + close(FIN); + + foreach my $l (@lines) { + my @a = split(/;/, $l); + + $ucd{code2name}{"$a[0]"} = $a[1]; # Unicode name + $ucd{name2code}{"$a[1]"} = $a[0]; # Unicode code + } +} + sub get_utf8map { - open(FIN, "$DIR/posix/$DEFENCODING.cm"); + my $file = shift; + + open(FIN, $file); my @lines = <FIN>; close(FIN); chomp(@lines); + + my $prev_k = undef; + my $prev_v = ""; my $incharmap = 0; foreach my $l (@lines) { $l =~ s/\r//; next if ($l =~ /^\#/); next if ($l eq ""); + if ($l eq "CHARMAP") { $incharmap = 1; next; } + next if (!$incharmap); last if ($l eq "END CHARMAP"); - $l =~ /^([^\s]+)\s+(.*)/; + + $l =~ /^<([^\s]+)>\s+(.*)/; my $k = $1; my $v = $2; - $v =~ s/\\x//g; - $cm{$k} = $v; + $k =~ s/_/ /g; # unicode char string + $v =~ s/\\x//g; # UTF-8 char code + $utf8map{$k} = $v; + + $utf8aliases{$k} = $prev_k if ($prev_v eq $v); + + $prev_v = $v; + $prev_k = $k; + } +} + +sub get_encodings { + my $dir = shift; + foreach my $e (sort(keys(%encodings))) { + if (!open(FIN, "$dir/$e.TXT")) { + print "Cannot open charmap for $e\n"; + next; + + } + $encodings{$e} = 1; + my @lines = <FIN>; + close(FIN); + chomp(@lines); + foreach my $l (@lines) { + $l =~ s/\r//; + next if ($l =~ /^\#/); + next if ($l eq ""); + + my @a = split(" ", $l); + next if ($#a < 1); + $a[0] =~ s/^0[xX]//; # local char code + $a[1] =~ s/^0[xX]//; # unicode char code + $convertors{$e}{uc($a[1])} = uc($a[0]); + } } } @@ -218,6 +280,7 @@ sub 
get_languages { %languages = %{$data{L}}; %translations = %{$data{T}}; %alternativemonths = %{$data{AM}}; + %encodings = %{$data{E}}; return if (!defined $doonly); @@ -248,14 +311,15 @@ sub get_fields { $file = $l . "_"; $file .= $f . "_" if ($f ne "x"); $file .= $c; - if (!open(FIN, "$DIR/posix/$file.$DEFENCODING.src")) { + if (!open(FIN, "$CLDRDIR/posix/$file.$DEFENCODING.src")) { if (!defined $languages{$l}{$f}{fallback}) { print STDERR "Cannot open $file.$DEFENCODING.src\n"; next; } $file = $languages{$l}{$f}{fallback}; - if (!open(FIN, "$DIR/posix/$file.$DEFENCODING.src")) { + if (!open(FIN, + "$CLDRDIR/posix/$file.$DEFENCODING.src")) { print STDERR "Cannot open fallback " . "$file.$DEFENCODING.src\n"; @@ -283,6 +347,12 @@ sub get_fields { $continue = ($line =~ /\/$/); $line =~ s/\/$// if ($continue); + + while ($line =~ /_/) { + $line =~ + s/\<([^>_]+)_([^>]+)\>/<$1 $2>/; + } + die "_ in data - $line" if ($line =~ /_/); $values{$l}{$c}{$k} .= $line; last if (!$continue); @@ -294,15 +364,57 @@ sub get_fields { } sub decodecldr { + my $e = shift; my $s = shift; - my $v = $cm{$s}; + + my $v = undef; + + if ($e eq "UTF-8") { + # + # Conversion to UTF-8 can be done from the Unicode name to + # the UTF-8 character code. + # + $v = $utf8map{$s}; + die "Cannot convert $s in $e (charmap)" if (!defined $v); + } else { + # + # Conversion to these encodings can be done from the Unicode + # name to Unicode code to the encodings code. 
+ # + my $ucc = undef; + $ucc = $ucd{name2code}{$s} if (defined $ucd{name2code}{$s}); + $ucc = $ucd{name2code}{$utf8aliases{$s}} + if (!defined $ucc + && $utf8aliases{$s} + && defined $ucd{name2code}{$utf8aliases{$s}}); + + die "Cannot convert $s in $e (ucd string)" if (!defined $ucc); + $v = $convertors{$e}{$ucc}; + + $v = $translations{$e}{$s}{hex} + if (!defined $v && defined $translations{$e}{$s}{hex}); + + if (!defined $v && defined $translations{$e}{$s}{unicode}) { + my $ucn = $translations{$e}{$s}{unicode}; + $ucc = $ucd{name2code}{$ucn} + if (defined $ucd{name2code}{$ucn}); + $ucc = $ucd{name2code}{$utf8aliases{$ucn}} + if (!defined $ucc + && defined $ucd{name2code}{$utf8aliases{$ucn}}); + $v = $convertors{$e}{$ucc}; + } + + die "Cannot convert $s in $e (charmap)" if (!defined $v); + } return pack("C", hex($v)) if (length($v) == 2); return pack("CC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2))) if (length($v) == 4); return pack("CCC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2)), hex(substr($v, 4, 2))) if (length($v) == 6); + print STDERR "Cannot convert $e $s\n"; return "length = " . length($v); + } sub translate { @@ -331,13 +443,9 @@ sub print_fields { $file .= "_" . $c; print "Writing to $file in $enc\n"; - eval { - $convertors{$enc} = - Text::Iconv->new($DEFENCODING, $enc); - } if (!defined $convertors{$enc}); - if (!defined $convertors{$enc}) { - print "Failed! Cannot convert between " . - "$DEFENCODING and $enc.\n"; + if ($enc ne $DEFENCODING && + !defined $convertors{$enc}) { + print "Failed! Cannot convert to $enc.\n"; next; }; @@ -398,23 +506,24 @@ EOF $v =~ s/^"//; $v =~ s/"$//; my $cm = ""; - while ($v =~ /^(.*?)(<.*?>)(.*)/) { + while ($v =~ /^(.*?)<(.*?)>(.*)/) { + my $p1 = $1; $cm = $2; - $v = $1 . decodecldr($2) . $3; - } - my $fv = - $convertors{$enc}->convert("$v"); - $fv = translate($enc, $cm) - if (!defined $fv); - if (!defined $fv) { - print STDERR - "Could not convert $k " . - "($cm) from $DEFENCODING " . 
- "to $enc\n"; - $okay = 0; - next; + my $p3 = $3; + + my $rv = decodecldr($enc, $cm); +# $rv = translate($enc, $cm) +# if (!defined $rv); + if (!defined $rv) { + print STDERR +"Could not convert $k ($cm) from $DEFENCODING to $enc\n"; + $okay = 0; + next; + } + + $v = $p1 . $rv . $p3; } - $output .= "$fv\n"; + $output .= "$v\n"; next; } if ($f eq "as") { @@ -422,26 +531,27 @@ EOF $v =~ s/^"//; $v =~ s/"$//; my $cm = ""; - while ($v =~ /^(.*?)(<.*?>)(.*)/) { + while ($v =~ /^(.*?)<(.*?)>(.*)/) { + my $p1 = $1; $cm = $2; - $v = $1 . - decodecldr($2) . $3; - } - my $fv = - $convertors{$enc}->convert("$v"); - $fv = translate($enc, $cm) - if (!defined $fv); - if (!defined $fv) { - print STDERR - "Could not " . - "convert $k ($cm)" . - " from " . - "$DEFENCODING to " . - "$enc\n"; - $okay = 0; - next; + my $p3 = $3; + + my $rv = + decodecldr($enc, + $cm); +# $rv = translate($enc, +# $cm) +# if (!defined $rv); + if (!defined $rv) { + print STDERR +"Could not convert $k ($cm) from $DEFENCODING to $enc\n"; + $okay = 0; + next; + } + + $v = $1 . $rv . $3; } - $output .= "$fv\n"; + $output .= "$v\n"; } next; }