From owner-svn-src-all@freebsd.org Sat Apr 16 17:36:03 2016 Return-Path: Delivered-To: svn-src-all@mailman.ysv.freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:1900:2254:206a::19:1]) by mailman.ysv.freebsd.org (Postfix) with ESMTP id EB184B10EBC; Sat, 16 Apr 2016 17:36:03 +0000 (UTC) (envelope-from bapt@FreeBSD.org) Received: from repo.freebsd.org (repo.freebsd.org [IPv6:2610:1c1:1:6068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mx1.freebsd.org (Postfix) with ESMTPS id B638B106D; Sat, 16 Apr 2016 17:36:03 +0000 (UTC) (envelope-from bapt@FreeBSD.org) Received: from repo.freebsd.org ([127.0.1.37]) by repo.freebsd.org (8.15.2/8.15.2) with ESMTP id u3GHa3VZ064757; Sat, 16 Apr 2016 17:36:03 GMT (envelope-from bapt@FreeBSD.org) Received: (from bapt@localhost) by repo.freebsd.org (8.15.2/8.15.2/Submit) id u3GHa2MM064750; Sat, 16 Apr 2016 17:36:02 GMT (envelope-from bapt@FreeBSD.org) Message-Id: <201604161736.u3GHa2MM064750@repo.freebsd.org> X-Authentication-Warning: repo.freebsd.org: bapt set sender to bapt@FreeBSD.org using -f From: Baptiste Daroussin Date: Sat, 16 Apr 2016 17:36:02 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r298116 - in head/tools/tools/locale: . 
etc etc/charmaps tools X-SVN-Group: head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-all@freebsd.org X-Mailman-Version: 2.1.21 Precedence: list List-Id: "SVN commit messages for the entire src tree \(except for " user" and " projects" \)" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Sat, 16 Apr 2016 17:36:04 -0000 Author: bapt Date: Sat Apr 16 17:36:02 2016 New Revision: 298116 URL: https://svnweb.freebsd.org/changeset/base/298116 Log: Rework collation generation: When building collation database for non unicode encodings use the proper unicode mapping (this fixes collation not working properly for those encodings). For locales where new characters are added but only for unicode, stop trying to map the new characters, directly extract from CLDR the collation files for the said encoding. Stop trying to generate encoding map from unicode version for GB2312 and eucCN. It was not reliable. 
Instead use the map provided by the CLDR project. Reported by: ache Added: head/tools/tools/locale/tools/extract-colldef.awk (contents, props changed) Modified: head/tools/tools/locale/Makefile head/tools/tools/locale/etc/charmaps.xml head/tools/tools/locale/etc/charmaps/charmaps.txt head/tools/tools/locale/tools/cldr2def.pl head/tools/tools/locale/tools/convert_map.pl head/tools/tools/locale/tools/finalize Modified: head/tools/tools/locale/Makefile ============================================================================== --- head/tools/tools/locale/Makefile Sat Apr 16 13:41:10 2016 (r298115) +++ head/tools/tools/locale/Makefile Sat Apr 16 17:36:02 2016 (r298116) @@ -22,6 +22,23 @@ KNOWN= monetdef numericdef msgdef timed TYPES?= ${KNOWN} LOCALE_DESTDIR?= /tmp/generated-locales/ +COLLATION_SPECIAL?= \ + cs_CZ ISO8859-2 \ + da_DK ISO8859-1 \ + da_DK ISO8859-15 \ + hr_HR ISO8859-2 \ + hu_HU ISO8859-2 \ + nb_NO ISO8859-1 \ + nb_NO ISO8859-15 \ + sk_SK ISO8859-2 \ + zh_Hans_CN GB2312 \ + zh_Hans_CN eucCN \ + +.for area enc in ${COLLATION_SPECIAL} +COLLATIONS_SPECIAL_ENV+= ${area}.${enc} +.endfor +PASSON+= COLLATIONS_SPECIAL="${COLLATIONS_SPECIAL_ENV}" + .if defined(LC) LC:= --lc=${LC} .endif @@ -55,17 +72,26 @@ post-install: .endfor .for t in ${TYPES} -build-${t}: +gen-${t}: mkdir -p ${t} ${t}.draft perl -I tools tools/cldr2def.pl \ --cldr=$$(realpath ${CLDRDIR}) \ --unidata=$$(realpath ${UNIDATADIR}) \ --etc=$$(realpath ${ETCDIR}) \ --type=${t} ${LC} + +build-${t}: gen-${t} env ${PASSON} tools/finalize ${t} .endfor -build-ctypedef: transfer-rollup +gen-ctypedef: transfer-rollup +static-colldef: gen-colldef +build-colldef: static-colldef + +static-colldef: +.for area enc in ${COLLATION_SPECIAL} + awk -f tools/extract-colldef.awk ${CLDRDIR}/posix/${area}.${enc}.src > colldef/${area}.${enc}.src +.endfor transfer-rollup: cp ${ETCDIR}/common.UTF-8.src ${CLDRDIR}/posix/xx_Comm_US.UTF-8.src @@ -93,12 +119,34 @@ BASE_LOCALES_OF_INTEREST?= \ uk_UA \ kk_Cyrl_KZ mn_Cyrl_MN 
sr_Cyrl_RS sr_Latn_RS \ zh_Hans_CN zh_Hant_HK zh_Hant_TW \ - \ - \ bn_IN gu_IN or_IN ta_IN te_IN kn_IN ml_IN si_LK \ th_TH lo_LA bo_IN my_MM pa_Guru_IN ka_GE chr_US \ km_KH shi_Tfng_MA ii_CN vai_Vaii_LR vi_VN +ENCODINGS= Big5 \ + CP1251 \ + CP866 \ + CP949 \ + eucCN \ + eucJP \ + eucKR \ + GB18030 \ + GB2312 \ + GBK \ + ISO8859-1 \ + ISO8859-13 \ + ISO8859-15 \ + ISO8859-2 \ + ISO8859-5 \ + ISO8859-7 \ + ISO8859-9 \ + KOI8-R \ + KOI8-U \ + SJIS \ + US-ASCII \ + UTF-8 \ + + POSIX: .if exists (${CLDRDIR}/tools/java/cldr.jar) mkdir -p ${CLDRDIR}/posix @@ -109,11 +157,20 @@ POSIX: -d ${CLDRDIR}/posix -m ${area} -c UTF-8 . endif . endfor -. if !exists(${CLDRDIR}/posix/UTF-8.cm) +. for area encoding in ${COLLATION_SPECIAL} +. if !exists(${CLDRDIR}/posix/${area}.${encoding}.src) + java -DCLDR_DIR=${CLDRDIR:Q} -jar ${CLDRDIR}/tools/java/cldr.jar \ + org.unicode.cldr.posix.GeneratePOSIX \ + -d ${CLDRDIR}/posix -m ${area} -c ${encoding} +. endif +. endfor +. for enc in ${ENCODINGS} +. if !exists(${CLDRDIR}/posix/${enc}.cm) java -DCLDR_DIR=${CLDRDIR:Q} -jar ${CLDRDIR}/tools/java/cldr.jar \ org.unicode.cldr.posix.GenerateCharmap \ - -d ${CLDRDIR}/posix + -d ${CLDRDIR}/posix -c ${enc} . endif +. 
endfor .else @echo "Please install CLDR toolset for the desired release" @echo "It should go at ${CLDRDIR}/tools" Modified: head/tools/tools/locale/etc/charmaps.xml ============================================================================== --- head/tools/tools/locale/etc/charmaps.xml Sat Apr 16 13:41:10 2016 (r298115) +++ head/tools/tools/locale/etc/charmaps.xml Sat Apr 16 17:36:02 2016 (r298116) @@ -187,10 +187,6 @@ countries="CN" /> - @@ -444,69 +440,69 @@ unicode="FULLWIDTH HYPHEN-MINUS" /> - - - - - - - - - - - - - - - - - - - - @@ -516,11 +512,11 @@ cldr="CJK UNIFIED IDEOGRAPH-706B" ucc="706B" /> - - - Modified: head/tools/tools/locale/etc/charmaps/charmaps.txt ============================================================================== --- head/tools/tools/locale/etc/charmaps/charmaps.txt Sat Apr 16 13:41:10 2016 (r298115) +++ head/tools/tools/locale/etc/charmaps/charmaps.txt Sat Apr 16 17:36:02 2016 (r298116) @@ -8,7 +8,6 @@ haible.de: http://haible.de/bruno/charse ARMSCII-8 haible.de: Armenian.html Big5 unicodeorg: OBSOLETE/EASTASIA/OTHER - Big5HKSCS haible.de: BIG5-HKSCS.html / CP1131 haible.de: CP1131.html / aix-4.3.2/IBM-1131.TXT CP1251 unicode.org: VENDORS/MICSFT/WINDOWS CP866 unicode.org: VENDORS/MICSFT/PC Modified: head/tools/tools/locale/tools/cldr2def.pl ============================================================================== --- head/tools/tools/locale/tools/cldr2def.pl Sat Apr 16 13:41:10 2016 (r298115) +++ head/tools/tools/locale/tools/cldr2def.pl Sat Apr 16 17:36:02 2016 (r298116) @@ -808,14 +808,24 @@ sub make_makefile { my $SRCOUT; my $SRCOUT2; my $SRCOUT3 = ""; + my $SRCOUT4 = ""; my $MAPLOC; if ($TYPE eq "colldef") { $SRCOUT = "localedef -D -U -i \${.IMPSRC} \\\n" . - "\t-f \${MAPLOC}/map.UTF-8 " . + "\t-f \${MAPLOC}/map.\${.TARGET:T:R:E} " . "\${.OBJDIR}/\${.IMPSRC:T:R}"; $MAPLOC = "MAPLOC=\t\t\${.CURDIR}/../../tools/tools/" . "locale/etc/final-maps\n"; $SRCOUT2 = "LC_COLLATE"; + $SRCOUT3 = "" . 
+ ".for f t in \${LOCALES_MAPPED}\n" . + "FILES+=\t\$t.LC_COLLATE\n" . + "\$t.LC_COLLATE: \${.CURDIR}/\$f.src\n" . + "\tlocaledef -D -U -i \${.ALLSRC} \\\n" . + "\t\t-f \${MAPLOC}/map.\${.TARGET:T:R:E} \\\n" . + "\t\t\${.OBJDIR}/\${.TARGET:T:R}\n" . + ".endfor\n\n"; + $SRCOUT4 = "## LOCALES_MAPPED\n"; } elsif ($TYPE eq "ctypedef") { $SRCOUT = "localedef -D -U -c -w \${MAPLOC}/widths.txt \\\n" . @@ -855,6 +865,8 @@ ${MAPLOC} ## PLACEHOLDER +${SRCOUT4} + EOF foreach my $hash (keys(%hashtable)) { Modified: head/tools/tools/locale/tools/convert_map.pl ============================================================================== --- head/tools/tools/locale/tools/convert_map.pl Sat Apr 16 13:41:10 2016 (r298115) +++ head/tools/tools/locale/tools/convert_map.pl Sat Apr 16 17:36:02 2016 (r298116) @@ -1,5 +1,7 @@ #! /usr/local/bin/perl # +# $FreeBSD$ +# # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version @@ -167,7 +169,6 @@ elsif ($codeset eq "eucKR") { $max_m elsif ($codeset eq "GBK") { $max_mb = 2 } elsif ($codeset eq "GB2312") { $max_mb = 2 } elsif ($codeset eq "Big5") { $max_mb = 2 } -elsif ($codeset eq "Big5HKSCS") { $max_mb = 2 } else { $max_mb = 1 }; print(" \"$codeset\"\n"); print(" 1\n"); Added: head/tools/tools/locale/tools/extract-colldef.awk ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ head/tools/tools/locale/tools/extract-colldef.awk Sat Apr 16 17:36:02 2016 (r298116) @@ -0,0 +1,18 @@ +# $FreeBSD$ + +BEGIN { + print "# Warning: Do not edit. 
This is automatically extracted" + print "# from CLDR project data, obtained from http://cldr.unicode.org/" + print "# -----------------------------------------------------------------------------" +} +$1 == "comment_char" { print $0 } +$1 == "escape_char" { print $0 } +$1 == "LC_COLLATE" { + print $0 + while (getline line) { + print line + if (line == "END LC_COLLATE") { + break + } + } +} Modified: head/tools/tools/locale/tools/finalize ============================================================================== --- head/tools/tools/locale/tools/finalize Sat Apr 16 13:41:10 2016 (r298115) +++ head/tools/tools/locale/tools/finalize Sat Apr 16 17:36:02 2016 (r298116) @@ -26,12 +26,15 @@ new=${base}/../${1} TEMP=/tmp/${1}.locales TEMP2=/tmp/${1}.hashes TEMP3=/tmp/${1}.symlinks +TEMP4=/tmp/${1}.mapped FULLMAP=/tmp/utf8-map FULLEXTRACT=/tmp/extracted-names AWKCMD="/## PLACEHOLDER/ { \ while ( getline line < \"${TEMP}\" ) {print line} } \ /## SYMPAIRS/ { \ while ( getline line < \"${TEMP3}\" ) {print line} } \ + /## LOCALES_MAPPED/ { \ + while ( getline line < \"${TEMP4}\" ) {print line} } \ !/## / { print \$0 }" grep '^LOCALES+' ${old}/Makefile > ${TEMP} @@ -51,21 +54,23 @@ then /usr/bin/sed -E -e 's/[ ]+/ /g' \ ${CLDRDIR}/posix/UTF-8.cm \ > ${base}/../etc/final-maps/map.UTF-8 - CHARMAPS="ARMSCII-8 Big5 Big5HKSCS CP1131 CP1251 \ + /usr/bin/sed -E -e 's/[ ]+/ /g' \ + ${CLDRDIR}/posix/eucCN.cm \ + > ${base}/../etc/final-maps/map.eucCN + /usr/bin/sed -E -e 's/[ ]+/ /g' \ + ${CLDRDIR}/posix/eucCN.cm \ + > ${base}/../etc/final-maps/map.GB2312 + CHARMAPS="ARMSCII-8 Big5 CP1131 CP1251 \ CP866 GB2312 GBK ISCII-DEV ISO8859-1 \ ISO8859-13 ISO8859-15 ISO8859-2 ISO8859-4 \ ISO8859-5 ISO8859-7 ISO8859-9 KOI8-R KOI8-U \ - PT154 SJIS US-ASCII eucCN eucJP eucKR" + PT154 SJIS US-ASCII eucJP eucKR" # GB18030 blows up, use pre-generate Illumos version for map in ${CHARMAPS} do encoding=${map} - if [ ${map} = "Big5HKSCS" ] - then - encoding="Big5" - fi /usr/local/bin/perl 
${base}/convert_map.pl \ ${base}/../etc/charmaps/${map}.TXT ${encoding} \ | /usr/bin/sed -E -e 's/ +/ /g' \ @@ -73,6 +78,31 @@ then echo map ${map} converted. done +elif [ $1 = "colldef" ] +then + awk -v tmp4=${TEMP4} '$1 == "SAME+=" && $0 !~ /legacy/ { + orig=$2 + dest=$3 + gsub(/.*\./, "", orig) + gsub(/.*\./, "", dest) + if (orig != dest ) + print "LOCALES_MAPPED+=\t"$2 " "$3 > tmp4 + }' ${old}/Makefile + + for line in $(awk '{ print $3 }' ${TEMP4}); do + sed -i '' "/^SAME.*$line$/d" ${old}/Makefile + done + echo "" >> ${TEMP4} + for enc in ${COLLATIONS_SPECIAL}; do + sed -i '' "/^.*${enc}$/d" ${TEMP4} + echo "LOCALES+= ${enc}" >> ${TEMP4} + done + + keep=$(cat ${TEMP} | awk '{ print $2 }') + for original in ${keep} + do + cp ${old}/${original}.src ${new}/ + done else # below is everything but ctypedef keep=$(cat ${TEMP} | awk '{ print $2 }') @@ -85,4 +115,4 @@ fi grep -v '^LOCALES+' ${old}/Makefile | awk "${AWKCMD}" > ${new}/Makefile -rm -f ${TEMP} ${TEMP3} +rm -f ${TEMP} ${TEMP3} ${TEMP4}