Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 9 Sep 2017 00:06:46 +0000 (UTC)
From:      Warren Block <wblock@FreeBSD.org>
To:        doc-committers@freebsd.org, svn-doc-all@freebsd.org, svn-doc-head@freebsd.org
Subject:   svn commit: r50811 - head/share/tools/convert2utf8
Message-ID:  <201709090006.v8906kGc004911@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: wblock
Date: Sat Sep  9 00:06:46 2017
New Revision: 50811
URL: https://svnweb.freebsd.org/changeset/doc/50811

Log:
  Add a utility to convert documentation language subdirectories to UTF-8.

Added:
  head/share/tools/convert2utf8/
  head/share/tools/convert2utf8/convert2utf8   (contents, props changed)

Added: head/share/tools/convert2utf8/convert2utf8
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/share/tools/convert2utf8/convert2utf8	Sat Sep  9 00:06:46 2017	(r50811)
@@ -0,0 +1,194 @@
+#!/usr/bin/env perl
+# ts=4
+
+# Copyright (c) 2017 Warren Block
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD$
+
+# convert a FreeBSD documentation language subdirectory to UTF-8
+
+use strict;
+use warnings;
+use utf8;
+use open qw/:std :utf8/;
+
+use File::Basename;
+use Getopt::Std;
+
+our ($opt_d, $opt_e, $opt_h, $opt_n, $opt_v);	# option variables filled in by getopts() in main()
+
+my $file  = '/usr/bin/file';	# file(1), used by find_files to classify files
+my $find  = '/usr/bin/find';	# find(1), used by find_files to walk the tree
+my $grep  = '/usr/bin/grep';	# grep(1), used by find_files to filter file(1) output
+my $iconv = '/usr/bin/iconv';	# iconv(1), used by convert_file
+my $make  = '/usr/bin/make';	# make(1), used by make_clean
+my $svn   = '/usr/local/bin/svn';	# svn client, used by check_local_copy and rename_dir
+my $xargs = '/usr/bin/xargs';	# xargs(1), used by find_files
+
+my $docdir = '/usr/doc/en_US.ISO8859-1';	# directory to convert; replaced by -d
+my $exclpath = 'htdocs/releases/*/*.html';	# exclude pattern relative to $docdir; replaced by -e
+my $verbose = 0;	# set to 1 by -v to print progress messages
+
+# Print the help text (synopsis, option summary and a short description,
+# including the current default docdir and exclude path) to standard
+# output, then exit with status 0.  This sub never returns.
+sub usage {
+	my $me = basename($0);
+	my @help = (
+		"$me: convert FreeBSD documentation language subdirectory to UTF-8\n\n",
+		"Usage: $me -h\n",
+		"       $me [-v] [-n] [-d docdir] [-e excludepath]\n\n",
+		"       -v  verbose\n",
+		"       -n  trial run\n",
+		"       -d docdir (default $docdir)\n",
+		"       -e excludepath (relative to docdir, default $exclpath)\n\n",
+		"This program converts FreeBSD documentation files in legacy\n",
+		"encoding directories like en_US.ISO8859-1 to UTF-8.  The\n",
+		"documentation directory must be a Subversion checkout.  After\n",
+		"files are converted, the directory is renamed to *.UTF-8.\n\n",
+		"The default exclude path prevents conversion of statically-\n",
+		"generated HTML files in htdocs/releases/*/*.html.\n",
+	);
+	print @help;
+	exit 0;
+}
+
+# Verify that the Subversion working copy in $dn is at the same revision
+# as the repository HEAD, and die with a "** ..." message otherwise.
+#
+#   $dn - path of the working copy directory to check
+#
+# Dies if either revision cannot be determined (svn failed or its output
+# had no "Revision:" line) or if the two revisions differ.
+sub check_local_copy {
+	my ($dn) = @_;
+	print "checking local copy\n" if $verbose;
+
+	my $localrev = _svn_revision($dn);
+	die "** error checking local revision\n" unless defined $localrev;
+	print "localrev = $localrev\n" if $verbose;
+
+	my $remoterev = _svn_revision($dn, '-rHEAD');
+	die "** error checking remote revision\n" unless defined $remoterev;
+	print "remoterev = $remoterev\n" if $verbose;
+
+	die "** local copy not up to date, run svn up\n" if $localrev != $remoterev;
+}
+
+# Run "svn info $dn @extra" and return the number from its "Revision:"
+# line, or undef if svn could not be run, exited non-zero, or printed no
+# such line.  The list form of open bypasses the shell, so $dn needs no
+# quoting.
+sub _svn_revision {
+	my ($dn, @extra) = @_;
+	open(my $svnout, '-|', $svn, 'info', $dn, @extra) or return;
+	my $info = do { local $/; <$svnout> };
+	# close on a pipe returns false when the command exited non-zero
+	close $svnout or return;
+	return unless defined $info && $info =~ /Revision: (\d+)/;
+	return $1;
+}
+
+# Run "make clean" for the html, html-split, pdf and epub formats in the
+# documentation tree.
+#
+#   $dn - directory passed to make -C
+#
+# With -n the command is only printed (when verbose).  Otherwise make is
+# run without a shell, its standard output is discarded, and a warning is
+# printed if it could not be started or exited non-zero.  A failed clean
+# is reported but does not abort the conversion.
+sub make_clean {
+	my ($dn) = @_;
+	print "cleaning '$dn'\n" if $verbose;
+	my @cmd = ($make, '-C', $dn, 'FORMATS=html,html-split,pdf,epub', 'clean');
+	if ( $opt_n ) {
+		print "@cmd\n" if $verbose;
+		return;
+	}
+
+	my $ok = open(my $makeout, '-|', @cmd);
+	if ( $ok ) {
+		# read and drop make's output so it does not clutter the terminal
+		my @discard = <$makeout>;
+		# close on a pipe returns false when the command exited non-zero
+		$ok = close $makeout;
+	}
+	warn "** warning: '@cmd' failed\n" unless $ok;
+	return;
+}
+
+# Return the list of files under $dn that file(1) reports as XML, SGML or
+# BSD (makefile) content, skipping paths that match the exclude pattern.
+#
+#   $dn - root of the documentation tree to search
+#
+# Returns a list of path names taken from the "name: description" lines
+# printed by file(1).
+sub find_files {
+	my ($dn) = @_;
+	# find -path matches against the whole path it prints, so the pattern
+	# must be "<root>/<relative pattern>" with exactly one slash between
+	# the two.  Drop any trailing slashes from the root first.
+	(my $root = $dn) =~ s{(?<=.)/+\z}{};
+	my $exclude = "$root/$exclpath";
+	print "finding files to be converted in '$dn'\n" if $verbose;
+	print "excluding files matching \"$exclude\"\n" if $verbose;
+	# NOTE(review): grep sees the whole "name: description" line, so a
+	# path containing XML, SGML or BSD is selected whatever its type.
+	return map(/^(.*):/, `$find "$root" -not -path "$exclude" -type f -print0 | $xargs -0 $file | $grep 'XML\\|SGML\\|BSD'`);
+}
+
+# Convert one file to the target encoding and update the encoding names
+# and directory names mentioned inside it.
+#
+#   $fn   - path of the file to convert (overwritten in place)
+#   $bd   - base name of the source directory, e.g. en_US.ISO8859-1
+#   $iso  - language part of that name, e.g. en_US
+#   $from - source encoding name passed to iconv -f
+#   $to   - target encoding name passed to iconv -t
+#
+# Dies if iconv fails or the file cannot be rewritten.  With -n the
+# conversion is performed in memory only and the file is left untouched.
+sub convert_file {
+	my ($fn, $bd, $iso, $from, $to) = @_;
+	my $lcto = lc($to);
+	print "converting '$fn' from $from to $to\n" if $verbose;
+
+	# convert to utf-8 variable
+	my $contents = `$iconv -f $from -t $to $fn`;
+	die "** error converting '$fn'\n" if $?;
+
+	# do fixups:
+	# change <?xml version="1.0" encoding="whatever" ?> to target code
+	$contents =~ s/encoding=".*?"/encoding="$lcto"/g;
+	# change HTML charset, but do not change :charset=: strings; the
+	# lookbehind checks the preceding character without deleting it
+	$contents =~ s/(?<!:)charset=[-a-z0-9]+/charset=$lcto/g;
+	# change "en_US.ISO8859-1"; \Q...\E makes the dot match literally
+	$contents =~ s/\Q$bd\E/$iso.$to/g;
+	# change <!ENTITY xml.encoding        'iso-8859-1'>, stopping at the
+	# first ">" so anything after the declaration is preserved
+	$contents =~ s/<!ENTITY xml\.encoding(\s+)[^>]*>/<!ENTITY xml.encoding$1'$lcto'>/;
+
+	# TODO: change character entities?
+
+	unless ( $opt_n ) {
+		# overwrite original
+		open my $fh, ">", $fn or die "** could not open '$fn' to write: $!\n";
+		print $fh $contents;
+		close $fh or die "** could not close '$fn': $!\n";
+	}
+}
+
+# Rename the documentation directory with "svn mv".
+#
+#   $fromdir - existing directory in the working copy
+#   $todir   - new name for it
+#
+# With -n the command is only printed (when verbose).  Otherwise svn is
+# run without a shell and its standard output is discarded.  Dies if svn
+# cannot be started or exits non-zero, so the caller never reports a
+# rename that did not happen.
+sub rename_dir {
+	my ($fromdir, $todir) = @_;
+	my @cmd = ($svn, 'mv', $fromdir, $todir);
+	if ( $opt_n ) {
+		print "@cmd\n" if $verbose;
+		return;
+	}
+
+	open(my $svnout, '-|', @cmd) or die "** could not run '@cmd': $!\n";
+	# read and drop svn's per-file output
+	my @discard = <$svnout>;
+	# close on a pipe returns false when the command exited non-zero
+	close $svnout or die "** error renaming '$fromdir' to '$todir'\n";
+	return;
+}
+
+
+# Program entry point: parse options, validate the documentation
+# directory name, then check the working copy, clean it, convert every
+# selected file and rename the directory to <lang>.UTF-8.
+#
+# Dies with a "** ..." message on any validation failure.  An unknown
+# option prints the usage text and exits instead of starting a run.
+sub main {
+	# getopts returns false on an unknown option or a missing argument
+	getopts('d:e:hnv') or usage();
+
+	usage() if $opt_h;
+	$verbose = 1 if $opt_v;
+	$docdir = $opt_d if $opt_d;
+	$exclpath = $opt_e if $opt_e;
+
+	# sanitize exclude path, find is very picky about matches
+	# convert every run of multiple slashes to a single slash
+	$exclpath =~ s%/{2,}%/%g;
+	die "** exclude path '$exclpath' must be relative (under '$docdir')\n" if $exclpath =~ m%^/%;
+
+	my $basedir = basename($docdir);
+	die "** '$docdir' does not have a standard ISO xy_AB directory name\n" unless $basedir =~ /^([a-z]{2}_[A-Z]{1,3})\./;
+	my $isolang = $1;
+
+	die "** no language code on target directory\n" unless $basedir =~ /\.(.*)$/;
+	my $fromcode = $1;
+
+	die "** no Makefile in '$docdir'\n" unless -f "$docdir/Makefile";
+
+	my $tocode = "UTF-8";
+	# source and target directory would be the same; nothing to convert
+	die "** '$docdir' is already encoded as $tocode\n" if uc($fromcode) eq $tocode;
+	my $targdir = dirname($docdir) . "/$isolang.$tocode";
+
+	if ( $verbose ) {
+		print "docdir   = '$docdir'\n";
+		print "basedir  = '$basedir'\n";
+		print "isolang  = '$isolang'\n";
+		print "fromcode = '$fromcode'\n";
+		print "tocode   = '$tocode'\n";
+		print "targdir  = '$targdir'\n";
+	}
+
+	check_local_copy($docdir);
+
+	make_clean($docdir);
+
+	my @files = find_files($docdir);
+
+	for my $f (@files) {
+		convert_file($f, $basedir, $isolang, $fromcode, $tocode);
+	}
+
+	rename_dir($docdir, $targdir);
+
+	print "Done.\n";
+	print "XML files in $docdir converted from $fromcode to $tocode, directory renamed to $targdir\n";
+}
+
+main();



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201709090006.v8906kGc004911>