Date: Sun, 15 Jun 2014 21:55:22 GMT From: ghostmansd@FreeBSD.org To: svn-soc-all@FreeBSD.org Subject: socsvn commit: r269600 - soc2014/ghostmansd/normalize Message-ID: <201406152155.s5FLtMsW021408@socsvn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: ghostmansd Date: Sun Jun 15 21:55:22 2014 New Revision: 269600 URL: http://svnweb.FreeBSD.org/socsvn/?view=rev&rev=269600 Log: Unicode Normalization Algorithm: Hangul composition Unicode Normalization Algorithm is reimplemented. NFD and NFC algorithms are available as standalone functions. Hangul composition and decomposition are arithmetically-based. hangul_syllable() function was added to quickly check type of Hangul syllable (lead, vowel, trail or combination). The normalization functions are almost finished; the last part is to implement database lookup, using Unicode Database files (e.g. UnicodeData.txt). Added: soc2014/ghostmansd/normalize/Makefile soc2014/ghostmansd/normalize/hangul.h Modified: soc2014/ghostmansd/normalize/main.c soc2014/ghostmansd/normalize/strnorm.c soc2014/ghostmansd/normalize/wcsnorm.c Added: soc2014/ghostmansd/normalize/Makefile ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ soc2014/ghostmansd/normalize/Makefile Sun Jun 15 21:55:22 2014 (r269600) @@ -0,0 +1,8 @@ +CFLAGS := -g -std=c89 -O0 -pedantic \ +-Werror -Wall -Wextra -Wundef -Wshadow -Waggregate-return -Wstrict-prototypes \ +-Wcast-qual -Wcast-align -Wswitch-default -Wswitch-enum -Wwrite-strings \ +-Wpointer-arith -Wno-long-long -Wno-format -Wno-unreachable-code \ +-Wno-unused-function + +all: + $(CC) $(CFLAGS) -o main main.c wcsnorm.c Added: soc2014/ghostmansd/normalize/hangul.h ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ soc2014/ghostmansd/normalize/hangul.h Sun Jun 15 21:55:22 2014 (r269600) @@ -0,0 +1,600 @@ +/* + * Copyright (c) 2014 Dmitry Selyutin <ghostmansd@FreeBSD.org> + * at Lomonosov Moscow State University - www.msu.ru + * All rights reserved. + * + * Copyright (c) 2014 The FreeBSD Foundation + * All rights reserved. + * Portions of this software were developed by David Chisnall + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _HANGUL_H_ +#define _HANGUL_H_ + + +#define HANGUL_LEAD_TYPE 1 +#define HANGUL_VOWEL_TYPE 2 +#define HANGUL_TRAIL_TYPE 3 +#define HANGUL_LEAD_VOWEL_TYPE 4 +#define HANGUL_LEAD_VOWEL_TRAIL_TYPE 5 + + +/* The source code was automatically generated from HangulSyllableType.txt. + * This function is used to check type of the Hangul syllables. */ +static int hangul_syllable(wchar_t code) +{ + if (((0x1100 <= code) && (code <= 0x115F)) + || ((0xA960 <= code) && (code <= 0xA97C))) + return HANGUL_LEAD_TYPE; + + if (((0x1160 <= code) && (code <= 0x11A7)) + || ((0xD7B0 <= code) && (code <= 0xD7C6))) + return HANGUL_VOWEL_TYPE; + + if (((0x11A8 <= code) && (code <= 0x11FF)) + || ((0xD7CB <= code) && (code <= 0xD7FB))) + return HANGUL_TRAIL_TYPE; + + if ((code == 0xAC00)|| (code == 0xAC1C) || (code == 0xAC38) + || (code == 0xAC54) || (code == 0xAC70) || (code == 0xAC8C) + || (code == 0xACA8) || (code == 0xACC4) || (code == 0xACE0) + || (code == 0xACFC) || (code == 0xAD18) || (code == 0xAD34) + || (code == 0xAD50) || (code == 0xAD6C) || (code == 0xAD88) + || (code == 0xADA4) || (code == 0xADC0) || (code == 0xADDC) + || (code == 0xADF8) || (code == 0xAE14) || (code == 0xAE30) + || (code == 0xAE4C) || (code == 0xAE68) || (code == 0xAE84) + || (code == 0xAEA0) || (code == 0xAEBC) || (code == 0xAED8) + || (code == 0xAEF4) || (code == 0xAF10) || (code == 0xAF2C) + || (code == 0xAF48) || (code == 0xAF64) || (code == 0xAF80) + || (code == 0xAF9C) || (code == 0xAFB8) || (code == 0xAFD4) + || (code == 0xAFF0) || (code == 0xB00C) || (code == 0xB028) + || (code == 0xB044) || (code == 0xB060) || (code == 0xB07C) + || (code == 0xB098) || (code == 0xB0B4) || (code == 0xB0D0) + || (code == 0xB0EC) || (code == 0xB108) || (code == 0xB124) + || (code == 0xB140) || (code == 0xB15C) || (code == 0xB178) + || (code == 0xB194) || (code == 0xB1B0) || (code == 0xB1CC) + || (code == 0xB1E8) || (code == 0xB204) || (code == 0xB220) + || (code == 0xB23C) || (code == 0xB258) || (code == 0xB274) + || (code == 0xB290) || (code == 0xB2AC) || (code == 0xB2C8) + || (code == 0xB2E4) || (code == 0xB300) || (code == 0xB31C) + || (code == 0xB338) || (code == 0xB354) || (code == 0xB370) + || (code == 0xB38C) || (code == 0xB3A8) || (code == 0xB3C4) + || (code == 0xB3E0) || (code == 0xB3FC) || (code == 0xB418) + || (code == 0xB434) || (code == 0xB450) || (code == 0xB46C) + || (code == 0xB488) || (code == 0xB4A4) || (code == 0xB4C0) + || (code == 0xB4DC) || (code == 0xB4F8) || (code == 0xB514) + || (code == 0xB530) || (code == 0xB54C) || (code == 0xB568) + || (code == 0xB584) || (code == 0xB5A0) || (code == 0xB5BC) + || (code == 0xB5D8) || (code == 0xB5F4) || (code == 0xB610) + || (code == 0xB62C) || (code == 0xB648) || (code == 0xB664) + || (code == 0xB680) || (code == 0xB69C) || (code == 0xB6B8) + || (code == 0xB6D4) || (code == 0xB6F0) || (code == 0xB70C) + || (code == 0xB728) || (code == 0xB744) || (code == 0xB760) + || (code == 0xB77C) || (code == 0xB798) || (code == 0xB7B4) + || (code == 0xB7D0) || (code == 0xB7EC) || (code == 0xB808) + || (code == 0xB824) || (code == 0xB840) || (code == 0xB85C) + || (code == 0xB878) || (code == 0xB894) || (code == 0xB8B0) + || (code == 0xB8CC) || (code == 0xB8E8) || (code == 0xB904) + || (code == 0xB920) || (code == 0xB93C) || (code == 0xB958) + || (code == 0xB974) || (code == 0xB990) || (code == 0xB9AC) + || (code == 0xB9C8) || (code == 0xB9E4) || (code == 0xBA00) + || (code == 0xBA1C) || (code == 0xBA38) || (code == 0xBA54) + || (code == 0xBA70) || (code == 0xBA8C) || (code == 0xBAA8) + || (code == 0xBAC4) || (code == 0xBAE0) || (code == 0xBAFC) + || (code == 0xBB18) || (code == 0xBB34) || (code == 0xBB50) + || (code == 0xBB6C) || (code == 0xBB88) || (code == 0xBBA4) + || (code == 0xBBC0) || (code == 0xBBDC) || (code == 0xBBF8) + || (code == 0xBC14) || (code == 0xBC30) || (code == 0xBC4C) + || (code == 0xBC68) || (code == 0xBC84) || (code == 0xBCA0) + || (code == 0xBCBC) || (code == 0xBCD8) || (code == 0xBCF4) + || (code == 0xBD10) || (code == 0xBD2C) || (code == 0xBD48) + || (code == 0xBD64) || (code == 0xBD80) || (code == 0xBD9C) + || (code == 0xBDB8) || (code == 0xBDD4) || (code == 0xBDF0) + || (code == 0xBE0C) || (code == 0xBE28) || (code == 0xBE44) + || (code == 0xBE60) || (code == 0xBE7C) || (code == 0xBE98) + || (code == 0xBEB4) || (code == 0xBED0) || (code == 0xBEEC) + || (code == 0xBF08) || (code == 0xBF24) || (code == 0xBF40) + || (code == 0xBF5C) || (code == 0xBF78) || (code == 0xBF94) + || (code == 0xBFB0) || (code == 0xBFCC) || (code == 0xBFE8) + || (code == 0xC004) || (code == 0xC020) || (code == 0xC03C) + || (code == 0xC058) || (code == 0xC074) || (code == 0xC090) + || (code == 0xC0AC) || (code == 0xC0C8) || (code == 0xC0E4) + || (code == 0xC100) || (code == 0xC11C) || (code == 0xC138) + || (code == 0xC154) || (code == 0xC170) || (code == 0xC18C) + || (code == 0xC1A8) || (code == 0xC1C4) || (code == 0xC1E0) + || (code == 0xC1FC) || (code == 0xC218) || (code == 0xC234) + || (code == 0xC250) || (code == 0xC26C) || (code == 0xC288) + || (code == 0xC2A4) || (code == 0xC2C0) || (code == 0xC2DC) + || (code == 0xC2F8) || (code == 0xC314) || (code == 0xC330) + || (code == 0xC34C) || (code == 0xC368) || (code == 0xC384) + || (code == 0xC3A0) || (code == 0xC3BC) || (code == 0xC3D8) + || (code == 0xC3F4) || (code == 0xC410) || (code == 0xC42C) + || (code == 0xC448) || (code == 0xC464) || (code == 0xC480) + || (code == 0xC49C) || (code == 0xC4B8) || (code == 0xC4D4) + || (code == 0xC4F0) || (code == 0xC50C) || (code == 0xC528) + || (code == 0xC544) || (code == 0xC560) || (code == 0xC57C) + || (code == 0xC598) || (code == 0xC5B4) || (code == 0xC5D0) + || (code == 0xC5EC) || (code == 0xC608) || (code == 0xC624) + || (code == 0xC640) || (code == 0xC65C) || (code == 0xC678) + || (code == 0xC694) || (code == 0xC6B0) || (code == 0xC6CC) + || (code == 0xC6E8) || (code == 0xC704) || (code == 0xC720) + || (code == 0xC73C) || (code == 0xC758) || (code == 0xC774) + || (code == 0xC790) || (code == 0xC7AC) || (code == 0xC7C8) + || (code == 0xC7E4) || (code == 0xC800) || (code == 0xC81C) + || (code == 0xC838) || (code == 0xC854) || (code == 0xC870) + || (code == 0xC88C) || (code == 0xC8A8) || (code == 0xC8C4) + || (code == 0xC8E0) || (code == 0xC8FC) || (code == 0xC918) + || (code == 0xC934) || (code == 0xC950) || (code == 0xC96C) + || (code == 0xC988) || (code == 0xC9A4) || (code == 0xC9C0) + || (code == 0xC9DC) || (code == 0xC9F8) || (code == 0xCA14) + || (code == 0xCA30) || (code == 0xCA4C) || (code == 0xCA68) + || (code == 0xCA84) || (code == 0xCAA0) || (code == 0xCABC) + || (code == 0xCAD8) || (code == 0xCAF4) || (code == 0xCB10) + || (code == 0xCB2C) || (code == 0xCB48) || (code == 0xCB64) + || (code == 0xCB80) || (code == 0xCB9C) || (code == 0xCBB8) + || (code == 0xCBD4) || (code == 0xCBF0) || (code == 0xCC0C) + || (code == 0xCC28) || (code == 0xCC44) || (code == 0xCC60) + || (code == 0xCC7C) || (code == 0xCC98) || (code == 0xCCB4) + || (code == 0xCCD0) || (code == 0xCCEC) || (code == 0xCD08) + || (code == 0xCD24) || (code == 0xCD40) || (code == 0xCD5C) + || (code == 0xCD78) || (code == 0xCD94) || (code == 0xCDB0) + || (code == 0xCDCC) || (code == 0xCDE8) || (code == 0xCE04) + || (code == 0xCE20) || (code == 0xCE3C) || (code == 0xCE58) + || (code == 0xCE74) || (code == 0xCE90) || (code == 0xCEAC) + || (code == 0xCEC8) || (code == 0xCEE4) || (code == 0xCF00) + || (code == 0xCF1C) || (code == 0xCF38) || (code == 0xCF54) + || (code == 0xCF70) || (code == 0xCF8C) || (code == 0xCFA8) + || (code == 0xCFC4) || (code == 0xCFE0) || (code == 0xCFFC) + || (code == 0xD018) || (code == 0xD034) || (code == 0xD050) + || (code == 0xD06C) || (code == 0xD088) || (code == 0xD0A4) + || (code == 0xD0C0) || (code == 0xD0DC) || (code == 0xD0F8) + || (code == 0xD114) || (code == 0xD130) || (code == 0xD14C) + || (code == 0xD168) || (code == 0xD184) || (code == 0xD1A0) + || (code == 0xD1BC) || (code == 0xD1D8) || (code == 0xD1F4) + || (code == 0xD210) || (code == 0xD22C) || (code == 0xD248) + || (code == 0xD264) || (code == 0xD280) || (code == 0xD29C) + || (code == 0xD2B8) || (code == 0xD2D4) || (code == 0xD2F0) + || (code == 0xD30C) || (code == 0xD328) || (code == 0xD344) + || (code == 0xD360) || (code == 0xD37C) || (code == 0xD398) + || (code == 0xD3B4) || (code == 0xD3D0) || (code == 0xD3EC) + || (code == 0xD408) || (code == 0xD424) || (code == 0xD440) + || (code == 0xD45C) || (code == 0xD478) || (code == 0xD494) + || (code == 0xD4B0) || (code == 0xD4CC) || (code == 0xD4E8) + || (code == 0xD504) || (code == 0xD520) || (code == 0xD53C) + || (code == 0xD558) || (code == 0xD574) || (code == 0xD590) + || (code == 0xD5AC) || (code == 0xD5C8) || (code == 0xD5E4) + || (code == 0xD600) || (code == 0xD61C) || (code == 0xD638) + || (code == 0xD654) || (code == 0xD670) || (code == 0xD68C) + || (code == 0xD6A8) || (code == 0xD6C4) || (code == 0xD6E0) + || (code == 0xD6FC) || (code == 0xD718) || (code == 0xD734) + || (code == 0xD750) || (code == 0xD76C) || (code == 0xD788)) + return HANGUL_LEAD_VOWEL_TYPE; + + if (((0xAC01 <= code) && (code <= 0xAC1B)) + || ((0xAC1D <= code) && (code <= 0xAC37)) + || ((0xAC39 <= code) && (code <= 0xAC53)) + || ((0xAC55 <= code) && (code <= 0xAC6F)) + || ((0xAC71 <= code) && (code <= 0xAC8B)) + || ((0xAC8D <= code) && (code <= 0xACA7)) + || ((0xACA9 <= code) && (code <= 0xACC3)) + || ((0xACC5 <= code) && (code <= 0xACDF)) + || ((0xACE1 <= code) && (code <= 0xACFB)) + || ((0xACFD <= code) && (code <= 0xAD17)) + || ((0xAD19 <= code) && (code <= 0xAD33)) + || ((0xAD35 <= code) && (code <= 0xAD4F)) + || ((0xAD51 <= code) && (code <= 0xAD6B)) + || ((0xAD6D <= code) && (code <= 0xAD87)) + || ((0xAD89 <= code) && (code <= 0xADA3)) + || ((0xADA5 <= code) && (code <= 0xADBF)) + || ((0xADC1 <= code) && (code <= 0xADDB)) + || ((0xADDD <= code) && (code <= 0xADF7)) + || ((0xADF9 <= code) && (code <= 0xAE13)) + || ((0xAE15 <= code) && (code <= 0xAE2F)) + || ((0xAE31 <= code) && (code <= 0xAE4B)) + || ((0xAE4D <= code) && (code <= 0xAE67)) + || ((0xAE69 <= code) && (code <= 0xAE83)) + || ((0xAE85 <= code) && (code <= 0xAE9F)) + || ((0xAEA1 <= code) && (code <= 0xAEBB)) + || ((0xAEBD <= code) && (code <= 0xAED7)) + || ((0xAED9 <= code) && (code <= 0xAEF3)) + || ((0xAEF5 <= code) && (code <= 0xAF0F)) + || ((0xAF11 <= code) && (code <= 0xAF2B)) + || ((0xAF2D <= code) && (code <= 0xAF47)) + || ((0xAF49 <= code) && (code <= 0xAF63)) + || ((0xAF65 <= code) && (code <= 0xAF7F)) + || ((0xAF81 <= code) && (code <= 0xAF9B)) + || ((0xAF9D <= code) && (code <= 0xAFB7)) + || ((0xAFB9 <= code) && (code <= 0xAFD3)) + || ((0xAFD5 <= code) && (code <= 0xAFEF)) + || ((0xAFF1 <= code) && (code <= 0xB00B)) + || ((0xB00D <= code) && (code <= 0xB027)) + || ((0xB029 <= code) && (code <= 0xB043)) + || ((0xB045 <= code) && (code <= 0xB05F)) + || ((0xB061 <= code) && (code <= 0xB07B)) + || ((0xB07D <= code) && (code <= 0xB097)) + || ((0xB099 <= code) && (code <= 0xB0B3)) + || ((0xB0B5 <= code) && (code <= 0xB0CF)) + || ((0xB0D1 <= code) && (code <= 0xB0EB)) + || ((0xB0ED <= code) && (code <= 0xB107)) + || ((0xB109 <= code) && (code <= 0xB123)) + || ((0xB125 <= code) && (code <= 0xB13F)) + || ((0xB141 <= code) && (code <= 0xB15B)) + || ((0xB15D <= code) && (code <= 0xB177)) + || ((0xB179 <= code) && (code <= 0xB193)) + || ((0xB195 <= code) && (code <= 0xB1AF)) + || ((0xB1B1 <= code) && (code <= 0xB1CB)) + || ((0xB1CD <= code) && (code <= 0xB1E7)) + || ((0xB1E9 <= code) && (code <= 0xB203)) + || ((0xB205 <= code) && (code <= 0xB21F)) + || ((0xB221 <= code) && (code <= 0xB23B)) + || ((0xB23D <= code) && (code <= 0xB257)) + || ((0xB259 <= code) && (code <= 0xB273)) + || ((0xB275 <= code) && (code <= 0xB28F)) + || ((0xB291 <= code) && (code <= 0xB2AB)) + || ((0xB2AD <= code) && (code <= 0xB2C7)) + || ((0xB2C9 <= code) && (code <= 0xB2E3)) + || ((0xB2E5 <= code) && (code <= 0xB2FF)) + || ((0xB301 <= code) && (code <= 0xB31B)) + || ((0xB31D <= code) && (code <= 0xB337)) + || ((0xB339 <= code) && (code <= 0xB353)) + || ((0xB355 <= code) && (code <= 0xB36F)) + || ((0xB371 <= code) && (code <= 0xB38B)) + || ((0xB38D <= code) && (code <= 0xB3A7)) + || ((0xB3A9 <= code) && (code <= 0xB3C3)) + || ((0xB3C5 <= code) && (code <= 0xB3DF)) + || ((0xB3E1 <= code) && (code <= 0xB3FB)) + || ((0xB3FD <= code) && (code <= 0xB417)) + || ((0xB419 <= code) && (code <= 0xB433)) + || ((0xB435 <= code) && (code <= 0xB44F)) + || ((0xB451 <= code) && (code <= 0xB46B)) + || ((0xB46D <= code) && (code <= 0xB487)) + || ((0xB489 <= code) && (code <= 0xB4A3)) + || ((0xB4A5 <= code) && (code <= 0xB4BF)) + || ((0xB4C1 <= code) && (code <= 0xB4DB)) + || ((0xB4DD <= code) && (code <= 0xB4F7)) + || ((0xB4F9 <= code) && (code <= 0xB513)) + || ((0xB515 <= code) && (code <= 0xB52F)) + || ((0xB531 <= code) && (code <= 0xB54B)) + || ((0xB54D <= code) && (code <= 0xB567)) + || ((0xB569 <= code) && (code <= 0xB583)) + || ((0xB585 <= code) && (code <= 0xB59F)) + || ((0xB5A1 <= code) && (code <= 0xB5BB)) + || ((0xB5BD <= code) && (code <= 0xB5D7)) + || ((0xB5D9 <= code) && (code <= 0xB5F3)) + || ((0xB5F5 <= code) && (code <= 0xB60F)) + || ((0xB611 <= code) && (code <= 0xB62B)) + || ((0xB62D <= code) && (code <= 0xB647)) + || ((0xB649 <= code) && (code <= 0xB663)) + || ((0xB665 <= code) && (code <= 0xB67F)) + || ((0xB681 <= code) && (code <= 0xB69B)) + || ((0xB69D <= code) && (code <= 0xB6B7)) + || ((0xB6B9 <= code) && (code <= 0xB6D3)) + || ((0xB6D5 <= code) && (code <= 0xB6EF)) + || ((0xB6F1 <= code) && (code <= 0xB70B)) + || ((0xB70D <= code) && (code <= 0xB727)) + || ((0xB729 <= code) && (code <= 0xB743)) + || ((0xB745 <= code) && (code <= 0xB75F)) + || ((0xB761 <= code) && (code <= 0xB77B)) + || ((0xB77D <= code) && (code <= 0xB797)) + || ((0xB799 <= code) && (code <= 0xB7B3)) + || ((0xB7B5 <= code) && (code <= 0xB7CF)) + || ((0xB7D1 <= code) && (code <= 0xB7EB)) + || ((0xB7ED <= code) && (code <= 0xB807)) + || ((0xB809 <= code) && (code <= 0xB823)) + || ((0xB825 <= code) && (code <= 0xB83F)) + || ((0xB841 <= code) && (code <= 0xB85B)) + || ((0xB85D <= code) && (code <= 0xB877)) + || ((0xB879 <= code) && (code <= 0xB893)) + || ((0xB895 <= code) && (code <= 0xB8AF)) + || ((0xB8B1 <= code) && (code <= 0xB8CB)) + || ((0xB8CD <= code) && (code <= 0xB8E7)) + || ((0xB8E9 <= code) && (code <= 0xB903)) + || ((0xB905 <= code) && (code <= 0xB91F)) + || ((0xB921 <= code) && (code <= 0xB93B)) + || ((0xB93D <= code) && (code <= 0xB957)) + || ((0xB959 <= code) && (code <= 0xB973)) + || ((0xB975 <= code) && (code <= 0xB98F)) + || ((0xB991 <= code) && (code <= 0xB9AB)) + || ((0xB9AD <= code) && (code <= 0xB9C7)) + || ((0xB9C9 <= code) && (code <= 0xB9E3)) + || ((0xB9E5 <= code) && (code <= 0xB9FF)) + || ((0xBA01 <= code) && (code <= 0xBA1B)) + || ((0xBA1D <= code) && (code <= 0xBA37)) + || ((0xBA39 <= code) && (code <= 0xBA53)) + || ((0xBA55 <= code) && (code <= 0xBA6F)) + || ((0xBA71 <= code) && (code <= 0xBA8B)) + || ((0xBA8D <= code) && (code <= 0xBAA7)) + || ((0xBAA9 <= code) && (code <= 0xBAC3)) + || ((0xBAC5 <= code) && (code <= 0xBADF)) + || ((0xBAE1 <= code) && (code <= 0xBAFB)) + || ((0xBAFD <= code) && (code <= 0xBB17)) + || ((0xBB19 <= code) && (code <= 0xBB33)) + || ((0xBB35 <= code) && (code <= 0xBB4F)) + || ((0xBB51 <= code) && (code <= 0xBB6B)) + || ((0xBB6D <= code) && (code <= 0xBB87)) + || ((0xBB89 <= code) && (code <= 0xBBA3)) + || ((0xBBA5 <= code) && (code <= 0xBBBF)) + || ((0xBBC1 <= code) && (code <= 0xBBDB)) + || ((0xBBDD <= code) && (code <= 0xBBF7)) + || ((0xBBF9 <= code) && (code <= 0xBC13)) + || ((0xBC15 <= code) && (code <= 0xBC2F)) + || ((0xBC31 <= code) && (code <= 0xBC4B)) + || ((0xBC4D <= code) && (code <= 0xBC67)) + || ((0xBC69 <= code) && (code <= 0xBC83)) + || ((0xBC85 <= code) && (code <= 0xBC9F)) + || ((0xBCA1 <= code) && (code <= 0xBCBB)) + || ((0xBCBD <= code) && (code <= 0xBCD7)) + || ((0xBCD9 <= code) && (code <= 0xBCF3)) + || ((0xBCF5 <= code) && (code <= 0xBD0F)) + || ((0xBD11 <= code) && (code <= 0xBD2B)) + || ((0xBD2D <= code) && (code <= 0xBD47)) + || ((0xBD49 <= code) && (code <= 0xBD63)) + || ((0xBD65 <= code) && (code <= 0xBD7F)) + || ((0xBD81 <= code) && (code <= 0xBD9B)) + || ((0xBD9D <= code) && (code <= 0xBDB7)) + || ((0xBDB9 <= code) && (code <= 0xBDD3)) + || ((0xBDD5 <= code) && (code <= 0xBDEF)) + || ((0xBDF1 <= code) && (code <= 0xBE0B)) + || ((0xBE0D <= code) && (code <= 0xBE27)) + || ((0xBE29 <= code) && (code <= 0xBE43)) + || ((0xBE45 <= code) && (code <= 0xBE5F)) + || ((0xBE61 <= code) && (code <= 0xBE7B)) + || ((0xBE7D <= code) && (code <= 0xBE97)) + || ((0xBE99 <= code) && (code <= 0xBEB3)) + || ((0xBEB5 <= code) && (code <= 0xBECF)) + || ((0xBED1 <= code) && (code <= 0xBEEB)) + || ((0xBEED <= code) && (code <= 0xBF07)) + || ((0xBF09 <= code) && (code <= 0xBF23)) + || ((0xBF25 <= code) && (code <= 0xBF3F)) + || ((0xBF41 <= code) && (code <= 0xBF5B)) + || ((0xBF5D <= code) && (code <= 0xBF77)) + || ((0xBF79 <= code) && (code <= 0xBF93)) + || ((0xBF95 <= code) && (code <= 0xBFAF)) + || ((0xBFB1 <= code) && (code <= 0xBFCB)) + || ((0xBFCD <= code) && (code <= 0xBFE7)) + || ((0xBFE9 <= code) && (code <= 0xC003)) + || ((0xC005 <= code) && (code <= 0xC01F)) + || ((0xC021 <= code) && (code <= 0xC03B)) + || ((0xC03D <= code) && (code <= 0xC057)) + || ((0xC059 <= code) && (code <= 0xC073)) + || ((0xC075 <= code) && (code <= 0xC08F)) + || ((0xC091 <= code) && (code <= 0xC0AB)) + || ((0xC0AD <= code) && (code <= 0xC0C7)) + || ((0xC0C9 <= code) && (code <= 0xC0E3)) + || ((0xC0E5 <= code) && (code <= 0xC0FF)) + || ((0xC101 <= code) && (code <= 0xC11B)) + || ((0xC11D <= code) && (code <= 0xC137)) + || ((0xC139 <= code) && (code <= 0xC153)) + || ((0xC155 <= code) && (code <= 0xC16F)) + || ((0xC171 <= code) && (code <= 0xC18B)) + || ((0xC18D <= code) && (code <= 0xC1A7)) + || ((0xC1A9 <= code) && (code <= 0xC1C3)) + || ((0xC1C5 <= code) && (code <= 0xC1DF)) + || ((0xC1E1 <= code) && (code <= 0xC1FB)) + || ((0xC1FD <= code) && (code <= 0xC217)) + || ((0xC219 <= code) && (code <= 0xC233)) + || ((0xC235 <= code) && (code <= 0xC24F)) + || ((0xC251 <= code) && (code <= 0xC26B)) + || ((0xC26D <= code) && (code <= 0xC287)) + || ((0xC289 <= code) && (code <= 0xC2A3)) + || ((0xC2A5 <= code) && (code <= 0xC2BF)) + || ((0xC2C1 <= code) && (code <= 0xC2DB)) + || ((0xC2DD <= code) && (code <= 0xC2F7)) + || ((0xC2F9 <= code) && (code <= 0xC313)) + || ((0xC315 <= code) && (code <= 0xC32F)) + || ((0xC331 <= code) && (code <= 0xC34B)) + || ((0xC34D <= code) && (code <= 0xC367)) + || ((0xC369 <= code) && (code <= 0xC383)) + || ((0xC385 <= code) && (code <= 0xC39F)) + || ((0xC3A1 <= code) && (code <= 0xC3BB)) + || ((0xC3BD <= code) && (code <= 0xC3D7)) + || ((0xC3D9 <= code) && (code <= 0xC3F3)) + || ((0xC3F5 <= code) && (code <= 0xC40F)) + || ((0xC411 <= code) && (code <= 0xC42B)) + || ((0xC42D <= code) && (code <= 0xC447)) + || ((0xC449 <= code) && (code <= 0xC463)) + || ((0xC465 <= code) && (code <= 0xC47F)) + || ((0xC481 <= code) && (code <= 0xC49B)) + || ((0xC49D <= code) && (code <= 0xC4B7)) + || ((0xC4B9 <= code) && (code <= 0xC4D3)) + || ((0xC4D5 <= code) && (code <= 0xC4EF)) + || ((0xC4F1 <= code) && (code <= 0xC50B)) + || ((0xC50D <= code) && (code <= 0xC527)) + || ((0xC529 <= code) && (code <= 0xC543)) + || ((0xC545 <= code) && (code <= 0xC55F)) + || ((0xC561 <= code) && (code <= 0xC57B)) + || ((0xC57D <= code) && (code <= 0xC597)) + || ((0xC599 <= code) && (code <= 0xC5B3)) + || ((0xC5B5 <= code) && (code <= 0xC5CF)) + || ((0xC5D1 <= code) && (code <= 0xC5EB)) + || ((0xC5ED <= code) && (code <= 0xC607)) + || ((0xC609 <= code) && (code <= 0xC623)) + || ((0xC625 <= code) && (code <= 0xC63F)) + || ((0xC641 <= code) && (code <= 0xC65B)) + || ((0xC65D <= code) && (code <= 0xC677)) + || ((0xC679 <= code) && (code <= 0xC693)) + || ((0xC695 <= code) && (code <= 0xC6AF)) + || ((0xC6B1 <= code) && (code <= 0xC6CB)) + || ((0xC6CD <= code) && (code <= 0xC6E7)) + || ((0xC6E9 <= code) && (code <= 0xC703)) + || ((0xC705 <= code) && (code <= 0xC71F)) + || ((0xC721 <= code) && (code <= 0xC73B)) + || ((0xC73D <= code) && (code <= 0xC757)) + || ((0xC759 <= code) && (code <= 0xC773)) + || ((0xC775 <= code) && (code <= 0xC78F)) + || ((0xC791 <= code) && (code <= 0xC7AB)) + || ((0xC7AD <= code) && (code <= 0xC7C7)) + || ((0xC7C9 <= code) && (code <= 0xC7E3)) + || ((0xC7E5 <= code) && (code <= 0xC7FF)) + || ((0xC801 <= code) && (code <= 0xC81B)) + || ((0xC81D <= code) && (code <= 0xC837)) + || ((0xC839 <= code) && (code <= 0xC853)) + || ((0xC855 <= code) && (code <= 0xC86F)) + || ((0xC871 <= code) && (code <= 0xC88B)) + || ((0xC88D <= code) && (code <= 0xC8A7)) + || ((0xC8A9 <= code) && (code <= 0xC8C3)) + || ((0xC8C5 <= code) && (code <= 0xC8DF)) + || ((0xC8E1 <= code) && (code <= 0xC8FB)) + || ((0xC8FD <= code) && (code <= 0xC917)) + || ((0xC919 <= code) && (code <= 0xC933)) + || ((0xC935 <= code) && (code <= 0xC94F)) + || ((0xC951 <= code) && (code <= 0xC96B)) + || ((0xC96D <= code) && (code <= 0xC987)) + || ((0xC989 <= code) && (code <= 0xC9A3)) + || ((0xC9A5 <= code) && (code <= 0xC9BF)) + || ((0xC9C1 <= code) && (code <= 0xC9DB)) + || ((0xC9DD <= code) && (code <= 0xC9F7)) + || ((0xC9F9 <= code) && (code <= 0xCA13)) + || ((0xCA15 <= code) && (code <= 0xCA2F)) + || ((0xCA31 <= code) && (code <= 0xCA4B)) + || ((0xCA4D <= code) && (code <= 0xCA67)) + || ((0xCA69 <= code) && (code <= 0xCA83)) + || ((0xCA85 <= code) && (code <= 0xCA9F)) + || ((0xCAA1 <= code) && (code <= 0xCABB)) + || ((0xCABD <= code) && (code <= 0xCAD7)) + || ((0xCAD9 <= code) && (code <= 0xCAF3)) + || ((0xCAF5 <= code) && (code <= 0xCB0F)) + || ((0xCB11 <= code) && (code <= 0xCB2B)) + || ((0xCB2D <= code) && (code <= 0xCB47)) + || ((0xCB49 <= code) && (code <= 0xCB63)) + || ((0xCB65 <= code) && (code <= 0xCB7F)) + || ((0xCB81 <= code) && (code <= 0xCB9B)) + || ((0xCB9D <= code) && (code <= 0xCBB7)) + || ((0xCBB9 <= code) && (code <= 0xCBD3)) + || ((0xCBD5 <= code) && (code <= 0xCBEF)) + || ((0xCBF1 <= code) && (code <= 0xCC0B)) + || ((0xCC0D <= code) && (code <= 0xCC27)) + || ((0xCC29 <= code) && (code <= 0xCC43)) + || ((0xCC45 <= code) && (code <= 0xCC5F)) + || ((0xCC61 <= code) && (code <= 0xCC7B)) + || ((0xCC7D <= code) && (code <= 0xCC97)) + || ((0xCC99 <= code) && (code <= 0xCCB3)) + || ((0xCCB5 <= code) && (code <= 0xCCCF)) + || ((0xCCD1 <= code) && (code <= 0xCCEB)) + || ((0xCCED <= code) && (code <= 0xCD07)) + || ((0xCD09 <= code) && (code <= 0xCD23)) + || ((0xCD25 <= code) && (code <= 0xCD3F)) + || ((0xCD41 <= code) && (code <= 0xCD5B)) + || ((0xCD5D <= code) && (code <= 0xCD77)) + || ((0xCD79 <= code) && (code <= 0xCD93)) + || ((0xCD95 <= code) && (code <= 0xCDAF)) + || ((0xCDB1 <= code) && (code <= 0xCDCB)) + || ((0xCDCD <= code) && (code <= 0xCDE7)) + || ((0xCDE9 <= code) && (code <= 0xCE03)) + || ((0xCE05 <= code) && (code <= 0xCE1F)) + || ((0xCE21 <= code) && (code <= 0xCE3B)) + || ((0xCE3D <= code) && (code <= 0xCE57)) + || ((0xCE59 <= code) && (code <= 0xCE73)) + || ((0xCE75 <= code) && (code <= 0xCE8F)) + || ((0xCE91 <= code) && (code <= 0xCEAB)) + || ((0xCEAD <= code) && (code <= 0xCEC7)) + || ((0xCEC9 <= code) && (code <= 0xCEE3)) + || ((0xCEE5 <= code) && (code <= 0xCEFF)) + || ((0xCF01 <= code) && (code <= 0xCF1B)) + || ((0xCF1D <= code) && (code <= 0xCF37)) + || ((0xCF39 <= code) && (code <= 0xCF53)) + || ((0xCF55 <= code) && (code <= 0xCF6F)) + || ((0xCF71 <= code) && (code <= 0xCF8B)) + || ((0xCF8D <= code) && (code <= 0xCFA7)) + || ((0xCFA9 <= code) && (code <= 0xCFC3)) + || ((0xCFC5 <= code) && (code <= 0xCFDF)) + || ((0xCFE1 <= code) && (code <= 0xCFFB)) + || ((0xCFFD <= code) && (code <= 0xD017)) + || ((0xD019 <= code) && (code <= 0xD033)) + || ((0xD035 <= code) && (code <= 0xD04F)) + || ((0xD051 <= code) && (code <= 0xD06B)) + || ((0xD06D <= code) && (code <= 0xD087)) + || ((0xD089 <= code) && (code <= 0xD0A3)) + || ((0xD0A5 <= code) && (code <= 0xD0BF)) + || ((0xD0C1 <= code) && (code <= 0xD0DB)) + || ((0xD0DD <= code) && (code <= 0xD0F7)) + || ((0xD0F9 <= code) && (code <= 0xD113)) + || ((0xD115 <= code) && (code <= 0xD12F)) + || ((0xD131 <= code) && (code <= 0xD14B)) + || ((0xD14D <= code) && (code <= 0xD167)) + || ((0xD169 <= code) && (code <= 0xD183)) + || ((0xD185 <= code) && (code <= 0xD19F)) + || ((0xD1A1 <= code) && (code <= 0xD1BB)) + || ((0xD1BD <= code) && (code <= 0xD1D7)) + || ((0xD1D9 <= code) && (code <= 0xD1F3)) + || ((0xD1F5 <= code) && (code <= 0xD20F)) + || ((0xD211 <= code) && (code <= 0xD22B)) + || ((0xD22D <= code) && (code <= 0xD247)) + || ((0xD249 <= code) && (code <= 0xD263)) + || ((0xD265 <= code) && (code <= 0xD27F)) + || ((0xD281 <= code) && (code <= 0xD29B)) + || ((0xD29D <= code) && (code <= 0xD2B7)) + || ((0xD2B9 <= code) && (code <= 0xD2D3)) + || ((0xD2D5 <= code) && (code <= 0xD2EF)) + || ((0xD2F1 <= code) && (code <= 0xD30B)) + || ((0xD30D <= code) && (code <= 0xD327)) + || ((0xD329 <= code) && (code <= 0xD343)) + || ((0xD345 <= code) && (code <= 0xD35F)) + || ((0xD361 <= code) && (code <= 0xD37B)) + || ((0xD37D <= code) && (code <= 0xD397)) + || ((0xD399 <= code) && (code <= 0xD3B3)) + || ((0xD3B5 <= code) && (code <= 0xD3CF)) + || ((0xD3D1 <= code) && (code <= 0xD3EB)) + || ((0xD3ED <= code) && (code <= 0xD407)) + || ((0xD409 <= code) && (code <= 0xD423)) + || ((0xD425 <= code) && (code <= 0xD43F)) + || ((0xD441 <= code) && (code <= 0xD45B)) + || ((0xD45D <= code) && (code <= 0xD477)) + || ((0xD479 <= code) && (code <= 0xD493)) + || ((0xD495 <= code) && (code <= 0xD4AF)) + || ((0xD4B1 <= code) && (code <= 0xD4CB)) + || ((0xD4CD <= code) && (code <= 0xD4E7)) + || ((0xD4E9 <= code) && (code <= 0xD503)) + || ((0xD505 <= code) && (code <= 0xD51F)) + || ((0xD521 <= code) && (code <= 0xD53B)) + || ((0xD53D <= code) && (code <= 0xD557)) + || ((0xD559 <= code) && (code <= 0xD573)) + || ((0xD575 <= code) && (code <= 0xD58F)) + || ((0xD591 <= code) && (code <= 0xD5AB)) + || ((0xD5AD <= code) && (code <= 0xD5C7)) + || ((0xD5C9 <= code) && (code <= 0xD5E3)) + || ((0xD5E5 <= code) && (code <= 0xD5FF)) + || ((0xD601 <= code) && (code <= 0xD61B)) + || ((0xD61D <= code) && (code <= 0xD637)) + || ((0xD639 <= code) && (code <= 0xD653)) + || ((0xD655 <= code) && (code <= 0xD66F)) + || ((0xD671 <= code) && (code <= 0xD68B)) + || ((0xD68D <= code) && (code <= 0xD6A7)) + || ((0xD6A9 <= code) && (code <= 0xD6C3)) + || ((0xD6C5 <= code) && (code <= 0xD6DF)) + || ((0xD6E1 <= code) && (code <= 0xD6FB)) + || ((0xD6FD <= code) && (code <= 0xD717)) + || ((0xD719 <= code) && (code <= 0xD733)) + || ((0xD735 <= code) && (code <= 0xD74F)) + || ((0xD751 <= code) && (code <= 0xD76B)) + || ((0xD76D <= code) && (code <= 0xD787)) + || ((0xD789 <= code) && (code <= 0xD7A3))) + return HANGUL_LEAD_VOWEL_TRAIL_TYPE; + + return (int) 0; +} + + +#endif /* _HANGUL_H_ */ Modified: soc2014/ghostmansd/normalize/main.c ============================================================================== --- soc2014/ghostmansd/normalize/main.c Sun Jun 15 20:14:11 2014 (r269599) +++ soc2014/ghostmansd/normalize/main.c Sun Jun 15 21:55:22 2014 (r269600) @@ -32,25 +32,52 @@ #include "normalize.h" #include <stdio.h> +#include <errno.h> +#include <stdlib.h> int main(int argc, char const **argv) { - size_t req = 0; - size_t size = 100; + size_t index = 0; + size_t reqsize = 0; wchar_t buffer[100] = {0}; - wchar_t const *ptr = NULL; - wchar_t const nfd[] = { + wchar_t const nfc[] = { 0x1100, 0x1162, 0x11AC, - 0x1100, 0x1162, 0x11AC, 0x0000}; + 0x1100, 0x1162, 0x11AC, + 0x1100, 0x1162, 0x11AC, + 0x0000}; + wchar_t const nfd[] = {0xac21, 0xac21, 0xac21, 0x0000}; + size_t const size = 10; + + /* compose */ + (void) argc; + (void) argv; + wmemset(buffer, 0, size); + reqsize = __wcsnorm(buffer, size, nfc, __NORM_NFC); + if ((reqsize == 0) || (reqsize > size)) + { + perror("NORM_NFC"); + printf("\tArguments: size=%lu, reqsize=%lu\n", size, reqsize); + return EXIT_FAILURE; + } + printf("NFC: "); + for (index = 0; index < reqsize; ++index) + printf("\\u%04X", buffer[index]); + printf("\n"); /* decompose */ - req = __wcsnorm(buffer, size, nfd, __NORM_NFC); - printf("size=%lu\n", size); - printf("req=%lu\n", req); - printf("buffer="); - for (ptr = buffer; *ptr; ++ptr) - printf("0x%04x,", (unsigned int) *ptr); + wmemset(buffer, 0, size); + reqsize = __wcsnorm(buffer, size, nfd, __NORM_NFD); + if ((reqsize == 0) || (reqsize > size)) + { + perror("NORM_NFD"); + printf("\tArguments: size=%lu, reqsize=%lu\n", size, reqsize); + return EXIT_FAILURE; + } + printf("NFD: "); + for (index = 0; index < reqsize; ++index) + printf("\\u%04X", buffer[index]); printf("\n"); - return 0; + + return EXIT_SUCCESS; } Modified: soc2014/ghostmansd/normalize/strnorm.c ============================================================================== --- soc2014/ghostmansd/normalize/strnorm.c Sun Jun 15 20:14:11 2014 (r269599) +++ soc2014/ghostmansd/normalize/strnorm.c Sun Jun 15 21:55:22 2014 (r269600) @@ -32,12 +32,107 @@ #include "strnorm.h" -size_t -__strnorm(char *buffer, size_t size, char const *str, int form) + +static size_t __norm_encode(char *buffer, size_t size, wchar_t const *wstr, + locale_t locale) +{ + static const mbstate_t initial; + size_t length = 0; + char *mbs = NULL; + wchar_t const *wcs = wstr; + + FIX_LOCALE(locale); + length = wcsrtombs_l(NULL, &wcs, 0, &state, locale); + if (length == ((size_t)-1)) + return 0; + else if (length > size) + { + if (size == 0) + return length; + errno = ERANGE; + return 0; + } + return wcsrtombs_l(mbs, &wstr, length, &state, locale); +} + + +static wchar_t *__norm_decode(char const *str, locale_t locale) +{ + static const mbstate_t initial; + size_t length = 0; + wchar_t *wcs = NULL; + const char *mbs = str; + mbstate_t state = initial; + + FIX_LOCALE(locale); + length = mbsrtowcs_l(NULL, &mbs, 0, &state, locale); + if (length == ((size_t)-1)) + return NULL; + if ((wcs = malloc((length + 1) * sizeof(wchar_t))) == NULL) + __collate_err(EX_OSERR, __func__); + mbsrtowcs_l(wcs, &str, length, &state, loc); + wcs[length] = 0; + state = initial; + return wcs; +} + + +size_t __strnorm_l(char *buffer, size_t size, char const *str, int form, + locale_t locale) { size_t wsize = 0; + size_t reqsize = 0; + size_t wreqsize = 0; wchar_t *wstr = NULL; wchar_t *wbuffer = NULL; - return 0; +#define __strnorm_failure() \ +do { \ + free(wbuffer); \ + free(wstr); \ + return 0; \ +} while (0) + + /* Check initial arguments. */ + if ((str == NULL) || ((buffer != NULL) && (size == 0))) + { + errno = EINVAL; + return 0; + } + switch (form) + { + case __NORM_NFD: + case __NORM_NFC: + case __NORM_NFKD: + case __NORM_NFKC: + break; + default: + errno = EINVAL; + return 0; + } + + /* Acquire __wcsnorm() arguments. */ + FIX_LOCALE(locale); + if (locale->__collate_load_error) + __strnorm_failure(); + if ((wstr = __strnorm_mbstowcs(str, locale)) == NULL) + __strnorm_failure(); + if ((wsize = __wcsnorm(NULL, 0, wstr, form) == 0)) + __strnorm_failure(); + if ((wbuffer = malloc(wsize * sizeof(wchar_t))) == NULL) + __strnorm_failure(); + + /* Normalize the wide string. */ + wreqsize = __wcsnorm(wbuffer, wsize, wstr, form); + if ((wreqsize == 0) || (wreqsize > wsize)) + __strnorm_failure(); + + /* Check if byte buffer is large enough. */ + reqsize = __norm_encode(buffer, size, wbuffer); + if (reqsize == ((size_t)-1)) + __strnorm_failure(); } + + +size_t __strnorm(char *buffer, size_t size, char const *str, int form) +{ return __strnorm_l(buffer, size, str, form, __get_current_locale()); } Modified: soc2014/ghostmansd/normalize/wcsnorm.c ============================================================================== --- soc2014/ghostmansd/normalize/wcsnorm.c Sun Jun 15 20:14:11 2014 (r269599) +++ soc2014/ghostmansd/normalize/wcsnorm.c Sun Jun 15 21:55:22 2014 (r269600) @@ -30,150 +30,197 @@ * SUCH DAMAGE. */ -#include "normalize.h" #include <errno.h> #include <stdint.h> #include <stdio.h> +#include "normalize.h" +#include "hangul.h" #define HANGUL_MIN 0xAC00 #define HANGUL_MAX 0xD7A4 - -#define HANGUL_BASE 0xAC00 -#define HANGUL_LEAD_BASE 0x1100 -#define HANGUL_VOWEL_BASE 0x1161 -#define HANGUL_TRAIL_BASE 0x11A7 +#define HANGUL_BASE HANGUL_MIN #define HANGUL_LEAD_COUNT 19 #define HANGUL_VOWEL_COUNT 21 #define HANGUL_TRAIL_COUNT 28 +#define HANGUL_LEAD_MIN 0x1100 +#define HANGUL_VOWEL_MIN 0x1161 +#define HANGUL_TRAIL_MIN 0x11A7 + +#define HANGUL_LEAD_MAX ((HANGUL_LEAD_MIN + HANGUL_LEAD_COUNT) - 1) +#define HANGUL_VOWEL_MAX ((HANGUL_VOWEL_MIN + HANGUL_VOWEL_COUNT) - 1) +#define HANGUL_TRAIL_MAX ((HANGUL_TRAIL_MIN + HANGUL_TRAIL_COUNT) - 1) + #define HANGUL_BASE_COUNT 588 #define HANGUL_FULL_COUNT 11172 -size_t -__wcsnorm(wchar_t *buffer, size_t size, wchar_t const *str, int form) +static size_t compose(wchar_t *buffer, size_t size, wchar_t const *str) { - int error = 0; - size_t count = 0; + int32_t prev = 0; + int32_t curr = 0; + size_t index = 0; + size_t length = 0; size_t reqsize = 0; - wchar_t curr = 0xFFFF; - wchar_t last = 0xFFFF; - int32_t lead = 0xFFFF; - int32_t vowel = 0xFFFF; - int32_t trail = 0xFFFF; - int32_t hangul = 0xFFFF; - wchar_t const *iter = str; - if (!str || (buffer && !size)) - { - errno = EINVAL; - return 0; - } - switch (form) + prev = *str; + length = wcslen(str); + if (buffer != NULL) + *buffer = prev; + for (index = 1; index < length; ++index) { - case __NORM_NFD: - case __NORM_NFC: - case __NORM_NFKD: - case __NORM_NFKC: - break; - default: - errno = EINVAL; - return 0; - } + if ((buffer != NULL) && (reqsize > size)) + return compose(NULL, 0, str); + curr = str[index]; - size -= 1; - last = 0x00; - do { - curr = *iter; - if (curr == L'\0') - break; - if (reqsize > size) + /* Text exclusively containing ASCII characters (U+0000..U+007F) + * is left unaffected by all of the Normalization Forms. + * Text exclusively containing Latin-1 characters (U+0000..U+00FF) + * is left unaffected by NFC. This is effectively the same as saying + * that all Latin-1 text is already normalized to NFC. */ + if (prev < 0xFF) { if (buffer != NULL) - buffer[++size] = L'\0'; - return __wcsnorm(NULL, 0, str, form); + *buffer++ = prev; + prev = curr; + ++reqsize; + continue; } - /* Text exclusively containing ASCII characters (U+0000..U+007F) - * is left unaffected by all of the Normalization Forms. - * Text exclusively containing Latin-1 characters (U+0000..U+00FF) - * is left unaffected by NFC. This is effectively the same as saying - * that all Latin-1 text is already normalized to NFC. */ - if ((curr < 0x80) || ((curr <= 0xFF) && (form == __NORM_NFC))) + /* Hangul script composition normalization algorithm. */ + else if (((HANGUL_LEAD_MIN <= prev) && (prev <= HANGUL_LEAD_MAX)) + && ((HANGUL_VOWEL_MIN <= curr) && (curr <= HANGUL_VOWEL_MAX))) + { + prev = (HANGUL_BASE \ + + ((prev - HANGUL_LEAD_MIN) * HANGUL_BASE_COUNT) + + ((curr - HANGUL_VOWEL_MIN) * HANGUL_TRAIL_COUNT)); + } + else if ((hangul_syllable(prev) == HANGUL_LEAD_VOWEL_TYPE) + && ((HANGUL_TRAIL_MIN <= curr) && (curr <= HANGUL_TRAIL_MAX))) + prev += (curr - HANGUL_TRAIL_MIN); + else { if (buffer != NULL) - *buffer = curr; - count = 1; + *buffer++ = prev; + prev = curr; + ++reqsize; } + } - /* Hangul script uses a special normalization algorithm. */ - if (((HANGUL_MIN <= curr) && (curr <= HANGUL_MAX)) - && ((form == __NORM_NFD) || (form == __NORM_NFKD))) + if ((reqsize + 2) > size) + { + if (buffer != NULL) + *--buffer = L'\0'; + return (reqsize + 2); + } + ++reqsize; + if ((reqsize + 1) > size) + return ++reqsize; + if (buffer != NULL) + { + *buffer++ = prev; + *buffer++ = L'\0'; + } + return reqsize; +} + + +static size_t decompose(wchar_t *buffer, size_t size, wchar_t const *str) +{ + size_t count = 0; + size_t reqsize = 0; + int32_t curr = 0xFFFF; + int32_t lead = 0xFFFF; + int32_t vowel = 0xFFFF; + int32_t trail = 0xFFFF; + wchar_t const *iter = str; + + do { + curr = *iter; + if (curr == L'\0') + break; + if ((buffer != NULL) && (reqsize > size)) + return decompose(NULL, 0, str); + + /* Hangul script decomposition normalization algorithm. */ + if (hangul_syllable(curr) != 0) { - hangul = (curr - HANGUL_BASE); - lead = (HANGUL_LEAD_BASE + (hangul / HANGUL_BASE_COUNT)); - vowel = (HANGUL_VOWEL_BASE + \ - ((hangul % HANGUL_BASE_COUNT) / HANGUL_TRAIL_COUNT)); - trail = (HANGUL_TRAIL_BASE + (hangul % HANGUL_TRAIL_COUNT)); - count = ((trail != HANGUL_TRAIL_BASE) ? 3 : 2); + curr = (curr - HANGUL_BASE); + lead = (HANGUL_LEAD_MIN + (curr / HANGUL_BASE_COUNT)); + vowel = (HANGUL_VOWEL_MIN + \ + ((curr % HANGUL_BASE_COUNT) / HANGUL_TRAIL_COUNT)); + trail = (HANGUL_TRAIL_MIN + (curr % HANGUL_TRAIL_COUNT)); + count = ((trail != HANGUL_TRAIL_MIN) ? 3 : 2); if ((reqsize + count) > size) count = (size - reqsize); if (buffer != NULL) { if (count >= 1) - buffer[0] = lead; + buffer[0] = (wchar_t) lead; if (count >= 2) - buffer[1] = vowel; - if ((count >= 3) && (trail != HANGUL_TRAIL_BASE)) - buffer[2] = trail; + buffer[1] = (wchar_t) vowel; + if ((count >= 3) && (trail != HANGUL_TRAIL_MIN)) + buffer[2] = (wchar_t) trail; } - count = ((curr != HANGUL_TRAIL_BASE) ? 3 : 2); + count = ((trail != HANGUL_TRAIL_MIN) ? 3 : 2); } - else *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201406152155.s5FLtMsW021408>