Date: Thu, 9 Aug 2007 19:07:01 GMT From: Fredrik Lindberg <fli@FreeBSD.org> To: Perforce Change Reviews <perforce@FreeBSD.org> Subject: PERFORCE change 124965 for review Message-ID: <200708091907.l79J71wC086318@repoman.freebsd.org>
next in thread | raw e-mail | index | archive | help
http://perforce.freebsd.org/chv.cgi?CH=124965

Change 124965 by fli@fli_nexus on 2007/08/09 19:06:10

	- Add utf8_casecmp() to do case insensitive string comparisons.
	- Add utf8_tolower() that converts a string to lower case.
	- const'ify some argument in utf8_{en,de}code while here.

Affected files ...

.. //depot/projects/soc2007/fli-mdns_sd/mdnsd/utf8.c#2 edit
.. //depot/projects/soc2007/fli-mdns_sd/mdnsd/utf8.h#2 edit
.. //depot/projects/soc2007/fli-mdns_sd/mdnsd/utf8_cfold.c#1 add

Differences ...

==== //depot/projects/soc2007/fli-mdns_sd/mdnsd/utf8.c#2 (text+ko) ====

@@ -26,10 +26,14 @@
 
 #include <sys/types.h>
 
+#include <string.h>
 #include <wchar.h>
 
 #include "utf8.h"
 
+extern struct casemap unicm_up2low[];
+extern int unicm_up2low_size;
+
 /*
  * utf8_encode
  *   Encodes a wide character string into an UTF-8 byte sequence
@@ -46,10 +50,11 @@
  * This function is partially based on code from libarchive by Tim Kientzle
  */
 ssize_t
-utf8_encode(wchar_t *src, char *dst, size_t dlen)
+utf8_encode(const wchar_t *src, char *dst, size_t dlen)
 {
 	char *p;
-	wchar_t *wp, wc;
+	const wchar_t *wp;
+	wchar_t wc;
 	size_t len;
 
 	len = 0;
@@ -106,10 +111,11 @@
  * Returns logical length of decoded string or -1 on failure
  */
 ssize_t
-utf8_decode(char *src, size_t slen, wchar_t *dst, size_t dlen)
+utf8_decode(const char *src, size_t slen, wchar_t *dst, size_t dlen)
 {
 	size_t len;
-	char c, *p;
+	const char *p;
+	char c;
 	wchar_t *wp;
 
 	if (dlen < slen)
@@ -149,3 +155,208 @@
 
 	return (len);
 }
+
+/*
+ * Decode the UTF-8 sequence at `p' into the code point `*v'; `end' points
+ * one past the last readable byte.  Returns the number of bytes consumed
+ * (1-4), or 0 on an invalid lead byte or a sequence cut short by `end'.
+ * Continuation bytes are not validated.
+ */
+static inline int
+chrdec(const char *p, uint32_t *v, const char *end)
+{
+	char c = *p;
+
+	if ((c & 0xf8) == 0xf0 && ((p + 3) < end)) {
+		*v = (p[0] & 0x7) << 18;
+		*v |= (p[1] & 0x3f) << 12;
+		*v |= (p[2] & 0x3f) << 6;
+		*v |= (p[3] & 0x3f);
+		return (4);
+	}
+	else if ((c & 0xf0) == 0xe0 && ((p + 2) < end)) {
+		*v = (p[0] & 0xf) << 12;
+		*v |= (p[1] & 0x3f) << 6;
+		*v |= (p[2] & 0x3f);
+		return (3);
+	}
+	else if ((c & 0xe0) == 0xc0 && ((p + 1) < end)) {
+		*v = (p[0] & 0x1f) << 6;
+		*v |= (p[1] & 0x3f);
+		return (2);
+	}
+	else if ((c & 0x80) == 0) {
+		*v = c & 0x7f;
+		return (1);
+	}
+	return (0);
+}
+
+/*
+ * Encode the code point `val' as UTF-8 at `p'; `end' points one past the
+ * last writable byte.  Returns the number of bytes written (1-4), or 0 if
+ * the value is above U+10FFFF or the sequence does not fit before `end'.
+ */
+static inline int
+chrenc(char *p, uint32_t val, char *end)
+{
+
+	if (val <= 0x7f && p < end) {
+		*p = (char)val;
+		return (1);
+	} else if (val <= 0x7ff && ((p + 1) < end)) {
+		p[0] = 0xc0 | ((val >> 6) & 0x1f);
+		p[1] = 0x80 | (val & 0x3f);
+		return (2);
+	} else if (val <= 0xffff && ((p + 2) < end)) {
+		p[0] = 0xe0 | ((val >> 12) & 0x0f);
+		p[1] = 0x80 | ((val >> 6) & 0x3f);
+		p[2] = 0x80 | (val & 0x3f);
+		return (3);
+	} else if (val <= 0x10ffff && ((p + 3) < end)) {
+		p[0] = 0xf0 | ((val >> 18) & 0x07);
+		p[1] = 0x80 | ((val >> 12) & 0x3f);
+		p[2] = 0x80 | ((val >> 6) & 0x3f);
+		p[3] = 0x80 | (val & 0x3f);
+		return (4);
+	}
+	return (0);
+}
+
+/*
+ * Look up a case mapping from a case folding table.  The table is binary
+ * searched on cm_val1, so it must be sorted ascending by that field.
+ * Returns the mapped value, or `val' itself when there is no mapping.
+ */
+static inline uint32_t
+chrcase(uint32_t val, struct casemap *cm, size_t cmsz)
+{
+	uint32_t nval;
+	int start, end, n;
+
+	nval = val;
+	start = 0;
+	end = cmsz - 1;
+	while (start <= end) {
+		n = (start + end) / 2;
+		if (cm[n].cm_val1 > val) {
+			end = n - 1;
+			continue;
+		}
+		else if (cm[n].cm_val1 < val) {
+			start = n + 1;
+			continue;
+		}
+
+		nval = cm[n].cm_val2;
+		break;
+	}
+	return (nval);
+}
+
+/* Map `val' through the upper-to-lower case table unicm_up2low. */
+static inline uint32_t
+chrlcase(uint32_t val)
+{
+
+	return (chrcase(val, unicm_up2low, unicm_up2low_size));
+}
+
+/*
+ * utf8_casecmp
+ *   Compares two UTF-8 strings case insensitively
+ * Arguments
+ *   str1 - First string
+ *   str2 - Second string
+ *
+ * Returns 0 if the strings are identical. If string one is binary larger 1
+ * is returned, if the second string is binary larger -1 is returned.  A
+ * string that is a proper prefix of the other is the smaller one.
+ * Returns -2 if either string contains an invalid UTF-8 sequence.
+ */
+int
+utf8_casecmp(const char *str1, const char *str2)
+{
+	const char *p, *q, *pe, *qe;
+	int l1, l2;
+	uint32_t v1, v2, nv1, nv2;
+	size_t len1, len2;
+
+	len1 = strlen(str1);
+	len2 = strlen(str2);
+
+	p = str1;
+	q = str2;
+	pe = p + len1;
+	qe = q + len2;
+
+	while (*p != '\0' && *q != '\0') {
+		l1 = chrdec(p, &v1, pe);
+		l2 = chrdec(q, &v2, qe);
+
+		if (l1 == 0 || l2 == 0)
+			return (-2);
+
+		nv1 = chrlcase(v1);
+		nv2 = chrlcase(v2);
+
+		if (nv1 != nv2)
+			return (nv1 > nv2) ? 1 : -1;
+
+		p += l1;
+		q += l2;
+	}
+
+	/* The string that ran out first is a prefix of the other: smaller. */
+	if (*p == '\0' && *q != '\0')
+		return (-1);
+	else if (*p != '\0' && *q == '\0')
+		return (1);
+	else
+		return (0);
+}
+
+/*
+ * utf8_tolower
+ *   Converts a UTF-8 string to lower case
+ * Arguments
+ *   src  - Original string ('\0'-terminated)
+ *   dst  - Pointer to space where the new string is stored
+ *   dlen - Length of destination buffer, including room for the '\0'
+ *
+ * Returns the length of the converted lower case string or a value
+ * less than 0 if a failure occurs (invalid UTF-8 in src, or dst too small).
+ */
+int
+utf8_tolower(const char *src, char *dst, size_t dlen)
+{
+	const char *p, *pe;
+	char *q, *qe;
+	uint32_t val, nval;
+	size_t slen;
+	int l1, l2;
+
+	if (dlen == 0)
+		return (-1);
+
+	slen = strlen(src);
+	p = src;
+	q = dst;
+	pe = src + slen;
+	/* Keep the last byte of dst free for the terminating '\0'. */
+	qe = dst + dlen - 1;
+	/* Walk src one character (not one byte) at a time. */
+	while (p < pe) {
+		l1 = chrdec(p, &val, pe);
+		if (l1 == 0)
+			return (-1);
+		nval = chrlcase(val);
+		l2 = chrenc(q, nval, qe);
+		if (l2 == 0)
+			return (-1);
+		p += l1;
+		q += l2;
+	}
+	*q = '\0';
+	return (q - dst);
+}

==== //depot/projects/soc2007/fli-mdns_sd/mdnsd/utf8.h#2 (text+ko) ====

@@ -27,7 +27,20 @@
 #ifndef _UTF8_H_
 #define _UTF8_H_
 
-ssize_t utf8_encode(wchar_t *, char *, size_t);
-ssize_t utf8_decode(char *, size_t, wchar_t *, size_t);
+#include <sys/types.h>
+
+#include <stdint.h>
+#include <wchar.h>
+
+/* Case mapping */
+struct casemap {
+	uint32_t cm_val1;
+	uint32_t cm_val2;
+};
+
+ssize_t utf8_encode(const wchar_t *, char *, size_t);
+ssize_t utf8_decode(const char *, size_t, wchar_t *, size_t);
+int utf8_casecmp(const char *, const char *);
+int utf8_tolower(const char *, char *, size_t);
 
 #endif /* _UTF8_H_ */
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200708091907.l79J71wC086318>