Date: Sat, 8 Dec 2018 19:45:05 +0000 (UTC)
From: Yuri Pankov <yuripv@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-12@freebsd.org
Subject: svn commit: r341745 - in stable/12/lib/libc: regex tests/regex
Message-ID: <201812081945.wB8Jj5jh023781@repo.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: yuripv Date: Sat Dec 8 19:45:05 2018 New Revision: 341745 URL: https://svnweb.freebsd.org/changeset/base/341745 Log: MFC r340835: regexec: fix processing multibyte strings. Matcher function incorrectly assumed that moffset that we get from findmust is in bytes. Fix this by introducing a stepback function, taking short path if MB_CUR_MAX is 1, and going back byte-by-byte, checking if we have a legal character sequence otherwise. PR: 153502 Reviewed by: pfg, kevans Differential revision: https://reviews.freebsd.org/D18297 Added: stable/12/lib/libc/tests/regex/multibyte.sh - copied unchanged from r340835, head/lib/libc/tests/regex/multibyte.sh Modified: stable/12/lib/libc/regex/engine.c stable/12/lib/libc/tests/regex/Makefile Directory Properties: stable/12/ (props changed) Modified: stable/12/lib/libc/regex/engine.c ============================================================================== --- stable/12/lib/libc/regex/engine.c Sat Dec 8 19:42:01 2018 (r341744) +++ stable/12/lib/libc/regex/engine.c Sat Dec 8 19:45:05 2018 (r341745) @@ -48,6 +48,7 @@ __FBSDID("$FreeBSD$"); */ #ifdef SNAMES +#define stepback sstepback #define matcher smatcher #define walk swalk #define dissect sdissect @@ -58,6 +59,7 @@ __FBSDID("$FreeBSD$"); #define match smat #endif #ifdef LNAMES +#define stepback lstepback #define matcher lmatcher #define walk lwalk #define dissect ldissect @@ -68,6 +70,7 @@ __FBSDID("$FreeBSD$"); #define match lmat #endif #ifdef MNAMES +#define stepback mstepback #define matcher mmatcher #define walk mwalk #define dissect mdissect @@ -142,6 +145,39 @@ static const char *pchar(int ch); #endif /* + * Given a multibyte string pointed to by start, step back nchar characters + * from current position pointed to by cur. + */ +static const char * +stepback(const char *start, const char *cur, int nchar) +{ + const char *ret; + int wc, mbc; + mbstate_t mbs; + size_t clen; + + if (MB_CUR_MAX == 1) + return ((cur - nchar) > start ? 
cur - nchar : NULL); + + ret = cur; + for (wc = nchar; wc > 0; wc--) { + for (mbc = 1; mbc <= MB_CUR_MAX; mbc++) { + if ((ret - mbc) < start) + return (NULL); + memset(&mbs, 0, sizeof(mbs)); + clen = mbrtowc(NULL, ret - mbc, mbc, &mbs); + if (clen != (size_t)-1 && clen != (size_t)-2) + break; + } + if (mbc > MB_CUR_MAX) + return (NULL); + ret -= mbc; + } + + return (ret); +} + +/* - matcher - the actual matching engine == static int matcher(struct re_guts *g, const char *string, \ == size_t nmatch, regmatch_t pmatch[], int eflags); @@ -244,9 +280,14 @@ matcher(struct re_guts *g, ZAPSTATE(&m->mbs); /* Adjust start according to moffset, to speed things up */ - if (dp != NULL && g->moffset > -1) - start = ((dp - g->moffset) < start) ? start : dp - g->moffset; + if (dp != NULL && g->moffset > -1) { + const char *nstart; + nstart = stepback(start, dp, g->moffset); + if (nstart != NULL) + start = nstart; + } + SP("mloop", m->st, *start); /* this loop does only one repetition except for backrefs */ @@ -1083,6 +1124,7 @@ pchar(int ch) #endif #endif +#undef stepback #undef matcher #undef walk #undef dissect Modified: stable/12/lib/libc/tests/regex/Makefile ============================================================================== --- stable/12/lib/libc/tests/regex/Makefile Sat Dec 8 19:42:01 2018 (r341744) +++ stable/12/lib/libc/tests/regex/Makefile Sat Dec 8 19:45:05 2018 (r341745) @@ -2,6 +2,9 @@ PACKAGE= tests +# local test cases +ATF_TESTS_SH+= multibyte + .include "Makefile.inc" .include "${.CURDIR:H}/Makefile.netbsd-tests" .include <bsd.test.mk> Copied: stable/12/lib/libc/tests/regex/multibyte.sh (from r340835, head/lib/libc/tests/regex/multibyte.sh) ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ stable/12/lib/libc/tests/regex/multibyte.sh Sat Dec 8 19:45:05 2018 (r341745, copy of r340835, head/lib/libc/tests/regex/multibyte.sh) @@ -0,0 +1,35 @@ +# $FreeBSD$ + 
+atf_test_case multibyte +multibyte_head() +{ + atf_set "descr" "Check matching multibyte characters (PR153502)" +} +multibyte_body() +{ + export LC_CTYPE="C.UTF-8" + + printf 'é' | atf_check -o "inline:é" \ + sed -ne '/^.$/p' + printf 'éé' | atf_check -o "inline:éé" \ + sed -ne '/^..$/p' + printf 'aéa' | atf_check -o "inline:aéa" \ + sed -ne '/a.a/p' + printf 'aéa'| atf_check -o "inline:aéa" \ + sed -ne '/a.*a/p' + printf 'aaéaa' | atf_check -o "inline:aaéaa" \ + sed -ne '/aa.aa/p' + printf 'aéaéa' | atf_check -o "inline:aéaéa" \ + sed -ne '/a.a.a/p' + printf 'éa' | atf_check -o "inline:éa" \ + sed -ne '/.a/p' + printf 'aéaa' | atf_check -o "inline:aéaa" \ + sed -ne '/a.aa/p' + printf 'éaé' | atf_check -o "inline:éaé" \ + sed -ne '/.a./p' +} + +atf_init_test_cases() +{ + atf_add_test_case multibyte +}
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201812081945.wB8Jj5jh023781>