Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 12 Dec 2018 04:23:01 +0000 (UTC)
From:      Yuri Pankov <yuripv@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r341838 - in head/lib/libc: regex tests/regex
Message-ID:  <201812120423.wBC4N10E024486@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: yuripv
Date: Wed Dec 12 04:23:00 2018
New Revision: 341838
URL: https://svnweb.freebsd.org/changeset/base/341838

Log:
  regcomp: reduce size of bitmap for multibyte locales
  
  This fixes an obscure endless loop seen with case-insensitive
  patterns containing characters in the 128-255 range; originally
  found while running the GNU grep test suite.
  
  Our regex implementation, being kludgy, translates each character
  in a case-insensitive pattern to a bracket expression containing both
  cases of the character, and doesn't correctly handle the case when
  the original character is in the bitmap and the other case is not,
  falling into an endless loop going through p_bracket(), ordinary(),
  and bothcases().
  
  Reducing the bitmap to the 0-127 range for multibyte locales solves this,
  as none of these characters has an other-case mapping outside of the
  bitmap.  We are also safe in the case when the original character outside
  of the bitmap has an other-case mapping in the bitmap (there are several
  of those in our current ctype maps having a unidirectional mapping into
  the bitmap).
  
  Reviewed by:	bapt, kevans, pfg
  Differential revision:	https://reviews.freebsd.org/D18302

Modified:
  head/lib/libc/regex/regcomp.c
  head/lib/libc/regex/regex2.h
  head/lib/libc/regex/utils.h
  head/lib/libc/tests/regex/multibyte.sh

Modified: head/lib/libc/regex/regcomp.c
==============================================================================
--- head/lib/libc/regex/regcomp.c	Wed Dec 12 02:33:01 2018	(r341837)
+++ head/lib/libc/regex/regcomp.c	Wed Dec 12 04:23:00 2018	(r341838)
@@ -1841,21 +1841,29 @@ computejumps(struct parse *p, struct re_guts *g)
 {
 	int ch;
 	int mindex;
+	int cmin, cmax;
 
+	/*
+	 * For UTF-8 we process only the first 128 characters corresponding to
+	 * the POSIX locale.
+	 */
+	cmin = MB_CUR_MAX == 1 ? CHAR_MIN : 0;
+	cmax = MB_CUR_MAX == 1 ? CHAR_MAX : 127;
+
 	/* Avoid making errors worse */
 	if (p->error != 0)
 		return;
 
-	g->charjump = (int*) malloc((NC + 1) * sizeof(int));
+	g->charjump = (int *)malloc((cmax - cmin + 1) * sizeof(int));
 	if (g->charjump == NULL)	/* Not a fatal error */
 		return;
 	/* Adjust for signed chars, if necessary */
-	g->charjump = &g->charjump[-(CHAR_MIN)];
+	g->charjump = &g->charjump[-(cmin)];
 
 	/* If the character does not exist in the pattern, the jump
 	 * is equal to the number of characters in the pattern.
 	 */
-	for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
+	for (ch = cmin; ch < cmax + 1; ch++)
 		g->charjump[ch] = g->mlen;
 
 	/* If the character does exist, compute the jump that would

Modified: head/lib/libc/regex/regex2.h
==============================================================================
--- head/lib/libc/regex/regex2.h	Wed Dec 12 02:33:01 2018	(r341837)
+++ head/lib/libc/regex/regex2.h	Wed Dec 12 04:23:00 2018	(r341838)
@@ -113,7 +113,7 @@ typedef struct {
 	wint_t		max;
 } crange;
 typedef struct {
-	unsigned char	bmp[NC / 8];
+	unsigned char	bmp[NC_MAX / 8];
 	wctype_t	*types;
 	unsigned int	ntypes;
 	wint_t		*wides;
@@ -133,9 +133,14 @@ CHIN1(cset *cs, wint_t ch)
 	if (ch < NC)
 		return (((cs->bmp[ch >> 3] & (1 << (ch & 7))) != 0) ^
 		    cs->invert);
-	for (i = 0; i < cs->nwides; i++)
-		if (ch == cs->wides[i])
+	for (i = 0; i < cs->nwides; i++) {
+		if (cs->icase) {
+			if (ch == towlower(cs->wides[i]) ||
+			    ch == towupper(cs->wides[i]))
+				return (!cs->invert);
+		} else if (ch == cs->wides[i])
 			return (!cs->invert);
+	}
 	for (i = 0; i < cs->nranges; i++)
 		if (cs->ranges[i].min <= ch && ch <= cs->ranges[i].max)
 			return (!cs->invert);

Modified: head/lib/libc/regex/utils.h
==============================================================================
--- head/lib/libc/regex/utils.h	Wed Dec 12 02:33:01 2018	(r341837)
+++ head/lib/libc/regex/utils.h	Wed Dec 12 04:23:00 2018	(r341838)
@@ -39,7 +39,9 @@
 /* utility definitions */
 #define	DUPMAX		_POSIX2_RE_DUP_MAX	/* xxx is this right? */
 #define	INFINITY	(DUPMAX + 1)
-#define	NC		(CHAR_MAX - CHAR_MIN + 1)
+
+#define	NC_MAX		(CHAR_MAX - CHAR_MIN + 1)
+#define	NC		((MB_CUR_MAX) == 1 ? (NC_MAX) : (128))
 typedef unsigned char uch;
 
 /* switch off assertions (if not already off) if no REDEBUG */

Modified: head/lib/libc/tests/regex/multibyte.sh
==============================================================================
--- head/lib/libc/tests/regex/multibyte.sh	Wed Dec 12 02:33:01 2018	(r341837)
+++ head/lib/libc/tests/regex/multibyte.sh	Wed Dec 12 04:23:00 2018	(r341838)
@@ -1,11 +1,11 @@
 # $FreeBSD$
 
-atf_test_case multibyte
-multibyte_head()
+atf_test_case bmpat
+bmpat_head()
 {
 	atf_set "descr" "Check matching multibyte characters (PR153502)"
 }
-multibyte_body()
+bmpat_body()
 {
 	export LC_CTYPE="C.UTF-8"
 
@@ -29,7 +29,25 @@ multibyte_body()
 	    sed -ne '/.a./p'
 }
 
+atf_test_case icase
+icase_head()
+{
+	atf_set "descr" "Check case-insensitive matching for characters 128-255"
+}
+icase_body()
+{
+	export LC_CTYPE="C.UTF-8"
+
+	a=$(printf '\302\265\n')	# U+00B5
+	b=$(printf '\316\234\n')	# U+039C
+	c=$(printf '\316\274\n')	# U+03BC
+
+	echo $b | atf_check -o "inline:$b\n" sed -ne "/$a/Ip"
+	echo $c | atf_check -o "inline:$c\n" sed -ne "/$a/Ip"
+}
+
 atf_init_test_cases()
 {
-	atf_add_test_case multibyte
+	atf_add_test_case bmpat
+	atf_add_test_case icase
 }



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201812120423.wBC4N10E024486>