Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 18 Jan 2018 22:13:32 +0000 (UTC)
From:      Kyle Evans <kevans@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-11@freebsd.org
Subject:   svn commit: r328152 - stable/11/lib/libc/regex
Message-ID:  <201801182213.w0IMDWkG028053@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: kevans
Date: Thu Jan 18 22:13:31 2018
New Revision: 328152
URL: https://svnweb.freebsd.org/changeset/base/328152

Log:
  MFC r322288: regex(3): Refactor fast/slow stepping bits in matching engine
  
  Adding features for matching is fairly straightforward, but this requires
  some duplication because of this fast/slow setup. They can be fairly
  trivially combined into a single walk(), so do it to make future additions
  less error prone.

Modified:
  stable/11/lib/libc/regex/engine.c
Directory Properties:
  stable/11/   (props changed)

Modified: stable/11/lib/libc/regex/engine.c
==============================================================================
--- stable/11/lib/libc/regex/engine.c	Thu Jan 18 22:10:00 2018	(r328151)
+++ stable/11/lib/libc/regex/engine.c	Thu Jan 18 22:13:31 2018	(r328152)
@@ -36,6 +36,8 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include <stdbool.h>
+
 /*
  * The matching engine and friends.  This file is #included by regexec.c
  * after suitable #defines of a variety of macros used herein, so that
@@ -45,8 +47,7 @@ __FBSDID("$FreeBSD$");
 
 #ifdef SNAMES
 #define	matcher	smatcher
-#define	fast	sfast
-#define	slow	sslow
+#define	walk	swalk
 #define	dissect	sdissect
 #define	backref	sbackref
 #define	step	sstep
@@ -56,8 +57,7 @@ __FBSDID("$FreeBSD$");
 #endif
 #ifdef LNAMES
 #define	matcher	lmatcher
-#define	fast	lfast
-#define	slow	lslow
+#define	walk	lwalk
 #define	dissect	ldissect
 #define	backref	lbackref
 #define	step	lstep
@@ -67,8 +67,7 @@ __FBSDID("$FreeBSD$");
 #endif
 #ifdef MNAMES
 #define	matcher	mmatcher
-#define	fast	mfast
-#define	slow	mslow
+#define	walk	mwalk
 #define	dissect	mdissect
 #define	backref	mbackref
 #define	step	mstep
@@ -104,8 +103,7 @@ extern "C" {
 static int matcher(struct re_guts *g, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
 static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst);
 static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int);
-static const char *fast(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst);
-static const char *slow(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst);
+static const char *walk(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, bool fast);
 static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft);
 #define MAX_RECURSION	100
 #define	BOL	(OUT-1)
@@ -251,7 +249,7 @@ matcher(struct re_guts *g,
 
 	/* this loop does only one repetition except for backrefs */
 	for (;;) {
-		endp = fast(m, start, stop, gf, gl);
+		endp = walk(m, start, stop, gf, gl, true);
 		if (endp == NULL) {		/* a miss */
 			if (m->pmatch != NULL)
 				free((char *)m->pmatch);
@@ -267,7 +265,7 @@ matcher(struct re_guts *g,
 		assert(m->coldp != NULL);
 		for (;;) {
 			NOTE("finding start");
-			endp = slow(m, m->coldp, stop, gf, gl);
+			endp = walk(m, m->coldp, stop, gf, gl, false);
 			if (endp != NULL)
 				break;
 			assert(m->coldp < m->endp);
@@ -312,7 +310,7 @@ matcher(struct re_guts *g,
 			if (dp != NULL || endp <= m->coldp)
 				break;		/* defeat */
 			NOTE("backoff");
-			endp = slow(m, m->coldp, endp-1, gf, gl);
+			endp = walk(m, m->coldp, endp-1, gf, gl, false);
 			if (endp == NULL)
 				break;		/* defeat */
 			/* try it on a shorter possibility */
@@ -430,10 +428,10 @@ dissect(struct match *m,
 			stp = stop;
 			for (;;) {
 				/* how long could this one be? */
-				rest = slow(m, sp, stp, ss, es);
+				rest = walk(m, sp, stp, ss, es, false);
 				assert(rest != NULL);	/* it did match */
 				/* could the rest match the rest? */
-				tail = slow(m, rest, stop, es, stopst);
+				tail = walk(m, rest, stop, es, stopst, false);
 				if (tail == stop)
 					break;		/* yes! */
 				/* no -- try a shorter match for this one */
@@ -443,7 +441,7 @@ dissect(struct match *m,
 			ssub = ss + 1;
 			esub = es - 1;
 			/* did innards match? */
-			if (slow(m, sp, rest, ssub, esub) != NULL) {
+			if (walk(m, sp, rest, ssub, esub, false) != NULL) {
 				dp = dissect(m, sp, rest, ssub, esub);
 				assert(dp == rest);
 			} else		/* no */
@@ -454,10 +452,10 @@ dissect(struct match *m,
 			stp = stop;
 			for (;;) {
 				/* how long could this one be? */
-				rest = slow(m, sp, stp, ss, es);
+				rest = walk(m, sp, stp, ss, es, false);
 				assert(rest != NULL);	/* it did match */
 				/* could the rest match the rest? */
-				tail = slow(m, rest, stop, es, stopst);
+				tail = walk(m, rest, stop, es, stopst, false);
 				if (tail == stop)
 					break;		/* yes! */
 				/* no -- try a shorter match for this one */
@@ -469,7 +467,7 @@ dissect(struct match *m,
 			ssp = sp;
 			oldssp = ssp;
 			for (;;) {	/* find last match of innards */
-				sep = slow(m, ssp, rest, ssub, esub);
+				sep = walk(m, ssp, rest, ssub, esub, false);
 				if (sep == NULL || sep == ssp)
 					break;	/* failed or matched null */
 				oldssp = ssp;	/* on to next try */
@@ -481,7 +479,7 @@ dissect(struct match *m,
 				ssp = oldssp;
 			}
 			assert(sep == rest);	/* must exhaust substring */
-			assert(slow(m, ssp, sep, ssub, esub) == rest);
+			assert(walk(m, ssp, sep, ssub, esub, false) == rest);
 			dp = dissect(m, ssp, sep, ssub, esub);
 			assert(dp == sep);
 			sp = rest;
@@ -490,10 +488,10 @@ dissect(struct match *m,
 			stp = stop;
 			for (;;) {
 				/* how long could this one be? */
-				rest = slow(m, sp, stp, ss, es);
+				rest = walk(m, sp, stp, ss, es, false);
 				assert(rest != NULL);	/* it did match */
 				/* could the rest match the rest? */
-				tail = slow(m, rest, stop, es, stopst);
+				tail = walk(m, rest, stop, es, stopst, false);
 				if (tail == stop)
 					break;		/* yes! */
 				/* no -- try a shorter match for this one */
@@ -504,7 +502,7 @@ dissect(struct match *m,
 			esub = ss + OPND(m->g->strip[ss]) - 1;
 			assert(OP(m->g->strip[esub]) == OOR1);
 			for (;;) {	/* find first matching branch */
-				if (slow(m, sp, rest, ssub, esub) == rest)
+				if (walk(m, sp, rest, ssub, esub, false) == rest)
 					break;	/* it matched all of it */
 				/* that one missed, try next one */
 				assert(OP(m->g->strip[esub]) == OOR1);
@@ -757,124 +755,16 @@ backref(struct match *m,
 }
 
 /*
- - fast - step through the string at top speed
- == static const char *fast(struct match *m, const char *start, \
- ==	const char *stop, sopno startst, sopno stopst);
+ - walk - step through the string either quickly or slowly
+ == static const char *walk(struct match *m, const char *start, \
+ ==	const char *stop, sopno startst, sopno stopst, bool fast);
  */
-static const char *		/* where tentative match ended, or NULL */
-fast(	struct match *m,
-	const char *start,
-	const char *stop,
-	sopno startst,
-	sopno stopst)
+static const char * /* where it ended, or NULL */
+walk(struct match *m, const char *start, const char *stop, sopno startst,
+	sopno stopst, bool fast)
 {
 	states st = m->st;
 	states fresh = m->fresh;
-	states tmp = m->tmp;
-	const char *p = start;
-	wint_t c;
-	wint_t lastc;		/* previous c */
-	wint_t flagch;
-	int i;
-	const char *coldp;	/* last p after which no match was underway */
-	size_t clen;
-
-	CLEAR(st);
-	SET1(st, startst);
-	SP("fast", st, *p);
-	st = step(m->g, startst, stopst, st, NOTHING, st);
-	ASSIGN(fresh, st);
-	SP("start", st, *p);
-	coldp = NULL;
-	if (start == m->offp || (start == m->beginp && !(m->eflags&REG_NOTBOL)))
-		c = OUT;
-	else {
-		/*
-		 * XXX Wrong if the previous character was multi-byte.
-		 * Newline never is (in encodings supported by FreeBSD),
-		 * so this only breaks the ISWORD tests below.
-		 */
-		c = (uch)*(start - 1);
-	}
-	for (;;) {
-		/* next character */
-		lastc = c;
-		if (p == m->endp) {
-			clen = 0;
-			c = OUT;
-		} else
-			clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR);
-		if (EQ(st, fresh))
-			coldp = p;
-
-		/* is there an EOL and/or BOL between lastc and c? */
-		flagch = '\0';
-		i = 0;
-		if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
-				(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
-			flagch = BOL;
-			i = m->g->nbol;
-		}
-		if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
-				(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
-			flagch = (flagch == BOL) ? BOLEOL : EOL;
-			i += m->g->neol;
-		}
-		if (i != 0) {
-			for (; i > 0; i--)
-				st = step(m->g, startst, stopst, st, flagch, st);
-			SP("boleol", st, c);
-		}
-
-		/* how about a word boundary? */
-		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
-					(c != OUT && ISWORD(c)) ) {
-			flagch = BOW;
-		}
-		if ( (lastc != OUT && ISWORD(lastc)) &&
-				(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
-			flagch = EOW;
-		}
-		if (flagch == BOW || flagch == EOW) {
-			st = step(m->g, startst, stopst, st, flagch, st);
-			SP("boweow", st, c);
-		}
-
-		/* are we done? */
-		if (ISSET(st, stopst) || p == stop || clen > stop - p)
-			break;		/* NOTE BREAK OUT */
-
-		/* no, we must deal with this character */
-		ASSIGN(tmp, st);
-		ASSIGN(st, fresh);
-		assert(c != OUT);
-		st = step(m->g, startst, stopst, tmp, c, st);
-		SP("aft", st, c);
-		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
-		p += clen;
-	}
-
-	assert(coldp != NULL);
-	m->coldp = coldp;
-	if (ISSET(st, stopst))
-		return(p+XMBRTOWC(NULL, p, stop - p, &m->mbs, 0));
-	else
-		return(NULL);
-}
-
-/*
- - slow - step through the string more deliberately
- == static const char *slow(struct match *m, const char *start, \
- ==	const char *stop, sopno startst, sopno stopst);
- */
-static const char *		/* where it ended */
-slow(	struct match *m,
-	const char *start,
-	const char *stop,
-	sopno startst,
-	sopno stopst)
-{
-	states st = m->st;
 	states empty = m->empty;
 	states tmp = m->tmp;
 	const char *p = start;
@@ -890,6 +780,8 @@ slow(	struct match *m,
 	SET1(st, startst);
 	SP("sstart", st, *p);
 	st = step(m->g, startst, stopst, st, NOTHING, st);
+	if (fast)
+		ASSIGN(fresh, st);
 	matchp = NULL;
 	if (start == m->offp || (start == m->beginp && !(m->eflags&REG_NOTBOL)))
 		c = OUT;
@@ -910,6 +802,9 @@ slow(	struct match *m,
 		} else
 			clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR);
 
+		if (fast && EQ(st, fresh))
+			matchp = p;
+
 		/* is there an EOL and/or BOL between lastc and c? */
 		flagch = '\0';
 		i = 0;
@@ -944,14 +839,21 @@ slow(	struct match *m,
 		}
 
 		/* are we done? */
-		if (ISSET(st, stopst))
-			matchp = p;
+		if (ISSET(st, stopst)) {
+			if (fast)
+				break;
+			else
+				matchp = p;
+		}
 		if (EQ(st, empty) || p == stop || clen > stop - p)
 			break;		/* NOTE BREAK OUT */
 
 		/* no, we must deal with this character */
 		ASSIGN(tmp, st);
-		ASSIGN(st, empty);
+		if (fast)
+			ASSIGN(st, fresh);
+		else
+			ASSIGN(st, empty);
 		assert(c != OUT);
 		st = step(m->g, startst, stopst, tmp, c, st);
 		SP("saft", st, c);
@@ -959,10 +861,17 @@ slow(	struct match *m,
 		p += clen;
 	}
 
-	return(matchp);
+	if (fast) {
+		assert(matchp != NULL);
+		m->coldp = matchp;
+		if (ISSET(st, stopst))
+			return (p + XMBRTOWC(NULL, p, stop - p, &m->mbs, 0));
+		else
+			return (NULL);
+	} else
+		return (matchp);
 }
 
-
 /*
  - step - map set of states reachable before char to set reachable after
  == static states step(struct re_guts *g, sopno start, sopno stop, \
@@ -1173,8 +1082,7 @@ pchar(int ch)
 #endif
 
 #undef	matcher
-#undef	fast
-#undef	slow
+#undef	walk
 #undef	dissect
 #undef	backref
 #undef	step



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201801182213.w0IMDWkG028053>