From owner-svn-src-head@freebsd.org Thu Jul 6 17:01:52 2017 Return-Path: Delivered-To: svn-src-head@mailman.ysv.freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:1900:2254:206a::19:1]) by mailman.ysv.freebsd.org (Postfix) with ESMTP id B5AC2D87FF6; Thu, 6 Jul 2017 17:01:52 +0000 (UTC) (envelope-from kevans@FreeBSD.org) Received: from repo.freebsd.org (repo.freebsd.org [IPv6:2610:1c1:1:6068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mx1.freebsd.org (Postfix) with ESMTPS id 847AA7737A; Thu, 6 Jul 2017 17:01:52 +0000 (UTC) (envelope-from kevans@FreeBSD.org) Received: from repo.freebsd.org ([127.0.1.37]) by repo.freebsd.org (8.15.2/8.15.2) with ESMTP id v66H1pvt005365; Thu, 6 Jul 2017 17:01:51 GMT (envelope-from kevans@FreeBSD.org) Received: (from kevans@localhost) by repo.freebsd.org (8.15.2/8.15.2/Submit) id v66H1pLw005363; Thu, 6 Jul 2017 17:01:51 GMT (envelope-from kevans@FreeBSD.org) Message-Id: <201707061701.v66H1pLw005363@repo.freebsd.org> X-Authentication-Warning: repo.freebsd.org: kevans set sender to kevans@FreeBSD.org using -f From: Kyle Evans Date: Thu, 6 Jul 2017 17:01:51 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r320742 - head/lib/libc/regex X-SVN-Group: head X-SVN-Commit-Author: kevans X-SVN-Commit-Paths: head/lib/libc/regex X-SVN-Commit-Revision: 320742 X-SVN-Commit-Repository: base MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-head@freebsd.org X-Mailman-Version: 2.1.23 Precedence: list List-Id: SVN commit messages for the src tree for head/-current List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Thu, 06 Jul 2017 17:01:52 -0000 Author: kevans Date: Thu Jul 6 17:01:51 2017 New Revision: 320742 URL: https://svnweb.freebsd.org/changeset/base/320742 Log: The impending libregex will implement GNU extensions to bring BREs and EREs closer together. Prepare for this and reduce the diff of libregex changes by refactoring and combining the top-level parsers for EREs/BREs ahead of time. Branching functionality has been split out to make it easier to follow the combined version of the top-level parser. It may also be enabled in the parsing context to make it easier when libregex enables branching for BREs. A branching context was also added for the various branching functions and so that BREs, for instance, can determine if they're the first expression in a chain of expressions within the current branch and treat '*' as ordinary if so. This should have no functional impact and negligible performance impact. Reviewed by: cem, emaste, pfg Approved by: emaste (mentor) MFC after: 1 month Differential Revision: https://reviews.freebsd.org/D10920 Modified: head/lib/libc/regex/regcomp.c head/lib/libc/regex/regex2.h Modified: head/lib/libc/regex/regcomp.c ============================================================================== --- head/lib/libc/regex/regcomp.c Thu Jul 6 15:27:34 2017 (r320741) +++ head/lib/libc/regex/regcomp.c Thu Jul 6 17:01:51 2017 (r320742) @@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -62,6 +63,24 @@ __FBSDID("$FreeBSD$"); #include "cname.h" /* + * Branching context, used to keep track of branch state for all of the branch- + * aware functions. In addition to keeping track of branch positions for the + * p_branch_* functions, we use this to simplify some clumsiness in BREs for + * detection of whether ^ is acting as an anchor or being used erroneously and + * also for whether we're in a sub-expression or not. + */ +struct branchc { + sopno start; + sopno back; + sopno fwd; + + int nbranch; + int nchain; + bool outer; + bool terminate; +}; + +/* * parse structure, passed up and down to avoid global variables and * other clumsinesses */ @@ -77,6 +96,11 @@ struct parse { # define NPAREN 10 /* we need to remember () 1-9 for back refs */ sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ sopno pend[NPAREN]; /* -> ) ([0] unused) */ + bool allowbranch; /* can this expression branch? */ + bool bre; /* convenience; is this a BRE? */ + bool (*parse_expr)(struct parse *, struct branchc *); + void (*pre_parse)(struct parse *, struct branchc *); + void (*post_parse)(struct parse *, struct branchc *); }; /* ========= begin header generated by ./mkh ========= */ @@ -85,11 +109,17 @@ extern "C" { #endif /* === regcomp.c === */ -static void p_ere(struct parse *p, int stop); -static void p_ere_exp(struct parse *p); +static bool p_ere_exp(struct parse *p, struct branchc *bc); static void p_str(struct parse *p); -static void p_bre(struct parse *p, int end1, int end2); -static int p_simp_re(struct parse *p, int starordinary); +static int p_branch_eat_delim(struct parse *p, struct branchc *bc); +static void p_branch_ins_offset(struct parse *p, struct branchc *bc); +static void p_branch_fix_tail(struct parse *p, struct branchc *bc); +static void p_branch_empty(struct parse *p, struct branchc *bc); +static bool p_branch_do(struct parse *p, struct branchc *bc); +static void p_bre_pre_parse(struct parse *p, struct branchc *bc); +static void p_bre_post_parse(struct parse *p, struct branchc *bc); +static void p_re(struct parse *p, int end1, int end2); +static bool p_simp_re(struct parse *p, struct branchc *bc); static int p_count(struct parse *p); static void p_bracket(struct parse *p); static void p_b_term(struct parse *p, cset *cs); @@ -139,6 +169,7 @@ static char nuls[10]; /* place to point scanner in ev #define MORE2() (p->next+1 < p->end) #define SEE(c) (MORE() && PEEK() == (c)) #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) +#define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a)) #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) #define NEXT() (p->next++) @@ -247,6 +278,19 @@ regcomp(regex_t * __restrict preg, p->pbegin[i] = 0; p->pend[i] = 0; } + if (cflags & REG_EXTENDED) { + p->allowbranch = true; + p->bre = false; + p->parse_expr = p_ere_exp; + p->pre_parse = NULL; + p->post_parse = NULL; + } else { + p->allowbranch = false; + p->bre = true; + p->parse_expr = p_simp_re; + p->pre_parse = p_bre_pre_parse; + p->post_parse = p_bre_post_parse; + } g->sets = NULL; g->ncsets = 0; g->cflags = cflags; @@ -264,12 +308,10 @@ regcomp(regex_t * __restrict preg, /* do it */ EMIT(OEND, 0); g->firststate = THERE(); - if (cflags®_EXTENDED) - p_ere(p, OUT); - else if (cflags®_NOSPEC) + if (cflags & REG_NOSPEC) p_str(p); else - p_bre(p, OUT, OUT); + p_re(p, OUT, OUT); EMIT(OEND, 0); g->laststate = THERE(); @@ -305,58 +347,14 @@ regcomp(regex_t * __restrict preg, } /* - - p_ere - ERE parser top level, concatenation and alternation - == static void p_ere(struct parse *p, int_t stop); + - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op, + - return whether we should terminate or not + == static bool p_ere_exp(struct parse *p); */ -static void -p_ere(struct parse *p, - int stop) /* character this ERE should end at */ +static bool +p_ere_exp(struct parse *p, struct branchc *bc) { char c; - sopno prevback; - sopno prevfwd; - sopno conc; - int first = 1; /* is this the first alternative? */ - - for (;;) { - /* do a bunch of concatenated expressions */ - conc = HERE(); - while (MORE() && (c = PEEK()) != '|' && c != stop) - p_ere_exp(p); - (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ - - if (!EAT('|')) - break; /* NOTE BREAK OUT */ - - if (first) { - INSERT(OCH_, conc); /* offset is wrong */ - prevfwd = conc; - prevback = conc; - first = 0; - } - ASTERN(OOR1, prevback); - prevback = THERE(); - AHEAD(prevfwd); /* fix previous offset */ - prevfwd = HERE(); - EMIT(OOR2, 0); /* offset is very wrong */ - } - - if (!first) { /* tail-end fixups */ - AHEAD(prevfwd); - ASTERN(O_CH, prevback); - } - - assert(!MORE() || SEE(stop)); -} - -/* - - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op - == static void p_ere_exp(struct parse *p); - */ -static void -p_ere_exp(struct parse *p) -{ - char c; wint_t wc; sopno pos; int count; @@ -377,7 +375,7 @@ p_ere_exp(struct parse *p) p->pbegin[subno] = HERE(); EMIT(OLPAREN, subno); if (!SEE(')')) - p_ere(p, ')'); + p_re(p, ')', IGN); if (subno < NPAREN) { p->pend[subno] = HERE(); assert(p->pend[subno] != 0); @@ -445,7 +443,7 @@ p_ere_exp(struct parse *p) /* FALLTHROUGH */ default: if (p->error != 0) - return; + return (false); p->next--; wc = WGETNEXT(); ordinary(p, wc); @@ -453,12 +451,12 @@ p_ere_exp(struct parse *p) } if (!MORE()) - return; + return (false); c = PEEK(); /* we call { a repetition if followed by a digit */ if (!( c == '*' || c == '+' || c == '?' || (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) - return; /* no repetition, we're done */ + return (false); /* no repetition, we're done */ NEXT(); (void)REQUIRE(!wascaret, REG_BADRPT); @@ -504,12 +502,13 @@ p_ere_exp(struct parse *p) } if (!MORE()) - return; + return (false); c = PEEK(); if (!( c == '*' || c == '+' || c == '?' || (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) - return; + return (false); SETERROR(REG_BADRPT); + return (false); } /* @@ -525,50 +524,177 @@ p_str(struct parse *p) } /* - - p_bre - BRE parser top level, anchoring and concatenation - == static void p_bre(struct parse *p, int end1, \ - == int end2); - * Giving end1 as OUT essentially eliminates the end1/end2 check. - * - * This implementation is a bit of a kludge, in that a trailing $ is first - * taken as an ordinary character and then revised to be an anchor. - * The amount of lookahead needed to avoid this kludge is excessive. + * Eat consecutive branch delimiters for the kind of expression that we are + * parsing, return the number of delimiters that we ate. */ +static int +p_branch_eat_delim(struct parse *p, struct branchc *bc) +{ + int nskip; + + nskip = 0; + while (EAT('|')) + ++nskip; + return (nskip); +} + +/* + * Insert necessary branch book-keeping operations. This emits a + * bogus 'next' offset, since we still have more to parse + */ static void -p_bre(struct parse *p, - int end1, /* first terminating character */ - int end2) /* second terminating character */ +p_branch_ins_offset(struct parse *p, struct branchc *bc) { - sopno start = HERE(); - int first = 1; /* first subexpression? */ - int wasdollar = 0; + if (bc->nbranch == 0) { + INSERT(OCH_, bc->start); /* offset is wrong */ + bc->fwd = bc->start; + bc->back = bc->start; + } + + ASTERN(OOR1, bc->back); + bc->back = THERE(); + AHEAD(bc->fwd); /* fix previous offset */ + bc->fwd = HERE(); + EMIT(OOR2, 0); /* offset is very wrong */ + ++bc->nbranch; +} + +/* + * Fix the offset of the tail branch, if we actually had any branches. + * This is to correct the bogus placeholder offset that we use. + */ +static void +p_branch_fix_tail(struct parse *p, struct branchc *bc) +{ + + /* Fix bogus offset at the tail if we actually have branches */ + if (bc->nbranch > 0) { + AHEAD(bc->fwd); + ASTERN(O_CH, bc->back); + } +} + +/* + * Signal to the parser that an empty branch has been encountered; this will, + * in the future, be used to allow for more permissive behavior with empty + * branches. + */ +static void +p_branch_empty(struct parse *p, struct branchc *bc) +{ + + SETERROR(REG_EMPTY); +} + +/* + * Take care of any branching requirements. This includes inserting the + * appropriate branching instructions as well as eating all of the branch + * delimiters until we either run out of pattern or need to parse more pattern. + */ +static bool +p_branch_do(struct parse *p, struct branchc *bc) +{ + int ate = 0; + + ate = p_branch_eat_delim(p, bc); + if (ate == 0) + return (false); + (void)REQUIRE(ate == 1 && (!bc->outer || MORE()), REG_EMPTY); + p_branch_ins_offset(p, bc); + + return (true); +} + +static void +p_bre_pre_parse(struct parse *p, struct branchc *bc) +{ + + (void) bc; + /* + * Does not move cleanly into expression parser because of + * ordinary interpration of * at the beginning position of + * an expression. + */ if (EAT('^')) { EMIT(OBOL, 0); p->g->iflags |= USEBOL; p->g->nbol++; } - while (MORE() && !SEETWO(end1, end2)) { - wasdollar = p_simp_re(p, first); - first = 0; - } - if (wasdollar) { /* oops, that was a trailing anchor */ +} + +static void +p_bre_post_parse(struct parse *p, struct branchc *bc) +{ + + /* Expression is terminating due to EOL token */ + if (bc->terminate) { DROP(1); EMIT(OEOL, 0); p->g->iflags |= USEEOL; p->g->neol++; } +} - (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ +/* + - p_re - Top level parser, concatenation and BRE anchoring + == static void p_re(struct parse *p, int end1, int end2); + * Giving end1 as OUT essentially eliminates the end1/end2 check. + * + * This implementation is a bit of a kludge, in that a trailing $ is first + * taken as an ordinary character and then revised to be an anchor. + * The amount of lookahead needed to avoid this kludge is excessive. + */ +static void +p_re(struct parse *p, + int end1, /* first terminating character */ + int end2) /* second terminating character; ignored for EREs */ +{ + struct branchc bc; + + bc.nbranch = 0; + if (end1 == OUT && end2 == OUT) + bc.outer = true; + else + bc.outer = false; +#define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2)) + for (;;) { + bc.start = HERE(); + bc.nchain = 0; + bc.terminate = false; + if (p->pre_parse != NULL) + p->pre_parse(p, &bc); + while (MORE() && !SEESPEC('|') && !SEEEND()) { + bc.terminate = p->parse_expr(p, &bc); + ++bc.nchain; + } + if (p->post_parse != NULL) + p->post_parse(p, &bc); + (void) REQUIRE(HERE() != bc.start, REG_EMPTY); + if (!p->allowbranch) + break; + /* + * p_branch_do's return value indicates whether we should + * continue parsing or not. This is both for correctness and + * a slight optimization, because it will check if we've + * encountered an empty branch or the end of the string + * immediately following a branch delimiter. + */ + if (!p_branch_do(p, &bc)) + break; + } +#undef SEE_END + if (p->allowbranch) + p_branch_fix_tail(p, &bc); + assert(!MORE() || SEE(end1)); } /* - p_simp_re - parse a simple RE, an atom possibly followed by a repetition - == static int p_simp_re(struct parse *p, int starordinary); + == static bool p_simp_re(struct parse *p, struct branchc *bc); */ -static int /* was the simple RE an unbackslashed $? */ -p_simp_re(struct parse *p, - int starordinary) /* is a leading * an ordinary character? */ +static bool /* was the simple RE an unbackslashed $? */ +p_simp_re(struct parse *p, struct branchc *bc) { int c; int count; @@ -614,7 +740,7 @@ p_simp_re(struct parse *p, EMIT(OLPAREN, subno); /* the MORE here is an error heuristic */ if (MORE() && !SEETWO('\\', ')')) - p_bre(p, '\\', ')'); + p_re(p, '\\', ')'); if (subno < NPAREN) { p->pend[subno] = HERE(); assert(p->pend[subno] != 0); @@ -650,11 +776,16 @@ p_simp_re(struct parse *p, p->g->backrefs = 1; break; case '*': - (void)REQUIRE(starordinary, REG_BADRPT); + /* + * Ordinary if used as the first character beyond BOL anchor of + * a (sub-)expression, counts as a bad repetition operator if it + * appears otherwise. + */ + (void)REQUIRE(bc->nchain == 0, REG_BADRPT); /* FALLTHROUGH */ default: if (p->error != 0) - return(0); /* Definitely not $... */ + return (false); /* Definitely not $... */ p->next--; wc = WGETNEXT(); ordinary(p, wc); @@ -685,9 +816,9 @@ p_simp_re(struct parse *p, SETERROR(REG_BADBR); } } else if (c == '$') /* $ (but not \$) ends it */ - return(1); + return (true); - return(0); + return (false); } /* Modified: head/lib/libc/regex/regex2.h ============================================================================== --- head/lib/libc/regex/regex2.h Thu Jul 6 15:27:34 2017 (r320741) +++ head/lib/libc/regex/regex2.h Thu Jul 6 17:01:51 2017 (r320742) @@ -189,4 +189,5 @@ struct re_guts { /* misc utilities */ #define OUT (CHAR_MIN - 1) /* a non-character value */ +#define IGN (CHAR_MIN - 2) #define ISWORD(c) (iswalnum((uch)(c)) || (c) == '_')