Date: Sat, 18 Feb 2012 19:37:02 +0000 (UTC) From: Gabor Kovesdan <gabor@FreeBSD.org> To: src-committers@freebsd.org, svn-src-user@freebsd.org Subject: svn commit: r231897 - user/gabor/tre-integration/contrib/tre/lib Message-ID: <201202181937.q1IJb2MU045059@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: gabor Date: Sat Feb 18 19:37:02 2012 New Revision: 231897 URL: http://svn.freebsd.org/changeset/base/231897 Log: Implement multi-pattern heuristic matching. Not yet connected to the build and totally untested. Modified: user/gabor/tre-integration/contrib/tre/lib/mregcomp.c user/gabor/tre-integration/contrib/tre/lib/mregexec.c user/gabor/tre-integration/contrib/tre/lib/tre-mfastmatch.c user/gabor/tre-integration/contrib/tre/lib/tre-mfastmatch.h Modified: user/gabor/tre-integration/contrib/tre/lib/mregcomp.c ============================================================================== --- user/gabor/tre-integration/contrib/tre/lib/mregcomp.c Sat Feb 18 16:54:01 2012 (r231896) +++ user/gabor/tre-integration/contrib/tre/lib/mregcomp.c Sat Feb 18 19:37:02 2012 (r231897) @@ -54,81 +54,102 @@ __weak_reference(tre_mregfree, mregfree) int tre_mcompile(mregex_t *preg, size_t nr, const tre_char_t *wregex[], - size_t wn[], int cflags) + size_t wn[], const char *regex[], size_t n, int cflags) { int ret; - size_t mfrag = 0; tre_char_t **frags; size_t *siz; wmsearch_t *wm; preg->k = nr; + preg->cflags = cflags; preg->patterns = xmalloc(nr * sizeof(regex_t)); if (!preg->patterns) return REG_ESPACE; + /* Compile NFA, BM and heuristic for each pattern. */ for (int i = 0; i < nr; i++) { - ret = tre_compile_nfa(&preg->patterns[i], wregex[i], wn[i], cflags); + ret = tre_compile(&preg->patterns[i], wregex[i], wn[i], + regex[i], n[i], cflags); if (ret != REG_OK) - goto err; - ret = tre_compile_heur(&preg->patterns[i], wregex[i], wn[i], cflags); - if (ret != REG_OK) - goto err; + return ret; } - for (mfrag = 0; mfrag < nr; mfrag++) - for (int j = 0; j < nr; j++) - if (((heur_t)preg->patterns[j]->heur)->arr[mfrag] == NULL) - goto out; -out: - - preg->mfrag = mfrag; - - /* Worst case, not all patterns have a literal prefix */ - if (mfrag == 0) - return REG_OK; - - wm = xmalloc(mfrag * sizeof(wmsearch_t)); - if (!wm) - goto err; + /* If not literal, check if any of them have fixed-length prefix. */ + if (!(cflags & REG_LITERAL)) + for (int i = 0; i < nr; i++) + if ((preg->patterns[i]->heur == NULL) || + (((heur_t)preg->patterns[i]->heur)->arr[0] == NULL)) + { + preg->type = MHEUR_NONE; + goto finish; + } - frags = xmalloc(nr * sizeof(char *)); - if (!frags) - goto err; + /* + * Set frag and siz to point to the fragments to compile and + * their respective sizes. + */ + if (!(cflags & REG_LITERAL)) + { + frags = xmalloc(nr * sizeof(char *)); + if (!frags) + goto err; - siz = xmalloc(nr * sizeof(size_t)); - // XXX: check NULL + siz = xmalloc(nr * sizeof(size_t)); + if (!siz) + goto err; - for (int i = 0; i < mfrag; i++) - { for (int j = 0; j < nr; j++) { - frags[j] = &((heur_t)preg->patterns[j]->heur)->arr[i]; - siz[j] = ((heur_t)preg->patterns[j]->heur)->siz[i]; + frags[j] = &((heur_t)preg->patterns[j]->heur)->arr[0]; + siz[j] = ((heur_t)preg->patterns[j]->heur)->siz[0]; } - ret = tre_wmcomp(&wm[i], nr, frags, siz, cflags); - if (ret != REG_OK) - goto err; + } + else + { + frags = wregex; + siz = wn; } + /* Alloc and compile the fragments for Wu-Manber algorithm. */ + wm = xmalloc(sizeof(wmsearch_t)); + if (!wm) + goto err; + ret = tre_wmcomp(wm, nr, frags, siz, cflags); + if (ret != REG_OK) + goto err; preg->searchdata = wm; - return REG_OK; + /* Set the specific type of matching. */ + if (cflags & REG_LITERAL) + preg->type = MHEUR_LITERAL; + else if (cflags & REG_NEWLINE) + preg->type = MHEUR_LONGEST; + else + preg->type = MHEUR_PREFIX; + + goto finish; err: if (preg->patterns) - xfree(preg->patterns); + { + for (int i = 1; i < nr; i++) + if (preg->patterns[i]) + tre_regfree(preg->patterns[i]); + xfree(preg->patterns); + } if (wm) + xfree(wm); + +finish: + if (!(cflags & REG_LITERAL)) { - for (int i = 0; i < mfrag; i++) - tre_wmfree(&wm[i]); - xfree(wm); - } - if (frags) - xfree(frags); - if (siz) - xfree(siz); + if (frag) + xfree(frag); + if (siz) + xfree(siz); + } return ret; } Modified: user/gabor/tre-integration/contrib/tre/lib/mregexec.c ============================================================================== --- user/gabor/tre-integration/contrib/tre/lib/mregexec.c Sat Feb 18 16:54:01 2012 (r231896) +++ user/gabor/tre-integration/contrib/tre/lib/mregexec.c Sat Feb 18 19:37:02 2012 (r231897) @@ -80,8 +80,230 @@ tre_mmatch(const void *str, size_t len, size_t nmatch, regmatch_t pmatch[], int eflags, const mregex_t *preg) { - - // XXX: Wu-Manber search with specific cases + tre_char_t *str_wide = str; + char *str_byte = str; + int ret; + bool need_offsets; + + need_offsets = (preg->cflags & REG_NOSUB) && (nmatch > 0); + +#define INPUT(pos) ((type == STR_WIDE) ? str_wide[pos] : str_byte[pos]) + + /* + * Worst case: at least one pattern does not have a literal + * prefix so the Wu-Manber algorithm cannot be used to speed + * up the match. There is no trivial best solution either, + * so just try matching for each of the patterns and return + * the earliest. + */ + if (preg->type == MHEUR_NONE) + { + regmatch_t **pm; + int i, first = -1; + + /* Alloc one regmatch_t for each pattern */ + if (need_offsets) + { + pm = xmalloc(preg->k * sizeof(regmatch_t *)); + if (!pm) + return REG_ESPACE; + for (i = 0; i < preg->k; i++) + { + pm[i] = xmalloc(nmatch * sizeof(regmatch_t)); + if (!pm[i]) + goto finish; + } + } + + /* Run matches for each pattern and save first matching offsets. */ + for (i = 0; i < preg->k; i++) + { + ret = tre_match(&preg->patterns[i], str, len, type, + need_offsets ? nmatch : 0, pm[i], eflags); + + /* Mark if there is no match for the pattern. */ + if (!need_offsets) + { + if (ret == REG_OK) + return REG_OK; + } + else if (ret == REG_NOMATCH) + pm[i][0].rm_so = -1; + else if (ret != REG_OK) + goto finish; + } + + if (!need_offsets) + return REG_NOMATCH; + + /* Check whether there has been a match at all. */ + for (i; i < preg->k; i++) + if (pm[i][0].rm_so != -1) + { + first = i; + break; + } + + if (first == -1) + ret = REG_NOMATCH; + + /* If there are matches, find the first one. */ + else + { + for (i = first + 1; i < preg->k; i++) + if ((pm[i][0].rm_so < pm[first][0].rm_so) || (pm[i][0].rm_eo > pm[first][0].rm_eo)) + { + first = i; + } + } + + /* Fill int the offsets before returning. */ + for (i = 0; need_offsets && (i < 0); i++) + { + pmatch[i].rm_so = pm[first][i].rm_so; + pmatch[i].rm_eo = pm[first][i].rm_eo; + pmatch[i].p = i; + } + ret = REG_OK; + +finish: + if (pm) + { + for (i = 0; i < preg->k; i++) + if (pm[i]) + xfree(pm[i]); + xfree(pm); + } + return ret; + } + + /* + * REG_NEWLINE: only searching the longest fragment of each + * pattern and then isolating the line and calling the + * automaton. + */ + else if (preg->type == MHEUR_LONGEST) + { + regmatch_t *pm, rpm; + size_t st = 0, bl, el; + + /* Alloc regmatch_t structures for results */ + if (need_offsets) + { + pm = xmalloc(nmatch * sizeof(regmatch_t *)); + if (!pm) + return REG_ESPACE; + } + + while (st < len) + { + /* Look for a possible match. */ + ret = tre_wmexec(INPUT(st), len, type, 1, &rpm, + eflags, preg->wm); + if (ret != REG_OK) + goto finish; + + /* Need to start from here if this fails. */ + st += rpm.rm_so + 1; + + /* Look for the beginning of the line. */ + for (bl = st; bl > 0; bl--) + if ((type == STR_WIDE) ? (str_wide[bl] == TRE_CHAR('\n')) + (str_byte[bl] == '\n') + break; + + /* Look for the end of the line. */ + for (el = st; el < len; el++) + if ((type == STR_WIDE) ? (str_wide[el] == TRE_CHAR('\n')) + (str_byte[el] == '\n') + break; + + /* Try to match the pattern on the line. */ + ret = tre_match(&preg->patterns[rpm.p], INPUT(bl), el - bl, + type, need_offsets ? nmatch : 0, &pm, eflags); + + /* Evaluate result. */ + if (ret == REG_NOMATCH) + continue; + else if (ret == REG_OK) + { + if (!need_offsets) + return REG_OK; + else + { + for (int i = 0; i < nmatch; i++) + { + pmatch[i].rm_so = pm[i].rm_so; + pmatch[i].rm_eo = pm[i].rm_eo; + pmatch[i].p = rpm.p; + goto finish; + } + } + } + else + goto finish; + } +finish: + if (!pm) + xfree(pm) + return ret; + } + + /* + * Literal case. It is enough to search with Wu-Manber and immediately + * return the match. + */ + else if (preg->type == MHEUR_LITERAL) + { + return tre_wmexec(str, len, type, nmatch, pmatch, eflags, preg->wm); + } + + /* + * General case. Look for the beginning of any of the patterns with the + * Wu-Manber algorithm and try to match from there with the automaton. + */ + else + { + regmatch_t *pm, rpm; + size_t st = 0; + + /* Alloc regmatch_t structures for results */ + if (need_offsets) + { + pm = xmalloc(nmatch * sizeof(regmatch_t *)); + if (!pm) + return REG_ESPACE; + } + + while (st < len) + { + ret = tre_wmexec(INPUT(st), len, type, nmatch, &rpm, eflags, preg->wm); + if (ret != REG_OK) + return ret; + + ret = tre_match(&preg->patterns[rpm.p], INPUT(rpm.rm_so), + len - rpm.rm_so, type, need_offsets ? nmatch : 0, + pm, eflags); + if ((ret == REG_OK) && (pm[0].rm_so == 0)) + { + for (int i = 0; i < nmatch; i++) + { + pm[i].rm_so += st; + pm[i].rm_eo += eo; + pm[i].p = rpm.p; + } + goto finish; + } + else if ((ret != REG_NOMATCH) || (ret != REG_OK)) + goto finish; + st += pm1.rm_so + 1; + } + +finish: + if (pm) + xfree(pm); + return ret; + } } Modified: user/gabor/tre-integration/contrib/tre/lib/tre-mfastmatch.c ============================================================================== --- user/gabor/tre-integration/contrib/tre/lib/tre-mfastmatch.c Sat Feb 18 16:54:01 2012 (r231896) +++ user/gabor/tre-integration/contrib/tre/lib/tre-mfastmatch.c Sat Feb 18 19:37:02 2012 (r231897) @@ -232,11 +232,14 @@ fail: #define MATCH(beg, end, p) \ do \ { \ - match->rm_so = beg; \ - match->rm_eo = end; \ - match->p = p; \ - err = REG_OK; \ - goto finish; \ + if (!(preg->cflags & REG_NOSUB) && (nmatch > 0)) \ + { \ + pmatch->rm_so = beg; \ + pmatch->rm_eo = end; \ + pmatch->p = p; \ + err = REG_OK; \ + goto finish; \ + } \ } while (0); #define _WMSEARCH(data, pats, sizes, mlen, tbl, dshift) \ @@ -292,7 +295,7 @@ fail: int tre_wmexec(const void *str, size_t len, tre_str_type_t type, size_t nmatch, regmatch_t pmatch[], int eflags, - const wmsearch_t *wm, regmatch_t *match) + const wmsearch_t *wm) { wmentry_t *s_entry, *p_entry; tre_char_t *wide_str = str; Modified: user/gabor/tre-integration/contrib/tre/lib/tre-mfastmatch.h ============================================================================== --- user/gabor/tre-integration/contrib/tre/lib/tre-mfastmatch.h Sat Feb 18 16:54:01 2012 (r231896) +++ user/gabor/tre-integration/contrib/tre/lib/tre-mfastmatch.h Sat Feb 18 19:37:02 2012 (r231897) @@ -9,7 +9,13 @@ #define WM_MAXPAT 64 +#define MHEUR_NONE 0 +#define MHEUR_PREFIX 1 +#define MHEUR_LONGEST 2 +#define MHEUR_LITERAL 3 + typedef struct { + int cflags; char **pat; /* Patterns */ size_t *siz; /* Pattern sizes */ size_t n; /* No of patterns */
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201202181937.q1IJb2MU045059>