Date: Tue, 20 Sep 2011 21:54:43 +0000 (UTC)
From: Gabor Kovesdan <gabor@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-user@freebsd.org
Subject: svn commit: r225702 - in user/gabor/grep/trunk: . regex
Message-ID: <201109202154.p8KLsh89046251@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: gabor Date: Tue Sep 20 21:54:43 2011 New Revision: 225702 URL: http://svn.freebsd.org/changeset/base/225702 Log: - Merge improvements from TRE Modified: user/gabor/grep/trunk/Makefile user/gabor/grep/trunk/regex/fastmatch.c user/gabor/grep/trunk/regex/glue.h user/gabor/grep/trunk/regex/tre-fastmatch.c Modified: user/gabor/grep/trunk/Makefile ============================================================================== --- user/gabor/grep/trunk/Makefile Tue Sep 20 21:53:46 2011 (r225701) +++ user/gabor/grep/trunk/Makefile Tue Sep 20 21:54:43 2011 (r225702) @@ -17,7 +17,7 @@ SRCS= file.c grep.c queue.c util.c # Extra files ported backported form some regex improvements .PATH: ${.CURDIR}/regex -SRCS+= fastmatch.c hashtable.c tre-fastmatch.c xmalloc.c +SRCS+= fastmatch.c hashtable.c tre-compile.c tre-fastmatch.c xmalloc.c CFLAGS+=-I${.CURDIR}/regex .if ${MK_BSD_GREP} == "yes" Modified: user/gabor/grep/trunk/regex/fastmatch.c ============================================================================== --- user/gabor/grep/trunk/regex/fastmatch.c Tue Sep 20 21:53:46 2011 (r225701) +++ user/gabor/grep/trunk/regex/fastmatch.c Tue Sep 20 21:54:43 2011 (r225702) @@ -36,63 +36,6 @@ #include "tre-fastmatch.h" #include "xmalloc.h" -/* XXX: avoid duplication */ -#define CONV_PAT \ - { \ - wregex = xmalloc(sizeof(tre_char_t) * (n + 1)); \ - if (wregex == NULL) \ - return REG_ESPACE; \ - \ - if (TRE_MB_CUR_MAX == 1) \ - { \ - unsigned int i; \ - const unsigned char *str = (const unsigned char *)regex; \ - tre_char_t *wstr = wregex; \ - \ - for (i = 0; i < n; i++) \ - *(wstr++) = *(str++); \ - wlen = n; \ - } \ - else \ - { \ - int consumed; \ - tre_char_t *wcptr = wregex; \ - mbstate_t state; \ - memset(&state, '\0', sizeof(state)); \ - while (n > 0) \ - { \ - consumed = tre_mbrtowc(wcptr, regex, n, &state); \ - \ - switch (consumed) \ - { \ - case 0: \ - if (*regex == '\0') \ - consumed = 1; \ - else \ - { \ - xfree(wregex); \ - return REG_BADPAT; \ - } \ - break; \ - 
case -1: \ - DPRINT(("mbrtowc: error %d: %s.\n", errno, \ - strerror(errno))); \ - xfree(wregex); \ - return REG_BADPAT; \ - case -2: \ - consumed = n; \ - break; \ - } \ - regex += consumed; \ - n -= consumed; \ - wcptr++; \ - } \ - wlen = wcptr - wregex; \ - } \ - \ - wregex[wlen] = L'\0'; \ - } - int tre_fixncomp(fastmatch_t *preg, const char *regex, size_t n, int cflags) { @@ -101,14 +44,17 @@ tre_fixncomp(fastmatch_t *preg, const ch size_t wlen; if (n != 0) - CONV_PAT + { + ret = tre_convert_pattern(regex, n, &wregex, &wlen); + if (ret != REG_OK) + return ret; + else + ret = tre_compile_literal(preg, wregex, wlen, cflags); + tre_free_pattern(wregex); + return ret; + } else return tre_compile_literal(preg, NULL, 0, cflags); - - ret = tre_compile_literal(preg, wregex, wlen, cflags); - xfree(wregex); - - return ret; } int @@ -119,16 +65,19 @@ tre_fastncomp(fastmatch_t *preg, const c size_t wlen; if (n != 0) - CONV_PAT + { + ret = tre_convert_pattern(regex, n, &wregex, &wlen); + if (ret != REG_OK) + return ret; + else + ret = (cflags & REG_LITERAL) + ? tre_compile_literal(preg, wregex, wlen, cflags) + : tre_compile_fast(preg, wregex, wlen, cflags); + tre_free_pattern(wregex); + return ret; + } else return tre_compile_literal(preg, NULL, 0, cflags); - - ret = (cflags & REG_LITERAL) ? 
- tre_compile_literal(preg, wregex, wlen, cflags) : - tre_compile_fast(preg, wregex, wlen, cflags); - xfree(wregex); - - return ret; } @@ -176,30 +125,6 @@ tre_fastfree(fastmatch_t *preg) tre_free_fast(preg); } -/* XXX: avoid duplication */ -#define ADJUST_OFFSETS \ - { \ - size_t slen = (size_t)(pmatch[0].rm_eo - pmatch[0].rm_so); \ - size_t offset = pmatch[0].rm_so; \ - int ret; \ - \ - if ((pmatch[0].rm_so < 0) || (pmatch[0].rm_eo < 0)) \ - return REG_NOMATCH; \ - if ((len != (unsigned)-1) && ((unsigned long)pmatch[0].rm_eo > len))\ - return REG_NOMATCH; \ - if ((long long)pmatch[0].rm_eo - pmatch[0].rm_so < 0) \ - return REG_NOMATCH; \ - ret = tre_match_fast(preg, &string[offset], slen, type, nmatch, \ - pmatch, eflags); \ - for (unsigned i = 0; (i == 0) || (!(eflags & REG_NOSUB) && \ - (i < nmatch)); i++) \ - { \ - pmatch[i].rm_so += offset; \ - pmatch[i].rm_eo += offset; \ - } \ - return ret; \ - } - int tre_fastnexec(const fastmatch_t *preg, const char *string, size_t len, size_t nmatch, regmatch_t pmatch[], int eflags) @@ -207,7 +132,8 @@ tre_fastnexec(const fastmatch_t *preg, c tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? 
STR_BYTE : STR_MBS; if (eflags & REG_STARTEND) - ADJUST_OFFSETS + CALL_WITH_OFFSET(tre_match_fast(preg, &string[offset], slen, + type, nmatch, pmatch, eflags)); else return tre_match_fast(preg, string, len, type, nmatch, pmatch, eflags); @@ -227,7 +153,8 @@ tre_fastwnexec(const fastmatch_t *preg, tre_str_type_t type = STR_WIDE; if (eflags & REG_STARTEND) - ADJUST_OFFSETS + CALL_WITH_OFFSET(tre_match_fast(preg, &string[offset], slen, + type, nmatch, pmatch, eflags)); else return tre_match_fast(preg, string, len, type, nmatch, pmatch, eflags); Modified: user/gabor/grep/trunk/regex/glue.h ============================================================================== --- user/gabor/grep/trunk/regex/glue.h Tue Sep 20 21:53:46 2011 (r225701) +++ user/gabor/grep/trunk/regex/glue.h Tue Sep 20 21:54:43 2011 (r225702) @@ -11,6 +11,7 @@ #define TRE_WCHAR 1 #define TRE_MULTIBYTE 1 +#define HAVE_MBSTATE_T 1 #define TRE_CHAR(n) L##n @@ -37,4 +38,29 @@ #define MAX(a,b) ((a > b) ? (a) : (b)) typedef enum { STR_WIDE, STR_BYTE, STR_MBS, STR_USER } tre_str_type_t; + +#define CALL_WITH_OFFSET(fn) \ + do \ + { \ + size_t slen = (size_t)(pmatch[0].rm_eo - pmatch[0].rm_so); \ + size_t offset = pmatch[0].rm_so; \ + int ret; \ + \ + if ((long long)pmatch[0].rm_eo - pmatch[0].rm_so < 0) \ + return REG_NOMATCH; \ + ret = fn; \ + for (unsigned i = 0; (!(eflags & REG_NOSUB) && (i < nmatch)); i++)\ + { \ + pmatch[i].rm_so += offset; \ + pmatch[i].rm_eo += offset; \ + } \ + return ret; \ + } while (0 /*CONSTCOND*/) + +int +tre_convert_pattern(const char *regex, size_t n, tre_char_t **w, + size_t *wn); + +void +tre_free_pattern(tre_char_t *wregex); #endif Modified: user/gabor/grep/trunk/regex/tre-fastmatch.c ============================================================================== --- user/gabor/grep/trunk/regex/tre-fastmatch.c Tue Sep 20 21:53:46 2011 (r225701) +++ user/gabor/grep/trunk/regex/tre-fastmatch.c Tue Sep 20 21:54:43 2011 (r225702) @@ -42,8 +42,8 @@ #include "tre-fastmatch.h" 
#include "xmalloc.h" -static int fastcmp(const void *, const bool *, const void *, size_t, - tre_str_type_t, bool, bool); +static int fastcmp(const fastmatch_t *fg, const void *data, + tre_str_type_t type); /* * Clean up if pattern compilation fails. @@ -97,24 +97,6 @@ static int fastcmp(const void *, const b fg->pattern[siz] = '\0'; \ } \ -/* - * Compares the pattern to the input string at the position - * stored in startptr. - */ -#define COMPARE \ - switch (type) \ - { \ - case STR_WIDE: \ - mismatch = fastcmp(fg->wpattern, fg->wescmap, startptr, \ - fg->wlen, type, \ - fg->icase, fg->newline); \ - break; \ - default: \ - mismatch = fastcmp(fg->pattern, fg->escmap, startptr, \ - fg->len, type, \ - fg->icase, fg->newline); \ - } \ - #define IS_OUT_OF_BOUNDS \ ((!fg->reversed \ ? ((type == STR_WIDE) ? ((j + fg->wlen) > len) \ @@ -154,7 +136,7 @@ static int fastcmp(const void *, const b gs = fg->bmGs[mismatch]; \ } \ bc = (r == HASH_OK) ? bc : fg->defBc; \ - DPRINT(("tre_fast_match: mismatch on character %lc, " \ + DPRINT(("tre_fast_match: mismatch on character" CHF ", " \ "BC %d, GS %d\n", \ ((const tre_char_t *)startptr)[mismatch + 1], \ bc, gs)); \ @@ -297,7 +279,7 @@ static int fastcmp(const void *, const b r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k); \ if ((r == HASH_FAIL) || (r == HASH_FULL)) \ FAIL_COMP(REG_ESPACE); \ - DPRINT(("BC shift for wide char %lc is %d\n", fg->wpattern[i], \ + DPRINT(("BC shift for wide char " CHF " is %d\n", fg->wpattern[i],\ k)); \ if (fg->icase) \ { \ @@ -306,7 +288,7 @@ static int fastcmp(const void *, const b r = hashtable_put(fg->qsBc_table, &wc, &k); \ if ((r == HASH_FAIL) || (r == HASH_FULL)) \ FAIL_COMP(REG_ESPACE); \ - DPRINT(("BC shift for wide char %lc is %d\n", wc, k)); \ + DPRINT(("BC shift for wide char " CHF " is %d\n", wc, k)); \ } \ } @@ -327,7 +309,7 @@ static int fastcmp(const void *, const b r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k); \ if ((r == HASH_FAIL) || (r == HASH_FULL)) \ 
FAIL_COMP(REG_ESPACE); \ - DPRINT(("Reverse BC shift for wide char %lc is %d\n", \ + DPRINT(("Reverse BC shift for wide char " CHF " is %d\n", \ fg->wpattern[i], k)); \ if (fg->icase) \ { \ @@ -336,7 +318,8 @@ static int fastcmp(const void *, const b r = hashtable_put(fg->qsBc_table, &wc, &k); \ if ((r == HASH_FAIL) || (r == HASH_FULL)) \ FAIL_COMP(REG_ESPACE); \ - DPRINT(("Reverse BC shift for wide char %lc is %d\n", wc, k));\ + DPRINT(("Reverse BC shift for wide char " CHF " is %d\n", wc, \ + k)); \ } \ } @@ -853,7 +836,7 @@ badpat: */ int tre_match_fast(const fastmatch_t *fg, const void *data, size_t len, - tre_str_type_t type, int nmatch __unused, regmatch_t pmatch[], int eflags) + tre_str_type_t type, int nmatch, regmatch_t pmatch[], int eflags) { unsigned int shift, u = 0, v = 0; ssize_t j = 0; @@ -878,7 +861,7 @@ tre_match_fast(const fastmatch_t *fg, co /* Shortcut for empty pattern */ if (fg->matchall) { - if (!fg->nosub) + if (!fg->nosub && nmatch >= 1) { pmatch[0].rm_so = 0; pmatch[0].rm_eo = len; @@ -932,12 +915,12 @@ tre_match_fast(const fastmatch_t *fg, co /* Determine where in data to start search at. */ j = fg->eol ? len - (type == STR_WIDE ? fg->wlen : fg->len) : 0; SKIP_CHARS(j); - COMPARE; + mismatch = fastcmp(fg, startptr, type); if (mismatch == REG_OK) { if (fg->word && !IS_ON_WORD_BOUNDARY) return ret; - if (!fg->nosub) + if (!fg->nosub && nmatch >= 1) { pmatch[0].rm_so = j; pmatch[0].rm_eo = j + (type == STR_WIDE ? fg->wlen : fg->len); @@ -952,7 +935,7 @@ tre_match_fast(const fastmatch_t *fg, co do { SKIP_CHARS(j); - COMPARE; + mismatch = fastcmp(fg, startptr, type); if (mismatch == REG_OK) { if (fg->word) @@ -961,7 +944,7 @@ tre_match_fast(const fastmatch_t *fg, co CHECK_BOL_ANCHOR; if (fg->eol) CHECK_EOL_ANCHOR; - if (!fg->nosub) + if (!fg->nosub && nmatch >= 1) { pmatch[0].rm_so = j; pmatch[0].rm_eo = j + ((type == STR_WIDE) ? 
fg->wlen : fg->len); @@ -1008,14 +991,15 @@ tre_free_fast(fastmatch_t *fg) * REG_OK on success */ static inline int -fastcmp(const void *pat, const bool *escmap, const void *data, size_t len, - tre_str_type_t type, bool icase, bool newline) +fastcmp(const fastmatch_t *fg, const void *data, tre_str_type_t type) { const char *str_byte = data; - const char *pat_byte = pat; - int ret = REG_OK; + const char *pat_byte = fg->pattern; const tre_char_t *str_wide = data; - const tre_char_t *pat_wide = pat; + const tre_char_t *pat_wide = fg->wpattern; + const bool *escmap = (type == STR_WIDE) ? fg->wescmap : fg->escmap; + size_t len = (type == STR_WIDE) ? fg->wlen : fg->len; + int ret = REG_OK; /* Compare the pattern and the input char-by-char from the last position. */ for (int i = len - 1; i >= 0; i--) { @@ -1024,23 +1008,25 @@ fastcmp(const void *pat, const bool *esc case STR_WIDE: /* Check dot */ - if (pat_wide[i] == TRE_CHAR('.') && (!escmap || !escmap[i]) && - (!newline || (str_wide[i] != TRE_CHAR('\n')))) + if (fg->hasdot && pat_wide[i] == TRE_CHAR('.') && + (!escmap || !escmap[i]) && + (!fg->newline || (str_wide[i] != TRE_CHAR('\n')))) continue; /* Compare */ - if (icase ? (towlower(pat_wide[i]) == towlower(str_wide[i])) + if (fg->icase ? (towlower(pat_wide[i]) == towlower(str_wide[i])) : (pat_wide[i] == str_wide[i])) continue; break; default: /* Check dot */ - if (pat_byte[i] == '.' && (!escmap || !escmap[i]) && - (!newline || (str_byte[i] != '\n'))) + if (fg->hasdot && pat_byte[i] == '.' && + (!escmap || !escmap[i]) && + (!fg->newline || (str_byte[i] != '\n'))) continue; /* Compare */ - if (icase ? (tolower(pat_byte[i]) == tolower(str_byte[i])) + if (fg->icase ? (tolower(pat_byte[i]) == tolower(str_byte[i])) : (pat_byte[i] == str_byte[i])) continue; }
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201109202154.p8KLsh89046251>