Date: Fri, 2 Sep 2011 18:18:24 +0000 (UTC)
From: Gabor Kovesdan <gabor@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-user@freebsd.org
Subject: svn commit: r225347 - user/gabor/tre-integration/contrib/tre/lib
Message-ID: <201109021818.p82IIO73010722@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: gabor Date: Fri Sep 2 18:18:24 2011 New Revision: 225347 URL: http://svn.freebsd.org/changeset/base/225347 Log: - Merge some improvements and fixes from grep - Fix a cast [1] Submitted by: ache [1] Modified: user/gabor/tre-integration/contrib/tre/lib/fastmatch.c user/gabor/tre-integration/contrib/tre/lib/regcomp.c user/gabor/tre-integration/contrib/tre/lib/tre-fastmatch.c Modified: user/gabor/tre-integration/contrib/tre/lib/fastmatch.c ============================================================================== --- user/gabor/tre-integration/contrib/tre/lib/fastmatch.c Fri Sep 2 18:13:46 2011 (r225346) +++ user/gabor/tre-integration/contrib/tre/lib/fastmatch.c Fri Sep 2 18:18:24 2011 (r225347) @@ -1,3 +1,5 @@ +/* $FreeBSD$ */ + /*- * Copyright (C) 2011 Gabor Kovesdan <gabor@FreeBSD.org> * All rights reserved. @@ -27,7 +29,9 @@ #ifdef HAVE_CONFIG_H #include <config.h> #endif /* HAVE_CONFIG_H */ +#include <errno.h> #include <fastmatch.h> +#include <regex.h> #include <string.h> #include "tre-fastmatch.h" @@ -36,67 +40,72 @@ /* XXX: avoid duplication */ #define CONV_PAT \ - int ret; \ - tre_char_t *wregex; \ - size_t wlen; \ - \ - wregex = xmalloc(sizeof(tre_char_t) * (n + 1)); \ - if (wregex == NULL) \ - return REG_ESPACE; \ - \ - if (TRE_MB_CUR_MAX == 1) \ - { \ - unsigned int i; \ - const unsigned char *str = (const unsigned char *)regex; \ - tre_char_t *wstr = wregex; \ - \ - for (i = 0; i < n; i++) \ - *(wstr++) = *(str++); \ - wlen = n; \ - } \ - else \ - { \ - int consumed; \ - tre_char_t *wcptr = wregex; \ - mbstate_t state; \ - memset(&state, '\0', sizeof(state)); \ - while (n > 0) \ - { \ - consumed = tre_mbrtowc(wcptr, regex, n, &state); \ + { \ + wregex = xmalloc(sizeof(tre_char_t) * (n + 1)); \ + if (wregex == NULL) \ + return REG_ESPACE; \ \ - switch (consumed) \ - { \ - case 0: \ - if (*regex == '\0') \ - consumed = 1; \ - else \ - { \ - xfree(wregex); \ - return REG_BADPAT; \ - } \ - break; \ - case -1: \ - DPRINT(("mbrtowc: error %d: %s.\n", 
errno, \ - strerror(errno))); \ - xfree(wregex); \ - return REG_BADPAT; \ - case -2: \ - consumed = n; \ - break; \ - } \ - regex += consumed; \ - n -= consumed; \ - wcptr++; \ - } \ - wlen = wcptr - wregex; \ - } \ + if (TRE_MB_CUR_MAX == 1) \ + { \ + unsigned int i; \ + const unsigned char *str = (const unsigned char *)regex; \ + tre_char_t *wstr = wregex; \ + \ + for (i = 0; i < n; i++) \ + *(wstr++) = *(str++); \ + wlen = n; \ + } \ + else \ + { \ + int consumed; \ + tre_char_t *wcptr = wregex; \ + mbstate_t state; \ + memset(&state, '\0', sizeof(state)); \ + while (n > 0) \ + { \ + consumed = tre_mbrtowc(wcptr, regex, n, &state); \ + \ + switch (consumed) \ + { \ + case 0: \ + if (*regex == '\0') \ + consumed = 1; \ + else \ + { \ + xfree(wregex); \ + return REG_BADPAT; \ + } \ + break; \ + case -1: \ + DPRINT(("mbrtowc: error %d: %s.\n", errno, \ + strerror(errno))); \ + xfree(wregex); \ + return REG_BADPAT; \ + case -2: \ + consumed = n; \ + break; \ + } \ + regex += consumed; \ + n -= consumed; \ + wcptr++; \ + } \ + wlen = wcptr - wregex; \ + } \ \ - wregex[wlen] = L'\0'; + wregex[wlen] = L'\0'; \ + } int tre_fixncomp(fastmatch_t *preg, const char *regex, size_t n, int cflags) { - CONV_PAT; + int ret; + tre_char_t *wregex; + size_t wlen; + + if (n != 0) + CONV_PAT + else + return tre_compile_literal(preg, NULL, 0, cflags); ret = tre_compile_literal(preg, wregex, wlen, cflags); xfree(wregex); @@ -107,7 +116,14 @@ tre_fixncomp(fastmatch_t *preg, const ch int tre_fastncomp(fastmatch_t *preg, const char *regex, size_t n, int cflags) { - CONV_PAT; + int ret; + tre_char_t *wregex; + size_t wlen; + + if (n != 0) + CONV_PAT + else + return tre_compile_literal(preg, NULL, 0, cflags); ret = (cflags & REG_LITERAL) ? 
tre_compile_literal(preg, wregex, wlen, cflags) : @@ -121,34 +137,13 @@ tre_fastncomp(fastmatch_t *preg, const c int tre_fixcomp(fastmatch_t *preg, const char *regex, int cflags) { - size_t len; - - if (cflags & REG_PEND) - { - if (preg->re_endp >= regex) - len = preg->re_endp - regex - else - len = preg ? strlen(regex) : 0; - return tre_fixncomp(preg, regex, len, cflags); - } - else - return tre_fixncomp(preg, regex, regex ? strlen(regex) : 0, cflags); + return tre_fixncomp(preg, regex, regex ? strlen(regex) : 0, cflags); } int tre_fastcomp(fastmatch_t *preg, const char *regex, int cflags) { - size_t len; - - if (cflags & REG_PEND) - { - len = (preg->re_endp >= regex) - ? preg->re_endp - regex - : 0; - return tre_fastncomp(preg, regex, len ? strlen(regex) : 0, cflags); - } - else - return tre_fastncomp(preg, regex, regex ? strlen(regex) : 0, cflags); + return tre_fastncomp(preg, regex, regex ? strlen(regex) : 0, cflags); } int Modified: user/gabor/tre-integration/contrib/tre/lib/regcomp.c ============================================================================== --- user/gabor/tre-integration/contrib/tre/lib/regcomp.c Fri Sep 2 18:13:46 2011 (r225346) +++ user/gabor/tre-integration/contrib/tre/lib/regcomp.c Fri Sep 2 18:18:24 2011 (r225347) @@ -117,11 +117,10 @@ tre_regcomp(regex_t *preg, const char *r if (cflags & REG_PEND) { if (preg->re_endp >= regex) - len = preg->re_endp - regex + len = preg->re_endp - regex; else len = regex ? strlen(regex) : 0; - ) - return tre_regncomp(preg, regex, len, cflags); + return tre_regncomp(preg, regex, len, cflags); } else return tre_regncomp(preg, regex, regex ? 
strlen(regex) : 0, cflags); Modified: user/gabor/tre-integration/contrib/tre/lib/tre-fastmatch.c ============================================================================== --- user/gabor/tre-integration/contrib/tre/lib/tre-fastmatch.c Fri Sep 2 18:13:46 2011 (r225346) +++ user/gabor/tre-integration/contrib/tre/lib/tre-fastmatch.c Fri Sep 2 18:18:24 2011 (r225347) @@ -208,14 +208,14 @@ static int fastcmp(const void *, const v fg->qsBc[i] = fg->len - fg->hasdot; \ for (int i = fg->hasdot + 1; i < fg->len; i++) \ { \ - fg->qsBc[(unsigned)fg->pattern[i]] = fg->len - i; \ + fg->qsBc[(unsigned char)fg->pattern[i]] = fg->len - i; \ DPRINT(("BC shift for char %c is %d\n", fg->pattern[i], \ fg->len - i)); \ if (fg->icase) \ { \ char c = islower(fg->pattern[i]) ? toupper(fg->pattern[i]) \ : tolower(fg->pattern[i]); \ - fg->qsBc[(unsigned)c] = fg->len - i; \ + fg->qsBc[(unsigned char)c] = fg->len - i; \ DPRINT(("BC shift for char %c is %d\n", c, fg->len - i)); \ } \ } @@ -370,13 +370,18 @@ static int fastcmp(const void *, const v * Copies the pattern pat having lenght n to p and stores * the size in l. */ -#define SAVE_PATTERN(p, l) \ - l = (n == 0) ? tre_strlen(pat) : n; \ - p = xmalloc((l + 1) * sizeof(tre_char_t)); \ - if (p == NULL) \ - return REG_ESPACE; \ - memcpy(p, pat, l * sizeof(tre_char_t)); \ - p[l] = TRE_CHAR('\0'); +#define SAVE_PATTERN(src, srclen, dst, dstlen) \ + dstlen = srclen; \ + if (dstlen == 0) \ + dst = TRE_CHAR(""); \ + else \ + { \ + dst = xmalloc((dstlen + 1) * sizeof(tre_char_t)); \ + if (dst == NULL) \ + return REG_ESPACE; \ + memcpy(dst, src, dstlen * sizeof(tre_char_t)); \ + dst[dstlen] = TRE_CHAR('\0'); \ + } /* * Initializes pattern compiling. 
@@ -392,12 +397,18 @@ static int fastcmp(const void *, const v if (n == 0) \ { \ fg->matchall = true; \ + fg->pattern = ""; \ + fg->wpattern = TRE_CHAR(""); \ + DPRINT(("Matching every input\n")); \ return REG_OK; \ - } + } \ \ /* Cannot handle REG_ICASE with MB string */ \ if (fg->icase && (TRE_MB_CUR_MAX > 1)) \ - return REG_BADPAT; \ + { \ + DPRINT(("Cannot use fast matcher for MBS with REG_ICASE\n")); \ + return REG_BADPAT; \ + } /* * Returns: REG_OK on success, error code otherwise @@ -413,14 +424,14 @@ tre_compile_literal(fastmatch_t *fg, con return REG_BADPAT; #ifdef TRE_WCHAR - SAVE_PATTERN(fg->wpattern, fg->wlen); + SAVE_PATTERN(pat, n, fg->wpattern, fg->wlen); STORE_MBS_PAT; #else - SAVE_PATTERN(fg->pattern, fg->len); + SAVE_PATTERN(pat, n, fg->pattern, fg->len); #endif - DPRINT(("tre_compile_literal: pattern: %s, icase: %c, word: %c, " - "newline %c\n", fg->pattern, fg->icase ? 'y' : 'n', + DPRINT(("tre_compile_literal: pattern: %s, len %u, icase: %c, word: %c, " + "newline %c\n", fg->pattern, fg->len, fg->icase ? 'y' : 'n', fg->word ? 'y' : 'n', fg->newline ? 'y' : 'n')); FILL_QSBC; @@ -440,14 +451,11 @@ int tre_compile_fast(fastmatch_t *fg, const tre_char_t *pat, size_t n, int cflags) { - INIT_COMP; + tre_char_t *tmp; + size_t pos = 0; + bool escaped = false; - /* Remove end-of-line character ('$'). */ - if ((n > 0) && (pat[n - 1] == TRE_CHAR('$'))) - { - fg->eol = true; - n--; - } + INIT_COMP; /* Remove beginning-of-line character ('^'). */ if (pat[0] == TRE_CHAR('^')) @@ -472,36 +480,140 @@ tre_compile_fast(fastmatch_t *fg, const if (fg->word && (TRE_MB_CUR_MAX > 1)) return REG_BADPAT; - /* Look for ways to cheat...er...avoid the full regex engine. */ - for (unsigned int i = 0; i < n; i++) + tmp = xmalloc((n + 1) * sizeof(tre_char_t)); + if (tmp == NULL) + return REG_ESPACE; + +#define STORE_CHAR \ + do \ + { \ + tmp[pos++] = pat[i]; \ + escaped = false; \ + continue; \ + } while (0) + + /* + * Used for heuristic, only beginning ^, trailing $ and . 
are treated + * as special. + */ + if (cflags & _REG_HEUR) { - /* Can still cheat? */ - if (!(cflags & _REG_HEUR) && - ((tre_isalnum(pat[i])) || tre_isspace(pat[i]) || - (pat[i] == TRE_CHAR('_')) || (pat[i] == TRE_CHAR(',')) || - (pat[i] == TRE_CHAR('=')) || (pat[i] == TRE_CHAR('-')) || - (pat[i] == TRE_CHAR(':')) || (pat[i] == TRE_CHAR('/')))) + for (int i = 0; i < n; i++) + switch (pat[i]) + { + case TRE_CHAR('.'): + fg->hasdot = i; + STORE_CHAR; + break; + case TRE_CHAR('$'): + if (i == n - 1) + fg->eol = true; + else + STORE_CHAR; + break; + default: + STORE_CHAR; + } + } + else + for (int i = 0; i < n; i++) + { + switch (pat[i]) + { + case TRE_CHAR('\\'): + if (escaped) + STORE_CHAR; + else + escaped = true; + break; + case TRE_CHAR('['): + if (escaped) + STORE_CHAR; + else + goto badpat; + break; + case TRE_CHAR('*'): + if (escaped || (!(cflags & REG_EXTENDED) && (i == 0))) + STORE_CHAR; + else + goto badpat; + break; + case TRE_CHAR('+'): + case TRE_CHAR('?'): + if ((cflags & REG_EXTENDED) && (i == 0)) + continue; + else if ((cflags & REG_EXTENDED) ^ !escaped) + STORE_CHAR; + else + goto badpat; + case TRE_CHAR('.'): + if (escaped) + goto badpat; + else + { + fg->hasdot = true; + STORE_CHAR; + } + break; + case TRE_CHAR('^'): + STORE_CHAR; + break; + case TRE_CHAR('$'): + if (!escaped && (i == n - 1)) + fg->eol = true; + else + STORE_CHAR; + break; + case TRE_CHAR('('): + if ((cflags & REG_EXTENDED) ^ escaped) + goto badpat; + else + STORE_CHAR; + break; + case TRE_CHAR('{'): + if (escaped && (i == 0)) + STORE_CHAR; + else if (!(cflags & REG_EXTENDED) && (i == 0)) + STORE_CHAR; + else if ((cflags & REG_EXTENDED) && (i == 0)) + continue; + else + goto badpat; + break; + case TRE_CHAR('|'): + if ((cflags & REG_EXTENDED) ^ (!escaped)) + goto badpat; + else + STORE_CHAR; + break; + default: + if (escaped) + goto badpat; + else + STORE_CHAR; + } continue; - else if (pat[i] == TRE_CHAR('.')) - fg->hasdot = i; - else +badpat: + xfree(tmp); return REG_BADPAT; - } + 
} /* - * pat has been adjusted earlier to not include '^', '$' or - * the word match character classes at the beginning and ending - * of the string respectively. + * The pattern has been processed and copied to tmp as a literal string + * with escapes, anchors (^$) and the word boundary match character + * classes stripped out. */ #ifdef TRE_WCHAR - SAVE_PATTERN(fg->wpattern, fg->wlen); + SAVE_PATTERN(tmp, pos, fg->wpattern, fg->wlen); STORE_MBS_PAT; #else - SAVE_PATTERN(fg->pattern, fg->len); + SAVE_PATTERN(tmp, pos, fg->pattern, fg->len); #endif - DPRINT(("tre_compile_fast: pattern: %s, bol %c, eol %c, " - "icase: %c, word: %c, newline %c\n", fg->pattern, + xfree(tmp); + + DPRINT(("tre_compile_fast: pattern: %s, len %u, bol %c, eol %c, " + "icase: %c, word: %c, newline %c\n", fg->pattern, fg->len, fg->bol ? 'y' : 'n', fg->eol ? 'y' : 'n', fg->icase ? 'y' : 'n', fg->word ? 'y' : 'n', fg->newline ? 'y' : 'n')); @@ -593,7 +705,7 @@ tre_match_fast(const fastmatch_t *fg, co const tre_char_t *str_wide = data; /* Calculate length if unspecified. */ - if (len == (unsigned)-1) + if (len == (size_t)-1) switch (type) { case STR_WIDE:
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201109021818.p82IIO73010722>