From owner-svn-src-user@FreeBSD.ORG Fri Sep 2 01:55:02 2011 Return-Path: Delivered-To: svn-src-user@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id DD1041065672; Fri, 2 Sep 2011 01:55:02 +0000 (UTC) (envelope-from gabor@FreeBSD.org) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:4f8:fff6::2c]) by mx1.freebsd.org (Postfix) with ESMTP id C2B168FC18; Fri, 2 Sep 2011 01:55:02 +0000 (UTC) Received: from svn.freebsd.org (localhost [127.0.0.1]) by svn.freebsd.org (8.14.4/8.14.4) with ESMTP id p821t2SC076707; Fri, 2 Sep 2011 01:55:02 GMT (envelope-from gabor@svn.freebsd.org) Received: (from gabor@localhost) by svn.freebsd.org (8.14.4/8.14.4/Submit) id p821t2Gh076705; Fri, 2 Sep 2011 01:55:02 GMT (envelope-from gabor@svn.freebsd.org) Message-Id: <201109020155.p821t2Gh076705@svn.freebsd.org> From: Gabor Kovesdan Date: Fri, 2 Sep 2011 01:55:02 +0000 (UTC) To: src-committers@freebsd.org, svn-src-user@freebsd.org X-SVN-Group: user MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: Subject: svn commit: r225318 - user/gabor/grep/trunk/regex X-BeenThere: svn-src-user@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: "SVN commit messages for the experimental " user" src tree" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Fri, 02 Sep 2011 01:55:03 -0000 Author: gabor Date: Fri Sep 2 01:55:02 2011 New Revision: 225318 URL: http://svn.freebsd.org/changeset/base/225318 Log: - Add a finer-grained parser to allow a wider range of patterns to be matched with the fast algorithm Modified: user/gabor/grep/trunk/regex/glue.h user/gabor/grep/trunk/regex/tre-fastmatch.c Modified: user/gabor/grep/trunk/regex/glue.h ============================================================================== --- user/gabor/grep/trunk/regex/glue.h Thu Sep 1 21:42:54 2011 (r225317) +++ user/gabor/grep/trunk/regex/glue.h Fri Sep 2 01:55:02 2011 (r225318) @@ -22,7 +22,7 @@ #define REG_LITERAL 0020 #define REG_WORD 0100 #define REG_GNU 0400 -#define _REG_HEUR 1000 +#define _REG_HEUR 01000 #define REG_OK 0 Modified: user/gabor/grep/trunk/regex/tre-fastmatch.c ============================================================================== --- user/gabor/grep/trunk/regex/tre-fastmatch.c Thu Sep 1 21:42:54 2011 (r225317) +++ user/gabor/grep/trunk/regex/tre-fastmatch.c Fri Sep 2 01:55:02 2011 (r225318) @@ -368,13 +368,18 @@ static int fastcmp(const void *, const v * Copies the pattern pat having lenght n to p and stores * the size in l. */ -#define SAVE_PATTERN(p, l) \ - l = (n == 0) ? tre_strlen(pat) : n; \ - p = xmalloc((l + 1) * sizeof(tre_char_t)); \ - if (p == NULL) \ - return REG_ESPACE; \ - memcpy(p, pat, l * sizeof(tre_char_t)); \ - p[l] = TRE_CHAR('\0'); +#define SAVE_PATTERN(src, srclen, dst, dstlen) \ + dstlen = srclen; \ + if (dstlen == 0) \ + dst = TRE_CHAR(""); \ + else \ + { \ + dst = xmalloc((dstlen + 1) * sizeof(tre_char_t)); \ + if (dst == NULL) \ + return REG_ESPACE; \ + memcpy(dst, src, dstlen * sizeof(tre_char_t)); \ + dst[dstlen] = TRE_CHAR('\0'); \ + } /* * Initializes pattern compiling. @@ -413,10 +418,10 @@ tre_compile_literal(fastmatch_t *fg, con return REG_BADPAT; #ifdef TRE_WCHAR - SAVE_PATTERN(fg->wpattern, fg->wlen); + SAVE_PATTERN(pat, n, fg->wpattern, fg->wlen); STORE_MBS_PAT; #else - SAVE_PATTERN(fg->pattern, fg->len); + SAVE_PATTERN(pat, n, fg->pattern, fg->len); #endif DPRINT(("tre_compile_literal: pattern: %s, icase: %c, word: %c, " @@ -440,14 +445,11 @@ int tre_compile_fast(fastmatch_t *fg, const tre_char_t *pat, size_t n, int cflags) { - INIT_COMP; + tre_char_t *tmp; + size_t pos = 0; + bool escaped = false; - /* Remove end-of-line character ('$'). */ - if ((n > 0) && (pat[n - 1] == TRE_CHAR('$'))) - { - fg->eol = true; - n--; - } + INIT_COMP; /* Remove beginning-of-line character ('^'). */ if (pat[0] == TRE_CHAR('^')) @@ -472,34 +474,138 @@ tre_compile_fast(fastmatch_t *fg, const if (fg->word && (TRE_MB_CUR_MAX > 1)) return REG_BADPAT; - /* Look for ways to cheat...er...avoid the full regex engine. */ - for (unsigned int i = 0; i < n; i++) + tmp = xmalloc((n + 1) * sizeof(tre_char_t)); + if (tmp == NULL) + return REG_ESPACE; + +#define STORE_CHAR \ + do \ + { \ + tmp[pos++] = pat[i]; \ + escaped = false; \ + continue; \ + } while (0) + + /* + * Used for heuristic, only beginning ^, trailing $ and . are treated + * as special. + */ + if (cflags & _REG_HEUR) { - /* Can still cheat? */ - if (!(cflags & _REG_HEUR) && - ((tre_isalnum(pat[i])) || tre_isspace(pat[i]) || - (pat[i] == TRE_CHAR('_')) || (pat[i] == TRE_CHAR(',')) || - (pat[i] == TRE_CHAR('=')) || (pat[i] == TRE_CHAR('-')) || - (pat[i] == TRE_CHAR(':')) || (pat[i] == TRE_CHAR('/')))) + for (int i = 0; i < n; i++) + switch (pat[i]) + { + case TRE_CHAR('.'): + fg->hasdot = true; + STORE_CHAR; + break; + case TRE_CHAR('$'): + if (i == n - 1) + fg->eol = true; + else + STORE_CHAR; + break; + default: + STORE_CHAR; + } + } + else + for (int i = 0; i < n; i++) + { + switch (pat[i]) + { + case TRE_CHAR('\\'): + if (escaped) + STORE_CHAR; + else + escaped = true; + break; + case TRE_CHAR('['): + if (escaped) + STORE_CHAR; + else + goto badpat; + break; + case TRE_CHAR('*'): + if (escaped || (!(cflags & REG_EXTENDED) && (i == 0))) + STORE_CHAR; + else + goto badpat; + break; + case TRE_CHAR('+'): + case TRE_CHAR('?'): + if ((cflags & REG_EXTENDED) && (i == 0)) + continue; + else if ((cflags & REG_EXTENDED) ^ !escaped) + STORE_CHAR; + else + goto badpat; + case TRE_CHAR('.'): + if (escaped) + goto badpat; + else + { + fg->hasdot = true; + STORE_CHAR; + } + break; + case TRE_CHAR('^'): + STORE_CHAR; + break; + case TRE_CHAR('$'): + if (!escaped && (i == n - 1)) + fg->eol = true; + else + STORE_CHAR; + break; + case TRE_CHAR('('): + if ((cflags & REG_EXTENDED) ^ escaped) + goto badpat; + else + STORE_CHAR; + break; + case TRE_CHAR('{'): + if (escaped && (i == 0)) + STORE_CHAR; + else if (!(cflags & REG_EXTENDED) && (i == 0)) + STORE_CHAR; + else if ((cflags & REG_EXTENDED) && (i == 0)) + continue; + else + goto badpat; + break; + case TRE_CHAR('|'): + if ((cflags & REG_EXTENDED) ^ (!escaped)) + goto badpat; + else + STORE_CHAR; + break; + default: + if (escaped) + goto badpat; + else + STORE_CHAR; + } continue; - else if (pat[i] == TRE_CHAR('.')) - fg->hasdot = i; - else +badpat: + xfree(tmp); return REG_BADPAT; - } + } /* - * pat has been adjusted earlier to not include '^', '$' or - * the word match character classes at the beginning and ending - * of the string respectively. + * The pattern has been processed and copied to tmp as a literal string + * with escapes, anchors (^$) and the word boundary match character + * classes stripped out. */ #ifdef TRE_WCHAR - SAVE_PATTERN(fg->wpattern, fg->wlen); + SAVE_PATTERN(tmp, pos, fg->wpattern, fg->wlen); STORE_MBS_PAT; #else - SAVE_PATTERN(fg->pattern, fg->len); + SAVE_PATTERN(tmp, pos, fg->pattern, fg->len); #endif + xfree(tmp); + DPRINT(("tre_compile_fast: pattern: %s, bol %c, eol %c, " "icase: %c, word: %c, newline %c\n", fg->pattern, fg->bol ? 'y' : 'n', fg->eol ? 'y' : 'n',