Date: Wed, 25 Jun 2008 12:05:26 GMT From: Gabor Kovesdan <gabor@FreeBSD.org> To: Perforce Change Reviews <perforce@freebsd.org> Subject: PERFORCE change 144086 for review Message-ID: <200806251205.m5PC5QVD083316@repoman.freebsd.org>
next in thread | raw e-mail | index | archive | help
http://perforce.freebsd.org/chv.cgi?CH=144086 Change 144086 by gabor@gabor_server on 2008/06/25 12:05:02 - Remove the fastgrep code. The regex library can auto-detect the fixed strings and operate with the same speed. If -F is specified, also set the REG_NOSPEC explicitly. - As the whole fastgrep code is removed, there is no need to fix the wide char handling there, grep should be localized now. Obtained from: NetBSD Project Affected files ... .. //depot/projects/soc2008/gabor_textproc/grep/grep.c#46 edit .. //depot/projects/soc2008/gabor_textproc/grep/grep.h#26 edit .. //depot/projects/soc2008/gabor_textproc/grep/util.c#40 edit Differences ... ==== //depot/projects/soc2008/gabor_textproc/grep/grep.c#46 (text+ko) ==== @@ -84,7 +84,6 @@ int patterns, pattern_sz; char **pattern; regex_t *r_pattern; -struct fastgrep *fg_pattern; /* For regex errors */ char re_error[RE_ERROR_BUF + 1]; @@ -616,24 +615,17 @@ ++argv; } - if (Eflag) + if (Fflag) + cflags |= REG_NOSPEC; + else if (Eflag) cflags |= REG_EXTENDED; - fg_pattern = grep_calloc(patterns, sizeof(*fg_pattern)); r_pattern = grep_calloc(patterns, sizeof(*r_pattern)); for (i = 0; i < patterns; ++i) { - /* Check if cheating is allowed (always is for fgrep). */ - if (Fflag) { - fgrepcomp(&fg_pattern[i], pattern[i]); - } else { - if (fastcomp(&fg_pattern[i], pattern[i])) { - /* Fall back to full regex library */ - c = regcomp(&r_pattern[i], pattern[i], cflags); - if (c != 0) { - regerror(c, &r_pattern[i], re_error, - RE_ERROR_BUF); - errx(2, "%s", re_error); - } - } + c = regcomp(&r_pattern[i], pattern[i], cflags); + if (c != 0) { + regerror(c, &r_pattern[i], re_error, + RE_ERROR_BUF); + errx(2, "%s", re_error); } } ==== //depot/projects/soc2008/gabor_textproc/grep/grep.h#26 (text+ko) ==== @@ -98,7 +98,6 @@ extern int first, prev, matchall, patterns, tail, notfound; extern char **pattern; -extern struct fastgrep *fg_pattern; extern regex_t *r_pattern; /* For regex errors */ @@ -112,8 +111,6 @@ void *grep_calloc(size_t nmemb, size_t size); void *grep_realloc(void *ptr, size_t size); void printline(struct str *line, int sep); -int fastcomp(struct fastgrep *, const char *); -void fgrepcomp(struct fastgrep *, const char *); /* queue.c */ void initqueue(void); ==== //depot/projects/soc2008/gabor_textproc/grep/util.c#40 (text+ko) ==== @@ -56,9 +56,6 @@ static int linesqueued; static int procline(struct str *l, int); -static int grep_search(struct fastgrep *, unsigned char *, size_t, regmatch_t *pmatch); -static int grep_cmp(const unsigned char *, const unsigned char *, size_t); -static void grep_revstr(unsigned char *, int); int grep_tree(char **argv) @@ -204,13 +201,6 @@ return (c); } - -/* - * Process an individual line in a file. Return non-zero if it matches. - */ - -#define isword(x) (isalnum(x) || (x) == '_') - static int procline(struct str *l, int nottext) { @@ -219,14 +209,9 @@ if (!matchall) { for (c = i = 0; i < patterns; i++) { - if (fg_pattern[i].pattern) { - r = grep_search(&fg_pattern[i], (unsigned char *)l->dat, - l->len, &pmatch); - } else { - pmatch.rm_so = 0; - pmatch.rm_eo = l->len; - r = regexec(&r_pattern[i], l->dat, 1, &pmatch, eflags); - } + pmatch.rm_so = 0; + pmatch.rm_eo = l->len; + r = regexec(&r_pattern[i], l->dat, 1, &pmatch, eflags); if (r == 0 && xflag) { if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len) r = REG_NOMATCH; @@ -306,283 +291,6 @@ return (c); } -void -fgrepcomp(struct fastgrep *fg, const char *pattern) -{ - int i; - - /* Initialize. */ - fg->patternLen = strlen(pattern); - fg->bol = 0; - fg->eol = 0; - fg->wmatch = wflag; - fg->reversedSearch = 0; - - /* - * Make a copy and upper case it for later if in -i mode, - * else just copy the pointer. - */ - if (iflag) { - fg->pattern = grep_malloc(fg->patternLen + 1); - for (i = 0; i < fg->patternLen; i++) - fg->pattern[i] = toupper(pattern[i]); - fg->pattern[fg->patternLen] = '\0'; - } else - fg->pattern = (unsigned char *)pattern; /* really const */ - - /* Preprocess pattern. */ - for (i = 0; i <= UCHAR_MAX; i++) - fg->qsBc[i] = fg->patternLen; - for (i = 1; i < fg->patternLen; i++) { - fg->qsBc[fg->pattern[i]] = fg->patternLen - i; - /* - * If case is ignored, make the jump apply to both upper and - * lower cased characters. As the pattern is stored in upper - * case, apply the same to the lower case equivalents. - */ - if (iflag) - fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i; - } -} - -/* - * Returns: -1 on failure, 0 on success - */ -int -fastcomp(struct fastgrep *fg, const char *pattern) -{ - int i; - int bol = 0; - int eol = 0; - int shiftPatternLen; - int hasDot = 0; - int firstHalfDot = -1; - int firstLastHalfDot = -1; - int lastHalfDot = 0; - - /* Initialize. */ - fg->patternLen = strlen(pattern); - fg->bol = 0; - fg->eol = 0; - fg->wmatch = 0; - fg->reversedSearch = 0; - - /* Remove end-of-line character ('$'). */ - if (pattern[fg->patternLen - 1] == '$') { - eol++; - fg->eol = 1; - fg->patternLen--; - } - - /* Remove beginning-of-line character ('^'). */ - if (pattern[0] == '^') { - bol++; - fg->bol = 1; - fg->patternLen--; - } - - /* Remove enclosing [[:<:]] and [[:>:]] (word match). */ - if (wflag) { - /* basic re's use \( \), extended re's ( ) */ - int extra = Eflag ? 1 : 2; - fg->patternLen -= 14 + 2 * extra; - fg->wmatch = 7 + extra; - } else if (fg->patternLen >= 14 && - strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 && - strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) { - fg->patternLen -= 14; - fg->wmatch = 7; - } - - /* - * Copy pattern minus '^' and '$' characters as well as word - * match character classes at the beginning and ending of the - * string respectively. - */ - fg->pattern = grep_malloc(fg->patternLen + 1); - memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen); - fg->pattern[fg->patternLen] = '\0'; - - /* Look for ways to cheat...er...avoid the full regex engine. */ - for (i = 0; i < fg->patternLen; i++) - { - /* Can still cheat? */ - if ((isalnum(fg->pattern[i])) || isspace(fg->pattern[i]) || - (fg->pattern[i] == '_') || (fg->pattern[i] == ',') || - (fg->pattern[i] == '=') || (fg->pattern[i] == '-') || - (fg->pattern[i] == ':') || (fg->pattern[i] == '/')) { - /* As long as it is good, upper case it for later. */ - if (iflag) - fg->pattern[i] = toupper(fg->pattern[i]); - } else if (fg->pattern[i] == '.') { - hasDot = i; - if (i < fg->patternLen / 2) { - if (firstHalfDot < 0) - /* Closest dot to the beginning */ - firstHalfDot = i; - } else { - /* Closest dot to the end of the pattern. */ - lastHalfDot = i; - if (firstLastHalfDot < 0) - firstLastHalfDot = i; - } - } else { - /* Free memory and let others know this is empty. */ - free(fg->pattern); - fg->pattern = NULL; - return (-1); - } - } - - /* - * Determine if a reverse search would be faster based on the placement - * of the dots. - */ - if ((!(lflag || cflag)) && ((!(bol || eol)) && - ((lastHalfDot) && ((firstHalfDot < 0) || - ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) { - fg->reversedSearch = 1; - hasDot = fg->patternLen - (firstHalfDot < 0 ? - firstLastHalfDot : firstHalfDot) - 1; - grep_revstr(fg->pattern, fg->patternLen); - } - - /* - * Normal Quick Search would require a shift based on the position the - * next character after the comparison is within the pattern. With - * wildcards, the position of the last dot effects the maximum shift - * distance. - * The closer to the end the wild card is the slower the search. A - * reverse version of this algorithm would be useful for wildcards near - * the end of the string. - * - * Examples: - * Pattern Max shift - * ------- --------- - * this 5 - * .his 4 - * t.is 3 - * th.s 2 - * thi. 1 - */ - - /* Adjust the shift based on location of the last dot ('.'). */ - shiftPatternLen = fg->patternLen - hasDot; - - /* Preprocess pattern. */ - for (i = 0; i <= UCHAR_MAX; i++) - fg->qsBc[i] = shiftPatternLen; - for (i = hasDot + 1; i < fg->patternLen; i++) { - fg->qsBc[fg->pattern[i]] = fg->patternLen - i; - /* - * If case is ignored, make the jump apply to both upper and - * lower cased characters. As the pattern is stored in upper - * case, apply the same to the lower case equivalents. - */ - if (iflag) - fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i; - } - - /* - * Put pattern back to normal after pre-processing to allow for easy - * comparisons later. - */ - if (fg->reversedSearch) - grep_revstr(fg->pattern, fg->patternLen); - - return (0); -} - -/* - * Word boundaries using regular expressions are defined as the point - * of transition from a non-word char to a word char, or vice versa. - * This means that grep -w +a and grep -w a+ never match anything, - * because they lack a starting or ending transition, but grep -w a+b - * does match a line containing a+b. - */ -#define wmatch(d, l, s, e) \ - ((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \ - e > s && isword(d[s]) && isword(d[e-1])) - -static int -grep_search(struct fastgrep *fg, unsigned char *data, size_t dataLen, regmatch_t *pmatch) -{ - int j; - int rtrnVal = REG_NOMATCH; - - pmatch->rm_so = -1; - pmatch->rm_eo = -1; - - /* No point in going farther if we do not have enough data. */ - if (dataLen < fg->patternLen) - return (rtrnVal); - - /* Only try once at the beginning or ending of the line. */ - if (fg->bol || fg->eol) { - /* Simple text comparison. */ - /* Verify data is >= pattern length before searching on it. */ - if (dataLen >= fg->patternLen) { - /* Determine where in data to start search at. */ - if (fg->eol) - j = dataLen - fg->patternLen; - else - j = 0; - if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen))) - if (grep_cmp(fg->pattern, data + j, - fg->patternLen) == -1) { - pmatch->rm_so = j; - pmatch->rm_eo = j + fg->patternLen; - if (!fg->wmatch || wmatch(data, dataLen, - pmatch->rm_so, pmatch->rm_eo)) - rtrnVal = 0; - } - } - } else if (fg->reversedSearch) { - /* Quick Search algorithm. */ - j = dataLen; - do { - if (grep_cmp(fg->pattern, data + j - fg->patternLen, - fg->patternLen) == -1) { - pmatch->rm_so = j - fg->patternLen; - pmatch->rm_eo = j; - if (!fg->wmatch || wmatch(data, dataLen, - pmatch->rm_so, pmatch->rm_eo)) { - rtrnVal = 0; - break; - } - } - /* Shift if within bounds, otherwise, we are done. */ - if (j == fg->patternLen) - break; - j -= fg->qsBc[data[j - fg->patternLen - 1]]; - } while (j >= fg->patternLen); - } else { - /* Quick Search algorithm. */ - j = 0; - do { - if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) { - pmatch->rm_so = j; - pmatch->rm_eo = j + fg->patternLen; - if (fg->patternLen == 0 || !fg->wmatch || - wmatch(data, dataLen, pmatch->rm_so, - pmatch->rm_eo)) { - rtrnVal = 0; - break; - } - } - - /* Shift if within bounds, otherwise, we are done. */ - if (j + fg->patternLen == dataLen) - break; - else - j += fg->qsBc[data[j + fg->patternLen]]; - } while (j <= (dataLen - fg->patternLen)); - } - - return (rtrnVal); -} - - void * grep_malloc(size_t size) { @@ -612,38 +320,6 @@ return (ptr); } -/* - * Returns: i >= 0 on failure (position that it failed) - * -1 on success - */ -static int -grep_cmp(const unsigned char *pattern, const unsigned char *data, size_t len) -{ - int i; - - for (i = 0; i < len; i++) { - if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.')) - || (iflag && pattern[i] == toupper(data[i]))) - continue; - return (i); - } - - return (-1); -} - -static void -grep_revstr(unsigned char *str, int len) -{ - int i; - char c; - - for (i = 0; i < len / 2; i++) { - c = str[i]; - str[i] = str[len - i - 1]; - str[len - i - 1] = c; - } -} - void printline(struct str *line, int sep) {
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200806251205.m5PC5QVD083316>