Date: Thu, 18 Aug 2011 17:09:10 +0000 (UTC) From: Gabor Kovesdan <gabor@FreeBSD.org> To: src-committers@freebsd.org, svn-src-user@freebsd.org Subject: svn commit: r224980 - in user/gabor/grep/trunk: . regex Message-ID: <201108181709.p7IH9ATh066263@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: gabor Date: Thu Aug 18 17:09:10 2011 New Revision: 224980 URL: http://svn.freebsd.org/changeset/base/224980 Log: - Backport the fast matching algorithm from TRE that has been written during this summer as a part of GSoC 2011. This only improves the performance slightly but expected to be much better tested and much more robust. Added: user/gabor/grep/trunk/regex/ user/gabor/grep/trunk/regex/fastmatch.c (contents, props changed) user/gabor/grep/trunk/regex/fastmatch.h (contents, props changed) user/gabor/grep/trunk/regex/glue.h (contents, props changed) user/gabor/grep/trunk/regex/hashtable.c (contents, props changed) user/gabor/grep/trunk/regex/hashtable.h (contents, props changed) user/gabor/grep/trunk/regex/tre-fastmatch.c (contents, props changed) user/gabor/grep/trunk/regex/tre-fastmatch.h (contents, props changed) user/gabor/grep/trunk/regex/xmalloc.c (contents, props changed) user/gabor/grep/trunk/regex/xmalloc.h (contents, props changed) Deleted: user/gabor/grep/trunk/fastgrep.c Modified: user/gabor/grep/trunk/Makefile user/gabor/grep/trunk/grep.c user/gabor/grep/trunk/grep.h user/gabor/grep/trunk/util.c Modified: user/gabor/grep/trunk/Makefile ============================================================================== --- user/gabor/grep/trunk/Makefile Thu Aug 18 16:57:51 2011 (r224979) +++ user/gabor/grep/trunk/Makefile Thu Aug 18 17:09:10 2011 (r224980) @@ -9,7 +9,12 @@ PROG= grep .else PROG= bsdgrep .endif -SRCS= fastgrep.c file.c grep.c queue.c util.c +SRCS= file.c grep.c queue.c util.c + +# Extra files ported backported form some regex improvements +.PATH: ${.CURDIR}/regex +SRCS+= fastmatch.c hashtable.c tre-fastmatch.c xmalloc.c +CFLAGS+=-I${.CURDIR}/regex .if ${MK_BSD_GREP} == "yes" LINKS= ${BINDIR}/grep ${BINDIR}/egrep \ Modified: user/gabor/grep/trunk/grep.c ============================================================================== --- user/gabor/grep/trunk/grep.c Thu Aug 18 16:57:51 2011 (r224979) +++ user/gabor/grep/trunk/grep.c Thu Aug 18 17:09:10 2011 (r224980) @@ -48,6 +48,7 @@ __FBSDID("$FreeBSD$"); #include <string.h> #include <unistd.h> +#include "fastmatch.h" #include "grep.h" #ifndef WITHOUT_NLS @@ -83,7 +84,7 @@ bool matchall; unsigned int patterns, pattern_sz; char **pattern; regex_t *r_pattern; -fastgrep_t *fg_pattern; +fastmatch_t *fg_pattern; /* Filename exclusion/inclusion patterns */ unsigned int fpatterns, fpattern_sz; @@ -662,10 +663,10 @@ main(int argc, char *argv[]) /* Check if cheating is allowed (always is for fgrep). */ if (grepbehave == GREP_FIXED) { for (i = 0; i < patterns; ++i) - fgrepcomp(&fg_pattern[i], pattern[i]); + fixcomp(&fg_pattern[i], pattern[i], cflags); } else { for (i = 0; i < patterns; ++i) { - if (fastcomp(&fg_pattern[i], pattern[i])) { + if (fastcomp(&fg_pattern[i], pattern[i], cflags) != 0) { /* Fall back to full regex library */ c = regcomp(&r_pattern[i], pattern[i], cflags); if (c != 0) { Modified: user/gabor/grep/trunk/grep.h ============================================================================== --- user/gabor/grep/trunk/grep.h Thu Aug 18 16:57:51 2011 (r224979) +++ user/gabor/grep/trunk/grep.h Thu Aug 18 17:09:10 2011 (r224980) @@ -36,6 +36,8 @@ #include <stdio.h> #include <zlib.h> +#include "fastmatch.h" + #ifdef WITHOUT_NLS #define getstr(n) errstr[n] #else @@ -95,17 +97,6 @@ struct epat { int mode; }; -typedef struct { - size_t len; - unsigned char *pattern; - int qsBc[UCHAR_MAX + 1]; - /* flags */ - bool bol; - bool eol; - bool reversed; - bool word; -} fastgrep_t; - /* Flags passed to regcomp() and regexec() */ extern int cflags, eflags; @@ -125,7 +116,7 @@ extern unsigned int dpatterns, fpatterns extern char **pattern; extern struct epat *dpattern, *fpattern; extern regex_t *er_pattern, *r_pattern; -extern fastgrep_t *fg_pattern; +extern fastmatch_t *fg_pattern; /* For regex errors */ #define RE_ERROR_BUF 512 @@ -150,8 +141,3 @@ void clearqueue(void); void grep_close(struct file *f); struct file *grep_open(const char *path); char *grep_fgetln(struct file *f, size_t *len); - -/* fastgrep.c */ -int fastcomp(fastgrep_t *, const char *); -void fgrepcomp(fastgrep_t *, const char *); -int grep_search(fastgrep_t *, const unsigned char *, size_t, regmatch_t *); Added: user/gabor/grep/trunk/regex/fastmatch.c ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ user/gabor/grep/trunk/regex/fastmatch.c Thu Aug 18 17:09:10 2011 (r224980) @@ -0,0 +1,227 @@ +/* $FreeBSD$ */ + +/*- + * Copyright (C) 2011 Gabor Kovesdan <gabor@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "glue.h" + +#include <fastmatch.h> +#include <regex.h> +#include <string.h> + +#include "tre-fastmatch.h" +#include "xmalloc.h" + +/* XXX: avoid duplication */ +#define CONV_PAT \ + int ret; \ + tre_char_t *wregex; \ + size_t wlen; \ + \ + wregex = xmalloc(sizeof(tre_char_t) * (n + 1)); \ + if (wregex == NULL) \ + return REG_ESPACE; \ + \ + if (TRE_MB_CUR_MAX == 1) \ + { \ + unsigned int i; \ + const unsigned char *str = (const unsigned char *)regex; \ + tre_char_t *wstr = wregex; \ + \ + for (i = 0; i < n; i++) \ + *(wstr++) = *(str++); \ + wlen = n; \ + } \ + else \ + { \ + int consumed; \ + tre_char_t *wcptr = wregex; \ + mbstate_t state; \ + memset(&state, '\0', sizeof(state)); \ + while (n > 0) \ + { \ + consumed = tre_mbrtowc(wcptr, regex, n, &state); \ + \ + switch (consumed) \ + { \ + case 0: \ + if (*regex == '\0') \ + consumed = 1; \ + else \ + { \ + xfree(wregex); \ + return REG_BADPAT; \ + } \ + break; \ + case -1: \ + DPRINT(("mbrtowc: error %d: %s.\n", errno, \ + strerror(errno))); \ + xfree(wregex); \ + return REG_BADPAT; \ + case -2: \ + consumed = n; \ + break; \ + } \ + regex += consumed; \ + n -= consumed; \ + wcptr++; \ + } \ + wlen = wcptr - wregex; \ + } \ + \ + wregex[wlen] = L'\0'; + +int +tre_fixncomp(fastmatch_t *preg, const char *regex, size_t n, int cflags) +{ + CONV_PAT; + + ret = tre_compile_literal(preg, wregex, wlen, cflags); + xfree(wregex); + + return ret; +} + +int +tre_fastncomp(fastmatch_t *preg, const char *regex, size_t n, int cflags) +{ + CONV_PAT; + + ret = (cflags & REG_LITERAL) ? + tre_compile_literal(preg, wregex, wlen, cflags) : + tre_compile_fast(preg, wregex, wlen, cflags); + xfree(wregex); + + return ret; +} + + +int +tre_fixcomp(fastmatch_t *preg, const char *regex, int cflags) +{ + return tre_fixncomp(preg, regex, regex ? strlen(regex) : 0, cflags); +} + +int +tre_fastcomp(fastmatch_t *preg, const char *regex, int cflags) +{ + return tre_fastncomp(preg, regex, regex ? strlen(regex) : 0, cflags); +} + +int +tre_fixwncomp(fastmatch_t *preg, const wchar_t *regex, size_t n, int cflags) +{ + return tre_compile_literal(preg, regex, n, cflags); +} + +int +tre_fastwncomp(fastmatch_t *preg, const wchar_t *regex, size_t n, int cflags) +{ + return (cflags & REG_LITERAL) ? + tre_compile_literal(preg, regex, n, cflags) : + tre_compile_fast(preg, regex, n, cflags); +} + +int +tre_fixwcomp(fastmatch_t *preg, const wchar_t *regex, int cflags) +{ + return tre_fixwncomp(preg, regex, regex ? tre_strlen(regex) : 0, cflags); +} + +int +tre_fastwcomp(fastmatch_t *preg, const wchar_t *regex, int cflags) +{ + return tre_fastwncomp(preg, regex, regex ? tre_strlen(regex) : 0, cflags); +} + +void +tre_fastfree(fastmatch_t *preg) +{ + tre_free_fast(preg); +} + +/* XXX: avoid duplication */ +#define ADJUST_OFFSETS \ + { \ + size_t slen = (size_t)(pmatch[0].rm_eo - pmatch[0].rm_so); \ + size_t offset = pmatch[0].rm_so; \ + int ret; \ + \ + if ((len != (unsigned)-1) && (pmatch[0].rm_eo > len)) \ + return REG_NOMATCH; \ + if ((long long)pmatch[0].rm_eo - pmatch[0].rm_so < 0) \ + return REG_NOMATCH; \ + ret = tre_match_fast(preg, &string[offset], slen, type, nmatch, \ + pmatch, eflags); \ + for (unsigned i = 0; (i == 0) || (!(eflags & REG_NOSUB) && \ + (i < nmatch)); i++) \ + { \ + pmatch[i].rm_so += offset; \ + pmatch[i].rm_eo += offset; \ + } \ + return ret; \ + } + +int +tre_fastnexec(const fastmatch_t *preg, const char *string, size_t len, + size_t nmatch, regmatch_t pmatch[], int eflags) +{ + tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS; + + if (eflags & REG_STARTEND) + ADJUST_OFFSETS + else + return tre_match_fast(preg, string, len, type, nmatch, + pmatch, eflags); +} + +int +tre_fastexec(const fastmatch_t *preg, const char *string, size_t nmatch, + regmatch_t pmatch[], int eflags) +{ + return tre_fastnexec(preg, string, (size_t)-1, nmatch, pmatch, eflags); +} + +int +tre_fastwnexec(const fastmatch_t *preg, const wchar_t *string, size_t len, + size_t nmatch, regmatch_t pmatch[], int eflags) +{ + tre_str_type_t type = STR_WIDE; + + if (eflags & REG_STARTEND) + ADJUST_OFFSETS + else + return tre_match_fast(preg, string, len, type, nmatch, + pmatch, eflags); +} + +int +tre_fastwexec(const fastmatch_t *preg, const wchar_t *string, + size_t nmatch, regmatch_t pmatch[], int eflags) +{ + return tre_fastwnexec(preg, string, (size_t)-1, nmatch, pmatch, eflags); +} + Added: user/gabor/grep/trunk/regex/fastmatch.h ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ user/gabor/grep/trunk/regex/fastmatch.h Thu Aug 18 17:09:10 2011 (r224980) @@ -0,0 +1,103 @@ +/* $FreeBSD$ */ + +#ifndef FASTMATCH_H +#define FASTMATCH_H 1 + +#include <hashtable.h> +#include <limits.h> +#include <regex.h> +#include <stdbool.h> +#include <wchar.h> + +typedef struct { + size_t wlen; + size_t len; + wchar_t *wpattern; + int hasdot; + int qsBc[UCHAR_MAX + 1]; + int *bmGs; + char *pattern; + int defBc; + hashtable *qsBc_table; + int *sbmGs; + + /* flags */ + bool bol; + bool eol; + bool word; + bool icase; + bool newline; +} fastmatch_t; + +extern int +tre_fixcomp(fastmatch_t *preg, const char *regex, int cflags); + +extern int +tre_fastcomp(fastmatch_t *preg, const char *regex, int cflags); + +extern int +tre_fastexec(const fastmatch_t *preg, const char *string, size_t nmatch, + regmatch_t pmatch[], int eflags); + +extern void +tre_fastfree(fastmatch_t *preg); + +extern int +tre_fixwcomp(fastmatch_t *preg, const wchar_t *regex, int cflags); + +extern int +tre_fastwcomp(fastmatch_t *preg, const wchar_t *regex, int cflags); + +extern int +tre_fastwexec(const fastmatch_t *preg, const wchar_t *string, + size_t nmatch, regmatch_t pmatch[], int eflags); + +/* Versions with a maximum length argument and therefore the capability to + handle null characters in the middle of the strings. */ +extern int +tre_fixncomp(fastmatch_t *preg, const char *regex, size_t len, int cflags); + +extern int +tre_fastncomp(fastmatch_t *preg, const char *regex, size_t len, int cflags); + +extern int +tre_fastnexec(const fastmatch_t *preg, const char *string, size_t len, + size_t nmatch, regmatch_t pmatch[], int eflags); + +extern int +tre_fixwncomp(fastmatch_t *preg, const wchar_t *regex, size_t len, int cflags); + +extern int +tre_fastwncomp(fastmatch_t *preg, const wchar_t *regex, size_t len, int cflags); + +extern int +tre_fastwnexec(const fastmatch_t *preg, const wchar_t *string, size_t len, + size_t nmatch, regmatch_t pmatch[], int eflags); + +#define fixncomp tre_fixncomp +#define fastncomp tre_fastncomp +#define fixcomp tre_fixcomp +#define fastcomp tre_fastcomp +#define fixwncomp tre_fixwncomp +#define fastwncomp tre_fastwncomp +#define fixwcomp tre_fixwcomp +#define fastwcomp tre_fastwcomp +#define fastfree tre_fastfree +#define fastnexec tre_fastnexec +#define fastexec tre_fastexec +#define fastwnexec tre_fastwnexec +#define fastwexec tre_fastwexec +#define fixcomp tre_fixcomp +#define fastcomp tre_fastcomp +#define fastexec tre_fastexec +#define fastfree tre_fastfree +#define fixwcomp tre_fixwcomp +#define fastwcomp tre_fastwcomp +#define fastwexec tre_fastwexec +#define fixncomp tre_fixncomp +#define fastncomp tre_fastncomp +#define fastnexec tre_fastnexec +#define fixwncomp tre_fixwncomp +#define fastwncomp tre_fastwncomp +#define fastwnexec tre_fastwnexec +#endif /* FASTMATCH_H */ Added: user/gabor/grep/trunk/regex/glue.h ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ user/gabor/grep/trunk/regex/glue.h Thu Aug 18 17:09:10 2011 (r224980) @@ -0,0 +1,30 @@ +/* $FreeBSD$ */ + +#ifndef GLUE_H +#define GLUE_H + +#include <regex.h> + +#define TRE_WCHAR 1 +#define TRE_MULTIBYTE 1 + +#define tre_char_t wchar_t +#define tre_mbrtowc(pwc, s, n, ps) (mbrtowc((pwc), (s), (n), (ps))) +#define tre_strlen wcslen +#define tre_isspace iswspace +#define tre_isalnum iswalnum + +#define REG_LITERAL 0020 +#define REG_WORD 0100 +#define REG_GNU 0400 + +#define REG_OK 0 + +#define TRE_MB_CUR_MAX MB_CUR_MAX + +#define DPRINT(msg) +#define MIN(a,b) ((a > b) ? (b) : (a)) +#define MAX(a,b) ((a > b) ? (a) : (b)) + +typedef enum { STR_WIDE, STR_BYTE, STR_MBS, STR_USER } tre_str_type_t; +#endif Added: user/gabor/grep/trunk/regex/hashtable.c ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ user/gabor/grep/trunk/regex/hashtable.c Thu Aug 18 17:09:10 2011 (r224980) @@ -0,0 +1,159 @@ +/* $FreeBSD$ */ + +/*- + * Copyright (C) 2011 Gabor Kovesdan <gabor@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/hash.h> +#include <hashtable.h> +#include <stdlib.h> +#include <string.h> + +hashtable +*hashtable_init(size_t table_size, size_t key_size, size_t value_size) +{ + hashtable *tbl; + + tbl = malloc(sizeof(hashtable)); + if (tbl == NULL) + return (NULL); + + tbl->entries = calloc(sizeof(hashtable_entry *), table_size); + if (tbl->entries == NULL) { + free(tbl); + return (NULL); + } + + tbl->table_size = table_size; + tbl->usage = 0; + tbl->key_size = key_size; + tbl->value_size = value_size; + + return (tbl); +} + +int +hashtable_put(hashtable *tbl, const void *key, const void *value) +{ + uint32_t hash = 0; + + if (tbl->table_size == tbl->usage) + return (-1); + + hash = hash32_buf(key, tbl->key_size, hash); + hash %= tbl->table_size; + + while (tbl->entries[hash] != NULL) + hash = (hash >= tbl->table_size) ? 0 : hash + 1; + + tbl->entries[hash] = malloc(sizeof(hashtable_entry)); + if (tbl->entries[hash] == NULL) + return (-1); + + tbl->entries[hash]->key = malloc(tbl->key_size); + if (tbl->entries[hash]->key == NULL) { + free(tbl->entries[hash]); + return (-1); + } + + tbl->entries[hash]->value = malloc(tbl->value_size); + if (tbl->entries[hash]->value == NULL) { + free(tbl->entries[hash]->key); + free(tbl->entries[hash]); + return (-1); + } + + memcpy(&tbl->entries[hash]->key, key, tbl->key_size); + memcpy(&tbl->entries[hash]->value, value, tbl->value_size); + tbl->usage++; + + return (0); +} + +static hashtable_entry +*hashtable_lookup(const hashtable *tbl, const void *key) +{ + uint32_t hash = 0; + + hash = hash32_buf(key, tbl->key_size, hash); + hash %= tbl->table_size; + + for (;;) { + if (tbl->entries[hash] == NULL) + return (NULL); + else if (memcmp(key, &tbl->entries[hash]->key, + tbl->key_size) == 0) + return (tbl->entries[hash]); + + hash = (hash == tbl->table_size) ? 0 : hash + 1; + } +} + +int +hashtable_get(hashtable *tbl, const void *key, void *value) +{ + hashtable_entry *entry; + + entry = hashtable_lookup(tbl, key); + if (entry == NULL) + return (-1); + + memcpy(value, &entry->value, tbl->value_size); + return (0); +} + +int +hashtable_remove(hashtable *tbl, const void *key) +{ + hashtable_entry *entry; + + entry = hashtable_lookup(tbl, key); + if (entry == NULL) + return (-1); + +// free(entry->key); +// free(entry->value); + free(entry); + + tbl->usage--; + + return (0); +} + +void +hashtable_free(hashtable *tbl) +{ + + if (tbl == NULL) + return; + + for (unsigned int i = 0; i < tbl->table_size; i++) + if (tbl->entries[i] != NULL) { +// free(tbl->entries[i]->key); +// free(tbl->entries[i]->value); +// free(tbl->entries[i]); + } + free(tbl->entries); +} Added: user/gabor/grep/trunk/regex/hashtable.h ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ user/gabor/grep/trunk/regex/hashtable.h Thu Aug 18 17:09:10 2011 (r224980) @@ -0,0 +1,27 @@ +/* $FreeBSD$ */ + +#ifndef HASHTABLE_H +#define HASHTABLE_H 1 + +#include <sys/types.h> + +typedef struct { + void *key; + void *value; +} hashtable_entry; + +typedef struct { + size_t key_size; + size_t table_size; + size_t usage; + size_t value_size; + hashtable_entry **entries; +} hashtable; + +void hashtable_free(hashtable *); +int hashtable_get(hashtable *, const void *, void *); +hashtable *hashtable_init(size_t, size_t, size_t); +int hashtable_put(hashtable *, const void *, const void *); +int hashtable_remove(hashtable *, const void *); + +#endif /* HASHTABLE.H */ Added: user/gabor/grep/trunk/regex/tre-fastmatch.c ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ user/gabor/grep/trunk/regex/tre-fastmatch.c Thu Aug 18 17:09:10 2011 (r224980) @@ -0,0 +1,704 @@ +/* $FreeBSD$ */ + +/*- + * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav + * Copyright (C) 2008-2011 Gabor Kovesdan <gabor@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "glue.h" + +#include <ctype.h> +#include <hashtable.h> +#include <limits.h> +#include <regex.h> +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> +#ifdef TRE_WCHAR +#include <wchar.h> +#include <wctype.h> +#endif + +#include "hashtable.h" +#include "tre-fastmatch.h" +#include "xmalloc.h" + +static int fastcmp(const void *, const void *, size_t, + tre_str_type_t, bool, bool); + +/* + * We will work with wide characters if they are supported + */ +#ifdef TRE_WCHAR +#define TRE_CHAR(n) L##n +#else +#define TRE_CHAR(n) n +#endif + +/* + * Skips n characters in the input string and assigns the start + * address to startptr. Note: as per IEEE Std 1003.1-2008 + * matching is based on bit pattern not character representations + * so we can handle MB strings as byte sequences just like + * SB strings. + */ +#define SKIP_CHARS(n) \ + switch (type) \ + { \ + case STR_WIDE: \ + startptr = str_wide + n; \ + break; \ + default: \ + startptr = str_byte + n; \ + } + +/* + * Converts the wide string pattern to SB/MB string and stores + * it in fg->pattern. Sets fg->len to the byte length of the + * converted string. + */ +#define STORE_MBS_PAT \ + { \ + size_t siz; \ + \ + siz = wcstombs(NULL, fg->wpattern, 0); \ + if (siz == (size_t)-1) \ + return REG_BADPAT; \ + fg->len = siz; \ + fg->pattern = xmalloc(siz + 1); \ + if (fg->pattern == NULL) \ + return REG_ESPACE; \ + wcstombs(fg->pattern, fg->wpattern, siz); \ + fg->pattern[siz] = '\0'; \ + } \ + +/* + * Compares the pattern to the input string at the position + * stored in startptr. + */ +#define COMPARE \ + switch (type) \ + { \ + case STR_WIDE: \ + mismatch = fastcmp(fg->wpattern, startptr, fg->wlen, type, \ + fg->icase, fg->newline); \ + break; \ + default: \ + mismatch = fastcmp(fg->pattern, startptr, fg->len, type, \ + fg->icase, fg->newline); \ + } \ + +#define IS_OUT_OF_BOUNDS \ + ((type == STR_WIDE) ? ((j + fg->wlen) > len) : ((j + fg->len) > len)) + +/* + * Checks whether the new position after shifting in the input string + * is out of the bounds and break out from the loop if so. + */ +#define CHECKBOUNDS \ + if (IS_OUT_OF_BOUNDS) \ + break; \ + +/* + * Shifts in the input string after a mismatch. The position of the + * mismatch is stored in the mismatch variable. + */ +#define SHIFT \ + CHECKBOUNDS; \ + \ + { \ + int bc = 0, gs = 0, ts, r = -1; \ + \ + switch (type) \ + { \ + case STR_WIDE: \ + if (!fg->hasdot) \ + { \ + if (u != 0 && mismatch == fg->wlen - 1 - shift) \ + mismatch -= u; \ + v = fg->wlen - 1 - mismatch; \ + r = hashtable_get(fg->qsBc_table, \ + &((tre_char_t *)startptr)[mismatch + 1], &bc); \ + gs = fg->bmGs[mismatch]; \ + } \ + bc = (r == 0) ? bc : fg->defBc; \ + break; \ + default: \ + if (!fg->hasdot) \ + { \ + if (u != 0 && mismatch == fg->len - 1 - shift) \ + mismatch -= u; \ + v = fg->len - 1 - mismatch; \ + gs = fg->sbmGs[mismatch]; \ + } \ + bc = fg->qsBc[((unsigned char *)startptr)[mismatch + 1]]; \ + } \ + if (fg->hasdot) \ + shift = bc; \ + else \ + { \ + ts = u - v; \ + shift = MAX(ts, bc); \ + shift = MAX(shift, gs); \ + if (shift == gs) \ + u = MIN((type == STR_WIDE ? fg->wlen : fg->len) - shift, v); \ + else \ + { \ + if (ts < bc) \ + shift = MAX(shift, u + 1); \ + u = 0; \ + } \ + } \ + j += shift; \ + } + +/* + * Normal Quick Search would require a shift based on the position the + * next character after the comparison is within the pattern. With + * wildcards, the position of the last dot effects the maximum shift + * distance. + * The closer to the end the wild card is the slower the search. + * + * Examples: + * Pattern Max shift + * ------- --------- + * this 5 + * .his 4 + * t.is 3 + * th.s 2 + * thi. 1 + */ + +/* + * Fills in the bad character shift array for SB/MB strings. + */ +#define FILL_QSBC \ + for (unsigned int i = 0; i <= UCHAR_MAX; i++) \ + fg->qsBc[i] = fg->len - fg->hasdot; \ + for (int i = fg->hasdot + 1; i < fg->len; i++) \ + { \ + fg->qsBc[(unsigned)fg->pattern[i]] = fg->len - i; \ + if (fg->icase) \ + { \ + char c = islower(fg->pattern[i]) ? toupper(fg->pattern[i]) \ + : tolower(fg->pattern[i]); \ + fg->qsBc[(unsigned)c] = fg->len - i; \ + } \ + } + +/* + * Fills in the bad character shifts into a hastable for wide strings. + * With wide characters it is not possible any more to use a normal + * array because there are too many characters and we could not + * provide enough memory. Fortunately, we only have to store distinct + * values for so many characters as the number of distinct characters + * in the pattern, so we can store them in a hashtable and store a + * default shift value for the rest. + */ +#define FILL_QSBC_WIDE \ + /* Adjust the shift based on location of the last dot ('.'). */ \ + fg->defBc = fg->wlen - fg->hasdot; \ + \ + /* Preprocess pattern. */ \ + fg->qsBc_table = hashtable_init(fg->wlen * 4, sizeof(tre_char_t), \ + sizeof(int)); \ + for (unsigned int i = fg->hasdot + 1; i < fg->wlen; i++) \ + { \ + int k = fg->wlen - i; \ + hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k); \ + if (fg->icase) \ + { \ + tre_char_t wc = iswlower(fg->wpattern[i]) ? \ + towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]); \ + hashtable_put(fg->qsBc_table, &wc, &k); \ + } \ + } + +/* + * Fills in the good suffix table for SB/MB strings. + */ +#define FILL_BMGS \ + if (!fg->hasdot) \ + { \ + fg->sbmGs = xmalloc(fg->len * sizeof(int)); \ + if (!fg->sbmGs) \ + return REG_ESPACE; \ + _FILL_BMGS(fg->sbmGs, fg->pattern, fg->len, false); \ + } + +/* + * Fills in the good suffix table for wide strings. + */ +#define FILL_BMGS_WIDE \ + if (!fg->hasdot) \ + { \ + fg->bmGs = xmalloc(fg->wlen * sizeof(int)); \ + if (!fg->bmGs) \ + return REG_ESPACE; \ + _FILL_BMGS(fg->bmGs, fg->wpattern, fg->wlen, true); \ + } + +#define _FILL_BMGS(arr, pat, plen, wide) \ + { \ + char *p; \ + tre_char_t *wp; \ + \ + if (wide) \ + { \ + if (fg->icase) \ + { \ + wp = xmalloc(plen * sizeof(tre_char_t)); \ + if (wp == NULL) \ + return REG_ESPACE; \ + for (int i = 0; i < plen; i++) \ + wp[i] = towlower(pat[i]); \ + _CALC_BMGS(arr, wp, plen); \ + xfree(wp); \ + } \ + else \ + _CALC_BMGS(arr, pat, plen); \ + } \ + else \ + { \ + if (fg->icase) \ + { \ + p = xmalloc(plen); \ + if (p == NULL) \ + return REG_ESPACE; \ + for (int i = 0; i < plen; i++) \ + p[i] = tolower(pat[i]); \ + _CALC_BMGS(arr, p, plen); \ + xfree(p); \ + } \ + else \ + _CALC_BMGS(arr, pat, plen); \ + } \ + } + +#define _CALC_BMGS(arr, pat, plen) \ + { \ + int f, g; \ + \ + int *suff = xmalloc(plen * sizeof(int)); \ + if (suff == NULL) \ + return REG_ESPACE; \ + \ + suff[plen - 1] = plen; \ + g = plen - 1; \ + for (int i = plen - 2; i >= 0; i--) \ + { \ + if (i > g && suff[i + plen - 1 - f] < i - g) \ + suff[i] = suff[i + plen - 1 - f]; \ + else \ + { \ + if (i < g) \ + g = i; \ + f = i; \ + while (g >= 0 && pat[g] == pat[g + plen - 1 - f]) \ + g--; \ + suff[i] = f - g; \ + } \ + } \ + \ + for (int i = 0; i < plen; i++) \ + arr[i] = plen; \ + g = 0; \ + for (int i = plen - 1; i >= 0; i--) \ + if (suff[i] == i + 1) \ + for(; g < plen - 1 - i; g++) \ + if (arr[g] == plen) \ + arr[g] = plen - 1 - i; \ + for (int i = 0; i <= plen - 2; i++) \ + arr[plen - 1 - suff[i]] = plen - 1 - i; \ + \ + xfree(suff); \ + } + +/* + * Copies the pattern pat having lenght n to p and stores + * the size in l. *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201108181709.p7IH9ATh066263>