Date: Tue, 30 Aug 2011 16:40:18 +0000 (UTC) From: Gabor Kovesdan <gabor@FreeBSD.org> To: src-committers@freebsd.org, svn-src-user@freebsd.org Subject: svn commit: r225266 - user/gabor/grep/trunk/regex Message-ID: <201108301640.p7UGeIGk058943@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: gabor Date: Tue Aug 30 16:40:17 2011 New Revision: 225266 URL: http://svn.freebsd.org/changeset/base/225266 Log: - Merge from TRE code Modified: user/gabor/grep/trunk/regex/glue.h user/gabor/grep/trunk/regex/hashtable.c user/gabor/grep/trunk/regex/hashtable.h user/gabor/grep/trunk/regex/tre-fastmatch.c Modified: user/gabor/grep/trunk/regex/glue.h ============================================================================== --- user/gabor/grep/trunk/regex/glue.h Tue Aug 30 16:33:59 2011 (r225265) +++ user/gabor/grep/trunk/regex/glue.h Tue Aug 30 16:40:17 2011 (r225266) @@ -10,6 +10,8 @@ #define TRE_WCHAR 1 #define TRE_MULTIBYTE 1 +#define TRE_CHAR(n) L##n + #define tre_char_t wchar_t #define tre_mbrtowc(pwc, s, n, ps) (mbrtowc((pwc), (s), (n), (ps))) #define tre_strlen wcslen @@ -19,6 +21,7 @@ #define REG_LITERAL 0020 #define REG_WORD 0100 #define REG_GNU 0400 +#define _REG_HEUR 1000 #define REG_OK 0 Modified: user/gabor/grep/trunk/regex/hashtable.c ============================================================================== --- user/gabor/grep/trunk/regex/hashtable.c Tue Aug 30 16:33:59 2011 (r225265) +++ user/gabor/grep/trunk/regex/hashtable.c Tue Aug 30 16:40:17 2011 (r225266) @@ -1,5 +1,3 @@ -/* $FreeBSD$ */ - /*- * Copyright (C) 2011 Gabor Kovesdan <gabor@FreeBSD.org> * All rights reserved. @@ -26,134 +24,208 @@ * SUCH DAMAGE. */ -#include <sys/hash.h> -#include <hashtable.h> +#include <errno.h> #include <stdlib.h> #include <string.h> -hashtable -*hashtable_init(size_t table_size, size_t key_size, size_t value_size) +#include "hashtable.h" + + +/* + * Return a 32-bit hash of the given buffer. The init + * value should be 0, or the previous hash value to extend + * the previous hash. + */ +static uint32_t +hash32_buf(const void *buf, size_t len, uint32_t hash) { - hashtable *tbl; + const unsigned char *p = buf; - tbl = malloc(sizeof(hashtable)); - if (tbl == NULL) - return (NULL); - - tbl->entries = calloc(sizeof(hashtable_entry *), table_size); - if (tbl->entries == NULL) { - free(tbl); - return (NULL); - } - - tbl->table_size = table_size; - tbl->usage = 0; - tbl->key_size = key_size; - tbl->value_size = value_size; + while (len--) + hash = HASHSTEP(hash, *p++); - return (tbl); + return hash; } +/* + * Initializes a hash table that can hold table_size number of entries, + * each of which has a key of key_size bytes and a value of value_size + * bytes. On successful allocation returns a pointer to the hash table. + * Otherwise, returns NULL and sets errno to indicate the error. + */ +hashtable +*hashtable_init(size_t table_size, size_t key_size, size_t value_size) +{ + hashtable *tbl; + + tbl = malloc(sizeof(hashtable)); + if (tbl == NULL) + goto mem1; + + tbl->entries = calloc(sizeof(hashtable_entry *), table_size); + if (tbl->entries == NULL) + goto mem2; + + tbl->table_size = table_size; + tbl->usage = 0; + tbl->key_size = key_size; + tbl->value_size = value_size; + + return (tbl); + +mem2: + free(tbl); +mem1: + errno = ENOMEM; + return (NULL); +} + +/* + * Places the key-value pair to the hashtable tbl. + * Returns: + * HASH_OK: if the key was not present in the hash table yet + * but the kay-value pair has been successfully added. + * HASH_UPDATED: if the value for the key has been updated with the + * new value. + * HASH_FULL: if the hash table is full and the entry could not + * be added. + * HASH_FAIL: if an error has occurred and errno has been set to + * indicate the error. + */ int hashtable_put(hashtable *tbl, const void *key, const void *value) { - uint32_t hash = 0; - - if (tbl->table_size == tbl->usage) - return (-1); + uint32_t hash = 0; - hash = hash32_buf(key, tbl->key_size, hash); - hash %= tbl->table_size; + if (tbl->table_size == tbl->usage) + return (HASH_FULL); - while (tbl->entries[hash] != NULL) - hash = (hash >= tbl->table_size) ? 0 : hash + 1; - - tbl->entries[hash] = malloc(sizeof(hashtable_entry)); - if (tbl->entries[hash] == NULL) - return (-1); - - tbl->entries[hash]->key = malloc(tbl->key_size); - if (tbl->entries[hash]->key == NULL) { - free(tbl->entries[hash]); - return (-1); - } - - tbl->entries[hash]->value = malloc(tbl->value_size); - if (tbl->entries[hash]->value == NULL) { - free(tbl->entries[hash]->key); - free(tbl->entries[hash]); - return (-1); - } - - memcpy(&tbl->entries[hash]->key, key, tbl->key_size); - memcpy(&tbl->entries[hash]->value, value, tbl->value_size); - tbl->usage++; + hash = hash32_buf(key, tbl->key_size, hash) % tbl->table_size; - return (0); + /* + * On hash collision entries are inserted at the next free space, + * so we have to increase the index until we either find an entry + * with the same key (and update it) or we find a free space. + */ + for(;;) + if (tbl->entries[hash] == NULL) + break; + else if (memcmp(tbl->entries[hash]->key, key, tbl->key_size) == 0) + { + memcpy(tbl->entries[hash]->value, value, tbl->value_size); + return (HASH_UPDATED); + } + + tbl->entries[hash] = malloc(sizeof(hashtable_entry)); + if (tbl->entries[hash] == NULL) + { + errno = ENOMEM; + goto mem1; + } + + tbl->entries[hash]->key = malloc(tbl->key_size); + if (tbl->entries[hash]->key == NULL) + { + errno = ENOMEM; + goto mem2; + } + + tbl->entries[hash]->value = malloc(tbl->value_size); + if (tbl->entries[hash]->value == NULL) + { + errno = ENOMEM; + goto mem3; + } + + memcpy(tbl->entries[hash]->key, key, tbl->key_size); + memcpy(tbl->entries[hash]->value, value, tbl->value_size); + tbl->usage++; + + return (HASH_OK); + +mem3: + free(tbl->entries[hash]->key); +mem2: + free(tbl->entries[hash]); +mem1: + return (HASH_FAIL); } static hashtable_entry -*hashtable_lookup(const hashtable *tbl, const void *key) +**hashtable_lookup(const hashtable *tbl, const void *key) { - uint32_t hash = 0; + uint32_t hash = 0; - hash = hash32_buf(key, tbl->key_size, hash); - hash %= tbl->table_size; + hash = hash32_buf(key, tbl->key_size, hash) % tbl->table_size; - for (;;) { - if (tbl->entries[hash] == NULL) - return (NULL); - else if (memcmp(key, &tbl->entries[hash]->key, - tbl->key_size) == 0) - return (tbl->entries[hash]); + for (;;) + { + if (tbl->entries[hash] == NULL) + return (NULL); + else if (memcmp(key, tbl->entries[hash]->key, tbl->key_size) == 0) + return (&tbl->entries[hash]); - hash = (hash == tbl->table_size) ? 0 : hash + 1; - } + hash = (hash == tbl->table_size) ? 0 : hash + 1; + } } +/* + * Retrieves the value for key from the hash table tbl and places + * it to the space indicated by the value argument. + * Returns HASH_OK if the value has been found and retrieved or + * HASH_NOTFOUND otherwise. + */ int hashtable_get(hashtable *tbl, const void *key, void *value) { - hashtable_entry *entry; + hashtable_entry **entry; - entry = hashtable_lookup(tbl, key); - if (entry == NULL) - return (-1); + entry = hashtable_lookup(tbl, key); + if (entry == NULL) + return (HASH_NOTFOUND); - memcpy(value, &entry->value, tbl->value_size); - return (0); + memcpy(value, (*entry)->value, tbl->value_size); + return (HASH_OK); } +/* + * Removes the entry with the specifified key from the hash table + * tbl. Returns HASH_OK if the entry has been found and removed + * or HASH_NOTFOUND otherwise. + */ int hashtable_remove(hashtable *tbl, const void *key) { - hashtable_entry *entry; + hashtable_entry **entry; - entry = hashtable_lookup(tbl, key); - if (entry == NULL) - return (-1); + entry = hashtable_lookup(tbl, key); + if (entry == NULL) + return (HASH_NOTFOUND); -// free(entry->key); -// free(entry->value); - free(entry); + free((*entry)->key); + free((*entry)->value); + free(*entry); + *entry = NULL; - tbl->usage--; - - return (0); + tbl->usage--; + return (HASH_OK); } +/* + * Frees the resources associated with the hash table tbl. + */ void hashtable_free(hashtable *tbl) { + if (tbl == NULL) + return; - if (tbl == NULL) - return; + for (unsigned int i = 0; i < tbl->table_size; i++) + if ((tbl->entries[i] != NULL)) + { + free(tbl->entries[i]->key); + free(tbl->entries[i]->value); + } - for (unsigned int i = 0; i < tbl->table_size; i++) - if (tbl->entries[i] != NULL) { -// free(tbl->entries[i]->key); -// free(tbl->entries[i]->value); -// free(tbl->entries[i]); - } - free(tbl->entries); + free(tbl->entries); } Modified: user/gabor/grep/trunk/regex/hashtable.h ============================================================================== --- user/gabor/grep/trunk/regex/hashtable.h Tue Aug 30 16:33:59 2011 (r225265) +++ user/gabor/grep/trunk/regex/hashtable.h Tue Aug 30 16:40:17 2011 (r225266) @@ -5,23 +5,31 @@ #include <sys/types.h> +#define HASH_OK 0 +#define HASH_UPDATED 1 +#define HASH_FAIL 2 +#define HASH_FULL 3 +#define HASH_NOTFOUND 4 + +#define HASHSTEP(x,c) (((x << 5) + x) + (c)) + typedef struct { - void *key; - void *value; + void *key; + void *value; } hashtable_entry; typedef struct { - size_t key_size; - size_t table_size; - size_t usage; - size_t value_size; - hashtable_entry **entries; + size_t key_size; + size_t table_size; + size_t usage; + size_t value_size; + hashtable_entry **entries; } hashtable; -void hashtable_free(hashtable *); -int hashtable_get(hashtable *, const void *, void *); -hashtable *hashtable_init(size_t, size_t, size_t); -int hashtable_put(hashtable *, const void *, const void *); -int hashtable_remove(hashtable *, const void *); +void hashtable_free(hashtable *); +int hashtable_get(hashtable *, const void *, void *); +hashtable *hashtable_init(size_t, size_t, size_t); +int hashtable_put(hashtable *, const void *, const void *); +int hashtable_remove(hashtable *, const void *); #endif /* HASHTABLE.H */ Modified: user/gabor/grep/trunk/regex/tre-fastmatch.c ============================================================================== --- user/gabor/grep/trunk/regex/tre-fastmatch.c Tue Aug 30 16:33:59 2011 (r225265) +++ user/gabor/grep/trunk/regex/tre-fastmatch.c Tue Aug 30 16:40:17 2011 (r225266) @@ -1,5 +1,3 @@ -/* $FreeBSD$ */ - /*- * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav * Copyright (C) 2008-2011 Gabor Kovesdan <gabor@FreeBSD.org> @@ -30,7 +28,6 @@ #include "glue.h" #include <ctype.h> -#include <hashtable.h> #include <limits.h> #include <regex.h> #include <stdbool.h> @@ -48,14 +45,17 @@ static int fastcmp(const void *, const void *, size_t, tre_str_type_t, bool, bool); -/* - * We will work with wide characters if they are supported - */ -#ifdef TRE_WCHAR -#define TRE_CHAR(n) L##n -#else -#define TRE_CHAR(n) n -#endif +#define FAIL_COMP(errcode) \ + { \ + if (fg->pattern) \ + xfree(fg->pattern); \ + if (fg->wpattern) \ + xfree(fg->wpattern); \ + if (fg->qsBc_table) \ + hashtable_free(fg->qsBc_table); \ + fg = NULL; \ + return errcode; \ + } /* * Skips n characters in the input string and assigns the start @@ -143,7 +143,10 @@ static int fastcmp(const void *, const v &((tre_char_t *)startptr)[mismatch + 1], &bc); \ gs = fg->bmGs[mismatch]; \ } \ - bc = (r == 0) ? bc : fg->defBc; \ + bc = (r == HASH_OK) ? bc : fg->defBc; \ + DPRINT(("tre_fast_match: mismatch on character %lc," \ + "BC %d, GS %d\n", \ + ((tre_char_t *)startptr)[mismatch + 1], bc, gs)); \ break; \ default: \ if (!fg->hasdot) \ @@ -154,6 +157,9 @@ static int fastcmp(const void *, const v gs = fg->sbmGs[mismatch]; \ } \ bc = fg->qsBc[((unsigned char *)startptr)[mismatch + 1]]; \ + DPRINT(("tre_fast_match: mismatch on character %c," \ + "BC %d, GS %d\n", \ + ((unsigned char *)startptr)[mismatch + 1], bc, gs)); \ } \ if (fg->hasdot) \ shift = bc; \ @@ -171,6 +177,7 @@ static int fastcmp(const void *, const v u = 0; \ } \ } \ + DPRINT(("tre_fast_match: shifting %d characters\n", shift)); \ j += shift; \ } @@ -200,6 +207,8 @@ static int fastcmp(const void *, const v for (int i = fg->hasdot + 1; i < fg->len; i++) \ { \ fg->qsBc[(unsigned)fg->pattern[i]] = fg->len - i; \ + DPRINT(("BC shift for char %c is %d\n", fg->pattern[i], \ + fg->len - i)); \ if (fg->icase) \ { \ char c = islower(fg->pattern[i]) ? toupper(fg->pattern[i]) \ @@ -224,15 +233,25 @@ static int fastcmp(const void *, const v /* Preprocess pattern. */ \ fg->qsBc_table = hashtable_init(fg->wlen * 4, sizeof(tre_char_t), \ sizeof(int)); \ + if (!fg->qsBc_table) \ + FAIL_COMP(REG_ESPACE); \ for (unsigned int i = fg->hasdot + 1; i < fg->wlen; i++) \ { \ int k = fg->wlen - i; \ - hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k); \ + int r; \ + \ + r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k); \ + if ((r == HASH_FAIL) || (r == HASH_FULL)) \ + FAIL_COMP(REG_ESPACE); \ + DPRINT(("BC shift for wide char %lc is %d\n", fg->wpattern[i], \ + fg->wlen - i)); \ if (fg->icase) \ { \ tre_char_t wc = iswlower(fg->wpattern[i]) ? \ towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]); \ - hashtable_put(fg->qsBc_table, &wc, &k); \ + r = hashtable_put(fg->qsBc_table, &wc, &k); \ + if ((r == HASH_FAIL) || (r == HASH_FULL)) \ + FAIL_COMP(REG_ESPACE); \ } \ } @@ -245,7 +264,10 @@ static int fastcmp(const void *, const v fg->sbmGs = xmalloc(fg->len * sizeof(int)); \ if (!fg->sbmGs) \ return REG_ESPACE; \ - _FILL_BMGS(fg->sbmGs, fg->pattern, fg->len, false); \ + if (fg->len == 1) \ + fg->sbmGs[0] = 1; \ + else \ + _FILL_BMGS(fg->sbmGs, fg->pattern, fg->len, false); \ } /* @@ -257,7 +279,10 @@ static int fastcmp(const void *, const v fg->bmGs = xmalloc(fg->wlen * sizeof(int)); \ if (!fg->bmGs) \ return REG_ESPACE; \ - _FILL_BMGS(fg->bmGs, fg->wpattern, fg->wlen, true); \ + if (fg->wlen == 1) \ + fg->bmGs[0] = 1; \ + else \ + _FILL_BMGS(fg->bmGs, fg->wpattern, fg->wlen, true); \ } #define _FILL_BMGS(arr, pat, plen, wide) \ @@ -385,6 +410,10 @@ tre_compile_literal(fastmatch_t *fg, con SAVE_PATTERN(fg->pattern, fg->len); #endif + DPRINT(("tre_compile_literal: pattern: %s, icase: %c, word: %c, " + "newline %c\n", fg->pattern, fg->icase ? 'y' : 'n', + fg->word ? 'y' : 'n', fg->newline ? 'y' : 'n')); + FILL_QSBC; FILL_BMGS; #ifdef TRE_WCHAR @@ -438,10 +467,11 @@ tre_compile_fast(fastmatch_t *fg, const for (unsigned int i = 0; i < n; i++) { /* Can still cheat? */ - if ((tre_isalnum(pat[i])) || tre_isspace(pat[i]) || + if (!(cflags & _REG_HEUR) && + ((tre_isalnum(pat[i])) || tre_isspace(pat[i]) || (pat[i] == TRE_CHAR('_')) || (pat[i] == TRE_CHAR(',')) || (pat[i] == TRE_CHAR('=')) || (pat[i] == TRE_CHAR('-')) || - (pat[i] == TRE_CHAR(':')) || (pat[i] == TRE_CHAR('/'))) + (pat[i] == TRE_CHAR(':')) || (pat[i] == TRE_CHAR('/')))) continue; else if (pat[i] == TRE_CHAR('.')) fg->hasdot = i; @@ -461,6 +491,12 @@ tre_compile_fast(fastmatch_t *fg, const SAVE_PATTERN(fg->pattern, fg->len); #endif + DPRINT(("tre_compile_fast: pattern: %s, bol %c, eol %c, " + "icase: %c, word: %c, newline %c\n", fg->pattern, + fg->bol ? 'y' : 'n', fg->eol ? 'y' : 'n', + fg->icase ? 'y' : 'n', fg->word ? 'y' : 'n', + fg->newline ? 'y' : 'n')); + FILL_QSBC; FILL_BMGS; #ifdef TRE_WCHAR @@ -644,6 +680,9 @@ void tre_free_fast(fastmatch_t *fg) { + DPRINT(("tre_fast_free: freeing structures for pattern %s\n", + fg->pattern)); + #ifdef TRE_WCHAR hashtable_free(fg->qsBc_table); if (!fg->hasdot) @@ -697,6 +736,7 @@ fastcmp(const void *pat, const void *dat : (pat_byte[i] == str_byte[i])) continue; } + DPRINT(("fastcmp: mismatch at position %d\n", i)); ret = -(i + 1); break; }
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201108301640.p7UGeIGk058943>