Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 30 Aug 2011 16:40:18 +0000 (UTC)
From:      Gabor Kovesdan <gabor@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-user@freebsd.org
Subject:   svn commit: r225266 - user/gabor/grep/trunk/regex
Message-ID:  <201108301640.p7UGeIGk058943@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: gabor
Date: Tue Aug 30 16:40:17 2011
New Revision: 225266
URL: http://svn.freebsd.org/changeset/base/225266

Log:
  - Merge from TRE code

Modified:
  user/gabor/grep/trunk/regex/glue.h
  user/gabor/grep/trunk/regex/hashtable.c
  user/gabor/grep/trunk/regex/hashtable.h
  user/gabor/grep/trunk/regex/tre-fastmatch.c

Modified: user/gabor/grep/trunk/regex/glue.h
==============================================================================
--- user/gabor/grep/trunk/regex/glue.h	Tue Aug 30 16:33:59 2011	(r225265)
+++ user/gabor/grep/trunk/regex/glue.h	Tue Aug 30 16:40:17 2011	(r225266)
@@ -10,6 +10,8 @@
 #define TRE_WCHAR			1
 #define TRE_MULTIBYTE			1
 
+#define TRE_CHAR(n) L##n
+
 #define tre_char_t			wchar_t
 #define tre_mbrtowc(pwc, s, n, ps)	(mbrtowc((pwc), (s), (n), (ps)))
 #define tre_strlen			wcslen
@@ -19,6 +21,7 @@
 #define REG_LITERAL			0020
 #define REG_WORD			0100
 #define REG_GNU				0400
+#define _REG_HEUR			1000
 
 #define REG_OK				0
 

Modified: user/gabor/grep/trunk/regex/hashtable.c
==============================================================================
--- user/gabor/grep/trunk/regex/hashtable.c	Tue Aug 30 16:33:59 2011	(r225265)
+++ user/gabor/grep/trunk/regex/hashtable.c	Tue Aug 30 16:40:17 2011	(r225266)
@@ -1,5 +1,3 @@
-/* $FreeBSD$ */
-
 /*-
  * Copyright (C) 2011 Gabor Kovesdan <gabor@FreeBSD.org>
  * All rights reserved.
@@ -26,134 +24,208 @@
  * SUCH DAMAGE.
  */
 
-#include <sys/hash.h>
-#include <hashtable.h>
+#include <errno.h>
 #include <stdlib.h>
 #include <string.h>
 
-hashtable
-*hashtable_init(size_t table_size, size_t key_size, size_t value_size)
+#include "hashtable.h"
+
+
+/*
+ * Return a 32-bit hash of the given buffer.  The init
+ * value should be 0, or the previous hash value to extend
+ * the previous hash.
+ */
+static uint32_t
+hash32_buf(const void *buf, size_t len, uint32_t hash)
 {
-	hashtable *tbl;
+  const unsigned char *p = buf;
 
-	tbl = malloc(sizeof(hashtable));
-	if (tbl == NULL)
-		return (NULL);
-
-	tbl->entries = calloc(sizeof(hashtable_entry *), table_size);
-	if (tbl->entries == NULL) {
-		free(tbl);
-		return (NULL);
-	}
-
-	tbl->table_size = table_size;
-	tbl->usage = 0;
-	tbl->key_size = key_size;
-	tbl->value_size = value_size;
+  while (len--)
+    hash = HASHSTEP(hash, *p++);
 
-	return (tbl);
+  return hash;
 }
 
+/*
+ * Initializes a hash table that can hold table_size number of entries,
+ * each of which has a key of key_size bytes and a value of value_size
+ * bytes. On successful allocation returns a pointer to the hash table.
+ * Otherwise, returns NULL and sets errno to indicate the error.
+ */
+hashtable
+*hashtable_init(size_t table_size, size_t key_size, size_t value_size)
+{
+  hashtable *tbl;
+
+  tbl = malloc(sizeof(hashtable));
+  if (tbl == NULL)
+    goto mem1;
+
+  tbl->entries = calloc(sizeof(hashtable_entry *), table_size);
+  if (tbl->entries == NULL)
+    goto mem2;
+
+  tbl->table_size = table_size;
+  tbl->usage = 0;
+  tbl->key_size = key_size;
+  tbl->value_size = value_size;
+
+  return (tbl);
+
+mem2:
+  free(tbl);
+mem1:
+  errno = ENOMEM;
+  return (NULL);
+}
+
+/*
+ * Places the key-value pair to the hashtable tbl.
+ * Returns:
+ *   HASH_OK:		if the key was not present in the hash table yet
+ *			but the kay-value pair has been successfully added.
+ *   HASH_UPDATED:	if the value for the key has been updated with the
+ *			new value.
+ *   HASH_FULL:		if the hash table is full and the entry could not
+ *			be added.
+ *   HASH_FAIL:		if an error has occurred and errno has been set to
+ *			indicate the error.
+ */
 int
 hashtable_put(hashtable *tbl, const void *key, const void *value)
 {
-	uint32_t hash = 0;
-
-	if (tbl->table_size == tbl->usage)
-		return (-1);
+  uint32_t hash = 0;
 
-	hash = hash32_buf(key, tbl->key_size, hash);
-	hash %= tbl->table_size;
+  if (tbl->table_size == tbl->usage)
+    return (HASH_FULL);
 
-	while (tbl->entries[hash] != NULL)
-		hash = (hash >= tbl->table_size) ? 0 : hash + 1;
-
-	tbl->entries[hash] = malloc(sizeof(hashtable_entry));
-	if (tbl->entries[hash] == NULL)
-		return (-1);
-
-	tbl->entries[hash]->key = malloc(tbl->key_size);
-	if (tbl->entries[hash]->key == NULL) {
-		free(tbl->entries[hash]);
-		return (-1);
-	}
-
-	tbl->entries[hash]->value = malloc(tbl->value_size);
-	if (tbl->entries[hash]->value == NULL) {
-		free(tbl->entries[hash]->key);
-		free(tbl->entries[hash]);
-		return (-1);
-	}
-
-	memcpy(&tbl->entries[hash]->key, key, tbl->key_size);
-	memcpy(&tbl->entries[hash]->value, value, tbl->value_size);
-	tbl->usage++;
+  hash = hash32_buf(key, tbl->key_size, hash) % tbl->table_size;
 
-	return (0);
+  /*
+   * On hash collision entries are inserted at the next free space,
+   * so we have to increase the index until we either find an entry
+   * with the same key (and update it) or we find a free space.
+   */
+  for(;;)
+    if (tbl->entries[hash] == NULL)
+      break;
+    else if (memcmp(tbl->entries[hash]->key, key, tbl->key_size) == 0)
+      {
+	memcpy(tbl->entries[hash]->value, value, tbl->value_size);
+	return (HASH_UPDATED);
+      }
+
+  tbl->entries[hash] = malloc(sizeof(hashtable_entry));
+  if (tbl->entries[hash] == NULL)
+    {
+      errno = ENOMEM;
+      goto mem1;
+    }
+
+  tbl->entries[hash]->key = malloc(tbl->key_size);
+  if (tbl->entries[hash]->key == NULL)
+    {
+      errno = ENOMEM;
+      goto mem2;
+    }
+
+  tbl->entries[hash]->value = malloc(tbl->value_size);
+  if (tbl->entries[hash]->value == NULL)
+    {
+      errno = ENOMEM;
+      goto mem3;
+    }
+
+  memcpy(tbl->entries[hash]->key, key, tbl->key_size);
+  memcpy(tbl->entries[hash]->value, value, tbl->value_size);
+  tbl->usage++;
+
+  return (HASH_OK);
+
+mem3:
+  free(tbl->entries[hash]->key);
+mem2:
+  free(tbl->entries[hash]);
+mem1:
+  return (HASH_FAIL);
 }
 
 static hashtable_entry
-*hashtable_lookup(const hashtable *tbl, const void *key)
+**hashtable_lookup(const hashtable *tbl, const void *key)
 {
-	uint32_t hash = 0;
+  uint32_t hash = 0;
 
-	hash = hash32_buf(key, tbl->key_size, hash);
-	hash %= tbl->table_size;
+  hash = hash32_buf(key, tbl->key_size, hash) % tbl->table_size;
 
-	for (;;) {
-		if (tbl->entries[hash] == NULL)
-			return (NULL);
-		else if (memcmp(key, &tbl->entries[hash]->key,
-		    tbl->key_size) == 0)
-			return (tbl->entries[hash]);
+  for (;;)
+    {
+      if (tbl->entries[hash] == NULL)
+	return (NULL);
+      else if (memcmp(key, tbl->entries[hash]->key, tbl->key_size) == 0)
+	return (&tbl->entries[hash]);
 
-		hash = (hash == tbl->table_size) ? 0 : hash + 1;
-  	}
+      hash = (hash == tbl->table_size) ? 0 : hash + 1;
+    }
 }
 
+/*
+ * Retrieves the value for key from the hash table tbl and places
+ * it to the space indicated by the value argument.
+ * Returns HASH_OK if the value has been found and retrieved or
+ * HASH_NOTFOUND otherwise.
+ */
 int
 hashtable_get(hashtable *tbl, const void *key, void *value)
 {
-	hashtable_entry *entry;
+  hashtable_entry **entry;
 
-	entry = hashtable_lookup(tbl, key);
-	if (entry == NULL)
-		return (-1);
+  entry = hashtable_lookup(tbl, key);
+  if (entry == NULL)
+    return (HASH_NOTFOUND);
 
-	memcpy(value, &entry->value, tbl->value_size);
-	return (0);
+  memcpy(value, (*entry)->value, tbl->value_size);
+  return (HASH_OK);
 }
 
+/*
+ * Removes the entry with the specifified key from the hash table
+ * tbl. Returns HASH_OK if the entry has been found and removed
+ * or HASH_NOTFOUND otherwise.
+ */
 int
 hashtable_remove(hashtable *tbl, const void *key)
 {
-	hashtable_entry *entry;
+  hashtable_entry **entry;
 
-	entry = hashtable_lookup(tbl, key);
-	if (entry == NULL)
-		return (-1);
+  entry = hashtable_lookup(tbl, key);
+  if (entry == NULL)
+    return (HASH_NOTFOUND);
 
-//	free(entry->key);
-//	free(entry->value);
-	free(entry);
+  free((*entry)->key);
+  free((*entry)->value);
+  free(*entry);
+  *entry = NULL;
 
-	tbl->usage--;
-
-	return (0);
+  tbl->usage--;
+  return (HASH_OK);
 }
 
+/*
+ * Frees the resources associated with the hash table tbl.
+ */
 void
 hashtable_free(hashtable *tbl)
 {
+  if (tbl == NULL)
+    return;
 
-	if (tbl == NULL)
-		return;
+  for (unsigned int i = 0; i < tbl->table_size; i++)
+    if ((tbl->entries[i] != NULL))
+      {
+	free(tbl->entries[i]->key);
+	free(tbl->entries[i]->value);
+      }
 
-	for (unsigned int i = 0; i < tbl->table_size; i++)
-		if (tbl->entries[i] != NULL) {
-//			free(tbl->entries[i]->key);
-//			free(tbl->entries[i]->value);
-//			free(tbl->entries[i]);
-	}
-	free(tbl->entries);
+  free(tbl->entries);
 }

Modified: user/gabor/grep/trunk/regex/hashtable.h
==============================================================================
--- user/gabor/grep/trunk/regex/hashtable.h	Tue Aug 30 16:33:59 2011	(r225265)
+++ user/gabor/grep/trunk/regex/hashtable.h	Tue Aug 30 16:40:17 2011	(r225266)
@@ -5,23 +5,31 @@
 
 #include <sys/types.h>
 
+#define HASH_OK         0
+#define HASH_UPDATED    1
+#define HASH_FAIL       2
+#define HASH_FULL       3
+#define HASH_NOTFOUND   4
+
+#define HASHSTEP(x,c) (((x << 5) + x) + (c))
+
 typedef struct {
-	void		*key;
-	void		*value;
+  void *key;
+  void *value;
 } hashtable_entry;
 
 typedef struct {
-	size_t		 key_size;
-	size_t		 table_size;
-	size_t		 usage;
-	size_t		 value_size;
-	hashtable_entry **entries;
+  size_t key_size;
+  size_t table_size;
+  size_t usage;
+  size_t value_size;
+  hashtable_entry **entries;
 } hashtable;
 
-void		 hashtable_free(hashtable *);
-int		 hashtable_get(hashtable *, const void *, void *);
-hashtable	*hashtable_init(size_t, size_t, size_t);
-int		 hashtable_put(hashtable *, const void *, const void *);
-int		 hashtable_remove(hashtable *, const void *);
+void hashtable_free(hashtable *);
+int hashtable_get(hashtable *, const void *, void *);
+hashtable *hashtable_init(size_t, size_t, size_t);
+int hashtable_put(hashtable *, const void *, const void *);
+int hashtable_remove(hashtable *, const void *);
 
 #endif	/* HASHTABLE.H */

Modified: user/gabor/grep/trunk/regex/tre-fastmatch.c
==============================================================================
--- user/gabor/grep/trunk/regex/tre-fastmatch.c	Tue Aug 30 16:33:59 2011	(r225265)
+++ user/gabor/grep/trunk/regex/tre-fastmatch.c	Tue Aug 30 16:40:17 2011	(r225266)
@@ -1,5 +1,3 @@
-/* $FreeBSD$ */
-
 /*-
  * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
  * Copyright (C) 2008-2011 Gabor Kovesdan <gabor@FreeBSD.org>
@@ -30,7 +28,6 @@
 #include "glue.h"
 
 #include <ctype.h>
-#include <hashtable.h>
 #include <limits.h>
 #include <regex.h>
 #include <stdbool.h>
@@ -48,14 +45,17 @@
 static int	fastcmp(const void *, const void *, size_t,
 			tre_str_type_t, bool, bool);
 
-/*
- * We will work with wide characters if they are supported
- */
-#ifdef TRE_WCHAR
-#define TRE_CHAR(n)	L##n
-#else
-#define TRE_CHAR(n)	n
-#endif
+#define FAIL_COMP(errcode)						\
+  {									\
+    if (fg->pattern)							\
+      xfree(fg->pattern);						\
+    if (fg->wpattern)							\
+      xfree(fg->wpattern);						\
+    if (fg->qsBc_table)							\
+      hashtable_free(fg->qsBc_table);					\
+    fg = NULL;								\
+    return errcode;							\
+  }
 
 /*
  * Skips n characters in the input string and assigns the start
@@ -143,7 +143,10 @@ static int	fastcmp(const void *, const v
 		&((tre_char_t *)startptr)[mismatch + 1], &bc);		\
 	      gs = fg->bmGs[mismatch];					\
 	    }								\
-	    bc = (r == 0) ? bc : fg->defBc;				\
+	    bc = (r == HASH_OK) ? bc : fg->defBc;			\
+	    DPRINT(("tre_fast_match: mismatch on character %lc,"	\
+		    "BC %d, GS %d\n",					\
+		    ((tre_char_t *)startptr)[mismatch + 1], bc, gs));	\
             break;							\
 	default:							\
 	  if (!fg->hasdot)						\
@@ -154,6 +157,9 @@ static int	fastcmp(const void *, const v
 	      gs = fg->sbmGs[mismatch];					\
 	    }								\
 	  bc = fg->qsBc[((unsigned char *)startptr)[mismatch + 1]];	\
+	  DPRINT(("tre_fast_match: mismatch on character %c,"		\
+		 "BC %d, GS %d\n",					\
+		 ((unsigned char *)startptr)[mismatch + 1], bc, gs));	\
       }									\
     if (fg->hasdot)							\
       shift = bc;							\
@@ -171,6 +177,7 @@ static int	fastcmp(const void *, const v
 	    u = 0;							\
 	  }								\
       }									\
+      DPRINT(("tre_fast_match: shifting %d characters\n", shift));	\
       j += shift;							\
   }
 
@@ -200,6 +207,8 @@ static int	fastcmp(const void *, const v
   for (int i = fg->hasdot + 1; i < fg->len; i++)			\
     {									\
       fg->qsBc[(unsigned)fg->pattern[i]] = fg->len - i;			\
+      DPRINT(("BC shift for char %c is %d\n", fg->pattern[i],		\
+	     fg->len - i));						\
       if (fg->icase)							\
         {								\
           char c = islower(fg->pattern[i]) ? toupper(fg->pattern[i])	\
@@ -224,15 +233,25 @@ static int	fastcmp(const void *, const v
   /* Preprocess pattern. */						\
   fg->qsBc_table = hashtable_init(fg->wlen * 4, sizeof(tre_char_t),	\
     sizeof(int));							\
+  if (!fg->qsBc_table)							\
+    FAIL_COMP(REG_ESPACE);						\
   for (unsigned int i = fg->hasdot + 1; i < fg->wlen; i++)		\
     {									\
       int k = fg->wlen - i;						\
-      hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k);		\
+      int r;								\
+									\
+      r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k);		\
+      if ((r == HASH_FAIL) || (r == HASH_FULL))				\
+	FAIL_COMP(REG_ESPACE);						\
+      DPRINT(("BC shift for wide char %lc is %d\n", fg->wpattern[i],	\
+	     fg->wlen - i));						\
       if (fg->icase)							\
 	{								\
 	  tre_char_t wc = iswlower(fg->wpattern[i]) ?			\
 	    towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]);	\
-	  hashtable_put(fg->qsBc_table, &wc, &k);			\
+	  r = hashtable_put(fg->qsBc_table, &wc, &k);			\
+	  if ((r == HASH_FAIL) || (r == HASH_FULL))			\
+	    FAIL_COMP(REG_ESPACE);					\
 	}								\
     }
 
@@ -245,7 +264,10 @@ static int	fastcmp(const void *, const v
       fg->sbmGs = xmalloc(fg->len * sizeof(int));			\
       if (!fg->sbmGs)							\
 	return REG_ESPACE;						\
-      _FILL_BMGS(fg->sbmGs, fg->pattern, fg->len, false);		\
+      if (fg->len == 1)							\
+	fg->sbmGs[0] = 1;						\
+      else								\
+	_FILL_BMGS(fg->sbmGs, fg->pattern, fg->len, false);		\
     }
 
 /*
@@ -257,7 +279,10 @@ static int	fastcmp(const void *, const v
       fg->bmGs = xmalloc(fg->wlen * sizeof(int));			\
       if (!fg->bmGs)							\
 	return REG_ESPACE;						\
-      _FILL_BMGS(fg->bmGs, fg->wpattern, fg->wlen, true);		\
+      if (fg->wlen == 1)						\
+	fg->bmGs[0] = 1;						\
+      else								\
+	_FILL_BMGS(fg->bmGs, fg->wpattern, fg->wlen, true);		\
     }
 
 #define _FILL_BMGS(arr, pat, plen, wide)				\
@@ -385,6 +410,10 @@ tre_compile_literal(fastmatch_t *fg, con
   SAVE_PATTERN(fg->pattern, fg->len);
 #endif
 
+  DPRINT(("tre_compile_literal: pattern: %s, icase: %c, word: %c, "
+	 "newline %c\n", fg->pattern, fg->icase ? 'y' : 'n',
+	 fg->word ? 'y' : 'n', fg->newline ? 'y' : 'n'));
+
   FILL_QSBC;
   FILL_BMGS;
 #ifdef TRE_WCHAR
@@ -438,10 +467,11 @@ tre_compile_fast(fastmatch_t *fg, const 
   for (unsigned int i = 0; i < n; i++)
     {
       /* Can still cheat? */
-      if ((tre_isalnum(pat[i])) || tre_isspace(pat[i]) ||
+      if (!(cflags & _REG_HEUR) &&
+	  ((tre_isalnum(pat[i])) || tre_isspace(pat[i]) ||
 	  (pat[i] == TRE_CHAR('_')) || (pat[i] == TRE_CHAR(',')) ||
 	  (pat[i] == TRE_CHAR('=')) || (pat[i] == TRE_CHAR('-')) ||
-	  (pat[i] == TRE_CHAR(':')) || (pat[i] == TRE_CHAR('/')))
+	  (pat[i] == TRE_CHAR(':')) || (pat[i] == TRE_CHAR('/'))))
 	continue;
       else if (pat[i] == TRE_CHAR('.'))
 	fg->hasdot = i;
@@ -461,6 +491,12 @@ tre_compile_fast(fastmatch_t *fg, const 
   SAVE_PATTERN(fg->pattern, fg->len);
 #endif
 
+  DPRINT(("tre_compile_fast: pattern: %s, bol %c, eol %c, "
+	 "icase: %c, word: %c, newline %c\n", fg->pattern,
+	 fg->bol ? 'y' : 'n', fg->eol ? 'y' : 'n',
+	 fg->icase ? 'y' : 'n', fg->word ? 'y' : 'n',
+	 fg->newline ? 'y' : 'n'));
+
   FILL_QSBC;
   FILL_BMGS;
 #ifdef TRE_WCHAR
@@ -644,6 +680,9 @@ void
 tre_free_fast(fastmatch_t *fg)
 {
 
+  DPRINT(("tre_fast_free: freeing structures for pattern %s\n",
+	 fg->pattern));
+
 #ifdef TRE_WCHAR
   hashtable_free(fg->qsBc_table);
   if (!fg->hasdot)
@@ -697,6 +736,7 @@ fastcmp(const void *pat, const void *dat
 		    : (pat_byte[i] == str_byte[i]))
 	  continue;
       }
+    DPRINT(("fastcmp: mismatch at position %d\n", i));
     ret = -(i + 1);
     break;
   }



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201108301640.p7UGeIGk058943>