Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 18 Aug 2011 17:09:10 +0000 (UTC)
From:      Gabor Kovesdan <gabor@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-user@freebsd.org
Subject:   svn commit: r224980 - in user/gabor/grep/trunk: . regex
Message-ID:  <201108181709.p7IH9ATh066263@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: gabor
Date: Thu Aug 18 17:09:10 2011
New Revision: 224980
URL: http://svn.freebsd.org/changeset/base/224980

Log:
  - Backport the fast matching algorithm from TRE that has been written
    during this summer as a part of GSoC 2011. This only improves the
    performance slightly but expected to be much better tested and
    much more robust.

Added:
  user/gabor/grep/trunk/regex/
  user/gabor/grep/trunk/regex/fastmatch.c   (contents, props changed)
  user/gabor/grep/trunk/regex/fastmatch.h   (contents, props changed)
  user/gabor/grep/trunk/regex/glue.h   (contents, props changed)
  user/gabor/grep/trunk/regex/hashtable.c   (contents, props changed)
  user/gabor/grep/trunk/regex/hashtable.h   (contents, props changed)
  user/gabor/grep/trunk/regex/tre-fastmatch.c   (contents, props changed)
  user/gabor/grep/trunk/regex/tre-fastmatch.h   (contents, props changed)
  user/gabor/grep/trunk/regex/xmalloc.c   (contents, props changed)
  user/gabor/grep/trunk/regex/xmalloc.h   (contents, props changed)
Deleted:
  user/gabor/grep/trunk/fastgrep.c
Modified:
  user/gabor/grep/trunk/Makefile
  user/gabor/grep/trunk/grep.c
  user/gabor/grep/trunk/grep.h
  user/gabor/grep/trunk/util.c

Modified: user/gabor/grep/trunk/Makefile
==============================================================================
--- user/gabor/grep/trunk/Makefile	Thu Aug 18 16:57:51 2011	(r224979)
+++ user/gabor/grep/trunk/Makefile	Thu Aug 18 17:09:10 2011	(r224980)
@@ -9,7 +9,12 @@ PROG=	grep
 .else
 PROG=	bsdgrep
 .endif
-SRCS=	fastgrep.c file.c grep.c queue.c util.c
+SRCS=	file.c grep.c queue.c util.c
+
+# Extra files ported backported form some regex improvements
+.PATH: ${.CURDIR}/regex
+SRCS+=	fastmatch.c hashtable.c tre-fastmatch.c xmalloc.c
+CFLAGS+=-I${.CURDIR}/regex	
 
 .if ${MK_BSD_GREP} == "yes"
 LINKS=	${BINDIR}/grep ${BINDIR}/egrep \

Modified: user/gabor/grep/trunk/grep.c
==============================================================================
--- user/gabor/grep/trunk/grep.c	Thu Aug 18 16:57:51 2011	(r224979)
+++ user/gabor/grep/trunk/grep.c	Thu Aug 18 17:09:10 2011	(r224980)
@@ -48,6 +48,7 @@ __FBSDID("$FreeBSD$");
 #include <string.h>
 #include <unistd.h>
 
+#include "fastmatch.h"
 #include "grep.h"
 
 #ifndef WITHOUT_NLS
@@ -83,7 +84,7 @@ bool		 matchall;
 unsigned int	 patterns, pattern_sz;
 char		**pattern;
 regex_t		*r_pattern;
-fastgrep_t	*fg_pattern;
+fastmatch_t	*fg_pattern;
 
 /* Filename exclusion/inclusion patterns */
 unsigned int	 fpatterns, fpattern_sz;
@@ -662,10 +663,10 @@ main(int argc, char *argv[])
 		/* Check if cheating is allowed (always is for fgrep). */
 	if (grepbehave == GREP_FIXED) {
 		for (i = 0; i < patterns; ++i)
-			fgrepcomp(&fg_pattern[i], pattern[i]);
+			fixcomp(&fg_pattern[i], pattern[i], cflags);
 	} else {
 		for (i = 0; i < patterns; ++i) {
-			if (fastcomp(&fg_pattern[i], pattern[i])) {
+			if (fastcomp(&fg_pattern[i], pattern[i], cflags) != 0) {
 				/* Fall back to full regex library */
 				c = regcomp(&r_pattern[i], pattern[i], cflags);
 				if (c != 0) {

Modified: user/gabor/grep/trunk/grep.h
==============================================================================
--- user/gabor/grep/trunk/grep.h	Thu Aug 18 16:57:51 2011	(r224979)
+++ user/gabor/grep/trunk/grep.h	Thu Aug 18 17:09:10 2011	(r224980)
@@ -36,6 +36,8 @@
 #include <stdio.h>
 #include <zlib.h>
 
+#include "fastmatch.h"
+
 #ifdef WITHOUT_NLS
 #define getstr(n)	 errstr[n]
 #else
@@ -95,17 +97,6 @@ struct epat {
 	int		 mode;
 };
 
-typedef struct {
-	size_t		 len;
-	unsigned char	*pattern;
-	int		 qsBc[UCHAR_MAX + 1];
-	/* flags */
-	bool		 bol;
-	bool		 eol;
-	bool		 reversed;
-	bool		 word;
-} fastgrep_t;
-
 /* Flags passed to regcomp() and regexec() */
 extern int	 cflags, eflags;
 
@@ -125,7 +116,7 @@ extern unsigned int dpatterns, fpatterns
 extern char    **pattern;
 extern struct epat *dpattern, *fpattern;
 extern regex_t	*er_pattern, *r_pattern;
-extern fastgrep_t *fg_pattern;
+extern fastmatch_t *fg_pattern;
 
 /* For regex errors  */
 #define RE_ERROR_BUF	512
@@ -150,8 +141,3 @@ void	 clearqueue(void);
 void		 grep_close(struct file *f);
 struct file	*grep_open(const char *path);
 char		*grep_fgetln(struct file *f, size_t *len);
-
-/* fastgrep.c */
-int		 fastcomp(fastgrep_t *, const char *);
-void		 fgrepcomp(fastgrep_t *, const char *);
-int		 grep_search(fastgrep_t *, const unsigned char *, size_t, regmatch_t *);

Added: user/gabor/grep/trunk/regex/fastmatch.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ user/gabor/grep/trunk/regex/fastmatch.c	Thu Aug 18 17:09:10 2011	(r224980)
@@ -0,0 +1,227 @@
+/* $FreeBSD$ */
+
+/*-
+ * Copyright (C) 2011 Gabor Kovesdan <gabor@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "glue.h"
+
+#include <fastmatch.h>
+#include <regex.h>
+#include <string.h>
+
+#include "tre-fastmatch.h"
+#include "xmalloc.h"
+
+/* XXX: avoid duplication */
+#define CONV_PAT							\
+  int ret;								\
+  tre_char_t *wregex;							\
+  size_t wlen;								\
+									\
+  wregex = xmalloc(sizeof(tre_char_t) * (n + 1));			\
+  if (wregex == NULL)							\
+    return REG_ESPACE;							\
+									\
+  if (TRE_MB_CUR_MAX == 1)						\
+    {									\
+      unsigned int i;							\
+      const unsigned char *str = (const unsigned char *)regex;		\
+      tre_char_t *wstr = wregex;					\
+									\
+      for (i = 0; i < n; i++)						\
+        *(wstr++) = *(str++);						\
+      wlen = n;								\
+    }									\
+  else									\
+    {									\
+      int consumed;							\
+      tre_char_t *wcptr = wregex;					\
+      mbstate_t state;							\
+      memset(&state, '\0', sizeof(state));				\
+      while (n > 0)							\
+        {								\
+          consumed = tre_mbrtowc(wcptr, regex, n, &state);		\
+									\
+          switch (consumed)						\
+            {								\
+            case 0:							\
+              if (*regex == '\0')					\
+                consumed = 1;						\
+              else							\
+                {							\
+                  xfree(wregex);					\
+                  return REG_BADPAT;					\
+                }							\
+              break;							\
+            case -1:							\
+              DPRINT(("mbrtowc: error %d: %s.\n", errno,		\
+		strerror(errno)));					\
+              xfree(wregex);						\
+              return REG_BADPAT;					\
+            case -2:							\
+              consumed = n;						\
+              break;							\
+            }								\
+          regex += consumed;						\
+          n -= consumed;						\
+          wcptr++;							\
+        }								\
+      wlen = wcptr - wregex;						\
+    }									\
+									\
+  wregex[wlen] = L'\0';
+
+int
+tre_fixncomp(fastmatch_t *preg, const char *regex, size_t n, int cflags)
+{
+  CONV_PAT;
+
+  ret = tre_compile_literal(preg, wregex, wlen, cflags);
+  xfree(wregex);
+
+  return ret;
+}
+
+int
+tre_fastncomp(fastmatch_t *preg, const char *regex, size_t n, int cflags)
+{
+  CONV_PAT;
+
+  ret = (cflags & REG_LITERAL) ?
+    tre_compile_literal(preg, wregex, wlen, cflags) :
+    tre_compile_fast(preg, wregex, wlen, cflags);
+  xfree(wregex);
+
+  return ret;
+}
+
+
+int
+tre_fixcomp(fastmatch_t *preg, const char *regex, int cflags)
+{
+  return tre_fixncomp(preg, regex, regex ? strlen(regex) : 0, cflags);
+}
+
+int
+tre_fastcomp(fastmatch_t *preg, const char *regex, int cflags)
+{
+  return tre_fastncomp(preg, regex, regex ? strlen(regex) : 0, cflags);
+}
+
+int
+tre_fixwncomp(fastmatch_t *preg, const wchar_t *regex, size_t n, int cflags)
+{
+  return tre_compile_literal(preg, regex, n, cflags);
+}
+
+int
+tre_fastwncomp(fastmatch_t *preg, const wchar_t *regex, size_t n, int cflags)
+{
+  return (cflags & REG_LITERAL) ?
+    tre_compile_literal(preg, regex, n, cflags) :
+    tre_compile_fast(preg, regex, n, cflags);
+}
+
+int
+tre_fixwcomp(fastmatch_t *preg, const wchar_t *regex, int cflags)
+{
+  return tre_fixwncomp(preg, regex, regex ? tre_strlen(regex) : 0, cflags);
+}
+
+int
+tre_fastwcomp(fastmatch_t *preg, const wchar_t *regex, int cflags)
+{
+  return tre_fastwncomp(preg, regex, regex ? tre_strlen(regex) : 0, cflags);
+}
+
+void
+tre_fastfree(fastmatch_t *preg)
+{
+  tre_free_fast(preg);
+}
+
+/* XXX: avoid duplication */
+#define ADJUST_OFFSETS							\
+  {									\
+    size_t slen = (size_t)(pmatch[0].rm_eo - pmatch[0].rm_so);		\
+    size_t offset = pmatch[0].rm_so;					\
+    int ret;								\
+									\
+    if ((len != (unsigned)-1) && (pmatch[0].rm_eo > len))		\
+      return REG_NOMATCH;						\
+    if ((long long)pmatch[0].rm_eo - pmatch[0].rm_so < 0)		\
+      return REG_NOMATCH;						\
+    ret = tre_match_fast(preg, &string[offset], slen, type, nmatch,	\
+			 pmatch, eflags);				\
+    for (unsigned i = 0; (i == 0) || (!(eflags & REG_NOSUB) &&		\
+         (i < nmatch)); i++)						\
+      {									\
+        pmatch[i].rm_so += offset;					\
+        pmatch[i].rm_eo += offset;					\
+      }									\
+    return ret;								\
+  }
+
+int
+tre_fastnexec(const fastmatch_t *preg, const char *string, size_t len,
+         size_t nmatch, regmatch_t pmatch[], int eflags)
+{
+  tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS;
+
+  if (eflags & REG_STARTEND)
+    ADJUST_OFFSETS
+  else
+    return tre_match_fast(preg, string, len, type, nmatch,
+      pmatch, eflags);
+}
+
+int
+tre_fastexec(const fastmatch_t *preg, const char *string, size_t nmatch,
+	     regmatch_t pmatch[], int eflags)
+{
+  return tre_fastnexec(preg, string, (size_t)-1, nmatch, pmatch, eflags);
+}
+
+int
+tre_fastwnexec(const fastmatch_t *preg, const wchar_t *string, size_t len,
+          size_t nmatch, regmatch_t pmatch[], int eflags)
+{
+  tre_str_type_t type = STR_WIDE;
+
+  if (eflags & REG_STARTEND)
+    ADJUST_OFFSETS
+  else
+    return tre_match_fast(preg, string, len, type, nmatch,
+      pmatch, eflags);
+}
+
+int
+tre_fastwexec(const fastmatch_t *preg, const wchar_t *string,
+         size_t nmatch, regmatch_t pmatch[], int eflags)
+{
+  return tre_fastwnexec(preg, string, (size_t)-1, nmatch, pmatch, eflags);
+}
+

Added: user/gabor/grep/trunk/regex/fastmatch.h
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ user/gabor/grep/trunk/regex/fastmatch.h	Thu Aug 18 17:09:10 2011	(r224980)
@@ -0,0 +1,103 @@
+/* $FreeBSD$ */
+
+#ifndef FASTMATCH_H
+#define FASTMATCH_H 1
+
+#include <hashtable.h>
+#include <limits.h>
+#include <regex.h>
+#include <stdbool.h>
+#include <wchar.h>
+
+typedef struct {
+  size_t	 wlen;
+  size_t	 len;
+  wchar_t	*wpattern;
+  int		 hasdot;
+  int		 qsBc[UCHAR_MAX + 1];
+  int		*bmGs;
+  char		*pattern;
+  int		 defBc;
+  hashtable	*qsBc_table;
+  int		*sbmGs;
+
+  /* flags */
+  bool		 bol;
+  bool		 eol;
+  bool		 word;
+  bool		 icase;
+  bool		 newline;
+} fastmatch_t;
+
+extern int
+tre_fixcomp(fastmatch_t *preg, const char *regex, int cflags);
+
+extern int
+tre_fastcomp(fastmatch_t *preg, const char *regex, int cflags);
+
+extern int
+tre_fastexec(const fastmatch_t *preg, const char *string, size_t nmatch,
+  regmatch_t pmatch[], int eflags);
+
+extern void
+tre_fastfree(fastmatch_t *preg);
+
+extern int
+tre_fixwcomp(fastmatch_t *preg, const wchar_t *regex, int cflags);
+
+extern int
+tre_fastwcomp(fastmatch_t *preg, const wchar_t *regex, int cflags);
+
+extern int
+tre_fastwexec(const fastmatch_t *preg, const wchar_t *string,
+         size_t nmatch, regmatch_t pmatch[], int eflags);
+
+/* Versions with a maximum length argument and therefore the capability to
+   handle null characters in the middle of the strings. */
+extern int
+tre_fixncomp(fastmatch_t *preg, const char *regex, size_t len, int cflags);
+
+extern int
+tre_fastncomp(fastmatch_t *preg, const char *regex, size_t len, int cflags);
+
+extern int
+tre_fastnexec(const fastmatch_t *preg, const char *string, size_t len,
+  size_t nmatch, regmatch_t pmatch[], int eflags);
+
+extern int
+tre_fixwncomp(fastmatch_t *preg, const wchar_t *regex, size_t len, int cflags);
+
+extern int
+tre_fastwncomp(fastmatch_t *preg, const wchar_t *regex, size_t len, int cflags);
+
+extern int
+tre_fastwnexec(const fastmatch_t *preg, const wchar_t *string, size_t len,
+  size_t nmatch, regmatch_t pmatch[], int eflags);
+
+#define fixncomp	tre_fixncomp
+#define fastncomp	tre_fastncomp
+#define fixcomp		tre_fixcomp
+#define fastcomp	tre_fastcomp
+#define fixwncomp	tre_fixwncomp
+#define fastwncomp	tre_fastwncomp
+#define fixwcomp	tre_fixwcomp
+#define fastwcomp	tre_fastwcomp
+#define fastfree	tre_fastfree
+#define fastnexec	tre_fastnexec
+#define fastexec	tre_fastexec
+#define fastwnexec	tre_fastwnexec
+#define fastwexec	tre_fastwexec
+#define fixcomp		tre_fixcomp
+#define fastcomp	tre_fastcomp
+#define fastexec	tre_fastexec
+#define fastfree	tre_fastfree
+#define fixwcomp	tre_fixwcomp
+#define fastwcomp	tre_fastwcomp
+#define fastwexec	tre_fastwexec
+#define fixncomp	tre_fixncomp
+#define fastncomp	tre_fastncomp
+#define fastnexec	tre_fastnexec
+#define fixwncomp	tre_fixwncomp
+#define fastwncomp	tre_fastwncomp
+#define fastwnexec	tre_fastwnexec
+#endif		/* FASTMATCH_H */

Added: user/gabor/grep/trunk/regex/glue.h
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ user/gabor/grep/trunk/regex/glue.h	Thu Aug 18 17:09:10 2011	(r224980)
@@ -0,0 +1,30 @@
+/* $FreeBSD$ */
+
+#ifndef GLUE_H
+#define GLUE_H
+
+#include <regex.h>
+
+#define TRE_WCHAR			1
+#define TRE_MULTIBYTE			1
+
+#define tre_char_t			wchar_t
+#define tre_mbrtowc(pwc, s, n, ps)	(mbrtowc((pwc), (s), (n), (ps)))
+#define tre_strlen			wcslen
+#define tre_isspace			iswspace
+#define tre_isalnum			iswalnum
+
+#define REG_LITERAL			0020
+#define REG_WORD			0100
+#define REG_GNU				0400
+
+#define REG_OK				0
+
+#define TRE_MB_CUR_MAX			MB_CUR_MAX
+
+#define DPRINT(msg)			
+#define MIN(a,b)			((a > b) ? (b) : (a))
+#define MAX(a,b)			((a > b) ? (a) : (b))
+
+typedef enum { STR_WIDE, STR_BYTE, STR_MBS, STR_USER } tre_str_type_t;
+#endif

Added: user/gabor/grep/trunk/regex/hashtable.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ user/gabor/grep/trunk/regex/hashtable.c	Thu Aug 18 17:09:10 2011	(r224980)
@@ -0,0 +1,159 @@
+/* $FreeBSD$ */
+
+/*-
+ * Copyright (C) 2011 Gabor Kovesdan <gabor@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/hash.h>
+#include <hashtable.h>
+#include <stdlib.h>
+#include <string.h>
+
+hashtable
+*hashtable_init(size_t table_size, size_t key_size, size_t value_size)
+{
+	hashtable *tbl;
+
+	tbl = malloc(sizeof(hashtable));
+	if (tbl == NULL)
+		return (NULL);
+
+	tbl->entries = calloc(sizeof(hashtable_entry *), table_size);
+	if (tbl->entries == NULL) {
+		free(tbl);
+		return (NULL);
+	}
+
+	tbl->table_size = table_size;
+	tbl->usage = 0;
+	tbl->key_size = key_size;
+	tbl->value_size = value_size;
+
+	return (tbl);
+}
+
+int
+hashtable_put(hashtable *tbl, const void *key, const void *value)
+{
+	uint32_t hash = 0;
+
+	if (tbl->table_size == tbl->usage)
+		return (-1);
+
+	hash = hash32_buf(key, tbl->key_size, hash);
+	hash %= tbl->table_size;
+
+	while (tbl->entries[hash] != NULL)
+		hash = (hash >= tbl->table_size) ? 0 : hash + 1;
+
+	tbl->entries[hash] = malloc(sizeof(hashtable_entry));
+	if (tbl->entries[hash] == NULL)
+		return (-1);
+
+	tbl->entries[hash]->key = malloc(tbl->key_size);
+	if (tbl->entries[hash]->key == NULL) {
+		free(tbl->entries[hash]);
+		return (-1);
+	}
+
+	tbl->entries[hash]->value = malloc(tbl->value_size);
+	if (tbl->entries[hash]->value == NULL) {
+		free(tbl->entries[hash]->key);
+		free(tbl->entries[hash]);
+		return (-1);
+	}
+
+	memcpy(&tbl->entries[hash]->key, key, tbl->key_size);
+	memcpy(&tbl->entries[hash]->value, value, tbl->value_size);
+	tbl->usage++;
+
+	return (0);
+}
+
+static hashtable_entry
+*hashtable_lookup(const hashtable *tbl, const void *key)
+{
+	uint32_t hash = 0;
+
+	hash = hash32_buf(key, tbl->key_size, hash);
+	hash %= tbl->table_size;
+
+	for (;;) {
+		if (tbl->entries[hash] == NULL)
+			return (NULL);
+		else if (memcmp(key, &tbl->entries[hash]->key,
+		    tbl->key_size) == 0)
+			return (tbl->entries[hash]);
+
+		hash = (hash == tbl->table_size) ? 0 : hash + 1;
+  	}
+}
+
+int
+hashtable_get(hashtable *tbl, const void *key, void *value)
+{
+	hashtable_entry *entry;
+
+	entry = hashtable_lookup(tbl, key);
+	if (entry == NULL)
+		return (-1);
+
+	memcpy(value, &entry->value, tbl->value_size);
+	return (0);
+}
+
+int
+hashtable_remove(hashtable *tbl, const void *key)
+{
+	hashtable_entry *entry;
+
+	entry = hashtable_lookup(tbl, key);
+	if (entry == NULL)
+		return (-1);
+
+//	free(entry->key);
+//	free(entry->value);
+	free(entry);
+
+	tbl->usage--;
+
+	return (0);
+}
+
+void
+hashtable_free(hashtable *tbl)
+{
+
+	if (tbl == NULL)
+		return;
+
+	for (unsigned int i = 0; i < tbl->table_size; i++)
+		if (tbl->entries[i] != NULL) {
+//			free(tbl->entries[i]->key);
+//			free(tbl->entries[i]->value);
+//			free(tbl->entries[i]);
+	}
+	free(tbl->entries);
+}

Added: user/gabor/grep/trunk/regex/hashtable.h
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ user/gabor/grep/trunk/regex/hashtable.h	Thu Aug 18 17:09:10 2011	(r224980)
@@ -0,0 +1,27 @@
+/* $FreeBSD$ */
+
+#ifndef HASHTABLE_H
+#define HASHTABLE_H 1
+
+#include <sys/types.h>
+
+typedef struct {
+	void		*key;
+	void		*value;
+} hashtable_entry;
+
+typedef struct {
+	size_t		 key_size;
+	size_t		 table_size;
+	size_t		 usage;
+	size_t		 value_size;
+	hashtable_entry **entries;
+} hashtable;
+
+void		 hashtable_free(hashtable *);
+int		 hashtable_get(hashtable *, const void *, void *);
+hashtable	*hashtable_init(size_t, size_t, size_t);
+int		 hashtable_put(hashtable *, const void *, const void *);
+int		 hashtable_remove(hashtable *, const void *);
+
+#endif	/* HASHTABLE.H */

Added: user/gabor/grep/trunk/regex/tre-fastmatch.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ user/gabor/grep/trunk/regex/tre-fastmatch.c	Thu Aug 18 17:09:10 2011	(r224980)
@@ -0,0 +1,704 @@
+/* $FreeBSD$ */
+
+/*-
+ * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
+ * Copyright (C) 2008-2011 Gabor Kovesdan <gabor@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "glue.h"
+
+#include <ctype.h>
+#include <hashtable.h>
+#include <limits.h>
+#include <regex.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef TRE_WCHAR
+#include <wchar.h>
+#include <wctype.h>
+#endif
+
+#include "hashtable.h"
+#include "tre-fastmatch.h"
+#include "xmalloc.h"
+
+static int	fastcmp(const void *, const void *, size_t,
+			tre_str_type_t, bool, bool);
+
+/*
+ * We will work with wide characters if they are supported
+ */
+#ifdef TRE_WCHAR
+#define TRE_CHAR(n)	L##n
+#else
+#define TRE_CHAR(n)	n
+#endif
+
+/*
+ * Skips n characters in the input string and assigns the start
+ * address to startptr. Note: as per IEEE Std 1003.1-2008
+ * matching is based on bit pattern not character representations
+ * so we can handle MB strings as byte sequences just like
+ * SB strings.
+ */
+#define SKIP_CHARS(n)							\
+  switch (type)								\
+    {									\
+      case STR_WIDE:							\
+	startptr = str_wide + n;					\
+	break;								\
+      default:								\
+	startptr = str_byte + n;					\
+    }
+
+/*
+ * Converts the wide string pattern to SB/MB string and stores
+ * it in fg->pattern. Sets fg->len to the byte length of the
+ * converted string.
+ */
+#define STORE_MBS_PAT							\
+  {									\
+    size_t siz;								\
+									\
+    siz = wcstombs(NULL, fg->wpattern, 0);				\
+    if (siz == (size_t)-1)						\
+      return REG_BADPAT;						\
+    fg->len = siz;							\
+    fg->pattern = xmalloc(siz + 1);					\
+    if (fg->pattern == NULL)						\
+      return REG_ESPACE;						\
+    wcstombs(fg->pattern, fg->wpattern, siz);				\
+    fg->pattern[siz] = '\0';						\
+  }									\
+
+/*
+ * Compares the pattern to the input string at the position
+ * stored in startptr.
+ */
+#define COMPARE								\
+  switch (type)								\
+    {									\
+      case STR_WIDE:							\
+	mismatch = fastcmp(fg->wpattern, startptr, fg->wlen, type,	\
+			   fg->icase, fg->newline);			\
+	break;								\
+      default:								\
+	mismatch = fastcmp(fg->pattern, startptr, fg->len, type,	\
+			   fg->icase, fg->newline);			\
+      }									\
+
+#define IS_OUT_OF_BOUNDS						\
+  ((type == STR_WIDE) ? ((j + fg->wlen) > len) : ((j + fg->len) > len))
+
+/*
+ * Checks whether the new position after shifting in the input string
+ * is out of the bounds and break out from the loop if so.
+ */
+#define CHECKBOUNDS							\
+  if (IS_OUT_OF_BOUNDS)							\
+    break;								\
+
+/*
+ * Shifts in the input string after a mismatch. The position of the
+ * mismatch is stored in the mismatch variable.
+ */
+#define SHIFT								\
+  CHECKBOUNDS;								\
+									\
+  {									\
+    int bc = 0, gs = 0, ts, r = -1;					\
+									\
+    switch (type)							\
+      {									\
+	case STR_WIDE:							\
+	  if (!fg->hasdot)						\
+	    {								\
+	      if (u != 0 && mismatch == fg->wlen - 1 - shift)		\
+		mismatch -= u;						\
+	      v = fg->wlen - 1 - mismatch;				\
+	      r = hashtable_get(fg->qsBc_table,				\
+		&((tre_char_t *)startptr)[mismatch + 1], &bc);		\
+	      gs = fg->bmGs[mismatch];					\
+	    }								\
+	    bc = (r == 0) ? bc : fg->defBc;				\
+            break;							\
+	default:							\
+	  if (!fg->hasdot)						\
+	    {								\
+	      if (u != 0 && mismatch == fg->len - 1 - shift)		\
+		mismatch -= u;						\
+	      v = fg->len - 1 - mismatch;				\
+	      gs = fg->sbmGs[mismatch];					\
+	    }								\
+	  bc = fg->qsBc[((unsigned char *)startptr)[mismatch + 1]];	\
+      }									\
+    if (fg->hasdot)							\
+      shift = bc;							\
+    else								\
+      {									\
+	ts = u - v;							\
+	shift = MAX(ts, bc);						\
+	shift = MAX(shift, gs);						\
+	if (shift == gs)						\
+	  u = MIN((type == STR_WIDE ? fg->wlen : fg->len) - shift, v);	\
+	else								\
+	  {								\
+	    if (ts < bc)						\
+	      shift = MAX(shift, u + 1);				\
+	    u = 0;							\
+	  }								\
+      }									\
+      j += shift;							\
+  }
+
+/*
+ * Normal Quick Search would require a shift based on the position the
+ * next character after the comparison is within the pattern.  With
+ * wildcards, the position of the last dot effects the maximum shift
+ * distance.
+ * The closer to the end the wild card is the slower the search.
+ *
+ * Examples:
+ * Pattern    Max shift
+ * -------    ---------
+ * this               5
+ * .his               4
+ * t.is               3
+ * th.s               2
+ * thi.               1
+ */
+
+/*
+ * Fills in the bad character shift array for SB/MB strings.
+ */
+#define FILL_QSBC							\
+  for (unsigned int i = 0; i <= UCHAR_MAX; i++)				\
+    fg->qsBc[i] = fg->len - fg->hasdot;					\
+  for (int i = fg->hasdot + 1; i < fg->len; i++)			\
+    {									\
+      fg->qsBc[(unsigned)fg->pattern[i]] = fg->len - i;			\
+      if (fg->icase)							\
+        {								\
+          char c = islower(fg->pattern[i]) ? toupper(fg->pattern[i])	\
+            : tolower(fg->pattern[i]);					\
+          fg->qsBc[(unsigned)c] = fg->len - i;				\
+        }								\
+    }
+
+/*
+ * Fills in the bad character shifts into a hastable for wide strings.
+ * With wide characters it is not possible any more to use a normal
+ * array because there are too many characters and we could not
+ * provide enough memory. Fortunately, we only have to store distinct
+ * values for so many characters as the number of distinct characters
+ * in the pattern, so we can store them in a hashtable and store a
+ * default shift value for the rest.
+ */
+#define FILL_QSBC_WIDE							\
+  /* Adjust the shift based on location of the last dot ('.'). */	\
+  fg->defBc = fg->wlen - fg->hasdot;					\
+									\
+  /* Preprocess pattern. */						\
+  fg->qsBc_table = hashtable_init(fg->wlen * 4, sizeof(tre_char_t),	\
+    sizeof(int));							\
+  for (unsigned int i = fg->hasdot + 1; i < fg->wlen; i++)		\
+    {									\
+      int k = fg->wlen - i;						\
+      hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k);		\
+      if (fg->icase)							\
+	{								\
+	  tre_char_t wc = iswlower(fg->wpattern[i]) ?			\
+	    towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]);	\
+	  hashtable_put(fg->qsBc_table, &wc, &k);			\
+	}								\
+    }
+
+/*
+ * Fills in the good suffix table for SB/MB strings.
+ */
+#define FILL_BMGS							\
+  if (!fg->hasdot)							\
+    {									\
+      fg->sbmGs = xmalloc(fg->len * sizeof(int));			\
+      if (!fg->sbmGs)							\
+	return REG_ESPACE;						\
+      _FILL_BMGS(fg->sbmGs, fg->pattern, fg->len, false);		\
+    }
+
+/*
+ * Fills in the good suffix table for wide strings.
+ */
+#define FILL_BMGS_WIDE							\
+  if (!fg->hasdot)							\
+    {									\
+      fg->bmGs = xmalloc(fg->wlen * sizeof(int));			\
+      if (!fg->bmGs)							\
+	return REG_ESPACE;						\
+      _FILL_BMGS(fg->bmGs, fg->wpattern, fg->wlen, true);		\
+    }
+
+#define _FILL_BMGS(arr, pat, plen, wide)				\
+  {									\
+    char *p;								\
+    tre_char_t *wp;							\
+									\
+    if (wide)								\
+      {									\
+	if (fg->icase)							\
+	  {								\
+	    wp = xmalloc(plen * sizeof(tre_char_t));			\
+	    if (wp == NULL)						\
+	      return REG_ESPACE;					\
+	    for (int i = 0; i < plen; i++)				\
+	      wp[i] = towlower(pat[i]);					\
+	    _CALC_BMGS(arr, wp, plen);					\
+	    xfree(wp);							\
+	  }								\
+	else								\
+	  _CALC_BMGS(arr, pat, plen);					\
+      }									\
+    else								\
+      {									\
+	if (fg->icase)							\
+	  {								\
+	    p = xmalloc(plen);						\
+	    if (p == NULL)						\
+	      return REG_ESPACE;					\
+	    for (int i = 0; i < plen; i++)				\
+	      p[i] = tolower(pat[i]);					\
+	    _CALC_BMGS(arr, p, plen);					\
+	    xfree(p);							\
+	  }								\
+	else								\
+	  _CALC_BMGS(arr, pat, plen);					\
+      }									\
+  }
+
+#define _CALC_BMGS(arr, pat, plen)					\
+  {									\
+    int f, g;								\
+									\
+    int *suff = xmalloc(plen * sizeof(int));				\
+    if (suff == NULL)							\
+      return REG_ESPACE;						\
+									\
+    suff[plen - 1] = plen;						\
+    g = plen - 1;							\
+    for (int i = plen - 2; i >= 0; i--)					\
+      {									\
+	if (i > g && suff[i + plen - 1 - f] < i - g)			\
+	  suff[i] = suff[i + plen - 1 - f];				\
+	else								\
+	  {								\
+	    if (i < g)							\
+	      g = i;							\
+	    f = i;							\
+	    while (g >= 0 && pat[g] == pat[g + plen - 1 - f])		\
+	      g--;							\
+	    suff[i] = f - g;						\
+	  }								\
+      }									\
+									\
+    for (int i = 0; i < plen; i++)					\
+      arr[i] = plen;							\
+    g = 0;								\
+    for (int i = plen - 1; i >= 0; i--)					\
+      if (suff[i] == i + 1)						\
+	for(; g < plen - 1 - i; g++)					\
+	  if (arr[g] == plen)						\
+	    arr[g] = plen - 1 - i;					\
+    for (int i = 0; i <= plen - 2; i++)					\
+      arr[plen - 1 - suff[i]] = plen - 1 - i;				\
+									\
+    xfree(suff);							\
+  }
+
+/*
+ * Copies the pattern pat having lenght n to p and stores
+ * the size in l.

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201108181709.p7IH9ATh066263>