Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 25 Jul 2011 21:47:56 +0000 (UTC)
From:      Gabor Kovesdan <gabor@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-user@freebsd.org
Subject:   svn commit: r224407 - user/gabor/tre-integration/contrib/tre/lib
Message-ID:  <201107252147.p6PLluZq078479@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: gabor
Date: Mon Jul 25 21:47:56 2011
New Revision: 224407
URL: http://svn.freebsd.org/changeset/base/224407

Log:
  - Save the pattern in SB/MB string form, as well, to speed up matching
    SB/MB input
  - Macroify repeating code to eliminate duplication

Modified:
  user/gabor/tre-integration/contrib/tre/lib/fastmatch.c
  user/gabor/tre-integration/contrib/tre/lib/fastmatch.h

Modified: user/gabor/tre-integration/contrib/tre/lib/fastmatch.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/fastmatch.c	Mon Jul 25 21:44:35 2011	(r224406)
+++ user/gabor/tre-integration/contrib/tre/lib/fastmatch.c	Mon Jul 25 21:47:56 2011	(r224407)
@@ -41,9 +41,10 @@
 #include "tre-internal.h"
 #include "xmalloc.h"
 
-static int	fastcmp(const tre_char_t *, const void *, size_t,
+static int	fastcmp(const void *, const void *, size_t,
 			tre_str_type_t);
 static void	revstr(tre_char_t *, int);
+static void	revs(char *str, int len);
 
 #ifdef TRE_WCHAR
 #define TRE_CHAR(n)	L##n
@@ -56,15 +57,8 @@ static void	revstr(tre_char_t *, int);
     switch (type)						\
       {								\
 	case STR_BYTE:						\
-	  startptr = str_byte + n;				\
-	  break;						\
 	case STR_MBS:						\
-	  for (skip = j = 0; j < n; j++)			\
-	    {							\
-	      siz = mbrlen(str_byte + skip, MB_CUR_MAX, NULL);	\
-	      skip += siz;					\
-	    }							\
-	  startptr = str_byte + skip;				\
+	  startptr = str_byte + n;				\
 	  break;						\
 	case STR_WIDE:						\
 	  startptr = str_wide + n;				\
@@ -75,40 +69,145 @@ static void	revstr(tre_char_t *, int);
       }								\
   } while (0);							\
 
+#define STORE_MBS_PAT						\
+  do {								\
+    size_t siz;							\
+								\
+    siz = wcstombs(NULL, fg->wpattern, 0);			\
+    if (siz == (size_t)-1)					\
+      return REG_BADPAT;					\
+    fg->len = siz;						\
+    fg->pattern = xmalloc(siz + 1);				\
+    if (fg->pattern == NULL)					\
+      return REG_ESPACE;					\
+    wcstombs(fg->pattern, fg->wpattern, siz);			\
+    fg->pattern[siz] = '\0';					\
+  } while (0);							\
+
+#define COMPARE								\
+  do {									\
+    switch (type)							\
+      {									\
+	case STR_BYTE:							\
+	case STR_MBS:							\
+	  mismatch = fastcmp(fg->pattern, startptr, fg->len, type);	\
+	  break;							\
+	case STR_WIDE:							\
+	  mismatch = fastcmp(fg->wpattern, startptr, fg->wlen, type);	\
+	default:							\
+	  break;							\
+      }									\
+  } while (0);
+
+#ifdef TRE_WCHAR
+#define IS_OUT_OF_BOUNDS						\
+  ((type == STR_WIDE) ? ((j + fg->wlen) > len) : ((j + fg->len) > len))
+#else
+#define IS_OUT_OF_BOUNDS	((j + fg->len) > len)
+#endif
+
+#define CHECKBOUNDS							\
+  if (IS_OUT_OF_BOUNDS)							\
+    break;								\
+
+#ifdef TRE_WCHAR
+#define SHIFT								\
+      CHECKBOUNDS;							\
+      {									\
+        int k = 0, r = -1;						\
+									\
+        switch (type)							\
+          {								\
+            case STR_BYTE:						\
+	    case STR_MBS:						\
+	      k = fg->qsBc[(unsigned char)((char *)startptr)		\
+		[mismatch + 1]];					\
+              break;							\
+            case STR_WIDE:						\
+              r = hashtable_get(fg->qsBc_table,				\
+		&((wchar_t *)startptr)[mismatch + 1], &k);		\
+	      k = (r == 0) ? k : fg->defBc;				\
+              break;							\
+            default:							\
+              /* XXX */							\
+              break;							\
+          }								\
+        j += k;								\
+      }
+#else
+#define SHIFT								\
+      CHECKBOUNDS;							\
+      j += fg->qsBc[(unsigned char)((char *)startptr)[mismatch + 1]];
+#endif
+
+/*
+ * Normal Quick Search would require a shift based on the position the
+ * next character after the comparison is within the pattern.  With
+ * wildcards, the position of the last dot effects the maximum shift
+ * distance.
+ * The closer to the end the wild card is the slower the search.  A
+ * reverse version of this algorithm would be useful for wildcards near
+ * the end of the string.
+ *
+ * Examples:
+ * Pattern    Max shift
+ * -------    ---------
+ * this               5
+ * .his               4
+ * t.is               3
+ * th.s               2
+ * thi.               1
+ */
+
+#ifdef TRE_WCHAR
+#define FILL_QSBC							\
+  /* Adjust the shift based on location of the last dot ('.'). */	\
+  fg->defBc = fg->wlen - hasDot;					\
+									\
+  /* Preprocess pattern. */						\
+  fg->qsBc_table = hashtable_init(fg->wlen, sizeof(tre_char_t),		\
+    sizeof(int));							\
+  for (unsigned int i = hasDot + 1; i < fg->wlen; i++)			\
+  {									\
+    int k = fg->wlen - i;						\
+    hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k);		\
+  }									\
+									\
+  for (unsigned int i = 0; i <= UCHAR_MAX; i++)				\
+    fg->qsBc[i] = fg->len - hasDot;					\
+  for (int i = hasDot + 1; i < fg->len; i++)				\
+    fg->qsBc[(unsigned)fg->pattern[i]] = fg->len - i;
+#else
+#define FILL_QSBC							\
+  for (unsigned int i = 0; i <= UCHAR_MAX; i++)				\
+    fg->qsBc[i] = fg->wlen - hasDot;					\
+  for (int i = hasDot + 1; i < fg->wlen; i++)				\
+    fg->qsBc[(unsigned)fg->wpattern[i]] = fg->wlen - i;
+#endif
+
+
 /*
  * Returns: REG_OK on success, error code otherwise
  */
 int
-tre_fastcomp_literal(fastmatch_t *fg, const tre_char_t *pat, size_t n)
+tre_fastcomp_literal(fastmatch_t *fg, const tre_char_t *wpat, size_t n)
 {
+  int hasDot = 0;
 
   /* Initialize. */
   memset(fg, 0, sizeof(*fg));
-  fg->len = (n == 0) ? tre_strlen(pat) : n;
-  fg->pattern = xmalloc((fg->len + 1) * sizeof(tre_char_t));
-  if (fg->pattern == NULL)
+  fg->wlen = (n == 0) ? tre_strlen(wpat) : n;
+  fg->wpattern = xmalloc((fg->wlen + 1) * sizeof(tre_char_t));
+  if (fg->wpattern == NULL)
     return REG_ESPACE;
-  memcpy(fg->pattern, pat, fg->len * sizeof(tre_char_t));
-  fg->pattern[fg->len] = TRE_CHAR('\0');
-
-  /* Preprocess pattern. */
+  memcpy(fg->wpattern, wpat, fg->wlen * sizeof(tre_char_t));
+  fg->wpattern[fg->wlen] = TRE_CHAR('\0');
 #ifdef TRE_WCHAR
-  fg->defBc = fg->len;
-  fg->qsBc = hashtable_init(fg->len * 3, sizeof(tre_char_t), sizeof(int));
-  if (fg->qsBc == NULL)
-    return REG_ESPACE;
-  for (unsigned int i = 1; i < fg->len; i++)
-  {
-    int k = fg->len - i;
-    hashtable_put(fg->qsBc, &fg->pattern[i], &k);
-  }
-#else
-  for (i = 0; i <= UCHAR_MAX; i++)
-    fg->qsBc[i] = fg->len;
-  for (i = 1; i < fg->len; i++)
-    fg->qsBc[fg->pattern[i]] = fg->len - i;
+  STORE_MBS_PAT;
 #endif
 
+  FILL_QSBC;
+
   return REG_OK;
 }
 
@@ -116,7 +215,7 @@ tre_fastcomp_literal(fastmatch_t *fg, co
  * Returns: REG_OK on success, error code otherwise
  */
 int
-tre_fastcomp(fastmatch_t *fg, const tre_char_t *pat, size_t n)
+tre_fastcomp(fastmatch_t *fg, const tre_char_t *wpat, size_t n)
 {
   int firstHalfDot = -1;
   int firstLastHalfDot = -1;
@@ -125,55 +224,58 @@ tre_fastcomp(fastmatch_t *fg, const tre_
 
   /* Initialize. */
   memset(fg, 0, sizeof(*fg));
-  fg->len = (n == 0) ? tre_strlen(pat) : n;
+  fg->wlen = (n == 0) ? tre_strlen(wpat) : n;
 
   /* Remove end-of-line character ('$'). */
-  if ((fg->len > 0) && (pat[fg->len - 1] == TRE_CHAR('$')))
+  if ((fg->wlen > 0) && (wpat[fg->wlen - 1] == TRE_CHAR('$')))
   {
     fg->eol = true;
-    fg->len--;
+    fg->wlen--;
   }
 
   /* Remove beginning-of-line character ('^'). */
-  if (pat[0] == TRE_CHAR('^'))
+  if (wpat[0] == TRE_CHAR('^'))
   {
     fg->bol = true;
-    fg->len--;
-    pat++;
+    fg->wlen--;
+    wpat++;
   }
 
-  if ((fg->len >= 14) &&
-      (memcmp(pat, TRE_CHAR("[[:<:]]"), 7 * sizeof(tre_char_t)) == 0) &&
-      (memcmp(pat + fg->len - 7, TRE_CHAR("[[:>:]]"),
+  if ((fg->wlen >= 14) &&
+      (memcmp(wpat, TRE_CHAR("[[:<:]]"), 7 * sizeof(tre_char_t)) == 0) &&
+      (memcmp(wpat + fg->wlen - 7, TRE_CHAR("[[:>:]]"),
 	      7 * sizeof(tre_char_t)) == 0))
   {
-    fg->len -= 14;
-    pat += 7;
+    fg->wlen -= 14;
+    wpat += 7;
     fg->word = true;
   }
 
   /*
-   * pat has been adjusted earlier to not include '^', '$' or
+   * wpat has been adjusted earlier to not include '^', '$' or
    * the word match character classes at the beginning and ending
    * of the string respectively.
    */
-  fg->pattern = xmalloc((fg->len + 1) * sizeof(tre_char_t));
-  if (fg->pattern == NULL)
+  fg->wpattern = xmalloc((fg->wlen + 1) * sizeof(tre_char_t));
+  if (fg->wpattern == NULL)
     return REG_ESPACE;
-  memcpy(fg->pattern, pat, fg->len * sizeof(tre_char_t));
-  fg->pattern[fg->len] = TRE_CHAR('\0');
+  memcpy(fg->wpattern, wpat, fg->wlen * sizeof(tre_char_t));
+  fg->wpattern[fg->wlen] = TRE_CHAR('\0');
+#ifdef TRE_WCHAR
+  STORE_MBS_PAT;
+#endif
 
   /* Look for ways to cheat...er...avoid the full regex engine. */
-  for (unsigned int i = 0; i < fg->len; i++) {
+  for (unsigned int i = 0; i < fg->wlen; i++) {
     /* Can still cheat? */
-    if ((tre_isalnum(fg->pattern[i])) || tre_isspace(fg->pattern[i]) ||
-      (fg->pattern[i] == TRE_CHAR('_')) || (fg->pattern[i] == TRE_CHAR(',')) ||
-      (fg->pattern[i] == TRE_CHAR('=')) || (fg->pattern[i] == TRE_CHAR('-')) ||
-      (fg->pattern[i] == TRE_CHAR(':')) || (fg->pattern[i] == TRE_CHAR('/'))) {
+    if ((tre_isalnum(fg->wpattern[i])) || tre_isspace(fg->wpattern[i]) ||
+      (fg->wpattern[i] == TRE_CHAR('_')) || (fg->wpattern[i] == TRE_CHAR(',')) ||
+      (fg->wpattern[i] == TRE_CHAR('=')) || (fg->wpattern[i] == TRE_CHAR('-')) ||
+      (fg->wpattern[i] == TRE_CHAR(':')) || (fg->wpattern[i] == TRE_CHAR('/'))) {
 	continue;
-    } else if (fg->pattern[i] == TRE_CHAR('.')) {
+    } else if (fg->wpattern[i] == TRE_CHAR('.')) {
       hasDot = i;
-      if (i < fg->len / 2) {
+      if (i < fg->wlen / 2) {
 	if (firstHalfDot < 0)
 	  /* Closest dot to the beginning */
 	  firstHalfDot = i;
@@ -185,8 +287,12 @@ tre_fastcomp(fastmatch_t *fg, const tre_
       }
     } else {
 	/* Free memory and let others know this is empty. */
+#ifdef TRE_WCHAR
 	free(fg->pattern);
 	fg->pattern = NULL;
+#endif
+	free(fg->wpattern);
+	fg->wpattern = NULL;
 	return REG_BADPAT;
     }
   }
@@ -197,58 +303,29 @@ tre_fastcomp(fastmatch_t *fg, const tre_
    */
   if ((!(fg->bol || fg->eol)) &&
      (lastHalfDot && ((firstHalfDot < 0) ||
-     ((fg->len - (lastHalfDot + 1)) < (size_t)firstHalfDot)))) {
+     ((fg->wlen - (lastHalfDot + 1)) < (size_t)firstHalfDot)))) {
     fg->reversed = true;
-    hasDot = fg->len - (firstHalfDot < 0 ?
+    hasDot = fg->wlen - (firstHalfDot < 0 ?
 	     firstLastHalfDot : firstHalfDot) - 1;
-    revstr(fg->pattern, fg->len);
-  }
-
-  /*
-   * Normal Quick Search would require a shift based on the position the
-   * next character after the comparison is within the pattern.  With
-   * wildcards, the position of the last dot effects the maximum shift
-   * distance.
-   * The closer to the end the wild card is the slower the search.  A
-   * reverse version of this algorithm would be useful for wildcards near
-   * the end of the string.
-   *
-   * Examples:
-   * Pattern	Max shift
-   * -------	---------
-   * this		5
-   * .his		4
-   * t.is		3
-   * th.s		2
-   * thi.		1
-   */
-
+    revstr(fg->wpattern, fg->wlen);
 #ifdef TRE_WCHAR
-  /* Adjust the shift based on location of the last dot ('.'). */
-  fg->defBc = fg->len - hasDot;
-
-  /* Preprocess pattern. */
-  fg->qsBc = hashtable_init(fg->len, sizeof(tre_char_t), sizeof(int));
-  for (unsigned int i = hasDot + 1; i < fg->len; i++)
-  {
-    int k = fg->len - i;
-    hashtable_put(fg->qsBc, &fg->pattern[i], &k);
-  }
-#else
-  /* Preprocess pattern. */
-  for (unsigned int i = 0; i <= (signed)UCHAR_MAX; i++)
-    fg->qsBc[i] = fg->len - hasDot;
-  for (unsigned int i = hasDot + 1; i < fg->len; i++) {
-    fg->qsBc[fg->pattern[i]] = fg->len - i;
-  }
+    revs(fg->pattern, fg->len);
 #endif
+  }
+
+  FILL_QSBC;
 
   /*
    * Put pattern back to normal after pre-processing to allow for easy
    * comparisons later.
    */
   if (fg->reversed)
-    revstr(fg->pattern, fg->len);
+    {
+      revstr(fg->wpattern, fg->wlen);
+#ifdef TRE_WCHAR
+      revs(fg->pattern, fg->len);
+#endif
+    }
 
   return REG_OK;
 }
@@ -258,8 +335,8 @@ tre_fastexec(const fastmatch_t *fg, cons
     tre_str_type_t type, int nmatch, regmatch_t pmatch[])
 {
   unsigned int j;
-  size_t siz, skip;
   int ret = REG_NOMATCH;
+  int mismatch;
   const char *str_byte = data;
   const void *startptr = NULL;
 #ifdef TRE_WCHAR
@@ -294,7 +371,8 @@ tre_fastexec(const fastmatch_t *fg, cons
       /* Determine where in data to start search at. */
       j = fg->eol ? len - fg->len : 0;
       SKIP_CHARS(j);
-      if (fastcmp(fg->pattern, startptr, fg->len, type) == REG_OK) {
+      COMPARE;
+      if (mismatch == REG_OK) {
 	pmatch[0].rm_so = j;
 	pmatch[0].rm_eo = j + fg->len;
 	return REG_OK;
@@ -304,10 +382,8 @@ tre_fastexec(const fastmatch_t *fg, cons
     /* Quick Search algorithm. */
     j = len - fg->len;
     do {
-      int mismatch;
-
       SKIP_CHARS(j);
-      mismatch = fastcmp(fg->pattern, startptr, fg->len, type);
+      COMPARE;
       if (mismatch == REG_OK) {
 	pmatch[0].rm_so = j - fg->len;
 	pmatch[0].rm_eo = j;
@@ -315,47 +391,14 @@ tre_fastexec(const fastmatch_t *fg, cons
       } else if (mismatch > 0)
 	return mismatch;
       mismatch = -mismatch - 1;
-
-      /* Shift if within bounds, otherwise, we are done. */
-      if (((long)len - (long)j) > fg->len)
-        break;
-#ifdef TRE_WCHAR
-      {
-	int k, r = -1;
-	wint_t wc;
-
-	switch (type)
-	  {
-	    case STR_BYTE:
-	      wc = btowc(((char *)startptr)[mismatch]);
-	      r = hashtable_get(fg->qsBc, &wc, &k);
-	      break;
-	    case STR_MBS:
-	      tre_mbrtowc(&wc, &((char *)startptr)[mismatch], MB_CUR_MAX, NULL);
-	      r = hashtable_get(fg->qsBc, &wc, &k);
-	      break;
-	    case STR_WIDE:
-	      r = hashtable_get(fg->qsBc, &((char *)startptr)[mismatch], &k);
-	      break;
-	    default:
-	      /* XXX */
-	      break;
-	  }
-	k = (r == 0) ? k : fg->defBc;
-	j -= k;
-      }
-#else
-      j -= fg->qsBc[((char *)startptr)[mismatch]];
-#endif
-    } while (j >= fg->len);
+      SHIFT;
+    } while (!IS_OUT_OF_BOUNDS);
   } else {
     /* Quick Search algorithm. */
     j = 0;
     do {
-      int mismatch;
-
       SKIP_CHARS(j);
-      mismatch = fastcmp(fg->pattern, startptr, fg->len, type);
+      COMPARE;
       if (mismatch == REG_OK) {
 	pmatch[0].rm_so = j;
 	pmatch[0].rm_eo = j + fg->len;
@@ -363,39 +406,8 @@ tre_fastexec(const fastmatch_t *fg, cons
       } else if (mismatch > 0)
         return mismatch;
       mismatch = -mismatch - 1;
-
-      /* Shift if within bounds, otherwise, we are done. */
-      if ((j + fg->len) >= len)
-	break;
-#ifdef TRE_WCHAR
-      {
-	int k, r = -1;
-	wint_t wc;
-
-	switch (type)
-	  {
-	    case STR_BYTE:
-	      wc = btowc(((char *)startptr)[mismatch + 1]);
-	      r = hashtable_get(fg->qsBc, &wc, &k);
-	      break;
-	    case STR_MBS:
-	      tre_mbrtowc(&wc, &((char *)startptr)[mismatch + 1], MB_CUR_MAX, NULL);
-	      r = hashtable_get(fg->qsBc, &wc, &k);
-	      break;
-	    case STR_WIDE:
-	      r = hashtable_get(fg->qsBc, &((char *)startptr)[mismatch + 1], &k);
-	      break;
-	    default:
-	      /* XXX */
-	      break;
-	  }
-	k = (r == 0) ? k : fg->defBc;
-	j += k;
-      }
-#else
-      j += fg->qsBc[((char *)startptr)[mismatch]];
-#endif
-    } while (j <= (len - fg->len));
+      SHIFT;
+    } while (!IS_OUT_OF_BOUNDS);
   }
   return ret;
 }
@@ -405,9 +417,10 @@ tre_fastfree(fastmatch_t *fg)
 {
 
 #ifdef TRE_WCHAR
-  hashtable_free(fg->qsBc);
-#endif
+  hashtable_free(fg->qsBc_table);
   free(fg->pattern);
+#endif
+  free(fg->wpattern);
 }
 
 /*
@@ -416,41 +429,29 @@ tre_fastfree(fastmatch_t *fg)
  *		REG_OK on success
  */
 static inline int
-fastcmp(const tre_char_t *pat, const void *data, size_t len,
+fastcmp(const void *pat, const void *data, size_t len,
 	tre_str_type_t type)
 {
   const char *str_byte = data;
-  wchar_t *mbs_wide;
+  const char *pat_byte = pat;
   int ret = REG_OK;
 #ifdef TRE_WCHAR
   const wchar_t *str_wide = data;
+  const wchar_t *pat_wide = pat;
 #endif
 
-  if (type == STR_MBS)
-    {
-#ifdef HAVE_ALLOCA
-      mbs_wide = alloca((len + 1) * sizeof(wint_t));
-#elif
-      mbs_wide = xmalloc((len + 1) * sizeof(wint_t));
-      /* XXX */
-      if (mbs_wide == NULL)
-	return REG_ESPACE;
-#endif
-      mbstowcs(mbs_wide, str_byte, len);
-      type = STR_WIDE;
-    }
-
   for (int i = len - 1; i >= 0; i--) {
-    if (pat[i] == TRE_CHAR('.'))
+    if (pat_wide[i] == TRE_CHAR('.'))
       continue;
     switch (type)
       {
 	case STR_BYTE:
-	  if (pat[i] == btowc(str_byte[i]))
+	case STR_MBS:
+	  if (pat_byte[i] == str_byte[i])
 	    continue;
 	  break;
 	case STR_WIDE:
-	  if (pat[i] == str_wide[i])
+	  if (pat_wide[i] == str_wide[i])
 	    continue;
 	  break;
 	default:
@@ -460,10 +461,6 @@ fastcmp(const tre_char_t *pat, const voi
     ret = -(i + 1);
     break;
   }
-#ifndef HAVE_ALLOCA
-    if (mbs_wide != NULL)
-      free(mbs_wide);
-#endif
 
   return ret;
 }
@@ -480,3 +477,19 @@ revstr(tre_char_t *str, int len)
     str[len - i - 1] = c;
   }
 }
+
+/*
+ * XXX: eliminate code duplication
+ */
+static inline void
+revs(char *str, int len)
+{
+  char c;
+
+  for (int i = 0; i < len / 2; i++)
+  {
+    c = str[i];
+    str[i] = str[len - i - 1];
+    str[len - i - 1] = c;
+  }
+}

Modified: user/gabor/tre-integration/contrib/tre/lib/fastmatch.h
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/fastmatch.h	Mon Jul 25 21:44:35 2011	(r224406)
+++ user/gabor/tre-integration/contrib/tre/lib/fastmatch.h	Mon Jul 25 21:47:56 2011	(r224407)
@@ -28,6 +28,7 @@
 #ifndef FASTMATCH_H
 #define FASTMATCH_H 1
 
+#include <limits.h>
 #include <stdbool.h>
 
 #include "hashtable.h"
@@ -35,14 +36,15 @@
 #include "tre-internal.h"
 
 typedef struct {
+  size_t wlen;
   size_t len;
-  tre_char_t *pattern;
+  tre_char_t *wpattern;
+  char *pattern;
 #ifdef TRE_WCHAR
   int defBc;
-  hashtable *qsBc;
-#else
-  int qsBc[UCHAR_MAX + 1];
+  hashtable *qsBc_table;
 #endif
+  int qsBc[UCHAR_MAX + 1];
   /* flags */
   bool bol;
   bool eol;



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201107252147.p6PLluZq078479>