Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 22 Oct 2011 10:29:06 +0000 (UTC)
From:      Gabor Kovesdan <gabor@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-user@freebsd.org
Subject:   svn commit: r226629 - user/gabor/tre-integration/contrib/tre/lib
Message-ID:  <201110221029.p9MAT6XQ073073@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: gabor
Date: Sat Oct 22 10:29:06 2011
New Revision: 226629
URL: http://svn.freebsd.org/changeset/base/226629

Log:
  - Remove handling of dot because in some cases it was less efficient than
    using a heuristic

Modified:
  user/gabor/tre-integration/contrib/tre/lib/tre-fastmatch.c

Modified: user/gabor/tre-integration/contrib/tre/lib/tre-fastmatch.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/tre-fastmatch.c	Sat Oct 22 09:43:35 2011	(r226628)
+++ user/gabor/tre-integration/contrib/tre/lib/tre-fastmatch.c	Sat Oct 22 10:29:06 2011	(r226629)
@@ -100,10 +100,8 @@ static int	fastcmp(const fastmatch_t *fg
   }									\
 
 #define IS_OUT_OF_BOUNDS						\
-  ((!fg->reversed							\
-    ? ((type == STR_WIDE) ? ((j + fg->wlen) > len)			\
-			  : ((j + fg->len) > len))			\
-    : (j < 0)))
+  ((type == STR_WIDE) ? ((j + fg->wlen) > len)				\
+		      : ((j + fg->len) > len))
 
 /*
  * Checks whether the new position after shifting in the input string
@@ -127,16 +125,12 @@ static int	fastcmp(const fastmatch_t *fg
     switch (type)							\
       {									\
 	case STR_WIDE:							\
-	  if (!fg->hasdot)						\
-	    {								\
 	      if (u != 0 && (unsigned)mismatch == fg->wlen - 1 - shift)	\
 		mismatch -= u;						\
 	      v = fg->wlen - 1 - mismatch;				\
 	      r = hashtable_get(fg->qsBc_table,				\
-		&str_wide[!fg->reversed ? (size_t)j + fg->wlen		\
-			  : (size_t)j - 1], &bc);			\
+		&str_wide[(size_t)j + fg->wlen], &bc);			\
 	      gs = fg->bmGs[mismatch];					\
-	    }								\
 	    bc = (r == HASH_OK) ? bc : fg->defBc;			\
 	    DPRINT(("tre_fast_match: mismatch on character" CHF ", "	\
 		    "BC %d, GS %d\n",					\
@@ -144,24 +138,18 @@ static int	fastcmp(const fastmatch_t *fg
 		    bc, gs));						\
             break;							\
 	default:							\
-	  if (!fg->hasdot)						\
-	    {								\
 	      if (u != 0 && (unsigned)mismatch == fg->len - 1 - shift)	\
 		mismatch -= u;						\
 	      v = fg->len - 1 - mismatch;				\
 	      gs = fg->sbmGs[mismatch];					\
-	    }								\
 	  bc = fg->qsBc[((const unsigned char *)str_byte)		\
-			[!fg->reversed ? (size_t)j + fg->len		\
-			 : (size_t)j - 1]];				\
+			[(size_t)j + fg->len]];				\
 	  DPRINT(("tre_fast_match: mismatch on character %c, "		\
 		 "BC %d, GS %d\n",					\
 		 ((const unsigned char *)startptr)[mismatch + 1],	\
 		 bc, gs));						\
       }									\
-    if (fg->hasdot)							\
-      shift = bc;							\
-    else								\
+									\
       {									\
 	ts = (u >= v) ? (u - v) : 0;					\
 	shift = MAX(ts, bc);						\
@@ -176,43 +164,13 @@ static int	fastcmp(const fastmatch_t *fg
 	  }								\
       }									\
       DPRINT(("tre_fast_match: shifting %u characters\n", shift));	\
-      j = !fg->reversed ? j + shift : j - shift;			\
+      j = j + shift;							\
   }
 
-/*
- * Normal Quick Search would require a shift based on the position the
- * next character after the comparison is within the pattern.  With
- * wildcards, the position of the last dot effects the maximum shift
- * distance.
- * The closer to the end the wild card is the slower the search.
- *
- * Examples:
- * Pattern    Max shift
- * -------    ---------
- * this               5
- * .his               4
- * t.is               3
- * th.s               2
- * thi.               1
- */
-
-/*
- * Fills in the bad character shift array for SB/MB strings.
- */
 #define FILL_QSBC							\
-  if (fg->reversed)							\
-    {									\
-      _FILL_QSBC_REVERSED						\
-    }									\
-  else									\
-    {									\
-      _FILL_QSBC							\
-    }
-
-#define _FILL_QSBC							\
   for (unsigned int i = 0; i <= UCHAR_MAX; i++)				\
-    fg->qsBc[i] = fg->len - hasdot;					\
-  for (unsigned int i = hasdot + 1; i < fg->len; i++)			\
+    fg->qsBc[i] = fg->len;						\
+  for (unsigned int i = 1; i < fg->len; i++)				\
     {									\
       fg->qsBc[(unsigned char)fg->pattern[i]] = fg->len - i;		\
       DPRINT(("BC shift for char %c is %zu\n", fg->pattern[i],		\
@@ -227,24 +185,6 @@ static int	fastcmp(const fastmatch_t *fg
 	}								\
     }
 
-#define _FILL_QSBC_REVERSED						\
-  for (unsigned int i = 0; i <= UCHAR_MAX; i++)				\
-    fg->qsBc[i] = firstdot + 1;						\
-  for (int i = firstdot - 1; i >= 0; i--)				\
-    {									\
-      fg->qsBc[(unsigned char)fg->pattern[i]] = i + 1;			\
-      DPRINT(("Reverse BC shift for char %c is %d\n", fg->pattern[i],	\
-	     i + 1));							\
-      if (fg->icase)							\
-        {								\
-          char c = islower((unsigned char)fg->pattern[i]) ?		\
-		   toupper((unsigned char)fg->pattern[i]) :		\
-		   tolower((unsigned char)fg->pattern[i]);		\
-          fg->qsBc[(unsigned char)c] = i + 1;				\
-	  DPRINT(("Reverse BC shift for char %c is %d\n", c, i + 1));	\
-        }								\
-    }
-
 /*
  * Fills in the bad character shifts into a hastable for wide strings.
  * With wide characters it is not possible any more to use a normal
@@ -254,26 +194,16 @@ static int	fastcmp(const fastmatch_t *fg
  * in the pattern, so we can store them in a hashtable and store a
  * default shift value for the rest.
  */
-#define FILL_QSBC_WIDE							\
-  if (fg->reversed)							\
-    {									\
-      _FILL_QSBC_WIDE_REVERSED						\
-    }									\
-  else									\
-    {									\
-      _FILL_QSBC_WIDE							\
-    }
 
-#define _FILL_QSBC_WIDE							\
-  /* Adjust the shift based on location of the last dot ('.'). */	\
-  fg->defBc = fg->wlen - whasdot;					\
+#define FILL_QSBC_WIDE							\
+  fg->defBc = fg->wlen;							\
 									\
   /* Preprocess pattern. */						\
   fg->qsBc_table = hashtable_init(fg->wlen * (fg->icase ? 8 : 4),	\
 				  sizeof(tre_char_t), sizeof(int));	\
   if (!fg->qsBc_table)							\
     FAIL_COMP(REG_ESPACE);						\
-  for (unsigned int i = whasdot + 1; i < fg->wlen; i++)			\
+  for (unsigned int i = 1; i < fg->wlen; i++)				\
     {									\
       int k = fg->wlen - i;						\
       int r;								\
@@ -294,37 +224,6 @@ static int	fastcmp(const fastmatch_t *fg
 	}								\
     }
 
-#define _FILL_QSBC_WIDE_REVERSED					\
-  /* Adjust the shift based on location of the last dot ('.'). */	\
-  fg->defBc = (size_t)wfirstdot;					\
-									\
-  /* Preprocess pattern. */						\
-  fg->qsBc_table = hashtable_init(fg->wlen * (fg->icase ? 8 : 4),	\
-				  sizeof(tre_char_t), sizeof(int));	\
-  if (!fg->qsBc_table)							\
-    FAIL_COMP(REG_ESPACE);						\
-  for (int i = wfirstdot - 1; i >= 0; i--)				\
-    {									\
-      int k = i + 1;							\
-      int r;								\
-									\
-      r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k);		\
-      if ((r == HASH_FAIL) || (r == HASH_FULL))				\
-	FAIL_COMP(REG_ESPACE);						\
-      DPRINT(("Reverse BC shift for wide char " CHF " is %d\n",		\
-	     fg->wpattern[i], k));					\
-      if (fg->icase)							\
-	{								\
-	  tre_char_t wc = iswlower(fg->wpattern[i]) ?			\
-	    towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]);	\
-	  r = hashtable_put(fg->qsBc_table, &wc, &k);			\
-	  if ((r == HASH_FAIL) || (r == HASH_FULL))			\
-	    FAIL_COMP(REG_ESPACE);					\
-	  DPRINT(("Reverse BC shift for wide char " CHF " is %d\n", wc,	\
-		 k));							\
-	}								\
-    }
-
 #ifdef _GREP_DEBUG
 #define DPRINT_BMGS(len, fmt_str, sh)					\
   for (unsigned int i = 0; i < len; i++)				\
@@ -338,7 +237,6 @@ static int	fastcmp(const fastmatch_t *fg
  * Fills in the good suffix table for SB/MB strings.
  */
 #define FILL_BMGS							\
-  if (!fg->hasdot)							\
     {									\
       fg->sbmGs = xmalloc(fg->len * sizeof(int));			\
       if (!fg->sbmGs)							\
@@ -354,7 +252,6 @@ static int	fastcmp(const fastmatch_t *fg
  * Fills in the good suffix table for wide strings.
  */
 #define FILL_BMGS_WIDE							\
-  if (!fg->hasdot)							\
     {									\
       fg->bmGs = xmalloc(fg->wlen * sizeof(int));			\
       if (!fg->bmGs)							\
@@ -508,8 +405,6 @@ int
 tre_compile_literal(fastmatch_t *fg, const tre_char_t *pat, size_t n,
 		    int cflags)
 {
-  size_t hasdot = 0, whasdot = 0;
-  ssize_t firstdot = -1, wfirstdot = -1;
 
   INIT_COMP;
 
@@ -548,10 +443,8 @@ tre_compile_fast(fastmatch_t *fg, const 
 		 int cflags)
 {
   tre_char_t *tmp;
-  size_t pos = 0, hasdot = 0, whasdot = 0;;
-  ssize_t firstdot = -1, wfirstdot = -1;
+  size_t pos = 0;
   bool escaped = false;
-  bool *_escmap = NULL;
 
   INIT_COMP;
 
@@ -627,24 +520,9 @@ tre_compile_fast(fastmatch_t *fg, const 
 	    continue;
 	  case TRE_CHAR('.'):
 	    if (escaped)
-	      {
-		if (!_escmap)
-		  _escmap = xmalloc(n * sizeof(bool));
-		if (!_escmap)
-		  {
-		    xfree(tmp);
-		    return REG_ESPACE;
-		  }
-		_escmap[i] = true;
 		STORE_CHAR;
-	      }
 	    else
-	      {
-		whasdot = i;
-		if (wfirstdot == -1)
-			wfirstdot = i;
-		STORE_CHAR;
-	      }
+	      goto badpat;
 	    continue;
 	  case TRE_CHAR('^'):
 	    STORE_CHAR;
@@ -692,8 +570,6 @@ badpat:
       return REG_BADPAT;
     }
 
-  fg->hasdot = whasdot;
-
   /*
    * The pattern has been processed and copied to tmp as a literal string
    * with escapes, anchors (^$) and the word boundary match character
@@ -701,47 +577,9 @@ badpat:
    */
 #ifdef TRE_WCHAR
   SAVE_PATTERN(tmp, pos, fg->wpattern, fg->wlen);
-  fg->wescmap = _escmap;
   STORE_MBS_PAT;
-
-  /*
-   * The position of dots and escaped dots is different in the MB string
-   * than in to the wide string so traverse the converted string, as well,
-   * to store these positions.
-   */
-  if (fg->hasdot || (fg->wescmap != NULL))
-    {
-      if (fg->wescmap != NULL)
-	{
-	  fg->escmap = xmalloc(fg->len * sizeof(bool));
-	  if (!fg->escmap)
-	    {
-	      tre_free_fast(fg);
-	      return REG_ESPACE;
-	    }
-	}
-
-      escaped = false;
-      for (unsigned int i = 0; i < fg->len; i++)
-	if (fg->pattern[i] == '\\')
-	  escaped = !escaped;
-	else if (fg->pattern[i] == '.' && escaped)
-	  {
-	    fg->escmap[i] = true;
-	    escaped = false;
-	  }
-	else if (fg->pattern[i] == '.' && !escaped)
-	  {
-	    hasdot = i;
-	    if (firstdot == -1)
-	      firstdot = i;
-	  }
-	else
-	  escaped = false;
-    }
 #else
   SAVE_PATTERN(tmp, pos, fg->pattern, fg->len);
-  fg->escmap = _escmap;
 #endif
 
   xfree(tmp);
@@ -752,14 +590,6 @@ badpat:
 	 fg->icase ? 'y' : 'n', fg->word ? 'y' : 'n',
 	 fg->newline ? 'y' : 'n'));
 
-  /* Check whether reverse QS algorithm is more efficient */
-  if ((wfirstdot > -1) && (fg->wlen - whasdot + 1 < (size_t)wfirstdot) &&
-      fg->nosub)
-    {
-      fg->reversed = true;
-      DPRINT(("tre_compile_fast: using reverse QS algorithm\n"));
-    }
-
   FILL_QSBC;
   FILL_BMGS;
 #ifdef TRE_WCHAR
@@ -773,7 +603,7 @@ badpat:
 #define _SHIFT_ONE							\
   {									\
     shift = 1;								\
-    j = !fg->reversed ? j + shift : j - shift;				\
+    j = j + shift;							\
     continue;								\
   }
 
@@ -902,10 +732,6 @@ tre_match_fast(const fastmatch_t *fg, co
   if (fg->eol && (eflags & REG_NOTEOL))
     len--;
 
-  if (fg->reversed)
-    j = len - (type == STR_WIDE ? fg->wlen : fg->len);
-
-
   /* Only try once at the beginning or ending of the line. */
   if ((fg->bol || fg->eol) && !fg->newline && !(eflags & REG_NOTBOL) &&
       !(eflags & REG_NOTEOL))
@@ -974,16 +800,10 @@ tre_free_fast(fastmatch_t *fg)
 
 #ifdef TRE_WCHAR
   hashtable_free(fg->qsBc_table);
-  if (!fg->hasdot)
-    xfree(fg->bmGs);
-  if (fg->wescmap)
-    xfree(fg->wescmap);
+  xfree(fg->bmGs);
   xfree(fg->wpattern);
 #endif
-  if (!fg->hasdot)
-    xfree(fg->sbmGs);
-  if (fg->escmap)
-    xfree(fg->escmap);
+  xfree(fg->sbmGs);
   xfree(fg->pattern);
 }
 
@@ -999,7 +819,6 @@ fastcmp(const fastmatch_t *fg, const voi
   const char *pat_byte = fg->pattern;
   const tre_char_t *str_wide = data;
   const tre_char_t *pat_wide = fg->wpattern;
-  const bool *escmap = (type == STR_WIDE) ? fg->wescmap : fg->escmap;
   size_t len = (type == STR_WIDE) ? fg->wlen : fg->len;
   int ret = REG_OK;
 
@@ -1008,25 +827,12 @@ fastcmp(const fastmatch_t *fg, const voi
     switch (type)
       {
 	case STR_WIDE:
-
-	  /* Check dot */
-	  if (fg->hasdot && pat_wide[i] == TRE_CHAR('.') &&
-	      (!escmap || !escmap[i]) &&
-	      (!fg->newline || (str_wide[i] != TRE_CHAR('\n'))))
-	    continue;
-
 	  /* Compare */
 	  if (fg->icase ? (towlower(pat_wide[i]) == towlower(str_wide[i]))
 		    : (pat_wide[i] == str_wide[i]))
 	    continue;
 	  break;
 	default:
-	  /* Check dot */
-	  if (fg->hasdot && pat_byte[i] == '.' &&
-	      (!escmap || !escmap[i]) &&
-	      (!fg->newline || (str_byte[i] != '\n')))
-	    continue;
-
 	  /* Compare */
 	  if (fg->icase ? (tolower(pat_byte[i]) == tolower(str_byte[i]))
 		    : (pat_byte[i] == str_byte[i]))



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201110221029.p9MAT6XQ073073>