Skip site navigation (1) Skip section navigation (2)
Date:      Thu, 25 Aug 2011 18:03:29 +0000 (UTC)
From:      Gabor Kovesdan <gabor@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-user@freebsd.org
Subject:   svn commit: r225183 - user/gabor/tre-integration/contrib/tre/lib
Message-ID:  <201108251803.p7PI3TO3013837@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: gabor
Date: Thu Aug 25 18:03:29 2011
New Revision: 225183
URL: http://svn.freebsd.org/changeset/base/225183

Log:
  - Add support for REG_EXTENDED
  - Some accuracy fixes for REG_BASIC

Modified:
  user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c

Modified: user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c	Thu Aug 25 17:43:06 2011	(r225182)
+++ user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c	Thu Aug 25 18:03:29 2011	(r225183)
@@ -125,7 +125,15 @@
       st = i + 1;							\
       escaped = false;							\
       goto end_segment;							\
-    } while (0);
+    } while (0)
+
+#define STORE_CHAR							\
+  do									\
+    {									\
+      escaped = false;							\
+      heur[pos++] = regex[i];						\
+    } while (0)
+
 
 /*
  * Parses a regular expression and constructs a heuristic in heur_t and
@@ -140,10 +148,6 @@ tre_compile_heur(heur_t *h, const tre_ch
   bool escaped = false;
   int errcode, ret;
 
-  /* XXX: only basic regexes are supported. */
-  if (cflags & REG_EXTENDED)
-    return REG_BADPAT;
-
   /* Temporary space, len will be enough. */
   heur = xmalloc(len);
   if (!heur)
@@ -168,10 +172,17 @@ tre_compile_heur(heur_t *h, const tre_ch
 	  switch (regex[i])
 	    {
 
-	      /* Bracketed expression is substituted with a dot. */
+	      /*
+	       * Bracketed expression is substituted with a dot or the
+	       * brackets are treated as normal if at least the opening
+	       * bracket is escaped.
+	       */
 	      case TRE_CHAR('['):
 		PARSE_BRACKETS;
-		heur[pos++] = TRE_CHAR('.');
+		if (escaped)
+		  STORE_CHAR;
+		else
+		  heur[pos++] = TRE_CHAR('.');
 		continue;
 
 	      /*
@@ -180,14 +191,22 @@ tre_compile_heur(heur_t *h, const tre_ch
 	       * character.
 	       */
 	      case TRE_CHAR('{'):
+		if (escaped && (i == 1))
+		  STORE_CHAR;
+		else if ((i == 0) && !(cflags & REG_EXTENDED))
+		  STORE_CHAR;
+		else if ((i == 0) && (cflags & REG_EXTENDED))
+		  continue;
+
 		PARSE_UNIT('{', '}');
-		if (escaped)
+		if (escaped ^ (cflags & REG_EXTENDED))
 		  {
 		    pos--;
 		    END_SEGMENT;
 		  }
-		heur[pos++] = regex[i];
-		break;
+		else
+		  STORE_CHAR;
+		continue;
 
 	      /*
 	       * Terminates the current segment when escaped,
@@ -195,10 +214,11 @@ tre_compile_heur(heur_t *h, const tre_ch
 	       */
 	      case TRE_CHAR('('):
 		PARSE_UNIT('(', ')');
-		if (escaped)
+		if (escaped ^ (cflags & REG_EXTENDED))
 		  END_SEGMENT;
-		heur[pos++] = regex[i];
-		break;
+		else
+		  STORE_CHAR;
+		continue;
 
 	      /*
 	       * Sets escaped flag.
@@ -207,24 +227,88 @@ tre_compile_heur(heur_t *h, const tre_ch
 	       */
 	      case TRE_CHAR('\\'):
 		if (escaped)
-		  heur[pos++] = regex[i];
-		escaped = !escaped;
+		  STORE_CHAR;
+		else
+		  escaped = !escaped;
 		continue;
 
 	      /*
-	       * If not the first character and not escaped, erases the
+	       * BRE: If not the first character and not escaped, erases the
 	       * last character and terminates the segment.
 	       * Otherwise treated as a normal character.
+	       * ERE: Skipped if first character (GNU), rest is like in BRE.
 	       */
 	      case TRE_CHAR('*'):
-		if ((i != 0) && !escaped)
+		if (escaped || (!(cflags & REG_EXTENDED) && (i == 0)))
+		  STORE_CHAR;
+		else if ((i != 0))
+		  {
+		    pos--;
+		    END_SEGMENT;
+		  }
+		continue;
+
+	      /*
+	       * In BRE, it is a normal character, behavior is undefined
+	       * when escaped.
+	       * In ERE, it is special unless escaped. Terminate segment
+	       * when not escaped. Last character is not removed because it
+	       * must occur at least once. It is skipped when first
+	       * character (GNU).
+	       */
+	      case TRE_CHAR('+'):
+		if ((cflags & REG_EXTENDED) && (i == 0))
+		  continue;
+		else if ((cflags & REG_EXTENDED) ^ escaped)
+		  END_SEGMENT;
+		else 
+		  STORE_CHAR;
+		continue;
+
+	      /*
+	       * In BRE, it is a normal character, behavior is undefined
+	       * when escaped.
+	       * In ERE, it is special unless escaped. Terminate segment
+	       * when not escaped. Last character is removed. Skipped when
+	       * first character (GNU).
+	       */
+	      case TRE_CHAR('?'):
+		if ((cflags & REG_EXTENDED) && (i == 0))
+		  continue;
+		if ((cflags & REG_EXTENDED) ^ escaped)
 		  {
 		    pos--;
 		    END_SEGMENT;
 		  }
 		else
-		  heur[pos++] = regex[i];
-		break;
+		  STORE_CHAR;
+		continue;
+
+	      /*
+	       * Fail if it is an ERE alternation marker.
+	       */
+	      case TRE_CHAR('|'):
+		if ((cflags & REG_EXTENDED) && !escaped)
+		  {
+		    errcode = REG_BADPAT;
+		    goto badpat2;
+		  }
+		else if (!(cflags & REG_EXTENDED) && escaped)
+		  END_SEGMENT;
+		else
+		  STORE_CHAR;
+		continue;
+
+	      /*
+	       * Cut the segment at an escaped dot because the fast matcher
+	       * cannot handle it.
+	       */
+	      case TRE_CHAR('.'):
+		if (escaped)
+		  END_SEGMENT;
+		else
+		  STORE_CHAR;
+		continue;
 
 	      /*
 	       * If escaped, terminates segment.
@@ -234,7 +318,8 @@ tre_compile_heur(heur_t *h, const tre_ch
 	      default:
 		if (escaped)
 		  END_SEGMENT;
-		heur[pos++] = regex[i];
+		else
+		  STORE_CHAR;
 		continue;
 	    }
 	}



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201108251803.p7PI3TO3013837>