Skip site navigation (1)Skip section navigation (2)
Date:      Fri, 15 Oct 1999 00:45:46 -0400 (EDT)
From:      howardjp@wam.umd.edu
To:        FreeBSD-gnats-submit@freebsd.org
Subject:   bin/14342: [PATCH] Speed ups for regex!
Message-ID:  <199910150445.AAA81151@byzantine.student.umd.edu>

next in thread | raw e-mail | index | archive | help

>Number:         14342
>Category:       bin
>Synopsis:       [PATCH] Speed ups for regex!
>Confidential:   no
>Severity:       non-critical
>Priority:       low
>Responsible:    freebsd-bugs
>State:          open
>Quarter:        
>Keywords:       
>Date-Required:
>Class:          change-request
>Submitter-Id:   current-users
>Arrival-Date:   Thu Oct 14 21:50:00 PDT 1999
>Closed-Date:
>Last-Modified:
>Originator:     James Howard
>Release:        FreeBSD 4.0-CURRENT i386
>Organization:
University of Maryland
>Environment:

FreeBSD byzantine.student.umd.edu 4.0-CURRENT FreeBSD 4.0-CURRENT #40: Wed Oct  6 18:17:44 EDT 1999     howardjp@byzantine.student.umd.edu:/usr/src/sys/compile/BYZANTINE  i386

>Description:

Regex is horribly, horribly slow.  

There is code in lib/libc/regex/engine.c to handle searching for
a known constant string which has to be present in any matches. 
This is meant to cut down the number of strings regexec() actually
has to search.  However, it uses a simple search to find it and 
it takes forever.

I replaced it with a Boyer-Moore search.  I stole the code from
4.4BSD's egrep.  I have trimed it down as much as I can and it 
roughly halves the speed of calls to regexec on mis-matches.

Apply the patch below to lib/libc/regex to get add the new code.

>How-To-Repeat:

N/A

>Fix:

diff -ur regex.orig/engine.c regex/engine.c
--- regex.orig/engine.c	Mon Sep 13 09:53:57 1999
+++ regex/engine.c	Fri Oct 15 00:30:46 1999
@@ -144,14 +144,14 @@
 int eflags;
 {
 	register char *endp;
-	register int i;
+	register int i, j;
+	int patlen = g->mlen;
 	struct match mv;
 	register struct match *m = &mv;
-	register char *dp;
+	register char *dp, *k, *s;
 	register const sopno gf = g->firststate+1;	/* +1 for OEND */
 	register const sopno gl = g->laststate;
-	char *start;
-	char *stop;
+	char *start, *stop;
 
 	/* simplify the situation where possible */
 	if (g->cflags&REG_NOSUB)
@@ -167,14 +167,37 @@
 		return(REG_INVARG);
 
 	/* prescreening; this does wonders for this rather slow code */
-	if (g->must != NULL) {
-		for (dp = start; dp < stop; dp++)
-			if (*dp == g->must[0] && stop - dp >= g->mlen &&
-				memcmp(dp, g->must, (size_t)g->mlen) == 0)
-				break;
-		if (dp == stop)		/* we didn't find g->must */
-			return(REG_NOMATCH);
+ 	if (g->must != NULL) {
+		for (k = string + patlen - 1; k < stop;) {
+			/*
+			 * for a large class of patterns, upwards of 80% of
+			 * match time is spent on the next line.  we beat
+			 * existing microcode (vax 'matchc') this way. 
+			 */
+			while ((k += g->delta0[*(unsigned char *) k]) < stop);
+			if (k < string + LARGE)
+				 break;
+			k -= LARGE;
+			
+ 			j = patlen - 1;
+	 		s = k - 1;
+		 	while (g->cmap[*((unsigned char *)s--)] == 
+			       g->must[--j]);
+		
+			/*
+			 * delta-less shortcut for literati. short shrift for
+			 * genetic engineers? 
+			 */
+
+			if (j < 0)
+				goto bmpassed;
+			
+			k++;    /* no match; restart next char */
+		}
+		return REG_NOMATCH;
 	}
+
+bmpassed:
 
 	/* match struct setup */
 	m->g = g;
diff -ur regex.orig/regcomp.c regex/regcomp.c
--- regex.orig/regcomp.c	Mon Sep 13 09:53:57 1999
+++ regex/regcomp.c	Fri Oct 15 00:32:47 1999
@@ -245,6 +245,7 @@
 	g->categories = &g->catspace[-(CHAR_MIN)];
 	(void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
 	g->backrefs = 0;
+	(void) memset((void *)g->delta0, 0, 256 * sizeof(int));
 
 	/* do it */
 	EMIT(OEND, 0);
@@ -1738,6 +1739,7 @@
 	}
 	assert(cp == g->must + g->mlen);
 	*cp++ = '\0';		/* just on general principles */
+	gosper(g->must, g->delta0, g->cmap);
 }
 
 /*
@@ -1774,4 +1776,46 @@
 	if (plusnest != 0)
 		g->iflags |= BAD;
 	return(maxnest);
+}
+
+
+#define NALT	7		/* tied to scanf() size in alternate() */
+
+/*
+ * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c] 
+ */
+void
+gosper(pattern, delta0, cmap)
+	char *pattern;		/* ... HAKMEM lives ... */
+	int  *delta0;           /* Who/what is HAKMEM? */
+	char *cmap;
+{
+	char *altpat[NALT];
+	int altlen[NALT];
+	register int j, altmin;
+	unsigned char c;
+
+	/* Make one-string case look like simple alternatives case */
+	altmin = altlen[0] = strlen(pattern);
+	altpat[0] = pattern;
+	
+	/* For chars that aren't in any string, skip by string length. */
+ 	for (j = 0; j < 256; j++) { 
+ 		delta0[j] = altmin; 
+		cmap[j] = j;	/* Sneak in initialization of cmap */
+	}
+
+	/* For chars in a string, skip distance from char to end of string. */
+	/* (If char appears more than once, skip minimum distance.) */
+
+	delta0[0] = 0;
+	for (j = 0; j < altlen[0] - 1; j++) {
+		c = altpat[0][j];
+		delta0[c] = MIN(delta0[c], altlen[0] - j - 1);
+	}
+
+	/* For last char of each string, fall out of search loop. */
+	c = altpat[0][altlen[0] - 1];
+	delta0[c] = LARGE;	 
+
 }
diff -ur regex.orig/regex2.h regex/regex2.h
--- regex.orig/regex2.h	Mon Sep 13 09:53:57 1999
+++ regex/regex2.h	Fri Oct 15 00:36:14 1999
@@ -56,6 +56,8 @@
  */
 #define	MAGIC1	((('r'^0200)<<8) | 'e')
 
+#define META	"\n^$.[]()?+*|\\"	/* egrep meta-characters */
+
 /*
  * The internal representation is a *strip*, a sequence of
  * operators ending with an endmarker.  (Some terminology etc. is a
@@ -164,6 +166,8 @@
 	size_t nsub;		/* copy of re_nsub */
 	int backrefs;		/* does it use back references? */
 	sopno nplus;		/* how deep does it nest +s? */
+	int delta0[256];        /* Boyer-Moore algorithm core */
+	char cmap[256];
 	/* catspace must be last */
 	cat_t catspace[1];	/* actually [NC] */
 };
@@ -171,3 +175,14 @@
 /* misc utilities */
 #define	OUT	(CHAR_MAX+1)	/* a non-character value */
 #define ISWORD(c)       (isalnum((uch)(c)) || (c) == '_')
+
+/* stuff used by Boyer-Moore */
+
+#define	MIN(A, B)	((A) > (B) ? (B) : (A))
+
+#define BUFSIZE	65536		/* make higher for cray */
+#define PATSIZE 6000
+#define LARGE 	BUFSIZE + PATSIZE
+
+void             gosper(char *, int *, char *);
+int              chimaera(char *, char *, struct re_guts *);



>Release-Note:
>Audit-Trail:
>Unformatted:


To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-bugs" in the body of the message




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?199910150445.AAA81151>