Date: Fri, 15 Oct 1999 00:45:46 -0400 (EDT) From: howardjp@wam.umd.edu To: FreeBSD-gnats-submit@freebsd.org Subject: bin/14342: [PATCH] Speed ups for regex! Message-ID: <199910150445.AAA81151@byzantine.student.umd.edu>
index | next in thread | raw e-mail
>Number: 14342
>Category: bin
>Synopsis: [PATCH] Speed ups for regex!
>Confidential: no
>Severity: non-critical
>Priority: low
>Responsible: freebsd-bugs
>State: open
>Quarter:
>Keywords:
>Date-Required:
>Class: change-request
>Submitter-Id: current-users
>Arrival-Date: Thu Oct 14 21:50:00 PDT 1999
>Closed-Date:
>Last-Modified:
>Originator: James Howard
>Release: FreeBSD 4.0-CURRENT i386
>Organization:
University of Maryland
>Environment:
FreeBSD byzantine.student.umd.edu 4.0-CURRENT FreeBSD 4.0-CURRENT #40: Wed Oct 6 18:17:44 EDT 1999 howardjp@byzantine.student.umd.edu:/usr/src/sys/compile/BYZANTINE i386
>Description:
Regex is horribly, horribly slow.
There is code in lib/libc/regex/engine.c to handle searching for
a known constant string which has to be present in any matches.
This is meant to cut down the number of strings regexec() actually
has to search. However, it uses a simple search to find it and
it takes forever.
I replaced it with a Boyer-Moore search. I stole the code from
4.4BSD's egrep. I have trimed it down as much as I can and it
roughly halves the speed of calls to regexec on mis-matches.
Apply the patch below to lib/libc/regex to get add the new code.
>How-To-Repeat:
N/A
>Fix:
diff -ur regex.orig/engine.c regex/engine.c
--- regex.orig/engine.c Mon Sep 13 09:53:57 1999
+++ regex/engine.c Fri Oct 15 00:30:46 1999
@@ -144,14 +144,14 @@
int eflags;
{
register char *endp;
- register int i;
+ register int i, j;
+ int patlen = g->mlen;
struct match mv;
register struct match *m = &mv;
- register char *dp;
+ register char *dp, *k, *s;
register const sopno gf = g->firststate+1; /* +1 for OEND */
register const sopno gl = g->laststate;
- char *start;
- char *stop;
+ char *start, *stop;
/* simplify the situation where possible */
if (g->cflags®_NOSUB)
@@ -167,14 +167,37 @@
return(REG_INVARG);
/* prescreening; this does wonders for this rather slow code */
- if (g->must != NULL) {
- for (dp = start; dp < stop; dp++)
- if (*dp == g->must[0] && stop - dp >= g->mlen &&
- memcmp(dp, g->must, (size_t)g->mlen) == 0)
- break;
- if (dp == stop) /* we didn't find g->must */
- return(REG_NOMATCH);
+ if (g->must != NULL) {
+ for (k = string + patlen - 1; k < stop;) {
+ /*
+ * for a large class of patterns, upwards of 80% of
+ * match time is spent on the next line. we beat
+ * existing microcode (vax 'matchc') this way.
+ */
+ while ((k += g->delta0[*(unsigned char *) k]) < stop);
+ if (k < string + LARGE)
+ break;
+ k -= LARGE;
+
+ j = patlen - 1;
+ s = k - 1;
+ while (g->cmap[*((unsigned char *)s--)] ==
+ g->must[--j]);
+
+ /*
+ * delta-less shortcut for literati. short shrift for
+ * genetic engineers?
+ */
+
+ if (j < 0)
+ goto bmpassed;
+
+ k++; /* no match; restart next char */
+ }
+ return REG_NOMATCH;
}
+
+bmpassed:
/* match struct setup */
m->g = g;
diff -ur regex.orig/regcomp.c regex/regcomp.c
--- regex.orig/regcomp.c Mon Sep 13 09:53:57 1999
+++ regex/regcomp.c Fri Oct 15 00:32:47 1999
@@ -245,6 +245,7 @@
g->categories = &g->catspace[-(CHAR_MIN)];
(void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
g->backrefs = 0;
+ (void) memset((void *)g->delta0, 0, 256 * sizeof(int));
/* do it */
EMIT(OEND, 0);
@@ -1738,6 +1739,7 @@
}
assert(cp == g->must + g->mlen);
*cp++ = '\0'; /* just on general principles */
+ gosper(g->must, g->delta0, g->cmap);
}
/*
@@ -1774,4 +1776,46 @@
if (plusnest != 0)
g->iflags |= BAD;
return(maxnest);
+}
+
+
+#define NALT 7 /* tied to scanf() size in alternate() */
+
+/*
+ * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c]
+ */
+void
+gosper(pattern, delta0, cmap)
+ char *pattern; /* ... HAKMEM lives ... */
+ int *delta0; /* Who/what is HAKMEM? */
+ char *cmap;
+{
+ char *altpat[NALT];
+ int altlen[NALT];
+ register int j, altmin;
+ unsigned char c;
+
+ /* Make one-string case look like simple alternatives case */
+ altmin = altlen[0] = strlen(pattern);
+ altpat[0] = pattern;
+
+ /* For chars that aren't in any string, skip by string length. */
+ for (j = 0; j < 256; j++) {
+ delta0[j] = altmin;
+ cmap[j] = j; /* Sneak in initialization of cmap */
+ }
+
+ /* For chars in a string, skip distance from char to end of string. */
+ /* (If char appears more than once, skip minimum distance.) */
+
+ delta0[0] = 0;
+ for (j = 0; j < altlen[0] - 1; j++) {
+ c = altpat[0][j];
+ delta0[c] = MIN(delta0[c], altlen[0] - j - 1);
+ }
+
+ /* For last char of each string, fall out of search loop. */
+ c = altpat[0][altlen[0] - 1];
+ delta0[c] = LARGE;
+
}
diff -ur regex.orig/regex2.h regex/regex2.h
--- regex.orig/regex2.h Mon Sep 13 09:53:57 1999
+++ regex/regex2.h Fri Oct 15 00:36:14 1999
@@ -56,6 +56,8 @@
*/
#define MAGIC1 ((('r'^0200)<<8) | 'e')
+#define META "\n^$.[]()?+*|\\" /* egrep meta-characters */
+
/*
* The internal representation is a *strip*, a sequence of
* operators ending with an endmarker. (Some terminology etc. is a
@@ -164,6 +166,8 @@
size_t nsub; /* copy of re_nsub */
int backrefs; /* does it use back references? */
sopno nplus; /* how deep does it nest +s? */
+ int delta0[256]; /* Boyer-Moore algorithm core */
+ char cmap[256];
/* catspace must be last */
cat_t catspace[1]; /* actually [NC] */
};
@@ -171,3 +175,14 @@
/* misc utilities */
#define OUT (CHAR_MAX+1) /* a non-character value */
#define ISWORD(c) (isalnum((uch)(c)) || (c) == '_')
+
+/* stuff used by Boyer-Moore */
+
+#define MIN(A, B) ((A) > (B) ? (B) : (A))
+
+#define BUFSIZE 65536 /* make higher for cray */
+#define PATSIZE 6000
+#define LARGE BUFSIZE + PATSIZE
+
+void gosper(char *, int *, char *);
+int chimaera(char *, char *, struct re_guts *);
>Release-Note:
>Audit-Trail:
>Unformatted:
To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-bugs" in the body of the message
home |
help
Want to link to this message? Use this
URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?199910150445.AAA81151>
