Date: Fri, 15 Oct 1999 00:45:46 -0400 (EDT) From: howardjp@wam.umd.edu To: FreeBSD-gnats-submit@freebsd.org Subject: bin/14342: [PATCH] Speed ups for regex! Message-ID: <199910150445.AAA81151@byzantine.student.umd.edu>
next in thread | raw e-mail | index | archive | help
>Number: 14342 >Category: bin >Synopsis: [PATCH] Speed ups for regex! >Confidential: no >Severity: non-critical >Priority: low >Responsible: freebsd-bugs >State: open >Quarter: >Keywords: >Date-Required: >Class: change-request >Submitter-Id: current-users >Arrival-Date: Thu Oct 14 21:50:00 PDT 1999 >Closed-Date: >Last-Modified: >Originator: James Howard >Release: FreeBSD 4.0-CURRENT i386 >Organization: University of Maryland >Environment: FreeBSD byzantine.student.umd.edu 4.0-CURRENT FreeBSD 4.0-CURRENT #40: Wed Oct 6 18:17:44 EDT 1999 howardjp@byzantine.student.umd.edu:/usr/src/sys/compile/BYZANTINE i386 >Description: Regex is horribly, horribly slow. There is code in lib/libc/regex/engine.c to handle searching for a known constant string which has to be present in any matches. This is meant to cut down the number of strings regexec() actually has to search. However, it uses a simple search to find it and it takes forever. I replaced it with a Boyer-Moore search. I stole the code from 4.4BSD's egrep. I have trimed it down as much as I can and it roughly halves the speed of calls to regexec on mis-matches. Apply the patch below to lib/libc/regex to get add the new code. >How-To-Repeat: N/A >Fix: diff -ur regex.orig/engine.c regex/engine.c --- regex.orig/engine.c Mon Sep 13 09:53:57 1999 +++ regex/engine.c Fri Oct 15 00:30:46 1999 @@ -144,14 +144,14 @@ int eflags; { register char *endp; - register int i; + register int i, j; + int patlen = g->mlen; struct match mv; register struct match *m = &mv; - register char *dp; + register char *dp, *k, *s; register const sopno gf = g->firststate+1; /* +1 for OEND */ register const sopno gl = g->laststate; - char *start; - char *stop; + char *start, *stop; /* simplify the situation where possible */ if (g->cflags®_NOSUB) @@ -167,14 +167,37 @@ return(REG_INVARG); /* prescreening; this does wonders for this rather slow code */ - if (g->must != NULL) { - for (dp = start; dp < stop; dp++) - if (*dp == g->must[0] && stop - dp >= g->mlen && - memcmp(dp, g->must, (size_t)g->mlen) == 0) - break; - if (dp == stop) /* we didn't find g->must */ - return(REG_NOMATCH); + if (g->must != NULL) { + for (k = string + patlen - 1; k < stop;) { + /* + * for a large class of patterns, upwards of 80% of + * match time is spent on the next line. we beat + * existing microcode (vax 'matchc') this way. + */ + while ((k += g->delta0[*(unsigned char *) k]) < stop); + if (k < string + LARGE) + break; + k -= LARGE; + + j = patlen - 1; + s = k - 1; + while (g->cmap[*((unsigned char *)s--)] == + g->must[--j]); + + /* + * delta-less shortcut for literati. short shrift for + * genetic engineers? + */ + + if (j < 0) + goto bmpassed; + + k++; /* no match; restart next char */ + } + return REG_NOMATCH; } + +bmpassed: /* match struct setup */ m->g = g; diff -ur regex.orig/regcomp.c regex/regcomp.c --- regex.orig/regcomp.c Mon Sep 13 09:53:57 1999 +++ regex/regcomp.c Fri Oct 15 00:32:47 1999 @@ -245,6 +245,7 @@ g->categories = &g->catspace[-(CHAR_MIN)]; (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); g->backrefs = 0; + (void) memset((void *)g->delta0, 0, 256 * sizeof(int)); /* do it */ EMIT(OEND, 0); @@ -1738,6 +1739,7 @@ } assert(cp == g->must + g->mlen); *cp++ = '\0'; /* just on general principles */ + gosper(g->must, g->delta0, g->cmap); } /* @@ -1774,4 +1776,46 @@ if (plusnest != 0) g->iflags |= BAD; return(maxnest); +} + + +#define NALT 7 /* tied to scanf() size in alternate() */ + +/* + * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c] + */ +void +gosper(pattern, delta0, cmap) + char *pattern; /* ... HAKMEM lives ... */ + int *delta0; /* Who/what is HAKMEM? */ + char *cmap; +{ + char *altpat[NALT]; + int altlen[NALT]; + register int j, altmin; + unsigned char c; + + /* Make one-string case look like simple alternatives case */ + altmin = altlen[0] = strlen(pattern); + altpat[0] = pattern; + + /* For chars that aren't in any string, skip by string length. */ + for (j = 0; j < 256; j++) { + delta0[j] = altmin; + cmap[j] = j; /* Sneak in initialization of cmap */ + } + + /* For chars in a string, skip distance from char to end of string. */ + /* (If char appears more than once, skip minimum distance.) */ + + delta0[0] = 0; + for (j = 0; j < altlen[0] - 1; j++) { + c = altpat[0][j]; + delta0[c] = MIN(delta0[c], altlen[0] - j - 1); + } + + /* For last char of each string, fall out of search loop. */ + c = altpat[0][altlen[0] - 1]; + delta0[c] = LARGE; + } diff -ur regex.orig/regex2.h regex/regex2.h --- regex.orig/regex2.h Mon Sep 13 09:53:57 1999 +++ regex/regex2.h Fri Oct 15 00:36:14 1999 @@ -56,6 +56,8 @@ */ #define MAGIC1 ((('r'^0200)<<8) | 'e') +#define META "\n^$.[]()?+*|\\" /* egrep meta-characters */ + /* * The internal representation is a *strip*, a sequence of * operators ending with an endmarker. (Some terminology etc. is a @@ -164,6 +166,8 @@ size_t nsub; /* copy of re_nsub */ int backrefs; /* does it use back references? */ sopno nplus; /* how deep does it nest +s? */ + int delta0[256]; /* Boyer-Moore algorithm core */ + char cmap[256]; /* catspace must be last */ cat_t catspace[1]; /* actually [NC] */ }; @@ -171,3 +175,14 @@ /* misc utilities */ #define OUT (CHAR_MAX+1) /* a non-character value */ #define ISWORD(c) (isalnum((uch)(c)) || (c) == '_') + +/* stuff used by Boyer-Moore */ + +#define MIN(A, B) ((A) > (B) ? (B) : (A)) + +#define BUFSIZE 65536 /* make higher for cray */ +#define PATSIZE 6000 +#define LARGE BUFSIZE + PATSIZE + +void gosper(char *, int *, char *); +int chimaera(char *, char *, struct re_guts *); >Release-Note: >Audit-Trail: >Unformatted: To Unsubscribe: send mail to majordomo@FreeBSD.org with "unsubscribe freebsd-bugs" in the body of the message
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?199910150445.AAA81151>