Date: Sun, 14 Jun 2026 18:14:35 +0000 From: Baptiste Daroussin <bapt@FreeBSD.org> To: src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org Subject: git: a74c77cc7bed - main - grep(1): optimize -w/--word-regexp word boundary check Message-ID: <6a2eef8b.1f3da.117cc007@gitrepo.freebsd.org>
index | next in thread | raw e-mail
The branch main has been updated by bapt: URL: https://cgit.FreeBSD.org/src/commit/?id=a74c77cc7bed8dba50e976a7be2aa0094ee27b61 commit a74c77cc7bed8dba50e976a7be2aa0094ee27b61 Author: Baptiste Daroussin <bapt@FreeBSD.org> AuthorDate: 2026-06-10 14:41:39 +0000 Commit: Baptiste Daroussin <bapt@FreeBSD.org> CommitDate: 2026-06-14 18:14:31 +0000 grep(1): optimize -w/--word-regexp word boundary check The -w option checks word boundaries before and after each potential match by decoding the adjacent character. This was done via the heavyweight sscanf(3) with "%lc", which goes through the full scanf parser and locale-aware mbrtowc(3) machinery even for simple ASCII. Replace with a three-tier fast path: 1. ASCII bytes (< 0x80): simple isalnum(3) / '_' comparison 2. UTF-8 continuation bytes (0x80-0xBF): interior bytes of a multi-byte character are always word characters -> no further decoding needed 3. Multi-byte start bytes (>= 0xC0): decode with mbrtowc(3) directly instead of sscanf(3)/%lc, avoiding scanf parser overhead Benchmark with ministat(1) (10 runs each): Worst-case ASCII (100k lines of 100 'a' chars, -w 'a'): Difference at 95.0% confidence: -15.3% +/- 3.1% Worst-case Unicode (50k lines of 100 accented 'e', -w 'e'): Difference at 95.0% confidence: -11.2% +/- 4.7% Normal -w (500k lines, -w 'the'): Difference at 95.0% confidence: -18.1% +/- 3.6% French text (100k lines, -w accented 'ete'): Difference at 95.0% confidence: -18.0% +/- 4.1% Non -w case shows no regression. 
Reviewed by: kevans Differential Revision: https://reviews.freebsd.org/D57587 --- usr.bin/grep/util.c | 44 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/usr.bin/grep/util.c b/usr.bin/grep/util.c index dbb21dcfd78e..bbb174370bd5 100644 --- a/usr.bin/grep/util.c +++ b/usr.bin/grep/util.c @@ -490,6 +490,35 @@ litexec(const struct pat *pat, const char *string, size_t nmatch, #define iswword(x) (iswalnum((x)) || (x) == L'_') +/* + * Check if the byte at the given offset in the line is a word character + * (alphanumeric or _). Handles ASCII fast path, UTF-8 continuation bytes, + * and multi-byte decoding via mbrtowc(3). + */ +static bool +iswordchar(const char *dat, size_t len, size_t offset) +{ + unsigned char ch; + mbstate_t mbstate; + wchar_t wc; + size_t n; + + if (offset >= len) + return (false); + + ch = (unsigned char)dat[offset]; + if (ch < 0x80) + return (isalnum(ch) || ch == '_'); + if ((ch & 0xC0) == 0x80) + /* Continuation byte: part of a word */ + return (true); + + /* Multi-byte start byte: decode with mbrtowc */ + memset(&mbstate, 0, sizeof(mbstate)); + n = mbrtowc(&wc, &dat[offset], MB_CUR_MAX, &mbstate); + return (n == (size_t)-1 || n == (size_t)-2 || iswword(wc)); +} + /* * Processes a line comparing it with the specified patterns. 
Each pattern * is looped to be compared along with the full string, saving each and every @@ -501,7 +530,6 @@ static bool procline(struct parsec *pc) { regmatch_t pmatch, lastmatch, chkmatch; - wchar_t wbegin, wend; size_t st, nst; unsigned int i; int r = 0, leflags = eflags; @@ -567,18 +595,14 @@ procline(struct parsec *pc) continue; /* Check for whole word match */ if (wflag) { - wbegin = wend = L' '; if (pmatch.rm_so != 0 && - sscanf(&pc->ln.dat[pmatch.rm_so - 1], - "%lc", &wbegin) != 1) + iswordchar(pc->ln.dat, pc->ln.len, + pmatch.rm_so - 1)) r = REG_NOMATCH; - else if ((size_t)pmatch.rm_eo != + if (r == 0 && (size_t)pmatch.rm_eo != pc->ln.len && - sscanf(&pc->ln.dat[pmatch.rm_eo], - "%lc", &wend) != 1) - r = REG_NOMATCH; - else if (iswword(wbegin) || - iswword(wend)) + iswordchar(pc->ln.dat, pc->ln.len, + pmatch.rm_eo)) r = REG_NOMATCH; /* * If we're doing whole word matching and we
home | help
Want to link to this message? Use this
URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?6a2eef8b.1f3da.117cc007>
