Date: Tue, 12 Sep 1995 23:24:54 +0200 From: Wolfram Schneider <wosch@cs.tu-berlin.de> To: current@freebsd.org Subject: /usr/libexec/locate.* Message-ID: <199509122124.XAA08914@caramba.cs.tu-berlin.de>
next in thread | raw e-mail | index | archive | help
o locate.bigram.c
Bigram does not remove the newline at the end of each filename. This
particularly breaks the bigram algorithm, and /var/db/locate.database
grows by 15 %. It's really a bug!!!
The bigram output is silly and needs ~1/2 of the CPU time of
the database rebuild.
old:
locate.bigram < $filelist | sort | uniq -c | sort -T /$TMPDIR -nr
^^^^^^^^^^^^^^
this can easily be done by bigram itself
new:
locate.bigram < $filelist | sort -T /$TMPDIR -nr
o locate.code.c
Use a lookup array instead of a function. 3 x faster (GNU code is
now 6 x slower)
o locate.updatedb.csh
# search locally or everything
# find ${SRCHPATHS} -print | \
find ${SRCHPATHS} \! -fstype local -prune -or -print | \
tr '/' '\001' | \
^^^^^^^^^^^^^^^^^^
Superfluous. Nobody needs it.
(sort -T $TMPDIR -f; echo $status > $errs) | tr '\001' '/' > $filelist
^^
wrong, makes the database 0.5% bigger
^^^^^^^^^^^^^^^^^^^^^^^^
Superfluous, see above. It doubles the disk space needed for the filenames:
the filenames are in a temp sort file and at the same time in $filelist.
sort -T $TMPDIR -o $filelist avoids this by renaming the temp sort file
to $filelist. My database is 115MB big ...
$LIBDIR/locate.bigram < $filelist | \
(sort -T /$TMPDIR; echo $status >> $errs) | uniq -c |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
see locate.bigram.c
--- /usr/src/usr.bin/locate/bigram/locate.bigram.c Fri May 27 14:32:02 1994
+++ locate.bigram.c Tue Sep 12 14:24:56 1995
@@ -53,32 +53,51 @@
#include <stdio.h>
#include <sys/param.h> /* for MAXPATHLEN */
+#include <string.h> /* memchr */
-char buf1[MAXPATHLEN] = " ";
-char buf2[MAXPATHLEN];
+u_char buf1[MAXPATHLEN] = " ";
+u_char buf2[MAXPATHLEN];
+unsigned int bigram[UCHAR_MAX][UCHAR_MAX];
-main ( )
+
+void main ( )
{
- register char *cp;
- register char *oldpath = buf1, *path = buf2;
+ register u_char *cp;
+ register u_char *oldpath = buf1, *path = buf2;
+ register int i, j;
+
+ /* init bigram buffer */
+ for (i = 0; i < UCHAR_MAX; i++)
+ for (j = 0; j < UCHAR_MAX; j++)
+ bigram[i][j] = 0;
while ( fgets ( path, sizeof(buf2), stdin ) != NULL ) {
+ /* chop newline */
+ if ((cp = memchr(path, '\n', sizeof(buf2))) != NULL)
+ *cp = NULL;
+
/* skip longest common prefix */
- for ( cp = path; *cp == *oldpath; cp++, oldpath++ )
- if ( *oldpath == NULL )
- break;
+ for (cp = path; (*cp == *oldpath) != NULL; cp++, oldpath++);
+
/*
* output post-residue bigrams only
*/
+
while ( *cp != NULL && *(cp + 1) != NULL ) {
- putchar ( *cp++ );
- putchar ( *cp++ );
- putchar ( '\n' );
+ bigram[*cp][*(cp+1)]++;
+ cp += 2;
}
+
if ( path == buf1 ) /* swap pointers */
path = buf2, oldpath = buf1;
else
path = buf1, oldpath = buf2;
}
+
+ /* output */
+ for (i = 0; i < UCHAR_MAX; i++)
+ for (j = 0; j < UCHAR_MAX; j++)
+ if (bigram[i][j] != 0)
+ fprintf(stdout, "%4d %c%c\n", bigram[i][j], i, j);
}
--- 1.1 1995/09/12 16:40:45
+++ locate.code.c 1995/09/12 18:07:57
@@ -93,8 +93,20 @@
char buf2[MAXPATHLEN];
char bigrams[BGBUFSIZE + 1] = { 0 };
+#if 1
+#define BGINDEX(x) (big[(int)*x][(int)*(x+1)])
+typedef u_char bg_t;
+bg_t big[UCHAR_MAX][UCHAR_MAX];
+
+#else
+#define BGINDEX(x) bgindex(x)
+typedef int bg_t;
+#endif
+
int bgindex __P((char *));
void usage __P((void));
+extern int optind;
+extern int optopt;
int
main(argc, argv)
@@ -104,6 +116,7 @@
register char *cp, *oldpath, *path;
int ch, code, count, diffcount, oldcount;
FILE *fp;
+ register int i, j;
while ((ch = getopt(argc, argv, "")) != EOF)
switch(ch) {
@@ -126,14 +139,22 @@
err(1, "stdout");
(void)fclose(fp);
+ /* init lookup table */
+ for (i = 0; i < UCHAR_MAX; i++)
+ for (j = 0; j < UCHAR_MAX; j++)
+ big[i][j] = (bg_t)-1;
+
+ for (cp = bigrams, i = 0; *cp != NULL; i += 2, cp += 2)
+ big[(int)*cp][(int)*(cp + 1)] = (bg_t)i;
+
oldpath = buf1;
path = buf2;
oldcount = 0;
while (fgets(path, sizeof(buf2), stdin) != NULL) {
- /* Truncate newline. */
- cp = path + strlen(path) - 1;
- if (cp > path && *cp == '\n')
- *cp = '\0';
+
+ /* chop newline */
+ if ((cp = memchr(path, '\n', sizeof(buf2))) != NULL)
+ *cp = NULL;
/* Squelch characters that would botch the decoding. */
for (cp = path; *cp != NULL; cp++) {
@@ -144,9 +165,9 @@
}
/* Skip longest common prefix. */
- for (cp = path; *cp == *oldpath; cp++, oldpath++)
- if (*oldpath == NULL)
- break;
+ for (cp = path; (*cp == *oldpath) != NULL; cp++, oldpath++)
+ ;
+
count = cp - path;
diffcount = count - oldcount + OFFSET;
oldcount = count;
@@ -164,7 +185,7 @@
err(1, "stdout");
break;
}
- if ((code = bgindex(cp)) < 0) {
+ if ((code = BGINDEX(cp)) == (bg_t)-1) {
if (putchar(*cp++) == EOF ||
putchar(*cp++) == EOF)
err(1, "stdout");
--- /usr/libexec/locate.updatedb.old Sun Jan 1 05:36:58 1995
+++ /usr/libexec/locate.updatedb Tue Sep 12 14:33:32 1995
@@ -58,12 +58,10 @@
# search locally or everything
# find ${SRCHPATHS} -print | \
find ${SRCHPATHS} \! -fstype local -prune -or -print | \
- tr '/' '\001' | \
- (sort -T $TMPDIR -f; echo $status > $errs) | tr '\001' '/' > $filelist
+ sort -T $TMPDIR -o $filelist; echo $status > $errs
-$LIBDIR/locate.bigram < $filelist | \
- (sort -T /$TMPDIR; echo $status >> $errs) | \
- uniq -c | sort -T /$TMPDIR -nr | \
+$LIBDIR/locate.bigram.new < $filelist | \
+ sort -T $TMPDIR -nr | \
awk '{ if (NR <= 128) print $2 }' | tr -d '\012' > $bigrams
# code the file list
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?199509122124.XAA08914>
