From owner-freebsd-current  Sun Sep 10 01:06:10 1995
Return-Path: current-owner
Received: (from majordom@localhost) by freefall.freebsd.org (8.6.12/8.6.6) id BAA27738 for current-outgoing; Sun, 10 Sep 1995 01:06:10 -0700
Received: from godzilla.zeta.org.au (godzilla.zeta.org.au [203.2.228.34]) by freefall.freebsd.org (8.6.12/8.6.6) with ESMTP id BAA27714 for ; Sun, 10 Sep 1995 01:05:50 -0700
Received: (from bde@localhost) by godzilla.zeta.org.au (8.6.9/8.6.9) id RAA27567; Sun, 10 Sep 1995 17:58:09 +1000
Date: Sun, 10 Sep 1995 17:58:09 +1000
From: Bruce Evans
Message-Id: <199509100758.RAA27567@godzilla.zeta.org.au>
To: pst@shockwave.com, wosch@freebsd.first.gmd.de
Subject: Re: 20-40% faster wc(1)
Cc: current@freebsd.org
Sender: current-owner@freebsd.org
Precedence: bulk

>I don't understand... the original isspace() was just a lookup in a table
>followed by an 'and'.  I assume this changed because of the rune code?

The rune code has 2 or 3 more tests.

>The question you should be asking yourself is WHY is isspace(3) slow?

Because gcc does a poor job of optimizing away the extra tests.

>Not kludging wc.

I do not think this patch should be accepted.  Any program that lexes
input could benefit from having specialized ctype tables.  It would be
interesting to generate correct ones automagically.  The proposed patch
is close to generating a correct one for wc.

+		u_char	spbuf[256];	/* buffer for isspace lookup */
+
+		for (ch = 0; ch < 256; ch++)
+			spbuf[ch] = isspace(ch);

Correct (?) version:

#if UCHAR_MAX >= 4096
#define	ISSPACE(ch)	isspace(ch)
#else
		u_char	myctype[UCHAR_MAX + 1];

		for (ch = 0; ch <= UCHAR_MAX; ch++)
			myctype[ch] = isspace(ch) ? 1 : 0;
#ifdef maybe
		/* It might be best to coalesce the space and newline tests. */
		myctype['\n'] |= 2;
#endif
#define	ISSPACE(ch)	(myctype[ch] != 0)
#endif

(A fleshed-out sketch of this table, and of an inner loop that uses it,
follows the annotated output below.)

Bruce

Annotated gcc output:

	.align 2,0x90
// do {			/* for loop rearranged */
L71:
//	ch = *p++;
// gcc generates its usual poor sign extension code.  It should
// keep the top 24 bits of %edx as 0 and load the char into the
// low 8 bits, saving 5 cycles out of 6 on i486's.
	movzbl (%esi),%edx
// Increment p.  Code is best possible.  1 cycle on an i486.
	incl %esi
//	if (ch == '\n')
// gcc could save a byte or two of space here, and perhaps some
// time, by comparing only the low 8 bits.
// A better algorithm would use a special character type test
// that lumps newlines with spaces.  1 cycle on an i486.
	cmpl $10,%edx
// The branch is usually taken.  Better i486 code would arrange for
// it not to be taken.  Usually 3 cycles on an i486.
	jne L72
//		++linect;
// Usually not executed.
	incl -16484(%ebp)
L72:
//	if (isspace(ch))
// #define isspace(c)	__istype((c), _S)
// static __inline int
// __istype(_BSD_RUNE_T_ _c,
//	unsigned long _f)
// {
// Poor register allocation wastes 2 cycles on i486's.  Perhaps gcc
// was challenged by the inline.
	movl %edx,%ecx
	movl %ecx,%edx
//	/*
//	 * ctype is hacked to work
//	 * with bogus negative args
//	 * although this breaks it
//	 * on EOF.  Negative args
//	 * are generated by the
//	 * common bug `isfoo(*cp)'.
//	 */
//	if (_c < 0)
//		_c = (unsigned char) _c;
// At this point `c' is known to be the conversion of an unsigned char
// to an int, so the above 2 lines can be optimized out.  And they were.
//	return ((((_c & _CRMASK)
// Similarly, _c & _CRMASK is known to be 0.  gcc half figures this
// out, then it stupidly generates the comparison of 0 with 0,
// wasting 5 cycles on a 486.
	movb $0,%dl
	testl %edx,%edx
	je L76
//		? ___runetype(_c) :
// The following is never executed by wc.
	pushl %ecx
	call ____runetype
	addl $4,%esp
	shrl $14,%eax
	movl %eax,%edx
	jmp L84
	.align 2,0x90
L76:
//		_CurrentRuneLocale->runetype[_c]) & _f) ? 1 : 0);
// }
// isspace() introduces only a little bloat.
// First an indirection.  Costs 1 cycle (+ cache misses) on i486's.
	movl __CurrentRuneLocale,%edx
// Then a complicated address mode.  Probably costs a cycle on i486's.
	movb 53(%edx,%ecx,4),%dl
// Back to gcc pessimizations.  Pay 2 cycles (guess on which machine :-)
// to merge with the runetype case that can't happen.
	shrl $6,%edx
L84:
// Do the classification.  Stupidly branch for the usual case.
// Usually 1 + 3 cycles.
	andl $1,%edx
	je L73
//		gotsp = 1;
// The !isspace() case branches away, so this is usually not reached.
	movw $1,-16500(%ebp)
	jmp L68
	.align 2,0x90
L73:
//	else if (gotsp) {
// Back to C code pessimizations.  `register short gotsp' is
// actually `unregister slow'.  The short costs a cycle.  gcc
// apparently ran out of registers and had to keep the flag in
// memory.  As usual, stupidly branch for the usual case.
// Usually 3 + 3 cycles.
	cmpw $0,-16500(%ebp)
	je L68
//		gotsp = 0;
// Usually not executed.
	movw $0,-16500(%ebp)
//		++wordct;
// Usually not executed.
	incl -16488(%ebp)
//	}
L68:
// } while (len-- != 0);	/* for loop rearranged */
// Test for loop exit.  1 cycle.
	decl %ebx
// Neither the C code nor gcc was smart enough to test against 0.
// 1 more cycle.
	cmpl $-1,%ebx
// Normally don't exit.  3 cycles.
	jne L71
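
For concreteness, here is a minimal, self-contained sketch of the table from
the "Correct (?)" version above, with the #ifdef maybe idea folded in so that
a single lookup classifies both spaces and newlines.  Only myctype[] and
ISSPACE() come from the version above; the flag values and the names MY_SP,
MY_NL, ISNEWLINE() and build_ctype_table() are made up for the sketch, and
the UCHAR_MAX >= 4096 guard is left out.

	#include <ctype.h>
	#include <limits.h>
	#include <sys/types.h>		/* u_char */

	/* Bit 0: isspace() says space; bit 1: newline. */
	#define	MY_SP	1
	#define	MY_NL	2

	static u_char	myctype[UCHAR_MAX + 1];

	/* ch must be an unsigned char value. */
	#define	ISSPACE(ch)	(myctype[ch] != 0)
	#define	ISNEWLINE(ch)	(myctype[ch] & MY_NL)

	static void
	build_ctype_table(void)
	{
		int ch;

		for (ch = 0; ch <= UCHAR_MAX; ch++)
			myctype[ch] = isspace(ch) ? MY_SP : 0;
		/* Coalesce the newline test into the same table. */
		myctype['\n'] |= MY_NL;
	}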
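
And a sketch of an inner loop using that table, along the lines the annotated
output suggests: one table lookup classifies each character, the newline count
comes from the same lookup instead of a separate compare, gotsp is a plain int
rather than a register short, and the loop counts down so the exit test is
against 0 rather than -1.  The names linect, wordct, gotsp, p and len follow
the annotated code; count_buffer() and the surrounding declarations are
assumptions for the sketch, which continues the one above.

	static long	linect, wordct;
	static int	gotsp = 1;	/* start as if a space preceded the input */

	static void
	count_buffer(const u_char *p, size_t len)
	{
		u_char ch, type;

		if (len == 0)
			return;
		do {
			ch = *p++;
			type = myctype[ch];	/* one lookup classifies ch */
			if (type & MY_NL)
				++linect;
			if (type != 0) {	/* any kind of space */
				gotsp = 1;
			} else if (gotsp) {
				gotsp = 0;
				++wordct;
			}
		} while (--len != 0);		/* exit test against 0, not -1 */
	}

With the table in ordinary memory there is no indirection through
_CurrentRuneLocale, no complicated address mode, and no dead _CRMASK test for
gcc to mis-optimize.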