From owner-freebsd-hardware  Fri Feb 28 04:20:59 1997
Return-Path: <owner-hardware>
Received: (from root@localhost)
          by freefall.freebsd.org (8.8.5/8.8.5) id EAA23990
          for hardware-outgoing; Fri, 28 Feb 1997 04:20:59 -0800 (PST)
Received: from dfw-ix6.ix.netcom.com (dfw-ix6.ix.netcom.com [206.214.98.6])
          by freefall.freebsd.org (8.8.5/8.8.5) with ESMTP id EAA23983
          for <freebsd-hardware@FreeBSD.ORG>; Fri, 28 Feb 1997 04:20:51 -0800 (PST)
Received: (from smap@localhost)
          by dfw-ix6.ix.netcom.com (8.8.4/8.8.4)
	  id GAA22272; Fri, 28 Feb 1997 06:18:59 -0600 (CST)
Received: from ala-ca9-57.ix.netcom.com(207.93.143.121) by dfw-ix6.ix.netcom.com via smap (V1.3)
	id sma021343; Fri Feb 28 06:17:22 1997
Received: (from asami@localhost) by silvia.HIP.Berkeley.EDU (8.8.5/8.6.9) id EAA07192; Fri, 28 Feb 1997 04:17:18 -0800 (PST)
Date: Fri, 28 Feb 1997 04:17:18 -0800 (PST)
Message-Id: <199702281217.EAA07192@silvia.HIP.Berkeley.EDU>
To: bde@zeta.org.au
CC: freebsd-hardware@FreeBSD.ORG
In-reply-to: <199702281008.VAA07624@godzilla.zeta.org.au> (message from Bruce Evans on Fri, 28 Feb 1997 21:08:05 +1100)
Subject: Re: Memory speed of P6-200 (256k)
From: asami@vader.cs.berkeley.edu (Satoshi Asami)
Sender: owner-hardware@FreeBSD.ORG
X-Loop: FreeBSD.org
Precedence: bulk

 * Just write it hex or octal if you are in a hurry.  We already do this
 * for `rdmsr' etc.

Well ok, let's try it first.  However, if what Bruce said is true (and
it usually is) and I understand it correctly (which I usually don't),
we won't get any better results because (1) the memory bandwidth is
about 180MB/s on Triton-I/II with EDO, and (2) a P5-133 can't quite
keep up (I've seen about 160MB/s), while a P6-166 can.  Since MMX
comes only with 166MHz and 200MHz parts, it's not really going to
help.

Anyway.

The following shar contains three files in it: "runtest" (shell
script), "mem.c" (test driver) and "unrolled.s" (assembly language
subroutine).

You can do a

  gcc -O -o mem mem.c unrolled.s

and then

  sh runtest mem

to do the standard FPcopy test.  This should give you around 80MB/s
for a 133MHz and faster Pentium.

Now, look at this part inside "unrolled.s":

===
unrolled_loop:
	fildq 0(%esi)
	fildq 8(%esi)
	fildq 16(%esi)
	fildq 24(%esi)
	fildq 32(%esi)
	fildq 40(%esi)
	fildq 48(%esi)
	fildq 56(%esi)
	fistpq 56(%edi)
	fistpq 48(%edi)
	fistpq 40(%edi)
	fistpq 32(%edi)
	fistpq 24(%edi)
	fistpq 16(%edi)
	fistpq 8(%edi)
	fistpq 0(%edi)
===

These should be changed to

===
unrolled_loop:
	movq mm0,0(%esi)
	movq mm1,8(%esi)
	movq mm2,16(%esi)
	movq mm3,24(%esi)
	movq mm4,32(%esi)
	movq mm5,40(%esi)
	movq mm6,48(%esi)
	movq mm7,56(%esi)
	movq 0(%edi),mm0
	movq 8(%edi),mm1
	movq 16(%edi),mm2
	movq 24(%edi),mm3
	movq 32(%edi),mm4
	movq 40(%edi),mm5
	movq 48(%edi),mm6
	movq 56(%edi),mm7
===

Also, at the end of the routine:

===
	leal -8(%ebp),%esp
	popl %esi
	popl %edi
	leave
	ret
===

you need to insert an "empty MMX state" instruction:

===
	leal -8(%ebp),%esp
	popl %esi
	popl %edi
	.byte 0x0f, 0x77	# emms
	leave
	ret
===

Make these changes and let us know what you get.

Unfortunately, we don't have an assembler that works on this yet so
I'm not sure how to translate the "movq" instructions.  Intel's manual
(that I just downloaded from "http://developer.intel.com/design/mmx/")
says that the opcode for "movq mm, mm/m64" is "0f 6f /r" and "movq
mm/m64, mm" is "0f 7f /r".  Someone who's more proficient in Intel's
terminology can probably translate this.

Satoshi
-------
# This is a shell archive.  Save it in a file, remove anything before
# this line, and then unpack it by entering "sh file".  Note, it may
# create directories; files and directories will be owned by you and
# have default permissions.
#
# This archive contains:
#
#	runtest
#	mem.c
#	unrolled.s
#
echo x - runtest
sed 's/^X//' >runtest << 'END-of-runtest'
X#!/bin/sh
X
Xif [ $# != 1 ]; then
X  echo "usage: $0 executable"
X  exit
Xfi
Xexec=$1
X
Xi=32
Xecho "    size    bandwidth"
Xwhile [ $i -le 1000000 ]; do
X  ./$exec $i 2>&1 >/dev/null | \
X    awk "{printf(\" %7d  %9s MB/s\n\", $i, \$5)}"
X  i=$(($i * 2))
Xdone
END-of-runtest
echo x - mem.c
sed 's/^X//' >mem.c << 'END-of-mem.c'
X/*
X * mem.c - simple memory copy test
X */
X#include <stdio.h>
X#include <stdlib.h>
X#include <string.h>
X#include <ctype.h>
X#include <sys/time.h>
X
X#define TOTALSIZE 4194304	/* 4MB */
X
Xint main(int argc, char **argv)
X{
X        int     usecs, i, bytes;
X        unsigned char *src, *dst;
X	unsigned long tmp;
X	struct timeval tv, start, stop;
X	int N;
X
X	if (argc != 2) {
X		fprintf(stderr,
X		    "Usage: %s size\n", argv[0]);
X		exit(1);
X	}
X	bytes = atoi(argv[1]);
X	if (toupper(argv[1][strlen(argv[1])-1]) == 'K')
X		bytes *= 1024;
X	if (toupper(argv[1][strlen(argv[1])-1]) == 'M')
X		bytes *= (1024 * 1024);
X	if (bytes > TOTALSIZE) {
X	    fprintf(stderr, "size cannot be more than %d\n", TOTALSIZE);
X	    exit(1);
X	}
X	N = TOTALSIZE/bytes;
X        src = (unsigned char *) malloc(bytes*N + 16384);
X	dst = (unsigned char *) malloc(bytes*N + 16384);
X	if (!src || !dst) {
X		perror("malloc");
X		exit(1);
X	}
X
X	/* align arrays to 8K boundaries */
X	tmp = (unsigned long) src;
X	tmp += 8192;
X	tmp &= ~8191;
X	src = (unsigned char *) tmp;
X	tmp = (unsigned long) dst;
X	tmp += 8192;
X	tmp &= ~8191;
X	dst = (unsigned char *) tmp;
X
X	/* fill in src with random junk */
X	gettimeofday(&tv, NULL);
X	srandom(tv.tv_usec);
X	for (i = 0 ; i < bytes*N ; i++)
X	    src[i] = random();
X	bzero(dst, bytes*N);
X
X	/* ensure both src and dst are not swapped out */
X	bcopy(src, dst, bytes*N);
X
X	/* main loop */
X	gettimeofday(&start, NULL) ;
X	for (i = 0; i < N; ++i) {
X	    unrolled(src+bytes*i, dst+bytes*i, bytes);
X	}
X	gettimeofday(&stop, NULL) ;
X
X	usecs = (stop.tv_sec - start.tv_sec)*1000000 +
X	    (stop.tv_usec - start.tv_usec) ;
X
X	/* make sure everything is copied correctly */
X	for (i = 0 ; i < bytes ; i++)
X	    if (src[i] != dst[i])
X		printf("error: byte %d, should be %02x, is %02x\n",
X		       i, src[i], dst[i]);
X
X	fprintf(stderr, "%d bytes copied at  %f MB/s\n", bytes,
X		(double) bytes / 1024 / 1024 / (usecs / 1000000.0 / N));
X
X	return (0);
X}
END-of-mem.c
echo x - unrolled.s
sed 's/^X//' >unrolled.s << 'END-of-unrolled.s'
X.text	/*
X	 * unrolled(src, dst, len) -- copy len bytes from src to dst.
X	 *
X	 * The three arguments are read from 8(%ebp), 12(%ebp) and
X	 * 16(%ebp) into %esi, %edi and %ecx.  A length of 64 bytes or
X	 * more is handled in chunks of at most 1792 bytes, each copied
X	 * 64 bytes at a time with fildq/fistpq; whatever is left (fewer
X	 * than 64 bytes) is copied with rep movsl / rep movsb.
X	 */
X	.align 2
X	.globl	_unrolled
X	.type	_unrolled,@function
X_unrolled:
X	pushl %ebp
X	movl %esp,%ebp
X	pushl %edi	/* %edi and %esi are restored before returning */
X	pushl %esi
X	movl 8(%ebp),%esi	/* first argument: source pointer */
X	movl 12(%ebp),%edi	/* second argument: destination pointer */
X	movl 16(%ebp),%ecx	/* third argument: byte count */
X
X	cmpl $63,%ecx
X	jbe unrolled_tail	/* fewer than 64 bytes: tail copy only */
X	/* outer loop: one pass per chunk of at most 1792 bytes */
X4:
X	pushl %ecx	/* stack top = bytes still to copy */
X	cmpl $1792,%ecx
X	jbe 2f
X	movl $1792,%ecx	/* clamp this chunk to 1792 bytes */
X2:
X	subl %ecx,0(%esp)	/* stack top = bytes left after this chunk */
X	cmpl $256,%ecx
X	jb 5f	/* chunk smaller than 256 bytes: skip the read pass */
X	pushl %esi	/* the read pass advances %esi and %ecx; save both */
X	pushl %ecx
X	.align 4,0x90
X3:	/*
X	 * Read one longword from every 32 bytes of the chunk, 256 bytes
X	 * per iteration, discarding the values.  Presumably this pulls
X	 * the source into the cache ahead of the copy -- TODO confirm.
X	 */
X	movl 0(%esi),%eax
X	movl 32(%esi),%eax
X	movl 64(%esi),%eax
X	movl 96(%esi),%eax
X	movl 128(%esi),%eax
X	movl 160(%esi),%eax
X	movl 192(%esi),%eax
X	movl 224(%esi),%eax
X	addl $256,%esi
X	subl $256,%ecx
X	cmpl $256,%ecx
X	jae 3b
X	popl %ecx	/* restore the chunk size ... */
X	popl %esi	/* ... and the source pointer */
X5:
X	.align 2,0x90
Xunrolled_loop:	/* copy 64 bytes per iteration */
X	fildq 0(%esi)	/* push eight 64-bit integers onto the FPU stack */
X	fildq 8(%esi)
X	fildq 16(%esi)
X	fildq 24(%esi)
X	fildq 32(%esi)
X	fildq 40(%esi)
X	fildq 48(%esi)
X	fildq 56(%esi)
X	fistpq 56(%edi)	/* pop them, last loaded first, into the destination */
X	fistpq 48(%edi)
X	fistpq 40(%edi)
X	fistpq 32(%edi)
X	fistpq 24(%edi)
X	fistpq 16(%edi)
X	fistpq 8(%edi)
X	fistpq 0(%edi)
X	addl $-64,%ecx
X	addl $64,%esi
X	addl $64,%edi
X	cmpl $63,%ecx
X	ja unrolled_loop
X	popl %eax	/* bytes left after this chunk */
X	addl %eax,%ecx	/* plus the (under 64 byte) remainder of the chunk */
X	cmpl $64,%ecx
X	jae 4b	/* at least 64 bytes left: process another chunk */
X	/* tail: copy the remaining %ecx bytes with string moves */
Xunrolled_tail:
X	movl %ecx,%eax	/* keep the byte count for the movsb pass */
X	shrl $2,%ecx
X	cld
X	rep
X	movsl	/* copy count/4 longwords */
X	movl %eax,%ecx
X	andl $3,%ecx
X	rep
X	movsb	/* copy the last count%4 bytes */
X
X	leal -8(%ebp),%esp	/* point %esp at the saved %esi and %edi */
X	popl %esi
X	popl %edi
X	leave
X	ret
X
Xunrolled_end:
X	.size	_unrolled,unrolled_end-_unrolled
END-of-unrolled.s
exit