From owner-freebsd-hardware Fri Feb 28 04:20:59 1997 Return-Path: Received: (from root@localhost) by freefall.freebsd.org (8.8.5/8.8.5) id EAA23990 for hardware-outgoing; Fri, 28 Feb 1997 04:20:59 -0800 (PST) Received: from dfw-ix6.ix.netcom.com (dfw-ix6.ix.netcom.com [206.214.98.6]) by freefall.freebsd.org (8.8.5/8.8.5) with ESMTP id EAA23983 for ; Fri, 28 Feb 1997 04:20:51 -0800 (PST) Received: (from smap@localhost) by dfw-ix6.ix.netcom.com (8.8.4/8.8.4) id GAA22272; Fri, 28 Feb 1997 06:18:59 -0600 (CST) Received: from ala-ca9-57.ix.netcom.com(207.93.143.121) by dfw-ix6.ix.netcom.com via smap (V1.3) id sma021343; Fri Feb 28 06:17:22 1997 Received: (from asami@localhost) by silvia.HIP.Berkeley.EDU (8.8.5/8.6.9) id EAA07192; Fri, 28 Feb 1997 04:17:18 -0800 (PST) Date: Fri, 28 Feb 1997 04:17:18 -0800 (PST) Message-Id: <199702281217.EAA07192@silvia.HIP.Berkeley.EDU> To: bde@zeta.org.au CC: freebsd-hardware@FreeBSD.ORG In-reply-to: <199702281008.VAA07624@godzilla.zeta.org.au> (message from Bruce Evans on Fri, 28 Feb 1997 21:08:05 +1100) Subject: Re: Memory speed of P6-200 (256k) From: asami@vader.cs.berkeley.edu (Satoshi Asami) Sender: owner-hardware@FreeBSD.ORG X-Loop: FreeBSD.org Precedence: bulk * Just write it hex or octal if you are in a hurry. We already do this * for `rdmsr' etc. Well ok, let's try it first. However, if what Bruce said is true (and it usually is) and I understand it correctly (which I usually don't), we won't get any better results because (1) the memory bandwidth is about 180MB/s on Triton-I/II with EDO, and (2) a P5-133 can't quite keep up (I've seen about 160MB/s), while a P6-166 can. Since MMX comes only with 166MHz and 200MHz parts, it's not really going to help. Anyway. The following shar contains three files in it: "runtest" (shell script), "mem.c" (test driver) and "unrolled.s" (assembly language subroutine). You can do a gcc -O -o mem mem.c unrolled.s and then sh runtest mem to do the standard FPcopy test. 
This should give you around 80MB/s for a 133MHz and faster Pentium. Now, look at this part inside "unrolled.s": === unrolled_loop: fildq 0(%esi) fildq 8(%esi) fildq 16(%esi) fildq 24(%esi) fildq 32(%esi) fildq 40(%esi) fildq 48(%esi) fildq 56(%esi) fistpq 56(%edi) fistpq 48(%edi) fistpq 40(%edi) fistpq 32(%edi) fistpq 24(%edi) fistpq 16(%edi) fistpq 8(%edi) fistpq 0(%edi) === These should be changed to === unrolled_loop: movq mm0,0(%esi) movq mm1,8(%esi) movq mm2,16(%esi) movq mm3,24(%esi) movq mm4,32(%esi) movq mm5,40(%esi) movq mm6,48(%esi) movq mm7,56(%esi) movq 0(%edi),mm0 movq 8(%edi),mm1 movq 16(%edi),mm2 movq 24(%edi),mm3 movq 32(%edi),mm4 movq 40(%edi),mm5 movq 48(%edi),mm6 movq 56(%edi),mm7 === Also, at the end of the routine: === leal -8(%ebp),%esp popl %esi popl %edi leave ret === you need to insert an "empty mmx state" instruction: === leal -8(%ebp),%esp popl %esi popl %edi .byte 0x0f, 0x77 ; emms leave ret === Make these changes and let us know what you get. Unfortunately, we don't have an assembler that works on this yet so I'm not sure how to translate the "movq" instructions. Intel's manual (that I just downloaded from "http://developer.intel.com/design/mmx/") says that the opcode for "movq mm, mm/m64" is "0f 6f /r" and "movq mm/m64, mm" is "0f 7f /r". Someone who's more proficient in Intel's terminology can probably translate this. Satoshi ------- # This is a shell archive. Save it in a file, remove anything before # this line, and then unpack it by entering "sh file". Note, it may # create directories; files and directories will be owned by you and # have default permissions. 
#
# This archive contains:
#
#	runtest
#	mem.c
#	unrolled.s
#
echo x - runtest
sed 's/^X//' >runtest << 'END-of-runtest'
X#!/bin/sh
X
X# Run the given benchmark executable (looked up in the current directory)
X# once per block size, doubling from 32 bytes while the size is <= 1000000,
X# and print one "size bandwidth" row per run.
Xif [ $# != 1 ]; then
X	echo "usage: $0 executable" >&2
X	exit 1
Xfi
Xexec=$1
X
Xi=32
Xecho " size bandwidth"
Xwhile [ $i -le 1000000 ]; do
X	# The benchmark reports on stderr; route stderr into the pipe and
X	# drop stdout.  Field 5 of the report is the MB/s figure.
X	"./$exec" $i 2>&1 >/dev/null | \
X	awk "{printf(\" %7d %9s MB/s\n\", $i, \$5)}"
X	i=$(($i * 2))
Xdone
END-of-runtest
echo x - mem.c
sed 's/^X//' >mem.c << 'END-of-mem.c'
X/*
X * mem.c - simple memory copy test
X *
X * Usage: mem size[K|M]
X *
X * Copies a region of (at most) TOTALSIZE bytes in blocks of "size" bytes
X * using the assembly routine unrolled() and reports the achieved
X * bandwidth on stderr.
X */
X#include <sys/time.h>
X#include <ctype.h>
X#include <stdio.h>
X#include <stdlib.h>
X#include <string.h>
X
X#define TOTALSIZE 4194304 /* 4MB */
X
X/* Block copy implemented in unrolled.s: copies len bytes from src to dst. */
Xvoid unrolled(const void *src, void *dst, unsigned int len);
X
Xint main(int argc, char **argv)
X{
X	int usecs, i, bytes, total, mult;
X	size_t arglen;
X	unsigned char *src, *dst;
X	unsigned long tmp;
X	struct timeval tv, start, stop;
X	int N;
X
X	if (argc != 2) {
X		fprintf(stderr,
X			"Usage: %s size\n", argv[0]);
X		exit(1);
X	}
X
X	/* An empty argument has no last character to inspect. */
X	arglen = strlen(argv[1]);
X	if (arglen == 0) {
X		fprintf(stderr, "Usage: %s size\n", argv[0]);
X		exit(1);
X	}
X
X	/* Optional K/M suffix scales the numeric prefix. */
X	mult = 1;
X	switch (toupper((unsigned char) argv[1][arglen - 1])) {
X	case 'K':
X		mult = 1024;
X		break;
X	case 'M':
X		mult = 1024 * 1024;
X		break;
X	default:
X		break;
X	}
X
X	bytes = atoi(argv[1]);
X	if (bytes <= 0) {
X		/* Zero would divide by zero below; negatives are meaningless. */
X		fprintf(stderr, "size must be a positive number\n");
X		exit(1);
X	}
X	/* Compare before multiplying so the scaled size cannot overflow. */
X	if (bytes > TOTALSIZE / mult) {
X		fprintf(stderr, "size cannot be more than %d\n", TOTALSIZE);
X		exit(1);
X	}
X	bytes *= mult;
X
X	N = TOTALSIZE/bytes;
X	total = bytes * N;
X	src = (unsigned char *) malloc(total + 16384);
X	dst = (unsigned char *) malloc(total + 16384);
X	if (!src || !dst) {
X		perror("malloc");
X		exit(1);
X	}
X
X	/* align arrays to 8K boundaries */
X	tmp = (unsigned long) src;
X	tmp += 8192;
X	tmp &= ~8191;
X	src = (unsigned char *) tmp;
X	tmp = (unsigned long) dst;
X	tmp += 8192;
X	tmp &= ~8191;
X	dst = (unsigned char *) tmp;
X
X	/* fill in src with random junk */
X	gettimeofday(&tv, NULL);
X	srandom(tv.tv_usec);
X	for (i = 0 ; i < total ; i++)
X		src[i] = random();
X	bzero(dst, total);
X
X	/* ensure both src and dst are not swapped out */
X	bcopy(src, dst, total);
X
X	/* main loop */
X	gettimeofday(&start, NULL) ;
X	for (i = 0; i < N; ++i) {
X		unrolled(src+bytes*i, dst+bytes*i, (unsigned int) bytes);
X	}
X	gettimeofday(&stop, NULL) ;
X
X	usecs = (stop.tv_sec - start.tv_sec)*1000000 +
X		(stop.tv_usec - start.tv_usec) ;
X
X	/* make sure everything is copied correctly (all N blocks) */
X	for (i = 0 ; i < total ; i++)
X		if (src[i] != dst[i])
X			printf("error: byte %d, should be %02x, is %02x\n",
X			       i, src[i], dst[i]);
X
X	fprintf(stderr, "%d bytes copied at %f MB/s\n", bytes,
X		(double) bytes / 1024 / 1024 / (usecs / 1000000.0 / N));
X
X	return (0);
X}
END-of-mem.c
echo x - unrolled.s
sed 's/^X//' >unrolled.s << 'END-of-unrolled.s'
X/*
X * unrolled(src, dst, len) - copy len bytes from src to dst.
X *
X * Arguments are read from the stack: src at 8(%ebp), dst at 12(%ebp),
X * len at 16(%ebp).  Lengths of 64 bytes or more are processed in chunks
X * of at most 1792 bytes; each chunk is first read through with one load
X * per 32 bytes and then copied 64 bytes per iteration with fildq/fistpq.
X * Whatever is left (fewer than 64 bytes) is copied with rep movsl/movsb.
X */
X.text
X	.align 2
X	.globl _unrolled
X	.type _unrolled,@function
X_unrolled:
X	pushl %ebp
X	movl %esp,%ebp
X	pushl %edi
X	pushl %esi
X	movl 8(%ebp),%esi
X	movl 12(%ebp),%edi
X	movl 16(%ebp),%ecx
X
X	/* 63 bytes or fewer: skip straight to the string-move tail */
X	cmpl $63,%ecx
X	jbe unrolled_tail
X
X4:
X	/* save total remaining, clamp this chunk to 1792 bytes */
X	pushl %ecx
X	cmpl $1792,%ecx
X	jbe 2f
X	movl $1792,%ecx
X2:
X	/* saved count on the stack becomes "remaining after this chunk" */
X	subl %ecx,0(%esp)
X	cmpl $256,%ecx
X	jb 5f
X	pushl %esi
X	pushl %ecx
X	.align 4,0x90
X3:
X	/* read one word every 32 bytes of the source and discard it;
X	   presumably this pulls the chunk into the cache before copying */
X	movl 0(%esi),%eax
X	movl 32(%esi),%eax
X	movl 64(%esi),%eax
X	movl 96(%esi),%eax
X	movl 128(%esi),%eax
X	movl 160(%esi),%eax
X	movl 192(%esi),%eax
X	movl 224(%esi),%eax
X	addl $256,%esi
X	subl $256,%ecx
X	cmpl $256,%ecx
X	jae 3b
X	popl %ecx
X	popl %esi
X5:
X	.align 2,0x90
Xunrolled_loop:
X	/* load eight 64-bit values onto the FPU stack ... */
X	fildq 0(%esi)
X	fildq 8(%esi)
X	fildq 16(%esi)
X	fildq 24(%esi)
X	fildq 32(%esi)
X	fildq 40(%esi)
X	fildq 48(%esi)
X	fildq 56(%esi)
X	/* ... and pop them back out in reverse order */
X	fistpq 56(%edi)
X	fistpq 48(%edi)
X	fistpq 40(%edi)
X	fistpq 32(%edi)
X	fistpq 24(%edi)
X	fistpq 16(%edi)
X	fistpq 8(%edi)
X	fistpq 0(%edi)
X	addl $-64,%ecx
X	addl $64,%esi
X	addl $64,%edi
X	cmpl $63,%ecx
X	ja unrolled_loop
X	/* add back the bytes that were held over for later chunks */
X	popl %eax
X	addl %eax,%ecx
X	cmpl $64,%ecx
X	jae 4b
X
Xunrolled_tail:
X	/* copy remaining dwords, then the last 0-3 bytes */
X	movl %ecx,%eax
X	shrl $2,%ecx
X	cld
X	rep
X	movsl
X	movl %eax,%ecx
X	andl $3,%ecx
X	rep
X	movsb
X
X	leal -8(%ebp),%esp
X	popl %esi
X	popl %edi
X	leave
X	ret
X
Xunrolled_end:
X	.size _unrolled,unrolled_end-_unrolled
END-of-unrolled.s
exit