From owner-freebsd-current Fri Apr 5 03:47:25 1996 Return-Path: owner-current Received: (from root@localhost) by freefall.freebsd.org (8.7.3/8.7.3) id DAA12742 for current-outgoing; Fri, 5 Apr 1996 03:47:25 -0800 (PST) Received: from silvia.HIP.Berkeley.EDU (silvia.HIP.Berkeley.EDU [136.152.64.181]) by freefall.freebsd.org (8.7.3/8.7.3) with ESMTP id DAA12737 Fri, 5 Apr 1996 03:47:17 -0800 (PST) Received: (from asami@localhost) by silvia.HIP.Berkeley.EDU (8.7.5/8.6.9) id DAA24903; Fri, 5 Apr 1996 03:44:34 -0800 (PST) Date: Fri, 5 Apr 1996 03:44:34 -0800 (PST) Message-Id: <199604051144.DAA24903@silvia.HIP.Berkeley.EDU> To: davidg@root.com CC: current@freebsd.org, nisha@cs.berkeley.edu, tege@matematik.su.se, dyson@freebsd.org, hasty@rah.star-gate.com In-reply-to: <199604051021.CAA00222@Root.COM> (message from David Greenman on Fri, 05 Apr 1996 02:21:48 -0800) Subject: Re: fast memory copy for large data sizes From: asami@cs.berkeley.edu (Satoshi Asami) Sender: owner-current@freebsd.org X-Loop: FreeBSD.org Precedence: bulk By the way, if someone wants to try putting it into the kernel, here is a patch to support.s. Change the two "cmpl $1024" lines if you want to change the cutoff. We've been running this on our -current system here for a couple of days, it seems to be working fine. As I said, it pushed up the maximum read bandwidth (through the filesystem) for our disk array from 21MB/s to 23MB/s. (I didn't see much speed difference (only about 100KB/s) for single disks though, the bottleneck is probably not here in this case.) I also wrote a small program to just issue multiple reads to the same region of a file, and I got 37MB/s for the stock kernel and 49MB/s for the modified version on the 133MHz Pentium (which is about the same as what I got from the user-level code). Here's the testing program. Sorry I didn't have time to clean it up, it's kinda messy. But all you need to see is the loop that has memcpy() and lseek() (no, the memcpy() is not called by default). 
===
/* rawread.c: repeatedly read from same block over and over */

/*
 * NOTE(review): the archived copy of this program had 13 #include
 * directives whose header names were stripped (the text between angle
 * brackets was lost).  The headers below are the ones the visible code
 * needs; the original list was longer.
 */
#include <sys/types.h>
#include <sys/time.h>

#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* some constants */
#define True (1)
#define False (0)
#define Ok (0)
#define Error (1)

/* start of configurable parameters */

/* default buffer size */
#define BlockSize 8192

/* default size of file */
#define TotalSize 67108864

int removefile = True ;
int verbose = False ;
int writeonly = False ;
int randomize = False ;
int readonly = False ;

/* end of configurable parameters */

/* default name of temporary file */
#define TmpFile "disktest.tmp"

/* default line length */
#define LineLen 1024

char *myname ;

void *xmalloc(size_t size) ;
void usage(int retval) ;
void error(const char *msg) ;
/* declared here but not defined anywhere in this program */
void remfile(void) ;
void flushoutput(void) ;
void cuechild(void) ;

/*
 * Parse `text` as a strictly positive decimal int.
 * Anything else (empty string, trailing garbage, zero, negative,
 * out of range) prints the usage message and exits with Error.
 */
static int parse_positive(const char *text)
{
	char *end ;
	long value ;

	errno = 0 ;
	value = strtol(text, &end, 10) ;
	if (end == text || *end != '\0' || errno == ERANGE ||
	    value <= 0 || value > INT_MAX)
		usage(Error) ;
	return (int) value ;
}

/*
 * usage: rawread [-b bufsize] [-s size] [-c] filename
 *
 * Reads the first `bufsize` bytes of `filename` size/bufsize times,
 * seeking back to offset 0 after every read, and reports the achieved
 * throughput.  With -c each block is additionally memcpy()ed into a
 * second buffer.  Returns Ok on success; every failure exits with Error.
 */
int main(int argc, char **argv)
{
	int i ;
	char *filename ;
	int blocksize = BlockSize ;
	int totalsize = TotalSize ;
	int iterations ;
	int transferred ;
	void *buffer, *buffer2 = NULL ;
	int fd ;
	int count ;
	struct timeval tv_start, tv_end ;
	double elapsed, rate ;
	int docopy = False ;

	myname = (argc > 0) ? argv[0] : "rawread" ;

	/* check argc before touching argv[argc-1] */
	if (argc < 2)
		usage(Error) ;
	filename = argv[argc-1] ;

	/* the last argument is always the filename, so stop before it */
	for (i = 1 ; i < argc-1 ; i++) {
		if (!strcmp(argv[i], "-b") || !strcmp(argv[i], "-s")) {
			/*
			 * The option value must exist and must not be the
			 * trailing filename, otherwise "rawread -b 8192"
			 * would use "8192" both as size and as filename.
			 */
			if (i+1 >= argc-1)
				usage(Error) ;
			if (argv[i][1] == 'b')
				blocksize = parse_positive(argv[i+1]) ;
			else
				totalsize = parse_positive(argv[i+1]) ;
			i++ ;
		}
		else if (!strcmp(argv[i], "-c"))
			docopy = True ;
		else if (!strcmp(argv[i], "-h"))
			usage(Ok) ;
		else
			usage(Error) ;
	}

	if (filename[0] == '-')
		usage(Error) ;

	iterations = totalsize / blocksize ;
	/* what is really read; differs from totalsize if it is not a multiple */
	transferred = iterations * blocksize ;

	buffer = xmalloc(blocksize) ;
	if (docopy)
		buffer2 = xmalloc(blocksize) ;

	if ((fd = open(filename, O_RDONLY, 0)) < 0) {
		fprintf(stderr, "file: %s\n", filename) ;
		error("open") ;
	}

	gettimeofday(&tv_start, NULL) ;

	for (count = 0 ; count < iterations ; count++) {
		ssize_t got = read(fd, buffer, blocksize) ;

		if (got < 0)
			error("read") ;
		if (got != blocksize) {
			/* errno is not set for a short read, so no perror() */
			fprintf(stderr,
			    "%s: short read (%ld of %d bytes) from \"%s\"\n",
			    myname, (long) got, blocksize, filename) ;
			exit(Error) ;
		}
		if (docopy)
			memcpy(buffer2, buffer, blocksize) ;
		if (lseek(fd, 0, SEEK_SET) == (off_t) -1)
			error("lseek") ;
	}

	gettimeofday(&tv_end, NULL) ;

	elapsed = (double) (tv_end.tv_sec - tv_start.tv_sec)
	    + (double) (tv_end.tv_usec - tv_start.tv_usec) / 1000000.0 ;

	/* keep the rate in a double: it can exceed INT_MAX, and elapsed may be 0 */
	rate = (elapsed > 0.0) ? transferred / elapsed : 0.0 ;

	if (verbose)
		printf("%d reads of %d bytes in %f seconds\n",
		    count, blocksize, elapsed) ;

	printf("%d bytes transferred in %d secs (%.0f bytes/sec) from \"%s\"\n",
	    transferred, (int) elapsed, rate, filename) ;
	fflush(stdout) ;

	close(fd) ;
	free(buffer2) ;
	free(buffer) ;

	return Ok ;
}

/* Print the command synopsis to stderr and exit with status `retval`. */
void usage(int retval)
{
	fprintf(stderr, "usage: %s [-b bufsize] [-s size] [-c] filename\n", myname) ;
	exit(retval) ;
}

/* malloc() that never returns NULL: on failure it reports and exits with Error. */
void *xmalloc(size_t size)
{
	void *vp ;

	if ((vp = malloc(size)) == NULL) {
		/* size is a size_t, so it needs %zu rather than %d */
		fprintf(stderr, "panic: memory exausted with request size %zu\n", size) ;
		exit(Error) ;
	}

	return vp ;
}

/* Report the current errno via perror(msg) and exit with Error. */
void error(const char *msg)
{
	perror(msg) ;
	exit(Error) ;
}
===

Use it as:

	dd if=/dev/zero of=foo bs=1024 count=1024
	rawread -b 1048576 -s 104857600 foo

The number after -b is the read request size and the one after -s is the total size (it will repeat it 100 times in the above case). Note that for small -b sizes, the copy will always be done within the cache, so the resulting number may not reflect the real world performance.
Here's the patch:

===
Index: support.s
===================================================================
RCS file: /usr/cvs/src/sys/i386/i386/support.s,v
retrieving revision 1.31
diff -u -r1.31 support.s
--- support.s	1995/12/28 23:14:40	1.31
+++ support.s	1996/04/04 06:27:15
@@ -463,6 +463,14 @@
 /* bcopy(%esi, %edi, %ebx) */
 3:
 	movl	%ebx,%ecx
+	cmpl	$1024,%ecx		/* lengths <= 1024 stay on the string-move path */
+	jbe	slow_copyout
+
+	call	fastmove
+	jmp	done_copyout
+
+	ALIGN_TEXT
+slow_copyout:
 	shrl	$2,%ecx
 	cld
 	rep
@@ -510,6 +518,14 @@
 	cmpl	$VM_MAXUSER_ADDRESS,%edx
 	ja	copyin_fault
 
+	cmpl	$1024,%ecx		/* lengths <= 1024 stay on the string-move path */
+	jbe	slow_copyin
+
+	call	fastmove
+	jmp	done_copyin
+
+	ALIGN_TEXT
+slow_copyin:
 	movb	%cl,%al
 	shrl	$2,%ecx				/* copy longword-wise */
 	cld
@@ -520,6 +536,8 @@
 	rep
 	movsb
 
+	ALIGN_TEXT
+done_copyin:
 	popl	%edi
 	popl	%esi
 	xorl	%eax,%eax
@@ -534,6 +552,70 @@
 	movl	_curpcb,%edx
 	movl	$0,PCB_ONFAULT(%edx)
 	movl	$EFAULT,%eax
+	ret
+
+/* fastmove(src, dst, len)
+	src in %esi
+	dst in %edi
+	len in %ecx
+	uses %eax and %edx for tmp. storage; %edx holds the entry value of %cr0 throughout
+ */
+	ALIGN_TEXT
+fastmove:
+	movl	%cr0,%edx		/* keep the original CR0 in %edx for the restore at the end */
+	movl	$8, %eax		/* CR0_TS */
+	not	%eax
+	andl	%edx,%eax		/* %eax = CR0 with CR0_TS cleared; %edx is left untouched */
+	movl	%eax,%cr0
+
+	cmpl	$63,%ecx
+	jbe	L57
+
+	subl	$108,%esp		/* stack space for the fsave image */
+	fsave	(%esp)
+
+	ALIGN_TEXT
+L58:
+	fildq	0(%esi)			/* push 64 bytes as eight quadwords */
+	fildq	8(%esi)
+	fildq	16(%esi)
+	fildq	24(%esi)
+	fildq	32(%esi)
+	fildq	40(%esi)
+	fildq	48(%esi)
+	fildq	56(%esi)
+	fxch	%st(7)			/* the fxch's put the quadwords back in load order */
+	fistpq	0(%edi)
+	fxch	%st(5)
+	fistpq	8(%edi)
+	fxch	%st(3)
+	fistpq	16(%edi)
+	fxch	%st(1)
+	fistpq	24(%edi)
+	fistpq	32(%edi)
+	fistpq	40(%edi)
+	fistpq	48(%edi)
+	fistpq	56(%edi)
+	addl	$-64,%ecx
+	addl	$64,%esi
+	addl	$64,%edi
+	cmpl	$63,%ecx
+	ja	L58
+
+	frstor	(%esp)
+	addl	$108,%esp
+
+	ALIGN_TEXT
+L57:
+	cld
+	rep				/* copy whatever is left (< 64 bytes after the loop) */
+	movsb
+
+	andl	$8,%edx			/* isolate the CR0_TS bit as it was on entry */
+	movl	%cr0,%eax
+	orl	%edx, %eax		/* reset CR0_TS to the original value */
+	movl	%eax,%cr0
+	ret
 
 /*
===

Enjoy,
Satoshi