Skip site navigation (1)Skip section navigation (2)
Date:      Sun, 24 Oct 2010 18:05:05 +0200
From:      Jean-Francois Dockes <jf@dockes.org>
To:        freebsd-hackers@freebsd.org
Subject:   dump cache performance
Message-ID:  <19652.22833.731361.647776@y.dockes.com>
In-Reply-To: <20101024152835.GA62748@muon.cran.org.uk>
References:  <AANLkTikzZvZn=vNNRtcSViWq8ty7b8qOooQ4NbHiJH5q@mail.gmail.com> <4CC44E5B.9040902@infracaninophile.co.uk> <20101024152835.GA62748@muon.cran.org.uk>

next in thread | previous in thread | raw e-mail | index | archive | help

--7h54wWBw8j
Content-Type: text/plain; charset=us-ascii
Content-Description: message body text
Content-Transfer-Encoding: 7bit

Hello,

I took a look at the "cache management for the dump command" project on the
freebsd.org project ideas page.
http://www.freebsd.org/projects/ideas/ideas.html#p-extenddump

It appears that modifying dump to use a shared cache in a very simple way
(move the control structures to the shared segment and perform simple locking)
yields substantial speed increases.

A patch implementing this is attached. 

Some numbers follow. The test system is an Intel Core i5 750, with recent
SATA disks. The tests all record an improvement with the shared cache, but
the values vary widely, from 7% to 236%. It would be interesting to have
more tests on different configurations.

Would someone be interested in reviewing the patch and/or performing
more tests?

Regards,
J.F. Dockes


Some test results
===================

The command used in all cases is  "dump -0aC XX -f /dev/null filesystem"

The current dump actually uses 5 times the value of the -C option for
cache. The patched version uses a single shared memory segment. So "olddump
-C 10" and "newdump -C 50" are equivalent in terms of cache memory usage.

---------------
Tests performed on a small slice (3.7GB/4GB). The filesystem is quite full, and
has been pushed beyond full then partially pruned a few times to simulate
one which has actually had a life. It contains a mix of /home/ user
files (avg size 68 kB). Tests were run both in single disk and mirror mode.

Mirrored slice
    Split cache -C 10: 18 MB/s
    Shared cache -C 10: 42 MB/s (+133%)

Same slice, without the mirroring
    Split cache -C 10: 11 MB/s
    Shared cache -C 50: 37 MB/s  (+236%)

--------
Tests on /var (500 MB / 5 GB). Mirrored slice
    Split cache -C 10: 15 MB/s
    Shared cache -C 50: 28 MB/s (+86%)

-----------
Tests on a bigger slice (24 GB / 43 GB) with mostly big files. Single disk
    Split cache -C 10: 15 MB/s
    Shared cache -C 50: 35 MB/s (+133%)

-----------
Tests on /usr (464 GB / 595 GB), mirrored
    Split cache, -C 50: 57 MB/s
    Shared cache, -C 250: 63 MB/s (+10%)
Level 1 tests (5GB dump)
    Split cache, -C 50: 38 MB/s
    Shared cache, -C 250: 41 MB/s (+7%)


--7h54wWBw8j
Content-Type: text/x-patch; name="dump-shared-cache.diff"
Content-Disposition: inline;
	filename="dump-shared-cache.diff"
Content-Transfer-Encoding: 7bit

diff -r 01d7ce8f41d5 cache.c
--- a/cache.c	Wed Oct 20 08:39:03 2010 +0200
+++ b/cache.c	Sun Oct 24 11:27:36 2010 +0200
@@ -9,6 +9,8 @@
 #include <sys/param.h>
 #include <sys/stat.h>
 #include <sys/mman.h>
+#include <sys/file.h>
+#include <fcntl.h>
 
 #ifdef sunos
 #include <sys/vnode.h>
@@ -43,6 +45,8 @@
 #define HFACTOR		4
 #define BLKFACTOR	4
 
+static int lockfd;
+
 static char  *DataBase;
 static Block **BlockHash;
 static int   BlockSize;
@@ -55,6 +59,15 @@
 	int i;
 	int hi;
 	Block *base;
+	int data_offs;
+	char tempname[20];
+
+	/* If mkstemp fails, we're back to using flock(), no need for special
+	   action here */
+	strcpy(tempname, "/tmp/dump.XXXXXX");
+	if ((lockfd = mkstemp(tempname)) >= 0) {
+		unlink(tempname);
+	}
 
 	if ((BlockSize = sblock->fs_bsize * BLKFACTOR) > MAXBSIZE)
 		BlockSize = MAXBSIZE;
@@ -64,10 +77,19 @@
 	msg("Cache %d MB, blocksize = %d\n", 
 	    NBlocks * BlockSize / (1024 * 1024), BlockSize);
 
-	base = calloc(sizeof(Block), NBlocks);
-	BlockHash = calloc(sizeof(Block *), HSize);
-	DataBase = mmap(NULL, NBlocks * BlockSize, 
-			PROT_READ|PROT_WRITE, MAP_ANON, -1, 0);
+	data_offs = howmany(sizeof(Block) * NBlocks + sizeof(Block *) * HSize,
+			    getpagesize()) * getpagesize();
+	DataBase = mmap(NULL, NBlocks * BlockSize + data_offs, 
+			PROT_READ|PROT_WRITE, MAP_ANON|MAP_SHARED|MAP_NOSYNC, 
+			-1, 0);
+	if (DataBase == MAP_FAILED) {
+		msg("mmap failed: %s\n", strerror(errno));
+		DataBase = NULL;
+		return;
+	}
+	BlockHash = (Block **)DataBase;
+	base = (Block *)(DataBase + HSize * sizeof(Block *));
+	DataBase += data_offs;
 	for (i = 0; i < NBlocks; ++i) {
 		base[i].b_Data = DataBase + i * BlockSize;
 		base[i].b_Offset = (off_t)-1;
@@ -86,14 +108,18 @@
 	int hi;
 	int n;
 	off_t mask;
+	struct flock lock;
 
 	/*
 	 * If the cache is disabled, or we do not yet know the filesystem
 	 * block size, then revert to pread.  Otherwise initialize the
 	 * cache as necessary and continue.
+	 * This will happen in the top process while mapping and needs no 
+	 * locking.
 	 */
-	if (cachesize <= 0 || sblock->fs_bsize == 0)
+	if (cachesize <= 0 || sblock->fs_bsize == 0) {
 		return(pread(fd, buf, nbytes, offset));
+	}
 	if (DataBase == NULL)
 		cinit();
 
@@ -115,6 +141,20 @@
 	 * occur near the end of the media).
 	 */
 	hi = (offset / BlockSize) % HSize;
+
+	/* Lock hash bucket */
+	lock.l_start = hi;
+	lock.l_len = 1;
+	lock.l_type = F_WRLCK;
+	lock.l_whence = SEEK_SET;
+	    
+	if (lockfd >= 0 ? fcntl(lockfd, F_SETLKW, &lock) :
+	    flock(fd, LOCK_EX)) {
+		msg("shared cache lock failed: %s\n", strerror(errno));
+		return(pread(fd, buf, nbytes, offset));
+	}
+	lock.l_type = F_UNLCK;
+
 	pblk = &BlockHash[hi];
 	ppblk = NULL;
 	while ((blk = *pblk) != NULL) {
@@ -138,8 +178,19 @@
 		*pblk = blk->b_HNext;
 		blk->b_HNext = BlockHash[hi];
 		BlockHash[hi] = blk;
+
+		if (lockfd >= 0 ? fcntl(lockfd, F_SETLKW, &lock) :
+		    flock(diskfd, LOCK_UN)) {
+			msg("unlock shared cache: %s\n", strerror(errno));
+		}
+
 		return(nbytes);
 	} else {
+		if (lockfd >= 0 ? fcntl(lockfd, F_SETLKW, &lock) :
+		    flock(diskfd, LOCK_UN)) {
+			msg("unlock shared cache: %s\n", strerror(errno));
+		}
+
 		return(pread(fd, buf, nbytes, offset));
 	}
 }

--7h54wWBw8j
Content-Type: text/plain; charset=us-ascii
Content-Description: message body text
Content-Transfer-Encoding: 7bit

diff
--7h54wWBw8j
Content-Type: text/x-patch; name="dump-shared-cache.diff"
Content-Disposition: inline;
	filename="dump-shared-cache.diff"
Content-Transfer-Encoding: 7bit

diff -r 01d7ce8f41d5 cache.c
--- a/cache.c	Wed Oct 20 08:39:03 2010 +0200
+++ b/cache.c	Sun Oct 24 11:27:36 2010 +0200
@@ -9,6 +9,8 @@
 #include <sys/param.h>
 #include <sys/stat.h>
 #include <sys/mman.h>
+#include <sys/file.h>
+#include <fcntl.h>
 
 #ifdef sunos
 #include <sys/vnode.h>
@@ -43,6 +45,8 @@
 #define HFACTOR		4
 #define BLKFACTOR	4
 
+static int lockfd;
+
 static char  *DataBase;
 static Block **BlockHash;
 static int   BlockSize;
@@ -55,6 +59,15 @@
 	int i;
 	int hi;
 	Block *base;
+	int data_offs;
+	char tempname[20];
+
+	/* If mkstemp fails, we're back to using flock(), no need for special
+	   action here */
+	strcpy(tempname, "/tmp/dump.XXXXXX");
+	if ((lockfd = mkstemp(tempname)) >= 0) {
+		unlink(tempname);
+	}
 
 	if ((BlockSize = sblock->fs_bsize * BLKFACTOR) > MAXBSIZE)
 		BlockSize = MAXBSIZE;
@@ -64,10 +77,19 @@
 	msg("Cache %d MB, blocksize = %d\n", 
 	    NBlocks * BlockSize / (1024 * 1024), BlockSize);
 
-	base = calloc(sizeof(Block), NBlocks);
-	BlockHash = calloc(sizeof(Block *), HSize);
-	DataBase = mmap(NULL, NBlocks * BlockSize, 
-			PROT_READ|PROT_WRITE, MAP_ANON, -1, 0);
+	data_offs = howmany(sizeof(Block) * NBlocks + sizeof(Block *) * HSize,
+			    getpagesize()) * getpagesize();
+	DataBase = mmap(NULL, NBlocks * BlockSize + data_offs, 
+			PROT_READ|PROT_WRITE, MAP_ANON|MAP_SHARED|MAP_NOSYNC, 
+			-1, 0);
+	if (DataBase == MAP_FAILED) {
+		msg("mmap failed: %s\n", strerror(errno));
+		DataBase = NULL;
+		return;
+	}
+	BlockHash = (Block **)DataBase;
+	base = (Block *)(DataBase + HSize * sizeof(Block *));
+	DataBase += data_offs;
 	for (i = 0; i < NBlocks; ++i) {
 		base[i].b_Data = DataBase + i * BlockSize;
 		base[i].b_Offset = (off_t)-1;
@@ -86,14 +108,18 @@
 	int hi;
 	int n;
 	off_t mask;
+	struct flock lock;
 
 	/*
 	 * If the cache is disabled, or we do not yet know the filesystem
 	 * block size, then revert to pread.  Otherwise initialize the
 	 * cache as necessary and continue.
+	 * This will happen in the top process while mapping and needs no 
+	 * locking.
 	 */
-	if (cachesize <= 0 || sblock->fs_bsize == 0)
+	if (cachesize <= 0 || sblock->fs_bsize == 0) {
 		return(pread(fd, buf, nbytes, offset));
+	}
 	if (DataBase == NULL)
 		cinit();
 
@@ -115,6 +141,20 @@
 	 * occur near the end of the media).
 	 */
 	hi = (offset / BlockSize) % HSize;
+
+	/* Lock hash bucket */
+	lock.l_start = hi;
+	lock.l_len = 1;
+	lock.l_type = F_WRLCK;
+	lock.l_whence = SEEK_SET;
+	    
+	if (lockfd >= 0 ? fcntl(lockfd, F_SETLKW, &lock) :
+	    flock(fd, LOCK_EX)) {
+		msg("shared cache lock failed: %s\n", strerror(errno));
+		return(pread(fd, buf, nbytes, offset));
+	}
+	lock.l_type = F_UNLCK;
+
 	pblk = &BlockHash[hi];
 	ppblk = NULL;
 	while ((blk = *pblk) != NULL) {
@@ -138,8 +178,19 @@
 		*pblk = blk->b_HNext;
 		blk->b_HNext = BlockHash[hi];
 		BlockHash[hi] = blk;
+
+		if (lockfd >= 0 ? fcntl(lockfd, F_SETLKW, &lock) :
+		    flock(diskfd, LOCK_UN)) {
+			msg("unlock shared cache: %s\n", strerror(errno));
+		}
+
 		return(nbytes);
 	} else {
+		if (lockfd >= 0 ? fcntl(lockfd, F_SETLKW, &lock) :
+		    flock(diskfd, LOCK_UN)) {
+			msg("unlock shared cache: %s\n", strerror(errno));
+		}
+
 		return(pread(fd, buf, nbytes, offset));
 	}
 }

--7h54wWBw8j--



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?19652.22833.731361.647776>