Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 22 Sep 2011 11:02:22 -0700
From:      Artem Belevich <art@freebsd.org>
To:        freebsd-fs@freebsd.org
Cc:        Andriy Gapon <avg@freebsd.org>
Subject:   bootloader block cache improvement
Message-ID:  <CAFqOu6hu%2Bx89utyjT=4BeDN%2BWNmZpzSFGkdxeXop_-bXeb4kbA@mail.gmail.com>

next in thread | raw e-mail | index | archive | help
Hi,

I've had ZFS-only box that boots off 8-drive raidz2 array. I've noticed
that on this machine it takes noticeably longer to load kernel and
modules than on a similar box that boots off 1-drive ZFS filesystem.

It turns out that block cache in loader only caches data from one disk
only and invalidates the cache as soon as we read from another
drive. With ZFS reading from multiple drives when filesystem is on a
raidz pool the cache was effectively useless in that scenario. I've got
literally 0 hits reported by bcachestat command.

The patch below modifies bootloader block cache so that it can cache
data from multiple drives simultaneously. That helped a bit.

Another issue is the size of the cache. Currently it's set at 32
512-byte blocks. While it may be enough to cache frequently used data on
UFS, it seems to be way too small for ZFS. On my machine the sweet spot
seems to be just below 256. Setting it to 256 resulted in sharp increase
of cache misses, probably because single 128K record (default on ZFS)
would evict all other data from cache. In the end I've settled on 192
blocks which resulted in about 90% hit rate.

Here's the data from 'bcachestat' command in the loader on 3-disk raidz:

 |  cache size |  ops | bypass | hits  | misses |
 |-------------+------+--------+-------+--------|
 | 32(w/patch) | 6541 |    287 | 6988  | ~15K   |
 |         128 | 6541 |    281 | ~15K  | 7560   |
 |         192 | 6541 |    281 | 12883 | 1394   |
 |         256 | 6541 |     15 | ~19K  | ~37K   |

Attached is the patch with the changes.

If the changes are acceptable, I'd like to eventually MFC them to
8-stable, too.

--Artem


================================================================

---
 sys/boot/common/bcache.c    |   50 ++++++++++++++++++++++---------------------
 sys/boot/i386/loader/main.c |    5 +++-
 2 files changed, 29 insertions(+), 26 deletions(-)

diff --git a/sys/boot/common/bcache.c b/sys/boot/common/bcache.c
index c88adca..85e47a6 100644
--- a/sys/boot/common/bcache.c
+++ b/sys/boot/common/bcache.c
@@ -55,6 +55,7 @@ struct bcachectl
     daddr_t	bc_blkno;
     time_t	bc_stamp;
     int		bc_count;
+    int		bc_unit;
 };

 static struct bcachectl	*bcache_ctl;
@@ -66,9 +67,9 @@ static u_int		bcache_hits, bcache_misses,
bcache_ops, bcache_bypasses;
 static u_int		bcache_flushes;
 static u_int		bcache_bcount;

-static void	bcache_invalidate(daddr_t blkno);
-static void	bcache_insert(caddr_t buf, daddr_t blkno);
-static int	bcache_lookup(caddr_t buf, daddr_t blkno);
+static void	bcache_invalidate(int unit, daddr_t blkno);
+static void	bcache_insert(caddr_t buf, int unit, daddr_t blkno);
+static int	bcache_lookup(caddr_t buf, int unit, daddr_t blkno);

 /*
  * Initialise the cache for (nblks) of (bsize).
@@ -117,6 +118,7 @@ bcache_flush(void)
     for (i = 0; i < bcache_nblks; i++) {
 	bcache_ctl[i].bc_count = -1;
 	bcache_ctl[i].bc_blkno = -1;
+	bcache_ctl[i].bc_unit  = -1;
     }
 }

@@ -136,7 +138,7 @@ write_strategy(void *devdata, int unit, int rw,
daddr_t blk, size_t size,

     /* Invalidate the blocks being written */
     for (i = 0; i < nblk; i++) {
-	bcache_invalidate(blk + i);
+	bcache_invalidate(unit, blk + i);
     }

     /* Write the blocks */
@@ -145,7 +147,7 @@ write_strategy(void *devdata, int unit, int rw,
daddr_t blk, size_t size,
     /* Populate the block cache with the new data */
     if (err == 0) {
     	for (i = 0; i < nblk; i++) {
-	    bcache_insert(buf + (i * bcache_blksize),blk + i);
+	    bcache_insert(buf + (i * bcache_blksize), unit, blk + i);
     	}
     }

@@ -171,7 +173,7 @@ read_strategy(void *devdata, int unit, int rw,
daddr_t blk, size_t size,

     /* Satisfy any cache hits up front */
     for (i = 0; i < nblk; i++) {
-	if (bcache_lookup(buf + (bcache_blksize * i), blk + i)) {
+	if (bcache_lookup(buf + (bcache_blksize * i), unit, blk + i)) {
 	    bit_set(bcache_miss, i);	/* cache miss */
 	    bcache_misses++;
 	} else {
@@ -200,7 +202,7 @@ read_strategy(void *devdata, int unit, int rw,
daddr_t blk, size_t size,
 	    if (result != 0)
 		goto done;
 	    for (j = 0; j < p_size; j++)
-		bcache_insert(p_buf + (j * bcache_blksize), p_blk + j);
+		bcache_insert(p_buf + (j * bcache_blksize), unit, p_blk + j);
 	    p_blk = -1;
 	}
     }
@@ -210,7 +212,7 @@ read_strategy(void *devdata, int unit, int rw,
daddr_t blk, size_t size,
 	if (result != 0)
 	    goto done;
 	for (j = 0; j < p_size; j++)
-	    bcache_insert(p_buf + (j * bcache_blksize), p_blk + j);
+	    bcache_insert(p_buf + (j * bcache_blksize), unit, p_blk + j);
     }

  done:
@@ -227,19 +229,13 @@ int
 bcache_strategy(void *devdata, int unit, int rw, daddr_t blk, size_t size,
 		char *buf, size_t *rsize)
 {
-    static int			bcache_unit = -1;
     struct bcache_devdata	*dd = (struct bcache_devdata *)devdata;

     bcache_ops++;

-    if(bcache_unit != unit) {
-	bcache_flush();
-	bcache_unit = unit;
-    }
-
     /* bypass large requests, or when the cache is inactive */
     if ((bcache_data == NULL) || ((size * 2 / bcache_blksize) >
bcache_nblks)) {
-	DEBUG("bypass %d from %d", size / bcache_blksize, blk);
+	DEBUG("bypass %d from %d", size / bcache_blksize, (int)blk);
 	bcache_bypasses++;
 	return(dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf, rsize));
     }
@@ -260,7 +256,7 @@ bcache_strategy(void *devdata, int unit, int rw,
daddr_t blk, size_t size,
  * XXX the LRU algorithm will fail after 2^31 blocks have been transferred.
  */
 static void
-bcache_insert(caddr_t buf, daddr_t blkno)
+bcache_insert(caddr_t buf, int unit, daddr_t blkno)
 {
     time_t	now;
     int		cand, ocount;
@@ -272,7 +268,7 @@ bcache_insert(caddr_t buf, daddr_t blkno)

     /* find the oldest block */
     for (i = 1; i < bcache_nblks; i++) {
-	if (bcache_ctl[i].bc_blkno == blkno) {
+	if (bcache_ctl[i].bc_unit == unit && bcache_ctl[i].bc_blkno == blkno) {
 	    /* reuse old entry */
 	    cand = i;
 	    break;
@@ -283,8 +279,9 @@ bcache_insert(caddr_t buf, daddr_t blkno)
 	}
     }

-    DEBUG("insert blk %d -> %d @ %d # %d", blkno, cand, now, bcache_bcount);
+    DEBUG("insert blk %d:%d -> %d @ %d # %d", unit, blkno, cand, now,
bcache_bcount);
     bcopy(buf, bcache_data + (bcache_blksize * cand), bcache_blksize);
+    bcache_ctl[cand].bc_unit  = unit;
     bcache_ctl[cand].bc_blkno = blkno;
     bcache_ctl[cand].bc_stamp = now;
     bcache_ctl[cand].bc_count = bcache_bcount++;
@@ -296,7 +293,7 @@ bcache_insert(caddr_t buf, daddr_t blkno)
  * if successful and return zero, or return nonzero on failure.
  */
 static int
-bcache_lookup(caddr_t buf, daddr_t blkno)
+bcache_lookup(caddr_t buf, int unit, daddr_t blkno)
 {
     time_t	now;
     u_int	i;
@@ -305,9 +302,10 @@ bcache_lookup(caddr_t buf, daddr_t blkno)

     for (i = 0; i < bcache_nblks; i++)
 	/* cache hit? */
-	if ((bcache_ctl[i].bc_blkno == blkno) && ((bcache_ctl[i].bc_stamp +
BCACHE_TIMEOUT) >= now)) {
+	if ((bcache_ctl[i].bc_unit == unit) && (bcache_ctl[i].bc_blkno == blkno)
+	    && ((bcache_ctl[i].bc_stamp + BCACHE_TIMEOUT) >= now)) {
 	    bcopy(bcache_data + (bcache_blksize * i), buf, bcache_blksize);
-	    DEBUG("hit blk %d <- %d (now %d then %d)", blkno, i, now,
bcache_ctl[i].bc_stamp);
+	    DEBUG("hit blk %d:%d <- %d (now %d then %d)", unit, blkno, i,
now, bcache_ctl[i].bc_stamp);
 	    return(0);
 	}
     return(ENOENT);
@@ -317,15 +315,16 @@ bcache_lookup(caddr_t buf, daddr_t blkno)
  * Invalidate a block from the cache.
  */
 static void
-bcache_invalidate(daddr_t blkno)
+bcache_invalidate(int unit, daddr_t blkno)
 {
     u_int	i;

     for (i = 0; i < bcache_nblks; i++) {
-	if (bcache_ctl[i].bc_blkno == blkno) {
+	if ((bcache_ctl[i].bc_unit == unit) && (bcache_ctl[i].bc_blkno == blkno)) {
 	    bcache_ctl[i].bc_count = -1;
 	    bcache_ctl[i].bc_blkno = -1;
-	    DEBUG("invalidate blk %d", blkno);
+	    bcache_ctl[i].bc_unit  = -1;
+	    DEBUG("invalidate blk %d:%d", unit, blkno);
 	    break;
 	}
     }
@@ -339,7 +338,8 @@ command_bcache(int argc, char *argv[])
     u_int	i;

     for (i = 0; i < bcache_nblks; i++) {
-	printf("%08jx %04x %04x|", (uintmax_t)bcache_ctl[i].bc_blkno,
(unsigned int)bcache_ctl[i].bc_stamp & 0xffff, bcache_ctl[i].bc_count
& 0xffff);
+	printf("%02x:%08jx %04x %04x|", bcache_ctl[i].bc_unit,
(uintmax_t)bcache_ctl[i].bc_blkno,
+	       (unsigned int)bcache_ctl[i].bc_stamp & 0xffff,
bcache_ctl[i].bc_count & 0xffff);
 	if (((i + 1) % 4) == 0)
 	    printf("\n");
     }
diff --git a/sys/boot/i386/loader/main.c b/sys/boot/i386/loader/main.c
index 75d5dbc..e1a01fa 100644
--- a/sys/boot/i386/loader/main.c
+++ b/sys/boot/i386/loader/main.c
@@ -86,6 +86,7 @@ int
 main(void)
 {
     int			i;
+    int 		bcache_size;

     /* Pick up arguments */
     kargs = (void *)__args;
@@ -109,11 +110,13 @@ main(void)
 	heap_bottom = PTOV(high_heap_base);
 	if (high_heap_base < memtop_copyin)
 	    memtop_copyin = high_heap_base;
+	bcache_size = 192;
     } else
 #endif
     {
 	heap_top = (void *)PTOV(bios_basemem);
 	heap_bottom = (void *)end;
+	bcache_size = 32;
     }
     setheap(heap_bottom, heap_top);

@@ -140,7 +143,7 @@ main(void)
     /*
      * Initialise the block cache
      */
-    bcache_init(32, 512);	/* 16k cache XXX tune this */
+    bcache_init(bcache_size, 512);

     /*
      * Special handling for PXE and CD booting.



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?CAFqOu6hu%2Bx89utyjT=4BeDN%2BWNmZpzSFGkdxeXop_-bXeb4kbA>