Date: Thu, 22 Sep 2011 11:02:22 -0700 From: Artem Belevich <art@freebsd.org> To: freebsd-fs@freebsd.org Cc: Andriy Gapon <avg@freebsd.org> Subject: bootloader block cache improvement Message-ID: <CAFqOu6hu%2Bx89utyjT=4BeDN%2BWNmZpzSFGkdxeXop_-bXeb4kbA@mail.gmail.com>
next in thread | raw e-mail | index | archive | help
Hi, I've had ZFS-only box that boots off 8-drive raidz2 array. I've noticed that on this machine it takes noticeably longer to load kernel and modules than on a similar box that boots off 1-drive ZFS filesystem. It turns out that block cache in loader only caches data from one disk only and invalidates the cache as soon as we read from another drive. With ZFS reading from multiple drives when filesystem is on a raidz pool the cache was effectively useless in that scenario. I've got literally 0 hits reported by bcachestat command. The patch below modifies bootloader block cache so that it can cache data from multiple drives simultaneously. That helped a bit. Another issue is the size of the cache. Currently it's set at 32 512-byte blocks. While it may be enough to cache frequently used data on UFS, it seems to be way too small for ZFS. On my machine the sweet spot seems to be just below 256. Setting it to 256 resulted in sharp increase of cache misses, probably because single 128K record (default on ZFS) would evict all other data from cache. In the end I've settled on 192 blocks which resulted in about 90% hit rate. Here's the data from 'bcachestat' command in the loader on 3-disk raidz: | cache size | ops | bypass | hits | misses | |-------------+------+--------+-------+--------| | 32(w/patch) | 6541 | 287 | 6988 | ~15K | | 128 | 6541 | 281 | ~15K | 7560 | | 192 | 6541 | 281 | 12883 | 1394 | | 256 | 6541 | 15 | ~19K | ~37K | Attached is the patch with the changes. If the changes are acceptable, I'd like to eventually MFC them to 8-stable, too. --Artem ================================================================ --- sys/boot/common/bcache.c | 50 ++++++++++++++++++++++--------------------- sys/boot/i386/loader/main.c | 5 +++- 2 files changed, 29 insertions(+), 26 deletions(-) diff --git a/sys/boot/common/bcache.c b/sys/boot/common/bcache.c index c88adca..85e47a6 100644 --- a/sys/boot/common/bcache.c +++ b/sys/boot/common/bcache.c @@ -55,6 +55,7 @@ struct bcachectl daddr_t bc_blkno; time_t bc_stamp; int bc_count; + int bc_unit; }; static struct bcachectl *bcache_ctl; @@ -66,9 +67,9 @@ static u_int bcache_hits, bcache_misses, bcache_ops, bcache_bypasses; static u_int bcache_flushes; static u_int bcache_bcount; -static void bcache_invalidate(daddr_t blkno); -static void bcache_insert(caddr_t buf, daddr_t blkno); -static int bcache_lookup(caddr_t buf, daddr_t blkno); +static void bcache_invalidate(int unit, daddr_t blkno); +static void bcache_insert(caddr_t buf, int unit, daddr_t blkno); +static int bcache_lookup(caddr_t buf, int unit, daddr_t blkno); /* * Initialise the cache for (nblks) of (bsize). @@ -117,6 +118,7 @@ bcache_flush(void) for (i = 0; i < bcache_nblks; i++) { bcache_ctl[i].bc_count = -1; bcache_ctl[i].bc_blkno = -1; + bcache_ctl[i].bc_unit = -1; } } @@ -136,7 +138,7 @@ write_strategy(void *devdata, int unit, int rw, daddr_t blk, size_t size, /* Invalidate the blocks being written */ for (i = 0; i < nblk; i++) { - bcache_invalidate(blk + i); + bcache_invalidate(unit, blk + i); } /* Write the blocks */ @@ -145,7 +147,7 @@ write_strategy(void *devdata, int unit, int rw, daddr_t blk, size_t size, /* Populate the block cache with the new data */ if (err == 0) { for (i = 0; i < nblk; i++) { - bcache_insert(buf + (i * bcache_blksize),blk + i); + bcache_insert(buf + (i * bcache_blksize), unit, blk + i); } } @@ -171,7 +173,7 @@ read_strategy(void *devdata, int unit, int rw, daddr_t blk, size_t size, /* Satisfy any cache hits up front */ for (i = 0; i < nblk; i++) { - if (bcache_lookup(buf + (bcache_blksize * i), blk + i)) { + if (bcache_lookup(buf + (bcache_blksize * i), unit, blk + i)) { bit_set(bcache_miss, i); /* cache miss */ bcache_misses++; } else { @@ -200,7 +202,7 @@ read_strategy(void *devdata, int unit, int rw, daddr_t blk, size_t size, if (result != 0) goto done; for (j = 0; j < p_size; j++) - bcache_insert(p_buf + (j * bcache_blksize), p_blk + j); + bcache_insert(p_buf + (j * bcache_blksize), unit, p_blk + j); p_blk = -1; } } @@ -210,7 +212,7 @@ read_strategy(void *devdata, int unit, int rw, daddr_t blk, size_t size, if (result != 0) goto done; for (j = 0; j < p_size; j++) - bcache_insert(p_buf + (j * bcache_blksize), p_blk + j); + bcache_insert(p_buf + (j * bcache_blksize), unit, p_blk + j); } done: @@ -227,19 +229,13 @@ int bcache_strategy(void *devdata, int unit, int rw, daddr_t blk, size_t size, char *buf, size_t *rsize) { - static int bcache_unit = -1; struct bcache_devdata *dd = (struct bcache_devdata *)devdata; bcache_ops++; - if(bcache_unit != unit) { - bcache_flush(); - bcache_unit = unit; - } - /* bypass large requests, or when the cache is inactive */ if ((bcache_data == NULL) || ((size * 2 / bcache_blksize) > bcache_nblks)) { - DEBUG("bypass %d from %d", size / bcache_blksize, blk); + DEBUG("bypass %d from %d", size / bcache_blksize, (int)blk); bcache_bypasses++; return(dd->dv_strategy(dd->dv_devdata, rw, blk, size, buf, rsize)); } @@ -260,7 +256,7 @@ bcache_strategy(void *devdata, int unit, int rw, daddr_t blk, size_t size, * XXX the LRU algorithm will fail after 2^31 blocks have been transferred. */ static void -bcache_insert(caddr_t buf, daddr_t blkno) +bcache_insert(caddr_t buf, int unit, daddr_t blkno) { time_t now; int cand, ocount; @@ -272,7 +268,7 @@ bcache_insert(caddr_t buf, daddr_t blkno) /* find the oldest block */ for (i = 1; i < bcache_nblks; i++) { - if (bcache_ctl[i].bc_blkno == blkno) { + if (bcache_ctl[i].bc_unit == unit && bcache_ctl[i].bc_blkno == blkno) { /* reuse old entry */ cand = i; break; @@ -283,8 +279,9 @@ bcache_insert(caddr_t buf, daddr_t blkno) } } - DEBUG("insert blk %d -> %d @ %d # %d", blkno, cand, now, bcache_bcount); + DEBUG("insert blk %d:%d -> %d @ %d # %d", unit, blkno, cand, now, bcache_bcount); bcopy(buf, bcache_data + (bcache_blksize * cand), bcache_blksize); + bcache_ctl[cand].bc_unit = unit; bcache_ctl[cand].bc_blkno = blkno; bcache_ctl[cand].bc_stamp = now; bcache_ctl[cand].bc_count = bcache_bcount++; @@ -296,7 +293,7 @@ bcache_insert(caddr_t buf, daddr_t blkno) * if successful and return zero, or return nonzero on failure. */ static int -bcache_lookup(caddr_t buf, daddr_t blkno) +bcache_lookup(caddr_t buf, int unit, daddr_t blkno) { time_t now; u_int i; @@ -305,9 +302,10 @@ bcache_lookup(caddr_t buf, daddr_t blkno) for (i = 0; i < bcache_nblks; i++) /* cache hit? */ - if ((bcache_ctl[i].bc_blkno == blkno) && ((bcache_ctl[i].bc_stamp + BCACHE_TIMEOUT) >= now)) { + if ((bcache_ctl[i].bc_unit == unit) && (bcache_ctl[i].bc_blkno == blkno) + && ((bcache_ctl[i].bc_stamp + BCACHE_TIMEOUT) >= now)) { bcopy(bcache_data + (bcache_blksize * i), buf, bcache_blksize); - DEBUG("hit blk %d <- %d (now %d then %d)", blkno, i, now, bcache_ctl[i].bc_stamp); + DEBUG("hit blk %d:%d <- %d (now %d then %d)", unit, blkno, i, now, bcache_ctl[i].bc_stamp); return(0); } return(ENOENT); @@ -317,15 +315,16 @@ bcache_lookup(caddr_t buf, daddr_t blkno) * Invalidate a block from the cache. */ static void -bcache_invalidate(daddr_t blkno) +bcache_invalidate(int unit, daddr_t blkno) { u_int i; for (i = 0; i < bcache_nblks; i++) { - if (bcache_ctl[i].bc_blkno == blkno) { + if ((bcache_ctl[i].bc_unit == unit) && (bcache_ctl[i].bc_blkno == blkno)) { bcache_ctl[i].bc_count = -1; bcache_ctl[i].bc_blkno = -1; - DEBUG("invalidate blk %d", blkno); + bcache_ctl[i].bc_unit = -1; + DEBUG("invalidate blk %d:%d", unit, blkno); break; } } @@ -339,7 +338,8 @@ command_bcache(int argc, char *argv[]) u_int i; for (i = 0; i < bcache_nblks; i++) { - printf("%08jx %04x %04x|", (uintmax_t)bcache_ctl[i].bc_blkno, (unsigned int)bcache_ctl[i].bc_stamp & 0xffff, bcache_ctl[i].bc_count & 0xffff); + printf("%02x:%08jx %04x %04x|", bcache_ctl[i].bc_unit, (uintmax_t)bcache_ctl[i].bc_blkno, + (unsigned int)bcache_ctl[i].bc_stamp & 0xffff, bcache_ctl[i].bc_count & 0xffff); if (((i + 1) % 4) == 0) printf("\n"); } diff --git a/sys/boot/i386/loader/main.c b/sys/boot/i386/loader/main.c index 75d5dbc..e1a01fa 100644 --- a/sys/boot/i386/loader/main.c +++ b/sys/boot/i386/loader/main.c @@ -86,6 +86,7 @@ int main(void) { int i; + int bcache_size; /* Pick up arguments */ kargs = (void *)__args; @@ -109,11 +110,13 @@ main(void) heap_bottom = PTOV(high_heap_base); if (high_heap_base < memtop_copyin) memtop_copyin = high_heap_base; + bcache_size = 192; } else #endif { heap_top = (void *)PTOV(bios_basemem); heap_bottom = (void *)end; + bcache_size = 32; } setheap(heap_bottom, heap_top); @@ -140,7 +143,7 @@ main(void) /* * Initialise the block cache */ - bcache_init(32, 512); /* 16k cache XXX tune this */ + bcache_init(bcache_size, 512); /* * Special handling for PXE and CD booting.
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?CAFqOu6hu%2Bx89utyjT=4BeDN%2BWNmZpzSFGkdxeXop_-bXeb4kbA>