From owner-freebsd-bugs@FreeBSD.ORG Tue May  7 09:00:00 2013
From: Adam Nowacki
To: freebsd-gnats-submit@FreeBSD.org
Date: Tue, 7 May 2013 08:57:34 GMT
Message-Id: <201305070857.r478vYev094211@oldred.FreeBSD.org>
Subject: kern/178387: [zfs] [patch] sparse files performance improvements

>Number:         178387
>Category:       kern
>Synopsis:       [zfs] [patch] sparse files performance improvements
>Confidential:   no
>Severity:       non-critical
>Priority:       low
>Responsible:    freebsd-bugs
>State:          open
>Quarter:
>Keywords:
>Date-Required:
>Class:          sw-bug
>Submitter-Id:   current-users
>Arrival-Date:   Tue May 07 09:00:00 UTC 2013
>Closed-Date:
>Last-Modified:
>Originator:     Adam Nowacki
>Release:        9.1-STABLE
>Organization:
>Environment:
>Description:
Reading sparse files in ZFS is slow. The default behavior is to allocate a
temporary dbuf for each hole, zero it, and destroy it on return; nothing is
cached, so read bandwidth is limited by the CPU time spent bzeroing those
temporary dbufs. The attached patch avoids creating temporary dbufs
altogether. Fast 4k reads matter in particular for mmapped files.
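The idea behind the patch can be illustrated in userland. The sketch below is
not the kernel code itself (the names, the region size and the demo driver
are arbitrary): the slow path zeroes a fresh temporary buffer for every hole,
while the patched path copies out of a single preallocated, read-only zero
region, analogous to the kernel's zero_region used by uiozero() in the patch.

#include <stdlib.h>
#include <string.h>

/* arbitrary demo size; the kernel's ZERO_REGION_SIZE is machine-dependent */
#define	ZR_SIZE	(64 * 1024)

static const char zero_region_demo[ZR_SIZE];

/* current behavior: a temporary buffer is allocated and bzeroed per hole */
static void
hole_read_slow(char *dst, size_t len)
{
	char *tmp = malloc(len);

	if (tmp == NULL)
		return;
	memset(tmp, 0, len);	/* this zeroing dominates the cpu time */
	memcpy(dst, tmp, len);
	free(tmp);
}

/* patched idea: satisfy the hole from one shared, read-only zero region */
static void
hole_read_fast(char *dst, size_t len)
{
	size_t bs;

	for (; len > 0; len -= bs, dst += bs) {
		bs = len < ZR_SIZE ? len : ZR_SIZE;
		memcpy(dst, zero_region_demo, bs);
	}
}

int
main(void)
{
	static char dst[1 << 20];

	hole_read_slow(dst, sizeof(dst));
	hole_read_fast(dst, sizeof(dst));
	return (0);
}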
without the patch:

  zfs recordsize=128k, 4k reads:
    104857600 bytes transferred in 1.178753 secs (88956388 bytes/sec)
  zfs recordsize=8M, 4k reads:
    8388608 bytes transferred in 9.134382 secs (918355 bytes/sec)

with the patch:

  zfs recordsize=128k, 4k reads (13x improvement):
    1048576000 bytes transferred in 0.904384 secs (1159436525 bytes/sec)
  zfs recordsize=8M, 4k reads (over 1000x improvement):
    1048576000 bytes transferred in 0.909877 secs (1152437009 bytes/sec)

>How-To-Repeat:
# truncate -s 1000m testfile
# time dd if=testfile of=/dev/null bs=4k
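The dd test above can also be driven from a small C program, which is closer
to how an application would issue 4k reads. This is a minimal sketch (the
file name, the 1000m size and the 4k block size mirror the dd example;
everything else is arbitrary); run it in a directory on the ZFS dataset whose
recordsize is being measured:

#include <sys/time.h>

#include <err.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[4096];
	struct timeval t0, t1;
	double secs;
	off_t total = 0;
	ssize_t n;
	int fd;

	fd = open("testfile", O_RDWR | O_CREAT | O_TRUNC, 0644);
	if (fd == -1)
		err(1, "open");
	/* same as: truncate -s 1000m testfile */
	if (ftruncate(fd, 1000 * 1024 * 1024LL) == -1)
		err(1, "ftruncate");
	gettimeofday(&t0, NULL);
	/* 4k reads, as with dd bs=4k */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		total += n;
	gettimeofday(&t1, NULL);
	secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_usec - t0.tv_usec) / 1e6;
	printf("%jd bytes transferred in %.6f secs (%.0f bytes/sec)\n",
	    (intmax_t)total, secs, total / secs);
	close(fd);
	return (0);
}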
>Fix:
Patch attached with submission follows:

Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h	(revision 250290)
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h	(working copy)
@@ -591,6 +591,7 @@
  */
 #define DMU_READ_PREFETCH	0 /* prefetch */
 #define DMU_READ_NO_PREFETCH	1 /* don't prefetch */
+#define DMU_READ_FAIL_SPARSE	2
 int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	void *buf, uint32_t flags);
 void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	(revision 250290)
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	(working copy)
@@ -412,12 +412,19 @@
 	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 	blkid = dbuf_whichblock(dn, offset);
 	for (i = 0; i < nblks; i++) {
-		dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
-		if (db == NULL) {
-			rw_exit(&dn->dn_struct_rwlock);
-			dmu_buf_rele_array(dbp, nblks, tag);
-			zio_nowait(zio);
-			return (SET_ERROR(EIO));
+		dmu_buf_impl_t *db;
+		int dberr = dbuf_hold_impl(dn, 0, blkid + i,
+		    flags & DMU_READ_FAIL_SPARSE ? TRUE : FALSE, tag, &db);
+		if (dberr) {
+			if (dberr == ENOENT && flags & DMU_READ_FAIL_SPARSE && read) {
+				dbp[i] = NULL;
+				continue;
+			} else {
+				rw_exit(&dn->dn_struct_rwlock);
+				dmu_buf_rele_array(dbp, nblks, tag);
+				zio_nowait(zio);
+				return (SET_ERROR(EIO));
+			}
 		}
 		/* initiate async i/o */
 		if (read)
@@ -442,7 +449,7 @@
 
 	/* wait for other io to complete */
 	if (read) {
-		for (i = 0; i < nblks; i++) {
+		for (i = 0; i < nblks; i++) if (dbp[i]) {
 			dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
 			mutex_enter(&db->db_mtx);
 			while (db->db_state == DB_READ ||
@@ -755,6 +762,7 @@
 
 	while (size > 0) {
 		uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
+		uint64_t end_size = size - mylen;
 		int i;
 
 		/*
@@ -762,17 +770,22 @@
 		 * to be reading in parallel.
 		 */
 		err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
-		    TRUE, FTAG, &numbufs, &dbp, flags);
+		    TRUE, FTAG, &numbufs, &dbp, flags | DMU_READ_FAIL_SPARSE);
 		if (err)
 			break;
 
 		for (i = 0; i < numbufs; i++) {
-			int tocpy;
-			int bufoff;
+			uint64_t bufoff, tocpy;
 			dmu_buf_t *db = dbp[i];
-
-			ASSERT(size > 0);
-
+			if (!db)
+				continue;
+			if (offset < db->db_offset) {
+				uint64_t len = db->db_offset - offset;
+				bzero(buf, len);
+				buf = (char *)buf + len;
+				offset += len;
+				size -= len;
+			}
 			bufoff = offset - db->db_offset;
 			tocpy = (int)MIN(db->db_size - bufoff, size);
 
@@ -782,6 +795,14 @@
 			size -= tocpy;
 			buf = (char *)buf + tocpy;
 		}
+		if (size > end_size) {
+			uint64_t len = size - end_size;
+			bzero(buf, len);
+			buf = (char *)buf + len;
+			offset += len;
+			size -= len;
+		}
+
 		dmu_buf_rele_array(dbp, numbufs, FTAG);
 	}
 	dnode_rele(dn, FTAG);
@@ -981,54 +1002,76 @@
 	dmu_buf_t **dbp;
 	int numbufs, i, err;
 	xuio_t *xuio = NULL;
+	dnode_t *dn;
 
-	/*
-	 * NB: we could do this block-at-a-time, but it's nice
-	 * to be reading in parallel.
-	 */
-	err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG,
-	    &numbufs, &dbp);
-	if (err)
-		return (err);
+	ASSERT(size > 0);
 
 #ifdef UIO_XUIO
 	if (uio->uio_extflg == UIO_XUIO)
 		xuio = (xuio_t *)uio;
 #endif
 
-	for (i = 0; i < numbufs; i++) {
-		int tocpy;
-		int bufoff;
-		dmu_buf_t *db = dbp[i];
+	err = dnode_hold(os, object, FTAG, &dn);
+	if (err)
+		return (err);
 
-		ASSERT(size > 0);
+	/*
+	 * NB: we could do this block-at-a-time, but it's nice
+	 * to be reading in parallel.
	 */
+	/* XXX - optimize sparse reads for xuio too? */
+	err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, TRUE,
+	    FTAG, &numbufs, &dbp,
+	    DMU_READ_PREFETCH | (xuio ? 0 : DMU_READ_FAIL_SPARSE));
+	if (err) {
+		dnode_rele(dn, FTAG);
+		return (err);
+	}
 
-		bufoff = uio->uio_loffset - db->db_offset;
-		tocpy = (int)MIN(db->db_size - bufoff, size);
-
-		if (xuio) {
+	if (xuio) {
+		for (i = 0; i < numbufs; i++) {
+			dmu_buf_t *db = dbp[i];
+			int bufoff = uio->uio_loffset - db->db_offset;
+			int tocpy = MIN(db->db_size - bufoff, size);
 			dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
 			arc_buf_t *dbuf_abuf = dbi->db_buf;
 			arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
 			err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
-			if (!err) {
-				uio->uio_resid -= tocpy;
-				uio->uio_loffset += tocpy;
-			}
+			if (err)
+				break;
+			uio->uio_resid -= tocpy;
+			uio->uio_loffset += tocpy;
+			size -= tocpy;
 
 			if (abuf == dbuf_abuf)
 				XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
 			else
 				XUIOSTAT_BUMP(xuiostat_rbuf_copied);
-		} else {
-			err = uiomove((char *)db->db_data + bufoff, tocpy,
+		}
+	} else {
+		for (i = 0; i < numbufs; i++) {
+			dmu_buf_t *db = dbp[i];
+			if (!db)
+				continue;
+			if (uio->uio_loffset < db->db_offset) {
+				int len = db->db_offset - uio->uio_loffset;
+				err = uiozero(len, uio);
+				if (err)
+					break;
+				size -= len;
+			}
+			int bufoff = uio->uio_loffset - db->db_offset;
+			int tocpy = MIN(db->db_size - bufoff, size);
+			err = uiomove((char *)db->db_data + bufoff, tocpy,
 			    UIO_READ, uio);
+			if (err)
+				break;
+			size -= tocpy;
 		}
-		if (err)
-			break;
-
-		size -= tocpy;
+		if (size)
+			err = uiozero(size, uio);
 	}
+	dnode_rele(dn, FTAG);
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 
 	return (err);
Index: sys/sys/uio.h
===================================================================
--- sys/sys/uio.h	(revision 250290)
+++ sys/sys/uio.h	(working copy)
@@ -102,6 +102,8 @@
 	    struct uio *uio);
 int	uiomove_nofault(void *cp, int n, struct uio *uio);
 int	uiomoveco(void *cp, int n, struct uio *uio, int disposable);
+int	uiozero(int len, struct uio *uio);
+int	uiozero_nofault(int len, struct uio *uio);
 
 #else /* !_KERNEL */
 
Index: sys/kern/subr_uio.c
===================================================================
--- sys/kern/subr_uio.c	(revision 250290)
+++ sys/kern/subr_uio.c	(working copy)
@@ -65,7 +65,9 @@
     "Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)");
 
 static int uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault);
+static int uiozero_faultflag(int len, struct uio *uio, int nofault);
+
 #ifdef ZERO_COPY_SOCKETS
 /* Declared in uipc_socket.c */
 extern int so_zero_copy_receive;
 
@@ -261,6 +263,32 @@
 	return (uiomove((char *)buf + offset, n, uio));
 }
 
+int
+uiozero(int len, struct uio *uio)
+{
+	return (uiozero_faultflag(len, uio, 0));
+}
+
+int
+uiozero_nofault(int len, struct uio *uio)
+{
+	return (uiozero_faultflag(len, uio, 1));
+}
+
+static int
+uiozero_faultflag(int len, struct uio *uio, int nofault)
+{
+	int bs, error;
+	for (; len > 0; len -= bs) {
+		bs = MIN(len, ZERO_REGION_SIZE);
+		error = uiomove_faultflag(__DECONST(void *, zero_region),
+		    bs, uio, nofault);
+		if (error)
+			return (error);
+	}
+	return (0);
+}
+
 #ifdef ZERO_COPY_SOCKETS
 /*
  * Experimental support for zero-copy I/O
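Since the patch changes how holes are returned to the caller (bzero into the
destination buffer in dmu_read(), uiozero() in dmu_read_uio()), a quick
userland sanity check of sparse-read correctness may be useful alongside the
benchmark. A possible test, with the file name and offsets chosen
arbitrarily: write a marker past a hole, then verify the hole reads back as
zeros and the marker is intact.

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	char buf[4096], marker[] = "DATA";
	int fd = open("sparse.test", O_RDWR | O_CREAT | O_TRUNC, 0644);

	if (fd == -1)
		err(1, "open");
	/* writing at an offset leaves a hole at the start of the file */
	if (pwrite(fd, marker, sizeof(marker), 1024 * 1024) == -1)
		err(1, "pwrite");
	if (pread(fd, buf, sizeof(buf), 0) != (ssize_t)sizeof(buf))
		err(1, "pread hole");
	for (size_t i = 0; i < sizeof(buf); i++)
		if (buf[i] != 0)
			errx(1, "hole not zero at offset %zu", i);
	if (pread(fd, buf, sizeof(marker), 1024 * 1024) !=
	    (ssize_t)sizeof(marker) ||
	    memcmp(buf, marker, sizeof(marker)) != 0)
		errx(1, "marker corrupted");
	printf("sparse read OK\n");
	close(fd);
	return (0);
}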
>Release-Note:
>Audit-Trail:
>Unformatted: