From owner-freebsd-fs@FreeBSD.ORG Tue Oct 9 13:27:01 2007 Return-Path: Delivered-To: freebsd-fs@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id 0EBC716A417 for ; Tue, 9 Oct 2007 13:27:01 +0000 (UTC) (envelope-from bg@sics.se) Received: from letter.sics.se (letter.sics.se [193.10.64.6]) by mx1.freebsd.org (Postfix) with ESMTP id A59BD13C455 for ; Tue, 9 Oct 2007 13:27:00 +0000 (UTC) (envelope-from bg@sics.se) Received: from sics.se (ibook.sics.se [193.10.66.104]) by letter.sics.se (Postfix) with ESMTP id 3005D400D0; Tue, 9 Oct 2007 15:02:00 +0200 (CEST) Date: Tue, 9 Oct 2007 15:01:49 +0200 From: Bjorn Gronvall To: freebsd-fs@freebsd.org Message-ID: <20071009150149.337279ce@ibook.sics.se> Organization: SICS.SE X-Mailer: Claws Mail 2.9.1 (GTK+ 2.10.6; i386-portbld-freebsd6.2) Mime-Version: 1.0 Content-Type: text/plain; charset=ISO-8859-1 Content-Transfer-Encoding: 8bit Cc: Subject: NFS server does not cluster writes X-BeenThere: freebsd-fs@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: Filesystems List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 09 Oct 2007 13:27:01 -0000 Hi, The current NFS server clusters only reads, never writes, which in turn leads to poor sequential-write performance. The attached patch makes the following changes: 1/ Rearrange the code so that the same code can be used to detect both sequential reads and writes. 2/ Merge in updates from vfs_vnops.c::sequential_heuristic. 3/ Use double hashing in order to avoid hash-clustering in the nfsheur table. 4/ Pack the nfsheur table more efficiently. 5/ Tolerate a small amount of RPC reordering (initially suggested by Ellard and Seltzer). 6/ Back off from sequential access gradually rather than immediately switching to random access. 
These changes have been tested on a low-performance ATA disk (with write caching disabled) and sped up large sequential writes by a factor of four. I would be interested in getting numbers from more typical server configurations if somebody has the time to try it out. Cheers, /b -- _ _ ,_______________. Bjorn Gronvall (Björn Grönvall) /_______________/| Swedish Institute of Computer Science | || PO Box 1263, S-164 29 Kista, Sweden | Schroedingers || Email: bg@sics.se, Phone +46 -8 633 15 25 | Cat |/ Cellular +46 -70 768 06 35, Fax +46 -8 751 72 30 '---------------' --- nfs_serv.c.orig 2007-10-09 12:03:00.000000000 +0200 +++ nfs_serv.c 2007-10-09 13:50:02.000000000 +0200 @@ -106,18 +106,98 @@ #define MAX_COMMIT_COUNT (1024 * 1024) -#define NUM_HEURISTIC 1017 +#define NUM_HEURISTIC 1031 /* Must be prime! */ +#define HASH_MAXSTEP 0x3ff #define NHUSE_INIT 64 #define NHUSE_INC 16 #define NHUSE_MAX 2048 +CTASSERT(NUM_HEURISTIC > (HASH_MAXSTEP + 1)); static struct nfsheur { + off_t nh_nextoff; /* next offset for sequential detection */ struct vnode *nh_vp; /* vp to match (unreferenced pointer) */ - off_t nh_nextr; /* next offset for sequential detection */ - int nh_use; /* use count for selection */ - int nh_seqcount; /* heuristic */ + uint16_t nh_use; /* use count for selection */ + uint16_t nh_seqcount; /* in units of BKVASIZE bytes */ } nfsheur[NUM_HEURISTIC]; +/* + * Sequential heuristic - detect sequential operation + */ +static +struct nfsheur * +sequential_heuristic(const struct uio *uio, struct vnode *vp) +{ + struct nfsheur *nh; + unsigned hi, step; /* Double hashing */ + int try = 32; /* A bit large? */ + int nblocks; + + /* + * Locate best candidate + */ + + hi = ((unsigned)vp / sizeof(struct vnode)) % NUM_HEURISTIC; + step = ((unsigned)vp / sizeof(struct vnode)) & HASH_MAXSTEP; + step++; /* Step must not be zero. 
*/ + nh = &nfsheur[hi]; + + while (try--) { + if (nfsheur[hi].nh_vp == vp) { + nh = &nfsheur[hi]; + break; + } + if (nfsheur[hi].nh_use > 0) + --nfsheur[hi].nh_use; + hi = hi + step; + if (hi >= NUM_HEURISTIC) + hi -= NUM_HEURISTIC; + if (nfsheur[hi].nh_use < nh->nh_use) + nh = &nfsheur[hi]; + } + + if (nh->nh_vp != vp) { + nh->nh_vp = vp; + nh->nh_nextoff = uio->uio_offset; + nh->nh_use = NHUSE_INIT; + if (uio->uio_offset == 0) + nh->nh_seqcount = 4; + else + nh->nh_seqcount = 1; + } + + nh->nh_use += NHUSE_INC; + if (nh->nh_use > NHUSE_MAX) + nh->nh_use = NHUSE_MAX; + + /* + * Calculate heuristic + */ + + /* + * XXX we assume that the filesystem block size is + * the default. Not true, but still gives us a pretty + * good indicator of how sequential the read operations + * are. + */ + nblocks = (uio->uio_resid + BKVASIZE - 1) / BKVASIZE; + if ((uio->uio_offset == 0 && nh->nh_seqcount > 0) || + uio->uio_offset == nh->nh_nextoff) { + nh->nh_seqcount += nblocks; + if (nh->nh_seqcount > IO_SEQMAX) + nh->nh_seqcount = IO_SEQMAX; + } else if (qabs(uio->uio_offset - nh->nh_nextoff) <= + 4*imax(BKVASIZE, uio->uio_resid)) { + /* Probably reordered RPC, do nothing. */ + } else { + nh->nh_seqcount /= 4; + /* RPCs larger than 1 block should cluster IO. 
*/ + if (nblocks > 1 && nh->nh_seqcount < nblocks) + nh->nh_seqcount = nblocks; + } + + return (nh); +} + /* Global vars */ int nfsrvw_procrastinate = NFS_GATHERDELAY * 1000; @@ -855,61 +935,6 @@ else cnt = reqlen; - /* - * Calculate seqcount for heuristic - */ - - { - int hi; - int try = 32; - - /* - * Locate best candidate - */ - - hi = ((int)(vm_offset_t)vp / sizeof(struct vnode)) % NUM_HEURISTIC; - nh = &nfsheur[hi]; - - while (try--) { - if (nfsheur[hi].nh_vp == vp) { - nh = &nfsheur[hi]; - break; - } - if (nfsheur[hi].nh_use > 0) - --nfsheur[hi].nh_use; - hi = (hi + 1) % NUM_HEURISTIC; - if (nfsheur[hi].nh_use < nh->nh_use) - nh = &nfsheur[hi]; - } - - if (nh->nh_vp != vp) { - nh->nh_vp = vp; - nh->nh_nextr = off; - nh->nh_use = NHUSE_INIT; - if (off == 0) - nh->nh_seqcount = 4; - else - nh->nh_seqcount = 1; - } - - /* - * Calculate heuristic - */ - - if ((off == 0 && nh->nh_seqcount > 0) || off == nh->nh_nextr) { - if (++nh->nh_seqcount > IO_SEQMAX) - nh->nh_seqcount = IO_SEQMAX; - } else if (nh->nh_seqcount > 1) { - nh->nh_seqcount = 1; - } else { - nh->nh_seqcount = 0; - } - nh->nh_use += NHUSE_INC; - if (nh->nh_use > NHUSE_MAX) - nh->nh_use = NHUSE_MAX; - ioflag |= nh->nh_seqcount << IO_SEQSHIFT; - } - nfsm_reply(NFSX_POSTOPORFATTR(v3) + 3 * NFSX_UNSIGNED+nfsm_rndup(cnt)); if (v3) { tl = nfsm_build(u_int32_t *, NFSX_V3FATTR + 4 * NFSX_UNSIGNED); @@ -967,9 +992,11 @@ uiop->uio_resid = len; uiop->uio_rw = UIO_READ; uiop->uio_segflg = UIO_SYSSPACE; + nh = sequential_heuristic(uiop, vp); + ioflag |= nh->nh_seqcount << IO_SEQSHIFT; error = VOP_READ(vp, uiop, IO_NODELOCKED | ioflag, cred); off = uiop->uio_offset; - nh->nh_nextr = off; + nh->nh_nextoff = off; FREE((caddr_t)iv2, M_TEMP); if (error || (getret = VOP_GETATTR(vp, vap, cred, td))) { if (!error) @@ -1037,12 +1064,14 @@ nfsfh_t nfh; fhandle_t *fhp; struct uio io, *uiop = &io; + struct nfsheur *nh; off_t off; struct mount *mntp = NULL; int tvfslocked; int vfslocked; nfsdbprintf(("%s %d\n", __FILE__, 
__LINE__)); + bwillwrite(); vfslocked = 0; if (mrep == NULL) { *mrq = NULL; @@ -1175,9 +1204,12 @@ uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_td = NULL; uiop->uio_offset = off; + nh = sequential_heuristic(uiop, vp); + ioflags |= nh->nh_seqcount << IO_SEQSHIFT; error = VOP_WRITE(vp, uiop, ioflags, cred); /* XXXRW: unlocked write. */ nfsrvstats.srvvop_writes++; + nh->nh_nextoff = uiop->uio_offset; FREE((caddr_t)iv, M_TEMP); } aftat_ret = VOP_GETATTR(vp, vap, cred, td);