Date: Sat, 19 Jul 2014 11:42:36 +0100 (BST) From: Robert Watson <rwatson@FreeBSD.org> To: Xin LI <delphij@FreeBSD.org> Cc: svn-src-all@freebsd.org, src-committers@freebsd.org, svn-src-vendor@freebsd.org Subject: Re: svn commit: r268852 - in vendor-sys/illumos/dist/uts/common/fs/zfs: . sys Message-ID: <alpine.BSF.2.11.1407191141550.65204@fledge.watson.org> In-Reply-To: <201407181809.s6II9KkB002810@svn.freebsd.org> References: <201407181809.s6II9KkB002810@svn.freebsd.org>
next in thread | previous in thread | raw e-mail | index | archive | help
On Fri, 18 Jul 2014, Xin LI wrote: > Log: > 5008 lock contention (rrw_exit) while running a read only load > Reviewed by: Matthew Ahrens <matthew.ahrens@delphix.com> > Reviewed by: George Wilson <george.wilson@delphix.com> > Reviewed by: Alex Reece <alex.reece@delphix.com> > Reviewed by: Christopher Siden <christopher.siden@delphix.com> > Reviewed by: Richard Yao <ryao@gentoo.org> > Reviewed by: Saso Kiselkov <skiselkov.ml@gmail.com> > Approved by: Garrett D'Amore <garrett@damore.org> > > illumos/illumos-gate@c9030f6c93613fe30ee0c16f92b96da7816ac052 Is there an opportunity to use our own read-mostly lock implementation here? It should be substantially more scalable, has integration with WITNESS, proper priority propagation, etc. Robert > > Modified: > vendor-sys/illumos/dist/uts/common/fs/zfs/rrwlock.c > vendor-sys/illumos/dist/uts/common/fs/zfs/sys/rrwlock.h > vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfs_vfsops.h > vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfs_znode.h > vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_ioctl.c > vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_vfsops.c > vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_znode.c > > Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/rrwlock.c > ============================================================================== > --- vendor-sys/illumos/dist/uts/common/fs/zfs/rrwlock.c Fri Jul 18 18:05:09 2014 (r268851) > +++ vendor-sys/illumos/dist/uts/common/fs/zfs/rrwlock.c Fri Jul 18 18:09:20 2014 (r268852) > @@ -286,3 +286,91 @@ rrw_tsd_destroy(void *arg) > (void *)curthread, (void *)rn->rn_rrl); > } > } > + > +/* > + * A reader-mostly lock implementation, tuning above reader-writer locks > + * for highly parallel read acquisitions, while pessimizing writes. > + * > + * The idea is to split single busy lock into array of locks, so that > + * each reader can lock only one of them for read, depending on result > + * of simple hash function. That proportionally reduces lock congestion. 
> + * Writer same time has to sequentially acquire write on all the locks. > + * That makes write acquisition proportionally slower, but in places where > + * it is used (filesystem unmount) performance is not critical. > + * > + * All the functions below are direct wrappers around functions above. > + */ > +void > +rrm_init(rrmlock_t *rrl, boolean_t track_all) > +{ > + int i; > + > + for (i = 0; i < RRM_NUM_LOCKS; i++) > + rrw_init(&rrl->locks[i], track_all); > +} > + > +void > +rrm_destroy(rrmlock_t *rrl) > +{ > + int i; > + > + for (i = 0; i < RRM_NUM_LOCKS; i++) > + rrw_destroy(&rrl->locks[i]); > +} > + > +void > +rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag) > +{ > + if (rw == RW_READER) > + rrm_enter_read(rrl, tag); > + else > + rrm_enter_write(rrl); > +} > + > +/* > + * This maps the current thread to a specific lock. Note that the lock > + * must be released by the same thread that acquired it. We do this > + * mapping by taking the thread pointer mod a prime number. We examine > + * only the low 32 bits of the thread pointer, because 32-bit division > + * is faster than 64-bit division, and the high 32 bits have little > + * entropy anyway. 
> + */ > +#define RRM_TD_LOCK() (((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS) > + > +void > +rrm_enter_read(rrmlock_t *rrl, void *tag) > +{ > + rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag); > +} > + > +void > +rrm_enter_write(rrmlock_t *rrl) > +{ > + int i; > + > + for (i = 0; i < RRM_NUM_LOCKS; i++) > + rrw_enter_write(&rrl->locks[i]); > +} > + > +void > +rrm_exit(rrmlock_t *rrl, void *tag) > +{ > + int i; > + > + if (rrl->locks[0].rr_writer == curthread) { > + for (i = 0; i < RRM_NUM_LOCKS; i++) > + rrw_exit(&rrl->locks[i], tag); > + } else { > + rrw_exit(&rrl->locks[RRM_TD_LOCK()], tag); > + } > +} > + > +boolean_t > +rrm_held(rrmlock_t *rrl, krw_t rw) > +{ > + if (rw == RW_WRITER) { > + return (rrw_held(&rrl->locks[0], rw)); > + } else { > + return (rrw_held(&rrl->locks[RRM_TD_LOCK()], rw)); > + } > +} > > Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/rrwlock.h > ============================================================================== > --- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/rrwlock.h Fri Jul 18 18:05:09 2014 (r268851) > +++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/rrwlock.h Fri Jul 18 18:09:20 2014 (r268852) > @@ -80,6 +80,31 @@ void rrw_tsd_destroy(void *arg); > #define RRW_LOCK_HELD(x) \ > (rrw_held(x, RW_WRITER) || rrw_held(x, RW_READER)) > > +/* > + * A reader-mostly lock implementation, tuning above reader-writer locks > + * for highly parallel read acquisitions, pessimizing write acquisitions. > + * > + * This should be a prime number. See comment in rrwlock.c near > + * RRM_TD_LOCK() for details. 
> + */ > +#define RRM_NUM_LOCKS 17 > +typedef struct rrmlock { > + rrwlock_t locks[RRM_NUM_LOCKS]; > +} rrmlock_t; > + > +void rrm_init(rrmlock_t *rrl, boolean_t track_all); > +void rrm_destroy(rrmlock_t *rrl); > +void rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag); > +void rrm_enter_read(rrmlock_t *rrl, void *tag); > +void rrm_enter_write(rrmlock_t *rrl); > +void rrm_exit(rrmlock_t *rrl, void *tag); > +boolean_t rrm_held(rrmlock_t *rrl, krw_t rw); > + > +#define RRM_READ_HELD(x) rrm_held(x, RW_READER) > +#define RRM_WRITE_HELD(x) rrm_held(x, RW_WRITER) > +#define RRM_LOCK_HELD(x) \ > + (rrm_held(x, RW_WRITER) || rrm_held(x, RW_READER)) > + > #ifdef __cplusplus > } > #endif > > Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfs_vfsops.h > ============================================================================== > --- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfs_vfsops.h Fri Jul 18 18:05:09 2014 (r268851) > +++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfs_vfsops.h Fri Jul 18 18:09:20 2014 (r268852) > @@ -64,7 +64,7 @@ struct zfsvfs { > int z_norm; /* normalization flags */ > boolean_t z_atime; /* enable atimes mount option */ > boolean_t z_unmounted; /* unmounted */ > - rrwlock_t z_teardown_lock; > + rrmlock_t z_teardown_lock; > krwlock_t z_teardown_inactive_lock; > list_t z_all_znodes; /* all vnodes in the fs */ > kmutex_t z_znodes_lock; /* lock for z_all_znodes */ > > Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfs_znode.h > ============================================================================== > --- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfs_znode.h Fri Jul 18 18:05:09 2014 (r268851) > +++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfs_znode.h Fri Jul 18 18:09:20 2014 (r268852) > @@ -238,7 +238,7 @@ typedef struct znode { > /* Called on entry to each ZFS vnode and vfs operation */ > #define ZFS_ENTER(zfsvfs) \ > { \ > - rrw_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \ > + 
rrm_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \ > if ((zfsvfs)->z_unmounted) { \ > ZFS_EXIT(zfsvfs); \ > return (EIO); \ > @@ -246,7 +246,7 @@ typedef struct znode { > } > > /* Must be called before exiting the vop */ > -#define ZFS_EXIT(zfsvfs) rrw_exit(&(zfsvfs)->z_teardown_lock, FTAG) > +#define ZFS_EXIT(zfsvfs) rrm_exit(&(zfsvfs)->z_teardown_lock, FTAG) > > /* Verifies the znode is valid */ > #define ZFS_VERIFY_ZP(zp) \ > > Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_ioctl.c > ============================================================================== > --- vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_ioctl.c Fri Jul 18 18:05:09 2014 (r268851) > +++ vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_ioctl.c Fri Jul 18 18:09:20 2014 (r268852) > @@ -1420,7 +1420,7 @@ zfsvfs_hold(const char *name, void *tag, > if (getzfsvfs(name, zfvp) != 0) > error = zfsvfs_create(name, zfvp); > if (error == 0) { > - rrw_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER : > + rrm_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER : > RW_READER, tag); > if ((*zfvp)->z_unmounted) { > /* > @@ -1428,7 +1428,7 @@ zfsvfs_hold(const char *name, void *tag, > * thread should be just about to disassociate the > * objset from the zfsvfs. 
> */ > - rrw_exit(&(*zfvp)->z_teardown_lock, tag); > + rrm_exit(&(*zfvp)->z_teardown_lock, tag); > return (SET_ERROR(EBUSY)); > } > } > @@ -1438,7 +1438,7 @@ zfsvfs_hold(const char *name, void *tag, > static void > zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag) > { > - rrw_exit(&zfsvfs->z_teardown_lock, tag); > + rrm_exit(&zfsvfs->z_teardown_lock, tag); > > if (zfsvfs->z_vfs) { > VFS_RELE(zfsvfs->z_vfs); > > Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_vfsops.c > ============================================================================== > --- vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_vfsops.c Fri Jul 18 18:05:09 2014 (r268851) > +++ vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_vfsops.c Fri Jul 18 18:09:20 2014 (r268852) > @@ -1004,7 +1004,7 @@ zfsvfs_create(const char *osname, zfsvfs > mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); > list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), > offsetof(znode_t, z_link_node)); > - rrw_init(&zfsvfs->z_teardown_lock, B_FALSE); > + rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); > rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); > rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); > for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) > @@ -1119,7 +1119,7 @@ zfsvfs_free(zfsvfs_t *zfsvfs) > mutex_destroy(&zfsvfs->z_znodes_lock); > mutex_destroy(&zfsvfs->z_lock); > list_destroy(&zfsvfs->z_all_znodes); > - rrw_destroy(&zfsvfs->z_teardown_lock); > + rrm_destroy(&zfsvfs->z_teardown_lock); > rw_destroy(&zfsvfs->z_teardown_inactive_lock); > rw_destroy(&zfsvfs->z_fuid_lock); > for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) > @@ -1784,7 +1784,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolea > { > znode_t *zp; > > - rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); > + rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); > > if (!unmounting) { > /* > @@ -1814,7 +1814,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolea > */ > if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { > 
rw_exit(&zfsvfs->z_teardown_inactive_lock); > - rrw_exit(&zfsvfs->z_teardown_lock, FTAG); > + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); > return (SET_ERROR(EIO)); > } > > @@ -1841,7 +1841,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolea > */ > if (unmounting) { > zfsvfs->z_unmounted = B_TRUE; > - rrw_exit(&zfsvfs->z_teardown_lock, FTAG); > + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); > rw_exit(&zfsvfs->z_teardown_inactive_lock); > } > > @@ -2073,7 +2073,7 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, const ch > znode_t *zp; > uint64_t sa_obj = 0; > > - ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock)); > + ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); > ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); > > /* > @@ -2129,7 +2129,7 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, const ch > bail: > /* release the VOPs */ > rw_exit(&zfsvfs->z_teardown_inactive_lock); > - rrw_exit(&zfsvfs->z_teardown_lock, FTAG); > + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); > > if (err) { > /* > > Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_znode.c > ============================================================================== > --- vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_znode.c Fri Jul 18 18:05:09 2014 (r268851) > +++ vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_znode.c Fri Jul 18 18:09:20 2014 (r268852) > @@ -276,7 +276,7 @@ zfs_znode_move(void *buf, void *newbuf, > * can safely ensure that the filesystem is not and will not be > * unmounted. The next statement is equivalent to ZFS_ENTER(). > */ > - rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG); > + rrm_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG); > if (zfsvfs->z_unmounted) { > ZFS_EXIT(zfsvfs); > rw_exit(&zfsvfs_lock); > >
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?alpine.BSF.2.11.1407191141550.65204>