Date: Fri, 3 Aug 2018 00:01:48 +0000 (UTC) From: Alexander Motin <mav@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r337211 - in head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs: . sys Message-ID: <201808030001.w7301mtr031476@repo.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: mav Date: Fri Aug 3 00:01:48 2018 New Revision: 337211 URL: https://svnweb.freebsd.org/changeset/base/337211 Log: MFV r337210: 9577 remove zfs_dbuf_evict_key tsd The zfs_dbuf_evict_key TSD (thread-specific data) is not necessary - we can instead pass a flag down in a few places to prevent recursive dbuf eviction. Making this change has 3 benefits: 1. The code semantics are easier to understand. 2. On Linux, performance is improved, because creating/removing TSD values (by setting to NULL vs non-NULL) is expensive, and we do it very often. 3. According to Nexenta, the current semantics can cause a deadlock when concurrently calling dmu_objset_evict_dbufs() (which is rare today, but they are working on a "parallel unmount" change that triggers this more easily) illumos/illumos-gate@c2919acbea007fa95c709b60d073db9a24526e01 Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com> Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed by: Andy Stormont <astormont@racktopsystems.com> Approved by: Richard Lowe <richlowe@richlowe.net> Author: Matthew Ahrens <mahrens@delphix.com> Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h Directory Properties: head/sys/cddl/contrib/opensolaris/ (props changed) Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c ============================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c Thu Aug 2 23:59:52 2018 (r337210) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c Fri Aug 3 00:01:48 2018 (r337211) @@ -51,8 +51,6 @@ #include <sys/cityhash.h> #include <sys/spa_impl.h> -uint_t zfs_dbuf_evict_key; - static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); @@ -534,14 +532,6 @@ dbuf_evict_one(void) ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); - /* - * Set the thread's tsd to indicate that it's processing evictions. - * Once a thread stops evicting from the dbuf cache it will - * reset its tsd to NULL. - */ - ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL); - (void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE); - dmu_buf_impl_t *db = multilist_sublist_tail(mls); while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { db = multilist_sublist_prev(mls, db); @@ -561,7 +551,6 @@ dbuf_evict_one(void) } else { multilist_sublist_unlock(mls); } - (void) tsd_set(zfs_dbuf_evict_key, NULL); } /* @@ -615,30 +604,7 @@ dbuf_evict_thread(void *unused __unused) static void dbuf_evict_notify(void) { - /* - * We use thread specific data to track when a thread has - * started processing evictions. This allows us to avoid deeply - * nested stacks that would have a call flow similar to this: - * - * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() - * ^ | - * | | - * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ - * - * The dbuf_eviction_thread will always have its tsd set until - * that thread exits. All other threads will only set their tsd - * if they are participating in the eviction process. This only - * happens if the eviction thread is unable to process evictions - * fast enough. To keep the dbuf cache size in check, other threads - * can evict from the dbuf cache directly. Those threads will set - * their tsd values so that we ensure that they only evict one dbuf - * from the dbuf cache. - */ - if (tsd_get(zfs_dbuf_evict_key) != NULL) - return; - - /* * We check if we should evict without holding the dbuf_evict_lock, * because it's OK to occasionally make the wrong decision here, * and grabbing the lock results in massive lock contention. @@ -714,7 +680,6 @@ retry: refcount_create(&dbuf_caches[dcs].size); } - tsd_create(&zfs_dbuf_evict_key, NULL); dbuf_evict_thread_exit = B_FALSE; mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL); @@ -741,7 +706,6 @@ dbuf_fini(void) cv_wait(&dbuf_evict_cv, &dbuf_evict_lock); } mutex_exit(&dbuf_evict_lock); - tsd_destroy(&zfs_dbuf_evict_key); mutex_destroy(&dbuf_evict_lock); cv_destroy(&dbuf_evict_cv); @@ -1026,7 +990,7 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, db->db_state = DB_CACHED; } cv_broadcast(&db->db_changed); - dbuf_rele_and_unlock(db, NULL); + dbuf_rele_and_unlock(db, NULL, B_FALSE); } static void @@ -2186,7 +2150,8 @@ dbuf_destroy(dmu_buf_impl_t *db) * value in dnode_move(), since DB_DNODE_EXIT doesn't actually * release any lock. */ - dnode_rele(dn, db); + mutex_enter(&dn->dn_mtx); + dnode_rele_and_unlock(dn, db, B_TRUE); db->db_dnode_handle = NULL; dbuf_hash_remove(db); @@ -2213,8 +2178,10 @@ dbuf_destroy(dmu_buf_impl_t *db) * If this dbuf is referenced from an indirect dbuf, * decrement the ref count on the indirect dbuf. */ - if (parent && parent != dndb) - dbuf_rele(parent, db); + if (parent && parent != dndb) { + mutex_enter(&parent->db_mtx); + dbuf_rele_and_unlock(parent, db, B_TRUE); + } } /* @@ -2846,7 +2813,7 @@ void dbuf_rele(dmu_buf_impl_t *db, void *tag) { mutex_enter(&db->db_mtx); - dbuf_rele_and_unlock(db, tag); + dbuf_rele_and_unlock(db, tag, B_FALSE); } void @@ -2857,10 +2824,19 @@ dmu_buf_rele(dmu_buf_t *db, void *tag) /* * dbuf_rele() for an already-locked dbuf. This is necessary to allow - * db_dirtycnt and db_holds to be updated atomically. + * db_dirtycnt and db_holds to be updated atomically. The 'evicting' + * argument should be set if we are already in the dbuf-evicting code + * path, in which case we don't want to recursively evict. This allows us to + * avoid deeply nested stacks that would have a call flow similar to this: + * + * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() + * ^ | + * | | + * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ + * */ void -dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) +dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting) { int64_t holds; @@ -2963,7 +2939,8 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) db->db.db_size, db); mutex_exit(&db->db_mtx); - if (db->db_caching_status == DB_DBUF_CACHE) { + if (db->db_caching_status == DB_DBUF_CACHE && + !evicting) { dbuf_evict_notify(); } } @@ -3230,7 +3207,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; - dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE); return; } @@ -3580,7 +3557,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; db->db_data_pending = NULL; - dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE); } static void Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c ============================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c Thu Aug 2 23:59:52 2018 (r337210) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c Fri Aug 3 00:01:48 2018 (r337211) @@ -1240,11 +1240,11 @@ void dnode_rele(dnode_t *dn, void *tag) { mutex_enter(&dn->dn_mtx); - dnode_rele_and_unlock(dn, tag); + dnode_rele_and_unlock(dn, tag, B_FALSE); } void -dnode_rele_and_unlock(dnode_t *dn, void *tag) +dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting) { uint64_t refs; /* Get while the hold prevents the dnode from moving. */ @@ -1275,7 +1275,8 @@ dnode_rele_and_unlock(dnode_t *dn, void *tag) * that the handle has zero references, but that will be * asserted anyway when the handle gets destroyed. */ - dbuf_rele(db, dnh); + mutex_enter(&db->db_mtx); + dbuf_rele_and_unlock(db, dnh, evicting); } } Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c ============================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c Thu Aug 2 23:59:52 2018 (r337210) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c Fri Aug 3 00:01:48 2018 (r337211) @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ @@ -439,6 +439,19 @@ dnode_evict_dbufs(dnode_t *dn) avl_insert_here(&dn->dn_dbufs, &db_marker, db, AVL_BEFORE); + /* + * We need to use the "marker" dbuf rather than + * simply getting the next dbuf, because + * dbuf_destroy() may actually remove multiple dbufs. + * It can call itself recursively on the parent dbuf, + * which may also be removed from dn_dbufs. The code + * flow would look like: + * + * dbuf_destroy(): + * dnode_rele_and_unlock(parent_dbuf, evicting=TRUE): + * if (!cacheable || pending_evict) + * dbuf_destroy() + */ dbuf_destroy(db); db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker); @@ -497,7 +510,7 @@ dnode_undirty_dbufs(list_t *list) list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); - dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE); } } Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h ============================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h Thu Aug 2 23:59:52 2018 (r337210) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h Fri Aug 3 00:01:48 2018 (r337211) @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ @@ -303,7 +303,7 @@ boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os uint64_t dbuf_refcount(dmu_buf_impl_t *db); void dbuf_rele(dmu_buf_impl_t *db, void *tag); -void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag); +void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting); dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level, uint64_t blkid); Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h ============================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h Thu Aug 2 23:59:52 2018 (r337210) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h Fri Aug 3 00:01:48 2018 (r337211) @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ @@ -291,7 +291,7 @@ int dnode_hold_impl(struct objset *dd, uint64_t object void *ref, dnode_t **dnp); boolean_t dnode_add_ref(dnode_t *dn, void *ref); void dnode_rele(dnode_t *dn, void *ref); -void dnode_rele_and_unlock(dnode_t *dn, void *tag); +void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting); void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); void dnode_sync(dnode_t *dn, dmu_tx_t *tx); void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201808030001.w7301mtr031476>