Skip site navigation (1)Skip section navigation (2)
Date:      Fri, 14 Oct 2016 07:39:34 +0000 (UTC)
From:      Alexander Motin <mav@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-11@freebsd.org
Subject:   svn commit: r307290 - in stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs: . sys
Message-ID:  <201610140739.u9E7dYoH069697@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: mav
Date: Fri Oct 14 07:39:34 2016
New Revision: 307290
URL: https://svnweb.freebsd.org/changeset/base/307290

Log:
  MFC r305340: MFC r305337:
  7004 dmu_tx_hold_zap() does dnode_hold() 7x on same object
  
  Using a benchmark which has 32 threads creating 2 million files in the
  same directory, on a machine with 16 CPU cores, I observed poor
  performance. I noticed that dmu_tx_hold_zap() was using about 30% of
  all CPU, and doing dnode_hold() 7 times on the same object (the ZAP
  object that is being held).
  
  dmu_tx_hold_zap() keeps a hold on the dnode_t the entire time it is
  running, in dmu_tx_hold_t:txh_dnode, so it would be nice to use the
  dnode_t that we already have in hand, rather than repeatedly calling
  dnode_hold(). To do this, we need to pass the dnode_t down through
  all the intermediate calls that dmu_tx_hold_zap() makes, making these
  routines take the dnode_t* rather than an objset_t* and a uint64_t
  object number. In particular, the following routines will need to have
  analogous *_by_dnode() variants created:
  
  dmu_buf_hold_noread()
  dmu_buf_hold()
  zap_lookup()
  zap_lookup_norm()
  zap_count_write()
  zap_lockdir()
  zap_count_write()
  
  This can improve performance on the benchmark described above by 100%,
  from 30,000 file creations per second to 60,000. (This improvement is on
  top of that provided by working around the object allocation issue. Peak
  performance of ~90,000 creations per second was observed with 8 CPUs;
  adding CPUs past that decreased performance due to lock contention.) The
  CPU used by dmu_tx_hold_zap() was reduced by 88%, from 340 CPU-seconds
  to 40 CPU-seconds.
  
  Sponsored by: Intel Corp.
  
  Closes #109
  
  Reviewed by: Steve Gonczi <steve.gonczi@delphix.com>
  Reviewed by: George Wilson <george.wilson@delphix.com>
  Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
  Reviewed by: Ned Bass <bass6@llnl.gov>
  Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
  Author: Matthew Ahrens <mahrens@delphix.com>
  
  openzfs/openzfs@d3e523d489a169ab36f9ec1b2a111a60a5563a9f

Modified:
  stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
  stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
  stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
  stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
  stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
  stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
  stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
  stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
Directory Properties:
  stable/11/   (props changed)

Modified: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
==============================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	Fri Oct 14 07:37:08 2016	(r307289)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	Fri Oct 14 07:39:34 2016	(r307290)
@@ -2910,6 +2910,21 @@ dmu_buf_get_objset(dmu_buf_t *db)
 	return (dbi->db_objset);
 }
 
+dnode_t *
+dmu_buf_dnode_enter(dmu_buf_t *db)
+{
+	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+	DB_DNODE_ENTER(dbi);
+	return (DB_DNODE(dbi));
+}
+
+void
+dmu_buf_dnode_exit(dmu_buf_t *db)
+{
+	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+	DB_DNODE_EXIT(dbi);
+}
+
 static void
 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
 {

Modified: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
==============================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	Fri Oct 14 07:37:08 2016	(r307289)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	Fri Oct 14 07:39:34 2016	(r307290)
@@ -131,6 +131,26 @@ const dmu_object_byteswap_info_t dmu_ot_
 };
 
 int
+dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
+    void *tag, dmu_buf_t **dbp)
+{
+	uint64_t blkid;
+	dmu_buf_impl_t *db;
+
+	blkid = dbuf_whichblock(dn, 0, offset);
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	db = dbuf_hold(dn, blkid, tag);
+	rw_exit(&dn->dn_struct_rwlock);
+
+	if (db == NULL) {
+		*dbp = NULL;
+		return (SET_ERROR(EIO));
+	}
+
+	*dbp = &db->db;
+	return (0);
+}
+int
 dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
     void *tag, dmu_buf_t **dbp)
 {
@@ -158,6 +178,29 @@ dmu_buf_hold_noread(objset_t *os, uint64
 }
 
 int
+dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
+    void *tag, dmu_buf_t **dbp, int flags)
+{
+	int err;
+	int db_flags = DB_RF_CANFAIL;
+
+	if (flags & DMU_READ_NO_PREFETCH)
+		db_flags |= DB_RF_NOPREFETCH;
+
+	err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
+	if (err == 0) {
+		dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
+		err = dbuf_read(db, NULL, db_flags);
+		if (err != 0) {
+			dbuf_rele(db, tag);
+			*dbp = NULL;
+		}
+	}
+
+	return (err);
+}
+
+int
 dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
     void *tag, dmu_buf_t **dbp, int flags)
 {

Modified: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
==============================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c	Fri Oct 14 07:37:08 2016	(r307289)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c	Fri Oct 14 07:39:34 2016	(r307290)
@@ -21,7 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  */
 
@@ -808,15 +808,14 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t o
 		 * access the name in this fat-zap so that we'll check
 		 * for i/o errors to the leaf blocks, etc.
 		 */
-		err = zap_lookup(dn->dn_objset, dn->dn_object, name,
-		    8, 0, NULL);
+		err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
 		if (err == EIO) {
 			tx->tx_err = err;
 			return;
 		}
 	}
 
-	err = zap_count_write(dn->dn_objset, dn->dn_object, name, add,
+	err = zap_count_write_by_dnode(dn, name, add,
 	    &txh->txh_space_towrite, &txh->txh_space_tooverwrite);
 
 	/*

Modified: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
==============================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h	Fri Oct 14 07:37:08 2016	(r307289)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h	Fri Oct 14 07:39:34 2016	(r307290)
@@ -78,6 +78,7 @@ struct file;
 typedef struct objset objset_t;
 typedef struct dmu_tx dmu_tx_t;
 typedef struct dsl_dir dsl_dir_t;
+typedef struct dnode dnode_t;
 
 typedef enum dmu_object_byteswap {
 	DMU_BSWAP_UINT8,
@@ -419,7 +420,7 @@ dmu_write_embedded(objset_t *os, uint64_
 #define	WP_DMU_SYNC	0x2
 #define	WP_SPILL	0x4
 
-void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp,
+void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
     struct zio_prop *zp);
 /*
  * The bonus data is accessed more or less like a regular buffer.
@@ -445,7 +446,7 @@ int dmu_rm_spill(objset_t *, uint64_t, d
  */
 
 int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
-int dmu_spill_hold_by_dnode(struct dnode *dn, uint32_t flags,
+int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags,
     void *tag, dmu_buf_t **dbp);
 int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
 
@@ -465,6 +466,8 @@ int dmu_spill_hold_existing(dmu_buf_t *b
  */
 int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
     void *tag, dmu_buf_t **, int flags);
+int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
+    void *tag, dmu_buf_t **dbp, int flags);
 
 /*
  * Add a reference to a dmu buffer that has already been held via
@@ -618,6 +621,8 @@ void *dmu_buf_remove_user(dmu_buf_t *db,
 void *dmu_buf_get_user(dmu_buf_t *db);
 
 objset_t *dmu_buf_get_objset(dmu_buf_t *db);
+dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db);
+void dmu_buf_dnode_exit(dmu_buf_t *db);
 
 /* Block until any in-progress dmu buf user evictions complete. */
 void dmu_buf_user_evict_wait(void);
@@ -801,7 +806,7 @@ extern const dmu_object_byteswap_info_t 
  */
 int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
 /* Like dmu_object_info, but faster if you have a held dnode in hand. */
-void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
+void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi);
 /* Like dmu_object_info, but faster if you have a held dbuf in hand. */
 void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
 /*

Modified: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
==============================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h	Fri Oct 14 07:37:08 2016	(r307289)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h	Fri Oct 14 07:39:34 2016	(r307290)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
@@ -149,7 +149,7 @@ typedef struct dnode_phys {
 	blkptr_t dn_spill;
 } dnode_phys_t;
 
-typedef struct dnode {
+struct dnode {
 	/*
 	 * Protects the structure of the dnode, including the number of levels
 	 * of indirection (dn_nlevels), dn_maxblkid, and dn_next_*
@@ -247,7 +247,7 @@ typedef struct dnode {
 
 	/* holds prefetch structure */
 	struct zfetch	dn_zfetch;
-} dnode_t;
+};
 
 /*
  * Adds a level of indirection between the dbuf and the dnode to avoid

Modified: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
==============================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h	Fri Oct 14 07:37:08 2016	(r307289)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h	Fri Oct 14 07:39:34 2016	(r307290)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_ZAP_H
@@ -216,8 +216,14 @@ int zap_lookup_uint64(objset_t *os, uint
 int zap_contains(objset_t *ds, uint64_t zapobj, const char *name);
 int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints);
+int zap_lookup_by_dnode(dnode_t *dn, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf);
+int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf,
+    matchtype_t mt, char *realname, int rn_len,
+    boolean_t *ncp);
 
-int zap_count_write(objset_t *os, uint64_t zapobj, const char *name,
+int zap_count_write_by_dnode(dnode_t *dn, const char *name,
     int add, refcount_t *towrite, refcount_t *tooverwrite);
 
 /*

Modified: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
==============================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c	Fri Oct 14 07:37:08 2016	(r307289)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c	Fri Oct 14 07:39:34 2016	(r307290)
@@ -270,6 +270,7 @@ zap_table_load(zap_t *zap, zap_table_phy
 	uint64_t blk, off;
 	int err;
 	dmu_buf_t *db;
+	dnode_t *dn;
 	int bs = FZAP_BLOCK_SHIFT(zap);
 
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
@@ -277,8 +278,15 @@ zap_table_load(zap_t *zap, zap_table_phy
 	blk = idx >> (bs-3);
 	off = idx & ((1<<(bs-3))-1);
 
-	err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	/*
+	 * Note: this is equivalent to dmu_buf_hold(), but we use
+	 * _dnode_enter / _by_dnode because it's faster because we don't
+	 * have to hold the dnode.
+	 */
+	dn = dmu_buf_dnode_enter(zap->zap_dbuf);
+	err = dmu_buf_hold_by_dnode(dn,
 	    (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
+	dmu_buf_dnode_exit(zap->zap_dbuf);
 	if (err)
 		return (err);
 	*valp = ((uint64_t *)db->db_data)[off];
@@ -292,9 +300,11 @@ zap_table_load(zap_t *zap, zap_table_phy
 		 */
 		blk = (idx*2) >> (bs-3);
 
-		err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+		dn = dmu_buf_dnode_enter(zap->zap_dbuf);
+		err = dmu_buf_hold_by_dnode(dn,
 		    (tbl->zt_nextblk + blk) << bs, FTAG, &db,
 		    DMU_READ_NO_PREFETCH);
+		dmu_buf_dnode_exit(zap->zap_dbuf);
 		if (err == 0)
 			dmu_buf_rele(db, FTAG);
 	}
@@ -505,8 +515,10 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t 
 
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 
-	err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
+	err = dmu_buf_hold_by_dnode(dn,
 	    blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
+	dmu_buf_dnode_exit(zap->zap_dbuf);
 	if (err)
 		return (err);
 

Modified: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
==============================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c	Fri Oct 14 07:37:08 2016	(r307289)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c	Fri Oct 14 07:39:34 2016	(r307290)
@@ -550,6 +550,24 @@ zap_lockdir_impl(dmu_buf_t *db, void *ta
 	return (0);
 }
 
+static int
+zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
+    krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
+{
+	dmu_buf_t *db;
+	int err;
+
+	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
+	if (err != 0) {
+		return (err);
+	}
+	err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
+	if (err != 0) {
+		dmu_buf_rele(db, tag);
+	}
+	return (err);
+}
+
 int
 zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
     krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
@@ -875,6 +893,33 @@ zap_lookup_norm(objset_t *os, uint64_t z
 }
 
 int
+zap_lookup_by_dnode(dnode_t *dn, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+	return (zap_lookup_norm_by_dnode(dn, name, integer_size,
+	    num_integers, buf, MT_EXACT, NULL, 0, NULL));
+}
+
+int
+zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf,
+    matchtype_t mt, char *realname, int rn_len,
+    boolean_t *ncp)
+{
+	zap_t *zap;
+	int err;
+
+	err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
+	    FTAG, &zap);
+	if (err != 0)
+		return (err);
+	err = zap_lookup_impl(zap, name, integer_size,
+	    num_integers, buf, mt, realname, rn_len, ncp);
+	zap_unlockdir(zap, FTAG);
+	return (err);
+}
+
+int
 zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints)
 {
@@ -1445,7 +1490,7 @@ zap_get_stats(objset_t *os, uint64_t zap
 }
 
 int
-zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
+zap_count_write_by_dnode(dnode_t *dn, const char *name, int add,
     refcount_t *towrite, refcount_t *tooverwrite)
 {
 	zap_t *zap;
@@ -1474,7 +1519,7 @@ zap_count_write(objset_t *os, uint64_t z
 	 * At present we are just evaluating the possibility of this operation
 	 * and hence we do not want to trigger an upgrade.
 	 */
-	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE,
+	err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
 	    FTAG, &zap);
 	if (err != 0)
 		return (err);



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201610140739.u9E7dYoH069697>