Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 10 Aug 2015 21:13:59 +0000 (UTC)
From:      Alexander Motin <mav@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r286598 - head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs
Message-ID:  <201508102113.t7ALDx0R042635@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: mav
Date: Mon Aug 10 21:13:59 2015
New Revision: 286598
URL: https://svnweb.freebsd.org/changeset/base/286598

Log:
  MFV 286597: 5701 zpool list reports incorrect "alloc" value for cache devices
  
  Reviewed by: Matthew Ahrens <mahrens@delphix.com>
  Reviewed by: George Wilson <george@delphix.com>
  Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
  Approved by: Dan McDonald <danmcd@omniti.com>
  Author: Prakash Surya <prakash.surya@delphix.com>
  
  illumos/illumos-gate@a52fc310ba80fa3b2006110936198de7f828cd94

Modified:
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	Mon Aug 10 21:13:31 2015	(r286597)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	Mon Aug 10 21:13:59 2015	(r286598)
@@ -962,6 +962,16 @@ uint64_t zfs_crc64_table[256];
 #define	L2ARC_FEED_SECS		1		/* caching interval secs */
 #define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
 
+/*
+ * Used to distinguish headers that are being process by
+ * l2arc_write_buffers(), but have yet to be assigned to a l2arc disk
+ * address. This can happen when the header is added to the l2arc's list
+ * of buffers to write in the first stage of l2arc_write_buffers(), but
+ * has not yet been written out which happens in the second stage of
+ * l2arc_write_buffers().
+ */
+#define	L2ARC_ADDR_UNSET	((uint64_t)(-1))
+
 #define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
 #define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
 
@@ -1045,12 +1055,12 @@ struct l2arc_dev {
 	uint64_t		l2ad_hand;	/* next write location */
 	uint64_t		l2ad_start;	/* first addr on device */
 	uint64_t		l2ad_end;	/* last addr on device */
-	uint64_t		l2ad_evict;	/* last addr eviction reached */
 	boolean_t		l2ad_first;	/* first sweep through */
 	boolean_t		l2ad_writing;	/* currently writing */
 	kmutex_t		l2ad_mtx;	/* lock for buffer list */
 	list_t			l2ad_buflist;	/* buffer list */
 	list_node_t		l2ad_node;	/* device list node */
+	refcount_t		l2ad_alloc;	/* allocated bytes */
 };
 
 static list_t L2ARC_dev_list;			/* device list */
@@ -1422,6 +1432,7 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem
 	buf_hash_remove(hdr);
 
 	bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
+
 	if (new == hdr_full_cache) {
 		nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
 		/*
@@ -1465,6 +1476,20 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem
 
 	mutex_exit(&dev->l2ad_mtx);
 
+	/*
+	 * Since we're using the pointer address as the tag when
+	 * incrementing and decrementing the l2ad_alloc refcount, we
+	 * must remove the old pointer (that we're about to destroy) and
+	 * add the new pointer to the refcount. Otherwise we'd remove
+	 * the wrong pointer address when calling arc_hdr_destroy() later.
+	 */
+
+	(void) refcount_remove_many(&dev->l2ad_alloc,
+	    hdr->b_l2hdr.b_asize, hdr);
+
+	(void) refcount_add_many(&dev->l2ad_alloc,
+	    nhdr->b_l2hdr.b_asize, nhdr);
+
 	buf_discard_identity(hdr);
 	hdr->b_freeze_cksum = NULL;
 	kmem_cache_free(old, hdr);
@@ -2216,6 +2241,57 @@ arc_buf_destroy(arc_buf_t *buf, boolean_
 }
 
 static void
+arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
+{
+	l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
+	l2arc_dev_t *dev = l2hdr->b_dev;
+
+	ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
+	ASSERT(HDR_HAS_L2HDR(hdr));
+
+	list_remove(&dev->l2ad_buflist, hdr);
+
+	/*
+	 * We don't want to leak the b_tmp_cdata buffer that was
+	 * allocated in l2arc_write_buffers()
+	 */
+	arc_buf_l2_cdata_free(hdr);
+
+	/*
+	 * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then
+	 * this header is being processed by l2arc_write_buffers() (i.e.
+	 * it's in the first stage of l2arc_write_buffers()).
+	 * Re-affirming that truth here, just to serve as a reminder. If
+	 * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or
+	 * may not have its HDR_L2_WRITING flag set. (the write may have
+	 * completed, in which case HDR_L2_WRITING will be false and the
+	 * b_daddr field will point to the address of the buffer on disk).
+	 */
+	IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr));
+
+	/*
+	 * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with
+	 * l2arc_write_buffers(). Since we've just removed this header
+	 * from the l2arc buffer list, this header will never reach the
+	 * second stage of l2arc_write_buffers(), which increments the
+	 * accounting stats for this header. Thus, we must be careful
+	 * not to decrement them for this header either.
+	 */
+	if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) {
+		ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
+		ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+
+		vdev_space_update(dev->l2ad_vdev,
+		    -l2hdr->b_asize, 0, 0);
+
+		(void) refcount_remove_many(&dev->l2ad_alloc,
+		    l2hdr->b_asize, hdr);
+	}
+
+	hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
+}
+
+static void
 arc_hdr_destroy(arc_buf_hdr_t *hdr)
 {
 	if (HDR_HAS_L1HDR(hdr)) {
@@ -2228,31 +2304,29 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
 	ASSERT(!HDR_IN_HASH_TABLE(hdr));
 
 	if (HDR_HAS_L2HDR(hdr)) {
-		l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
-		boolean_t buflist_held = MUTEX_HELD(&l2hdr->b_dev->l2ad_mtx);
-
-		if (!buflist_held) {
-			mutex_enter(&l2hdr->b_dev->l2ad_mtx);
-			l2hdr = &hdr->b_l2hdr;
-		}
+		l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
+		boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
 
-		trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
-		    l2hdr->b_asize, 0);
-		list_remove(&l2hdr->b_dev->l2ad_buflist, hdr);
+		if (!buflist_held)
+			mutex_enter(&dev->l2ad_mtx);
 
 		/*
-		 * We don't want to leak the b_tmp_cdata buffer that was
-		 * allocated in l2arc_write_buffers()
-		 */
-		arc_buf_l2_cdata_free(hdr);
-
-		ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
-		ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
+		 * Even though we checked this conditional above, we
+		 * need to check this again now that we have the
+		 * l2ad_mtx. This is because we could be racing with
+		 * another thread calling l2arc_evict() which might have
+		 * destroyed this header's L2 portion as we were waiting
+		 * to acquire the l2ad_mtx. If that happens, we don't
+		 * want to re-destroy the header's L2 portion.
+		 */
+		if (HDR_HAS_L2HDR(hdr)) {
+			trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr,
+			    hdr->b_l2hdr.b_asize, 0);
+			arc_hdr_l2hdr_destroy(hdr);
+		}
 
 		if (!buflist_held)
-			mutex_exit(&l2hdr->b_dev->l2ad_mtx);
-
-		hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
+			mutex_exit(&dev->l2ad_mtx);
 	}
 
 	if (!BUF_EMPTY(hdr))
@@ -4271,23 +4345,23 @@ arc_release(arc_buf_t *buf, void *tag)
 	ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0);
 
 	if (HDR_HAS_L2HDR(hdr)) {
-		ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
-		ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
-
 		mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
-		trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev,
-		    hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0);
-		list_remove(&hdr->b_l2hdr.b_dev->l2ad_buflist, hdr);
 
 		/*
-		 * We don't want to leak the b_tmp_cdata buffer that was
-		 * allocated in l2arc_write_buffers()
+		 * We have to recheck this conditional again now that
+		 * we're holding the l2ad_mtx to prevent a race with
+		 * another thread which might be concurrently calling
+		 * l2arc_evict(). In that case, l2arc_evict() might have
+		 * destroyed the header's L2 portion as we were waiting
+		 * to acquire the l2ad_mtx.
 		 */
-		arc_buf_l2_cdata_free(hdr);
+		if (HDR_HAS_L2HDR(hdr)) {
+			trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev,
+			    hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0);
+			arc_hdr_l2hdr_destroy(hdr);
+		}
 
 		mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
-
-		hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
 	}
 
 	/*
@@ -5355,6 +5429,10 @@ l2arc_write_done(zio_t *zio)
 
 			ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
 			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+
+			bytes_dropped += hdr->b_l2hdr.b_asize;
+			(void) refcount_remove_many(&dev->l2ad_alloc,
+			    hdr->b_l2hdr.b_asize, hdr);
 		}
 
 		/*
@@ -5511,7 +5589,6 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t d
 	arc_buf_hdr_t *hdr, *hdr_prev;
 	kmutex_t *hash_lock;
 	uint64_t taddr;
-	int64_t bytes_evicted = 0;
 
 	buflist = &dev->l2ad_buflist;
 
@@ -5596,21 +5673,11 @@ top:
 				hdr->b_flags |= ARC_FLAG_L2_EVICTED;
 			}
 
-			/* Tell ARC this no longer exists in L2ARC. */
-			ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
-			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
-			hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
-			list_remove(buflist, hdr);
-
-			/* This may have been leftover after a failed write. */
-			hdr->b_flags &= ~ARC_FLAG_L2_WRITING;
+			arc_hdr_l2hdr_destroy(hdr);
 		}
 		mutex_exit(hash_lock);
 	}
 	mutex_exit(&dev->l2ad_mtx);
-
-	vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0);
-	dev->l2ad_evict = taddr;
 }
 
 /*
@@ -5762,6 +5829,29 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
 			hdr->b_l2hdr.b_asize = hdr->b_size;
 			hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data;
 
+			/*
+			 * Explicitly set the b_daddr field to a known
+			 * value which means "invalid address". This
+			 * enables us to differentiate which stage of
+			 * l2arc_write_buffers() the particular header
+			 * is in (e.g. this loop, or the one below).
+			 * ARC_FLAG_L2_WRITING is not enough to make
+			 * this distinction, and we need to know in
+			 * order to do proper l2arc vdev accounting in
+			 * arc_release() and arc_hdr_destroy().
+			 *
+			 * Note, we can't use a new flag to distinguish
+			 * the two stages because we don't hold the
+			 * header's hash_lock below, in the second stage
+			 * of this function. Thus, we can't simply
+			 * change the b_flags field to denote that the
+			 * IO has been sent. We can change the b_daddr
+			 * field of the L2 portion, though, since we'll
+			 * be holding the l2ad_mtx; which is why we're
+			 * using it to denote the header's state change.
+			 */
+			hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET;
+
 			buf_sz = hdr->b_size;
 			hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
 
@@ -5838,6 +5928,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
 		if (!L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr)))
 			hdr->b_l1hdr.b_tmp_cdata = NULL;
 
+		/*
+		 * We need to do this regardless if buf_sz is zero or
+		 * not, otherwise, when this l2hdr is evicted we'll
+		 * remove a reference that was never added.
+		 */
+		(void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr);
+
 		/* Compression may have squashed the buffer to zero length. */
 		if (buf_sz != 0) {
 			uint64_t buf_p_sz;
@@ -5852,6 +5949,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
 			(void) zio_nowait(wzio);
 
 			write_asize += buf_sz;
+
 			/*
 			 * Keep the clock hand suitably device-aligned.
 			 */
@@ -5876,7 +5974,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
 	 */
 	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
 		dev->l2ad_hand = dev->l2ad_start;
-		dev->l2ad_evict = dev->l2ad_start;
 		dev->l2ad_first = B_FALSE;
 	}
 
@@ -6183,7 +6280,6 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
 	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
 	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
 	adddev->l2ad_hand = adddev->l2ad_start;
-	adddev->l2ad_evict = adddev->l2ad_start;
 	adddev->l2ad_first = B_TRUE;
 	adddev->l2ad_writing = B_FALSE;
 
@@ -6196,6 +6292,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
 	    offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
 
 	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
+	refcount_create(&adddev->l2ad_alloc);
 
 	/*
 	 * Add device to global list
@@ -6241,6 +6338,7 @@ l2arc_remove_vdev(vdev_t *vd)
 	l2arc_evict(remdev, 0, B_TRUE);
 	list_destroy(&remdev->l2ad_buflist);
 	mutex_destroy(&remdev->l2ad_mtx);
+	refcount_destroy(&remdev->l2ad_alloc);
 	kmem_free(remdev, sizeof (l2arc_dev_t));
 }
 



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201508102113.t7ALDx0R042635>