Date: Mon, 10 Aug 2015 21:13:59 +0000 (UTC) From: Alexander Motin <mav@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r286598 - head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs Message-ID: <201508102113.t7ALDx0R042635@repo.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: mav Date: Mon Aug 10 21:13:59 2015 New Revision: 286598 URL: https://svnweb.freebsd.org/changeset/base/286598 Log: MFV 286597: 5701 zpool list reports incorrect "alloc" value for cache devices Reviewed by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: George Wilson <george@delphix.com> Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com> Approved by: Dan McDonald <danmcd@omniti.com> Author: Prakash Surya <prakash.surya@delphix.com> illumos/illumos-gate@a52fc310ba80fa3b2006110936198de7f828cd94 Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c ============================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c Mon Aug 10 21:13:31 2015 (r286597) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c Mon Aug 10 21:13:59 2015 (r286598) @@ -962,6 +962,16 @@ uint64_t zfs_crc64_table[256]; #define L2ARC_FEED_SECS 1 /* caching interval secs */ #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ +/* + * Used to distinguish headers that are being process by + * l2arc_write_buffers(), but have yet to be assigned to a l2arc disk + * address. This can happen when the header is added to the l2arc's list + * of buffers to write in the first stage of l2arc_write_buffers(), but + * has not yet been written out which happens in the second stage of + * l2arc_write_buffers(). + */ +#define L2ARC_ADDR_UNSET ((uint64_t)(-1)) + #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) @@ -1045,12 +1055,12 @@ struct l2arc_dev { uint64_t l2ad_hand; /* next write location */ uint64_t l2ad_start; /* first addr on device */ uint64_t l2ad_end; /* last addr on device */ - uint64_t l2ad_evict; /* last addr eviction reached */ boolean_t l2ad_first; /* first sweep through */ boolean_t l2ad_writing; /* currently writing */ kmutex_t l2ad_mtx; /* lock for buffer list */ list_t l2ad_buflist; /* buffer list */ list_node_t l2ad_node; /* device list node */ + refcount_t l2ad_alloc; /* allocated bytes */ }; static list_t L2ARC_dev_list; /* device list */ @@ -1422,6 +1432,7 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem buf_hash_remove(hdr); bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); + if (new == hdr_full_cache) { nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; /* @@ -1465,6 +1476,20 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem mutex_exit(&dev->l2ad_mtx); + /* + * Since we're using the pointer address as the tag when + * incrementing and decrementing the l2ad_alloc refcount, we + * must remove the old pointer (that we're about to destroy) and + * add the new pointer to the refcount. Otherwise we'd remove + * the wrong pointer address when calling arc_hdr_destroy() later. + */ + + (void) refcount_remove_many(&dev->l2ad_alloc, + hdr->b_l2hdr.b_asize, hdr); + + (void) refcount_add_many(&dev->l2ad_alloc, + nhdr->b_l2hdr.b_asize, nhdr); + buf_discard_identity(hdr); hdr->b_freeze_cksum = NULL; kmem_cache_free(old, hdr); @@ -2216,6 +2241,57 @@ arc_buf_destroy(arc_buf_t *buf, boolean_ } static void +arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) +{ + l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; + l2arc_dev_t *dev = l2hdr->b_dev; + + ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); + ASSERT(HDR_HAS_L2HDR(hdr)); + + list_remove(&dev->l2ad_buflist, hdr); + + /* + * We don't want to leak the b_tmp_cdata buffer that was + * allocated in l2arc_write_buffers() + */ + arc_buf_l2_cdata_free(hdr); + + /* + * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then + * this header is being processed by l2arc_write_buffers() (i.e. + * it's in the first stage of l2arc_write_buffers()). + * Re-affirming that truth here, just to serve as a reminder. If + * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or + * may not have its HDR_L2_WRITING flag set. (the write may have + * completed, in which case HDR_L2_WRITING will be false and the + * b_daddr field will point to the address of the buffer on disk). + */ + IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr)); + + /* + * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with + * l2arc_write_buffers(). Since we've just removed this header + * from the l2arc buffer list, this header will never reach the + * second stage of l2arc_write_buffers(), which increments the + * accounting stats for this header. Thus, we must be careful + * not to decrement them for this header either. + */ + if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) { + ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + + vdev_space_update(dev->l2ad_vdev, + -l2hdr->b_asize, 0, 0); + + (void) refcount_remove_many(&dev->l2ad_alloc, + l2hdr->b_asize, hdr); + } + + hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; +} + +static void arc_hdr_destroy(arc_buf_hdr_t *hdr) { if (HDR_HAS_L1HDR(hdr)) { @@ -2228,31 +2304,29 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) ASSERT(!HDR_IN_HASH_TABLE(hdr)); if (HDR_HAS_L2HDR(hdr)) { - l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; - boolean_t buflist_held = MUTEX_HELD(&l2hdr->b_dev->l2ad_mtx); - - if (!buflist_held) { - mutex_enter(&l2hdr->b_dev->l2ad_mtx); - l2hdr = &hdr->b_l2hdr; - } + l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; + boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); - trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr, - l2hdr->b_asize, 0); - list_remove(&l2hdr->b_dev->l2ad_buflist, hdr); + if (!buflist_held) + mutex_enter(&dev->l2ad_mtx); /* - * We don't want to leak the b_tmp_cdata buffer that was - * allocated in l2arc_write_buffers() - */ - arc_buf_l2_cdata_free(hdr); - - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); - ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); + * Even though we checked this conditional above, we + * need to check this again now that we have the + * l2ad_mtx. This is because we could be racing with + * another thread calling l2arc_evict() which might have + * destroyed this header's L2 portion as we were waiting + * to acquire the l2ad_mtx. If that happens, we don't + * want to re-destroy the header's L2 portion. + */ + if (HDR_HAS_L2HDR(hdr)) { + trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, + hdr->b_l2hdr.b_asize, 0); + arc_hdr_l2hdr_destroy(hdr); + } if (!buflist_held) - mutex_exit(&l2hdr->b_dev->l2ad_mtx); - - hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; + mutex_exit(&dev->l2ad_mtx); } if (!BUF_EMPTY(hdr)) @@ -4271,23 +4345,23 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0); if (HDR_HAS_L2HDR(hdr)) { - ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); - mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); - trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev, - hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0); - list_remove(&hdr->b_l2hdr.b_dev->l2ad_buflist, hdr); /* - * We don't want to leak the b_tmp_cdata buffer that was - * allocated in l2arc_write_buffers() + * We have to recheck this conditional again now that + * we're holding the l2ad_mtx to prevent a race with + * another thread which might be concurrently calling + * l2arc_evict(). In that case, l2arc_evict() might have + * destroyed the header's L2 portion as we were waiting + * to acquire the l2ad_mtx. */ - arc_buf_l2_cdata_free(hdr); + if (HDR_HAS_L2HDR(hdr)) { + trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev, + hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0); + arc_hdr_l2hdr_destroy(hdr); + } mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); - - hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; } /* @@ -5355,6 +5429,10 @@ l2arc_write_done(zio_t *zio) ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + + bytes_dropped += hdr->b_l2hdr.b_asize; + (void) refcount_remove_many(&dev->l2ad_alloc, + hdr->b_l2hdr.b_asize, hdr); } /* @@ -5511,7 +5589,6 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t d arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; - int64_t bytes_evicted = 0; buflist = &dev->l2ad_buflist; @@ -5596,21 +5673,11 @@ top: hdr->b_flags |= ARC_FLAG_L2_EVICTED; } - /* Tell ARC this no longer exists in L2ARC. */ - ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); - hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; - list_remove(buflist, hdr); - - /* This may have been leftover after a failed write. */ - hdr->b_flags &= ~ARC_FLAG_L2_WRITING; + arc_hdr_l2hdr_destroy(hdr); } mutex_exit(hash_lock); } mutex_exit(&dev->l2ad_mtx); - - vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0); - dev->l2ad_evict = taddr; } /* @@ -5762,6 +5829,29 @@ l2arc_write_buffers(spa_t *spa, l2arc_de hdr->b_l2hdr.b_asize = hdr->b_size; hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data; + /* + * Explicitly set the b_daddr field to a known + * value which means "invalid address". This + * enables us to differentiate which stage of + * l2arc_write_buffers() the particular header + * is in (e.g. this loop, or the one below). + * ARC_FLAG_L2_WRITING is not enough to make + * this distinction, and we need to know in + * order to do proper l2arc vdev accounting in + * arc_release() and arc_hdr_destroy(). + * + * Note, we can't use a new flag to distinguish + * the two stages because we don't hold the + * header's hash_lock below, in the second stage + * of this function. Thus, we can't simply + * change the b_flags field to denote that the + * IO has been sent. We can change the b_daddr + * field of the L2 portion, though, since we'll + * be holding the l2ad_mtx; which is why we're + * using it to denote the header's state change. + */ + hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET; + buf_sz = hdr->b_size; hdr->b_flags |= ARC_FLAG_HAS_L2HDR; @@ -5838,6 +5928,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_de if (!L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr))) hdr->b_l1hdr.b_tmp_cdata = NULL; + /* + * We need to do this regardless if buf_sz is zero or + * not, otherwise, when this l2hdr is evicted we'll + * remove a reference that was never added. + */ + (void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr); + /* Compression may have squashed the buffer to zero length. */ if (buf_sz != 0) { uint64_t buf_p_sz; @@ -5852,6 +5949,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_de (void) zio_nowait(wzio); write_asize += buf_sz; + /* * Keep the clock hand suitably device-aligned. */ @@ -5876,7 +5974,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_de */ if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { dev->l2ad_hand = dev->l2ad_start; - dev->l2ad_evict = dev->l2ad_start; dev->l2ad_first = B_FALSE; } @@ -6183,7 +6280,6 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) adddev->l2ad_start = VDEV_LABEL_START_SIZE; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); adddev->l2ad_hand = adddev->l2ad_start; - adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; @@ -6196,6 +6292,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); + refcount_create(&adddev->l2ad_alloc); /* * Add device to global list @@ -6241,6 +6338,7 @@ l2arc_remove_vdev(vdev_t *vd) l2arc_evict(remdev, 0, B_TRUE); list_destroy(&remdev->l2ad_buflist); mutex_destroy(&remdev->l2ad_mtx); + refcount_destroy(&remdev->l2ad_alloc); kmem_free(remdev, sizeof (l2arc_dev_t)); }
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201508102113.t7ALDx0R042635>