Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 21 Mar 2016 00:01:59 +0000 (UTC)
From:      Alexander Motin <mav@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-10@freebsd.org
Subject:   svn commit: r297102 - in stable/10: cddl/contrib/opensolaris/cmd/zfs sys/cddl/contrib/opensolaris/uts/common/fs/zfs sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys
Message-ID:  <201603210001.u2L01xTA029545@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: mav
Date: Mon Mar 21 00:01:59 2016
New Revision: 297102
URL: https://svnweb.freebsd.org/changeset/base/297102

Log:
  MFC r294815: MFV r294814: 6393 zfs receive a full send as a clone
  
  Reviewed by: Matthew Ahrens <mahrens@delphix.com>
  Reviewed by: Prakash Surya <prakash.surya@delphix.com>
  Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
  Approved by: Dan McDonald <danmcd@omniti.com>
  Author: Paul Dagnelie <pcd@delphix.com>
  
  illumos/illumos-gate@68ecb2ec930c4b0f00acaf8e0abb2b19c4b8b76f
  
  This allows doing a full (non-incremental) send and receiving it as a clone
  of an existing dataset. It can leverage nopwrite to share blocks with the
  origin. This can be used to change the relationship of datasets on the
  target. For example, maybe on the source you have:
  
  A ---- B ---- C
  
  And you have sent to the target a full send of B, and the incremental B->C:
  
  B ---- C
  
  You later realize that you want to have A on the target. You will have to
  do a full send of A, but nopwrite can save you space on the target if you
  receive it as a clone of B, assuming that A and B have some blocks in
  common:
  
  B ---- C
   \
    A

Modified:
  stable/10/cddl/contrib/opensolaris/cmd/zfs/zfs.8
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
Directory Properties:
  stable/10/   (props changed)

Modified: stable/10/cddl/contrib/opensolaris/cmd/zfs/zfs.8
==============================================================================
--- stable/10/cddl/contrib/opensolaris/cmd/zfs/zfs.8	Sun Mar 20 23:58:44 2016	(r297101)
+++ stable/10/cddl/contrib/opensolaris/cmd/zfs/zfs.8	Mon Mar 21 00:01:59 2016	(r297102)
@@ -2839,8 +2839,11 @@ Do not actually receive the stream. This
 option to verify the name the receive operation would use.
 .It Fl o Sy origin Ns = Ns Ar snapshot
 Forces the stream to be received as a clone of the given snapshot.
-This is only valid if the stream is an incremental stream whose source
-is the same as the provided origin.
+If the stream is a full send stream, this will create the filesystem
+described by the stream as a clone of the specified snapshot. Which
+snapshot was specified will not affect the success or failure of the
+receive, as long as the snapshot does exist.  If the stream is an
+incremental send stream, all the normal verification will be performed.
 .It Fl F
 Force a rollback of the file system to the most recent snapshot before
 performing the receive operation. If receiving an incremental replication

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c	Sun Mar 20 23:58:44 2016	(r297101)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c	Mon Mar 21 00:01:59 2016	(r297102)
@@ -158,6 +158,14 @@ dump_record(dmu_sendarg_t *dsp, void *pa
 	return (0);
 }
 
+/*
+ * Fill in the drr_free struct, or perform aggregation if the previous record is
+ * also a free record, and the two are adjacent.
+ *
+ * Note that we send free records even for a full send, because we want to be
+ * able to receive a full send as a clone, which requires a list of all the free
+ * and freeobject records that were generated on the source.
+ */
 static int
 dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
     uint64_t length)
@@ -181,15 +189,6 @@ dump_free(dmu_sendarg_t *dsp, uint64_t o
 	    (object == dsp->dsa_last_data_object &&
 	    offset > dsp->dsa_last_data_offset));
 
-	/*
-	 * If we are doing a non-incremental send, then there can't
-	 * be any data in the dataset we're receiving into.  Therefore
-	 * a free record would simply be a no-op.  Save space by not
-	 * sending it to begin with.
-	 */
-	if (!dsp->dsa_incremental)
-		return (0);
-
 	if (length != -1ULL && offset + length < offset)
 		length = -1ULL;
 
@@ -368,10 +367,6 @@ dump_freeobjects(dmu_sendarg_t *dsp, uin
 {
 	struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);
 
-	/* See comment in dump_free(). */
-	if (!dsp->dsa_incremental)
-		return (0);
-
 	/*
 	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
 	 * push it out, since free block aggregation can only be done for
@@ -776,6 +771,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp,
 	drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
 	if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET)
 		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
+	drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS;
 
 	if (ancestor_zb != NULL) {
 		drr->drr_u.drr_begin.drr_fromguid =
@@ -799,7 +795,6 @@ dmu_send_impl(void *tag, dsl_pool_t *dp,
 	dsp->dsa_off = off;
 	dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
 	dsp->dsa_pending_op = PENDING_NONE;
-	dsp->dsa_incremental = (ancestor_zb != NULL);
 	dsp->dsa_featureflags = featureflags;
 	dsp->dsa_resume_object = resumeobj;
 	dsp->dsa_resume_offset = resumeoff;
@@ -1321,7 +1316,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t
 		/* target fs already exists; recv into temp clone */
 
 		/* Can't recv a clone into an existing fs */
-		if (flags & DRR_FLAG_CLONE) {
+		if (flags & DRR_FLAG_CLONE || drba->drba_origin) {
 			dsl_dataset_rele(ds, FTAG);
 			return (SET_ERROR(EINVAL));
 		}
@@ -1340,6 +1335,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t
 		    drba->drba_origin))
 			return (SET_ERROR(ENOENT));
 
+		/*
+		 * If we're receiving a full send as a clone, and it doesn't
+		 * contain all the necessary free records and freeobject
+		 * records, reject it.
+		 */
+		if (fromguid == 0 && drba->drba_origin &&
+		    !(flags & DRR_FLAG_FREERECORDS))
+			return (SET_ERROR(EINVAL));
+
 		/* Open the parent of tofs */
 		ASSERT3U(strlen(tofs), <, MAXNAMELEN);
 		(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
@@ -1379,7 +1383,8 @@ dmu_recv_begin_check(void *arg, dmu_tx_t
 				dsl_dataset_rele(ds, FTAG);
 				return (SET_ERROR(EINVAL));
 			}
-			if (dsl_dataset_phys(origin)->ds_guid != fromguid) {
+			if (dsl_dataset_phys(origin)->ds_guid != fromguid &&
+			    fromguid != 0) {
 				dsl_dataset_rele(origin, FTAG);
 				dsl_dataset_rele(ds, FTAG);
 				return (SET_ERROR(ENODEV));
@@ -1709,6 +1714,20 @@ struct receive_writer_arg {
 	uint64_t bytes_read; /* bytes read when current record created */
 };
 
+struct objlist {
+	list_t list; /* List of struct receive_objnode. */
+	/*
+	 * Last object looked up. Used to assert that objects are being looked
+	 * up in ascending order.
+	 */
+	uint64_t last_lookup;
+};
+
+struct receive_objnode {
+	list_node_t node;
+	uint64_t object;
+};
+
 struct receive_arg  {
 	objset_t *os;
 	kthread_t *td;
@@ -1727,12 +1746,7 @@ struct receive_arg  {
 	int err;
 	boolean_t byteswap;
 	/* Sorted list of objects not to issue prefetches for. */
-	list_t ignore_obj_list;
-};
-
-struct receive_ign_obj_node {
-	list_node_t node;
-	uint64_t object;
+	struct objlist ignore_objlist;
 };
 
 typedef struct guid_map_entry {
@@ -2068,13 +2082,14 @@ receive_freeobjects(struct receive_write
     struct drr_freeobjects *drrfo)
 {
 	uint64_t obj;
+	int next_err = 0;
 
 	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
 		return (SET_ERROR(EINVAL));
 
 	for (obj = drrfo->drr_firstobj;
-	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
-	    (void) dmu_object_next(rwa->os, &obj, FALSE, 0)) {
+	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0;
+	    next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) {
 		int err;
 
 		if (dmu_object_info(rwa->os, obj, NULL) != 0)
@@ -2084,7 +2099,8 @@ receive_freeobjects(struct receive_write
 		if (err != 0)
 			return (err);
 	}
-
+	if (next_err != ESRCH)
+		return (next_err);
 	return (0);
 }
 
@@ -2414,6 +2430,66 @@ receive_read_payload_and_next_header(str
 	return (0);
 }
 
+static void
+objlist_create(struct objlist *list)
+{
+	list_create(&list->list, sizeof (struct receive_objnode),
+	    offsetof(struct receive_objnode, node));
+	list->last_lookup = 0;
+}
+
+static void
+objlist_destroy(struct objlist *list)
+{
+	for (struct receive_objnode *n = list_remove_head(&list->list);
+	    n != NULL; n = list_remove_head(&list->list)) {
+		kmem_free(n, sizeof (*n));
+	}
+	list_destroy(&list->list);
+}
+
+/*
+ * This function looks through the objlist to see if the specified object number
+ * is contained in the objlist.  In the process, it will remove all object
+ * numbers in the list that are smaller than the specified object number.  Thus,
+ * any lookup of an object number smaller than a previously looked up object
+ * number will always return false; therefore, all lookups should be done in
+ * ascending order.
+ */
+static boolean_t
+objlist_exists(struct objlist *list, uint64_t object)
+{
+	struct receive_objnode *node = list_head(&list->list);
+	ASSERT3U(object, >=, list->last_lookup);
+	list->last_lookup = object;
+	while (node != NULL && node->object < object) {
+		VERIFY3P(node, ==, list_remove_head(&list->list));
+		kmem_free(node, sizeof (*node));
+		node = list_head(&list->list);
+	}
+	return (node != NULL && node->object == object);
+}
+
+/*
+ * The objlist is a list of object numbers stored in ascending order.  However,
+ * the insertion of new object numbers does not seek out the correct location to
+ * store a new object number; instead, it appends it to the list for simplicity.
+ * Thus, any users must take care to only insert new object numbers in ascending
+ * order.
+ */
+static void
+objlist_insert(struct objlist *list, uint64_t object)
+{
+	struct receive_objnode *node = kmem_zalloc(sizeof (*node), KM_SLEEP);
+	node->object = object;
+#ifdef ZFS_DEBUG
+	struct receive_objnode *last_object = list_tail(&list->list);
+	uint64_t last_objnum = (last_object != NULL ? last_object->object : 0);
+	ASSERT3U(node->object, >, last_objnum);
+#endif
+	list_insert_tail(&list->list, node);
+}
+
 /*
  * Issue the prefetch reads for any necessary indirect blocks.
  *
@@ -2436,13 +2512,7 @@ static void
 receive_read_prefetch(struct receive_arg *ra,
     uint64_t object, uint64_t offset, uint64_t length)
 {
-	struct receive_ign_obj_node *node = list_head(&ra->ignore_obj_list);
-	while (node != NULL && node->object < object) {
-		VERIFY3P(node, ==, list_remove_head(&ra->ignore_obj_list));
-		kmem_free(node, sizeof (*node));
-		node = list_head(&ra->ignore_obj_list);
-	}
-	if (node == NULL || node->object > object) {
+	if (!objlist_exists(&ra->ignore_objlist, object)) {
 		dmu_prefetch(ra->os, object, 1, offset, length,
 		    ZIO_PRIORITY_SYNC_READ);
 	}
@@ -2475,18 +2545,7 @@ receive_read_record(struct receive_arg *
 		 */
 		if (err == ENOENT ||
 		    (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
-			struct receive_ign_obj_node *node =
-			    kmem_zalloc(sizeof (*node),
-			    KM_SLEEP);
-			node->object = drro->drr_object;
-#ifdef ZFS_DEBUG
-			struct receive_ign_obj_node *last_object =
-			    list_tail(&ra->ignore_obj_list);
-			uint64_t last_objnum = (last_object != NULL ?
-			    last_object->object : 0);
-			ASSERT3U(node->object, >, last_objnum);
-#endif
-			list_insert_tail(&ra->ignore_obj_list, node);
+			objlist_insert(&ra->ignore_objlist, drro->drr_object);
 			err = 0;
 		}
 		return (err);
@@ -2704,7 +2763,6 @@ resume_check(struct receive_arg *ra, nvl
 	return (0);
 }
 
-
 /*
  * Read in the stream's records, one by one, and apply them to the pool.  There
  * are two threads involved; the thread that calls this function will spin up a
@@ -2739,8 +2797,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, 
 		    sizeof (ra.bytes_read), 1, &ra.bytes_read);
 	}
 
-	list_create(&ra.ignore_obj_list, sizeof (struct receive_ign_obj_node),
-	    offsetof(struct receive_ign_obj_node, node));
+	objlist_create(&ra.ignore_objlist);
 
 	/* these were verified in dmu_recv_begin */
 	ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
@@ -2894,12 +2951,7 @@ out:
 	}
 
 	*voffp = ra.voff;
-	for (struct receive_ign_obj_node *n =
-	    list_remove_head(&ra.ignore_obj_list); n != NULL;
-	    n = list_remove_head(&ra.ignore_obj_list)) {
-		kmem_free(n, sizeof (*n));
-	}
-	list_destroy(&ra.ignore_obj_list);
+	objlist_destroy(&ra.ignore_objlist);
 	return (err);
 }
 

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h	Sun Mar 20 23:58:44 2016	(r297101)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h	Mon Mar 21 00:01:59 2016	(r297102)
@@ -25,7 +25,7 @@
 /*
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  * Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved.
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_DMU_IMPL_H
@@ -296,7 +296,6 @@ typedef struct dmu_sendarg {
 	uint64_t dsa_toguid;
 	int dsa_err;
 	dmu_pendop_t dsa_pending_op;
-	boolean_t dsa_incremental;
 	uint64_t dsa_featureflags;
 	uint64_t dsa_last_data_object;
 	uint64_t dsa_last_data_offset;

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h	Sun Mar 20 23:58:44 2016	(r297101)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h	Mon Mar 21 00:01:59 2016	(r297102)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_ZFS_IOCTL_H
@@ -126,6 +126,16 @@ typedef enum dmu_send_resume_token_versi
 
 #define	DRR_FLAG_CLONE		(1<<0)
 #define	DRR_FLAG_CI_DATA	(1<<1)
+/*
+ * This send stream, if it is a full send, includes the FREE and FREEOBJECT
+ * records that are created by the sending process.  This means that the send
+ * stream can be received as a clone, even though it is not an incremental.
+ * This is not implemented as a feature flag, because the receiving side does
+ * not need to have implemented it to receive this stream; it is fully backwards
+ * compatible.  We need a flag, though, because full send streams without it
+ * cannot necessarily be received as a clone correctly.
+ */
+#define	DRR_FLAG_FREERECORDS	(1<<2)
 
 /*
  * flags in the drr_checksumflags field in the DRR_WRITE and



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201603210001.u2L01xTA029545>