Date:      Wed, 1 Feb 2012 14:06:33 -0700
From:      "Justin T. Gibbs" <gibbs@scsiguy.com>
To:        freebsd-xen@freebsd.org
Subject:   [CFT][PATCH] - Rationalize FreeBSD multi-page ring extensions with those from other vendors
Message-ID:  <8FFE3850-7668-48C2-90C1-525213193A33@scsiguy.com>


I've spent some time documenting all of the Xen blkif extensions that are
out in the wild and have modified the extensions in FreeBSD so that it is
possible to make a fully interoperable driver.  Although I have performed
some of my own testing on different Xen Dom0s, I'm looking for additional
testers before I push this into -current and merge it down into the -stable
branches.  I'm most interested in Amazon EC2 testing since I know that not
all of their nodes run exactly the same software.

Feedback always welcome.

Thanks,
Justin


[Attachment: blkif.diffs]

diff -x .svn -ur sys/dev/xen/blkback/blkback.c /usr/home/justing/perforce/SpectraBSD/head/sys/dev/xen/blkback/blkback.c
--- sys/dev/xen/blkback/blkback.c	2012-01-31 17:31:44.383114171 -0700
+++ /usr/home/justing/perforce/SpectraBSD/head/sys/dev/xen/blkback/blkback.c	2012-02-01 12:55:39.958114726 -0700
@@ -40,6 +40,8 @@
  *        a FreeBSD domain to other domains.
  */
 
+#include "opt_kdtrace.h"
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
@@ -63,6 +65,7 @@
 #include <sys/mount.h>
 #include <sys/sysctl.h>
 #include <sys/bitstring.h>
+#include <sys/sdt.h>
 
 #include <geom/geom.h>
 
@@ -980,9 +983,10 @@
 static uint8_t *
 xbb_get_kva(struct xbb_softc *xbb, int nr_pages)
 {
-	intptr_t first_clear, num_clear;
+	intptr_t first_clear;
+	intptr_t num_clear;
 	uint8_t *free_kva;
-	int i;
+	int      i;
 
 	KASSERT(nr_pages != 0, ("xbb_get_kva of zero length"));
 
@@ -1681,19 +1685,19 @@
 			req_ring_idx++;
 			switch (xbb->abi) {
 			case BLKIF_PROTOCOL_NATIVE:
-				sg = BLKRING_GET_SG_REQUEST(&xbb->rings.native,
-							    req_ring_idx);
+				sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.native,
+							   req_ring_idx);
 				break;
 			case BLKIF_PROTOCOL_X86_32:
 			{
-				sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_32,
-							    req_ring_idx);
+				sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_32,
+							   req_ring_idx);
 				break;
 			}
 			case BLKIF_PROTOCOL_X86_64:
 			{
-				sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_64,
-							    req_ring_idx);
+				sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_64,
+							   req_ring_idx);
 				break;
 			}
 			default:
@@ -1817,8 +1821,8 @@
 	struct xbb_xen_reqlist *reqlist;
 
 
-	xbb	      = (struct xbb_softc *)context;
-	rings	      = &xbb->rings;
+	xbb   = (struct xbb_softc *)context;
+	rings = &xbb->rings;
 
 	/*
 	 * Work gather and dispatch loop.  Note that we have a bias here
@@ -2032,6 +2036,13 @@
 	taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 
 }
 
+SDT_PROVIDER_DEFINE(xbb);
+SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, flush, "int");
+SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, read, "int", "uint64_t",
+		  "uint64_t");
+SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, write, "int",
+		  "uint64_t", "uint64_t");
+
 /*----------------------------- Backend Handlers -----------------------------*/
 /**
  * Backend handler for character device access.
@@ -2087,6 +2098,9 @@
 
 		nreq->pendcnt	 = 1;
 
+		SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush,
+			   device_get_unit(xbb->dev));
+
 		(*dev_data->csw->d_strategy)(bio);
 
 		return (0);
@@ -2181,6 +2195,17 @@
 			       bios[bio_idx]->bio_bcount);
 		}
 #endif
+		if (operation == BIO_READ) {
+			SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read,
+				   device_get_unit(xbb->dev),
+				   bios[bio_idx]->bio_offset,
+				   bios[bio_idx]->bio_length);
+		} else if (operation == BIO_WRITE) {
+			SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write,
+				   device_get_unit(xbb->dev),
+				   bios[bio_idx]->bio_offset,
+				   bios[bio_idx]->bio_length);
+		}
 		(*dev_data->csw->d_strategy)(bios[bio_idx]);
 	}
 
@@ -2193,6 +2218,12 @@
 	return (error);
 }
 
+SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, flush, "int");
+SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, read, "int", "uint64_t",
+		  "uint64_t");
+SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, write, "int",
+		  "uint64_t", "uint64_t");
+
 /**
  * Backend handler for file access.
  *
@@ -2237,6 +2268,9 @@
 	case BIO_FLUSH: {
 		struct mount *mountpoint;
 
+		SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush,
+			   device_get_unit(xbb->dev));
+
 		vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount);
 
 		(void) vn_start_write(xbb->vn, &mountpoint, V_WAIT);
@@ -2336,6 +2370,10 @@
 	switch (operation) {
 	case BIO_READ:
 
+		SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read,
+			   device_get_unit(xbb->dev), xuio.uio_offset,
+			   xuio.uio_resid);
+
 		vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
 
 		/*
@@ -2366,6 +2404,10 @@
 	case BIO_WRITE: {
 		struct mount *mountpoint;
 
+		SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write,
+			   device_get_unit(xbb->dev), xuio.uio_offset,
+			   xuio.uio_resid);
+
 		(void)vn_start_write(xbb->vn, &mountpoint, V_WAIT);
 
 		vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
@@ -3028,6 +3070,8 @@
 	const char *otherend_path;
 	int	    error;
 	u_int	    ring_idx;
+	u_int	    ring_page_order;
+	size_t	    ring_size;
 
 	otherend_path = xenbus_get_otherend_path(xbb->dev);
 
@@ -3042,16 +3086,13 @@
 	/*
 	 * Mandatory data (used in all versions of the protocol) first.
 	 */
-	error = xs_gather(XST_NIL, otherend_path,
-			  "ring-ref", "%" PRIu32,
-			  &xbb->ring_config.ring_ref[0],
-			  "event-channel", "%" PRIu32,
-			  &xbb->ring_config.evtchn,
-			  NULL);
+	error = xs_scanf(XST_NIL, otherend_path,
+			 "event-channel", NULL, "%" PRIu32,
+			 &xbb->ring_config.evtchn);
 	if (error != 0) {
 		xenbus_dev_fatal(xbb->dev, error,
-				 "Unable to retrieve ring information from "
-				 "frontend %s.  Unable to connect.",
+				 "Unable to retrieve event-channel information "
+				 "from frontend %s.  Unable to connect.",
 				 xenbus_get_otherend_path(xbb->dev));
 		return (error);
 	}
@@ -3065,10 +3106,19 @@
 	 *       we must use independent calls in order to guarantee
 	 *       we don't miss information in a sparsely populated front-end
 	 *       tree.
+	 * \note xs_scanf() does not update variables for unmatched
+	 *       fields.
 	 */
+	ring_page_order = 0;
+	(void)xs_scanf(XST_NIL, otherend_path,
+		       "ring-page-order", NULL, "%u",
+		       &ring_page_order);
+	xbb->ring_config.ring_pages = 1 << ring_page_order;
 	(void)xs_scanf(XST_NIL, otherend_path,
 		       "ring-pages", NULL, "%u",
 		       &xbb->ring_config.ring_pages);
+	ring_size = PAGE_SIZE * xbb->ring_config.ring_pages;
+	xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size);
 
 	(void)xs_scanf(XST_NIL, otherend_path,
 		       "max-requests", NULL, "%u",
@@ -3116,22 +3166,39 @@
 		return (EINVAL);
 	}
 
-	/* If using a multi-page ring, pull in the remaining references. */
-	for (ring_idx = 1; ring_idx < xbb->ring_config.ring_pages; ring_idx++) {
-		char ring_ref_name[]= "ring_refXX";
-
-		snprintf(ring_ref_name, sizeof(ring_ref_name),
-			 "ring-ref%u", ring_idx);
-		error = xs_scanf(XST_NIL, otherend_path,
-				 ring_ref_name, NULL, "%" PRIu32,
-			         &xbb->ring_config.ring_ref[ring_idx]);
+	if (xbb->ring_config.ring_pages	== 1) {
+		error = xs_gather(XST_NIL, otherend_path,
+				  "ring-ref", "%" PRIu32,
+				  &xbb->ring_config.ring_ref[0],
+				  NULL);
 		if (error != 0) {
 			xenbus_dev_fatal(xbb->dev, error,
-					 "Failed to retriev grant reference "
-					 "for page %u of shared ring.  Unable "
-					 "to connect.", ring_idx);
+					 "Unable to retrieve ring information "
+					 "from frontend %s.  Unable to "
+					 "connect.",
+					 xenbus_get_otherend_path(xbb->dev));
 			return (error);
 		}
+	} else {
+		/* Multi-page ring format. */
+		for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages;
+		     ring_idx++) {
+			char ring_ref_name[] = "ring-refXX";
+
+			snprintf(ring_ref_name, sizeof(ring_ref_name),
+				 "ring-ref%u", ring_idx);
+			error = xs_scanf(XST_NIL, otherend_path,
+					 ring_ref_name, NULL, "%" PRIu32,
+					 &xbb->ring_config.ring_ref[ring_idx]);
+			if (error != 0) {
+				xenbus_dev_fatal(xbb->dev, error,
+						 "Failed to retrieve grant "
+						 "reference for page %u of "
+						 "shared ring.  Unable "
+						 "to connect.", ring_idx);
+				return (error);
+			}
+		}
 	}
 
 	error = xs_gather(XST_NIL, otherend_path,
@@ -3197,8 +3264,8 @@
 static int
 xbb_alloc_request_lists(struct xbb_softc *xbb)
 {
-	int i;
 	struct xbb_xen_reqlist *reqlist;
+	int			i;
 
 	/*
 	 * If no requests can be merged, we need 1 request list per
@@ -3318,7 +3385,7 @@
 static void
 xbb_connect(struct xbb_softc *xbb)
 {
-	int		      error;
+	int error;
 
 	if (xenbus_get_state(xbb->dev) == XenbusStateConnected)
 		return;
@@ -3399,7 +3466,8 @@
 static int
 xbb_shutdown(struct xbb_softc *xbb)
 {
-	int error;
+	XenbusState frontState;
+	int	    error;
 
 	DPRINTF("\n");
 
@@ -3413,6 +3481,20 @@
 	if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0)
 		return (EAGAIN);
 
+	xbb->flags |= XBBF_IN_SHUTDOWN;
+	mtx_unlock(&xbb->lock);
+
+	if (xenbus_get_state(xbb->dev) < XenbusStateClosing)
+		xenbus_set_state(xbb->dev, XenbusStateClosing);
+
+	frontState = xenbus_get_otherend_state(xbb->dev);
+	mtx_lock(&xbb->lock);
+	xbb->flags &= ~XBBF_IN_SHUTDOWN;
+
+	/* The front can submit I/O until entering the closed state. */
+	if (frontState < XenbusStateClosed)
+		return (EAGAIN);
+
 	DPRINTF("\n");
 
 	/* Indicate shutdown is in progress. */
@@ -3434,19 +3516,6 @@
 
 	DPRINTF("\n");
 
-	/*
-	 * Before unlocking mutex, set this flag to prevent other threads from
-	 * getting into this function
-	 */
-	xbb->flags |= XBBF_IN_SHUTDOWN;
-	mtx_unlock(&xbb->lock);
-
-	if (xenbus_get_state(xbb->dev) < XenbusStateClosing)
-		xenbus_set_state(xbb->dev, XenbusStateClosing);
-
-	mtx_lock(&xbb->lock);
-	xbb->flags &= ~XBBF_IN_SHUTDOWN;
-
 	/* Indicate to xbb_detach() that is it safe to proceed. */
 	wakeup(xbb);
 
@@ -3573,6 +3642,11 @@
 		        "max_request_segments", CTLFLAG_RD,
 		        &xbb->max_request_segments, 0,
 		        "maximum number of pages per requests (negotiated)");
+
+	SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
+		        "ring_pages", CTLFLAG_RD,
+		        &xbb->ring_config.ring_pages, 0,
+		        "communication channel pages (negotiated)");
 }
 
 /**
@@ -3587,6 +3661,7 @@
 {
 	struct xbb_softc	*xbb;
 	int			 error;
+	u_int			 max_ring_page_order;
 
 	DPRINTF("Attaching to %s\n", xenbus_get_node(dev));
 
@@ -3621,6 +3696,10 @@
 		return (error);
 	}
 
+	/*
+	 * Amazon EC2 client compatibility.  They refer to max-ring-pages
+	 * instead of max-ring-page-order.
+	 */
 	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
 			  "max-ring-pages", "%zu", XBB_MAX_RING_PAGES);
 	if (error) {
@@ -3629,6 +3708,15 @@
 		return (error);
 	}
 
+	max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1;
+	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+			  "max-ring-page-order", "%u", max_ring_page_order);
+	if (error) {
+		xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order",
+				  xenbus_get_node(xbb->dev));
+		return (error);
+	}
+
 	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
 			  "max-requests", "%u", XBB_MAX_REQUESTS);
 	if (error) {
@@ -3862,12 +3950,16 @@
 		xbb_connect(xbb);
 		break;
 	case XenbusStateClosing:
+		/*
+		 * Frontend has acknowledged Closing request.
+		 * Wait for Closed state.
+		 */
+		break;
 	case XenbusStateClosed:
 		mtx_lock(&xbb->lock);
 		xbb_shutdown(xbb);
 		mtx_unlock(&xbb->lock);
-		if (frontend_state == XenbusStateClosed)
-			xenbus_set_state(xbb->dev, XenbusStateClosed);
+		xenbus_set_state(xbb->dev, XenbusStateClosed);
 		break;
 	default:
 		xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend",
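
The two ring-size advertisement schemes reconciled above differ only in
encoding: the Citrix-derived "max-ring-page-order"/"ring-page-order" nodes
carry lb(pages), while the RedHat/Amazon-derived "max-ring-pages"/"ring-pages"
nodes carry the page count itself.  A minimal userland sketch (not part of
the patch) of the round trip between the two encodings:

    #include <stdio.h>

    /*
     * lb() encoding of a power-of-2 page count, e.g. 8 pages -> order 3.
     * The same computation appears above as flsl(XBB_MAX_RING_PAGES) - 1.
     */
    static unsigned int
    pages_to_order(unsigned int pages)
    {
        unsigned int order = 0;

        while ((1u << order) < pages)
            order++;
        return (order);
    }

    int
    main(void)
    {
        unsigned int order;

        for (order = 0; order <= 4; order++) {
            unsigned int pages = 1u << order;

            printf("ring-page-order %u <-> ring-pages %u (round trip %u)\n",
                order, pages, pages_to_order(pages));
        }
        return (0);
    }
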
diff -x .svn -ur sys/dev/xen/blkfront/blkfront.c /usr/home/justing/perforce/SpectraBSD/head/sys/dev/xen/blkfront/blkfront.c
--- sys/dev/xen/blkfront/blkfront.c	2011-12-09 10:56:37.165520312 -0700
+++ /usr/home/justing/perforce/SpectraBSD/head/sys/dev/xen/blkfront/blkfront.c	2012-02-01 12:59:37.312115640 -0700
@@ -226,7 +226,7 @@
 	sc->xb_disk->d_sectorsize = sector_size;
 
 	sc->xb_disk->d_mediasize = sectors * sector_size;
-	sc->xb_disk->d_maxsize = sc->max_request_size;
+	sc->xb_disk->d_maxsize = sc->max_request_size - PAGE_SIZE;
 	sc->xb_disk->d_flags = 0;
 	disk_create(sc->xb_disk, DISK_VERSION_00);
 
@@ -501,6 +501,7 @@
 {
 	const char *otherend_path;
 	const char *node_path;
+	uint32_t max_ring_page_order;
 	int error;
 	int i;
 
@@ -513,6 +514,7 @@
 	 * Protocol defaults valid even if negotiation for a
 	 * setting fails.
 	 */
+	max_ring_page_order = 0;
 	sc->ring_pages = 1;
 	sc->max_requests = BLKIF_MAX_RING_REQUESTS(PAGE_SIZE);
 	sc->max_request_segments = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK;
@@ -526,12 +528,22 @@
 	 *       we must use independent calls in order to guarantee
 	 *       we don't miss information in a sparsely populated back-end
 	 *       tree.
+	 * \note xs_scanf() does not update variables for unmatched
+	 *	 fields.
 	 */
 	otherend_path = xenbus_get_otherend_path(sc->xb_dev);
 	node_path = xenbus_get_node(sc->xb_dev);
+
+	/* Support both backend schemes for relaying ring page limits. */
+	(void)xs_scanf(XST_NIL, otherend_path,
+		       "max-ring-page-order", NULL, "%" PRIu32,
+		       &max_ring_page_order);
+	sc->ring_pages = 1 << max_ring_page_order;
 	(void)xs_scanf(XST_NIL, otherend_path,
 		       "max-ring-pages", NULL, "%" PRIu32,
 		       &sc->ring_pages);
+	if (sc->ring_pages < 1)
+		sc->ring_pages = 1;
 
 	(void)xs_scanf(XST_NIL, otherend_path,
 		       "max-requests", NULL, "%" PRIu32,
@@ -552,6 +564,16 @@
 		sc->ring_pages = XBF_MAX_RING_PAGES;
 	}
 
+	if (powerof2(sc->ring_pages) == 0) {
+		u_int new_page_limit;
+
+		new_page_limit = 0x01 << (fls(sc->ring_pages) - 1);
+		device_printf(sc->xb_dev, "Back-end specified ring-pages of "
+			      "%u is not a power of 2. Limited to %u.\n",
+			      sc->ring_pages, new_page_limit);
+		sc->ring_pages = new_page_limit;
+	}
+
 	if (sc->max_requests > XBF_MAX_REQUESTS) {
 		device_printf(sc->xb_dev, "Back-end specified max_requests of "
 			      "%u limited to front-end limit of %u.\n",
@@ -625,6 +647,7 @@
 	if (setup_blkring(sc) != 0)
 		return;
 
+	/* Support both backend schemes for relaying ring page limits. */
 	error = xs_printf(XST_NIL, node_path,
 			 "ring-pages","%u", sc->ring_pages);
 	if (error) {
@@ -633,6 +656,14 @@
 				 node_path);
 		return;
 	}
+	error = xs_printf(XST_NIL, node_path,
+			 "ring-page-order","%u", fls(sc->ring_pages) - 1);
+	if (error) {
+		xenbus_dev_fatal(sc->xb_dev, error,
+				 "writing %s/ring-page-order",
+				 node_path);
+		return;
+	}
 
 	error = xs_printf(XST_NIL, node_path,
 			 "max-requests","%u", sc->max_requests);
@@ -795,7 +826,7 @@
 	unsigned int binfo;
 	int err, feature_barrier;
 
-        if( (sc->connected == BLKIF_STATE_CONNECTED) || 
+	if( (sc->connected == BLKIF_STATE_CONNECTED) || 
 	    (sc->connected == BLKIF_STATE_SUSPENDED) )
 		return;
 
@@ -923,15 +954,13 @@
 		return (ENXIO);
 	sc->xb_flags &= ~XB_OPEN;
 	if (--(sc->users) == 0) {
-		/* Check whether we have been instructed to close.  We will
-		   have ignored this request initially, as the device was
-		   still mounted. */
-		device_t dev = sc->xb_dev;
-		XenbusState state =
-			xenbus_read_driver_state(xenbus_get_otherend_path(dev));
-
-		if (state == XenbusStateClosing)
-			blkfront_closing(dev);
+		/*
+		 * Check whether we have been instructed to close.  We will
+		 * have ignored this request initially, as the device was
+		 * still mounted.
+		 */
+		if (xenbus_get_otherend_state(sc->xb_dev) == XenbusStateClosing)
+			blkfront_closing(sc->xb_dev);
 	}
 	return (0);
 }
@@ -1033,7 +1062,7 @@
 	struct xb_command *cm;
 	blkif_request_t	*ring_req;
 	struct blkif_request_segment *sg;
-        struct blkif_request_segment *last_block_sg;
+	struct blkif_request_segment *last_block_sg;
 	grant_ref_t *sg_ref;
 	vm_paddr_t buffer_ma;
 	uint64_t fsect, lsect;
@@ -1104,12 +1133,12 @@
 			nsegs--;
 		}
 		block_segs = MIN(nsegs, BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK);
-                if (block_segs == 0)
-                        break;
+		if (block_segs == 0)
+			break;
 
-                sg = BLKRING_GET_SG_REQUEST(&sc->ring, sc->ring.req_prod_pvt);
+		sg = BLKRING_GET_SEG_BLOCK(&sc->ring, sc->ring.req_prod_pvt);
 		sc->ring.req_prod_pvt++;
-                last_block_sg = sg + block_segs;
+		last_block_sg = sg + block_segs;
 	}
 
 	if (cm->operation == BLKIF_OP_READ)
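
The powerof2() check added above handles a backend that advertises a
ring-page limit that is not a power of 2: the frontend rounds the limit
down to the largest usable power of 2 via fls().  A standalone sketch (not
part of the patch) of the same arithmetic, using fls(3) from FreeBSD's
<strings.h>:

    #include <stdio.h>
    #include <strings.h>    /* fls(3) */

    int
    main(void)
    {
        static const int limits[] = { 1, 3, 7, 8, 12 };
        unsigned int i;

        for (i = 0; i < sizeof(limits) / sizeof(limits[0]); i++) {
            /* The highest power of 2 that does not exceed the limit. */
            int pages = 1 << (fls(limits[i]) - 1);

            printf("backend limit %2d -> usable ring-pages %2d\n",
                limits[i], pages);
        }
        return (0);
    }
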
diff -x .svn -ur sys/xen/interface/io/blkif.h /usr/home/justing/perforce/SpectraBSD/head/sys/xen/interface/io/blkif.h
--- sys/xen/interface/io/blkif.h	2010-10-27 22:12:14.560797810 -0600
+++ /usr/home/justing/perforce/SpectraBSD/head/sys/xen/interface/io/blkif.h	2012-01-31 16:11:47.565117660 -0700
@@ -1,8 +1,8 @@
 /******************************************************************************
  * blkif.h
- * 
+ *
  * Unified block-device I/O interface for Xen guest OSes.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
  * deal in the Software without restriction, including without limitation the
@@ -22,6 +22,7 @@
  * DEALINGS IN THE SOFTWARE.
  *
  * Copyright (c) 2003-2004, Keir Fraser
+ * Copyright (c) 2012, Spectra Logic Corporation
  */
 
 #ifndef __XEN_PUBLIC_IO_BLKIF_H__
@@ -35,7 +36,7 @@
  * notification can be made conditional on req_event (i.e., the generic
  * hold-off mechanism provided by the ring macros). Backends must set
  * req_event appropriately (e.g., using RING_FINAL_CHECK_FOR_REQUESTS()).
- * 
+ *
  * Back->front notifications: When enqueuing a new response, sending a
  * notification can be made conditional on rsp_event (i.e., the generic
  * hold-off mechanism provided by the ring macros). Frontends must set
@@ -48,37 +49,401 @@
 #define blkif_sector_t uint64_t
 
 /*
+ * Feature and Parameter Negotiation
+ * =================================
+ * The two halves of a Xen block driver utilize nodes within the XenStore to
+ * communicate capabilities and to negotiate operating parameters.  This
+ * section enumerates these nodes which reside in the respective front and
+ * backend portions of the XenStore, following the XenBus convention.
+ *
+ * All data in the XenStore is stored as strings.  Nodes specifying numeric
+ * values are encoded in decimal.  Integer value ranges listed below are
+ * expressed as fixed-size integer types capable of storing the conversion
+ * of a properly formatted node string, without loss of information.
+ *
+ * Any specified default value is in effect if the corresponding XenBus node
+ * is not present in the XenStore.
+ *
+ * See the XenBus state transition diagram below for details on when XenBus
+ * nodes must be published and when they can be queried.
+ *
+ *****************************************************************************
+ *                            Backend XenBus Nodes
+ *****************************************************************************
+ *
+ *--------------------------------- Features ---------------------------------
+ *
+ * feature-barrier
+ *      Values:         0/1 (boolean)
+ *      Default Value:  0
+ *
+ *      A value of "1" indicates that the backend can process requests
+ *      containing the BLKIF_OP_WRITE_BARRIER request opcode.  Requests
+ *      of this type may still be returned at any time with the
+ *      BLKIF_RSP_EOPNOTSUPP result code.
+ *
+ * feature-flush-cache
+ *      Values:         0/1 (boolean)
+ *      Default Value:  0
+ *
+ *      A value of "1" indicates that the backend can process requests
+ *      containing the BLKIF_OP_FLUSH_DISKCACHE request opcode.  Requests
+ *      of this type may still be returned at any time with the
+ *      BLKIF_RSP_EOPNOTSUPP result code.
+ *
+ * feature-discard
+ *      Values:         0/1 (boolean)
+ *      Default Value:  0
+ *
+ *      A value of "1" indicates that the backend can process requests
+ *      containing the BLKIF_OP_DISCARD request opcode.  Requests
+ *      of this type may still be returned at any time with the
+ *      BLKIF_RSP_EOPNOTSUPP result code.
+ *
+ *----------------------- Request Transport Parameters ------------------------
+ *
+ * max-ring-page-order
+ *      Values:         <uint32_t>
+ *      Default Value:  0
+ *      Notes:          1, 3
+ *
+ *      The maximum supported size of the request ring buffer in units of
+ *      lb(machine pages). (e.g. 0 == 1 page, 1 == 2 pages, 2 == 4 pages,
+ *      etc.).
+ *
+ * max-ring-pages
+ *      Values:         <uint32_t>
+ *      Default Value:  1
+ *      Notes:          2, 3
+ *
+ *      The maximum supported size of the request ring buffer in units of
+ *      machine pages.  The value must be a power of 2.
+ *
+ * max-requests
+ *      Values:         <uint32_t>
+ *      Default Value:  BLKIF_MAX_RING_REQUESTS(PAGE_SIZE)
+ *      Maximum Value:  BLKIF_MAX_RING_REQUESTS(PAGE_SIZE * max-ring-pages)
+ *
+ *      The maximum number of concurrent requests supported by the backend.
+ *
+ * max-request-segments
+ *      Values:         <uint8_t>
+ *      Default Value:  BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK
+ *      Maximum Value:  255
+ *
+ *      The maximum value of blkif_request.nr_segments supported by
+ *      the backend.
+ *
+ * max-request-size
+ *      Values:         <uint32_t>
+ *      Default Value:  BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK * PAGE_SIZE
+ *      Maximum Value:  255 * PAGE_SIZE
+ *
+ *      The maximum amount of data, in bytes, that can be referenced by a
+ *      request type that accesses frontend memory (currently BLKIF_OP_READ,
+ *      BLKIF_OP_WRITE, or BLKIF_OP_WRITE_BARRIER).
+ *
+ *----------------------- Backend Device Identification -----------------------
+ * mode
+ *      Values:         "r" (read only), "w" (writable)
+ *
+ *      The read or write access permissions to the backing store to be
+ *      granted to the frontend.
+ *
+ * params
+ *      Values:         string
+ *
+ *      A free-form string providing sufficient information for the
+ *      backend driver to open the backing device.  (e.g. the path to the
+ *      file or block device representing the backing store.)
+ *
+ * type
+ *      Values:         "file", "phy", "tap"
+ *
+ *      The type of the backing device/object.
+ *
+ *------------------------- Backend Device Properties -------------------------
+ *
+ * discard-alignment
+ *      Values:         <uint32_t>
+ *      Default Value:  0
+ *      Notes:          4, 5
+ *
+ *      The offset, in bytes from the beginning of the virtual block device,
+ *      to the first, addressable, discard extent on the underlying device.
+ *
+ * discard-granularity
+ *      Values:         <uint32_t>
+ *      Default Value:  512
+ *      Notes:          4
+ *
+ *      The size, in bytes, of the individually addressable discard extents
+ *      of the underlying device.
+ *
+ * discard-secure
+ *      Values:         0/1 (boolean)
+ *      Default Value:  0
+ *
+ *      A value of "1" indicates that the backend can process BLKIF_OP_DISCARD
+ *      requests with the BLKIF_DISCARD_SECURE flag set.
+ *
+ * info
+ *      Values:         <uint32_t> (bitmap)
+ *
+ *      A collection of bit flags describing attributes of the backing
+ *      device.  The VDISK_* macros define the meaning of each bit
+ *      location.
+ *
+ * sector-size
+ *      Values:         <uint32_t>
+ *
+ *      The native sector size, in bytes, of the backend device.
+ *
+ * sectors
+ *      Values:         <uint64_t>
+ *
+ *      The size of the backend device, expressed in units of its native
+ *      sector size ("sector-size").
+ *
+ *****************************************************************************
+ *                            Frontend XenBus Nodes
+ *****************************************************************************
+ *
+ *----------------------- Request Transport Parameters -----------------------
+ *
+ * event-channel
+ *      Values:         <uint32_t>
+ *
+ *      The identifier of the Xen event channel used to signal activity
+ *      in the ring buffer.
+ *
+ * ring-ref
+ *      Values:         <uint32_t>
+ *      Notes:          6
+ *
+ *      The Xen grant reference granting permission for the backend to map
+ *      the sole page in a single page sized ring buffer.
+ *
+ * ring-ref%u
+ *      Values:         <uint32_t>
+ *      Notes:          6
+ *
+ *      For a frontend providing a multi-page ring, a "ring-pages" sized
+ *      list of nodes, each containing a Xen grant reference granting
+ *      permission for the backend to map the page of the ring located
+ *      at page index "%u".  Page indexes are zero based.
+ *
+ * protocol
+ *      Values:         string (XEN_IO_PROTO_ABI_*)
+ *      Default Value:  XEN_IO_PROTO_ABI_NATIVE
+ *
+ *      The machine ABI rules governing the format of all ring request and
+ *      response structures.
+ *
+ * ring-page-order
+ *      Values:         <uint32_t>
+ *      Default Value:  0
+ *      Maximum Value:  MAX(ffs(max-ring-pages) - 1, max-ring-page-order)
+ *      Notes:          1, 3
+ *
+ *      The size of the frontend allocated request ring buffer in units
+ *      of lb(machine pages). (e.g. 0 == 1 page, 1 == 2 pages, 2 == 4 pages,
+ *      etc.).
+ *
+ * ring-pages
+ *      Values:         <uint32_t>
+ *      Default Value:  1
+ *      Maximum Value:  MAX(max-ring-pages, (0x1 << max-ring-page-order))
+ *      Notes:          2, 3
+ *
+ *      The size of the frontend allocated request ring buffer in units of
+ *      machine pages.  The value must be a power of 2.
+ *
+ * max-requests
+ *      Values:         <uint32_t>
+ *      Default Value:  BLKIF_MAX_RING_REQUESTS(PAGE_SIZE)
+ *      Maximum Value:  BLKIF_MAX_RING_REQUESTS(PAGE_SIZE * max-ring-pages)
+ *
+ *      The maximum number of concurrent requests that will be issued by
+ *      the frontend.
+ *
+ * max-request-segments
+ *      Values:         <uint8_t>
+ *      Default Value:  BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK
+ *      Maximum Value:  MIN(255, backend/max-request-segments)
+ *
+ *      The maximum value the frontend will set in the
+ *      blkif_request.nr_segments field.
+ *
+ * max-request-size
+ *      Values:         <uint32_t>
+ *      Default Value:  BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK * PAGE_SIZE
+ *      Maximum Value:  max-request-segments * PAGE_SIZE
+ *      Notes:          3
+ *
+ *      The maximum amount of data, in bytes, that can be referenced by
+ *      a request type that accesses frontend memory (currently BLKIF_OP_READ,
+ *      BLKIF_OP_WRITE, or BLKIF_OP_WRITE_BARRIER).
+ *
+ *------------------------- Virtual Device Properties -------------------------
+ *
+ * device-type
+ *      Values:         "disk", "cdrom", "floppy", etc.
+ *
+ * virtual-device
+ *      Values:         <uint16_t> (XEN_*_MAJOR << 8 | Minor)
+ *
+ *      A value indicating the physical device to virtualize within the
+ *      frontend's domain.  (e.g. "The first ATA disk", "The third SCSI
+ *      disk", etc.)
+ *
+ * Notes
+ * -----
+ * (1) Multi-page ring buffer scheme first developed in the Citrix XenServer
+ *     PV drivers.
+ * (2) Multi-page ring buffer scheme first used in some RedHat distributions
+ *     including a distribution deployed on certain nodes of the Amazon
+ *     EC2 cluster.
+ * (3) Support for multi-page ring buffers was implemented independently,
+ *     in slightly different forms, by both Citrix and RedHat/Amazon.
+ *     For full interoperability, block front and backends should support
+ *     both methods of negotiating this capability.
+ * (4) Devices that support discard functionality may internally allocate
+ *     space (discardable extents) in units that are larger than the
+ *     exported logical block size.
+ * (5) The discard-alignment parameter allows a physical device to be
+ *     partitioned into virtual devices that do not necessarily begin or
+ *     end on a discardable extent.
+ * (6) When there is only a single page allocated to the request ring,
+ *     'ring-ref' is used to communicate the grant reference for this
+ *     page to the backend.  When using a multi-page ring, the 'ring-ref'
+ *     node is not created.  Instead 'ring-ref0' - 'ring-refN' are used.
+ */
+
+/*
+ * STATE DIAGRAMS
+ *
+ *****************************************************************************
+ *                                   Startup                                 *
+ *****************************************************************************
+ *
+ * Tool stack creates front and back nodes with state XenbusStateInitialising.
+ *
+ * Front                                Back
+ * =================================    =====================================
+ * XenbusStateInitialising              XenbusStateInitialising
+ *  o Query virtual device              o Query backend device identification
+ *    properties.                          data.
+ *  o Setup OS device instance.          o Open and validate backend device.
+ *                                       o Publish backend features and
+ *                                         transport parameters.
+ *                                                      |
+ *                                                      |
+ *                                                      V
+ *                                      XenbusStateInitWait
+ *
+ * o Query backend features and
+ *   transport parameters.
+ * o Allocate and initialize the
+ *   request ring.
+ * o Publish transport parameters
+ *   that will be in effect during
+ *   this connection.
+ *              |
+ *              |
+ *              V
+ * XenbusStateInitialised
+ *
+ *                                       o Query frontend transport parameters.
+ *                                       o Connect to the request ring and
+ *                                         event channel.
+ *                                       o Publish backend device properties.
+ *                                                      |
+ *                                                      |
+ *                                                      V
+ *                                      XenbusStateConnected
+ *
+ *  o Query backend device properties.
+ *  o Finalize OS virtual device
+ *    instance.
+ *              |
+ *              |
+ *              V
+ * XenbusStateConnected
+ *
+ * Note: Drivers that do not support the negotiation of transport
+ *       parameters can skip certain states in the state machine:
+ *
+ *       o A frontend may transition to XenbusStateInitialised without
+ *         waiting for the backend to enter XenbusStateInitWait.  In this
+ *         case, default transport parameters are in effect and any
+ *         transport parameters published by the frontend must contain
+ *         their default values.
+ *
+ *       o A backend may transition to XenbusStateConnected without waiting
+ *         for the frontend to first enter the XenbusStateInitialised state.
+ *         In this case, default transport parameters are in effect and any
+ *         transport parameters published by the backend must contain their
+ *         default values.
+ *
+ *       Drivers that support transport parameter negotiation must tolerate
+ *       these additional state transition paths in order to interoperate
+ *       with drivers that do not.  In general this means performing the
+ *       work of any skipped state transition, if it has not already been
+ *       performed, in addition to the work associated with the current state.
+ */
+
+/*
  * REQUEST CODES.
  */
 #define BLKIF_OP_READ              0
 #define BLKIF_OP_WRITE             1
 /*
- * Recognised only if "feature-barrier" is present in backend xenbus info.
- * The "feature-barrier" node contains a boolean indicating whether barrier
- * requests are likely to succeed or fail. Either way, a barrier request
- * may fail at any time with BLKIF_RSP_EOPNOTSUPP if it is unsupported by
- * the underlying block-device hardware. The boolean simply indicates whether
- * or not it is worthwhile for the frontend to attempt barrier requests.
- * If a backend does not recognise BLKIF_OP_WRITE_BARRIER, it should *not*
- * create the "feature-barrier" node!
+ * All writes issued prior to a request with the BLKIF_OP_WRITE_BARRIER
+ * operation code ("barrier request") must be completed prior to the
+ * execution of the barrier request.  All writes issued after the barrier
+ * request must not execute until after the completion of the barrier request.
+ *
+ * Optional.  See "feature-barrier" XenBus node documentation above.
  */
 #define BLKIF_OP_WRITE_BARRIER     2
 /*
- * Recognised if "feature-flush-cache" is present in backend xenbus
- * info.  A flush will ask the underlying storage hardware to flush its
- * non-volatile caches as appropriate.  The "feature-flush-cache" node
- * contains a boolean indicating whether flush requests are likely to
- * succeed or fail. Either way, a flush request may fail at any time
- * with BLKIF_RSP_EOPNOTSUPP if it is unsupported by the underlying
- * block-device hardware. The boolean simply indicates whether or not it
- * is worthwhile for the frontend to attempt flushes.  If a backend does
- * not recognise BLKIF_OP_WRITE_FLUSH_CACHE, it should *not* create the
- * "feature-flush-cache" node!
+ * Commit any uncommitted contents of the backing device's volatile cache
+ * to stable storage.
+ *
+ * Optional.  See "feature-flush-cache" XenBus node documentation above.
  */
 #define BLKIF_OP_FLUSH_DISKCACHE   3
+/*
+ * Used in SLES sources for a device-specific command packet
+ * contained within the request.  Reserved for that purpose.
+ */
+#define BLKIF_OP_RESERVED_1        4
+/*
+ * Indicate to the backend device that a region of storage is no longer in
+ * use, and may be discarded at any time without impact to the client.  If
+ * BLKIF_DISCARD_SECURE flag is set on the request, all copies of the
+ * discarded region on the device must be rendered unrecoverable before the
+ * command returns.
+ *
+ * This operation is analogous to performing a trim (ATA) or unmap (SCSI)
+ * command on a native device.
+ *
+ * More information about trim/unmap operations can be found at:
+ * http://t13.org/Documents/UploadedDocuments/docs2008/
+ *     e07154r6-Data_Set_Management_Proposal_for_ATA-ACS2.doc
+ * http://www.seagate.com/staticfiles/support/disc/manuals/
+ *     Interface%20manuals/100293068c.pdf
+ *
+ * Optional.  See "feature-discard", "discard-alignment",
+ * "discard-granularity", and "discard-secure" in the XenBus node
+ * documentation above.
+ */
+#define BLKIF_OP_DISCARD           5
 
 /*
  * Maximum scatter/gather segments associated with a request header block.
+ * This is carefully chosen so that sizeof(blkif_ring_t) <= PAGE_SIZE.
+ * NB. This could be 12 if the ring indexes weren't stored in the same page.
  */
 #define BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK  11
 
@@ -92,6 +457,13 @@
  */
 #define BLKIF_MAX_SEGMENTS_PER_REQUEST 255
 
+/*
+ * NB. first_sect and last_sect in blkif_request_segment, as well as
+ * sector_number in blkif_request, are always expressed in 512-byte units.
+ * However they must be properly aligned to the real sector size of the
+ * physical disk, which is reported in the "sector-size" node in the backend
+ * xenbus info. Also the xenbus "sectors" node is expressed in 512-byte units.
+ */
 struct blkif_request_segment {
     grant_ref_t gref;        /* reference to I/O buffer frame        */
     /* @first_sect: first sector in frame to transfer (inclusive).   */
@@ -100,16 +472,60 @@
 };
 typedef struct blkif_request_segment blkif_request_segment_t;
 
+/*
+ * Starting ring element for any I/O request.
+ *
+ * One or more segment blocks can be inserted into the request ring
+ * just after a blkif_request_t, allowing requests to operate on
+ * up to BLKIF_MAX_SEGMENTS_PER_REQUEST segments.
+ *
+ * BLKIF_SEGS_TO_BLOCKS() can be used on blkif_request.nr_segments
+ * to determine the number of contiguous ring entries associated
+ * with this request.
+ *
+ * Note:  Due to the way Xen request rings operate, the producer and
+ *        consumer indices of the ring must be incremented by the
+ *        BLKIF_SEGS_TO_BLOCKS() value of the associated request.
+ *        (e.g. a response to a 3 ring entry request must also consume
+ *        3 entries in the ring, even though only the first ring entry
+ *        in the response has any data.)
+ */
 struct blkif_request {
     uint8_t        operation;    /* BLKIF_OP_???                         */
     uint8_t        nr_segments;  /* number of segments                   */
     blkif_vdev_t   handle;       /* only for read/write requests         */
     uint64_t       id;           /* private guest value, echoed in resp  */
     blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
-    struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK];
+    blkif_request_segment_t seg[BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK];
 };
 typedef struct blkif_request blkif_request_t;
 
+/*
+ * A segment block is a ring request structure that contains only
+ * segment data.
+ *
+ * sizeof(struct blkif_segment_block) <= sizeof(struct blkif_request)
+ */
+struct blkif_segment_block {
+    blkif_request_segment_t seg[BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK];
+};
+typedef struct blkif_segment_block blkif_segment_block_t;
+
+/*
+ * Cast to this structure when blkif_request.operation == BLKIF_OP_DISCARD
+ * sizeof(struct blkif_request_discard) <= sizeof(struct blkif_request)
+ */
+struct blkif_request_discard {
+    uint8_t        operation;    /* BLKIF_OP_DISCARD                     */
+    uint8_t        flag;         /* BLKIF_DISCARD_SECURE or zero         */
+#define BLKIF_DISCARD_SECURE (1<<0)  /* ignored if discard-secure=0      */
+    blkif_vdev_t   handle;       /* same as for read/write requests      */
+    uint64_t       id;           /* private guest value, echoed in resp  */
+    blkif_sector_t sector_number;/* start sector idx on disk             */
+    uint64_t       nr_sectors;   /* number of contiguous sectors to discard*/
+};
+typedef struct blkif_request_discard blkif_request_discard_t;
+
 struct blkif_response {
     uint64_t        id;              /* copied from request */
     uint8_t         operation;       /* copied from request */
@@ -130,24 +546,48 @@
 /*
  * Generate blkif ring structures and types.
  */
-
 DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response);
 
-#define BLKRING_GET_SG_REQUEST(_r, _idx)				\
-    ((struct blkif_request_segment *)RING_GET_REQUEST(_r, _idx))
+/*
+ * Index to, and treat as a segment block, an entry in the ring.
+ */
+#define BLKRING_GET_SEG_BLOCK(_r, _idx)                                 \
+    (((blkif_segment_block_t *)RING_GET_REQUEST(_r, _idx))->seg)
+
+/*
+ * The number of ring request blocks required to handle an I/O
+ * request containing _segs segments.
+ */
+#define BLKIF_SEGS_TO_BLOCKS(_segs)                                     \
+    ((((_segs - BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK)                    \
+     + (BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK - 1))                      \
+    / BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK) + /*header_block*/1)
 
 #define VDISK_CDROM        0x1
 #define VDISK_REMOVABLE    0x2
 #define VDISK_READONLY     0x4
 
 /*
- * The number of ring request blocks required to handle an I/O
- * request containing _segs segments.
+ * Xen-defined major numbers for virtual disks.
  */
-#define BLKIF_SEGS_TO_BLOCKS(_segs)					\
-	((((_segs - BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK)		\
-	 + (BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK - 1))			\
-        / BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK) + /*header_block*/1)
+#define XEN_IDE0_MAJOR          3
+#define XEN_IDE1_MAJOR          22
+#define XEN_SCSI_DISK0_MAJOR    8
+#define XEN_SCSI_DISK1_MAJOR    65
+#define XEN_SCSI_DISK2_MAJOR    66
+#define XEN_SCSI_DISK3_MAJOR    67
+#define XEN_SCSI_DISK4_MAJOR    68
+#define XEN_SCSI_DISK5_MAJOR    69
+#define XEN_SCSI_DISK6_MAJOR    70
+#define XEN_SCSI_DISK7_MAJOR    71
+#define XEN_SCSI_DISK8_MAJOR    128
+#define XEN_SCSI_DISK9_MAJOR    129
+#define XEN_SCSI_DISK10_MAJOR   130
+#define XEN_SCSI_DISK11_MAJOR   131
+#define XEN_SCSI_DISK12_MAJOR   132
+#define XEN_SCSI_DISK13_MAJOR   133
+#define XEN_SCSI_DISK14_MAJOR   134
+#define XEN_SCSI_DISK15_MAJOR   135
 
 #endif /* __XEN_PUBLIC_IO_BLKIF_H__ */
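
BLKIF_SEGS_TO_BLOCKS() above is a ceiling division: the header block carries
the first BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK segments of a request, and each
additional segment block carries up to BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK
more.  A standalone sketch of the arithmetic (not part of the patch; the
per-segment-block constant below is an assumed illustrative value, since its
definition does not appear in this diff):

    #include <stdio.h>

    #define SEGS_PER_HEADER_BLOCK   11 /* BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK */
    #define SEGS_PER_SEGMENT_BLOCK  11 /* assumed value, illustration only */

    /* The same ceiling arithmetic as BLKIF_SEGS_TO_BLOCKS(). */
    static int
    segs_to_blocks(int segs)
    {
        return ((segs - SEGS_PER_HEADER_BLOCK + SEGS_PER_SEGMENT_BLOCK - 1) /
            SEGS_PER_SEGMENT_BLOCK + 1);
    }

    int
    main(void)
    {
        int segs;

        /* With these constants: 1-11 segs -> 1 entry, 12-22 -> 2, 23-33 -> 3. */
        for (segs = 1; segs <= 33; segs += 10)
            printf("%2d segments -> %d ring entries\n",
                segs, segs_to_blocks(segs));
        return (0);
    }
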
 
diff -x .svn -ur sys/xen/xenbus/xenbusvar.h /usr/home/justing/perforce/SpectraBSD/head/sys/xen/xenbus/xenbusvar.h
--- sys/xen/xenbus/xenbusvar.h	2011-06-10 22:59:01.723658126 -0600
+++ /usr/home/justing/perforce/SpectraBSD/head/sys/xen/xenbus/xenbusvar.h	2012-01-31 16:41:51.486111080 -0700
@@ -104,6 +104,20 @@
 XenbusState xenbus_read_driver_state(const char *path);
 
 /**
+ * Return the state of the "other end" (peer) of a XenBus device.
+ *
+ * \param dev   The XenBus device whose peer to query.
+ *
+ * \return  The current state of the peer device or XenbusStateClosed if no
+ *          state can be read.
+ */
+static inline XenbusState
+xenbus_get_otherend_state(device_t dev)
+{
+	return (xenbus_read_driver_state(xenbus_get_otherend_path(dev)));
+}
+
+/**
 * Initialize and register a watch on the given path (client-supplied storage).
  *
  * \param dev       The XenBus device requesting the watch service.
Only in sys/xen/xenstore: xenstore.c.orig
