Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 15 Feb 2012 06:45:49 +0000 (UTC)
From:      "Justin T. Gibbs" <gibbs@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r231743 - in head/sys: dev/xen/blkback dev/xen/blkfront xen/interface/io xen/xenbus
Message-ID:  <201202150645.q1F6jnLi041573@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: gibbs
Date: Wed Feb 15 06:45:49 2012
New Revision: 231743
URL: http://svn.freebsd.org/changeset/base/231743

Log:
  Enhance documentation, improve interoperability, and fix defects in
  FreeBSD's front and back Xen blkif interface drivers.
  
  sys/dev/xen/blkfront/block.h:
  sys/dev/xen/blkfront/blkfront.c:
  sys/dev/xen/blkback/blkback.c:
  	Replace FreeBSD specific multi-page ring impelementation with
  	support for both the Citrix and Amazon/RedHat versions of this
  	extension.
  
  sys/dev/xen/blkfront/blkfront.c:
  	o Add a per-instance sysctl tree that exposes all negotiated
  	  transport parameters (ring pages, max number of requests,
  	  max request size, max number of segments).
  	o In blkfront_vdevice_to_unit() add a missing return statement
  	  so that we properly identify the unit number for high numbered
  	  xvd devices.
  
  sys/dev/xen/blkback/blkback.c:
  	o Add static dtrace probes for several events in this driver.
  	o Defer connection shutdown processing until the front-end
  	  enters the closed state.  This avoids prematurely tearing
  	  down the connection when buggy front-ends transition to the
  	  closing state, even though the device is open and they
  	  veto the close request from the tool stack.
  	o Add nodes for maximum request size and the number of active
  	  ring pages to the exising, per-instance, sysctl tree.
  	o Miscelaneous style cleanup.
  
  sys/xen/interface/io/blkif.h:
  	o Add extensive documentation of the XenStore nodes used to
  	  implement the blkif interface.
  	o Document the startup sequence between a front and back driver.
  	o Add structures and documenatation for the "discard" feature
  	  (AKA Trim).
  	o Cleanup some definitions related to FreeBSD's request
  	  number/size/segment-limit extension.
  
  sys/dev/xen/blkfront/blkfront.c:
  sys/dev/xen/blkback/blkback.c:
  sys/xen/xenbus/xenbusvar.h:
  	Add the convenience function xenbus_get_otherend_state() and
  	use it to simplify some logic in both block-front and block-back.
  
  MFC after:	1 day

Modified:
  head/sys/dev/xen/blkback/blkback.c
  head/sys/dev/xen/blkfront/blkfront.c
  head/sys/dev/xen/blkfront/block.h
  head/sys/xen/interface/io/blkif.h
  head/sys/xen/xenbus/xenbusvar.h

Modified: head/sys/dev/xen/blkback/blkback.c
==============================================================================
--- head/sys/dev/xen/blkback/blkback.c	Wed Feb 15 06:16:52 2012	(r231742)
+++ head/sys/dev/xen/blkback/blkback.c	Wed Feb 15 06:45:49 2012	(r231743)
@@ -40,6 +40,8 @@ __FBSDID("$FreeBSD$");
  *        a FreeBSD domain to other domains.
  */
 
+#include "opt_kdtrace.h"
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
@@ -63,6 +65,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/mount.h>
 #include <sys/sysctl.h>
 #include <sys/bitstring.h>
+#include <sys/sdt.h>
 
 #include <geom/geom.h>
 
@@ -124,7 +127,7 @@ __FBSDID("$FreeBSD$");
 static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data");
 
 #ifdef XBB_DEBUG
-#define DPRINTF(fmt, args...) \
+#define DPRINTF(fmt, args...)					\
     printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
 #else
 #define DPRINTF(fmt, args...) do {} while(0)
@@ -134,7 +137,7 @@ static MALLOC_DEFINE(M_XENBLOCKBACK, "xb
  * The maximum mapped region size per request we will allow in a negotiated
  * block-front/back communication channel.
  */
-#define	XBB_MAX_REQUEST_SIZE		\
+#define	XBB_MAX_REQUEST_SIZE					\
 	MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE)
 
 /**
@@ -142,9 +145,9 @@ static MALLOC_DEFINE(M_XENBLOCKBACK, "xb
  * segment blocks) per request we will allow in a negotiated block-front/back
  * communication channel.
  */
-#define	XBB_MAX_SEGMENTS_PER_REQUEST			\
-	(MIN(UIO_MAXIOV,				\
-	     MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST,	\
+#define	XBB_MAX_SEGMENTS_PER_REQUEST				\
+	(MIN(UIO_MAXIOV,					\
+	     MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST,		\
 		 (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1)))
 
 /**
@@ -980,9 +983,10 @@ xbb_get_gntaddr(struct xbb_xen_reqlist *
 static uint8_t *
 xbb_get_kva(struct xbb_softc *xbb, int nr_pages)
 {
-	intptr_t first_clear, num_clear;
+	intptr_t first_clear;
+	intptr_t num_clear;
 	uint8_t *free_kva;
-	int i;
+	int      i;
 
 	KASSERT(nr_pages != 0, ("xbb_get_kva of zero length"));
 
@@ -1681,19 +1685,19 @@ xbb_dispatch_io(struct xbb_softc *xbb, s
 			req_ring_idx++;
 			switch (xbb->abi) {
 			case BLKIF_PROTOCOL_NATIVE:
-				sg = BLKRING_GET_SG_REQUEST(&xbb->rings.native,
-							    req_ring_idx);
+				sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.native,
+							   req_ring_idx);
 				break;
 			case BLKIF_PROTOCOL_X86_32:
 			{
-				sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_32,
-							    req_ring_idx);
+				sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_32,
+							   req_ring_idx);
 				break;
 			}
 			case BLKIF_PROTOCOL_X86_64:
 			{
-				sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_64,
-							    req_ring_idx);
+				sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_64,
+							   req_ring_idx);
 				break;
 			}
 			default:
@@ -1817,8 +1821,8 @@ xbb_run_queue(void *context, int pending
 	struct xbb_xen_reqlist *reqlist;
 
 
-	xbb	      = (struct xbb_softc *)context;
-	rings	      = &xbb->rings;
+	xbb   = (struct xbb_softc *)context;
+	rings = &xbb->rings;
 
 	/*
 	 * Work gather and dispatch loop.  Note that we have a bias here
@@ -2032,6 +2036,13 @@ xbb_intr(void *arg)
 	taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 
 }
 
+SDT_PROVIDER_DEFINE(xbb);
+SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, flush, "int");
+SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, read, "int", "uint64_t",
+		  "uint64_t");
+SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, write, "int",
+		  "uint64_t", "uint64_t");
+
 /*----------------------------- Backend Handlers -----------------------------*/
 /**
  * Backend handler for character device access.
@@ -2087,6 +2098,9 @@ xbb_dispatch_dev(struct xbb_softc *xbb, 
 
 		nreq->pendcnt	 = 1;
 
+		SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush,
+			   device_get_unit(xbb->dev));
+
 		(*dev_data->csw->d_strategy)(bio);
 
 		return (0);
@@ -2181,6 +2195,17 @@ xbb_dispatch_dev(struct xbb_softc *xbb, 
 			       bios[bio_idx]->bio_bcount);
 		}
 #endif
+		if (operation == BIO_READ) {
+			SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read,
+				   device_get_unit(xbb->dev),
+				   bios[bio_idx]->bio_offset,
+				   bios[bio_idx]->bio_length);
+		} else if (operation == BIO_WRITE) {
+			SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write,
+				   device_get_unit(xbb->dev),
+				   bios[bio_idx]->bio_offset,
+				   bios[bio_idx]->bio_length);
+		}
 		(*dev_data->csw->d_strategy)(bios[bio_idx]);
 	}
 
@@ -2193,6 +2218,12 @@ fail_free_bios:
 	return (error);
 }
 
+SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, flush, "int");
+SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, read, "int", "uint64_t",
+		  "uint64_t");
+SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, write, "int",
+		  "uint64_t", "uint64_t");
+
 /**
  * Backend handler for file access.
  *
@@ -2237,6 +2268,9 @@ xbb_dispatch_file(struct xbb_softc *xbb,
 	case BIO_FLUSH: {
 		struct mount *mountpoint;
 
+		SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush,
+			   device_get_unit(xbb->dev));
+
 		vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount);
 
 		(void) vn_start_write(xbb->vn, &mountpoint, V_WAIT);
@@ -2336,6 +2370,10 @@ xbb_dispatch_file(struct xbb_softc *xbb,
 	switch (operation) {
 	case BIO_READ:
 
+		SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read,
+			   device_get_unit(xbb->dev), xuio.uio_offset,
+			   xuio.uio_resid);
+
 		vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
 
 		/*
@@ -2366,6 +2404,10 @@ xbb_dispatch_file(struct xbb_softc *xbb,
 	case BIO_WRITE: {
 		struct mount *mountpoint;
 
+		SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write,
+			   device_get_unit(xbb->dev), xuio.uio_offset,
+			   xuio.uio_resid);
+
 		(void)vn_start_write(xbb->vn, &mountpoint, V_WAIT);
 
 		vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
@@ -3028,6 +3070,8 @@ xbb_collect_frontend_info(struct xbb_sof
 	const char *otherend_path;
 	int	    error;
 	u_int	    ring_idx;
+	u_int	    ring_page_order;
+	size_t	    ring_size;
 
 	otherend_path = xenbus_get_otherend_path(xbb->dev);
 
@@ -3035,23 +3079,19 @@ xbb_collect_frontend_info(struct xbb_sof
 	 * Protocol defaults valid even if all negotiation fails.
 	 */
 	xbb->ring_config.ring_pages = 1;
-	xbb->max_requests	    = BLKIF_MAX_RING_REQUESTS(PAGE_SIZE);
 	xbb->max_request_segments   = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK;
 	xbb->max_request_size	    = xbb->max_request_segments * PAGE_SIZE;
 
 	/*
 	 * Mandatory data (used in all versions of the protocol) first.
 	 */
-	error = xs_gather(XST_NIL, otherend_path,
-			  "ring-ref", "%" PRIu32,
-			  &xbb->ring_config.ring_ref[0],
-			  "event-channel", "%" PRIu32,
-			  &xbb->ring_config.evtchn,
-			  NULL);
+	error = xs_scanf(XST_NIL, otherend_path,
+			 "event-channel", NULL, "%" PRIu32,
+			 &xbb->ring_config.evtchn);
 	if (error != 0) {
 		xenbus_dev_fatal(xbb->dev, error,
-				 "Unable to retrieve ring information from "
-				 "frontend %s.  Unable to connect.",
+				 "Unable to retrieve event-channel information "
+				 "from frontend %s.  Unable to connect.",
 				 xenbus_get_otherend_path(xbb->dev));
 		return (error);
 	}
@@ -3065,10 +3105,20 @@ xbb_collect_frontend_info(struct xbb_sof
 	 *       we must use independant calls in order to guarantee
 	 *       we don't miss information in a sparsly populated front-end
 	 *       tree.
+	 *
+	 * \note xs_scanf() does not update variables for unmatched
+	 *       fields.
 	 */
+	ring_page_order = 0;
 	(void)xs_scanf(XST_NIL, otherend_path,
-		       "ring-pages", NULL, "%u",
+		       "ring-page-order", NULL, "%u",
+		       &ring_page_order);
+	xbb->ring_config.ring_pages = 1 << ring_page_order;
+	(void)xs_scanf(XST_NIL, otherend_path,
+		       "num-ring-pages", NULL, "%u",
 		       &xbb->ring_config.ring_pages);
+	ring_size = PAGE_SIZE * xbb->ring_config.ring_pages;
+	xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size);
 
 	(void)xs_scanf(XST_NIL, otherend_path,
 		       "max-requests", NULL, "%u",
@@ -3116,22 +3166,39 @@ xbb_collect_frontend_info(struct xbb_sof
 		return (EINVAL);
 	}
 
-	/* If using a multi-page ring, pull in the remaining references. */
-	for (ring_idx = 1; ring_idx < xbb->ring_config.ring_pages; ring_idx++) {
-		char ring_ref_name[]= "ring_refXX";
-
-		snprintf(ring_ref_name, sizeof(ring_ref_name),
-			 "ring-ref%u", ring_idx);
-		error = xs_scanf(XST_NIL, otherend_path,
-				 ring_ref_name, NULL, "%" PRIu32,
-			         &xbb->ring_config.ring_ref[ring_idx]);
+	if (xbb->ring_config.ring_pages	== 1) {
+		error = xs_gather(XST_NIL, otherend_path,
+				  "ring-ref", "%" PRIu32,
+				  &xbb->ring_config.ring_ref[0],
+				  NULL);
 		if (error != 0) {
 			xenbus_dev_fatal(xbb->dev, error,
-					 "Failed to retriev grant reference "
-					 "for page %u of shared ring.  Unable "
-					 "to connect.", ring_idx);
+					 "Unable to retrieve ring information "
+					 "from frontend %s.  Unable to "
+					 "connect.",
+					 xenbus_get_otherend_path(xbb->dev));
 			return (error);
 		}
+	} else {
+		/* Multi-page ring format. */
+		for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages;
+		     ring_idx++) {
+			char ring_ref_name[]= "ring_refXX";
+
+			snprintf(ring_ref_name, sizeof(ring_ref_name),
+				 "ring-ref%u", ring_idx);
+			error = xs_scanf(XST_NIL, otherend_path,
+					 ring_ref_name, NULL, "%" PRIu32,
+					 &xbb->ring_config.ring_ref[ring_idx]);
+			if (error != 0) {
+				xenbus_dev_fatal(xbb->dev, error,
+						 "Failed to retriev grant "
+						 "reference for page %u of "
+						 "shared ring.  Unable "
+						 "to connect.", ring_idx);
+				return (error);
+			}
+		}
 	}
 
 	error = xs_gather(XST_NIL, otherend_path,
@@ -3197,8 +3264,8 @@ xbb_alloc_requests(struct xbb_softc *xbb
 static int
 xbb_alloc_request_lists(struct xbb_softc *xbb)
 {
-	int i;
 	struct xbb_xen_reqlist *reqlist;
+	int			i;
 
 	/*
 	 * If no requests can be merged, we need 1 request list per
@@ -3318,7 +3385,7 @@ xbb_publish_backend_info(struct xbb_soft
 static void
 xbb_connect(struct xbb_softc *xbb)
 {
-	int		      error;
+	int error;
 
 	if (xenbus_get_state(xbb->dev) == XenbusStateConnected)
 		return;
@@ -3399,7 +3466,8 @@ xbb_connect(struct xbb_softc *xbb)
 static int
 xbb_shutdown(struct xbb_softc *xbb)
 {
-	int error;
+	XenbusState frontState;
+	int	    error;
 
 	DPRINTF("\n");
 
@@ -3413,6 +3481,20 @@ xbb_shutdown(struct xbb_softc *xbb)
 	if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0)
 		return (EAGAIN);
 
+	xbb->flags |= XBBF_IN_SHUTDOWN;
+	mtx_unlock(&xbb->lock);
+
+	if (xenbus_get_state(xbb->dev) < XenbusStateClosing)
+		xenbus_set_state(xbb->dev, XenbusStateClosing);
+
+	frontState = xenbus_get_otherend_state(xbb->dev);
+	mtx_lock(&xbb->lock);
+	xbb->flags &= ~XBBF_IN_SHUTDOWN;
+
+	/* The front can submit I/O until entering the closed state. */
+	if (frontState < XenbusStateClosed)
+		return (EAGAIN);
+
 	DPRINTF("\n");
 
 	/* Indicate shutdown is in progress. */
@@ -3434,19 +3516,6 @@ xbb_shutdown(struct xbb_softc *xbb)
 
 	DPRINTF("\n");
 
-	/*
-	 * Before unlocking mutex, set this flag to prevent other threads from
-	 * getting into this function
-	 */
-	xbb->flags |= XBBF_IN_SHUTDOWN;
-	mtx_unlock(&xbb->lock);
-
-	if (xenbus_get_state(xbb->dev) < XenbusStateClosing)
-		xenbus_set_state(xbb->dev, XenbusStateClosing);
-
-	mtx_lock(&xbb->lock);
-	xbb->flags &= ~XBBF_IN_SHUTDOWN;
-
 	/* Indicate to xbb_detach() that is it safe to proceed. */
 	wakeup(xbb);
 
@@ -3573,6 +3642,16 @@ xbb_setup_sysctl(struct xbb_softc *xbb)
 		        "max_request_segments", CTLFLAG_RD,
 		        &xbb->max_request_segments, 0,
 		        "maximum number of pages per requests (negotiated)");
+
+	SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
+		        "max_request_size", CTLFLAG_RD,
+		        &xbb->max_request_size, 0,
+		        "maximum size in bytes of a request (negotiated)");
+
+	SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
+		        "ring_pages", CTLFLAG_RD,
+		        &xbb->ring_config.ring_pages, 0,
+		        "communication channel pages (negotiated)");
 }
 
 /**
@@ -3587,6 +3666,7 @@ xbb_attach(device_t dev)
 {
 	struct xbb_softc	*xbb;
 	int			 error;
+	u_int			 max_ring_page_order;
 
 	DPRINTF("Attaching to %s\n", xenbus_get_node(dev));
 
@@ -3621,6 +3701,10 @@ xbb_attach(device_t dev)
 		return (error);
 	}
 
+	/*
+	 * Amazon EC2 client compatility.  They refer to max-ring-pages
+	 * instead of to max-ring-page-order.
+	 */
 	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
 			  "max-ring-pages", "%zu", XBB_MAX_RING_PAGES);
 	if (error) {
@@ -3629,6 +3713,15 @@ xbb_attach(device_t dev)
 		return (error);
 	}
 
+	max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1;
+	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+			  "max-ring-page-order", "%u", max_ring_page_order);
+	if (error) {
+		xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order",
+				  xenbus_get_node(xbb->dev));
+		return (error);
+	}
+
 	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
 			  "max-requests", "%u", XBB_MAX_REQUESTS);
 	if (error) {
@@ -3862,12 +3955,16 @@ xbb_frontend_changed(device_t dev, Xenbu
 		xbb_connect(xbb);
 		break;
 	case XenbusStateClosing:
+		/*
+		 * Frontend has acknowledged Closing request.
+		 * Wait for Closed state.
+		 */
+		break;
 	case XenbusStateClosed:
 		mtx_lock(&xbb->lock);
 		xbb_shutdown(xbb);
 		mtx_unlock(&xbb->lock);
-		if (frontend_state == XenbusStateClosed)
-			xenbus_set_state(xbb->dev, XenbusStateClosed);
+		xenbus_set_state(xbb->dev, XenbusStateClosed);
 		break;
 	default:
 		xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend",

Modified: head/sys/dev/xen/blkfront/blkfront.c
==============================================================================
--- head/sys/dev/xen/blkfront/blkfront.c	Wed Feb 15 06:16:52 2012	(r231742)
+++ head/sys/dev/xen/blkfront/blkfront.c	Wed Feb 15 06:45:49 2012	(r231743)
@@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/bus.h>
 #include <sys/conf.h>
 #include <sys/module.h>
+#include <sys/sysctl.h>
 
 #include <machine/bus.h>
 #include <sys/rman.h>
@@ -139,7 +140,7 @@ static	int	xb_dump(void *, void *, vm_of
  * with blkfront as the emulated drives, easing transition slightly.
  */
 static void
-blkfront_vdevice_to_unit(int vdevice, int *unit, const char **name)
+blkfront_vdevice_to_unit(uint32_t vdevice, int *unit, const char **name)
 {
 	static struct vdev_info {
 		int major;
@@ -186,6 +187,7 @@ blkfront_vdevice_to_unit(int vdevice, in
 	if (vdevice & (1 << 28)) {
 		*unit = (vdevice & ((1 << 28) - 1)) >> 8;
 		*name = "xbd";
+		return;
 	}
 
 	for (i = 0; info[i].major; i++) {
@@ -226,7 +228,7 @@ xlvbd_add(struct xb_softc *sc, blkif_sec
 	sc->xb_disk->d_sectorsize = sector_size;
 
 	sc->xb_disk->d_mediasize = sectors * sector_size;
-	sc->xb_disk->d_maxsize = sc->max_request_size;
+	sc->xb_disk->d_maxsize = sc->max_request_size - PAGE_SIZE;
 	sc->xb_disk->d_flags = 0;
 	disk_create(sc->xb_disk, DISK_VERSION_00);
 
@@ -407,6 +409,40 @@ blkfront_probe(device_t dev)
 	return (ENXIO);
 }
 
+static void
+xb_setup_sysctl(struct xb_softc *xb)
+{
+	struct sysctl_ctx_list *sysctl_ctx = NULL;
+	struct sysctl_oid      *sysctl_tree = NULL;
+	
+	sysctl_ctx = device_get_sysctl_ctx(xb->xb_dev);
+	if (sysctl_ctx == NULL)
+		return;
+
+	sysctl_tree = device_get_sysctl_tree(xb->xb_dev);
+	if (sysctl_tree == NULL)
+		return;
+
+	SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
+		        "max_requests", CTLFLAG_RD, &xb->max_requests, -1,
+		        "maximum outstanding requests (negotiated)");
+
+	SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
+		        "max_request_segments", CTLFLAG_RD,
+		        &xb->max_request_segments, 0,
+		        "maximum number of pages per requests (negotiated)");
+
+	SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
+		        "max_request_size", CTLFLAG_RD,
+		        &xb->max_request_size, 0,
+		        "maximum size in bytes of a request (negotiated)");
+
+	SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
+		        "ring_pages", CTLFLAG_RD,
+		        &xb->ring_pages, 0,
+		        "communication channel pages (negotiated)");
+}
+
 /*
  * Setup supplies the backend dir, virtual device.  We place an event
  * channel and shared frame entries.  We watch backend to wait if it's
@@ -417,14 +453,14 @@ blkfront_attach(device_t dev)
 {
 	struct xb_softc *sc;
 	const char *name;
+	uint32_t vdevice;
 	int error;
-	int vdevice;
 	int i;
 	int unit;
 
 	/* FIXME: Use dynamic device id if this is not set. */
 	error = xs_scanf(XST_NIL, xenbus_get_node(dev),
-	    "virtual-device", NULL, "%i", &vdevice);
+	    "virtual-device", NULL, "%" PRIu32, &vdevice);
 	if (error) {
 		xenbus_dev_fatal(dev, error, "reading virtual-device");
 		device_printf(dev, "Couldn't determine virtual device.\n");
@@ -449,6 +485,8 @@ blkfront_attach(device_t dev)
 	sc->vdevice = vdevice;
 	sc->connected = BLKIF_STATE_DISCONNECTED;
 
+	xb_setup_sysctl(sc);
+
 	/* Wait for backend device to publish its protocol capabilities. */
 	xenbus_set_state(dev, XenbusStateInitialising);
 
@@ -501,6 +539,7 @@ blkfront_initialize(struct xb_softc *sc)
 {
 	const char *otherend_path;
 	const char *node_path;
+	uint32_t max_ring_page_order;
 	int error;
 	int i;
 
@@ -513,8 +552,8 @@ blkfront_initialize(struct xb_softc *sc)
 	 * Protocol defaults valid even if negotiation for a
 	 * setting fails.
 	 */
+	max_ring_page_order = 0;
 	sc->ring_pages = 1;
-	sc->max_requests = BLKIF_MAX_RING_REQUESTS(PAGE_SIZE);
 	sc->max_request_segments = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK;
 	sc->max_request_size = (sc->max_request_segments - 1) * PAGE_SIZE;
 	sc->max_request_blocks = BLKIF_SEGS_TO_BLOCKS(sc->max_request_segments);
@@ -526,13 +565,25 @@ blkfront_initialize(struct xb_softc *sc)
 	 *       we must use independant calls in order to guarantee
 	 *       we don't miss information in a sparsly populated back-end
 	 *       tree.
+	 *
+	 * \note xs_scanf() does not update variables for unmatched
+	 *	 fields.
 	 */
 	otherend_path = xenbus_get_otherend_path(sc->xb_dev);
 	node_path = xenbus_get_node(sc->xb_dev);
+
+	/* Support both backend schemes for relaying ring page limits. */
+	(void)xs_scanf(XST_NIL, otherend_path,
+		       "max-ring-page-order", NULL, "%" PRIu32,
+		       &max_ring_page_order);
+	sc->ring_pages = 1 << max_ring_page_order;
 	(void)xs_scanf(XST_NIL, otherend_path,
 		       "max-ring-pages", NULL, "%" PRIu32,
 		       &sc->ring_pages);
+	if (sc->ring_pages < 1)
+		sc->ring_pages = 1;
 
+	sc->max_requests = BLKIF_MAX_RING_REQUESTS(sc->ring_pages * PAGE_SIZE);
 	(void)xs_scanf(XST_NIL, otherend_path,
 		       "max-requests", NULL, "%" PRIu32,
 		       &sc->max_requests);
@@ -552,6 +603,16 @@ blkfront_initialize(struct xb_softc *sc)
 		sc->ring_pages = XBF_MAX_RING_PAGES;
 	}
 
+	if (powerof2(sc->ring_pages) == 0) {
+		uint32_t new_page_limit;
+
+		new_page_limit = 0x01 << (fls(sc->ring_pages) - 1);
+		device_printf(sc->xb_dev, "Back-end specified ring-pages of "
+			      "%u is not a power of 2. Limited to %u.\n",
+			      sc->ring_pages, new_page_limit);
+		sc->ring_pages = new_page_limit;
+	}
+
 	if (sc->max_requests > XBF_MAX_REQUESTS) {
 		device_printf(sc->xb_dev, "Back-end specified max_requests of "
 			      "%u limited to front-end limit of %u.\n",
@@ -625,11 +686,20 @@ blkfront_initialize(struct xb_softc *sc)
 	if (setup_blkring(sc) != 0)
 		return;
 
+	/* Support both backend schemes for relaying ring page limits. */
 	error = xs_printf(XST_NIL, node_path,
-			 "ring-pages","%u", sc->ring_pages);
+			 "num-ring-pages","%u", sc->ring_pages);
 	if (error) {
 		xenbus_dev_fatal(sc->xb_dev, error,
-				 "writing %s/ring-pages",
+				 "writing %s/num-ring-pages",
+				 node_path);
+		return;
+	}
+	error = xs_printf(XST_NIL, node_path,
+			 "ring-page-order","%u", fls(sc->ring_pages) - 1);
+	if (error) {
+		xenbus_dev_fatal(sc->xb_dev, error,
+				 "writing %s/ring-page-order",
 				 node_path);
 		return;
 	}
@@ -711,25 +781,31 @@ setup_blkring(struct xb_softc *sc)
 			return (error);
 		}
 	}
-	error = xs_printf(XST_NIL, xenbus_get_node(sc->xb_dev),
-			  "ring-ref","%u", sc->ring_ref[0]);
-	if (error) {
-		xenbus_dev_fatal(sc->xb_dev, error, "writing %s/ring-ref",
-				 xenbus_get_node(sc->xb_dev));
-		return (error);
-	}
-	for (i = 1; i < sc->ring_pages; i++) {
-		char ring_ref_name[]= "ring_refXX";
-
-		snprintf(ring_ref_name, sizeof(ring_ref_name), "ring-ref%u", i);
+	if (sc->ring_pages == 1) {
 		error = xs_printf(XST_NIL, xenbus_get_node(sc->xb_dev),
-				 ring_ref_name, "%u", sc->ring_ref[i]);
+				  "ring-ref", "%u", sc->ring_ref[0]);
 		if (error) {
-			xenbus_dev_fatal(sc->xb_dev, error, "writing %s/%s",
-					 xenbus_get_node(sc->xb_dev),
-					 ring_ref_name);
+			xenbus_dev_fatal(sc->xb_dev, error,
+					 "writing %s/ring-ref",
+					 xenbus_get_node(sc->xb_dev));
 			return (error);
 		}
+	} else {
+		for (i = 0; i < sc->ring_pages; i++) {
+			char ring_ref_name[]= "ring_refXX";
+
+			snprintf(ring_ref_name, sizeof(ring_ref_name),
+				 "ring-ref%u", i);
+			error = xs_printf(XST_NIL, xenbus_get_node(sc->xb_dev),
+					 ring_ref_name, "%u", sc->ring_ref[i]);
+			if (error) {
+				xenbus_dev_fatal(sc->xb_dev, error,
+						 "writing %s/%s",
+						 xenbus_get_node(sc->xb_dev),
+						 ring_ref_name);
+				return (error);
+			}
+		}
 	}
 
 	error = bind_listening_port_to_irqhandler(
@@ -795,7 +871,7 @@ blkfront_connect(struct xb_softc *sc)
 	unsigned int binfo;
 	int err, feature_barrier;
 
-        if( (sc->connected == BLKIF_STATE_CONNECTED) || 
+	if( (sc->connected == BLKIF_STATE_CONNECTED) || 
 	    (sc->connected == BLKIF_STATE_SUSPENDED) )
 		return;
 
@@ -923,15 +999,13 @@ blkif_close(struct disk *dp)
 		return (ENXIO);
 	sc->xb_flags &= ~XB_OPEN;
 	if (--(sc->users) == 0) {
-		/* Check whether we have been instructed to close.  We will
-		   have ignored this request initially, as the device was
-		   still mounted. */
-		device_t dev = sc->xb_dev;
-		XenbusState state =
-			xenbus_read_driver_state(xenbus_get_otherend_path(dev));
-
-		if (state == XenbusStateClosing)
-			blkfront_closing(dev);
+		/*
+		 * Check whether we have been instructed to close.  We will
+		 * have ignored this request initially, as the device was
+		 * still mounted.
+		 */
+		if (xenbus_get_otherend_state(sc->xb_dev) == XenbusStateClosing)
+			blkfront_closing(sc->xb_dev);
 	}
 	return (0);
 }
@@ -1033,7 +1107,7 @@ blkif_queue_cb(void *arg, bus_dma_segmen
 	struct xb_command *cm;
 	blkif_request_t	*ring_req;
 	struct blkif_request_segment *sg;
-        struct blkif_request_segment *last_block_sg;
+	struct blkif_request_segment *last_block_sg;
 	grant_ref_t *sg_ref;
 	vm_paddr_t buffer_ma;
 	uint64_t fsect, lsect;
@@ -1104,12 +1178,12 @@ blkif_queue_cb(void *arg, bus_dma_segmen
 			nsegs--;
 		}
 		block_segs = MIN(nsegs, BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK);
-                if (block_segs == 0)
-                        break;
+		if (block_segs == 0)
+			break;
 
-                sg = BLKRING_GET_SG_REQUEST(&sc->ring, sc->ring.req_prod_pvt);
+		sg = BLKRING_GET_SEG_BLOCK(&sc->ring, sc->ring.req_prod_pvt);
 		sc->ring.req_prod_pvt++;
-                last_block_sg = sg + block_segs;
+		last_block_sg = sg + block_segs;
 	}
 
 	if (cm->operation == BLKIF_OP_READ)

Modified: head/sys/dev/xen/blkfront/block.h
==============================================================================
--- head/sys/dev/xen/blkfront/block.h	Wed Feb 15 06:16:52 2012	(r231742)
+++ head/sys/dev/xen/blkfront/block.h	Wed Feb 15 06:45:49 2012	(r231743)
@@ -49,7 +49,7 @@
  *       guarantee we can handle an unaligned transfer without the need to
  *       use a bounce buffer..
  */
-#define	XBF_MAX_REQUEST_SIZE		\
+#define	XBF_MAX_REQUEST_SIZE						\
 	MIN(MAXPHYS, (BLKIF_MAX_SEGMENTS_PER_REQUEST - 1) * PAGE_SIZE)
 
 /**
@@ -57,8 +57,8 @@
  * segment blocks) per request we will allow in a negotiated block-front/back
  * communication channel.
  */
-#define	XBF_MAX_SEGMENTS_PER_REQUEST		\
-	(MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST,	\
+#define	XBF_MAX_SEGMENTS_PER_REQUEST					\
+	(MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST,				\
 	     (XBF_MAX_REQUEST_SIZE / PAGE_SIZE) + 1))
 
 /**

Modified: head/sys/xen/interface/io/blkif.h
==============================================================================
--- head/sys/xen/interface/io/blkif.h	Wed Feb 15 06:16:52 2012	(r231742)
+++ head/sys/xen/interface/io/blkif.h	Wed Feb 15 06:45:49 2012	(r231743)
@@ -1,8 +1,8 @@
 /******************************************************************************
  * blkif.h
- * 
+ *
  * Unified block-device I/O interface for Xen guest OSes.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
  * deal in the Software without restriction, including without limitation the
@@ -22,6 +22,7 @@
  * DEALINGS IN THE SOFTWARE.
  *
  * Copyright (c) 2003-2004, Keir Fraser
+ * Copyright (c) 2012, Spectra Logic Corporation
  */
 
 #ifndef __XEN_PUBLIC_IO_BLKIF_H__
@@ -35,7 +36,7 @@
  * notification can be made conditional on req_event (i.e., the generic
  * hold-off mechanism provided by the ring macros). Backends must set
  * req_event appropriately (e.g., using RING_FINAL_CHECK_FOR_REQUESTS()).
- * 
+ *
  * Back->front notifications: When enqueuing a new response, sending a
  * notification can be made conditional on rsp_event (i.e., the generic
  * hold-off mechanism provided by the ring macros). Frontends must set
@@ -48,37 +49,413 @@
 #define blkif_sector_t uint64_t
 
 /*
+ * Feature and Parameter Negotiation
+ * =================================
+ * The two halves of a Xen block driver utilize nodes within the XenStore to
+ * communicate capabilities and to negotiate operating parameters.  This
+ * section enumerates these nodes which reside in the respective front and
+ * backend portions of the XenStore, following the XenBus convention.
+ *
+ * All data in the XenStore is stored as strings.  Nodes specifying numeric
+ * values are encoded in decimal.  Integer value ranges listed below are
+ * expressed as fixed sized integer types capable of storing the conversion
+ * of a properly formated node string, without loss of information.
+ *
+ * Any specified default value is in effect if the corresponding XenBus node
+ * is not present in the XenStore.
+ *
+ * XenStore nodes in sections marked "PRIVATE" are solely for use by the
+ * driver side whose XenBus tree contains them.
+ *
+ * See the XenBus state transition diagram below for details on when XenBus
+ * nodes must be published and when they can be queried.
+ *
+ *****************************************************************************
+ *                            Backend XenBus Nodes
+ *****************************************************************************
+ *
+ *------------------ Backend Device Identification (PRIVATE) ------------------
+ *
+ * mode
+ *      Values:         "r" (read only), "w" (writable)
+ *
+ *      The read or write access permissions to the backing store to be
+ *      granted to the frontend.
+ *
+ * params
+ *      Values:         string
+ *
+ *      A free formatted string providing sufficient information for the
+ *      backend driver to open the backing device.  (e.g. the path to the
+ *      file or block device representing the backing store.)
+ *
+ * type
+ *      Values:         "file", "phy", "tap"
+ *
+ *      The type of the backing device/object.
+ *
+ *--------------------------------- Features ---------------------------------
+ *
+ * feature-barrier
+ *      Values:         0/1 (boolean)
+ *      Default Value:  0
+ *
+ *      A value of "1" indicates that the backend can process requests
+ *      containing the BLKIF_OP_WRITE_BARRIER request opcode.  Requests
+ *      of this type may still be returned at any time with the
+ *      BLKIF_RSP_EOPNOTSUPP result code.
+ *
+ * feature-flush-cache
+ *      Values:         0/1 (boolean)
+ *      Default Value:  0
+ *
+ *      A value of "1" indicates that the backend can process requests
+ *      containing the BLKIF_OP_FLUSH_DISKCACHE request opcode.  Requests
+ *      of this type may still be returned at any time with the
+ *      BLKIF_RSP_EOPNOTSUPP result code.
+ *
+ * feature-discard
+ *      Values:         0/1 (boolean)
+ *      Default Value:  0
+ *
+ *      A value of "1" indicates that the backend can process requests
+ *      containing the BLKIF_OP_DISCARD request opcode.  Requests
+ *      of this type may still be returned at any time with the
+ *      BLKIF_RSP_EOPNOTSUPP result code.
+ *
+ *----------------------- Request Transport Parameters ------------------------
+ *
+ * max-ring-page-order
+ *      Values:         <uint32_t>
+ *      Default Value:  0
+ *      Notes:          1, 3
+ *
+ *      The maximum supported size of the request ring buffer in units of
+ *      lb(machine pages). (e.g. 0 == 1 page,  1 = 2 pages, 2 == 4 pages,
+ *      etc.).
+ *
+ * max-ring-pages
+ *      Values:         <uint32_t>
+ *      Default Value:  1
+ *      Notes:          2, 3
+ *
+ *      The maximum supported size of the request ring buffer in units of
+ *      machine pages.  The value must be a power of 2.
+ *
+ * max-requests         <uint32_t>
+ *      Default Value:  BLKIF_MAX_RING_REQUESTS(PAGE_SIZE)
+ *      Maximum Value:  BLKIF_MAX_RING_REQUESTS(PAGE_SIZE * max-ring-pages)
+ *
+ *      The maximum number of concurrent, logical requests that will be
+ *      issued by the backend.
+ *
+ *      Note: A logical request may span multiple ring entries.
+ *
+ * max-request-segments
+ *      Values:         <uint8_t>
+ *      Default Value:  BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK
+ *      Maximum Value:  BLKIF_MAX_SEGMENTS_PER_REQUEST
+ *
+ *      The maximum value of blkif_request.nr_segments supported by
+ *      the backend.
+ *
+ * max-request-size
+ *      Values:         <uint32_t>
+ *      Default Value:  BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK * PAGE_SIZE
+ *      Maximum Value:  BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE
+ *
+ *      The maximum amount of data, in bytes, that can be referenced by a
+ *      request type that accesses frontend memory (currently BLKIF_OP_READ,
+ *      BLKIF_OP_WRITE, or BLKIF_OP_WRITE_BARRIER).
+ *
+ *------------------------- Backend Device Properties -------------------------
+ *
+ * discard-aligment
+ *      Values:         <uint32_t>
+ *      Default Value:  0
+ *      Notes:          4, 5
+ *
+ *      The offset, in bytes from the beginning of the virtual block device,
+ *      to the first, addressable, discard extent on the underlying device.
+ *
+ * discard-granularity
+ *      Values:         <uint32_t>
+ *      Default Value:  <"sector-size">
+ *      Notes:          4
+ *
+ *      The size, in bytes, of the individually addressable discard extents
+ *      of the underlying device.
+ *
+ * discard-secure
+ *      Values:         0/1 (boolean)
+ *      Default Value:  0
+ *
+ *      A value of "1" indicates that the backend can process BLKIF_OP_DISCARD
+ *      requests with the BLKIF_DISCARD_SECURE flag set.
+ *
+ * info
+ *      Values:         <uint32_t> (bitmap)
+ *
+ *      A collection of bit flags describing attributes of the backing
+ *      device.  The VDISK_* macros define the meaning of each bit
+ *      location.
+ *
+ * sector-size
+ *      Values:         <uint32_t>
+ *
+ *      The native sector size, in bytes, of the backend device.
+ *
+ * sectors
+ *      Values:         <uint64_t>
+ *
+ *      The size of the backend device, expressed in units of its native
+ *      sector size ("sector-size").
+ *
+ *****************************************************************************
+ *                            Frontend XenBus Nodes
+ *****************************************************************************
+ *
+ *----------------------- Request Transport Parameters -----------------------
+ *
+ * event-channel
+ *      Values:         <uint32_t>
+ *
+ *      The identifier of the Xen event channel used to signal activity
+ *      in the ring buffer.
+ *
+ * ring-ref
+ *      Values:         <uint32_t>
+ *      Notes:          6
+ *
+ *      The Xen grant reference granting permission for the backend to map
+ *      the sole page in a single page sized ring buffer.
+ *
+ * ring-ref%u
+ *      Values:         <uint32_t>
+ *      Notes:          6
+ *
+ *      For a frontend providing a multi-page ring, a "num-ring-pages" sized
+ *      list of nodes, each containing a Xen grant reference granting
+ *      permission for the backend to map the page of the ring located
+ *      at page index "%u".  Page indexes are zero based.
+ *
+ * protocol
+ *      Values:         string (XEN_IO_PROTO_ABI_*)
+ *      Default Value:  XEN_IO_PROTO_ABI_NATIVE
+ *
+ *      The machine ABI rules governing the format of all ring request and
+ *      response structures.
+ *
+ * ring-page-order
+ *      Values:         <uint32_t>
+ *      Default Value:  0
+ *      Maximum Value:  MAX(ffs(max-ring-pages) - 1, max-ring-page-order)
+ *      Notes:          1, 3
+ *
+ *      The size of the frontend allocated request ring buffer in units
+ *      of lb(machine pages). (e.g. 0 == 1 page, 1 = 2 pages, 2 == 4 pages,
+ *      etc.).
+ *
+ * num-ring-pages
+ *      Values:         <uint32_t>
+ *      Default Value:  1
+ *      Maximum Value:  MAX(max-ring-pages,(0x1 << max-ring-page-order))
+ *      Notes:          2, 3

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201202150645.q1F6jnLi041573>