Date: Wed, 1 Feb 2012 14:06:33 -0700
From: "Justin T. Gibbs" <gibbs@scsiguy.com>
To: freebsd-xen@freebsd.org
Subject: [CFT][PATCH] - Rationalize FreeBSD multi-page ring extensions with those from other vendors
Message-ID: <8FFE3850-7668-48C2-90C1-525213193A33@scsiguy.com>
[-- Attachment #1 --]
I've spent some time documenting all of the Xen blkif extensions that are out in the wild and have modified the extensions in FreeBSD so that it is possible to build a fully interoperable driver. Although I have performed some of my own testing against different Xen Dom0s, I'm looking for additional testers before I push this into -current and merge it down into the -stable branches. I'm most interested in Amazon EC2 testing, since I know that not all of their nodes run exactly the same software.
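For reviewers who would rather not reverse engineer the negotiation rules from the diff: the sketch below (an illustration only, not part of the patch; negotiated_ring_pages() is a made-up name) shows how the patched blkfront combines the two multi-page ring schemes. The XenStore node names are the ones documented in the blkif.h changes; an explicit max-ring-pages value, when present, overrides the count derived from max-ring-page-order, and a non-power-of-2 count is rounded down, just as the driver does with fls().

    #include <stdio.h>

    /*
     * Illustration of blkfront's ring size negotiation.  An absent
     * XenStore node is represented by its documented default
     * (max-ring-page-order = 0) or by 0 (max-ring-pages).
     */
    static unsigned int
    negotiated_ring_pages(unsigned int max_ring_page_order,
        unsigned int max_ring_pages, unsigned int frontend_limit)
    {
            unsigned int pages;

            /* Citrix-style scheme: the limit is expressed as a page order. */
            pages = 1u << max_ring_page_order;

            /* RedHat/Amazon-style scheme: an explicit page count wins. */
            if (max_ring_pages != 0)
                    pages = max_ring_pages;

            /* Clamp to the front-end's own limit. */
            if (pages > frontend_limit)
                    pages = frontend_limit;

            /* Round a non-power-of-2 count down to a power of 2. */
            while ((pages & (pages - 1)) != 0)
                    pages &= pages - 1;

            return (pages != 0 ? pages : 1);
    }

    int
    main(void)
    {
            printf("%u\n", negotiated_ring_pages(2, 0, 8)); /* order only: 4 */
            printf("%u\n", negotiated_ring_pages(0, 3, 8)); /* count of 3: 2 */
            return (0);
    }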
Feedback always welcome.
Thanks,
Justin
[-- Attachment #2 --]
diff -x .svn -ur sys/dev/xen/blkback/blkback.c /usr/home/justing/perforce/SpectraBSD/head/sys/dev/xen/blkback/blkback.c
--- sys/dev/xen/blkback/blkback.c 2012-01-31 17:31:44.383114171 -0700
+++ /usr/home/justing/perforce/SpectraBSD/head/sys/dev/xen/blkback/blkback.c 2012-02-01 12:55:39.958114726 -0700
@@ -40,6 +40,8 @@
* a FreeBSD domain to other domains.
*/
+#include "opt_kdtrace.h"
+
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
@@ -63,6 +65,7 @@
#include <sys/mount.h>
#include <sys/sysctl.h>
#include <sys/bitstring.h>
+#include <sys/sdt.h>
#include <geom/geom.h>
@@ -980,9 +983,10 @@
static uint8_t *
xbb_get_kva(struct xbb_softc *xbb, int nr_pages)
{
- intptr_t first_clear, num_clear;
+ intptr_t first_clear;
+ intptr_t num_clear;
uint8_t *free_kva;
- int i;
+ int i;
KASSERT(nr_pages != 0, ("xbb_get_kva of zero length"));
@@ -1681,19 +1685,19 @@
req_ring_idx++;
switch (xbb->abi) {
case BLKIF_PROTOCOL_NATIVE:
- sg = BLKRING_GET_SG_REQUEST(&xbb->rings.native,
- req_ring_idx);
+ sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.native,
+ req_ring_idx);
break;
case BLKIF_PROTOCOL_X86_32:
{
- sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_32,
- req_ring_idx);
+ sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_32,
+ req_ring_idx);
break;
}
case BLKIF_PROTOCOL_X86_64:
{
- sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_64,
- req_ring_idx);
+ sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_64,
+ req_ring_idx);
break;
}
default:
@@ -1817,8 +1821,8 @@
struct xbb_xen_reqlist *reqlist;
- xbb = (struct xbb_softc *)context;
- rings = &xbb->rings;
+ xbb = (struct xbb_softc *)context;
+ rings = &xbb->rings;
/*
* Work gather and dispatch loop. Note that we have a bias here
@@ -2032,6 +2036,13 @@
taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
}
+SDT_PROVIDER_DEFINE(xbb);
+SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, flush, "int");
+SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, read, "int", "uint64_t",
+ "uint64_t");
+SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, write, "int",
+ "uint64_t", "uint64_t");
+
/*----------------------------- Backend Handlers -----------------------------*/
/**
* Backend handler for character device access.
@@ -2087,6 +2098,9 @@
nreq->pendcnt = 1;
+ SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush,
+ device_get_unit(xbb->dev));
+
(*dev_data->csw->d_strategy)(bio);
return (0);
@@ -2181,6 +2195,17 @@
bios[bio_idx]->bio_bcount);
}
#endif
+ if (operation == BIO_READ) {
+ SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read,
+ device_get_unit(xbb->dev),
+ bios[bio_idx]->bio_offset,
+ bios[bio_idx]->bio_length);
+ } else if (operation == BIO_WRITE) {
+ SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write,
+ device_get_unit(xbb->dev),
+ bios[bio_idx]->bio_offset,
+ bios[bio_idx]->bio_length);
+ }
(*dev_data->csw->d_strategy)(bios[bio_idx]);
}
@@ -2193,6 +2218,12 @@
return (error);
}
+SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, flush, "int");
+SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, read, "int", "uint64_t",
+ "uint64_t");
+SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, write, "int",
+ "uint64_t", "uint64_t");
+
/**
* Backend handler for file access.
*
@@ -2237,6 +2268,9 @@
case BIO_FLUSH: {
struct mount *mountpoint;
+ SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush,
+ device_get_unit(xbb->dev));
+
vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount);
(void) vn_start_write(xbb->vn, &mountpoint, V_WAIT);
@@ -2336,6 +2370,10 @@
switch (operation) {
case BIO_READ:
+ SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read,
+ device_get_unit(xbb->dev), xuio.uio_offset,
+ xuio.uio_resid);
+
vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
/*
@@ -2366,6 +2404,10 @@
case BIO_WRITE: {
struct mount *mountpoint;
+ SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write,
+ device_get_unit(xbb->dev), xuio.uio_offset,
+ xuio.uio_resid);
+
(void)vn_start_write(xbb->vn, &mountpoint, V_WAIT);
vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
@@ -3028,6 +3070,8 @@
const char *otherend_path;
int error;
u_int ring_idx;
+ u_int ring_page_order;
+ size_t ring_size;
otherend_path = xenbus_get_otherend_path(xbb->dev);
@@ -3042,16 +3086,13 @@
/*
* Mandatory data (used in all versions of the protocol) first.
*/
- error = xs_gather(XST_NIL, otherend_path,
- "ring-ref", "%" PRIu32,
- &xbb->ring_config.ring_ref[0],
- "event-channel", "%" PRIu32,
- &xbb->ring_config.evtchn,
- NULL);
+ error = xs_scanf(XST_NIL, otherend_path,
+ "event-channel", NULL, "%" PRIu32,
+ &xbb->ring_config.evtchn);
if (error != 0) {
xenbus_dev_fatal(xbb->dev, error,
- "Unable to retrieve ring information from "
- "frontend %s. Unable to connect.",
+ "Unable to retrieve event-channel information "
+ "from frontend %s. Unable to connect.",
xenbus_get_otherend_path(xbb->dev));
return (error);
}
@@ -3065,10 +3106,19 @@
* we must use independant calls in order to guarantee
* we don't miss information in a sparsly populated front-end
* tree.
+ * \note xs_scanf() does not update variables for unmatched
+ * fields.
*/
+ ring_page_order = 0;
+ (void)xs_scanf(XST_NIL, otherend_path,
+ "ring-page-order", NULL, "%u",
+ &ring_page_order);
+ xbb->ring_config.ring_pages = 1 << ring_page_order;
(void)xs_scanf(XST_NIL, otherend_path,
"ring-pages", NULL, "%u",
&xbb->ring_config.ring_pages);
+ ring_size = PAGE_SIZE * xbb->ring_config.ring_pages;
+ xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size);
(void)xs_scanf(XST_NIL, otherend_path,
"max-requests", NULL, "%u",
@@ -3116,22 +3166,39 @@
return (EINVAL);
}
- /* If using a multi-page ring, pull in the remaining references. */
- for (ring_idx = 1; ring_idx < xbb->ring_config.ring_pages; ring_idx++) {
- char ring_ref_name[]= "ring_refXX";
-
- snprintf(ring_ref_name, sizeof(ring_ref_name),
- "ring-ref%u", ring_idx);
- error = xs_scanf(XST_NIL, otherend_path,
- ring_ref_name, NULL, "%" PRIu32,
- &xbb->ring_config.ring_ref[ring_idx]);
+ if (xbb->ring_config.ring_pages == 1) {
+ error = xs_gather(XST_NIL, otherend_path,
+ "ring-ref", "%" PRIu32,
+ &xbb->ring_config.ring_ref[0],
+ NULL);
if (error != 0) {
xenbus_dev_fatal(xbb->dev, error,
- "Failed to retriev grant reference "
- "for page %u of shared ring. Unable "
- "to connect.", ring_idx);
+ "Unable to retrieve ring information "
+ "from frontend %s. Unable to "
+ "connect.",
+ xenbus_get_otherend_path(xbb->dev));
return (error);
}
+ } else {
+ /* Multi-page ring format. */
+ for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages;
+ ring_idx++) {
+ char ring_ref_name[]= "ring_refXX";
+
+ snprintf(ring_ref_name, sizeof(ring_ref_name),
+ "ring-ref%u", ring_idx);
+ error = xs_scanf(XST_NIL, otherend_path,
+ ring_ref_name, NULL, "%" PRIu32,
+ &xbb->ring_config.ring_ref[ring_idx]);
+ if (error != 0) {
+ xenbus_dev_fatal(xbb->dev, error,
+ "Failed to retriev grant "
+ "reference for page %u of "
+ "shared ring. Unable "
+ "to connect.", ring_idx);
+ return (error);
+ }
+ }
}
error = xs_gather(XST_NIL, otherend_path,
@@ -3197,8 +3264,8 @@
static int
xbb_alloc_request_lists(struct xbb_softc *xbb)
{
- int i;
struct xbb_xen_reqlist *reqlist;
+ int i;
/*
* If no requests can be merged, we need 1 request list per
@@ -3318,7 +3385,7 @@
static void
xbb_connect(struct xbb_softc *xbb)
{
- int error;
+ int error;
if (xenbus_get_state(xbb->dev) == XenbusStateConnected)
return;
@@ -3399,7 +3466,8 @@
static int
xbb_shutdown(struct xbb_softc *xbb)
{
- int error;
+ XenbusState frontState;
+ int error;
DPRINTF("\n");
@@ -3413,6 +3481,20 @@
if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0)
return (EAGAIN);
+ xbb->flags |= XBBF_IN_SHUTDOWN;
+ mtx_unlock(&xbb->lock);
+
+ if (xenbus_get_state(xbb->dev) < XenbusStateClosing)
+ xenbus_set_state(xbb->dev, XenbusStateClosing);
+
+ frontState = xenbus_get_otherend_state(xbb->dev);
+ mtx_lock(&xbb->lock);
+ xbb->flags &= ~XBBF_IN_SHUTDOWN;
+
+ /* The front can submit I/O until entering the closed state. */
+ if (frontState < XenbusStateClosed)
+ return (EAGAIN);
+
DPRINTF("\n");
/* Indicate shutdown is in progress. */
@@ -3434,19 +3516,6 @@
DPRINTF("\n");
- /*
- * Before unlocking mutex, set this flag to prevent other threads from
- * getting into this function
- */
- xbb->flags |= XBBF_IN_SHUTDOWN;
- mtx_unlock(&xbb->lock);
-
- if (xenbus_get_state(xbb->dev) < XenbusStateClosing)
- xenbus_set_state(xbb->dev, XenbusStateClosing);
-
- mtx_lock(&xbb->lock);
- xbb->flags &= ~XBBF_IN_SHUTDOWN;
-
/* Indicate to xbb_detach() that is it safe to proceed. */
wakeup(xbb);
@@ -3573,6 +3642,11 @@
"max_request_segments", CTLFLAG_RD,
&xbb->max_request_segments, 0,
"maximum number of pages per requests (negotiated)");
+
+ SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
+ "ring_pages", CTLFLAG_RD,
+ &xbb->ring_config.ring_pages, 0,
+ "communication channel pages (negotiated)");
}
/**
@@ -3587,6 +3661,7 @@
{
struct xbb_softc *xbb;
int error;
+ u_int max_ring_page_order;
DPRINTF("Attaching to %s\n", xenbus_get_node(dev));
@@ -3621,6 +3696,10 @@
return (error);
}
+ /*
+ * Amazon EC2 client compatibility. They refer to max-ring-pages
+ * instead of max-ring-page-order.
+ */
error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
"max-ring-pages", "%zu", XBB_MAX_RING_PAGES);
if (error) {
@@ -3629,6 +3708,15 @@
return (error);
}
+ max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1;
+ error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+ "max-ring-page-order", "%u", max_ring_page_order);
+ if (error) {
+ xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order",
+ xenbus_get_node(xbb->dev));
+ return (error);
+ }
+
error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
"max-requests", "%u", XBB_MAX_REQUESTS);
if (error) {
@@ -3862,12 +3950,16 @@
xbb_connect(xbb);
break;
case XenbusStateClosing:
+ /*
+ * Frontend has acknowledged Closing request.
+ * Wait for Closed state.
+ */
+ break;
case XenbusStateClosed:
mtx_lock(&xbb->lock);
xbb_shutdown(xbb);
mtx_unlock(&xbb->lock);
- if (frontend_state == XenbusStateClosed)
- xenbus_set_state(xbb->dev, XenbusStateClosed);
+ xenbus_set_state(xbb->dev, XenbusStateClosed);
break;
default:
xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend",
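An aside on the SDT probes added to blkback above: they should make per-unit I/O tracing possible without a rebuild. Based solely on the probe definitions (I have not exercised this exact invocation), something along the lines of

    dtrace -n 'xbb:kernel::read { printf("unit %d off %d len %d", arg0, arg1, arg2); }'

should fire for every read blkback dispatches, with the probe's function field distinguishing the device (xbb_dispatch_dev) and file (xbb_dispatch_file) backed paths.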
diff -x .svn -ur sys/dev/xen/blkfront/blkfront.c /usr/home/justing/perforce/SpectraBSD/head/sys/dev/xen/blkfront/blkfront.c
--- sys/dev/xen/blkfront/blkfront.c 2011-12-09 10:56:37.165520312 -0700
+++ /usr/home/justing/perforce/SpectraBSD/head/sys/dev/xen/blkfront/blkfront.c 2012-02-01 12:59:37.312115640 -0700
@@ -226,7 +226,7 @@
sc->xb_disk->d_sectorsize = sector_size;
sc->xb_disk->d_mediasize = sectors * sector_size;
- sc->xb_disk->d_maxsize = sc->max_request_size;
+ sc->xb_disk->d_maxsize = sc->max_request_size - PAGE_SIZE;
sc->xb_disk->d_flags = 0;
disk_create(sc->xb_disk, DISK_VERSION_00);
@@ -501,6 +501,7 @@
{
const char *otherend_path;
const char *node_path;
+ uint32_t max_ring_page_order;
int error;
int i;
@@ -513,6 +514,7 @@
* Protocol defaults valid even if negotiation for a
* setting fails.
*/
+ max_ring_page_order = 0;
sc->ring_pages = 1;
sc->max_requests = BLKIF_MAX_RING_REQUESTS(PAGE_SIZE);
sc->max_request_segments = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK;
@@ -526,12 +528,22 @@
* we must use independant calls in order to guarantee
* we don't miss information in a sparsly populated back-end
* tree.
+ * \note xs_scanf() does not update variables for unmatched
+ * fields.
*/
otherend_path = xenbus_get_otherend_path(sc->xb_dev);
node_path = xenbus_get_node(sc->xb_dev);
+
+ /* Support both backend schemes for relaying ring page limits. */
+ (void)xs_scanf(XST_NIL, otherend_path,
+ "max-ring-page-order", NULL, "%" PRIu32,
+ &max_ring_page_order);
+ sc->ring_pages = 1 << max_ring_page_order;
(void)xs_scanf(XST_NIL, otherend_path,
"max-ring-pages", NULL, "%" PRIu32,
&sc->ring_pages);
+ if (sc->ring_pages < 1)
+ sc->ring_pages = 1;
(void)xs_scanf(XST_NIL, otherend_path,
"max-requests", NULL, "%" PRIu32,
@@ -552,6 +564,16 @@
sc->ring_pages = XBF_MAX_RING_PAGES;
}
+ if (powerof2(sc->ring_pages) == 0) {
+ u_int new_page_limit;
+
+ new_page_limit = 0x01 << (fls(sc->ring_pages) - 1);
+ device_printf(sc->xb_dev, "Back-end specified ring-pages of "
+ "%u is not a power of 2. Limited to %u.\n",
+ sc->ring_pages, new_page_limit);
+ sc->ring_pages = new_page_limit;
+ }
+
if (sc->max_requests > XBF_MAX_REQUESTS) {
device_printf(sc->xb_dev, "Back-end specified max_requests of "
"%u limited to front-end limit of %u.\n",
@@ -625,6 +647,7 @@
if (setup_blkring(sc) != 0)
return;
+ /* Support both backend schemes for relaying ring page limits. */
error = xs_printf(XST_NIL, node_path,
"ring-pages","%u", sc->ring_pages);
if (error) {
@@ -633,6 +656,14 @@
node_path);
return;
}
+ error = xs_printf(XST_NIL, node_path,
+ "ring-page-order","%u", fls(sc->ring_pages) - 1);
+ if (error) {
+ xenbus_dev_fatal(sc->xb_dev, error,
+ "writing %s/ring-page-order",
+ node_path);
+ return;
+ }
error = xs_printf(XST_NIL, node_path,
"max-requests","%u", sc->max_requests);
@@ -795,7 +826,7 @@
unsigned int binfo;
int err, feature_barrier;
- if( (sc->connected == BLKIF_STATE_CONNECTED) ||
+ if( (sc->connected == BLKIF_STATE_CONNECTED) ||
(sc->connected == BLKIF_STATE_SUSPENDED) )
return;
@@ -923,15 +954,13 @@
return (ENXIO);
sc->xb_flags &= ~XB_OPEN;
if (--(sc->users) == 0) {
- /* Check whether we have been instructed to close. We will
- have ignored this request initially, as the device was
- still mounted. */
- device_t dev = sc->xb_dev;
- XenbusState state =
- xenbus_read_driver_state(xenbus_get_otherend_path(dev));
-
- if (state == XenbusStateClosing)
- blkfront_closing(dev);
+ /*
+ * Check whether we have been instructed to close. We will
+ * have ignored this request initially, as the device was
+ * still mounted.
+ */
+ if (xenbus_get_otherend_state(sc->xb_dev) == XenbusStateClosing)
+ blkfront_closing(sc->xb_dev);
}
return (0);
}
@@ -1033,7 +1062,7 @@
struct xb_command *cm;
blkif_request_t *ring_req;
struct blkif_request_segment *sg;
- struct blkif_request_segment *last_block_sg;
+ struct blkif_request_segment *last_block_sg;
grant_ref_t *sg_ref;
vm_paddr_t buffer_ma;
uint64_t fsect, lsect;
@@ -1104,12 +1133,12 @@
nsegs--;
}
block_segs = MIN(nsegs, BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK);
- if (block_segs == 0)
- break;
+ if (block_segs == 0)
+ break;
- sg = BLKRING_GET_SG_REQUEST(&sc->ring, sc->ring.req_prod_pvt);
+ sg = BLKRING_GET_SEG_BLOCK(&sc->ring, sc->ring.req_prod_pvt);
sc->ring.req_prod_pvt++;
- last_block_sg = sg + block_segs;
+ last_block_sg = sg + block_segs;
}
if (cm->operation == BLKIF_OP_READ)
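The blkfront d_maxsize change above deserves a note: a buffer that does not begin on a page boundary touches one more page than its length alone implies, so a full max_request_size transfer could need one segment more than was negotiated with the backend; holding a page in reserve keeps the worst case within bounds. A standalone illustration of the arithmetic (PAGE_SIZE hard-wired to 4 KB here; pages_spanned() is a made-up helper):

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SIZE       4096
    #define PAGE_MASK       (PAGE_SIZE - 1)

    /* Pages (and thus ring segments) touched by a transfer of len bytes. */
    static size_t
    pages_spanned(uintptr_t vaddr, size_t len)
    {
            return (((vaddr & PAGE_MASK) + len + PAGE_MASK) / PAGE_SIZE);
    }

    int
    main(void)
    {
            size_t max_request_size = 11 * PAGE_SIZE;       /* 11 segments */

            /* Page-aligned buffer: exactly 11 pages. */
            printf("%zu\n", pages_spanned(0, max_request_size));
            /* Sector-aligned buffer: 12 pages, one more than negotiated. */
            printf("%zu\n", pages_spanned(512, max_request_size));
            /* With d_maxsize reduced by a page, the worst case is 11 again. */
            printf("%zu\n", pages_spanned(512, max_request_size - PAGE_SIZE));
            return (0);
    }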
diff -x .svn -ur sys/xen/interface/io/blkif.h /usr/home/justing/perforce/SpectraBSD/head/sys/xen/interface/io/blkif.h
--- sys/xen/interface/io/blkif.h 2010-10-27 22:12:14.560797810 -0600
+++ /usr/home/justing/perforce/SpectraBSD/head/sys/xen/interface/io/blkif.h 2012-01-31 16:11:47.565117660 -0700
@@ -1,8 +1,8 @@
/******************************************************************************
* blkif.h
- *
+ *
* Unified block-device I/O interface for Xen guest OSes.
- *
+ *
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
@@ -22,6 +22,7 @@
* DEALINGS IN THE SOFTWARE.
*
* Copyright (c) 2003-2004, Keir Fraser
+ * Copyright (c) 2012, Spectra Logic Corporation
*/
#ifndef __XEN_PUBLIC_IO_BLKIF_H__
@@ -35,7 +36,7 @@
* notification can be made conditional on req_event (i.e., the generic
* hold-off mechanism provided by the ring macros). Backends must set
* req_event appropriately (e.g., using RING_FINAL_CHECK_FOR_REQUESTS()).
- *
+ *
* Back->front notifications: When enqueuing a new response, sending a
* notification can be made conditional on rsp_event (i.e., the generic
* hold-off mechanism provided by the ring macros). Frontends must set
@@ -48,37 +49,401 @@
#define blkif_sector_t uint64_t
/*
+ * Feature and Parameter Negotiation
+ * =================================
+ * The two halves of a Xen block driver utilize nodes within the XenStore to
+ * communicate capabilities and to negotiate operating parameters. This
+ * section enumerates these nodes which reside in the respective front and
+ * backend portions of the XenStore, following the XenBus convention.
+ *
+ * All data in the XenStore is stored as strings. Nodes specifying numeric
+ * values are encoded in decimal. Integer value ranges listed below are
+ * expressed as fixed-size integer types capable of storing the conversion
+ * of a properly formatted node string, without loss of information.
+ *
+ * Any specified default value is in effect if the corresponding XenBus node
+ * is not present in the XenStore.
+ *
+ * See the XenBus state transition diagram below for details on when XenBus
+ * nodes must be published and when they can be queried.
+ *
+ *****************************************************************************
+ * Backend XenBus Nodes
+ *****************************************************************************
+ *
+ *--------------------------------- Features ---------------------------------
+ *
+ * feature-barrier
+ * Values: 0/1 (boolean)
+ * Default Value: 0
+ *
+ * A value of "1" indicates that the backend can process requests
+ * containing the BLKIF_OP_WRITE_BARRIER request opcode. Requests
+ * of this type may still be returned at any time with the
+ * BLKIF_RSP_EOPNOTSUPP result code.
+ *
+ * feature-flush-cache
+ * Values: 0/1 (boolean)
+ * Default Value: 0
+ *
+ * A value of "1" indicates that the backend can process requests
+ * containing the BLKIF_OP_FLUSH_DISKCACHE request opcode. Requests
+ * of this type may still be returned at any time with the
+ * BLKIF_RSP_EOPNOTSUPP result code.
+ *
+ * feature-discard
+ * Values: 0/1 (boolean)
+ * Default Value: 0
+ *
+ * A value of "1" indicates that the backend can process requests
+ * containing the BLKIF_OP_DISCARD request opcode. Requests
+ * of this type may still be returned at any time with the
+ * BLKIF_RSP_EOPNOTSUPP result code.
+ *
+ *----------------------- Request Transport Parameters ------------------------
+ *
+ * max-ring-page-order
+ * Values: <uint32_t>
+ * Default Value: 0
+ * Notes: 1, 3
+ *
+ * The maximum supported size of the request ring buffer in units of
+ * lb(machine pages). (e.g. 0 == 1 page, 1 == 2 pages, 2 == 4 pages,
+ * etc.).
+ *
+ * max-ring-pages
+ * Values: <uint32_t>
+ * Default Value: 1
+ * Notes: 2, 3
+ *
+ * The maximum supported size of the request ring buffer in units of
+ * machine pages. The value must be a power of 2.
+ *
+ * max-requests
+ * Values: <uint32_t>
+ * Default Value: BLKIF_MAX_RING_REQUESTS(PAGE_SIZE)
+ * Maximum Value: BLKIF_MAX_RING_REQUESTS(PAGE_SIZE * max-ring-pages)
+ *
+ * The maximum number of concurrent requests supported by the backend.
+ *
+ * max-request-segments
+ * Values: <uint8_t>
+ * Default Value: BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK
+ * Maximum Value: 255
+ *
+ * The maximum value of blkif_request.nr_segments supported by
+ * the backend.
+ *
+ * max-request-size
+ * Values: <uint32_t>
+ * Default Value: BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK * PAGE_SIZE
+ * Maximum Value: 255 * PAGE_SIZE
+ *
+ * The maximum amount of data, in bytes, that can be referenced by a
+ * request type that accesses frontend memory (currently BLKIF_OP_READ,
+ * BLKIF_OP_WRITE, or BLKIF_OP_WRITE_BARRIER).
+ *
+ *----------------------- Backend Device Identification -----------------------
+ * mode
+ * Values: "r" (read only), "w" (writable)
+ *
+ * The read or write access permissions to the backing store to be
+ * granted to the frontend.
+ *
+ * params
+ * Values: string
+ *
+ * A free formatted string providing sufficient information for the
+ * backend driver to open the backing device. (e.g. the path to the
+ * file or block device representing the backing store.)
+ *
+ * type
+ * Values: "file", "phy", "tap"
+ *
+ * The type of the backing device/object.
+ *
+ *------------------------- Backend Device Properties -------------------------
+ *
+ * discard-alignment
+ * Values: <uint32_t>
+ * Default Value: 0
+ * Notes: 4, 5
+ *
+ * The offset, in bytes from the beginning of the virtual block device,
+ * to the first addressable discard extent on the underlying device.
+ *
+ * discard-granularity
+ * Values: <uint32_t>
+ * Default Value: 512
+ * Notes: 4
+ *
+ * The size, in bytes, of the individually addressable discard extents
+ * of the underlying device.
+ *
+ * discard-secure
+ * Values: 0/1 (boolean)
+ * Default Value: 0
+ *
+ * A value of "1" indicates that the backend can process BLKIF_OP_DISCARD
+ * requests with the BLKIF_DISCARD_SECURE flag set.
+ *
+ * info
+ * Values: <uint32_t> (bitmap)
+ *
+ * A collection of bit flags describing attributes of the backing
+ * device. The VDISK_* macros define the meaning of each bit
+ * location.
+ *
+ * sector-size
+ * Values: <uint32_t>
+ *
+ * The native sector size, in bytes, of the backend device.
+ *
+ * sectors
+ * Values: <uint64_t>
+ *
+ * The size of the backend device, expressed in units of its native
+ * sector size ("sector-size").
+ *
+ *****************************************************************************
+ * Frontend XenBus Nodes
+ *****************************************************************************
+ *
+ *----------------------- Request Transport Parameters -----------------------
+ *
+ * event-channel
+ * Values: <uint32_t>
+ *
+ * The identifier of the Xen event channel used to signal activity
+ * in the ring buffer.
+ *
+ * ring-ref
+ * Values: <uint32_t>
+ * Notes: 6
+ *
+ * The Xen grant reference granting permission for the backend to map
+ * the sole page in a single page sized ring buffer.
+ *
+ * ring-ref%u
+ * Values: <uint32_t>
+ * Notes: 6
+ *
+ * For a frontend providing a multi-page ring, a "ring-pages" sized
+ * list of nodes, each containing a Xen grant reference granting
+ * permission for the backend to map the page of the ring located
+ * at page index "%u". Page indexes are zero based.
+ *
+ * protocol
+ * Values: string (XEN_IO_PROTO_ABI_*)
+ * Default Value: XEN_IO_PROTO_ABI_NATIVE
+ *
+ * The machine ABI rules governing the format of all ring request and
+ * response structures.
+ *
+ * ring-page-order
+ * Values: <uint32_t>
+ * Default Value: 0
+ * Maximum Value: MAX(ffs(max-ring-pages) - 1, max-ring-page-order)
+ * Notes: 1, 3
+ *
+ * The size of the frontend allocated request ring buffer in units
+ * of lb(machine pages). (e.g. 0 == 1 page, 1 == 2 pages, 2 == 4 pages,
+ * etc.).
+ *
+ * ring-pages
+ * Values: <uint32_t>
+ * Default Value: 1
+ * Maximum Value: MAX(max-ring-pages,(0x1 << max-ring-page-order))
+ * Notes: 2, 3
+ *
+ * The size of the frontend allocated request ring buffer in units of
+ * machine pages. The value must be a power of 2.
+ *
+ * max-requests
+ * Values: <uint32_t>
+ * Default Value: BLKIF_MAX_RING_REQUESTS(PAGE_SIZE)
+ * Maximum Value: BLKIF_MAX_RING_REQUESTS(PAGE_SIZE * max-ring-pages)
+ *
+ * The maximum number of concurrent requests that will be issued by
+ * the frontend.
+ *
+ * max-request-segments
+ * Values: <uint8_t>
+ * Default Value: BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK
+ * Maximum Value: MIN(255, backend/max-request-segments)
+ *
+ * The maximum value the frontend will set in the
+ * blkif_request.nr_segments field.
+ *
+ * max-request-size
+ * Values: <uint32_t>
+ * Default Value: BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK * PAGE_SIZE
+ * Maximum Value: max-request-segments * PAGE_SIZE
+ * Notes: 3
+ *
+ * The maximum amount of data, in bytes, that can be referenced by
+ * a request type that accesses frontend memory (currently BLKIF_OP_READ,
+ * BLKIF_OP_WRITE, or BLKIF_OP_WRITE_BARRIER).
+ *
+ *------------------------- Virtual Device Properties -------------------------
+ *
+ * device-type
+ * Values: "disk", "cdrom", "floppy", etc.
+ *
+ * virtual-device
+ * Values: <uint16_t> (XEN_*_MAJOR << 8 | Minor)
+ *
+ * A value indicating the physical device to virtualize within the
+ * frontend's domain. (e.g. "The first ATA disk", "The third SCSI
+ * disk", etc.)
+ *
+ * Notes
+ * -----
+ * (1) Multi-page ring buffer scheme first developed in the Citrix XenServer
+ * PV drivers.
+ * (2) Multi-page ring buffer scheme first used in some RedHat distributions
+ * including a distribution deployed on certain nodes of the Amazon
+ * EC2 cluster.
+ * (3) Support for multi-page ring buffers was implemented independently,
+ * in slightly different forms, by both Citrix and RedHat/Amazon.
+ * For full interoperability, block front and backends should support
+ * both methods of negotiating this capability.
+ * (4) Devices that support discard functionality may internally allocate
+ * space (discardable extents) in units that are larger than the
+ * exported logical block size.
+ * (5) The discard-alignment parameter allows a physical device to be
+ * partitioned into virtual devices that do not necessarily begin or
+ * end on a discardable extent.
+ * (6) When there is only a single page allocated to the request ring,
+ * 'ring-ref' is used to communicate the grant reference for this
+ * page to the backend. When using a multi-page ring, the 'ring-ref'
+ * node is not created. Instead 'ring-ref0' - 'ring-refN' are used.
+ */
+
+/*
+ * STATE DIAGRAMS
+ *
+ *****************************************************************************
+ * Startup *
+ *****************************************************************************
+ *
+ * Tool stack creates front and back nodes with state XenbusStateInitialising.
+ *
+ * Front                                Back
+ * =================================    =====================================
+ * XenbusStateInitialising              XenbusStateInitialising
+ *  o Query virtual device              o Query backend device identification
+ *    properties.                         data.
+ *  o Setup OS device instance.         o Open and validate backend device.
+ *                                      o Publish backend features and
+ *                                        transport parameters.
+ *                                                     |
+ *                                                     |
+ *                                                     V
+ *                                      XenbusStateInitWait
+ *
+ * o Query backend features and
+ *   transport parameters.
+ * o Allocate and initialize the
+ *   request ring.
+ * o Publish transport parameters
+ *   that will be in effect during
+ *   this connection.
+ *              |
+ *              |
+ *              V
+ * XenbusStateInitialised
+ *
+ *                                      o Query frontend transport parameters.
+ *                                      o Connect to the request ring and
+ *                                        event channel.
+ *                                      o Publish backend device properties.
+ *                                                     |
+ *                                                     |
+ *                                                     V
+ *                                      XenbusStateConnected
+ *
+ * o Query backend device properties.
+ * o Finalize OS virtual device
+ *   instance.
+ *              |
+ *              |
+ *              V
+ * XenbusStateConnected
+ *
+ * Note: Drivers that do not support the negotiation of transport
+ * parameters can skip certain states in the state machine:
+ *
+ * o A frontend may transition to XenbusStateInitialised without
+ * waiting for the backend to enter XenbusStateInitWait. In this
+ * case, default transport parameters are in effect and any
+ * transport parameters published by the frontend must contain
+ * their default values.
+ *
+ * o A backend may transition to XenbusStateInitialised without waiting
+ * for the frontend to first enter the XenbusStateInitialised state.
+ * In this case, default transport parameters are in effect and any
+ * transport parameters published by the backend must contain their
+ * default values.
+ *
+ * Drivers that support transport parameter negotiation must tolerate
+ * these additional state transition paths in order to interoperate
+ * with drivers that do not. In general this means performing the
+ * work of any skipped state transition, if it has not already been
+ * performed, in addition to the work associated with the current state.
+ */
+
+/*
* REQUEST CODES.
*/
#define BLKIF_OP_READ 0
#define BLKIF_OP_WRITE 1
/*
- * Recognised only if "feature-barrier" is present in backend xenbus info.
- * The "feature-barrier" node contains a boolean indicating whether barrier
- * requests are likely to succeed or fail. Either way, a barrier request
- * may fail at any time with BLKIF_RSP_EOPNOTSUPP if it is unsupported by
- * the underlying block-device hardware. The boolean simply indicates whether
- * or not it is worthwhile for the frontend to attempt barrier requests.
- * If a backend does not recognise BLKIF_OP_WRITE_BARRIER, it should *not*
- * create the "feature-barrier" node!
+ * All writes issued prior to a request with the BLKIF_OP_WRITE_BARRIER
+ * operation code ("barrier request") must be completed prior to the
+ * execution of the barrier request. All writes issued after the barrier
+ * request must not execute until after the completion of the barrier request.
+ *
+ * Optional. See "feature-barrier" XenBus node documentation above.
*/
#define BLKIF_OP_WRITE_BARRIER 2
/*
- * Recognised if "feature-flush-cache" is present in backend xenbus
- * info. A flush will ask the underlying storage hardware to flush its
- * non-volatile caches as appropriate. The "feature-flush-cache" node
- * contains a boolean indicating whether flush requests are likely to
- * succeed or fail. Either way, a flush request may fail at any time
- * with BLKIF_RSP_EOPNOTSUPP if it is unsupported by the underlying
- * block-device hardware. The boolean simply indicates whether or not it
- * is worthwhile for the frontend to attempt flushes. If a backend does
- * not recognise BLKIF_OP_WRITE_FLUSH_CACHE, it should *not* create the
- * "feature-flush-cache" node!
+ * Commit any uncommitted contents of the backing device's volatile cache
+ * to stable storage.
+ *
+ * Optional. See "feature-flush-cache" XenBus node documentation above.
*/
#define BLKIF_OP_FLUSH_DISKCACHE 3
+/*
+ * Used in SLES sources for device specific command packet
+ * contained within the request. Reserved for that purpose.
+ */
+#define BLKIF_OP_RESERVED_1 4
+/*
+ * Indicate to the backend device that a region of storage is no longer in
+ * use, and may be discarded at any time without impact to the client. If
+ * BLKIF_DISCARD_SECURE flag is set on the request, all copies of the
+ * discarded region on the device must be rendered unrecoverable before the
+ * command returns.
+ *
+ * This operation is analogous to performing a trim (ATA) or unmap (SCSI)
+ * command on a native device.
+ *
+ * More information about trim/unmap operations can be found at:
+ * http://t13.org/Documents/UploadedDocuments/docs2008/
+ * e07154r6-Data_Set_Management_Proposal_for_ATA-ACS2.doc
+ * http://www.seagate.com/staticfiles/support/disc/manuals/
+ * Interface%20manuals/100293068c.pdf
+ *
+ * Optional. See "feature-discard", "discard-alignment",
+ * "discard-granularity", and "discard-secure" in the XenBus node
+ * documentation above.
+ */
+#define BLKIF_OP_DISCARD 5
/*
* Maximum scatter/gather segments associated with a request header block.
+ * This is carefully chosen so that sizeof(blkif_ring_t) <= PAGE_SIZE.
+ * NB. This could be 12 if the ring indexes weren't stored in the same page.
*/
#define BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK 11
@@ -92,6 +457,13 @@
*/
#define BLKIF_MAX_SEGMENTS_PER_REQUEST 255
+/*
+ * NB. first_sect and last_sect in blkif_request_segment, as well as
+ * sector_number in blkif_request, are always expressed in 512-byte units.
+ * However they must be properly aligned to the real sector size of the
+ * physical disk, which is reported in the "sector-size" node in the backend
+ * xenbus info. Also the xenbus "sectors" node is expressed in 512-byte units.
+ */
struct blkif_request_segment {
grant_ref_t gref; /* reference to I/O buffer frame */
/* @first_sect: first sector in frame to transfer (inclusive). */
@@ -100,16 +472,60 @@
};
typedef struct blkif_request_segment blkif_request_segment_t;
+/*
+ * Starting ring element for any I/O request.
+ *
+ * One or more segment blocks can be inserted into the request ring
+ * just after a blkif_request_t, allowing requests to operate on
+ * up to BLKIF_MAX_SEGMENTS_PER_REQUEST segments.
+ *
+ * BLKIF_SEGS_TO_BLOCKS() can be used on blkif_request.nr_segments
+ * to determine the number of contiguous ring entries associated
+ * with this request.
+ *
+ * Note: Due to the way Xen request rings operate, the producer and
+ * consumer indices of the ring must be incremented by the
+ * BLKIF_SEGS_TO_BLOCKS() value of the associated request.
+ * (e.g. a response to a 3 ring entry request must also consume
+ * 3 entries in the ring, even though only the first ring entry
+ * in the response has any data.)
+ */
struct blkif_request {
uint8_t operation; /* BLKIF_OP_??? */
uint8_t nr_segments; /* number of segments */
blkif_vdev_t handle; /* only for read/write requests */
uint64_t id; /* private guest value, echoed in resp */
blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
- struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK];
+ blkif_request_segment_t seg[BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK];
};
typedef struct blkif_request blkif_request_t;
+/*
+ * A segment block is a ring request structure that contains only
+ * segment data.
+ *
+ * sizeof(struct blkif_segment_block) <= sizeof(struct blkif_request)
+ */
+struct blkif_segment_block {
+ blkif_request_segment_t seg[BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK];
+};
+typedef struct blkif_segment_block blkif_segment_block_t;
+
+/*
+ * Cast to this structure when blkif_request.operation == BLKIF_OP_DISCARD
+ * sizeof(struct blkif_request_discard) <= sizeof(struct blkif_request)
+ */
+struct blkif_request_discard {
+ uint8_t operation; /* BLKIF_OP_DISCARD */
+ uint8_t flag; /* BLKIF_DISCARD_SECURE or zero */
+#define BLKIF_DISCARD_SECURE (1<<0) /* ignored if discard-secure=0 */
+ blkif_vdev_t handle; /* same as for read/write requests */
+ uint64_t id; /* private guest value, echoed in resp */
+ blkif_sector_t sector_number;/* start sector idx on disk */
+ uint64_t nr_sectors; /* number of contiguous sectors to discard*/
+};
+typedef struct blkif_request_discard blkif_request_discard_t;
+
struct blkif_response {
uint64_t id; /* copied from request */
uint8_t operation; /* copied from request */
@@ -130,24 +546,48 @@
/*
* Generate blkif ring structures and types.
*/
-
DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response);
-#define BLKRING_GET_SG_REQUEST(_r, _idx) \
- ((struct blkif_request_segment *)RING_GET_REQUEST(_r, _idx))
+/*
+ * Index to, and treat as a segment block, an entry in the ring.
+ */
+#define BLKRING_GET_SEG_BLOCK(_r, _idx) \
+ (((blkif_segment_block_t *)RING_GET_REQUEST(_r, _idx))->seg)
+
+/*
+ * The number of ring request blocks required to handle an I/O
+ * request containing _segs segments.
+ */
+#define BLKIF_SEGS_TO_BLOCKS(_segs) \
+ ((((_segs - BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK) \
+ + (BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK - 1)) \
+ / BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK) + /*header_block*/1)
#define VDISK_CDROM 0x1
#define VDISK_REMOVABLE 0x2
#define VDISK_READONLY 0x4
/*
- * The number of ring request blocks required to handle an I/O
- * request containing _segs segments.
+ * Xen-defined major numbers for virtual disks.
*/
-#define BLKIF_SEGS_TO_BLOCKS(_segs) \
- ((((_segs - BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK) \
- + (BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK - 1)) \
- / BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK) + /*header_block*/1)
+#define XEN_IDE0_MAJOR 3
+#define XEN_IDE1_MAJOR 22
+#define XEN_SCSI_DISK0_MAJOR 8
+#define XEN_SCSI_DISK1_MAJOR 65
+#define XEN_SCSI_DISK2_MAJOR 66
+#define XEN_SCSI_DISK3_MAJOR 67
+#define XEN_SCSI_DISK4_MAJOR 68
+#define XEN_SCSI_DISK5_MAJOR 69
+#define XEN_SCSI_DISK6_MAJOR 70
+#define XEN_SCSI_DISK7_MAJOR 71
+#define XEN_SCSI_DISK8_MAJOR 128
+#define XEN_SCSI_DISK9_MAJOR 129
+#define XEN_SCSI_DISK10_MAJOR 130
+#define XEN_SCSI_DISK11_MAJOR 131
+#define XEN_SCSI_DISK12_MAJOR 132
+#define XEN_SCSI_DISK13_MAJOR 133
+#define XEN_SCSI_DISK14_MAJOR 134
+#define XEN_SCSI_DISK15_MAJOR 135
#endif /* __XEN_PUBLIC_IO_BLKIF_H__ */
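To make the segment block accounting in blkif.h concrete: BLKIF_SEGS_TO_BLOCKS() is ordinary ceiling division once the header block's own segment slots are used up. A standalone rendering (the per-block capacities are illustrative stand-ins; the real BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK is sized so a segment block fits a single ring slot):

    #include <stdio.h>

    #define SEGS_PER_HEADER_BLOCK   11  /* BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK */
    #define SEGS_PER_SEGMENT_BLOCK  11  /* stand-in value for illustration */

    /* Ring entries consumed by a request carrying nsegs segments. */
    static int
    segs_to_blocks(int nsegs)
    {
            if (nsegs <= SEGS_PER_HEADER_BLOCK)
                    return (1);
            return ((nsegs - SEGS_PER_HEADER_BLOCK +
                SEGS_PER_SEGMENT_BLOCK - 1) / SEGS_PER_SEGMENT_BLOCK + 1);
    }

    int
    main(void)
    {
            int nsegs;

            /*
             * 1-11 segments -> 1 entry, 12-22 -> 2, 23-33 -> 3, and so on.
             * Remember that the producer and consumer indexes must both
             * advance by this count, per the note above struct blkif_request.
             */
            for (nsegs = 1; nsegs <= 45; nsegs += 11)
                    printf("%2d segments -> %d ring entries\n",
                        nsegs, segs_to_blocks(nsegs));
            return (0);
    }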
diff -x .svn -ur sys/xen/xenbus/xenbusvar.h /usr/home/justing/perforce/SpectraBSD/head/sys/xen/xenbus/xenbusvar.h
--- sys/xen/xenbus/xenbusvar.h 2011-06-10 22:59:01.723658126 -0600
+++ /usr/home/justing/perforce/SpectraBSD/head/sys/xen/xenbus/xenbusvar.h 2012-01-31 16:41:51.486111080 -0700
@@ -104,6 +104,20 @@
XenbusState xenbus_read_driver_state(const char *path);
/**
+ * Return the state of the "other end" (peer) of a XenBus device.
+ *
+ * \param dev The XenBus device whose peer to query.
+ *
+ * \return The current state of the peer device or XenbusStateClosed if no
+ * state can be read.
+ */
+static inline XenbusState
+xenbus_get_otherend_state(device_t dev)
+{
+ return (xenbus_read_driver_state(xenbus_get_otherend_path(dev)));
+}
+
+/**
* Initialize and register a watch on the given path (client suplied storage).
*
* \param dev The XenBus device requesting the watch service.
Only in sys/xen/xenstore: xenstore.c.orig
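For testers: after a successful multi-page connection, the frontend's XenStore directory (xenstore-ls is handy here) should look roughly like the following for a 4 page ring; all values below are hypothetical:

    ring-page-order = "2"
    ring-pages = "4"
    ring-ref0 = "8"
    ring-ref1 = "9"
    ring-ref2 = "10"
    ring-ref3 = "11"
    event-channel = "3"
    protocol = "x86_64-abi"

Note the absence of the single-page "ring-ref" node, per note 6 in the blkif.h documentation.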
