From owner-freebsd-xen@FreeBSD.ORG  Wed Feb  1 21:36:45 2012
From: "Justin T. Gibbs" <gibbs@scsiguy.com>
To: freebsd-xen@freebsd.org
Date: Wed, 1 Feb 2012 14:06:33 -0700
Message-Id: <8FFE3850-7668-48C2-90C1-525213193A33@scsiguy.com>
Subject: [CFT][PATCH] - Rationalize FreeBSD multi-page ring extensions with
 those from other vendors

I've spent some time documenting all of the Xen blkif extensions that are
out in the wild and have modified the extensions in FreeBSD so that it is
possible to make a fully interoperable driver.  Although I have performed
some of my own testing on different Xen Dom0s, I'm looking for additional
testers before I push this into -current and merge it down into the
-stable branches.  I'm most interested in Amazon EC2 testing, since I know
that not all of their nodes run exactly the same software.

Feedback always welcome.

Thanks,
Justin
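
For anyone reviewing the XenStore negotiation changes, the central
interoperability idea can be summarized with a small sketch.  The following
is illustrative only and stands alone (the real logic lives in the blkfront
and blkback changes in the attached diff); it shows a frontend honoring both
a Citrix-style "max-ring-page-order" node and a RedHat/Amazon-style
"max-ring-pages" node.  The node names are the real XenStore keys; the
helper itself and its policy of taking the larger of the two advertised
limits are assumptions for illustration:

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Illustrative sketch of interoperable ring-size negotiation.  A
     * backend may advertise "max-ring-page-order" (lb of the page count)
     * or "max-ring-pages" (a page count), or both.  A frontend that
     * honors both, and later republishes both "ring-page-order" and
     * "ring-pages", can talk to either style of backend.
     */
    static uint32_t
    negotiate_ring_pages(uint32_t max_ring_page_order, /* 0 if node absent */
                         uint32_t max_ring_pages,      /* 0 if node absent */
                         uint32_t frontend_limit)
    {
            uint32_t pages;

            pages = 1 << max_ring_page_order;
            if (max_ring_pages > pages)
                    pages = max_ring_pages;
            if (pages > frontend_limit)
                    pages = frontend_limit;
            /* Round down to a power of 2, as required for ring-pages. */
            while (pages & (pages - 1))
                    pages &= pages - 1;
            return (pages != 0 ? pages : 1);
    }

    int
    main(void)
    {
            /* Backend advertising max-ring-page-order=2, no max-ring-pages. */
            printf("%u\n", negotiate_ring_pages(2, 0, 8));  /* prints 4 */
            return (0);
    }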

[Attachment: blkif.diffs]

diff -x .svn -ur sys/dev/xen/blkback/blkback.c /usr/home/justing/perforce/SpectraBSD/head/sys/dev/xen/blkback/blkback.c
--- sys/dev/xen/blkback/blkback.c	2012-01-31 17:31:44.383114171 -0700
+++ /usr/home/justing/perforce/SpectraBSD/head/sys/dev/xen/blkback/blkback.c	2012-02-01 12:55:39.958114726 -0700
@@ -40,6 +40,8 @@
  * a FreeBSD domain to other domains.
  */
+#include "opt_kdtrace.h"
+
 #include
 #include
 #include
@@ -63,6 +65,7 @@
 #include
 #include
 #include
+#include <sys/sdt.h>
 
 #include
@@ -980,9 +983,10 @@
 static uint8_t *
 xbb_get_kva(struct xbb_softc *xbb, int nr_pages)
 {
-	intptr_t first_clear, num_clear;
+	intptr_t first_clear;
+	intptr_t num_clear;
 	uint8_t	*free_kva;
-	int      i;
+	int	 i;
 
 	KASSERT(nr_pages != 0, ("xbb_get_kva of zero length"));
@@ -1681,19 +1685,19 @@
 		req_ring_idx++;
 		switch (xbb->abi) {
 		case BLKIF_PROTOCOL_NATIVE:
-			sg = BLKRING_GET_SG_REQUEST(&xbb->rings.native,
-						    req_ring_idx);
+			sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.native,
+						   req_ring_idx);
 			break;
 		case BLKIF_PROTOCOL_X86_32:
 		{
-			sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_32,
-						    req_ring_idx);
+			sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_32,
+						   req_ring_idx);
 			break;
 		}
 		case BLKIF_PROTOCOL_X86_64:
 		{
-			sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_64,
-						    req_ring_idx);
+			sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_64,
+						   req_ring_idx);
 			break;
 		}
 		default:
@@ -1817,8 +1821,8 @@
 	struct xbb_xen_reqlist *reqlist;
 
-	xbb   = (struct xbb_softc *)context;
-	rings = &xbb->rings;
+	xbb    = (struct xbb_softc *)context;
+	rings  = &xbb->rings;
 
 	/*
 	 * Work gather and dispatch loop.  Note that we have a bias here
@@ -2032,6 +2036,13 @@
 	taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
 }
 
+SDT_PROVIDER_DEFINE(xbb);
+SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, flush, "int");
+SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, read, "int", "uint64_t",
+		  "uint64_t");
+SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, write, "int",
+		  "uint64_t", "uint64_t");
+
 /*----------------------------- Backend Handlers -----------------------------*/
 /**
  * Backend handler for character device access.
@@ -2087,6 +2098,9 @@
 
 		nreq->pendcnt	 = 1;
 
+		SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush,
+			   device_get_unit(xbb->dev));
+
 		(*dev_data->csw->d_strategy)(bio);
 
 		return (0);
@@ -2181,6 +2195,17 @@
 				  bios[bio_idx]->bio_bcount);
 		}
 #endif
+		if (operation == BIO_READ) {
+			SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read,
+				   device_get_unit(xbb->dev),
+				   bios[bio_idx]->bio_offset,
+				   bios[bio_idx]->bio_length);
+		} else if (operation == BIO_WRITE) {
+			SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write,
+				   device_get_unit(xbb->dev),
+				   bios[bio_idx]->bio_offset,
+				   bios[bio_idx]->bio_length);
+		}
 		(*dev_data->csw->d_strategy)(bios[bio_idx]);
 	}
 
@@ -2193,6 +2218,12 @@
 	return (error);
 }
 
+SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, flush, "int");
+SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, read, "int", "uint64_t",
+		  "uint64_t");
+SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, write, "int",
+		  "uint64_t", "uint64_t");
+
 /**
  * Backend handler for file access.
  *
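
The SDT_PROBE* additions above are DTrace statically-defined tracing probes
(hence the new opt_kdtrace.h and <sys/sdt.h> includes).  On a kernel built
with KDTRACE_HOOKS they are nearly free when disabled and can be enabled at
run time; for example, a hypothetical one-liner such as
dtrace -n 'xbb:kernel:xbb_dispatch_dev:read' would fire on each backend read
dispatch, with the unit number, offset, and length available as arg0-arg2.
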
@@ -2237,6 +2268,9 @@
 	case BIO_FLUSH: {
 		struct mount *mountpoint;
 
+		SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush,
+			   device_get_unit(xbb->dev));
+
 		vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount);
 
 		(void) vn_start_write(xbb->vn, &mountpoint, V_WAIT);
@@ -2336,6 +2370,10 @@
 
 	switch (operation) {
 	case BIO_READ:
+		SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read,
+			   device_get_unit(xbb->dev), xuio.uio_offset,
+			   xuio.uio_resid);
+
 		vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
 
 		/*
@@ -2366,6 +2404,10 @@
 	case BIO_WRITE: {
 		struct mount *mountpoint;
 
+		SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write,
+			   device_get_unit(xbb->dev), xuio.uio_offset,
+			   xuio.uio_resid);
+
 		(void)vn_start_write(xbb->vn, &mountpoint, V_WAIT);
 
 		vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
@@ -3028,6 +3070,8 @@
 	const char	*otherend_path;
 	int		 error;
 	u_int		 ring_idx;
+	u_int		 ring_page_order;
+	size_t		 ring_size;
 
 	otherend_path = xenbus_get_otherend_path(xbb->dev);
 
@@ -3042,16 +3086,13 @@
 	/*
 	 * Mandatory data (used in all versions of the protocol) first.
 	 */
-	error = xs_gather(XST_NIL, otherend_path,
-			  "ring-ref", "%" PRIu32,
-			  &xbb->ring_config.ring_ref[0],
-			  "event-channel", "%" PRIu32,
-			  &xbb->ring_config.evtchn,
-			  NULL);
+	error = xs_scanf(XST_NIL, otherend_path,
+			 "event-channel", NULL, "%" PRIu32,
+			 &xbb->ring_config.evtchn);
 	if (error != 0) {
 		xenbus_dev_fatal(xbb->dev, error,
-				 "Unable to retrieve ring information from "
-				 "frontend %s.  Unable to connect.",
+				 "Unable to retrieve event-channel information "
+				 "from frontend %s.  Unable to connect.",
 				 xenbus_get_otherend_path(xbb->dev));
 		return (error);
 	}
@@ -3065,10 +3106,19 @@
 	 * we must use independant calls in order to guarantee
 	 * we don't miss information in a sparsly populated front-end
 	 * tree.
+	 * \note xs_scanf() does not update variables for unmatched
+	 *	 fields.
 	 */
+	ring_page_order = 0;
+	(void)xs_scanf(XST_NIL, otherend_path,
+		       "ring-page-order", NULL, "%u",
+		       &ring_page_order);
+	xbb->ring_config.ring_pages = 1 << ring_page_order;
 	(void)xs_scanf(XST_NIL, otherend_path,
 		       "ring-pages", NULL, "%u",
 		       &xbb->ring_config.ring_pages);
+	ring_size = PAGE_SIZE * xbb->ring_config.ring_pages;
+	xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size);
 
 	(void)xs_scanf(XST_NIL, otherend_path,
 		       "max-requests", NULL, "%u",
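
For a sense of scale, BLKIF_MAX_RING_REQUESTS() grows roughly linearly with
the negotiated ring size.  A minimal model of the computation, assuming the
usual 64-byte shared ring header and 112-byte native request entries
(assumptions about the generic ring macros, not values taken from this
patch):

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Rough model of BLKIF_MAX_RING_REQUESTS(): entries that fit after
     * the shared ring header, rounded down to a power of 2.
     */
    static uint32_t
    max_ring_requests(uint32_t ring_bytes)
    {
            uint32_t n = (ring_bytes - 64) / 112;

            while (n & (n - 1))
                    n &= n - 1;
            return (n);
    }

    int
    main(void)
    {
            printf("%u\n", max_ring_requests(4096));      /* 32  */
            printf("%u\n", max_ring_requests(4 * 4096));  /* 128 */
            return (0);
    }
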
@@ -3116,22 +3166,39 @@
 		return (EINVAL);
 	}
 
-	/* If using a multi-page ring, pull in the remaining references. */
-	for (ring_idx = 1; ring_idx < xbb->ring_config.ring_pages; ring_idx++) {
-		char ring_ref_name[]= "ring_refXX";
-
-		snprintf(ring_ref_name, sizeof(ring_ref_name),
-			 "ring-ref%u", ring_idx);
-		error = xs_scanf(XST_NIL, otherend_path,
-				 ring_ref_name, NULL, "%" PRIu32,
-				 &xbb->ring_config.ring_ref[ring_idx]);
+	if (xbb->ring_config.ring_pages == 1) {
+		error = xs_gather(XST_NIL, otherend_path,
+				  "ring-ref", "%" PRIu32,
+				  &xbb->ring_config.ring_ref[0],
+				  NULL);
 		if (error != 0) {
 			xenbus_dev_fatal(xbb->dev, error,
-					 "Failed to retriev grant reference "
-					 "for page %u of shared ring.  Unable "
-					 "to connect.", ring_idx);
+					 "Unable to retrieve ring information "
+					 "from frontend %s.  Unable to "
+					 "connect.",
+					 xenbus_get_otherend_path(xbb->dev));
 			return (error);
 		}
+	} else {
+		/* Multi-page ring format. */
+		for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages;
+		     ring_idx++) {
+			char ring_ref_name[]= "ring_refXX";
+
+			snprintf(ring_ref_name, sizeof(ring_ref_name),
+				 "ring-ref%u", ring_idx);
+			error = xs_scanf(XST_NIL, otherend_path,
+					 ring_ref_name, NULL, "%" PRIu32,
+					 &xbb->ring_config.ring_ref[ring_idx]);
+			if (error != 0) {
+				xenbus_dev_fatal(xbb->dev, error,
+						 "Failed to retrieve grant "
+						 "reference for page %u of "
+						 "shared ring.  Unable "
+						 "to connect.", ring_idx);
+				return (error);
+			}
+		}
 	}
 
 	error = xs_gather(XST_NIL, otherend_path,
@@ -3197,8 +3264,8 @@
 static int
 xbb_alloc_request_lists(struct xbb_softc *xbb)
 {
-	int i;
 	struct xbb_xen_reqlist *reqlist;
+	int			i;
 
 	/*
 	 * If no requests can be merged, we need 1 request list per
@@ -3318,7 +3385,7 @@
 static void
 xbb_connect(struct xbb_softc *xbb)
 {
-	int		      error;
+	int		       error;
 
 	if (xenbus_get_state(xbb->dev) == XenbusStateConnected)
 		return;
@@ -3399,7 +3466,8 @@
 static int
 xbb_shutdown(struct xbb_softc *xbb)
 {
-	int error;
+	XenbusState frontState;
+	int	    error;
 
 	DPRINTF("\n");
 
@@ -3413,6 +3481,20 @@
 	if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0)
 		return (EAGAIN);
 
+	xbb->flags |= XBBF_IN_SHUTDOWN;
+	mtx_unlock(&xbb->lock);
+
+	if (xenbus_get_state(xbb->dev) < XenbusStateClosing)
+		xenbus_set_state(xbb->dev, XenbusStateClosing);
+
+	frontState = xenbus_get_otherend_state(xbb->dev);
+	mtx_lock(&xbb->lock);
+	xbb->flags &= ~XBBF_IN_SHUTDOWN;
+
+	/* The front can submit I/O until entering the closed state. */
+	if (frontState < XenbusStateClosed)
+		return (EAGAIN);
+
 	DPRINTF("\n");
 
 	/* Indicate shutdown is in progress. */
@@ -3434,19 +3516,6 @@
 
 	DPRINTF("\n");
 
-	/*
-	 * Before unlocking mutex, set this flag to prevent other threads from
-	 * getting into this function
-	 */
-	xbb->flags |= XBBF_IN_SHUTDOWN;
-	mtx_unlock(&xbb->lock);
-
-	if (xenbus_get_state(xbb->dev) < XenbusStateClosing)
-		xenbus_set_state(xbb->dev, XenbusStateClosing);
-
-	mtx_lock(&xbb->lock);
-	xbb->flags &= ~XBBF_IN_SHUTDOWN;
-
 	/* Indicate to xbb_detach() that is it safe to proceed. */
 	wakeup(xbb);
@@ -3573,6 +3642,11 @@
 			"max_request_segments", CTLFLAG_RD,
 			&xbb->max_request_segments, 0,
 			"maximum number of pages per requests (negotiated)");
+
+	SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
+			"ring_pages", CTLFLAG_RD,
+			&xbb->ring_config.ring_pages, 0,
+			"communication channel pages (negotiated)");
 }
 
 /**
@@ -3587,6 +3661,7 @@
 {
 	struct xbb_softc	*xbb;
 	int			 error;
+	u_int			 max_ring_page_order;
 
 	DPRINTF("Attaching to %s\n", xenbus_get_node(dev));
 
@@ -3621,6 +3696,10 @@
 		return (error);
 	}
 
+	/*
+	 * Amazon EC2 client compatibility.  They refer to max-ring-pages
+	 * instead of max-ring-page-order.
+	 */
 	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
			  "max-ring-pages", "%zu", XBB_MAX_RING_PAGES);
 	if (error) {
@@ -3629,6 +3708,15 @@
 		return (error);
 	}
 
+	max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1;
+	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+			  "max-ring-page-order", "%u", max_ring_page_order);
+	if (error) {
+		xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order",
+				  xenbus_get_node(xbb->dev));
+		return (error);
+	}
+
 	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
			  "max-requests", "%u", XBB_MAX_REQUESTS);
 	if (error) {
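
flsl() returns the one-based index of the most significant set bit, so for a
power of two flsl(x) - 1 is lb(x); if XBB_MAX_RING_PAGES were 32 (an example
value only; the real constant is defined elsewhere in blkback.c), the node
published above would be max-ring-page-order = 5.
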
@@ -3862,12 +3950,16 @@
 		xbb_connect(xbb);
 		break;
 	case XenbusStateClosing:
+		/*
+		 * Frontend has acknowledged Closing request.
+		 * Wait for Closed state.
+		 */
+		break;
 	case XenbusStateClosed:
 		mtx_lock(&xbb->lock);
 		xbb_shutdown(xbb);
 		mtx_unlock(&xbb->lock);
-		if (frontend_state == XenbusStateClosed)
-			xenbus_set_state(xbb->dev, XenbusStateClosed);
+		xenbus_set_state(xbb->dev, XenbusStateClosed);
 		break;
 	default:
 		xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend",
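
A note on the d_maxsize change in the first blkfront hunk below: a transfer
that does not start on a page boundary can touch one more page than its
length alone implies, so the front end holds back one page of headroom to
stay within the negotiated segment count.  The arithmetic, as a
self-contained sketch:

    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    /* Pages spanned by a transfer of 'len' bytes starting at 'offset'. */
    static unsigned long
    pages_spanned(unsigned long offset, unsigned long len)
    {
            unsigned long first = offset / PAGE_SIZE;
            unsigned long last = (offset + len - 1) / PAGE_SIZE;

            return (last - first + 1);
    }

    int
    main(void)
    {
            /* Page aligned: 11 pages for 11 * PAGE_SIZE bytes. */
            printf("%lu\n", pages_spanned(0, 11 * PAGE_SIZE));
            /* Misaligned: the same length now touches 12 pages. */
            printf("%lu\n", pages_spanned(512, 11 * PAGE_SIZE));
            return (0);
    }
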
diff -x .svn -ur sys/dev/xen/blkfront/blkfront.c /usr/home/justing/perforce/SpectraBSD/head/sys/dev/xen/blkfront/blkfront.c
--- sys/dev/xen/blkfront/blkfront.c	2011-12-09 10:56:37.165520312 -0700
+++ /usr/home/justing/perforce/SpectraBSD/head/sys/dev/xen/blkfront/blkfront.c	2012-02-01 12:59:37.312115640 -0700
@@ -226,7 +226,7 @@
 	sc->xb_disk->d_sectorsize = sector_size;
 
 	sc->xb_disk->d_mediasize = sectors * sector_size;
-	sc->xb_disk->d_maxsize = sc->max_request_size;
+	sc->xb_disk->d_maxsize = sc->max_request_size - PAGE_SIZE;
 	sc->xb_disk->d_flags = 0;
 	disk_create(sc->xb_disk, DISK_VERSION_00);
 
@@ -501,6 +501,7 @@
 {
 	const char *otherend_path;
 	const char *node_path;
+	uint32_t max_ring_page_order;
 	int error;
 	int i;
 
@@ -513,6 +514,7 @@
 	 * Protocol defaults valid even if negotiation for a
 	 * setting fails.
 	 */
+	max_ring_page_order = 0;
 	sc->ring_pages = 1;
 	sc->max_requests = BLKIF_MAX_RING_REQUESTS(PAGE_SIZE);
 	sc->max_request_segments = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK;
@@ -526,12 +528,22 @@
 	 * we must use independant calls in order to guarantee
 	 * we don't miss information in a sparsly populated back-end
 	 * tree.
+	 * \note xs_scanf() does not update variables for unmatched
+	 *	 fields.
 	 */
 	otherend_path = xenbus_get_otherend_path(sc->xb_dev);
 	node_path = xenbus_get_node(sc->xb_dev);
+
+	/* Support both backend schemes for relaying ring page limits. */
+	(void)xs_scanf(XST_NIL, otherend_path,
+		       "max-ring-page-order", NULL, "%" PRIu32,
+		       &max_ring_page_order);
+	sc->ring_pages = 1 << max_ring_page_order;
 	(void)xs_scanf(XST_NIL, otherend_path,
 		       "max-ring-pages", NULL, "%" PRIu32,
 		       &sc->ring_pages);
+	if (sc->ring_pages < 1)
+		sc->ring_pages = 1;
 
 	(void)xs_scanf(XST_NIL, otherend_path,
 		       "max-requests", NULL, "%" PRIu32,
@@ -552,6 +564,16 @@
 		sc->ring_pages = XBF_MAX_RING_PAGES;
 	}
 
+	if (powerof2(sc->ring_pages) == 0) {
+		u_int new_page_limit;
+
+		new_page_limit = 0x01 << (fls(sc->ring_pages) - 1);
+		device_printf(sc->xb_dev, "Back-end specified ring-pages of "
+			      "%u is not a power of 2. Limited to %u.\n",
+			      sc->ring_pages, new_page_limit);
+		sc->ring_pages = new_page_limit;
+	}
+
 	if (sc->max_requests > XBF_MAX_REQUESTS) {
 		device_printf(sc->xb_dev, "Back-end specified max_requests of "
			      "%u limited to front-end limit of %u.\n",
@@ -625,6 +647,7 @@
 	if (setup_blkring(sc) != 0)
 		return;
 
+	/* Support both backend schemes for relaying ring page limits. */
 	error = xs_printf(XST_NIL, node_path,
 			  "ring-pages","%u", sc->ring_pages);
 	if (error) {
@@ -633,6 +656,14 @@
 			  node_path);
 		return;
 	}
+	error = xs_printf(XST_NIL, node_path,
+			  "ring-page-order","%u", fls(sc->ring_pages) - 1);
+	if (error) {
+		xenbus_dev_fatal(sc->xb_dev, error,
+				 "writing %s/ring-page-order",
+				 node_path);
+		return;
+	}
 
 	error = xs_printf(XST_NIL, node_path,
 			  "max-requests","%u", sc->max_requests);
@@ -795,7 +826,7 @@
 	unsigned int binfo;
 	int err, feature_barrier;
 
-	if( (sc->connected == BLKIF_STATE_CONNECTED) || 
+	if( (sc->connected == BLKIF_STATE_CONNECTED) ||
 	    (sc->connected == BLKIF_STATE_SUSPENDED) )
 		return;
 
@@ -923,15 +954,13 @@
 		return (ENXIO);
 	sc->xb_flags &= ~XB_OPEN;
 	if (--(sc->users) == 0) {
-		/* Check whether we have been instructed to close.  We will
-		   have ignored this request initially, as the device was
-		   still mounted. */
-		device_t dev = sc->xb_dev;
-		XenbusState state =
-			xenbus_read_driver_state(xenbus_get_otherend_path(dev));
-
-		if (state == XenbusStateClosing)
-			blkfront_closing(dev);
+		/*
+		 * Check whether we have been instructed to close.  We will
+		 * have ignored this request initially, as the device was
+		 * still mounted.
+		 */
+		if (xenbus_get_otherend_state(sc->xb_dev) == XenbusStateClosing)
+			blkfront_closing(sc->xb_dev);
 	}
 	return (0);
 }
@@ -1033,7 +1062,7 @@
 	struct xb_command *cm;
 	blkif_request_t	*ring_req;
 	struct blkif_request_segment *sg;
-	struct blkif_request_segment *last_block_sg;
+	struct blkif_request_segment *last_block_sg;
 	grant_ref_t *sg_ref;
 	vm_paddr_t buffer_ma;
 	uint64_t fsect, lsect;
@@ -1104,12 +1133,12 @@
 			nsegs--;
 		}
 		block_segs = MIN(nsegs, BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK);
-		if (block_segs == 0)
-			break;
+		if (block_segs == 0)
+			break;
 
-		sg = BLKRING_GET_SG_REQUEST(&sc->ring, sc->ring.req_prod_pvt);
+		sg = BLKRING_GET_SEG_BLOCK(&sc->ring, sc->ring.req_prod_pvt);
 		sc->ring.req_prod_pvt++;
-		last_block_sg = sg + block_segs;
+		last_block_sg = sg + block_segs;
 	}
 
 	if (cm->operation == BLKIF_OP_READ)
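
The powerof2() guard above handles backends that advertise a
non-power-of-two ring-pages limit: fls(n) - 1 is the index of the highest
set bit, so 1 << (fls(n) - 1) rounds n down to a power of two.  For example,
fls(7) - 1 == 2, so a claimed 7-page limit is used as a 4-page ring.
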
diff -x .svn -ur sys/xen/interface/io/blkif.h /usr/home/justing/perforce/SpectraBSD/head/sys/xen/interface/io/blkif.h
--- sys/xen/interface/io/blkif.h	2010-10-27 22:12:14.560797810 -0600
+++ /usr/home/justing/perforce/SpectraBSD/head/sys/xen/interface/io/blkif.h	2012-01-31 16:11:47.565117660 -0700
@@ -1,8 +1,8 @@
 /******************************************************************************
  * blkif.h
- * 
+ *
  * Unified block-device I/O interface for Xen guest OSes.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
  * deal in the Software without restriction, including without limitation the
@@ -22,6 +22,7 @@
  * DEALINGS IN THE SOFTWARE.
  *
  * Copyright (c) 2003-2004, Keir Fraser
+ * Copyright (c) 2012, Spectra Logic Corporation
  */
 
 #ifndef __XEN_PUBLIC_IO_BLKIF_H__
@@ -35,7 +36,7 @@
  * notification can be made conditional on req_event (i.e., the generic
  * hold-off mechanism provided by the ring macros). Backends must set
  * req_event appropriately (e.g., using RING_FINAL_CHECK_FOR_REQUESTS()).
- * 
+ *
  * Back->front notifications: When enqueuing a new response, sending a
 * notification can be made conditional on rsp_event (i.e., the generic
 * hold-off mechanism provided by the ring macros). Frontends must set
@@ -48,37 +49,401 @@
 #define blkif_sector_t uint64_t
 
 /*
+ * Feature and Parameter Negotiation
+ * =================================
+ * The two halves of a Xen block driver utilize nodes within the XenStore to
+ * communicate capabilities and to negotiate operating parameters.  This
+ * section enumerates these nodes which reside in the respective front and
+ * backend portions of the XenStore, following the XenBus convention.
+ *
+ * All data in the XenStore is stored as strings.  Nodes specifying numeric
+ * values are encoded in decimal.  Integer value ranges listed below are
+ * expressed as fixed sized integer types capable of storing the conversion
+ * of a properly formatted node string, without loss of information.
+ *
+ * Any specified default value is in effect if the corresponding XenBus node
+ * is not present in the XenStore.
+ *
+ * See the XenBus state transition diagram below for details on when XenBus
+ * nodes must be published and when they can be queried.
+ *
+ *****************************************************************************
+ *                            Backend XenBus Nodes
+ *****************************************************************************
+ *
+ *--------------------------------- Features ---------------------------------
+ *
+ * feature-barrier
+ *      Values:         0/1 (boolean)
+ *      Default Value:  0
+ *
+ *      A value of "1" indicates that the backend can process requests
+ *      containing the BLKIF_OP_WRITE_BARRIER request opcode.  Requests
+ *      of this type may still be returned at any time with the
+ *      BLKIF_RSP_EOPNOTSUPP result code.
+ *
+ * feature-flush-cache
+ *      Values:         0/1 (boolean)
+ *      Default Value:  0
+ *
+ *      A value of "1" indicates that the backend can process requests
+ *      containing the BLKIF_OP_FLUSH_DISKCACHE request opcode.  Requests
+ *      of this type may still be returned at any time with the
+ *      BLKIF_RSP_EOPNOTSUPP result code.
+ *
+ * feature-discard
+ *      Values:         0/1 (boolean)
+ *      Default Value:  0
+ *
+ *      A value of "1" indicates that the backend can process requests
+ *      containing the BLKIF_OP_DISCARD request opcode.  Requests
+ *      of this type may still be returned at any time with the
+ *      BLKIF_RSP_EOPNOTSUPP result code.
+ *
+ *----------------------- Request Transport Parameters ------------------------
+ *
+ * max-ring-page-order
+ *      Values:         <uint32_t>
+ *      Default Value:  0
+ *      Notes:          1, 3
+ *
+ *      The maximum supported size of the request ring buffer in units of
+ *      lb(machine pages). (e.g. 0 == 1 page, 1 == 2 pages, 2 == 4 pages,
+ *      etc.).
+ *
+ * max-ring-pages
+ *      Values:         <uint32_t>
+ *      Default Value:  1
+ *      Notes:          2, 3
+ *
+ *      The maximum supported size of the request ring buffer in units of
+ *      machine pages.  The value must be a power of 2.
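+ *
+ *      For example, a backend able to map a ring of up to 4 pages can
+ *      publish max-ring-page-order = 2, max-ring-pages = 4, or (for
+ *      interoperability with both frontend conventions) both nodes.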
+ *
+ * max-requests
+ *      Values:         <uint32_t>
+ *      Default Value:  BLKIF_MAX_RING_REQUESTS(PAGE_SIZE)
+ *      Maximum Value:  BLKIF_MAX_RING_REQUESTS(PAGE_SIZE * max-ring-pages)
+ *
+ *      The maximum number of concurrent requests supported by the backend.
+ *
+ * max-request-segments
+ *      Values:         <uint8_t>
+ *      Default Value:  BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK
+ *      Maximum Value:  255
+ *
+ *      The maximum value of blkif_request.nr_segments supported by
+ *      the backend.
+ *
+ * max-request-size
+ *      Values:         <uint32_t>
+ *      Default Value:  BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK * PAGE_SIZE
+ *      Maximum Value:  255 * PAGE_SIZE
+ *
+ *      The maximum amount of data, in bytes, that can be referenced by a
+ *      request type that accesses frontend memory (currently BLKIF_OP_READ,
+ *      BLKIF_OP_WRITE, or BLKIF_OP_WRITE_BARRIER).
+ *
+ *----------------------- Backend Device Identification -----------------------
+ *
+ * mode
+ *      Values:         "r" (read only), "w" (writable)
+ *
+ *      The read or write access permissions to the backing store to be
+ *      granted to the frontend.
+ *
+ * params
+ *      Values:         string
+ *
+ *      A free formatted string providing sufficient information for the
+ *      backend driver to open the backing device.  (e.g. the path to the
+ *      file or block device representing the backing store.)
+ *
+ * type
+ *      Values:         "file", "phy", "tap"
+ *
+ *      The type of the backing device/object.
+ *
+ *------------------------- Backend Device Properties -------------------------
+ *
+ * discard-alignment
+ *      Values:         <uint32_t>
+ *      Default Value:  0
+ *      Notes:          4, 5
+ *
+ *      The offset, in bytes from the beginning of the virtual block device,
+ *      to the first, addressable, discard extent on the underlying device.
+ *
+ * discard-granularity
+ *      Values:         <uint32_t>
+ *      Default Value:  512
+ *      Notes:          4
+ *
+ *      The size, in bytes, of the individually addressable discard extents
+ *      of the underlying device.
+ *
+ * discard-secure
+ *      Values:         0/1 (boolean)
+ *      Default Value:  0
+ *
+ *      A value of "1" indicates that the backend can process BLKIF_OP_DISCARD
+ *      requests with the BLKIF_DISCARD_SECURE flag set.
+ *
+ * info
+ *      Values:         <uint32_t> (bitmap)
+ *
+ *      A collection of bit flags describing attributes of the backing
+ *      device.  The VDISK_* macros define the meaning of each bit
+ *      location.
+ *
+ * sector-size
+ *      Values:         <uint32_t>
+ *
+ *      The native sector size, in bytes, of the backend device.
+ *
+ * sectors
+ *      Values:         <uint64_t>
+ *
+ *      The size of the backend device, expressed in units of its native
+ *      sector size ("sector-size").
+ *
+ *****************************************************************************
+ *                            Frontend XenBus Nodes
+ *****************************************************************************
+ *
+ *----------------------- Request Transport Parameters -----------------------
+ *
+ * event-channel
+ *      Values:         <uint32_t>
+ *
+ *      The identifier of the Xen event channel used to signal activity
+ *      in the ring buffer.
+ *
+ * ring-ref
+ *      Values:         <uint32_t>
+ *      Notes:          6
+ *
+ *      The Xen grant reference granting permission for the backend to map
+ *      the sole page in a single page sized ring buffer.
+ *
+ * ring-ref%u
+ *      Values:         <uint32_t>
+ *      Notes:          6
+ *
+ *      For a frontend providing a multi-page ring, a "ring-pages" sized
+ *      list of nodes, each containing a Xen grant reference granting
+ *      permission for the backend to map the page of the ring located
+ *      at page index "%u".  Page indexes are zero based.
+ *
+ * protocol
+ *      Values:         string (XEN_IO_PROTO_ABI_*)
+ *      Default Value:  XEN_IO_PROTO_ABI_NATIVE
+ *
+ *      The machine ABI rules governing the format of all ring request and
+ *      response structures.
+ *
+ * ring-page-order
+ *      Values:         <uint32_t>
+ *      Default Value:  0
+ *      Maximum Value:  MAX(ffs(max-ring-pages) - 1, max-ring-page-order)
+ *      Notes:          1, 3
+ *
+ *      The size of the frontend allocated request ring buffer in units
+ *      of lb(machine pages). (e.g. 0 == 1 page, 1 == 2 pages, 2 == 4 pages,
+ *      etc.).
+ *
+ * ring-pages
+ *      Values:         <uint32_t>
+ *      Default Value:  1
+ *      Maximum Value:  MAX(max-ring-pages, (0x1 << max-ring-page-order))
+ *      Notes:          2, 3
+ *
+ *      The size of the frontend allocated request ring buffer in units of
+ *      machine pages.  The value must be a power of 2.
+ *
+ * max-requests
+ *      Values:         <uint32_t>
+ *      Default Value:  BLKIF_MAX_RING_REQUESTS(PAGE_SIZE)
+ *      Maximum Value:  BLKIF_MAX_RING_REQUESTS(PAGE_SIZE * max-ring-pages)
+ *
+ *      The maximum number of concurrent requests that will be issued by
+ *      the frontend.
+ *
+ * max-request-segments
+ *      Values:         <uint8_t>
+ *      Default Value:  BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK
+ *      Maximum Value:  MIN(255, backend/max-request-segments)
+ *
+ *      The maximum value the frontend will set in the
+ *      blkif_request.nr_segments field.
+ *
+ * max-request-size
+ *      Values:         <uint32_t>
+ *      Default Value:  BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK * PAGE_SIZE
+ *      Maximum Value:  max-request-segments * PAGE_SIZE
+ *      Notes:          3
+ *
+ *      The maximum amount of data, in bytes, that can be referenced by
+ *      a request type that accesses frontend memory (currently BLKIF_OP_READ,
+ *      BLKIF_OP_WRITE, or BLKIF_OP_WRITE_BARRIER).
+ *
+ *------------------------- Virtual Device Properties -------------------------
+ *
+ * device-type
+ *      Values:         "disk", "cdrom", "floppy", etc.
+ *
+ * virtual-device
+ *      Values:         (XEN_*_MAJOR << 8 | Minor)
+ *
+ *      A value indicating the physical device to virtualize within the
+ *      frontend's domain.  (e.g. "The first ATA disk", "The third SCSI
+ *      disk", etc.)
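+ *
+ *      For example, a value of 768 ((XEN_IDE0_MAJOR << 8) | 0, with
+ *      XEN_IDE0_MAJOR == 3 as defined at the end of this header) requests
+ *      that the frontend present the device as the first IDE disk.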
+ *
+ * Notes
+ * -----
+ *  (1) Multi-page ring buffer scheme first developed in the Citrix XenServer
+ *      PV drivers.
+ *  (2) Multi-page ring buffer scheme first used in some RedHat distributions
+ *      including a distribution deployed on certain nodes of the Amazon
+ *      EC2 cluster.
+ *  (3) Support for multi-page ring buffers was implemented independently,
+ *      in slightly different forms, by both Citrix and RedHat/Amazon.
+ *      For full interoperability, block front and backends should support
+ *      both methods of negotiating this capability.
+ *  (4) Devices that support discard functionality may internally allocate
+ *      space (discardable extents) in units that are larger than the
+ *      exported logical block size.
+ *  (5) The discard-alignment parameter allows a physical device to be
+ *      partitioned into virtual devices that do not necessarily begin or
+ *      end on a discardable extent.
+ *  (6) When there is only a single page allocated to the request ring,
+ *      'ring-ref' is used to communicate the grant reference for this
+ *      page to the backend.  When using a multi-page ring, the 'ring-ref'
+ *      node is not created.  Instead 'ring-ref0' - 'ring-refN' are used.
+ */
+
+/*
+ * STATE DIAGRAMS
+ *
+ *****************************************************************************
+ *                                   Startup                                 *
+ *****************************************************************************
+ *
+ * Tool stack creates front and back nodes with state XenbusStateInitialising.
+ *
+ * Front                                Back
+ * =================================    =====================================
+ * XenbusStateInitialising              XenbusStateInitialising
+ *  o Query virtual device              o Query backend device identification
+ *    properties.                         data.
+ *  o Setup OS device instance.         o Open and validate backend device.
+ *                                      o Publish backend features and
+ *                                        transport parameters.
+ *                                                     |
+ *                                                     |
+ *                                                     V
+ *                                      XenbusStateInitWait
+ *
+ * o Query backend features and
+ *   transport parameters.
+ * o Allocate and initialize the
+ *   request ring.
+ * o Publish transport parameters
+ *   that will be in effect during
+ *   this connection.
+ *              |
+ *              |
+ *              V
+ * XenbusStateInitialised
+ *
+ *                                      o Query frontend transport parameters.
+ *                                      o Connect to the request ring and
+ *                                        event channel.
+ *                                      o Publish backend device properties.
+ *                                                     |
+ *                                                     |
+ *                                                     V
+ *                                      XenbusStateConnected
+ *
+ *  o Query backend device properties.
+ *  o Finalize OS virtual device
+ *    instance.
+ *              |
+ *              |
+ *              V
+ * XenbusStateConnected
+ *
+ * Note: Drivers that do not support the negotiation of transport parameters
+ *       can skip certain states in the state machine:
+ *
+ *       o A frontend may transition to XenbusStateInitialised without
+ *         waiting for the backend to enter XenbusStateInitWait.  In this
+ *         case, default transport parameters are in effect and any
+ *         transport parameters published by the frontend must contain
+ *         their default values.
+ *
+ *       o A backend may transition to XenbusStateInitialised without
+ *         waiting for the frontend to first enter the XenbusStateInitialised
+ *         state.  In this case, default transport parameters are in effect
+ *         and any transport parameters published by the backend must contain
+ *         their default values.
+ *
+ *       Drivers that support transport parameter negotiation must tolerate
+ *       these additional state transition paths in order to interoperate
+ *       with drivers that do not.  In general this means performing the
+ *       work of any skipped state transition, if it has not already been
+ *       performed, in addition to the work associated with the current state.
+ */
+
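+/*
+ * For example, a frontend that performs no negotiation at all may publish
+ * only "ring-ref" and "event-channel" and transition straight to
+ * XenbusStateInitialised; per the note above, a single page ring and the
+ * default transport parameters are then in effect.
+ */
+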
+/*
  * REQUEST CODES.
  */
 #define BLKIF_OP_READ              0
 #define BLKIF_OP_WRITE             1
 /*
- * Recognised only if "feature-barrier" is present in backend xenbus info.
- * The "feature-barrier" node contains a boolean indicating whether barrier
- * requests are likely to succeed or fail. Either way, a barrier request
- * may fail at any time with BLKIF_RSP_EOPNOTSUPP if it is unsupported by
- * the underlying block-device hardware. The boolean simply indicates whether
- * or not it is worthwhile for the frontend to attempt barrier requests.
- * If a backend does not recognise BLKIF_OP_WRITE_BARRIER, it should *not*
- * create the "feature-barrier" node!
+ * All writes issued prior to a request with the BLKIF_OP_WRITE_BARRIER
+ * operation code ("barrier request") must be completed prior to the
+ * execution of the barrier request.  All writes issued after the barrier
+ * request must not execute until after the completion of the barrier request.
+ *
+ * Optional.  See "feature-barrier" XenBus node documentation above.
  */
 #define BLKIF_OP_WRITE_BARRIER     2
 /*
- * Recognised if "feature-flush-cache" is present in backend xenbus
- * info.  A flush will ask the underlying storage hardware to flush its
- * non-volatile caches as appropriate.  The "feature-flush-cache" node
- * contains a boolean indicating whether flush requests are likely to
- * succeed or fail.  Either way, a flush request may fail at any time
- * with BLKIF_RSP_EOPNOTSUPP if it is unsupported by the underlying
- * block-device hardware.  The boolean simply indicates whether or not it
- * is worthwhile for the frontend to attempt flushes.  If a backend does
- * not recognise BLKIF_OP_WRITE_FLUSH_CACHE, it should *not* create the
- * "feature-flush-cache" node!
+ * Commit any uncommitted contents of the backing device's volatile cache
+ * to stable storage.
+ *
+ * Optional.  See "feature-flush-cache" XenBus node documentation above.
  */
 #define BLKIF_OP_FLUSH_DISKCACHE   3
+/*
+ * Used in SLES sources for a device specific command packet
+ * contained within the request.  Reserved for that purpose.
+ */
+#define BLKIF_OP_RESERVED_1        4
+/*
+ * Indicate to the backend device that a region of storage is no longer in
+ * use, and may be discarded at any time without impact to the client.  If
+ * the BLKIF_DISCARD_SECURE flag is set on the request, all copies of the
+ * discarded region on the device must be rendered unrecoverable before the
+ * command returns.
+ *
+ * This operation is analogous to performing a trim (ATA) or unmap (SCSI)
+ * command on a native device.
+ *
+ * More information about trim/unmap operations can be found at:
+ * http://t13.org/Documents/UploadedDocuments/docs2008/
+ *     e07154r6-Data_Set_Management_Proposal_for_ATA-ACS2.doc
+ * http://www.seagate.com/staticfiles/support/disc/manuals/
+ *     Interface%20manuals/100293068c.pdf
+ *
+ * Optional.  See "feature-discard", "discard-alignment",
+ * "discard-granularity", and "discard-secure" in the XenBus node
+ * documentation above.
+ */
+#define BLKIF_OP_DISCARD           5
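+/*
+ * For example, discarding an 8 MiB extent that starts 1 GiB into the
+ * device takes a single BLKIF_OP_DISCARD request with
+ * sector_number = (1 << 30) / 512 and nr_sectors = (8 << 20) / 512,
+ * laid out as struct blkif_request_discard (defined later in this file).
+ */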
 
 /*
  * Maximum scatter/gather segments associated with a request header block.
+ * This is carefully chosen so that sizeof(blkif_ring_t) <= PAGE_SIZE.
+ * NB. This could be 12 if the ring indexes weren't stored in the same page.
  */
 #define BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK 11
 
@@ -92,6 +457,13 @@
  */
 #define BLKIF_MAX_SEGMENTS_PER_REQUEST 255
 
+/*
+ * NB. first_sect and last_sect in blkif_request_segment, as well as
+ * sector_number in blkif_request, are always expressed in 512-byte units.
+ * However they must be properly aligned to the real sector size of the
+ * physical disk, which is reported in the "sector-size" node in the backend
+ * xenbus info.  Also the xenbus "sectors" node is expressed in 512-byte units.
+ */
 struct blkif_request_segment {
     grant_ref_t gref;        /* reference to I/O buffer frame        */
     /* @first_sect: first sector in frame to transfer (inclusive).   */
    /* @last_sect: last sector in frame to transfer (inclusive).     */
    uint8_t     first_sect, last_sect;
 };
 typedef struct blkif_request_segment blkif_request_segment_t;
 
+/*
+ * Starting ring element for any I/O request.
+ *
+ * One or more segment blocks can be inserted into the request ring
+ * just after a blkif_request_t, allowing requests to operate on
+ * up to BLKIF_MAX_SEGMENTS_PER_REQUEST segments.
+ *
+ * BLKIF_SEGS_TO_BLOCKS() can be used on blkif_request.nr_segments
+ * to determine the number of contiguous ring entries associated
+ * with this request.
+ *
+ * Note:  Due to the way Xen request rings operate, the producer and
+ *        consumer indices of the ring must be incremented by the
+ *        BLKIF_SEGS_TO_BLOCKS() value of the associated request.
+ *        (e.g. a response to a 3 ring entry request must also consume
+ *        3 entries in the ring, even though only the first ring entry
+ *        in the response has any data.)
+ */
 struct blkif_request {
     uint8_t        operation;    /* BLKIF_OP_???                         */
     uint8_t        nr_segments;  /* number of segments                   */
     blkif_vdev_t   handle;       /* only for read/write requests         */
     uint64_t       id;           /* private guest value, echoed in resp  */
     blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
-    struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK];
+    blkif_request_segment_t seg[BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK];
 };
 typedef struct blkif_request blkif_request_t;
 
+/*
+ * A segment block is a ring request structure that contains only
+ * segment data.
+ *
+ * sizeof(struct blkif_segment_block) <= sizeof(struct blkif_request)
+ */
+struct blkif_segment_block {
+    blkif_request_segment_t seg[BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK];
+};
+typedef struct blkif_segment_block blkif_segment_block_t;
+
+/*
+ * Cast to this structure when blkif_request.operation == BLKIF_OP_DISCARD
+ * sizeof(struct blkif_request_discard) <= sizeof(struct blkif_request)
+ */
+struct blkif_request_discard {
+    uint8_t        operation;    /* BLKIF_OP_DISCARD                     */
+    uint8_t        flag;         /* BLKIF_DISCARD_SECURE or zero         */
+#define BLKIF_DISCARD_SECURE (1<<0) /* ignored if discard-secure=0       */
+    blkif_vdev_t   handle;       /* same as for read/write requests      */
+    uint64_t       id;           /* private guest value, echoed in resp  */
+    blkif_sector_t sector_number;/* start sector idx on disk             */
+    uint64_t       nr_sectors;   /* number of contiguous sectors to discard*/
+};
+typedef struct blkif_request_discard blkif_request_discard_t;
+
 struct blkif_response {
     uint64_t        id;          /* copied from request */
     uint8_t         operation;   /* copied from request */
@@ -130,24 +546,48 @@
 /*
  * Generate blkif ring structures and types.
  */
-
 DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response);
 
-#define BLKRING_GET_SG_REQUEST(_r, _idx) \
-    ((struct blkif_request_segment *)RING_GET_REQUEST(_r, _idx))
+/*
+ * Index to, and treat as a segment block, an entry in the ring.
+ */
+#define BLKRING_GET_SEG_BLOCK(_r, _idx)                                 \
+    (((blkif_segment_block_t *)RING_GET_REQUEST(_r, _idx))->seg)
+
+/*
+ * The number of ring request blocks required to handle an I/O
+ * request containing _segs segments.
+ */
+#define BLKIF_SEGS_TO_BLOCKS(_segs)                                     \
+    ((((_segs - BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK)                    \
+    + (BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK - 1))                       \
+    / BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK) + /*header_block*/1)
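+
+/*
+ * For example, with BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK == 11, a request
+ * with nr_segments == 25 uses the header block for its first 11 segments
+ * and segment blocks for the remaining 14, and both the producer and
+ * consumer ring indices must advance by the full BLKIF_SEGS_TO_BLOCKS(25)
+ * entry count.
+ */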
 
 #define VDISK_CDROM        0x1
 #define VDISK_REMOVABLE    0x2
 #define VDISK_READONLY     0x4
 
 /*
- * The number of ring request blocks required to handle an I/O
- * request containing _segs segments.
+ * Xen-defined major numbers for virtual disks.
  */
-#define BLKIF_SEGS_TO_BLOCKS(_segs) \
-    ((((_segs - BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK) \
-    + (BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK - 1)) \
-    / BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK) + /*header_block*/1)
+#define XEN_IDE0_MAJOR            3
+#define XEN_IDE1_MAJOR           22
+#define XEN_SCSI_DISK0_MAJOR      8
+#define XEN_SCSI_DISK1_MAJOR     65
+#define XEN_SCSI_DISK2_MAJOR     66
+#define XEN_SCSI_DISK3_MAJOR     67
+#define XEN_SCSI_DISK4_MAJOR     68
+#define XEN_SCSI_DISK5_MAJOR     69
+#define XEN_SCSI_DISK6_MAJOR     70
+#define XEN_SCSI_DISK7_MAJOR     71
+#define XEN_SCSI_DISK8_MAJOR    128
+#define XEN_SCSI_DISK9_MAJOR    129
+#define XEN_SCSI_DISK10_MAJOR   130
+#define XEN_SCSI_DISK11_MAJOR   131
+#define XEN_SCSI_DISK12_MAJOR   132
+#define XEN_SCSI_DISK13_MAJOR   133
+#define XEN_SCSI_DISK14_MAJOR   134
+#define XEN_SCSI_DISK15_MAJOR   135
 
 #endif /* __XEN_PUBLIC_IO_BLKIF_H__ */
diff -x .svn -ur sys/xen/xenbus/xenbusvar.h /usr/home/justing/perforce/SpectraBSD/head/sys/xen/xenbus/xenbusvar.h
--- sys/xen/xenbus/xenbusvar.h	2011-06-10 22:59:01.723658126 -0600
+++ /usr/home/justing/perforce/SpectraBSD/head/sys/xen/xenbus/xenbusvar.h	2012-01-31 16:41:51.486111080 -0700
@@ -104,6 +104,20 @@
 XenbusState xenbus_read_driver_state(const char *path);
 
 /**
+ * Return the state of the "other end" (peer) of a XenBus device.
+ *
+ * \param dev  The XenBus device whose peer to query.
+ *
+ * \return  The current state of the peer device or XenbusStateClosed if no
+ *          state can be read.
+ */
+static inline XenbusState
+xenbus_get_otherend_state(device_t dev)
+{
+	return (xenbus_read_driver_state(xenbus_get_otherend_path(dev)));
+}
+
+/**
  * Initialize and register a watch on the given path (client suplied storage).
  *
  * \param dev  The XenBus device requesting the watch service.
Only in sys/xen/xenstore: xenstore.c.orig