Date: Mon, 14 Sep 2015 19:37:52 +0000 (UTC) From: Colin Percival <cperciva@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-10@freebsd.org Subject: svn commit: r287802 - in stable/10: release/tools sys/dev/xen/blkfront sys/xen/interface/io Message-ID: <201509141937.t8EJbq5V034530@repo.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: cperciva Date: Mon Sep 14 19:37:51 2015 New Revision: 287802 URL: https://svnweb.freebsd.org/changeset/base/287802 Log: MFC r286062, r286063 Add support to blkfront for blkif indirect segment I/Os. Turn this support off by default in EC2 builds due to performance issues on some EC2 instance types. Modified: stable/10/release/tools/ec2.conf stable/10/sys/dev/xen/blkfront/blkfront.c stable/10/sys/dev/xen/blkfront/block.h stable/10/sys/xen/interface/io/blkif.h Modified: stable/10/release/tools/ec2.conf ============================================================================== --- stable/10/release/tools/ec2.conf Mon Sep 14 19:35:33 2015 (r287801) +++ stable/10/release/tools/ec2.conf Mon Sep 14 19:37:51 2015 (r287802) @@ -70,6 +70,11 @@ vm_extra_pre_umount() { # nodes, but apply the workaround just in case. echo 'hw.broken_txfifo="1"' >> ${DESTDIR}/boot/loader.conf + # Some EC2 instances suffer a significant (~40%) reduction in + # throughput when using blkif indirect segment I/Os. Disable this + # by default for now. + echo 'hw.xbd.xbd_enable_indirect="0"' >> ${DESTDIR}/boot/loader.conf + # The first time the AMI boots, the installed "first boot" scripts # should be allowed to run: # * ec2_configinit (download and process EC2 user-data) Modified: stable/10/sys/dev/xen/blkfront/blkfront.c ============================================================================== --- stable/10/sys/dev/xen/blkfront/blkfront.c Mon Sep 14 19:35:33 2015 (r287801) +++ stable/10/sys/dev/xen/blkfront/blkfront.c Mon Sep 14 19:37:51 2015 (r287802) @@ -84,6 +84,11 @@ static void xbd_startio(struct xbd_softc /*---------------------------- Global Static Data ----------------------------*/ static MALLOC_DEFINE(M_XENBLOCKFRONT, "xbd", "Xen Block Front driver data"); +static int xbd_enable_indirect = 1; +SYSCTL_NODE(_hw, OID_AUTO, xbd, CTLFLAG_RD, 0, "xbd driver parameters"); +SYSCTL_INT(_hw_xbd, OID_AUTO, xbd_enable_indirect, CTLFLAG_RDTUN, + &xbd_enable_indirect, 0, "Enable xbd indirect segments"); + /*---------------------------- Command Processing ----------------------------*/ static void xbd_freeze(struct xbd_softc *sc, xbd_flag_t xbd_flag) @@ -205,7 +210,6 @@ xbd_queue_cb(void *arg, bus_dma_segment_ { struct xbd_softc *sc; struct xbd_command *cm; - blkif_request_t *ring_req; int op; cm = arg; @@ -218,22 +222,47 @@ xbd_queue_cb(void *arg, bus_dma_segment_ return; } - KASSERT(nsegs <= BLKIF_MAX_SEGMENTS_PER_REQUEST, + KASSERT(nsegs <= sc->xbd_max_request_segments, ("Too many segments in a blkfront I/O")); - /* Fill out a communications ring structure. */ - ring_req = RING_GET_REQUEST(&sc->xbd_ring, sc->xbd_ring.req_prod_pvt); - sc->xbd_ring.req_prod_pvt++; - ring_req->id = cm->cm_id; - ring_req->operation = cm->cm_operation; - ring_req->sector_number = cm->cm_sector_number; - ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xbd_disk; - ring_req->nr_segments = nsegs; - cm->cm_nseg = nsegs; - xbd_mksegarray(segs, nsegs, &cm->cm_gref_head, - xenbus_get_otherend_id(sc->xbd_dev), - cm->cm_operation == BLKIF_OP_WRITE, - cm->cm_sg_refs, ring_req->seg); + if (nsegs <= BLKIF_MAX_SEGMENTS_PER_REQUEST) { + blkif_request_t *ring_req; + + /* Fill out a blkif_request_t structure. */ + ring_req = (blkif_request_t *) + RING_GET_REQUEST(&sc->xbd_ring, sc->xbd_ring.req_prod_pvt); + sc->xbd_ring.req_prod_pvt++; + ring_req->id = cm->cm_id; + ring_req->operation = cm->cm_operation; + ring_req->sector_number = cm->cm_sector_number; + ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xbd_disk; + ring_req->nr_segments = nsegs; + cm->cm_nseg = nsegs; + xbd_mksegarray(segs, nsegs, &cm->cm_gref_head, + xenbus_get_otherend_id(sc->xbd_dev), + cm->cm_operation == BLKIF_OP_WRITE, + cm->cm_sg_refs, ring_req->seg); + } else { + blkif_request_indirect_t *ring_req; + + /* Fill out a blkif_request_indirect_t structure. */ + ring_req = (blkif_request_indirect_t *) + RING_GET_REQUEST(&sc->xbd_ring, sc->xbd_ring.req_prod_pvt); + sc->xbd_ring.req_prod_pvt++; + ring_req->id = cm->cm_id; + ring_req->operation = BLKIF_OP_INDIRECT; + ring_req->indirect_op = cm->cm_operation; + ring_req->sector_number = cm->cm_sector_number; + ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xbd_disk; + ring_req->nr_segments = nsegs; + cm->cm_nseg = nsegs; + xbd_mksegarray(segs, nsegs, &cm->cm_gref_head, + xenbus_get_otherend_id(sc->xbd_dev), + cm->cm_operation == BLKIF_OP_WRITE, + cm->cm_sg_refs, cm->cm_indirectionpages); + memcpy(ring_req->indirect_grefs, &cm->cm_indirectionrefs, + sizeof(grant_ref_t) * sc->xbd_max_request_indirectpages); + } if (cm->cm_operation == BLKIF_OP_READ) op = BUS_DMASYNC_PREREAD; @@ -1015,6 +1044,16 @@ xbd_free(struct xbd_softc *sc) cm->cm_sg_refs = NULL; } + if (cm->cm_indirectionpages != NULL) { + gnttab_end_foreign_access_references( + sc->xbd_max_request_indirectpages, + &cm->cm_indirectionrefs[0]); + contigfree(cm->cm_indirectionpages, PAGE_SIZE * + sc->xbd_max_request_indirectpages, + M_XENBLOCKFRONT); + cm->cm_indirectionpages = NULL; + } + bus_dmamap_destroy(sc->xbd_io_dmat, cm->cm_map); } free(sc->xbd_shadow, M_XENBLOCKFRONT); @@ -1051,9 +1090,6 @@ xbd_initialize(struct xbd_softc *sc) */ max_ring_page_order = 0; sc->xbd_ring_pages = 1; - sc->xbd_max_request_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST; - sc->xbd_max_request_size = - XBD_SEGS_TO_SIZE(sc->xbd_max_request_segments); /* * Protocol negotiation. @@ -1167,7 +1203,7 @@ xbd_connect(struct xbd_softc *sc) unsigned long sectors, sector_size; unsigned int binfo; int err, feature_barrier, feature_flush; - int i; + int i, j; if (sc->xbd_state == XBD_STATE_CONNECTED || sc->xbd_state == XBD_STATE_SUSPENDED) @@ -1198,6 +1234,22 @@ xbd_connect(struct xbd_softc *sc) if (err == 0 && feature_flush != 0) sc->xbd_flags |= XBDF_FLUSH; + err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), + "feature-max-indirect-segments", "%" PRIu32, + &sc->xbd_max_request_segments, NULL); + if ((err != 0) || (xbd_enable_indirect == 0)) + sc->xbd_max_request_segments = 0; + if (sc->xbd_max_request_segments > XBD_MAX_INDIRECT_SEGMENTS) + sc->xbd_max_request_segments = XBD_MAX_INDIRECT_SEGMENTS; + if (sc->xbd_max_request_segments > XBD_SIZE_TO_SEGS(MAXPHYS)) + sc->xbd_max_request_segments = XBD_SIZE_TO_SEGS(MAXPHYS); + sc->xbd_max_request_indirectpages = + XBD_INDIRECT_SEGS_TO_PAGES(sc->xbd_max_request_segments); + if (sc->xbd_max_request_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST) + sc->xbd_max_request_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST; + sc->xbd_max_request_size = + XBD_SEGS_TO_SIZE(sc->xbd_max_request_segments); + /* Allocate datastructures based on negotiated values. */ err = bus_dma_tag_create( bus_get_dma_tag(sc->xbd_dev), /* parent */ @@ -1230,6 +1282,7 @@ xbd_connect(struct xbd_softc *sc) for (i = 0; i < sc->xbd_max_requests; i++) { struct xbd_command *cm; + void * indirectpages; cm = &sc->xbd_shadow[i]; cm->cm_sg_refs = malloc( @@ -1242,6 +1295,24 @@ xbd_connect(struct xbd_softc *sc) cm->cm_sc = sc; if (bus_dmamap_create(sc->xbd_io_dmat, 0, &cm->cm_map) != 0) break; + if (sc->xbd_max_request_indirectpages > 0) { + indirectpages = contigmalloc( + PAGE_SIZE * sc->xbd_max_request_indirectpages, + M_XENBLOCKFRONT, M_ZERO, 0, ~0, PAGE_SIZE, 0); + } else { + indirectpages = NULL; + } + for (j = 0; j < sc->xbd_max_request_indirectpages; j++) { + if (gnttab_grant_foreign_access( + xenbus_get_otherend_id(sc->xbd_dev), + (vtomach(indirectpages) >> PAGE_SHIFT) + j, + 1 /* grant read-only access */, + &cm->cm_indirectionrefs[j])) + break; + } + if (j < sc->xbd_max_request_indirectpages) + break; + cm->cm_indirectionpages = indirectpages; xbd_free_command(cm); } Modified: stable/10/sys/dev/xen/blkfront/block.h ============================================================================== --- stable/10/sys/dev/xen/blkfront/block.h Mon Sep 14 19:35:33 2015 (r287801) +++ stable/10/sys/dev/xen/blkfront/block.h Mon Sep 14 19:37:51 2015 (r287802) @@ -75,11 +75,25 @@ __CONST_RING_SIZE(blkif, PAGE_SIZE * XBD_MAX_RING_PAGES) /** - * The maximum mapped region size per request we will allow in a negotiated - * block-front/back communication channel. + * The maximum number of blkif segments which can be provided per indirect + * page in an indirect request. */ -#define XBD_MAX_REQUEST_SIZE \ - MIN(MAXPHYS, XBD_SEGS_TO_SIZE(BLKIF_MAX_SEGMENTS_PER_REQUEST)) +#define XBD_MAX_SEGMENTS_PER_PAGE \ + (PAGE_SIZE / sizeof(struct blkif_request_segment)) + +/** + * The maximum number of blkif segments which can be provided in an indirect + * request. + */ +#define XBD_MAX_INDIRECT_SEGMENTS \ + (BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST * XBD_MAX_SEGMENTS_PER_PAGE) + +/** + * Compute the number of indirect segment pages required for an I/O with the + * specified number of indirect segments. + */ +#define XBD_INDIRECT_SEGS_TO_PAGES(segs) \ + ((segs + XBD_MAX_SEGMENTS_PER_PAGE - 1) / XBD_MAX_SEGMENTS_PER_PAGE) typedef enum { XBDCF_Q_MASK = 0xFF, @@ -111,6 +125,8 @@ struct xbd_command { blkif_sector_t cm_sector_number; int cm_status; xbd_cbcf_t *cm_complete; + void *cm_indirectionpages; + grant_ref_t cm_indirectionrefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST]; }; typedef enum { @@ -165,6 +181,7 @@ struct xbd_softc { uint32_t xbd_max_requests; uint32_t xbd_max_request_segments; uint32_t xbd_max_request_size; + uint32_t xbd_max_request_indirectpages; grant_ref_t xbd_ring_ref[XBD_MAX_RING_PAGES]; blkif_front_ring_t xbd_ring; xen_intr_handle_t xen_intr_handle; Modified: stable/10/sys/xen/interface/io/blkif.h ============================================================================== --- stable/10/sys/xen/interface/io/blkif.h Mon Sep 14 19:35:33 2015 (r287801) +++ stable/10/sys/xen/interface/io/blkif.h Mon Sep 14 19:37:51 2015 (r287802) @@ -97,6 +97,28 @@ * * The type of the backing device/object. * + * + * direct-io-safe + * Values: 0/1 (boolean) + * Default Value: 0 + * + * The underlying storage is not affected by the direct IO memory + * lifetime bug. See: + * http://lists.xen.org/archives/html/xen-devel/2012-12/msg01154.html + * + * Therefore this option gives the backend permission to use + * O_DIRECT, notwithstanding that bug. + * + * That is, if this option is enabled, use of O_DIRECT is safe, + * in circumstances where we would normally have avoided it as a + * workaround for that bug. This option is not relevant for all + * backends, and even not necessarily supported for those for + * which it is relevant. A backend which knows that it is not + * affected by the bug can ignore this option. + * + * This option doesn't require a backend to use O_DIRECT, so it + * should not be used to try to control the caching behaviour. + * *--------------------------------- Features --------------------------------- * * feature-barrier @@ -126,6 +148,34 @@ * of this type may still be returned at any time with the * BLKIF_RSP_EOPNOTSUPP result code. * + * feature-persistent + * Values: 0/1 (boolean) + * Default Value: 0 + * Notes: 7 + * + * A value of "1" indicates that the backend can keep the grants used + * by the frontend driver mapped, so the same set of grants should be + * used in all transactions. The maximum number of grants the backend + * can map persistently depends on the implementation, but ideally it + * should be RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST. Using this + * feature the backend doesn't need to unmap each grant, preventing + * costly TLB flushes. The backend driver should only map grants + * persistently if the frontend supports it. If a backend driver chooses + * to use the persistent protocol when the frontend doesn't support it, + * it will probably hit the maximum number of persistently mapped grants + * (due to the fact that the frontend won't be reusing the same grants), + * and fall back to non-persistent mode. Backend implementations may + * shrink or expand the number of persistently mapped grants without + * notifying the frontend depending on memory constraints (this might + * cause a performance degradation). + * + * If a backend driver wants to limit the maximum number of persistently + * mapped grants to a value less than RING_SIZE * + * BLKIF_MAX_SEGMENTS_PER_REQUEST a LRU strategy should be used to + * discard the grants that are less commonly used. Using a LRU in the + * backend driver paired with a LIFO queue in the frontend will + * allow us to have better performance in this scenario. + * *----------------------- Request Transport Parameters ------------------------ * * max-ring-page-order @@ -147,6 +197,16 @@ * *------------------------- Backend Device Properties ------------------------- * + * discard-enable + * Values: 0/1 (boolean) + * Default Value: 1 + * + * This optional property, set by the toolstack, instructs the backend + * to offer discard to the frontend. If the property is missing the + * backend should offer discard if the backing storage actually supports + * it. This optional property, set by the toolstack, requests that the + * backend offer, or not offer, discard to the frontend. + * * discard-alignment * Values: <uint32_t> * Default Value: 0 @@ -166,6 +226,7 @@ * discard-secure * Values: 0/1 (boolean) * Default Value: 0 + * Notes: 10 * * A value of "1" indicates that the backend can process BLKIF_OP_DISCARD * requests with the BLKIF_DISCARD_SECURE flag set. @@ -180,13 +241,17 @@ * sector-size * Values: <uint32_t> * - * The size, in bytes, of the individually addressible data blocks - * on the backend device. + * The logical sector size, in bytes, of the backend device. + * + * physical-sector-size + * Values: <uint32_t> + * + * The physical sector size, in bytes, of the backend device. * * sectors * Values: <uint64_t> * - * The size of the backend device, expressed in units of its native + * The size of the backend device, expressed in units of its logical * sector size ("sector-size"). * ***************************************************************************** @@ -243,6 +308,27 @@ * The size of the frontend allocated request ring buffer in units of * machine pages. The value must be a power of 2. * + * feature-persistent + * Values: 0/1 (boolean) + * Default Value: 0 + * Notes: 7, 8, 9 + * + * A value of "1" indicates that the frontend will reuse the same grants + * for all transactions, allowing the backend to map them with write + * access (even when it should be read-only). If the frontend hits the + * maximum number of allowed persistently mapped grants, it can fallback + * to non persistent mode. This will cause a performance degradation, + * since the the backend driver will still try to map those grants + * persistently. Since the persistent grants protocol is compatible with + * the previous protocol, a frontend driver can choose to work in + * persistent mode even when the backend doesn't support it. + * + * It is recommended that the frontend driver stores the persistently + * mapped grants in a LIFO queue, so a subset of all persistently mapped + * grants gets used commonly. This is done in case the backend driver + * decides to limit the maximum number of persistently mapped grants + * to a value less than RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST. + * *------------------------- Virtual Device Properties ------------------------- * * device-type @@ -262,17 +348,23 @@ * ----- * (1) Multi-page ring buffer scheme first developed in the Citrix XenServer * PV drivers. - * (2) Multi-page ring buffer scheme first used in some Red Hat distributions + * (2) Multi-page ring buffer scheme first used in some RedHat distributions * including a distribution deployed on certain nodes of the Amazon * EC2 cluster. * (3) Support for multi-page ring buffers was implemented independently, - * in slightly different forms, by both Citrix and Red Hat/Amazon. + * in slightly different forms, by both Citrix and RedHat/Amazon. * For full interoperability, block front and backends should publish * identical ring parameters, adjusted for unit differences, to the * XenStore nodes used in both schemes. - * (4) Devices that support discard functionality may internally allocate - * space (discardable extents) in units that are larger than the - * exported logical block size. + * (4) Devices that support discard functionality may internally allocate space + * (discardable extents) in units that are larger than the exported logical + * block size. If the backing device has such discardable extents the + * backend should provide both discard-granularity and discard-alignment. + * Providing just one of the two may be considered an error by the frontend. + * Backends supporting discard should include discard-granularity and + * discard-alignment even if it supports discarding individual sectors. + * Frontends should assume discard-alignment == 0 and discard-granularity + * == sector size if these keys are missing. * (5) The discard-alignment parameter allows a physical device to be * partitioned into virtual devices that do not necessarily begin or * end on a discardable extent boundary. @@ -280,6 +372,19 @@ * 'ring-ref' is used to communicate the grant reference for this * page to the backend. When using a multi-page ring, the 'ring-ref' * node is not created. Instead 'ring-ref0' - 'ring-refN' are used. + * (7) When using persistent grants data has to be copied from/to the page + * where the grant is currently mapped. The overhead of doing this copy + * however doesn't suppress the speed improvement of not having to unmap + * the grants. + * (8) The frontend driver has to allow the backend driver to map all grants + * with write access, even when they should be mapped read-only, since + * further requests may reuse these grants and require write permissions. + * (9) Linux implementation doesn't have a limit on the maximum number of + * grants that can be persistently mapped in the frontend driver, but + * due to the frontent driver implementation it should never be bigger + * than RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST. + *(10) The discard-secure property may be present and will be set to 1 if the + * backing device supports secure discard. */ /* @@ -404,6 +509,30 @@ #define BLKIF_OP_DISCARD 5 /* + * Recognized if "feature-max-indirect-segments" in present in the backend + * xenbus info. The "feature-max-indirect-segments" node contains the maximum + * number of segments allowed by the backend per request. If the node is + * present, the frontend might use blkif_request_indirect structs in order to + * issue requests with more than BLKIF_MAX_SEGMENTS_PER_REQUEST (11). The + * maximum number of indirect segments is fixed by the backend, but the + * frontend can issue requests with any number of indirect segments as long as + * it's less than the number provided by the backend. The indirect_grefs field + * in blkif_request_indirect should be filled by the frontend with the + * grant references of the pages that are holding the indirect segments. + * These pages are filled with an array of blkif_request_segment that hold the + * information about the segments. The number of indirect pages to use is + * determined by the number of segments an indirect request contains. Every + * indirect page can contain a maximum of + * (PAGE_SIZE / sizeof(struct blkif_request_segment)) segments, so to + * calculate the number of indirect pages to use we have to do + * ceil(indirect_segments / (PAGE_SIZE / sizeof(struct blkif_request_segment))). + * + * If a backend does not recognize BLKIF_OP_INDIRECT, it should *not* + * create the "feature-max-indirect-segments" node! + */ +#define BLKIF_OP_INDIRECT 6 + +/* * Maximum scatter/gather segments per request. * This is carefully chosen so that sizeof(blkif_ring_t) <= PAGE_SIZE. * NB. This could be 12 if the ring indexes weren't stored in the same page. @@ -411,11 +540,17 @@ #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11 /* + * Maximum number of indirect pages to use per request. + */ +#define BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST 8 + +/* * NB. first_sect and last_sect in blkif_request_segment, as well as * sector_number in blkif_request, are always expressed in 512-byte units. * However they must be properly aligned to the real sector size of the - * physical disk, which is reported in the "sector-size" node in the backend - * xenbus info. Also the xenbus "sectors" node is expressed in 512-byte units. + * physical disk, which is reported in the "physical-sector-size" node in + * the backend xenbus info. Also the xenbus "sectors" node is expressed in + * 512-byte units. */ struct blkif_request_segment { grant_ref_t gref; /* reference to I/O buffer frame */ @@ -453,6 +588,20 @@ struct blkif_request_discard { }; typedef struct blkif_request_discard blkif_request_discard_t; +struct blkif_request_indirect { + uint8_t operation; /* BLKIF_OP_INDIRECT */ + uint8_t indirect_op; /* BLKIF_OP_{READ/WRITE} */ + uint16_t nr_segments; /* number of segments */ + uint64_t id; /* private guest value, echoed in resp */ + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ + blkif_vdev_t handle; /* same as for read/write requests */ + grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST]; +#ifdef __i386__ + uint64_t pad; /* Make it 64 byte aligned on i386 */ +#endif +}; +typedef struct blkif_request_indirect blkif_request_indirect_t; + struct blkif_response { uint64_t id; /* copied from request */ uint8_t operation; /* copied from request */ @@ -484,7 +633,7 @@ DEFINE_RING_TYPES(blkif, struct blkif_re /* * Local variables: * mode: C - * c-set-style: "BSD" + * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201509141937.t8EJbq5V034530>