From owner-svn-src-head@FreeBSD.ORG Mon Jun 13 20:36:30 2011
From: "Justin T. Gibbs" <gibbs@FreeBSD.org>
Date: Mon, 13 Jun 2011 20:36:30 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-all@freebsd.org,
    svn-src-head@freebsd.org
Message-Id: <201106132036.p5DKaUXY097699@svn.freebsd.org>
Subject: svn commit: r223059 - head/sys/dev/xen/blkback

Author: gibbs
Date: Mon Jun 13 20:36:29 2011
New Revision: 223059
URL: http://svn.freebsd.org/changeset/base/223059

Log:
  Several enhancements to the Xen block back driver.

  sys/dev/xen/blkback/blkback.c:
	o Implement front-end request coalescing.  This greatly improves
	  the performance of front-end clients that are unaware of the
	  dynamic request-size/number-of-requests negotiation available
	  in the FreeBSD backend driver.  This required a large
	  restructuring in how this driver records in-flight transactions
	  and how those transactions are mapped into kernel KVA.  For
	  example, the driver now includes a mini "KVA manager" that
	  allocates ranges of contiguous KVA to batches of requests that
	  are physically contiguous in the backing store, so that a
	  single bio or UIO segment can be used to represent the I/O.
	  (A simplified sketch of the coalescing test follows this log
	  message.)

	o Refuse to open any backend files or devices if the system has
	  yet to mount root.  This avoids a panic.

	o Properly handle "onlined" devices.  An "onlined" backend device
	  stays attached to its backing store across front-end
	  disconnections.  This feature is intended to reduce latency
	  when a front-end hands off to another driver (e.g. a PV-aware
	  bootloader to the OS kernel) or during a VM reboot.

	o Harden the driver against a pathological/buggy front-end by
	  carefully vetting front-end XenStore data, such as the
	  front-end state.

	o Add sysctls that report the negotiated number of segments per
	  request and the number of requests that can be concurrently in
	  flight.

  Submitted by:	kdm
  Reviewed by:	gibbs
  Sponsored by:	Spectra Logic Corporation
  MFC after:	1 week

Modified:
  head/sys/dev/xen/blkback/blkback.c
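[Editorial sketch of the coalescing test described in the log above.  The
struct fields mirror names that appear in the diff below (operation,
next_contig_sector, nr_segments, max_reqlist_segments), but the types and
function names here are invented for illustration; this is a greatly
simplified model, not the driver's actual logic.]

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Simplified stand-ins for the driver's types (hypothetical). */
    typedef uint64_t sector_t;

    struct reqlist_model {
            int      operation;           /* BLKIF_OP_* code for the list. */
            sector_t next_contig_sector;  /* Next sector that would be
                                           * contiguous with queued data. */
            int      nr_segments;         /* Segments accumulated so far. */
    };

    struct req_model {
            int      operation;           /* BLKIF_OP_* of the new request. */
            sector_t sector_number;       /* Its starting sector. */
            int      nr_segments;         /* Segments it carries. */
            int      nr_512b_sectors;     /* Its length in 512b sectors. */
    };

    /*
     * Decide whether 'req' can be coalesced onto 'rl'.  Three conditions
     * must hold: same operation type, physically contiguous with the data
     * already queued, and room left in the per-list segment budget.
     */
    static bool
    can_coalesce(const struct reqlist_model *rl, const struct req_model *req,
        int max_reqlist_segments)
    {
            if (req->operation != rl->operation)
                    return (false);
            if (req->sector_number != rl->next_contig_sector)
                    return (false);
            if (rl->nr_segments + req->nr_segments > max_reqlist_segments)
                    return (false);
            return (true);
    }

    /* On success the caller extends the list's bookkeeping like so: */
    static void
    coalesce(struct reqlist_model *rl, const struct req_model *req)
    {
            rl->nr_segments += req->nr_segments;
            rl->next_contig_sector = req->sector_number + req->nr_512b_sectors;
    }

    int
    main(void)
    {
            struct reqlist_model rl  = { 1, 128, 4 };  /* op 1, next sector 128 */
            struct req_model     req = { 1, 128, 2, 16 };

            if (can_coalesce(&rl, &req, 128))
                    coalesce(&rl, &req);
            printf("segments=%d next=%llu\n", rl.nr_segments,
                (unsigned long long)rl.next_contig_sector);
            return (0);
    }

A request that fails the test simply starts a new list; the driver's
no_coalesce_reqs knob (visible in the diff) forces that unconditionally.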
Modified: head/sys/dev/xen/blkback/blkback.c
==============================================================================
--- head/sys/dev/xen/blkback/blkback.c	Mon Jun 13 20:34:12 2011	(r223058)
+++ head/sys/dev/xen/blkback/blkback.c	Mon Jun 13 20:36:29 2011	(r223059)
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2009-2010 Spectra Logic Corporation
+ * Copyright (c) 2009-2011 Spectra Logic Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -61,6 +61,8 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include
+#include
 
 #include
 
@@ -153,9 +155,19 @@ MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "X
 #define	XBB_MAX_RING_PAGES						    \
 	BLKIF_RING_PAGES(BLKIF_SEGS_TO_BLOCKS(XBB_MAX_SEGMENTS_PER_REQUEST) \
 		       * XBB_MAX_REQUESTS)
+/**
+ * The maximum number of ring pages that we can allow per request list.
+ * We limit this to the maximum number of segments per request, because
+ * that is already a reasonable number of segments to aggregate.  This
+ * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST,
+ * because that would leave situations where we can't dispatch even one
+ * large request.
+ */
+#define	XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST
 
 /*--------------------------- Forward Declarations ---------------------------*/
 struct xbb_softc;
+struct xbb_xen_req;
 
 static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt,
 			      ...) __attribute__((format(printf, 3, 4)));
@@ -163,16 +175,15 @@ static int  xbb_shutdown(struct xbb_soft
 static int  xbb_detach(device_t dev);
 
 /*------------------------------ Data Structures -----------------------------*/
-/**
- * \brief Object tracking an in-flight I/O from a Xen VBD consumer.
- */
-struct xbb_xen_req {
-	/**
-	 * Linked list links used to aggregate idle requests in the
-	 * request free pool (xbb->request_free_slist).
-	 */
-	SLIST_ENTRY(xbb_xen_req) links;
+STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req);
+
+typedef enum {
+	XBB_REQLIST_NONE	= 0x00,
+	XBB_REQLIST_MAPPED	= 0x01
+} xbb_reqlist_flags;
+
+struct xbb_xen_reqlist {
 	/**
 	 * Back reference to the parent block back instance for this
 	 * request.  Used during bio_done handling.
@@ -180,17 +191,71 @@ struct xbb_xen_req {
 	struct xbb_softc	*xbb;
 
 	/**
-	 * The remote domain's identifier for this I/O request.
+	 * BLKIF_OP code for this request.
+	 */
+	int			 operation;
+
+	/**
+	 * Set to BLKIF_RSP_* to indicate request status.
+	 *
+	 * This field allows an error status to be recorded even if the
+	 * delivery of this status must be deferred.  Deferred reporting
+	 * is necessary, for example, when an error is detected during
+	 * completion processing of one bio when other bios for this
+	 * request are still outstanding.
+	 */
+	int			 status;
+
+	/**
+	 * Number of 512 byte sectors not transferred.
+	 */
+	int			 residual_512b_sectors;
+
+	/**
+	 * Starting sector number of the first request in the list.
+	 */
+	off_t			 starting_sector_number;
+
+	/**
+	 * If we're going to coalesce, the next contiguous sector would be
+	 * this one.
+	 */
+	off_t			 next_contig_sector;
+
+	/**
+	 * Number of child requests in the list.
 	 */
-	uint64_t		  id;
+	int			 num_children;
+
+	/**
+	 * Number of I/O requests dispatched to the backend.
+	 */
+	int			 pendcnt;
+
+	/**
+	 * Total number of segments for requests in the list.
+	 */
+	int			 nr_segments;
+
+	/**
+	 * Flags for this particular request list.
+	 */
+	xbb_reqlist_flags	 flags;
 
 	/**
 	 * Kernel virtual address space reserved for this request
-	 * structure and used to map the remote domain's pages for
+	 * list structure and used to map the remote domain's pages for
	 * this I/O, into our domain's address space.
 	 */
 	uint8_t			*kva;
 
+	/**
+	 * Base, pseudo-physical address, corresponding to the start
+	 * of this request's kva region.
+	 */
+	uint64_t		 gnt_base;
+
+
 #ifdef XBB_USE_BOUNCE_BUFFERS
 	/**
 	 * Pre-allocated domain local memory used to proxy remote
@@ -200,53 +265,91 @@ struct xbb_xen_req {
 #endif
 
 	/**
-	 * Base, pseudo-physical address, corresponding to the start
-	 * of this request's kva region.
+	 * Array of grant handles (one per page) used to map this request.
 	 */
-	uint64_t		  gnt_base;
+	grant_handle_t		*gnt_handles;
+
+	/**
+	 * Device statistics request ordering type (ordered or simple).
+	 */
+	devstat_tag_type	 ds_tag_type;
+
+	/**
+	 * Device statistics request type (read, write, no_data).
+	 */
+	devstat_trans_flags	 ds_trans_type;
+
+	/**
+	 * The start time for this request.
+	 */
+	struct bintime		 ds_t0;
+
+	/**
+	 * Linked list of contiguous requests with the same operation type.
+	 */
+	struct xbb_xen_req_list	 contig_req_list;
+
+	/**
+	 * Linked list links used to aggregate idle requests in the
+	 * request list free pool (xbb->reqlist_free_stailq) and pending
+	 * requests waiting for execution (xbb->reqlist_pending_stailq).
+	 */
+	STAILQ_ENTRY(xbb_xen_reqlist) links;
+};
+
+STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist);
+
+/**
+ * \brief Object tracking an in-flight I/O from a Xen VBD consumer.
+ */
+struct xbb_xen_req {
+	/**
+	 * Linked list links used to aggregate requests into a reqlist
+	 * and to store them in the request free pool.
+	 */
+	STAILQ_ENTRY(xbb_xen_req) links;
+
+	/**
+	 * The remote domain's identifier for this I/O request.
+	 */
+	uint64_t		 id;
 
 	/**
 	 * The number of pages currently mapped for this request.
 	 */
-	int			  nr_pages;
+	int			 nr_pages;
 
 	/**
 	 * The number of 512 byte sectors comprising this request.
 	 */
-	int			  nr_512b_sectors;
+	int			 nr_512b_sectors;
 
 	/**
 	 * The number of struct bio requests still outstanding for this
 	 * request on the backend device.  This field is only used for
 	 * device (rather than file) backed I/O.
 	 */
-	int			  pendcnt;
+	int			 pendcnt;
 
 	/**
 	 * BLKIF_OP code for this request.
 	 */
-	int			  operation;
+	int			 operation;
 
 	/**
-	 * BLKIF_RSP status code for this request.
-	 *
-	 * This field allows an error status to be recorded even if the
-	 * delivery of this status must be deferred.  Deferred reporting
-	 * is necessary, for example, when an error is detected during
-	 * completion processing of one bio when other bios for this
-	 * request are still outstanding.
+	 * Storage used for non-native ring requests.
 	 */
-	int			  status;
+	blkif_request_t		 ring_req_storage;
 
 	/**
-	 * Device statistics request ordering type (ordered or simple).
+	 * Pointer to the Xen request in the ring.
 	 */
-	devstat_tag_type	  ds_tag_type;
+	blkif_request_t		*ring_req;
 
 	/**
-	 * Device statistics request type (read, write, no_data).
+	 * Consumer index for this request.
 	 */
-	devstat_trans_flags	  ds_trans_type;
+	RING_IDX		 req_ring_idx;
 
 	/**
 	 * The start time for this request.
@@ -254,9 +357,9 @@ struct xbb_xen_req {
 	struct bintime		 ds_t0;
 
 	/**
-	 * Array of grant handles (one per page) used to map this request.
+	 * Pointer back to our parent request list.
 	 */
-	grant_handle_t		*gnt_handles;
+	struct xbb_xen_reqlist	*reqlist;
 };
 SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req);
 
@@ -321,7 +424,10 @@ typedef enum
 	XBBF_RESOURCE_SHORTAGE = 0x04,
 
 	/** Connection teardown in progress. */
-	XBBF_SHUTDOWN          = 0x08
+	XBBF_SHUTDOWN          = 0x08,
+
+	/** A thread is already performing shutdown processing. */
+	XBBF_IN_SHUTDOWN       = 0x10
 } xbb_flag_t;
 
 /** Backend device type. */
@@ -399,7 +505,7 @@ struct xbb_file_data {
 	 * Only a single file based request is outstanding per-xbb instance,
 	 * so we only need one of these.
 	 */
-	struct iovec	xiovecs[XBB_MAX_SEGMENTS_PER_REQUEST];
+	struct iovec	xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
 
 #ifdef XBB_USE_BOUNCE_BUFFERS
 	/**
@@ -411,7 +517,7 @@ struct xbb_file_data {
 	 * bounce-out the read data.  This array serves as the temporary
 	 * storage for this saved data.
 	 */
-	struct iovec	saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQUEST];
+	struct iovec	saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
 
 	/**
 	 * \brief Array of memoized bounce buffer kva offsets used
@@ -422,7 +528,7 @@ struct xbb_file_data {
 	 * the request sg elements is unavoidable.  We memoize the computed
 	 * bounce address here to reduce the cost of the second walk.
 	 */
-	void		*xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQUEST];
+	void		*xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST];
 #endif /* XBB_USE_BOUNCE_BUFFERS */
 };
 
@@ -437,9 +543,9 @@ union xbb_backend_data {
 /**
  * Function signature of backend specific I/O handlers.
  */
-typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb, blkif_request_t *ring_req,
-			      struct xbb_xen_req *req, int nseg,
-			      int operation, int flags);
+typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb,
+			      struct xbb_xen_reqlist *reqlist, int operation,
+			      int flags);
 
 /**
  * Per-instance configuration data.
@@ -467,14 +573,23 @@ struct xbb_softc {
 	xbb_dispatch_t		  dispatch_io;
 
 	/** The number of requests outstanding on the backend device/file. */
-	u_int			  active_request_count;
+	int			  active_request_count;
 
 	/** Free pool of request tracking structures. */
-	struct xbb_xen_req_slist  request_free_slist;
+	struct xbb_xen_req_list   request_free_stailq;
 
 	/** Array, sized at connection time, of request tracking structures. */
 	struct xbb_xen_req	 *requests;
 
+	/** Free pool of request list structures. */
+	struct xbb_xen_reqlist_list reqlist_free_stailq;
+
+	/** List of pending request lists awaiting execution. */
+	struct xbb_xen_reqlist_list reqlist_pending_stailq;
+
+	/** Array, sized at connection time, of request list structures. */
+	struct xbb_xen_reqlist	 *request_lists;
+
 	/**
 	 * Global pool of kva used for mapping remote domain ring
 	 * and I/O transaction data.
@@ -487,6 +602,15 @@ struct xbb_softc {
 	/** The size of the global kva pool. */
 	int			  kva_size;
 
+	/** The size of the KVA area used for request lists. */
+	int			  reqlist_kva_size;
+
+	/** The number of pages of KVA used for request lists */
+	int			  reqlist_kva_pages;
+
+	/** Bitmap of free KVA pages */
+	bitstr_t		 *kva_free;
+
 	/**
 	 * \brief Cached value of the front-end's domain id.
 	 *
@@ -508,12 +632,12 @@ struct xbb_softc {
 	int			  abi;
 
 	/**
-	 * \brief The maximum number of requests allowed to be in
-	 *        flight at a time.
+	 * \brief The maximum number of requests and request lists allowed
+	 *        to be in flight at a time.
 	 *
 	 * This value is negotiated via the XenStore.
 	 */
-	uint32_t		  max_requests;
+	u_int			  max_requests;
 
 	/**
 	 * \brief The maximum number of segments (1 page per segment)
@@ -521,7 +645,15 @@ struct xbb_softc {
 	 *
 	 * This value is negotiated via the XenStore.
 	 */
-	uint32_t		  max_request_segments;
+	u_int			  max_request_segments;
+
+	/**
+	 * \brief Maximum number of segments per request list.
+	 *
+	 * This value is derived from and will generally be larger than
+	 * max_request_segments.
+	 */
+	u_int			  max_reqlist_segments;
 
 	/**
 	 * The maximum size of any request to this back-end
@@ -529,7 +661,13 @@ struct xbb_softc {
 	 * device.
 	 *
 	 * This value is negotiated via the XenStore.
 	 */
-	uint32_t		  max_request_size;
+	u_int			  max_request_size;
+
+	/**
+	 * The maximum size of any request list.  This is derived directly
+	 * from max_reqlist_segments.
+	 */
+	u_int			  max_reqlist_size;
 
 	/** Various configuration and state bit flags. */
 	xbb_flag_t		  flags;
@@ -574,6 +712,7 @@ struct xbb_softc {
 	struct vnode		 *vn;
 	union xbb_backend_data	  backend;
+
 	/** The native sector size of the backend. */
 	u_int			  sector_size;
@@ -598,7 +737,14 @@ struct xbb_softc {
 	 *
 	 * Ring processing is serialized so we only need one of these.
 	 */
-	struct xbb_sg		  xbb_sgs[XBB_MAX_SEGMENTS_PER_REQUEST];
+	struct xbb_sg		  xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST];
+
+	/**
+	 * Temporary grant table map used in xbb_dispatch_io().  When
+	 * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the
+	 * stack could cause a stack overflow.
+	 */
+	struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQLIST];
 
 	/** Mutex protecting per-instance data. */
 	struct mtx		  lock;
@@ -614,8 +760,51 @@ struct xbb_softc {
 	int			  pseudo_phys_res_id;
 #endif
 
-	/** I/O statistics. */
+	/**
+	 * I/O statistics from BlockBack dispatch down.  These are
+	 * coalesced requests, and we start them right before execution.
+	 */
 	struct devstat		 *xbb_stats;
+
+	/**
+	 * I/O statistics coming into BlockBack.  These are the requests as
+	 * we get them from BlockFront.  They are started as soon as we
+	 * receive a request, and completed when the I/O is complete.
+	 */
+	struct devstat		 *xbb_stats_in;
+
+	/** Disable sending flush to the backend */
+	int			  disable_flush;
+
+	/** Send a real flush for every N flush requests */
+	int			  flush_interval;
+
+	/** Count of flush requests in the interval */
+	int			  flush_count;
+
+	/** Don't coalesce requests if this is set */
+	int			  no_coalesce_reqs;
+
+	/** Number of requests we have received */
+	uint64_t		  reqs_received;
+
+	/** Number of requests we have completed */
+	uint64_t		  reqs_completed;
+
+	/** How many forced dispatches (i.e. without coalescing) have happened */
+	uint64_t		  forced_dispatch;
+
+	/** How many normal dispatches have happened */
+	uint64_t		  normal_dispatch;
+
+	/** How many total dispatches have happened */
+	uint64_t		  total_dispatch;
+
+	/** How many times we have run out of KVA */
+	uint64_t		  kva_shortages;
+
+	/** How many times we have run out of request structures */
+	uint64_t		  request_shortages;
 };
 
 /*---------------------------- Request Processing ----------------------------*/
@@ -633,21 +822,14 @@ xbb_get_req(struct xbb_softc *xbb)
 	struct xbb_xen_req *req;
 
 	req = NULL;
-	mtx_lock(&xbb->lock);
 
-	/*
-	 * Do not allow new requests to be allocated while we
-	 * are shutting down.
-	 */
-	if ((xbb->flags & XBBF_SHUTDOWN) == 0) {
-		if ((req = SLIST_FIRST(&xbb->request_free_slist)) != NULL) {
-			SLIST_REMOVE_HEAD(&xbb->request_free_slist, links);
-			xbb->active_request_count++;
-		} else {
-			xbb->flags |= XBBF_RESOURCE_SHORTAGE;
-		}
+	mtx_assert(&xbb->lock, MA_OWNED);
+
+	if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) {
+		STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links);
+		xbb->active_request_count++;
 	}
-	mtx_unlock(&xbb->lock);
+
 	return (req);
 }
 
@@ -660,34 +842,40 @@ xbb_get_req(struct xbb_softc *xbb)
 static inline void
 xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req)
 {
-	int wake_thread;
+	mtx_assert(&xbb->lock, MA_OWNED);
 
-	mtx_lock(&xbb->lock);
-	wake_thread = xbb->flags & XBBF_RESOURCE_SHORTAGE;
-	xbb->flags &= ~XBBF_RESOURCE_SHORTAGE;
-	SLIST_INSERT_HEAD(&xbb->request_free_slist, req, links);
+	STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links);
 	xbb->active_request_count--;
 
-	if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
-		/*
-		 * Shutdown is in progress.  See if we can
-		 * progress further now that one more request
-		 * has completed and been returned to the
-		 * free pool.
-		 */
-		xbb_shutdown(xbb);
-	}
-	mtx_unlock(&xbb->lock);
+	KASSERT(xbb->active_request_count >= 0,
+		("xbb_release_req: negative active count"));
+}
 
-	if (wake_thread != 0)
-		taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
+/**
+ * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool.
+ *
+ * \param xbb	    Per-instance xbb configuration structure.
+ * \param req_list  The list of requests to free.
+ * \param nreqs	    The number of items in the list.
+ */
+static inline void
+xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list,
+		 int nreqs)
+{
+	mtx_assert(&xbb->lock, MA_OWNED);
+
+	STAILQ_CONCAT(&xbb->request_free_stailq, req_list);
+	xbb->active_request_count -= nreqs;
+
+	KASSERT(xbb->active_request_count >= 0,
+		("xbb_release_reqs: negative active count"));
 }
 
 /**
  * Given a page index and 512b sector offset within that page,
  * calculate an offset into a request's kva region.
  *
- * \param req     The request structure whose kva region will be accessed.
+ * \param reqlist The request structure whose kva region will be accessed.
  * \param pagenr  The page index used to compute the kva offset.
  * \param sector  The 512b sector index used to compute the page relative
  *                kva offset.
@@ -695,9 +883,9 @@ xbb_release_req(struct xbb_softc *xbb, s
  * \return  The computed global KVA offset.
  */
static inline uint8_t *
-xbb_req_vaddr(struct xbb_xen_req *req, int pagenr, int sector)
+xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
 {
-	return (req->kva + (PAGE_SIZE * pagenr) + (sector << 9));
+	return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9));
 }
 
 #ifdef XBB_USE_BOUNCE_BUFFERS
@@ -705,7 +893,7 @@ xbb_req_vaddr(struct xbb_xen_req *req, i
  * Given a page index and 512b sector offset within that page,
  * calculate an offset into a request's local bounce memory region.
  *
- * \param req     The request structure whose bounce region will be accessed.
+ * \param reqlist The request structure whose bounce region will be accessed.
  * \param pagenr  The page index used to compute the bounce offset.
  * \param sector  The 512b sector index used to compute the page relative
  *                bounce offset.
@@ -713,9 +901,9 @@ xbb_req_vaddr(struct xbb_xen_req *req, i
  * \return  The computed global bounce buffer address.
  */
static inline uint8_t *
-xbb_req_bounce_addr(struct xbb_xen_req *req, int pagenr, int sector)
+xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
 {
-	return (req->bounce + (PAGE_SIZE * pagenr) + (sector << 9));
+	return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9));
 }
 #endif
 
@@ -724,7 +912,7 @@ xbb_req_bounce_addr(struct xbb_xen_req *
  * calculate an offset into the request's memory region that the
  * underlying backend device/file should use for I/O.
  *
- * \param req     The request structure whose I/O region will be accessed.
+ * \param reqlist The request structure whose I/O region will be accessed.
  * \param pagenr  The page index used to compute the I/O offset.
  * \param sector  The 512b sector index used to compute the page relative
  *                I/O offset.
@@ -736,12 +924,12 @@ xbb_req_bounce_addr(struct xbb_xen_req *
  *          this request.
  */
static inline uint8_t *
-xbb_req_ioaddr(struct xbb_xen_req *req, int pagenr, int sector)
+xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
 {
 #ifdef XBB_USE_BOUNCE_BUFFERS
-	return (xbb_req_bounce_addr(req, pagenr, sector));
+	return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector));
 #else
-	return (xbb_req_vaddr(req, pagenr, sector));
+	return (xbb_reqlist_vaddr(reqlist, pagenr, sector));
 #endif
 }
 
@@ -750,7 +938,7 @@ xbb_req_ioaddr(struct xbb_xen_req *req,
  * an offset into the local pseudo-physical address space used to map a
  * front-end's request data into a request.
  *
- * \param req     The request structure whose pseudo-physical region
+ * \param reqlist The request list structure whose pseudo-physical region
  *                will be accessed.
  * \param pagenr  The page index used to compute the pseudo-physical offset.
  * \param sector  The 512b sector index used to compute the page relative
@@ -763,10 +951,126 @@ xbb_req_ioaddr(struct xbb_xen_req *req,
  *          this request.
  */
static inline uintptr_t
-xbb_req_gntaddr(struct xbb_xen_req *req, int pagenr, int sector)
+xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
+{
+	struct xbb_softc *xbb;
+
+	xbb = reqlist->xbb;
+
+	return ((uintptr_t)(xbb->gnt_base_addr +
+	    (uintptr_t)(reqlist->kva - xbb->kva) +
+	    (PAGE_SIZE * pagenr) + (sector << 9)));
+}
+
+/**
+ * Get Kernel Virtual Address space for mapping requests.
+ *
+ * \param xbb	    Per-instance xbb configuration structure.
+ * \param nr_pages  Number of pages needed.
+ *
+ * \return  On success, a pointer to the allocated KVA region.  Otherwise
+ *          NULL.
+ *
+ * Note:  This should be unnecessary once we have either chaining or
+ * scatter/gather support for struct bio.  At that point we'll be able to
+ * put multiple addresses and lengths in one bio/bio chain and won't need
+ * to map everything into one virtual segment.
+ */
+static uint8_t *
+xbb_get_kva(struct xbb_softc *xbb, int nr_pages)
+{
+	intptr_t first_clear, num_clear;
+	uint8_t *free_kva;
+	int      i;
+
+	KASSERT(nr_pages != 0, ("xbb_get_kva of zero length"));
+
+	first_clear = 0;
+	free_kva = NULL;
+
+	mtx_lock(&xbb->lock);
+
+	/*
+	 * Look for the first available page.  If there are none, we're done.
+	 */
+	bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear);
+
+	if (first_clear == -1)
+		goto bailout;
+
+	/*
+	 * Starting at the first available page, look for consecutive free
+	 * pages that will satisfy the user's request.
+	 */
+	for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) {
+		/*
+		 * If this is true, the page is used, so we have to reset
+		 * the number of clear pages and the first clear page
+		 * (since it pointed to a region with an insufficient number
+		 * of clear pages).
+		 */
+		if (bit_test(xbb->kva_free, i)) {
+			num_clear = 0;
+			first_clear = -1;
+			continue;
+		}
+
+		if (first_clear == -1)
+			first_clear = i;
+
+		/*
+		 * If this is true, we've found a large enough free region
+		 * to satisfy the request.
+		 */
+		if (++num_clear == nr_pages) {
+
+			bit_nset(xbb->kva_free, first_clear,
+				 first_clear + nr_pages - 1);
+
+			free_kva = xbb->kva +
+				(uint8_t *)(first_clear * PAGE_SIZE);
+
+			KASSERT(free_kva >= (uint8_t *)xbb->kva &&
+				free_kva + (nr_pages * PAGE_SIZE) <=
+				(uint8_t *)xbb->ring_config.va,
+				("Free KVA %p len %d out of range, "
+				 "kva = %#jx, ring VA = %#jx\n", free_kva,
+				 nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva,
+				 (uintmax_t)xbb->ring_config.va));
+			break;
+		}
+	}
+
+bailout:
+
+	if (free_kva == NULL) {
+		xbb->flags |= XBBF_RESOURCE_SHORTAGE;
+		xbb->kva_shortages++;
+	}
+
+	mtx_unlock(&xbb->lock);
+
+	return (free_kva);
+}
+
+/**
+ * Free allocated KVA.
+ *
+ * \param xbb	    Per-instance xbb configuration structure.
+ * \param kva_ptr   Pointer to allocated KVA region.
+ * \param nr_pages  Number of pages in the KVA region.
+ */
+static void
+xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages)
 {
-	return ((uintptr_t)(req->gnt_base
-	                  + (PAGE_SIZE * pagenr) + (sector << 9)));
+	intptr_t start_page;
+
+	mtx_assert(&xbb->lock, MA_OWNED);
+
+	start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT;
+	bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1);
+
 }
 
 /**
@@ -775,23 +1079,23 @@ xbb_req_gntaddr(struct xbb_xen_req *req,
  * \param req  The request structure to unmap.
  */
static void
-xbb_unmap_req(struct xbb_xen_req *req)
+xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist)
 {
-	struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQUEST];
+	struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST];
 	u_int			      i;
 	u_int			      invcount;
 	int			      error;
 
 	invcount = 0;
-	for (i = 0; i < req->nr_pages; i++) {
+	for (i = 0; i < reqlist->nr_segments; i++) {
 
-		if (req->gnt_handles[i] == GRANT_REF_INVALID)
+		if (reqlist->gnt_handles[i] == GRANT_REF_INVALID)
 			continue;
 
-		unmap[invcount].host_addr    = xbb_req_gntaddr(req, i, 0);
+		unmap[invcount].host_addr    = xbb_get_gntaddr(reqlist, i, 0);
 		unmap[invcount].dev_bus_addr = 0;
-		unmap[invcount].handle       = req->gnt_handles[i];
-		req->gnt_handles[i]	     = GRANT_REF_INVALID;
+		unmap[invcount].handle       = reqlist->gnt_handles[i];
+		reqlist->gnt_handles[i]	     = GRANT_REF_INVALID;
 		invcount++;
 	}
 
@@ -801,6 +1105,175 @@ xbb_unmap_req(struct xbb_xen_req *req)
 }
 
 /**
+ * Allocate an internal transaction tracking structure from the free pool.
+ *
+ * \param xbb  Per-instance xbb configuration structure.
+ *
+ * \return  On success, a pointer to the allocated xbb_xen_reqlist structure.
+ *          Otherwise NULL.
+ */
+static inline struct xbb_xen_reqlist *
+xbb_get_reqlist(struct xbb_softc *xbb)
+{
+	struct xbb_xen_reqlist *reqlist;
+
+	reqlist = NULL;
+
+	mtx_assert(&xbb->lock, MA_OWNED);
+
+	if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) {
+
+		STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links);
+		reqlist->flags = XBB_REQLIST_NONE;
+		reqlist->kva = NULL;
+		reqlist->status = BLKIF_RSP_OKAY;
+		reqlist->residual_512b_sectors = 0;
+		reqlist->num_children = 0;
+		reqlist->nr_segments = 0;
+		STAILQ_INIT(&reqlist->contig_req_list);
+	}
+
+	return (reqlist);
+}
+
+/**
+ * Return an allocated transaction tracking structure to the free pool.
+ *
+ * \param xbb      Per-instance xbb configuration structure.
+ * \param reqlist  The request list structure to free.
+ * \param wakeup   If set, wakeup the work thread if freeing this reqlist
+ *                 during a resource shortage condition.
+ */
+static inline void
+xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
+		    int wakeup)
+{
+
+	mtx_lock(&xbb->lock);
+
+	if (wakeup) {
+		wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE;
+		xbb->flags &= ~XBBF_RESOURCE_SHORTAGE;
+	}
+
+	if (reqlist->kva != NULL)
+		xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments);
+
+	xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children);
+
+	STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);
+
+	if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
+		/*
+		 * Shutdown is in progress.  See if we can
+		 * progress further now that one more request
+		 * has completed and been returned to the
+		 * free pool.
+		 */
+		xbb_shutdown(xbb);
+	}
+
+	mtx_unlock(&xbb->lock);
+
+	if (wakeup != 0)
+		taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
+}
+
+/**
+ * Request resources and do basic request setup.
+ *
+ * \param xbb       Per-instance xbb configuration structure.
+ * \param reqlist   Pointer to reqlist pointer.
+ * \param ring_req  Pointer to a block ring request.
+ * \param ring_idx  The ring index of this request.
+ *
+ * \return  0 for success, non-zero for failure.
+ */
+static int
+xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist,
+		  blkif_request_t *ring_req, RING_IDX ring_idx)
+{
+	struct xbb_xen_reqlist *nreqlist;
+	struct xbb_xen_req     *nreq;
+
+	nreqlist = NULL;
+	nreq     = NULL;
+
+	mtx_lock(&xbb->lock);
+
+	/*
+	 * We don't allow new resources to be allocated if we're in the
+	 * process of shutting down.
+	 */
+	if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
+		mtx_unlock(&xbb->lock);
+		return (1);
+	}
+
+	/*
+	 * Allocate a reqlist if the caller doesn't have one already.
+	 */
+	if (*reqlist == NULL) {
+		nreqlist = xbb_get_reqlist(xbb);
+		if (nreqlist == NULL)
+			goto bailout_error;
+	}
+
+	/* We always allocate a request. */
+	nreq = xbb_get_req(xbb);
+	if (nreq == NULL)
+		goto bailout_error;
+
+	mtx_unlock(&xbb->lock);
+
+	if (*reqlist == NULL) {
+		*reqlist = nreqlist;
+		nreqlist->operation = ring_req->operation;
+		nreqlist->starting_sector_number = ring_req->sector_number;
+		STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist,
+				   links);
+	}
+
+	nreq->reqlist = *reqlist;
+	nreq->req_ring_idx = ring_idx;
+
+	if (xbb->abi != BLKIF_PROTOCOL_NATIVE) {
+		bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req));
+		nreq->ring_req = &nreq->ring_req_storage;
+	} else {
+		nreq->ring_req = ring_req;
+	}
+
+	binuptime(&nreq->ds_t0);
+	devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0);
+	STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links);
+	(*reqlist)->num_children++;
+	(*reqlist)->nr_segments += ring_req->nr_segments;
+
+	return (0);
+
+bailout_error:
+
+	/*
+	 * We're out of resources, so set the shortage flag.  The next time
+	 * a request is released, we'll try waking up the work thread to
+	 * see if we can allocate more resources.
+	 */
+	xbb->flags |= XBBF_RESOURCE_SHORTAGE;
+	xbb->request_shortages++;
+
+	if (nreq != NULL)
+		xbb_release_req(xbb, nreq);
+
+	mtx_unlock(&xbb->lock);
+
+	if (nreqlist != NULL)
+		xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0);
+
+	return (1);
+}
+
+/**
  * Create and transmit a response to a blkif request.
  *
  * \param xbb  Per-instance xbb configuration structure.
@@ -862,6 +1335,8 @@ xbb_send_response(struct xbb_softc *xbb,
 		more_to_do = 1;
 	}
 
+	xbb->reqs_completed++;
+
 	mtx_unlock(&xbb->lock);
 
 	if (more_to_do)
@@ -872,6 +1347,70 @@ xbb_send_response(struct xbb_softc *xbb,
 }
 
 /**
+ * Complete a request list.
+ *
+ * \param xbb      Per-instance xbb configuration structure.
+ * \param reqlist  Allocated internal request list structure.
+ */
+static void
+xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
+{
+	struct xbb_xen_req *nreq;
+	off_t		    sectors_sent;
+
+	sectors_sent = 0;
+
+	if (reqlist->flags & XBB_REQLIST_MAPPED)
+		xbb_unmap_reqlist(reqlist);

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
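[Editorial sketch of the first-fit page scan used by xbb_get_kva() and
xbb_free_kva() in the diff above.  The real code runs under xbb->lock and
uses FreeBSD's bitstring(3) macros (bit_ffc, bit_test, bit_nset,
bit_nclear); this portable model substitutes a byte-per-page map so it
compiles anywhere.  The names kva_alloc, kva_free, page_used, and
NUM_PAGES are invented for the sketch.]

    #include <stdio.h>
    #include <string.h>

    #define NUM_PAGES 16

    static unsigned char page_used[NUM_PAGES];   /* 0 = free, 1 = allocated */

    /* Find nr_pages consecutive free pages; return start index or -1. */
    static int
    kva_alloc(int nr_pages)
    {
            int first_clear = -1;
            int num_clear = 0;

            for (int i = 0; i < NUM_PAGES; i++) {
                    if (page_used[i]) {
                            /* Run broken; restart the scan state. */
                            num_clear = 0;
                            first_clear = -1;
                            continue;
                    }
                    if (first_clear == -1)
                            first_clear = i;
                    if (++num_clear == nr_pages) {
                            /* Large enough run found; mark it allocated. */
                            memset(&page_used[first_clear], 1, nr_pages);
                            return (first_clear);
                    }
            }
            return (-1);    /* The driver counts a kva_shortage here. */
    }

    /* Release a previously allocated run of pages. */
    static void
    kva_free(int start_page, int nr_pages)
    {
            memset(&page_used[start_page], 0, nr_pages);
    }

    int
    main(void)
    {
            int a = kva_alloc(4);   /* expect 0 */
            int b = kva_alloc(4);   /* expect 4 */

            kva_free(a, 4);
            printf("a=%d b=%d realloc=%d\n", a, b, kva_alloc(2)); /* 0 */
            return (0);
    }

As in the driver, a failed scan is the point where the resource-shortage
flag would be raised so the work thread can retry once KVA is freed.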