From owner-svn-src-head@freebsd.org Fri Sep 30 17:49:06 2016
From: Warner Losh <imp@FreeBSD.org>
Date: Fri, 30 Sep 2016 17:49:05 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject: svn commit: r306514 - head/sys/cam
Message-Id: <201609301749.u8UHn5Ok059229@repo.freebsd.org>
List-Id: SVN commit messages for the src tree for head/-current

Author: imp
Date: Fri Sep 30 17:49:04 2016
New Revision: 306514
URL: https://svnweb.freebsd.org/changeset/base/306514

Log:
  Compute two new metrics: disk load, the average number of transactions we
  have queued up, normalized to the queue size; and buckets of latency to
  help compute, in userland, estimates of the median, P90, P95 and P99
  values.

  Sponsored by: Netflix, Inc

Modified:
  head/sys/cam/cam_iosched.c

Modified: head/sys/cam/cam_iosched.c
==============================================================================
--- head/sys/cam/cam_iosched.c	Fri Sep 30 17:30:39 2016	(r306513)
+++ head/sys/cam/cam_iosched.c	Fri Sep 30 17:49:04 2016	(r306514)
@@ -42,12 +42,14 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
@@ -73,14 +75,31 @@ SYSCTL_INT(_kern_cam, OID_AUTO, do_dynam
     &do_dynamic_iosched, 1,
     "Enable Dynamic I/O scheduler optimizations.");

+/*
+ * For an EMA with a smoothing factor of alpha, we know
+ *	alpha = 2 / (N + 1)
+ * or
+ *	N = (2 / alpha) - 1
+ * where N is the number of recent samples from which about 86% of the
+ * current EMA is derived.
+ *
+ * So we invent[*] alpha_bits:
+ *	alpha_bits = -log_2(alpha)
+ *	alpha = 2^-alpha_bits
+ * So
+ *	N = 2^(alpha_bits + 1) - 1
+ *
+ * The default 9 gives a 1023 sample lookback for 86% of the data.
+ * For a brief intro: https://en.wikipedia.org/wiki/Moving_average
+ *
+ * [*] Stolen from the load average code and many other places.
+ */
 static int alpha_bits = 9;
 TUNABLE_INT("kern.cam.iosched_alpha_bits", &alpha_bits);
 SYSCTL_INT(_kern_cam, OID_AUTO, iosched_alpha_bits, CTLFLAG_RW,
     &alpha_bits, 1,
     "Bits in EMA's alpha.");

-
-
 struct iop_stats;
 struct cam_iosched_softc;
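
[Illustration, not part of the commit: a minimal userland sketch of the
alpha_bits relation described in the comment above, using the corrected
N = (2 / alpha) - 1.  The program and its variable names are examples only,
not kernel code.]

    #include <math.h>
    #include <stdio.h>

    int
    main(void)
    {
            int alpha_bits = 9;                     /* the default tunable value */
            double alpha = ldexp(1.0, -alpha_bits); /* alpha = 2^-alpha_bits */
            double n = 2.0 / alpha - 1.0;           /* samples carrying ~86% of the EMA's weight */

            printf("alpha = %g; ~86%% of the EMA comes from the last %.0f samples\n",
                alpha, n);
            return (0);
    }
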
@@ -208,11 +227,17 @@ struct iop_stats
 	/*
 	 * Statistics on different bits of the process.
 	 */
-	/* Exp Moving Average, alpha = 1 / (1 << alpha_bits) */
+	/* Exp Moving Average, see alpha_bits for more details */
 	sbintime_t      ema;
 	sbintime_t      emss;		/* Exp Moving sum of the squares */
 	sbintime_t      sd;		/* Last computed sd */

+	uint32_t	state_flags;
+#define IOP_RATE_LIMITED		1u
+
+#define LAT_BUCKETS 12		/* < 1ms < 2ms ... 512ms < 1024ms > 1024ms */
+	uint64_t	latencies[LAT_BUCKETS];
+
 	struct cam_iosched_softc *softc;
 };

@@ -258,6 +283,7 @@ struct cam_iosched_softc
 	int		read_bias;		/* Read bias setting */
 	int		current_read_bias;	/* Current read bias state */
 	int		total_ticks;
+	int		load;			/* EMA of 'load average' of disk / 2^16 */

 	struct bio_queue_head write_queue;
 	struct iop_stats read_stats, write_stats, trim_stats;

@@ -509,6 +535,7 @@ cam_iosched_ticker(void *arg)
 {
 	struct cam_iosched_softc *isc = arg;
 	sbintime_t now, delta;
+	int pending;

 	callout_reset(&isc->ticker, hz / isc->quanta - 1, cam_iosched_ticker, isc);
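
[Illustration, not part of the commit: the isc->load field added above is kept
as a 16.16 fixed-point EMA, as the next hunk's computation (pending << 16,
divided by the queue openings) makes explicit.  A minimal sketch of
interpreting such a raw value in userland; raw_load is a made-up sample
value.]

    #include <stdio.h>

    int
    main(void)
    {
            int raw_load = 49152;   /* hypothetical raw 16.16 fixed-point load value */

            /*
             * Dividing by 2^16 recovers the average number of outstanding
             * transactions per available queue slot.
             */
            printf("normalized disk load ~ %.2f\n", raw_load / 65536.0);
            return (0);
    }
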
@@ -525,6 +552,36 @@ cam_iosched_ticker(void *arg)

 	cam_iosched_schedule(isc, isc->periph);

+	/*
+	 * isc->load is an EMA of the pending I/Os at each tick. The number of
+	 * pending I/Os is the sum of the I/Os queued to the hardware, and those
+	 * in the software queue that could be queued to the hardware if there
+	 * were slots.
+	 *
+	 * iop_stats.pending is a count of requests in the SIM right now for
+	 * each of these types of I/O. So the total pending count is the sum of
+	 * these I/Os and the sum of the queued I/Os still in the software queue
+	 * for those operations that aren't being rate limited at the moment.
+	 *
+	 * The reason for the rate limiting bit is that those I/Os
+	 * aren't part of the software queued load (since we could
+	 * give them to hardware, but choose not to).
+	 *
+	 * Note: due to a bug in counting pending TRIMs in the device, we
+	 * don't include them in this count. We count each BIO_DELETE in
+	 * the pending count, but the periph drivers collapse them down
+	 * into one TRIM command. That one TRIM command gets the completion,
+	 * so the counts get off.
+	 */
+	pending = isc->read_stats.pending + isc->write_stats.pending /* + isc->trim_stats.pending */;
+	pending += !!(isc->read_stats.state_flags & IOP_RATE_LIMITED) * isc->read_stats.queued +
+	    !!(isc->write_stats.state_flags & IOP_RATE_LIMITED) * isc->write_stats.queued /* +
+	    !!(isc->trim_stats.state_flags & IOP_RATE_LIMITED) * isc->trim_stats.queued */ ;
+	pending <<= 16;
+	pending /= isc->periph->path->device->ccbq.total_openings;
+
+	isc->load = (pending + (isc->load << 13) - isc->load) >> 13; /* see above: 13 -> 16139 / 200/s = ~81s ~1 minute */
+
 	isc->total_ticks++;
 }

@@ -610,7 +667,7 @@ cam_iosched_cl_maybe_steer(struct contro
 		if (isc->write_stats.current > isc->write_stats.max)
 			isc->write_stats.current = isc->write_stats.max;
 		if (old != isc->write_stats.current && iosched_debug)
-			printf("Steering write from %d kBps to %d kBps due to latency of %jdms\n",
+			printf("Steering write from %d kBps to %d kBps due to latency of %jdus\n",
 			    old, isc->write_stats.current,
 			    (uintmax_t)((uint64_t)1000000 * (uint32_t)lat) >> 32);
 		break;

@@ -714,7 +771,7 @@ cam_iosched_limiter_sysctl(SYSCTL_HANDLE
 	char buf[16];
 	struct iop_stats *ios;
 	struct cam_iosched_softc *isc;
-	int value, i, error, cantick;
+	int value, i, error;
 	const char *p;

 	ios = arg1;

@@ -742,21 +799,9 @@ cam_iosched_limiter_sysctl(SYSCTL_HANDLE
 		cam_periph_unlock(isc->periph);
 		return error;
 	}
-	cantick = !!limsw[isc->read_stats.limiter].l_tick +
-	    !!limsw[isc->write_stats.limiter].l_tick +
-	    !!limsw[isc->trim_stats.limiter].l_tick +
-	    1;	/* Control loop requires it */
-	if (isc->flags & CAM_IOSCHED_FLAG_CALLOUT_ACTIVE) {
-		if (cantick == 0) {
-			callout_stop(&isc->ticker);
-			isc->flags &= ~CAM_IOSCHED_FLAG_CALLOUT_ACTIVE;
-		}
-	} else {
-		if (cantick != 0) {
-			callout_reset(&isc->ticker, hz / isc->quanta - 1, cam_iosched_ticker, isc);
-			isc->flags |= CAM_IOSCHED_FLAG_CALLOUT_ACTIVE;
-		}
-	}
+	/* Note: the disk load average requires the ticker to always be running */
+	callout_reset(&isc->ticker, hz / isc->quanta - 1, cam_iosched_ticker, isc);
+	isc->flags |= CAM_IOSCHED_FLAG_CALLOUT_ACTIVE;

 	cam_periph_unlock(isc->periph);
 	return 0;

@@ -821,6 +866,25 @@ cam_iosched_sbintime_sysctl(SYSCTL_HANDL
 	return 0;
 }

+static int
+cam_iosched_sysctl_latencies(SYSCTL_HANDLER_ARGS)
+{
+	int i, error;
+	struct sbuf sb;
+	uint64_t *latencies;
+
+	latencies = arg1;
+	sbuf_new_for_sysctl(&sb, NULL, LAT_BUCKETS * 16, req);
+
+	for (i = 0; i < LAT_BUCKETS - 1; i++)
+		sbuf_printf(&sb, "%jd,", (intmax_t)latencies[i]);
+	sbuf_printf(&sb, "%jd", (intmax_t)latencies[LAT_BUCKETS - 1]);
+	error = sbuf_finish(&sb);
+	sbuf_delete(&sb);
+
+	return (error);
+}
+
 static void
 cam_iosched_iop_stats_sysctl_init(struct cam_iosched_softc *isc, struct iop_stats *ios, char *name)
 {

@@ -884,6 +948,11 @@ cam_iosched_iop_stats_sysctl_init(struct
 	    &ios->current, 0,
 	    "current resource");

+	SYSCTL_ADD_PROC(ctx, n,
+	    OID_AUTO, "latencies", CTLTYPE_STRING | CTLFLAG_RD,
+	    &ios->latencies, 0,
+	    cam_iosched_sysctl_latencies, "A",
+	    "Array of power of 2 latency from 1ms to 1.024s");
 }

 static void

@@ -1051,6 +1120,11 @@ void cam_iosched_sysctl_init(struct cam_
 	    OID_AUTO, "total_ticks", CTLFLAG_RD,
 	    &isc->total_ticks, 0,
 	    "Total number of ticks we've done");
+
+	SYSCTL_ADD_INT(ctx, n,
+	    OID_AUTO, "load", CTLFLAG_RD,
+	    &isc->load, 0,
+	    "scaled load average / 100");
 #endif
 }
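
[Illustration, not part of the commit: the "latencies" sysctl added above
exports the per-bucket counts as a comma-separated string.  Below is a hedged
userland sketch of how those counts could be turned into the coarse median /
P90 / P95 / P99 estimates mentioned in the log message.  pct_bucket() and the
sample counts are illustrative assumptions, not an existing API.]

    #include <stdint.h>
    #include <stdio.h>

    #define LAT_BUCKETS 12          /* bucket i counts I/Os with latency < (1 << i) ms */

    /* First bucket at which the running total reaches pct percent of all I/Os. */
    static int
    pct_bucket(const uint64_t counts[LAT_BUCKETS], double pct)
    {
            uint64_t total = 0, running = 0;
            int i;

            for (i = 0; i < LAT_BUCKETS; i++)
                    total += counts[i];
            for (i = 0; i < LAT_BUCKETS; i++) {
                    running += counts[i];
                    if (total > 0 && 100.0 * running / total >= pct)
                            return (i);
            }
            return (LAT_BUCKETS - 1);
    }

    int
    main(void)
    {
            /* Counts as they might be parsed from the sysctl string. */
            uint64_t counts[LAT_BUCKETS] = { 900, 50, 20, 10, 8, 5, 3, 2, 1, 1, 0, 0 };

            printf("P50 < %dms, P99 < %dms\n",
                1 << pct_bucket(counts, 50.0), 1 << pct_bucket(counts, 99.0));
            return (0);
    }
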
@@ -1100,6 +1174,7 @@ cam_iosched_get_write(struct cam_iosched
 		if (iosched_debug)
 			printf("Reads present and current_read_bias is %d queued writes %d queued reads %d\n",
 			    isc->current_read_bias, isc->write_stats.queued,
 			    isc->read_stats.queued);
 		isc->current_read_bias--;
+		/* We're not limiting writes, per se, just doing reads first */
 		return NULL;
 	}

@@ -1109,6 +1184,7 @@ cam_iosched_get_write(struct cam_iosched
 	if (cam_iosched_limiter_iop(&isc->write_stats, bp) != 0) {
 		if (iosched_debug)
 			printf("Can't write because limiter says no.\n");
+		isc->write_stats.state_flags |= IOP_RATE_LIMITED;
 		return NULL;
 	}

@@ -1125,6 +1201,7 @@ cam_iosched_get_write(struct cam_iosched
 	}
 	if (iosched_debug > 9)
 		printf("HWQ : %p %#x\n", bp, bp->bio_cmd);
+	isc->write_stats.state_flags &= ~IOP_RATE_LIMITED;
 	return bp;
 }
 #endif

@@ -1224,14 +1301,17 @@ cam_iosched_next_bio(struct cam_iosched_

 #ifdef CAM_IOSCHED_DYNAMIC
 	/*
-	 * For the netflix scheduler, bio_queue is only for reads, so enforce
+	 * For the dynamic scheduler, bio_queue is only for reads, so enforce
 	 * the limits here. Enforce only for reads.
 	 */
 	if (do_dynamic_iosched) {
 		if (bp->bio_cmd == BIO_READ &&
-		    cam_iosched_limiter_iop(&isc->read_stats, bp) != 0)
+		    cam_iosched_limiter_iop(&isc->read_stats, bp) != 0) {
+			isc->read_stats.state_flags |= IOP_RATE_LIMITED;
 			return NULL;
+		}
 	}
+	isc->read_stats.state_flags &= ~IOP_RATE_LIMITED;
 #endif
 	bioq_remove(&isc->bio_queue, bp);
 #ifdef CAM_IOSCHED_DYNAMIC

@@ -1478,13 +1558,41 @@ mul(uint64_t a, uint64_t b)
 	return ((ah * bh) << 32) + al * bh + ah * bl + ((al * bl) >> 32);
 }

+static sbintime_t latencies[] = {
+	SBT_1MS <<  0,
+	SBT_1MS <<  1,
+	SBT_1MS <<  2,
+	SBT_1MS <<  3,
+	SBT_1MS <<  4,
+	SBT_1MS <<  5,
+	SBT_1MS <<  6,
+	SBT_1MS <<  7,
+	SBT_1MS <<  8,
+	SBT_1MS <<  9,
+	SBT_1MS << 10
+};
+
 static void
 cam_iosched_update(struct iop_stats *iop, sbintime_t sim_latency)
 {
 	sbintime_t y, yy;
 	uint64_t var;
+	int i;
+
+	/*
+	 * Keep counts for latency. We do it by power of two buckets.
+	 * This helps us spot outlier behavior obscured by averages.
+	 */
+	for (i = 0; i < LAT_BUCKETS - 1; i++) {
+		if (sim_latency < latencies[i]) {
+			iop->latencies[i]++;
+			break;
+		}
+	}
+	if (i == LAT_BUCKETS - 1)
+		iop->latencies[i]++;	/* Put all > 1024ms values into the last bucket. */

-	/*
+	/*
 	 * Classic exponentially decaying average with a tiny alpha
 	 * (2 ^ -alpha_bits). For more info see the NIST statistical
 	 * handbook.
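
[Illustration, not part of the commit: both the disk load update earlier
(shift by 13) and the classic decaying average described in the truncated
comment above follow the same fixed-point pattern,
new = old + (sample - old) / 2^alpha_bits, computed with shifts so the kernel
needs no floating point.  ema_update() and the sample values below are a
standalone sketch, not kernel code.]

    #include <stdint.h>
    #include <stdio.h>

    static int64_t
    ema_update(int64_t ema, int64_t sample, int alpha_bits)
    {
            /* (sample + ema * 2^bits - ema) >> bits == ema + (sample - ema) / 2^bits */
            return ((sample + (ema << alpha_bits) - ema) >> alpha_bits);
    }

    int
    main(void)
    {
            int64_t ema = 0;
            int64_t samples[] = { 100, 100, 100, 400, 100 };        /* made-up latency samples */
            unsigned int i;

            for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                    ema = ema_update(ema, samples[i], 3);   /* small alpha_bits so the effect is visible */
            printf("ema = %lld\n", (long long)ema);
            return (0);
    }
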