From owner-svn-src-head@freebsd.org Fri Sep 30 17:49:06 2016
From: Warner Losh <imp@FreeBSD.org>
Date: Fri, 30 Sep 2016 17:49:05 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject: svn commit: r306514 - head/sys/cam
Message-Id: <201609301749.u8UHn5Ok059229@repo.freebsd.org>
List-Id: SVN commit messages for the src tree for head/-current

Author: imp
Date: Fri Sep 30 17:49:04 2016
New Revision: 306514
URL: https://svnweb.freebsd.org/changeset/base/306514

Log:
  Compute two new metrics: disk load, the average number of transactions we
  have queued up, normalized to the queue size; and buckets of latency to
  help compute, in userland, estimates of the median, P90, P95 and P99
  values.

  Sponsored by: Netflix, Inc

Modified:
  head/sys/cam/cam_iosched.c

Modified: head/sys/cam/cam_iosched.c
==============================================================================
--- head/sys/cam/cam_iosched.c	Fri Sep 30 17:30:39 2016	(r306513)
+++ head/sys/cam/cam_iosched.c	Fri Sep 30 17:49:04 2016	(r306514)
@@ -42,12 +42,14 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
@@ -73,14 +75,31 @@ SYSCTL_INT(_kern_cam, OID_AUTO, do_dynam
     &do_dynamic_iosched, 1,
     "Enable Dynamic I/O scheduler optimizations.");

+/*
+ * For an EMA with a smoothing factor of alpha, we know
+ *	alpha = 2 / (N + 1)
+ * or
+ *	N = (2 / alpha) - 1
+ * where N is the number of recent samples from which about 86% of the
+ * current EMA is derived.
+ *
+ * So we invent[*] alpha_bits:
+ *	alpha_bits = -log_2(alpha)
+ *	alpha = 2^-alpha_bits
+ * So
+ *	N = 2^(alpha_bits + 1) - 1
+ *
+ * The default 9 gives a 1023 sample lookback for 86% of the data.
+ * For a brief intro: https://en.wikipedia.org/wiki/Moving_average
+ *
+ * [*] Stolen from the load average code and many other places.
+ */
 static int alpha_bits = 9;
 TUNABLE_INT("kern.cam.iosched_alpha_bits", &alpha_bits);
 SYSCTL_INT(_kern_cam, OID_AUTO, iosched_alpha_bits, CTLFLAG_RW,
     &alpha_bits, 1,
     "Bits in EMA's alpha.");

-
-
 struct iop_stats;
 struct cam_iosched_softc;
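
[Illustration, not part of the commit: a minimal userland sketch of the
alpha_bits relation described in the comment above, using the corrected
N = (2 / alpha) - 1.  The program and its variable names are examples only,
not kernel code.]

    #include <math.h>
    #include <stdio.h>

    int
    main(void)
    {
            int alpha_bits = 9;                     /* the default tunable value */
            double alpha = ldexp(1.0, -alpha_bits); /* alpha = 2^-alpha_bits */
            double n = 2.0 / alpha - 1.0;           /* samples carrying ~86% of the EMA's weight */

            printf("alpha = %g; ~86%% of the EMA comes from the last %.0f samples\n",
                alpha, n);
            return (0);
    }
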
@@ -208,11 +227,17 @@ struct iop_stats
 	/*
 	 * Statistics on different bits of the process.
 	 */
-	/* Exp Moving Average, alpha = 1 / (1 << alpha_bits) */
+	/* Exp Moving Average, see alpha_bits for more details */
 	sbintime_t      ema;
 	sbintime_t      emss;		/* Exp Moving sum of the squares */
 	sbintime_t      sd;		/* Last computed sd */

+	uint32_t	state_flags;
+#define IOP_RATE_LIMITED		1u
+
+#define LAT_BUCKETS 12		/* < 1ms < 2ms ... 512ms < 1024ms > 1024ms */
+	uint64_t	latencies[LAT_BUCKETS];
+
 	struct cam_iosched_softc *softc;
 };

@@ -258,6 +283,7 @@ struct cam_iosched_softc
 	int		read_bias;		/* Read bias setting */
 	int		current_read_bias;	/* Current read bias state */
 	int		total_ticks;
+	int		load;			/* EMA of 'load average' of disk / 2^16 */

 	struct bio_queue_head write_queue;
 	struct iop_stats read_stats, write_stats, trim_stats;

@@ -509,6 +535,7 @@ cam_iosched_ticker(void *arg)
 {
 	struct cam_iosched_softc *isc = arg;
 	sbintime_t now, delta;
+	int pending;

 	callout_reset(&isc->ticker, hz / isc->quanta - 1, cam_iosched_ticker, isc);
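
[Illustration, not part of the commit: the isc->load field added above is kept
as a 16.16 fixed-point EMA, as the next hunk's computation (pending << 16,
divided by the queue openings) makes explicit.  A minimal sketch of
interpreting such a raw value in userland; raw_load is a made-up sample
value.]

    #include <stdio.h>

    int
    main(void)
    {
            int raw_load = 49152;   /* hypothetical raw 16.16 fixed-point load value */

            /*
             * Dividing by 2^16 recovers the average number of outstanding
             * transactions per available queue slot.
             */
            printf("normalized disk load ~ %.2f\n", raw_load / 65536.0);
            return (0);
    }
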
@@ -525,6 +552,36 @@ cam_iosched_ticker(void *arg)

 	cam_iosched_schedule(isc, isc->periph);

+	/*
+	 * isc->load is an EMA of the pending I/Os at each tick. The number of
+	 * pending I/Os is the sum of the I/Os queued to the hardware, and those
+	 * in the software queue that could be queued to the hardware if there
+	 * were slots.
+	 *
+	 * iop_stats.pending is a count of requests in the SIM right now for
+	 * each of these types of I/O. So the total pending count is the sum of
+	 * these I/Os and the sum of the queued I/Os still in the software queue
+	 * for those operations that aren't being rate limited at the moment.
+	 *
+	 * The reason for the rate limiting bit is that those I/Os
+	 * aren't part of the software queued load (since we could
+	 * give them to hardware, but choose not to).
+	 *
+	 * Note: due to a bug in counting pending TRIMs in the device, we
+	 * don't include them in this count. We count each BIO_DELETE in
+	 * the pending count, but the periph drivers collapse them down
+	 * into one TRIM command. That one TRIM command gets the completion,
+	 * so the counts get off.
+	 */
+	pending = isc->read_stats.pending + isc->write_stats.pending /* + isc->trim_stats.pending */;
+	pending += !!(isc->read_stats.state_flags & IOP_RATE_LIMITED) * isc->read_stats.queued +
+	    !!(isc->write_stats.state_flags & IOP_RATE_LIMITED) * isc->write_stats.queued /* +
+	    !!(isc->trim_stats.state_flags & IOP_RATE_LIMITED) * isc->trim_stats.queued */ ;
+	pending <<= 16;
+	pending /= isc->periph->path->device->ccbq.total_openings;
+
+	isc->load = (pending + (isc->load << 13) - isc->load) >> 13; /* see above: 13 -> 16139 / 200/s = ~81s ~1 minute */
+
 	isc->total_ticks++;
 }

@@ -610,7 +667,7 @@ cam_iosched_cl_maybe_steer(struct contro
 		if (isc->write_stats.current > isc->write_stats.max)
 			isc->write_stats.current = isc->write_stats.max;
 		if (old != isc->write_stats.current && iosched_debug)
-			printf("Steering write from %d kBps to %d kBps due to latency of %jdms\n",
+			printf("Steering write from %d kBps to %d kBps due to latency of %jdus\n",
 			    old, isc->write_stats.current,
 			    (uintmax_t)((uint64_t)1000000 * (uint32_t)lat) >> 32);
 		break;

@@ -714,7 +771,7 @@ cam_iosched_limiter_sysctl(SYSCTL_HANDLE
 	char buf[16];
 	struct iop_stats *ios;
 	struct cam_iosched_softc *isc;
-	int value, i, error, cantick;
+	int value, i, error;
 	const char *p;

 	ios = arg1;

@@ -742,21 +799,9 @@ cam_iosched_limiter_sysctl(SYSCTL_HANDLE
 		cam_periph_unlock(isc->periph);
 		return error;
 	}
-	cantick = !!limsw[isc->read_stats.limiter].l_tick +
-	    !!limsw[isc->write_stats.limiter].l_tick +
-	    !!limsw[isc->trim_stats.limiter].l_tick +
-	    1;	/* Control loop requires it */
-	if (isc->flags & CAM_IOSCHED_FLAG_CALLOUT_ACTIVE) {
-		if (cantick == 0) {
-			callout_stop(&isc->ticker);
-			isc->flags &= ~CAM_IOSCHED_FLAG_CALLOUT_ACTIVE;
-		}
-	} else {
-		if (cantick != 0) {
-			callout_reset(&isc->ticker, hz / isc->quanta - 1, cam_iosched_ticker, isc);
-			isc->flags |= CAM_IOSCHED_FLAG_CALLOUT_ACTIVE;
-		}
-	}
+	/* Note: the disk load average requires the ticker to always be running */
+	callout_reset(&isc->ticker, hz / isc->quanta - 1, cam_iosched_ticker, isc);
+	isc->flags |= CAM_IOSCHED_FLAG_CALLOUT_ACTIVE;

 	cam_periph_unlock(isc->periph);
 	return 0;

@@ -821,6 +866,25 @@ cam_iosched_sbintime_sysctl(SYSCTL_HANDL
 	return 0;
 }

+static int
+cam_iosched_sysctl_latencies(SYSCTL_HANDLER_ARGS)
+{
+	int i, error;
+	struct sbuf sb;
+	uint64_t *latencies;
+
+	latencies = arg1;
+	sbuf_new_for_sysctl(&sb, NULL, LAT_BUCKETS * 16, req);
+
+	for (i = 0; i < LAT_BUCKETS - 1; i++)
+		sbuf_printf(&sb, "%jd,", (intmax_t)latencies[i]);
+	sbuf_printf(&sb, "%jd", (intmax_t)latencies[LAT_BUCKETS - 1]);
+	error = sbuf_finish(&sb);
+	sbuf_delete(&sb);
+
+	return (error);
+}
+
 static void
 cam_iosched_iop_stats_sysctl_init(struct cam_iosched_softc *isc, struct iop_stats *ios, char *name)
 {

@@ -884,6 +948,11 @@ cam_iosched_iop_stats_sysctl_init(struct
 	    &ios->current, 0,
 	    "current resource");

+	SYSCTL_ADD_PROC(ctx, n,
+	    OID_AUTO, "latencies", CTLTYPE_STRING | CTLFLAG_RD,
+	    &ios->latencies, 0,
+	    cam_iosched_sysctl_latencies, "A",
+	    "Array of power of 2 latency from 1ms to 1.024s");
 }

 static void

@@ -1051,6 +1120,11 @@ void cam_iosched_sysctl_init(struct cam_
 	    OID_AUTO, "total_ticks", CTLFLAG_RD,
 	    &isc->total_ticks, 0,
 	    "Total number of ticks we've done");
+
+	SYSCTL_ADD_INT(ctx, n,
+	    OID_AUTO, "load", CTLFLAG_RD,
+	    &isc->load, 0,
+	    "scaled load average / 100");
 #endif
 }
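
[Illustration, not part of the commit: the "latencies" sysctl added above
exports the per-bucket counts as a comma-separated string.  Below is a hedged
userland sketch of how those counts could be turned into the coarse median /
P90 / P95 / P99 estimates mentioned in the log message.  pct_bucket() and the
sample counts are illustrative assumptions, not an existing API.]

    #include <stdint.h>
    #include <stdio.h>

    #define LAT_BUCKETS 12          /* bucket i counts I/Os with latency < (1 << i) ms */

    /* First bucket at which the running total reaches pct percent of all I/Os. */
    static int
    pct_bucket(const uint64_t counts[LAT_BUCKETS], double pct)
    {
            uint64_t total = 0, running = 0;
            int i;

            for (i = 0; i < LAT_BUCKETS; i++)
                    total += counts[i];
            for (i = 0; i < LAT_BUCKETS; i++) {
                    running += counts[i];
                    if (total > 0 && 100.0 * running / total >= pct)
                            return (i);
            }
            return (LAT_BUCKETS - 1);
    }

    int
    main(void)
    {
            /* Counts as they might be parsed from the sysctl string. */
            uint64_t counts[LAT_BUCKETS] = { 900, 50, 20, 10, 8, 5, 3, 2, 1, 1, 0, 0 };

            printf("P50 < %dms, P99 < %dms\n",
                1 << pct_bucket(counts, 50.0), 1 << pct_bucket(counts, 99.0));
            return (0);
    }
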
@@ -1100,6 +1174,7 @@ cam_iosched_get_write(struct cam_iosched
 		if (iosched_debug)
 			printf("Reads present and current_read_bias is %d queued writes %d queued reads %d\n",
 			    isc->current_read_bias, isc->write_stats.queued,
 			    isc->read_stats.queued);
 		isc->current_read_bias--;
+		/* We're not limiting writes, per se, just doing reads first */
 		return NULL;
 	}

@@ -1109,6 +1184,7 @@ cam_iosched_get_write(struct cam_iosched
 	if (cam_iosched_limiter_iop(&isc->write_stats, bp) != 0) {
 		if (iosched_debug)
 			printf("Can't write because limiter says no.\n");
+		isc->write_stats.state_flags |= IOP_RATE_LIMITED;
 		return NULL;
 	}

@@ -1125,6 +1201,7 @@ cam_iosched_get_write(struct cam_iosched
 	}
 	if (iosched_debug > 9)
 		printf("HWQ : %p %#x\n", bp, bp->bio_cmd);
+	isc->write_stats.state_flags &= ~IOP_RATE_LIMITED;
 	return bp;
 }
 #endif

@@ -1224,14 +1301,17 @@ cam_iosched_next_bio(struct cam_iosched_

 #ifdef CAM_IOSCHED_DYNAMIC
 	/*
-	 * For the netflix scheduler, bio_queue is only for reads, so enforce
+	 * For the dynamic scheduler, bio_queue is only for reads, so enforce
 	 * the limits here. Enforce only for reads.
 	 */
 	if (do_dynamic_iosched) {
 		if (bp->bio_cmd == BIO_READ &&
-		    cam_iosched_limiter_iop(&isc->read_stats, bp) != 0)
+		    cam_iosched_limiter_iop(&isc->read_stats, bp) != 0) {
+			isc->read_stats.state_flags |= IOP_RATE_LIMITED;
 			return NULL;
+		}
 	}
+	isc->read_stats.state_flags &= ~IOP_RATE_LIMITED;
 #endif
 	bioq_remove(&isc->bio_queue, bp);
 #ifdef CAM_IOSCHED_DYNAMIC

@@ -1478,13 +1558,41 @@ mul(uint64_t a, uint64_t b)
 	return ((ah * bh) << 32) + al * bh + ah * bl + ((al * bl) >> 32);
 }

+static sbintime_t latencies[] = {
+	SBT_1MS <<  0,
+	SBT_1MS <<  1,
+	SBT_1MS <<  2,
+	SBT_1MS <<  3,
+	SBT_1MS <<  4,
+	SBT_1MS <<  5,
+	SBT_1MS <<  6,
+	SBT_1MS <<  7,
+	SBT_1MS <<  8,
+	SBT_1MS <<  9,
+	SBT_1MS << 10
+};
+
 static void
 cam_iosched_update(struct iop_stats *iop, sbintime_t sim_latency)
 {
 	sbintime_t y, yy;
 	uint64_t var;
+	int i;
+
+	/*
+	 * Keep counts for latency. We do it by power of two buckets.
+	 * This helps us spot outlier behavior obscured by averages.
+	 */
+	for (i = 0; i < LAT_BUCKETS - 1; i++) {
+		if (sim_latency < latencies[i]) {
+			iop->latencies[i]++;
+			break;
+		}
+	}
+	if (i == LAT_BUCKETS - 1)
+		iop->latencies[i]++;	/* Put all > 1024ms values into the last bucket. */

-	/*
+	/*
 	 * Classic exponentially decaying average with a tiny alpha
 	 * (2 ^ -alpha_bits). For more info see the NIST statistical
 	 * handbook.
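
[Illustration, not part of the commit: both the disk load update earlier
(shift by 13) and the classic decaying average described in the truncated
comment above follow the same fixed-point pattern,
new = old + (sample - old) / 2^alpha_bits, computed with shifts so the kernel
needs no floating point.  ema_update() and the sample values below are a
standalone sketch, not kernel code.]

    #include <stdint.h>
    #include <stdio.h>

    static int64_t
    ema_update(int64_t ema, int64_t sample, int alpha_bits)
    {
            /* (sample + ema * 2^bits - ema) >> bits == ema + (sample - ema) / 2^bits */
            return ((sample + (ema << alpha_bits) - ema) >> alpha_bits);
    }

    int
    main(void)
    {
            int64_t ema = 0;
            int64_t samples[] = { 100, 100, 100, 400, 100 };        /* made-up latency samples */
            unsigned int i;

            for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                    ema = ema_update(ema, samples[i], 3);   /* small alpha_bits so the effect is visible */
            printf("ema = %lld\n", (long long)ema);
            return (0);
    }
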