Date: Wed, 16 Sep 2015 22:15:51 +0000 (UTC)
From: Warner Losh <imp@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-projects@freebsd.org
Subject: svn commit: r287876 - in projects/iosched/sys/cam: . ata scsi
Message-ID: <201509162215.t8GMFp1b023705@repo.freebsd.org>
Author: imp
Date: Wed Sep 16 22:15:50 2015
New Revision: 287876
URL: https://svnweb.freebsd.org/changeset/base/287876

Log:
  Commit the post-BSDcan level (and a little more) iosched work. This work
  is described in the paper that I presented at BSDcan
  http://people.freebsd.org/~imp/bsdcan2015/iosched-v3.pdf
  section XII. Recent Changes.

Modified:
  projects/iosched/sys/cam/ata/ata_da.c
  projects/iosched/sys/cam/cam_iosched.c
  projects/iosched/sys/cam/cam_iosched.h
  projects/iosched/sys/cam/scsi/scsi_da.c

Modified: projects/iosched/sys/cam/ata/ata_da.c
==============================================================================
--- projects/iosched/sys/cam/ata/ata_da.c	Wed Sep 16 21:43:51 2015	(r287875)
+++ projects/iosched/sys/cam/ata/ata_da.c	Wed Sep 16 22:15:50 2015	(r287876)
@@ -98,11 +98,13 @@ typedef enum {
 
 typedef enum {
         ADA_Q_NONE = 0x00,
         ADA_Q_4K = 0x01,
+        ADA_Q_NCQ_TRIM_BROKEN = 0x02,
 } ada_quirks;
 
 #define ADA_Q_BIT_STRING \
         "\020" \
-        "\0014K"
+        "\0014K" \
+        "\002NCQ_TRIM_BROKEN"
 
 typedef enum {
         ADA_CCB_RAHEAD = 0x01,
@@ -160,6 +162,8 @@ struct ada_softc {
         int trim_max_ranges;
         int read_ahead;
         int write_cache;
+        int unmappedio;
+        int rotating;
 #ifdef ADA_TEST_FAILURE
         int force_read_error;
         int force_write_error;
@@ -173,6 +177,13 @@ struct ada_softc {
         struct sysctl_oid *sysctl_tree;
         struct callout sendordered_c;
         struct trim_request trim_req;
+#ifdef CAM_IO_STATS
+        struct sysctl_ctx_list sysctl_stats_ctx;
+        struct sysctl_oid *sysctl_stats_tree;
+        u_int timeouts;
+        u_int errors;
+        u_int invalidations;
+#endif
 };
 
 struct ada_quirk_entry {
@@ -350,6 +361,38 @@ static struct ada_quirk_entry ada_quirk_
         },
         {
                 /*
+                 * Crucial M500 SSDs EU07 firmware
+                 * NCQ Trim works ?
+                 */
+                { T_DIRECT, SIP_MEDIA_FIXED, "*", "Crucial CT*M500*", "EU07" },
+                /*quirks*/0
+        },
+        {
+                /*
+                 * Crucial M500 SSDs all other firmware
+                 * NCQ Trim doesn't work
+                 */
+                { T_DIRECT, SIP_MEDIA_FIXED, "*", "Crucial CT*M500*", "*" },
+                /*quirks*/ADA_Q_NCQ_TRIM_BROKEN
+        },
+        {
+                /*
+                 * Crucial M550 SSDs
+                 * NCQ Trim doesn't work, but only on MU01 firmware
+                 */
+                { T_DIRECT, SIP_MEDIA_FIXED, "*", "Crucial CT*M550*", "MU01" },
+                /*quirks*/ADA_Q_NCQ_TRIM_BROKEN
+        },
+        {
+                /*
+                 * Crucial MX100 SSDs
+                 * NCQ Trim doesn't work, but only on MU01 firmware
+                 */
+                { T_DIRECT, SIP_MEDIA_FIXED, "*", "Crucial CT*MX100*", "MU01" },
+                /*quirks*/ADA_Q_NCQ_TRIM_BROKEN
+        },
+        {
+                /*
                  * Crucial RealSSD C300 SSDs
                  * 4k optimised
                  */
@@ -422,6 +465,30 @@ static struct ada_quirk_entry ada_quirk_
         },
         {
                 /*
+                 * Micron M500 SSDs firmware EU07
+                 * NCQ Trim works?
+ */ + { T_DIRECT, SIP_MEDIA_FIXED, "*", "Micron M500*", "EU07" }, + /*quirks*/0 + }, + { + /* + * Micron M500 SSDs all other firmware + * NCQ Trim doesn't work + */ + { T_DIRECT, SIP_MEDIA_FIXED, "*", "Micron M500*", "*" }, + /*quirks*/ADA_Q_NCQ_TRIM_BROKEN + }, + { + /* + * Micron M5[15]0 SSDs + * NCQ Trim doesn't work, but only MU01 firmware + */ + { T_DIRECT, SIP_MEDIA_FIXED, "*", "Micron M5[15]0*", "MU01" }, + /*quirks*/ADA_Q_NCQ_TRIM_BROKEN + }, + { + /* * OCZ Agility 2 SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ @@ -471,22 +538,22 @@ static struct ada_quirk_entry ada_quirk_ { /* * Samsung 830 Series SSDs - * 4k optimised + * 4k optimised, NCQ TRIM broken (normal TRIM fine) */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG SSD 830 Series*", "*" }, - /*quirks*/ADA_Q_4K + /*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN }, { /* * Samsung 840 SSDs - * 4k optimised + * 4k optimised, NCQ TRIM broken (normal TRIM fine) */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "Samsung SSD 840*", "*" }, - /*quirks*/ADA_Q_4K + /*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN }, { /* - * Samsung 843T Series SSDs + * Samsung PM843T Series SSDs * 4k optimised */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG MZ7WD*", "*" }, @@ -495,10 +562,10 @@ static struct ada_quirk_entry ada_quirk_ { /* * Samsung 850 SSDs - * 4k optimised + * 4k optimised, NCQ TRIM broken (normal TRIM fine) */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "Samsung SSD 850*", "*" }, - /*quirks*/ADA_Q_4K + /*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN }, { /* @@ -782,8 +849,6 @@ adastrategy(struct bio *bp) /* * Place it in the queue of disk activities for this disk */ - if (bp->bio_cmd == BIO_DELETE) { - } cam_iosched_queue_work(softc->cam_iosched, bp); /* @@ -865,7 +930,7 @@ adadump(void *arg, void *virtual, vm_off 0, NULL, 0, - ada_default_timeout*1000); + 5*1000); if (softc->flags & ADA_FLAG_CAN_48BIT) ata_48bit_cmd(&ccb.ataio, ATA_FLUSHCACHE48, 0, 0, 0); @@ -939,6 +1004,9 @@ adaoninvalidate(struct cam_periph *perip * De-register any async callbacks. */ xpt_register_async(0, adaasync, periph, periph->path); +#ifdef CAM_IO_STATS + softc->invalidations++; +#endif /* * Return all queued I/O with ENXIO. @@ -959,12 +1027,20 @@ adacleanup(struct cam_periph *periph) cam_periph_unlock(periph); + cam_iosched_fini(softc->cam_iosched); + /* * If we can't free the sysctl tree, oh well... */ - if ((softc->flags & ADA_FLAG_SCTX_INIT) != 0 - && sysctl_ctx_free(&softc->sysctl_ctx) != 0) { - xpt_print(periph->path, "can't remove sysctl context\n"); + if ((softc->flags & ADA_FLAG_SCTX_INIT) != 0) { +#ifdef CAM_IO_STATS + if (sysctl_ctx_free(&softc->sysctl_stats_ctx) != 0) + xpt_print(periph->path, + "can't remove sysctl stats context\n"); +#endif + if (sysctl_ctx_free(&softc->sysctl_ctx) != 0) + xpt_print(periph->path, + "can't remove sysctl context\n"); } disk_destroy(softc->disk); @@ -977,16 +1053,9 @@ static void adasetdeletemethod(struct ada_softc *softc) { -#if 0 - /* - * Don't set NCQ_DSM_TRIM method by default. It is currently - * a "feature of interest" implicated in some data corruption. - */ if (softc->flags & ADA_FLAG_CAN_NCQ_TRIM) softc->delete_method = ADA_DELETE_NCQ_DSM_TRIM; - else -#endif - if (softc->flags & ADA_FLAG_CAN_TRIM) + else if (softc->flags & ADA_FLAG_CAN_TRIM) softc->delete_method = ADA_DELETE_DSM_TRIM; else if ((softc->flags & ADA_FLAG_CAN_CFA) && !(softc->flags & ADA_FLAG_CAN_48BIT)) softc->delete_method = ADA_DELETE_CFA_ERASE; @@ -1069,7 +1138,8 @@ adaasync(void *callback_arg, u_int32_t c * the sim do do things properly. 
Perhaps we should look at log 13 * dword 0 bit 0 and dword 1 bit 0 are set too... */ - if ((softc->flags & ADA_FLAG_PIM_CAN_NCQ_TRIM) != 0 && + if ((softc->quirks & ADA_Q_NCQ_TRIM_BROKEN) == 0 && + (softc->flags & ADA_FLAG_PIM_CAN_NCQ_TRIM) != 0 && (cgd.ident_data.satacapabilities2 & ATA_SUPPORT_RCVSND_FPDMA_QUEUED) != 0 && (softc->flags & ADA_FLAG_CAN_TRIM) != 0) softc->flags |= ADA_FLAG_CAN_NCQ_TRIM; @@ -1165,6 +1235,12 @@ adasysctlinit(void *context, int pending SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "write_cache", CTLFLAG_RW | CTLFLAG_MPSAFE, &softc->write_cache, 0, "Enable disk write cache."); + SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), + OID_AUTO, "unmapped_io", CTLFLAG_RD | CTLFLAG_MPSAFE, + &softc->unmappedio, 0, "Unmapped I/O leaf"); + SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), + OID_AUTO, "rotating", CTLFLAG_RD | CTLFLAG_MPSAFE, + &softc->rotating, 0, "Rotating media"); #ifdef ADA_TEST_FAILURE /* * Add a 'door bell' sysctl which allows one to set it from userland @@ -1184,6 +1260,28 @@ adasysctlinit(void *context, int pending &softc->periodic_read_error, 0, "Force a read error every N reads (don't set too low)."); #endif + +#ifdef CAM_IO_STATS + softc->sysctl_stats_tree = SYSCTL_ADD_NODE(&softc->sysctl_stats_ctx, + SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "stats", + CTLFLAG_RD, 0, "Statistics"); + SYSCTL_ADD_INT(&softc->sysctl_stats_ctx, + SYSCTL_CHILDREN(softc->sysctl_stats_tree), + OID_AUTO, "timeouts", CTLFLAG_RD | CTLFLAG_MPSAFE, + &softc->timeouts, 0, + "Device timeouts reported by the SIM"); + SYSCTL_ADD_INT(&softc->sysctl_stats_ctx, + SYSCTL_CHILDREN(softc->sysctl_stats_tree), + OID_AUTO, "errors", CTLFLAG_RD | CTLFLAG_MPSAFE, + &softc->errors, 0, + "Transport errors reported by the SIM."); + SYSCTL_ADD_INT(&softc->sysctl_stats_ctx, + SYSCTL_CHILDREN(softc->sysctl_stats_tree), + OID_AUTO, "pack_invalidations", CTLFLAG_RD | CTLFLAG_MPSAFE, + &softc->invalidations, 0, + "Device pack invalidations."); +#endif + cam_iosched_sysctl_init(softc->cam_iosched, &softc->sysctl_ctx, softc->sysctl_tree); @@ -1270,7 +1368,7 @@ adaregister(struct cam_periph *periph, v return(CAM_REQ_CMP_ERR); } - if (cam_iosched_init(&softc->cam_iosched) != 0) { + if (cam_iosched_init(&softc->cam_iosched, periph) != 0) { printf("adaregister: Unable to probe new device. " "Unable to allocate iosched memory\n"); return(CAM_REQ_CMP_ERR); @@ -1346,8 +1444,12 @@ adaregister(struct cam_periph *periph, v "kern.cam.ada.%d.write_cache", periph->unit_number); TUNABLE_INT_FETCH(announce_buf, &softc->write_cache); /* Disable queue sorting for non-rotational media by default. */ - cam_iosched_set_sort_queue(softc->cam_iosched, - cgd->ident_data.media_rotation_rate == ATA_RATE_NON_ROTATING); + if (cgd->ident_data.media_rotation_rate == ATA_RATE_NON_ROTATING) { + softc->rotating = 0; + } else { + softc->rotating = 1; + } + cam_iosched_set_sort_queue(softc->cam_iosched, softc->rotating ? 
-1 : 0); adagetparams(periph, cgd); softc->disk = disk_alloc(); softc->disk->d_rotation_rate = cgd->ident_data.media_rotation_rate; @@ -1390,8 +1492,10 @@ adaregister(struct cam_periph *periph, v softc->disk->d_delmaxsize = 256 * softc->params.secsize; } else softc->disk->d_delmaxsize = maxio; - if ((cpi.hba_misc & PIM_UNMAPPED) != 0) + if ((cpi.hba_misc & PIM_UNMAPPED) != 0) { softc->disk->d_flags |= DISKFLAG_UNMAPPED_BIO; + softc->unmappedio = 1; + } /* * If we can do RCVSND_FPDMA_QUEUED commands, we may be able to do * NCQ trims, if we support trims at all. We also need support from @@ -1400,9 +1504,9 @@ adaregister(struct cam_periph *periph, v */ if (cpi.hba_misc & PIM_NCQ_KLUDGE) softc->flags |= ADA_FLAG_PIM_CAN_NCQ_TRIM; - if ((softc->flags & ADA_FLAG_PIM_CAN_NCQ_TRIM) != 0 && - (cgd->ident_data.satacapabilities2 & - ATA_SUPPORT_RCVSND_FPDMA_QUEUED) != 0 && + if ((softc->quirks & ADA_Q_NCQ_TRIM_BROKEN) == 0 && + (softc->flags & ADA_FLAG_PIM_CAN_NCQ_TRIM) != 0 && + (cgd->ident_data.satacapabilities2 & ATA_SUPPORT_RCVSND_FPDMA_QUEUED) != 0 && (softc->flags & ADA_FLAG_CAN_TRIM) != 0) softc->flags |= ADA_FLAG_CAN_NCQ_TRIM; strlcpy(softc->disk->d_descr, cgd->ident_data.model, @@ -1675,8 +1779,7 @@ adastart(struct cam_periph *periph, unio } if ((bp->bio_flags & BIO_ORDERED) != 0 || - (bp->bio_cmd != BIO_DELETE && - (softc->flags & ADA_FLAG_NEED_OTAG) != 0)) { + (bp->bio_cmd != BIO_DELETE && (softc->flags & ADA_FLAG_NEED_OTAG) != 0)) { softc->flags &= ~ADA_FLAG_NEED_OTAG; softc->flags |= ADA_FLAG_WAS_OTAG; tag_code = 0; @@ -1807,7 +1910,10 @@ adastart(struct cam_periph *periph, unio ada_cfaerase(softc, bp, ataio); break; default: - panic("adastart: BIO_DELETE without method, not possible."); + biofinish(bp, NULL, EOPNOTSUPP); + xpt_release_ccb(start_ccb); + adaschedule(periph); + return; } start_ccb->ccb_h.ccb_state = ADA_CCB_TRIM; start_ccb->ccb_h.flags |= CAM_UNLOCKED; @@ -1893,7 +1999,7 @@ adadone(struct cam_periph *periph, union case ADA_CCB_TRIM: { struct bio *bp; - int error, need_sched; + int error; cam_periph_lock(periph); bp = (struct bio *)done_ccb->ccb_h.ccb_bp; @@ -1945,7 +2051,7 @@ adadone(struct cam_periph *periph, union if (softc->outstanding_cmds == 0) softc->flags |= ADA_FLAG_WAS_OTAG; - need_sched = cam_iosched_bio_complete(softc->cam_iosched, bp, done_ccb); + cam_iosched_bio_complete(softc->cam_iosched, bp, done_ccb); xpt_release_ccb(done_ccb); if (state == ADA_CCB_TRIM) { TAILQ_HEAD(, bio) queue; @@ -1957,9 +2063,9 @@ adadone(struct cam_periph *periph, union * Normally, the xpt_release_ccb() above would make sure * that when we have more work to do, that work would * get kicked off. However, we specifically keep - * trim running set to 0 before the call above to allow + * trim_running set to 0 before the call above to allow * other I/O to progress when many BIO_DELETE requests - * are pushed down. We set trim running to 0 and call + * are pushed down. We set trim_running to 0 and call * daschedule again so that we don't stall if there are * no other I/Os pending apart from BIO_DELETEs. 
*/ @@ -1977,8 +2083,7 @@ adadone(struct cam_periph *periph, union biodone(bp1); } } else { - if (need_sched) - adaschedule(periph); + adaschedule(periph); cam_periph_unlock(periph); biodone(bp); } @@ -2070,6 +2175,31 @@ out: static int adaerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags) { + struct ada_softc *softc; + struct cam_periph *periph; + + periph = xpt_path_periph(ccb->ccb_h.path); + softc = (struct ada_softc *)periph->softc; + + switch (ccb->ccb_h.status & CAM_STATUS_MASK) { + case CAM_CMD_TIMEOUT: +#ifdef CAM_IO_STATS + softc->timeouts++; +#endif + break; + case CAM_REQ_ABORTED: + case CAM_REQ_CMP_ERR: + case CAM_REQ_TERMIO: + case CAM_UNREC_HBA_ERROR: + case CAM_DATA_RUN_ERR: + case CAM_ATA_STATUS_ERROR: +#ifdef CAM_IO_STATS + softc->errors++; +#endif + break; + default: + break; + } return(cam_periph_error(ccb, cam_flags, sense_flags, NULL)); } Modified: projects/iosched/sys/cam/cam_iosched.c ============================================================================== --- projects/iosched/sys/cam/cam_iosched.c Wed Sep 16 21:43:51 2015 (r287875) +++ projects/iosched/sys/cam/cam_iosched.c Wed Sep 16 22:15:50 2015 (r287876) @@ -63,9 +63,6 @@ static MALLOC_DEFINE(M_CAMSCHED, "CAM I/ */ #ifdef CAM_NETFLIX_IOSCHED -#define IOP_MAX_SKIP 50 -#define IOP_MAX_TRAINING 500 -#define ALPHA_BITS 14 /* ~32k events or about the last minute */ SYSCTL_DECL(_kern_cam); static int do_netflix_iosched = 1; @@ -74,20 +71,178 @@ SYSCTL_INT(_kern_cam, OID_AUTO, do_netfl &do_netflix_iosched, 1, "Enable Netflix I/O scheduler optimizations."); +static int alpha_bits = 9; +TUNABLE_INT("kern.cam.iosched_alpha_bits", &alpha_bits); +SYSCTL_INT(_kern_cam, OID_AUTO, iosched_alpha_bits, CTLFLAG_RW, + &alpha_bits, 1, + "Bits in EMA's alpha."); + + + +struct iop_stats; +struct cam_iosched_softc; + int iosched_debug = 0; +typedef enum { + none = 0, /* No limits */ + queue_depth, /* Limit how many ops we queue to SIM */ + iops, /* Limit # of IOPS to the drive */ + bandwidth, /* Limit bandwidth to the drive */ + limiter_max +} io_limiter; + +static const char *cam_iosched_limiter_names[] = + { "none", "queue_depth", "iops", "bandwidth" }; + +/* + * Called to initialize the bits of the iop_stats structure relevant to the + * limiter. Called just after the limiter is set. + */ +typedef int l_init_t(struct iop_stats *); + +/* + * Called every tick. + */ +typedef int l_tick_t(struct iop_stats *); + +/* + * Called to see if the limiter thinks this IOP can be allowed to + * proceed. If so, the limiter assumes that the while IOP proceeded + * and makes any accounting of it that's needed. + */ +typedef int l_iop_t(struct iop_stats *, struct bio *); + +/* + * Called when an I/O completes so the limiter can updates its + * accounting. Pending I/Os may complete in any order (even when + * sent to the hardware at the same time), so the limiter may not + * make any assumptions other than this I/O has completed. If it + * returns 1, then xpt_schedule() needs to be called again. 
+ */ +typedef int l_iodone_t(struct iop_stats *, struct bio *); + +static l_iop_t cam_iosched_qd_iop; +static l_iop_t cam_iosched_qd_caniop; +static l_iodone_t cam_iosched_qd_iodone; + +static l_init_t cam_iosched_iops_init; +static l_tick_t cam_iosched_iops_tick; +static l_iop_t cam_iosched_iops_caniop; +static l_iop_t cam_iosched_iops_iop; + +static l_init_t cam_iosched_bw_init; +static l_tick_t cam_iosched_bw_tick; +static l_iop_t cam_iosched_bw_caniop; +static l_iop_t cam_iosched_bw_iop; + +struct limswitch +{ + l_init_t *l_init; + l_tick_t *l_tick; + l_iop_t *l_iop; + l_iop_t *l_caniop; + l_iodone_t *l_iodone; +} limsw[] = +{ + { /* none */ + .l_init = NULL, + .l_tick = NULL, + .l_iop = NULL, + .l_iodone= NULL, + }, + { /* queue_depth */ + .l_init = NULL, + .l_tick = NULL, + .l_caniop = cam_iosched_qd_caniop, + .l_iop = cam_iosched_qd_iop, + .l_iodone= cam_iosched_qd_iodone, + }, + { /* iops */ + .l_init = cam_iosched_iops_init, + .l_tick = cam_iosched_iops_tick, + .l_caniop = cam_iosched_iops_caniop, + .l_iop = cam_iosched_iops_iop, + .l_iodone= NULL, + }, + { /* bandwidth */ + .l_init = cam_iosched_bw_init, + .l_tick = cam_iosched_bw_tick, + .l_caniop = cam_iosched_bw_caniop, + .l_iop = cam_iosched_bw_iop, + .l_iodone= NULL, + }, +}; + struct iop_stats { - sbintime_t data[IOP_MAX_TRAINING]; /* Data for training period */ - sbintime_t worst; /* estimate of worst case latency */ - int outliers; /* Number of outlier latency I/Os */ - int skipping; /* Skipping I/Os when < IOP_MAX_SKIP */ - int training; /* Training when < IOP_MAX_TRAINING */ + /* + * sysctl state for this subnode. + */ + struct sysctl_ctx_list sysctl_ctx; + struct sysctl_oid *sysctl_tree; + + /* + * Information about the current rate limiters, if any + */ + io_limiter limiter; /* How are I/Os being limited */ + int min; /* Low range of limit */ + int max; /* High range of limit */ + int current; /* Current rate limiter */ + int l_value1; /* per-limiter scratch value 1. */ + int l_value2; /* per-limiter scratch value 2. */ + + + /* + * Debug information about counts of I/Os that have gone through the + * scheduler. + */ + int pending; /* I/Os pending in the hardware */ + int queued; /* number currently in the queue */ + int total; /* Total for all time -- wraps */ + int in; /* number queued all time -- wraps */ + int out; /* number completed all time -- wraps */ + + /* + * Statistics on different bits of the process. + */ /* Exp Moving Average, alpha = 1 / (1 << alpha_bits) */ sbintime_t ema; sbintime_t emss; /* Exp Moving sum of the squares */ sbintime_t sd; /* Last computed sd */ + + struct cam_iosched_softc *softc; }; + + +typedef enum { + set_max = 0, /* current = max */ + read_latency, /* Steer read latency by throttling writes */ + cl_max /* Keep last */ +} control_type; + +static const char *cam_iosched_control_type_names[] = + { "set_max", "read_latency" }; + +struct control_loop +{ + /* + * sysctl state for this subnode. + */ + struct sysctl_ctx_list sysctl_ctx; + struct sysctl_oid *sysctl_tree; + + sbintime_t next_steer; /* Time of next steer */ + sbintime_t steer_interval; /* How often do we steer? */ + sbintime_t lolat; + sbintime_t hilat; + int alpha; + control_type type; /* What type of control? 
*/ + int last_count; /* Last I/O count */ + + struct cam_iosched_softc *softc; +}; + #endif struct cam_iosched_softc @@ -98,36 +253,375 @@ struct cam_iosched_softc uint32_t flags; int sort_io_queue; #ifdef CAM_NETFLIX_IOSCHED - /* Number of pending transactions */ - int pending_reads; - int pending_writes; - /* Have at least this many transactions in progress, if possible */ - int min_reads; - int min_writes; - /* Maximum number of each type of transaction in progress */ - int max_reads; - int max_writes; - - int trims; - int reads; - int writes; - int queued_reads; - int queued_writes; - int in_reads; - int in_writes; - int out_reads; - int out_writes; - - int read_bias; - int current_read_bias; + int read_bias; /* Read bias setting */ + int current_read_bias; /* Current read bias state */ + int total_ticks; struct bio_queue_head write_queue; struct iop_stats read_stats, write_stats, trim_stats; + struct sysctl_ctx_list sysctl_ctx; + struct sysctl_oid *sysctl_tree; + + int quanta; /* Number of quanta per second */ + struct callout ticker; /* Callout for our quota system */ + struct cam_periph *periph; /* cam periph associated with this device */ + uint32_t this_frac; /* Fraction of a second (1024ths) for this tick */ + sbintime_t last_time; /* Last time we ticked */ + struct control_loop cl; #endif }; +#ifdef CAM_NETFLIX_IOSCHED +/* + * helper functions to call the limsw functions. + */ +static int +cam_iosched_limiter_init(struct iop_stats *ios) +{ + int lim = ios->limiter; + + /* maybe this should be a kassert */ + if (lim < none || lim >= limiter_max) + return EINVAL; + + if (limsw[lim].l_init) + return limsw[lim].l_init(ios); + + return 0; +} + +static int +cam_iosched_limiter_tick(struct iop_stats *ios) +{ + int lim = ios->limiter; + + /* maybe this should be a kassert */ + if (lim < none || lim >= limiter_max) + return EINVAL; + + if (limsw[lim].l_tick) + return limsw[lim].l_tick(ios); + + return 0; +} + +static int +cam_iosched_limiter_iop(struct iop_stats *ios, struct bio *bp) +{ + int lim = ios->limiter; + + /* maybe this should be a kassert */ + if (lim < none || lim >= limiter_max) + return EINVAL; + + if (limsw[lim].l_iop) + return limsw[lim].l_iop(ios, bp); + + return 0; +} + +static int +cam_iosched_limiter_caniop(struct iop_stats *ios, struct bio *bp) +{ + int lim = ios->limiter; + + /* maybe this should be a kassert */ + if (lim < none || lim >= limiter_max) + return EINVAL; + + if (limsw[lim].l_caniop) + return limsw[lim].l_caniop(ios, bp); + + return 0; +} + +static int +cam_iosched_limiter_iodone(struct iop_stats *ios, struct bio *bp) +{ + int lim = ios->limiter; + + /* maybe this should be a kassert */ + if (lim < none || lim >= limiter_max) + return 0; + + if (limsw[lim].l_iodone) + return limsw[lim].l_iodone(ios, bp); + + return 0; +} + +/* + * Functions to implement the different kinds of limiters + */ + +static int +cam_iosched_qd_iop(struct iop_stats *ios, struct bio *bp) +{ + + if (ios->current <= 0 || ios->pending < ios->current) + return 0; + + return EAGAIN; +} + +static int +cam_iosched_qd_caniop(struct iop_stats *ios, struct bio *bp) +{ + + if (ios->current <= 0 || ios->pending < ios->current) + return 0; + + return EAGAIN; +} + +static int +cam_iosched_qd_iodone(struct iop_stats *ios, struct bio *bp) +{ + + if (ios->current <= 0 || ios->pending != ios->current) + return 0; + + return 1; +} + +static int +cam_iosched_iops_init(struct iop_stats *ios) +{ + + ios->l_value1 = ios->current / ios->softc->quanta; + if (ios->l_value1 <= 0) + ios->l_value1 = 1; + + 
return 0; +} + +static int +cam_iosched_iops_tick(struct iop_stats *ios) +{ + + ios->l_value1 = (int)((ios->current * (uint64_t)ios->softc->this_frac) >> 16); + if (ios->l_value1 <= 0) + ios->l_value1 = 1; + + return 0; +} + +static int +cam_iosched_iops_caniop(struct iop_stats *ios, struct bio *bp) +{ + + /* + * So if we have any more IOPs left, allow it, + * otherwise wait. + */ + if (ios->l_value1 <= 0) + return EAGAIN; + return 0; +} + +static int +cam_iosched_iops_iop(struct iop_stats *ios, struct bio *bp) +{ + int rv; + + rv = cam_iosched_limiter_caniop(ios, bp); + if (rv == 0) + ios->l_value1--; + + return rv; +} + +static int +cam_iosched_bw_init(struct iop_stats *ios) +{ + + /* ios->current is in kB/s, so scale to bytes */ + ios->l_value1 = ios->current * 1000 / ios->softc->quanta; + + return 0; +} + +static int +cam_iosched_bw_tick(struct iop_stats *ios) +{ + int bw; + + /* + * If we're in the hole for available quota from + * the last time, then add the quantum for this. + * If we have any left over from last quantum, + * then too bad, that's lost. Also, ios->current + * is in kB/s, so scale. + * + * We also allow up to 4 quanta of credits to + * accumulate to deal with burstiness. 4 is extremely + * arbitrary. + */ + bw = (int)((ios->current * 1000ull * (uint64_t)ios->softc->this_frac) >> 16); + if (ios->l_value1 < bw * 4) + ios->l_value1 += bw; + + return 0; +} + +static int +cam_iosched_bw_caniop(struct iop_stats *ios, struct bio *bp) +{ + /* + * So if we have any more bw quota left, allow it, + * otherwise wait. Not, we'll go negative and that's + * OK. We'll just get a lettle less next quota. + * + * Note on going negative: that allows us to process + * requests in order better, since we won't allow + * shorter reads to get around the long one that we + * don't have the quota to do just yet. It also prevents + * starvation by being a little more permissive about + * what we let through this quantum (to prevent the + * starvation), at the cost of getting a little less + * next quantum. + */ + if (ios->l_value1 <= 0) + return EAGAIN; + + + return 0; +} + +static int +cam_iosched_bw_iop(struct iop_stats *ios, struct bio *bp) +{ + int rv; + + rv = cam_iosched_limiter_caniop(ios, bp); + if (rv == 0) + ios->l_value1 -= bp->bio_length; + + return rv; +} + +static void cam_iosched_cl_maybe_steer(struct control_loop *clp); + +static void +cam_iosched_ticker(void *arg) +{ + struct cam_iosched_softc *isc = arg; + sbintime_t now, delta; + + callout_reset(&isc->ticker, hz / isc->quanta - 1, cam_iosched_ticker, isc); + + now = sbinuptime(); + delta = now - isc->last_time; + isc->this_frac = (uint32_t)delta >> 16; /* Note: discards seconds -- should be 0 harmless if not */ + isc->last_time = now; + + cam_iosched_cl_maybe_steer(&isc->cl); + + cam_iosched_limiter_tick(&isc->read_stats); + cam_iosched_limiter_tick(&isc->write_stats); + cam_iosched_limiter_tick(&isc->trim_stats); + + cam_iosched_schedule(isc, isc->periph); + + isc->total_ticks++; +} + + +static void +cam_iosched_cl_init(struct control_loop *clp, struct cam_iosched_softc *isc) +{ + + clp->next_steer = sbinuptime(); + clp->softc = isc; + clp->steer_interval = SBT_1S * 5; /* Let's start out steering every 5s */ + clp->lolat = 5 * SBT_1MS; + clp->hilat = 15 * SBT_1MS; + clp->alpha = 20; /* Alpha == gain. 
20 = .2 */ + clp->type = set_max; +} + +static void +cam_iosched_cl_maybe_steer(struct control_loop *clp) +{ + struct cam_iosched_softc *isc; + sbintime_t now, lat; + int old; + + isc = clp->softc; + now = isc->last_time; + if (now < clp->next_steer) + return; + + clp->next_steer = now + clp->steer_interval; + switch (clp->type) { + case set_max: + if (isc->write_stats.current != isc->write_stats.max) + printf("Steering write from %d kBps to %d kBps\n", + isc->write_stats.current, isc->write_stats.max); + isc->read_stats.current = isc->read_stats.max; + isc->write_stats.current = isc->write_stats.max; + isc->trim_stats.current = isc->trim_stats.max; + break; + case read_latency: + old = isc->write_stats.current; + lat = isc->read_stats.ema; + /* + * Simple PLL-like engine. Since we're steering to a range for + * the SP (set point) that makes things a little more + * complicated. In addition, we're not directly controlling our + * PV (process variable), the read latency, but instead are + * manipulating the write bandwidth limit for our MV + * (manipulation variable), analysis of this code gets a bit + * messy. Also, the MV is a very noisy control surface for read + * latency since it is affected by many hidden processes inside + * the device which change how responsive read latency will be + * in reaction to changes in write bandwidth. Unlike the classic + * boiler control PLL. this may result in over-steering while + * the SSD takes its time to react to the new, lower load. This + * is why we use a relatively low alpha of between .1 and .25 to + * compensate for this effect. At .1, it takes ~22 steering + * intervals to back off by a factor of 10. At .2 it only takes + * ~10. At .25 it only takes ~8. However some preliminary data + * from the SSD drives suggests a reasponse time in 10's of + * seconds before latency drops regardless of the new write + * rate. Careful observation will be reqiured to tune this + * effectively. + * + * Also, when there's no read traffic, we jack up the write + * limit too regardless of the last read latency. 10 is + * somewhat arbitrary. + */ + if (lat < clp->lolat || isc->read_stats.total - clp->last_count < 10) + isc->write_stats.current = isc->write_stats.current * + (100 + clp->alpha) / 100; /* Scale up */ + else if (lat > clp->hilat) + isc->write_stats.current = isc->write_stats.current * + (100 - clp->alpha) / 100; /* Scale down */ + clp->last_count = isc->read_stats.total; + + /* + * Even if we don't steer, per se, enforce the min/max limits as + * those may have changed. + */ + if (isc->write_stats.current < isc->write_stats.min) + isc->write_stats.current = isc->write_stats.min; + if (isc->write_stats.current > isc->write_stats.max) + isc->write_stats.current = isc->write_stats.max; + if (old != isc->write_stats.current) + printf("Steering write from %d kBps to %d kBps due to latency of %ldus\n", + old, isc->write_stats.current, + ((uint64_t)1000000 * (uint32_t)lat) >> 32); + break; + case cl_max: + break; + } *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
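For readers skimming the diff: the read_latency control loop added above steers the write bandwidth limit. Each steering interval it scales the limit up or down by alpha percent, depending on whether the read-latency EMA sits below the low watermark (or there has been almost no read traffic) or above the high watermark, and then clamps the result to the configured min/max. What follows is a minimal, stand-alone C sketch of just that steering step; the function name, the main() harness, and the sample bandwidth figures are illustrative only and not part of the committed code. The defaults it uses (alpha 20, i.e. a gain of 0.2, lolat 5ms, hilat 15ms) come from cam_iosched_cl_init() in the diff.

#include <stdio.h>
#include <stdint.h>

/*
 * Simplified model of the read_latency case in cam_iosched_cl_maybe_steer():
 * if read latency is under the low watermark (or there has been almost no
 * read traffic since the last steer), raise the write limit by alpha percent;
 * if it is over the high watermark, cut it by alpha percent; then clamp the
 * result to [min, max].
 */
static int
steer_write_limit(int current, int min, int max, int alpha,
    int64_t lat_us, int64_t lolat_us, int64_t hilat_us, int reads_since_last)
{
	if (lat_us < lolat_us || reads_since_last < 10)
		current = current * (100 + alpha) / 100;	/* scale up */
	else if (lat_us > hilat_us)
		current = current * (100 - alpha) / 100;	/* scale down */
	if (current < min)
		current = min;
	if (current > max)
		current = max;
	return (current);
}

int
main(void)
{
	/*
	 * alpha 20 (gain 0.2), lolat 5ms, hilat 15ms as in
	 * cam_iosched_cl_init(); the bandwidth figures are made up.
	 */
	int limit = 100000;		/* kB/s */

	limit = steer_write_limit(limit, 1000, 200000, 20, 20000, 5000, 15000, 500);
	printf("after one 20ms-latency interval: %d kB/s\n", limit);	/* 80000 */
	return (0);
}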