Date: Wed, 9 Feb 2011 15:40:13 +0000 (UTC)
From: Alexander Motin <mav@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-projects@freebsd.org
Subject: svn commit: r218486 - projects/graid/head/sys/geom/raid
Message-ID: <201102091540.p19FeDee088918@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: mav
Date: Wed Feb 9 15:40:13 2011
New Revision: 218486
URL: http://svn.freebsd.org/changeset/base/218486

Log:
  Implement more advanced algorithm for choosing disk to read from RAID1.
  General idea is the same as in gmirror balance algorithm. Take into account:
  subdisk state, running error recovery, average disk load, head position and
  possible cache hits.

Modified:
  projects/graid/head/sys/geom/raid/g_raid.c
  projects/graid/head/sys/geom/raid/g_raid.h
  projects/graid/head/sys/geom/raid/tr_raid1.c

Modified: projects/graid/head/sys/geom/raid/g_raid.c
==============================================================================
--- projects/graid/head/sys/geom/raid/g_raid.c Wed Feb 9 15:33:13 2011 (r218485)
+++ projects/graid/head/sys/geom/raid/g_raid.c Wed Feb 9 15:40:13 2011 (r218486)
@@ -1074,7 +1074,7 @@ void
 g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp)
 {
 	struct g_consumer *cp;
-	struct g_raid_disk *disk;
+	struct g_raid_disk *disk, *tdisk;
 
 	bp->bio_caller1 = sd;
 
@@ -1104,6 +1104,17 @@ nodisk:
 	bp->bio_from = cp;
 	bp->bio_to = cp->provider;
 	cp->index++;
+
+	/* Update average disks load. */
+	TAILQ_FOREACH(tdisk, &sd->sd_softc->sc_disks, d_next) {
+		if (tdisk->d_consumer == NULL)
+			tdisk->d_load = 0;
+		else
+			tdisk->d_load = (tdisk->d_consumer->index *
+			    G_RAID_SUBDISK_LOAD_SCALE + tdisk->d_load * 7) / 8;
+	}
+
+	disk->d_last_offset = bp->bio_offset + bp->bio_length;
 	if (dumping) {
 		G_RAID_LOGREQ(3, bp, "Sending dumping request.");
 		if (bp->bio_cmd == BIO_WRITE) {

Modified: projects/graid/head/sys/geom/raid/g_raid.h
==============================================================================
--- projects/graid/head/sys/geom/raid/g_raid.h Wed Feb 9 15:33:13 2011 (r218485)
+++ projects/graid/head/sys/geom/raid/g_raid.h Wed Feb 9 15:40:13 2011 (r218486)
@@ -147,8 +147,8 @@ struct g_raid_disk {
 	struct g_consumer	*d_consumer;	/* GEOM disk consumer. */
 	void			*d_md_data;	/* Disk's metadata storage. */
 	struct g_kerneldump	 d_kd;		/* Kernel dumping method/args. */
-	u_int			 d_state;	/* Disk state. */
 	uint64_t		 d_flags;	/* Additional flags. */
+	u_int			 d_state;	/* Disk state. */
 	u_int			 d_load;	/* Disk average load. */
 	off_t			 d_last_offset;	/* Last head offset. */
 	TAILQ_HEAD(, g_raid_subdisk)	 d_subdisks; /* List of subdisks. */
@@ -169,6 +169,13 @@ struct g_raid_disk {
 #define G_RAID_SUBDISK_E_DISCONNECTED	0x03	/* A subdisk removed from volume. */
 #define G_RAID_SUBDISK_E_FIRST_TR_PRIVATE	0x80	/* translation private events */
 
+#define G_RAID_SUBDISK_POS(sd)						\
+	((sd)->sd_disk ? ((sd)->sd_disk->d_last_offset - (sd)->sd_offset) : 0)
+#define G_RAID_SUBDISK_TRACK_SIZE	(1 * 1024 * 1024)
+#define G_RAID_SUBDISK_LOAD(sd)						\
+	((sd)->sd_disk ? ((sd)->sd_disk->d_load) : 0)
+#define G_RAID_SUBDISK_LOAD_SCALE	256
+
 struct g_raid_subdisk {
 	struct g_raid_softc	*sd_softc;	/* Back-pointer to softc. */
 	struct g_raid_disk	*sd_disk;	/* Where this subdisk lives. */
@@ -179,6 +186,7 @@ struct g_raid_subdisk {
 	u_int			 sd_state;	/* Subdisk state. */
 	off_t			 sd_rebuild_pos; /* Rebuild position. */
 	int			 sd_read_errs;	/* Count of the read errors */
+	int			 sd_recovery;	/* Count of recovery reqs. */
 	TAILQ_ENTRY(g_raid_subdisk)	 sd_next; /* Next subdisk on disk. */
 };
 

Modified: projects/graid/head/sys/geom/raid/tr_raid1.c
==============================================================================
--- projects/graid/head/sys/geom/raid/tr_raid1.c Wed Feb 9 15:33:13 2011 (r218485)
+++ projects/graid/head/sys/geom/raid/tr_raid1.c Wed Feb 9 15:40:13 2011 (r218486)
@@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/endian.h>
 #include <sys/kernel.h>
 #include <sys/kobj.h>
+#include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
@@ -490,22 +491,43 @@ g_raid_tr_stop_raid1(struct g_raid_tr_ob
 }
 
 /*
- * Select the disk to do the reads to. For now, we just pick the first one in
- * the list that's active always. This ensures we favor one disk on boot, and
- * have more deterministic recovery from the weird edge cases of power
- * failure. In the future, we can imagine policies that go for the least
- * loaded disk to improve performance, or we need to limit reads to a disk
- * during some kind of error recovery with that disk.
+ * Select the disk to read from. Take into account: subdisk state, running
+ * error recovery, average disk load, head position and possible cache hits.
  */
+#define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
 static struct g_raid_subdisk *
-g_raid_tr_raid1_select_read_disk(struct g_raid_volume *vol)
+g_raid_tr_raid1_select_read_disk(struct g_raid_volume *vol, struct bio *bp)
 {
-	int i;
+	struct g_raid_subdisk *sd, *best;
+	int i, prio, bestprio;
 
-	for (i = 0; i < vol->v_disks_count; i++)
-		if (vol->v_subdisks[i].sd_state == G_RAID_SUBDISK_S_ACTIVE)
-			return (&vol->v_subdisks[i]);
-	return (NULL);
+	best = NULL;
+	bestprio = INT_MAX;
+	for (i = 0; i < vol->v_disks_count; i++) {
+		sd = &vol->v_subdisks[i];
+		if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE &&
+		    !((sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
+		       sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
+		      bp->bio_offset + bp->bio_length <
+		       sd->sd_rebuild_pos))
+			continue;
+		prio = G_RAID_SUBDISK_LOAD(sd);
+		prio += min(sd->sd_recovery, 255) << 22;
+		prio += (G_RAID_SUBDISK_S_ACTIVE - sd->sd_state) << 16;
+		/* If disk head is precisely in position - highly prefer it. */
+		if (G_RAID_SUBDISK_POS(sd) == bp->bio_offset)
+			prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
+		else
+		/* If disk head is close to position - prefer it. */
+		if (ABS(G_RAID_SUBDISK_POS(sd) - bp->bio_offset) <
+		    G_RAID_SUBDISK_TRACK_SIZE)
+			prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
+		if (prio < bestprio) {
+			best = sd;
+			bestprio = prio;
+		}
+	}
+	return (best);
 }
 
 static void
@@ -514,7 +536,7 @@ g_raid_tr_iostart_raid1_read(struct g_ra
 	struct g_raid_subdisk *sd;
 	struct bio *cbp;
 
-	sd = g_raid_tr_raid1_select_read_disk(tr->tro_volume);
+	sd = g_raid_tr_raid1_select_read_disk(tr->tro_volume, bp);
 	KASSERT(sd != NULL, ("No active disks in volume %s.",
 	    tr->tro_volume->v_name));
 
@@ -832,6 +854,7 @@ rebuild_round_done:
 				break;
 			G_RAID_LOGREQ(2, cbp, "Retrying read");
 			if (pbp->bio_children == 2 && do_write) {
+				sd->sd_recovery++;
 				cbp->bio_caller1 = nsd;
 				pbp->bio_pflags = G_RAID_BIO_FLAG_LOCKED;
 				/* Lock callback starts I/O */
@@ -892,6 +915,10 @@ rebuild_round_done:
 			g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 			bp->bio_error = 0;
 		}
+		if (pbp->bio_driver1 != NULL) {
+			((struct g_raid_subdisk *)pbp->bio_driver1)
+			    ->sd_recovery--;
+		}
 		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
 		g_raid_unlock_range(sd->sd_volume, bp->bio_offset,
 		    bp->bio_length);
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201102091540.p19FeDee088918>