Date: Tue, 17 Jul 2007 21:09:25 GMT From: Ulf Lilleengen <lulf@FreeBSD.org> To: Perforce Change Reviews <perforce@FreeBSD.org> Subject: PERFORCE change 123662 for review Message-ID: <200707172109.l6HL9PMJ078780@repoman.freebsd.org>
next in thread | raw e-mail | index | archive | help
http://perforce.freebsd.org/chv.cgi?CH=123662 Change 123662 by lulf@lulf_carrot on 2007/07/17 21:08:43 - Initial implementation of growing RAID-5 arrays. This is done by splitting the offset calculation into one for read and one for write operations. We distinguish between subdisks that were added after the plex was no longer newborn and subdisks that were added at creation/tasting time. When a BIO write arrives, the write will go to the whole plex, but read operations will only be done on subdisks that do not have the GV_SD_GROW flag set. The drawback of this is that we must ensure that new subdisks are added at a later plexoffset (which we should force, to make it easier for us, since there is no good reason why the user should be able to set the plexoffset in this operation). The implementation will probably change a bit. - Add another state called RESIZING, and a flag called GV_PLEX_GROWING to indicate that a plex is in a growing operation. - Make sure obvious parts of the code respect this flag. This will need to be looked over more, though. Affected files ... .. //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum.h#21 edit .. //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_events.c#8 edit .. //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_init.c#14 edit .. //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_plex.c#17 edit .. //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_raid5.c#9 edit .. //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_share.c#3 edit .. //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_state.c#17 edit .. //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_subr.c#19 edit .. //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_var.h#22 edit Differences ... 
==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum.h#21 (text+ko) ==== @@ -132,6 +132,7 @@ void gv_init_request(struct gv_sd *, off_t, caddr_t, off_t); void gv_parity_request(struct gv_plex *, int, off_t); +int gv_grow_request(struct gv_plex *, off_t, off_t, int, caddr_t); void gv_parityop(struct gv_softc *, struct gctl_req *); #endif /* !_GEOM_VINUM_H_ */ ==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_events.c#8 (text+ko) ==== ==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_init.c#14 (text+ko) ==== @@ -42,6 +42,7 @@ static int gv_sync(struct gv_volume *); static int gv_rebuild_plex(struct gv_plex *); static int gv_init_plex(struct gv_plex *); +static int gv_grow_plex(struct gv_plex *); void gv_start_obj(struct g_geom *gp, struct gctl_req *req) @@ -115,6 +116,8 @@ else if (p->org == GV_PLEX_RAID5) { if (p->state == GV_PLEX_DEGRADED) error = gv_rebuild_plex(p); + else if (p->state == GV_PLEX_RESIZING) + error = gv_grow_plex(p); else error = gv_init_plex(p); } @@ -226,7 +229,8 @@ return (EBUSY);*/ if (p->flags & GV_PLEX_SYNCING || - p->flags & GV_PLEX_REBUILDING) + p->flags & GV_PLEX_REBUILDING || + p->flags & GV_PLEX_GROWING) return (EINPROGRESS); p->flags |= GV_PLEX_REBUILDING; p->synced = 0; @@ -236,6 +240,54 @@ } static int +gv_grow_plex(struct gv_plex *p) +{ + struct gv_volume *v; + struct gv_sd *s; + off_t origsize, origlength; + int error, sdcount; + + KASSERT(p != NULL, ("gv_grow_plex: NULL p")); + v = p->vol_sc; + KASSERT(v != NULL, ("gv_grow_plex: NULL v")); + + printf ("Start growing\n"); + if (p->flags & GV_PLEX_GROWING || + p->flags & GV_PLEX_SYNCING || + p->flags & GV_PLEX_REBUILDING) + return (EINPROGRESS); + if (p->state > GV_PLEX_RESIZING) + return (GV_ERR_INVSTATE); + g_topology_lock(); + error = gv_access(v->provider, 1, 1, 0); + g_topology_unlock(); + if (error) { + printf("VINUM: unable to access provider\n"); + return (GV_ERR_ISOPEN); /*XXX: wrong errorcode */ + } + + /* 
XXX: This routine with finding origsize is used two other places as + * well, so we should create a function for it. */ + sdcount = p->sdcount; + LIST_FOREACH(s, &p->subdisks, in_plex) { + if (s->flags & GV_SD_GROW) + sdcount--; + } + s = LIST_FIRST(&p->subdisks); + if (s == NULL) { + printf("VINUM: error growing plex without subdisks"); + return (GV_ERR_NOTFOUND); + } + p->flags |= GV_PLEX_GROWING; + origsize = (sdcount - 1) * s->size; + origlength = (sdcount - 1) * p->stripesize; + printf("Starting growing at 0 reading %jd bytes\n", origlength); + gv_grow_request(p, 0, MIN(origlength, origsize), BIO_READ, NULL); + + return (0); +} + +static int gv_init_plex(struct gv_plex *p) { struct gv_drive *d; ==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_plex.c#17 (text+ko) ==== @@ -47,6 +47,7 @@ struct gv_raid5_packet *); static void gv_parity_complete(struct gv_plex *, struct bio *); static void gv_rebuild_complete(struct gv_plex *, struct bio *); +static void gv_grow_complete(struct gv_plex *, struct bio *); static void gv_init_complete(struct gv_plex *, struct bio *); static struct bio * gv_plexbuffer(struct gv_plex *, struct bio *, caddr_t, off_t, off_t, int *); @@ -376,6 +377,8 @@ gv_rebuild_complete(p, pbp); } else if (pbp->bio_cflags & GV_BIO_INIT) { gv_init_complete(p, pbp); + } else if (p->state == GV_PLEX_RESIZING) { + gv_grow_complete(p, pbp); } else { g_io_deliver(pbp, pbp->bio_error); } @@ -535,6 +538,106 @@ } /* + * Finish handling of a bio to a growing plex. + */ +void +gv_grow_complete(struct gv_plex *p, struct bio *bp) +{ + struct gv_sd *s; + struct gv_volume *v; + off_t origsize, offset; + int sdcount, err; + + v = p->vol_sc; + err = 0; + + /* If it was a read, write it. */ + if (bp->bio_cmd == BIO_READ) { + printf("Finished read, do a write\n"); + err = gv_grow_request(p, bp->bio_offset, bp->bio_length, + BIO_WRITE, bp->bio_data); + /* If it was a write, read next. 
*/ + } else if (bp->bio_cmd == BIO_WRITE) { + if (bp->bio_cflags & GV_BIO_MALLOC) + g_free(bp->bio_data); + + /* Find the real size of the plex. */ + sdcount = p->sdcount; + LIST_FOREACH(s, &p->subdisks, in_plex) { + if (s->flags & GV_SD_GROW) + sdcount--; + } + s = LIST_FIRST(&p->subdisks); + /* XXX: hmm, nice way to solve it? */ + if (s == NULL) { + printf("VINUM: error growing plex without subdisks"); + return; + } + origsize = (s->size * (sdcount - 1)); + if (bp->bio_offset + bp->bio_length >= origsize) { + printf("VINUM: growing of %s completed\n", p->name); + p->flags &= ~GV_PLEX_GROWING; + printf("Updating state\n"); + LIST_FOREACH(s, &p->subdisks, in_plex) { + s->flags &= ~GV_SD_GROW; + gv_set_sd_state(s, GV_SD_UP, 0); + } + gv_set_plex_state(p, GV_PLEX_UP, 0); + g_topology_lock(); + gv_access(v->provider, -1, -1, 0); + g_topology_unlock(); + } else { + offset = bp->bio_offset + bp->bio_length; + printf("Issuing next bio read at 0x%jx\n", offset); + err = gv_grow_request(p, offset, + MIN(bp->bio_length, origsize - offset), + BIO_READ, NULL); + } + } + g_destroy_bio(bp); + + if (err) { + p->flags &= ~GV_PLEX_GROWING; + printf("VINUM: error growing plex: error code %d\n", err); + } +} + +/* + * Create a new bio struct for the next grow request. + */ +int +gv_grow_request(struct gv_plex *p, off_t offset, off_t length, int type, + caddr_t data) +{ + struct bio *bp; + + KASSERT(p != NULL, ("gv_grow_request: NULL p")); + bp = g_new_bio(); + if (bp == NULL) { + printf("VINUM: grow of %s failed creating bio: " + "out of memory\n", p->name); + return (ENOMEM); + } + + bp->bio_cmd = type; + bp->bio_done = gv_done; + bp->bio_error = 0; + bp->bio_offset = offset; + bp->bio_length = length; + if (data == NULL) + data = g_malloc(length, M_WAITOK); + bp->bio_cflags |= GV_BIO_MALLOC; + bp->bio_data = data; + + /* Send down. */ + printf("Sending bio: "); + g_print_bio(bp); + printf("\n"); + gv_plex_start(p, bp); + return (0); +} + +/* * Handle a finished initialization BIO. 
*/ static void ==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_raid5.c#9 (text+ko) ==== @@ -45,6 +45,8 @@ static int gv_raid5_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *, int *, int *); +static int gv_raid5_offset_read(struct gv_plex *, off_t, off_t, + off_t *, off_t *, int *, int *); static struct bio * gv_raid5_clone_bio(struct bio *, struct gv_sd *, struct gv_raid5_packet *, caddr_t, int); static int gv_raid5_request(struct gv_plex *, struct gv_raid5_packet *, @@ -341,8 +343,15 @@ type = REQ_TYPE_NORMAL; original = parity = broken = NULL; - gv_raid5_offset(p, boff, bcount, &real_off, &real_len, &sdno, &psdno); + /* Reads must take into account the growing plexes. */ + if (bp->bio_cmd == BIO_READ) + gv_raid5_offset_read(p, boff, bcount, &real_off, &real_len, + &sdno, &psdno); + else + gv_raid5_offset(p, boff, bcount, &real_off, &real_len, &sdno, + &psdno); + printf("Got sdno %d and psdno %d\n", sdno, psdno); /* Find the right subdisks. */ i = 0; LIST_FOREACH(s, &p->subdisks, in_plex) { @@ -526,6 +535,70 @@ return (0); } +/* + * Calculate the offsets in the various subdisks for a RAID5 request. Also take + * care of new subdisks in an expanded RAID5 array. + * XXX: This assumes that the new subdisks are inserted after the others (which + * is okay as long as plex_offset is larger). If subdisks are inserted into the + * plexlist before, we get problems. + */ +static int +gv_raid5_offset_read(struct gv_plex *p, off_t boff, off_t bcount, + off_t *real_off, off_t *real_len, int *sdno, int *psdno) +{ + struct gv_sd *s; + int sd, psd, sdcount; + off_t len_left, stripeend, stripeoff, stripestart; + + printf("In read we take into account new subdisks.\n"); + sdcount = p->sdcount; + LIST_FOREACH(s, &p->subdisks, in_plex) { + if (s->flags & GV_SD_GROW) { + printf("Decrease\n"); + sdcount--; + } + } + + /* The number of the subdisk containing the parity stripe. 
*/ + psd = sdcount - 1 - ( boff / (p->stripesize * (sdcount - 1))) % + sdcount; + KASSERT(psdno >= 0, ("gv_raid5_offset_read: psdno < 0")); + + /* Offset of the start address from the start of the stripe. */ + stripeoff = boff % (p->stripesize * (sdcount - 1)); + KASSERT(stripeoff >= 0, ("gv_raid5_offset_read: stripeoff < 0")); + + /* The number of the subdisk where the stripe resides. */ + sd = stripeoff / p->stripesize; + KASSERT(sdno >= 0, ("gv_raid5_offset_read: sdno < 0")); + + /* At or past parity subdisk. */ + if (sd >= psd) + sd++; + + /* The offset of the stripe on this subdisk. */ + stripestart = (boff - stripeoff) / (sdcount - 1); + KASSERT(stripestart >= 0, ("gv_raid5_offset_read: stripestart < 0")); + + stripeoff %= p->stripesize; + + /* The offset of the request on this subdisk. */ + *real_off = stripestart + stripeoff; + + stripeend = stripestart + p->stripesize; + len_left = stripeend - *real_off; + KASSERT(len_left >= 0, ("gv_raid5_offset_read: len_left < 0")); + + *real_len = (bcount <= len_left) ? bcount : len_left; + + if (sdno != NULL) + *sdno = sd; + if (psdno != NULL) + *psdno = psd; + + return (0); +} + /* Calculate the offsets in the various subdisks for a RAID5 request. */ static int gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off, ==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_share.c#3 (text+ko) ==== @@ -274,6 +274,8 @@ return (GV_PLEX_INITIALIZING); else if (!strcmp(buf, "degraded")) return (GV_PLEX_DEGRADED); + else if (!strcmp(buf, "resizing")) + return (GV_PLEX_RESIZING); else return (GV_PLEX_DOWN); } @@ -287,6 +289,8 @@ return "down"; case GV_PLEX_INITIALIZING: return "initializing"; + case GV_PLEX_RESIZING: + return "resizing"; case GV_PLEX_DEGRADED: return "degraded"; case GV_PLEX_UP: ==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_state.c#17 (text+ko) ==== @@ -419,8 +419,12 @@ /* First, check the state of our subdisks. 
*/ sdstates = gv_sdstatemap(p); + /* If we're growing, don't change the state. */ + if (p->flags & GV_PLEX_GROWING) + p->state = GV_PLEX_RESIZING; + /* If all subdisks are up, our plex can be up, too. */ - if (sdstates == GV_SD_UPSTATE) + else if (sdstates == GV_SD_UPSTATE) p->state = GV_PLEX_UP; /* One or more of our subdisks are down. */ ==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_subr.c#19 (text+ko) ==== @@ -163,6 +163,8 @@ * needed here (on-disk config parsing). */ s->flags &= ~GV_SD_NEWBORN; + s->flags &= ~GV_SD_GROW; + printf("S-flags is now: %d\n", s->flags); } } } @@ -333,6 +335,13 @@ } p->sddetached--; } else { + if ((p->org == GV_PLEX_RAID5 || + p->org == GV_PLEX_STRIPED) && + !(p->flags & GV_PLEX_NEWBORN)) { + printf("Adding to a running plex, must add grow-flag to" + " sd and plex\n"); + s->flags |= GV_SD_GROW; + } p->sdcount++; } @@ -451,8 +460,8 @@ } } - /* Trim subdisk sizes so that they match the stripe size. */ LIST_FOREACH(s, &p->subdisks, in_plex) { + /* Trim subdisk sizes to match the stripe size. */ remainder = s->size % p->stripesize; if (remainder) { printf("VINUM: size of sd %s is not a " @@ -473,6 +482,14 @@ gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE); p->flags &= ~GV_PLEX_ADDED; gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE); + } else { + LIST_FOREACH(s, &p->subdisks, in_plex) { + if (s->flags & GV_SD_GROW) { + printf("Setting state\n"); + p->state = GV_PLEX_RESIZING; + break; + } + } } /* Our plex is grown up now. 
*/ p->flags &= ~GV_PLEX_NEWBORN; @@ -1075,7 +1092,7 @@ gv_attach_sd(struct gv_sd *s, struct gv_plex *p, off_t offset, int rename) { struct gv_sd *s2; - int error; + int error, sdcount; g_topology_assert(); @@ -1099,6 +1116,7 @@ s->plex_offset = offset; strlcpy(s->plex, p->name, GV_MAXPLEXNAME); + sdcount = p->sdcount; error = gv_sd_to_plex(s, p); if (error) return (error); ==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_var.h#22 (text+ko) ==== @@ -288,6 +288,7 @@ #define GV_SD_NEWBORN 0x01 /* Subdisk is created by user. */ #define GV_SD_TASTED 0x02 /* Subdisk is created during taste. */ #define GV_SD_CANGOUP 0x04 /* Subdisk can go up immediately. */ +#define GV_SD_GROW 0x08 /* Subdisk is added to striped plex. */ char drive[GV_MAXDRIVENAME]; /* Name of underlying drive. */ char plex[GV_MAXPLEXNAME]; /* Name of associated plex. */ @@ -309,8 +310,9 @@ int state; /* The plex state. */ #define GV_PLEX_DOWN 0 #define GV_PLEX_INITIALIZING 1 -#define GV_PLEX_DEGRADED 2 -#define GV_PLEX_UP 3 +#define GV_PLEX_RESIZING 2 +#define GV_PLEX_DEGRADED 3 +#define GV_PLEX_UP 4 int org; /* The plex organisation. */ #define GV_PLEX_DISORG 0 @@ -334,6 +336,7 @@ #define GV_PLEX_THREAD_DEAD 0x10 /* The RAID5 thread has died. */ #define GV_PLEX_NEWBORN 0x20 /* The plex was just created. */ #define GV_PLEX_REBUILDING 0x40 /* The plex is rebuilding. */ +#define GV_PLEX_GROWING 0x80 /* The plex is growing. */ off_t synced; /* Count of synced bytes. */
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200707172109.l6HL9PMJ078780>
