Date: Thu, 3 Feb 2011 19:50:42 +0000 (UTC)
From: Warner Losh <imp@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-projects@freebsd.org
Subject: svn commit: r218234 - projects/graid/head/sys/geom/raid
Message-ID: <201102031950.p13JogVH056403@svn.freebsd.org>
Author: imp
Date: Thu Feb  3 19:50:42 2011
New Revision: 218234
URL: http://svn.freebsd.org/changeset/base/218234

Log:
  Fix range locking in a number of ways.  Avoid deadlocks with multiple
  I/Os that were documented previously.  Also fix asymmetry between
  different parts of the code that caused rebuild hangs when a read
  request was pending.

  # this fixes the rebuild hangs for me, and should for doug also.
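The heart of the change is to give each range lock its own count of
overlapping in-flight I/Os and to recompute that count from the inflight
queue, rather than keeping one volume-wide counter.  As a rough
illustration only, here is a minimal userland sketch of that counting
scheme; range_lock, io_req and recompute_pending are simplified
stand-ins, not the kernel's g_raid structures:

/*
 * Sketch of the per-lock pending count.  range_lock and io_req are
 * illustrative stand-ins for struct g_raid_lock and struct bio.
 */
#include <sys/types.h>
#include <sys/queue.h>

struct io_req {
	off_t			offset;
	off_t			length;
	TAILQ_ENTRY(io_req)	link;
};
TAILQ_HEAD(io_queue, io_req);

struct range_lock {
	off_t	offset;
	off_t	length;
	int	pending;	/* in-flight I/Os overlapping this range */
};

/* Half-open interval overlap test, in the spirit of g_raid_bio_overlaps(). */
static int
overlaps(const struct io_req *req, off_t off, off_t len)
{
	return (req->offset < off + len && off < req->offset + req->length);
}

/*
 * Recount the overlapping in-flight I/Os from scratch.  Recomputing,
 * instead of incrementing and decrementing a shared counter, is what
 * removes the double-count deadlock described in the comment the diff
 * below deletes.
 */
static int
recompute_pending(struct range_lock *lp, struct io_queue *inflight)
{
	struct io_req *req;

	lp->pending = 0;
	TAILQ_FOREACH(req, inflight, link)
		if (overlaps(req, lp->offset, lp->length))
			lp->pending++;
	return (lp->pending);
}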
Modified:
  projects/graid/head/sys/geom/raid/g_raid.c
  projects/graid/head/sys/geom/raid/g_raid.h

Modified: projects/graid/head/sys/geom/raid/g_raid.c
==============================================================================
--- projects/graid/head/sys/geom/raid/g_raid.c	Thu Feb  3 19:27:08 2011	(r218233)
+++ projects/graid/head/sys/geom/raid/g_raid.c	Thu Feb  3 19:50:42 2011	(r218234)
@@ -76,6 +76,11 @@ static u_int g_raid_name_format = 0;
 TUNABLE_INT("kern.geom.raid.name_format", &g_raid_name_format);
 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, name_format, CTLFLAG_RW,
     &g_raid_name_format, 0, "Providers name format.");
+static u_int g_raid_idle_threshold = 1000000;
+TUNABLE_INT("kern.geom.raid.idle_threshold", &g_raid_idle_threshold);
+SYSCTL_UINT(_kern_geom_raid, OID_AUTO, idle_threshold, CTLFLAG_RW,
+    &g_raid_idle_threshold, 1000000,
+    "Time in microseconds to consider a volume idle.");
 
 #define	MSLEEP(rv, ident, mtx, priority, wmesg, timeout) do {		\
 	G_RAID_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));		\
@@ -882,6 +887,7 @@ g_raid_start_request(struct bio *bp)
 	 */
 	if (!(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL) &&
 	    g_raid_is_in_locked_range(vol, bp)) {
+		G_RAID_LOGREQ(3, bp, "Defer request.");
 		bioq_insert_tail(&vol->v_locked, bp);
 		return;
 	}
@@ -902,19 +908,52 @@ g_raid_start_request(struct bio *bp)
 	 * the transformation layer to start the I/O.
 	 */
 	bioq_insert_tail(&vol->v_inflight, bp);
+	G_RAID_LOGREQ(2, bp, "Request started");
 	G_RAID_TR_IOSTART(vol->v_tr, bp);
 }
 
+static void
+g_raid_finish_with_locked_ranges(struct g_raid_volume *vol, struct bio *bp)
+{
+	struct bio *nbp;
+	struct g_raid_lock *lp;
+	off_t off, len;
+
+	vol->v_pending_lock = 0;
+	LIST_FOREACH(lp, &vol->v_locks, l_next) {
+		if (lp->l_pending) {
+			off = lp->l_offset;
+			len = lp->l_length;
+			lp->l_pending = 0;
+			TAILQ_FOREACH(nbp, &vol->v_inflight.queue, bio_queue) {
+				if (g_raid_bio_overlaps(nbp, off, len))
+					lp->l_pending++;
+			}
+			if (lp->l_pending) {
+				vol->v_pending_lock = 1;
+				G_RAID_DEBUG(4,
+				    "Deferred lock(%jd, %jd) has %d pending",
+				    (intmax_t)off, (intmax_t)(off + len),
+				    lp->l_pending);
+				continue;
+			}
+			G_RAID_DEBUG(4,
+			    "Deferred lock of %jd to %jd completed",
+			    (intmax_t)off, (intmax_t)(off + len));
+			G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg);
+		}
+	}
+}
+
 void
 g_raid_iodone(struct bio *bp, int error)
 {
 	struct g_raid_softc *sc;
 	struct g_raid_volume *vol;
-	struct g_raid_lock *lp;
 
 	sc = bp->bio_to->geom->softc;
 	sx_assert(&sc->sc_lock, SX_LOCKED);
-
 	vol = bp->bio_to->private;
 	G_RAID_LOGREQ(3, bp, "Request done: %d.", error);
@@ -925,37 +964,8 @@ g_raid_iodone(struct bio *bp, int error)
 	}
 	bioq_remove(&vol->v_inflight, bp);
-	if (bp->bio_cmd == BIO_WRITE && vol->v_pending_lock &&
-	    g_raid_is_in_locked_range(vol, bp)) {
-		/*
-		 * XXX this structure forces serialization of all
-		 * XXX pending requests before any are allowed through.
-		 *
-		 * XXX Also, if there is a pending request that overlaps one
-		 * XXX locked area and another locked area comes along that also
-		 * XXX overlaps that area, we wind up double counting it, but
-		 * XXX not double uncounting it, so we hit deadlock. Ouch.
-		 * Most likely, we should add pending counts to struct
-		 * g_raid_lock and recompute v_pending_lock in lock_range()
-		 * and here, which would eliminate the doubel counting. Heck,
-		 * if we wanted to burn the cylces here, we could look at the
-		 * inflight queue and the v_locks and just recompute here.
-		 */
-		G_RAID_LOGREQ(3, bp,
-		    "Write to locking zone complete: %d writes outstanding",
-		    vol->v_pending_lock);
-		if (--vol->v_pending_lock == 0) {
-			G_RAID_LOGREQ(3, bp,
-			    "Last write done, calling pending callbacks.");
-			LIST_FOREACH(lp, &vol->v_locks,l_next) {
-				if (lp->l_flags & G_RAID_LOCK_PENDING) {
-					G_RAID_TR_LOCKED(vol->v_tr,
-					    lp->l_callback_arg);
-					lp->l_flags &= ~G_RAID_LOCK_PENDING;
-				}
-			}
-		}
-	}
+	if (vol->v_pending_lock && g_raid_is_in_locked_range(vol, bp))
+		g_raid_finish_with_locked_ranges(vol, bp);
 	g_io_deliver(bp, error);
 }
@@ -965,31 +975,32 @@ g_raid_lock_range(struct g_raid_volume *
 	struct g_raid_softc *sc;
 	struct g_raid_lock *lp;
 	struct bio *bp;
-	int pending;
 
 	sc = vol->v_softc;
 	lp = malloc(sizeof(*lp), M_RAID, M_WAITOK | M_ZERO);
 	LIST_INSERT_HEAD(&vol->v_locks, lp, l_next);
-	lp->l_flags |= G_RAID_LOCK_PENDING;
 	lp->l_offset = off;
 	lp->l_length = len;
 	lp->l_callback_arg = argp;
 
-	pending = 0;
+	lp->l_pending = 0;
 	TAILQ_FOREACH(bp, &vol->v_inflight.queue, bio_queue) {
 		if (g_raid_bio_overlaps(bp, off, len))
-			pending++;
+			lp->l_pending++;
 	}
 	/*
 	 * If there are any writes that are pending, we return EBUSY.  All
 	 * callers will have to wait until all pending writes clear.
 	 */
-	if (pending > 0) {
-		vol->v_pending_lock += pending;
+	if (lp->l_pending > 0) {
+		vol->v_pending_lock = 1;
+		G_RAID_DEBUG(4, "Locking range %jd to %jd deferred %d pend",
+		    (intmax_t)off, (intmax_t)(off+len), lp->l_pending);
 		return (EBUSY);
 	}
-	lp->l_flags &= ~G_RAID_LOCK_PENDING;
+	G_RAID_DEBUG(4, "Locking range %jd to %jd",
+	    (intmax_t)off, (intmax_t)(off+len));
 	G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg);
 	return (0);
 }
@@ -997,12 +1008,12 @@ g_raid_lock_range(struct g_raid_volume *
 int
 g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len)
 {
-	struct g_raid_lock *lp, *tmp;
+	struct g_raid_lock *lp;
 	struct g_raid_softc *sc;
 	struct bio *bp;
 
 	sc = vol->v_softc;
-	LIST_FOREACH_SAFE(lp, &vol->v_locks, l_next, tmp) {
+	LIST_FOREACH(lp, &vol->v_locks, l_next) {
 		if (lp->l_offset == off && lp->l_length == len) {
 			LIST_REMOVE(lp, l_next);
 			/* XXX
@@ -1011,9 +1022,10 @@ g_raid_unlock_range(struct g_raid_volume
 			 * locked ranges will go right back on this list
 			 * when the worker thread runs.
 			 * XXX
-			 * Also, see note above about deadlock and how it
-			 * doth sucketh...
 			 */
+			G_RAID_DEBUG(4, "Unlocked %jd to %jd",
+			    (intmax_t)lp->l_offset,
+			    (intmax_t)(lp->l_offset+lp->l_length));
 			mtx_lock(&sc->sc_queue_mtx);
 			while ((bp = bioq_takefirst(&vol->v_locked)) != NULL)
 				bioq_disksort(&sc->sc_queue, bp);
@@ -1164,10 +1176,17 @@ g_raid_worker(void *arg)
 		else if ((bp = bioq_takefirst(&sc->sc_queue)) != NULL)
 			;
 		else {
-			timeout = 1;
+			/*
+			 * Two steps to avoid overflows at HZ=1000 and
+			 * idle timeouts > 2.1s.  Some rounding errors
+			 * can occur, but they are < 1 tick, which is
+			 * deemed to be close enough for this purpose.
+			 */
+			int micpertic = 1000000 / hz;
+			timeout = g_raid_idle_threshold / micpertic;
 			sx_xunlock(&sc->sc_lock);
 			MSLEEP(rv, sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "-",
-			    timeout * hz);
+			    timeout);
 			sx_xlock(&sc->sc_lock);
 			goto process;
 		}
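The idle-timeout conversion in g_raid_worker() above divides 1000000 by
hz first and only then divides the threshold, rather than computing
threshold * hz / 1000000 in one step.  A small standalone program
showing the same arithmetic (hz and threshold here are local stand-ins
for the kernel's hz and g_raid_idle_threshold):

/*
 * Standalone illustration of the two-step microsecond-to-tick
 * conversion; hz and threshold are stand-ins for the kernel globals.
 */
#include <stdio.h>

int
main(void)
{
	int hz = 1000;				/* ticks per second */
	unsigned int threshold = 1000000;	/* microseconds */

	/*
	 * Dividing first keeps the intermediate values small, so the
	 * conversion never multiplies the threshold by hz; the result
	 * is accurate to within one tick.
	 */
	int micpertic = 1000000 / hz;
	int timeout = threshold / micpertic;

	printf("%u us -> %d ticks at hz=%d\n", threshold, timeout, hz);
	return (0);
}

In a single-step conversion, threshold * hz would exceed a 32-bit int
at HZ=1000 once the threshold passes roughly 2.1 seconds, which is the
overflow the comment in the diff refers to.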
Modified: projects/graid/head/sys/geom/raid/g_raid.h
==============================================================================
--- projects/graid/head/sys/geom/raid/g_raid.h	Thu Feb  3 19:27:08 2011	(r218233)
+++ projects/graid/head/sys/geom/raid/g_raid.h	Thu Feb  3 19:50:42 2011	(r218234)
@@ -99,12 +99,11 @@ extern struct g_class g_raid_class;
 #define	G_RAID_BIO_FLAG_SPECIAL \
 	(G_RAID_BIO_FLAG_SYNC|G_RAID_BIO_FLAG_REMAP)
 
-#define	G_RAID_LOCK_PENDING	0x1
 struct g_raid_lock {
 	off_t			 l_offset;
 	off_t			 l_length;
 	void			*l_callback_arg;
-	int			 l_flags;
+	int			 l_pending;
 	LIST_ENTRY(g_raid_lock)	 l_next;
 };
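To round out the picture, here is roughly how the deferred-grant flow
of g_raid_lock_range() and g_raid_iodone() fits together, continuing
the userland sketch given after the log message.  lock_range() and
io_done() are hypothetical helpers reduced to a single lock; the real
code walks the whole v_locks list in g_raid_finish_with_locked_ranges():

/*
 * Continuation of the earlier sketch (reuses range_lock, io_req,
 * io_queue and recompute_pending from it).  Not the g_raid API.
 */
#include <errno.h>

static int
lock_range(struct range_lock *lp, struct io_queue *inflight,
    void (*granted)(struct range_lock *))
{
	if (recompute_pending(lp, inflight) > 0)
		return (EBUSY);	/* grant deferred until overlaps drain */
	granted(lp);
	return (0);
}

static void
io_done(struct io_req *req, struct io_queue *inflight,
    struct range_lock *lp, void (*granted)(struct range_lock *))
{
	TAILQ_REMOVE(inflight, req, link);
	/* Recount from scratch rather than decrement a shared counter. */
	if (lp->pending != 0 && recompute_pending(lp, inflight) == 0)
		granted(lp);
}

Because io_done() recounts instead of decrementing, a request that
overlaps two locked ranges can no longer be counted twice but uncounted
once, which was the deadlock described in the removed comment.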