Date: Sat, 13 Sep 2008 22:47:52 +0300 From: Giorgos Keramidas <keramida@freebsd.org> To: Julian Elischer <julian@elischer.org> Cc: freebsd-current@freebsd.org, Robert Watson <rwatson@freebsd.org> Subject: Re: panic in rt_check_fib() Message-ID: <874p4ju8t3.fsf@kobe.laptop> In-Reply-To: <48CC14AD.4090708@elischer.org> (Julian Elischer's message of "Sat, 13 Sep 2008 12:29:49 -0700") References: <87prnjh80z.fsf@kobe.laptop> <alpine.BSF.1.10.0809131105280.55411@fledge.watson.org> <48CC14AD.4090708@elischer.org>
next in thread | previous in thread | raw e-mail | index | archive | help
--=-=-= Content-Transfer-Encoding: quoted-printable On Sat, 13 Sep 2008 12:29:49 -0700, Julian Elischer <julian@elischer.org> w= rote: > Robert Watson wrote: >> >> On Fri, 5 Sep 2008, Giorgos Keramidas wrote: >> >>> A kernel that I built last night to test Ed's "packet mode" for ptys >>> included all the changes up to 182743 panics with: >> >> I had an identical panic on 7-STABLE last night: > > I have a patch for this that i have had out for review for s while... > it's a replacement rt_check_fib function.. As it happens, I just build a kernel with the new rt_check_fib() function with the patched code. I'm not sure it fixes the panic, but it didn't compile at all without s/RT_FREE/RTFREE/ in a couple of places, so I'll give it a try but a more thorough review than I can give is needed. The original diff was: %%% diff -r ef8e7f2fc284 sys/net/route.c =2D-- a/sys/net/route.c Fri Sep 12 02:12:33 2008 +0300 +++ b/sys/net/route.c Sat Sep 13 22:40:53 2008 +0300 @@ -1676,19 +1676,31 @@ * *lrt0 points to the cached route to the final destination; * *lrt is not meaningful; * fibnum is the index to the correct network fib for this packet + * (*lrt0 has not ref held on it so REMREF is not needed ) * * =3D=3D=3D Operation =3D=3D=3D * If the route is marked down try to find a new route. If the route * to the gateway is gone, try to setup a new route. Otherwise, * if the route is marked for packets to be rejected, enforce that. + * Note that rtalloc returns an rtentry with an extra REF that we need to = lose. * * =3D=3D=3D On return =3D=3D=3D * *dst is unchanged; * *lrt0 points to the (possibly new) route to the final destination =2D * *lrt points to the route to the next hop + * *lrt points to the route to the next hop [LOCKED] * * Their values are meaningful ONLY if no error is returned. + * + * To follow this you have to remember that: + * RT_REMREF reduces the reference count by 1 but doesn't check it for 0 (= !) + * RTFREE_LOCKED includes an RT_REMREF (or an rtfree if refs =3D=3D 1) + * and an RT_UNLOCK + * RTFREE does an RT_LOCK and an RTFREE_LOCKED + * The gwroute pointer counts as a reference on the rtentry to which it po= ints. + * so when we add it we use the ref that rtalloc gives us and when we lose= it + * we need to remove the reference. */ + int rt_check(struct rtentry **lrt, struct rtentry **lrt0, struct sockaddr *dst) { @@ -1704,55 +1716,79 @@ int error; =20 KASSERT(*lrt0 !=3D NULL, ("rt_check")); =2D rt =3D rt0 =3D *lrt0; + rt0 =3D *lrt0; + rt =3D NULL; =20 /* NB: the locking here is tortuous... */ =2D RT_LOCK(rt); =2D if ((rt->rt_flags & RTF_UP) =3D=3D 0) { =2D RT_UNLOCK(rt); =2D rt =3D rtalloc1_fib(dst, 1, 0UL, fibnum); =2D if (rt !=3D NULL) { =2D RT_REMREF(rt); =2D /* XXX what about if change? */ =2D } else + RT_LOCK(rt0); +retry: + if (rt0 && (rt0->rt_flags & RTF_UP) =3D=3D 0) { + /* Current rt0 is useless, try get a replacement. */ + RT_UNLOCK(rt0); + rt0 =3D NULL; + } + if (rt0 =3D=3D NULL) { + rt0 =3D rtalloc1_fib(dst, 1, 0UL, fibnum); + if (rt0 =3D=3D NULL) { return (EHOSTUNREACH); =2D rt0 =3D rt; + } + RT_REMREF(rt0); /* don't need the reference. */ } =2D /* XXX BSD/OS checks dst->sa_family !=3D AF_NS */ =2D if (rt->rt_flags & RTF_GATEWAY) { =2D if (rt->rt_gwroute =3D=3D NULL) =2D goto lookup; =2D rt =3D rt->rt_gwroute; =2D RT_LOCK(rt); /* NB: gwroute */ =2D if ((rt->rt_flags & RTF_UP) =3D=3D 0) { =2D RTFREE_LOCKED(rt); /* unlock gwroute */ =2D rt =3D rt0; =2D rt0->rt_gwroute =3D NULL; =2D lookup: =2D RT_UNLOCK(rt0); =2D/* XXX MRT link level looked up in table 0 */ =2D rt =3D rtalloc1_fib(rt->rt_gateway, 1, 0UL, 0); =2D if (rt =3D=3D rt0) { =2D RT_REMREF(rt0); =2D RT_UNLOCK(rt0); + + if (rt0->rt_flags & RTF_GATEWAY) { + if ((rt =3D rt0->rt_gwroute) !=3D NULL) { + RT_LOCK(rt); /* NB: gwroute */ + if ((rt->rt_flags & RTF_UP) =3D=3D 0) { + /* gw route is dud. ignore/lose it */ + RTFREE_LOCKED(rt); /* unref (&unlock) gwroute */ + rt =3D rt0->rt_gwroute =3D NULL; + } + } + if (rt =3D=3D NULL) { /* NOT AN ELSE CLAUSE */ + RT_TEMP_UNLOCK(rt0); /* MUST return to undo this */ + rt =3D rtalloc1_fib(rt0->rt_gateway, 1, 0UL, fibnum); + if ((rt =3D=3D rt0) || (rt =3D=3D NULL)) { + /* the best we can do is not good enough */ + if (rt) { + RT_REMREF(rt); /* assumes ref > 0 */ + RT_UNLOCK(rt); + } + RT_FREE(rt0); /* lock, unref, (unlock) */ return (ENETUNREACH); } =2D RT_LOCK(rt0); =2D if (rt0->rt_gwroute !=3D NULL) =2D RTFREE(rt0->rt_gwroute); =2D rt0->rt_gwroute =3D rt; =2D if (rt =3D=3D NULL) { =2D RT_UNLOCK(rt0); =2D return (EHOSTUNREACH); + /* + * Relock it and lose the added reference. + * All sorts of things could have happenned while we + * had no lock on it, so check for them. + */ + RT_RELOCK(rt0); + if (rt0 =3D=3D NULL || ((rt0->rt_flags & RTF_UP) =3D=3D 0)) + /* Ru-roh.. what we had is no longer any good */ + goto retry; + /* + * While we were away, someone replaced the gateway. + * Since a reference count is involved we can't just + * overwrite it. + */ + if (rt0->rt_gwroute) { + if (rt0->rt_gwroute !=3D rt) { + RT_FREE_LOCKED(rt); + goto retry; + } + } else { + rt0->rt_gwroute =3D rt; } } + RT_LOCK_ASSERT(rt); RT_UNLOCK(rt0); + } else { + /* think of rt as having the lock from now on.. */ + rt =3D rt0; } /* XXX why are we inspecting rmx_expire? */ =2D error =3D (rt->rt_flags & RTF_REJECT) && =2D (rt->rt_rmx.rmx_expire =3D=3D 0 || =2D time_uptime < rt->rt_rmx.rmx_expire); =2D if (error) { + if ((rt->rt_flags & RTF_REJECT) && + (rt->rt_rmx.rmx_expire =3D=3D 0 || + time_uptime < rt->rt_rmx.rmx_expire)) { RT_UNLOCK(rt); return (rt =3D=3D rt0 ? EHOSTDOWN : EHOSTUNREACH); } diff -r ef8e7f2fc284 sys/net/route.h =2D-- a/sys/net/route.h Fri Sep 12 02:12:33 2008 +0300 +++ b/sys/net/route.h Sat Sep 13 22:40:53 2008 +0300 @@ -315,6 +315,22 @@ (_rt)->rt_refcnt--; \ } while (0) =20 +#define RT_TEMP_UNLOCK(_rt) do { \ + RT_ADDREF(_rt); \ + RT_UNLOCK(_rt); \ +} while (0) + +#define RT_RELOCK(_rt) do { \ + RT_LOCK(_rt) \ + if ((_rt)->rt_refcnt <=3D 1) \ + rtfree(_rt); \ + _rt =3D 0; /* signal that it went away */ \ + else { \ + RT_REMREF(_rt); \ + /* note that _rt is still valid */ \ + } \ +} while (0) + #define RTFREE_LOCKED(_rt) do { \ if ((_rt)->rt_refcnt <=3D 1) \ rtfree(_rt); \ %%% The two places I had to edit after applying the patch are lines 1756 and 1755: 1756 RT_FREE(rt0); /* lock, unref, (unlock)= */ 1775 RT_FREE_LOCKED(rt); The 'fix' was trivial (s/RT_FREE/RTFREE/) but I haven't booted into the new kernel yet. BRB in a few minutes :) --=-=-= Content-Type: application/pgp-signature -----BEGIN PGP SIGNATURE----- Version: GnuPG v2.0.9 (FreeBSD) iEYEARECAAYFAkjMGPAACgkQ1g+UGjGGA7Zl0ACgiUst7jozBZMSW06FBVNPqb6X yEEAmwTZbZwksVWFovDw9HBSg/s2mMda =ieqp -----END PGP SIGNATURE----- --=-=-=--
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?874p4ju8t3.fsf>