Date: Thu, 9 May 2013 10:20:46 +0300 From: Konstantin Belousov <kostikbel@gmail.com> To: Peter Holm <peter@holm.cc> Cc: freebsd-stable@freebsd.org Subject: Re: Nullfs leaks i-nodes Message-ID: <20130509072046.GN3047@kib.kiev.ua> In-Reply-To: <20130509070256.GA15884@x2.osted.lan> References: <B799E3B928B18B9E6C68F912@[172.16.2.62]> <20130508091317.GJ3047@kib.kiev.ua> <20130509070256.GA15884@x2.osted.lan>
next in thread | previous in thread | raw e-mail | index | archive | help
--CzJ9BTjEYwHt6w7v Content-Type: text/plain; charset=us-ascii Content-Disposition: inline Content-Transfer-Encoding: quoted-printable On Thu, May 09, 2013 at 09:02:56AM +0200, Peter Holm wrote: > On Wed, May 08, 2013 at 12:13:17PM +0300, Konstantin Belousov wrote: > > On Tue, May 07, 2013 at 08:30:06AM +0200, G??ran L??wkrantz wrote: > > > I created a PR, kern/178238, on this but would like to know if anyone= has=20 > > > any ideas or patches? > > >=20 > > > Have updated the system where I see this to FreeBSD 9.1-STABLE #0 r25= 0229=20 > > > and still have the problem. > >=20 > > The patch below should fix the issue for you, at least it did so in my > > limited testing. > >=20 > > What is does: > > 1. When inactivating a nullfs vnode, check if the lower vnode is > > unlinked, and reclaim upper vnode if so. [This fixes your case]. > >=20 > > 2. Besides a callback to the upper filesystems for the lower vnode > > reclaimation, it also calls the upper fs for lower vnode unlink. > > This allows nullfs to purge cached vnodes for the unlinked lower. > > [This fixes an opposite case, when the vnode is removed from the > > lower mount, but upper aliases prevent the vnode from being > > recycled]. > >=20 > > 3. Fix a wart which existed from the introduction of the nullfs caching, > > do not unlock lower vnode in the nullfs_reclaim_lowervp(). It should > > be completely innocent, but now it is also formally safe. > >=20 > > 4. Fix vnode reference leak in nullfs_reclaim_lowervp(). > >=20 > > Please note that the patch is basically not tested, I only verified your > > scenario and a mirror of it as described in item 2. > >=20 > > diff --git a/sys/fs/nullfs/null.h b/sys/fs/nullfs/null.h > > index 4f37020..a624be6 100644 > > --- a/sys/fs/nullfs/null.h >=20 > The page fault seen in fifo_close() seems unrelated to this patch, > which I will continue testing some more. >=20 > The scenario triggering the page fault is the "rm": >=20 > mdconfig -a -t swap -s 1g -u 5 > bsdlabel -w md5 auto > newfs -U md5a > mount /dev/md5a /mnt > mount -t nullfs /mnt /mnt2 > mkfifo /mnt2/fifo > rm /mnt/fifo Yeah, I figured this out from the backtrace. The panic in kostik563 is related and directly caused by the patch, since patch erronously performs vgone() on the active (removed) vnode. As result, a spurious VOP_CLOSE() call is performed on the vnode, which is more or less innocent for regular files, but fatal for fifos and probably nfsv4 as well. Updated patch is below. >=20 > Not a problem on 8.3-STABLE r247938. Yes, new nullfs is only in HEAD and stable/9. diff --git a/sys/fs/nullfs/null.h b/sys/fs/nullfs/null.h index 4f37020..0972dfc 100644 --- a/sys/fs/nullfs/null.h +++ b/sys/fs/nullfs/null.h @@ -53,8 +53,12 @@ struct null_node { LIST_ENTRY(null_node) null_hash; /* Hash list */ struct vnode *null_lowervp; /* VREFed once */ struct vnode *null_vnode; /* Back pointer */ + u_int null_flags; }; =20 +#define NULLV_NOUNLOCK 0x0001 +#define NULLV_DROP 0x0002 + #define MOUNTTONULLMOUNT(mp) ((struct null_mount *)((mp)->mnt_data)) #define VTONULL(vp) ((struct null_node *)(vp)->v_data) #define NULLTOV(xp) ((xp)->null_vnode) diff --git a/sys/fs/nullfs/null_vfsops.c b/sys/fs/nullfs/null_vfsops.c index 02932bd..ad02236 100644 --- a/sys/fs/nullfs/null_vfsops.c +++ b/sys/fs/nullfs/null_vfsops.c @@ -65,7 +65,6 @@ static vfs_statfs_t nullfs_statfs; static vfs_unmount_t nullfs_unmount; static vfs_vget_t nullfs_vget; static vfs_extattrctl_t nullfs_extattrctl; -static vfs_reclaim_lowervp_t nullfs_reclaim_lowervp; =20 /* * Mount null layer @@ -391,8 +390,37 @@ nullfs_reclaim_lowervp(struct mount *mp, struct vnode = *lowervp) vp =3D null_hashget(mp, lowervp); if (vp =3D=3D NULL) return; + VTONULL(vp)->null_flags |=3D NULLV_NOUNLOCK; vgone(vp); - vn_lock(lowervp, LK_EXCLUSIVE | LK_RETRY); + vput(vp); +} + +static void +nullfs_unlink_lowervp(struct mount *mp, struct vnode *lowervp) +{ + struct vnode *vp; + struct null_node *xp; + + vp =3D null_hashget(mp, lowervp); + if (vp =3D=3D NULL) + return; + xp =3D VTONULL(vp); + xp->null_flags |=3D NULLV_DROP | NULLV_NOUNLOCK; + vhold(vp); + vunref(vp); + + /* + * If vunref() dropped the last use reference on the nullfs + * vnode, it must be reclaimed, and its lock was split from + * the lower vnode lock. Need to do extra unlock before + * allowing the final vdrop() to free the vnode. + */ + if (vp->v_usecount =3D=3D 0) { + KASSERT((vp->v_iflag & VI_DOOMED) !=3D 0, + ("not reclaimed %p", vp)); + VOP_UNLOCK(vp, 0); + } + vdrop(vp); } =20 static struct vfsops null_vfsops =3D { @@ -408,6 +436,7 @@ static struct vfsops null_vfsops =3D { .vfs_unmount =3D nullfs_unmount, .vfs_vget =3D nullfs_vget, .vfs_reclaim_lowervp =3D nullfs_reclaim_lowervp, + .vfs_unlink_lowervp =3D nullfs_unlink_lowervp, }; =20 VFS_SET(null_vfsops, nullfs, VFCF_LOOPBACK | VFCF_JAIL); diff --git a/sys/fs/nullfs/null_vnops.c b/sys/fs/nullfs/null_vnops.c index f59865f..6ff15ee 100644 --- a/sys/fs/nullfs/null_vnops.c +++ b/sys/fs/nullfs/null_vnops.c @@ -692,18 +692,24 @@ null_unlock(struct vop_unlock_args *ap) static int null_inactive(struct vop_inactive_args *ap __unused) { - struct vnode *vp; + struct vnode *vp, *lvp; + struct null_node *xp; struct mount *mp; struct null_mount *xmp; =20 vp =3D ap->a_vp; + xp =3D VTONULL(vp); + lvp =3D NULLVPTOLOWERVP(vp); mp =3D vp->v_mount; xmp =3D MOUNTTONULLMOUNT(mp); - if ((xmp->nullm_flags & NULLM_CACHE) =3D=3D 0) { + if ((xmp->nullm_flags & NULLM_CACHE) =3D=3D 0 || + (xp->null_flags & NULLV_DROP) !=3D 0 || + (lvp->v_vflag & VV_NOSYNC) !=3D 0) { /* * If this is the last reference and caching of the - * nullfs vnodes is not enabled, then free up the - * vnode so as not to tie up the lower vnodes. + * nullfs vnodes is not enabled, or the lower vnode is + * deleted, then free up the vnode so as not to tie up + * the lower vnodes. */ vp->v_object =3D NULL; vrecycle(vp); @@ -748,7 +754,10 @@ null_reclaim(struct vop_reclaim_args *ap) */ if (vp->v_writecount > 0) VOP_ADD_WRITECOUNT(lowervp, -1); - vput(lowervp); + if ((xp->null_flags & NULLV_NOUNLOCK) !=3D 0) + vunref(lowervp); + else + vput(lowervp); free(xp, M_NULLFSNODE); =20 return (0); diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index de118f7..988a114 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -2700,19 +2700,20 @@ vgone(struct vnode *vp) } =20 static void -vgonel_reclaim_lowervp_vfs(struct mount *mp __unused, +notify_lowervp_vfs_dummy(struct mount *mp __unused, struct vnode *lowervp __unused) { } =20 /* - * Notify upper mounts about reclaimed vnode. + * Notify upper mounts about reclaimed or unlinked vnode. */ -static void -vgonel_reclaim_lowervp(struct vnode *vp) +void +vfs_notify_upper(struct vnode *vp, int event) { static struct vfsops vgonel_vfsops =3D { - .vfs_reclaim_lowervp =3D vgonel_reclaim_lowervp_vfs + .vfs_reclaim_lowervp =3D notify_lowervp_vfs_dummy, + .vfs_unlink_lowervp =3D notify_lowervp_vfs_dummy, }; struct mount *mp, *ump, *mmp; =20 @@ -2736,7 +2737,17 @@ vgonel_reclaim_lowervp(struct vnode *vp) } TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link); MNT_IUNLOCK(mp); - VFS_RECLAIM_LOWERVP(ump, vp); + switch (event) { + case VFS_NOTIFY_UPPER_RECLAIM: + VFS_RECLAIM_LOWERVP(ump, vp); + break; + case VFS_NOTIFY_UPPER_UNLINK: + VFS_UNLINK_LOWERVP(ump, vp); + break; + default: + KASSERT(0, ("invalid event %d", event)); + break; + } MNT_ILOCK(mp); ump =3D TAILQ_NEXT(mmp, mnt_upper_link); TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link); @@ -2783,7 +2794,7 @@ vgonel(struct vnode *vp) active =3D vp->v_usecount; oweinact =3D (vp->v_iflag & VI_OWEINACT); VI_UNLOCK(vp); - vgonel_reclaim_lowervp(vp); + vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); =20 /* * Clean out any buffers associated with the vnode. diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index 29cb7bd..a004ea0 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -1846,6 +1846,7 @@ restart: if (error) goto out; #endif + vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); error =3D VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); #ifdef MAC out: @@ -3825,6 +3826,7 @@ restart: return (error); goto restart; } + vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); error =3D VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); vn_finished_write(mp); out: diff --git a/sys/sys/mount.h b/sys/sys/mount.h index a9c86f6..a953dae 100644 --- a/sys/sys/mount.h +++ b/sys/sys/mount.h @@ -608,7 +608,7 @@ typedef int vfs_mount_t(struct mount *mp); typedef int vfs_sysctl_t(struct mount *mp, fsctlop_t op, struct sysctl_req *req); typedef void vfs_susp_clean_t(struct mount *mp); -typedef void vfs_reclaim_lowervp_t(struct mount *mp, struct vnode *lowervp= ); +typedef void vfs_notify_lowervp_t(struct mount *mp, struct vnode *lowervp); =20 struct vfsops { vfs_mount_t *vfs_mount; @@ -626,7 +626,8 @@ struct vfsops { vfs_extattrctl_t *vfs_extattrctl; vfs_sysctl_t *vfs_sysctl; vfs_susp_clean_t *vfs_susp_clean; - vfs_reclaim_lowervp_t *vfs_reclaim_lowervp; + vfs_notify_lowervp_t *vfs_reclaim_lowervp; + vfs_notify_lowervp_t *vfs_unlink_lowervp; }; =20 vfs_statfs_t __vfs_statfs; @@ -747,6 +748,14 @@ vfs_statfs_t __vfs_statfs; } \ } while (0) =20 +#define VFS_UNLINK_LOWERVP(MP, VP) do { \ + if (*(MP)->mnt_op->vfs_unlink_lowervp !=3D NULL) { \ + VFS_PROLOGUE(MP); \ + (*(MP)->mnt_op->vfs_unlink_lowervp)((MP), (VP)); \ + VFS_EPILOGUE(MP); \ + } \ +} while (0) + #define VFS_KNOTE_LOCKED(vp, hint) do \ { \ if (((vp)->v_vflag & VV_NOKNOTE) =3D=3D 0) \ @@ -759,6 +768,9 @@ vfs_statfs_t __vfs_statfs; VN_KNOTE((vp), (hint), 0); \ } while (0) =20 +#define VFS_NOTIFY_UPPER_RECLAIM 1 +#define VFS_NOTIFY_UPPER_UNLINK 2 + #include <sys/module.h> =20 /* @@ -840,6 +852,7 @@ int vfs_modevent(module_t, int, void *); void vfs_mount_error(struct mount *, const char *, ...); void vfs_mountroot(void); /* mount our root filesystem */ void vfs_mountedfrom(struct mount *, const char *from); +void vfs_notify_upper(struct vnode *, int); void vfs_oexport_conv(const struct oexport_args *oexp, struct export_args *exp); void vfs_ref(struct mount *); --CzJ9BTjEYwHt6w7v Content-Type: application/pgp-signature -----BEGIN PGP SIGNATURE----- Version: GnuPG v2.0.19 (FreeBSD) iQIcBAEBAgAGBQJRi05NAAoJEJDCuSvBvK1BuFkP/i4qzpF1cWvS3zmNFxrMK4HN QKEcbbUByJhzuwxTsgAKPI/VXI2zs8PW0by3ul4Tl4x9forfja578esLdaTW83QZ VGntUXYfJXYBEg5kC3IdfOWAyHZa5S84OZI/Dd3wur0ftxvQPPydE809Ri+Zh4Nw vzL0uroV6k6N3PUgE1HVTeIR99Ft+U1vy7aymQC/kSFYPahur0N9P7Z6U9cIQWkp SvOQ7xlQZd3o5FJ3hNT+D+IgjSz9PveePSobe7LqGgB1/Pj3wvW7jsaQKZ3OlV+N DkA6COQ2VaaOH4Cr9UGw+G0Vk1u7sKBKnDsP+7iIjv6T/nk5G2ry2ICmQFKTSlzb Y4eWp1h9VWh1+QalYBpXi8b3kdLGRf7pRO7TT8gX6ixZnhVYsY9mrSfSGZWeLVNO g5OHPXSVof7eje34LW9WEnUGrmteHzbzgIJGrC7++kEXdEdTgS59jhQwzoiHFObY U4brSroNEN8WJhftBU5cz/ahbNeAoEVt1aT1ZosZwOSLWQ6LpeIxe/0Bd1VrDKb1 W4CZ4saV43CbDcL77h2Fja1+MyxZqAgOxd9bZ4zlEROYcZWuYI7PelDQY7ICCvDv gF51ho0PwncD8+r/0R4bT8vjL4PGBV2T+CG2WRJFtGs61H4lyMUJJDrQK3STMM2y aTZhLmZTD55gEkC0QM+J =cA3q -----END PGP SIGNATURE----- --CzJ9BTjEYwHt6w7v--
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20130509072046.GN3047>