Date: Wed, 8 May 2013 12:13:17 +0300 From: Konstantin Belousov <kostikbel@gmail.com> To: G??ran L??wkrantz <goran.lowkrantz@ismobile.com> Cc: pho@freebsd.org, freebsd-stable@freebsd.org Subject: Re: Nullfs leaks i-nodes Message-ID: <20130508091317.GJ3047@kib.kiev.ua> In-Reply-To: <B799E3B928B18B9E6C68F912@[172.16.2.62]> References: <B799E3B928B18B9E6C68F912@[172.16.2.62]>
next in thread | previous in thread | raw e-mail | index | archive | help
--wICysDeFgFMelglS Content-Type: text/plain; charset=us-ascii Content-Disposition: inline Content-Transfer-Encoding: quoted-printable On Tue, May 07, 2013 at 08:30:06AM +0200, G??ran L??wkrantz wrote: > I created a PR, kern/178238, on this but would like to know if anyone has= =20 > any ideas or patches? >=20 > Have updated the system where I see this to FreeBSD 9.1-STABLE #0 r250229= =20 > and still have the problem. The patch below should fix the issue for you, at least it did so in my limited testing. What is does: 1. When inactivating a nullfs vnode, check if the lower vnode is unlinked, and reclaim upper vnode if so. [This fixes your case]. 2. Besides a callback to the upper filesystems for the lower vnode reclaimation, it also calls the upper fs for lower vnode unlink. This allows nullfs to purge cached vnodes for the unlinked lower. [This fixes an opposite case, when the vnode is removed from the lower mount, but upper aliases prevent the vnode from being recycled]. 3. Fix a wart which existed from the introduction of the nullfs caching, do not unlock lower vnode in the nullfs_reclaim_lowervp(). It should be completely innocent, but now it is also formally safe. 4. Fix vnode reference leak in nullfs_reclaim_lowervp(). Please note that the patch is basically not tested, I only verified your scenario and a mirror of it as described in item 2. diff --git a/sys/fs/nullfs/null.h b/sys/fs/nullfs/null.h index 4f37020..a624be6 100644 --- a/sys/fs/nullfs/null.h +++ b/sys/fs/nullfs/null.h @@ -53,8 +53,11 @@ struct null_node { LIST_ENTRY(null_node) null_hash; /* Hash list */ struct vnode *null_lowervp; /* VREFed once */ struct vnode *null_vnode; /* Back pointer */ + u_int null_flags; }; =20 +#define NULLV_NOUNLOCK 0x0001 + #define MOUNTTONULLMOUNT(mp) ((struct null_mount *)((mp)->mnt_data)) #define VTONULL(vp) ((struct null_node *)(vp)->v_data) #define NULLTOV(xp) ((xp)->null_vnode) diff --git a/sys/fs/nullfs/null_vfsops.c b/sys/fs/nullfs/null_vfsops.c index 02932bd..544c358 100644 --- a/sys/fs/nullfs/null_vfsops.c +++ b/sys/fs/nullfs/null_vfsops.c @@ -65,7 +65,6 @@ static vfs_statfs_t nullfs_statfs; static vfs_unmount_t nullfs_unmount; static vfs_vget_t nullfs_vget; static vfs_extattrctl_t nullfs_extattrctl; -static vfs_reclaim_lowervp_t nullfs_reclaim_lowervp; =20 /* * Mount null layer @@ -391,8 +390,22 @@ nullfs_reclaim_lowervp(struct mount *mp, struct vnode = *lowervp) vp =3D null_hashget(mp, lowervp); if (vp =3D=3D NULL) return; + VTONULL(vp)->null_flags |=3D NULLV_NOUNLOCK; vgone(vp); - vn_lock(lowervp, LK_EXCLUSIVE | LK_RETRY); + vput(vp); +} + +static void +nullfs_unlink_lowervp(struct mount *mp, struct vnode *lowervp) +{ + struct vnode *vp; + + vp =3D null_hashget(mp, lowervp); + if (vp =3D=3D NULL || vp->v_usecount > 1) + return; + VTONULL(vp)->null_flags |=3D NULLV_NOUNLOCK; + vgone(vp); + vput(vp); } =20 static struct vfsops null_vfsops =3D { @@ -408,6 +421,7 @@ static struct vfsops null_vfsops =3D { .vfs_unmount =3D nullfs_unmount, .vfs_vget =3D nullfs_vget, .vfs_reclaim_lowervp =3D nullfs_reclaim_lowervp, + .vfs_unlink_lowervp =3D nullfs_unlink_lowervp, }; =20 VFS_SET(null_vfsops, nullfs, VFCF_LOOPBACK | VFCF_JAIL); diff --git a/sys/fs/nullfs/null_vnops.c b/sys/fs/nullfs/null_vnops.c index f59865f..3c41124 100644 --- a/sys/fs/nullfs/null_vnops.c +++ b/sys/fs/nullfs/null_vnops.c @@ -692,18 +692,21 @@ null_unlock(struct vop_unlock_args *ap) static int null_inactive(struct vop_inactive_args *ap __unused) { - struct vnode *vp; + struct vnode *vp, *lvp; struct mount *mp; struct null_mount *xmp; =20 vp =3D ap->a_vp; + lvp =3D NULLVPTOLOWERVP(vp); mp =3D vp->v_mount; xmp =3D MOUNTTONULLMOUNT(mp); - if ((xmp->nullm_flags & NULLM_CACHE) =3D=3D 0) { + if ((xmp->nullm_flags & NULLM_CACHE) =3D=3D 0 || + (lvp->v_vflag & VV_NOSYNC) !=3D 0) { /* * If this is the last reference and caching of the - * nullfs vnodes is not enabled, then free up the - * vnode so as not to tie up the lower vnodes. + * nullfs vnodes is not enabled, or the lower vnode is + * deleted, then free up the vnode so as not to tie up + * the lower vnodes. */ vp->v_object =3D NULL; vrecycle(vp); @@ -748,7 +751,10 @@ null_reclaim(struct vop_reclaim_args *ap) */ if (vp->v_writecount > 0) VOP_ADD_WRITECOUNT(lowervp, -1); - vput(lowervp); + if ((xp->null_flags & NULLV_NOUNLOCK) !=3D 0) + vunref(lowervp); + else + vput(lowervp); free(xp, M_NULLFSNODE); =20 return (0); diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index de118f7..988a114 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -2700,19 +2700,20 @@ vgone(struct vnode *vp) } =20 static void -vgonel_reclaim_lowervp_vfs(struct mount *mp __unused, +notify_lowervp_vfs_dummy(struct mount *mp __unused, struct vnode *lowervp __unused) { } =20 /* - * Notify upper mounts about reclaimed vnode. + * Notify upper mounts about reclaimed or unlinked vnode. */ -static void -vgonel_reclaim_lowervp(struct vnode *vp) +void +vfs_notify_upper(struct vnode *vp, int event) { static struct vfsops vgonel_vfsops =3D { - .vfs_reclaim_lowervp =3D vgonel_reclaim_lowervp_vfs + .vfs_reclaim_lowervp =3D notify_lowervp_vfs_dummy, + .vfs_unlink_lowervp =3D notify_lowervp_vfs_dummy, }; struct mount *mp, *ump, *mmp; =20 @@ -2736,7 +2737,17 @@ vgonel_reclaim_lowervp(struct vnode *vp) } TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link); MNT_IUNLOCK(mp); - VFS_RECLAIM_LOWERVP(ump, vp); + switch (event) { + case VFS_NOTIFY_UPPER_RECLAIM: + VFS_RECLAIM_LOWERVP(ump, vp); + break; + case VFS_NOTIFY_UPPER_UNLINK: + VFS_UNLINK_LOWERVP(ump, vp); + break; + default: + KASSERT(0, ("invalid event %d", event)); + break; + } MNT_ILOCK(mp); ump =3D TAILQ_NEXT(mmp, mnt_upper_link); TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link); @@ -2783,7 +2794,7 @@ vgonel(struct vnode *vp) active =3D vp->v_usecount; oweinact =3D (vp->v_iflag & VI_OWEINACT); VI_UNLOCK(vp); - vgonel_reclaim_lowervp(vp); + vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); =20 /* * Clean out any buffers associated with the vnode. diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index 29cb7bd..a004ea0 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -1846,6 +1846,7 @@ restart: if (error) goto out; #endif + vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); error =3D VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); #ifdef MAC out: @@ -3825,6 +3826,7 @@ restart: return (error); goto restart; } + vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); error =3D VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); vn_finished_write(mp); out: diff --git a/sys/sys/mount.h b/sys/sys/mount.h index a9c86f6..a953dae 100644 --- a/sys/sys/mount.h +++ b/sys/sys/mount.h @@ -608,7 +608,7 @@ typedef int vfs_mount_t(struct mount *mp); typedef int vfs_sysctl_t(struct mount *mp, fsctlop_t op, struct sysctl_req *req); typedef void vfs_susp_clean_t(struct mount *mp); -typedef void vfs_reclaim_lowervp_t(struct mount *mp, struct vnode *lowervp= ); +typedef void vfs_notify_lowervp_t(struct mount *mp, struct vnode *lowervp); =20 struct vfsops { vfs_mount_t *vfs_mount; @@ -626,7 +626,8 @@ struct vfsops { vfs_extattrctl_t *vfs_extattrctl; vfs_sysctl_t *vfs_sysctl; vfs_susp_clean_t *vfs_susp_clean; - vfs_reclaim_lowervp_t *vfs_reclaim_lowervp; + vfs_notify_lowervp_t *vfs_reclaim_lowervp; + vfs_notify_lowervp_t *vfs_unlink_lowervp; }; =20 vfs_statfs_t __vfs_statfs; @@ -747,6 +748,14 @@ vfs_statfs_t __vfs_statfs; } \ } while (0) =20 +#define VFS_UNLINK_LOWERVP(MP, VP) do { \ + if (*(MP)->mnt_op->vfs_unlink_lowervp !=3D NULL) { \ + VFS_PROLOGUE(MP); \ + (*(MP)->mnt_op->vfs_unlink_lowervp)((MP), (VP)); \ + VFS_EPILOGUE(MP); \ + } \ +} while (0) + #define VFS_KNOTE_LOCKED(vp, hint) do \ { \ if (((vp)->v_vflag & VV_NOKNOTE) =3D=3D 0) \ @@ -759,6 +768,9 @@ vfs_statfs_t __vfs_statfs; VN_KNOTE((vp), (hint), 0); \ } while (0) =20 +#define VFS_NOTIFY_UPPER_RECLAIM 1 +#define VFS_NOTIFY_UPPER_UNLINK 2 + #include <sys/module.h> =20 /* @@ -840,6 +852,7 @@ int vfs_modevent(module_t, int, void *); void vfs_mount_error(struct mount *, const char *, ...); void vfs_mountroot(void); /* mount our root filesystem */ void vfs_mountedfrom(struct mount *, const char *from); +void vfs_notify_upper(struct vnode *, int); void vfs_oexport_conv(const struct oexport_args *oexp, struct export_args *exp); void vfs_ref(struct mount *); --wICysDeFgFMelglS Content-Type: application/pgp-signature -----BEGIN PGP SIGNATURE----- Version: GnuPG v2.0.19 (FreeBSD) iQIcBAEBAgAGBQJRihcsAAoJEJDCuSvBvK1BTKAP/3X9R+Z3LWS3qarpGobad/oJ Ro4NJaOX+T8Bq84dJKC4vUfaMT0tU10v5erO/7991qWzb7XBoBOzph7WGbdy5LmR IBr88bqyjMUeX+PZXlRNP57GRtjNGPRa6vLnlYmtevX/jP6gN+Laa+ck+xZN9pQJ CijODfARyvt0eWLhqPYbEhbYkgAmmjV1g3qotFN6BCrwUemddJ+nvVlq1fYPvHUE 9r+d8nNbWyTZOuIoNuS5icGesNxNvMJUQi5HgBJLFCvsoJz8GjPPNXTv4xaAG+X2 k3xaqQy/q6SeAHtbqOdZLbSWFPHTd+3PGOh6I/28kutVXU6IxdJI0Nz6qt/ezks8 smidVcggmuyqKl3u1vdntgfijaW2JR0EKMs2QR/Hvvk+28DPK3YyA+VrXrBcCbjY eD6OIDQAVKX1NVjqEline6lObqOExIT4Nkjh+jhcYBxDHBe65pgPf1px6fs5Aw86 ITMVlJbCQp+IeUiS2zcdzTvLuNoYkGQ2+hV7iMf3ZSt5NM94K2awWhYiHRXL44k5 g5f3ObJBT86lGTsAIPNpl/fIJUGbIBs5Q+9VrGvlLKZOk0XLoC+agVMijmQNjZCe Ncjj9/xJsLvxn4hDRylhkQ0zgHrJVIe2rtB6G1ccFid/UhLbrfSsJdn9aRGU6ygL 1v8Gn/5YdSjEYFRe6WjF =/jR1 -----END PGP SIGNATURE----- --wICysDeFgFMelglS--
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20130508091317.GJ3047>