Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 27 Feb 2010 17:16:26 +0200
From:      Kostik Belousov <kostikbel@gmail.com>
To:        Mikolaj Golub <to.my.trociny@gmail.com>
Cc:        freebsd-net@freebsd.org
Subject:   Re: kmem leakage on tun/tap device removal
Message-ID:  <20100227151626.GJ2489@deviant.kiev.zoral.com.ua>
In-Reply-To: <86hbp3jape.fsf@kopusha.onet>
References:  <86hbp3jape.fsf@kopusha.onet>

next in thread | previous in thread | raw e-mail | index | archive | help

--ed/6oDxOLijJh8b0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
Content-Transfer-Encoding: quoted-printable

On Sat, Feb 27, 2010 at 11:21:17AM +0200, Mikolaj Golub wrote:
> Hi,
>=20
> Recently I run some tests, which create/destroy tun interface in loop, and
> after several hours my system panicked with "kmem_map too small". It has
> appeared that tun (or tap) device does not free memory after the device
> destroy:
>=20
> zhuzha:/usr/src/sys/kern% sudo vmstat -m |grep 'Type\|DEVFS1'
>          Type InUse MemUse HighUse Requests  Size(s)
>        DEVFS1   139    35K       -      157  256
> zhuzha:/usr/src/sys/kern% sudo ifconfig tun0 create         =20
> zhuzha:/usr/src/sys/kern% sudo vmstat -m |grep 'Type\|DEVFS1'
>          Type InUse MemUse HighUse Requests  Size(s)
>        DEVFS1   140    35K       -      159  256
> zhuzha:/usr/src/sys/kern% sudo ifconfig tun0 destroy        =20
> zhuzha:/usr/src/sys/kern% sudo vmstat -m |grep 'Type\|DEVFS1'
>          Type InUse MemUse HighUse Requests  Size(s)
>        DEVFS1   140    35K       -      159  256
>=20
> And when running create/destroy in loop:
>=20
>  Time          Type InUse MemUse HighUse Requests  Size(s)
> 09:20        DEVFS1   104    26K       -      113  256
> 09:25        DEVFS1  8504  2126K       -    16912  256
> 09:30        DEVFS1 31602  7901K       -    63108  256
> 09:35        DEVFS1 54316 13579K       -   108536  256
> 09:40        DEVFS1 77068 19267K       -   154040  256
> 09:45        DEVFS1 99764 24941K       -   199431  256
> 09:50        DEVFS1 122408 30602K       -   244719  256
> 09:55        DEVFS1 144689 36173K       -   289281  256
>=20
> It looks like the problem is that tun/tap_clone_create() calls make_dev()=
 and
> then dev_ref(dev). make_dev() calls itself dev_refl(), so after device cr=
eating
> we have si_refcount =3D=3D 2. But on device removal (tun/tap_clone_destro=
y())
> dev_rel() is never called, only destroy_dev(dev), which checks that
> si_refcount is still not zero and places the dev in dead_cdevsw.d_devs li=
st.
>=20
> And running kgdb we can see the following picture:
>=20
> (kgdb) p *dead_cdevsw.d_devs.lh_first
> $2 =3D {__si_reserved =3D 0x0, si_flags =3D 0, si_atime =3D {tv_sec =3D 1=
267218482, tv_nsec =3D 0}, si_ctime =3D {
>     tv_sec =3D 1267218482, tv_nsec =3D 0}, si_mtime =3D {tv_sec =3D 12672=
18482, tv_nsec =3D 0}, si_uid =3D 66,=20
>   si_gid =3D 68, si_mode =3D 384, si_cred =3D 0x0, si_drv0 =3D 0, si_refc=
ount =3D 1, si_list =3D {
>     le_next =3D 0xcd183100, le_prev =3D 0xc0d90278}, si_clone =3D {le_nex=
t =3D 0x0, le_prev =3D 0xc5c83b50},=20
>   si_children =3D {lh_first =3D 0x0}, si_siblings =3D {le_next =3D 0x0, l=
e_prev =3D 0x0}, si_parent =3D 0x0,=20
>   si_name =3D 0xcaadc278 "tun0", si_drv1 =3D 0x0, si_drv2 =3D 0x0, si_dev=
sw =3D 0x0, si_iosize_max =3D 0,=20
>   si_usecount =3D 0, si_threadcount =3D 0, __si_u =3D {__sid_snapdata =3D=
 0x0},=20
>   __si_namebuf =3D "tun0", '\0' <repeats 59 times>}
> (kgdb) p *dead_cdevsw.d_devs.lh_first->si_list.le_next
> $3 =3D {__si_reserved =3D 0x0, si_flags =3D 0, si_atime =3D {tv_sec =3D 1=
267218421, tv_nsec =3D 0}, si_ctime =3D {
>     tv_sec =3D 1267218421, tv_nsec =3D 0}, si_mtime =3D {tv_sec =3D 12672=
18421, tv_nsec =3D 0}, si_uid =3D 66,=20
>   si_gid =3D 68, si_mode =3D 384, si_cred =3D 0x0, si_drv0 =3D 0, si_refc=
ount =3D 1, si_list =3D {
>     le_next =3D 0xcd183000, le_prev =3D 0xcaadc238}, si_clone =3D {le_nex=
t =3D 0x0, le_prev =3D 0xc5c83b50},=20
>   si_children =3D {lh_first =3D 0x0}, si_siblings =3D {le_next =3D 0x0, l=
e_prev =3D 0x0}, si_parent =3D 0x0,=20
>   si_name =3D 0xcd183178 "tun0", si_drv1 =3D 0x0, si_drv2 =3D 0x0, si_dev=
sw =3D 0x0, si_iosize_max =3D 0,=20
>   si_usecount =3D 0, si_threadcount =3D 0, __si_u =3D {__sid_snapdata =3D=
 0x0},=20
>   __si_namebuf =3D "tun0", '\0' <repeats 59 times>}
> ... and so on.
>=20
> Is dev_ref() needed in tun_clone_create() after make_dev() call? Can't it=
 be
> safely removed as in the patch below? I have run some tests with the patc=
h and
> it looks like it works for me.

CHEAPCLONE is unused too. Besides this, there are two races in
the clone handling. First is that dev_clone handlers shall use
make_dev_credf(MAKEDEV_REF) instead of make_dev/dev_ref. Second is that
module unload shall drain clone events.

Please test the patch below.

diff --git a/sys/net/if_tap.c b/sys/net/if_tap.c
index 950e96c..93801f1 100644
--- a/sys/net/if_tap.c
+++ b/sys/net/if_tap.c
@@ -192,10 +192,6 @@ tap_clone_create(struct if_clone *ifc, int unit, caddr=
_t params)
 	if (i) {
 		dev =3D make_dev(&tap_cdevsw, unit | extra,
 		     UID_ROOT, GID_WHEEL, 0600, "%s%d", ifc->ifc_name, unit);
-		if (dev !=3D NULL) {
-			dev_ref(dev);
-			dev->si_flags |=3D SI_CHEAPCLONE;
-		}
 	}
=20
 	tapcreate(dev);
@@ -300,6 +296,7 @@ tapmodevent(module_t mod, int type, void *data)
 		EVENTHANDLER_DEREGISTER(dev_clone, eh_tag);
 		if_clone_detach(&tap_cloner);
 		if_clone_detach(&vmnet_cloner);
+		drain_dev_clone_events();
=20
 		mtx_lock(&tapmtx);
 		while ((tp =3D SLIST_FIRST(&taphead)) !=3D NULL) {
@@ -381,12 +378,8 @@ tapclone(void *arg, struct ucred *cred, char *name, in=
t namelen, struct cdev **d
 			name =3D devname;
 		}
=20
-		*dev =3D make_dev(&tap_cdevsw, unit | extra,
-		     UID_ROOT, GID_WHEEL, 0600, "%s", name);
-		if (*dev !=3D NULL) {
-			dev_ref(*dev);
-			(*dev)->si_flags |=3D SI_CHEAPCLONE;
-		}
+		*dev =3D make_dev_credf(MAKEDEV_REF, &tap_cdevsw, unit | extra,
+		     cred, UID_ROOT, GID_WHEEL, 0600, "%s", name);
 	}
=20
 	if_clone_create(name, namelen, NULL);
diff --git a/sys/net/if_tun.c b/sys/net/if_tun.c
index 37b5e70..1fa02ac 100644
--- a/sys/net/if_tun.c
+++ b/sys/net/if_tun.c
@@ -188,10 +188,6 @@ tun_clone_create(struct if_clone *ifc, int unit, caddr=
_t params)
 		/* No preexisting struct cdev *, create one */
 		dev =3D make_dev(&tun_cdevsw, unit,
 		    UID_UUCP, GID_DIALER, 0600, "%s%d", ifc->ifc_name, unit);
-		if (dev !=3D NULL) {
-			dev_ref(dev);
-			dev->si_flags |=3D SI_CHEAPCLONE;
-		}
 	}
 	tuncreate(ifc->ifc_name, dev);
=20
@@ -237,12 +233,8 @@ tunclone(void *arg, struct ucred *cred, char *name, in=
t namelen,
 			name =3D devname;
 		}
 		/* No preexisting struct cdev *, create one */
-		*dev =3D make_dev(&tun_cdevsw, u,
+		*dev =3D make_dev_credf(MAKEDEV_REF, &tun_cdevsw, u, cred,
 		    UID_UUCP, GID_DIALER, 0600, "%s", name);
-		if (*dev !=3D NULL) {
-			dev_ref(*dev);
-			(*dev)->si_flags |=3D SI_CHEAPCLONE;
-		}
 	}
=20
 	if_clone_create(name, namelen, NULL);
@@ -303,6 +295,7 @@ tunmodevent(module_t mod, int type, void *data)
 	case MOD_UNLOAD:
 		if_clone_detach(&tun_cloner);
 		EVENTHANDLER_DEREGISTER(dev_clone, tag);
+		drain_dev_clone_events();
=20
 		mtx_lock(&tunmtx);
 		while ((tp =3D TAILQ_FIRST(&tunhead)) !=3D NULL) {

--ed/6oDxOLijJh8b0
Content-Type: application/pgp-signature
Content-Disposition: inline

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.10 (FreeBSD)

iEYEARECAAYFAkuJN0oACgkQC3+MBN1Mb4jMaACg0vYhY36hmXXIrc8I8vVhZE5W
KqAAoIvy6vSk4nScFqS6zEqXlmGybXFb
=CAa/
-----END PGP SIGNATURE-----

--ed/6oDxOLijJh8b0--



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20100227151626.GJ2489>