Date: Wed, 15 Sep 2010 17:49:44 -0400 From: John Baldwin <jhb@freebsd.org> To: freebsd-current@freebsd.org Cc: Marcin Cieslak <saper@saper.info> Subject: Re: tun(4) in -CURRENT: No buffer space available - race condition patch Message-ID: <201009151749.45038.jhb@freebsd.org> In-Reply-To: <slrni8tir9.2b61.saper@saper.info> References: <slrnht8djj.1tsh.saper@saper.info> <slrni8tir9.2b61.saper@saper.info>
next in thread | previous in thread | raw e-mail | index | archive | help
On Monday, September 13, 2010 9:10:01 pm Marcin Cieslak wrote: > Output queue of tun(4) gets full after some time when sending lots of data. > I have been observing this on -CURRENT at least since March this year. > > Looks like it's a race condition (same in tun(4) and tap(4)), > the following patch seems to address the issue: This is a good find. I actually went through these drivers a bit further and have a bit of a larger patch to extend the locking some. Would you care to test it? Index: if_tap.c =================================================================== --- if_tap.c (revision 212557) +++ if_tap.c (working copy) @@ -209,7 +209,6 @@ tap_destroy(struct tap_softc *tp) { struct ifnet *ifp = tp->tap_ifp; - int s; /* Unlocked read. */ KASSERT(!(tp->tap_flags & TAP_OPEN), @@ -217,10 +216,8 @@ knlist_destroy(&tp->tap_rsel.si_note); destroy_dev(tp->tap_dev); - s = splimp(); ether_ifdetach(ifp); if_free_type(ifp, IFT_ETHER); - splx(s); mtx_destroy(&tp->tap_mtx); free(tp, M_TAP); @@ -398,7 +395,7 @@ struct tap_softc *tp = NULL; unsigned short macaddr_hi; uint32_t macaddr_mid; - int unit, s; + int unit; char *name = NULL; u_char eaddr[6]; @@ -449,15 +446,13 @@ dev->si_drv1 = tp; tp->tap_dev = dev; - s = splimp(); ether_ifattach(ifp, eaddr); - splx(s); mtx_lock(&tp->tap_mtx); tp->tap_flags |= TAP_INITED; mtx_unlock(&tp->tap_mtx); - knlist_init_mtx(&tp->tap_rsel.si_note, NULL); + knlist_init_mtx(&tp->tap_rsel.si_note, &tp->tap_mtx); TAPDEBUG("interface %s is created. minor = %#x\n", ifp->if_xname, dev2unit(dev)); @@ -474,7 +469,7 @@ { struct tap_softc *tp = NULL; struct ifnet *ifp = NULL; - int error, s; + int error; if (tapuopen == 0) { error = priv_check(td, PRIV_NET_TAP); @@ -497,15 +492,13 @@ tp->tap_pid = td->td_proc->p_pid; tp->tap_flags |= TAP_OPEN; ifp = tp->tap_ifp; - mtx_unlock(&tp->tap_mtx); - s = splimp(); ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; if (tapuponopen) ifp->if_flags |= IFF_UP; + mtx_unlock(&tp->tap_mtx); if_link_state_change(ifp, LINK_STATE_UP); - splx(s); TAPDEBUG("%s is open. minor = %#x\n", ifp->if_xname, dev2unit(dev)); @@ -524,7 +517,6 @@ struct ifaddr *ifa; struct tap_softc *tp = dev->si_drv1; struct ifnet *ifp = tp->tap_ifp; - int s; /* junk all pending output */ IF_DRAIN(&ifp->if_snd); @@ -537,25 +529,24 @@ mtx_lock(&tp->tap_mtx); if (((tp->tap_flags & TAP_VMNET) == 0) && (ifp->if_flags & IFF_UP)) { mtx_unlock(&tp->tap_mtx); - s = splimp(); if_down(ifp); + mtx_lock(&tp->tap_mtx); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + mtx_unlock(&tp->tap_mtx); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { rtinit(ifa, (int)RTM_DELETE, 0); } if_purgeaddrs(ifp); - ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + mtx_lock(&tp->tap_mtx); } - splx(s); - } else - mtx_unlock(&tp->tap_mtx); + } if_link_state_change(ifp, LINK_STATE_DOWN); funsetown(&tp->tap_sigio); selwakeuppri(&tp->tap_rsel, PZERO+1); - KNOTE_UNLOCKED(&tp->tap_rsel.si_note, 0); + KNOTE_LOCKED(&tp->tap_rsel.si_note, 0); - mtx_lock(&tp->tap_mtx); tp->tap_flags &= ~TAP_OPEN; tp->tap_pid = 0; mtx_unlock(&tp->tap_mtx); @@ -580,8 +571,10 @@ TAPDEBUG("initializing %s\n", ifp->if_xname); + mtx_lock(&tp->tap_mtx); ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + mtx_unlock(&tp->tap_mtx); /* attempt to start output */ tapifstart(ifp); @@ -599,7 +592,7 @@ struct tap_softc *tp = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; struct ifstat *ifs = NULL; - int s, dummy; + int dummy; switch (cmd) { case SIOCSIFFLAGS: /* XXX -- just like vmnet does */ @@ -612,7 +605,6 @@ break; case SIOCGIFSTATUS: - s = splimp(); ifs = (struct ifstat *)data; dummy = strlen(ifs->ascii); mtx_lock(&tp->tap_mtx); @@ -621,14 +613,10 @@ sizeof(ifs->ascii) - dummy, "\tOpened by PID %d\n", tp->tap_pid); mtx_unlock(&tp->tap_mtx); - splx(s); break; default: - s = splimp(); - dummy = ether_ioctl(ifp, cmd, data); - splx(s); - return (dummy); + return (ether_ioctl(ifp, cmd, data)); /* NOT REACHED */ } @@ -645,7 +633,6 @@ tapifstart(struct ifnet *ifp) { struct tap_softc *tp = ifp->if_softc; - int s; TAPDEBUG("%s starting\n", ifp->if_xname); @@ -665,24 +652,19 @@ TAPDEBUG("%s not ready, tap_flags = 0x%x\n", ifp->if_xname, tp->tap_flags); - s = splimp(); do { IF_DEQUEUE(&ifp->if_snd, m); if (m != NULL) m_freem(m); ifp->if_oerrors ++; } while (m != NULL); - splx(s); return; } - mtx_unlock(&tp->tap_mtx); - s = splimp(); ifp->if_drv_flags |= IFF_DRV_OACTIVE; - if (ifp->if_snd.ifq_len != 0) { - mtx_lock(&tp->tap_mtx); + if (!IFQ_IS_EMPTY(&ifp->if_snd)) { if (tp->tap_flags & TAP_RWAIT) { tp->tap_flags &= ~TAP_RWAIT; wakeup(tp); @@ -691,16 +673,16 @@ if ((tp->tap_flags & TAP_ASYNC) && (tp->tap_sigio != NULL)) { mtx_unlock(&tp->tap_mtx); pgsigio(&tp->tap_sigio, SIGIO, 0); - } else - mtx_unlock(&tp->tap_mtx); + mtx_lock(&tp->tap_mtx); + } selwakeuppri(&tp->tap_rsel, PZERO+1); - KNOTE_UNLOCKED(&tp->tap_rsel.si_note, 0); + KNOTE_LOCKED(&tp->tap_rsel.si_note, 0); ifp->if_opackets ++; /* obytes are counted in ether_output */ } ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; - splx(s); + mtx_unlock(&tp->tap_mtx); } /* tapifstart */ @@ -715,7 +697,6 @@ struct tap_softc *tp = dev->si_drv1; struct ifnet *ifp = tp->tap_ifp; struct tapinfo *tapp = NULL; - int s; int f; #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) @@ -724,12 +705,10 @@ switch (cmd) { case TAPSIFINFO: - s = splimp(); tapp = (struct tapinfo *)data; ifp->if_mtu = tapp->mtu; ifp->if_type = tapp->type; ifp->if_baudrate = tapp->baudrate; - splx(s); break; case TAPGIFINFO: @@ -757,26 +736,26 @@ break; case FIOASYNC: - s = splimp(); mtx_lock(&tp->tap_mtx); if (*(int *)data) tp->tap_flags |= TAP_ASYNC; else tp->tap_flags &= ~TAP_ASYNC; mtx_unlock(&tp->tap_mtx); - splx(s); break; case FIONREAD: - s = splimp(); - if (ifp->if_snd.ifq_head) { - struct mbuf *mb = ifp->if_snd.ifq_head; + if (!IFQ_IS_EMPTY(&ifp->if_snd)) { + struct mbuf *mb; - for(*(int *)data = 0;mb != NULL;mb = mb->m_next) + IFQ_LOCK(&ifp->if_snd); + IFQ_POLL_NOLOCK(&ifp->if_snd, mb); + for (*(int *)data = 0; mb != NULL; + mb = mb->m_next) *(int *)data += mb->m_len; + IFQ_UNLOCK(&ifp->if_snd); } else *(int *)data = 0; - splx(s); break; case FIOSETOWN: @@ -797,10 +776,6 @@ /* VMware/VMnet port ioctl's */ - case SIOCGIFFLAGS: /* get ifnet flags */ - bcopy(&ifp->if_flags, data, sizeof(ifp->if_flags)); - break; - #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) case _IO('V', 0): @@ -814,9 +789,9 @@ f &= ~IFF_CANTCHANGE; f |= IFF_UP; - s = splimp(); + mtx_lock(&tp->tap_mtx); ifp->if_flags = f | (ifp->if_flags & IFF_CANTCHANGE); - splx(s); + mtx_unlock(&tp->tap_mtx); break; case OSIOCGIFADDR: /* get MAC address of the remote side */ @@ -851,7 +826,7 @@ struct tap_softc *tp = dev->si_drv1; struct ifnet *ifp = tp->tap_ifp; struct mbuf *m = NULL; - int error = 0, len, s; + int error = 0, len; TAPDEBUG("%s reading, minor = %#x\n", ifp->if_xname, dev2unit(dev)); @@ -867,26 +842,27 @@ } tp->tap_flags &= ~TAP_RWAIT; - mtx_unlock(&tp->tap_mtx); /* sleep until we get a packet */ do { - s = splimp(); IF_DEQUEUE(&ifp->if_snd, m); - splx(s); if (m == NULL) { - if (flag & O_NONBLOCK) + if (flag & O_NONBLOCK) { + mtx_unlock(&tp->tap_mtx); return (EWOULDBLOCK); + } - mtx_lock(&tp->tap_mtx); tp->tap_flags |= TAP_RWAIT; - mtx_unlock(&tp->tap_mtx); - error = tsleep(tp,PCATCH|(PZERO+1),"taprd",0); - if (error) + error = mtx_sleep(tp, &tp->tap_mtx, PCATCH | (PZERO + 1), + "taprd", 0); + if (error) { + mtx_unlock(&tp->tap_mtx); return (error); + } } } while (m == NULL); + mtx_unlock(&tp->tap_mtx); /* feed packet to bpf */ BPF_MTAP(ifp, m); @@ -982,14 +958,14 @@ { struct tap_softc *tp = dev->si_drv1; struct ifnet *ifp = tp->tap_ifp; - int s, revents = 0; + int revents = 0; TAPDEBUG("%s polling, minor = %#x\n", ifp->if_xname, dev2unit(dev)); - s = splimp(); if (events & (POLLIN | POLLRDNORM)) { - if (ifp->if_snd.ifq_len > 0) { + IFQ_LOCK(&ifp->if_snd); + if (!IFQ_IS_EMPTY(&ifp->if_snd)) { TAPDEBUG("%s have data in queue. len = %d, " \ "minor = %#x\n", ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev)); @@ -1001,12 +977,12 @@ selrecord(td, &tp->tap_rsel); } + IFQ_UNLOCK(&ifp->if_snd); } if (events & (POLLOUT | POLLWRNORM)) revents |= (events & (POLLOUT | POLLWRNORM)); - splx(s); return (revents); } /* tappoll */ @@ -1019,11 +995,9 @@ static int tapkqfilter(struct cdev *dev, struct knote *kn) { - int s; struct tap_softc *tp = dev->si_drv1; struct ifnet *ifp = tp->tap_ifp; - s = splimp(); switch (kn->kn_filter) { case EVFILT_READ: TAPDEBUG("%s kqfilter: EVFILT_READ, minor = %#x\n", @@ -1040,11 +1014,9 @@ default: TAPDEBUG("%s kqfilter: invalid filter, minor = %#x\n", ifp->if_xname, dev2unit(dev)); - splx(s); return (EINVAL); /* NOT REACHED */ } - splx(s); kn->kn_hook = (caddr_t) dev; knlist_add(&tp->tap_rsel.si_note, kn, 0); @@ -1061,12 +1033,11 @@ static int tapkqread(struct knote *kn, long hint) { - int ret, s; + int ret; struct cdev *dev = (struct cdev *)(kn->kn_hook); struct tap_softc *tp = dev->si_drv1; struct ifnet *ifp = tp->tap_ifp; - s = splimp(); if ((kn->kn_data = ifp->if_snd.ifq_len) > 0) { TAPDEBUG("%s have data in queue. len = %d, minor = %#x\n", ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev)); @@ -1076,7 +1047,6 @@ ifp->if_xname, dev2unit(dev)); ret = 0; } - splx(s); return (ret); } /* tapkqread */ @@ -1090,13 +1060,10 @@ static int tapkqwrite(struct knote *kn, long hint) { - int s; struct tap_softc *tp = ((struct cdev *) kn->kn_hook)->si_drv1; struct ifnet *ifp = tp->tap_ifp; - s = splimp(); kn->kn_data = ifp->if_mtu; - splx(s); return (1); } /* tapkqwrite */ Index: if_tun.c =================================================================== --- if_tun.c (revision 212557) +++ if_tun.c (working copy) @@ -344,13 +344,13 @@ tp->tun_flags &= ~TUN_RWAIT; wakeup(tp); } + selwakeuppri(&tp->tun_rsel, PZERO + 1); + KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) { mtx_unlock(&tp->tun_mtx); pgsigio(&tp->tun_sigio, SIGIO, 0); } else mtx_unlock(&tp->tun_mtx); - selwakeuppri(&tp->tun_rsel, PZERO + 1); - KNOTE_UNLOCKED(&tp->tun_rsel.si_note, 0); } /* XXX: should return an error code so it can fail. */ @@ -385,7 +385,7 @@ IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); ifp->if_snd.ifq_drv_maxlen = 0; IFQ_SET_READY(&ifp->if_snd); - knlist_init_mtx(&sc->tun_rsel.si_note, NULL); + knlist_init_mtx(&sc->tun_rsel.si_note, &sc->tun_mtx); ifp->if_capabilities |= IFCAP_LINKSTATE; ifp->if_capenable |= IFCAP_LINKSTATE; @@ -443,7 +443,6 @@ { struct tun_softc *tp; struct ifnet *ifp; - int s; tp = dev->si_drv1; ifp = TUN2IFP(tp); @@ -451,27 +450,25 @@ mtx_lock(&tp->tun_mtx); tp->tun_flags &= ~TUN_OPEN; tp->tun_pid = 0; - mtx_unlock(&tp->tun_mtx); /* * junk all pending output */ CURVNET_SET(ifp->if_vnet); - s = splimp(); IFQ_PURGE(&ifp->if_snd); - splx(s); if (ifp->if_flags & IFF_UP) { - s = splimp(); + mtx_unlock(&tp->tun_mtx); if_down(ifp); - splx(s); + mtx_lock(&tp->tun_mtx); } /* Delete all addresses and routes which reference this interface. */ if (ifp->if_drv_flags & IFF_DRV_RUNNING) { struct ifaddr *ifa; - s = splimp(); + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + mtx_unlock(&tp->tun_mtx); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { /* deal w/IPv4 PtP destination; unlocked read */ if (ifa->ifa_addr->sa_family == AF_INET) { @@ -482,16 +479,14 @@ } } if_purgeaddrs(ifp); - ifp->if_drv_flags &= ~IFF_DRV_RUNNING; - splx(s); + mtx_lock(&tp->tun_mtx); } if_link_state_change(ifp, LINK_STATE_DOWN); CURVNET_RESTORE(); - mtx_lock(&tp->tun_mtx); funsetown(&tp->tun_sigio); selwakeuppri(&tp->tun_rsel, PZERO + 1); - KNOTE_UNLOCKED(&tp->tun_rsel.si_note, 0); + KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); TUNDEBUG (ifp, "closed\n"); cv_broadcast(&tp->tun_cv); @@ -510,9 +505,11 @@ TUNDEBUG(ifp, "tuninit\n"); + mtx_lock(&tp->tun_mtx); ifp->if_flags |= IFF_UP; ifp->if_drv_flags |= IFF_DRV_RUNNING; getmicrotime(&ifp->if_lastchange); + mtx_unlock(&tp->tun_mtx); #ifdef INET if_addr_rlock(ifp); @@ -545,9 +542,8 @@ struct ifreq *ifr = (struct ifreq *)data; struct tun_softc *tp = ifp->if_softc; struct ifstat *ifs; - int error = 0, s; + int error = 0; - s = splimp(); switch(cmd) { case SIOCGIFSTATUS: ifs = (struct ifstat *)data; @@ -576,7 +572,6 @@ default: error = EINVAL; } - splx(s); return (error); } @@ -682,7 +677,6 @@ static int tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) { - int s; int error; struct tun_softc *tp = dev->si_drv1; struct tuninfo *tunp; @@ -745,9 +739,11 @@ switch (*(int *)data & ~IFF_MULTICAST) { case IFF_POINTOPOINT: case IFF_BROADCAST: + mtx_lock(&tp->tun_mtx); TUN2IFP(tp)->if_flags &= ~(IFF_BROADCAST|IFF_POINTOPOINT|IFF_MULTICAST); TUN2IFP(tp)->if_flags |= *(int *)data; + mtx_unlock(&tp->tun_mtx); break; default: return(EINVAL); @@ -769,17 +765,15 @@ mtx_unlock(&tp->tun_mtx); break; case FIONREAD: - s = splimp(); if (!IFQ_IS_EMPTY(&TUN2IFP(tp)->if_snd)) { struct mbuf *mb; IFQ_LOCK(&TUN2IFP(tp)->if_snd); IFQ_POLL_NOLOCK(&TUN2IFP(tp)->if_snd, mb); - for( *(int *)data = 0; mb != 0; mb = mb->m_next) + for (*(int *)data = 0; mb != NULL; mb = mb->m_next) *(int *)data += mb->m_len; IFQ_UNLOCK(&TUN2IFP(tp)->if_snd); } else *(int *)data = 0; - splx(s); break; case FIOSETOWN: return (fsetown(*(int *)data, &tp->tun_sigio)); @@ -813,7 +807,7 @@ struct tun_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); struct mbuf *m; - int error=0, len, s; + int error=0, len; TUNDEBUG (ifp, "read\n"); mtx_lock(&tp->tun_mtx); @@ -824,27 +818,24 @@ } tp->tun_flags &= ~TUN_RWAIT; - mtx_unlock(&tp->tun_mtx); - s = splimp(); do { IFQ_DEQUEUE(&ifp->if_snd, m); if (m == NULL) { if (flag & O_NONBLOCK) { - splx(s); + mtx_unlock(&tp->tun_mtx); return (EWOULDBLOCK); } - mtx_lock(&tp->tun_mtx); tp->tun_flags |= TUN_RWAIT; - mtx_unlock(&tp->tun_mtx); - if ((error = tsleep(tp, PCATCH | (PZERO + 1), - "tunread", 0)) != 0) { - splx(s); + error = mtx_sleep(tp, &tp->tun_mtx, PCATCH | (PZERO + 1), + "tunread", 0); + if (error != 0) { + mtx_unlock(&tp->tun_mtx); return (error); } } } while (m == NULL); - splx(s); + mtx_unlock(&tp->tun_mtx); while (m && uio->uio_resid > 0 && error == 0) { len = min(uio->uio_resid, m->m_len); @@ -957,13 +948,11 @@ static int tunpoll(struct cdev *dev, int events, struct thread *td) { - int s; struct tun_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); int revents = 0; struct mbuf *m; - s = splimp(); TUNDEBUG(ifp, "tunpoll\n"); if (events & (POLLIN | POLLRDNORM)) { @@ -981,7 +970,6 @@ if (events & (POLLOUT | POLLWRNORM)) revents |= events & (POLLOUT | POLLWRNORM); - splx(s); return (revents); } @@ -991,11 +979,9 @@ static int tunkqfilter(struct cdev *dev, struct knote *kn) { - int s; struct tun_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); - s = splimp(); switch(kn->kn_filter) { case EVFILT_READ: TUNDEBUG(ifp, "%s kqfilter: EVFILT_READ, minor = %#x\n", @@ -1012,10 +998,8 @@ default: TUNDEBUG(ifp, "%s kqfilter: invalid filter, minor = %#x\n", ifp->if_xname, dev2unit(dev)); - splx(s); return(EINVAL); } - splx(s); kn->kn_hook = (caddr_t) dev; knlist_add(&tp->tun_rsel.si_note, kn, 0); @@ -1029,12 +1013,11 @@ static int tunkqread(struct knote *kn, long hint) { - int ret, s; + int ret; struct cdev *dev = (struct cdev *)(kn->kn_hook); struct tun_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); - s = splimp(); if ((kn->kn_data = ifp->if_snd.ifq_len) > 0) { TUNDEBUG(ifp, "%s have data in the queue. Len = %d, minor = %#x\n", @@ -1046,7 +1029,6 @@ dev2unit(dev)); ret = 0; } - splx(s); return (ret); } @@ -1057,13 +1039,10 @@ static int tunkqwrite(struct knote *kn, long hint) { - int s; struct tun_softc *tp = ((struct cdev *)kn->kn_hook)->si_drv1; struct ifnet *ifp = TUN2IFP(tp); - s = splimp(); kn->kn_data = ifp->if_mtu; - splx(s); return (1); } -- John Baldwin
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201009151749.45038.jhb>