Date: Wed, 09 Jan 2008 00:54:44 +0100 From: Kris Kennaway <kris@FreeBSD.org> To: Kris Kennaway <kris@FreeBSD.org> Cc: "freebsd-stable@freebsd.org" <freebsd-stable@freebsd.org> Subject: Re: Performance! Message-ID: <47840D44.9090106@FreeBSD.org> In-Reply-To: <47840D21.6060807@FreeBSD.org> References: <476A5EE1.9000003@bulinfo.net> <476FF662.6050604@FreeBSD.org> <477BB7C0.3060603@bulinfo.net> <477C1FA3.2070904@FreeBSD.org> <477CC7DC.6060801@bulinfo.net> <47840D21.6060807@FreeBSD.org>
next in thread | previous in thread | raw e-mail | index | archive | help
This is a multi-part message in MIME format. --------------050309040305010505040405 Content-Type: text/plain; charset=windows-1250; format=flowed Content-Transfer-Encoding: 7bit Kris Kennaway wrote: > Krassimir Slavchev wrote: >> -----BEGIN PGP SIGNED MESSAGE----- >> Hash: SHA1 >> >> Hello Kris, >> >> Here is the lock profiling results, see the attachment. >> >> Please, let me know if you want ssh access to this machine? > > Thanks, this is very interesting. The problem is already fixed in 8.0 > but we were not seeing it being this much of a factor on our test > hardware. Possibly it is due to your CPUs being moderately faster and > causing a timing difference. Anyway, try this patch. If there is still > a performance deficit then repeating the lock profiling will be useful. > > Kris > > Now with actual patch. Kris --------------050309040305010505040405 Content-Type: text/plain; x-mac-type="0"; x-mac-creator="0"; name="select.diff" Content-Transfer-Encoding: 7bit Content-Disposition: inline; filename="select.diff" FreeBSD src repository Modified files: sys/kern kern_event.c kern_thread.c sys_generic.c sys_pipe.c uipc_sockbuf.c sys/netncp ncp_rq.c ncp_sock.c ncp_sock.h sys/netsmb smb_trantcp.c sys/sys proc.h selinfo.h socketvar.h systm.h Log: Refactor select to reduce contention and hide internal implementation details from consumers. - Track individual selecters on a per-descriptor basis such that there are no longer collisions and after sleeping for events only those descriptors which triggered events must be rescaned. - Protect the selinfo (per descriptor) structure with a mtx pool mutex. mtx pool mutexes were chosen to preserve api compatibility with existing code which does nothing but bzero() to setup selinfo structures. - Use a per-thread wait channel rather than a global wait channel. - Hide select implementation details in a seltd structure which is opaque to the rest of the kernel. - Provide a 'selsocket' interface for those kernel consumers who wish to select on a socket when they have no fd so they no longer have to be aware of select implementation details. Tested by: kris Reviewed on: arch Revision Changes Path 1.114 +6 -3 src/sys/kern/kern_event.c 1.264 +2 -0 src/sys/kern/kern_thread.c 1.160 +414 -168 src/sys/kern/sys_generic.c 1.194 +6 -3 src/sys/kern/sys_pipe.c 1.173 +2 -1 src/sys/kern/uipc_sockbuf.c 1.16 +9 -3 src/sys/netncp/ncp_rq.c 1.20 +0 -105 src/sys/netncp/ncp_sock.c 1.7 +0 -3 src/sys/netncp/ncp_sock.h 1.27 +1 -79 src/sys/netsmb/smb_trantcp.c 1.498 +2 -3 src/sys/sys/proc.h 1.19 +8 -8 src/sys/sys/selinfo.h 1.159 +2 -0 src/sys/sys/socketvar.h 1.263 +0 -4 src/sys/sys/systm.h http://cvsweb.FreeBSD.org/src/sys/kern/kern_event.c.diff?r1=1.113&r2=1.114 --- src/sys/kern/kern_event.c 2007/07/14 21:23:30 1.113 +++ src/sys/kern/kern_event.c 2007/12/16 06:21:19 1.114 @@ -1400,7 +1400,8 @@ kqueue_poll(struct file *fp, int events, revents |= events & (POLLIN | POLLRDNORM); } else { selrecord(td, &kq->kq_sel); - kq->kq_state |= KQ_SEL; + if (SEL_WAITING(&kq->kq_sel)) + kq->kq_state |= KQ_SEL; } } kqueue_release(kq, 1); @@ -1486,8 +1487,9 @@ kqueue_close(struct file *fp, struct thr } if ((kq->kq_state & KQ_SEL) == KQ_SEL) { - kq->kq_state &= ~KQ_SEL; selwakeuppri(&kq->kq_sel, PSOCK); + if (!SEL_WAITING(&kq->kq_sel)) + kq->kq_state &= ~KQ_SEL; } KQ_UNLOCK(kq); @@ -1522,8 +1524,9 @@ kqueue_wakeup(struct kqueue *kq) wakeup(kq); } if ((kq->kq_state & KQ_SEL) == KQ_SEL) { - kq->kq_state &= ~KQ_SEL; selwakeuppri(&kq->kq_sel, PSOCK); + if (!SEL_WAITING(&kq->kq_sel)) + kq->kq_state &= ~KQ_SEL; } if (!knlist_empty(&kq->kq_sel.si_note)) kqueue_schedtask(kq); http://cvsweb.FreeBSD.org/src/sys/kern/kern_thread.c.diff?r1=1.263&r2=1.264 diff -u -r1.255.2.1 kern_thread.c --- src/sys/kern/kern_thread.c 14 Dec 2007 13:41:09 -0000 1.255.2.1 +++ src/sys/kern/kern_thread.c 8 Jan 2008 23:47:00 -0000 @@ -40,6 +40,7 @@ #include <sys/sysctl.h> #include <sys/sched.h> #include <sys/sleepqueue.h> +#include <sys/selinfo.h> #include <sys/turnstile.h> #include <sys/ktr.h> #include <sys/umtx.h> @@ -207,6 +208,7 @@ turnstile_free(td->td_turnstile); sleepq_free(td->td_sleepqueue); umtx_thread_fini(td); + seltdfini(td); vm_thread_dispose(td); } http://cvsweb.FreeBSD.org/src/sys/kern/sys_generic.c.diff?r1=1.159&r2=1.160 --- src/sys/kern/sys_generic.c 2007/11/14 06:21:23 1.159 +++ src/sys/kern/sys_generic.c 2007/12/16 06:21:19 1.160 @@ -69,17 +69,59 @@ __FBSDID("$FreeBSD: /usr/local/www/cvsro #include <sys/ktrace.h> #endif +#include <sys/ktr.h> + static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); MALLOC_DEFINE(M_IOV, "iov", "large iov's"); static int pollscan(struct thread *, struct pollfd *, u_int); +static int pollrescan(struct thread *); static int selscan(struct thread *, fd_mask **, fd_mask **, int); +static int selrescan(struct thread *, fd_mask **, fd_mask **); +static void selfdalloc(struct thread *, void *); +static void selfdfree(struct seltd *, struct selfd *); static int dofileread(struct thread *, int, struct file *, struct uio *, off_t, int); static int dofilewrite(struct thread *, int, struct file *, struct uio *, off_t, int); static void doselwakeup(struct selinfo *, int); +static void seltdinit(struct thread *); +static int seltdwait(struct thread *, int); +static void seltdclear(struct thread *); + +/* + * One seltd per-thread allocated on demand as needed. + * + * t - protected by st_mtx + * k - Only accessed by curthread or read-only + */ +struct seltd { + STAILQ_HEAD(, selfd) st_selq; /* (k) List of selfds. */ + struct selfd *st_free1; /* (k) free fd for read set. */ + struct selfd *st_free2; /* (k) free fd for write set. */ + struct mtx st_mtx; /* Protects struct seltd */ + struct cv st_wait; /* (t) Wait channel. */ + int st_flags; /* (t) SELTD_ flags. */ +}; + +#define SELTD_PENDING 0x0001 /* We have pending events. */ +#define SELTD_RESCAN 0x0002 /* Doing a rescan. */ + +/* + * One selfd allocated per-thread per-file-descriptor. + * f - protected by sf_mtx + */ +struct selfd { + STAILQ_ENTRY(selfd) sf_link; /* (k) fds owned by this td. */ + TAILQ_ENTRY(selfd) sf_threads; /* (f) fds on this selinfo. */ + struct selinfo *sf_si; /* (f) selinfo when linked. */ + struct mtx *sf_mtx; /* Pointer to selinfo mtx. */ + struct seltd *sf_td; /* (k) owning seltd. */ + void *sf_cookie; /* (k) fd or pollfd. */ +}; + +static uma_zone_t selfd_zone; #ifndef _SYS_SYSPROTO_H_ struct read_args { @@ -629,14 +671,6 @@ out: return (error); } -/* - * sellock and selwait are initialized in selectinit() via SYSINIT. - */ -struct mtx sellock; -struct cv selwait; -u_int nselcoll; /* Select collisions since boot */ -SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, ""); - #ifndef _SYS_SYSPROTO_H_ struct select_args { int nd; @@ -678,7 +712,7 @@ kern_select(struct thread *td, int nd, f fd_mask *ibits[3], *obits[3], *selbits, *sbp; struct timeval atv, rtv, ttv; int error, timo; - u_int ncoll, nbufbytes, ncpbytes, nfdbits; + u_int nbufbytes, ncpbytes, nfdbits; if (nd < 0) return (EINVAL); @@ -723,7 +757,7 @@ kern_select(struct thread *td, int nd, f sbp += ncpbytes / sizeof *sbp; \ error = copyin(name, ibits[x], ncpbytes); \ if (error != 0) \ - goto done_nosellock; \ + goto done; \ } \ } while (0) getbits(fd_in, 0); @@ -737,7 +771,7 @@ kern_select(struct thread *td, int nd, f atv = *tvp; if (itimerfix(&atv)) { error = EINVAL; - goto done_nosellock; + goto done; } getmicrouptime(&rtv); timevaladd(&atv, &rtv); @@ -746,58 +780,31 @@ kern_select(struct thread *td, int nd, f atv.tv_usec = 0; } timo = 0; - TAILQ_INIT(&td->td_selq); - mtx_lock(&sellock); -retry: - ncoll = nselcoll; - thread_lock(td); - td->td_flags |= TDF_SELECT; - thread_unlock(td); - mtx_unlock(&sellock); - - error = selscan(td, ibits, obits, nd); - mtx_lock(&sellock); - if (error || td->td_retval[0]) - goto done; - if (atv.tv_sec || atv.tv_usec) { - getmicrouptime(&rtv); - if (timevalcmp(&rtv, &atv, >=)) - goto done; - ttv = atv; - timevalsub(&ttv, &rtv); - timo = ttv.tv_sec > 24 * 60 * 60 ? - 24 * 60 * 60 * hz : tvtohz(&ttv); - } - - /* - * An event of interest may occur while we do not hold - * sellock, so check TDF_SELECT and the number of - * collisions and rescan the file descriptors if - * necessary. - */ - thread_lock(td); - if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { - thread_unlock(td); - goto retry; + seltdinit(td); + /* Iterate until the timeout expires or descriptors become ready. */ + for (;;) { + error = selscan(td, ibits, obits, nd); + if (error || td->td_retval[0] != 0) + break; + if (atv.tv_sec || atv.tv_usec) { + getmicrouptime(&rtv); + if (timevalcmp(&rtv, &atv, >=)) + break; + ttv = atv; + timevalsub(&ttv, &rtv); + timo = ttv.tv_sec > 24 * 60 * 60 ? + 24 * 60 * 60 * hz : tvtohz(&ttv); + } + error = seltdwait(td, timo); + if (error) + break; + error = selrescan(td, ibits, obits); + if (error || td->td_retval[0] != 0) + break; } - thread_unlock(td); - - if (timo > 0) - error = cv_timedwait_sig(&selwait, &sellock, timo); - else - error = cv_wait_sig(&selwait, &sellock); - - if (error == 0) - goto retry; + seltdclear(td); done: - clear_selinfo_list(td); - thread_lock(td); - td->td_flags &= ~TDF_SELECT; - thread_unlock(td); - mtx_unlock(&sellock); - -done_nosellock: /* select is not restarted after signals... */ if (error == ERESTART) error = EINTR; @@ -820,6 +827,60 @@ done_nosellock: return (error); } +/* + * Traverse the list of fds attached to this thread's seltd and check for + * completion. + */ +static int +selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits) +{ + struct seltd *stp; + struct selfd *sfp; + struct selfd *sfn; + struct selinfo *si; + struct file *fp; + int msk, fd; + int n = 0; + /* Note: backend also returns POLLHUP/POLLERR if appropriate. */ + static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND }; + struct filedesc *fdp = td->td_proc->p_fd; + + stp = td->td_sel; + FILEDESC_SLOCK(fdp); + STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) { + fd = (int)(uintptr_t)sfp->sf_cookie; + si = sfp->sf_si; + selfdfree(stp, sfp); + /* If the selinfo wasn't cleared the event didn't fire. */ + if (si != NULL) + continue; + if ((fp = fget_locked(fdp, fd)) == NULL) { + FILEDESC_SUNLOCK(fdp); + return (EBADF); + } + for (msk = 0; msk < 3; msk++) { + if (ibits[msk] == NULL) + continue; + if ((ibits[msk][fd/NFDBITS] & + ((fd_mask) 1 << (fd % NFDBITS))) == 0) + continue; + if (fo_poll(fp, flag[msk], td->td_ucred, td)) { + obits[msk][(fd)/NFDBITS] |= + ((fd_mask)1 << ((fd) % NFDBITS)); + n++; + } + } + } + FILEDESC_SUNLOCK(fdp); + stp->st_flags = 0; + td->td_retval[0] = n; + return (0); +} + +/* + * Perform the initial filedescriptor scan and register ourselves with + * each selinfo. + */ static int selscan(td, ibits, obits, nfd) struct thread *td; @@ -848,6 +909,7 @@ selscan(td, ibits, obits, nfd) FILEDESC_SUNLOCK(fdp); return (EBADF); } + selfdalloc(td, (void *)(uintptr_t)fd); if (fo_poll(fp, flag[msk], td->td_ucred, td)) { obits[msk][(fd)/NFDBITS] |= @@ -878,7 +940,7 @@ poll(td, uap) struct pollfd smallbits[32]; struct timeval atv, rtv, ttv; int error = 0, timo; - u_int ncoll, nfds; + u_int nfds; size_t ni; nfds = uap->nfds; @@ -894,8 +956,7 @@ poll(td, uap) if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) && (nfds > FD_SETSIZE)) { PROC_UNLOCK(td->td_proc); - error = EINVAL; - goto done2; + return (EINVAL); } PROC_UNLOCK(td->td_proc); ni = nfds * sizeof(struct pollfd); @@ -905,13 +966,13 @@ poll(td, uap) bits = smallbits; error = copyin(uap->fds, bits, ni); if (error) - goto done_nosellock; + goto done; if (uap->timeout != INFTIM) { atv.tv_sec = uap->timeout / 1000; atv.tv_usec = (uap->timeout % 1000) * 1000; if (itimerfix(&atv)) { error = EINVAL; - goto done_nosellock; + goto done; } getmicrouptime(&rtv); timevaladd(&atv, &rtv); @@ -920,56 +981,31 @@ poll(td, uap) atv.tv_usec = 0; } timo = 0; - TAILQ_INIT(&td->td_selq); - mtx_lock(&sellock); -retry: - ncoll = nselcoll; - thread_lock(td); - td->td_flags |= TDF_SELECT; - thread_unlock(td); - mtx_unlock(&sellock); - - error = pollscan(td, bits, nfds); - mtx_lock(&sellock); - if (error || td->td_retval[0]) - goto done; - if (atv.tv_sec || atv.tv_usec) { - getmicrouptime(&rtv); - if (timevalcmp(&rtv, &atv, >=)) - goto done; - ttv = atv; - timevalsub(&ttv, &rtv); - timo = ttv.tv_sec > 24 * 60 * 60 ? - 24 * 60 * 60 * hz : tvtohz(&ttv); - } - /* - * An event of interest may occur while we do not hold - * sellock, so check TDF_SELECT and the number of collisions - * and rescan the file descriptors if necessary. - */ - thread_lock(td); - if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { - thread_unlock(td); - goto retry; + seltdinit(td); + /* Iterate until the timeout expires or descriptors become ready. */ + for (;;) { + error = pollscan(td, bits, nfds); + if (error || td->td_retval[0] != 0) + break; + if (atv.tv_sec || atv.tv_usec) { + getmicrouptime(&rtv); + if (timevalcmp(&rtv, &atv, >=)) + break; + ttv = atv; + timevalsub(&ttv, &rtv); + timo = ttv.tv_sec > 24 * 60 * 60 ? + 24 * 60 * 60 * hz : tvtohz(&ttv); + } + error = seltdwait(td, timo); + if (error) + break; + error = pollrescan(td); + if (error || td->td_retval[0] != 0) + break; } - thread_unlock(td); - - if (timo > 0) - error = cv_timedwait_sig(&selwait, &sellock, timo); - else - error = cv_wait_sig(&selwait, &sellock); - - if (error == 0) - goto retry; + seltdclear(td); done: - clear_selinfo_list(td); - thread_lock(td); - td->td_flags &= ~TDF_SELECT; - thread_unlock(td); - mtx_unlock(&sellock); - -done_nosellock: /* poll is not restarted after signals... */ if (error == ERESTART) error = EINTR; @@ -983,17 +1019,60 @@ done_nosellock: out: if (ni > sizeof(smallbits)) free(bits, M_TEMP); -done2: return (error); } static int +pollrescan(struct thread *td) +{ + struct seltd *stp; + struct selfd *sfp; + struct selfd *sfn; + struct selinfo *si; + struct filedesc *fdp; + struct file *fp; + struct pollfd *fd; + int n; + + n = 0; + fdp = td->td_proc->p_fd; + stp = td->td_sel; + FILEDESC_SLOCK(fdp); + STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) { + fd = (struct pollfd *)sfp->sf_cookie; + si = sfp->sf_si; + selfdfree(stp, sfp); + /* If the selinfo wasn't cleared the event didn't fire. */ + if (si != NULL) + continue; + fp = fdp->fd_ofiles[fd->fd]; + if (fp == NULL) { + fd->revents = POLLNVAL; + n++; + continue; + } + /* + * Note: backend also returns POLLHUP and + * POLLERR if appropriate. + */ + fd->revents = fo_poll(fp, fd->events, td->td_ucred, td); + if (fd->revents != 0) + n++; + } + FILEDESC_SUNLOCK(fdp); + stp->st_flags = 0; + td->td_retval[0] = n; + return (0); +} + + +static int pollscan(td, fds, nfd) struct thread *td; struct pollfd *fds; u_int nfd; { - register struct filedesc *fdp = td->td_proc->p_fd; + struct filedesc *fdp = td->td_proc->p_fd; int i; struct file *fp; int n = 0; @@ -1015,6 +1094,7 @@ pollscan(td, fds, nfd) * Note: backend also returns POLLHUP and * POLLERR if appropriate. */ + selfdalloc(td, fds); fds->revents = fo_poll(fp, fds->events, td->td_ucred, td); if (fds->revents != 0) @@ -1048,23 +1128,90 @@ openbsd_poll(td, uap) } /* - * Remove the references to the thread from all of the objects we were - * polling. - * - * This code assumes that the underlying owner of the selinfo structure will - * hold sellock before it changes it, and that it will unlink itself from our - * list if it goes away. + * XXX This was created specifically to support netncp and netsmb. This + * allows the caller to specify a socket to wait for events on. It returns + * 0 if any events matched and an error otherwise. There is no way to + * determine which events fired. */ -void -clear_selinfo_list(td) - struct thread *td; +int +selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td) { - struct selinfo *si; + struct timeval atv, rtv, ttv; + int error, timo; + + if (tvp != NULL) { + atv = *tvp; + if (itimerfix(&atv)) + return (EINVAL); + getmicrouptime(&rtv); + timevaladd(&atv, &rtv); + } else { + atv.tv_sec = 0; + atv.tv_usec = 0; + } - mtx_assert(&sellock, MA_OWNED); - TAILQ_FOREACH(si, &td->td_selq, si_thrlist) - si->si_thread = NULL; - TAILQ_INIT(&td->td_selq); + timo = 0; + seltdinit(td); + /* + * Iterate until the timeout expires or the socket becomes ready. + */ + for (;;) { + selfdalloc(td, NULL); + error = sopoll(so, events, NULL, td); + /* error here is actually the ready events. */ + if (error) + return (0); + if (atv.tv_sec || atv.tv_usec) { + getmicrouptime(&rtv); + if (timevalcmp(&rtv, &atv, >=)) { + seltdclear(td); + return (EWOULDBLOCK); + } + ttv = atv; + timevalsub(&ttv, &rtv); + timo = ttv.tv_sec > 24 * 60 * 60 ? + 24 * 60 * 60 * hz : tvtohz(&ttv); + } + error = seltdwait(td, timo); + seltdclear(td); + if (error) + break; + } + /* XXX Duplicates ncp/smb behavior. */ + if (error == ERESTART) + error = 0; + return (error); +} + +/* + * Preallocate two selfds associated with 'cookie'. Some fo_poll routines + * have two select sets, one for read and another for write. + */ +static void +selfdalloc(struct thread *td, void *cookie) +{ + struct seltd *stp; + + stp = td->td_sel; + if (stp->st_free1 == NULL) + stp->st_free1 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO); + stp->st_free1->sf_td = stp; + stp->st_free1->sf_cookie = cookie; + if (stp->st_free2 == NULL) + stp->st_free2 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO); + stp->st_free2->sf_td = stp; + stp->st_free2->sf_cookie = cookie; +} + +static void +selfdfree(struct seltd *stp, struct selfd *sfp) +{ + STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link); + mtx_lock(sfp->sf_mtx); + if (sfp->sf_si) + TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads); + mtx_unlock(sfp->sf_mtx); + uma_zfree(selfd_zone, sfp); } /* @@ -1075,26 +1222,46 @@ selrecord(selector, sip) struct thread *selector; struct selinfo *sip; { + struct selfd *sfp; + struct seltd *stp; + struct mtx *mtxp; - mtx_lock(&sellock); + stp = selector->td_sel; + /* + * Don't record when doing a rescan. + */ + if (stp->st_flags & SELTD_RESCAN) + return; + /* + * Grab one of the preallocated descriptors. + */ + sfp = NULL; + if ((sfp = stp->st_free1) != NULL) + stp->st_free1 = NULL; + else if ((sfp = stp->st_free2) != NULL) + stp->st_free2 = NULL; + else + panic("selrecord: No free selfd on selq"); + mtxp = mtx_pool_find(mtxpool_sleep, sip); + /* + * Initialize the sfp and queue it in the thread. + */ + sfp->sf_si = sip; + sfp->sf_mtx = mtxp; + STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link); /* - * If the selinfo's thread pointer is NULL then take ownership of it. - * - * If the thread pointer is not NULL and it points to another - * thread, then we have a collision. - * - * If the thread pointer is not NULL and points back to us then leave - * it alone as we've already added pointed it at us and added it to - * our list. + * Now that we've locked the sip, check for initialization. */ - if (sip->si_thread == NULL) { - sip->si_thread = selector; - TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist); - } else if (sip->si_thread != selector) { - sip->si_flags |= SI_COLL; + mtx_lock(mtxp); + if (sip->si_mtx == NULL) { + sip->si_mtx = mtxp; + TAILQ_INIT(&sip->si_tdlist); } - - mtx_unlock(&sellock); + /* + * Add this thread to the list of selfds listening on this selinfo. + */ + TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads); + mtx_unlock(sip->si_mtx); } /* Wake up a selecting thread. */ @@ -1122,36 +1289,115 @@ doselwakeup(sip, pri) struct selinfo *sip; int pri; { - struct thread *td; + struct selfd *sfp; + struct selfd *sfn; + struct seltd *stp; - mtx_lock(&sellock); - td = sip->si_thread; - if ((sip->si_flags & SI_COLL) != 0) { - nselcoll++; - sip->si_flags &= ~SI_COLL; - cv_broadcastpri(&selwait, pri); - } - if (td == NULL) { - mtx_unlock(&sellock); + /* If it's not initialized there can't be any waiters. */ + if (sip->si_mtx == NULL) return; + /* + * Locking the selinfo locks all selfds associated with it. + */ + mtx_lock(sip->si_mtx); + TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) { + /* + * Once we remove this sfp from the list and clear the + * sf_si seltdclear will know to ignore this si. + */ + TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads); + sfp->sf_si = NULL; + stp = sfp->sf_td; + mtx_lock(&stp->st_mtx); + stp->st_flags |= SELTD_PENDING; + cv_broadcastpri(&stp->st_wait, pri); + mtx_unlock(&stp->st_mtx); } - TAILQ_REMOVE(&td->td_selq, sip, si_thrlist); - sip->si_thread = NULL; - thread_lock(td); - td->td_flags &= ~TDF_SELECT; - thread_unlock(td); - sleepq_remove(td, &selwait); - mtx_unlock(&sellock); + mtx_unlock(sip->si_mtx); } -static void selectinit(void *); -SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL) +static void +seltdinit(struct thread *td) +{ + struct seltd *stp; + + if ((stp = td->td_sel) != NULL) + goto out; + td->td_sel = stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO); + mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF); + cv_init(&stp->st_wait, "select"); +out: + stp->st_flags = 0; + STAILQ_INIT(&stp->st_selq); +} + +static int +seltdwait(struct thread *td, int timo) +{ + struct seltd *stp; + int error; -/* ARGSUSED*/ + stp = td->td_sel; + /* + * An event of interest may occur while we do not hold the seltd + * locked so check the pending flag before we sleep. + */ + mtx_lock(&stp->st_mtx); + /* + * Any further calls to selrecord will be a rescan. + */ + stp->st_flags |= SELTD_RESCAN; + if (stp->st_flags & SELTD_PENDING) { + mtx_unlock(&stp->st_mtx); + return (0); + } + if (timo > 0) + error = cv_timedwait_sig(&stp->st_wait, &stp->st_mtx, timo); + else + error = cv_wait_sig(&stp->st_wait, &stp->st_mtx); + mtx_unlock(&stp->st_mtx); + + return (error); +} + +void +seltdfini(struct thread *td) +{ + struct seltd *stp; + + stp = td->td_sel; + if (stp == NULL) + return; + if (stp->st_free1) + uma_zfree(selfd_zone, stp->st_free1); + if (stp->st_free2) + uma_zfree(selfd_zone, stp->st_free2); + td->td_sel = NULL; + free(stp, M_SELECT); +} + +/* + * Remove the references to the thread from all of the objects we were + * polling. + */ +static void +seltdclear(struct thread *td) +{ + struct seltd *stp; + struct selfd *sfp; + struct selfd *sfn; + + stp = td->td_sel; + STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) + selfdfree(stp, sfp); + stp->st_flags = 0; +} + +static void selectinit(void *); +SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL); static void -selectinit(dummy) - void *dummy; +selectinit(void *dummy __unused) { - cv_init(&selwait, "select"); - mtx_init(&sellock, "sellck", NULL, MTX_DEF); + selfd_zone = uma_zcreate("selfd", sizeof(struct selfd), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, 0); } http://cvsweb.FreeBSD.org/src/sys/kern/sys_pipe.c.diff?r1=1.193&r2=1.194 --- src/sys/kern/sys_pipe.c 2007/11/19 15:05:20 1.193 +++ src/sys/kern/sys_pipe.c 2007/12/16 06:21:19 1.194 @@ -524,8 +524,9 @@ pipeselwakeup(cpipe) PIPE_LOCK_ASSERT(cpipe, MA_OWNED); if (cpipe->pipe_state & PIPE_SEL) { - cpipe->pipe_state &= ~PIPE_SEL; selwakeuppri(&cpipe->pipe_sel, PSOCK); + if (!SEL_WAITING(&cpipe->pipe_sel)) + cpipe->pipe_state &= ~PIPE_SEL; } if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio) pgsigio(&cpipe->pipe_sigio, SIGIO, 0); @@ -1354,12 +1355,14 @@ pipe_poll(fp, events, active_cred, td) if (revents == 0) { if (events & (POLLIN | POLLRDNORM)) { selrecord(td, &rpipe->pipe_sel); - rpipe->pipe_state |= PIPE_SEL; + if (SEL_WAITING(&rpipe->pipe_sel)) + rpipe->pipe_state |= PIPE_SEL; } if (events & (POLLOUT | POLLWRNORM)) { selrecord(td, &wpipe->pipe_sel); - wpipe->pipe_state |= PIPE_SEL; + if (SEL_WAITING(&wpipe->pipe_sel)) + wpipe->pipe_state |= PIPE_SEL; } } #ifdef MAC http://cvsweb.FreeBSD.org/src/sys/kern/uipc_sockbuf.c.diff?r1=1.172&r2=1.173 --- src/sys/kern/uipc_sockbuf.c 2007/10/12 03:56:27 1.172 +++ src/sys/kern/uipc_sockbuf.c 2007/12/16 06:21:19 1.173 @@ -176,7 +176,8 @@ sowakeup(struct socket *so, struct sockb SOCKBUF_LOCK_ASSERT(sb); selwakeuppri(&sb->sb_sel, PSOCK); - sb->sb_flags &= ~SB_SEL; + if (!SEL_WAITING(&sb->sb_sel)) + sb->sb_flags &= ~SB_SEL; if (sb->sb_flags & SB_WAIT) { sb->sb_flags &= ~SB_WAIT; wakeup(&sb->sb_cc); http://cvsweb.FreeBSD.org/src/sys/netncp/ncp_rq.c.diff?r1=1.15&r2=1.16 --- src/sys/netncp/ncp_rq.c 2005/07/29 13:22:37 1.15 +++ src/sys/netncp/ncp_rq.c 2007/12/16 06:21:20 1.16 @@ -43,6 +43,8 @@ __FBSDID("$FreeBSD: /usr/local/www/cvsro #include <sys/mbuf.h> #include <sys/poll.h> #include <sys/proc.h> +#include <sys/socket.h> +#include <sys/socketvar.h> #include <sys/uio.h> #include <netncp/ncp.h> @@ -274,7 +276,9 @@ ncp_request_int(struct ncp_rq *rqp) /* * Flush out replies on previous reqs */ - while (ncp_poll(so, POLLIN) != 0) { + tv.tv_sec = 0; + tv.tv_usec = 0; + while (selsocket(so, POLLIN, &tv, td) == 0) { if (ncp_sock_recv(so, &m, &len) != 0) break; m_freem(m); @@ -319,7 +323,7 @@ ncp_request_int(struct ncp_rq *rqp) } tv.tv_sec = conn->li.timeout; tv.tv_usec = 0; - error = ncp_sock_rselect(so, td, &tv, POLLIN); + error = selsocket(so, POLLIN, &tv, td); if (error == EWOULDBLOCK ) /* timeout expired */ continue; error = ncp_chkintr(conn, td); @@ -335,7 +339,9 @@ ncp_request_int(struct ncp_rq *rqp) dosend = 1; /* resend rq if error */ for (;;) { error = 0; - if (ncp_poll(so, POLLIN) == 0) + tv.tv_sec = 0; + tv.tv_usec = 0; + if (selsocket(so, POLLIN, &tv, td) != 0) break; /* if (so->so_rcv.sb_cc == 0) { break; http://cvsweb.FreeBSD.org/src/sys/netncp/ncp_sock.c.diff?r1=1.19&r2=1.20 --- src/sys/netncp/ncp_sock.c 2007/06/05 00:00:55 1.19 +++ src/sys/netncp/ncp_sock.c 2007/12/16 06:21:20 1.20 @@ -65,7 +65,6 @@ __FBSDID("$FreeBSD: /usr/local/www/cvsro #define ipx_setnullhost(x) ((x).x_host.s_host[0] = 0); \ ((x).x_host.s_host[1] = 0); ((x).x_host.s_host[2] = 0); -/*int ncp_poll(struct socket *so, int events);*/ /*static int ncp_getsockname(struct socket *so, caddr_t asa, int *alen);*/ static int ncp_soconnect(struct socket *so, struct sockaddr *target, struct thread *td); @@ -181,110 +180,6 @@ ncp_sock_send(struct socket *so, struct return error; } -int -ncp_poll(struct socket *so, int events) -{ - struct thread *td = curthread; - int revents; - - /* Fake up enough state to look like we are in poll(2). */ - mtx_lock(&sellock); - thread_lock(td); - td->td_flags |= TDF_SELECT; - thread_unlock(td); - mtx_unlock(&sellock); - TAILQ_INIT(&td->td_selq); - - revents = sopoll(so, events, NULL, td); - - /* Tear down the fake poll(2) state. */ - mtx_lock(&sellock); - clear_selinfo_list(td); - thread_lock(td); - td->td_flags &= ~TDF_SELECT; - thread_unlock(td); - mtx_unlock(&sellock); - - return (revents); -} - -int -ncp_sock_rselect(struct socket *so, struct thread *td, struct timeval *tv, - int events) -{ - struct timeval atv, rtv, ttv; - int ncoll, timo, error, revents; - - if (tv) { - atv = *tv; - if (itimerfix(&atv)) { - error = EINVAL; - goto done_noproclock; - } - getmicrouptime(&rtv); - timevaladd(&atv, &rtv); - } - timo = 0; - mtx_lock(&sellock); - -retry: - ncoll = nselcoll; - thread_lock(td); - td->td_flags |= TDF_SELECT; - thread_unlock(td); - mtx_unlock(&sellock); - - TAILQ_INIT(&td->td_selq); - revents = sopoll(so, events, NULL, td); - mtx_lock(&sellock); - if (revents) { - error = 0; - goto done; - } - if (tv) { - getmicrouptime(&rtv); - if (timevalcmp(&rtv, &atv, >=)) { - error = EWOULDBLOCK; - goto done; - } - ttv = atv; - timevalsub(&ttv, &rtv); - timo = tvtohz(&ttv); - } - /* - * An event of our interest may occur during locking a thread. - * In order to avoid missing the event that occurred during locking - * the process, test TDF_SELECT and rescan file descriptors if - * necessary. - */ - thread_lock(td); - if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { - thread_unlock(td); - goto retry; - } - thread_unlock(td); - - if (timo > 0) - error = cv_timedwait(&selwait, &sellock, timo); - else { - cv_wait(&selwait, &sellock); - error = 0; - } - -done: - clear_selinfo_list(td); - - thread_lock(td); - td->td_flags &= ~TDF_SELECT; - thread_unlock(td); - mtx_unlock(&sellock); - -done_noproclock: - if (error == ERESTART) - error = 0; - return (error); -} - /* * Connect to specified server via IPX */ http://cvsweb.FreeBSD.org/src/sys/netncp/ncp_sock.h.diff?r1=1.6&r2=1.7 --- src/sys/netncp/ncp_sock.h 2005/01/07 01:45:49 1.6 +++ src/sys/netncp/ncp_sock.h 2007/12/16 06:21:20 1.7 @@ -45,9 +45,6 @@ int ncp_sock_connect(struct ncp_conn *n int ncp_sock_recv(struct socket *so, struct mbuf **mp, int *rlen); int ncp_sock_send(struct socket *so, struct mbuf *data, struct ncp_rq *rqp); int ncp_sock_disconnect(struct ncp_conn *conn); -int ncp_poll(struct socket *so, int events); -int ncp_sock_rselect(struct socket *so, struct thread *td, struct timeval *tv, - int events); int ncp_sock_checksum(struct ncp_conn *conn, int enable); void ncp_check_rq(struct ncp_conn *conn); http://cvsweb.FreeBSD.org/src/sys/netsmb/smb_trantcp.c.diff?r1=1.26&r2=1.27 --- src/sys/netsmb/smb_trantcp.c 2007/06/15 23:49:54 1.26 +++ src/sys/netsmb/smb_trantcp.c 2007/12/16 06:21:20 1.27 @@ -95,84 +95,6 @@ nb_setsockopt_int(struct socket *so, int } static int -nbssn_rselect(struct nbpcb *nbp, struct timeval *tv, int events, - struct thread *td) -{ - struct timeval atv, rtv, ttv; - int ncoll, timo, error, revents; - - if (tv) { - atv = *tv; - if (itimerfix(&atv)) { - error = EINVAL; - goto done_noproclock; - } - getmicrouptime(&rtv); - timevaladd(&atv, &rtv); - } - timo = 0; - mtx_lock(&sellock); -retry: - - ncoll = nselcoll; - thread_lock(td); - td->td_flags |= TDF_SELECT; - thread_unlock(td); - mtx_unlock(&sellock); - - /* XXX: Should be done when the thread is initialized. */ - TAILQ_INIT(&td->td_selq); - revents = sopoll(nbp->nbp_tso, events, NULL, td); - mtx_lock(&sellock); - if (revents) { - error = 0; - goto done; - } - if (tv) { - getmicrouptime(&rtv); - if (timevalcmp(&rtv, &atv, >=)) { - error = EWOULDBLOCK; - goto done; - } - ttv = atv; - timevalsub(&ttv, &rtv); - timo = tvtohz(&ttv); - } - /* - * An event of our interest may occur during locking a process. - * In order to avoid missing the event that occurred during locking - * the process, test P_SELECT and rescan file descriptors if - * necessary. - */ - thread_lock(td); - if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { - thread_unlock(td); - goto retry; - } - thread_unlock(td); - - if (timo > 0) - error = cv_timedwait(&selwait, &sellock, timo); - else { - cv_wait(&selwait, &sellock); - error = 0; - } - -done: - clear_selinfo_list(td); - - thread_lock(td); - td->td_flags &= ~TDF_SELECT; - thread_unlock(td); - mtx_unlock(&sellock); - -done_noproclock: - if (error == ERESTART) - return 0; - return error; -} - -static int nb_intr(struct nbpcb *nbp, struct proc *p) { return 0; @@ -302,7 +224,7 @@ nbssn_rq_request(struct nbpcb *nbp, stru if (error) return error; TIMESPEC_TO_TIMEVAL(&tv, &nbp->nbp_timo); - error = nbssn_rselect(nbp, &tv, POLLIN, td); + error = selsocket(nbp->nbp_tso, POLLIN, &tv, td); if (error == EWOULDBLOCK) { /* Timeout */ NBDEBUG("initial request timeout\n"); return ETIMEDOUT; http://cvsweb.FreeBSD.org/src/sys/sys/proc.h.diff?r1=1.497&r2=1.498 --- src/sys/sys/proc.h 2007/12/15 23:13:31 1.497 +++ src/sys/sys/proc.h 2007/12/16 06:21:20 1.498 @@ -142,7 +142,6 @@ struct pargs { * m - Giant * n - not locked, lazy * o - ktrace lock - * p - select lock (sellock) * q - td_contested lock * r - p_peers lock * t - thread lock @@ -210,7 +209,7 @@ struct thread { TAILQ_ENTRY(thread) td_slpq; /* (t) Sleep queue. */ TAILQ_ENTRY(thread) td_lockq; /* (t) Lock queue. */ - TAILQ_HEAD(, selinfo) td_selq; /* (p) List of selinfos. */ + struct seltd *td_sel; /* Select queue/channel. */ struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */ struct turnstile *td_turnstile; /* (k) Associated turnstile. */ struct umtx_q *td_umtxq; /* (c?) Link for when we're blocked. */ @@ -322,7 +321,7 @@ do { \ #define TDF_SINTR 0x00000008 /* Sleep is interruptible. */ #define TDF_TIMEOUT 0x00000010 /* Timing out during sleep. */ #define TDF_IDLETD 0x00000020 /* This is a per-CPU idle thread. */ -#define TDF_SELECT 0x00000040 /* Selecting; wakeup/waiting danger. */ +#define TDF_UNUSEDx40 0x00000040 /* --available-- */ #define TDF_SLEEPABORT 0x00000080 /* sleepq_abort was called. */ #define TDF_KTH_SUSP 0x00000100 /* kthread is suspended */ #define TDF_UBORROWING 0x00000200 /* Thread is borrowing user pri. */ http://cvsweb.FreeBSD.org/src/sys/sys/selinfo.h.diff?r1=1.18&r2=1.19 --- src/sys/sys/selinfo.h 2004/08/15 06:24:42 1.18 +++ src/sys/sys/selinfo.h 2007/12/16 06:21:20 1.19 @@ -35,26 +35,26 @@ #include <sys/event.h> /* for struct klist */ +struct selfd; +TAILQ_HEAD(selfdlist, selfd); + /* * Used to maintain information about processes that wish to be * notified when I/O becomes possible. */ struct selinfo { - TAILQ_ENTRY(selinfo) si_thrlist; /* list hung off of thread */ - struct thread *si_thread; /* thread waiting */ - struct knlist si_note; /* kernel note list */ - short si_flags; /* see below */ + struct selfdlist si_tdlist; /* List of sleeping threads. */ + struct knlist si_note; /* kernel note list */ + struct mtx *si_mtx; /* Lock for tdlist. */ }; -#define SI_COLL 0x0001 /* collision occurred */ -#define SEL_WAITING(si) \ - ((si)->si_thread != NULL || ((si)->si_flags & SI_COLL) != 0) +#define SEL_WAITING(si) (!TAILQ_EMPTY(&(si)->si_tdlist)) #ifdef _KERNEL -void clear_selinfo_list(struct thread *td); void selrecord(struct thread *selector, struct selinfo *sip); void selwakeup(struct selinfo *sip); void selwakeuppri(struct selinfo *sip, int pri); +void seltdfini(struct thread *td); #endif #endif /* !_SYS_SELINFO_H_ */ http://cvsweb.FreeBSD.org/src/sys/sys/socketvar.h.diff?r1=1.158&r2=1.159 --- src/sys/sys/socketvar.h 2007/05/03 14:42:42 1.158 +++ src/sys/sys/socketvar.h 2007/12/16 06:21:20 1.159 @@ -546,6 +546,8 @@ int sosetopt(struct socket *so, struct s int soshutdown(struct socket *so, int how); void sotoxsocket(struct socket *so, struct xsocket *xso); void sowakeup(struct socket *so, struct sockbuf *sb); +int selsocket(struct socket *so, int events, struct timeval *tv, + struct thread *td); #ifdef SOCKBUF_DEBUG void sblastrecordchk(struct sockbuf *, const char *, int); http://cvsweb.FreeBSD.org/src/sys/sys/systm.h.diff?r1=1.262&r2=1.263 --- src/sys/sys/systm.h 2007/12/04 12:28:07 1.262 +++ src/sys/sys/systm.h 2007/12/16 06:21:20 1.263 @@ -56,10 +56,6 @@ extern int kstack_pages; /* number of ke extern int nswap; /* size of swap space */ -extern u_int nselcoll; /* select collisions since boot */ -extern struct mtx sellock; /* select lock variable */ -extern struct cv selwait; /* select conditional variable */ - extern long physmem; /* physical memory */ extern long realmem; /* 'real' memory */ --------------050309040305010505040405--
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?47840D44.9090106>