Date: Wed, 20 Oct 2004 17:18:23 -0400 From: John Baldwin <jhb@FreeBSD.org> To: Daniel Eischen <deischen@FreeBSD.org> Cc: threads@FreeBSD.org Subject: Infinite loop bug in libc_r on 4.x with condition variables and signals Message-ID: <200410201718.23862.jhb@FreeBSD.org>
next in thread | raw e-mail | index | archive | help
We are trying to run mono on 4.x and are having problems with the process getting stuck spinning in an infinite loop. After some debugging, we determined that the problem is that the condition variable thread queues are getting corrupted due to threads being added to a queue while they are already queued on another queue. For example, if a thread is somehow on c1's queue but runs and blocks on c2, later when c1 tries to do a broadcast, it tries to remove all the waiters to wake them up doing something like: while ((head = TAILQ_FIRST(&c1->c_queue)) != NULL) { } The problem is that since the thread was last added to c2's queue, his tqe_prev pointer in his sqe TAILQ_ENTRY points to an item on c2's list, and thus the c_queue.tqe_next pointer doesn't get updated by TAILQ_REMOVE, so the thread just "sticks" on c1's head pointer and it spins forever. We seemed to have tracked this down to some sort of bug related to signals and condition variables. It seems that we try to go handle a signal while we are on a condition variable queue, but not in PS_COND_WAIT, so _cond_wait_backout() is not called to remove the thread from the queue. I tried deferring signals around the cond queue manipulations in cond_wait() and cond_timedwait() but we are still seeing the problem. The patches we currently are using (including debug cruft) are below. Right now we see the assertion in _thread_sig_wrapper() firing, but if I remove that, one of the assertions in the condition variable code that check for threads not being on the right condition variable queue trigger instead. Does anyone have any other ideas of how a thread could catch a signal while PS_RUNNING and on a condition variable queue? (I'm also worried that the wait() functions assume that if the thread is interrupted, its always not on the queue, but that doesn't seem to be the case for pthread_cancel() for example.) Index: Makefile =================================================================== RCS file: /usr/cvs/src/lib/libc_r/Makefile,v retrieving revision 1.24.2.7 diff -u -r1.24.2.7 Makefile --- Makefile 22 Oct 2002 14:44:02 -0000 1.24.2.7 +++ Makefile 14 Oct 2004 23:33:42 -0000 @@ -14,7 +14,7 @@ # Uncomment this if you want libc_r to contain debug information for # thread locking. -CFLAGS+=-D_LOCK_DEBUG +CFLAGS+=-D_LOCK_DEBUG -ggdb # enable extra internal consistancy checks CFLAGS+=-D_PTHREADS_INVARIANTS Index: uthread/pthread_private.h =================================================================== RCS file: /usr/cvs/src/lib/libc_r/uthread/pthread_private.h,v retrieving revision 1.36.2.21 diff -u -r1.36.2.21 pthread_private.h --- uthread/pthread_private.h 22 Oct 2002 14:44:02 -0000 1.36.2.21 +++ uthread/pthread_private.h 14 Oct 2004 22:24:00 -0000 @@ -744,6 +744,7 @@ */ TAILQ_ENTRY(pthread) pqe; /* priority queue link */ TAILQ_ENTRY(pthread) sqe; /* synchronization queue link */ + TAILQ_ENTRY(pthread) cqe; /* condition variable queue link */ TAILQ_ENTRY(pthread) qe; /* all other queues link */ /* Wait data. */ Index: uthread/uthread_cond.c =================================================================== RCS file: /usr/cvs/src/lib/libc_r/uthread/uthread_cond.c,v retrieving revision 1.22.2.8 diff -u -r1.22.2.8 uthread_cond.c --- uthread/uthread_cond.c 22 Oct 2002 14:44:02 -0000 1.22.2.8 +++ uthread/uthread_cond.c 15 Oct 2004 21:55:48 -0000 @@ -37,6 +37,8 @@ #include <pthread.h> #include "pthread_private.h" +#define DEFER + /* * Prototypes */ @@ -195,6 +197,14 @@ * signal handler. */ do { +#ifdef DEFER + /* + * Defer signals to protect the scheduling queues + * from access by the signal handler: + */ + _thread_kern_sig_defer(); +#endif + /* Lock the condition variable structure: */ _SPINLOCK(&(*cond)->lock); @@ -270,6 +280,7 @@ * after handling a signal. */ if (interrupted != 0) { + PTHREAD_ASSERT_NOT_IN_SYNCQ(curthread); /* * Lock the mutex and ignore any * errors. Note that even @@ -314,6 +325,13 @@ if ((interrupted != 0) && (curthread->continuation != NULL)) curthread->continuation((void *) curthread); +#ifdef DEFER + /* + * Undefer and handle pending signals, yielding if + * necessary: + */ + _thread_kern_sig_undefer(); +#endif } while ((done == 0) && (rval == 0)); _thread_leave_cancellation_point(); @@ -354,6 +372,13 @@ * signal handler. */ do { +#ifdef DEFER + /* + * Defer signals to protect the scheduling queues + * from access by the signal handler: + */ + _thread_kern_sig_defer(); +#endif /* Lock the condition variable structure: */ _SPINLOCK(&(*cond)->lock); @@ -431,6 +456,7 @@ * after handling a signal. */ if (interrupted != 0) { + PTHREAD_ASSERT_NOT_IN_SYNCQ(curthread); /* * Lock the mutex and ignore any * errors. Note that even @@ -484,6 +510,13 @@ if ((interrupted != 0) && (curthread->continuation != NULL)) curthread->continuation((void *) curthread); +#ifdef DEFER + /* + * Undefer and handle pending signals, yielding if + * necessary: + */ + _thread_kern_sig_undefer(); +#endif } while ((done == 0) && (rval == 0)); _thread_leave_cancellation_point(); @@ -671,8 +704,12 @@ pthread_t pthread; while ((pthread = TAILQ_FIRST(&cond->c_queue)) != NULL) { - TAILQ_REMOVE(&cond->c_queue, pthread, sqe); + PTHREAD_ASSERT(pthread->data.cond == cond, "cond_queue_deq: mismatched condition variables"); + PTHREAD_ASSERT(pthread->cqe.tqe_prev == &TAILQ_FIRST(&cond->c_queue), "cond_queue_deq: elem doesn't match"); + PTHREAD_ASSERT(pthread->flags & PTHREAD_FLAGS_IN_CONDQ, "cond_queue_deq: condq flag not set"); + TAILQ_REMOVE(&cond->c_queue, pthread, cqe); pthread->flags &= ~PTHREAD_FLAGS_IN_CONDQ; + pthread->data.cond = NULL; if ((pthread->timeout == 0) && (pthread->interrupted == 0)) /* * Only exit the loop when we find a thread @@ -693,6 +730,9 @@ static inline void cond_queue_remove(pthread_cond_t cond, pthread_t pthread) { + pthread_t foo; + int found; + /* * Because pthread_cond_timedwait() can timeout as well * as be signaled by another thread, it is necessary to @@ -700,8 +740,28 @@ * it isn't in the queue. */ if (pthread->flags & PTHREAD_FLAGS_IN_CONDQ) { - TAILQ_REMOVE(&cond->c_queue, pthread, sqe); + PTHREAD_ASSERT(pthread->data.cond == cond, + "cond_queue_remove: mismatched condition variables"); + found = 0; + TAILQ_FOREACH(foo, &cond->c_queue, cqe) + if (foo == pthread) + found++; + PTHREAD_ASSERT(found != 0, "thread not on queue"); + PTHREAD_ASSERT(found <= 1, "thread on queue more than once"); + + if (TAILQ_FIRST(&cond->c_queue) == pthread) + PTHREAD_ASSERT(pthread->cqe.tqe_prev == + &TAILQ_FIRST(&cond->c_queue), + "cond_queue_remove: elem doesn't match"); + else + TAILQ_FOREACH(foo, &cond->c_queue, cqe) + if (TAILQ_NEXT(foo, cqe) == pthread) + PTHREAD_ASSERT(pthread->cqe.tqe_prev == + &foo->cqe.tqe_next, + "cond_queue_remove: elem doesn't match"); + TAILQ_REMOVE(&cond->c_queue, pthread, cqe); pthread->flags &= ~PTHREAD_FLAGS_IN_CONDQ; + pthread->data.cond = NULL; } } @@ -713,21 +773,25 @@ cond_queue_enq(pthread_cond_t cond, pthread_t pthread) { pthread_t tid = TAILQ_LAST(&cond->c_queue, cond_head); + pthread_t foo; PTHREAD_ASSERT_NOT_IN_SYNCQ(pthread); + TAILQ_FOREACH(foo, &cond->c_queue, cqe) + PTHREAD_ASSERT(pthread != foo, "thread already on queue"); + /* * For the common case of all threads having equal priority, * we perform a quick check against the priority of the thread * at the tail of the queue. */ if ((tid == NULL) || (pthread->active_priority <= tid->active_priority)) - TAILQ_INSERT_TAIL(&cond->c_queue, pthread, sqe); + TAILQ_INSERT_TAIL(&cond->c_queue, pthread, cqe); else { tid = TAILQ_FIRST(&cond->c_queue); while (pthread->active_priority <= tid->active_priority) - tid = TAILQ_NEXT(tid, sqe); - TAILQ_INSERT_BEFORE(tid, pthread, sqe); + tid = TAILQ_NEXT(tid, cqe); + TAILQ_INSERT_BEFORE(tid, pthread, cqe); } pthread->flags |= PTHREAD_FLAGS_IN_CONDQ; pthread->data.cond = cond; Index: uthread/uthread_mutex.c =================================================================== RCS file: /usr/cvs/src/lib/libc_r/uthread/uthread_mutex.c,v retrieving revision 1.20.2.8 diff -u -r1.20.2.8 uthread_mutex.c --- uthread/uthread_mutex.c 22 Oct 2002 14:44:03 -0000 1.20.2.8 +++ uthread/uthread_mutex.c 20 Oct 2004 20:18:41 -0000 @@ -59,6 +59,8 @@ #define _MUTEX_ASSERT_NOT_OWNED(m) #endif +#define DEFER + /* * Prototypes */ @@ -748,6 +750,11 @@ struct pthread *curthread = _get_curthread(); int ret = 0; +#ifdef DEFER + if (add_reference) + PTHREAD_ASSERT(curthread->sig_defer_count == 1, + "lost defer count start"); +#endif if (mutex == NULL || *mutex == NULL) { ret = EINVAL; } else { @@ -755,6 +762,9 @@ * Defer signals to protect the scheduling queues from * access by the signal handler: */ +#ifdef DEFER + if (!add_reference) +#endif _thread_kern_sig_defer(); /* Lock the mutex structure: */ @@ -1064,8 +1074,16 @@ * Undefer and handle pending signals, yielding if * necessary: */ +#ifdef DEFER + if (!add_reference) +#endif _thread_kern_sig_undefer(); } +#ifdef DEFER + if (add_reference) + PTHREAD_ASSERT(curthread->sig_defer_count == 1, + "lost defer count finish"); +#endif /* Return the completion status: */ return (ret); Index: uthread/uthread_sig.c =================================================================== RCS file: /usr/cvs/src/lib/libc_r/uthread/uthread_sig.c,v retrieving revision 1.25.2.13 diff -u -r1.25.2.13 uthread_sig.c --- uthread/uthread_sig.c 22 Oct 2002 14:44:03 -0000 1.25.2.13 +++ uthread/uthread_sig.c 15 Oct 2004 22:29:18 -0000 @@ -1007,6 +1007,10 @@ break; } } +#if 1 + PTHREAD_ASSERT((thread->flags & PTHREAD_FLAGS_IN_CONDQ) == 0, + "still on cond queue"); +#endif /* Unblock the signal in case we don't return from the handler: */ _thread_sigq[psf->signo - 1].blocked = 0; -- John Baldwin <jhb@FreeBSD.org> <>< http://www.FreeBSD.org/~jhb/ "Power Users Use the Power to Serve" = http://www.FreeBSD.org
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200410201718.23862.jhb>