Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 24 Feb 2016 15:18:18 +0200
From:      Konstantin Belousov <kostikbel@gmail.com>
To:        Paul Koch <paul.koch@akips.com>
Cc:        stable@freebsd.org
Subject:   Re: 10.2 - Process stuck in unkillable sleep
Message-ID:  <20160224131818.GO91220@kib.kiev.ua>
In-Reply-To: <20160224142619.6710b6c1@akips.com>
References:  <20160224142619.6710b6c1@akips.com>

next in thread | previous in thread | raw e-mail | index | archive | help
On Wed, Feb 24, 2016 at 02:26:19PM +1000, Paul Koch wrote:
> 
> Occasionally we see a process get stuck in an unkillable state and
> the only solution is a hard reboot.
> 
> Occasionally == once every two weeks across 60+ servers, which are spread
> across the globe in customer sites.  We have no remote access to these boxes.
> 
> The process that most often gets stuck, though not the only one, is a large
> scale Ping/SNMP poller.  It is a fairly simplistic C program that just fires
> out lots of ping (raw ICMP socket) and SNMP (UDP socket) requests
> asynchronously.
> 
> We've managed to trap the problem a few times on a test server running in
> VirtualBox, but it also occurs on customer sites who run VMware, Hyper-V,
> QEMU and on bare metal.
> 
> 
> We raised this PR
>  https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=204081
> 
> but suspect it is a similar/same issue as
>  https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=200992
> 
> This is the info we've gathered from the most recent time it has occurred:
> 
> 
> # uname -a
> FreeBSD shed153.akips.com 10.2-RELEASE-p12 FreeBSD 10.2-RELEASE-p12 #0 r295070:
> Sat Jan 30 20:03:44 UTC 2016  root@shed21.akips.com:/usr/obj/usr/src/sys/GENERIC amd64

> # ps auxww | grep nm-poller
> akips    1014   0.0  2.6 871820 106540  -  Ds   10Feb16  1078:59.06 nm-poller
> 
> 
> # procstat -k 1014 
>   PID    TID COMM       TDNAME   KSTACK                       
>  1014 100365 nm-poller  -        mi_switch sleepq_timedwait_sig _cv_timedwait_sig_sbt seltdwait kern_select sys_select amd64_syscall Xfast_syscall 
> 

Yes, on HEAD it was reported that https://reviews.freebsd.org/D5221
fixed the problem.  That change is still not reviewed.

I did a back-port to stable/10; the patch below is probably not applicable
to 10.2, so you would need 10.3 for it.  Some revisions are missing from
stable/10, but I think that the issue worked around in the patch is at
the core of the troubles many people reported.

Index: sys/kern/kern_timeout.c
===================================================================
--- sys/kern/kern_timeout.c	(revision 295966)
+++ sys/kern/kern_timeout.c	(working copy)
@@ -1127,7 +1127,7 @@ _callout_stop_safe(c, safe)
 	 * Some old subsystems don't hold Giant while running a callout_stop(),
 	 * so just discard this check for the moment.
 	 */
-	if (!safe && c->c_lock != NULL) {
+	if ((safe & CS_DRAIN) == 0 && c->c_lock != NULL) {
 		if (c->c_lock == &Giant.lock_object)
 			use_lock = mtx_owned(&Giant);
 		else {
@@ -1207,7 +1207,7 @@ again:
 			return (0);
 		}
 
-		if (safe) {
+		if ((safe & CS_DRAIN) != 0) {
 			/*
 			 * The current callout is running (or just
 			 * about to run) and blocking is allowed, so
@@ -1319,7 +1319,7 @@ again:
 			CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p",
 			    c, c->c_func, c->c_arg);
 			CC_UNLOCK(cc);
-			return (0);
+			return ((safe & CS_MIGRBLOCK) != 0);
 		}
 		CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
 		    c, c->c_func, c->c_arg);
Index: sys/kern/subr_sleepqueue.c
===================================================================
--- sys/kern/subr_sleepqueue.c	(revision 295966)
+++ sys/kern/subr_sleepqueue.c	(working copy)
@@ -572,7 +572,8 @@ sleepq_check_timeout(void)
 	 * another CPU, so synchronize with it to avoid having it
 	 * accidentally wake up a subsequent sleep.
 	 */
-	else if (callout_stop(&td->td_slpcallout) == 0) {
+	else if (_callout_stop_safe(&td->td_slpcallout, CS_MIGRBLOCK)
+	    == 0) {
 		td->td_flags |= TDF_TIMEOUT;
 		TD_SET_SLEEPING(td);
 		mi_switch(SW_INVOL | SWT_SLEEPQTIMO, NULL);
Index: sys/sys/callout.h
===================================================================
--- sys/sys/callout.h	(revision 295966)
+++ sys/sys/callout.h	(working copy)
@@ -62,6 +62,9 @@ struct callout_handle {
 	struct callout *callout;
 };
 
+#define	CS_DRAIN		0x0001
+#define	CS_MIGRBLOCK		0x0002
+
 #ifdef _KERNEL
 /* 
  * Note the flags field is actually *two* fields. The c_flags
@@ -81,7 +84,7 @@ struct callout_handle {
  */
 #define	callout_active(c)	((c)->c_flags & CALLOUT_ACTIVE)
 #define	callout_deactivate(c)	((c)->c_flags &= ~CALLOUT_ACTIVE)
-#define	callout_drain(c)	_callout_stop_safe(c, 1)
+#define	callout_drain(c)	_callout_stop_safe(c, CS_DRAIN)
 void	callout_init(struct callout *, int);
 void	_callout_init_lock(struct callout *, struct lock_object *, int);
 #define	callout_init_mtx(c, mtx, flags)					\




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20160224131818.GO91220>