Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 28 Nov 2019 23:46:33 +0200
From:      Konstantin Belousov <kostikbel@gmail.com>
To:        Willem Jan Withagen <wjw@digiware.nl>
Cc:        FreeBSD Hackers <freebsd-hackers@freebsd.org>, Eugene Grosbein <eugen@grosbein.net>
Subject:   Re: Process in T state does not want to die.....
Message-ID:  <20191128214633.GV10580@kib.kiev.ua>
In-Reply-To: <1ae7ad65-902c-8e5f-bcf1-1e98448c64bb@digiware.nl>
References:  <966f830c-bf09-3683-90da-e70aa343cc16@digiware.nl> <3c57e51d-fa36-39a3-9691-49698e8d2124@grosbein.net> <91490c30-45e9-3c38-c55b-12534fd09e28@digiware.nl> <20191128115122.GN10580@kib.kiev.ua> <296874db-40f0-c7c9-a573-410e4c86049a@digiware.nl> <20191128195013.GU10580@kib.kiev.ua> <1ae7ad65-902c-8e5f-bcf1-1e98448c64bb@digiware.nl>

next in thread | previous in thread | raw e-mail | index | archive | help
On Thu, Nov 28, 2019 at 09:52:50PM +0100, Willem Jan Withagen wrote:
>   # ps -o pid,lwp,flags,flags2,state,tracer,command -p 3532
>   PID    LWP        F       F2 STAT TRACER COMMAND
> 3532 103955 11080081 00000000 TsJ       0 ceph-osd -i 5
> 
> # procstat -kk 3532
>    PID    TID COMM                TDNAME              KSTACK
>   3532 103166 ceph-osd            log                 mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 103167 ceph-osd            service             mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 103168 ceph-osd            admin_socket        mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 103169 ceph-osd            msgr-worker-0       mi_switch+0xe2 
> thread_suspend_switch+0x140 thread_single+0x47b sigexit+0x53 
> postsig+0x304 ast+0x327 fast_syscall_common+0x198
>   3532 103170 ceph-osd            msgr-worker-1       mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 103171 ceph-osd            msgr-worker-2       mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 103172 ceph-osd            signal_handler      mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 103406 ceph-osd            OpHistorySvc        mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 103407 ceph-osd            -                   mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 103418 ceph-osd            safe_timer          mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 103419 ceph-osd            safe_timer          mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 103421 ceph-osd            safe_timer          mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 103427 ceph-osd            safe_timer          mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 103663 ceph-osd            fn_anonymous        mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 103675 ceph-osd            -                   mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 103677 ceph-osd            -                   mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 103678 ceph-osd            -                   mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 103679 ceph-osd            -                   mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 103680 ceph-osd            -                   mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 103681 ceph-osd            -                   mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 103682 ceph-osd            -                   mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 103683 ceph-osd            -                   mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 103684 ceph-osd            -                   mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 103685 ceph-osd            -                   mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 103955 ceph-osd            -                   mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 104621 ceph-osd            -                   mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 104826 ceph-osd            -                   mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 104827 ceph-osd            -                   mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 104828 ceph-osd            wb_throttle         mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 104829 ceph-osd            filestore_sync      mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 104830 ceph-osd            journal_write       mi_switch+0xe2 
> sleepq_wait+0x2c _sleep+0x247 bwillwrite+0x97 dofilewrite+0x93 
> sys_writev+0x6e amd64_syscall+0x364 fast_syscall_common+0x101
>   3532 104831 ceph-osd            fn_jrn_objstore     mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 104832 ceph-osd            tp_fstore_op        mi_switch+0xe2 
> thread_suspend_check+0x297 ast+0x3b9 doreti_ast+0x1f
>   3532 104833 ceph-osd            tp_fstore_op        mi_switch+0xe2 
> sleepq_wait+0x2c _sleep+0x247 bwillwrite+0x97 vn_open_cred+0xc8 
> zfs_setextattr+0x216 VOP_SETEXTATTR_APV+0x7c extattr_set_vp+0x11d 
> sys_extattr_set_fd+0xee amd64_syscall+0x364 fast_syscall_common+0x101
This last thread (tp_fstore_op, TID 104833) is an example of the cause of
your problem.

The thread is executing ZFS code: the zfs_setextattr() VOP, presumably to
do something with the extended attributes. There, it recurses into VFS to
open a file, and vn_open_cred() calls bwillwrite(), which sleeps until the
buffer space pressure subsides. That call is made because vn_open_cred()
is assumed to be called from the top level, not from inside VFS/fs code.

Until this thread finishes its operation and safely returns to the
kernel/user boundary, the process cannot exit.

There are two problems.  The first is this call to bwillwrite(), and it is
easy to get rid of; see the patch at the end of the message.  The second
is the question of why you have so many dirty buffers and why the situation
does not resolve itself.  Note that ZFS does not use the buffer cache, so
you must have some other very active filesystem that uses the buffer cache
and is somehow blocked on writes.

diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
index ebcc0ad92e0..ae37dd1fba1 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
@@ -5490,7 +5490,7 @@ vop_getextattr {
 	flags = FREAD;
 	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
 	    xvp, td);
-	error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
+	error = vn_open_cred(&nd, &flags, 0, VN_OPEN_INVFS, ap->a_cred, NULL);
 	vp = nd.ni_vp;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error != 0) {
@@ -5627,7 +5627,8 @@ vop_setextattr {
 	flags = FFLAGS(O_WRONLY | O_CREAT);
 	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
 	    xvp, td);
-	error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
+	error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred,
+	    NULL);
 	vp = nd.ni_vp;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error != 0) {
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index a0c018deb32..c69010dd999 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -219,7 +219,8 @@ vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
 			ndp->ni_cnd.cn_flags |= AUDITVNODE1;
 		if (vn_open_flags & VN_OPEN_NOCAPCHECK)
 			ndp->ni_cnd.cn_flags |= NOCAPCHECK;
-		bwillwrite();
+		if ((vn_open_flags & VN_OPEN_INVFS) == 0)
+			bwillwrite();
 		if ((error = namei(ndp)) != 0)
 			return (error);
 		if (ndp->ni_vp == NULL) {
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 8472bc0fb7b..27dbcbc58b1 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -579,6 +579,7 @@ typedef void vop_getpages_iodone_t(void *, vm_page_t *, int, int);
 #define	VN_OPEN_NOAUDIT		0x00000001
 #define	VN_OPEN_NOCAPCHECK	0x00000002
 #define	VN_OPEN_NAMECACHE	0x00000004
+#define	VN_OPEN_INVFS		0x00000008
 
 /*
  * Public vnode manipulation functions.



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20191128214633.GV10580>