Date: Sat, 22 Aug 1998 22:30:01 -0700 (PDT)
From: Matt Dillon <dillon@best.net>
To: freebsd-bugs@FreeBSD.ORG
Subject: Re: kern/7557: -current machine running Diablo, lockup, possible inode deadlock
Message-ID: <199808230530.WAA28407@freefall.freebsd.org>
The following reply was made to PR kern/7557; it has been noted by GNATS.
From: Matt Dillon <dillon@best.net>
To: freebsd-gnats-submit@freebsd.org
Cc:
Subject: Re: kern/7557: -current machine running Diablo, lockup, possible inode deadlock
Date: Sat, 22 Aug 1998 22:24:25 -0700 (PDT)
More information on the deadlock. I have two full-debug crash dumps. It's
hard to track things down, but both crash dumps have an interesting
commonality. In both instances, in addition to all the processes stuck
in inode locks, there is a process stuck in getblk AND a process stuck
in pgtblk. The getblk wait occurs when bp->b_flags & B_BUSY is set, and
the pgtblk wait occurs when m->flags & PG_BUSY is set (m being a vm_page_t).
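For reference, here is a rough sketch of the two sleep sites involved,
paraphrased from memory rather than cut from vfs_bio.c (so the exact
flags and priorities may be off), just to show what the two wchans mean:

    /* getblk(): the buffer is on the hash but another process owns it */
    if (bp->b_flags & B_BUSY) {
            bp->b_flags |= B_WANTED;
            /* sleep on the buf itself -- the "getblk" wchan above */
            tsleep(bp, PRIBIO + 4, "getblk", slptimeo);
    }

    /* allocbuf(): a VM page backing the buffer is busy elsewhere */
    if (m->flags & PG_BUSY) {
            m->flags |= PG_WANTED;
            /* sleep on the vm_page_t -- the "pgtblk" wchan above */
            tsleep(m, PVM, "pgtblk", 0);
    }
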
nntp3:/var/crash# ps -M vmcore.6 -N kernel.6 -axl | less
UID PID PPID CPU PRI NI VSZ RSS WCHAN STAT TT TIME COMMAND
...
8 10400 198 0 -2 0 43780 0 getblk D ?? 0:00.00 (diablo)
8 10419 198 0 -18 0 43788 0 pgtblk D ?? 0:00.00 (diablo)
nntp3:/var/crash# ps -M vmcore.7 -N kernel.7 -axl | less
UID PID PPID CPU PRI NI VSZ RSS WCHAN STAT TT TIME COMMAND
...
8 319 198 1 -2 0 44312 0 getblk D ?? 0:00.00 (diablo)
8 280 198 1 -18 0 44312 0 pgtblk D ?? 0:00.00 (diablo)
(kgdb) proc 319
(kgdb) back
#0 mi_switch () at ../../kern/kern_synch.c:661
#1 0xf0119fb1 in tsleep (ident=0xf6e400b0, priority=0x14,
wmesg=0xf013432f "getblk", timo=0x0) at ../../kern/kern_synch.c:435
#2 0xf01343dd in getblk (vp=0xfc24d180, blkno=0xc, size=0x2000, slpflag=0x0,
slptimeo=0x0) at ../../kern/vfs_bio.c:1437
#3 0xf01366fb in cluster_read (vp=0xfc24d180, filesize=0x4034b2, lblkno=0xc,
size=0x2000, cred=0x0, totread=0x10000, seqcount=0x8, bpp=0xfc10fd60)
at ../../kern/vfs_cluster.c:114
#4 0xf01ac721 in ffs_read (ap=0xfc10fe18) at ../../ufs/ufs/ufs_readwrite.c:168
#5 0xf01ad2bd in ffs_getpages (ap=0xfc10fe70) at vnode_if.h:303
#6 0xf01c386a in vnode_pager_getpages (object=0xfc3f0220, m=0xfc10ff1c,
count=0x2, reqpage=0x0) at vnode_if.h:1067
#7 0xf01c2587 in vm_pager_get_pages (object=0xfc3f0220, m=0xfc10ff1c,
count=0x2, reqpage=0x0) at ../../vm/vm_pager.c:256
#8 0xf01b6f34 in vm_fault (map=0xfc073380, vaddr=0x22897000, fault_type=0x1,
fault_flags=0x0) at ../../vm/vm_fault.c:424
#9 0xf01daca2 in trap_pfault (frame=0xfc10ffbc, usermode=0x1)
at ../../i386/i386/trap.c:753
#10 0xf01da7e3 in trap (frame={tf_es = 0xefbf0027, tf_ds = 0xfc100027,
tf_edi = 0x1, tf_esi = 0x17fea, tf_ebp = 0xefbfd58c,
tf_isp = 0xfc10ffe4, tf_ebx = 0x18000, tf_edx = 0x2287f000,
tf_ecx = 0x0, tf_eax = 0x9cf7f, tf_trapno = 0xc, tf_err = 0x4,
tf_eip = 0x414c, tf_cs = 0x1f, tf_eflags = 0x10297, tf_esp = 0xefbfd520,
tf_ss = 0x27}) at ../../i386/i386/trap.c:317
#11 0x414c in ?? ()
#12 0x276e in ?? ()
#13 0x1ee1 in ?? ()
#14 0x1809 in ?? ()
#15 0x107e in ?? ()
(kgdb) frame 2
#2 0xf01343dd in getblk (vp=0xfc24d180, blkno=0xc, size=0x2000, slpflag=0x0,
slptimeo=0x0) at ../../kern/vfs_bio.c:1437
1437 if (!tsleep(bp,
(kgdb) print bp
$11 = (struct buf *) 0xf6e400b0
(kgdb) print *bp
$12 = {
b_hash = {
le_next = 0x0,
le_prev = 0xf6e2f0f8
},
b_vnbufs = {
le_next = 0xf6e4f258,
le_prev = 0xfc24d1b0
},
b_freelist = {
tqe_next = 0xf6d77f08,
tqe_prev = 0xf0202158
},
b_act = {
tqe_next = 0x0,
tqe_prev = 0xf1ca0e14
},
b_proc = 0x0,
b_flags = 0x20800030,
b_qindex = 0x0,
b_usecount = 0x6,
b_error = 0x0,
b_bufsize = 0x0,
b_bcount = 0x0,
b_resid = 0x0,
b_dev = 0xffffffff,
b_data = 0xf95ae000 <Address 0xf95ae000 out of bounds>,
b_kvabase = 0xf95ae000 <Address 0xf95ae000 out of bounds>,
b_kvasize = 0x2000,
b_lblkno = 0xc,
b_blkno = 0xc,
b_offset = 0x0000000000018000,
b_iodone = 0,
b_iodone_chain = 0x0,
b_vp = 0xfc24d180,
b_dirtyoff = 0x0,
b_dirtyend = 0x0,
b_rcred = 0x0,
b_wcred = 0x0,
b_validoff = 0x0,
b_validend = 0x0,
b_pblkno = 0x9804d0,
b_saveaddr = 0x0,
b_savekva = 0x0,
b_driver1 = 0x0,
b_driver2 = 0x0,
b_spc = 0x0,
b_cluster = {
cluster_head = {
tqh_first = 0xf6d77f08,
tqh_last = 0xf6d850e8
},
cluster_entry = {
tqe_next = 0xf6d77f08,
tqe_prev = 0xf6d850e8
}
},
b_pages = {0x0 <repeats 32 times>},
b_npages = 0x0,
b_dep = {
lh_first = 0x0
}
}
(kgdb) proc 280
(kgdb) back
#0 mi_switch () at ../../kern/kern_synch.c:661
#1 0xf0119fb1 in tsleep (ident=0xf0e19ba0, priority=0x4,
wmesg=0xf01346d2 "pgtblk", timo=0x0) at ../../kern/kern_synch.c:435
#2 0xf0134afa in allocbuf (bp=0xf6e400b0, size=0x2000)
at ../../kern/vfs_bio.c:1799
#3 0xf0134612 in getblk (vp=0xfc24d180, blkno=0xc, size=0x2000, slpflag=0x0,
slptimeo=0x0) at ../../kern/vfs_bio.c:1557
#4 0xf0136a5e in cluster_read (vp=0xfc24d180, filesize=0x4034b2, lblkno=0xc,
size=0x2000, cred=0x0, totread=0xc000, seqcount=0x8, bpp=0xfc0f6d60)
at ../../kern/vfs_cluster.c:235
#5 0xf01ac721 in ffs_read (ap=0xfc0f6e18) at ../../ufs/ufs/ufs_readwrite.c:168
#6 0xf01ad2bd in ffs_getpages (ap=0xfc0f6e70) at vnode_if.h:303
#7 0xf01c386a in vnode_pager_getpages (object=0xfc3f0220, m=0xfc0f6f1c,
count=0x2, reqpage=0x0) at vnode_if.h:1067
#8 0xf01c2587 in vm_pager_get_pages (object=0xfc3f0220, m=0xfc0f6f1c,
count=0x2, reqpage=0x0) at ../../vm/vm_pager.c:256
#9 0xf01b6f34 in vm_fault (map=0xfc0738c0, vaddr=0x22891000, fault_type=0x1,
fault_flags=0x0) at ../../vm/vm_fault.c:424
#10 0xf01daca2 in trap_pfault (frame=0xfc0f6fbc, usermode=0x1)
at ../../i386/i386/trap.c:753
#11 0xf01da7e3 in trap (frame={tf_es = 0xefbf0027, tf_ds = 0xfc0f0027,
tf_edi = 0x1, tf_esi = 0x11fd1, tf_ebp = 0xefbfd58c,
tf_isp = 0xfc0f6fe4, tf_ebx = 0x12000, tf_edx = 0x2287f000,
tf_ecx = 0x0, tf_eax = 0x9cf7f, tf_trapno = 0xc, tf_err = 0x4,
tf_eip = 0x414c, tf_cs = 0x1f, tf_eflags = 0x10297, tf_esp = 0xefbfd520,
tf_ss = 0x27}) at ../../i386/i386/trap.c:317
#12 0x414c in ?? ()
#13 0x276e in ?? ()
#14 0x1ee1 in ?? ()
#15 0x1809 in ?? ()
#16 0x107e in ?? ()
(kgdb)
(kgdb) print bp
$14 = (struct buf *) 0xf6e400b0 (this is the same bp)
The deadlock doesn't occur here, but it seems odd. It's impossible to
determine what is causing the deadlock: if I track down the inode lock
chains I always get to a process that is waiting for an exclusive lock
on a shared-locked inode (with 2 references, even!), and I cannot
determine who is holding the shared lock(s) to track it down further.
inode f1d27400 lock holder 0xCC pid 204
204 fbfb25c0 fc01f000 8 200 200 000105 S dreaderd inode f1d27200
inode f1d27200 lock holder 0xFC pid 252
252 fc098d40 fc0da000 8 198 198 000105 S diablo inode f1d2ca00
inode f1d2ca00 lock holder 0xFE pid 254
254 fc098ac0 fc0e2000 8 198 198 000105 S diablo inode f1d58000
lock holder 0xF1 241
241 fc0999c0 fc0b3000 8 198 198 000105 S diablo inode f1f25000
lock holder 0x13e (318)
318 fc097d00 fc10b000 8 198 198 000105 S diablo inode f21e6e00
shared lock, share cnt 2, waitcnt 1
Here I'm stuck: I can't tell who is holding the shared lock on
inode f21e6e00. However, this inode is associated with vnode
fc24d180, which happens to be the vnode under which the bp stuck
in pgtblk and getblk in the other two processes lives, so maybe
those two are the ones holding the shared lock.
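Part of the difficulty is that the lockmgr lock embedded in the inode
does not record who its shared holders are. Roughly, and this is from
memory rather than the actual sys/lock.h (field names approximate),
the relevant state is just:

    struct lock {
            ...
            int     lk_sharecount;     /* # of shared holders -- a count only, no pids */
            int     lk_waitcount;      /* # of processes sleeping on this lock */
            short   lk_exclusivecount; /* recursion count of the exclusive holder */
            pid_t   lk_lockholder;     /* pid recorded for the EXCLUSIVE holder only */
            ...
    };

so a shared holder can only be found by inspecting every process's
stack and open files, not by looking at the lock itself.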
Whatever the case, the result is a chain reaction that puts just
about every process in the system in an inode wait state.
-Matt
Matthew Dillon Engineering, HiWay Technologies, Inc. & BEST Internet
Communications.
<dillon@best.net> (Please include portions of article in any response)
