Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 3 Feb 1998 17:24:14 -0800 (PST)
From:      Matt Dillon <dillon@best.net>
To:        FreeBSD-gnats-submit@FreeBSD.ORG
Subject:   kern/5639: Another crash tracked down to biowait wakeup / corrupt run queue
Message-ID:  <199802040124.RAA02710@flea.best.net>

next in thread | raw e-mail | index | archive | help

>Number:         5639
>Category:       kern
>Synopsis:       Another crash tracked down to biowait wakeup / corrupt run queue
>Confidential:   no
>Severity:       critical
>Priority:       high
>Responsible:    freebsd-bugs
>State:          open
>Quarter:
>Keywords:
>Date-Required:
>Class:          sw-bug
>Submitter-Id:   current-users
>Arrival-Date:   Tue Feb  3 17:30:01 PST 1998
>Last-Modified:
>Originator:     Matt Dillon
>Organization:
BEST Internet communications
>Release:        FreeBSD 2.2.5-STABLE i386
>Environment:

	FreeBSD 2.2.5-STABLE (from CVS, with most recent fixes)
	Adaptec 2940UW 
	four seagate SCSI disks
	heavily loaded machine, lots of processes, lots of disk I/O.
	PPro 200, 1 cpu, 256MB of ram

>Description:

	Below is the chain of processes I followed in GDB from the crash 
	dump.  I followed the inode lock owner through a chain of processes.
	The chain ended with a process in biowait() that was in a Run state
	but was NOT properly on the qs[] run queue, therefore never getting 
	any cpu.

0   865 537457500   0 -14  0   624    0 ufslk2 D     ??    0:00.00  (sendmail)      f34f0e00
proc f34f0e00

   print *(struct inode *)f24de000
   lock holder 0x40e

0  1038 537457500 227 -14  0   256    0 ufslk2 Ds    ??    0:00.00  (popper)        f3414c00
proc f3414c00

print *(struct inode *)f2522200
    lock holder 0x18BE

 1268  6334 537457500   0 -14  0   836    0 ufslk2 D+    q9    0:00.00  (irc-2.9_roof)  f2ad0e00
 proc f2ad0e00
 print *(struct inode *)f2526f00
  i_lockholder = 0x43c, 

UID   PID  PPID CPU PRI NI   VSZ  RSS WCHAN  STAT  TT       TIME COMMAND               PROC
2259  1084 537457500  27 -14  0   232    0 uihget D+    p9    0:00.00  (Mail)          f35ee600

#0  mi_switch () at ../../kern/kern_synch.c:635
#1  0xf0114eda in tsleep (ident=0xf256f200, priority=0x8, 
    wmesg=0xf019bfb9 "uihget", timo=0x0) at ../../kern/kern_synch.c:398
#2  0xf019c053 in ufs_ihashget (device=0x413, inum=0x2)
    at ../../ufs/ufs/ufs_ihash.c:104
#3  0xf019a1cf in ffs_vget (mp=0xf2530600, ino=0x2, vpp=0xefbffd40)
    at ../../ufs/ffs/ffs_vfsops.c:868
#4  0xf019ed24 in ufs_root (mp=0xf2530600, vpp=0xefbffd80)
    at ../../ufs/ufs/ufs_vfsops.c:90
#5  0xf0131468 in lookup (ndp=0xefbffef4) at ../../kern/vfs_lookup.c:482
#6  0xf0130e7b in namei (ndp=0xefbffef4) at ../../kern/vfs_lookup.c:156

    i_lockholder is 0x43D (process 1085)

shell5:/var/crash# kps -M vmcore.6 -N kernel.6 -axl -p 1085 -o proc
  UID   PID  PPID CPU PRI NI   VSZ  RSS WCHAN  STAT  TT       TIME COMMAND               PROC
 2044  1085 537457500   5 -14  0   428    0 ufslk2 D     ??    0:00.00  (perl)          f34e9c00

#0  mi_switch () at ../../kern/kern_synch.c:635
#1  0xf0114eda in tsleep (ident=0xf3332800, priority=0x8, 
    wmesg=0xf01a1071 "ufslk2", timo=0x0) at ../../kern/kern_synch.c:398
#2  0xf01a10a1 in ufs_lock (ap=0xefbffc98) at ../../ufs/ufs/ufs_vnops.c:1707
#3  0xf0132707 in vget (vp=0xf2b68180, lockflag=0x1) at vnode_if.h:731
#4  0xf019c652 in ufs_lookup (ap=0xefbffd84) at ../../ufs/ufs/ufs_lookup.c:183
#5  0xf0131339 in lookup (ndp=0xefbffef4) at vnode_if.h:31
#6  0xf0130e7b in namei (ndp=0xefbffef4) at ../../kern/vfs_lookup.c:156
#7  0xf0136a2f in vn_open (ndp=0xefbffef4, fmode=0x1, cmode=0x1a4)
    at ../../kern/vfs_vnops.c:123
#8  0xf0134202 in open (p=0xf34e9c00, uap=0xefbfff94, retval=0xefbfff84)
    at ../../kern/vfs_syscalls.c:701
#9  0xf01bf437 in syscall (frame={tf_es = 0x20050027, tf_ds = 0xefbf0027, 
      tf_edi = 0x4, tf_esi = 0x200f3598, tf_ebp = 0xefbfddec, 
      tf_isp = 0xefbfffe4, tf_ebx = 0x200e3060, tf_edx = 0x10, tf_ecx = 0x0, 
      tf_eax = 0x5, tf_trapno = 0xc, tf_err = 0x7, tf_eip = 0x200cc6f1, 
      tf_cs = 0x1f, tf_eflags = 0x292, tf_esp = 0xefbfddcc, tf_ss = 0x27})

inode i_lockholder = 0x43b, == process 1083

shell5:/var/crash# kps -M vmcore.6 -N kernel.6 -axl -p 1083 -o proc
  UID   PID  PPID CPU PRI NI   VSZ  RSS WCHAN  STAT  TT       TIME COMMAND               PROC
 2044  1083 537457500   1 -18  0   428    0 ffsvgt D     ??    0:00.00  (perl)          f3341e00

(kgdb) back
#0  mi_switch () at ../../kern/kern_synch.c:635
#1  0xf0114eda in tsleep (ident=0xf01fb978, priority=0x4, 
    wmesg=0xf019a1a0 "ffsvgt", timo=0x0) at ../../kern/kern_synch.c:398
#2  0xf019a209 in ffs_vget (mp=0xf2530600, ino=0x5a16b, vpp=0xefbffd14)
    at ../../ufs/ffs/ffs_vfsops.c:879
#3  0xf019d034 in ufs_lookup (ap=0xefbffd84) at ../../ufs/ufs/ufs_lookup.c:561
#4  0xf0131339 in lookup (ndp=0xefbffef4) at vnode_if.h:31
#5  0xf0130e7b in namei (ndp=0xefbffef4) at ../../kern/vfs_lookup.c:156
#6  0xf0136a2f in vn_open (ndp=0xefbffef4, fmode=0x1, cmode=0x1a4)
    at ../../kern/vfs_vnops.c:123
#7  0xf0134202 in open (p=0xf3341e00, uap=0xefbfff94, retval=0xefbfff84)
    at ../../kern/vfs_syscalls.c:701
#8  0xf01bf437 in syscall (frame={tf_es = 0x27, tf_ds = 0x27, tf_edi = 0x4, 
      tf_esi = 0x200f3598, tf_ebp = 0xefbfdde0, tf_isp = 0xefbfffe4, 
      tf_ebx = 0x200e3060, tf_edx = 0x10, tf_ecx = 0x0, tf_eax = 0x5, 
      tf_trapno = 0xc, tf_err = 0x7, tf_eip = 0x200cc6f1, tf_cs = 0x1f, 
      tf_eflags = 0x282, tf_esp = 0xefbfddc0, tf_ss = 0x27})
    at ../../i386/i386/trap.c:914

	(locked waiting for ffs_inode_hash_lock)

   tracking down other processes in ffs_vget(), I found:

(kgdb) current pcb at f87cf000
(kgdb) #0  mi_switch () at ../../kern/kern_synch.c:635
#1  0xf0114eda in tsleep (ident=0xf44ebf08, priority=0x10,
    wmesg=0xf012e81b "biowait", timo=0x0) at ../../kern/kern_synch.c:398
#2  0xf012e857 in biowait (bp=0xf44ebf08) at ../../kern/vfs_bio.c:1534
#3  0xf012cc1f in bread (vp=0xf2520b80, blkno=0xa0090, size=0x2000, cred=0x0,
    bpp=0xefbffd68) at ../../kern/vfs_bio.c:244
#4  0xf019a369 in ffs_vget (mp=0xf2530e00, ino=0x12d7c, vpp=0xefbffde4)
    at ../../ufs/ffs/ffs_vfsops.c:931
#5  0xf0195376 in ffs_valloc (ap=0xefbffdd0) at ../../ufs/ffs/ffs_alloc.c:535 
#6  0xf01a174a in ufs_makeinode (mode=0x81a4, dvp=0xf2a33480, vpp=0xefbfff04,
    cnp=0xefbfff18) at vnode_if.h:927
#7  0xf019ef65 in ufs_create (ap=0xefbffe28) at ../../ufs/ufs/ufs_vnops.c:113
#8  0xf013697c in vn_open (ndp=0xefbffef4, fmode=0x602, cmode=0x1a4)
    at vnode_if.h:59
#9  0xf0134202 in open (p=0xf3401400, uap=0xefbfff94, retval=0xefbfff84)
    at ../../kern/vfs_syscalls.c:701
#10 0xf01bf437 in syscall (frame={tf_es = 0xefbf0027, tf_ds = 0xefbf0027,

   and found process  3194

shell5:/var/crash# kps -M vmcore.6 -N kernel.6 -axl -p 3194 -o proc
  UID   PID  PPID CPU PRI NI   VSZ  RSS WCHAN  STAT  TT       TIME COMMAND               PROC
 1173  3194 537457500 255  -6 20   996    0 -      RN    ??    0:00.00  (eggdrop)       f3401400

    This process is supposedly in the run state, the p_procq chain linkages:

(kgdb) print ((struct proc *)f3401400)->p_procq.tqe_next
$3 = (struct proc *) 0xf34f0000
$4 = (struct proc *) 0xf2ee6a00
$5 = (struct proc *) 0xf33f0800
$6 = (struct proc *) 0xf33a3800
$7 = (struct proc *) 0xf267c000
$8 = (struct proc *) 0xf251ec00
$9 = (struct proc *) 0xf3480000
$10 = (struct proc *) 0xf33a3a00
$11 = (struct proc *) 0xf334f800
$12 = (struct proc *) 0xf3342800
$13 = (struct proc *) 0xf0206a38
$14 = (struct proc *) 0xf3401800
$15 = (struct proc *) 0xf3429600
$16 = (struct proc *) 0xf29ad800
$17 = (struct proc *) 0xf267c400
$18 = (struct proc *) 0xf2aba800
$19 = (struct proc *) 0xf34f0600
$20 = (struct proc *) 0xf3424000
$21 = (struct proc *) 0xf3045000
$22 = (struct proc *) 0xf3401c00
$23 = (struct proc *) 0xf2ac3c00
$24 = (struct proc *) 0xf2aba600
$25 = (struct proc *) 0xf3486000
$26 = (struct proc *) 0xf3401400
$27 = (struct proc *) 0xf34f0000

	the process is in qs 0x1F (31), but qs[31] is not in the
	chain!

(kgdb) print &qs[1f]
$57 = (struct prochd *) 0xf0203428
(kgdb) print qs[1f]
$58 = {
  ph_link = 0xf333b600, 
  ph_rlink = 0xf333b600
}

	Thus, the process in the biowait is not on any run list even
	though it is marked as running, and it is linked 'weirdly'.

	There is, in fact, only one process in qs[31].

   	This is the fourth time I've tracked down a crash to a process that
	is marked running and not in the actual run queue, with a seemingly
	corrupt run chain.

	This is the second time where the process in question was being woken
	up from a biowait(), probably by the SCSI interrupt (Adaptec 2940UW
	PCI).  The first two times might have been that too, but the second
	two times are DEFINITELY a fuckup when a process in biowait is
	woken up and then doesn't get put in the run queue properly.

					-Matt

>How-To-Repeat:

	The bug may be difficult to reproduce.  It appears to be a timing
	window in the SCSI interrupt / wakeup code.  I think there may be 
	something broken in the IPL routines that allow a SCSI interrupt
	to interrupt operations on the run queue (qs[] related stuff).

>Fix:
	
	Don't know.

	The crash dump is available if you guys want it.

>Audit-Trail:
>Unformatted:



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?199802040124.RAA02710>