Date: Mon, 17 Dec 2001 13:06:04 +0100 (CET) From: Martin Blapp <mb@imp.ch> To: <freebsd-hackers@freebsd.org> Subject: deadlock with softupdates ? Message-ID: <20011217125752.P43207-100000@levais.imp.ch>
next in thread | raw e-mail | index | archive | help
We have a newserver running here, which freezes
every 2-3 days. Break into db is still possible.
It is a STABLE 4.4 machine, with new kernel.
NEWS # mount
/dev/da0s1a on / (ufs, local, soft-updates)
/dev/da0s1e on /var (ufs, local, soft-updates)
/dev/da1s1e on /news (ufs, local, soft-updates)
/dev/da2s1e on /news/spool/overview (ufs, local, soft-updates)
procfs on /proc (procfs, local)
/dev/twed0s1e on /news/spool/articles/1 (ufs, local, soft-updates)
/dev/twed1s1e on /news/spool/articles/2 (ufs, local, soft-updates)
The latter two filesystems have 10 big files with the cyclic news
filesystem on them. So we have a filesystem on a filesystem.
I guess this is what makes the problems.
I've got a kernel dump of a hanged session, but it
is not very informative.
bash-2.05a# ps -M /var/crash/vmcore.0 -N
/usr/src/sys/compile/NEWS/kernel.deb-axl | more
UID PID PPID CPU PRI NI VSZ RSS WCHAN STAT TT TIME COMMAND
8 289 1 100 10 0 668 0 wait IW #C5- 0:00.00 (sh)
8 290 1 100 10 0 668 0 wait IW #C5- 0:00.00 (sh)
8 292 289 0 -18 0 680 0 vmwait DL #C5- 0:30.78 (sh)
8 293 290 0 -14 0 3644 0 inode D #C5- 0:49.73 (perl)
0 12263 1 0 10 0 1200 0 wait IWs #C5 0:00.00
(login)
0 12295 12263 0 18 0 1324 0 pause IW #C5 0:00.00 (csh)
0 12297 12295 0 10 0 1060 0 wait IW #C5 0:00.00 (bash)
0 12300 12297 0 -5 0 1532 0 sysctl D+ #C5 6:58.09
(systat)
1042 677 671 0 18 0 580 0 pause IWs #C5 0:00.00 (ksh)
0 689 677 0 18 0 1336 0 pause IW #C5 0:00.00 (csh)
0 696 689 0 18 0 588 0 pause I #C5 0:00.03 (ksh)
8 13284 696 12 10 0 632 0 wait I #C5 0:00.01 (sh)
8 13742 13284 12 10 0 636 0 wait I+ #C5 0:00.01 (sh)
8 13750 13742 0 -18 0 172960 0 vmwait DL+ #C5 0:01.82
(suck)
0 317 1 0 3 0 948 0 ttyin IWs+ #C1 0:00.00
(getty)
0 316 1 0 3 0 948 0 ttyin IWs+ #C1 0:00.00
(getty)
0 315 1 0 3 0 948 0 ttyin IWs+ #C1 0:00.00
(getty)
0 314 1 0 3 0 948 0 ttyin IWs+ #C1 0:00.00
(getty)
0 313 1 0 3 0 948 0 ttyin IWs+ #C1 0:00.00
(getty)
0 312 1 0 3 0 948 0 ttyin IWs+ #C1 0:00.00
(getty)
0 319 1 0 3 0 948 0 ttyin IWs+ #C1 0:00.00
(getty)
0 318 1 0 3 0 948 0 ttyin IWs+ #C1 0:00.00
(getty)
0 12307 12306 0 10 0 1064 0 wait IWs #C9 0:00.00 (bash)
0 12312 12307 0 10 0 1064 0 wait IW #C9 0:00.00 (bash)
0 12313 12312 0 -18 0 2012 0 vmwait D+ #C9 8:22.42 (top)
0 59816 59812 0 3 0 1064 0 ttyin Is+ #C6 0:00.04 (bash)
1042 35037 35035 2 18 0 580 0 pause IWs #C6 0:00.00 (ksh)
0 35050 35037 0 18 0 1336 0 pause IW #C6 0:00.00 (csh)
0 35059 35050 0 3 0 600 0 ttyin I+ #C6 0:00.17 (ksh)
0 0 0 0 -18 0 0 0 sched DLs ?? 0:00.00
(swapper)
0 1 0 0 10 0 544 0 wait ILs ?? 0:00.01 (init)
0 2 0 0 -14 0 0 0 inode DL ?? 0:36.22
(pagedaemon
0 3 0 38 18 0 0 0 psleep DL ?? 0:00.61
(vmdaemon)
0 4 0 0 -18 0 0 0 psleep DL ?? 0:43.03
(bufdaemon)
0 5 0 0 18 0 0 0 syncer DL ?? 10:42.21
(syncer)
0 137 1 0 -18 0 2404 0 vmwait DLs ?? 0:54.90
(gated)
0 149 1 0 2 0 960 0 select Ss ?? 3:42.56
(syslogd)
0 158 1 0 2 -12 1272 0 select S<s ?? 0:11.33
(ntpd)
0 177 1 0 2 0 1056 0 select Is ?? 0:00.00
(inetd)
0 179 1 0 -18 0 984 0 vmwait DLs ?? 0:00.77 (cron)
0 182 1 9 2 0 2216 0 select Is ?? 0:11.13 (sshd)
0 184 1 0 2 0 908 0 select Ss ?? 0:00.36 (usbd)
0 221 1 0 2 0 2084 0 select Ss ?? 0:07.75
(httpd)
0 229 1 0 2 0 932 0 select Ss ?? 3:55.50
(healthd)
65534 238 221 0 2 0 2148 0 accept I ?? 0:00.13
(httpd)
65534 239 221 0 2 0 2148 0 accept I ?? 0:00.12
(httpd)
65534 240 221 0 2 0 2148 0 accept I ?? 0:00.12
(httpd)
65534 241 221 0 2 0 2148 0 accept I ?? 0:00.12
(httpd)
65534 242 221 0 2 0 2148 0 accept I ?? 0:00.12
(httpd)
8 288 1 0 -14 0 133420 0 inode DLs ?? 28:25.45
(innd)
0 307 1 34 2 0 1580 0 select IWs ?? 0:00.00
(radiusd)
0 671 182 0 2 0 2320 0 select I ?? 0:00.85 (sshd)
8 10135 288 0 36 4 0 0 - ZN ?? 0:00.00
(nnrpd)
65534 10363 221 0 2 0 2152 0 accept I ?? 0:00.12
(httpd)
65534 10364 221 0 2 0 2152 0 accept I ?? 0:00.12
(httpd)
65534 10366 221 0 2 0 2152 0 accept I ?? 0:00.12
(httpd)
65534 10367 221 0 2 0 2152 0 accept I ?? 0:00.12
(httpd)
65534 10368 221 0 2 0 2152 0 accept I ?? 0:00.12
(httpd)
8 11811 288 0 2 4 48124 0 select IN ?? 0:01.14
(nnrpd)
0 12306 182 0 -18 0 2320 0 vmwait DL ?? 0:29.42 (sshd)
8 13265 288 0 36 4 0 0 - ZN ?? 0:00.00
(nnrpd)
8 13333 288 0 -18 4 46448 0 vmwait DNL ?? 0:00.03
(nnrpd)
8 13338 288 0 -18 4 46448 0 vmwait DNL ?? 0:00.04
(nnrpd)
8 11811 288 0 2 4 48124 0 select IN ?? 0:01.14 (nnrpd)
0 12306 182 0 -18 0 2320 0 vmwait DL ?? 0:29.42 (sshd)
8 13265 288 0 36 4 0 0 - ZN ?? 0:00.00
(nnrpd)
8 13333 288 0 -18 4 46448 0 vmwait DNL ?? 0:00.03
(nnrpd)
8 13338 288 0 -18 4 46448 0 vmwait DNL ?? 0:00.04
(nnrpd)
8 13390 288 0 -18 4 46468 0 vmwait DNL ?? 0:00.02
(nnrpd)
8 13447 288 0 36 4 0 0 - ZN ?? 0:00.00
(nnrpd)
8 13764 288 0 36 4 0 0 - ZN ?? 0:00.00
(nnrpd)
8 13767 288 0 -18 4 2052 0 vmwait DNL ?? 0:00.00
(nnrpd)
0 13768 179 0 10 0 984 0 ppwait D ?? 0:00.00 (cron)
0 13769 13768 0 -18 0 984 0 vmwait DVLs ?? 0:00.00 (cron)
8 13770 288 0 -18 4 2052 0 vmwait DNL ?? 0:00.00
(nnrpd)
8 13771 288 0 -14 4 2072 0 inode DN ?? 0:00.01
(nnrpd)
8 13772 288 0 -18 4 2128 0 vmwait DNL ?? 0:00.01
(nnrpd)
8 13773 288 0 -18 4 2052 0 vmwait DNL ?? 0:00.00
(nnrpd)
8 13774 288 0 -18 4 2108 0 vmwait DNL ?? 0:00.01
(nnrpd)
8 13775 288 0 -14 4 2128 0 inode DN ?? 0:00.01
(nnrpd)
8 13776 288 0 -18 4 2080 0 vmwait DNL ?? 0:00.00
(nnrpd)
8 13777 288 0 -18 4 584 0 vmwait DNL ?? 0:00.00
(nnrpd)
8 13778 288 0 -18 4 584 0 vmwait DNL ?? 0:00.00
(nnrpd)
8 13779 288 0 -14 4 2120 0 inode DN ?? 0:00.01
(nnrpd)
8 13780 288 0 -18 4 1916 0 vmwait DNL ?? 0:00.00
(nnrpd)
8 13781 288 0 -14 4 45968 0 inode DN ?? 0:00.01
(nnrpd)
0 13782 179 0 28 0 680 0 - R ?? 0:00.00 (cron)
8 13783 292 0 -18 0 680 0 vmwait DL ?? 0:00.00 (sh)
0 35035 182 0 2 0 2320 0 select I ?? 0:01.03 (sshd)
0 59812 182 0 -18 0 2320 0 vmwait DL ?? 0:00.10 (sshd)
8 70957 288 0 2 4 1632 0 sbwait IN ?? 0:06.20
(ninpaths)
0 75942 1 0 2 0 1564 0 select Is ?? 0:00.02
(nsrexecd)
0 75943 75942 0 2 0 1556 0 select S ?? 0:00.15
(nsrexecd)
8 76594 288 0 2 4 45876 0 select IN ?? 4:15.73
(innfeed)
8 76595 288 0 2 4 3144 0 sbwait IN ?? 2:16.47
(overchan)
Anyway, I noted that many processes are stuck at the same place in
ffs_read:
(kgdb) defproc 13769
13769 d78cd0c0 d78bb000 0 13768 13769 800014 3 cron vmwait
c031f718
frame 0 at 0xd78bdc08: ebp d78bdc34, eip 0xc017a529 <tsleep+417>:
mov 0x141(%ebx),%al
frame 1 at 0xd78bdc34: ebp d78bdc50, eip 0xc0238897 <vm_await+35>:
add $0x10,%esp
frame 2 at 0xd78bdc50: ebp d78bdd00, eip 0xc0230574 <vm_fault+192>:
add $0x8,%esp
frame 3 at 0xd78bdd00: ebp d78bdd28, eip 0xc0284b9e <trap_pfault+118>:
mov %eax,%ecx
frame 4 at 0xd78bdd28: ebp d78bdd6c, eip 0xc02847af <trap+767>:
jmp 0xc0284afe <trap+1614>
frame 5 at 0xd78bdd6c: ebp d78bdde4, eip 0xc0276730 <alltraps>:
push %ebx
frame 6 at 0xd78bdde4: ebp d78bde48, eip 0xc0226dbe <ffs_read+842>:
mov %eax,0xffffffd4(%ebp)
frame 7 at 0xd78bde48: ebp d78bde7c, eip 0xc01aba18 <vn_read+300>:
mov %eax,0xffffffe8(%ebp)
frame 8 at 0xd78bde7c: ebp d78bdef8, eip 0xc0185cc0 <dofileread+196>:
mov %eax,%esi
frame 9 at 0xd78bdef8: ebp d78bdf2c, eip 0xc0185b86 <read+74>: mov
%eax,%esi
frame 10 at 0xd78bdf2c: ebp d78bdfa0, eip 0xc028520d <syscall2+465>:
mov %eax,0xffffffb8(%ebp)
(kgdb) defproc 13750
13750 d78cd0c0 d7b9b000 8 13742 13742 804006 3 suck vmwait
c031f718
frame 0 at 0xd7b9dc08: ebp d7b9dc34, eip 0xc017a529 <tsleep+417>:
mov 0x141(%ebx),%al
frame 1 at 0xd7b9dc34: ebp d7b9dc50, eip 0xc0238897 <vm_await+35>:
add $0x10,%esp
frame 2 at 0xd7b9dc50: ebp d7b9dd00, eip 0xc0230574 <vm_fault+192>:
add $0x8,%esp
frame 3 at 0xd7b9dd00: ebp d7b9dd28, eip 0xc0284b9e <trap_pfault+118>:
mov %eax,%ecx
frame 4 at 0xd7b9dd28: ebp d7b9dd6c, eip 0xc02847af <trap+767>:
jmp 0xc0284afe <trap+1614>
frame 5 at 0xd7b9dd6c: ebp d7b9dde4, eip 0xc0276730 <alltraps>:
push %ebx
frame 6 at 0xd7b9dde4: ebp d7b9de48, eip 0xc0226dbe <ffs_read+842>:
mov %eax,0xffffffd4(%ebp)
frame 7 at 0xd7b9de48: ebp d7b9de7c, eip 0xc01aba18 <vn_read+300>:
mov %eax,0xffffffe8(%ebp)
frame 8 at 0xd7b9de7c: ebp d7b9def8, eip 0xc0185cc0 <dofileread+196>:
mov %eax,%esi
frame 9 at 0xd7b9def8: ebp d7b9df2c, eip 0xc0185b86 <read+74>: mov
%eax,%esi
frame 10 at 0xd7b9df2c: ebp d7b9dfa0, eip 0xc028520d <syscall2+465>:
mov %eax,0xffffffb8(%ebp)
Do you have a clue how I can debug this problem ?
Martin
Martin Blapp, mb@imp.ch
------------------------------------------------------------------
Improware AG, UNIX solution and service provider
Zurlindenstrasse 29, 4133 Pratteln, Switzerland
Phone: +41 061 826 93 00: +41 61 826 93 01
PGP Fingerprint: 57E 7CCD 2769 E7AC C5FA DF2C 19C6 DCD1 1B3A EC9C
------------------------------------------------------------------
To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-hackers" in the body of the message
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20011217125752.P43207-100000>
