Date: Sun, 19 Sep 2010 12:57:10 +0300 From: Mikolaj Golub <to.my.trociny@gmail.com> To: freebsd-fs@freebsd.org Subject: hastd: parent got stuck in waitpid() Message-ID: <868w2yaweh.fsf@kopusha.home.net>
next in thread | raw e-mail | index | archive | help
Hi, When trying to reproduce the scenario described in another thread (hastd: possible race when a worker is starting) I ran into another issue. I was running the following script: #!/bin/sh for i in `jot 1000`; do hastctl status storage > /dev/null done & for i in `jot 1000`; do hastctl role init storage hastctl role primary storage done The parent hastd got stuck, but this time while changing the role to init and terminating the worker: in waitpid(), after sending kill() to the worker. It looks like the signal was lost. I don't have a clue how this might happen, but it is rather easily reproducible in my environment with the script above. After the hang: [root@lolek /usr/src/sbin/hastctl]# ps auxww |grep hast root 3334 0.0 0.5 11244 2372 ?? Is 12:13PM 0:00.10 /sbin/hastd -ddd root 3473 0.0 7.0 44908 35664 ?? I 12:13PM 0:00.43 hastd: storage (primary) (hastd) root 3474 0.0 0.3 10924 1764 1 I+ 12:13PM 0:00.01 hastctl role init storage root 3475 0.0 0.3 10924 1764 1 I+ 12:13PM 0:00.01 hastctl status storage [root@lolek /usr/src/sbin/hastctl]# gdb /usr/obj/usr/src/sbin/hastd/hastd 3334 [Switching to Thread 28404140 (LWP 100070)] 0x282b9689 in wait4 () from /lib/libc.so.7 (gdb) bt #0 0x282b9689 in wait4 () from /lib/libc.so.7 #1 0x282902a3 in waitpid () from /lib/libc.so.7 #2 0x280de272 in waitpid () from /lib/libthr.so.3 #3 0x0804c664 in control_set_role_common (cfg=0x28419600, nvout=0x2850e0d0, role=1 '\001', res=0x284eb500, name=0x284a4442 "storage", no=0) at /usr/src/sbin/hastd/control.c:103 #4 0x0804cf91 in control_handle (cfg=0x28419600) at /usr/src/sbin/hastd/control.c:344 #5 0x08050357 in main_loop () at /usr/src/sbin/hastd/hastd.c:682 #6 0x0805076d in main (argc=0, argv=0xbfbfecd0) at /usr/src/sbin/hastd/hastd.c:792 (gdb) fr 3 #3 0x0804c664 in control_set_role_common (cfg=0x28419600, nvout=0x2850e0d0, role=1 '\001', res=0x284eb500, name=0x284a4442 "storage", no=0) at /usr/src/sbin/hastd/control.c:103 103 } else if (waitpid(res->hr_workerpid, NULL, 0) != (gdb) list 98 
if (res->hr_workerpid != 0) { 99 if (kill(res->hr_workerpid, SIGTERM) < 0) { 100 pjdlog_errno(LOG_WARNING, 101 "Unable to kill worker process %u", 102 (unsigned int)res->hr_workerpid); 103 } else if (waitpid(res->hr_workerpid, NULL, 0) != 104 res->hr_workerpid) { 105 pjdlog_errno(LOG_WARNING, 106 "Error while waiting for worker process %u", 107 (unsigned int)res->hr_workerpid); [root@lolek /usr/src/sbin/hastctl]# gdb /usr/obj/usr/src/sbin/hastd/hastd 3473 Thread 8 (Thread 28404140 (LWP 100079)): #0 0x282a14bb in sigtimedwait () from /lib/libc.so.7 #1 0x280dff3b in sigtimedwait () from /lib/libthr.so.3 #2 0x0805e5ec in guard_thread (arg=0x284eb500) at /usr/src/sbin/hastd/primary.c:1986 #3 0x0805aa47 in hastd_primary (res=0x284eb500) at /usr/src/sbin/hastd/primary.c:828 #4 0x0804c6d5 in control_set_role_common (cfg=0x28419600, nvout=0x2850e0d0, role=2 '\002', res=0x284eb500, name=0x284a4442 "storage", no=0) at /usr/src/sbin/hastd/control.c:117 #5 0x0804cf91 in control_handle (cfg=0x28419600) at /usr/src/sbin/hastd/control.c:344 #6 0x08050357 in main_loop () at /usr/src/sbin/hastd/hastd.c:682 #7 0x0805076d in main (argc=0, argv=0xbfbfecd0) at /usr/src/sbin/hastd/hastd.c:792 Thread 7 (Thread 28404280 (LWP 100089)): #0 0x2834b233 in ioctl () from /lib/libc.so.7 #1 0x0805b0f4 in ggate_recv_thread (arg=0x284eb500) at /usr/src/sbin/hastd/primary.c:943 #2 0x280dc35f in pthread_getprio () from /lib/libthr.so.3 #3 0x00000000 in ?? () Thread 6 (Thread 284043c0 (LWP 100090)): #0 0x280e6ea7 in __error () from /lib/libthr.so.3 #1 0x280e6a88 in __error () from /lib/libthr.so.3 #2 0x284a14e0 in ?? () #3 0x00000008 in ?? () #4 0x00000001 in ?? () #5 0x284a14c0 in ?? () #6 0x00000000 in ?? () #7 0x00000000 in ?? () #8 0x00000000 in ?? 
() #9 0x280e53ef in pthread_setcancelstate () from /lib/libthr.so.3 #10 0x280e4c8e in pthread_cond_signal () from /lib/libthr.so.3 #11 0x0805b6a8 in cv_wait (cv=0x284ea110, lock=0x284ea108) at synch.h:149 #12 0x0805b767 in local_send_thread (arg=0x284eb500) at /usr/src/sbin/hastd/primary.c:1081 #13 0x280dc35f in pthread_getprio () from /lib/libthr.so.3 #14 0x00000000 in ?? () Thread 5 (Thread 28404500 (LWP 100091)): #0 0x280e6ea7 in __error () from /lib/libthr.so.3 #1 0x280e6a88 in __error () from /lib/libthr.so.3 #2 0x284a15e0 in ?? () #3 0x00000008 in ?? () #4 0x00000001 in ?? () #5 0x284a15c0 in ?? () #6 0x00000000 in ?? () #7 0x00000000 in ?? () #8 0x00000000 in ?? () #9 0x280e53ef in pthread_setcancelstate () from /lib/libthr.so.3 #10 0x280e4c8e in pthread_cond_signal () from /lib/libthr.so.3 #11 0x0805b6a8 in cv_wait (cv=0x284ea114, lock=0x284ea10c) at synch.h:149 #12 0x0805bd3f in remote_send_thread (arg=0x284eb500) at /usr/src/sbin/hastd/primary.c:1166 #13 0x280dc35f in pthread_getprio () from /lib/libthr.so.3 #14 0x00000000 in ?? () Thread 4 (Thread 28404640 (LWP 100093)): #0 0x280e6ea7 in __error () from /lib/libthr.so.3 #1 0x280e6a88 in __error () from /lib/libthr.so.3 #2 0x284a1660 in ?? () #3 0x00000008 in ?? () #4 0x00000001 in ?? () #5 0x284a1640 in ?? () #6 0x00000000 in ?? () #7 0x00000000 in ?? () #8 0x00000000 in ?? () #9 0x280e53ef in pthread_setcancelstate () from /lib/libthr.so.3 #10 0x280e4c8e in pthread_cond_signal () from /lib/libthr.so.3 #11 0x0805b6a8 in cv_wait (cv=0x284ea124, lock=0x284ea11c) at synch.h:149 #12 0x0805c58e in remote_recv_thread (arg=0x284eb500) at /usr/src/sbin/hastd/primary.c:1312 #13 0x280dc35f in pthread_getprio () from /lib/libthr.so.3 #14 0x00000000 in ?? () Thread 3 (Thread 28404780 (LWP 100094)): #0 0x280e6ea7 in __error () from /lib/libthr.so.3 #1 0x280e6a88 in __error () from /lib/libthr.so.3 #2 0x284a16e0 in ?? () #3 0x00000008 in ?? () #4 0x00000001 in ?? () #5 0x284a16c0 in ?? () #6 0x00000000 in ?? 
() #7 0x00000000 in ?? () #8 0x00000000 in ?? () #9 0x280e53ef in pthread_setcancelstate () from /lib/libthr.so.3 #10 0x280e4c8e in pthread_cond_signal () from /lib/libthr.so.3 #11 0x0805b6a8 in cv_wait (cv=0x806ba54, lock=0x806ba50) at synch.h:149 #12 0x0805cc3b in ggate_send_thread (arg=0x284eb500) at /usr/src/sbin/hastd/primary.c:1432 #13 0x280dc35f in pthread_getprio () from /lib/libthr.so.3 #14 0x00000000 in ?? () Thread 2 (Thread 284048c0 (LWP 100095)): #0 0x280e6ea7 in __error () from /lib/libthr.so.3 #1 0x280e6a88 in __error () from /lib/libthr.so.3 #2 0x28bf48e0 in ?? () #3 0x00000008 in ?? () #4 0x00000001 in ?? () #5 0x284a16c0 in ?? () #6 0x00000000 in ?? () #7 0x00000000 in ?? () #8 0x00000000 in ?? () #9 0x280e53ef in pthread_setcancelstate () from /lib/libthr.so.3 #10 0x280e4c8e in pthread_cond_signal () from /lib/libthr.so.3 #11 0x0805b6a8 in cv_wait (cv=0x806ba54, lock=0x806ba50) at synch.h:149 #12 0x0805cc3b in ggate_send_thread (arg=0x284eb500) at /usr/src/sbin/hastd/primary.c:1432 #13 0x280dc35f in pthread_getprio () from /lib/libthr.so.3 #14 0x00000000 in ?? () Thread 2 (Thread 284048c0 (LWP 100095)): #0 0x280e6ea7 in __error () from /lib/libthr.so.3 #1 0x280e6a88 in __error () from /lib/libthr.so.3 #2 0x28bf48e0 in ?? () ---Type <return> to continue, or q <return> to quit--- #3 0x00000008 in ?? () #4 0x00000001 in ?? () #5 0x28bf48c0 in ?? () #6 0x00000000 in ?? () #7 0xbf4f9e84 in ?? () #8 0x00000000 in ?? () #9 0x280e53ef in pthread_setcancelstate () from /lib/libthr.so.3 #10 0x280e4c8e in pthread_cond_signal () from /lib/libthr.so.3 #11 0x0805b6a8 in cv_wait (cv=0x806ba60, lock=0x806ba5c) at synch.h:149 #12 0x0805d053 in sync_thread (arg=0x284eb500) at /usr/src/sbin/hastd/primary.c:1528 #13 0x280dc35f in pthread_getprio () from /lib/libthr.so.3 #14 0x00000000 in ?? 
() Thread 1 (Thread 28404a00 (LWP 100096)): #0 0x28300ed5 in recvfrom () from /lib/libc.so.7 #1 0x28286f52 in recv () from /lib/libc.so.7 #2 0x0805f237 in proto_common_recv (fd=33, data=0xbf3f8f47 "", size=5) at /usr/src/sbin/hastd/proto_common.c:77 #3 0x0805f68d in sp_recv (ctx=0x2850e3f0, data=0xbf3f8f47 "", size=5) at /usr/src/sbin/hastd/proto_socketpair.c:185 #4 0x0805ec61 in proto_recv (conn=0x2850e3e0, data=0xbf3f8f47, size=5) at /usr/src/sbin/hastd/proto.c:207 #5 0x0804e42e in hast_proto_recv_hdr (conn=0x2850e3e0, nvp=0xbf3f8f80) at /usr/src/sbin/hastd/hast_proto.c:308 #6 0x0804d0b7 in ctrl_thread (arg=0x284eb500) at /usr/src/sbin/hastd/control.c:385 #7 0x280dc35f in pthread_getprio () from /lib/libthr.so.3 #8 0x00000000 in ?? () -- Mikolaj Golub
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?868w2yaweh.fsf>