Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 12 Apr 2018 15:30:22 +0500
From:      "Eugene M. Zheganin" <eugene@zhegan.in>
To:        freebsd-stable@freebsd.org, freebsd-fs@freebsd.org
Subject:   HAST, cyclic signal 6, and inability to start
Message-ID:  <dbc8f749-1d2b-f8e7-317b-f16bcc497062@zhegan.in>

next in thread | raw e-mail | index | archive | help
Hi.


About a month ago I was experimenting with HAST on my servers, and,
though I did have some complications with signal 6 during the init phase, I
was able to start it, and it was working in test mode for a couple of weeks.
After that I had to reboot both of them, and now it doesn't start at all -
hastd is crashing on signal 6 on both nodes, and I'm unable to launch it as
primary on either one. As soon as I switch from init or secondary to
primary on either node, bad things start to happen: cyclic
signal 6 for hastd and hangs for hastctl.


Both nodes are running FreeBSD 11.1-RELEASE-pX (p1 and p6).

Here's an excerpt from the PR I just created,
https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=227461 :

===Cut===

Node A (11.1-RELEASE-p1):
========

[root@gw0:/var/log]# service hastd start
Starting hastd.
[root@gw0:/var/log]# hastctl status
Name    Status   Role           Components
hasta   -        init           /dev/gpt/hasta  tcp4://192.168.0.247
hastb   -        init           /dev/gpt/hastb  tcp4://192.168.0.247
[root@gw0:/var/log]# hastctl role secondary hasta
[root@gw0:/var/log]# hastctl role secondary hastb
[root@gw0:/var/log]# hastctl status
Name    Status   Role           Components
hasta   -        secondary      /dev/gpt/hasta  tcp4://192.168.0.247
hastb   -        secondary      /dev/gpt/hastb  tcp4://192.168.0.247

Node B (11.1-RELEASE-p6):
========
[root@gw1:/var/log]# service hastd start
Starting hastd.
[root@gw1:/var/log]# hastctl status
Name    Status   Role           Components
hasta   -        init           /dev/gpt/hasta  tcp4://192.168.0.248
hastb   -        init           /dev/gpt/hastb  tcp4://192.168.0.248
[root@gw1:/var/log]# hastctl role promary hasta
usage: hastctl create [-d] [-c config] [-e extentsize] [-k keepdirty]
                 [-m mediasize] name ...
        hastctl role [-d] [-c config] <init | primary | secondary> all | name ...
        hastctl list [-d] [-c config] [all | name ...]
        hastctl status [-d] [-c config] [all | name ...]
        hastctl dump [-d] [-c config] [all | name ...]
[root@gw1:/var/log]# hastctl role primary hasta
[root@gw1:/var/log]# hastctl role primary hastb
[root@gw1:/var/log]# hastctl status
(hangs)

Node B dmesg:
pid 26813 (hastd), uid 0: exited on signal 6 (core dumped)
pid 26814 (hastd), uid 0: exited on signal 6 (core dumped)
pid 26815 (hastd), uid 0: exited on signal 6 (core dumped)
pid 26816 (hastd), uid 0: exited on signal 6 (core dumped)
pid 26817 (hastd), uid 0: exited on signal 6 (core dumped)
pid 26822 (hastd), uid 0: exited on signal 6 (core dumped)
pid 26825 (hastd), uid 0: exited on signal 6 (core dumped)
pid 26828 (hastd), uid 0: exited on signal 6 (core dumped)
pid 26829 (hastd), uid 0: exited on signal 6 (core dumped)
pid 26830 (hastd), uid 0: exited on signal 6 (core dumped)
pid 26831 (hastd), uid 0: exited on signal 6 (core dumped)
pid 26833 (hastd), uid 0: exited on signal 6 (core dumped)
pid 26836 (hastd), uid 0: exited on signal 6 (core dumped)
pid 26837 (hastd), uid 0: exited on signal 6 (core dumped)

Node B messages:
Apr 12 15:02:49 gw1 kernel: pid 26891 (hastd), uid 0: exited on signal 6 (core dumped)
Apr 12 15:02:50 gw1 hastd[26679]: [hastb] (primary) Worker process killed (pid=26891, signal=6).
Apr 12 15:02:50 gw1 hastd[26893]: [hasta] (primary) Descriptor 7 is open (pipe or FIFO), but should be closed.
Apr 12 15:02:50 gw1 hastd[26893]: [hasta] (primary) Aborted at function descriptors_assert, file /usr/src/sbin/hastd/hastd.c, line 303.
Apr 12 15:02:50 gw1 kernel: pid 26893 (hastd), uid 0: exited on signal 6 (core dumped)
Apr 12 15:02:51 gw1 hastd[26679]: [hasta] (primary) Worker process killed (pid=26893, signal=6).
Apr 12 15:02:51 gw1 hastd[26896]: [hastb] (primary) Descriptor 7 is open (pipe or FIFO), but should be closed.
Apr 12 15:02:51 gw1 hastd[26896]: [hastb] (primary) Aborted at function descriptors_assert, file /usr/src/sbin/hastd/hastd.c, line 303.
Apr 12 15:02:52 gw1 kernel: pid 26896 (hastd), uid 0: exited on signal 6 (core dumped)
Apr 12 15:02:52 gw1 hastd[26679]: [hastb] (primary) Worker process killed (pid=26896, signal=6).
Apr 12 15:02:52 gw1 hastd[26900]: [hasta] (primary) Descriptor 7 is open (pipe or FIFO), but should be closed.
Apr 12 15:02:52 gw1 hastd[26900]: [hasta] (primary) Aborted at function descriptors_assert, file /usr/src/sbin/hastd/hastd.c, line 303.
Apr 12 15:02:53 gw1 kernel: pid 26900 (hastd), uid 0: exited on signal 6 (core dumped)
Apr 12 15:02:54 gw1 hastd[26679]: [hasta] (primary) Worker process killed (pid=26900, signal=6).
Apr 12 15:02:54 gw1 hastd[26904]: [hastb] (primary) Descriptor 7 is open (pipe or FIFO), but should be closed.
Apr 12 15:02:54 gw1 hastd[26904]: [hastb] (primary) Aborted at function descriptors_assert, file /usr/src/sbin/hastd/hastd.c, line 303.
Apr 12 15:02:54 gw1 kernel: pid 26904 (hastd), uid 0: exited on signal 6 (core dumped)

Now when I'm trying to switch A to primary:

[root@gw0:/var/log]# hastctl role primary hastb
[root@gw0:/var/log]# hastctl role primary hasta
[root@gw0:/var/log]# hastctl status
(hangs)

Node A dmesg:

pid 72301 (hastd), uid 0: exited on signal 6 (core dumped)
pid 72328 (hastd), uid 0: exited on signal 6 (core dumped)
pid 72355 (hastd), uid 0: exited on signal 6 (core dumped)
pid 72389 (hastd), uid 0: exited on signal 6 (core dumped)
pid 72412 (hastd), uid 0: exited on signal 6 (core dumped)
pid 72436 (hastd), uid 0: exited on signal 6 (core dumped)
pid 72467 (hastd), uid 0: exited on signal 6 (core dumped)
pid 72496 (hastd), uid 0: exited on signal 6 (core dumped)
pid 72514 (hastd), uid 0: exited on signal 6 (core dumped)
pid 72530 (hastd), uid 0: exited on signal 6 (core dumped)
pid 72554 (hastd), uid 0: exited on signal 6 (core dumped)
pid 72584 (hastd), uid 0: exited on signal 6 (core dumped)
pid 72620 (hastd), uid 0: exited on signal 6 (core dumped)
pid 72656 (hastd), uid 0: exited on signal 6 (core dumped)
pid 72708 (hastd), uid 0: exited on signal 6 (core dumped)
pid 72759 (hastd), uid 0: exited on signal 6 (core dumped)
pid 72799 (hastd), uid 0: exited on signal 6 (core dumped)

Apr 12 15:04:40 gw0 kernel: pid 72530 (hastd), uid 0: exited on signal 6 (core dumped)
Apr 12 15:04:41 gw0 hastd[63097]: [hasta] (primary) Worker process killed (pid=72530, signal=6).
Apr 12 15:04:41 gw0 hastd[72554]: [hastb] (primary) Descriptor 8 is open (pipe or FIFO), but should be closed.
Apr 12 15:04:41 gw0 hastd[72554]: [hastb] (primary) Aborted at function descriptors_assert, file /usr/src/sbin/hastd/hastd.c, line 303.
Apr 12 15:04:41 gw0 kernel: pid 72554 (hastd), uid 0: exited on signal 6 (core dumped)
Apr 12 15:04:42 gw0 hastd[63097]: [hastb] (primary) Worker process killed (pid=72554, signal=6).
Apr 12 15:04:42 gw0 hastd[72584]: [hasta] (primary) Descriptor 8 is open (pipe or FIFO), but should be closed.
Apr 12 15:04:42 gw0 hastd[72584]: [hasta] (primary) Aborted at function descriptors_assert, file /usr/src/sbin/hastd/hastd.c, line 303.
Apr 12 15:04:42 gw0 kernel: pid 72584 (hastd), uid 0: exited on signal 6 (core dumped)
Apr 12 15:04:43 gw0 hastd[63097]: [hasta] (primary) Worker process killed (pid=72584, signal=6).
Apr 12 15:04:43 gw0 hastd[72620]: [hastb] (primary) Descriptor 8 is open (pipe or FIFO), but should be closed.
Apr 12 15:04:43 gw0 hastd[72620]: [hastb] (primary) Aborted at function descriptors_assert, file /usr/src/sbin/hastd/hastd.c, line 303.
Apr 12 15:04:43 gw0 kernel: pid 72620 (hastd), uid 0: exited on signal 6 (core dumped)
Apr 12 15:04:44 gw0 hastd[63097]: [hastb] (primary) Worker process killed (pid=72620, signal=6).
Apr 12 15:04:44 gw0 hastd[72656]: [hasta] (primary) Descriptor 8 is open (pipe or FIFO), but should be closed.
Apr 12 15:04:44 gw0 hastd[72656]: [hasta] (primary) Aborted at function descriptors_assert, file /usr/src/sbin/hastd/hastd.c, line 303.
Apr 12 15:04:44 gw0 kernel: pid 72656 (hastd), uid 0: exited on signal 6 (core dumped)
Apr 12 15:04:45 gw0 hastd[63097]: [hasta] (primary) Worker process killed (pid=72656, signal=6).
Apr 12 15:04:45 gw0 hastd[72708]: [hastb] (primary) Descriptor 8 is open (pipe or FIFO), but should be closed.
Apr 12 15:04:45 gw0 hastd[72708]: [hastb] (primary) Aborted at function descriptors_assert, file /usr/src/sbin/hastd/has
td.c, line 303.
Apr 12 15:04:45 gw0 kernel: pid 72708 (hastd), uid 0: exited on signal 6 (core dumped)
Apr 12 15:04:46 gw0 hastd[63097]: [hastb] (primary) Worker process killed (pid=72708, signal=6).
Apr 12 15:04:46 gw0 hastd[72759]: [hasta] (primary) Descriptor 8 is open (pipe or FIFO), but should be closed.
Apr 12 15:04:46 gw0 hastd[72759]: [hasta] (primary) Aborted at function descriptors_assert, file /usr/src/sbin/hastd/has
td.c, line 303.
Apr 12 15:04:46 gw0 kernel: pid 72759 (hastd), uid 0: exited on signal 6 (core dumped)
Apr 12 15:04:47 gw0 hastd[63097]: [hasta] (primary) Worker process killed (pid=72759, signal=6).
Apr 12 15:04:47 gw0 hastd[72799]: [hastb] (primary) Descriptor 8 is open (pipe or FIFO), but should be closed.
Apr 12 15:04:47 gw0 hastd[72799]: [hastb] (primary) Aborted at function descriptors_assert, file /usr/src/sbin/hastd/has
td.c, line 303.
Apr 12 15:04:47 gw0 kernel: pid 72799 (hastd), uid 0: exited on signal 6 (core dumped)

Node A config:
==============
resource hasta {
     local /dev/gpt/hasta
     on gw0 {
         remote tcp4://192.168.0.247
         source tcp4://192.168.0.248
     }
     on gw1 {
         remote tcp4://192.168.0.248
         source tcp4://192.168.0.247
     }
}

resource hastb {
     local /dev/gpt/hastb
     on gw0 {
         remote tcp4://192.168.0.247
         source tcp4://192.168.0.248
     }
     on gw1 {
         remote tcp4://192.168.0.248
         source tcp4://192.168.0.247
     }
}


Node B config:
==============

resource hasta {
     local /dev/gpt/hasta
     on gw0 {
         remote tcp4://192.168.0.247
         source tcp4://192.168.0.248
     }
     on gw1 {
         remote tcp4://192.168.0.248
         source tcp4://192.168.0.247
     }
}

resource hastb {
     local /dev/gpt/hastb
     on gw0 {
         remote tcp4://192.168.0.247
         source tcp4://192.168.0.248
     }
     on gw1 {
         remote tcp4://192.168.0.248
         source tcp4://192.168.0.247
     }
}

Backtrace:

(gdb) bt
#0  0x000000080155a84a in thr_kill () from /lib/libc.so.7
#1  0x000000080155a814 in __raise (s=6) at /usr/src/lib/libc/gen/raise.c:52
#2  0x000000080155a789 in abort () at /usr/src/lib/libc/stdlib/abort.c:65
#3  0x0000000000414579 in pjdlog_abort (func=0x420e3f "descriptors_assert",
     file=0x420aeb "/usr/src/sbin/hastd/hastd.c", line=303, failedexpr=0x0, fmt=<value optimized out>)
     at /usr/src/sbin/hastd/pjdlog.c:613
#4  0x0000000000408267 in descriptors_assert (res=0x80204b400, pjdlogmode=<value optimized out>)
     at /usr/src/sbin/hastd/hastd.c:303
#5  0x00000000004146eb in hastd_primary (res=0x80204b400) at /usr/src/sbin/hastd/primary.c:1030
#6  0x000000000040a55a in check_signals () at /usr/src/sbin/hastd/hastd.c:359
#7  0x0000000000408852 in main (argc=<value optimized out>, argv=<value optimized out>)
     at /usr/src/sbin/hastd/hastd.c:1138
#8  0x0000000000403b0f in _start ()
#9  0x000000080064f000 in ?? ()
#10 0x0000000000000000 in ?? ()
(gdb)

===Cut===

If anybody has any idea how I can bring it up, please let me know.

Thanks.

Eugene.




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?dbc8f749-1d2b-f8e7-317b-f16bcc497062>