Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 28 Oct 2015 04:52:49 +0000
From:      bugzilla-noreply@freebsd.org
To:        freebsd-bugs@FreeBSD.org
Subject:   [Bug 204081] Uninterruptible sleep under 10.2-RELEASE
Message-ID:  <bug-204081-8@https.bugs.freebsd.org/bugzilla/>

next in thread | raw e-mail | index | archive | help
https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=204081

            Bug ID: 204081
           Summary: Uninterruptible sleep under 10.2-RELEASE
           Product: Base System
           Version: 10.2-RELEASE
          Hardware: amd64
                OS: Any
            Status: New
          Severity: Affects Some People
          Priority: ---
         Component: kern
          Assignee: freebsd-bugs@FreeBSD.org
          Reporter: nick.frampton@akips.com

We have a process which randomly gets stuck in uninterruptible sleep.

The process is a large scale ping and SNMP poller, using libpcap, raw ICMP
socket, and UDP socket.

If sendto(2) fails, we call usleep(50000) and retry. We suspect (but can't
confirm) that the process is stuck in this sleep.

We have not been able to reliably reproduce this problem. It has happened on a
customer's server and in our test lab. The process is still currently stuck in
this state and we can provide additional debug info if requested.


# freebsd-version -ku; uname -apKU
10.2-RELEASE-p3
10.2-RELEASE-p3
FreeBSD shed163.akips.com 10.2-RELEASE-p3 FreeBSD 10.2-RELEASE-p3 #0 r287883:
Thu Sep 17 05:59:12 UTC 2015    
root@shed31.akips.com:/usr/obj/usr/src/sys/GENERIC  amd64 amd64 1002000 1002000



# top -b
last pid: 27815;  load averages:  1.79,  1.29,  1.23  up 12+18:46:53   
14:50:31
107 processes: 4 running, 98 sleeping, 5 zombie

Mem: 723M Active, 183M Inact, 2937M Wired, 2400K Cache, 435M Buf, 106M Free
ARC: 2496M Total, 134M MFU, 2342M MRU, 100K Anon, 11M Header, 10M Other
Swap: 4096M Total, 524M Used, 3572M Free, 12% Inuse


  PID USERNAME    THR PRI NICE   SIZE    RES STATE   C   TIME    WCPU COMMAND
20499 akips         1  31    0   117M 71800K RUN     1   5:21   6.05% perl
19844 akips         1  23    0 70948K 21456K RUN     0  11:41   4.05% perl
19845 akips         1  22    0   750M 45900K select  0   9:53   3.08% nm-db
20500 akips         1  21    0   802M   104M select  1   1:06   1.46% nm-db
24516 akips         1  20    0 74020K  5352K select  0  70:06   1.07% nm-httpd
24464 akips         1  20    0 73948K  3308K RUN     0  41:37   0.59% nm-timed
24467 root          1  20    0   421M 63696K select  1  52:24   0.29% nm-joatd
24525 akips         1  20    0   104M 21600K select  0  85:04   0.20%
nm-flow-collector
24528 akips         1   4    0   782M 77196K         0 222:50   0.00% nm-poller
24511 akips         1  20    0   123M  3512K select  0  18:14   0.00% nm-msgd
24538 akips         1  20    0    98M 28432K select  1  18:05   0.00%
nm-flow-meter
24510 akips         1  20    0   750M   269M nanslp  0  17:34   0.00%
nm-poller-helper
24536 akips         1  20    0    98M 27008K select  0  15:44   0.00%
nm-flow-meter
24512 root          1  20    0 21948K  2652K nanslp  1  13:45   0.00% top
24537 akips         1  20    0 92408K 18668K select  0   6:37   0.00%
nm-flow-meter
24527 akips         1  20    0   750M 43756K select  0   4:58   0.00%
nm-snmp-trapd
24544 akips         1  20    0 88312K 13080K select  1   3:59   0.00%
nm-flow-meter
24493 akips         1  20    0 78056K  3652K select  0   3:56   0.00%
nm-fifo-tee



# procstat -kk 24528
  PID    TID COMM             TDNAME           KSTACK                       
24528 100360 nm-poller        -                mi_switch+0xe1
sleepq_timedwait_sig+0x8b _cv_timedwait_sig_sbt+0x18b seltdwait+0xa4
kern_select+0x8fa sys_select+0x54 amd64_syscall+0x357 Xfast_syscall+0xfb


# ps -o user,pid,ppid,pending,caught,ignored,blocked,stat,wchan 24528
USER    PID PPID PENDING   CAUGHT IGNORED  BLOCKED STAT WCHAN
akips 24528    1       0 f008568b 8408000 60085003 Ds   -


# ps -o pid,paddr 24528
  PID            PADDR
24528 fffff8009d563000


(kgdb) p *(struct proc *)0xfffff8009d563000
$1 = {p_list = {le_next = 0xfffff800194339d0, le_prev = 0xfffff800133da000},
p_threads = {
    tqh_first = 0xfffff8009d566000, tqh_last = 0xfffff8009d566010}, p_slock =
{lock_object = {
      lo_name = 0xffffffff80fc8265 "process slock", lo_flags = 720896, lo_data
= 0, 
      lo_witness = 0x0}, mtx_lock = 4}, p_ucred = 0xfffff80013daf300, p_fd =
0xfffff800c19f8000, 
  p_fdtol = 0x0, p_stats = 0xfffff80013c1b200, p_limit = 0xfffff800ac7d2400,
p_limco = {c_links = {
      le = {le_next = 0x0, le_prev = 0x0}, sle = {sle_next = 0x0}, tqe =
{tqe_next = 0x0, 
        tqe_prev = 0x0}}, c_time = 0, c_precision = 0, c_arg = 0x0, c_func = 0, 
    c_lock = 0xfffff8009d563100, c_flags = 0, c_iflags = 0, c_cpu = 0}, 
  p_sigacts = 0xfffff800320aa000, p_flag = 268435712, p_flag2 = 0, p_state =
PRS_NORMAL, 
  p_pid = 24528, p_hash = {le_next = 0x0, le_prev = 0xfffffe00006f1e80},
p_pglist = {le_next = 0x0, 
    le_prev = 0xfffff8007bd35090}, p_pptr = 0xfffff800029724e8, p_sibling = {
    le_next = 0xfffff800194339d0, le_prev = 0xfffff800029725e0}, p_children =
{lh_first = 0x0}, 
  p_mtx = {lock_object = {lo_name = 0xffffffff80fc8258 "process lock", lo_flags
= 21168128, 
      lo_data = 0, lo_witness = 0x0}, mtx_lock = 4}, p_ksi =
0xfffff80013301070, p_sigqueue = {
    sq_signals = {__bits = {0, 0, 0, 0}}, sq_kill = {__bits = {0, 0, 0, 0}},
sq_list = {
      tqh_first = 0x0, tqh_last = 0xfffff8009d563148}, sq_proc =
0xfffff8009d563000, sq_flags = 1}, 
  p_oppid = 0, p_vmspace = 0xfffff801062e7380, p_swtick = 686191978,
p_realtimer = {it_interval = {
      tv_sec = 0, tv_usec = 0}, it_value = {tv_sec = 0, tv_usec = 0}}, p_ru =
{ru_utime = {
      tv_sec = 0, tv_usec = 0}, ru_stime = {tv_sec = 0, tv_usec = 0}, ru_maxrss
= 0, ru_ixrss = 0, 
    ru_idrss = 0, ru_isrss = 0, ru_minflt = 0, ru_majflt = 0, ru_nswap = 0,
ru_inblock = 0, 
    ru_oublock = 0, ru_msgsnd = 0, ru_msgrcv = 0, ru_nsignals = 0, ru_nvcsw =
0, ru_nivcsw = 0}, 
  p_rux = {rux_runtime = 22692949475677, rux_uticks = 290352, rux_sticks =
1363204, rux_iticks = 0, 
    rux_uu = 2347747468, rux_su = 11022676790, rux_tu = 13370422433}, p_crux =
{rux_runtime = 0, 
    rux_uticks = 0, rux_sticks = 0, rux_iticks = 0, rux_uu = 0, rux_su = 0,
rux_tu = 0}, 
  p_profthreads = 0, p_exitthreads = 0, p_traceflag = 0, p_tracevp = 0x0,
p_tracecred = 0x0, 
  p_textvp = 0xfffff800cfa9d588, p_lock = 0, p_sigiolst = {slh_first = 0x0},
p_sigparent = 20, 
  p_sig = 0, p_code = 0, p_stops = 0, p_stype = 0, p_step = 0 '\0', p_pfsflags
= 0 '\0', 
  p_nlminfo = 0x0, p_aioinfo = 0x0, p_singlethread = 0x0, p_suspcount = 0,
p_xthread = 0x0, 
  p_boundary_count = 0, p_pendingcnt = 0, p_itimers = 0x0, p_procdesc = 0x0,
p_magic = 3203398350, 
  p_osrel = 1002000, p_comm =
"nm-poller\000\000\000\000\000\000\000\000\000\000", 
  p_pgrp = 0xfffff8007bd35080, p_sysent = 0xffffffff814d4388, p_args =
0xfffff80002b73360, 
  p_cpulimit = 9223372036854775807, p_nice = 0 '\0', p_fibnum = 0, p_xstat = 0,
p_klist = {
    kl_list = {slh_first = 0x0}, kl_lock = 0xffffffff80907d90
<knlist_mtx_lock>, 
    kl_unlock = 0xffffffff80907dd0 <knlist_mtx_unlock>, 
    kl_assert_locked = 0xffffffff80907e10 <knlist_mtx_assert_locked>, 
    kl_assert_unlocked = 0xffffffff80907e20 <knlist_mtx_assert_unlocked>, 
    kl_lockarg = 0xfffff8009d563100}, p_numthreads = 1, p_md = {md_ldt = 0x0,
md_ldt_sd = {
      sd_lolimit = 0, sd_lobase = 0, sd_type = 0, sd_dpl = 0, sd_p = 0,
sd_hilimit = 0, sd_xx0 = 0, 
      sd_gran = 0, sd_hibase = 0, sd_xx1 = 0, sd_mbz = 0, sd_xx2 = 0}},
p_itcallout = {c_links = {
      le = {le_next = 0x0, le_prev = 0x0}, sle = {sle_next = 0x0}, tqe =
{tqe_next = 0x0, 
        tqe_prev = 0x0}}, c_time = 0, c_precision = 0, c_arg = 0x0, c_func = 0, 
    c_lock = 0xfffff8009d563100, c_flags = 0, c_iflags = 0, c_cpu = 0},
p_acflag = 1, p_peers = 0x0, 
  p_leader = 0xfffff8009d563000, p_emuldata = 0x0, p_label = 0x0, p_sched =
0xfffff8009d5634e8, 
  p_ktr = {stqh_first = 0x0, stqh_last = 0xfffff8009d563448}, p_mqnotifier =
{lh_first = 0x0}, 
  p_dtrace = 0xfffff8008855f100, p_pwait = {cv_description = 0xffffffff80fc8c59
"ppwait", 
    cv_waiters = 0}, p_dbgwait = {cv_description = 0xffffffff80fc8c60
"dbgwait", cv_waiters = 0}, 
  p_prev_runtime = 0, p_racct = 0x0, p_throttled = 0 '\0', p_orphan = {le_next
= 0x0, 
    le_prev = 0x0}, p_orphans = {lh_first = 0x0}, p_treeflag = 0, p_reaper =
0xfffff800029724e8, 
  p_reaplist = {lh_first = 0x0}, p_reapsibling = {le_next = 0xfffff800194339d0, 
    le_prev = 0xfffff800133da4d0}, p_reapsubtree = 16}


(kgdb) p/x *(((struct proc *)0xfffff8009d563000)->p_threads.tqh_first)
$2 = {td_lock = 0xffffffff815cb380, td_proc = 0xfffff8009d563000, td_plist =
{tqe_next = 0x0, 
    tqe_prev = 0xfffff8009d563010}, td_runq = {tqe_next = 0x0, tqe_prev =
0xffffffff815cb5c8}, 
  td_slpq = {tqe_next = 0x0, tqe_prev = 0xfffff800027cdc80}, td_lockq =
{tqe_next = 0x0, 
    tqe_prev = 0xfffffe0122b4e638}, td_hash = {le_next = 0x0, le_prev =
0xfffffe0000866040}, 
  td_cpuset = 0xfffff8000296c3a8, td_sel = 0xfffff80106647300, td_sleepqueue =
0xfffff800027cdc80, 
  td_turnstile = 0xfffff8009d5b5a80, td_rlqe = 0xfffff80013434d70, td_umtxq =
0xfffff80013c2cb80, 
  td_tid = 0x18808, td_sigqueue = {sq_signals = {__bits = {0x0, 0x0, 0x0,
0x0}}, sq_kill = {
      __bits = {0x0, 0x0, 0x0, 0x0}}, sq_list = {tqh_first = 0x0, tqh_last =
0xfffff8009d5660b8}, 
    sq_proc = 0xfffff8009d563000, sq_flags = 0x1}, td_lend_user_pri = 0xff,
td_flags = 0x14, 
  td_inhibitors = 0x2, td_pflags = 0x0, td_dupfd = 0x0, td_sqqueue = 0x0,
td_wchan = 0x0, 
  td_wmesg = 0x0, td_lastcpu = 0x0, td_oncpu = 0xff, td_owepreempt = 0x0,
td_tsqueue = 0x0, 
  td_locks = 0xca3f, td_rw_rlocks = 0x0, td_lk_slocks = 0x0, td_stopsched =
0x0, td_blocked = 0x0, 
  td_lockname = 0x0, td_contested = {lh_first = 0x0}, td_sleeplocks = 0x0, 
  td_intr_nesting_level = 0x0, td_pinned = 0x0, td_ucred = 0xfffff80013daf300,
td_estcpu = 0x0, 
  td_slptick = 0x0, td_blktick = 0x0, td_swvoltick = 0x36cff98d, td_cow = 0x22,
td_ru = {ru_utime = {
      tv_sec = 0x0, tv_usec = 0x0}, ru_stime = {tv_sec = 0x0, tv_usec = 0x0},
ru_maxrss = 0x37b40, 
    ru_ixrss = 0x2f4f0180, ru_idrss = 0xc3b0abce0, ru_isrss = 0xc9d9a00,
ru_minflt = 0x20b07, 
    ru_majflt = 0x3df, ru_nswap = 0x0, ru_inblock = 0x0, ru_oublock = 0x1,
ru_msgsnd = 0x4e8632d, 
    ru_msgrcv = 0x2385fa8, ru_nsignals = 0x0, ru_nvcsw = 0x2d34637, ru_nivcsw =
0xa450a3}, td_rux = {
    rux_runtime = 0x14a39d31415d, rux_uticks = 0x46e30, rux_sticks = 0x14cd04,
rux_iticks = 0x0, 
    rux_uu = 0x8befc16a, rux_su = 0x29100b336, rux_tu = 0x31cf074a1},
td_incruntime = 0x0, 
  td_runtime = 0x14a39d31415d, td_pticks = 0x0, td_sticks = 0x0, td_iticks =
0x0, td_uticks = 0x0, 
  td_intrval = 0x0, td_oldsigmask = {__bits = {0x0, 0x0, 0x0, 0x0}},
td_generation = 0x37796da, 
  td_sigstk = {ss_sp = 0x0, ss_size = 0x0, ss_flags = 0x4}, td_xsig = 0x0,
td_profil_addr = 0x0, 
  td_profil_ticks = 0x0, td_name = {0x6e, 0x6d, 0x2d, 0x70, 0x6f, 0x6c, 0x6c,
0x65, 0x72, 
    0x0 <repeats 11 times>}, td_fpop = 0x0, td_dbgflags = 0x0, td_dbgksi =
{ksi_link = {
      tqe_next = 0x0, tqe_prev = 0x0}, ksi_info = {si_signo = 0x0, si_errno =
0x0, si_code = 0x0, 
      si_pid = 0x0, si_uid = 0x0, si_status = 0x0, si_addr = 0x0, si_value =
{sival_int = 0x0, 
        sival_ptr = 0x0, sigval_int = 0x0, sigval_ptr = 0x0}, _reason = {_fault
= {_trapno = 0x0}, 
        _timer = {_timerid = 0x0, _overrun = 0x0}, _mesgq = {_mqd = 0x0}, _poll
= {_band = 0x0}, 
        __spare__ = {__spare1__ = 0x0, __spare2__ = {0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0}}}}, 
    ksi_flags = 0x0, ksi_sigq = 0x0}, td_ng_outbound = 0x0, td_osd =
{osd_nslots = 0x0, 
    osd_slots = 0x0, osd_next = {le_next = 0x0, le_prev = 0x0}},
td_map_def_user = 0x0, 
  td_dbg_forked = 0x0, td_vp_reserv = 0x0, td_no_sleeping = 0x0, td_dom_rr_idx
= 0x0, td_sigmask = {
    __bits = {0x60085003, 0x0, 0x0, 0x0}}, td_rqindex = 0x1a, td_base_pri =
0x68, 
  td_priority = 0x68, td_pri_class = 0x3, td_user_pri = 0x7f, td_base_user_pri
= 0x7f, 
  td_pcb = 0xfffffe0122aa4cc0, td_state = 0x1, td_retval = {0x0, 0x0},
td_slpcallout = {c_links = {
      le = {le_next = 0x0, le_prev = 0xfffffe0000880840}, sle = {sle_next =
0x0}, tqe = {
        tqe_next = 0x0, tqe_prev = 0xfffffe0000880840}}, c_time =
0xe082d08b24e81, 
    c_precision = 0x12381, c_arg = 0xfffff8009d566000, c_func =
0xffffffff809901f0, c_lock = 0x0, 
    c_flags = 0x0, c_iflags = 0x110, c_cpu = 0x1}, td_frame =
0xfffffe0122aa4c00, 
  td_kstack_obj = 0xfffff8009d709700, td_kstack = 0xfffffe0122aa1000,
td_kstack_pages = 0x4, 
  td_critnest = 0x1, td_md = {md_spinlock_count = 0x1, md_saved_flags = 0x246, 
    md_spurflt_addr = 0x801018000}, td_sched = 0xfffff8009d566470, td_ar = 0x0,
td_lprof = {{
      lh_first = 0x0}, {lh_first = 0x0}}, td_dtrace = 0xfffff8001394db00,
td_errno = 0x0, 
  td_vnet = 0x0, td_vnet_lpush = 0x0, td_intr_frame = 0x0, td_rfppwait_p =
0xfffff8001334c4e8, 
  td_ma = 0x0, td_ma_cnt = 0x0, td_su = 0x0}

-- 
You are receiving this mail because:
You are the assignee for the bug.



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?bug-204081-8>