Date: Thu, 7 Nov 2019 21:01:36 +0000 (UTC) From: Gleb Smirnoff <glebius@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r354479 - in head/sys: netinet netinet6 Message-ID: <201911072101.xA7L1a9t078550@repo.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: glebius Date: Thu Nov 7 21:01:36 2019 New Revision: 354479 URL: https://svnweb.freebsd.org/changeset/base/354479 Log: Now with epoch synchronized PCB lookup tables we can greatly simplify locking in udp_output() and udp6_output(). First, we select if we need read or write lock in PCB itself, we take the lock and enter network epoch. Then, we proceed for the rest of the function. In case if we need to modify PCB hash, we would take write lock on it for a short piece of code. We could exit the epoch before allocating an mbuf, but with this patch we are keeping it all the way into ip_output()/ip6_output(). Today this creates an epoch recursion, since ip_output() enters epoch itself. However, once all protocols are reviewed, ip_output() and ip6_output() would require epoch instead of entering it. Note: I'm not 100% sure that in udp6_output() the epoch is required. We don't do PCB hash lookup for a bound socket. And all branches of in6_select_src() don't require epoch, at least they lack assertions. Today inet6 address list is protected by rmlock, although it is CKLIST. AFAIU, the future plan is to protect it by network epoch. That would require epoch in in6_select_src(). Anyway, in future ip6_output() would require epoch, udp6_output() would need to enter it. Modified: head/sys/netinet/udp_usrreq.c head/sys/netinet6/udp6_usrreq.c Modified: head/sys/netinet/udp_usrreq.c ============================================================================== --- head/sys/netinet/udp_usrreq.c Thu Nov 7 20:57:51 2019 (r354478) +++ head/sys/netinet/udp_usrreq.c Thu Nov 7 21:01:36 2019 (r354479) @@ -1118,9 +1118,6 @@ udp_ctloutput(struct socket *so, struct sockopt *sopt) } #ifdef INET -#define UH_WLOCKED 2 -#define UH_RLOCKED 1 -#define UH_UNLOCKED 0 static int udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) @@ -1136,19 +1133,12 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct s int error = 0; int ipflags; u_short fport, lport; - int unlock_udbinfo, unlock_inp; u_char tos; uint8_t pr; uint16_t cscov = 0; uint32_t flowid = 0; uint8_t flowtype = M_HASHTYPE_NONE; - /* - * udp_output() may need to temporarily bind or connect the current - * inpcb. As such, we don't know up front whether we will need the - * pcbinfo lock or not. Do any work to decide what is needed up - * front before acquiring any locks. - */ if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) { if (control) m_freem(control); @@ -1158,28 +1148,22 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct s src.sin_family = 0; sin = (struct sockaddr_in *)addr; -retry: + + /* + * udp_output() may need to temporarily bind or connect the current + * inpcb. As such, we don't know up front whether we will need the + * pcbinfo lock or not. Do any work to decide what is needed up + * front before acquiring any locks. + * + * We will need network epoch in either case, to safely lookup into + * pcb hash. + */ if (sin == NULL || - (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) { + (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) INP_WLOCK(inp); - /* - * In case we lost a race and another thread bound addr/port - * on the inp we cannot keep the wlock (which still would be - * fine) as further down, based on these values we make - * decisions for the pcbinfo lock. If the locks are not in - * synch the assertions on unlock will fire, hence we go for - * one retry loop. - */ - if (sin != NULL && (inp->inp_laddr.s_addr != INADDR_ANY || - inp->inp_lport != 0)) { - INP_WUNLOCK(inp); - goto retry; - } - unlock_inp = UH_WLOCKED; - } else { + else INP_RLOCK(inp); - unlock_inp = UH_RLOCKED; - } + NET_EPOCH_ENTER(et); tos = inp->inp_ip_tos; if (control != NULL) { /* @@ -1187,13 +1171,9 @@ retry: * stored in a single mbuf. */ if (control->m_next) { - if (unlock_inp == UH_WLOCKED) - INP_WUNLOCK(inp); - else - INP_RUNLOCK(inp); m_freem(control); - m_freem(m); - return (EINVAL); + error = EINVAL; + goto release; } for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len), @@ -1264,56 +1244,11 @@ retry: } m_freem(control); } - if (error) { - if (unlock_inp == UH_WLOCKED) - INP_WUNLOCK(inp); - else - INP_RUNLOCK(inp); - m_freem(m); - return (error); - } + if (error) + goto release; - /* - * In the old days, depending on whether or not the application had - * bound or connected the socket, we had to do varying levels of work. - * The optimal case was for a connected UDP socket, as a global lock - * wasn't required at all. - * In order to decide which we need, we required stability of the - * inpcb binding, which we ensured by acquiring a read lock on the - * inpcb. This didn't strictly follow the lock order, so we played - * the trylock and retry game. - * With the re-introduction of the route-cache in some cases, we started - * to acquire an early inp wlock and a possible race during re-lock - * went away. With the introduction of epoch(9) some read locking - * became epoch(9) and the lock-order issues also went away. - * Due to route-cache we may now hold more conservative locks than - * otherwise required and have split up the 2nd case in case 2 and 3 - * in order to keep the udpinfo lock level in sync with the inp one - * for the IP_SENDSRCADDR case below. - */ pr = inp->inp_socket->so_proto->pr_protocol; pcbinfo = udp_get_inpcbinfo(pr); - if (sin != NULL && - (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) { - INP_HASH_WLOCK(pcbinfo); - unlock_udbinfo = UH_WLOCKED; - } else if (sin != NULL && - (sin->sin_addr.s_addr == INADDR_ANY || - sin->sin_addr.s_addr == INADDR_BROADCAST || - inp->inp_laddr.s_addr == INADDR_ANY || - inp->inp_lport == 0)) { - INP_HASH_RLOCK_ET(pcbinfo, et); - unlock_udbinfo = UH_RLOCKED; - } else if (src.sin_family == AF_INET) { - if (unlock_inp == UH_WLOCKED) { - INP_HASH_WLOCK(pcbinfo); - unlock_udbinfo = UH_WLOCKED; - } else { - INP_HASH_RLOCK_ET(pcbinfo, et); - unlock_udbinfo = UH_RLOCKED; - } - } else - unlock_udbinfo = UH_UNLOCKED; /* * If the IP_SENDSRCADDR control message was specified, override the @@ -1389,7 +1324,6 @@ retry: if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { INP_WLOCK_ASSERT(inp); - INP_HASH_WLOCK_ASSERT(pcbinfo); /* * Remember addr if jailed, to prevent * rebinding. @@ -1397,7 +1331,10 @@ retry: if (prison_flag(td->td_ucred, PR_IP4)) inp->inp_laddr = laddr; inp->inp_lport = lport; - if (in_pcbinshash(inp) != 0) { + INP_HASH_WLOCK(pcbinfo); + error = in_pcbinshash(inp); + INP_HASH_WUNLOCK(pcbinfo); + if (error != 0) { inp->inp_lport = 0; error = EAGAIN; goto release; @@ -1562,48 +1499,20 @@ retry: ipflags |= IP_NODEFAULTFLOWID; #endif /* RSS */ - if (unlock_udbinfo == UH_WLOCKED) - INP_HASH_WUNLOCK(pcbinfo); - else if (unlock_udbinfo == UH_RLOCKED) - INP_HASH_RUNLOCK_ET(pcbinfo, et); if (pr == IPPROTO_UDPLITE) UDPLITE_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u); else UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u); error = ip_output(m, inp->inp_options, - (unlock_inp == UH_WLOCKED ? &inp->inp_route : NULL), ipflags, + INP_WLOCKED(inp) ? &inp->inp_route : NULL, ipflags, inp->inp_moptions, inp); - if (unlock_inp == UH_WLOCKED) - INP_WUNLOCK(inp); - else - INP_RUNLOCK(inp); + INP_UNLOCK(inp); + NET_EPOCH_EXIT(et); return (error); release: - if (unlock_udbinfo == UH_WLOCKED) { - KASSERT(unlock_inp == UH_WLOCKED, - ("%s: excl udbinfo lock %#03x, shared inp lock %#03x, " - "sin %p daddr %#010x inp %p laddr %#010x lport %#06x " - "src fam %#04x", - __func__, unlock_udbinfo, unlock_inp, sin, - (sin != NULL) ? sin->sin_addr.s_addr : 0xfefefefe, inp, - inp->inp_laddr.s_addr, inp->inp_lport, src.sin_family)); - INP_HASH_WUNLOCK(pcbinfo); - INP_WUNLOCK(inp); - } else if (unlock_udbinfo == UH_RLOCKED) { - KASSERT(unlock_inp == UH_RLOCKED, - ("%s: shared udbinfo lock %#03x, excl inp lock %#03x, " - "sin %p daddr %#010x inp %p laddr %#010x lport %#06x " - "src fam %#04x", - __func__, unlock_udbinfo, unlock_inp, sin, - (sin != NULL) ? sin->sin_addr.s_addr : 0xfefefefe, inp, - inp->inp_laddr.s_addr, inp->inp_lport, src.sin_family)); - INP_HASH_RUNLOCK_ET(pcbinfo, et); - INP_RUNLOCK(inp); - } else if (unlock_inp == UH_WLOCKED) - INP_WUNLOCK(inp); - else - INP_RUNLOCK(inp); + INP_UNLOCK(inp); + NET_EPOCH_EXIT(et); m_freem(m); return (error); } Modified: head/sys/netinet6/udp6_usrreq.c ============================================================================== --- head/sys/netinet6/udp6_usrreq.c Thu Nov 7 20:57:51 2019 (r354478) +++ head/sys/netinet6/udp6_usrreq.c Thu Nov 7 21:01:36 2019 (r354479) @@ -678,14 +678,10 @@ udp6_getcred(SYSCTL_HANDLER_ARGS) SYSCTL_PROC(_net_inet6_udp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, 0, 0, udp6_getcred, "S,xucred", "Get the xucred of a UDP6 connection"); -#define UH_WLOCKED 2 -#define UH_RLOCKED 1 -#define UH_UNLOCKED 0 static int udp6_output(struct socket *so, int flags_arg, struct mbuf *m, struct sockaddr *addr6, struct mbuf *control, struct thread *td) { - struct inpcbinfo *pcbinfo; struct inpcb *inp; struct ip6_hdr *ip6; struct udphdr *udp6; @@ -697,7 +693,7 @@ udp6_output(struct socket *so, int flags_arg, struct m u_int32_t ulen, plen; uint16_t cscov; u_short fport; - uint8_t nxt, unlock_inp, unlock_udbinfo; + uint8_t nxt; /* addr6 has been validated in udp6_send(). */ sin6 = (struct sockaddr_in6 *)addr6; @@ -740,30 +736,17 @@ udp6_output(struct socket *so, int flags_arg, struct m * - on connected sockets (sin6 is NULL) for route cache updates, * - when we are not bound to an address and source port (it is * in6_pcbsetport() which will require the write lock). + * + * We check the inp fields before actually locking the inp, so + * here exists a race, and we may WLOCK the inp and end with already + * bound one by other thread. This is fine. */ -retry: if (sin6 == NULL || (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && - inp->inp_lport == 0)) { + inp->inp_lport == 0)) INP_WLOCK(inp); - /* - * In case we lost a race and another thread bound addr/port - * on the inp we cannot keep the wlock (which still would be - * fine) as further down, based on these values we make - * decisions for the pcbinfo lock. If the locks are not in - * synch the assertions on unlock will fire, hence we go for - * one retry loop. - */ - if (sin6 != NULL && - (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) || - inp->inp_lport != 0)) { - INP_WUNLOCK(inp); - goto retry; - } - unlock_inp = UH_WLOCKED; - } else { + else INP_RLOCK(inp); - unlock_inp = UH_RLOCKED; - } + nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ? IPPROTO_UDP : IPPROTO_UDPLITE; @@ -787,10 +770,7 @@ retry: * potential race in which the factors causing us to * select the UDPv4 output routine are invalidated? */ - if (unlock_inp == UH_WLOCKED) - INP_WUNLOCK(inp); - else - INP_RUNLOCK(inp); + INP_UNLOCK(inp); if (sin6) in6_sin6_2_sin_in_sock((struct sockaddr *)sin6); pru = inetsw[ip_protox[nxt]].pr_usrreqs; @@ -805,21 +785,17 @@ retry: * Given this is either an IPv6-only socket or no INET is * supported we will fail the send if the given destination * address is a v4mapped address. + * + * XXXGL: do we leak m and control? */ - if (unlock_inp == UH_WLOCKED) - INP_WUNLOCK(inp); - else - INP_RUNLOCK(inp); + INP_UNLOCK(inp); return (EINVAL); } if (control) { if ((error = ip6_setpktopts(control, &opt, inp->in6p_outputopts, td->td_ucred, nxt)) != 0) { - if (unlock_inp == UH_WLOCKED) - INP_WUNLOCK(inp); - else - INP_RUNLOCK(inp); + INP_UNLOCK(inp); ip6_clearpktopts(&opt, -1); if (control) m_freem(control); @@ -830,20 +806,7 @@ retry: } else optp = inp->in6p_outputopts; - pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); - if (sin6 != NULL && - IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && inp->inp_lport == 0) { - INP_HASH_WLOCK(pcbinfo); - unlock_udbinfo = UH_WLOCKED; - } else if (sin6 != NULL && - (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) || - IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) || - inp->inp_lport == 0)) { - INP_HASH_RLOCK_ET(pcbinfo, et); - unlock_udbinfo = UH_RLOCKED; - } else - unlock_udbinfo = UH_UNLOCKED; - + NET_EPOCH_ENTER(et); if (sin6) { /* @@ -879,9 +842,14 @@ retry: laddr = &in6a; if (inp->inp_lport == 0) { + struct inpcbinfo *pcbinfo; INP_WLOCK_ASSERT(inp); + + pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); + INP_HASH_WLOCK(pcbinfo); error = in6_pcbsetport(laddr, inp, td->td_ucred); + INP_HASH_WUNLOCK(pcbinfo); if (error != 0) { /* Undo an address bind that may have occurred. */ inp->in6p_laddr = in6addr_any; @@ -1005,21 +973,15 @@ retry: #endif UDPSTAT_INC(udps_opackets); - if (unlock_udbinfo == UH_WLOCKED) - INP_HASH_WUNLOCK(pcbinfo); - else if (unlock_udbinfo == UH_RLOCKED) - INP_HASH_RUNLOCK_ET(pcbinfo, et); if (nxt == IPPROTO_UDPLITE) UDPLITE_PROBE(send, NULL, inp, ip6, inp, udp6); else UDP_PROBE(send, NULL, inp, ip6, inp, udp6); error = ip6_output(m, optp, - (unlock_inp == UH_WLOCKED) ? &inp->inp_route6 : NULL, flags, + INP_WLOCKED(inp) ? &inp->inp_route6 : NULL, flags, inp->in6p_moptions, NULL, inp); - if (unlock_inp == UH_WLOCKED) - INP_WUNLOCK(inp); - else - INP_RUNLOCK(inp); + INP_UNLOCK(inp); + NET_EPOCH_EXIT(et); if (control) { ip6_clearpktopts(&opt, -1); @@ -1028,22 +990,8 @@ retry: return (error); release: - if (unlock_udbinfo == UH_WLOCKED) { - KASSERT(unlock_inp == UH_WLOCKED, ("%s: excl udbinfo lock, " - "non-excl inp lock: pcbinfo %p %#x inp %p %#x", - __func__, pcbinfo, unlock_udbinfo, inp, unlock_inp)); - INP_HASH_WUNLOCK(pcbinfo); - INP_WUNLOCK(inp); - } else if (unlock_udbinfo == UH_RLOCKED) { - KASSERT(unlock_inp == UH_RLOCKED, ("%s: non-excl udbinfo lock, " - "excl inp lock: pcbinfo %p %#x inp %p %#x", - __func__, pcbinfo, unlock_udbinfo, inp, unlock_inp)); - INP_HASH_RUNLOCK_ET(pcbinfo, et); - INP_RUNLOCK(inp); - } else if (unlock_inp == UH_WLOCKED) - INP_WUNLOCK(inp); - else - INP_RUNLOCK(inp); + INP_UNLOCK(inp); + NET_EPOCH_EXIT(et); if (control) { ip6_clearpktopts(&opt, -1); m_freem(control);
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201911072101.xA7L1a9t078550>