Date: Mon, 24 Apr 2017 16:31:28 +0000 (UTC) From: Steven Hartland <smh@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-10@freebsd.org Subject: svn commit: r317375 - in stable/10: cddl/lib/libdtrace sys/netinet Message-ID: <201704241631.v3OGVSx1062457@repo.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: smh Date: Mon Apr 24 16:31:28 2017 New Revision: 317375 URL: https://svnweb.freebsd.org/changeset/base/317375 Log: Partial MFC r316676 and the required r313045 MFC r316676: Use estimated RTT for receive buffer auto resizing instead of timestamps. This is a partial MFC as stable/10 doesn't include the TCP stack modularisation. MFC r313045: Add an mbuf to ipinfo_t translator to finish cleanup of mbuf passing to TCP probes. This is a partial MFC (missing debug__output & debug__drop changes) due to the massive amount of additional dtrace changes that would be required for a full MFC. Relnotes: Yes Sponsored by: Multiplay Modified: stable/10/cddl/lib/libdtrace/ip.d stable/10/sys/netinet/in_kdtrace.c stable/10/sys/netinet/in_kdtrace.h stable/10/sys/netinet/tcp_input.c stable/10/sys/netinet/tcp_output.c stable/10/sys/netinet/tcp_var.h Directory Properties: stable/10/ (props changed) Modified: stable/10/cddl/lib/libdtrace/ip.d ============================================================================== --- stable/10/cddl/lib/libdtrace/ip.d Mon Apr 24 16:07:30 2017 (r317374) +++ stable/10/cddl/lib/libdtrace/ip.d Mon Apr 24 16:31:28 2017 (r317375) @@ -240,6 +240,24 @@ translator ipinfo_t < uint8_t *p > { #pragma D binding "1.0" IFF_LOOPBACK inline int IFF_LOOPBACK = 0x8; +#pragma D binding "1.13" translator +translator ipinfo_t < struct mbuf *m > { + ip_ver = m == NULL ? 0 : ((struct ip *)m->m_data)->ip_v; + ip_plength = m == NULL ? 0 : + ((struct ip *)m->m_data)->ip_v == 4 ? + ntohs(((struct ip *)m->m_data)->ip_len) - + (((struct ip *)m->m_data)->ip_hl << 2): + ntohs(((struct ip6_hdr *)m->m_data)->ip6_ctlun.ip6_un1.ip6_un1_plen); + ip_saddr = m == NULL ? 0 : + ((struct ip *)m->m_data)->ip_v == 4 ? + inet_ntoa(&((struct ip *)m->m_data)->ip_src.s_addr) : + inet_ntoa6(&((struct ip6_hdr *)m->m_data)->ip6_src); + ip_daddr = m == NULL ? 0 : + ((struct ip *)m->m_data)->ip_v == 4 ? + inet_ntoa(&((struct ip *)m->m_data)->ip_dst.s_addr) : + inet_ntoa6(&((struct ip6_hdr *)m->m_data)->ip6_dst); +}; + #pragma D binding "1.0" translator translator ifinfo_t < struct ifnet *p > { if_name = p->if_xname; Modified: stable/10/sys/netinet/in_kdtrace.c ============================================================================== --- stable/10/sys/netinet/in_kdtrace.c Mon Apr 24 16:07:30 2017 (r317374) +++ stable/10/sys/netinet/in_kdtrace.c Mon Apr 24 16:31:28 2017 (r317375) @@ -58,28 +58,28 @@ SDT_PROBE_DEFINE6_XLATE(ip, , , send, SDT_PROBE_DEFINE5_XLATE(tcp, , , accept__established, "void *", "pktinfo_t *", "struct tcpcb *", "csinfo_t *", - "uint8_t *", "ipinfo_t *", + "struct mbuf *", "ipinfo_t *", "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfoh_t *"); SDT_PROBE_DEFINE5_XLATE(tcp, , , accept__refused, "void *", "pktinfo_t *", "struct tcpcb *", "csinfo_t *", - "uint8_t *", "ipinfo_t *", + "struct mbuf *", "ipinfo_t *", "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfo_t *"); SDT_PROBE_DEFINE5_XLATE(tcp, , , connect__established, "void *", "pktinfo_t *", "struct tcpcb *", "csinfo_t *", - "uint8_t *", "ipinfo_t *", + "struct mbuf *", "ipinfo_t *", "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfoh_t *"); SDT_PROBE_DEFINE5_XLATE(tcp, , , connect__refused, "void *", "pktinfo_t *", "struct tcpcb *", "csinfo_t *", - "uint8_t *", "ipinfo_t *", + "struct mbuf *", "ipinfo_t *", "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfoh_t *"); @@ -93,7 +93,7 @@ SDT_PROBE_DEFINE5_XLATE(tcp, , , connect SDT_PROBE_DEFINE5_XLATE(tcp, , , receive, "void *", "pktinfo_t *", "struct tcpcb *", "csinfo_t *", - "uint8_t *", "ipinfo_t *", + "struct mbuf *", "ipinfo_t *", "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfoh_t *"); @@ -112,6 +112,14 @@ SDT_PROBE_DEFINE6_XLATE(tcp, , , state__ "void *", "void *", "int", "tcplsinfo_t *"); +SDT_PROBE_DEFINE6_XLATE(tcp, , , receive__autoresize, + "void *", "void *", + "struct tcpcb *", "csinfo_t *", + "struct mbuf *", "ipinfo_t *", + "struct tcpcb *", "tcpsinfo_t *" , + "struct tcphdr *", "tcpinfoh_t *", + "int", "int"); + SDT_PROBE_DEFINE5_XLATE(udp, , , receive, "void *", "pktinfo_t *", "struct inpcb *", "csinfo_t *", Modified: stable/10/sys/netinet/in_kdtrace.h ============================================================================== --- stable/10/sys/netinet/in_kdtrace.h Mon Apr 24 16:07:30 2017 (r317374) +++ stable/10/sys/netinet/in_kdtrace.h Mon Apr 24 16:31:28 2017 (r317375) @@ -52,6 +52,7 @@ SDT_PROBE_DECLARE(tcp, , , connect__requ SDT_PROBE_DECLARE(tcp, , , receive); SDT_PROBE_DECLARE(tcp, , , send); SDT_PROBE_DECLARE(tcp, , , state__change); +SDT_PROBE_DECLARE(tcp, , , receive__autoresize); SDT_PROBE_DECLARE(udp, , , receive); SDT_PROBE_DECLARE(udp, , , send); Modified: stable/10/sys/netinet/tcp_input.c ============================================================================== --- stable/10/sys/netinet/tcp_input.c Mon Apr 24 16:07:30 2017 (r317374) +++ stable/10/sys/netinet/tcp_input.c Mon Apr 24 16:31:28 2017 (r317375) @@ -1469,6 +1469,68 @@ drop: m_freem(m); } +/* + * Automatic sizing of receive socket buffer. Often the send + * buffer size is not optimally adjusted to the actual network + * conditions at hand (delay bandwidth product). Setting the + * buffer size too small limits throughput on links with high + * bandwidth and high delay (eg. trans-continental/oceanic links). + * + * On the receive side the socket buffer memory is only rarely + * used to any significant extent. This allows us to be much + * more aggressive in scaling the receive socket buffer. For + * the case that the buffer space is actually used to a large + * extent and we run out of kernel memory we can simply drop + * the new segments; TCP on the sender will just retransmit it + * later. Setting the buffer size too big may only consume too + * much kernel memory if the application doesn't read() from + * the socket or packet loss or reordering makes use of the + * reassembly queue. + * + * The criteria to step up the receive buffer one notch are: + * 1. Application has not set receive buffer size with + * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE. + * 2. the number of bytes received during the time it takes + * one timestamp to be reflected back to us (the RTT); + * 3. received bytes per RTT is within seven eighth of the + * current socket buffer size; + * 4. receive buffer size has not hit maximal automatic size; + * + * This algorithm does one step per RTT at most and only if + * we receive a bulk stream w/o packet losses or reorderings. + * Shrinking the buffer during idle times is not necessary as + * it doesn't consume any memory when idle. + * + * TODO: Only step up if the application is actually serving + * the buffer to better manage the socket buffer resources. + */ +int +tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, int tlen) +{ + int newsize = 0; + + if (V_tcp_do_autorcvbuf && (so->so_rcv.sb_flags & SB_AUTOSIZE) && + tp->t_srtt != 0 && tp->rfbuf_ts != 0 && + TCP_TS_TO_TICKS(tcp_ts_getticks() - tp->rfbuf_ts) > + (tp->t_srtt >> TCP_RTT_SHIFT)) { + if (tp->rfbuf_cnt > (so->so_rcv.sb_hiwat / 8 * 7) && + so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) { + newsize = min(so->so_rcv.sb_hiwat + + V_tcp_autorcvbuf_inc, V_tcp_autorcvbuf_max); + } + TCP_PROBE6(receive__autoresize, NULL, tp, m, tp, th, newsize); + + /* Start over with next RTT. */ + tp->rfbuf_ts = 0; + tp->rfbuf_cnt = 0; + } else { + tp->rfbuf_cnt += tlen; /* add up */ + } + + return (newsize); +} + static void tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, @@ -1811,60 +1873,7 @@ tcp_do_segment(struct mbuf *m, struct tc tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif - /* - * Automatic sizing of receive socket buffer. Often the send - * buffer size is not optimally adjusted to the actual network - * conditions at hand (delay bandwidth product). Setting the - * buffer size too small limits throughput on links with high - * bandwidth and high delay (eg. trans-continental/oceanic links). - * - * On the receive side the socket buffer memory is only rarely - * used to any significant extent. This allows us to be much - * more aggressive in scaling the receive socket buffer. For - * the case that the buffer space is actually used to a large - * extent and we run out of kernel memory we can simply drop - * the new segments; TCP on the sender will just retransmit it - * later. Setting the buffer size too big may only consume too - * much kernel memory if the application doesn't read() from - * the socket or packet loss or reordering makes use of the - * reassembly queue. - * - * The criteria to step up the receive buffer one notch are: - * 1. the number of bytes received during the time it takes - * one timestamp to be reflected back to us (the RTT); - * 2. received bytes per RTT is within seven eighth of the - * current socket buffer size; - * 3. receive buffer size has not hit maximal automatic size; - * - * This algorithm does one step per RTT at most and only if - * we receive a bulk stream w/o packet losses or reorderings. - * Shrinking the buffer during idle times is not necessary as - * it doesn't consume any memory when idle. - * - * TODO: Only step up if the application is actually serving - * the buffer to better manage the socket buffer resources. - */ - if (V_tcp_do_autorcvbuf && - (to.to_flags & TOF_TS) && - to.to_tsecr && - (so->so_rcv.sb_flags & SB_AUTOSIZE)) { - if (TSTMP_GT(to.to_tsecr, tp->rfbuf_ts) && - to.to_tsecr - tp->rfbuf_ts < hz) { - if (tp->rfbuf_cnt > - (so->so_rcv.sb_hiwat / 8 * 7) && - so->so_rcv.sb_hiwat < - V_tcp_autorcvbuf_max) { - newsize = - min(so->so_rcv.sb_hiwat + - V_tcp_autorcvbuf_inc, - V_tcp_autorcvbuf_max); - } - /* Start over with next RTT. */ - tp->rfbuf_ts = 0; - tp->rfbuf_cnt = 0; - } else - tp->rfbuf_cnt += tlen; /* add up */ - } + newsize = tcp_autorcvbuf(m, th, so, tp, tlen); /* Add data to socket buffer. */ SOCKBUF_LOCK(&so->so_rcv); @@ -1905,10 +1914,6 @@ tcp_do_segment(struct mbuf *m, struct tc win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); - /* Reset receive buffer auto scaling when not in bulk receive mode. */ - tp->rfbuf_ts = 0; - tp->rfbuf_cnt = 0; - switch (tp->t_state) { /* Modified: stable/10/sys/netinet/tcp_output.c ============================================================================== --- stable/10/sys/netinet/tcp_output.c Mon Apr 24 16:07:30 2017 (r317374) +++ stable/10/sys/netinet/tcp_output.c Mon Apr 24 16:31:28 2017 (r317375) @@ -790,11 +790,13 @@ send: to.to_tsval = tcp_ts_getticks() + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; - /* Set receive buffer autosizing timestamp. */ - if (tp->rfbuf_ts == 0 && - (so->so_rcv.sb_flags & SB_AUTOSIZE)) - tp->rfbuf_ts = tcp_ts_getticks(); } + + /* Set receive buffer autosizing timestamp. */ + if (tp->rfbuf_ts == 0 && + (so->so_rcv.sb_flags & SB_AUTOSIZE)) + tp->rfbuf_ts = tcp_ts_getticks(); + /* Selective ACK's. */ if (tp->t_flags & TF_SACK_PERMIT) { if (flags & TH_SYN) Modified: stable/10/sys/netinet/tcp_var.h ============================================================================== --- stable/10/sys/netinet/tcp_var.h Mon Apr 24 16:07:30 2017 (r317374) +++ stable/10/sys/netinet/tcp_var.h Mon Apr 24 16:31:28 2017 (r317375) @@ -704,6 +704,8 @@ int tcp_reass(struct tcpcb *, struct tc void tcp_reass_global_init(void); void tcp_reass_flush(struct tcpcb *); void tcp_input(struct mbuf *, int); +int tcp_autorcvbuf(struct mbuf *, struct tcphdr *, struct socket *, + struct tcpcb *, int); u_long tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *); u_long tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *); void tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *,
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201704241631.v3OGVSx1062457>