Date: Sun, 4 Nov 2007 00:17:42 GMT
From: Kip Macy <kmacy@FreeBSD.org>
To: Perforce Change Reviews <perforce@freebsd.org>
Subject: PERFORCE change 128601 for review
Message-ID: <200711040017.lA40HgRp005496@repoman.freebsd.org>
next in thread | raw e-mail | index | archive | help
http://perforce.freebsd.org/chv.cgi?CH=128601 Change 128601 by kmacy@kmacy:storage:toestack on 2007/11/04 00:16:43 - add support for rx - add support for returning rx credits (update receive window) - but credit calculation needs to be fixed (clearly marked) - add partial support for connection close (some shutdown states are not handled yet) Affected files ... .. //depot/projects/toestack/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c#12 edit Differences ... ==== //depot/projects/toestack/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c#12 (text+ko) ==== @@ -172,9 +172,6 @@ req->param |= htonl(V_TX_SNDBUF(so->so_snd.sb_hiwat >> 15)); toep->tp_flags |= TP_DATASENT; } - - - } int @@ -200,7 +197,7 @@ cdev = d->cdev; last = tail = so->so_snd.sb_sndptr ? so->so_snd.sb_sndptr : so->so_snd.sb_mb; total_bytes = 0; - if (toep->tp_m_last == last) { + if (last && toep->tp_m_last == last) { KASSERT(tail, ("sbdrop error")); last = tail = tail->m_next; } @@ -233,6 +230,7 @@ so->so_snd.sb_sndptroff += bytes; total_bytes += bytes; + toep->tp_write_seq += bytes; /* @@ -266,14 +264,170 @@ l2t_send(cdev, m0, toep->tp_l2t); } + + + return (total_bytes); +} + + + +/* + * Close a connection by sending a CPL_CLOSE_CON_REQ message. Cannot fail + * under any circumstances. We take the easy way out and always queue the + * message to the write_queue. We can optimize the case where the queue is + * already empty though the optimization is probably not worth it. 
+ */ +static void +close_conn(struct socket *so) +{ + struct mbuf *m; + struct cpl_close_con_req *req; + struct tom_data *d; + struct tcpcb *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + unsigned int tid = toep->tp_tid; + + d = TOM_DATA(TOE_DEV(so)); + + if (tp->t_state != TCPS_SYN_SENT) + t3_push_frames(so, 1); + + if (toep->tp_flags & TP_FIN_SENT) + return; + + m = m_gethdr(M_NOWAIT, MT_DATA); + if (m == NULL) { + /* + * XXX + */ + printf("need to defer connection close to taskq thread!!!\n"); + return; + } + toep->tp_flags |= TP_FIN_SENT; + req = mtod(m, struct cpl_close_con_req *); + m->m_pkthdr.len = m->m_len = sizeof(*req); + + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON)); + req->wr.wr_lo = htonl(V_WR_TID(tid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); + req->rsvd = htonl(toep->tp_write_seq); + + /* + * XXX - need to defer shutdown while there is still data in the queue + * + */ + cxgb_ofld_send(d->cdev, m); + +} + + +/* + * Send RX credits through an RX_DATA_ACK CPL message. If nofail is 0 we are + * permitted to return without sending the message in case we cannot allocate + * an sk_buff. Returns the number of credits sent. 
+ */ +uint32_t +t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail) +{ + struct mbuf *m; + struct cpl_rx_data_ack *req; + struct toepcb *toep = tp->t_toe; + struct toedev *tdev = toep->tp_toedev; + + m = m_gethdr(M_NOWAIT, MT_DATA); + + if (m == NULL) { + /* + * XXX need to cache mbufs for nofail allocation + */ + if (nofail) + log(LOG_ERR, "failing nofail t3_send_rx_credits!!!\n"); + return (0); + } + + printf("returning %u credits to HW\n", credits); + + req = mtod(m, struct cpl_rx_data_ack *); + m->m_pkthdr.len = m->m_len = sizeof(*req); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid)); + req->credit_dack = htonl(dack | V_RX_CREDITS(credits)); + m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toeptoso(toep))); + cxgb_ofld_send(TOM_DATA(tdev)->cdev, m); + return (credits); +} + + +/* + * Set of states for which we should return RX credits. + */ +#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2) + +/* + * Called after some received data has been read. It returns RX credits + * to the HW for the amount of data processed. 
+ */ +void +t3_cleanup_rbuf(struct tcpcb *tp) +{ + struct toepcb *toep = tp->t_toe; + struct toedev *dev; + int dack_mode, must_send; + u32 thres, credits, dack = 0; + + if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) || + (tp->t_state == TCPS_FIN_WAIT_2))) + return; + + printf("inaccurately calculating return credits - PLZ FIX\n"); + /* + * XXX this won't accurately reflect credit return - we need + * to look at the difference between the amount that has been + * put in the recv sockbuf and what is there now + */ + credits = toep->tp_copied_seq - toep->tp_rcv_wup; + if (__predict_false(!credits)) + return; + + dev = toep->tp_toedev; + thres = TOM_TUNABLE(dev, rx_credit_thres); + + if (__predict_false(thres == 0)) + return; + + if (toep->tp_ulp_mode) + dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); + else { + dack_mode = TOM_TUNABLE(dev, delack); + if (__predict_false(dack_mode != toep->tp_delack_mode)) { + u32 r = tp->rcv_nxt - toep->tp_delack_seq; + + if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp) + dack = F_RX_DACK_CHANGE | + V_RX_DACK_MODE(dack_mode); + } + } + + /* + * For coalescing to work effectively ensure the receive window has + * at least 16KB left. 
+ */ + must_send = credits + 16384 >= tp->rcv_wnd; - return (total_bytes); + if (must_send || credits >= thres) + toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send); } static int cxgb_toe_disconnect(struct tcpcb *tp) { - printf("%s UNIMPLEMENTED!!!!\n", __FUNCTION__); + struct socket *so; + + printf("cxgb_toe_disconnect\n"); + + so = tp->t_inpcb->inp_socket; + close_conn(so); + return (0); } @@ -313,7 +467,8 @@ static int cxgb_toe_rcvd(struct tcpcb *tp) { - printf("%s UNIMPLEMENTED!!!!\n", __FUNCTION__); + t3_cleanup_rbuf(tp); + return (0); } @@ -385,7 +540,82 @@ return (idx); } +void +t3_release_ddp_resources(struct socket *so) +{ + /* + * This is a no-op until we have DDP support + */ +} + +static inline void +free_atid(struct t3cdev *cdev, unsigned int tid) +{ + struct socket *so = cxgb_free_atid(cdev, tid); + if (so) { + struct tcpcb *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + + toepcb_release(toep); + } +} + +/* + * Release resources held by an offload connection (TID, L2T entry, etc.) 
+ */ static void +t3_release_offload_resources(struct socket *so) +{ + struct tcpcb *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + struct toedev *tdev = TOE_DEV(so); + struct t3cdev *cdev; + unsigned int tid = toep->tp_tid; + + if (!tdev) + return; + + cdev = T3C_DEV(so); + if (!cdev) + return; + + toep->tp_qset = 0; + t3_release_ddp_resources(so); + +#ifdef CTRL_SKB_CACHE + kfree_skb(CTRL_SKB_CACHE(tp)); + CTRL_SKB_CACHE(tp) = NULL; +#endif + + if (toep->tp_wr_avail != toep->tp_wr_max) { + purge_wr_queue(tp); + reset_wr_list(tp); + } + + if (toep->tp_l2t) { + l2t_release(L2DATA(cdev), toep->tp_l2t); + toep->tp_l2t = NULL; + } + + if (tp->t_state == TCPS_SYN_SENT) { + free_atid(cdev, tid); +#ifdef notyet + __skb_queue_purge(&tp->out_of_order_queue); +#endif + } else { // we have TID + cxgb_remove_tid(cdev, (void *)so, tid); + toepcb_release(toep); + } +#ifdef notyet + t3_set_ca_ops(sk, &tcp_init_congestion_ops); +#endif + TOE_DEV(so) = NULL; +#if 0 + log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state); +#endif +} + +static void install_offload_ops(struct socket *so) { struct tcpcb *tp = sototcpcb(so); @@ -580,15 +810,14 @@ static void fail_act_open(struct socket *so, int errno) { + struct tcpcb *tp = sototcpcb(so); + + t3_release_offload_resources(so); + tcp_drop(tp, errno); + #ifdef notyet - sk->sk_err = errno; - sk->sk_error_report(sk); - t3_release_offload_resources(sk); - connection_done(sk); TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); #endif - printf("%s:%s:%d implement me", __FUNCTION__, __FILE__, __LINE__); - } /* @@ -719,13 +948,287 @@ return (0); free_tid: - cxgb_free_atid(d->cdev, atid); + free_atid(d->cdev, atid); out_err: return (ENOMEM); } +/* + * Process new data received for a connection. 
+ */ +static void +new_rx_data(struct socket *so, struct mbuf *m) +{ + struct cpl_rx_data *hdr = cplhdr(m); + struct tcpcb *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + int len = be16toh(hdr->len); + +#ifdef notyet + if (__predict_false(sk_no_receive(sk))) { + handle_excess_rx(sk, skb); + return; + } + if (ULP_MODE(tp) == ULP_MODE_TCPDDP) + handle_ddp_data(sk, skb); + + TCP_SKB_CB(skb)->seq = ntohl(hdr->seq); + TCP_SKB_CB(skb)->flags = 0; + skb_ulp_mode(skb) = 0; /* for iSCSI */ +#endif +#if VALIDATE_SEQ + if (__predict_false(TCP_SKB_CB(skb)->seq != tp->rcv_nxt)) { + printk(KERN_ERR + "%s: TID %u: Bad sequence number %u, expected %u\n", + TOE_DEV(sk)->name, TID(tp), TCP_SKB_CB(skb)->seq, + tp->rcv_nxt); + __kfree_skb(skb); + return; + } +#endif + m_adj(m, sizeof(*hdr)); + +#ifdef notyet + /* + * We don't handle urgent data yet + */ + if (__predict_false(hdr->urg)) + handle_urg_ptr(sk, tp->rcv_nxt + ntohs(hdr->urg)); + if (__predict_false(tp->urg_data == TCP_URG_NOTYET && + tp->urg_seq - tp->rcv_nxt < skb->len)) + tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq - + tp->rcv_nxt]; +#endif + if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) { + toep->tp_delack_mode = hdr->dack_mode; + toep->tp_delack_seq = tp->rcv_nxt; + } + + printf("appending mbuf=%p pktlen=%d m_len=%d len=%d\n", m, m->m_pkthdr.len, m->m_len, len); + + if (len < m->m_pkthdr.len) + m->m_pkthdr.len = m->m_len = len; + + tp->rcv_nxt += m->m_pkthdr.len; + tp->t_rcvtime = ticks; + +#ifdef T3_TRACE + T3_TRACE2(TIDTB(sk), + "new_rx_data: seq 0x%x len %u", + TCP_SKB_CB(skb)->seq, skb->len); +#endif + + sbappend(&so->so_rcv, m); + if (__predict_true((so->so_state & SS_NOFDREF) == 0)) + sorwakeup(so); +} + /* + * Handler for RX_DATA CPL messages. + */ +static int +do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct socket *so = (struct socket *)ctx; + + VALIDATE_SOCK(so); + + new_rx_data(so, m); + + return (0); +} + +/* + * Move a socket to TIME_WAIT state. 
We need to make some adjustments to the + * socket state before calling tcp_time_wait to comply with its expectations. + */ +static void +enter_timewait(struct socket *so) +{ + struct tcpcb *tp = sototcpcb(so); + + /* + * Bump rcv_nxt for the peer FIN. We don't do this at the time we + * process peer_close because we don't want to carry the peer FIN in + * the socket's receive queue and if we increment rcv_nxt without + * having the FIN in the receive queue we'll confuse facilities such + * as SIOCINQ. + */ + tp->rcv_nxt++; + + tp->ts_recent_age = 0; /* defeat recycling */ + tp->t_srtt = 0; /* defeat tcp_update_metrics */ + tcp_twstart(tp); +} + + +/* + * Handle a peer FIN. + */ +static void +do_peer_fin(struct socket *so, struct mbuf *m) +{ + struct tcpcb *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + int keep = 0, dead = (so->so_state & SS_NOFDREF); + +#ifdef T3_TRACE + T3_TRACE0(TIDTB(sk),"do_peer_fin:"); +#endif +#ifdef notyet + if (!is_t3a(TOE_DEV(sk)) && sock_flag(sk, ABORT_RPL_PENDING)) + goto out; + + if (ULP_MODE(tp) == ULP_MODE_TCPDDP) { + keep = handle_peer_close_data(sk, skb); + if (keep < 0) + return; + } + sk->sk_shutdown |= RCV_SHUTDOWN; + sock_set_flag(sk, SOCK_DONE); +#endif + switch (tp->t_state) { + case TCPS_SYN_RECEIVED: + case TCPS_ESTABLISHED: + tp->t_state = TCPS_CLOSE_WAIT; + break; + case TCPS_FIN_WAIT_1: + tp->t_state = TCPS_CLOSING; + break; + case TCPS_FIN_WAIT_2: + /* + * If we've sent an abort_req we must have sent it too late, + * HW will send us a reply telling us so, and this peer_close + * is really the last message for this connection and needs to + * be treated as an abort_rpl, i.e., transition the connection + * to TCP_CLOSE (note that the host stack does this at the + * time of generating the RST but we must wait for HW). + * Otherwise we enter TIME_WAIT. 
+ */ + t3_release_offload_resources(so); + if (toep->tp_flags & TP_ABORT_RPL_PENDING) + tcp_close(tp); + else + enter_timewait(so); + break; + default: + log(LOG_ERR, + "%s: TID %u received PEER_CLOSE in bad state %d\n", + TOE_DEV(so)->name, toep->tp_tid, tp->t_state); + } + + if (!dead) { +#ifdef notyet + sk->sk_state_change(sk); + + /* Do not send POLL_HUP for half duplex close. */ + if ((sk->sk_shutdown & SEND_SHUTDOWN) || + sk->sk_state == TCP_CLOSE) + sk_wake_async(sk, 1, POLL_HUP); + else + sk_wake_async(sk, 1, POLL_IN); +#endif + } +#ifdef notyet +out: +#endif + if (!keep) + m_free(m); +} + +/* + * Handler for PEER_CLOSE CPL messages. + */ +static int +do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct socket *so = (struct socket *)ctx; + + VALIDATE_SOCK(so); + + do_peer_fin(so, m); + return (0); +} + +static void +process_close_con_rpl(struct socket *so, struct mbuf *m) +{ + struct tcpcb *tp = sototcpcb(so); + struct cpl_close_con_rpl *rpl = cplhdr(m); + struct toepcb *toep = tp->t_toe; + + tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */ + +#if 0 + if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) + goto out; +#endif + + switch (tp->t_state) { + case TCPS_CLOSING: /* see FIN_WAIT2 case in do_peer_fin */ + t3_release_offload_resources(so); + if (toep->tp_flags & TP_ABORT_RPL_PENDING) + tcp_close(tp); + else + enter_timewait(so); + break; + case TCPS_LAST_ACK: + /* + * In this state we don't care about pending abort_rpl. + * If we've sent abort_req it was post-close and was sent too + * late, this close_con_rpl is the actual last message. 
+ */ + t3_release_offload_resources(so); + tcp_close(tp); + break; + case TCPS_FIN_WAIT_1: +#ifdef notyet + dst_confirm(sk->sk_dst_cache); +#endif + if ((so->so_state & SS_NOFDREF) == 0) { + /* + * Wake up lingering close + */ + sowwakeup(so); + sorwakeup(so); + + } else + printf("FIN_WAIT1 shutdown handling incomplete\n"); + +#if 0 + else if (tcp_sk(sk)->linger2 < 0 && + !sock_flag(sk, ABORT_SHUTDOWN)) + abort_conn(sk, skb, LINUX_MIB_TCPABORTONLINGER); +#endif + break; + default: + log(LOG_ERR, + "%s: TID %u received CLOSE_CON_RPL in bad state %d\n", + TOE_DEV(so)->name, toep->tp_tid, + tp->t_state); + } +#if 0 +out: +#endif + m_free(m); +} + +/* + * Handler for CLOSE_CON_RPL CPL messages. + */ +static int do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m, + void *ctx) +{ + struct socket *so = (struct socket *)ctx; + + VALIDATE_SOCK(so); + + process_close_con_rpl(so, m); + return (0); +} + +/* * Called when a connection is established to translate the TCP options * reported by HW to Linux's native format. */ @@ -750,17 +1253,19 @@ * * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1. */ -static void make_established(struct socket *so, u32 snd_isn, unsigned int opt) +static void +make_established(struct socket *so, u32 snd_isn, unsigned int opt) { struct tcpcb *tp = sototcpcb(so); - - tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn; + struct toepcb *toep = tp->t_toe; + + toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn; + assign_rxopt(so, opt); #if 0 inet_sk(sk)->id = tp->write_seq ^ jiffies; #endif - assign_rxopt(so, opt); + -#ifdef notyet /* * XXX not clear what rcv_wup maps to */ @@ -768,9 +1273,10 @@ * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't * pass through opt0. 
*/ - if (tp->rcv_wnd > (M_RCV_BUFSIZ << 102)) - tp->rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10); + if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10)) + toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10); +#ifdef notyet /* * no clean interface for marking ARP up to date */ @@ -823,9 +1329,8 @@ tp->ts_recent_age = ticks; tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn; -#if 0 - DELACK_SEQ(tp) = tp->copied_seq = tp->rcv_wup = tp->rcv_nxt = rcv_isn; -#endif + toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs; + make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt)); /* @@ -894,7 +1399,7 @@ */ toep->tp_tid = tid; so_insert_tid(d, so, tid); - cxgb_free_atid(cdev, atid); + free_atid(cdev, atid); toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data)); socket_act_establish(so, m); @@ -1037,7 +1542,7 @@ #ifdef notyet tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL); if (!tcphdr_skb) { - printk(KERN_ERR + log(LOG_ERR, "Chelsio TCP offload: can't allocate sk_buff\n"); return -1; } @@ -1049,17 +1554,17 @@ t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish); t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl); t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack); + t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data); + t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); + t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); #ifdef notyet t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish); t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req); t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify); - t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data); t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp); t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete); - t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl); - 
t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt); t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl); #endif
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200711040017.lA40HgRp005496>