Date: Sun, 26 Jul 2009 14:28:13 GMT From: Andre Oppermann <andre@FreeBSD.org> To: Perforce Change Reviews <perforce@freebsd.org> Subject: PERFORCE change 166586 for review Message-ID: <200907261428.n6QESDAv099504@repoman.freebsd.org>
index | next in thread | raw e-mail
http://perforce.freebsd.org/chv.cgi?CH=166586 Change 166586 by andre@andre_t61 on 2009/07/26 14:27:39 First round of comment and annotation improvements. Move tcp_reass_enable check further up. Enhance D-SACK full duplicate test. On new block insert start reassembly timer. Some style improvements. Update TODO. Affected files ... .. //depot/projects/tcp_reass/netinet/tcp_reass.c#53 edit Differences ... ==== //depot/projects/tcp_reass/netinet/tcp_reass.c#53 (text+ko) ==== @@ -39,31 +39,65 @@ * It is the purpose of tcp reassembly to store segments that are received * out of order. This happens when packets are lost along the way due to * various reasons. The most common one is traffic overload which causes - * routers to stop accepting packets for brief moments. + * routers to drop packets for brief moments. + * + * Upon arrival of the missing segment the whole chain of stored contiguous + * segments is moved into the socket buffer at once. In case of multiple + * missing segments the remainder is kept in the queue until the next missing + * segment arrives. + * + * When the TCP receiver is in reassembly state *all* arriving segments are + * passed through the reassembly queue until all missing segments have been + * received and all data is dequeued. + * + * Implementation details and choices: + * + * Instead of storing all segments on their own and tracking each + * with a queue structure we build blocks of contiguous segments merged + * together. This way one missing segment needs only one queue structure + * tracking the whole block of following contiguous segments. If a new + * segment matches the end of one block and the start of the next block + * the two are joined together. If no match is found a new block is created. + * + * Instead of a classical list or tailqueue a red-black tree is used to + * cope with arbitrary complexity of the blocks and missing segments. A + * queue has O(n) worst case behavior whereas the red-black tree is + * O(log n). 
This prevents complexity attacks where a long chain of + * blocks would have to be traversed to find the right place for the new + * segment. * - * Upon arrival of the missing segment(s) the whole chain of stored segments - * is moved into the socket buffer. In case of multiple missing segments - * the first consequtive part is moved with the remainder being kept in - * store until the next missing segment arrives. + * For the segment merging into a block queue structure the operator can + * choose between time and space efficiency. For time efficiency only the + * head or tail mbuf chain pointers are updated with the new segment's mbuf. + * This results in a minimal amount of data accesses and no memory allocations + * unless a new block is created. For space efficiency mbufs and mbuf chains + * are compacted and possibly merged together. It may even allocate a new + * and larger mbuf(-chain) to store the segment data with less overhead and + * less wasted space. This makes it immune against mbuf exhaustion attacks + * where only tiny amounts of data are received per segment, possibly only + * one byte. Usually network drivers allocate only mbuf clusters of 2KBytes + * on receive no matter how large the packet actually is for efficiency + * reasons and because they can't easily know at DMA time how large the packet + * actually is. * - * While in reassembly mode *all* arrving segments are put into the reassembly - * queue. * - * Instead of storing all segments on their own we build blocks of consequtive - * segments chained together. We use a red-black tree to cope with arbitrary - * complexity. If a segment matches the end of one block and the start of the - * next block the two blocks are joined together. If no match is found a - * new block is created. + * The reassembly queue block structure is also used to track SACK + * information as the data receiver. 
A double-linked list is added + * that tracks the blocks in LIFO order of their arrival or updating. - * The reassembly queues block structure is also used to track SACK - * information as a data receiver. A double-linked list is added - * that links the blocks the reverse order of their arrival or updating. - * This makes us fully compliant to RFC2018 Section 4 including all - * optional parts marked as "SHOULD". + * Implemented / relevant RFCs: + * RFC793: Transmission Control Protocol + * RFC1123: + * RFC2018: This makes us fully compliant to RFC2018 Section 4 including all optional parts marked as "SHOULD". + * RFC2883: * * TODO: * - Improve comments and annotate RFC references. * - Style improvements. + * - Activate timeout on first insert. + * - Partial D-SACK support. + * - Return flags should be same minus FIN. * - Lots of testing. */ @@ -132,7 +166,9 @@ static struct tcp_reass_block * tcp_reass_merge(struct tcp_reass_block *, struct tcp_reass_block *); -/* Trim empty mbufs from head of chain. */ +/* + * Trim empty mbufs from the head of an mbuf chain. + */ static struct mbuf * m_trimhead(struct mbuf *m) { @@ -165,6 +201,7 @@ static __inline int tcp_reass_cmp(struct tcp_reass_block *a, struct tcp_reass_block *b) { + if (SEQ_LT(a->trb_seqe, b->trb_seqs)) return (-1); else if (SEQ_GT(a->trb_seqs, b->trb_seqe)) @@ -176,6 +213,9 @@ RB_PROTOTYPE_STATIC(tcp_ra, tcp_reass_block, trb_rb, tcp_reass_cmp); RB_GENERATE_STATIC(tcp_ra, tcp_reass_block, trb_rb, tcp_reass_cmp); +/* + * Verify the integrity of the reassembly queue. + */ #ifdef INVARIANTS static int tcp_reass_verify(struct tcpcb *tp, int prepost) @@ -214,8 +254,12 @@ return (1); } -#endif +#endif /* INVARIANTS */ +/* + * Remove a single block from the reassembly queue + * and free all its mbufs, if any. + */ static void tcp_reass_free(struct tcpcb *tp, struct tcp_reass_block *trb) { @@ -229,6 +273,9 @@ uma_zfree(tcp_reass_zone, trb); } +/* + * Remove and free all blocks from the reassembly queue. 
+ */ void tcp_reass_flush(struct tcpcb *tp) { @@ -246,6 +293,10 @@ ("%s: rcv_reass_size not zero after flushing", __func__)); } +/* + * Move block to front of SACK list to report SACK blocks in LIFO order. + * RFC2018: section x + */ static __inline void tcp_reass_sacktrack(struct tcpcb *tp, struct tcp_reass_block *trb) { @@ -257,7 +308,8 @@ } /* - * Insert segments into the reassembly queue. + * Insert segment into the reassembly queue and + * XXX append to socket buffer. * * NB: We must always consume the mbuf. Either by appeding it to * the queue or by freeing it. */ @@ -274,6 +326,12 @@ INP_WLOCK_ASSERT(tp->t_inpcb); /* + * Check if it is really necessary to do all the work. + */ + if (!tcp_reass_enable && RB_EMPTY(&tp->rcv_reass)) + goto done; + + /* * Call with th==NULL after becoming established to * force pre-ESTABLISHED data up to user socket. * XXX: Was used for T/TCP of which code remains. */ @@ -303,10 +361,6 @@ th_seq = th->th_seq; th = NULL; /* Prevent further use. */ - /* Check if it is really neccessary to do all the work. */ - if (!tcp_reass_enable && RB_EMPTY(&tp->rcv_reass)) - goto done; - KASSERT(SEQ_GEQ(th_seq, tp->rcv_nxt), ("%s: sequence number below rcv_nxt", __func__)); KASSERT(!(tp->rcv_nxt == th_seq) || !(RB_EMPTY(&tp->rcv_reass)), @@ -374,9 +428,9 @@ } /* - * Get rid of packet header and mtags. + * Remove and free packet header and mtags. * Trim empty mbufs from head of chain. - * Compact mbuf chain. + * Compact the mbuf chain. */ m_demote(m, 1); m_adj(m, hlen); @@ -393,50 +447,68 @@ trbs.trb_mt = m_last(m); /* - * Return match that has at least partial overlap to either side or - * insert a new reassembly block. + * Find a block that has at least partial overlap to either side. + * If no block is found either insert a new one or use the stack + * if the segment directly fits rcv_nxt. + * RFC793: xxx + * RFC2018: section 3 */ if ((trb = RB_FIND(tcp_ra, &tp->rcv_reass, &trbs)) != NULL) { - - /* Within an already received block. 
*/ + /* + * The new segment is a retransmit and within an already + * received block and thus a full duplicate. + * + * Report it with D-SACK only if it is a subset of and + * not equal to the already existing block. + * RFC2883: section 4, part (1) and (4) + * RFC2883: section 4.1, Reporting Full Duplicate Segments + */ if (SEQ_GEQ(trbs.trb_seqs, trb->trb_seqs) && SEQ_LEQ(trbs.trb_seqe, trb->trb_seqe)) { tcp_reass_sacktrack(tp, trb); - tp->rcv_reass_dsack.start = trbs.trb_seqs; - tp->rcv_reass_dsack.end = trbs.trb_seqe; + if (SEQ_GT(trbs.trb_seqs, trb->trb_seqs) || + SEQ_LT(trbs.trb_seqe, trb->trb_seqe)) { + tp->rcv_reass_dsack.start = trbs.trb_seqs; + tp->rcv_reass_dsack.end = trbs.trb_seqe; + } TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, len); goto done; } + /* + * Merge the new segment with the already existing block. + */ tp->rcv_reass_size += SEQ_DELTA(trbs.trb_seqs, trbs.trb_seqe); - - /* Merge in the new segment. */ (void)tcp_reass_merge(trb, &trbs); tcp_reass_sacktrack(tp, trb); + /* + * Update XXX + * RFC2883: section 4.2, Reporting Partial Duplicate Segments + * XXXAO: Add D-SACK block. + */ if ((len = SEQ_DELTA(trbs.trb_seqs, trbs.trb_seqe)) > 0) { tp->rcv_reass_size -= len; TCPSTAT_INC(tcps_rcvpartduppack); TCPSTAT_ADD(tcps_rcvpartdupbyte, len); } - /* Merge in previous block(s) if there is overlap. */ + /* + * Merge in previous/next block(s) if there is overlap. + */ while ((trbn = RB_PREV(tcp_ra, &tp->rcv_reass, trb)) != NULL && SEQ_LEQ(trb->trb_seqs, trbn->trb_seqe)) { trbn = tcp_reass_merge(trb, trbn); tcp_reass_free(tp, trbn); TCPSTAT_INC(tcps_reass_merge); } - - /* Merge in next block(s) if there is overlap. 
*/ while ((trbn = RB_NEXT(tcp_ra, &tp->rcv_reass, trb)) != NULL && SEQ_GEQ(trb->trb_seqe, trbn->trb_seqs)) { trbn = tcp_reass_merge(trb, trbn); tcp_reass_free(tp, trbn); TCPSTAT_INC(tcps_reass_merge); } - } else if (tp->rcv_nxt == th_seq) { trb = &trbs; } else if ((trb = (struct tcp_reass_block *)uma_zalloc(tcp_reass_zone, (M_NOWAIT|M_ZERO))) != NULL) { @@ -449,6 +521,8 @@ LIST_INSERT_HEAD(&tp->rcv_reass_sack, trb, trb_sack); tp->rcv_reass_size += SEQ_DELTA(trbs.trb_seqs, trbs.trb_seqe); tp->rcv_reass_blocks++; + if (RB_EMPTY(&tp->rcv_reass)) + tcp_timer_activate(tp, TT_REASS, tcp_reass_timeout); TCPSTAT_INC(tcps_reass_blocks); } else { /* Memory allocation failure. */ @@ -458,8 +532,12 @@ KASSERT(tcp_reass_verify(tp, 1), ("%s: reassembly queue went inconsistent", __func__)); + /* + * Deliver data if we've got the missing segment. + */ if (trb->trb_seqs == tp->rcv_nxt) goto present; + return (0); present: @@ -475,7 +553,10 @@ TCPSTAT_INC(tcps_reass_missingseg); SOCKBUF_LOCK(&so->so_rcv); - /* We can only ever dequeue one block. */ + /* + * We can only ever dequeue one consecutive + * block of data at most. + */ if (!(so->so_rcv.sb_state & SBS_CANTRCVMORE)) { sbappendstream_locked(&so->so_rcv, trb->trb_m); tp->rcv_nxt += SEQ_DELTA(trb->trb_seqs, trb->trb_seqe); @@ -488,13 +569,19 @@ else tcp_reass_free(tp, trb); - /* NB: sorwakeup_locked() does a implicit socket buffer unlock. */ + /* NB: sorwakeup_locked() does an implicit socket buffer unlock. */ sorwakeup_locked(so); /* + * Don't hold on to data in the reassembly queue for too long. + * Kernel memory is limited and if the connection doesn't make + * any progress in filling the holes we don't want to wait + * forever to prevent memory exhaustion attacks. The sender + * can always retransmit again. + * RFC2018: section 8, Data Receiver Reneging + * * Restart the reassembly queue flush timer after advancing - * the sequence space and if queue is not empty. Otherwise - * deactivate it. 
+ * the sequence space and if the queue still isn't empty. */ if (tcp_reass_timeout && !RB_EMPTY(&tp->rcv_reass)) tcp_timer_activate(tp, TT_REASS, tcp_reass_timeout); @@ -510,8 +597,8 @@ } /* - * Trim a block. - * Positive value is from head, negative from tail. + * Trim a SACK block. + * A positive value is from head, negative from tail. */ static void tcp_reass_trim(struct tcp_reass_block *trb, int i) @@ -519,6 +606,10 @@ m_adj(trb->trb_m, i); + /* + * Update the tail pointer or free empty mbufs + * at the head of the chain. + */ if (i < 0) { trb->trb_mt = m_last(trb->trb_m); trb->trb_seqe -= i; @@ -529,7 +620,10 @@ } /* - * Merge one or more consecutive blocks together. + * Merge one or more consecutive blocks together. The number of + * redundant bytes is reported as the difference between trbn-> + * trb_seqs and trb_seqe. + * * NB: trbn is always merged into trb! */ static struct tcp_reass_block * @@ -543,7 +637,9 @@ SEQ_GT(trbn->trb_seqe, trb->trb_seqe), ("%s: blocks not overlapping", __func__)); - /* Replace, prepend and append. */ + /* + * Replace, prepend or append a block. + */ if (SEQ_LEQ(trbn->trb_seqs, trb->trb_seqs) && SEQ_GEQ(trbn->trb_seqe, trb->trb_seqe)) { @@ -596,10 +692,12 @@ } /* - * Put the sequence number of the reassembly queue blocks into - * the SACK options of an outgoing segment. - * RFC2018: section ... - * RFC2883: section ... + * Put the sequence number of the reassembly queue blocks into the + * SACK options of an outgoing segment. If a D-SACK block is available + * insert it in the first position followed by the regular SACK blocks. 
+ * RFC2018: section 3, Sack Option Format + * RFC2018: section 4, Generating Sack Options: Data Receiver Behavior + * RFC2883: section 4, Use of the SACK option for reporting a duplicate segment */ int tcp_reass_sack(struct tcpcb *tp, u_char *optp, int numsacks) @@ -614,7 +712,10 @@ KASSERT(!LIST_EMPTY(&tp->rcv_reass_sack), ("%s: sack list empty", __func__)); - /* Create fake SACK block for D-SACK and prepend it. */ + /* + * Create fake SACK block on the stack for D-SACK and prepend it. + * RFC2883: section 4, part (3) + */ if (tp->rcv_reass_dsack.start != tp->rcv_reass_dsack.end) { bzero(&trbs, sizeof(trbs)); trbs.trb_seqs = htonl(tp->rcv_reass_dsack.start); @@ -625,7 +726,7 @@ /* * The most recent block must appear first. Add the other * blocks in most recent created or updated order. - * RFC2018: section 4 + * RFC2018: section 3 and 4, part (4) and (5) */ LIST_FOREACH(trb, &tp->rcv_reass_sack, trb_sack) { if (numsacks < 1) @@ -640,7 +741,11 @@ nsacks++; } - /* Remove fake D-SACK block again. */ + /* + * Remove fake D-SACK block again and zero out the D-SACK + * information. It must be reported only once. + * RFC2883: section 4, part (2) + */ if (LIST_FIRST(&tp->rcv_reass_sack) == &trbs) { LIST_REMOVE(&trbs, trb_sack); tp->rcv_reass_dsack.start = 0;home | help
Want to link to this message? Use this
URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200907261428.n6QESDAv099504>
