From owner-freebsd-net@FreeBSD.ORG Wed Nov 12 14:53:30 2003 Return-Path: Delivered-To: freebsd-net@freebsd.org Received: from mx1.FreeBSD.org (mx1.freebsd.org [216.136.204.125]) by hub.freebsd.org (Postfix) with ESMTP id 635A116A4CE; Wed, 12 Nov 2003 14:53:30 -0800 (PST) Received: from freesbee.wheel.dk (freesbee.wheel.dk [193.162.159.97]) by mx1.FreeBSD.org (Postfix) with ESMTP id 4B4CC43FB1; Wed, 12 Nov 2003 14:53:28 -0800 (PST) (envelope-from jesper@skriver.dk) Received: by freesbee.wheel.dk (Postfix, from userid 1001) id 91134384E7; Wed, 12 Nov 2003 23:53:26 +0100 (CET) Date: Wed, 12 Nov 2003 23:53:26 +0100 From: Jesper Skriver To: Andre Oppermann Message-ID: <20031112225326.GI41949@FreeBSD.org> Mail-Followup-To: Jesper Skriver , Andre Oppermann , freebsd-current@freebsd.org, freebsd-net@freebsd.org, sam@errno.com, mb@imp.ch, ume@freebsd.org References: <3FAE68FB.64D262FF@pipeline.ch> Mime-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <3FAE68FB.64D262FF@pipeline.ch> User-Agent: Mutt/1.4.1i X-PGP-Fingerprint: 6B88 9CE8 66E9 E631 C9C5 5EB4 22AB F0EC F956 1C31 X-PGP-Public-Key: http://freesbee.wheel.dk/~jesper/gpgkey.pub cc: freebsd-net@freebsd.org cc: freebsd-current@freebsd.org cc: mb@imp.ch cc: ume@freebsd.org cc: sam@errno.com Subject: Re: tcp hostcache and ip fastforward for review X-BeenThere: freebsd-net@freebsd.org X-Mailman-Version: 2.1.1 Precedence: list List-Id: Networking and TCP/IP with FreeBSD List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 12 Nov 2003 22:53:30 -0000 On Sun, Nov 09, 2003 at 05:19:07PM +0100, Andre Oppermann wrote: > Hello all, > > this patch contains three things (to be separated for committing): > > tcp_hostcache > > - removes protocol cloning from routing table (IPv4+6) > - removes rtentry pointer from inpcb and in6pcb > - removes ip route cache in ip_input.c (locking much easier) > - removes most (tcp specific) metrics from 
rtentry metrics > - adds a hostcache table which carries the metrics for tcp > - works transparently for IPv4 and IPv6 > - is designed for concurrent access in SMP environments > - significant reduction of routing table size (no cloning anymore) > - eases many routing table locking situations in ip/tcp code > > ip_fastforward > > - removes ip_flow forwarding code > - adds full direct process-to-completion IPv4 forwarding code > - handles ip fragmentation incl. hw support (ip_flow did not) > - supports ipfw and ipfilter (ip_flow did not) > - supports divert and ipfw fwd (ip_flow did not) > - drops anything it can't handle back to normal ip_input I have a few comments on this code; see inline and look for #jesper. Apart from that, it looks good. /Jesper > +int > +ip_fastforward(struct mbuf *m) > +{ > + struct ip *ip; > + struct mbuf *m0 = NULL; > +#ifdef IPDIVERT > + struct ip *tip; > + struct mbuf *teem = NULL; > +#endif > + struct mbuf *tag = NULL; > + struct route ro; > + struct sockaddr_in *dst = NULL; > + struct in_ifaddr *ia = NULL; > + struct ifaddr *ifa = NULL; > + struct ifnet *ifp = NULL; > + struct ip_fw_args args; > + in_addr_t odest, dest; > + u_short sum; > + int hlen; > + int error = 0; > + int ipfw; > + > + /* > + * Are we active and forwarding packets? > + */ > + if (!ipfastforward_active || !ipforwarding) > + return 0; > + > + /* > + * If there is any MT_TAG we fall back to ip_input because we can't > + * handle TAGs here. > + */ > + if (m && m->m_type == MT_TAG) > + return 0; > + > + M_ASSERTVALID(m); > + M_ASSERTPKTHDR(m); > + [...] > + > + /* > + * Only unicast IP, not from loopback, no L2 or IP broadcast, > + * no multicast, no INADDR_ANY > + */ > + if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) || > + (ntohl(ip->ip_src.s_addr) == (u_long)INADDR_BROADCAST) || #jesper You will never see packets with a multicast source address.
> + (ntohl(ip->ip_dst.s_addr) == (u_long)INADDR_BROADCAST) || > + (IN_MULTICAST(ntohl(ip->ip_src.s_addr))) || > + (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) || > + (ip->ip_dst.s_addr == INADDR_ANY) ) > + goto fallback; > + > + /* > + * Is it for a local address on this host? > + */ > + LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) { > + if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr) { > + goto fallback; > + } > + } > + > + /* > + * Or is it for a local IP broadcast address on this host? > + */ > + if (m->m_pkthdr.rcvif->if_flags & IFF_BROADCAST) { > + TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) { > + if (ifa->ifa_addr->sa_family != AF_INET) > + continue; > + ia = ifatoia(ifa); > + if (ia->ia_netbroadcast.s_addr == ip->ip_dst.s_addr) > + goto fallback; > + if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == > + ip->ip_dst.s_addr) > + goto fallback; > + continue; > +fallback: > + /* drop the packet back to netisr */ > + ip->ip_len = htons(ip->ip_len); > + ip->ip_off = htons(ip->ip_off); > + return 0; > + } > + } > + ipstat.ips_total++; #jesper If we stored special "for us" /32 routes in the routing table for addresses configured on this host, we could avoid the above 2 loops, which can be quite expensive. These special routes will simply mean that the packet is for us, and needs to be given to ip_input. > + /** > + ** Third: incoming packet firewall processing > + **/ > + > + odest = dest = ip->ip_dst.s_addr; #jesper You could save a few cycles by doing #ifdef PFIL_HOOKS odest = ip->ip_dst.s_addr; /* * Run through list of ipfilter hooks for input packets */ if (pfil_run_hooks(&inet_pfil_hook, &m, m->m_pkthdr.rcvif, PFIL_IN) || m == NULL) return 1; M_ASSERTVALID(m); M_ASSERTPKTHDR(m); ip = mtod(m, struct ip *); /* if m changed during fw processing */ dest = ip->ip_dst.s_addr; #else odest = dest = ip->ip_dst.s_addr; #endif Thus avoiding writing to dest twice.
> +#ifdef PFIL_HOOKS > + /* > + * Run through list of ipfilter hooks for input packets > + */ > + if (pfil_run_hooks(&inet_pfil_hook, &m, m->m_pkthdr.rcvif, PFIL_IN) || > + m == NULL) > + return 1; > + > + M_ASSERTVALID(m); > + M_ASSERTPKTHDR(m); > + > + ip = mtod(m, struct ip *); /* if m changed during fw processing */ > + dest = ip->ip_dst.s_addr; > +#endif > + > + /* > + * Run through ipfw for input packets > + */ > + if (fw_enable && IPFW_LOADED) { > + bzero(&args, sizeof(args)); > + args.m = m; > + ipfw = 0; > + > + ipfw = ip_fw_chk_ptr(&args); > + m = args.m; > + > + M_ASSERTVALID(m); > + M_ASSERTPKTHDR(m); > + > + /* > + * Packet denied, drop it > + */ > + if ( (ipfw & IP_FW_PORT_DENY_FLAG) || m == NULL) { > + if (m) > + m_freem(m); > + return 1; > + } > + /* > + * Send packet to the appropriate pipe > + */ > + if (DUMMYNET_LOADED && (ipfw & IP_FW_PORT_DYNT_FLAG) != 0) { #jesper Whitespace bug, spaces instead of tabs > + ip_dn_io_ptr(m, ipfw & 0xffff, DN_TO_IP_IN, &args); > + return 1; > + } > +#ifdef IPDIVERT > + /* > + * Divert packet > + */ > + if (ipfw != 0 && (ipfw & IP_FW_PORT_DYNT_FLAG) == 0) { > + /* > + * See if this is a fragment > + */ > + if (ip->ip_off & (IP_MF | IP_OFFMASK)) { > + MGETHDR(tag, M_DONTWAIT, MT_TAG); > + if (tag == NULL) { > + m_freem(m); > + return 1; > + } > + tag->m_flags = PACKET_TAG_DIVERT; > + tag->m_data = (caddr_t)(u_long)args.divert_rule; > + tag->m_next = m; > + m = tag; > + tag = NULL; > + > + goto droptoours; #jesper Whitespace bug, spaces instead of tabs > + } > + /* > + * Tee packet > + */ > + if ((ipfw & IP_FW_PORT_TEE_FLAG) != 0) > + teem = m_dup(m, M_DONTWAIT); > + else > + teem = m; > + if (teem == NULL) > + goto passin; > + > + M_ASSERTVALID(teem); > + M_ASSERTPKTHDR(teem); > + > + /* > + * Delayed checksums are not compatible > + */ > + if (teem->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { > + in_delayed_cksum(teem); > + teem->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; > + } > + /* > + * Restore packet header fields 
to original values > + */ > + tip = mtod(teem, struct ip *); > + tip->ip_len = htons(tip->ip_len); > + tip->ip_off = htons(tip->ip_off); > + /* > + * Deliver packet to divert input routine > + */ > + divert_packet(teem, 0, ipfw & 0xffff, args.divert_rule); > + /* > + * If this was not tee, we are done > + */ > + if ((ipfw & IP_FW_PORT_TEE_FLAG) == 0) > + return 1; > + /* Continue if it was tee */ > + goto passin; > + } > +#endif > + if (ipfw == 0 && args.next_hop != NULL) { > + dest = args.next_hop->sin_addr.s_addr; > + goto passin; > + } > + /* > + * Let through or not? > + */ > + if (ipfw != 0) { > + m_freem(m); > + return 1; > + } > + } > +passin: > + ip = mtod(m, struct ip *); /* if m changed during fw processing */ > + > + /* > + * Destination address changed? > + */ > + if (odest != dest) { > + /* > + * Is it now for a local address on this host? > + */ > + LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) { > + if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr) > + goto forwardlocal; > + } #jesper Same comment as above - and do we really want to see if the original destination address was ours if we're doing NAT ? > + /* > + * Go on with new destination address > + */ > + } > + > + M_ASSERTVALID(m); > + M_ASSERTPKTHDR(m); > + > + /** > + ** Forth: decrement TTL and look up route > + **/ > + > + /* > + * Check TTL > + */ > +#ifdef IPSTEALTH > + if (!ipstealth) { > +#endif > + if (ip->ip_ttl <= IPTTLDEC) { > + icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, NULL, NULL); > + return 1; > + } > + > + /* > + * Decrement the TTL and incrementally change the checksum. > + * Don't bother doing this with hw checksum offloading. > + */ > + ip->ip_ttl -= IPTTLDEC; > + if (ip->ip_sum >= (u_int16_t) ~htons(IPTTLDEC << 8)) > + ip->ip_sum -= ~htons(IPTTLDEC << 8); > + else > + ip->ip_sum += htons(IPTTLDEC << 8); > +#ifdef IPSTEALTH > + } > +#endif > + > + /* > + * Find route to destination. 
> + */ > + bzero(&ro, sizeof(ro)); > + dst = (struct sockaddr_in *)&ro.ro_dst; > + dst->sin_family = AF_INET; > + dst->sin_len = sizeof(*dst); > + dst->sin_addr.s_addr = dest; > + rtalloc(&ro); > + > + /* > + * Route there and interface still up? > + */ > + if ((ro.ro_rt) && > + (ro.ro_rt->rt_flags & RTF_UP) && > + (ro.ro_rt->rt_ifp->if_flags & IFF_UP)) { > + ia = ifatoia(ro.ro_rt->rt_ifa); > + ifp = ro.ro_rt->rt_ifp; > + if (ro.ro_rt->rt_flags & RTF_GATEWAY) > + dst = (struct sockaddr_in *)ro.ro_rt->rt_gateway; > + } else { > + ipstat.ips_noroute++; > + ipstat.ips_cantforward++; > + icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, NULL, NULL); > + if (ro.ro_rt) > + RTFREE(ro.ro_rt); > + return 1; > + } > + > + > + /** > + ** Fifth: outgoing firewall packet processing > + **/ > + > +#ifdef PFIL_HOOKS > + /* > + * Run through list of hooks for output packets. > + */ > + if (pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT) || > + m == NULL) { > + RTFREE(ro.ro_rt); > + return 1; > + } > + > + M_ASSERTVALID(m); > + M_ASSERTPKTHDR(m); > + > + ip = mtod(m, struct ip *); > + dest = ip->ip_dst.s_addr; > +#endif > + if (fw_enable && IPFW_LOADED && !args.next_hop) { > + bzero(&args, sizeof(args)); > + args.m = m; > + args.oif = ifp; > + ipfw = 0; > + > + ipfw = ip_fw_chk_ptr(&args); > + m = args.m; > + > + M_ASSERTVALID(m); > + M_ASSERTPKTHDR(m); > + > + if ( (ipfw & IP_FW_PORT_DENY_FLAG) || m == NULL) { > + if (m) > + m_freem(m); > + RTFREE(ro.ro_rt); > + return 1; > + } > + if (DUMMYNET_LOADED && (ipfw & IP_FW_PORT_DYNT_FLAG) != 0) { #jesper Whitespace bug, spaces instead of tabs > + /* > + * XXX note: if the ifp or rt entry are deleted > + * while a pkt is in dummynet, we are in trouble! 
> + */ > + args.ro = &ro; /* dummynet does not save it */ > + args.dst = dst; > + > + ip_dn_io_ptr(m, ipfw & 0xffff, DN_TO_IP_OUT, &args); > + RTFREE(ro.ro_rt); > + return 1; > + } > +#ifdef IPDIVERT > + if (ipfw != 0 && (ipfw & IP_FW_PORT_DYNT_FLAG) == 0) { > + /* > + * See if this is a fragment > + */ > + if (ip->ip_off & (IP_MF | IP_OFFMASK)) { > + MGETHDR(tag, M_DONTWAIT, MT_TAG); > + if (tag == NULL) { > + m_freem(m); > + RTFREE(ro.ro_rt); > + return 1; > + } > + tag->m_flags = PACKET_TAG_DIVERT; > + tag->m_data = (caddr_t)(u_int32_t)args.divert_rule; > + tag->m_next = m; > + m = tag; > + tag = NULL; > + > + goto droptoours; > + } > + /* > + * Tee packet > + */ > + if ((ipfw & IP_FW_PORT_TEE_FLAG) != 0) > + teem = m_dup(m, M_DONTWAIT); > + else > + teem = m; > + if (teem == NULL) > + goto passout; > + > + M_ASSERTVALID(m); > + M_ASSERTPKTHDR(m); > + > + /* > + * delayed checksums are not compatible with divert > + */ > + if (teem->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { > + in_delayed_cksum(teem); > + teem->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; > + } > + /* > + * Restore packet header fields to original values > + */ > + tip = mtod(teem, struct ip *); > + tip->ip_len = htons(tip->ip_len); > + tip->ip_off = htons(tip->ip_off); > + /* > + * Deliver packet to divert input routine > + */ > + divert_packet(teem, 0, ipfw & 0xffff, args.divert_rule); > + /* > + * If this was not tee, we are done > + */ > + if ((ipfw & IP_FW_PORT_TEE_FLAG) == 0) { > + RTFREE(ro.ro_rt); > + return 1; > + } > + /* Continue if it was tee */ > + goto passout; > + } > +#endif > + if (ipfw == 0 && args.next_hop != NULL) { > + dest = args.next_hop->sin_addr.s_addr; > + goto passout; > + } > + /* > + * Let through or not? > + */ > + if (ipfw != 0) { > + m_freem(m); > + return 1; > + } > + } > +passout: > + ip = mtod(m, struct ip *); > + > + /* > + * Destination address changed? > + */ > + if (odest != dest) { > + /* > + * Is it now for a local address on this host? 
> + */ #jesper Again, do we really want to look for packets destined for us after being translated ? > + LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) { > + if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr) { > +forwardlocal: > + if(args.next_hop) { > + MGETHDR(tag, M_DONTWAIT, MT_TAG); > + if (tag == NULL) { > + m_freem(m); > + if(ro.ro_rt) > + RTFREE(ro.ro_rt); > + return 1; > + } > + tag->m_flags = PACKET_TAG_IPFORWARD; > + tag->m_data = (caddr_t)args.next_hop; > + tag->m_next = m; > + m = tag; > + tag = NULL; > + } > +#ifdef IPDIVERT > +droptoours: /* Used for DIVERT */ > +#endif > + MGETHDR(tag, M_DONTWAIT, MT_TAG); > + if (tag == NULL) { > + m_freem(m); > + if(ro.ro_rt) > + RTFREE(ro.ro_rt); > + return 1; > + } > + tag->m_flags = PACKET_TAG_IPFASTFWD_OURS; > + tag->m_data = NULL; > + tag->m_next = m; #jesper Whitespace bug, spaces instead of tabs > + m = tag; > + tag = NULL; > + > +#if 0 > + /* > + * Do some checks, we never know what fw has > + * done to it. > + */ > + if (m->m_pkthdr.rcvif == NULL) > + m->m_pkthdr.rcvif = ifunit("lo0"); > + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { > + m->m_pkthdr.csum_flags |= > + CSUM_DATA_VALID | CSUM_PSEUDO_HDR; > + m->m_pkthdr.csum_data = 0xffff; > + } > + m->m_pkthdr.csum_flags |= > + CSUM_IP_CHECKED | CSUM_IP_VALID; > +#endif > + > + /* ip still points to the real packet */ > + ip->ip_len = htons(ip->ip_len); > + ip->ip_off = htons(ip->ip_off); > + > + M_ASSERTVALID(m); > + > + /* > + * Drop packet to ip_input > + */ > + if (ro.ro_rt) > + RTFREE(ro.ro_rt); > + return 0; > + } > + } > + /* > + * Redo route lookup with new destination address > + */ > + RTFREE(ro.ro_rt); > + bzero(&ro, sizeof(ro)); > + dst = (struct sockaddr_in *)&ro.ro_dst; > + dst->sin_family = AF_INET; > + dst->sin_len = sizeof(*dst); > + dst->sin_addr.s_addr = dest; > + rtalloc(&ro); > + > + /* > + * Route there and interface still up? 
> + */ > + if ((ro.ro_rt) && > + (ro.ro_rt->rt_flags & RTF_UP) && > + (ro.ro_rt->rt_ifp->if_flags & IFF_UP)) { > + ia = ifatoia(ro.ro_rt->rt_ifa); > + ifp = ro.ro_rt->rt_ifp; > + if (ro.ro_rt->rt_flags & RTF_GATEWAY) > + dst = (struct sockaddr_in *)ro.ro_rt->rt_gateway; > + } else { > + ipstat.ips_noroute++; > + ipstat.ips_cantforward++; > + icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, NULL, NULL); > + if (ro.ro_rt) > + RTFREE(ro.ro_rt); > + return 1; > + } > + } > + > + M_ASSERTVALID(m); > + M_ASSERTPKTHDR(m); > + > + /** > + ** Sixth: send off the packet > + **/ > + > + /* > + * Check if packet fits MTU or if hardware will fragement for us > + */ > + if (ip->ip_len <= ifp->if_mtu || (ifp->if_hwassist & CSUM_FRAGMENT && > + ((ip->ip_off & IP_DF) == 0))) { > + /* > + * Restore packet header fields to original values > + */ > + ip->ip_len = htons(ip->ip_len); > + ip->ip_off = htons(ip->ip_off); > + /* > + * Send off the packet via outgoing interface > + */ > + error = (ifp->if_output)(ifp, m, (struct sockaddr *)dst, ro.ro_rt); > + if (ia) { > + ia->ia_ifa.if_opackets++; > + ia->ia_ifa.if_obytes += m->m_pkthdr.len; > + } > + } else { > + /* > + * Handle EMSGSIZE with icmp reply needfrag for TCP MTU discovery > + */ > + if (ip->ip_off & IP_DF) { > + icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, NULL, ifp); > + ipstat.ips_cantfrag++; > + RTFREE(ro.ro_rt); > + return 1; > + } else { > + /* > + * We have to fragement the packet > + */ > + m->m_pkthdr.csum_flags |= CSUM_IP; > + if (ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist, > + (~ifp->if_hwassist & CSUM_DELAY_IP))) { > + m_freem(m); > + RTFREE(ro.ro_rt); > + return 1; > + } > + /* > + * Send off the fragments via outgoing interface > + */ > + for (; m; m = m0) { > + m0 = m->m_nextpkt; > + m->m_nextpkt = 0; > + > + M_ASSERTVALID(m); > + M_ASSERTPKTHDR(m); > + > + if (error == 0) { > + error = (*ifp->if_output)(ifp, m, > + (struct sockaddr *)dst, ro.ro_rt); > + if (ia) { > + ia->ia_ifa.if_opackets++; > + 
ia->ia_ifa.if_obytes += m->m_pkthdr.len; > + } > + } else { > + m_freem(m); > + } > + } > + if (error == 0) > + ipstat.ips_fragmented++; > + } > + } > + > + if (error == ENOBUFS) > + ipstat.ips_odropped++; > + else if (error != 0) > + ipstat.ips_odropped++; > + else { > + ro.ro_rt->rt_rmx.rmx_pksent++; > + ipstat.ips_forward++; > + ipstat.ips_fastforward++; > + } > + RTFREE(ro.ro_rt); > + return 1; > +} > +