From owner-svn-src-head@FreeBSD.ORG Tue Apr 14 23:05:36 2009 Return-Path: Delivered-To: svn-src-head@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id AAF31106564A; Tue, 14 Apr 2009 23:05:36 +0000 (UTC) (envelope-from kmacy@FreeBSD.org) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:4f8:fff6::2c]) by mx1.freebsd.org (Postfix) with ESMTP id 9904B8FC08; Tue, 14 Apr 2009 23:05:36 +0000 (UTC) (envelope-from kmacy@FreeBSD.org) Received: from svn.freebsd.org (localhost [127.0.0.1]) by svn.freebsd.org (8.14.3/8.14.3) with ESMTP id n3EN5a5X022660; Tue, 14 Apr 2009 23:05:36 GMT (envelope-from kmacy@svn.freebsd.org) Received: (from kmacy@localhost) by svn.freebsd.org (8.14.3/8.14.3/Submit) id n3EN5a2E022652; Tue, 14 Apr 2009 23:05:36 GMT (envelope-from kmacy@svn.freebsd.org) Message-Id: <200904142305.n3EN5a2E022652@svn.freebsd.org> From: Kip Macy Date: Tue, 14 Apr 2009 23:05:36 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org X-SVN-Group: head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: Subject: svn commit: r191080 - in head: . sbin/route sys/net sys/sys X-BeenThere: svn-src-head@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: SVN commit messages for the src tree for head/-current List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 14 Apr 2009 23:05:37 -0000 Author: kmacy Date: Tue Apr 14 23:05:36 2009 New Revision: 191080 URL: http://svn.freebsd.org/changeset/base/191080 Log: Extend route command: - add show as alias for get - add weights to allow mpath to do more than equal cost - add sticky / nostick to disable / re-enable per-connection load balancing This adds a field to rt_metrics_lite so network bits of world will need to be re-built. Reviewed by: jeli & qingli Modified: head/UPDATING head/sbin/route/keywords head/sbin/route/route.c head/sys/net/radix_mpath.c head/sys/net/route.c head/sys/net/route.h head/sys/net/rtsock.c head/sys/sys/param.h Modified: head/UPDATING ============================================================================== --- head/UPDATING Tue Apr 14 22:53:22 2009 (r191079) +++ head/UPDATING Tue Apr 14 23:05:36 2009 (r191080) @@ -22,6 +22,13 @@ NOTE TO PEOPLE WHO THINK THAT FreeBSD 8. to maximize performance. (To disable malloc debugging, run ln -s aj /etc/malloc.conf.) +20090414: + The size of rt_metrics_lite and by extension rtentry has changed. + Networking administration apps will need to be recompiled. + The route command now supports show as an alias for get, weighting + of routes, sticky and nostick flags to alter the behavior of stateful + load balancing. + Bump __FreeBSD_version to 800078. 20090408: Do not use Giant for kbdmux(4) locking. This is wrong and apparently causing more problems than it solves. This will Modified: head/sbin/route/keywords ============================================================================== --- head/sbin/route/keywords Tue Apr 14 22:53:22 2009 (r191079) +++ head/sbin/route/keywords Tue Apr 14 23:05:36 2009 (r191080) @@ -33,6 +33,7 @@ mtu net netmask nostatic +nostick osi prefixlen proto1 @@ -44,8 +45,11 @@ rtt rttvar sa sendpipe +show ssthresh static +sticky +weight x25 xns xresolve Modified: head/sbin/route/route.c ============================================================================== --- head/sbin/route/route.c Tue Apr 14 22:53:22 2009 (r191079) +++ head/sbin/route/route.c Tue Apr 14 23:05:36 2009 (r191080) @@ -169,6 +169,7 @@ main(argc, argv) if (*argv) switch (keyword(*argv)) { case K_GET: + case K_SHOW: uid = 0; /* FALLTHROUGH */ @@ -548,6 +549,7 @@ set_metric(value, key) caseof(K_SSTHRESH, RTV_SSTHRESH, rmx_ssthresh); caseof(K_RTT, RTV_RTT, rmx_rtt); caseof(K_RTTVAR, RTV_RTTVAR, rmx_rttvar); + caseof(K_WEIGHT, RTV_WEIGHT, rmx_weight); } rtm_inits |= flag; if (lockrest || locking) @@ -571,8 +573,9 @@ newroute(argc, argv) errx(EX_NOPERM, "must be root to alter routing table"); } cmd = argv[0]; - if (*cmd != 'g') + if (*cmd != 'g' && *cmd != 's') shutdown(s, SHUT_RD); /* Don't want to read back our messages */ + while (--argc > 0) { if (**(++argv)== '-') { switch (key = keyword(1 + *argv)) { @@ -635,6 +638,12 @@ newroute(argc, argv) case K_STATIC: flags |= RTF_STATIC; break; + case K_STICKY: + flags |= RTF_STICKY; + break; + case K_NOSTICK: + flags &= ~RTF_STICKY; + break; case K_IFA: if (!--argc) usage((char *)NULL); @@ -688,6 +697,7 @@ newroute(argc, argv) case K_SSTHRESH: case K_RTT: case K_RTTVAR: + case K_WEIGHT: if (!--argc) usage((char *)NULL); set_metric(*++argv, key); @@ -741,7 +751,7 @@ newroute(argc, argv) } else break; } - if (*cmd == 'g') + if (*cmd == 'g' || *cmd == 's') exit(ret != 0); if (!qflag) { oerrno = errno; @@ -1193,7 +1203,7 @@ rtmsg(cmd, flags) cmd = RTM_ADD; else if (cmd == 'c') cmd = RTM_CHANGE; - else if (cmd == 'g') { + else if (cmd == 'g' || cmd == 's') { cmd = RTM_GET; if (so_ifp.sa.sa_family == 0) { so_ifp.sa.sa_family = AF_LINK; @@ -1297,13 +1307,13 @@ char *msgtypes[] = { }; char metricnames[] = -"\011pksent\010rttvar\7rtt\6ssthresh\5sendpipe\4recvpipe\3expire\2hopcount" +"\011weight\010rttvar\7rtt\6ssthresh\5sendpipe\4recvpipe\3expire" "\1mtu"; char routeflags[] = -"\1UP\2GATEWAY\3HOST\4REJECT\5DYNAMIC\6MODIFIED\7DONE\010MASK_PRESENT" -"\011CLONING\012XRESOLVE\013LLINFO\014STATIC\015BLACKHOLE\016b016" -"\017PROTO2\020PROTO1\021PRCLONING\022WASCLONED\023PROTO3\024CHAINDELETE" -"\025PINNED\026LOCAL\027BROADCAST\030MULTICAST"; +"\1UP\2GATEWAY\3HOST\4REJECT\5DYNAMIC\6MODIFIED\7DONE" +"\012XRESOLVE\013LLINFO\014STATIC\015BLACKHOLE" +"\017PROTO2\020PROTO1\021PRCLONING\022WASCLONED\023PROTO3" +"\025PINNED\026LOCAL\027BROADCAST\030MULTICAST\035STICKY"; char ifnetflags[] = "\1UP\2BROADCAST\3DEBUG\4LOOPBACK\5PTP\6b6\7RUNNING\010NOARP" "\011PPROMISC\012ALLMULTI\013OACTIVE\014SIMPLEX\015LINK0\016LINK1" @@ -1466,14 +1476,13 @@ print_getmsg(rtm, msglen) #define msec(u) (((u) + 500) / 1000) /* usec to msec */ (void) printf("\n%s\n", "\ - recvpipe sendpipe ssthresh rtt,msec rttvar hopcount mtu expire"); + recvpipe sendpipe ssthresh rtt,msec mtu weight expire"); printf("%8ld%c ", rtm->rtm_rmx.rmx_recvpipe, lock(RPIPE)); printf("%8ld%c ", rtm->rtm_rmx.rmx_sendpipe, lock(SPIPE)); printf("%8ld%c ", rtm->rtm_rmx.rmx_ssthresh, lock(SSTHRESH)); printf("%8ld%c ", msec(rtm->rtm_rmx.rmx_rtt), lock(RTT)); - printf("%8ld%c ", msec(rtm->rtm_rmx.rmx_rttvar), lock(RTTVAR)); - printf("%8ld%c ", rtm->rtm_rmx.rmx_hopcount, lock(HOPCOUNT)); printf("%8ld%c ", rtm->rtm_rmx.rmx_mtu, lock(MTU)); + printf("%8ld%c ", rtm->rtm_rmx.rmx_weight, lock(WEIGHT)); if (rtm->rtm_rmx.rmx_expire) rtm->rtm_rmx.rmx_expire -= time(0); printf("%8ld%c\n", rtm->rtm_rmx.rmx_expire, lock(EXPIRE)); Modified: head/sys/net/radix_mpath.c ============================================================================== --- head/sys/net/radix_mpath.c Tue Apr 14 22:53:22 2009 (r191079) +++ head/sys/net/radix_mpath.c Tue Apr 14 23:05:36 2009 (r191080) @@ -77,15 +77,18 @@ rn_mpath_next(struct radix_node *rn) return NULL; } -u_int32_t +uint32_t rn_mpath_count(struct radix_node *rn) { - u_int32_t i; - - i = 1; - while ((rn = rn_mpath_next(rn)) != NULL) - i++; - return i; + uint32_t i = 0; + struct rtentry *rt; + + while (rn != NULL) { + rt = (struct rtentry *)rn; + i += rt->rt_rmx.rmx_weight; + rn = rn_mpath_next(rn); + } + return (i); } struct rtentry * @@ -256,10 +259,12 @@ different: } void -rtalloc_mpath_fib(struct route *ro, u_int32_t hash, u_int fibnum) +rtalloc_mpath_fib(struct route *ro, uint32_t hash, u_int fibnum) { struct radix_node *rn0, *rn; u_int32_t n; + struct rtentry *rt; + int64_t weight; /* * XXX we don't attempt to lookup cached route again; what should @@ -284,25 +289,31 @@ rtalloc_mpath_fib(struct route *ro, u_in /* gw selection by Modulo-N Hash (RFC2991) XXX need improvement? */ hash += hashjitter; hash %= n; - while (hash-- > 0 && rn) { + for (weight = abs((int32_t)hash), rt = ro->ro_rt; + weight >= rt->rt_rmx.rmx_weight && rn; + weight -= rt->rt_rmx.rmx_weight) { + /* stay within the multipath routes */ if (rn->rn_dupedkey && rn->rn_mask != rn->rn_dupedkey->rn_mask) break; rn = rn->rn_dupedkey; + rt = (struct rtentry *)rn; } - /* XXX try filling rt_gwroute and avoid unreachable gw */ - /* if gw selection fails, use the first match (default) */ + /* gw selection has failed - there must be only zero weight routes */ if (!rn) { RT_UNLOCK(ro->ro_rt); + ro->ro_rt = NULL; return; } - - RTFREE_LOCKED(ro->ro_rt); - ro->ro_rt = (struct rtentry *)rn; - RT_LOCK(ro->ro_rt); - RT_ADDREF(ro->ro_rt); + if (ro->ro_rt != rt) { + RTFREE_LOCKED(ro->ro_rt); + ro->ro_rt = (struct rtentry *)rn; + RT_LOCK(ro->ro_rt); + RT_ADDREF(ro->ro_rt); + + } RT_UNLOCK(ro->ro_rt); } Modified: head/sys/net/route.c ============================================================================== --- head/sys/net/route.c Tue Apr 14 22:53:22 2009 (r191079) +++ head/sys/net/route.c Tue Apr 14 23:05:36 2009 (r191080) @@ -826,6 +826,103 @@ bad: return (error); } +#ifdef RADIX_MPATH +static int +rn_mpath_update(int req, struct rt_addrinfo *info, + struct radix_node_head *rnh, struct rtentry **ret_nrt) +{ + /* + * if we got multipath routes, we require users to specify + * a matching RTAX_GATEWAY. + */ + struct rtentry *rt, *rto = NULL; + register struct radix_node *rn; + int error = 0; + + rn = rnh->rnh_matchaddr(dst, rnh); + if (rn == NULL) + return (ESRCH); + rto = rt = RNTORT(rn); + rt = rt_mpath_matchgate(rt, gateway); + if (rt == NULL) + return (ESRCH); + /* + * this is the first entry in the chain + */ + if (rto == rt) { + rn = rn_mpath_next((struct radix_node *)rt); + /* + * there is another entry, now it's active + */ + if (rn) { + rto = RNTORT(rn); + RT_LOCK(rto); + rto->rt_flags |= RTF_UP; + RT_UNLOCK(rto); + } else if (rt->rt_flags & RTF_GATEWAY) { + /* + * For gateway routes, we need to + * make sure that we we are deleting + * the correct gateway. + * rt_mpath_matchgate() does not + * check the case when there is only + * one route in the chain. + */ + if (gateway && + (rt->rt_gateway->sa_len != gateway->sa_len || + memcmp(rt->rt_gateway, gateway, gateway->sa_len))) + error = ESRCH; + goto done; + } + /* + * use the normal delete code to remove + * the first entry + */ + if (req != RTM_DELETE) + goto nondelete; + + error = ENOENT; + goto done; + } + + /* + * if the entry is 2nd and on up + */ + if ((req == RTM_DELETE) && !rt_mpath_deldup(rto, rt)) + panic ("rtrequest1: rt_mpath_deldup"); + RT_LOCK(rt); + RT_ADDREF(rt); + if (req == RTM_DELETE) { + rt->rt_flags &= ~RTF_UP; + /* + * One more rtentry floating around that is not + * linked to the routing table. rttrash will be decremented + * when RTFREE(rt) is eventually called. + */ + V_rttrash++; + + } + +nondelete: + if (req != RTM_DELETE) + panic("unrecognized request %d", req); + + + /* + * If the caller wants it, then it can have it, + * but it's up to it to free the rtentry as we won't be + * doing it. + */ + if (ret_nrt) { + *ret_nrt = rt; + RT_UNLOCK(rt); + } else + RTFREE_LOCKED(rt); +done: + return (error); +} +#endif + int rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, u_int fibnum) @@ -864,65 +961,15 @@ rtrequest1_fib(int req, struct rt_addrin switch (req) { case RTM_DELETE: #ifdef RADIX_MPATH - /* - * if we got multipath routes, we require users to specify - * a matching RTAX_GATEWAY. - */ if (rn_mpath_capable(rnh)) { - struct rtentry *rto = NULL; - - rn = rnh->rnh_matchaddr(dst, rnh); - if (rn == NULL) - senderr(ESRCH); - rto = rt = RNTORT(rn); - rt = rt_mpath_matchgate(rt, gateway); - if (!rt) - senderr(ESRCH); - /* - * this is the first entry in the chain - */ - if (rto == rt) { - rn = rn_mpath_next((struct radix_node *)rt); - /* - * there is another entry, now it's active - */ - if (rn) { - rto = RNTORT(rn); - RT_LOCK(rto); - rto->rt_flags |= RTF_UP; - RT_UNLOCK(rto); - } else if (rt->rt_flags & RTF_GATEWAY) { - /* - * For gateway routes, we need to - * make sure that we we are deleting - * the correct gateway. - * rt_mpath_matchgate() does not - * check the case when there is only - * one route in the chain. - */ - if (gateway && - (rt->rt_gateway->sa_len != gateway->sa_len || - memcmp(rt->rt_gateway, gateway, gateway->sa_len))) - senderr(ESRCH); - } - /* - * use the normal delete code to remove - * the first entry - */ - goto normal_rtdel; - } + error = rn_mpath_update(req, info, rnh, ret_nrt); /* - * if the entry is 2nd and on up + * "bad" holds true for the success case + * as well */ - if (!rt_mpath_deldup(rto, rt)) - panic ("rtrequest1: rt_mpath_deldup"); - RT_LOCK(rt); - RT_ADDREF(rt); - rt->rt_flags &= ~RTF_UP; - goto deldone; /* done with the RTM_DELETE command */ + if (error != ENOENT) + goto bad; } - -normal_rtdel: #endif /* * Remove the item from the tree and return it. @@ -944,9 +991,6 @@ normal_rtdel: if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) ifa->ifa_rtrequest(RTM_DELETE, rt, info); -#ifdef RADIX_MPATH -deldone: -#endif /* * One more rtentry floating around that is not * linked to the routing table. rttrash will be decremented @@ -1019,6 +1063,7 @@ deldone: IFAREF(ifa); rt->rt_ifa = ifa; rt->rt_ifp = ifa->ifa_ifp; + rt->rt_rmx.rmx_weight = 1; #ifdef RADIX_MPATH /* do not permit exactly the same dst/mask/gw pair */ Modified: head/sys/net/route.h ============================================================================== --- head/sys/net/route.h Tue Apr 14 22:53:22 2009 (r191079) +++ head/sys/net/route.h Tue Apr 14 23:05:36 2009 (r191080) @@ -58,6 +58,7 @@ struct rt_metrics_lite { u_long rmx_mtu; /* MTU for this path */ u_long rmx_expire; /* lifetime for route, e.g. redirect */ u_long rmx_pksent; /* packets sent using this route */ + u_long rmx_weight; /* absolute weight */ }; struct rt_metrics { @@ -71,7 +72,8 @@ struct rt_metrics { u_long rmx_rtt; /* estimated round trip time */ u_long rmx_rttvar; /* estimated rtt variance */ u_long rmx_pksent; /* packets sent using this route */ - u_long rmx_filler[4]; /* will be used for T/TCP later */ + u_long rmx_weight; /* route weight */ + u_long rmx_filler[3]; /* will be used for T/TCP later */ }; /* @@ -193,13 +195,15 @@ struct ortentry { #define RTF_LOCAL 0x200000 /* route represents a local address */ #define RTF_BROADCAST 0x400000 /* route represents a bcast address */ #define RTF_MULTICAST 0x800000 /* route represents a mcast address */ - /* 0x1000000 and up unassigned */ -#define RTF_RNH_LOCKED 0x40000000 /* radix node head locked by caller */ + /* 0x8000000 and up unassigned */ +#define RTF_STICKY 0x10000000 /* always route dst->src */ + +#define RTF_RNH_LOCKED 0x40000000 /* radix node head is locked */ /* Mask of RTF flags that are allowed to be modified by RTM_CHANGE. */ #define RTF_FMASK \ (RTF_PROTO1 | RTF_PROTO2 | RTF_PROTO3 | RTF_BLACKHOLE | \ - RTF_REJECT | RTF_STATIC) + RTF_REJECT | RTF_STATIC | RTF_STICKY) /* * Routing statistics. @@ -225,12 +229,11 @@ struct rt_msghdr { int rtm_seq; /* for sender to identify action */ int rtm_errno; /* why failed */ int rtm_fmask; /* bitmask used in RTM_CHANGE message */ -#define rtm_use rtm_fmask /* deprecated, use rtm_rmx->rmx_pksent */ u_long rtm_inits; /* which metrics we are initializing */ struct rt_metrics rtm_rmx; /* metrics themselves */ }; -#define RTM_VERSION 5 /* Up the ante and ignore older versions */ +#define RTM_VERSION 6 /* Up the ante and ignore older versions */ /* * Message types. @@ -265,6 +268,7 @@ struct rt_msghdr { #define RTV_SSTHRESH 0x20 /* init or lock _ssthresh */ #define RTV_RTT 0x40 /* init or lock _rtt */ #define RTV_RTTVAR 0x80 /* init or lock _rttvar */ +#define RTV_WEIGHT 0x100 /* init or lock _weight */ /* * Bitmask values for rtm_addrs. Modified: head/sys/net/rtsock.c ============================================================================== --- head/sys/net/rtsock.c Tue Apr 14 22:53:22 2009 (r191079) +++ head/sys/net/rtsock.c Tue Apr 14 23:05:36 2009 (r191080) @@ -637,7 +637,6 @@ route_output(struct mbuf *m, struct sock } (void)rt_msg2(rtm->rtm_type, &info, (caddr_t)rtm, NULL); rtm->rtm_flags = rt->rt_flags; - rtm->rtm_use = 0; rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx); rtm->rtm_addrs = info.rti_addrs; break; @@ -691,10 +690,8 @@ route_output(struct mbuf *m, struct sock rt->rt_ifp = info.rti_ifp; } /* Allow some flags to be toggled on change. */ - if (rtm->rtm_fmask & RTF_FMASK) - rt->rt_flags = (rt->rt_flags & - ~rtm->rtm_fmask) | - (rtm->rtm_flags & rtm->rtm_fmask); + rt->rt_flags = (rt->rt_flags & ~RTF_FMASK) | + (rtm->rtm_flags & RTF_FMASK); rt_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx, &rt->rt_rmx); rtm->rtm_index = rt->rt_ifp->if_index; @@ -773,6 +770,7 @@ rt_setmetrics(u_long which, const struct * of tcp hostcache. The rest is ignored. */ metric(RTV_MTU, rmx_mtu); + metric(RTV_WEIGHT, rmx_weight); /* Userland -> kernel timebase conversion. */ if (which & RTV_EXPIRE) out->rmx_expire = in->rmx_expire ? @@ -786,6 +784,7 @@ rt_getmetrics(const struct rt_metrics_li #define metric(e) out->e = in->e; bzero(out, sizeof(*out)); metric(rmx_mtu); + metric(rmx_weight); /* Kernel -> userland timebase conversion. */ out->rmx_expire = in->rmx_expire ? in->rmx_expire - time_uptime + time_second : 0; @@ -1257,7 +1256,10 @@ sysctl_dumpentry(struct radix_node *rn, struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem; rtm->rtm_flags = rt->rt_flags; - rtm->rtm_use = rt->rt_rmx.rmx_pksent; + /* + * let's be honest about this being a retarded hack + */ + rtm->rtm_fmask = rt->rt_rmx.rmx_pksent; rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx); rtm->rtm_index = rt->rt_ifp->if_index; rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0; Modified: head/sys/sys/param.h ============================================================================== --- head/sys/sys/param.h Tue Apr 14 22:53:22 2009 (r191079) +++ head/sys/sys/param.h Tue Apr 14 23:05:36 2009 (r191080) @@ -57,7 +57,7 @@ * is created, otherwise 1. */ #undef __FreeBSD_version -#define __FreeBSD_version 800077 /* Master, propagated to newvers */ +#define __FreeBSD_version 800078 /* Master, propagated to newvers */ #ifndef LOCORE #include