Date: Thu, 1 Jan 2009 22:11:44 +0000 (UTC) From: Robert Watson <rwatson@FreeBSD.org> To: src-committers@freebsd.org, svn-src-projects@freebsd.org Subject: svn commit: r186686 - projects/pnet/sys/netinet Message-ID: <200901012211.n01MBist080219@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: rwatson Date: Thu Jan 1 22:11:44 2009 New Revision: 186686 URL: http://svn.freebsd.org/changeset/base/186686 Log: Add IP SUBSET patch to pnet branch: the IP_SUBSET socket option allows identically bound UDP sockets to balance load between them using various strategies, including random assignment, flow-based assignment, CPU-based assignment, and kernel thread ID-based assignment. UDP applications, such as BIND, memcached, etc, can create multiple sockets, each with SO_REUSEPORT set, followed by specifying their index among a set of matching sockets all servicing the same port number. Modified: projects/pnet/sys/netinet/in.h projects/pnet/sys/netinet/in_pcb.c projects/pnet/sys/netinet/in_pcb.h projects/pnet/sys/netinet/in_proto.c projects/pnet/sys/netinet/udp_usrreq.c projects/pnet/sys/netinet/udp_var.h Modified: projects/pnet/sys/netinet/in.h ============================================================================== --- projects/pnet/sys/netinet/in.h Thu Jan 1 20:47:09 2009 (r186685) +++ projects/pnet/sys/netinet/in.h Thu Jan 1 22:11:44 2009 (r186686) @@ -486,6 +486,21 @@ __END_DECLS #define MCAST_BLOCK_SOURCE 84 /* block a source */ #define MCAST_UNBLOCK_SOURCE 85 /* unblock a source */ +/* Binding subsets. 
*/ +#define IP_SUBSET 86 /* get/set binding subset */ + +struct ip_subset { + u_int is_strategy; + u_int is_count; + u_int is_member; +}; + +#define IP_SUBSET_STRATEGY_DISABLED 0 +#define IP_SUBSET_STRATEGY_FLOW 1 +#define IP_SUBSET_STRATEGY_RANDOM 2 +#define IP_SUBSET_STRATEGY_THREADID 3 +#define IP_SUBSET_STRATEGY_CPU 4 + /* * Defaults and limits for options */ Modified: projects/pnet/sys/netinet/in_pcb.c ============================================================================== --- projects/pnet/sys/netinet/in_pcb.c Thu Jan 1 20:47:09 2009 (r186685) +++ projects/pnet/sys/netinet/in_pcb.c Thu Jan 1 22:11:44 2009 (r186686) @@ -204,6 +204,7 @@ in_pcballoc(struct socket *so, struct in inp->inp_socket = so; inp->inp_cred = crhold(so->so_cred); inp->inp_inc.inc_fibnum = so->so_fibnum; + inp->inp_subset_strategy = IP_SUBSET_STRATEGY_DISABLED; #ifdef MAC error = mac_inpcb_init(inp, M_NOWAIT); if (error != 0) @@ -1284,12 +1285,114 @@ in_pcblookup_local(struct inpcbinfo *pcb #undef INP_LOOKUP_MAPPED_PCB_COST /* + * Implement various subsetting strategies: determine whether a particular + * inpcb, implementing a particular strategy, matches the passed tuple or + * not. + */ +static int +in_subset_match(struct inpcb *inp, struct in_addr faddr, u_short fport, + struct in_addr laddr, u_short lport, u_short ip_id, u_int32_t flowid) +{ + + switch (inp->inp_subset_strategy) { + case IP_SUBSET_STRATEGY_FLOW: + /* + * If the packet has a flow tag, use that, but otherwise, + * calculate our own flow tag using the IP/port tuple. + */ + if (flowid != 0) { + if ((flowid % inp->inp_subset_count) == + inp->inp_subset_member) + return (1); + } else { + /* + * XXXRW: This hash is not the hash that you are + * looking for. + */ + if (((faddr.s_addr ^ laddr.s_addr ^ fport ^ lport) % + inp->inp_subset_count) == inp->inp_subset_member) + return (1); + } + return (0); + + case IP_SUBSET_STRATEGY_RANDOM: + /* + * If there is a flow tag, use that and the IP ID as a source + * of entropy. 
Otherwise, calculate our own flow tag as + * above and combine with the IP ID. + * + * XXXRW: This hash is also not the hash that you are looking + * for. + */ + if (flowid != 0) { + if (((flowid ^ ip_id) % inp->inp_subset_count) == + inp->inp_subset_member) + return (1); + } else { + if (((faddr.s_addr ^ laddr.s_addr ^ fport ^ lport ^ + ip_id) % inp->inp_subset_count) == + inp->inp_subset_member) + return (1); + } + return (0); + + case IP_SUBSET_STRATEGY_THREADID: + /* + * Experiment: pick the socket to use based on the kernel + * thread ID processing the packet. This will be fixed for + * particular RSS input queues, so will assign work to a + * particular socket based on which input queue it came from. + * This doesn't attempt to balance the work at all, simply + * ensure that datagrams local to a particular CPU are + * assigned to the same socket consistently. + */ + if ((curthread->td_tid % inp->inp_subset_count) == + inp->inp_subset_member) + return (1); + return (0); + + case IP_SUBSET_STRATEGY_CPU: + /* + * Experimental: packets from the same CPU will always get + * assigned to the same socket. Doesn't attempt to load + * balance or maintain ordering, as source threads may not + * always be on the same CPU. However, may achieve a more + * even or predictable balance than + * IP_SUBSET_STRATEGY_THREADID. + * + * This might be quite a bit more interesting if sockets had + * a formal affinity themselves, as then we could direct + * datagrams to that explicitly. + */ + if ((curcpu % inp->inp_subset_count) == + inp->inp_subset_member) + return (1); + return (0); + + /* case IP_SUBSET_STRATEGY_FILLSOCK: */ + /* + * In this theoretical mode, we attempt to fill sockets in + * the order they are matched, and don't move onto the next + * socket unless the previous one is filled. 
This requires + * us to peek up a layer and see if there is room for the + * current datagram; this proves somewhat tricky as we need + * to make sure we don't return ICMP when the last one proves + * full, so we don't try to do that yet. + */ + + default: + panic("in_subset_match: strategy %d", + inp->inp_subset_strategy); + } +} + +/* * Lookup PCB in hash list. */ struct inpcb * -in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, - u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard, - struct ifnet *ifp) +in_pcblookup_hash_full(struct inpcbinfo *pcbinfo, struct in_addr faddr, + u_int fport_arg, struct in_addr laddr, u_int lport_arg, u_short ip_id, + u_int32_t flowid, int wildcard, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; @@ -1309,20 +1412,25 @@ in_pcblookup_hash(struct inpcbi if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif - if (inp->inp_faddr.s_addr == faddr.s_addr && - inp->inp_laddr.s_addr == laddr.s_addr && - inp->inp_fport == fport && - inp->inp_lport == lport) { - /* - * XXX We should be able to directly return - * the inp here, without any checks. - * Well unless both bound with SO_REUSEPORT? - */ - if (jailed(inp->inp_cred)) - return (inp); - if (tmpinp == NULL) - tmpinp = inp; - } + if (inp->inp_faddr.s_addr != faddr.s_addr || + inp->inp_laddr.s_addr != laddr.s_addr || + inp->inp_fport != fport || + inp->inp_lport != lport) + continue; + if (inp->inp_subset_strategy != IP_SUBSET_STRATEGY_DISABLED + && !in_subset_match(inp, faddr, fport, laddr, lport, + ip_id, flowid)) + continue; + + /* + * XXX We should be able to directly return + * the inp here, without any checks. + * Well unless both bound with SO_REUSEPORT? 
+ */ + if (jailed(inp->inp_cred)) + return (inp); + if (tmpinp == NULL) + tmpinp = inp; } if (tmpinp != NULL) return (tmpinp); @@ -1372,6 +1480,12 @@ in_pcblookup_hash(struct inpcbinfo *pcbi continue; } + if (inp->inp_subset_strategy != + IP_SUBSET_STRATEGY_DISABLED && + !in_subset_match(inp, faddr, fport, laddr, lport, + ip_id, flowid)) + continue; + if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) return (inp); @@ -1405,6 +1519,16 @@ in_pcblookup_hash(struct inpcbinfo *pcbi return (NULL); } +struct inpcb * +in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, + u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard, + struct ifnet *ifp) +{ + + return (in_pcblookup_hash_full(pcbinfo, faddr, fport_arg, laddr, + lport_arg, 0, 0, wildcard, ifp)); +} + /* * Insert PCB onto various hash lists. */ Modified: projects/pnet/sys/netinet/in_pcb.h ============================================================================== --- projects/pnet/sys/netinet/in_pcb.h Thu Jan 1 20:47:09 2009 (r186685) +++ projects/pnet/sys/netinet/in_pcb.h Thu Jan 1 22:11:44 2009 (r186686) @@ -199,6 +199,9 @@ struct inpcb { } inp_depend6; LIST_ENTRY(inpcb) inp_portlist; /* (i/p) */ struct inpcbport *inp_phd; /* (i/p) head of this list */ + u_int inp_subset_strategy; + u_int inp_subset_count; + u_int inp_subset_member; #define inp_zero_size offsetof(struct inpcb, inp_gencnt) inp_gen_t inp_gencnt; /* (c) generation count */ struct rwlock inp_lock; @@ -493,6 +496,11 @@ struct inpcb * struct inpcb * in_pcblookup_hash(struct inpcbinfo *, struct in_addr, u_int, struct in_addr, u_int, int, struct ifnet *); +struct inpcb * + in_pcblookup_hash_full(struct inpcbinfo *pcbinfo, + struct in_addr faddr, u_int fport_arg, struct in_addr laddr, + u_int lport_arg, u_short ip_id, u_int32_t flowid, int wildcard, + struct ifnet *ifp); void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr, int, struct inpcb *(*)(struct inpcb *, int)); void in_pcbref(struct inpcb *); 
Modified: projects/pnet/sys/netinet/in_proto.c ============================================================================== --- projects/pnet/sys/netinet/in_proto.c Thu Jan 1 20:47:09 2009 (r186685) +++ projects/pnet/sys/netinet/in_proto.c Thu Jan 1 22:11:44 2009 (r186686) @@ -124,7 +124,7 @@ struct protosw inetsw[] = { .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = udp_input, .pr_ctlinput = udp_ctlinput, - .pr_ctloutput = ip_ctloutput, + .pr_ctloutput = udp_ctloutput, .pr_init = udp_init, .pr_usrreqs = &udp_usrreqs }, Modified: projects/pnet/sys/netinet/udp_usrreq.c ============================================================================== --- projects/pnet/sys/netinet/udp_usrreq.c Thu Jan 1 20:47:09 2009 (r186685) +++ projects/pnet/sys/netinet/udp_usrreq.c Thu Jan 1 22:11:44 2009 (r186686) @@ -526,8 +526,8 @@ udp_input(struct mbuf *m, int off) /* * Locate pcb for datagram. */ - inp = in_pcblookup_hash(&V_udbinfo, ip->ip_src, uh->uh_sport, - ip->ip_dst, uh->uh_dport, 1, ifp); + inp = in_pcblookup_hash_full(&V_udbinfo, ip->ip_src, uh->uh_sport, + ip->ip_dst, uh->uh_dport, ip->ip_id, m->m_pkthdr.flowid, 1, ifp); if (inp == NULL) { if (udp_log_in_vain) { char buf[4*sizeof "123"]; @@ -621,6 +621,9 @@ udp_ctlinput(int cmd, struct sockaddr *s * * XXX: We never get this from ICMP, otherwise it makes an excellent * DoS attack on machines with many connections. + * + * XXXRW: With subsetting, we should deliver this to all matching + * connections for the specific tuple. 
*/ if (cmd == PRC_HOSTDEAD) ip = NULL; @@ -644,6 +647,67 @@ udp_ctlinput(int cmd, struct sockaddr *s udp_notify); } +int +udp_ctloutput(struct socket *so, struct sockopt *sopt) +{ + INIT_VNET_INET(so->so_vnet); + struct ip_subset is; + struct inpcb *inp; + int error; + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("udp_ctloutput: inp == NULL")); + + if (sopt->sopt_level != IPPROTO_UDP) + return (ip_ctloutput(so, sopt)); + + switch (sopt->sopt_dir) { + case SOPT_GET: + switch (sopt->sopt_name) { + case IP_SUBSET: + bzero(&is, sizeof(is)); + INP_RLOCK(inp); + is.is_strategy = inp->inp_subset_strategy; + is.is_count = inp->inp_subset_count; + is.is_member = inp->inp_subset_member; + INP_RUNLOCK(inp); + return (sooptcopyout(sopt, &is, sizeof(is))); + } + break; + + case SOPT_SET: + switch (sopt->sopt_name) { + case IP_SUBSET: + error = sooptcopyin(sopt, &is, sizeof(is), + sizeof(is)); + if (error) + return (error); + switch (is.is_strategy) { + case IP_SUBSET_STRATEGY_DISABLED: + break; + + case IP_SUBSET_STRATEGY_FLOW: + case IP_SUBSET_STRATEGY_RANDOM: + if (is.is_count == 0 || + is.is_member >= is.is_count) + return (EINVAL); + break; + + default: + return (EINVAL); + } + INP_WLOCK(inp); + inp->inp_subset_strategy = is.is_strategy; + inp->inp_subset_count = is.is_count; + inp->inp_subset_member = is.is_member; + INP_WUNLOCK(inp); + return (0); + } + break; + } + return (ENOPROTOOPT); +} + static int udp_pcblist(SYSCTL_HANDLER_ARGS) { @@ -758,6 +822,11 @@ udp_getcred(SYSCTL_HANDLER_ARGS) error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); + + /* + * XXXRW: with IP subsetting, potentially more than one socket may + * match, so we just return the cred for the first one. 
+ */ INP_INFO_RLOCK(&V_udbinfo); inp = in_pcblookup_hash(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 1, NULL); Modified: projects/pnet/sys/netinet/udp_var.h ============================================================================== --- projects/pnet/sys/netinet/udp_var.h Thu Jan 1 20:47:09 2009 (r186685) +++ projects/pnet/sys/netinet/udp_var.h Thu Jan 1 22:11:44 2009 (r186686) @@ -106,6 +106,7 @@ extern u_long udp_recvspace; extern int udp_log_in_vain; void udp_ctlinput(int, struct sockaddr *, void *); +int udp_ctloutput(struct socket *so, struct sockopt *sopt); void udp_init(void); void udp_input(struct mbuf *, int); struct inpcb *udp_notify(struct inpcb *inp, int errno);
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200901012211.n01MBist080219>