Date: Sat, 1 Oct 2022 14:19:03 GMT From: "Alexander V. Chernikov" <melifaro@FreeBSD.org> To: src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org Subject: git: 7e5bf68495cc - main - netlink: add netlink support Message-ID: <202210011419.291EJ3aa000309@gitrepo.freebsd.org>
next in thread | raw e-mail | index | archive | help
The branch main has been updated by melifaro: URL: https://cgit.FreeBSD.org/src/commit/?id=7e5bf68495cc0a8c9793a338a8a02009a7f6dbb6 commit 7e5bf68495cc0a8c9793a338a8a02009a7f6dbb6 Author: Alexander V. Chernikov <melifaro@FreeBSD.org> AuthorDate: 2022-01-20 21:39:21 +0000 Commit: Alexander V. Chernikov <melifaro@FreeBSD.org> CommitDate: 2022-10-01 14:15:35 +0000 netlink: add netlink support Netlink is a communication protocol currently used in the Linux kernel to modify, read and subscribe to nearly all networking state. Interfaces, addresses, routes, firewall, fibs, vnets, etc. are controlled via netlink. It is an async, TLV-based protocol, providing 1-1 and 1-many communications. The current implementation supports a subset of the NETLINK_ROUTE family. To be more specific, the following is supported: * Dumps: - routes - nexthops / nexthop groups - interfaces - interface addresses - neighbors (arp/ndp) * Notifications: - interface arrival/departure - interface address arrival/departure - route addition/deletion * Modifications: - adding/deleting routes - adding/deleting nexthops/nexthop groups - adding/deleting neighbors - adding/deleting interfaces (basic support only) * Rtsock interaction - route events are bridged both ways The implementation also supports the NETLINK_GENERIC family framework. Implementation notes: Netlink is implemented via a loadable/unloadable kernel module, not touching many kernel parts. Each netlink socket uses a dedicated taskqueue to support async operations that can sleep, such as interface creation. All message processing is performed within these taskqueues. Compatibility: Most of the Netlink data models specified above map to FreeBSD concepts nicely. Unmodified ip(8) binary correctly works with interfaces, addresses, routes, nexthops and nexthop groups. Some software such as net/bird requires header-only modifications to compile and work with FreeBSD netlink. 
Reviewed by: imp Differential Revision: https://reviews.freebsd.org/D36002 MFC after: 2 months --- etc/mtree/BSD.include.dist | 4 + sys/modules/Makefile | 1 + sys/modules/netlink/Makefile | 17 + sys/net/route.c | 11 + sys/net/route/route_ctl.h | 7 + sys/net/rtsock.c | 42 ++ sys/netlink/netlink.h | 257 +++++++++ sys/netlink/netlink_ctl.h | 102 ++++ sys/netlink/netlink_debug.h | 82 +++ sys/netlink/netlink_domain.c | 689 +++++++++++++++++++++++ sys/netlink/netlink_generic.c | 472 ++++++++++++++++ sys/netlink/netlink_generic.h | 112 ++++ sys/netlink/netlink_io.c | 528 ++++++++++++++++++ sys/netlink/netlink_linux.h | 54 ++ sys/netlink/netlink_message_parser.c | 472 ++++++++++++++++ sys/netlink/netlink_message_parser.h | 270 +++++++++ sys/netlink/netlink_message_writer.c | 686 +++++++++++++++++++++++ sys/netlink/netlink_message_writer.h | 250 +++++++++ sys/netlink/netlink_module.c | 228 ++++++++ sys/netlink/netlink_route.c | 135 +++++ sys/netlink/netlink_route.h | 43 ++ sys/netlink/netlink_var.h | 142 +++++ sys/netlink/route/common.h | 213 ++++++++ sys/netlink/route/iface.c | 857 +++++++++++++++++++++++++++++ sys/netlink/route/iface_drivers.c | 165 ++++++ sys/netlink/route/ifaddrs.h | 90 +++ sys/netlink/route/interface.h | 245 +++++++++ sys/netlink/route/neigh.c | 571 +++++++++++++++++++ sys/netlink/route/neigh.h | 105 ++++ sys/netlink/route/nexthop.c | 1000 ++++++++++++++++++++++++++++++++++ sys/netlink/route/nexthop.h | 102 ++++ sys/netlink/route/route.c | 972 +++++++++++++++++++++++++++++++++ sys/netlink/route/route.h | 366 +++++++++++++ sys/netlink/route/route_var.h | 101 ++++ 34 files changed, 9391 insertions(+) diff --git a/etc/mtree/BSD.include.dist b/etc/mtree/BSD.include.dist index bb5453252d86..192508bbf6f1 100644 --- a/etc/mtree/BSD.include.dist +++ b/etc/mtree/BSD.include.dist @@ -269,6 +269,10 @@ .. netinet6 .. + netlink + route + .. + .. netipsec .. 
netnatm diff --git a/sys/modules/Makefile b/sys/modules/Makefile index 68b3dfcac776..a6aee9bbab36 100644 --- a/sys/modules/Makefile +++ b/sys/modules/Makefile @@ -266,6 +266,7 @@ SUBDIR= \ my \ ${_nctgpio} \ ${_neta} \ + netlink \ ${_netgraph} \ ${_nfe} \ nfscl \ diff --git a/sys/modules/netlink/Makefile b/sys/modules/netlink/Makefile new file mode 100644 index 000000000000..046ecf5a2961 --- /dev/null +++ b/sys/modules/netlink/Makefile @@ -0,0 +1,17 @@ +.PATH: ${SRCTOP}/sys/netlink +KMOD= netlink + +SRCS = netlink_module.c netlink_domain.c netlink_io.c \ + netlink_message_parser.c netlink_message_writer.c netlink_generic.c \ + netlink_route.c route/iface.c route/iface_drivers.c route/neigh.c \ + route/nexthop.c route/route.c + +EXPORT_SYMS= +EXPORT_SYMS+= nlmsg_get_chain_writer +EXPORT_SYMS+= nlmsg_refill_buffer +EXPORT_SYMS+= nlmsg_end +EXPORT_SYMS+= nlmsg_flush + +EXPORT_SYMS= YES + +.include <bsd.kmod.mk> diff --git a/sys/net/route.c b/sys/net/route.c index 7d46ba2588ed..9773f899f5af 100644 --- a/sys/net/route.c +++ b/sys/net/route.c @@ -694,3 +694,14 @@ rt_routemsg_info(int cmd, struct rt_addrinfo *info, int fibnum) return (rtsock_routemsg_info(cmd, info, fibnum)); } + +/* Netlink-related callbacks needed to glue rtsock, netlink and linuxolator */ +static void +ignore_route_event(uint32_t fibnum, const struct rib_cmd_info *rc) +{ +} +static struct rtbridge ignore_cb = { .route_f = ignore_route_event }; + +void *linux_netlink_p = NULL; /* Callback pointer for Linux translator functions */ +struct rtbridge *rtsock_callback_p = &ignore_cb; +struct rtbridge *netlink_callback_p = &ignore_cb; diff --git a/sys/net/route/route_ctl.h b/sys/net/route/route_ctl.h index 0b331e5f7d2c..d150da6264d4 100644 --- a/sys/net/route/route_ctl.h +++ b/sys/net/route/route_ctl.h @@ -189,4 +189,11 @@ void rib_unsubscribe_locked(struct rib_subscription *rs); void rib_notify(struct rib_head *rnh, enum rib_subscription_type type, struct rib_cmd_info *rc); +/* Event bridge */ +typedef void 
route_event_f(uint32_t fibnum, const struct rib_cmd_info *rc); +struct rtbridge{ + route_event_f *route_f; +}; +extern struct rtbridge *rtsock_callback_p; +extern struct rtbridge *netlink_callback_p; #endif diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c index 91ad8c79a5eb..99d962c972cb 100644 --- a/sys/net/rtsock.c +++ b/sys/net/rtsock.c @@ -219,6 +219,7 @@ static void send_rtm_reply(struct socket *so, struct rt_msghdr *rtm, int rtm_errno); static bool can_export_rte(struct ucred *td_ucred, bool rt_is_host, const struct sockaddr *rt_dst); +static void rtsock_notify_event(uint32_t fibnum, const struct rib_cmd_info *rc); static struct netisr_handler rtsock_nh = { .nh_name = "rtsock", @@ -274,6 +275,45 @@ VNET_SYSUNINIT(vnet_rts_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, vnet_rts_uninit, 0); #endif +static void +report_route_event(const struct rib_cmd_info *rc, void *_cbdata) +{ + uint32_t fibnum = (uint32_t)(uintptr_t)_cbdata; + struct nhop_object *nh; + + nh = rc->rc_cmd == RTM_DELETE ? 
rc->rc_nh_old : rc->rc_nh_new; + rt_routemsg(rc->rc_cmd, rc->rc_rt, nh, fibnum); +} + +static void +rts_handle_route_event(uint32_t fibnum, const struct rib_cmd_info *rc) +{ +#ifdef ROUTE_MPATH + if ((rc->rc_nh_new && NH_IS_NHGRP(rc->rc_nh_new)) || + (rc->rc_nh_old && NH_IS_NHGRP(rc->rc_nh_old))) { + rib_decompose_notification(rc, report_route_event, + (void *)(uintptr_t)fibnum); + } else +#endif + report_route_event(rc, (void *)(uintptr_t)fibnum); +} +static struct rtbridge rtsbridge = { .route_f = rts_handle_route_event }; +static struct rtbridge *rtsbridge_orig_p; + +static void +rtsock_notify_event(uint32_t fibnum, const struct rib_cmd_info *rc) +{ + netlink_callback_p->route_f(fibnum, rc); +} + +static void +rtsock_init(void) +{ + rtsbridge_orig_p = rtsock_callback_p; + rtsock_callback_p = &rtsbridge; +} +SYSINIT(rtsock_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rtsock_init, NULL); + static void rts_handle_ifnet_arrival(void *arg __unused, struct ifnet *ifp) { @@ -1074,6 +1114,7 @@ rts_send(struct socket *so, int flags, struct mbuf *m, } error = rib_action(fibnum, rtm->rtm_type, &info, &rc); if (error == 0) { + rtsock_notify_event(fibnum, &rc); #ifdef ROUTE_MPATH if (NH_IS_NHGRP(rc.rc_nh_new) || (rc.rc_nh_old && NH_IS_NHGRP(rc.rc_nh_old))) { @@ -1095,6 +1136,7 @@ rts_send(struct socket *so, int flags, struct mbuf *m, case RTM_DELETE: error = rib_action(fibnum, RTM_DELETE, &info, &rc); if (error == 0) { + rtsock_notify_event(fibnum, &rc); #ifdef ROUTE_MPATH if (NH_IS_NHGRP(rc.rc_nh_old) || (rc.rc_nh_new && NH_IS_NHGRP(rc.rc_nh_new))) { diff --git a/sys/netlink/netlink.h b/sys/netlink/netlink.h new file mode 100644 index 000000000000..6a68dcec1382 --- /dev/null +++ b/sys/netlink/netlink.h @@ -0,0 +1,257 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Ng Peng Nam Sean + * Copyright (c) 2022 Alexander V. 
Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Copyright (C) The Internet Society (2003). All Rights Reserved. + * + * This document and translations of it may be copied and furnished to + * others, and derivative works that comment on or otherwise explain it + * or assist in its implementation may be prepared, copied, published + * and distributed, in whole or in part, without restriction of any + * kind, provided that the above copyright notice and this paragraph are + * included on all such copies and derivative works. 
However, this + * document itself may not be modified in any way, such as by removing + * the copyright notice or references to the Internet Society or other + * Internet organizations, except as needed for the purpose of + * developing Internet standards in which case the procedures for + * copyrights defined in the Internet Standards process must be + * followed, or as required to translate it into languages other than + * English. + * + * The limited permissions granted above are perpetual and will not be + * revoked by the Internet Society or its successors or assignees. + * + * This document and the information contained herein is provided on an + * "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING + * TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION + * HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + + */ + +/* + * This file contains structures and constants for RFC 3549 (Netlink) + * protocol. Some values have been taken from Linux implementation. 
+ */ + +#ifndef _NETLINK_NETLINK_H_ +#define _NETLINK_NETLINK_H_ + +#include <sys/types.h> +#include <sys/socket.h> + +struct sockaddr_nl { + uint8_t nl_len; /* sizeof(sockaddr_nl) */ + sa_family_t nl_family; /* netlink family */ + uint16_t nl_pad; /* reserved, set to 0 */ + uint32_t nl_pid; /* desired port ID, 0 for auto-select */ + uint32_t nl_groups; /* multicast groups mask to bind to */ +}; + +#define SOL_NETLINK 270 + +/* Netlink socket options */ +#define NETLINK_ADD_MEMBERSHIP 1 /* Subscribe for the specified group notifications */ +#define NETLINK_DROP_MEMBERSHIP 2 /* Unsubscribe from the specified group */ +#define NETLINK_PKTINFO 3 /* XXX: not supported */ +#define NETLINK_BROADCAST_ERROR 4 /* XXX: not supported */ +#define NETLINK_NO_ENOBUFS 5 /* XXX: not supported */ +#define NETLINK_RX_RING 6 /* XXX: not supported */ +#define NETLINK_TX_RING 7 /* XXX: not supported */ +#define NETLINK_LISTEN_ALL_NSID 8 /* XXX: not supported */ + +#define NETLINK_LIST_MEMBERSHIPS 9 +#define NETLINK_CAP_ACK 10 /* Send only original message header in the reply */ +#define NETLINK_EXT_ACK 11 /* Ack support for receiving additional TLVs in ack */ +#define NETLINK_GET_STRICT_CHK 12 /* Strict header checking */ + + +/* + * RFC 3549, 2.3.2 Netlink Message Header + */ +struct nlmsghdr { + uint32_t nlmsg_len; /* Length of message including header */ + uint16_t nlmsg_type; /* Message type identifier */ + uint16_t nlmsg_flags; /* Flags (NLM_F_) */ + uint32_t nlmsg_seq; /* Sequence number */ + uint32_t nlmsg_pid; /* Sending process port ID */ +}; + +/* + * RFC 3549, 2.3.2 standard flag bits (nlmsg_flags) + */ +#define NLM_F_REQUEST 0x01 /* Indicateds request to kernel */ +#define NLM_F_MULTI 0x02 /* Message is part of a group terminated by NLMSG_DONE msg */ +#define NLM_F_ACK 0x04 /* Reply with ack message containing resulting error code */ +#define NLM_F_ECHO 0x08 /* (not supported) Echo this request back */ +#define NLM_F_DUMP_INTR 0x10 /* Dump was inconsistent due to sequence 
change */ +#define NLM_F_DUMP_FILTERED 0x20 /* Dump was filtered as requested */ + +/* + * RFC 3549, 2.3.2 Additional flag bits for GET requests + */ +#define NLM_F_ROOT 0x100 /* Return the complete table */ +#define NLM_F_MATCH 0x200 /* Return all entries matching criteria */ +#define NLM_F_ATOMIC 0x400 /* Return an atomic snapshot (ignored) */ +#define NLM_F_DUMP (NLM_F_ROOT | NLM_F_MATCH) + +/* + * RFC 3549, 2.3.2 Additional flag bits for NEW requests + */ +#define NLM_F_REPLACE 0x100 /* Replace existing matching config object */ +#define NLM_F_EXCL 0x200 /* Don't replace the object if exists */ +#define NLM_F_CREATE 0x400 /* Create if it does not exist */ +#define NLM_F_APPEND 0x800 /* Add to end of list */ + +/* Modifiers to DELETE requests */ +#define NLM_F_NONREC 0x100 /* Do not delete recursively */ + +/* Flags for ACK message */ +#define NLM_F_CAPPED 0x100 /* request was capped */ +#define NLM_F_ACK_TLVS 0x200 /* extended ACK TVLs were included */ + +/* + * RFC 3549, 2.3.2 standard message types (nlmsg_type). + */ +#define NLMSG_NOOP 0x1 /* Message is ignored. */ +#define NLMSG_ERROR 0x2 /* reply error code reporting */ +#define NLMSG_DONE 0x3 /* Message terminates a multipart message. */ +#define NLMSG_OVERRUN 0x4 /* overrun detected, data is lost */ + +#define NLMSG_MIN_TYPE 0x10 /* < 0x10: reserved control messages */ + +/* + * Defition of numbers assigned to the netlink subsystems. 
+ */ +#define NETLINK_ROUTE 0 /* Routing/device hook */ +#define NETLINK_UNUSED 1 /* not supported */ +#define NETLINK_USERSOCK 2 /* not supported */ +#define NETLINK_FIREWALL 3 /* not supported */ +#define NETLINK_SOCK_DIAG 4 /* not supported */ +#define NETLINK_NFLOG 5 /* not supported */ +#define NETLINK_XFRM 6 /* (not supported) PF_SETKEY */ +#define NETLINK_SELINUX 7 /* not supported */ +#define NETLINK_ISCSI 8 /* not supported */ +#define NETLINK_AUDIT 9 /* not supported */ +#define NETLINK_FIB_LOOKUP 10 /* not supported */ +#define NETLINK_CONNECTOR 11 /* not supported */ +#define NETLINK_NETFILTER 12 /* not supported */ +#define NETLINK_IP6_FW 13 /* not supported */ +#define NETLINK_DNRTMSG 14 /* not supported */ +#define NETLINK_KOBJECT_UEVENT 15 /* not supported */ +#define NETLINK_GENERIC 16 /* Generic netlink (dynamic families) */ + +/* + * RFC 3549, 2.3.2.2 The ACK Netlink Message + */ +struct nlmsgerr { + int error; + struct nlmsghdr msg; +}; + +enum nlmsgerr_attrs { + NLMSGERR_ATTR_UNUSED, + NLMSGERR_ATTR_MSG = 1, /* string, error message */ + NLMSGERR_ATTR_OFFS = 2, /* u32, offset of the invalid attr from nl header */ + NLMSGERR_ATTR_COOKIE = 3, /* binary, data to pass to userland */ + NLMSGERR_ATTR_POLICY = 4, /* not supported */ + __NLMSGERR_ATTR_MAX, + NLMSGERR_ATTR_MAX = __NLMSGERR_ATTR_MAX - 1 +}; + + +#ifndef roundup2 +#define roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */ +#endif +#define NL_ITEM_ALIGN_SIZE sizeof(uint32_t) +#define NL_ITEM_ALIGN(_len) roundup2(_len, NL_ITEM_ALIGN_SIZE) +#define NL_ITEM_DATA(_ptr, _off) ((void *)((char *)(_ptr) + _off)) +#define NL_ITEM_DATA_CONST(_ptr, _off) ((const void *)((const char *)(_ptr) + _off)) + +#define NL_ITEM_OK(_ptr, _len, _hlen, _LEN_M) \ + ((_len) >= _hlen && _LEN_M(_ptr) >= _hlen && _LEN_M(_ptr) <= (_len)) +#define NL_ITEM_NEXT(_ptr, _LEN_M) ((typeof(_ptr))((char *)(_ptr) + _LEN_M(_ptr))) +#define NL_ITEM_ITER(_ptr, _len, _LEN_MACRO) \ + ((_len) -= _LEN_MACRO(_ptr), 
NL_ITEM_NEXT(_ptr, _LEN_MACRO)) + + +#ifndef _KERNEL +/* part of netlink(3) API */ +#define NLMSG_ALIGNTO NL_ITEM_ALIGN_SIZE +#define NLMSG_ALIGN(_len) NL_ITEM_ALIGN(_len) +#define NLMSG_HDRLEN ((int)sizeof(struct nlmsghdr)) +#define NLMSG_LENGTH(_len) ((_len) + NLMSG_HDRLEN) +#define NLMSG_SPACE(len) NLMSG_ALIGN(NLMSG_LENGTH(_len)) +#define NLMSG_DATA(_hdr) NL_ITEM_DATA(_hdr, NLMSG_HDRLEN) +#define _NLMSG_LEN(_hdr) ((int)(_hdr)->nlmsg_len) +#define _NLMSG_ALIGNED_LEN(_hdr) NLMSG_ALIGN(_NLMSG_LEN(_hdr)) +#define NLMSG_OK(_hdr, _len) NL_ITEM_OK(_hdr, _len, NLMSG_HDRLEN, _NLMSG_LEN) +#define NLMSG_PAYLOAD(_hdr,_len) (_NLMSG_LEN(_hdr) - NLMSG_SPACE((_len))) +#define NLMSG_NEXT(_hdr, _len) NL_ITEM_ITER(_hdr, _len, _NLMSG_ALIGNED_LEN) + +#else +#define NLMSG_ALIGNTO 4U +#define NLMSG_ALIGN(len) (((len) + NLMSG_ALIGNTO - 1) & ~(NLMSG_ALIGNTO - 1)) +#define NLMSG_HDRLEN ((int)NLMSG_ALIGN(sizeof(struct nlmsghdr))) +#endif + +/* + * Base netlink attribute TLV header. + */ +struct nlattr { + uint16_t nla_len; /* Total attribute length */ + uint16_t nla_type; /* Attribute type */ +}; + +/* + * + * nl_type field enconding: + * + * 0 1 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |N|O| Attribute type | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * N - attribute contains other attributes (mostly unused) + * O - encoded in network byte order (mostly unused) + * Note: N & O are mutually exclusive + * + * Note: attribute type value scope normally is either parent attribute + * or the message/message group. 
+ */ + +#define NLA_F_NESTED (1 << 15) +#define NLA_F_NET_BYTEORDER (1 << 14) +#define NLA_TYPE_MASK ~(NLA_F_NESTED | NLA_F_NET_BYTEORDER) + +#ifndef _KERNEL +#define NLA_ALIGNTO NL_ITEM_ALIGN_SIZE +#define NLA_ALIGN(_len) NL_ITEM_ALIGN(_len) +#define NLA_HDRLEN ((int)sizeof(struct nlattr)) +#endif + +#endif diff --git a/sys/netlink/netlink_ctl.h b/sys/netlink/netlink_ctl.h new file mode 100644 index 000000000000..fb5a8b30e0aa --- /dev/null +++ b/sys/netlink/netlink_ctl.h @@ -0,0 +1,102 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _NETLINK_NETLINK_CTL_H_ +#define _NETLINK_NETLINK_CTL_H_ + +/* + * This file provides headers for the public KPI of the netlink + * subsystem + */ + +MALLOC_DECLARE(M_NETLINK); + +/* + * Macro for handling attribute TLVs + */ +#define _roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) + +#define NETLINK_ALIGN_SIZE sizeof(uint32_t) +#define NETLINK_ALIGN(_len) _roundup2(_len, NETLINK_ALIGN_SIZE) + +#define NLA_ALIGN_SIZE sizeof(uint32_t) +#define NLA_ALIGN(_len) _roundup2(_len, NLA_ALIGN_SIZE) +#define NLA_HDRLEN ((int)sizeof(struct nlattr)) +#define NLA_DATA_LEN(_nla) ((int)((_nla)->nla_len - NLA_HDRLEN)) +#define NLA_DATA(_nla) NL_ITEM_DATA(_nla, NLA_HDRLEN) +#define NLA_DATA_CONST(_nla) NL_ITEM_DATA_CONST(_nla, NLA_HDRLEN) +#define NLA_TYPE(_nla) ((_nla)->nla_type & 0x3FFF) + +#ifndef typeof +#define typeof __typeof +#endif + +#define NLA_NEXT(_attr) (struct nlattr *)((char *)_attr + NLA_ALIGN(_attr->nla_len)) +#define _NLA_END(_start, _len) ((char *)(_start) + (_len)) +#define NLA_FOREACH(_attr, _start, _len) \ + for (typeof(_attr) _end = (typeof(_attr))_NLA_END(_start, _len), _attr = (_start); \ + ((char *)_attr < (char *)_end) && \ + ((char *)NLA_NEXT(_attr) <= (char *)_end); \ + _attr = (_len -= NLA_ALIGN(_attr->nla_len), NLA_NEXT(_attr))) + +#define NL_ARRAY_LEN(_a) (sizeof(_a) / sizeof((_a)[0])) + +#include <netlink/netlink_message_writer.h> +#include <netlink/netlink_message_parser.h> + + +/* Protocol handlers */ +struct nl_pstate; +typedef int (*nl_handler_f)(struct nlmsghdr *hdr, struct nl_pstate *npt); + +bool netlink_register_proto(int proto, const char *proto_name, nl_handler_f handler); +bool netlink_unregister_proto(int proto); + +/* Common helpers */ +bool nl_has_listeners(int netlink_family, uint32_t groups_mask); +bool nlp_has_priv(struct nlpcb *nlp, int priv); + +/* netlink_generic.c */ +struct genl_cmd { + const char *cmd_name; + nl_handler_f cmd_cb; + uint32_t cmd_flags; + uint32_t cmd_priv; + uint32_t cmd_num; +}; + +uint32_t 
genl_register_family(const char *family_name, size_t hdrsize, + int family_version, int max_attr_idx); +bool genl_unregister_family(const char *family_name); +bool genl_register_cmds(const char *family_name, const struct genl_cmd *cmds, + int count); +uint32_t genl_register_group(const char *family_name, const char *group_name); + +/* Debug */ +uint32_t nlp_get_pid(const struct nlpcb *nlp); + +#endif diff --git a/sys/netlink/netlink_debug.h b/sys/netlink/netlink_debug.h new file mode 100644 index 000000000000..6ff6811c6a5a --- /dev/null +++ b/sys/netlink/netlink_debug.h @@ -0,0 +1,82 @@ +/*- + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NETLINK_NETLINK_DEBUG_H_ +#define _NETLINK_NETLINK_DEBUG_H_ + +#define _DEBUG_SYSCTL_OID _net_netlink_debug +#include <net/route/route_debug.h> + +SYSCTL_DECL(_net_netlink_debug); + +/* + * Generic debug + * [nl_domain] func_name: debug text + */ +#define NL_LOG RT_LOG + +/* + * Logging for events specific for particular process + * Example: [nl_domain] PID 4834 fdump_sa: unsupported family: 45 + */ +#define NL_RAW_PID_LOG(_l, _pid, _fmt, ...) NL_RAW_PID_LOG_##_l(_l, _pid, _fmt, ## __VA_ARGS__) +#define _NL_RAW_PID_LOG(_l, _pid, _fmt, ...) if (_DEBUG_PASS_MSG(_l)) { \ + _output("[" DEBUG_PREFIX_NAME "] PID %u %s: " _fmt "\n", _pid, __func__, ##__VA_ARGS__); \ +} + +#define NLP_LOG(_l, _nlp, _fmt, ...) NL_RAW_PID_LOG_##_l(_l, nlp_get_pid(_nlp), _fmt, ## __VA_ARGS__) + +#if DEBUG_MAX_LEVEL>=LOG_DEBUG3 +#define NL_RAW_PID_LOG_LOG_DEBUG3 _NL_RAW_PID_LOG +#else +#define NL_RAW_PID_LOG_LOG_DEBUG3(_l, _pid, _fmt, ...) +#endif +#if DEBUG_MAX_LEVEL>=LOG_DEBUG2 +#define NL_RAW_PID_LOG_LOG_DEBUG2 _NL_RAW_PID_LOG +#else +#define NL_RAW_PID_LOG_LOG_DEBUG2(_l, _pid, _fmt, ...) +#endif +#if DEBUG_MAX_LEVEL>=LOG_DEBUG +#define NL_RAW_PID_LOG_LOG_DEBUG _NL_RAW_PID_LOG +#else +#define NL_RAW_PID_LOG_LOG_DEBUG(_l, _pid, _fmt, ...) +#endif +#if DEBUG_MAX_LEVEL>=LOG_INFO +#define NL_RAW_PID_LOG_LOG_INFO _NL_RAW_PID_LOG +#else +#define NL_RAW_PID_LOG_LOG_INFO(_l, _pid, _fmt, ...) 
+#endif +#define NL_RAW_PID_LOG_LOG_NOTICE _NL_RAW_PID_LOG +#define NL_RAW_PID_LOG_LOG_ERR _NL_RAW_PID_LOG +#define NL_RAW_PID_LOG_LOG_WARNING _NL_RAW_PID_LOG + + + +#endif diff --git a/sys/netlink/netlink_domain.c b/sys/netlink/netlink_domain.c new file mode 100644 index 000000000000..159dfd03724d --- /dev/null +++ b/sys/netlink/netlink_domain.c @@ -0,0 +1,689 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Ng Peng Nam Sean + * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This file contains socket and protocol bindings for netlink. 
+ */ + +#include <sys/param.h> +#include <sys/malloc.h> +#include <sys/lock.h> +#include <sys/rmlock.h> +#include <sys/domain.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/ck.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sysent.h> +#include <sys/syslog.h> +#include <sys/priv.h> /* priv_check */ + +#include <netlink/netlink.h> +#include <netlink/netlink_ctl.h> +#include <netlink/netlink_var.h> + +#define DEBUG_MOD_NAME nl_domain +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include <netlink/netlink_debug.h> +_DECLARE_DEBUG(LOG_DEBUG); + + +#define NLCTL_TRACKER struct rm_priotracker nl_tracker +#define NLCTL_RLOCK(_ctl) rm_rlock(&((_ctl)->ctl_lock), &nl_tracker) +#define NLCTL_RUNLOCK(_ctl) rm_runlock(&((_ctl)->ctl_lock), &nl_tracker) + +#define NLCTL_WLOCK(_ctl) rm_wlock(&((_ctl)->ctl_lock)) +#define NLCTL_WUNLOCK(_ctl) rm_wunlock(&((_ctl)->ctl_lock)) + +static u_long nl_sendspace = NLSNDQ; +SYSCTL_ULONG(_net_netlink, OID_AUTO, sendspace, CTLFLAG_RW, &nl_sendspace, 0, + "Default netlink socket send space"); + +static u_long nl_recvspace = NLSNDQ; +SYSCTL_ULONG(_net_netlink, OID_AUTO, recvspace, CTLFLAG_RW, &nl_recvspace, 0, + "Default netlink socket receive space"); + +extern u_long sb_max_adj; +static u_long nl_maxsockbuf = 512 * 1024 * 1024; /* 512M, XXX: init based on physmem */ + +uint32_t +nlp_get_pid(const struct nlpcb *nlp) +{ + return (nlp->nl_process_id); +} + +/* + * Looks up a nlpcb struct based on the @portid. Need to claim nlsock_mtx. 
+ * Returns nlpcb pointer if present else NULL + */ +static struct nlpcb * +nl_port_lookup(uint32_t port_id) +{ + struct nlpcb *nlp; + + CK_LIST_FOREACH(nlp, &V_nl_ctl->ctl_port_head, nl_port_next) { + if (nlp->nl_port == port_id) + return (nlp); + } + return (NULL); +} + +static void +nl_update_groups_locked(struct nlpcb *nlp, uint64_t nl_groups) +{ + /* Update group mask */ + NL_LOG(LOG_DEBUG2, "socket %p, groups 0x%lX -> 0x%lX", + nlp->nl_socket, nlp->nl_groups, nl_groups); + nlp->nl_groups = nl_groups; +} + +/* + * Broadcasts message @m to the protocol @proto group specified by @group_id + */ +void +nl_send_group(struct mbuf *m, int num_messages, int proto, int group_id) +{ + struct nlpcb *nlp_last = NULL; + struct nlpcb *nlp; + NLCTL_TRACKER; + + IF_DEBUG_LEVEL(LOG_DEBUG2) { + struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *); + NL_LOG(LOG_DEBUG2, "MCAST mbuf len %u msg type %d len %u to group %d/%d", + m->m_len, hdr->nlmsg_type, hdr->nlmsg_len, proto, group_id); + } + + struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); + if (__predict_false(ctl == NULL)) { + /* + * Can be the case when notification is sent within VNET + * which doesn't have any netlink sockets. 
+ */ + m_freem(m); + return; + } + + NLCTL_RLOCK(ctl); + + int io_flags = NL_IOF_UNTRANSLATED; + uint64_t groups_mask = 1 << ((uint64_t)group_id - 1); + + CK_LIST_FOREACH(nlp, &ctl->ctl_pcb_head, nl_next) { + if (nlp->nl_groups & groups_mask && nlp->nl_proto == proto) { + if (nlp_last != NULL) { + struct mbuf *m_copy; + m_copy = m_copym(m, 0, M_COPYALL, M_NOWAIT); + if (m_copy != NULL) + nl_send_one(m_copy, nlp_last, num_messages, io_flags); + else { + NLP_LOCK(nlp_last); + if (nlp_last->nl_socket != NULL) + sorwakeup(nlp_last->nl_socket); + NLP_UNLOCK(nlp_last); + } + } + nlp_last = nlp; + } + } + if (nlp_last != NULL) + nl_send_one(m, nlp_last, num_messages, io_flags); + else + m_freem(m); + + NLCTL_RUNLOCK(ctl); +} + +bool +nl_has_listeners(int netlink_family, uint32_t groups_mask) +{ + return (V_nl_ctl != NULL); +} + +bool +nlp_has_priv(struct nlpcb *nlp, int priv) +{ + return (priv_check_cred(nlp->nl_cred, priv) == 0); +} + +static uint32_t +nl_find_port() { + /* + * app can open multiple netlink sockets. + * Start with current pid, if already taken, + * try random numbers in 65k..256k+65k space, + * avoiding clash with pids. 
+ */ + if (nl_port_lookup(curproc->p_pid) == NULL) + return (curproc->p_pid); + for (int i = 0; i < 16; i++) { + uint32_t nl_port = (arc4random() % 65536) + 65536 * 4; + if (nl_port_lookup(nl_port) == 0) + return (nl_port); + NL_LOG(LOG_DEBUG3, "tried %u\n", nl_port); + } + return (curproc->p_pid); +} + +static int +nl_bind_locked(struct nlpcb *nlp, struct sockaddr_nl *snl) +{ + if (nlp->nl_bound) { + if (nlp->nl_port != snl->nl_pid) { + NL_LOG(LOG_DEBUG, + "bind() failed: program pid %d " + "is different from provided pid %d", + nlp->nl_port, snl->nl_pid); + return (EINVAL); // XXX: better error + } + } else { + if (snl->nl_pid == 0) + snl->nl_pid = nl_find_port(); + if (nl_port_lookup(snl->nl_pid) != NULL) + return (EADDRINUSE); + nlp->nl_port = snl->nl_pid; + nlp->nl_bound = true; + CK_LIST_INSERT_HEAD(&V_nl_ctl->ctl_port_head, nlp, nl_port_next); + } + nl_update_groups_locked(nlp, snl->nl_groups); + + return (0); +} + +static int +nl_pru_attach(struct socket *so, int proto, struct thread *td) +{ + struct nlpcb *nlp; + int error; + + if (__predict_false(netlink_unloading != 0)) + return (EAFNOSUPPORT); + + error = nl_verify_proto(proto); + if (error != 0) + return (error); + + bool is_linux = SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX; + NL_LOG(LOG_DEBUG2, "socket %p, %sPID %d: attaching socket to %s", + so, is_linux ? 
"(linux) " : "", curproc->p_pid, + nl_get_proto_name(proto)); + + /* Create per-VNET state on first socket init */ + struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); + if (ctl == NULL) + ctl = vnet_nl_ctl_init(); + KASSERT(V_nl_ctl != NULL, ("nl_attach: vnet_sock_init() failed")); + + MPASS(sotonlpcb(so) == NULL); + + nlp = malloc(sizeof(struct nlpcb), M_PCB, M_WAITOK | M_ZERO); + error = soreserve(so, nl_sendspace, nl_recvspace); + if (error != 0) { + free(nlp, M_PCB); + return (error); + } + so->so_pcb = nlp; + nlp->nl_socket = so; + /* Copy so_cred to avoid having socket_var.h in every header */ + nlp->nl_cred = so->so_cred; + nlp->nl_proto = proto; + nlp->nl_process_id = curproc->p_pid; + nlp->nl_linux = is_linux; + nlp->nl_active = true; + NLP_LOCK_INIT(nlp); + refcount_init(&nlp->nl_refcount, 1); + nl_init_io(nlp); + + nlp->nl_taskqueue = taskqueue_create("netlink_socket", M_WAITOK, + taskqueue_thread_enqueue, &nlp->nl_taskqueue); + TASK_INIT(&nlp->nl_task, 0, nl_taskqueue_handler, nlp); + taskqueue_start_threads(&nlp->nl_taskqueue, 1, PWAIT, + "netlink_socket (PID %u)", nlp->nl_process_id); + + NLCTL_WLOCK(ctl); + /* XXX: check ctl is still alive */ + CK_LIST_INSERT_HEAD(&ctl->ctl_pcb_head, nlp, nl_next); *** 8742 LINES SKIPPED ***
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202210011419.291EJ3aa000309>