Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 1 Oct 2022 14:19:03 GMT
From:      "Alexander V. Chernikov" <melifaro@FreeBSD.org>
To:        src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org
Subject:   git: 7e5bf68495cc - main - netlink: add netlink support
Message-ID:  <202210011419.291EJ3aa000309@gitrepo.freebsd.org>

next in thread | raw e-mail | index | archive | help
The branch main has been updated by melifaro:

URL: https://cgit.FreeBSD.org/src/commit/?id=7e5bf68495cc0a8c9793a338a8a02009a7f6dbb6

commit 7e5bf68495cc0a8c9793a338a8a02009a7f6dbb6
Author:     Alexander V. Chernikov <melifaro@FreeBSD.org>
AuthorDate: 2022-01-20 21:39:21 +0000
Commit:     Alexander V. Chernikov <melifaro@FreeBSD.org>
CommitDate: 2022-10-01 14:15:35 +0000

    netlink: add netlink support
    
    Netlink is a communication protocol currently used in the Linux kernel to modify,
     read and subscribe to nearly all networking state. Interfaces, addresses, routes,
     firewall, fibs, vnets, etc. are controlled via netlink.
    It is an async, TLV-based protocol, providing 1-1 and 1-many communications.
    
    The current implementation supports the subset of NETLINK_ROUTE
    family. To be more specific, the following is supported:
    * Dumps:
     - routes
     - nexthops / nexthop groups
     - interfaces
     - interface addresses
     - neighbors (arp/ndp)
    * Notifications:
     - interface arrival/departure
     - interface address arrival/departure
     - route addition/deletion
    * Modifications:
     - adding/deleting routes
     - adding/deleting nexthops/nexthop groups
     - adding/deleting neighbors
     - adding/deleting interfaces (basic support only)
    * Rtsock interaction
     - route events are bridged both ways
    
    The implementation also supports the NETLINK_GENERIC family framework.
    
    Implementation notes:
    Netlink is implemented via loadable/unloadable kernel module,
     not touching many kernel parts.
    Each netlink socket uses dedicated taskqueue to support async operations
     that can sleep, such as interface creation. All message processing is
     performed within these taskqueues.
    
    Compatibility:
    Most of the Netlink data models specified above map to FreeBSD concepts
     nicely. An unmodified ip(8) binary works correctly with
    interfaces, addresses, routes, nexthops and nexthop groups. Some
    software such as net/bird requires header-only modifications to compile
    and work with FreeBSD netlink.
    
    Reviewed by:    imp
    Differential Revision: https://reviews.freebsd.org/D36002
    MFC after:      2 months
---
 etc/mtree/BSD.include.dist           |    4 +
 sys/modules/Makefile                 |    1 +
 sys/modules/netlink/Makefile         |   17 +
 sys/net/route.c                      |   11 +
 sys/net/route/route_ctl.h            |    7 +
 sys/net/rtsock.c                     |   42 ++
 sys/netlink/netlink.h                |  257 +++++++++
 sys/netlink/netlink_ctl.h            |  102 ++++
 sys/netlink/netlink_debug.h          |   82 +++
 sys/netlink/netlink_domain.c         |  689 +++++++++++++++++++++++
 sys/netlink/netlink_generic.c        |  472 ++++++++++++++++
 sys/netlink/netlink_generic.h        |  112 ++++
 sys/netlink/netlink_io.c             |  528 ++++++++++++++++++
 sys/netlink/netlink_linux.h          |   54 ++
 sys/netlink/netlink_message_parser.c |  472 ++++++++++++++++
 sys/netlink/netlink_message_parser.h |  270 +++++++++
 sys/netlink/netlink_message_writer.c |  686 +++++++++++++++++++++++
 sys/netlink/netlink_message_writer.h |  250 +++++++++
 sys/netlink/netlink_module.c         |  228 ++++++++
 sys/netlink/netlink_route.c          |  135 +++++
 sys/netlink/netlink_route.h          |   43 ++
 sys/netlink/netlink_var.h            |  142 +++++
 sys/netlink/route/common.h           |  213 ++++++++
 sys/netlink/route/iface.c            |  857 +++++++++++++++++++++++++++++
 sys/netlink/route/iface_drivers.c    |  165 ++++++
 sys/netlink/route/ifaddrs.h          |   90 +++
 sys/netlink/route/interface.h        |  245 +++++++++
 sys/netlink/route/neigh.c            |  571 +++++++++++++++++++
 sys/netlink/route/neigh.h            |  105 ++++
 sys/netlink/route/nexthop.c          | 1000 ++++++++++++++++++++++++++++++++++
 sys/netlink/route/nexthop.h          |  102 ++++
 sys/netlink/route/route.c            |  972 +++++++++++++++++++++++++++++++++
 sys/netlink/route/route.h            |  366 +++++++++++++
 sys/netlink/route/route_var.h        |  101 ++++
 34 files changed, 9391 insertions(+)

diff --git a/etc/mtree/BSD.include.dist b/etc/mtree/BSD.include.dist
index bb5453252d86..192508bbf6f1 100644
--- a/etc/mtree/BSD.include.dist
+++ b/etc/mtree/BSD.include.dist
@@ -269,6 +269,10 @@
     ..
     netinet6
     ..
+    netlink
+        route
+        ..
+    ..
     netipsec
     ..
     netnatm
diff --git a/sys/modules/Makefile b/sys/modules/Makefile
index 68b3dfcac776..a6aee9bbab36 100644
--- a/sys/modules/Makefile
+++ b/sys/modules/Makefile
@@ -266,6 +266,7 @@ SUBDIR=	\
 	my \
 	${_nctgpio} \
 	${_neta} \
+	netlink \
 	${_netgraph} \
 	${_nfe} \
 	nfscl \
diff --git a/sys/modules/netlink/Makefile b/sys/modules/netlink/Makefile
new file mode 100644
index 000000000000..046ecf5a2961
--- /dev/null
+++ b/sys/modules/netlink/Makefile
@@ -0,0 +1,17 @@
+.PATH:	${SRCTOP}/sys/netlink
+KMOD=	netlink
+
+SRCS =	netlink_module.c netlink_domain.c netlink_io.c \
+	netlink_message_parser.c netlink_message_writer.c  netlink_generic.c \
+	netlink_route.c route/iface.c route/iface_drivers.c route/neigh.c \
+	route/nexthop.c route/route.c
+
+EXPORT_SYMS=
+EXPORT_SYMS+=	nlmsg_get_chain_writer
+EXPORT_SYMS+=	nlmsg_refill_buffer
+EXPORT_SYMS+=	nlmsg_end
+EXPORT_SYMS+=	nlmsg_flush
+
+EXPORT_SYMS= YES
+
+.include <bsd.kmod.mk>
diff --git a/sys/net/route.c b/sys/net/route.c
index 7d46ba2588ed..9773f899f5af 100644
--- a/sys/net/route.c
+++ b/sys/net/route.c
@@ -694,3 +694,14 @@ rt_routemsg_info(int cmd, struct rt_addrinfo *info, int fibnum)
 
 	return (rtsock_routemsg_info(cmd, info, fibnum));
 }
+
+/* Netlink-related callbacks needed to glue rtsock, netlink and linuxolator */
+static void
+ignore_route_event(uint32_t fibnum, const struct rib_cmd_info *rc)
+{
+}
+static struct rtbridge ignore_cb = { .route_f = ignore_route_event };
+
+void *linux_netlink_p = NULL; /* Callback pointer for Linux translator functions */
+struct rtbridge *rtsock_callback_p = &ignore_cb;
+struct rtbridge *netlink_callback_p = &ignore_cb;
diff --git a/sys/net/route/route_ctl.h b/sys/net/route/route_ctl.h
index 0b331e5f7d2c..d150da6264d4 100644
--- a/sys/net/route/route_ctl.h
+++ b/sys/net/route/route_ctl.h
@@ -189,4 +189,11 @@ void rib_unsubscribe_locked(struct rib_subscription *rs);
 void rib_notify(struct rib_head *rnh, enum rib_subscription_type type,
     struct rib_cmd_info *rc);
 
+/* Event bridge */
+typedef void route_event_f(uint32_t fibnum, const struct rib_cmd_info *rc);
+struct rtbridge{
+	route_event_f	*route_f;
+};
+extern struct rtbridge *rtsock_callback_p;
+extern struct rtbridge *netlink_callback_p;
 #endif
diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c
index 91ad8c79a5eb..99d962c972cb 100644
--- a/sys/net/rtsock.c
+++ b/sys/net/rtsock.c
@@ -219,6 +219,7 @@ static void	send_rtm_reply(struct socket *so, struct rt_msghdr *rtm,
 			int rtm_errno);
 static bool	can_export_rte(struct ucred *td_ucred, bool rt_is_host,
 			const struct sockaddr *rt_dst);
+static void	rtsock_notify_event(uint32_t fibnum, const struct rib_cmd_info *rc);
 
 static struct netisr_handler rtsock_nh = {
 	.nh_name = "rtsock",
@@ -274,6 +275,45 @@ VNET_SYSUNINIT(vnet_rts_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
     vnet_rts_uninit, 0);
 #endif
 
+static void
+report_route_event(const struct rib_cmd_info *rc, void *_cbdata)
+{
+	uint32_t fibnum = (uint32_t)(uintptr_t)_cbdata;
+	struct nhop_object *nh;
+
+	nh = rc->rc_cmd == RTM_DELETE ? rc->rc_nh_old : rc->rc_nh_new;
+	rt_routemsg(rc->rc_cmd, rc->rc_rt, nh, fibnum);
+}
+
+static void
+rts_handle_route_event(uint32_t fibnum, const struct rib_cmd_info *rc)
+{
+#ifdef ROUTE_MPATH
+	if ((rc->rc_nh_new && NH_IS_NHGRP(rc->rc_nh_new)) ||
+	    (rc->rc_nh_old && NH_IS_NHGRP(rc->rc_nh_old))) {
+		rib_decompose_notification(rc, report_route_event,
+		    (void *)(uintptr_t)fibnum);
+	} else
+#endif
+		report_route_event(rc, (void *)(uintptr_t)fibnum);
+}
+static struct rtbridge rtsbridge = { .route_f = rts_handle_route_event };
+static struct rtbridge *rtsbridge_orig_p;
+
+static void
+rtsock_notify_event(uint32_t fibnum, const struct rib_cmd_info *rc)
+{
+	netlink_callback_p->route_f(fibnum, rc);
+}
+
+static void
+rtsock_init(void)
+{
+	rtsbridge_orig_p = rtsock_callback_p;
+	rtsock_callback_p = &rtsbridge;
+}
+SYSINIT(rtsock_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rtsock_init, NULL);
+
 static void
 rts_handle_ifnet_arrival(void *arg __unused, struct ifnet *ifp)
 {
@@ -1074,6 +1114,7 @@ rts_send(struct socket *so, int flags, struct mbuf *m,
 		}
 		error = rib_action(fibnum, rtm->rtm_type, &info, &rc);
 		if (error == 0) {
+			rtsock_notify_event(fibnum, &rc);
 #ifdef ROUTE_MPATH
 			if (NH_IS_NHGRP(rc.rc_nh_new) ||
 			    (rc.rc_nh_old && NH_IS_NHGRP(rc.rc_nh_old))) {
@@ -1095,6 +1136,7 @@ rts_send(struct socket *so, int flags, struct mbuf *m,
 	case RTM_DELETE:
 		error = rib_action(fibnum, RTM_DELETE, &info, &rc);
 		if (error == 0) {
+			rtsock_notify_event(fibnum, &rc);
 #ifdef ROUTE_MPATH
 			if (NH_IS_NHGRP(rc.rc_nh_old) ||
 			    (rc.rc_nh_new && NH_IS_NHGRP(rc.rc_nh_new))) {
diff --git a/sys/netlink/netlink.h b/sys/netlink/netlink.h
new file mode 100644
index 000000000000..6a68dcec1382
--- /dev/null
+++ b/sys/netlink/netlink.h
@@ -0,0 +1,257 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Copyright (C) The Internet Society (2003).  All Rights Reserved.
+ *
+ * This document and translations of it may be copied and furnished to
+ * others, and derivative works that comment on or otherwise explain it
+ * or assist in its implementation may be prepared, copied, published
+ * and distributed, in whole or in part, without restriction of any
+ * kind, provided that the above copyright notice and this paragraph are
+ * included on all such copies and derivative works.  However, this
+ * document itself may not be modified in any way, such as by removing
+ * the copyright notice or references to the Internet Society or other
+ * Internet organizations, except as needed for the purpose of
+ * developing Internet standards in which case the procedures for
+ * copyrights defined in the Internet Standards process must be
+ * followed, or as required to translate it into languages other than
+ * English.
+ *
+ * The limited permissions granted above are perpetual and will not be
+ * revoked by the Internet Society or its successors or assignees.
+ *
+ * This document and the information contained herein is provided on an
+ * "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
+ * TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
+ * HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+
+ */
+
+/*
+ * This file contains structures and constants for RFC 3549 (Netlink)
+ * protocol. Some values have been taken from Linux implementation.
+ */
+
+#ifndef _NETLINK_NETLINK_H_
+#define _NETLINK_NETLINK_H_
+
+#include <sys/types.h>
+#include <sys/socket.h>
+
+struct sockaddr_nl {
+	uint8_t		nl_len;		/* sizeof(sockaddr_nl) */
+	sa_family_t	nl_family;	/* netlink family */
+	uint16_t	nl_pad;		/* reserved, set to 0 */
+	uint32_t	nl_pid;		/* desired port ID, 0 for auto-select */
+	uint32_t	nl_groups;	/* multicast groups mask to bind to */
+};
+
+#define	SOL_NETLINK			270
+
+/* Netlink socket options */
+#define NETLINK_ADD_MEMBERSHIP		1 /* Subscribe for the specified group notifications */
+#define NETLINK_DROP_MEMBERSHIP		2 /* Unsubscribe from the specified group */
+#define NETLINK_PKTINFO			3 /* XXX: not supported */
+#define NETLINK_BROADCAST_ERROR		4 /* XXX: not supported */
+#define NETLINK_NO_ENOBUFS		5 /* XXX: not supported */
+#define NETLINK_RX_RING			6 /* XXX: not supported */
+#define NETLINK_TX_RING			7 /* XXX: not supported */
+#define NETLINK_LISTEN_ALL_NSID		8 /* XXX: not supported */
+
+#define NETLINK_LIST_MEMBERSHIPS	9
+#define NETLINK_CAP_ACK			10 /* Send only original message header in the reply */
+#define NETLINK_EXT_ACK			11 /* Ack support for receiving additional TLVs in ack */
+#define NETLINK_GET_STRICT_CHK		12 /* Strict header checking */
+
+
+/*
+ * RFC 3549, 2.3.2 Netlink Message Header
+ */
+struct nlmsghdr {
+	uint32_t nlmsg_len;   /* Length of message including header */
+	uint16_t nlmsg_type;  /* Message type identifier */
+	uint16_t nlmsg_flags; /* Flags (NLM_F_) */
+	uint32_t nlmsg_seq;   /* Sequence number */
+	uint32_t nlmsg_pid;   /* Sending process port ID */
+};
+
+/*
+ * RFC 3549, 2.3.2 standard flag bits (nlmsg_flags)
+ */
+#define NLM_F_REQUEST		0x01	/* Indicates request to kernel */
+#define NLM_F_MULTI		0x02	/* Message is part of a group terminated by NLMSG_DONE msg */
+#define NLM_F_ACK		0x04	/* Reply with ack message containing resulting error code */
+#define NLM_F_ECHO		0x08	/* (not supported) Echo this request back */
+#define NLM_F_DUMP_INTR		0x10	/* Dump was inconsistent due to sequence change */
+#define NLM_F_DUMP_FILTERED	0x20	/* Dump was filtered as requested */
+
+/*
+ * RFC 3549, 2.3.2 Additional flag bits for GET requests
+ */
+#define NLM_F_ROOT		0x100	/* Return the complete table */
+#define NLM_F_MATCH		0x200	/* Return all entries matching criteria */
+#define NLM_F_ATOMIC		0x400	/* Return an atomic snapshot (ignored) */
+#define NLM_F_DUMP		(NLM_F_ROOT | NLM_F_MATCH)
+
+/*
+ * RFC 3549, 2.3.2 Additional flag bits for NEW requests
+ */
+#define NLM_F_REPLACE		0x100	/* Replace existing matching config object */
+#define NLM_F_EXCL		0x200	/* Don't replace the object if exists */
+#define NLM_F_CREATE		0x400	/* Create if it does not exist */
+#define NLM_F_APPEND		0x800	/* Add to end of list */
+
+/* Modifiers to DELETE requests */
+#define NLM_F_NONREC		0x100	/* Do not delete recursively */
+
+/* Flags for ACK message */
+#define NLM_F_CAPPED		0x100	/* request was capped */
+#define NLM_F_ACK_TLVS		0x200	/* extended ACK TLVs were included */
+
+/*
+ * RFC 3549, 2.3.2 standard message types (nlmsg_type).
+ */
+#define NLMSG_NOOP		0x1	/* Message is ignored. */
+#define NLMSG_ERROR		0x2	/* reply error code reporting */
+#define NLMSG_DONE		0x3	/* Message terminates a multipart message. */
+#define NLMSG_OVERRUN		0x4	/* overrun detected, data is lost */
+
+#define NLMSG_MIN_TYPE		0x10	/* < 0x10: reserved control messages */
+
+/*
+ * Definition of numbers assigned to the netlink subsystems.
+ */
+#define NETLINK_ROUTE		0	/* Routing/device hook */
+#define NETLINK_UNUSED		1	/* not supported */
+#define NETLINK_USERSOCK	2	/* not supported */
+#define NETLINK_FIREWALL	3	/* not supported */
+#define NETLINK_SOCK_DIAG	4	/* not supported */
+#define NETLINK_NFLOG		5	/* not supported */
+#define NETLINK_XFRM		6	/* (not supported) PF_SETKEY */
+#define NETLINK_SELINUX		7	/* not supported */
+#define NETLINK_ISCSI		8	/* not supported */
+#define NETLINK_AUDIT		9	/* not supported */
+#define NETLINK_FIB_LOOKUP	10	/* not supported */
+#define NETLINK_CONNECTOR	11	/* not supported */
+#define NETLINK_NETFILTER	12	/* not supported */
+#define NETLINK_IP6_FW		13	/* not supported  */
+#define NETLINK_DNRTMSG		14	/* not supported */
+#define NETLINK_KOBJECT_UEVENT	15	/* not supported */
+#define NETLINK_GENERIC		16	/* Generic netlink (dynamic families) */
+
+/*
+ * RFC 3549, 2.3.2.2 The ACK Netlink Message
+ */
+struct nlmsgerr {
+	int	error;
+	struct	nlmsghdr msg;
+};
+
+enum nlmsgerr_attrs {
+	NLMSGERR_ATTR_UNUSED,
+	NLMSGERR_ATTR_MSG	= 1, /* string, error message */
+	NLMSGERR_ATTR_OFFS	= 2, /* u32, offset of the invalid attr from nl header */
+	NLMSGERR_ATTR_COOKIE	= 3, /* binary, data to pass to userland */
+	NLMSGERR_ATTR_POLICY	= 4, /* not supported */
+	__NLMSGERR_ATTR_MAX,
+	NLMSGERR_ATTR_MAX = __NLMSGERR_ATTR_MAX - 1
+};
+
+
+#ifndef roundup2
+#define	roundup2(x, y)	(((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */
+#endif
+#define	NL_ITEM_ALIGN_SIZE		sizeof(uint32_t)
+#define	NL_ITEM_ALIGN(_len)		roundup2(_len, NL_ITEM_ALIGN_SIZE)
+#define	NL_ITEM_DATA(_ptr, _off)	((void *)((char *)(_ptr) + _off))
+#define	NL_ITEM_DATA_CONST(_ptr, _off)	((const void *)((const char *)(_ptr) + _off))
+
+#define	NL_ITEM_OK(_ptr, _len, _hlen, _LEN_M)	\
+	((_len) >= _hlen && _LEN_M(_ptr) >= _hlen && _LEN_M(_ptr) <= (_len))
+#define	NL_ITEM_NEXT(_ptr, _LEN_M)	((typeof(_ptr))((char *)(_ptr) + _LEN_M(_ptr)))
+#define	NL_ITEM_ITER(_ptr, _len, _LEN_MACRO)	\
+	((_len) -= _LEN_MACRO(_ptr), NL_ITEM_NEXT(_ptr, _LEN_MACRO))
+
+
+#ifndef _KERNEL
+/* part of netlink(3) API */
+#define NLMSG_ALIGNTO			NL_ITEM_ALIGN_SIZE
+#define NLMSG_ALIGN(_len)		NL_ITEM_ALIGN(_len)
+#define NLMSG_HDRLEN			((int)sizeof(struct nlmsghdr))
+#define NLMSG_LENGTH(_len)		((_len) + NLMSG_HDRLEN)
+#define NLMSG_SPACE(_len)		NLMSG_ALIGN(NLMSG_LENGTH(_len))	/* aligned size of header plus _len payload bytes */
+#define NLMSG_DATA(_hdr)		NL_ITEM_DATA(_hdr, NLMSG_HDRLEN)
+#define	_NLMSG_LEN(_hdr)		((int)(_hdr)->nlmsg_len)
+#define	_NLMSG_ALIGNED_LEN(_hdr)	NLMSG_ALIGN(_NLMSG_LEN(_hdr))
+#define	NLMSG_OK(_hdr, _len)		NL_ITEM_OK(_hdr, _len, NLMSG_HDRLEN, _NLMSG_LEN)
+#define NLMSG_PAYLOAD(_hdr,_len)	(_NLMSG_LEN(_hdr) - NLMSG_SPACE((_len)))
+#define	NLMSG_NEXT(_hdr, _len)		NL_ITEM_ITER(_hdr, _len, _NLMSG_ALIGNED_LEN)
+
+#else
+#define NLMSG_ALIGNTO 4U
+#define NLMSG_ALIGN(len) (((len) + NLMSG_ALIGNTO - 1) & ~(NLMSG_ALIGNTO - 1))
+#define NLMSG_HDRLEN ((int)NLMSG_ALIGN(sizeof(struct nlmsghdr)))
+#endif
+
+/*
+ * Base netlink attribute TLV header.
+ */
+struct nlattr {
+	uint16_t nla_len;	/* Total attribute length */
+	uint16_t nla_type;	/* Attribute type */
+};
+
+/*
+ *
+ * nla_type field encoding:
+ *
+ * 0                   1
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |N|O|  Attribute type           |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * N - attribute contains other attributes (mostly unused)
+ * O - encoded in network byte order (mostly unused)
+ * Note: N & O are mutually exclusive
+ *
+ * Note: attribute type value scope normally is either parent attribute
+ * or the message/message group.
+ */
+
+#define NLA_F_NESTED (1 << 15)
+#define NLA_F_NET_BYTEORDER (1 << 14)
+#define NLA_TYPE_MASK ~(NLA_F_NESTED | NLA_F_NET_BYTEORDER)
+
+#ifndef _KERNEL
+#define	NLA_ALIGNTO	NL_ITEM_ALIGN_SIZE
+#define	NLA_ALIGN(_len)	NL_ITEM_ALIGN(_len)
+#define	NLA_HDRLEN	((int)sizeof(struct nlattr))
+#endif
+
+#endif
diff --git a/sys/netlink/netlink_ctl.h b/sys/netlink/netlink_ctl.h
new file mode 100644
index 000000000000..fb5a8b30e0aa
--- /dev/null
+++ b/sys/netlink/netlink_ctl.h
@@ -0,0 +1,102 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _NETLINK_NETLINK_CTL_H_
+#define _NETLINK_NETLINK_CTL_H_
+
+/*
+ * This file provides headers for the public KPI of the netlink
+ * subsystem
+ */
+
+MALLOC_DECLARE(M_NETLINK);
+
+/*
+ * Macro for handling attribute TLVs
+ */
+#define _roundup2(x, y)         (((x)+((y)-1))&(~((y)-1)))
+
+#define NETLINK_ALIGN_SIZE      sizeof(uint32_t)
+#define NETLINK_ALIGN(_len)     _roundup2(_len, NETLINK_ALIGN_SIZE)
+
+#define NLA_ALIGN_SIZE          sizeof(uint32_t)
+#define NLA_ALIGN(_len)         _roundup2(_len, NLA_ALIGN_SIZE)
+#define	NLA_HDRLEN		((int)sizeof(struct nlattr))
+#define	NLA_DATA_LEN(_nla)	((int)((_nla)->nla_len - NLA_HDRLEN))
+#define	NLA_DATA(_nla)		NL_ITEM_DATA(_nla, NLA_HDRLEN)
+#define	NLA_DATA_CONST(_nla)	NL_ITEM_DATA_CONST(_nla, NLA_HDRLEN)
+#define	NLA_TYPE(_nla)		((_nla)->nla_type & 0x3FFF)
+
+#ifndef	typeof
+#define	typeof	__typeof
+#endif
+
+#define NLA_NEXT(_attr) ((struct nlattr *)((char *)(_attr) + NLA_ALIGN((_attr)->nla_len))) /* attribute following _attr, at the next aligned offset */
+#define	_NLA_END(_start, _len)	((char *)(_start) + (_len))
+#define NLA_FOREACH(_attr, _start, _len)      \
+        for (typeof(_attr) _end = (typeof(_attr))_NLA_END(_start, _len), _attr = (_start);		\
+		((char *)_attr < (char *)_end) && \
+		((char *)NLA_NEXT(_attr) <= (char *)_end);	\
+		_attr = (_len -= NLA_ALIGN(_attr->nla_len), NLA_NEXT(_attr)))
+
+#define	NL_ARRAY_LEN(_a)	(sizeof(_a) / sizeof((_a)[0]))
+
+#include <netlink/netlink_message_writer.h>
+#include <netlink/netlink_message_parser.h>
+
+
+/* Protocol handlers */
+struct nl_pstate;
+typedef int (*nl_handler_f)(struct nlmsghdr *hdr, struct nl_pstate *npt);
+
+bool netlink_register_proto(int proto, const char *proto_name, nl_handler_f handler);
+bool netlink_unregister_proto(int proto);
+
+/* Common helpers */
+bool nl_has_listeners(int netlink_family, uint32_t groups_mask);
+bool nlp_has_priv(struct nlpcb *nlp, int priv);
+
+/* netlink_generic.c */
+struct genl_cmd {
+	const char	*cmd_name;
+	nl_handler_f	cmd_cb;
+	uint32_t	cmd_flags;
+	uint32_t	cmd_priv;
+	uint32_t	cmd_num;
+};
+
+uint32_t genl_register_family(const char *family_name, size_t hdrsize,
+    int family_version, int max_attr_idx);
+bool genl_unregister_family(const char *family_name);
+bool genl_register_cmds(const char *family_name, const struct genl_cmd *cmds,
+    int count);
+uint32_t genl_register_group(const char *family_name, const char *group_name);
+
+/* Debug */
+uint32_t nlp_get_pid(const struct nlpcb *nlp);
+
+#endif
diff --git a/sys/netlink/netlink_debug.h b/sys/netlink/netlink_debug.h
new file mode 100644
index 000000000000..6ff6811c6a5a
--- /dev/null
+++ b/sys/netlink/netlink_debug.h
@@ -0,0 +1,82 @@
+/*-
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETLINK_NETLINK_DEBUG_H_
+#define	_NETLINK_NETLINK_DEBUG_H_
+
+#define	_DEBUG_SYSCTL_OID	_net_netlink_debug
+#include <net/route/route_debug.h>
+
+SYSCTL_DECL(_net_netlink_debug);
+
+/*
+ * Generic debug
+ * [nl_domain] func_name: debug text
+ */
+#define	NL_LOG	RT_LOG
+
+/*
+ * Logging for events specific for particular process
+ * Example: [nl_domain] PID 4834 fdump_sa: unsupported family: 45
+ */
+#define	NL_RAW_PID_LOG(_l, _pid, _fmt, ...)	NL_RAW_PID_LOG_##_l(_l, _pid, _fmt, ## __VA_ARGS__)
+#define	_NL_RAW_PID_LOG(_l, _pid, _fmt, ...)	if (_DEBUG_PASS_MSG(_l)) {	\
+	_output("[" DEBUG_PREFIX_NAME "] PID %u %s: " _fmt "\n", _pid, __func__, ##__VA_ARGS__); \
+}
+
+#define	NLP_LOG(_l, _nlp, _fmt, ...)	NL_RAW_PID_LOG_##_l(_l, nlp_get_pid(_nlp), _fmt, ## __VA_ARGS__)
+
+#if DEBUG_MAX_LEVEL>=LOG_DEBUG3
+#define	NL_RAW_PID_LOG_LOG_DEBUG3	_NL_RAW_PID_LOG
+#else
+#define	NL_RAW_PID_LOG_LOG_DEBUG3(_l, _pid, _fmt, ...)
+#endif
+#if DEBUG_MAX_LEVEL>=LOG_DEBUG2
+#define	NL_RAW_PID_LOG_LOG_DEBUG2	_NL_RAW_PID_LOG
+#else
+#define	NL_RAW_PID_LOG_LOG_DEBUG2(_l, _pid, _fmt, ...)
+#endif
+#if DEBUG_MAX_LEVEL>=LOG_DEBUG
+#define	NL_RAW_PID_LOG_LOG_DEBUG	_NL_RAW_PID_LOG
+#else
+#define	NL_RAW_PID_LOG_LOG_DEBUG(_l, _pid, _fmt, ...)
+#endif
+#if DEBUG_MAX_LEVEL>=LOG_INFO
+#define	NL_RAW_PID_LOG_LOG_INFO	_NL_RAW_PID_LOG
+#else
+#define	NL_RAW_PID_LOG_LOG_INFO(_l, _pid, _fmt, ...)
+#endif
+#define	NL_RAW_PID_LOG_LOG_NOTICE	_NL_RAW_PID_LOG
+#define	NL_RAW_PID_LOG_LOG_ERR         _NL_RAW_PID_LOG
+#define	NL_RAW_PID_LOG_LOG_WARNING	_NL_RAW_PID_LOG
+
+
+
+#endif
diff --git a/sys/netlink/netlink_domain.c b/sys/netlink/netlink_domain.c
new file mode 100644
index 000000000000..159dfd03724d
--- /dev/null
+++ b/sys/netlink/netlink_domain.c
@@ -0,0 +1,689 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This file contains socket and protocol bindings for netlink.
+ */
+
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/domain.h>
+#include <sys/mbuf.h>
+#include <sys/protosw.h>
+#include <sys/ck.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysent.h>
+#include <sys/syslog.h>
+#include <sys/priv.h> /* priv_check */
+
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_var.h>
+
+#define	DEBUG_MOD_NAME	nl_domain
+#define	DEBUG_MAX_LEVEL	LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG);
+
+
+/*
+ * Locking helpers for the rmlock embedded in struct nl_control (ctl_lock).
+ *
+ * NLCTL_TRACKER declares the local "nl_tracker" variable that NLCTL_RLOCK
+ * and NLCTL_RUNLOCK reference by name, so it must appear in the declarations
+ * of every function that takes the read lock.
+ */
+#define	NLCTL_TRACKER		struct rm_priotracker nl_tracker
+#define	NLCTL_RLOCK(_ctl)	rm_rlock(&((_ctl)->ctl_lock), &nl_tracker)
+#define	NLCTL_RUNLOCK(_ctl)	rm_runlock(&((_ctl)->ctl_lock), &nl_tracker)
+
+/* Exclusive (writer) side of the same lock; no tracker is needed. */
+#define	NLCTL_WLOCK(_ctl)	rm_wlock(&((_ctl)->ctl_lock))
+#define	NLCTL_WUNLOCK(_ctl)	rm_wunlock(&((_ctl)->ctl_lock))
+
+/* Send buffer size handed to soreserve() at attach time; read-write sysctl. */
+static u_long nl_sendspace = NLSNDQ;
+SYSCTL_ULONG(_net_netlink, OID_AUTO, sendspace, CTLFLAG_RW, &nl_sendspace, 0,
+    "Default netlink socket send space");
+
+/*
+ * Receive buffer size handed to soreserve() at attach time; read-write sysctl.
+ * NOTE(review): initialized from NLSNDQ, the same constant as the send
+ * space - confirm this is intended and not a copy/paste of the line above.
+ */
+static u_long nl_recvspace = NLSNDQ;
+SYSCTL_ULONG(_net_netlink, OID_AUTO, recvspace, CTLFLAG_RW, &nl_recvspace, 0,
+    "Default netlink socket receive space");
+
+/* Defined outside this file; no use of it is visible in this part of the file. */
+extern u_long sb_max_adj;
+static u_long nl_maxsockbuf = 512 * 1024 * 1024; /* 512M, XXX: init based on physmem */
+
+/*
+ * Returns the process id stored in @nlp (its nl_process_id field).
+ * Used by the NLP_LOG() macro to tag per-socket log lines.
+ */
+uint32_t
+nlp_get_pid(const struct nlpcb *nlp)
+{
+	const uint32_t owner_pid = nlp->nl_process_id;
+
+	return (owner_pid);
+}
+
+/*
+ * Finds the nlpcb bound to @port_id by walking the current VNET's
+ * ctl_port_head list.
+ *
+ * Returns the matching nlpcb, or NULL when no socket is bound to that port.
+ *
+ * NOTE(review): the previous comment asked callers to hold "nlsock_mtx";
+ * no lock of that name is visible in this file, so this presumably means
+ * the nl_control lock - confirm against the callers.
+ */
+static struct nlpcb *
+nl_port_lookup(uint32_t port_id)
+{
+	struct nlpcb *match = NULL;
+	struct nlpcb *cur;
+
+	CK_LIST_FOREACH(cur, &V_nl_ctl->ctl_port_head, nl_port_next) {
+		if (cur->nl_port == port_id) {
+			match = cur;
+			break;
+		}
+	}
+
+	return (match);
+}
+
+/*
+ * Replaces the multicast group subscription mask of @nlp with @nl_groups,
+ * logging the old and the new mask at LOG_DEBUG2.
+ *
+ * NOTE(review): the "_locked" suffix suggests the caller holds a lock;
+ * which one is not visible from this function - confirm against callers.
+ */
+static void
+nl_update_groups_locked(struct nlpcb *nlp, uint64_t nl_groups)
+{
+	/*
+	 * The masks are 64 bits wide: print them via %j/uintmax_t so the
+	 * format matches the argument width on 32-bit platforms as well
+	 * ("%lX" is only correct where long is 64 bits).
+	 */
+	NL_LOG(LOG_DEBUG2, "socket %p, groups 0x%jX -> 0x%jX",
+	    nlp->nl_socket, (uintmax_t)nlp->nl_groups, (uintmax_t)nl_groups);
+	nlp->nl_groups = nl_groups;
+}
+
+/*
+ * Broadcasts message @m to the protocol @proto group specified by @group_id.
+ *
+ * @m:            mbuf holding the message(s); always consumed by this
+ *                function (handed to nl_send_one() or freed).
+ * @num_messages: message count, passed through to nl_send_one().
+ * @proto:        netlink protocol the listeners must have been created with.
+ * @group_id:     1-based group number; it selects bit (@group_id - 1) of the
+ *                64-bit per-socket nl_groups mask, so valid values are 1..64.
+ *
+ * Every matching socket except the last one receives a copy of @m; the last
+ * one receives @m itself, which saves one copy.
+ */
+void
+nl_send_group(struct mbuf *m, int num_messages, int proto, int group_id)
+{
+	struct nlpcb *nlp_last = NULL;
+	struct nlpcb *nlp;
+	NLCTL_TRACKER;
+
+	IF_DEBUG_LEVEL(LOG_DEBUG2) {
+		struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *);
+		NL_LOG(LOG_DEBUG2, "MCAST mbuf len %u msg type %d len %u to group %d/%d",
+		    m->m_len, hdr->nlmsg_type, hdr->nlmsg_len, proto, group_id);
+	}
+
+	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
+	if (__predict_false(ctl == NULL)) {
+		/*
+		 * Can be the case when notification is sent within VNET
+		 * which doesn't have any netlink sockets.
+		 */
+		m_freem(m);
+		return;
+	}
+
+	/*
+	 * A group outside 1..64 has no bit in the 64-bit mask, and shifting
+	 * by such an amount is undefined behaviour: nobody can be subscribed
+	 * to it, so just drop the message.
+	 */
+	if (__predict_false(group_id < 1 || group_id > 64)) {
+		m_freem(m);
+		return;
+	}
+
+	NLCTL_RLOCK(ctl);
+
+	int io_flags = NL_IOF_UNTRANSLATED;
+	/*
+	 * Shift a 64-bit one: a plain "1" is an int, which makes the shift
+	 * undefined for groups above 31 and never sets the upper mask bits.
+	 */
+	uint64_t groups_mask = (uint64_t)1 << (group_id - 1);
+
+	CK_LIST_FOREACH(nlp, &ctl->ctl_pcb_head, nl_next) {
+		if ((nlp->nl_groups & groups_mask) != 0 && nlp->nl_proto == proto) {
+			if (nlp_last != NULL) {
+				struct mbuf *m_copy;
+				m_copy = m_copym(m, 0, M_COPYALL, M_NOWAIT);
+				if (m_copy != NULL)
+					nl_send_one(m_copy, nlp_last, num_messages, io_flags);
+				else {
+					/*
+					 * Copy failed: nothing is queued for
+					 * this listener, but its socket (if
+					 * still attached) is woken up anyway.
+					 */
+					NLP_LOCK(nlp_last);
+					if (nlp_last->nl_socket != NULL)
+						sorwakeup(nlp_last->nl_socket);
+					NLP_UNLOCK(nlp_last);
+				}
+			}
+			/* Delivery is deferred by one match, see above. */
+			nlp_last = nlp;
+		}
+	}
+	if (nlp_last != NULL)
+		nl_send_one(m, nlp_last, num_messages, io_flags);
+	else
+		m_freem(m);
+
+	NLCTL_RUNLOCK(ctl);
+}
+
+/*
+ * Reports whether the current VNET has netlink state set up (V_nl_ctl is
+ * non-NULL).
+ *
+ * @netlink_family and @groups_mask are currently not consulted: the answer
+ * is the same for every family and group.
+ */
+bool
+nl_has_listeners(int netlink_family, uint32_t groups_mask)
+{
+	if (V_nl_ctl == NULL)
+		return (false);
+
+	return (true);
+}
+
+/*
+ * Checks privilege @priv against the credentials stored in @nlp (nl_cred).
+ *
+ * Returns true when priv_check_cred() returns 0, false otherwise.
+ */
+bool
+nlp_has_priv(struct nlpcb *nlp, int priv)
+{
+	int error;
+
+	error = priv_check_cred(nlp->nl_cred, priv);
+
+	return (error == 0);
+}
+
+/*
+ * Picks a port id for a socket that asked for automatic assignment.
+ *
+ * Returns the pid of the current process when no socket is bound to it yet,
+ * otherwise a random id that was unused at lookup time. When all 16 random
+ * attempts collide, the (already taken) pid is returned; the caller,
+ * nl_bind_locked(), looks the port up again and fails with EADDRINUSE.
+ */
+static uint32_t
+nl_find_port(void)
+{
+	/*
+	 * app can open multiple netlink sockets.
+	 * Start with current pid, if already taken,
+	 * try random numbers in the 256k..320k space
+	 * (65536 * 4 + [0..65535]), avoiding clash with pids.
+	 */
+	if (nl_port_lookup(curproc->p_pid) == NULL)
+		return (curproc->p_pid);
+	for (int i = 0; i < 16; i++) {
+		uint32_t nl_port = (arc4random() % 65536) + 65536 * 4;
+		if (nl_port_lookup(nl_port) == NULL)
+			return (nl_port);
+		/* No "\n" here, matching the other NL_LOG() calls in this file. */
+		NL_LOG(LOG_DEBUG3, "tried %u", nl_port);
+	}
+	return (curproc->p_pid);
+}
+
+/*
+ * Binds @nlp to the port requested in @snl and applies the requested
+ * group mask.
+ *
+ * - Already bound socket: the port cannot change; a different nl_pid fails
+ *   with EINVAL, the same one only updates the groups.
+ * - Unbound socket: nl_pid == 0 requests automatic selection via
+ *   nl_find_port(); the chosen port is written back into @snl. The port
+ *   must not be in use (EADDRINUSE), then the socket is linked into the
+ *   VNET's ctl_port_head list.
+ *
+ * Returns 0 on success or an errno value.
+ *
+ * NOTE(review): the "_locked" suffix suggests the caller holds a lock
+ * protecting the port list; it is not taken here - confirm against callers.
+ */
+static int
+nl_bind_locked(struct nlpcb *nlp, struct sockaddr_nl *snl)
+{
+	if (nlp->nl_bound) {
+		if (nlp->nl_port != snl->nl_pid) {
+			/* Port ids are unsigned 32-bit values: print with %u. */
+			NL_LOG(LOG_DEBUG,
+			    "bind() failed: program pid %u "
+			    "is different from provided pid %u",
+			    nlp->nl_port, snl->nl_pid);
+			return (EINVAL); // XXX: better error
+		}
+	} else {
+		if (snl->nl_pid == 0)
+			snl->nl_pid = nl_find_port();
+		if (nl_port_lookup(snl->nl_pid) != NULL)
+			return (EADDRINUSE);
+		nlp->nl_port = snl->nl_pid;
+		nlp->nl_bound = true;
+		CK_LIST_INSERT_HEAD(&V_nl_ctl->ctl_port_head, nlp, nl_port_next);
+	}
+	nl_update_groups_locked(nlp, snl->nl_groups);
+
+	return (0);
+}
+
+static int
+nl_pru_attach(struct socket *so, int proto, struct thread *td)
+{
+	struct nlpcb *nlp;
+	int error;
+
+	if (__predict_false(netlink_unloading != 0))
+		return (EAFNOSUPPORT);
+
+	error = nl_verify_proto(proto);
+	if (error != 0)
+		return (error);
+
+	bool is_linux = SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX;
+	NL_LOG(LOG_DEBUG2, "socket %p, %sPID %d: attaching socket to %s",
+	    so, is_linux ? "(linux) " : "", curproc->p_pid,
+	    nl_get_proto_name(proto));
+
+	/* Create per-VNET state on first socket init */
+	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
+	if (ctl == NULL)
+		ctl = vnet_nl_ctl_init();
+	KASSERT(V_nl_ctl != NULL, ("nl_attach: vnet_sock_init() failed"));
+
+	MPASS(sotonlpcb(so) == NULL);
+
+	nlp = malloc(sizeof(struct nlpcb), M_PCB, M_WAITOK | M_ZERO);
+	error = soreserve(so, nl_sendspace, nl_recvspace);
+	if (error != 0) {
+		free(nlp, M_PCB);
+		return (error);
+	}
+	so->so_pcb = nlp;
+	nlp->nl_socket = so;
+	/* Copy so_cred to avoid having socket_var.h in every header */
+	nlp->nl_cred = so->so_cred;
+	nlp->nl_proto = proto;
+	nlp->nl_process_id = curproc->p_pid;
+	nlp->nl_linux = is_linux;
+	nlp->nl_active = true;
+	NLP_LOCK_INIT(nlp);
+	refcount_init(&nlp->nl_refcount, 1);
+	nl_init_io(nlp);
+
+	nlp->nl_taskqueue = taskqueue_create("netlink_socket", M_WAITOK,
+	    taskqueue_thread_enqueue, &nlp->nl_taskqueue);
+	TASK_INIT(&nlp->nl_task, 0, nl_taskqueue_handler, nlp);
+	taskqueue_start_threads(&nlp->nl_taskqueue, 1, PWAIT,
+	    "netlink_socket (PID %u)", nlp->nl_process_id);
+
+	NLCTL_WLOCK(ctl);
+	/* XXX: check ctl is still alive */
+	CK_LIST_INSERT_HEAD(&ctl->ctl_pcb_head, nlp, nl_next);
*** 8742 LINES SKIPPED ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202210011419.291EJ3aa000309>