Skip site navigation (1)Skip section navigation (2)
Date:      Fri, 20 Oct 2017 08:20:15 +0000 (UTC)
From:      Hans Petter Selasky <hselasky@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r324792 - in head/sys: dev/cxgbe/iw_cxgbe dev/mlx4/mlx4_ib dev/mlx5/mlx5_ib ofed/drivers/infiniband/core ofed/drivers/infiniband/hw/mthca ofed/include/rdma
Message-ID:  <201710200820.v9K8KF0J032638@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: hselasky
Date: Fri Oct 20 08:20:15 2017
New Revision: 324792
URL: https://svnweb.freebsd.org/changeset/base/324792

Log:
  The remote DMA TCP portspace selector, RDMA_PS_TCP, is used for both
  iWarp and RoCE in ibcore. The selection of RDMA_PS_TCP can not be used
  to indicate iWarp protocol use. Backport the proper IB device
  capabilities from Linux upstream to distinguish between iWarp and
  RoCE. Only allocate the additional socket required for iWarp for RDMA
  IDs when at least one iWarp device present. This resolves
  interopability issues between iWarp and RoCE in ibcore
  
  Reviewed by:		np @
  Differential Revision:	https://reviews.freebsd.org/D12563
  Sponsored by:		Mellanox Technologies
  MFC after:		3 days

Modified:
  head/sys/dev/cxgbe/iw_cxgbe/provider.c
  head/sys/dev/mlx4/mlx4_ib/mlx4_ib_main.c
  head/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c
  head/sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c
  head/sys/ofed/drivers/infiniband/core/cma.c
  head/sys/ofed/drivers/infiniband/core/device.c
  head/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c
  head/sys/ofed/include/rdma/ib_mad.h
  head/sys/ofed/include/rdma/ib_verbs.h

Modified: head/sys/dev/cxgbe/iw_cxgbe/provider.c
==============================================================================
--- head/sys/dev/cxgbe/iw_cxgbe/provider.c	Fri Oct 20 07:42:00 2017	(r324791)
+++ head/sys/dev/cxgbe/iw_cxgbe/provider.c	Fri Oct 20 08:20:15 2017	(r324792)
@@ -388,6 +388,24 @@ c4iw_query_port(struct ib_device *ibdev, u8 port, stru
 	return 0;
 }
 
+static int c4iw_port_immutable(struct ib_device *ibdev, u8 port_num,
+			       struct ib_port_immutable *immutable)
+{
+	struct ib_port_attr attr;
+	int err;
+
+	immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
+
+	err = ib_query_port(ibdev, port_num, &attr);
+	if (err)
+		return err;
+
+	immutable->pkey_tbl_len = attr.pkey_tbl_len;
+	immutable->gid_tbl_len = attr.gid_tbl_len;
+
+	return 0;
+}
+
 /*
  * Returns -errno on error.
  */
@@ -471,6 +489,7 @@ c4iw_register_device(struct c4iw_dev *dev)
 	ibdev->post_send = c4iw_post_send;
 	ibdev->post_recv = c4iw_post_receive;
 	ibdev->uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION;
+	ibdev->get_port_immutable = c4iw_port_immutable;
 
 	iwcm = kmalloc(sizeof(*iwcm), GFP_KERNEL);
 	if (iwcm == NULL)

Modified: head/sys/dev/mlx4/mlx4_ib/mlx4_ib_main.c
==============================================================================
--- head/sys/dev/mlx4/mlx4_ib/mlx4_ib_main.c	Fri Oct 20 07:42:00 2017	(r324791)
+++ head/sys/dev/mlx4/mlx4_ib/mlx4_ib_main.c	Fri Oct 20 08:20:15 2017	(r324792)
@@ -2229,6 +2229,38 @@ static int mlx4_ib_dev_idx(struct mlx4_dev *dev)
 	return -1;
 }
 
+static int mlx4_port_immutable(struct ib_device *ibdev, u8 port_num,
+			       struct ib_port_immutable *immutable)
+{
+	struct ib_port_attr attr;
+	struct mlx4_ib_dev *mdev = to_mdev(ibdev);
+	int err;
+
+	if (mlx4_ib_port_link_layer(ibdev, port_num) == IB_LINK_LAYER_INFINIBAND) {
+		immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB;
+		immutable->max_mad_size = IB_MGMT_MAD_SIZE;
+	} else {
+		if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE)
+			immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE;
+		if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCEV2)
+			immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE |
+				RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
+		immutable->core_cap_flags |= RDMA_CORE_PORT_RAW_PACKET;
+		if (immutable->core_cap_flags & (RDMA_CORE_PORT_IBA_ROCE |
+		    RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP))
+			immutable->max_mad_size = IB_MGMT_MAD_SIZE;
+	}
+
+	err = ib_query_port(ibdev, port_num, &attr);
+	if (err)
+		return err;
+
+	immutable->pkey_tbl_len = attr.pkey_tbl_len;
+	immutable->gid_tbl_len = attr.gid_tbl_len;
+
+	return 0;
+}
+
 static void *mlx4_ib_add(struct mlx4_dev *dev)
 {
 	struct mlx4_ib_dev *ibdev;
@@ -2360,6 +2392,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 	ibdev->ib_dev.attach_mcast	= mlx4_ib_mcg_attach;
 	ibdev->ib_dev.detach_mcast	= mlx4_ib_mcg_detach;
 	ibdev->ib_dev.process_mad	= mlx4_ib_process_mad;
+	ibdev->ib_dev.get_port_immutable = mlx4_port_immutable;
 	ibdev->ib_dev.get_netdev	= mlx4_ib_get_netdev;
 	ibdev->ib_dev.ioctl		= mlx4_ib_ioctl;
 	ibdev->ib_dev.query_values	= mlx4_ib_query_values;

Modified: head/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c
==============================================================================
--- head/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c	Fri Oct 20 07:42:00 2017	(r324791)
+++ head/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c	Fri Oct 20 08:20:15 2017	(r324792)
@@ -1670,6 +1670,57 @@ static void destroy_dev_resources(struct mlx5_ib_resou
 	mlx5_ib_dealloc_pd(devr->p0);
 }
 
+static u32 get_core_cap_flags(struct ib_device *ibdev)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibdev);
+	enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1);
+	u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type);
+	u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version);
+	u32 ret = 0;
+
+	if (ll == IB_LINK_LAYER_INFINIBAND)
+		return RDMA_CORE_PORT_IBA_IB;
+
+	ret = RDMA_CORE_PORT_RAW_PACKET;
+
+	if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP))
+		return ret;
+
+	if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP))
+		return ret;
+
+	if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP)
+		ret |= RDMA_CORE_PORT_IBA_ROCE;
+
+	if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP)
+		ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
+
+	return ret;
+}
+
+static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num,
+			       struct ib_port_immutable *immutable)
+{
+	struct ib_port_attr attr;
+	struct mlx5_ib_dev *dev = to_mdev(ibdev);
+	enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, port_num);
+	int err;
+
+	immutable->core_cap_flags = get_core_cap_flags(ibdev);
+
+	err = ib_query_port(ibdev, port_num, &attr);
+	if (err)
+		return err;
+
+	immutable->pkey_tbl_len = attr.pkey_tbl_len;
+	immutable->gid_tbl_len = attr.gid_tbl_len;
+	immutable->core_cap_flags = get_core_cap_flags(ibdev);
+	if ((ll == IB_LINK_LAYER_INFINIBAND) || MLX5_CAP_GEN(dev->mdev, roce))
+		immutable->max_mad_size = IB_MGMT_MAD_SIZE;
+
+	return 0;
+}
+
 static void enable_dc_tracer(struct mlx5_ib_dev *dev)
 {
 	struct device *device = dev->ib_dev.dma_device;
@@ -2115,6 +2166,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 	dev->ib_dev.attach_mcast	= mlx5_ib_mcg_attach;
 	dev->ib_dev.detach_mcast	= mlx5_ib_mcg_detach;
 	dev->ib_dev.process_mad		= mlx5_ib_process_mad;
+	dev->ib_dev.get_port_immutable  = mlx5_port_immutable;
 	dev->ib_dev.alloc_fast_reg_mr	= mlx5_ib_alloc_fast_reg_mr;
 	dev->ib_dev.alloc_fast_reg_page_list = mlx5_ib_alloc_fast_reg_page_list;
 	dev->ib_dev.free_fast_reg_page_list  = mlx5_ib_free_fast_reg_page_list;

Modified: head/sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c
==============================================================================
--- head/sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c	Fri Oct 20 07:42:00 2017	(r324791)
+++ head/sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c	Fri Oct 20 08:20:15 2017	(r324792)
@@ -1576,7 +1576,7 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, cons
 	int gid_type;
 
 	if ((ll == IB_LINK_LAYER_ETHERNET) || (ah->ah_flags & IB_AH_GRH)) {
-		int len = dev->ib_dev.gid_tbl_len[port - 1];
+		int len = dev->mdev->port_caps[port - 1].gid_table_len;
 		if (ah->grh.sgid_index >= len) {
 			printf("mlx5_ib: ERR: ""sgid_index (%u) too large. max is %d\n", ah->grh.sgid_index, len - 1);
 			return -EINVAL;

Modified: head/sys/ofed/drivers/infiniband/core/cma.c
==============================================================================
--- head/sys/ofed/drivers/infiniband/core/cma.c	Fri Oct 20 07:42:00 2017	(r324791)
+++ head/sys/ofed/drivers/infiniband/core/cma.c	Fri Oct 20 08:20:15 2017	(r324792)
@@ -75,11 +75,6 @@ static int def_prec2sl = 3;
 module_param_named(def_prec2sl, def_prec2sl, int, 0644);
 MODULE_PARM_DESC(def_prec2sl, "Default value for SL priority with RoCE. Valid values 0 - 7");
 
-static int unify_tcp_port_space = 1;
-module_param(unify_tcp_port_space, int, 0644);
-MODULE_PARM_DESC(unify_tcp_port_space, "Unify the host TCP and RDMA port "
-		 "space allocation (default=1)");
-
 static int debug_level = 0;
 #define cma_pr(level, priv, format, arg...)		\
 	printk(level "CMA: %p: %s: " format, ((struct rdma_id_priv *) priv) , __func__, ## arg)
@@ -198,6 +193,7 @@ struct rdma_id_private {
 	/* cache for mc record params */
 	struct ib_sa_mcmember_rec rec;
 	int is_valid_rec;
+	int unify_ps_tcp;
 };
 
 struct cma_multicast {
@@ -822,29 +818,24 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib
 	int ret = 0;
 
 	id_priv = container_of(id, struct rdma_id_private, id);
-	switch (rdma_node_get_transport(id_priv->id.device->node_type)) {
-	case RDMA_TRANSPORT_IB:
+	if (rdma_cap_ib_cm(id->device, id->port_num)) {
 		if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD))
 			ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask);
 		else
 			ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr,
 						 qp_attr_mask);
+
 		if (qp_attr->qp_state == IB_QPS_RTR)
 			qp_attr->rq_psn = id_priv->seq_num;
-		break;
-	case RDMA_TRANSPORT_IWARP:
-	case RDMA_TRANSPORT_SCIF:
+	} else if (rdma_cap_iw_cm(id->device, id->port_num)) {
 		if (!id_priv->cm_id.iw) {
 			qp_attr->qp_access_flags = 0;
 			*qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS;
 		} else
 			ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr,
 						 qp_attr_mask);
-		break;
-	default:
+	} else
 		ret = -ENOSYS;
-		break;
-	}
 
 	return ret;
 }
@@ -1105,6 +1096,10 @@ static void __rdma_free(struct work_struct *work)
 	if (id_priv->internal_id)
 		cma_deref_id(id_priv->id.context);
 
+	if (id_priv->sock != NULL && !id_priv->internal_id &&
+	    !id_priv->unify_ps_tcp)
+		sock_release(id_priv->sock);
+
 	kfree(id_priv->id.route.path_rec);
 	kfree(id_priv);
 }
@@ -1128,8 +1123,7 @@ void rdma_destroy_id(struct rdma_cm_id *id)
 	mutex_unlock(&id_priv->handler_mutex);
 
 	if (id_priv->cma_dev) {
-		switch (rdma_node_get_transport(id_priv->id.device->node_type)) {
-		case RDMA_TRANSPORT_IB:
+		if (rdma_cap_ib_cm(id_priv->id.device, 1)) {
 			spin_lock_irqsave(&id_priv->cm_lock, flags);
 			if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib)) {
 				ib = id_priv->cm_id.ib;
@@ -1138,14 +1132,9 @@ void rdma_destroy_id(struct rdma_cm_id *id)
 				ib_destroy_cm_id(ib);
 			} else
 				spin_unlock_irqrestore(&id_priv->cm_lock, flags);
-			break;
-		case RDMA_TRANSPORT_IWARP:
-		case RDMA_TRANSPORT_SCIF:
+		} else if (rdma_cap_iw_cm(id_priv->id.device, 1)) {
 			if (id_priv->cm_id.iw)
 				iw_destroy_cm_id(id_priv->cm_id.iw);
-			break;
-		default:
-			break;
 		}
 		cma_leave_mc_groups(id_priv);
 		cma_release_dev(id_priv);
@@ -2141,27 +2130,15 @@ int rdma_resolve_route(struct rdma_cm_id *id, int time
 		return -EINVAL;
 
 	atomic_inc(&id_priv->refcount);
-	switch (rdma_node_get_transport(id->device->node_type)) {
-	case RDMA_TRANSPORT_IB:
-		switch (rdma_port_get_link_layer(id->device, id->port_num)) {
-		case IB_LINK_LAYER_INFINIBAND:
-			ret = cma_resolve_ib_route(id_priv, timeout_ms);
-			break;
-		case IB_LINK_LAYER_ETHERNET:
-			ret = cma_resolve_iboe_route(id_priv);
-			break;
-		default:
-			ret = -ENOSYS;
-		}
-		break;
-	case RDMA_TRANSPORT_IWARP:
-	case RDMA_TRANSPORT_SCIF:
+	if (rdma_cap_ib_sa(id->device, id->port_num))
+		ret = cma_resolve_ib_route(id_priv, timeout_ms);
+	else if (rdma_protocol_roce(id->device, id->port_num))
+		ret = cma_resolve_iboe_route(id_priv);
+	else if (rdma_protocol_iwarp(id->device, id->port_num))
 		ret = cma_resolve_iw_route(id_priv, timeout_ms);
-		break;
-	default:
+	else
 		ret = -ENOSYS;
-		break;
-	}
+
 	if (ret)
 		goto err;
 
@@ -2608,6 +2585,10 @@ static int cma_get_tcp_port(struct rdma_id_private *id
 			(struct sockaddr *) &id_priv->id.route.addr.src_addr,
 			ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr));
 #else
+	SOCK_LOCK(sock);
+	sock->so_options |= SO_REUSEADDR;
+	SOCK_UNLOCK(sock);
+
 	ret = -sobind(sock,
 			(struct sockaddr *)&id_priv->id.route.addr.src_addr,
 			curthread);
@@ -2632,6 +2613,7 @@ static int cma_get_tcp_port(struct rdma_id_private *id
 
 static int cma_get_port(struct rdma_id_private *id_priv)
 {
+	struct cma_device *cma_dev;
 	struct idr *ps;
 	int ret;
 
@@ -2641,7 +2623,18 @@ static int cma_get_port(struct rdma_id_private *id_pri
 		break;
 	case RDMA_PS_TCP:
 		ps = &tcp_ps;
-		if (unify_tcp_port_space) {
+
+		mutex_lock(&lock);
+		/* check if there are any iWarp IB devices present */
+		list_for_each_entry(cma_dev, &dev_list, list) {
+			if (rdma_protocol_iwarp(cma_dev->device, 1)) {
+				id_priv->unify_ps_tcp = 1;
+				break;
+			}
+		}
+		mutex_unlock(&lock);
+
+		if (id_priv->unify_ps_tcp) {
 			ret = cma_get_tcp_port(id_priv);
 			if (ret)
 				goto out;
@@ -2713,19 +2706,15 @@ int rdma_listen(struct rdma_cm_id *id, int backlog)
 
 	id_priv->backlog = backlog;
 	if (id->device) {
-		switch (rdma_node_get_transport(id->device->node_type)) {
-		case RDMA_TRANSPORT_IB:
+		if (rdma_cap_ib_cm(id->device, 1)) {
 			ret = cma_ib_listen(id_priv);
 			if (ret)
 				goto err;
-			break;
-		case RDMA_TRANSPORT_IWARP:
-		case RDMA_TRANSPORT_SCIF:
+		} else if (rdma_cap_iw_cm(id->device, 1)) {
 			ret = cma_iw_listen(id_priv, backlog);
 			if (ret)
 				goto err;
-			break;
-		default:
+		} else {
 			ret = -ENOSYS;
 			goto err;
 		}
@@ -3094,21 +3083,15 @@ int rdma_connect(struct rdma_cm_id *id, struct rdma_co
 		id_priv->srq = conn_param->srq;
 	}
 
-	switch (rdma_node_get_transport(id->device->node_type)) {
-	case RDMA_TRANSPORT_IB:
+	if (rdma_cap_ib_cm(id->device, id->port_num)) {
 		if (id->qp_type == IB_QPT_UD)
 			ret = cma_resolve_ib_udp(id_priv, conn_param);
 		else
 			ret = cma_connect_ib(id_priv, conn_param);
-		break;
-	case RDMA_TRANSPORT_IWARP:
-	case RDMA_TRANSPORT_SCIF:
+	} else if (rdma_cap_iw_cm(id->device, id->port_num))
 		ret = cma_connect_iw(id_priv, conn_param);
-		break;
-	default:
+	else
 		ret = -ENOSYS;
-		break;
-	}
 	if (ret)
 		goto err;
 
@@ -3214,8 +3197,7 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_con
 		id_priv->srq = conn_param->srq;
 	}
 
-	switch (rdma_node_get_transport(id->device->node_type)) {
-	case RDMA_TRANSPORT_IB:
+	if (rdma_cap_ib_cm(id->device, id->port_num)) {
 		if (id->qp_type == IB_QPT_UD) {
 			if (conn_param)
 			ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
@@ -3230,15 +3212,10 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_con
 			else
 				ret = cma_rep_recv(id_priv);
 		}
-		break;
-	case RDMA_TRANSPORT_IWARP:
-	case RDMA_TRANSPORT_SCIF:
+	} else if (rdma_cap_iw_cm(id->device, id->port_num))
 		ret = cma_accept_iw(id_priv, conn_param);
-		break;
-	default:
+	else
 		ret = -ENOSYS;
-		break;
-	}
 
 	if (ret)
 		goto reject;
@@ -3282,8 +3259,7 @@ int rdma_reject(struct rdma_cm_id *id, const void *pri
 	if (!id_priv->cm_id.ib)
 		return -EINVAL;
 
-	switch (rdma_node_get_transport(id->device->node_type)) {
-	case RDMA_TRANSPORT_IB:
+	if (rdma_cap_ib_cm(id->device, id->port_num)) {
 		if (id->qp_type == IB_QPT_UD)
 			ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT,
 						private_data, private_data_len);
@@ -3293,16 +3269,12 @@ int rdma_reject(struct rdma_cm_id *id, const void *pri
 					     IB_CM_REJ_CONSUMER_DEFINED, NULL,
 					     0, private_data, private_data_len);
 		}
-		break;
-	case RDMA_TRANSPORT_IWARP:
-	case RDMA_TRANSPORT_SCIF:
+	} else if (rdma_cap_iw_cm(id->device, id->port_num)) {
 		ret = iw_cm_reject(id_priv->cm_id.iw,
 				   private_data, private_data_len);
-		break;
-	default:
+	} else
 		ret = -ENOSYS;
-		break;
-	}
+
 	return ret;
 }
 EXPORT_SYMBOL(rdma_reject);
@@ -3316,8 +3288,7 @@ int rdma_disconnect(struct rdma_cm_id *id)
 	if (!id_priv->cm_id.ib)
 		return -EINVAL;
 
-	switch (rdma_node_get_transport(id->device->node_type)) {
-	case RDMA_TRANSPORT_IB:
+	if (rdma_cap_ib_cm(id->device, id->port_num)) {
 		ret = cma_modify_qp_err(id_priv);
 		if (ret)
 			goto out;
@@ -3327,15 +3298,11 @@ int rdma_disconnect(struct rdma_cm_id *id)
 			cma_dbg(id_priv, "sending DREP\n");
 			ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0);
 		}
-		break;
-	case RDMA_TRANSPORT_IWARP:
-	case RDMA_TRANSPORT_SCIF:
+	} else if (rdma_cap_iw_cm(id->device, id->port_num)) {
 		ret = iw_cm_disconnect(id_priv->cm_id.iw, 0);
-		break;
-	default:
+	} else
 		ret = -EINVAL;
-		break;
-	}
+
 out:
 	return ret;
 }

Modified: head/sys/ofed/drivers/infiniband/core/device.c
==============================================================================
--- head/sys/ofed/drivers/infiniband/core/device.c	Fri Oct 20 07:42:00 2017	(r324791)
+++ head/sys/ofed/drivers/infiniband/core/device.c	Fri Oct 20 08:20:15 2017	(r324792)
@@ -90,7 +90,8 @@ static int ib_device_check_mandatory(struct ib_device 
 		IB_MANDATORY_FUNC(poll_cq),
 		IB_MANDATORY_FUNC(req_notify_cq),
 		IB_MANDATORY_FUNC(get_dma_mr),
-		IB_MANDATORY_FUNC(dereg_mr)
+		IB_MANDATORY_FUNC(dereg_mr),
+		IB_MANDATORY_FUNC(get_port_immutable)
 	};
 	int i;
 
@@ -149,13 +150,13 @@ static int alloc_name(char *name)
 	return 0;
 }
 
-static int start_port(struct ib_device *device)
+static int rdma_start_port(struct ib_device *device)
 {
 	return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1;
 }
 
 
-static int end_port(struct ib_device *device)
+static int rdma_end_port(struct ib_device *device)
 {
 	return (device->node_type == RDMA_NODE_IB_SWITCH) ?
 		0 : device->phys_port_cnt;
@@ -193,6 +194,7 @@ EXPORT_SYMBOL(ib_alloc_device);
 void ib_dealloc_device(struct ib_device *device)
 {
 	if (device->reg_state == IB_DEV_UNINITIALIZED) {
+		kfree(device->port_immutable);
 		kfree(device);
 		return;
 	}
@@ -225,43 +227,42 @@ static int add_client_context(struct ib_device *device
 	return 0;
 }
 
-static int read_port_table_lengths(struct ib_device *device)
+static int verify_immutable(const struct ib_device *dev, u8 port)
 {
-	struct ib_port_attr *tprops = NULL;
-	int num_ports, ret = -ENOMEM;
-	u8 port_index;
+	return WARN_ON(!rdma_cap_ib_mad(dev, port) &&
+			    rdma_max_mad_size(dev, port) != 0);
+}
 
-	tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
-	if (!tprops)
-		goto out;
+static int read_port_immutable(struct ib_device *device)
+{
+	int ret;
+	u8 start_port = rdma_start_port(device);
+	u8 end_port = rdma_end_port(device);
+	u8 port;
 
-	num_ports = end_port(device) - start_port(device) + 1;
+	/**
+	 * device->port_immutable is indexed directly by the port number to make
+	 * access to this data as efficient as possible.
+	 *
+	 * Therefore port_immutable is declared as a 1 based array with
+	 * potential empty slots at the beginning.
+	 */
+	device->port_immutable = kzalloc(sizeof(*device->port_immutable)
+					 * (end_port + 1),
+					 GFP_KERNEL);
+	if (!device->port_immutable)
+		return -ENOMEM;
 
-	device->pkey_tbl_len = kmalloc(sizeof *device->pkey_tbl_len * num_ports,
-				       GFP_KERNEL);
-	device->gid_tbl_len = kmalloc(sizeof *device->gid_tbl_len * num_ports,
-				      GFP_KERNEL);
-	if (!device->pkey_tbl_len || !device->gid_tbl_len)
-		goto err;
-
-	for (port_index = 0; port_index < num_ports; ++port_index) {
-		ret = ib_query_port(device, port_index + start_port(device),
-					tprops);
+	for (port = start_port; port <= end_port; ++port) {
+		ret = device->get_port_immutable(device, port,
+						 &device->port_immutable[port]);
 		if (ret)
-			goto err;
-		device->pkey_tbl_len[port_index] = tprops->pkey_tbl_len;
-		device->gid_tbl_len[port_index]  = tprops->gid_tbl_len;
-	}
+			return ret;
 
-	ret = 0;
-	goto out;
-
-err:
-	kfree(device->gid_tbl_len);
-	kfree(device->pkey_tbl_len);
-out:
-	kfree(tprops);
-	return ret;
+		if (verify_immutable(device, port))
+			return -EINVAL;
+	}
+	return 0;
 }
 
 /**
@@ -297,10 +298,11 @@ int ib_register_device(struct ib_device *device,
 	spin_lock_init(&device->event_handler_lock);
 	spin_lock_init(&device->client_data_lock);
 
-	ret = read_port_table_lengths(device);
+
+	ret = read_port_immutable(device);
 	if (ret) {
-		printk(KERN_WARNING "Couldn't create table lengths cache for device %s\n",
-		       device->name);
+		printk(KERN_WARNING "Couldn't create per port immutable data %s\n",
+			device->name);
 		goto out;
 	}
 
@@ -308,8 +310,7 @@ int ib_register_device(struct ib_device *device,
 	if (ret) {
 		printk(KERN_WARNING "Couldn't register device %s with driver model\n",
 		       device->name);
-		kfree(device->gid_tbl_len);
-		kfree(device->pkey_tbl_len);
+		kfree(device->port_immutable);
 		goto out;
 	}
 
@@ -351,9 +352,6 @@ void ib_unregister_device(struct ib_device *device)
 
 	list_del(&device->core_list);
 
-	kfree(device->gid_tbl_len);
-	kfree(device->pkey_tbl_len);
-
 	mutex_unlock(&device_mutex);
 
 	ib_device_unregister_sysfs(device);
@@ -578,7 +576,7 @@ int ib_query_port(struct ib_device *device,
 		  u8 port_num,
 		  struct ib_port_attr *port_attr)
 {
-	if (port_num < start_port(device) || port_num > end_port(device))
+	if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
 		return -EINVAL;
 
 	return device->query_port(device, port_num, port_attr);
@@ -656,7 +654,7 @@ int ib_modify_port(struct ib_device *device,
 	if (!device->modify_port)
 		return -ENOSYS;
 
-	if (port_num < start_port(device) || port_num > end_port(device))
+	if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
 		return -EINVAL;
 
 	return device->modify_port(device, port_num, port_modify_mask,
@@ -679,8 +677,8 @@ int ib_find_gid(struct ib_device *device, union ib_gid
 	union ib_gid tmp_gid;
 	int ret, port, i;
 
-	for (port = start_port(device); port <= end_port(device); ++port) {
-		for (i = 0; i < device->gid_tbl_len[port - start_port(device)]; ++i) {
+	for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) {
+		for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) {
 			ret = ib_query_gid(device, port, i, &tmp_gid);
 			if (ret)
 				return ret;
@@ -712,7 +710,7 @@ int ib_find_pkey(struct ib_device *device,
 	u16 tmp_pkey;
 	int partial_ix = -1;
 
-	for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) {
+	for (i = 0; i < device->port_immutable[port_num].pkey_tbl_len; ++i) {
 		ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
 		if (ret)
 			return ret;

Modified: head/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c
==============================================================================
--- head/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c	Fri Oct 20 07:42:00 2017	(r324791)
+++ head/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c	Fri Oct 20 08:20:15 2017	(r324792)
@@ -1297,6 +1297,25 @@ out:
 	return err;
 }
 
+static int mthca_port_immutable(struct ib_device *ibdev, u8 port_num,
+			        struct ib_port_immutable *immutable)
+{
+	struct ib_port_attr attr;
+	int err;
+
+	immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB;
+
+	err = ib_query_port(ibdev, port_num, &attr);
+	if (err)
+		return err;
+
+	immutable->pkey_tbl_len = attr.pkey_tbl_len;
+	immutable->gid_tbl_len = attr.gid_tbl_len;
+	immutable->max_mad_size = IB_MGMT_MAD_SIZE;
+
+	return 0;
+}
+
 int mthca_register_device(struct mthca_dev *dev)
 {
 	int ret;
@@ -1376,6 +1395,7 @@ int mthca_register_device(struct mthca_dev *dev)
 	dev->ib_dev.reg_phys_mr          = mthca_reg_phys_mr;
 	dev->ib_dev.reg_user_mr          = mthca_reg_user_mr;
 	dev->ib_dev.dereg_mr             = mthca_dereg_mr;
+	dev->ib_dev.get_port_immutable   = mthca_port_immutable;
 
 	if (dev->mthca_flags & MTHCA_FLAG_FMR) {
 		dev->ib_dev.alloc_fmr            = mthca_alloc_fmr;

Modified: head/sys/ofed/include/rdma/ib_mad.h
==============================================================================
--- head/sys/ofed/include/rdma/ib_mad.h	Fri Oct 20 07:42:00 2017	(r324791)
+++ head/sys/ofed/include/rdma/ib_mad.h	Fri Oct 20 08:20:15 2017	(r324792)
@@ -134,6 +134,7 @@ enum {
 	IB_MGMT_SA_DATA = 200,
 	IB_MGMT_DEVICE_HDR = 64,
 	IB_MGMT_DEVICE_DATA = 192,
+	IB_MGMT_MAD_SIZE = IB_MGMT_MAD_HDR + IB_MGMT_MAD_DATA,
 };
 
 struct ib_mad_hdr {

Modified: head/sys/ofed/include/rdma/ib_verbs.h
==============================================================================
--- head/sys/ofed/include/rdma/ib_verbs.h	Fri Oct 20 07:42:00 2017	(r324791)
+++ head/sys/ofed/include/rdma/ib_verbs.h	Fri Oct 20 08:20:15 2017	(r324792)
@@ -348,6 +348,56 @@ union rdma_protocol_stats {
 	struct iw_protocol_stats	iw;
 };
 
+/* Define bits for the various functionality this port needs to be supported by
+ * the core.
+ */
+/* Management                           0x00000FFF */
+#define RDMA_CORE_CAP_IB_MAD            0x00000001
+#define RDMA_CORE_CAP_IB_SMI            0x00000002
+#define RDMA_CORE_CAP_IB_CM             0x00000004
+#define RDMA_CORE_CAP_IW_CM             0x00000008
+#define RDMA_CORE_CAP_IB_SA             0x00000010
+#define RDMA_CORE_CAP_OPA_MAD           0x00000020
+
+/* Address format                       0x000FF000 */
+#define RDMA_CORE_CAP_AF_IB             0x00001000
+#define RDMA_CORE_CAP_ETH_AH            0x00002000
+#define RDMA_CORE_CAP_OPA_AH            0x00004000
+
+/* Protocol                             0xFFF00000 */
+#define RDMA_CORE_CAP_PROT_IB           0x00100000
+#define RDMA_CORE_CAP_PROT_ROCE         0x00200000
+#define RDMA_CORE_CAP_PROT_IWARP        0x00400000
+#define RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP 0x00800000
+#define RDMA_CORE_CAP_PROT_RAW_PACKET   0x01000000
+#define RDMA_CORE_CAP_PROT_USNIC        0x02000000
+
+#define RDMA_CORE_PORT_IBA_IB          (RDMA_CORE_CAP_PROT_IB  \
+					| RDMA_CORE_CAP_IB_MAD \
+					| RDMA_CORE_CAP_IB_SMI \
+					| RDMA_CORE_CAP_IB_CM  \
+					| RDMA_CORE_CAP_IB_SA  \
+					| RDMA_CORE_CAP_AF_IB)
+#define RDMA_CORE_PORT_IBA_ROCE        (RDMA_CORE_CAP_PROT_ROCE \
+					| RDMA_CORE_CAP_IB_MAD  \
+					| RDMA_CORE_CAP_IB_CM   \
+					| RDMA_CORE_CAP_AF_IB   \
+					| RDMA_CORE_CAP_ETH_AH)
+#define RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP			\
+					(RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP \
+					| RDMA_CORE_CAP_IB_MAD  \
+					| RDMA_CORE_CAP_IB_CM   \
+					| RDMA_CORE_CAP_AF_IB   \
+					| RDMA_CORE_CAP_ETH_AH)
+#define RDMA_CORE_PORT_IWARP           (RDMA_CORE_CAP_PROT_IWARP \
+					| RDMA_CORE_CAP_IW_CM)
+#define RDMA_CORE_PORT_INTEL_OPA       (RDMA_CORE_PORT_IBA_IB  \
+					| RDMA_CORE_CAP_OPA_MAD)
+
+#define RDMA_CORE_PORT_RAW_PACKET	(RDMA_CORE_CAP_PROT_RAW_PACKET)
+
+#define RDMA_CORE_PORT_USNIC		(RDMA_CORE_CAP_PROT_USNIC)
+
 struct ib_port_attr {
 	enum ib_port_state	state;
 	enum ib_mtu		max_mtu;
@@ -1623,6 +1673,14 @@ struct ib_dma_mapping_ops {
 };
 
 struct iw_cm_verbs;
+
+struct ib_port_immutable {
+	int                           pkey_tbl_len;
+	int                           gid_tbl_len;
+	u32                           core_cap_flags;
+	u32                           max_mad_size;
+};
+
 struct ib_exp_device_attr;
 struct ib_exp_qp_init_attr;
 
@@ -1639,8 +1697,10 @@ struct ib_device {
 	struct list_head              client_data_list;
 
 	struct ib_cache               cache;
-	int                          *pkey_tbl_len;
-	int                          *gid_tbl_len;
+	/**
+	 * port_immutable is indexed by port number
+	 */
+	struct ib_port_immutable     *port_immutable;
 
 	int			      num_comp_vectors;
 
@@ -1845,6 +1905,15 @@ struct ib_device {
 	u32			     cmd_n;
 	spinlock_t		     cmd_perf_lock;
 
+
+	/**
+	 * The following mandatory functions are used only at device
+	 * registration.  Keep functions such as these at the end of this
+	 * structure to avoid cache line misses when accessing struct ib_device
+	 * in fast paths.
+	 */
+	int (*get_port_immutable)(struct ib_device *, u8, struct ib_port_immutable *);
+
 	/*
 	 * Experimental data and functions
 	 */
@@ -1927,6 +1996,252 @@ int ib_query_port(struct ib_device *device,
 
 enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device,
 					       u8 port_num);
+
+static inline bool rdma_protocol_ib(const struct ib_device *device, u8 port_num)
+{
+	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_IB;
+}
+
+static inline bool rdma_protocol_roce(const struct ib_device *device, u8 port_num)
+{
+	return device->port_immutable[port_num].core_cap_flags &
+		(RDMA_CORE_CAP_PROT_ROCE | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP);
+}
+
+static inline bool rdma_protocol_roce_udp_encap(const struct ib_device *device, u8 port_num)
+{
+	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP;
+}
+
+static inline bool rdma_protocol_roce_eth_encap(const struct ib_device *device, u8 port_num)
+{
+	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE;
+}
+
+static inline bool rdma_protocol_iwarp(const struct ib_device *device, u8 port_num)
+{
+	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_IWARP;
+}
+
+static inline bool rdma_ib_or_roce(const struct ib_device *device, u8 port_num)
+{
+	return rdma_protocol_ib(device, port_num) ||
+		rdma_protocol_roce(device, port_num);
+}
+
+/**
+ * rdma_cap_ib_mad - Check if the port of a device supports Infiniband
+ * Management Datagrams.
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * Management Datagrams (MAD) are a required part of the InfiniBand
+ * specification and are supported on all InfiniBand devices.  A slightly
+ * extended version are also supported on OPA interfaces.
+ *
+ * Return: true if the port supports sending/receiving of MAD packets.
+ */
+static inline bool rdma_cap_ib_mad(const struct ib_device *device, u8 port_num)
+{
+	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_MAD;
+}
+
+/**
+ * rdma_cap_opa_mad - Check if the port of device provides support for OPA
+ * Management Datagrams.
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * Intel OmniPath devices extend and/or replace the InfiniBand Management
+ * datagrams with their own versions.  These OPA MADs share many but not all of
+ * the characteristics of InfiniBand MADs.
+ *
+ * OPA MADs differ in the following ways:
+ *
+ *    1) MADs are variable size up to 2K
+ *       IBTA defined MADs remain fixed at 256 bytes
+ *    2) OPA SMPs must carry valid PKeys
+ *    3) OPA SMP packets are a different format
+ *
+ * Return: true if the port supports OPA MAD packet formats.
+ */
+static inline bool rdma_cap_opa_mad(struct ib_device *device, u8 port_num)
+{
+	return (device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_OPA_MAD)
+		== RDMA_CORE_CAP_OPA_MAD;
+}
+
+/**
+ * rdma_cap_ib_smi - Check if the port of a device provides an Infiniband
+ * Subnet Management Agent (SMA) on the Subnet Management Interface (SMI).
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * Each InfiniBand node is required to provide a Subnet Management Agent
+ * that the subnet manager can access.  Prior to the fabric being fully
+ * configured by the subnet manager, the SMA is accessed via a well known
+ * interface called the Subnet Management Interface (SMI).  This interface
+ * uses directed route packets to communicate with the SM to get around the
+ * chicken and egg problem of the SM needing to know what's on the fabric
+ * in order to configure the fabric, and needing to configure the fabric in
+ * order to send packets to the devices on the fabric.  These directed
+ * route packets do not need the fabric fully configured in order to reach
+ * their destination.  The SMI is the only method allowed to send
+ * directed route packets on an InfiniBand fabric.
+ *
+ * Return: true if the port provides an SMI.
+ */
+static inline bool rdma_cap_ib_smi(const struct ib_device *device, u8 port_num)
+{
+	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_SMI;
+}
+
+/**
+ * rdma_cap_ib_cm - Check if the port of device has the capability Infiniband
+ * Communication Manager.
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * The InfiniBand Communication Manager is one of many pre-defined General
+ * Service Agents (GSA) that are accessed via the General Service
+ * Interface (GSI).  It's role is to facilitate establishment of connections
+ * between nodes as well as other management related tasks for established
+ * connections.
+ *
+ * Return: true if the port supports an IB CM (this does not guarantee that
+ * a CM is actually running however).
+ */
+static inline bool rdma_cap_ib_cm(const struct ib_device *device, u8 port_num)
+{
+	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_CM;
+}
+
+/**
+ * rdma_cap_iw_cm - Check if the port of device has the capability IWARP
+ * Communication Manager.
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * Similar to above, but specific to iWARP connections which have a different
+ * managment protocol than InfiniBand.
+ *
+ * Return: true if the port supports an iWARP CM (this does not guarantee that
+ * a CM is actually running however).
+ */
+static inline bool rdma_cap_iw_cm(const struct ib_device *device, u8 port_num)
+{
+	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IW_CM;
+}
+
+/**
+ * rdma_cap_ib_sa - Check if the port of device has the capability Infiniband
+ * Subnet Administration.
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * An InfiniBand Subnet Administration (SA) service is a pre-defined General
+ * Service Agent (GSA) provided by the Subnet Manager (SM).  On InfiniBand
+ * fabrics, devices should resolve routes to other hosts by contacting the
+ * SA to query the proper route.
+ *
+ * Return: true if the port should act as a client to the fabric Subnet
+ * Administration interface.  This does not imply that the SA service is
+ * running locally.
+ */
+static inline bool rdma_cap_ib_sa(const struct ib_device *device, u8 port_num)
+{
+	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_SA;
+}
+
+/**
+ * rdma_cap_ib_mcast - Check if the port of device has the capability Infiniband
+ * Multicast.
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * InfiniBand multicast registration is more complex than normal IPv4 or
+ * IPv6 multicast registration.  Each Host Channel Adapter must register
+ * with the Subnet Manager when it wishes to join a multicast group.  It
+ * should do so only once regardless of how many queue pairs it subscribes
+ * to this group.  And it should leave the group only after all queue pairs
+ * attached to the group have been detached.
+ *
+ * Return: true if the port must undertake the additional adminstrative
+ * overhead of registering/unregistering with the SM and tracking of the
+ * total number of queue pairs attached to the multicast group.
+ */
+static inline bool rdma_cap_ib_mcast(const struct ib_device *device, u8 port_num)
+{
+	return rdma_cap_ib_sa(device, port_num);
+}
+
+/**
+ * rdma_cap_af_ib - Check if the port of device has the capability
+ * Native Infiniband Address.
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * InfiniBand addressing uses a port's GUID + Subnet Prefix to make a default
+ * GID.  RoCE uses a different mechanism, but still generates a GID via
+ * a prescribed mechanism and port specific data.
+ *
+ * Return: true if the port uses a GID address to identify devices on the
+ * network.
+ */
+static inline bool rdma_cap_af_ib(const struct ib_device *device, u8 port_num)
+{
+	return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_AF_IB;
+}
+
+/**
+ * rdma_cap_eth_ah - Check if the port of device has the capability

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201710200820.v9K8KF0J032638>