Date: Thu, 16 Jun 2016 02:48:18 +0000 (UTC) From: Sepherosa Ziehau <sephe@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-10@freebsd.org Subject: svn commit: r301943 - in stable/10/sys/dev/hyperv: include netvsc vmbus Message-ID: <201606160248.u5G2mIOP039213@repo.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: sephe Date: Thu Jun 16 02:48:18 2016 New Revision: 301943 URL: https://svnweb.freebsd.org/changeset/base/301943 Log: MFC 296379,296380,296381,296593,296594,296595 296379 hyperv/hn: Add multiple channel support, a.k.a. vRSS Each channel contains one RX ring and one TX ring. And we try to distribute the channels to different CPUs evenly. Note: Currently we don't have enough information to extract the RSS type and RSS hash value from the received packets. This greatly improves the TX/RX performance for 8 virtual CPU Hyper-V over 10Ge: it can max out 10Ge for TCP when multiple RX/TX rings are enabled. This almost doubles the TX/RX performance for locally connected Hyper-Vs: was 6Gbps w/ 128 TCP streams, now 11Gbps w/ multiple RX/TX rings enabled. It is not enabled by default; it will be switched on after more tests. Collaborated with: Hongjiang Zhang <honzhan microsoft com> MFC after: 2 weeks Sponsored by: Microsoft OSTC 296380 hyperv/hn: Pass channel to send done callbacks. Mainly to make the data packet send done check more stringent. MFC after: 2 weeks Sponsored by: Microsoft OSTC 296381 hyperv/hn: Add per-TX ring stats for # of transmitted packets MFC after: 2 weeks Sponsored by: Microsoft OSTC 296593 hyperv/hn: Move if_initname to an earlier place So that functions shared w/ attach path could use if_printf(). While I'm here, remove unnecessary if_dunit and if_dname assignment. MFC after: 1 week Sponsored by: Microsoft OSTC Differential Revision: https://reviews.freebsd.org/D5576 296594 hyperv/hn: Factor out hn_channel_attach MFC after: 1 week Sponsored by: Microsoft OSTC Differential Revision: https://reviews.freebsd.org/D5577 296595 hyperv/hn: Make the # of TX rings configurable. Rename the tunables to avoid confusion. 
MFC after: 1 week Sponsored by: Microsoft OSTC Differential Revision: https://reviews.freebsd.org/D5578 Modified: stable/10/sys/dev/hyperv/include/hyperv.h stable/10/sys/dev/hyperv/netvsc/hv_net_vsc.c stable/10/sys/dev/hyperv/netvsc/hv_net_vsc.h stable/10/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c stable/10/sys/dev/hyperv/netvsc/hv_rndis.h stable/10/sys/dev/hyperv/netvsc/hv_rndis_filter.c stable/10/sys/dev/hyperv/netvsc/hv_rndis_filter.h stable/10/sys/dev/hyperv/vmbus/hv_channel_mgmt.c Directory Properties: stable/10/ (props changed) Modified: stable/10/sys/dev/hyperv/include/hyperv.h ============================================================================== --- stable/10/sys/dev/hyperv/include/hyperv.h Thu Jun 16 01:57:16 2016 (r301942) +++ stable/10/sys/dev/hyperv/include/hyperv.h Thu Jun 16 02:48:18 2016 (r301943) @@ -911,6 +911,8 @@ int hv_vmbus_channel_teardown_gpdal( struct hv_vmbus_channel* vmbus_select_outgoing_channel(struct hv_vmbus_channel *promary); +void vmbus_channel_cpu_set(struct hv_vmbus_channel *chan, int cpu); + /** * @brief Get physical address from virtual */ Modified: stable/10/sys/dev/hyperv/netvsc/hv_net_vsc.c ============================================================================== --- stable/10/sys/dev/hyperv/netvsc/hv_net_vsc.c Thu Jun 16 01:57:16 2016 (r301942) +++ stable/10/sys/dev/hyperv/netvsc/hv_net_vsc.c Thu Jun 16 02:48:18 2016 (r301943) @@ -56,14 +56,14 @@ MALLOC_DEFINE(M_NETVSC, "netvsc", "Hyper /* * Forward declarations */ -static void hv_nv_on_channel_callback(void *context); +static void hv_nv_on_channel_callback(void *xchan); static int hv_nv_init_send_buffer_with_net_vsp(struct hv_device *device); static int hv_nv_init_rx_buffer_with_net_vsp(struct hv_device *device); static int hv_nv_destroy_send_buffer(netvsc_dev *net_dev); static int hv_nv_destroy_rx_buffer(netvsc_dev *net_dev); static int hv_nv_connect_to_vsp(struct hv_device *device); static void hv_nv_on_send_completion(netvsc_dev *net_dev, - struct 
hv_device *device, hv_vm_packet_descriptor *pkt); + struct hv_device *device, struct hv_vmbus_channel *, hv_vm_packet_descriptor *pkt); static void hv_nv_on_receive_completion(struct hv_vmbus_channel *chan, uint64_t tid, uint32_t status); static void hv_nv_on_receive(netvsc_dev *net_dev, @@ -661,6 +661,34 @@ hv_nv_disconnect_from_vsp(netvsc_dev *ne } /* + * Callback handler for subchannel offer + * @@param context new subchannel + */ +static void +hv_nv_subchan_callback(void *xchan) +{ + struct hv_vmbus_channel *chan = xchan; + netvsc_dev *net_dev; + uint16_t chn_index = chan->offer_msg.offer.sub_channel_index; + struct hv_device *device = chan->device; + hn_softc_t *sc = device_get_softc(device->device); + int ret; + + net_dev = sc->net_dev; + + if (chn_index >= net_dev->num_channel) { + /* Would this ever happen? */ + return; + } + netvsc_subchan_callback(sc, chan); + + chan->hv_chan_rdbuf = malloc(NETVSC_PACKET_SIZE, M_NETVSC, M_WAITOK); + ret = hv_vmbus_channel_open(chan, NETVSC_DEVICE_RING_BUFFER_SIZE, + NETVSC_DEVICE_RING_BUFFER_SIZE, NULL, 0, + hv_nv_on_channel_callback, chan); +} + +/* * Net VSC on device add * * Callback when the device belonging to this driver is added @@ -692,6 +720,7 @@ hv_nv_on_device_add(struct hv_device *de free(chan->hv_chan_rdbuf, M_NETVSC); goto cleanup; } + chan->sc_creation_callback = hv_nv_subchan_callback; /* * Connect with the NetVsp @@ -757,7 +786,8 @@ hv_nv_on_device_remove(struct hv_device */ static void hv_nv_on_send_completion(netvsc_dev *net_dev, - struct hv_device *device, hv_vm_packet_descriptor *pkt) + struct hv_device *device, struct hv_vmbus_channel *chan, + hv_vm_packet_descriptor *pkt) { nvsp_msg *nvsp_msg_pkt; netvsc_packet *net_vsc_pkt; @@ -769,7 +799,9 @@ hv_nv_on_send_completion(netvsc_dev *net || nvsp_msg_pkt->hdr.msg_type == nvsp_msg_1_type_send_rx_buf_complete || nvsp_msg_pkt->hdr.msg_type - == nvsp_msg_1_type_send_send_buf_complete) { + == nvsp_msg_1_type_send_send_buf_complete + || 
nvsp_msg_pkt->hdr.msg_type + == nvsp_msg5_type_subchannel) { /* Copy the response back */ memcpy(&net_dev->channel_init_packet, nvsp_msg_pkt, sizeof(nvsp_msg)); @@ -806,7 +838,7 @@ hv_nv_on_send_completion(netvsc_dev *net } /* Notify the layer above us */ - net_vsc_pkt->compl.send.on_send_completion( + net_vsc_pkt->compl.send.on_send_completion(chan, net_vsc_pkt->compl.send.send_completion_context); } @@ -963,6 +995,46 @@ retry_send_cmplt: } /* + * Net VSC receiving vRSS send table from VSP + */ +static void +hv_nv_send_table(struct hv_device *device, hv_vm_packet_descriptor *pkt) +{ + netvsc_dev *net_dev; + nvsp_msg *nvsp_msg_pkt; + int i; + uint32_t count, *table; + + net_dev = hv_nv_get_inbound_net_device(device); + if (!net_dev) + return; + + nvsp_msg_pkt = + (nvsp_msg *)((unsigned long)pkt + (pkt->data_offset8 << 3)); + + if (nvsp_msg_pkt->hdr.msg_type != + nvsp_msg5_type_send_indirection_table) { + printf("Netvsc: !Warning! receive msg type not " + "send_indirection_table. type = %d\n", + nvsp_msg_pkt->hdr.msg_type); + return; + } + + count = nvsp_msg_pkt->msgs.vers_5_msgs.send_table.count; + if (count != VRSS_SEND_TABLE_SIZE) { + printf("Netvsc: Received wrong send table size: %u\n", count); + return; + } + + table = (uint32_t *) + ((unsigned long)&nvsp_msg_pkt->msgs.vers_5_msgs.send_table + + nvsp_msg_pkt->msgs.vers_5_msgs.send_table.offset); + + for (i = 0; i < count; i++) + net_dev->vrss_send_table[i] = table[i]; +} + +/* * Net VSC on channel callback */ static void @@ -993,11 +1065,15 @@ hv_nv_on_channel_callback(void *xchan) desc = (hv_vm_packet_descriptor *)buffer; switch (desc->type) { case HV_VMBUS_PACKET_TYPE_COMPLETION: - hv_nv_on_send_completion(net_dev, device, desc); + hv_nv_on_send_completion(net_dev, device, + chan, desc); break; case HV_VMBUS_PACKET_TYPE_DATA_USING_TRANSFER_PAGES: hv_nv_on_receive(net_dev, device, chan, desc); break; + case HV_VMBUS_PACKET_TYPE_DATA_IN_BAND: + hv_nv_send_table(device, desc); + break; default: 
device_printf(dev, "hv_cb recv unknow type %d " Modified: stable/10/sys/dev/hyperv/netvsc/hv_net_vsc.h ============================================================================== --- stable/10/sys/dev/hyperv/netvsc/hv_net_vsc.h Thu Jun 16 01:57:16 2016 (r301942) +++ stable/10/sys/dev/hyperv/netvsc/hv_net_vsc.h Thu Jun 16 02:48:18 2016 (r301943) @@ -86,6 +86,92 @@ MALLOC_DECLARE(M_NETVSC); */ #define NVSP_MAX_PACKETS_PER_RECEIVE 375 +/* vRSS stuff */ +#define RNDIS_OBJECT_TYPE_RSS_CAPABILITIES 0x88 +#define RNDIS_OBJECT_TYPE_RSS_PARAMETERS 0x89 + +#define RNDIS_RECEIVE_SCALE_CAPABILITIES_REVISION_2 2 +#define RNDIS_RECEIVE_SCALE_PARAMETERS_REVISION_2 2 + +struct rndis_obj_header { + uint8_t type; + uint8_t rev; + uint16_t size; +} __packed; + +/* rndis_recv_scale_cap/cap_flag */ +#define RNDIS_RSS_CAPS_MESSAGE_SIGNALED_INTERRUPTS 0x01000000 +#define RNDIS_RSS_CAPS_CLASSIFICATION_AT_ISR 0x02000000 +#define RNDIS_RSS_CAPS_CLASSIFICATION_AT_DPC 0x04000000 +#define RNDIS_RSS_CAPS_USING_MSI_X 0x08000000 +#define RNDIS_RSS_CAPS_RSS_AVAILABLE_ON_PORTS 0x10000000 +#define RNDIS_RSS_CAPS_SUPPORTS_MSI_X 0x20000000 +#define RNDIS_RSS_CAPS_HASH_TYPE_TCP_IPV4 0x00000100 +#define RNDIS_RSS_CAPS_HASH_TYPE_TCP_IPV6 0x00000200 +#define RNDIS_RSS_CAPS_HASH_TYPE_TCP_IPV6_EX 0x00000400 + +/* RNDIS_RECEIVE_SCALE_CAPABILITIES */ +struct rndis_recv_scale_cap { + struct rndis_obj_header hdr; + uint32_t cap_flag; + uint32_t num_int_msg; + uint32_t num_recv_que; + uint16_t num_indirect_tabent; +} __packed; + +/* rndis_recv_scale_param flags */ +#define RNDIS_RSS_PARAM_FLAG_BASE_CPU_UNCHANGED 0x0001 +#define RNDIS_RSS_PARAM_FLAG_HASH_INFO_UNCHANGED 0x0002 +#define RNDIS_RSS_PARAM_FLAG_ITABLE_UNCHANGED 0x0004 +#define RNDIS_RSS_PARAM_FLAG_HASH_KEY_UNCHANGED 0x0008 +#define RNDIS_RSS_PARAM_FLAG_DISABLE_RSS 0x0010 + +/* Hash info bits */ +#define RNDIS_HASH_FUNC_TOEPLITZ 0x00000001 +#define RNDIS_HASH_IPV4 0x00000100 +#define RNDIS_HASH_TCP_IPV4 0x00000200 +#define RNDIS_HASH_IPV6 0x00000400 
+#define RNDIS_HASH_IPV6_EX 0x00000800 +#define RNDIS_HASH_TCP_IPV6 0x00001000 +#define RNDIS_HASH_TCP_IPV6_EX 0x00002000 + +#define RNDIS_RSS_INDIRECTION_TABLE_MAX_SIZE_REVISION_2 (128 * 4) +#define RNDIS_RSS_HASH_SECRET_KEY_MAX_SIZE_REVISION_2 40 + +#define ITAB_NUM 128 +#define HASH_KEYLEN RNDIS_RSS_HASH_SECRET_KEY_MAX_SIZE_REVISION_2 + +/* RNDIS_RECEIVE_SCALE_PARAMETERS */ +typedef struct rndis_recv_scale_param_ { + struct rndis_obj_header hdr; + + /* Qualifies the rest of the information */ + uint16_t flag; + + /* The base CPU number to do receive processing. not used */ + uint16_t base_cpu_number; + + /* This describes the hash function and type being enabled */ + uint32_t hashinfo; + + /* The size of indirection table array */ + uint16_t indirect_tabsize; + + /* The offset of the indirection table from the beginning of this + * structure + */ + uint32_t indirect_taboffset; + + /* The size of the hash secret key */ + uint16_t hashkey_size; + + /* The offset of the secret key from the beginning of this structure */ + uint32_t hashkey_offset; + + uint32_t processor_masks_offset; + uint32_t num_processor_masks; + uint32_t processor_masks_entry_size; +} rndis_recv_scale_param; typedef enum nvsp_msg_type_ { nvsp_msg_type_none = 0, @@ -146,6 +232,27 @@ typedef enum nvsp_msg_type_ { nvsp_msg_2_type_alloc_chimney_handle, nvsp_msg_2_type_alloc_chimney_handle_complete, + + nvsp_msg2_max = nvsp_msg_2_type_alloc_chimney_handle_complete, + + /* + * Version 4 Messages + */ + nvsp_msg4_type_send_vf_association, + nvsp_msg4_type_switch_data_path, + nvsp_msg4_type_uplink_connect_state_deprecated, + + nvsp_msg4_max = nvsp_msg4_type_uplink_connect_state_deprecated, + + /* + * Version 5 Messages + */ + nvsp_msg5_type_oid_query_ex, + nvsp_msg5_type_oid_query_ex_comp, + nvsp_msg5_type_subchannel, + nvsp_msg5_type_send_indirection_table, + + nvsp_msg5_max = nvsp_msg5_type_send_indirection_table, } nvsp_msg_type; typedef enum nvsp_status_ { @@ -793,6 +900,39 @@ typedef struct 
nvsp_2_msg_send_vmq_rndis uint32_t status; } __packed nvsp_2_msg_send_vmq_rndis_pkt_complete; +/* + * Version 5 messages + */ +enum nvsp_subchannel_operation { + NVSP_SUBCHANNEL_NONE = 0, + NVSP_SUBCHANNE_ALLOCATE, + NVSP_SUBCHANNE_MAX +}; + +typedef struct nvsp_5_subchannel_request_ +{ + uint32_t op; + uint32_t num_subchannels; +} __packed nvsp_5_subchannel_request; + +typedef struct nvsp_5_subchannel_complete_ +{ + uint32_t status; + /* Actual number of subchannels allocated */ + uint32_t num_subchannels; +} __packed nvsp_5_subchannel_complete; + +typedef struct nvsp_5_send_indirect_table_ +{ + /* The number of entries in the send indirection table */ + uint32_t count; + /* + * The offset of the send indireciton table from top of + * this struct. The send indirection table tells which channel + * to put the send traffic on. Each entry is a channel number. + */ + uint32_t offset; +} __packed nvsp_5_send_indirect_table; typedef union nvsp_1_msg_uber_ { nvsp_1_msg_send_ndis_version send_ndis_vers; @@ -838,11 +978,18 @@ typedef union nvsp_2_msg_uber_ { nvsp_2_msg_alloc_chimney_handle_complete alloc_chimney_handle_complete; } __packed nvsp_2_msg_uber; +typedef union nvsp_5_msg_uber_ +{ + nvsp_5_subchannel_request subchannel_request; + nvsp_5_subchannel_complete subchn_complete; + nvsp_5_send_indirect_table send_table; +} __packed nvsp_5_msg_uber; typedef union nvsp_all_msgs_ { nvsp_msg_init_uber init_msgs; nvsp_1_msg_uber vers_1_msgs; nvsp_2_msg_uber vers_2_msgs; + nvsp_5_msg_uber vers_5_msgs; } __packed nvsp_all_msgs; /* @@ -883,6 +1030,7 @@ typedef struct nvsp_msg_ { #define NETVSC_MAX_CONFIGURABLE_MTU (9 * 1024) #define NETVSC_PACKET_SIZE PAGE_SIZE +#define VRSS_SEND_TABLE_SIZE 16 /* * Data types @@ -923,10 +1071,15 @@ typedef struct netvsc_dev_ { hv_bool_uint8_t destroy; /* Negotiated NVSP version */ uint32_t nvsp_version; + + uint32_t num_channel; + + uint32_t vrss_send_table[VRSS_SEND_TABLE_SIZE]; } netvsc_dev; +struct hv_vmbus_channel; -typedef void 
(*pfn_on_send_rx_completion)(void *); +typedef void (*pfn_on_send_rx_completion)(struct hv_vmbus_channel *, void *); #define NETVSC_DEVICE_RING_BUFFER_SIZE (128 * PAGE_SIZE) #define NETVSC_PACKET_MAXPAGE 32 @@ -1010,13 +1163,18 @@ struct hn_rx_ring { u_long hn_csum_trusted; u_long hn_lro_tried; u_long hn_small_pkts; + u_long hn_pkts; + + /* Rarely used stuffs */ + struct sysctl_oid *hn_rx_sysctl_tree; + int hn_rx_flags; } __aligned(CACHE_LINE_SIZE); #define HN_TRUST_HCSUM_IP 0x0001 #define HN_TRUST_HCSUM_TCP 0x0002 #define HN_TRUST_HCSUM_UDP 0x0004 -struct hv_vmbus_channel; +#define HN_RX_FLAG_ATTACHED 0x1 struct hn_tx_ring { #ifndef HN_USE_TXDESC_BUFRING @@ -1053,13 +1211,17 @@ struct hn_tx_ring { u_long hn_txdma_failed; u_long hn_tx_collapsed; u_long hn_tx_chimney; + u_long hn_pkts; /* Rarely used stuffs */ struct hn_txdesc *hn_txdesc; bus_dma_tag_t hn_tx_rndis_dtag; struct sysctl_oid *hn_tx_sysctl_tree; + int hn_tx_flags; } __aligned(CACHE_LINE_SIZE); +#define HN_TX_FLAG_ATTACHED 0x1 + /* * Device-specific softc structure */ @@ -1085,9 +1247,12 @@ typedef struct hn_softc { int hn_tx_ring_cnt; int hn_tx_ring_inuse; struct hn_tx_ring *hn_tx_ring; + + int hn_cpu; int hn_tx_chimney_max; struct taskqueue *hn_tx_taskq; struct sysctl_oid *hn_tx_sysctl_tree; + struct sysctl_oid *hn_rx_sysctl_tree; } hn_softc_t; /* Modified: stable/10/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c ============================================================================== --- stable/10/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c Thu Jun 16 01:57:16 2016 (r301942) +++ stable/10/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c Thu Jun 16 02:48:18 2016 (r301943) @@ -281,13 +281,16 @@ static int hn_use_if_start = 0; SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN, &hn_use_if_start, 0, "Use if_start TX method"); -static int hn_ring_cnt = 1; -SYSCTL_INT(_hw_hn, OID_AUTO, ring_cnt, CTLFLAG_RDTUN, - &hn_ring_cnt, 0, "# of TX/RX rings to used"); - -static int hn_single_tx_ring = 1; 
-SYSCTL_INT(_hw_hn, OID_AUTO, single_tx_ring, CTLFLAG_RDTUN, - &hn_single_tx_ring, 0, "Use one TX ring"); +static int hn_chan_cnt = 1; +SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN, + &hn_chan_cnt, 0, + "# of channels to use; each channel has one RX ring and one TX ring"); + +static int hn_tx_ring_cnt = 1; +SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN, + &hn_tx_ring_cnt, 0, "# of TX rings to use"); + +static u_int hn_cpu_index; /* * Forward declarations @@ -327,6 +330,7 @@ static int hn_encap(struct hn_tx_ring *, static void hn_create_rx_data(struct hn_softc *sc, int); static void hn_destroy_rx_data(struct hn_softc *sc); static void hn_set_tx_chimney_size(struct hn_softc *, int); +static void hn_channel_attach(struct hn_softc *, struct hv_vmbus_channel *); static int hn_transmit(struct ifnet *, struct mbuf *); static void hn_xmit_qflush(struct ifnet *); @@ -454,37 +458,46 @@ netvsc_attach(device_t dev) ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER); ifp->if_softc = sc; + if_initname(ifp, device_get_name(dev), device_get_unit(dev)); - ring_cnt = hn_ring_cnt; - if (ring_cnt <= 0 || ring_cnt >= mp_ncpus) + /* + * Figure out the # of RX rings (ring_cnt) and the # of TX rings + * to use (tx_ring_cnt). + * + * NOTE: + * The # of RX rings to use is same as the # of channels to use. + */ + ring_cnt = hn_chan_cnt; + if (ring_cnt <= 0 || ring_cnt > mp_ncpus) ring_cnt = mp_ncpus; - tx_ring_cnt = ring_cnt; - if (hn_single_tx_ring || hn_use_if_start) { - /* - * - Explicitly asked to use single TX ring. - * - ifnet.if_start is used; ifnet.if_start only needs - * one TX ring. - */ + tx_ring_cnt = hn_tx_ring_cnt; + if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) + tx_ring_cnt = ring_cnt; + if (hn_use_if_start) { + /* ifnet.if_start only needs one TX ring. */ tx_ring_cnt = 1; } + + /* + * Set the leader CPU for channels. 
+ */ + sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; + error = hn_create_tx_data(sc, tx_ring_cnt); if (error) goto failed; - hn_create_rx_data(sc, ring_cnt); /* * Associate the first TX/RX ring w/ the primary channel. */ chan = device_ctx->channel; - chan->hv_chan_rxr = &sc->hn_rx_ring[0]; - chan->hv_chan_txr = &sc->hn_tx_ring[0]; - sc->hn_tx_ring[0].hn_chan = chan; - - if_initname(ifp, device_get_name(dev), device_get_unit(dev)); - ifp->if_dunit = unit; - ifp->if_dname = NETVSC_DEVNAME; + KASSERT(HV_VMBUS_CHAN_ISPRIMARY(chan), ("not primary channel")); + KASSERT(chan->offer_msg.offer.sub_channel_index == 0, + ("primary channel subidx %u", + chan->offer_msg.offer.sub_channel_index)); + hn_channel_attach(sc, chan); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = hn_ioctl; @@ -522,10 +535,18 @@ netvsc_attach(device_t dev) error = hv_rf_on_device_add(device_ctx, &device_info, ring_cnt); if (error) goto failed; + KASSERT(sc->net_dev->num_channel > 0 && + sc->net_dev->num_channel <= sc->hn_rx_ring_inuse, + ("invalid channel count %u, should be less than %d", + sc->net_dev->num_channel, sc->hn_rx_ring_inuse)); - /* TODO: vRSS */ - sc->hn_tx_ring_inuse = 1; - sc->hn_rx_ring_inuse = 1; + /* + * Set the # of TX/RX rings that could be used according to + * the # of channels that host offered. 
+ */ + if (sc->hn_tx_ring_inuse > sc->net_dev->num_channel) + sc->hn_tx_ring_inuse = sc->net_dev->num_channel; + sc->hn_rx_ring_inuse = sc->net_dev->num_channel; device_printf(dev, "%d TX ring, %d RX ring\n", sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); @@ -730,7 +751,7 @@ hn_txdesc_hold(struct hn_txdesc *txd) } static void -hn_tx_done(void *xpkt) +hn_tx_done(struct hv_vmbus_channel *chan, void *xpkt) { netvsc_packet *packet = xpkt; struct hn_txdesc *txd; @@ -740,6 +761,11 @@ hn_tx_done(void *xpkt) packet->compl.send.send_completion_tid; txr = txd->txr; + KASSERT(txr->hn_chan == chan, + ("channel mismatch, on channel%u, should be channel%u", + chan->offer_msg.offer.sub_channel_index, + txr->hn_chan->offer_msg.offer.sub_channel_index)); + txr->hn_has_txeof = 1; hn_txdesc_put(txr, txd); } @@ -1025,6 +1051,7 @@ again: if (txd->m->m_flags & M_MCAST) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); } + txr->hn_pkts++; } hn_txdesc_put(txr, txd); @@ -1357,6 +1384,7 @@ skip: */ ifp->if_ipackets++; + rxr->hn_pkts++; if ((ifp->if_capenable & IFCAP_LRO) && do_lro) { #if defined(INET) || defined(INET6) @@ -2122,6 +2150,13 @@ hn_create_rx_data(struct hn_softc *sc, i #endif #endif /* INET || INET6 */ + ctx = device_get_sysctl_ctx(dev); + child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); + + /* Create dev.hn.UNIT.rx sysctl tree */ + sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", + CTLFLAG_RD, 0, ""); + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; @@ -2149,10 +2184,27 @@ hn_create_rx_data(struct hn_softc *sc, i rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; #endif #endif /* INET || INET6 */ - } - ctx = device_get_sysctl_ctx(dev); - child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); + if (sc->hn_rx_sysctl_tree != NULL) { + char name[16]; + + /* + * Create per RX ring sysctl tree: + * dev.hn.UNIT.rx.RINGID + */ + snprintf(name, sizeof(name), "%d", i); + rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, + 
SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), + OID_AUTO, name, CTLFLAG_RD, 0, ""); + + if (rxr->hn_rx_sysctl_tree != NULL) { + SYSCTL_ADD_ULONG(ctx, + SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), + OID_AUTO, "packets", CTLFLAG_RW, + &rxr->hn_pkts, "# of packets received"); + } + } + } SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", CTLTYPE_U64 | CTLFLAG_RW, sc, @@ -2419,6 +2471,9 @@ hn_create_tx_ring(struct hn_softc *sc, i CTLFLAG_RD, &txr->hn_oactive, 0, "over active"); } + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", + CTLFLAG_RW, &txr->hn_pkts, + "# of packets transmitted"); } } @@ -2783,6 +2838,55 @@ hn_xmit_txeof_taskfunc(void *xtxr, int p } static void +hn_channel_attach(struct hn_softc *sc, struct hv_vmbus_channel *chan) +{ + struct hn_rx_ring *rxr; + int idx; + + idx = chan->offer_msg.offer.sub_channel_index; + + KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, + ("invalid channel index %d, should > 0 && < %d", + idx, sc->hn_rx_ring_inuse)); + rxr = &sc->hn_rx_ring[idx]; + KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, + ("RX ring %d already attached", idx)); + rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; + + chan->hv_chan_rxr = rxr; + if_printf(sc->hn_ifp, "link RX ring %d to channel%u\n", + idx, chan->offer_msg.child_rel_id); + + if (idx < sc->hn_tx_ring_inuse) { + struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; + + KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, + ("TX ring %d already attached", idx)); + txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; + + chan->hv_chan_txr = txr; + txr->hn_chan = chan; + if_printf(sc->hn_ifp, "link TX ring %d to channel%u\n", + idx, chan->offer_msg.child_rel_id); + } + + /* Bind channel to a proper CPU */ + vmbus_channel_cpu_set(chan, (sc->hn_cpu + idx) % mp_ncpus); +} + +void +netvsc_subchan_callback(struct hn_softc *sc, struct hv_vmbus_channel *chan) +{ + + KASSERT(!HV_VMBUS_CHAN_ISPRIMARY(chan), + ("subchannel callback on primary channel")); + KASSERT(chan->offer_msg.offer.sub_channel_index > 0, + ("invalid channel 
subidx %u", + chan->offer_msg.offer.sub_channel_index)); + hn_channel_attach(sc, chan); +} + +static void hn_tx_taskq_create(void *arg __unused) { if (!hn_share_tx_taskq) Modified: stable/10/sys/dev/hyperv/netvsc/hv_rndis.h ============================================================================== --- stable/10/sys/dev/hyperv/netvsc/hv_rndis.h Thu Jun 16 01:57:16 2016 (r301942) +++ stable/10/sys/dev/hyperv/netvsc/hv_rndis.h Thu Jun 16 02:48:18 2016 (r301943) @@ -167,6 +167,14 @@ #define RNDIS_OID_GEN_MACHINE_NAME 0x0001021A #define RNDIS_OID_GEN_RNDIS_CONFIG_PARAMETER 0x0001021B +/* + * For receive side scale + */ +/* Query only */ +#define RNDIS_OID_GEN_RSS_CAPABILITIES 0x00010203 +/* Query and set */ +#define RNDIS_OID_GEN_RSS_PARAMETERS 0x00010204 + #define RNDIS_OID_GEN_XMIT_OK 0x00020101 #define RNDIS_OID_GEN_RCV_OK 0x00020102 #define RNDIS_OID_GEN_XMIT_ERROR 0x00020103 @@ -1060,6 +1068,8 @@ struct hv_vmbus_channel; int netvsc_recv(struct hv_vmbus_channel *chan, netvsc_packet *packet, rndis_tcp_ip_csum_info *csum_info); void netvsc_channel_rollup(struct hv_vmbus_channel *chan); +void netvsc_subchan_callback(struct hn_softc *sc, + struct hv_vmbus_channel *chan); void* hv_set_rppi_data(rndis_msg *rndis_mesg, uint32_t rppi_size, Modified: stable/10/sys/dev/hyperv/netvsc/hv_rndis_filter.c ============================================================================== --- stable/10/sys/dev/hyperv/netvsc/hv_rndis_filter.c Thu Jun 16 01:57:16 2016 (r301942) +++ stable/10/sys/dev/hyperv/netvsc/hv_rndis_filter.c Thu Jun 16 02:48:18 2016 (r301943) @@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$"); #include <vm/pmap.h> #include <dev/hyperv/include/hyperv.h> +#include <dev/hyperv/vmbus/hv_vmbus_priv.h> #include "hv_net_vsc.h" #include "hv_rndis.h" #include "hv_rndis_filter.h" @@ -69,8 +70,8 @@ static int hv_rf_set_packet_filter(rndi static int hv_rf_init_device(rndis_device *device); static int hv_rf_open_device(rndis_device *device); static int hv_rf_close_device(rndis_device 
*device); -static void hv_rf_on_send_request_completion(void *context); -static void hv_rf_on_send_request_halt_completion(void *context); +static void hv_rf_on_send_request_completion(struct hv_vmbus_channel *, void *context); +static void hv_rf_on_send_request_halt_completion(struct hv_vmbus_channel *, void *context); int hv_rf_send_offload_request(struct hv_device *device, rndis_offload_params *offloads); @@ -224,6 +225,8 @@ hv_rf_send_request(rndis_device *device, { int ret; netvsc_packet *packet; + netvsc_dev *net_dev = device->net_dev; + int send_buf_section_idx; /* Set up the packet to send it */ packet = &request->pkt; @@ -238,6 +241,20 @@ hv_rf_send_request(rndis_device *device, packet->page_buffers[0].offset = (unsigned long)&request->request_msg & (PAGE_SIZE - 1); + if (packet->page_buffers[0].offset + + packet->page_buffers[0].length > PAGE_SIZE) { + packet->page_buf_count = 2; + packet->page_buffers[0].length = + PAGE_SIZE - packet->page_buffers[0].offset; + packet->page_buffers[1].pfn = + hv_get_phys_addr((char*)&request->request_msg + + packet->page_buffers[0].length) >> PAGE_SHIFT; + packet->page_buffers[1].offset = 0; + packet->page_buffers[1].length = + request->request_msg.msg_len - + packet->page_buffers[0].length; + } + packet->compl.send.send_completion_context = request; /* packet */ if (message_type != REMOTE_NDIS_HALT_MSG) { packet->compl.send.on_send_completion = @@ -247,10 +264,25 @@ hv_rf_send_request(rndis_device *device, hv_rf_on_send_request_halt_completion; } packet->compl.send.send_completion_tid = (unsigned long)device; - packet->send_buf_section_idx = - NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX; + if (packet->tot_data_buf_len < net_dev->send_section_size) { + send_buf_section_idx = hv_nv_get_next_send_section(net_dev); + if (send_buf_section_idx != + NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX) { + char *dest = ((char *)net_dev->send_buf + + send_buf_section_idx * net_dev->send_section_size); + + memcpy(dest, &request->request_msg, 
request->request_msg.msg_len); + packet->send_buf_section_idx = send_buf_section_idx; + packet->send_buf_section_size = packet->tot_data_buf_len; + packet->page_buf_count = 0; + goto sendit; + } + /* Failed to allocate chimney send buffer; move on */ + } + packet->send_buf_section_idx = NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX; packet->send_buf_section_size = 0; +sendit: ret = hv_nv_on_send(device->net_dev->dev->channel, packet); return (ret); @@ -528,6 +560,19 @@ hv_rf_query_device(rndis_device *device, query->info_buffer_length = 0; query->device_vc_handle = 0; + if (oid == RNDIS_OID_GEN_RSS_CAPABILITIES) { + struct rndis_recv_scale_cap *cap; + + request->request_msg.msg_len += + sizeof(struct rndis_recv_scale_cap); + query->info_buffer_length = sizeof(struct rndis_recv_scale_cap); + cap = (struct rndis_recv_scale_cap *)((unsigned long)query + + query->info_buffer_offset); + cap->hdr.type = RNDIS_OBJECT_TYPE_RSS_CAPABILITIES; + cap->hdr.rev = RNDIS_RECEIVE_SCALE_CAPABILITIES_REVISION_2; + cap->hdr.size = sizeof(struct rndis_recv_scale_cap); + } + ret = hv_rf_send_request(device, request, REMOTE_NDIS_QUERY_MSG); if (ret != 0) { /* Fixme: printf added */ @@ -582,6 +627,114 @@ hv_rf_query_device_link_status(rndis_dev RNDIS_OID_GEN_MEDIA_CONNECT_STATUS, &device->link_status, &size)); } +static uint8_t netvsc_hash_key[HASH_KEYLEN] = { + 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, + 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, + 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, + 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, + 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa +}; + +/* + * RNDIS set vRSS parameters + */ +static int +hv_rf_set_rss_param(rndis_device *device, int num_queue) +{ + rndis_request *request; + rndis_set_request *set; + rndis_set_complete *set_complete; + rndis_recv_scale_param *rssp; + uint32_t extlen = sizeof(rndis_recv_scale_param) + + (4 * ITAB_NUM) + HASH_KEYLEN; + uint32_t *itab, status; + uint8_t *keyp; + int i, ret; + + + request = 
hv_rndis_request(device, REMOTE_NDIS_SET_MSG, + RNDIS_MESSAGE_SIZE(rndis_set_request) + extlen); + if (request == NULL) { + if (bootverbose) + printf("Netvsc: No memory to set vRSS parameters.\n"); + ret = -1; + goto cleanup; + } + + set = &request->request_msg.msg.set_request; + set->oid = RNDIS_OID_GEN_RSS_PARAMETERS; + set->info_buffer_length = extlen; + set->info_buffer_offset = sizeof(rndis_set_request); + set->device_vc_handle = 0; + + /* Fill out the rssp parameter structure */ + rssp = (rndis_recv_scale_param *)(set + 1); + rssp->hdr.type = RNDIS_OBJECT_TYPE_RSS_PARAMETERS; + rssp->hdr.rev = RNDIS_RECEIVE_SCALE_PARAMETERS_REVISION_2; + rssp->hdr.size = sizeof(rndis_recv_scale_param); + rssp->flag = 0; + rssp->hashinfo = RNDIS_HASH_FUNC_TOEPLITZ | RNDIS_HASH_IPV4 | + RNDIS_HASH_TCP_IPV4 | RNDIS_HASH_IPV6 | RNDIS_HASH_TCP_IPV6; + rssp->indirect_tabsize = 4 * ITAB_NUM; + rssp->indirect_taboffset = sizeof(rndis_recv_scale_param); + rssp->hashkey_size = HASH_KEYLEN; + rssp->hashkey_offset = rssp->indirect_taboffset + + rssp->indirect_tabsize; + + /* Set indirection table entries */ + itab = (uint32_t *)(rssp + 1); + for (i = 0; i < ITAB_NUM; i++) + itab[i] = i % num_queue; + + /* Set hash key values */ + keyp = (uint8_t *)((unsigned long)rssp + rssp->hashkey_offset); + for (i = 0; i < HASH_KEYLEN; i++) + keyp[i] = netvsc_hash_key[i]; + + ret = hv_rf_send_request(device, request, REMOTE_NDIS_SET_MSG); + if (ret != 0) { + goto cleanup; + } + + /* + * Wait for the response from the host. Another thread will signal + * us when the response has arrived. In the failure case, + * sema_timedwait() returns a non-zero status after waiting 5 seconds. 
+ */ + ret = sema_timedwait(&request->wait_sema, 5 * hz); + if (ret == 0) { + /* Response received, check status */ + set_complete = &request->response_msg.msg.set_complete; + status = set_complete->status; + if (status != RNDIS_STATUS_SUCCESS) { + /* Bad response status, return error */ + if (bootverbose) + printf("Netvsc: Failed to set vRSS " + "parameters.\n"); + ret = -2; + } else { + if (bootverbose) + printf("Netvsc: Successfully set vRSS " + "parameters.\n"); + } + } else { + /* + * We cannot deallocate the request since we may still + * receive a send completion for it. + */ + printf("Netvsc: vRSS set timeout, id = %u, ret = %d\n", + request->request_msg.msg.init_request.request_id, ret); + goto exit; + } + +cleanup: + if (request != NULL) { + hv_put_rndis_request(device, request); + } +exit: + return (ret); +} + /* * RNDIS filter set packet filter * Sends an rndis request with the new filter, then waits for a response @@ -817,12 +970,15 @@ hv_rf_close_device(rndis_device *device) */ int hv_rf_on_device_add(struct hv_device *device, void *additl_info, - int nchan __unused) + int nchan) { int ret; netvsc_dev *net_dev; rndis_device *rndis_dev; + nvsp_msg *init_pkt; rndis_offload_params offloads; + struct rndis_recv_scale_cap rsscaps; + uint32_t rsscaps_size = sizeof(struct rndis_recv_scale_cap); netvsc_device_info *dev_info = (netvsc_device_info *)additl_info; device_t dev = device->device; @@ -888,6 +1044,67 @@ hv_rf_on_device_add(struct hv_device *de dev_info->link_state = rndis_dev->link_status; + net_dev->num_channel = 1; + if (net_dev->nvsp_version < NVSP_PROTOCOL_VERSION_5 || nchan == 1) + return (0); + + memset(&rsscaps, 0, rsscaps_size); + ret = hv_rf_query_device(rndis_dev, + RNDIS_OID_GEN_RSS_CAPABILITIES, + &rsscaps, &rsscaps_size); + if ((ret != 0) || (rsscaps.num_recv_que < 2)) { + device_printf(dev, "hv_rf_query_device failed or " + "rsscaps.num_recv_que < 2 \n"); + goto out; + } + device_printf(dev, "channel, offered %u, requested %d\n", + 
rsscaps.num_recv_que, nchan); + if (nchan > rsscaps.num_recv_que) + nchan = rsscaps.num_recv_que; + net_dev->num_channel = nchan; + + if (net_dev->num_channel == 1) { + device_printf(dev, "net_dev->num_channel == 1 under VRSS\n"); + goto out; + } + + /* request host to create sub channels */ + init_pkt = &net_dev->channel_init_packet; + memset(init_pkt, 0, sizeof(nvsp_msg)); + + init_pkt->hdr.msg_type = nvsp_msg5_type_subchannel; + init_pkt->msgs.vers_5_msgs.subchannel_request.op = + NVSP_SUBCHANNE_ALLOCATE; + init_pkt->msgs.vers_5_msgs.subchannel_request.num_subchannels = + net_dev->num_channel - 1; + + ret = hv_vmbus_channel_send_packet(device->channel, init_pkt, + sizeof(nvsp_msg), (uint64_t)(uintptr_t)init_pkt, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, + HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + if (ret != 0) { + device_printf(dev, "Fail to allocate subchannel\n"); + goto out; + } + + sema_wait(&net_dev->channel_init_sema); + + if (init_pkt->msgs.vers_5_msgs.subchn_complete.status != *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201606160248.u5G2mIOP039213>