Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 15 Apr 2020 11:56:02 +0000
From:      "whu (Wei Hu)" <phabric-noreply@FreeBSD.org>
To:        freebsd-net@freebsd.org
Subject:   [Differential] D24061: Hyper-V socket implementation for FreeBSD guest
Message-ID:  <79d16d788f9a405bf42820916438b2a4@localhost.localdomain>
In-Reply-To: <differential-rev-PHID-DREV-jw7sxemgfjxsdvkov4cn-req@reviews.freebsd.org>
References:  <differential-rev-PHID-DREV-jw7sxemgfjxsdvkov4cn-req@reviews.freebsd.org>

next in thread | previous in thread | raw e-mail | index | archive | help

[-- Attachment #1 --]
whu updated this revision to Diff 70595.
whu edited the summary of this revision.
whu added reviewers: decui_microsoft.com, freebsd-net-list.
whu added a comment.


  Out for broader review

REPOSITORY
  rS FreeBSD src repository

CHANGES SINCE LAST UPDATE
  https://reviews.freebsd.org/D24061?vs=69478&id=70595

CHANGES SINCE LAST ACTION
  https://reviews.freebsd.org/D24061/new/

REVISION DETAIL
  https://reviews.freebsd.org/D24061

AFFECTED FILES
  sys/conf/files.x86
  sys/dev/hyperv/hvsock/hv_sock.c
  sys/dev/hyperv/hvsock/hv_sock.h
  sys/dev/hyperv/include/vmbus.h
  sys/dev/hyperv/vmbus/vmbus.c
  sys/dev/hyperv/vmbus/vmbus_br.c
  sys/dev/hyperv/vmbus/vmbus_brvar.h
  sys/dev/hyperv/vmbus/vmbus_chan.c
  sys/dev/hyperv/vmbus/vmbus_chanvar.h
  sys/dev/hyperv/vmbus/vmbus_reg.h
  sys/modules/hyperv/Makefile
  sys/modules/hyperv/hvsock/Makefile
  sys/sys/socket.h

EMAIL PREFERENCES
  https://reviews.freebsd.org/settings/panel/emailpreferences/

To: whu, decui_microsoft.com, freebsd-net-list
Cc: greg_unrelenting.technology, imp

[-- Attachment #2 --]
diff --git a/sys/sys/socket.h b/sys/sys/socket.h
--- a/sys/sys/socket.h
+++ b/sys/sys/socket.h
@@ -265,7 +265,8 @@
 #define	AF_IEEE80211	37		/* IEEE 802.11 protocol */
 #define	AF_INET_SDP	40		/* OFED Socket Direct Protocol ipv4 */
 #define	AF_INET6_SDP	42		/* OFED Socket Direct Protocol ipv6 */
-#define	AF_MAX		42
+#define	AF_HYPERV	43		/* HyperV sockets */
+#define	AF_MAX		43
 /*
  * When allocating a new AF_ constant, please only allocate
  * even numbered constants for FreeBSD until 134 as odd numbered AF_
@@ -273,7 +274,6 @@
  */
 #define AF_VENDOR00 39
 #define AF_VENDOR01 41
-#define AF_VENDOR02 43
 #define AF_VENDOR03 45
 #define AF_VENDOR04 47
 #define AF_VENDOR05 49
diff --git a/sys/modules/hyperv/hvsock/Makefile b/sys/modules/hyperv/hvsock/Makefile
--- a/sys/modules/hyperv/hvsock/Makefile
+++ b/sys/modules/hyperv/hvsock/Makefile
@@ -0,0 +1,13 @@
+# $FreeBSD$
+
+.PATH:  ${SRCTOP}/sys/dev/hyperv/hvsock
+
+KMOD=	hv_sock
+SRCS=	hv_sock.c
+SRCS+=	hv_sock.h
+
+CFLAGS+= -I${SRCTOP}/sys/dev/hyperv/include	\
+	 -I${SRCTOP}/sys/dev/hyperv/vmbus	\
+	 -I${SRCTOP}/sys/dev/hyperv/hvsock
+
+.include <bsd.kmod.mk>
diff --git a/sys/modules/hyperv/Makefile b/sys/modules/hyperv/Makefile
--- a/sys/modules/hyperv/Makefile
+++ b/sys/modules/hyperv/Makefile
@@ -1,5 +1,5 @@
 # $FreeBSD$
 
-SUBDIR = vmbus netvsc storvsc utilities
+SUBDIR = vmbus netvsc storvsc utilities hvsock
 
 .include <bsd.subdir.mk>
diff --git a/sys/dev/hyperv/vmbus/vmbus_reg.h b/sys/dev/hyperv/vmbus/vmbus_reg.h
--- a/sys/dev/hyperv/vmbus/vmbus_reg.h
+++ b/sys/dev/hyperv/vmbus/vmbus_reg.h
@@ -127,7 +127,54 @@
 	 */
 	volatile uint32_t	br_imask;
 
-	uint8_t			br_rsvd[4084];
+	/*
+	 * WS2012/Win8 and later versions of Hyper-V implement interrupt
+	 * driven flow management. The feature bit feat_pending_snd_sz
+	 * is set by the host on the host->guest buffer ring, and by the
+	 * guest on the guest->host buffer ring.
+	 *
+	 * The meaning of the feature bit is a bit complex in that it has
+	 * semantics that apply to both buffer rings.  If the guest sets
+	 * the feature bit in the guest->host buffer ring, the guest is
+	 * telling the host that:
+	 * 1) It will set the br_pending_snd_sz field in the guest->host buffer
+	 *    ring when it is waiting for space to become available, and
+	 * 2) It will read the br_pending_snd_sz field in the host->guest
+	 *    buffer ring and interrupt the host when it frees enough space.
+	 *
+	 * Similarly, if the host sets the feature bit in the host->guest
+	 * buffer ring, the host is telling the guest that:
+	 * 1) It will set the br_pending_snd_sz field in the host->guest
+	 *    buffer ring when it is waiting for space to become available, and
+	 * 2) It will read the br_pending_snd_sz field in the guest->host
+	 *    buffer ring and interrupt the guest when it frees enough space.
+	 *
+	 * If either the guest or host does not set the feature bit that it
+	 * owns, that guest or host must do polling if it encounters a full
+	 * ring buffer, and not signal the other end with an interrupt.
+	 */
+	volatile uint32_t	br_pending_snd_sz;
+	uint32_t		br_rsvd1[12];
+	union	{
+		struct {
+			uint32_t feat_pending_snd_sz:1;
+		};
+		uint32_t value;
+	} br_feature_bits;
+
+	/* Padding to PAGE_SIZE */
+	uint8_t			br_rsvd2[4020];
+
+	/*
+	 * Total guest to host interrupt count
+	 * - For rx ring, this counts the guest signaling the host when this
+	 * rx ring changes from full to not full.
+	 *
+	 * - For tx ring, this counts the guest signaling the host when this
+	 * tx ring changes from empty to non-empty.
+	 */
+	uint64_t		br_g2h_intr_cnt;
+
 	uint8_t			br_data[];
 } __packed;
 CTASSERT(sizeof(struct vmbus_bufring) == PAGE_SIZE);
@@ -196,7 +243,14 @@
 #define VMBUS_CHANMSG_TYPE_CONNECT		14	/* REQ */
 #define VMBUS_CHANMSG_TYPE_CONNECT_RESP		15	/* RESP */
 #define VMBUS_CHANMSG_TYPE_DISCONNECT		16	/* REQ */
-#define VMBUS_CHANMSG_TYPE_MAX			22
+#define VMBUS_CHANMSG_TYPE_17			17
+#define VMBUS_CHANMSG_TYPE_18			18
+#define VMBUS_CHANMSG_TYPE_19			19
+#define VMBUS_CHANMSG_TYPE_20			20
+#define VMBUS_CHANMSG_TYPE_TL_CONN		21	/* REQ */
+#define VMBUS_CHANMSG_TYPE_22			22
+#define VMBUS_CHANMSG_TYPE_TL_RESULT		23	/* RESP */
+#define VMBUS_CHANMSG_TYPE_MAX			24
 
 struct vmbus_chanmsg_hdr {
 	uint32_t	chm_type;	/* VMBUS_CHANMSG_TYPE_ */
@@ -229,6 +283,15 @@
 	struct vmbus_chanmsg_hdr chm_hdr;
 } __packed;
 
+/* VMBUS_CHANMSG_TYPE_TL_CONN */
+/* Hyper-V socket guest connect request */
+struct vmbus_chanmsg_tl_connect {
+	struct vmbus_chanmsg_hdr chm_hdr;
+	struct hyperv_guid guest_endpoint_id;
+	struct hyperv_guid host_service_id;
+} __packed;
+
+
 /* VMBUS_CHANMSG_TYPE_CHOPEN */
 struct vmbus_chanmsg_chopen {
 	struct vmbus_chanmsg_hdr chm_hdr;
@@ -310,6 +373,12 @@
 	uint32_t	chm_chanid;
 } __packed;
 
+/* Size of the user defined data buffer for non-pipe offers */
+#define VMBUS_CHANMSG_CHOFFER_UDATA_SIZE		120
+
+/* Size of the user defined data buffer for pipe offers. */
+#define VMBUS_CHANMSG_CHOFFER_UDATA_PIPE_SIZE		116
+
 /* VMBUS_CHANMSG_TYPE_CHOFFER */
 struct vmbus_chanmsg_choffer {
 	struct vmbus_chanmsg_hdr chm_hdr;
@@ -320,7 +389,26 @@
 	uint32_t	chm_svrctx_sz;
 	uint16_t	chm_chflags;
 	uint16_t	chm_mmio_sz;	/* unit: MB */
-	uint8_t		chm_udata[120];
+
+	union {
+		/* Non-pipes */
+		struct {
+			uint8_t	user_def[VMBUS_CHANMSG_CHOFFER_UDATA_SIZE];
+		} std;
+		/*
+		 * Pipes:
+		 * The integrated pipe protocol is implemented on top
+		 * of the standard user-defined data. Pipe clients have
+		 * VMBUS_CHANMSG_CHOFFER_UDATA_PIPE_SIZE bytes left for
+		 * their own use.
+		 */
+		struct {
+			uint32_t pipe_mode;
+			uint8_t
+			    user_def[VMBUS_CHANMSG_CHOFFER_UDATA_PIPE_SIZE];
+		} pipe;
+	} chm_udata;
+
 	uint16_t	chm_subidx;
 	uint16_t	chm_rsvd;
 	uint32_t	chm_chanid;
@@ -331,6 +419,9 @@
 } __packed;
 CTASSERT(sizeof(struct vmbus_chanmsg_choffer) <= VMBUS_MSG_DSIZE_MAX);
 
+/* Server Flag */
+#define VMBUS_CHAN_TLNPI_PROVIDER_OFFER			0x2000
+
 #define VMBUS_CHOFFER_FLAG1_HASMNF	0x01
 
 #endif	/* !_VMBUS_REG_H_ */
diff --git a/sys/dev/hyperv/vmbus/vmbus_chanvar.h b/sys/dev/hyperv/vmbus/vmbus_chanvar.h
--- a/sys/dev/hyperv/vmbus/vmbus_chanvar.h
+++ b/sys/dev/hyperv/vmbus/vmbus_chanvar.h
@@ -149,6 +149,12 @@
 
 	int				ch_refs;
 
+	/*
+	 * These are for HyperV socket channel only
+	 */
+	bool				ch_is_hvs;
+	uint8_t				ch_hvs_conn_from_host;
+
 	struct sysctl_ctx_list		ch_sysctl_ctx;
 } __aligned(CACHE_LINE_SIZE);
 
diff --git a/sys/dev/hyperv/vmbus/vmbus_chan.c b/sys/dev/hyperv/vmbus/vmbus_chan.c
--- a/sys/dev/hyperv/vmbus/vmbus_chan.c
+++ b/sys/dev/hyperv/vmbus/vmbus_chan.c
@@ -127,10 +127,11 @@
 };
 
 /*
- * Notify host that there are data pending on our TX bufring.
+ * Notify host that there are data pending on our TX bufring or
+ * that we have consumed data from the RX bufring.
  */
 static __inline void
-vmbus_chan_signal_tx(const struct vmbus_channel *chan)
+vmbus_chan_signal(const struct vmbus_channel *chan)
 {
 	atomic_set_long(chan->ch_evtflag, chan->ch_evtflag_mask);
 	if (chan->ch_txflags & VMBUS_CHAN_TXF_HASMNF)
@@ -139,6 +140,22 @@
 		hypercall_signal_event(chan->ch_monprm_dma.hv_paddr);
 }
 
+static __inline void
+vmbus_chan_signal_tx(struct vmbus_channel *chan)
+{
+	chan->ch_txbr.txbr_intrcnt ++;
+
+	vmbus_chan_signal(chan);
+}
+
+static __inline void
+vmbus_chan_signal_rx(struct vmbus_channel *chan)
+{
+	chan->ch_rxbr.rxbr_intrcnt ++;
+
+	vmbus_chan_signal(chan);
+}
+
 static void
 vmbus_chan_ins_prilist(struct vmbus_softc *sc, struct vmbus_channel *chan)
 {
@@ -1012,7 +1029,60 @@
 	taskqueue_drain(chan->ch_tq, &chan->ch_task);
 }
 
+uint32_t
+vmbus_chan_write_available(struct vmbus_channel *chan)
+{
+	return (vmbus_txbr_available(&chan->ch_txbr));
+}
+
+bool
+vmbus_chan_write_signal(struct vmbus_channel *chan,
+    int32_t min_signal_size)
+{
+	if (min_signal_size >= 0 &&
+	    vmbus_chan_write_available(chan) > min_signal_size) {
+		return false;
+	}
+
+	if (!vmbus_txbr_get_imask(&chan->ch_txbr)) {
+		/* txbr imask is not set, signal the reader */
+		vmbus_chan_signal_tx(chan);
+		return true;
+	}
+
+	return false;
+}
+
+void
+vmbus_chan_set_pending_send_size(struct vmbus_channel *chan,
+    uint32_t size)
+{
+	if (chan)
+		vmbus_txbr_set_pending_snd_sz(&chan->ch_txbr, size);
+}
+
 int
+vmbus_chan_iov_send(struct vmbus_channel *chan,
+    const struct iovec iov[], int iovlen,
+    vmbus_br_copy_callback_t cb, void *cbarg)
+{
+	int error;
+	boolean_t send_evt;
+
+	if (iovlen == 0)
+		return (0);
+
+	error = vmbus_txbr_write_call(&chan->ch_txbr, iov, iovlen,
+	    cb, cbarg, &send_evt);
+
+	if (!error && send_evt) {
+		vmbus_chan_signal_tx(chan);
+	}
+
+	return error;
+}
+
+int
 vmbus_chan_send(struct vmbus_channel *chan, uint16_t type, uint16_t flags,
     void *data, int dlen, uint64_t xactid)
 {
@@ -1211,6 +1281,78 @@
 	return (0);
 }
 
+uint32_t
+vmbus_chan_read_available(struct vmbus_channel *chan)
+{
+	return (vmbus_rxbr_available(&chan->ch_rxbr));
+}
+
+/*
+ * This routine does:
+ *     - Advance the channel read index for 'advance' bytes
+ *     - Copy data_len bytes into the buffer pointed to by 'data'
+ * Return 0 on success, EINVAL for invalid arguments, or EAGAIN if the
+ * ring lacks enough data. On failure the buffer pointed to by 'data'
+ * is left intact and the channel read index is not advanced at all.
+ */
+int
+vmbus_chan_recv_peek(struct vmbus_channel *chan,
+    void *data, int data_len, uint32_t advance)
+{
+	int error;
+	boolean_t sig_event;
+
+	if (data == NULL || data_len <= 0)
+		return (EINVAL);
+
+	error = vmbus_rxbr_idxadv_peek(&chan->ch_rxbr,
+	    data, data_len, advance, &sig_event);
+
+	if (!error && sig_event) {
+		vmbus_chan_signal_rx(chan);
+	}
+
+	return (error);
+}
+
+/*
+ * This routine does:
+ *     - Advance the channel read index for 'advance' bytes
+ */
+int
+vmbus_chan_recv_idxadv(struct vmbus_channel *chan, uint32_t advance)
+{
+	int error;
+	boolean_t sig_event;
+
+	if (advance == 0)
+		return (EINVAL);
+
+	error = vmbus_rxbr_idxadv(&chan->ch_rxbr, advance, &sig_event);
+
+	if (!error && sig_event) {
+		vmbus_chan_signal_rx(chan);
+	}
+
+	return (error);
+}
+
+
+/*
+ * Caller should hold its own lock to serialize the ring buffer
+ * copy.
+ */
+int
+vmbus_chan_recv_peek_call(struct vmbus_channel *chan, int data_len,
+    uint32_t skip, vmbus_br_copy_callback_t cb, void *cbarg)
+{
+	if (!chan || data_len <= 0 || cb == NULL)
+		return (EINVAL);
+
+	return (vmbus_rxbr_peek_call(&chan->ch_rxbr, data_len, skip,
+	    cb, cbarg));
+}
+
 static void
 vmbus_chan_task(void *xchan, int pending __unused)
 {
@@ -1732,6 +1874,25 @@
 		    1 << (offer->chm_montrig % VMBUS_MONTRIG_LEN);
 	}
 
+	if (offer->chm_chflags & VMBUS_CHAN_TLNPI_PROVIDER_OFFER) {
+		/* This is HyperV socket channel */
+		chan->ch_is_hvs = true;
+		/* The first byte != 0 means the host initiated connection. */
+		chan->ch_hvs_conn_from_host =
+		    offer->chm_udata.pipe.user_def[0];
+
+		if (bootverbose) {
+			device_printf(sc->vmbus_dev,
+			    "chan%u is hyperv socket channel "
+			    "connected %s host\n",
+			    chan->ch_id,
+			    (chan->ch_hvs_conn_from_host != 0) ?
+			    "from" : "to");
+		}
+	} else {
+		chan->ch_is_hvs = false;
+	}
+
 	/*
 	 * Setup event flag.
 	 */
@@ -2047,9 +2208,32 @@
 		return false;
 }
 
-const struct hyperv_guid *
-vmbus_chan_guid_inst(const struct vmbus_channel *chan)
+bool
+vmbus_chan_is_hvs(const struct vmbus_channel *chan)
 {
+	return chan->ch_is_hvs;
+}
+
+bool
+vmbus_chan_is_hvs_conn_from_host(const struct vmbus_channel *chan)
+{
+	KASSERT(vmbus_chan_is_hvs(chan) == true,
+	    ("Not a HyperV Socket channel %u", chan->ch_id));
+	if (chan->ch_hvs_conn_from_host != 0)
+		return true;
+	else
+		return false;
+}
+
+struct hyperv_guid *
+vmbus_chan_guid_type(struct vmbus_channel *chan)
+{
+	return &chan->ch_guid_type;
+}
+
+struct hyperv_guid *
+vmbus_chan_guid_inst(struct vmbus_channel *chan)
+{
 	return &chan->ch_guid_inst;
 }
 
diff --git a/sys/dev/hyperv/vmbus/vmbus_brvar.h b/sys/dev/hyperv/vmbus/vmbus_brvar.h
--- a/sys/dev/hyperv/vmbus/vmbus_brvar.h
+++ b/sys/dev/hyperv/vmbus/vmbus_brvar.h
@@ -44,6 +44,10 @@
 #define vbr_windex		vbr->br_windex
 #define vbr_rindex		vbr->br_rindex
 #define vbr_imask		vbr->br_imask
+#define vbr_psndsz		vbr->br_pending_snd_sz
+#define vbr_fpsndsz		vbr->br_feature_bits.feat_pending_snd_sz
+#define vbr_fvalue		vbr->br_feature_bits.value
+#define vbr_intrcnt		vbr->br_g2h_intr_cnt
 #define vbr_data		vbr->br_data
 
 struct vmbus_rxbr {
@@ -54,6 +58,10 @@
 #define rxbr_windex		rxbr.vbr_windex
 #define rxbr_rindex		rxbr.vbr_rindex
 #define rxbr_imask		rxbr.vbr_imask
+#define rxbr_psndsz		rxbr.vbr_psndsz
+#define rxbr_fpsndsz		rxbr.vbr_fpsndsz
+#define rxbr_fvalue		rxbr.vbr_fvalue
+#define rxbr_intrcnt		rxbr.vbr_intrcnt
 #define rxbr_data		rxbr.vbr_data
 #define rxbr_dsize		rxbr.vbr_dsize
 
@@ -65,6 +73,10 @@
 #define txbr_windex		txbr.vbr_windex
 #define txbr_rindex		txbr.vbr_rindex
 #define txbr_imask		txbr.vbr_imask
+#define txbr_psndsz		txbr.vbr_psndsz
+#define txbr_fpsndsz		txbr.vbr_fpsndsz
+#define txbr_fvalue		txbr.vbr_fvalue
+#define txbr_intrcnt		txbr.vbr_intrcnt
 #define txbr_data		txbr.vbr_data
 #define txbr_dsize		txbr.vbr_dsize
 
@@ -118,8 +130,15 @@
 int		vmbus_rxbr_peek(struct vmbus_rxbr *rbr, void *data, int dlen);
 int		vmbus_rxbr_read(struct vmbus_rxbr *rbr, void *data, int dlen,
 		    uint32_t skip);
+int		vmbus_rxbr_idxadv(struct vmbus_rxbr *rbr, uint32_t idx_adv,
+		    boolean_t *need_sig);
+int		vmbus_rxbr_idxadv_peek(struct vmbus_rxbr *rbr, void *data,
+		    int dlen, uint32_t idx_adv, boolean_t *need_sig);
+int		vmbus_rxbr_peek_call(struct vmbus_rxbr *rbr, int dlen,
+		    uint32_t skip, vmbus_br_copy_callback_t cb, void *cbarg);
 void		vmbus_rxbr_intr_mask(struct vmbus_rxbr *rbr);
 uint32_t	vmbus_rxbr_intr_unmask(struct vmbus_rxbr *rbr);
+uint32_t	vmbus_rxbr_available(const struct vmbus_rxbr *rbr);
 
 void		vmbus_txbr_init(struct vmbus_txbr *tbr);
 void		vmbus_txbr_deinit(struct vmbus_txbr *tbr);
@@ -126,5 +145,13 @@
 void		vmbus_txbr_setup(struct vmbus_txbr *tbr, void *buf, int blen);
 int		vmbus_txbr_write(struct vmbus_txbr *tbr,
 		    const struct iovec iov[], int iovlen, boolean_t *need_sig);
+int		vmbus_txbr_write_call(struct vmbus_txbr *tbr,
+		    const struct iovec iov[], int iovlen,
+		    vmbus_br_copy_callback_t cb, void *cbarg,
+		    boolean_t *need_sig);
+uint32_t	vmbus_txbr_available(const struct vmbus_txbr *tbr);
+uint32_t	vmbus_txbr_get_imask(const struct vmbus_txbr *tbr);
+void		vmbus_txbr_set_pending_snd_sz(struct vmbus_txbr *tbr,
+		    uint32_t size);
 
 #endif  /* _VMBUS_BRVAR_H_ */
diff --git a/sys/dev/hyperv/vmbus/vmbus_br.c b/sys/dev/hyperv/vmbus/vmbus_br.c
--- a/sys/dev/hyperv/vmbus/vmbus_br.c
+++ b/sys/dev/hyperv/vmbus/vmbus_br.c
@@ -52,18 +52,23 @@
 vmbus_br_sysctl_state(SYSCTL_HANDLER_ARGS)
 {
 	const struct vmbus_br *br = arg1;
-	uint32_t rindex, windex, imask, ravail, wavail;
+	uint32_t rindex, windex, imask, psndsz, fvalue, ravail, wavail;
+	uint64_t intrcnt;
 	char state[256];
 
+	intrcnt = br->vbr_intrcnt;
 	rindex = br->vbr_rindex;
 	windex = br->vbr_windex;
 	imask = br->vbr_imask;
+	psndsz = br->vbr_psndsz;
+	fvalue = br->vbr_fvalue;
 	wavail = VMBUS_BR_WAVAIL(rindex, windex, br->vbr_dsize);
 	ravail = br->vbr_dsize - wavail;
 
 	snprintf(state, sizeof(state),
-	    "rindex:%u windex:%u imask:%u ravail:%u wavail:%u",
-	    rindex, windex, imask, ravail, wavail);
+	    "intrcnt:%ju rindex:%u windex:%u imask:%u psndsz:%u fvalue:%u "
+	    "ravail:%u wavail:%u", (uintmax_t)intrcnt,
+	    rindex, windex, imask, psndsz, fvalue, ravail, wavail);
 	return sysctl_handle_string(oidp, state, sizeof(state), req);
 }
 
@@ -76,9 +81,11 @@
 #define BR_STATE_RIDX	0
 #define BR_STATE_WIDX	1
 #define BR_STATE_IMSK	2
-#define BR_STATE_RSPC	3
-#define BR_STATE_WSPC	4
-#define BR_STATE_MAX	5
+#define BR_STATE_PSSZ	3
+#define BR_STATE_FVAL	4
+#define BR_STATE_RSPC	5
+#define BR_STATE_WSPC	6
+#define BR_STATE_MAX	7
 
 	const struct vmbus_br *br = arg1;
 	uint32_t rindex, windex, wavail, state[BR_STATE_MAX];
@@ -90,6 +97,8 @@
 	state[BR_STATE_RIDX] = rindex;
 	state[BR_STATE_WIDX] = windex;
 	state[BR_STATE_IMSK] = br->vbr_imask;
+	state[BR_STATE_PSSZ] = br->vbr_psndsz;
+	state[BR_STATE_FVAL] = br->vbr_fvalue;
 	state[BR_STATE_WSPC] = wavail;
 	state[BR_STATE_RSPC] = br->vbr_dsize - wavail;
 
@@ -140,6 +149,12 @@
 }
 
 uint32_t
+vmbus_rxbr_available(const struct vmbus_rxbr *rbr)
+{
+	return (vmbus_rxbr_avail(rbr));
+}
+
+uint32_t
 vmbus_rxbr_intr_unmask(struct vmbus_rxbr *rbr)
 {
 	rbr->rxbr_imask = 0;
@@ -178,6 +193,40 @@
 	vmbus_br_setup(&rbr->rxbr, buf, blen);
 }
 
+static __inline boolean_t
+vmbus_rxbr_need_signal(const struct vmbus_rxbr *rbr, uint32_t bytes_read)
+{
+	uint32_t pending_snd_sz, canwrite_size;
+
+	/* No need to signal if host doesn't want us to */
+	if (!rbr->rxbr_fpsndsz)
+		return false;
+
+	mb();
+
+	pending_snd_sz = rbr->rxbr_psndsz;
+	/* No need to signal if host sets pending_snd_sz to 0 */
+	if (!pending_snd_sz)
+		return false;
+
+	mb();
+
+	canwrite_size = rbr->rxbr_dsize - vmbus_rxbr_avail(rbr);
+
+	/* No need to signal if br already has enough space before read */
+	if (canwrite_size - bytes_read > pending_snd_sz)
+		return false;
+
+	/*
+	 * No need to signal if the ring still does not have the
+	 * space requested by the host.
+	 */
+	if (canwrite_size <= pending_snd_sz)
+		return false;
+
+	return true;
+}
+
 void
 vmbus_txbr_init(struct vmbus_txbr *tbr)
 {
@@ -194,8 +243,25 @@
 vmbus_txbr_setup(struct vmbus_txbr *tbr, void *buf, int blen)
 {
 	vmbus_br_setup(&tbr->txbr, buf, blen);
+
+	/* Set feature bit enabling flow control */
+	tbr->txbr_fpsndsz = 1;
 }
 
+uint32_t
+vmbus_txbr_get_imask(const struct vmbus_txbr *tbr)
+{
+	mb();
+
+	return(tbr->txbr_imask);
+}
+
+void
+vmbus_txbr_set_pending_snd_sz(struct vmbus_txbr *tbr, uint32_t size)
+{
+	tbr->txbr_psndsz = size;
+}
+
 /*
  * When we write to the ring buffer, check if the host needs to be
  * signaled.
@@ -260,7 +326,117 @@
 	return VMBUS_BR_IDXINC(windex, cplen, br_dsize);
 }
 
+static __inline uint32_t
+vmbus_txbr_copyto_call(const struct vmbus_txbr *tbr, uint32_t windex,
+    uint32_t cplen, vmbus_br_copy_callback_t cb, void *cbarg, int *ret)
+{
+	uint8_t *br_data = tbr->txbr_data;
+	uint32_t br_dsize = tbr->txbr_dsize;
+	int err = 0;
+
+	if (cplen > br_dsize - windex) {
+		uint32_t fraglen = br_dsize - windex;
+
+		/* Wrap-around detected */
+		err = cb((void *)(br_data + windex), fraglen, cbarg);
+		if (!err)
+			err = cb((void *)br_data, cplen - fraglen, cbarg);
+	} else {
+		err = cb((void *)(br_data + windex), cplen, cbarg);
+	}
+
+	*ret = err;
+
+	return VMBUS_BR_IDXINC(windex, cplen, br_dsize);
+}
+
+uint32_t
+vmbus_txbr_available(const struct vmbus_txbr *tbr)
+{
+	return (vmbus_txbr_avail(tbr));
+}
+
 /*
+ * NOTE:
+ * Not holding lock when calling user provided callback routine.
+ * Caller should hold lock to serialize ring buffer accesses.
+ */
+int
+vmbus_txbr_write_call(struct vmbus_txbr *tbr,
+    const struct iovec iov[], int iovlen,
+    vmbus_br_copy_callback_t cb, void *cbarg,
+    boolean_t *need_sig)
+{
+	uint32_t old_windex, windex, total;
+	uint64_t save_windex;
+	int i;
+	int cb_ret = 0;
+
+	total = 0;
+	for (i = 0; i < iovlen; i++)
+		total += iov[i].iov_len;
+	total += sizeof(save_windex);
+
+
+	/*
+	 * NOTE:
+	 * If this write is going to make br_windex same as br_rindex,
+	 * i.e. the available space for write is same as the write size,
+	 * we can't do it then, since br_windex == br_rindex means that
+	 * the bufring is empty.
+	 */
+	if (vmbus_txbr_avail(tbr) <= total) {
+		return (EAGAIN);
+	}
+
+	/* Save br_windex for later use */
+	old_windex = tbr->txbr_windex;
+
+	/*
+	 * Copy the scattered channel packet to the TX bufring.
+	 */
+	windex = old_windex;
+	for (i = 0; i < iovlen; i++) {
+		if (iov[i].iov_base != NULL) {
+			windex = vmbus_txbr_copyto(tbr, windex,
+			    iov[i].iov_base, iov[i].iov_len);
+		} else if (cb != NULL) {
+			windex = vmbus_txbr_copyto_call(tbr, windex,
+			    iov[i].iov_len, cb, cbarg, &cb_ret);
+			/*
+			 * If callback fails, return without updating
+			 * write index.
+			 */
+			if (cb_ret)
+				return (cb_ret);
+		}
+	}
+
+	mtx_lock_spin(&tbr->txbr_lock);
+
+	/*
+	 * Set the offset of the current channel packet.
+	 */
+	save_windex = ((uint64_t)old_windex) << 32;
+	windex = vmbus_txbr_copyto(tbr, windex, &save_windex,
+	    sizeof(save_windex));
+
+	/*
+	 * Update the write index _after_ the channel packet
+	 * is copied.
+	 */
+	__compiler_membar();
+	tbr->txbr_windex = windex;
+
+	mtx_unlock_spin(&tbr->txbr_lock);
+
+	if (need_sig)
+		*need_sig = vmbus_txbr_need_signal(tbr, old_windex);
+
+	return (0);
+}
+
+/*
  * Write scattered channel packet to TX bufring.
  *
  * The offset of this channel packet is written as a 64bits value
@@ -346,6 +522,27 @@
 	return VMBUS_BR_IDXINC(rindex, cplen, br_dsize);
 }
 
+static __inline uint32_t
+vmbus_rxbr_copyfrom_call(const struct vmbus_rxbr *rbr, uint32_t rindex,
+    int cplen, vmbus_br_copy_callback_t cb, void *cbarg)
+{
+	uint8_t *br_data = rbr->rxbr_data;
+	uint32_t br_dsize = rbr->rxbr_dsize;
+	int error = 0;
+
+	if (cplen > br_dsize - rindex) {
+		uint32_t fraglen = br_dsize - rindex;
+
+		/* Wrap-around detected. */
+		error = cb((void *)(br_data + rindex), fraglen, cbarg);
+		if (!error)
+			error = cb((void *)br_data, cplen - fraglen, cbarg);
+	} else {
+		error = cb((void *)(br_data + rindex), cplen, cbarg);
+	}
+	return (error);
+}
+
 int
 vmbus_rxbr_peek(struct vmbus_rxbr *rbr, void *data, int dlen)
 {
@@ -368,6 +565,121 @@
 
 /*
  * NOTE:
+ * We only hold spin lock to check the ring buffer space. It is
+ * released before calling user provided callback routine.
+ * Caller should hold lock to serialize ring buffer accesses.
+ */
+int
+vmbus_rxbr_peek_call(struct vmbus_rxbr *rbr, int dlen, uint32_t skip,
+    vmbus_br_copy_callback_t cb, void *cbarg)
+{
+	uint32_t rindex, br_dsize0 = rbr->rxbr_dsize;
+	int ret;
+
+	mtx_lock_spin(&rbr->rxbr_lock);
+	/*
+	 * The requested data + skip and the 64bits channel packet
+	 * offset should be there at least.
+	 */
+	if (vmbus_rxbr_avail(rbr) < skip + dlen + sizeof(uint64_t)) {
+		mtx_unlock_spin(&rbr->rxbr_lock);
+		return (EAGAIN);
+	}
+
+	rindex = VMBUS_BR_IDXINC(rbr->rxbr_rindex, skip, br_dsize0);
+	mtx_unlock_spin(&rbr->rxbr_lock);
+
+	ret = vmbus_rxbr_copyfrom_call(rbr, rindex, dlen, cb, cbarg);
+
+	return (ret);
+}
+
+/*
+ * NOTE:
+ * We assume idx_adv == sizeof(channel packet).
+ */
+int
+vmbus_rxbr_idxadv_peek(struct vmbus_rxbr *rbr, void *data, int dlen,
+    uint32_t idx_adv, boolean_t *need_sig)
+{
+	uint32_t rindex, br_dsize = rbr->rxbr_dsize;
+
+	mtx_lock_spin(&rbr->rxbr_lock);
+	/*
+	 * Make sure it has enough data to read.
+	 */
+	if (vmbus_rxbr_avail(rbr) < idx_adv + sizeof(uint64_t) + dlen) {
+		mtx_unlock_spin(&rbr->rxbr_lock);
+		return (EAGAIN);
+	}
+
+	if (idx_adv > 0) {
+		/*
+		 * Advance the read index first, including the channel's 64bit
+		 * previous write offset.
+		 */
+		rindex = VMBUS_BR_IDXINC(rbr->rxbr_rindex,
+		    idx_adv + sizeof(uint64_t), br_dsize);
+		__compiler_membar();
+		rbr->rxbr_rindex = rindex;
+	}
+
+	vmbus_rxbr_copyfrom(rbr, rbr->rxbr_rindex, data, dlen);
+
+	mtx_unlock_spin(&rbr->rxbr_lock);
+
+	if (need_sig) {
+		if (idx_adv > 0)
+			*need_sig =
+			    vmbus_rxbr_need_signal(rbr, idx_adv +
+			    sizeof(uint64_t));
+		else
+			*need_sig = false;
+	}
+
+	return (0);
+}
+
+/*
+ * NOTE:
+ * Just update the RX rb index.
+ */
+int
+vmbus_rxbr_idxadv(struct vmbus_rxbr *rbr, uint32_t idx_adv,
+    boolean_t *need_sig)
+{
+	uint32_t rindex, br_dsize = rbr->rxbr_dsize;
+
+	mtx_lock_spin(&rbr->rxbr_lock);
+	/*
+	 * Make sure it has enough space to advance.
+	 */
+	if (vmbus_rxbr_avail(rbr) < idx_adv + sizeof(uint64_t)) {
+		mtx_unlock_spin(&rbr->rxbr_lock);
+		return (EAGAIN);
+	}
+
+	/*
+	 * Advance the read index, including the channel's 64bit
+	 * previous write offset.
+	 */
+	rindex = VMBUS_BR_IDXINC(rbr->rxbr_rindex,
+	    idx_adv + sizeof(uint64_t), br_dsize);
+	__compiler_membar();
+	rbr->rxbr_rindex = rindex;
+
+	mtx_unlock_spin(&rbr->rxbr_lock);
+
+	if (need_sig) {
+		*need_sig =
+		    vmbus_rxbr_need_signal(rbr, idx_adv + sizeof(uint64_t));
+	}
+
+	return (0);
+}
+
+/*
+ * NOTE:
  * We assume (dlen + skip) == sizeof(channel packet).
  */
 int
diff --git a/sys/dev/hyperv/vmbus/vmbus.c b/sys/dev/hyperv/vmbus/vmbus.c
--- a/sys/dev/hyperv/vmbus/vmbus.c
+++ b/sys/dev/hyperv/vmbus/vmbus.c
@@ -365,12 +365,48 @@
 	uint32_t gpadl;
 
 again:
-	gpadl = atomic_fetchadd_int(&sc->vmbus_gpadl, 1); 
+	gpadl = atomic_fetchadd_int(&sc->vmbus_gpadl, 1);
 	if (gpadl == 0)
 		goto again;
 	return (gpadl);
 }
 
+/* Used for Hyper-V socket when guest client connects to host */
+int
+vmbus_req_tl_connect(struct hyperv_guid *guest_srv_id,
+    struct hyperv_guid *host_srv_id)
+{
+	struct vmbus_softc *sc = vmbus_get_softc();
+	struct vmbus_chanmsg_tl_connect *req;
+	struct vmbus_msghc *mh;
+	int error;
+
+	if (!sc)
+		return ENXIO;
+
+	mh = vmbus_msghc_get(sc, sizeof(*req));
+	if (mh == NULL) {
+		device_printf(sc->vmbus_dev,
+		    "can not get msg hypercall for tl connect\n");
+		return ENXIO;
+	}
+
+	req = vmbus_msghc_dataptr(mh);
+	req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_TL_CONN;
+	req->guest_endpoint_id = *guest_srv_id;
+	req->host_service_id = *host_srv_id;
+
+	error = vmbus_msghc_exec_noresult(mh);
+	vmbus_msghc_put(sc, mh);
+
+	if (error) {
+		device_printf(sc->vmbus_dev,
+		    "tl connect msg hypercall failed\n");
+	}
+
+	return error;
+}
+
 static int
 vmbus_connect(struct vmbus_softc *sc, uint32_t version)
 {
diff --git a/sys/dev/hyperv/include/vmbus.h b/sys/dev/hyperv/include/vmbus.h
--- a/sys/dev/hyperv/include/vmbus.h
+++ b/sys/dev/hyperv/include/vmbus.h
@@ -31,6 +31,7 @@
 
 #include <sys/param.h>
 #include <sys/bus.h>
+#include <sys/_iovec.h>
 
 /*
  * VMBUS version is 32 bit, upper 16 bit for major_number and lower
@@ -130,6 +131,7 @@
 struct taskqueue;
 
 typedef void	(*vmbus_chan_callback_t)(struct vmbus_channel *, void *);
+typedef int	(*vmbus_br_copy_callback_t)(void *, int, void *);
 
 static __inline struct vmbus_channel *
 vmbus_get_channel(device_t dev)
@@ -205,6 +207,14 @@
 int		vmbus_chan_recv_pkt(struct vmbus_channel *chan,
 		    struct vmbus_chanpkt_hdr *pkt, int *pktlen);
 
+int		vmbus_chan_recv_idxadv(struct vmbus_channel *chan,
+		    uint32_t advance);
+int		vmbus_chan_recv_peek(struct vmbus_channel *chan,
+		    void *data, int data_len, uint32_t advance);
+int		vmbus_chan_recv_peek_call(struct vmbus_channel *chan,
+		    int data_len, uint32_t skip,
+		    vmbus_br_copy_callback_t cb, void *cbarg);
+
 int		vmbus_chan_send(struct vmbus_channel *chan, uint16_t type,
 		    uint16_t flags, void *data, int dlen, uint64_t xactid);
 int		vmbus_chan_send_sglist(struct vmbus_channel *chan,
@@ -213,13 +223,30 @@
 int		vmbus_chan_send_prplist(struct vmbus_channel *chan,
 		    struct vmbus_gpa_range *prp, int prp_cnt, void *data,
 		    int dlen, uint64_t xactid);
+int		vmbus_chan_iov_send(struct vmbus_channel *chan,
+		    const struct iovec iov[], int iovlen,
+		    vmbus_br_copy_callback_t cb, void *cbarg);
+uint32_t	vmbus_chan_write_available(struct vmbus_channel *chan);
+uint32_t	vmbus_chan_read_available(struct vmbus_channel *chan);
+bool		vmbus_chan_write_signal(struct vmbus_channel *chan,
+		    int32_t min_signal_size);
+void		vmbus_chan_set_pending_send_size(struct vmbus_channel *chan,
+		    uint32_t size);
 
 uint32_t	vmbus_chan_id(const struct vmbus_channel *chan);
 uint32_t	vmbus_chan_subidx(const struct vmbus_channel *chan);
 bool		vmbus_chan_is_primary(const struct vmbus_channel *chan);
 bool		vmbus_chan_is_revoked(const struct vmbus_channel *chan);
-const struct hyperv_guid *
-		vmbus_chan_guid_inst(const struct vmbus_channel *chan);
+bool		vmbus_chan_is_hvs(const struct vmbus_channel *chan);
+bool		vmbus_chan_is_hvs_conn_from_host(
+		    const struct vmbus_channel *chan);
+int		vmbus_req_tl_connect(struct hyperv_guid *,
+		    struct hyperv_guid *);
+
+struct hyperv_guid *
+		vmbus_chan_guid_type(struct vmbus_channel *chan);
+struct hyperv_guid *
+		vmbus_chan_guid_inst(struct vmbus_channel *chan);
 int		vmbus_chan_prplist_nelem(int br_size, int prpcnt_max,
 		    int dlen_max);
 bool		vmbus_chan_rx_empty(const struct vmbus_channel *chan);
diff --git a/sys/dev/hyperv/hvsock/hv_sock.h b/sys/dev/hyperv/hvsock/hv_sock.h
--- a/sys/dev/hyperv/hvsock/hv_sock.h
+++ b/sys/dev/hyperv/hvsock/hv_sock.h
@@ -0,0 +1,122 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HVSOCK_H
+#define _HVSOCK_H
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/queue.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/vmbus.h>
+
+/*
+ * HyperV Socket Protocols
+ */
+#define	HYPERV_SOCK_PROTO_TRANS		1	/* Transport protocol */
+
+#define	HVADDR_PORT_ANY			-1U
+#define	HVADDR_PORT_UNKNOWN		-1U
+
+#define HVS_LIST_BOUND			0x01
+#define HVS_LIST_CONNECTED		0x02
+#define HVS_LIST_ALL			(HVS_LIST_BOUND | HVS_LIST_CONNECTED)
+
+/*
+ * Socket address for the AF_HYPERV family.  An endpoint is identified only
+ * by hvs_port; hvs_zero is filler whose length is derived from
+ * sizeof(struct sockaddr) minus the sizes of the other members.
+ *
+ * NOTE(review): the filler computation does not account for any alignment
+ * padding the compiler inserts before hvs_port, so sizeof(struct
+ * sockaddr_hvs) may not equal sizeof(struct sockaddr) -- confirm.
+ */
+struct sockaddr_hvs {
+	unsigned char	sa_len;
+	sa_family_t	sa_family;
+	unsigned int	hvs_port;
+	unsigned char	hvs_zero[sizeof(struct sockaddr) -
+				 sizeof(sa_family_t) -
+				 sizeof(unsigned char) -
+				 sizeof(unsigned int)];
+};
+
+/*
+ * Packed two-word header that is the second member of struct
+ * hvs_pkt_header.
+ * NOTE(review): presumably vmpipe_data_size is the payload byte count of
+ * the packet and vmpipe_pkt_type its kind -- confirm against the users.
+ */
+struct vmpipe_proto_header {
+	uint32_t			vmpipe_pkt_type;
+	uint32_t			vmpipe_data_size;
+} __packed;
+
+/*
+ * Complete per-packet header: the VMBus channel packet header immediately
+ * followed by the vmpipe header.  The receive path skips
+ * sizeof(struct hvs_pkt_header) bytes to reach the payload.
+ */
+struct hvs_pkt_header {
+	struct vmbus_chanpkt_hdr	chan_pkt_hdr;
+	struct vmpipe_proto_header	vmpipe_pkt_hdr;
+} __packed;
+
+/*
+ * Per-socket protocol control block.  It is allocated zeroed by
+ * hvs_trans_attach() and stored in the socket's so_pcb.  The same pcb can
+ * be linked on the global bound list (bound_next) and on the global
+ * connected list (connected_next).
+ */
+struct hvs_pcb {
+	struct socket			*so;		/* Pointer to socket */
+	struct sockaddr_hvs		local_addr;
+	struct sockaddr_hvs		remote_addr;
+
+	/* Service GUIDs built from srv_id_template plus a port number */
+	struct hyperv_guid		vm_srv_id;
+	struct hyperv_guid		host_srv_id;
+
+	/* Channel carrying this connection; set to NULL on close */
+	struct vmbus_channel		*chan;
+	/* Current packet header on rx ring */
+	struct hvs_pkt_header		hvs_pkt;
+	/* Available data in receive br in current packet */
+	uint32_t			recv_data_len;
+	/* offset in the packet */
+	uint32_t			recv_data_off;
+	bool				rb_init;
+	/* Link lists for global bound and connected sockets */
+	LIST_ENTRY(hvs_pcb)		bound_next;
+	LIST_ENTRY(hvs_pcb)		connected_next;
+};
+
+/*
+ * Conversions between a socket and its control block.
+ * so2hvspcb() reads the pcb stored in so->so_pcb (NULL when detached);
+ * hsvpcb2so() reads the back pointer stored in the pcb.  Note that the
+ * second macro is spelled "hsv", not "hvs"; callers use that spelling.
+ */
+#define so2hvspcb(so) \
+	((struct hvs_pcb *)((so)->so_pcb))
+#define hsvpcb2so(hvspcb) \
+	((struct socket *)((hvspcb)->so))
+
+void	hvs_addr_init(struct sockaddr_hvs *, const struct hyperv_guid *);
+void	hvs_trans_init(void);
+void	hvs_trans_close(struct socket *);
+void	hvs_trans_detach(struct socket *);
+void	hvs_trans_abort(struct socket *);
+int	hvs_trans_attach(struct socket *, int, struct thread *);
+int	hvs_trans_bind(struct socket *, struct sockaddr *, struct thread *);
+int	hvs_trans_listen(struct socket *, int, struct thread *);
+int	hvs_trans_accept(struct socket *, struct sockaddr **);
+int	hvs_trans_connect(struct socket *,
+	    struct sockaddr *, struct thread *);
+int	hvs_trans_peeraddr(struct socket *, struct sockaddr **);
+int	hvs_trans_sockaddr(struct socket *, struct sockaddr **);
+int	hvs_trans_soreceive(struct socket *, struct sockaddr **,
+	    struct uio *, struct mbuf **, struct mbuf **, int *);
+int	hvs_trans_sosend(struct socket *, struct sockaddr *, struct uio *,
+	     struct mbuf *, struct mbuf *, int, struct thread *);
+int	hvs_trans_disconnect(struct socket *);
+int	hvs_trans_shutdown(struct socket *);
+
+int	hvs_trans_lock(void);
+void	hvs_trans_unlock(void);
+
+void	hvs_remove_socket_from_list(struct socket *, unsigned char);
+#endif /* _HVSOCK_H */
diff --git a/sys/dev/hyperv/hvsock/hv_sock.c b/sys/dev/hyperv/hvsock/hv_sock.c
--- a/sys/dev/hyperv/hvsock/hv_sock.c
+++ b/sys/dev/hyperv/hvsock/hv_sock.c
@@ -0,0 +1,1748 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/domain.h>
+#include <sys/lock.h>
+#include <sys/kernel.h>
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+#include <sys/sockbuf.h>
+#include <sys/sx.h>
+#include <sys/uio.h>
+
+#include <net/vnet.h>
+
+#include <dev/hyperv/vmbus/vmbus_reg.h>
+
+#include "hv_sock.h"
+
+#define HVSOCK_DBG_NONE			0x0
+#define HVSOCK_DBG_INFO			0x1
+#define HVSOCK_DBG_ERR			0x2
+#define HVSOCK_DBG_VERBOSE		0x3
+
+
+SYSCTL_NODE(_net, OID_AUTO, hvsock, CTLFLAG_RD, 0, "HyperV socket");
+
+static int hvs_dbg_level;
+SYSCTL_INT(_net_hvsock, OID_AUTO, hvs_dbg_level, CTLFLAG_RWTUN, &hvs_dbg_level,
+    0, "hyperv socket debug level: 0 = none, 1 = info, 2 = error, 3 = verbose");
+
+
+#define HVSOCK_DBG(level, ...) do {					\
+	if (hvs_dbg_level >= (level))					\
+		printf(__VA_ARGS__);					\
+	} while (0)
+
+MALLOC_DEFINE(M_HVSOCK, "hyperv_socket", "hyperv socket control structures");
+
+/* The MTU is 16KB per host side's design */
+#define HVSOCK_MTU_SIZE		(1024 * 16)
+#define HVSOCK_SEND_BUF_SZ	(PAGE_SIZE - sizeof(struct vmpipe_proto_header))
+
+#define HVSOCK_HEADER_LEN	(sizeof(struct hvs_pkt_header))
+
+#define HVSOCK_PKT_LEN(payload_len)	(HVSOCK_HEADER_LEN + \
+					 roundup2(payload_len, 8) + \
+					 sizeof(uint64_t))
+
+
+static struct domain		hv_socket_domain;
+
+/*
+ * HyperV Transport sockets
+ */
+/*
+ * User-request dispatch table for the transport protocol.  Every handler
+ * is one of the hvs_trans_*() functions declared in hv_sock.h; note that
+ * the protocol supplies its own soreceive and sosend routines.
+ */
+static struct pr_usrreqs	hvs_trans_usrreqs = {
+	.pru_attach =		hvs_trans_attach,
+	.pru_bind =		hvs_trans_bind,
+	.pru_listen =		hvs_trans_listen,
+	.pru_accept =		hvs_trans_accept,
+	.pru_connect =		hvs_trans_connect,
+	.pru_peeraddr =		hvs_trans_peeraddr,
+	.pru_sockaddr =		hvs_trans_sockaddr,
+	.pru_soreceive =	hvs_trans_soreceive,
+	.pru_sosend =		hvs_trans_sosend,
+	.pru_disconnect =	hvs_trans_disconnect,
+	.pru_close =		hvs_trans_close,
+	.pru_detach =		hvs_trans_detach,
+	.pru_shutdown =		hvs_trans_shutdown,
+	.pru_abort =		hvs_trans_abort,
+};
+
+/*
+ * Definitions of protocols supported in HyperV socket domain
+ */
+/*
+ * Protocol switch table.  It has a single entry: a connection-oriented
+ * SOCK_STREAM protocol (HYPERV_SOCK_PROTO_TRANS) initialised by
+ * hvs_trans_init() and served by hvs_trans_usrreqs.
+ */
+static struct protosw		hv_socket_protosw[] = {
+{
+	.pr_type =		SOCK_STREAM,
+	.pr_domain =		&hv_socket_domain,
+	.pr_protocol =		HYPERV_SOCK_PROTO_TRANS,
+	.pr_flags =		PR_CONNREQUIRED,
+	.pr_init =		hvs_trans_init,
+	.pr_usrreqs =		&hvs_trans_usrreqs,
+},
+};
+
+/*
+ * Domain descriptor for AF_HYPERV.  dom_protoswNPROTOSW points one past
+ * the last element of hv_socket_protosw.
+ */
+static struct domain		hv_socket_domain = {
+	.dom_family =		AF_HYPERV,
+	.dom_name =		"hyperv",
+	.dom_protosw =		hv_socket_protosw,
+	.dom_protoswNPROTOSW =	&hv_socket_protosw[nitems(hv_socket_protosw)]
+};
+
+VNET_DOMAIN_SET(hv_socket_);
+
+#define MAX_PORT			((uint32_t)0xFFFFFFFF)
+#define MIN_PORT			((uint32_t)0x0)
+
+/* 00000000-facb-11e6-bd58-64006a7986d3 */
+/*
+ * Template service GUID.  The first four bytes are zero and are replaced by
+ * a port number (see set_port_by_srv_id()); the remaining twelve bytes are
+ * fixed and are what is_valid_srv_id() compares against.  The second and
+ * third GUID fields appear byte-swapped here relative to the textual form
+ * given in the comment before this definition (facb -> 0xcb, 0xfa;
+ * 11e6 -> 0xe6, 0x11).
+ */
+static const struct hyperv_guid srv_id_template = {
+	.hv_guid = {
+	    0x00, 0x00, 0x00, 0x00, 0xcb, 0xfa, 0xe6, 0x11,
+	    0xbd, 0x58, 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3 }
+};
+
+static int		hvsock_br_callback(void *, int, void *);
+static uint32_t		hvsock_canread_check(struct hvs_pcb *);
+static uint32_t		hvsock_canwrite_check(struct hvs_pcb *);
+static int		hvsock_send_data(struct vmbus_channel *chan,
+    struct uio *uio, uint32_t to_write, struct sockbuf *sb);
+
+
+
+/* Globals */
+/*
+ * hvs_trans_socks_sx:  taken exclusively by hvs_trans_lock().
+ * hvs_trans_socks_mtx: held around every access to the two lists below.
+ * previous_auto_bound_port: last port handed out by the auto-bind search
+ *     in hvs_trans_connect(); starts at MAX_PORT.
+ * All of these are initialised in hvs_trans_init().
+ */
+static struct sx		hvs_trans_socks_sx;
+static struct mtx		hvs_trans_socks_mtx;
+static LIST_HEAD(, hvs_pcb)	hvs_trans_bound_socks;
+static LIST_HEAD(, hvs_pcb)	hvs_trans_connected_socks;
+static uint32_t			previous_auto_bound_port;
+
+/*
+ * Emit a GUID at HVSOCK_DBG_INFO level: one 32-bit word, two 16-bit words
+ * and then the eight trailing bytes, each printed in hex.
+ */
+static void
+hvsock_print_guid(struct hyperv_guid *guid)
+{
+	unsigned char *raw = (unsigned char *)guid;
+	unsigned int *data1 = (unsigned int *)&raw[0];
+	unsigned short *data2 = (unsigned short *)&raw[4];
+	unsigned short *data3 = (unsigned short *)&raw[6];
+	unsigned char *data4 = &raw[8];
+
+	HVSOCK_DBG(HVSOCK_DBG_INFO,
+	    "0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x\n",
+	    *data1, *data2, *data3,
+	    data4[0], data4[1], data4[2], data4[3],
+	    data4[4], data4[5], data4[6], data4[7]);
+}
+
+/*
+ * Return true when every byte of the GUID after the first four equals the
+ * corresponding byte of srv_id_template.  The leading four bytes (the port
+ * number) are not compared.
+ */
+static bool
+is_valid_srv_id(const struct hyperv_guid *id)
+{
+	const size_t port_bytes = 4;
+	const size_t fixed_bytes = sizeof(struct hyperv_guid) - port_bytes;
+
+	return (memcmp(&id->hv_guid[port_bytes],
+	    &srv_id_template.hv_guid[port_bytes], fixed_bytes) == 0);
+}
+
+/* Read the port number held in the leading unsigned int of a service GUID. */
+static unsigned int
+get_port_by_srv_id(const struct hyperv_guid *srv_id)
+{
+	const unsigned int *port_word = (const unsigned int *)srv_id;
+
+	return (*port_word);
+}
+
+/* Store a port number into the leading unsigned int of a service GUID. */
+static void
+set_port_by_srv_id(struct hyperv_guid *srv_id, unsigned int port)
+{
+	unsigned int *port_word = (unsigned int *)srv_id;
+
+	*port_word = port;
+}
+
+
+/*
+ * Unlink 'pcb' from the global list(s) selected by the HVS_LIST_* bits in
+ * 'list'.  Each list is searched for the pcb first, so a pcb that is not on
+ * a requested list is left alone.  A NULL pcb is ignored.
+ *
+ * The callers in this file hold hvs_trans_socks_mtx around this call.
+ */
+static void
+__hvs_remove_pcb_from_list(struct hvs_pcb *pcb, unsigned char list)
+{
+	struct hvs_pcb *p = NULL;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb);
+
+	if (pcb == NULL)
+		return;
+
+	if (list & HVS_LIST_BOUND) {
+		LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next) {
+			if (p == pcb) {
+				LIST_REMOVE(p, bound_next);
+				/*
+				 * Stop here: LIST_FOREACH would otherwise
+				 * follow the link of the element that was
+				 * just unlinked.
+				 */
+				break;
+			}
+		}
+	}
+
+	if (list & HVS_LIST_CONNECTED) {
+		LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next) {
+			if (p == pcb) {
+				LIST_REMOVE(p, connected_next);
+				/* Same reason as above. */
+				break;
+			}
+		}
+	}
+}
+
+/*
+ * Socket-based front end for __hvs_remove_pcb_from_list(): looks up the
+ * socket's pcb and removes it from the requested list(s).  No locking is
+ * done here.
+ */
+static void
+__hvs_remove_socket_from_list(struct socket *so, unsigned char list)
+{
+	struct hvs_pcb *sock_pcb;
+
+	sock_pcb = so2hvspcb(so);
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, sock_pcb);
+	__hvs_remove_pcb_from_list(sock_pcb, list);
+}
+
+/*
+ * Put the socket's pcb at the head of each global list selected by the
+ * HVS_LIST_* bits in 'list'.  No locking is done here.
+ */
+static void
+__hvs_insert_socket_on_list(struct socket *so, unsigned char list)
+{
+	struct hvs_pcb *sock_pcb = so2hvspcb(so);
+
+	if ((list & HVS_LIST_BOUND) != 0) {
+		LIST_INSERT_HEAD(&hvs_trans_bound_socks, sock_pcb,
+		    bound_next);
+	}
+
+	if ((list & HVS_LIST_CONNECTED) != 0) {
+		LIST_INSERT_HEAD(&hvs_trans_connected_socks, sock_pcb,
+		    connected_next);
+	}
+}
+
+/*
+ * Locked variant of __hvs_remove_socket_from_list().  A NULL socket, or a
+ * socket without a pcb, is reported at verbose debug level and ignored.
+ */
+void
+hvs_remove_socket_from_list(struct socket *so, unsigned char list)
+{
+	if (so == NULL || so->so_pcb == NULL) {
+		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+		    "%s: socket or so_pcb is null\n", __func__);
+	} else {
+		mtx_lock(&hvs_trans_socks_mtx);
+		__hvs_remove_socket_from_list(so, list);
+		mtx_unlock(&hvs_trans_socks_mtx);
+	}
+}
+
+/*
+ * Locked variant of __hvs_insert_socket_on_list().  A NULL socket, or a
+ * socket without a pcb, is reported at verbose debug level and ignored.
+ */
+static void
+hvs_insert_socket_on_list(struct socket *so, unsigned char list)
+{
+	if (so == NULL || so->so_pcb == NULL) {
+		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+		    "%s: socket or so_pcb is null\n", __func__);
+	} else {
+		mtx_lock(&hvs_trans_socks_mtx);
+		__hvs_insert_socket_on_list(so, list);
+		mtx_unlock(&hvs_trans_socks_mtx);
+	}
+}
+
+/*
+ * Look for a socket whose local port equals addr->hvs_port.  The bound
+ * list is searched first, then the connected list, each only if selected
+ * in 'list'.  Entries whose socket pointer is NULL are skipped.  Returns
+ * the first match or NULL.  No locking is done here.
+ */
+static struct socket *
+__hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list)
+{
+	struct hvs_pcb *cur;
+
+	if ((list & HVS_LIST_BOUND) != 0) {
+		LIST_FOREACH(cur, &hvs_trans_bound_socks, bound_next) {
+			if (cur->so == NULL)
+				continue;
+			if (cur->local_addr.hvs_port == addr->hvs_port)
+				return (cur->so);
+		}
+	}
+
+	if ((list & HVS_LIST_CONNECTED) != 0) {
+		LIST_FOREACH(cur, &hvs_trans_connected_socks,
+		    connected_next) {
+			if (cur->so == NULL)
+				continue;
+			if (cur->local_addr.hvs_port == addr->hvs_port)
+				return (cur->so);
+		}
+	}
+
+	return (NULL);
+}
+
+/*
+ * Locked variant of __hvs_find_socket_on_list().  The list mutex is
+ * released before returning, so the result is only a snapshot.
+ */
+static struct socket *
+hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list)
+{
+	struct socket *found;
+
+	mtx_lock(&hvs_trans_socks_mtx);
+	found = __hvs_find_socket_on_list(addr, list);
+	mtx_unlock(&hvs_trans_socks_mtx);
+
+	return (found);
+}
+
+/*
+ * Reset 'addr' to an AF_HYPERV address carrying 'port'.
+ *
+ * sa_len must be filled in as well: the accept, peeraddr and sockaddr
+ * handlers hand these addresses to sodupsockaddr(), which duplicates
+ * sa_len bytes.  Leaving it at zero would return an empty address.
+ */
+static inline void
+hvs_addr_set(struct sockaddr_hvs *addr, unsigned int port)
+{
+	memset(addr, 0, sizeof(*addr));
+	addr->sa_len = sizeof(*addr);
+	addr->sa_family = AF_HYPERV;
+	addr->hvs_port = port;
+}
+
+/*
+ * Initialise 'addr' from a service GUID: the port is taken from the GUID's
+ * leading word.
+ */
+void
+hvs_addr_init(struct sockaddr_hvs *addr, const struct hyperv_guid *svr_id)
+{
+	unsigned int port = get_port_by_srv_id(svr_id);
+
+	hvs_addr_set(addr, port);
+}
+
+/*
+ * Take the transport-wide sx lock exclusively.  The return value is
+ * always 0; callers in this file discard it.
+ */
+int
+hvs_trans_lock(void)
+{
+
+	sx_xlock(&hvs_trans_socks_sx);
+
+	return (0);
+}
+
+/* Release the exclusive sx lock taken by hvs_trans_lock(). */
+void
+hvs_trans_unlock(void)
+{
+
+	sx_xunlock(&hvs_trans_socks_sx);
+}
+
+/*
+ * Protocol init hook (pr_init).  Sets up the global locks, the two socket
+ * lists and the auto-bind port cursor.  Nothing is done for non-default
+ * vnet instances or when the guest is not running on Hyper-V.
+ */
+void
+hvs_trans_init(void)
+{
+	/*
+	 * Globals are initialised once, by the default vnet, and only when
+	 * running as a Hyper-V guest.
+	 */
+	if (!IS_DEFAULT_VNET(curvnet) || vm_guest != VM_GUEST_HV)
+		return;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_init called\n", __func__);
+
+	/* Auto-bind cursor starts at the top of the port range. */
+	previous_auto_bound_port = MAX_PORT;
+
+	/* Locks */
+	sx_init(&hvs_trans_socks_sx, "hvs_trans_sock_sx");
+	mtx_init(&hvs_trans_socks_mtx, "hvs_trans_socks_mtx", NULL,
+	    MTX_DEF);
+
+	/* Socket lists */
+	LIST_INIT(&hvs_trans_bound_socks);
+	LIST_INIT(&hvs_trans_connected_socks);
+}
+
+/*
+ * Called in two cases:
+ * 1) When user calls socket();
+ * 2) When we accept new incoming connection and call sonewconn().
+ */
+/*
+ * pru_attach handler: allocate a zeroed pcb and tie it to the socket.
+ *
+ * Returns ESOCKTNOSUPPORT for anything but SOCK_STREAM, EPROTONOSUPPORT for
+ * a protocol other than 0 or HYPERV_SOCK_PROTO_TRANS, EISCONN when the
+ * socket already has a pcb, ENOMEM when the (non-sleeping) allocation
+ * fails, and 0 on success.  'td' is unused.
+ */
+int
+hvs_trans_attach(struct socket *so, int proto, struct thread *td)
+{
+	struct hvs_pcb *new_pcb;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_attach called\n", __func__);
+
+	if (so->so_type != SOCK_STREAM)
+		return (ESOCKTNOSUPPORT);
+
+	if (proto != 0 && proto != HYPERV_SOCK_PROTO_TRANS)
+		return (EPROTONOSUPPORT);
+
+	/* A pcb is already attached to this socket. */
+	if (so2hvspcb(so) != NULL)
+		return (EISCONN);
+
+	new_pcb = malloc(sizeof(*new_pcb), M_HVSOCK, M_NOWAIT | M_ZERO);
+	if (new_pcb == NULL)
+		return (ENOMEM);
+
+	/* Cross-link socket and pcb. */
+	new_pcb->so = so;
+	so->so_pcb = (void *)new_pcb;
+
+	return (0);
+}
+
+/*
+ * pru_detach handler: drop the socket's reference to its pcb, under the
+ * transport lock.  The pcb memory itself is zeroed and freed here only
+ * when the socket is a listening socket.
+ *
+ * NOTE(review): for non-listening sockets the pcb is presumably released
+ * by code outside this function -- confirm, otherwise it leaks.
+ */
+void
+hvs_trans_detach(struct socket *so)
+{
+	struct hvs_pcb *pcb;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_detach called\n", __func__);
+
+	(void) hvs_trans_lock();
+
+	pcb = so2hvspcb(so);
+	if (pcb != NULL) {
+		if (SOLISTENING(so)) {
+			bzero(pcb, sizeof(*pcb));
+			free(pcb, M_HVSOCK);
+		}
+		so->so_pcb = NULL;
+	}
+
+	hvs_trans_unlock();
+}
+
+/*
+ * pru_bind handler: bind the socket to the port given in 'addr'.
+ *
+ * Returns EINVAL when the address or the pcb is missing, EAFNOSUPPORT when
+ * the address family is not AF_HYPERV, EADDRINUSE when a socket on the
+ * bound or connected list already uses the port, and 0 on success.  On
+ * success the local address is recorded, the remote port is set to
+ * HVADDR_PORT_ANY and the socket is put on the bound list.  'td' is
+ * unused.
+ */
+int
+hvs_trans_bind(struct socket *so, struct sockaddr *addr, struct thread *td)
+{
+	struct sockaddr_hvs *sa = (struct sockaddr_hvs *) addr;
+	struct hvs_pcb *pcb = so2hvspcb(so);
+	int error;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_bind called\n", __func__);
+
+	if (sa == NULL || pcb == NULL)
+		return (EINVAL);
+
+	if (sa->sa_family != AF_HYPERV) {
+		HVSOCK_DBG(HVSOCK_DBG_ERR,
+		    "%s: Not supported, sa_family is %u\n",
+		    __func__, sa->sa_family);
+		return (EAFNOSUPPORT);
+	}
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: binding port = 0x%x\n", __func__, sa->hvs_port);
+
+	/* The lookup and the insertion happen under one lock hold. */
+	mtx_lock(&hvs_trans_socks_mtx);
+	if (__hvs_find_socket_on_list(sa, HVS_LIST_ALL) == NULL) {
+		/* Port is free: record the addresses and publish the socket. */
+		hvs_addr_set(&pcb->local_addr, sa->hvs_port);
+		hvs_addr_set(&pcb->remote_addr, HVADDR_PORT_ANY);
+		__hvs_insert_socket_on_list(so, HVS_LIST_BOUND);
+		error = 0;
+	} else {
+		error = EADDRINUSE;
+	}
+	mtx_unlock(&hvs_trans_socks_mtx);
+
+	return (error);
+}
+
+/*
+ * pru_listen handler.  The socket must already be on the bound list under
+ * its own local port; otherwise EADDRNOTAVAIL is returned.  EINVAL is
+ * returned when there is no pcb.  The generic listen checks and state
+ * change are then done with the socket lock held, and the result of
+ * solisten_proto_check() is returned.  'td' is unused.
+ */
+int
+hvs_trans_listen(struct socket *so, int backlog, struct thread *td)
+{
+	struct hvs_pcb *pcb;
+	struct socket *owner;
+	int error;
+
+	pcb = so2hvspcb(so);
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_listen called\n", __func__);
+
+	if (pcb == NULL)
+		return (EINVAL);
+
+	/* The port must be bound, and bound by this very socket. */
+	owner = hvs_find_socket_on_list(&pcb->local_addr, HVS_LIST_BOUND);
+	if (owner == NULL || owner != so) {
+		HVSOCK_DBG(HVSOCK_DBG_ERR,
+		    "%s: Address not bound or not by us.\n", __func__);
+		return (EADDRNOTAVAIL);
+	}
+
+	SOCK_LOCK(so);
+	if ((error = solisten_proto_check(so)) == 0)
+		solisten_proto(so, backlog);
+	SOCK_UNLOCK(so);
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket listen error = %d\n", __func__, error);
+
+	return (error);
+}
+
+/*
+ * pru_accept handler: return a copy of the pcb's remote address in *nam.
+ * Returns EINVAL without a pcb, ENOMEM when the (non-sleeping) copy
+ * fails, 0 otherwise.
+ */
+int
+hvs_trans_accept(struct socket *so, struct sockaddr **nam)
+{
+	struct hvs_pcb *pcb = so2hvspcb(so);
+	struct sockaddr *peer;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_accept called\n", __func__);
+
+	if (pcb == NULL)
+		return (EINVAL);
+
+	peer = (struct sockaddr *) &pcb->remote_addr;
+	*nam = sodupsockaddr(peer, M_NOWAIT);
+	if (*nam == NULL)
+		return (ENOMEM);
+
+	return (0);
+}
+
+/*
+ * pru_connect handler: start a guest-initiated connection to the host.
+ *
+ * A local port is picked automatically by walking downwards from the port
+ * used last time until one is found that is on neither the bound nor the
+ * connected list.  The socket is then put on the bound list, both service
+ * GUIDs are derived from srv_id_template plus the local/remote port, the
+ * socket is marked as connecting, and (after the list mutex is dropped)
+ * the connect request is handed to vmbus_req_tl_connect().
+ *
+ * Returns EINVAL when the pcb or 'nam' is missing, EAFNOSUPPORT for a
+ * non-AF_HYPERV address, EINPROGRESS when the socket is already
+ * connected/connecting/disconnecting, EADDRINUSE when no local port is
+ * free, and 0 otherwise.  'td' is unused.
+ *
+ * NOTE(review): the local address is overwritten unconditionally, so a
+ * socket that was explicitly bound before connect() would presumably be
+ * inserted on the bound list a second time -- confirm.
+ * NOTE(review): the return value of vmbus_req_tl_connect() is ignored.
+ */
+int
+hvs_trans_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+	struct hvs_pcb *pcb = so2hvspcb(so);
+	struct sockaddr_hvs *raddr = (struct sockaddr_hvs *)nam;
+	bool found_auto_bound_port = false;
+	/* Same type as the port space, so the wrap-around is well defined. */
+	uint32_t i;
+	int error = 0;
+
+	if (pcb == NULL)
+		return (EINVAL);
+
+	/* Verify the remote address */
+	if (raddr == NULL)
+		return (EINVAL);
+	if (raddr->sa_family != AF_HYPERV)
+		return (EAFNOSUPPORT);
+
+	/* Logged only now because the message dereferences raddr. */
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_connect called, remote port is %x\n",
+	    __func__, raddr->hvs_port);
+
+	mtx_lock(&hvs_trans_socks_mtx);
+	if (so->so_state &
+	    (SS_ISCONNECTED|SS_ISDISCONNECTING|SS_ISCONNECTING)) {
+			HVSOCK_DBG(HVSOCK_DBG_ERR,
+			    "%s: socket connect in progress\n",
+			    __func__);
+			error = EINPROGRESS;
+			goto out;
+	}
+
+	/*
+	 * Find an available port for us to auto bind the local
+	 * address.
+	 */
+	hvs_addr_set(&pcb->local_addr, 0);
+
+	for (i = previous_auto_bound_port - 1;
+	    i != previous_auto_bound_port; i --) {
+		/* Port MIN_PORT is never handed out; wrap to the top. */
+		if (i == MIN_PORT)
+			i = MAX_PORT;
+
+		pcb->local_addr.hvs_port = i;
+
+		if (__hvs_find_socket_on_list(&pcb->local_addr,
+		    HVS_LIST_BOUND | HVS_LIST_CONNECTED) == NULL) {
+			found_auto_bound_port = true;
+			previous_auto_bound_port = i;
+			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+			    "%s: found local bound port is %x\n",
+			    __func__, pcb->local_addr.hvs_port);
+			break;
+		}
+	}
+
+	if (found_auto_bound_port == true) {
+		/* Found available port for auto bound, put on list */
+		__hvs_insert_socket_on_list(so, HVS_LIST_BOUND);
+		/* Set VM service ID */
+		pcb->vm_srv_id = srv_id_template;
+		set_port_by_srv_id(&pcb->vm_srv_id, pcb->local_addr.hvs_port);
+		/* Set host service ID and remote port */
+		pcb->host_srv_id = srv_id_template;
+		set_port_by_srv_id(&pcb->host_srv_id, raddr->hvs_port);
+		hvs_addr_set(&pcb->remote_addr, raddr->hvs_port);
+
+		/* Change the socket state to SS_ISCONNECTING */
+		soisconnecting(so);
+	} else {
+		HVSOCK_DBG(HVSOCK_DBG_ERR,
+		    "%s: No local port available for auto bound\n",
+		    __func__);
+		error = EADDRINUSE;
+	}
+
+	HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect vm_srv_id is ");
+	hvsock_print_guid(&pcb->vm_srv_id);
+	HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect host_srv_id is ");
+	hvsock_print_guid(&pcb->host_srv_id);
+
+out:
+	mtx_unlock(&hvs_trans_socks_mtx);
+
+	/* The request is issued without the list mutex held. */
+	if (found_auto_bound_port == true)
+		 vmbus_req_tl_connect(&pcb->vm_srv_id, &pcb->host_srv_id);
+
+	return (error);
+}
+
+/*
+ * pru_disconnect handler.  Under the transport lock, marks the socket as
+ * disconnecting unless it is already flagged SS_ISDISCONNECTED.  Returns
+ * EINVAL when the socket has no pcb, 0 otherwise.
+ */
+int
+hvs_trans_disconnect(struct socket *so)
+{
+	int error = 0;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_disconnect called\n", __func__);
+
+	(void) hvs_trans_lock();
+
+	if (so2hvspcb(so) == NULL) {
+		error = EINVAL;
+	} else if ((so->so_state & SS_ISDISCONNECTED) == 0) {
+		/* Not disconnected yet: begin the disconnect. */
+		soisdisconnecting(so);
+	}
+
+	hvs_trans_unlock();
+
+	return (error);
+}
+
+#define SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
+/*
+ * Context passed as 'cbarg' alongside hvsock_br_callback: the uio
+ * describing the caller's buffer and the socket buffer involved.
+ * hvs_trans_soreceive() fills it with the receive sockbuf, which it holds
+ * locked around the copy.
+ */
+struct hvs_callback_arg {
+	struct uio *uio;
+	struct sockbuf *sb;
+};
+
+/*
+ * pru_soreceive handler: copy received stream data straight from the
+ * channel's receive ring into the caller's uio.
+ *
+ * 'paddr', 'mp0' and 'controlp' are not used.  MSG_EOR is masked out of
+ * *flagsp; MSG_PEEK is rejected with EOPNOTSUPP.  A uio with no room, or
+ * one that is not a read uio, is rejected with EINVAL.
+ *
+ * The receive sockbuf is sblock()ed and locked for the duration.  Data is
+ * pulled with vmbus_chan_recv_peek_call(), which invokes
+ * hvsock_br_callback; pcb->recv_data_len / recv_data_off track progress
+ * within the current packet.  The loop ends when: an error occurs, the
+ * socket reports so_error, the receive side is closed, the buffer is
+ * full, some data was read and MSG_WAITALL is not set, or the call is
+ * non-blocking (EAGAIN if nothing was read).  Otherwise it sleeps in
+ * sbwait() for more data.
+ *
+ * After unlocking, an so_error of ESHUTDOWN (a FIN from the peer) closes
+ * the receive side, or starts a full disconnect when sending is already
+ * shut.
+ */
+int
+hvs_trans_soreceive(struct socket *so, struct sockaddr **paddr,
+    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+{
+	struct hvs_pcb *pcb = so2hvspcb(so);
+	struct sockbuf *sb;
+	ssize_t orig_resid;
+	uint32_t canread, to_read;
+	int flags, error = 0;
+	struct hvs_callback_arg cbarg;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_soreceive called\n", __func__);
+
+	if (so->so_type != SOCK_STREAM)
+		return (EINVAL);
+	if (pcb == NULL)
+		return (EINVAL);
+
+	if (flagsp != NULL)
+		flags = *flagsp &~ MSG_EOR;
+	else
+		flags = 0;
+
+	if (flags & MSG_PEEK)
+		return (EOPNOTSUPP);
+
+	/* If no space to copy out anything */
+	if (uio->uio_resid == 0 || uio->uio_rw != UIO_READ)
+		return (EINVAL);
+
+	sb = &so->so_rcv;
+
+	/* Remembered so we can tell later whether anything was read. */
+	orig_resid = uio->uio_resid;
+
+	/* Prevent other readers from entering the socket. */
+	error = sblock(sb, SBLOCKWAIT(flags));
+	if (error) {
+		HVSOCK_DBG(HVSOCK_DBG_ERR,
+		    "%s: sblock returned error = %d\n", __func__, error);
+		return (error);
+	}
+
+	SOCKBUF_LOCK(sb);
+
+	cbarg.uio = uio;
+	cbarg.sb = sb;
+	/*
+	 * If the socket is closing, there might still be some data
+	 * in rx br to read. However we need to make sure
+	 * the channel is still open.
+	 */
+	if ((sb->sb_state & SBS_CANTRCVMORE) &&
+	    (so->so_state & SS_ISDISCONNECTED)) {
+		/* Other thread already closed the channel */
+		error = EPIPE;
+		goto out;
+	}
+
+	while (true) {
+		/* Drain whatever is readable right now into the uio. */
+		while (uio->uio_resid > 0 &&
+		    (canread = hvsock_canread_check(pcb)) > 0) {
+			to_read = MIN(canread, uio->uio_resid);
+			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+			    "%s: to_read = %u, skip = %u\n", __func__, to_read,
+			    (unsigned int)(sizeof(struct hvs_pkt_header) +
+			    pcb->recv_data_off));
+
+			/*
+			 * Skip the packet header plus what has already been
+			 * consumed from the current packet.
+			 */
+			error = vmbus_chan_recv_peek_call(pcb->chan, to_read,
+			    sizeof(struct hvs_pkt_header) + pcb->recv_data_off,
+			    hvsock_br_callback, (void *)&cbarg);
+			/*
+			 * It is possible socket is disconnected because
+			 * we released lock in hvsock_br_callback. So we
+			 * need to check the state to make sure it is not
+			 * disconnected.
+			 */
+			if (error || so->so_state & SS_ISDISCONNECTED) {
+				break;
+			}
+
+			pcb->recv_data_len -= to_read;
+			pcb->recv_data_off += to_read;
+		}
+
+		if (error)
+			break;
+
+		/* Abort if socket has reported problems. */
+		if (so->so_error) {
+			if (so->so_error == ESHUTDOWN &&
+			    orig_resid > uio->uio_resid) {
+				/*
+				 * Although we got a FIN, we also received
+				 * some data in this round. Deliver it
+				 * to user.
+				 */
+				error = 0;
+			} else {
+				/* ESHUTDOWN alone is not returned as an error. */
+				if (so->so_error != ESHUTDOWN)
+					error = so->so_error;
+			}
+
+			break;
+		}
+
+		/* Cannot receive more. */
+		if (sb->sb_state & SBS_CANTRCVMORE)
+			break;
+
+		/* We are done if buffer has been filled */
+		if (uio->uio_resid == 0)
+			break;
+
+		/* Partial reads are fine unless the caller asked for all. */
+		if (!(flags & MSG_WAITALL) && orig_resid > uio->uio_resid)
+			break;
+
+		/* Buffer ring is empty and we shall not block */
+		if ((so->so_state & SS_NBIO) ||
+		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
+			if (orig_resid == uio->uio_resid) {
+				/* We have not read anything */
+				error = EAGAIN;
+			}
+			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+			    "%s: non blocked read return, error %d.\n",
+			    __func__, error);
+			break;
+		}
+
+		/*
+		 * Wait and block until (more) data comes in.
+		 * Note: Drops the sockbuf lock during wait.
+		 */
+		error = sbwait(sb);
+
+		if (error)
+			break;
+
+		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+		    "%s: wake up from sbwait, read available is %u\n",
+		    __func__, vmbus_chan_read_available(pcb->chan));
+	}
+
+out:
+	SOCKBUF_UNLOCK(sb);
+
+	sbunlock(sb);
+
+	/* We received a FIN in this call */
+	if (so->so_error == ESHUTDOWN) {
+		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+			/* Send has already closed */
+			soisdisconnecting(so);
+		} else {
+			/* Just close the receive side */
+			socantrcvmore(so);
+		}
+	}
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: returning error = %d, so_error = %d\n",
+	    __func__, error, so->so_error);
+
+	return (error);
+}
+
+/*
+ * pru_sosend handler: write the caller's uio to the channel's send ring.
+ *
+ * 'addr', 'top', 'controlp' and 'td' are not used.  An empty uio, or one
+ * that is not a write uio, is rejected with EINVAL.  EPIPE is returned
+ * when the send side is already shut or the socket saw ESHUTDOWN.
+ *
+ * The send sockbuf is sblock()ed and locked for the duration.  Data goes
+ * out in chunks limited by the free ring space and by HVSOCK_SEND_BUF_SZ.
+ * When the ring is full the function returns what it has sent so far, or,
+ * if nothing was sent, either fails with EWOULDBLOCK (non-blocking) or
+ * sleeps in sbwait() until space is signalled.
+ */
+int
+hvs_trans_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
+    struct mbuf *top, struct mbuf *controlp, int flags, struct thread *td)
+{
+	struct hvs_pcb *pcb = so2hvspcb(so);
+	struct sockbuf *sb;
+	ssize_t orig_resid;
+	uint32_t canwrite, to_write;
+	int error = 0;
+
+	/* uio_resid is an ssize_t, hence %zd rather than %lu. */
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_sosend called, uio_resid = %zd\n",
+	    __func__, uio->uio_resid);
+
+	if (so->so_type != SOCK_STREAM)
+		return (EINVAL);
+	if (pcb == NULL)
+		return (EINVAL);
+
+	/* If nothing to send */
+	if (uio->uio_resid == 0 || uio->uio_rw != UIO_WRITE)
+		return (EINVAL);
+
+	sb = &so->so_snd;
+
+	/* Remembered so we can tell later whether anything was sent. */
+	orig_resid = uio->uio_resid;
+
+	/* Prevent other writers from entering the socket. */
+	error = sblock(sb, SBLOCKWAIT(flags));
+	if (error) {
+		HVSOCK_DBG(HVSOCK_DBG_ERR,
+		    "%s: sblock returned error = %d\n", __func__, error);
+		return (error);
+	}
+
+	SOCKBUF_LOCK(sb);
+
+	if ((sb->sb_state & SBS_CANTSENDMORE) ||
+	    so->so_error == ESHUTDOWN) {
+		error = EPIPE;
+		goto out;
+	}
+
+	while (uio->uio_resid > 0) {
+		canwrite = hvsock_canwrite_check(pcb);
+		if (canwrite == 0) {
+			/* We have sent some data */
+			if (orig_resid > uio->uio_resid)
+				break;
+			/*
+			 * We have not sent any data and it is
+			 * non-blocked io
+			 */
+			if (so->so_state & SS_NBIO ||
+			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
+				error = EWOULDBLOCK;
+				break;
+			} else {
+				/*
+				 * We are here because there is no space on
+				 * send buffer ring. Signal the other side
+				 * to read and free more space.
+				 * Sleep wait until space available to send
+				 * Note: Drops the sockbuf lock during wait.
+				 */
+				error = sbwait(sb);
+
+				if (error)
+					break;
+
+				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+				    "%s: wake up from sbwait, space avail on "
+				    "tx ring is %u\n",
+				    __func__,
+				    vmbus_chan_write_available(pcb->chan));
+
+				continue;
+			}
+		}
+		/* Clamp the chunk to ring space, remaining data and buffer size. */
+		to_write = MIN(canwrite, uio->uio_resid);
+		to_write = MIN(to_write, HVSOCK_SEND_BUF_SZ);
+
+		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+		    "%s: canwrite is %u, to_write = %u\n", __func__,
+		    canwrite, to_write);
+		error = hvsock_send_data(pcb->chan, uio, to_write, sb);
+
+		if (error)
+			break;
+	}
+
+out:
+	SOCKBUF_UNLOCK(sb);
+	sbunlock(sb);
+
+	return (error);
+}
+
+/*
+ * pru_peeraddr handler: return a copy of the pcb's remote address in
+ * *nam.  Returns EINVAL without a pcb, ENOMEM when the (non-sleeping)
+ * copy fails, 0 otherwise.
+ */
+int
+hvs_trans_peeraddr(struct socket *so, struct sockaddr **nam)
+{
+	struct hvs_pcb *pcb = so2hvspcb(so);
+	struct sockaddr *peer;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_peeraddr called\n", __func__);
+
+	if (pcb == NULL)
+		return (EINVAL);
+
+	peer = (struct sockaddr *) &pcb->remote_addr;
+	*nam = sodupsockaddr(peer, M_NOWAIT);
+	if (*nam == NULL)
+		return (ENOMEM);
+
+	return (0);
+}
+
+/*
+ * pru_sockaddr handler: return a copy of the pcb's local address in
+ * *nam.  Returns EINVAL without a pcb, ENOMEM when the (non-sleeping)
+ * copy fails, 0 otherwise.
+ */
+int
+hvs_trans_sockaddr(struct socket *so, struct sockaddr **nam)
+{
+	struct hvs_pcb *pcb = so2hvspcb(so);
+	struct sockaddr *self;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_sockaddr called\n", __func__);
+
+	if (pcb == NULL)
+		return (EINVAL);
+
+	self = (struct sockaddr *) &pcb->local_addr;
+	*nam = sodupsockaddr(self, M_NOWAIT);
+	if (*nam == NULL)
+		return (ENOMEM);
+
+	return (0);
+}
+
+/*
+ * pru_close handler, run under the transport lock.
+ *
+ * A connected socket first sends a zero-length packet (the FIN) to the
+ * host.  A socket that is connected, connecting or disconnecting is then
+ * marked disconnected.  The pcb's channel and socket pointers are cleared,
+ * and a listening socket is taken off the bound list.  Nothing happens
+ * when the socket has no pcb.
+ */
+void
+hvs_trans_close(struct socket *so)
+{
+	const int active_states =
+	    SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING;
+	struct hvs_pcb *pcb;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_close called\n", __func__);
+
+	(void) hvs_trans_lock();
+
+	pcb = so2hvspcb(so);
+	if (pcb == NULL) {
+		hvs_trans_unlock();
+		return;
+	}
+
+	if ((so->so_state & SS_ISCONNECTED) != 0) {
+		/* Send a FIN to peer */
+		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+		    "%s: hvs_trans_close sending a FIN to host\n", __func__);
+		(void) hvsock_send_data(pcb->chan, NULL, 0, NULL);
+	}
+
+	if ((so->so_state & active_states) != 0)
+		soisdisconnected(so);
+
+	/* The pcb no longer refers to a live channel or socket. */
+	pcb->chan = NULL;
+	pcb->so = NULL;
+
+	if (SOLISTENING(so)) {
+		/* Remove from bound list */
+		mtx_lock(&hvs_trans_socks_mtx);
+		__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
+		mtx_unlock(&hvs_trans_socks_mtx);
+	}
+
+	hvs_trans_unlock();
+}
+
+/*
+ * pru_abort handler, run under the transport lock.  A listening socket is
+ * taken off the bound list; a connected socket is disconnected with
+ * sodisconnect().  Nothing happens when the socket has no pcb.
+ *
+ * The pcb pointer is read only after the transport lock is held, the same
+ * way the detach, close and disconnect handlers do it.
+ */
+void
+hvs_trans_abort(struct socket *so)
+{
+	struct hvs_pcb *pcb;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_abort called\n", __func__);
+
+	(void) hvs_trans_lock();
+	pcb = so2hvspcb(so);
+	if (pcb == NULL) {
+		hvs_trans_unlock();
+		return;
+	}
+
+	if (SOLISTENING(so)) {
+		mtx_lock(&hvs_trans_socks_mtx);
+		/* Remove from bound list */
+		__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
+		mtx_unlock(&hvs_trans_socks_mtx);
+	}
+
+	if (so->so_state & SS_ISCONNECTED) {
+		(void) sodisconnect(so);
+	}
+	hvs_trans_unlock();
+
+	return;
+}
+
+/*
+ * pru_shutdown handler.  Returns EINVAL without a pcb, 0 otherwise.
+ *
+ * Per the original author's note this is only reached for SHUT_WR and
+ * SHUT_RDWR, and for SHUT_RDWR the caller has already set SBS_CANTRCVMORE
+ * on the receive buffer.  That flag is therefore used to tell the two
+ * cases apart:
+ *  - flag clear (SHUT_WR): only the send side is closed;
+ *  - flag set (SHUT_RDWR): a connected socket sends a zero-length packet
+ *    (the FIN) with the send sockbuf locked and is then marked as
+ *    disconnecting.
+ */
+int
+hvs_trans_shutdown(struct socket *so)
+{
+	struct hvs_pcb *pcb = so2hvspcb(so);
+	struct sockbuf *snd;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_shutdown called\n", __func__);
+
+	if (pcb == NULL)
+		return (EINVAL);
+
+	if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
+		/* SHUT_WR only: the receive side stays open. */
+		socantsendmore(so);
+		return (0);
+	}
+
+	/* SHUT_RDWR: nothing more to do unless we are connected. */
+	if ((so->so_state & SS_ISCONNECTED) == 0)
+		return (0);
+
+	/* Send a FIN to peer */
+	snd = &so->so_snd;
+	SOCKBUF_LOCK(snd);
+	(void) hvsock_send_data(pcb->chan, NULL, 0, snd);
+	SOCKBUF_UNLOCK(snd);
+
+	soisdisconnecting(so);
+
+	return (0);
+}
+
+/* In the VM, we support Hyper-V Sockets with AF_HYPERV, and the endpoint is
+ * <port> (see struct sockaddr_hvs).
+ *
+ * On the host, Hyper-V Sockets are supported by Winsock AF_HYPERV:
+ * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/user-
+ * guide/make-integration-service, and the endpoint is <VmID, ServiceId> with
+ * the below sockaddr:
+ *
+ * struct SOCKADDR_HV
+ * {
+ *    ADDRESS_FAMILY Family;
+ *    USHORT Reserved;
+ *    GUID VmId;
+ *    GUID ServiceId;
+ * };
+ * Note: VmID is not used by FreeBSD VM and actually it isn't transmitted via
+ * VMBus, because here it's obvious the host and the VM can easily identify
+ * each other. Though the VmID is useful on the host, especially in the case
+ * of Windows container, FreeBSD VM doesn't need it at all.
+ *
+ * To be compatible with similar infrastructure in Linux VMs, we have
+ * to limit the available GUID space of SOCKADDR_HV so that we can create
+ * a mapping between FreeBSD AF_HYPERV port and SOCKADDR_HV Service GUID.
+ * The rule of writing Hyper-V Sockets apps on the host and in FreeBSD VM is:
+ *
+ ****************************************************************************
+ * The only valid Service GUIDs, from the perspectives of both the host and *
+ * FreeBSD VM, that can be connected by the other end, must conform to this *
+ * format: <port>-facb-11e6-bd58-64006a7986d3.                              *
+ ****************************************************************************
+ *
+ * When we write apps on the host to connect(), the GUID ServiceID is used.
+ * When we write apps in FreeBSD VM to connect(), we only need to specify the
+ * port and the driver will form the GUID and use that to request the host.
+ *
+ * From the perspective of FreeBSD VM, the remote ephemeral port (i.e. the
+ * auto-generated remote port for a connect request initiated by the host's
+ * connect()) is set to HVADDR_PORT_UNKNOWN, which is not really used on the
+ * FreeBSD guest.
+ */
+
+/*
+ * Ring buffer sizing for Hyper-V socket channels.
+ *
+ * Older Hyper-V hosts (vmbus version 'VMBUS_VERSION_WIN10' or before)
+ * restrict the Hyper-V socket ring buffer size to six 4K pages. Newer
+ * Hyper-V hosts don't have this limit.
+ *
+ * HVS_RINGBUF_RCV_SIZE / HVS_RINGBUF_SND_SIZE are the sizes used on the
+ * older hosts and the lower bound on newer ones; HVS_RINGBUF_MAX_SIZE is
+ * the upper bound applied by hvsock_open_channel() on newer hosts.
+ */
+#define HVS_RINGBUF_RCV_SIZE	(PAGE_SIZE * 6)
+#define HVS_RINGBUF_SND_SIZE	(PAGE_SIZE * 6)
+#define HVS_RINGBUF_MAX_SIZE	(PAGE_SIZE * 64)
+
+/*
+ * Per-device software context (softc) of an hvsock vmbus device; its
+ * size is handed to newbus through hvsock_driver.
+ */
+struct hvsock_sc {
+	/* NOTE(review): not assigned by the attach/detach code in this file section */
+	device_t		dev;
+	/*
+	 * pcb of the socket bound to this channel. Set when a connection
+	 * is opened, freed and reset to NULL in hvsock_detach().
+	 */
+	struct hvs_pcb		*pcb;
+	/* NOTE(review): not assigned by the attach/detach code in this file section */
+	struct vmbus_channel	*channel;
+};
+
+/*
+ * Report whether the channel's receive ring holds at least as many
+ * bytes as an hvsock packet with an empty payload (HVSOCK_PKT_LEN(0)).
+ */
+static bool
+hvsock_chan_readable(struct vmbus_channel *chan)
+{
+	const uint32_t avail = vmbus_chan_read_available(chan);
+
+	if (avail < HVSOCK_PKT_LEN(0))
+		return (false);
+
+	return (true);
+}
+
+/*
+ * Channel callback registered through vmbus_chan_open(); 'context' is
+ * the hvs_pcb passed at open time.
+ *
+ * Under the transport lock, wakes the socket's reader when
+ * hvsock_chan_readable() reports data and wakes the writer when
+ * hvsock_canwrite_check() reports free space.
+ */
+static void
+hvsock_chan_cb(struct vmbus_channel *chan, void *context)
+{
+	struct hvs_pcb *pcb = context;
+	struct socket *sock;
+	uint32_t tx_room;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: host send us a wakeup on rb data, pcb = %p\n",
+	    __func__, pcb);
+
+	/*
+	 * The channel is known to be open at this point, but the socket
+	 * may already have been closed or freed. Look it up under the
+	 * transport lock and leave if either side is gone.
+	 */
+	(void) hvs_trans_lock();
+	sock = hsvpcb2so(pcb);
+	if (pcb->chan == NULL || sock == NULL) {
+		hvs_trans_unlock();
+		return;
+	}
+
+	/*
+	 * Receive side: sorwakeup_locked() is entered with the sockbuf
+	 * lock held; when there is nothing to read we unlock ourselves.
+	 */
+	SOCKBUF_LOCK(&sock->so_rcv);
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: read available = %u\n", __func__,
+	    vmbus_chan_read_available(pcb->chan));
+
+	if (!hvsock_chan_readable(pcb->chan))
+		SOCKBUF_UNLOCK(&sock->so_rcv);
+	else
+		sorwakeup_locked(sock);
+
+	/*
+	 * Send side: same pattern, wake the sender only if space became
+	 * available to write.
+	 */
+	SOCKBUF_LOCK(&sock->so_snd);
+	tx_room = hvsock_canwrite_check(pcb);
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: canwrite = %u\n", __func__, tx_room);
+
+	if (tx_room == 0)
+		SOCKBUF_UNLOCK(&sock->so_snd);
+	else
+		sowwakeup_locked(sock);
+
+	hvs_trans_unlock();
+}
+
+/*
+ * Ring buffer copy callback passed to vmbus_chan_iov_send().
+ *
+ * Moves cplen bytes between the memory at datap and the uio carried in
+ * cbarg (a struct hvs_callback_arg) using uiomove(). If the argument
+ * carries a sockbuf, its lock is dropped around uiomove() and re-taken
+ * before returning (presumably because uiomove() may sleep on a fault).
+ *
+ * Returns EINVAL when cbarg, datap or the uio is NULL, otherwise the
+ * result of uiomove().
+ */
+static int
+hvsock_br_callback(void *datap, int cplen, void *cbarg)
+{
+	struct hvs_callback_arg *arg = cbarg;
+	struct uio *uio;
+	struct sockbuf *sb;
+	int error;
+
+	/*
+	 * Validate before dereferencing: arg aliases cbarg, so its members
+	 * must not be read until the NULL check has passed.
+	 */
+	if (arg == NULL || datap == NULL)
+		return (EINVAL);
+
+	uio = arg->uio;
+	sb = arg->sb;
+	if (uio == NULL)
+		return (EINVAL);
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: called, uio_rw = %s, uio_resid = %lu, cplen = %u, "
+	    "datap = %p\n",
+	    __func__, (uio->uio_rw == UIO_READ) ? "read from br":"write to br",
+	    uio->uio_resid, cplen, datap);
+
+	if (sb != NULL)
+		SOCKBUF_UNLOCK(sb);
+
+	error = uiomove(datap, cplen, uio);
+
+	if (sb != NULL)
+		SOCKBUF_LOCK(sb);
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: after uiomove, uio_resid = %lu, error = %d\n",
+	    __func__, uio->uio_resid, error);
+
+	return (error);
+}
+
+/*
+ * Build one hvsock packet and queue it on the channel's send ring.
+ *
+ * chan      channel to send on; NULL yields ENOTCONN.
+ * uio       source of the payload, or NULL for a header-only packet.
+ * to_write  payload length in bytes; 0 sends a header-only packet.
+ * sb        sockbuf handed to hvsock_br_callback() (may be NULL).
+ *
+ * Returns 0 or the error from vmbus_chan_iov_send().
+ *
+ * NOTE(review): when uio is NULL and to_write is non-zero nothing is
+ * sent and 0 is returned.
+ */
+static int
+hvsock_send_data(struct vmbus_channel *chan, struct uio *uio,
+    uint32_t to_write, struct sockbuf *sb)
+{
+	struct hvs_callback_arg cb_ctx;
+	struct hvs_pkt_header pkt;
+	struct iovec vec[3];
+	uint64_t zero_pad = 0;
+	int chan_hdr_len, pkt_hdr_len, pkt_len, padded_len;
+	int error = 0;
+
+	if (chan == NULL)
+		return (ENOTCONN);
+
+	chan_hdr_len = sizeof(struct vmbus_chanpkt_hdr);
+	pkt_hdr_len = sizeof(struct hvs_pkt_header);
+	pkt_len = pkt_hdr_len + to_write;
+	padded_len = VMBUS_CHANPKT_TOTLEN(pkt_len);
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: hlen = %u, hvs_pkthlen = %u, hvs_pktlen = %u, "
+	    "pad_pktlen = %u, data_len = %u\n",
+	    __func__, chan_hdr_len, pkt_hdr_len, pkt_len, padded_len,
+	    to_write);
+
+	/* vmbus channel packet header */
+	pkt.chan_pkt_hdr.cph_type = VMBUS_CHANPKT_TYPE_INBAND;
+	pkt.chan_pkt_hdr.cph_flags = 0;
+	pkt.chan_pkt_hdr.cph_xactid = 0;
+	VMBUS_CHANPKT_SETLEN(pkt.chan_pkt_hdr.cph_hlen, chan_hdr_len);
+	VMBUS_CHANPKT_SETLEN(pkt.chan_pkt_hdr.cph_tlen, padded_len);
+
+	/* vmpipe header describing the payload */
+	pkt.vmpipe_pkt_hdr.vmpipe_pkt_type = 1;
+	pkt.vmpipe_pkt_hdr.vmpipe_data_size = to_write;
+
+	cb_ctx.uio = uio;
+	cb_ctx.sb = sb;
+
+	/* Every packet starts with the combined header. */
+	vec[0].iov_base = &pkt;
+	vec[0].iov_len = pkt_hdr_len;
+
+	if (uio != NULL && to_write > 0) {
+		/*
+		 * Header, payload, padding. The payload entry has no base
+		 * address; presumably hvsock_br_callback() fills that part
+		 * from the uio.
+		 */
+		vec[1].iov_base = NULL;
+		vec[1].iov_len = to_write;
+		vec[2].iov_base = &zero_pad;
+		vec[2].iov_len = padded_len - pkt_len;
+
+		error = vmbus_chan_iov_send(chan, vec, 3,
+		    hvsock_br_callback, &cb_ctx);
+	} else if (to_write == 0) {
+		/* Header-only packet: header plus padding, no callback. */
+		vec[1].iov_base = &zero_pad;
+		vec[1].iov_len = padded_len - pkt_len;
+
+		error = vmbus_chan_iov_send(chan, vec, 2, NULL, NULL);
+	}
+
+	if (error != 0) {
+		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+		    "%s: error = %d\n", __func__, error);
+	}
+
+	return (error);
+}
+
+/*
+ * Check whether the current ring buffer packet still has data to read.
+ * If not, advance the ring buffer read index to the next packet, peek
+ * its header into pcb->hvs_pkt and update recv_data_len and
+ * recv_data_off accordingly.
+ *
+ * Returns the number of payload bytes that can be read, or 0 when
+ * nothing is available or on error. The socket's so_error is set to
+ * EIO when the channel is gone or a header fails the sanity checks,
+ * and to ESHUTDOWN when a zero-length (FIN) packet is received.
+ */
+static uint32_t
+hvsock_canread_check(struct hvs_pcb *pcb)
+{
+	uint32_t advance;
+	uint32_t tlen, hlen, dlen;
+	uint32_t bytes_canread = 0;
+	int error;
+
+	/* Without a pcb there is no socket to report an error on. */
+	if (pcb == NULL)
+		return (0);
+
+	if (pcb->chan == NULL) {
+		pcb->so->so_error = EIO;
+		return (0);
+	}
+
+	/* Still have data not read yet on current packet */
+	if (pcb->recv_data_len > 0)
+		return (pcb->recv_data_len);
+
+	/*
+	 * If a packet header was peeked earlier (rb_init), the read index
+	 * still points at that packet; 'advance' is its total length.
+	 */
+	if (pcb->rb_init)
+		advance =
+		    VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen);
+	else
+		advance = 0;
+
+	bytes_canread = vmbus_chan_read_available(pcb->chan);
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: bytes_canread on br = %u, advance = %u\n",
+	    __func__, bytes_canread, advance);
+
+	if (pcb->rb_init && bytes_canread == (advance + sizeof(uint64_t))) {
+		/*
+		 * Nothing to read. Need to advance the rindex before
+		 * calling sbwait, so host knows to wake us up when data
+		 * is available to read on rb.
+		 */
+		error = vmbus_chan_recv_idxadv(pcb->chan, advance);
+		if (error) {
+			HVSOCK_DBG(HVSOCK_DBG_ERR,
+			    "%s: after calling vmbus_chan_recv_idxadv, "
+			    "got error = %d\n",  __func__, error);
+			return (0);
+		} else {
+			pcb->rb_init = false;
+			pcb->recv_data_len = 0;
+			pcb->recv_data_off = 0;
+			bytes_canread = vmbus_chan_read_available(pcb->chan);
+
+			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+			    "%s: advanced %u bytes, "
+			    " bytes_canread on br now = %u\n",
+			    __func__, advance, bytes_canread);
+
+			if (bytes_canread == 0)
+				return (0);
+			else
+				advance = 0;
+		}
+	}
+
+	/* Not enough bytes on the ring for another complete header yet. */
+	if (bytes_canread <
+	    advance + (sizeof(struct hvs_pkt_header) + sizeof(uint64_t)))
+		return (0);
+
+	error = vmbus_chan_recv_peek(pcb->chan, &pcb->hvs_pkt,
+	    sizeof(struct hvs_pkt_header), advance);
+
+	/* Don't have anything to read */
+	if (error) {
+		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+		    "%s: after calling vmbus_chan_recv_peek, got error = %d\n",
+		    __func__, error);
+		return (0);
+	}
+
+	/*
+	 * We just read in a new packet header. Do some sanity checks.
+	 */
+	tlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen);
+	hlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_hlen);
+	dlen = pcb->hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size;
+	if (__predict_false(hlen < sizeof(struct vmbus_chanpkt_hdr)) ||
+	    __predict_false(hlen > tlen) ||
+	    __predict_false(tlen < dlen + sizeof(struct hvs_pkt_header))) {
+		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+		    "invalid tlen(%u), hlen(%u) or dlen(%u)\n",
+		    tlen, hlen, dlen);
+		pcb->so->so_error = EIO;
+		return (0);
+	}
+	if (pcb->rb_init == false)
+		pcb->rb_init = true;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "Got new pkt tlen(%u), hlen(%u) or dlen(%u)\n",
+	    tlen, hlen, dlen);
+
+	/* The other side has sent a close FIN */
+	if (dlen == 0) {
+		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+		    "%s: Received FIN from other side\n", __func__);
+		/* inform the caller by setting so_error to ESHUTDOWN */
+		pcb->so->so_error = ESHUTDOWN;
+	}
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: canread on receive ring is %u \n", __func__, dlen);
+
+	pcb->recv_data_len = dlen;
+	pcb->recv_data_off = 0;
+
+	return (pcb->recv_data_len);
+}
+
+/*
+ * Return how many payload bytes can currently be written to the
+ * channel's send ring, rounded down to a multiple of 8, or 0 when the
+ * pcb/channel is missing or the ring is considered full.
+ */
+static uint32_t
+hvsock_canwrite_check(struct hvs_pcb *pcb)
+{
+	uint32_t ring_free, payload_room, usable;
+
+	if (pcb == NULL || pcb->chan == NULL)
+		return (0);
+
+	ring_free = vmbus_chan_write_available(pcb->chan);
+
+	/*
+	 * A 0-length-payload packet must always stay in reserve for the
+	 * FIN, on top of the smallest possible data packet.
+	 */
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: writeable is %u, should be greater than %lu\n",
+	    __func__, ring_free, HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0));
+
+	/* Below that threshold the Tx ring is treated as full. */
+	if (ring_free < HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0))
+		return (0);
+
+	/* Subtract the reserved FIN packet and this packet's own header. */
+	payload_room = ring_free - HVSOCK_PKT_LEN(0) - HVSOCK_PKT_LEN(0);
+	usable = rounddown2(payload_room, 8);
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: available size is %u\n", __func__, usable);
+
+	return (usable);
+}
+
+/*
+ * Set the channel's pending send size to the length of an hvsock
+ * packet carrying HVSOCK_SEND_BUF_SZ bytes of payload.
+ * hvsock_open_channel() calls this after a successful open so that the
+ * host signals the guest when enough send space becomes available.
+ */
+static void
+hvsock_set_chan_pending_send_size(struct vmbus_channel *chan)
+{
+	vmbus_chan_set_pending_send_size(chan,
+	    HVSOCK_PKT_LEN(HVSOCK_SEND_BUF_SZ));
+}
+
+/*
+ * Open the vmbus channel for socket 'so'.
+ *
+ * Before VMBUS_VERSION_WIN10_V5 the fixed HVS_RINGBUF_{SND,RCV}_SIZE
+ * are used. Otherwise each ring is sized from the socket buffer's
+ * sb_hiwat, clamped to [HVS_RINGBUF_*_SIZE, HVS_RINGBUF_MAX_SIZE] and
+ * rounded down to a page multiple. hvsock_chan_cb() is registered as
+ * the channel callback with the socket's pcb as its argument.
+ *
+ * Returns the result of vmbus_chan_open() (0 on success).
+ */
+static int
+hvsock_open_channel(struct vmbus_channel *chan, struct socket *so)
+{
+	unsigned int rcvbuf, sndbuf;
+	struct hvs_pcb *pcb = so2hvspcb(so);
+	int ret;
+
+	if (vmbus_current_version < VMBUS_VERSION_WIN10_V5) {
+		sndbuf = HVS_RINGBUF_SND_SIZE;
+		rcvbuf = HVS_RINGBUF_RCV_SIZE;
+	} else {
+		sndbuf = MAX(so->so_snd.sb_hiwat, HVS_RINGBUF_SND_SIZE);
+		sndbuf = MIN(sndbuf, HVS_RINGBUF_MAX_SIZE);
+		sndbuf = rounddown2(sndbuf, PAGE_SIZE);
+		rcvbuf = MAX(so->so_rcv.sb_hiwat, HVS_RINGBUF_RCV_SIZE);
+		rcvbuf = MIN(rcvbuf, HVS_RINGBUF_MAX_SIZE);
+		rcvbuf = rounddown2(rcvbuf, PAGE_SIZE);
+	}
+
+	/*
+	 * Can only read whatever user provided size of data
+	 * from ring buffer. Turn off batched reading.
+	 */
+	vmbus_chan_set_readbatch(chan, false);
+
+	ret = vmbus_chan_open(chan, sndbuf, rcvbuf, NULL, 0,
+	    hvsock_chan_cb, pcb);
+
+	if (ret != 0) {
+		HVSOCK_DBG(HVSOCK_DBG_ERR,
+		    "%s: failed to open hvsock channel, sndbuf = %u, "
+		    "rcvbuf = %u\n", __func__, sndbuf, rcvbuf);
+	} else {
+		HVSOCK_DBG(HVSOCK_DBG_INFO,
+		    "%s: hvsock channel opened, sndbuf = %u, "
+		    "rcvbuf = %u\n", __func__, sndbuf, rcvbuf);
+		/*
+		 * Set the pending send size so that we receive wakeup
+		 * signals from the host when there is enough space on
+		 * the tx buffer ring to write.
+		 */
+		hvsock_set_chan_pending_send_size(chan);
+	}
+
+	return (ret);
+}
+
+/*
+ * Guest is listening passively on the socket. Open channel and
+ * create a new socket for the connection.
+ *
+ * chan  channel offered by the host for this connection.
+ * so    the bound, listening socket the connection targets.
+ * sc    softc of the channel's device; sc->pcb is pointed at the new
+ *       socket's pcb once the connection is set up.
+ *
+ * Nothing is returned. On failure the function logs (or records the
+ * error in the new socket's so_error) and leaves sc->pcb untouched.
+ */
+static void
+hvsock_open_conn_passive(struct vmbus_channel *chan, struct socket *so,
+    struct hvsock_sc *sc)
+{
+	struct socket *new_so;
+	struct hvs_pcb *new_pcb, *pcb;
+	int error;
+
+	/* Do nothing if socket is not listening */
+	if ((so->so_options & SO_ACCEPTCONN) == 0) {
+		HVSOCK_DBG(HVSOCK_DBG_ERR,
+		    "%s: socket is not a listening one\n", __func__);
+		return;
+	}
+
+	/*
+	 * Create a new socket. This will call pru_attach to complete
+	 * the socket initialization and put the new socket onto
+	 * listening socket's sol_incomp list, waiting to be promoted
+	 * to sol_comp list.
+	 * The new socket created has ref count 0. There is no other
+	 * thread that changes the state of this new one at the
+	 * moment, so we don't need to hold its lock while opening
+	 * channel and filling out its pcb information.
+	 */
+	new_so = sonewconn(so, 0);
+	if (new_so == NULL) {
+		HVSOCK_DBG(HVSOCK_DBG_ERR,
+		    "%s: creating new socket failed\n", __func__);
+		/*
+		 * There is no socket to attach the channel to; going on
+		 * would dereference NULL in hvsock_open_channel().
+		 */
+		return;
+	}
+
+	/*
+	 * Now open the vmbus channel. If it fails, the socket will be
+	 * on the listening socket's sol_incomp queue until it is
+	 * replaced and aborted.
+	 */
+	error = hvsock_open_channel(chan, new_so);
+	if (error) {
+		new_so->so_error = error;
+		return;
+	}
+
+	pcb = so->so_pcb;
+	new_pcb = new_so->so_pcb;
+
+	hvs_addr_set(&(new_pcb->local_addr), pcb->local_addr.hvs_port);
+	/* Remote port is unknown to guest in this type of connection */
+	hvs_addr_set(&(new_pcb->remote_addr), HVADDR_PORT_UNKNOWN);
+	new_pcb->chan = chan;
+	new_pcb->recv_data_len = 0;
+	new_pcb->recv_data_off = 0;
+	new_pcb->rb_init = false;
+
+	new_pcb->vm_srv_id = *vmbus_chan_guid_type(chan);
+	new_pcb->host_srv_id = *vmbus_chan_guid_inst(chan);
+
+	hvs_insert_socket_on_list(new_so, HVS_LIST_CONNECTED);
+
+	sc->pcb = new_pcb;
+
+	/*
+	 * Change the socket state to SS_ISCONNECTED. This will promote
+	 * the socket to sol_comp queue and wake up the thread which
+	 * is accepting connection.
+	 */
+	soisconnected(new_so);
+}
+
+
+/*
+ * Complete a connection that the guest initiated towards the host:
+ * open the channel for 'so', reset the pcb's receive state, move the
+ * socket from the bound list to the connected list and mark it
+ * connected. If the channel cannot be opened the error is stored in
+ * so->so_error and nothing else changes.
+ */
+static void
+hvsock_open_conn_active(struct vmbus_channel *chan, struct socket *so)
+{
+	struct hvs_pcb *pcb;
+	int rc;
+
+	rc = hvsock_open_channel(chan, so);
+	if (rc != 0) {
+		so->so_error = rc;
+		return;
+	}
+
+	/* Channel is open: attach it and start with a clean rx state. */
+	pcb = so->so_pcb;
+	pcb->chan = chan;
+	pcb->rb_init = false;
+	pcb->recv_data_off = 0;
+	pcb->recv_data_len = 0;
+
+	/* Both list updates happen under one hold of the list mutex. */
+	mtx_lock(&hvs_trans_socks_mtx);
+	__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
+	__hvs_insert_socket_on_list(so, HVS_LIST_CONNECTED);
+	mtx_unlock(&hvs_trans_socks_mtx);
+
+	/*
+	 * Switching the socket to SS_ISCONNECTED wakes up the thread
+	 * sleeping in the connect call.
+	 */
+	soisconnected(so);
+}
+
+/*
+ * Match a newly offered hvsock channel with its bound socket and hand
+ * it to the passive (host initiated) or active (guest initiated)
+ * connection setup. Returns silently when the service GUID is not
+ * valid or no bound socket is found.
+ */
+static void
+hvsock_open_connection(struct vmbus_channel *chan, struct hvsock_sc *sc)
+{
+	struct hyperv_guid *type_guid =
+	    (struct hyperv_guid *) vmbus_chan_guid_type(chan);
+	struct hyperv_guid *inst_guid =
+	    (struct hyperv_guid *) vmbus_chan_guid_inst(chan);
+	bool from_host = vmbus_chan_is_hvs_conn_from_host(chan);
+	struct sockaddr_hvs addr;
+	struct socket *so;
+	struct hvs_pcb *pcb;
+
+	HVSOCK_DBG(HVSOCK_DBG_INFO, "type_guid is ");
+	hvsock_print_guid(type_guid);
+	HVSOCK_DBG(HVSOCK_DBG_INFO, "inst_guid is ");
+	hvsock_print_guid(inst_guid);
+	HVSOCK_DBG(HVSOCK_DBG_INFO, "connection %s host\n",
+	    from_host ? "from" : "to");
+
+	/*
+	 * The listening port should be in [0, MAX_LISTEN_PORT]
+	 */
+	if (!is_valid_srv_id(type_guid))
+		return;
+
+	/*
+	 * A bound socket must already exist for either direction.
+	 * Host initiated (passive on the guest): type_guid carries the
+	 * port the guest is bound to and listening on.
+	 * Guest initiated (active on the guest): inst_guid carries the
+	 * port the guest has auto bound to.
+	 */
+	hvs_addr_init(&addr, from_host ? type_guid : inst_guid);
+	so = hvs_find_socket_on_list(&addr, HVS_LIST_BOUND);
+	if (so == NULL) {
+		HVSOCK_DBG(HVSOCK_DBG_ERR,
+		    "%s: no bound socket found for port %u\n",
+		    __func__, addr.hvs_port);
+		return;
+	}
+
+	if (from_host) {
+		hvsock_open_conn_passive(chan, so, sc);
+		return;
+	}
+
+	/* Active open: the pcb is inspected under the transport lock. */
+	(void) hvs_trans_lock();
+	pcb = so->so_pcb;
+	if (pcb == NULL || pcb->so == NULL) {
+		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+		    "%s: channel detached before open\n", __func__);
+	} else {
+		sc->pcb = so2hvspcb(so);
+		hvsock_open_conn_active(chan, so);
+	}
+	hvs_trans_unlock();
+}
+
+/*
+ * newbus probe method: claim the device only if it has a vmbus channel
+ * and that channel is an hvsock channel.
+ *
+ * Returns BUS_PROBE_DEFAULT for an hvsock channel, ENXIO otherwise.
+ */
+static int
+hvsock_probe(device_t dev)
+{
+	struct vmbus_channel *channel = vmbus_get_channel(dev);
+
+	/*
+	 * Handle the missing-channel case on its own so that the debug
+	 * message below never asks for the id of a NULL channel.
+	 */
+	if (channel == NULL)
+		return (ENXIO);
+
+	if (!vmbus_chan_is_hvs(channel)) {
+		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+		    "hvsock_probe called but not a hvsock channel id %u\n",
+		    vmbus_chan_id(channel));
+
+		return (ENXIO);
+	}
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "hvsock_probe got a hvsock channel id %u\n",
+	    vmbus_chan_id(channel));
+
+	return (BUS_PROBE_DEFAULT);
+}
+
+/*
+ * newbus attach method: try to set up the connection for the device's
+ * channel. Always reports success.
+ */
+static int
+hvsock_attach(device_t dev)
+{
+	struct hvsock_sc *sc = device_get_softc(dev);
+	struct vmbus_channel *chan = vmbus_get_channel(dev);
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_attach called.\n");
+
+	hvsock_open_connection(chan, sc);
+
+	/*
+	 * Success is returned unconditionally. If setup failed, the host
+	 * will rescind the device in 30 seconds and cleanup happens then,
+	 * in vmbus_chan_msgproc_chrescind().
+	 */
+	return (0);
+}
+
+/*
+ * newbus detach method: tear down the connection associated with the
+ * device, free its pcb and close the vmbus channel. Always returns 0.
+ *
+ * When a pcb is attached, under the transport lock this:
+ *   1. marks the socket (if any) disconnected,
+ *   2. removes the pcb from the bound/connected lists,
+ *   3. waits until it holds both sockbuf I/O locks, so that no reader
+ *      or sender is using the buffer rings,
+ *   4. zeroes and frees the pcb and clears so->so_pcb.
+ * The channel is closed last, after the transport lock is released.
+ */
+static int
+hvsock_detach(device_t dev)
+{
+	struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev);
+	struct socket *so;
+	int error, retry;
+
+	if (bootverbose)
+		device_printf(dev, "hvsock_detach called.\n");
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_detach called.\n");
+
+	if (sc->pcb != NULL) {
+		(void) hvs_trans_lock();
+
+		so = hsvpcb2so(sc->pcb);
+		if (so) {
+			/* Close the connection */
+			if (so->so_state &
+			    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
+				soisdisconnected(so);
+		}
+
+		/* Take the pcb off whichever socket list it is on. */
+		mtx_lock(&hvs_trans_socks_mtx);
+		__hvs_remove_pcb_from_list(sc->pcb,
+		    HVS_LIST_BOUND | HVS_LIST_CONNECTED);
+		mtx_unlock(&hvs_trans_socks_mtx);
+
+		/*
+		 * Close channel while no reader and sender are working
+		 * on the buffer rings.
+		 *
+		 * Each loop retries a non-sleeping sblock() until it stops
+		 * returning EWOULDBLOCK, calling soisdisconnected() and
+		 * DELAY(500) between attempts.
+		 * NOTE(review): 'error' is only compared against
+		 * EWOULDBLOCK; any other result is treated as "lock
+		 * acquired" and later sbunlock()ed - confirm no other
+		 * error is possible here.
+		 */
+		if (so) {
+			retry = 0;
+			while ((error = sblock(&so->so_rcv, 0)) ==
+			    EWOULDBLOCK) {
+				/*
+				 * Someone is reading, rx br is busy
+				 */
+				soisdisconnected(so);
+				DELAY(500);
+				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+				    "waiting for rx reader to exit, "
+				    "retry = %d\n", retry++);
+			}
+			retry = 0;
+			while ((error = sblock(&so->so_snd, 0)) ==
+			    EWOULDBLOCK) {
+				/*
+				 * Someone is sending, tx br is busy
+				 */
+				soisdisconnected(so);
+				DELAY(500);
+				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+				    "waiting for tx sender to exit, "
+				    "retry = %d\n", retry++);
+			}
+		}
+
+
+		/*
+		 * Wipe and release the pcb.
+		 * NOTE(review): the pcb was passed to vmbus_chan_open() as
+		 * the hvsock_chan_cb() argument and the channel is only
+		 * closed further down - verify the callback cannot run
+		 * against the freed pcb in between.
+		 */
+		bzero(sc->pcb, sizeof(struct hvs_pcb));
+		free(sc->pcb, M_HVSOCK);
+		sc->pcb = NULL;
+
+		if (so) {
+			/* Drop the I/O locks taken above and detach the pcb. */
+			sbunlock(&so->so_rcv);
+			sbunlock(&so->so_snd);
+			so->so_pcb = NULL;
+		}
+
+		hvs_trans_unlock();
+	}
+
+	vmbus_chan_close(vmbus_get_channel(dev));
+
+	return (0);
+}
+
+/* newbus method table: probe/attach/detach entry points of the driver. */
+static device_method_t hvsock_methods[] = {
+	/* Device interface */
+	DEVMETHOD(device_probe, hvsock_probe),
+	DEVMETHOD(device_attach, hvsock_attach),
+	DEVMETHOD(device_detach, hvsock_detach),
+	DEVMETHOD_END
+};
+
+/* Driver description: name, methods and the per-device softc size. */
+static driver_t hvsock_driver = {
+	"hv_sock",
+	hvsock_methods,
+	sizeof(struct hvsock_sc)
+};
+
+static devclass_t hvsock_devclass;
+
+/*
+ * Register the driver on the vmbus bus, with no module event handler,
+ * and declare the module version and its dependency on vmbus.
+ */
+DRIVER_MODULE(hvsock, vmbus, hvsock_driver, hvsock_devclass, NULL, NULL);
+MODULE_VERSION(hvsock, 1);
+MODULE_DEPEND(hvsock, vmbus, 1, 1, 1);
diff --git a/sys/conf/files.x86 b/sys/conf/files.x86
--- a/sys/conf/files.x86
+++ b/sys/conf/files.x86
@@ -113,6 +113,7 @@
 dev/hwpmc/hwpmc_uncore.c	optional	hwpmc
 dev/hwpmc/hwpmc_tsc.c		optional	hwpmc
 dev/hwpmc/hwpmc_x86.c		optional	hwpmc
+dev/hyperv/hvsock/hv_sock.c				optional	hyperv
 dev/hyperv/pcib/vmbus_pcib.c				optional	hyperv pci
 dev/hyperv/netvsc/hn_nvs.c				optional	hyperv
 dev/hyperv/netvsc/hn_rndis.c				optional	hyperv


Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?79d16d788f9a405bf42820916438b2a4>