Date:      Thu, 29 May 2014 14:20:54 +0400
From:      Gleb Smirnoff <glebius@FreeBSD.org>
To:        arch@FreeBSD.org
Subject:   [CFT/review] new sendfile(2)
Message-ID:  <20140529102054.GX50679@FreeBSD.org>


--IuJpT0rwbUevm2bB
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

  Hello!

  At Netflix and Nginx we are experimenting with improving FreeBSD
with respect to sending large amounts of static data via HTTP.

  One of the approaches we are experimenting with is a new sendfile(2)
implementation that doesn't block on the I/O done on the file
descriptor.

  The problem with the classic sendfile(2) is that if the request
length is large enough and the file data is not cached in VM, the
sendfile(2) syscall does not return until it has filled the socket
buffer with data. With the modern Internet, socket buffers can be up
to 1 MB, so the time taken by the syscall grows by an order of
magnitude. All that time the nginx worker is blocked in the syscall
and doesn't process data from other clients. The best current practice
to mitigate this is known as "sendfile(2) + aio_read(2)", a special
mode of nginx operation on FreeBSD. The sendfile(2) call is issued
with the SF_NODISKIO flag, which forbids the syscall from performing
disk I/O, so only data already cached by VM is sent. If sendfile(2)
reports that I/O needs to be done (but was forbidden), nginx does an
aio_read() of a chunk of the file. As a side effect, the data read
ends up cached by VM. Then sendfile() is called again.
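
To make the pattern concrete, here is a minimal sketch of that
fallback. The function name, chunk size and the synchronous wait are
illustrative only; nginx itself drives the aio completion through
kqueue and returns to its event loop instead of waiting:

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <aio.h>
#include <errno.h>
#include <string.h>

/*
 * Illustrative only: send [off, off + len) from file 'fd' to socket 's'
 * using SF_NODISKIO, falling back to aio_read() for uncached chunks.
 */
static int
send_range_nodiskio(int fd, int s, off_t off, size_t len)
{
	static char buf[128 * 1024];	/* read only to warm the VM cache */
	off_t sbytes;

	while (len > 0) {
		sbytes = 0;
		if (sendfile(fd, s, off, len, NULL, &sbytes, SF_NODISKIO) == 0) {
			off += sbytes;
			len -= sbytes;
			continue;
		}
		/* sendfile() may have sent part of the range before failing. */
		off += sbytes;
		len -= sbytes;
		if (errno == EBUSY && len > 0) {
			/* Next chunk is not cached: read it to pull it into VM. */
			struct aiocb cb;
			const struct aiocb *list[1];

			memset(&cb, 0, sizeof(cb));
			cb.aio_fildes = fd;
			cb.aio_offset = off;
			cb.aio_buf = buf;
			cb.aio_nbytes = len < sizeof(buf) ? len : sizeof(buf);
			list[0] = &cb;
			if (aio_read(&cb) == -1)
				return (-1);
			(void)aio_suspend(list, 1, NULL); /* nginx waits via kqueue */
			if (aio_return(&cb) == -1)
				return (-1);
			/* Data is cached now; retry sendfile() at the same offset. */
		} else if (errno != EAGAIN && errno != EINTR)
			return (-1);
		/* EAGAIN/EINTR: a real server would wait for writability here. */
	}
	return (0);
}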

  Now for the new sendfile. The core idea is that sendfile()
schedules the I/O but doesn't wait for it to complete. It returns
immediately to the process, and the I/O completion is processed in
kernel context. Unlike aio(4), no additional kernel threads are
created. The new sendfile is a drop-in replacement for the old one:
applications (like nginx) need neither a recompile nor a
configuration change. The SF_NODISKIO flag is ignored.
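
From the application side the calling convention is unchanged; the
usual non-blocking fragment is all that is needed (nothing below is
new API, just the plain call):

	off_t sbytes = 0;

	/*
	 * With the new implementation this call no longer sleeps on disk
	 * I/O: it schedules the reads, queues not-yet-ready data in the
	 * socket buffer and returns; the data is sent once the I/O
	 * completes in kernel context.
	 */
	if (sendfile(fd, s, off, len, NULL, &sbytes, 0) == -1 &&
	    errno != EAGAIN && errno != EINTR)
		return (-1);		/* a real error */
	off += sbytes;		/* resume when the socket is writable again */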

  The patch for review is available at:

https://phabric.freebsd.org/D102

For those who prefer email attachments, it is also attached.
The patch consists of three logically separate changes:

1) Split of the socket buffer sb_cc field into sb_acc and sb_ccc, where
sb_acc stands for "available character count" and sb_ccc for "claimed
character count". This allows us to write data to a socket that is not
ready yet. The data sits in the socket buffer, consumes its space, and
keeps its place relative to earlier and later writes, but it can be
sent only after it is marked as ready (a condensed sketch follows this
list). This change is split across many files.

2) A new vnode operation, VOP_GETPAGES_ASYNC(). This one lives in sys/vm.

3) The actual implementation of the new sendfile(2). This one lives in
kern/uipc_syscalls.c.
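
To illustrate change 1), here is the flow condensed from the patch
itself (kern/uipc_syscalls.c and kern/uipc_sockbuf.c); it is an
excerpt, not standalone code:

	/*
	 * sendfile(2) appends mbufs whose pages are still being read from
	 * disk; M_NOTREADY makes them count into sb_ccc ("claimed") but
	 * not into sb_acc ("available"), so the data is not sent yet.
	 */
	m0->m_flags |= M_NOTREADY;
	(*so->so_proto->pr_usrreqs->pru_send)(so, PRUS_NOTREADY, m,
	    NULL, NULL, td);

	/*
	 * Later, in the I/O completion callback (sf_io_done() in the
	 * patch), the chain is marked ready; if it was blocking the head
	 * of the send buffer, an empty mbuf is pushed through pru_send to
	 * kick the protocol into transmitting the now-available data.
	 */
	if (sbready(&so->so_snd, sfio->m, sfio->npages) == 0) {
		m = m_get(M_NOWAIT, MT_DATA);
		m->m_len = 0;
		(void )(so->so_proto->pr_usrreqs->pru_send)
		    (so, 0, m, NULL, NULL, curthread);
	}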



  At Netflix, we already see improvements with the new sendfile(2).
We can send more data using the same amount of CPU, and we can push
closer to 0% idle without experiencing short lags.

However, we have a somewhat modified VM subsystem that behaves
optimally for our workload but suboptimally for an average FreeBSD
system. I'd like someone from the community to try the new
sendfile(2) on a different setup and see how it works for you.

  To be an early tester you need to check out the projects/sendfile
branch and build a kernel from it. A world built from head/ will run
fine with it.

  svn co http://svn.freebsd.org/base/projects/sendfile
  cd sendfile
  ... build kernel ...
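
E.g., the standard kernel build from the checked-out tree should do
here (substitute your own kernel config for GENERIC):

  make kernel-toolchain
  make buildkernel KERNCONF=GENERIC
  make installkernel KERNCONF=GENERIC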

Limitations:
- No testing has been done serving files from NFS.
- No testing has been done serving files from ZFS.

-- 
Totus tuus, Glebius.

--IuJpT0rwbUevm2bB
Content-Type: text/x-diff; charset=us-ascii
Content-Disposition: attachment; filename="project-sendfile.diff"

Index: sys/dev/ti/if_ti.c
===================================================================
--- sys/dev/ti/if_ti.c	(.../head)	(revision 266804)
+++ sys/dev/ti/if_ti.c	(.../projects/sendfile)	(revision 266807)
@@ -1629,7 +1629,7 @@ ti_newbuf_jumbo(struct ti_softc *sc, int idx, stru
 			m[i]->m_data = (void *)sf_buf_kva(sf[i]);
 			m[i]->m_len = PAGE_SIZE;
 			MEXTADD(m[i], sf_buf_kva(sf[i]), PAGE_SIZE,
-			    sf_buf_mext, (void*)sf_buf_kva(sf[i]), sf[i],
+			    sf_mext_free, (void*)sf_buf_kva(sf[i]), sf[i],
 			    0, EXT_DISPOSABLE);
 			m[i]->m_next = m[i+1];
 		}
@@ -1694,7 +1694,7 @@ nobufs:
 		if (m[i])
 			m_freem(m[i]);
 		if (sf[i])
-			sf_buf_mext((void *)sf_buf_kva(sf[i]), sf[i]);
+			sf_mext_free((void *)sf_buf_kva(sf[i]), sf[i]);
 	}
 	return (ENOBUFS);
 }
Index: sys/dev/cxgbe/tom/t4_cpl_io.c
===================================================================
--- sys/dev/cxgbe/tom/t4_cpl_io.c	(.../head)	(revision 266804)
+++ sys/dev/cxgbe/tom/t4_cpl_io.c	(.../projects/sendfile)	(revision 266807)
@@ -338,11 +338,11 @@ t4_rcvd(struct toedev *tod, struct tcpcb *tp)
 	INP_WLOCK_ASSERT(inp);
 
 	SOCKBUF_LOCK(sb);
-	KASSERT(toep->sb_cc >= sb->sb_cc,
+	KASSERT(toep->sb_cc >= sbused(sb),
 	    ("%s: sb %p has more data (%d) than last time (%d).",
-	    __func__, sb, sb->sb_cc, toep->sb_cc));
-	toep->rx_credits += toep->sb_cc - sb->sb_cc;
-	toep->sb_cc = sb->sb_cc;
+	    __func__, sb, sbused(sb), toep->sb_cc));
+	toep->rx_credits += toep->sb_cc - sbused(sb);
+	toep->sb_cc = sbused(sb);
 	credits = toep->rx_credits;
 	SOCKBUF_UNLOCK(sb);
 
@@ -863,15 +863,15 @@ do_peer_close(struct sge_iq *iq, const struct rss_
 		tp->rcv_nxt = be32toh(cpl->rcv_nxt);
 		toep->ddp_flags &= ~(DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE);
 
-		KASSERT(toep->sb_cc >= sb->sb_cc,
+		KASSERT(toep->sb_cc >= sbused(sb),
 		    ("%s: sb %p has more data (%d) than last time (%d).",
-		    __func__, sb, sb->sb_cc, toep->sb_cc));
-		toep->rx_credits += toep->sb_cc - sb->sb_cc;
+		    __func__, sb, sbused(sb), toep->sb_cc));
+		toep->rx_credits += toep->sb_cc - sbused(sb);
 #ifdef USE_DDP_RX_FLOW_CONTROL
 		toep->rx_credits -= m->m_len;	/* adjust for F_RX_FC_DDP */
 #endif
-		sbappendstream_locked(sb, m);
-		toep->sb_cc = sb->sb_cc;
+		sbappendstream_locked(sb, m, 0);
+		toep->sb_cc = sbused(sb);
 	}
 	socantrcvmore_locked(so);	/* unlocks the sockbuf */
 
@@ -1281,12 +1281,12 @@ do_rx_data(struct sge_iq *iq, const struct rss_hea
 		}
 	}
 
-	KASSERT(toep->sb_cc >= sb->sb_cc,
+	KASSERT(toep->sb_cc >= sbused(sb),
 	    ("%s: sb %p has more data (%d) than last time (%d).",
-	    __func__, sb, sb->sb_cc, toep->sb_cc));
-	toep->rx_credits += toep->sb_cc - sb->sb_cc;
-	sbappendstream_locked(sb, m);
-	toep->sb_cc = sb->sb_cc;
+	    __func__, sb, sbused(sb), toep->sb_cc));
+	toep->rx_credits += toep->sb_cc - sbused(sb);
+	sbappendstream_locked(sb, m, 0);
+	toep->sb_cc = sbused(sb);
 	sorwakeup_locked(so);
 	SOCKBUF_UNLOCK_ASSERT(sb);
 
Index: sys/dev/cxgbe/tom/t4_ddp.c
===================================================================
--- sys/dev/cxgbe/tom/t4_ddp.c	(.../head)	(revision 266804)
+++ sys/dev/cxgbe/tom/t4_ddp.c	(.../projects/sendfile)	(revision 266807)
@@ -224,15 +224,15 @@ insert_ddp_data(struct toepcb *toep, uint32_t n)
 	tp->rcv_wnd -= n;
 #endif
 
-	KASSERT(toep->sb_cc >= sb->sb_cc,
+	KASSERT(toep->sb_cc >= sbused(sb),
 	    ("%s: sb %p has more data (%d) than last time (%d).",
-	    __func__, sb, sb->sb_cc, toep->sb_cc));
-	toep->rx_credits += toep->sb_cc - sb->sb_cc;
+	    __func__, sb, sbused(sb), toep->sb_cc));
+	toep->rx_credits += toep->sb_cc - sbused(sb);
 #ifdef USE_DDP_RX_FLOW_CONTROL
 	toep->rx_credits -= n;	/* adjust for F_RX_FC_DDP */
 #endif
-	sbappendstream_locked(sb, m);
-	toep->sb_cc = sb->sb_cc;
+	sbappendstream_locked(sb, m, 0);
+	toep->sb_cc = sbused(sb);
 }
 
 /* SET_TCB_FIELD sent as a ULP command looks like this */
@@ -459,15 +459,15 @@ handle_ddp_data(struct toepcb *toep, __be32 ddp_re
 	else
 		discourage_ddp(toep);
 
-	KASSERT(toep->sb_cc >= sb->sb_cc,
+	KASSERT(toep->sb_cc >= sbused(sb),
 	    ("%s: sb %p has more data (%d) than last time (%d).",
-	    __func__, sb, sb->sb_cc, toep->sb_cc));
-	toep->rx_credits += toep->sb_cc - sb->sb_cc;
+	    __func__, sb, sbused(sb), toep->sb_cc));
+	toep->rx_credits += toep->sb_cc - sbused(sb);
 #ifdef USE_DDP_RX_FLOW_CONTROL
 	toep->rx_credits -= len;	/* adjust for F_RX_FC_DDP */
 #endif
-	sbappendstream_locked(sb, m);
-	toep->sb_cc = sb->sb_cc;
+	sbappendstream_locked(sb, m, 0);
+	toep->sb_cc = sbused(sb);
 wakeup:
 	KASSERT(toep->ddp_flags & db_flag,
 	    ("%s: DDP buffer not active. toep %p, ddp_flags 0x%x, report 0x%x",
@@ -897,7 +897,7 @@ handle_ddp(struct socket *so, struct uio *uio, int
 #endif
 
 	/* XXX: too eager to disable DDP, could handle NBIO better than this. */
-	if (sb->sb_cc >= uio->uio_resid || uio->uio_resid < sc->tt.ddp_thres ||
+	if (sbused(sb) >= uio->uio_resid || uio->uio_resid < sc->tt.ddp_thres ||
 	    uio->uio_resid > MAX_DDP_BUFFER_SIZE || uio->uio_iovcnt > 1 ||
 	    so->so_state & SS_NBIO || flags & (MSG_DONTWAIT | MSG_NBIO) ||
 	    error || so->so_error || sb->sb_state & SBS_CANTRCVMORE)
@@ -935,7 +935,7 @@ handle_ddp(struct socket *so, struct uio *uio, int
 	 * payload.
 	 */
 	ddp_flags = select_ddp_flags(so, flags, db_idx);
-	wr = mk_update_tcb_for_ddp(sc, toep, db_idx, sb->sb_cc, ddp_flags);
+	wr = mk_update_tcb_for_ddp(sc, toep, db_idx, sbused(sb), ddp_flags);
 	if (wr == NULL) {
 		/*
 		 * Just unhold the pages.  The DDP buffer's software state is
@@ -960,8 +960,9 @@ handle_ddp(struct socket *so, struct uio *uio, int
 	 */
 	rc = sbwait(sb);
 	while (toep->ddp_flags & buf_flag) {
+		/* XXXGL: shouldn't here be sbwait() call? */
 		sb->sb_flags |= SB_WAIT;
-		msleep(&sb->sb_cc, &sb->sb_mtx, PSOCK , "sbwait", 0);
+		msleep(&sb->sb_acc, &sb->sb_mtx, PSOCK , "sbwait", 0);
 	}
 	unwire_ddp_buffer(db);
 	return (rc);
@@ -1123,8 +1124,8 @@ restart:
 
 		/* uio should be just as it was at entry */
 		KASSERT(oresid == uio->uio_resid,
-		    ("%s: oresid = %d, uio_resid = %zd, sb_cc = %d",
-		    __func__, oresid, uio->uio_resid, sb->sb_cc));
+		    ("%s: oresid = %d, uio_resid = %zd, sbused = %d",
+		    __func__, oresid, uio->uio_resid, sbused(sb)));
 
 		error = handle_ddp(so, uio, flags, 0);
 		ddp_handled = 1;
@@ -1134,7 +1135,7 @@ restart:
 
 	/* Abort if socket has reported problems. */
 	if (so->so_error) {
-		if (sb->sb_cc > 0)
+		if (sbused(sb))
 			goto deliver;
 		if (oresid > uio->uio_resid)
 			goto out;
@@ -1146,7 +1147,7 @@ restart:
 
 	/* Door is closed.  Deliver what is left, if any. */
 	if (sb->sb_state & SBS_CANTRCVMORE) {
-		if (sb->sb_cc > 0)
+		if (sbused(sb))
 			goto deliver;
 		else
 			goto out;
@@ -1153,7 +1154,7 @@ restart:
 	}
 
 	/* Socket buffer is empty and we shall not block. */
-	if (sb->sb_cc == 0 &&
+	if (sbused(sb) == 0 &&
 	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
 		error = EAGAIN;
 		goto out;
@@ -1160,18 +1161,18 @@ restart:
 	}
 
 	/* Socket buffer got some data that we shall deliver now. */
-	if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
+	if (sbused(sb) && !(flags & MSG_WAITALL) &&
 	    ((sb->sb_flags & SS_NBIO) ||
 	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
-	     sb->sb_cc >= sb->sb_lowat ||
-	     sb->sb_cc >= uio->uio_resid ||
-	     sb->sb_cc >= sb->sb_hiwat) ) {
+	     sbused(sb) >= sb->sb_lowat ||
+	     sbused(sb) >= uio->uio_resid ||
+	     sbused(sb) >= sb->sb_hiwat) ) {
 		goto deliver;
 	}
 
 	/* On MSG_WAITALL we must wait until all data or error arrives. */
 	if ((flags & MSG_WAITALL) &&
-	    (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_lowat))
+	    (sbused(sb) >= uio->uio_resid || sbused(sb) >= sb->sb_lowat))
 		goto deliver;
 
 	/*
@@ -1190,7 +1191,7 @@ restart:
 
 deliver:
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
-	KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
+	KASSERT(sbused(sb) > 0, ("%s: sockbuf empty", __func__));
 	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
 
 	if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled)
@@ -1201,7 +1202,7 @@ deliver:
 		uio->uio_td->td_ru.ru_msgrcv++;
 
 	/* Fill uio until full or current end of socket buffer is reached. */
-	len = min(uio->uio_resid, sb->sb_cc);
+	len = min(uio->uio_resid, sbused(sb));
 	if (mp0 != NULL) {
 		/* Dequeue as many mbufs as possible. */
 		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
Index: sys/dev/cxgbe/iw_cxgbe/cm.c
===================================================================
--- sys/dev/cxgbe/iw_cxgbe/cm.c	(.../head)	(revision 266804)
+++ sys/dev/cxgbe/iw_cxgbe/cm.c	(.../projects/sendfile)	(revision 266807)
@@ -585,8 +585,8 @@ process_data(struct c4iw_ep *ep)
 {
 	struct sockaddr_in *local, *remote;
 
-	CTR5(KTR_IW_CXGBE, "%s: so %p, ep %p, state %s, sb_cc %d", __func__,
-	    ep->com.so, ep, states[ep->com.state], ep->com.so->so_rcv.sb_cc);
+	CTR5(KTR_IW_CXGBE, "%s: so %p, ep %p, state %s, sbused %d", __func__,
+	    ep->com.so, ep, states[ep->com.state], sbused(&ep->com.so->so_rcv));
 
 	switch (state_read(&ep->com)) {
 	case MPA_REQ_SENT:
@@ -602,11 +602,11 @@ process_data(struct c4iw_ep *ep)
 		process_mpa_request(ep);
 		break;
 	default:
-		if (ep->com.so->so_rcv.sb_cc)
-			log(LOG_ERR, "%s: Unexpected streaming data.  "
-			    "ep %p, state %d, so %p, so_state 0x%x, sb_cc %u\n",
+		if (sbused(&ep->com.so->so_rcv))
+			log(LOG_ERR, "%s: Unexpected streaming data. ep %p, "
+			    "state %d, so %p, so_state 0x%x, sbused %u\n",
 			    __func__, ep, state_read(&ep->com), ep->com.so,
-			    ep->com.so->so_state, ep->com.so->so_rcv.sb_cc);
+			    ep->com.so->so_state, sbused(&ep->com.so->so_rcv));
 		break;
 	}
 }
Index: sys/dev/iscsi/icl.c
===================================================================
--- sys/dev/iscsi/icl.c	(.../head)	(revision 266804)
+++ sys/dev/iscsi/icl.c	(.../projects/sendfile)	(revision 266807)
@@ -758,7 +758,7 @@ icl_receive_thread(void *arg)
 		 * is enough data received to read the PDU.
 		 */
 		SOCKBUF_LOCK(&so->so_rcv);
-		available = so->so_rcv.sb_cc;
+		available = sbavail(&so->so_rcv);
 		if (available < ic->ic_receive_len) {
 			so->so_rcv.sb_lowat = ic->ic_receive_len;
 			cv_wait(&ic->ic_receive_cv, &so->so_rcv.sb_mtx);
Index: sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c
===================================================================
--- sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c	(.../head)	(revision 266804)
+++ sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c	(.../projects/sendfile)	(revision 266807)
@@ -445,8 +445,8 @@ t3_push_frames(struct socket *so, int req_completi
 	 * Autosize the send buffer.
 	 */
 	if (snd->sb_flags & SB_AUTOSIZE && VNET(tcp_do_autosndbuf)) {
-		if (snd->sb_cc >= (snd->sb_hiwat / 8 * 7) &&
-		    snd->sb_cc < VNET(tcp_autosndbuf_max)) {
+		if (sbused(snd) >= (snd->sb_hiwat / 8 * 7) &&
+		    sbused(snd) < VNET(tcp_autosndbuf_max)) {
 			if (!sbreserve_locked(snd, min(snd->sb_hiwat +
 			    VNET(tcp_autosndbuf_inc), VNET(tcp_autosndbuf_max)),
 			    so, curthread))
@@ -597,10 +597,10 @@ t3_rcvd(struct toedev *tod, struct tcpcb *tp)
 	INP_WLOCK_ASSERT(inp);
 
 	SOCKBUF_LOCK(so_rcv);
-	KASSERT(toep->tp_enqueued >= so_rcv->sb_cc,
-	    ("%s: so_rcv->sb_cc > enqueued", __func__));
-	toep->tp_rx_credits += toep->tp_enqueued - so_rcv->sb_cc;
-	toep->tp_enqueued = so_rcv->sb_cc;
+	KASSERT(toep->tp_enqueued >= sbused(so_rcv),
+	    ("%s: sbused(so_rcv) > enqueued", __func__));
+	toep->tp_rx_credits += toep->tp_enqueued - sbused(so_rcv);
+	toep->tp_enqueued = sbused(so_rcv);
 	SOCKBUF_UNLOCK(so_rcv);
 
 	must_send = toep->tp_rx_credits + 16384 >= tp->rcv_wnd;
@@ -1199,7 +1199,7 @@ do_rx_data(struct sge_qset *qs, struct rsp_desc *r
 	}
 
 	toep->tp_enqueued += m->m_pkthdr.len;
-	sbappendstream_locked(so_rcv, m);
+	sbappendstream_locked(so_rcv, m, 0);
 	sorwakeup_locked(so);
 	SOCKBUF_UNLOCK_ASSERT(so_rcv);
 
@@ -1768,7 +1768,7 @@ wr_ack(struct toepcb *toep, struct mbuf *m)
 		so_sowwakeup_locked(so);
 	}
 
-	if (snd->sb_sndptroff < snd->sb_cc)
+	if (snd->sb_sndptroff < sbused(snd))
 		t3_push_frames(so, 0);
 
 out_free:
Index: sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c
===================================================================
--- sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c	(.../head)	(revision 266804)
+++ sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c	(.../projects/sendfile)	(revision 266807)
@@ -1515,11 +1515,11 @@ process_data(struct iwch_ep *ep)
 		process_mpa_request(ep);
 		break;
 	default:
-		if (ep->com.so->so_rcv.sb_cc) 
+		if (sbavail(&ep->com.so->so_rcv)) 
 			printf("%s Unexpected streaming data."
 			       " ep %p state %d so %p so_state %x so_rcv.sb_cc %u so_rcv.sb_mb %p\n",
 			       __FUNCTION__, ep, state_read(&ep->com), ep->com.so, ep->com.so->so_state,
-			       ep->com.so->so_rcv.sb_cc, ep->com.so->so_rcv.sb_mb);
+			       sbavail(&ep->com.so->so_rcv), ep->com.so->so_rcv.sb_mb);
 		break;
 	}
 	return;
Index: sys/kern/uipc_debug.c
===================================================================
--- sys/kern/uipc_debug.c	(.../head)	(revision 266804)
+++ sys/kern/uipc_debug.c	(.../projects/sendfile)	(revision 266807)
@@ -403,7 +403,8 @@ db_print_sockbuf(struct sockbuf *sb, const char *s
 	db_printf("sb_sndptroff: %u\n", sb->sb_sndptroff);
 
 	db_print_indent(indent);
-	db_printf("sb_cc: %u   ", sb->sb_cc);
+	db_printf("sb_acc: %u   ", sb->sb_acc);
+	db_printf("sb_ccc: %u   ", sb->sb_ccc);
 	db_printf("sb_hiwat: %u   ", sb->sb_hiwat);
 	db_printf("sb_mbcnt: %u   ", sb->sb_mbcnt);
 	db_printf("sb_mbmax: %u\n", sb->sb_mbmax);
Index: sys/kern/uipc_mbuf.c
===================================================================
--- sys/kern/uipc_mbuf.c	(.../head)	(revision 266804)
+++ sys/kern/uipc_mbuf.c	(.../projects/sendfile)	(revision 266807)
@@ -389,7 +389,7 @@ mb_dupcl(struct mbuf *n, struct mbuf *m)
  * cleaned too.
  */
 void
-m_demote(struct mbuf *m0, int all)
+m_demote(struct mbuf *m0, int all, int flags)
 {
 	struct mbuf *m;
 
@@ -405,7 +405,7 @@ void
 			m_freem(m->m_nextpkt);
 			m->m_nextpkt = NULL;
 		}
-		m->m_flags = m->m_flags & (M_EXT|M_RDONLY|M_NOFREE);
+		m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE | flags);
 	}
 }
 
Index: sys/kern/sys_socket.c
===================================================================
--- sys/kern/sys_socket.c	(.../head)	(revision 266804)
+++ sys/kern/sys_socket.c	(.../projects/sendfile)	(revision 266807)
@@ -167,20 +167,17 @@ soo_ioctl(struct file *fp, u_long cmd, void *data,
 
 	case FIONREAD:
 		/* Unlocked read. */
-		*(int *)data = so->so_rcv.sb_cc;
+		*(int *)data = sbavail(&so->so_rcv);
 		break;
 
 	case FIONWRITE:
 		/* Unlocked read. */
-		*(int *)data = so->so_snd.sb_cc;
+		*(int *)data = sbavail(&so->so_snd);
 		break;
 
 	case FIONSPACE:
-		if ((so->so_snd.sb_hiwat < so->so_snd.sb_cc) ||
-		    (so->so_snd.sb_mbmax < so->so_snd.sb_mbcnt))
-			*(int *)data = 0;
-		else
-			*(int *)data = sbspace(&so->so_snd);
+		/* Unlocked read. */
+		*(int *)data = sbspace(&so->so_snd);
 		break;
 
 	case FIOSETOWN:
@@ -246,6 +243,7 @@ soo_stat(struct file *fp, struct stat *ub, struct
     struct thread *td)
 {
 	struct socket *so = fp->f_data;
+	struct sockbuf *sb;
 #ifdef MAC
 	int error;
 #endif
@@ -261,15 +259,18 @@ soo_stat(struct file *fp, struct stat *ub, struct
 	 * If SBS_CANTRCVMORE is set, but there's still data left in the
 	 * receive buffer, the socket is still readable.
 	 */
-	SOCKBUF_LOCK(&so->so_rcv);
-	if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0 ||
-	    so->so_rcv.sb_cc != 0)
+	sb = &so->so_rcv;
+	SOCKBUF_LOCK(sb);
+	if ((sb->sb_state & SBS_CANTRCVMORE) == 0 || sbavail(sb))
 		ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH;
-	ub->st_size = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
-	SOCKBUF_UNLOCK(&so->so_rcv);
-	/* Unlocked read. */
-	if ((so->so_snd.sb_state & SBS_CANTSENDMORE) == 0)
+	ub->st_size = sbavail(sb) - sb->sb_ctl;
+	SOCKBUF_UNLOCK(sb);
+
+	sb = &so->so_snd;
+	SOCKBUF_LOCK(sb);
+	if ((sb->sb_state & SBS_CANTSENDMORE) == 0)
 		ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH;
+	SOCKBUF_UNLOCK(sb);
 	ub->st_uid = so->so_cred->cr_uid;
 	ub->st_gid = so->so_cred->cr_gid;
 	return (*so->so_proto->pr_usrreqs->pru_sense)(so, ub);
Index: sys/kern/uipc_usrreq.c
===================================================================
--- sys/kern/uipc_usrreq.c	(.../head)	(revision 266804)
+++ sys/kern/uipc_usrreq.c	(.../projects/sendfile)	(revision 266807)
@@ -790,11 +790,10 @@ uipc_rcvd(struct socket *so, int flags)
 	u_int mbcnt, sbcc;
 
 	unp = sotounpcb(so);
-	KASSERT(unp != NULL, ("uipc_rcvd: unp == NULL"));
+	KASSERT(unp != NULL, ("%s: unp == NULL", __func__));
+	KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET,
+	    ("%s: socktype %d", __func__, so->so_type));
 
-	if (so->so_type != SOCK_STREAM && so->so_type != SOCK_SEQPACKET)
-		panic("uipc_rcvd socktype %d", so->so_type);
-
 	/*
 	 * Adjust backpressure on sender and wakeup any waiting to write.
 	 *
@@ -807,7 +806,7 @@ uipc_rcvd(struct socket *so, int flags)
 	 */
 	SOCKBUF_LOCK(&so->so_rcv);
 	mbcnt = so->so_rcv.sb_mbcnt;
-	sbcc = so->so_rcv.sb_cc;
+	sbcc = sbavail(&so->so_rcv);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 	/*
 	 * There is a benign race condition at this point.  If we're planning to
@@ -843,7 +842,10 @@ uipc_send(struct socket *so, int flags, struct mbu
 	int error = 0;
 
 	unp = sotounpcb(so);
-	KASSERT(unp != NULL, ("uipc_send: unp == NULL"));
+	KASSERT(unp != NULL, ("%s: unp == NULL", __func__));
+	KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM ||
+	    so->so_type == SOCK_SEQPACKET,
+	    ("%s: socktype %d", __func__, so->so_type));
 
 	if (flags & PRUS_OOB) {
 		error = EOPNOTSUPP;
@@ -994,7 +996,7 @@ uipc_send(struct socket *so, int flags, struct mbu
 		}
 
 		mbcnt = so2->so_rcv.sb_mbcnt;
-		sbcc = so2->so_rcv.sb_cc;
+		sbcc = sbavail(&so2->so_rcv);
 		sorwakeup_locked(so2);
 
 		/*
@@ -1011,9 +1013,6 @@ uipc_send(struct socket *so, int flags, struct mbu
 		UNP_PCB_UNLOCK(unp2);
 		m = NULL;
 		break;
-
-	default:
-		panic("uipc_send unknown socktype");
 	}
 
 	/*
Index: sys/kern/vfs_default.c
===================================================================
--- sys/kern/vfs_default.c	(.../head)	(revision 266804)
+++ sys/kern/vfs_default.c	(.../projects/sendfile)	(revision 266807)
@@ -111,6 +111,7 @@ struct vop_vector default_vnodeops = {
 	.vop_close =		VOP_NULL,
 	.vop_fsync =		VOP_NULL,
 	.vop_getpages =		vop_stdgetpages,
+	.vop_getpages_async =	vop_stdgetpages_async,
 	.vop_getwritemount = 	vop_stdgetwritemount,
 	.vop_inactive =		VOP_NULL,
 	.vop_ioctl =		VOP_ENOTTY,
@@ -726,10 +727,19 @@ vop_stdgetpages(ap)
 {
 
 	return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
-	    ap->a_count, ap->a_reqpage);
+	    ap->a_count, ap->a_reqpage, NULL, NULL);
 }
 
+/* XXX Needs good comment and a manpage. */
 int
+vop_stdgetpages_async(struct vop_getpages_async_args *ap)
+{
+
+	return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
+	    ap->a_count, ap->a_reqpage, ap->a_vop_getpages_iodone, ap->a_arg);
+}
+
+int
 vop_stdkqfilter(struct vop_kqfilter_args *ap)
 {
 	return vfs_kqfilter(ap);
Index: sys/kern/uipc_socket.c
===================================================================
--- sys/kern/uipc_socket.c	(.../head)	(revision 266804)
+++ sys/kern/uipc_socket.c	(.../projects/sendfile)	(revision 266807)
@@ -1459,12 +1459,12 @@ restart:
 	 *   2. MSG_DONTWAIT is not set
 	 */
 	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
-	    so->so_rcv.sb_cc < uio->uio_resid) &&
-	    so->so_rcv.sb_cc < so->so_rcv.sb_lowat &&
+	    sbavail(&so->so_rcv) < uio->uio_resid) &&
+	    sbavail(&so->so_rcv) < so->so_rcv.sb_lowat &&
 	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
-		KASSERT(m != NULL || !so->so_rcv.sb_cc,
-		    ("receive: m == %p so->so_rcv.sb_cc == %u",
-		    m, so->so_rcv.sb_cc));
+		KASSERT(m != NULL || !sbavail(&so->so_rcv),
+		    ("receive: m == %p sbavail == %u",
+		    m, sbavail(&so->so_rcv)));
 		if (so->so_error) {
 			if (m != NULL)
 				goto dontblock;
@@ -1746,9 +1746,7 @@ dontblock:
 						SOCKBUF_LOCK(&so->so_rcv);
 					}
 				}
-				m->m_data += len;
-				m->m_len -= len;
-				so->so_rcv.sb_cc -= len;
+				sbmtrim(&so->so_rcv, m, len);
 			}
 		}
 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
@@ -1913,7 +1911,7 @@ restart:
 
 	/* Abort if socket has reported problems. */
 	if (so->so_error) {
-		if (sb->sb_cc > 0)
+		if (sbavail(sb) > 0)
 			goto deliver;
 		if (oresid > uio->uio_resid)
 			goto out;
@@ -1925,7 +1923,7 @@ restart:
 
 	/* Door is closed.  Deliver what is left, if any. */
 	if (sb->sb_state & SBS_CANTRCVMORE) {
-		if (sb->sb_cc > 0)
+		if (sbavail(sb) > 0)
 			goto deliver;
 		else
 			goto out;
@@ -1932,7 +1930,7 @@ restart:
 	}
 
 	/* Socket buffer is empty and we shall not block. */
-	if (sb->sb_cc == 0 &&
+	if (sbavail(sb) == 0 &&
 	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
 		error = EAGAIN;
 		goto out;
@@ -1939,18 +1937,18 @@ restart:
 	}
 
 	/* Socket buffer got some data that we shall deliver now. */
-	if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
+	if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) &&
 	    ((sb->sb_flags & SS_NBIO) ||
 	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
-	     sb->sb_cc >= sb->sb_lowat ||
-	     sb->sb_cc >= uio->uio_resid ||
-	     sb->sb_cc >= sb->sb_hiwat) ) {
+	     sbavail(sb) >= sb->sb_lowat ||
+	     sbavail(sb) >= uio->uio_resid ||
+	     sbavail(sb) >= sb->sb_hiwat) ) {
 		goto deliver;
 	}
 
 	/* On MSG_WAITALL we must wait until all data or error arrives. */
 	if ((flags & MSG_WAITALL) &&
-	    (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_hiwat))
+	    (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat))
 		goto deliver;
 
 	/*
@@ -1964,7 +1962,7 @@ restart:
 
 deliver:
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
-	KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
+	KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__));
 	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
 
 	/* Statistics. */
@@ -1972,7 +1970,7 @@ deliver:
 		uio->uio_td->td_ru.ru_msgrcv++;
 
 	/* Fill uio until full or current end of socket buffer is reached. */
-	len = min(uio->uio_resid, sb->sb_cc);
+	len = min(uio->uio_resid, sbavail(sb));
 	if (mp0 != NULL) {
 		/* Dequeue as many mbufs as possible. */
 		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
@@ -1983,6 +1981,8 @@ deliver:
 			for (m = sb->sb_mb;
 			     m != NULL && m->m_len <= len;
 			     m = m->m_next) {
+				KASSERT(!(m->m_flags & M_NOTAVAIL),
+				    ("%s: m %p not available", __func__, m));
 				len -= m->m_len;
 				uio->uio_resid -= m->m_len;
 				sbfree(sb, m);
@@ -2107,9 +2107,9 @@ soreceive_dgram(struct socket *so, struct sockaddr
 	 */
 	SOCKBUF_LOCK(&so->so_rcv);
 	while ((m = so->so_rcv.sb_mb) == NULL) {
-		KASSERT(so->so_rcv.sb_cc == 0,
-		    ("soreceive_dgram: sb_mb NULL but sb_cc %u",
-		    so->so_rcv.sb_cc));
+		KASSERT(sbavail(&so->so_rcv) == 0,
+		    ("soreceive_dgram: sb_mb NULL but sbavail %u",
+		    sbavail(&so->so_rcv)));
 		if (so->so_error) {
 			error = so->so_error;
 			so->so_error = 0;
@@ -3157,7 +3157,7 @@ filt_soread(struct knote *kn, long hint)
 	so = kn->kn_fp->f_data;
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 
-	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
+	kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl;
 	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 		kn->kn_flags |= EV_EOF;
 		kn->kn_fflags = so->so_error;
@@ -3167,7 +3167,7 @@ filt_soread(struct knote *kn, long hint)
 	else if (kn->kn_sfflags & NOTE_LOWAT)
 		return (kn->kn_data >= kn->kn_sdata);
 	else
-		return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
+		return (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat);
 }
 
 static void
@@ -3350,7 +3350,7 @@ soisdisconnected(struct socket *so)
 	sorwakeup_locked(so);
 	SOCKBUF_LOCK(&so->so_snd);
 	so->so_snd.sb_state |= SBS_CANTSENDMORE;
-	sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
+	sbdrop_locked(&so->so_snd, sbused(&so->so_snd));
 	sowwakeup_locked(so);
 	wakeup(&so->so_timeo);
 }
Index: sys/kern/vnode_if.src
===================================================================
--- sys/kern/vnode_if.src	(.../head)	(revision 266804)
+++ sys/kern/vnode_if.src	(.../projects/sendfile)	(revision 266807)
@@ -477,6 +477,19 @@ vop_getpages {
 };
 
 
+%% getpages_async	vp	L L L
+
+vop_getpages_async {
+	IN struct vnode *vp;
+	IN vm_page_t *m;
+	IN int count;
+	IN int reqpage;
+	IN vm_ooffset_t offset;
+	IN void (*vop_getpages_iodone)(void *);
+	IN void *arg;
+};
+
+
 %% putpages	vp	L L L
 
 vop_putpages {
Index: sys/kern/uipc_sockbuf.c
===================================================================
--- sys/kern/uipc_sockbuf.c	(.../head)	(revision 266804)
+++ sys/kern/uipc_sockbuf.c	(.../projects/sendfile)	(revision 266807)
@@ -68,7 +68,152 @@ static	u_long sb_efficiency = 8;	/* parameter for
 static struct mbuf	*sbcut_internal(struct sockbuf *sb, int len);
 static void	sbflush_internal(struct sockbuf *sb);
 
+static void
+sb_shift_nrdy(struct sockbuf *sb, struct mbuf *m)
+{
+
+	SOCKBUF_LOCK_ASSERT(sb);
+	KASSERT(m->m_flags & M_NOTREADY, ("%s: m %p !M_NOTREADY", __func__, m));
+
+	m = m->m_next;
+	while (m != NULL && !(m->m_flags & M_NOTREADY)) {
+		m->m_flags &= ~M_BLOCKED;
+		sb->sb_acc += m->m_len;
+		m = m->m_next;
+	}
+
+	sb->sb_fnrdy = m;
+}
+
+int
+sbready(struct sockbuf *sb, struct mbuf *m, int count)
+{
+	u_int blocker;
+
+	SOCKBUF_LOCK(sb);
+
+	if (sb->sb_state & SBS_CANTSENDMORE) {
+		SOCKBUF_UNLOCK(sb);
+		return (ENOTCONN);
+	}
+
+	KASSERT(sb->sb_fnrdy != NULL, ("%s: sb %p NULL fnrdy", __func__, sb));
+
+	blocker = (sb->sb_fnrdy == m) ? M_BLOCKED : 0;
+
+	for (int i = 0; i < count; i++, m = m->m_next) {
+		KASSERT(m->m_flags & M_NOTREADY,
+		    ("%s: m %p !M_NOTREADY", __func__, m));
+		m->m_flags &= ~(M_NOTREADY | blocker);
+		if (blocker)
+			sb->sb_acc += m->m_len;
+	}
+
+	if (!blocker) {
+		SOCKBUF_UNLOCK(sb);
+		return (EWOULDBLOCK);
+	}
+
+	/* This one was blocking all the queue. */
+	for (; m && (m->m_flags & M_NOTREADY) == 0; m = m->m_next) {
+		KASSERT(m->m_flags & M_BLOCKED,
+		    ("%s: m %p !M_BLOCKED", __func__, m));
+		m->m_flags &= ~M_BLOCKED;
+		sb->sb_acc += m->m_len;
+	}
+
+	sb->sb_fnrdy = m;
+
+	SOCKBUF_UNLOCK(sb);
+
+	return (0);
+}
+
 /*
+ * Adjust sockbuf state reflecting allocation of m.
+ */
+void
+sballoc(struct sockbuf *sb, struct mbuf *m)
+{
+
+	SOCKBUF_LOCK_ASSERT(sb);
+
+	sb->sb_ccc += m->m_len;
+
+	if (sb->sb_fnrdy == NULL) {
+		if (m->m_flags & M_NOTREADY)
+			sb->sb_fnrdy = m;
+		else
+			sb->sb_acc += m->m_len;
+	} else
+		m->m_flags |= M_BLOCKED;
+
+	if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
+		sb->sb_ctl += m->m_len;
+
+	sb->sb_mbcnt += MSIZE;
+	sb->sb_mcnt += 1;
+
+	if (m->m_flags & M_EXT) {
+		sb->sb_mbcnt += m->m_ext.ext_size;
+		sb->sb_ccnt += 1;
+	}
+}
+
+/*
+ * Adjust sockbuf state reflecting freeing of m.
+ */
+void
+sbfree(struct sockbuf *sb, struct mbuf *m)
+{
+
+#if 0	/* XXX: not yet: soclose() call path comes here w/o lock. */
+	SOCKBUF_LOCK_ASSERT(sb);
+#endif
+
+	sb->sb_ccc -= m->m_len;
+
+	if (!(m->m_flags & M_NOTAVAIL))
+		sb->sb_acc -= m->m_len;
+
+	if (sb->sb_fnrdy == m)
+		sb_shift_nrdy(sb, m);
+
+	if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
+		sb->sb_ctl -= m->m_len;
+
+	sb->sb_mbcnt -= MSIZE;
+	sb->sb_mcnt -= 1;
+	if (m->m_flags & M_EXT) {
+		sb->sb_mbcnt -= m->m_ext.ext_size;
+		sb->sb_ccnt -= 1;
+	}
+
+	if (sb->sb_sndptr == m) {
+		sb->sb_sndptr = NULL;
+		sb->sb_sndptroff = 0;
+	}
+	if (sb->sb_sndptroff != 0)
+		sb->sb_sndptroff -= m->m_len;
+}
+
+/*
+ * Trim some amount of data from (first?) mbuf in buffer.
+ */
+void
+sbmtrim(struct sockbuf *sb, struct mbuf *m, int len)
+{
+
+	SOCKBUF_LOCK_ASSERT(sb);
+	KASSERT(len < m->m_len, ("%s: m %p len %d", __func__, m, len));
+
+	m->m_data += len;
+	m->m_len -= len;
+	sb->sb_acc -= len;
+	sb->sb_ccc -= len;
+}
+
+/*
  * Socantsendmore indicates that no more data will be sent on the socket; it
  * would normally be applied to a socket when the user informs the system
  * that no more data is to be sent, by the protocol code (in case
@@ -127,7 +272,7 @@ sbwait(struct sockbuf *sb)
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	sb->sb_flags |= SB_WAIT;
-	return (msleep_sbt(&sb->sb_cc, &sb->sb_mtx,
+	return (msleep_sbt(&sb->sb_acc, &sb->sb_mtx,
 	    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
 	    sb->sb_timeo, 0, 0));
 }
@@ -184,7 +329,7 @@ sowakeup(struct socket *so, struct sockbuf *sb)
 		sb->sb_flags &= ~SB_SEL;
 	if (sb->sb_flags & SB_WAIT) {
 		sb->sb_flags &= ~SB_WAIT;
-		wakeup(&sb->sb_cc);
+		wakeup(&sb->sb_acc);
 	}
 	KNOTE_LOCKED(&sb->sb_sel.si_note, 0);
 	if (sb->sb_upcall != NULL) {
@@ -519,7 +664,7 @@ sbappend(struct sockbuf *sb, struct mbuf *m)
  * that is, a stream protocol (such as TCP).
  */
 void
-sbappendstream_locked(struct sockbuf *sb, struct mbuf *m)
+sbappendstream_locked(struct sockbuf *sb, struct mbuf *m, int flags)
 {
 	SOCKBUF_LOCK_ASSERT(sb);
 
@@ -529,8 +674,8 @@ void
 	SBLASTMBUFCHK(sb);
 
 	/* Remove all packet headers and mbuf tags to get a pure data chain. */
-	m_demote(m, 1);
-	
+	m_demote(m, 1, flags & PRUS_NOTREADY ? M_NOTREADY : 0);
+
 	sbcompress(sb, m, sb->sb_mbtail);
 
 	sb->sb_lastrecord = sb->sb_mb;
@@ -543,38 +688,59 @@ void
  * that is, a stream protocol (such as TCP).
  */
 void
-sbappendstream(struct sockbuf *sb, struct mbuf *m)
+sbappendstream(struct sockbuf *sb, struct mbuf *m, int flags)
 {
 
 	SOCKBUF_LOCK(sb);
-	sbappendstream_locked(sb, m);
+	sbappendstream_locked(sb, m, flags);
 	SOCKBUF_UNLOCK(sb);
 }
 
 #ifdef SOCKBUF_DEBUG
 void
-sbcheck(struct sockbuf *sb)
+sbcheck(struct sockbuf *sb, const char *file, int line)
 {
-	struct mbuf *m;
-	struct mbuf *n = 0;
-	u_long len = 0, mbcnt = 0;
+	struct mbuf *m, *n, *fnrdy;
+	u_long acc, ccc, mbcnt;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 
+	acc = ccc = mbcnt = 0;
+	fnrdy = NULL;
+
 	for (m = sb->sb_mb; m; m = n) {
 	    n = m->m_nextpkt;
 	    for (; m; m = m->m_next) {
-		len += m->m_len;
+		if ((m->m_flags & M_NOTREADY) && fnrdy == NULL) {
+			if (m != sb->sb_fnrdy) {
+				printf("sb %p: fnrdy %p != m %p\n",
+				    sb, sb->sb_fnrdy, m);
+				goto fail;
+			}
+			fnrdy = m;
+		}
+		if (fnrdy) {
+			if (!(m->m_flags & M_NOTAVAIL)) {
+				printf("sb %p: fnrdy %p, m %p is avail\n",
+				    sb, sb->sb_fnrdy, m);
+				goto fail;
+			}
+		} else
+			acc += m->m_len;
+		ccc += m->m_len;
 		mbcnt += MSIZE;
 		if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
 			mbcnt += m->m_ext.ext_size;
 	    }
 	}
-	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
-		printf("cc %ld != %u || mbcnt %ld != %u\n", len, sb->sb_cc,
-		    mbcnt, sb->sb_mbcnt);
-		panic("sbcheck");
+	if (acc != sb->sb_acc || ccc != sb->sb_ccc || mbcnt != sb->sb_mbcnt) {
+		printf("acc %ld/%u ccc %ld/%u mbcnt %ld/%u\n",
+		    acc, sb->sb_acc, ccc, sb->sb_ccc, mbcnt, sb->sb_mbcnt);
+		goto fail;
 	}
+	return;
+fail:
+	panic("%s from %s:%u", __func__, file, line);
 }
 #endif
 
@@ -800,6 +966,7 @@ sbcompress(struct sockbuf *sb, struct mbuf *m, str
 		if (n && (n->m_flags & M_EOR) == 0 &&
 		    M_WRITABLE(n) &&
 		    ((sb->sb_flags & SB_NOCOALESCE) == 0) &&
+		    !(m->m_flags & M_NOTREADY) &&
 		    m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
 		    m->m_len <= M_TRAILINGSPACE(n) &&
 		    n->m_type == m->m_type) {
@@ -806,7 +973,9 @@ sbcompress(struct sockbuf *sb, struct mbuf *m, str
 			bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
 			    (unsigned)m->m_len);
 			n->m_len += m->m_len;
-			sb->sb_cc += m->m_len;
+			sb->sb_ccc += m->m_len;
+			if (sb->sb_fnrdy == NULL)
+				sb->sb_acc += m->m_len;
 			if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
 				/* XXX: Probably don't need.*/
 				sb->sb_ctl += m->m_len;
@@ -843,13 +1012,13 @@ sbflush_internal(struct sockbuf *sb)
 		 * Don't call sbcut(sb, 0) if the leading mbuf is non-empty:
 		 * we would loop forever. Panic instead.
 		 */
-		if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
+		if (sb->sb_ccc == 0 && (sb->sb_mb == NULL || sb->sb_mb->m_len))
 			break;
-		m_freem(sbcut_internal(sb, (int)sb->sb_cc));
+		m_freem(sbcut_internal(sb, (int)sb->sb_ccc));
 	}
-	if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt)
-		panic("sbflush_internal: cc %u || mb %p || mbcnt %u",
-		    sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt);
+	KASSERT(sb->sb_ccc == 0 && sb->sb_mb == 0 && sb->sb_mbcnt == 0,
+	    ("%s: ccc %u mb %p mbcnt %u", __func__,
+	    sb->sb_ccc, (void *)sb->sb_mb, sb->sb_mbcnt));
 }
 
 void
@@ -891,7 +1060,9 @@ sbcut_internal(struct sockbuf *sb, int len)
 		if (m->m_len > len) {
 			m->m_len -= len;
 			m->m_data += len;
-			sb->sb_cc -= len;
+			sb->sb_ccc -= len;
+			if (!(m->m_flags & M_NOTAVAIL))
+				sb->sb_acc -= len;
 			if (sb->sb_sndptroff != 0)
 				sb->sb_sndptroff -= len;
 			if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
@@ -977,8 +1148,8 @@ sbsndptr(struct sockbuf *sb, u_int off, u_int len,
 	struct mbuf *m, *ret;
 
 	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__));
-	KASSERT(off + len <= sb->sb_cc, ("%s: beyond sb", __func__));
-	KASSERT(sb->sb_sndptroff <= sb->sb_cc, ("%s: sndptroff broken", __func__));
+	KASSERT(off + len <= sb->sb_acc, ("%s: beyond sb", __func__));
+	KASSERT(sb->sb_sndptroff <= sb->sb_acc, ("%s: sndptroff broken", __func__));
 
 	/*
 	 * Is off below stored offset? Happens on retransmits.
@@ -1091,7 +1262,7 @@ void
 sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
 {
 
-	xsb->sb_cc = sb->sb_cc;
+	xsb->sb_cc = sb->sb_ccc;
 	xsb->sb_hiwat = sb->sb_hiwat;
 	xsb->sb_mbcnt = sb->sb_mbcnt;
 	xsb->sb_mcnt = sb->sb_mcnt;	
Index: sys/kern/uipc_syscalls.c
===================================================================
--- sys/kern/uipc_syscalls.c	(.../head)	(revision 266804)
+++ sys/kern/uipc_syscalls.c	(.../projects/sendfile)	(revision 266807)
@@ -132,9 +132,10 @@ static int	filt_sfsync(struct knote *kn, long hint
  */
 static SYSCTL_NODE(_kern_ipc, OID_AUTO, sendfile, CTLFLAG_RW, 0,
     "sendfile(2) tunables");
-static int sfreadahead = 1;
+
+static int sfreadahead = 0;
 SYSCTL_INT(_kern_ipc_sendfile, OID_AUTO, readahead, CTLFLAG_RW,
-    &sfreadahead, 0, "Number of sendfile(2) read-ahead MAXBSIZE blocks");
+    &sfreadahead, 0, "Read this more pages than socket buffer can accept");
 
 #ifdef	SFSYNC_DEBUG
 static int sf_sync_debug = 0;
@@ -1988,7 +1989,7 @@ filt_sfsync(struct knote *kn, long hint)
  * Detach mapped page and release resources back to the system.
  */
 int
-sf_buf_mext(struct mbuf *mb, void *addr, void *args)
+sf_mext_free(struct mbuf *mb, void *addr, void *args)
 {
 	vm_page_t m;
 	struct sendfile_sync *sfs;
@@ -2009,13 +2010,42 @@ int
 		sfs = addr;
 		sf_sync_deref(sfs);
 	}
-	/*
-	 * sfs may be invalid at this point, don't use it!
-	 */
 	return (EXT_FREE_OK);
 }
 
 /*
+ * Same as above, but forces the page to be detached from the object
+ * and go into free pool.
+ */
+static int
+sf_mext_free_nocache(struct mbuf *mb, void *addr, void *args)
+{
+	vm_page_t m;
+	struct sendfile_sync *sfs;
+
+	m = sf_buf_page(args);
+	sf_buf_free(args);
+	vm_page_lock(m);
+	vm_page_unwire(m, 0);
+	if (m->wire_count == 0) {
+		vm_object_t obj;
+
+		if ((obj = m->object) == NULL)
+			vm_page_free(m);
+		else if (!vm_page_xbusied(m) && VM_OBJECT_TRYWLOCK(obj)) {
+			vm_page_free(m);
+			VM_OBJECT_WUNLOCK(obj);
+		}
+	}
+	vm_page_unlock(m);
+	if (addr != NULL) {
+		sfs = addr;
+		sf_sync_deref(sfs);
+	}
+	return (EXT_FREE_OK);
+}
+
+/*
  * Called to remove a reference to a sf_sync object.
  *
  * This is generally done during the mbuf free path to signify
@@ -2608,106 +2638,181 @@ freebsd4_sendfile(struct thread *td, struct freebs
 }
 #endif /* COMPAT_FREEBSD4 */
 
+ /*
+  * How much data to put into page i of n.
+  * Only first and last pages are special.
+  */
+static inline off_t
+xfsize(int i, int n, off_t off, off_t len)
+{
+
+	if (i == 0)
+		return (omin(PAGE_SIZE - (off & PAGE_MASK), len));
+
+	if (i == n - 1 && ((off + len) & PAGE_MASK) > 0)
+		return ((off + len) & PAGE_MASK);
+
+	return (PAGE_SIZE);
+}
+
+/*
+ * Offset within object for i page.
+ */
+static inline vm_offset_t
+vmoff(int i, off_t off)
+{
+
+	if (i == 0)
+		return ((vm_offset_t)off);
+
+	return (trunc_page(off + i * PAGE_SIZE));
+}
+
+/*
+ * Pretend as if we don't have enough space, subtract xfsize() of
+ * all pages that failed.
+ */
+static inline void
+fixspace(int old, int new, off_t off, int *space)
+{
+
+	KASSERT(old > new, ("%s: old %d new %d", __func__, old, new));
+
+	/* Subtract last one. */
+	*space -= xfsize(old - 1, old, off, *space);
+	old--;
+
+	if (new == old)
+		/* There was only one page. */
+		return;
+
+	/* Subtract first one. */
+	if (new == 0) {
+		*space -= xfsize(0, old, off, *space);
+		new++;
+	}
+
+	/* Rest of pages are full sized. */
+	*space -= (old - new) * PAGE_SIZE;
+
+	KASSERT(*space >= 0, ("%s: space went backwards", __func__));
+}
+
+struct sf_io {
+	u_int		nios;
+	int		npages;
+	struct file	*sock_fp;
+	struct mbuf	*m;
+	vm_page_t	pa[];
+};
+
+static void
+sf_io_done(void *arg)
+{
+	struct sf_io *sfio = arg;
+	struct socket *so;
+
+	if (!refcount_release(&sfio->nios))
+		return;
+
+	so  = sfio->sock_fp->f_data;
+
+	if (sbready(&so->so_snd, sfio->m, sfio->npages) == 0) {
+		struct mbuf *m;
+
+		m = m_get(M_NOWAIT, MT_DATA);
+		if (m == NULL) {
+			panic("XXXGL");
+		}
+		m->m_len = 0;
+		CURVNET_SET(so->so_vnet);
+		/* XXXGL: curthread */
+		(void )(so->so_proto->pr_usrreqs->pru_send)
+		    (so, 0, m, NULL, NULL, curthread);
+		CURVNET_RESTORE();
+	}
+
+	/* XXXGL: curthread */
+	fdrop(sfio->sock_fp, curthread);
+	free(sfio, M_TEMP);
+}
+
 static int
-sendfile_readpage(vm_object_t obj, struct vnode *vp, int nd,
-    off_t off, int xfsize, int bsize, struct thread *td, vm_page_t *res)
+sendfile_swapin(vm_object_t obj, struct sf_io *sfio, off_t off, off_t len,
+    int npages, int rhpages)
 {
-	vm_page_t m;
-	vm_pindex_t pindex;
-	ssize_t resid;
-	int error, readahead, rv;
+	vm_page_t *pa = sfio->pa;
+	int nios;
 
-	pindex = OFF_TO_IDX(off);
+	nios = 0;
 	VM_OBJECT_WLOCK(obj);
-	m = vm_page_grab(obj, pindex, (vp != NULL ? VM_ALLOC_NOBUSY |
-	    VM_ALLOC_IGN_SBUSY : 0) | VM_ALLOC_WIRED | VM_ALLOC_NORMAL);
+	for (int i = 0; i < npages; i++)
+		pa[i] = vm_page_grab(obj, OFF_TO_IDX(vmoff(i, off)),
+		    VM_ALLOC_WIRED | VM_ALLOC_NORMAL);
 
-	/*
-	 * Check if page is valid for what we need, otherwise initiate I/O.
-	 *
-	 * The non-zero nd argument prevents disk I/O, instead we
-	 * return the caller what he specified in nd.  In particular,
-	 * if we already turned some pages into mbufs, nd == EAGAIN
-	 * and the main function send them the pages before we come
-	 * here again and block.
-	 */
-	if (m->valid != 0 && vm_page_is_valid(m, off & PAGE_MASK, xfsize)) {
-		if (vp == NULL)
-			vm_page_xunbusy(m);
-		VM_OBJECT_WUNLOCK(obj);
-		*res = m;
-		return (0);
-	} else if (nd != 0) {
-		if (vp == NULL)
-			vm_page_xunbusy(m);
-		error = nd;
-		goto free_page;
-	}
+	for (int i = 0; i < npages;) {
+		int j, a, count, rv;
 
-	/*
-	 * Get the page from backing store.
-	 */
-	error = 0;
-	if (vp != NULL) {
-		VM_OBJECT_WUNLOCK(obj);
-		readahead = sfreadahead * MAXBSIZE;
+		if (vm_page_is_valid(pa[i], vmoff(i, off) & PAGE_MASK,
+		    xfsize(i, npages, off, len))) {
+			vm_page_xunbusy(pa[i]);
+			i++;
+			continue;
+		}
 
-		/*
-		 * Use vn_rdwr() instead of the pager interface for
-		 * the vnode, to allow the read-ahead.
-		 *
-		 * XXXMAC: Because we don't have fp->f_cred here, we
-		 * pass in NOCRED.  This is probably wrong, but is
-		 * consistent with our original implementation.
-		 */
-		error = vn_rdwr(UIO_READ, vp, NULL, readahead, trunc_page(off),
-		    UIO_NOCOPY, IO_NODELOCKED | IO_VMIO | ((readahead /
-		    bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td);
-		SFSTAT_INC(sf_iocnt);
-		VM_OBJECT_WLOCK(obj);
-	} else {
-		if (vm_pager_has_page(obj, pindex, NULL, NULL)) {
-			rv = vm_pager_get_pages(obj, &m, 1, 0);
-			SFSTAT_INC(sf_iocnt);
-			m = vm_page_lookup(obj, pindex);
-			if (m == NULL)
-				error = EIO;
-			else if (rv != VM_PAGER_OK) {
-				vm_page_lock(m);
-				vm_page_free(m);
-				vm_page_unlock(m);
-				m = NULL;
-				error = EIO;
+		for (j = i + 1; j < npages; j++)
+			if (vm_page_is_valid(pa[j], vmoff(j, off) & PAGE_MASK,
+			    xfsize(j, npages, off, len)))
+				break;
+
+		while (!vm_pager_has_page(obj, OFF_TO_IDX(vmoff(i, off)),
+		    NULL, &a) && i < j) {
+			pmap_zero_page(pa[i]);
+			pa[i]->valid = VM_PAGE_BITS_ALL;
+			pa[i]->dirty = 0;
+			vm_page_xunbusy(pa[i]);
+			i++;
+		}
+		if (i == j)
+			continue;
+
+		count = min(a + 1, npages + rhpages - i);
+		for (j = npages; j < i + count; j++) {
+			pa[j] = vm_page_grab(obj, OFF_TO_IDX(vmoff(j, off)),
+			    VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT);
+			if (pa[j] == NULL) {
+				count = j - i;
+				break;
 			}
-		} else {
-			pmap_zero_page(m);
-			m->valid = VM_PAGE_BITS_ALL;
-			m->dirty = 0;
+			if (pa[j]->valid) {
+				vm_page_xunbusy(pa[j]);
+				count = j - i;
+				break;
+			}
 		}
-		if (m != NULL)
-			vm_page_xunbusy(m);
+
+		refcount_acquire(&sfio->nios);
+		rv = vm_pager_get_pages_async(obj, pa + i, count, 0,
+		    &sf_io_done, sfio);
+
+		KASSERT(rv == VM_PAGER_OK, ("%s: pager fail obj %p page %p",
+		    __func__, obj, pa[i]));
+
+		SFSTAT_INC(sf_iocnt);
+		nios++;
+
+		for (j = i; j < i + count && j < npages; j++)
+			KASSERT(pa[j] == vm_page_lookup(obj,
+			    OFF_TO_IDX(vmoff(j, off))),
+			    ("pa[j] %p lookup %p\n", pa[j],
+			    vm_page_lookup(obj, OFF_TO_IDX(vmoff(j, off)))));
+
+		i += count;
 	}
-	if (error == 0) {
-		*res = m;
-	} else if (m != NULL) {
-free_page:
-		vm_page_lock(m);
-		vm_page_unwire(m, 0);
 
-		/*
-		 * See if anyone else might know about this page.  If
-		 * not and it is not valid, then free it.
-		 */
-		if (m->wire_count == 0 && m->valid == 0 && !vm_page_busied(m))
-			vm_page_free(m);
-		vm_page_unlock(m);
-	}
-	KASSERT(error != 0 || (m->wire_count > 0 &&
-	    vm_page_is_valid(m, off & PAGE_MASK, xfsize)),
-	    ("wrong page state m %p off %#jx xfsize %d", m, (uintmax_t)off,
-	    xfsize));
 	VM_OBJECT_WUNLOCK(obj);
-	return (error);
+
+	return (nios);
 }
 
 static int
@@ -2814,41 +2919,26 @@ vn_sendfile(struct file *fp, int sockfd, struct ui
 	struct vnode *vp;
 	struct vm_object *obj;
 	struct socket *so;
-	struct mbuf *m;
+	struct mbuf *m, *mh, *mhtail;
 	struct sf_buf *sf;
-	struct vm_page *pg;
 	struct shmfd *shmfd;
 	struct vattr va;
-	off_t off, xfsize, fsbytes, sbytes, rem, obj_size;
-	int error, bsize, nd, hdrlen, mnw;
+	off_t off, sbytes, rem, obj_size;
+	int error, serror, bsize, hdrlen;
 
-	pg = NULL;
 	obj = NULL;
 	so = NULL;
-	m = NULL;
-	fsbytes = sbytes = 0;
-	hdrlen = mnw = 0;
-	rem = nbytes;
-	obj_size = 0;
+	m = mh = NULL;
+	sbytes = 0;
 
 	error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize);
 	if (error != 0)
 		return (error);
-	if (rem == 0)
-		rem = obj_size;
 
 	error = kern_sendfile_getsock(td, sockfd, &sock_fp, &so);
 	if (error != 0)
 		goto out;
 
-	/*
-	 * Do not wait on memory allocations but return ENOMEM for
-	 * caller to retry later.
-	 * XXX: Experimental.
-	 */
-	if (flags & SF_MNOWAIT)
-		mnw = 1;
-
 #ifdef MAC
 	error = mac_socket_check_send(td->td_ucred, so);
 	if (error != 0)
@@ -2856,31 +2946,27 @@ vn_sendfile(struct file *fp, int sockfd, struct ui
 #endif
 
 	/* If headers are specified copy them into mbufs. */
-	if (hdr_uio != NULL) {
+	if (hdr_uio != NULL && hdr_uio->uio_resid > 0) {
 		hdr_uio->uio_td = td;
 		hdr_uio->uio_rw = UIO_WRITE;
-		if (hdr_uio->uio_resid > 0) {
-			/*
-			 * In FBSD < 5.0 the nbytes to send also included
-			 * the header.  If compat is specified subtract the
-			 * header size from nbytes.
-			 */
-			if (kflags & SFK_COMPAT) {
-				if (nbytes > hdr_uio->uio_resid)
-					nbytes -= hdr_uio->uio_resid;
-				else
-					nbytes = 0;
-			}
-			m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
-			    0, 0, 0);
-			if (m == NULL) {
-				error = mnw ? EAGAIN : ENOBUFS;
-				goto out;
-			}
-			hdrlen = m_length(m, NULL);
+		/*
+		 * In FBSD < 5.0 the nbytes to send also included
+		 * the header.  If compat is specified subtract the
+		 * header size from nbytes.
+		 */
+		if (kflags & SFK_COMPAT) {
+			if (nbytes > hdr_uio->uio_resid)
+				nbytes -= hdr_uio->uio_resid;
+			else
+				nbytes = 0;
 		}
-	}
+		mh = m_uiotombuf(hdr_uio, M_WAITOK, 0, 0, 0);
+		hdrlen = m_length(mh, &mhtail);
+	} else
+		hdrlen = 0;
 
+	rem = nbytes ? omin(nbytes, obj_size - offset) : obj_size - offset;
+
 	/*
 	 * Protect against multiple writers to the socket.
 	 *
@@ -2900,21 +2986,13 @@ vn_sendfile(struct file *fp, int sockfd, struct ui
 	 * The outer loop checks the state and available space of the socket
 	 * and takes care of the overall progress.
 	 */
-	for (off = offset; ; ) {
+	for (off = offset; rem > 0; ) {
+		struct sf_io *sfio;
+		vm_page_t *pa;
 		struct mbuf *mtail;
-		int loopbytes;
-		int space;
-		int done;
+		int nios, space, npages, rhpages;
 
-		if ((nbytes != 0 && nbytes == fsbytes) ||
-		    (nbytes == 0 && obj_size == fsbytes))
-			break;
-
 		mtail = NULL;
-		loopbytes = 0;
-		space = 0;
-		done = 0;
-
 		/*
 		 * Check the socket state for ongoing connection,
 		 * no errors and space in socket buffer.
@@ -2990,53 +3068,44 @@ retry_space:
 				VOP_UNLOCK(vp, 0);
 				goto done;
 			}
-			obj_size = va.va_size;
+			if (va.va_size != obj_size) {
+				if (nbytes == 0)
+					rem += va.va_size - obj_size;
+				else if (offset + nbytes > va.va_size)
+					rem -= (offset + nbytes - va.va_size);
+				obj_size = va.va_size;
+			}
 		}
 
+		if (space > rem)
+			space = rem;
+
+		if (off & PAGE_MASK)
+			npages = 1 + howmany(space -
+			    (PAGE_SIZE - (off & PAGE_MASK)), PAGE_SIZE);
+		else
+			npages = howmany(space, PAGE_SIZE);
+
+		rhpages = SF_READAHEAD(flags) ?
+		    SF_READAHEAD(flags) : sfreadahead;
+		rhpages = min(howmany(obj_size - (off & ~PAGE_MASK) -
+		    (npages * PAGE_SIZE), PAGE_SIZE), rhpages);
+
+		sfio = malloc(sizeof(struct sf_io) +
+		    (rhpages + npages) * sizeof(vm_page_t), M_TEMP, M_WAITOK);
+		refcount_init(&sfio->nios, 1);
+
+		nios = sendfile_swapin(obj, sfio, off, space, npages, rhpages);
+
 		/*
 		 * Loop and construct maximum sized mbuf chain to be bulk
 		 * dumped into socket buffer.
 		 */
-		while (space > loopbytes) {
-			vm_offset_t pgoff;
+		pa = sfio->pa;
+		for (int i = 0; i < npages; i++) {
 			struct mbuf *m0;
 
 			/*
-			 * Calculate the amount to transfer.
-			 * Not to exceed a page, the EOF,
-			 * or the passed in nbytes.
-			 */
-			pgoff = (vm_offset_t)(off & PAGE_MASK);
-			rem = obj_size - offset;
-			if (nbytes != 0)
-				rem = omin(rem, nbytes);
-			rem -= fsbytes + loopbytes;
-			xfsize = omin(PAGE_SIZE - pgoff, rem);
-			xfsize = omin(space - loopbytes, xfsize);
-			if (xfsize <= 0) {
-				done = 1;		/* all data sent */
-				break;
-			}
-
-			/*
-			 * Attempt to look up the page.  Allocate
-			 * if not found or wait and loop if busy.
-			 */
-			if (m != NULL)
-				nd = EAGAIN; /* send what we already got */
-			else if ((flags & SF_NODISKIO) != 0)
-				nd = EBUSY;
-			else
-				nd = 0;
-			error = sendfile_readpage(obj, vp, nd, off,
-			    xfsize, bsize, td, &pg);
-			if (error != 0) {
-				if (error == EAGAIN)
-					error = 0;	/* not a real error */
-				break;
-			}
-
-			/*
 			 * Get a sendfile buf.  When allocating the
 			 * first buffer for mbuf chain, we usually
 			 * wait as long as necessary, but this wait
@@ -3045,17 +3114,18 @@ retry_space:
 			 * threads might exhaust the buffers and then
 			 * deadlock.
 			 */
-			sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT :
-			    SFB_CATCH);
+			sf = sf_buf_alloc(pa[i],
+			    m != NULL ? SFB_NOWAIT : SFB_CATCH);
 			if (sf == NULL) {
 				SFSTAT_INC(sf_allocfail);
-				vm_page_lock(pg);
-				vm_page_unwire(pg, 0);
-				KASSERT(pg->object != NULL,
-				    ("%s: object disappeared", __func__));
-				vm_page_unlock(pg);
+				for (int j = i; j < npages; j++) {
+					vm_page_lock(pa[j]);
+					vm_page_unwire(pa[j], 0);
+					vm_page_unlock(pa[j]);
+				}
 				if (m == NULL)
-					error = (mnw ? EAGAIN : EINTR);
+					error = ENOBUFS;
+				fixspace(npages, i, off, &space);
 				break;
 			}
 
@@ -3063,36 +3133,26 @@ retry_space:
 			 * Get an mbuf and set it up as having
 			 * external storage.
 			 */
-			m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
-			if (m0 == NULL) {
-				error = (mnw ? EAGAIN : ENOBUFS);
-				(void)sf_buf_mext(NULL, NULL, sf);
-				break;
-			}
-			if (m_extadd(m0, (caddr_t )sf_buf_kva(sf), PAGE_SIZE,
-			    sf_buf_mext, sfs, sf, M_RDONLY, EXT_SFBUF,
-			    (mnw ? M_NOWAIT : M_WAITOK)) != 0) {
-				error = (mnw ? EAGAIN : ENOBUFS);
-				(void)sf_buf_mext(NULL, NULL, sf);
-				m_freem(m0);
-				break;
-			}
-			m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
-			m0->m_len = xfsize;
+			m0 = m_get(M_WAITOK, MT_DATA);
+			(void )m_extadd(m0, (caddr_t )sf_buf_kva(sf), PAGE_SIZE,
+			    (flags & SF_NOCACHE) ? sf_mext_free_nocache :
+			    sf_mext_free, sfs, sf, M_RDONLY, EXT_SFBUF,
+			    M_WAITOK);
+			m0->m_data = (char *)sf_buf_kva(sf) +
+			    (vmoff(i, off) & PAGE_MASK);
+			m0->m_len = xfsize(i, npages, off, space);
+			m0->m_flags |= M_NOTREADY;
 
+			if (i == 0)
+				sfio->m = m0;
+
 			/* Append to mbuf chain. */
 			if (mtail != NULL)
 				mtail->m_next = m0;
-			else if (m != NULL)
-				m_last(m)->m_next = m0;
 			else
 				m = m0;
 			mtail = m0;
 
-			/* Keep track of bits processed. */
-			loopbytes += xfsize;
-			off += xfsize;
-
 			/*
 			 * XXX eventually this should be a sfsync
 			 * method call!
@@ -3104,47 +3164,51 @@ retry_space:
 		if (vp != NULL)
 			VOP_UNLOCK(vp, 0);
 
+		/* Keep track of bytes processed. */
+		off += space;
+		rem -= space;
+
+		/* Prepend header, if any. */
+		if (hdrlen) {
+			mhtail->m_next = m;
+			m = mh;
+			mh = NULL;
+		}
+
+		if (error) {
+			free(sfio, M_TEMP);
+			goto done;
+		}
+
 		/* Add the buffer chain to the socket buffer. */
-		if (m != NULL) {
-			int mlen, err;
+		KASSERT(m_length(m, NULL) == space + hdrlen,
+		    ("%s: mlen %u space %d hdrlen %d",
+		    __func__, m_length(m, NULL), space, hdrlen));
 
-			mlen = m_length(m, NULL);
-			SOCKBUF_LOCK(&so->so_snd);
-			if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
-				error = EPIPE;
-				SOCKBUF_UNLOCK(&so->so_snd);
-				goto done;
-			}
-			SOCKBUF_UNLOCK(&so->so_snd);
-			CURVNET_SET(so->so_vnet);
-			/* Avoid error aliasing. */
-			err = (*so->so_proto->pr_usrreqs->pru_send)
-				    (so, 0, m, NULL, NULL, td);
-			CURVNET_RESTORE();
-			if (err == 0) {
-				/*
-				 * We need two counters to get the
-				 * file offset and nbytes to send
-				 * right:
-				 * - sbytes contains the total amount
-				 *   of bytes sent, including headers.
-				 * - fsbytes contains the total amount
-				 *   of bytes sent from the file.
-				 */
-				sbytes += mlen;
-				fsbytes += mlen;
-				if (hdrlen) {
-					fsbytes -= hdrlen;
-					hdrlen = 0;
-				}
-			} else if (error == 0)
-				error = err;
-			m = NULL;	/* pru_send always consumes */
+		CURVNET_SET(so->so_vnet);
+		if (nios == 0) {
+			free(sfio, M_TEMP);
+			serror = (*so->so_proto->pr_usrreqs->pru_send)
+			    (so, 0, m, NULL, NULL, td);
+		} else {
+			sfio->sock_fp = sock_fp;
+			sfio->npages = npages;
+			fhold(sock_fp);
+			serror = (*so->so_proto->pr_usrreqs->pru_send)
+			    (so, PRUS_NOTREADY, m, NULL, NULL, td);
+			sf_io_done(sfio);
 		}
+		CURVNET_RESTORE();
 
-		/* Quit outer loop on error or when we're done. */
-		if (done)
-			break;
+		if (serror == 0) {
+			sbytes += space + hdrlen;
+			if (hdrlen)
+				hdrlen = 0;
+		} else if (error == 0)
+			error = serror;
+		m = NULL;	/* pru_send always consumes */
+
+		/* Quit outer loop on error. */
 		if (error != 0)
 			goto done;
 	}
@@ -3179,6 +3243,8 @@ out:
 		fdrop(sock_fp, td);
 	if (m)
 		m_freem(m);
+	if (mh)
+		m_freem(mh);
 
 	if (error == ERESTART)
 		error = EINTR;
Index: sys/netgraph/bluetooth/socket/ng_btsocket_l2cap.c
===================================================================
--- sys/netgraph/bluetooth/socket/ng_btsocket_l2cap.c	(.../head)	(revision 266804)
+++ sys/netgraph/bluetooth/socket/ng_btsocket_l2cap.c	(.../projects/sendfile)	(revision 266807)
@@ -1127,9 +1127,8 @@ ng_btsocket_l2cap_process_l2ca_write_rsp(struct ng
 	/*
  	 * Check if we have more data to send
  	 */
-
 	sbdroprecord(&pcb->so->so_snd);
-	if (pcb->so->so_snd.sb_cc > 0) {
+	if (sbavail(&pcb->so->so_snd) > 0) {
 		if (ng_btsocket_l2cap_send2(pcb) == 0)
 			ng_btsocket_l2cap_timeout(pcb);
 		else
@@ -2510,7 +2509,7 @@ ng_btsocket_l2cap_send2(ng_btsocket_l2cap_pcb_p pc
 	
 	mtx_assert(&pcb->pcb_mtx, MA_OWNED);
 
-	if (pcb->so->so_snd.sb_cc == 0)
+	if (sbavail(&pcb->so->so_snd) == 0)
 		return (EINVAL); /* XXX */
 
 	m = m_dup(pcb->so->so_snd.sb_mb, M_NOWAIT);
Index: sys/netgraph/bluetooth/socket/ng_btsocket_rfcomm.c
===================================================================
--- sys/netgraph/bluetooth/socket/ng_btsocket_rfcomm.c	(.../head)	(revision 266804)
+++ sys/netgraph/bluetooth/socket/ng_btsocket_rfcomm.c	(.../projects/sendfile)	(revision 266807)
@@ -3274,7 +3274,7 @@ ng_btsocket_rfcomm_pcb_send(ng_btsocket_rfcomm_pcb
 	}
 
 	for (error = 0, sent = 0; sent < limit; sent ++) { 
-		length = min(pcb->mtu, pcb->so->so_snd.sb_cc);
+		length = min(pcb->mtu, sbavail(&pcb->so->so_snd));
 		if (length == 0)
 			break;
 
Index: sys/netgraph/bluetooth/socket/ng_btsocket_sco.c
===================================================================
--- sys/netgraph/bluetooth/socket/ng_btsocket_sco.c	(.../head)	(revision 266804)
+++ sys/netgraph/bluetooth/socket/ng_btsocket_sco.c	(.../projects/sendfile)	(revision 266807)
@@ -906,7 +906,7 @@ ng_btsocket_sco_default_msg_input(struct ng_mesg *
 				sbdroprecord(&pcb->so->so_snd);
 
 			/* Send more if we have any */
-			if (pcb->so->so_snd.sb_cc > 0)
+			if (sbavail(&pcb->so->so_snd) > 0)
 				if (ng_btsocket_sco_send2(pcb) == 0)
 					ng_btsocket_sco_timeout(pcb);
 
@@ -1744,7 +1744,7 @@ ng_btsocket_sco_send2(ng_btsocket_sco_pcb_p pcb)
 	mtx_assert(&pcb->pcb_mtx, MA_OWNED);
 
 	while (pcb->rt->pending < pcb->rt->num_pkts &&
-	       pcb->so->so_snd.sb_cc > 0) {
+	       sbavail(&pcb->so->so_snd) > 0) {
 		/* Get a copy of the first packet on send queue */
 		m = m_dup(pcb->so->so_snd.sb_mb, M_NOWAIT);
 		if (m == NULL) {
Index: sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c
===================================================================
--- sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c	(.../head)	(revision 266804)
+++ sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c	(.../projects/sendfile)	(revision 266807)
@@ -746,7 +746,7 @@ sdp_start_disconnect(struct sdp_sock *ssk)
 		    ("sdp_start_disconnect: sdp_drop() returned NULL"));
 	} else {
 		soisdisconnecting(so);
-		unread = so->so_rcv.sb_cc;
+		unread = sbused(&so->so_rcv);
 		sbflush(&so->so_rcv);
 		sdp_usrclosed(ssk);
 		if (!(ssk->flags & SDP_DROPPED)) {
@@ -888,7 +888,7 @@ sdp_append(struct sdp_sock *ssk, struct sockbuf *s
 		m_adj(mb, SDP_HEAD_SIZE);
 		n->m_pkthdr.len += mb->m_pkthdr.len;
 		n->m_flags |= mb->m_flags & (M_PUSH | M_URG);
-		m_demote(mb, 1);
+		m_demote(mb, 1, 0);
 		sbcompress(sb, mb, sb->sb_mbtail);
 		return;
 	}
@@ -1258,7 +1258,7 @@ sdp_sorecv(struct socket *so, struct sockaddr **ps
 	/* We will never ever get anything unless we are connected. */
 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
 		/* When disconnecting there may be still some data left. */
-		if (sb->sb_cc > 0)
+		if (sbavail(sb))
 			goto deliver;
 		if (!(so->so_state & SS_ISDISCONNECTED))
 			error = ENOTCONN;
@@ -1266,7 +1266,7 @@ sdp_sorecv(struct socket *so, struct sockaddr **ps
 	}
 
 	/* Socket buffer is empty and we shall not block. */
-	if (sb->sb_cc == 0 &&
+	if (sbavail(sb) == 0 &&
 	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
 		error = EAGAIN;
 		goto out;
@@ -1277,7 +1277,7 @@ restart:
 
 	/* Abort if socket has reported problems. */
 	if (so->so_error) {
-		if (sb->sb_cc > 0)
+		if (sbavail(sb))
 			goto deliver;
 		if (oresid > uio->uio_resid)
 			goto out;
@@ -1289,7 +1289,7 @@ restart:
 
 	/* Door is closed.  Deliver what is left, if any. */
 	if (sb->sb_state & SBS_CANTRCVMORE) {
-		if (sb->sb_cc > 0)
+		if (sbavail(sb))
 			goto deliver;
 		else
 			goto out;
@@ -1296,18 +1296,18 @@ restart:
 	}
 
 	/* Socket buffer got some data that we shall deliver now. */
-	if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
+	if (sbavail(sb) && !(flags & MSG_WAITALL) &&
 	    ((so->so_state & SS_NBIO) ||
 	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
-	     sb->sb_cc >= sb->sb_lowat ||
-	     sb->sb_cc >= uio->uio_resid ||
-	     sb->sb_cc >= sb->sb_hiwat) ) {
+	     sbavail(sb) >= sb->sb_lowat ||
+	     sbavail(sb) >= uio->uio_resid ||
+	     sbavail(sb) >= sb->sb_hiwat) ) {
 		goto deliver;
 	}
 
 	/* On MSG_WAITALL we must wait until all data or error arrives. */
 	if ((flags & MSG_WAITALL) &&
-	    (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_lowat))
+	    (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_lowat))
 		goto deliver;
 
 	/*
@@ -1321,7 +1321,7 @@ restart:
 
 deliver:
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
-	KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
+	KASSERT(sbavail(sb), ("%s: sockbuf empty", __func__));
 	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
 
 	/* Statistics. */
@@ -1329,7 +1329,7 @@ deliver:
 		uio->uio_td->td_ru.ru_msgrcv++;
 
 	/* Fill uio until full or current end of socket buffer is reached. */
-	len = min(uio->uio_resid, sb->sb_cc);
+	len = min(uio->uio_resid, sbavail(sb));
 	if (mp0 != NULL) {
 		/* Dequeue as many mbufs as possible. */
 		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
@@ -1509,7 +1509,7 @@ sdp_urg(struct sdp_sock *ssk, struct mbuf *mb)
 	if (so == NULL)
 		return;
 
-	so->so_oobmark = so->so_rcv.sb_cc + mb->m_pkthdr.len - 1;
+	so->so_oobmark = sbused(&so->so_rcv) + mb->m_pkthdr.len - 1;
 	sohasoutofband(so);
 	ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB);
 	if (!(so->so_options & SO_OOBINLINE)) {
Index: sys/ofed/drivers/infiniband/ulp/sdp/sdp_rx.c
===================================================================
--- sys/ofed/drivers/infiniband/ulp/sdp/sdp_rx.c	(.../head)	(revision 266804)
+++ sys/ofed/drivers/infiniband/ulp/sdp/sdp_rx.c	(.../projects/sendfile)	(revision 266807)
@@ -183,7 +183,7 @@ sdp_post_recvs_needed(struct sdp_sock *ssk)
 	 * Compute bytes in the receive queue and socket buffer.
 	 */
 	bytes_in_process = (posted - SDP_MIN_TX_CREDITS) * buffer_size;
-	bytes_in_process += ssk->socket->so_rcv.sb_cc;
+	bytes_in_process += sbused(&ssk->socket->so_rcv);
 
 	return bytes_in_process < max_bytes;
 }
Index: sys/sys/socket.h
===================================================================
--- sys/sys/socket.h	(.../head)	(revision 266804)
+++ sys/sys/socket.h	(.../projects/sendfile)	(revision 266807)
@@ -602,12 +602,15 @@ struct sf_hdtr_all {
  * Sendfile-specific flag(s)
  */
 #define	SF_NODISKIO     0x00000001
-#define	SF_MNOWAIT	0x00000002
+#define	SF_MNOWAIT	0x00000002	/* unused since 11.0 */
 #define	SF_SYNC		0x00000004
 #define	SF_KQUEUE	0x00000008
+#define	SF_NOCACHE	0x00000010
+#define	SF_FLAGS(rh, flags)	(((rh) << 16) | (flags))
 
 #ifdef _KERNEL
 #define	SFK_COMPAT	0x00000001
+#define	SF_READAHEAD(flags)	((flags) >> 16)
 #endif /* _KERNEL */
 #endif /* __BSD_VISIBLE */
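
  A note on the flags above, with a minimal userland sketch (illustrative
only, not part of the patch): SF_FLAGS() packs a read-ahead hint into the
upper 16 bits of the sendfile(2) flags argument, and the kernel recovers
it with SF_READAHEAD(). The value 16 below is arbitrary, and the units of
the hint are up to the implementation.

  #include <sys/types.h>
  #include <sys/socket.h>
  #include <sys/uio.h>

  /* Send a whole file uncached, hinting 16 units of read-ahead. */
  static int
  send_file_nocache(int fd, int sock, off_t len)
  {
  	off_t sbytes = 0;

  	return (sendfile(fd, sock, 0, len, NULL, &sbytes,
  	    SF_FLAGS(16, SF_NOCACHE)));
  }
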
 
Index: sys/sys/sockbuf.h
===================================================================
--- sys/sys/sockbuf.h	(.../head)	(revision 266804)
+++ sys/sys/sockbuf.h	(.../projects/sendfile)	(revision 266807)
@@ -89,8 +89,13 @@ struct	sockbuf {
 	struct	mbuf *sb_lastrecord;	/* (c/d) first mbuf of last
 					 * record in socket buffer */
 	struct	mbuf *sb_sndptr; /* (c/d) pointer into mbuf chain */
+	struct	mbuf *sb_fnrdy;	/* (c/d) pointer to first not ready buffer */
+#if 0
+	struct	mbuf *sb_lnrdy;	/* (c/d) pointer to last not ready buffer */
+#endif
 	u_int	sb_sndptroff;	/* (c/d) byte offset of ptr into chain */
-	u_int	sb_cc;		/* (c/d) actual chars in buffer */
+	u_int	sb_acc;		/* (c/d) available chars in buffer */
+	u_int	sb_ccc;		/* (c/d) claimed chars in buffer */
 	u_int	sb_hiwat;	/* (c/d) max actual char count */
 	u_int	sb_mbcnt;	/* (c/d) chars of mbufs used */
 	u_int   sb_mcnt;        /* (c/d) number of mbufs in buffer */
@@ -120,10 +125,17 @@ struct	sockbuf {
 #define	SOCKBUF_LOCK_ASSERT(_sb)	mtx_assert(SOCKBUF_MTX(_sb), MA_OWNED)
 #define	SOCKBUF_UNLOCK_ASSERT(_sb)	mtx_assert(SOCKBUF_MTX(_sb), MA_NOTOWNED)
 
+/*
+ * Socket buffer private mbuf(9) flags.
+ */
+#define	M_NOTREADY	M_PROTO1	/* m_data not populated yet */
+#define	M_BLOCKED	M_PROTO2	/* M_NOTREADY in front of m */
+#define	M_NOTAVAIL	(M_NOTREADY | M_BLOCKED)
+
 void	sbappend(struct sockbuf *sb, struct mbuf *m);
 void	sbappend_locked(struct sockbuf *sb, struct mbuf *m);
-void	sbappendstream(struct sockbuf *sb, struct mbuf *m);
-void	sbappendstream_locked(struct sockbuf *sb, struct mbuf *m);
+void	sbappendstream(struct sockbuf *sb, struct mbuf *m, int flags);
+void	sbappendstream_locked(struct sockbuf *sb, struct mbuf *m, int flags);
 int	sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa,
 	    struct mbuf *m0, struct mbuf *control);
 int	sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa,
@@ -136,7 +148,6 @@ int	sbappendcontrol_locked(struct sockbuf *sb, str
 	    struct mbuf *control);
 void	sbappendrecord(struct sockbuf *sb, struct mbuf *m0);
 void	sbappendrecord_locked(struct sockbuf *sb, struct mbuf *m0);
-void	sbcheck(struct sockbuf *sb);
 void	sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n);
 struct mbuf *
 	sbcreatecontrol(caddr_t p, int size, int type, int level);
@@ -162,59 +173,54 @@ void	sbtoxsockbuf(struct sockbuf *sb, struct xsock
 int	sbwait(struct sockbuf *sb);
 int	sblock(struct sockbuf *sb, int flags);
 void	sbunlock(struct sockbuf *sb);
+void	sballoc(struct sockbuf *, struct mbuf *);
+void	sbfree(struct sockbuf *, struct mbuf *);
+void	sbmtrim(struct sockbuf *, struct mbuf *, int);
+int	sbready(struct sockbuf *, struct mbuf *, int);
 
+static inline u_int
+sbavail(struct sockbuf *sb)
+{
+
+#if 0
+	SOCKBUF_LOCK_ASSERT(sb);
+#endif
+	return (sb->sb_acc);
+}
+
+static inline u_int
+sbused(struct sockbuf *sb)
+{
+
+#if 0
+	SOCKBUF_LOCK_ASSERT(sb);
+#endif
+	return (sb->sb_ccc);
+}
+
 /*
  * How much space is there in a socket buffer (so->so_snd or so->so_rcv)?
  * This is problematical if the fields are unsigned, as the space might
- * still be negative (cc > hiwat or mbcnt > mbmax).  Should detect
- * overflow and return 0.  Should use "lmin" but it doesn't exist now.
+ * still be negative (ccc > hiwat or mbcnt > mbmax).
  */
-static __inline
-long
+static inline long
 sbspace(struct sockbuf *sb)
 {
-	long bleft;
-	long mleft;
+	long bleft, mleft;
 
+#if 0
+	SOCKBUF_LOCK_ASSERT(sb);
+#endif
+
 	if (sb->sb_flags & SB_STOP)
 		return(0);
-	bleft = sb->sb_hiwat - sb->sb_cc;
+
+	bleft = sb->sb_hiwat - sb->sb_ccc;
 	mleft = sb->sb_mbmax - sb->sb_mbcnt;
-	return((bleft < mleft) ? bleft : mleft);
-}
 
-/* adjust counters in sb reflecting allocation of m */
-#define	sballoc(sb, m) { \
-	(sb)->sb_cc += (m)->m_len; \
-	if ((m)->m_type != MT_DATA && (m)->m_type != MT_OOBDATA) \
-		(sb)->sb_ctl += (m)->m_len; \
-	(sb)->sb_mbcnt += MSIZE; \
-	(sb)->sb_mcnt += 1; \
-	if ((m)->m_flags & M_EXT) { \
-		(sb)->sb_mbcnt += (m)->m_ext.ext_size; \
-		(sb)->sb_ccnt += 1; \
-	} \
+	return ((bleft < mleft) ? bleft : mleft);
 }
 
-/* adjust counters in sb reflecting freeing of m */
-#define	sbfree(sb, m) { \
-	(sb)->sb_cc -= (m)->m_len; \
-	if ((m)->m_type != MT_DATA && (m)->m_type != MT_OOBDATA) \
-		(sb)->sb_ctl -= (m)->m_len; \
-	(sb)->sb_mbcnt -= MSIZE; \
-	(sb)->sb_mcnt -= 1; \
-	if ((m)->m_flags & M_EXT) { \
-		(sb)->sb_mbcnt -= (m)->m_ext.ext_size; \
-		(sb)->sb_ccnt -= 1; \
-	} \
-	if ((sb)->sb_sndptr == (m)) { \
-		(sb)->sb_sndptr = NULL; \
-		(sb)->sb_sndptroff = 0; \
-	} \
-	if ((sb)->sb_sndptroff != 0) \
-		(sb)->sb_sndptroff -= (m)->m_len; \
-}
-
 #define SB_EMPTY_FIXUP(sb) do {						\
 	if ((sb)->sb_mb == NULL) {					\
 		(sb)->sb_mbtail = NULL;					\
@@ -224,13 +230,15 @@ sbspace(struct sockbuf *sb)
 
 #ifdef SOCKBUF_DEBUG
 void	sblastrecordchk(struct sockbuf *, const char *, int);
+void	sblastmbufchk(struct sockbuf *, const char *, int);
+void	sbcheck(struct sockbuf *, const char *, int);
 #define	SBLASTRECORDCHK(sb)	sblastrecordchk((sb), __FILE__, __LINE__)
-
-void	sblastmbufchk(struct sockbuf *, const char *, int);
 #define	SBLASTMBUFCHK(sb)	sblastmbufchk((sb), __FILE__, __LINE__)
+#define	SBCHECK(sb)		sbcheck((sb), __FILE__, __LINE__)
 #else
-#define	SBLASTRECORDCHK(sb)      /* nothing */
-#define	SBLASTMBUFCHK(sb)        /* nothing */
+#define	SBLASTRECORDCHK(sb)	do {} while (0)
+#define	SBLASTMBUFCHK(sb)	do {} while (0)
+#define	SBCHECK(sb)		do {} while (0)
 #endif /* SOCKBUF_DEBUG */
 
 #endif /* _KERNEL */
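
  To illustrate the new accessors above (a sketch assuming kernel
context, not part of the patch): code that consumes socket buffer data
looks at sbavail(), which reports only ready bytes (sb_acc), while space
and resource accounting goes through sbused()/sbspace(), which count all
claimed bytes (sb_ccc), including mbufs still marked M_NOTREADY.

  /* Illustrative helper; not in the patch. */
  static int
  example_can_transmit(struct socket *so)
  {
  	int ready;

  	SOCKBUF_LOCK(&so->so_snd);
  	/*
  	 * sbavail(): bytes ready to be sent (sb_acc).
  	 * sbused():  bytes occupying the buffer (sb_ccc), including
  	 *            data that is still being filled in.
  	 */
  	ready = (sbavail(&so->so_snd) > 0);
  	SOCKBUF_UNLOCK(&so->so_snd);
  	return (ready);
  }
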
Index: sys/sys/protosw.h
===================================================================
--- sys/sys/protosw.h	(.../head)	(revision 266804)
+++ sys/sys/protosw.h	(.../projects/sendfile)	(revision 266807)
@@ -209,6 +209,7 @@ struct pr_usrreqs {
 #define	PRUS_OOB	0x1
 #define	PRUS_EOF	0x2
 #define	PRUS_MORETOCOME	0x4
+#define	PRUS_NOTREADY	0x8
 	int	(*pru_sense)(struct socket *so, struct stat *sb);
 	int	(*pru_shutdown)(struct socket *so);
 	int	(*pru_flush)(struct socket *so, int direction);
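
  PRUS_NOTREADY is how the new sendfile(2) hands data to the protocol
before the backing I/O has completed. A condensed sketch of the calling
sequence follows (based on the new sendfile send loop earlier in this
patch; it is not part of the patch itself, in reality the two steps run
at different times, and the locking around sbready() is my assumption,
not something the patch prescribes):

  /* Illustrative helper; not in the patch. */
  static int
  example_send_notready(struct socket *so, struct mbuf *m, int count,
      struct thread *td)
  {
  	int error;

  	/* 1. Queue mbufs whose data is not populated yet (M_NOTREADY). */
  	error = (*so->so_proto->pr_usrreqs->pru_send)(so, PRUS_NOTREADY,
  	    m, NULL, NULL, td);
  	if (error != 0)
  		return (error);

  	/*
  	 * 2. Later, from the I/O completion path, mark the first
  	 *    "count" mbufs ready; the protocol may then transmit them,
  	 *    e.g. by calling its output routine.
  	 */
  	SOCKBUF_LOCK(&so->so_snd);
  	error = sbready(&so->so_snd, m, count);
  	SOCKBUF_UNLOCK(&so->so_snd);
  	return (error);
  }
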
Index: sys/sys/sf_buf.h
===================================================================
--- sys/sys/sf_buf.h	(.../head)	(revision 266804)
+++ sys/sys/sf_buf.h	(.../projects/sendfile)	(revision 266807)
@@ -52,7 +52,7 @@ struct sfstat {				/* sendfile statistics */
 #include <machine/sf_buf.h>
 #include <sys/systm.h>
 #include <sys/counter.h>
-struct mbuf;	/* for sf_buf_mext() */
+struct mbuf;	/* for sf_mext_free() */
 
 extern counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];
 #define	SFSTAT_ADD(name, val)	\
@@ -61,6 +61,6 @@ extern counter_u64_t sfstat[sizeof(struct sfstat)
 #define	SFSTAT_INC(name)	SFSTAT_ADD(name, 1)
 #endif /* _KERNEL */
 
-int	sf_buf_mext(struct mbuf *mb, void *addr, void *args);
+int	sf_mext_free(struct mbuf *mb, void *addr, void *args);
 
 #endif /* !_SYS_SF_BUF_H_ */
Index: sys/sys/vnode.h
===================================================================
--- sys/sys/vnode.h	(.../head)	(revision 266804)
+++ sys/sys/vnode.h	(.../projects/sendfile)	(revision 266807)
@@ -719,6 +719,7 @@ int	vop_stdbmap(struct vop_bmap_args *);
 int	vop_stdfsync(struct vop_fsync_args *);
 int	vop_stdgetwritemount(struct vop_getwritemount_args *);
 int	vop_stdgetpages(struct vop_getpages_args *);
+int	vop_stdgetpages_async(struct vop_getpages_async_args *);
 int	vop_stdinactive(struct vop_inactive_args *);
 int	vop_stdislocked(struct vop_islocked_args *);
 int	vop_stdkqfilter(struct vop_kqfilter_args *);
Index: sys/sys/socketvar.h
===================================================================
--- sys/sys/socketvar.h	(.../head)	(revision 266804)
+++ sys/sys/socketvar.h	(.../projects/sendfile)	(revision 266807)
@@ -205,7 +205,7 @@ struct xsocket {
 
 /* can we read something from so? */
 #define	soreadabledata(so) \
-    ((so)->so_rcv.sb_cc >= (so)->so_rcv.sb_lowat || \
+    (sbavail(&(so)->so_rcv) >= (so)->so_rcv.sb_lowat || \
 	!TAILQ_EMPTY(&(so)->so_comp) || (so)->so_error)
 #define	soreadable(so) \
 	(soreadabledata(so) || ((so)->so_rcv.sb_state & SBS_CANTRCVMORE))
Index: sys/sys/mbuf.h
===================================================================
--- sys/sys/mbuf.h	(.../head)	(revision 266804)
+++ sys/sys/mbuf.h	(.../projects/sendfile)	(revision 266807)
@@ -922,7 +922,7 @@ struct mbuf	*m_copypacket(struct mbuf *, int);
 void		 m_copy_pkthdr(struct mbuf *, struct mbuf *);
 struct mbuf	*m_copyup(struct mbuf *, int, int);
 struct mbuf	*m_defrag(struct mbuf *, int);
-void		 m_demote(struct mbuf *, int);
+void		 m_demote(struct mbuf *, int, int);
 struct mbuf	*m_devget(char *, int, int, struct ifnet *,
 		    void (*)(char *, caddr_t, u_int));
 struct mbuf	*m_dup(struct mbuf *, int);
Index: sys/vm/vnode_pager.h
===================================================================
--- sys/vm/vnode_pager.h	(.../head)	(revision 266804)
+++ sys/vm/vnode_pager.h	(.../projects/sendfile)	(revision 266807)
@@ -41,7 +41,7 @@
 #ifdef _KERNEL
 
 int vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m,
-					  int count, int reqpage);
+    int count, int reqpage, void (*iodone)(void *), void *arg);
 int vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *m,
 					  int count, boolean_t sync,
 					  int *rtvals);
Index: sys/vm/vm_pager.h
===================================================================
--- sys/vm/vm_pager.h	(.../head)	(revision 266804)
+++ sys/vm/vm_pager.h	(.../projects/sendfile)	(revision 266807)
@@ -51,18 +51,21 @@ typedef vm_object_t pgo_alloc_t(void *, vm_ooffset
     struct ucred *);
 typedef void pgo_dealloc_t(vm_object_t);
 typedef int pgo_getpages_t(vm_object_t, vm_page_t *, int, int);
+typedef int pgo_getpages_async_t(vm_object_t, vm_page_t *, int, int,
+    void(*)(void *), void *);
 typedef void pgo_putpages_t(vm_object_t, vm_page_t *, int, int, int *);
 typedef boolean_t pgo_haspage_t(vm_object_t, vm_pindex_t, int *, int *);
 typedef void pgo_pageunswapped_t(vm_page_t);
 
 struct pagerops {
-	pgo_init_t	*pgo_init;		/* Initialize pager. */
-	pgo_alloc_t	*pgo_alloc;		/* Allocate pager. */
-	pgo_dealloc_t	*pgo_dealloc;		/* Disassociate. */
-	pgo_getpages_t	*pgo_getpages;		/* Get (read) page. */
-	pgo_putpages_t	*pgo_putpages;		/* Put (write) page. */
-	pgo_haspage_t	*pgo_haspage;		/* Does pager have page? */
-	pgo_pageunswapped_t *pgo_pageunswapped;
+	pgo_init_t		*pgo_init;		/* Initialize pager. */
+	pgo_alloc_t		*pgo_alloc;		/* Allocate pager. */
+	pgo_dealloc_t		*pgo_dealloc;		/* Disassociate. */
+	pgo_getpages_t		*pgo_getpages;		/* Get (read) page. */
+	pgo_getpages_async_t	*pgo_getpages_async;	/* Get page asynchronously. */
+	pgo_putpages_t		*pgo_putpages;		/* Put (write) page. */
+	pgo_haspage_t		*pgo_haspage;		/* Query page. */
+	pgo_pageunswapped_t	*pgo_pageunswapped;
 };
 
 extern struct pagerops defaultpagerops;
@@ -103,6 +106,8 @@ vm_object_t vm_pager_allocate(objtype_t, void *, v
 void vm_pager_bufferinit(void);
 void vm_pager_deallocate(vm_object_t);
 static __inline int vm_pager_get_pages(vm_object_t, vm_page_t *, int, int);
+static __inline int vm_pager_get_pages_async(vm_object_t, vm_page_t *, int,
+    int, void(*)(void *), void *);
 static __inline boolean_t vm_pager_has_page(vm_object_t, vm_pindex_t, int *, int *);
 void vm_pager_init(void);
 vm_object_t vm_pager_object_lookup(struct pagerlst *, void *);
@@ -131,6 +136,27 @@ vm_pager_get_pages(
 	return (r);
 }
 
+static __inline int
+vm_pager_get_pages_async(vm_object_t object, vm_page_t *m, int count,
+    int reqpage, void (*iodone)(void *), void *arg)
+{
+	int r;
+
+	VM_OBJECT_ASSERT_WLOCKED(object);
+
+	if (*pagertab[object->type]->pgo_getpages_async == NULL) {
+		/* Emulate async operation. */
+		r = vm_pager_get_pages(object, m, count, reqpage);
+		VM_OBJECT_WUNLOCK(object);
+		(iodone)(arg);
+		VM_OBJECT_WLOCK(object);
+	} else
+		r = (*pagertab[object->type]->pgo_getpages_async)(object, m,
+		    count, reqpage, iodone, arg);
+
+	return (r);
+}
+
 static __inline void
 vm_pager_put_pages(
 	vm_object_t object,
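
  A calling sketch for the wrapper above (the names example_iodone() and
example_start_read() are made up, not part of the patch). Note that when
a pager provides no pgo_getpages_async method, the wrapper emulates the
operation synchronously and the callback fires before the call returns,
so callers must not assume it runs from a separate context.

  static void
  example_iodone(void *arg)
  {
  	/*
  	 * Runs once the read has completed, possibly before
  	 * vm_pager_get_pages_async() itself returns when the pager
  	 * only implements synchronous getpages.
  	 */
  }

  static int
  example_start_read(vm_object_t object, vm_page_t *ma, int count,
      int reqpage, void *arg)
  {
  	int rv;

  	VM_OBJECT_WLOCK(object);
  	rv = vm_pager_get_pages_async(object, ma, count, reqpage,
  	    example_iodone, arg);
  	VM_OBJECT_WUNLOCK(object);
  	return (rv);
  }
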
Index: sys/vm/vm_page.c
===================================================================
--- sys/vm/vm_page.c	(.../head)	(revision 266804)
+++ sys/vm/vm_page.c	(.../projects/sendfile)	(revision 266807)
@@ -2689,6 +2689,8 @@ retrylookup:
 		sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ?
 		    vm_page_xbusied(m) : vm_page_busied(m);
 		if (sleep) {
+			if (allocflags & VM_ALLOC_NOWAIT)
+				return (NULL);
 			/*
 			 * Reference the page before unlocking and
 			 * sleeping so that the page daemon is less
@@ -2716,6 +2718,8 @@ retrylookup:
 	}
 	m = vm_page_alloc(object, pindex, allocflags & ~VM_ALLOC_IGN_SBUSY);
 	if (m == NULL) {
+		if (allocflags & VM_ALLOC_NOWAIT)
+			return (NULL);
 		VM_OBJECT_WUNLOCK(object);
 		VM_WAIT;
 		VM_OBJECT_WLOCK(object);
Index: sys/vm/vm_page.h
===================================================================
--- sys/vm/vm_page.h	(.../head)	(revision 266804)
+++ sys/vm/vm_page.h	(.../projects/sendfile)	(revision 266807)
@@ -390,6 +390,7 @@ vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa);
 #define	VM_ALLOC_IGN_SBUSY	0x1000	/* vm_page_grab() only */
 #define	VM_ALLOC_NODUMP		0x2000	/* don't include in dump */
 #define	VM_ALLOC_SBUSY		0x4000	/* Shared busy the page */
+#define	VM_ALLOC_NOWAIT		0x8000	/* Return NULL instead of sleeping */
 
 #define	VM_ALLOC_COUNT_SHIFT	16
 #define	VM_ALLOC_COUNT(count)	((count) << VM_ALLOC_COUNT_SHIFT)
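
  VM_ALLOC_NOWAIT turns vm_page_grab() from sleeping into failing: with
the flag set, a busy page or an allocation failure returns NULL instead
of blocking the caller (see the vm_page.c hunk above). A sketch of the
intended pattern (kernel context, object write lock held; the helper name
is made up and not part of the patch):

  static vm_page_t
  example_grab_nowait(vm_object_t object, vm_pindex_t pindex)
  {
  	vm_page_t m;

  	VM_OBJECT_ASSERT_WLOCKED(object);
  	m = vm_page_grab(object, pindex,
  	    VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT);
  	/* NULL means the page is busy or memory is short, not a hard error. */
  	return (m);
  }
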
Index: sys/vm/vnode_pager.c
===================================================================
--- sys/vm/vnode_pager.c	(.../head)	(revision 266804)
+++ sys/vm/vnode_pager.c	(.../projects/sendfile)	(revision 266807)
@@ -83,6 +83,8 @@ static int vnode_pager_input_smlfs(vm_object_t obj
 static int vnode_pager_input_old(vm_object_t object, vm_page_t m);
 static void vnode_pager_dealloc(vm_object_t);
 static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int);
+static int vnode_pager_getpages_async(vm_object_t, vm_page_t *, int, int,
+    void(*)(void  *), void *);
 static void vnode_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *);
 static boolean_t vnode_pager_haspage(vm_object_t, vm_pindex_t, int *, int *);
 static vm_object_t vnode_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
@@ -92,6 +94,7 @@ struct pagerops vnodepagerops = {
 	.pgo_alloc =	vnode_pager_alloc,
 	.pgo_dealloc =	vnode_pager_dealloc,
 	.pgo_getpages =	vnode_pager_getpages,
+	.pgo_getpages_async = vnode_pager_getpages_async,
 	.pgo_putpages =	vnode_pager_putpages,
 	.pgo_haspage =	vnode_pager_haspage,
 };
@@ -664,6 +667,40 @@ vnode_pager_getpages(vm_object_t object, vm_page_t
 	return rtval;
 }
 
+static int
+vnode_pager_getpages_async(vm_object_t object, vm_page_t *m, int count,
+    int reqpage, void (*iodone)(void *), void *arg)
+{
+	int rtval;
+	struct vnode *vp;
+	int bytes = count * PAGE_SIZE;
+
+	vp = object->handle;
+	VM_OBJECT_WUNLOCK(object);
+	rtval = VOP_GETPAGES_ASYNC(vp, m, bytes, reqpage, 0, iodone, arg);
+	KASSERT(rtval != EOPNOTSUPP,
+	    ("vnode_pager: FS getpages_async not implemented\n"));
+	VM_OBJECT_WLOCK(object);
+	return rtval;
+}
+
+struct getpages_softc {
+	vm_page_t *m;
+	struct buf *bp;
+	vm_object_t object;
+	vm_offset_t kva;
+	off_t foff;
+	int size;
+	int count;
+	int unmapped;
+	int reqpage;
+	void (*iodone)(void *);
+	void *arg;
+};
+
+int	vnode_pager_generic_getpages_done(struct getpages_softc *);
+void	vnode_pager_generic_getpages_done_async(struct buf *);
+
 /*
  * This is now called from local media FS's to operate against their
  * own vnodes if they fail to implement VOP_GETPAGES.
@@ -670,11 +707,11 @@ vnode_pager_getpages(vm_object_t object, vm_page_t
  */
 int
 vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int bytecount,
-    int reqpage)
+    int reqpage, void (*iodone)(void *), void *arg)
 {
 	vm_object_t object;
 	vm_offset_t kva;
-	off_t foff, tfoff, nextoff;
+	off_t foff;
 	int i, j, size, bsize, first;
 	daddr_t firstaddr, reqblock;
 	struct bufobj *bo;
@@ -684,6 +721,7 @@ vnode_pager_generic_getpages(struct vnode *vp, vm_
 	struct mount *mp;
 	int count;
 	int error;
+	int unmapped;
 
 	object = vp->v_object;
 	count = bytecount / PAGE_SIZE;
@@ -891,8 +929,8 @@ vnode_pager_generic_getpages(struct vnode *vp, vm_
 	 * requires mapped buffers.
 	 */
 	mp = vp->v_mount;
-	if (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0 &&
-	    unmapped_buf_allowed) {
+	unmapped = (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS));
+	if (unmapped && unmapped_buf_allowed) {
 		bp->b_data = unmapped_buf;
 		bp->b_kvabase = unmapped_buf;
 		bp->b_offset = 0;
@@ -905,7 +943,6 @@ vnode_pager_generic_getpages(struct vnode *vp, vm_
 
 	/* build a minimal buffer header */
 	bp->b_iocmd = BIO_READ;
-	bp->b_iodone = bdone;
 	KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
 	KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
 	bp->b_rcred = crhold(curthread->td_ucred);
@@ -923,10 +960,88 @@ vnode_pager_generic_getpages(struct vnode *vp, vm_
 
 	/* do the input */
 	bp->b_iooffset = dbtob(bp->b_blkno);
-	bstrategy(bp);
 
-	bwait(bp, PVM, "vnread");
+	if (iodone) { /* async */
+		struct getpages_softc *sc;
 
+		sc = malloc(sizeof(*sc), M_TEMP, M_WAITOK);
+
+		sc->m = m;
+		sc->bp = bp;
+		sc->object = object;
+		sc->foff = foff;
+		sc->size = size;
+		sc->count = count;
+		sc->unmapped = unmapped;
+		sc->reqpage = reqpage;
+		sc->kva = kva;
+
+		sc->iodone = iodone;
+		sc->arg = arg;
+
+		bp->b_iodone = vnode_pager_generic_getpages_done_async;
+		bp->b_caller1 = sc;
+		BUF_KERNPROC(bp);
+		bstrategy(bp);
+		/* Good bye! */
+	} else {
+		struct getpages_softc sc;
+
+		sc.m = m;
+		sc.bp = bp;
+		sc.object = object;
+		sc.foff = foff;
+		sc.size = size;
+		sc.count = count;
+		sc.unmapped = unmapped;
+		sc.reqpage = reqpage;
+		sc.kva = kva;
+
+		bp->b_iodone = bdone;
+		bstrategy(bp);
+		bwait(bp, PVM, "vnread");
+		error = vnode_pager_generic_getpages_done(&sc);
+	}
+
+	return (error ? VM_PAGER_ERROR : VM_PAGER_OK);
+}
+
+void
+vnode_pager_generic_getpages_done_async(struct buf *bp)
+{
+	struct getpages_softc *sc = bp->b_caller1;
+	int error;
+
+	error = vnode_pager_generic_getpages_done(sc);
+
+	vm_page_xunbusy(sc->m[sc->reqpage]);
+
+	sc->iodone(sc->arg);
+
+	free(sc, M_TEMP);
+}
+
+int
+vnode_pager_generic_getpages_done(struct getpages_softc *sc)
+{
+	vm_object_t object;
+	vm_offset_t kva;
+	vm_page_t *m;
+	struct buf *bp;
+	off_t foff, tfoff, nextoff;
+	int i, size, count, unmapped, reqpage;
+	int error = 0;
+
+	m = sc->m;
+	bp = sc->bp;
+	object = sc->object;
+	foff = sc->foff;
+	size = sc->size;
+	count = sc->count;
+	unmapped = sc->unmapped;
+	reqpage = sc->reqpage;
+	kva = sc->kva;
+
 	if ((bp->b_ioflags & BIO_ERROR) != 0)
 		error = EIO;
 
@@ -939,7 +1054,7 @@ vnode_pager_generic_getpages(struct vnode *vp, vm_
 	}
 	if ((bp->b_flags & B_UNMAPPED) == 0)
 		pmap_qremove(kva, count);
-	if (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0) {
+	if (unmapped) {
 		bp->b_data = (caddr_t)kva;
 		bp->b_kvabase = (caddr_t)kva;
 		bp->b_flags &= ~B_UNMAPPED;
@@ -995,7 +1110,8 @@ vnode_pager_generic_getpages(struct vnode *vp, vm_
 	if (error) {
 		printf("vnode_pager_getpages: I/O read error\n");
 	}
-	return (error ? VM_PAGER_ERROR : VM_PAGER_OK);
+
+	return (error);
 }
 
 /*
Index: sys/rpc/clnt_vc.c
===================================================================
--- sys/rpc/clnt_vc.c	(.../head)	(revision 266804)
+++ sys/rpc/clnt_vc.c	(.../projects/sendfile)	(revision 266807)
@@ -860,7 +860,7 @@ clnt_vc_soupcall(struct socket *so, void *arg, int
 			 * error condition
 			 */
 			do_read = FALSE;
-			if (so->so_rcv.sb_cc >= sizeof(uint32_t)
+			if (sbavail(&so->so_rcv) >= sizeof(uint32_t)
 			    || (so->so_rcv.sb_state & SBS_CANTRCVMORE)
 			    || so->so_error)
 				do_read = TRUE;
@@ -913,7 +913,7 @@ clnt_vc_soupcall(struct socket *so, void *arg, int
 			 * buffered.
 			 */
 			do_read = FALSE;
-			if (so->so_rcv.sb_cc >= ct->ct_record_resid
+			if (sbavail(&so->so_rcv) >= ct->ct_record_resid
 			    || (so->so_rcv.sb_state & SBS_CANTRCVMORE)
 			    || so->so_error)
 				do_read = TRUE;
Index: sys/rpc/svc_vc.c
===================================================================
--- sys/rpc/svc_vc.c	(.../head)	(revision 266804)
+++ sys/rpc/svc_vc.c	(.../projects/sendfile)	(revision 266807)
@@ -546,7 +546,7 @@ svc_vc_ack(SVCXPRT *xprt, uint32_t *ack)
 {
 
 	*ack = atomic_load_acq_32(&xprt->xp_snt_cnt);
-	*ack -= xprt->xp_socket->so_snd.sb_cc;
+	*ack -= sbused(&xprt->xp_socket->so_snd);
 	return (TRUE);
 }
 
Index: sys/ufs/ffs/ffs_vnops.c
===================================================================
--- sys/ufs/ffs/ffs_vnops.c	(.../head)	(revision 266804)
+++ sys/ufs/ffs/ffs_vnops.c	(.../projects/sendfile)	(revision 266807)
@@ -105,6 +105,7 @@ extern int	ffs_rawread(struct vnode *vp, struct ui
 static vop_fsync_t	ffs_fsync;
 static vop_lock1_t	ffs_lock;
 static vop_getpages_t	ffs_getpages;
+static vop_getpages_async_t ffs_getpages_async;
 static vop_read_t	ffs_read;
 static vop_write_t	ffs_write;
 static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
@@ -125,6 +126,7 @@ struct vop_vector ffs_vnodeops1 = {
 	.vop_default =		&ufs_vnodeops,
 	.vop_fsync =		ffs_fsync,
 	.vop_getpages =		ffs_getpages,
+	.vop_getpages_async =	ffs_getpages_async,
 	.vop_lock1 =		ffs_lock,
 	.vop_read =		ffs_read,
 	.vop_reallocblks =	ffs_reallocblks,
@@ -847,18 +849,16 @@ ffs_write(ap)
 }
 
 /*
- * get page routine
+ * Get page routines.
  */
 static int
-ffs_getpages(ap)
-	struct vop_getpages_args *ap;
+ffs_getpages_checkvalid(vm_page_t *m, int count, int reqpage)
 {
-	int i;
 	vm_page_t mreq;
 	int pcount;
 
-	pcount = round_page(ap->a_count) / PAGE_SIZE;
-	mreq = ap->a_m[ap->a_reqpage];
+	pcount = round_page(count) / PAGE_SIZE;
+	mreq = m[reqpage];
 
 	/*
 	 * if ANY DEV_BSIZE blocks are valid on a large filesystem block,
@@ -870,24 +870,48 @@ static int
 	if (mreq->valid) {
 		if (mreq->valid != VM_PAGE_BITS_ALL)
 			vm_page_zero_invalid(mreq, TRUE);
-		for (i = 0; i < pcount; i++) {
-			if (i != ap->a_reqpage) {
-				vm_page_lock(ap->a_m[i]);
-				vm_page_free(ap->a_m[i]);
-				vm_page_unlock(ap->a_m[i]);
+		for (int i = 0; i < pcount; i++) {
+			if (i != reqpage) {
+				vm_page_lock(m[i]);
+				vm_page_free(m[i]);
+				vm_page_unlock(m[i]);
 			}
 		}
 		VM_OBJECT_WUNLOCK(mreq->object);
-		return VM_PAGER_OK;
+		return (VM_PAGER_OK);
 	}
 	VM_OBJECT_WUNLOCK(mreq->object);
 
-	return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
-					    ap->a_count,
-					    ap->a_reqpage);
+	return (-1);
 }
 
+static int
+ffs_getpages(struct vop_getpages_args *ap)
+{
+	int rv;
 
+	rv = ffs_getpages_checkvalid(ap->a_m, ap->a_count, ap->a_reqpage);
+	if (rv == VM_PAGER_OK)
+		return (rv);
+
+	return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count,
+	    ap->a_reqpage, NULL, NULL));
+}
+
+static int
+ffs_getpages_async(struct vop_getpages_async_args *ap)
+{
+	int rv;
+
+	rv = ffs_getpages_checkvalid(ap->a_m, ap->a_count, ap->a_reqpage);
+	if (rv == VM_PAGER_OK) {
+		(ap->a_vop_getpages_iodone)(ap->a_arg);
+		return (rv);
+	}
+	return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count,
+	    ap->a_reqpage, ap->a_vop_getpages_iodone, ap->a_arg));
+}
+
 /*
  * Extended attribute area reading.
  */
Index: sys/tools/vnode_if.awk
===================================================================
--- sys/tools/vnode_if.awk	(.../head)	(revision 266804)
+++ sys/tools/vnode_if.awk	(.../projects/sendfile)	(revision 266807)
@@ -254,16 +254,26 @@ while ((getline < srcfile) > 0) {
 		if (sub(/;$/, "") < 1)
 			die("Missing end-of-line ; in \"%s\".", $0);
 
-		# pick off variable name
-		if ((argp = match($0, /[A-Za-z0-9_]+$/)) < 1)
-			die("Missing var name \"a_foo\" in \"%s\".", $0);
-		args[numargs] = substr($0, argp);
-		$0 = substr($0, 1, argp - 1);
-
-		# what is left must be type
-		# remove trailing space (if any)
-		sub(/ $/, "");
-		types[numargs] = $0;
+		# pick off argument name
+		if ((argp = match($0, /[A-Za-z0-9_]+$/)) > 0) {
+			args[numargs] = substr($0, argp);
+			$0 = substr($0, 1, argp - 1);
+			sub(/ $/, "");
+			delete fargs[numargs];
+			types[numargs] = $0;
+		} else {	# try to parse a function pointer argument
+			if ((argp = match($0,
+			    /\(\*[A-Za-z0-9_]+\)\([A-Za-z0-9_*, ]+\)$/)) < 1)
+				die("Missing var name \"a_foo\" in \"%s\".",
+				    $0);
+			args[numargs] = substr($0, argp + 2);
+			sub(/\).+/, "", args[numargs]);
+			fargs[numargs] = substr($0, argp);
+			sub(/^\([^)]+\)/, "", fargs[numargs]);
+			$0 = substr($0, 1, argp - 1);
+			sub(/ $/, "");
+			types[numargs] = $0;
+		}
 	}
 	if (numargs > 4)
 		ctrargs = 4;
@@ -286,8 +296,13 @@ while ((getline < srcfile) > 0) {
 	if (hfile) {
 		# Print out the vop_F_args structure.
 		printh("struct "name"_args {\n\tstruct vop_generic_args a_gen;");
-		for (i = 0; i < numargs; ++i)
-			printh("\t" t_spc(types[i]) "a_" args[i] ";");
+		for (i = 0; i < numargs; ++i) {
+			if (fargs[i]) {
+				printh("\t" t_spc(types[i]) "(*a_" args[i] \
+				    ")" fargs[i] ";");
+			} else
+				printh("\t" t_spc(types[i]) "a_" args[i] ";");
+		}
 		printh("};");
 		printh("");
 
@@ -301,8 +316,14 @@ while ((getline < srcfile) > 0) {
 		printh("");
 		printh("static __inline int " uname "(");
 		for (i = 0; i < numargs; ++i) {
-			printh("\t" t_spc(types[i]) args[i] \
-			    (i < numargs - 1 ? "," : ")"));
+			if (fargs[i]) {
+				printh("\t" t_spc(types[i]) "(*" args[i] \
+				    ")" fargs[i] \
+				    (i < numargs - 1 ? "," : ")"));
+			} else {
+				printh("\t" t_spc(types[i]) args[i] \
+				    (i < numargs - 1 ? "," : ")"));
+			}
 		}
 		printh("{");
 		printh("\tstruct " name "_args a;");
Index: sys/netinet/tcp_reass.c
===================================================================
--- sys/netinet/tcp_reass.c	(.../head)	(revision 266804)
+++ sys/netinet/tcp_reass.c	(.../projects/sendfile)	(revision 266807)
@@ -248,7 +248,7 @@ present:
 			m_freem(mq);
 		else {
 			mq->m_nextpkt = NULL;
-			sbappendstream_locked(&so->so_rcv, mq);
+			sbappendstream_locked(&so->so_rcv, mq, 0);
 			wakeup = 1;
 		}
 	}
Index: sys/netinet/accf_http.c
===================================================================
--- sys/netinet/accf_http.c	(.../head)	(revision 266804)
+++ sys/netinet/accf_http.c	(.../projects/sendfile)	(revision 266807)
@@ -92,7 +92,7 @@ sbfull(struct sockbuf *sb)
 	    "mbcnt(%ld) >= mbmax(%ld): %d",
 	    sb->sb_cc, sb->sb_hiwat, sb->sb_cc >= sb->sb_hiwat,
 	    sb->sb_mbcnt, sb->sb_mbmax, sb->sb_mbcnt >= sb->sb_mbmax);
-	return (sb->sb_cc >= sb->sb_hiwat || sb->sb_mbcnt >= sb->sb_mbmax);
+	return (sbused(sb) >= sb->sb_hiwat || sb->sb_mbcnt >= sb->sb_mbmax);
 }
 
 /*
@@ -162,13 +162,14 @@ static int
 sohashttpget(struct socket *so, void *arg, int waitflag)
 {
 
-	if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0 && !sbfull(&so->so_rcv)) {
+	if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0 &&
+	    !sbfull(&so->so_rcv)) {
 		struct mbuf *m;
 		char *cmp;
 		int	cmplen, cc;
 
 		m = so->so_rcv.sb_mb;
-		cc = so->so_rcv.sb_cc - 1;
+		cc = sbavail(&so->so_rcv) - 1;
 		if (cc < 1)
 			return (SU_OK);
 		switch (*mtod(m, char *)) {
@@ -215,7 +216,7 @@ soparsehttpvers(struct socket *so, void *arg, int
 		goto fallout;
 
 	m = so->so_rcv.sb_mb;
-	cc = so->so_rcv.sb_cc;
+	cc = sbavail(&so->so_rcv);
 	inspaces = spaces = 0;
 	for (m = so->so_rcv.sb_mb; m; m = n) {
 		n = m->m_nextpkt;
@@ -304,7 +305,7 @@ soishttpconnected(struct socket *so, void *arg, in
 	 * have NCHRS left
 	 */
 	copied = 0;
-	ccleft = so->so_rcv.sb_cc;
+	ccleft = sbavail(&so->so_rcv);
 	if (ccleft < NCHRS)
 		goto readmore;
 	a = b = c = '\0';
Index: sys/netinet/sctp_os_bsd.h
===================================================================
--- sys/netinet/sctp_os_bsd.h	(.../head)	(revision 266804)
+++ sys/netinet/sctp_os_bsd.h	(.../projects/sendfile)	(revision 266807)
@@ -405,7 +405,7 @@ typedef struct callout sctp_os_timer_t;
 #define SCTP_SOWAKEUP(so)	wakeup(&(so)->so_timeo)
 /* clear the socket buffer state */
 #define SCTP_SB_CLEAR(sb)	\
-	(sb).sb_cc = 0;		\
+	(sb).sb_ccc = 0;		\
 	(sb).sb_mb = NULL;	\
 	(sb).sb_mbcnt = 0;
 
Index: sys/netinet/tcp_output.c
===================================================================
--- sys/netinet/tcp_output.c	(.../head)	(revision 266804)
+++ sys/netinet/tcp_output.c	(.../projects/sendfile)	(revision 266807)
@@ -322,7 +322,7 @@ after_sack_rexmit:
 			 * to send then the probe will be the FIN
 			 * itself.
 			 */
-			if (off < so->so_snd.sb_cc)
+			if (off < sbavail(&so->so_snd))
 				flags &= ~TH_FIN;
 			sendwin = 1;
 		} else {
@@ -348,7 +348,8 @@ after_sack_rexmit:
 	 */
 	if (sack_rxmit == 0) {
 		if (sack_bytes_rxmt == 0)
-			len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off);
+			len = ((long)ulmin(sbavail(&so->so_snd), sendwin) -
+			    off);
 		else {
 			long cwin;
 
@@ -357,8 +358,8 @@ after_sack_rexmit:
 			 * sending new data, having retransmitted all the
 			 * data possible in the scoreboard.
 			 */
-			len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd) 
-			       - off);
+			len = ((long)ulmin(sbavail(&so->so_snd), tp->snd_wnd) -
+			    off);
 			/*
 			 * Don't remove this (len > 0) check !
 			 * We explicitly check for len > 0 here (although it 
@@ -457,12 +458,15 @@ after_sack_rexmit:
 	 * TODO: Shrink send buffer during idle periods together
 	 * with congestion window.  Requires another timer.  Has to
 	 * wait for upcoming tcp timer rewrite.
+	 *
+	 * XXXGL: should sbused() or sbavail() be used here?
 	 */
 	if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
 		if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
-		    so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) &&
-		    so->so_snd.sb_cc < V_tcp_autosndbuf_max &&
-		    sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
+		    sbused(&so->so_snd) >= (so->so_snd.sb_hiwat / 8 * 7) &&
+		    sbused(&so->so_snd) < V_tcp_autosndbuf_max &&
+		    sendwin >= (sbused(&so->so_snd) -
+		    (tp->snd_nxt - tp->snd_una))) {
 			if (!sbreserve_locked(&so->so_snd,
 			    min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc,
 			     V_tcp_autosndbuf_max), so, curthread))
@@ -499,10 +503,11 @@ after_sack_rexmit:
 		tso = 1;
 
 	if (sack_rxmit) {
-		if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
+		if (SEQ_LT(p->rxmit + len, tp->snd_una + sbavail(&so->so_snd)))
 			flags &= ~TH_FIN;
 	} else {
-		if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
+		if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +
+		    sbavail(&so->so_snd)))
 			flags &= ~TH_FIN;
 	}
 
@@ -532,7 +537,7 @@ after_sack_rexmit:
 		 */
 		if (!(tp->t_flags & TF_MORETOCOME) &&	/* normal case */
 		    (idle || (tp->t_flags & TF_NODELAY)) &&
-		    len + off >= so->so_snd.sb_cc &&
+		    len + off >= sbavail(&so->so_snd) &&
 		    (tp->t_flags & TF_NOPUSH) == 0) {
 			goto send;
 		}
@@ -660,7 +665,7 @@ dontupdate:
 	 * if window is nonzero, transmit what we can,
 	 * otherwise force out a byte.
 	 */
-	if (so->so_snd.sb_cc && !tcp_timer_active(tp, TT_REXMT) &&
+	if (sbavail(&so->so_snd) && !tcp_timer_active(tp, TT_REXMT) &&
 	    !tcp_timer_active(tp, TT_PERSIST)) {
 		tp->t_rxtshift = 0;
 		tcp_setpersist(tp);
@@ -786,7 +791,7 @@ send:
 			 * fractional unless the send sockbuf can
 			 * be emptied.
 			 */
-			if (sendalot && off + len < so->so_snd.sb_cc) {
+			if (sendalot && off + len < sbavail(&so->so_snd)) {
 				len -= len % (tp->t_maxopd - optlen);
 				sendalot = 1;
 			}
@@ -889,7 +894,7 @@ send:
 		 * give data to the user when a buffer fills or
 		 * a PUSH comes in.)
 		 */
-		if (off + len == so->so_snd.sb_cc)
+		if (off + len == sbavail(&so->so_snd))
 			flags |= TH_PUSH;
 		SOCKBUF_UNLOCK(&so->so_snd);
 	} else {
Index: sys/netinet/siftr.c
===================================================================
--- sys/netinet/siftr.c	(.../head)	(revision 266804)
+++ sys/netinet/siftr.c	(.../projects/sendfile)	(revision 266807)
@@ -781,9 +781,9 @@ siftr_siftdata(struct pkt_node *pn, struct inpcb *
 	pn->flags = tp->t_flags;
 	pn->rxt_length = tp->t_rxtcur;
 	pn->snd_buf_hiwater = inp->inp_socket->so_snd.sb_hiwat;
-	pn->snd_buf_cc = inp->inp_socket->so_snd.sb_cc;
+	pn->snd_buf_cc = sbused(&inp->inp_socket->so_snd);
 	pn->rcv_buf_hiwater = inp->inp_socket->so_rcv.sb_hiwat;
-	pn->rcv_buf_cc = inp->inp_socket->so_rcv.sb_cc;
+	pn->rcv_buf_cc = sbused(&inp->inp_socket->so_rcv);
 	pn->sent_inflight_bytes = tp->snd_max - tp->snd_una;
 	pn->t_segqlen = tp->t_segqlen;
 
Index: sys/netinet/sctp_indata.c
===================================================================
--- sys/netinet/sctp_indata.c	(.../head)	(revision 266804)
+++ sys/netinet/sctp_indata.c	(.../projects/sendfile)	(revision 266807)
@@ -70,7 +70,7 @@ sctp_calc_rwnd(struct sctp_tcb *stcb, struct sctp_
 
 	/*
 	 * This is really set wrong with respect to a 1-2-m socket. Since
-	 * the sb_cc is the count that everyone as put up. When we re-write
+	 * the sb_ccc is the count that everyone has put up. When we re-write
 	 * sctp_soreceive then we will fix this so that ONLY this
 	 * associations data is taken into account.
 	 */
@@ -77,7 +77,7 @@ sctp_calc_rwnd(struct sctp_tcb *stcb, struct sctp_
 	if (stcb->sctp_socket == NULL)
 		return (calc);
 
-	if (stcb->asoc.sb_cc == 0 &&
+	if (stcb->asoc.sb_ccc == 0 &&
 	    asoc->size_on_reasm_queue == 0 &&
 	    asoc->size_on_all_streams == 0) {
 		/* Full rwnd granted */
@@ -1358,7 +1358,7 @@ sctp_process_a_data_chunk(struct sctp_tcb *stcb, s
 		 * When we have NO room in the rwnd we check to make sure
 		 * the reader is doing its job...
 		 */
-		if (stcb->sctp_socket->so_rcv.sb_cc) {
+		if (stcb->sctp_socket->so_rcv.sb_ccc) {
 			/* some to read, wake-up */
 #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
 			struct socket *so;
Index: sys/netinet/sctp_pcb.c
===================================================================
--- sys/netinet/sctp_pcb.c	(.../head)	(revision 266804)
+++ sys/netinet/sctp_pcb.c	(.../projects/sendfile)	(revision 266807)
@@ -3328,7 +3328,7 @@ sctp_inpcb_free(struct sctp_inpcb *inp, int immedi
 			if ((asoc->asoc.size_on_reasm_queue > 0) ||
 			    (asoc->asoc.control_pdapi) ||
 			    (asoc->asoc.size_on_all_streams > 0) ||
-			    (so && (so->so_rcv.sb_cc > 0))) {
+			    (so && (so->so_rcv.sb_ccc > 0))) {
 				/* Left with Data unread */
 				struct mbuf *op_err;
 
@@ -3556,7 +3556,7 @@ sctp_inpcb_free(struct sctp_inpcb *inp, int immedi
 		TAILQ_REMOVE(&inp->read_queue, sq, next);
 		sctp_free_remote_addr(sq->whoFrom);
 		if (so)
-			so->so_rcv.sb_cc -= sq->length;
+			so->so_rcv.sb_ccc -= sq->length;
 		if (sq->data) {
 			sctp_m_freem(sq->data);
 			sq->data = NULL;
@@ -4775,7 +4775,7 @@ sctp_free_assoc(struct sctp_inpcb *inp, struct sct
 			inp->sctp_flags |= SCTP_PCB_FLAGS_WAS_CONNECTED;
 			if (so) {
 				SOCK_LOCK(so);
-				if (so->so_rcv.sb_cc == 0) {
+				if (so->so_rcv.sb_ccc == 0) {
 					so->so_state &= ~(SS_ISCONNECTING |
 					    SS_ISDISCONNECTING |
 					    SS_ISCONFIRMING |
Index: sys/netinet/sctp_pcb.h
===================================================================
--- sys/netinet/sctp_pcb.h	(.../head)	(revision 266804)
+++ sys/netinet/sctp_pcb.h	(.../projects/sendfile)	(revision 266807)
@@ -369,7 +369,7 @@ struct sctp_inpcb {
 	}     ip_inp;
 
 
-	/* Socket buffer lock protects read_queue and of course sb_cc */
+	/* Socket buffer lock protects read_queue and of course sb_ccc */
 	struct sctp_readhead read_queue;
 
 	              LIST_ENTRY(sctp_inpcb) sctp_list;	/* lists all endpoints */
Index: sys/netinet/sctp_usrreq.c
===================================================================
--- sys/netinet/sctp_usrreq.c	(.../head)	(revision 266804)
+++ sys/netinet/sctp_usrreq.c	(.../projects/sendfile)	(revision 266807)
@@ -586,7 +586,7 @@ sctp_must_try_again:
 	if (((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) &&
 	    (atomic_cmpset_int(&inp->sctp_flags, flags, (flags | SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_CLOSE_IP)))) {
 		if (((so->so_options & SO_LINGER) && (so->so_linger == 0)) ||
-		    (so->so_rcv.sb_cc > 0)) {
+		    (so->so_rcv.sb_ccc > 0)) {
 #ifdef SCTP_LOG_CLOSING
 			sctp_log_closing(inp, NULL, 13);
 #endif
@@ -751,7 +751,7 @@ sctp_disconnect(struct socket *so)
 			}
 			if (((so->so_options & SO_LINGER) &&
 			    (so->so_linger == 0)) ||
-			    (so->so_rcv.sb_cc > 0)) {
+			    (so->so_rcv.sb_ccc > 0)) {
 				if (SCTP_GET_STATE(asoc) !=
 				    SCTP_STATE_COOKIE_WAIT) {
 					/* Left with Data unread */
@@ -916,7 +916,7 @@ sctp_flush(struct socket *so, int how)
 		inp->sctp_flags |= SCTP_PCB_FLAGS_SOCKET_CANT_READ;
 		SCTP_INP_READ_UNLOCK(inp);
 		SCTP_INP_WUNLOCK(inp);
-		so->so_rcv.sb_cc = 0;
+		so->so_rcv.sb_ccc = 0;
 		so->so_rcv.sb_mbcnt = 0;
 		so->so_rcv.sb_mb = NULL;
 	}
@@ -925,7 +925,7 @@ sctp_flush(struct socket *so, int how)
 		 * First make sure the sb will be happy, we don't use these
 		 * except maybe the count
 		 */
-		so->so_snd.sb_cc = 0;
+		so->so_snd.sb_ccc = 0;
 		so->so_snd.sb_mbcnt = 0;
 		so->so_snd.sb_mb = NULL;
 
Index: sys/netinet/sctp_structs.h
===================================================================
--- sys/netinet/sctp_structs.h	(.../head)	(revision 266804)
+++ sys/netinet/sctp_structs.h	(.../projects/sendfile)	(revision 266807)
@@ -982,7 +982,7 @@ struct sctp_association {
 
 	uint32_t total_output_queue_size;
 
-	uint32_t sb_cc;		/* shadow of sb_cc */
+	uint32_t sb_ccc;		/* shadow of sb_ccc */
 	uint32_t sb_send_resv;	/* amount reserved on a send */
 	uint32_t my_rwnd_control_len;	/* shadow of sb_mbcnt used for rwnd
 					 * control */
Index: sys/netinet/tcp_input.c
===================================================================
--- sys/netinet/tcp_input.c	(.../head)	(revision 266804)
+++ sys/netinet/tcp_input.c	(.../projects/sendfile)	(revision 266807)
@@ -1729,7 +1729,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th,
 					tcp_timer_activate(tp, TT_REXMT,
 						      tp->t_rxtcur);
 				sowwakeup(so);
-				if (so->so_snd.sb_cc)
+				if (sbavail(&so->so_snd))
 					(void) tcp_output(tp);
 				goto check_delack;
 			}
@@ -1837,7 +1837,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th,
 					    newsize, so, NULL))
 						so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
 				m_adj(m, drop_hdrlen);	/* delayed header drop */
-				sbappendstream_locked(&so->so_rcv, m);
+				sbappendstream_locked(&so->so_rcv, m, 0);
 			}
 			/* NB: sorwakeup_locked() does an implicit unlock. */
 			sorwakeup_locked(so);
@@ -2541,7 +2541,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th,
 					 * Otherwise we would send pure ACKs.
 					 */
 					SOCKBUF_LOCK(&so->so_snd);
-					avail = so->so_snd.sb_cc -
+					avail = sbavail(&so->so_snd) -
 					    (tp->snd_nxt - tp->snd_una);
 					SOCKBUF_UNLOCK(&so->so_snd);
 					if (avail > 0)
@@ -2676,10 +2676,10 @@ process_ACK:
 		cc_ack_received(tp, th, CC_ACK);
 
 		SOCKBUF_LOCK(&so->so_snd);
-		if (acked > so->so_snd.sb_cc) {
-			tp->snd_wnd -= so->so_snd.sb_cc;
+		if (acked > sbavail(&so->so_snd)) {
+			tp->snd_wnd -= sbavail(&so->so_snd);
 			mfree = sbcut_locked(&so->so_snd,
-			    (int)so->so_snd.sb_cc);
+			    (int)sbavail(&so->so_snd));
 			ourfinisacked = 1;
 		} else {
 			mfree = sbcut_locked(&so->so_snd, acked);
@@ -2805,7 +2805,7 @@ step6:
 		 * actually wanting to send this much urgent data.
 		 */
 		SOCKBUF_LOCK(&so->so_rcv);
-		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
+		if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
 			th->th_urp = 0;			/* XXX */
 			thflags &= ~TH_URG;		/* XXX */
 			SOCKBUF_UNLOCK(&so->so_rcv);	/* XXX */
@@ -2827,7 +2827,7 @@ step6:
 		 */
 		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
 			tp->rcv_up = th->th_seq + th->th_urp;
-			so->so_oobmark = so->so_rcv.sb_cc +
+			so->so_oobmark = sbavail(&so->so_rcv) +
 			    (tp->rcv_up - tp->rcv_nxt) - 1;
 			if (so->so_oobmark == 0)
 				so->so_rcv.sb_state |= SBS_RCVATMARK;
@@ -2897,7 +2897,7 @@ dodata:							/* XXX */
 			if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
 				m_freem(m);
 			else
-				sbappendstream_locked(&so->so_rcv, m);
+				sbappendstream_locked(&so->so_rcv, m, 0);
 			/* NB: sorwakeup_locked() does an implicit unlock. */
 			sorwakeup_locked(so);
 		} else {
Index: sys/netinet/sctp_input.c
===================================================================
--- sys/netinet/sctp_input.c	(.../head)	(revision 266804)
+++ sys/netinet/sctp_input.c	(.../projects/sendfile)	(revision 266807)
@@ -1042,7 +1042,7 @@ sctp_handle_shutdown_ack(struct sctp_shutdown_ack_
 	if (stcb->sctp_socket) {
 		if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 		    (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
-			stcb->sctp_socket->so_snd.sb_cc = 0;
+			stcb->sctp_socket->so_snd.sb_ccc = 0;
 		}
 		sctp_ulp_notify(SCTP_NOTIFY_ASSOC_DOWN, stcb, 0, NULL, SCTP_SO_NOT_LOCKED);
 	}
Index: sys/netinet/sctp_var.h
===================================================================
--- sys/netinet/sctp_var.h	(.../head)	(revision 266804)
+++ sys/netinet/sctp_var.h	(.../projects/sendfile)	(revision 266807)
@@ -82,9 +82,9 @@ extern struct pr_usrreqs sctp_usrreqs;
 
 #define sctp_maxspace(sb) (max((sb)->sb_hiwat,SCTP_MINIMAL_RWND))
 
-#define	sctp_sbspace(asoc, sb) ((long) ((sctp_maxspace(sb) > (asoc)->sb_cc) ? (sctp_maxspace(sb) - (asoc)->sb_cc) : 0))
+#define	sctp_sbspace(asoc, sb) ((long) ((sctp_maxspace(sb) > (asoc)->sb_ccc) ? (sctp_maxspace(sb) - (asoc)->sb_ccc) : 0))
 
-#define	sctp_sbspace_failedmsgs(sb) ((long) ((sctp_maxspace(sb) > (sb)->sb_cc) ? (sctp_maxspace(sb) - (sb)->sb_cc) : 0))
+#define	sctp_sbspace_failedmsgs(sb) ((long) ((sctp_maxspace(sb) > (sb)->sb_ccc) ? (sctp_maxspace(sb) - (sb)->sb_ccc) : 0))
 
 #define sctp_sbspace_sub(a,b) ((a > b) ? (a - b) : 0)
 
@@ -195,10 +195,10 @@ extern struct pr_usrreqs sctp_usrreqs;
 }
 
 #define sctp_sbfree(ctl, stcb, sb, m) { \
-	SCTP_SAVE_ATOMIC_DECREMENT(&(sb)->sb_cc, SCTP_BUF_LEN((m))); \
+	SCTP_SAVE_ATOMIC_DECREMENT(&(sb)->sb_ccc, SCTP_BUF_LEN((m))); \
 	SCTP_SAVE_ATOMIC_DECREMENT(&(sb)->sb_mbcnt, MSIZE); \
 	if (((ctl)->do_not_ref_stcb == 0) && stcb) {\
-		SCTP_SAVE_ATOMIC_DECREMENT(&(stcb)->asoc.sb_cc, SCTP_BUF_LEN((m))); \
+		SCTP_SAVE_ATOMIC_DECREMENT(&(stcb)->asoc.sb_ccc, SCTP_BUF_LEN((m))); \
 		SCTP_SAVE_ATOMIC_DECREMENT(&(stcb)->asoc.my_rwnd_control_len, MSIZE); \
 	} \
 	if (SCTP_BUF_TYPE(m) != MT_DATA && SCTP_BUF_TYPE(m) != MT_HEADER && \
@@ -207,10 +207,10 @@ extern struct pr_usrreqs sctp_usrreqs;
 }
 
 #define sctp_sballoc(stcb, sb, m) { \
-	atomic_add_int(&(sb)->sb_cc,SCTP_BUF_LEN((m))); \
+	atomic_add_int(&(sb)->sb_ccc,SCTP_BUF_LEN((m))); \
 	atomic_add_int(&(sb)->sb_mbcnt, MSIZE); \
 	if (stcb) { \
-		atomic_add_int(&(stcb)->asoc.sb_cc,SCTP_BUF_LEN((m))); \
+		atomic_add_int(&(stcb)->asoc.sb_ccc,SCTP_BUF_LEN((m))); \
 		atomic_add_int(&(stcb)->asoc.my_rwnd_control_len, MSIZE); \
 	} \
 	if (SCTP_BUF_TYPE(m) != MT_DATA && SCTP_BUF_TYPE(m) != MT_HEADER && \
Index: sys/netinet/sctp_output.c
===================================================================
--- sys/netinet/sctp_output.c	(.../head)	(revision 266804)
+++ sys/netinet/sctp_output.c	(.../projects/sendfile)	(revision 266807)
@@ -7104,7 +7104,7 @@ one_more_time:
 			if ((stcb->sctp_socket != NULL) && \
 			    ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 			    (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) {
-				atomic_subtract_int(&stcb->sctp_socket->so_snd.sb_cc, sp->length);
+				atomic_subtract_int(&stcb->sctp_socket->so_snd.sb_ccc, sp->length);
 			}
 			if (sp->data) {
 				sctp_m_freem(sp->data);
@@ -11382,7 +11382,7 @@ jump_out:
 		drp->current_onq = htonl(asoc->size_on_reasm_queue +
 		    asoc->size_on_all_streams +
 		    asoc->my_rwnd_control_len +
-		    stcb->sctp_socket->so_rcv.sb_cc);
+		    stcb->sctp_socket->so_rcv.sb_ccc);
 	} else {
 		/*-
 		 * If my rwnd is 0, possibly from mbuf depletion as well as
Index: sys/netinet/tcp_usrreq.c
===================================================================
--- sys/netinet/tcp_usrreq.c	(.../head)	(revision 266804)
+++ sys/netinet/tcp_usrreq.c	(.../projects/sendfile)	(revision 266807)
@@ -826,7 +826,7 @@ tcp_usr_send(struct socket *so, int flags, struct
 		m_freem(control);	/* empty control, just free it */
 	}
 	if (!(flags & PRUS_OOB)) {
-		sbappendstream(&so->so_snd, m);
+		sbappendstream(&so->so_snd, m, flags);
 		if (nam && tp->t_state < TCPS_SYN_SENT) {
 			/*
 			 * Do implied connect if not yet connected,
@@ -858,7 +858,8 @@ tcp_usr_send(struct socket *so, int flags, struct
 			socantsendmore(so);
 			tcp_usrclosed(tp);
 		}
-		if (!(inp->inp_flags & INP_DROPPED)) {
+		if (!(inp->inp_flags & INP_DROPPED) &&
+		    !(flags & PRUS_NOTREADY)) {
 			if (flags & PRUS_MORETOCOME)
 				tp->t_flags |= TF_MORETOCOME;
 			error = tcp_output(tp);
@@ -884,7 +885,7 @@ tcp_usr_send(struct socket *so, int flags, struct
 		 * of data past the urgent section.
 		 * Otherwise, snd_up should be one lower.
 		 */
-		sbappendstream_locked(&so->so_snd, m);
+		sbappendstream_locked(&so->so_snd, m, flags);
 		SOCKBUF_UNLOCK(&so->so_snd);
 		if (nam && tp->t_state < TCPS_SYN_SENT) {
 			/*
@@ -908,10 +909,12 @@ tcp_usr_send(struct socket *so, int flags, struct
 			tp->snd_wnd = TTCP_CLIENT_SND_WND;
 			tcp_mss(tp, -1);
 		}
-		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
-		tp->t_flags |= TF_FORCEDATA;
-		error = tcp_output(tp);
-		tp->t_flags &= ~TF_FORCEDATA;
+		tp->snd_up = tp->snd_una + sbavail(&so->so_snd);
+		if (!(flags & PRUS_NOTREADY)) {
+			tp->t_flags |= TF_FORCEDATA;
+			error = tcp_output(tp);
+			tp->t_flags &= ~TF_FORCEDATA;
+		}
 	}
 out:
 	TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB :
Index: sys/netinet/accf_dns.c
===================================================================
--- sys/netinet/accf_dns.c	(.../head)	(revision 266804)
+++ sys/netinet/accf_dns.c	(.../projects/sendfile)	(revision 266807)
@@ -75,7 +75,7 @@ sohasdns(struct socket *so, void *arg, int waitfla
 	struct sockbuf *sb = &so->so_rcv;
 
 	/* If the socket is full, we're ready. */
-	if (sb->sb_cc >= sb->sb_hiwat || sb->sb_mbcnt >= sb->sb_mbmax)
+	if (sbused(sb) >= sb->sb_hiwat || sb->sb_mbcnt >= sb->sb_mbmax)
 		goto ready;
 
 	/* Check to see if we have a request. */
@@ -115,7 +115,7 @@ skippacket(struct sockbuf *sb) {
 	unsigned long packlen;
 	struct packet q, *p = &q;
 
-	if (sb->sb_cc < 2)
+	if (sbavail(sb) < 2)
 		return DNS_WAIT;
 
 	q.m = sb->sb_mb;
@@ -122,7 +122,7 @@ skippacket(struct sockbuf *sb) {
 	q.n = q.m->m_nextpkt;
 	q.moff = 0;
 	q.offset = 0;
-	q.len = sb->sb_cc;
+	q.len = sbavail(sb);
 
 	GET16(p, packlen);
 	if (packlen + 2 > q.len)
Index: sys/netinet/sctputil.c
===================================================================
--- sys/netinet/sctputil.c	(.../head)	(revision 266804)
+++ sys/netinet/sctputil.c	(.../projects/sendfile)	(revision 266807)
@@ -67,9 +67,9 @@ sctp_sblog(struct sockbuf *sb, struct sctp_tcb *st
 	struct sctp_cwnd_log sctp_clog;
 
 	sctp_clog.x.sb.stcb = stcb;
-	sctp_clog.x.sb.so_sbcc = sb->sb_cc;
+	sctp_clog.x.sb.so_sbcc = sb->sb_ccc;
 	if (stcb)
-		sctp_clog.x.sb.stcb_sbcc = stcb->asoc.sb_cc;
+		sctp_clog.x.sb.stcb_sbcc = stcb->asoc.sb_ccc;
 	else
 		sctp_clog.x.sb.stcb_sbcc = 0;
 	sctp_clog.x.sb.incr = incr;
@@ -4356,7 +4356,7 @@ sctp_add_to_readq(struct sctp_inpcb *inp,
 {
 	/*
 	 * Here we must place the control on the end of the socket read
-	 * queue AND increment sb_cc so that select will work properly on
+	 * queue AND increment sb_ccc so that select will work properly on
 	 * read.
 	 */
 	struct mbuf *m, *prev = NULL;
@@ -4482,7 +4482,7 @@ sctp_append_to_readq(struct sctp_inpcb *inp,
 	 * the reassembly queue.
 	 * 
 	 * If PDAPI this means we need to add m to the end of the data.
-	 * Increase the length in the control AND increment the sb_cc.
+	 * Increase the length in the control AND increment the sb_ccc.
 	 * Otherwise sb is NULL and all we need to do is put it at the end
 	 * of the mbuf chain.
 	 */
@@ -4694,10 +4694,10 @@ sctp_free_bufspace(struct sctp_tcb *stcb, struct s
 
 	if (stcb->sctp_socket && (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) ||
 	    ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE)))) {
-		if (stcb->sctp_socket->so_snd.sb_cc >= tp1->book_size) {
-			stcb->sctp_socket->so_snd.sb_cc -= tp1->book_size;
+		if (stcb->sctp_socket->so_snd.sb_ccc >= tp1->book_size) {
+			stcb->sctp_socket->so_snd.sb_ccc -= tp1->book_size;
 		} else {
-			stcb->sctp_socket->so_snd.sb_cc = 0;
+			stcb->sctp_socket->so_snd.sb_ccc = 0;
 
 		}
 	}
@@ -5232,11 +5232,11 @@ sctp_sorecvmsg(struct socket *so,
 	in_eeor_mode = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR);
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_RECV_RWND_LOGGING_ENABLE) {
 		sctp_misc_ints(SCTP_SORECV_ENTER,
-		    rwnd_req, in_eeor_mode, so->so_rcv.sb_cc, uio->uio_resid);
+		    rwnd_req, in_eeor_mode, so->so_rcv.sb_ccc, uio->uio_resid);
 	}
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_RECV_RWND_LOGGING_ENABLE) {
 		sctp_misc_ints(SCTP_SORECV_ENTERPL,
-		    rwnd_req, block_allowed, so->so_rcv.sb_cc, uio->uio_resid);
+		    rwnd_req, block_allowed, so->so_rcv.sb_ccc, uio->uio_resid);
 	}
 	error = sblock(&so->so_rcv, (block_allowed ? SBL_WAIT : 0));
 	if (error) {
@@ -5255,7 +5255,7 @@ restart_nosblocks:
 	    (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE)) {
 		goto out;
 	}
-	if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) && (so->so_rcv.sb_cc == 0)) {
+	if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) && (so->so_rcv.sb_ccc == 0)) {
 		if (so->so_error) {
 			error = so->so_error;
 			if ((in_flags & MSG_PEEK) == 0)
@@ -5262,7 +5262,7 @@ restart_nosblocks:
 				so->so_error = 0;
 			goto out;
 		} else {
-			if (so->so_rcv.sb_cc == 0) {
+			if (so->so_rcv.sb_ccc == 0) {
 				/* indicate EOF */
 				error = 0;
 				goto out;
@@ -5269,9 +5269,9 @@ restart_nosblocks:
 			}
 		}
 	}
-	if ((so->so_rcv.sb_cc <= held_length) && block_allowed) {
+	if ((so->so_rcv.sb_ccc <= held_length) && block_allowed) {
 		/* we need to wait for data */
-		if ((so->so_rcv.sb_cc == 0) &&
+		if ((so->so_rcv.sb_ccc == 0) &&
 		    ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 		    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) {
 			if ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) == 0) {
@@ -5307,7 +5307,7 @@ restart_nosblocks:
 		}
 		held_length = 0;
 		goto restart_nosblocks;
-	} else if (so->so_rcv.sb_cc == 0) {
+	} else if (so->so_rcv.sb_ccc == 0) {
 		if (so->so_error) {
 			error = so->so_error;
 			if ((in_flags & MSG_PEEK) == 0)
@@ -5364,11 +5364,11 @@ restart_nosblocks:
 			SCTP_INP_READ_LOCK(inp);
 		}
 		control = TAILQ_FIRST(&inp->read_queue);
-		if ((control == NULL) && (so->so_rcv.sb_cc != 0)) {
+		if ((control == NULL) && (so->so_rcv.sb_ccc != 0)) {
 #ifdef INVARIANTS
 			panic("Huh, its non zero and nothing on control?");
 #endif
-			so->so_rcv.sb_cc = 0;
+			so->so_rcv.sb_ccc = 0;
 		}
 		SCTP_INP_READ_UNLOCK(inp);
 		hold_rlock = 0;
@@ -5489,11 +5489,11 @@ restart_nosblocks:
 		}
 		/*
 		 * if we reach here, not suitable replacement is available
-		 * <or> fragment interleave is NOT on. So stuff the sb_cc
+		 * <or> fragment interleave is NOT on. So stuff the sb_ccc
 		 * into the our held count, and its time to sleep again.
 		 */
-		held_length = so->so_rcv.sb_cc;
-		control->held_length = so->so_rcv.sb_cc;
+		held_length = so->so_rcv.sb_ccc;
+		control->held_length = so->so_rcv.sb_ccc;
 		goto restart;
 	}
 	/* Clear the held length since there is something to read */
@@ -5790,10 +5790,10 @@ get_more_data:
 					if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
 						sctp_sblog(&so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBFREE, cp_len);
 					}
-					atomic_subtract_int(&so->so_rcv.sb_cc, cp_len);
+					atomic_subtract_int(&so->so_rcv.sb_ccc, cp_len);
 					if ((control->do_not_ref_stcb == 0) &&
 					    stcb) {
-						atomic_subtract_int(&stcb->asoc.sb_cc, cp_len);
+						atomic_subtract_int(&stcb->asoc.sb_ccc, cp_len);
 					}
 					copied_so_far += cp_len;
 					freed_so_far += cp_len;
@@ -5938,7 +5938,7 @@ wait_some_more:
 		    (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE))) {
 			goto release;
 		}
-		if (so->so_rcv.sb_cc <= control->held_length) {
+		if (so->so_rcv.sb_ccc <= control->held_length) {
 			error = sbwait(&so->so_rcv);
 			if (error) {
 				goto release;
@@ -5965,8 +5965,8 @@ wait_some_more:
 				}
 				goto done_with_control;
 			}
-			if (so->so_rcv.sb_cc > held_length) {
-				control->held_length = so->so_rcv.sb_cc;
+			if (so->so_rcv.sb_ccc > held_length) {
+				control->held_length = so->so_rcv.sb_ccc;
 				held_length = 0;
 			}
 			goto wait_some_more;
@@ -6113,13 +6113,13 @@ out:
 			    freed_so_far,
 			    ((uio) ? (slen - uio->uio_resid) : slen),
 			    stcb->asoc.my_rwnd,
-			    so->so_rcv.sb_cc);
+			    so->so_rcv.sb_ccc);
 		} else {
 			sctp_misc_ints(SCTP_SORECV_DONE,
 			    freed_so_far,
 			    ((uio) ? (slen - uio->uio_resid) : slen),
 			    0,
-			    so->so_rcv.sb_cc);
+			    so->so_rcv.sb_ccc);
 		}
 	}
 stage_left:
Index: sys/netinet/sctputil.h
===================================================================
--- sys/netinet/sctputil.h	(.../head)	(revision 266804)
+++ sys/netinet/sctputil.h	(.../projects/sendfile)	(revision 266807)
@@ -284,10 +284,10 @@ do { \
 		} \
    	        if (stcb->sctp_socket && ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || \
 	            (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { \
-			if (stcb->sctp_socket->so_snd.sb_cc >= tp1->book_size) { \
-				atomic_subtract_int(&((stcb)->sctp_socket->so_snd.sb_cc), tp1->book_size); \
+			if (stcb->sctp_socket->so_snd.sb_ccc >= tp1->book_size) { \
+				atomic_subtract_int(&((stcb)->sctp_socket->so_snd.sb_ccc), tp1->book_size); \
 			} else { \
-				stcb->sctp_socket->so_snd.sb_cc = 0; \
+				stcb->sctp_socket->so_snd.sb_ccc = 0; \
 			} \
 		} \
         } \
@@ -305,10 +305,10 @@ do { \
 		} \
    	        if (stcb->sctp_socket && ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || \
 	            (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { \
-			if (stcb->sctp_socket->so_snd.sb_cc >= sp->length) { \
-				atomic_subtract_int(&stcb->sctp_socket->so_snd.sb_cc,sp->length); \
+			if (stcb->sctp_socket->so_snd.sb_ccc >= sp->length) { \
+				atomic_subtract_int(&stcb->sctp_socket->so_snd.sb_ccc,sp->length); \
 			} else { \
-				stcb->sctp_socket->so_snd.sb_cc = 0; \
+				stcb->sctp_socket->so_snd.sb_ccc = 0; \
 			} \
 		} \
         } \
@@ -320,7 +320,7 @@ do { \
 	if ((stcb->sctp_socket != NULL) && \
 	    ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || \
 	     (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { \
-		atomic_add_int(&stcb->sctp_socket->so_snd.sb_cc,sz); \
+		atomic_add_int(&stcb->sctp_socket->so_snd.sb_ccc,sz); \
 	} \
 } while (0)
 
Index: usr.bin/bluetooth/btsockstat/btsockstat.c
===================================================================
--- usr.bin/bluetooth/btsockstat/btsockstat.c	(.../head)	(revision 266804)
+++ usr.bin/bluetooth/btsockstat/btsockstat.c	(.../projects/sendfile)	(revision 266807)
@@ -255,8 +255,8 @@ hcirawpr(kvm_t *kvmd, u_long addr)
 			(unsigned long) pcb.so,
 			(unsigned long) this,
 			pcb.flags,
-			so.so_rcv.sb_cc,
-			so.so_snd.sb_cc,
+			so.so_rcv.sb_ccc,
+			so.so_snd.sb_ccc,
 			pcb.addr.hci_node);
 	}
 } /* hcirawpr */
@@ -303,8 +303,8 @@ l2caprawpr(kvm_t *kvmd, u_long addr)
 "%-8lx %-8lx %6d %6d %-17.17s\n",
 			(unsigned long) pcb.so,
 			(unsigned long) this,
-			so.so_rcv.sb_cc,
-			so.so_snd.sb_cc,
+			so.so_rcv.sb_ccc,
+			so.so_snd.sb_ccc,
 			bdaddrpr(&pcb.src, NULL, 0));
 	}
 } /* l2caprawpr */
@@ -361,8 +361,8 @@ l2cappr(kvm_t *kvmd, u_long addr)
 		fprintf(stdout,
 "%-8lx %6d %6d %-17.17s/%-5d %-17.17s %-5d %s\n",
 			(unsigned long) this,
-			so.so_rcv.sb_cc,
-			so.so_snd.sb_cc,
+			so.so_rcv.sb_ccc,
+			so.so_snd.sb_ccc,
 			bdaddrpr(&pcb.src, local, sizeof(local)),
 			pcb.psm,
 			bdaddrpr(&pcb.dst, remote, sizeof(remote)),
@@ -467,8 +467,8 @@ rfcommpr(kvm_t *kvmd, u_long addr)
 		fprintf(stdout,
 "%-8lx %6d %6d %-17.17s %-17.17s %-4d %-4d %s\n",
 			(unsigned long) this,
-			so.so_rcv.sb_cc,
-			so.so_snd.sb_cc,
+			so.so_rcv.sb_ccc,
+			so.so_snd.sb_ccc,
 			bdaddrpr(&pcb.src, local, sizeof(local)),
 			bdaddrpr(&pcb.dst, remote, sizeof(remote)),
 			pcb.channel,
Index: usr.bin/systat/netstat.c
===================================================================
--- usr.bin/systat/netstat.c	(.../head)	(revision 266804)
+++ usr.bin/systat/netstat.c	(.../projects/sendfile)	(revision 266807)
@@ -333,8 +333,8 @@ enter_kvm(struct inpcb *inp, struct socket *so, in
 	struct netinfo *p;
 
 	if ((p = enter(inp, state, proto)) != NULL) {
-		p->ni_rcvcc = so->so_rcv.sb_cc;
-		p->ni_sndcc = so->so_snd.sb_cc;
+		p->ni_rcvcc = so->so_rcv.sb_ccc;
+		p->ni_sndcc = so->so_snd.sb_ccc;
 	}
 }
 
Index: usr.bin/netstat/netgraph.c
===================================================================
--- usr.bin/netstat/netgraph.c	(.../head)	(revision 266804)
+++ usr.bin/netstat/netgraph.c	(.../projects/sendfile)	(revision 266807)
@@ -119,7 +119,7 @@ netgraphprotopr(u_long off, const char *name, int
 		if (Aflag)
 			printf("%8lx ", (u_long) this);
 		printf("%-5.5s %6u %6u ",
-		    name, sockb.so_rcv.sb_cc, sockb.so_snd.sb_cc);
+		    name, sockb.so_rcv.sb_ccc, sockb.so_snd.sb_ccc);
 
 		/* Get info on associated node */
 		if (ngpcb.node_id == 0 || csock == -1)
Index: usr.bin/netstat/unix.c
===================================================================
--- usr.bin/netstat/unix.c	(.../head)	(revision 266804)
+++ usr.bin/netstat/unix.c	(.../projects/sendfile)	(revision 266807)
@@ -287,7 +287,8 @@ unixdomainpr(struct xunpcb *xunp, struct xsocket *
 	} else {
 		printf("%8lx %-6.6s %6u %6u %8lx %8lx %8lx %8lx",
 		    (long)so->so_pcb, socktype[so->so_type], so->so_rcv.sb_cc,
-		    so->so_snd.sb_cc, (long)unp->unp_vnode, (long)unp->unp_conn,
+		    so->so_snd.sb_cc, (long)unp->unp_vnode,
+		    (long)unp->unp_conn,
 		    (long)LIST_FIRST(&unp->unp_refs),
 		    (long)LIST_NEXT(unp, unp_reflink));
 	}
Index: usr.bin/netstat/inet.c
===================================================================
--- usr.bin/netstat/inet.c	(.../head)	(revision 266804)
+++ usr.bin/netstat/inet.c	(.../projects/sendfile)	(revision 266807)
@@ -137,7 +137,7 @@ pcblist_sysctl(int proto, const char *name, char *
 static void
 sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
 {
-	xsb->sb_cc = sb->sb_cc;
+	xsb->sb_cc = sb->sb_ccc;
 	xsb->sb_hiwat = sb->sb_hiwat;
 	xsb->sb_mbcnt = sb->sb_mbcnt;
 	xsb->sb_mcnt = sb->sb_mcnt;
@@ -479,7 +479,8 @@ protopr(u_long off, const char *name, int af1, int
 				printf("%6u %6u %6u ", tp->t_sndrexmitpack,
 				       tp->t_rcvoopack, tp->t_sndzerowin);
 		} else {
-			printf("%6u %6u ", so->so_rcv.sb_cc, so->so_snd.sb_cc);
+			printf("%6u %6u ",
+			    so->so_rcv.sb_cc, so->so_snd.sb_cc);
 		}
 		if (numeric_port) {
 			if (inp->inp_vflag & INP_IPV4) {

--IuJpT0rwbUevm2bB--


