Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 22 Apr 2015 18:11:34 +0000 (UTC)
From:      Alexander Motin <mav@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r281860 - head/sys/kern
Message-ID:  <201504221811.t3MIBYoA099702@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: mav
Date: Wed Apr 22 18:11:34 2015
New Revision: 281860
URL: https://svnweb.freebsd.org/changeset/base/281860

Log:
  Make AIO not allocate pbufs for unmapped I/O, like r281825.
  
  While there, make a few more performance optimizations.
  
  On a 40-core system doing many 512-byte AIO reads from an array of raw SSDs,
  this change removes lock congestion inside the pbuf allocator and devfs,
  and the bottleneck on the single AIO completion taskqueue thread.  It improves
  peak AIO performance from ~600K to ~1.3M IOPS.
  
  MFC after:	2 weeks

Modified:
  head/sys/kern/vfs_aio.c

Modified: head/sys/kern/vfs_aio.c
==============================================================================
--- head/sys/kern/vfs_aio.c	Wed Apr 22 17:35:58 2015	(r281859)
+++ head/sys/kern/vfs_aio.c	Wed Apr 22 18:11:34 2015	(r281860)
@@ -59,10 +59,12 @@ __FBSDID("$FreeBSD$");
 #include <sys/conf.h>
 #include <sys/event.h>
 #include <sys/mount.h>
+#include <geom/geom.h>
 
 #include <machine/atomic.h>
 
 #include <vm/vm.h>
+#include <vm/vm_page.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
@@ -232,9 +234,10 @@ struct aiocblist {
 	int	jobstate;		/* (b) job state */
 	int	inputcharge;		/* (*) input blockes */
 	int	outputcharge;		/* (*) output blockes */
-	struct	buf *bp;		/* (*) private to BIO backend,
-				  	 * buffer pointer
-					 */
+	struct	bio *bp;		/* (*) BIO backend BIO pointer */
+	struct	buf *pbuf;		/* (*) BIO backend buffer pointer */
+	struct	vm_page *pages[btoc(MAXPHYS)+1]; /* BIO backend pages */
+	int	npages;			/* BIO backend number of pages */
 	struct	proc *userproc;		/* (*) user process */
 	struct  ucred *cred;		/* (*) active credential when created */
 	struct	file *fd_file;		/* (*) pointer to file structure */
@@ -243,7 +246,6 @@ struct aiocblist {
 	struct	knlist klist;		/* (a) list of knotes */
 	struct	aiocb uaiocb;		/* (*) kernel I/O control block */
 	ksiginfo_t ksi;			/* (a) realtime signal info */
-	struct	task biotask;		/* (*) private to BIO backend */
 	uint64_t seqno;			/* (*) job number */
 	int	pending;		/* (a) number of pending I/O, aio_fsync only */
 };
@@ -344,11 +346,10 @@ static void	aio_process_mlock(struct aio
 static int	aio_newproc(int *);
 int		aio_aqueue(struct thread *td, struct aiocb *job,
 			struct aioliojob *lio, int type, struct aiocb_ops *ops);
-static void	aio_physwakeup(struct buf *bp);
+static void	aio_physwakeup(struct bio *bp);
 static void	aio_proc_rundown(void *arg, struct proc *p);
 static void	aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp);
 static int	aio_qphysio(struct proc *p, struct aiocblist *iocb);
-static void	biohelper(void *, int);
 static void	aio_daemon(void *param);
 static void	aio_swake_cb(struct socket *, struct sockbuf *);
 static int	aio_unload(void);
@@ -1294,13 +1295,15 @@ aio_qphysio(struct proc *p, struct aiocb
 {
 	struct aiocb *cb;
 	struct file *fp;
-	struct buf *bp;
+	struct bio *bp;
+	struct buf *pbuf;
 	struct vnode *vp;
 	struct cdevsw *csw;
 	struct cdev *dev;
 	struct kaioinfo *ki;
 	struct aioliojob *lj;
-	int error, ref;
+	int error, ref, unmap, poff;
+	vm_prot_t prot;
 
 	cb = &aiocbe->uaiocb;
 	fp = aiocbe->fd_file;
@@ -1309,107 +1312,121 @@ aio_qphysio(struct proc *p, struct aiocb
 		return (-1);
 
 	vp = fp->f_vnode;
-
-	/*
-	 * If its not a disk, we don't want to return a positive error.
-	 * It causes the aio code to not fall through to try the thread
-	 * way when you're talking to a regular file.
-	 */
-	if (!vn_isdisk(vp, &error)) {
-		if (error == ENOTBLK)
-			return (-1);
-		else
-			return (error);
-	}
-
-	if (vp->v_bufobj.bo_bsize == 0)
-		return (-1);
-
- 	if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
+	if (vp->v_type != VCHR)
 		return (-1);
-
-	if (cb->aio_nbytes >
-	    MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
+	if (vp->v_bufobj.bo_bsize == 0)
 		return (-1);
-
-	ki = p->p_aioinfo;
-	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
+	if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
 		return (-1);
 
 	ref = 0;
 	csw = devvn_refthread(vp, &dev, &ref);
 	if (csw == NULL)
 		return (ENXIO);
+
+	if ((csw->d_flags & D_DISK) == 0) {
+		error = -1;
+		goto unref;
+	}
 	if (cb->aio_nbytes > dev->si_iosize_max) {
 		error = -1;
 		goto unref;
 	}
 
-	/* Create and build a buffer header for a transfer. */
-	bp = (struct buf *)getpbuf(NULL);
-	BUF_KERNPROC(bp);
+	ki = p->p_aioinfo;
+	poff = (vm_offset_t)cb->aio_buf & PAGE_MASK;
+	unmap = ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed);
+	if (unmap) {
+		if (cb->aio_nbytes > MAXPHYS) {
+			error = -1;
+			goto unref;
+		}
+	} else {
+		if (cb->aio_nbytes > MAXPHYS - poff) {
+			error = -1;
+			goto unref;
+		}
+		if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) {
+			error = -1;
+			goto unref;
+		}
+	}
+	aiocbe->bp = bp = g_alloc_bio();
+	if (!unmap) {
+		aiocbe->pbuf = pbuf = (struct buf *)getpbuf(NULL);
+		BUF_KERNPROC(pbuf);
+	}
 
 	AIO_LOCK(ki);
 	ki->kaio_count++;
-	ki->kaio_buffer_count++;
+	if (!unmap)
+		ki->kaio_buffer_count++;
 	lj = aiocbe->lio;
 	if (lj)
 		lj->lioj_count++;
-	AIO_UNLOCK(ki);
-
-	/*
-	 * Get a copy of the kva from the physical buffer.
-	 */
-	error = 0;
-
-	bp->b_bcount = cb->aio_nbytes;
-	bp->b_bufsize = cb->aio_nbytes;
-	bp->b_iodone = aio_physwakeup;
-	bp->b_saveaddr = bp->b_data;
-	bp->b_data = (void *)(uintptr_t)cb->aio_buf;
-	bp->b_offset = cb->aio_offset;
-	bp->b_iooffset = cb->aio_offset;
-	bp->b_blkno = btodb(cb->aio_offset);
-	bp->b_iocmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
-
-	/*
-	 * Bring buffer into kernel space.
-	 */
-	if (vmapbuf(bp, (dev->si_flags & SI_UNMAPPED) == 0) < 0) {
-		error = EFAULT;
-		goto doerror;
-	}
-
-	AIO_LOCK(ki);
-	aiocbe->bp = bp;
-	bp->b_caller1 = (void *)aiocbe;
 	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
 	TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
 	aiocbe->jobstate = JOBST_JOBQBUF;
 	cb->_aiocb_private.status = cb->aio_nbytes;
 	AIO_UNLOCK(ki);
 
-	atomic_add_int(&num_queue_count, 1);
-	atomic_add_int(&num_buf_aio, 1);
-
-	bp->b_error = 0;
+	bp->bio_length = cb->aio_nbytes;
+	bp->bio_bcount = cb->aio_nbytes;
+	bp->bio_done = aio_physwakeup;
+	bp->bio_data = (void *)(uintptr_t)cb->aio_buf;
+	bp->bio_offset = cb->aio_offset;
+	bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
+	bp->bio_dev = dev;
+	bp->bio_caller1 = (void *)aiocbe;
+
+	prot = VM_PROT_READ;
+	if (cb->aio_lio_opcode == LIO_READ)
+		prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
+	if ((aiocbe->npages = vm_fault_quick_hold_pages(
+	    &curproc->p_vmspace->vm_map,
+	    (vm_offset_t)bp->bio_data, bp->bio_length, prot, aiocbe->pages,
+	    sizeof(aiocbe->pages)/sizeof(aiocbe->pages[0]))) < 0) {
+		error = EFAULT;
+		goto doerror;
+	}
+	if (!unmap) {
+		pmap_qenter((vm_offset_t)pbuf->b_data,
+		    aiocbe->pages, aiocbe->npages);
+		bp->bio_data = pbuf->b_data + poff;
+	} else {
+		bp->bio_ma = aiocbe->pages;
+		bp->bio_ma_n = aiocbe->npages;
+		bp->bio_ma_offset = poff;
+		bp->bio_data = unmapped_buf;
+		bp->bio_flags |= BIO_UNMAPPED;
+	}
 
-	TASK_INIT(&aiocbe->biotask, 0, biohelper, aiocbe);
+	atomic_add_int(&num_queue_count, 1);
+	if (!unmap)
+		atomic_add_int(&num_buf_aio, 1);
 
 	/* Perform transfer. */
-	dev_strategy_csw(dev, csw, bp);
+	csw->d_strategy(bp);
 	dev_relthread(dev, ref);
 	return (0);
 
 doerror:
 	AIO_LOCK(ki);
+	aiocbe->jobstate = JOBST_NULL;
+	TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
+	TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist);
 	ki->kaio_count--;
-	ki->kaio_buffer_count--;
+	if (!unmap)
+		ki->kaio_buffer_count--;
 	if (lj)
 		lj->lioj_count--;
-	aiocbe->bp = NULL;
 	AIO_UNLOCK(ki);
-	relpbuf(bp, NULL);
+	if (pbuf) {
+		relpbuf(pbuf, NULL);
+		aiocbe->pbuf = NULL;
+	}
+	g_destroy_bio(bp);
+	aiocbe->bp = NULL;
 unref:
 	dev_relthread(dev, ref);
 	return (error);
@@ -1787,8 +1804,6 @@ no_kqueue:
 	}
 #endif
 queueit:
-	/* No buffer for daemon I/O. */
-	aiocbe->bp = NULL;
 	atomic_add_int(&num_queue_count, 1);
 
 	AIO_LOCK(ki);
@@ -2425,54 +2440,43 @@ sys_lio_listio(struct thread *td, struct
 	return (error);
 }
 
-/*
- * Called from interrupt thread for physio, we should return as fast
- * as possible, so we schedule a biohelper task.
- */
 static void
-aio_physwakeup(struct buf *bp)
+aio_physwakeup(struct bio *bp)
 {
-	struct aiocblist *aiocbe;
-
-	aiocbe = (struct aiocblist *)bp->b_caller1;
-	taskqueue_enqueue(taskqueue_aiod_bio, &aiocbe->biotask);
-}
-
-/*
- * Task routine to perform heavy tasks, process wakeup, and signals.
- */
-static void
-biohelper(void *context, int pending)
-{
-	struct aiocblist *aiocbe = context;
-	struct buf *bp;
+	struct aiocblist *aiocbe = (struct aiocblist *)bp->bio_caller1;
 	struct proc *userp;
 	struct kaioinfo *ki;
 	int nblks;
 
+	/* Release mapping into kernel space. */
+	if (aiocbe->pbuf) {
+		pmap_qremove((vm_offset_t)aiocbe->pbuf->b_data, aiocbe->npages);
+		relpbuf(aiocbe->pbuf, NULL);
+		aiocbe->pbuf = NULL;
+		atomic_subtract_int(&num_buf_aio, 1);
+	}
+	vm_page_unhold_pages(aiocbe->pages, aiocbe->npages);
+
 	bp = aiocbe->bp;
+	aiocbe->bp = NULL;
 	userp = aiocbe->userproc;
 	ki = userp->p_aioinfo;
 	AIO_LOCK(ki);
-	aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
+	aiocbe->uaiocb._aiocb_private.status -= bp->bio_resid;
 	aiocbe->uaiocb._aiocb_private.error = 0;
-	if (bp->b_ioflags & BIO_ERROR)
-		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
+	if (bp->bio_flags & BIO_ERROR)
+		aiocbe->uaiocb._aiocb_private.error = bp->bio_error;
 	nblks = btodb(aiocbe->uaiocb.aio_nbytes);
 	if (aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE)
 		aiocbe->outputcharge += nblks;
 	else
 		aiocbe->inputcharge += nblks;
-	aiocbe->bp = NULL;
 	TAILQ_REMOVE(&userp->p_aioinfo->kaio_bufqueue, aiocbe, plist);
 	ki->kaio_buffer_count--;
 	aio_bio_done_notify(userp, aiocbe, DONE_BUF);
 	AIO_UNLOCK(ki);
 
-	/* Release mapping into kernel space. */
-	vunmapbuf(bp);
-	relpbuf(bp, NULL);
-	atomic_subtract_int(&num_buf_aio, 1);
+	g_destroy_bio(bp);
 }
 
 /* syscall - wait for the next completion of an aio request */



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201504221811.t3MIBYoA099702>