Date: Tue, 19 Mar 2013 14:53:24 +0000 (UTC)
From: Konstantin Belousov <kib@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject: svn commit: r248518 - head/sys/dev/md
Message-ID: <201303191453.r2JErOCl025555@svn.freebsd.org>
Author: kib
Date: Tue Mar 19 14:53:23 2013
New Revision: 248518
URL: http://svnweb.freebsd.org/changeset/base/248518

Log:
  Support unmapped i/o for md(4).

  The vnode-backed md(4) has to map unmapped bios, because the VOP_READ()
  and VOP_WRITE() interfaces do not allow passing unmapped requests to
  the filesystem.  The vnode-backed md(4) uses pbufs instead of relying
  on bio_transient_map, to avoid the usual md(4) deadlock.

  Sponsored by:	The FreeBSD Foundation
  Tested by:	pho, scottl

Modified:
  head/sys/dev/md/md.c

Modified: head/sys/dev/md/md.c
==============================================================================
--- head/sys/dev/md/md.c	Tue Mar 19 14:50:24 2013	(r248517)
+++ head/sys/dev/md/md.c	Tue Mar 19 14:53:23 2013	(r248518)
@@ -18,11 +18,16 @@
  * Copyright (c) 1988 University of Utah.
  * Copyright (c) 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
+ * Copyright (c) 2013 The FreeBSD Foundation
+ * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department.
  *
+ * Portions of this software were developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -59,6 +64,7 @@
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
+#include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/devicestat.h>
 #include <sys/fcntl.h>
@@ -169,6 +175,8 @@ static LIST_HEAD(, md_s) md_softc_list =
 #define NMASK	(NINDIR-1)
 static int nshift;
 
+static int md_vnode_pbuf_freecnt;
+
 struct indir {
 	uintptr_t	*array;
 	u_int		total;
@@ -415,11 +423,103 @@ g_md_start(struct bio *bp)
 	wakeup(sc);
 }
 
+#define	MD_MALLOC_MOVE_ZERO	1
+#define	MD_MALLOC_MOVE_FILL	2
+#define	MD_MALLOC_MOVE_READ	3
+#define	MD_MALLOC_MOVE_WRITE	4
+#define	MD_MALLOC_MOVE_CMP	5
+
+static int
+md_malloc_move(vm_page_t **mp, int *ma_offs, unsigned sectorsize,
+    void *ptr, u_char fill, int op)
+{
+	struct sf_buf *sf;
+	vm_page_t m, *mp1;
+	char *p, first;
+	off_t *uc;
+	unsigned n;
+	int error, i, ma_offs1, sz, first_read;
+
+	m = NULL;
+	error = 0;
+	sf = NULL;
+	/* if (op == MD_MALLOC_MOVE_CMP) { gcc */
+	first = 0;
+	first_read = 0;
+	uc = ptr;
+	mp1 = *mp;
+	ma_offs1 = *ma_offs;
+	/* } */
+	sched_pin();
+	for (n = sectorsize; n != 0; n -= sz) {
+		sz = imin(PAGE_SIZE - *ma_offs, n);
+		if (m != **mp) {
+			if (sf != NULL)
+				sf_buf_free(sf);
+			m = **mp;
+			sf = sf_buf_alloc(m, SFB_CPUPRIVATE |
+			    (md_malloc_wait ? 0 : SFB_NOWAIT));
+			if (sf == NULL) {
+				error = ENOMEM;
+				break;
+			}
+		}
+		p = (char *)sf_buf_kva(sf) + *ma_offs;
+		switch (op) {
+		case MD_MALLOC_MOVE_ZERO:
+			bzero(p, sz);
+			break;
+		case MD_MALLOC_MOVE_FILL:
+			memset(p, fill, sz);
+			break;
+		case MD_MALLOC_MOVE_READ:
+			bcopy(ptr, p, sz);
+			cpu_flush_dcache(p, sz);
+			break;
+		case MD_MALLOC_MOVE_WRITE:
+			bcopy(p, ptr, sz);
+			break;
+		case MD_MALLOC_MOVE_CMP:
+			for (i = 0; i < sz; i++, p++) {
+				if (!first_read) {
+					*uc = (u_char)*p;
+					first = *p;
+					first_read = 1;
+				} else if (*p != first) {
+					error = EDOOFUS;
+					break;
+				}
+			}
+			break;
+		default:
+			KASSERT(0, ("md_malloc_move unknown op %d\n", op));
+			break;
+		}
+		if (error != 0)
+			break;
+		*ma_offs += sz;
+		*ma_offs %= PAGE_SIZE;
+		if (*ma_offs == 0)
+			(*mp)++;
+		ptr = (char *)ptr + sz;
+	}
+
+	if (sf != NULL)
+		sf_buf_free(sf);
+	sched_unpin();
+	if (op == MD_MALLOC_MOVE_CMP && error != 0) {
+		*mp = mp1;
+		*ma_offs = ma_offs1;
+	}
+	return (error);
+}
+
 static int
 mdstart_malloc(struct md_s *sc, struct bio *bp)
 {
-	int i, error;
 	u_char *dst;
+	vm_page_t *m;
+	int i, error, error1, ma_offs, notmapped;
 	off_t secno, nsec, uc;
 	uintptr_t sp, osp;
 
@@ -432,9 +532,17 @@ mdstart_malloc(struct md_s *sc, struct b
 		return (EOPNOTSUPP);
 	}
 
+	notmapped = (bp->bio_flags & BIO_UNMAPPED) != 0;
+	if (notmapped) {
+		m = bp->bio_ma;
+		ma_offs = bp->bio_ma_offset;
+		dst = NULL;
+	} else {
+		dst = bp->bio_data;
+	}
+
 	nsec = bp->bio_length / sc->sectorsize;
 	secno = bp->bio_offset / sc->sectorsize;
-	dst = bp->bio_data;
 	error = 0;
 	while (nsec--) {
 		osp = s_read(sc->indir, secno);
@@ -442,21 +550,45 @@ mdstart_malloc(struct md_s *sc, struct b
 			if (osp != 0)
 				error = s_write(sc->indir, secno, 0);
 		} else if (bp->bio_cmd == BIO_READ) {
-			if (osp == 0)
-				bzero(dst, sc->sectorsize);
-			else if (osp <= 255)
-				memset(dst, osp, sc->sectorsize);
-			else {
-				bcopy((void *)osp, dst, sc->sectorsize);
-				cpu_flush_dcache(dst, sc->sectorsize);
+			if (osp == 0) {
+				if (notmapped) {
+					error = md_malloc_move(&m, &ma_offs,
+					    sc->sectorsize, NULL, 0,
+					    MD_MALLOC_MOVE_ZERO);
+				} else
+					bzero(dst, sc->sectorsize);
+			} else if (osp <= 255) {
+				if (notmapped) {
+					error = md_malloc_move(&m, &ma_offs,
+					    sc->sectorsize, NULL, osp,
+					    MD_MALLOC_MOVE_FILL);
+				} else
+					memset(dst, osp, sc->sectorsize);
+			} else {
+				if (notmapped) {
+					error = md_malloc_move(&m, &ma_offs,
+					    sc->sectorsize, (void *)osp, 0,
+					    MD_MALLOC_MOVE_READ);
+				} else {
+					bcopy((void *)osp, dst, sc->sectorsize);
+					cpu_flush_dcache(dst, sc->sectorsize);
+				}
 			}
 			osp = 0;
 		} else if (bp->bio_cmd == BIO_WRITE) {
 			if (sc->flags & MD_COMPRESS) {
-				uc = dst[0];
-				for (i = 1; i < sc->sectorsize; i++)
-					if (dst[i] != uc)
-						break;
+				if (notmapped) {
+					error1 = md_malloc_move(&m, &ma_offs,
+					    sc->sectorsize, &uc, 0,
+					    MD_MALLOC_MOVE_CMP);
+					i = error1 == 0 ? sc->sectorsize : 0;
+				} else {
+					uc = dst[0];
+					for (i = 1; i < sc->sectorsize; i++) {
+						if (dst[i] != uc)
+							break;
+					}
+				}
 			} else {
 				i = 0;
 				uc = 0;
@@ -473,10 +605,26 @@ mdstart_malloc(struct md_s *sc, struct b
 					error = ENOSPC;
 					break;
 				}
-				bcopy(dst, (void *)sp, sc->sectorsize);
+				if (notmapped) {
+					error = md_malloc_move(&m,
+					    &ma_offs, sc->sectorsize,
+					    (void *)sp, 0,
+					    MD_MALLOC_MOVE_WRITE);
+				} else {
+					bcopy(dst, (void *)sp,
+					    sc->sectorsize);
+				}
 				error = s_write(sc->indir, secno, sp);
 			} else {
-				bcopy(dst, (void *)osp, sc->sectorsize);
+				if (notmapped) {
+					error = md_malloc_move(&m,
+					    &ma_offs, sc->sectorsize,
+					    (void *)osp, 0,
+					    MD_MALLOC_MOVE_WRITE);
+				} else {
+					bcopy(dst, (void *)osp,
+					    sc->sectorsize);
+				}
 				osp = 0;
 			}
 		}
@@ -488,7 +636,8 @@ mdstart_malloc(struct md_s *sc, struct b
 		if (error != 0)
 			break;
 		secno++;
-		dst += sc->sectorsize;
+		if (!notmapped)
+			dst += sc->sectorsize;
 	}
 	bp->bio_resid = 0;
 	return (error);
@@ -521,6 +670,7 @@ mdstart_vnode(struct md_s *sc, struct bi
 	struct iovec aiov;
 	struct mount *mp;
 	struct vnode *vp;
+	struct buf *pb;
 	struct thread *td;
 	off_t end, zerosize;
@@ -592,7 +742,17 @@ mdstart_vnode(struct md_s *sc, struct bi
 		return (error);
 	}
 
-	aiov.iov_base = bp->bio_data;
+	KASSERT(bp->bio_length <= MAXPHYS, ("bio_length %jd",
+	    (uintmax_t)bp->bio_length));
+	if ((bp->bio_flags & BIO_UNMAPPED) == 0) {
+		pb = NULL;
+		aiov.iov_base = bp->bio_data;
+	} else {
+		pb = getpbuf(&md_vnode_pbuf_freecnt);
+		pmap_qenter((vm_offset_t)pb->b_data, bp->bio_ma, bp->bio_ma_n);
+		aiov.iov_base = (void *)((vm_offset_t)pb->b_data +
+		    bp->bio_ma_offset);
+	}
 	aiov.iov_len = bp->bio_length;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
@@ -622,6 +782,10 @@ mdstart_vnode(struct md_s *sc, struct bi
 		VOP_UNLOCK(vp, 0);
 		vn_finished_write(mp);
 	}
+	if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
+		pmap_qremove((vm_offset_t)pb->b_data, bp->bio_ma_n);
+		relpbuf(pb, &md_vnode_pbuf_freecnt);
+	}
 	bp->bio_resid = auio.uio_resid;
 	return (error);
 }
@@ -629,11 +793,10 @@ mdstart_vnode(struct md_s *sc, struct bi
 static int
 mdstart_swap(struct md_s *sc, struct bio *bp)
 {
-	struct sf_buf *sf;
-	int rv, offs, len, lastend;
-	vm_pindex_t i, lastp;
 	vm_page_t m;
 	u_char *p;
+	vm_pindex_t i, lastp;
+	int rv, ma_offs, offs, len, lastend;
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
@@ -645,6 +808,7 @@ mdstart_swap(struct md_s *sc, struct bio
 	}
 
 	p = bp->bio_data;
+	ma_offs = (bp->bio_flags & BIO_UNMAPPED) == 0 ? 0 : bp->bio_ma_offset;
 
 	/*
 	 * offs is the offset at which to start operating on the
@@ -662,19 +826,12 @@ mdstart_swap(struct md_s *sc, struct bio
 	vm_object_pip_add(sc->object, 1);
 	for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) {
 		len = ((i == lastp) ? lastend : PAGE_SIZE) - offs;
-
-		m = vm_page_grab(sc->object, i,
-		    VM_ALLOC_NORMAL|VM_ALLOC_RETRY);
-		VM_OBJECT_WUNLOCK(sc->object);
-		sched_pin();
-		sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
-		VM_OBJECT_WLOCK(sc->object);
+		m = vm_page_grab(sc->object, i, VM_ALLOC_NORMAL |
+		    VM_ALLOC_RETRY);
 		if (bp->bio_cmd == BIO_READ) {
 			if (m->valid != VM_PAGE_BITS_ALL)
 				rv = vm_pager_get_pages(sc->object, &m, 1, 0);
 			if (rv == VM_PAGER_ERROR) {
-				sf_buf_free(sf);
-				sched_unpin();
 				vm_page_wakeup(m);
 				break;
 			} else if (rv == VM_PAGER_FAIL) {
@@ -684,40 +841,44 @@ mdstart_swap(struct md_s *sc, struct bio
 				 * valid.  Do not set dirty, the page
 				 * can be recreated if thrown out.
 				 */
-				bzero((void *)sf_buf_kva(sf), PAGE_SIZE);
+				pmap_zero_page(m);
 				m->valid = VM_PAGE_BITS_ALL;
 			}
-			bcopy((void *)(sf_buf_kva(sf) + offs), p, len);
-			cpu_flush_dcache(p, len);
+			if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
+				pmap_copy_pages(&m, offs, bp->bio_ma,
+				    ma_offs, len);
+			} else {
+				physcopyout(VM_PAGE_TO_PHYS(m) + offs, p, len);
+				cpu_flush_dcache(p, len);
+			}
 		} else if (bp->bio_cmd == BIO_WRITE) {
 			if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL)
 				rv = vm_pager_get_pages(sc->object, &m, 1, 0);
 			if (rv == VM_PAGER_ERROR) {
-				sf_buf_free(sf);
-				sched_unpin();
 				vm_page_wakeup(m);
 				break;
 			}
-			bcopy(p, (void *)(sf_buf_kva(sf) + offs), len);
+			if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
+				pmap_copy_pages(bp->bio_ma, ma_offs, &m,
+				    offs, len);
+			} else {
+				physcopyin(p, VM_PAGE_TO_PHYS(m) + offs, len);
+			}
 			m->valid = VM_PAGE_BITS_ALL;
 		} else if (bp->bio_cmd == BIO_DELETE) {
 			if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL)
 				rv = vm_pager_get_pages(sc->object, &m, 1, 0);
 			if (rv == VM_PAGER_ERROR) {
-				sf_buf_free(sf);
-				sched_unpin();
 				vm_page_wakeup(m);
 				break;
 			}
 			if (len != PAGE_SIZE) {
-				bzero((void *)(sf_buf_kva(sf) + offs), len);
+				pmap_zero_page_area(m, offs, len);
 				vm_page_clear_dirty(m, offs, len);
 				m->valid = VM_PAGE_BITS_ALL;
 			} else
 				vm_pager_page_unswapped(m);
 		}
-		sf_buf_free(sf);
-		sched_unpin();
 		vm_page_wakeup(m);
 		vm_page_lock(m);
 		if (bp->bio_cmd == BIO_DELETE && len == PAGE_SIZE)
@@ -731,6 +892,7 @@ mdstart_swap(struct md_s *sc, struct bio
 		/* Actions on further pages start at offset 0 */
 		p += PAGE_SIZE - offs;
 		offs = 0;
+		ma_offs += len;
 	}
 	vm_object_pip_subtract(sc->object, 1);
 	VM_OBJECT_WUNLOCK(sc->object);
@@ -846,6 +1008,7 @@ mdinit(struct md_s *sc)
 	pp = g_new_providerf(gp, "md%d", sc->unit);
 	pp->mediasize = sc->mediasize;
 	pp->sectorsize = sc->sectorsize;
+	pp->flags |= G_PF_ACCEPT_UNMAPPED;
 	sc->gp = gp;
 	sc->pp = pp;
 	g_error_provider(pp, 0);
@@ -1372,6 +1535,7 @@ g_md_init(struct g_class *mp __unused)
 			sx_xunlock(&md_sx);
 		}
 	}
+	md_vnode_pbuf_freecnt = nswbuf / 10;
 	status_dev = make_dev(&mdctl_cdevsw, INT_MAX, UID_ROOT, GID_WHEEL,
 	    0600, MDCTL_NAME);
 	g_topology_lock();
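A note for readers of the malloc path: the least obvious part of md_malloc_move()
is how it steps a sector across page boundaries of the bio_ma array. Each chunk
is clamped with imin() so it never crosses a page, and the page pointer advances
only when the running offset wraps to zero. The standalone userspace program
below demonstrates just that stepping arithmetic; it is an illustration, not
kernel code, and the 8192-byte length and 3000-byte starting offset are made-up
example values.

/*
 * Demonstration of the page-walk arithmetic used by md_malloc_move().
 * Compile with: cc -o pagewalk pagewalk.c
 */
#include <stdio.h>

#define	PAGE_SIZE	4096

static int
imin(int a, int b)
{

	return (a < b ? a : b);
}

int
main(void)
{
	unsigned n, sectorsize;
	int ma_offs, page, sz;

	sectorsize = 8192;	/* example: two pages worth of data */
	ma_offs = 3000;		/* example: start 3000 bytes into page 0 */
	page = 0;

	for (n = sectorsize; n != 0; n -= sz) {
		/* Largest chunk that stays within the current page. */
		sz = imin(PAGE_SIZE - ma_offs, n);
		printf("page %d, offset %d, copy %d bytes\n",
		    page, ma_offs, sz);
		/* Advance; step to the next page on a boundary. */
		ma_offs = (ma_offs + sz) % PAGE_SIZE;
		if (ma_offs == 0)
			page++;
	}
	return (0);
}

With these inputs the request is consumed as 1096-, 4096-, and 3000-byte
chunks. In md_malloc_move() the same wrap test also decides when the current
sf_buf must be replaced with a mapping of the next page, since each sf_buf
covers exactly one page.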
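For the vnode path, the essence of the change is the map/unmap bracket that
mdstart_vnode() now places around the VOP_READ()/VOP_WRITE() call. The sketch
below repackages that bracket as a pair of helpers to make the lifecycle
explicit. This is an illustration only: md_map_bio() and md_unmap_bio() are
hypothetical names, not functions in md.c, and the error handling and uio
setup that surround this code in the real function are omitted.

/* Hypothetical helpers; bodies lifted from the mdstart_vnode() hunks above. */
static struct buf *
md_map_bio(struct bio *bp, struct iovec *aiov)
{
	struct buf *pb;

	if ((bp->bio_flags & BIO_UNMAPPED) == 0) {
		/* Mapped bio: the data already has a kernel address. */
		aiov->iov_base = bp->bio_data;
		return (NULL);
	}

	/*
	 * Unmapped bio: borrow a pbuf from the private pool.  Its
	 * b_data points at a preallocated MAXPHYS-sized KVA window,
	 * so pmap_qenter() can install the bio's pages without
	 * allocating new address space.  Using a dedicated pool
	 * instead of bio_transient_map is what avoids the usual
	 * md(4) deadlock mentioned in the log message.
	 */
	pb = getpbuf(&md_vnode_pbuf_freecnt);
	pmap_qenter((vm_offset_t)pb->b_data, bp->bio_ma, bp->bio_ma_n);
	aiov->iov_base = (void *)((vm_offset_t)pb->b_data +
	    bp->bio_ma_offset);
	return (pb);
}

static void
md_unmap_bio(struct bio *bp, struct buf *pb)
{

	if (pb == NULL)
		return;
	/* Invalidate the temporary mappings and recycle the pbuf. */
	pmap_qremove((vm_offset_t)pb->b_data, bp->bio_ma_n);
	relpbuf(pb, &md_vnode_pbuf_freecnt);
}

The pool size set in g_md_init() (nswbuf / 10) bounds how many vnode-backed
requests can hold a KVA window at once; getpbuf() sleeps when the pool is
empty, which provides natural backpressure instead of a deadlock.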