From owner-svn-src-user@FreeBSD.ORG Sun Sep 28 00:43:05 2014 Return-Path: Delivered-To: svn-src-user@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:1900:2254:206a::19:1]) (using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits)) (No client certificate requested) by hub.freebsd.org (Postfix) with ESMTPS id 2527869C; Sun, 28 Sep 2014 00:43:05 +0000 (UTC) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:1900:2254:2068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mx1.freebsd.org (Postfix) with ESMTPS id 116D48AD; Sun, 28 Sep 2014 00:43:05 +0000 (UTC) Received: from svn.freebsd.org ([127.0.1.70]) by svn.freebsd.org (8.14.9/8.14.9) with ESMTP id s8S0h4Ed081855; Sun, 28 Sep 2014 00:43:04 GMT (envelope-from marcel@FreeBSD.org) Received: (from marcel@localhost) by svn.freebsd.org (8.14.9/8.14.9/Submit) id s8S0h4Pu081854; Sun, 28 Sep 2014 00:43:04 GMT (envelope-from marcel@FreeBSD.org) Message-Id: <201409280043.s8S0h4Pu081854@svn.freebsd.org> X-Authentication-Warning: svn.freebsd.org: marcel set sender to marcel@FreeBSD.org using -f From: Marcel Moolenaar Date: Sun, 28 Sep 2014 00:43:04 +0000 (UTC) To: src-committers@freebsd.org, svn-src-user@freebsd.org Subject: svn commit: r272233 - user/marcel/mkimg X-SVN-Group: user MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-user@freebsd.org X-Mailman-Version: 2.1.18-1 Precedence: list List-Id: "SVN commit messages for the experimental " user" src tree" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Sun, 28 Sep 2014 00:43:05 -0000 Author: marcel Date: Sun Sep 28 00:43:04 2014 New Revision: 272233 URL: http://svnweb.freebsd.org/changeset/base/272233 Log: Start performance optimizations: Output formats typically need to know whether a sequence of blocks has data or not. 
They use this to determine whether to allocate disk space for them or not. The image_data() function provides that functionality, but is implemented by reading the amount of blocks from the temporary file and comparing that with zeroes. The QCOW format needs to go over the image 3 times and every time we read from the temporary file. We can speed this up by building a "chunk" list in memory while we read the partition data. Each chunk is a sequence of blocks that is either defined as a gap (i.e. all zeroes) or defined as containing data. For each chunk we keep track of the file and the offset in that file where the chunk's data comes from. This allows us to handle regular files and in particular sparse files more optimally. For sparse files we can trivially build a chunk for each of the holes in the sparse file by using SEEK_HOLE and SEEK_DATA. The data regions still need to be read to handle zero blocks for block sizes smaller than the underlying file system's block size. However, we don't have to copy the data into a temporary file anymore. For anything but regular files, we still use the temporary file. We call it a swap file now. With that all data can be mapped and unmapped as we need to access it. This commit implements the creation of the chunk list and the swap file usage for non-regular files (i.e. streams). Mappable files are now handled like streams, so that needs some work. The big part that is missing is the use of the chunk list for determining whether a sequence of blocks has data and all the handling of writing the image data to the output file. As such: this commit breaks mkimg and makes it useless. It's a good WIP to save though -- hence doing it on my branch. 
Modified: user/marcel/mkimg/image.c Modified: user/marcel/mkimg/image.c ============================================================================== --- user/marcel/mkimg/image.c Sun Sep 28 00:24:01 2014 (r272232) +++ user/marcel/mkimg/image.c Sun Sep 28 00:43:04 2014 (r272233) @@ -27,67 +27,303 @@ #include __FBSDID("$FreeBSD$"); +#include +#include +#include #include #include #include #include #include +#include #include #include +#include #include #include "image.h" #include "mkimg.h" -#define BUFFER_SIZE (1024*1024) +struct chunk { + lba_t ch_block; /* Block address in image. */ + off_t ch_ofs; /* Offset in backing file. */ + STAILQ_ENTRY(chunk) ch_list; + size_t ch_size; /* Size of chunk in bytes. */ + int ch_fd; /* FD of backing file. */ + u_int ch_flags; +#define CH_FLAGS_GAP 1 /* Chunk is a gap (no FD). */ +#define CH_FLAGS_DIRTY 2 /* Data modified/only in memory. */ +}; + +static STAILQ_HEAD(chunk_head, chunk) image_chunks; +static u_int image_nchunks; + +static char image_swap_file[PATH_MAX]; +static int image_swap_fd = -1; +static u_int image_swap_pgsz; +static off_t image_swap_size; -static char image_tmpfile[PATH_MAX]; -static int image_fd = -1; static lba_t image_size; +/* + * Swap file handlng. + */ + +static off_t +image_swap_alloc(size_t size) +{ + off_t ofs; + size_t unit; + + unit = (secsz > image_swap_pgsz) ? secsz : image_swap_pgsz; + assert((unit & (unit - 1)) == 0); + + size = (size + unit - 1) & ~(unit - 1); + + ofs = image_swap_size; + image_swap_size += size; + if (ftruncate(image_swap_fd, image_swap_size) == -1) { + image_swap_size = ofs; + ofs = -1LL; + } + fprintf(stderr, "SWAP: off=%jd, size=%zu\n", (intmax_t)ofs, size); + return (ofs); +} + +/* + * Image chunk handling. 
+ */ + static void -cleanup(void) +image_chunk_dump(void) { + struct chunk *ch; - if (image_fd != -1) - close(image_fd); - unlink(image_tmpfile); + fprintf(stderr, "%u chunks:\n", image_nchunks); + STAILQ_FOREACH(ch, &image_chunks, ch_list) { + fprintf(stderr, "\tblk=%jd, ofs=%jd, fd=%d, sz=%zu, fl=%u\n", + (intmax_t)ch->ch_block, (intmax_t)ch->ch_ofs, ch->ch_fd, + ch->ch_size, ch->ch_flags); + } } -int -image_copyin(lba_t blk, int fd, uint64_t *sizep) +static size_t +image_chunk_grow(struct chunk *ch, size_t sz) +{ + size_t dsz, newsz; + + newsz = ch->ch_size + sz; + if (newsz > ch->ch_size) { + ch->ch_size = newsz; + return (0); + } + /* We would overflow -- create new chunk for remainder. */ + dsz = SIZE_MAX - ch->ch_size; + assert(dsz < sz); + ch->ch_size = SIZE_MAX; + return (sz - dsz); +} + +static int +image_chunk_skipto(lba_t to) +{ + struct chunk *ch; + lba_t from; + size_t sz; + + ch = STAILQ_LAST(&image_chunks, chunk, ch_list); + from = (ch != NULL) ? ch->ch_block + (ch->ch_size / secsz) : 0LL; + + assert(from <= to); + + /* Nothing to do? */ + if (from == to) + return (0); + /* Avoid bugs due to overflows. 
*/ + if ((uintmax_t)(to - from) > (uintmax_t)(SIZE_MAX / secsz)) + return (EFBIG); + sz = (to - from) * secsz; + if (ch != NULL && (ch->ch_flags & CH_FLAGS_GAP)) { + sz = image_chunk_grow(ch, sz); + if (sz == 0) + return (0); + from = ch->ch_block + (ch->ch_size / secsz); + } + ch = malloc(sizeof(*ch)); + if (ch == NULL) + return (ENOMEM); + memset(ch, 0, sizeof(*ch)); + ch->ch_block = from; + ch->ch_size = sz; + ch->ch_fd = -1; + ch->ch_flags |= CH_FLAGS_GAP; + STAILQ_INSERT_TAIL(&image_chunks, ch, ch_list); + image_nchunks++; + return (0); +} + +static int +image_chunk_append(lba_t blk, size_t sz, off_t ofs, int fd) +{ + struct chunk *ch; + + ch = STAILQ_LAST(&image_chunks, chunk, ch_list); + if (ch != NULL && (ch->ch_flags & CH_FLAGS_GAP) == 0) { + if (fd == ch->ch_fd && + blk == (lba_t)(ch->ch_block + (ch->ch_size / secsz)) && + ofs == (off_t)(ch->ch_ofs + ch->ch_size)) { + sz = image_chunk_grow(ch, sz); + if (sz == 0) + return (0); + blk = ch->ch_block + (ch->ch_size / secsz); + ofs = ch->ch_ofs + ch->ch_size; + } + } + ch = malloc(sizeof(*ch)); + if (ch == NULL) + return (ENOMEM); + memset(ch, 0, sizeof(*ch)); + ch->ch_block = blk; + ch->ch_ofs = ofs; + ch->ch_size = sz; + ch->ch_fd = fd; + STAILQ_INSERT_TAIL(&image_chunks, ch, ch_list); + image_nchunks++; + return (0); +} + +static int +image_chunk_copyin(lba_t blk, void *buf, size_t sz, off_t ofs, int fd) +{ + uint64_t *p = buf; + size_t n; + int error; + + assert(((uintptr_t)p & 3) == 0); + + error = 0; + sz = (sz + secsz - 1) & ~(secsz - 1); + while (!error && sz > 0) { + n = 0; + while (n < (secsz >> 3) && p[n] == 0) + n++; + if (n == (secsz >> 3)) + error = image_chunk_skipto(blk + 1); + else + error = image_chunk_append(blk, secsz, ofs, fd); + blk++; + p += (secsz >> 3); + sz -= secsz; + ofs += secsz; + } + return (error); +} + +/* + * File mapping support. 
+ */ + +static void * +image_file_map(int fd, off_t ofs, size_t sz) +{ + void *ptr; + size_t unit; + int flags, prot; + + unit = (secsz > image_swap_pgsz) ? secsz : image_swap_pgsz; + assert((unit & (unit - 1)) == 0); + + flags = MAP_NOCORE | MAP_NOSYNC | MAP_SHARED; + /* Allow writing to our swap file only. */ + prot = PROT_READ | ((fd == image_swap_fd) ? PROT_WRITE : 0); + sz = (sz + unit - 1) & ~(unit - 1); + ptr = mmap(NULL, sz, prot, flags, fd, ofs); + return ((ptr == MAP_FAILED) ? NULL : ptr); +} + +static int +image_file_unmap(void *buffer, size_t sz) +{ + size_t unit; + + unit = (secsz > image_swap_pgsz) ? secsz : image_swap_pgsz; + sz = (sz + unit - 1) & ~(unit - 1); + munmap(buffer, sz); + return (0); +} + +/* + * Input/source file handling. + */ + +static int +image_copyin_stream(lba_t blk, int fd, uint64_t *sizep) { char *buffer; uint64_t bytesize; - ssize_t bcnt, rdsz; - int error, partial; + off_t swofs; + size_t iosz; + ssize_t rdsz; + int error; - assert(BUFFER_SIZE % secsz == 0); + /* + * This makes sure we're doing I/O in multiples of the page + * size as well as of the sector size. 2MB is the minimum + * by virtue of secsz at least 512 bytes and the page size + * at least 4K bytes. + */ + iosz = secsz * image_swap_pgsz; - buffer = malloc(BUFFER_SIZE); - if (buffer == NULL) - return (ENOMEM); bytesize = 0; - partial = 0; - while (1) { - rdsz = read(fd, buffer, BUFFER_SIZE); - if (rdsz <= 0) { - error = (rdsz < 0) ? 
errno : 0; - break; - } - if (partial) - abort(); - bytesize += rdsz; - bcnt = (rdsz + secsz - 1) / secsz; - error = image_write(blk, buffer, bcnt); + do { + swofs = image_swap_alloc(iosz); + if (swofs == -1LL) + return (errno); + buffer = image_file_map(image_swap_fd, swofs, iosz); + if (buffer == NULL) + return (errno); + rdsz = read(fd, buffer, iosz); + if (rdsz > 0) + error = image_chunk_copyin(blk, buffer, rdsz, swofs, + image_swap_fd); + else if (rdsz < 0) + error = errno; + else + error = 0; + image_file_unmap(buffer, iosz); + /* XXX should we relinguish unused swap space? */ if (error) - break; - blk += bcnt; - partial = ((ssize_t)(bcnt * secsz) != rdsz) ? 1 : 0; - } - free(buffer); + return (error); + + bytesize += rdsz; + blk += (rdsz + secsz - 1) / secsz; + } while (rdsz > 0); + if (sizep != NULL) *sizep = bytesize; + return (0); +} + +static int +image_copyin_mapped(lba_t blk, int fd, uint64_t *sizep) +{ + + return (image_copyin_stream(blk, fd, sizep)); +} + +int +image_copyin(lba_t blk, int fd, uint64_t *sizep) +{ + struct stat sb; + int error; + + error = image_chunk_skipto(blk); + if (!error) { + if (fstat(fd, &sb) == -1 || !S_ISREG(sb.st_mode)) + error = image_copyin_stream(blk, fd, sizep); + else + error = image_copyin_mapped(blk, fd, sizep); + } return (error); } @@ -120,23 +356,25 @@ image_copyout_region(int fd, lba_t blk, { char *buffer; off_t ofs; - size_t sz; + size_t bufsz, sz; ssize_t rdsz, wrsz; int error; + bufsz = secsz * image_swap_pgsz; + ofs = lseek(fd, 0L, SEEK_CUR); blk *= secsz; - if (lseek(image_fd, blk, SEEK_SET) != blk) + if (lseek(image_swap_fd, blk, SEEK_SET) != blk) return (errno); - buffer = malloc(BUFFER_SIZE); + buffer = malloc(bufsz); if (buffer == NULL) return (errno); error = 0; size *= secsz; while (size > 0) { - sz = (BUFFER_SIZE < size) ? BUFFER_SIZE : size; - rdsz = read(image_fd, buffer, sz); + sz = ((ssize_t)bufsz < size) ? 
bufsz : (size_t)size; + rdsz = read(image_swap_fd, buffer, sz); if (rdsz <= 0) { error = (rdsz < 0) ? errno : 0; break; @@ -161,7 +399,7 @@ image_data(lba_t blk, lba_t size) char *buffer, *p; blk *= secsz; - if (lseek(image_fd, blk, SEEK_SET) != blk) + if (lseek(image_swap_fd, blk, SEEK_SET) != blk) return (1); size *= secsz; @@ -169,7 +407,7 @@ image_data(lba_t blk, lba_t size) if (buffer == NULL) return (1); - if (read(image_fd, buffer, size) != (ssize_t)size) { + if (read(image_swap_fd, buffer, size) != (ssize_t)size) { free(buffer); return (1); } @@ -185,7 +423,12 @@ image_data(lba_t blk, lba_t size) lba_t image_get_size(void) { + static int once = 0; + if (once == 0) { + once++; + image_chunk_dump(); + } return (image_size); } @@ -193,8 +436,10 @@ int image_set_size(lba_t blk) { + image_chunk_skipto(blk); + image_size = blk; - if (ftruncate(image_fd, blk * secsz) == -1) + if (ftruncate(image_swap_fd, blk * secsz) == -1) return (errno); return (0); } @@ -204,27 +449,42 @@ image_write(lba_t blk, void *buf, ssize_ { blk *= secsz; - if (lseek(image_fd, blk, SEEK_SET) != blk) + if (lseek(image_swap_fd, blk, SEEK_SET) != blk) return (errno); len *= secsz; - if (sparse_write(image_fd, buf, len) != len) + if (sparse_write(image_swap_fd, buf, len) != len) return (errno); return (0); } +static void +image_cleanup(void) +{ + + if (image_swap_fd != -1) + close(image_swap_fd); + unlink(image_swap_file); +} + int image_init(void) { const char *tmpdir; - if (atexit(cleanup) == -1) + STAILQ_INIT(&image_chunks); + image_nchunks = 0; + + image_swap_size = 0; + image_swap_pgsz = getpagesize(); + + if (atexit(image_cleanup) == -1) return (errno); if ((tmpdir = getenv("TMPDIR")) == NULL || *tmpdir == '\0') tmpdir = _PATH_TMP; - snprintf(image_tmpfile, sizeof(image_tmpfile), "%s/mkimg-XXXXXX", + snprintf(image_swap_file, sizeof(image_swap_file), "%s/mkimg-XXXXXX", tmpdir); - image_fd = mkstemp(image_tmpfile); - if (image_fd == -1) + image_swap_fd = mkstemp(image_swap_file); + if 
(image_swap_fd == -1) return (errno); return (0); }