Mercurial > illumos > illumos-gate
changeset 13871:a9c12c2c1647
3306 zdb should be able to issue reads in parallel
3321 'zpool reopen' command should be documented in the man page and help message
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Matt Ahrens <matthew.ahrens@delphix.com>
Reviewed by: Christopher Siden <chris.siden@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
author | George Wilson <george.wilson@delphix.com> |
---|---|
date | Fri, 07 Sep 2012 17:29:12 -0700 |
parents | 387db3e6d543 |
children | 20563857eb2d |
files | usr/src/cmd/zdb/zdb.c usr/src/cmd/zpool/zpool_main.c usr/src/lib/libzpool/common/kernel.c usr/src/lib/libzpool/common/sys/zfs_context.h usr/src/man/man1m/zdb.1m usr/src/man/man1m/zpool.1m usr/src/uts/common/fs/zfs/sys/vdev_impl.h usr/src/uts/common/fs/zfs/vdev_disk.c usr/src/uts/common/fs/zfs/vdev_file.c usr/src/uts/common/fs/zfs/zio.c |
diffstat | 10 files changed, 291 insertions(+), 70 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/cmd/zdb/zdb.c Wed Oct 31 05:51:14 2012 -0700 +++ b/usr/src/cmd/zdb/zdb.c Fri Sep 07 17:29:12 2012 -0700 @@ -86,6 +86,7 @@ uint64_t *zopt_object = NULL; int zopt_objects = 0; libzfs_handle_t *g_zfs; +uint64_t max_inflight = 200; /* * These libumem hooks provide a reasonable set of defaults for the allocator's @@ -108,13 +109,14 @@ { (void) fprintf(stderr, "Usage: %s [-CumdibcsDvhLXFPA] [-t txg] [-e [-p path...]] " - "poolname [object...]\n" - " %s [-divPA] [-e -p path...] dataset [object...]\n" - " %s -m [-LXFPA] [-t txg] [-e [-p path...]] " + "[-U config] [-M inflight I/Os] poolname [object...]\n" + " %s [-divPA] [-e -p path...] [-U config] dataset " + "[object...]\n" + " %s -m [-LXFPA] [-t txg] [-e [-p path...]] [-U config] " "poolname [vdev [metaslab...]]\n" " %s -R [-A] [-e [-p path...]] poolname " "vdev:offset:size[:flags]\n" - " %s -S [-PA] [-e [-p path...]] poolname\n" + " %s -S [-PA] [-e [-p path...]] [-U config] poolname\n" " %s -l [-uA] device\n" " %s -C [-A] [-U config]\n\n", cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname); @@ -161,6 +163,8 @@ (void) fprintf(stderr, " -P print numbers in parseable form\n"); (void) fprintf(stderr, " -t <txg> -- highest txg to use when " "searching for uberblocks\n"); + (void) fprintf(stderr, " -M <number of inflight I/Os> -- " + "specify the maximum number of checksumming I/Os [default is 200]"); (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " "to make only that option verbose\n"); (void) fprintf(stderr, "Default is to dump everything non-verbosely\n"); @@ -2028,6 +2032,45 @@ bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0); } +static void +zdb_blkptr_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + blkptr_t *bp = zio->io_bp; + int ioerr = zio->io_error; + zdb_cb_t *zcb = zio->io_private; + zbookmark_t *zb = &zio->io_bookmark; + + zio_data_buf_free(zio->io_data, zio->io_size); + + mutex_enter(&spa->spa_scrub_lock); + spa->spa_scrub_inflight--; + cv_broadcast(&spa->spa_scrub_io_cv); + + if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + char blkbuf[BP_SPRINTF_LEN]; + + zcb->zcb_haderrors = 1; + zcb->zcb_errors[ioerr]++; + + if (dump_opt['b'] >= 2) + sprintf_blkptr(blkbuf, bp); + else + blkbuf[0] = '\0'; + + (void) printf("zdb_blkptr_cb: " + "Got error %d reading " + "<%llu, %llu, %lld, %llx> %s -- skipping\n", + ioerr, + (u_longlong_t)zb->zb_objset, + (u_longlong_t)zb->zb_object, + (u_longlong_t)zb->zb_level, + (u_longlong_t)zb->zb_blkid, + blkbuf); + } + mutex_exit(&spa->spa_scrub_lock); +} + /* ARGSUSED */ static int zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, @@ -2049,39 +2092,23 @@ is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)); if (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata)) { - int ioerr; size_t size = BP_GET_PSIZE(bp); - void *data = malloc(size); + void *data = zio_data_buf_alloc(size); int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; /* If it's an intent log block, failure is expected. */ if (zb->zb_level == ZB_ZIL_LEVEL) flags |= ZIO_FLAG_SPECULATIVE; - ioerr = zio_wait(zio_read(NULL, spa, bp, data, size, - NULL, NULL, ZIO_PRIORITY_ASYNC_READ, flags, zb)); - - free(data); - - if (ioerr && !(flags & ZIO_FLAG_SPECULATIVE)) { - zcb->zcb_haderrors = 1; - zcb->zcb_errors[ioerr]++; - - if (dump_opt['b'] >= 2) - sprintf_blkptr(blkbuf, bp); - else - blkbuf[0] = '\0'; - - (void) printf("zdb_blkptr_cb: " - "Got error %d reading " - "<%llu, %llu, %lld, %llx> %s -- skipping\n", - ioerr, - (u_longlong_t)zb->zb_objset, - (u_longlong_t)zb->zb_object, - (u_longlong_t)zb->zb_level, - (u_longlong_t)zb->zb_blkid, - blkbuf); - } + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight > max_inflight) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + spa->spa_scrub_inflight++; + mutex_exit(&spa->spa_scrub_lock); + + zio_nowait(zio_read(NULL, spa, bp, data, size, + zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb)); + } zcb->zcb_readfails = 0; @@ -2283,6 +2310,18 @@ zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb); + /* + * If we've traversed the data blocks then we need to wait for those + * I/Os to complete. We leverage "The Godfather" zio to wait on + * all async I/Os to complete. + */ + if (dump_opt['c']) { + (void) zio_wait(spa->spa_async_zio_root); + spa->spa_async_zio_root = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_GODFATHER); + } + if (zcb.zcb_haderrors) { (void) printf("\nError counts:\n\n"); (void) printf("\t%5s %s\n", "errno", "count"); @@ -3040,7 +3079,7 @@ dprintf_setup(&argc, argv); - while ((c = getopt(argc, argv, "bcdhilmsuCDRSAFLXevp:t:U:P")) != -1) { + while ((c = getopt(argc, argv, "bcdhilmM:suCDRSAFLXevp:t:U:P")) != -1) { switch (c) { case 'b': case 'c': @@ -3069,6 +3108,15 @@ case 'v': verbose++; break; + case 'M': + max_inflight = strtoull(optarg, NULL, 0); + if (max_inflight == 0) { + (void) fprintf(stderr, "maximum number " + "of inflight I/Os must be greater " + "than 0\n"); + usage(); + } + break; case 'p': if (searchdirs == NULL) { searchdirs = umem_alloc(sizeof (char *),
--- a/usr/src/cmd/zpool/zpool_main.c Wed Oct 31 05:51:14 2012 -0700 +++ b/usr/src/cmd/zpool/zpool_main.c Fri Sep 07 17:29:12 2012 -0700 @@ -238,7 +238,7 @@ case HELP_REMOVE: return (gettext("\tremove <pool> <device> ...\n")); case HELP_REOPEN: - return (""); /* Undocumented command */ + return (gettext("\treopen <pool>\n")); case HELP_SCRUB: return (gettext("\tscrub [-s] <pool> ...\n")); case HELP_STATUS: @@ -3550,22 +3550,37 @@ * zpool reopen <pool> * * Reopen the pool so that the kernel can update the sizes of all vdevs. - * - * NOTE: This command is currently undocumented. If the command is ever - * exposed then the appropriate usage() messages will need to be made. */ int zpool_do_reopen(int argc, char **argv) { + int c; int ret = 0; zpool_handle_t *zhp; char *pool; + /* check options */ + while ((c = getopt(argc, argv, "")) != -1) { + switch (c) { + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + argc--; argv++; - if (argc != 1) - return (2); + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name\n")); + usage(B_FALSE); + } + + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } pool = argv[0]; if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL)
--- a/usr/src/lib/libzpool/common/kernel.c Wed Oct 31 05:51:14 2012 -0700 +++ b/usr/src/lib/libzpool/common/kernel.c Fri Sep 07 17:29:12 2012 -0700 @@ -1012,3 +1012,48 @@ { return (0); } + +void +bioinit(buf_t *bp) +{ + bzero(bp, sizeof (buf_t)); +} + +void +biodone(buf_t *bp) +{ + if (bp->b_iodone != NULL) { + (*(bp->b_iodone))(bp); + return; + } + ASSERT((bp->b_flags & B_DONE) == 0); + bp->b_flags |= B_DONE; +} + +void +bioerror(buf_t *bp, int error) +{ + ASSERT(bp != NULL); + ASSERT(error >= 0); + + if (error != 0) { + bp->b_flags |= B_ERROR; + } else { + bp->b_flags &= ~B_ERROR; + } + bp->b_error = error; +} + + +int +geterror(struct buf *bp) +{ + int error = 0; + + if (bp->b_flags & B_ERROR) { + error = bp->b_error; + if (!error) + error = EIO; + } + return (error); +}
--- a/usr/src/lib/libzpool/common/sys/zfs_context.h Wed Oct 31 05:51:14 2012 -0700 +++ b/usr/src/lib/libzpool/common/sys/zfs_context.h Fri Sep 07 17:29:12 2012 -0700 @@ -603,6 +603,36 @@ extern void cyclic_remove(cyclic_id_t); extern int cyclic_reprogram(cyclic_id_t, hrtime_t); +/* + * Buf structure + */ +#define B_BUSY 0x0001 +#define B_DONE 0x0002 +#define B_ERROR 0x0004 +#define B_READ 0x0040 /* read when I/O occurs */ +#define B_WRITE 0x0100 /* non-read pseudo-flag */ + +typedef struct buf { + int b_flags; + size_t b_bcount; + union { + caddr_t b_addr; + } b_un; + + lldaddr_t _b_blkno; +#define b_lblkno _b_blkno._f + size_t b_resid; + size_t b_bufsize; + int (*b_iodone)(struct buf *); + int b_error; + void *b_private; +} buf_t; + +extern void bioinit(buf_t *); +extern void biodone(buf_t *); +extern void bioerror(buf_t *, int); +extern int geterror(buf_t *); + #ifdef __cplusplus } #endif
--- a/usr/src/man/man1m/zdb.1m Wed Oct 31 05:51:14 2012 -0700 +++ b/usr/src/man/man1m/zdb.1m Fri Sep 07 17:29:12 2012 -0700 @@ -11,6 +11,7 @@ .\" .\" .\" Copyright 2012, Richard Lowe. +.\" Copyright (c) 2012 by Delphix. All rights reserved. .\" .TH "ZDB" "1M" "February 15, 2012" "" "" @@ -19,21 +20,23 @@ .SH "SYNOPSIS" \fBzdb\fR [-CumdibcsDvhLXFPA] [-e [-p \fIpath\fR...]] [-t \fItxg\fR] - \fIpoolname\fR [\fIobject\fR ...] - -.P -\fBzdb\fR [-divPA] [-e [-p \fIpath\fR...]] \fIdataset\fR [\fIobject\fR ...] + [-U \fIcache\fR] [-M \fIinflight I/Os\fR] [\fIpoolname\fR + [\fIobject\fR ...]] .P -\fBzdb\fR -m [-LXFPA] [-t \fItxg\fR] [-e [-p \fIpath\fR...]] \fIpoolname\fR - [\fIvdev\fR [\fImetaslab\fR ...]] +\fBzdb\fR [-divPA] [-e [-p \fIpath\fR...]] [-U \fIcache\fR] + \fIdataset\fR [\fIobject\fR ...] .P -\fBzdb\fR -R [-A] [-e [-p \fIpath\fR...]] \fIpoolname\fR +\fBzdb\fR -m [-LXFPA] [-t \fItxg\fR] [-e [-p \fIpath\fR...]] [-U \fIcache\fR] + \fIpoolname\fR [\fIvdev\fR [\fImetaslab\fR ...]] + +.P +\fBzdb\fR -R [-A] [-e [-p \fIpath\fR...]] [-U \fIcache\fR] \fIpoolname\fR \fIvdev\fR:\fIoffset\fR:\fIsize\fR[:\fIflags\fR] .P -\fBzdb\fR -S [-AP] [-e [-p \fIpath\fR...]] \fIpoolname\fR +\fBzdb\fR -S [-AP] [-e [-p \fIpath\fR...]] [-U \fIcache\fR] \fIpoolname\fR .P \fBzdb\fR -l [-uA] \fIdevice\fR @@ -357,6 +360,18 @@ .sp .ne 2 .na +\fB-M \fIinflight I/Os\fR \fR +.ad +.sp .6 +.RS 4n +Limit the number of outstanding checksum I/Os to the specified value. The +default value is 200. This option affects the performance of the \fB-c\fR +option. +.RE + +.sp +.ne 2 +.na \fB-P\fR .ad .sp .6 @@ -384,8 +399,7 @@ .ad .sp .6 .RS 4n -Use a cache file other than \fB/etc/zfs/zpool.cache\fR. This option is only -valid with \fB-C\fR +Use a cache file other than \fB/etc/zfs/zpool.cache\fR. .RE .sp
--- a/usr/src/man/man1m/zpool.1m Wed Oct 31 05:51:14 2012 -0700 +++ b/usr/src/man/man1m/zpool.1m Fri Sep 07 17:29:12 2012 -0700 @@ -113,6 +113,11 @@ .LP .nf +\fBzpool reopen\fR \fIpool\fR +.fi + +.LP +.nf \fBzpool remove\fR \fIpool\fR \fIdevice\fR ... .fi @@ -1550,8 +1555,18 @@ .ad .sp .6 .RS 4n -Generates a new unique identifier for the pool. You must ensure that all devices in this pool are online and -healthy before performing this action. +Generates a new unique identifier for the pool. You must ensure that all +devices in this pool are online and healthy before performing this action. +.RE + +.sp +.ne 2 +.na +\fB\fBzpool reopen\fR \fIpool\fR +.ad +.sp .6 +.RS 4n +Reopen all the vdevs associated with the pool. .RE .sp
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h Wed Oct 31 05:51:14 2012 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h Fri Sep 07 17:29:12 2012 -0700 @@ -322,6 +322,14 @@ */ extern int zfs_vdev_cache_size; +/* + * The vdev_buf_t is used to translate between zio_t and buf_t, and back again. + */ +typedef struct vdev_buf { + buf_t vb_buf; /* buffer that describes the io */ + zio_t *vb_io; /* pointer back to the original zio_t */ +} vdev_buf_t; + #ifdef __cplusplus } #endif
--- a/usr/src/uts/common/fs/zfs/vdev_disk.c Wed Oct 31 05:51:14 2012 -0700 +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c Fri Sep 07 17:29:12 2012 -0700 @@ -40,11 +40,6 @@ extern ldi_ident_t zfs_li; -typedef struct vdev_disk_buf { - buf_t vdb_buf; - zio_t *vdb_io; -} vdev_disk_buf_t; - static void vdev_disk_hold(vdev_t *vd) { @@ -397,8 +392,8 @@ static void vdev_disk_io_intr(buf_t *bp) { - vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp; - zio_t *zio = vdb->vdb_io; + vdev_buf_t *vb = (vdev_buf_t *)bp; + zio_t *zio = vb->vb_io; /* * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO. @@ -410,7 +405,7 @@ if (zio->io_error == 0 && bp->b_resid != 0) zio->io_error = EIO; - kmem_free(vdb, sizeof (vdev_disk_buf_t)); + kmem_free(vb, sizeof (vdev_buf_t)); zio_interrupt(zio); } @@ -441,7 +436,7 @@ { vdev_t *vd = zio->io_vd; vdev_disk_t *dvd = vd->vdev_tsd; - vdev_disk_buf_t *vdb; + vdev_buf_t *vb; struct dk_callback *dkc; buf_t *bp; int error; @@ -505,10 +500,10 @@ return (ZIO_PIPELINE_CONTINUE); } - vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP); + vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP); - vdb->vdb_io = zio; - bp = &vdb->vdb_buf; + vb->vb_io = zio; + bp = &vb->vb_buf; bioinit(bp); bp->b_flags = B_BUSY | B_NOCACHE |
--- a/usr/src/uts/common/fs/zfs/vdev_file.c Wed Oct 31 05:51:14 2012 -0700 +++ b/usr/src/uts/common/fs/zfs/vdev_file.c Fri Sep 07 17:29:12 2012 -0700 @@ -25,6 +25,7 @@ #include <sys/zfs_context.h> #include <sys/spa.h> +#include <sys/spa_impl.h> #include <sys/vdev_file.h> #include <sys/vdev_impl.h> #include <sys/zio.h> @@ -140,12 +141,55 @@ vd->vdev_tsd = NULL; } +/* + * Implements the interrupt side for file vdev types. This routine will be + * called when the I/O completes allowing us to transfer the I/O to the + * interrupt taskqs. For consistency, the code structure mimics disk vdev + * types. + */ +static void +vdev_file_io_intr(buf_t *bp) +{ + vdev_buf_t *vb = (vdev_buf_t *)bp; + zio_t *zio = vb->vb_io; + + zio->io_error = (geterror(bp) != 0 ? EIO : 0); + if (zio->io_error == 0 && bp->b_resid != 0) + zio->io_error = ENOSPC; + + kmem_free(vb, sizeof (vdev_buf_t)); + zio_interrupt(zio); +} + +static void +vdev_file_io_strategy(void *arg) +{ + buf_t *bp = arg; + vnode_t *vp = bp->b_private; + ssize_t resid; + int error; + + error = vn_rdwr((bp->b_flags & B_READ) ? UIO_READ : UIO_WRITE, + vp, bp->b_un.b_addr, bp->b_bcount, ldbtob(bp->b_lblkno), + UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); + + if (error == 0) { + bp->b_resid = resid; + biodone(bp); + } else { + bioerror(bp, error); + biodone(bp); + } +} + static int vdev_file_io_start(zio_t *zio) { + spa_t *spa = zio->io_spa; vdev_t *vd = zio->io_vd; vdev_file_t *vf = vd->vdev_tsd; - ssize_t resid; + vdev_buf_t *vb; + buf_t *bp; if (zio->io_type == ZIO_TYPE_IOCTL) { /* XXPOLICY */ @@ -166,15 +210,22 @@ return (ZIO_PIPELINE_CONTINUE); } - zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? - UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data, - zio->io_size, zio->io_offset, UIO_SYSSPACE, - 0, RLIM64_INFINITY, kcred, &resid); + vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP); + + vb->vb_io = zio; + bp = &vb->vb_buf; - if (resid != 0 && zio->io_error == 0) - zio->io_error = ENOSPC; + bioinit(bp); + bp->b_flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE); + bp->b_bcount = zio->io_size; + bp->b_un.b_addr = zio->io_data; + bp->b_lblkno = lbtodb(zio->io_offset); + bp->b_bufsize = zio->io_size; + bp->b_private = vf->vf_vnode; + bp->b_iodone = (int (*)())vdev_file_io_intr; - zio_interrupt(zio); + taskq_dispatch_ent(spa->spa_zio_taskq[ZIO_TYPE_FREE][ZIO_TASKQ_ISSUE], + vdev_file_io_strategy, bp, 0, &zio->io_tqent); return (ZIO_PIPELINE_STOP); }
--- a/usr/src/uts/common/fs/zfs/zio.c Wed Oct 31 05:51:14 2012 -0700 +++ b/usr/src/uts/common/fs/zfs/zio.c Fri Sep 07 17:29:12 2012 -0700 @@ -2917,7 +2917,7 @@ * Hand it off to the otherwise-unused claim taskq. */ ASSERT(zio->io_tqent.tqent_next == NULL); - (void) taskq_dispatch_ent( + taskq_dispatch_ent( spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], (task_func_t *)zio_reexecute, zio, 0, &zio->io_tqent);