Mercurial > illumos > illumos-gate
changeset 13790:ac6eff781c67
3112 ztest does not honor ZFS_DEBUG
3113 ztest should use watchpoints to protect frozen arc bufs
3114 some leaked nvlists in zfsdev_ioctl
3115 poll(2) returns prematurely in presence of spurious wakeups
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Matt Amdur <Matt.Amdur@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <chris.siden@delphix.com>
Approved by: Eric Schrock <eric.schrock@delphix.com>
author | Matthew Ahrens <mahrens@delphix.com> |
---|---|
date | Thu, 30 Aug 2012 05:13:49 -0700 |
parents | f0c17d471b7a |
children | 40cea5d62fa3 |
files | usr/src/cmd/ztest/ztest.c usr/src/lib/libzpool/common/sys/zfs_context.h usr/src/uts/common/fs/zfs/arc.c usr/src/uts/common/fs/zfs/dsl_dataset.c usr/src/uts/common/fs/zfs/dsl_synctask.c usr/src/uts/common/fs/zfs/spa_history.c usr/src/uts/common/fs/zfs/spa_misc.c usr/src/uts/common/fs/zfs/sys/arc.h usr/src/uts/common/fs/zfs/sys/zfs_debug.h usr/src/uts/common/fs/zfs/zio.c usr/src/uts/common/io/devpoll.c usr/src/uts/common/os/condvar.c usr/src/uts/common/sys/condvar.h usr/src/uts/common/syscall/poll.c |
diffstat | 14 files changed, 190 insertions(+), 111 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/cmd/ztest/ztest.c Thu Aug 30 03:32:10 2012 -0700 +++ b/usr/src/cmd/ztest/ztest.c Thu Aug 30 05:13:49 2012 -0700 @@ -5835,6 +5835,8 @@ (void) setvbuf(stdout, NULL, _IOLBF, 0); + dprintf_setup(&argc, argv); + if (!ischild) { process_options(argc, argv);
--- a/usr/src/lib/libzpool/common/sys/zfs_context.h Thu Aug 30 03:32:10 2012 -0700 +++ b/usr/src/lib/libzpool/common/sys/zfs_context.h Thu Aug 30 05:13:49 2012 -0700 @@ -61,6 +61,7 @@ #include <atomic.h> #include <dirent.h> #include <time.h> +#include <procfs.h> #include <libsysevent.h> #include <sys/note.h> #include <sys/types.h>
--- a/usr/src/uts/common/fs/zfs/arc.c Thu Aug 30 03:32:10 2012 -0700 +++ b/usr/src/uts/common/fs/zfs/arc.c Thu Aug 30 05:13:49 2012 -0700 @@ -135,6 +135,12 @@ #include <sys/kstat.h> #include <zfs_fletcher.h> +#ifndef _KERNEL +/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ +boolean_t arc_watch = B_FALSE; +int arc_procfd; +#endif + static kmutex_t arc_reclaim_thr_lock; static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ static uint8_t arc_thread_exit; @@ -474,6 +480,7 @@ static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); static int arc_evict_needed(arc_buf_contents_t type); static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes); +static void arc_buf_watch(arc_buf_t *buf); static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); @@ -949,6 +956,50 @@ fletcher_2_native(buf->b_data, buf->b_hdr->b_size, buf->b_hdr->b_freeze_cksum); mutex_exit(&buf->b_hdr->b_freeze_lock); + arc_buf_watch(buf); +} + +#ifndef _KERNEL +typedef struct procctl { + long cmd; + prwatch_t prwatch; +} procctl_t; +#endif + +/* ARGSUSED */ +static void +arc_buf_unwatch(arc_buf_t *buf) +{ +#ifndef _KERNEL + if (arc_watch) { + int result; + procctl_t ctl; + ctl.cmd = PCWATCH; + ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; + ctl.prwatch.pr_size = 0; + ctl.prwatch.pr_wflags = 0; + result = write(arc_procfd, &ctl, sizeof (ctl)); + ASSERT3U(result, ==, sizeof (ctl)); + } +#endif +} + +/* ARGSUSED */ +static void +arc_buf_watch(arc_buf_t *buf) +{ +#ifndef _KERNEL + if (arc_watch) { + int result; + procctl_t ctl; + ctl.cmd = PCWATCH; + ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; + ctl.prwatch.pr_size = buf->b_hdr->b_size; + ctl.prwatch.pr_wflags = WA_WRITE; + result = write(arc_procfd, &ctl, sizeof (ctl)); + ASSERT3U(result, ==, sizeof (ctl)); + } +#endif } void @@ -975,6 +1026,8 @@ } mutex_exit(&buf->b_hdr->b_freeze_lock); + + arc_buf_unwatch(buf); } void @@ -992,6 +1045,7 @@ buf->b_hdr->b_state == 
arc_anon); arc_cksum_compute(buf, B_FALSE); mutex_exit(hash_lock); + } static void @@ -1348,21 +1402,22 @@ * the buffer is placed on l2arc_free_on_write to be freed later. */ static void -arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t), - void *data, size_t size) +arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) { + arc_buf_hdr_t *hdr = buf->b_hdr; + if (HDR_L2_WRITING(hdr)) { l2arc_data_free_t *df; df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); - df->l2df_data = data; - df->l2df_size = size; + df->l2df_data = buf->b_data; + df->l2df_size = hdr->b_size; df->l2df_func = free_func; mutex_enter(&l2arc_free_on_write_mtx); list_insert_head(l2arc_free_on_write, df); mutex_exit(&l2arc_free_on_write_mtx); ARCSTAT_BUMP(arcstat_l2_free_on_write); } else { - free_func(data, size); + free_func(buf->b_data, hdr->b_size); } } @@ -1378,16 +1433,15 @@ arc_buf_contents_t type = buf->b_hdr->b_type; arc_cksum_verify(buf); + arc_buf_unwatch(buf); if (!recycle) { if (type == ARC_BUFC_METADATA) { - arc_buf_data_free(buf->b_hdr, zio_buf_free, - buf->b_data, size); + arc_buf_data_free(buf, zio_buf_free); arc_space_return(size, ARC_SPACE_DATA); } else { ASSERT(type == ARC_BUFC_DATA); - arc_buf_data_free(buf->b_hdr, - zio_data_buf_free, buf->b_data, size); + arc_buf_data_free(buf, zio_data_buf_free); ARCSTAT_INCR(arcstat_data_size, -size); atomic_add_64(&arc_size, -size); } @@ -2556,6 +2610,7 @@ } arc_cksum_compute(buf, B_FALSE); + arc_buf_watch(buf); if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { /* @@ -3113,6 +3168,7 @@ } hdr->b_datacnt -= 1; arc_cksum_verify(buf); + arc_buf_unwatch(buf); mutex_exit(hash_lock);
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c Thu Aug 30 03:32:10 2012 -0700 +++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c Thu Aug 30 05:13:49 2012 -0700 @@ -2302,7 +2302,6 @@ } } } - } void
--- a/usr/src/uts/common/fs/zfs/dsl_synctask.c Thu Aug 30 03:32:10 2012 -0700 +++ b/usr/src/uts/common/fs/zfs/dsl_synctask.c Thu Aug 30 05:13:49 2012 -0700 @@ -230,12 +230,7 @@ dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, void *arg1, void *arg2, int blocks_modified, dmu_tx_t *tx) { - dsl_sync_task_group_t *dstg; - - if (!spa_writeable(dp->dp_spa)) - return; - - dstg = dsl_sync_task_group_create(dp); + dsl_sync_task_group_t *dstg = dsl_sync_task_group_create(dp); dsl_sync_task_create(dstg, checkfunc, syncfunc, arg1, arg2, blocks_modified); dsl_sync_task_group_nowait(dstg, tx);
--- a/usr/src/uts/common/fs/zfs/spa_history.c Thu Aug 30 03:32:10 2012 -0700 +++ b/usr/src/uts/common/fs/zfs/spa_history.c Thu Aug 30 05:13:49 2012 -0700 @@ -303,7 +303,7 @@ dmu_tx_t *tx; nvlist_t *nvarg; - if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) + if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY || !spa_writeable(spa)) return (EINVAL); tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); @@ -439,8 +439,9 @@ /* * If this is part of creating a pool, not everything is * initialized yet, so don't bother logging the internal events. + * Likewise if the pool is not writeable. */ - if (tx->tx_txg == TXG_INITIAL) { + if (tx->tx_txg == TXG_INITIAL || !spa_writeable(spa)) { fnvlist_free(nvl); return; }
--- a/usr/src/uts/common/fs/zfs/spa_misc.c Thu Aug 30 03:32:10 2012 -0700 +++ b/usr/src/uts/common/fs/zfs/spa_misc.c Thu Aug 30 05:13:49 2012 -0700 @@ -1600,6 +1600,18 @@ spa_mode_global = mode; +#ifndef _KERNEL + if (spa_mode_global != FREAD && dprintf_find_string("watch")) { + arc_procfd = open("/proc/self/ctl", O_WRONLY); + if (arc_procfd == -1) { + perror("could not enable watchpoints: " + "opening /proc/self/ctl failed: "); + } else { + arc_watch = B_TRUE; + } + } +#endif + refcount_init(); unique_init(); zio_init();
--- a/usr/src/uts/common/fs/zfs/sys/arc.h Thu Aug 30 03:32:10 2012 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/arc.h Thu Aug 30 05:13:49 2012 -0700 @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _SYS_ARC_H @@ -135,6 +136,11 @@ void l2arc_start(void); void l2arc_stop(void); +#ifndef _KERNEL +extern boolean_t arc_watch; +extern int arc_procfd; +#endif + #ifdef __cplusplus } #endif
--- a/usr/src/uts/common/fs/zfs/sys/zfs_debug.h Thu Aug 30 03:32:10 2012 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/zfs_debug.h Thu Aug 30 05:13:49 2012 -0700 @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _SYS_ZFS_DEBUG_H @@ -75,6 +76,10 @@ extern void zfs_dbgmsg_fini(void); extern void zfs_dbgmsg(const char *fmt, ...); +#ifndef _KERNEL +extern int dprintf_find_string(const char *string); +#endif + #ifdef __cplusplus } #endif
--- a/usr/src/uts/common/fs/zfs/zio.c Thu Aug 30 03:32:10 2012 -0700 +++ b/usr/src/uts/common/fs/zfs/zio.c Thu Aug 30 05:13:49 2012 -0700 @@ -125,11 +125,21 @@ while (p2 & (p2 - 1)) p2 &= p2 - 1; +#ifndef _KERNEL + /* + * If we are using watchpoints, put each buffer on its own page, + * to eliminate the performance overhead of trapping to the + * kernel when modifying a non-watched buffer that shares the + * page with a watched buffer. + */ + if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) + continue; +#endif if (size <= 4 * SPA_MINBLOCKSIZE) { align = SPA_MINBLOCKSIZE; - } else if (P2PHASE(size, PAGESIZE) == 0) { + } else if (IS_P2ALIGNED(size, PAGESIZE)) { align = PAGESIZE; - } else if (P2PHASE(size, p2 >> 2) == 0) { + } else if (IS_P2ALIGNED(size, p2 >> 2)) { align = p2 >> 2; }
--- a/usr/src/uts/common/io/devpoll.c Thu Aug 30 03:32:10 2012 -0700 +++ b/usr/src/uts/common/io/devpoll.c Thu Aug 30 05:13:49 2012 -0700 @@ -23,7 +23,9 @@ * Use is subject to license terms. */ -/* Copyright (c) 2011 by Delphix. All rights reserved. */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ #include <sys/types.h> #include <sys/devops.h> @@ -695,9 +697,15 @@ minor_t minor; dp_entry_t *dpep; pollcache_t *pcp; + hrtime_t now; int error = 0; STRUCT_DECL(dvpoll, dvpoll); + if (cmd == DP_POLL) { + /* do this now, before we sleep on DP_WRITER_PRESENT */ + now = gethrtime(); + } + minor = getminor(dev); mutex_enter(&devpoll_lock); ASSERT(minor < dptblsize); @@ -725,9 +733,7 @@ pollstate_t *ps; nfds_t nfds; int fdcnt = 0; - int time_out; - clock_t *deltap = NULL; - clock_t delta; + hrtime_t deadline = 0; STRUCT_INIT(dvpoll, mode); error = copyin((caddr_t)arg, STRUCT_BUF(dvpoll), @@ -737,18 +743,16 @@ return (EFAULT); } - time_out = STRUCT_FGET(dvpoll, dp_timeout); - if (time_out > 0) { + deadline = STRUCT_FGET(dvpoll, dp_timeout); + if (deadline > 0) { /* - * cv_relwaituntil_sig operates at the tick - * granularity, which by default is 10 ms. - * This results in rounding user specified - * timeouts up but prevents the system - * from being flooded with small high - * resolution timers. + * Convert the deadline from relative milliseconds + * to absolute nanoseconds. They must wait for at + * least a tick. */ - delta = MSEC_TO_TICK_ROUNDUP(time_out); - deltap = &delta; + deadline = deadline * NANOSEC / MILLISEC; + deadline = MAX(deadline, nsec_per_tick); + deadline += now; } if ((nfds = STRUCT_FGET(dvpoll, dp_nfds)) == 0) { @@ -758,16 +762,15 @@ * Do not check for signals if we have a zero timeout. 
*/ DP_REFRELE(dpep); - if (time_out == 0) + if (deadline == 0) return (0); mutex_enter(&curthread->t_delay_lock); - while ((delta = cv_relwaituntil_sig( - &curthread->t_delay_cv, &curthread->t_delay_lock, - deltap, TR_MILLISEC)) > 0) { + while ((error = + cv_timedwait_sig_hrtime(&curthread->t_delay_cv, + &curthread->t_delay_lock, deadline)) > 0) continue; - } mutex_exit(&curthread->t_delay_lock); - return (delta == 0 ? EINTR : 0); + return (error == 0 ? EINTR : 0); } /* @@ -814,21 +817,22 @@ /* * Sleep until we are notified, signaled, or timed out. - * Do not check for signals if we have a zero timeout. */ - if (time_out == 0) /* immediate timeout */ + if (deadline == 0) { + /* immediate timeout; do not check signals */ break; - - delta = cv_relwaituntil_sig(&pcp->pc_cv, &pcp->pc_lock, - deltap, TR_MILLISEC); + } + error = cv_timedwait_sig_hrtime(&pcp->pc_cv, + &pcp->pc_lock, deadline); /* * If we were awakened by a signal or timeout * then break the loop, else poll again. */ - if (delta <= 0) { - if (delta == 0) /* signal */ - error = EINTR; + if (error <= 0) { + error = (error == 0) ? EINTR : 0; break; + } else { + error = 0; } } mutex_exit(&pcp->pc_lock);
--- a/usr/src/uts/common/os/condvar.c Thu Aug 30 03:32:10 2012 -0700 +++ b/usr/src/uts/common/os/condvar.c Thu Aug 30 05:13:49 2012 -0700 @@ -24,7 +24,9 @@ * Use is subject to license terms. */ -/* Copyright (c) 2011 by Delphix. All rights reserved. */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ #include <sys/thread.h> #include <sys/proc.h> @@ -481,6 +483,21 @@ } /* + * Wait until the specified time. + * If tim == -1, waits without timeout using cv_wait_sig_swap(). + */ +int +cv_timedwait_sig_hrtime(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim) +{ + if (tim == -1) { + return (cv_wait_sig_swap(cvp, mp)); + } else { + return (cv_timedwait_sig_hires(cvp, mp, tim, 1, + CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP)); + } +} + +/* * Same as cv_timedwait_sig() except that the third argument is a relative * timeout value, as opposed to an absolute one. There is also a fourth * argument that specifies how accurately the timeout must be implemented. @@ -502,30 +519,6 @@ } /* - * Same as cv_reltimedwait_sig() except that the timeout is optional. If - * there is no timeout then the function will block until woken up - * or interrupted. - */ -clock_t -cv_relwaituntil_sig(kcondvar_t *cvp, kmutex_t *mp, clock_t *delta, - time_res_t res) -{ - /* - * If there is no timeout specified wait indefinitely for a - * signal or a wakeup. - */ - if (delta == NULL) { - return (cv_wait_sig_swap(cvp, mp)); - } - - /* - * cv_reltimedwait_sig will wait for the relative timeout - * specified by delta. - */ - return (cv_reltimedwait_sig(cvp, mp, *delta, res)); -} - -/* * Like cv_wait_sig_swap but allows the caller to indicate (with a * non-NULL sigret) that they will take care of signalling the cv * after wakeup, if necessary. This is a vile hack that should only @@ -766,6 +759,10 @@ * so the caller can return a premature timeout to the calling process * so it can reevaluate the situation in light of the new system time. 
* (The system clock has been reset if timecheck != timechanged.) + * + * Generally, cv_timedwait_sig_hrtime() should be used instead of this + * routine. It waits based on hrtime rather than wall-clock time and therefore + * does not need to deal with the time changing. */ int cv_waituntil_sig(kcondvar_t *cvp, kmutex_t *mp,
--- a/usr/src/uts/common/sys/condvar.h Thu Aug 30 03:32:10 2012 -0700 +++ b/usr/src/uts/common/sys/condvar.h Thu Aug 30 05:13:49 2012 -0700 @@ -23,7 +23,9 @@ * Use is subject to license terms. */ -/* Copyright (c) 2011 by Delphix. All rights reserved. */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ /* * condvar.h: @@ -95,10 +97,9 @@ extern clock_t cv_reltimedwait(kcondvar_t *, kmutex_t *, clock_t, time_res_t); extern int cv_wait_sig(kcondvar_t *, kmutex_t *); extern clock_t cv_timedwait_sig(kcondvar_t *, kmutex_t *, clock_t); +extern int cv_timedwait_sig_hrtime(kcondvar_t *, kmutex_t *, hrtime_t); extern clock_t cv_reltimedwait_sig(kcondvar_t *, kmutex_t *, clock_t, time_res_t); -extern clock_t cv_relwaituntil_sig(kcondvar_t *, kmutex_t *, clock_t *, - time_res_t); extern int cv_wait_sig_swap(kcondvar_t *, kmutex_t *); extern int cv_wait_sig_swap_core(kcondvar_t *, kmutex_t *, int *); extern void cv_signal(kcondvar_t *);
--- a/usr/src/uts/common/syscall/poll.c Thu Aug 30 03:32:10 2012 -0700 +++ b/usr/src/uts/common/syscall/poll.c Thu Aug 30 05:13:49 2012 -0700 @@ -24,11 +24,14 @@ * Use is subject to license terms. */ -/* Copyright (c) 2011 by Delphix. All rights reserved. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ /* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +/* * Portions of this source code were derived from Berkeley 4.3 BSD * under license from the Regents of the University of California. */ @@ -288,9 +291,7 @@ proc_t *p = ttoproc(t); int fdcnt = 0; int i; - int imm_timeout = 0; - clock_t *deltap = NULL; - clock_t delta; + hrtime_t deadline; /* hrtime value when we want to return */ pollfd_t *pollfdp; pollstate_t *ps; pollcache_t *pcp; @@ -301,24 +302,15 @@ /* * Determine the precise future time of the requested timeout, if any. */ - if (tsp != NULL) { - if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) { - imm_timeout = 1; - } else { - /* - * cv_relwaituntil_sig operates at - * the tick granularity, which by default is 10 ms. - * Convert the specified timespec to ticks, rounding - * up to at least 1 tick to avoid flooding the - * system with small high resolution timers. - */ - delta = SEC_TO_TICK(tsp->tv_sec) + - NSEC_TO_TICK(tsp->tv_nsec); - if (delta < 1) { - delta = 1; - } - deltap = &delta; - } + if (tsp == NULL) { + deadline = -1; + } else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) { + deadline = 0; + } else { + /* They must wait at least a tick. */ + deadline = tsp->tv_sec * NANOSEC + tsp->tv_nsec; + deadline = MAX(deadline, nsec_per_tick); + deadline += gethrtime(); } /* @@ -351,16 +343,15 @@ /* * Sleep until we have passed the requested future * time or until interrupted by a signal. - * Do not check for signals if we have a zero timeout. + * Do not check for signals if we do not want to wait. 
*/ - if (!imm_timeout) { + if (deadline != 0) { mutex_enter(&t->t_delay_lock); - while ((delta = cv_relwaituntil_sig(&t->t_delay_cv, - &t->t_delay_lock, deltap, TR_MILLISEC)) > 0) + while ((error = cv_timedwait_sig_hrtime(&t->t_delay_cv, + &t->t_delay_lock, deadline)) > 0) continue; mutex_exit(&t->t_delay_lock); - if (delta == 0) - error = EINTR; + error = (error == 0) ? EINTR : 0; } goto pollout; } @@ -550,20 +541,19 @@ * Do not check for signals if we have a zero timeout. */ mutex_exit(&ps->ps_lock); - if (imm_timeout) { - delta = -1; + if (deadline == 0) { + error = -1; } else { - delta = cv_relwaituntil_sig(&pcp->pc_cv, &pcp->pc_lock, - deltap, TR_MILLISEC); + error = cv_timedwait_sig_hrtime(&pcp->pc_cv, + &pcp->pc_lock, deadline); } mutex_exit(&pcp->pc_lock); /* * If we have received a signal or timed out * then break out and return. */ - if (delta <= 0) { - if (delta == 0) - error = EINTR; + if (error <= 0) { + error = (error == 0) ? EINTR : 0; break; } /*