changeset 11173:87f3734e64df

6881015 ZFS write activity prevents other threads from running in a timely manner
6899867 mstate_thread_onproc_time() doesn't account for runnable time correctly
PSARC/2009/615 System Duty Cycle Scheduling Class and ZFS IO Observability
author Jonathan Adams <Jonathan.Adams@Sun.COM>
date Mon, 23 Nov 2009 15:29:44 -0800
parents a792f425ae2e
children ff18993837d6
files usr/src/Targetdirs usr/src/cmd/dispadmin/Makefile usr/src/cmd/dispadmin/sdcdispadmin.c usr/src/cmd/priocntl/Makefile usr/src/cmd/priocntl/sdcpriocntl.c usr/src/lib/libzpool/common/kernel.c usr/src/lib/libzpool/common/sys/zfs_context.h usr/src/lib/libzpool/common/taskq.c usr/src/pkgdefs/SUNWckr/prototype_i386 usr/src/pkgdefs/SUNWckr/prototype_sparc usr/src/pkgdefs/SUNWcsu/prototype_com usr/src/pkgdefs/SUNWhea/prototype_com usr/src/uts/common/Makefile.files usr/src/uts/common/brand/lx/os/lx_pid.c usr/src/uts/common/disp/cpupart.c usr/src/uts/common/disp/disp.c usr/src/uts/common/disp/priocntl.c usr/src/uts/common/disp/sysdc.c usr/src/uts/common/disp/thread.c usr/src/uts/common/fs/proc/prcontrol.c usr/src/uts/common/fs/vfs.c usr/src/uts/common/fs/zfs/spa.c usr/src/uts/common/fs/zfs/spa_misc.c usr/src/uts/common/fs/zfs/sys/spa_impl.h usr/src/uts/common/fs/zfs/zio.c usr/src/uts/common/os/condvar.c usr/src/uts/common/os/exit.c usr/src/uts/common/os/fork.c usr/src/uts/common/os/lwp.c usr/src/uts/common/os/main.c usr/src/uts/common/os/mem_cage.c usr/src/uts/common/os/msacct.c usr/src/uts/common/os/pid.c usr/src/uts/common/os/sig.c usr/src/uts/common/os/taskq.c usr/src/uts/common/os/vm_pageout.c usr/src/uts/common/os/zone.c usr/src/uts/common/sys/Makefile usr/src/uts/common/sys/class.h usr/src/uts/common/sys/debug.h usr/src/uts/common/sys/param.h usr/src/uts/common/sys/proc.h usr/src/uts/common/sys/sysdc.h usr/src/uts/common/sys/sysdc_impl.h usr/src/uts/common/sys/taskq.h usr/src/uts/common/sys/taskq_impl.h usr/src/uts/common/sys/vmsystm.h usr/src/uts/i86pc/os/startup.c usr/src/uts/i86pc/vm/vm_machdep.c usr/src/uts/intel/Makefile.intel.shared usr/src/uts/intel/SDC/Makefile usr/src/uts/intel/ia32/ml/modstubs.s usr/src/uts/sparc/Makefile.sparc.shared usr/src/uts/sparc/SDC/Makefile usr/src/uts/sparc/ml/modstubs.s usr/src/uts/sparc/zfs/Makefile usr/src/uts/sun4/os/startup.c usr/src/uts/sun4/vm/vm_dep.c
diffstat 58 files changed, 2915 insertions(+), 426 deletions(-)
--- a/usr/src/Targetdirs	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/Targetdirs	Mon Nov 23 15:29:44 2009 -0800
@@ -216,6 +216,7 @@
 	/usr/lib/class/FX \
 	/usr/lib/class/IA \
 	/usr/lib/class/RT \
+	/usr/lib/class/SDC \
 	/usr/lib/class/TS \
 	/usr/lib/crypto \
 	/usr/lib/drv \
--- a/usr/src/cmd/dispadmin/Makefile	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/cmd/dispadmin/Makefile	Mon Nov 23 15:29:44 2009 -0800
@@ -28,12 +28,13 @@
 PROG= dispadmin
 MANIFEST= scheduler.xml
 SVCMETHOD= svc-scheduler
+SDC=  SDC$(PROG)
 RT=   RT$(PROG)
 TS=   TS$(PROG)
 IA=   IA$(PROG)
 FSS=  FSS$(PROG)
 FX=   FX$(PROG)
-PROGS= $(PROG) $(RT) $(TS) $(IA) $(FSS) $(FX)
+PROGS= $(PROG) $(RT) $(TS) $(IA) $(FSS) $(FX) $(SDC)
 
 include ../Makefile.cmd
 
@@ -41,38 +42,43 @@
 
 ROOTDIR=	$(ROOT)/usr/lib/class
 ROOTDIRS=	$(ROOTDIR)	\
-		$(ROOTDIR)/RT	\
-		$(ROOTDIR)/TS	\
+		$(ROOTDIR)/FSS	\
+		$(ROOTDIR)/FX	\
 		$(ROOTDIR)/IA	\
-		$(ROOTDIR)/FSS	\
-		$(ROOTDIR)/FX
+		$(ROOTDIR)/RT	\
+		$(ROOTDIR)/SDC	\
+		$(ROOTDIR)/TS
+
 ROOTPROG=	$(PROG:%=$(ROOTUSRSBIN)/%)
-ROOTRT=		$(RT:%=$(ROOTDIR)/RT/%)
-ROOTTS=		$(TS:%=$(ROOTDIR)/TS/%)
-ROOTIA=		$(IA:%=$(ROOTDIR)/IA/%)
 ROOTFSS=	$(FSS:%=$(ROOTDIR)/FSS/%)
 ROOTFX=		$(FX:%=$(ROOTDIR)/FX/%)
+ROOTIA=		$(IA:%=$(ROOTDIR)/IA/%)
+ROOTRT=		$(RT:%=$(ROOTDIR)/RT/%)
+ROOTSDC=	$(SDC:%=$(ROOTDIR)/SDC/%)
+ROOTTS=		$(TS:%=$(ROOTDIR)/TS/%)
 ROOTMANIFESTDIR=	$(ROOTSVCSYSTEM)
 
 # this would be simpler if we renamed rtdispadmin.c and tsdispadmin.c
 OBJECTS= $(PROG).o rt$(PROG).o ts$(PROG).o ia$(PROG).o \
-		fss$(PROG).o fx$(PROG).o subr.o
+		fss$(PROG).o fx$(PROG).o sdc$(PROG).o subr.o
 
 # conditional assignments, because of above names
 $(PROG):=	OBJ= $(PROG).o
-$(RT):=		OBJ= rt$(PROG).o
-$(TS):=		OBJ= ts$(PROG).o
-$(IA):=		OBJ= ia$(PROG).o
 $(FSS):=	OBJ= fss$(PROG).o
 $(FX):=		OBJ= fx$(PROG).o
+$(IA):=		OBJ= ia$(PROG).o
+$(RT):=		OBJ= rt$(PROG).o
+$(SDC):=	OBJ= sdc$(PROG).o
+$(TS):=		OBJ= ts$(PROG).o
 
 # install rules
 $(ROOTDIR)/% \
-$(ROOTDIR)/RT/% \
+$(ROOTDIR)/FSS/% \
+$(ROOTDIR)/FX/% \
 $(ROOTDIR)/IA/% \
-$(ROOTDIR)/TS/% \
-$(ROOTDIR)/FSS/% \
-$(ROOTDIR)/FX/% : %
+$(ROOTDIR)/RT/% \
+$(ROOTDIR)/SDC/% \
+$(ROOTDIR)/TS/% : %
 	$(INS.file)
 
 .KEEP_STATE:
@@ -89,7 +95,7 @@
 lint :=	LDLIBS += -L. -lsubr
 
 install: all $(ROOTPROG) $(ROOTRT) $(ROOTTS) $(ROOTIA) $(ROOTFSS) $(ROOTFX) \
-		$(ROOTMANIFEST) $(ROOTSVCMETHOD)
+		$(ROOTSDC) $(ROOTMANIFEST) $(ROOTSVCMETHOD)
 
 # Don't re-install directories already installed by Targetdirs
 #$(ROOTDIRS):
@@ -102,10 +108,11 @@
  
 lint:	llib-lsubr.ln
 	$(LINT.c) dispadmin.c $(LDLIBS)
-	$(LINT.c) rtdispadmin.c $(LDLIBS)
-	$(LINT.c) tsdispadmin.c $(LDLIBS)
-	$(LINT.c) iadispadmin.c $(LDLIBS)
 	$(LINT.c) fssdispadmin.c $(LDLIBS)
 	$(LINT.c) fxdispadmin.c $(LDLIBS)
+	$(LINT.c) iadispadmin.c $(LDLIBS)
+	$(LINT.c) rtdispadmin.c $(LDLIBS)
+	$(LINT.c) sdcdispadmin.c $(LDLIBS)
+	$(LINT.c) tsdispadmin.c $(LDLIBS)
  
 include ../Makefile.targ
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/cmd/dispadmin/sdcdispadmin.c	Mon Nov 23 15:29:44 2009 -0800
@@ -0,0 +1,73 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include	<errno.h>
+#include	<stdio.h>
+#include	<stdlib.h>
+#include	<string.h>
+#include	<sys/param.h>
+#include	<sys/priocntl.h>
+#include	<sys/types.h>
+
+#include	"dispadmin.h"
+
+static char usage[] = "usage:	dispadmin -l\n";
+
+int
+main(int argc, char *argv[])
+{
+	int lflag = 0;
+	int c;
+
+	while ((c = getopt(argc, argv, "lc:")) != -1) {
+		switch (c) {
+
+		case 'l':
+			lflag++;
+			break;
+
+		case 'c':
+			if (strcmp(optarg, "SDC") != 0)
+				fatalerr("error: %s executed for %s class, "
+				    "%s is actually sub-command for %s class\n",
+				    argv[0], optarg, argv[0], "SDC");
+
+			fatalerr("error: no scheduling-class specific options"
+			    " for SDC\n");
+			break;
+
+		case '?':
+			fatalerr(usage);
+		default:
+			break;
+		}
+	}
+
+	if (!lflag)
+		fatalerr(usage);
+
+	(void) printf("SDC\t(System Duty-Cycle Class)\n");
+	return (0);
+}
--- a/usr/src/cmd/priocntl/Makefile	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/cmd/priocntl/Makefile	Mon Nov 23 15:29:44 2009 -0800
@@ -29,7 +29,7 @@
 
 PRIOCNTLSRC= $(PRIOCNTLOBJ:%.o=%.c)
 OBJS= $(PRIOCNTLOBJ) rt$(PROG).o ts$(PROG).o ia$(PROG).o fss$(PROG).o \
-	fx$(PROG).o
+	fx$(PROG).o sdc$(PROG).o
 SRCS= $(OBJ:%.o=%.c)
 
 include ../Makefile.cmd
@@ -38,19 +38,22 @@
 
 CLASSD = $(ROOTLIB)/class
 RTD = $(CLASSD)/RT
+SDCD = $(CLASSD)/SDC
 TSD = $(CLASSD)/TS
 IAD = $(CLASSD)/IA
 FSSD = $(CLASSD)/FSS
 FXD = $(CLASSD)/FX
-DIRS = $(CLASSD) $(RTD) $(TSD) $(IAD) $(FSSD) $(FXD)
+DIRS = $(CLASSD) $(RTD) $(SDCD) $(TSD) $(IAD) $(FSSD) $(FXD)
 
 RTPROG = RT$(PROG)
+SDCPROG = SDC$(PROG)
 TSPROG = TS$(PROG)
 IAPROG = IA$(PROG)
 FSSPROG = FSS$(PROG)
 FXPROG = FX$(PROG)
 
 ROOTRTPROG = $(RTD)/$(RTPROG)
+ROOTSDCPROG = $(SDCD)/$(SDCPROG)
 ROOTTSPROG = $(TSD)/$(TSPROG)
 ROOTIAPROG = $(IAD)/$(IAPROG)
 ROOTFSSPROG = $(FSSD)/$(FSSPROG)
@@ -58,12 +61,15 @@
 
 $(ROOTUSRSBINPROG) := FILEMODE = 04555
 $(DIRS) := FILEMODE = 0775
-CLOBBERFILES += $(RTPROG) $(TSPROG) $(IAPROG) $(FSSPROG) $(FXPROG)
+CLOBBERFILES += $(RTPROG) $(SDCPROG) $(TSPROG) $(IAPROG) $(FSSPROG) $(FXPROG)
 
 # installation rules
 $(RTD)/% : %
 	$(INS.file)
 
+$(SDCD)/% : %
+	$(INS.file)
+
 $(TSD)/% : %
 	$(INS.file)
 
@@ -78,7 +84,7 @@
 
 .KEEP_STATE:
 
-all: $(PROG) $(RTPROG) $(TSPROG) $(IAPROG) $(FSSPROG) $(FXPROG)
+all: $(PROG) $(RTPROG) $(SDCPROG) $(TSPROG) $(IAPROG) $(FSSPROG) $(FXPROG)
 
 $(PROG): $(PRIOCNTLOBJ)
 	$(LINK.c) $(PRIOCNTLOBJ) -o $@ $(LDLIBS)
@@ -88,6 +94,10 @@
 	$(LINK.c) rt$(PRIOCNTLOBJ) -o $@ $(LDLIBS)
 	$(POST_PROCESS)
 
+$(SDCPROG): sdc$(PRIOCNTLOBJ)
+	$(LINK.c) sdc$(PRIOCNTLOBJ) -o $@ $(LDLIBS)
+	$(POST_PROCESS)
+
 $(TSPROG): ts$(PRIOCNTLOBJ)
 	$(LINK.c) ts$(PRIOCNTLOBJ) -o $@ $(LDLIBS)
 	$(POST_PROCESS)
@@ -107,6 +117,7 @@
 install: all $(DIRS) 	\
 	$(ROOTPROG)	\
 	$(ROOTRTPROG)	\
+	$(ROOTSDCPROG)	\
 	$(ROOTTSPROG)	\
 	$(ROOTIAPROG)	\
 	$(ROOTFSSPROG)	\
@@ -121,6 +132,7 @@
 lint:
 	$(LINT.c) $(PRIOCNTLSRC) $(LDLIBS)
 	$(LINT.c) rt$(PRIOCNTLSRC) $(LDLIBS)
+	$(LINT.c) sdc$(PRIOCNTLSRC) $(LDLIBS)
 	$(LINT.c) ts$(PRIOCNTLSRC) $(LDLIBS)
 	$(LINT.c) ia$(PRIOCNTLSRC) $(LDLIBS)
 	$(LINT.c) fss$(PRIOCNTLSRC) $(LDLIBS)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/cmd/priocntl/sdcpriocntl.c	Mon Nov 23 15:29:44 2009 -0800
@@ -0,0 +1,73 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include	<errno.h>
+#include	<stdio.h>
+#include	<stdlib.h>
+#include	<string.h>
+#include	<sys/param.h>
+#include	<sys/priocntl.h>
+#include	<sys/types.h>
+
+#include	"priocntl.h"
+
+static char usage[] = "usage:	priocntl -l\n";
+
+int
+main(int argc, char *argv[])
+{
+	int lflag = 0;
+	int c;
+
+	while ((c = getopt(argc, argv, "lc:")) != -1) {
+		switch (c) {
+
+		case 'l':
+			lflag++;
+			break;
+
+		case 'c':
+			if (strcmp(optarg, "SDC") != 0)
+				fatalerr("error: %s executed for %s class, "
+				    "%s is actually sub-command for %s class\n",
+				    argv[0], optarg, argv[0], "SDC");
+
+			fatalerr("error: no scheduling-class specific options"
+			    " for SDC\n");
+			break;
+
+		case '?':
+			fatalerr(usage);
+		default:
+			break;
+		}
+	}
+
+	if (!lflag)
+		fatalerr(usage);
+
+	(void) printf("SDC\t(System Duty-Cycle Class)\n");
+	return (0);
+}
--- a/usr/src/lib/libzpool/common/kernel.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/lib/libzpool/common/kernel.c	Mon Nov 23 15:29:44 2009 -0800
@@ -50,6 +50,9 @@
 	"userland", "libzpool", "1", "1", "na"
 };
 
+/* this only exists to have its address taken */
+struct proc p0;
+
 /*
  * =========================================================================
  * threads
--- a/usr/src/lib/libzpool/common/sys/zfs_context.h	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/lib/libzpool/common/sys/zfs_context.h	Mon Nov 23 15:29:44 2009 -0800
@@ -197,6 +197,18 @@
 #define	thread_create(stk, stksize, func, arg, len, pp, state, pri)	\
 	zk_thread_create(func, arg)
 #define	thread_exit() thr_exit(NULL)
+#define	thread_join(t)	panic("libzpool cannot join threads")
+
+#define	newproc(f, a, cid, pri, ctp, pid)	(ENOSYS)
+
+/* in libzpool, p0 exists only to have its address taken */
+struct proc {
+	uintptr_t	this_is_never_used_dont_dereference_it;
+};
+
+extern struct proc p0;
+
+#define	PS_NONE		-1
 
 extern kthread_t *zk_thread_create(void (*func)(), void *arg);
 
@@ -319,15 +331,21 @@
 #define	TASKQ_PREPOPULATE	0x0001
 #define	TASKQ_CPR_SAFE		0x0002	/* Use CPR safe protocol */
 #define	TASKQ_DYNAMIC		0x0004	/* Use dynamic thread scheduling */
-#define	TASKQ_THREADS_CPU_PCT	0x0008	/* Use dynamic thread scheduling */
+#define	TASKQ_THREADS_CPU_PCT	0x0008	/* Scale # threads by # cpus */
+#define	TASKQ_DC_BATCH		0x0010	/* Mark threads as batch */
 
 #define	TQ_SLEEP	KM_SLEEP	/* Can block for memory */
 #define	TQ_NOSLEEP	KM_NOSLEEP	/* cannot block for memory; may fail */
-#define	TQ_NOQUEUE	0x02	/* Do not enqueue if can't dispatch */
+#define	TQ_NOQUEUE	0x02		/* Do not enqueue if can't dispatch */
+#define	TQ_FRONT	0x08		/* Queue in front */
 
 extern taskq_t *system_taskq;
 
 extern taskq_t	*taskq_create(const char *, int, pri_t, int, int, uint_t);
+#define	taskq_create_proc(a, b, c, d, e, p, f) \
+	    (taskq_create(a, b, c, d, e, f))
+#define	taskq_create_sysdc(a, b, d, e, p, dc, f) \
+	    (taskq_create(a, b, maxclsyspri, d, e, f))
 extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
 extern void	taskq_destroy(taskq_t *);
 extern void	taskq_wait(taskq_t *);
--- a/usr/src/lib/libzpool/common/taskq.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/lib/libzpool/common/taskq.c	Mon Nov 23 15:29:44 2009 -0800
@@ -114,8 +114,13 @@
 		mutex_exit(&tq->tq_lock);
 		return (0);
 	}
-	t->task_next = &tq->tq_task;
-	t->task_prev = tq->tq_task.task_prev;
+	if (tqflags & TQ_FRONT) {
+		t->task_next = tq->tq_task.task_next;
+		t->task_prev = &tq->tq_task;
+	} else {
+		t->task_next = &tq->tq_task;
+		t->task_prev = tq->tq_task.task_prev;
+	}
 	t->task_next->task_prev = t;
 	t->task_prev->task_next = t;
 	t->task_func = func;
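
The libzpool taskq hunk above implements the new TQ_FRONT flag by linking a new task just after the circular list's dummy head (the front of the queue) instead of just before it (the back). A minimal standalone sketch of that insertion pattern, using a hypothetical node type rather than the libzpool structures:

/* Illustrative only -- hypothetical node type, not the libzpool code. */
typedef struct node {
	struct node *next;
	struct node *prev;
} node_t;

/* head is a dummy node: head->next is the front, head->prev is the back. */
static void
enqueue(node_t *head, node_t *t, int at_front)
{
	if (at_front) {			/* TQ_FRONT-style insertion */
		t->next = head->next;
		t->prev = head;
	} else {			/* default: append at the back */
		t->next = head;
		t->prev = head->prev;
	}
	t->next->prev = t;
	t->prev->next = t;
}

Either way, the two final assignments splice the node in between its new neighbors, which is why the hunk only needs to change how task_next and task_prev are initialized.
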
--- a/usr/src/pkgdefs/SUNWckr/prototype_i386	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/pkgdefs/SUNWckr/prototype_i386	Mon Nov 23 15:29:44 2009 -0800
@@ -239,6 +239,7 @@
 f none kernel/misc/strplumb 755 root sys
 f none kernel/misc/tem 755 root sys
 f none kernel/misc/tlimod 755 root sys
+f none kernel/sched/SDC 755 root sys
 f none kernel/sched/TS 755 root sys
 f none kernel/sched/TS_DPTBL 755 root sys
 l none kernel/strmod/arp=../../kernel/drv/arp
@@ -459,6 +460,7 @@
 f none kernel/misc/amd64/tem 755 root sys
 f none kernel/misc/amd64/tlimod 755 root sys
 d none kernel/sched/amd64 755 root sys
+f none kernel/sched/amd64/SDC 755 root sys
 f none kernel/sched/amd64/TS 755 root sys
 f none kernel/sched/amd64/TS_DPTBL 755 root sys
 d none kernel/strmod/amd64 755 root sys
--- a/usr/src/pkgdefs/SUNWckr/prototype_sparc	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/pkgdefs/SUNWckr/prototype_sparc	Mon Nov 23 15:29:44 2009 -0800
@@ -220,6 +220,7 @@
 f none kernel/misc/sparcv9/tlimod 755 root sys
 f none kernel/misc/sparcv9/tem 755 root sys
 d none kernel/sched/sparcv9 755 root sys
+f none kernel/sched/sparcv9/SDC 755 root sys
 f none kernel/sched/sparcv9/TS 755 root sys
 f none kernel/sched/sparcv9/TS_DPTBL 755 root sys
 d none kernel/strmod/sparcv9 755 root sys
--- a/usr/src/pkgdefs/SUNWcsu/prototype_com	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/pkgdefs/SUNWcsu/prototype_com	Mon Nov 23 15:29:44 2009 -0800
@@ -355,6 +355,9 @@
 d none usr/lib/class/RT 755 root bin
 f none usr/lib/class/RT/RTdispadmin 555 root bin
 f none usr/lib/class/RT/RTpriocntl 555 root bin
+d none usr/lib/class/SDC 755 root bin
+f none usr/lib/class/SDC/SDCdispadmin 555 root bin
+f none usr/lib/class/SDC/SDCpriocntl 555 root bin
 d none usr/lib/class/TS 755 root bin
 f none usr/lib/class/TS/TSdispadmin 555 root bin
 f none usr/lib/class/TS/TSpriocntl 555 root bin
--- a/usr/src/pkgdefs/SUNWhea/prototype_com	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/pkgdefs/SUNWhea/prototype_com	Mon Nov 23 15:29:44 2009 -0800
@@ -1299,6 +1299,8 @@
 f none usr/include/sys/sysconf.h 644 root bin
 f none usr/include/sys/sysconfig.h 644 root bin
 f none usr/include/sys/sysconfig_impl.h 644 root bin
+f none usr/include/sys/sysdc.h 644 root bin
+f none usr/include/sys/sysdc_impl.h 644 root bin
 d none usr/include/sys/sysevent 755 root bin
 f none usr/include/sys/sysevent/ap_driver.h 644 root bin
 f none usr/include/sys/sysevent/domain.h 644 root bin
--- a/usr/src/uts/common/Makefile.files	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/Makefile.files	Mon Nov 23 15:29:44 2009 -0800
@@ -1444,6 +1444,8 @@
 #
 #			scheduling class modules
 #
+SDC_OBJS +=		sysdc.o
+
 RT_OBJS +=		rt.o
 RT_DPTBL_OBJS +=	rt_dptbl.o
 
--- a/usr/src/uts/common/brand/lx/os/lx_pid.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/brand/lx/os/lx_pid.c	Mon Nov 23 15:29:44 2009 -0800
@@ -19,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/sysmacros.h>
@@ -119,7 +117,7 @@
 		/*
 		 * Allocate a pid for any thread other than the first
 		 */
-		if ((newpid = pid_allocate(p, 0)) < 0)
+		if ((newpid = pid_allocate(p, 0, 0)) < 0)
 			return (-1);
 
 		pidp = pid_find(newpid);
--- a/usr/src/uts/common/disp/cpupart.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/disp/cpupart.c	Mon Nov 23 15:29:44 2009 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -681,6 +681,11 @@
 		if (boundcpu != NULL && boundcpu->cpu_part != cp)
 			return (EBUSY);
 	}
+
+	if (tp->t_cid == sysdccid) {
+		return (EINVAL);	/* For now, sysdc threads can't move */
+	}
+
 	return (0);
 }
 
--- a/usr/src/uts/common/disp/disp.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/disp/disp.c	Mon Nov 23 15:29:44 2009 -0800
@@ -1765,11 +1765,16 @@
 	}
 }
 
-
 /*
  *	Make a thread give up its processor.  Find the processor on
  *	which this thread is executing, and have that processor
  *	preempt.
+ *
+ *	We allow System Duty Cycle (SDC) threads to be preempted even if
+ *	they are running at kernel priorities.  To implement this, we always
+ *	set cpu_kprunrun; this ensures preempt() will be called.  Since SDC
+ *	calls cpu_surrender() very often, we only preempt if there is anyone
+ *	competing with us.
  */
 void
 cpu_surrender(kthread_t *tp)
@@ -1789,9 +1794,16 @@
 	if (max_pri < max_run_pri)
 		max_pri = max_run_pri;
 
-	cpup->cpu_runrun = 1;
-	if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
-		cpup->cpu_kprunrun = 1;
+	if (tp->t_cid == sysdccid) {
+		uint_t t_pri = DISP_PRIO(tp);
+		if (t_pri > max_pri)
+			return;		/* we are not competing w/ anyone */
+		cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
+	} else {
+		cpup->cpu_runrun = 1;
+		if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
+			cpup->cpu_kprunrun = 1;
+		}
 	}
 
 	/*
@@ -1816,7 +1828,6 @@
 	    "cpu_surrender:tid %p cpu %p", tp, cpup);
 }
 
-
 /*
  * Commit to and ratify a scheduling decision
  */
--- a/usr/src/uts/common/disp/priocntl.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/disp/priocntl.c	Mon Nov 23 15:29:44 2009 -0800
@@ -20,15 +20,13 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
 /*	  All Rights Reserved  	*/
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/sysmacros.h>
@@ -713,7 +711,7 @@
 
 		if (getcid(clname, &classid) != 0)
 			return (set_errno(EINVAL));
-		if (classid == syscid)
+		if (CLASS_KERNEL(classid))
 			return (set_errno(EINVAL));
 		defaultcid = classid;
 		ASSERT(defaultcid > 0 && defaultcid < loaded_classes);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/disp/sysdc.c	Mon Nov 23 15:29:44 2009 -0800
@@ -0,0 +1,1328 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * The System Duty Cycle (SDC) scheduling class
+ * --------------------------------------------
+ *
+ * Background
+ *
+ * Kernel threads in Solaris have traditionally not been large consumers
+ * of CPU time.  They typically wake up, perform a small amount of
+ * work, then go back to sleep waiting for either a timeout or another
+ * signal.  On the assumption that the small amount of work that they do
+ * is important for the behavior of the whole system, these threads are
+ * treated kindly by the dispatcher and the SYS scheduling class: they run
+ * without preemption from anything other than real-time and interrupt
+ * threads; when preempted, they are put at the front of the queue, so they
+ * generally do not migrate between CPUs; and they are allowed to stay
+ * running until they voluntarily give up the CPU.
+ *
+ * As Solaris has evolved, new workloads have emerged which require the
+ * kernel to perform significant amounts of CPU-intensive work.  One
+ * example of such a workload is ZFS's transaction group sync processing.
+ * Each sync operation generates a large batch of I/Os, and each I/O
+ * may need to be compressed and/or checksummed before it is written to
+ * storage.  The taskq threads which perform the compression and checksums
+ * will run nonstop as long as they have work to do; a large sync operation
+ * on a compression-heavy dataset can keep them busy for seconds on end.
+ * This causes human-time-scale dispatch latency bubbles for any other
+ * threads which have the misfortune to share a CPU with the taskq threads.
+ *
+ * The SDC scheduling class is a solution to this problem.
+ *
+ *
+ * Overview
+ *
+ * SDC is centered around the concept of a thread's duty cycle (DC):
+ *
+ *			      ONPROC time
+ *	Duty Cycle =	----------------------
+ *			ONPROC + Runnable time
+ *
+ * This is the ratio of the time that the thread spent running on a CPU
+ * divided by the time it spent running or trying to run.  It is unaffected
+ * by any time the thread spent sleeping, stopped, etc.
+ *
+ * A thread joining the SDC class specifies a "target" DC that it wants
+ * to run at.  To implement this policy, the routine sysdc_update() scans
+ * the list of active SDC threads every few ticks and uses each thread's
+ * microstate data to compute the actual duty cycle that that thread
+ * has experienced recently.  If the thread is under its target DC, its
+ * priority is increased to the maximum available (sysdc_maxpri, which is
+ * 99 by default).  If the thread is over its target DC, its priority is
+ * reduced to the minimum available (sysdc_minpri, 0 by default).  This
+ * is a fairly primitive approach, in that it doesn't use any of the
+ * intermediate priorities, but it's not completely inappropriate.  Even
+ * though threads in the SDC class might take a while to do their job, they
+ * are by some definition important if they're running inside the kernel,
+ * so it is reasonable that they should get to run at priority 99.
+ *
+ * If a thread is running when sysdc_update() calculates its actual duty
+ * cycle, and there are other threads of equal or greater priority on its
+ * CPU's dispatch queue, sysdc_update() preempts that thread.  The thread
+ * acknowledges the preemption by calling sysdc_preempt(), which calls
+ * setbackdq(), which gives other threads with the same priority a chance
+ * to run.  This creates a de facto time quantum for threads in the SDC
+ * scheduling class.
+ *
+ * An SDC thread which is assigned priority 0 can continue to run if
+ * nothing else needs to use the CPU that it's running on.  Similarly, an
+ * SDC thread at priority 99 might not get to run as much as it wants to
+ * if there are other priority-99 or higher threads on its CPU.  These
+ * situations would cause the thread to get ahead of or behind its target
+ * DC; the longer the situations lasted, the further ahead or behind the
+ * thread would get.  Rather than condemning a thread to a lifetime of
+ * paying for its youthful indiscretions, SDC keeps "base" values for
+ * ONPROC and Runnable times in each thread's sysdc data, and updates these
+ * values periodically.  The duty cycle is then computed using the elapsed
+ * amount of ONPROC and Runnable times since those base times.
+ *
+ * Since sysdc_update() scans SDC threads fairly frequently, it tries to
+ * keep the list of "active" threads small by pruning out threads which
+ * have been asleep for a brief time.  They are not pruned immediately upon
+ * going to sleep, since some threads may bounce back and forth between
+ * sleeping and being runnable.
+ *
+ *
+ * Interfaces
+ *
+ * void sysdc_thread_enter(t, dc, flags)
+ *
+ *	Moves a kernel thread from the SYS scheduling class to the
+ *	SDC class. t must have an associated LWP (created by calling
+ *	lwp_kernel_create()).  The thread will have a target DC of dc.
+ *	Flags should be either 0 or SYSDC_THREAD_BATCH.  If
+ *	SYSDC_THREAD_BATCH is specified, the thread will run with a
+ *	slightly lower priority (see "Batch threads", below).
+ *
+ *
+ * Complications
+ *
+ * - Run queue balancing
+ *
+ *	The Solaris dispatcher is biased towards letting a thread run
+ *	on the same CPU which it last ran on, if no more than 3 ticks
+ *	(i.e. rechoose_interval) have passed since the thread last ran.
+ *	This helps to preserve cache warmth.  On the other hand, it also
+ *	tries to keep the per-CPU run queues fairly balanced; if the CPU
+ *	chosen for a runnable thread has a run queue which is three or
+ *	more threads longer than a neighboring CPU's queue, the runnable
+ *	thread is dispatched onto the neighboring CPU instead.
+ *
+ *	These policies work well for some workloads, but not for many SDC
+ *	threads.  The taskq client of SDC, for example, has many discrete
+ *	units of work to do.  The work units are largely independent, so
+ *	cache warmth is not an important consideration.  It is important
+ *	that the threads fan out quickly to different CPUs, since the
+ *	amount of work these threads have to do (a few seconds worth at a
+ *	time) doesn't leave much time to correct thread placement errors
+ *	(i.e. two SDC threads being dispatched to the same CPU).
+ *
+ *	To fix this, SDC uses the TS_RUNQMATCH flag introduced for FSS.
+ *	This tells the dispatcher to keep neighboring run queues' lengths
+ *	more evenly matched, which allows SDC threads to migrate more
+ *	easily.
+ *
+ * - LWPs and system processes
+ *
+ *	SDC can only be used for kernel threads.  Since SDC uses microstate
+ *	accounting data to compute each thread's actual duty cycle, all
+ *	threads entering the SDC class must have associated LWPs (which
+ *	store the microstate data).  This means that the threads have to
+ *	be associated with an SSYS process, i.e. one created by newproc().
+ *	If the microstate accounting information is ever moved into the
+ *	kthread_t, this restriction could be lifted.
+ *
+ * - Dealing with oversubscription
+ *
+ *	Since SDC duty cycles are per-thread, it is possible that the
+ *	aggregate requested duty cycle of all SDC threads in a processor
+ *	set could be greater than the total CPU time available in that set.
+ *	The FSS scheduling class has an analogous situation, which it deals
+ *	with by reducing each thread's allotted CPU time proportionally.
+ *	Since SDC doesn't need to be as precise as FSS, it uses a simpler
+ *	solution to the oversubscription problem.
+ *
+ *	sysdc_update() accumulates the amount of time that max-priority SDC
+ *	threads have spent on-CPU in each processor set, and uses that sum
+ *	to create an implied duty cycle for that processor set:
+ *
+ *				accumulated CPU time
+ *	   pset DC =	-----------------------------------
+ *			 (# CPUs) * time since last update
+ *
+ *	If this implied duty cycle is above a maximum pset duty cycle (90%
+ *	by default), sysdc_update() sets the priority of all SDC threads
+ *	in that processor set to sysdc_minpri for a "break" period.  After
+ *	the break period, it waits for a "nobreak" period before trying to
+ *	enforce the pset duty cycle limit again.
+ *
+ * - Processor sets
+ *
+ *	As the above implies, SDC is processor set aware, but it does not
+ *	currently allow threads to change processor sets while in the SDC
+ *	class.  Instead, those threads must join the desired processor set
+ *	before entering SDC. [1]
+ *
+ * - Batch threads
+ *
+ *	A thread joining the SDC class can specify the SDC_THREAD_BATCH
+ *	flag.  This flag causes the maximum priority for that thread to be
+ *	reduced (by default, the maximum is reduced by 1).  This allows
+ *	longer-running, batch-oriented SDC threads to be interrupted by
+ *	more immediate, higher-priority work.
+ *
+ * - t_kpri_req
+ *
+ *	The TS and FSS scheduling classes pay attention to t_kpri_req,
+ *	which provides a simple form of priority inheritance for
+ *	synchronization primitives (such as rwlocks held as READER) which
+ *	cannot be traced to a unique thread.  The SDC class does not honor
+ *	t_kpri_req, for a few reasons:
+ *
+ *	1.  t_kpri_req is notoriously inaccurate.  A measure of its
+ *	    inaccuracy is that it needs to be cleared every time a thread
+ *	    returns to user mode, because it is frequently non-zero at that
+ *	    point.  This can happen because "ownership" of synchronization
+ *	    primitives that use t_kpri_req can be silently handed off,
+ *	    leaving no opportunity to will the t_kpri_req inheritance.
+ *
+ *	2.  Unlike in TS and FSS, threads in SDC *will* eventually run at
+ *	    kernel priority.  This means that even if an SDC thread
+ *	    is holding a synchronization primitive and running at low
+ *	    priority, its priority will eventually be raised above 60,
+ *	    allowing it to drive on and release the resource.
+ *
+ *	3.  The first consumer of SDC uses the taskq subsystem, which holds
+ *	    a reader lock for the duration of the task's execution.  This
+ *	    would mean that SDC threads would never drop below kernel
+ *	    priority in practice, which defeats one of the purposes of SDC.
+ *
+ * - Why not FSS?
+ *
+ *	It might seem that the existing FSS scheduling class could solve
+ *	the problems that SDC is attempting to solve.  FSS's more precise
+ *	solution to the oversubscription problem would hardly cause
+ *	trouble, as long as it performed well.  SDC is implemented as
+ *	a separate scheduling class for two main reasons: the initial
+ *	consumer of SDC does not map well onto the "project" abstraction
+ *	that is central to FSS, and FSS does not expect to run at kernel
+ *	priorities.
+ *
+ *
+ * Tunables
+ *
+ * - sysdc_batch_niceness:  The amount below sysdc_maxpri that
+ *	SDC_THREAD_BATCH threads should use as their per-thread
+ *	maximum priority.
+ *
+ * - sysdc_update_interval_msec:  Number of milliseconds between
+ *	consecutive thread priority updates.
+ *
+ * - sysdc_reset_interval_msec:  Number of milliseconds between
+ *	consecutive resets of a thread's base ONPROC and Runnable
+ *	times.
+ *
+ * - sysdc_prune_interval_msec:  Number of milliseconds of sleeping
+ *	before a thread is pruned from the active list.
+ *
+ * - sysdc_max_pset_DC:  Allowable percentage of a processor set's
+ *	CPU time which SDC can give to its high-priority threads.
+ *
+ * - sysdc_break_msec:  Number of milliseconds of "break" taken when
+ *	sysdc_max_pset_DC is exceeded.
+ *
+ *
+ * Future work (in SDC and related subsystems)
+ *
+ * - Per-thread rechoose interval (0 for SDC)
+ *
+ *	Allow each thread to specify its own rechoose interval.  SDC
+ *	threads would specify an interval of zero, which would rechoose
+ *	the CPU with the lowest priority once per update.
+ *
+ * - Allow threads to change processor sets after joining the SDC class
+ *
+ * - Thread groups and per-group DC
+ *
+ *	It might be nice to be able to specify a duty cycle which applies
+ *	to a group of threads in aggregate.
+ *
+ * - Per-group DC callback to allow dynamic DC tuning
+ *
+ *	Currently, DCs are assigned when the thread joins SDC.  Some
+ *	workloads could benefit from being able to tune their DC using
+ *	subsystem-specific knowledge about the workload.
+ *
+ * - Finer-grained priority updates
+ *
+ * - More nuanced management of oversubscription
+ *
+ * - Moving other CPU-intensive threads into SDC
+ *
+ * - Move msacct data into kthread_t
+ *
+ *	This would allow kernel threads without LWPs to join SDC.
+ *
+ *
+ * Footnotes
+ *
+ * [1] The details of doing so are left as an exercise for the reader.
+ */
+
+#include <sys/types.h>
+#include <sys/sysdc.h>
+#include <sys/sysdc_impl.h>
+
+#include <sys/class.h>
+#include <sys/cmn_err.h>
+#include <sys/cpuvar.h>
+#include <sys/cpupart.h>
+#include <sys/debug.h>
+#include <sys/disp.h>
+#include <sys/errno.h>
+#include <sys/inline.h>
+#include <sys/kmem.h>
+#include <sys/modctl.h>
+#include <sys/schedctl.h>
+#include <sys/sdt.h>
+#include <sys/sunddi.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/var.h>
+
+/*
+ * Tunables - loaded into the internal state at module load time
+ */
+uint_t		sysdc_update_interval_msec = 20;
+uint_t		sysdc_reset_interval_msec = 400;
+uint_t		sysdc_prune_interval_msec = 100;
+uint_t		sysdc_max_pset_DC = 90;
+uint_t		sysdc_break_msec = 80;
+pri_t		sysdc_batch_niceness = 1;
+
+/*
+ * Internal state - constants set up by sysdc_initparam()
+ */
+static clock_t	sysdc_update_ticks;	/* ticks between updates */
+static uint_t	sysdc_prune_updates;	/* updates asleep before pruning */
+static uint_t	sysdc_reset_updates;	/* # of updates before reset */
+static uint_t	sysdc_break_updates;	/* updates to break */
+static uint_t	sysdc_nobreak_updates;	/* updates to not check */
+static uint_t	sysdc_minDC;		/* minimum allowed DC */
+static uint_t	sysdc_maxDC;		/* maximum allowed DC */
+static pri_t	sysdc_minpri;		/* minimum allowed priority */
+static pri_t	sysdc_maxpri;		/* maximum allowed priority */
+
+/*
+ * Internal state
+ */
+static kmutex_t	sysdc_pset_lock;	/* lock protecting pset data */
+static list_t	sysdc_psets;		/* list of psets with SDC threads */
+static uint_t	sysdc_param_init;	/* sysdc_initparam() has been called */
+static uint_t	sysdc_update_timeout_started; /* update timeout is active */
+static hrtime_t	sysdc_last_update;	/* time of last sysdc_update() */
+static sysdc_t	sysdc_dummy;		/* used to terminate active lists */
+
+/*
+ * Internal state - active hash table
+ */
+#define	SYSDC_NLISTS	8
+#define	SYSDC_HASH(sdc)	(((uintptr_t)(sdc) >> 6) & (SYSDC_NLISTS - 1))
+static sysdc_list_t	sysdc_active[SYSDC_NLISTS];
+#define	SYSDC_LIST(sdc)		(&sysdc_active[SYSDC_HASH(sdc)])
+
+#ifdef DEBUG
+static struct {
+	uint64_t	sysdc_update_times_asleep;
+	uint64_t	sysdc_update_times_base_ran_backwards;
+	uint64_t	sysdc_update_times_already_done;
+	uint64_t	sysdc_update_times_cur_ran_backwards;
+	uint64_t	sysdc_compute_pri_breaking;
+	uint64_t	sysdc_activate_enter;
+	uint64_t	sysdc_update_enter;
+	uint64_t	sysdc_update_exited;
+	uint64_t	sysdc_update_not_sdc;
+	uint64_t	sysdc_update_idle;
+	uint64_t	sysdc_update_take_break;
+	uint64_t	sysdc_update_no_psets;
+	uint64_t	sysdc_tick_not_sdc;
+	uint64_t	sysdc_tick_quantum_expired;
+	uint64_t	sysdc_thread_enter_enter;
+} sysdc_stats;
+
+#define	SYSDC_INC_STAT(x)	(sysdc_stats.x++)
+#else
+#define	SYSDC_INC_STAT(x)	((void)0)
+#endif
+
+/* macros are UPPER CASE */
+#define	HOWMANY(a, b)	howmany((a), (b))
+#define	MSECTOTICKS(a)	HOWMANY((a) * 1000, usec_per_tick)
+
+static void
+sysdc_initparam(void)
+{
+	uint_t sysdc_break_ticks;
+
+	/* update / prune intervals */
+	sysdc_update_ticks = MSECTOTICKS(sysdc_update_interval_msec);
+
+	sysdc_prune_updates = HOWMANY(sysdc_prune_interval_msec,
+	    sysdc_update_interval_msec);
+	sysdc_reset_updates = HOWMANY(sysdc_reset_interval_msec,
+	    sysdc_update_interval_msec);
+
+	/* We must get at least a little time on CPU. */
+	sysdc_minDC = 1;
+	sysdc_maxDC = SYSDC_DC_MAX;
+	sysdc_minpri = 0;
+	sysdc_maxpri = maxclsyspri;
+
+	/* break parameters */
+	if (sysdc_max_pset_DC > SYSDC_DC_MAX) {
+		sysdc_max_pset_DC = SYSDC_DC_MAX;
+	}
+	sysdc_break_ticks = MSECTOTICKS(sysdc_break_msec);
+	sysdc_break_updates = HOWMANY(sysdc_break_ticks, sysdc_update_ticks);
+
+	/*
+	 * We want:
+	 *
+	 *	sysdc_max_pset_DC = (nobreak / (break + nobreak))
+	 *
+	 *	==>	  nobreak = sysdc_max_pset_DC * (break + nobreak)
+	 *
+	 *			    sysdc_max_pset_DC * break
+	 *	==>	  nobreak = -------------------------
+	 *			    1 - sysdc_max_pset_DC
+	 */
+	sysdc_nobreak_updates =
+	    HOWMANY((uint64_t)sysdc_break_updates * sysdc_max_pset_DC,
+	    (SYSDC_DC_MAX - sysdc_max_pset_DC));
+
+	sysdc_param_init = 1;
+}
+
+#undef HOWMANY
+#undef MSECTOTICKS
+
+#define	SDC_UPDATE_INITIAL	0x1	/* for the initial update */
+#define	SDC_UPDATE_TIMEOUT	0x2	/* from sysdc_update() */
+#define	SDC_UPDATE_TICK		0x4	/* from sysdc_tick(), on expiry */
+
+/*
+ * Updates the recorded times in the sdc, and returns the elapsed ONPROC
+ * and Runnable times since the last reset.
+ *
+ * newO is the thread's actual ONPROC time; it's used during sysdc_update()
+ * to track processor set usage.
+ */
+static void
+sysdc_update_times(sysdc_t *sdc, uint_t flags,
+    hrtime_t *O, hrtime_t *R, hrtime_t *newO)
+{
+	kthread_t *const t = sdc->sdc_thread;
+	const uint_t	initial = (flags & SDC_UPDATE_INITIAL);
+	const uint_t	update = (flags & SDC_UPDATE_TIMEOUT);
+	const clock_t	now = ddi_get_lbolt();
+	uint_t		do_reset;
+
+	ASSERT(THREAD_LOCK_HELD(t));
+
+	*O = *R = 0;
+
+	/* If we've been sleeping, we know we haven't had any ONPROC time. */
+	if (sdc->sdc_sleep_updates != 0 &&
+	    sdc->sdc_sleep_updates != sdc->sdc_nupdates) {
+		*newO = sdc->sdc_last_base_O;
+		SYSDC_INC_STAT(sysdc_update_times_asleep);
+		return;
+	}
+
+	/*
+	 * If this is our first update, or we've hit the reset point,
+	 * we need to reset our base_{O,R}.  Once we've updated them, we
+	 * report O and R for the entire prior interval.
+	 */
+	do_reset = initial;
+	if (update) {
+		++sdc->sdc_nupdates;
+		if ((sdc->sdc_nupdates % sysdc_reset_updates) == 0)
+			do_reset = 1;
+	}
+	if (do_reset) {
+		hrtime_t baseO, baseR;
+		if (initial) {
+			/*
+			 * Start off our cycle count somewhere in the middle,
+			 * to keep the resets from all happening at once.
+			 *
+			 * 4999 is a handy prime much larger than
+			 * sysdc_reset_updates, so that we don't run into
+			 * trouble if the resolution is a multiple of
+			 * sysdc_reset_updates.
+			 */
+			sdc->sdc_nupdates = (uint_t)((gethrtime() % 4999) %
+			    sysdc_reset_updates);
+			baseO = baseR = 0;
+		} else {
+			baseO = sdc->sdc_base_O;
+			baseR = sdc->sdc_base_R;
+		}
+
+		mstate_systhread_times(t, &sdc->sdc_base_O, &sdc->sdc_base_R);
+		*newO = sdc->sdc_base_O;
+
+		sdc->sdc_reset = now;
+		sdc->sdc_pri_check = -1; /* force mismatch below */
+
+		/*
+		 * See below for rationale.
+		 */
+		if (baseO > sdc->sdc_base_O || baseR > sdc->sdc_base_R) {
+			SYSDC_INC_STAT(sysdc_update_times_base_ran_backwards);
+			baseO = sdc->sdc_base_O;
+			baseR = sdc->sdc_base_R;
+		}
+
+		/* compute based on the entire interval */
+		*O = (sdc->sdc_base_O - baseO);
+		*R = (sdc->sdc_base_R - baseR);
+		return;
+	}
+
+	/*
+	 * If we're called from sysdc_update(), we *must* return a value
+	 * for newO, so we always call mstate_systhread_times().
+	 *
+	 * Otherwise, if we've already done a pri check this tick,
+	 * we can skip it.
+	 */
+	if (!update && sdc->sdc_pri_check == now) {
+		SYSDC_INC_STAT(sysdc_update_times_already_done);
+		return;
+	}
+
+	/* Get the current times from the thread */
+	sdc->sdc_pri_check = now;
+	mstate_systhread_times(t, &sdc->sdc_cur_O, &sdc->sdc_cur_R);
+	*newO = sdc->sdc_cur_O;
+
+	/*
+	 * The updating of microstate accounting is not done under a
+	 * consistent set of locks, particularly the t_waitrq field.  This
+	 * can lead to narrow windows in which we account for time in the
+	 * wrong bucket, which on the next read will be accounted for
+	 * correctly.
+	 *
+	 * If our sdc_base_* fields were affected by one of these blips, we
+	 * throw away the old data, and pretend this tick didn't happen.
+	 */
+	if (sdc->sdc_cur_O < sdc->sdc_base_O ||
+	    sdc->sdc_cur_R < sdc->sdc_base_R) {
+
+		sdc->sdc_base_O = sdc->sdc_cur_O;
+		sdc->sdc_base_R = sdc->sdc_cur_R;
+
+		SYSDC_INC_STAT(sysdc_update_times_cur_ran_backwards);
+		return;
+	}
+
+	*O = sdc->sdc_cur_O - sdc->sdc_base_O;
+	*R = sdc->sdc_cur_R - sdc->sdc_base_R;
+}
+
+/*
+ * sysdc_compute_pri()
+ *
+ *	Recomputes the priority of the thread, leaving the result in
+ *	sdc->sdc_epri.  Returns 1 if a priority update should occur
+ *	(which will also trigger a cpu_surrender()), otherwise
+ *	returns 0.
+ */
+static uint_t
+sysdc_compute_pri(sysdc_t *sdc, uint_t flags)
+{
+	kthread_t *const t = sdc->sdc_thread;
+	const uint_t	update = (flags & SDC_UPDATE_TIMEOUT);
+	const uint_t	tick = (flags & SDC_UPDATE_TICK);
+
+	hrtime_t	O, R;
+	hrtime_t	newO = -1;
+
+	ASSERT(THREAD_LOCK_HELD(t));
+
+	sysdc_update_times(sdc, flags, &O, &R, &newO);
+	ASSERT(!update || newO != -1);
+
+	/* If we have new data, recompute our priority. */
+	if ((O + R) != 0) {
+		sdc->sdc_cur_DC = (O * SYSDC_DC_MAX) / (O + R);
+
+		/* Adjust our priority to move our DC closer to the target. */
+		if (sdc->sdc_cur_DC < sdc->sdc_target_DC)
+			sdc->sdc_pri = sdc->sdc_maxpri;
+		else
+			sdc->sdc_pri = sdc->sdc_minpri;
+	}
+
+	/*
+	 * If our per-pset duty cycle goes over the max, we will take a break.
+	 * This forces all sysdc threads in the pset to minimum priority, in
+	 * order to let everyone else have a chance at the CPU.
+	 */
+	if (sdc->sdc_pset->sdp_need_break) {
+		SYSDC_INC_STAT(sysdc_compute_pri_breaking);
+		sdc->sdc_epri = sdc->sdc_minpri;
+	} else {
+		sdc->sdc_epri = sdc->sdc_pri;
+	}
+
+	DTRACE_PROBE4(sysdc__compute__pri,
+	    kthread_t *, t, pri_t, sdc->sdc_epri, uint_t, sdc->sdc_cur_DC,
+	    uint_t, sdc->sdc_target_DC);
+
+	/*
+	 * For sysdc_update(), we compute the ONPROC time for high-priority
+	 * threads, which is used to calculate the per-pset duty cycle.  We
+	 * will always tell our callers to update the thread's priority,
+	 * since we want to force a cpu_surrender().
+	 *
+	 * We reset sdc_update_ticks so that sysdc_tick() will only update
+	 * the thread's priority if our timeout is delayed by a tick or
+	 * more.
+	 */
+	if (update) {
+		/* SDC threads are not allowed to change cpupart bindings. */
+		ASSERT(t->t_cpupart == sdc->sdc_pset->sdp_cpupart);
+
+		/* If we were at MAXPRI, account for our onproc time. */
+		if (t->t_pri == sdc->sdc_maxpri &&
+		    sdc->sdc_last_base_O != 0 &&
+		    sdc->sdc_last_base_O < newO) {
+			sdc->sdc_last_O = newO - sdc->sdc_last_base_O;
+			sdc->sdc_pset->sdp_onproc_time +=
+			    (uint64_t)sdc->sdc_last_O;
+			sdc->sdc_pset->sdp_onproc_threads++;
+		} else {
+			sdc->sdc_last_O = 0;
+		}
+		sdc->sdc_last_base_O = newO;
+
+		sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks + 1;
+		return (1);
+	}
+
+	/*
+	 * Like sysdc_update(), sysdc_tick() always wants to update the
+	 * thread's priority, so that the CPU is surrendered if necessary.
+	 * We reset sdc_update_ticks so that if the timeout continues to be
+	 * delayed, we'll update at the regular interval.
+	 */
+	if (tick) {
+		ASSERT(sdc->sdc_ticks == sdc->sdc_update_ticks);
+		sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks;
+		return (1);
+	}
+
+	/*
+	 * Otherwise, only tell our callers to update the priority if it has
+	 * changed.
+	 */
+	return (sdc->sdc_epri != t->t_pri);
+}
+
+static void
+sysdc_update_pri(sysdc_t *sdc, uint_t flags)
+{
+	kthread_t *t = sdc->sdc_thread;
+
+	ASSERT(THREAD_LOCK_HELD(t));
+
+	if (sysdc_compute_pri(sdc, flags)) {
+		if (!thread_change_pri(t, sdc->sdc_epri, 0)) {
+			cpu_surrender(t);
+		}
+	}
+}
+
+/*
+ * Add a thread onto the active list.  It will only be removed by
+ * sysdc_update().
+ */
+static void
+sysdc_activate(sysdc_t *sdc)
+{
+	sysdc_t *volatile *headp = &SYSDC_LIST(sdc)->sdl_list;
+	sysdc_t		*head;
+	kthread_t	*t = sdc->sdc_thread;
+
+	SYSDC_INC_STAT(sysdc_activate_enter);
+
+	ASSERT(sdc->sdc_next == NULL);
+	ASSERT(THREAD_LOCK_HELD(t));
+
+	do {
+		head = *headp;
+		sdc->sdc_next = head;
+	} while (atomic_cas_ptr(headp, head, sdc) != head);
+}
+
+/*
+ * sysdc_update() has two jobs:
+ *
+ *	1. It updates the priorities of all active SDC threads on the system.
+ *	2. It measures pset CPU usage and enforces sysdc_max_pset_DC.
+ */
+static void
+sysdc_update(void *arg)
+{
+	int		idx;
+	sysdc_t		*freelist = NULL;
+	sysdc_pset_t	*cur;
+	hrtime_t	now, diff;
+	uint_t		redeploy = 1;
+
+	SYSDC_INC_STAT(sysdc_update_enter);
+
+	ASSERT(sysdc_update_timeout_started);
+
+	/*
+	 * If this is our first time through, diff will be gigantic, and
+	 * no breaks will be necessary.
+	 */
+	now = gethrtime();
+	diff = now - sysdc_last_update;
+	sysdc_last_update = now;
+
+	mutex_enter(&sysdc_pset_lock);
+	for (cur = list_head(&sysdc_psets); cur != NULL;
+	    cur = list_next(&sysdc_psets, cur)) {
+		boolean_t breaking = (cur->sdp_should_break != 0);
+
+		if (cur->sdp_need_break != breaking) {
+			DTRACE_PROBE2(sdc__pset__break, sysdc_pset_t *, cur,
+			    boolean_t, breaking);
+		}
+		cur->sdp_onproc_time = 0;
+		cur->sdp_onproc_threads = 0;
+		cur->sdp_need_break = breaking;
+	}
+	mutex_exit(&sysdc_pset_lock);
+
+	for (idx = 0; idx < SYSDC_NLISTS; idx++) {
+		sysdc_list_t		*sdl = &sysdc_active[idx];
+		sysdc_t *volatile	*headp = &sdl->sdl_list;
+		sysdc_t			*head, *tail;
+		sysdc_t			**prevptr;
+
+		if (*headp == &sysdc_dummy)
+			continue;
+
+		/* Prevent any threads from exiting while we're poking them. */
+		mutex_enter(&sdl->sdl_lock);
+
+		/*
+		 * Each sdl_list contains a singly-linked list of active
+		 * threads. Threads which become active while we are
+		 * processing the list will be added to sdl_list.  Since we
+		 * don't want that to interfere with our own processing, we
+		 * swap in an empty list.  Any newly active threads will
+		 * go on to this empty list.  When finished, we'll put any
+		 * such threads at the end of the processed list.
+		 */
+		head = atomic_swap_ptr(headp, &sysdc_dummy);
+		prevptr = &head;
+		while (*prevptr != &sysdc_dummy) {
+			sysdc_t		*const	sdc = *prevptr;
+			kthread_t	*const	t = sdc->sdc_thread;
+
+			/*
+			 * If the thread has exited, move its sysdc_t onto
+			 * freelist, to be freed later.
+			 */
+			if (t == NULL) {
+				*prevptr = sdc->sdc_next;
+				SYSDC_INC_STAT(sysdc_update_exited);
+				sdc->sdc_next = freelist;
+				freelist = sdc;
+				continue;
+			}
+
+			thread_lock(t);
+			if (t->t_cid != sysdccid) {
+				thread_unlock(t);
+				prevptr = &sdc->sdc_next;
+				SYSDC_INC_STAT(sysdc_update_not_sdc);
+				continue;
+			}
+			ASSERT(t->t_cldata == sdc);
+
+			/*
+			 * If the thread has been sleeping for longer
+			 * than sysdc_prune_interval, make it inactive by
+			 * removing it from the list.
+			 */
+			if (!(t->t_state & (TS_RUN | TS_ONPROC)) &&
+			    sdc->sdc_sleep_updates != 0 &&
+			    (sdc->sdc_sleep_updates - sdc->sdc_nupdates) >
+			    sysdc_prune_updates) {
+				*prevptr = sdc->sdc_next;
+				SYSDC_INC_STAT(sysdc_update_idle);
+				sdc->sdc_next = NULL;
+				thread_unlock(t);
+				continue;
+			}
+			sysdc_update_pri(sdc, SDC_UPDATE_TIMEOUT);
+			thread_unlock(t);
+
+			prevptr = &sdc->sdc_next;
+		}
+
+		/*
+		 * Add our list to the bucket, putting any new entries
+		 * added while we were working at the tail of the list.
+		 */
+		do {
+			tail = *headp;
+			*prevptr = tail;
+		} while (atomic_cas_ptr(headp, tail, head) != tail);
+
+		mutex_exit(&sdl->sdl_lock);
+	}
+
+	mutex_enter(&sysdc_pset_lock);
+	for (cur = list_head(&sysdc_psets); cur != NULL;
+	    cur = list_next(&sysdc_psets, cur)) {
+
+		cur->sdp_vtime_last_interval =
+		    diff * cur->sdp_cpupart->cp_ncpus;
+		cur->sdp_DC_last_interval =
+		    (cur->sdp_onproc_time * SYSDC_DC_MAX) /
+		    cur->sdp_vtime_last_interval;
+
+		if (cur->sdp_should_break > 0) {
+			cur->sdp_should_break--;	/* breaking */
+			continue;
+		}
+		if (cur->sdp_dont_break > 0) {
+			cur->sdp_dont_break--;	/* waiting before checking */
+			continue;
+		}
+		if (cur->sdp_DC_last_interval > sysdc_max_pset_DC) {
+			cur->sdp_should_break = sysdc_break_updates;
+			cur->sdp_dont_break = sysdc_nobreak_updates;
+			SYSDC_INC_STAT(sysdc_update_take_break);
+		}
+	}
+
+	/*
+	 * If there are no sysdc_psets, there can be no threads, so
+	 * we can stop doing our timeout.  Since we're holding the
+	 * sysdc_pset_lock, no new sysdc_psets can come in, which will
+	 * prevent anyone from racing with this and dropping our timeout
+	 * on the floor.
+	 */
+	if (list_is_empty(&sysdc_psets)) {
+		SYSDC_INC_STAT(sysdc_update_no_psets);
+		ASSERT(sysdc_update_timeout_started);
+		sysdc_update_timeout_started = 0;
+
+		redeploy = 0;
+	}
+	mutex_exit(&sysdc_pset_lock);
+
+	while (freelist != NULL) {
+		sysdc_t *cur = freelist;
+		freelist = cur->sdc_next;
+		kmem_free(cur, sizeof (*cur));
+	}
+
+	if (redeploy) {
+		(void) timeout(sysdc_update, arg, sysdc_update_ticks);
+	}
+}
+
+static void
+sysdc_preempt(kthread_t *t)
+{
+	ASSERT(t == curthread);
+	ASSERT(THREAD_LOCK_HELD(t));
+
+	setbackdq(t);		/* give others a chance to run */
+}
+
+static void
+sysdc_tick(kthread_t *t)
+{
+	sysdc_t *sdc;
+
+	thread_lock(t);
+	if (t->t_cid != sysdccid) {
+		SYSDC_INC_STAT(sysdc_tick_not_sdc);
+		thread_unlock(t);
+		return;
+	}
+	sdc = t->t_cldata;
+	if (t->t_state == TS_ONPROC &&
+	    t->t_pri < t->t_disp_queue->disp_maxrunpri) {
+		cpu_surrender(t);
+	}
+
+	if (t->t_state == TS_ONPROC || t->t_state == TS_RUN) {
+		ASSERT(sdc->sdc_sleep_updates == 0);
+	}
+
+	ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
+	sdc->sdc_ticks++;
+	if (sdc->sdc_ticks == sdc->sdc_update_ticks) {
+		SYSDC_INC_STAT(sysdc_tick_quantum_expired);
+		sysdc_update_pri(sdc, SDC_UPDATE_TICK);
+		ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
+	}
+	thread_unlock(t);
+}
+
+static void
+sysdc_setrun(kthread_t *t)
+{
+	sysdc_t *sdc = t->t_cldata;
+
+	ASSERT(THREAD_LOCK_HELD(t));	/* t should be in transition */
+
+	sdc->sdc_sleep_updates = 0;
+
+	if (sdc->sdc_next == NULL) {
+		/*
+		 * Since we're in transition, we don't want to use the
+		 * full thread_update_pri().
+		 */
+		if (sysdc_compute_pri(sdc, 0)) {
+			THREAD_CHANGE_PRI(t, sdc->sdc_epri);
+		}
+		sysdc_activate(sdc);
+
+		ASSERT(sdc->sdc_next != NULL);
+	}
+
+	setbackdq(t);
+}
+
+static void
+sysdc_wakeup(kthread_t *t)
+{
+	sysdc_setrun(t);
+}
+
+static void
+sysdc_sleep(kthread_t *t)
+{
+	sysdc_t *sdc = t->t_cldata;
+
+	ASSERT(THREAD_LOCK_HELD(t));	/* t should be in transition */
+
+	sdc->sdc_sleep_updates = sdc->sdc_nupdates;
+}
+
+/*ARGSUSED*/
+static int
+sysdc_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp,
+    void *bufp)
+{
+	cpupart_t *const cpupart = t->t_cpupart;
+	sysdc_t *sdc = bufp;
+	sysdc_params_t *sdpp = parmsp;
+	sysdc_pset_t *newpset = sdc->sdc_pset;
+	sysdc_pset_t *pset;
+	int start_timeout;
+
+	if (t->t_cid != syscid)
+		return (EPERM);
+
+	ASSERT(ttolwp(t) != NULL);
+	ASSERT(sdpp != NULL);
+	ASSERT(newpset != NULL);
+	ASSERT(sysdc_param_init);
+
+	ASSERT(sdpp->sdp_minpri >= sysdc_minpri);
+	ASSERT(sdpp->sdp_maxpri <= sysdc_maxpri);
+	ASSERT(sdpp->sdp_DC >= sysdc_minDC);
+	ASSERT(sdpp->sdp_DC <= sysdc_maxDC);
+
+	sdc->sdc_thread = t;
+	sdc->sdc_pri = sdpp->sdp_maxpri;	/* start off maximally */
+	sdc->sdc_minpri = sdpp->sdp_minpri;
+	sdc->sdc_maxpri = sdpp->sdp_maxpri;
+	sdc->sdc_target_DC = sdpp->sdp_DC;
+	sdc->sdc_ticks = 0;
+	sdc->sdc_update_ticks = sysdc_update_ticks + 1;
+
+	/* Assign ourselves to the appropriate pset. */
+	sdc->sdc_pset = NULL;
+	mutex_enter(&sysdc_pset_lock);
+	for (pset = list_head(&sysdc_psets); pset != NULL;
+	    pset = list_next(&sysdc_psets, pset)) {
+		if (pset->sdp_cpupart == cpupart) {
+			break;
+		}
+	}
+	if (pset == NULL) {
+		pset = newpset;
+		newpset = NULL;
+		pset->sdp_cpupart = cpupart;
+		list_insert_tail(&sysdc_psets, pset);
+	}
+	pset->sdp_nthreads++;
+	ASSERT(pset->sdp_nthreads > 0);
+
+	sdc->sdc_pset = pset;
+
+	start_timeout = (sysdc_update_timeout_started == 0);
+	sysdc_update_timeout_started = 1;
+	mutex_exit(&sysdc_pset_lock);
+
+	if (newpset != NULL)
+		kmem_free(newpset, sizeof (*newpset));
+
+	/* Update t's scheduling class and priority. */
+	thread_lock(t);
+	t->t_clfuncs = &(sclass[cid].cl_funcs->thread);
+	t->t_cid = cid;
+	t->t_cldata = sdc;
+	t->t_schedflag |= TS_RUNQMATCH;
+
+	sysdc_update_pri(sdc, SDC_UPDATE_INITIAL);
+	thread_unlock(t);
+
+	/* Kick off the thread timeout if we're the first one in. */
+	if (start_timeout) {
+		(void) timeout(sysdc_update, NULL, sysdc_update_ticks);
+	}
+
+	return (0);
+}
+
+static void
+sysdc_leave(sysdc_t *sdc)
+{
+	sysdc_pset_t *sdp = sdc->sdc_pset;
+	sysdc_list_t *sdl = SYSDC_LIST(sdc);
+	uint_t freedc;
+
+	mutex_enter(&sdl->sdl_lock);		/* block sysdc_update() */
+	sdc->sdc_thread = NULL;
+	freedc = (sdc->sdc_next == NULL);
+	mutex_exit(&sdl->sdl_lock);
+
+	mutex_enter(&sysdc_pset_lock);
+	sdp = sdc->sdc_pset;
+	ASSERT(sdp != NULL);
+	ASSERT(sdp->sdp_nthreads > 0);
+	--sdp->sdp_nthreads;
+	if (sdp->sdp_nthreads == 0) {
+		list_remove(&sysdc_psets, sdp);
+	} else {
+		sdp = NULL;
+	}
+	mutex_exit(&sysdc_pset_lock);
+
+	if (freedc)
+		kmem_free(sdc, sizeof (*sdc));
+	if (sdp != NULL)
+		kmem_free(sdp, sizeof (*sdp));
+}
+
+static void
+sysdc_exitclass(void *buf)
+{
+	sysdc_leave((sysdc_t *)buf);
+}
+
+/*ARGSUSED*/
+static int
+sysdc_canexit(kthread_t *t, cred_t *reqpcredp)
+{
+	/* Threads cannot exit SDC once joined, except in a body bag. */
+	return (EPERM);
+}
+
+static void
+sysdc_exit(kthread_t *t)
+{
+	sysdc_t *sdc;
+
+	/* We're exiting, so we just rejoin the SYS class. */
+	thread_lock(t);
+	ASSERT(t->t_cid == sysdccid);
+	sdc = t->t_cldata;
+	t->t_cid = syscid;
+	t->t_cldata = NULL;
+	t->t_clfuncs = &(sclass[syscid].cl_funcs->thread);
+	(void) thread_change_pri(t, maxclsyspri, 0);
+	t->t_schedflag &= ~TS_RUNQMATCH;
+	thread_unlock_nopreempt(t);
+
+	/* Unlink the sdc from everything. */
+	sysdc_leave(sdc);
+}
+
+/*ARGSUSED*/
+static int
+sysdc_fork(kthread_t *t, kthread_t *ct, void *bufp)
+{
+	/*
+	 * Threads cannot be created with SDC as their class; they must
+	 * be created as SYS and then added with sysdc_thread_enter().
+	 * Because of this restriction, sysdc_fork() should never be called.
+	 */
+	panic("sysdc cannot be forked");
+
+	return (ENOSYS);
+}
+
+/*ARGSUSED*/
+static void
+sysdc_forkret(kthread_t *t, kthread_t *ct)
+{
+	/* SDC threads are part of system processes, which never fork. */
+	panic("sysdc cannot be forked");
+}
+
+static pri_t
+sysdc_globpri(kthread_t *t)
+{
+	return (t->t_epri);
+}
+
+/*ARGSUSED*/
+static pri_t
+sysdc_no_swap(kthread_t *t, int flags)
+{
+	/* SDC threads cannot be swapped. */
+	return (-1);
+}
+
+/*
+ * Get maximum and minimum priorities enjoyed by SDC threads.
+ */
+static int
+sysdc_getclpri(pcpri_t *pcprip)
+{
+	pcprip->pc_clpmax = sysdc_maxpri;
+	pcprip->pc_clpmin = sysdc_minpri;
+	return (0);
+}
+
+/*ARGSUSED*/
+static int
+sysdc_getclinfo(void *arg)
+{
+	return (0);		/* no class-specific info */
+}
+
+/*ARGSUSED*/
+static int
+sysdc_alloc(void **p, int flag)
+{
+	sysdc_t *new;
+
+	*p = NULL;
+	if ((new = kmem_zalloc(sizeof (*new), flag)) == NULL) {
+		return (ENOMEM);
+	}
+	if ((new->sdc_pset = kmem_zalloc(sizeof (*new->sdc_pset), flag)) ==
+	    NULL) {
+		kmem_free(new, sizeof (*new));
+		return (ENOMEM);
+	}
+	*p = new;
+	return (0);
+}
+
+static void
+sysdc_free(void *p)
+{
+	sysdc_t *sdc = p;
+
+	if (sdc != NULL) {
+		/*
+		 * We must have failed CL_ENTERCLASS(), so our pset should be
+		 * there and unused.
+		 */
+		ASSERT(sdc->sdc_pset != NULL);
+		ASSERT(sdc->sdc_pset->sdp_cpupart == NULL);
+		kmem_free(sdc->sdc_pset, sizeof (*sdc->sdc_pset));
+		kmem_free(sdc, sizeof (*sdc));
+	}
+}
+
+static int sysdc_enosys();	/* Boy, ANSI-C's K&R compatibility is weird. */
+static int sysdc_einval();
+static void sysdc_nullsys();
+
+static struct classfuncs sysdc_classfuncs = {
+	/* messages to class manager */
+	{
+		sysdc_enosys,	/* admin */
+		sysdc_getclinfo,
+		sysdc_enosys,	/* parmsin */
+		sysdc_enosys,	/* parmsout */
+		sysdc_enosys,	/* vaparmsin */
+		sysdc_enosys,	/* vaparmsout */
+		sysdc_getclpri,
+		sysdc_alloc,
+		sysdc_free,
+	},
+	/* operations on threads */
+	{
+		sysdc_enterclass,
+		sysdc_exitclass,
+		sysdc_canexit,
+		sysdc_fork,
+		sysdc_forkret,
+		sysdc_nullsys,	/* parmsget */
+		sysdc_enosys,	/* parmsset */
+		sysdc_nullsys,	/* stop */
+		sysdc_exit,
+		sysdc_nullsys,	/* active */
+		sysdc_nullsys,	/* inactive */
+		sysdc_no_swap,	/* swapin */
+		sysdc_no_swap,	/* swapout */
+		sysdc_nullsys,	/* trapret */
+		sysdc_preempt,
+		sysdc_setrun,
+		sysdc_sleep,
+		sysdc_tick,
+		sysdc_wakeup,
+		sysdc_einval,	/* donice */
+		sysdc_globpri,
+		sysdc_nullsys,	/* set_process_group */
+		sysdc_nullsys,	/* yield */
+		sysdc_einval,	/* doprio */
+	}
+};
+
+static int
+sysdc_enosys()
+{
+	return (ENOSYS);
+}
+
+static int
+sysdc_einval()
+{
+	return (EINVAL);
+}
+
+static void
+sysdc_nullsys()
+{
+}
+
+/*ARGSUSED*/
+static pri_t
+sysdc_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp)
+{
+	int idx;
+
+	list_create(&sysdc_psets, sizeof (sysdc_pset_t),
+	    offsetof(sysdc_pset_t, sdp_node));
+
+	for (idx = 0; idx < SYSDC_NLISTS; idx++) {
+		sysdc_active[idx].sdl_list = &sysdc_dummy;
+	}
+
+	sysdc_initparam();
+
+	sysdccid = cid;
+	*clfuncspp = &sysdc_classfuncs;
+
+	return ((pri_t)v.v_maxsyspri);
+}
+
+static struct sclass csw = {
+	"SDC",
+	sysdc_init,
+	0
+};
+
+static struct modlsched modlsched = {
+	&mod_schedops, "system duty cycle scheduling class", &csw
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1, (void *)&modlsched, NULL
+};
+
+int
+_init()
+{
+	return (mod_install(&modlinkage));
+}
+
+int
+_fini()
+{
+	return (EBUSY);		/* can't unload for now */
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
+
+/* --- consolidation-private interfaces --- */
+void
+sysdc_thread_enter(kthread_t *t, uint_t dc, uint_t flags)
+{
+	void *buf = NULL;
+	sysdc_params_t sdp;
+
+	SYSDC_INC_STAT(sysdc_thread_enter_enter);
+
+	ASSERT(sysdc_param_init);
+	ASSERT(sysdccid >= 0);
+
+	ASSERT((flags & ~SYSDC_THREAD_BATCH) == 0);
+
+	sdp.sdp_minpri = sysdc_minpri;
+	sdp.sdp_maxpri = sysdc_maxpri;
+	sdp.sdp_DC = MAX(MIN(dc, sysdc_maxDC), sysdc_minDC);
+
+	if (flags & SYSDC_THREAD_BATCH)
+		sdp.sdp_maxpri -= sysdc_batch_niceness;
+
+	VERIFY3U(CL_ALLOC(&buf, sysdccid, KM_SLEEP), ==, 0);
+
+	ASSERT(t->t_lwp != NULL);
+	ASSERT(t->t_cid == syscid);
+	ASSERT(t->t_cldata == NULL);
+	VERIFY3U(CL_CANEXIT(t, NULL), ==, 0);
+	VERIFY3U(CL_ENTERCLASS(t, sysdccid, &sdp, kcred, buf), ==, 0);
+	CL_EXITCLASS(syscid, NULL);
+}
--- a/usr/src/uts/common/disp/thread.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/disp/thread.c	Mon Nov 23 15:29:44 2009 -0800
@@ -98,7 +98,10 @@
 
 extern int nthread;
 
+/* System Scheduling classes. */
 id_t	syscid;				/* system scheduling class ID */
+id_t	sysdccid = CLASS_UNUSED;	/* reset when SDC loads */
+
 void	*segkp_thread;			/* cookie for segkp pool */
 
 int lwp_cache_sz = 32;
--- a/usr/src/uts/common/fs/proc/prcontrol.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/fs/proc/prcontrol.c	Mon Nov 23 15:29:44 2009 -0800
@@ -20,12 +20,10 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/types.h>
 #include <sys/uio.h>
 #include <sys/param.h>
@@ -299,6 +297,12 @@
 	p = pcp->prc_proc;
 	ASSERT(p != NULL);
 
+	/* System processes defy control. */
+	if (p->p_flag & SSYS) {
+		prunlock(pnp);
+		return (EBUSY);
+	}
+
 	switch (cmd) {
 
 	default:
@@ -315,7 +319,7 @@
 			/*
 			 * Can't apply to a system process.
 			 */
-			if ((p->p_flag & SSYS) || p->p_as == &kas) {
+			if (p->p_as == &kas) {
 				error = EBUSY;
 				break;
 			}
@@ -723,6 +727,11 @@
 	p = pcp->prc_proc;
 	ASSERT(p != NULL);
 
+	if (p->p_flag & SSYS) {
+		prunlock(pnp);
+		return (EBUSY);
+	}
+
 	switch (cmd) {
 
 	default:
@@ -739,7 +748,7 @@
 			/*
 			 * Can't apply to a system process.
 			 */
-			if ((p->p_flag & SSYS) || p->p_as == &kas) {
+			if (p->p_as == &kas) {
 				error = EBUSY;
 				break;
 			}
--- a/usr/src/uts/common/fs/vfs.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/fs/vfs.c	Mon Nov 23 15:29:44 2009 -0800
@@ -812,6 +812,7 @@
 	char		*path;
 	size_t		plen;
 	struct vfssw	*vswp;
+	proc_t		*p;
 
 	rw_init(&vfssw_lock, NULL, RW_DEFAULT, NULL);
 	rw_init(&vfslist, NULL, RW_DEFAULT, NULL);
@@ -835,9 +836,22 @@
 	vfs_setmntpoint(rootvfs, "/");
 	if (VFS_ROOT(rootvfs, &rootdir))
 		panic("vfs_mountroot: no root vnode");
-	PTOU(curproc)->u_cdir = rootdir;
-	VN_HOLD(PTOU(curproc)->u_cdir);
-	PTOU(curproc)->u_rdir = NULL;
+
+	/*
+	 * At this point, the process tree consists of p0 and possibly some
+	 * direct children of p0.  (i.e. there are no grandchildren)
+	 *
+	 * Walk through them all, setting their current directory.
+	 */
+	mutex_enter(&pidlock);
+	for (p = practive; p != NULL; p = p->p_next) {
+		ASSERT(p == &p0 || p->p_parent == &p0);
+
+		PTOU(p)->u_cdir = rootdir;
+		VN_HOLD(PTOU(p)->u_cdir);
+		PTOU(p)->u_rdir = NULL;
+	}
+	mutex_exit(&pidlock);
 
 	/*
 	 * Setup the global zone's rootvp, now that it exists.
--- a/usr/src/uts/common/fs/zfs/spa.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/spa.c	Mon Nov 23 15:29:44 2009 -0800
@@ -62,24 +62,28 @@
 #include <sys/zfs_ioctl.h>
 
 #ifdef	_KERNEL
+#include <sys/bootprops.h>
+#include <sys/callb.h>
+#include <sys/cpupart.h>
+#include <sys/pool.h>
+#include <sys/sysdc.h>
 #include <sys/zone.h>
-#include <sys/bootprops.h>
 #endif	/* _KERNEL */
 
 #include "zfs_prop.h"
 #include "zfs_comutil.h"
 
-enum zti_modes {
+typedef enum zti_modes {
 	zti_mode_fixed,			/* value is # of threads (min 1) */
 	zti_mode_online_percent,	/* value is % of online CPUs */
-	zti_mode_tune,			/* fill from zio_taskq_tune_* */
+	zti_mode_batch,			/* cpu-intensive; value is ignored */
 	zti_mode_null,			/* don't create a taskq */
 	zti_nmodes
-};
+} zti_modes_t;
 
 #define	ZTI_FIX(n)	{ zti_mode_fixed, (n) }
 #define	ZTI_PCT(n)	{ zti_mode_online_percent, (n) }
-#define	ZTI_TUNE	{ zti_mode_tune, 0 }
+#define	ZTI_BATCH	{ zti_mode_batch, 0 }
 #define	ZTI_NULL	{ zti_mode_null, 0 }
 
 #define	ZTI_ONE		ZTI_FIX(1)
@@ -90,7 +94,7 @@
 } zio_taskq_info_t;
 
 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
-		"issue", "issue_high", "intr", "intr_high"
+	"issue", "issue_high", "intr", "intr_high"
 };
 
 /*
@@ -100,19 +104,29 @@
 const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
-	{ ZTI_FIX(8),	ZTI_NULL,	ZTI_TUNE,	ZTI_NULL },
-	{ ZTI_TUNE,	ZTI_FIX(5),	ZTI_FIX(8),	ZTI_FIX(5) },
+	{ ZTI_FIX(8),	ZTI_NULL,	ZTI_BATCH,	ZTI_NULL },
+	{ ZTI_BATCH,	ZTI_FIX(5),	ZTI_FIX(8),	ZTI_FIX(5) },
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
 };
 
-enum zti_modes zio_taskq_tune_mode = zti_mode_online_percent;
-uint_t zio_taskq_tune_value = 80;	/* #threads = 80% of # online CPUs */
-
 static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
 static boolean_t spa_has_active_shared_spare(spa_t *spa);
 
+uint_t		zio_taskq_batch_pct = 100;	/* 1 thread per cpu in pset */
+id_t		zio_taskq_psrset_bind = PS_NONE;
+boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
+uint_t		zio_taskq_basedc = 80;		/* base duty cycle */
+
+boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
+
+/*
+ * This (illegal) pool name is used when temporarily importing a spa_t in order
+ * to get the vdev stats associated with the imported devices.
+ */
+#define	TRYIMPORT_NAME	"$import"
+
 /*
  * ==========================================================================
  * SPA properties routines
@@ -584,6 +598,139 @@
 	    offsetof(spa_error_entry_t, se_avl));
 }
 
+static taskq_t *
+spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
+    uint_t value)
+{
+	uint_t flags = TASKQ_PREPOPULATE;
+	boolean_t batch = B_FALSE;
+
+	switch (mode) {
+	case zti_mode_null:
+		return (NULL);		/* no taskq needed */
+
+	case zti_mode_fixed:
+		ASSERT3U(value, >=, 1);
+		value = MAX(value, 1);
+		break;
+
+	case zti_mode_batch:
+		batch = B_TRUE;
+		flags |= TASKQ_THREADS_CPU_PCT;
+		value = zio_taskq_batch_pct;
+		break;
+
+	case zti_mode_online_percent:
+		flags |= TASKQ_THREADS_CPU_PCT;
+		break;
+
+	default:
+		panic("unrecognized mode for %s taskq (%u:%u) in "
+		    "spa_activate()",
+		    name, mode, value);
+		break;
+	}
+
+	if (zio_taskq_sysdc && spa->spa_proc != &p0) {
+		if (batch)
+			flags |= TASKQ_DC_BATCH;
+
+		return (taskq_create_sysdc(name, value, 50, INT_MAX,
+		    spa->spa_proc, zio_taskq_basedc, flags));
+	}
+	return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
+	    spa->spa_proc, flags));
+}
+
+static void
+spa_create_zio_taskqs(spa_t *spa)
+{
+	for (int t = 0; t < ZIO_TYPES; t++) {
+		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
+			const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
+			enum zti_modes mode = ztip->zti_mode;
+			uint_t value = ztip->zti_value;
+			char name[32];
+
+			(void) snprintf(name, sizeof (name),
+			    "%s_%s", zio_type_name[t], zio_taskq_types[q]);
+
+			spa->spa_zio_taskq[t][q] =
+			    spa_taskq_create(spa, name, mode, value);
+		}
+	}
+}
+
+#ifdef _KERNEL
+static void
+spa_thread(void *arg)
+{
+	callb_cpr_t cprinfo;
+
+	spa_t *spa = arg;
+	user_t *pu = PTOU(curproc);
+
+	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
+	    spa->spa_name);
+
+	ASSERT(curproc != &p0);
+	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
+	    "zpool-%s", spa->spa_name);
+	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
+
+	/* bind this thread to the requested psrset */
+	if (zio_taskq_psrset_bind != PS_NONE) {
+		pool_lock();
+		mutex_enter(&cpu_lock);
+		mutex_enter(&pidlock);
+		mutex_enter(&curproc->p_lock);
+
+		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
+		    0, NULL, NULL) == 0)  {
+			curthread->t_bind_pset = zio_taskq_psrset_bind;
+		} else {
+			cmn_err(CE_WARN,
+			    "Couldn't bind process for zfs pool \"%s\" to "
+			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
+		}
+
+		mutex_exit(&curproc->p_lock);
+		mutex_exit(&pidlock);
+		mutex_exit(&cpu_lock);
+		pool_unlock();
+	}
+
+	if (zio_taskq_sysdc) {
+		sysdc_thread_enter(curthread, 100, 0);
+	}
+
+	spa->spa_proc = curproc;
+	spa->spa_did = curthread->t_did;
+
+	spa_create_zio_taskqs(spa);
+
+	mutex_enter(&spa->spa_proc_lock);
+	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
+
+	spa->spa_proc_state = SPA_PROC_ACTIVE;
+	cv_broadcast(&spa->spa_proc_cv);
+
+	CALLB_CPR_SAFE_BEGIN(&cprinfo);
+	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
+		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
+	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
+
+	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
+	spa->spa_proc_state = SPA_PROC_GONE;
+	spa->spa_proc = &p0;
+	cv_broadcast(&spa->spa_proc_cv);
+	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */
+
+	mutex_enter(&curproc->p_lock);
+	lwp_exit();
+}
+#endif
+
 /*
  * Activate an uninitialized pool.
  */
@@ -598,53 +745,38 @@
 	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
 	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
 
-	for (int t = 0; t < ZIO_TYPES; t++) {
-		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
-			const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
-			enum zti_modes mode = ztip->zti_mode;
-			uint_t value = ztip->zti_value;
-			char name[32];
-
-			(void) snprintf(name, sizeof (name),
-			    "%s_%s", zio_type_name[t], zio_taskq_types[q]);
-
-			if (mode == zti_mode_tune) {
-				mode = zio_taskq_tune_mode;
-				value = zio_taskq_tune_value;
-				if (mode == zti_mode_tune)
-					mode = zti_mode_online_percent;
+	/* Try to create a covering process */
+	mutex_enter(&spa->spa_proc_lock);
+	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
+	ASSERT(spa->spa_proc == &p0);
+	spa->spa_did = 0;
+
+	/* Only create a process if we're going to be around a while. */
+	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
+		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
+		    NULL, 0) == 0) {
+			spa->spa_proc_state = SPA_PROC_CREATED;
+			while (spa->spa_proc_state == SPA_PROC_CREATED) {
+				cv_wait(&spa->spa_proc_cv,
+				    &spa->spa_proc_lock);
 			}
-
-			switch (mode) {
-			case zti_mode_fixed:
-				ASSERT3U(value, >=, 1);
-				value = MAX(value, 1);
-
-				spa->spa_zio_taskq[t][q] = taskq_create(name,
-				    value, maxclsyspri, 50, INT_MAX,
-				    TASKQ_PREPOPULATE);
-				break;
-
-			case zti_mode_online_percent:
-				spa->spa_zio_taskq[t][q] = taskq_create(name,
-				    value, maxclsyspri, 50, INT_MAX,
-				    TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
-				break;
-
-			case zti_mode_null:
-				spa->spa_zio_taskq[t][q] = NULL;
-				break;
-
-			case zti_mode_tune:
-			default:
-				panic("unrecognized mode for "
-				    "zio_taskqs[%u]->zti_nthreads[%u] (%u:%u) "
-				    "in spa_activate()",
-				    t, q, mode, value);
-				break;
-			}
+			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
+			ASSERT(spa->spa_proc != &p0);
+			ASSERT(spa->spa_did != 0);
+		} else {
+#ifdef _KERNEL
+			cmn_err(CE_WARN,
+			    "Couldn't create process for zfs pool \"%s\"\n",
+			    spa->spa_name);
+#endif
 		}
 	}
+	mutex_exit(&spa->spa_proc_lock);
+
+	/* If we didn't create a process, we need to create our taskqs. */
+	if (spa->spa_proc == &p0) {
+		spa_create_zio_taskqs(spa);
+	}
 
 	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_config_dirty_node));
@@ -703,6 +835,31 @@
 	avl_destroy(&spa->spa_errlist_last);
 
 	spa->spa_state = POOL_STATE_UNINITIALIZED;
+
+	mutex_enter(&spa->spa_proc_lock);
+	if (spa->spa_proc_state != SPA_PROC_NONE) {
+		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
+		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
+		cv_broadcast(&spa->spa_proc_cv);
+		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
+			ASSERT(spa->spa_proc != &p0);
+			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
+		}
+		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
+		spa->spa_proc_state = SPA_PROC_NONE;
+	}
+	ASSERT(spa->spa_proc == &p0);
+	mutex_exit(&spa->spa_proc_lock);
+
+	/*
+	 * We want to make sure spa_thread() has actually exited the ZFS
+	 * module, so that the module can't be unloaded out from underneath
+	 * it.
+	 */
+	if (spa->spa_did != 0) {
+		thread_join(spa->spa_did);
+		spa->spa_did = 0;
+	}
 }
 
 /*
@@ -2999,13 +3156,6 @@
 	return (0);
 }
 
-
-/*
- * This (illegal) pool name is used when temporarily importing a spa_t in order
- * to get the vdev stats associated with the imported devices.
- */
-#define	TRYIMPORT_NAME	"$import"
-
 nvlist_t *
 spa_tryimport(nvlist_t *tryconfig)
 {
--- a/usr/src/uts/common/fs/zfs/spa_misc.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c	Mon Nov 23 15:29:44 2009 -0800
@@ -430,15 +430,17 @@
 	spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
 
 	mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
+	cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
 
@@ -451,6 +453,8 @@
 	spa->spa_freeze_txg = UINT64_MAX;
 	spa->spa_final_txg = UINT64_MAX;
 	spa->spa_load_max_txg = UINT64_MAX;
+	spa->spa_proc = &p0;
+	spa->spa_proc_state = SPA_PROC_NONE;
 
 	refcount_create(&spa->spa_refcount);
 	spa_config_lock_init(spa);
@@ -522,15 +526,17 @@
 	bplist_fini(&spa->spa_deferred_bplist);
 
 	cv_destroy(&spa->spa_async_cv);
+	cv_destroy(&spa->spa_proc_cv);
 	cv_destroy(&spa->spa_scrub_io_cv);
 	cv_destroy(&spa->spa_suspend_cv);
 
 	mutex_destroy(&spa->spa_async_lock);
-	mutex_destroy(&spa->spa_scrub_lock);
+	mutex_destroy(&spa->spa_errlist_lock);
 	mutex_destroy(&spa->spa_errlog_lock);
-	mutex_destroy(&spa->spa_errlist_lock);
 	mutex_destroy(&spa->spa_history_lock);
+	mutex_destroy(&spa->spa_proc_lock);
 	mutex_destroy(&spa->spa_props_lock);
+	mutex_destroy(&spa->spa_scrub_lock);
 	mutex_destroy(&spa->spa_suspend_lock);
 	mutex_destroy(&spa->spa_vdev_top_lock);
 
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Mon Nov 23 15:29:44 2009 -0800
@@ -86,6 +86,25 @@
 	ZIO_TASKQ_TYPES
 };
 
+/*
+ * State machine for the zpool-poolname process.  The state transitions
+ * are done as follows:
+ *
+ *	From		   To			Routine
+ *	PROC_NONE	-> PROC_CREATED		spa_activate()
+ *	PROC_CREATED	-> PROC_ACTIVE		spa_thread()
+ *	PROC_ACTIVE	-> PROC_DEACTIVATE	spa_deactivate()
+ *	PROC_DEACTIVATE	-> PROC_GONE		spa_thread()
+ *	PROC_GONE	-> PROC_NONE		spa_deactivate()
+ */
+typedef enum spa_proc_state {
+	SPA_PROC_NONE,		/* spa_proc = &p0, no process created */
+	SPA_PROC_CREATED,	/* spa_activate() has proc, is waiting */
+	SPA_PROC_ACTIVE,	/* taskqs created, spa_proc set */
+	SPA_PROC_DEACTIVATE,	/* spa_deactivate() requests process exit */
+	SPA_PROC_GONE		/* spa_thread() is exiting, spa_proc = &p0 */
+} spa_proc_state_t;
+
 struct spa {
 	/*
 	 * Fields protected by spa_namespace_lock.
@@ -186,6 +205,11 @@
 	uint64_t	spa_dedup_checksum;	/* default dedup checksum */
 	uint64_t	spa_dspace;		/* dspace in normal class */
 	kmutex_t	spa_vdev_top_lock;	/* dueling offline/remove */
+	kmutex_t	spa_proc_lock;		/* protects spa_proc* */
+	kcondvar_t	spa_proc_cv;		/* spa_proc_state transitions */
+	spa_proc_state_t spa_proc_state;	/* see definition */
+	struct proc	*spa_proc;		/* "zpool-poolname" process */
+	uint64_t	spa_did;		/* if procp != p0, did of t1 */
 	boolean_t	spa_autoreplace;	/* autoreplace set in open */
 	int		spa_vdev_locks;		/* locks grabbed */
 	/*
--- a/usr/src/uts/common/fs/zfs/zio.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/zio.c	Mon Nov 23 15:29:44 2009 -0800
@@ -85,6 +85,8 @@
  */
 #define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
 
+boolean_t	zio_requeue_io_start_cut_in_line = B_TRUE;
+
 #ifdef ZFS_DEBUG
 int zio_buf_debug_limit = 16384;
 #else
@@ -1024,10 +1026,11 @@
  */
 
 static void
-zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q)
+zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
 {
 	spa_t *spa = zio->io_spa;
 	zio_type_t t = zio->io_type;
+	int flags = TQ_SLEEP | (cutinline ? TQ_FRONT : 0);
 
 	/*
 	 * If we're a config writer or a probe, the normal issue and
@@ -1052,7 +1055,7 @@
 
 	ASSERT3U(q, <, ZIO_TASKQ_TYPES);
 	(void) taskq_dispatch(spa->spa_zio_taskq[t][q],
-	    (task_func_t *)zio_execute, zio, TQ_SLEEP);
+	    (task_func_t *)zio_execute, zio, flags);
 }
 
 static boolean_t
@@ -1071,7 +1074,7 @@
 static int
 zio_issue_async(zio_t *zio)
 {
-	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
+	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
 
 	return (ZIO_PIPELINE_STOP);
 }
@@ -1079,7 +1082,7 @@
 void
 zio_interrupt(zio_t *zio)
 {
-	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT);
+	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
 }
 
 /*
@@ -1122,10 +1125,15 @@
 		 * will grab a config lock that is held across I/O,
 		 * or may wait for an I/O that needs an interrupt thread
 		 * to complete, issue async to avoid deadlock.
+		 *
+		 * For VDEV_IO_START, we cut in line so that the io will
+		 * be sent to disk promptly.
 		 */
 		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
 		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
-			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
+			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
+			    zio_requeue_io_start_cut_in_line : B_FALSE;
+			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
 			return;
 		}
 
@@ -1790,7 +1798,7 @@
 		}
 		if (dde == NULL) {
 			zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
-			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
+			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
 			return (ZIO_PIPELINE_STOP);
 		}
 		if (dde->dde_repair_data != NULL) {
@@ -2365,6 +2373,9 @@
 
 	/*
 	 * If the I/O failed, determine whether we should attempt to retry it.
+	 *
+	 * On retry, we cut in line in the issue queue, since we don't want
+	 * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
 	 */
 	if (zio->io_error && vd == NULL &&
 	    !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
@@ -2374,7 +2385,8 @@
 		zio->io_flags |= ZIO_FLAG_IO_RETRY |
 		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
 		zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
-		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
+		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
+		    zio_requeue_io_start_cut_in_line);
 		return (ZIO_PIPELINE_STOP);
 	}
 
--- a/usr/src/uts/common/os/condvar.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/os/condvar.c	Mon Nov 23 15:29:44 2009 -0800
@@ -314,13 +314,15 @@
 	ASSERT(!quiesce_active);
 
 	/*
-	 * The check for t_intr is to catch an interrupt thread
-	 * that has not yet unpinned the thread underneath.
+	 * Threads in system processes don't process signals.  This is
+	 * true both for standard threads of system processes and for
+	 * interrupt threads which have borrowed their pinned thread's LWP.
 	 */
-	if (lwp == NULL || t->t_intr) {
+	if (lwp == NULL || (p->p_flag & SSYS)) {
 		cv_wait(cvp, mp);
 		return (rval);
 	}
+	ASSERT(t->t_intr == NULL);
 
 	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
 	cancel_pending = schedctl_cancel_pending();
@@ -374,12 +376,13 @@
 	ASSERT(!quiesce_active);
 
 	/*
-	 * If there is no lwp, then we don't need to wait for a signal.
-	 * The check for t_intr is to catch an interrupt thread
-	 * that has not yet unpinned the thread underneath.
+	 * Threads in system processes don't process signals.  This is
+	 * true both for standard threads of system processes and for
+	 * interrupt threads which have borrowed their pinned thread's LWP.
 	 */
-	if (lwp == NULL || t->t_intr)
+	if (lwp == NULL || (p->p_flag & SSYS))
 		return (cv_timedwait_hires(cvp, mp, tim, res, flag));
+	ASSERT(t->t_intr == NULL);
 
 	/*
 	 * If tim is less than or equal to current hrtime, then the timeout
@@ -516,13 +519,15 @@
 		return (rval);
 
 	/*
-	 * The check for t_intr is to catch an interrupt thread
-	 * that has not yet unpinned the thread underneath.
+	 * Threads in system processes don't process signals.  This is
+	 * true both for standard threads of system processes and for
+	 * interrupt threads which have borrowed their pinned thread's LWP.
 	 */
-	if (lwp == NULL || t->t_intr) {
+	if (lwp == NULL || (p->p_flag & SSYS)) {
 		cv_wait(cvp, mp);
 		return (rval);
 	}
+	ASSERT(t->t_intr == NULL);
 
 	cancel_pending = schedctl_cancel_pending();
 	lwp->lwp_asleep = 1;
@@ -640,14 +645,15 @@
 		return;
 
 	/*
-	 * If there is no lwp, then we don't need to eventually stop it
-	 * The check for t_intr is to catch an interrupt thread
-	 * that has not yet unpinned the thread underneath.
+	 * Threads in system processes don't process signals.  This is
+	 * true both for standard threads of system processes and for
+	 * interrupt threads which have borrowed their pinned thread's LWP.
 	 */
-	if (lwp == NULL || t->t_intr) {
+	if (lwp == NULL || (p->p_flag & SSYS)) {
 		cv_wait(cvp, mp);
 		return;
 	}
+	ASSERT(t->t_intr == NULL);
 
 	/*
 	 * Wakeup in wakeup_time milliseconds, i.e., human time.
--- a/usr/src/uts/common/os/exit.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/os/exit.c	Mon Nov 23 15:29:44 2009 -0800
@@ -405,10 +405,12 @@
 	 * Allocate a sigqueue now, before we grab locks.
 	 * It will be given to sigcld(), below.
 	 * Special case:  If we will be making the process disappear
-	 * without a trace (for the benefit of posix_spawn() in libc)
-	 * don't bother to allocate a useless sigqueue.
+	 * without a trace because it is either:
+	 *	* an exiting SSYS process, or
+	 *	* a posix_spawn() vfork child who requests it,
+	 * we don't bother to allocate a useless sigqueue.
 	 */
-	evaporate = ((p->p_flag & SVFORK) &&
+	evaporate = (p->p_flag & SSYS) || ((p->p_flag & SVFORK) &&
 	    why == CLD_EXITED && what == _EVAPORATE);
 	if (!evaporate)
 		sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
@@ -747,6 +749,8 @@
 	rdir = PTOU(p)->u_rdir;
 	cwd = PTOU(p)->u_cwd;
 
+	ASSERT(cdir != NULL || p->p_parent == &p0);
+
 	/*
 	 * Release resource controls, as they are no longer enforceable.
 	 */
@@ -840,7 +844,8 @@
 	 * We don't release u_cdir and u_rdir until SZOMB is set.
 	 * This protects us against dofusers().
 	 */
-	VN_RELE(cdir);
+	if (cdir)
+		VN_RELE(cdir);
 	if (rdir)
 		VN_RELE(rdir);
 	if (cwd)
--- a/usr/src/uts/common/os/fork.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/os/fork.c	Mon Nov 23 15:29:44 2009 -0800
@@ -81,7 +81,10 @@
 #include <sys/fork.h>
 
 static int64_t cfork(int, int, int);
-static int getproc(proc_t **, int);
+static int getproc(proc_t **, pid_t, uint_t);
+#define	GETPROC_USER	0x0
+#define	GETPROC_KERNEL	0x1
+
 static void fork_fail(proc_t *);
 static void forklwp_fail(proc_t *);
 
@@ -224,7 +227,7 @@
 	/*
 	 * Create a child proc struct. Place a VN_HOLD on appropriate vnodes.
 	 */
-	if (getproc(&cp, 0) < 0) {
+	if (getproc(&cp, 0, GETPROC_USER) < 0) {
 		mutex_enter(&p->p_lock);
 		pool_barrier_exit();
 		continuelwps(p);
@@ -779,20 +782,24 @@
  * fork a kernel process.
  */
 int
-newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct)
+newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,
+    pid_t pid)
 {
 	proc_t *p;
 	struct user *up;
-	klwp_t *lwp;
+	kthread_t *t;
 	cont_process_t *ctp = NULL;
 	rctl_entity_p_t e;
 
-	ASSERT(!(cid == syscid && ct != NULL));
-	if (cid == syscid) {
+	ASSERT(cid != sysdccid);
+	ASSERT(cid != syscid || ct == NULL);
+	if (CLASS_KERNEL(cid)) {
 		rctl_alloc_gp_t *init_gp;
 		rctl_set_t *init_set;
 
-		if (getproc(&p, 1) < 0)
+		ASSERT(pid != 1);
+
+		if (getproc(&p, pid, GETPROC_KERNEL) < 0)
 			return (EAGAIN);
 
 		/*
@@ -827,12 +834,17 @@
 		mutex_exit(&p->p_lock);
 
 		rctl_prealloc_destroy(init_gp);
-	} else  {
+
+		t = lwp_kernel_create(p, pc, arg, TS_STOPPED, pri);
+	} else {
 		rctl_alloc_gp_t *init_gp, *default_gp;
 		rctl_set_t *init_set;
 		task_t *tk, *tk_old;
+		klwp_t *lwp;
 
-		if (getproc(&p, 0) < 0)
+		ASSERT(pid == 1);
+
+		if (getproc(&p, pid, GETPROC_USER) < 0)
 			return (EAGAIN);
 		/*
 		 * init creates a new task, distinct from the task
@@ -865,29 +877,26 @@
 		task_rele(tk_old);
 		rctl_prealloc_destroy(default_gp);
 		rctl_prealloc_destroy(init_gp);
-	}
-
-	p->p_as = &kas;
 
-	if ((lwp = lwp_create(pc, arg, 0, p, TS_STOPPED, pri,
-	    &curthread->t_hold, cid, 1)) == NULL) {
-		task_t *tk;
-		fork_fail(p);
-		mutex_enter(&pidlock);
-		mutex_enter(&p->p_lock);
-		tk = p->p_task;
-		task_detach(p);
-		ASSERT(p->p_pool->pool_ref > 0);
-		atomic_add_32(&p->p_pool->pool_ref, -1);
-		mutex_exit(&p->p_lock);
-		pid_exit(p);
-		mutex_exit(&pidlock);
-		task_rele(tk);
+		if ((lwp = lwp_create(pc, arg, 0, p, TS_STOPPED, pri,
+		    &curthread->t_hold, cid, 1)) == NULL) {
+			task_t *tk;
+			fork_fail(p);
+			mutex_enter(&pidlock);
+			mutex_enter(&p->p_lock);
+			tk = p->p_task;
+			task_detach(p);
+			ASSERT(p->p_pool->pool_ref > 0);
+			atomic_add_32(&p->p_pool->pool_ref, -1);
+			mutex_exit(&p->p_lock);
+			pid_exit(p);
+			mutex_exit(&pidlock);
+			task_rele(tk);
 
-		return (EAGAIN);
-	}
+			return (EAGAIN);
+		}
+		t = lwptot(lwp);
 
-	if (cid != syscid) {
 		ctp = contract_process_fork(sys_process_tmpl, p, curproc,
 		    B_FALSE);
 		ASSERT(ctp != NULL);
@@ -895,13 +904,14 @@
 			*ct = &ctp->conp_contract;
 	}
 
+	ASSERT3U(t->t_tid, ==, 1);
 	p->p_lwpid = 1;
 	mutex_enter(&pidlock);
-	pgjoin(p, curproc->p_pgidp);
+	pgjoin(p, p->p_parent->p_pgidp);
 	p->p_stat = SRUN;
 	mutex_enter(&p->p_lock);
-	lwptot(lwp)->t_proc_flag &= ~TP_HOLDLWP;
-	lwp_create_done(lwptot(lwp));
+	t->t_proc_flag &= ~TP_HOLDLWP;
+	lwp_create_done(t);
 	mutex_exit(&p->p_lock);
 	mutex_exit(&pidlock);
 	return (0);
@@ -911,7 +921,7 @@
  * create a child proc struct.
  */
 static int
-getproc(proc_t **cpp, int kernel)
+getproc(proc_t **cpp, pid_t pid, uint_t flags)
 {
 	proc_t		*pp, *cp;
 	pid_t		newpid;
@@ -926,7 +936,7 @@
 	if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)
 		return (-1);	/* no point in starting new processes */
 
-	pp = curproc;
+	pp = (flags & GETPROC_KERNEL) ? &p0 : curproc;
 	cp = kmem_cache_alloc(process_cache, KM_SLEEP);
 	bzero(cp, sizeof (proc_t));
 
@@ -942,6 +952,7 @@
 	mutex_init(&cp->p_maplock, NULL, MUTEX_DEFAULT, NULL);
 	cp->p_stat = SIDL;
 	cp->p_mstart = gethrtime();
+	cp->p_as = &kas;
 	/*
 	 * p_zone must be set before we call pid_allocate since the process
 	 * will be visible after that and code such as prfind_zone will
@@ -951,7 +962,7 @@
 	cp->p_t1_lgrpid = LGRP_NONE;
 	cp->p_tr_lgrpid = LGRP_NONE;
 
-	if ((newpid = pid_allocate(cp, PID_ALLOC_PROC)) == -1) {
+	if ((newpid = pid_allocate(cp, pid, PID_ALLOC_PROC)) == -1) {
 		if (nproc == v.v_proc) {
 			CPU_STATS_ADDQ(CPU, sys, procovf, 1);
 			cmn_err(CE_WARN, "out of processes");
@@ -1060,7 +1071,7 @@
 	 * always bound to the default pool.
 	 */
 	mutex_enter(&pp->p_lock);
-	if (kernel) {
+	if (flags & GETPROC_KERNEL) {
 		cp->p_pool = pool_default;
 		cp->p_flag |= SSYS;
 	} else {
@@ -1074,7 +1085,7 @@
 	 * are always attached to task0.
 	 */
 	mutex_enter(&cp->p_lock);
-	if (kernel)
+	if (flags & GETPROC_KERNEL)
 		task_attach(task0p, cp);
 	else
 		task_attach(pp->p_task, cp);
@@ -1098,7 +1109,15 @@
 	 */
 	fcnt_add(P_FINFO(pp), 1);
 
-	VN_HOLD(PTOU(pp)->u_cdir);
+	if (PTOU(pp)->u_cdir) {
+		VN_HOLD(PTOU(pp)->u_cdir);
+	} else {
+		ASSERT(pp == &p0);
+		/*
+		 * We must be at or before vfs_mountroot(); it will take care of
+		 * assigning our current directory.
+		 */
+	}
 	if (PTOU(pp)->u_rdir)
 		VN_HOLD(PTOU(pp)->u_rdir);
 	if (PTOU(pp)->u_cwd)
--- a/usr/src/uts/common/os/lwp.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/os/lwp.c	Mon Nov 23 15:29:44 2009 -0800
@@ -68,6 +68,24 @@
 extern void freectx_ctx(struct ctxop *);
 
 /*
+ * Create a kernel thread associated with a particular system process.  Give
+ * it an LWP so that microstate accounting will be available for it.
+ */
+kthread_t *
+lwp_kernel_create(proc_t *p, void (*proc)(), void *arg, int state, pri_t pri)
+{
+	klwp_t *lwp;
+
+	VERIFY((p->p_flag & SSYS) != 0);
+
+	lwp = lwp_create(proc, arg, 0, p, state, pri, &t0.t_hold, syscid, 0);
+
+	VERIFY(lwp != NULL);
+
+	return (lwptot(lwp));
+}
+
+/*
  * Create a thread that appears to be stopped at sys_rtt.
  */
 klwp_t *
@@ -84,7 +102,7 @@
 	int err = 0;
 	kproject_t *oldkpj, *newkpj;
 	void *bufp = NULL;
-	klwp_t *curlwp = ttolwp(curthread);
+	klwp_t *curlwp;
 	lwpent_t *lep;
 	lwpdir_t *old_dir = NULL;
 	uint_t old_dirsz = 0;
@@ -96,12 +114,16 @@
 	boolean_t branded = 0;
 	struct ctxop *ctx = NULL;
 
+	ASSERT(cid != sysdccid);	/* system threads must start in SYS */
+
+	ASSERT(p != &p0);		/* No new LWPs in p0. */
+
 	mutex_enter(&p->p_lock);
 	mutex_enter(&p->p_zone->zone_nlwps_lock);
 	/*
 	 * don't enforce rctl limits on system processes
 	 */
-	if (cid != syscid) {
+	if (!CLASS_KERNEL(cid)) {
 		if (p->p_task->tk_nlwps >= p->p_task->tk_nlwps_ctl)
 			if (rctl_test(rc_task_lwps, p->p_task->tk_rctls, p,
 			    1, 0) & RCT_DENY)
@@ -128,13 +150,26 @@
 	mutex_exit(&p->p_zone->zone_nlwps_lock);
 	mutex_exit(&p->p_lock);
 
-	if (curlwp == NULL || (stksize = curlwp->lwp_childstksz) == 0)
+	if (CLASS_KERNEL(cid)) {
+		curlwp = NULL;		/* don't inherit from curlwp */
 		stksize = lwp_default_stksize;
+	} else {
+		curlwp = ttolwp(curthread);
+		if (curlwp == NULL || (stksize = curlwp->lwp_childstksz) == 0)
+			stksize = lwp_default_stksize;
+	}
 
 	/*
-	 * Try to reclaim a <lwp,stack> from 'deathrow'
+	 * For system threads, we sleep for our swap reservation, and the
+	 * thread stack can't be swapped.
+	 *
+	 * Otherwise, try to reclaim a <lwp,stack> from 'deathrow'
 	 */
-	if (stksize == lwp_default_stksize) {
+	if (CLASS_KERNEL(cid)) {
+		lwpdata = (caddr_t)segkp_get(segkp, stksize,
+		    (KPD_NO_ANON | KPD_HASREDZONE | KPD_LOCKED));
+
+	} else if (stksize == lwp_default_stksize) {
 		if (lwp_reapcnt > 0) {
 			mutex_enter(&reaplock);
 			if ((t = lwp_deathrow) != NULL) {
@@ -434,11 +469,15 @@
 	kpreempt_disable();	/* can't grab cpu_lock here */
 
 	/*
-	 * Inherit processor and processor set bindings from curthread,
-	 * unless we're creating a new kernel process, in which case
-	 * clear all bindings.
+	 * Inherit processor and processor set bindings from curthread.
+	 *
+	 * For kernel LWPs, we do not inherit processor set bindings at
+	 * process creation time (i.e. when p != curproc).  After the
+	 * kernel process is created, any subsequent LWPs must be created
+	 * by threads in the kernel process, at which point we *will*
+	 * inherit processor set bindings.
 	 */
-	if (cid == syscid) {
+	if (CLASS_KERNEL(cid) && p != curproc) {
 		t->t_bind_cpu = binding = PBIND_NONE;
 		t->t_cpupart = oldpart = &cp_default;
 		t->t_bind_pset = PS_NONE;
@@ -658,6 +697,13 @@
 
 error:
 	if (err) {
+		if (CLASS_KERNEL(cid)) {
+			/*
+			 * This should only happen if a system process runs
+			 * out of lwpids, which shouldn't occur.
+			 */
+			panic("Failed to create a system LWP");
+		}
 		/*
 		 * We have failed to create an lwp, so decrement the number
 		 * of lwps in the task and let the lgroup load averages know
--- a/usr/src/uts/common/os/main.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/os/main.c	Mon Nov 23 15:29:44 2009 -0800
@@ -450,6 +450,12 @@
 	(void) spl0();
 	interrupts_unleashed = 1;
 
+	/*
+	 * Create kmem cache for proc structures
+	 */
+	process_cache = kmem_cache_create("process_cache", sizeof (proc_t),
+	    0, NULL, NULL, NULL, NULL, NULL, 0);
+
 	vfs_mountroot();	/* Mount the root file system */
 	errorq_init();		/* after vfs_mountroot() so DDI root is ready */
 	cpu_kstat_init(CPU);	/* after vfs_mountroot() so TOD is valid */
@@ -500,12 +506,6 @@
 	setupclock(0);
 
 	/*
-	 * Create kmem cache for proc structures
-	 */
-	process_cache = kmem_cache_create("process_cache", sizeof (proc_t),
-	    0, NULL, NULL, NULL, NULL, NULL, 0);
-
-	/*
 	 * Initialize process 0's lwp directory and lwpid hash table.
 	 */
 	p->p_lwpdir = p->p_lwpfree = p0_lwpdir;
@@ -576,24 +576,33 @@
 
 	/*
 	 * Make init process; enter scheduling loop with system process.
+	 *
+	 * Note that we manually assign the pids for these processes, for
+	 * historical reasons.  If more pre-assigned pids are needed,
+	 * FAMOUS_PIDS will have to be updated.
 	 */
 
 	/* create init process */
-	if (newproc(start_init, NULL, defaultcid, 59, NULL))
+	if (newproc(start_init, NULL, defaultcid, 59, NULL,
+	    FAMOUS_PID_INIT))
 		panic("main: unable to fork init.");
 
 	/* create pageout daemon */
-	if (newproc(pageout, NULL, syscid, maxclsyspri - 1, NULL))
+	if (newproc(pageout, NULL, syscid, maxclsyspri - 1, NULL,
+	    FAMOUS_PID_PAGEOUT))
 		panic("main: unable to fork pageout()");
 
 	/* create fsflush daemon */
-	if (newproc(fsflush, NULL, syscid, minclsyspri, NULL))
+	if (newproc(fsflush, NULL, syscid, minclsyspri, NULL,
+	    FAMOUS_PID_FSFLUSH))
 		panic("main: unable to fork fsflush()");
 
 	/* create cluster process if we're a member of one */
 	if (cluster_bootflags & CLUSTER_BOOTED) {
-		if (newproc(cluster_wrapper, NULL, syscid, minclsyspri, NULL))
+		if (newproc(cluster_wrapper, NULL, syscid, minclsyspri,
+		    NULL, 0)) {
 			panic("main: unable to fork cluster()");
+		}
 	}
 
 	/*
--- a/usr/src/uts/common/os/mem_cage.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/os/mem_cage.c	Mon Nov 23 15:29:44 2009 -0800
@@ -1162,9 +1162,8 @@
 kcage_cageout_init()
 {
 	if (kcage_on) {
-
-		(void) thread_create(NULL, 0, kcage_cageout,
-		    NULL, 0, proc_pageout, TS_RUN, maxclsyspri - 1);
+		(void) lwp_kernel_create(proc_pageout, kcage_cageout, NULL,
+		    TS_RUN, maxclsyspri - 1);
 	}
 }
 
--- a/usr/src/uts/common/os/msacct.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/os/msacct.c	Mon Nov 23 15:29:44 2009 -0800
@@ -19,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -244,6 +242,7 @@
 {
 	hrtime_t aggr_time;
 	hrtime_t now;
+	hrtime_t waitrq;
 	hrtime_t state_start;
 	struct mstate *ms;
 	klwp_t *lwp;
@@ -255,6 +254,7 @@
 		return (0);
 
 	mstate = t->t_mstate;
+	waitrq = t->t_waitrq;
 	ms = &lwp->lwp_mstate;
 	state_start = ms->ms_state_start;
 
@@ -267,9 +267,15 @@
 	 * NOTE: gethrtime_unscaled on X86 taken on different CPUs is
 	 * inconsistent, so it is possible that now < state_start.
 	 */
-	if ((mstate == LMS_USER || mstate == LMS_SYSTEM ||
-		mstate == LMS_TRAP) && (now > state_start)) {
-			aggr_time += now - state_start;
+	if (mstate == LMS_USER || mstate == LMS_SYSTEM || mstate == LMS_TRAP) {
+		/* if waitrq is zero, count all of the time. */
+		if (waitrq == 0) {
+			waitrq = now;
+		}
+
+		if (waitrq > state_start) {
+			aggr_time += waitrq - state_start;
+		}
 	}
 
 	scalehrtime(&aggr_time);
@@ -277,6 +283,65 @@
 }
 
 /*
+ * Return the amount of onproc and runnable time this thread has experienced.
+ *
+ * Because the fields we read are not protected by locks when updated
+ * by the thread itself, this is an inherently racy interface.  In
+ * particular, the ASSERT(THREAD_LOCK_HELD(t)) doesn't guarantee as much
+ * as it might appear to.
+ *
+ * The implication for users of this interface is that onproc and runnable
+ * are *NOT* monotonically increasing; they may temporarily be larger than
+ * they should be.
+ */
+void
+mstate_systhread_times(kthread_t *t, hrtime_t *onproc, hrtime_t *runnable)
+{
+	struct mstate	*const	ms = &ttolwp(t)->lwp_mstate;
+
+	int		mstate;
+	hrtime_t	now;
+	hrtime_t	state_start;
+	hrtime_t	waitrq;
+	hrtime_t	aggr_onp;
+	hrtime_t	aggr_run;
+
+	ASSERT(THREAD_LOCK_HELD(t));
+	ASSERT(t->t_procp->p_flag & SSYS);
+	ASSERT(ttolwp(t) != NULL);
+
+	/* shouldn't be any non-SYSTEM on-CPU time */
+	ASSERT(ms->ms_acct[LMS_USER] == 0);
+	ASSERT(ms->ms_acct[LMS_TRAP] == 0);
+
+	mstate = t->t_mstate;
+	waitrq = t->t_waitrq;
+	state_start = ms->ms_state_start;
+
+	aggr_onp = ms->ms_acct[LMS_SYSTEM];
+	aggr_run = ms->ms_acct[LMS_WAIT_CPU];
+
+	now = gethrtime_unscaled();
+
+	/* if waitrq == 0, then there is no time to account to TS_RUN */
+	if (waitrq == 0)
+		waitrq = now;
+
+	/* If there is system time to accumulate, do so */
+	if (mstate == LMS_SYSTEM && state_start < waitrq)
+		aggr_onp += waitrq - state_start;
+
+	if (waitrq < now)
+		aggr_run += now - waitrq;
+
+	scalehrtime(&aggr_onp);
+	scalehrtime(&aggr_run);
+
+	*onproc = aggr_onp;
+	*runnable = aggr_run;
+}
+
+/*
  * Return an aggregation of microstate times in scaled nanoseconds (high-res
  * time).  This keeps in mind that p_acct is already scaled, and ms_acct is
  * not.
--- a/usr/src/uts/common/os/pid.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/os/pid.c	Mon Nov 23 15:29:44 2009 -0800
@@ -20,16 +20,13 @@
  */
 
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
 /*	  All Rights Reserved  	*/
 
-
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/sysmacros.h>
@@ -94,7 +91,7 @@
 static kmutex_t	pidlinklock;
 static struct pid **pidhash;
 static pid_t minpid;
-static pid_t mpid;
+static pid_t mpid = FAMOUS_PIDS;	/* one more than the last famous pid */
 static union procent *procdir;
 static union procent *procentfree;
 
@@ -132,7 +129,7 @@
 	if (jump_pid && jump_pid > mpid)
 		minpid = mpid = jump_pid;
 	else
-		minpid = mpid + 1;
+		minpid = mpid;
 }
 
 /*
@@ -171,7 +168,7 @@
  * pid_allocate() returns the new pid on success, -1 on failure.
  */
 pid_t
-pid_allocate(proc_t *prp, int flags)
+pid_allocate(proc_t *prp, pid_t pid, int flags)
 {
 	struct pid *pidp;
 	union procent *pep;
@@ -187,17 +184,31 @@
 		goto failed;
 	}
 
-	/*
-	 * Allocate a pid
-	 */
-	startpid = mpid;
-	do  {
-		newpid = (++mpid == maxpid ? mpid = minpid : mpid);
-	} while (pid_lookup(newpid) && newpid != startpid);
+	if (pid != 0) {
+		VERIFY(minpid == 0);
+		VERIFY3P(pid, <, mpid);
+		VERIFY3P(pid_lookup(pid), ==, NULL);
+		newpid = pid;
+	} else {
+		/*
+		 * Allocate a pid
+		 */
+		ASSERT(minpid <= mpid && mpid <= maxpid);
 
-	if (newpid == startpid && pid_lookup(newpid)) {
-		/* couldn't find a free pid */
-		goto failed;
+		startpid = mpid;
+		for (;;) {
+			newpid = mpid;
+			if (mpid >= maxpid)
+				mpid = minpid;
+			else
+				mpid++;
+
+			if (pid_lookup(newpid) == NULL)
+				break;
+
+			if (mpid == startpid)
+				goto failed;
+		}
 	}
 
 	/*
--- a/usr/src/uts/common/os/sig.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/os/sig.c	Mon Nov 23 15:29:44 2009 -0800
@@ -217,7 +217,8 @@
 
 	ASSERT(MUTEX_HELD(&p->p_lock));
 
-	if (sig <= 0 || sig >= NSIG)
+	/* System processes don't get signals */
+	if (sig <= 0 || sig >= NSIG || (p->p_flag & SSYS))
 		return;
 
 	/*
--- a/usr/src/uts/common/os/taskq.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/os/taskq.c	Mon Nov 23 15:29:44 2009 -0800
@@ -55,7 +55,7 @@
  *	the same list managed by the same thread.
  *
  * (3) Some tasks may block for a long time, and this should not block other
- * 	tasks in the queue.
+ *	tasks in the queue.
  *
  * To provide useful service in such cases we define a "dynamic task queue"
  * which has an individual thread for each of the tasks. These threads are
@@ -74,7 +74,7 @@
  *
  * INTERFACES ==================================================================
  *
- * taskq_t *taskq_create(name, nthreads, pri_t pri, minalloc, maxall, flags);
+ * taskq_t *taskq_create(name, nthreads, pri, minalloc, maxall, flags);
  *
  *	Create a taskq with specified properties.
  *	Possible 'flags':
@@ -123,6 +123,25 @@
  *	The 'pri' field specifies the default priority for the threads that
  *	service all scheduled tasks.
  *
+ * taskq_t *taskq_create_instance(name, instance, nthreads, pri, minalloc,
+ *    maxall, flags);
+ *
+ *	Like taskq_create(), but takes an instance number (or -1 to indicate
+ *	no instance).
+ *
+ * taskq_t *taskq_create_proc(name, nthreads, pri, minalloc, maxall, proc,
+ *    flags);
+ *
+ *	Like taskq_create(), but creates the taskq threads in the specified
+ *	system process.  If proc != &p0, this must be called from a thread
+ *	in that process.
+ *
+ * taskq_t *taskq_create_sysdc(name, nthreads, minalloc, maxall, proc,
+ *    dc, flags);
+ *
+ *	Like taskq_create_proc(), but the taskq threads will use the
+ *	System Duty Cycle (SDC) scheduling class with a duty cycle of dc.
+ *
  * void taskq_destroy(tap):
  *
  *	Waits for any scheduled tasks to complete, then destroys the taskq.
@@ -147,7 +166,7 @@
  *
  *	  TQ_NOQUEUE: Do not enqueue a task if it can't dispatch it due to
  *		lack of available resources and fail. If this flag is not
- * 		set, and the task pool is exhausted, the task may be scheduled
+ *		set, and the task pool is exhausted, the task may be scheduled
  *		in the backing queue. This flag may ONLY be used with dynamic
  *		task queues.
  *
@@ -156,9 +175,11 @@
  *		Enqueueing dependent tasks may create deadlocks.
  *
  *	  TQ_SLEEP:   May block waiting for resources. May still fail for
- * 		dynamic task queues if TQ_NOQUEUE is also specified, otherwise
+ *		dynamic task queues if TQ_NOQUEUE is also specified, otherwise
  *		always succeed.
  *
+ *	  TQ_FRONT:   Puts the new task at the front of the queue.  Be careful.
+ *
  *	NOTE: Dynamic task queues are much more likely to fail in
  *		taskq_dispatch() (especially if TQ_NOQUEUE was specified), so it
  *		is important to have backup strategies handling such failures.
@@ -234,7 +255,7 @@
  *   +-------------+  |
  *                    |   DYNAMIC TASK QUEUES:
  *                    |
- *                    +-> taskq_bucket[nCPU]       	taskq_bucket_dispatch()
+ *                    +-> taskq_bucket[nCPU]		taskq_bucket_dispatch()
  *                        +-------------------+                    ^
  *                   +--->| tqbucket_lock     |                    |
  *                   |    +-------------------+   +--------+      +--------+
@@ -249,7 +270,7 @@
  *                   |    +-------------------+<--+--------+<--...+--------+
  *                   |    | ...               |   | thread |      | thread |
  *                   |    +-------------------+   +--------+      +--------+
- *		     +---> 	...
+ *		     +--->	...
  *
  *
  * Task queues use tq_task field to link new entry in the queue. The queue is a
@@ -283,8 +304,8 @@
  *
  * During creation, tq_nthreads and tq_active are set to 0, and
  * tq_nthreads_target is set to the number of threads desired.  The
- * TASKQ_CHANGING flag is set, and taskq_create_thread() is called to
- * create the first thread. taskq_create_thread() increments tq_active,
+ * TASKQ_CHANGING flag is set, and taskq_thread_create() is called to
+ * create the first thread. taskq_thread_create() increments tq_active,
  * sets TASKQ_THREAD_CREATED, and creates the new thread.
  *
  * Each thread starts in taskq_thread(), clears the TASKQ_THREAD_CREATED
@@ -451,13 +472,16 @@
 #include <sys/kmem.h>
 #include <sys/vmem.h>
 #include <sys/callb.h>
+#include <sys/class.h>
 #include <sys/systm.h>
 #include <sys/cmn_err.h>
 #include <sys/debug.h>
 #include <sys/vmsystm.h>	/* For throttlefree */
 #include <sys/sysmacros.h>
 #include <sys/cpuvar.h>
+#include <sys/cpupart.h>
 #include <sys/sdt.h>
+#include <sys/sysdc.h>
 #include <sys/note.h>
 
 static kmem_cache_t *taskq_ent_cache, *taskq_cache;
@@ -472,7 +496,7 @@
 
 /*
  * Maximum number of entries in global system taskq is
- * 	system_taskq_size * max_ncpus
+ *	system_taskq_size * max_ncpus
  */
 #define	SYSTEM_TASKQ_SIZE 64
 int system_taskq_size = SYSTEM_TASKQ_SIZE;
@@ -483,6 +507,14 @@
  */
 int taskq_minimum_nthreads_max = 1;
 
+/*
+ * We want to ensure that when taskq_create() returns, there is at least
+ * one thread ready to handle requests.  To guarantee this, we have to wait
+ * for the second thread, since the first one cannot process requests until
+ * the second thread has been created.
+ */
+#define	TASKQ_CREATE_ACTIVE_THREADS	2
+
 /* Maximum percentage allowed for TASKQ_THREADS_CPU_PCT */
 #define	TASKQ_CPUPCT_MAX_PERCENT	1000
 int taskq_cpupct_max_percent = TASKQ_CPUPCT_MAX_PERCENT;
@@ -522,7 +554,7 @@
  * Static functions.
  */
 static taskq_t	*taskq_create_common(const char *, int, int, pri_t, int,
-    int, uint_t);
+    int, proc_t *, uint_t, uint_t);
 static void taskq_thread(void *);
 static void taskq_d_thread(taskq_ent_t *);
 static void taskq_bucket_extend(void *);
@@ -539,6 +571,7 @@
  * Task queues kstats.
  */
 struct taskq_kstat {
+	kstat_named_t	tq_pid;
 	kstat_named_t	tq_tasks;
 	kstat_named_t	tq_executed;
 	kstat_named_t	tq_maxtasks;
@@ -548,6 +581,7 @@
 	kstat_named_t	tq_pri;
 	kstat_named_t	tq_nthreads;
 } taskq_kstat = {
+	{ "pid",		KSTAT_DATA_UINT64 },
 	{ "tasks",		KSTAT_DATA_UINT64 },
 	{ "executed",		KSTAT_DATA_UINT64 },
 	{ "maxtasks",		KSTAT_DATA_UINT64 },
@@ -604,16 +638,9 @@
 static int taskq_d_kstat_update(kstat_t *, int);
 
 /*
- * State for THREAD_CPU_PCT management
+ * List of all TASKQ_THREADS_CPU_PCT taskqs.
  */
-typedef struct taskq_cpupct_ent {
-	list_node_t	tp_link;
-	taskq_t		*tp_taskq;
-} taskq_cpupct_ent_t;
-
-static kmutex_t taskq_cpupct_lock;
-static list_t taskq_cpupct_list;
-static int taskq_cpupct_ncpus_online;
+static list_t taskq_cpupct_list;	/* protected by cpu_lock */
 
 /*
  * Collect per-bucket statistic when TASKQ_STATISTIC is defined.
@@ -678,22 +705,42 @@
 	tqe->tqent_next->tqent_prev = tqe;			\
 	tqe->tqent_prev->tqent_next = tqe;			\
 }
+/*
+ * Prepend 'tqe' to the beginning of l
+ */
+#define	TQ_PREPEND(l, tqe) {					\
+	tqe->tqent_next = l.tqent_next;				\
+	tqe->tqent_prev = &l;					\
+	tqe->tqent_next->tqent_prev = tqe;			\
+	tqe->tqent_prev->tqent_next = tqe;			\
+}
 
 /*
  * Schedule a task specified by func and arg into the task queue entry tqe.
  */
-#define	TQ_ENQUEUE(tq, tqe, func, arg) {			\
-	ASSERT(MUTEX_HELD(&tq->tq_lock));			\
-	TQ_APPEND(tq->tq_task, tqe);				\
-	tqe->tqent_func = (func);				\
-	tqe->tqent_arg = (arg);					\
-	tq->tq_tasks++;						\
-	if (tq->tq_tasks - tq->tq_executed > tq->tq_maxtasks)	\
+#define	TQ_DO_ENQUEUE(tq, tqe, func, arg, front) {			\
+	ASSERT(MUTEX_HELD(&tq->tq_lock));				\
+	_NOTE(CONSTCOND)						\
+	if (front) {							\
+		TQ_PREPEND(tq->tq_task, tqe);				\
+	} else {							\
+		TQ_APPEND(tq->tq_task, tqe);				\
+	}								\
+	tqe->tqent_func = (func);					\
+	tqe->tqent_arg = (arg);						\
+	tq->tq_tasks++;							\
+	if (tq->tq_tasks - tq->tq_executed > tq->tq_maxtasks)		\
 		tq->tq_maxtasks = tq->tq_tasks - tq->tq_executed;	\
-	cv_signal(&tq->tq_dispatch_cv);				\
+	cv_signal(&tq->tq_dispatch_cv);					\
 	DTRACE_PROBE2(taskq__enqueue, taskq_t *, tq, taskq_ent_t *, tqe); \
 }
 
+#define	TQ_ENQUEUE(tq, tqe, func, arg)					\
+	TQ_DO_ENQUEUE(tq, tqe, func, arg, 0)
+
+#define	TQ_ENQUEUE_FRONT(tq, tqe, func, arg)				\
+	TQ_DO_ENQUEUE(tq, tqe, func, arg, 1)
+
 /*
  * Do-nothing task which may be used to prepopulate thread caches.
  */
@@ -703,7 +750,6 @@
 {
 }
 
-
 /*ARGSUSED*/
 static int
 taskq_constructor(void *buf, void *cdrarg, int kmflags)
@@ -776,51 +822,97 @@
 	    (void *)1, INT32_MAX, 1, NULL, NULL, NULL, 0,
 	    VM_SLEEP | VMC_IDENTIFIER);
 
-	list_create(&taskq_cpupct_list, sizeof (taskq_cpupct_ent_t),
-	    offsetof(taskq_cpupct_ent_t, tp_link));
+	list_create(&taskq_cpupct_list, sizeof (taskq_t),
+	    offsetof(taskq_t, tq_cpupct_link));
+}
+
+static void
+taskq_update_nthreads(taskq_t *tq, uint_t ncpus)
+{
+	uint_t newtarget = TASKQ_THREADS_PCT(ncpus, tq->tq_threads_ncpus_pct);
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+	ASSERT(MUTEX_HELD(&tq->tq_lock));
+
+	/* We must be going from non-zero to non-zero; no exiting. */
+	ASSERT3U(tq->tq_nthreads_target, !=, 0);
+	ASSERT3U(newtarget, !=, 0);
+
+	ASSERT3U(newtarget, <=, tq->tq_nthreads_max);
+	if (newtarget != tq->tq_nthreads_target) {
+		tq->tq_flags |= TASKQ_CHANGING;
+		tq->tq_nthreads_target = newtarget;
+		cv_broadcast(&tq->tq_dispatch_cv);
+		cv_broadcast(&tq->tq_exit_cv);
+	}
+}
+
+/* called during task queue creation */
+static void
+taskq_cpupct_install(taskq_t *tq, cpupart_t *cpup)
+{
+	ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT);
+
+	mutex_enter(&cpu_lock);
+	mutex_enter(&tq->tq_lock);
+	tq->tq_cpupart = cpup->cp_id;
+	taskq_update_nthreads(tq, cpup->cp_ncpus);
+	mutex_exit(&tq->tq_lock);
+
+	list_insert_tail(&taskq_cpupct_list, tq);
+	mutex_exit(&cpu_lock);
+}
+
+static void
+taskq_cpupct_remove(taskq_t *tq)
+{
+	ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT);
+
+	mutex_enter(&cpu_lock);
+	list_remove(&taskq_cpupct_list, tq);
+	mutex_exit(&cpu_lock);
 }
 
 /*ARGSUSED*/
 static int
 taskq_cpu_setup(cpu_setup_t what, int id, void *arg)
 {
-	taskq_cpupct_ent_t *tpp;
-	int cpus_online = ncpus_online;
+	taskq_t *tq;
+	cpupart_t *cp = cpu[id]->cpu_part;
+	uint_t ncpus = cp->cp_ncpus;
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+	ASSERT(ncpus > 0);
 
-	/* offlines are called *before* the cpu is offlined. */
-	if (what == CPU_OFF)
-		cpus_online--;
-	if (cpus_online < 1)
-		cpus_online = 1;
+	switch (what) {
+	case CPU_OFF:
+	case CPU_CPUPART_OUT:
+		/* offlines are called *before* the cpu is offlined. */
+		if (ncpus > 1)
+			ncpus--;
+		break;
 
-	mutex_enter(&taskq_cpupct_lock);
-	if (cpus_online == taskq_cpupct_ncpus_online) {
-		mutex_exit(&taskq_cpupct_lock);
-		return (0);
+	case CPU_ON:
+	case CPU_CPUPART_IN:
+		break;
+
+	default:
+		return (0);		/* doesn't affect cpu count */
 	}
 
-	for (tpp = list_head(&taskq_cpupct_list); tpp != NULL;
-	    tpp = list_next(&taskq_cpupct_list, tpp)) {
-		taskq_t *tq = tpp->tp_taskq;
-		int newtarget;
+	for (tq = list_head(&taskq_cpupct_list); tq != NULL;
+	    tq = list_next(&taskq_cpupct_list, tq)) {
 
 		mutex_enter(&tq->tq_lock);
-		newtarget =
-		    TASKQ_THREADS_PCT(cpus_online, tq->tq_threads_ncpus_pct);
-		ASSERT3S(newtarget, <=, tq->tq_nthreads_max);
-		if (newtarget != tq->tq_nthreads_target) {
-			/* The taskq must not be exiting */
-			ASSERT3S(tq->tq_nthreads_target, !=, 0);
-			tq->tq_flags |= TASKQ_CHANGING;
-			tq->tq_nthreads_target = newtarget;
-			cv_broadcast(&tq->tq_dispatch_cv);
-			cv_broadcast(&tq->tq_exit_cv);
+		/*
+		 * If the taskq is part of the cpu partition that is changing,
+		 * update its nthreads_target.
+		 */
+		if (tq->tq_cpupart == cp->cp_id) {
+			taskq_update_nthreads(tq, ncpus);
 		}
 		mutex_exit(&tq->tq_lock);
 	}
-
-	taskq_cpupct_ncpus_online = cpus_online;
-	mutex_exit(&taskq_cpupct_lock);
 	return (0);
 }
 
@@ -829,7 +921,11 @@
 {
 	mutex_enter(&cpu_lock);
 	register_cpu_setup_func(taskq_cpu_setup, NULL);
-	(void) taskq_cpu_setup(CPU_ON, 0, NULL);
+	/*
+	 * Make sure we're up to date.  At this point in boot, there is only
+	 * one processor set, so we only have to update the current CPU.
+	 */
+	(void) taskq_cpu_setup(CPU_ON, CPU->cpu_id, NULL);
 	mutex_exit(&cpu_lock);
 }
 
@@ -840,7 +936,7 @@
 system_taskq_init(void)
 {
 	system_taskq = taskq_create_common("system_taskq", 0,
-	    system_taskq_size * max_ncpus, minclsyspri, 4, 512,
+	    system_taskq_size * max_ncpus, minclsyspri, 4, 512, &p0, 0,
 	    TASKQ_DYNAMIC | TASKQ_PREPOPULATE);
 }
 
@@ -1007,7 +1103,11 @@
 			mutex_exit(&tq->tq_lock);
 			return (NULL);
 		}
-		TQ_ENQUEUE(tq, tqe, func, arg);
+		if (flags & TQ_FRONT) {
+			TQ_ENQUEUE_FRONT(tq, tqe, func, arg);
+		} else {
+			TQ_ENQUEUE(tq, tqe, func, arg);
+		}
 		mutex_exit(&tq->tq_lock);
 		return ((taskqid_t)tqe);
 	}
@@ -1015,7 +1115,7 @@
 	/*
 	 * Dynamic taskq dispatching.
 	 */
-	ASSERT(!(flags & TQ_NOALLOC));
+	ASSERT(!(flags & (TQ_NOALLOC | TQ_FRONT)));
 	TASKQ_D_RANDOM_DISPATCH_FAILURE(tq, flags);
 
 	bsize = tq->tq_nbuckets;
@@ -1105,8 +1205,7 @@
 	 */
 	mutex_enter(&tq->tq_lock);
 	if ((tqe1 = taskq_ent_alloc(tq, TQ_NOSLEEP)) != NULL) {
-		TQ_ENQUEUE(tq, tqe1, taskq_bucket_extend,
-		    bucket);
+		TQ_ENQUEUE(tq, tqe1, taskq_bucket_extend, bucket);
 	} else {
 		TQ_STAT(bucket, tqs_nomem);
 	}
@@ -1223,19 +1322,58 @@
 	return (thread->t_taskq == tq);
 }
 
+/*
+ * Creates a thread in the taskq.  We only allow one outstanding create at
+ * a time.  We drop and reacquire the tq_lock in order to avoid blocking other
+ * taskq activity while thread_create() or lwp_kernel_create() run.
+ *
+ * The first time we're called, we do some additional setup, and do not
+ * return until there are enough threads to start servicing requests.
+ */
 static void
 taskq_thread_create(taskq_t *tq)
 {
-	kthread_t *t;
+	kthread_t	*t;
+	const boolean_t	first = (tq->tq_nthreads == 0);
 
 	ASSERT(MUTEX_HELD(&tq->tq_lock));
+	ASSERT(tq->tq_flags & TASKQ_CHANGING);
+	ASSERT(tq->tq_nthreads < tq->tq_nthreads_target);
 	ASSERT(!(tq->tq_flags & TASKQ_THREAD_CREATED));
 
+
 	tq->tq_flags |= TASKQ_THREAD_CREATED;
 	tq->tq_active++;
-	t = thread_create(NULL, 0, taskq_thread, tq, 0, &p0, TS_RUN,
-	    tq->tq_pri);
-	t->t_taskq = tq;
+	mutex_exit(&tq->tq_lock);
+
+	if (tq->tq_proc != &p0) {
+		t = lwp_kernel_create(tq->tq_proc, taskq_thread, tq, TS_RUN,
+		    tq->tq_pri);
+	} else {
+		t = thread_create(NULL, 0, taskq_thread, tq, 0, &p0, TS_RUN,
+		    tq->tq_pri);
+	}
+
+	if (!first) {
+		mutex_enter(&tq->tq_lock);
+		return;
+	}
+
+	/*
+	 * We know the thread cannot go away, since tq cannot be
+	 * destroyed until creation has completed.  We can therefore
+	 * safely dereference t.
+	 */
+	if (tq->tq_flags & TASKQ_THREADS_CPU_PCT) {
+		taskq_cpupct_install(tq, t->t_cpupart);
+	}
+	mutex_enter(&tq->tq_lock);
+
+	/* Wait until we can service requests. */
+	while (tq->tq_nthreads != tq->tq_nthreads_target &&
+	    tq->tq_nthreads < TASKQ_CREATE_ACTIVE_THREADS) {
+		cv_wait(&tq->tq_wait_cv, &tq->tq_lock);
+	}
 }
 
 /*
@@ -1276,6 +1414,13 @@
 	callb_cpr_t cprinfo;
 	hrtime_t start, end;
 
+	curthread->t_taskq = tq;	/* mark ourselves for taskq_member() */
+
+	if (curproc != &p0 && (tq->tq_flags & TASKQ_DUTY_CYCLE)) {
+		sysdc_thread_enter(curthread, tq->tq_DC,
+		    (tq->tq_flags & TASKQ_DC_BATCH) ? SYSDC_THREAD_BATCH : 0);
+	}
+
 	if (tq->tq_flags & TASKQ_CPR_SAFE) {
 		CALLB_CPR_INIT_SAFE(curthread, tq->tq_name);
 	} else {
@@ -1285,6 +1430,7 @@
 	mutex_enter(&tq->tq_lock);
 	thread_id = ++tq->tq_nthreads;
 	ASSERT(tq->tq_flags & TASKQ_THREAD_CREATED);
+	ASSERT(tq->tq_flags & TASKQ_CHANGING);
 	tq->tq_flags &= ~TASKQ_THREAD_CREATED;
 
 	VERIFY3S(thread_id, <=, tq->tq_nthreads_max);
@@ -1294,20 +1440,13 @@
 	else
 		tq->tq_threadlist[thread_id - 1] = curthread;
 
+	/* Allow taskq_create_common()'s taskq_thread_create() to return. */
+	if (tq->tq_nthreads == TASKQ_CREATE_ACTIVE_THREADS)
+		cv_broadcast(&tq->tq_wait_cv);
+
 	for (;;) {
 		if (tq->tq_flags & TASKQ_CHANGING) {
-			/* we're done; clear the CHANGING flag */
-			if (tq->tq_nthreads == tq->tq_nthreads_target) {
-				tq->tq_flags &= ~TASKQ_CHANGING;
-				continue;
-			}
-			/* We're low on threads and none have been created */
-			if (tq->tq_nthreads < tq->tq_nthreads_target &&
-			    !(tq->tq_flags & TASKQ_THREAD_CREATED)) {
-				taskq_thread_create(tq);
-				continue;
-			}
-			/* We're no longer needed */
+			/* See if we're no longer needed */
 			if (thread_id > tq->tq_nthreads_target) {
 				/*
 				 * To preserve the one-to-one mapping between
@@ -1329,6 +1468,23 @@
 				    &tq->tq_exit_cv, &cprinfo, -1);
 				continue;
 			}
+
+			/*
+			 * If no thread is starting taskq_thread(), we can
+			 * do some bookkeeping.
+			 */
+			if (!(tq->tq_flags & TASKQ_THREAD_CREATED)) {
+				/* Check if we've reached our target */
+				if (tq->tq_nthreads == tq->tq_nthreads_target) {
+					tq->tq_flags &= ~TASKQ_CHANGING;
+					cv_broadcast(&tq->tq_wait_cv);
+				}
+				/* Check if we need to create a thread */
+				if (tq->tq_nthreads < tq->tq_nthreads_target) {
+					taskq_thread_create(tq);
+					continue; /* tq_lock was dropped */
+				}
+			}
 		}
 		if ((tqe = tq->tq_task.tqent_next) == &tq->tq_task) {
 			if (--tq->tq_active == 0)
@@ -1338,6 +1494,7 @@
 			tq->tq_active++;
 			continue;
 		}
+
 		tqe->tqent_prev->tqent_next = tqe->tqent_next;
 		tqe->tqent_next->tqent_prev = tqe->tqent_prev;
 		mutex_exit(&tq->tq_lock);
@@ -1364,19 +1521,30 @@
 	else
 		tq->tq_threadlist[thread_id - 1] = NULL;
 
-	ASSERT(tq->tq_nthreads > 0);
-	if (--tq->tq_nthreads == 0)
-		cv_broadcast(&tq->tq_wait_cv);
-
-	/* let the other threads which need to exit know we're done */
-	cv_broadcast(&tq->tq_exit_cv);
-
 	/* We're exiting, and therefore no longer active */
+	ASSERT(tq->tq_active > 0);
 	tq->tq_active--;
 
+	ASSERT(tq->tq_nthreads > 0);
+	tq->tq_nthreads--;
+
+	/* Wake up anyone waiting for us to exit */
+	cv_broadcast(&tq->tq_exit_cv);
+	if (tq->tq_nthreads == tq->tq_nthreads_target) {
+		if (!(tq->tq_flags & TASKQ_THREAD_CREATED))
+			tq->tq_flags &= ~TASKQ_CHANGING;
+
+		cv_broadcast(&tq->tq_wait_cv);
+	}
+
 	ASSERT(!(tq->tq_flags & TASKQ_CPR_SAFE));
-	CALLB_CPR_EXIT(&cprinfo);
-	thread_exit();
+	CALLB_CPR_EXIT(&cprinfo);		/* drops tq->tq_lock */
+	if (curthread->t_lwp != NULL) {
+		mutex_enter(&curproc->p_lock);
+		lwp_exit();
+	} else {
+		thread_exit();
+	}
 }
 
 /*
@@ -1522,8 +1690,10 @@
 taskq_create(const char *name, int nthreads, pri_t pri, int minalloc,
     int maxalloc, uint_t flags)
 {
-	return taskq_create_common(name, 0, nthreads, pri, minalloc,
-	    maxalloc, flags | TASKQ_NOINSTANCE);
+	ASSERT((flags & ~TASKQ_INTERFACE_FLAGS) == 0);
+
+	return (taskq_create_common(name, 0, nthreads, pri, minalloc,
+	    maxalloc, &p0, 0, flags | TASKQ_NOINSTANCE));
 }
 
 /*
@@ -1539,6 +1709,7 @@
 taskq_create_instance(const char *name, int instance, int nthreads, pri_t pri,
     int minalloc, int maxalloc, uint_t flags)
 {
+	ASSERT((flags & ~TASKQ_INTERFACE_FLAGS) == 0);
 	ASSERT((instance >= 0) || (instance == -1));
 
 	if (instance < 0) {
@@ -1546,12 +1717,36 @@
 	}
 
 	return (taskq_create_common(name, instance, nthreads,
-	    pri, minalloc, maxalloc, flags));
+	    pri, minalloc, maxalloc, &p0, 0, flags));
+}
+
+taskq_t *
+taskq_create_proc(const char *name, int nthreads, pri_t pri, int minalloc,
+    int maxalloc, proc_t *proc, uint_t flags)
+{
+	ASSERT((flags & ~TASKQ_INTERFACE_FLAGS) == 0);
+	ASSERT(proc->p_flag & SSYS);
+
+	return (taskq_create_common(name, 0, nthreads, pri, minalloc,
+	    maxalloc, proc, 0, flags | TASKQ_NOINSTANCE));
 }
 
+taskq_t *
+taskq_create_sysdc(const char *name, int nthreads, int minalloc,
+    int maxalloc, proc_t *proc, uint_t dc, uint_t flags)
+{
+	ASSERT((flags & ~TASKQ_INTERFACE_FLAGS) == 0);
+	ASSERT(proc->p_flag & SSYS);
+
+	return (taskq_create_common(name, 0, nthreads, minclsyspri, minalloc,
+	    maxalloc, proc, dc, flags | TASKQ_NOINSTANCE | TASKQ_DUTY_CYCLE));
+}
+
+#define	IMPLY(a, b)	ASSERT((!(a)) || (b)) /* if (a) { ASSERT (b) } */
+
 static taskq_t *
 taskq_create_common(const char *name, int instance, int nthreads, pri_t pri,
-    int minalloc, int maxalloc, uint_t flags)
+    int minalloc, int maxalloc, proc_t *proc, uint_t dc, uint_t flags)
 {
 	taskq_t *tq = kmem_cache_alloc(taskq_cache, KM_SLEEP);
 	uint_t ncpus = ((boot_max_ncpus == -1) ? max_ncpus : boot_max_ncpus);
@@ -1559,14 +1754,20 @@
 	int max_nthreads;
 
 	/*
-	 * TASKQ_DYNAMIC is incompatible with TASKQ_CPR_SAFE and
-	 * TASKQ_THREADS_CPU_PCT.
+	 * TASKQ_DYNAMIC, TASKQ_CPR_SAFE and TASKQ_THREADS_CPU_PCT are all
+	 * mutually incompatible.
 	 */
-	ASSERT(!(flags & TASKQ_DYNAMIC) ||
-	    !(flags & (TASKQ_CPR_SAFE | TASKQ_THREADS_CPU_PCT)));
-	/* TASKQ_CPR_SAFE is incompatible with TASKQ_THREADS_CPU_PCT */
+	IMPLY((flags & TASKQ_DYNAMIC), !(flags & TASKQ_CPR_SAFE));
+	IMPLY((flags & TASKQ_DYNAMIC), !(flags & TASKQ_THREADS_CPU_PCT));
+	IMPLY((flags & TASKQ_CPR_SAFE), !(flags & TASKQ_THREADS_CPU_PCT));
 
-	ASSERT(!(flags & TASKQ_CPR_SAFE) || !(flags & TASKQ_THREADS_CPU_PCT));
+	/* Cannot have DUTY_CYCLE without a non-p0 kernel process */
+	IMPLY((flags & TASKQ_DUTY_CYCLE), proc != &p0);
+
+	/* Cannot have DC_BATCH without DUTY_CYCLE */
+	ASSERT((flags & (TASKQ_DUTY_CYCLE|TASKQ_DC_BATCH)) != TASKQ_DC_BATCH);
+
+	ASSERT(proc != NULL);
 
 	bsize = 1 << (highbit(ncpus) - 1);
 	ASSERT(bsize >= 1);
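
IMPLY(a, b), defined above, reads as "if a holds, assert b". For instance, the duty-cycle check expands as in this sketch (not part of the patch):

	/* IMPLY((flags & TASKQ_DUTY_CYCLE), proc != &p0) expands to: */
	ASSERT(!(flags & TASKQ_DUTY_CYCLE) || proc != &p0);

	/* which checks the same condition as: */
	if (flags & TASKQ_DUTY_CYCLE)
		ASSERT(proc != &p0);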
@@ -1579,10 +1780,7 @@
 		/* For dynamic task queues use just one backup thread */
 		nthreads = max_nthreads = 1;
 
-	} else if (!(flags & TASKQ_THREADS_CPU_PCT)) {
-		ASSERT3S(nthreads, >=, 1);
-		max_nthreads = nthreads;
-	} else {
+	} else if (flags & TASKQ_THREADS_CPU_PCT) {
 		uint_t pct;
 		ASSERT3S(nthreads, >=, 0);
 		pct = nthreads;
@@ -1590,9 +1788,21 @@
 		if (pct > taskq_cpupct_max_percent)
 			pct = taskq_cpupct_max_percent;
 
+		/*
+		 * If you're using THREADS_CPU_PCT, the process for the
+		 * taskq threads must be curproc.  This allows any pset
+		 * binding to be inherited correctly.  If proc is &p0,
+		 * we won't be creating LWPs, so new threads will be assigned
+		 * to the default processor set.
+		 */
+		ASSERT(curproc == proc || proc == &p0);
 		tq->tq_threads_ncpus_pct = pct;
-		nthreads = TASKQ_THREADS_PCT(ncpus_online, pct);
+		nthreads = 1;		/* corrected in taskq_thread_create() */
 		max_nthreads = TASKQ_THREADS_PCT(max_ncpus, pct);
+
+	} else {
+		ASSERT3S(nthreads, >=, 1);
+		max_nthreads = nthreads;
 	}
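
For TASKQ_THREADS_CPU_PCT taskqs the thread target scales with the number of online CPUs. Assuming TASKQ_THREADS_PCT(ncpus, pct) works out to roughly (ncpus * pct) / 100 with a floor of one thread (its definition appears earlier in taskq.c and is not shown here), the sizing behaves as in this sketch with hypothetical numbers:

	/*
	 * A taskq created with nthreads = 75 and TASKQ_THREADS_CPU_PCT on a
	 * machine with max_ncpus = 64, of which 8 are currently online.
	 */
	uint_t pct = 75;

	/* hard ceiling, sized for the largest possible CPU count */
	uint_t max_nthreads = TASKQ_THREADS_PCT(64, pct);	/* 48 */

	/* current target, recomputed as CPUs come online or go offline */
	uint_t target = TASKQ_THREADS_PCT(8, pct);		/* 6 */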
 
 	if (max_nthreads < taskq_minimum_nthreads_max)
@@ -1613,34 +1823,26 @@
 	tq->tq_minalloc = minalloc;
 	tq->tq_maxalloc = maxalloc;
 	tq->tq_nbuckets = bsize;
+	tq->tq_proc = proc;
 	tq->tq_pri = pri;
+	tq->tq_DC = dc;
+	list_link_init(&tq->tq_cpupct_link);
 
 	if (max_nthreads > 1)
 		tq->tq_threadlist = kmem_alloc(
 		    sizeof (kthread_t *) * max_nthreads, KM_SLEEP);
 
-	/* Add the taskq to the list of CPU_PCT taskqs */
-	if (flags & TASKQ_THREADS_CPU_PCT) {
-		taskq_cpupct_ent_t *tpp = kmem_zalloc(sizeof (*tpp), KM_SLEEP);
-
-		list_link_init(&tpp->tp_link);
-		tpp->tp_taskq = tq;
-
-		mutex_enter(&taskq_cpupct_lock);
-		list_insert_tail(&taskq_cpupct_list, tpp);
-		/* reset our target, to avoid race conditions */
-		tq->tq_nthreads_target = TASKQ_THREADS_PCT(ncpus_online,
-		    tq->tq_threads_ncpus_pct);
-		mutex_exit(&taskq_cpupct_lock);
-	}
-
 	mutex_enter(&tq->tq_lock);
 	if (flags & TASKQ_PREPOPULATE) {
 		while (minalloc-- > 0)
 			taskq_ent_free(tq, taskq_ent_alloc(tq, TQ_SLEEP));
 	}
 
-	/* create the first thread; if more are needed, it'll create them */
+	/*
+	 * Create the first thread, which will create any other threads
+	 * necessary.  taskq_thread_create will not return until we have
+	 * enough threads to be able to process requests.
+	 */
 	taskq_thread_create(tq);
 	mutex_exit(&tq->tq_lock);
 
@@ -1669,7 +1871,7 @@
 	 * Install kstats.
 	 * We have two cases:
 	 *   1) Instance is provided to taskq_create_instance(). In this case it
-	 * 	should be >= 0 and we use it.
+	 *	should be >= 0 and we use it.
 	 *
 	 *   2) Instance is not provided and is automatically generated
 	 */
@@ -1740,20 +1942,7 @@
 	 * Unregister from the cpupct list.
 	 */
 	if (tq->tq_flags & TASKQ_THREADS_CPU_PCT) {
-		taskq_cpupct_ent_t *tpp;
-
-		mutex_enter(&taskq_cpupct_lock);
-		for (tpp = list_head(&taskq_cpupct_list); tpp != NULL;
-		    tpp = list_next(&taskq_cpupct_list, tpp)) {
-			if (tpp->tp_taskq == tq)
-				break;
-		}
-		ASSERT3P(tpp, !=, NULL);
-
-		list_remove(&taskq_cpupct_list, tpp);
-		mutex_exit(&taskq_cpupct_lock);
-
-		kmem_free(tpp, sizeof (*tpp));
+		taskq_cpupct_remove(tq);
 	}
 
 	/*
@@ -1926,6 +2115,7 @@
 	if (rw == KSTAT_WRITE)
 		return (EACCES);
 
+	tqsp->tq_pid.value.ui64 = tq->tq_proc->p_pid;
 	tqsp->tq_tasks.value.ui64 = tq->tq_tasks;
 	tqsp->tq_executed.value.ui64 = tq->tq_executed;
 	tqsp->tq_maxtasks.value.ui64 = tq->tq_maxtasks;
--- a/usr/src/uts/common/os/vm_pageout.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/os/vm_pageout.c	Mon Nov 23 15:29:44 2009 -0800
@@ -683,7 +683,10 @@
 		push_req[i].a_next = &push_req[i + 1];
 
 	pageout_pri = curthread->t_pri;
-	pageout_init(pageout_scanner, proc_pageout, pageout_pri - 1);
+
+	/* Create the pageout scanner thread. */
+	(void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
+	    pageout_pri - 1);
 
 	/*
 	 * kick off pageout scheduler.
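
The scanner used to be a bare kernel thread created through the machine-dependent pageout_init() (removed from vm_machdep.c and vm_dep.c below), so it had no LWP and was invisible to /proc-based tools. Creating it with lwp_kernel_create() gives it a real LWP inside proc_pageout; a side-by-side sketch of the old and new calls, the old one reconstructed from the removed pageout_init() bodies:

	/* old: anonymous kernel thread, no LWP */
	(void) thread_create(NULL, 0, pageout_scanner, NULL, 0, proc_pageout,
	    TS_RUN, pageout_pri - 1);

	/* new: a kernel LWP of the pageout process, visible in /proc */
	(void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
	    pageout_pri - 1);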
--- a/usr/src/uts/common/os/zone.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/os/zone.c	Mon Nov 23 15:29:44 2009 -0800
@@ -2203,7 +2203,7 @@
 	if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
 		return (err);	/* EFAULT or ENAMETOOLONG */
 
-	if (getcid(sched_class, &classid) != 0 || classid == syscid)
+	if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
 		return (set_errno(EINVAL));
 	zone->zone_defaultcid = classid;
 	ASSERT(zone->zone_defaultcid > 0 &&
@@ -3482,7 +3482,7 @@
 		 * will have to tear down the zone, and fail, or try again.
 		 */
 		if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
-		    minclsyspri - 1, &ct)) != 0) {
+		    minclsyspri - 1, &ct, 0)) != 0) {
 			mutex_enter(&zone_status_lock);
 			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
 			mutex_exit(&zone_status_lock);
@@ -4023,7 +4023,8 @@
 	 * and initialize zsched appropriately.  I'm not sure that that
 	 * makes much of a difference, though.
 	 */
-	if (error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL)) {
+	error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
+	if (error != 0) {
 		/*
 		 * We need to undo all globally visible state.
 		 */
--- a/usr/src/uts/common/sys/Makefile	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/sys/Makefile	Mon Nov 23 15:29:44 2009 -0800
@@ -114,7 +114,7 @@
 	conf.h			\
 	consdev.h		\
 	console.h		\
-        consplat.h              \
+	consplat.h		\
 	vt.h			\
 	vtdaemon.h		\
 	kd.h			\
@@ -143,7 +143,7 @@
 	cyclic_impl.h		\
 	dacf.h			\
 	dacf_impl.h		\
-	damap.h		\
+	damap.h			\
 	damap_impl.h		\
 	dc_ki.h			\
 	ddi.h			\
@@ -186,7 +186,7 @@
 	dls.h			\
 	dls_mgmt.h		\
 	dls_impl.h		\
-	dma_i8237A.h            \
+	dma_i8237A.h		\
 	dnlc.h			\
 	door.h			\
 	door_data.h		\
@@ -196,9 +196,9 @@
 	dumpadm.h		\
 	dumphdr.h		\
 	ecppsys.h		\
-	ecppio.h                \
-	ecppreg.h               \
-	ecppvar.h               \
+	ecppio.h		\
+	ecppreg.h		\
+	ecppvar.h		\
 	efi_partition.h		\
 	elf.h			\
 	elf_386.h		\
@@ -244,8 +244,8 @@
 	fss.h			\
 	fsspriocntl.h		\
 	fsid.h			\
-	fssnap.h                \
-	fssnap_if.h             \
+	fssnap.h		\
+	fssnap_if.h		\
 	fstyp.h			\
 	ftrace.h		\
 	fx.h			\
@@ -391,7 +391,7 @@
 	multidata_impl.h	\
 	mutex.h			\
 	nbmlock.h		\
-	ndifm.h		\
+	ndifm.h			\
 	ndi_impldefs.h		\
 	net80211.h		\
 	net80211_crypto.h	\
@@ -402,11 +402,11 @@
 	netstack.h		\
 	nexusdefs.h		\
 	note.h			\
-	nvpair.h                \
-	nvpair_impl.h           \
+	nvpair.h		\
+	nvpair_impl.h		\
 	objfs.h			\
 	objfs_impl.h		\
-	ontrap.h                \
+	ontrap.h		\
 	open.h			\
 	openpromio.h		\
 	panic.h			\
@@ -440,7 +440,7 @@
 	port_impl.h		\
 	port_kernel.h		\
 	portif.h		\
-	ppmio.h                 \
+	ppmio.h			\
 	pppt_ic_if.h		\
 	pppt_ioctl.h		\
 	priocntl.h		\
@@ -540,11 +540,13 @@
 	suntty.h		\
 	swap.h			\
 	synch.h			\
+	sysdc.h			\
+	sysdc_impl.h		\
 	syscall.h		\
 	sysconf.h		\
 	sysconfig.h		\
 	sysevent.h		\
-	sysevent_impl.h         \
+	sysevent_impl.h		\
 	sysinfo.h		\
 	syslog.h		\
 	sysmacros.h		\
@@ -737,7 +739,7 @@
 	idm_impl.h	\
 	idm_so.h	\
 	idm_text.h	\
-	idm_transport.h \
+	idm_transport.h	\
 	idm_conn_sm.h
 
 ISCSITHDRS=		\
--- a/usr/src/uts/common/sys/class.h	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/sys/class.h	Mon Nov 23 15:29:44 2009 -0800
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -30,8 +30,6 @@
 #ifndef _SYS_CLASS_H
 #define	_SYS_CLASS_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/t_lock.h>
 #include <sys/cred.h>
 #include <sys/thread.h>
@@ -119,6 +117,8 @@
 
 #ifdef	_KERNEL
 
+#define	CLASS_KERNEL(cid)	((cid) == syscid || (cid) == sysdccid)
+
 extern int	nclass;		/* number of configured scheduling classes */
 extern char	*defaultclass;	/* default class for newproc'd processes */
 extern struct sclass sclass[];	/* the class table */
@@ -127,6 +127,7 @@
 
 extern pri_t	minclsyspri;
 extern id_t	syscid;		/* system scheduling class ID */
+extern id_t	sysdccid;	/* system duty-cycle scheduling class ID */
 extern id_t	defaultcid;	/* "default" class id; see dispadmin(1M) */
 
 extern int	alloc_cid(char *, id_t *);
--- a/usr/src/uts/common/sys/debug.h	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/sys/debug.h	Mon Nov 23 15:29:44 2009 -0800
@@ -19,19 +19,19 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
 /*	  All Rights Reserved	*/
 
-
 #ifndef _SYS_DEBUG_H
 #define	_SYS_DEBUG_H
 
 #include <sys/isa_defs.h>
 #include <sys/types.h>
+#include <sys/note.h>
 
 #ifdef	__cplusplus
 extern "C" {
--- a/usr/src/uts/common/sys/param.h	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/sys/param.h	Mon Nov 23 15:29:44 2009 -0800
@@ -87,6 +87,12 @@
 #define	MAX_TASKID	999999
 #define	MAX_MAXPID	999999
 #define	MAXEPHUID	0xfffffffcu	/* max ephemeral user id */
+
+#define	FAMOUS_PID_SCHED	0
+#define	FAMOUS_PID_INIT		1
+#define	FAMOUS_PID_PAGEOUT	2
+#define	FAMOUS_PID_FSFLUSH	3
+#define	FAMOUS_PIDS		4
 #endif
 
 #ifdef DEBUG
@@ -95,7 +101,6 @@
 #else
 #define	DEFAULT_MAXPID	30000
 #define	DEFAULT_JUMPPID	0
-
 #endif
 
 #define	MAXUID		2147483647	/* max user id */
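
These constants reserve the traditional pids of the kernel's well-known processes: sched (0), init (1), pageout (2), and fsflush (3), with FAMOUS_PIDS as the count. Together with the new trailing pid_t argument to newproc() and pid_allocate() (see the proc.h changes below), where 0 appears to mean "allocate any free pid", they let startup code ask for a specific pid. A hedged sketch of how pageout might be forked onto its reserved pid:

	/* request the famous pid explicitly; passing 0 would mean any pid */
	if (newproc(pageout, NULL, syscid, minclsyspri, NULL,
	    FAMOUS_PID_PAGEOUT) != 0)
		panic("unable to fork pageout()");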
--- a/usr/src/uts/common/sys/proc.h	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/sys/proc.h	Mon Nov 23 15:29:44 2009 -0800
@@ -564,6 +564,9 @@
 /* pseudo-flag to lwp_create() */
 #define	NOCLASS	(-1)
 
+/* unused scheduling class ID */
+#define	CLASS_UNUSED	(-2)
+
 /* LWP stats updated via lwp_stats_update() */
 typedef enum {
 	LWP_STAT_INBLK,
@@ -580,7 +583,7 @@
 
 /* process management functions */
 
-extern int newproc(void (*)(), caddr_t, id_t, int, struct contract **);
+extern int newproc(void (*)(), caddr_t, id_t, int, struct contract **, pid_t);
 extern void vfwait(pid_t);
 extern void proc_detach(proc_t *);
 extern void freeproc(proc_t *);
@@ -620,7 +623,7 @@
 extern void sigdefault(proc_t *);
 
 extern void pid_setmin(void);
-extern pid_t pid_allocate(proc_t *, int);
+extern pid_t pid_allocate(proc_t *, pid_t, int);
 extern struct pid *pid_find(pid_t);
 extern int pid_rele(struct pid *);
 extern void pid_exit(proc_t *);
@@ -658,6 +661,7 @@
 extern	void	disable_msacct(proc_t *);
 extern	hrtime_t mstate_aggr_state(proc_t *, int);
 extern	hrtime_t mstate_thread_onproc_time(kthread_t *);
+extern	void	mstate_systhread_times(kthread_t *, hrtime_t *, hrtime_t *);
 extern	void	syscall_mstate(int, int);
 
 extern	uint_t	cpu_update_pct(kthread_t *, hrtime_t);
@@ -718,6 +722,7 @@
 
 /* lwp function prototypes */
 
+extern kthread_t *lwp_kernel_create(proc_t *, void (*)(), void *, int, pri_t);
 extern	klwp_t 		*lwp_create(
 	void		(*proc)(),
 	caddr_t		arg,
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/sys/sysdc.h	Mon Nov 23 15:29:44 2009 -0800
@@ -0,0 +1,44 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_SYSDC_H
+#define	_SYS_SYSDC_H
+
+#include <sys/types.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct _kthread;
+
+#define	SYSDC_THREAD_BATCH	0x1	/* thread does batch processing */
+extern void sysdc_thread_enter(struct _kthread *, uint_t, uint_t);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_SYSDC_H */
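
sysdc_thread_enter() moves an existing kernel thread into the SDC class with a target duty cycle (a percentage in [1, 100], per sysdc_impl.h below); SYSDC_THREAD_BATCH hints that the thread does batch work. The taskq_thread() change above calls it this way for TASKQ_DUTY_CYCLE taskqs; a minimal sketch with a hypothetical 80% target:

	/*
	 * Ask the scheduler to keep the calling kernel thread on CPU for
	 * about 80% of the time it is runnable, and hint that it is a
	 * batch worker.
	 */
	sysdc_thread_enter(curthread, 80, SYSDC_THREAD_BATCH);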
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/sys/sysdc_impl.h	Mon Nov 23 15:29:44 2009 -0800
@@ -0,0 +1,129 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_SYSDC_IMPL_H
+#define	_SYS_SYSDC_IMPL_H
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/list.h>
+
+#include <sys/sysdc.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct _kthread;
+struct cpupart;
+
+/*
+ * Tracks per-processor-set information for SDC.  Its main use is to
+ * implement per-processor-set breaks.
+ */
+typedef struct sysdc_pset {
+	list_node_t	sdp_node;	/* node on sysdc_psets list */
+	struct cpupart	*sdp_cpupart;	/* associated cpu partition */
+	size_t		sdp_nthreads;	/* reference count */
+
+	/* The remainder is only touched by sysdc_update() */
+	hrtime_t	sdp_onproc_time; /* time onproc at last update */
+	boolean_t	sdp_need_break;	/* threads forced to minpri */
+	uint_t		sdp_should_break; /* # updates need_break is set */
+	uint_t		sdp_dont_break;	/* after break, # updates until next */
+
+	/* debugging fields */
+	uint_t		sdp_onproc_threads;
+	hrtime_t	sdp_vtime_last_interval;
+	uint_t		sdp_DC_last_interval;
+} sysdc_pset_t;
+
+/*
+ * Per-thread information, pointed to by t_cldata.
+ */
+typedef struct sysdc {
+	uint_t		sdc_target_DC;	/* target duty cycle */
+	uint_t		sdc_minpri;	/* our minimum priority */
+	uint_t		sdc_maxpri;	/* our maximum priority */
+
+	sysdc_pset_t	*sdc_pset;	/* the processor set bound to */
+
+	/* protected by sdl_lock */
+	struct _kthread	*sdc_thread;	/* back-pointer, or NULL if freeable */
+
+	/* protected by arrangement between thread and sysdc_update() */
+	struct sysdc	*sdc_next;	/* next in hash table, NULL if not in */
+
+	/* protected by thread_lock() */
+	uint_t		sdc_nupdates;	/* number of sysdc_update_times() */
+
+	hrtime_t	sdc_base_O;	/* on-cpu time at last reset */
+	hrtime_t	sdc_base_R;	/* runnable time at last reset */
+
+	uint_t		sdc_sleep_updates; /* 0, or nupdates when we slept */
+	clock_t		sdc_ticks;	/* sdc_tick() calls */
+	clock_t		sdc_update_ticks; /* value of ticks for forced update */
+	clock_t		sdc_pri_check;	/* lbolt when we checked our priority */
+	hrtime_t	sdc_last_base_O; /* onproc time at sysdc_update() */
+
+	uint_t		sdc_pri;	/* our last computed priority */
+	uint_t		sdc_epri;	/* our actual thread priority */
+
+	/* for debugging only */
+	clock_t		sdc_reset;	/* lbolt when we reset our bases */
+	hrtime_t	sdc_cur_O;	/* on-cpu time at last prio check */
+	hrtime_t	sdc_cur_R;	/* runnable time at last prio check */
+	hrtime_t	sdc_last_O;	/* onproc time at thread update */
+	uint_t		sdc_cur_DC;	/* our actual duty cycle at last chk */
+} sysdc_t;
+
+/*
+ * Hash bucket of active SDC threads.
+ */
+typedef struct sysdc_list {
+	kmutex_t	sdl_lock;	/* lock keeping threads from exiting */
+	sysdc_t	*volatile sdl_list;	/* list of active threads in bucket */
+	char		sdl_pad[64 - sizeof (kmutex_t) - sizeof (sysdc_t *)];
+} sysdc_list_t;
+
+/*
+ * Args to CL_ENTERCLASS().
+ */
+typedef struct sysdc_params {
+	uint_t		sdp_minpri;
+	uint_t		sdp_maxpri;
+	uint_t		sdp_DC;
+} sysdc_params_t;
+
+/*
+ * Duty cycles are percentages in the range [1,100].
+ */
+#define	SYSDC_DC_MAX		100u	/* 1 <= DC <= DC_MAX */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_SYSDC_IMPL_H */
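
The bookkeeping fields above (sdc_base_O, sdc_base_R, sdc_cur_DC, sdc_target_DC) suggest how the class computes a thread's actual duty cycle: on-CPU time over on-CPU plus runnable time since the last reset, compared against the target to pick between sdc_maxpri and sdc_minpri. A sketch of that computation, reconstructed from these comments and the new mstate_systhread_times() prototype rather than from sysdc.c itself:

	sysdc_t		*sdc = t->t_cldata;	/* per-thread SDC state */
	hrtime_t	O, R;			/* on-cpu and runnable time */
	uint_t		cur_DC, new_pri;

	mstate_systhread_times(t, &O, &R);

	/* deltas since the bases were last reset */
	O -= sdc->sdc_base_O;
	R -= sdc->sdc_base_R;

	/* actual duty cycle, as a percentage in [0, SYSDC_DC_MAX] */
	cur_DC = (O + R == 0) ? 0 : (uint_t)((SYSDC_DC_MAX * O) / (O + R));

	/* behind the target: run at maxpri; otherwise drop to minpri */
	new_pri = (cur_DC < sdc->sdc_target_DC) ? sdc->sdc_maxpri :
	    sdc->sdc_minpri;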
--- a/usr/src/uts/common/sys/taskq.h	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/sys/taskq.h	Mon Nov 23 15:29:44 2009 -0800
@@ -39,6 +39,8 @@
 typedef uintptr_t taskqid_t;
 typedef void (task_func_t)(void *);
 
+struct proc;
+
 /*
  * Public flags for taskq_create(): bit range 0-15
  */
@@ -46,6 +48,7 @@
 #define	TASKQ_CPR_SAFE		0x0002	/* Use CPR safe protocol */
 #define	TASKQ_DYNAMIC		0x0004	/* Use dynamic thread scheduling */
 #define	TASKQ_THREADS_CPU_PCT	0x0008	/* number of threads as % of ncpu */
+#define	TASKQ_DC_BATCH		0x0010	/* Taskq uses SDC in batch mode */
 
 /*
  * Flags for taskq_dispatch. TQ_SLEEP/TQ_NOSLEEP should be same as
@@ -55,6 +58,7 @@
 #define	TQ_NOSLEEP	0x01	/* cannot block for memory; may fail */
 #define	TQ_NOQUEUE	0x02	/* Do not enqueue if can't dispatch */
 #define	TQ_NOALLOC	0x04	/* cannot allocate memory; may fail */
+#define	TQ_FRONT	0x08	/* Put task at the front of the queue */
 
 #ifdef _KERNEL
 
@@ -66,6 +70,10 @@
 extern taskq_t	*taskq_create(const char *, int, pri_t, int, int, uint_t);
 extern taskq_t	*taskq_create_instance(const char *, int, int, pri_t, int,
     int, uint_t);
+extern taskq_t	*taskq_create_proc(const char *, int, pri_t, int, int,
+    struct proc *, uint_t);
+extern taskq_t	*taskq_create_sysdc(const char *, int, int, int,
+    struct proc *, uint_t, uint_t);
 extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
 extern void	nulltask(void *);
 extern void	taskq_destroy(taskq_t *);
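
taskq_create_proc() places the worker LWPs inside a caller-supplied system (SSYS) process, and taskq_create_sysdc() additionally runs them in the SDC class at the given duty cycle, which is why it takes no pri_t and why TASKQ_DUTY_CYCLE requires a non-p0 process (see the asserts in taskq.c above). A sketch of a consumer along the lines of the ZFS I/O taskqs; io_proc, the names, and the numbers are placeholders:

	taskq_t	*tq;

	/*
	 * io_proc is assumed to be a kernel (SSYS) process the subsystem
	 * created for itself, e.g. via newproc(); both interfaces assert
	 * this.
	 */
	tq = taskq_create_proc("io_intr", 4, maxclsyspri, 4, INT_MAX,
	    io_proc, TASKQ_PREPOPULATE);

	/* eight workers in SDC at a 75% duty cycle, batch-hinted */
	tq = taskq_create_sysdc("io_issue", 8, 4, INT_MAX,
	    io_proc, 75, TASKQ_PREPOPULATE | TASKQ_DC_BATCH);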
--- a/usr/src/uts/common/sys/taskq_impl.h	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/sys/taskq_impl.h	Mon Nov 23 15:29:44 2009 -0800
@@ -27,7 +27,9 @@
 #define	_SYS_TASKQ_IMPL_H
 
 #include <sys/taskq.h>
+#include <sys/inttypes.h>
 #include <sys/vmem.h>
+#include <sys/list.h>
 #include <sys/kstat.h>
 
 #ifdef	__cplusplus
@@ -81,13 +83,16 @@
 #define	TQBUCKET_CLOSE		0x01
 #define	TQBUCKET_SUSPEND	0x02
 
+#define	TASKQ_INTERFACE_FLAGS	0x0000ffff	/* defined in <sys/taskq.h> */
+
 /*
  * taskq implementation flags: bit range 16-31
  */
-#define	TASKQ_CHANGING		0x00010000
-#define	TASKQ_SUSPENDED		0x00020000
-#define	TASKQ_NOINSTANCE	0x00040000
-#define	TASKQ_THREAD_CREATED	0x00080000
+#define	TASKQ_CHANGING		0x00010000	/* nthreads != target */
+#define	TASKQ_SUSPENDED		0x00020000	/* taskq is suspended */
+#define	TASKQ_NOINSTANCE	0x00040000	/* no instance number */
+#define	TASKQ_THREAD_CREATED	0x00080000	/* a thread has been created */
+#define	TASKQ_DUTY_CYCLE	0x00100000	/* using the SDC class */
 
 struct taskq {
 	char		tq_name[TASKQ_NAMELEN + 1];
@@ -116,13 +121,19 @@
 		kthread_t *_tq_thread;
 		kthread_t **_tq_threadlist;
 	}		tq_thr;
+
+	list_node_t	tq_cpupct_link;	/* linkage for taskq_cpupct_list */
+	struct proc	*tq_proc;	/* process for taskq threads */
+	int		tq_cpupart;	/* cpupart id bound to */
+	uint_t		tq_DC;		/* duty cycle for SDC */
+
 	/*
 	 * Statistics.
 	 */
 	kstat_t		*tq_kstat;	/* Exported statistics */
 	hrtime_t	tq_totaltime;	/* Time spent processing tasks */
-	int		tq_tasks;	/* Total # of tasks posted */
-	int		tq_executed;	/* Total # of tasks executed */
+	uint64_t	tq_tasks;	/* Total # of tasks posted */
+	uint64_t	tq_executed;	/* Total # of tasks executed */
 	int		tq_maxtasks;	/* Max number of tasks in the queue */
 	int		tq_tcreates;
 	int		tq_tdeaths;
--- a/usr/src/uts/common/sys/vmsystm.h	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/sys/vmsystm.h	Mon Nov 23 15:29:44 2009 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -39,8 +39,6 @@
 #ifndef _SYS_VMSYSTM_H
 #define	_SYS_VMSYSTM_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/proc.h>
 
 #ifdef	__cplusplus
@@ -154,7 +152,6 @@
 extern	void	ppmapout(caddr_t);
 
 extern	int pf_is_memory(pfn_t);
-extern	void	pageout_init(void (*proc)(), proc_t *pp, pri_t pri);
 
 extern	void	dcache_flushall(void);
 
--- a/usr/src/uts/i86pc/os/startup.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/i86pc/os/startup.c	Mon Nov 23 15:29:44 2009 -0800
@@ -1509,6 +1509,9 @@
 	if (modload("fs", "dev") == -1)
 		halt("Can't load dev");
 
+	if (modload("fs", "procfs") == -1)
+		halt("Can't load procfs");
+
 	(void) modloadonly("sys", "lbl_edition");
 
 	dispinit();
--- a/usr/src/uts/i86pc/vm/vm_machdep.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/i86pc/vm/vm_machdep.c	Mon Nov 23 15:29:44 2009 -0800
@@ -3720,16 +3720,6 @@
 }
 
 /*
- * Create the pageout scanner thread. The thread has to
- * start at procedure with process pp and priority pri.
- */
-void
-pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
-{
-	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
-}
-
-/*
  * Function for flushing D-cache when performing module relocations
  * to an alternate mapping.  Unnecessary on Intel / AMD platforms.
  */
--- a/usr/src/uts/intel/Makefile.intel.shared	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/intel/Makefile.intel.shared	Mon Nov 23 15:29:44 2009 -0800
@@ -526,7 +526,7 @@
 #
 #	Scheduling Class Modules (/kernel/sched):
 #
-SCHED_KMODS	+= IA RT TS RT_DPTBL TS_DPTBL FSS FX FX_DPTBL
+SCHED_KMODS	+= IA RT TS RT_DPTBL TS_DPTBL FSS FX FX_DPTBL SDC
 
 #
 #	File System Modules (/kernel/fs):
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/intel/SDC/Makefile	Mon Nov 23 15:29:44 2009 -0800
@@ -0,0 +1,82 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# uts/intel/SDC/Makefile
+#
+# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#	This makefile drives the production of the SDC scheduling class
+#	kernel module.
+#
+#	intel architecture dependent
+#
+
+#
+#	Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE	= ../..
+
+#
+#	Define the module and object file sets.
+#
+MODULE		= SDC
+OBJECTS		= $(SDC_OBJS:%=$(OBJS_DIR)/%)
+LINTS		= $(SDC_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE	= $(ROOT_SCHED_DIR)/$(MODULE)
+
+#
+#	Include common rules.
+#
+include $(UTSBASE)/intel/Makefile.intel
+
+#
+#	Define targets
+#
+ALL_TARGET	= $(BINARY)
+LINT_TARGET	= $(MODULE).lint
+INSTALL_TARGET	= $(BINARY) $(ROOTMODULE)
+
+#
+#	Default build targets.
+#
+.KEEP_STATE:
+
+def:		$(DEF_DEPS)
+
+all:		$(ALL_DEPS)
+
+clean:		$(CLEAN_DEPS)
+
+clobber:	$(CLOBBER_DEPS)
+
+lint:		$(LINT_DEPS)
+
+modlintlib:	$(MODLINTLIB_DEPS)
+
+clean.lint:	$(CLEAN_LINT_DEPS)
+
+install:	$(INSTALL_DEPS)
+
+#
+#	Include common targets.
+#
+include $(UTSBASE)/intel/Makefile.targ
--- a/usr/src/uts/intel/ia32/ml/modstubs.s	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/intel/ia32/ml/modstubs.s	Mon Nov 23 15:29:44 2009 -0800
@@ -764,6 +764,15 @@
 #endif
 
 /*
+ * Stubs for sysdc
+ */
+#ifndef SDC_MODULE
+	MODULE(SDC,sched);
+	NO_UNLOAD_STUB(SDC, sysdc_thread_enter,		nomod_zero);
+	END_MODULE(SDC);
+#endif
+
+/*
  * Stubs for ts_dptbl
  */
 #ifndef TS_DPTBL_MODULE
--- a/usr/src/uts/sparc/Makefile.sparc.shared	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/sparc/Makefile.sparc.shared	Mon Nov 23 15:29:44 2009 -0800
@@ -359,7 +359,7 @@
 #
 #	Scheduling Class Modules (/kernel/sched):
 #
-SCHED_KMODS	+= RT TS RT_DPTBL TS_DPTBL IA FSS FX FX_DPTBL
+SCHED_KMODS	+= RT TS RT_DPTBL TS_DPTBL IA FSS FX FX_DPTBL SDC
 
 #
 #	File System Modules (/kernel/fs):
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sparc/SDC/Makefile	Mon Nov 23 15:29:44 2009 -0800
@@ -0,0 +1,87 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# uts/sparc/SDC/Makefile
+#
+# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#	This makefile drives the production of the SDC scheduling class
+#	kernel module.
+#
+#	sparc architecture dependent
+#
+
+#
+#	Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE	= ../..
+
+#
+#	Define the module and object file sets.
+#
+MODULE		= SDC
+OBJECTS		= $(SDC_OBJS:%=$(OBJS_DIR)/%)
+LINTS		= $(SDC_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE	= $(ROOT_SCHED_DIR)/$(MODULE)
+
+#
+#	Include common rules.
+#
+include $(UTSBASE)/sparc/Makefile.sparc
+
+#
+#	Define targets
+#
+ALL_TARGET	= $(BINARY)
+LINT_TARGET	= $(MODULE).lint
+INSTALL_TARGET	= $(BINARY) $(ROOTMODULE)
+
+#
+#	Overrides.
+#
+CFLAGS		+= $(CCVERBOSE)
+
+#
+#	Default build targets.
+#
+.KEEP_STATE:
+
+def:		$(DEF_DEPS)
+
+all:		$(ALL_DEPS)
+
+clean:		$(CLEAN_DEPS)
+
+clobber:	$(CLOBBER_DEPS)
+
+lint:		$(LINT_DEPS)
+
+modlintlib:	$(MODLINTLIB_DEPS)
+
+clean.lint:	$(CLEAN_LINT_DEPS)
+
+install:	$(INSTALL_DEPS)
+
+#
+#	Include common targets.
+#
+include $(UTSBASE)/sparc/Makefile.targ
--- a/usr/src/uts/sparc/ml/modstubs.s	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/sparc/ml/modstubs.s	Mon Nov 23 15:29:44 2009 -0800
@@ -652,6 +652,15 @@
 #endif
 
 /*
+ * Stubs for sysdc
+ */
+#ifndef SDC_MODULE
+	MODULE(SDC,sched);
+	NO_UNLOAD_STUB(SDC, sysdc_thread_enter,		nomod_zero);
+	END_MODULE(SDC);
+#endif
+
+/*
  * Stubs for ts_dptbl
  */
 #ifndef TS_DPTBL_MODULE
--- a/usr/src/uts/sparc/zfs/Makefile	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/sparc/zfs/Makefile	Mon Nov 23 15:29:44 2009 -0800
@@ -19,11 +19,9 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
-#ident	"%Z%%M%	%I%	%E% SMI"
-#
 #	This makefile drives the production of the zfs file system
 #	kernel module.
 
@@ -59,8 +57,11 @@
 #
 #	Overrides and depends_on
 #
+#	We require sched/SDC because by the time vfs_mountroot() runs,
+#	we can no longer load modules through OBP.
+#
 MODSTUBS_DIR	 = $(OBJS_DIR)
-LDFLAGS		+= -dy -Nfs/specfs -Ncrypto/swrand -Nmisc/idmap
+LDFLAGS		+= -dy -Nfs/specfs -Ncrypto/swrand -Nmisc/idmap -Nsched/SDC
 
 INC_PATH	+= -I$(UTSBASE)/common/fs/zfs
 INC_PATH	+= -I$(SRC)/common
--- a/usr/src/uts/sun4/os/startup.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/sun4/os/startup.c	Mon Nov 23 15:29:44 2009 -0800
@@ -1583,6 +1583,9 @@
 	if (modloadonly("fs", "devfs") == -1)
 		halt("Can't load devfs");
 
+	if (modloadonly("fs", "procfs") == -1)
+		halt("Can't load procfs");
+
 	if (modloadonly("misc", "swapgeneric") == -1)
 		halt("Can't load swapgeneric");
 
--- a/usr/src/uts/sun4/vm/vm_dep.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/sun4/vm/vm_dep.c	Mon Nov 23 15:29:44 2009 -0800
@@ -19,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * UNIX machine dependent virtual memory support.
  */
@@ -1004,16 +1002,6 @@
 }
 
 /*
- * Create & Initialise pageout scanner thread. The thread has to
- * start at procedure with process pp and priority pri.
- */
-void
-pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
-{
-	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
-}
-
-/*
  * Function for flushing D-cache when performing module relocations
  * to an alternate mapping.  Stubbed out on all platforms except sun4u,
  * at least for now.