illumos-gate changeset 11173:87f3734e64df
6881015 ZFS write activity prevents other threads from running in a timely manner
6899867 mstate_thread_onproc_time() doesn't account for runnable time correctly
PSARC/2009/615 System Duty Cycle Scheduling Class and ZFS IO Observability
--- a/usr/src/Targetdirs	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/Targetdirs	Mon Nov 23 15:29:44 2009 -0800
@@ -216,6 +216,7 @@
 	/usr/lib/class/FX \
 	/usr/lib/class/IA \
 	/usr/lib/class/RT \
+	/usr/lib/class/SDC \
 	/usr/lib/class/TS \
 	/usr/lib/crypto \
 	/usr/lib/drv \
--- a/usr/src/cmd/dispadmin/Makefile	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/cmd/dispadmin/Makefile	Mon Nov 23 15:29:44 2009 -0800
@@ -28,12 +28,13 @@
 PROG= dispadmin
 MANIFEST= scheduler.xml
 SVCMETHOD= svc-scheduler
+SDC= SDC$(PROG)
 RT= RT$(PROG)
 TS= TS$(PROG)
 IA= IA$(PROG)
 FSS= FSS$(PROG)
 FX= FX$(PROG)
-PROGS= $(PROG) $(RT) $(TS) $(IA) $(FSS) $(FX)
+PROGS= $(PROG) $(RT) $(TS) $(IA) $(FSS) $(FX) $(SDC)
 
 include ../Makefile.cmd
 
@@ -41,38 +42,43 @@
 ROOTDIR= $(ROOT)/usr/lib/class
 ROOTDIRS= $(ROOTDIR) \
-	$(ROOTDIR)/RT \
-	$(ROOTDIR)/TS \
+	$(ROOTDIR)/FSS \
+	$(ROOTDIR)/FX \
 	$(ROOTDIR)/IA \
-	$(ROOTDIR)/FSS \
-	$(ROOTDIR)/FX
+	$(ROOTDIR)/RT \
+	$(ROOTDIR)/SDC \
+	$(ROOTDIR)/TS
+
 ROOTPROG= $(PROG:%=$(ROOTUSRSBIN)/%)
-ROOTRT= $(RT:%=$(ROOTDIR)/RT/%)
-ROOTTS= $(TS:%=$(ROOTDIR)/TS/%)
-ROOTIA= $(IA:%=$(ROOTDIR)/IA/%)
 ROOTFSS= $(FSS:%=$(ROOTDIR)/FSS/%)
 ROOTFX= $(FX:%=$(ROOTDIR)/FX/%)
+ROOTIA= $(IA:%=$(ROOTDIR)/IA/%)
+ROOTRT= $(RT:%=$(ROOTDIR)/RT/%)
+ROOTSDC= $(SDC:%=$(ROOTDIR)/SDC/%)
+ROOTTS= $(TS:%=$(ROOTDIR)/TS/%)
 
 ROOTMANIFESTDIR= $(ROOTSVCSYSTEM)
 
 # this would be simpler if we renamed rtdispadmin.c and tsdispadmin.c
 OBJECTS= $(PROG).o rt$(PROG).o ts$(PROG).o ia$(PROG).o \
-	fss$(PROG).o fx$(PROG).o subr.o
+	fss$(PROG).o fx$(PROG).o sdc$(PROG).o subr.o
 
 # conditional assignments, because of above names
 $(PROG):= OBJ= $(PROG).o
-$(RT):= OBJ= rt$(PROG).o
-$(TS):= OBJ= ts$(PROG).o
-$(IA):= OBJ= ia$(PROG).o
 $(FSS):= OBJ= fss$(PROG).o
 $(FX):= OBJ= fx$(PROG).o
+$(IA):= OBJ= ia$(PROG).o
+$(RT):= OBJ= rt$(PROG).o
+$(SDC):= OBJ= sdc$(PROG).o
+$(TS):= OBJ= ts$(PROG).o
 
 # install rules
 $(ROOTDIR)/% \
-$(ROOTDIR)/RT/% \
+$(ROOTDIR)/FSS/% \
+$(ROOTDIR)/FX/% \
 $(ROOTDIR)/IA/% \
-$(ROOTDIR)/TS/% \
-$(ROOTDIR)/FSS/% \
-$(ROOTDIR)/FX/% : %
+$(ROOTDIR)/RT/% \
+$(ROOTDIR)/SDC/% \
+$(ROOTDIR)/TS/% : %
 	$(INS.file)
 
 .KEEP_STATE:
@@ -89,7 +95,7 @@
 lint := LDLIBS += -L. -lsubr
 
 install: all $(ROOTPROG) $(ROOTRT) $(ROOTTS) $(ROOTIA) $(ROOTFSS) $(ROOTFX) \
-	$(ROOTMANIFEST) $(ROOTSVCMETHOD)
+	$(ROOTSDC) $(ROOTMANIFEST) $(ROOTSVCMETHOD)
 
 # Don't re-install directories already installed by Targetdirs
 #$(ROOTDIRS):
@@ -102,10 +108,11 @@
 lint: llib-lsubr.ln
 	$(LINT.c) dispadmin.c $(LDLIBS)
-	$(LINT.c) rtdispadmin.c $(LDLIBS)
-	$(LINT.c) tsdispadmin.c $(LDLIBS)
-	$(LINT.c) iadispadmin.c $(LDLIBS)
 	$(LINT.c) fssdispadmin.c $(LDLIBS)
 	$(LINT.c) fxdispadmin.c $(LDLIBS)
+	$(LINT.c) iadispadmin.c $(LDLIBS)
+	$(LINT.c) rtdispadmin.c $(LDLIBS)
+	$(LINT.c) sdcdispadmin.c $(LDLIBS)
+	$(LINT.c) tsdispadmin.c $(LDLIBS)
 
 include ../Makefile.targ
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/cmd/dispadmin/sdcdispadmin.c	Mon Nov 23 15:29:44 2009 -0800
@@ -0,0 +1,73 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/param.h>
+#include <sys/priocntl.h>
+#include <sys/types.h>
+
+#include "dispadmin.h"
+
+static char usage[] = "usage:	dispadmin -l\n";
+
+int
+main(int argc, char *argv[])
+{
+	int lflag = 0;
+	int c;
+
+	while ((c = getopt(argc, argv, "lc:")) != -1) {
+		switch (c) {
+
+		case 'l':
+			lflag++;
+			break;
+
+		case 'c':
+			if (strcmp(optarg, "SDC") != 0)
+				fatalerr("error: %s executed for %s class, "
+				    "%s is actually sub-command for %s class\n",
+				    argv[0], optarg, argv[0], "SDC");
+
+			fatalerr("error: no scheduling-class specific options"
+			    " for SDC\n");
+			break;
+
+		case '?':
+			fatalerr(usage);
+		default:
+			break;
+		}
+	}
+
+	if (!lflag)
+		fatalerr(usage);
+
+	(void) printf("SDC\t(System Duty-Cycle Class)\n");
+	return (0);
+}
--- a/usr/src/cmd/priocntl/Makefile	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/cmd/priocntl/Makefile	Mon Nov 23 15:29:44 2009 -0800
@@ -29,7 +29,7 @@
 PRIOCNTLSRC= $(PRIOCNTLOBJ:%.o=%.c)
 
 OBJS= $(PRIOCNTLOBJ) rt$(PROG).o ts$(PROG).o ia$(PROG).o fss$(PROG).o \
-	fx$(PROG).o
+	fx$(PROG).o sdc$(PROG).o
 SRCS= $(OBJ:%.o=%.c)
 
 include ../Makefile.cmd
@@ -38,19 +38,22 @@
 
 CLASSD = $(ROOTLIB)/class
 RTD = $(CLASSD)/RT
+SDCD = $(CLASSD)/SDC
 TSD = $(CLASSD)/TS
 IAD = $(CLASSD)/IA
 FSSD = $(CLASSD)/FSS
 FXD = $(CLASSD)/FX
-DIRS = $(CLASSD) $(RTD) $(TSD) $(IAD) $(FSSD) $(FXD)
+DIRS = $(CLASSD) $(RTD) $(SDCD) $(TSD) $(IAD) $(FSSD) $(FXD)
 
 RTPROG = RT$(PROG)
+SDCPROG = SDC$(PROG)
 TSPROG = TS$(PROG)
 IAPROG = IA$(PROG)
 FSSPROG = FSS$(PROG)
 FXPROG = FX$(PROG)
 
 ROOTRTPROG = $(RTD)/$(RTPROG)
+ROOTSDCPROG = $(SDCD)/$(SDCPROG)
 ROOTTSPROG = $(TSD)/$(TSPROG)
 ROOTIAPROG = $(IAD)/$(IAPROG)
 ROOTFSSPROG = $(FSSD)/$(FSSPROG)
@@ -58,12 +61,15 @@
 $(ROOTUSRSBINPROG) := FILEMODE = 04555
 
 $(DIRS) := FILEMODE = 0775
-CLOBBERFILES += $(RTPROG) $(TSPROG) $(IAPROG) $(FSSPROG) $(FXPROG)
+CLOBBERFILES += $(RTPROG) $(SDCPROG) $(TSPROG) $(IAPROG) $(FSSPROG) $(FXPROG)
 
 # installation rules
 $(RTD)/% : %
 	$(INS.file)
 
+$(SDCD)/% : %
+	$(INS.file)
+
 $(TSD)/% : %
 	$(INS.file)
 
@@ -78,7 +84,7 @@
 
 .KEEP_STATE:
 
-all: $(PROG) $(RTPROG) $(TSPROG) $(IAPROG) $(FSSPROG) $(FXPROG)
+all: $(PROG) $(RTPROG) $(SDCPROG) $(TSPROG) $(IAPROG) $(FSSPROG) $(FXPROG)
 
 $(PROG): $(PRIOCNTLOBJ)
 	$(LINK.c) $(PRIOCNTLOBJ) -o $@ $(LDLIBS)
@@ -88,6 +94,10 @@
 	$(LINK.c) rt$(PRIOCNTLOBJ) -o $@ $(LDLIBS)
 	$(POST_PROCESS)
 
+$(SDCPROG): sdc$(PRIOCNTLOBJ)
+	$(LINK.c) sdc$(PRIOCNTLOBJ) -o $@ $(LDLIBS)
+	$(POST_PROCESS)
+
 $(TSPROG): ts$(PRIOCNTLOBJ)
 	$(LINK.c) ts$(PRIOCNTLOBJ) -o $@ $(LDLIBS)
 	$(POST_PROCESS)
@@ -107,6 +117,7 @@
 install: all $(DIRS) \
 	$(ROOTPROG) \
 	$(ROOTRTPROG) \
+	$(ROOTSDCPROG) \
 	$(ROOTTSPROG) \
 	$(ROOTIAPROG) \
 	$(ROOTFSSPROG) \
@@ -121,6 +132,7 @@
 lint:
 	$(LINT.c) $(PRIOCNTLSRC) $(LDLIBS)
 	$(LINT.c) rt$(PRIOCNTLSRC) $(LDLIBS)
+	$(LINT.c) sdc$(PRIOCNTLSRC) $(LDLIBS)
 	$(LINT.c) ts$(PRIOCNTLSRC) $(LDLIBS)
 	$(LINT.c) ia$(PRIOCNTLSRC) $(LDLIBS)
 	$(LINT.c) fss$(PRIOCNTLSRC) $(LDLIBS)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/cmd/priocntl/sdcpriocntl.c	Mon Nov 23 15:29:44 2009 -0800
@@ -0,0 +1,73 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/param.h>
+#include <sys/priocntl.h>
+#include <sys/types.h>
+
+#include "priocntl.h"
+
+static char usage[] = "usage:	priocntl -l\n";
+
+int
+main(int argc, char *argv[])
+{
+	int lflag = 0;
+	int c;
+
+	while ((c = getopt(argc, argv, "lc:")) != -1) {
+		switch (c) {
+
+		case 'l':
+			lflag++;
+			break;
+
+		case 'c':
+			if (strcmp(optarg, "SDC") != 0)
+				fatalerr("error: %s executed for %s class, "
+				    "%s is actually sub-command for %s class\n",
+				    argv[0], optarg, argv[0], "SDC");
+
+			fatalerr("error: no scheduling-class specific options"
+			    " for SDC\n");
+			break;
+
+		case '?':
+			fatalerr(usage);
+		default:
+			break;
+		}
+	}
+
+	if (!lflag)
+		fatalerr(usage);
+
+	(void) printf("SDC\t(System Duty-Cycle Class)\n");
+	return (0);
+}
--- a/usr/src/lib/libzpool/common/kernel.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/lib/libzpool/common/kernel.c	Mon Nov 23 15:29:44 2009 -0800
@@ -50,6 +50,9 @@
 	"userland", "libzpool", "1", "1", "na"
 };
 
+/* this only exists to have its address taken */
+struct proc p0;
+
 /*
  * =========================================================================
  * threads
--- a/usr/src/lib/libzpool/common/sys/zfs_context.h	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/lib/libzpool/common/sys/zfs_context.h	Mon Nov 23 15:29:44 2009 -0800
@@ -197,6 +197,18 @@
 #define	thread_create(stk, stksize, func, arg, len, pp, state, pri)	\
 	zk_thread_create(func, arg)
 #define	thread_exit() thr_exit(NULL)
+#define	thread_join(t)	panic("libzpool cannot join threads")
+
+#define	newproc(f, a, cid, pri, ctp, pid)	(ENOSYS)
+
+/* in libzpool, p0 exists only to have its address taken */
+struct proc {
+	uintptr_t	this_is_never_used_dont_dereference_it;
+};
+
+extern struct proc p0;
+
+#define	PS_NONE		-1
 
 extern kthread_t *zk_thread_create(void (*func)(), void *arg);
 
@@ -319,15 +331,21 @@
 #define	TASKQ_PREPOPULATE	0x0001
 #define	TASKQ_CPR_SAFE		0x0002	/* Use CPR safe protocol */
 #define	TASKQ_DYNAMIC		0x0004	/* Use dynamic thread scheduling */
-#define	TASKQ_THREADS_CPU_PCT	0x0008	/* Use dynamic thread scheduling */
+#define	TASKQ_THREADS_CPU_PCT	0x0008	/* Scale # threads by # cpus */
+#define	TASKQ_DC_BATCH		0x0010	/* Mark threads as batch */
 
 #define	TQ_SLEEP	KM_SLEEP	/* Can block for memory */
 #define	TQ_NOSLEEP	KM_NOSLEEP	/* cannot block for memory; may fail */
-#define	TQ_NOQUEUE	0x02	/* Do not enqueue if can't dispatch */
+#define	TQ_NOQUEUE	0x02		/* Do not enqueue if can't dispatch */
+#define	TQ_FRONT	0x08		/* Queue in front */
 
 extern taskq_t *system_taskq;
 
 extern taskq_t	*taskq_create(const char *, int, pri_t, int, int, uint_t);
+#define	taskq_create_proc(a, b, c, d, e, p, f) \
+	    (taskq_create(a, b, c, d, e, f))
+#define	taskq_create_sysdc(a, b, d, e, p, dc, f) \
+	    (taskq_create(a, b, maxclsyspri, d, e, f))
 extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
 extern void	taskq_destroy(taskq_t *);
 extern void	taskq_wait(taskq_t *);
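[Editor's note: in the kernel, taskq_create_sysdc() creates a taskq whose worker threads join the SDC class at a given duty cycle; libzpool has no scheduler, so the macro above quietly maps the same call onto an ordinary taskq_create() at maxclsyspri. A minimal sketch of the call shape, not part of this changeset — only the argument order is taken from the macro above; the taskq name, thread count, and duty cycle are made-up example values:

	/*
	 * Hypothetical example: create a taskq whose threads run in SDC
	 * at a 75% target duty cycle, owned by p0.  Under libzpool the
	 * macro above turns this into a plain taskq_create() at
	 * maxclsyspri, so the same source compiles in userland too.
	 */
	taskq_t *tq = taskq_create_sysdc("example_tq", 8, 50, INT_MAX,
	    &p0, 75, TASKQ_PREPOPULATE);

The duty cycle here is expressed on the same 0-100 scale as zio_taskq_basedc in the spa.c diff below.]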
--- a/usr/src/lib/libzpool/common/taskq.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/lib/libzpool/common/taskq.c	Mon Nov 23 15:29:44 2009 -0800
@@ -114,8 +114,13 @@
 		mutex_exit(&tq->tq_lock);
 		return (0);
 	}
-	t->task_next = &tq->tq_task;
-	t->task_prev = tq->tq_task.task_prev;
+	if (tqflags & TQ_FRONT) {
+		t->task_next = tq->tq_task.task_next;
+		t->task_prev = &tq->tq_task;
+	} else {
+		t->task_next = &tq->tq_task;
+		t->task_prev = tq->tq_task.task_prev;
+	}
 	t->task_next->task_prev = t;
 	t->task_prev->task_next = t;
 	t->task_func = func;
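[Editor's note: tq_task is the sentinel node of a circular doubly-linked list, so linking the new task immediately after the sentinel puts it at the head of the queue, while the old code's link-before-the-sentinel puts it at the tail. A hypothetical dispatch call using the new flag, not part of this changeset — my_func and my_arg are placeholders:

	/*
	 * Hypothetical example: a caller that wants its task to run
	 * ahead of already-queued work adds TQ_FRONT to the flags.
	 */
	(void) taskq_dispatch(tq, my_func, my_arg, TQ_SLEEP | TQ_FRONT);
]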
--- a/usr/src/pkgdefs/SUNWckr/prototype_i386	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/pkgdefs/SUNWckr/prototype_i386	Mon Nov 23 15:29:44 2009 -0800
@@ -239,6 +239,7 @@
 f none kernel/misc/strplumb 755 root sys
 f none kernel/misc/tem 755 root sys
 f none kernel/misc/tlimod 755 root sys
+f none kernel/sched/SDC 755 root sys
 f none kernel/sched/TS 755 root sys
 f none kernel/sched/TS_DPTBL 755 root sys
 l none kernel/strmod/arp=../../kernel/drv/arp
@@ -459,6 +460,7 @@
 f none kernel/misc/amd64/tem 755 root sys
 f none kernel/misc/amd64/tlimod 755 root sys
 d none kernel/sched/amd64 755 root sys
+f none kernel/sched/amd64/SDC 755 root sys
 f none kernel/sched/amd64/TS 755 root sys
 f none kernel/sched/amd64/TS_DPTBL 755 root sys
 d none kernel/strmod/amd64 755 root sys
--- a/usr/src/pkgdefs/SUNWckr/prototype_sparc	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/pkgdefs/SUNWckr/prototype_sparc	Mon Nov 23 15:29:44 2009 -0800
@@ -220,6 +220,7 @@
 f none kernel/misc/sparcv9/tlimod 755 root sys
 f none kernel/misc/sparcv9/tem 755 root sys
 d none kernel/sched/sparcv9 755 root sys
+f none kernel/sched/sparcv9/SDC 755 root sys
 f none kernel/sched/sparcv9/TS 755 root sys
 f none kernel/sched/sparcv9/TS_DPTBL 755 root sys
 d none kernel/strmod/sparcv9 755 root sys
--- a/usr/src/pkgdefs/SUNWcsu/prototype_com	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/pkgdefs/SUNWcsu/prototype_com	Mon Nov 23 15:29:44 2009 -0800
@@ -355,6 +355,9 @@
 d none usr/lib/class/RT 755 root bin
 f none usr/lib/class/RT/RTdispadmin 555 root bin
 f none usr/lib/class/RT/RTpriocntl 555 root bin
+d none usr/lib/class/SDC 755 root bin
+f none usr/lib/class/SDC/SDCdispadmin 555 root bin
+f none usr/lib/class/SDC/SDCpriocntl 555 root bin
 d none usr/lib/class/TS 755 root bin
 f none usr/lib/class/TS/TSdispadmin 555 root bin
 f none usr/lib/class/TS/TSpriocntl 555 root bin
--- a/usr/src/pkgdefs/SUNWhea/prototype_com	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/pkgdefs/SUNWhea/prototype_com	Mon Nov 23 15:29:44 2009 -0800
@@ -1299,6 +1299,8 @@
 f none usr/include/sys/sysconf.h 644 root bin
 f none usr/include/sys/sysconfig.h 644 root bin
 f none usr/include/sys/sysconfig_impl.h 644 root bin
+f none usr/include/sys/sysdc.h 644 root bin
+f none usr/include/sys/sysdc_impl.h 644 root bin
 d none usr/include/sys/sysevent 755 root bin
 f none usr/include/sys/sysevent/ap_driver.h 644 root bin
 f none usr/include/sys/sysevent/domain.h 644 root bin
--- a/usr/src/uts/common/Makefile.files	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/Makefile.files	Mon Nov 23 15:29:44 2009 -0800
@@ -1444,6 +1444,8 @@
 #
 #	scheduling class modules
 #
+SDC_OBJS += sysdc.o
+
 RT_OBJS += rt.o
 RT_DPTBL_OBJS += rt_dptbl.o
--- a/usr/src/uts/common/brand/lx/os/lx_pid.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/brand/lx/os/lx_pid.c	Mon Nov 23 15:29:44 2009 -0800
@@ -19,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/sysmacros.h>
@@ -119,7 +117,7 @@
 	/*
 	 * Allocate a pid for any thread other than the first
 	 */
-	if ((newpid = pid_allocate(p, 0)) < 0)
+	if ((newpid = pid_allocate(p, 0, 0)) < 0)
 		return (-1);
 
 	pidp = pid_find(newpid);
--- a/usr/src/uts/common/disp/cpupart.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/disp/cpupart.c	Mon Nov 23 15:29:44 2009 -0800
@@ -19,7 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
 
@@ -681,6 +681,11 @@
 		if (boundcpu != NULL && boundcpu->cpu_part != cp)
 			return (EBUSY);
 	}
+
+	if (tp->t_cid == sysdccid) {
+		return (EINVAL);	/* For now, sysdc threads can't move */
+	}
+
 	return (0);
 }
--- a/usr/src/uts/common/disp/disp.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/disp/disp.c	Mon Nov 23 15:29:44 2009 -0800
@@ -1765,11 +1765,16 @@
 	}
 }
 
-
 /*
  * Make a thread give up its processor.  Find the processor on
  * which this thread is executing, and have that processor
  * preempt.
+ *
+ * We allow System Duty Cycle (SDC) threads to be preempted even if
+ * they are running at kernel priorities.  To implement this, we always
+ * set cpu_kprunrun; this ensures preempt() will be called.  Since SDC
+ * calls cpu_surrender() very often, we only preempt if there is anyone
+ * competing with us.
  */
 void
 cpu_surrender(kthread_t *tp)
@@ -1789,9 +1794,16 @@
 	if (max_pri < max_run_pri)
 		max_pri = max_run_pri;
 
-	cpup->cpu_runrun = 1;
-	if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
-		cpup->cpu_kprunrun = 1;
+	if (tp->t_cid == sysdccid) {
+		uint_t t_pri = DISP_PRIO(tp);
+		if (t_pri > max_pri)
+			return;		/* we are not competing w/ anyone */
+		cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
+	} else {
+		cpup->cpu_runrun = 1;
+		if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
+			cpup->cpu_kprunrun = 1;
+		}
 	}
 
 	/*
@@ -1816,7 +1828,6 @@
 	    "cpu_surrender:tid %p cpu %p", tp, cpup);
 }
 
-
 /*
  * Commit to and ratify a scheduling decision
  */
--- a/usr/src/uts/common/disp/priocntl.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/disp/priocntl.c	Mon Nov 23 15:29:44 2009 -0800
@@ -20,15 +20,13 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
 /*	  All Rights Reserved	*/
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/sysmacros.h>
@@ -713,7 +711,7 @@
 
 		if (getcid(clname, &classid) != 0)
 			return (set_errno(EINVAL));
-		if (classid == syscid)
+		if (CLASS_KERNEL(classid))
 			return (set_errno(EINVAL));
 		defaultcid = classid;
 		ASSERT(defaultcid > 0 && defaultcid < loaded_classes);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/disp/sysdc.c	Mon Nov 23 15:29:44 2009 -0800
@@ -0,0 +1,1328 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * The System Duty Cycle (SDC) scheduling class
+ * --------------------------------------------
+ *
+ * Background
+ *
+ * Kernel threads in Solaris have traditionally not been large consumers
+ * of CPU time.  They typically wake up, perform a small amount of
+ * work, then go back to sleep waiting for either a timeout or another
+ * signal.  On the assumption that the small amount of work that they do
+ * is important for the behavior of the whole system, these threads are
+ * treated kindly by the dispatcher and the SYS scheduling class: they run
+ * without preemption from anything other than real-time and interrupt
+ * threads; when preempted, they are put at the front of the queue, so they
+ * generally do not migrate between CPUs; and they are allowed to stay
+ * running until they voluntarily give up the CPU.
+ *
+ * As Solaris has evolved, new workloads have emerged which require the
+ * kernel to perform significant amounts of CPU-intensive work.  One
+ * example of such a workload is ZFS's transaction group sync processing.
+ * Each sync operation generates a large batch of I/Os, and each I/O
+ * may need to be compressed and/or checksummed before it is written to
+ * storage.  The taskq threads which perform the compression and checksums
+ * will run nonstop as long as they have work to do; a large sync operation
+ * on a compression-heavy dataset can keep them busy for seconds on end.
+ * This causes human-time-scale dispatch latency bubbles for any other
+ * threads which have the misfortune to share a CPU with the taskq threads.
+ *
+ * The SDC scheduling class is a solution to this problem.
+ *
+ *
+ * Overview
+ *
+ * SDC is centered around the concept of a thread's duty cycle (DC):
+ *
+ *			      ONPROC time
+ *	Duty Cycle =	----------------------
+ *			ONPROC + Runnable time
+ *
+ * This is the ratio of the time that the thread spent running on a CPU
+ * divided by the time it spent running or trying to run.  It is unaffected
+ * by any time the thread spent sleeping, stopped, etc.
+ *
+ * A thread joining the SDC class specifies a "target" DC that it wants
+ * to run at.  To implement this policy, the routine sysdc_update() scans
+ * the list of active SDC threads every few ticks and uses each thread's
+ * microstate data to compute the actual duty cycle that that thread
+ * has experienced recently.  If the thread is under its target DC, its
+ * priority is increased to the maximum available (sysdc_maxpri, which is
+ * 99 by default).  If the thread is over its target DC, its priority is
+ * reduced to the minimum available (sysdc_minpri, 0 by default).  This
+ * is a fairly primitive approach, in that it doesn't use any of the
+ * intermediate priorities, but it's not completely inappropriate.  Even
+ * though threads in the SDC class might take a while to do their job, they
+ * are by some definition important if they're running inside the kernel,
+ * so it is reasonable that they should get to run at priority 99.
+ *
+ * If a thread is running when sysdc_update() calculates its actual duty
+ * cycle, and there are other threads of equal or greater priority on its
+ * CPU's dispatch queue, sysdc_update() preempts that thread.  The thread
+ * acknowledges the preemption by calling sysdc_preempt(), which calls
+ * setbackdq(), which gives other threads with the same priority a chance
+ * to run.  This creates a de facto time quantum for threads in the SDC
+ * scheduling class.
+ *
+ * An SDC thread which is assigned priority 0 can continue to run if
+ * nothing else needs to use the CPU that it's running on.  Similarly, an
+ * SDC thread at priority 99 might not get to run as much as it wants to
+ * if there are other priority-99 or higher threads on its CPU.  These
+ * situations would cause the thread to get ahead of or behind its target
+ * DC; the longer the situations lasted, the further ahead or behind the
+ * thread would get.  Rather than condemning a thread to a lifetime of
+ * paying for its youthful indiscretions, SDC keeps "base" values for
+ * ONPROC and Runnable times in each thread's sysdc data, and updates these
+ * values periodically.  The duty cycle is then computed using the elapsed
+ * amount of ONPROC and Runnable times since those base times.
+ *
+ * Since sysdc_update() scans SDC threads fairly frequently, it tries to
+ * keep the list of "active" threads small by pruning out threads which
+ * have been asleep for a brief time.  They are not pruned immediately upon
+ * going to sleep, since some threads may bounce back and forth between
+ * sleeping and being runnable.
+ *
+ *
+ * Interfaces
+ *
+ *	void sysdc_thread_enter(t, dc, flags)
+ *
+ *		Moves a kernel thread from the SYS scheduling class to the
+ *		SDC class.  t must have an associated LWP (created by calling
+ *		lwp_kernel_create()).  The thread will have a target DC of dc.
+ *		Flags should be either 0 or SYSDC_THREAD_BATCH.  If
+ *		SYSDC_THREAD_BATCH is specified, the thread will run with a
+ *		slightly lower priority (see "Batch threads", below).
+ *
+ *
+ * Complications
+ *
+ * - Run queue balancing
+ *
+ *	The Solaris dispatcher is biased towards letting a thread run
+ *	on the same CPU which it last ran on, if no more than 3 ticks
+ *	(i.e. rechoose_interval) have passed since the thread last ran.
+ *	This helps to preserve cache warmth.  On the other hand, it also
+ *	tries to keep the per-CPU run queues fairly balanced; if the CPU
+ *	chosen for a runnable thread has a run queue which is three or
+ *	more threads longer than a neighboring CPU's queue, the runnable
+ *	thread is dispatched onto the neighboring CPU instead.
+ *
+ *	These policies work well for some workloads, but not for many SDC
+ *	threads.  The taskq client of SDC, for example, has many discrete
+ *	units of work to do.  The work units are largely independent, so
+ *	cache warmth is not an important consideration.  It is important
+ *	that the threads fan out quickly to different CPUs, since the
+ *	amount of work these threads have to do (a few seconds worth at a
+ *	time) doesn't leave much time to correct thread placement errors
+ *	(i.e. two SDC threads being dispatched to the same CPU).
+ *
+ *	To fix this, SDC uses the TS_RUNQMATCH flag introduced for FSS.
+ *	This tells the dispatcher to keep neighboring run queues' lengths
+ *	more evenly matched, which allows SDC threads to migrate more
+ *	easily.
+ *
+ * - LWPs and system processes
+ *
+ *	SDC can only be used for kernel threads.  Since SDC uses microstate
+ *	accounting data to compute each thread's actual duty cycle, all
+ *	threads entering the SDC class must have associated LWPs (which
+ *	store the microstate data).  This means that the threads have to
+ *	be associated with an SSYS process, i.e. one created by newproc().
+ *	If the microstate accounting information is ever moved into the
+ *	kthread_t, this restriction could be lifted.
+ *
+ * - Dealing with oversubscription
+ *
+ *	Since SDC duty cycles are per-thread, it is possible that the
+ *	aggregate requested duty cycle of all SDC threads in a processor
+ *	set could be greater than the total CPU time available in that set.
+ *	The FSS scheduling class has an analogous situation, which it deals
+ *	with by reducing each thread's allotted CPU time proportionally.
+ *	Since SDC doesn't need to be as precise as FSS, it uses a simpler
+ *	solution to the oversubscription problem.
+ *
+ *	sysdc_update() accumulates the amount of time that max-priority SDC
+ *	threads have spent on-CPU in each processor set, and uses that sum
+ *	to create an implied duty cycle for that processor set:
+ *
+ *			   accumulated CPU time
+ *	 pset DC =	-----------------------------------
+ *			(# CPUs) * time since last update
+ *
+ *	If this implied duty cycle is above a maximum pset duty cycle (90%
+ *	by default), sysdc_update() sets the priority of all SDC threads
+ *	in that processor set to sysdc_minpri for a "break" period.  After
+ *	the break period, it waits for a "nobreak" period before trying to
+ *	enforce the pset duty cycle limit again.
+ *
+ * - Processor sets
+ *
+ *	As the above implies, SDC is processor set aware, but it does not
+ *	currently allow threads to change processor sets while in the SDC
+ *	class.  Instead, those threads must join the desired processor set
+ *	before entering SDC. [1]
+ *
+ * - Batch threads
+ *
+ *	A thread joining the SDC class can specify the SDC_THREAD_BATCH
+ *	flag.  This flag causes the maximum priority for that thread to be
+ *	reduced (by default, the maximum is reduced by 1).  This allows
+ *	longer-running, batch-oriented SDC threads to be interrupted by
+ *	more immediate, higher-priority work.
+ *
+ * - t_kpri_req
+ *
+ *	The TS and FSS scheduling classes pay attention to t_kpri_req,
+ *	which provides a simple form of priority inheritance for
+ *	synchronization primitives (such as rwlocks held as READER) which
+ *	cannot be traced to a unique thread.  The SDC class does not honor
+ *	t_kpri_req, for a few reasons:
+ *
+ *	1.  t_kpri_req is notoriously inaccurate.  A measure of its
+ *	    inaccuracy is that it needs to be cleared every time a thread
+ *	    returns to user mode, because it is frequently non-zero at that
+ *	    point.  This can happen because "ownership" of synchronization
+ *	    primitives that use t_kpri_req can be silently handed off,
+ *	    leaving no opportunity to will the t_kpri_req inheritance.
+ *
+ *	2.  Unlike in TS and FSS, threads in SDC *will* eventually run at
+ *	    kernel priority.  This means that even if an SDC thread
+ *	    is holding a synchronization primitive and running at low
+ *	    priority, its priority will eventually be raised above 60,
+ *	    allowing it to drive on and release the resource.
+ *
+ *	3.  The first consumer of SDC uses the taskq subsystem, which holds
+ *	    a reader lock for the duration of the task's execution.  This
+ *	    would mean that SDC threads would never drop below kernel
+ *	    priority in practice, which defeats one of the purposes of SDC.
+ *
+ * - Why not FSS?
+ *
+ *	It might seem that the existing FSS scheduling class could solve
+ *	the problems that SDC is attempting to solve.  FSS's more precise
+ *	solution to the oversubscription problem would hardly cause
+ *	trouble, as long as it performed well.  SDC is implemented as
+ *	a separate scheduling class for two main reasons: the initial
+ *	consumer of SDC does not map well onto the "project" abstraction
+ *	that is central to FSS, and FSS does not expect to run at kernel
+ *	priorities.
+ *
+ *
+ * Tunables
+ *
+ * - sysdc_batch_niceness:  The amount below sysdc_maxpri that
+ *	SDC_THREAD_BATCH threads should use as their per-thread
+ *	maximum priority.
+ *
+ * - sysdc_update_interval_msec:  Number of milliseconds between
+ *	consecutive thread priority updates.
+ *
+ * - sysdc_reset_interval_msec:  Number of milliseconds between
+ *	consecutive resets of a thread's base ONPROC and Runnable
+ *	times.
+ *
+ * - sysdc_prune_interval_msec:  Number of milliseconds of sleeping
+ *	before a thread is pruned from the active list.
+ *
+ * - sysdc_max_pset_DC:  Allowable percentage of a processor set's
+ *	CPU time which SDC can give to its high-priority threads.
+ *
+ * - sysdc_break_msec:  Number of milliseconds of "break" taken when
+ *	sysdc_max_pset_DC is exceeded.
+ *
+ *
+ * Future work (in SDC and related subsystems)
+ *
+ * - Per-thread rechoose interval (0 for SDC)
+ *
+ *	Allow each thread to specify its own rechoose interval.  SDC
+ *	threads would specify an interval of zero, which would rechoose
+ *	the CPU with the lowest priority once per update.
+ *
+ * - Allow threads to change processor sets after joining the SDC class
+ *
+ * - Thread groups and per-group DC
+ *
+ *	It might be nice to be able to specify a duty cycle which applies
+ *	to a group of threads in aggregate.
+ *
+ * - Per-group DC callback to allow dynamic DC tuning
+ *
+ *	Currently, DCs are assigned when the thread joins SDC.  Some
+ *	workloads could benefit from being able to tune their DC using
+ *	subsystem-specific knowledge about the workload.
+ *
+ * - Finer-grained priority updates
+ *
+ * - More nuanced management of oversubscription
+ *
+ * - Moving other CPU-intensive threads into SDC
+ *
+ * - Move msacct data into kthread_t
+ *
+ *	This would allow kernel threads without LWPs to join SDC.
+ *
+ *
+ * Footnotes
+ *
+ * [1] The details of doing so are left as an exercise for the reader.
+ */
+
+#include <sys/types.h>
+#include <sys/sysdc.h>
+#include <sys/sysdc_impl.h>
+
+#include <sys/class.h>
+#include <sys/cmn_err.h>
+#include <sys/cpuvar.h>
+#include <sys/cpupart.h>
+#include <sys/debug.h>
+#include <sys/disp.h>
+#include <sys/errno.h>
+#include <sys/inline.h>
+#include <sys/kmem.h>
+#include <sys/modctl.h>
+#include <sys/schedctl.h>
+#include <sys/sdt.h>
+#include <sys/sunddi.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/var.h>
+
+/*
+ * Tunables - loaded into the internal state at module load time
+ */
+uint_t		sysdc_update_interval_msec = 20;
+uint_t		sysdc_reset_interval_msec = 400;
+uint_t		sysdc_prune_interval_msec = 100;
+uint_t		sysdc_max_pset_DC = 90;
+uint_t		sysdc_break_msec = 80;
+pri_t		sysdc_batch_niceness = 1;
+
+/*
+ * Internal state - constants set up by sysdc_initparam()
+ */
+static clock_t	sysdc_update_ticks;	/* ticks between updates */
+static uint_t	sysdc_prune_updates;	/* updates asleep before pruning */
+static uint_t	sysdc_reset_updates;	/* # of updates before reset */
+static uint_t	sysdc_break_updates;	/* updates to break */
+static uint_t	sysdc_nobreak_updates;	/* updates to not check */
+static uint_t	sysdc_minDC;		/* minimum allowed DC */
+static uint_t	sysdc_maxDC;		/* maximum allowed DC */
+static pri_t	sysdc_minpri;		/* minimum allowed priority */
+static pri_t	sysdc_maxpri;		/* maximum allowed priority */
+
+/*
+ * Internal state
+ */
+static kmutex_t	sysdc_pset_lock;	/* lock protecting pset data */
+static list_t	sysdc_psets;		/* list of psets with SDC threads */
+static uint_t	sysdc_param_init;	/* sysdc_initparam() has been called */
+static uint_t	sysdc_update_timeout_started; /* update timeout is active */
+static hrtime_t	sysdc_last_update;	/* time of last sysdc_update() */
+static sysdc_t	sysdc_dummy;		/* used to terminate active lists */
+
+/*
+ * Internal state - active hash table
+ */
+#define	SYSDC_NLISTS	8
+#define	SYSDC_HASH(sdc)	(((uintptr_t)(sdc) >> 6) & (SYSDC_NLISTS - 1))
+static sysdc_list_t	sysdc_active[SYSDC_NLISTS];
+#define	SYSDC_LIST(sdc)		(&sysdc_active[SYSDC_HASH(sdc)])
+
+#ifdef DEBUG
+static struct {
+	uint64_t	sysdc_update_times_asleep;
+	uint64_t	sysdc_update_times_base_ran_backwards;
+	uint64_t	sysdc_update_times_already_done;
+	uint64_t	sysdc_update_times_cur_ran_backwards;
+	uint64_t	sysdc_compute_pri_breaking;
+	uint64_t	sysdc_activate_enter;
+	uint64_t	sysdc_update_enter;
+	uint64_t	sysdc_update_exited;
+	uint64_t	sysdc_update_not_sdc;
+	uint64_t	sysdc_update_idle;
+	uint64_t	sysdc_update_take_break;
+	uint64_t	sysdc_update_no_psets;
+	uint64_t	sysdc_tick_not_sdc;
+	uint64_t	sysdc_tick_quantum_expired;
+	uint64_t	sysdc_thread_enter_enter;
+} sysdc_stats;
+
+#define	SYSDC_INC_STAT(x)	(sysdc_stats.x++)
+#else
+#define	SYSDC_INC_STAT(x)	((void)0)
+#endif
+
+/* macros are UPPER CASE */
+#define	HOWMANY(a, b)	howmany((a), (b))
+#define	MSECTOTICKS(a)	HOWMANY((a) * 1000, usec_per_tick)
+
+static void
+sysdc_initparam(void)
+{
+	uint_t sysdc_break_ticks;
+
+	/* update / prune intervals */
+	sysdc_update_ticks = MSECTOTICKS(sysdc_update_interval_msec);
+
+	sysdc_prune_updates = HOWMANY(sysdc_prune_interval_msec,
+	    sysdc_update_interval_msec);
+	sysdc_reset_updates = HOWMANY(sysdc_reset_interval_msec,
+	    sysdc_update_interval_msec);
+
+	/* We must get at least a little time on CPU. */
+	sysdc_minDC = 1;
+	sysdc_maxDC = SYSDC_DC_MAX;
+	sysdc_minpri = 0;
+	sysdc_maxpri = maxclsyspri;
+
+	/* break parameters */
+	if (sysdc_max_pset_DC > SYSDC_DC_MAX) {
+		sysdc_max_pset_DC = SYSDC_DC_MAX;
+	}
+	sysdc_break_ticks = MSECTOTICKS(sysdc_break_msec);
+	sysdc_break_updates = HOWMANY(sysdc_break_ticks, sysdc_update_ticks);
+
+	/*
+	 * We want:
+	 *
+	 *	sysdc_max_pset_DC = (nobreak / (break + nobreak))
+	 *
+	 *	==>	nobreak = sysdc_max_pset_DC * (break + nobreak)
+	 *
+	 *			sysdc_max_pset_DC * break
+	 *	==>	nobreak = -------------------------
+	 *			1 - sysdc_max_pset_DC
+	 */
+	sysdc_nobreak_updates =
+	    HOWMANY((uint64_t)sysdc_break_updates * sysdc_max_pset_DC,
+	    (SYSDC_DC_MAX - sysdc_max_pset_DC));
+
+	sysdc_param_init = 1;
+}
+
+#undef HOWMANY
+#undef MSECTOTICKS
+
+#define	SDC_UPDATE_INITIAL	0x1	/* for the initial update */
+#define	SDC_UPDATE_TIMEOUT	0x2	/* from sysdc_update() */
+#define	SDC_UPDATE_TICK		0x4	/* from sysdc_tick(), on expiry */
+
+/*
+ * Updates the recorded times in the sdc, and returns the elapsed ONPROC
+ * and Runnable times since the last reset.
+ *
+ * newO is the thread's actual ONPROC time; it's used during sysdc_update()
+ * to track processor set usage.
+ */
+static void
+sysdc_update_times(sysdc_t *sdc, uint_t flags,
+    hrtime_t *O, hrtime_t *R, hrtime_t *newO)
+{
+	kthread_t *const t = sdc->sdc_thread;
+	const uint_t	initial = (flags & SDC_UPDATE_INITIAL);
+	const uint_t	update = (flags & SDC_UPDATE_TIMEOUT);
+	const clock_t	now = ddi_get_lbolt();
+	uint_t		do_reset;
+
+	ASSERT(THREAD_LOCK_HELD(t));
+
+	*O = *R = 0;
+
+	/* If we've been sleeping, we know we haven't had any ONPROC time. */
+	if (sdc->sdc_sleep_updates != 0 &&
+	    sdc->sdc_sleep_updates != sdc->sdc_nupdates) {
+		*newO = sdc->sdc_last_base_O;
+		SYSDC_INC_STAT(sysdc_update_times_asleep);
+		return;
+	}
+
+	/*
+	 * If this is our first update, or we've hit the reset point,
+	 * we need to reset our base_{O,R}.  Once we've updated them, we
+	 * report O and R for the entire prior interval.
+	 */
+	do_reset = initial;
+	if (update) {
+		++sdc->sdc_nupdates;
+		if ((sdc->sdc_nupdates % sysdc_reset_updates) == 0)
+			do_reset = 1;
+	}
+	if (do_reset) {
+		hrtime_t baseO, baseR;
+		if (initial) {
+			/*
+			 * Start off our cycle count somewhere in the middle,
+			 * to keep the resets from all happening at once.
+			 *
+			 * 4999 is a handy prime much larger than
+			 * sysdc_reset_updates, so that we don't run into
+			 * trouble if the resolution is a multiple of
+			 * sysdc_reset_updates.
+			 */
+			sdc->sdc_nupdates = (uint_t)((gethrtime() % 4999) %
+			    sysdc_reset_updates);
+			baseO = baseR = 0;
+		} else {
+			baseO = sdc->sdc_base_O;
+			baseR = sdc->sdc_base_R;
+		}
+
+		mstate_systhread_times(t, &sdc->sdc_base_O, &sdc->sdc_base_R);
+		*newO = sdc->sdc_base_O;
+
+		sdc->sdc_reset = now;
+		sdc->sdc_pri_check = -1; /* force mismatch below */
+
+		/*
+		 * See below for rationale.
+		 */
+		if (baseO > sdc->sdc_base_O || baseR > sdc->sdc_base_R) {
+			SYSDC_INC_STAT(sysdc_update_times_base_ran_backwards);
+			baseO = sdc->sdc_base_O;
+			baseR = sdc->sdc_base_R;
+		}
+
+		/* compute based on the entire interval */
+		*O = (sdc->sdc_base_O - baseO);
+		*R = (sdc->sdc_base_R - baseR);
+		return;
+	}
+
+	/*
+	 * If we're called from sysdc_update(), we *must* return a value
+	 * for newO, so we always call mstate_systhread_times().
+	 *
+	 * Otherwise, if we've already done a pri check this tick,
+	 * we can skip it.
+	 */
+	if (!update && sdc->sdc_pri_check == now) {
+		SYSDC_INC_STAT(sysdc_update_times_already_done);
+		return;
+	}
+
+	/* Get the current times from the thread */
+	sdc->sdc_pri_check = now;
+	mstate_systhread_times(t, &sdc->sdc_cur_O, &sdc->sdc_cur_R);
+	*newO = sdc->sdc_cur_O;
+
+	/*
+	 * The updating of microstate accounting is not done under a
+	 * consistent set of locks, particularly the t_waitrq field.  This
+	 * can lead to narrow windows in which we account for time in the
+	 * wrong bucket, which on the next read will be accounted for
+	 * correctly.
+	 *
+	 * If our sdc_base_* fields were affected by one of these blips, we
+	 * throw away the old data, and pretend this tick didn't happen.
+	 */
+	if (sdc->sdc_cur_O < sdc->sdc_base_O ||
+	    sdc->sdc_cur_R < sdc->sdc_base_R) {
+
+		sdc->sdc_base_O = sdc->sdc_cur_O;
+		sdc->sdc_base_R = sdc->sdc_cur_R;
+
+		SYSDC_INC_STAT(sysdc_update_times_cur_ran_backwards);
+		return;
+	}
+
+	*O = sdc->sdc_cur_O - sdc->sdc_base_O;
+	*R = sdc->sdc_cur_R - sdc->sdc_base_R;
+}
+
+/*
+ * sysdc_compute_pri()
+ *
+ *	Recomputes the priority of the thread, leaving the result in
+ *	sdc->sdc_epri.  Returns 1 if a priority update should occur
+ *	(which will also trigger a cpu_surrender()), otherwise
+ *	returns 0.
+ */
+static uint_t
+sysdc_compute_pri(sysdc_t *sdc, uint_t flags)
+{
+	kthread_t *const t = sdc->sdc_thread;
+	const uint_t	update = (flags & SDC_UPDATE_TIMEOUT);
+	const uint_t	tick = (flags & SDC_UPDATE_TICK);
+
+	hrtime_t O, R;
+	hrtime_t newO = -1;
+
+	ASSERT(THREAD_LOCK_HELD(t));
+
+	sysdc_update_times(sdc, flags, &O, &R, &newO);
+	ASSERT(!update || newO != -1);
+
+	/* If we have new data, recompute our priority. */
+	if ((O + R) != 0) {
+		sdc->sdc_cur_DC = (O * SYSDC_DC_MAX) / (O + R);
+
+		/* Adjust our priority to move our DC closer to the target. */
+		if (sdc->sdc_cur_DC < sdc->sdc_target_DC)
+			sdc->sdc_pri = sdc->sdc_maxpri;
+		else
+			sdc->sdc_pri = sdc->sdc_minpri;
+	}
+
+	/*
+	 * If our per-pset duty cycle goes over the max, we will take a break.
+	 * This forces all sysdc threads in the pset to minimum priority, in
+	 * order to let everyone else have a chance at the CPU.
+	 */
+	if (sdc->sdc_pset->sdp_need_break) {
+		SYSDC_INC_STAT(sysdc_compute_pri_breaking);
+		sdc->sdc_epri = sdc->sdc_minpri;
+	} else {
+		sdc->sdc_epri = sdc->sdc_pri;
+	}
+
+	DTRACE_PROBE4(sysdc__compute__pri,
+	    kthread_t *, t, pri_t, sdc->sdc_epri, uint_t, sdc->sdc_cur_DC,
+	    uint_t, sdc->sdc_target_DC);
+
+	/*
+	 * For sysdc_update(), we compute the ONPROC time for high-priority
+	 * threads, which is used to calculate the per-pset duty cycle.  We
+	 * will always tell our callers to update the thread's priority,
+	 * since we want to force a cpu_surrender().
+	 *
+	 * We reset sdc_update_ticks so that sysdc_tick() will only update
+	 * the thread's priority if our timeout is delayed by a tick or
+	 * more.
+	 */
+	if (update) {
+		/* SDC threads are not allowed to change cpupart bindings. */
+		ASSERT(t->t_cpupart == sdc->sdc_pset->sdp_cpupart);
+
+		/* If we were at MAXPRI, account for our onproc time. */
+		if (t->t_pri == sdc->sdc_maxpri &&
+		    sdc->sdc_last_base_O != 0 &&
+		    sdc->sdc_last_base_O < newO) {
+			sdc->sdc_last_O = newO - sdc->sdc_last_base_O;
+			sdc->sdc_pset->sdp_onproc_time +=
+			    (uint64_t)sdc->sdc_last_O;
+			sdc->sdc_pset->sdp_onproc_threads++;
+		} else {
+			sdc->sdc_last_O = 0;
+		}
+		sdc->sdc_last_base_O = newO;
+
+		sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks + 1;
+		return (1);
+	}
+
+	/*
+	 * Like sysdc_update(), sysdc_tick() always wants to update the
+	 * thread's priority, so that the CPU is surrendered if necessary.
+	 * We reset sdc_update_ticks so that if the timeout continues to be
+	 * delayed, we'll update at the regular interval.
+	 */
+	if (tick) {
+		ASSERT(sdc->sdc_ticks == sdc->sdc_update_ticks);
+		sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks;
+		return (1);
+	}
+
+	/*
+	 * Otherwise, only tell our callers to update the priority if it has
+	 * changed.
+	 */
+	return (sdc->sdc_epri != t->t_pri);
+}
+
+static void
+sysdc_update_pri(sysdc_t *sdc, uint_t flags)
+{
+	kthread_t *t = sdc->sdc_thread;
+
+	ASSERT(THREAD_LOCK_HELD(t));
+
+	if (sysdc_compute_pri(sdc, flags)) {
+		if (!thread_change_pri(t, sdc->sdc_epri, 0)) {
+			cpu_surrender(t);
+		}
+	}
+}
+
+/*
+ * Add a thread onto the active list.  It will only be removed by
+ * sysdc_update().
+ */
+static void
+sysdc_activate(sysdc_t *sdc)
+{
+	sysdc_t	*volatile *headp = &SYSDC_LIST(sdc)->sdl_list;
+	sysdc_t	*head;
+	kthread_t *t = sdc->sdc_thread;
+
+	SYSDC_INC_STAT(sysdc_activate_enter);
+
+	ASSERT(sdc->sdc_next == NULL);
+	ASSERT(THREAD_LOCK_HELD(t));
+
+	do {
+		head = *headp;
+		sdc->sdc_next = head;
+	} while (atomic_cas_ptr(headp, head, sdc) != head);
+}
+
+/*
+ * sysdc_update() has two jobs:
+ *
+ *	1. It updates the priorities of all active SDC threads on the
+ *	   system.
+ *	2. It measures pset CPU usage and enforces sysdc_max_pset_DC.
+ */
+static void
+sysdc_update(void *arg)
+{
+	int		idx;
+	sysdc_t		*freelist = NULL;
+	sysdc_pset_t	*cur;
+	hrtime_t	now, diff;
+	uint_t		redeploy = 1;
+
+	SYSDC_INC_STAT(sysdc_update_enter);
+
+	ASSERT(sysdc_update_timeout_started);
+
+	/*
+	 * If this is our first time through, diff will be gigantic, and
+	 * no breaks will be necessary.
+	 */
+	now = gethrtime();
+	diff = now - sysdc_last_update;
+	sysdc_last_update = now;
+
+	mutex_enter(&sysdc_pset_lock);
+	for (cur = list_head(&sysdc_psets); cur != NULL;
+	    cur = list_next(&sysdc_psets, cur)) {
+		boolean_t breaking = (cur->sdp_should_break != 0);
+
+		if (cur->sdp_need_break != breaking) {
+			DTRACE_PROBE2(sdc__pset__break, sysdc_pset_t *, cur,
+			    boolean_t, breaking);
+		}
+		cur->sdp_onproc_time = 0;
+		cur->sdp_onproc_threads = 0;
+		cur->sdp_need_break = breaking;
+	}
+	mutex_exit(&sysdc_pset_lock);
+
+	for (idx = 0; idx < SYSDC_NLISTS; idx++) {
+		sysdc_list_t		*sdl = &sysdc_active[idx];
+		sysdc_t			*volatile *headp = &sdl->sdl_list;
+		sysdc_t			*head, *tail;
+		sysdc_t			**prevptr;
+
+		if (*headp == &sysdc_dummy)
+			continue;
+
+		/* Prevent any threads from exiting while we're poking them. */
+		mutex_enter(&sdl->sdl_lock);
+
+		/*
+		 * Each sdl_list contains a singly-linked list of active
+		 * threads.  Threads which become active while we are
+		 * processing the list will be added to sdl_list.  Since we
+		 * don't want that to interfere with our own processing, we
+		 * swap in an empty list.  Any newly active threads will
+		 * go on to this empty list.  When finished, we'll put any
+		 * such threads at the end of the processed list.
+		 */
+		head = atomic_swap_ptr(headp, &sysdc_dummy);
+		prevptr = &head;
+		while (*prevptr != &sysdc_dummy) {
+			sysdc_t		*const	sdc = *prevptr;
+			kthread_t	*const	t = sdc->sdc_thread;
+
+			/*
+			 * If the thread has exited, move its sysdc_t onto
+			 * freelist, to be freed later.
+			 */
+			if (t == NULL) {
+				*prevptr = sdc->sdc_next;
+				SYSDC_INC_STAT(sysdc_update_exited);
+				sdc->sdc_next = freelist;
+				freelist = sdc;
+				continue;
+			}
+
+			thread_lock(t);
+			if (t->t_cid != sysdccid) {
+				thread_unlock(t);
+				prevptr = &sdc->sdc_next;
+				SYSDC_INC_STAT(sysdc_update_not_sdc);
+				continue;
+			}
+			ASSERT(t->t_cldata == sdc);
+
+			/*
+			 * If the thread has been sleeping for longer
+			 * than sysdc_prune_interval, make it inactive by
+			 * removing it from the list.
+			 */
+			if (!(t->t_state & (TS_RUN | TS_ONPROC)) &&
+			    sdc->sdc_sleep_updates != 0 &&
+			    (sdc->sdc_sleep_updates - sdc->sdc_nupdates) >
+			    sysdc_prune_updates) {
+				*prevptr = sdc->sdc_next;
+				SYSDC_INC_STAT(sysdc_update_idle);
+				sdc->sdc_next = NULL;
+				thread_unlock(t);
+				continue;
+			}
+			sysdc_update_pri(sdc, SDC_UPDATE_TIMEOUT);
+			thread_unlock(t);
+
+			prevptr = &sdc->sdc_next;
+		}
+
+		/*
+		 * Add our list to the bucket, putting any new entries
+		 * added while we were working at the tail of the list.
+		 */
+		do {
+			tail = *headp;
+			*prevptr = tail;
+		} while (atomic_cas_ptr(headp, tail, head) != tail);
+
+		mutex_exit(&sdl->sdl_lock);
+	}
+
+	mutex_enter(&sysdc_pset_lock);
+	for (cur = list_head(&sysdc_psets); cur != NULL;
+	    cur = list_next(&sysdc_psets, cur)) {
+
+		cur->sdp_vtime_last_interval =
+		    diff * cur->sdp_cpupart->cp_ncpus;
+		cur->sdp_DC_last_interval =
+		    (cur->sdp_onproc_time * SYSDC_DC_MAX) /
+		    cur->sdp_vtime_last_interval;
+
+		if (cur->sdp_should_break > 0) {
+			cur->sdp_should_break--;	/* breaking */
+			continue;
+		}
+		if (cur->sdp_dont_break > 0) {
+			cur->sdp_dont_break--;	/* waiting before checking */
+			continue;
+		}
+		if (cur->sdp_DC_last_interval > sysdc_max_pset_DC) {
+			cur->sdp_should_break = sysdc_break_updates;
+			cur->sdp_dont_break = sysdc_nobreak_updates;
+			SYSDC_INC_STAT(sysdc_update_take_break);
+		}
+	}
+
+	/*
+	 * If there are no sysdc_psets, there can be no threads, so
+	 * we can stop doing our timeout.  Since we're holding the
+	 * sysdc_pset_lock, no new sysdc_psets can come in, which will
+	 * prevent anyone from racing with this and dropping our timeout
+	 * on the floor.
+	 */
+	if (list_is_empty(&sysdc_psets)) {
+		SYSDC_INC_STAT(sysdc_update_no_psets);
+		ASSERT(sysdc_update_timeout_started);
+		sysdc_update_timeout_started = 0;
+
+		redeploy = 0;
+	}
+	mutex_exit(&sysdc_pset_lock);
+
+	while (freelist != NULL) {
+		sysdc_t *cur = freelist;
+		freelist = cur->sdc_next;
+		kmem_free(cur, sizeof (*cur));
+	}
+
+	if (redeploy) {
+		(void) timeout(sysdc_update, arg, sysdc_update_ticks);
+	}
+}
+
+static void
+sysdc_preempt(kthread_t *t)
+{
+	ASSERT(t == curthread);
+	ASSERT(THREAD_LOCK_HELD(t));
+
+	setbackdq(t);		/* give others a chance to run */
+}
+
+static void
+sysdc_tick(kthread_t *t)
+{
+	sysdc_t *sdc;
+
+	thread_lock(t);
+	if (t->t_cid != sysdccid) {
+		SYSDC_INC_STAT(sysdc_tick_not_sdc);
+		thread_unlock(t);
+		return;
+	}
+	sdc = t->t_cldata;
+	if (t->t_state == TS_ONPROC &&
+	    t->t_pri < t->t_disp_queue->disp_maxrunpri) {
+		cpu_surrender(t);
+	}
+
+	if (t->t_state == TS_ONPROC || t->t_state == TS_RUN) {
+		ASSERT(sdc->sdc_sleep_updates == 0);
+	}
+
+	ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
+	sdc->sdc_ticks++;
+	if (sdc->sdc_ticks == sdc->sdc_update_ticks) {
+		SYSDC_INC_STAT(sysdc_tick_quantum_expired);
+		sysdc_update_pri(sdc, SDC_UPDATE_TICK);
+		ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
+	}
+	thread_unlock(t);
+}
+
+static void
+sysdc_setrun(kthread_t *t)
+{
+	sysdc_t *sdc = t->t_cldata;
+
+	ASSERT(THREAD_LOCK_HELD(t));	/* t should be in transition */
+
+	sdc->sdc_sleep_updates = 0;
+
+	if (sdc->sdc_next == NULL) {
+		/*
+		 * Since we're in transition, we don't want to use the
+		 * full thread_update_pri().
+		 */
+		if (sysdc_compute_pri(sdc, 0)) {
+			THREAD_CHANGE_PRI(t, sdc->sdc_epri);
+		}
+		sysdc_activate(sdc);
+
+		ASSERT(sdc->sdc_next != NULL);
+	}
+
+	setbackdq(t);
+}
+
+static void
+sysdc_wakeup(kthread_t *t)
+{
+	sysdc_setrun(t);
+}
+
+static void
+sysdc_sleep(kthread_t *t)
+{
+	sysdc_t *sdc = t->t_cldata;
+
+	ASSERT(THREAD_LOCK_HELD(t));	/* t should be in transition */
+
+	sdc->sdc_sleep_updates = sdc->sdc_nupdates;
+}
+
+/*ARGSUSED*/
+static int
+sysdc_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp,
+    void *bufp)
+{
+	cpupart_t *const cpupart = t->t_cpupart;
+	sysdc_t *sdc = bufp;
+	sysdc_params_t *sdpp = parmsp;
+	sysdc_pset_t *newpset = sdc->sdc_pset;
+	sysdc_pset_t *pset;
+	int start_timeout;
+
+	if (t->t_cid != syscid)
+		return (EPERM);
+
+	ASSERT(ttolwp(t) != NULL);
+	ASSERT(sdpp != NULL);
+	ASSERT(newpset != NULL);
+	ASSERT(sysdc_param_init);
+
+	ASSERT(sdpp->sdp_minpri >= sysdc_minpri);
+	ASSERT(sdpp->sdp_maxpri <= sysdc_maxpri);
+	ASSERT(sdpp->sdp_DC >= sysdc_minDC);
+	ASSERT(sdpp->sdp_DC <= sysdc_maxDC);
+
+	sdc->sdc_thread = t;
+	sdc->sdc_pri = sdpp->sdp_maxpri;	/* start off maximally */
+	sdc->sdc_minpri = sdpp->sdp_minpri;
+	sdc->sdc_maxpri = sdpp->sdp_maxpri;
+	sdc->sdc_target_DC = sdpp->sdp_DC;
+	sdc->sdc_ticks = 0;
+	sdc->sdc_update_ticks = sysdc_update_ticks + 1;
+
+	/* Assign ourselves to the appropriate pset. */
+	sdc->sdc_pset = NULL;
+	mutex_enter(&sysdc_pset_lock);
+	for (pset = list_head(&sysdc_psets); pset != NULL;
+	    pset = list_next(&sysdc_psets, pset)) {
+		if (pset->sdp_cpupart == cpupart) {
+			break;
+		}
+	}
+	if (pset == NULL) {
+		pset = newpset;
+		newpset = NULL;
+		pset->sdp_cpupart = cpupart;
+		list_insert_tail(&sysdc_psets, pset);
+	}
+	pset->sdp_nthreads++;
+	ASSERT(pset->sdp_nthreads > 0);
+
+	sdc->sdc_pset = pset;
+
+	start_timeout = (sysdc_update_timeout_started == 0);
+	sysdc_update_timeout_started = 1;
+	mutex_exit(&sysdc_pset_lock);
+
+	if (newpset != NULL)
+		kmem_free(newpset, sizeof (*newpset));
+
+	/* Update t's scheduling class and priority. */
+	thread_lock(t);
+	t->t_clfuncs = &(sclass[cid].cl_funcs->thread);
+	t->t_cid = cid;
+	t->t_cldata = sdc;
+	t->t_schedflag |= TS_RUNQMATCH;
+
+	sysdc_update_pri(sdc, SDC_UPDATE_INITIAL);
+	thread_unlock(t);
+
+	/* Kick off the thread timeout if we're the first one in. */
+	if (start_timeout) {
+		(void) timeout(sysdc_update, NULL, sysdc_update_ticks);
+	}
+
+	return (0);
+}
+
+static void
+sysdc_leave(sysdc_t *sdc)
+{
+	sysdc_pset_t *sdp = sdc->sdc_pset;
+	sysdc_list_t *sdl = SYSDC_LIST(sdc);
+	uint_t freedc;
+
+	mutex_enter(&sdl->sdl_lock);		/* block sysdc_update() */
+	sdc->sdc_thread = NULL;
+	freedc = (sdc->sdc_next == NULL);
+	mutex_exit(&sdl->sdl_lock);
+
+	mutex_enter(&sysdc_pset_lock);
+	sdp = sdc->sdc_pset;
+	ASSERT(sdp != NULL);
+	ASSERT(sdp->sdp_nthreads > 0);
+	--sdp->sdp_nthreads;
+	if (sdp->sdp_nthreads == 0) {
+		list_remove(&sysdc_psets, sdp);
+	} else {
+		sdp = NULL;
+	}
+	mutex_exit(&sysdc_pset_lock);
+
+	if (freedc)
+		kmem_free(sdc, sizeof (*sdc));
+	if (sdp != NULL)
+		kmem_free(sdp, sizeof (*sdp));
+}
+
+static void
+sysdc_exitclass(void *buf)
+{
+	sysdc_leave((sysdc_t *)buf);
+}
+
+/*ARGSUSED*/
+static int
+sysdc_canexit(kthread_t *t, cred_t *reqpcredp)
+{
+	/* Threads cannot exit SDC once joined, except in a body bag. */
+	return (EPERM);
+}
+
+static void
+sysdc_exit(kthread_t *t)
+{
+	sysdc_t *sdc;
+
+	/* We're exiting, so we just rejoin the SYS class. */
+	thread_lock(t);
+	ASSERT(t->t_cid == sysdccid);
+	sdc = t->t_cldata;
+	t->t_cid = syscid;
+	t->t_cldata = NULL;
+	t->t_clfuncs = &(sclass[syscid].cl_funcs->thread);
+	(void) thread_change_pri(t, maxclsyspri, 0);
+	t->t_schedflag &= ~TS_RUNQMATCH;
+	thread_unlock_nopreempt(t);
+
+	/* Unlink the sdc from everything. */
+	sysdc_leave(sdc);
+}
+
+/*ARGSUSED*/
+static int
+sysdc_fork(kthread_t *t, kthread_t *ct, void *bufp)
+{
+	/*
+	 * Threads cannot be created with SDC as their class; they must
+	 * be created as SYS and then added with sysdc_thread_enter().
+	 * Because of this restriction, sysdc_fork() should never be called.
+	 */
+	panic("sysdc cannot be forked");
+
+	return (ENOSYS);
+}
+
+/*ARGSUSED*/
+static void
+sysdc_forkret(kthread_t *t, kthread_t *ct)
+{
+	/* SDC threads are part of system processes, which never fork. */
+	panic("sysdc cannot be forked");
+}
+
+static pri_t
+sysdc_globpri(kthread_t *t)
+{
+	return (t->t_epri);
+}
+
+/*ARGSUSED*/
+static pri_t
+sysdc_no_swap(kthread_t *t, int flags)
+{
+	/* SDC threads cannot be swapped. */
+	return (-1);
+}
+
+/*
+ * Get maximum and minimum priorities enjoyed by SDC threads.
+ */
+static int
+sysdc_getclpri(pcpri_t *pcprip)
+{
+	pcprip->pc_clpmax = sysdc_maxpri;
+	pcprip->pc_clpmin = sysdc_minpri;
+	return (0);
+}
+
+/*ARGSUSED*/
+static int
+sysdc_getclinfo(void *arg)
+{
+	return (0);		/* no class-specific info */
+}
+
+/*ARGSUSED*/
+static int
+sysdc_alloc(void **p, int flag)
+{
+	sysdc_t *new;
+
+	*p = NULL;
+	if ((new = kmem_zalloc(sizeof (*new), flag)) == NULL) {
+		return (ENOMEM);
+	}
+	if ((new->sdc_pset = kmem_zalloc(sizeof (*new->sdc_pset), flag)) ==
+	    NULL) {
+		kmem_free(new, sizeof (*new));
+		return (ENOMEM);
+	}
+	*p = new;
+	return (0);
+}
+
+static void
+sysdc_free(void *p)
+{
+	sysdc_t *sdc = p;
+
+	if (sdc != NULL) {
+		/*
+		 * We must have failed CL_ENTERCLASS(), so our pset should be
+		 * there and unused.
+		 */
+		ASSERT(sdc->sdc_pset != NULL);
+		ASSERT(sdc->sdc_pset->sdp_cpupart == NULL);
+		kmem_free(sdc->sdc_pset, sizeof (*sdc->sdc_pset));
+		kmem_free(sdc, sizeof (*sdc));
+	}
+}
+
+static int sysdc_enosys();	/* Boy, ANSI-C's K&R compatibility is weird. */
+static int sysdc_einval();
+static void sysdc_nullsys();
+
+static struct classfuncs sysdc_classfuncs = {
+	/* messages to class manager */
+	{
+		sysdc_enosys,	/* admin */
+		sysdc_getclinfo,
+		sysdc_enosys,	/* parmsin */
+		sysdc_enosys,	/* parmsout */
+		sysdc_enosys,	/* vaparmsin */
+		sysdc_enosys,	/* vaparmsout */
+		sysdc_getclpri,
+		sysdc_alloc,
+		sysdc_free,
+	},
+	/* operations on threads */
+	{
+		sysdc_enterclass,
+		sysdc_exitclass,
+		sysdc_canexit,
+		sysdc_fork,
+		sysdc_forkret,
+		sysdc_nullsys,	/* parmsget */
+		sysdc_enosys,	/* parmsset */
+		sysdc_nullsys,	/* stop */
+		sysdc_exit,
+		sysdc_nullsys,	/* active */
+		sysdc_nullsys,	/* inactive */
+		sysdc_no_swap,	/* swapin */
+		sysdc_no_swap,	/* swapout */
+		sysdc_nullsys,	/* trapret */
+		sysdc_preempt,
+		sysdc_setrun,
+		sysdc_sleep,
+		sysdc_tick,
+		sysdc_wakeup,
+		sysdc_einval,	/* donice */
+		sysdc_globpri,
+		sysdc_nullsys,	/* set_process_group */
+		sysdc_nullsys,	/* yield */
+		sysdc_einval,	/* doprio */
+	}
+};
+
+static int
+sysdc_enosys()
+{
+	return (ENOSYS);
+}
+
+static int
+sysdc_einval()
+{
+	return (EINVAL);
+}
+
+static void
+sysdc_nullsys()
+{
+}
+
+/*ARGSUSED*/
+static pri_t
+sysdc_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp)
+{
+	int idx;
+
+	list_create(&sysdc_psets, sizeof (sysdc_pset_t),
+	    offsetof(sysdc_pset_t, sdp_node));
+
+	for (idx = 0; idx < SYSDC_NLISTS; idx++) {
+		sysdc_active[idx].sdl_list = &sysdc_dummy;
+	}
+
+	sysdc_initparam();
+
+	sysdccid = cid;
+	*clfuncspp = &sysdc_classfuncs;
+
+	return ((pri_t)v.v_maxsyspri);
+}
+
+static struct sclass csw = {
+	"SDC",
+	sysdc_init,
+	0
+};
+
+static struct modlsched modlsched = {
+	&mod_schedops, "system duty cycle scheduling class", &csw
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1, (void *)&modlsched, NULL
+};
+
+int
+_init()
+{
+	return (mod_install(&modlinkage));
+}
+
+int
+_fini()
+{
+	return (EBUSY);		/* can't unload for now */
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
+
+/* --- consolidation-private interfaces --- */
+void
+sysdc_thread_enter(kthread_t *t, uint_t dc, uint_t flags)
+{
+	void *buf = NULL;
+	sysdc_params_t sdp;
+
+	SYSDC_INC_STAT(sysdc_thread_enter_enter);
+
+	ASSERT(sysdc_param_init);
+	ASSERT(sysdccid >= 0);
+
+	ASSERT((flags & ~SYSDC_THREAD_BATCH) == 0);
+
+	sdp.sdp_minpri = sysdc_minpri;
+	sdp.sdp_maxpri = sysdc_maxpri;
+	sdp.sdp_DC = MAX(MIN(dc, sysdc_maxDC), sysdc_minDC);
+
+	if (flags & SYSDC_THREAD_BATCH)
+		sdp.sdp_maxpri -= sysdc_batch_niceness;
+
+	VERIFY3U(CL_ALLOC(&buf, sysdccid, KM_SLEEP), ==, 0);
+
+	ASSERT(t->t_lwp != NULL);
+	ASSERT(t->t_cid == syscid);
+	ASSERT(t->t_cldata == NULL);
+	VERIFY3U(CL_CANEXIT(t, NULL), ==, 0);
+	VERIFY3U(CL_ENTERCLASS(t, sysdccid, &sdp, kcred, buf), ==, 0);
+	CL_EXITCLASS(syscid, NULL);
+}
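[Editor's note: a sketch of how a consumer would use the new class, based on the sysdc_thread_enter() interface notes in the comment above, not part of this changeset — the worker function and the 80% duty cycle are invented for illustration:

	/*
	 * Hypothetical example.  The calling thread must be a SYS-class
	 * kernel thread with an associated LWP (e.g. one created via
	 * lwp_kernel_create() in an SSYS process), per the interface
	 * comment in sysdc.c.
	 */
	static void
	example_worker(void)
	{
		/* join SDC with an 80% target DC at batch priority */
		sysdc_thread_enter(curthread, 80, SYSDC_THREAD_BATCH);

		for (;;) {
			/* CPU-intensive work; SDC holds us near 80% ONPROC */
		}
	}

The spa.c diff below shows the real first consumer, which reaches sysdc_thread_enter() indirectly through taskq_create_sysdc().]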
--- a/usr/src/uts/common/disp/thread.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/disp/thread.c	Mon Nov 23 15:29:44 2009 -0800
@@ -98,7 +98,10 @@
 
 extern int nthread;
 
+/* System Scheduling classes. */
 id_t	syscid;				/* system scheduling class ID */
+id_t	sysdccid = CLASS_UNUSED;	/* reset when SDC loads */
+
 void	*segkp_thread;			/* cookie for segkp pool */
 
 int lwp_cache_sz = 32;
--- a/usr/src/uts/common/fs/proc/prcontrol.c Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/common/fs/proc/prcontrol.c Mon Nov 23 15:29:44 2009 -0800 @@ -20,12 +20,10 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/uio.h> #include <sys/param.h> @@ -299,6 +297,12 @@ p = pcp->prc_proc; ASSERT(p != NULL); + /* System processes defy control. */ + if (p->p_flag & SSYS) { + prunlock(pnp); + return (EBUSY); + } + switch (cmd) { default: @@ -315,7 +319,7 @@ /* * Can't apply to a system process. */ - if ((p->p_flag & SSYS) || p->p_as == &kas) { + if (p->p_as == &kas) { error = EBUSY; break; } @@ -723,6 +727,11 @@ p = pcp->prc_proc; ASSERT(p != NULL); + if (p->p_flag & SSYS) { + prunlock(pnp); + return (EBUSY); + } + switch (cmd) { default: @@ -739,7 +748,7 @@ /* * Can't apply to a system process. */ - if ((p->p_flag & SSYS) || p->p_as == &kas) { + if (p->p_as == &kas) { error = EBUSY; break; }
--- a/usr/src/uts/common/fs/vfs.c Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/common/fs/vfs.c Mon Nov 23 15:29:44 2009 -0800 @@ -812,6 +812,7 @@ char *path; size_t plen; struct vfssw *vswp; + proc_t *p; rw_init(&vfssw_lock, NULL, RW_DEFAULT, NULL); rw_init(&vfslist, NULL, RW_DEFAULT, NULL); @@ -835,9 +836,22 @@ vfs_setmntpoint(rootvfs, "/"); if (VFS_ROOT(rootvfs, &rootdir)) panic("vfs_mountroot: no root vnode"); - PTOU(curproc)->u_cdir = rootdir; - VN_HOLD(PTOU(curproc)->u_cdir); - PTOU(curproc)->u_rdir = NULL; + + /* + * At this point, the process tree consists of p0 and possibly some + * direct children of p0. (i.e. there are no grandchildren) + * + * Walk through them all, setting their current directory. + */ + mutex_enter(&pidlock); + for (p = practive; p != NULL; p = p->p_next) { + ASSERT(p == &p0 || p->p_parent == &p0); + + PTOU(p)->u_cdir = rootdir; + VN_HOLD(PTOU(p)->u_cdir); + PTOU(p)->u_rdir = NULL; + } + mutex_exit(&pidlock); /* * Setup the global zone's rootvp, now that it exists.
--- a/usr/src/uts/common/fs/zfs/spa.c Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/common/fs/zfs/spa.c Mon Nov 23 15:29:44 2009 -0800 @@ -62,24 +62,28 @@ #include <sys/zfs_ioctl.h> #ifdef _KERNEL +#include <sys/bootprops.h> +#include <sys/callb.h> +#include <sys/cpupart.h> +#include <sys/pool.h> +#include <sys/sysdc.h> #include <sys/zone.h> -#include <sys/bootprops.h> #endif /* _KERNEL */ #include "zfs_prop.h" #include "zfs_comutil.h" -enum zti_modes { +typedef enum zti_modes { zti_mode_fixed, /* value is # of threads (min 1) */ zti_mode_online_percent, /* value is % of online CPUs */ - zti_mode_tune, /* fill from zio_taskq_tune_* */ + zti_mode_batch, /* cpu-intensive; value is ignored */ zti_mode_null, /* don't create a taskq */ zti_nmodes -}; +} zti_modes_t; #define ZTI_FIX(n) { zti_mode_fixed, (n) } #define ZTI_PCT(n) { zti_mode_online_percent, (n) } -#define ZTI_TUNE { zti_mode_tune, 0 } +#define ZTI_BATCH { zti_mode_batch, 0 } #define ZTI_NULL { zti_mode_null, 0 } #define ZTI_ONE ZTI_FIX(1) @@ -90,7 +94,7 @@ } zio_taskq_info_t; static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { - "issue", "issue_high", "intr", "intr_high" + "issue", "issue_high", "intr", "intr_high" }; /* @@ -100,19 +104,29 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, - { ZTI_FIX(8), ZTI_NULL, ZTI_TUNE, ZTI_NULL }, - { ZTI_TUNE, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) }, + { ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL }, + { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) }, { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, }; -enum zti_modes zio_taskq_tune_mode = zti_mode_online_percent; -uint_t zio_taskq_tune_value = 80; /* #threads = 80% of # online CPUs */ - static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); +uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */ +id_t zio_taskq_psrset_bind = PS_NONE; +boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ +uint_t zio_taskq_basedc = 80; /* base duty cycle */ + +boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ + +/* + * This (illegal) pool name is used when temporarily importing a spa_t in order + * to get the vdev stats associated with the imported devices. 
+ */ +#define TRYIMPORT_NAME "$import" + /* * ========================================================================== * SPA properties routines @@ -584,6 +598,139 @@ offsetof(spa_error_entry_t, se_avl)); } +static taskq_t * +spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode, + uint_t value) +{ + uint_t flags = TASKQ_PREPOPULATE; + boolean_t batch = B_FALSE; + + switch (mode) { + case zti_mode_null: + return (NULL); /* no taskq needed */ + + case zti_mode_fixed: + ASSERT3U(value, >=, 1); + value = MAX(value, 1); + break; + + case zti_mode_batch: + batch = B_TRUE; + flags |= TASKQ_THREADS_CPU_PCT; + value = zio_taskq_batch_pct; + break; + + case zti_mode_online_percent: + flags |= TASKQ_THREADS_CPU_PCT; + break; + + default: + panic("unrecognized mode for %s taskq (%u:%u) in " + "spa_activate()", + name, mode, value); + break; + } + + if (zio_taskq_sysdc && spa->spa_proc != &p0) { + if (batch) + flags |= TASKQ_DC_BATCH; + + return (taskq_create_sysdc(name, value, 50, INT_MAX, + spa->spa_proc, zio_taskq_basedc, flags)); + } + return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX, + spa->spa_proc, flags)); +} + +static void +spa_create_zio_taskqs(spa_t *spa) +{ + for (int t = 0; t < ZIO_TYPES; t++) { + for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { + const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; + enum zti_modes mode = ztip->zti_mode; + uint_t value = ztip->zti_value; + char name[32]; + + (void) snprintf(name, sizeof (name), + "%s_%s", zio_type_name[t], zio_taskq_types[q]); + + spa->spa_zio_taskq[t][q] = + spa_taskq_create(spa, name, mode, value); + } + } +} + +#ifdef _KERNEL +static void +spa_thread(void *arg) +{ + callb_cpr_t cprinfo; + + spa_t *spa = arg; + user_t *pu = PTOU(curproc); + + CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, + spa->spa_name); + + ASSERT(curproc != &p0); + (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), + "zpool-%s", spa->spa_name); + (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); + + /* bind this thread to the requested psrset */ + if (zio_taskq_psrset_bind != PS_NONE) { + pool_lock(); + mutex_enter(&cpu_lock); + mutex_enter(&pidlock); + mutex_enter(&curproc->p_lock); + + if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, + 0, NULL, NULL) == 0) { + curthread->t_bind_pset = zio_taskq_psrset_bind; + } else { + cmn_err(CE_WARN, + "Couldn't bind process for zfs pool \"%s\" to " + "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); + } + + mutex_exit(&curproc->p_lock); + mutex_exit(&pidlock); + mutex_exit(&cpu_lock); + pool_unlock(); + } + + if (zio_taskq_sysdc) { + sysdc_thread_enter(curthread, 100, 0); + } + + spa->spa_proc = curproc; + spa->spa_did = curthread->t_did; + + spa_create_zio_taskqs(spa); + + mutex_enter(&spa->spa_proc_lock); + ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); + + spa->spa_proc_state = SPA_PROC_ACTIVE; + cv_broadcast(&spa->spa_proc_cv); + + CALLB_CPR_SAFE_BEGIN(&cprinfo); + while (spa->spa_proc_state == SPA_PROC_ACTIVE) + cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); + CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); + + ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); + spa->spa_proc_state = SPA_PROC_GONE; + spa->spa_proc = &p0; + cv_broadcast(&spa->spa_proc_cv); + CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ + + mutex_enter(&curproc->p_lock); + lwp_exit(); +} +#endif + /* * Activate an uninitialized pool. 
*/ @@ -598,53 +745,38 @@ spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); - for (int t = 0; t < ZIO_TYPES; t++) { - for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { - const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; - enum zti_modes mode = ztip->zti_mode; - uint_t value = ztip->zti_value; - char name[32]; - - (void) snprintf(name, sizeof (name), - "%s_%s", zio_type_name[t], zio_taskq_types[q]); - - if (mode == zti_mode_tune) { - mode = zio_taskq_tune_mode; - value = zio_taskq_tune_value; - if (mode == zti_mode_tune) - mode = zti_mode_online_percent; + /* Try to create a covering process */ + mutex_enter(&spa->spa_proc_lock); + ASSERT(spa->spa_proc_state == SPA_PROC_NONE); + ASSERT(spa->spa_proc == &p0); + spa->spa_did = 0; + + /* Only create a process if we're going to be around a while. */ + if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { + if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, + NULL, 0) == 0) { + spa->spa_proc_state = SPA_PROC_CREATED; + while (spa->spa_proc_state == SPA_PROC_CREATED) { + cv_wait(&spa->spa_proc_cv, + &spa->spa_proc_lock); } - - switch (mode) { - case zti_mode_fixed: - ASSERT3U(value, >=, 1); - value = MAX(value, 1); - - spa->spa_zio_taskq[t][q] = taskq_create(name, - value, maxclsyspri, 50, INT_MAX, - TASKQ_PREPOPULATE); - break; - - case zti_mode_online_percent: - spa->spa_zio_taskq[t][q] = taskq_create(name, - value, maxclsyspri, 50, INT_MAX, - TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT); - break; - - case zti_mode_null: - spa->spa_zio_taskq[t][q] = NULL; - break; - - case zti_mode_tune: - default: - panic("unrecognized mode for " - "zio_taskqs[%u]->zti_nthreads[%u] (%u:%u) " - "in spa_activate()", - t, q, mode, value); - break; - } + ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); + ASSERT(spa->spa_proc != &p0); + ASSERT(spa->spa_did != 0); + } else { +#ifdef _KERNEL + cmn_err(CE_WARN, + "Couldn't create process for zfs pool \"%s\"\n", + spa->spa_name); +#endif } } + mutex_exit(&spa->spa_proc_lock); + + /* If we didn't create a process, we need to create our taskqs. */ + if (spa->spa_proc == &p0) { + spa_create_zio_taskqs(spa); + } list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); @@ -703,6 +835,31 @@ avl_destroy(&spa->spa_errlist_last); spa->spa_state = POOL_STATE_UNINITIALIZED; + + mutex_enter(&spa->spa_proc_lock); + if (spa->spa_proc_state != SPA_PROC_NONE) { + ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); + spa->spa_proc_state = SPA_PROC_DEACTIVATE; + cv_broadcast(&spa->spa_proc_cv); + while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { + ASSERT(spa->spa_proc != &p0); + cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); + } + ASSERT(spa->spa_proc_state == SPA_PROC_GONE); + spa->spa_proc_state = SPA_PROC_NONE; + } + ASSERT(spa->spa_proc == &p0); + mutex_exit(&spa->spa_proc_lock); + + /* + * We want to make sure spa_thread() has actually exited the ZFS + * module, so that the module can't be unloaded out from underneath + * it. + */ + if (spa->spa_did != 0) { + thread_join(spa->spa_did); + spa->spa_did = 0; + } } /* @@ -2999,13 +3156,6 @@ return (0); } - -/* - * This (illegal) pool name is used when temporarily importing a spa_t in order - * to get the vdev stats associated with the imported devices. - */ -#define TRYIMPORT_NAME "$import" - nvlist_t * spa_tryimport(nvlist_t *tryconfig) {
--- a/usr/src/uts/common/fs/zfs/spa_misc.c Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/common/fs/zfs/spa_misc.c Mon Nov 23 15:29:44 2009 -0800 @@ -430,15 +430,17 @@ spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); @@ -451,6 +453,8 @@ spa->spa_freeze_txg = UINT64_MAX; spa->spa_final_txg = UINT64_MAX; spa->spa_load_max_txg = UINT64_MAX; + spa->spa_proc = &p0; + spa->spa_proc_state = SPA_PROC_NONE; refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); @@ -522,15 +526,17 @@ bplist_fini(&spa->spa_deferred_bplist); cv_destroy(&spa->spa_async_cv); + cv_destroy(&spa->spa_proc_cv); cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); mutex_destroy(&spa->spa_async_lock); - mutex_destroy(&spa->spa_scrub_lock); + mutex_destroy(&spa->spa_errlist_lock); mutex_destroy(&spa->spa_errlog_lock); - mutex_destroy(&spa->spa_errlist_lock); mutex_destroy(&spa->spa_history_lock); + mutex_destroy(&spa->spa_proc_lock); mutex_destroy(&spa->spa_props_lock); + mutex_destroy(&spa->spa_scrub_lock); mutex_destroy(&spa->spa_suspend_lock); mutex_destroy(&spa->spa_vdev_top_lock);
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Mon Nov 23 15:29:44 2009 -0800
@@ -86,6 +86,25 @@
 	ZIO_TASKQ_TYPES
 };
 
+/*
+ * State machine for the zpool-poolname process.  The state transitions
+ * are done as follows:
+ *
+ *	From		   To			Routine
+ *	PROC_NONE	-> PROC_CREATED		spa_activate()
+ *	PROC_CREATED	-> PROC_ACTIVE		spa_thread()
+ *	PROC_ACTIVE	-> PROC_DEACTIVATE	spa_deactivate()
+ *	PROC_DEACTIVATE	-> PROC_GONE		spa_thread()
+ *	PROC_GONE	-> PROC_NONE		spa_deactivate()
+ */
+typedef enum spa_proc_state {
+	SPA_PROC_NONE,		/* spa_proc = &p0, no process created */
+	SPA_PROC_CREATED,	/* spa_activate() has proc, is waiting */
+	SPA_PROC_ACTIVE,	/* taskqs created, spa_proc set */
+	SPA_PROC_DEACTIVATE,	/* spa_deactivate() requests process exit */
+	SPA_PROC_GONE		/* spa_thread() is exiting, spa_proc = &p0 */
+} spa_proc_state_t;
+
 struct spa {
 	/*
 	 * Fields protected by spa_namespace_lock.
@@ -186,6 +205,11 @@
 	uint64_t	spa_dedup_checksum;	/* default dedup checksum */
 	uint64_t	spa_dspace;		/* dspace in normal class */
 	kmutex_t	spa_vdev_top_lock;	/* dueling offline/remove */
+	kmutex_t	spa_proc_lock;		/* protects spa_proc* */
+	kcondvar_t	spa_proc_cv;		/* spa_proc_state transitions */
+	spa_proc_state_t spa_proc_state;	/* see definition */
+	struct proc	*spa_proc;		/* "zpool-poolname" process */
+	uint64_t	spa_did;		/* if procp != p0, did of t1 */
 	boolean_t	spa_autoreplace;	/* autoreplace set in open */
 	int		spa_vdev_locks;		/* locks grabbed */
 	/*
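Restated as C for clarity (a sketch only; spa_proc_transition_ok() is not part of the changeset), the table above permits exactly these transitions:

	static boolean_t
	spa_proc_transition_ok(spa_proc_state_t from, spa_proc_state_t to)
	{
		return ((from == SPA_PROC_NONE && to == SPA_PROC_CREATED) ||
		    (from == SPA_PROC_CREATED && to == SPA_PROC_ACTIVE) ||
		    (from == SPA_PROC_ACTIVE && to == SPA_PROC_DEACTIVATE) ||
		    (from == SPA_PROC_DEACTIVATE && to == SPA_PROC_GONE) ||
		    (from == SPA_PROC_GONE && to == SPA_PROC_NONE));
	}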
--- a/usr/src/uts/common/fs/zfs/zio.c Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/common/fs/zfs/zio.c Mon Nov 23 15:29:44 2009 -0800 @@ -85,6 +85,8 @@ */ #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) +boolean_t zio_requeue_io_start_cut_in_line = B_TRUE; + #ifdef ZFS_DEBUG int zio_buf_debug_limit = 16384; #else @@ -1024,10 +1026,11 @@ */ static void -zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q) +zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline) { spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; + int flags = TQ_SLEEP | (cutinline ? TQ_FRONT : 0); /* * If we're a config writer or a probe, the normal issue and @@ -1052,7 +1055,7 @@ ASSERT3U(q, <, ZIO_TASKQ_TYPES); (void) taskq_dispatch(spa->spa_zio_taskq[t][q], - (task_func_t *)zio_execute, zio, TQ_SLEEP); + (task_func_t *)zio_execute, zio, flags); } static boolean_t @@ -1071,7 +1074,7 @@ static int zio_issue_async(zio_t *zio) { - zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (ZIO_PIPELINE_STOP); } @@ -1079,7 +1082,7 @@ void zio_interrupt(zio_t *zio) { - zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT); + zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); } /* @@ -1122,10 +1125,15 @@ * will grab a config lock that is held across I/O, * or may wait for an I/O that needs an interrupt thread * to complete, issue async to avoid deadlock. + * + * For VDEV_IO_START, we cut in line so that the io will + * be sent to disk promptly. */ if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { - zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); + boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? + zio_requeue_io_start_cut_in_line : B_FALSE; + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } @@ -1790,7 +1798,7 @@ } if (dde == NULL) { zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; - zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (ZIO_PIPELINE_STOP); } if (dde->dde_repair_data != NULL) { @@ -2365,6 +2373,9 @@ /* * If the I/O failed, determine whether we should attempt to retry it. + * + * On retry, we cut in line in the issue queue, since we don't want + * compression/checksumming/etc. work to prevent our (cheap) IO reissue. */ if (zio->io_error && vd == NULL && !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { @@ -2374,7 +2385,8 @@ zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; - zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, + zio_requeue_io_start_cut_in_line); return (ZIO_PIPELINE_STOP); }
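The cut-in-line behavior above reduces to the new TQ_FRONT dispatch flag (see the taskq.c changes below); a sketch of the equivalent direct call:

	/*
	 * Sketch: requeue a zio at the *front* of its issue taskq so the
	 * (cheap) reissue isn't stuck behind queued compression and
	 * checksum work.
	 */
	(void) taskq_dispatch(
	    spa->spa_zio_taskq[zio->io_type][ZIO_TASKQ_ISSUE],
	    (task_func_t *)zio_execute, zio, TQ_SLEEP | TQ_FRONT);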
--- a/usr/src/uts/common/os/condvar.c Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/common/os/condvar.c Mon Nov 23 15:29:44 2009 -0800 @@ -314,13 +314,15 @@ ASSERT(!quiesce_active); /* - * The check for t_intr is to catch an interrupt thread - * that has not yet unpinned the thread underneath. + * Threads in system processes don't process signals. This is + * true both for standard threads of system processes and for + * interrupt threads which have borrowed their pinned thread's LWP. */ - if (lwp == NULL || t->t_intr) { + if (lwp == NULL || (p->p_flag & SSYS)) { cv_wait(cvp, mp); return (rval); } + ASSERT(t->t_intr == NULL); ASSERT(curthread->t_schedflag & TS_DONT_SWAP); cancel_pending = schedctl_cancel_pending(); @@ -374,12 +376,13 @@ ASSERT(!quiesce_active); /* - * If there is no lwp, then we don't need to wait for a signal. - * The check for t_intr is to catch an interrupt thread - * that has not yet unpinned the thread underneath. + * Threads in system processes don't process signals. This is + * true both for standard threads of system processes and for + * interrupt threads which have borrowed their pinned thread's LWP. */ - if (lwp == NULL || t->t_intr) + if (lwp == NULL || (p->p_flag & SSYS)) return (cv_timedwait_hires(cvp, mp, tim, res, flag)); + ASSERT(t->t_intr == NULL); /* * If tim is less than or equal to current hrtime, then the timeout @@ -516,13 +519,15 @@ return (rval); /* - * The check for t_intr is to catch an interrupt thread - * that has not yet unpinned the thread underneath. + * Threads in system processes don't process signals. This is + * true both for standard threads of system processes and for + * interrupt threads which have borrowed their pinned thread's LWP. */ - if (lwp == NULL || t->t_intr) { + if (lwp == NULL || (p->p_flag & SSYS)) { cv_wait(cvp, mp); return (rval); } + ASSERT(t->t_intr == NULL); cancel_pending = schedctl_cancel_pending(); lwp->lwp_asleep = 1; @@ -640,14 +645,15 @@ return; /* - * If there is no lwp, then we don't need to eventually stop it - * The check for t_intr is to catch an interrupt thread - * that has not yet unpinned the thread underneath. + * Threads in system processes don't process signals. This is + * true both for standard threads of system processes and for + * interrupt threads which have borrowed their pinned thread's LWP. */ - if (lwp == NULL || t->t_intr) { + if (lwp == NULL || (p->p_flag & SSYS)) { cv_wait(cvp, mp); return; } + ASSERT(t->t_intr == NULL); /* * Wakeup in wakeup_time milliseconds, i.e., human time.
--- a/usr/src/uts/common/os/exit.c Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/common/os/exit.c Mon Nov 23 15:29:44 2009 -0800 @@ -405,10 +405,12 @@ * Allocate a sigqueue now, before we grab locks. * It will be given to sigcld(), below. * Special case: If we will be making the process disappear - * without a trace (for the benefit of posix_spawn() in libc) - * don't bother to allocate a useless sigqueue. + * without a trace because it is either: + * * an exiting SSYS process, or + * * a posix_spawn() vfork child who requests it, + * we don't bother to allocate a useless sigqueue. */ - evaporate = ((p->p_flag & SVFORK) && + evaporate = (p->p_flag & SSYS) || ((p->p_flag & SVFORK) && why == CLD_EXITED && what == _EVAPORATE); if (!evaporate) sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); @@ -747,6 +749,8 @@ rdir = PTOU(p)->u_rdir; cwd = PTOU(p)->u_cwd; + ASSERT(cdir != NULL || p->p_parent == &p0); + /* * Release resource controls, as they are no longer enforceable. */ @@ -840,7 +844,8 @@ * We don't release u_cdir and u_rdir until SZOMB is set. * This protects us against dofusers(). */ - VN_RELE(cdir); + if (cdir) + VN_RELE(cdir); if (rdir) VN_RELE(rdir); if (cwd)
--- a/usr/src/uts/common/os/fork.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/os/fork.c	Mon Nov 23 15:29:44 2009 -0800
@@ -81,7 +81,10 @@
 #include <sys/fork.h>
 
 static int64_t cfork(int, int, int);
-static int getproc(proc_t **, int);
+static int getproc(proc_t **, pid_t, uint_t);
+#define	GETPROC_USER	0x0
+#define	GETPROC_KERNEL	0x1
+
 static void fork_fail(proc_t *);
 static void forklwp_fail(proc_t *);
 
@@ -224,7 +227,7 @@
 	/*
 	 * Create a child proc struct. Place a VN_HOLD on appropriate vnodes.
 	 */
-	if (getproc(&cp, 0) < 0) {
+	if (getproc(&cp, 0, GETPROC_USER) < 0) {
 		mutex_enter(&p->p_lock);
 		pool_barrier_exit();
 		continuelwps(p);
@@ -779,20 +782,24 @@
  * fork a kernel process.
  */
 int
-newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct)
+newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,
+    pid_t pid)
 {
 	proc_t *p;
 	struct user *up;
-	klwp_t *lwp;
+	kthread_t *t;
 	cont_process_t *ctp = NULL;
 	rctl_entity_p_t e;
 
-	ASSERT(!(cid == syscid && ct != NULL));
-	if (cid == syscid) {
+	ASSERT(cid != sysdccid);
+	ASSERT(cid != syscid || ct == NULL);
+	if (CLASS_KERNEL(cid)) {
 		rctl_alloc_gp_t *init_gp;
 		rctl_set_t *init_set;
 
-		if (getproc(&p, 1) < 0)
+		ASSERT(pid != 1);
+
+		if (getproc(&p, pid, GETPROC_KERNEL) < 0)
 			return (EAGAIN);
 
 		/*
@@ -827,12 +834,17 @@
 		mutex_exit(&p->p_lock);
 		rctl_prealloc_destroy(init_gp);
-	} else {
+
+		t = lwp_kernel_create(p, pc, arg, TS_STOPPED, pri);
+	} else {
 		rctl_alloc_gp_t *init_gp, *default_gp;
 		rctl_set_t *init_set;
 		task_t *tk, *tk_old;
+		klwp_t *lwp;
 
-		if (getproc(&p, 0) < 0)
+		ASSERT(pid == 1 || pid == 0);
+
+		if (getproc(&p, pid, GETPROC_USER) < 0)
 			return (EAGAIN);
 		/*
 		 * init creates a new task, distinct from the task
@@ -865,29 +877,26 @@
 		task_rele(tk_old);
 		rctl_prealloc_destroy(default_gp);
 		rctl_prealloc_destroy(init_gp);
-	}
-
-	p->p_as = &kas;
-	if ((lwp = lwp_create(pc, arg, 0, p, TS_STOPPED, pri,
-	    &curthread->t_hold, cid, 1)) == NULL) {
-		task_t *tk;
-		fork_fail(p);
-		mutex_enter(&pidlock);
-		mutex_enter(&p->p_lock);
-		tk = p->p_task;
-		task_detach(p);
-		ASSERT(p->p_pool->pool_ref > 0);
-		atomic_add_32(&p->p_pool->pool_ref, -1);
-		mutex_exit(&p->p_lock);
-		pid_exit(p);
-		mutex_exit(&pidlock);
-		task_rele(tk);
+		if ((lwp = lwp_create(pc, arg, 0, p, TS_STOPPED, pri,
+		    &curthread->t_hold, cid, 1)) == NULL) {
+			task_t *tk;
+			fork_fail(p);
+			mutex_enter(&pidlock);
+			mutex_enter(&p->p_lock);
+			tk = p->p_task;
+			task_detach(p);
+			ASSERT(p->p_pool->pool_ref > 0);
+			atomic_add_32(&p->p_pool->pool_ref, -1);
+			mutex_exit(&p->p_lock);
+			pid_exit(p);
+			mutex_exit(&pidlock);
+			task_rele(tk);
 
-		return (EAGAIN);
-	}
+			return (EAGAIN);
+		}
+		t = lwptot(lwp);
 
-	if (cid != syscid) {
 		ctp = contract_process_fork(sys_process_tmpl, p, curproc,
 		    B_FALSE);
 		ASSERT(ctp != NULL);
@@ -895,13 +904,14 @@
 			*ct = &ctp->conp_contract;
 	}
 
+	ASSERT3U(t->t_tid, ==, 1);
 	p->p_lwpid = 1;
 	mutex_enter(&pidlock);
-	pgjoin(p, curproc->p_pgidp);
+	pgjoin(p, p->p_parent->p_pgidp);
 	p->p_stat = SRUN;
 	mutex_enter(&p->p_lock);
-	lwptot(lwp)->t_proc_flag &= ~TP_HOLDLWP;
-	lwp_create_done(lwptot(lwp));
+	t->t_proc_flag &= ~TP_HOLDLWP;
+	lwp_create_done(t);
 	mutex_exit(&p->p_lock);
 	mutex_exit(&pidlock);
 	return (0);
@@ -911,7 +921,7 @@
  * create a child proc struct.
 */
 static int
-getproc(proc_t **cpp, int kernel)
+getproc(proc_t **cpp, pid_t pid, uint_t flags)
 {
 	proc_t *pp, *cp;
 	pid_t	newpid;
@@ -926,7 +936,7 @@
 	if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)
 		return (-1);	/* no point in starting new processes */
 
-	pp = curproc;
+	pp = (flags & GETPROC_KERNEL) ?
&p0 : curproc; cp = kmem_cache_alloc(process_cache, KM_SLEEP); bzero(cp, sizeof (proc_t)); @@ -942,6 +952,7 @@ mutex_init(&cp->p_maplock, NULL, MUTEX_DEFAULT, NULL); cp->p_stat = SIDL; cp->p_mstart = gethrtime(); + cp->p_as = &kas; /* * p_zone must be set before we call pid_allocate since the process * will be visible after that and code such as prfind_zone will @@ -951,7 +962,7 @@ cp->p_t1_lgrpid = LGRP_NONE; cp->p_tr_lgrpid = LGRP_NONE; - if ((newpid = pid_allocate(cp, PID_ALLOC_PROC)) == -1) { + if ((newpid = pid_allocate(cp, pid, PID_ALLOC_PROC)) == -1) { if (nproc == v.v_proc) { CPU_STATS_ADDQ(CPU, sys, procovf, 1); cmn_err(CE_WARN, "out of processes"); @@ -1060,7 +1071,7 @@ * always bound to the default pool. */ mutex_enter(&pp->p_lock); - if (kernel) { + if (flags & GETPROC_KERNEL) { cp->p_pool = pool_default; cp->p_flag |= SSYS; } else { @@ -1074,7 +1085,7 @@ * are always attached to task0. */ mutex_enter(&cp->p_lock); - if (kernel) + if (flags & GETPROC_KERNEL) task_attach(task0p, cp); else task_attach(pp->p_task, cp); @@ -1098,7 +1109,15 @@ */ fcnt_add(P_FINFO(pp), 1); - VN_HOLD(PTOU(pp)->u_cdir); + if (PTOU(pp)->u_cdir) { + VN_HOLD(PTOU(pp)->u_cdir); + } else { + ASSERT(pp == &p0); + /* + * We must be at or before vfs_mountroot(); it will take care of + * assigning our current directory. + */ + } if (PTOU(pp)->u_rdir) VN_HOLD(PTOU(pp)->u_rdir); if (PTOU(pp)->u_cwd)
--- a/usr/src/uts/common/os/lwp.c Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/common/os/lwp.c Mon Nov 23 15:29:44 2009 -0800 @@ -68,6 +68,24 @@ extern void freectx_ctx(struct ctxop *); /* + * Create a kernel thread associated with a particular system process. Give + * it an LWP so that microstate accounting will be available for it. + */ +kthread_t * +lwp_kernel_create(proc_t *p, void (*proc)(), void *arg, int state, pri_t pri) +{ + klwp_t *lwp; + + VERIFY((p->p_flag & SSYS) != 0); + + lwp = lwp_create(proc, arg, 0, p, state, pri, &t0.t_hold, syscid, 0); + + VERIFY(lwp != NULL); + + return (lwptot(lwp)); +} + +/* * Create a thread that appears to be stopped at sys_rtt. */ klwp_t * @@ -84,7 +102,7 @@ int err = 0; kproject_t *oldkpj, *newkpj; void *bufp = NULL; - klwp_t *curlwp = ttolwp(curthread); + klwp_t *curlwp; lwpent_t *lep; lwpdir_t *old_dir = NULL; uint_t old_dirsz = 0; @@ -96,12 +114,16 @@ boolean_t branded = 0; struct ctxop *ctx = NULL; + ASSERT(cid != sysdccid); /* system threads must start in SYS */ + + ASSERT(p != &p0); /* No new LWPs in p0. */ + mutex_enter(&p->p_lock); mutex_enter(&p->p_zone->zone_nlwps_lock); /* * don't enforce rctl limits on system processes */ - if (cid != syscid) { + if (!CLASS_KERNEL(cid)) { if (p->p_task->tk_nlwps >= p->p_task->tk_nlwps_ctl) if (rctl_test(rc_task_lwps, p->p_task->tk_rctls, p, 1, 0) & RCT_DENY) @@ -128,13 +150,26 @@ mutex_exit(&p->p_zone->zone_nlwps_lock); mutex_exit(&p->p_lock); - if (curlwp == NULL || (stksize = curlwp->lwp_childstksz) == 0) + if (CLASS_KERNEL(cid)) { + curlwp = NULL; /* don't inherit from curlwp */ stksize = lwp_default_stksize; + } else { + curlwp = ttolwp(curthread); + if (curlwp == NULL || (stksize = curlwp->lwp_childstksz) == 0) + stksize = lwp_default_stksize; + } /* - * Try to reclaim a <lwp,stack> from 'deathrow' + * For system threads, we sleep for our swap reservation, and the + * thread stack can't be swapped. + * + * Otherwise, try to reclaim a <lwp,stack> from 'deathrow' */ - if (stksize == lwp_default_stksize) { + if (CLASS_KERNEL(cid)) { + lwpdata = (caddr_t)segkp_get(segkp, stksize, + (KPD_NO_ANON | KPD_HASREDZONE | KPD_LOCKED)); + + } else if (stksize == lwp_default_stksize) { if (lwp_reapcnt > 0) { mutex_enter(&reaplock); if ((t = lwp_deathrow) != NULL) { @@ -434,11 +469,15 @@ kpreempt_disable(); /* can't grab cpu_lock here */ /* - * Inherit processor and processor set bindings from curthread, - * unless we're creating a new kernel process, in which case - * clear all bindings. + * Inherit processor and processor set bindings from curthread. + * + * For kernel LWPs, we do not inherit processor set bindings at + * process creation time (i.e. when p != curproc). After the + * kernel process is created, any subsequent LWPs must be created + * by threads in the kernel process, at which point we *will* + * inherit processor set bindings. */ - if (cid == syscid) { + if (CLASS_KERNEL(cid) && p != curproc) { t->t_bind_cpu = binding = PBIND_NONE; t->t_cpupart = oldpart = &cp_default; t->t_bind_pset = PS_NONE; @@ -658,6 +697,13 @@ error: if (err) { + if (CLASS_KERNEL(cid)) { + /* + * This should only happen if a system process runs + * out of lwpids, which shouldn't occur. + */ + panic("Failed to create a system LWP"); + } /* * We have failed to create an lwp, so decrement the number * of lwps in the task and let the lgroup load averages know
--- a/usr/src/uts/common/os/main.c Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/common/os/main.c Mon Nov 23 15:29:44 2009 -0800 @@ -450,6 +450,12 @@ (void) spl0(); interrupts_unleashed = 1; + /* + * Create kmem cache for proc structures + */ + process_cache = kmem_cache_create("process_cache", sizeof (proc_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + vfs_mountroot(); /* Mount the root file system */ errorq_init(); /* after vfs_mountroot() so DDI root is ready */ cpu_kstat_init(CPU); /* after vfs_mountroot() so TOD is valid */ @@ -500,12 +506,6 @@ setupclock(0); /* - * Create kmem cache for proc structures - */ - process_cache = kmem_cache_create("process_cache", sizeof (proc_t), - 0, NULL, NULL, NULL, NULL, NULL, 0); - - /* * Initialize process 0's lwp directory and lwpid hash table. */ p->p_lwpdir = p->p_lwpfree = p0_lwpdir; @@ -576,24 +576,33 @@ /* * Make init process; enter scheduling loop with system process. + * + * Note that we manually assign the pids for these processes, for + * historical reasons. If more pre-assigned pids are needed, + * FAMOUS_PIDS will have to be updated. */ /* create init process */ - if (newproc(start_init, NULL, defaultcid, 59, NULL)) + if (newproc(start_init, NULL, defaultcid, 59, NULL, + FAMOUS_PID_INIT)) panic("main: unable to fork init."); /* create pageout daemon */ - if (newproc(pageout, NULL, syscid, maxclsyspri - 1, NULL)) + if (newproc(pageout, NULL, syscid, maxclsyspri - 1, NULL, + FAMOUS_PID_PAGEOUT)) panic("main: unable to fork pageout()"); /* create fsflush daemon */ - if (newproc(fsflush, NULL, syscid, minclsyspri, NULL)) + if (newproc(fsflush, NULL, syscid, minclsyspri, NULL, + FAMOUS_PID_FSFLUSH)) panic("main: unable to fork fsflush()"); /* create cluster process if we're a member of one */ if (cluster_bootflags & CLUSTER_BOOTED) { - if (newproc(cluster_wrapper, NULL, syscid, minclsyspri, NULL)) + if (newproc(cluster_wrapper, NULL, syscid, minclsyspri, + NULL, 0)) { panic("main: unable to fork cluster()"); + } } /*
--- a/usr/src/uts/common/os/mem_cage.c Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/common/os/mem_cage.c Mon Nov 23 15:29:44 2009 -0800 @@ -1162,9 +1162,8 @@ kcage_cageout_init() { if (kcage_on) { - - (void) thread_create(NULL, 0, kcage_cageout, - NULL, 0, proc_pageout, TS_RUN, maxclsyspri - 1); + (void) lwp_kernel_create(proc_pageout, kcage_cageout, NULL, + TS_RUN, maxclsyspri - 1); } }
--- a/usr/src/uts/common/os/msacct.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/os/msacct.c	Mon Nov 23 15:29:44 2009 -0800
@@ -19,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -244,6 +242,7 @@
 {
 	hrtime_t aggr_time;
 	hrtime_t now;
+	hrtime_t waitrq;
 	hrtime_t state_start;
 	struct mstate *ms;
 	klwp_t *lwp;
@@ -255,6 +254,7 @@
 		return (0);
 
 	mstate = t->t_mstate;
+	waitrq = t->t_waitrq;
 	ms = &lwp->lwp_mstate;
 	state_start = ms->ms_state_start;
 
@@ -267,9 +267,15 @@
 	 * NOTE: gethrtime_unscaled on X86 taken on different CPUs is
 	 * inconsistent, so it is possible that now < state_start.
 	 */
-	if ((mstate == LMS_USER || mstate == LMS_SYSTEM ||
-	    mstate == LMS_TRAP) && (now > state_start)) {
-		aggr_time += now - state_start;
+	if (mstate == LMS_USER || mstate == LMS_SYSTEM || mstate == LMS_TRAP) {
+		/* if waitrq is zero, count all of the time. */
+		if (waitrq == 0) {
+			waitrq = now;
+		}
+
+		if (waitrq > state_start) {
+			aggr_time += waitrq - state_start;
+		}
 	}
 
 	scalehrtime(&aggr_time);
@@ -277,6 +283,65 @@
 }
 
 /*
+ * Return the amount of onproc and runnable time this thread has experienced.
+ *
+ * Because the fields we read are not protected by locks when updated
+ * by the thread itself, this is an inherently racy interface.  In
+ * particular, the ASSERT(THREAD_LOCK_HELD(t)) doesn't guarantee as much
+ * as it might appear to.
+ *
+ * The implication for users of this interface is that onproc and runnable
+ * are *NOT* monotonically increasing; they may temporarily be larger than
+ * they should be.
+ */
+void
+mstate_systhread_times(kthread_t *t, hrtime_t *onproc, hrtime_t *runnable)
+{
+	struct mstate *const ms = &ttolwp(t)->lwp_mstate;
+
+	int mstate;
+	hrtime_t now;
+	hrtime_t state_start;
+	hrtime_t waitrq;
+	hrtime_t aggr_onp;
+	hrtime_t aggr_run;
+
+	ASSERT(THREAD_LOCK_HELD(t));
+	ASSERT(t->t_procp->p_flag & SSYS);
+	ASSERT(ttolwp(t) != NULL);
+
+	/* shouldn't be any non-SYSTEM on-CPU time */
+	ASSERT(ms->ms_acct[LMS_USER] == 0);
+	ASSERT(ms->ms_acct[LMS_TRAP] == 0);
+
+	mstate = t->t_mstate;
+	waitrq = t->t_waitrq;
+	state_start = ms->ms_state_start;
+
+	aggr_onp = ms->ms_acct[LMS_SYSTEM];
+	aggr_run = ms->ms_acct[LMS_WAIT_CPU];
+
+	now = gethrtime_unscaled();
+
+	/* if waitrq == 0, then there is no time to account to TS_RUN */
+	if (waitrq == 0)
+		waitrq = now;
+
+	/* If there is system time to accumulate, do so */
+	if (mstate == LMS_SYSTEM && state_start < waitrq)
+		aggr_onp += waitrq - state_start;
+
+	if (waitrq < now)
+		aggr_run += now - waitrq;
+
+	scalehrtime(&aggr_onp);
+	scalehrtime(&aggr_run);
+
+	*onproc = aggr_onp;
+	*runnable = aggr_run;
+}
+
+/*
  * Return an aggregation of microstate times in scaled nanoseconds (high-res
  * time). This keeps in mind that p_acct is already scaled, and ms_acct is
  * not.
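A sketch of how a caller such as the SDC class might turn these two values into an observed duty cycle; the percentage math is illustrative and not part of this changeset:

	hrtime_t onproc, runnable;
	uint_t dc = 0;			/* observed duty cycle, in percent */

	thread_lock(t);
	mstate_systhread_times(t, &onproc, &runnable);
	thread_unlock(t);

	/* both values may be transiently high; guard the denominator */
	if (onproc + runnable > 0)
		dc = (uint_t)((onproc * 100) / (onproc + runnable));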
--- a/usr/src/uts/common/os/pid.c Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/common/os/pid.c Mon Nov 23 15:29:44 2009 -0800 @@ -20,16 +20,13 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ - -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/param.h> #include <sys/sysmacros.h> @@ -94,7 +91,7 @@ static kmutex_t pidlinklock; static struct pid **pidhash; static pid_t minpid; -static pid_t mpid; +static pid_t mpid = FAMOUS_PIDS; /* one more than the last famous pid */ static union procent *procdir; static union procent *procentfree; @@ -132,7 +129,7 @@ if (jump_pid && jump_pid > mpid) minpid = mpid = jump_pid; else - minpid = mpid + 1; + minpid = mpid; } /* @@ -171,7 +168,7 @@ * pid_allocate() returns the new pid on success, -1 on failure. */ pid_t -pid_allocate(proc_t *prp, int flags) +pid_allocate(proc_t *prp, pid_t pid, int flags) { struct pid *pidp; union procent *pep; @@ -187,17 +184,31 @@ goto failed; } - /* - * Allocate a pid - */ - startpid = mpid; - do { - newpid = (++mpid == maxpid ? mpid = minpid : mpid); - } while (pid_lookup(newpid) && newpid != startpid); + if (pid != 0) { + VERIFY(minpid == 0); + VERIFY3P(pid, <, mpid); + VERIFY3P(pid_lookup(pid), ==, NULL); + newpid = pid; + } else { + /* + * Allocate a pid + */ + ASSERT(minpid <= mpid && mpid <= maxpid); - if (newpid == startpid && pid_lookup(newpid)) { - /* couldn't find a free pid */ - goto failed; + startpid = mpid; + for (;;) { + newpid = mpid; + if (mpid >= maxpid) + mpid = minpid; + else + mpid++; + + if (pid_lookup(newpid) == NULL) + break; + + if (mpid == startpid) + goto failed; + } } /*
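For reference, the FAMOUS_PID_* constants used by main.c above come from sys/proc.h, which is not part of this diff; assuming the conventional layout, they amount to:

	/* assumed sys/proc.h definitions, not shown in this changeset */
	#define	FAMOUS_PID_SCHED	0	/* p0 */
	#define	FAMOUS_PID_INIT		1
	#define	FAMOUS_PID_PAGEOUT	2
	#define	FAMOUS_PID_FSFLUSH	3
	#define	FAMOUS_PIDS		4	/* one past the last famous pid */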
--- a/usr/src/uts/common/os/sig.c Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/common/os/sig.c Mon Nov 23 15:29:44 2009 -0800 @@ -217,7 +217,8 @@ ASSERT(MUTEX_HELD(&p->p_lock)); - if (sig <= 0 || sig >= NSIG) + /* System processes don't get signals */ + if (sig <= 0 || sig >= NSIG || (p->p_flag & SSYS)) return; /*
--- a/usr/src/uts/common/os/taskq.c Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/common/os/taskq.c Mon Nov 23 15:29:44 2009 -0800 @@ -55,7 +55,7 @@ * the same list managed by the same thread. * * (3) Some tasks may block for a long time, and this should not block other - * tasks in the queue. + * tasks in the queue. * * To provide useful service in such cases we define a "dynamic task queue" * which has an individual thread for each of the tasks. These threads are @@ -74,7 +74,7 @@ * * INTERFACES ================================================================== * - * taskq_t *taskq_create(name, nthreads, pri_t pri, minalloc, maxall, flags); + * taskq_t *taskq_create(name, nthreads, pri, minalloc, maxall, flags); * * Create a taskq with specified properties. * Possible 'flags': @@ -123,6 +123,25 @@ * The 'pri' field specifies the default priority for the threads that * service all scheduled tasks. * + * taskq_t *taskq_create_instance(name, instance, nthreads, pri, minalloc, + * maxall, flags); + * + * Like taskq_create(), but takes an instance number (or -1 to indicate + * no instance). + * + * taskq_t *taskq_create_proc(name, nthreads, pri, minalloc, maxall, proc, + * flags); + * + * Like taskq_create(), but creates the taskq threads in the specified + * system process. If proc != &p0, this must be called from a thread + * in that process. + * + * taskq_t *taskq_create_sysdc(name, nthreads, minalloc, maxall, proc, + * dc, flags); + * + * Like taskq_create_proc(), but the taskq threads will use the + * System Duty Cycle (SDC) scheduling class with a duty cycle of dc. + * * void taskq_destroy(tap): * * Waits for any scheduled tasks to complete, then destroys the taskq. @@ -147,7 +166,7 @@ * * TQ_NOQUEUE: Do not enqueue a task if it can't dispatch it due to * lack of available resources and fail. If this flag is not - * set, and the task pool is exhausted, the task may be scheduled + * set, and the task pool is exhausted, the task may be scheduled * in the backing queue. This flag may ONLY be used with dynamic * task queues. * @@ -156,9 +175,11 @@ * Enqueueing dependent tasks may create deadlocks. * * TQ_SLEEP: May block waiting for resources. May still fail for - * dynamic task queues if TQ_NOQUEUE is also specified, otherwise + * dynamic task queues if TQ_NOQUEUE is also specified, otherwise * always succeed. * + * TQ_FRONT: Puts the new task at the front of the queue. Be careful. + * * NOTE: Dynamic task queues are much more likely to fail in * taskq_dispatch() (especially if TQ_NOQUEUE was specified), so it * is important to have backup strategies handling such failures. @@ -234,7 +255,7 @@ * +-------------+ | * | DYNAMIC TASK QUEUES: * | - * +-> taskq_bucket[nCPU] taskq_bucket_dispatch() + * +-> taskq_bucket[nCPU] taskq_bucket_dispatch() * +-------------------+ ^ * +--->| tqbucket_lock | | * | +-------------------+ +--------+ +--------+ @@ -249,7 +270,7 @@ * | +-------------------+<--+--------+<--...+--------+ * | | ... | | thread | | thread | * | +-------------------+ +--------+ +--------+ - * +---> ... + * +---> ... * * * Task queues use tq_task field to link new entry in the queue. The queue is a @@ -283,8 +304,8 @@ * * During creation, tq_nthreads and tq_active are set to 0, and * tq_nthreads_target is set to the number of threads desired. The - * TASKQ_CHANGING flag is set, and taskq_create_thread() is called to - * create the first thread. 
taskq_create_thread() increments tq_active, + * TASKQ_CHANGING flag is set, and taskq_thread_create() is called to + * create the first thread. taskq_thread_create() increments tq_active, * sets TASKQ_THREAD_CREATED, and creates the new thread. * * Each thread starts in taskq_thread(), clears the TASKQ_THREAD_CREATED @@ -451,13 +472,16 @@ #include <sys/kmem.h> #include <sys/vmem.h> #include <sys/callb.h> +#include <sys/class.h> #include <sys/systm.h> #include <sys/cmn_err.h> #include <sys/debug.h> #include <sys/vmsystm.h> /* For throttlefree */ #include <sys/sysmacros.h> #include <sys/cpuvar.h> +#include <sys/cpupart.h> #include <sys/sdt.h> +#include <sys/sysdc.h> #include <sys/note.h> static kmem_cache_t *taskq_ent_cache, *taskq_cache; @@ -472,7 +496,7 @@ /* * Maximum number of entries in global system taskq is - * system_taskq_size * max_ncpus + * system_taskq_size * max_ncpus */ #define SYSTEM_TASKQ_SIZE 64 int system_taskq_size = SYSTEM_TASKQ_SIZE; @@ -483,6 +507,14 @@ */ int taskq_minimum_nthreads_max = 1; +/* + * We want to ensure that when taskq_create() returns, there is at least + * one thread ready to handle requests. To guarantee this, we have to wait + * for the second thread, since the first one cannot process requests until + * the second thread has been created. + */ +#define TASKQ_CREATE_ACTIVE_THREADS 2 + /* Maximum percentage allowed for TASKQ_THREADS_CPU_PCT */ #define TASKQ_CPUPCT_MAX_PERCENT 1000 int taskq_cpupct_max_percent = TASKQ_CPUPCT_MAX_PERCENT; @@ -522,7 +554,7 @@ * Static functions. */ static taskq_t *taskq_create_common(const char *, int, int, pri_t, int, - int, uint_t); + int, proc_t *, uint_t, uint_t); static void taskq_thread(void *); static void taskq_d_thread(taskq_ent_t *); static void taskq_bucket_extend(void *); @@ -539,6 +571,7 @@ * Task queues kstats. */ struct taskq_kstat { + kstat_named_t tq_pid; kstat_named_t tq_tasks; kstat_named_t tq_executed; kstat_named_t tq_maxtasks; @@ -548,6 +581,7 @@ kstat_named_t tq_pri; kstat_named_t tq_nthreads; } taskq_kstat = { + { "pid", KSTAT_DATA_UINT64 }, { "tasks", KSTAT_DATA_UINT64 }, { "executed", KSTAT_DATA_UINT64 }, { "maxtasks", KSTAT_DATA_UINT64 }, @@ -604,16 +638,9 @@ static int taskq_d_kstat_update(kstat_t *, int); /* - * State for THREAD_CPU_PCT management + * List of all TASKQ_THREADS_CPU_PCT taskqs. */ -typedef struct taskq_cpupct_ent { - list_node_t tp_link; - taskq_t *tp_taskq; -} taskq_cpupct_ent_t; - -static kmutex_t taskq_cpupct_lock; -static list_t taskq_cpupct_list; -static int taskq_cpupct_ncpus_online; +static list_t taskq_cpupct_list; /* protected by cpu_lock */ /* * Collect per-bucket statistic when TASKQ_STATISTIC is defined. @@ -678,22 +705,42 @@ tqe->tqent_next->tqent_prev = tqe; \ tqe->tqent_prev->tqent_next = tqe; \ } +/* + * Prepend 'tqe' to the beginning of l + */ +#define TQ_PREPEND(l, tqe) { \ + tqe->tqent_next = l.tqent_next; \ + tqe->tqent_prev = &l; \ + tqe->tqent_next->tqent_prev = tqe; \ + tqe->tqent_prev->tqent_next = tqe; \ +} /* * Schedule a task specified by func and arg into the task queue entry tqe. 
*/ -#define TQ_ENQUEUE(tq, tqe, func, arg) { \ - ASSERT(MUTEX_HELD(&tq->tq_lock)); \ - TQ_APPEND(tq->tq_task, tqe); \ - tqe->tqent_func = (func); \ - tqe->tqent_arg = (arg); \ - tq->tq_tasks++; \ - if (tq->tq_tasks - tq->tq_executed > tq->tq_maxtasks) \ +#define TQ_DO_ENQUEUE(tq, tqe, func, arg, front) { \ + ASSERT(MUTEX_HELD(&tq->tq_lock)); \ + _NOTE(CONSTCOND) \ + if (front) { \ + TQ_PREPEND(tq->tq_task, tqe); \ + } else { \ + TQ_APPEND(tq->tq_task, tqe); \ + } \ + tqe->tqent_func = (func); \ + tqe->tqent_arg = (arg); \ + tq->tq_tasks++; \ + if (tq->tq_tasks - tq->tq_executed > tq->tq_maxtasks) \ tq->tq_maxtasks = tq->tq_tasks - tq->tq_executed; \ - cv_signal(&tq->tq_dispatch_cv); \ + cv_signal(&tq->tq_dispatch_cv); \ DTRACE_PROBE2(taskq__enqueue, taskq_t *, tq, taskq_ent_t *, tqe); \ } +#define TQ_ENQUEUE(tq, tqe, func, arg) \ + TQ_DO_ENQUEUE(tq, tqe, func, arg, 0) + +#define TQ_ENQUEUE_FRONT(tq, tqe, func, arg) \ + TQ_DO_ENQUEUE(tq, tqe, func, arg, 1) + /* * Do-nothing task which may be used to prepopulate thread caches. */ @@ -703,7 +750,6 @@ { } - /*ARGSUSED*/ static int taskq_constructor(void *buf, void *cdrarg, int kmflags) @@ -776,51 +822,97 @@ (void *)1, INT32_MAX, 1, NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER); - list_create(&taskq_cpupct_list, sizeof (taskq_cpupct_ent_t), - offsetof(taskq_cpupct_ent_t, tp_link)); + list_create(&taskq_cpupct_list, sizeof (taskq_t), + offsetof(taskq_t, tq_cpupct_link)); +} + +static void +taskq_update_nthreads(taskq_t *tq, uint_t ncpus) +{ + uint_t newtarget = TASKQ_THREADS_PCT(ncpus, tq->tq_threads_ncpus_pct); + + ASSERT(MUTEX_HELD(&cpu_lock)); + ASSERT(MUTEX_HELD(&tq->tq_lock)); + + /* We must be going from non-zero to non-zero; no exiting. */ + ASSERT3U(tq->tq_nthreads_target, !=, 0); + ASSERT3U(newtarget, !=, 0); + + ASSERT3U(newtarget, <=, tq->tq_nthreads_max); + if (newtarget != tq->tq_nthreads_target) { + tq->tq_flags |= TASKQ_CHANGING; + tq->tq_nthreads_target = newtarget; + cv_broadcast(&tq->tq_dispatch_cv); + cv_broadcast(&tq->tq_exit_cv); + } +} + +/* called during task queue creation */ +static void +taskq_cpupct_install(taskq_t *tq, cpupart_t *cpup) +{ + ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT); + + mutex_enter(&cpu_lock); + mutex_enter(&tq->tq_lock); + tq->tq_cpupart = cpup->cp_id; + taskq_update_nthreads(tq, cpup->cp_ncpus); + mutex_exit(&tq->tq_lock); + + list_insert_tail(&taskq_cpupct_list, tq); + mutex_exit(&cpu_lock); +} + +static void +taskq_cpupct_remove(taskq_t *tq) +{ + ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT); + + mutex_enter(&cpu_lock); + list_remove(&taskq_cpupct_list, tq); + mutex_exit(&cpu_lock); } /*ARGSUSED*/ static int taskq_cpu_setup(cpu_setup_t what, int id, void *arg) { - taskq_cpupct_ent_t *tpp; - int cpus_online = ncpus_online; + taskq_t *tq; + cpupart_t *cp = cpu[id]->cpu_part; + uint_t ncpus = cp->cp_ncpus; + + ASSERT(MUTEX_HELD(&cpu_lock)); + ASSERT(ncpus > 0); - /* offlines are called *before* the cpu is offlined. */ - if (what == CPU_OFF) - cpus_online--; - if (cpus_online < 1) - cpus_online = 1; + switch (what) { + case CPU_OFF: + case CPU_CPUPART_OUT: + /* offlines are called *before* the cpu is offlined. 
*/ + if (ncpus > 1) + ncpus--; + break; - mutex_enter(&taskq_cpupct_lock); - if (cpus_online == taskq_cpupct_ncpus_online) { - mutex_exit(&taskq_cpupct_lock); - return (0); + case CPU_ON: + case CPU_CPUPART_IN: + break; + + default: + return (0); /* doesn't affect cpu count */ } - for (tpp = list_head(&taskq_cpupct_list); tpp != NULL; - tpp = list_next(&taskq_cpupct_list, tpp)) { - taskq_t *tq = tpp->tp_taskq; - int newtarget; + for (tq = list_head(&taskq_cpupct_list); tq != NULL; + tq = list_next(&taskq_cpupct_list, tq)) { mutex_enter(&tq->tq_lock); - newtarget = - TASKQ_THREADS_PCT(cpus_online, tq->tq_threads_ncpus_pct); - ASSERT3S(newtarget, <=, tq->tq_nthreads_max); - if (newtarget != tq->tq_nthreads_target) { - /* The taskq must not be exiting */ - ASSERT3S(tq->tq_nthreads_target, !=, 0); - tq->tq_flags |= TASKQ_CHANGING; - tq->tq_nthreads_target = newtarget; - cv_broadcast(&tq->tq_dispatch_cv); - cv_broadcast(&tq->tq_exit_cv); + /* + * If the taskq is part of the cpuset which is changing, + * update its nthreads_target. + */ + if (tq->tq_cpupart == cp->cp_id) { + taskq_update_nthreads(tq, ncpus); } mutex_exit(&tq->tq_lock); } - - taskq_cpupct_ncpus_online = cpus_online; - mutex_exit(&taskq_cpupct_lock); return (0); } @@ -829,7 +921,11 @@ { mutex_enter(&cpu_lock); register_cpu_setup_func(taskq_cpu_setup, NULL); - (void) taskq_cpu_setup(CPU_ON, 0, NULL); + /* + * Make sure we're up to date. At this point in boot, there is only + * one processor set, so we only have to update the current CPU. + */ + (void) taskq_cpu_setup(CPU_ON, CPU->cpu_id, NULL); mutex_exit(&cpu_lock); } @@ -840,7 +936,7 @@ system_taskq_init(void) { system_taskq = taskq_create_common("system_taskq", 0, - system_taskq_size * max_ncpus, minclsyspri, 4, 512, + system_taskq_size * max_ncpus, minclsyspri, 4, 512, &p0, 0, TASKQ_DYNAMIC | TASKQ_PREPOPULATE); } @@ -1007,7 +1103,11 @@ mutex_exit(&tq->tq_lock); return (NULL); } - TQ_ENQUEUE(tq, tqe, func, arg); + if (flags & TQ_FRONT) { + TQ_ENQUEUE_FRONT(tq, tqe, func, arg); + } else { + TQ_ENQUEUE(tq, tqe, func, arg); + } mutex_exit(&tq->tq_lock); return ((taskqid_t)tqe); } @@ -1015,7 +1115,7 @@ /* * Dynamic taskq dispatching. */ - ASSERT(!(flags & TQ_NOALLOC)); + ASSERT(!(flags & (TQ_NOALLOC | TQ_FRONT))); TASKQ_D_RANDOM_DISPATCH_FAILURE(tq, flags); bsize = tq->tq_nbuckets; @@ -1105,8 +1205,7 @@ */ mutex_enter(&tq->tq_lock); if ((tqe1 = taskq_ent_alloc(tq, TQ_NOSLEEP)) != NULL) { - TQ_ENQUEUE(tq, tqe1, taskq_bucket_extend, - bucket); + TQ_ENQUEUE(tq, tqe1, taskq_bucket_extend, bucket); } else { TQ_STAT(bucket, tqs_nomem); } @@ -1223,19 +1322,58 @@ return (thread->t_taskq == tq); } +/* + * Creates a thread in the taskq. We only allow one outstanding create at + * a time. We drop and reacquire the tq_lock in order to avoid blocking other + * taskq activity while thread_create() or lwp_kernel_create() run. + * + * The first time we're called, we do some additional setup, and do not + * return until there are enough threads to start servicing requests. 
+ */ static void taskq_thread_create(taskq_t *tq) { - kthread_t *t; + kthread_t *t; + const boolean_t first = (tq->tq_nthreads == 0); ASSERT(MUTEX_HELD(&tq->tq_lock)); + ASSERT(tq->tq_flags & TASKQ_CHANGING); + ASSERT(tq->tq_nthreads < tq->tq_nthreads_target); ASSERT(!(tq->tq_flags & TASKQ_THREAD_CREATED)); + tq->tq_flags |= TASKQ_THREAD_CREATED; tq->tq_active++; - t = thread_create(NULL, 0, taskq_thread, tq, 0, &p0, TS_RUN, - tq->tq_pri); - t->t_taskq = tq; + mutex_exit(&tq->tq_lock); + + if (tq->tq_proc != &p0) { + t = lwp_kernel_create(tq->tq_proc, taskq_thread, tq, TS_RUN, + tq->tq_pri); + } else { + t = thread_create(NULL, 0, taskq_thread, tq, 0, &p0, TS_RUN, + tq->tq_pri); + } + + if (!first) { + mutex_enter(&tq->tq_lock); + return; + } + + /* + * We know the thread cannot go away, since tq cannot be + * destroyed until creation has completed. We can therefore + * safely dereference t. + */ + if (tq->tq_flags & TASKQ_THREADS_CPU_PCT) { + taskq_cpupct_install(tq, t->t_cpupart); + } + mutex_enter(&tq->tq_lock); + + /* Wait until we can service requests. */ + while (tq->tq_nthreads != tq->tq_nthreads_target && + tq->tq_nthreads < TASKQ_CREATE_ACTIVE_THREADS) { + cv_wait(&tq->tq_wait_cv, &tq->tq_lock); + } } /* @@ -1276,6 +1414,13 @@ callb_cpr_t cprinfo; hrtime_t start, end; + curthread->t_taskq = tq; /* mark ourselves for taskq_member() */ + + if (curproc != &p0 && (tq->tq_flags & TASKQ_DUTY_CYCLE)) { + sysdc_thread_enter(curthread, tq->tq_DC, + (tq->tq_flags & TASKQ_DC_BATCH) ? SYSDC_THREAD_BATCH : 0); + } + if (tq->tq_flags & TASKQ_CPR_SAFE) { CALLB_CPR_INIT_SAFE(curthread, tq->tq_name); } else { @@ -1285,6 +1430,7 @@ mutex_enter(&tq->tq_lock); thread_id = ++tq->tq_nthreads; ASSERT(tq->tq_flags & TASKQ_THREAD_CREATED); + ASSERT(tq->tq_flags & TASKQ_CHANGING); tq->tq_flags &= ~TASKQ_THREAD_CREATED; VERIFY3S(thread_id, <=, tq->tq_nthreads_max); @@ -1294,20 +1440,13 @@ else tq->tq_threadlist[thread_id - 1] = curthread; + /* Allow taskq_create_common()'s taskq_thread_create() to return. */ + if (tq->tq_nthreads == TASKQ_CREATE_ACTIVE_THREADS) + cv_broadcast(&tq->tq_wait_cv); + for (;;) { if (tq->tq_flags & TASKQ_CHANGING) { - /* we're done; clear the CHANGING flag */ - if (tq->tq_nthreads == tq->tq_nthreads_target) { - tq->tq_flags &= ~TASKQ_CHANGING; - continue; - } - /* We're low on threads and none have been created */ - if (tq->tq_nthreads < tq->tq_nthreads_target && - !(tq->tq_flags & TASKQ_THREAD_CREATED)) { - taskq_thread_create(tq); - continue; - } - /* We're no longer needed */ + /* See if we're no longer needed */ if (thread_id > tq->tq_nthreads_target) { /* * To preserve the one-to-one mapping between @@ -1329,6 +1468,23 @@ &tq->tq_exit_cv, &cprinfo, -1); continue; } + + /* + * If no thread is starting taskq_thread(), we can + * do some bookkeeping. 
+ */ + if (!(tq->tq_flags & TASKQ_THREAD_CREATED)) { + /* Check if we've reached our target */ + if (tq->tq_nthreads == tq->tq_nthreads_target) { + tq->tq_flags &= ~TASKQ_CHANGING; + cv_broadcast(&tq->tq_wait_cv); + } + /* Check if we need to create a thread */ + if (tq->tq_nthreads < tq->tq_nthreads_target) { + taskq_thread_create(tq); + continue; /* tq_lock was dropped */ + } + } } if ((tqe = tq->tq_task.tqent_next) == &tq->tq_task) { if (--tq->tq_active == 0) @@ -1338,6 +1494,7 @@ tq->tq_active++; continue; } + tqe->tqent_prev->tqent_next = tqe->tqent_next; tqe->tqent_next->tqent_prev = tqe->tqent_prev; mutex_exit(&tq->tq_lock); @@ -1364,19 +1521,30 @@ else tq->tq_threadlist[thread_id - 1] = NULL; - ASSERT(tq->tq_nthreads > 0); - if (--tq->tq_nthreads == 0) - cv_broadcast(&tq->tq_wait_cv); - - /* let the other threads which need to exit know we're done */ - cv_broadcast(&tq->tq_exit_cv); - /* We're exiting, and therefore no longer active */ + ASSERT(tq->tq_active > 0); tq->tq_active--; + ASSERT(tq->tq_nthreads > 0); + tq->tq_nthreads--; + + /* Wake up anyone waiting for us to exit */ + cv_broadcast(&tq->tq_exit_cv); + if (tq->tq_nthreads == tq->tq_nthreads_target) { + if (!(tq->tq_flags & TASKQ_THREAD_CREATED)) + tq->tq_flags &= ~TASKQ_CHANGING; + + cv_broadcast(&tq->tq_wait_cv); + } + ASSERT(!(tq->tq_flags & TASKQ_CPR_SAFE)); - CALLB_CPR_EXIT(&cprinfo); - thread_exit(); + CALLB_CPR_EXIT(&cprinfo); /* drops tq->tq_lock */ + if (curthread->t_lwp != NULL) { + mutex_enter(&curproc->p_lock); + lwp_exit(); + } else { + thread_exit(); + } } /* @@ -1522,8 +1690,10 @@ taskq_create(const char *name, int nthreads, pri_t pri, int minalloc, int maxalloc, uint_t flags) { - return taskq_create_common(name, 0, nthreads, pri, minalloc, - maxalloc, flags | TASKQ_NOINSTANCE); + ASSERT((flags & ~TASKQ_INTERFACE_FLAGS) == 0); + + return (taskq_create_common(name, 0, nthreads, pri, minalloc, + maxalloc, &p0, 0, flags | TASKQ_NOINSTANCE)); } /* @@ -1539,6 +1709,7 @@ taskq_create_instance(const char *name, int instance, int nthreads, pri_t pri, int minalloc, int maxalloc, uint_t flags) { + ASSERT((flags & ~TASKQ_INTERFACE_FLAGS) == 0); ASSERT((instance >= 0) || (instance == -1)); if (instance < 0) { @@ -1546,12 +1717,36 @@ } return (taskq_create_common(name, instance, nthreads, - pri, minalloc, maxalloc, flags)); + pri, minalloc, maxalloc, &p0, 0, flags)); +} + +taskq_t * +taskq_create_proc(const char *name, int nthreads, pri_t pri, int minalloc, + int maxalloc, proc_t *proc, uint_t flags) +{ + ASSERT((flags & ~TASKQ_INTERFACE_FLAGS) == 0); + ASSERT(proc->p_flag & SSYS); + + return (taskq_create_common(name, 0, nthreads, pri, minalloc, + maxalloc, proc, 0, flags | TASKQ_NOINSTANCE)); } +taskq_t * +taskq_create_sysdc(const char *name, int nthreads, int minalloc, + int maxalloc, proc_t *proc, uint_t dc, uint_t flags) +{ + ASSERT((flags & ~TASKQ_INTERFACE_FLAGS) == 0); + ASSERT(proc->p_flag & SSYS); + + return (taskq_create_common(name, 0, nthreads, minclsyspri, minalloc, + maxalloc, proc, dc, flags | TASKQ_NOINSTANCE | TASKQ_DUTY_CYCLE)); +} + +#define IMPLY(a, b) ASSERT((!(a)) || (b)) /* if (a) { ASSERT (b) } */ + static taskq_t * taskq_create_common(const char *name, int instance, int nthreads, pri_t pri, - int minalloc, int maxalloc, uint_t flags) + int minalloc, int maxalloc, proc_t *proc, uint_t dc, uint_t flags) { taskq_t *tq = kmem_cache_alloc(taskq_cache, KM_SLEEP); uint_t ncpus = ((boot_max_ncpus == -1) ? 
max_ncpus : boot_max_ncpus); @@ -1559,14 +1754,20 @@ int max_nthreads; /* - * TASKQ_DYNAMIC is incompatible with TASKQ_CPR_SAFE and - * TASKQ_THREADS_CPU_PCT. + * TASKQ_DYNAMIC, TASKQ_CPR_SAFE and TASKQ_THREADS_CPU_PCT are all + * mutually incompatible. */ - ASSERT(!(flags & TASKQ_DYNAMIC) || - !(flags & (TASKQ_CPR_SAFE | TASKQ_THREADS_CPU_PCT))); - /* TASKQ_CPR_SAFE is incompatible with TASKQ_THREADS_CPU_PCT */ + IMPLY((flags & TASKQ_DYNAMIC), !(flags & TASKQ_CPR_SAFE)); + IMPLY((flags & TASKQ_DYNAMIC), !(flags & TASKQ_THREADS_CPU_PCT)); + IMPLY((flags & TASKQ_CPR_SAFE), !(flags & TASKQ_THREADS_CPU_PCT)); - ASSERT(!(flags & TASKQ_CPR_SAFE) || !(flags & TASKQ_THREADS_CPU_PCT)); + /* Cannot have DUTY_CYCLE without a non-p0 kernel process */ + IMPLY((flags & TASKQ_DUTY_CYCLE), proc != &p0); + + /* Cannot have DC_BATCH without DUTY_CYCLE */ + ASSERT((flags & (TASKQ_DUTY_CYCLE|TASKQ_DC_BATCH)) != TASKQ_DC_BATCH); + + ASSERT(proc != NULL); bsize = 1 << (highbit(ncpus) - 1); ASSERT(bsize >= 1); @@ -1579,10 +1780,7 @@ /* For dynamic task queues use just one backup thread */ nthreads = max_nthreads = 1; - } else if (!(flags & TASKQ_THREADS_CPU_PCT)) { - ASSERT3S(nthreads, >=, 1); - max_nthreads = nthreads; - } else { + } else if (flags & TASKQ_THREADS_CPU_PCT) { uint_t pct; ASSERT3S(nthreads, >=, 0); pct = nthreads; @@ -1590,9 +1788,21 @@ if (pct > taskq_cpupct_max_percent) pct = taskq_cpupct_max_percent; + /* + * If you're using THREADS_CPU_PCT, the process for the + * taskq threads must be curproc. This allows any pset + * binding to be inherited correctly. If proc is &p0, + * we won't be creating LWPs, so new threads will be assigned + * to the default processor set. + */ + ASSERT(curproc == proc || proc == &p0); tq->tq_threads_ncpus_pct = pct; - nthreads = TASKQ_THREADS_PCT(ncpus_online, pct); + nthreads = 1; /* corrected in taskq_thread_create() */ max_nthreads = TASKQ_THREADS_PCT(max_ncpus, pct); + + } else { + ASSERT3S(nthreads, >=, 1); + max_nthreads = nthreads; } if (max_nthreads < taskq_minimum_nthreads_max) @@ -1613,34 +1823,26 @@ tq->tq_minalloc = minalloc; tq->tq_maxalloc = maxalloc; tq->tq_nbuckets = bsize; + tq->tq_proc = proc; tq->tq_pri = pri; + tq->tq_DC = dc; + list_link_init(&tq->tq_cpupct_link); if (max_nthreads > 1) tq->tq_threadlist = kmem_alloc( sizeof (kthread_t *) * max_nthreads, KM_SLEEP); - /* Add the taskq to the list of CPU_PCT taskqs */ - if (flags & TASKQ_THREADS_CPU_PCT) { - taskq_cpupct_ent_t *tpp = kmem_zalloc(sizeof (*tpp), KM_SLEEP); - - list_link_init(&tpp->tp_link); - tpp->tp_taskq = tq; - - mutex_enter(&taskq_cpupct_lock); - list_insert_tail(&taskq_cpupct_list, tpp); - /* reset our target, to avoid race conditions */ - tq->tq_nthreads_target = TASKQ_THREADS_PCT(ncpus_online, - tq->tq_threads_ncpus_pct); - mutex_exit(&taskq_cpupct_lock); - } - mutex_enter(&tq->tq_lock); if (flags & TASKQ_PREPOPULATE) { while (minalloc-- > 0) taskq_ent_free(tq, taskq_ent_alloc(tq, TQ_SLEEP)); } - /* create the first thread; if more are needed, it'll create them */ + /* + * Create the first thread, which will create any other threads + * necessary. taskq_thread_create will not return until we have + * enough threads to be able to process requests. + */ taskq_thread_create(tq); mutex_exit(&tq->tq_lock); @@ -1669,7 +1871,7 @@ * Install kstats. * We have two cases: * 1) Instance is provided to taskq_create_instance(). In this case it - * should be >= 0 and we use it. + * should be >= 0 and we use it. 
* * 2) Instance is not provided and is automatically generated */ @@ -1740,20 +1942,7 @@ * Unregister from the cpupct list. */ if (tq->tq_flags & TASKQ_THREADS_CPU_PCT) { - taskq_cpupct_ent_t *tpp; - - mutex_enter(&taskq_cpupct_lock); - for (tpp = list_head(&taskq_cpupct_list); tpp != NULL; - tpp = list_next(&taskq_cpupct_list, tpp)) { - if (tpp->tp_taskq == tq) - break; - } - ASSERT3P(tpp, !=, NULL); - - list_remove(&taskq_cpupct_list, tpp); - mutex_exit(&taskq_cpupct_lock); - - kmem_free(tpp, sizeof (*tpp)); + taskq_cpupct_remove(tq); } /* @@ -1926,6 +2115,7 @@ if (rw == KSTAT_WRITE) return (EACCES); + tqsp->tq_pid.value.ui64 = tq->tq_proc->p_pid; tqsp->tq_tasks.value.ui64 = tq->tq_tasks; tqsp->tq_executed.value.ui64 = tq->tq_executed; tqsp->tq_maxtasks.value.ui64 = tq->tq_maxtasks;
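
The new creation path is easiest to see from the caller's side. A minimal sketch, assuming a kernel (SSYS) process "sysproc" already exists; the queue name, thread counts, and the 80% duty cycle are illustrative, not taken from the changeset:

/*
 * Sketch only: create a taskq whose threads belong to "sysproc" and are
 * managed by SDC with an 80% target duty cycle.  taskq_create_sysdc()
 * supplies minclsyspri itself and ORs in TASKQ_NOINSTANCE |
 * TASKQ_DUTY_CYCLE; per the IMPLY() assertions above, sysproc must be
 * an SSYS process other than &p0, and the caller may only pass
 * interface flags (bits 0-15).
 */
taskq_t *tq = taskq_create_sysdc("my_duty_tq", 8,	/* 8 threads */
    4, INT_MAX,			/* minalloc, maxalloc */
    sysproc,			/* kernel process, not &p0 */
    80,				/* target duty cycle, percent */
    TASKQ_PREPOPULATE);		/* interface flags only */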
--- a/usr/src/uts/common/os/vm_pageout.c Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/common/os/vm_pageout.c Mon Nov 23 15:29:44 2009 -0800 @@ -683,7 +683,10 @@ push_req[i].a_next = &push_req[i + 1]; pageout_pri = curthread->t_pri; - pageout_init(pageout_scanner, proc_pageout, pageout_pri - 1); + + /* Create the pageout scanner thread. */ + (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN, + pageout_pri - 1); /* * kick off pageout scheduler.
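
This replaces the pageout_init() shim (deleted from the platform vm code later in this changeset) with a direct lwp_kernel_create() call. The difference in caller shape, as a sketch built from the old shim body and the new call above:

/* before: a bare kernel thread in proc_pageout, with no LWP */
(void) thread_create(NULL, 0, pageout_scanner, NULL, 0, proc_pageout,
    TS_RUN, pageout_pri - 1);

/*
 * after: a full kernel LWP inside proc_pageout, so the scanner is
 * visible through /proc and to microstate accounting
 */
(void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
    pageout_pri - 1);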
--- a/usr/src/uts/common/os/zone.c Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/common/os/zone.c Mon Nov 23 15:29:44 2009 -0800 @@ -2203,7 +2203,7 @@ if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0) return (err); /* EFAULT or ENAMETOOLONG */ - if (getcid(sched_class, &classid) != 0 || classid == syscid) + if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid)) return (set_errno(EINVAL)); zone->zone_defaultcid = classid; ASSERT(zone->zone_defaultcid > 0 && @@ -3482,7 +3482,7 @@ * will have to tear down the zone, and fail, or try again. */ if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid, - minclsyspri - 1, &ct)) != 0) { + minclsyspri - 1, &ct, 0)) != 0) { mutex_enter(&zone_status_lock); zone_status_set(zone, ZONE_IS_SHUTTING_DOWN); mutex_exit(&zone_status_lock); @@ -4023,7 +4023,8 @@ * and initialize zsched appropriately. I'm not sure that that * makes much of a difference, though. */ - if (error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL)) { + error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0); + if (error != 0) { /* * We need to undo all globally visible state. */
--- a/usr/src/uts/common/sys/Makefile Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/common/sys/Makefile Mon Nov 23 15:29:44 2009 -0800 @@ -114,7 +114,7 @@ conf.h \ consdev.h \ console.h \ - consplat.h \ + consplat.h \ vt.h \ vtdaemon.h \ kd.h \ @@ -143,7 +143,7 @@ cyclic_impl.h \ dacf.h \ dacf_impl.h \ - damap.h \ + damap.h \ damap_impl.h \ dc_ki.h \ ddi.h \ @@ -186,7 +186,7 @@ dls.h \ dls_mgmt.h \ dls_impl.h \ - dma_i8237A.h \ + dma_i8237A.h \ dnlc.h \ door.h \ door_data.h \ @@ -196,9 +196,9 @@ dumpadm.h \ dumphdr.h \ ecppsys.h \ - ecppio.h \ - ecppreg.h \ - ecppvar.h \ + ecppio.h \ + ecppreg.h \ + ecppvar.h \ efi_partition.h \ elf.h \ elf_386.h \ @@ -244,8 +244,8 @@ fss.h \ fsspriocntl.h \ fsid.h \ - fssnap.h \ - fssnap_if.h \ + fssnap.h \ + fssnap_if.h \ fstyp.h \ ftrace.h \ fx.h \ @@ -391,7 +391,7 @@ multidata_impl.h \ mutex.h \ nbmlock.h \ - ndifm.h \ + ndifm.h \ ndi_impldefs.h \ net80211.h \ net80211_crypto.h \ @@ -402,11 +402,11 @@ netstack.h \ nexusdefs.h \ note.h \ - nvpair.h \ - nvpair_impl.h \ + nvpair.h \ + nvpair_impl.h \ objfs.h \ objfs_impl.h \ - ontrap.h \ + ontrap.h \ open.h \ openpromio.h \ panic.h \ @@ -440,7 +440,7 @@ port_impl.h \ port_kernel.h \ portif.h \ - ppmio.h \ + ppmio.h \ pppt_ic_if.h \ pppt_ioctl.h \ priocntl.h \ @@ -540,11 +540,13 @@ suntty.h \ swap.h \ synch.h \ + sysdc.h \ + sysdc_impl.h \ syscall.h \ sysconf.h \ sysconfig.h \ sysevent.h \ - sysevent_impl.h \ + sysevent_impl.h \ sysinfo.h \ syslog.h \ sysmacros.h \ @@ -737,7 +739,7 @@ idm_impl.h \ idm_so.h \ idm_text.h \ - idm_transport.h \ + idm_transport.h \ idm_conn_sm.h ISCSITHDRS= \
--- a/usr/src/uts/common/sys/class.h Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/common/sys/class.h Mon Nov 23 15:29:44 2009 -0800 @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -30,8 +30,6 @@ #ifndef _SYS_CLASS_H #define _SYS_CLASS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/t_lock.h> #include <sys/cred.h> #include <sys/thread.h> @@ -119,6 +117,8 @@ #ifdef _KERNEL +#define CLASS_KERNEL(cid) ((cid) == syscid || (cid) == sysdccid) + extern int nclass; /* number of configured scheduling classes */ extern char *defaultclass; /* default class for newproc'd processes */ extern struct sclass sclass[]; /* the class table */ @@ -127,6 +127,7 @@ extern pri_t minclsyspri; extern id_t syscid; /* system scheduling class ID */ +extern id_t sysdccid; /* system duty-cycle scheduling class ID */ extern id_t defaultcid; /* "default" class id; see dispadmin(1M) */ extern int alloc_cid(char *, id_t *);
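
CLASS_KERNEL() gives callers a single test for "scheduling class reserved for kernel threads", covering both syscid and the new sysdccid. The zone.c hunk above is the in-tree user; the shape, as a sketch with a placeholder class-name buffer:

id_t cid;

/* reject both kernel-only classes in one test */
if (getcid(class_name, &cid) != 0 || CLASS_KERNEL(cid))
	return (set_errno(EINVAL));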
--- a/usr/src/uts/common/sys/debug.h Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/common/sys/debug.h Mon Nov 23 15:29:44 2009 -0800 @@ -19,19 +19,19 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ - #ifndef _SYS_DEBUG_H #define _SYS_DEBUG_H #include <sys/isa_defs.h> #include <sys/types.h> +#include <sys/note.h> #ifdef __cplusplus extern "C" {
--- a/usr/src/uts/common/sys/param.h Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/common/sys/param.h Mon Nov 23 15:29:44 2009 -0800 @@ -87,6 +87,12 @@ #define MAX_TASKID 999999 #define MAX_MAXPID 999999 #define MAXEPHUID 0xfffffffcu /* max ephemeral user id */ + +#define FAMOUS_PID_SCHED 0 +#define FAMOUS_PID_INIT 1 +#define FAMOUS_PID_PAGEOUT 2 +#define FAMOUS_PID_FSFLUSH 3 +#define FAMOUS_PIDS 4 #endif #ifdef DEBUG @@ -95,7 +101,6 @@ #else #define DEFAULT_MAXPID 30000 #define DEFAULT_JUMPPID 0 - #endif #define MAXUID 2147483647 /* max user id */
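
The "famous" pids pin down the traditional numbering of the earliest system processes (sched=0, init=1, pageout=2, fsflush=3), so they keep those pids even when created as full kernel processes. A hedged sketch of how a famous pid flows through the widened newproc() interface; the main.c startup hunks are not part of this section, so the exact call is an assumption:

/*
 * Illustrative: the new trailing argument to newproc() (see the proc.h
 * change below) requests a specific pid.  Passing 0 keeps the old
 * "any pid" behavior, as in the zone.c calls above.
 */
error = newproc(pageout, NULL, syscid, maxclsyspri - 1, NULL,
    FAMOUS_PID_PAGEOUT);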
--- a/usr/src/uts/common/sys/proc.h Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/common/sys/proc.h Mon Nov 23 15:29:44 2009 -0800 @@ -564,6 +564,9 @@ /* pseudo-flag to lwp_create() */ #define NOCLASS (-1) +/* unused scheduling class ID */ +#define CLASS_UNUSED (-2) + /* LWP stats updated via lwp_stats_update() */ typedef enum { LWP_STAT_INBLK, @@ -580,7 +583,7 @@ /* process management functions */ -extern int newproc(void (*)(), caddr_t, id_t, int, struct contract **); +extern int newproc(void (*)(), caddr_t, id_t, int, struct contract **, pid_t); extern void vfwait(pid_t); extern void proc_detach(proc_t *); extern void freeproc(proc_t *); @@ -620,7 +623,7 @@ extern void sigdefault(proc_t *); extern void pid_setmin(void); -extern pid_t pid_allocate(proc_t *, int); +extern pid_t pid_allocate(proc_t *, pid_t, int); extern struct pid *pid_find(pid_t); extern int pid_rele(struct pid *); extern void pid_exit(proc_t *); @@ -658,6 +661,7 @@ extern void disable_msacct(proc_t *); extern hrtime_t mstate_aggr_state(proc_t *, int); extern hrtime_t mstate_thread_onproc_time(kthread_t *); +extern void mstate_systhread_times(kthread_t *, hrtime_t *, hrtime_t *); extern void syscall_mstate(int, int); extern uint_t cpu_update_pct(kthread_t *, hrtime_t); @@ -718,6 +722,7 @@ /* lwp function prototypes */ +extern kthread_t *lwp_kernel_create(proc_t *, void (*)(), void *, int, pri_t); extern klwp_t *lwp_create( void (*proc)(), caddr_t arg,
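
mstate_systhread_times() is the observability hook SDC builds on: it returns a system thread's accumulated on-CPU and runnable microstate times. The duty-cycle arithmetic it enables, as a worked sketch with illustrative variable names:

hrtime_t onproc, runnable;
uint_t DC;

mstate_systhread_times(t, &onproc, &runnable);

/*
 * A duty cycle is on-CPU time as a percentage of the time the thread
 * either ran or wanted to run; sleep time is excluded.  (100 here
 * corresponds to SYSDC_DC_MAX in sysdc_impl.h below.)
 */
if (onproc + runnable != 0)
	DC = (uint_t)((onproc * 100) / (onproc + runnable));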
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/sys/sysdc.h Mon Nov 23 15:29:44 2009 -0800 @@ -0,0 +1,44 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SYSDC_H +#define _SYS_SYSDC_H + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct _kthread; + +#define SYSDC_THREAD_BATCH 0x1 /* thread does batch processing */ +extern void sysdc_thread_enter(struct _kthread *, uint_t, uint_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SYSDC_H */
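
The public interface is a single call: an existing kernel thread is handed to SDC with a target duty cycle. A minimal sketch; the 80% figure is an assumption for illustration:

#include <sys/sysdc.h>

/* run this thread at roughly an 80% duty cycle; 0 = no flags */
sysdc_thread_enter(curthread, 80, 0);

/* or mark it as a batch worker, which SDC may deprioritize */
sysdc_thread_enter(curthread, 80, SYSDC_THREAD_BATCH);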
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/sys/sysdc_impl.h Mon Nov 23 15:29:44 2009 -0800 @@ -0,0 +1,129 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SYSDC_IMPL_H +#define _SYS_SYSDC_IMPL_H + +#include <sys/types.h> +#include <sys/time.h> +#include <sys/list.h> + +#include <sys/sysdc.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct _kthread; +struct cpupart; + +/* + * Tracks per-processor-set information for SDC. Its main use is to + * implement per-processor-set breaks. + */ +typedef struct sysdc_pset { + list_node_t sdp_node; /* node on sysdc_psets list */ + struct cpupart *sdp_cpupart; /* associated cpu partition */ + size_t sdp_nthreads; /* reference count */ + + /* The remainder is only touched by sysdc_update() */ + hrtime_t sdp_onproc_time; /* time onproc at last update */ + boolean_t sdp_need_break; /* threads forced to minpri */ + uint_t sdp_should_break; /* # updates need_break is set */ + uint_t sdp_dont_break; /* after break, # updates until next */ + + /* debugging fields */ + uint_t sdp_onproc_threads; + hrtime_t sdp_vtime_last_interval; + uint_t sdp_DC_last_interval; +} sysdc_pset_t; + +/* + * Per-thread information, pointed to by t_cldata. 
+ */ +typedef struct sysdc { + uint_t sdc_target_DC; /* target duty cycle */ + uint_t sdc_minpri; /* our minimum priority */ + uint_t sdc_maxpri; /* our maximum priority */ + + sysdc_pset_t *sdc_pset; /* the processor set bound to */ + + /* protected by sdl_lock */ + struct _kthread *sdc_thread; /* back-pointer, or NULL if freeable */ + + /* protected by arrangement between thread and sysdc_update() */ + struct sysdc *sdc_next; /* next in hash table, NULL if not in */ + + /* protected by thread_lock() */ + uint_t sdc_nupdates; /* number of sysdc_update_times() */ + + hrtime_t sdc_base_O; /* on-cpu time at last reset */ + hrtime_t sdc_base_R; /* runnable time at last reset */ + + uint_t sdc_sleep_updates; /* 0, or nupdates when we slept */ + clock_t sdc_ticks; /* sdc_tick() calls */ + clock_t sdc_update_ticks; /* value of ticks for forced update */ + clock_t sdc_pri_check; /* lbolt when we checked our priority */ + hrtime_t sdc_last_base_O; /* onproc time at sysdc_update() */ + + uint_t sdc_pri; /* our last computed priority */ + uint_t sdc_epri; /* our actual thread priority */ + + /* for debugging only */ + clock_t sdc_reset; /* lbolt when we reset our bases */ + hrtime_t sdc_cur_O; /* on-cpu time at last prio check */ + hrtime_t sdc_cur_R; /* runnable time at last prio check */ + hrtime_t sdc_last_O; /* onproc time at thread update */ + uint_t sdc_cur_DC; /* our actual duty cycle at last chk */ +} sysdc_t; + +/* + * Hash bucket of active SDC threads. + */ +typedef struct sysdc_list { + kmutex_t sdl_lock; /* lock keeping threads from exiting */ + sysdc_t *volatile sdl_list; /* list of active threads in bucket */ + char sdl_pad[64 - sizeof (kmutex_t) - sizeof (sysdc_t *)]; +} sysdc_list_t; + +/* + * Args to CL_ENTERCLASS(). + */ +typedef struct sysdc_params { + uint_t sdp_minpri; + uint_t sdp_maxpri; + uint_t sdp_DC; +} sysdc_params_t; + +/* + * Duty cycles are percentages in the range [1,100]. + */ +#define SYSDC_DC_MAX 100u /* 1 <= DC <= DC_MAX */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SYSDC_IMPL_H */
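
The fields above imply the control loop: periodically recompute the actual duty cycle from the on-CPU (O) and runnable (R) deltas since the last base reset, then swing the thread between its two priorities. A hedged sketch of that decision; the real logic lives in sysdc.c, which is not part of this section, and the helper name is invented:

static uint_t
sysdc_priority_sketch(sysdc_t *sdc, hrtime_t O, hrtime_t R)
{
	/* cur_DC = O / (O + R), scaled to [0, SYSDC_DC_MAX]; assumes O + R > 0 */
	sdc->sdc_cur_DC = (uint_t)((O * SYSDC_DC_MAX) / (O + R));

	/* below target: run at maxpri; at or above target: fall to minpri */
	return ((sdc->sdc_cur_DC < sdc->sdc_target_DC) ?
	    sdc->sdc_maxpri : sdc->sdc_minpri);
}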
--- a/usr/src/uts/common/sys/taskq.h Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/common/sys/taskq.h Mon Nov 23 15:29:44 2009 -0800 @@ -39,6 +39,8 @@ typedef uintptr_t taskqid_t; typedef void (task_func_t)(void *); +struct proc; + /* * Public flags for taskq_create(): bit range 0-15 */ @@ -46,6 +48,7 @@ #define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */ #define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */ #define TASKQ_THREADS_CPU_PCT 0x0008 /* number of threads as % of ncpu */ +#define TASKQ_DC_BATCH 0x0010 /* Taskq uses SDC in batch mode */ /* * Flags for taskq_dispatch. TQ_SLEEP/TQ_NOSLEEP should be same as @@ -55,6 +58,7 @@ #define TQ_NOSLEEP 0x01 /* cannot block for memory; may fail */ #define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */ #define TQ_NOALLOC 0x04 /* cannot allocate memory; may fail */ +#define TQ_FRONT 0x08 /* Put task at the front of the queue */ #ifdef _KERNEL @@ -66,6 +70,10 @@ extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); extern taskq_t *taskq_create_instance(const char *, int, int, pri_t, int, int, uint_t); +extern taskq_t *taskq_create_proc(const char *, int, pri_t, int, int, + struct proc *, uint_t); +extern taskq_t *taskq_create_sysdc(const char *, int, int, int, + struct proc *, uint_t, uint_t); extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); extern void nulltask(void *); extern void taskq_destroy(taskq_t *);
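
TQ_FRONT is the one new dispatch-time flag: it queues the task at the head of the list rather than the tail. A one-line sketch with placeholder function and argument names:

/* urgent work jumps ahead of anything already queued */
(void) taskq_dispatch(tq, my_urgent_func, my_arg, TQ_SLEEP | TQ_FRONT);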
--- a/usr/src/uts/common/sys/taskq_impl.h Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/common/sys/taskq_impl.h Mon Nov 23 15:29:44 2009 -0800 @@ -27,7 +27,9 @@ #define _SYS_TASKQ_IMPL_H #include <sys/taskq.h> +#include <sys/inttypes.h> #include <sys/vmem.h> +#include <sys/list.h> #include <sys/kstat.h> #ifdef __cplusplus @@ -81,13 +83,16 @@ #define TQBUCKET_CLOSE 0x01 #define TQBUCKET_SUSPEND 0x02 +#define TASKQ_INTERFACE_FLAGS 0x0000ffff /* defined in <sys/taskq.h> */ + /* * taskq implementation flags: bit range 16-31 */ -#define TASKQ_CHANGING 0x00010000 -#define TASKQ_SUSPENDED 0x00020000 -#define TASKQ_NOINSTANCE 0x00040000 -#define TASKQ_THREAD_CREATED 0x00080000 +#define TASKQ_CHANGING 0x00010000 /* nthreads != target */ +#define TASKQ_SUSPENDED 0x00020000 /* taskq is suspended */ +#define TASKQ_NOINSTANCE 0x00040000 /* no instance number */ +#define TASKQ_THREAD_CREATED 0x00080000 /* a thread has been created */ +#define TASKQ_DUTY_CYCLE 0x00100000 /* using the SDC class */ struct taskq { char tq_name[TASKQ_NAMELEN + 1]; @@ -116,13 +121,19 @@ kthread_t *_tq_thread; kthread_t **_tq_threadlist; } tq_thr; + + list_node_t tq_cpupct_link; /* linkage for taskq_cpupct_list */ + struct proc *tq_proc; /* process for taskq threads */ + int tq_cpupart; /* cpupart id bound to */ + uint_t tq_DC; /* duty cycle for SDC */ + /* * Statistics. */ kstat_t *tq_kstat; /* Exported statistics */ hrtime_t tq_totaltime; /* Time spent processing tasks */ - int tq_tasks; /* Total # of tasks posted */ - int tq_executed; /* Total # of tasks executed */ + uint64_t tq_tasks; /* Total # of tasks posted */ + uint64_t tq_executed; /* Total # of tasks executed */ int tq_maxtasks; /* Max number of tasks in the queue */ int tq_tcreates; int tq_tdeaths;
--- a/usr/src/uts/common/sys/vmsystm.h Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/common/sys/vmsystm.h Mon Nov 23 15:29:44 2009 -0800 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -39,8 +39,6 @@ #ifndef _SYS_VMSYSTM_H #define _SYS_VMSYSTM_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/proc.h> #ifdef __cplusplus @@ -154,7 +152,6 @@ extern void ppmapout(caddr_t); extern int pf_is_memory(pfn_t); -extern void pageout_init(void (*proc)(), proc_t *pp, pri_t pri); extern void dcache_flushall(void);
--- a/usr/src/uts/i86pc/os/startup.c Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/i86pc/os/startup.c Mon Nov 23 15:29:44 2009 -0800 @@ -1509,6 +1509,9 @@ if (modload("fs", "dev") == -1) halt("Can't load dev"); + if (modload("fs", "procfs") == -1) + halt("Can't load procfs"); + (void) modloadonly("sys", "lbl_edition"); dispinit();
--- a/usr/src/uts/i86pc/vm/vm_machdep.c Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/i86pc/vm/vm_machdep.c Mon Nov 23 15:29:44 2009 -0800 @@ -3720,16 +3720,6 @@ } /* - * Create the pageout scanner thread. The thread has to - * start at procedure with process pp and priority pri. - */ -void -pageout_init(void (*procedure)(), proc_t *pp, pri_t pri) -{ - (void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri); -} - -/* * Function for flushing D-cache when performing module relocations * to an alternate mapping. Unnecessary on Intel / AMD platforms. */
--- a/usr/src/uts/intel/Makefile.intel.shared Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/intel/Makefile.intel.shared Mon Nov 23 15:29:44 2009 -0800 @@ -526,7 +526,7 @@ # # Scheduling Class Modules (/kernel/sched): # -SCHED_KMODS += IA RT TS RT_DPTBL TS_DPTBL FSS FX FX_DPTBL +SCHED_KMODS += IA RT TS RT_DPTBL TS_DPTBL FSS FX FX_DPTBL SDC # # File System Modules (/kernel/fs):
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/intel/SDC/Makefile Mon Nov 23 15:29:44 2009 -0800 @@ -0,0 +1,82 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# uts/intel/SDC/Makefile +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# This makefile drives the production of the SDC scheduling class +# kernel module. +# +# intel architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = SDC +OBJECTS = $(SDC_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(SDC_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_SCHED_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/intel/Makefile.intel + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/intel/Makefile.targ
--- a/usr/src/uts/intel/ia32/ml/modstubs.s Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/intel/ia32/ml/modstubs.s Mon Nov 23 15:29:44 2009 -0800 @@ -764,6 +764,15 @@ #endif /* + * Stubs for sysdc + */ +#ifndef SDC_MODULE + MODULE(SDC,sched); + NO_UNLOAD_STUB(SDC, sysdc_thread_enter, nomod_zero); + END_MODULE(SDC); +#endif + +/* * Stubs for ts_dptbl */ #ifndef TS_DPTBL_MODULE
--- a/usr/src/uts/sparc/Makefile.sparc.shared Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/sparc/Makefile.sparc.shared Mon Nov 23 15:29:44 2009 -0800 @@ -359,7 +359,7 @@ # # Scheduling Class Modules (/kernel/sched): # -SCHED_KMODS += RT TS RT_DPTBL TS_DPTBL IA FSS FX FX_DPTBL +SCHED_KMODS += RT TS RT_DPTBL TS_DPTBL IA FSS FX FX_DPTBL SDC # # File System Modules (/kernel/fs):
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/sparc/SDC/Makefile Mon Nov 23 15:29:44 2009 -0800 @@ -0,0 +1,87 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# uts/sparc/SDC/Makefile +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# This makefile drives the production of the SDC scheduling class +# kernel module. +# +# sparc architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = SDC +OBJECTS = $(SDC_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(SDC_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_SCHED_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/sparc/Makefile.sparc + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# Overrides. +# +CFLAGS += $(CCVERBOSE) + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/sparc/Makefile.targ
--- a/usr/src/uts/sparc/ml/modstubs.s Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/sparc/ml/modstubs.s Mon Nov 23 15:29:44 2009 -0800 @@ -652,6 +652,15 @@ #endif /* + * Stubs for sysdc + */ +#ifndef SDC_MODULE + MODULE(SDC,sched); + NO_UNLOAD_STUB(SDC, sysdc_thread_enter, nomod_zero); + END_MODULE(SDC); +#endif + +/* * Stubs for ts_dptbl */ #ifndef TS_DPTBL_MODULE
--- a/usr/src/uts/sparc/zfs/Makefile Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/sparc/zfs/Makefile Mon Nov 23 15:29:44 2009 -0800 @@ -19,11 +19,9 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" -# # This makefile drives the production of the zfs file system # kernel module. @@ -59,8 +57,11 @@ # # Overrides and depends_on # +# We require sched/SDC because by the time vfs_mountroot() runs, +# we can no longer load modules through OBP. +# MODSTUBS_DIR = $(OBJS_DIR) -LDFLAGS += -dy -Nfs/specfs -Ncrypto/swrand -Nmisc/idmap +LDFLAGS += -dy -Nfs/specfs -Ncrypto/swrand -Nmisc/idmap -Nsched/SDC INC_PATH += -I$(UTSBASE)/common/fs/zfs INC_PATH += -I$(SRC)/common
--- a/usr/src/uts/sun4/os/startup.c Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/sun4/os/startup.c Mon Nov 23 15:29:44 2009 -0800 @@ -1583,6 +1583,9 @@ if (modloadonly("fs", "devfs") == -1) halt("Can't load devfs"); + if (modloadonly("fs", "procfs") == -1) + halt("Can't load procfs"); + if (modloadonly("misc", "swapgeneric") == -1) halt("Can't load swapgeneric");
--- a/usr/src/uts/sun4/vm/vm_dep.c Mon Nov 23 16:18:43 2009 -0800 +++ b/usr/src/uts/sun4/vm/vm_dep.c Mon Nov 23 15:29:44 2009 -0800 @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * UNIX machine dependent virtual memory support. */ @@ -1004,16 +1002,6 @@ } /* - * Create & Initialise pageout scanner thread. The thread has to - * start at procedure with process pp and priority pri. - */ -void -pageout_init(void (*procedure)(), proc_t *pp, pri_t pri) -{ - (void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri); -} - -/* * Function for flushing D-cache when performing module relocations * to an alternate mapping. Stubbed out on all platforms except sun4u, * at least for now.