Mercurial > illumos > illumos-gate
changeset 3939:8f689b97125d
6533678 dtrace incorrectly resumes execution in the middle of 2-byte int 3 instructions
6533954 kmdb destroys usermode GSBASE when entering
6534035 dtrace_user_probe fails to consider alternate code selectors when checking for int3
6534061 amd64 DEBUG kernel's __SAVE_REGS should save the {gs,fs}base registers
6534200 syscall rewriting code is not thread-safe
6534218 Incorrect ASSERT logic in prnldt induces panic
6534277 syscall rewriting code in trap() should be aware of alternate segment selectors
author | sethg |
---|---|
date | Fri, 30 Mar 2007 19:27:19 -0700 |
parents | 670947f6c3f6 |
children | 5517729c4306 |
files | usr/src/uts/i86pc/os/dtrace_subr.c usr/src/uts/i86pc/os/machdep.c usr/src/uts/i86pc/os/trap.c usr/src/uts/i86pc/sys/machsystm.h usr/src/uts/intel/amd64/ml/mach_offsets.in usr/src/uts/intel/amd64/sys/privregs.h usr/src/uts/intel/fs/proc/prmachdep.c usr/src/uts/intel/kdi/amd64/kdi_asm.s |
diffstat | 8 files changed, 235 insertions(+), 51 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/uts/i86pc/os/dtrace_subr.c Fri Mar 30 17:01:13 2007 -0700 +++ b/usr/src/uts/i86pc/os/dtrace_subr.c Fri Mar 30 19:27:19 2007 -0700 @@ -240,7 +240,8 @@ rp->r_pc = npc; } else if (rp->r_trapno == T_BPTFLT) { - uint8_t instr; + uint8_t instr, instr2; + caddr_t linearpc; rwp = &CPU->cpu_ft_lock; /* @@ -258,14 +259,25 @@ } rw_exit(rwp); + if (dtrace_linear_pc(rp, p, &linearpc) != 0) { + trap(rp, addr, cpuid); + return; + } + /* * If the instruction that caused the breakpoint trap doesn't * look like an int 3 anymore, it may be that this tracepoint * was removed just after the user thread executed it. In * that case, return to user land to retry the instuction. + * Note that we assume the length of the instruction to retry + * is 1 byte because that's the length of FASTTRAP_INSTR. + * We check for r_pc > 0 and > 2 so that we don't have to + * deal with segment wraparound. */ - if (fuword8((void *)(rp->r_pc - 1), &instr) == 0 && - instr != FASTTRAP_INSTR) { + if (rp->r_pc > 0 && fuword8(linearpc - 1, &instr) == 0 && + instr != FASTTRAP_INSTR && + (instr != 3 || (rp->r_pc >= 2 && + (fuword8(linearpc - 2, &instr2) != 0 || instr2 != 0xCD)))) { rp->r_pc--; return; }
--- a/usr/src/uts/i86pc/os/machdep.c Fri Mar 30 17:01:13 2007 -0700 +++ b/usr/src/uts/i86pc/os/machdep.c Fri Mar 30 19:27:19 2007 -0700 @@ -29,6 +29,7 @@ #include <sys/types.h> #include <sys/t_lock.h> #include <sys/param.h> +#include <sys/segments.h> #include <sys/sysmacros.h> #include <sys/signal.h> #include <sys/systm.h> @@ -1019,3 +1020,100 @@ { return (0); } + +/* + * Calculates a linear address, given the CS selector and PC values + * by looking up the %cs selector process's LDT or the CPU's GDT. + * proc->p_ldtlock must be held across this call. + */ +int +linear_pc(struct regs *rp, proc_t *p, caddr_t *linearp) +{ + user_desc_t *descrp; + caddr_t baseaddr; + uint16_t idx = SELTOIDX(rp->r_cs); + + ASSERT(rp->r_cs <= 0xFFFF); + ASSERT(MUTEX_HELD(&p->p_ldtlock)); + + if (SELISLDT(rp->r_cs)) { + /* + * Currently 64 bit processes cannot have private LDTs. + */ + ASSERT(p->p_model != DATAMODEL_LP64); + + if (p->p_ldt == NULL) + return (-1); + + descrp = &p->p_ldt[idx]; + baseaddr = (caddr_t)(uintptr_t)USEGD_GETBASE(descrp); + + /* + * Calculate the linear address (wraparound is not only ok, + * it's expected behavior). The cast to uint32_t is because + * LDT selectors are only allowed in 32-bit processes. + */ + *linearp = (caddr_t)(uintptr_t)(uint32_t)((uintptr_t)baseaddr + + rp->r_pc); + } else { +#ifdef DEBUG + descrp = &CPU->cpu_gdt[idx]; + baseaddr = (caddr_t)(uintptr_t)USEGD_GETBASE(descrp); + /* GDT-based descriptors' base addresses should always be 0 */ + ASSERT(baseaddr == 0); +#endif + *linearp = (caddr_t)(uintptr_t)rp->r_pc; + } + + return (0); +} + +/* + * The implementation of dtrace_linear_pc is similar to the that of + * linear_pc, above, but here we acquire p_ldtlock before accessing + * p_ldt. This implementation is used by the pid provider; we prefix + * it with "dtrace_" to avoid inducing spurious tracing events. + */ +int +dtrace_linear_pc(struct regs *rp, proc_t *p, caddr_t *linearp) +{ + user_desc_t *descrp; + caddr_t baseaddr; + uint16_t idx = SELTOIDX(rp->r_cs); + + ASSERT(rp->r_cs <= 0xFFFF); + + if (SELISLDT(rp->r_cs)) { + /* + * Currently 64 bit processes cannot have private LDTs. + */ + ASSERT(p->p_model != DATAMODEL_LP64); + + mutex_enter(&p->p_ldtlock); + if (p->p_ldt == NULL) { + mutex_exit(&p->p_ldtlock); + return (-1); + } + descrp = &p->p_ldt[idx]; + baseaddr = (caddr_t)(uintptr_t)USEGD_GETBASE(descrp); + mutex_exit(&p->p_ldtlock); + + /* + * Calculate the linear address (wraparound is not only ok, + * it's expected behavior). The cast to uint32_t is because + * LDT selectors are only allowed in 32-bit processes. + */ + *linearp = (caddr_t)(uintptr_t)(uint32_t)((uintptr_t)baseaddr + + rp->r_pc); + } else { +#ifdef DEBUG + descrp = &CPU->cpu_gdt[idx]; + baseaddr = (caddr_t)(uintptr_t)USEGD_GETBASE(descrp); + /* GDT-based descriptors' base addresses should always be 0 */ + ASSERT(baseaddr == 0); +#endif + *linearp = (caddr_t)(uintptr_t)rp->r_pc; + } + + return (0); +}
--- a/usr/src/uts/i86pc/os/trap.c Fri Mar 30 17:01:13 2007 -0700 +++ b/usr/src/uts/i86pc/os/trap.c Fri Mar 30 19:27:19 2007 -0700 @@ -132,6 +132,9 @@ #define TRAP_TYPES (sizeof (trap_type) / sizeof (trap_type[0])) +#define SLOW_SCALL_SIZE 2 +#define FAST_SCALL_SIZE 2 + int tudebug = 0; int tudebugbpt = 0; int tudebugfpe = 0; @@ -206,8 +209,6 @@ * int <vector> is two bytes: 0xCD <vector> */ -#define SLOW_SCALL_SIZE 2 - static int rewrite_syscall(caddr_t pc) { @@ -227,28 +228,88 @@ * * sysenter is two bytes: 0x0F 0x34 * syscall is two bytes: 0x0F 0x05 + * int $T_SYSCALLINT is two bytes: 0xCD 0x91 */ -#define FAST_SCALL_SIZE 2 - static int -instr_is_fast_syscall(caddr_t pc, int which) +instr_is_other_syscall(caddr_t pc, int which) { uchar_t instr[FAST_SCALL_SIZE]; - ASSERT(which == X86_SEP || which == X86_ASYSC); + ASSERT(which == X86_SEP || which == X86_ASYSC || which == 0xCD); - if (copyin_nowatch(pc, (caddr_t)instr, FAST_SCALL_SIZE) != 0 || - instr[0] != 0x0F) + if (copyin_nowatch(pc, (caddr_t)instr, FAST_SCALL_SIZE) != 0) return (0); - if ((which == X86_SEP && instr[1] == 0x34) || - (which == X86_ASYSC && instr[1] == 0x05)) - return (1); + switch (which) { + case X86_SEP: + if (instr[0] == 0x0F && instr[1] == 0x34) + return (1); + break; + case X86_ASYSC: + if (instr[0] == 0x0F && instr[1] == 0x05) + return (1); + break; + case 0xCD: + if (instr[0] == 0xCD && instr[1] == T_SYSCALLINT) + return (1); + break; + } return (0); } +static const char * +syscall_insn_string(int syscall_insn) +{ + switch (syscall_insn) { + case X86_SEP: + return ("sysenter"); + case X86_ASYSC: + return ("syscall"); + case 0xCD: + return ("int"); + default: + return ("Unknown"); + } +} + +static int +ldt_rewrite_syscall(struct regs *rp, proc_t *p, int syscall_insn) +{ + caddr_t linearpc; + int return_code = 0; + + mutex_enter(&p->p_ldtlock); /* Must be held across linear_pc() */ + + if (linear_pc(rp, p, &linearpc) == 0) { + + /* + * If another thread beat us here, it already changed + * this site to the slower (int) syscall instruction. + */ + if (instr_is_other_syscall(linearpc, 0xCD)) { + return_code = 1; + } else if (instr_is_other_syscall(linearpc, syscall_insn)) { + + if (rewrite_syscall(linearpc) == 0) { + return_code = 1; + } +#ifdef DEBUG + else + cmn_err(CE_WARN, "failed to rewrite %s " + "instruction in process %d", + syscall_insn_string(syscall_insn), + p->p_pid); +#endif /* DEBUG */ + } + } + + mutex_exit(&p->p_ldtlock); /* Must be held across linear_pc() */ + + return (return_code); +} + /* * Test to see if the instruction at pc is a system call instruction. * @@ -260,7 +321,7 @@ #define LCALLSIZE 7 static int -instr_is_syscall(caddr_t pc) +instr_is_lcall_syscall(caddr_t pc) { uchar_t instr[LCALLSIZE]; @@ -704,7 +765,7 @@ * trap gate sequence. We just have to adjust the pc. */ if (watchpage && addr == (caddr_t)rp->r_sp && - rw == S_READ && instr_is_syscall((caddr_t)rp->r_pc)) { + rw == S_READ && instr_is_lcall_syscall((caddr_t)rp->r_pc)) { extern void watch_syscall(void); rp->r_pc += LCALLSIZE; @@ -828,16 +889,8 @@ * be to emulate that particular instruction. */ if (p->p_ldt != NULL && - instr_is_fast_syscall((caddr_t)rp->r_pc, X86_ASYSC)) { - if (rewrite_syscall((caddr_t)rp->r_pc) == 0) - goto out; -#ifdef DEBUG - else - cmn_err(CE_WARN, "failed to rewrite syscall " - "instruction in process %d", - curthread->t_procp->p_pid); -#endif /* DEBUG */ - } + ldt_rewrite_syscall(rp, p, X86_ASYSC)) + goto out; #ifdef __amd64 /* @@ -1114,7 +1167,7 @@ #ifdef _SYSCALL32_IMPL if (p->p_model != DATAMODEL_NATIVE) { #endif /* _SYSCALL32_IMPL */ - if (instr_is_syscall((caddr_t)rp->r_pc)) { + if (instr_is_lcall_syscall((caddr_t)rp->r_pc)) { if (type == T_SEGFLT + USER) ASSERT(p->p_ldt != NULL); @@ -1161,16 +1214,9 @@ * this will be to emulate that particular instruction. */ if (p->p_ldt != NULL && - instr_is_fast_syscall((caddr_t)rp->r_pc, X86_SEP)) { - if (rewrite_syscall((caddr_t)rp->r_pc) == 0) - goto out; -#ifdef DEBUG - else - cmn_err(CE_WARN, "failed to rewrite sysenter " - "instruction in process %d", - curthread->t_procp->p_pid); -#endif /* DEBUG */ - } + ldt_rewrite_syscall(rp, p, X86_SEP)) + goto out; + /*FALLTHROUGH*/ case T_BOUNDFLT + USER: /* bound fault */
--- a/usr/src/uts/i86pc/sys/machsystm.h Fri Mar 30 17:01:13 2007 -0700 +++ b/usr/src/uts/i86pc/sys/machsystm.h Fri Mar 30 19:27:19 2007 -0700 @@ -121,6 +121,8 @@ extern void memlist_add(uint64_t, uint64_t, struct memlist *, struct memlist **); extern page_t *page_get_physical(uintptr_t); +extern int linear_pc(struct regs *rp, proc_t *p, caddr_t *linearp); +extern int dtrace_linear_pc(struct regs *rp, proc_t *p, caddr_t *linearp); #endif /* _KERNEL */
--- a/usr/src/uts/intel/amd64/ml/mach_offsets.in Fri Mar 30 17:01:13 2007 -0700 +++ b/usr/src/uts/intel/amd64/ml/mach_offsets.in Fri Mar 30 19:27:19 2007 -0700 @@ -77,6 +77,10 @@ r_r13 REGOFF_R13 r_r14 REGOFF_R14 r_r15 REGOFF_R15 +\#if DEBUG + __r_fsbase REGOFF_FSBASE + __r_gsbase REGOFF_GSBASE +\#endif r_ds REGOFF_DS r_es REGOFF_ES r_fs REGOFF_FS
--- a/usr/src/uts/intel/amd64/sys/privregs.h Fri Mar 30 17:01:13 2007 -0700 +++ b/usr/src/uts/intel/amd64/sys/privregs.h Fri Mar 30 19:27:19 2007 -0700 @@ -78,15 +78,11 @@ greg_t r_r15; /* callee-saved */ /* - * XX64 - * We used to sample fsbase and gsbase on every exception - * with expensive rdmsr's. Yet this was only useful at - * best for debugging during amd64 bringup. We should take - * these away but for now simply rename them to avoid any - * flag days. + * fsbase and gsbase are sampled on every exception in DEBUG kernels + * only. They remain in the non-DEBUG kernel to avoid any flag days. */ - greg_t __r_fsbase; /* XX64 no longer used by the kernel */ - greg_t __r_gsbase; /* XX64 no longer used by the kernel */ + greg_t __r_fsbase; /* no longer used in non-DEBUG builds */ + greg_t __r_gsbase; /* no longer used in non-DEBUG builds */ greg_t r_ds; greg_t r_es; greg_t r_fs; /* %fs is *never* used by the kernel */ @@ -123,6 +119,20 @@ #include <sys/machprivregs.h> #include <sys/pcb.h> +#ifdef DEBUG +#define __SAVE_BASES \ + movl $MSR_AMD_FSBASE, %ecx; \ + rdmsr; \ + movl %eax, REGOFF_FSBASE(%rsp); \ + movl %edx, REGOFF_FSBASE+4(%rsp); \ + movl $MSR_AMD_GSBASE, %ecx; \ + rdmsr; \ + movl %eax, REGOFF_GSBASE(%rsp); \ + movl %edx, REGOFF_GSBASE+4(%rsp) +#else +#define __SAVE_BASES +#endif + /* * Create a struct regs on the stack suitable for an * interrupt trap. @@ -157,7 +167,8 @@ movw %es, %cx; \ movq %rcx, REGOFF_ES(%rsp); \ movw %ds, %cx; \ - movq %rcx, REGOFF_DS(%rsp) + movq %rcx, REGOFF_DS(%rsp); \ + __SAVE_BASES #define __RESTORE_REGS \ movq REGOFF_RDI(%rsp), %rdi; \
--- a/usr/src/uts/intel/fs/proc/prmachdep.c Fri Mar 30 17:01:13 2007 -0700 +++ b/usr/src/uts/intel/fs/proc/prmachdep.c Fri Mar 30 19:27:19 2007 -0700 @@ -508,10 +508,9 @@ ASSERT(MUTEX_HELD(&p->p_ldtlock)); /* - * Currently 64 bit processes cannot have a private ldt. + * Currently 64 bit processes cannot have private LDTs. */ - ASSERT(get_udatamodel() != DATAMODEL_LP64 || p->p_ldt == NULL); - + ASSERT(p->p_model != DATAMODEL_LP64 || p->p_ldt == NULL); if (p->p_ldt == NULL) return (0);
--- a/usr/src/uts/intel/kdi/amd64/kdi_asm.s Fri Mar 30 17:01:13 2007 -0700 +++ b/usr/src/uts/intel/kdi/amd64/kdi_asm.s Fri Mar 30 19:27:19 2007 -0700 @@ -104,10 +104,20 @@ movw %fs, %ax; \ movq %rax, REG_OFF(KDIREG_FS)(base); \ movw %gs, %ax; \ - movq %rax, REG_OFF(KDIREG_GS)(base) + movq %rax, REG_OFF(KDIREG_GS)(base); \ + movl $MSR_AMD_GSBASE, %ecx; \ + rdmsr; \ + shlq $32, %rdx; \ + orq %rax, %rdx; \ + movq %rdx, REG_OFF(KDIREG_GSBASE)(base) #define KDI_RESTORE_REGS(base) \ movq base, %rdi; \ + movq REG_OFF(KDIREG_GSBASE)(%rdi), %rdx; \ + movq %rdx, %rax; \ + shrq $32, %rdx; \ + movl $MSR_AMD_GSBASE, %ecx; \ + wrmsr; \ movq REG_OFF(KDIREG_ES)(%rdi), %rax; \ movw %ax, %es; \ movq REG_OFF(KDIREG_DS)(%rdi), %rax; \ @@ -193,7 +203,9 @@ * Switch to the kernel's GSBASE. Neither GSBASE nor the ill-named * KGSBASE can be trusted, as the kernel may or may not have already * done a swapgs. All is not lost, as the kernel can divine the correct - * value for us. + * value for us. Note that the previous GSBASE is saved in the + * KDI_SAVE_REGS macro to prevent a usermode process's GSBASE from being + * blown away. */ subq $10, %rsp sgdt (%rsp)