9 years ago · e37e43a497
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -94,6 +94,7 @@ config X86
 
				 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
			
 
				 	select HAVE_ARCH_WITHIN_STACK_FRAMES
			
 
				 	select HAVE_EBPF_JIT			if X86_64
			
 
				+	select HAVE_ARCH_VMAP_STACK		if X86_64
			
 
				 	select HAVE_CC_STACKPROTECTOR
			
 
				 	select HAVE_CMPXCHG_DOUBLE
			
 
				 	select HAVE_CMPXCHG_LOCAL
			
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -8,6 +8,28 @@ struct tss_struct;
 
				 void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
			
 
				 		      struct tss_struct *tss);
			
 
				 
			
 
				+/* This runs on the previous thread's stack. */
			
 
				+static inline void prepare_switch_to(struct task_struct *prev,
			
 
				+				     struct task_struct *next)
			
 
				+{
			
 
				+#ifdef CONFIG_VMAP_STACK
			
 
				+	/*
			
 
				+	 * If we switch to a stack that has a top-level paging entry
			
 
				+	 * that is not present in the current mm, the resulting #PF will
			
 
				+	 * be promoted to a double-fault and we'll panic.  Probe
			
 
				+	 * the new stack now so that vmalloc_fault can fix up the page
			
 
				+	 * tables if needed.  This can only happen if we use a stack
			
 
				+	 * in vmap space.
			
 
				+	 *
			
 
				+	 * We assume that the stack is aligned so that it never spans
			
 
				+	 * more than one top-level paging entry.
			
 
				+	 *
			
 
				+	 * To minimize cache pollution, just follow the stack pointer.
			
 
				+	 */
			
 
				+	READ_ONCE(*(unsigned char *)next->thread.sp);
			
 
				+#endif
			
 
				+}
			
 
				+
			
 
				 #ifdef CONFIG_X86_32
			
 
				 
			
 
				 #ifdef CONFIG_CC_STACKPROTECTOR
			
@@ -39,6 +61,8 @@ do {									\
 
				 	 */								\
			
 
				 	unsigned long ebx, ecx, edx, esi, edi;				\
			
 
				 									\
			
 
				+	prepare_switch_to(prev, next);					\
			
 
				+									\
			
 
				 	asm volatile("pushl %%ebp\n\t"		/* save    EBP   */	\
			
 
				 		     "movl %%esp,%[prev_sp]\n\t"	/* save    ESP   */ \
			
 
				 		     "movl %[next_sp],%%esp\n\t"	/* restore ESP   */ \
			
@@ -103,7 +127,9 @@ do {									\
 
				  * clean in kernel mode, with the possible exception of IOPL.  Kernel IOPL
			
 
				  * has no effect.
			
 
				  */
			
 
				-#define switch_to(prev, next, last) \
			
 
				+#define switch_to(prev, next, last)					  \
			
 
				+	prepare_switch_to(prev, next);					  \
			
 
				+									  \
			
 
				 	asm volatile(SAVE_CONTEXT					  \
			
 
				 	     "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */	  \
			
 
				 	     "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */	  \
			
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -292,12 +292,30 @@ DO_ERROR(X86_TRAP_NP,     SIGBUS,  "segment not present",	segment_not_present)
 
				 DO_ERROR(X86_TRAP_SS,     SIGBUS,  "stack segment",		stack_segment)
			
 
				 DO_ERROR(X86_TRAP_AC,     SIGBUS,  "alignment check",		alignment_check)
			
 
				 
			
 
				+#ifdef CONFIG_VMAP_STACK
			
 
				+static void __noreturn handle_stack_overflow(const char *message,
			
 
				+					     struct pt_regs *regs,
			
 
				+					     unsigned long fault_address)
			
 
				+{
			
 
				+	printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
			
 
				+		 (void *)fault_address, current->stack,
			
 
				+		 (char *)current->stack + THREAD_SIZE - 1);
			
 
				+	die(message, regs, 0);
			
 
				+
			
 
				+	/* Be absolutely certain we don't return. */
			
 
				+	panic(message);
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				 #ifdef CONFIG_X86_64
			
 
				 /* Runs on IST stack */
			
 
				 dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
			
 
				 {
			
 
				 	static const char str[] = "double fault";
			
 
				 	struct task_struct *tsk = current;
			
 
				+#ifdef CONFIG_VMAP_STACK
			
 
				+	unsigned long cr2;
			
 
				+#endif
			
 
				 
			
 
				 #ifdef CONFIG_X86_ESPFIX64
			
 
				 	extern unsigned char native_irq_return_iret[];
			
@@ -332,6 +350,49 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 
				 	tsk->thread.error_code = error_code;
			
 
				 	tsk->thread.trap_nr = X86_TRAP_DF;
			
 
				 
			
 
				+#ifdef CONFIG_VMAP_STACK
			
 
				+	/*
			
 
				+	 * If we overflow the stack into a guard page, the CPU will fail
			
 
				+	 * to deliver #PF and will send #DF instead.  Similarly, if we
			
 
				+	 * take any non-IST exception while too close to the bottom of
			
 
				+	 * the stack, the processor will get a page fault while
			
 
				+	 * delivering the exception and will generate a double fault.
			
 
				+	 *
			
 
				+	 * According to the SDM (footnote in 6.15 under "Interrupt 14 -
			
 
				+	 * Page-Fault Exception (#PF)"):
			
 
				+	 *
			
 
				+	 *   Processors update CR2 whenever a page fault is detected. If a
			
 
				+	 *   second page fault occurs while an earlier page fault is being
			
 
				+	 *   delivered, the faulting linear address of the second fault will
			
 
				+	 *   overwrite the contents of CR2 (replacing the previous
			
 
				+	 *   address). These updates to CR2 occur even if the page fault
			
 
				+	 *   results in a double fault or occurs during the delivery of a
			
 
				+	 *   double fault.
			
 
				+	 *
			
 
				+	 * The logic below has a small possibility of incorrectly diagnosing
			
 
				+	 * some errors as stack overflows.  For example, if the IDT or GDT
			
 
				+	 * gets corrupted such that #GP delivery fails due to a bad descriptor
			
 
				+	 * causing #GP and we hit this condition while CR2 coincidentally
			
 
				+	 * points to the stack guard page, we'll think we overflowed the
			
 
				+	 * stack.  Given that we're going to panic one way or another
			
 
				+	 * if this happens, this isn't necessarily worth fixing.
			
 
				+	 *
			
 
				+	 * If necessary, we could improve the test by only diagnosing
			
 
				+	 * a stack overflow if the saved RSP points within 47 bytes of
			
 
				+	 * the bottom of the stack: if RSP == tsk_stack + 48 and we
			
 
				+	 * take an exception, the stack is already aligned and there
			
 
				+	 * will be enough room for SS, RSP, RFLAGS, CS, RIP, and a
			
 
				+	 * possible error code, so a stack overflow would *not* double
			
 
				+	 * fault.  With any less space left, exception delivery could
			
 
				+	 * fail, and, as a practical matter, we've overflowed the
			
 
				+	 * stack even if the actual trigger for the double fault was
			
 
				+	 * something else.
			
 
				+	 */
			
 
				+	cr2 = read_cr2();
			
 
				+	if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
			
 
				+		handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
			
 
				+#endif
			
 
				+
			
 
				 #ifdef CONFIG_DOUBLEFAULT
			
 
				 	df_debug(regs, error_code);
			
 
				 #endif
			
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -77,10 +77,25 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 
				 	unsigned cpu = smp_processor_id();
			
 
				 
			
 
				 	if (likely(prev != next)) {
			
 
				+		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
			
 
				+			/*
			
 
				+			 * If our current stack is in vmalloc space and isn't
			
 
				+			 * mapped in the new pgd, we'll double-fault.  Forcibly
			
 
				+			 * map it.
			
 
				+			 */
			
 
				+			unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
			
 
				+
			
 
				+			pgd_t *pgd = next->pgd + stack_pgd_index;
			
 
				+
			
 
				+			if (unlikely(pgd_none(*pgd)))
			
 
				+				set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
			
 
				+		}
			
 
				+
			
 
				 #ifdef CONFIG_SMP
			
 
				 		this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
			
 
				 		this_cpu_write(cpu_tlbstate.active_mm, next);
			
 
				 #endif
			
 
				+
			
 
				 		cpumask_set_cpu(cpu, mm_cpumask(next));
			
 
				 
			
 
				 		/*