@@ -1237,11 +1237,12 @@ ENTRY(nmi)
 * If the variable is not set and the stack is not the NMI
 * stack then:
 * o Set the special variable on the stack
- * o Copy the interrupt frame into a "saved" location on the stack
- * o Copy the interrupt frame into a "copy" location on the stack
+ * o Copy the interrupt frame into an "outermost" location on the
+ * stack
+ * o Copy the interrupt frame into an "iret" location on the stack
 * o Continue processing the NMI
 * If the variable is set or the previous stack is the NMI stack:
- * o Modify the "copy" location to jump to the repeate_nmi
+ * o Modify the "iret" location to jump to the repeat_nmi
 * o return back to the first NMI
 *
 * Now on exit of the first NMI, we first clear the stack variable
@@ -1250,31 +1251,151 @@ ENTRY(nmi)
 * a nested NMI that updated the copy interrupt stack frame, a
 * jump will be made to the repeat_nmi code that will handle the second
 * NMI.
+ *
+ * However, espfix prevents us from directly returning to userspace
+ * with a single IRET instruction. Similarly, IRET to user mode
+ * can fault. We therefore handle NMIs from user space like
+ * other IST entries.
 */

 /* Use %rdx as our temp variable throughout */
 pushq %rdx

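+ /*
+ * CS-RIP is the offset of the saved CS from the saved RIP in the
+ * hardware frame; the extra +8 skips the %rdx pushed above, so this
+ * addresses the saved CS. Its low two bits hold the CPL of the
+ * interrupted context, so a nonzero result means we came from user
+ * mode.
+ */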
+ testb $3, CS-RIP+8(%rsp)
+ jz .Lnmi_from_kernel
+
+ /*
+ * NMI from user mode. We need to run on the thread stack, but we
+ * can't go through the normal entry paths: NMIs are masked, and
+ * we don't want to enable interrupts, because then we'll end
+ * up in an awkward situation in which IRQs are on but NMIs
+ * are off.
+ */
+
+ SWAPGS
+ cld
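+ /*
+ * Stash the NMI-stack pointer in %rdx, switch to this CPU's thread
+ * stack, and then copy the hardware frame over from %rdx below.
+ */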
+ movq %rsp, %rdx
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ pushq 5*8(%rdx) /* pt_regs->ss */
+ pushq 4*8(%rdx) /* pt_regs->rsp */
+ pushq 3*8(%rdx) /* pt_regs->flags */
+ pushq 2*8(%rdx) /* pt_regs->cs */
+ pushq 1*8(%rdx) /* pt_regs->rip */
+ pushq $-1 /* pt_regs->orig_ax */
+ pushq %rdi /* pt_regs->di */
+ pushq %rsi /* pt_regs->si */
+ pushq (%rdx) /* pt_regs->dx */
+ pushq %rcx /* pt_regs->cx */
+ pushq %rax /* pt_regs->ax */
+ pushq %r8 /* pt_regs->r8 */
+ pushq %r9 /* pt_regs->r9 */
+ pushq %r10 /* pt_regs->r10 */
+ pushq %r11 /* pt_regs->r11 */
+ pushq %rbx /* pt_regs->rbx */
+ pushq %rbp /* pt_regs->rbp */
+ pushq %r12 /* pt_regs->r12 */
+ pushq %r13 /* pt_regs->r13 */
+ pushq %r14 /* pt_regs->r14 */
+ pushq %r15 /* pt_regs->r15 */
+
+ /*
+ * At this point we no longer need to worry about stack damage
+ * due to nesting -- we're on the normal thread stack and we're
+ * done with the NMI stack.
+ */
+
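+ /* do_nmi(regs, error_code): pt_regs pointer in %rdi, error code in %rsi. */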
+ movq %rsp, %rdi
+ movq $-1, %rsi
+ call do_nmi
+
+ /*
+ * Return back to user mode. We must *not* do the normal exit
+ * work, because we don't want to enable interrupts. Fortunately,
+ * do_nmi doesn't modify pt_regs.
+ */
+ SWAPGS
+ jmp restore_c_regs_and_iret
+
+.Lnmi_from_kernel:
+ /*
+ * Here's what our stack frame will look like:
+ * +---------------------------------------------------------+
+ * | original SS |
+ * | original Return RSP |
+ * | original RFLAGS |
+ * | original CS |
+ * | original RIP |
+ * +---------------------------------------------------------+
+ * | temp storage for rdx |
+ * +---------------------------------------------------------+
+ * | "NMI executing" variable |
+ * +---------------------------------------------------------+
+ * | iret SS } Copied from "outermost" frame |
+ * | iret Return RSP } on each loop iteration; overwritten |
+ * | iret RFLAGS } by a nested NMI to force another |
+ * | iret CS } iteration if needed. |
+ * | iret RIP } |
+ * +---------------------------------------------------------+
+ * | outermost SS } initialized in first_nmi; |
+ * | outermost Return RSP } will not be changed before |
+ * | outermost RFLAGS } NMI processing is done. |
+ * | outermost CS } Copied to "iret" frame on each |
+ * | outermost RIP } iteration. |
+ * +---------------------------------------------------------+
+ * | pt_regs |
+ * +---------------------------------------------------------+
+ *
+ * The "original" frame is used by hardware. Before re-enabling
+ * NMIs, we need to be done with it, and we need to leave enough
+ * space for the asm code here.
+ *
+ * We return by executing IRET while RSP points to the "iret" frame.
+ * That will either return for real or it will loop back into NMI
+ * processing.
+ *
+ * The "outermost" frame is copied to the "iret" frame on each
+ * iteration of the loop, so each iteration starts with the "iret"
+ * frame pointing to the final return target.
+ */
+
 /*
- * If %cs was not the kernel segment, then the NMI triggered in user
- * space, which means it is definitely not nested.
+ * Determine whether we're a nested NMI.
+ *
+ * If we interrupted kernel code between repeat_nmi and
+ * end_repeat_nmi, then we are a nested NMI. We must not
+ * modify the "iret" frame because it's being written by
+ * the outer NMI. That's okay; the outer NMI handler is
+ * about to call do_nmi anyway, so we can just
+ * resume the outer NMI.
 */
- cmpl $__KERNEL_CS, 16(%rsp)
- jne first_nmi
+
+ movq $repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja 1f
+ movq $end_repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja nested_nmi_out
+1:

 /*
- * Check the special variable on the stack to see if NMIs are
- * executing.
+ * Now check "NMI executing". If it's set, then we're nested.
+ * This will not detect if we interrupted an outer NMI just
+ * before IRET.
 */
 cmpl $1, -8(%rsp)
 je nested_nmi

 /*
- * Now test if the previous stack was an NMI stack.
- * We need the double check. We check the NMI stack to satisfy the
- * race when the first NMI clears the variable before returning.
- * We check the variable because the first NMI could be in a
- * breakpoint routine using a breakpoint stack.
+ * Now test if the previous stack was an NMI stack. This covers
+ * the case where we interrupt an outer NMI after it clears
+ * "NMI executing" but before IRET. We need to be careful, though:
+ * there is one case in which RSP could point to the NMI stack
+ * despite there being no NMI active: naughty userspace controls
+ * RSP at the very beginning of the SYSCALL targets. We can
+ * pull a fast one on naughty userspace, though: we program
+ * SYSCALL to mask DF, so userspace cannot cause DF to be set
+ * if it controls the kernel's RSP. We set DF before we clear
+ * "NMI executing".
 */
 lea 6*8(%rsp), %rdx
 /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
@@ -1286,25 +1407,20 @@ ENTRY(nmi)
 cmpq %rdx, 4*8(%rsp)
 /* If it is below the NMI stack, it is a normal NMI */
 jb first_nmi
- /* Ah, it is within the NMI stack, treat it as nested */
+
+ /* Ah, it is within the NMI stack. */
+
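+ /*
+ * The saved RFLAGS lives at 3*8(%rsp), above the pushed %rdx, RIP
+ * and CS. DF is bit 10 of RFLAGS, so testing X86_EFLAGS_DF >> 8
+ * against the byte at 3*8 + 1 checks it directly.
+ */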
+ testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
+ jz first_nmi /* RSP was user controlled. */
+
+ /* This is a nested NMI. */

 nested_nmi:
 /*
- * Do nothing if we interrupted the fixup in repeat_nmi.
- * It's about to repeat the NMI handler, so we are fine
- * with ignoring this one.
+ * Modify the "iret" frame to point to repeat_nmi, forcing another
+ * iteration of NMI handling.
 */
- movq $repeat_nmi, %rdx
- cmpq 8(%rsp), %rdx
- ja 1f
- movq $end_repeat_nmi, %rdx
- cmpq 8(%rsp), %rdx
- ja nested_nmi_out
-
-1:
- /* Set up the interrupted NMIs stack to jump to repeat_nmi */
- leaq -1*8(%rsp), %rdx
- movq %rdx, %rsp
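+ /*
+ * Only the "NMI executing" slot separates RSP from the outer NMI's
+ * "iret" frame here, so after stepping past that slot the pushes
+ * below land on the "iret" frame. The %rdx value pushed as the new
+ * RSP points at "outermost RIP", which is where repeat_nmi expects
+ * the stack to be.
+ */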
+ subq $8, %rsp
 leaq -10*8(%rsp), %rdx
 pushq $__KERNEL_DS
 pushq %rdx
@@ -1318,61 +1434,42 @@ nested_nmi:
 nested_nmi_out:
 popq %rdx

- /* No need to check faults here */
+ /* We are returning to kernel mode, so this cannot result in a fault. */
 INTERRUPT_RETURN

 first_nmi:
- /*
- * Because nested NMIs will use the pushed location that we
- * stored in rdx, we must keep that space available.
- * Here's what our stack frame will look like:
- * +-------------------------+
- * | original SS |
- * | original Return RSP |
- * | original RFLAGS |
- * | original CS |
- * | original RIP |
- * +-------------------------+
- * | temp storage for rdx |
- * +-------------------------+
- * | NMI executing variable |
- * +-------------------------+
- * | copied SS |
- * | copied Return RSP |
- * | copied RFLAGS |
- * | copied CS |
- * | copied RIP |
- * +-------------------------+
- * | Saved SS |
- * | Saved Return RSP |
- * | Saved RFLAGS |
- * | Saved CS |
- * | Saved RIP |
- * +-------------------------+
- * | pt_regs |
- * +-------------------------+
- *
- * The saved stack frame is used to fix up the copied stack frame
- * that a nested NMI may change to make the interrupted NMI iret jump
- * to the repeat_nmi. The original stack frame and the temp storage
- * is also used by nested NMIs and can not be trusted on exit.
- */
- /* Do not pop rdx, nested NMIs will corrupt that part of the stack */
+ /* Restore rdx. */
 movq (%rsp), %rdx

- /* Set the NMI executing variable on the stack. */
- pushq $1
+ /* Make room for "NMI executing". */
+ pushq $0

- /* Leave room for the "copied" frame */
+ /* Leave room for the "iret" frame */
 subq $(5*8), %rsp

- /* Copy the stack frame to the Saved frame */
+ /* Copy the "original" frame to the "outermost" frame */
 .rept 5
 pushq 11*8(%rsp)
 .endr

 /* Everything up to here is safe from nested NMIs */

+#ifdef CONFIG_DEBUG_ENTRY
+ /*
+ * For ease of testing, unmask NMIs right away. Disabled by
+ * default because IRET is very expensive.
+ */
+ pushq $0 /* SS */
+ pushq %rsp /* RSP (minus 8 because of the previous push) */
+ addq $8, (%rsp) /* Fix up RSP */
+ pushfq /* RFLAGS */
+ pushq $__KERNEL_CS /* CS */
+ pushq $1f /* RIP */
+ INTERRUPT_RETURN /* continues at repeat_nmi below */
+1:
+#endif
+
+repeat_nmi:
 /*
 * If there was a nested NMI, the first NMI's iret will return
 * here. But NMIs are still enabled and we can take another
@@ -1381,16 +1478,20 @@ first_nmi:
 * it will just return, as we are about to repeat an NMI anyway.
 * This makes it safe to copy to the stack frame that a nested
 * NMI will update.
+ *
+ * RSP is pointing to "outermost RIP". gsbase is unknown, but, if
+ * we're repeating an NMI, gsbase has the same value that it had on
+ * the first iteration. paranoid_entry will load the kernel
+ * gsbase if needed before we call do_nmi. "NMI executing"
+ * is zero.
 */
-repeat_nmi:
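+ /*
+ * RSP points at "outermost RIP"; ten slots up (five "outermost"
+ * slots plus five "iret" slots) is the "NMI executing" variable.
+ */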
+ movq $1, 10*8(%rsp) /* Set "NMI executing". */
+
 /*
- * Update the stack variable to say we are still in NMI (the update
- * is benign for the non-repeat case, where 1 was pushed just above
- * to this very stack slot).
+ * Copy the "outermost" frame to the "iret" frame. NMIs that nest
+ * here must not modify the "iret" frame while we're writing to
+ * it or it will end up containing garbage.
 */
- movq $1, 10*8(%rsp)
-
- /* Make another copy, this one may be modified by nested NMIs */
 addq $(10*8), %rsp
 .rept 5
 pushq -6*8(%rsp)
@@ -1399,9 +1500,9 @@ repeat_nmi:
 end_repeat_nmi:

 /*
- * Everything below this point can be preempted by a nested
- * NMI if the first NMI took an exception and reset our iret stack
- * so that we repeat another NMI.
+ * Everything below this point can be preempted by a nested NMI.
+ * If this happens, then the inner NMI will change the "iret"
+ * frame to point back to repeat_nmi.
 */
 pushq $-1 /* ORIG_RAX: no syscall to restart */
 ALLOC_PT_GPREGS_ON_STACK
@@ -1415,28 +1516,11 @@ end_repeat_nmi:
 */
 call paranoid_entry

- /*
- * Save off the CR2 register. If we take a page fault in the NMI then
- * it could corrupt the CR2 value. If the NMI preempts a page fault
- * handler before it was able to read the CR2 register, and then the
- * NMI itself takes a page fault, the page fault that was preempted
- * will read the information from the NMI page fault and not the
- * origin fault. Save it off and restore it if it changes.
- * Use the r12 callee-saved register.
- */
- movq %cr2, %r12
-
 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
 movq %rsp, %rdi
 movq $-1, %rsi
 call do_nmi

- /* Did the NMI take a page fault? Restore cr2 if it did */
- movq %cr2, %rcx
- cmpq %rcx, %r12
- je 1f
- movq %r12, %cr2
-1:
 testl %ebx, %ebx /* swapgs needed? */
 jnz nmi_restore
 nmi_swapgs:
@@ -1444,11 +1528,26 @@ nmi_swapgs:
 nmi_restore:
 RESTORE_EXTRA_REGS
 RESTORE_C_REGS
- /* Pop the extra iret frame at once */
+
+ /* Point RSP at the "iret" frame. */
 REMOVE_PT_GPREGS_FROM_STACK 6*8

- /* Clear the NMI executing stack variable */
- movq $0, 5*8(%rsp)
+ /*
+ * Clear "NMI executing". Set DF first so that we can easily
+ * distinguish the remaining code between here and IRET from
+ * the SYSCALL entry and exit paths. On a native kernel, we
+ * could just inspect RIP, but, on paravirt kernels,
+ * INTERRUPT_RETURN can translate into a jump into a
+ * hypercall page.
+ */
+ std
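+ /*
+ * RSP now points at the 5-slot "iret" frame, so "NMI executing"
+ * sits just above it at 5*8(%rsp).
+ */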
+ movq $0, 5*8(%rsp) /* clear "NMI executing" */
+
+ /*
+ * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
+ * stack in a single instruction. We are returning to kernel
+ * mode, so this cannot result in a fault.
+ */
 INTERRUPT_RETURN
 END(nmi)