
Merge branch 'x86/asm' into locking/core

Upcoming changes to static keys are interacting/conflicting with the following
pending TSC commits in tip:x86/asm:

  4ea1636b04db x86/asm/tsc: Rename native_read_tsc() to rdtsc()
  ...

So merge it into the locking tree to have a smoother resolution.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
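
For reference, the conflicting TSC series renames the raw TSC read helpers, so call sites change roughly as sketched below (illustrative only, not part of this merge; the old and new helper names are the ones removed/added in the <asm/msr.h> hunk further down):

#include <linux/types.h>
#include <asm/msr.h>

static u64 sample_tsc(void)
{
	u64 t;

	/* Old API (removed below): rdtscll(t); or t = __native_read_tsc(); */
	/* New API after 4ea1636b04db: */
	t = rdtsc();

	return t;
}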
Ingo Molnar, 10 years ago
parent
commit f320ead76a
93 changed files with 2076 additions and 1261 deletions
  1. arch/um/include/shared/kern_util.h (+2 -1)
  2. arch/um/kernel/process.c (+4 -2)
  3. arch/um/kernel/signal.c (+1 -7)
  4. arch/um/kernel/tlb.c (+1 -1)
  5. arch/um/kernel/trap.c (+1 -1)
  6. arch/x86/Kconfig (+49 -11)
  7. arch/x86/Makefile (+10 -3)
  8. arch/x86/boot/compressed/aslr.c (+1 -1)
  9. arch/x86/entry/Makefile (+1 -0)
  10. arch/x86/entry/calling.h (+0 -9)
  11. arch/x86/entry/common.c (+375 -0)
  12. arch/x86/entry/entry_32.S (+1 -23)
  13. arch/x86/entry/entry_64.S (+53 -144)
  14. arch/x86/entry/entry_64_compat.S (+52 -9)
  15. arch/x86/entry/syscalls/syscall_32.tbl (+15 -0)
  16. arch/x86/entry/vdso/Makefile (+3 -3)
  17. arch/x86/entry/vdso/vclock_gettime.c (+2 -14)
  18. arch/x86/entry/vdso/vma.c (+5 -2)
  19. arch/x86/entry/vsyscall/vsyscall_64.c (+1 -1)
  20. arch/x86/ia32/ia32_signal.c (+0 -93)
  21. arch/x86/include/asm/barrier.h (+0 -11)
  22. arch/x86/include/asm/context_tracking.h (+0 -10)
  23. arch/x86/include/asm/elf.h (+6 -11)
  24. arch/x86/include/asm/ia32.h (+0 -9)
  25. arch/x86/include/asm/irq_vectors.h (+0 -10)
  26. arch/x86/include/asm/math_emu.h (+1 -5)
  27. arch/x86/include/asm/mmu.h (+2 -0)
  28. arch/x86/include/asm/mmu_context.h (+21 -7)
  29. arch/x86/include/asm/msr.h (+39 -22)
  30. arch/x86/include/asm/paravirt.h (+0 -34)
  31. arch/x86/include/asm/paravirt_types.h (+0 -2)
  32. arch/x86/include/asm/processor.h (+3 -10)
  33. arch/x86/include/asm/pvclock.h (+2 -8)
  34. arch/x86/include/asm/sigframe.h (+10 -0)
  35. arch/x86/include/asm/signal.h (+1 -0)
  36. arch/x86/include/asm/stackprotector.h (+1 -1)
  37. arch/x86/include/asm/syscalls.h (+1 -0)
  38. arch/x86/include/asm/thread_info.h (+7 -4)
  39. arch/x86/include/asm/traps.h (+2 -2)
  40. arch/x86/include/asm/tsc.h (+1 -17)
  41. arch/x86/include/asm/vm86.h (+33 -24)
  42. arch/x86/kernel/Makefile (+3 -1)
  43. arch/x86/kernel/apb_timer.c (+4 -4)
  44. arch/x86/kernel/apic/apic.c (+4 -4)
  45. arch/x86/kernel/cpu/amd.c (+3 -3)
  46. arch/x86/kernel/cpu/mcheck/mce.c (+4 -5)
  47. arch/x86/kernel/cpu/mcheck/p5.c (+2 -3)
  48. arch/x86/kernel/cpu/mcheck/winchip.c (+2 -2)
  49. arch/x86/kernel/cpu/perf_event.c (+5 -1)
  50. arch/x86/kernel/espfix_64.c (+1 -1)
  51. arch/x86/kernel/hpet.c (+2 -2)
  52. arch/x86/kernel/irq.c (+15 -0)
  53. arch/x86/kernel/nmi.c (+5 -5)
  54. arch/x86/kernel/paravirt.c (+0 -2)
  55. arch/x86/kernel/paravirt_patch_32.c (+0 -2)
  56. arch/x86/kernel/process.c (+3 -0)
  57. arch/x86/kernel/process_32.c (+1 -0)
  58. arch/x86/kernel/process_64.c (+4 -2)
  59. arch/x86/kernel/ptrace.c (+75 -265)
  60. arch/x86/kernel/signal.c (+5 -28)
  61. arch/x86/kernel/signal_compat.c (+95 -0)
  62. arch/x86/kernel/step.c (+2 -0)
  63. arch/x86/kernel/trace_clock.c (+1 -6)
  64. arch/x86/kernel/traps.c (+29 -59)
  65. arch/x86/kernel/tsc.c (+3 -9)
  66. arch/x86/kernel/tsc_sync.c (+6 -8)
  67. arch/x86/kernel/vm86_32.c (+183 -190)
  68. arch/x86/kvm/lapic.c (+2 -2)
  69. arch/x86/kvm/svm.c (+2 -2)
  70. arch/x86/kvm/vmx.c (+2 -2)
  71. arch/x86/kvm/x86.c (+7 -19)
  72. arch/x86/lib/delay.c (+5 -8)
  73. arch/x86/math-emu/get_address.c (+1 -0)
  74. arch/x86/mm/fault.c (+5 -2)
  75. arch/x86/um/asm/barrier.h (+0 -13)
  76. arch/x86/xen/enlighten.c (+0 -3)
  77. drivers/cpufreq/intel_pstate.c (+1 -1)
  78. drivers/input/gameport/gameport.c (+2 -2)
  79. drivers/input/joystick/analog.c (+2 -2)
  80. drivers/net/hamradio/baycom_epp.c (+1 -1)
  81. drivers/scsi/dpt_i2o.c (+3 -0)
  82. drivers/staging/media/lirc/lirc_serial.c (+4 -59)
  83. drivers/thermal/intel_powerclamp.c (+2 -2)
  84. include/linux/context_tracking.h (+15 -0)
  85. include/linux/context_tracking_state.h (+1 -0)
  86. include/linux/spinlock.h (+15 -15)
  87. kernel/notifier.c (+2 -0)
  88. kernel/sys_ni.c (+1 -0)
  89. tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c (+2 -2)
  90. tools/testing/selftests/x86/Makefile (+2 -2)
  91. tools/testing/selftests/x86/entry_from_vm86.c (+129 -10)
  92. tools/testing/selftests/x86/ldt_gdt.c (+576 -0)
  93. tools/testing/selftests/x86/syscall_arg_fault.c (+130 -0)

+ 2 - 1
arch/um/include/shared/kern_util.h

@@ -22,7 +22,8 @@ extern int kmalloc_ok;
 extern unsigned long alloc_stack(int order, int atomic);
 extern void free_stack(unsigned long stack, int order);
 
-extern int do_signal(void);
+struct pt_regs;
+extern void do_signal(struct pt_regs *regs);
 extern void interrupt_end(void);
 extern void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs);
 

+ 4 - 2
arch/um/kernel/process.c

@@ -90,12 +90,14 @@ void *__switch_to(struct task_struct *from, struct task_struct *to)
 
 void interrupt_end(void)
 {
+	struct pt_regs *regs = &current->thread.regs;
+
 	if (need_resched())
 		schedule();
 	if (test_thread_flag(TIF_SIGPENDING))
-		do_signal();
+		do_signal(regs);
 	if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME))
-		tracehook_notify_resume(&current->thread.regs);
+		tracehook_notify_resume(regs);
 }
 
 void exit_thread(void)

+ 1 - 7
arch/um/kernel/signal.c

@@ -64,7 +64,7 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 	signal_setup_done(err, ksig, singlestep);
 }
 
-static int kern_do_signal(struct pt_regs *regs)
+void do_signal(struct pt_regs *regs)
 {
 	struct ksignal ksig;
 	int handled_sig = 0;
@@ -110,10 +110,4 @@ static int kern_do_signal(struct pt_regs *regs)
 	 */
 	if (!handled_sig)
 		restore_saved_sigmask();
-	return handled_sig;
-}
-
-int do_signal(void)
-{
-	return kern_do_signal(&current->thread.regs);
 }

+ 1 - 1
arch/um/kernel/tlb.c

@@ -291,7 +291,7 @@ void fix_range_common(struct mm_struct *mm, unsigned long start_addr,
 		/* We are under mmap_sem, release it such that current can terminate */
 		up_write(&current->mm->mmap_sem);
 		force_sig(SIGKILL, current);
-		do_signal();
+		do_signal(&current->thread.regs);
 	}
 }
 

+ 1 - 1
arch/um/kernel/trap.c

@@ -173,7 +173,7 @@ static void bad_segv(struct faultinfo fi, unsigned long ip)
 void fatal_sigsegv(void)
 {
 	force_sigsegv(SIGSEGV, current);
-	do_signal();
+	do_signal(&current->thread.regs);
 	/*
 	 * This is to tell gcc that we're not returning - do_signal
 	 * can, in general, return, but in this case, it's not, since

+ 49 - 11
arch/x86/Kconfig

@@ -133,7 +133,7 @@ config X86
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_SYSCALL_TRACEPOINTS
-	select HAVE_UID16			if X86_32
+	select HAVE_UID16			if X86_32 || IA32_EMULATION
 	select HAVE_UNSTABLE_SCHED_CLOCK
 	select HAVE_UNSTABLE_SCHED_CLOCK
 	select HAVE_USER_RETURN_NOTIFIER
 	select HAVE_USER_RETURN_NOTIFIER
 	select IRQ_FORCED_THREADING
 	select IRQ_FORCED_THREADING
@@ -1002,19 +1002,41 @@ config X86_THERMAL_VECTOR
 	def_bool y
 	def_bool y
 	depends on X86_MCE_INTEL
 	depends on X86_MCE_INTEL
 
 
-config VM86
-	bool "Enable VM86 support" if EXPERT
-	default y
+config X86_LEGACY_VM86
+	bool "Legacy VM86 support (obsolete)"
+	default n
 	depends on X86_32
 	depends on X86_32
 	---help---
 	---help---
-	  This option is required by programs like DOSEMU to run
-	  16-bit real mode legacy code on x86 processors. It also may
-	  be needed by software like XFree86 to initialize some video
-	  cards via BIOS. Disabling this option saves about 6K.
+	  This option allows user programs to put the CPU into V8086
+	  mode, which is an 80286-era approximation of 16-bit real mode.
+
+	  Some very old versions of X and/or vbetool require this option
+	  for user mode setting.  Similarly, DOSEMU will use it if
+	  available to accelerate real mode DOS programs.  However, any
+	  recent version of DOSEMU, X, or vbetool should be fully
+	  functional even without kernel VM86 support, as they will all
+	  fall back to (pretty well performing) software emulation.
+
+	  Anything that works on a 64-bit kernel is unlikely to need
+	  this option, as 64-bit kernels don't, and can't, support V8086
+	  mode.  This option is also unrelated to 16-bit protected mode
+	  and is not needed to run most 16-bit programs under Wine.
+
+	  Enabling this option adds considerable attack surface to the
+	  kernel and slows down system calls and exception handling.
+
+	  Unless you use very old userspace or need the last drop of
+	  performance in your real mode DOS games and can't use KVM,
+	  say N here.
+
+config VM86
+       bool
+       default X86_LEGACY_VM86
 
 
 config X86_16BIT
 config X86_16BIT
 	bool "Enable support for 16-bit segments" if EXPERT
 	bool "Enable support for 16-bit segments" if EXPERT
 	default y
 	default y
+	depends on MODIFY_LDT_SYSCALL
 	---help---
 	---help---
 	  This option is required by programs like Wine to run 16-bit
 	  This option is required by programs like Wine to run 16-bit
 	  protected mode legacy code on x86 processors.  Disabling
 	  protected mode legacy code on x86 processors.  Disabling
@@ -1509,6 +1531,7 @@ config X86_RESERVE_LOW
 
 
 config MATH_EMULATION
 config MATH_EMULATION
 	bool
 	bool
+	depends on MODIFY_LDT_SYSCALL
 	prompt "Math emulation" if X86_32
 	prompt "Math emulation" if X86_32
 	---help---
 	---help---
 	  Linux can emulate a math coprocessor (used for floating point
 	  Linux can emulate a math coprocessor (used for floating point
@@ -2053,6 +2076,22 @@ config CMDLINE_OVERRIDE
 	  This is used to work around broken boot loaders.  This should
 	  This is used to work around broken boot loaders.  This should
 	  be set to 'N' under normal conditions.
 	  be set to 'N' under normal conditions.
 
 
+config MODIFY_LDT_SYSCALL
+	bool "Enable the LDT (local descriptor table)" if EXPERT
+	default y
+	---help---
+	  Linux can allow user programs to install a per-process x86
+	  Local Descriptor Table (LDT) using the modify_ldt(2) system
+	  call.  This is required to run 16-bit or segmented code such as
+	  DOSEMU or some Wine programs.  It is also used by some very old
+	  threading libraries.
+
+	  Enabling this feature adds a small amount of overhead to
+	  context switches and increases the low-level kernel attack
+	  surface.  Disabling it removes the modify_ldt(2) system call.
+
+	  Saying 'N' here may make sense for embedded or server kernels.
+
 source "kernel/livepatch/Kconfig"
 source "kernel/livepatch/Kconfig"
 
 
 endmenu
 endmenu
@@ -2522,7 +2561,7 @@ config IA32_EMULATION
 	depends on X86_64
 	depends on X86_64
 	select BINFMT_ELF
 	select BINFMT_ELF
 	select COMPAT_BINFMT_ELF
 	select COMPAT_BINFMT_ELF
-	select HAVE_UID16
+	select ARCH_WANT_OLD_COMPAT_IPC
 	---help---
 	---help---
 	  Include code to run legacy 32-bit programs under a
 	  Include code to run legacy 32-bit programs under a
 	  64-bit kernel. You should likely turn this on, unless you're
 	  64-bit kernel. You should likely turn this on, unless you're
@@ -2536,7 +2575,7 @@ config IA32_AOUT
 
 
 config X86_X32
 config X86_X32
 	bool "x32 ABI for 64-bit mode"
 	bool "x32 ABI for 64-bit mode"
-	depends on X86_64 && IA32_EMULATION
+	depends on X86_64
 	---help---
 	---help---
 	  Include code to run binaries for the x32 native 32-bit ABI
 	  Include code to run binaries for the x32 native 32-bit ABI
 	  for 64-bit processors.  An x32 process gets access to the
 	  for 64-bit processors.  An x32 process gets access to the
@@ -2550,7 +2589,6 @@ config X86_X32
 config COMPAT
 config COMPAT
 	def_bool y
 	def_bool y
 	depends on IA32_EMULATION || X86_X32
 	depends on IA32_EMULATION || X86_X32
-	select ARCH_WANT_OLD_COMPAT_IPC
 
 
 if COMPAT
 if COMPAT
 config COMPAT_FOR_U64_ALIGNMENT
 config COMPAT_FOR_U64_ALIGNMENT
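
The MODIFY_LDT_SYSCALL help text added above refers to modify_ldt(2) as the interface 16-bit and segmented code uses to install per-process descriptors. A rough user-space sketch of that call follows (hypothetical example; the tools/testing/selftests/x86/ldt_gdt.c selftest added by this merge exercises the same interface far more thoroughly):

#include <asm/ldt.h>		/* struct user_desc */
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

/* Install one 32-bit flat data segment in slot 0 of the per-process LDT. */
static long install_ldt_entry(void)
{
	struct user_desc desc;

	memset(&desc, 0, sizeof(desc));
	desc.entry_number   = 0;
	desc.base_addr      = 0;
	desc.limit          = 0xfffff;
	desc.seg_32bit      = 1;
	desc.limit_in_pages = 1;
	desc.useable        = 1;

	/* func 1 == write; expect -ENOSYS when MODIFY_LDT_SYSCALL=n. */
	return syscall(SYS_modify_ldt, 1, &desc, sizeof(desc));
}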

+ 10 - 3
arch/x86/Makefile

@@ -39,6 +39,16 @@ ifdef CONFIG_X86_NEED_RELOCS
         LDFLAGS_vmlinux := --emit-relocs
         LDFLAGS_vmlinux := --emit-relocs
 endif
 endif
 
 
+#
+# Prevent GCC from generating any FP code by mistake.
+#
+# This must happen before we try the -mpreferred-stack-boundary, see:
+#
+#    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
+#
+KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
+KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
+
 ifeq ($(CONFIG_X86_32),y)
 ifeq ($(CONFIG_X86_32),y)
         BITS := 32
         BITS := 32
         UTS_MACHINE := i386
         UTS_MACHINE := i386
@@ -167,9 +177,6 @@ KBUILD_CFLAGS += -pipe
 KBUILD_CFLAGS += -Wno-sign-compare
 KBUILD_CFLAGS += -Wno-sign-compare
 #
 #
 KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
 KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
-# prevent gcc from generating any FP code by mistake
-KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
-KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
 
 
 KBUILD_CFLAGS += $(mflags-y)
 KBUILD_CFLAGS += $(mflags-y)
 KBUILD_AFLAGS += $(mflags-y)
 KBUILD_AFLAGS += $(mflags-y)

+ 1 - 1
arch/x86/boot/compressed/aslr.c

@@ -82,7 +82,7 @@ static unsigned long get_random_long(void)
 
 	if (has_cpuflag(X86_FEATURE_TSC)) {
 		debug_putstr(" RDTSC");
-		rdtscll(raw);
+		raw = rdtsc();
 
 		random ^= raw;
 		use_i8254 = false;

+ 1 - 0
arch/x86/entry/Makefile

@@ -2,6 +2,7 @@
 # Makefile for the x86 low level entry code
 #
 obj-y				:= entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o
+obj-y				+= common.o
 
 obj-y				+= vdso/
 obj-y				+= vsyscall/

+ 0 - 9
arch/x86/entry/calling.h

@@ -135,9 +135,6 @@ For 32-bit we have the following conventions - kernel is built with
 	movq %rbp, 4*8+\offset(%rsp)
 	movq %rbp, 4*8+\offset(%rsp)
 	movq %rbx, 5*8+\offset(%rsp)
 	movq %rbx, 5*8+\offset(%rsp)
 	.endm
 	.endm
-	.macro SAVE_EXTRA_REGS_RBP offset=0
-	movq %rbp, 4*8+\offset(%rsp)
-	.endm
 
 
 	.macro RESTORE_EXTRA_REGS offset=0
 	.macro RESTORE_EXTRA_REGS offset=0
 	movq 0*8+\offset(%rsp), %r15
 	movq 0*8+\offset(%rsp), %r15
@@ -193,12 +190,6 @@ For 32-bit we have the following conventions - kernel is built with
 	.macro RESTORE_C_REGS_EXCEPT_RCX_R11
 	.macro RESTORE_C_REGS_EXCEPT_RCX_R11
 	RESTORE_C_REGS_HELPER 1,0,0,1,1
 	RESTORE_C_REGS_HELPER 1,0,0,1,1
 	.endm
 	.endm
-	.macro RESTORE_RSI_RDI
-	RESTORE_C_REGS_HELPER 0,0,0,0,0
-	.endm
-	.macro RESTORE_RSI_RDI_RDX
-	RESTORE_C_REGS_HELPER 0,0,0,0,1
-	.endm
 
 
 	.macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
 	.macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
 	subq $-(15*8+\addskip), %rsp
 	subq $-(15*8+\addskip), %rsp

+ 375 - 0
arch/x86/entry/common.c

@@ -0,0 +1,375 @@
+/*
+ * common.c - C code for kernel entry and exit
+ * Copyright (c) 2015 Andrew Lutomirski
+ * GPL v2
+ *
+ * Based on asm and ptrace code by many authors.  The code here originated
+ * in ptrace.c and signal.c.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/errno.h>
+#include <linux/ptrace.h>
+#include <linux/tracehook.h>
+#include <linux/audit.h>
+#include <linux/seccomp.h>
+#include <linux/signal.h>
+#include <linux/export.h>
+#include <linux/context_tracking.h>
+#include <linux/user-return-notifier.h>
+#include <linux/uprobes.h>
+
+#include <asm/desc.h>
+#include <asm/traps.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
+
+#ifdef CONFIG_CONTEXT_TRACKING
+/* Called on entry from user mode with IRQs off. */
+__visible void enter_from_user_mode(void)
+{
+	CT_WARN_ON(ct_state() != CONTEXT_USER);
+	user_exit();
+}
+#endif
+
+static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
+{
+#ifdef CONFIG_X86_64
+	if (arch == AUDIT_ARCH_X86_64) {
+		audit_syscall_entry(regs->orig_ax, regs->di,
+				    regs->si, regs->dx, regs->r10);
+	} else
+#endif
+	{
+		audit_syscall_entry(regs->orig_ax, regs->bx,
+				    regs->cx, regs->dx, regs->si);
+	}
+}
+
+/*
+ * We can return 0 to resume the syscall or anything else to go to phase
+ * 2.  If we resume the syscall, we need to put something appropriate in
+ * regs->orig_ax.
+ *
+ * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
+ * are fully functional.
+ *
+ * For phase 2's benefit, our return value is:
+ * 0:			resume the syscall
+ * 1:			go to phase 2; no seccomp phase 2 needed
+ * anything else:	go to phase 2; pass return value to seccomp
+ */
+unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
+{
+	unsigned long ret = 0;
+	u32 work;
+
+	BUG_ON(regs != task_pt_regs(current));
+
+	work = ACCESS_ONCE(current_thread_info()->flags) &
+		_TIF_WORK_SYSCALL_ENTRY;
+
+#ifdef CONFIG_CONTEXT_TRACKING
+	/*
+	 * If TIF_NOHZ is set, we are required to call user_exit() before
+	 * doing anything that could touch RCU.
+	 */
+	if (work & _TIF_NOHZ) {
+		enter_from_user_mode();
+		work &= ~_TIF_NOHZ;
+	}
+#endif
+
+#ifdef CONFIG_SECCOMP
+	/*
+	 * Do seccomp first -- it should minimize exposure of other
+	 * code, and keeping seccomp fast is probably more valuable
+	 * than the rest of this.
+	 */
+	if (work & _TIF_SECCOMP) {
+		struct seccomp_data sd;
+
+		sd.arch = arch;
+		sd.nr = regs->orig_ax;
+		sd.instruction_pointer = regs->ip;
+#ifdef CONFIG_X86_64
+		if (arch == AUDIT_ARCH_X86_64) {
+			sd.args[0] = regs->di;
+			sd.args[1] = regs->si;
+			sd.args[2] = regs->dx;
+			sd.args[3] = regs->r10;
+			sd.args[4] = regs->r8;
+			sd.args[5] = regs->r9;
+		} else
+#endif
+		{
+			sd.args[0] = regs->bx;
+			sd.args[1] = regs->cx;
+			sd.args[2] = regs->dx;
+			sd.args[3] = regs->si;
+			sd.args[4] = regs->di;
+			sd.args[5] = regs->bp;
+		}
+
+		BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0);
+		BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1);
+
+		ret = seccomp_phase1(&sd);
+		if (ret == SECCOMP_PHASE1_SKIP) {
+			regs->orig_ax = -1;
+			ret = 0;
+		} else if (ret != SECCOMP_PHASE1_OK) {
+			return ret;  /* Go directly to phase 2 */
+		}
+
+		work &= ~_TIF_SECCOMP;
+	}
+#endif
+
+	/* Do our best to finish without phase 2. */
+	if (work == 0)
+		return ret;  /* seccomp and/or nohz only (ret == 0 here) */
+
+#ifdef CONFIG_AUDITSYSCALL
+	if (work == _TIF_SYSCALL_AUDIT) {
+		/*
+		 * If there is no more work to be done except auditing,
+		 * then audit in phase 1.  Phase 2 always audits, so, if
+		 * we audit here, then we can't go on to phase 2.
+		 */
+		do_audit_syscall_entry(regs, arch);
+		return 0;
+	}
+#endif
+
+	return 1;  /* Something is enabled that we can't handle in phase 1 */
+}
+
+/* Returns the syscall nr to run (which should match regs->orig_ax). */
+long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
+				unsigned long phase1_result)
+{
+	long ret = 0;
+	u32 work = ACCESS_ONCE(current_thread_info()->flags) &
+		_TIF_WORK_SYSCALL_ENTRY;
+
+	BUG_ON(regs != task_pt_regs(current));
+
+	/*
+	 * If we stepped into a sysenter/syscall insn, it trapped in
+	 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
+	 * If user-mode had set TF itself, then it's still clear from
+	 * do_debug() and we need to set it again to restore the user
+	 * state.  If we entered on the slow path, TF was already set.
+	 */
+	if (work & _TIF_SINGLESTEP)
+		regs->flags |= X86_EFLAGS_TF;
+
+#ifdef CONFIG_SECCOMP
+	/*
+	 * Call seccomp_phase2 before running the other hooks so that
+	 * they can see any changes made by a seccomp tracer.
+	 */
+	if (phase1_result > 1 && seccomp_phase2(phase1_result)) {
+		/* seccomp failures shouldn't expose any additional code. */
+		return -1;
+	}
+#endif
+
+	if (unlikely(work & _TIF_SYSCALL_EMU))
+		ret = -1L;
+
+	if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
+	    tracehook_report_syscall_entry(regs))
+		ret = -1L;
+
+	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+		trace_sys_enter(regs, regs->orig_ax);
+
+	do_audit_syscall_entry(regs, arch);
+
+	return ret ?: regs->orig_ax;
+}
+
+long syscall_trace_enter(struct pt_regs *regs)
+{
+	u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
+	unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch);
+
+	if (phase1_result == 0)
+		return regs->orig_ax;
+	else
+		return syscall_trace_enter_phase2(regs, arch, phase1_result);
+}
+
+/* Deprecated. */
+void syscall_trace_leave(struct pt_regs *regs)
+{
+	bool step;
+
+	/*
+	 * We may come here right after calling schedule_user()
+	 * or do_notify_resume(), in which case we can be in RCU
+	 * user mode.
+	 */
+	user_exit();
+
+	audit_syscall_exit(regs);
+
+	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+		trace_sys_exit(regs, regs->ax);
+
+	/*
+	 * If TIF_SYSCALL_EMU is set, we only get here because of
+	 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
+	 * We already reported this syscall instruction in
+	 * syscall_trace_enter().
+	 */
+	step = unlikely(test_thread_flag(TIF_SINGLESTEP)) &&
+			!test_thread_flag(TIF_SYSCALL_EMU);
+	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
+		tracehook_report_syscall_exit(regs, step);
+
+	user_enter();
+}
+
+static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs)
+{
+	unsigned long top_of_stack =
+		(unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING;
+	return (struct thread_info *)(top_of_stack - THREAD_SIZE);
+}
+
+/* Called with IRQs disabled. */
+__visible void prepare_exit_to_usermode(struct pt_regs *regs)
+{
+	if (WARN_ON(!irqs_disabled()))
+		local_irq_disable();
+
+	/*
+	 * In order to return to user mode, we need to have IRQs off with
+	 * none of _TIF_SIGPENDING, _TIF_NOTIFY_RESUME, _TIF_USER_RETURN_NOTIFY,
+	 * _TIF_UPROBE, or _TIF_NEED_RESCHED set.  Several of these flags
+	 * can be set at any time on preemptable kernels if we have IRQs on,
+	 * so we need to loop.  Disabling preemption wouldn't help: doing the
+	 * work to clear some of the flags can sleep.
+	 */
+	while (true) {
+		u32 cached_flags =
+			READ_ONCE(pt_regs_to_thread_info(regs)->flags);
+
+		if (!(cached_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME |
+				      _TIF_UPROBE | _TIF_NEED_RESCHED |
+				      _TIF_USER_RETURN_NOTIFY)))
+			break;
+
+		/* We have work to do. */
+		local_irq_enable();
+
+		if (cached_flags & _TIF_NEED_RESCHED)
+			schedule();
+
+		if (cached_flags & _TIF_UPROBE)
+			uprobe_notify_resume(regs);
+
+		/* deal with pending signal delivery */
+		if (cached_flags & _TIF_SIGPENDING)
+			do_signal(regs);
+
+		if (cached_flags & _TIF_NOTIFY_RESUME) {
+			clear_thread_flag(TIF_NOTIFY_RESUME);
+			tracehook_notify_resume(regs);
+		}
+
+		if (cached_flags & _TIF_USER_RETURN_NOTIFY)
+			fire_user_return_notifiers();
+
+		/* Disable IRQs and retry */
+		local_irq_disable();
+	}
+
+	user_enter();
+}
+
+/*
+ * Called with IRQs on and fully valid regs.  Returns with IRQs off in a
+ * state such that we can immediately switch to user mode.
+ */
+__visible void syscall_return_slowpath(struct pt_regs *regs)
+{
+	struct thread_info *ti = pt_regs_to_thread_info(regs);
+	u32 cached_flags = READ_ONCE(ti->flags);
+	bool step;
+
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+
+	if (WARN(irqs_disabled(), "syscall %ld left IRQs disabled",
+		 regs->orig_ax))
+		local_irq_enable();
+
+	/*
+	 * First do one-time work.  If these work items are enabled, we
+	 * want to run them exactly once per syscall exit with IRQs on.
+	 */
+	if (cached_flags & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT |
+			    _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)) {
+		audit_syscall_exit(regs);
+
+		if (cached_flags & _TIF_SYSCALL_TRACEPOINT)
+			trace_sys_exit(regs, regs->ax);
+
+		/*
+		 * If TIF_SYSCALL_EMU is set, we only get here because of
+		 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
+		 * We already reported this syscall instruction in
+		 * syscall_trace_enter().
+		 */
+		step = unlikely(
+			(cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU))
+			== _TIF_SINGLESTEP);
+		if (step || cached_flags & _TIF_SYSCALL_TRACE)
+			tracehook_report_syscall_exit(regs, step);
+	}
+
+#ifdef CONFIG_COMPAT
+	/*
+	 * Compat syscalls set TS_COMPAT.  Make sure we clear it before
+	 * returning to user mode.
+	 */
+	ti->status &= ~TS_COMPAT;
+#endif
+
+	local_irq_disable();
+	prepare_exit_to_usermode(regs);
+}
+
+/*
+ * Deprecated notification of userspace execution resumption
+ * - triggered by the TIF_WORK_MASK flags
+ */
+__visible void
+do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
+{
+	user_exit();
+
+	if (thread_info_flags & _TIF_UPROBE)
+		uprobe_notify_resume(regs);
+
+	/* deal with pending signal delivery */
+	if (thread_info_flags & _TIF_SIGPENDING)
+		do_signal(regs);
+
+	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+		clear_thread_flag(TIF_NOTIFY_RESUME);
+		tracehook_notify_resume(regs);
+	}
+	if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
+		fire_user_return_notifiers();
+
+	user_enter();
+}
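
Taken together, the helpers above give the assembly stubs a small C-level protocol: trace/seccomp/audit on entry, dispatch, then funnel every return to user mode through syscall_return_slowpath()/prepare_exit_to_usermode(). A condensed, illustrative caller is sketched below; it is not code from this commit (the real callers are the stubs in entry_64.S and entry_64_compat.S), and the dispatch line assumes the usual x86-64 register/argument layout:

/* Illustrative sketch only. */
static void example_syscall_slowpath(struct pt_regs *regs)
{
	/*
	 * Phase 1/2 work: context tracking, seccomp, ptrace, audit.
	 * May rewrite the syscall nr or veto it by returning -1.
	 */
	long nr = syscall_trace_enter(regs);

	if (nr >= 0 && nr < NR_syscalls)
		regs->ax = sys_call_table[nr](regs->di, regs->si, regs->dx,
					      regs->r10, regs->r8, regs->r9);

	/* One-time exit work, then the exit-to-user loop; IRQs off on return. */
	syscall_return_slowpath(regs);
}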

+ 1 - 23
arch/x86/entry/entry_32.S

@@ -525,34 +525,12 @@ work_resched:
 
 
 work_notifysig:					# deal with pending signals and
 work_notifysig:					# deal with pending signals and
 						# notify-resume requests
 						# notify-resume requests
-#ifdef CONFIG_VM86
-	testl	$X86_EFLAGS_VM, PT_EFLAGS(%esp)
-	movl	%esp, %eax
-	jnz	work_notifysig_v86		# returning to kernel-space or
-						# vm86-space
-1:
-#else
-	movl	%esp, %eax
-#endif
 	TRACE_IRQS_ON
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	movb	PT_CS(%esp), %bl
-	andb	$SEGMENT_RPL_MASK, %bl
-	cmpb	$USER_RPL, %bl
-	jb	resume_kernel
+	movl	%esp, %eax
 	xorl	%edx, %edx
 	xorl	%edx, %edx
 	call	do_notify_resume
 	call	do_notify_resume
 	jmp	resume_userspace
 	jmp	resume_userspace
-
-#ifdef CONFIG_VM86
-	ALIGN
-work_notifysig_v86:
-	pushl	%ecx				# save ti_flags for do_notify_resume
-	call	save_v86_state			# %eax contains pt_regs pointer
-	popl	%ecx
-	movl	%eax, %esp
-	jmp	1b
-#endif
 END(work_pending)
 END(work_pending)
 
 
 	# perform syscall exit tracing
 	# perform syscall exit tracing

+ 53 - 144
arch/x86/entry/entry_64.S

@@ -33,7 +33,6 @@
 #include <asm/paravirt.h>
 #include <asm/paravirt.h>
 #include <asm/percpu.h>
 #include <asm/percpu.h>
 #include <asm/asm.h>
 #include <asm/asm.h>
-#include <asm/context_tracking.h>
 #include <asm/smap.h>
 #include <asm/smap.h>
 #include <asm/pgtable_types.h>
 #include <asm/pgtable_types.h>
 #include <linux/err.h>
 #include <linux/err.h>
@@ -229,6 +228,11 @@ entry_SYSCALL_64_fastpath:
 	 */
 	 */
 	USERGS_SYSRET64
 	USERGS_SYSRET64
 
 
+GLOBAL(int_ret_from_sys_call_irqs_off)
+	TRACE_IRQS_ON
+	ENABLE_INTERRUPTS(CLBR_NONE)
+	jmp int_ret_from_sys_call
+
 	/* Do syscall entry tracing */
 	/* Do syscall entry tracing */
 tracesys:
 tracesys:
 	movq	%rsp, %rdi
 	movq	%rsp, %rdi
@@ -272,69 +276,11 @@ tracesys_phase2:
  * Has correct iret frame.
  * Has correct iret frame.
  */
  */
 GLOBAL(int_ret_from_sys_call)
 GLOBAL(int_ret_from_sys_call)
-	DISABLE_INTERRUPTS(CLBR_NONE)
-int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */
-	TRACE_IRQS_OFF
-	movl	$_TIF_ALLWORK_MASK, %edi
-	/* edi:	mask to check */
-GLOBAL(int_with_check)
-	LOCKDEP_SYS_EXIT_IRQ
-	GET_THREAD_INFO(%rcx)
-	movl	TI_flags(%rcx), %edx
-	andl	%edi, %edx
-	jnz	int_careful
-	andl	$~TS_COMPAT, TI_status(%rcx)
-	jmp	syscall_return
-
-	/*
-	 * Either reschedule or signal or syscall exit tracking needed.
-	 * First do a reschedule test.
-	 * edx:	work, edi: workmask
-	 */
-int_careful:
-	bt	$TIF_NEED_RESCHED, %edx
-	jnc	int_very_careful
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
-	pushq	%rdi
-	SCHEDULE_USER
-	popq	%rdi
-	DISABLE_INTERRUPTS(CLBR_NONE)
-	TRACE_IRQS_OFF
-	jmp	int_with_check
-
-	/* handle signals and tracing -- both require a full pt_regs */
-int_very_careful:
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
 	SAVE_EXTRA_REGS
 	SAVE_EXTRA_REGS
-	/* Check for syscall exit trace */
-	testl	$_TIF_WORK_SYSCALL_EXIT, %edx
-	jz	int_signal
-	pushq	%rdi
-	leaq	8(%rsp), %rdi			/* &ptregs -> arg1 */
-	call	syscall_trace_leave
-	popq	%rdi
-	andl	$~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU), %edi
-	jmp	int_restore_rest
-
-int_signal:
-	testl	$_TIF_DO_NOTIFY_MASK, %edx
-	jz	1f
-	movq	%rsp, %rdi			/* &ptregs -> arg1 */
-	xorl	%esi, %esi			/* oldset -> arg2 */
-	call	do_notify_resume
-1:	movl	$_TIF_WORK_MASK, %edi
-int_restore_rest:
+	movq	%rsp, %rdi
+	call	syscall_return_slowpath	/* returns with IRQs disabled */
 	RESTORE_EXTRA_REGS
 	RESTORE_EXTRA_REGS
-	DISABLE_INTERRUPTS(CLBR_NONE)
-	TRACE_IRQS_OFF
-	jmp	int_with_check
-
-syscall_return:
-	/* The IRETQ could re-enable interrupts: */
-	DISABLE_INTERRUPTS(CLBR_ANY)
-	TRACE_IRQS_IRETQ
+	TRACE_IRQS_IRETQ		/* we're about to change IF */
 
 
 	/*
 	/*
 	 * Try to use SYSRET instead of IRET if we're returning to
 	 * Try to use SYSRET instead of IRET if we're returning to
@@ -555,23 +501,22 @@ END(irq_entries_start)
 /* 0(%rsp): ~(interrupt number) */
 /* 0(%rsp): ~(interrupt number) */
 	.macro interrupt func
 	.macro interrupt func
 	cld
 	cld
-	/*
-	 * Since nothing in interrupt handling code touches r12...r15 members
-	 * of "struct pt_regs", and since interrupts can nest, we can save
-	 * four stack slots and simultaneously provide
-	 * an unwind-friendly stack layout by saving "truncated" pt_regs
-	 * exactly up to rbp slot, without these members.
-	 */
-	ALLOC_PT_GPREGS_ON_STACK -RBP
-	SAVE_C_REGS -RBP
-	/* this goes to 0(%rsp) for unwinder, not for saving the value: */
-	SAVE_EXTRA_REGS_RBP -RBP
-
-	leaq	-RBP(%rsp), %rdi		/* arg1 for \func (pointer to pt_regs) */
+	ALLOC_PT_GPREGS_ON_STACK
+	SAVE_C_REGS
+	SAVE_EXTRA_REGS
 
 
-	testb	$3, CS-RBP(%rsp)
+	testb	$3, CS(%rsp)
 	jz	1f
 	jz	1f
+
+	/*
+	 * IRQ from user mode.  Switch to kernel gsbase and inform context
+	 * tracking that we're in kernel mode.
+	 */
 	SWAPGS
 	SWAPGS
+#ifdef CONFIG_CONTEXT_TRACKING
+	call enter_from_user_mode
+#endif
+
 1:
 1:
 	/*
 	/*
 	 * Save previous stack pointer, optionally switch to interrupt stack.
 	 * Save previous stack pointer, optionally switch to interrupt stack.
@@ -580,14 +525,14 @@ END(irq_entries_start)
 	 * a little cheaper to use a separate counter in the PDA (short of
 	 * a little cheaper to use a separate counter in the PDA (short of
 	 * moving irq_enter into assembly, which would be too much work)
 	 * moving irq_enter into assembly, which would be too much work)
 	 */
 	 */
-	movq	%rsp, %rsi
+	movq	%rsp, %rdi
 	incl	PER_CPU_VAR(irq_count)
 	incl	PER_CPU_VAR(irq_count)
 	cmovzq	PER_CPU_VAR(irq_stack_ptr), %rsp
 	cmovzq	PER_CPU_VAR(irq_stack_ptr), %rsp
-	pushq	%rsi
+	pushq	%rdi
 	/* We entered an interrupt context - irqs are off: */
 	/* We entered an interrupt context - irqs are off: */
 	TRACE_IRQS_OFF
 	TRACE_IRQS_OFF
 
 
-	call	\func
+	call	\func	/* rdi points to pt_regs */
 	.endm
 	.endm
 
 
 	/*
 	/*
@@ -606,34 +551,19 @@ ret_from_intr:
 	decl	PER_CPU_VAR(irq_count)
 	decl	PER_CPU_VAR(irq_count)
 
 
 	/* Restore saved previous stack */
 	/* Restore saved previous stack */
-	popq	%rsi
-	/* return code expects complete pt_regs - adjust rsp accordingly: */
-	leaq	-RBP(%rsi), %rsp
+	popq	%rsp
 
 
 	testb	$3, CS(%rsp)
 	testb	$3, CS(%rsp)
 	jz	retint_kernel
 	jz	retint_kernel
-	/* Interrupt came from user space */
-retint_user:
-	GET_THREAD_INFO(%rcx)
 
 
-	/* %rcx: thread info. Interrupts are off. */
-retint_with_reschedule:
-	movl	$_TIF_WORK_MASK, %edi
-retint_check:
+	/* Interrupt came from user space */
 	LOCKDEP_SYS_EXIT_IRQ
 	LOCKDEP_SYS_EXIT_IRQ
-	movl	TI_flags(%rcx), %edx
-	andl	%edi, %edx
-	jnz	retint_careful
-
-retint_swapgs:					/* return to user-space */
-	/*
-	 * The iretq could re-enable interrupts:
-	 */
-	DISABLE_INTERRUPTS(CLBR_ANY)
+GLOBAL(retint_user)
+	mov	%rsp,%rdi
+	call	prepare_exit_to_usermode
 	TRACE_IRQS_IRETQ
 	TRACE_IRQS_IRETQ
-
 	SWAPGS
 	SWAPGS
-	jmp	restore_c_regs_and_iret
+	jmp	restore_regs_and_iret
 
 
 /* Returning to kernel space */
 /* Returning to kernel space */
 retint_kernel:
 retint_kernel:
@@ -657,6 +587,8 @@ retint_kernel:
  * At this label, code paths which return to kernel and to user,
  * At this label, code paths which return to kernel and to user,
  * which come from interrupts/exception and from syscalls, merge.
  * which come from interrupts/exception and from syscalls, merge.
  */
  */
+restore_regs_and_iret:
+	RESTORE_EXTRA_REGS
 restore_c_regs_and_iret:
 restore_c_regs_and_iret:
 	RESTORE_C_REGS
 	RESTORE_C_REGS
 	REMOVE_PT_GPREGS_FROM_STACK 8
 	REMOVE_PT_GPREGS_FROM_STACK 8
@@ -707,37 +639,6 @@ native_irq_return_ldt:
 	popq	%rax
 	popq	%rax
 	jmp	native_irq_return_iret
 	jmp	native_irq_return_iret
 #endif
 #endif
-
-	/* edi: workmask, edx: work */
-retint_careful:
-	bt	$TIF_NEED_RESCHED, %edx
-	jnc	retint_signal
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
-	pushq	%rdi
-	SCHEDULE_USER
-	popq	%rdi
-	GET_THREAD_INFO(%rcx)
-	DISABLE_INTERRUPTS(CLBR_NONE)
-	TRACE_IRQS_OFF
-	jmp	retint_check
-
-retint_signal:
-	testl	$_TIF_DO_NOTIFY_MASK, %edx
-	jz	retint_swapgs
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
-	SAVE_EXTRA_REGS
-	movq	$-1, ORIG_RAX(%rsp)
-	xorl	%esi, %esi			/* oldset */
-	movq	%rsp, %rdi			/* &pt_regs */
-	call	do_notify_resume
-	RESTORE_EXTRA_REGS
-	DISABLE_INTERRUPTS(CLBR_NONE)
-	TRACE_IRQS_OFF
-	GET_THREAD_INFO(%rcx)
-	jmp	retint_with_reschedule
-
 END(common_interrupt)
 END(common_interrupt)
 
 
 /*
 /*
@@ -1143,12 +1044,22 @@ ENTRY(error_entry)
 	SAVE_EXTRA_REGS 8
 	SAVE_EXTRA_REGS 8
 	xorl	%ebx, %ebx
 	xorl	%ebx, %ebx
 	testb	$3, CS+8(%rsp)
 	testb	$3, CS+8(%rsp)
-	jz	error_kernelspace
+	jz	.Lerror_kernelspace
 
 
-	/* We entered from user mode */
+.Lerror_entry_from_usermode_swapgs:
+	/*
+	 * We entered from user mode or we're pretending to have entered
+	 * from user mode due to an IRET fault.
+	 */
 	SWAPGS
 	SWAPGS
 
 
-error_entry_done:
+.Lerror_entry_from_usermode_after_swapgs:
+#ifdef CONFIG_CONTEXT_TRACKING
+	call enter_from_user_mode
+#endif
+
+.Lerror_entry_done:
+
 	TRACE_IRQS_OFF
 	TRACE_IRQS_OFF
 	ret
 	ret
 
 
@@ -1158,31 +1069,30 @@ error_entry_done:
 	 * truncated RIP for IRET exceptions returning to compat mode. Check
 	 * truncated RIP for IRET exceptions returning to compat mode. Check
 	 * for these here too.
 	 * for these here too.
 	 */
 	 */
-error_kernelspace:
+.Lerror_kernelspace:
 	incl	%ebx
 	incl	%ebx
 	leaq	native_irq_return_iret(%rip), %rcx
 	leaq	native_irq_return_iret(%rip), %rcx
 	cmpq	%rcx, RIP+8(%rsp)
 	cmpq	%rcx, RIP+8(%rsp)
-	je	error_bad_iret
+	je	.Lerror_bad_iret
 	movl	%ecx, %eax			/* zero extend */
 	movl	%ecx, %eax			/* zero extend */
 	cmpq	%rax, RIP+8(%rsp)
 	cmpq	%rax, RIP+8(%rsp)
-	je	bstep_iret
+	je	.Lbstep_iret
 	cmpq	$gs_change, RIP+8(%rsp)
 	cmpq	$gs_change, RIP+8(%rsp)
-	jne	error_entry_done
+	jne	.Lerror_entry_done
 
 
 	/*
 	/*
 	 * hack: gs_change can fail with user gsbase.  If this happens, fix up
 	 * hack: gs_change can fail with user gsbase.  If this happens, fix up
 	 * gsbase and proceed.  We'll fix up the exception and land in
 	 * gsbase and proceed.  We'll fix up the exception and land in
 	 * gs_change's error handler with kernel gsbase.
 	 * gs_change's error handler with kernel gsbase.
 	 */
 	 */
-	SWAPGS
-	jmp	error_entry_done
+	jmp	.Lerror_entry_from_usermode_swapgs
 
 
-bstep_iret:
+.Lbstep_iret:
 	/* Fix truncated RIP */
 	/* Fix truncated RIP */
 	movq	%rcx, RIP+8(%rsp)
 	movq	%rcx, RIP+8(%rsp)
 	/* fall through */
 	/* fall through */
 
 
-error_bad_iret:
+.Lerror_bad_iret:
 	/*
 	/*
 	 * We came from an IRET to user mode, so we have user gsbase.
 	 * We came from an IRET to user mode, so we have user gsbase.
 	 * Switch to kernel gsbase:
 	 * Switch to kernel gsbase:
@@ -1198,7 +1108,7 @@ error_bad_iret:
 	call	fixup_bad_iret
 	call	fixup_bad_iret
 	mov	%rax, %rsp
 	mov	%rax, %rsp
 	decl	%ebx
 	decl	%ebx
-	jmp	error_entry_done
+	jmp	.Lerror_entry_from_usermode_after_swapgs
 END(error_entry)
 END(error_entry)
 
 
 
 
@@ -1209,7 +1119,6 @@ END(error_entry)
  */
  */
 ENTRY(error_exit)
 ENTRY(error_exit)
 	movl	%ebx, %eax
 	movl	%ebx, %eax
-	RESTORE_EXTRA_REGS
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	TRACE_IRQS_OFF
 	testl	%eax, %eax
 	testl	%eax, %eax

+ 52 - 9
arch/x86/entry/entry_64_compat.S

@@ -22,8 +22,8 @@
 #define __AUDIT_ARCH_LE		0x40000000
 #define __AUDIT_ARCH_LE		0x40000000
 
 
 #ifndef CONFIG_AUDITSYSCALL
 #ifndef CONFIG_AUDITSYSCALL
-# define sysexit_audit		ia32_ret_from_sys_call
-# define sysretl_audit		ia32_ret_from_sys_call
+# define sysexit_audit		ia32_ret_from_sys_call_irqs_off
+# define sysretl_audit		ia32_ret_from_sys_call_irqs_off
 #endif
 #endif
 
 
 	.section .entry.text, "ax"
 	.section .entry.text, "ax"
@@ -140,7 +140,8 @@ sysexit_from_sys_call:
 	 */
 	 */
 	andl	$~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
 	andl	$~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
 	movl	RIP(%rsp), %ecx		/* User %eip */
 	movl	RIP(%rsp), %ecx		/* User %eip */
-	RESTORE_RSI_RDI
+	movl	RSI(%rsp), %esi
+	movl	RDI(%rsp), %edi
 	xorl	%edx, %edx		/* Do not leak kernel information */
 	xorl	%edx, %edx		/* Do not leak kernel information */
 	xorq	%r8, %r8
 	xorq	%r8, %r8
 	xorq	%r9, %r9
 	xorq	%r9, %r9
@@ -208,10 +209,10 @@ sysexit_from_sys_call:
 	.endm
 	.endm
 
 
 	.macro auditsys_exit exit
 	.macro auditsys_exit exit
-	testl	$(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-	jnz	ia32_ret_from_sys_call
 	TRACE_IRQS_ON
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	ENABLE_INTERRUPTS(CLBR_NONE)
+	testl	$(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	jnz	ia32_ret_from_sys_call
 	movl	%eax, %esi		/* second arg, syscall return value */
 	movl	%eax, %esi		/* second arg, syscall return value */
 	cmpl	$-MAX_ERRNO, %eax	/* is it an error ? */
 	cmpl	$-MAX_ERRNO, %eax	/* is it an error ? */
 	jbe	1f
 	jbe	1f
@@ -230,7 +231,7 @@ sysexit_from_sys_call:
 	movq	%rax, R10(%rsp)
 	movq	%rax, R10(%rsp)
 	movq	%rax, R9(%rsp)
 	movq	%rax, R9(%rsp)
 	movq	%rax, R8(%rsp)
 	movq	%rax, R8(%rsp)
-	jmp	int_with_check
+	jmp	int_ret_from_sys_call_irqs_off
 	.endm
 	.endm
 
 
 sysenter_auditsys:
 sysenter_auditsys:
@@ -365,7 +366,9 @@ cstar_dispatch:
 
 
 sysretl_from_sys_call:
 sysretl_from_sys_call:
 	andl	$~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
 	andl	$~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
-	RESTORE_RSI_RDI_RDX
+	movl	RDX(%rsp), %edx
+	movl	RSI(%rsp), %esi
+	movl	RDI(%rsp), %edi
 	movl	RIP(%rsp), %ecx
 	movl	RIP(%rsp), %ecx
 	movl	EFLAGS(%rsp), %r11d
 	movl	EFLAGS(%rsp), %r11d
 	xorq	%r10, %r10
 	xorq	%r10, %r10
@@ -429,8 +432,48 @@ cstar_tracesys:
 END(entry_SYSCALL_compat)
 END(entry_SYSCALL_compat)
 
 
 ia32_badarg:
 ia32_badarg:
-	ASM_CLAC
-	movq	$-EFAULT, RAX(%rsp)
+	/*
+	 * So far, we've entered kernel mode, set AC, turned on IRQs, and
+	 * saved C regs except r8-r11.  We haven't done any of the other
+	 * standard entry work, though.  We want to bail, but we shouldn't
+	 * treat this as a syscall entry since we don't even know what the
+	 * args are.  Instead, treat this as a non-syscall entry, finish
+	 * the entry work, and immediately exit after setting AX = -EFAULT.
+	 *
+	 * We're really just being polite here.  Killing the task outright
+	 * would be a reasonable action, too.  Given that the only valid
+	 * way to have gotten here is through the vDSO, and we already know
+	 * that the stack pointer is bad, the task isn't going to survive
+	 * for long no matter what we do.
+	 */
+
+	ASM_CLAC			/* undo STAC */
+	movq	$-EFAULT, RAX(%rsp)	/* return -EFAULT if possible */
+
+	/* Fill in the rest of pt_regs */
+	xorl	%eax, %eax
+	movq	%rax, R11(%rsp)
+	movq	%rax, R10(%rsp)
+	movq	%rax, R9(%rsp)
+	movq	%rax, R8(%rsp)
+	SAVE_EXTRA_REGS
+
+	/* Turn IRQs back off. */
+	DISABLE_INTERRUPTS(CLBR_NONE)
+	TRACE_IRQS_OFF
+
+	/* Now finish entering normal kernel mode. */
+#ifdef CONFIG_CONTEXT_TRACKING
+	call enter_from_user_mode
+#endif
+
+	/* And exit again. */
+	jmp retint_user
+
+ia32_ret_from_sys_call_irqs_off:
+	TRACE_IRQS_ON
+	ENABLE_INTERRUPTS(CLBR_NONE)
+
 ia32_ret_from_sys_call:
 ia32_ret_from_sys_call:
 	xorl	%eax, %eax		/* Do not leak kernel information */
 	xorl	%eax, %eax		/* Do not leak kernel information */
 	movq	%rax, R11(%rsp)
 	movq	%rax, R11(%rsp)

+ 15 - 0
arch/x86/entry/syscalls/syscall_32.tbl

@@ -365,3 +365,18 @@
 356	i386	memfd_create		sys_memfd_create
 356	i386	memfd_create		sys_memfd_create
 357	i386	bpf			sys_bpf
 357	i386	bpf			sys_bpf
 358	i386	execveat		sys_execveat			stub32_execveat
 358	i386	execveat		sys_execveat			stub32_execveat
+359	i386	socket			sys_socket
+360	i386	socketpair		sys_socketpair
+361	i386	bind			sys_bind
+362	i386	connect			sys_connect
+363	i386	listen			sys_listen
+364	i386	accept4			sys_accept4
+365	i386	getsockopt		sys_getsockopt			compat_sys_getsockopt
+366	i386	setsockopt		sys_setsockopt			compat_sys_setsockopt
+367	i386	getsockname		sys_getsockname
+368	i386	getpeername		sys_getpeername
+369	i386	sendto			sys_sendto
+370	i386	sendmsg			sys_sendmsg			compat_sys_sendmsg
+371	i386	recvfrom		sys_recvfrom			compat_sys_recvfrom
+372	i386	recvmsg			sys_recvmsg			compat_sys_recvmsg
+373	i386	shutdown		sys_shutdown
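
The new table entries give 32-bit userspace direct socket system calls instead of routing everything through the socketcall() multiplexer. A hedged user-space sketch (the number 359 is taken from the table above and is valid for 32-bit x86 only; older kernels without these entries fail with ENOSYS):

#include <sys/syscall.h>
#include <unistd.h>
#include <errno.h>
#include <stdio.h>

int main(void)
{
	/* 359 == i386 sys_socket per the table above. */
	long fd = syscall(359, 2 /* AF_INET */, 1 /* SOCK_STREAM */, 0);

	if (fd < 0 && errno == ENOSYS)
		printf("no direct socket(2) syscall; use socketcall()\n");
	else
		printf("direct socket(2) worked, fd=%ld\n", fd);
	return 0;
}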

+ 3 - 3
arch/x86/entry/vdso/Makefile

@@ -8,7 +8,7 @@ KASAN_SANITIZE := n
 VDSO64-$(CONFIG_X86_64)		:= y
 VDSO64-$(CONFIG_X86_64)		:= y
 VDSOX32-$(CONFIG_X86_X32_ABI)	:= y
 VDSOX32-$(CONFIG_X86_X32_ABI)	:= y
 VDSO32-$(CONFIG_X86_32)		:= y
 VDSO32-$(CONFIG_X86_32)		:= y
-VDSO32-$(CONFIG_COMPAT)		:= y
+VDSO32-$(CONFIG_IA32_EMULATION)	:= y
 
 
 # files to link into the vdso
 # files to link into the vdso
 vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
 vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
@@ -20,7 +20,7 @@ obj-y				+= vma.o
 vdso_img-$(VDSO64-y)		+= 64
 vdso_img-$(VDSO64-y)		+= 64
 vdso_img-$(VDSOX32-y)		+= x32
 vdso_img-$(VDSOX32-y)		+= x32
 vdso_img-$(VDSO32-y)		+= 32-int80
 vdso_img-$(VDSO32-y)		+= 32-int80
-vdso_img-$(CONFIG_COMPAT)	+= 32-syscall
+vdso_img-$(CONFIG_IA32_EMULATION)	+= 32-syscall
 vdso_img-$(VDSO32-y)		+= 32-sysenter
 vdso_img-$(VDSO32-y)		+= 32-sysenter
 
 
 obj-$(VDSO32-y)			+= vdso32-setup.o
 obj-$(VDSO32-y)			+= vdso32-setup.o
@@ -126,7 +126,7 @@ $(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE
 # Build multiple 32-bit vDSO images to choose from at boot time.
 # Build multiple 32-bit vDSO images to choose from at boot time.
 #
 #
 vdso32.so-$(VDSO32-y)		+= int80
 vdso32.so-$(VDSO32-y)		+= int80
-vdso32.so-$(CONFIG_COMPAT)	+= syscall
+vdso32.so-$(CONFIG_IA32_EMULATION)	+= syscall
 vdso32.so-$(VDSO32-y)		+= sysenter
 vdso32.so-$(VDSO32-y)		+= sysenter
 
 
 vdso32-images			= $(vdso32.so-y:%=vdso32-%.so)
 vdso32-images			= $(vdso32.so-y:%=vdso32-%.so)

+ 2 - 14
arch/x86/entry/vdso/vclock_gettime.c

@@ -175,20 +175,8 @@ static notrace cycle_t vread_pvclock(int *mode)
 
 
 notrace static cycle_t vread_tsc(void)
 notrace static cycle_t vread_tsc(void)
 {
 {
-	cycle_t ret;
-	u64 last;
-
-	/*
-	 * Empirically, a fence (of type that depends on the CPU)
-	 * before rdtsc is enough to ensure that rdtsc is ordered
-	 * with respect to loads.  The various CPU manuals are unclear
-	 * as to whether rdtsc can be reordered with later loads,
-	 * but no one has ever seen it happen.
-	 */
-	rdtsc_barrier();
-	ret = (cycle_t)__native_read_tsc();
-
-	last = gtod->cycle_last;
+	cycle_t ret = (cycle_t)rdtsc_ordered();
+	u64 last = gtod->cycle_last;
 
 
 	if (likely(ret >= last))
 	if (likely(ret >= last))
 		return ret;
 		return ret;
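
vread_tsc() now relies on rdtsc_ordered(), which folds the old rdtsc_barrier() + __native_read_tsc() pair into one helper (see the <asm/msr.h> hunk below). A short kernel-internal sketch of when each variant fits, under the semantics documented in that hunk:

#include <linux/types.h>
#include <asm/msr.h>

/* Coarse timestamp: speculative/reordered execution is acceptable. */
static u64 cheap_timestamp(void)
{
	return rdtsc();
}

/*
 * Timing a specific region: the reads must not be hoisted above earlier
 * loads, so use the ordered variant at both ends.
 */
static u64 measure_region(void (*fn)(void))
{
	u64 t0 = rdtsc_ordered();

	fn();
	return rdtsc_ordered() - t0;
}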

+ 5 - 2
arch/x86/entry/vdso/vma.c

@@ -177,7 +177,7 @@ up_fail:
 	return ret;
 	return ret;
 }
 }
 
 
-#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT)
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
 static int load_vdso32(void)
 static int load_vdso32(void)
 {
 {
 	int ret;
 	int ret;
@@ -219,8 +219,11 @@ int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
 		return map_vdso(&vdso_image_x32, true);
 		return map_vdso(&vdso_image_x32, true);
 	}
 	}
 #endif
 #endif
-
+#ifdef CONFIG_IA32_EMULATION
 	return load_vdso32();
 	return load_vdso32();
+#else
+	return 0;
+#endif
 }
 }
 #endif
 #endif
 #else
 #else

+ 1 - 1
arch/x86/entry/vsyscall/vsyscall_64.c

@@ -290,7 +290,7 @@ static struct vm_area_struct gate_vma = {
 
 
 struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
 struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
 {
 {
-#ifdef CONFIG_IA32_EMULATION
+#ifdef CONFIG_COMPAT
 	if (!mm || mm->context.ia32_compat)
 	if (!mm || mm->context.ia32_compat)
 		return NULL;
 		return NULL;
 #endif
 #endif

+ 0 - 93
arch/x86/ia32/ia32_signal.c

@@ -34,99 +34,6 @@
 #include <asm/sys_ia32.h>
 #include <asm/sys_ia32.h>
 #include <asm/smap.h>
 #include <asm/smap.h>
 
 
-int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
-{
-	int err = 0;
-	bool ia32 = test_thread_flag(TIF_IA32);
-
-	if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
-		return -EFAULT;
-
-	put_user_try {
-		/* If you change siginfo_t structure, please make sure that
-		   this code is fixed accordingly.
-		   It should never copy any pad contained in the structure
-		   to avoid security leaks, but must copy the generic
-		   3 ints plus the relevant union member.  */
-		put_user_ex(from->si_signo, &to->si_signo);
-		put_user_ex(from->si_errno, &to->si_errno);
-		put_user_ex((short)from->si_code, &to->si_code);
-
-		if (from->si_code < 0) {
-			put_user_ex(from->si_pid, &to->si_pid);
-			put_user_ex(from->si_uid, &to->si_uid);
-			put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr);
-		} else {
-			/*
-			 * First 32bits of unions are always present:
-			 * si_pid === si_band === si_tid === si_addr(LS half)
-			 */
-			put_user_ex(from->_sifields._pad[0],
-					  &to->_sifields._pad[0]);
-			switch (from->si_code >> 16) {
-			case __SI_FAULT >> 16:
-				break;
-			case __SI_SYS >> 16:
-				put_user_ex(from->si_syscall, &to->si_syscall);
-				put_user_ex(from->si_arch, &to->si_arch);
-				break;
-			case __SI_CHLD >> 16:
-				if (ia32) {
-					put_user_ex(from->si_utime, &to->si_utime);
-					put_user_ex(from->si_stime, &to->si_stime);
-				} else {
-					put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime);
-					put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime);
-				}
-				put_user_ex(from->si_status, &to->si_status);
-				/* FALL THROUGH */
-			default:
-			case __SI_KILL >> 16:
-				put_user_ex(from->si_uid, &to->si_uid);
-				break;
-			case __SI_POLL >> 16:
-				put_user_ex(from->si_fd, &to->si_fd);
-				break;
-			case __SI_TIMER >> 16:
-				put_user_ex(from->si_overrun, &to->si_overrun);
-				put_user_ex(ptr_to_compat(from->si_ptr),
-					    &to->si_ptr);
-				break;
-				 /* This is not generated by the kernel as of now.  */
-			case __SI_RT >> 16:
-			case __SI_MESGQ >> 16:
-				put_user_ex(from->si_uid, &to->si_uid);
-				put_user_ex(from->si_int, &to->si_int);
-				break;
-			}
-		}
-	} put_user_catch(err);
-
-	return err;
-}
-
-int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
-{
-	int err = 0;
-	u32 ptr32;
-
-	if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t)))
-		return -EFAULT;
-
-	get_user_try {
-		get_user_ex(to->si_signo, &from->si_signo);
-		get_user_ex(to->si_errno, &from->si_errno);
-		get_user_ex(to->si_code, &from->si_code);
-
-		get_user_ex(to->si_pid, &from->si_pid);
-		get_user_ex(to->si_uid, &from->si_uid);
-		get_user_ex(ptr32, &from->si_ptr);
-		to->si_ptr = compat_ptr(ptr32);
-	} get_user_catch(err);
-
-	return err;
-}
-
 /*
 /*
  * Do a signal return; undo the signal stack.
  * Do a signal return; undo the signal stack.
  */
  */

+ 0 - 11
arch/x86/include/asm/barrier.h

@@ -91,15 +91,4 @@ do {									\
 #define smp_mb__before_atomic()	barrier()
 #define smp_mb__before_atomic()	barrier()
 #define smp_mb__after_atomic()	barrier()
 #define smp_mb__after_atomic()	barrier()
 
 
-/*
- * Stop RDTSC speculation. This is needed when you need to use RDTSC
- * (or get_cycles or vread that possibly accesses the TSC) in a defined
- * code region.
- */
-static __always_inline void rdtsc_barrier(void)
-{
-	alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
-			  "lfence", X86_FEATURE_LFENCE_RDTSC);
-}
-
 #endif /* _ASM_X86_BARRIER_H */
 #endif /* _ASM_X86_BARRIER_H */

+ 0 - 10
arch/x86/include/asm/context_tracking.h

@@ -1,10 +0,0 @@
-#ifndef _ASM_X86_CONTEXT_TRACKING_H
-#define _ASM_X86_CONTEXT_TRACKING_H
-
-#ifdef CONFIG_CONTEXT_TRACKING
-# define SCHEDULE_USER call schedule_user
-#else
-# define SCHEDULE_USER call schedule
-#endif
-
-#endif

+ 6 - 11
arch/x86/include/asm/elf.h

@@ -78,7 +78,7 @@ typedef struct user_fxsr_struct elf_fpxregset_t;
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_X86_64
 extern unsigned int vdso64_enabled;
 extern unsigned int vdso64_enabled;
 #endif
 #endif
-#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT)
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
 extern unsigned int vdso32_enabled;
 extern unsigned int vdso32_enabled;
 #endif
 #endif
 
 
@@ -187,8 +187,8 @@ static inline void elf_common_init(struct thread_struct *t,
 #define	COMPAT_ELF_PLAT_INIT(regs, load_addr)		\
 #define	COMPAT_ELF_PLAT_INIT(regs, load_addr)		\
 	elf_common_init(&current->thread, regs, __USER_DS)
 	elf_common_init(&current->thread, regs, __USER_DS)
 
 
-void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp);
-#define compat_start_thread start_thread_ia32
+void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp);
+#define compat_start_thread compat_start_thread
 
 
 void set_personality_ia32(bool);
 void set_personality_ia32(bool);
 #define COMPAT_SET_PERSONALITY(ex)			\
 #define COMPAT_SET_PERSONALITY(ex)			\
@@ -344,14 +344,9 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
  */
  */
 static inline int mmap_is_ia32(void)
 static inline int mmap_is_ia32(void)
 {
 {
-#ifdef CONFIG_X86_32
-	return 1;
-#endif
-#ifdef CONFIG_IA32_EMULATION
-	if (test_thread_flag(TIF_ADDR32))
-		return 1;
-#endif
-	return 0;
+	return config_enabled(CONFIG_X86_32) ||
+	       (config_enabled(CONFIG_COMPAT) &&
+		test_thread_flag(TIF_ADDR32));
 }
 }
 
 
 /* Do not change the values. See get_align_mask() */
 /* Do not change the values. See get_align_mask() */
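
The mmap_is_ia32() rewrite above swaps stacked #ifdef blocks for config_enabled() (similar to IS_ENABLED()), so both branches are always parsed and the optimizer folds the constant away. The same idiom in isolation, with a hypothetical CONFIG_MY_FEATURE and my_feature_enabled standing in for the real symbols:

/* Requires my_feature_enabled to be declared even when the option is off. */
static inline int feature_active(void)
{
	return config_enabled(CONFIG_MY_FEATURE) &&
	       READ_ONCE(my_feature_enabled);
}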

+ 0 - 9
arch/x86/include/asm/ia32.h

@@ -22,15 +22,6 @@ struct ucontext_ia32 {
 	compat_sigset_t	  uc_sigmask;	/* mask last for extensibility */
 	compat_sigset_t	  uc_sigmask;	/* mask last for extensibility */
 };
 };
 
 
-struct ucontext_x32 {
-	unsigned int	  uc_flags;
-	unsigned int 	  uc_link;
-	compat_stack_t	  uc_stack;
-	unsigned int	  uc__pad0;     /* needed for alignment */
-	struct sigcontext uc_mcontext;  /* the 64-bit sigcontext type */
-	compat_sigset_t	  uc_sigmask;	/* mask last for extensibility */
-};
-
 /* This matches struct stat64 in glibc2.2, hence the absolutely
 /* This matches struct stat64 in glibc2.2, hence the absolutely
  * insane amounts of padding around dev_t's.
  * insane amounts of padding around dev_t's.
  */
  */

+ 0 - 10
arch/x86/include/asm/irq_vectors.h

@@ -117,16 +117,6 @@
 
 
 #define FPU_IRQ				  13
 #define FPU_IRQ				  13
 
 
-#define	FIRST_VM86_IRQ			   3
-#define LAST_VM86_IRQ			  15
-
-#ifndef __ASSEMBLY__
-static inline int invalid_vm86_irq(int irq)
-{
-	return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ;
-}
-#endif
-
 /*
 /*
  * Size the maximum number of interrupts.
  * Size the maximum number of interrupts.
  *
  *

+ 1 - 5
arch/x86/include/asm/math_emu.h

@@ -2,7 +2,6 @@
 #define _ASM_X86_MATH_EMU_H
 #define _ASM_X86_MATH_EMU_H
 
 
 #include <asm/ptrace.h>
 #include <asm/ptrace.h>
-#include <asm/vm86.h>
 
 
 /* This structure matches the layout of the data saved to the stack
 /* This structure matches the layout of the data saved to the stack
    following a device-not-present interrupt, part of it saved
    following a device-not-present interrupt, part of it saved
@@ -10,9 +9,6 @@
    */
    */
 struct math_emu_info {
 struct math_emu_info {
 	long ___orig_eip;
 	long ___orig_eip;
-	union {
-		struct pt_regs *regs;
-		struct kernel_vm86_regs *vm86;
-	};
+	struct pt_regs *regs;
 };
 };
 #endif /* _ASM_X86_MATH_EMU_H */
 #endif /* _ASM_X86_MATH_EMU_H */

+ 2 - 0
arch/x86/include/asm/mmu.h

@@ -9,7 +9,9 @@
  * we put the segment information here.
  * we put the segment information here.
  */
  */
 typedef struct {
 typedef struct {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 	struct ldt_struct *ldt;
 	struct ldt_struct *ldt;
+#endif
 
 
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_X86_64
 	/* True if mm supports a task running in 32 bit compatibility mode. */
 	/* True if mm supports a task running in 32 bit compatibility mode. */

+ 21 - 7
arch/x86/include/asm/mmu_context.h

@@ -33,6 +33,7 @@ static inline void load_mm_cr4(struct mm_struct *mm)
 static inline void load_mm_cr4(struct mm_struct *mm) {}
 static inline void load_mm_cr4(struct mm_struct *mm) {}
 #endif
 #endif
 
 
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 /*
 /*
  * ldt_structs can be allocated, used, and freed, but they are never
  * ldt_structs can be allocated, used, and freed, but they are never
  * modified while live.
  * modified while live.
@@ -48,8 +49,23 @@ struct ldt_struct {
 	int size;
 	int size;
 };
 };
 
 
+/*
+ * Used for LDT copy/destruction.
+ */
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
+void destroy_context(struct mm_struct *mm);
+#else	/* CONFIG_MODIFY_LDT_SYSCALL */
+static inline int init_new_context(struct task_struct *tsk,
+				   struct mm_struct *mm)
+{
+	return 0;
+}
+static inline void destroy_context(struct mm_struct *mm) {}
+#endif
+
 static inline void load_mm_ldt(struct mm_struct *mm)
 static inline void load_mm_ldt(struct mm_struct *mm)
 {
 {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 	struct ldt_struct *ldt;
 	struct ldt_struct *ldt;
 
 
 	/* lockless_dereference synchronizes with smp_store_release */
 	/* lockless_dereference synchronizes with smp_store_release */
@@ -73,17 +89,13 @@ static inline void load_mm_ldt(struct mm_struct *mm)
 		set_ldt(ldt->entries, ldt->size);
 		set_ldt(ldt->entries, ldt->size);
 	else
 	else
 		clear_LDT();
 		clear_LDT();
+#else
+	clear_LDT();
+#endif
 
 
 	DEBUG_LOCKS_WARN_ON(preemptible());
 	DEBUG_LOCKS_WARN_ON(preemptible());
 }
 }
 
 
-/*
- * Used for LDT copy/destruction.
- */
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
-void destroy_context(struct mm_struct *mm);
-
-
 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 {
 {
 #ifdef CONFIG_SMP
 #ifdef CONFIG_SMP
@@ -114,6 +126,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 		/* Load per-mm CR4 state */
 		/* Load per-mm CR4 state */
 		load_mm_cr4(next);
 		load_mm_cr4(next);
 
 
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 		/*
 		/*
 		 * Load the LDT, if the LDT is different.
 		 * Load the LDT, if the LDT is different.
 		 *
 		 *
@@ -128,6 +141,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 		 */
 		 */
 		if (unlikely(prev->context.ldt != next->context.ldt))
 		if (unlikely(prev->context.ldt != next->context.ldt))
 			load_mm_ldt(next);
 			load_mm_ldt(next);
+#endif
 	}
 	}
 #ifdef CONFIG_SMP
 #ifdef CONFIG_SMP
 	  else {
 	  else {

+ 39 - 22
arch/x86/include/asm/msr.h

@@ -47,14 +47,13 @@ static inline unsigned long long native_read_tscp(unsigned int *aux)
  * it means rax *or* rdx.
  * it means rax *or* rdx.
  */
  */
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_X86_64
-#define DECLARE_ARGS(val, low, high)	unsigned low, high
-#define EAX_EDX_VAL(val, low, high)	((low) | ((u64)(high) << 32))
-#define EAX_EDX_ARGS(val, low, high)	"a" (low), "d" (high)
+/* Using 64-bit values saves one instruction clearing the high half of low */
+#define DECLARE_ARGS(val, low, high)	unsigned long low, high
+#define EAX_EDX_VAL(val, low, high)	((low) | (high) << 32)
 #define EAX_EDX_RET(val, low, high)	"=a" (low), "=d" (high)
 #else
 #define DECLARE_ARGS(val, low, high)	unsigned long long val
 #define EAX_EDX_VAL(val, low, high)	(val)
-#define EAX_EDX_ARGS(val, low, high)	"A" (val)
 #define EAX_EDX_RET(val, low, high)	"=A" (val)
 #endif
 
@@ -106,12 +105,19 @@ notrace static inline int native_write_msr_safe(unsigned int msr,
 	return err;
 }
 
-extern unsigned long long native_read_tsc(void);
-
 extern int rdmsr_safe_regs(u32 regs[8]);
 extern int wrmsr_safe_regs(u32 regs[8]);
 
-static __always_inline unsigned long long __native_read_tsc(void)
+/**
+ * rdtsc() - returns the current TSC without ordering constraints
+ *
+ * rdtsc() returns the result of RDTSC as a 64-bit integer.  The
+ * only ordering constraint it supplies is the ordering implied by
+ * "asm volatile": it will put the RDTSC in the place you expect.  The
+ * CPU can and will speculatively execute that RDTSC, though, so the
+ * results can be non-monotonic if compared on different CPUs.
+ */
+static __always_inline unsigned long long rdtsc(void)
 {
 	DECLARE_ARGS(val, low, high);
 
@@ -120,6 +126,32 @@ static __always_inline unsigned long long __native_read_tsc(void)
 	return EAX_EDX_VAL(val, low, high);
 }
 
+/**
+ * rdtsc_ordered() - read the current TSC in program order
+ *
+ * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer.
+ * It is ordered like a load to a global in-memory counter.  It should
+ * be impossible to observe non-monotonic rdtsc_unordered() behavior
+ * across multiple CPUs as long as the TSC is synced.
+ */
+static __always_inline unsigned long long rdtsc_ordered(void)
+{
+	/*
+	 * The RDTSC instruction is not ordered relative to memory
+	 * access.  The Intel SDM and the AMD APM are both vague on this
+	 * point, but empirically an RDTSC instruction can be
+	 * speculatively executed before prior loads.  An RDTSC
+	 * immediately after an appropriate barrier appears to be
+	 * ordered as a normal load, that is, it provides the same
+	 * ordering guarantees as reading from a global memory location
+	 * that some other imaginary CPU is updating continuously with a
+	 * time stamp.
+	 */
+	alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
+			  "lfence", X86_FEATURE_LFENCE_RDTSC);
+	return rdtsc();
+}
+
 static inline unsigned long long native_read_pmc(int counter)
 {
 	DECLARE_ARGS(val, low, high);
@@ -180,12 +212,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
 	return err;
 }
 
-#define rdtscl(low)						\
-	((low) = (u32)__native_read_tsc())
-
-#define rdtscll(val)						\
-	((val) = __native_read_tsc())
-
 #define rdpmc(counter, low, high)			\
 do {							\
 	u64 _l = native_read_pmc((counter));		\
@@ -195,15 +221,6 @@ do {							\
 
 #define rdpmcl(counter, val) ((val) = native_read_pmc(counter))
 
-#define rdtscp(low, high, aux)					\
-do {                                                            \
-	unsigned long long _val = native_read_tscp(&(aux));     \
-	(low) = (u32)_val;                                      \
-	(high) = (u32)(_val >> 32);                             \
-} while (0)
-
-#define rdtscpll(val, aux) (val) = native_read_tscp(&(aux))
-
 #endif	/* !CONFIG_PARAVIRT */
 
 /*

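Note on the msr.h hunk above: the removed rdtscl()/rdtscll()/rdtscp() macros have no one-for-one replacement; callers switch to the function-style readers introduced here. A minimal sketch of the migration, assuming a caller that previously did rdtscll(t) for a rough delta measurement (the helper name below is illustrative and not part of this series):

/* Sketch only: reading the TSC with the new helpers from <asm/msr.h>. */
static u64 measure_cycles(void (*fn)(void))
{
	u64 start, end;

	/*
	 * rdtsc_ordered() keeps the reads from being speculated across the
	 * measured region; plain rdtsc() is enough when only a rough,
	 * unordered timestamp is needed (as in the clocksource hunks below).
	 */
	start = rdtsc_ordered();
	fn();
	end = rdtsc_ordered();

	return end - start;
}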
+ 0 - 34
arch/x86/include/asm/paravirt.h

@@ -174,19 +174,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
 	return err;
 }
 
-static inline u64 paravirt_read_tsc(void)
-{
-	return PVOP_CALL0(u64, pv_cpu_ops.read_tsc);
-}
-
-#define rdtscl(low)				\
-do {						\
-	u64 _l = paravirt_read_tsc();		\
-	low = (int)_l;				\
-} while (0)
-
-#define rdtscll(val) (val = paravirt_read_tsc())
-
 static inline unsigned long long paravirt_sched_clock(void)
 {
 	return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
@@ -215,27 +202,6 @@ do {						\
 
 #define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter))
 
-static inline unsigned long long paravirt_rdtscp(unsigned int *aux)
-{
-	return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux);
-}
-
-#define rdtscp(low, high, aux)				\
-do {							\
-	int __aux;					\
-	unsigned long __val = paravirt_rdtscp(&__aux);	\
-	(low) = (u32)__val;				\
-	(high) = (u32)(__val >> 32);			\
-	(aux) = __aux;					\
-} while (0)
-
-#define rdtscpll(val, aux)				\
-do {							\
-	unsigned long __aux; 				\
-	val = paravirt_rdtscp(&__aux);			\
-	(aux) = __aux;					\
-} while (0)
-
 static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
 {
 	PVOP_VCALL2(pv_cpu_ops.alloc_ldt, ldt, entries);

+ 0 - 2
arch/x86/include/asm/paravirt_types.h

@@ -156,9 +156,7 @@ struct pv_cpu_ops {
 	u64 (*read_msr)(unsigned int msr, int *err);
 	int (*write_msr)(unsigned int msr, unsigned low, unsigned high);
 
-	u64 (*read_tsc)(void);
 	u64 (*read_pmc)(int counter);
-	unsigned long long (*read_tscp)(unsigned int *aux);
 
 #ifdef CONFIG_X86_32
 	/*

+ 3 - 10
arch/x86/include/asm/processor.h

@@ -6,8 +6,8 @@
 /* Forward declaration, a strange C thing */
 struct task_struct;
 struct mm_struct;
+struct vm86;
 
-#include <asm/vm86.h>
 #include <asm/math_emu.h>
 #include <asm/segment.h>
 #include <asm/types.h>
@@ -400,15 +400,9 @@ struct thread_struct {
 	unsigned long		cr2;
 	unsigned long		trap_nr;
 	unsigned long		error_code;
-#ifdef CONFIG_X86_32
+#ifdef CONFIG_VM86
 	/* Virtual 86 mode info */
-	struct vm86_struct __user *vm86_info;
-	unsigned long		screen_bitmap;
-	unsigned long		v86flags;
-	unsigned long		v86mask;
-	unsigned long		saved_sp0;
-	unsigned int		saved_fs;
-	unsigned int		saved_gs;
+	struct vm86		*vm86;
 #endif
 	/* IO permissions: */
 	unsigned long		*io_bitmap_ptr;
@@ -720,7 +714,6 @@ static inline void spin_lock_prefetch(const void *x)
 
 #define INIT_THREAD  {							  \
 	.sp0			= TOP_OF_INIT_STACK,			  \
-	.vm86_info		= NULL,					  \
 	.sysenter_cs		= __KERNEL_CS,				  \
 	.io_bitmap_ptr		= NULL,					  \
 }

+ 2 - 8
arch/x86/include/asm/pvclock.h

@@ -62,7 +62,7 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
 static __always_inline
 u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src)
 {
-	u64 delta = __native_read_tsc() - src->tsc_timestamp;
+	u64 delta = rdtsc_ordered() - src->tsc_timestamp;
 	return pvclock_scale_delta(delta, src->tsc_to_system_mul,
 				   src->tsc_shift);
 }
@@ -76,13 +76,7 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
 	u8 ret_flags;
 
 	version = src->version;
-	/* Note: emulated platforms which do not advertise SSE2 support
-	 * result in kvmclock not using the necessary RDTSC barriers.
-	 * Without barriers, it is possible that RDTSC instruction reads from
-	 * the time stamp counter outside rdtsc_barrier protected section
-	 * below, resulting in violation of monotonicity.
-	 */
-	rdtsc_barrier();
+
 	offset = pvclock_get_nsec_offset(src);
 	ret = src->system_time + offset;
 	ret_flags = src->flags;

+ 10 - 0
arch/x86/include/asm/sigframe.h

@@ -4,6 +4,7 @@
 #include <asm/sigcontext.h>
 #include <asm/siginfo.h>
 #include <asm/ucontext.h>
+#include <linux/compat.h>
 
 #ifdef CONFIG_X86_32
 #define sigframe_ia32		sigframe
@@ -69,6 +70,15 @@ struct rt_sigframe {
 
 #ifdef CONFIG_X86_X32_ABI
 
+struct ucontext_x32 {
+	unsigned int	  uc_flags;
+	unsigned int 	  uc_link;
+	compat_stack_t	  uc_stack;
+	unsigned int	  uc__pad0;     /* needed for alignment */
+	struct sigcontext uc_mcontext;  /* the 64-bit sigcontext type */
+	compat_sigset_t	  uc_sigmask;	/* mask last for extensibility */
+};
+
 struct rt_sigframe_x32 {
 	u64 pretcode;
 	struct ucontext_x32 uc;

+ 1 - 0
arch/x86/include/asm/signal.h

@@ -30,6 +30,7 @@ typedef sigset_t compat_sigset_t;
 #endif /* __ASSEMBLY__ */
 #include <uapi/asm/signal.h>
 #ifndef __ASSEMBLY__
+extern void do_signal(struct pt_regs *regs);
 extern void do_notify_resume(struct pt_regs *, void *, __u32);
 
 #define __ARCH_HAS_SA_RESTORER

+ 1 - 1
arch/x86/include/asm/stackprotector.h

@@ -72,7 +72,7 @@ static __always_inline void boot_init_stack_canary(void)
 	 * on during the bootup the random pool has true entropy too.
 	 */
 	get_random_bytes(&canary, sizeof(canary));
-	tsc = __native_read_tsc();
+	tsc = rdtsc();
 	canary += tsc + (tsc << 32UL);
 
 	current->stack_canary = canary;

+ 1 - 0
arch/x86/include/asm/syscalls.h

@@ -37,6 +37,7 @@ asmlinkage long sys_get_thread_area(struct user_desc __user *);
 asmlinkage unsigned long sys_sigreturn(void);
 
 /* kernel/vm86_32.c */
+struct vm86_struct;
 asmlinkage long sys_vm86old(struct vm86_struct __user *);
 asmlinkage long sys_vm86(unsigned long, unsigned long);
 

+ 7 - 4
arch/x86/include/asm/thread_info.h

@@ -27,14 +27,17 @@
  * Without this offset, that can result in a page fault.  (We are
  * careful that, in this case, the value we read doesn't matter.)
  *
- * In vm86 mode, the hardware frame is much longer still, but we neither
- * access the extra members from NMI context, nor do we write such a
- * frame at sp0 at all.
+ * In vm86 mode, the hardware frame is much longer still, so add 16
+ * bytes to make room for the real-mode segments.
  *
  * x86_64 has a fixed-length stack frame.
  */
 #ifdef CONFIG_X86_32
-# define TOP_OF_KERNEL_STACK_PADDING 8
+# ifdef CONFIG_VM86
+#  define TOP_OF_KERNEL_STACK_PADDING 16
+# else
+#  define TOP_OF_KERNEL_STACK_PADDING 8
+# endif
 #else
 # define TOP_OF_KERNEL_STACK_PADDING 0
 #endif

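For reference on the 16-byte figure chosen above: when the CPU interrupts out of vm86 mode it pushes four extra padded segment slots (es, ds, fs, gs) on top of the normal hardware frame, and struct kernel_vm86_regs mirrors that by appending eight unsigned shorts to struct pt_regs. A compile-time sketch of that relationship (illustrative only, not part of this patch):

#include <linux/bug.h>
#include <asm/ptrace.h>
#include <asm/vm86.h>

static inline void vm86_stack_padding_check(void)
{
	/* es/__esh, ds/__dsh, fs/__fsh, gs/__gsh: 8 x 2 bytes = 16 bytes */
	BUILD_BUG_ON(sizeof(struct kernel_vm86_regs) -
		     sizeof(struct pt_regs) != 16);
}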
+ 2 - 2
arch/x86/include/asm/traps.h

@@ -112,8 +112,8 @@ asmlinkage void smp_threshold_interrupt(void);
 asmlinkage void smp_deferred_error_interrupt(void);
 #endif
 
-extern enum ctx_state ist_enter(struct pt_regs *regs);
-extern void ist_exit(struct pt_regs *regs, enum ctx_state prev_state);
+extern void ist_enter(struct pt_regs *regs);
+extern void ist_exit(struct pt_regs *regs);
 extern void ist_begin_non_atomic(struct pt_regs *regs);
 extern void ist_end_non_atomic(void);
 

+ 1 - 17
arch/x86/include/asm/tsc.h

@@ -21,28 +21,12 @@ extern void disable_TSC(void);
 
 static inline cycles_t get_cycles(void)
 {
-	unsigned long long ret = 0;
-
 #ifndef CONFIG_X86_TSC
 	if (!cpu_has_tsc)
 		return 0;
 #endif
-	rdtscll(ret);
-
-	return ret;
-}
 
-static __always_inline cycles_t vget_cycles(void)
-{
-	/*
-	 * We only do VDSOs on TSC capable CPUs, so this shouldn't
-	 * access boot_cpu_data (which is not VDSO-safe):
-	 */
-#ifndef CONFIG_X86_TSC
-	if (!cpu_has_tsc)
-		return 0;
-#endif
-	return (cycles_t)__native_read_tsc();
+	return rdtsc();
 }
 
 extern void tsc_init(void);

+ 33 - 24
arch/x86/include/asm/vm86.h

@@ -1,7 +1,6 @@
 #ifndef _ASM_X86_VM86_H
 #define _ASM_X86_VM86_H
 
-
 #include <asm/ptrace.h>
 #include <uapi/asm/vm86.h>
 
@@ -28,43 +27,49 @@ struct kernel_vm86_regs {
 	unsigned short gs, __gsh;
 };
 
-struct kernel_vm86_struct {
-	struct kernel_vm86_regs regs;
-/*
- * the below part remains on the kernel stack while we are in VM86 mode.
- * 'tss.esp0' then contains the address of VM86_TSS_ESP0 below, and when we
- * get forced back from VM86, the CPU and "SAVE_ALL" will restore the above
- * 'struct kernel_vm86_regs' with the then actual values.
- * Therefore, pt_regs in fact points to a complete 'kernel_vm86_struct'
- * in kernelspace, hence we need not reget the data from userspace.
- */
-#define VM86_TSS_ESP0 flags
+struct vm86 {
+	struct vm86plus_struct __user *user_vm86;
+	struct pt_regs regs32;
+	unsigned long veflags;
+	unsigned long veflags_mask;
+	unsigned long saved_sp0;
+
 	unsigned long flags;
 	unsigned long screen_bitmap;
 	unsigned long cpu_type;
 	struct revectored_struct int_revectored;
 	struct revectored_struct int21_revectored;
 	struct vm86plus_info_struct vm86plus;
-	struct pt_regs *regs32;   /* here we save the pointer to the old regs */
-/*
- * The below is not part of the structure, but the stack layout continues
- * this way. In front of 'return-eip' may be some data, depending on
- * compilation, so we don't rely on this and save the pointer to 'oldregs'
- * in 'regs32' above.
- * However, with GCC-2.7.2 and the current CFLAGS you see exactly this:
-
-	long return-eip;        from call to vm86()
-	struct pt_regs oldregs;  user space registers as saved by syscall
- */
 };
 
 #ifdef CONFIG_VM86
 
 void handle_vm86_fault(struct kernel_vm86_regs *, long);
 int handle_vm86_trap(struct kernel_vm86_regs *, long, int);
-struct pt_regs *save_v86_state(struct kernel_vm86_regs *);
+void save_v86_state(struct kernel_vm86_regs *, int);
 
 struct task_struct;
+
+#define free_vm86(t) do {				\
+	struct thread_struct *__t = (t);		\
+	if (__t->vm86 != NULL) {			\
+		kfree(__t->vm86);			\
+		__t->vm86 = NULL;			\
+	}						\
+} while (0)
+
+/*
+ * Support for VM86 programs to request interrupts for
+ * real mode hardware drivers:
+ */
+#define FIRST_VM86_IRQ		 3
+#define LAST_VM86_IRQ		15
+
+static inline int invalid_vm86_irq(int irq)
+{
+	return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ;
+}
+
 void release_vm86_irqs(struct task_struct *);
 
 #else
@@ -77,6 +82,10 @@ static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c)
 	return 0;
 }
 
+static inline void save_v86_state(struct kernel_vm86_regs *a, int b) { }
+
+#define free_vm86(t) do { } while(0)
+
 #endif /* CONFIG_VM86 */
 
 #endif /* _ASM_X86_VM86_H */

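A hedged sketch of how the two new vm86.h helpers above are meant to be used, assuming CONFIG_VM86=y (the callers below are illustrative; the real consumers are the exit_thread() hunk in arch/x86/kernel/process.c later in this commit and the vm86 code itself):

#include <asm/vm86.h>

/* Thread teardown: drop any lazily allocated vm86 state. */
static inline void example_release_thread_vm86(struct thread_struct *t)
{
	free_vm86(t);	/* kfree()s t->vm86 and clears the pointer */
}

/* IRQ forwarding: only IRQs 3..15 may be handed to a vm86 task. */
static inline int example_vm86_irq_allowed(int irqnumber)
{
	if (invalid_vm86_irq(irqnumber))
		return -EPERM;
	return 0;
}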
+ 3 - 1
arch/x86/kernel/Makefile

@@ -23,8 +23,10 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n
 CFLAGS_irq.o := -I$(src)/../include/asm/trace
 
 obj-y			:= process_$(BITS).o signal.o
+obj-$(CONFIG_COMPAT)	+= signal_compat.o
 obj-y			+= traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
-obj-y			+= time.o ioport.o ldt.o dumpstack.o nmi.o
+obj-y			+= time.o ioport.o dumpstack.o nmi.o
+obj-$(CONFIG_MODIFY_LDT_SYSCALL)	+= ldt.o
 obj-y			+= setup.o x86_init.o i8259.o irqinit.o jump_label.o
 obj-$(CONFIG_IRQ_WORK)  += irq_work.o
 obj-y			+= probe_roms.o

+ 4 - 4
arch/x86/kernel/apb_timer.c

@@ -263,7 +263,7 @@ static int apbt_clocksource_register(void)
 
 	/* Verify whether apbt counter works */
 	t1 = dw_apb_clocksource_read(clocksource_apbt);
-	rdtscll(start);
+	start = rdtsc();
 
 	/*
 	 * We don't know the TSC frequency yet, but waiting for
@@ -273,7 +273,7 @@ static int apbt_clocksource_register(void)
 	 */
 	do {
 		rep_nop();
-		rdtscll(now);
+		now = rdtsc();
 	} while ((now - start) < 200000UL);
 
 	/* APBT is the only always on clocksource, it has to work! */
@@ -390,13 +390,13 @@ unsigned long apbt_quick_calibrate(void)
 	old = dw_apb_clocksource_read(clocksource_apbt);
 	old += loop;
 
-	t1 = __native_read_tsc();
+	t1 = rdtsc();
 
 	do {
 		new = dw_apb_clocksource_read(clocksource_apbt);
 	} while (new < old);
 
-	t2 = __native_read_tsc();
+	t2 = rdtsc();
 
 	shift = 5;
 	if (unlikely(loop >> shift == 0)) {

+ 4 - 4
arch/x86/kernel/apic/apic.c

@@ -457,7 +457,7 @@ static int lapic_next_deadline(unsigned long delta,
 {
 	u64 tsc;
 
-	rdtscll(tsc);
+	tsc = rdtsc();
 	wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
 	return 0;
 }
@@ -592,7 +592,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
 	unsigned long pm = acpi_pm_read_early();
 
 	if (cpu_has_tsc)
-		rdtscll(tsc);
+		tsc = rdtsc();
 
 	switch (lapic_cal_loops++) {
 	case 0:
@@ -1209,7 +1209,7 @@ void setup_local_APIC(void)
 	long long max_loops = cpu_khz ? cpu_khz : 1000000;
 
 	if (cpu_has_tsc)
-		rdtscll(tsc);
+		tsc = rdtsc();
 
 	if (disable_apic) {
 		disable_ioapic_support();
@@ -1293,7 +1293,7 @@ void setup_local_APIC(void)
 		}
 		if (queued) {
 			if (cpu_has_tsc && cpu_khz) {
-				rdtscll(ntsc);
+				ntsc = rdtsc();
 				max_loops = (cpu_khz << 10) - (ntsc - tsc);
 			} else
 				max_loops--;

+ 3 - 3
arch/x86/kernel/cpu/amd.c

@@ -114,7 +114,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
 		const int K6_BUG_LOOP = 1000000;
 		int n;
 		void (*f_vide)(void);
-		unsigned long d, d2;
+		u64 d, d2;
 
 		printk(KERN_INFO "AMD K6 stepping B detected - ");
 
@@ -125,10 +125,10 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
 
 		n = K6_BUG_LOOP;
 		f_vide = vide;
-		rdtscl(d);
+		d = rdtsc();
 		while (n--)
 			f_vide();
-		rdtscl(d2);
+		d2 = rdtsc();
 		d = d2-d;
 
 		if (d > 20*K6_BUG_LOOP)

+ 4 - 5
arch/x86/kernel/cpu/mcheck/mce.c

@@ -125,7 +125,7 @@ void mce_setup(struct mce *m)
 {
 	memset(m, 0, sizeof(struct mce));
 	m->cpu = m->extcpu = smp_processor_id();
-	rdtscll(m->tsc);
+	m->tsc = rdtsc();
 	/* We hope get_seconds stays lockless */
 	m->time = get_seconds();
 	m->cpuvendor = boot_cpu_data.x86_vendor;
@@ -1029,7 +1029,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 {
 	struct mca_config *cfg = &mca_cfg;
 	struct mce m, *final;
-	enum ctx_state prev_state;
 	int i;
 	int worst = 0;
 	int severity;
@@ -1055,7 +1054,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	int flags = MF_ACTION_REQUIRED;
 	int lmce = 0;
 
-	prev_state = ist_enter(regs);
+	ist_enter(regs);
 
 	this_cpu_inc(mce_exception_count);
 
@@ -1227,7 +1226,7 @@ out:
 	local_irq_disable();
 	ist_end_non_atomic();
 done:
-	ist_exit(regs, prev_state);
+	ist_exit(regs);
 }
 EXPORT_SYMBOL_GPL(do_machine_check);
 
@@ -1784,7 +1783,7 @@ static void collect_tscs(void *data)
 {
 	unsigned long *cpu_tsc = (unsigned long *)data;
 
-	rdtscll(cpu_tsc[smp_processor_id()]);
+	cpu_tsc[smp_processor_id()] = rdtsc();
 }
 
 static int mce_apei_read_done;

+ 2 - 3
arch/x86/kernel/cpu/mcheck/p5.c

@@ -19,10 +19,9 @@ int mce_p5_enabled __read_mostly;
 /* Machine check handler for Pentium class Intel CPUs: */
 static void pentium_machine_check(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state;
 	u32 loaddr, hi, lotype;
 
-	prev_state = ist_enter(regs);
+	ist_enter(regs);
 
 	rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
 	rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
@@ -39,7 +38,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)
 
 	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
 
-	ist_exit(regs, prev_state);
+	ist_exit(regs);
 }
 
 /* Set up machine check reporting for processors with Intel style MCE: */

+ 2 - 2
arch/x86/kernel/cpu/mcheck/winchip.c

@@ -15,12 +15,12 @@
 /* Machine check handler for WinChip C6: */
 static void winchip_machine_check(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state = ist_enter(regs);
+	ist_enter(regs);
 
 	printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
 	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
 
-	ist_exit(regs, prev_state);
+	ist_exit(regs);
 }
 
 /* Set up machine check reporting on the Winchip C6 series */

+ 5 - 1
arch/x86/kernel/cpu/perf_event.c

@@ -2179,6 +2179,7 @@ static unsigned long get_segment_base(unsigned int segment)
 	int idx = segment >> 3;
 
 	if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 		struct ldt_struct *ldt;
 
 		if (idx > LDT_ENTRIES)
@@ -2190,6 +2191,9 @@ static unsigned long get_segment_base(unsigned int segment)
 			return 0;
 
 		desc = &ldt->entries[idx];
+#else
+		return 0;
+#endif
 	} else {
 		if (idx > GDT_ENTRIES)
 			return 0;
@@ -2200,7 +2204,7 @@ static unsigned long get_segment_base(unsigned int segment)
 	return get_desc_base(desc);
 }
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_IA32_EMULATION
 
 #include <asm/compat.h>
 

+ 1 - 1
arch/x86/kernel/espfix_64.c

@@ -110,7 +110,7 @@ static void init_espfix_random(void)
 	 */
 	if (!arch_get_random_long(&rand)) {
 		/* The constant is an arbitrary large prime */
-		rdtscll(rand);
+		rand = rdtsc();
 		rand *= 0xc345c6b72fd16123UL;
 	}
 

+ 2 - 2
arch/x86/kernel/hpet.c

@@ -735,7 +735,7 @@ static int hpet_clocksource_register(void)
 
 	/* Verify whether hpet counter works */
 	t1 = hpet_readl(HPET_COUNTER);
-	rdtscll(start);
+	start = rdtsc();
 
 	/*
 	 * We don't know the TSC frequency yet, but waiting for
@@ -745,7 +745,7 @@ static int hpet_clocksource_register(void)
 	 */
 	do {
 		rep_nop();
-		rdtscll(now);
+		now = rdtsc();
 	} while ((now - start) < 200000UL);
 
 	if (t1 == hpet_readl(HPET_COUNTER)) {

+ 15 - 0
arch/x86/kernel/irq.c

@@ -216,8 +216,23 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 	unsigned vector = ~regs->orig_ax;
 	unsigned irq;
 
+	/*
+	 * NB: Unlike exception entries, IRQ entries do not reliably
+	 * handle context tracking in the low-level entry code.  This is
+	 * because syscall entries execute briefly with IRQs on before
+	 * updating context tracking state, so we can take an IRQ from
+	 * kernel mode with CONTEXT_USER.  The low-level entry code only
+	 * updates the context if we came from user mode, so we won't
+	 * switch to CONTEXT_KERNEL.  We'll fix that once the syscall
+	 * code is cleaned up enough that we can cleanly defer enabling
+	 * IRQs.
+	 */
+
 	entering_irq();
 
+	/* entering_irq() tells RCU that we're not quiescent.  Check it. */
+	rcu_lockdep_assert(rcu_is_watching(), "IRQ failed to wake up RCU");
+
 	irq = __this_cpu_read(vector_irq[vector]);
 
 	if (!handle_irq(irq, regs)) {

+ 5 - 5
arch/x86/kernel/nmi.c

@@ -110,7 +110,7 @@ static void nmi_max_handler(struct irq_work *w)
 		a->handler, whole_msecs, decimal_msecs);
 }
 
-static int nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
+static int nmi_handle(unsigned int type, struct pt_regs *regs)
 {
 	struct nmi_desc *desc = nmi_to_desc(type);
 	struct nmiaction *a;
@@ -213,7 +213,7 @@ static void
 pci_serr_error(unsigned char reason, struct pt_regs *regs)
 {
 	/* check to see if anyone registered against these types of errors */
-	if (nmi_handle(NMI_SERR, regs, false))
+	if (nmi_handle(NMI_SERR, regs))
 		return;
 
 	pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
@@ -247,7 +247,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
 	unsigned long i;
 
 	/* check to see if anyone registered against these types of errors */
-	if (nmi_handle(NMI_IO_CHECK, regs, false))
+	if (nmi_handle(NMI_IO_CHECK, regs))
 		return;
 
 	pr_emerg(
@@ -284,7 +284,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
 	 * as only the first one is ever run (unless it can actually determine
 	 * if it caused the NMI)
 	 */
-	handled = nmi_handle(NMI_UNKNOWN, regs, false);
+	handled = nmi_handle(NMI_UNKNOWN, regs);
 	if (handled) {
 		__this_cpu_add(nmi_stats.unknown, handled);
 		return;
@@ -332,7 +332,7 @@ static void default_do_nmi(struct pt_regs *regs)
 
 	__this_cpu_write(last_nmi_rip, regs->ip);
 
-	handled = nmi_handle(NMI_LOCAL, regs, b2b);
+	handled = nmi_handle(NMI_LOCAL, regs);
 	__this_cpu_add(nmi_stats.normal, handled);
 	if (handled) {
 		/*

+ 0 - 2
arch/x86/kernel/paravirt.c

@@ -351,9 +351,7 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
 	.wbinvd = native_wbinvd,
 	.read_msr = native_read_msr_safe,
 	.write_msr = native_write_msr_safe,
-	.read_tsc = native_read_tsc,
 	.read_pmc = native_read_pmc,
-	.read_tscp = native_read_tscp,
 	.load_tr_desc = native_load_tr_desc,
 	.set_ldt = native_set_ldt,
 	.load_gdt = native_load_gdt,

+ 0 - 2
arch/x86/kernel/paravirt_patch_32.c

@@ -10,7 +10,6 @@ DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
 DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
 DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
 DEF_NATIVE(pv_cpu_ops, clts, "clts");
-DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
 
 #if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
 DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)");
@@ -52,7 +51,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		PATCH_SITE(pv_mmu_ops, read_cr3);
 		PATCH_SITE(pv_mmu_ops, write_cr3);
 		PATCH_SITE(pv_cpu_ops, clts);
-		PATCH_SITE(pv_cpu_ops, read_tsc);
 #if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
 		case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
 			if (pv_is_native_spin_unlock()) {

+ 3 - 0
arch/x86/kernel/process.c

@@ -29,6 +29,7 @@
 #include <asm/debugreg.h>
 #include <asm/nmi.h>
 #include <asm/tlbflush.h>
+#include <asm/vm86.h>
 
 /*
  * per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -110,6 +111,8 @@ void exit_thread(void)
 		kfree(bp);
 	}
 
+	free_vm86(t);
+
 	fpu__drop(fpu);
 }
 

+ 1 - 0
arch/x86/kernel/process_32.c

@@ -53,6 +53,7 @@
 #include <asm/syscalls.h>
 #include <asm/debugreg.h>
 #include <asm/switch_to.h>
+#include <asm/vm86.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread");

+ 4 - 2
arch/x86/kernel/process_64.c

@@ -121,6 +121,7 @@ void __show_regs(struct pt_regs *regs, int all)
 void release_thread(struct task_struct *dead_task)
 {
 	if (dead_task->mm) {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 		if (dead_task->mm->context.ldt) {
 			pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
 				dead_task->comm,
@@ -128,6 +129,7 @@ void release_thread(struct task_struct *dead_task)
 				dead_task->mm->context.ldt->size);
 			BUG();
 		}
+#endif
 	}
 }
 
@@ -248,8 +250,8 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
 			    __USER_CS, __USER_DS, 0);
 }
 
-#ifdef CONFIG_IA32_EMULATION
-void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
+#ifdef CONFIG_COMPAT
+void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp)
 {
 	start_thread_common(regs, new_ip, new_sp,
 			    test_thread_flag(TIF_X32)

+ 75 - 265
arch/x86/kernel/ptrace.c

@@ -37,12 +37,10 @@
 #include <asm/proto.h>
 #include <asm/hw_breakpoint.h>
 #include <asm/traps.h>
+#include <asm/syscall.h>
 
 #include "tls.h"
 
-#define CREATE_TRACE_POINTS
-#include <trace/events/syscalls.h>
-
 enum x86_regset {
 	REGSET_GENERAL,
 	REGSET_FP,
@@ -1123,6 +1121,73 @@ static int genregs32_set(struct task_struct *target,
 	return ret;
 }
 
+static long ia32_arch_ptrace(struct task_struct *child, compat_long_t request,
+			     compat_ulong_t caddr, compat_ulong_t cdata)
+{
+	unsigned long addr = caddr;
+	unsigned long data = cdata;
+	void __user *datap = compat_ptr(data);
+	int ret;
+	__u32 val;
+
+	switch (request) {
+	case PTRACE_PEEKUSR:
+		ret = getreg32(child, addr, &val);
+		if (ret == 0)
+			ret = put_user(val, (__u32 __user *)datap);
+		break;
+
+	case PTRACE_POKEUSR:
+		ret = putreg32(child, addr, data);
+		break;
+
+	case PTRACE_GETREGS:	/* Get all gp regs from the child. */
+		return copy_regset_to_user(child, &user_x86_32_view,
+					   REGSET_GENERAL,
+					   0, sizeof(struct user_regs_struct32),
+					   datap);
+
+	case PTRACE_SETREGS:	/* Set all gp regs in the child. */
+		return copy_regset_from_user(child, &user_x86_32_view,
+					     REGSET_GENERAL, 0,
+					     sizeof(struct user_regs_struct32),
+					     datap);
+
+	case PTRACE_GETFPREGS:	/* Get the child FPU state. */
+		return copy_regset_to_user(child, &user_x86_32_view,
+					   REGSET_FP, 0,
+					   sizeof(struct user_i387_ia32_struct),
+					   datap);
+
+	case PTRACE_SETFPREGS:	/* Set the child FPU state. */
+		return copy_regset_from_user(
+			child, &user_x86_32_view, REGSET_FP,
+			0, sizeof(struct user_i387_ia32_struct), datap);
+
+	case PTRACE_GETFPXREGS:	/* Get the child extended FPU state. */
+		return copy_regset_to_user(child, &user_x86_32_view,
+					   REGSET_XFP, 0,
+					   sizeof(struct user32_fxsr_struct),
+					   datap);
+
+	case PTRACE_SETFPXREGS:	/* Set the child extended FPU state. */
+		return copy_regset_from_user(child, &user_x86_32_view,
+					     REGSET_XFP, 0,
+					     sizeof(struct user32_fxsr_struct),
+					     datap);
+
+	case PTRACE_GET_THREAD_AREA:
+	case PTRACE_SET_THREAD_AREA:
+		return arch_ptrace(child, request, addr, data);
+
+	default:
+		return compat_ptrace_request(child, request, addr, data);
+	}
+
+	return ret;
+}
+#endif /* CONFIG_IA32_EMULATION */
+
 #ifdef CONFIG_X86_X32_ABI
 static long x32_arch_ptrace(struct task_struct *child,
 			    compat_long_t request, compat_ulong_t caddr,
@@ -1211,78 +1276,21 @@ static long x32_arch_ptrace(struct task_struct *child,
 }
 #endif
 
+#ifdef CONFIG_COMPAT
 long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 			compat_ulong_t caddr, compat_ulong_t cdata)
 {
-	unsigned long addr = caddr;
-	unsigned long data = cdata;
-	void __user *datap = compat_ptr(data);
-	int ret;
-	__u32 val;
-
 #ifdef CONFIG_X86_X32_ABI
 #ifdef CONFIG_X86_X32_ABI
 	if (!is_ia32_task())
 		return x32_arch_ptrace(child, request, caddr, cdata);
 #endif
-	switch (request) {
-	case PTRACE_PEEKUSR:
-		ret = getreg32(child, addr, &val);
-		if (ret == 0)
-			ret = put_user(val, (__u32 __user *)datap);
-		break;
-
-	case PTRACE_POKEUSR:
-		ret = putreg32(child, addr, data);
-		break;
-
-	case PTRACE_GETREGS:	/* Get all gp regs from the child. */
-		return copy_regset_to_user(child, &user_x86_32_view,
-					   REGSET_GENERAL,
-					   0, sizeof(struct user_regs_struct32),
-					   datap);
-
-	case PTRACE_SETREGS:	/* Set all gp regs in the child. */
-		return copy_regset_from_user(child, &user_x86_32_view,
-					     REGSET_GENERAL, 0,
-					     sizeof(struct user_regs_struct32),
-					     datap);
-
-	case PTRACE_GETFPREGS:	/* Get the child FPU state. */
-		return copy_regset_to_user(child, &user_x86_32_view,
-					   REGSET_FP, 0,
-					   sizeof(struct user_i387_ia32_struct),
-					   datap);
-
-	case PTRACE_SETFPREGS:	/* Set the child FPU state. */
-		return copy_regset_from_user(
-			child, &user_x86_32_view, REGSET_FP,
-			0, sizeof(struct user_i387_ia32_struct), datap);
-
-	case PTRACE_GETFPXREGS:	/* Get the child extended FPU state. */
-		return copy_regset_to_user(child, &user_x86_32_view,
-					   REGSET_XFP, 0,
-					   sizeof(struct user32_fxsr_struct),
-					   datap);
-
-	case PTRACE_SETFPXREGS:	/* Set the child extended FPU state. */
-		return copy_regset_from_user(child, &user_x86_32_view,
-					     REGSET_XFP, 0,
-					     sizeof(struct user32_fxsr_struct),
-					     datap);
-
-	case PTRACE_GET_THREAD_AREA:
-	case PTRACE_SET_THREAD_AREA:
-		return arch_ptrace(child, request, addr, data);
-
-	default:
-		return compat_ptrace_request(child, request, addr, data);
-	}
-
-	return ret;
+#ifdef CONFIG_IA32_EMULATION
+	return ia32_arch_ptrace(child, request, caddr, cdata);
+#else
+	return 0;
+#endif
 }
-
-#endif	/* CONFIG_IA32_EMULATION */
+#endif	/* CONFIG_COMPAT */
 
 #ifdef CONFIG_X86_64
 
@@ -1434,201 +1442,3 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
 	/* Send us the fake SIGTRAP */
 	force_sig_info(SIGTRAP, &info, tsk);
 }
-
-static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
-{
-#ifdef CONFIG_X86_64
-	if (arch == AUDIT_ARCH_X86_64) {
-		audit_syscall_entry(regs->orig_ax, regs->di,
-				    regs->si, regs->dx, regs->r10);
-	} else
-#endif
-	{
-		audit_syscall_entry(regs->orig_ax, regs->bx,
-				    regs->cx, regs->dx, regs->si);
-	}
-}
-
-/*
- * We can return 0 to resume the syscall or anything else to go to phase
- * 2.  If we resume the syscall, we need to put something appropriate in
- * regs->orig_ax.
- *
- * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
- * are fully functional.
- *
- * For phase 2's benefit, our return value is:
- * 0:			resume the syscall
- * 1:			go to phase 2; no seccomp phase 2 needed
- * anything else:	go to phase 2; pass return value to seccomp
- */
-unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
-{
-	unsigned long ret = 0;
-	u32 work;
-
-	BUG_ON(regs != task_pt_regs(current));
-
-	work = ACCESS_ONCE(current_thread_info()->flags) &
-		_TIF_WORK_SYSCALL_ENTRY;
-
-	/*
-	 * If TIF_NOHZ is set, we are required to call user_exit() before
-	 * doing anything that could touch RCU.
-	 */
-	if (work & _TIF_NOHZ) {
-		user_exit();
-		work &= ~_TIF_NOHZ;
-	}
-
-#ifdef CONFIG_SECCOMP
-	/*
-	 * Do seccomp first -- it should minimize exposure of other
-	 * code, and keeping seccomp fast is probably more valuable
-	 * than the rest of this.
-	 */
-	if (work & _TIF_SECCOMP) {
-		struct seccomp_data sd;
-
-		sd.arch = arch;
-		sd.nr = regs->orig_ax;
-		sd.instruction_pointer = regs->ip;
-#ifdef CONFIG_X86_64
-		if (arch == AUDIT_ARCH_X86_64) {
-			sd.args[0] = regs->di;
-			sd.args[1] = regs->si;
-			sd.args[2] = regs->dx;
-			sd.args[3] = regs->r10;
-			sd.args[4] = regs->r8;
-			sd.args[5] = regs->r9;
-		} else
-#endif
-		{
-			sd.args[0] = regs->bx;
-			sd.args[1] = regs->cx;
-			sd.args[2] = regs->dx;
-			sd.args[3] = regs->si;
-			sd.args[4] = regs->di;
-			sd.args[5] = regs->bp;
-		}
-
-		BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0);
-		BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1);
-
-		ret = seccomp_phase1(&sd);
-		if (ret == SECCOMP_PHASE1_SKIP) {
-			regs->orig_ax = -1;
-			ret = 0;
-		} else if (ret != SECCOMP_PHASE1_OK) {
-			return ret;  /* Go directly to phase 2 */
-		}
-
-		work &= ~_TIF_SECCOMP;
-	}
-#endif
-
-	/* Do our best to finish without phase 2. */
-	if (work == 0)
-		return ret;  /* seccomp and/or nohz only (ret == 0 here) */
-
-#ifdef CONFIG_AUDITSYSCALL
-	if (work == _TIF_SYSCALL_AUDIT) {
-		/*
-		 * If there is no more work to be done except auditing,
-		 * then audit in phase 1.  Phase 2 always audits, so, if
-		 * we audit here, then we can't go on to phase 2.
-		 */
-		do_audit_syscall_entry(regs, arch);
-		return 0;
-	}
-#endif
-
-	return 1;  /* Something is enabled that we can't handle in phase 1 */
-}
-
-/* Returns the syscall nr to run (which should match regs->orig_ax). */
-long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
-				unsigned long phase1_result)
-{
-	long ret = 0;
-	u32 work = ACCESS_ONCE(current_thread_info()->flags) &
-		_TIF_WORK_SYSCALL_ENTRY;
-
-	BUG_ON(regs != task_pt_regs(current));
-
-	/*
-	 * If we stepped into a sysenter/syscall insn, it trapped in
-	 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
-	 * If user-mode had set TF itself, then it's still clear from
-	 * do_debug() and we need to set it again to restore the user
-	 * state.  If we entered on the slow path, TF was already set.
-	 */
-	if (work & _TIF_SINGLESTEP)
-		regs->flags |= X86_EFLAGS_TF;
-
-#ifdef CONFIG_SECCOMP
-	/*
-	 * Call seccomp_phase2 before running the other hooks so that
-	 * they can see any changes made by a seccomp tracer.
-	 */
-	if (phase1_result > 1 && seccomp_phase2(phase1_result)) {
-		/* seccomp failures shouldn't expose any additional code. */
-		return -1;
-	}
-#endif
-
-	if (unlikely(work & _TIF_SYSCALL_EMU))
-		ret = -1L;
-
-	if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
-	    tracehook_report_syscall_entry(regs))
-		ret = -1L;
-
-	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
-		trace_sys_enter(regs, regs->orig_ax);
-
-	do_audit_syscall_entry(regs, arch);
-
-	return ret ?: regs->orig_ax;
-}
-
-long syscall_trace_enter(struct pt_regs *regs)
-{
-	u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
-	unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch);
-
-	if (phase1_result == 0)
-		return regs->orig_ax;
-	else
-		return syscall_trace_enter_phase2(regs, arch, phase1_result);
-}
-
-void syscall_trace_leave(struct pt_regs *regs)
-{
-	bool step;
-
-	/*
-	 * We may come here right after calling schedule_user()
-	 * or do_notify_resume(), in which case we can be in RCU
-	 * user mode.
-	 */
-	user_exit();
-
-	audit_syscall_exit(regs);
-
-	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
-		trace_sys_exit(regs, regs->ax);
-
-	/*
-	 * If TIF_SYSCALL_EMU is set, we only get here because of
-	 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
-	 * We already reported this syscall instruction in
-	 * syscall_trace_enter().
-	 */
-	step = unlikely(test_thread_flag(TIF_SINGLESTEP)) &&
-			!test_thread_flag(TIF_SYSCALL_EMU);
-	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
-		tracehook_report_syscall_exit(regs, step);
-
-	user_enter();
-}

+ 5 - 28
arch/x86/kernel/signal.c

@@ -31,11 +31,11 @@
 #include <asm/vdso.h>
 #include <asm/mce.h>
 #include <asm/sighandling.h>
+#include <asm/vm86.h>
 
 #ifdef CONFIG_X86_64
 #include <asm/proto.h>
 #include <asm/ia32_unistd.h>
-#include <asm/sys_ia32.h>
 #endif /* CONFIG_X86_64 */
 
 #include <asm/syscall.h>
@@ -636,6 +636,9 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 	bool stepping, failed;
 	struct fpu *fpu = &current->thread.fpu;
 
+	if (v8086_mode(regs))
+		save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL);
+
 	/* Are we from a system call? */
 	if (syscall_get_nr(current, regs) >= 0) {
 		/* If so, check system call restarting.. */
@@ -701,7 +704,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
  * want to handle. Thus you cannot kill init even with a SIGKILL even by
  * mistake.
  */
-static void do_signal(struct pt_regs *regs)
+void do_signal(struct pt_regs *regs)
 {
 	struct ksignal ksig;
 
@@ -736,32 +739,6 @@ static void do_signal(struct pt_regs *regs)
 	restore_saved_sigmask();
 }
 
-/*
- * notification of userspace execution resumption
- * - triggered by the TIF_WORK_MASK flags
- */
-__visible void
-do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
-{
-	user_exit();
-
-	if (thread_info_flags & _TIF_UPROBE)
-		uprobe_notify_resume(regs);
-
-	/* deal with pending signal delivery */
-	if (thread_info_flags & _TIF_SIGPENDING)
-		do_signal(regs);
-
-	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
-		clear_thread_flag(TIF_NOTIFY_RESUME);
-		tracehook_notify_resume(regs);
-	}
-	if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
-		fire_user_return_notifiers();
-
-	user_enter();
-}
-
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
 {
 	struct task_struct *me = current;

+ 95 - 0
arch/x86/kernel/signal_compat.c

@@ -0,0 +1,95 @@
+#include <linux/compat.h>
+#include <linux/uaccess.h>
+
+int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
+{
+	int err = 0;
+	bool ia32 = test_thread_flag(TIF_IA32);
+
+	if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
+		return -EFAULT;
+
+	put_user_try {
+		/* If you change siginfo_t structure, please make sure that
+		   this code is fixed accordingly.
+		   It should never copy any pad contained in the structure
+		   to avoid security leaks, but must copy the generic
+		   3 ints plus the relevant union member.  */
+		put_user_ex(from->si_signo, &to->si_signo);
+		put_user_ex(from->si_errno, &to->si_errno);
+		put_user_ex((short)from->si_code, &to->si_code);
+
+		if (from->si_code < 0) {
+			put_user_ex(from->si_pid, &to->si_pid);
+			put_user_ex(from->si_uid, &to->si_uid);
+			put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr);
+		} else {
+			/*
+			 * First 32bits of unions are always present:
+			 * si_pid === si_band === si_tid === si_addr(LS half)
+			 */
+			put_user_ex(from->_sifields._pad[0],
+					  &to->_sifields._pad[0]);
+			switch (from->si_code >> 16) {
+			case __SI_FAULT >> 16:
+				break;
+			case __SI_SYS >> 16:
+				put_user_ex(from->si_syscall, &to->si_syscall);
+				put_user_ex(from->si_arch, &to->si_arch);
+				break;
+			case __SI_CHLD >> 16:
+				if (ia32) {
+					put_user_ex(from->si_utime, &to->si_utime);
+					put_user_ex(from->si_stime, &to->si_stime);
+				} else {
+					put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime);
+					put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime);
+				}
+				put_user_ex(from->si_status, &to->si_status);
+				/* FALL THROUGH */
+			default:
+			case __SI_KILL >> 16:
+				put_user_ex(from->si_uid, &to->si_uid);
+				break;
+			case __SI_POLL >> 16:
+				put_user_ex(from->si_fd, &to->si_fd);
+				break;
+			case __SI_TIMER >> 16:
+				put_user_ex(from->si_overrun, &to->si_overrun);
+				put_user_ex(ptr_to_compat(from->si_ptr),
+					    &to->si_ptr);
+				break;
+				 /* This is not generated by the kernel as of now.  */
+			case __SI_RT >> 16:
+			case __SI_MESGQ >> 16:
+				put_user_ex(from->si_uid, &to->si_uid);
+				put_user_ex(from->si_int, &to->si_int);
+				break;
+			}
+		}
+	} put_user_catch(err);
+
+	return err;
+}
+
+int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
+{
+	int err = 0;
+	u32 ptr32;
+
+	if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t)))
+		return -EFAULT;
+
+	get_user_try {
+		get_user_ex(to->si_signo, &from->si_signo);
+		get_user_ex(to->si_errno, &from->si_errno);
+		get_user_ex(to->si_code, &from->si_code);
+
+		get_user_ex(to->si_pid, &from->si_pid);
+		get_user_ex(to->si_uid, &from->si_uid);
+		get_user_ex(ptr32, &from->si_ptr);
+		to->si_ptr = compat_ptr(ptr32);
+	} get_user_catch(err);
+
+	return err;
+}

+ 2 - 0
arch/x86/kernel/step.c

@@ -18,6 +18,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
 		return addr;
 	}
 
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 	/*
 	 * We'll assume that the code segments in the GDT
 	 * are all zero-based. That is largely true: the
@@ -45,6 +46,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
 		}
 		mutex_unlock(&child->mm->context.lock);
 	}
+#endif
 
 	return addr;
 }

+ 1 - 6
arch/x86/kernel/trace_clock.c

@@ -12,10 +12,5 @@
  */
 u64 notrace trace_clock_x86_tsc(void)
 {
-	u64 ret;
-
-	rdtsc_barrier();
-	rdtscll(ret);
-
-	return ret;
+	return rdtsc_ordered();
 }

+ 29 - 59
arch/x86/kernel/traps.c

@@ -62,6 +62,7 @@
 #include <asm/fpu/xstate.h>
 #include <asm/trace/mpx.h>
 #include <asm/mpx.h>
+#include <asm/vm86.h>
 
 #ifdef CONFIG_X86_64
 #include <asm/x86_init.h>
@@ -108,13 +109,10 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 	preempt_count_dec();
 }
 
-enum ctx_state ist_enter(struct pt_regs *regs)
+void ist_enter(struct pt_regs *regs)
 {
-	enum ctx_state prev_state;
-
 	if (user_mode(regs)) {
-		/* Other than that, we're just an exception. */
-		prev_state = exception_enter();
+		CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	} else {
 		/*
 		 * We might have interrupted pretty much anything.  In
@@ -123,32 +121,25 @@ enum ctx_state ist_enter(struct pt_regs *regs)
 		 * but we need to notify RCU.
 		 */
 		rcu_nmi_enter();
-		prev_state = CONTEXT_KERNEL;  /* the value is irrelevant. */
 	}
 
 	/*
-	 * We are atomic because we're on the IST stack (or we're on x86_32,
-	 * in which case we still shouldn't schedule).
-	 *
-	 * This must be after exception_enter(), because exception_enter()
-	 * won't do anything if in_interrupt() returns true.
+	 * We are atomic because we're on the IST stack; or we're on
+	 * x86_32, in which case we still shouldn't schedule; or we're
+	 * on x86_64 and entered from user mode, in which case we're
+	 * still atomic unless ist_begin_non_atomic is called.
 	 */
 	preempt_count_add(HARDIRQ_OFFSET);
 
 	/* This code is a bit fragile.  Test it. */
 	rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work");
-
-	return prev_state;
 }
 
-void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
+void ist_exit(struct pt_regs *regs)
 {
-	/* Must be before exception_exit. */
 	preempt_count_sub(HARDIRQ_OFFSET);
 
-	if (user_mode(regs))
-		return exception_exit(prev_state);
-	else
+	if (!user_mode(regs))
 		rcu_nmi_exit();
 }
 
@@ -162,7 +153,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
  * a double fault, it can be safe to schedule.  ist_begin_non_atomic()
  * begins a non-atomic section within an ist_enter()/ist_exit() region.
  * Callers are responsible for enabling interrupts themselves inside
- * the non-atomic section, and callers must call is_end_non_atomic()
+ * the non-atomic section, and callers must call ist_end_non_atomic()
  * before ist_exit().
  */
 void ist_begin_non_atomic(struct pt_regs *regs)
@@ -289,17 +280,16 @@ NOKPROBE_SYMBOL(do_trap);
 static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
 			  unsigned long trapnr, int signr)
 {
-	enum ctx_state prev_state = exception_enter();
 	siginfo_t info;
 
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+
 	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
 			NOTIFY_STOP) {
 		conditional_sti(regs);
 		do_trap(trapnr, signr, str, regs, error_code,
 			fill_trap_info(regs, signr, trapnr, &info));
 	}
-
-	exception_exit(prev_state);
 }
 
 #define DO_ERROR(trapnr, signr, str, name)				\
@@ -351,7 +341,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 	}
 	}
 #endif
 #endif
 
 
-	ist_enter(regs);  /* Discard prev_state because we won't return. */
+	ist_enter(regs);
 	notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
 	notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
 
 
 	tsk->thread.error_code = error_code;
 	tsk->thread.error_code = error_code;
@@ -371,14 +361,13 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 
 
 dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 {
 {
-	enum ctx_state prev_state;
 	const struct bndcsr *bndcsr;
 	const struct bndcsr *bndcsr;
 	siginfo_t *info;
 	siginfo_t *info;
 
 
-	prev_state = exception_enter();
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	if (notify_die(DIE_TRAP, "bounds", regs, error_code,
 	if (notify_die(DIE_TRAP, "bounds", regs, error_code,
 			X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
 			X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
-		goto exit;
+		return;
 	conditional_sti(regs);
 	conditional_sti(regs);
 
 
 	if (!user_mode(regs))
 	if (!user_mode(regs))
@@ -435,9 +424,8 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 		die("bounds", regs, error_code);
 		die("bounds", regs, error_code);
 	}
 	}
 
 
-exit:
-	exception_exit(prev_state);
 	return;
 	return;
+
 exit_trap:
 exit_trap:
 	/*
 	/*
 	 * This path out is for all the cases where we could not
 	 * This path out is for all the cases where we could not
@@ -447,35 +435,33 @@ exit_trap:
 	 * time..
 	 * time..
 	 */
 	 */
 	do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL);
 	do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL);
-	exception_exit(prev_state);
 }
 }
 
 
 dotraplinkage void
 dotraplinkage void
 do_general_protection(struct pt_regs *regs, long error_code)
 do_general_protection(struct pt_regs *regs, long error_code)
 {
 {
 	struct task_struct *tsk;
 	struct task_struct *tsk;
-	enum ctx_state prev_state;
 
 
-	prev_state = exception_enter();
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	conditional_sti(regs);
 	conditional_sti(regs);
 
 
 	if (v8086_mode(regs)) {
 	if (v8086_mode(regs)) {
 		local_irq_enable();
 		local_irq_enable();
 		handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
 		handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
-		goto exit;
+		return;
 	}
 	}
 
 
 	tsk = current;
 	tsk = current;
 	if (!user_mode(regs)) {
 	if (!user_mode(regs)) {
 		if (fixup_exception(regs))
 		if (fixup_exception(regs))
-			goto exit;
+			return;
 
 
 		tsk->thread.error_code = error_code;
 		tsk->thread.error_code = error_code;
 		tsk->thread.trap_nr = X86_TRAP_GP;
 		tsk->thread.trap_nr = X86_TRAP_GP;
 		if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
 		if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
 			       X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
 			       X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
 			die("general protection fault", regs, error_code);
 			die("general protection fault", regs, error_code);
-		goto exit;
+		return;
 	}
 	}
 
 
 	tsk->thread.error_code = error_code;
 	tsk->thread.error_code = error_code;
@@ -491,16 +477,12 @@ do_general_protection(struct pt_regs *regs, long error_code)
 	}
 	}
 
 
 	force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
 	force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
-exit:
-	exception_exit(prev_state);
 }
 }
 NOKPROBE_SYMBOL(do_general_protection);
 NOKPROBE_SYMBOL(do_general_protection);
 
 
 /* May run on IST stack. */
 /* May run on IST stack. */
 dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 {
 {
-	enum ctx_state prev_state;
-
 #ifdef CONFIG_DYNAMIC_FTRACE
 #ifdef CONFIG_DYNAMIC_FTRACE
 	/*
 	/*
 	 * ftrace must be first, everything else may cause a recursive crash.
 	 * ftrace must be first, everything else may cause a recursive crash.
@@ -513,7 +495,8 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 	if (poke_int3_handler(regs))
 	if (poke_int3_handler(regs))
 		return;
 		return;
 
 
-	prev_state = ist_enter(regs);
+	ist_enter(regs);
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
 #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
 	if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
 	if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
 				SIGTRAP) == NOTIFY_STOP)
 				SIGTRAP) == NOTIFY_STOP)
@@ -539,7 +522,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 	preempt_conditional_cli(regs);
 	preempt_conditional_cli(regs);
 	debug_stack_usage_dec();
 	debug_stack_usage_dec();
 exit:
 exit:
-	ist_exit(regs, prev_state);
+	ist_exit(regs);
 }
 }
 NOKPROBE_SYMBOL(do_int3);
 NOKPROBE_SYMBOL(do_int3);
 
 
@@ -615,12 +598,11 @@ NOKPROBE_SYMBOL(fixup_bad_iret);
 dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
 dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
 {
 {
 	struct task_struct *tsk = current;
 	struct task_struct *tsk = current;
-	enum ctx_state prev_state;
 	int user_icebp = 0;
 	int user_icebp = 0;
 	unsigned long dr6;
 	unsigned long dr6;
 	int si_code;
 	int si_code;
 
 
-	prev_state = ist_enter(regs);
+	ist_enter(regs);
 
 
 	get_debugreg(dr6, 6);
 	get_debugreg(dr6, 6);
 
 
@@ -695,7 +677,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
 	debug_stack_usage_dec();
 	debug_stack_usage_dec();
 
 
 exit:
 exit:
-	ist_exit(regs, prev_state);
+	ist_exit(regs);
 }
 }
 NOKPROBE_SYMBOL(do_debug);
 NOKPROBE_SYMBOL(do_debug);
 
 
@@ -747,21 +729,15 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 
 
 dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 {
 {
-	enum ctx_state prev_state;
-
-	prev_state = exception_enter();
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	math_error(regs, error_code, X86_TRAP_MF);
 	math_error(regs, error_code, X86_TRAP_MF);
-	exception_exit(prev_state);
 }
 }
 
 
 dotraplinkage void
 dotraplinkage void
 do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 {
 {
-	enum ctx_state prev_state;
-
-	prev_state = exception_enter();
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	math_error(regs, error_code, X86_TRAP_XF);
 	math_error(regs, error_code, X86_TRAP_XF);
-	exception_exit(prev_state);
 }
 }
 
 
 dotraplinkage void
 dotraplinkage void
@@ -773,9 +749,7 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
 dotraplinkage void
 dotraplinkage void
 do_device_not_available(struct pt_regs *regs, long error_code)
 do_device_not_available(struct pt_regs *regs, long error_code)
 {
 {
-	enum ctx_state prev_state;
-
-	prev_state = exception_enter();
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	BUG_ON(use_eager_fpu());
 	BUG_ON(use_eager_fpu());
 
 
 #ifdef CONFIG_MATH_EMULATION
 #ifdef CONFIG_MATH_EMULATION
@@ -786,7 +760,6 @@ do_device_not_available(struct pt_regs *regs, long error_code)
 
 
 		info.regs = regs;
 		info.regs = regs;
 		math_emulate(&info);
 		math_emulate(&info);
-		exception_exit(prev_state);
 		return;
 		return;
 	}
 	}
 #endif
 #endif
@@ -794,7 +767,6 @@ do_device_not_available(struct pt_regs *regs, long error_code)
 #ifdef CONFIG_X86_32
 #ifdef CONFIG_X86_32
 	conditional_sti(regs);
 	conditional_sti(regs);
 #endif
 #endif
-	exception_exit(prev_state);
 }
 }
 NOKPROBE_SYMBOL(do_device_not_available);
 NOKPROBE_SYMBOL(do_device_not_available);
 
 
@@ -802,9 +774,8 @@ NOKPROBE_SYMBOL(do_device_not_available);
 dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 {
 {
 	siginfo_t info;
 	siginfo_t info;
-	enum ctx_state prev_state;
 
 
-	prev_state = exception_enter();
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	local_irq_enable();
 	local_irq_enable();
 
 
 	info.si_signo = SIGILL;
 	info.si_signo = SIGILL;
@@ -816,7 +787,6 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 		do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
 		do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
 			&info);
 			&info);
 	}
 	}
-	exception_exit(prev_state);
 }
 }
 #endif
 #endif
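
For context, a minimal sketch (not part of this commit) of the handler pattern the hunks above converge on; do_example_trap and X86_TRAP_EX are made-up names used only for illustration:

dotraplinkage void do_example_trap(struct pt_regs *regs, long error_code)
{
	ist_enter(regs);				/* no prev_state to thread through any more */
	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);	/* entry code has already switched context */

	if (notify_die(DIE_TRAP, "example", regs, error_code,
		       X86_TRAP_EX, SIGSEGV) != NOTIFY_STOP)
		do_trap(X86_TRAP_EX, SIGSEGV, "example", regs, error_code, NULL);

	ist_exit(regs);
}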
 
 

+ 3 - 9
arch/x86/kernel/tsc.c

@@ -248,7 +248,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 
 	data = cyc2ns_write_begin(cpu);
 
-	rdtscll(tsc_now);
+	tsc_now = rdtsc();
 	ns_now = cycles_2_ns(tsc_now);
 
 	/*
@@ -290,7 +290,7 @@ u64 native_sched_clock(void)
 	}
 
 	/* read the Time Stamp Counter: */
-	rdtscll(tsc_now);
+	tsc_now = rdtsc();
 
 	/* return the value in ns */
 	return cycles_2_ns(tsc_now);
@@ -308,12 +308,6 @@ unsigned long long
 sched_clock(void) __attribute__((alias("native_sched_clock")));
 #endif
 
-unsigned long long native_read_tsc(void)
-{
-	return __native_read_tsc();
-}
-EXPORT_SYMBOL(native_read_tsc);
-
 int check_tsc_unstable(void)
 {
 	return tsc_unstable;
@@ -976,7 +970,7 @@ static struct clocksource clocksource_tsc;
  */
 static cycle_t read_tsc(struct clocksource *cs)
 {
-	return (cycle_t)get_cycles();
+	return (cycle_t)rdtsc_ordered();
 }
 
 /*
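
Illustrative sketch (assumed helper, not from this commit) of the accessors these hunks switch to: rdtsc() returns the raw 64-bit counter, while rdtsc_ordered() additionally fences so the read is not reordered with earlier loads:

static u64 measure_cycles(void (*fn)(void))
{
	u64 start, end;

	start = rdtsc_ordered();	/* ordered read replaces rdtsc_barrier() + rdtscl() */
	fn();
	end = rdtsc_ordered();

	return end - start;		/* full 64-bit delta, no 32-bit truncation */
}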

+ 6 - 8
arch/x86/kernel/tsc_sync.c

@@ -39,16 +39,15 @@ static cycles_t max_warp;
 static int nr_warps;
 
 /*
- * TSC-warp measurement loop running on both CPUs:
+ * TSC-warp measurement loop running on both CPUs.  This is not called
+ * if there is no TSC.
  */
 static void check_tsc_warp(unsigned int timeout)
 {
 	cycles_t start, now, prev, end;
 	int i;
 
-	rdtsc_barrier();
-	start = get_cycles();
-	rdtsc_barrier();
+	start = rdtsc_ordered();
 	/*
 	 * The measurement runs for 'timeout' msecs:
 	 */
@@ -63,9 +62,7 @@ static void check_tsc_warp(unsigned int timeout)
 		 */
 		arch_spin_lock(&sync_lock);
 		prev = last_tsc;
-		rdtsc_barrier();
-		now = get_cycles();
-		rdtsc_barrier();
+		now = rdtsc_ordered();
 		last_tsc = now;
 		arch_spin_unlock(&sync_lock);
 
@@ -126,7 +123,7 @@ void check_tsc_sync_source(int cpu)
 
 	/*
 	 * No need to check if we already know that the TSC is not
-	 * synchronized:
+	 * synchronized or if we have no TSC.
 	 */
 	if (unsynchronized_tsc())
 		return;
@@ -190,6 +187,7 @@ void check_tsc_sync_target(void)
 {
 	int cpus = 2;
 
+	/* Also aborts if there is no TSC. */
 	if (unsynchronized_tsc() || tsc_clocksource_reliable)
 		return;


+ 183 - 190
arch/x86/kernel/vm86_32.c

@@ -44,11 +44,14 @@
 #include <linux/ptrace.h>
 #include <linux/ptrace.h>
 #include <linux/audit.h>
 #include <linux/audit.h>
 #include <linux/stddef.h>
 #include <linux/stddef.h>
+#include <linux/slab.h>
 
 
 #include <asm/uaccess.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 #include <asm/io.h>
 #include <asm/tlbflush.h>
 #include <asm/tlbflush.h>
 #include <asm/irq.h>
 #include <asm/irq.h>
+#include <asm/traps.h>
+#include <asm/vm86.h>
 
 
 /*
 /*
  * Known problems:
  * Known problems:
@@ -66,10 +69,6 @@
  */
  */
 
 
 
 
-#define KVM86	((struct kernel_vm86_struct *)regs)
-#define VMPI	KVM86->vm86plus
-
-
 /*
 /*
  * 8- and 16-bit register defines..
  * 8- and 16-bit register defines..
  */
  */
@@ -81,8 +80,8 @@
 /*
 /*
  * virtual flags (16 and 32-bit versions)
  * virtual flags (16 and 32-bit versions)
  */
  */
-#define VFLAGS	(*(unsigned short *)&(current->thread.v86flags))
-#define VEFLAGS	(current->thread.v86flags)
+#define VFLAGS	(*(unsigned short *)&(current->thread.vm86->veflags))
+#define VEFLAGS	(current->thread.vm86->veflags)
 
 
 #define set_flags(X, new, mask) \
 #define set_flags(X, new, mask) \
 ((X) = ((X) & ~(mask)) | ((new) & (mask)))
 ((X) = ((X) & ~(mask)) | ((new) & (mask)))
@@ -90,46 +89,13 @@
 #define SAFE_MASK	(0xDD5)
 #define SAFE_MASK	(0xDD5)
 #define RETURN_MASK	(0xDFF)
 #define RETURN_MASK	(0xDFF)
 
 
-/* convert kernel_vm86_regs to vm86_regs */
-static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
-				  const struct kernel_vm86_regs *regs)
-{
-	int ret = 0;
-
-	/*
-	 * kernel_vm86_regs is missing gs, so copy everything up to
-	 * (but not including) orig_eax, and then rest including orig_eax.
-	 */
-	ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_ax));
-	ret += copy_to_user(&user->orig_eax, &regs->pt.orig_ax,
-			    sizeof(struct kernel_vm86_regs) -
-			    offsetof(struct kernel_vm86_regs, pt.orig_ax));
-
-	return ret;
-}
-
-/* convert vm86_regs to kernel_vm86_regs */
-static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
-				    const struct vm86_regs __user *user,
-				    unsigned extra)
-{
-	int ret = 0;
-
-	/* copy ax-fs inclusive */
-	ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_ax));
-	/* copy orig_ax-__gsh+extra */
-	ret += copy_from_user(&regs->pt.orig_ax, &user->orig_eax,
-			      sizeof(struct kernel_vm86_regs) -
-			      offsetof(struct kernel_vm86_regs, pt.orig_ax) +
-			      extra);
-	return ret;
-}
-
-struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
+void save_v86_state(struct kernel_vm86_regs *regs, int retval)
 {
 {
 	struct tss_struct *tss;
 	struct tss_struct *tss;
-	struct pt_regs *ret;
-	unsigned long tmp;
+	struct task_struct *tsk = current;
+	struct vm86plus_struct __user *user;
+	struct vm86 *vm86 = current->thread.vm86;
+	long err = 0;
 
 
 	/*
 	/*
 	 * This gets called from entry.S with interrupts disabled, but
 	 * This gets called from entry.S with interrupts disabled, but
@@ -138,31 +104,57 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
 	 */
 	 */
 	local_irq_enable();
 	local_irq_enable();
 
 
-	if (!current->thread.vm86_info) {
-		pr_alert("no vm86_info: BAD\n");
+	if (!vm86 || !vm86->user_vm86) {
+		pr_alert("no user_vm86: BAD\n");
 		do_exit(SIGSEGV);
 		do_exit(SIGSEGV);
 	}
 	}
-	set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask);
-	tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs, regs);
-	tmp += put_user(current->thread.screen_bitmap, &current->thread.vm86_info->screen_bitmap);
-	if (tmp) {
-		pr_alert("could not access userspace vm86_info\n");
+	set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask);
+	user = vm86->user_vm86;
+
+	if (!access_ok(VERIFY_WRITE, user, vm86->vm86plus.is_vm86pus ?
+		       sizeof(struct vm86plus_struct) :
+		       sizeof(struct vm86_struct))) {
+		pr_alert("could not access userspace vm86 info\n");
+		do_exit(SIGSEGV);
+	}
+
+	put_user_try {
+		put_user_ex(regs->pt.bx, &user->regs.ebx);
+		put_user_ex(regs->pt.cx, &user->regs.ecx);
+		put_user_ex(regs->pt.dx, &user->regs.edx);
+		put_user_ex(regs->pt.si, &user->regs.esi);
+		put_user_ex(regs->pt.di, &user->regs.edi);
+		put_user_ex(regs->pt.bp, &user->regs.ebp);
+		put_user_ex(regs->pt.ax, &user->regs.eax);
+		put_user_ex(regs->pt.ip, &user->regs.eip);
+		put_user_ex(regs->pt.cs, &user->regs.cs);
+		put_user_ex(regs->pt.flags, &user->regs.eflags);
+		put_user_ex(regs->pt.sp, &user->regs.esp);
+		put_user_ex(regs->pt.ss, &user->regs.ss);
+		put_user_ex(regs->es, &user->regs.es);
+		put_user_ex(regs->ds, &user->regs.ds);
+		put_user_ex(regs->fs, &user->regs.fs);
+		put_user_ex(regs->gs, &user->regs.gs);
+
+		put_user_ex(vm86->screen_bitmap, &user->screen_bitmap);
+	} put_user_catch(err);
+	if (err) {
+		pr_alert("could not access userspace vm86 info\n");
 		do_exit(SIGSEGV);
 		do_exit(SIGSEGV);
 	}
 	}
 
 
 	tss = &per_cpu(cpu_tss, get_cpu());
 	tss = &per_cpu(cpu_tss, get_cpu());
-	current->thread.sp0 = current->thread.saved_sp0;
-	current->thread.sysenter_cs = __KERNEL_CS;
-	load_sp0(tss, &current->thread);
-	current->thread.saved_sp0 = 0;
+	tsk->thread.sp0 = vm86->saved_sp0;
+	tsk->thread.sysenter_cs = __KERNEL_CS;
+	load_sp0(tss, &tsk->thread);
+	vm86->saved_sp0 = 0;
 	put_cpu();
 	put_cpu();
 
 
-	ret = KVM86->regs32;
+	memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs));
 
 
-	ret->fs = current->thread.saved_fs;
-	set_user_gs(ret, current->thread.saved_gs);
+	lazy_load_gs(vm86->regs32.gs);
 
 
-	return ret;
+	regs->pt.ax = retval;
 }
 }
 
 
 static void mark_screen_rdonly(struct mm_struct *mm)
 static void mark_screen_rdonly(struct mm_struct *mm)
@@ -200,45 +192,16 @@ out:
 
 
 
 
 static int do_vm86_irq_handling(int subfunction, int irqnumber);
 static int do_vm86_irq_handling(int subfunction, int irqnumber);
-static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
+static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus);
 
 
-SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, v86)
+SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, user_vm86)
 {
 {
-	struct kernel_vm86_struct info; /* declare this _on top_,
-					 * this avoids wasting of stack space.
-					 * This remains on the stack until we
-					 * return to 32 bit user space.
-					 */
-	struct task_struct *tsk = current;
-	int tmp;
-
-	if (tsk->thread.saved_sp0)
-		return -EPERM;
-	tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
-				       offsetof(struct kernel_vm86_struct, vm86plus) -
-				       sizeof(info.regs));
-	if (tmp)
-		return -EFAULT;
-	memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus);
-	info.regs32 = current_pt_regs();
-	tsk->thread.vm86_info = v86;
-	do_sys_vm86(&info, tsk);
-	return 0;	/* we never return here */
+	return do_sys_vm86((struct vm86plus_struct __user *) user_vm86, false);
 }
 }
 
 
 
 
 SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
 SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
 {
 {
-	struct kernel_vm86_struct info; /* declare this _on top_,
-					 * this avoids wasting of stack space.
-					 * This remains on the stack until we
-					 * return to 32 bit user space.
-					 */
-	struct task_struct *tsk;
-	int tmp;
-	struct vm86plus_struct __user *v86;
-
-	tsk = current;
 	switch (cmd) {
 	switch (cmd) {
 	case VM86_REQUEST_IRQ:
 	case VM86_REQUEST_IRQ:
 	case VM86_FREE_IRQ:
 	case VM86_FREE_IRQ:
@@ -256,114 +219,133 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
 	}
 	}
 
 
 	/* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
 	/* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
-	if (tsk->thread.saved_sp0)
-		return -EPERM;
-	v86 = (struct vm86plus_struct __user *)arg;
-	tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
-				       offsetof(struct kernel_vm86_struct, regs32) -
-				       sizeof(info.regs));
-	if (tmp)
-		return -EFAULT;
-	info.regs32 = current_pt_regs();
-	info.vm86plus.is_vm86pus = 1;
-	tsk->thread.vm86_info = (struct vm86_struct __user *)v86;
-	do_sys_vm86(&info, tsk);
-	return 0;	/* we never return here */
+	return do_sys_vm86((struct vm86plus_struct __user *) arg, true);
 }
 }
 
 
 
 
-static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
+static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
 {
 {
 	struct tss_struct *tss;
 	struct tss_struct *tss;
-/*
- * make sure the vm86() system call doesn't try to do anything silly
- */
-	info->regs.pt.ds = 0;
-	info->regs.pt.es = 0;
-	info->regs.pt.fs = 0;
-#ifndef CONFIG_X86_32_LAZY_GS
-	info->regs.pt.gs = 0;
-#endif
+	struct task_struct *tsk = current;
+	struct vm86 *vm86 = tsk->thread.vm86;
+	struct kernel_vm86_regs vm86regs;
+	struct pt_regs *regs = current_pt_regs();
+	unsigned long err = 0;
+
+	if (!vm86) {
+		if (!(vm86 = kzalloc(sizeof(*vm86), GFP_KERNEL)))
+			return -ENOMEM;
+		tsk->thread.vm86 = vm86;
+	}
+	if (vm86->saved_sp0)
+		return -EPERM;
+
+	if (!access_ok(VERIFY_READ, user_vm86, plus ?
+		       sizeof(struct vm86_struct) :
+		       sizeof(struct vm86plus_struct)))
+		return -EFAULT;
+
+	memset(&vm86regs, 0, sizeof(vm86regs));
+	get_user_try {
+		unsigned short seg;
+		get_user_ex(vm86regs.pt.bx, &user_vm86->regs.ebx);
+		get_user_ex(vm86regs.pt.cx, &user_vm86->regs.ecx);
+		get_user_ex(vm86regs.pt.dx, &user_vm86->regs.edx);
+		get_user_ex(vm86regs.pt.si, &user_vm86->regs.esi);
+		get_user_ex(vm86regs.pt.di, &user_vm86->regs.edi);
+		get_user_ex(vm86regs.pt.bp, &user_vm86->regs.ebp);
+		get_user_ex(vm86regs.pt.ax, &user_vm86->regs.eax);
+		get_user_ex(vm86regs.pt.ip, &user_vm86->regs.eip);
+		get_user_ex(seg, &user_vm86->regs.cs);
+		vm86regs.pt.cs = seg;
+		get_user_ex(vm86regs.pt.flags, &user_vm86->regs.eflags);
+		get_user_ex(vm86regs.pt.sp, &user_vm86->regs.esp);
+		get_user_ex(seg, &user_vm86->regs.ss);
+		vm86regs.pt.ss = seg;
+		get_user_ex(vm86regs.es, &user_vm86->regs.es);
+		get_user_ex(vm86regs.ds, &user_vm86->regs.ds);
+		get_user_ex(vm86regs.fs, &user_vm86->regs.fs);
+		get_user_ex(vm86regs.gs, &user_vm86->regs.gs);
+
+		get_user_ex(vm86->flags, &user_vm86->flags);
+		get_user_ex(vm86->screen_bitmap, &user_vm86->screen_bitmap);
+		get_user_ex(vm86->cpu_type, &user_vm86->cpu_type);
+	} get_user_catch(err);
+	if (err)
+		return err;
+
+	if (copy_from_user(&vm86->int_revectored,
+			   &user_vm86->int_revectored,
+			   sizeof(struct revectored_struct)))
+		return -EFAULT;
+	if (copy_from_user(&vm86->int21_revectored,
+			   &user_vm86->int21_revectored,
+			   sizeof(struct revectored_struct)))
+		return -EFAULT;
+	if (plus) {
+		if (copy_from_user(&vm86->vm86plus, &user_vm86->vm86plus,
+				   sizeof(struct vm86plus_info_struct)))
+			return -EFAULT;
+		vm86->vm86plus.is_vm86pus = 1;
+	} else
+		memset(&vm86->vm86plus, 0,
+		       sizeof(struct vm86plus_info_struct));
+
+	memcpy(&vm86->regs32, regs, sizeof(struct pt_regs));
+	vm86->user_vm86 = user_vm86;
 
 
 /*
 /*
  * The flags register is also special: we cannot trust that the user
  * The flags register is also special: we cannot trust that the user
  * has set it up safely, so this makes sure interrupt etc flags are
  * has set it up safely, so this makes sure interrupt etc flags are
  * inherited from protected mode.
  * inherited from protected mode.
  */
  */
-	VEFLAGS = info->regs.pt.flags;
-	info->regs.pt.flags &= SAFE_MASK;
-	info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK;
-	info->regs.pt.flags |= X86_VM_MASK;
+	VEFLAGS = vm86regs.pt.flags;
+	vm86regs.pt.flags &= SAFE_MASK;
+	vm86regs.pt.flags |= regs->flags & ~SAFE_MASK;
+	vm86regs.pt.flags |= X86_VM_MASK;
+
+	vm86regs.pt.orig_ax = regs->orig_ax;
 
 
-	switch (info->cpu_type) {
+	switch (vm86->cpu_type) {
 	case CPU_286:
 	case CPU_286:
-		tsk->thread.v86mask = 0;
+		vm86->veflags_mask = 0;
 		break;
 		break;
 	case CPU_386:
 	case CPU_386:
-		tsk->thread.v86mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+		vm86->veflags_mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL;
 		break;
 		break;
 	case CPU_486:
 	case CPU_486:
-		tsk->thread.v86mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+		vm86->veflags_mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
 		break;
 		break;
 	default:
 	default:
-		tsk->thread.v86mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+		vm86->veflags_mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
 		break;
 		break;
 	}
 	}
 
 
 /*
 /*
- * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL)
+ * Save old state
  */
  */
-	info->regs32->ax = VM86_SIGNAL;
-	tsk->thread.saved_sp0 = tsk->thread.sp0;
-	tsk->thread.saved_fs = info->regs32->fs;
-	tsk->thread.saved_gs = get_user_gs(info->regs32);
+	vm86->saved_sp0 = tsk->thread.sp0;
+	lazy_save_gs(vm86->regs32.gs);
 
 
 	tss = &per_cpu(cpu_tss, get_cpu());
 	tss = &per_cpu(cpu_tss, get_cpu());
-	tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
+	/* make room for real-mode segments */
+	tsk->thread.sp0 += 16;
 	if (cpu_has_sep)
 	if (cpu_has_sep)
 		tsk->thread.sysenter_cs = 0;
 		tsk->thread.sysenter_cs = 0;
 	load_sp0(tss, &tsk->thread);
 	load_sp0(tss, &tsk->thread);
 	put_cpu();
 	put_cpu();
 
 
-	tsk->thread.screen_bitmap = info->screen_bitmap;
-	if (info->flags & VM86_SCREEN_BITMAP)
+	if (vm86->flags & VM86_SCREEN_BITMAP)
 		mark_screen_rdonly(tsk->mm);
 		mark_screen_rdonly(tsk->mm);
 
 
-	/*call __audit_syscall_exit since we do not exit via the normal paths */
-#ifdef CONFIG_AUDITSYSCALL
-	if (unlikely(current->audit_context))
-		__audit_syscall_exit(1, 0);
-#endif
-
-	__asm__ __volatile__(
-		"movl %0,%%esp\n\t"
-		"movl %1,%%ebp\n\t"
-#ifdef CONFIG_X86_32_LAZY_GS
-		"mov  %2, %%gs\n\t"
-#endif
-		"jmp resume_userspace"
-		: /* no outputs */
-		:"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
-	/* we never return here */
-}
-
-static inline void return_to_32bit(struct kernel_vm86_regs *regs16, int retval)
-{
-	struct pt_regs *regs32;
-
-	regs32 = save_v86_state(regs16);
-	regs32->ax = retval;
-	__asm__ __volatile__("movl %0,%%esp\n\t"
-		"movl %1,%%ebp\n\t"
-		"jmp resume_userspace"
-		: : "r" (regs32), "r" (current_thread_info()));
+	memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs));
+	force_iret();
+	return regs->ax;
 }
 }
 
 
 static inline void set_IF(struct kernel_vm86_regs *regs)
 static inline void set_IF(struct kernel_vm86_regs *regs)
 {
 {
 	VEFLAGS |= X86_EFLAGS_VIF;
 	VEFLAGS |= X86_EFLAGS_VIF;
-	if (VEFLAGS & X86_EFLAGS_VIP)
-		return_to_32bit(regs, VM86_STI);
 }
 }
 
 
 static inline void clear_IF(struct kernel_vm86_regs *regs)
 static inline void clear_IF(struct kernel_vm86_regs *regs)
@@ -395,7 +377,7 @@ static inline void clear_AC(struct kernel_vm86_regs *regs)
 
 
 static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs *regs)
 static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs *regs)
 {
 {
-	set_flags(VEFLAGS, flags, current->thread.v86mask);
+	set_flags(VEFLAGS, flags, current->thread.vm86->veflags_mask);
 	set_flags(regs->pt.flags, flags, SAFE_MASK);
 	set_flags(regs->pt.flags, flags, SAFE_MASK);
 	if (flags & X86_EFLAGS_IF)
 	if (flags & X86_EFLAGS_IF)
 		set_IF(regs);
 		set_IF(regs);
@@ -405,7 +387,7 @@ static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs
 
 
 static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs *regs)
 static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs *regs)
 {
 {
-	set_flags(VFLAGS, flags, current->thread.v86mask);
+	set_flags(VFLAGS, flags, current->thread.vm86->veflags_mask);
 	set_flags(regs->pt.flags, flags, SAFE_MASK);
 	set_flags(regs->pt.flags, flags, SAFE_MASK);
 	if (flags & X86_EFLAGS_IF)
 	if (flags & X86_EFLAGS_IF)
 		set_IF(regs);
 		set_IF(regs);
@@ -420,7 +402,7 @@ static inline unsigned long get_vflags(struct kernel_vm86_regs *regs)
 	if (VEFLAGS & X86_EFLAGS_VIF)
 	if (VEFLAGS & X86_EFLAGS_VIF)
 		flags |= X86_EFLAGS_IF;
 		flags |= X86_EFLAGS_IF;
 	flags |= X86_EFLAGS_IOPL;
 	flags |= X86_EFLAGS_IOPL;
-	return flags | (VEFLAGS & current->thread.v86mask);
+	return flags | (VEFLAGS & current->thread.vm86->veflags_mask);
 }
 }
 
 
 static inline int is_revectored(int nr, struct revectored_struct *bitmap)
 static inline int is_revectored(int nr, struct revectored_struct *bitmap)
@@ -518,12 +500,13 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
 {
 {
 	unsigned long __user *intr_ptr;
 	unsigned long __user *intr_ptr;
 	unsigned long segoffs;
 	unsigned long segoffs;
+	struct vm86 *vm86 = current->thread.vm86;
 
 
 	if (regs->pt.cs == BIOSSEG)
 	if (regs->pt.cs == BIOSSEG)
 		goto cannot_handle;
 		goto cannot_handle;
-	if (is_revectored(i, &KVM86->int_revectored))
+	if (is_revectored(i, &vm86->int_revectored))
 		goto cannot_handle;
 		goto cannot_handle;
-	if (i == 0x21 && is_revectored(AH(regs), &KVM86->int21_revectored))
+	if (i == 0x21 && is_revectored(AH(regs), &vm86->int21_revectored))
 		goto cannot_handle;
 		goto cannot_handle;
 	intr_ptr = (unsigned long __user *) (i << 2);
 	intr_ptr = (unsigned long __user *) (i << 2);
 	if (get_user(segoffs, intr_ptr))
 	if (get_user(segoffs, intr_ptr))
@@ -542,18 +525,16 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
 	return;
 	return;
 
 
 cannot_handle:
 cannot_handle:
-	return_to_32bit(regs, VM86_INTx + (i << 8));
+	save_v86_state(regs, VM86_INTx + (i << 8));
 }
 }
 
 
 int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
 int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
 {
 {
-	if (VMPI.is_vm86pus) {
+	struct vm86 *vm86 = current->thread.vm86;
+
+	if (vm86->vm86plus.is_vm86pus) {
 		if ((trapno == 3) || (trapno == 1)) {
 		if ((trapno == 3) || (trapno == 1)) {
-			KVM86->regs32->ax = VM86_TRAP + (trapno << 8);
-			/* setting this flag forces the code in entry_32.S to
-			   the path where we call save_v86_state() and change
-			   the stack pointer to KVM86->regs32 */
-			set_thread_flag(TIF_NOTIFY_RESUME);
+			save_v86_state(regs, VM86_TRAP + (trapno << 8));
 			return 0;
 			return 0;
 		}
 		}
 		do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
 		do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
@@ -574,16 +555,11 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 	unsigned char __user *ssp;
 	unsigned char __user *ssp;
 	unsigned short ip, sp, orig_flags;
 	unsigned short ip, sp, orig_flags;
 	int data32, pref_done;
 	int data32, pref_done;
+	struct vm86plus_info_struct *vmpi = &current->thread.vm86->vm86plus;
 
 
 #define CHECK_IF_IN_TRAP \
 #define CHECK_IF_IN_TRAP \
-	if (VMPI.vm86dbg_active && VMPI.vm86dbg_TFpendig) \
+	if (vmpi->vm86dbg_active && vmpi->vm86dbg_TFpendig) \
 		newflags |= X86_EFLAGS_TF
 		newflags |= X86_EFLAGS_TF
-#define VM86_FAULT_RETURN do { \
-	if (VMPI.force_return_for_pic  && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) \
-		return_to_32bit(regs, VM86_PICRETURN); \
-	if (orig_flags & X86_EFLAGS_TF) \
-		handle_vm86_trap(regs, 0, 1); \
-	return; } while (0)
 
 
 	orig_flags = *(unsigned short *)&regs->pt.flags;
 	orig_flags = *(unsigned short *)&regs->pt.flags;
 
 
@@ -622,7 +598,7 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 			SP(regs) -= 2;
 			SP(regs) -= 2;
 		}
 		}
 		IP(regs) = ip;
 		IP(regs) = ip;
-		VM86_FAULT_RETURN;
+		goto vm86_fault_return;
 
 
 	/* popf */
 	/* popf */
 	case 0x9d:
 	case 0x9d:
@@ -642,16 +618,18 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 		else
 		else
 			set_vflags_short(newflags, regs);
 			set_vflags_short(newflags, regs);
 
 
-		VM86_FAULT_RETURN;
+		goto check_vip;
 		}
 		}
 
 
 	/* int xx */
 	/* int xx */
 	case 0xcd: {
 	case 0xcd: {
 		int intno = popb(csp, ip, simulate_sigsegv);
 		int intno = popb(csp, ip, simulate_sigsegv);
 		IP(regs) = ip;
 		IP(regs) = ip;
-		if (VMPI.vm86dbg_active) {
-			if ((1 << (intno & 7)) & VMPI.vm86dbg_intxxtab[intno >> 3])
-				return_to_32bit(regs, VM86_INTx + (intno << 8));
+		if (vmpi->vm86dbg_active) {
+			if ((1 << (intno & 7)) & vmpi->vm86dbg_intxxtab[intno >> 3]) {
+				save_v86_state(regs, VM86_INTx + (intno << 8));
+				return;
+			}
 		}
 		}
 		do_int(regs, intno, ssp, sp);
 		do_int(regs, intno, ssp, sp);
 		return;
 		return;
@@ -682,14 +660,14 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 		} else {
 		} else {
 			set_vflags_short(newflags, regs);
 			set_vflags_short(newflags, regs);
 		}
 		}
-		VM86_FAULT_RETURN;
+		goto check_vip;
 		}
 		}
 
 
 	/* cli */
 	/* cli */
 	case 0xfa:
 	case 0xfa:
 		IP(regs) = ip;
 		IP(regs) = ip;
 		clear_IF(regs);
 		clear_IF(regs);
-		VM86_FAULT_RETURN;
+		goto vm86_fault_return;
 
 
 	/* sti */
 	/* sti */
 	/*
 	/*
@@ -701,14 +679,29 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 	case 0xfb:
 	case 0xfb:
 		IP(regs) = ip;
 		IP(regs) = ip;
 		set_IF(regs);
 		set_IF(regs);
-		VM86_FAULT_RETURN;
+		goto check_vip;
 
 
 	default:
 	default:
-		return_to_32bit(regs, VM86_UNKNOWN);
+		save_v86_state(regs, VM86_UNKNOWN);
 	}
 	}
 
 
 	return;
 	return;
 
 
+check_vip:
+	if (VEFLAGS & X86_EFLAGS_VIP) {
+		save_v86_state(regs, VM86_STI);
+		return;
+	}
+
+vm86_fault_return:
+	if (vmpi->force_return_for_pic  && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) {
+		save_v86_state(regs, VM86_PICRETURN);
+		return;
+	}
+	if (orig_flags & X86_EFLAGS_TF)
+		handle_vm86_trap(regs, 0, X86_TRAP_DB);
+	return;
+
 simulate_sigsegv:
 simulate_sigsegv:
 	/* FIXME: After a long discussion with Stas we finally
 	/* FIXME: After a long discussion with Stas we finally
 	 *        agreed, that this is wrong. Here we should
 	 *        agreed, that this is wrong. Here we should
@@ -720,7 +713,7 @@ simulate_sigsegv:
 	 *        should be a mixture of the two, but how do we
 	 *        should be a mixture of the two, but how do we
 	 *        get the information? [KD]
 	 *        get the information? [KD]
 	 */
 	 */
-	return_to_32bit(regs, VM86_UNKNOWN);
+	save_v86_state(regs, VM86_UNKNOWN);
 }
 }
 
 
 /* ---------------- vm86 special IRQ passing stuff ----------------- */
 /* ---------------- vm86 special IRQ passing stuff ----------------- */
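
The user-visible contract is unchanged by this rework; for reference, an illustrative userspace sketch (not from this commit) of entering vm86 mode and decoding the return value that save_v86_state() now places in regs->ax:

#include <asm/vm86.h>
#include <sys/syscall.h>
#include <unistd.h>

static int run_v86(struct vm86plus_struct *v86)
{
	int ret = syscall(SYS_vm86, VM86_ENTER, v86);	/* 32-bit x86 only */

	switch (VM86_TYPE(ret)) {
	case VM86_SIGNAL:	/* a signal interrupted vm86 mode */
		break;
	case VM86_INTx:		/* unhandled int NN, NN == VM86_ARG(ret) */
		break;
	case VM86_STI:		/* VIF was set while VIP was pending */
		break;
	case VM86_UNKNOWN:	/* unhandled general protection fault */
		break;
	}
	return ret;
}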

+ 2 - 2
arch/x86/kvm/lapic.c

@@ -1172,7 +1172,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
 
 	tsc_deadline = apic->lapic_timer.expired_tscdeadline;
 	apic->lapic_timer.expired_tscdeadline = 0;
-	guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+	guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc());
 	trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
 
 	/* __delay is delay_tsc whenever the hardware has TSC, thus always.  */
@@ -1240,7 +1240,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
 		local_irq_save(flags);
 
 		now = apic->lapic_timer.timer.base->get_time();
-		guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+		guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc());
 		if (likely(tscdeadline > guest_tsc)) {
 			ns = (tscdeadline - guest_tsc) * 1000000ULL;
 			do_div(ns, this_tsc_khz);

+ 2 - 2
arch/x86/kvm/svm.c

@@ -1139,7 +1139,7 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
 {
 	u64 tsc;
 
-	tsc = svm_scale_tsc(vcpu, native_read_tsc());
+	tsc = svm_scale_tsc(vcpu, rdtsc());
 
 	return target_tsc - tsc;
 }
@@ -3172,7 +3172,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	switch (msr_info->index) {
 	case MSR_IA32_TSC: {
 		msr_info->data = svm->vmcb->control.tsc_offset +
-			svm_scale_tsc(vcpu, native_read_tsc());
+			svm_scale_tsc(vcpu, rdtsc());
 
 		break;
 	}

+ 2 - 2
arch/x86/kvm/vmx.c

@@ -2236,7 +2236,7 @@ static u64 guest_read_tsc(void)
 {
 	u64 host_tsc, tsc_offset;
 
-	rdtscll(host_tsc);
+	host_tsc = rdtsc();
 	tsc_offset = vmcs_read64(TSC_OFFSET);
 	return host_tsc + tsc_offset;
 }
@@ -2317,7 +2317,7 @@ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho
 
 static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
 {
-	return target_tsc - native_read_tsc();
+	return target_tsc - rdtsc();
 }
 
 static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)

+ 7 - 19
arch/x86/kvm/x86.c

@@ -1444,20 +1444,8 @@ EXPORT_SYMBOL_GPL(kvm_write_tsc);
 
 static cycle_t read_tsc(void)
 {
-	cycle_t ret;
-	u64 last;
-
-	/*
-	 * Empirically, a fence (of type that depends on the CPU)
-	 * before rdtsc is enough to ensure that rdtsc is ordered
-	 * with respect to loads.  The various CPU manuals are unclear
-	 * as to whether rdtsc can be reordered with later loads,
-	 * but no one has ever seen it happen.
-	 */
-	rdtsc_barrier();
-	ret = (cycle_t)vget_cycles();
-
-	last = pvclock_gtod_data.clock.cycle_last;
+	cycle_t ret = (cycle_t)rdtsc_ordered();
+	u64 last = pvclock_gtod_data.clock.cycle_last;
 
 	if (likely(ret >= last))
 		return ret;
@@ -1646,7 +1634,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 		return 1;
 	}
 	if (!use_master_clock) {
-		host_tsc = native_read_tsc();
+		host_tsc = rdtsc();
 		kernel_ns = get_kernel_ns();
 	}
 
@@ -2810,7 +2798,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 	if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
 		s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
-				native_read_tsc() - vcpu->arch.last_host_tsc;
+				rdtsc() - vcpu->arch.last_host_tsc;
 		if (tsc_delta < 0)
 			mark_tsc_unstable("KVM discovered backwards TSC");
 		if (check_tsc_unstable()) {
@@ -2838,7 +2826,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	kvm_x86_ops->vcpu_put(vcpu);
 	kvm_put_guest_fpu(vcpu);
-	vcpu->arch.last_host_tsc = native_read_tsc();
+	vcpu->arch.last_host_tsc = rdtsc();
 }
 
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
@@ -6622,7 +6610,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		hw_breakpoint_restore();
 
 	vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
-							   native_read_tsc());
+							   rdtsc());
 
 	vcpu->mode = OUTSIDE_GUEST_MODE;
 	smp_wmb();
@@ -7431,7 +7419,7 @@ int kvm_arch_hardware_enable(void)
 	if (ret != 0)
 		return ret;
 
-	local_tsc = native_read_tsc();
+	local_tsc = rdtsc();
 	stable = !check_tsc_unstable();
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		kvm_for_each_vcpu(i, vcpu, kvm) {
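
The rewritten read_tsc() above follows a common clocksource pattern; a simplified sketch (illustrative names, not the KVM code itself):

static u64 monotonic_tsc_read(u64 last_returned)
{
	u64 now = rdtsc_ordered();

	/*
	 * Clamp small backwards steps (e.g. when migrating between CPUs)
	 * so the clock never appears to run backwards.
	 */
	return now >= last_returned ? now : last_returned;
}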

+ 5 - 8
arch/x86/lib/delay.c

@@ -49,16 +49,14 @@ static void delay_loop(unsigned long loops)
 /* TSC based delay: */
 static void delay_tsc(unsigned long __loops)
 {
-	u32 bclock, now, loops = __loops;
+	u64 bclock, now, loops = __loops;
 	int cpu;
 
 	preempt_disable();
 	cpu = smp_processor_id();
-	rdtsc_barrier();
-	rdtscl(bclock);
+	bclock = rdtsc_ordered();
 	for (;;) {
-		rdtsc_barrier();
-		rdtscl(now);
+		now = rdtsc_ordered();
 		if ((now - bclock) >= loops)
 			break;
 
@@ -79,8 +77,7 @@ static void delay_tsc(unsigned long __loops)
 		if (unlikely(cpu != smp_processor_id())) {
 			loops -= (now - bclock);
 			cpu = smp_processor_id();
-			rdtsc_barrier();
-			rdtscl(bclock);
+			bclock = rdtsc_ordered();
 		}
 	}
 	preempt_enable();
@@ -100,7 +97,7 @@ void use_tsc_delay(void)
 int read_current_timer(unsigned long *timer_val)
 {
 	if (delay_fn == delay_tsc) {
-		rdtscll(*timer_val);
+		*timer_val = rdtsc();
 		return 0;
 	}
 	return -1;

+ 1 - 0
arch/x86/math-emu/get_address.c

@@ -21,6 +21,7 @@
 
 #include <asm/uaccess.h>
 #include <asm/desc.h>
+#include <asm/vm86.h>
 
 #include "fpu_system.h"
 #include "exception.h"

+ 5 - 2
arch/x86/mm/fault.c

@@ -20,6 +20,7 @@
 #include <asm/kmemcheck.h>		/* kmemcheck_*(), ...		*/
 #include <asm/fixmap.h>			/* VSYSCALL_ADDR		*/
 #include <asm/vsyscall.h>		/* emulate_vsyscall		*/
+#include <asm/vm86.h>			/* struct vm86			*/
 
 #define CREATE_TRACE_POINTS
 #include <asm/trace/exceptions.h>
@@ -301,14 +302,16 @@ static inline void
 check_v8086_mode(struct pt_regs *regs, unsigned long address,
 		 struct task_struct *tsk)
 {
+#ifdef CONFIG_VM86
 	unsigned long bit;
 
-	if (!v8086_mode(regs))
+	if (!v8086_mode(regs) || !tsk->thread.vm86)
 		return;
 
 	bit = (address - 0xA0000) >> PAGE_SHIFT;
 	if (bit < 32)
-		tsk->thread.screen_bitmap |= 1 << bit;
+		tsk->thread.vm86->screen_bitmap |= 1 << bit;
+#endif
 }
 
 static bool low_pfn(unsigned long pfn)

+ 0 - 13
arch/x86/um/asm/barrier.h

@@ -45,17 +45,4 @@
 #define read_barrier_depends()		do { } while (0)
 #define smp_read_barrier_depends()	do { } while (0)
 
-/*
- * Stop RDTSC speculation. This is needed when you need to use RDTSC
- * (or get_cycles or vread that possibly accesses the TSC) in a defined
- * code region.
- *
- * (Could use an alternative three way for this if there was one.)
- */
-static inline void rdtsc_barrier(void)
-{
-	alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
-			  "lfence", X86_FEATURE_LFENCE_RDTSC);
-}
-
 #endif

+ 0 - 3
arch/x86/xen/enlighten.c

@@ -1215,11 +1215,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
 	.read_msr = xen_read_msr_safe,
 	.write_msr = xen_write_msr_safe,
 
-	.read_tsc = native_read_tsc,
 	.read_pmc = native_read_pmc,
 
-	.read_tscp = native_read_tscp,
-
 	.iret = xen_iret,
 #ifdef CONFIG_X86_64
 	.usergs_sysret32 = xen_sysret32,

+ 1 - 1
drivers/cpufreq/intel_pstate.c

@@ -766,7 +766,7 @@ static inline void intel_pstate_sample(struct cpudata *cpu)
 	local_irq_save(flags);
 	rdmsrl(MSR_IA32_APERF, aperf);
 	rdmsrl(MSR_IA32_MPERF, mperf);
-	tsc = native_read_tsc();
+	tsc = rdtsc();
 	local_irq_restore(flags);
 
 	cpu->last_sample_time = cpu->sample.time;

+ 2 - 2
drivers/input/gameport/gameport.c

@@ -149,9 +149,9 @@ static int old_gameport_measure_speed(struct gameport *gameport)
 
 	for(i = 0; i < 50; i++) {
 		local_irq_save(flags);
-		rdtscl(t1);
+		t1 = rdtsc();
 		for (t = 0; t < 50; t++) gameport_read(gameport);
-		rdtscl(t2);
+		t2 = rdtsc();
 		local_irq_restore(flags);
 		udelay(i * 10);
 		if (t2 - t1 < tx) tx = t2 - t1;

+ 2 - 2
drivers/input/joystick/analog.c

@@ -143,7 +143,7 @@ struct analog_port {
 
 #include <linux/i8253.h>
 
-#define GET_TIME(x)	do { if (cpu_has_tsc) rdtscl(x); else x = get_time_pit(); } while (0)
+#define GET_TIME(x)	do { if (cpu_has_tsc) x = (unsigned int)rdtsc(); else x = get_time_pit(); } while (0)
 #define DELTA(x,y)	(cpu_has_tsc ? ((y) - (x)) : ((x) - (y) + ((x) < (y) ? PIT_TICK_RATE / HZ : 0)))
 #define TIME_NAME	(cpu_has_tsc?"TSC":"PIT")
 static unsigned int get_time_pit(void)
@@ -160,7 +160,7 @@ static unsigned int get_time_pit(void)
         return count;
 }
 #elif defined(__x86_64__)
-#define GET_TIME(x)	rdtscl(x)
+#define GET_TIME(x)	do { x = (unsigned int)rdtsc(); } while (0)
 #define DELTA(x,y)	((y)-(x))
 #define TIME_NAME	"TSC"
 #elif defined(__alpha__) || defined(CONFIG_MN10300) || defined(CONFIG_ARM) || defined(CONFIG_ARM64) || defined(CONFIG_TILE)
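
For reference, a sketch (hypothetical helper, not from this commit) of why the (unsigned int) cast keeps the old rdtscl() behaviour for these drivers:

static unsigned int ticks_since(unsigned int start)
{
	unsigned int now = (unsigned int)rdtsc();	/* low 32 bits, like rdtscl() returned */

	return now - start;	/* unsigned wraparound keeps short deltas correct */
}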

+ 1 - 1
drivers/net/hamradio/baycom_epp.c

@@ -638,7 +638,7 @@ static int receive(struct net_device *dev, int cnt)
 #define GETTICK(x)                                                \
 ({                                                                \
 	if (cpu_has_tsc)                                          \
-		rdtscl(x);                                        \
+		x = (unsigned int)rdtsc();		  \
 })
 #else /* __i386__ */
 #define GETTICK(x)

+ 3 - 0
drivers/scsi/dpt_i2o.c

@@ -1924,6 +1924,9 @@ static void adpt_alpha_info(sysInfo_S* si)
 #endif
 
 #if defined __i386__
+
+#include <uapi/asm/vm86.h>
+
 static void adpt_i386_info(sysInfo_S* si)
 {
 	// This is all the info we need for now

+ 4 - 59
drivers/staging/media/lirc/lirc_serial.c

@@ -327,9 +327,6 @@ static void safe_udelay(unsigned long usecs)
  * time
  * time
  */
  */
 
 
-/* So send_pulse can quickly convert microseconds to clocks */
-static unsigned long conv_us_to_clocks;
-
 static int init_timing_params(unsigned int new_duty_cycle,
 static int init_timing_params(unsigned int new_duty_cycle,
 		unsigned int new_freq)
 		unsigned int new_freq)
 {
 {
@@ -344,7 +341,6 @@ static int init_timing_params(unsigned int new_duty_cycle,
 	/* How many clocks in a microsecond?, avoiding long long divide */
 	/* How many clocks in a microsecond?, avoiding long long divide */
 	work = loops_per_sec;
 	work = loops_per_sec;
 	work *= 4295;  /* 4295 = 2^32 / 1e6 */
 	work *= 4295;  /* 4295 = 2^32 / 1e6 */
-	conv_us_to_clocks = work >> 32;
 
 
 	/*
 	/*
 	 * Carrier period in clocks, approach good up to 32GHz clock,
 	 * Carrier period in clocks, approach good up to 32GHz clock,
@@ -357,10 +353,9 @@ static int init_timing_params(unsigned int new_duty_cycle,
 	pulse_width = period * duty_cycle / 100;
 	pulse_width = period * duty_cycle / 100;
 	space_width = period - pulse_width;
 	space_width = period - pulse_width;
 	dprintk("in init_timing_params, freq=%d, duty_cycle=%d, "
 	dprintk("in init_timing_params, freq=%d, duty_cycle=%d, "
-		"clk/jiffy=%ld, pulse=%ld, space=%ld, "
-		"conv_us_to_clocks=%ld\n",
+		"clk/jiffy=%ld, pulse=%ld, space=%ld\n",
 		freq, duty_cycle, __this_cpu_read(cpu_info.loops_per_jiffy),
 		freq, duty_cycle, __this_cpu_read(cpu_info.loops_per_jiffy),
-		pulse_width, space_width, conv_us_to_clocks);
+		pulse_width, space_width);
 	return 0;
 	return 0;
 }
 }
 #else /* ! USE_RDTSC */
 #else /* ! USE_RDTSC */
@@ -431,63 +426,14 @@ static long send_pulse_irdeo(unsigned long length)
 	return ret;
 	return ret;
 }
 }
 
 
-#ifdef USE_RDTSC
-/* Version that uses Pentium rdtsc instruction to measure clocks */
-
-/*
- * This version does sub-microsecond timing using rdtsc instruction,
- * and does away with the fudged LIRC_SERIAL_TRANSMITTER_LATENCY
- * Implicitly i586 architecture...  - Steve
- */
-
-static long send_pulse_homebrew_softcarrier(unsigned long length)
-{
-	int flag;
-	unsigned long target, start, now;
-
-	/* Get going quick as we can */
-	rdtscl(start);
-	on();
-	/* Convert length from microseconds to clocks */
-	length *= conv_us_to_clocks;
-	/* And loop till time is up - flipping at right intervals */
-	now = start;
-	target = pulse_width;
-	flag = 1;
-	/*
-	 * FIXME: This looks like a hard busy wait, without even an occasional,
-	 * polite, cpu_relax() call.  There's got to be a better way?
-	 *
-	 * The i2c code has the result of a lot of bit-banging work, I wonder if
-	 * there's something there which could be helpful here.
-	 */
-	while ((now - start) < length) {
-		/* Delay till flip time */
-		do {
-			rdtscl(now);
-		} while ((now - start) < target);
-
-		/* flip */
-		if (flag) {
-			rdtscl(now);
-			off();
-			target += space_width;
-		} else {
-			rdtscl(now); on();
-			target += pulse_width;
-		}
-		flag = !flag;
-	}
-	rdtscl(now);
-	return ((now - start) - length) / conv_us_to_clocks;
-}
-#else /* ! USE_RDTSC */
 /* Version using udelay() */
 /* Version using udelay() */
 
 
 /*
 /*
  * here we use fixed point arithmetic, with 8
  * here we use fixed point arithmetic, with 8
  * fractional bits.  that gets us within 0.1% or so of the right average
  * fractional bits.  that gets us within 0.1% or so of the right average
  * frequency, albeit with some jitter in pulse length - Steve
  * frequency, albeit with some jitter in pulse length - Steve
+ *
+ * This should use ndelay instead.
  */
  */
 
 
 /* To match 8 fractional bits used for pulse/space length */
 /* To match 8 fractional bits used for pulse/space length */
@@ -520,7 +466,6 @@ static long send_pulse_homebrew_softcarrier(unsigned long length)
 	}
 	}
 	return (actual-length) >> 8;
 	return (actual-length) >> 8;
 }
 }
-#endif /* USE_RDTSC */
 
 
 static long send_pulse_homebrew(unsigned long length)
 static long send_pulse_homebrew(unsigned long length)
 {
 {

+ 2 - 2
drivers/thermal/intel_powerclamp.c

@@ -340,7 +340,7 @@ static bool powerclamp_adjust_controls(unsigned int target_ratio,
 
 	/* check result for the last window */
 	msr_now = pkg_state_counter();
-	rdtscll(tsc_now);
+	tsc_now = rdtsc();
 
 	/* calculate pkg cstate vs tsc ratio */
 	if (!msr_last || !tsc_last)
@@ -482,7 +482,7 @@ static void poll_pkg_cstate(struct work_struct *dummy)
 	u64 val64;
 
 	msr_now = pkg_state_counter();
-	rdtscll(tsc_now);
+	tsc_now = rdtsc();
 	jiffies_now = jiffies;
 
 	/* calculate pkg cstate vs tsc ratio */

+ 15 - 0
include/linux/context_tracking.h

@@ -49,13 +49,28 @@ static inline void exception_exit(enum ctx_state prev_ctx)
 	}
 }
 
+
+/**
+ * ct_state() - return the current context tracking state if known
+ *
+ * Returns the current cpu's context tracking state if context tracking
+ * is enabled.  If context tracking is disabled, returns
+ * CONTEXT_DISABLED.  This should be used primarily for debugging.
+ */
+static inline enum ctx_state ct_state(void)
+{
+	return context_tracking_is_enabled() ?
+		this_cpu_read(context_tracking.state) : CONTEXT_DISABLED;
+}
 #else
 static inline void user_enter(void) { }
 static inline void user_exit(void) { }
 static inline enum ctx_state exception_enter(void) { return 0; }
 static inline void exception_exit(enum ctx_state prev_ctx) { }
+static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; }
 #endif /* !CONFIG_CONTEXT_TRACKING */
 
+#define CT_WARN_ON(cond) WARN_ON(context_tracking_is_enabled() && (cond))
 
 #ifdef CONFIG_CONTEXT_TRACKING_FORCE
 extern void context_tracking_init(void);
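
A minimal usage sketch (hypothetical caller): CT_WARN_ON() only fires when context tracking is actually enabled, and ct_state() degrades to CONTEXT_DISABLED otherwise:

static void assert_in_kernel_context(void)
{
	/*
	 * With CONFIG_CONTEXT_TRACKING=n, or with tracking disabled at
	 * runtime, ct_state() reports CONTEXT_DISABLED and the WARN_ON()
	 * is short-circuited by context_tracking_is_enabled().
	 */
	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
}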

+ 1 - 0
include/linux/context_tracking_state.h

@@ -14,6 +14,7 @@ struct context_tracking {
 	bool active;
 	int recursion;
 	enum ctx_state {
+		CONTEXT_DISABLED = -1,	/* returned by ct_state() if unknown */
 		CONTEXT_KERNEL = 0,
 		CONTEXT_USER,
 		CONTEXT_GUEST,

+ 15 - 15
include/linux/spinlock.h

@@ -296,7 +296,7 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
  * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
  */
 
-static inline raw_spinlock_t *spinlock_check(spinlock_t *lock)
+static __always_inline raw_spinlock_t *spinlock_check(spinlock_t *lock)
 {
 	return &lock->rlock;
 }
@@ -307,17 +307,17 @@ do {							\
 	raw_spin_lock_init(&(_lock)->rlock);		\
 } while (0)
 
-static inline void spin_lock(spinlock_t *lock)
+static __always_inline void spin_lock(spinlock_t *lock)
 {
 	raw_spin_lock(&lock->rlock);
 }
 
-static inline void spin_lock_bh(spinlock_t *lock)
+static __always_inline void spin_lock_bh(spinlock_t *lock)
 {
 	raw_spin_lock_bh(&lock->rlock);
 }
 
-static inline int spin_trylock(spinlock_t *lock)
+static __always_inline int spin_trylock(spinlock_t *lock)
 {
 	return raw_spin_trylock(&lock->rlock);
 }
@@ -337,7 +337,7 @@ do {									\
 	raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock);	\
 } while (0)
 
-static inline void spin_lock_irq(spinlock_t *lock)
+static __always_inline void spin_lock_irq(spinlock_t *lock)
 {
 	raw_spin_lock_irq(&lock->rlock);
 }
@@ -352,32 +352,32 @@ do {									\
 	raw_spin_lock_irqsave_nested(spinlock_check(lock), flags, subclass); \
 } while (0)
 
-static inline void spin_unlock(spinlock_t *lock)
+static __always_inline void spin_unlock(spinlock_t *lock)
 {
 	raw_spin_unlock(&lock->rlock);
 }
 
-static inline void spin_unlock_bh(spinlock_t *lock)
+static __always_inline void spin_unlock_bh(spinlock_t *lock)
 {
 	raw_spin_unlock_bh(&lock->rlock);
 }
 
-static inline void spin_unlock_irq(spinlock_t *lock)
+static __always_inline void spin_unlock_irq(spinlock_t *lock)
 {
 	raw_spin_unlock_irq(&lock->rlock);
 }
 
-static inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
+static __always_inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
 {
 	raw_spin_unlock_irqrestore(&lock->rlock, flags);
 }
 
-static inline int spin_trylock_bh(spinlock_t *lock)
+static __always_inline int spin_trylock_bh(spinlock_t *lock)
 {
 	return raw_spin_trylock_bh(&lock->rlock);
 }
 
-static inline int spin_trylock_irq(spinlock_t *lock)
+static __always_inline int spin_trylock_irq(spinlock_t *lock)
 {
 	return raw_spin_trylock_irq(&lock->rlock);
 }
@@ -387,22 +387,22 @@ static inline int spin_trylock_irq(spinlock_t *lock)
 	raw_spin_trylock_irqsave(spinlock_check(lock), flags); \
 })
 
-static inline void spin_unlock_wait(spinlock_t *lock)
+static __always_inline void spin_unlock_wait(spinlock_t *lock)
 {
 	raw_spin_unlock_wait(&lock->rlock);
 }
 
-static inline int spin_is_locked(spinlock_t *lock)
+static __always_inline int spin_is_locked(spinlock_t *lock)
 {
 	return raw_spin_is_locked(&lock->rlock);
 }
 
-static inline int spin_is_contended(spinlock_t *lock)
+static __always_inline int spin_is_contended(spinlock_t *lock)
 {
 	return raw_spin_is_contended(&lock->rlock);
 }
 
-static inline int spin_can_lock(spinlock_t *lock)
+static __always_inline int spin_can_lock(spinlock_t *lock)
 {
 	return raw_spin_can_lock(&lock->rlock);
 }
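A short usage sketch (illustrative, not from this merge) of the wrappers changed above: each spin_*() helper is a one-line pass-through to the raw_spinlock_t API, and __always_inline guarantees the wrapper is folded away even when the compiler's inlining heuristics would otherwise decline, e.g. under -Os. The lock and counter names below are made up:

static DEFINE_SPINLOCK(example_lock);
static unsigned long example_counter;

static void example_increment(void)
{
	unsigned long flags;

	spin_lock_irqsave(&example_lock, flags);	/* pass-through to raw_spin_lock_irqsave() */
	example_counter++;
	spin_unlock_irqrestore(&example_lock, flags);	/* now __always_inline -> raw_spin_unlock_irqrestore() */
}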

+ 2 - 0
kernel/notifier.c

@@ -544,6 +544,8 @@ int notrace notify_die(enum die_val val, const char *str,
 		.signr	= sig,
 
 	};
+	rcu_lockdep_assert(rcu_is_watching(),
+			   "notify_die called but RCU thinks we're quiescent");
 	return atomic_notifier_call_chain(&die_chain, val, &args);
 }
 NOKPROBE_SYMBOL(notify_die);
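The assertion added above follows a general pattern; a hedged sketch of the same idea outside notify_die() (the function name here is hypothetical): rcu_lockdep_assert(cond, msg) produces a lockdep splat under CONFIG_PROVE_RCU when cond is false, so callers that run while RCU considers the CPU idle get flagged:

static void example_notify(void)
{
	rcu_lockdep_assert(rcu_is_watching(),
			   "example_notify() called from an RCU-idle context");
	/* ... an atomic_notifier_call_chain() invocation would follow here ... */
}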

+ 1 - 0
kernel/sys_ni.c

@@ -140,6 +140,7 @@ cond_syscall(sys_sgetmask);
 cond_syscall(sys_ssetmask);
 cond_syscall(sys_vm86old);
 cond_syscall(sys_vm86);
+cond_syscall(sys_modify_ldt);
 cond_syscall(sys_ipc);
 cond_syscall(compat_sys_ipc);
 cond_syscall(compat_sys_sysctl);
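cond_syscall() gives an optional syscall a weak stub that returns -ENOSYS when the real implementation is not built in, so user space sees a clean failure rather than a link error. A user-space probe along these lines (illustrative only) shows the effect:

#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	/* func 0 reads the LDT; with a zero-length buffer this is harmless. */
	long ret = syscall(SYS_modify_ldt, 0, NULL, 0);

	if (ret == -1 && errno == ENOSYS)
		printf("modify_ldt is not available on this kernel\n");
	else
		printf("modify_ldt returned %ld\n", ret);
	return 0;
}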

+ 2 - 2
tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c

@@ -81,11 +81,11 @@ static int __init cpufreq_test_tsc(void)
 
 	printk(KERN_DEBUG "start--> \n");
 	then = read_pmtmr();
-        rdtscll(then_tsc);
+	then_tsc = rdtsc();
 	for (i=0;i<20;i++) {
 		mdelay(100);
 		now = read_pmtmr();
-		rdtscll(now_tsc);
+		now_tsc = rdtsc();
 		diff = (now - then) & 0xFFFFFF;
 		diff_tsc = now_tsc - then_tsc;
 		printk(KERN_DEBUG "t1: %08u t2: %08u diff_pmtmr: %08u diff_tsc: %016llu\n", then, now, diff, diff_tsc);
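The old rdtscll() macro wrote the counter through its argument, while rdtsc() simply returns the 64-bit value, which is what the two hunks above convert to. A user-space analogue (illustrative; it uses the __rdtsc() compiler intrinsic rather than the in-kernel helper):

#include <stdio.h>
#include <stdint.h>
#include <x86intrin.h>

int main(void)
{
	uint64_t then_tsc = __rdtsc();
	/* ... workload being timed would go here ... */
	uint64_t now_tsc = __rdtsc();

	printf("diff_tsc: %016llu\n", (unsigned long long)(now_tsc - then_tsc));
	return 0;
}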

+ 2 - 2
tools/testing/selftests/x86/Makefile

@@ -4,8 +4,8 @@ include ../lib.mk
 
 .PHONY: all all_32 all_64 warn_32bit_failure clean
 
-TARGETS_C_BOTHBITS := sigreturn single_step_syscall sysret_ss_attrs
-TARGETS_C_32BIT_ONLY := entry_from_vm86
+TARGETS_C_BOTHBITS := sigreturn single_step_syscall sysret_ss_attrs ldt_gdt
+TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault
 
 TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
 BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)

+ 129 - 10
tools/testing/selftests/x86/entry_from_vm86.c

@@ -28,6 +28,55 @@
 static unsigned long load_addr = 0x10000;
 static int nerrs = 0;
 
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+		       int flags)
+{
+	struct sigaction sa;
+	memset(&sa, 0, sizeof(sa));
+	sa.sa_sigaction = handler;
+	sa.sa_flags = SA_SIGINFO | flags;
+	sigemptyset(&sa.sa_mask);
+	if (sigaction(sig, &sa, 0))
+		err(1, "sigaction");
+}
+
+static void clearhandler(int sig)
+{
+	struct sigaction sa;
+	memset(&sa, 0, sizeof(sa));
+	sa.sa_handler = SIG_DFL;
+	sigemptyset(&sa.sa_mask);
+	if (sigaction(sig, &sa, 0))
+		err(1, "sigaction");
+}
+
+static sig_atomic_t got_signal;
+
+static void sighandler(int sig, siginfo_t *info, void *ctx_void)
+{
+	ucontext_t *ctx = (ucontext_t*)ctx_void;
+
+	if (ctx->uc_mcontext.gregs[REG_EFL] & X86_EFLAGS_VM ||
+	    (ctx->uc_mcontext.gregs[REG_CS] & 3) != 3) {
+		printf("[FAIL]\tSignal frame should not reflect vm86 mode\n");
+		nerrs++;
+	}
+
+	const char *signame;
+	if (sig == SIGSEGV)
+		signame = "SIGSEGV";
+	else if (sig == SIGILL)
+		signame = "SIGILL";
+	else
+		signame = "unexpected signal";
+
+	printf("[INFO]\t%s: FLAGS = 0x%lx, CS = 0x%hx\n", signame,
+	       (unsigned long)ctx->uc_mcontext.gregs[REG_EFL],
+	       (unsigned short)ctx->uc_mcontext.gregs[REG_CS]);
+
+	got_signal = 1;
+}
+
 asm (
 	".pushsection .rodata\n\t"
 	".type vmcode_bound, @object\n\t"
@@ -38,6 +87,14 @@ asm (
 	"int3\n\t"
 	"vmcode_sysenter:\n\t"
 	"sysenter\n\t"
+	"vmcode_syscall:\n\t"
+	"syscall\n\t"
+	"vmcode_sti:\n\t"
+	"sti\n\t"
+	"vmcode_int3:\n\t"
+	"int3\n\t"
+	"vmcode_int80:\n\t"
+	"int $0x80\n\t"
 	".size vmcode, . - vmcode\n\t"
 	"end_vmcode:\n\t"
 	".code32\n\t"
@@ -45,9 +102,12 @@ asm (
 	);
 
 extern unsigned char vmcode[], end_vmcode[];
-extern unsigned char vmcode_bound[], vmcode_sysenter[];
+extern unsigned char vmcode_bound[], vmcode_sysenter[], vmcode_syscall[],
+	vmcode_sti[], vmcode_int3[], vmcode_int80[];
 
-static void do_test(struct vm86plus_struct *v86, unsigned long eip,
+/* Returns false if the test was skipped. */
+static bool do_test(struct vm86plus_struct *v86, unsigned long eip,
+		    unsigned int rettype, unsigned int retarg,
 		    const char *text)
 {
 	long ret;
@@ -58,7 +118,7 @@ static void do_test(struct vm86plus_struct *v86, unsigned long eip,
 
 	if (ret == -1 && errno == ENOSYS) {
 		printf("[SKIP]\tvm86 not supported\n");
-		return;
+		return false;
 	}
 
 	if (VM86_TYPE(ret) == VM86_INTx) {
@@ -73,13 +133,30 @@ static void do_test(struct vm86plus_struct *v86, unsigned long eip,
 		else
 			sprintf(trapname, "%d", trapno);
 
-		printf("[OK]\tExited vm86 mode due to #%s\n", trapname);
+		printf("[INFO]\tExited vm86 mode due to #%s\n", trapname);
 	} else if (VM86_TYPE(ret) == VM86_UNKNOWN) {
-		printf("[OK]\tExited vm86 mode due to unhandled GP fault\n");
+		printf("[INFO]\tExited vm86 mode due to unhandled GP fault\n");
+	} else if (VM86_TYPE(ret) == VM86_TRAP) {
+		printf("[INFO]\tExited vm86 mode due to a trap (arg=%ld)\n",
+		       VM86_ARG(ret));
+	} else if (VM86_TYPE(ret) == VM86_SIGNAL) {
+		printf("[INFO]\tExited vm86 mode due to a signal\n");
+	} else if (VM86_TYPE(ret) == VM86_STI) {
+		printf("[INFO]\tExited vm86 mode due to STI\n");
 	} else {
-		printf("[OK]\tExited vm86 mode due to type %ld, arg %ld\n",
+		printf("[INFO]\tExited vm86 mode due to type %ld, arg %ld\n",
 		       VM86_TYPE(ret), VM86_ARG(ret));
 	}
+
+	if (rettype == -1 ||
+	    (VM86_TYPE(ret) == rettype && VM86_ARG(ret) == retarg)) {
+		printf("[OK]\tReturned correctly\n");
+	} else {
+		printf("[FAIL]\tIncorrect return reason\n");
+		nerrs++;
+	}
+
+	return true;
 }
 
 int main(void)
@@ -105,10 +182,52 @@ int main(void)
 	assert((v86.regs.cs & 3) == 0);	/* Looks like RPL = 0 */
 
 	/* #BR -- should deliver SIG??? */
-	do_test(&v86, vmcode_bound - vmcode, "#BR");
-
-	/* SYSENTER -- should cause #GP or #UD depending on CPU */
-	do_test(&v86, vmcode_sysenter - vmcode, "SYSENTER");
+	do_test(&v86, vmcode_bound - vmcode, VM86_INTx, 5, "#BR");
+
+	/*
+	 * SYSENTER -- should cause #GP or #UD depending on CPU.
+	 * Expected return type -1 means that we shouldn't validate
+	 * the vm86 return value.  This will avoid problems on non-SEP
+	 * CPUs.
+	 */
+	sethandler(SIGILL, sighandler, 0);
+	do_test(&v86, vmcode_sysenter - vmcode, -1, 0, "SYSENTER");
+	clearhandler(SIGILL);
+
+	/*
+	 * SYSCALL would be a disaster in VM86 mode.  Fortunately,
+	 * there is no kernel that both enables SYSCALL and sets
+	 * EFER.SCE, so it's #UD on all systems.  But vm86 is
+	 * buggy (or has a "feature"), so the SIGILL will actually
+	 * be delivered.
+	 */
+	sethandler(SIGILL, sighandler, 0);
+	do_test(&v86, vmcode_syscall - vmcode, VM86_SIGNAL, 0, "SYSCALL");
+	clearhandler(SIGILL);
+
+	/* STI with VIP set */
+	v86.regs.eflags |= X86_EFLAGS_VIP;
+	v86.regs.eflags &= ~X86_EFLAGS_IF;
+	do_test(&v86, vmcode_sti - vmcode, VM86_STI, 0, "STI with VIP set");
+
+	/* INT3 -- should cause #BP */
+	do_test(&v86, vmcode_int3 - vmcode, VM86_TRAP, 3, "INT3");
+
+	/* INT80 -- should exit with "INTx 0x80" */
+	v86.regs.eax = (unsigned int)-1;
+	do_test(&v86, vmcode_int80 - vmcode, VM86_INTx, 0x80, "int80");
+
+	/* Execute a null pointer */
+	v86.regs.cs = 0;
+	v86.regs.ss = 0;
+	sethandler(SIGSEGV, sighandler, 0);
+	got_signal = 0;
+	if (do_test(&v86, 0, VM86_SIGNAL, 0, "Execute null pointer") &&
+	    !got_signal) {
+		printf("[FAIL]\tDid not receive SIGSEGV\n");
+		nerrs++;
+	}
+	clearhandler(SIGSEGV);
 
 	return (nerrs == 0 ? 0 : 1);
 }

+ 576 - 0
tools/testing/selftests/x86/ldt_gdt.c

@@ -0,0 +1,576 @@
+/*
+ * ldt_gdt.c - Test cases for LDT and GDT access
+ * Copyright (c) 2015 Andrew Lutomirski
+ */
+
+#define _GNU_SOURCE
+#include <err.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <signal.h>
+#include <setjmp.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <asm/ldt.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdbool.h>
+#include <pthread.h>
+#include <sched.h>
+#include <linux/futex.h>
+
+#define AR_ACCESSED		(1<<8)
+
+#define AR_TYPE_RODATA		(0 * (1<<9))
+#define AR_TYPE_RWDATA		(1 * (1<<9))
+#define AR_TYPE_RODATA_EXPDOWN	(2 * (1<<9))
+#define AR_TYPE_RWDATA_EXPDOWN	(3 * (1<<9))
+#define AR_TYPE_XOCODE		(4 * (1<<9))
+#define AR_TYPE_XRCODE		(5 * (1<<9))
+#define AR_TYPE_XOCODE_CONF	(6 * (1<<9))
+#define AR_TYPE_XRCODE_CONF	(7 * (1<<9))
+
+#define AR_DPL3			(3 * (1<<13))
+
+#define AR_S			(1 << 12)
+#define AR_P			(1 << 15)
+#define AR_AVL			(1 << 20)
+#define AR_L			(1 << 21)
+#define AR_DB			(1 << 22)
+#define AR_G			(1 << 23)
+
+static int nerrs;
+
+static void check_invalid_segment(uint16_t index, int ldt)
+{
+	uint32_t has_limit = 0, has_ar = 0, limit, ar;
+	uint32_t selector = (index << 3) | (ldt << 2) | 3;
+
+	asm ("lsl %[selector], %[limit]\n\t"
+	     "jnz 1f\n\t"
+	     "movl $1, %[has_limit]\n\t"
+	     "1:"
+	     : [limit] "=r" (limit), [has_limit] "+rm" (has_limit)
+	     : [selector] "r" (selector));
+	asm ("larl %[selector], %[ar]\n\t"
+	     "jnz 1f\n\t"
+	     "movl $1, %[has_ar]\n\t"
+	     "1:"
+	     : [ar] "=r" (ar), [has_ar] "+rm" (has_ar)
+	     : [selector] "r" (selector));
+
+	if (has_limit || has_ar) {
+		printf("[FAIL]\t%s entry %hu is valid but should be invalid\n",
+		       (ldt ? "LDT" : "GDT"), index);
+		nerrs++;
+	} else {
+		printf("[OK]\t%s entry %hu is invalid\n",
+		       (ldt ? "LDT" : "GDT"), index);
+	}
+}
+
+static void check_valid_segment(uint16_t index, int ldt,
+				uint32_t expected_ar, uint32_t expected_limit,
+				bool verbose)
+{
+	uint32_t has_limit = 0, has_ar = 0, limit, ar;
+	uint32_t selector = (index << 3) | (ldt << 2) | 3;
+
+	asm ("lsl %[selector], %[limit]\n\t"
+	     "jnz 1f\n\t"
+	     "movl $1, %[has_limit]\n\t"
+	     "1:"
+	     : [limit] "=r" (limit), [has_limit] "+rm" (has_limit)
+	     : [selector] "r" (selector));
+	asm ("larl %[selector], %[ar]\n\t"
+	     "jnz 1f\n\t"
+	     "movl $1, %[has_ar]\n\t"
+	     "1:"
+	     : [ar] "=r" (ar), [has_ar] "+rm" (has_ar)
+	     : [selector] "r" (selector));
+
+	if (!has_limit || !has_ar) {
+		printf("[FAIL]\t%s entry %hu is invalid but should be valid\n",
+		       (ldt ? "LDT" : "GDT"), index);
+		nerrs++;
+		return;
+	}
+
+	if (ar != expected_ar) {
+		printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n",
+		       (ldt ? "LDT" : "GDT"), index, ar, expected_ar);
+		nerrs++;
+	} else if (limit != expected_limit) {
+		printf("[FAIL]\t%s entry %hu has limit 0x%08X but expected 0x%08X\n",
+		       (ldt ? "LDT" : "GDT"), index, limit, expected_limit);
+		nerrs++;
+	} else if (verbose) {
+		printf("[OK]\t%s entry %hu has AR 0x%08X and limit 0x%08X\n",
+		       (ldt ? "LDT" : "GDT"), index, ar, limit);
+	}
+}
+
+static bool install_valid_mode(const struct user_desc *desc, uint32_t ar,
+			       bool oldmode)
+{
+	int ret = syscall(SYS_modify_ldt, oldmode ? 1 : 0x11,
+			  desc, sizeof(*desc));
+	if (ret < -1)
+		errno = -ret;
+	if (ret == 0) {
+		uint32_t limit = desc->limit;
+		if (desc->limit_in_pages)
+			limit = (limit << 12) + 4095;
+		check_valid_segment(desc->entry_number, 1, ar, limit, true);
+		return true;
+	} else if (errno == ENOSYS) {
+		printf("[OK]\tmodify_ldt returned -ENOSYS\n");
+		return false;
+	} else {
+		if (desc->seg_32bit) {
+			printf("[FAIL]\tUnexpected modify_ldt failure %d\n",
+			       errno);
+			nerrs++;
+			return false;
+		} else {
+			printf("[OK]\tmodify_ldt rejected 16 bit segment\n");
+			return false;
+		}
+	}
+}
+
+static bool install_valid(const struct user_desc *desc, uint32_t ar)
+{
+	return install_valid_mode(desc, ar, false);
+}
+
+static void install_invalid(const struct user_desc *desc, bool oldmode)
+{
+	int ret = syscall(SYS_modify_ldt, oldmode ? 1 : 0x11,
+			  desc, sizeof(*desc));
+	if (ret < -1)
+		errno = -ret;
+	if (ret == 0) {
+		check_invalid_segment(desc->entry_number, 1);
+	} else if (errno == ENOSYS) {
+		printf("[OK]\tmodify_ldt returned -ENOSYS\n");
+	} else {
+		if (desc->seg_32bit) {
+			printf("[FAIL]\tUnexpected modify_ldt failure %d\n",
+			       errno);
+			nerrs++;
+		} else {
+			printf("[OK]\tmodify_ldt rejected 16 bit segment\n");
+		}
+	}
+}
+
+static int safe_modify_ldt(int func, struct user_desc *ptr,
+			   unsigned long bytecount)
+{
+	int ret = syscall(SYS_modify_ldt, 0x11, ptr, bytecount);
+	if (ret < -1)
+		errno = -ret;
+	return ret;
+}
+
+static void fail_install(struct user_desc *desc)
+{
+	if (safe_modify_ldt(0x11, desc, sizeof(*desc)) == 0) {
+		printf("[FAIL]\tmodify_ldt accepted a bad descriptor\n");
+		nerrs++;
+	} else if (errno == ENOSYS) {
+		printf("[OK]\tmodify_ldt returned -ENOSYS\n");
+	} else {
+		printf("[OK]\tmodify_ldt failure %d\n", errno);
+	}
+}
+
+static void do_simple_tests(void)
+{
+	struct user_desc desc = {
+		.entry_number    = 0,
+		.base_addr       = 0,
+		.limit           = 10,
+		.seg_32bit       = 1,
+		.contents        = 2, /* Code, not conforming */
+		.read_exec_only  = 0,
+		.limit_in_pages  = 0,
+		.seg_not_present = 0,
+		.useable         = 0
+	};
+	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB);
+
+	desc.limit_in_pages = 1;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+		      AR_S | AR_P | AR_DB | AR_G);
+
+	check_invalid_segment(1, 1);
+
+	desc.entry_number = 2;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+		      AR_S | AR_P | AR_DB | AR_G);
+
+	check_invalid_segment(1, 1);
+
+	desc.base_addr = 0xf0000000;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+		      AR_S | AR_P | AR_DB | AR_G);
+
+	desc.useable = 1;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+		      AR_S | AR_P | AR_DB | AR_G | AR_AVL);
+
+	desc.seg_not_present = 1;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+		      AR_S | AR_DB | AR_G | AR_AVL);
+
+	desc.seg_32bit = 0;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+		      AR_S | AR_G | AR_AVL);
+
+	desc.seg_32bit = 1;
+	desc.contents = 0;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA |
+		      AR_S | AR_DB | AR_G | AR_AVL);
+
+	desc.read_exec_only = 1;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA |
+		      AR_S | AR_DB | AR_G | AR_AVL);
+
+	desc.contents = 1;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA_EXPDOWN |
+		      AR_S | AR_DB | AR_G | AR_AVL);
+
+	desc.read_exec_only = 0;
+	desc.limit_in_pages = 0;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA_EXPDOWN |
+		      AR_S | AR_DB | AR_AVL);
+
+	desc.contents = 3;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE_CONF |
+		      AR_S | AR_DB | AR_AVL);
+
+	desc.read_exec_only = 1;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE_CONF |
+		      AR_S | AR_DB | AR_AVL);
+
+	desc.read_exec_only = 0;
+	desc.contents = 2;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+		      AR_S | AR_DB | AR_AVL);
+
+	desc.read_exec_only = 1;
+
+#ifdef __x86_64__
+	desc.lm = 1;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE |
+		      AR_S | AR_DB | AR_AVL);
+	desc.lm = 0;
+#endif
+
+	bool entry1_okay = install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE |
+					 AR_S | AR_DB | AR_AVL);
+
+	if (entry1_okay) {
+		printf("[RUN]\tTest fork\n");
+		pid_t child = fork();
+		if (child == 0) {
+			nerrs = 0;
+			check_valid_segment(desc.entry_number, 1,
+					    AR_DPL3 | AR_TYPE_XOCODE |
+					    AR_S | AR_DB | AR_AVL, desc.limit,
+					    true);
+			check_invalid_segment(1, 1);
+			exit(nerrs ? 1 : 0);
+		} else {
+			int status;
+			if (waitpid(child, &status, 0) != child ||
+			    !WIFEXITED(status)) {
+				printf("[FAIL]\tChild died\n");
+				nerrs++;
+			} else if (WEXITSTATUS(status) != 0) {
+				printf("[FAIL]\tChild failed\n");
+				nerrs++;
+			} else {
+				printf("[OK]\tChild succeeded\n");
+			}
+		}
+
+		printf("[RUN]\tTest size\n");
+		int i;
+		for (i = 0; i < 8192; i++) {
+			desc.entry_number = i;
+			desc.limit = i;
+			if (safe_modify_ldt(0x11, &desc, sizeof(desc)) != 0) {
+				printf("[FAIL]\tFailed to install entry %d\n", i);
+				nerrs++;
+				break;
+			}
+		}
+		for (int j = 0; j < i; j++) {
+			check_valid_segment(j, 1, AR_DPL3 | AR_TYPE_XOCODE |
+					    AR_S | AR_DB | AR_AVL, j, false);
+		}
+		printf("[DONE]\tSize test\n");
+	} else {
+		printf("[SKIP]\tSkipping fork and size tests because we have no LDT\n");
+	}
+
+	/* Test entry_number too high. */
+	desc.entry_number = 8192;
+	fail_install(&desc);
+
+	/* Test deletion and actions mistakeable for deletion. */
+	memset(&desc, 0, sizeof(desc));
+	install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P);
+
+	desc.seg_not_present = 1;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S);
+
+	desc.seg_not_present = 0;
+	desc.read_exec_only = 1;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S | AR_P);
+
+	desc.read_exec_only = 0;
+	desc.seg_not_present = 1;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S);
+
+	desc.read_exec_only = 1;
+	desc.limit = 1;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S);
+
+	desc.limit = 0;
+	desc.base_addr = 1;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S);
+
+	desc.base_addr = 0;
+	install_invalid(&desc, false);
+
+	desc.seg_not_present = 0;
+	desc.read_exec_only = 0;
+	desc.seg_32bit = 1;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P | AR_DB);
+	install_invalid(&desc, true);
+}
+
+/*
+ * 0: thread is idle
+ * 1: thread armed
+ * 2: thread should clear LDT entry 0
+ * 3: thread should exit
+ */
+static volatile unsigned int ftx;
+
+static void *threadproc(void *ctx)
+{
+	cpu_set_t cpuset;
+	CPU_ZERO(&cpuset);
+	CPU_SET(1, &cpuset);
+	if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0)
+		err(1, "sched_setaffinity to CPU 1");	/* should never fail */
+
+	while (1) {
+		syscall(SYS_futex, &ftx, FUTEX_WAIT, 0, NULL, NULL, 0);
+		while (ftx != 2) {
+			if (ftx >= 3)
+				return NULL;
+		}
+
+		/* clear LDT entry 0 */
+		const struct user_desc desc = {};
+		if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) != 0)
+			err(1, "modify_ldt");
+
+		/* If ftx == 2, set it to zero.  If ftx == 100, quit. */
+		unsigned int x = -2;
+		asm volatile ("lock xaddl %[x], %[ftx]" :
+			      [x] "+r" (x), [ftx] "+m" (ftx));
+		if (x != 2)
+			return NULL;
+	}
+}
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+		       int flags)
+{
+	struct sigaction sa;
+	memset(&sa, 0, sizeof(sa));
+	sa.sa_sigaction = handler;
+	sa.sa_flags = SA_SIGINFO | flags;
+	sigemptyset(&sa.sa_mask);
+	if (sigaction(sig, &sa, 0))
+		err(1, "sigaction");
+
+}
+
+static jmp_buf jmpbuf;
+
+static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
+{
+	siglongjmp(jmpbuf, 1);
+}
+
+static void do_multicpu_tests(void)
+{
+	cpu_set_t cpuset;
+	pthread_t thread;
+	int failures = 0, iters = 5, i;
+	unsigned short orig_ss;
+
+	CPU_ZERO(&cpuset);
+	CPU_SET(1, &cpuset);
+	if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) {
+		printf("[SKIP]\tCannot set affinity to CPU 1\n");
+		return;
+	}
+
+	CPU_ZERO(&cpuset);
+	CPU_SET(0, &cpuset);
+	if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) {
+		printf("[SKIP]\tCannot set affinity to CPU 0\n");
+		return;
+	}
+
+	sethandler(SIGSEGV, sigsegv, 0);
+#ifdef __i386__
+	/* True 32-bit kernels send SIGILL instead of SIGSEGV on IRET faults. */
+	sethandler(SIGILL, sigsegv, 0);
+#endif
+
+	printf("[RUN]\tCross-CPU LDT invalidation\n");
+
+	if (pthread_create(&thread, 0, threadproc, 0) != 0)
+		err(1, "pthread_create");
+
+	asm volatile ("mov %%ss, %0" : "=rm" (orig_ss));
+
+	for (i = 0; i < 5; i++) {
+		if (sigsetjmp(jmpbuf, 1) != 0)
+			continue;
+
+		/* Make sure the thread is ready after the last test. */
+		while (ftx != 0)
+			;
+
+		struct user_desc desc = {
+			.entry_number    = 0,
+			.base_addr       = 0,
+			.limit           = 0xfffff,
+			.seg_32bit       = 1,
+			.contents        = 0, /* Data */
+			.read_exec_only  = 0,
+			.limit_in_pages  = 1,
+			.seg_not_present = 0,
+			.useable         = 0
+		};
+
+		if (safe_modify_ldt(0x11, &desc, sizeof(desc)) != 0) {
+			if (errno != ENOSYS)
+				err(1, "modify_ldt");
+			printf("[SKIP]\tmodify_ldt unavailable\n");
+			break;
+		}
+
+		/* Arm the thread. */
+		ftx = 1;
+		syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
+
+		asm volatile ("mov %0, %%ss" : : "r" (0x7));
+
+		/* Go! */
+		ftx = 2;
+
+		while (ftx != 0)
+			;
+
+		/*
+		 * On success, modify_ldt will segfault us synchronously,
+		 * and we'll escape via siglongjmp.
+		 */
+
+		failures++;
+		asm volatile ("mov %0, %%ss" : : "rm" (orig_ss));
+	};
+
+	ftx = 100;  /* Kill the thread. */
+	syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
+
+	if (pthread_join(thread, NULL) != 0)
+		err(1, "pthread_join");
+
+	if (failures) {
+		printf("[FAIL]\t%d of %d iterations failed\n", failures, iters);
+		nerrs++;
+	} else {
+		printf("[OK]\tAll %d iterations succeeded\n", iters);
+	}
+}
+
+static int finish_exec_test(void)
+{
+	/*
+	 * In a sensible world, this would be check_invalid_segment(0, 1);
+	 * For better or for worse, though, the LDT is inherited across exec.
+	 * We can probably change this safely, but for now we test it.
+	 */
+	check_valid_segment(0, 1,
+			    AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB,
+			    42, true);
+
+	return nerrs ? 1 : 0;
+}
+
+static void do_exec_test(void)
+{
+	printf("[RUN]\tTest exec\n");
+
+	struct user_desc desc = {
+		.entry_number    = 0,
+		.base_addr       = 0,
+		.limit           = 42,
+		.seg_32bit       = 1,
+		.contents        = 2, /* Code, not conforming */
+		.read_exec_only  = 0,
+		.limit_in_pages  = 0,
+		.seg_not_present = 0,
+		.useable         = 0
+	};
+	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB);
+
+	pid_t child = fork();
+	if (child == 0) {
+		execl("/proc/self/exe", "ldt_gdt_test_exec", NULL);
+		printf("[FAIL]\tCould not exec self\n");
+		exit(1);	/* exec failed */
+	} else {
+		int status;
+		if (waitpid(child, &status, 0) != child ||
+		    !WIFEXITED(status)) {
+			printf("[FAIL]\tChild died\n");
+			nerrs++;
+		} else if (WEXITSTATUS(status) != 0) {
+			printf("[FAIL]\tChild failed\n");
+			nerrs++;
+		} else {
+			printf("[OK]\tChild succeeded\n");
+		}
+	}
+}
+
+int main(int argc, char **argv)
+{
+	if (argc == 1 && !strcmp(argv[0], "ldt_gdt_test_exec"))
+		return finish_exec_test();
+
+	do_simple_tests();
+
+	do_multicpu_tests();
+
+	do_exec_test();
+
+	return nerrs ? 1 : 0;
+}

+ 130 - 0
tools/testing/selftests/x86/syscall_arg_fault.c

@@ -0,0 +1,130 @@
+/*
+ * syscall_arg_fault.c - tests faults 32-bit fast syscall stack args
+ * Copyright (c) 2015 Andrew Lutomirski
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/signal.h>
+#include <sys/ucontext.h>
+#include <err.h>
+#include <setjmp.h>
+#include <errno.h>
+
+/* Our sigaltstack scratch space. */
+static unsigned char altstack_data[SIGSTKSZ];
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+		       int flags)
+{
+	struct sigaction sa;
+	memset(&sa, 0, sizeof(sa));
+	sa.sa_sigaction = handler;
+	sa.sa_flags = SA_SIGINFO | flags;
+	sigemptyset(&sa.sa_mask);
+	if (sigaction(sig, &sa, 0))
+		err(1, "sigaction");
+}
+
+static volatile sig_atomic_t sig_traps;
+static sigjmp_buf jmpbuf;
+
+static volatile sig_atomic_t n_errs;
+
+static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
+{
+	ucontext_t *ctx = (ucontext_t*)ctx_void;
+
+	if (ctx->uc_mcontext.gregs[REG_EAX] != -EFAULT) {
+		printf("[FAIL]\tAX had the wrong value: 0x%x\n",
+		       ctx->uc_mcontext.gregs[REG_EAX]);
+		n_errs++;
+	} else {
+		printf("[OK]\tSeems okay\n");
+	}
+
+	siglongjmp(jmpbuf, 1);
+}
+
+static void sigill(int sig, siginfo_t *info, void *ctx_void)
+{
+	printf("[SKIP]\tIllegal instruction\n");
+	siglongjmp(jmpbuf, 1);
+}
+
+int main()
+{
+	stack_t stack = {
+		.ss_sp = altstack_data,
+		.ss_size = SIGSTKSZ,
+	};
+	if (sigaltstack(&stack, NULL) != 0)
+		err(1, "sigaltstack");
+
+	sethandler(SIGSEGV, sigsegv, SA_ONSTACK);
+	sethandler(SIGILL, sigill, SA_ONSTACK);
+
+	/*
+	 * Exercise another nasty special case.  The 32-bit SYSCALL
+	 * and SYSENTER instructions (even in compat mode) each
+	 * clobber one register.  A Linux system call has a syscall
+	 * number and six arguments, and the user stack pointer
+	 * needs to live in some register on return.  That means
+	 * that we need eight registers, but SYSCALL and SYSENTER
+	 * only preserve seven registers.  As a result, one argument
+	 * ends up on the stack.  The stack is user memory, which
+	 * means that the kernel can fail to read it.
+	 *
+	 * The 32-bit fast system calls don't have a defined ABI:
+	 * we're supposed to invoke them through the vDSO.  So we'll
+	 * fudge it: we set all regs to invalid pointer values and
+	 * invoke the entry instruction.  The return will fail no
+	 * matter what, and we completely lose our program state,
+	 * but we can fix it up with a signal handler.
+	 */
+
+	printf("[RUN]\tSYSENTER with invalid state\n");
+	if (sigsetjmp(jmpbuf, 1) == 0) {
+		asm volatile (
+			"movl $-1, %%eax\n\t"
+			"movl $-1, %%ebx\n\t"
+			"movl $-1, %%ecx\n\t"
+			"movl $-1, %%edx\n\t"
+			"movl $-1, %%esi\n\t"
+			"movl $-1, %%edi\n\t"
+			"movl $-1, %%ebp\n\t"
+			"movl $-1, %%esp\n\t"
+			"sysenter"
+			: : : "memory", "flags");
+	}
+
+	printf("[RUN]\tSYSCALL with invalid state\n");
+	if (sigsetjmp(jmpbuf, 1) == 0) {
+		asm volatile (
+			"movl $-1, %%eax\n\t"
+			"movl $-1, %%ebx\n\t"
+			"movl $-1, %%ecx\n\t"
+			"movl $-1, %%edx\n\t"
+			"movl $-1, %%esi\n\t"
+			"movl $-1, %%edi\n\t"
+			"movl $-1, %%ebp\n\t"
+			"movl $-1, %%esp\n\t"
+			"syscall\n\t"
+			"pushl $0"	/* make sure we segfault cleanly */
+			: : : "memory", "flags");
+	}
+
+	return 0;
+}