vsyscall_64.c

/*
 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
 * Copyright 2003 Andi Kleen, SuSE Labs.
 *
 * [ NOTE: this mechanism is now deprecated in favor of the vDSO. ]
 *
 * Thanks to hpa@transmeta.com for some useful hints.
 * Special thanks to Ingo Molnar for his early experience with
 * a different vsyscall implementation for Linux/IA32 and for the name.
 *
 * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
 * at virtual address -10Mbyte+1024bytes etc... There are at most 4
 * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
 * jumping out of line if necessary. We cannot add more with this
 * mechanism because older kernels won't return -ENOSYS.
 *
 * Note: the concept clashes with user mode linux. UML users should
 * use the vDSO.
 */
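
/*
 * For concreteness (a worked example of the layout described above, not
 * extra mechanism): with VSYSCALL_ADDR at -10MB = 0xffffffffff600000 and
 * 1024-byte slots, the three remaining vsyscalls live at:
 *
 *	0xffffffffff600000	gettimeofday
 *	0xffffffffff600400	time
 *	0xffffffffff600800	getcpu
 */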

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/time.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/seqlock.h>
#include <linux/jiffies.h>
#include <linux/sysctl.h>
#include <linux/topology.h>
#include <linux/timekeeper_internal.h>
#include <linux/getcpu.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/notifier.h>
#include <linux/syscalls.h>
#include <linux/ratelimit.h>

#include <asm/vsyscall.h>
#include <asm/pgtable.h>
#include <asm/compat.h>
#include <asm/page.h>
#include <asm/unistd.h>
#include <asm/fixmap.h>
#include <asm/errno.h>
#include <asm/io.h>
#include <asm/segment.h>
#include <asm/desc.h>
#include <asm/topology.h>
#include <asm/traps.h>

#define CREATE_TRACE_POINTS
#include "vsyscall_trace.h"

DEFINE_VVAR(int, vgetcpu_mode);

static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;

static int __init vsyscall_setup(char *str)
{
	if (str) {
		if (!strcmp("emulate", str))
			vsyscall_mode = EMULATE;
		else if (!strcmp("native", str))
			vsyscall_mode = NATIVE;
		else if (!strcmp("none", str))
			vsyscall_mode = NONE;
		else
			return -EINVAL;

		return 0;
	}

	return -EINVAL;
}
early_param("vsyscall", vsyscall_setup);
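
/*
 * Example boot-time usage of the parameter parsed above (any value other
 * than these three is rejected with -EINVAL):
 *
 *	vsyscall=emulate	trap and emulate each vsyscall (the default)
 *	vsyscall=native		map the vsyscall page executable
 *	vsyscall=none		no emulation; attempts raise SIGSEGV
 */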

static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
			      const char *message)
{
	if (!show_unhandled_signals)
		return;

	pr_notice_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
			      level, current->comm, task_pid_nr(current),
			      message, regs->ip, regs->cs,
			      regs->sp, regs->ax, regs->si, regs->di);
}

static int addr_to_vsyscall_nr(unsigned long addr)
{
	int nr;

	if ((addr & ~0xC00UL) != VSYSCALL_ADDR)
		return -EINVAL;

	nr = (addr & 0xC00UL) >> 10;
	if (nr >= 3)
		return -EINVAL;

	return nr;
}
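
/*
 * Worked example for the masks above: for address 0xffffffffff600400,
 * addr & ~0xC00UL == 0xffffffffff600000 == VSYSCALL_ADDR and
 * (addr & 0xC00UL) >> 10 == 1, i.e. the time() slot.  An address such
 * as 0xffffffffff600100 fails the first check and is reported as a
 * misaligned vsyscall, and slot 3 (offset 0xC00) is rejected by the
 * nr >= 3 check.
 */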

static bool write_ok_or_segv(unsigned long ptr, size_t size)
{
	/*
	 * XXX: if access_ok, get_user, and put_user handled
	 * sig_on_uaccess_error, this could go away.
	 */
	if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) {
		siginfo_t info;
		struct thread_struct *thread = &current->thread;

		thread->error_code	= 6;	/* user fault, no page, write */
		thread->cr2		= ptr;
		thread->trap_nr		= X86_TRAP_PF;

		memset(&info, 0, sizeof(info));
		info.si_signo		= SIGSEGV;
		info.si_errno		= 0;
		info.si_code		= SEGV_MAPERR;
		info.si_addr		= (void __user *)ptr;

		force_sig_info(SIGSEGV, &info, current);
		return false;
	} else {
		return true;
	}
}

bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
{
	struct task_struct *tsk;
	unsigned long caller;
	int vsyscall_nr, syscall_nr, tmp;
	int prev_sig_on_uaccess_error;
	long ret;

	/*
	 * No point in checking CS -- the only way to get here is a user mode
	 * trap to a high address, which means that we're in 64-bit user code.
	 */
	WARN_ON_ONCE(address != regs->ip);

	if (vsyscall_mode == NONE) {
		warn_bad_vsyscall(KERN_INFO, regs,
				  "vsyscall attempted with vsyscall=none");
		return false;
	}

	vsyscall_nr = addr_to_vsyscall_nr(address);

	trace_emulate_vsyscall(vsyscall_nr);

	if (vsyscall_nr < 0) {
		warn_bad_vsyscall(KERN_WARNING, regs,
				  "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");
		goto sigsegv;
	}

	if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
		warn_bad_vsyscall(KERN_WARNING, regs,
				  "vsyscall with bad stack (exploit attempt?)");
		goto sigsegv;
	}

	tsk = current;

	/*
	 * Check for access_ok violations and find the syscall nr.
	 *
	 * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
	 * 64-bit, so we don't need to special-case it here.  For all the
	 * vsyscalls, NULL means "don't write anything" not "write it at
	 * address 0".
	 */
	switch (vsyscall_nr) {
	case 0:
		if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
		    !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
			ret = -EFAULT;
			goto check_fault;
		}

		syscall_nr = __NR_gettimeofday;
		break;

	case 1:
		if (!write_ok_or_segv(regs->di, sizeof(time_t))) {
			ret = -EFAULT;
			goto check_fault;
		}

		syscall_nr = __NR_time;
		break;

	case 2:
		if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
		    !write_ok_or_segv(regs->si, sizeof(unsigned))) {
			ret = -EFAULT;
			goto check_fault;
		}

		syscall_nr = __NR_getcpu;
		break;
	}

	/*
	 * Handle seccomp.  regs->ip must be the original value.
	 * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt.
	 *
	 * We could optimize the seccomp disabled case, but performance
	 * here doesn't matter.
	 */
	regs->orig_ax = syscall_nr;
	regs->ax = -ENOSYS;
	tmp = secure_computing(syscall_nr);
	if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
		warn_bad_vsyscall(KERN_DEBUG, regs,
				  "seccomp tried to change syscall nr or ip");
		do_exit(SIGSYS);
	}
	if (tmp)
		goto do_ret;  /* skip requested */

	/*
	 * With a real vsyscall, page faults cause SIGSEGV.  We want to
	 * preserve that behavior to make writing exploits harder.
	 */
	prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
	current_thread_info()->sig_on_uaccess_error = 1;

	ret = -EFAULT;
	switch (vsyscall_nr) {
	case 0:
		ret = sys_gettimeofday(
			(struct timeval __user *)regs->di,
			(struct timezone __user *)regs->si);
		break;

	case 1:
		ret = sys_time((time_t __user *)regs->di);
		break;

	case 2:
		ret = sys_getcpu((unsigned __user *)regs->di,
				 (unsigned __user *)regs->si,
				 NULL);
		break;
	}

	current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;

check_fault:
	if (ret == -EFAULT) {
		/* Bad news -- userspace fed a bad pointer to a vsyscall. */
		warn_bad_vsyscall(KERN_INFO, regs,
				  "vsyscall fault (exploit attempt?)");

		/*
		 * If we failed to generate a signal for any reason,
		 * generate one here.  (This should be impossible.)
		 */
		if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
				 !sigismember(&tsk->pending.signal, SIGSEGV)))
			goto sigsegv;

		return true;  /* Don't emulate the ret. */
	}

	regs->ax = ret;

do_ret:
	/* Emulate a ret instruction. */
	regs->ip = caller;
	regs->sp += 8;
	return true;

sigsegv:
	force_sig(SIGSEGV, current);
	return true;
}
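
/*
 * Illustrative user-space view (a sketch, not taken from any particular
 * binary): a legacy program calls straight into the fixed address, e.g.
 *
 *	mov	$0xffffffffff600000, %rax	# gettimeofday slot
 *	call	*%rax
 *
 * In EMULATE mode the page is mapped non-executable (see map_vsyscall()
 * below), so the call faults into emulate_vsyscall(), which runs the
 * real syscall and then emulates the ret by restoring ip and popping sp.
 */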

/*
 * Assume __initcall executes before all user space. Hopefully kmod
 * doesn't violate that. We'll find out if it does.
 */
static void vsyscall_set_cpu(int cpu)
{
	unsigned long d;
	unsigned long node = 0;
#ifdef CONFIG_NUMA
	node = cpu_to_node(cpu);
#endif
	if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
		write_rdtscp_aux((node << 12) | cpu);

	/*
	 * Store cpu number in limit so that it can be loaded quickly
	 * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node)
	 */
	d = 0x0f40000000000ULL;
	d |= cpu;
	d |= (node & 0xf) << 12;
	d |= (node >> 4) << 48;

	write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}
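
/*
 * Sketch of the consumer side, following the vDSO's __vdso_getcpu()
 * (names are the vDSO's, not defined in this file): without RDTSCP, the
 * segment limit written above is read back with lsl and split apart:
 *
 *	unsigned int p;
 *	asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
 *	cpu  = p & 0xfff;	// low 12 bits
 *	node = p >> 12;		// upper 8 bits
 */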

static void cpu_vsyscall_init(void *arg)
{
	/* preemption should be already off */
	vsyscall_set_cpu(raw_smp_processor_id());
}

static int
cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
{
	long cpu = (long)arg;

	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
		smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);

	return NOTIFY_DONE;
}

void __init map_vsyscall(void)
{
	extern char __vsyscall_page;
	unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);

	__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
		     vsyscall_mode == NATIVE
		     ? PAGE_KERNEL_VSYSCALL
		     : PAGE_KERNEL_VVAR);
	BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
		     (unsigned long)VSYSCALL_ADDR);
}

static int __init vsyscall_init(void)
{
	cpu_notifier_register_begin();

	on_each_cpu(cpu_vsyscall_init, NULL, 1);
	/* notifier priority > KVM */
	__hotcpu_notifier(cpu_vsyscall_notifier, 30);

	cpu_notifier_register_done();

	return 0;
}
__initcall(vsyscall_init);