tlb.c

#include <linux/init.h>
#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/cpu.h>

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>
#include <linux/debugfs.h>

/*
 *	TLB flushing, formerly SMP-only
 *		c/o Linus Torvalds.
 *
 *	These mean you can really definitely utterly forget about
 *	writing to user space from interrupts. (It's not allowed anyway.)
 *
 *	Optimizations Manfred Spraul <manfred@colorfullife.com>
 *
 *	More scalable flush, from Andi Kleen
 *
 *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
 */
void leave_mm(int cpu)
{
        struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);

        /*
         * It's plausible that we're in lazy TLB mode while our mm is init_mm.
         * If so, our callers still expect us to flush the TLB, but there
         * aren't any user TLB entries in init_mm to worry about.
         *
         * This needs to happen before any other sanity checks due to
         * intel_idle's shenanigans.
         */
        if (loaded_mm == &init_mm)
                return;

        if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
                BUG();

        switch_mm(NULL, &init_mm, NULL);
}
EXPORT_SYMBOL_GPL(leave_mm);
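
/*
 * Switch the loaded mm with interrupts disabled.  switch_mm() is a thin
 * wrapper around switch_mm_irqs_off() for callers that may be running
 * with interrupts enabled.
 */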
void switch_mm(struct mm_struct *prev, struct mm_struct *next,
               struct task_struct *tsk)
{
        unsigned long flags;

        local_irq_save(flags);
        switch_mm_irqs_off(prev, next, tsk);
        local_irq_restore(flags);
}
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                        struct task_struct *tsk)
{
        unsigned cpu = smp_processor_id();
        struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);

        /*
         * NB: The scheduler will call us with prev == next when
         * switching from lazy TLB mode to normal mode if active_mm
         * isn't changing.  When this happens, there is no guarantee
         * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next.
         *
         * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
         */

        this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);

        if (real_prev == next) {
                /*
                 * There's nothing to do: we always keep the per-mm control
                 * regs in sync with cpu_tlbstate.loaded_mm.  Just
                 * sanity-check mm_cpumask.
                 */
                if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next))))
                        cpumask_set_cpu(cpu, mm_cpumask(next));
                return;
        }

        if (IS_ENABLED(CONFIG_VMAP_STACK)) {
                /*
                 * If our current stack is in vmalloc space and isn't
                 * mapped in the new pgd, we'll double-fault.  Forcibly
                 * map it.
                 */
                unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
                pgd_t *pgd = next->pgd + stack_pgd_index;

                if (unlikely(pgd_none(*pgd)))
                        set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
        }

        this_cpu_write(cpu_tlbstate.loaded_mm, next);

        WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
        cpumask_set_cpu(cpu, mm_cpumask(next));

        /*
         * Re-load page tables.
         *
         * This logic has an ordering constraint:
         *
         *  CPU 0: Write to a PTE for 'next'
         *  CPU 0: load bit 1 in mm_cpumask.  if nonzero, send IPI.
         *  CPU 1: set bit 1 in next's mm_cpumask
         *  CPU 1: load from the PTE that CPU 0 writes (implicit)
         *
         * We need to prevent an outcome in which CPU 1 observes
         * the new PTE value and CPU 0 observes bit 1 clear in
         * mm_cpumask.  (If that occurs, then the IPI will never
         * be sent, and CPU 0's TLB will contain a stale entry.)
         *
         * The bad outcome can occur if either CPU's load is
         * reordered before that CPU's store, so both CPUs must
         * execute full barriers to prevent this from happening.
         *
         * Thus, switch_mm needs a full barrier between the
         * store to mm_cpumask and any operation that could load
         * from next->pgd.  TLB fills are special and can happen
         * due to instruction fetches or for no reason at all,
         * and neither LOCK nor MFENCE orders them.
         * Fortunately, load_cr3() is serializing and gives the
         * ordering guarantee we need.
         */
        load_cr3(next->pgd);

        /*
         * This gets called via leave_mm() in the idle path where RCU
         * functions differently.  Tracing normally uses RCU, so we have to
         * call the tracepoint specially here.
         */
        trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);

        /* Stop flush ipis for the previous mm */
        WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
                     real_prev != &init_mm);
        cpumask_clear_cpu(cpu, mm_cpumask(real_prev));

        /* Load per-mm CR4 and LDTR state */
        load_mm_cr4(next);
        switch_ldt(real_prev, next);
}
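
/*
 * flush_tlb_func_common() does the actual flush on the local CPU, for both
 * locally initiated flushes and remote shootdowns received via IPI.  A CPU
 * in lazy TLB mode simply leaves the mm instead of flushing.
 */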
static void flush_tlb_func_common(const struct flush_tlb_info *f,
                                  bool local, enum tlb_flush_reason reason)
{
        /* This code cannot presently handle being reentered. */
        VM_WARN_ON(!irqs_disabled());

        if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
                leave_mm(smp_processor_id());
                return;
        }

        if (f->end == TLB_FLUSH_ALL) {
                local_flush_tlb();
                if (local)
                        count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
                trace_tlb_flush(reason, TLB_FLUSH_ALL);
        } else {
                unsigned long addr;
                unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;

                addr = f->start;
                while (addr < f->end) {
                        __flush_tlb_single(addr);
                        addr += PAGE_SIZE;
                }
                if (local)
                        count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
                trace_tlb_flush(reason, nr_pages);
        }
}
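
/*
 * flush_tlb_func_local() is called on the CPU that requested the flush;
 * flush_tlb_func_remote() is the smp_call_function (IPI) handler run on
 * other CPUs, which skips the flush if the target mm is not loaded there.
 */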
static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
{
        const struct flush_tlb_info *f = info;

        flush_tlb_func_common(f, true, reason);
}

static void flush_tlb_func_remote(void *info)
{
        const struct flush_tlb_info *f = info;

        inc_irq_stat(irq_tlb_count);

        if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
                return;

        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
        flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
}
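
/*
 * Ask the CPUs in @cpumask to run flush_tlb_func_remote().  On SGI UV
 * systems, uv_flush_tlb_others() may perform the shootdown itself and
 * returns the mask of CPUs that still need an IPI (or NULL if none do).
 */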
void native_flush_tlb_others(const struct cpumask *cpumask,
                             const struct flush_tlb_info *info)
{
        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
        if (info->end == TLB_FLUSH_ALL)
                trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
        else
                trace_tlb_flush(TLB_REMOTE_SEND_IPI,
                                (info->end - info->start) >> PAGE_SHIFT);

        if (is_uv_system()) {
                unsigned int cpu;

                cpu = smp_processor_id();
                cpumask = uv_flush_tlb_others(cpumask, info);
                if (cpumask)
                        smp_call_function_many(cpumask, flush_tlb_func_remote,
                                               (void *)info, 1);
                return;
        }
        smp_call_function_many(cpumask, flush_tlb_func_remote,
                               (void *)info, 1);
}

/*
 * See Documentation/x86/tlb.txt for details.  We choose 33
 * because it is large enough to cover the vast majority (at
 * least 95%) of allocations, and is small enough that we are
 * confident it will not cause too much overhead.  Each single
 * flush is about 100 ns, so this caps the maximum overhead at
 * _about_ 3,000 ns.
 *
 * This is in units of pages.
 */
static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
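
/*
 * Flush a range of user addresses belonging to @mm, on this CPU and on any
 * other CPU that has the mm loaded.  Ranges at or below the ceiling (and not
 * backed by huge pages) are flushed page by page; anything larger falls back
 * to a full TLB flush.  Typically reached via wrappers such as
 * flush_tlb_range() in <asm/tlbflush.h>.
 */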
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
                        unsigned long end, unsigned long vmflag)
{
        int cpu;

        struct flush_tlb_info info = {
                .mm = mm,
        };

        cpu = get_cpu();

        /* Synchronize with switch_mm. */
        smp_mb();

        /* Should we flush just the requested range? */
        if ((end != TLB_FLUSH_ALL) &&
            !(vmflag & VM_HUGETLB) &&
            ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
                info.start = start;
                info.end = end;
        } else {
                info.start = 0UL;
                info.end = TLB_FLUSH_ALL;
        }

        if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
                VM_WARN_ON(irqs_disabled());
                local_irq_disable();
                flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
                local_irq_enable();
        }

        if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
                flush_tlb_others(mm_cpumask(mm), &info);

        put_cpu();
}
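
/*
 * flush_tlb_all() flushes everything, everywhere: do_flush_tlb_all() runs on
 * every CPU, flushes the entire TLB (including global pages), and drops any
 * lazy mm reference.
 */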
static void do_flush_tlb_all(void *info)
{
        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
        __flush_tlb_all();
        if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
                leave_mm(smp_processor_id());
}

void flush_tlb_all(void)
{
        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
        on_each_cpu(do_flush_tlb_all, NULL, 1);
}
static void do_kernel_range_flush(void *info)
{
        struct flush_tlb_info *f = info;
        unsigned long addr;

        /* Flush the range one page at a time with 'invlpg' */
        for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
                __flush_tlb_single(addr);
}
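
/*
 * Flush a range of kernel addresses on every CPU.  Small ranges are flushed
 * page by page; a TLB_FLUSH_ALL request, or anything larger than the
 * single-page-flush ceiling, becomes a full flush.
 */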
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
        /* Balance as user space task's flush, a bit conservative */
        if (end == TLB_FLUSH_ALL ||
            (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
                on_each_cpu(do_flush_tlb_all, NULL, 1);
        } else {
                struct flush_tlb_info info;

                info.start = start;
                info.end = end;
                on_each_cpu(do_kernel_range_flush, &info, 1);
        }
}
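
/*
 * Flush the TLB invalidations deferred by the page reclaim unmap batching:
 * a full flush on this CPU if it is in the batch's cpumask, IPIs to the
 * remaining CPUs, then reset the mask for the next batch.
 */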
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{
        struct flush_tlb_info info = {
                .mm = NULL,
                .start = 0UL,
                .end = TLB_FLUSH_ALL,
        };
        int cpu = get_cpu();

        if (cpumask_test_cpu(cpu, &batch->cpumask)) {
                VM_WARN_ON(irqs_disabled());
                local_irq_disable();
                flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
                local_irq_enable();
        }

        if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
                flush_tlb_others(&batch->cpumask, &info);

        cpumask_clear(&batch->cpumask);

        put_cpu();
}
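
/*
 * debugfs knob: tlb_single_page_flush_ceiling is exposed under
 * arch_debugfs_dir so the page-by-page vs. full-flush crossover can be
 * inspected and tuned at runtime.
 */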
static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
                                  size_t count, loff_t *ppos)
{
        char buf[32];
        unsigned int len;

        len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling);
        return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}

static ssize_t tlbflush_write_file(struct file *file,
                const char __user *user_buf, size_t count, loff_t *ppos)
{
        char buf[32];
        ssize_t len;
        int ceiling;

        len = min(count, sizeof(buf) - 1);
        if (copy_from_user(buf, user_buf, len))
                return -EFAULT;

        buf[len] = '\0';
        if (kstrtoint(buf, 0, &ceiling))
                return -EINVAL;

        if (ceiling < 0)
                return -EINVAL;

        tlb_single_page_flush_ceiling = ceiling;
        return count;
}

static const struct file_operations fops_tlbflush = {
        .read = tlbflush_read_file,
        .write = tlbflush_write_file,
        .llseek = default_llseek,
};

static int __init create_tlb_single_page_flush_ceiling(void)
{
        debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
                            arch_debugfs_dir, NULL, &fops_tlbflush);
        return 0;
}
late_initcall(create_tlb_single_page_flush_ceiling);