@@ -7,6 +7,7 @@
 #include <linux/export.h>
 #include <linux/cpu.h>
 #include <linux/debugfs.h>
+#include <linux/gfp.h>

 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -35,7 +36,7 @@
  * necessary invalidation by clearing out the 'ctx_id' which
  * forces a TLB flush when the context is loaded.
  */
-void clear_asid_other(void)
+static void clear_asid_other(void)
 {
 	u16 asid;

@@ -185,8 +186,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 {
 	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
 	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+	bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
 	unsigned cpu = smp_processor_id();
 	u64 next_tlb_gen;
+	bool need_flush;
+	u16 new_asid;

 	/*
 	 * NB: The scheduler will call us with prev == next when switching
@@ -240,20 +244,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			   next->context.ctx_id);

 		/*
-		 * We don't currently support having a real mm loaded without
-		 * our cpu set in mm_cpumask(). We have all the bookkeeping
-		 * in place to figure out whether we would need to flush
-		 * if our cpu were cleared in mm_cpumask(), but we don't
-		 * currently use it.
+		 * Even in lazy TLB mode, the CPU should stay set in the
+		 * mm_cpumask. The TLB shootdown code can figure out from
+		 * cpu_tlbstate.is_lazy whether or not to send an IPI.
 		 */
 		if (WARN_ON_ONCE(real_prev != &init_mm &&
 				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
 			cpumask_set_cpu(cpu, mm_cpumask(next));

-		return;
+		/*
+		 * If the CPU is not in lazy TLB mode, we are just switching
+		 * from one thread in a process to another thread in the same
+		 * process. No TLB flush required.
+		 */
+		if (!was_lazy)
+			return;
+
+		/*
+		 * Read the tlb_gen to check whether a flush is needed.
+		 * If the TLB is up to date, just use it.
+		 * The barrier synchronizes with the tlb_gen increment in
+		 * the TLB shootdown code.
+		 */
+		smp_mb();
+		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
+				next_tlb_gen)
+			return;
+
+		/*
+		 * TLB contents went out of date while we were in lazy
+		 * mode. Fall through to the TLB switching code below.
+		 */
+		new_asid = prev_asid;
+		need_flush = true;
 	} else {
-		u16 new_asid;
-		bool need_flush;
 		u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);

 		/*
@@ -285,53 +310,60 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			sync_current_stack_to_mm(next);
 		}

-		/* Stop remote flushes for the previous mm */
-		VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
-				real_prev != &init_mm);
-		cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+		/*
+		 * Stop remote flushes for the previous mm.
+		 * Skip kernel threads; we never send init_mm TLB flushing IPIs,
+		 * but the bitmap manipulation can cause cache line contention.
+		 */
+		if (real_prev != &init_mm) {
+			VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
+						mm_cpumask(real_prev)));
+			cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+		}

 		/*
 		 * Start remote flushes and then read tlb_gen.
 		 */
-		cpumask_set_cpu(cpu, mm_cpumask(next));
+		if (next != &init_mm)
+			cpumask_set_cpu(cpu, mm_cpumask(next));
 		next_tlb_gen = atomic64_read(&next->context.tlb_gen);

 		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+	}

-		if (need_flush) {
-			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
-			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
-			load_new_mm_cr3(next->pgd, new_asid, true);
-
-			/*
-			 * NB: This gets called via leave_mm() in the idle path
-			 * where RCU functions differently. Tracing normally
-			 * uses RCU, so we need to use the _rcuidle variant.
-			 *
-			 * (There is no good reason for this. The idle code should
-			 * be rearranged to call this before rcu_idle_enter().)
-			 */
-			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
-		} else {
-			/* The new ASID is already up to date. */
-			load_new_mm_cr3(next->pgd, new_asid, false);
-
-			/* See above wrt _rcuidle. */
-			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
-		}
+	if (need_flush) {
+		this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+		this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+		load_new_mm_cr3(next->pgd, new_asid, true);

 		/*
-		 * Record last user mm's context id, so we can avoid
-		 * flushing branch buffer with IBPB if we switch back
-		 * to the same user.
+		 * NB: This gets called via leave_mm() in the idle path
+		 * where RCU functions differently. Tracing normally
+		 * uses RCU, so we need to use the _rcuidle variant.
+		 *
+		 * (There is no good reason for this. The idle code should
+		 * be rearranged to call this before rcu_idle_enter().)
 		 */
-		if (next != &init_mm)
-			this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
+		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+	} else {
+		/* The new ASID is already up to date. */
+		load_new_mm_cr3(next->pgd, new_asid, false);

-		this_cpu_write(cpu_tlbstate.loaded_mm, next);
-		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+		/* See above wrt _rcuidle. */
+		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
 	}

+	/*
+	 * Record last user mm's context id, so we can avoid
+	 * flushing branch buffer with IBPB if we switch back
+	 * to the same user.
+	 */
+	if (next != &init_mm)
+		this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
+
+	this_cpu_write(cpu_tlbstate.loaded_mm, next);
+	this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+
 	load_mm_cr4(next);
 	switch_ldt(real_prev, next);
 }
@@ -354,20 +386,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
 		return;

-	if (tlb_defer_switch_to_init_mm()) {
-		/*
-		 * There's a significant optimization that may be possible
-		 * here. We have accurate enough TLB flush tracking that we
-		 * don't need to maintain coherence of TLB per se when we're
-		 * lazy. We do, however, need to maintain coherence of
-		 * paging-structure caches. We could, in principle, leave our
-		 * old mm loaded and only switch to init_mm when
-		 * tlb_remove_page() happens.
-		 */
-		this_cpu_write(cpu_tlbstate.is_lazy, true);
-	} else {
-		switch_mm(NULL, &init_mm, NULL);
-	}
+	this_cpu_write(cpu_tlbstate.is_lazy, true);
 }

 /*
@@ -454,6 +473,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 		 * paging-structure cache to avoid speculatively reading
 		 * garbage into our TLB. Since switching to init_mm is barely
 		 * slower than a minimal flush, just switch to init_mm.
+		 *
+		 * This should be rare, since native_flush_tlb_others()
+		 * skips IPIs to CPUs in lazy TLB mode.
 		 */
 		switch_mm_irqs_off(NULL, &init_mm, NULL);
 		return;
@@ -560,6 +582,9 @@ static void flush_tlb_func_remote(void *info)
 void native_flush_tlb_others(const struct cpumask *cpumask,
 			     const struct flush_tlb_info *info)
 {
+	cpumask_var_t lazymask;
+	unsigned int cpu;
+
 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
 	if (info->end == TLB_FLUSH_ALL)
 		trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
@@ -583,8 +608,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 		 * that UV should be updated so that smp_call_function_many(),
 		 * etc, are optimal on UV.
 		 */
-		unsigned int cpu;
-
 		cpu = smp_processor_id();
 		cpumask = uv_flush_tlb_others(cpumask, info);
 		if (cpumask)
@@ -592,8 +615,29 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 					       (void *)info, 1);
 		return;
 	}
-	smp_call_function_many(cpumask, flush_tlb_func_remote,
+
+	/*
+	 * A temporary cpumask is used in order to skip sending IPIs
+	 * to CPUs in lazy TLB state, while keeping them in mm_cpumask(mm).
+	 * If the allocation fails, simply IPI every CPU in mm_cpumask.
+	 */
+	if (!alloc_cpumask_var(&lazymask, GFP_ATOMIC)) {
+		smp_call_function_many(cpumask, flush_tlb_func_remote,
 			       (void *)info, 1);
+		return;
+	}
+
+	cpumask_copy(lazymask, cpumask);
+
+	for_each_cpu(cpu, lazymask) {
+		if (per_cpu(cpu_tlbstate.is_lazy, cpu))
+			cpumask_clear_cpu(cpu, lazymask);
+	}
+
+	smp_call_function_many(lazymask, flush_tlb_func_remote,
+			       (void *)info, 1);
+
+	free_cpumask_var(lazymask);
 }

 /*
@@ -646,6 +690,68 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 	put_cpu();
 }

+void tlb_flush_remove_tables_local(void *arg)
+{
+	struct mm_struct *mm = arg;
+
+	if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm &&
+			this_cpu_read(cpu_tlbstate.is_lazy)) {
+		/*
+		 * We're in lazy mode. We need to at least flush our
+		 * paging-structure cache to avoid speculatively reading
+		 * garbage into our TLB. Since switching to init_mm is barely
+		 * slower than a minimal flush, just switch to init_mm.
+		 */
+		switch_mm_irqs_off(NULL, &init_mm, NULL);
+	}
+}
+
+static void mm_fill_lazy_tlb_cpu_mask(struct mm_struct *mm,
+				      struct cpumask *lazy_cpus)
+{
+	int cpu;
+
+	for_each_cpu(cpu, mm_cpumask(mm)) {
+		if (per_cpu(cpu_tlbstate.is_lazy, cpu))
+			cpumask_set_cpu(cpu, lazy_cpus);
+	}
+}
+
+void tlb_flush_remove_tables(struct mm_struct *mm)
+{
+	int cpu = get_cpu();
+	cpumask_var_t lazy_cpus;
+
+	if (cpumask_any_but(mm_cpumask(mm), cpu) >= nr_cpu_ids) {
+		put_cpu();
+		return;
+	}
+
+	if (!zalloc_cpumask_var(&lazy_cpus, GFP_ATOMIC)) {
+		/*
+		 * If the cpumask allocation fails, do a brute force flush
+		 * on all the CPUs that have this mm loaded.
+		 */
+		smp_call_function_many(mm_cpumask(mm),
+				tlb_flush_remove_tables_local, (void *)mm, 1);
+		put_cpu();
+		return;
+	}
+
+	/*
+	 * CPUs with !is_lazy either received a TLB flush IPI while the user
+	 * pages in this address range were unmapped, or have context switched
+	 * and reloaded %CR3 since then.
+	 *
+	 * Shootdown IPIs at page table freeing time only need to be sent to
+	 * CPUs that may have out of date TLB contents.
+	 */
+	mm_fill_lazy_tlb_cpu_mask(mm, lazy_cpus);
+	smp_call_function_many(lazy_cpus,
+			tlb_flush_remove_tables_local, (void *)mm, 1);
+	free_cpumask_var(lazy_cpus);
+	put_cpu();
+}
+
 static void do_flush_tlb_all(void *info)
 {