@@ -35,6 +35,11 @@
#include <linux/freezer.h>
#include <linux/ftrace.h>
#include <linux/ratelimit.h>
+#include <linux/kthread.h>
+#include <linux/init.h>
+
+#include <asm/tlb.h>
+#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/oom.h>
@@ -405,6 +410,133 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);

bool oom_killer_disabled __read_mostly;

+#ifdef CONFIG_MMU
+/*
+ * OOM Reaper kernel thread which tries to reap the memory used by the OOM
+ * victim (if that is possible) to help the OOM killer move on.
+ */
+static struct task_struct *oom_reaper_th;
+static struct mm_struct *mm_to_reap;
+static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
+
+static bool __oom_reap_vmas(struct mm_struct *mm)
+{
+	struct mmu_gather tlb;
+	struct vm_area_struct *vma;
+	struct zap_details details = {.check_swap_entries = true,
+				      .ignore_dirty = true};
+	bool ret = true;
+
+	/* We might have raced with the exit path */
+	if (!atomic_inc_not_zero(&mm->mm_users))
+		return true;
+
+	if (!down_read_trylock(&mm->mmap_sem)) {
+		ret = false;
+		goto out;
+	}
+
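+	/*
+	 * Batch the unmapping: gather TLB flushes over the whole address
+	 * range (0, -1) instead of flushing page by page.
+	 */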
+	tlb_gather_mmu(&tlb, mm, 0, -1);
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (is_vm_hugetlb_page(vma))
+			continue;
+
+		/*
+		 * mlocked VMAs require explicit munlocking before unmap.
+		 * Let's keep it simple here and skip such VMAs.
+		 */
+		if (vma->vm_flags & VM_LOCKED)
+			continue;
+
+		/*
+		 * Only anonymous pages have a good chance to be dropped
+		 * without additional steps, which we cannot afford as we
+		 * are OOM already.
+		 *
+		 * We do not even care about fs-backed pages because all
+		 * that are reclaimable have already been reclaimed, and we
+		 * do not want to block exit_mmap by keeping the mm ref
+		 * count elevated without a good reason.
+		 */
+		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED))
+			unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
+					 &details);
+	}
+	tlb_finish_mmu(&tlb, 0, -1);
+	up_read(&mm->mmap_sem);
+out:
+	mmput(mm);
+	return ret;
+}
+
+static void oom_reap_vmas(struct mm_struct *mm)
+{
+	int attempts = 0;
+
+	/* Retry the down_read_trylock(mmap_sem) a few times, 100ms apart */
+	while (attempts++ < 10 && !__oom_reap_vmas(mm))
+		schedule_timeout_idle(HZ/10);
+
+	/* Drop the reference taken by wake_oom_reaper */
+	mmdrop(mm);
+}
+
+static int oom_reaper(void *unused)
+{
+	while (true) {
+		struct mm_struct *mm;
+
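+		/* Sleep (freezably) until wake_oom_reaper() publishes an mm */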
+		wait_event_freezable(oom_reaper_wait,
+				     (mm = READ_ONCE(mm_to_reap)));
+		oom_reap_vmas(mm);
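+		/* Clear the slot so the next OOM victim can be queued */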
+		WRITE_ONCE(mm_to_reap, NULL);
+	}
+
+	return 0;
+}
+
+static void wake_oom_reaper(struct mm_struct *mm)
+{
+	struct mm_struct *old_mm;
+
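+	/* The reaper kthread may have failed to start; nothing we can do then */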
+	if (!oom_reaper_th)
+		return;
+
+	/*
+	 * Pin the given mm. Use mm_count instead of mm_users because
+	 * we do not want to delay the address space teardown.
+	 */
+	atomic_inc(&mm->mm_count);
+
+	/*
+	 * Make sure that only a single mm is ever queued for the reaper
+	 * because queuing more is not necessary and the operation might
+	 * be disruptive, so better keep it to the bare minimum.
+	 */
+	old_mm = cmpxchg(&mm_to_reap, NULL, mm);
+	if (!old_mm)
+		wake_up(&oom_reaper_wait);
+	else
+		mmdrop(mm);
+}
+
+static int __init oom_init(void)
+{
+	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
+	if (IS_ERR(oom_reaper_th)) {
+		pr_err("Unable to start OOM reaper %ld. Continuing regardless\n",
+		       PTR_ERR(oom_reaper_th));
+		oom_reaper_th = NULL;
+	}
+	return 0;
+}
+subsys_initcall(oom_init)
+#else
+static void wake_oom_reaper(struct mm_struct *mm)
+{
+}
+#endif
+
/**
 * mark_oom_victim - mark the given task as OOM victim
 * @tsk: task to mark
@@ -510,6 +642,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
	unsigned int victim_points = 0;
	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);
+	bool can_oom_reap = true;

	/*
	 * If the task is already exiting, don't alarm the sysadmin or kill
@@ -600,17 +733,23 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
			continue;
		if (same_thread_group(p, victim))
			continue;
-		if (unlikely(p->flags & PF_KTHREAD))
-			continue;
-		if (is_global_init(p))
-			continue;
-		if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+		if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) ||
+		    p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+			/*
+			 * We cannot use the oom_reaper for an mm shared with
+			 * this process because it wouldn't get killed and so
+			 * the memory might still be in use.
+			 */
+			can_oom_reap = false;
			continue;
-
+		}
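+		/* p shares the victim's mm and is killable: kill it as well */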
		do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
	}
	rcu_read_unlock();

+	if (can_oom_reap)
+		wake_oom_reaper(mm);
+
	mmdrop(mm);
	put_task_struct(victim);
}