@@ -22,6 +22,7 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/page-isolation.h>
+#include <linux/jhash.h>

 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -53,6 +54,13 @@ static unsigned long __initdata default_hstate_size;
  */
 DEFINE_SPINLOCK(hugetlb_lock);

+/*
+ * Serializes faults on the same logical page. This is used to
+ * prevent spurious OOMs when the hugepage pool is fully utilized.
+ */
+static int num_fault_mutexes;
+static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
+
 static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
 {
 	bool free = (spool->count == 0) && (spool->used_hpages == 0);
@@ -1961,11 +1969,14 @@ static void __exit hugetlb_exit(void)
 	}

 	kobject_put(hugepages_kobj);
+	kfree(htlb_fault_mutex_table);
 }
 module_exit(hugetlb_exit);

 static int __init hugetlb_init(void)
 {
+	int i;
+
 	/* Some platform decide whether they support huge pages at boot
 	 * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
 	 * there is no such support
@@ -1990,6 +2001,17 @@ static int __init hugetlb_init(void)
 	hugetlb_register_all_nodes();
 	hugetlb_cgroup_file_init();

+#ifdef CONFIG_SMP
+	num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
+#else
+	num_fault_mutexes = 1;
+#endif
+	htlb_fault_mutex_table =
+		kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL);
+	BUG_ON(!htlb_fault_mutex_table);
+
+	for (i = 0; i < num_fault_mutexes; i++)
+		mutex_init(&htlb_fault_mutex_table[i]);
 	return 0;
 }
 module_init(hugetlb_init);
@@ -2767,15 +2789,14 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
 }

 static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
-			unsigned long address, pte_t *ptep, unsigned int flags)
+			struct address_space *mapping, pgoff_t idx,
+			unsigned long address, pte_t *ptep, unsigned int flags)
 {
 	struct hstate *h = hstate_vma(vma);
 	int ret = VM_FAULT_SIGBUS;
 	int anon_rmap = 0;
-	pgoff_t idx;
 	unsigned long size;
 	struct page *page;
-	struct address_space *mapping;
 	pte_t new_pte;
 	spinlock_t *ptl;

@@ -2790,9 +2811,6 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		return ret;
 	}

-	mapping = vma->vm_file->f_mapping;
-	idx = vma_hugecache_offset(h, vma, address);
-
 	/*
 	 * Use page lock to guard against racing truncation
 	 * before we get page_table_lock.
@@ -2902,17 +2920,53 @@ backout_unlocked:
 	goto out;
 }

+#ifdef CONFIG_SMP
+static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+			    struct vm_area_struct *vma,
+			    struct address_space *mapping,
+			    pgoff_t idx, unsigned long address)
+{
+	unsigned long key[2];
+	u32 hash;
+
+	if (vma->vm_flags & VM_SHARED) {
+		key[0] = (unsigned long) mapping;
+		key[1] = idx;
+	} else {
+		key[0] = (unsigned long) mm;
+		key[1] = address >> huge_page_shift(h);
+	}
+
+	hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
+
+	return hash & (num_fault_mutexes - 1);
+}
+#else
+/*
+ * For uniprocessor systems we always use a single mutex, so just
+ * return 0 and avoid the hashing overhead.
+ */
+static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+			    struct vm_area_struct *vma,
+			    struct address_space *mapping,
+			    pgoff_t idx, unsigned long address)
+{
+	return 0;
+}
+#endif
+
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, unsigned int flags)
 {
-	pte_t *ptep;
-	pte_t entry;
+	pte_t *ptep, entry;
 	spinlock_t *ptl;
 	int ret;
+	u32 hash;
+	pgoff_t idx;
 	struct page *page = NULL;
 	struct page *pagecache_page = NULL;
-	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
 	struct hstate *h = hstate_vma(vma);
+	struct address_space *mapping;

 	address &= huge_page_mask(h);

@@ -2931,15 +2985,20 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!ptep)
 		return VM_FAULT_OOM;

+	mapping = vma->vm_file->f_mapping;
+	idx = vma_hugecache_offset(h, vma, address);
+
 	/*
 	 * Serialize hugepage allocation and instantiation, so that we don't
 	 * get spurious allocation failures if two CPUs race to instantiate
 	 * the same page in the page cache.
 	 */
-	mutex_lock(&hugetlb_instantiation_mutex);
+	hash = fault_mutex_hash(h, mm, vma, mapping, idx, address);
+	mutex_lock(&htlb_fault_mutex_table[hash]);
+
 	entry = huge_ptep_get(ptep);
 	if (huge_pte_none(entry)) {
-		ret = hugetlb_no_page(mm, vma, address, ptep, flags);
+		ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
 		goto out_mutex;
 	}

@@ -3008,8 +3067,7 @@ out_ptl:
 		put_page(page);

 out_mutex:
-	mutex_unlock(&hugetlb_instantiation_mutex);
-
+	mutex_unlock(&htlb_fault_mutex_table[hash]);
 	return ret;
 }
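
To make the locking scheme concrete outside the kernel, the following is a minimal
userspace sketch of the same technique: a power-of-two table of mutexes indexed by
a hash of the fault key, so that racing faults on the same page serialize against
each other while faults on different pages proceed in parallel. Everything here is
illustrative: pthread mutexes stand in for kernel struct mutex, a splitmix64-style
mixer stands in for jhash2(), and the names mirror the patch only for readability.

/* Build with: cc -o fault_mutex_sketch fault_mutex_sketch.c -lpthread */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static int num_fault_mutexes;
static pthread_mutex_t *fault_mutex_table;

/* Round up to the next power of two, like the kernel's roundup_pow_of_two(). */
static unsigned int round_up_pow2(unsigned int n)
{
	n--;
	n |= n >> 1; n |= n >> 2; n |= n >> 4; n |= n >> 8; n |= n >> 16;
	return n + 1;
}

/*
 * Hypothetical stand-in for fault_mutex_hash(): mix the two key words
 * (e.g. mapping pointer and page index for a shared mapping) and mask
 * the result into the table. Because the table size is a power of two,
 * the reduction is a cheap AND instead of a modulo.
 */
static uint32_t fault_mutex_hash(uint64_t key0, uint64_t key1)
{
	uint64_t x = key0 ^ (key1 * 0x9e3779b97f4a7c15ULL);

	x ^= x >> 30; x *= 0xbf58476d1ce4e5b9ULL;
	x ^= x >> 27; x *= 0x94d049bb133111ebULL;
	x ^= x >> 31;
	return (uint32_t)x & (num_fault_mutexes - 1);
}

int main(void)
{
	int i, ncpus = 4;	/* stand-in for num_possible_cpus() */

	/* Size and initialize the table, mirroring hugetlb_init() above. */
	num_fault_mutexes = round_up_pow2(8 * ncpus);
	fault_mutex_table = malloc(sizeof(pthread_mutex_t) * num_fault_mutexes);
	if (!fault_mutex_table)
		abort();
	for (i = 0; i < num_fault_mutexes; i++)
		pthread_mutex_init(&fault_mutex_table[i], NULL);

	/* Same (mapping, index) key always lands in the same bucket, so two
	 * racing faults on one page serialize; a different index usually
	 * picks a different bucket and runs concurrently. */
	uint32_t h_page0  = fault_mutex_hash(0x1000, 0);
	uint32_t h_page1  = fault_mutex_hash(0x1000, 1);
	uint32_t h_again  = fault_mutex_hash(0x1000, 0);

	printf("page0 -> %u, page1 -> %u, page0 again -> %u\n",
	       h_page0, h_page1, h_again);

	pthread_mutex_lock(&fault_mutex_table[h_page0]);
	/* ... allocate and instantiate the page here ... */
	pthread_mutex_unlock(&fault_mutex_table[h_page0]);

	free(fault_mutex_table);
	return 0;
}

Note the design choice this sketch shares with the patch: hash collisions are
harmless, since two unrelated faults mapping to the same bucket merely serialize,
which is exactly the (safe) behavior the old single hugetlb_instantiation_mutex
imposed on every fault.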