@@ -14,6 +14,8 @@
 #include <linux/swapops.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/mmu_notifier.h>
+#include <linux/hugetlb.h>
+#include <linux/pagemap.h>
 #include <asm/tlbflush.h>
 #include "internal.h"
 
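As a hedged aside, not part of the patch itself: the sketch below shows how the __mcopy_atomic_hugetlb() path added in the next hunk would ultimately be reached from userspace, by issuing UFFDIO_COPY against a huge-page aligned range of an anonymous MAP_HUGETLB mapping registered with userfaultfd. It assumes the rest of this series (hugetlb registration and hugetlb_mcopy_atomic_pte()) is applied, a 2MB default huge page size with free huge pages available, and it omits most error handling.

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#define HPAGE_SIZE	(2UL * 1024 * 1024)	/* assumed default huge page size */

int main(void)
{
	struct uffdio_api api = { .api = UFFD_API };
	struct uffdio_register reg;
	struct uffdio_copy copy;
	char *dst, *src;
	int uffd;

	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	ioctl(uffd, UFFDIO_API, &api);

	/* Destination: private anonymous hugetlb vma (MAP_HUGETLB). */
	dst = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	/* Register the range for missing-page tracking. */
	reg.range.start = (unsigned long)dst;
	reg.range.len = HPAGE_SIZE;
	reg.mode = UFFDIO_REGISTER_MODE_MISSING;
	ioctl(uffd, UFFDIO_REGISTER, &reg);

	/* Source: ordinary anonymous memory holding the payload. */
	src = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	memset(src, 0xab, HPAGE_SIZE);

	/*
	 * dst and len must be huge page aligned, matching the alignment
	 * checks in __mcopy_atomic_hugetlb().
	 */
	copy.dst = (unsigned long)dst;
	copy.src = (unsigned long)src;
	copy.len = HPAGE_SIZE;
	copy.mode = 0;
	if (ioctl(uffd, UFFDIO_COPY, &copy))
		perror("UFFDIO_COPY");
	else
		printf("copied %lld bytes, dst[0] = 0x%x\n",
		       (long long)copy.copy, (unsigned char)dst[0]);
	return 0;
}

On success copy.copy reports the number of bytes installed and the whole huge page is readable without further faults.
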
@@ -139,6 +141,183 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
 	return pmd;
 }
 
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * __mcopy_atomic processing for HUGETLB vmas. Note that this routine is
+ * called with mmap_sem held; it will release mmap_sem before returning.
+ */
+static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
+					      struct vm_area_struct *dst_vma,
+					      unsigned long dst_start,
+					      unsigned long src_start,
+					      unsigned long len,
+					      bool zeropage)
+{
+	ssize_t err;
+	pte_t *dst_pte;
+	unsigned long src_addr, dst_addr;
+	long copied;
+	struct page *page;
+	struct hstate *h;
+	unsigned long vma_hpagesize;
+	pgoff_t idx;
+	u32 hash;
+	struct address_space *mapping;
+
+	/*
+	 * There is no default zero huge page for all huge page sizes as
+	 * supported by hugetlb. A PMD_SIZE huge page may exist, as used
+	 * by THP. Since we cannot reliably insert a zero page, this
+	 * feature is not supported.
+	 */
+	if (zeropage) {
+		up_read(&dst_mm->mmap_sem);
+		return -EINVAL;
+	}
+
+	src_addr = src_start;
+	dst_addr = dst_start;
+	copied = 0;
+	page = NULL;
+	vma_hpagesize = vma_kernel_pagesize(dst_vma);
+
+	/*
+	 * Validate alignment based on huge page size
+	 */
+	err = -EINVAL;
+	if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
+		goto out_unlock;
+
+retry:
+	/*
+	 * On routine entry dst_vma is set. If we had to drop mmap_sem and
+	 * retry, dst_vma will be set to NULL and we must look it up again.
+	 */
+	if (!dst_vma) {
+		err = -EINVAL;
+		dst_vma = find_vma(dst_mm, dst_start);
+		if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
+			goto out_unlock;
+
+		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
+			goto out_unlock;
+
+		/*
+		 * Make sure the vma is not shared, and that the remaining
+		 * dst range is both valid and fully within a single existing vma.
+		 */
+		if (dst_vma->vm_flags & VM_SHARED)
+			goto out_unlock;
+		if (dst_start < dst_vma->vm_start ||
+		    dst_start + len > dst_vma->vm_end)
+			goto out_unlock;
+	}
+
+	if (WARN_ON(dst_addr & (vma_hpagesize - 1) ||
+		    (len - copied) & (vma_hpagesize - 1)))
+		goto out_unlock;
+
+	/*
+	 * Only allow __mcopy_atomic_hugetlb on userfaultfd registered ranges.
+	 */
+	if (!dst_vma->vm_userfaultfd_ctx.ctx)
+		goto out_unlock;
+
+	/*
+	 * Ensure the dst_vma has an anon_vma.
+	 */
+	err = -ENOMEM;
+	if (unlikely(anon_vma_prepare(dst_vma)))
+		goto out_unlock;
+
+	h = hstate_vma(dst_vma);
+
+	while (src_addr < src_start + len) {
+		pte_t dst_pteval;
+
+		BUG_ON(dst_addr >= dst_start + len);
+		VM_BUG_ON(dst_addr & ~huge_page_mask(h));
+
+		/*
+		 * Serialize via hugetlb_fault_mutex
+		 */
+		idx = linear_page_index(dst_vma, dst_addr);
+		mapping = dst_vma->vm_file->f_mapping;
+		hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
+						idx, dst_addr);
+		mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+		err = -ENOMEM;
+		dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
+		if (!dst_pte) {
+			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+			goto out_unlock;
+		}
+
+		err = -EEXIST;
+		dst_pteval = huge_ptep_get(dst_pte);
+		if (!huge_pte_none(dst_pteval)) {
+			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+			goto out_unlock;
+		}
+
+		err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
+					       dst_addr, src_addr, &page);
+
+		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
+		cond_resched();
+
+		if (unlikely(err == -EFAULT)) {
+			up_read(&dst_mm->mmap_sem);
+			BUG_ON(!page);
+
+			err = copy_huge_page_from_user(page,
+						(const void __user *)src_addr,
+						pages_per_huge_page(h));
+			if (unlikely(err)) {
+				err = -EFAULT;
+				goto out;
+			}
+			down_read(&dst_mm->mmap_sem);
+
+			dst_vma = NULL;
+			goto retry;
+		} else
+			BUG_ON(page);
+
+		if (!err) {
+			dst_addr += vma_hpagesize;
+			src_addr += vma_hpagesize;
+			copied += vma_hpagesize;
+
+			if (fatal_signal_pending(current))
+				err = -EINTR;
+		}
+		if (err)
+			break;
+	}
+
+out_unlock:
+	up_read(&dst_mm->mmap_sem);
+out:
+	if (page)
+		put_page(page);
+	BUG_ON(copied < 0);
+	BUG_ON(err > 0);
+	BUG_ON(!copied && !err);
+	return copied ? copied : err;
+}
+#else /* !CONFIG_HUGETLB_PAGE */
+/* fail at build time if gcc attempts to use this */
+extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
+				      struct vm_area_struct *dst_vma,
+				      unsigned long dst_start,
+				      unsigned long src_start,
+				      unsigned long len,
+				      bool zeropage);
+#endif /* CONFIG_HUGETLB_PAGE */
+
 static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 					      unsigned long dst_start,
 					      unsigned long src_start,
@@ -181,6 +360,13 @@ retry:
 	    dst_start + len > dst_vma->vm_end)
 		goto out_unlock;
 
+	/*
+	 * If this is a HUGETLB vma, pass off to appropriate routine
+	 */
+	if (is_vm_hugetlb_page(dst_vma))
+		return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
+					      src_start, len, zeropage);
+
 	/*
 	 * Be strict and only allow __mcopy_atomic on userfaultfd
 	 * registered ranges to prevent userland errors going