@@ -42,6 +42,9 @@
 #define DAX_WAIT_TABLE_BITS 12
 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
 
+/* The 'colour' (ie low bits) within a PMD of a page offset. */
+#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
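+/* e.g. with 4k pages and 2MiB PMDs this is 0x1ff: a PMD spans 512 pages */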
+
 static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
 
 static int __init init_dax_wait_table(void)
@@ -54,6 +57,40 @@ static int __init init_dax_wait_table(void)
 }
 fs_initcall(init_dax_wait_table);
 
+/*
+ * We use the lowest available bit in the exceptional entry for locking, one
+ * bit for the entry size (PMD) and two more to tell us if the entry is a
+ * zero page or an empty entry that is just used for locking. In total four
+ * special bits.
+ *
+ * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
+ * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
+ * block allocation.
+ */
+#define RADIX_DAX_SHIFT		(RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
+#define RADIX_DAX_ENTRY_LOCK	(1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
+#define RADIX_DAX_PMD		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
+#define RADIX_DAX_ZERO_PAGE	(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
+#define RADIX_DAX_EMPTY		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
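+
+/*
+ * Resulting layout, assuming RADIX_TREE_EXCEPTIONAL_SHIFT == 2: bit 1 marks
+ * the entry exceptional, bit 2 is the lock bit, bits 3-5 are the
+ * PMD/ZERO_PAGE/EMPTY flags, and the sector is stored from bit 6 upwards.
+ */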
+
+static unsigned long dax_radix_sector(void *entry)
+{
+	return (unsigned long)entry >> RADIX_DAX_SHIFT;
+}
+
+static void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
+{
+	return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
+			((unsigned long)sector << RADIX_DAX_SHIFT) |
+			RADIX_DAX_ENTRY_LOCK);
+}
+
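+/*
+ * Radix tree order of the entry: 0 for PTE entries; for PMD entries it is
+ * PMD_SHIFT - PAGE_SHIFT, e.g. 9 (512 pages) with 4k pages and 2MiB PMDs.
+ */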
+static unsigned int dax_radix_order(void *entry)
+{
+	if ((unsigned long)entry & RADIX_DAX_PMD)
+		return PMD_SHIFT - PAGE_SHIFT;
+	return 0;
+}
+
 static int dax_is_pmd_entry(void *entry)
 {
 	return (unsigned long)entry & RADIX_DAX_PMD;
@@ -66,7 +103,7 @@ static int dax_is_pte_entry(void *entry)
 
 static int dax_is_zero_entry(void *entry)
 {
-	return (unsigned long)entry & RADIX_DAX_HZP;
+	return (unsigned long)entry & RADIX_DAX_ZERO_PAGE;
 }
 
 static int dax_is_empty_entry(void *entry)
@@ -98,7 +135,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
 	 * the range covered by the PMD map to the same bit lock.
 	 */
 	if (dax_is_pmd_entry(entry))
-		index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);
+		index &= ~PG_PMD_COLOUR;
 
 	key->mapping = mapping;
 	key->entry_start = index;
@@ -120,6 +157,31 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode,
 	return autoremove_wake_function(wait, mode, sync, NULL);
 }
 
+/*
+ * We do not necessarily hold the mapping->tree_lock when we call this
+ * function so it is possible that 'entry' is no longer a valid item in the
+ * radix tree. This is okay because all we really need to do is to find the
+ * correct waitqueue where tasks might be waiting for that old 'entry' and
+ * wake them.
+ */
+static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
+		pgoff_t index, void *entry, bool wake_all)
+{
+	struct exceptional_entry_key key;
+	wait_queue_head_t *wq;
+
+	wq = dax_entry_waitqueue(mapping, index, entry, &key);
+
+	/*
+	 * Checking for a locked entry and prepare_to_wait_exclusive() happen
+	 * under mapping->tree_lock, ditto for entry handling in our callers.
+	 * So at this point all tasks that could have seen our entry locked
+	 * must be in the waitqueue and the following check will see them.
+	 */
+	if (waitqueue_active(wq))
+		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
+}
+
 /*
  * Check whether the given slot is locked. The function must be called with
  * mapping->tree_lock held
@@ -181,7 +243,8 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
 	for (;;) {
 		entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
 					  &slot);
-		if (!entry || !radix_tree_exceptional_entry(entry) ||
+		if (!entry ||
+		    WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
 		    !slot_locked(mapping, slot)) {
 			if (slotp)
 				*slotp = slot;
@@ -216,14 +279,9 @@ static void dax_unlock_mapping_entry(struct address_space *mapping,
 }
 
 static void put_locked_mapping_entry(struct address_space *mapping,
-		pgoff_t index, void *entry)
+		pgoff_t index)
 {
-	if (!radix_tree_exceptional_entry(entry)) {
-		unlock_page(entry);
-		put_page(entry);
-	} else {
-		dax_unlock_mapping_entry(mapping, index);
-	}
+	dax_unlock_mapping_entry(mapping, index);
 }
 
 /*
@@ -233,7 +291,7 @@ static void put_locked_mapping_entry(struct address_space *mapping,
 static void put_unlocked_mapping_entry(struct address_space *mapping,
 					pgoff_t index, void *entry)
 {
-	if (!radix_tree_exceptional_entry(entry))
+	if (!entry)
 		return;
 
 	/* We have to wake up next waiter for the radix tree entry lock */
@@ -241,15 +299,15 @@ static void put_unlocked_mapping_entry(struct address_space *mapping,
 }
 
 /*
- * Find radix tree entry at given index. If it points to a page, return with
- * the page locked. If it points to the exceptional entry, return with the
- * radix tree entry locked. If the radix tree doesn't contain given index,
- * create empty exceptional entry for the index and return with it locked.
+ * Find radix tree entry at given index. If it points to an exceptional entry,
+ * return it with the radix tree entry locked. If the radix tree doesn't
+ * contain given index, create an empty exceptional entry for the index and
+ * return with it locked.
  *
  * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
  * either return that locked entry or will return an error. This error will
- * happen if there are any 4k entries (either zero pages or DAX entries)
- * within the 2MiB range that we are requesting.
+ * happen if there are any 4k entries within the 2MiB range that we are
+ * requesting.
  *
  * We always favor 4k entries over 2MiB entries. There isn't a flow where we
  * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB
@@ -276,18 +334,21 @@ restart:
 	spin_lock_irq(&mapping->tree_lock);
 	entry = get_unlocked_mapping_entry(mapping, index, &slot);
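 
+	/*
+	 * Hole pages are gone from this tree; anything that is not an
+	 * exceptional entry here is a bug.
+	 */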
+	if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
+		entry = ERR_PTR(-EIO);
+		goto out_unlock;
+	}
+
 	if (entry) {
 		if (size_flag & RADIX_DAX_PMD) {
-			if (!radix_tree_exceptional_entry(entry) ||
-			    dax_is_pte_entry(entry)) {
+			if (dax_is_pte_entry(entry)) {
 				put_unlocked_mapping_entry(mapping, index,
 						entry);
 				entry = ERR_PTR(-EEXIST);
 				goto out_unlock;
 			}
 		} else { /* trying to grab a PTE entry */
-			if (radix_tree_exceptional_entry(entry) &&
-			    dax_is_pmd_entry(entry) &&
+			if (dax_is_pmd_entry(entry) &&
 			    (dax_is_zero_entry(entry) ||
 			     dax_is_empty_entry(entry))) {
 				pmd_downgrade = true;
@@ -321,7 +382,7 @@ restart:
 				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
 		if (err) {
 			if (pmd_downgrade)
-				put_locked_mapping_entry(mapping, index, entry);
+				put_locked_mapping_entry(mapping, index);
 			return ERR_PTR(err);
 		}
 		spin_lock_irq(&mapping->tree_lock);
@@ -371,52 +432,12 @@ restart:
 		spin_unlock_irq(&mapping->tree_lock);
 		return entry;
 	}
-	/* Normal page in radix tree? */
-	if (!radix_tree_exceptional_entry(entry)) {
-		struct page *page = entry;
-
-		get_page(page);
-		spin_unlock_irq(&mapping->tree_lock);
-		lock_page(page);
-		/* Page got truncated? Retry... */
-		if (unlikely(page->mapping != mapping)) {
-			unlock_page(page);
-			put_page(page);
-			goto restart;
-		}
-		return page;
-	}
 	entry = lock_slot(mapping, slot);
 out_unlock:
 	spin_unlock_irq(&mapping->tree_lock);
 	return entry;
 }
 
-/*
- * We do not necessarily hold the mapping->tree_lock when we call this
- * function so it is possible that 'entry' is no longer a valid item in the
- * radix tree. This is okay because all we really need to do is to find the
- * correct waitqueue where tasks might be waiting for that old 'entry' and
- * wake them.
- */
-void dax_wake_mapping_entry_waiter(struct address_space *mapping,
-		pgoff_t index, void *entry, bool wake_all)
-{
-	struct exceptional_entry_key key;
-	wait_queue_head_t *wq;
-
-	wq = dax_entry_waitqueue(mapping, index, entry, &key);
-
-	/*
-	 * Checking for locked entry and prepare_to_wait_exclusive() happens
-	 * under mapping->tree_lock, ditto for entry handling in our callers.
-	 * So at this point all tasks that could have seen our entry locked
-	 * must be in the waitqueue and the following check will see them.
-	 */
-	if (waitqueue_active(wq))
-		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
-}
-
 static int __dax_invalidate_mapping_entry(struct address_space *mapping,
 					  pgoff_t index, bool trunc)
 {
@@ -426,7 +447,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping,
 
 	spin_lock_irq(&mapping->tree_lock);
 	entry = get_unlocked_mapping_entry(mapping, index, NULL);
-	if (!entry || !radix_tree_exceptional_entry(entry))
+	if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
 		goto out;
 	if (!trunc &&
 	    (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
@@ -468,50 +489,6 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
 	return __dax_invalidate_mapping_entry(mapping, index, false);
 }
 
-/*
- * The user has performed a load from a hole in the file. Allocating
- * a new page in the file would cause excessive storage usage for
- * workloads with sparse files. We allocate a page cache page instead.
- * We'll kick it out of the page cache if it's ever written to,
- * otherwise it will simply fall out of the page cache under memory
- * pressure without ever having been dirtied.
- */
-static int dax_load_hole(struct address_space *mapping, void **entry,
-			 struct vm_fault *vmf)
-{
-	struct inode *inode = mapping->host;
-	struct page *page;
-	int ret;
-
-	/* Hole page already exists? Return it... */
-	if (!radix_tree_exceptional_entry(*entry)) {
-		page = *entry;
-		goto finish_fault;
-	}
-
-	/* This will replace locked radix tree entry with a hole page */
-	page = find_or_create_page(mapping, vmf->pgoff,
-				   vmf->gfp_mask | __GFP_ZERO);
-	if (!page) {
-		ret = VM_FAULT_OOM;
-		goto out;
-	}
-
-finish_fault:
-	vmf->page = page;
-	ret = finish_fault(vmf);
-	vmf->page = NULL;
-	*entry = page;
-	if (!ret) {
-		/* Grab reference for PTE that is now referencing the page */
-		get_page(page);
-		ret = VM_FAULT_NOPAGE;
-	}
-out:
-	trace_dax_load_hole(inode, vmf, ret);
-	return ret;
-}
-
 static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
 		sector_t sector, size_t size, struct page *to,
 		unsigned long vaddr)
@@ -552,47 +529,27 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
 		unsigned long flags)
 {
 	struct radix_tree_root *page_tree = &mapping->page_tree;
-	int error = 0;
-	bool hole_fill = false;
 	void *new_entry;
 	pgoff_t index = vmf->pgoff;
 
 	if (vmf->flags & FAULT_FLAG_WRITE)
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 
-	/* Replacing hole page with block mapping? */
-	if (!radix_tree_exceptional_entry(entry)) {
-		hole_fill = true;
-		/*
-		 * Unmap the page now before we remove it from page cache below.
-		 * The page is locked so it cannot be faulted in again.
-		 */
-		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
-				    PAGE_SIZE, 0);
-		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
-		if (error)
-			return ERR_PTR(error);
-	} else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) {
-		/* replacing huge zero page with PMD block mapping */
-		unmap_mapping_range(mapping,
-			(vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
+	if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
+		/* we are replacing a zero page with block mapping */
+		if (dax_is_pmd_entry(entry))
+			unmap_mapping_range(mapping,
+					(vmf->pgoff << PAGE_SHIFT) & PMD_MASK,
+					PMD_SIZE, 0);
+		else /* pte entry */
+			unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
+					PAGE_SIZE, 0);
 	}
 
 	spin_lock_irq(&mapping->tree_lock);
 	new_entry = dax_radix_locked_entry(sector, flags);
 
-	if (hole_fill) {
-		__delete_from_page_cache(entry, NULL);
-		/* Drop pagecache reference */
-		put_page(entry);
-		error = __radix_tree_insert(page_tree, index,
-				dax_radix_order(new_entry), new_entry);
-		if (error) {
-			new_entry = ERR_PTR(error);
-			goto unlock;
-		}
-		mapping->nrexceptional++;
-	} else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+	if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
 		/*
 		 * Only swap our new entry into the radix tree if the current
 		 * entry is a zero page or an empty entry. If a normal PTE or
@@ -609,23 +566,14 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
 		WARN_ON_ONCE(ret != entry);
 		__radix_tree_replace(page_tree, node, slot,
 				     new_entry, NULL, NULL);
+		entry = new_entry;
 	}
+
 	if (vmf->flags & FAULT_FLAG_WRITE)
 		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
- unlock:
+
 	spin_unlock_irq(&mapping->tree_lock);
-	if (hole_fill) {
-		radix_tree_preload_end();
-		/*
-		 * We don't need hole page anymore, it has been replaced with
-		 * locked radix tree entry now.
-		 */
-		if (mapping->a_ops->freepage)
-			mapping->a_ops->freepage(entry);
-		unlock_page(entry);
-		put_page(entry);
-	}
-	return new_entry;
+	return entry;
 }
 
 static inline unsigned long
@@ -727,7 +675,7 @@ static int dax_writeback_one(struct block_device *bdev,
 	spin_lock_irq(&mapping->tree_lock);
 	entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
 	/* Entry got punched out / reallocated? */
-	if (!entry2 || !radix_tree_exceptional_entry(entry2))
+	if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
 		goto put_unlocked;
 	/*
 	 * Entry got reallocated elsewhere? No need to writeback. We have to
@@ -799,7 +747,7 @@ static int dax_writeback_one(struct block_device *bdev,
 	trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
 dax_unlock:
 	dax_read_unlock(id);
-	put_locked_mapping_entry(mapping, index, entry);
+	put_locked_mapping_entry(mapping, index);
 	return ret;
 
 put_unlocked:
@@ -874,11 +822,10 @@ EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
 static int dax_insert_mapping(struct address_space *mapping,
 		struct block_device *bdev, struct dax_device *dax_dev,
-		sector_t sector, size_t size, void **entryp,
+		sector_t sector, size_t size, void *entry,
 		struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	unsigned long vaddr = vmf->address;
-	void *entry = *entryp;
 	void *ret, *kaddr;
 	pgoff_t pgoff;
 	int id, rc;
@@ -899,47 +846,48 @@ static int dax_insert_mapping(struct address_space *mapping,
 	ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
 	if (IS_ERR(ret))
 		return PTR_ERR(ret);
-	*entryp = ret;
 
 	trace_dax_insert_mapping(mapping->host, vmf, ret);
-	return vm_insert_mixed(vma, vaddr, pfn);
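+	/*
+	 * With dax_pfn_mkwrite() gone, a write fault must install a
+	 * writeable, dirty PTE in one step; reads keep vm_insert_mixed().
+	 */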
+	if (vmf->flags & FAULT_FLAG_WRITE)
+		return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
+	else
+		return vm_insert_mixed(vma, vaddr, pfn);
 }
 
-/**
- * dax_pfn_mkwrite - handle first write to DAX page
- * @vmf: The description of the fault
+/*
+ * The user has performed a load from a hole in the file. Allocating a new
+ * page in the file would cause excessive storage usage for workloads with
+ * sparse files. Instead we insert a read-only mapping of the 4k zero page.
+ * If this page is ever written to we will re-fault and change the mapping to
+ * point to real DAX storage instead.
  */
-int dax_pfn_mkwrite(struct vm_fault *vmf)
+static int dax_load_hole(struct address_space *mapping, void *entry,
+			 struct vm_fault *vmf)
 {
-	struct file *file = vmf->vma->vm_file;
-	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
-	void *entry, **slot;
-	pgoff_t index = vmf->pgoff;
+	unsigned long vaddr = vmf->address;
+	int ret = VM_FAULT_NOPAGE;
+	struct page *zero_page;
+	void *entry2;
 
-	spin_lock_irq(&mapping->tree_lock);
-	entry = get_unlocked_mapping_entry(mapping, index, &slot);
-	if (!entry || !radix_tree_exceptional_entry(entry)) {
-		if (entry)
-			put_unlocked_mapping_entry(mapping, index, entry);
-		spin_unlock_irq(&mapping->tree_lock);
-		trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE);
-		return VM_FAULT_NOPAGE;
+	zero_page = ZERO_PAGE(0);
+	if (unlikely(!zero_page)) {
+		ret = VM_FAULT_OOM;
+		goto out;
 	}
-	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
-	entry = lock_slot(mapping, slot);
-	spin_unlock_irq(&mapping->tree_lock);
-	/*
-	 * If we race with somebody updating the PTE and finish_mkwrite_fault()
-	 * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
-	 * the fault in either case.
-	 */
-	finish_mkwrite_fault(vmf);
-	put_locked_mapping_entry(mapping, index, entry);
-	trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE);
-	return VM_FAULT_NOPAGE;
+
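+	/*
+	 * Record the hole as a zero page entry; a later write fault will
+	 * replace it with a real block mapping in dax_insert_mapping_entry().
+	 */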
+	entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
+			RADIX_DAX_ZERO_PAGE);
+	if (IS_ERR(entry2)) {
+		ret = VM_FAULT_SIGBUS;
+		goto out;
+	}
+
+	vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page));
+out:
+	trace_dax_load_hole(inode, vmf, ret);
+	return ret;
 }
-EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
 
 static bool dax_range_is_aligned(struct block_device *bdev,
 				 unsigned int offset, unsigned int length)
@@ -1059,6 +1007,11 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		if (map_len > end - pos)
 			map_len = end - pos;
 
+		/*
+		 * The userspace address for the memory copy has already been
+		 * validated via access_ok() in either vfs_read() or
+		 * vfs_write(), depending on which operation we are doing.
+		 */
 		if (iov_iter_rw(iter) == WRITE)
 			map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr,
 					map_len, iter);
@@ -1223,7 +1176,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 			major = VM_FAULT_MAJOR;
 		}
 		error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
-				sector, PAGE_SIZE, &entry, vmf->vma, vmf);
+				sector, PAGE_SIZE, entry, vmf->vma, vmf);
 		/* -EBUSY is fine, somebody else faulted on the same PTE */
 		if (error == -EBUSY)
 			error = 0;
@@ -1231,7 +1184,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
-			vmf_ret = dax_load_hole(mapping, &entry, vmf);
+			vmf_ret = dax_load_hole(mapping, entry, vmf);
 			goto finish_iomap;
 		}
 		/*FALLTHRU*/
@@ -1258,21 +1211,15 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
 	}
 unlock_entry:
-	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+	put_locked_mapping_entry(mapping, vmf->pgoff);
 out:
 	trace_dax_pte_fault_done(inode, vmf, vmf_ret);
 	return vmf_ret;
 }
 
 #ifdef CONFIG_FS_DAX_PMD
-/*
- * The 'colour' (ie low bits) within a PMD of a page offset. This comes up
- * more often than one might expect in the below functions.
- */
-#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
-
 static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
-		loff_t pos, void **entryp)
+		loff_t pos, void *entry)
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 	const sector_t sector = dax_iomap_sector(iomap, pos);
@@ -1283,7 +1230,7 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
 	void *ret = NULL, *kaddr;
 	long length = 0;
 	pgoff_t pgoff;
-	pfn_t pfn;
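+	/* zero-init: pfn is passed to a tracepoint even on fallback paths */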
+	pfn_t pfn = {};
 	int id;
 
 	if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
@@ -1303,11 +1250,10 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
 		goto unlock_fallback;
 	dax_read_unlock(id);
 
-	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector,
+	ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,
 			RADIX_DAX_PMD);
 	if (IS_ERR(ret))
 		goto fallback;
-	*entryp = ret;
 
 	trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
 	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
@@ -1321,7 +1267,7 @@ fallback:
 }
 
 static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
-		void **entryp)
+		void *entry)
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 	unsigned long pmd_addr = vmf->address & PMD_MASK;
@@ -1336,11 +1282,10 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
 	if (unlikely(!zero_page))
 		goto fallback;
 
-	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
-			RADIX_DAX_PMD | RADIX_DAX_HZP);
+	ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
+			RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE);
 	if (IS_ERR(ret))
 		goto fallback;
-	*entryp = ret;
 
 	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
 	if (!pmd_none(*(vmf->pmd))) {
@@ -1416,10 +1361,10 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 		goto fallback;
 
 	/*
-	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
-	 * PMD or a HZP entry. If it can't (because a 4k page is already in
-	 * the tree, for instance), it will return -EEXIST and we just fall
-	 * back to 4k entries.
+	 * grab_mapping_entry() will make sure we get a 2MiB empty entry, a
+	 * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page
+	 * is already in the tree, for instance), it will return -EEXIST and
+	 * we just fall back to 4k entries.
 	 */
 	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
 	if (IS_ERR(entry))
@@ -1452,13 +1397,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 
 	switch (iomap.type) {
 	case IOMAP_MAPPED:
-		result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry);
+		result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry);
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (WARN_ON_ONCE(write))
 			break;
-		result = dax_pmd_load_hole(vmf, &iomap, &entry);
+		result = dax_pmd_load_hole(vmf, &iomap, entry);
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -1481,7 +1426,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 				&iomap);
 	}
 unlock_entry:
-	put_locked_mapping_entry(mapping, pgoff, entry);
+	put_locked_mapping_entry(mapping, pgoff);
 fallback:
 	if (result == VM_FAULT_FALLBACK) {
 		split_huge_pmd(vma, vmf->pmd, vmf->address);