dax.c 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288
  1. /*
  2. * fs/dax.c - Direct Access filesystem code
  3. * Copyright (c) 2013-2014 Intel Corporation
  4. * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
  5. * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
  6. *
  7. * This program is free software; you can redistribute it and/or modify it
  8. * under the terms and conditions of the GNU General Public License,
  9. * version 2, as published by the Free Software Foundation.
  10. *
  11. * This program is distributed in the hope it will be useful, but WITHOUT
  12. * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13. * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  14. * more details.
  15. */
  16. #include <linux/atomic.h>
  17. #include <linux/blkdev.h>
  18. #include <linux/buffer_head.h>
  19. #include <linux/dax.h>
  20. #include <linux/fs.h>
  21. #include <linux/genhd.h>
  22. #include <linux/highmem.h>
  23. #include <linux/memcontrol.h>
  24. #include <linux/mm.h>
  25. #include <linux/mutex.h>
  26. #include <linux/pagevec.h>
  27. #include <linux/pmem.h>
  28. #include <linux/sched.h>
  29. #include <linux/uio.h>
  30. #include <linux/vmstat.h>
  31. #include <linux/pfn_t.h>
  32. #include <linux/sizes.h>
  /*
   * We use lowest available bit in exceptional entry for locking, other two
   * bits to determine entry type. In total 3 special bits.
   *
   * RADIX_DAX_TYPE()   - extract the PTE/PMD type bits of an entry
   * RADIX_DAX_SECTOR() - extract the sector stored above the special bits
   * RADIX_DAX_ENTRY()  - build an exceptional entry from a sector and a
   *                      "is this a PMD entry" flag
   *
   * Every macro argument is parenthesized so that callers may pass arbitrary
   * expressions (e.g. "a ? b : c") without operator-precedence surprises.
   */
  #define RADIX_DAX_SHIFT	(RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
  #define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
  #define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
  #define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD)
  #define RADIX_DAX_TYPE(entry) ((unsigned long)(entry) & RADIX_DAX_TYPE_MASK)
  #define RADIX_DAX_SECTOR(entry) (((unsigned long)(entry) >> RADIX_DAX_SHIFT))
  #define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)(sector) << \
  		RADIX_DAX_SHIFT | ((pmd) ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
  		RADIX_TREE_EXCEPTIONAL_ENTRY))
  46. /* We choose 4096 entries - same as per-zone page wait tables */
  47. #define DAX_WAIT_TABLE_BITS 12
  48. #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
  49. wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
  50. static int __init init_dax_wait_table(void)
  51. {
  52. int i;
  53. for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
  54. init_waitqueue_head(wait_table + i);
  55. return 0;
  56. }
  57. fs_initcall(init_dax_wait_table);
  58. static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
  59. pgoff_t index)
  60. {
  61. unsigned long hash = hash_long((unsigned long)mapping ^ index,
  62. DAX_WAIT_TABLE_BITS);
  63. return wait_table + hash;
  64. }
  65. static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
  66. {
  67. struct request_queue *q = bdev->bd_queue;
  68. long rc = -EIO;
  69. dax->addr = (void __pmem *) ERR_PTR(-EIO);
  70. if (blk_queue_enter(q, true) != 0)
  71. return rc;
  72. rc = bdev_direct_access(bdev, dax);
  73. if (rc < 0) {
  74. dax->addr = (void __pmem *) ERR_PTR(rc);
  75. blk_queue_exit(q);
  76. return rc;
  77. }
  78. return rc;
  79. }
  80. static void dax_unmap_atomic(struct block_device *bdev,
  81. const struct blk_dax_ctl *dax)
  82. {
  83. if (IS_ERR(dax->addr))
  84. return;
  85. blk_queue_exit(bdev->bd_queue);
  86. }
  87. struct page *read_dax_sector(struct block_device *bdev, sector_t n)
  88. {
  89. struct page *page = alloc_pages(GFP_KERNEL, 0);
  90. struct blk_dax_ctl dax = {
  91. .size = PAGE_SIZE,
  92. .sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
  93. };
  94. long rc;
  95. if (!page)
  96. return ERR_PTR(-ENOMEM);
  97. rc = dax_map_atomic(bdev, &dax);
  98. if (rc < 0)
  99. return ERR_PTR(rc);
  100. memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
  101. dax_unmap_atomic(bdev, &dax);
  102. return page;
  103. }
  104. static bool buffer_written(struct buffer_head *bh)
  105. {
  106. return buffer_mapped(bh) && !buffer_unwritten(bh);
  107. }
  108. /*
  109. * When ext4 encounters a hole, it returns without modifying the buffer_head
  110. * which means that we can't trust b_size. To cope with this, we set b_state
  111. * to 0 before calling get_block and, if any bit is set, we know we can trust
  112. * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is
  113. * and would save us time calling get_block repeatedly.
  114. */
  115. static bool buffer_size_valid(struct buffer_head *bh)
  116. {
  117. return bh->b_state != 0;
  118. }
  119. static sector_t to_sector(const struct buffer_head *bh,
  120. const struct inode *inode)
  121. {
  122. sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
  123. return sector;
  124. }
  125. static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
  126. loff_t start, loff_t end, get_block_t get_block,
  127. struct buffer_head *bh)
  128. {
  129. loff_t pos = start, max = start, bh_max = start;
  130. bool hole = false, need_wmb = false;
  131. struct block_device *bdev = NULL;
  132. int rw = iov_iter_rw(iter), rc;
  133. long map_len = 0;
  134. struct blk_dax_ctl dax = {
  135. .addr = (void __pmem *) ERR_PTR(-EIO),
  136. };
  137. unsigned blkbits = inode->i_blkbits;
  138. sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
  139. >> blkbits;
  140. if (rw == READ)
  141. end = min(end, i_size_read(inode));
  142. while (pos < end) {
  143. size_t len;
  144. if (pos == max) {
  145. long page = pos >> PAGE_SHIFT;
  146. sector_t block = page << (PAGE_SHIFT - blkbits);
  147. unsigned first = pos - (block << blkbits);
  148. long size;
  149. if (pos == bh_max) {
  150. bh->b_size = PAGE_ALIGN(end - pos);
  151. bh->b_state = 0;
  152. rc = get_block(inode, block, bh, rw == WRITE);
  153. if (rc)
  154. break;
  155. if (!buffer_size_valid(bh))
  156. bh->b_size = 1 << blkbits;
  157. bh_max = pos - first + bh->b_size;
  158. bdev = bh->b_bdev;
  159. /*
  160. * We allow uninitialized buffers for writes
  161. * beyond EOF as those cannot race with faults
  162. */
  163. WARN_ON_ONCE(
  164. (buffer_new(bh) && block < file_blks) ||
  165. (rw == WRITE && buffer_unwritten(bh)));
  166. } else {
  167. unsigned done = bh->b_size -
  168. (bh_max - (pos - first));
  169. bh->b_blocknr += done >> blkbits;
  170. bh->b_size -= done;
  171. }
  172. hole = rw == READ && !buffer_written(bh);
  173. if (hole) {
  174. size = bh->b_size - first;
  175. } else {
  176. dax_unmap_atomic(bdev, &dax);
  177. dax.sector = to_sector(bh, inode);
  178. dax.size = bh->b_size;
  179. map_len = dax_map_atomic(bdev, &dax);
  180. if (map_len < 0) {
  181. rc = map_len;
  182. break;
  183. }
  184. dax.addr += first;
  185. size = map_len - first;
  186. }
  187. max = min(pos + size, end);
  188. }
  189. if (iov_iter_rw(iter) == WRITE) {
  190. len = copy_from_iter_pmem(dax.addr, max - pos, iter);
  191. need_wmb = true;
  192. } else if (!hole)
  193. len = copy_to_iter((void __force *) dax.addr, max - pos,
  194. iter);
  195. else
  196. len = iov_iter_zero(max - pos, iter);
  197. if (!len) {
  198. rc = -EFAULT;
  199. break;
  200. }
  201. pos += len;
  202. if (!IS_ERR(dax.addr))
  203. dax.addr += len;
  204. }
  205. if (need_wmb)
  206. wmb_pmem();
  207. dax_unmap_atomic(bdev, &dax);
  208. return (pos == start) ? rc : pos - start;
  209. }
  210. /**
  211. * dax_do_io - Perform I/O to a DAX file
  212. * @iocb: The control block for this I/O
  213. * @inode: The file which the I/O is directed at
  214. * @iter: The addresses to do I/O from or to
  215. * @get_block: The filesystem method used to translate file offsets to blocks
  216. * @end_io: A filesystem callback for I/O completion
  217. * @flags: See below
  218. *
  219. * This function uses the same locking scheme as do_blockdev_direct_IO:
  220. * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
  221. * caller for writes. For reads, we take and release the i_mutex ourselves.
  222. * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
  223. * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
  224. * is in progress.
  225. */
  226. ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
  227. struct iov_iter *iter, get_block_t get_block,
  228. dio_iodone_t end_io, int flags)
  229. {
  230. struct buffer_head bh;
  231. ssize_t retval = -EINVAL;
  232. loff_t pos = iocb->ki_pos;
  233. loff_t end = pos + iov_iter_count(iter);
  234. memset(&bh, 0, sizeof(bh));
  235. bh.b_bdev = inode->i_sb->s_bdev;
  236. if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
  237. inode_lock(inode);
  238. /* Protects against truncate */
  239. if (!(flags & DIO_SKIP_DIO_COUNT))
  240. inode_dio_begin(inode);
  241. retval = dax_io(inode, iter, pos, end, get_block, &bh);
  242. if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
  243. inode_unlock(inode);
  244. if (end_io) {
  245. int err;
  246. err = end_io(iocb, pos, retval, bh.b_private);
  247. if (err)
  248. retval = err;
  249. }
  250. if (!(flags & DIO_SKIP_DIO_COUNT))
  251. inode_dio_end(inode);
  252. return retval;
  253. }
  254. EXPORT_SYMBOL_GPL(dax_do_io);
  255. /*
  256. * DAX radix tree locking
  257. */
  /*
   * Identifies one radix tree entry: the address_space it lives in plus its
   * index. Passed as the wake-up key so that only waiters on that particular
   * entry are woken (see wake_exceptional_entry_func()).
   */
  258. struct exceptional_entry_key {
  259. struct address_space *mapping;
  260. unsigned long index;
  261. };
  /*
   * Wait queue entry used while waiting for a locked radix tree entry: the
   * generic wait queue node plus the key of the entry being waited for.
   */
  262. struct wait_exceptional_entry_queue {
  263. wait_queue_t wait;
  264. struct exceptional_entry_key key;
  265. };
  266. static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
  267. int sync, void *keyp)
  268. {
  269. struct exceptional_entry_key *key = keyp;
  270. struct wait_exceptional_entry_queue *ewait =
  271. container_of(wait, struct wait_exceptional_entry_queue, wait);
  272. if (key->mapping != ewait->key.mapping ||
  273. key->index != ewait->key.index)
  274. return 0;
  275. return autoremove_wake_function(wait, mode, sync, NULL);
  276. }
  277. /*
  278. * Check whether the given slot is locked. The function must be called with
  279. * mapping->tree_lock held
  280. */
  281. static inline int slot_locked(struct address_space *mapping, void **slot)
  282. {
  283. unsigned long entry = (unsigned long)
  284. radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
  285. return entry & RADIX_DAX_ENTRY_LOCK;
  286. }
  287. /*
  288. * Mark the given slot is locked. The function must be called with
  289. * mapping->tree_lock held
  290. */
  291. static inline void *lock_slot(struct address_space *mapping, void **slot)
  292. {
  293. unsigned long entry = (unsigned long)
  294. radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
  295. entry |= RADIX_DAX_ENTRY_LOCK;
  296. radix_tree_replace_slot(slot, (void *)entry);
  297. return (void *)entry;
  298. }
  299. /*
  300. * Mark the given slot is unlocked. The function must be called with
  301. * mapping->tree_lock held
  302. */
  303. static inline void *unlock_slot(struct address_space *mapping, void **slot)
  304. {
  305. unsigned long entry = (unsigned long)
  306. radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
  307. entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
  308. radix_tree_replace_slot(slot, (void *)entry);
  309. return (void *)entry;
  310. }
  311. /*
  312. * Lookup entry in radix tree, wait for it to become unlocked if it is
  313. * exceptional entry and return it. The caller must call
  314. * put_unlocked_mapping_entry() when he decided not to lock the entry or
  315. * put_locked_mapping_entry() when he locked the entry and now wants to
  316. * unlock it.
  317. *
  318. * The function must be called with mapping->tree_lock held.
  319. */
  320. static void *get_unlocked_mapping_entry(struct address_space *mapping,
  321. pgoff_t index, void ***slotp)
  322. {
  323. void *ret, **slot;
  324. struct wait_exceptional_entry_queue ewait;
  325. wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
  326. init_wait(&ewait.wait);
  327. ewait.wait.func = wake_exceptional_entry_func;
  328. ewait.key.mapping = mapping;
  329. ewait.key.index = index;
  330. for (;;) {
  331. ret = __radix_tree_lookup(&mapping->page_tree, index, NULL,
  332. &slot);
  333. if (!ret || !radix_tree_exceptional_entry(ret) ||
  334. !slot_locked(mapping, slot)) {
  335. if (slotp)
  336. *slotp = slot;
  337. return ret;
  338. }
  339. prepare_to_wait_exclusive(wq, &ewait.wait,
  340. TASK_UNINTERRUPTIBLE);
  341. spin_unlock_irq(&mapping->tree_lock);
  342. schedule();
  343. finish_wait(wq, &ewait.wait);
  344. spin_lock_irq(&mapping->tree_lock);
  345. }
  346. }
  347. /*
  348. * Find radix tree entry at given index. If it points to a page, return with
  349. * the page locked. If it points to the exceptional entry, return with the
  350. * radix tree entry locked. If the radix tree doesn't contain given index,
  351. * create empty exceptional entry for the index and return with it locked.
  352. *
  353. * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
  354. * persistent memory the benefit is doubtful. We can add that later if we can
  355. * show it helps.
  356. */
  /*
   * Returns the locked page or locked exceptional entry, or an ERR_PTR if
   * radix tree preloading or insertion fails. The function takes and drops
   * mapping->tree_lock itself; it is not held on return.
   */
  357. static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
  358. {
  359. void *ret, **slot;
  360. restart:
  361. spin_lock_irq(&mapping->tree_lock);
  362. ret = get_unlocked_mapping_entry(mapping, index, &slot);
  363. /* No entry for given index? Make sure radix tree is big enough. */
  364. if (!ret) {
  365. int err;
  /* Drop the spinlock before preloading, then retake it for the insert. */
  366. spin_unlock_irq(&mapping->tree_lock);
  367. err = radix_tree_preload(
  368. mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
  369. if (err)
  370. return ERR_PTR(err);
  /* Empty entry (no sector, no type bits) that is born locked. */
  371. ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
  372. RADIX_DAX_ENTRY_LOCK);
  373. spin_lock_irq(&mapping->tree_lock);
  374. err = radix_tree_insert(&mapping->page_tree, index, ret);
  375. radix_tree_preload_end();
  376. if (err) {
  377. spin_unlock_irq(&mapping->tree_lock);
  378. /* Someone already created the entry? */
  379. if (err == -EEXIST)
  380. goto restart;
  381. return ERR_PTR(err);
  382. }
  383. /* Good, we have inserted empty locked entry into the tree. */
  384. mapping->nrexceptional++;
  385. spin_unlock_irq(&mapping->tree_lock);
  386. return ret;
  387. }
  388. /* Normal page in radix tree? */
  389. if (!radix_tree_exceptional_entry(ret)) {
  390. struct page *page = ret;
  /* Take a reference under tree_lock, then lock the page after dropping it. */
  391. get_page(page);
  392. spin_unlock_irq(&mapping->tree_lock);
  393. lock_page(page);
  394. /* Page got truncated? Retry... */
  395. if (unlikely(page->mapping != mapping)) {
  396. unlock_page(page);
  397. put_page(page);
  398. goto restart;
  399. }
  400. return page;
  401. }
  /* Existing unlocked exceptional entry: set its lock bit under tree_lock. */
  402. ret = lock_slot(mapping, slot);
  403. spin_unlock_irq(&mapping->tree_lock);
  404. return ret;
  405. }
  406. void dax_wake_mapping_entry_waiter(struct address_space *mapping,
  407. pgoff_t index, bool wake_all)
  408. {
  409. wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
  410. /*
  411. * Checking for locked entry and prepare_to_wait_exclusive() happens
  412. * under mapping->tree_lock, ditto for entry handling in our callers.
  413. * So at this point all tasks that could have seen our entry locked
  414. * must be in the waitqueue and the following check will see them.
  415. */
  416. if (waitqueue_active(wq)) {
  417. struct exceptional_entry_key key;
  418. key.mapping = mapping;
  419. key.index = index;
  420. __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
  421. }
  422. }
  423. void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
  424. {
  425. void *ret, **slot;
  426. spin_lock_irq(&mapping->tree_lock);
  427. ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
  428. if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) ||
  429. !slot_locked(mapping, slot))) {
  430. spin_unlock_irq(&mapping->tree_lock);
  431. return;
  432. }
  433. unlock_slot(mapping, slot);
  434. spin_unlock_irq(&mapping->tree_lock);
  435. dax_wake_mapping_entry_waiter(mapping, index, false);
  436. }
  437. static void put_locked_mapping_entry(struct address_space *mapping,
  438. pgoff_t index, void *entry)
  439. {
  440. if (!radix_tree_exceptional_entry(entry)) {
  441. unlock_page(entry);
  442. put_page(entry);
  443. } else {
  444. dax_unlock_mapping_entry(mapping, index);
  445. }
  446. }
  447. /*
  448. * Called when we are done with radix tree entry we looked up via
  449. * get_unlocked_mapping_entry() and which we didn't lock in the end.
  450. */
  451. static void put_unlocked_mapping_entry(struct address_space *mapping,
  452. pgoff_t index, void *entry)
  453. {
  454. if (!radix_tree_exceptional_entry(entry))
  455. return;
  456. /* We have to wake up next waiter for the radix tree entry lock */
  457. dax_wake_mapping_entry_waiter(mapping, index, false);
  458. }
  459. /*
  460. * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
  461. * entry to get unlocked before deleting it.
  462. */
  463. int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
  464. {
  465. void *entry;
  466. spin_lock_irq(&mapping->tree_lock);
  467. entry = get_unlocked_mapping_entry(mapping, index, NULL);
  468. /*
  469. * This gets called from truncate / punch_hole path. As such, the caller
  470. * must hold locks protecting against concurrent modifications of the
  471. * radix tree (usually fs-private i_mmap_sem for writing). Since the
  472. * caller has seen exceptional entry for this index, we better find it
  473. * at that index as well...
  474. */
  475. if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
  476. spin_unlock_irq(&mapping->tree_lock);
  477. return 0;
  478. }
  479. radix_tree_delete(&mapping->page_tree, index);
  480. mapping->nrexceptional--;
  481. spin_unlock_irq(&mapping->tree_lock);
  482. dax_wake_mapping_entry_waiter(mapping, index, true);
  483. return 1;
  484. }
  485. /*
  486. * The user has performed a load from a hole in the file. Allocating
  487. * a new page in the file would cause excessive storage usage for
  488. * workloads with sparse files. We allocate a page cache page instead.
  489. * We'll kick it out of the page cache if it's ever written to,
  490. * otherwise it will simply fall out of the page cache under memory
  491. * pressure without ever having been dirtied.
  492. */
  493. static int dax_load_hole(struct address_space *mapping, void *entry,
  494. struct vm_fault *vmf)
  495. {
  496. struct page *page;
  497. /* Hole page already exists? Return it... */
  498. if (!radix_tree_exceptional_entry(entry)) {
  499. vmf->page = entry;
  500. return VM_FAULT_LOCKED;
  501. }
  502. /* This will replace locked radix tree entry with a hole page */
  503. page = find_or_create_page(mapping, vmf->pgoff,
  504. vmf->gfp_mask | __GFP_ZERO);
  505. if (!page) {
  506. put_locked_mapping_entry(mapping, vmf->pgoff, entry);
  507. return VM_FAULT_OOM;
  508. }
  509. vmf->page = page;
  510. return VM_FAULT_LOCKED;
  511. }
  512. static int copy_user_bh(struct page *to, struct inode *inode,
  513. struct buffer_head *bh, unsigned long vaddr)
  514. {
  515. struct blk_dax_ctl dax = {
  516. .sector = to_sector(bh, inode),
  517. .size = bh->b_size,
  518. };
  519. struct block_device *bdev = bh->b_bdev;
  520. void *vto;
  521. if (dax_map_atomic(bdev, &dax) < 0)
  522. return PTR_ERR(dax.addr);
  523. vto = kmap_atomic(to);
  524. copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
  525. kunmap_atomic(vto);
  526. dax_unmap_atomic(bdev, &dax);
  527. return 0;
  528. }
  /*
   * Round a page index down to the index of the first page of the PMD-sized
   * range containing it. The argument is parenthesized so that expressions
   * such as "a | b" are masked as a whole.
   */
  #define DAX_PMD_INDEX(page_index) ((page_index) & (PMD_MASK >> PAGE_SHIFT))
  /*
   * Install a locked PTE-type DAX entry for @sector at vmf->pgoff.
   *
   * @entry is what currently occupies the index: either a hole page (a
   * non-exceptional entry), which is unmapped and removed from the page cache
   * and then replaced, or an exceptional entry, whose slot is overwritten in
   * place. For write faults the inode is marked dirty and the index is tagged
   * PAGECACHE_TAG_DIRTY.
   *
   * Returns the new (locked) entry, or an ERR_PTR if radix tree preloading or
   * insertion fails.
   */
  530. static void *dax_insert_mapping_entry(struct address_space *mapping,
  531. struct vm_fault *vmf,
  532. void *entry, sector_t sector)
  533. {
  534. struct radix_tree_root *page_tree = &mapping->page_tree;
  535. int error = 0;
  536. bool hole_fill = false;
  537. void *new_entry;
  538. pgoff_t index = vmf->pgoff;
  539. if (vmf->flags & FAULT_FLAG_WRITE)
  540. __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
  541. /* Replacing hole page with block mapping? */
  542. if (!radix_tree_exceptional_entry(entry)) {
  543. hole_fill = true;
  544. /*
  545. * Unmap the page now before we remove it from page cache below.
  546. * The page is locked so it cannot be faulted in again.
  547. */
  548. unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
  549. PAGE_SIZE, 0);
  /* Preload before taking tree_lock; ended after the lock is dropped below. */
  550. error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
  551. if (error)
  552. return ERR_PTR(error);
  553. }
  554. spin_lock_irq(&mapping->tree_lock);
  /* Always a PTE entry (pmd == false), created with the lock bit set. */
  555. new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
  556. RADIX_DAX_ENTRY_LOCK);
  557. if (hole_fill) {
  558. __delete_from_page_cache(entry, NULL);
  559. /* Drop pagecache reference */
  560. put_page(entry);
  561. error = radix_tree_insert(page_tree, index, new_entry);
  562. if (error) {
  563. new_entry = ERR_PTR(error);
  564. goto unlock;
  565. }
  566. mapping->nrexceptional++;
  567. } else {
  568. void **slot;
  569. void *ret;
  /* The entry is locked by us, so the lookup is expected to return it. */
  570. ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
  571. WARN_ON_ONCE(ret != entry);
  572. radix_tree_replace_slot(slot, new_entry);
  573. }
  574. if (vmf->flags & FAULT_FLAG_WRITE)
  575. radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
  576. unlock:
  577. spin_unlock_irq(&mapping->tree_lock);
  578. if (hole_fill) {
  579. radix_tree_preload_end();
  580. /*
  581. * We don't need hole page anymore, it has been replaced with
  582. * locked radix tree entry now.
  583. */
  584. if (mapping->a_ops->freepage)
  585. mapping->a_ops->freepage(entry);
  586. unlock_page(entry);
  587. put_page(entry);
  588. }
  589. return new_entry;
  590. }
  /*
   * Write back the cache lines backing one radix tree entry.
   *
   * @entry is the value the caller saw at @index without holding tree_lock;
   * it is re-validated under the lock first. If the slot is gone, holds a
   * different value, or is no longer tagged PAGECACHE_TAG_TOWRITE, nothing is
   * flushed and 0 is returned. Otherwise the device range recorded in the
   * entry (PAGE_SIZE or PMD_SIZE) is mapped, passed to wb_cache_pmem(), and
   * the TOWRITE tag is cleared.
   *
   * Returns a negative errno on failure: -EIO for an entry of unknown type or
   * a mapping shorter than requested, or the error from dax_map_atomic().
   */
  591. static int dax_writeback_one(struct block_device *bdev,
  592. struct address_space *mapping, pgoff_t index, void *entry)
  593. {
  594. struct radix_tree_root *page_tree = &mapping->page_tree;
  595. int type = RADIX_DAX_TYPE(entry);
  596. struct radix_tree_node *node;
  597. struct blk_dax_ctl dax;
  598. void **slot;
  599. int ret = 0;
  600. spin_lock_irq(&mapping->tree_lock);
  601. /*
  602. * Regular page slots are stabilized by the page lock even
  603. * without the tree itself locked. These unlocked entries
  604. * need verification under the tree lock.
  605. */
  606. if (!__radix_tree_lookup(page_tree, index, &node, &slot))
  607. goto unlock;
  608. if (*slot != entry)
  609. goto unlock;
  610. /* another fsync thread may have already written back this entry */
  611. if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
  612. goto unlock;
  613. if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
  614. ret = -EIO;
  615. goto unlock;
  616. }
  617. dax.sector = RADIX_DAX_SECTOR(entry);
  618. dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
  619. spin_unlock_irq(&mapping->tree_lock);
  620. /*
  621. * We cannot hold tree_lock while calling dax_map_atomic() because it
  622. * eventually calls cond_resched().
  623. */
  624. ret = dax_map_atomic(bdev, &dax);
  625. if (ret < 0)
  626. return ret;
  /* The mapping must cover the whole entry, otherwise we cannot flush it. */
  627. if (WARN_ON_ONCE(ret < dax.size)) {
  628. ret = -EIO;
  629. goto unmap;
  630. }
  631. wb_cache_pmem(dax.addr, dax.size);
  632. spin_lock_irq(&mapping->tree_lock);
  633. radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
  634. spin_unlock_irq(&mapping->tree_lock);
  635. unmap:
  636. dax_unmap_atomic(bdev, &dax);
  637. return ret;
  638. unlock:
  639. spin_unlock_irq(&mapping->tree_lock);
  640. return ret;
  641. }
  642. /*
  643. * Flush the mapping to the persistent domain within the byte range of [start,
  644. * end]. This is required by data integrity operations to ensure file data is
  645. * on persistent storage prior to completion of the operation.
  646. */
  647. int dax_writeback_mapping_range(struct address_space *mapping,
  648. struct block_device *bdev, struct writeback_control *wbc)
  649. {
  650. struct inode *inode = mapping->host;
  651. pgoff_t start_index, end_index, pmd_index;
  652. pgoff_t indices[PAGEVEC_SIZE];
  653. struct pagevec pvec;
  654. bool done = false;
  655. int i, ret = 0;
  656. void *entry;
  657. if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
  658. return -EIO;
  659. if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
  660. return 0;
  661. start_index = wbc->range_start >> PAGE_SHIFT;
  662. end_index = wbc->range_end >> PAGE_SHIFT;
  663. pmd_index = DAX_PMD_INDEX(start_index);
  664. rcu_read_lock();
  665. entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
  666. rcu_read_unlock();
  667. /* see if the start of our range is covered by a PMD entry */
  668. if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
  669. start_index = pmd_index;
  670. tag_pages_for_writeback(mapping, start_index, end_index);
  671. pagevec_init(&pvec, 0);
  672. while (!done) {
  673. pvec.nr = find_get_entries_tag(mapping, start_index,
  674. PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
  675. pvec.pages, indices);
  676. if (pvec.nr == 0)
  677. break;
  678. for (i = 0; i < pvec.nr; i++) {
  679. if (indices[i] > end_index) {
  680. done = true;
  681. break;
  682. }
  683. ret = dax_writeback_one(bdev, mapping, indices[i],
  684. pvec.pages[i]);
  685. if (ret < 0)
  686. return ret;
  687. }
  688. }
  689. wmb_pmem();
  690. return 0;
  691. }
  692. EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
  693. static int dax_insert_mapping(struct address_space *mapping,
  694. struct buffer_head *bh, void **entryp,
  695. struct vm_area_struct *vma, struct vm_fault *vmf)
  696. {
  697. unsigned long vaddr = (unsigned long)vmf->virtual_address;
  698. struct block_device *bdev = bh->b_bdev;
  699. struct blk_dax_ctl dax = {
  700. .sector = to_sector(bh, mapping->host),
  701. .size = bh->b_size,
  702. };
  703. void *ret;
  704. void *entry = *entryp;
  705. if (dax_map_atomic(bdev, &dax) < 0)
  706. return PTR_ERR(dax.addr);
  707. dax_unmap_atomic(bdev, &dax);
  708. ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
  709. if (IS_ERR(ret))
  710. return PTR_ERR(ret);
  711. *entryp = ret;
  712. return vm_insert_mixed(vma, vaddr, dax.pfn);
  713. }
  714. /**
  715. * __dax_fault - handle a page fault on a DAX file
  716. * @vma: The virtual memory area where the fault occurred
  717. * @vmf: The description of the fault
  718. * @get_block: The filesystem method used to translate file offsets to blocks
  719. *
  720. * When a page fault occurs, filesystems may call this helper in their
  721. * fault handler for DAX files. __dax_fault() assumes the caller has done all
  722. * the necessary locking for the page fault to proceed successfully.
  723. */
  /*
   * Return value summary (as implemented below):
   *  - VM_FAULT_SIGBUS if the offset is beyond EOF or on any error other than
   *    -ENOMEM / -EBUSY;
   *  - VM_FAULT_OOM on -ENOMEM;
   *  - VM_FAULT_LOCKED / VM_FAULT_DAX_LOCKED for COW faults, with the locked
   *    page in vmf->page or the locked radix tree entry in vmf->entry;
   *  - whatever dax_load_hole() returns for a read fault on a hole;
   *  - VM_FAULT_NOPAGE otherwise.
   * VM_FAULT_MAJOR is OR-ed in when a block had to be allocated.
   */
  724. int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
  725. get_block_t get_block)
  726. {
  727. struct file *file = vma->vm_file;
  728. struct address_space *mapping = file->f_mapping;
  729. struct inode *inode = mapping->host;
  730. void *entry;
  731. struct buffer_head bh;
  732. unsigned long vaddr = (unsigned long)vmf->virtual_address;
  733. unsigned blkbits = inode->i_blkbits;
  734. sector_t block;
  735. pgoff_t size;
  736. int error;
  737. int major = 0;
  738. /*
  739. * Check whether offset isn't beyond end of file now. Caller is supposed
  740. * to hold locks serializing us with truncate / punch hole so this is
  741. * a reliable test.
  742. */
  743. size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
  744. if (vmf->pgoff >= size)
  745. return VM_FAULT_SIGBUS;
  746. memset(&bh, 0, sizeof(bh));
  747. block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
  748. bh.b_bdev = inode->i_sb->s_bdev;
  749. bh.b_size = PAGE_SIZE;
  /* Lock the radix tree entry (or hole page) for this index first. */
  750. entry = grab_mapping_entry(mapping, vmf->pgoff);
  751. if (IS_ERR(entry)) {
  752. error = PTR_ERR(entry);
  753. goto out;
  754. }
  /* First look up the block without allocating (create == 0). */
  755. error = get_block(inode, block, &bh, 0);
  756. if (!error && (bh.b_size < PAGE_SIZE))
  757. error = -EIO; /* fs corruption? */
  758. if (error)
  759. goto unlock_entry;
  /*
   * COW fault: fill the private page from the device, or zero it when the
   * block holds no written data. The entry/page stays locked on return.
   */
  760. if (vmf->cow_page) {
  761. struct page *new_page = vmf->cow_page;
  762. if (buffer_written(&bh))
  763. error = copy_user_bh(new_page, inode, &bh, vaddr);
  764. else
  765. clear_user_highpage(new_page, vaddr);
  766. if (error)
  767. goto unlock_entry;
  768. if (!radix_tree_exceptional_entry(entry)) {
  769. vmf->page = entry;
  770. return VM_FAULT_LOCKED;
  771. }
  772. vmf->entry = entry;
  773. return VM_FAULT_DAX_LOCKED;
  774. }
  /* Hole: allocate a block for write faults, use a zeroed page for reads. */
  775. if (!buffer_mapped(&bh)) {
  776. if (vmf->flags & FAULT_FLAG_WRITE) {
  777. error = get_block(inode, block, &bh, 1);
  778. count_vm_event(PGMAJFAULT);
  779. mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
  780. major = VM_FAULT_MAJOR;
  781. if (!error && (bh.b_size < PAGE_SIZE))
  782. error = -EIO;
  783. if (error)
  784. goto unlock_entry;
  785. } else {
  786. return dax_load_hole(mapping, entry, vmf);
  787. }
  788. }
  789. /* Filesystem should not return unwritten buffers to us! */
  790. WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
  /* May replace @entry with the newly installed locked radix tree entry. */
  791. error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf);
  792. unlock_entry:
  793. put_locked_mapping_entry(mapping, vmf->pgoff, entry);
  794. out:
  795. if (error == -ENOMEM)
  796. return VM_FAULT_OOM | major;
  797. /* -EBUSY is fine, somebody else faulted on the same PTE */
  798. if ((error < 0) && (error != -EBUSY))
  799. return VM_FAULT_SIGBUS | major;
  800. return VM_FAULT_NOPAGE | major;
  801. }
  802. EXPORT_SYMBOL(__dax_fault);
  803. /**
  804. * dax_fault - handle a page fault on a DAX file
  805. * @vma: The virtual memory area where the fault occurred
  806. * @vmf: The description of the fault
  807. * @get_block: The filesystem method used to translate file offsets to blocks
  808. *
  809. * When a page fault occurs, filesystems may call this helper in their
  810. * fault handler for DAX files.
  811. */
  812. int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
  813. get_block_t get_block)
  814. {
  815. int result;
  816. struct super_block *sb = file_inode(vma->vm_file)->i_sb;
  817. if (vmf->flags & FAULT_FLAG_WRITE) {
  818. sb_start_pagefault(sb);
  819. file_update_time(vma->vm_file);
  820. }
  821. result = __dax_fault(vma, vmf, get_block);
  822. if (vmf->flags & FAULT_FLAG_WRITE)
  823. sb_end_pagefault(sb);
  824. return result;
  825. }
  826. EXPORT_SYMBOL_GPL(dax_fault);
  827. #if defined(CONFIG_TRANSPARENT_HUGEPAGE)
  /*
   * Mask selecting the 'colour' of a page offset, i.e. its low bits within
   * a PMD: the number of pages spanned by one PMD, minus one.  It is used
   * repeatedly by the PMD fault code below.
   */
  #define PG_PMD_COLOUR   ((PMD_SIZE / PAGE_SIZE) - 1)
  833. static void __dax_dbg(struct buffer_head *bh, unsigned long address,
  834. const char *reason, const char *fn)
  835. {
  836. if (bh) {
  837. char bname[BDEVNAME_SIZE];
  838. bdevname(bh->b_bdev, bname);
  839. pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
  840. "length %zd fallback: %s\n", fn, current->comm,
  841. address, bname, bh->b_state, (u64)bh->b_blocknr,
  842. bh->b_size, reason);
  843. } else {
  844. pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
  845. current->comm, address, reason);
  846. }
  847. }
  848. #define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd")
  849. int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
  850. pmd_t *pmd, unsigned int flags, get_block_t get_block)
  851. {
  852. struct file *file = vma->vm_file;
  853. struct address_space *mapping = file->f_mapping;
  854. struct inode *inode = mapping->host;
  855. struct buffer_head bh;
  856. unsigned blkbits = inode->i_blkbits;
  857. unsigned long pmd_addr = address & PMD_MASK;
  858. bool write = flags & FAULT_FLAG_WRITE;
  859. struct block_device *bdev;
  860. pgoff_t size, pgoff;
  861. sector_t block;
  862. int result = 0;
  863. bool alloc = false;
  864. /* dax pmd mappings require pfn_t_devmap() */
  865. if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
  866. return VM_FAULT_FALLBACK;
  867. /* Fall back to PTEs if we're going to COW */
  868. if (write && !(vma->vm_flags & VM_SHARED)) {
  869. split_huge_pmd(vma, pmd, address);
  870. dax_pmd_dbg(NULL, address, "cow write");
  871. return VM_FAULT_FALLBACK;
  872. }
  873. /* If the PMD would extend outside the VMA */
  874. if (pmd_addr < vma->vm_start) {
  875. dax_pmd_dbg(NULL, address, "vma start unaligned");
  876. return VM_FAULT_FALLBACK;
  877. }
  878. if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
  879. dax_pmd_dbg(NULL, address, "vma end unaligned");
  880. return VM_FAULT_FALLBACK;
  881. }
  882. pgoff = linear_page_index(vma, pmd_addr);
  883. size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
  884. if (pgoff >= size)
  885. return VM_FAULT_SIGBUS;
  886. /* If the PMD would cover blocks out of the file */
  887. if ((pgoff | PG_PMD_COLOUR) >= size) {
  888. dax_pmd_dbg(NULL, address,
  889. "offset + huge page size > file size");
  890. return VM_FAULT_FALLBACK;
  891. }
  892. memset(&bh, 0, sizeof(bh));
  893. bh.b_bdev = inode->i_sb->s_bdev;
  894. block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
  895. bh.b_size = PMD_SIZE;
  896. if (get_block(inode, block, &bh, 0) != 0)
  897. return VM_FAULT_SIGBUS;
  898. if (!buffer_mapped(&bh) && write) {
  899. if (get_block(inode, block, &bh, 1) != 0)
  900. return VM_FAULT_SIGBUS;
  901. alloc = true;
  902. WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
  903. }
  904. bdev = bh.b_bdev;
  905. /*
  906. * If the filesystem isn't willing to tell us the length of a hole,
  907. * just fall back to PTEs. Calling get_block 512 times in a loop
  908. * would be silly.
  909. */
  910. if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
  911. dax_pmd_dbg(&bh, address, "allocated block too small");
  912. return VM_FAULT_FALLBACK;
  913. }
  914. /*
  915. * If we allocated new storage, make sure no process has any
  916. * zero pages covering this hole
  917. */
  918. if (alloc) {
  919. loff_t lstart = pgoff << PAGE_SHIFT;
  920. loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */
  921. truncate_pagecache_range(inode, lstart, lend);
  922. }
  923. if (!write && !buffer_mapped(&bh)) {
  924. spinlock_t *ptl;
  925. pmd_t entry;
  926. struct page *zero_page = get_huge_zero_page();
  927. if (unlikely(!zero_page)) {
  928. dax_pmd_dbg(&bh, address, "no zero page");
  929. goto fallback;
  930. }
  931. ptl = pmd_lock(vma->vm_mm, pmd);
  932. if (!pmd_none(*pmd)) {
  933. spin_unlock(ptl);
  934. dax_pmd_dbg(&bh, address, "pmd already present");
  935. goto fallback;
  936. }
  937. dev_dbg(part_to_dev(bdev->bd_part),
  938. "%s: %s addr: %lx pfn: <zero> sect: %llx\n",
  939. __func__, current->comm, address,
  940. (unsigned long long) to_sector(&bh, inode));
  941. entry = mk_pmd(zero_page, vma->vm_page_prot);
  942. entry = pmd_mkhuge(entry);
  943. set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
  944. result = VM_FAULT_NOPAGE;
  945. spin_unlock(ptl);
  946. } else {
  947. struct blk_dax_ctl dax = {
  948. .sector = to_sector(&bh, inode),
  949. .size = PMD_SIZE,
  950. };
  951. long length = dax_map_atomic(bdev, &dax);
  952. if (length < 0) {
  953. dax_pmd_dbg(&bh, address, "dax-error fallback");
  954. goto fallback;
  955. }
  956. if (length < PMD_SIZE) {
  957. dax_pmd_dbg(&bh, address, "dax-length too small");
  958. dax_unmap_atomic(bdev, &dax);
  959. goto fallback;
  960. }
  961. if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
  962. dax_pmd_dbg(&bh, address, "pfn unaligned");
  963. dax_unmap_atomic(bdev, &dax);
  964. goto fallback;
  965. }
  966. if (!pfn_t_devmap(dax.pfn)) {
  967. dax_unmap_atomic(bdev, &dax);
  968. dax_pmd_dbg(&bh, address, "pfn not in memmap");
  969. goto fallback;
  970. }
  971. dax_unmap_atomic(bdev, &dax);
  972. /*
  973. * For PTE faults we insert a radix tree entry for reads, and
  974. * leave it clean. Then on the first write we dirty the radix
  975. * tree entry via the dax_pfn_mkwrite() path. This sequence
  976. * allows the dax_pfn_mkwrite() call to be simpler and avoid a
  977. * call into get_block() to translate the pgoff to a sector in
  978. * order to be able to create a new radix tree entry.
  979. *
  980. * The PMD path doesn't have an equivalent to
  981. * dax_pfn_mkwrite(), though, so for a read followed by a
  982. * write we traverse all the way through __dax_pmd_fault()
  983. * twice. This means we can just skip inserting a radix tree
  984. * entry completely on the initial read and just wait until
  985. * the write to insert a dirty entry.
  986. */
  987. if (write) {
  988. /*
  989. * We should insert radix-tree entry and dirty it here.
  990. * For now this is broken...
  991. */
  992. }
  993. dev_dbg(part_to_dev(bdev->bd_part),
  994. "%s: %s addr: %lx pfn: %lx sect: %llx\n",
  995. __func__, current->comm, address,
  996. pfn_t_to_pfn(dax.pfn),
  997. (unsigned long long) dax.sector);
  998. result |= vmf_insert_pfn_pmd(vma, address, pmd,
  999. dax.pfn, write);
  1000. }
  1001. out:
  1002. return result;
  1003. fallback:
  1004. count_vm_event(THP_FAULT_FALLBACK);
  1005. result = VM_FAULT_FALLBACK;
  1006. goto out;
  1007. }
  1008. EXPORT_SYMBOL_GPL(__dax_pmd_fault);
  1009. /**
  1010. * dax_pmd_fault - handle a PMD fault on a DAX file
  1011. * @vma: The virtual memory area where the fault occurred
  1012. * @vmf: The description of the fault
  1013. * @get_block: The filesystem method used to translate file offsets to blocks
  1014. *
  1015. * When a page fault occurs, filesystems may call this helper in their
  1016. * pmd_fault handler for DAX files.
  1017. */
  1018. int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
  1019. pmd_t *pmd, unsigned int flags, get_block_t get_block)
  1020. {
  1021. int result;
  1022. struct super_block *sb = file_inode(vma->vm_file)->i_sb;
  1023. if (flags & FAULT_FLAG_WRITE) {
  1024. sb_start_pagefault(sb);
  1025. file_update_time(vma->vm_file);
  1026. }
  1027. result = __dax_pmd_fault(vma, address, pmd, flags, get_block);
  1028. if (flags & FAULT_FLAG_WRITE)
  1029. sb_end_pagefault(sb);
  1030. return result;
  1031. }
  1032. EXPORT_SYMBOL_GPL(dax_pmd_fault);
  1033. #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  1034. /**
  1035. * dax_pfn_mkwrite - handle first write to DAX page
  1036. * @vma: The virtual memory area where the fault occurred
  1037. * @vmf: The description of the fault
  1038. */
  1039. int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
  1040. {
  1041. struct file *file = vma->vm_file;
  1042. struct address_space *mapping = file->f_mapping;
  1043. void *entry;
  1044. pgoff_t index = vmf->pgoff;
  1045. spin_lock_irq(&mapping->tree_lock);
  1046. entry = get_unlocked_mapping_entry(mapping, index, NULL);
  1047. if (!entry || !radix_tree_exceptional_entry(entry))
  1048. goto out;
  1049. radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
  1050. put_unlocked_mapping_entry(mapping, index, entry);
  1051. out:
  1052. spin_unlock_irq(&mapping->tree_lock);
  1053. return VM_FAULT_NOPAGE;
  1054. }
  1055. EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
  /*
   * Return true when both @offset and @length are multiples of the logical
   * block size reported for @bdev, false otherwise.
   */
  static bool dax_range_is_aligned(struct block_device *bdev,
                                   unsigned int offset, unsigned int length)
  {
          unsigned short sector_size = bdev_logical_block_size(bdev);

          return IS_ALIGNED(offset, sector_size) &&
                 IS_ALIGNED(length, sector_size);
  }
  1066. int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
  1067. unsigned int offset, unsigned int length)
  1068. {
  1069. struct blk_dax_ctl dax = {
  1070. .sector = sector,
  1071. .size = PAGE_SIZE,
  1072. };
  1073. if (dax_range_is_aligned(bdev, offset, length)) {
  1074. sector_t start_sector = dax.sector + (offset >> 9);
  1075. return blkdev_issue_zeroout(bdev, start_sector,
  1076. length >> 9, GFP_NOFS, true);
  1077. } else {
  1078. if (dax_map_atomic(bdev, &dax) < 0)
  1079. return PTR_ERR(dax.addr);
  1080. clear_pmem(dax.addr + offset, length);
  1081. wmb_pmem();
  1082. dax_unmap_atomic(bdev, &dax);
  1083. }
  1084. return 0;
  1085. }
  1086. EXPORT_SYMBOL_GPL(__dax_zero_page_range);
  1087. /**
  1088. * dax_zero_page_range - zero a range within a page of a DAX file
  1089. * @inode: The file being truncated
  1090. * @from: The file offset that is being truncated to
  1091. * @length: The number of bytes to zero
  1092. * @get_block: The filesystem method used to translate file offsets to blocks
  1093. *
  1094. * This function can be called by a filesystem when it is zeroing part of a
  1095. * page in a DAX file. This is intended for hole-punch operations. If
  1096. * you are truncating a file, the helper function dax_truncate_page() may be
  1097. * more convenient.
  1098. */
  1099. int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
  1100. get_block_t get_block)
  1101. {
  1102. struct buffer_head bh;
  1103. pgoff_t index = from >> PAGE_SHIFT;
  1104. unsigned offset = from & (PAGE_SIZE-1);
  1105. int err;
  1106. /* Block boundary? Nothing to do */
  1107. if (!length)
  1108. return 0;
  1109. BUG_ON((offset + length) > PAGE_SIZE);
  1110. memset(&bh, 0, sizeof(bh));
  1111. bh.b_bdev = inode->i_sb->s_bdev;
  1112. bh.b_size = PAGE_SIZE;
  1113. err = get_block(inode, index, &bh, 0);
  1114. if (err < 0 || !buffer_written(&bh))
  1115. return err;
  1116. return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
  1117. offset, length);
  1118. }
  1119. EXPORT_SYMBOL_GPL(dax_zero_page_range);
  1120. /**
  1121. * dax_truncate_page - handle a partial page being truncated in a DAX file
  1122. * @inode: The file being truncated
  1123. * @from: The file offset that is being truncated to
  1124. * @get_block: The filesystem method used to translate file offsets to blocks
  1125. *
  1126. * Similar to block_truncate_page(), this function can be called by a
  1127. * filesystem when it is truncating a DAX file to handle the partial page.
  1128. */
  1129. int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
  1130. {
  1131. unsigned length = PAGE_ALIGN(from) - from;
  1132. return dax_zero_page_range(inode, from, length, get_block);
  1133. }
  1134. EXPORT_SYMBOL_GPL(dax_truncate_page);