/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>

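/*
 * dax_map_atomic() translates dax->sector into a kernel virtual address and
 * pfn via bdev_direct_access(), holding a request-queue reference so the
 * mapping stays valid until the matching dax_unmap_atomic() call.  On success
 * it returns the number of contiguous bytes available at dax->addr; on
 * failure dax->addr holds an ERR_PTR() and the queue reference is dropped,
 * which makes dax_unmap_atomic() safe to call unconditionally on the same
 * blk_dax_ctl.
 */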
static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
{
	struct request_queue *q = bdev->bd_queue;
	long rc = -EIO;

	dax->addr = (void __pmem *) ERR_PTR(-EIO);
	if (blk_queue_enter(q, true) != 0)
		return rc;

	rc = bdev_direct_access(bdev, dax);
	if (rc < 0) {
		dax->addr = (void __pmem *) ERR_PTR(rc);
		blk_queue_exit(q);
		return rc;
	}
	return rc;
}

static void dax_unmap_atomic(struct block_device *bdev,
		const struct blk_dax_ctl *dax)
{
	if (IS_ERR(dax->addr))
		return;
	blk_queue_exit(bdev->bd_queue);
}

struct page *read_dax_sector(struct block_device *bdev, sector_t n)
{
	struct page *page = alloc_pages(GFP_KERNEL, 0);
	struct blk_dax_ctl dax = {
		.size = PAGE_SIZE,
		.sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
	};
	long rc;

	if (!page)
		return ERR_PTR(-ENOMEM);

	rc = dax_map_atomic(bdev, &dax);
	if (rc < 0)
		return ERR_PTR(rc);
	memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
	dax_unmap_atomic(bdev, &dax);
	return page;
}

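/*
 * Illustrative sketch (not taken from an in-tree caller): a path that needs a
 * page-sized, page-cache-free copy of an on-media sector from a DAX-capable
 * block device might use read_dax_sector() like this:
 *
 *	struct page *page = read_dax_sector(bdev, sector);
 *
 *	if (IS_ERR(page))
 *		return PTR_ERR(page);
 *	examine_bytes(page_address(page));	// hypothetical helper
 *	put_page(page);
 *
 * The caller owns the returned page and must drop its reference when done.
 */
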
/*
 * dax_clear_blocks() is called from within transaction context from XFS,
 * and hence this means the stack from this point must follow GFP_NOFS
 * semantics for all operations.
 */
int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
{
	struct block_device *bdev = inode->i_sb->s_bdev;
	struct blk_dax_ctl dax = {
		.sector = block << (inode->i_blkbits - 9),
		.size = _size,
	};

	might_sleep();
	do {
		long count, sz;

		count = dax_map_atomic(bdev, &dax);
		if (count < 0)
			return count;
		sz = min_t(long, count, SZ_128K);
		clear_pmem(dax.addr, sz);
		dax.size -= sz;
		dax.sector += sz / 512;
		dax_unmap_atomic(bdev, &dax);
		cond_resched();
	} while (dax.size);

	wmb_pmem();
	return 0;
}
EXPORT_SYMBOL_GPL(dax_clear_blocks);

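/*
 * Illustrative sketch: a filesystem that has just allocated new blocks for a
 * DAX file could zero them on media before exposing them, e.g. from its block
 * allocation path (the "foo_" names are hypothetical):
 *
 *	static int foo_alloc_dax_blocks(struct inode *inode, sector_t iblock,
 *					long len)
 *	{
 *		int err = foo_allocate_blocks(inode, iblock, len);
 *
 *		if (err)
 *			return err;
 *		return dax_clear_blocks(inode, iblock, len);
 *	}
 *
 * dax_clear_blocks() issues the final wmb_pmem() itself, so the zeroes are
 * durable by the time it returns 0.
 */
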
/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
		loff_t pos, loff_t end)
{
	loff_t final = end - pos + first; /* The final byte of the buffer */

	if (first > 0)
		clear_pmem(addr, first);
	if (final < size)
		clear_pmem(addr + final, size - final);
}

static bool buffer_written(struct buffer_head *bh)
{
	return buffer_mapped(bh) && !buffer_unwritten(bh);
}

/*
 * When ext4 encounters a hole, it returns without modifying the buffer_head
 * which means that we can't trust b_size. To cope with this, we set b_state
 * to 0 before calling get_block and, if any bit is set, we know we can trust
 * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is
 * and would save us time calling get_block repeatedly.
 */
static bool buffer_size_valid(struct buffer_head *bh)
{
	return bh->b_state != 0;
}

static sector_t to_sector(const struct buffer_head *bh,
		const struct inode *inode)
{
	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);

	return sector;
}

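/*
 * dax_io() walks [start, end), calling get_block() to resolve each extent and
 * then copying directly between the iterator and the pmem mapping: writes go
 * through copy_from_iter_pmem(), reads of written extents through
 * copy_to_iter(), and reads of holes are satisfied with iov_iter_zero().
 * Newly allocated or unwritten extents are pre-zeroed via dax_new_buf(), and
 * a single wmb_pmem() at the end makes any non-temporal stores durable.
 */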
static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
		      loff_t start, loff_t end, get_block_t get_block,
		      struct buffer_head *bh)
{
	loff_t pos = start, max = start, bh_max = start;
	bool hole = false, need_wmb = false;
	struct block_device *bdev = NULL;
	int rw = iov_iter_rw(iter), rc;
	long map_len = 0;
	struct blk_dax_ctl dax = {
		.addr = (void __pmem *) ERR_PTR(-EIO),
	};

	if (rw == READ)
		end = min(end, i_size_read(inode));

	while (pos < end) {
		size_t len;
		if (pos == max) {
			unsigned blkbits = inode->i_blkbits;
			long page = pos >> PAGE_SHIFT;
			sector_t block = page << (PAGE_SHIFT - blkbits);
			unsigned first = pos - (block << blkbits);
			long size;

			if (pos == bh_max) {
				bh->b_size = PAGE_ALIGN(end - pos);
				bh->b_state = 0;
				rc = get_block(inode, block, bh, rw == WRITE);
				if (rc)
					break;
				if (!buffer_size_valid(bh))
					bh->b_size = 1 << blkbits;
				bh_max = pos - first + bh->b_size;
				bdev = bh->b_bdev;
			} else {
				unsigned done = bh->b_size -
						(bh_max - (pos - first));
				bh->b_blocknr += done >> blkbits;
				bh->b_size -= done;
			}

			hole = rw == READ && !buffer_written(bh);
			if (hole) {
				size = bh->b_size - first;
			} else {
				dax_unmap_atomic(bdev, &dax);
				dax.sector = to_sector(bh, inode);
				dax.size = bh->b_size;
				map_len = dax_map_atomic(bdev, &dax);
				if (map_len < 0) {
					rc = map_len;
					break;
				}
				if (buffer_unwritten(bh) || buffer_new(bh)) {
					dax_new_buf(dax.addr, map_len, first,
							pos, end);
					need_wmb = true;
				}
				dax.addr += first;
				size = map_len - first;
			}
			max = min(pos + size, end);
		}

		if (iov_iter_rw(iter) == WRITE) {
			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
			need_wmb = true;
		} else if (!hole)
			len = copy_to_iter((void __force *) dax.addr, max - pos,
					iter);
		else
			len = iov_iter_zero(max - pos, iter);

		if (!len) {
			rc = -EFAULT;
			break;
		}

		pos += len;
		if (!IS_ERR(dax.addr))
			dax.addr += len;
	}

	if (need_wmb)
		wmb_pmem();
	dax_unmap_atomic(bdev, &dax);

	return (pos == start) ? rc : pos - start;
}

/**
 * dax_do_io - Perform I/O to a DAX file
 * @iocb: The control block for this I/O
 * @inode: The file which the I/O is directed at
 * @iter: The addresses to do I/O from or to
 * @pos: The file offset where the I/O starts
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @end_io: A filesystem callback for I/O completion
 * @flags: See below
 *
 * This function uses the same locking scheme as do_blockdev_direct_IO:
 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
 * caller for writes.  For reads, we take and release the i_mutex ourselves.
 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
 * is in progress.
 */
ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
		  struct iov_iter *iter, loff_t pos, get_block_t get_block,
		  dio_iodone_t end_io, int flags)
{
	struct buffer_head bh;
	ssize_t retval = -EINVAL;
	loff_t end = pos + iov_iter_count(iter);

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
		struct address_space *mapping = inode->i_mapping;
		inode_lock(inode);
		retval = filemap_write_and_wait_range(mapping, pos, end - 1);
		if (retval) {
			inode_unlock(inode);
			goto out;
		}
	}

	/* Protects against truncate */
	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_begin(inode);

	retval = dax_io(inode, iter, pos, end, get_block, &bh);

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
		inode_unlock(inode);

	if ((retval > 0) && end_io)
		end_io(iocb, pos, retval, bh.b_private);

	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_end(inode);
out:
	return retval;
}
EXPORT_SYMBOL_GPL(dax_do_io);

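/*
 * Illustrative sketch of how a filesystem might route its ->direct_IO method
 * through dax_do_io() for DAX inodes (the "foo_" names are hypothetical; the
 * flags mirror the DIO_LOCKING scheme described above):
 *
 *	static ssize_t foo_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 *				     loff_t offset)
 *	{
 *		struct inode *inode = file_inode(iocb->ki_filp);
 *
 *		if (IS_DAX(inode))
 *			return dax_do_io(iocb, inode, iter, offset,
 *					foo_get_block, NULL, DIO_LOCKING);
 *		return blockdev_direct_IO(iocb, inode, iter, offset,
 *					foo_get_block);
 *	}
 */
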
/*
 * The user has performed a load from a hole in the file.  Allocating
 * a new page in the file would cause excessive storage usage for
 * workloads with sparse files.  We allocate a page cache page instead.
 * We'll kick it out of the page cache if it's ever written to,
 * otherwise it will simply fall out of the page cache under memory
 * pressure without ever having been dirtied.
 */
static int dax_load_hole(struct address_space *mapping, struct page *page,
							struct vm_fault *vmf)
{
	unsigned long size;
	struct inode *inode = mapping->host;

	if (!page)
		page = find_or_create_page(mapping, vmf->pgoff,
						GFP_KERNEL | __GFP_ZERO);
	if (!page)
		return VM_FAULT_OOM;
	/* Recheck i_size under page lock to avoid truncate race */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (vmf->pgoff >= size) {
		unlock_page(page);
		page_cache_release(page);
		return VM_FAULT_SIGBUS;
	}

	vmf->page = page;
	return VM_FAULT_LOCKED;
}

static int copy_user_bh(struct page *to, struct inode *inode,
		struct buffer_head *bh, unsigned long vaddr)
{
	struct blk_dax_ctl dax = {
		.sector = to_sector(bh, inode),
		.size = bh->b_size,
	};
	struct block_device *bdev = bh->b_bdev;
	void *vto;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	vto = kmap_atomic(to);
	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
	kunmap_atomic(vto);
	dax_unmap_atomic(bdev, &dax);
	return 0;
}

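/*
 * fsync/msync support: DAX has no struct pages to dirty, so writeback state
 * is tracked with exceptional radix tree entries that encode the sector and
 * whether the mapping was a PTE or a PMD (the RADIX_DAX_* helpers).
 * dax_radix_entry() below installs or re-dirties such an entry at fault time,
 * and dax_writeback_mapping_range() later walks the TOWRITE-tagged entries
 * and flushes the corresponding pmem ranges.
 */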
#define NO_SECTOR -1
#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_CACHE_SHIFT))

static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
		sector_t sector, bool pmd_entry, bool dirty)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	pgoff_t pmd_index = DAX_PMD_INDEX(index);
	int type, error = 0;
	void *entry;

	WARN_ON_ONCE(pmd_entry && !dirty);
	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	spin_lock_irq(&mapping->tree_lock);

	entry = radix_tree_lookup(page_tree, pmd_index);
	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
		index = pmd_index;
		goto dirty;
	}

	entry = radix_tree_lookup(page_tree, index);
	if (entry) {
		type = RADIX_DAX_TYPE(entry);
		if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
					type != RADIX_DAX_PMD)) {
			error = -EIO;
			goto unlock;
		}

		if (!pmd_entry || type == RADIX_DAX_PMD)
			goto dirty;

		/*
		 * We only insert dirty PMD entries into the radix tree.  This
		 * means we don't need to worry about removing a dirty PTE
		 * entry and inserting a clean PMD entry, thus reducing the
		 * range we would flush with a follow-up fsync/msync call.
		 */
		radix_tree_delete(&mapping->page_tree, index);
		mapping->nrexceptional--;
	}

	if (sector == NO_SECTOR) {
		/*
		 * This can happen during correct operation if our pfn_mkwrite
		 * fault raced against a hole punch operation.  If this
		 * happens the pte that was hole punched will have been
		 * unmapped and the radix tree entry will have been removed by
		 * the time we are called, but the call will still happen.  We
		 * will return all the way up to wp_pfn_shared(), where the
		 * pte_same() check will fail, eventually causing page fault
		 * to be retried by the CPU.
		 */
		goto unlock;
	}

	error = radix_tree_insert(page_tree, index,
			RADIX_DAX_ENTRY(sector, pmd_entry));
	if (error)
		goto unlock;

	mapping->nrexceptional++;
dirty:
	if (dirty)
		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return error;
}

static int dax_writeback_one(struct block_device *bdev,
		struct address_space *mapping, pgoff_t index, void *entry)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	int type = RADIX_DAX_TYPE(entry);
	struct radix_tree_node *node;
	struct blk_dax_ctl dax;
	void **slot;
	int ret = 0;

	spin_lock_irq(&mapping->tree_lock);
	/*
	 * Regular page slots are stabilized by the page lock even
	 * without the tree itself locked.  These unlocked entries
	 * need verification under the tree lock.
	 */
	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
		goto unlock;
	if (*slot != entry)
		goto unlock;

	/* another fsync thread may have already written back this entry */
	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
		goto unlock;

	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
		ret = -EIO;
		goto unlock;
	}

	dax.sector = RADIX_DAX_SECTOR(entry);
	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
	spin_unlock_irq(&mapping->tree_lock);

	/*
	 * We cannot hold tree_lock while calling dax_map_atomic() because it
	 * eventually calls cond_resched().
	 */
	ret = dax_map_atomic(bdev, &dax);
	if (ret < 0)
		return ret;

	if (WARN_ON_ONCE(ret < dax.size)) {
		ret = -EIO;
		goto unmap;
	}

	wb_cache_pmem(dax.addr, dax.size);

	spin_lock_irq(&mapping->tree_lock);
	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
	spin_unlock_irq(&mapping->tree_lock);
unmap:
	dax_unmap_atomic(bdev, &dax);
	return ret;

unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}

/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end]. This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
 */
int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
		loff_t end)
{
	struct inode *inode = mapping->host;
	struct block_device *bdev = inode->i_sb->s_bdev;
	pgoff_t start_index, end_index, pmd_index;
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	bool done = false;
	int i, ret = 0;
	void *entry;

	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	start_index = start >> PAGE_CACHE_SHIFT;
	end_index = end >> PAGE_CACHE_SHIFT;
	pmd_index = DAX_PMD_INDEX(start_index);

	rcu_read_lock();
	entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
	rcu_read_unlock();

	/* see if the start of our range is covered by a PMD entry */
	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
		start_index = pmd_index;

	tag_pages_for_writeback(mapping, start_index, end_index);

	pagevec_init(&pvec, 0);
	while (!done) {
		pvec.nr = find_get_entries_tag(mapping, start_index,
				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
				pvec.pages, indices);

		if (pvec.nr == 0)
			break;

		for (i = 0; i < pvec.nr; i++) {
			if (indices[i] > end_index) {
				done = true;
				break;
			}

			ret = dax_writeback_one(bdev, mapping, indices[i],
					pvec.pages[i]);
			if (ret < 0)
				return ret;
		}
	}
	wmb_pmem();
	return 0;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);

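/*
 * Illustrative sketch: a filesystem's fsync path could flush the dirty DAX
 * ranges for data integrity before committing its metadata (the "foo_" names
 * are hypothetical):
 *
 *	static int foo_fsync(struct file *file, loff_t start, loff_t end,
 *			     int datasync)
 *	{
 *		struct address_space *mapping = file->f_mapping;
 *		int err = 0;
 *
 *		if (IS_DAX(mapping->host))
 *			err = dax_writeback_mapping_range(mapping, start, end);
 *		if (err)
 *			return err;
 *		return foo_commit_metadata(file, datasync);
 *	}
 */
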
static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
			struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	struct address_space *mapping = inode->i_mapping;
	struct block_device *bdev = bh->b_bdev;
	struct blk_dax_ctl dax = {
		.sector = to_sector(bh, inode),
		.size = bh->b_size,
	};
	pgoff_t size;
	int error;

	i_mmap_lock_read(mapping);

	/*
	 * Check truncate didn't happen while we were allocating a block.
	 * If it did, this block may or may not be still allocated to the
	 * file.  We can't tell the filesystem to free it because we can't
	 * take i_mutex here.  In the worst case, the file still has blocks
	 * allocated past the end of the file.
	 */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (unlikely(vmf->pgoff >= size)) {
		error = -EIO;
		goto out;
	}

	if (dax_map_atomic(bdev, &dax) < 0) {
		error = PTR_ERR(dax.addr);
		goto out;
	}

	if (buffer_unwritten(bh) || buffer_new(bh)) {
		clear_pmem(dax.addr, PAGE_SIZE);
		wmb_pmem();
	}
	dax_unmap_atomic(bdev, &dax);

	error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
			vmf->flags & FAULT_FLAG_WRITE);
	if (error)
		goto out;

	error = vm_insert_mixed(vma, vaddr, dax.pfn);

out:
	i_mmap_unlock_read(mapping);

	return error;
}

/**
 * __dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: The filesystem method used to convert unwritten blocks
 *	to written so the data written to them is exposed.  This is required
 *	by write faults for filesystems that will return unwritten extent
 *	mappings from @get_block, but it is optional for reads as
 *	dax_insert_mapping() will always zero unwritten blocks.  If the fs
 *	does not support unwritten extents, it should pass NULL.
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files. __dax_fault() assumes the caller has done all
 * the necessary locking for the page fault to proceed successfully.
 */
int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
			get_block_t get_block, dax_iodone_t complete_unwritten)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct page *page;
	struct buffer_head bh;
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	unsigned blkbits = inode->i_blkbits;
	sector_t block;
	pgoff_t size;
	int error;
	int major = 0;

	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (vmf->pgoff >= size)
		return VM_FAULT_SIGBUS;

	memset(&bh, 0, sizeof(bh));
	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
	bh.b_bdev = inode->i_sb->s_bdev;
	bh.b_size = PAGE_SIZE;

repeat:
	page = find_get_page(mapping, vmf->pgoff);
	if (page) {
		if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
			page_cache_release(page);
			return VM_FAULT_RETRY;
		}
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			page_cache_release(page);
			goto repeat;
		}
		size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (unlikely(vmf->pgoff >= size)) {
			/*
			 * We have a struct page covering a hole in the file
			 * from a read fault and we've raced with a truncate
			 */
			error = -EIO;
			goto unlock_page;
		}
	}

	error = get_block(inode, block, &bh, 0);
	if (!error && (bh.b_size < PAGE_SIZE))
		error = -EIO;		/* fs corruption? */
	if (error)
		goto unlock_page;

	if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
		if (vmf->flags & FAULT_FLAG_WRITE) {
			error = get_block(inode, block, &bh, 1);
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			major = VM_FAULT_MAJOR;
			if (!error && (bh.b_size < PAGE_SIZE))
				error = -EIO;
			if (error)
				goto unlock_page;
		} else {
			return dax_load_hole(mapping, page, vmf);
		}
	}

	if (vmf->cow_page) {
		struct page *new_page = vmf->cow_page;
		if (buffer_written(&bh))
			error = copy_user_bh(new_page, inode, &bh, vaddr);
		else
			clear_user_highpage(new_page, vaddr);
		if (error)
			goto unlock_page;
		vmf->page = page;
		if (!page) {
			i_mmap_lock_read(mapping);
			/* Check we didn't race with truncate */
			size = (i_size_read(inode) + PAGE_SIZE - 1) >>
								PAGE_SHIFT;
			if (vmf->pgoff >= size) {
				i_mmap_unlock_read(mapping);
				error = -EIO;
				goto out;
			}
		}
		return VM_FAULT_LOCKED;
	}

	/* Check we didn't race with a read fault installing a new page */
	if (!page && major)
		page = find_lock_page(mapping, vmf->pgoff);

	if (page) {
		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
							PAGE_CACHE_SIZE, 0);
		delete_from_page_cache(page);
		unlock_page(page);
		page_cache_release(page);
		page = NULL;
	}

	/*
	 * If we successfully insert the new mapping over an unwritten extent,
	 * we need to ensure we convert the unwritten extent. If there is an
	 * error inserting the mapping, the filesystem needs to leave it as
	 * unwritten to prevent exposure of the stale underlying data to
	 * userspace, but we still need to call the completion function so
	 * the private resources on the mapping buffer can be released. We
	 * indicate what the callback should do via the uptodate variable, same
	 * as for normal BH based IO completions.
	 */
	error = dax_insert_mapping(inode, &bh, vma, vmf);
	if (buffer_unwritten(&bh)) {
		if (complete_unwritten)
			complete_unwritten(&bh, !error);
		else
			WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
	}

out:
	if (error == -ENOMEM)
		return VM_FAULT_OOM | major;
	/* -EBUSY is fine, somebody else faulted on the same PTE */
	if ((error < 0) && (error != -EBUSY))
		return VM_FAULT_SIGBUS | major;
	return VM_FAULT_NOPAGE | major;

unlock_page:
	if (page) {
		unlock_page(page);
		page_cache_release(page);
	}
	goto out;
}
EXPORT_SYMBOL(__dax_fault);

/**
 * dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: The filesystem method used to convert unwritten blocks
 *	to written, as described for __dax_fault(); may be NULL if the
 *	filesystem never returns unwritten extents from @get_block
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files.
 */
int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
	      get_block_t get_block, dax_iodone_t complete_unwritten)
{
	int result;
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;

	if (vmf->flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(sb);
		file_update_time(vma->vm_file);
	}
	result = __dax_fault(vma, vmf, get_block, complete_unwritten);
	if (vmf->flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(sb);

	return result;
}
EXPORT_SYMBOL_GPL(dax_fault);

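/*
 * Illustrative sketch: a filesystem typically exposes dax_fault() through a
 * thin wrapper used in its vm_operations_struct (the "foo_" names are
 * hypothetical):
 *
 *	static int foo_dax_fault(struct vm_area_struct *vma,
 *				 struct vm_fault *vmf)
 *	{
 *		return dax_fault(vma, vmf, foo_get_block, NULL);
 *	}
 *
 * Passing NULL for @complete_unwritten is only valid for filesystems that
 * never return unwritten extents from their get_block() method.
 */
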
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
 * more often than one might expect in the below function.
 */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)

static void __dax_dbg(struct buffer_head *bh, unsigned long address,
		const char *reason, const char *fn)
{
	if (bh) {
		char bname[BDEVNAME_SIZE];
		bdevname(bh->b_bdev, bname);
		pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
			"length %zd fallback: %s\n", fn, current->comm,
			address, bname, bh->b_state, (u64)bh->b_blocknr,
			bh->b_size, reason);
	} else {
		pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
			current->comm, address, reason);
	}
}

#define dax_pmd_dbg(bh, address, reason)	__dax_dbg(bh, address, reason, "dax_pmd")

int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
		pmd_t *pmd, unsigned int flags, get_block_t get_block,
		dax_iodone_t complete_unwritten)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct buffer_head bh;
	unsigned blkbits = inode->i_blkbits;
	unsigned long pmd_addr = address & PMD_MASK;
	bool write = flags & FAULT_FLAG_WRITE;
	struct block_device *bdev;
	pgoff_t size, pgoff;
	sector_t block;
	int error, result = 0;
	bool alloc = false;

	/* dax pmd mappings require pfn_t_devmap() */
	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
		return VM_FAULT_FALLBACK;

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vma->vm_flags & VM_SHARED)) {
		split_huge_pmd(vma, pmd, address);
		dax_pmd_dbg(NULL, address, "cow write");
		return VM_FAULT_FALLBACK;
	}
	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vma->vm_start) {
		dax_pmd_dbg(NULL, address, "vma start unaligned");
		return VM_FAULT_FALLBACK;
	}
	if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
		dax_pmd_dbg(NULL, address, "vma end unaligned");
		return VM_FAULT_FALLBACK;
	}

	pgoff = linear_page_index(vma, pmd_addr);
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (pgoff >= size)
		return VM_FAULT_SIGBUS;
	/* If the PMD would cover blocks out of the file */
	if ((pgoff | PG_PMD_COLOUR) >= size) {
		dax_pmd_dbg(NULL, address,
				"offset + huge page size > file size");
		return VM_FAULT_FALLBACK;
	}

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;
	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);

	bh.b_size = PMD_SIZE;

	if (get_block(inode, block, &bh, 0) != 0)
		return VM_FAULT_SIGBUS;

	if (!buffer_mapped(&bh) && write) {
		if (get_block(inode, block, &bh, 1) != 0)
			return VM_FAULT_SIGBUS;
		alloc = true;
	}

	bdev = bh.b_bdev;

	/*
	 * If the filesystem isn't willing to tell us the length of a hole,
	 * just fall back to PTEs.  Calling get_block 512 times in a loop
	 * would be silly.
	 */
	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
		dax_pmd_dbg(&bh, address, "allocated block too small");
		return VM_FAULT_FALLBACK;
	}

	/*
	 * If we allocated new storage, make sure no process has any
	 * zero pages covering this hole
	 */
	if (alloc) {
		loff_t lstart = pgoff << PAGE_SHIFT;
		loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */

		truncate_pagecache_range(inode, lstart, lend);
	}

	i_mmap_lock_read(mapping);

	/*
	 * If a truncate happened while we were allocating blocks, we may
	 * leave blocks allocated to the file that are beyond EOF.  We can't
	 * take i_mutex here, so just leave them hanging; they'll be freed
	 * when the file is deleted.
	 */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (pgoff >= size) {
		result = VM_FAULT_SIGBUS;
		goto out;
	}
	if ((pgoff | PG_PMD_COLOUR) >= size) {
		dax_pmd_dbg(&bh, address,
				"offset + huge page size > file size");
		goto fallback;
	}

	if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
		spinlock_t *ptl;
		pmd_t entry;
		struct page *zero_page = get_huge_zero_page();

		if (unlikely(!zero_page)) {
			dax_pmd_dbg(&bh, address, "no zero page");
			goto fallback;
		}

		ptl = pmd_lock(vma->vm_mm, pmd);
		if (!pmd_none(*pmd)) {
			spin_unlock(ptl);
			dax_pmd_dbg(&bh, address, "pmd already present");
			goto fallback;
		}

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: <zero> sect: %llx\n",
				__func__, current->comm, address,
				(unsigned long long) to_sector(&bh, inode));

		entry = mk_pmd(zero_page, vma->vm_page_prot);
		entry = pmd_mkhuge(entry);
		set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
		result = VM_FAULT_NOPAGE;
		spin_unlock(ptl);
	} else {
		struct blk_dax_ctl dax = {
			.sector = to_sector(&bh, inode),
			.size = PMD_SIZE,
		};
		long length = dax_map_atomic(bdev, &dax);

		if (length < 0) {
			result = VM_FAULT_SIGBUS;
			goto out;
		}
		if (length < PMD_SIZE) {
			dax_pmd_dbg(&bh, address, "dax-length too small");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}
		if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
			dax_pmd_dbg(&bh, address, "pfn unaligned");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}

		if (!pfn_t_devmap(dax.pfn)) {
			dax_unmap_atomic(bdev, &dax);
			dax_pmd_dbg(&bh, address, "pfn not in memmap");
			goto fallback;
		}

		if (buffer_unwritten(&bh) || buffer_new(&bh)) {
			clear_pmem(dax.addr, PMD_SIZE);
			wmb_pmem();
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			result |= VM_FAULT_MAJOR;
		}
		dax_unmap_atomic(bdev, &dax);

		/*
		 * For PTE faults we insert a radix tree entry for reads, and
		 * leave it clean.  Then on the first write we dirty the radix
		 * tree entry via the dax_pfn_mkwrite() path.  This sequence
		 * allows the dax_pfn_mkwrite() call to be simpler and avoid a
		 * call into get_block() to translate the pgoff to a sector in
		 * order to be able to create a new radix tree entry.
		 *
		 * The PMD path doesn't have an equivalent to
		 * dax_pfn_mkwrite(), though, so for a read followed by a
		 * write we traverse all the way through __dax_pmd_fault()
		 * twice.  This means we can just skip inserting a radix tree
		 * entry completely on the initial read and just wait until
		 * the write to insert a dirty entry.
		 */
		if (write) {
			error = dax_radix_entry(mapping, pgoff, dax.sector,
					true, true);
			if (error) {
				dax_pmd_dbg(&bh, address,
						"PMD radix insertion failed");
				goto fallback;
			}
		}

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: %lx sect: %llx\n",
				__func__, current->comm, address,
				pfn_t_to_pfn(dax.pfn),
				(unsigned long long) dax.sector);
		result |= vmf_insert_pfn_pmd(vma, address, pmd,
				dax.pfn, write);
	}

out:
	i_mmap_unlock_read(mapping);

	if (buffer_unwritten(&bh))
		complete_unwritten(&bh, !(result & VM_FAULT_ERROR));

	return result;

fallback:
	count_vm_event(THP_FAULT_FALLBACK);
	result = VM_FAULT_FALLBACK;
	goto out;
}
EXPORT_SYMBOL_GPL(__dax_pmd_fault);

/**
 * dax_pmd_fault - handle a PMD fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @address: The faulting address
 * @pmd: Pointer to the PMD entry that covers @address
 * @flags: The fault flags (FAULT_FLAG_*)
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: The filesystem method used to convert unwritten blocks
 *	to written, as described for __dax_fault()
 *
 * When a page fault occurs, filesystems may call this helper in their
 * pmd_fault handler for DAX files.
 */
int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
			pmd_t *pmd, unsigned int flags, get_block_t get_block,
			dax_iodone_t complete_unwritten)
{
	int result;
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;

	if (flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(sb);
		file_update_time(vma->vm_file);
	}
	result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
				complete_unwritten);
	if (flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(sb);

	return result;
}
EXPORT_SYMBOL_GPL(dax_pmd_fault);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/**
 * dax_pfn_mkwrite - handle first write to DAX page
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 */
int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct file *file = vma->vm_file;

	/*
	 * We pass NO_SECTOR to dax_radix_entry() because we expect that a
	 * RADIX_DAX_PTE entry already exists in the radix tree from a
	 * previous call to __dax_fault().  We just want to look up that PTE
	 * entry using vmf->pgoff and make sure the dirty tag is set.  This
	 * saves us from having to make a call to get_block() here to look
	 * up the sector.
	 */
	dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);

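/*
 * Illustrative sketch: tying the fault entry points together in a
 * filesystem's DAX vm_operations_struct (the "foo_" names are hypothetical
 * wrappers around dax_fault(), dax_pmd_fault() and dax_pfn_mkwrite()):
 *
 *	static const struct vm_operations_struct foo_dax_vm_ops = {
 *		.fault		= foo_dax_fault,
 *		.pmd_fault	= foo_dax_pmd_fault,
 *		.pfn_mkwrite	= foo_dax_pfn_mkwrite,
 *	};
 *
 * The filesystem's ->mmap method would then install these ops on the VMA and
 * set VM_MIXEDMAP (plus VM_HUGEPAGE if PMD faults are supported).
 */
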
/**
 * dax_zero_page_range - zero a range within a page of a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @length: The number of bytes to zero
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * This function can be called by a filesystem when it is zeroing part of a
 * page in a DAX file.  This is intended for hole-punch operations.  If
 * you are truncating a file, the helper function dax_truncate_page() may be
 * more convenient.
 *
 * We work in terms of PAGE_CACHE_SIZE here for commonality with
 * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
 * took care of disposing of the unnecessary blocks.  Even if the filesystem
 * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
 * since the file might be mmapped.
 */
int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
							get_block_t get_block)
{
	struct buffer_head bh;
	pgoff_t index = from >> PAGE_CACHE_SHIFT;
	unsigned offset = from & (PAGE_CACHE_SIZE-1);
	int err;

	/* Block boundary? Nothing to do */
	if (!length)
		return 0;
	BUG_ON((offset + length) > PAGE_CACHE_SIZE);

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;
	bh.b_size = PAGE_CACHE_SIZE;
	err = get_block(inode, index, &bh, 0);
	if (err < 0)
		return err;
	if (buffer_written(&bh)) {
		struct block_device *bdev = bh.b_bdev;
		struct blk_dax_ctl dax = {
			.sector = to_sector(&bh, inode),
			.size = PAGE_CACHE_SIZE,
		};

		if (dax_map_atomic(bdev, &dax) < 0)
			return PTR_ERR(dax.addr);
		clear_pmem(dax.addr + offset, length);
		wmb_pmem();
		dax_unmap_atomic(bdev, &dax);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(dax_zero_page_range);

/**
 * dax_truncate_page - handle a partial page being truncated in a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * Similar to block_truncate_page(), this function can be called by a
 * filesystem when it is truncating a DAX file to handle the partial page.
 *
 * We work in terms of PAGE_CACHE_SIZE here for commonality with
 * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
 * took care of disposing of the unnecessary blocks.  Even if the filesystem
 * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
 * since the file might be mmapped.
 */
int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
{
	unsigned length = PAGE_CACHE_ALIGN(from) - from;
	return dax_zero_page_range(inode, from, length, get_block);
}
EXPORT_SYMBOL_GPL(dax_truncate_page);
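
/*
 * Illustrative sketch: a filesystem shrinking a DAX file would zero the tail
 * of the new last page before updating i_size (the "foo_" names are
 * hypothetical):
 *
 *	static int foo_setsize(struct inode *inode, loff_t newsize)
 *	{
 *		int err;
 *
 *		if (IS_DAX(inode))
 *			err = dax_truncate_page(inode, newsize, foo_get_block);
 *		else
 *			err = block_truncate_page(inode->i_mapping, newsize,
 *						foo_get_block);
 *		if (err)
 *			return err;
 *		truncate_setsize(inode, newsize);
 *		return 0;
 *	}
 *
 * For hole punching, dax_zero_page_range() would instead be used on the
 * partial pages at each end of the punched range.
 */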