/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>
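
/*
 * dax_map_atomic() pins the block device by entering its request queue and
 * translates the (sector, size) range in @dax into a kernel-virtual pmem
 * address and pfn via bdev_direct_access().  It returns the number of
 * contiguous bytes accessible at dax->addr, or a negative errno (with
 * dax->addr set to an ERR_PTR) on failure.  Every successful call must be
 * paired with dax_unmap_atomic(), which drops the queue reference.
 */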
static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
{
	struct request_queue *q = bdev->bd_queue;
	long rc = -EIO;

	dax->addr = (void __pmem *) ERR_PTR(-EIO);
	if (blk_queue_enter(q, true) != 0)
		return rc;

	rc = bdev_direct_access(bdev, dax);
	if (rc < 0) {
		dax->addr = (void __pmem *) ERR_PTR(rc);
		blk_queue_exit(q);
		return rc;
	}
	return rc;
}

static void dax_unmap_atomic(struct block_device *bdev,
		const struct blk_dax_ctl *dax)
{
	if (IS_ERR(dax->addr))
		return;
	blk_queue_exit(bdev->bd_queue);
}

/*
 * dax_clear_blocks() is called from within transaction context from XFS,
 * and hence this means the stack from this point must follow GFP_NOFS
 * semantics for all operations.
 */
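/*
 * @block is the starting block on the inode's block device, expressed in
 * units of the inode's block size (it is converted to a 512-byte sector
 * below), and @_size is the number of bytes to zero.
 */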
int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
{
	struct block_device *bdev = inode->i_sb->s_bdev;
	struct blk_dax_ctl dax = {
		.sector = block << (inode->i_blkbits - 9),
		.size = _size,
	};

	might_sleep();
	do {
		long count, sz;

		count = dax_map_atomic(bdev, &dax);
		if (count < 0)
			return count;
		sz = min_t(long, count, SZ_128K);
		clear_pmem(dax.addr, sz);
		dax.size -= sz;
		dax.sector += sz / 512;
		dax_unmap_atomic(bdev, &dax);
		cond_resched();
	} while (dax.size);

	wmb_pmem();
	return 0;
}
EXPORT_SYMBOL_GPL(dax_clear_blocks);

/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
		loff_t pos, loff_t end)
{
	loff_t final = end - pos + first; /* The final byte of the buffer */

	if (first > 0)
		clear_pmem(addr, first);
	if (final < size)
		clear_pmem(addr + final, size - final);
}

static bool buffer_written(struct buffer_head *bh)
{
	return buffer_mapped(bh) && !buffer_unwritten(bh);
}

/*
 * When ext4 encounters a hole, it returns without modifying the buffer_head
 * which means that we can't trust b_size.  To cope with this, we set b_state
 * to 0 before calling get_block and, if any bit is set, we know we can trust
 * b_size.  Unfortunate, really, since ext4 knows precisely how long a hole is
 * and would save us time calling get_block repeatedly.
 */
static bool buffer_size_valid(struct buffer_head *bh)
{
	return bh->b_state != 0;
}

static sector_t to_sector(const struct buffer_head *bh,
		const struct inode *inode)
{
	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);

	return sector;
}

static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
		      loff_t start, loff_t end, get_block_t get_block,
		      struct buffer_head *bh)
{
	loff_t pos = start, max = start, bh_max = start;
	bool hole = false, need_wmb = false;
	struct block_device *bdev = NULL;
	int rw = iov_iter_rw(iter), rc = 0;
	long map_len = 0;
	struct blk_dax_ctl dax = {
		.addr = (void __pmem *) ERR_PTR(-EIO),
	};

	if (rw == READ)
		end = min(end, i_size_read(inode));

	while (pos < end) {
		size_t len;
		if (pos == max) {
			unsigned blkbits = inode->i_blkbits;
			long page = pos >> PAGE_SHIFT;
			sector_t block = page << (PAGE_SHIFT - blkbits);
			unsigned first = pos - (block << blkbits);
			long size;

			if (pos == bh_max) {
				bh->b_size = PAGE_ALIGN(end - pos);
				bh->b_state = 0;
				rc = get_block(inode, block, bh, rw == WRITE);
				if (rc)
					break;
				if (!buffer_size_valid(bh))
					bh->b_size = 1 << blkbits;
				bh_max = pos - first + bh->b_size;
				bdev = bh->b_bdev;
			} else {
				unsigned done = bh->b_size -
						(bh_max - (pos - first));
				bh->b_blocknr += done >> blkbits;
				bh->b_size -= done;
			}

			hole = rw == READ && !buffer_written(bh);
			if (hole) {
				size = bh->b_size - first;
			} else {
				dax_unmap_atomic(bdev, &dax);
				dax.sector = to_sector(bh, inode);
				dax.size = bh->b_size;
				map_len = dax_map_atomic(bdev, &dax);
				if (map_len < 0) {
					rc = map_len;
					break;
				}
				if (buffer_unwritten(bh) || buffer_new(bh)) {
					dax_new_buf(dax.addr, map_len, first,
							pos, end);
					need_wmb = true;
				}
				dax.addr += first;
				size = map_len - first;
			}
			max = min(pos + size, end);
		}

		if (iov_iter_rw(iter) == WRITE) {
			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
			need_wmb = true;
		} else if (!hole)
			len = copy_to_iter((void __force *) dax.addr,
					max - pos, iter);
		else
			len = iov_iter_zero(max - pos, iter);

		if (!len) {
			rc = -EFAULT;
			break;
		}

		pos += len;
		if (!IS_ERR(dax.addr))
			dax.addr += len;
	}

	if (need_wmb)
		wmb_pmem();
	dax_unmap_atomic(bdev, &dax);

	return (pos == start) ? rc : pos - start;
}

/**
 * dax_do_io - Perform I/O to a DAX file
 * @iocb: The control block for this I/O
 * @inode: The file which the I/O is directed at
 * @iter: The addresses to do I/O from or to
 * @pos: The file offset where the I/O starts
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @end_io: A filesystem callback for I/O completion
 * @flags: See below
 *
 * This function uses the same locking scheme as do_blockdev_direct_IO:
 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
 * caller for writes.  For reads, we take and release the i_mutex ourselves.
 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
 * is in progress.
 */
ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
		  struct iov_iter *iter, loff_t pos, get_block_t get_block,
		  dio_iodone_t end_io, int flags)
{
	struct buffer_head bh;
	ssize_t retval = -EINVAL;
	loff_t end = pos + iov_iter_count(iter);

	memset(&bh, 0, sizeof(bh));

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
		struct address_space *mapping = inode->i_mapping;
		inode_lock(inode);
		retval = filemap_write_and_wait_range(mapping, pos, end - 1);
		if (retval) {
			inode_unlock(inode);
			goto out;
		}
	}

	/* Protects against truncate */
	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_begin(inode);

	retval = dax_io(inode, iter, pos, end, get_block, &bh);

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
		inode_unlock(inode);

	if ((retval > 0) && end_io)
		end_io(iocb, pos, retval, bh.b_private);

	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_end(inode);
 out:
	return retval;
}
EXPORT_SYMBOL_GPL(dax_do_io);
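
/*
 * Typical use: a filesystem's ->direct_IO method dispatches to dax_do_io()
 * for DAX inodes.  A minimal sketch of the calling convention; the myfs_*
 * names are hypothetical and not part of this file's API:
 *
 *	static ssize_t myfs_direct_IO(struct kiocb *iocb,
 *			struct iov_iter *iter, loff_t offset)
 *	{
 *		struct inode *inode = file_inode(iocb->ki_filp);
 *
 *		if (IS_DAX(inode))
 *			return dax_do_io(iocb, inode, iter, offset,
 *					myfs_get_block, NULL, DIO_LOCKING);
 *		return myfs_blockdev_direct_IO(iocb, iter, offset);
 *	}
 */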

/*
 * The user has performed a load from a hole in the file.  Allocating
 * a new page in the file would cause excessive storage usage for
 * workloads with sparse files.  We allocate a page cache page instead.
 * We'll kick it out of the page cache if it's ever written to,
 * otherwise it will simply fall out of the page cache under memory
 * pressure without ever having been dirtied.
 */
static int dax_load_hole(struct address_space *mapping, struct page *page,
		struct vm_fault *vmf)
{
	unsigned long size;
	struct inode *inode = mapping->host;
	if (!page)
		page = find_or_create_page(mapping, vmf->pgoff,
						GFP_KERNEL | __GFP_ZERO);
	if (!page)
		return VM_FAULT_OOM;
	/* Recheck i_size under page lock to avoid truncate race */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (vmf->pgoff >= size) {
		unlock_page(page);
		page_cache_release(page);
		return VM_FAULT_SIGBUS;
	}

	vmf->page = page;
	return VM_FAULT_LOCKED;
}

static int copy_user_bh(struct page *to, struct inode *inode,
		struct buffer_head *bh, unsigned long vaddr)
{
	struct blk_dax_ctl dax = {
		.sector = to_sector(bh, inode),
		.size = bh->b_size,
	};
	struct block_device *bdev = bh->b_bdev;
	void *vto;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	vto = kmap_atomic(to);
	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
	kunmap_atomic(vto);
	dax_unmap_atomic(bdev, &dax);
	return 0;
}

static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
			struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	struct address_space *mapping = inode->i_mapping;
	struct block_device *bdev = bh->b_bdev;
	struct blk_dax_ctl dax = {
		.sector = to_sector(bh, inode),
		.size = bh->b_size,
	};
	pgoff_t size;
	int error;

	i_mmap_lock_read(mapping);

	/*
	 * Check truncate didn't happen while we were allocating a block.
	 * If it did, this block may or may not be still allocated to the
	 * file.  We can't tell the filesystem to free it because we can't
	 * take i_mutex here.  In the worst case, the file still has blocks
	 * allocated past the end of the file.
	 */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (unlikely(vmf->pgoff >= size)) {
		error = -EIO;
		goto out;
	}

	if (dax_map_atomic(bdev, &dax) < 0) {
		error = PTR_ERR(dax.addr);
		goto out;
	}

	if (buffer_unwritten(bh) || buffer_new(bh)) {
		clear_pmem(dax.addr, PAGE_SIZE);
		wmb_pmem();
	}
	dax_unmap_atomic(bdev, &dax);

	error = vm_insert_mixed(vma, vaddr, dax.pfn);

 out:
	i_mmap_unlock_read(mapping);

	return error;
}

/**
 * __dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: The filesystem method used to convert unwritten blocks
 *	to written so the data written to them is exposed.  This is required
 *	by write faults for filesystems that will return unwritten extent
 *	mappings from @get_block, but it is optional for reads as
 *	dax_insert_mapping() will always zero unwritten blocks.  If the fs
 *	does not support unwritten extents, it should pass NULL.
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files. __dax_fault() assumes the caller has done all
 * the necessary locking for the page fault to proceed successfully.
 */
int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
			get_block_t get_block, dax_iodone_t complete_unwritten)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct page *page;
	struct buffer_head bh;
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	unsigned blkbits = inode->i_blkbits;
	sector_t block;
	pgoff_t size;
	int error;
	int major = 0;

	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (vmf->pgoff >= size)
		return VM_FAULT_SIGBUS;

	memset(&bh, 0, sizeof(bh));
	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
	bh.b_size = PAGE_SIZE;

 repeat:
	page = find_get_page(mapping, vmf->pgoff);
	if (page) {
		if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
			page_cache_release(page);
			return VM_FAULT_RETRY;
		}
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			page_cache_release(page);
			goto repeat;
		}
		size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (unlikely(vmf->pgoff >= size)) {
			/*
			 * We have a struct page covering a hole in the file
			 * from a read fault and we've raced with a truncate
			 */
			error = -EIO;
			goto unlock_page;
		}
	}

	error = get_block(inode, block, &bh, 0);
	if (!error && (bh.b_size < PAGE_SIZE))
		error = -EIO;		/* fs corruption? */
	if (error)
		goto unlock_page;

	if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
		if (vmf->flags & FAULT_FLAG_WRITE) {
			error = get_block(inode, block, &bh, 1);
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			major = VM_FAULT_MAJOR;
			if (!error && (bh.b_size < PAGE_SIZE))
				error = -EIO;
			if (error)
				goto unlock_page;
		} else {
			return dax_load_hole(mapping, page, vmf);
		}
	}

	if (vmf->cow_page) {
		struct page *new_page = vmf->cow_page;
		if (buffer_written(&bh))
			error = copy_user_bh(new_page, inode, &bh, vaddr);
		else
			clear_user_highpage(new_page, vaddr);
		if (error)
			goto unlock_page;
		vmf->page = page;
		if (!page) {
			i_mmap_lock_read(mapping);
			/* Check we didn't race with truncate */
			size = (i_size_read(inode) + PAGE_SIZE - 1) >>
								PAGE_SHIFT;
			if (vmf->pgoff >= size) {
				i_mmap_unlock_read(mapping);
				error = -EIO;
				goto out;
			}
		}
		return VM_FAULT_LOCKED;
	}

	/* Check we didn't race with a read fault installing a new page */
	if (!page && major)
		page = find_lock_page(mapping, vmf->pgoff);

	if (page) {
		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
							PAGE_CACHE_SIZE, 0);
		delete_from_page_cache(page);
		unlock_page(page);
		page_cache_release(page);
	}

	/*
	 * If we successfully insert the new mapping over an unwritten extent,
	 * we need to ensure we convert the unwritten extent.  If there is an
	 * error inserting the mapping, the filesystem needs to leave it as
	 * unwritten to prevent exposure of the stale underlying data to
	 * userspace, but we still need to call the completion function so
	 * the private resources on the mapping buffer can be released.  We
	 * indicate what the callback should do via the uptodate variable,
	 * same as for normal BH based IO completions.
	 */
	error = dax_insert_mapping(inode, &bh, vma, vmf);
	if (buffer_unwritten(&bh)) {
		if (complete_unwritten)
			complete_unwritten(&bh, !error);
		else
			WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
	}

 out:
	if (error == -ENOMEM)
		return VM_FAULT_OOM | major;
	/* -EBUSY is fine, somebody else faulted on the same PTE */
	if ((error < 0) && (error != -EBUSY))
		return VM_FAULT_SIGBUS | major;
	return VM_FAULT_NOPAGE | major;

 unlock_page:
	if (page) {
		unlock_page(page);
		page_cache_release(page);
	}
	goto out;
}
EXPORT_SYMBOL(__dax_fault);

/**
 * dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: The filesystem method used to convert unwritten blocks
 *	to written so the data written to them is exposed.  See __dax_fault().
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files.
 */
int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
	      get_block_t get_block, dax_iodone_t complete_unwritten)
{
	int result;
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;

	if (vmf->flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(sb);
		file_update_time(vma->vm_file);
	}
	result = __dax_fault(vma, vmf, get_block, complete_unwritten);
	if (vmf->flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(sb);

	return result;
}
EXPORT_SYMBOL_GPL(dax_fault);
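
/*
 * A filesystem typically wraps dax_fault() in a thin ->fault handler that
 * supplies its own get_block callback.  A minimal sketch with hypothetical
 * myfs_* names; passing NULL for complete_unwritten assumes the filesystem
 * never returns unwritten extent mappings:
 *
 *	static int myfs_dax_fault(struct vm_area_struct *vma,
 *			struct vm_fault *vmf)
 *	{
 *		return dax_fault(vma, vmf, myfs_get_block, NULL);
 *	}
 */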

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
 * more often than one might expect in the below function.
 */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)

static void __dax_dbg(struct buffer_head *bh, unsigned long address,
		const char *reason, const char *fn)
{
	if (bh) {
		char bname[BDEVNAME_SIZE];
		bdevname(bh->b_bdev, bname);
		pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
			"length %zd fallback: %s\n", fn, current->comm,
			address, bname, bh->b_state, (u64)bh->b_blocknr,
			bh->b_size, reason);
	} else {
		pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
			current->comm, address, reason);
	}
}

#define dax_pmd_dbg(bh, address, reason)	__dax_dbg(bh, address, reason, "dax_pmd")

int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
		pmd_t *pmd, unsigned int flags, get_block_t get_block,
		dax_iodone_t complete_unwritten)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct buffer_head bh;
	unsigned blkbits = inode->i_blkbits;
	unsigned long pmd_addr = address & PMD_MASK;
	bool write = flags & FAULT_FLAG_WRITE;
	struct block_device *bdev;
	pgoff_t size, pgoff;
	sector_t block;
	int result = 0;

	/* dax pmd mappings require pfn_t_devmap() */
	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
		return VM_FAULT_FALLBACK;

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vma->vm_flags & VM_SHARED)) {
		split_huge_pmd(vma, pmd, address);
		dax_pmd_dbg(NULL, address, "cow write");
		return VM_FAULT_FALLBACK;
	}
	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vma->vm_start) {
		dax_pmd_dbg(NULL, address, "vma start unaligned");
		return VM_FAULT_FALLBACK;
	}
	if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
		dax_pmd_dbg(NULL, address, "vma end unaligned");
		return VM_FAULT_FALLBACK;
	}

	pgoff = linear_page_index(vma, pmd_addr);
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (pgoff >= size)
		return VM_FAULT_SIGBUS;
	/* If the PMD would cover blocks out of the file */
	if ((pgoff | PG_PMD_COLOUR) >= size) {
		dax_pmd_dbg(NULL, address,
				"offset + huge page size > file size");
		return VM_FAULT_FALLBACK;
	}

	memset(&bh, 0, sizeof(bh));
	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);

	bh.b_size = PMD_SIZE;
	if (get_block(inode, block, &bh, write) != 0)
		return VM_FAULT_SIGBUS;
	bdev = bh.b_bdev;
	i_mmap_lock_read(mapping);

	/*
	 * If the filesystem isn't willing to tell us the length of a hole,
	 * just fall back to PTEs.  Calling get_block 512 times in a loop
	 * would be silly.
	 */
	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
		dax_pmd_dbg(&bh, address, "allocated block too small");
		goto fallback;
	}

	/*
	 * If we allocated new storage, make sure no process has any
	 * zero pages covering this hole
	 */
	if (buffer_new(&bh)) {
		i_mmap_unlock_read(mapping);
		unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
		i_mmap_lock_read(mapping);
	}

	/*
	 * If a truncate happened while we were allocating blocks, we may
	 * leave blocks allocated to the file that are beyond EOF.  We can't
	 * take i_mutex here, so just leave them hanging; they'll be freed
	 * when the file is deleted.
	 */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (pgoff >= size) {
		result = VM_FAULT_SIGBUS;
		goto out;
	}
	if ((pgoff | PG_PMD_COLOUR) >= size) {
		dax_pmd_dbg(&bh, address, "pgoff unaligned");
		goto fallback;
	}
	if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
		spinlock_t *ptl;
		pmd_t entry;
		struct page *zero_page = get_huge_zero_page();

		if (unlikely(!zero_page)) {
			dax_pmd_dbg(&bh, address, "no zero page");
			goto fallback;
		}

		ptl = pmd_lock(vma->vm_mm, pmd);
		if (!pmd_none(*pmd)) {
			spin_unlock(ptl);
			dax_pmd_dbg(&bh, address, "pmd already present");
			goto fallback;
		}

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: <zero> sect: %llx\n",
				__func__, current->comm, address,
				(unsigned long long) to_sector(&bh, inode));

		entry = mk_pmd(zero_page, vma->vm_page_prot);
		entry = pmd_mkhuge(entry);
		set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
		result = VM_FAULT_NOPAGE;
		spin_unlock(ptl);
	} else {
		struct blk_dax_ctl dax = {
			.sector = to_sector(&bh, inode),
			.size = PMD_SIZE,
		};
		long length = dax_map_atomic(bdev, &dax);

		if (length < 0) {
			result = VM_FAULT_SIGBUS;
			goto out;
		}
		if (length < PMD_SIZE) {
			dax_pmd_dbg(&bh, address, "dax-length too small");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}
		if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
			dax_pmd_dbg(&bh, address, "pfn unaligned");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}

		if (!pfn_t_devmap(dax.pfn)) {
			dax_unmap_atomic(bdev, &dax);
			dax_pmd_dbg(&bh, address, "pfn not in memmap");
			goto fallback;
		}

		if (buffer_unwritten(&bh) || buffer_new(&bh)) {
			clear_pmem(dax.addr, PMD_SIZE);
			wmb_pmem();
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			result |= VM_FAULT_MAJOR;
		}
		dax_unmap_atomic(bdev, &dax);

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: %lx sect: %llx\n",
				__func__, current->comm, address,
				pfn_t_to_pfn(dax.pfn),
				(unsigned long long) dax.sector);
		result |= vmf_insert_pfn_pmd(vma, address, pmd,
				dax.pfn, write);
	}

 out:
	i_mmap_unlock_read(mapping);

	if (buffer_unwritten(&bh))
		complete_unwritten(&bh, !(result & VM_FAULT_ERROR));

	return result;

 fallback:
	count_vm_event(THP_FAULT_FALLBACK);
	result = VM_FAULT_FALLBACK;
	goto out;
}
EXPORT_SYMBOL_GPL(__dax_pmd_fault);

/**
 * dax_pmd_fault - handle a PMD fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @address: The virtual address that faulted
 * @pmd: Pointer to the PMD entry in the page tables
 * @flags: Fault flags (e.g. FAULT_FLAG_WRITE)
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: The filesystem method used to convert unwritten blocks
 *	to written.  See __dax_fault().
 *
 * When a page fault occurs, filesystems may call this helper in their
 * pmd_fault handler for DAX files.
 */
int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
			pmd_t *pmd, unsigned int flags, get_block_t get_block,
			dax_iodone_t complete_unwritten)
{
	int result;
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;

	if (flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(sb);
		file_update_time(vma->vm_file);
	}
	result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
				complete_unwritten);
	if (flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(sb);

	return result;
}
EXPORT_SYMBOL_GPL(dax_pmd_fault);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/**
 * dax_pfn_mkwrite - handle first write to DAX page
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 */
int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;

	sb_start_pagefault(sb);
	file_update_time(vma->vm_file);
	sb_end_pagefault(sb);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
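
/*
 * Putting the fault helpers together: a DAX-aware ->mmap implementation
 * usually points its vm_operations_struct at thin wrappers that forward to
 * dax_fault(), dax_pmd_fault() and dax_pfn_mkwrite() with the filesystem's
 * get_block callback.  A sketch; the myfs_dax_* wrappers are hypothetical
 * (see the example after dax_fault() above):
 *
 *	static const struct vm_operations_struct myfs_dax_vm_ops = {
 *		.fault		= myfs_dax_fault,
 *		.pmd_fault	= myfs_dax_pmd_fault,
 *		.page_mkwrite	= myfs_dax_fault,
 *		.pfn_mkwrite	= myfs_dax_pfn_mkwrite,
 *	};
 */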

/**
 * dax_zero_page_range - zero a range within a page of a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @length: The number of bytes to zero
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * This function can be called by a filesystem when it is zeroing part of a
 * page in a DAX file.  This is intended for hole-punch operations.  If
 * you are truncating a file, the helper function dax_truncate_page() may be
 * more convenient.
 *
 * We work in terms of PAGE_CACHE_SIZE here for commonality with
 * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
 * took care of disposing of the unnecessary blocks.  Even if the filesystem
 * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
 * since the file might be mmapped.
 */
int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
							get_block_t get_block)
{
	struct buffer_head bh;
	pgoff_t index = from >> PAGE_CACHE_SHIFT;
	unsigned offset = from & (PAGE_CACHE_SIZE-1);
	int err;

	/* Block boundary? Nothing to do */
	if (!length)
		return 0;
	BUG_ON((offset + length) > PAGE_CACHE_SIZE);

	memset(&bh, 0, sizeof(bh));
	bh.b_size = PAGE_CACHE_SIZE;
	err = get_block(inode, index, &bh, 0);
	if (err < 0)
		return err;
	if (buffer_written(&bh)) {
		struct block_device *bdev = bh.b_bdev;
		struct blk_dax_ctl dax = {
			.sector = to_sector(&bh, inode),
			.size = PAGE_CACHE_SIZE,
		};

		if (dax_map_atomic(bdev, &dax) < 0)
			return PTR_ERR(dax.addr);
		clear_pmem(dax.addr + offset, length);
		wmb_pmem();
		dax_unmap_atomic(bdev, &dax);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(dax_zero_page_range);

/**
 * dax_truncate_page - handle a partial page being truncated in a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * Similar to block_truncate_page(), this function can be called by a
 * filesystem when it is truncating a DAX file to handle the partial page.
 *
 * We work in terms of PAGE_CACHE_SIZE here for commonality with
 * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
 * took care of disposing of the unnecessary blocks.  Even if the filesystem
 * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
 * since the file might be mmapped.
 */
int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
{
	unsigned length = PAGE_CACHE_ALIGN(from) - from;
	return dax_zero_page_range(inode, from, length, get_block);
}
EXPORT_SYMBOL_GPL(dax_truncate_page);
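
/*
 * Typical use: a filesystem's truncate path zeroes the tail of the last
 * remaining page before shrinking i_size, mirroring block_truncate_page().
 * A minimal sketch with a hypothetical myfs_setsize() helper and
 * myfs_get_block callback:
 *
 *	static int myfs_setsize(struct inode *inode, loff_t newsize)
 *	{
 *		int error;
 *
 *		if (IS_DAX(inode))
 *			error = dax_truncate_page(inode, newsize,
 *					myfs_get_block);
 *		else
 *			error = block_truncate_page(inode->i_mapping,
 *					newsize, myfs_get_block);
 *		if (error)
 *			return error;
 *		truncate_setsize(inode, newsize);
 *		return 0;
 *	}
 */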