/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */
#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>
#include <linux/iomap.h>
#include "internal.h"

/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)

static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];

static int __init init_dax_wait_table(void)
{
	int i;

	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
		init_waitqueue_head(wait_table + i);
	return 0;
}
fs_initcall(init_dax_wait_table);

static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
{
	struct request_queue *q = bdev->bd_queue;
	long rc = -EIO;

	dax->addr = ERR_PTR(-EIO);
	if (blk_queue_enter(q, true) != 0)
		return rc;

	rc = bdev_direct_access(bdev, dax);
	if (rc < 0) {
		dax->addr = ERR_PTR(rc);
		blk_queue_exit(q);
		return rc;
	}
	return rc;
}

static void dax_unmap_atomic(struct block_device *bdev,
		const struct blk_dax_ctl *dax)
{
	if (IS_ERR(dax->addr))
		return;
	blk_queue_exit(bdev->bd_queue);
}
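
/*
 * Illustrative pattern (editor's sketch, not upstream code): every
 * successful dax_map_atomic() must be paired with dax_unmap_atomic() so the
 * request_queue reference taken by blk_queue_enter() is dropped again.
 * read_dax_sector() below follows exactly this shape:
 *
 *	struct blk_dax_ctl dax = { .sector = sector, .size = PAGE_SIZE };
 *
 *	if (dax_map_atomic(bdev, &dax) < 0)
 *		return PTR_ERR(dax.addr);
 *	memcpy_from_pmem(buf, dax.addr, PAGE_SIZE);	// buf: caller buffer
 *	dax_unmap_atomic(bdev, &dax);
 */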

static int dax_is_pmd_entry(void *entry)
{
	return (unsigned long)entry & RADIX_DAX_PMD;
}

static int dax_is_pte_entry(void *entry)
{
	return !((unsigned long)entry & RADIX_DAX_PMD);
}

static int dax_is_zero_entry(void *entry)
{
	return (unsigned long)entry & RADIX_DAX_HZP;
}

static int dax_is_empty_entry(void *entry)
{
	return (unsigned long)entry & RADIX_DAX_EMPTY;
}

struct page *read_dax_sector(struct block_device *bdev, sector_t n)
{
	struct page *page = alloc_pages(GFP_KERNEL, 0);
	struct blk_dax_ctl dax = {
		.size = PAGE_SIZE,
		.sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
	};
	long rc;

	if (!page)
		return ERR_PTR(-ENOMEM);

	rc = dax_map_atomic(bdev, &dax);
	if (rc < 0)
		return ERR_PTR(rc);
	memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
	dax_unmap_atomic(bdev, &dax);
	return page;
}

static bool buffer_written(struct buffer_head *bh)
{
	return buffer_mapped(bh) && !buffer_unwritten(bh);
}

static sector_t to_sector(const struct buffer_head *bh,
		const struct inode *inode)
{
	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);

	return sector;
}

static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
		      loff_t start, loff_t end, get_block_t get_block,
		      struct buffer_head *bh)
{
	loff_t pos = start, max = start, bh_max = start;
	bool hole = false;
	struct block_device *bdev = NULL;
	int rw = iov_iter_rw(iter), rc;
	long map_len = 0;
	struct blk_dax_ctl dax = {
		.addr = ERR_PTR(-EIO),
	};
	unsigned blkbits = inode->i_blkbits;
	sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
								>> blkbits;

	if (rw == READ)
		end = min(end, i_size_read(inode));

	while (pos < end) {
		size_t len;
		if (pos == max) {
			long page = pos >> PAGE_SHIFT;
			sector_t block = page << (PAGE_SHIFT - blkbits);
			unsigned first = pos - (block << blkbits);
			long size;

			if (pos == bh_max) {
				bh->b_size = PAGE_ALIGN(end - pos);
				bh->b_state = 0;
				rc = get_block(inode, block, bh, rw == WRITE);
				if (rc)
					break;
				bh_max = pos - first + bh->b_size;
				bdev = bh->b_bdev;
				/*
				 * We allow uninitialized buffers for writes
				 * beyond EOF as those cannot race with faults
				 */
				WARN_ON_ONCE(
					(buffer_new(bh) && block < file_blks) ||
					(rw == WRITE && buffer_unwritten(bh)));
			} else {
				unsigned done = bh->b_size -
						(bh_max - (pos - first));
				bh->b_blocknr += done >> blkbits;
				bh->b_size -= done;
			}

			hole = rw == READ && !buffer_written(bh);
			if (hole) {
				size = bh->b_size - first;
			} else {
				dax_unmap_atomic(bdev, &dax);
				dax.sector = to_sector(bh, inode);
				dax.size = bh->b_size;
				map_len = dax_map_atomic(bdev, &dax);
				if (map_len < 0) {
					rc = map_len;
					break;
				}
				dax.addr += first;
				size = map_len - first;
			}
			/*
			 * pos + size is one past the last offset for IO,
			 * so pos + size can overflow loff_t at extreme offsets.
			 * Cast to u64 to catch this and get the true minimum.
			 */
			max = min_t(u64, pos + size, end);
		}

		if (iov_iter_rw(iter) == WRITE) {
			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
		} else if (!hole)
			len = copy_to_iter((void __force *) dax.addr, max - pos,
					iter);
		else
			len = iov_iter_zero(max - pos, iter);

		if (!len) {
			rc = -EFAULT;
			break;
		}

		pos += len;
		if (!IS_ERR(dax.addr))
			dax.addr += len;
	}

	dax_unmap_atomic(bdev, &dax);

	return (pos == start) ? rc : pos - start;
}

/**
 * dax_do_io - Perform I/O to a DAX file
 * @iocb: The control block for this I/O
 * @inode: The file which the I/O is directed at
 * @iter: The addresses to do I/O from or to
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @end_io: A filesystem callback for I/O completion
 * @flags: See below
 *
 * This function uses the same locking scheme as do_blockdev_direct_IO:
 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
 * caller for writes.  For reads, we take and release the i_mutex ourselves.
 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
 * is in progress.
 */
ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
		  struct iov_iter *iter, get_block_t get_block,
		  dio_iodone_t end_io, int flags)
{
	struct buffer_head bh;
	ssize_t retval = -EINVAL;
	loff_t pos = iocb->ki_pos;
	loff_t end = pos + iov_iter_count(iter);

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
		inode_lock(inode);

	/* Protects against truncate */
	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_begin(inode);

	retval = dax_io(inode, iter, pos, end, get_block, &bh);

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
		inode_unlock(inode);

	if (end_io) {
		int err;

		err = end_io(iocb, pos, retval, bh.b_private);
		if (err)
			retval = err;
	}

	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_end(inode);
	return retval;
}
EXPORT_SYMBOL_GPL(dax_do_io);
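
/*
 * Illustrative usage (editor's sketch, not part of this file): a
 * filesystem's ->direct_IO method for a DAX inode can wrap this helper
 * roughly as follows, where fs_get_block() stands in for that filesystem's
 * own get_block_t callback:
 *
 *	static ssize_t fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 *	{
 *		struct inode *inode = file_inode(iocb->ki_filp);
 *
 *		return dax_do_io(iocb, inode, iter, fs_get_block,
 *				 NULL, DIO_LOCKING);
 *	}
 */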

/*
 * DAX radix tree locking
 */
struct exceptional_entry_key {
	struct address_space *mapping;
	pgoff_t entry_start;
};

struct wait_exceptional_entry_queue {
	wait_queue_t wait;
	struct exceptional_entry_key key;
};

static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
		pgoff_t index, void *entry, struct exceptional_entry_key *key)
{
	unsigned long hash;

	/*
	 * If 'entry' is a PMD, align the 'index' that we use for the wait
	 * queue to the start of that PMD.  This ensures that all offsets in
	 * the range covered by the PMD map to the same bit lock.
	 */
	if (dax_is_pmd_entry(entry))
		index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);

	key->mapping = mapping;
	key->entry_start = index;

	hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS);
	return wait_table + hash;
}
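
/*
 * Worked example of the index alignment above (editor's note), assuming
 * 4KiB pages and 2MiB PMDs (PAGE_SHIFT == 12, PMD_SHIFT == 21, i.e. 512
 * pages per PMD): for a PMD entry, indices 512-1023 all collapse to
 * entry_start == 512, so every waiter anywhere in that 2MiB range hashes
 * to the same wait queue and sees the same wake-up key.
 */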

static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
				       int sync, void *keyp)
{
	struct exceptional_entry_key *key = keyp;
	struct wait_exceptional_entry_queue *ewait =
		container_of(wait, struct wait_exceptional_entry_queue, wait);

	if (key->mapping != ewait->key.mapping ||
	    key->entry_start != ewait->key.entry_start)
		return 0;
	return autoremove_wake_function(wait, mode, sync, NULL);
}

/*
 * Check whether the given slot is locked.  The function must be called with
 * mapping->tree_lock held.
 */
static inline int slot_locked(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
	return entry & RADIX_DAX_ENTRY_LOCK;
}

/*
 * Mark the given slot as locked.  The function must be called with
 * mapping->tree_lock held.
 */
static inline void *lock_slot(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);

	entry |= RADIX_DAX_ENTRY_LOCK;
	radix_tree_replace_slot(slot, (void *)entry);
	return (void *)entry;
}

/*
 * Mark the given slot as unlocked.  The function must be called with
 * mapping->tree_lock held.
 */
static inline void *unlock_slot(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);

	entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
	radix_tree_replace_slot(slot, (void *)entry);
	return (void *)entry;
}

/*
 * Look up an entry in the radix tree and, if it is a locked exceptional
 * entry, wait for it to become unlocked before returning it.  The caller
 * must call put_unlocked_mapping_entry() when it decides not to lock the
 * entry, or put_locked_mapping_entry() when it has locked the entry and
 * later wants to unlock it.
 *
 * The function must be called with mapping->tree_lock held.
 */
static void *get_unlocked_mapping_entry(struct address_space *mapping,
					pgoff_t index, void ***slotp)
{
	void *entry, **slot;
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq;

	init_wait(&ewait.wait);
	ewait.wait.func = wake_exceptional_entry_func;

	for (;;) {
		entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
					    &slot);
		if (!entry || !radix_tree_exceptional_entry(entry) ||
		    !slot_locked(mapping, slot)) {
			if (slotp)
				*slotp = slot;
			return entry;
		}

		wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
		prepare_to_wait_exclusive(wq, &ewait.wait,
					  TASK_UNINTERRUPTIBLE);
		spin_unlock_irq(&mapping->tree_lock);
		schedule();
		finish_wait(wq, &ewait.wait);
		spin_lock_irq(&mapping->tree_lock);
	}
}

static void put_locked_mapping_entry(struct address_space *mapping,
				     pgoff_t index, void *entry)
{
	if (!radix_tree_exceptional_entry(entry)) {
		unlock_page(entry);
		put_page(entry);
	} else {
		dax_unlock_mapping_entry(mapping, index);
	}
}

/*
 * Called when we are done with a radix tree entry we looked up via
 * get_unlocked_mapping_entry() and which we didn't lock in the end.
 */
static void put_unlocked_mapping_entry(struct address_space *mapping,
				       pgoff_t index, void *entry)
{
	if (!radix_tree_exceptional_entry(entry))
		return;

	/* We have to wake up the next waiter for the radix tree entry lock */
	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
}
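
/*
 * Typical locking cycle for callers of the helpers above (editor's sketch;
 * grab_mapping_entry() and dax_delete_mapping_entry() follow this shape):
 *
 *	spin_lock_irq(&mapping->tree_lock);
 *	entry = get_unlocked_mapping_entry(mapping, index, &slot);
 *	...decide whether to take the entry lock...
 *	entry = lock_slot(mapping, slot);   // or put_unlocked_mapping_entry()
 *	spin_unlock_irq(&mapping->tree_lock);
 *	...work on the locked entry without tree_lock held...
 *	put_locked_mapping_entry(mapping, index, entry);
 */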

/*
 * Find the radix tree entry at the given index.  If it points to a page,
 * return with the page locked.  If it points to an exceptional entry, return
 * with the radix tree entry locked.  If the radix tree doesn't contain the
 * given index, create an empty exceptional entry for the index and return
 * with it locked.
 *
 * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
 * either return that locked entry or will return an error.  This error will
 * happen if there are any 4k entries (either zero pages or DAX entries)
 * within the 2MiB range that we are requesting.
 *
 * We always favor 4k entries over 2MiB entries. There isn't a flow where we
 * evict 4k entries in order to 'upgrade' them to a 2MiB entry.  A 2MiB
 * insertion will fail if it finds any 4k entries already in the tree, and a
 * 4k insertion will cause an existing 2MiB entry to be unmapped and
 * downgraded to 4k entries.  This happens for both 2MiB huge zero pages as
 * well as 2MiB empty entries.
 *
 * The exception to this downgrade path is for 2MiB DAX PMD entries that have
 * real storage backing them.  We will leave these real 2MiB DAX entries in
 * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
 *
 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
 * persistent memory the benefit is doubtful. We can add that later if we can
 * show it helps.
 */
static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
		unsigned long size_flag)
{
	bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
	void *entry, **slot;

restart:
	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, &slot);

	if (entry) {
		if (size_flag & RADIX_DAX_PMD) {
			if (!radix_tree_exceptional_entry(entry) ||
			    dax_is_pte_entry(entry)) {
				put_unlocked_mapping_entry(mapping, index,
						entry);
				entry = ERR_PTR(-EEXIST);
				goto out_unlock;
			}
		} else { /* trying to grab a PTE entry */
			if (radix_tree_exceptional_entry(entry) &&
			    dax_is_pmd_entry(entry) &&
			    (dax_is_zero_entry(entry) ||
			     dax_is_empty_entry(entry))) {
				pmd_downgrade = true;
			}
		}
	}

	/* No entry for given index? Make sure radix tree is big enough. */
	if (!entry || pmd_downgrade) {
		int err;

		if (pmd_downgrade) {
			/*
			 * Make sure 'entry' remains valid while we drop
			 * mapping->tree_lock.
			 */
			entry = lock_slot(mapping, slot);
		}

		spin_unlock_irq(&mapping->tree_lock);
		err = radix_tree_preload(
				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
		if (err) {
			if (pmd_downgrade)
				put_locked_mapping_entry(mapping, index, entry);
			return ERR_PTR(err);
		}

		/*
		 * Besides huge zero pages the only other thing that gets
		 * downgraded are empty entries which don't need to be
		 * unmapped.
		 */
		if (pmd_downgrade && dax_is_zero_entry(entry))
			unmap_mapping_range(mapping,
				(index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);

		spin_lock_irq(&mapping->tree_lock);

		if (pmd_downgrade) {
			radix_tree_delete(&mapping->page_tree, index);
			mapping->nrexceptional--;
			dax_wake_mapping_entry_waiter(mapping, index, entry,
					true);
		}

		entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);

		err = __radix_tree_insert(&mapping->page_tree, index,
				dax_radix_order(entry), entry);
		radix_tree_preload_end();
		if (err) {
			spin_unlock_irq(&mapping->tree_lock);
			/*
			 * Someone already created the entry?  This is a
			 * normal failure when inserting PMDs in a range
			 * that already contains PTEs.  In that case we want
			 * to return -EEXIST immediately.
			 */
			if (err == -EEXIST && !(size_flag & RADIX_DAX_PMD))
				goto restart;
			/*
			 * Our insertion of a DAX PMD entry failed, most
			 * likely because it collided with a PTE sized entry
			 * at a different index in the PMD range.  We haven't
			 * inserted anything into the radix tree and have no
			 * waiters to wake.
			 */
			return ERR_PTR(err);
		}
		/* Good, we have inserted empty locked entry into the tree. */
		mapping->nrexceptional++;
		spin_unlock_irq(&mapping->tree_lock);
		return entry;
	}
	/* Normal page in radix tree? */
	if (!radix_tree_exceptional_entry(entry)) {
		struct page *page = entry;

		get_page(page);
		spin_unlock_irq(&mapping->tree_lock);
		lock_page(page);
		/* Page got truncated? Retry... */
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			put_page(page);
			goto restart;
		}
		return page;
	}
	entry = lock_slot(mapping, slot);
 out_unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return entry;
}
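
/*
 * Caller-side sketch (editor's note): the fault handlers below bracket the
 * actual mapping work between grab_mapping_entry() and
 * put_locked_mapping_entry(), e.g.:
 *
 *	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
 *	if (IS_ERR(entry))
 *		return PTR_ERR(entry);
 *	...install a PTE or hole page for vmf->pgoff...
 *	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
 */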

/*
 * We do not necessarily hold the mapping->tree_lock when we call this
 * function so it is possible that 'entry' is no longer a valid item in the
 * radix tree.  This is okay because all we really need to do is to find the
 * correct waitqueue where tasks might be waiting for that old 'entry' and
 * wake them.
 */
void dax_wake_mapping_entry_waiter(struct address_space *mapping,
		pgoff_t index, void *entry, bool wake_all)
{
	struct exceptional_entry_key key;
	wait_queue_head_t *wq;

	wq = dax_entry_waitqueue(mapping, index, entry, &key);

	/*
	 * Checking for locked entry and prepare_to_wait_exclusive() happens
	 * under mapping->tree_lock, ditto for entry handling in our callers.
	 * So at this point all tasks that could have seen our entry locked
	 * must be in the waitqueue and the following check will see them.
	 */
	if (waitqueue_active(wq))
		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
}

void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	void *entry, **slot;

	spin_lock_irq(&mapping->tree_lock);
	entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
			 !slot_locked(mapping, slot))) {
		spin_unlock_irq(&mapping->tree_lock);
		return;
	}
	unlock_slot(mapping, slot);
	spin_unlock_irq(&mapping->tree_lock);
	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
}

/*
 * Delete the exceptional DAX entry at @index from @mapping.  Wait for the
 * radix tree entry to get unlocked before deleting it.
 */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	void *entry;

	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, NULL);
	/*
	 * This gets called from the truncate / punch_hole path. As such, the
	 * caller must hold locks protecting against concurrent modifications
	 * of the radix tree (usually fs-private i_mmap_sem for writing). Since
	 * the caller has seen an exceptional entry for this index, we had
	 * better find it at that index as well...
	 */
	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
		spin_unlock_irq(&mapping->tree_lock);
		return 0;
	}
	radix_tree_delete(&mapping->page_tree, index);
	mapping->nrexceptional--;
	spin_unlock_irq(&mapping->tree_lock);
	dax_wake_mapping_entry_waiter(mapping, index, entry, true);

	return 1;
}

/*
 * The user has performed a load from a hole in the file.  Allocating
 * a new page in the file would cause excessive storage usage for
 * workloads with sparse files.  We allocate a page cache page instead.
 * We'll kick it out of the page cache if it's ever written to,
 * otherwise it will simply fall out of the page cache under memory
 * pressure without ever having been dirtied.
 */
static int dax_load_hole(struct address_space *mapping, void *entry,
			 struct vm_fault *vmf)
{
	struct page *page;

	/* Hole page already exists? Return it... */
	if (!radix_tree_exceptional_entry(entry)) {
		vmf->page = entry;
		return VM_FAULT_LOCKED;
	}

	/* This will replace locked radix tree entry with a hole page */
	page = find_or_create_page(mapping, vmf->pgoff,
				   vmf->gfp_mask | __GFP_ZERO);
	if (!page) {
		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
		return VM_FAULT_OOM;
	}
	vmf->page = page;
	return VM_FAULT_LOCKED;
}

static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
			 struct page *to, unsigned long vaddr)
{
	struct blk_dax_ctl dax = {
		.sector = sector,
		.size = size,
	};
	void *vto;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	vto = kmap_atomic(to);
	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
	kunmap_atomic(vto);
	dax_unmap_atomic(bdev, &dax);
	return 0;
}

/*
 * By this point grab_mapping_entry() has ensured that we have a locked entry
 * of the appropriate size so we don't have to worry about downgrading PMDs to
 * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
 * already in the tree, we will skip the insertion and just dirty the PMD as
 * appropriate.
 */
static void *dax_insert_mapping_entry(struct address_space *mapping,
				      struct vm_fault *vmf,
				      void *entry, sector_t sector,
				      unsigned long flags)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	int error = 0;
	bool hole_fill = false;
	void *new_entry;
	pgoff_t index = vmf->pgoff;

	if (vmf->flags & FAULT_FLAG_WRITE)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	/* Replacing hole page with block mapping? */
	if (!radix_tree_exceptional_entry(entry)) {
		hole_fill = true;
		/*
		 * Unmap the page now before we remove it from page cache below.
		 * The page is locked so it cannot be faulted in again.
		 */
		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
				    PAGE_SIZE, 0);
		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
		if (error)
			return ERR_PTR(error);
	} else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) {
		/* replacing huge zero page with PMD block mapping */
		unmap_mapping_range(mapping,
			(vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
	}

	spin_lock_irq(&mapping->tree_lock);
	new_entry = dax_radix_locked_entry(sector, flags);

	if (hole_fill) {
		__delete_from_page_cache(entry, NULL);
		/* Drop pagecache reference */
		put_page(entry);
		error = __radix_tree_insert(page_tree, index,
				dax_radix_order(new_entry), new_entry);
		if (error) {
			new_entry = ERR_PTR(error);
			goto unlock;
		}
		mapping->nrexceptional++;
	} else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
		/*
		 * Only swap our new entry into the radix tree if the current
		 * entry is a zero page or an empty entry.  If a normal PTE or
		 * PMD entry is already in the tree, we leave it alone.  This
		 * means that if we are trying to insert a PTE and the
		 * existing entry is a PMD, we will just leave the PMD in the
		 * tree and dirty it if necessary.
		 */
		void **slot;
		void *ret;

		ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
		WARN_ON_ONCE(ret != entry);
		radix_tree_replace_slot(slot, new_entry);
	}
	if (vmf->flags & FAULT_FLAG_WRITE)
		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
 unlock:
	spin_unlock_irq(&mapping->tree_lock);
	if (hole_fill) {
		radix_tree_preload_end();
		/*
		 * We don't need the hole page anymore, it has been replaced
		 * with a locked radix tree entry now.
		 */
		if (mapping->a_ops->freepage)
			mapping->a_ops->freepage(entry);
		unlock_page(entry);
		put_page(entry);
	}
	return new_entry;
}

static int dax_writeback_one(struct block_device *bdev,
		struct address_space *mapping, pgoff_t index, void *entry)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	struct radix_tree_node *node;
	struct blk_dax_ctl dax;
	void **slot;
	int ret = 0;

	spin_lock_irq(&mapping->tree_lock);
	/*
	 * Regular page slots are stabilized by the page lock even
	 * without the tree itself locked.  These unlocked entries
	 * need verification under the tree lock.
	 */
	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
		goto unlock;
	if (*slot != entry)
		goto unlock;

	/* another fsync thread may have already written back this entry */
	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
		goto unlock;

	if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
				dax_is_zero_entry(entry))) {
		ret = -EIO;
		goto unlock;
	}

	/*
	 * Even if dax_writeback_mapping_range() was given a wbc->range_start
	 * in the middle of a PMD, the 'index' we are given will be aligned to
	 * the start index of the PMD, as will the sector we pull from
	 * 'entry'.  This allows us to flush for PMD_SIZE and not have to
	 * worry about partial PMD writebacks.
	 */
	dax.sector = dax_radix_sector(entry);
	dax.size = PAGE_SIZE << dax_radix_order(entry);
	spin_unlock_irq(&mapping->tree_lock);

	/*
	 * We cannot hold tree_lock while calling dax_map_atomic() because it
	 * eventually calls cond_resched().
	 */
	ret = dax_map_atomic(bdev, &dax);
	if (ret < 0)
		return ret;

	if (WARN_ON_ONCE(ret < dax.size)) {
		ret = -EIO;
		goto unmap;
	}

	wb_cache_pmem(dax.addr, dax.size);

	spin_lock_irq(&mapping->tree_lock);
	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
	spin_unlock_irq(&mapping->tree_lock);
 unmap:
	dax_unmap_atomic(bdev, &dax);
	return ret;

 unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}

/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end]. This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
 */
int dax_writeback_mapping_range(struct address_space *mapping,
		struct block_device *bdev, struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	pgoff_t start_index, end_index;
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	bool done = false;
	int i, ret = 0;

	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
		return 0;

	start_index = wbc->range_start >> PAGE_SHIFT;
	end_index = wbc->range_end >> PAGE_SHIFT;

	tag_pages_for_writeback(mapping, start_index, end_index);

	pagevec_init(&pvec, 0);
	while (!done) {
		pvec.nr = find_get_entries_tag(mapping, start_index,
				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
				pvec.pages, indices);

		if (pvec.nr == 0)
			break;

		for (i = 0; i < pvec.nr; i++) {
			if (indices[i] > end_index) {
				done = true;
				break;
			}

			ret = dax_writeback_one(bdev, mapping, indices[i],
					pvec.pages[i]);
			if (ret < 0)
				return ret;
		}
	}
	return 0;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
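
/*
 * Illustrative caller (editor's sketch, not part of this file): a filesystem
 * typically invokes this helper from its ->writepages method for DAX inodes,
 * roughly:
 *
 *	if (dax_mapping(mapping))
 *		return dax_writeback_mapping_range(mapping,
 *				inode->i_sb->s_bdev, wbc);
 */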

static int dax_insert_mapping(struct address_space *mapping,
		struct block_device *bdev, sector_t sector, size_t size,
		void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	struct blk_dax_ctl dax = {
		.sector = sector,
		.size = size,
	};
	void *ret;
	void *entry = *entryp;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	dax_unmap_atomic(bdev, &dax);

	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0);
	if (IS_ERR(ret))
		return PTR_ERR(ret);
	*entryp = ret;

	return vm_insert_mixed(vma, vaddr, dax.pfn);
}

/**
 * dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files. dax_fault() assumes the caller has done all
 * the necessary locking for the page fault to proceed successfully.
 */
int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
			get_block_t get_block)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	void *entry;
	struct buffer_head bh;
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	unsigned blkbits = inode->i_blkbits;
	sector_t block;
	pgoff_t size;
	int error;
	int major = 0;

	/*
	 * Check whether offset isn't beyond end of file now. Caller is supposed
	 * to hold locks serializing us with truncate / punch hole so this is
	 * a reliable test.
	 */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (vmf->pgoff >= size)
		return VM_FAULT_SIGBUS;

	memset(&bh, 0, sizeof(bh));
	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
	bh.b_bdev = inode->i_sb->s_bdev;
	bh.b_size = PAGE_SIZE;

	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
	if (IS_ERR(entry)) {
		error = PTR_ERR(entry);
		goto out;
	}

	error = get_block(inode, block, &bh, 0);
	if (!error && (bh.b_size < PAGE_SIZE))
		error = -EIO;		/* fs corruption? */
	if (error)
		goto unlock_entry;

	if (vmf->cow_page) {
		struct page *new_page = vmf->cow_page;

		if (buffer_written(&bh))
			error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
					      bh.b_size, new_page, vaddr);
		else
			clear_user_highpage(new_page, vaddr);
		if (error)
			goto unlock_entry;
		if (!radix_tree_exceptional_entry(entry)) {
			vmf->page = entry;
			return VM_FAULT_LOCKED;
		}
		vmf->entry = entry;
		return VM_FAULT_DAX_LOCKED;
	}

	if (!buffer_mapped(&bh)) {
		if (vmf->flags & FAULT_FLAG_WRITE) {
			error = get_block(inode, block, &bh, 1);
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			major = VM_FAULT_MAJOR;
			if (!error && (bh.b_size < PAGE_SIZE))
				error = -EIO;
			if (error)
				goto unlock_entry;
		} else {
			return dax_load_hole(mapping, entry, vmf);
		}
	}

	/* Filesystem should not return unwritten buffers to us! */
	WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
	error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
			bh.b_size, &entry, vma, vmf);
 unlock_entry:
	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
 out:
	if (error == -ENOMEM)
		return VM_FAULT_OOM | major;
	/* -EBUSY is fine, somebody else faulted on the same PTE */
	if ((error < 0) && (error != -EBUSY))
		return VM_FAULT_SIGBUS | major;
	return VM_FAULT_NOPAGE | major;
}
EXPORT_SYMBOL_GPL(dax_fault);
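
/*
 * Illustrative caller (editor's sketch, not part of this file): a filesystem
 * wires this up from its vm_operations_struct ->fault handler, passing its
 * own get_block_t, here a hypothetical fs_get_block():
 *
 *	static int fs_filemap_fault(struct vm_area_struct *vma,
 *				    struct vm_fault *vmf)
 *	{
 *		return dax_fault(vma, vmf, fs_get_block);
 *	}
 */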

/**
 * dax_pfn_mkwrite - handle first write to DAX page
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 */
int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	void *entry;
	pgoff_t index = vmf->pgoff;

	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, NULL);
	if (!entry || !radix_tree_exceptional_entry(entry))
		goto out;
	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
	put_unlocked_mapping_entry(mapping, index, entry);
out:
	spin_unlock_irq(&mapping->tree_lock);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);

static bool dax_range_is_aligned(struct block_device *bdev,
				 unsigned int offset, unsigned int length)
{
	unsigned short sector_size = bdev_logical_block_size(bdev);

	if (!IS_ALIGNED(offset, sector_size))
		return false;
	if (!IS_ALIGNED(length, sector_size))
		return false;

	return true;
}

int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
		unsigned int offset, unsigned int length)
{
	struct blk_dax_ctl dax = {
		.sector = sector,
		.size = PAGE_SIZE,
	};

	if (dax_range_is_aligned(bdev, offset, length)) {
		sector_t start_sector = dax.sector + (offset >> 9);

		return blkdev_issue_zeroout(bdev, start_sector,
				length >> 9, GFP_NOFS, true);
	} else {
		if (dax_map_atomic(bdev, &dax) < 0)
			return PTR_ERR(dax.addr);
		clear_pmem(dax.addr + offset, length);
		dax_unmap_atomic(bdev, &dax);
	}
	return 0;
}
EXPORT_SYMBOL_GPL(__dax_zero_page_range);

/**
 * dax_zero_page_range - zero a range within a page of a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @length: The number of bytes to zero
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * This function can be called by a filesystem when it is zeroing part of a
 * page in a DAX file.  This is intended for hole-punch operations.  If
 * you are truncating a file, the helper function dax_truncate_page() may be
 * more convenient.
 */
int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
							get_block_t get_block)
{
	struct buffer_head bh;
	pgoff_t index = from >> PAGE_SHIFT;
	unsigned offset = from & (PAGE_SIZE-1);
	int err;

	/* Block boundary? Nothing to do */
	if (!length)
		return 0;
	if (WARN_ON_ONCE((offset + length) > PAGE_SIZE))
		return -EINVAL;

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;
	bh.b_size = PAGE_SIZE;
	err = get_block(inode, index, &bh, 0);
	if (err < 0 || !buffer_written(&bh))
		return err;

	return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
			offset, length);
}
EXPORT_SYMBOL_GPL(dax_zero_page_range);

/**
 * dax_truncate_page - handle a partial page being truncated in a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * Similar to block_truncate_page(), this function can be called by a
 * filesystem when it is truncating a DAX file to handle the partial page.
 */
int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
{
	unsigned length = PAGE_ALIGN(from) - from;

	return dax_zero_page_range(inode, from, length, get_block);
}
EXPORT_SYMBOL_GPL(dax_truncate_page);
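
/*
 * Illustrative usage (editor's sketch, not part of this file): on truncate a
 * filesystem zeroes the tail of the final partial page before shrinking
 * i_size, e.g.
 *
 *	dax_truncate_page(inode, newsize, fs_get_block);
 *
 * where newsize is the new file size and fs_get_block() is hypothetical.
 */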

#ifdef CONFIG_FS_IOMAP
static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
{
	return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
}

static loff_t
dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap)
{
	struct iov_iter *iter = data;
	loff_t end = pos + length, done = 0;
	ssize_t ret = 0;

	if (iov_iter_rw(iter) == READ) {
		end = min(end, i_size_read(inode));
		if (pos >= end)
			return 0;

		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
			return iov_iter_zero(min(length, end - pos), iter);
	}

	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
		return -EIO;

	while (pos < end) {
		unsigned offset = pos & (PAGE_SIZE - 1);
		struct blk_dax_ctl dax = { 0 };
		ssize_t map_len;

		dax.sector = dax_iomap_sector(iomap, pos);
		dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
		map_len = dax_map_atomic(iomap->bdev, &dax);
		if (map_len < 0) {
			ret = map_len;
			break;
		}

		dax.addr += offset;
		map_len -= offset;
		if (map_len > end - pos)
			map_len = end - pos;

		if (iov_iter_rw(iter) == WRITE)
			map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
		else
			map_len = copy_to_iter(dax.addr, map_len, iter);
		dax_unmap_atomic(iomap->bdev, &dax);
		if (map_len <= 0) {
			ret = map_len ? map_len : -EFAULT;
			break;
		}

		pos += map_len;
		length -= map_len;
		done += map_len;
	}

	return done ? done : ret;
}

/**
 * dax_iomap_rw - Perform I/O to a DAX file
 * @iocb: The control block for this I/O
 * @iter: The addresses to do I/O from or to
 * @ops: iomap ops passed from the file system
 *
 * This function performs read and write operations to directly mapped
 * persistent memory.  The caller needs to take care of read/write exclusion
 * and evicting any page cache pages in the region under I/O.
 */
ssize_t
dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
		struct iomap_ops *ops)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	struct inode *inode = mapping->host;
	loff_t pos = iocb->ki_pos, ret = 0, done = 0;
	unsigned flags = 0;

	if (iov_iter_rw(iter) == WRITE)
		flags |= IOMAP_WRITE;

	/*
	 * Yes, even DAX files can have page cache attached to them:  A zeroed
	 * page is inserted into the pagecache when we have to serve a write
	 * fault on a hole.  It should never be dirtied and can simply be
	 * dropped from the pagecache once we get real data for the page.
	 *
	 * XXX: This is racy against mmap, and there's nothing we can do about
	 * it. We'll eventually need to shift this down even further so that
	 * we can check if we allocated blocks over a hole first.
	 */
	if (mapping->nrpages) {
		ret = invalidate_inode_pages2_range(mapping,
				pos >> PAGE_SHIFT,
				(pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
		WARN_ON_ONCE(ret);
	}

	while (iov_iter_count(iter)) {
		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
				iter, dax_iomap_actor);
		if (ret <= 0)
			break;
		pos += ret;
		done += ret;
	}

	iocb->ki_pos += done;
	return done ? done : ret;
}
EXPORT_SYMBOL_GPL(dax_iomap_rw);
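
/*
 * Illustrative caller (editor's sketch, not part of this file): a
 * filesystem's ->read_iter/->write_iter for DAX inodes can forward to this
 * helper with its iomap_ops, here a hypothetical fs_iomap_ops, after taking
 * whatever inode locking it needs for exclusion:
 *
 *	ret = dax_iomap_rw(iocb, iter, &fs_iomap_ops);
 */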

/**
 * dax_iomap_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @ops: iomap ops passed from the file system
 *
 * When a page fault occurs, filesystems may call this helper in their fault
 * or mkwrite handler for DAX files. Assumes the caller has done all the
 * necessary locking for the page fault to proceed successfully.
 */
int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
			struct iomap_ops *ops)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	struct inode *inode = mapping->host;
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
	sector_t sector;
	struct iomap iomap = { 0 };
	unsigned flags = IOMAP_FAULT;
	int error, major = 0;
	int locked_status = 0;
	void *entry;

	/*
	 * Check whether offset isn't beyond end of file now. Caller is supposed
	 * to hold locks serializing us with truncate / punch hole so this is
	 * a reliable test.
	 */
	if (pos >= i_size_read(inode))
		return VM_FAULT_SIGBUS;

	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
	if (IS_ERR(entry)) {
		error = PTR_ERR(entry);
		goto out;
	}

	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
		flags |= IOMAP_WRITE;

	/*
	 * Note that we don't bother to use iomap_apply here: DAX requires
	 * the filesystem block size to be equal to the page size, which means
	 * that we never have to deal with more than a single extent here.
	 */
	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
	if (error)
		goto unlock_entry;
	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
		error = -EIO;		/* fs corruption? */
		goto finish_iomap;
	}

	sector = dax_iomap_sector(&iomap, pos);

	if (vmf->cow_page) {
		switch (iomap.type) {
		case IOMAP_HOLE:
		case IOMAP_UNWRITTEN:
			clear_user_highpage(vmf->cow_page, vaddr);
			break;
		case IOMAP_MAPPED:
			error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
					vmf->cow_page, vaddr);
			break;
		default:
			WARN_ON_ONCE(1);
			error = -EIO;
			break;
		}

		if (error)
			goto finish_iomap;
		if (!radix_tree_exceptional_entry(entry)) {
			vmf->page = entry;
			locked_status = VM_FAULT_LOCKED;
		} else {
			vmf->entry = entry;
			locked_status = VM_FAULT_DAX_LOCKED;
		}
		goto finish_iomap;
	}

	switch (iomap.type) {
	case IOMAP_MAPPED:
		if (iomap.flags & IOMAP_F_NEW) {
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			major = VM_FAULT_MAJOR;
		}
		error = dax_insert_mapping(mapping, iomap.bdev, sector,
				PAGE_SIZE, &entry, vma, vmf);
		break;
	case IOMAP_UNWRITTEN:
	case IOMAP_HOLE:
		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
			locked_status = dax_load_hole(mapping, entry, vmf);
			break;
		}
		/*FALLTHRU*/
	default:
		WARN_ON_ONCE(1);
		error = -EIO;
		break;
	}

 finish_iomap:
	if (ops->iomap_end) {
		if (error) {
			/* keep previous error */
			ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
					&iomap);
		} else {
			error = ops->iomap_end(inode, pos, PAGE_SIZE,
					PAGE_SIZE, flags, &iomap);
		}
	}
 unlock_entry:
	if (!locked_status || error)
		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
 out:
	if (error == -ENOMEM)
		return VM_FAULT_OOM | major;
	/* -EBUSY is fine, somebody else faulted on the same PTE */
	if (error < 0 && error != -EBUSY)
		return VM_FAULT_SIGBUS | major;
	if (locked_status) {
		WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */
		return locked_status;
	}
	return VM_FAULT_NOPAGE | major;
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);
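
/*
 * Illustrative caller (editor's sketch, not part of this file): this
 * iomap-based path replaces dax_fault() for filesystems that provide
 * iomap_ops, here a hypothetical fs_iomap_ops:
 *
 *	static int fs_filemap_fault(struct vm_area_struct *vma,
 *				    struct vm_fault *vmf)
 *	{
 *		return dax_iomap_fault(vma, vmf, &fs_iomap_ops);
 *	}
 */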

#ifdef CONFIG_FS_DAX_PMD
/*
 * The 'colour' (i.e. low bits) within a PMD of a page offset.  This comes up
 * more often than one might expect in the below functions.
 */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
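
/*
 * Worked example (editor's note), assuming 2MiB PMDs and 4KiB pages:
 * PG_PMD_COLOUR is (2MiB >> 12) - 1 = 511, so a pfn passes the alignment
 * check in dax_pmd_insert_mapping() only when its low 9 bits are zero,
 * i.e. when the backing block is naturally 2MiB aligned.
 */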

static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
		struct vm_fault *vmf, unsigned long address,
		struct iomap *iomap, loff_t pos, bool write, void **entryp)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	struct block_device *bdev = iomap->bdev;
	struct blk_dax_ctl dax = {
		.sector = dax_iomap_sector(iomap, pos),
		.size = PMD_SIZE,
	};
	long length = dax_map_atomic(bdev, &dax);
	void *ret;

	if (length < 0) /* dax_map_atomic() failed */
		return VM_FAULT_FALLBACK;
	if (length < PMD_SIZE)
		goto unmap_fallback;
	if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
		goto unmap_fallback;
	if (!pfn_t_devmap(dax.pfn))
		goto unmap_fallback;

	dax_unmap_atomic(bdev, &dax);

	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
			RADIX_DAX_PMD);
	if (IS_ERR(ret))
		return VM_FAULT_FALLBACK;
	*entryp = ret;

	return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write);

 unmap_fallback:
	dax_unmap_atomic(bdev, &dax);
	return VM_FAULT_FALLBACK;
}

static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
		struct vm_fault *vmf, unsigned long address,
		struct iomap *iomap, void **entryp)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	unsigned long pmd_addr = address & PMD_MASK;
	struct page *zero_page;
	spinlock_t *ptl;
	pmd_t pmd_entry;
	void *ret;

	zero_page = mm_get_huge_zero_page(vma->vm_mm);

	if (unlikely(!zero_page))
		return VM_FAULT_FALLBACK;

	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
			RADIX_DAX_PMD | RADIX_DAX_HZP);
	if (IS_ERR(ret))
		return VM_FAULT_FALLBACK;
	*entryp = ret;

	ptl = pmd_lock(vma->vm_mm, pmd);
	if (!pmd_none(*pmd)) {
		spin_unlock(ptl);
		return VM_FAULT_FALLBACK;
	}

	pmd_entry = mk_pmd(zero_page, vma->vm_page_prot);
	pmd_entry = pmd_mkhuge(pmd_entry);
	set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry);
	spin_unlock(ptl);
	return VM_FAULT_NOPAGE;
}

int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
		pmd_t *pmd, unsigned int flags, struct iomap_ops *ops)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	unsigned long pmd_addr = address & PMD_MASK;
	bool write = flags & FAULT_FLAG_WRITE;
	unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
	struct inode *inode = mapping->host;
	int result = VM_FAULT_FALLBACK;
	struct iomap iomap = { 0 };
	pgoff_t max_pgoff, pgoff;
	struct vm_fault vmf;
	void *entry;
	loff_t pos;
	int error;

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vma->vm_flags & VM_SHARED))
		goto fallback;

	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vma->vm_start)
		goto fallback;
	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
		goto fallback;

	/*
	 * Check whether offset isn't beyond end of file now. Caller is
	 * supposed to hold locks serializing us with truncate / punch hole so
	 * this is a reliable test.
	 */
	pgoff = linear_page_index(vma, pmd_addr);
	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;

	if (pgoff > max_pgoff)
		return VM_FAULT_SIGBUS;

	/* If the PMD would extend beyond the file size */
	if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
		goto fallback;

	/*
	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
	 * PMD or a HZP entry.  If it can't (because a 4k page is already in
	 * the tree, for instance), it will return -EEXIST and we just fall
	 * back to 4k entries.
	 */
	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
	if (IS_ERR(entry))
		goto fallback;

	/*
	 * Note that we don't use iomap_apply here.  We aren't doing I/O, only
	 * setting up a mapping, so really we're using iomap_begin() as a way
	 * to look up our filesystem block.
	 */
	pos = (loff_t)pgoff << PAGE_SHIFT;
	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
	if (error)
		goto unlock_entry;
	if (iomap.offset + iomap.length < pos + PMD_SIZE)
		goto finish_iomap;

	vmf.pgoff = pgoff;
	vmf.flags = flags;
	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;

	switch (iomap.type) {
	case IOMAP_MAPPED:
		result = dax_pmd_insert_mapping(vma, pmd, &vmf, address,
				&iomap, pos, write, &entry);
		break;
	case IOMAP_UNWRITTEN:
	case IOMAP_HOLE:
		if (WARN_ON_ONCE(write))
			goto finish_iomap;
		result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
				&entry);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

 finish_iomap:
	if (ops->iomap_end) {
		if (result == VM_FAULT_FALLBACK) {
			ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags,
					&iomap);
		} else {
			error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE,
					iomap_flags, &iomap);
			if (error)
				result = VM_FAULT_FALLBACK;
		}
	}
 unlock_entry:
	put_locked_mapping_entry(mapping, pgoff, entry);
 fallback:
	if (result == VM_FAULT_FALLBACK) {
		split_huge_pmd(vma, pmd, address);
		count_vm_event(THP_FAULT_FALLBACK);
	}
	return result;
}
EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
#endif /* CONFIG_FS_DAX_PMD */
#endif /* CONFIG_FS_IOMAP */