/*
 * mm/truncate.c - code for taking down pages from address_spaces
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 10Sep2002	Andrew Morton
 *		Initial version.
 */

#include <linux/kernel.h>
#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/buffer_head.h>	/* grr. try_to_release_page,
				   do_invalidatepage */
#include <linux/shmem_fs.h>
#include <linux/cleancache.h>
#include <linux/rmap.h>
#include "internal.h"

static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
			       void *entry)
{
	struct radix_tree_node *node;
	void **slot;

	spin_lock_irq(&mapping->tree_lock);
	/*
	 * Regular page slots are stabilized by the page lock even
	 * without the tree itself locked. These unlocked entries
	 * need verification under the tree lock.
	 */
	if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
		goto unlock;
	if (*slot != entry)
		goto unlock;
	__radix_tree_replace(&mapping->page_tree, node, slot, NULL,
			     workingset_update_node, mapping);
	mapping->nrexceptional--;
unlock:
	spin_unlock_irq(&mapping->tree_lock);
}

/*
 * Unconditionally remove exceptional entry. Usually called from truncate path.
 */
static void truncate_exceptional_entry(struct address_space *mapping,
				       pgoff_t index, void *entry)
{
	/* Handled by shmem itself */
	if (shmem_mapping(mapping))
		return;

	if (dax_mapping(mapping)) {
		dax_delete_mapping_entry(mapping, index);
		return;
	}
	clear_shadow_entry(mapping, index, entry);
}

/*
 * Invalidate exceptional entry if easily possible. This handles exceptional
 * entries for invalidate_inode_pages().
 */
static int invalidate_exceptional_entry(struct address_space *mapping,
					pgoff_t index, void *entry)
{
	/* Handled by shmem itself, or for DAX we do nothing. */
	if (shmem_mapping(mapping) || dax_mapping(mapping))
		return 1;
	clear_shadow_entry(mapping, index, entry);
	return 1;
}

/*
 * Invalidate exceptional entry if clean. This handles exceptional entries for
 * invalidate_inode_pages2() so for DAX it evicts only clean entries.
 */
static int invalidate_exceptional_entry2(struct address_space *mapping,
					 pgoff_t index, void *entry)
{
	/* Handled by shmem itself */
	if (shmem_mapping(mapping))
		return 1;
	if (dax_mapping(mapping))
		return dax_invalidate_mapping_entry_sync(mapping, index);
	clear_shadow_entry(mapping, index, entry);
	return 1;
}

/**
 * do_invalidatepage - invalidate part or all of a page
 * @page: the page which is affected
 * @offset: start of the range to invalidate
 * @length: length of the range to invalidate
 *
 * do_invalidatepage() is called when all or part of the page has become
 * invalidated by a truncate operation.
 *
 * do_invalidatepage() does not have to release all buffers, but it must
 * ensure that no dirty buffer is left outside @offset and that no I/O
 * is underway against any of the blocks which are outside the truncation
 * point, because the caller is about to free (and possibly reuse) those
 * blocks on-disk.
 */
void do_invalidatepage(struct page *page, unsigned int offset,
		       unsigned int length)
{
	void (*invalidatepage)(struct page *, unsigned int, unsigned int);

	invalidatepage = page->mapping->a_ops->invalidatepage;
#ifdef CONFIG_BLOCK
	if (!invalidatepage)
		invalidatepage = block_invalidatepage;
#endif
	if (invalidatepage)
		(*invalidatepage)(page, offset, length);
}
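
/*
 * Illustrative sketch (not part of the original file): a block-backed
 * filesystem with no per-page metadata beyond buffer heads can leave
 * ->invalidatepage unset in its address_space_operations and rely on the
 * block_invalidatepage() fallback above; the names below are hypothetical:
 *
 *	static const struct address_space_operations example_aops = {
 *		.readpage	= example_readpage,
 *		.writepage	= example_writepage,
 *		// ->invalidatepage intentionally left NULL
 *	};
 */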

/*
 * If truncate cannot remove the fs-private metadata from the page, the page
 * becomes orphaned. It will be left on the LRU and may even be mapped into
 * user pagetables if we're racing with filemap_fault().
 *
 * We need to bail out if page->mapping is no longer equal to the original
 * mapping. This happens a) when the VM reclaimed the page while we waited on
 * its lock, b) when a concurrent invalidate_mapping_pages got there first and
 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
 */
static void
truncate_cleanup_page(struct address_space *mapping, struct page *page)
{
	if (page_mapped(page)) {
		loff_t holelen;

		holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE;
		unmap_mapping_range(mapping,
				    (loff_t)page->index << PAGE_SHIFT,
				    holelen, 0);
	}

	if (page_has_private(page))
		do_invalidatepage(page, 0, PAGE_SIZE);

	/*
	 * Some filesystems seem to re-dirty the page even after
	 * the VM has canceled the dirty bit (eg ext3 journaling).
	 * Hence dirty accounting check is placed after invalidation.
	 */
	cancel_dirty_page(page);
	ClearPageMappedToDisk(page);
}

/*
 * This is for invalidate_mapping_pages(). That function can be called at
 * any time, and is not supposed to throw away dirty pages. But pages can
 * be marked dirty at any time too, so use remove_mapping which safely
 * discards clean, unused pages.
 *
 * Returns non-zero if the page was successfully invalidated.
 */
static int
invalidate_complete_page(struct address_space *mapping, struct page *page)
{
	int ret;

	if (page->mapping != mapping)
		return 0;

	if (page_has_private(page) && !try_to_release_page(page, 0))
		return 0;

	ret = remove_mapping(mapping, page);

	return ret;
}

int truncate_inode_page(struct address_space *mapping, struct page *page)
{
	VM_BUG_ON_PAGE(PageTail(page), page);

	if (page->mapping != mapping)
		return -EIO;

	truncate_cleanup_page(mapping, page);
	delete_from_page_cache(page);
	return 0;
}

/*
 * Used to get rid of pages on hardware memory corruption.
 */
int generic_error_remove_page(struct address_space *mapping, struct page *page)
{
	if (!mapping)
		return -EINVAL;
	/*
	 * Only punch for normal data pages for now.
	 * Handling other types like directories would need more auditing.
	 */
	if (!S_ISREG(mapping->host->i_mode))
		return -EIO;
	return truncate_inode_page(mapping, page);
}
EXPORT_SYMBOL(generic_error_remove_page);

/*
 * Safely invalidate one page from its pagecache mapping.
 * It only drops clean, unused pages. The page must be locked.
 *
 * Returns 1 if the page is successfully invalidated, otherwise 0.
 */
int invalidate_inode_page(struct page *page)
{
	struct address_space *mapping = page_mapping(page);

	if (!mapping)
		return 0;
	if (PageDirty(page) || PageWriteback(page))
		return 0;
	if (page_mapped(page))
		return 0;
	return invalidate_complete_page(mapping, page);
}

/**
 * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 * @lend: offset to which to truncate (inclusive)
 *
 * Truncate the page cache, removing the pages that are between
 * specified offsets (and zeroing out partial pages
 * if lstart or lend + 1 is not page aligned).
 *
 * Truncate takes two passes - the first pass is nonblocking. It will not
 * block on page locks and it will not block on writeback. The second pass
 * will wait. This is to prevent as much IO as possible in the affected region.
 * The first pass will remove most pages, so the search cost of the second pass
 * is low.
 *
 * We pass down the cache-hot hint to the page freeing code. Even if the
 * mapping is large, it is probably the case that the final pages are the most
 * recently touched, and freeing happens in ascending file offset order.
 *
 * Note that since ->invalidatepage() accepts a range to invalidate,
 * truncate_inode_pages_range is able to handle cases where lend + 1 is not
 * properly page aligned.
 */
void truncate_inode_pages_range(struct address_space *mapping,
				loff_t lstart, loff_t lend)
{
	pgoff_t		start;		/* inclusive */
	pgoff_t		end;		/* exclusive */
	unsigned int	partial_start;	/* inclusive */
	unsigned int	partial_end;	/* exclusive */
	struct pagevec	pvec;
	pgoff_t		indices[PAGEVEC_SIZE];
	pgoff_t		index;
	int		i;

	if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
		goto out;

	/* Offsets within partial pages */
	partial_start = lstart & (PAGE_SIZE - 1);
	partial_end = (lend + 1) & (PAGE_SIZE - 1);

	/*
	 * 'start' and 'end' always cover the range of pages to be fully
	 * truncated. Partial pages are covered with 'partial_start' at the
	 * start of the range and 'partial_end' at the end of the range.
	 * Note that 'end' is exclusive while 'lend' is inclusive.
	 */
	start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (lend == -1)
		/*
		 * lend == -1 indicates end-of-file so we have to set 'end'
		 * to the highest possible pgoff_t and since the type is
		 * unsigned we're using -1.
		 */
		end = -1;
	else
		end = (lend + 1) >> PAGE_SHIFT;
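
	/*
	 * Worked example (illustrative, assuming 4K pages): for
	 * lstart == 1536 and lend == 10239 we get partial_start == 1536,
	 * partial_end == 2048, start == 1 and end == 2, so page 1 is
	 * truncated whole, while bytes 1536..4095 of page 0 and bytes
	 * 0..2047 of page 2 are zeroed by the partial-page handling below.
	 */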

	pagevec_init(&pvec, 0);
	index = start;
	while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
			min(end - index, (pgoff_t)PAGEVEC_SIZE),
			indices)) {
		/*
		 * Pagevec array has exceptional entries and we may also fail
		 * to lock some pages. So we store pages that can be deleted
		 * in a new pagevec.
		 */
		struct pagevec locked_pvec;

		pagevec_init(&locked_pvec, 0);
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* We rely upon deletion not changing page->index */
			index = indices[i];
			if (index >= end)
				break;

			if (radix_tree_exceptional_entry(page)) {
				truncate_exceptional_entry(mapping, index,
							   page);
				continue;
			}

			if (!trylock_page(page))
				continue;
			WARN_ON(page_to_index(page) != index);
			if (PageWriteback(page)) {
				unlock_page(page);
				continue;
			}
			if (page->mapping != mapping) {
				unlock_page(page);
				continue;
			}
			pagevec_add(&locked_pvec, page);
		}
		for (i = 0; i < pagevec_count(&locked_pvec); i++)
			truncate_cleanup_page(mapping, locked_pvec.pages[i]);
		delete_from_page_cache_batch(mapping, &locked_pvec);
		for (i = 0; i < pagevec_count(&locked_pvec); i++)
			unlock_page(locked_pvec.pages[i]);
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		cond_resched();
		index++;
	}

	if (partial_start) {
		struct page *page = find_lock_page(mapping, start - 1);

		if (page) {
			unsigned int top = PAGE_SIZE;

			if (start > end) {
				/* Truncation within a single page */
				top = partial_end;
				partial_end = 0;
			}
			wait_on_page_writeback(page);
			zero_user_segment(page, partial_start, top);
			cleancache_invalidate_page(mapping, page);
			if (page_has_private(page))
				do_invalidatepage(page, partial_start,
						  top - partial_start);
			unlock_page(page);
			put_page(page);
		}
	}
	if (partial_end) {
		struct page *page = find_lock_page(mapping, end);

		if (page) {
			wait_on_page_writeback(page);
			zero_user_segment(page, 0, partial_end);
			cleancache_invalidate_page(mapping, page);
			if (page_has_private(page))
				do_invalidatepage(page, 0,
						  partial_end);
			unlock_page(page);
			put_page(page);
		}
	}
	/*
	 * If the truncation happened within a single page no pages
	 * will be released, just zeroed, so we can bail out now.
	 */
	if (start >= end)
		goto out;

	index = start;
	for ( ; ; ) {
		cond_resched();
		if (!pagevec_lookup_entries(&pvec, mapping, index,
			min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) {
			/* If all gone from start onwards, we're done */
			if (index == start)
				break;
			/* Otherwise restart to make sure all gone */
			index = start;
			continue;
		}
		if (index == start && indices[0] >= end) {
			/* All gone out of hole to be punched, we're done */
			pagevec_remove_exceptionals(&pvec);
			pagevec_release(&pvec);
			break;
		}
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* We rely upon deletion not changing page->index */
			index = indices[i];
			if (index >= end) {
				/* Restart punch to make sure all gone */
				index = start - 1;
				break;
			}

			if (radix_tree_exceptional_entry(page)) {
				truncate_exceptional_entry(mapping, index,
							   page);
				continue;
			}

			lock_page(page);
			WARN_ON(page_to_index(page) != index);
			wait_on_page_writeback(page);
			truncate_inode_page(mapping, page);
			unlock_page(page);
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		index++;
	}

out:
	cleancache_invalidate_inode(mapping);
}
EXPORT_SYMBOL(truncate_inode_pages_range);

/**
 * truncate_inode_pages - truncate *all* the pages from an offset
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 *
 * Called under (and serialised by) inode->i_mutex.
 *
 * Note: When this function returns, there can be a page in the process of
 * deletion (inside __delete_from_page_cache()) in the specified range. Thus
 * mapping->nrpages can be non-zero when this function returns even after
 * truncation of the whole mapping.
 */
void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
{
	truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
}
EXPORT_SYMBOL(truncate_inode_pages);

/**
 * truncate_inode_pages_final - truncate *all* pages before inode dies
 * @mapping: mapping to truncate
 *
 * Called under (and serialized by) inode->i_mutex.
 *
 * Filesystems have to use this in the .evict_inode path to inform the
 * VM that this is the final truncate and the inode is going away.
 */
void truncate_inode_pages_final(struct address_space *mapping)
{
	unsigned long nrexceptional;
	unsigned long nrpages;

	/*
	 * Page reclaim can not participate in regular inode lifetime
	 * management (can't call iput()) and thus can race with the
	 * inode teardown. Tell it when the address space is exiting,
	 * so that it does not install eviction information after the
	 * final truncate has begun.
	 */
	mapping_set_exiting(mapping);

	/*
	 * When reclaim installs eviction entries, it increases
	 * nrexceptional first, then decreases nrpages. Make sure we see
	 * this in the right order or we might miss an entry.
	 */
	nrpages = mapping->nrpages;
	smp_rmb();
	nrexceptional = mapping->nrexceptional;

	if (nrpages || nrexceptional) {
		/*
		 * As truncation uses a lockless tree lookup, cycle
		 * the tree lock to make sure any ongoing tree
		 * modification that does not see AS_EXITING is
		 * completed before starting the final truncate.
		 */
		spin_lock_irq(&mapping->tree_lock);
		spin_unlock_irq(&mapping->tree_lock);

		truncate_inode_pages(mapping, 0);
	}
}
EXPORT_SYMBOL(truncate_inode_pages_final);
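
/*
 * Illustrative sketch (not part of the original file): a simple filesystem
 * typically pairs the final truncate with clear_inode() in its
 * ->evict_inode() handler, roughly:
 *
 *	static void example_evict_inode(struct inode *inode)
 *	{
 *		truncate_inode_pages_final(&inode->i_data);
 *		clear_inode(inode);
 *	}
 *
 * (example_evict_inode is a hypothetical name; real filesystems usually
 * also write back or drop their private state here.)
 */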

/**
 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
 * @mapping: the address_space which holds the pages to invalidate
 * @start: the offset 'from' which to invalidate
 * @end: the offset 'to' which to invalidate (inclusive)
 *
 * This function only removes the unlocked pages; if you want to
 * remove all the pages of one inode, you must call truncate_inode_pages.
 *
 * invalidate_mapping_pages() will not block on IO activity. It will not
 * invalidate pages which are dirty, locked, under writeback or mapped into
 * pagetables.
 */
unsigned long invalidate_mapping_pages(struct address_space *mapping,
		pgoff_t start, pgoff_t end)
{
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	pgoff_t index = start;
	unsigned long ret;
	unsigned long count = 0;
	int i;

	pagevec_init(&pvec, 0);
	while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
			indices)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* We rely upon deletion not changing page->index */
			index = indices[i];
			if (index > end)
				break;

			if (radix_tree_exceptional_entry(page)) {
				invalidate_exceptional_entry(mapping, index,
							     page);
				continue;
			}

			if (!trylock_page(page))
				continue;

			WARN_ON(page_to_index(page) != index);

			/* Middle of THP: skip */
			if (PageTransTail(page)) {
				unlock_page(page);
				continue;
			} else if (PageTransHuge(page)) {
				index += HPAGE_PMD_NR - 1;
				i += HPAGE_PMD_NR - 1;
				/*
				 * 'end' is in the middle of THP. Don't
				 * invalidate the page as the part outside of
				 * 'end' could be still useful.
				 */
				if (index > end) {
					unlock_page(page);
					continue;
				}
			}

			ret = invalidate_inode_page(page);
			unlock_page(page);
			/*
			 * Invalidation is a hint that the page is no longer
			 * of interest, so try to speed up its reclaim.
			 */
			if (!ret)
				deactivate_file_page(page);
			count += ret;
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		cond_resched();
		index++;
	}
	return count;
}
EXPORT_SYMBOL(invalidate_mapping_pages);
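
/*
 * Illustrative sketch (not part of the original file): callers such as the
 * POSIX_FADV_DONTNEED path drop clean cached pages over a byte range
 * roughly like this (offset/len in bytes, locals hypothetical):
 *
 *	pgoff_t first = offset >> PAGE_SHIFT;
 *	pgoff_t last = (offset + len - 1) >> PAGE_SHIFT;
 *
 *	invalidate_mapping_pages(mapping, first, last);
 */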

/*
 * This is like invalidate_complete_page(), except it ignores the page's
 * refcount. We do this because invalidate_inode_pages2() needs stronger
 * invalidation guarantees, and cannot afford to leave pages behind because
 * shrink_page_list() has a temp ref on them, or because they're transiently
 * sitting in the lru_cache_add() pagevecs.
 */
static int
invalidate_complete_page2(struct address_space *mapping, struct page *page)
{
	unsigned long flags;

	if (page->mapping != mapping)
		return 0;

	if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
		return 0;

	spin_lock_irqsave(&mapping->tree_lock, flags);
	if (PageDirty(page))
		goto failed;

	BUG_ON(page_has_private(page));
	__delete_from_page_cache(page, NULL);
	spin_unlock_irqrestore(&mapping->tree_lock, flags);

	if (mapping->a_ops->freepage)
		mapping->a_ops->freepage(page);

	put_page(page);	/* pagecache ref */
	return 1;
failed:
	spin_unlock_irqrestore(&mapping->tree_lock, flags);
	return 0;
}

static int do_launder_page(struct address_space *mapping, struct page *page)
{
	if (!PageDirty(page))
		return 0;
	if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
		return 0;
	return mapping->a_ops->launder_page(page);
}

/**
 * invalidate_inode_pages2_range - remove range of pages from an address_space
 * @mapping: the address_space
 * @start: the page offset 'from' which to invalidate
 * @end: the page offset 'to' which to invalidate (inclusive)
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Returns -EBUSY if any pages could not be invalidated.
 */
int invalidate_inode_pages2_range(struct address_space *mapping,
				  pgoff_t start, pgoff_t end)
{
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	pgoff_t index;
	int i;
	int ret = 0;
	int ret2 = 0;
	int did_range_unmap = 0;

	if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
		goto out;

	pagevec_init(&pvec, 0);
	index = start;
	while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
			indices)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* We rely upon deletion not changing page->index */
			index = indices[i];
			if (index > end)
				break;

			if (radix_tree_exceptional_entry(page)) {
				if (!invalidate_exceptional_entry2(mapping,
								   index, page))
					ret = -EBUSY;
				continue;
			}

			lock_page(page);
			WARN_ON(page_to_index(page) != index);
			if (page->mapping != mapping) {
				unlock_page(page);
				continue;
			}
			wait_on_page_writeback(page);
			if (page_mapped(page)) {
				if (!did_range_unmap) {
					/*
					 * Zap the rest of the file in one hit.
					 */
					unmap_mapping_range(mapping,
					    (loff_t)index << PAGE_SHIFT,
					    (loff_t)(1 + end - index)
							<< PAGE_SHIFT,
							0);
					did_range_unmap = 1;
				} else {
					/*
					 * Just zap this page
					 */
					unmap_mapping_range(mapping,
					    (loff_t)index << PAGE_SHIFT,
					    PAGE_SIZE, 0);
				}
			}
			BUG_ON(page_mapped(page));
			ret2 = do_launder_page(mapping, page);
			if (ret2 == 0) {
				if (!invalidate_complete_page2(mapping, page))
					ret2 = -EBUSY;
			}
			if (ret2 < 0)
				ret = ret2;
			unlock_page(page);
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		cond_resched();
		index++;
	}
	/*
	 * For DAX we invalidate page tables after invalidating the radix
	 * tree. We could invalidate page tables while invalidating each
	 * entry, but that would be expensive. And doing range unmapping
	 * beforehand doesn't work, as we have no cheap way to tell whether
	 * a radix tree entry got remapped later.
	 */
	if (dax_mapping(mapping)) {
		unmap_mapping_range(mapping, (loff_t)start << PAGE_SHIFT,
				    (loff_t)(end - start + 1) << PAGE_SHIFT, 0);
	}
out:
	cleancache_invalidate_inode(mapping);
	return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);

/**
 * invalidate_inode_pages2 - remove all pages from an address_space
 * @mapping: the address_space
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Returns -EBUSY if any pages could not be invalidated.
 */
int invalidate_inode_pages2(struct address_space *mapping)
{
	return invalidate_inode_pages2_range(mapping, 0, -1);
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
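
/*
 * Illustrative sketch (not part of the original file): direct I/O write
 * paths keep the pagecache coherent by invalidating the affected range
 * around the write, along these lines (pos/count in bytes, names
 * hypothetical):
 *
 *	err = invalidate_inode_pages2_range(mapping,
 *			pos >> PAGE_SHIFT,
 *			(pos + count - 1) >> PAGE_SHIFT);
 *	if (err)
 *		return err;	// or fall back to buffered I/O
 */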

/**
 * truncate_pagecache - unmap and remove pagecache that has been truncated
 * @inode: inode
 * @newsize: new file size
 *
 * inode's new i_size must already be written before truncate_pagecache
 * is called.
 *
 * This function should typically be called before the filesystem
 * releases resources associated with the freed range (eg. deallocates
 * blocks). This way, pagecache will always stay logically coherent
 * with on-disk format, and the filesystem would not have to deal with
 * situations such as writepage being called for a page that has already
 * had its underlying blocks deallocated.
 */
void truncate_pagecache(struct inode *inode, loff_t newsize)
{
	struct address_space *mapping = inode->i_mapping;
	loff_t holebegin = round_up(newsize, PAGE_SIZE);

	/*
	 * unmap_mapping_range is called twice, first simply for
	 * efficiency so that truncate_inode_pages does fewer
	 * single-page unmaps. However after this first call, and
	 * before truncate_inode_pages finishes, it is possible for
	 * private pages to be COWed, which remain after
	 * truncate_inode_pages finishes, hence the second
	 * unmap_mapping_range call must be made for correctness.
	 */
	unmap_mapping_range(mapping, holebegin, 0, 1);
	truncate_inode_pages(mapping, newsize);
	unmap_mapping_range(mapping, holebegin, 0, 1);
}
EXPORT_SYMBOL(truncate_pagecache);

/**
 * truncate_setsize - update inode and pagecache for a new file size
 * @inode: inode
 * @newsize: new file size
 *
 * truncate_setsize updates i_size and performs pagecache truncation (if
 * necessary) to @newsize. It will typically be called from the filesystem's
 * setattr function when ATTR_SIZE is passed in.
 *
 * Must be called with a lock serializing truncates and writes (generally
 * i_mutex but e.g. xfs uses a different lock) and before all filesystem
 * specific block truncation has been performed.
 */
void truncate_setsize(struct inode *inode, loff_t newsize)
{
	loff_t oldsize = inode->i_size;

	i_size_write(inode, newsize);
	if (newsize > oldsize)
		pagecache_isize_extended(inode, oldsize, newsize);
	truncate_pagecache(inode, newsize);
}
EXPORT_SYMBOL(truncate_setsize);
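
/*
 * Illustrative sketch (not part of the original file): a filesystem's
 * ->setattr() commonly invokes this when ATTR_SIZE is set, roughly
 * (hypothetical helper, error handling trimmed):
 *
 *	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != inode->i_size) {
 *		truncate_setsize(inode, attr->ia_size);
 *		example_truncate_blocks(inode, attr->ia_size);
 *	}
 */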

/**
 * pagecache_isize_extended - update pagecache after extension of i_size
 * @inode: inode for which i_size was extended
 * @from: original inode size
 * @to: new inode size
 *
 * Handle extension of inode size either caused by extending truncate or by
 * write starting after current i_size. We mark the page straddling current
 * i_size RO so that page_mkwrite() is called on the nearest write access to
 * the page. This way the filesystem can be sure that page_mkwrite() is called
 * on the page before a user writes to the page via mmap after the i_size has
 * been changed.
 *
 * The function must be called after i_size is updated so that a page fault
 * coming after we unlock the page will already see the new i_size.
 * The function must be called while we still hold i_mutex - this not only
 * makes sure i_size is stable but also that userspace cannot observe the new
 * i_size value before we are prepared to store mmap writes at the new inode
 * size.
 */
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
{
	int bsize = i_blocksize(inode);
	loff_t rounded_from;
	struct page *page;
	pgoff_t index;

	WARN_ON(to > inode->i_size);

	if (from >= to || bsize == PAGE_SIZE)
		return;
	/* Page straddling @from will not have any hole block created? */
	rounded_from = round_up(from, bsize);
	if (to <= rounded_from || !(rounded_from & (PAGE_SIZE - 1)))
		return;

	index = from >> PAGE_SHIFT;
	page = find_lock_page(inode->i_mapping, index);
	/* Page not cached? Nothing to do */
	if (!page)
		return;
	/*
	 * See clear_page_dirty_for_io() for details why set_page_dirty()
	 * is needed.
	 */
	if (page_mkclean(page))
		set_page_dirty(page);
	unlock_page(page);
	put_page(page);
}
EXPORT_SYMBOL(pagecache_isize_extended);
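
/*
 * Illustrative sketch (not part of the original file): filesystems whose
 * block size is smaller than PAGE_SIZE call this right after publishing
 * the new i_size for an extending write, e.g. (locking omitted):
 *
 *	oldsize = inode->i_size;
 *	i_size_write(inode, newsize);
 *	pagecache_isize_extended(inode, oldsize, newsize);
 */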

/**
 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
 * @inode: inode
 * @lstart: offset of beginning of hole
 * @lend: offset of last byte of hole
 *
 * This function should typically be called before the filesystem
 * releases resources associated with the freed range (eg. deallocates
 * blocks). This way, pagecache will always stay logically coherent
 * with on-disk format, and the filesystem would not have to deal with
 * situations such as writepage being called for a page that has already
 * had its underlying blocks deallocated.
 */
void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	struct address_space *mapping = inode->i_mapping;
	loff_t unmap_start = round_up(lstart, PAGE_SIZE);
	loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1;
	/*
	 * This rounding is currently just for example: unmap_mapping_range
	 * expands its hole outwards, whereas we want it to contract the hole
	 * inwards.  However, existing callers of truncate_pagecache_range are
	 * doing their own page rounding first.  Note that unmap_mapping_range
	 * allows holelen 0 for all, and we allow lend -1 for end of file.
	 */

	/*
	 * Unlike in truncate_pagecache, unmap_mapping_range is called only
	 * once (before truncating pagecache), and without "even_cows" flag:
	 * hole-punching should not remove private COWed pages from the hole.
	 */
	if ((u64)unmap_end > (u64)unmap_start)
		unmap_mapping_range(mapping, unmap_start,
				    1 + unmap_end - unmap_start, 0);
	truncate_inode_pages_range(mapping, lstart, lend);
}
EXPORT_SYMBOL(truncate_pagecache_range);
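
/*
 * Illustrative sketch (not part of the original file): a hole-punching
 * ->fallocate(FALLOC_FL_PUNCH_HOLE) implementation typically removes the
 * pagecache for the hole before freeing the on-disk blocks, under the
 * relevant locks (hypothetical helper name):
 *
 *	truncate_pagecache_range(inode, offset, offset + len - 1);
 *	example_punch_hole_blocks(inode, offset, len);
 */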