xfs_aops.c

  1. /*
  2. * Copyright (c) 2000-2005 Silicon Graphics, Inc.
  3. * All Rights Reserved.
  4. *
  5. * This program is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU General Public License as
  7. * published by the Free Software Foundation.
  8. *
  9. * This program is distributed in the hope that it would be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with this program; if not, write the Free Software Foundation,
  16. * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  17. */
  18. #include "xfs.h"
  19. #include "xfs_shared.h"
  20. #include "xfs_format.h"
  21. #include "xfs_log_format.h"
  22. #include "xfs_trans_resv.h"
  23. #include "xfs_mount.h"
  24. #include "xfs_inode.h"
  25. #include "xfs_trans.h"
  26. #include "xfs_inode_item.h"
  27. #include "xfs_alloc.h"
  28. #include "xfs_error.h"
  29. #include "xfs_iomap.h"
  30. #include "xfs_trace.h"
  31. #include "xfs_bmap.h"
  32. #include "xfs_bmap_util.h"
  33. #include "xfs_bmap_btree.h"
  34. #include "xfs_reflink.h"
  35. #include <linux/gfp.h>
  36. #include <linux/mpage.h>
  37. #include <linux/pagevec.h>
  38. #include <linux/writeback.h>
  39. /*
  40. * structure owned by writepages passed to individual writepage calls
  41. */
  42. struct xfs_writepage_ctx {
  43. struct xfs_bmbt_irec imap;
  44. bool imap_valid;
  45. unsigned int io_type;
  46. struct xfs_ioend *ioend;
  47. sector_t last_block;
  48. };
  49. void
  50. xfs_count_page_state(
  51. struct page *page,
  52. int *delalloc,
  53. int *unwritten)
  54. {
  55. struct buffer_head *bh, *head;
  56. *delalloc = *unwritten = 0;
  57. bh = head = page_buffers(page);
  58. do {
  59. if (buffer_unwritten(bh))
  60. (*unwritten) = 1;
  61. else if (buffer_delay(bh))
  62. (*delalloc) = 1;
  63. } while ((bh = bh->b_this_page) != head);
  64. }
  65. struct block_device *
  66. xfs_find_bdev_for_inode(
  67. struct inode *inode)
  68. {
  69. struct xfs_inode *ip = XFS_I(inode);
  70. struct xfs_mount *mp = ip->i_mount;
  71. if (XFS_IS_REALTIME_INODE(ip))
  72. return mp->m_rtdev_targp->bt_bdev;
  73. else
  74. return mp->m_ddev_targp->bt_bdev;
  75. }
  76. /*
  77. * We're now finished for good with this page. Update the page state via the
  78. * associated buffer_heads, paying attention to the start and end offsets that
  79. * we need to process on the page.
  80. *
  81. * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
  82. * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
  83. * the page at all, as we may be racing with memory reclaim and it can free both
  84. * the bufferhead chain and the page as it will see the page as clean and
  85. * unused.
  86. */
  87. static void
  88. xfs_finish_page_writeback(
  89. struct inode *inode,
  90. struct bio_vec *bvec,
  91. int error)
  92. {
  93. unsigned int end = bvec->bv_offset + bvec->bv_len - 1;
  94. struct buffer_head *head, *bh, *next;
  95. unsigned int off = 0;
  96. unsigned int bsize;
  97. ASSERT(bvec->bv_offset < PAGE_SIZE);
  98. ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0);
  99. ASSERT(end < PAGE_SIZE);
  100. ASSERT((bvec->bv_len & ((1 << inode->i_blkbits) - 1)) == 0);
  101. bh = head = page_buffers(bvec->bv_page);
  102. bsize = bh->b_size;
  103. do {
  104. next = bh->b_this_page;
  105. if (off < bvec->bv_offset)
  106. goto next_bh;
  107. if (off > end)
  108. break;
  109. bh->b_end_io(bh, !error);
  110. next_bh:
  111. off += bsize;
  112. } while ((bh = next) != head);
  113. }
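/*
 * Illustrative walk-through (values assumed for this example, not taken
 * from the code above): with 1024-byte blocks on a 4096-byte page and a
 * bvec with bv_offset == 1024 and bv_len == 2048 (so end == 3071), the
 * loop skips the buffer at offset 0, calls b_end_io() on the buffers at
 * offsets 1024 and 2048, and stops once off reaches 3072.
 */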
  114. /*
  115. * We're now finished for good with this ioend structure. Update the page
  116. * state, release holds on bios, and finally free up memory. Do not use the
  117. * ioend after this.
  118. */
  119. STATIC void
  120. xfs_destroy_ioend(
  121. struct xfs_ioend *ioend,
  122. int error)
  123. {
  124. struct inode *inode = ioend->io_inode;
  125. struct bio *last = ioend->io_bio;
  126. struct bio *bio, *next;
  127. for (bio = &ioend->io_inline_bio; bio; bio = next) {
  128. struct bio_vec *bvec;
  129. int i;
  130. /*
  131. * For the last bio, bi_private points to the ioend, so we
  132. * need to explicitly end the iteration here.
  133. */
  134. if (bio == last)
  135. next = NULL;
  136. else
  137. next = bio->bi_private;
  138. /* walk each page on bio, ending page IO on them */
  139. bio_for_each_segment_all(bvec, bio, i)
  140. xfs_finish_page_writeback(inode, bvec, error);
  141. bio_put(bio);
  142. }
  143. }
  144. /*
  145. * Fast and loose check if this write could update the on-disk inode size.
  146. */
  147. static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
  148. {
  149. return ioend->io_offset + ioend->io_size >
  150. XFS_I(ioend->io_inode)->i_d.di_size;
  151. }
  152. STATIC int
  153. xfs_setfilesize_trans_alloc(
  154. struct xfs_ioend *ioend)
  155. {
  156. struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
  157. struct xfs_trans *tp;
  158. int error;
  159. error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
  160. if (error)
  161. return error;
  162. ioend->io_append_trans = tp;
  163. /*
  164. * We may pass freeze protection with a transaction. So tell lockdep
  165. * we released it.
  166. */
  167. __sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
  168. /*
  169. * We hand off the transaction to the completion thread now, so
  170. * clear the flag here.
  171. */
  172. current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
  173. return 0;
  174. }
  175. /*
  176. * Update on-disk file size now that data has been written to disk.
  177. */
  178. STATIC int
  179. __xfs_setfilesize(
  180. struct xfs_inode *ip,
  181. struct xfs_trans *tp,
  182. xfs_off_t offset,
  183. size_t size)
  184. {
  185. xfs_fsize_t isize;
  186. xfs_ilock(ip, XFS_ILOCK_EXCL);
  187. isize = xfs_new_eof(ip, offset + size);
  188. if (!isize) {
  189. xfs_iunlock(ip, XFS_ILOCK_EXCL);
  190. xfs_trans_cancel(tp);
  191. return 0;
  192. }
  193. trace_xfs_setfilesize(ip, offset, size);
  194. ip->i_d.di_size = isize;
  195. xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
  196. xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
  197. return xfs_trans_commit(tp);
  198. }
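/*
 * Rough example of the logic above (sizes are illustrative assumptions):
 * if the in-core i_size is 10000 bytes and the on-disk di_size is still
 * 8192, completing I/O for the range [8192, 10000) should make
 * xfs_new_eof() return 10000, so di_size is updated and logged. If
 * di_size already covers the completed range, xfs_new_eof() returns 0
 * and the transaction is cancelled without dirtying the inode.
 */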
  199. int
  200. xfs_setfilesize(
  201. struct xfs_inode *ip,
  202. xfs_off_t offset,
  203. size_t size)
  204. {
  205. struct xfs_mount *mp = ip->i_mount;
  206. struct xfs_trans *tp;
  207. int error;
  208. error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
  209. if (error)
  210. return error;
  211. return __xfs_setfilesize(ip, tp, offset, size);
  212. }
  213. STATIC int
  214. xfs_setfilesize_ioend(
  215. struct xfs_ioend *ioend,
  216. int error)
  217. {
  218. struct xfs_inode *ip = XFS_I(ioend->io_inode);
  219. struct xfs_trans *tp = ioend->io_append_trans;
  220. /*
  221. * The transaction may have been allocated in the I/O submission thread,
  222. * thus we need to mark ourselves as being in a transaction manually.
  223. * Similarly for freeze protection.
  224. */
  225. current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
  226. __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
  227. /* we abort the update if there was an IO error */
  228. if (error) {
  229. xfs_trans_cancel(tp);
  230. return error;
  231. }
  232. return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
  233. }
  234. /*
  235. * IO write completion.
  236. */
  237. STATIC void
  238. xfs_end_io(
  239. struct work_struct *work)
  240. {
  241. struct xfs_ioend *ioend =
  242. container_of(work, struct xfs_ioend, io_work);
  243. struct xfs_inode *ip = XFS_I(ioend->io_inode);
  244. int error = ioend->io_bio->bi_error;
  245. /*
  246. * Set an error if the mount has shut down and proceed with end I/O
  247. * processing so it can perform whatever cleanups are necessary.
  248. */
  249. if (XFS_FORCED_SHUTDOWN(ip->i_mount))
  250. error = -EIO;
  251. /*
  252. * For a CoW extent, we need to move the mapping from the CoW fork
  253. * to the data fork. If instead an error happened, just dump the
  254. * new blocks.
  255. */
  256. if (ioend->io_type == XFS_IO_COW) {
  257. if (error)
  258. goto done;
  259. if (ioend->io_bio->bi_error) {
  260. error = xfs_reflink_cancel_cow_range(ip,
  261. ioend->io_offset, ioend->io_size);
  262. goto done;
  263. }
  264. error = xfs_reflink_end_cow(ip, ioend->io_offset,
  265. ioend->io_size);
  266. if (error)
  267. goto done;
  268. }
  269. /*
  270. * For unwritten extents we need to issue transactions to convert a
  271. * range to normal written extents after the data I/O has finished.
  272. * Detecting and handling completion IO errors is done individually
  273. * for each case as different cleanup operations need to be performed
  274. * on error.
  275. */
  276. if (ioend->io_type == XFS_IO_UNWRITTEN) {
  277. if (error)
  278. goto done;
  279. error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
  280. ioend->io_size);
  281. } else if (ioend->io_append_trans) {
  282. error = xfs_setfilesize_ioend(ioend, error);
  283. } else {
  284. ASSERT(!xfs_ioend_is_append(ioend) ||
  285. ioend->io_type == XFS_IO_COW);
  286. }
  287. done:
  288. xfs_destroy_ioend(ioend, error);
  289. }
  290. STATIC void
  291. xfs_end_bio(
  292. struct bio *bio)
  293. {
  294. struct xfs_ioend *ioend = bio->bi_private;
  295. struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
  296. if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)
  297. queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
  298. else if (ioend->io_append_trans)
  299. queue_work(mp->m_data_workqueue, &ioend->io_work);
  300. else
  301. xfs_destroy_ioend(ioend, bio->bi_error);
  302. }
  303. STATIC int
  304. xfs_map_blocks(
  305. struct inode *inode,
  306. loff_t offset,
  307. struct xfs_bmbt_irec *imap,
  308. int type)
  309. {
  310. struct xfs_inode *ip = XFS_I(inode);
  311. struct xfs_mount *mp = ip->i_mount;
  312. ssize_t count = 1 << inode->i_blkbits;
  313. xfs_fileoff_t offset_fsb, end_fsb;
  314. int error = 0;
  315. int bmapi_flags = XFS_BMAPI_ENTIRE;
  316. int nimaps = 1;
  317. if (XFS_FORCED_SHUTDOWN(mp))
  318. return -EIO;
  319. ASSERT(type != XFS_IO_COW);
  320. if (type == XFS_IO_UNWRITTEN)
  321. bmapi_flags |= XFS_BMAPI_IGSTATE;
  322. xfs_ilock(ip, XFS_ILOCK_SHARED);
  323. ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
  324. (ip->i_df.if_flags & XFS_IFEXTENTS));
  325. ASSERT(offset <= mp->m_super->s_maxbytes);
  326. if (offset + count > mp->m_super->s_maxbytes)
  327. count = mp->m_super->s_maxbytes - offset;
  328. end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
  329. offset_fsb = XFS_B_TO_FSBT(mp, offset);
  330. error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
  331. imap, &nimaps, bmapi_flags);
  332. /*
  333. * Truncate an overwrite extent if there's a pending CoW
  334. * reservation before the end of this extent. This forces us
  335. * to come back to writepage to take care of the CoW.
  336. */
  337. if (nimaps && type == XFS_IO_OVERWRITE)
  338. xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, imap);
  339. xfs_iunlock(ip, XFS_ILOCK_SHARED);
  340. if (error)
  341. return error;
  342. if (type == XFS_IO_DELALLOC &&
  343. (!nimaps || isnullstartblock(imap->br_startblock))) {
  344. error = xfs_iomap_write_allocate(ip, XFS_DATA_FORK, offset,
  345. imap);
  346. if (!error)
  347. trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
  348. return error;
  349. }
  350. #ifdef DEBUG
  351. if (type == XFS_IO_UNWRITTEN) {
  352. ASSERT(nimaps);
  353. ASSERT(imap->br_startblock != HOLESTARTBLOCK);
  354. ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
  355. }
  356. #endif
  357. if (nimaps)
  358. trace_xfs_map_blocks_found(ip, offset, count, type, imap);
  359. return 0;
  360. }
  361. STATIC bool
  362. xfs_imap_valid(
  363. struct inode *inode,
  364. struct xfs_bmbt_irec *imap,
  365. xfs_off_t offset)
  366. {
  367. offset >>= inode->i_blkbits;
  368. return offset >= imap->br_startoff &&
  369. offset < imap->br_startoff + imap->br_blockcount;
  370. }
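/*
 * Example (illustrative numbers only): assuming 4096-byte blocks
 * (i_blkbits == 12), an imap with br_startoff == 100 and
 * br_blockcount == 8 covers file blocks [100, 108), i.e. byte offsets
 * [409600, 442368). An offset of 440000 maps to block 107 and is valid;
 * 442368 maps to block 108 and is not.
 */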
  371. STATIC void
  372. xfs_start_buffer_writeback(
  373. struct buffer_head *bh)
  374. {
  375. ASSERT(buffer_mapped(bh));
  376. ASSERT(buffer_locked(bh));
  377. ASSERT(!buffer_delay(bh));
  378. ASSERT(!buffer_unwritten(bh));
  379. mark_buffer_async_write(bh);
  380. set_buffer_uptodate(bh);
  381. clear_buffer_dirty(bh);
  382. }
  383. STATIC void
  384. xfs_start_page_writeback(
  385. struct page *page,
  386. int clear_dirty)
  387. {
  388. ASSERT(PageLocked(page));
  389. ASSERT(!PageWriteback(page));
  390. /*
  391. * if the page was not fully cleaned, we need to ensure that the higher
  392. * layers come back to it correctly. That means we need to keep the page
  393. * dirty, and for WB_SYNC_ALL writeback we need to ensure the
  394. * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to
  395. * write this page in this writeback sweep will be made.
  396. */
  397. if (clear_dirty) {
  398. clear_page_dirty_for_io(page);
  399. set_page_writeback(page);
  400. } else
  401. set_page_writeback_keepwrite(page);
  402. unlock_page(page);
  403. }
  404. static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
  405. {
  406. return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
  407. }
  408. /*
  409. * Submit the bio for an ioend. We are passed an ioend with a bio attached to
  410. * it, and we submit that bio. The ioend may be used for multiple bio
  411. * submissions, so we only want to allocate an append transaction for the ioend
  412. * once. In the case of multiple bio submission, each bio will take an IO
  413. * reference to the ioend to ensure that the ioend completion is only done once
  414. * all bios have been submitted and the ioend is really done.
  415. *
  416. * If @status is non-zero, it means that we have a situation where some part of
  417. * the submission process has failed after we have marked pages for writeback
  418. * and unlocked them. In this situation, we need to fail the bio and ioend
  419. * rather than submit it to IO. This typically only happens on a filesystem
  420. * shutdown.
  421. */
  422. STATIC int
  423. xfs_submit_ioend(
  424. struct writeback_control *wbc,
  425. struct xfs_ioend *ioend,
  426. int status)
  427. {
  428. /* Reserve log space if we might write beyond the on-disk inode size. */
  429. if (!status &&
  430. ioend->io_type != XFS_IO_UNWRITTEN &&
  431. xfs_ioend_is_append(ioend) &&
  432. !ioend->io_append_trans)
  433. status = xfs_setfilesize_trans_alloc(ioend);
  434. ioend->io_bio->bi_private = ioend;
  435. ioend->io_bio->bi_end_io = xfs_end_bio;
  436. ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
  437. /*
  438. * If we are failing the IO now, just mark the ioend with an
  439. * error and finish it. This will run IO completion immediately
  440. * as there is only one reference to the ioend at this point in
  441. * time.
  442. */
  443. if (status) {
  444. ioend->io_bio->bi_error = status;
  445. bio_endio(ioend->io_bio);
  446. return status;
  447. }
  448. submit_bio(ioend->io_bio);
  449. return 0;
  450. }
  451. static void
  452. xfs_init_bio_from_bh(
  453. struct bio *bio,
  454. struct buffer_head *bh)
  455. {
  456. bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
  457. bio->bi_bdev = bh->b_bdev;
  458. }
  459. static struct xfs_ioend *
  460. xfs_alloc_ioend(
  461. struct inode *inode,
  462. unsigned int type,
  463. xfs_off_t offset,
  464. struct buffer_head *bh)
  465. {
  466. struct xfs_ioend *ioend;
  467. struct bio *bio;
  468. bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset);
  469. xfs_init_bio_from_bh(bio, bh);
  470. ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
  471. INIT_LIST_HEAD(&ioend->io_list);
  472. ioend->io_type = type;
  473. ioend->io_inode = inode;
  474. ioend->io_size = 0;
  475. ioend->io_offset = offset;
  476. INIT_WORK(&ioend->io_work, xfs_end_io);
  477. ioend->io_append_trans = NULL;
  478. ioend->io_bio = bio;
  479. return ioend;
  480. }
  481. /*
  482. * Allocate a new bio, and chain the old bio to the new one.
  483. *
  484. * Note that we have to perform the chaining in this unintuitive order
  485. * so that the bi_private linkage is set up in the right direction for the
  486. * traversal in xfs_destroy_ioend().
  487. */
  488. static void
  489. xfs_chain_bio(
  490. struct xfs_ioend *ioend,
  491. struct writeback_control *wbc,
  492. struct buffer_head *bh)
  493. {
  494. struct bio *new;
  495. new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
  496. xfs_init_bio_from_bh(new, bh);
  497. bio_chain(ioend->io_bio, new);
  498. bio_get(ioend->io_bio); /* for xfs_destroy_ioend */
  499. ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
  500. submit_bio(ioend->io_bio);
  501. ioend->io_bio = new;
  502. }
  503. /*
  504. * Test to see if we've been building up a completion structure for
  505. * earlier buffers -- if so, we try to append to this ioend if we
  506. * can, otherwise we finish off any current ioend and start another.
  507. * Return the ioend we finished off so that the caller can submit it
  508. * once it has finished processing the dirty page.
  509. */
  510. STATIC void
  511. xfs_add_to_ioend(
  512. struct inode *inode,
  513. struct buffer_head *bh,
  514. xfs_off_t offset,
  515. struct xfs_writepage_ctx *wpc,
  516. struct writeback_control *wbc,
  517. struct list_head *iolist)
  518. {
  519. if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
  520. bh->b_blocknr != wpc->last_block + 1 ||
  521. offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
  522. if (wpc->ioend)
  523. list_add(&wpc->ioend->io_list, iolist);
  524. wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, bh);
  525. }
  526. /*
  527. * If the buffer doesn't fit into the bio we need to allocate a new
  528. * one. This shouldn't happen more than once for a given buffer.
  529. */
  530. while (xfs_bio_add_buffer(wpc->ioend->io_bio, bh) != bh->b_size)
  531. xfs_chain_bio(wpc->ioend, wbc, bh);
  532. wpc->ioend->io_size += bh->b_size;
  533. wpc->last_block = bh->b_blocknr;
  534. xfs_start_buffer_writeback(bh);
  535. }
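/*
 * Contiguity example (assumed values, 4096-byte blocks): if the cached
 * ioend covers file offsets [409600, 417792) and wpc->last_block == 1001,
 * a buffer at file offset 417792 with b_blocknr == 1002 is appended to
 * it; a buffer whose b_blocknr jumps to, say, 1050 instead pushes the
 * cached ioend onto the iolist and starts a new one.
 */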
  536. STATIC void
  537. xfs_map_buffer(
  538. struct inode *inode,
  539. struct buffer_head *bh,
  540. struct xfs_bmbt_irec *imap,
  541. xfs_off_t offset)
  542. {
  543. sector_t bn;
  544. struct xfs_mount *m = XFS_I(inode)->i_mount;
  545. xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
  546. xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
  547. ASSERT(imap->br_startblock != HOLESTARTBLOCK);
  548. ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
  549. bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
  550. ((offset - iomap_offset) >> inode->i_blkbits);
  551. ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
  552. bh->b_blocknr = bn;
  553. set_buffer_mapped(bh);
  554. }
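/*
 * Worked example of the bn calculation (assumed values): iomap_bn is in
 * 512-byte basic blocks (BBSHIFT == 9). With 4096-byte blocks
 * (i_blkbits == 12), iomap_bn == 8000, br_startoff == 100 and
 * offset == 413696 (file block 101): bn = (8000 >> 3) + (4096 >> 12)
 * = 1000 + 1 = 1001, so the buffer maps to disk block 1001 in
 * filesystem-block units.
 */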
  555. STATIC void
  556. xfs_map_at_offset(
  557. struct inode *inode,
  558. struct buffer_head *bh,
  559. struct xfs_bmbt_irec *imap,
  560. xfs_off_t offset)
  561. {
  562. ASSERT(imap->br_startblock != HOLESTARTBLOCK);
  563. ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
  564. xfs_map_buffer(inode, bh, imap, offset);
  565. set_buffer_mapped(bh);
  566. clear_buffer_delay(bh);
  567. clear_buffer_unwritten(bh);
  568. }
  569. /*
  570. * Test if a given page contains at least one buffer of a given @type.
  571. * If @check_all_buffers is true, then we walk all the buffers in the page to
  572. * try to find one of the type passed in. If it is not set, then the caller only
  573. * needs to check the first buffer on the page for a match.
  574. */
  575. STATIC bool
  576. xfs_check_page_type(
  577. struct page *page,
  578. unsigned int type,
  579. bool check_all_buffers)
  580. {
  581. struct buffer_head *bh;
  582. struct buffer_head *head;
  583. if (PageWriteback(page))
  584. return false;
  585. if (!page->mapping)
  586. return false;
  587. if (!page_has_buffers(page))
  588. return false;
  589. bh = head = page_buffers(page);
  590. do {
  591. if (buffer_unwritten(bh)) {
  592. if (type == XFS_IO_UNWRITTEN)
  593. return true;
  594. } else if (buffer_delay(bh)) {
  595. if (type == XFS_IO_DELALLOC)
  596. return true;
  597. } else if (buffer_dirty(bh) && buffer_mapped(bh)) {
  598. if (type == XFS_IO_OVERWRITE)
  599. return true;
  600. }
  601. /* If we are only checking the first buffer, we are done now. */
  602. if (!check_all_buffers)
  603. break;
  604. } while ((bh = bh->b_this_page) != head);
  605. return false;
  606. }
  607. STATIC void
  608. xfs_vm_invalidatepage(
  609. struct page *page,
  610. unsigned int offset,
  611. unsigned int length)
  612. {
  613. trace_xfs_invalidatepage(page->mapping->host, page, offset,
  614. length);
  615. block_invalidatepage(page, offset, length);
  616. }
  617. /*
  618. * If the page has delalloc buffers on it, we need to punch them out before we
  619. * invalidate the page. If we don't, we leave a stale delalloc mapping on the
  620. * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
  621. * is done on that same region - the delalloc extent is returned when none is
  622. * supposed to be there.
  623. *
  624. * We prevent this by truncating away the delalloc regions on the page before
  625. * invalidating it. Because they are delalloc, we can do this without needing a
  626. * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
  627. * truncation without a transaction as there is no space left for block
  628. * reservation (typically why we see an ENOSPC in writeback).
  629. *
  630. * This is not a performance critical path, so for now just do the punching a
  631. * buffer head at a time.
  632. */
  633. STATIC void
  634. xfs_aops_discard_page(
  635. struct page *page)
  636. {
  637. struct inode *inode = page->mapping->host;
  638. struct xfs_inode *ip = XFS_I(inode);
  639. struct buffer_head *bh, *head;
  640. loff_t offset = page_offset(page);
  641. if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true))
  642. goto out_invalidate;
  643. if (XFS_FORCED_SHUTDOWN(ip->i_mount))
  644. goto out_invalidate;
  645. xfs_alert(ip->i_mount,
  646. "page discard on page %p, inode 0x%llx, offset %llu.",
  647. page, ip->i_ino, offset);
  648. xfs_ilock(ip, XFS_ILOCK_EXCL);
  649. bh = head = page_buffers(page);
  650. do {
  651. int error;
  652. xfs_fileoff_t start_fsb;
  653. if (!buffer_delay(bh))
  654. goto next_buffer;
  655. start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
  656. error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
  657. if (error) {
  658. /* something screwed, just bail */
  659. if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
  660. xfs_alert(ip->i_mount,
  661. "page discard unable to remove delalloc mapping.");
  662. }
  663. break;
  664. }
  665. next_buffer:
  666. offset += 1 << inode->i_blkbits;
  667. } while ((bh = bh->b_this_page) != head);
  668. xfs_iunlock(ip, XFS_ILOCK_EXCL);
  669. out_invalidate:
  670. xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
  671. return;
  672. }
  673. static int
  674. xfs_map_cow(
  675. struct xfs_writepage_ctx *wpc,
  676. struct inode *inode,
  677. loff_t offset,
  678. unsigned int *new_type)
  679. {
  680. struct xfs_inode *ip = XFS_I(inode);
  681. struct xfs_bmbt_irec imap;
  682. bool is_cow = false;
  683. int error;
  684. /*
  685. * If we already have a valid COW mapping keep using it.
  686. */
  687. if (wpc->io_type == XFS_IO_COW) {
  688. wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset);
  689. if (wpc->imap_valid) {
  690. *new_type = XFS_IO_COW;
  691. return 0;
  692. }
  693. }
  694. /*
  695. * Else we need to check if there is a COW mapping at this offset.
  696. */
  697. xfs_ilock(ip, XFS_ILOCK_SHARED);
  698. is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap);
  699. xfs_iunlock(ip, XFS_ILOCK_SHARED);
  700. if (!is_cow)
  701. return 0;
  702. /*
  703. * And if the COW mapping has a delayed extent here we need to
  704. * allocate real space for it now.
  705. */
  706. if (isnullstartblock(imap.br_startblock)) {
  707. error = xfs_iomap_write_allocate(ip, XFS_COW_FORK, offset,
  708. &imap);
  709. if (error)
  710. return error;
  711. }
  712. wpc->io_type = *new_type = XFS_IO_COW;
  713. wpc->imap_valid = true;
  714. wpc->imap = imap;
  715. return 0;
  716. }
  717. /*
  718. * We implement an immediate ioend submission policy here to avoid needing to
  719. * chain multiple ioends and hence nest mempool allocations which can violate
  720. * forward progress guarantees we need to provide. The current ioend we are
  721. * adding buffers to is cached on the writepage context, and if the new buffer
  722. * does not append to the cached ioend it will create a new ioend and cache that
  723. * instead.
  724. *
  725. * If a new ioend is created and cached, the old ioend is returned and queued
  726. * locally for submission once the entire page is processed or an error has been
  727. * detected. While ioends are submitted immediately after they are completed,
  728. * batching optimisations are provided by higher level block plugging.
  729. *
  730. * At the end of a writeback pass, there will be a cached ioend remaining on the
  731. * writepage context that the caller will need to submit.
  732. */
  733. static int
  734. xfs_writepage_map(
  735. struct xfs_writepage_ctx *wpc,
  736. struct writeback_control *wbc,
  737. struct inode *inode,
  738. struct page *page,
  739. loff_t offset,
  740. __uint64_t end_offset)
  741. {
  742. LIST_HEAD(submit_list);
  743. struct xfs_ioend *ioend, *next;
  744. struct buffer_head *bh, *head;
  745. ssize_t len = 1 << inode->i_blkbits;
  746. int error = 0;
  747. int count = 0;
  748. int uptodate = 1;
  749. unsigned int new_type;
  750. bh = head = page_buffers(page);
  751. offset = page_offset(page);
  752. do {
  753. if (offset >= end_offset)
  754. break;
  755. if (!buffer_uptodate(bh))
  756. uptodate = 0;
  757. /*
  758. * set_page_dirty dirties all buffers in a page, independent
  759. * of their state. The dirty state however is entirely
  760. * meaningless for holes (!mapped && uptodate), so skip
  761. * buffers covering holes here.
  762. */
  763. if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
  764. wpc->imap_valid = false;
  765. continue;
  766. }
  767. if (buffer_unwritten(bh))
  768. new_type = XFS_IO_UNWRITTEN;
  769. else if (buffer_delay(bh))
  770. new_type = XFS_IO_DELALLOC;
  771. else if (buffer_uptodate(bh))
  772. new_type = XFS_IO_OVERWRITE;
  773. else {
  774. if (PageUptodate(page))
  775. ASSERT(buffer_mapped(bh));
  776. /*
  777. * This buffer is not uptodate and will not be
  778. * written to disk. Ensure that we will put any
  779. * subsequent writeable buffers into a new
  780. * ioend.
  781. */
  782. wpc->imap_valid = false;
  783. continue;
  784. }
  785. if (xfs_is_reflink_inode(XFS_I(inode))) {
  786. error = xfs_map_cow(wpc, inode, offset, &new_type);
  787. if (error)
  788. goto out;
  789. }
  790. if (wpc->io_type != new_type) {
  791. wpc->io_type = new_type;
  792. wpc->imap_valid = false;
  793. }
  794. if (wpc->imap_valid)
  795. wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
  796. offset);
  797. if (!wpc->imap_valid) {
  798. error = xfs_map_blocks(inode, offset, &wpc->imap,
  799. wpc->io_type);
  800. if (error)
  801. goto out;
  802. wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
  803. offset);
  804. }
  805. if (wpc->imap_valid) {
  806. lock_buffer(bh);
  807. if (wpc->io_type != XFS_IO_OVERWRITE)
  808. xfs_map_at_offset(inode, bh, &wpc->imap, offset);
  809. xfs_add_to_ioend(inode, bh, offset, wpc, wbc, &submit_list);
  810. count++;
  811. }
  812. } while (offset += len, ((bh = bh->b_this_page) != head));
  813. if (uptodate && bh == head)
  814. SetPageUptodate(page);
  815. ASSERT(wpc->ioend || list_empty(&submit_list));
  816. out:
  817. /*
  818. * On error, we have to fail the ioend here because we have locked
  819. * buffers in the ioend. If we don't do this, we'll deadlock
  820. * invalidating the page as that tries to lock the buffers on the page.
  821. * Also, because we may have set pages under writeback, we have to make
  822. * sure we run IO completion to mark the error state of the IO
  823. * appropriately, so we can't cancel the ioend directly here. That means
  824. * we have to mark this page as under writeback if we included any
  825. * buffers from it in the ioend chain so that completion treats it
  826. * correctly.
  827. *
  828. * If we didn't include the page in the ioend, then on error we can
  829. * simply discard and unlock it as there are no other users of the page
  830. * or its buffers right now. The caller will still need to trigger
  831. * submission of outstanding ioends on the writepage context so they are
  832. * treated correctly on error.
  833. */
  834. if (count) {
  835. xfs_start_page_writeback(page, !error);
  836. /*
  837. * Preserve the original error if there was one, otherwise catch
  838. * submission errors here and propagate into subsequent ioend
  839. * submissions.
  840. */
  841. list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
  842. int error2;
  843. list_del_init(&ioend->io_list);
  844. error2 = xfs_submit_ioend(wbc, ioend, error);
  845. if (error2 && !error)
  846. error = error2;
  847. }
  848. } else if (error) {
  849. xfs_aops_discard_page(page);
  850. ClearPageUptodate(page);
  851. unlock_page(page);
  852. } else {
  853. /*
  854. * We can end up here with no error and nothing to write if we
  855. * race with a partial page truncate on a sub-page block sized
  856. * filesystem. In that case we need to mark the page clean.
  857. */
  858. xfs_start_page_writeback(page, 1);
  859. end_page_writeback(page);
  860. }
  861. mapping_set_error(page->mapping, error);
  862. return error;
  863. }
  864. /*
  865. * Write out a dirty page.
  866. *
  867. * For delalloc space on the page we need to allocate space and flush it.
  868. * For unwritten space on the page we need to start the conversion to
  869. * regular allocated space.
  870. * For any other dirty buffer heads on the page we should flush them.
  871. */
  872. STATIC int
  873. xfs_do_writepage(
  874. struct page *page,
  875. struct writeback_control *wbc,
  876. void *data)
  877. {
  878. struct xfs_writepage_ctx *wpc = data;
  879. struct inode *inode = page->mapping->host;
  880. loff_t offset;
  881. __uint64_t end_offset;
  882. pgoff_t end_index;
  883. trace_xfs_writepage(inode, page, 0, 0);
  884. ASSERT(page_has_buffers(page));
  885. /*
  886. * Refuse to write the page out if we are called from reclaim context.
  887. *
  888. * This avoids stack overflows when called from deeply used stacks in
  889. * random callers for direct reclaim or memcg reclaim. We explicitly
  890. * allow reclaim from kswapd as the stack usage there is relatively low.
  891. *
  892. * This should never happen except in the case of a VM regression so
  893. * warn about it.
  894. */
  895. if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
  896. PF_MEMALLOC))
  897. goto redirty;
  898. /*
  899. * Given that we do not allow direct reclaim to call us, we should
  900. * never be called while in a filesystem transaction.
  901. */
  902. if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
  903. goto redirty;
  904. /*
  905. * Is this page beyond the end of the file?
  906. *
  907. * The page index is less than the end_index, adjust the end_offset
  908. * to the highest offset that this page should represent.
  909. * -----------------------------------------------------
  910. * | file mapping | <EOF> |
  911. * -----------------------------------------------------
  912. * | Page ... | Page N-2 | Page N-1 | Page N | |
  913. * ^--------------------------------^----------|--------
  914. * | desired writeback range | see else |
  915. * ---------------------------------^------------------|
  916. */
  917. offset = i_size_read(inode);
  918. end_index = offset >> PAGE_SHIFT;
  919. if (page->index < end_index)
  920. end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT;
  921. else {
  922. /*
  923. * Check whether the page to write out is beyond or straddles
  924. * i_size or not.
  925. * -------------------------------------------------------
  926. * | file mapping | <EOF> |
  927. * -------------------------------------------------------
  928. * | Page ... | Page N-2 | Page N-1 | Page N | Beyond |
  929. * ^--------------------------------^-----------|---------
  930. * | | Straddles |
  931. * ---------------------------------^-----------|--------|
  932. */
  933. unsigned offset_into_page = offset & (PAGE_SIZE - 1);
  934. /*
  935. * Skip the page if it is fully outside i_size, e.g. due to a
  936. * truncate operation that is in progress. We must redirty the
  937. * page so that reclaim stops reclaiming it. Otherwise
  938. * xfs_vm_releasepage() is called on it and gets confused.
  939. *
  940. * Note that end_index is an unsigned long; it would overflow
  941. * if the given offset is greater than 16TB on a 32-bit system
  942. * and we checked whether the page is fully outside i_size
  943. * via "if (page->index >= end_index + 1)", as "end_index + 1"
  944. * would evaluate to 0. The page would then be redirtied
  945. * and written out repeatedly, resulting in an infinite loop;
  946. * the user program performing the operation would hang.
  947. * Instead, we verify this situation by checking whether the
  948. * page to write is totally beyond i_size or whether its
  949. * offset is exactly equal to the EOF.
  950. */
  951. if (page->index > end_index ||
  952. (page->index == end_index && offset_into_page == 0))
  953. goto redirty;
  954. /*
  955. * The page straddles i_size. It must be zeroed out on each
  956. * and every writepage invocation because it may be mmapped.
  957. * "A file is mapped in multiples of the page size. For a file
  958. * that is not a multiple of the page size, the remaining
  959. * memory is zeroed when mapped, and writes to that region are
  960. * not written out to the file."
  961. */
  962. zero_user_segment(page, offset_into_page, PAGE_SIZE);
  963. /* Adjust the end_offset to the end of file */
  964. end_offset = offset;
  965. }
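/*
 * For example (illustrative sizes): with PAGE_SIZE == 4096 and
 * i_size == 10000, end_index == 2. A page at index 1 takes the branch
 * above the else and gets end_offset == 8192; the page at index 2
 * straddles EOF (offset_into_page == 1808), has bytes 1808..4095 zeroed
 * and gets end_offset == 10000; pages at index 3 and beyond (or index 2
 * when i_size is exactly 8192) are redirtied and skipped.
 */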
  966. return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset);
  967. redirty:
  968. redirty_page_for_writepage(wbc, page);
  969. unlock_page(page);
  970. return 0;
  971. }
  972. STATIC int
  973. xfs_vm_writepage(
  974. struct page *page,
  975. struct writeback_control *wbc)
  976. {
  977. struct xfs_writepage_ctx wpc = {
  978. .io_type = XFS_IO_INVALID,
  979. };
  980. int ret;
  981. ret = xfs_do_writepage(page, wbc, &wpc);
  982. if (wpc.ioend)
  983. ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
  984. return ret;
  985. }
  986. STATIC int
  987. xfs_vm_writepages(
  988. struct address_space *mapping,
  989. struct writeback_control *wbc)
  990. {
  991. struct xfs_writepage_ctx wpc = {
  992. .io_type = XFS_IO_INVALID,
  993. };
  994. int ret;
  995. xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
  996. if (dax_mapping(mapping))
  997. return dax_writeback_mapping_range(mapping,
  998. xfs_find_bdev_for_inode(mapping->host), wbc);
  999. ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
  1000. if (wpc.ioend)
  1001. ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
  1002. return ret;
  1003. }
  1004. /*
  1005. * Called to move a page into cleanable state - and from there
  1006. * to be released. The page should already be clean. We always
  1007. * have buffer heads in this call.
  1008. *
  1009. * Returns 1 if the page is ok to release, 0 otherwise.
  1010. */
  1011. STATIC int
  1012. xfs_vm_releasepage(
  1013. struct page *page,
  1014. gfp_t gfp_mask)
  1015. {
  1016. int delalloc, unwritten;
  1017. trace_xfs_releasepage(page->mapping->host, page, 0, 0);
  1018. /*
  1019. * mm accommodates an old ext3 case where clean pages might not have had
  1020. * the dirty bit cleared. Thus, it can send actual dirty pages to
  1021. * ->releasepage() via shrink_active_list(). Conversely,
  1022. * block_invalidatepage() can send pages that are still marked dirty
  1023. * but otherwise have invalidated buffers.
  1024. *
  1025. * We want to release the latter to avoid unnecessary buildup of the
  1026. * LRU, skip the former and warn if we've left any lingering
  1027. * delalloc/unwritten buffers on clean pages. Skip pages with delalloc
  1028. * or unwritten buffers and warn if the page is not dirty. Otherwise
  1029. * try to release the buffers.
  1030. */
  1031. xfs_count_page_state(page, &delalloc, &unwritten);
  1032. if (delalloc) {
  1033. WARN_ON_ONCE(!PageDirty(page));
  1034. return 0;
  1035. }
  1036. if (unwritten) {
  1037. WARN_ON_ONCE(!PageDirty(page));
  1038. return 0;
  1039. }
  1040. return try_to_free_buffers(page);
  1041. }
  1042. /*
  1043. * If this is O_DIRECT or the mpage code calling, tell them how large the mapping
  1044. * is, so that we can avoid repeated get_blocks calls.
  1045. *
  1046. * If the mapping spans EOF, then we have to break the mapping up as the mapping
  1047. * for blocks beyond EOF must be marked new so that sub block regions can be
  1048. * correctly zeroed. We can't do this for mappings within EOF unless the mapping
  1049. * was just allocated or is unwritten, otherwise the callers would overwrite
  1050. * existing data with zeros. Hence we have to split the mapping into a range up
  1051. * to and including EOF, and a second mapping for beyond EOF.
  1052. */
  1053. static void
  1054. xfs_map_trim_size(
  1055. struct inode *inode,
  1056. sector_t iblock,
  1057. struct buffer_head *bh_result,
  1058. struct xfs_bmbt_irec *imap,
  1059. xfs_off_t offset,
  1060. ssize_t size)
  1061. {
  1062. xfs_off_t mapping_size;
  1063. mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
  1064. mapping_size <<= inode->i_blkbits;
  1065. ASSERT(mapping_size > 0);
  1066. if (mapping_size > size)
  1067. mapping_size = size;
  1068. if (offset < i_size_read(inode) &&
  1069. offset + mapping_size >= i_size_read(inode)) {
  1070. /* limit mapping to block that spans EOF */
  1071. mapping_size = roundup_64(i_size_read(inode) - offset,
  1072. 1 << inode->i_blkbits);
  1073. }
  1074. if (mapping_size > LONG_MAX)
  1075. mapping_size = LONG_MAX;
  1076. bh_result->b_size = mapping_size;
  1077. }
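/*
 * Trim example (assumed values): with 4096-byte blocks, i_size == 10000,
 * offset == 8192 (iblock == 2), a requested size of 16384 and an imap
 * covering blocks [0, 16): mapping_size starts as 14 blocks == 57344
 * bytes, is capped to the requested 16384, and because the mapping spans
 * EOF it is finally rounded to the block containing EOF:
 * roundup_64(10000 - 8192, 4096) == 4096, which becomes b_size.
 */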
  1078. static int
  1079. xfs_get_blocks(
  1080. struct inode *inode,
  1081. sector_t iblock,
  1082. struct buffer_head *bh_result,
  1083. int create)
  1084. {
  1085. struct xfs_inode *ip = XFS_I(inode);
  1086. struct xfs_mount *mp = ip->i_mount;
  1087. xfs_fileoff_t offset_fsb, end_fsb;
  1088. int error = 0;
  1089. int lockmode = 0;
  1090. struct xfs_bmbt_irec imap;
  1091. int nimaps = 1;
  1092. xfs_off_t offset;
  1093. ssize_t size;
  1094. BUG_ON(create);
  1095. if (XFS_FORCED_SHUTDOWN(mp))
  1096. return -EIO;
  1097. offset = (xfs_off_t)iblock << inode->i_blkbits;
  1098. ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
  1099. size = bh_result->b_size;
  1100. if (offset >= i_size_read(inode))
  1101. return 0;
  1102. /*
  1103. * Direct I/O is usually done on preallocated files, so try getting
  1104. * a block mapping without an exclusive lock first.
  1105. */
  1106. lockmode = xfs_ilock_data_map_shared(ip);
  1107. ASSERT(offset <= mp->m_super->s_maxbytes);
  1108. if (offset + size > mp->m_super->s_maxbytes)
  1109. size = mp->m_super->s_maxbytes - offset;
  1110. end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
  1111. offset_fsb = XFS_B_TO_FSBT(mp, offset);
  1112. error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
  1113. &imap, &nimaps, XFS_BMAPI_ENTIRE);
  1114. if (error)
  1115. goto out_unlock;
  1116. if (nimaps) {
  1117. trace_xfs_get_blocks_found(ip, offset, size,
  1118. ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
  1119. : XFS_IO_OVERWRITE, &imap);
  1120. xfs_iunlock(ip, lockmode);
  1121. } else {
  1122. trace_xfs_get_blocks_notfound(ip, offset, size);
  1123. goto out_unlock;
  1124. }
  1125. /* trim mapping down to size requested */
  1126. xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
  1127. /*
  1128. * For unwritten extents do not report a disk address in the buffered
  1129. * read case (treat as if we're reading into a hole).
  1130. */
  1131. if (imap.br_startblock != HOLESTARTBLOCK &&
  1132. imap.br_startblock != DELAYSTARTBLOCK &&
  1133. !ISUNWRITTEN(&imap))
  1134. xfs_map_buffer(inode, bh_result, &imap, offset);
  1135. /*
  1136. * If this is a realtime file, data may be on a different device
  1137. * to that pointed to by the buffer_head b_bdev currently.
  1138. */
  1139. bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
  1140. return 0;
  1141. out_unlock:
  1142. xfs_iunlock(ip, lockmode);
  1143. return error;
  1144. }
  1145. STATIC ssize_t
  1146. xfs_vm_direct_IO(
  1147. struct kiocb *iocb,
  1148. struct iov_iter *iter)
  1149. {
  1150. /*
  1151. * We just need the method present so that open/fcntl allow direct I/O.
  1152. */
  1153. return -EINVAL;
  1154. }
  1155. STATIC sector_t
  1156. xfs_vm_bmap(
  1157. struct address_space *mapping,
  1158. sector_t block)
  1159. {
  1160. struct inode *inode = (struct inode *)mapping->host;
  1161. struct xfs_inode *ip = XFS_I(inode);
  1162. trace_xfs_vm_bmap(XFS_I(inode));
  1163. /*
  1164. * The swap code (ab-)uses ->bmap to get a block mapping and then
  1165. * bypasses the file system for actual I/O. We really can't allow
  1166. * that on reflink inodes, so we have to skip out here. And yes,
  1167. * 0 is the magic code for a bmap error.
  1168. */
  1169. if (xfs_is_reflink_inode(ip))
  1170. return 0;
  1171. filemap_write_and_wait(mapping);
  1172. return generic_block_bmap(mapping, block, xfs_get_blocks);
  1173. }
  1174. STATIC int
  1175. xfs_vm_readpage(
  1176. struct file *unused,
  1177. struct page *page)
  1178. {
  1179. trace_xfs_vm_readpage(page->mapping->host, 1);
  1180. return mpage_readpage(page, xfs_get_blocks);
  1181. }
  1182. STATIC int
  1183. xfs_vm_readpages(
  1184. struct file *unused,
  1185. struct address_space *mapping,
  1186. struct list_head *pages,
  1187. unsigned nr_pages)
  1188. {
  1189. trace_xfs_vm_readpages(mapping->host, nr_pages);
  1190. return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
  1191. }
  1192. /*
  1193. * This is basically a copy of __set_page_dirty_buffers() with one
  1194. * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
  1195. * dirty, we'll never be able to clean them because we don't write buffers
  1196. * beyond EOF, and that means we can't invalidate pages that span EOF
  1197. * that have been marked dirty. Further, the dirty state can leak into
  1198. * the file interior if the file is extended, resulting in all sorts of
  1199. * bad things happening as the state does not match the underlying data.
  1200. *
  1201. * XXX: this really indicates that bufferheads in XFS need to die. Warts like
  1202. * this only exist because of bufferheads and how the generic code manages them.
  1203. */
  1204. STATIC int
  1205. xfs_vm_set_page_dirty(
  1206. struct page *page)
  1207. {
  1208. struct address_space *mapping = page->mapping;
  1209. struct inode *inode = mapping->host;
  1210. loff_t end_offset;
  1211. loff_t offset;
  1212. int newly_dirty;
  1213. if (unlikely(!mapping))
  1214. return !TestSetPageDirty(page);
  1215. end_offset = i_size_read(inode);
  1216. offset = page_offset(page);
  1217. spin_lock(&mapping->private_lock);
  1218. if (page_has_buffers(page)) {
  1219. struct buffer_head *head = page_buffers(page);
  1220. struct buffer_head *bh = head;
  1221. do {
  1222. if (offset < end_offset)
  1223. set_buffer_dirty(bh);
  1224. bh = bh->b_this_page;
  1225. offset += 1 << inode->i_blkbits;
  1226. } while (bh != head);
  1227. }
  1228. /*
  1229. * Lock out page->mem_cgroup migration to keep PageDirty
  1230. * synchronized with per-memcg dirty page counters.
  1231. */
  1232. lock_page_memcg(page);
  1233. newly_dirty = !TestSetPageDirty(page);
  1234. spin_unlock(&mapping->private_lock);
  1235. if (newly_dirty) {
  1236. /* sigh - __set_page_dirty() is static, so copy it here, too */
  1237. unsigned long flags;
  1238. spin_lock_irqsave(&mapping->tree_lock, flags);
  1239. if (page->mapping) { /* Race with truncate? */
  1240. WARN_ON_ONCE(!PageUptodate(page));
  1241. account_page_dirtied(page, mapping);
  1242. radix_tree_tag_set(&mapping->page_tree,
  1243. page_index(page), PAGECACHE_TAG_DIRTY);
  1244. }
  1245. spin_unlock_irqrestore(&mapping->tree_lock, flags);
  1246. }
  1247. unlock_page_memcg(page);
  1248. if (newly_dirty)
  1249. __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
  1250. return newly_dirty;
  1251. }
  1252. const struct address_space_operations xfs_address_space_operations = {
  1253. .readpage = xfs_vm_readpage,
  1254. .readpages = xfs_vm_readpages,
  1255. .writepage = xfs_vm_writepage,
  1256. .writepages = xfs_vm_writepages,
  1257. .set_page_dirty = xfs_vm_set_page_dirty,
  1258. .releasepage = xfs_vm_releasepage,
  1259. .invalidatepage = xfs_vm_invalidatepage,
  1260. .bmap = xfs_vm_bmap,
  1261. .direct_IO = xfs_vm_direct_IO,
  1262. .migratepage = buffer_migrate_page,
  1263. .is_partially_uptodate = block_is_partially_uptodate,
  1264. .error_remove_page = generic_error_remove_page,
  1265. };