xfs_aops.c

/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
#include <linux/gfp.h>
#include <linux/mpage.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>

/* flags for direct write completions */
#define XFS_DIO_FLAG_UNWRITTEN	(1 << 0)
#define XFS_DIO_FLAG_APPEND	(1 << 1)

/*
 * structure owned by writepages passed to individual writepage calls
 */
struct xfs_writepage_ctx {
	struct xfs_bmbt_irec	imap;
	bool			imap_valid;
	unsigned int		io_type;
	struct xfs_ioend	*ioend;
	sector_t		last_block;
};
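
/*
 * Illustrative note: a single xfs_writepage_ctx lives on the stack of
 * ->writepage()/->writepages() and is threaded, via the void *data argument
 * of write_cache_pages(), into every xfs_do_writepage() call of that
 * writeback pass.  A minimal sketch of the pattern (as used by
 * xfs_vm_writepages() later in this file):
 *
 *	struct xfs_writepage_ctx wpc = { .io_type = XFS_IO_INVALID };
 *	int ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
 *	if (wpc.ioend)
 *		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
 *
 * The cached imap/ioend let consecutive pages reuse one block mapping and
 * one ioend instead of remapping and reallocating per page.
 */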

void
xfs_count_page_state(
	struct page		*page,
	int			*delalloc,
	int			*unwritten)
{
	struct buffer_head	*bh, *head;

	*delalloc = *unwritten = 0;

	bh = head = page_buffers(page);
	do {
		if (buffer_unwritten(bh))
			(*unwritten) = 1;
		else if (buffer_delay(bh))
			(*delalloc) = 1;
	} while ((bh = bh->b_this_page) != head);
}

struct block_device *
xfs_find_bdev_for_inode(
	struct inode		*inode)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;

	if (XFS_IS_REALTIME_INODE(ip))
		return mp->m_rtdev_targp->bt_bdev;
	else
		return mp->m_ddev_targp->bt_bdev;
}

/*
 * We're now finished for good with this page.  Update the page state via the
 * associated buffer_heads, paying attention to the start and end offsets that
 * we need to process on the page.
 *
 * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
 * buffer in the IO. Once it does this, it is unsafe to access the bufferhead
 * or the page at all, as we may be racing with memory reclaim and it can free
 * both the bufferhead chain and the page as it will see the page as clean and
 * unused.
 */
static void
xfs_finish_page_writeback(
	struct inode		*inode,
	struct bio_vec		*bvec,
	int			error)
{
	unsigned int		end = bvec->bv_offset + bvec->bv_len - 1;
	struct buffer_head	*head, *bh, *next;
	unsigned int		off = 0;
	unsigned int		bsize;

	ASSERT(bvec->bv_offset < PAGE_SIZE);
	ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0);
	ASSERT(end < PAGE_SIZE);
	ASSERT((bvec->bv_len & ((1 << inode->i_blkbits) - 1)) == 0);

	bh = head = page_buffers(bvec->bv_page);

	bsize = bh->b_size;
	do {
		next = bh->b_this_page;
		if (off < bvec->bv_offset)
			goto next_bh;
		if (off > end)
			break;
		bh->b_end_io(bh, !error);
next_bh:
		off += bsize;
	} while ((bh = next) != head);
}

/*
 * We're now finished for good with this ioend structure.  Update the page
 * state, release holds on bios, and finally free up memory.  Do not use the
 * ioend after this.
 */
STATIC void
xfs_destroy_ioend(
	struct xfs_ioend	*ioend,
	int			error)
{
	struct inode		*inode = ioend->io_inode;
	struct bio		*last = ioend->io_bio;
	struct bio		*bio, *next;

	for (bio = &ioend->io_inline_bio; bio; bio = next) {
		struct bio_vec	*bvec;
		int		i;

		/*
		 * For the last bio, bi_private points to the ioend, so we
		 * need to explicitly end the iteration here.
		 */
		if (bio == last)
			next = NULL;
		else
			next = bio->bi_private;

		/* walk each page on bio, ending page IO on them */
		bio_for_each_segment_all(bvec, bio, i)
			xfs_finish_page_writeback(inode, bvec, error);

		bio_put(bio);
	}
}

/*
 * Fast and loose check if this write could update the on-disk inode size.
 */
static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
{
	return ioend->io_offset + ioend->io_size >
		XFS_I(ioend->io_inode)->i_d.di_size;
}

STATIC int
xfs_setfilesize_trans_alloc(
	struct xfs_ioend	*ioend)
{
	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
	if (error)
		return error;

	ioend->io_append_trans = tp;

	/*
	 * We may pass freeze protection with a transaction.  So tell lockdep
	 * we released it.
	 */
	__sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
	/*
	 * We hand off the transaction to the completion thread now, so
	 * clear the flag here.
	 */
	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
	return 0;
}

/*
 * Update on-disk file size now that data has been written to disk.
 */
STATIC int
xfs_setfilesize(
	struct xfs_inode	*ip,
	struct xfs_trans	*tp,
	xfs_off_t		offset,
	size_t			size)
{
	xfs_fsize_t		isize;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	isize = xfs_new_eof(ip, offset + size);
	if (!isize) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		xfs_trans_cancel(tp);
		return 0;
	}

	trace_xfs_setfilesize(ip, offset, size);

	ip->i_d.di_size = isize;
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	return xfs_trans_commit(tp);
}

STATIC int
xfs_setfilesize_ioend(
	struct xfs_ioend	*ioend,
	int			error)
{
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	struct xfs_trans	*tp = ioend->io_append_trans;

	/*
	 * The transaction may have been allocated in the I/O submission
	 * thread, thus we need to mark ourselves as being in a transaction
	 * manually.  Similarly for freeze protection.
	 */
	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
	__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);

	/* we abort the update if there was an IO error */
	if (error) {
		xfs_trans_cancel(tp);
		return error;
	}

	return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
}

/*
 * IO write completion.
 */
STATIC void
xfs_end_io(
	struct work_struct	*work)
{
	struct xfs_ioend	*ioend =
		container_of(work, struct xfs_ioend, io_work);
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	int			error = ioend->io_bio->bi_error;

	/*
	 * Set an error if the mount has shut down and proceed with end I/O
	 * processing so it can perform whatever cleanups are necessary.
	 */
	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		error = -EIO;

	/*
	 * For unwritten extents we need to issue transactions to convert a
	 * range to normal written extents after the data I/O has finished.
	 * Detecting and handling completion IO errors is done individually
	 * for each case as different cleanup operations need to be performed
	 * on error.
	 */
	if (ioend->io_type == XFS_IO_UNWRITTEN) {
		if (error)
			goto done;
		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
						  ioend->io_size);
	} else if (ioend->io_append_trans) {
		error = xfs_setfilesize_ioend(ioend, error);
	} else {
		ASSERT(!xfs_ioend_is_append(ioend));
	}

done:
	xfs_destroy_ioend(ioend, error);
}

STATIC void
xfs_end_bio(
	struct bio		*bio)
{
	struct xfs_ioend	*ioend = bio->bi_private;
	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;

	if (ioend->io_type == XFS_IO_UNWRITTEN)
		queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
	else if (ioend->io_append_trans)
		queue_work(mp->m_data_workqueue, &ioend->io_work);
	else
		xfs_destroy_ioend(ioend, bio->bi_error);
}
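
/*
 * Completion routing summary (as implemented above): unwritten extent
 * conversion and on-disk size updates both need transactions, so those
 * ioends are bounced from bio completion, which may run in interrupt
 * context, to the m_unwritten_workqueue or m_data_workqueue respectively,
 * where xfs_end_io() runs in process context.  Everything else is torn
 * down directly in xfs_end_bio().
 */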

STATIC int
xfs_map_blocks(
	struct inode		*inode,
	loff_t			offset,
	struct xfs_bmbt_irec	*imap,
	int			type)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			count = 1 << inode->i_blkbits;
	xfs_fileoff_t		offset_fsb, end_fsb;
	int			error = 0;
	int			bmapi_flags = XFS_BMAPI_ENTIRE;
	int			nimaps = 1;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	if (type == XFS_IO_UNWRITTEN)
		bmapi_flags |= XFS_BMAPI_IGSTATE;

	xfs_ilock(ip, XFS_ILOCK_SHARED);
	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
	       (ip->i_df.if_flags & XFS_IFEXTENTS));
	ASSERT(offset <= mp->m_super->s_maxbytes);

	if (offset + count > mp->m_super->s_maxbytes)
		count = mp->m_super->s_maxbytes - offset;
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
	offset_fsb = XFS_B_TO_FSBT(mp, offset);
	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
			       imap, &nimaps, bmapi_flags);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	if (error)
		return error;

	if (type == XFS_IO_DELALLOC &&
	    (!nimaps || isnullstartblock(imap->br_startblock))) {
		error = xfs_iomap_write_allocate(ip, offset, imap);
		if (!error)
			trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
		return error;
	}

#ifdef DEBUG
	if (type == XFS_IO_UNWRITTEN) {
		ASSERT(nimaps);
		ASSERT(imap->br_startblock != HOLESTARTBLOCK);
		ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
	}
#endif
	if (nimaps)
		trace_xfs_map_blocks_found(ip, offset, count, type, imap);
	return 0;
}

STATIC bool
xfs_imap_valid(
	struct inode		*inode,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	offset >>= inode->i_blkbits;

	return offset >= imap->br_startoff &&
		offset < imap->br_startoff + imap->br_blockcount;
}
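
/*
 * For example (illustrative numbers only): with 4k blocks (i_blkbits == 12),
 * a byte offset of 1M is file block 256; an imap with br_startoff == 200 and
 * br_blockcount == 100 covers file blocks [200, 300), so the check above
 * accepts it, while an imap starting at block 300 would force
 * xfs_writepage_map() to look up a new mapping.
 */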

STATIC void
xfs_start_buffer_writeback(
	struct buffer_head	*bh)
{
	ASSERT(buffer_mapped(bh));
	ASSERT(buffer_locked(bh));
	ASSERT(!buffer_delay(bh));
	ASSERT(!buffer_unwritten(bh));

	mark_buffer_async_write(bh);
	set_buffer_uptodate(bh);
	clear_buffer_dirty(bh);
}

STATIC void
xfs_start_page_writeback(
	struct page		*page,
	int			clear_dirty)
{
	ASSERT(PageLocked(page));
	ASSERT(!PageWriteback(page));

	/*
	 * if the page was not fully cleaned, we need to ensure that the
	 * higher layers come back to it correctly.  That means we need to
	 * keep the page dirty, and for WB_SYNC_ALL writeback we need to
	 * ensure the PAGECACHE_TAG_TOWRITE index mark is not removed so
	 * another attempt to write this page in this writeback sweep will be
	 * made.
	 */
	if (clear_dirty) {
		clear_page_dirty_for_io(page);
		set_page_writeback(page);
	} else
		set_page_writeback_keepwrite(page);

	unlock_page(page);
}

static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
{
	return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
}

/*
 * Submit the bio for an ioend. We are passed an ioend with a bio attached to
 * it, and we submit that bio. The ioend may be used for multiple bio
 * submissions, so we only want to allocate an append transaction for the ioend
 * once. In the case of multiple bio submission, each bio will take an IO
 * reference to the ioend to ensure that the ioend completion is only done once
 * all bios have been submitted and the ioend is really done.
 *
 * If @status is non-zero, it means that we have a situation where some part of
 * the submission process has failed after we have marked pages for writeback
 * and unlocked them. In this situation, we need to fail the bio and ioend
 * rather than submit it to IO. This typically only happens on a filesystem
 * shutdown.
 */
STATIC int
xfs_submit_ioend(
	struct writeback_control *wbc,
	struct xfs_ioend	*ioend,
	int			status)
{
	/* Reserve log space if we might write beyond the on-disk inode size. */
	if (!status &&
	    ioend->io_type != XFS_IO_UNWRITTEN &&
	    xfs_ioend_is_append(ioend) &&
	    !ioend->io_append_trans)
		status = xfs_setfilesize_trans_alloc(ioend);

	ioend->io_bio->bi_private = ioend;
	ioend->io_bio->bi_end_io = xfs_end_bio;
	bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
			 (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
	/*
	 * If we are failing the IO now, just mark the ioend with an
	 * error and finish it. This will run IO completion immediately
	 * as there is only one reference to the ioend at this point in
	 * time.
	 */
	if (status) {
		ioend->io_bio->bi_error = status;
		bio_endio(ioend->io_bio);
		return status;
	}

	submit_bio(ioend->io_bio);
	return 0;
}

static void
xfs_init_bio_from_bh(
	struct bio		*bio,
	struct buffer_head	*bh)
{
	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio->bi_bdev = bh->b_bdev;
}

static struct xfs_ioend *
xfs_alloc_ioend(
	struct inode		*inode,
	unsigned int		type,
	xfs_off_t		offset,
	struct buffer_head	*bh)
{
	struct xfs_ioend	*ioend;
	struct bio		*bio;

	bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset);
	xfs_init_bio_from_bh(bio, bh);

	ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
	INIT_LIST_HEAD(&ioend->io_list);
	ioend->io_type = type;
	ioend->io_inode = inode;
	ioend->io_size = 0;
	ioend->io_offset = offset;
	INIT_WORK(&ioend->io_work, xfs_end_io);
	ioend->io_append_trans = NULL;
	ioend->io_bio = bio;
	return ioend;
}

/*
 * Allocate a new bio, and chain the old bio to the new one.
 *
 * Note that we have to perform the chaining in this unintuitive order
 * so that the bi_private linkage is set up in the right direction for the
 * traversal in xfs_destroy_ioend().
 */
static void
xfs_chain_bio(
	struct xfs_ioend	*ioend,
	struct writeback_control *wbc,
	struct buffer_head	*bh)
{
	struct bio *new;

	new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
	xfs_init_bio_from_bh(new, bh);

	bio_chain(ioend->io_bio, new);
	bio_get(ioend->io_bio);		/* for xfs_destroy_ioend */
	bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
			 (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
	submit_bio(ioend->io_bio);
	ioend->io_bio = new;
}

/*
 * Test to see if we've been building up a completion structure for
 * earlier buffers -- if so, we try to append to this ioend if we
 * can, otherwise we finish off any current ioend and start another.
 * Return the ioend we finished off so that the caller can submit it
 * once it has finished processing the dirty page.
 */
STATIC void
xfs_add_to_ioend(
	struct inode		*inode,
	struct buffer_head	*bh,
	xfs_off_t		offset,
	struct xfs_writepage_ctx *wpc,
	struct writeback_control *wbc,
	struct list_head	*iolist)
{
	if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
	    bh->b_blocknr != wpc->last_block + 1 ||
	    offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
		if (wpc->ioend)
			list_add(&wpc->ioend->io_list, iolist);
		wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, bh);
	}

	/*
	 * If the buffer doesn't fit into the bio we need to allocate a new
	 * one.  This shouldn't happen more than once for a given buffer.
	 */
	while (xfs_bio_add_buffer(wpc->ioend->io_bio, bh) != bh->b_size)
		xfs_chain_bio(wpc->ioend, wbc, bh);

	wpc->ioend->io_size += bh->b_size;
	wpc->last_block = bh->b_blocknr;
	xfs_start_buffer_writeback(bh);
}
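
/*
 * In other words, a buffer is merged into the cached ioend only when all
 * three contiguity conditions above hold: same IO type, physically adjacent
 * to the last block added (b_blocknr == last_block + 1), and logically
 * adjacent in the file (offset == io_offset + io_size).  Any mismatch queues
 * the cached ioend on @iolist and starts a fresh one.
 */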

STATIC void
xfs_map_buffer(
	struct inode		*inode,
	struct buffer_head	*bh,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	sector_t		bn;
	struct xfs_mount	*m = XFS_I(inode)->i_mount;
	xfs_off_t		iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
	xfs_daddr_t		iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);

	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

	bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
	      ((offset - iomap_offset) >> inode->i_blkbits);

	ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));

	bh->b_blocknr = bn;
	set_buffer_mapped(bh);
}

STATIC void
xfs_map_at_offset(
	struct inode		*inode,
	struct buffer_head	*bh,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

	xfs_map_buffer(inode, bh, imap, offset);
	set_buffer_mapped(bh);
	clear_buffer_delay(bh);
	clear_buffer_unwritten(bh);
}

/*
 * Test if a given page contains at least one buffer of a given @type.
 * If @check_all_buffers is true, then we walk all the buffers in the page to
 * try to find one of the type passed in. If it is not set, then the caller
 * only needs to check the first buffer on the page for a match.
 */
STATIC bool
xfs_check_page_type(
	struct page		*page,
	unsigned int		type,
	bool			check_all_buffers)
{
	struct buffer_head	*bh;
	struct buffer_head	*head;

	if (PageWriteback(page))
		return false;
	if (!page->mapping)
		return false;
	if (!page_has_buffers(page))
		return false;

	bh = head = page_buffers(page);
	do {
		if (buffer_unwritten(bh)) {
			if (type == XFS_IO_UNWRITTEN)
				return true;
		} else if (buffer_delay(bh)) {
			if (type == XFS_IO_DELALLOC)
				return true;
		} else if (buffer_dirty(bh) && buffer_mapped(bh)) {
			if (type == XFS_IO_OVERWRITE)
				return true;
		}

		/* If we are only checking the first buffer, we are done now. */
		if (!check_all_buffers)
			break;
	} while ((bh = bh->b_this_page) != head);

	return false;
}

STATIC void
xfs_vm_invalidatepage(
	struct page		*page,
	unsigned int		offset,
	unsigned int		length)
{
	trace_xfs_invalidatepage(page->mapping->host, page, offset,
				 length);
	block_invalidatepage(page, offset, length);
}

/*
 * If the page has delalloc buffers on it, we need to punch them out before we
 * invalidate the page. If we don't, we leave a stale delalloc mapping on the
 * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
 * is done on that same region - the delalloc extent is returned when none is
 * supposed to be there.
 *
 * We prevent this by truncating away the delalloc regions on the page before
 * invalidating it. Because they are delalloc, we can do this without needing a
 * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
 * truncation without a transaction as there is no space left for block
 * reservation (typically why we see an ENOSPC in writeback).
 *
 * This is not a performance critical path, so for now just do the punching a
 * buffer head at a time.
 */
STATIC void
xfs_aops_discard_page(
	struct page		*page)
{
	struct inode		*inode = page->mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct buffer_head	*bh, *head;
	loff_t			offset = page_offset(page);

	if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true))
		goto out_invalidate;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		goto out_invalidate;

	xfs_alert(ip->i_mount,
		"page discard on page %p, inode 0x%llx, offset %llu.",
			page, ip->i_ino, offset);

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	bh = head = page_buffers(page);
	do {
		int		error;
		xfs_fileoff_t	start_fsb;

		if (!buffer_delay(bh))
			goto next_buffer;

		start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
		error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
		if (error) {
			/* something screwed, just bail */
			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
				xfs_alert(ip->i_mount,
			"page discard unable to remove delalloc mapping.");
			}
			break;
		}
next_buffer:
		offset += 1 << inode->i_blkbits;

	} while ((bh = bh->b_this_page) != head);

	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_invalidate:
	xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
	return;
}

/*
 * We implement an immediate ioend submission policy here to avoid needing to
 * chain multiple ioends and hence nest mempool allocations which can violate
 * forward progress guarantees we need to provide. The current ioend we are
 * adding buffers to is cached on the writepage context, and if the new buffer
 * does not append to the cached ioend it will create a new ioend and cache
 * that instead.
 *
 * If a new ioend is created and cached, the old ioend is returned and queued
 * locally for submission once the entire page is processed or an error has
 * been detected. While ioends are submitted immediately after they are
 * completed, batching optimisations are provided by higher level block
 * plugging.
 *
 * At the end of a writeback pass, there will be a cached ioend remaining on
 * the writepage context that the caller will need to submit.
 */
static int
xfs_writepage_map(
	struct xfs_writepage_ctx *wpc,
	struct writeback_control *wbc,
	struct inode		*inode,
	struct page		*page,
	loff_t			offset,
	__uint64_t		end_offset)
{
	LIST_HEAD(submit_list);
	struct xfs_ioend	*ioend, *next;
	struct buffer_head	*bh, *head;
	ssize_t			len = 1 << inode->i_blkbits;
	int			error = 0;
	int			count = 0;
	int			uptodate = 1;

	bh = head = page_buffers(page);
	offset = page_offset(page);
	do {
		if (offset >= end_offset)
			break;
		if (!buffer_uptodate(bh))
			uptodate = 0;

		/*
		 * set_page_dirty dirties all buffers in a page, independent
		 * of their state.  The dirty state however is entirely
		 * meaningless for holes (!mapped && uptodate), so skip
		 * buffers covering holes here.
		 */
		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
			wpc->imap_valid = false;
			continue;
		}

		if (buffer_unwritten(bh)) {
			if (wpc->io_type != XFS_IO_UNWRITTEN) {
				wpc->io_type = XFS_IO_UNWRITTEN;
				wpc->imap_valid = false;
			}
		} else if (buffer_delay(bh)) {
			if (wpc->io_type != XFS_IO_DELALLOC) {
				wpc->io_type = XFS_IO_DELALLOC;
				wpc->imap_valid = false;
			}
		} else if (buffer_uptodate(bh)) {
			if (wpc->io_type != XFS_IO_OVERWRITE) {
				wpc->io_type = XFS_IO_OVERWRITE;
				wpc->imap_valid = false;
			}
		} else {
			if (PageUptodate(page))
				ASSERT(buffer_mapped(bh));
			/*
			 * This buffer is not uptodate and will not be
			 * written to disk.  Ensure that we will put any
			 * subsequent writeable buffers into a new
			 * ioend.
			 */
			wpc->imap_valid = false;
			continue;
		}

		if (wpc->imap_valid)
			wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
							 offset);
		if (!wpc->imap_valid) {
			error = xfs_map_blocks(inode, offset, &wpc->imap,
					       wpc->io_type);
			if (error)
				goto out;
			wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
							 offset);
		}
		if (wpc->imap_valid) {
			lock_buffer(bh);
			if (wpc->io_type != XFS_IO_OVERWRITE)
				xfs_map_at_offset(inode, bh, &wpc->imap, offset);
			xfs_add_to_ioend(inode, bh, offset, wpc, wbc, &submit_list);
			count++;
		}

	} while (offset += len, ((bh = bh->b_this_page) != head));

	if (uptodate && bh == head)
		SetPageUptodate(page);

	ASSERT(wpc->ioend || list_empty(&submit_list));

out:
	/*
	 * On error, we have to fail the ioend here because we have locked
	 * buffers in the ioend. If we don't do this, we'll deadlock
	 * invalidating the page as that tries to lock the buffers on the page.
	 * Also, because we may have set pages under writeback, we have to make
	 * sure we run IO completion to mark the error state of the IO
	 * appropriately, so we can't cancel the ioend directly here. That
	 * means we have to mark this page as under writeback if we included
	 * any buffers from it in the ioend chain so that completion treats it
	 * correctly.
	 *
	 * If we didn't include the page in the ioend, then on error we can
	 * simply discard and unlock it as there are no other users of the page
	 * or its buffers right now. The caller will still need to trigger
	 * submission of outstanding ioends on the writepage context so they
	 * are treated correctly on error.
	 */
	if (count) {
		xfs_start_page_writeback(page, !error);

		/*
		 * Preserve the original error if there was one, otherwise
		 * catch submission errors here and propagate into subsequent
		 * ioend submissions.
		 */
		list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
			int error2;

			list_del_init(&ioend->io_list);
			error2 = xfs_submit_ioend(wbc, ioend, error);
			if (error2 && !error)
				error = error2;
		}
	} else if (error) {
		xfs_aops_discard_page(page);
		ClearPageUptodate(page);
		unlock_page(page);
	} else {
		/*
		 * We can end up here with no error and nothing to write if we
		 * race with a partial page truncate on a sub-page block sized
		 * filesystem. In that case we need to mark the page clean.
		 */
		xfs_start_page_writeback(page, 1);
		end_page_writeback(page);
	}

	mapping_set_error(page->mapping, error);
	return error;
}

/*
 * Write out a dirty page.
 *
 * For delalloc space on the page we need to allocate space and flush it.
 * For unwritten space on the page we need to start the conversion to
 * regular allocated space.
 * For any other dirty buffer heads on the page we should flush them.
 */
STATIC int
xfs_do_writepage(
	struct page		*page,
	struct writeback_control *wbc,
	void			*data)
{
	struct xfs_writepage_ctx *wpc = data;
	struct inode		*inode = page->mapping->host;
	loff_t			offset;
	__uint64_t		end_offset;
	pgoff_t			end_index;

	trace_xfs_writepage(inode, page, 0, 0);

	ASSERT(page_has_buffers(page));

	/*
	 * Refuse to write the page out if we are called from reclaim context.
	 *
	 * This avoids stack overflows when called from deeply used stacks in
	 * random callers for direct reclaim or memcg reclaim.  We explicitly
	 * allow reclaim from kswapd as the stack usage there is relatively
	 * low.
	 *
	 * This should never happen except in the case of a VM regression so
	 * warn about it.
	 */
	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
			PF_MEMALLOC))
		goto redirty;

	/*
	 * Given that we do not allow direct reclaim to call us, we should
	 * never be called while in a filesystem transaction.
	 */
	if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
		goto redirty;

	/*
	 * Is this page beyond the end of the file?
	 *
	 * The page index is less than the end_index, adjust the end_offset
	 * to the highest offset that this page should represent.
	 * -----------------------------------------------------
	 * |			file mapping	       | <EOF> |
	 * -----------------------------------------------------
	 * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
	 * ^--------------------------------^----------|--------
	 * |     desired writeback range    |      see else    |
	 * ---------------------------------^------------------|
	 */
	offset = i_size_read(inode);
	end_index = offset >> PAGE_SHIFT;
	if (page->index < end_index)
		end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT;
	else {
		/*
		 * Check whether the page to write out is beyond or straddles
		 * i_size or not.
		 * -------------------------------------------------------
		 * |		file mapping		    |   <EOF>    |
		 * -------------------------------------------------------
		 * | Page ... | Page N-2 | Page N-1 |  Page N  |  Beyond  |
		 * ^--------------------------------^----------|----------
		 * |				    | Straddles |
		 * ---------------------------------^----------|---------|
		 */
		unsigned offset_into_page = offset & (PAGE_SIZE - 1);

		/*
		 * Skip the page if it is fully outside i_size, e.g. due to a
		 * truncate operation that is in progress. We must redirty the
		 * page so that reclaim stops reclaiming it. Otherwise
		 * xfs_vm_releasepage() is called on it and gets confused.
		 *
		 * Note that the end_index is unsigned long, it would overflow
		 * if the given offset is greater than 16TB on a 32-bit system
		 * and if we do check the page is fully outside i_size or not
		 * via "if (page->index >= end_index + 1)" as "end_index + 1"
		 * will be evaluated to 0.  Hence this page will be redirtied
		 * and written out repeatedly, which would result in an
		 * infinite loop; the user program that performs this operation
		 * will hang.  Instead, we can verify this situation by
		 * checking if the page to write is totally beyond the i_size
		 * or if its offset is just equal to the EOF.
		 */
		if (page->index > end_index ||
		    (page->index == end_index && offset_into_page == 0))
			goto redirty;

		/*
		 * The page straddles i_size.  It must be zeroed out on each
		 * and every writepage invocation because it may be mmapped.
		 * "A file is mapped in multiples of the page size.  For a file
		 * that is not a multiple of the page size, the remaining
		 * memory is zeroed when mapped, and writes to that region are
		 * not written out to the file."
		 */
		zero_user_segment(page, offset_into_page, PAGE_SIZE);

		/* Adjust the end_offset to the end of file */
		end_offset = offset;
	}

	return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset);

redirty:
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return 0;
}

STATIC int
xfs_vm_writepage(
	struct page		*page,
	struct writeback_control *wbc)
{
	struct xfs_writepage_ctx wpc = {
		.io_type = XFS_IO_INVALID,
	};
	int			ret;

	ret = xfs_do_writepage(page, wbc, &wpc);
	if (wpc.ioend)
		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
	return ret;
}

STATIC int
xfs_vm_writepages(
	struct address_space	*mapping,
	struct writeback_control *wbc)
{
	struct xfs_writepage_ctx wpc = {
		.io_type = XFS_IO_INVALID,
	};
	int			ret;

	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
	if (dax_mapping(mapping))
		return dax_writeback_mapping_range(mapping,
				xfs_find_bdev_for_inode(mapping->host), wbc);

	ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
	if (wpc.ioend)
		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
	return ret;
}
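
/*
 * Note that both writeback entry points above finish the same way: after the
 * page walk, whatever ioend is still cached on the on-stack
 * xfs_writepage_ctx is handed to xfs_submit_ioend(), with any earlier error
 * passed in as @status so a failed pass is completed rather than silently
 * dropped.
 */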

/*
 * Called to move a page into cleanable state - and from there
 * to be released. The page should already be clean. We always
 * have buffer heads in this call.
 *
 * Returns 1 if the page is ok to release, 0 otherwise.
 */
STATIC int
xfs_vm_releasepage(
	struct page		*page,
	gfp_t			gfp_mask)
{
	int			delalloc, unwritten;

	trace_xfs_releasepage(page->mapping->host, page, 0, 0);

	/*
	 * mm accommodates an old ext3 case where clean pages might not have
	 * had the dirty bit cleared. Thus, it can send actual dirty pages to
	 * ->releasepage() via shrink_active_list(). Conversely,
	 * block_invalidatepage() can send pages that are still marked dirty
	 * but otherwise have invalidated buffers.
	 *
	 * We've historically freed buffers on the latter. Instead, quietly
	 * filter out all dirty pages to avoid spurious buffer state warnings.
	 * This can likely be removed once shrink_active_list() is fixed.
	 */
	if (PageDirty(page))
		return 0;

	xfs_count_page_state(page, &delalloc, &unwritten);

	if (WARN_ON_ONCE(delalloc))
		return 0;
	if (WARN_ON_ONCE(unwritten))
		return 0;

	return try_to_free_buffers(page);
}

/*
 * When we map a DIO buffer, we may need to pass flags to
 * xfs_end_io_direct_write to tell it what kind of write IO we are doing.
 *
 * Note that for DIO, an IO to the highest supported file block offset (i.e.
 * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
 * bit variable. Hence if we see this overflow, we have to assume that the IO
 * is extending the file size. We won't know for sure until IO completion is
 * run and the actual max write offset is communicated to the IO completion
 * routine.
 */
static void
xfs_map_direct(
	struct inode		*inode,
	struct buffer_head	*bh_result,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	uintptr_t		*flags = (uintptr_t *)&bh_result->b_private;
	xfs_off_t		size = bh_result->b_size;

	trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size,
		ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, imap);

	if (ISUNWRITTEN(imap)) {
		*flags |= XFS_DIO_FLAG_UNWRITTEN;
		set_buffer_defer_completion(bh_result);
	} else if (offset + size > i_size_read(inode) || offset + size < 0) {
		*flags |= XFS_DIO_FLAG_APPEND;
		set_buffer_defer_completion(bh_result);
	}
}
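
/*
 * The flag word is smuggled through bh_result->b_private as a bare uintptr_t
 * bitmask rather than a pointer, so the direct IO completion recovers it
 * with a cast (see xfs_end_io_direct_write() below).  For example, a write
 * into an unwritten extent that also extends i_size arrives at completion
 * with just XFS_DIO_FLAG_UNWRITTEN set, since the two branches above are
 * mutually exclusive and the unwritten case takes precedence.
 */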

/*
 * If this is O_DIRECT or the mpage code calling, tell them how large the
 * mapping is, so that we can avoid repeated get_blocks calls.
 *
 * If the mapping spans EOF, then we have to break the mapping up as the
 * mapping for blocks beyond EOF must be marked new so that sub block regions
 * can be correctly zeroed. We can't do this for mappings within EOF unless
 * the mapping was just allocated or is unwritten, otherwise the callers would
 * overwrite existing data with zeros. Hence we have to split the mapping into
 * a range up to and including EOF, and a second mapping for beyond EOF.
 */
static void
xfs_map_trim_size(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset,
	ssize_t			size)
{
	xfs_off_t		mapping_size;

	mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
	mapping_size <<= inode->i_blkbits;

	ASSERT(mapping_size > 0);
	if (mapping_size > size)
		mapping_size = size;
	if (offset < i_size_read(inode) &&
	    offset + mapping_size >= i_size_read(inode)) {
		/* limit mapping to block that spans EOF */
		mapping_size = roundup_64(i_size_read(inode) - offset,
					  1 << inode->i_blkbits);
	}
	if (mapping_size > LONG_MAX)
		mapping_size = LONG_MAX;

	bh_result->b_size = mapping_size;
}
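
/*
 * A worked example with made-up numbers: 4k blocks, i_size == 10k,
 * iblock == 1 (offset 4k), a 1024-block extent starting at file block 0, and
 * a 1M request.  The extent alone would allow (0 + 1024 - 1) << 12 bytes,
 * which is first clamped to the 1M asked for; because that mapping would
 * cross EOF, it is clamped again to roundup_64(10k - 4k, 4k) = 8k, i.e. up
 * to and including the block that spans EOF, so the region beyond EOF can be
 * handed out separately and marked new.
 */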

STATIC int
__xfs_get_blocks(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create,
	bool			direct,
	bool			dax_fault)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		offset_fsb, end_fsb;
	int			error = 0;
	int			lockmode = 0;
	struct xfs_bmbt_irec	imap;
	int			nimaps = 1;
	xfs_off_t		offset;
	ssize_t			size;
	int			new = 0;

	BUG_ON(create && !direct);

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	offset = (xfs_off_t)iblock << inode->i_blkbits;
	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
	size = bh_result->b_size;

	if (!create && offset >= i_size_read(inode))
		return 0;

	/*
	 * Direct I/O is usually done on preallocated files, so try getting
	 * a block mapping without an exclusive lock first.
	 */
	lockmode = xfs_ilock_data_map_shared(ip);

	ASSERT(offset <= mp->m_super->s_maxbytes);
	if (offset + size > mp->m_super->s_maxbytes)
		size = mp->m_super->s_maxbytes - offset;
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
	offset_fsb = XFS_B_TO_FSBT(mp, offset);

	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
			       &imap, &nimaps, XFS_BMAPI_ENTIRE);
	if (error)
		goto out_unlock;

	/* for DAX, we convert unwritten extents directly */
	if (create &&
	    (!nimaps ||
	     (imap.br_startblock == HOLESTARTBLOCK ||
	      imap.br_startblock == DELAYSTARTBLOCK) ||
	     (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
		/*
		 * xfs_iomap_write_direct() expects the shared lock. It
		 * is unlocked on return.
		 */
		if (lockmode == XFS_ILOCK_EXCL)
			xfs_ilock_demote(ip, lockmode);

		error = xfs_iomap_write_direct(ip, offset, size,
					       &imap, nimaps);
		if (error)
			return error;
		new = 1;

		trace_xfs_get_blocks_alloc(ip, offset, size,
				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
						   : XFS_IO_DELALLOC, &imap);
	} else if (nimaps) {
		trace_xfs_get_blocks_found(ip, offset, size,
				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
						   : XFS_IO_OVERWRITE, &imap);
		xfs_iunlock(ip, lockmode);
	} else {
		trace_xfs_get_blocks_notfound(ip, offset, size);
		goto out_unlock;
	}

	if (IS_DAX(inode) && create) {
		ASSERT(!ISUNWRITTEN(&imap));
		/* zeroing is not needed at a higher layer */
		new = 0;
	}

	/* trim mapping down to size requested */
	xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);

	/*
	 * For unwritten extents do not report a disk address in the buffered
	 * read case (treat as if we're reading into a hole).
	 */
	if (imap.br_startblock != HOLESTARTBLOCK &&
	    imap.br_startblock != DELAYSTARTBLOCK &&
	    (create || !ISUNWRITTEN(&imap))) {
		xfs_map_buffer(inode, bh_result, &imap, offset);
		if (ISUNWRITTEN(&imap))
			set_buffer_unwritten(bh_result);
		/* direct IO needs special help */
		if (create) {
			if (dax_fault)
				ASSERT(!ISUNWRITTEN(&imap));
			else
				xfs_map_direct(inode, bh_result, &imap, offset);
		}
	}

	/*
	 * If this is a realtime file, data may be on a different device
	 * to that pointed to from the buffer_head b_bdev currently.
	 */
	bh_result->b_bdev = xfs_find_bdev_for_inode(inode);

	/*
	 * If we previously allocated a block out beyond eof and we are now
	 * coming back to use it then we will need to flag it as new even if it
	 * has a disk address.
	 *
	 * With sub-block writes into unwritten extents we also need to mark
	 * the buffer as new so that the unwritten parts of the buffer get
	 * correctly zeroed.
	 */
	if (create &&
	    ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
	     (offset >= i_size_read(inode)) ||
	     (new || ISUNWRITTEN(&imap))))
		set_buffer_new(bh_result);

	BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK);

	return 0;

out_unlock:
	xfs_iunlock(ip, lockmode);
	return error;
}

int
xfs_get_blocks(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create)
{
	return __xfs_get_blocks(inode, iblock, bh_result, create, false, false);
}

int
xfs_get_blocks_direct(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create)
{
	return __xfs_get_blocks(inode, iblock, bh_result, create, true, false);
}

int
xfs_get_blocks_dax_fault(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create)
{
	return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
}
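
/*
 * The three wrappers above only differ in the (direct, dax_fault) flags they
 * pass to __xfs_get_blocks(): xfs_get_blocks() is the buffered/bmap variant
 * (false, false), xfs_get_blocks_direct() is used for direct IO
 * (true, false), and xfs_get_blocks_dax_fault() is used from the DAX fault
 * path (true, true), where unwritten mappings are not expected.
 */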

/*
 * Complete a direct I/O write request.
 *
 * xfs_map_direct passes us some flags in the private data to tell us what to
 * do. If no flags are set, then the write IO is an overwrite wholly within
 * the existing allocated file size and so there is nothing for us to do.
 *
 * Note that in this case the completion can be called in interrupt context,
 * whereas if we have flags set we will always be called in task context
 * (i.e. from a workqueue).
 */
int
xfs_end_io_direct_write(
	struct kiocb		*iocb,
	loff_t			offset,
	ssize_t			size,
	void			*private)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	uintptr_t		flags = (uintptr_t)private;
	int			error = 0;

	trace_xfs_end_io_direct_write(ip, offset, size);

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	if (size <= 0)
		return size;

	/*
	 * The flags tell us whether we are doing unwritten extent conversions
	 * or an append transaction that updates the on-disk file size. These
	 * cases are the only cases where we should *potentially* be needing
	 * to update the VFS inode size.
	 */
	if (flags == 0) {
		ASSERT(offset + size <= i_size_read(inode));
		return 0;
	}

	/*
	 * We need to update the in-core inode size here so that we don't end
	 * up with the on-disk inode size being outside the in-core inode
	 * size. We have no other method of updating EOF for AIO, so always do
	 * it here if necessary.
	 *
	 * We need to lock the test/set EOF update as we can be racing with
	 * other IO completions here to update the EOF. Failing to serialise
	 * here can result in EOF moving backwards and Bad Things Happen when
	 * that occurs.
	 */
	spin_lock(&ip->i_flags_lock);
	if (offset + size > i_size_read(inode))
		i_size_write(inode, offset + size);
	spin_unlock(&ip->i_flags_lock);

	if (flags & XFS_DIO_FLAG_UNWRITTEN) {
		trace_xfs_end_io_direct_write_unwritten(ip, offset, size);

		error = xfs_iomap_write_unwritten(ip, offset, size);
	} else if (flags & XFS_DIO_FLAG_APPEND) {
		struct xfs_trans *tp;

		trace_xfs_end_io_direct_write_append(ip, offset, size);

		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0,
				&tp);
		if (!error)
			error = xfs_setfilesize(ip, tp, offset, size);
	}

	return error;
}

STATIC ssize_t
xfs_vm_direct_IO(
	struct kiocb		*iocb,
	struct iov_iter		*iter)
{
	/*
	 * We just need the method present so that open/fcntl allow direct I/O.
	 */
	return -EINVAL;
}

STATIC sector_t
xfs_vm_bmap(
	struct address_space	*mapping,
	sector_t		block)
{
	struct inode		*inode = (struct inode *)mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);

	trace_xfs_vm_bmap(XFS_I(inode));
	xfs_ilock(ip, XFS_IOLOCK_SHARED);
	filemap_write_and_wait(mapping);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	return generic_block_bmap(mapping, block, xfs_get_blocks);
}

STATIC int
xfs_vm_readpage(
	struct file		*unused,
	struct page		*page)
{
	trace_xfs_vm_readpage(page->mapping->host, 1);
	return mpage_readpage(page, xfs_get_blocks);
}

STATIC int
xfs_vm_readpages(
	struct file		*unused,
	struct address_space	*mapping,
	struct list_head	*pages,
	unsigned		nr_pages)
{
	trace_xfs_vm_readpages(mapping->host, nr_pages);
	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
}

/*
 * This is basically a copy of __set_page_dirty_buffers() with one
 * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
 * dirty, we'll never be able to clean them because we don't write buffers
 * beyond EOF, and that means we can't invalidate pages that span EOF
 * that have been marked dirty. Further, the dirty state can leak into
 * the file interior if the file is extended, resulting in all sorts of
 * bad things happening as the state does not match the underlying data.
 *
 * XXX: this really indicates that bufferheads in XFS need to die. Warts like
 * this only exist because of bufferheads and how the generic code manages
 * them.
 */
STATIC int
xfs_vm_set_page_dirty(
	struct page		*page)
{
	struct address_space	*mapping = page->mapping;
	struct inode		*inode = mapping->host;
	loff_t			end_offset;
	loff_t			offset;
	int			newly_dirty;

	if (unlikely(!mapping))
		return !TestSetPageDirty(page);

	end_offset = i_size_read(inode);
	offset = page_offset(page);

	spin_lock(&mapping->private_lock);
	if (page_has_buffers(page)) {
		struct buffer_head *head = page_buffers(page);
		struct buffer_head *bh = head;

		do {
			if (offset < end_offset)
				set_buffer_dirty(bh);
			bh = bh->b_this_page;
			offset += 1 << inode->i_blkbits;
		} while (bh != head);
	}
	/*
	 * Lock out page->mem_cgroup migration to keep PageDirty
	 * synchronized with per-memcg dirty page counters.
	 */
	lock_page_memcg(page);
	newly_dirty = !TestSetPageDirty(page);
	spin_unlock(&mapping->private_lock);

	if (newly_dirty) {
		/* sigh - __set_page_dirty() is static, so copy it here, too */
		unsigned long flags;

		spin_lock_irqsave(&mapping->tree_lock, flags);
		if (page->mapping) {	/* Race with truncate? */
			WARN_ON_ONCE(!PageUptodate(page));
			account_page_dirtied(page, mapping);
			radix_tree_tag_set(&mapping->page_tree,
					page_index(page), PAGECACHE_TAG_DIRTY);
		}
		spin_unlock_irqrestore(&mapping->tree_lock, flags);
	}
	unlock_page_memcg(page);
	if (newly_dirty)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
	return newly_dirty;
}

const struct address_space_operations xfs_address_space_operations = {
	.readpage		= xfs_vm_readpage,
	.readpages		= xfs_vm_readpages,
	.writepage		= xfs_vm_writepage,
	.writepages		= xfs_vm_writepages,
	.set_page_dirty		= xfs_vm_set_page_dirty,
	.releasepage		= xfs_vm_releasepage,
	.invalidatepage		= xfs_vm_invalidatepage,
	.bmap			= xfs_vm_bmap,
	.direct_IO		= xfs_vm_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate	= block_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
};