iomap.c

  1. /*
  2. * Copyright (C) 2010 Red Hat, Inc.
  3. * Copyright (c) 2016-2018 Christoph Hellwig.
  4. *
  5. * This program is free software; you can redistribute it and/or modify it
  6. * under the terms and conditions of the GNU General Public License,
  7. * version 2, as published by the Free Software Foundation.
  8. *
  9. * This program is distributed in the hope it will be useful, but WITHOUT
  10. * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11. * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  12. * more details.
  13. */
  14. #include <linux/module.h>
  15. #include <linux/compiler.h>
  16. #include <linux/fs.h>
  17. #include <linux/iomap.h>
  18. #include <linux/uaccess.h>
  19. #include <linux/gfp.h>
  20. #include <linux/migrate.h>
  21. #include <linux/mm.h>
  22. #include <linux/mm_inline.h>
  23. #include <linux/swap.h>
  24. #include <linux/pagemap.h>
  25. #include <linux/pagevec.h>
  26. #include <linux/file.h>
  27. #include <linux/uio.h>
  28. #include <linux/backing-dev.h>
  29. #include <linux/buffer_head.h>
  30. #include <linux/task_io_accounting_ops.h>
  31. #include <linux/dax.h>
  32. #include <linux/sched/signal.h>
  33. #include "internal.h"
  34. /*
  35. * Execute an iomap write on a segment of the mapping that spans a
  36. * contiguous range of pages that have identical block mapping state.
  37. *
  38. * This avoids the need to map pages individually, do individual allocations
  39. * for each page and most importantly avoid the need for filesystem specific
  40. * locking per page. Instead, all the operations are amortised over the entire
  41. * range of pages. It is assumed that the filesystems will lock whatever
  42. * resources they require in the iomap_begin call, and release them in the
  43. * iomap_end call.
  44. */
  45. loff_t
  46. iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
  47. const struct iomap_ops *ops, void *data, iomap_actor_t actor)
  48. {
  49. struct iomap iomap = { 0 };
  50. loff_t written = 0, ret;
  51. /*
  52. * Need to map a range from start position for length bytes. This can
  53. * span multiple pages - it is only guaranteed to return a range of a
  54. * single type of pages (e.g. all into a hole, all mapped or all
  55. * unwritten). Failure at this point has nothing to undo.
  56. *
  57. * If allocation is required for this range, reserve the space now so
  58. * that the allocation is guaranteed to succeed later on. Once we copy
  59. * the data into the page cache pages, then we cannot fail otherwise we
  60. * expose transient stale data. If the reserve fails, we can safely
  61. * back out at this point as there is nothing to undo.
  62. */
  63. ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
  64. if (ret)
  65. return ret;
  66. if (WARN_ON(iomap.offset > pos))
  67. return -EIO;
  68. if (WARN_ON(iomap.length == 0))
  69. return -EIO;
  70. /*
  71. * Cut down the length to the one actually provided by the filesystem,
  72. * as it might not be able to give us the whole size that we requested.
  73. */
  74. if (iomap.offset + iomap.length < pos + length)
  75. length = iomap.offset + iomap.length - pos;
  76. /*
  77. * Now that we have guaranteed that the space allocation will succeed,
  78. * we can do the copy-in page by page without having to worry about
  79. * failures exposing transient data.
  80. */
  81. written = actor(inode, pos, length, data, &iomap);
  82. /*
  83. * Now the data has been copied, commit the range we've copied. This
  84. * should not fail unless the filesystem has had a fatal error.
  85. */
  86. if (ops->iomap_end) {
  87. ret = ops->iomap_end(inode, pos, length,
  88. written > 0 ? written : 0,
  89. flags, &iomap);
  90. }
  91. return written ? written : ret;
  92. }
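/*
 * Illustrative sketch (not part of the original file): roughly how a
 * filesystem plugs into iomap_apply().  The callback signatures below mirror
 * the ops->iomap_begin()/ops->iomap_end() calls above; the "examplefs_" names
 * and the one-block mapping are hypothetical and only stand in for a real
 * block-mapping lookup.
 */
static int examplefs_iomap_begin(struct inode *inode, loff_t pos,
		loff_t length, unsigned flags, struct iomap *iomap)
{
	/*
	 * A real filesystem looks up (and, for IOMAP_WRITE, allocates) the
	 * extent covering pos here, taking whatever locks it needs until
	 * ->iomap_end.  This stub just reports one block-sized hole, which
	 * is only good enough for read-type operations.
	 */
	iomap->offset = pos & ~(loff_t)(i_blocksize(inode) - 1);
	iomap->length = i_blocksize(inode);
	iomap->type = IOMAP_HOLE;
	iomap->addr = IOMAP_NULL_ADDR;
	iomap->bdev = inode->i_sb->s_bdev;
	return 0;
}

static int examplefs_iomap_end(struct inode *inode, loff_t pos,
		loff_t length, ssize_t written, unsigned flags,
		struct iomap *iomap)
{
	/* Release whatever ->iomap_begin pinned; trim unused reservations. */
	return 0;
}

static const struct iomap_ops examplefs_iomap_ops = {
	.iomap_begin	= examplefs_iomap_begin,
	.iomap_end	= examplefs_iomap_end,
};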
  93. static sector_t
  94. iomap_sector(struct iomap *iomap, loff_t pos)
  95. {
  96. return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
  97. }
  98. static struct iomap_page *
  99. iomap_page_create(struct inode *inode, struct page *page)
  100. {
  101. struct iomap_page *iop = to_iomap_page(page);
  102. if (iop || i_blocksize(inode) == PAGE_SIZE)
  103. return iop;
  104. iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL);
  105. atomic_set(&iop->read_count, 0);
  106. atomic_set(&iop->write_count, 0);
  107. bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE);
  108. /*
  109. * migrate_page_move_mapping() assumes that pages with private data have
  110. * their count elevated by 1.
  111. */
  112. get_page(page);
  113. set_page_private(page, (unsigned long)iop);
  114. SetPagePrivate(page);
  115. return iop;
  116. }
  117. static void
  118. iomap_page_release(struct page *page)
  119. {
  120. struct iomap_page *iop = to_iomap_page(page);
  121. if (!iop)
  122. return;
  123. WARN_ON_ONCE(atomic_read(&iop->read_count));
  124. WARN_ON_ONCE(atomic_read(&iop->write_count));
  125. ClearPagePrivate(page);
  126. set_page_private(page, 0);
  127. put_page(page);
  128. kfree(iop);
  129. }
  130. /*
  131. * Calculate the range inside the page that we actually need to read.
  132. */
  133. static void
  134. iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
  135. loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp)
  136. {
  137. loff_t orig_pos = *pos;
  138. loff_t isize = i_size_read(inode);
  139. unsigned block_bits = inode->i_blkbits;
  140. unsigned block_size = (1 << block_bits);
  141. unsigned poff = offset_in_page(*pos);
  142. unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
  143. unsigned first = poff >> block_bits;
  144. unsigned last = (poff + plen - 1) >> block_bits;
  145. /*
  146. * If the block size is smaller than the page size we need to check the
  147. * per-block uptodate status and adjust the offset and length if needed
  148. * to avoid reading in already uptodate ranges.
  149. */
  150. if (iop) {
  151. unsigned int i;
  152. /* move forward for each leading block marked uptodate */
  153. for (i = first; i <= last; i++) {
  154. if (!test_bit(i, iop->uptodate))
  155. break;
  156. *pos += block_size;
  157. poff += block_size;
  158. plen -= block_size;
  159. first++;
  160. }
  161. /* truncate len if we find any trailing uptodate block(s) */
  162. for ( ; i <= last; i++) {
  163. if (test_bit(i, iop->uptodate)) {
  164. plen -= (last - i + 1) * block_size;
  165. last = i - 1;
  166. break;
  167. }
  168. }
  169. }
  170. /*
  171. * If the extent spans the block that contains the i_size we need to
  172. * handle both halves separately so that we properly zero data in the
  173. * page cache for blocks that are entirely outside of i_size.
  174. */
  175. if (orig_pos <= isize && orig_pos + length > isize) {
  176. unsigned end = offset_in_page(isize - 1) >> block_bits;
  177. if (first <= end && last > end)
  178. plen -= (last - end) * block_size;
  179. }
  180. *offp = poff;
  181. *lenp = plen;
  182. }
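/*
 * Worked example for iomap_adjust_read_range() (illustrative, assuming a
 * 4096-byte page and a 1024-byte block size, i.e. four blocks per page):
 * if blocks 0, 1 and 3 are already uptodate in iop->uptodate and the caller
 * asks for the whole page, the first loop advances *pos/poff past blocks 0-1
 * (poff = 2048, plen = 2048) and the second loop trims the trailing uptodate
 * block 3 (plen = 1024), so only block 2 (bytes 2048-3071) is actually read.
 */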
  183. static void
  184. iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len)
  185. {
  186. struct iomap_page *iop = to_iomap_page(page);
  187. struct inode *inode = page->mapping->host;
  188. unsigned first = off >> inode->i_blkbits;
  189. unsigned last = (off + len - 1) >> inode->i_blkbits;
  190. unsigned int i;
  191. bool uptodate = true;
  192. if (iop) {
  193. for (i = 0; i < PAGE_SIZE / i_blocksize(inode); i++) {
  194. if (i >= first && i <= last)
  195. set_bit(i, iop->uptodate);
  196. else if (!test_bit(i, iop->uptodate))
  197. uptodate = false;
  198. }
  199. }
  200. if (uptodate && !PageError(page))
  201. SetPageUptodate(page);
  202. }
  203. static void
  204. iomap_read_finish(struct iomap_page *iop, struct page *page)
  205. {
  206. if (!iop || atomic_dec_and_test(&iop->read_count))
  207. unlock_page(page);
  208. }
  209. static void
  210. iomap_read_page_end_io(struct bio_vec *bvec, int error)
  211. {
  212. struct page *page = bvec->bv_page;
  213. struct iomap_page *iop = to_iomap_page(page);
  214. if (unlikely(error)) {
  215. ClearPageUptodate(page);
  216. SetPageError(page);
  217. } else {
  218. iomap_set_range_uptodate(page, bvec->bv_offset, bvec->bv_len);
  219. }
  220. iomap_read_finish(iop, page);
  221. }
  222. static void
  223. iomap_read_inline_data(struct inode *inode, struct page *page,
  224. struct iomap *iomap)
  225. {
  226. size_t size = i_size_read(inode);
  227. void *addr;
  228. if (PageUptodate(page))
  229. return;
  230. BUG_ON(page->index);
  231. BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data));
  232. addr = kmap_atomic(page);
  233. memcpy(addr, iomap->inline_data, size);
  234. memset(addr + size, 0, PAGE_SIZE - size);
  235. kunmap_atomic(addr);
  236. SetPageUptodate(page);
  237. }
  238. static void
  239. iomap_read_end_io(struct bio *bio)
  240. {
  241. int error = blk_status_to_errno(bio->bi_status);
  242. struct bio_vec *bvec;
  243. int i;
  244. bio_for_each_segment_all(bvec, bio, i)
  245. iomap_read_page_end_io(bvec, error);
  246. bio_put(bio);
  247. }
  248. struct iomap_readpage_ctx {
  249. struct page *cur_page;
  250. bool cur_page_in_bio;
  251. bool is_readahead;
  252. struct bio *bio;
  253. struct list_head *pages;
  254. };
  255. static loff_t
  256. iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
  257. struct iomap *iomap)
  258. {
  259. struct iomap_readpage_ctx *ctx = data;
  260. struct page *page = ctx->cur_page;
  261. struct iomap_page *iop = iomap_page_create(inode, page);
  262. bool is_contig = false;
  263. loff_t orig_pos = pos;
  264. unsigned poff, plen;
  265. sector_t sector;
  266. if (iomap->type == IOMAP_INLINE) {
  267. WARN_ON_ONCE(pos);
  268. iomap_read_inline_data(inode, page, iomap);
  269. return PAGE_SIZE;
  270. }
  271. /* zero post-eof blocks as the page may be mapped */
  272. iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen);
  273. if (plen == 0)
  274. goto done;
  275. if (iomap->type != IOMAP_MAPPED || pos >= i_size_read(inode)) {
  276. zero_user(page, poff, plen);
  277. iomap_set_range_uptodate(page, poff, plen);
  278. goto done;
  279. }
  280. ctx->cur_page_in_bio = true;
  281. /*
  282. * Try to merge into a previous segment if we can.
  283. */
  284. sector = iomap_sector(iomap, pos);
  285. if (ctx->bio && bio_end_sector(ctx->bio) == sector) {
  286. if (__bio_try_merge_page(ctx->bio, page, plen, poff))
  287. goto done;
  288. is_contig = true;
  289. }
  290. /*
  291. * If we start a new segment we need to increase the read count, and we
  292. * need to do so before submitting any previous full bio to make sure
  293. * that we don't prematurely unlock the page.
  294. */
  295. if (iop)
  296. atomic_inc(&iop->read_count);
  297. if (!ctx->bio || !is_contig || bio_full(ctx->bio)) {
  298. gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
  299. int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
  300. if (ctx->bio)
  301. submit_bio(ctx->bio);
  302. if (ctx->is_readahead) /* same as readahead_gfp_mask */
  303. gfp |= __GFP_NORETRY | __GFP_NOWARN;
  304. ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs));
  305. ctx->bio->bi_opf = REQ_OP_READ;
  306. if (ctx->is_readahead)
  307. ctx->bio->bi_opf |= REQ_RAHEAD;
  308. ctx->bio->bi_iter.bi_sector = sector;
  309. bio_set_dev(ctx->bio, iomap->bdev);
  310. ctx->bio->bi_end_io = iomap_read_end_io;
  311. }
  312. __bio_add_page(ctx->bio, page, plen, poff);
  313. done:
  314. /*
  315. * Move the caller beyond our range so that it keeps making progress.
  316. * For that we have to include any leading non-uptodate ranges, but
  317. * we can skip trailing ones as they will be handled in the next
  318. * iteration.
  319. */
  320. return pos - orig_pos + plen;
  321. }
  322. int
  323. iomap_readpage(struct page *page, const struct iomap_ops *ops)
  324. {
  325. struct iomap_readpage_ctx ctx = { .cur_page = page };
  326. struct inode *inode = page->mapping->host;
  327. unsigned poff;
  328. loff_t ret;
  329. for (poff = 0; poff < PAGE_SIZE; poff += ret) {
  330. ret = iomap_apply(inode, page_offset(page) + poff,
  331. PAGE_SIZE - poff, 0, ops, &ctx,
  332. iomap_readpage_actor);
  333. if (ret <= 0) {
  334. WARN_ON_ONCE(ret == 0);
  335. SetPageError(page);
  336. break;
  337. }
  338. }
  339. if (ctx.bio) {
  340. submit_bio(ctx.bio);
  341. WARN_ON_ONCE(!ctx.cur_page_in_bio);
  342. } else {
  343. WARN_ON_ONCE(ctx.cur_page_in_bio);
  344. unlock_page(page);
  345. }
  346. /*
  347. * Just like mpage_readpages and block_read_full_page we always
  348. * return 0 and just mark the page as PageError on errors. This
  349. * should be cleaned up all through the stack eventually.
  350. */
  351. return 0;
  352. }
  353. EXPORT_SYMBOL_GPL(iomap_readpage);
  354. static struct page *
  355. iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos,
  356. loff_t length, loff_t *done)
  357. {
  358. while (!list_empty(pages)) {
  359. struct page *page = lru_to_page(pages);
  360. if (page_offset(page) >= (u64)pos + length)
  361. break;
  362. list_del(&page->lru);
  363. if (!add_to_page_cache_lru(page, inode->i_mapping, page->index,
  364. GFP_NOFS))
  365. return page;
  366. /*
  367. * If we already have a page in the page cache at index we are
  368. * done. Upper layers don't care if it is uptodate after the
  369. * readpages call itself as every page gets checked again once
  370. * actually needed.
  371. */
  372. *done += PAGE_SIZE;
  373. put_page(page);
  374. }
  375. return NULL;
  376. }
  377. static loff_t
  378. iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
  379. void *data, struct iomap *iomap)
  380. {
  381. struct iomap_readpage_ctx *ctx = data;
  382. loff_t done, ret;
  383. for (done = 0; done < length; done += ret) {
  384. if (ctx->cur_page && offset_in_page(pos + done) == 0) {
  385. if (!ctx->cur_page_in_bio)
  386. unlock_page(ctx->cur_page);
  387. put_page(ctx->cur_page);
  388. ctx->cur_page = NULL;
  389. }
  390. if (!ctx->cur_page) {
  391. ctx->cur_page = iomap_next_page(inode, ctx->pages,
  392. pos, length, &done);
  393. if (!ctx->cur_page)
  394. break;
  395. ctx->cur_page_in_bio = false;
  396. }
  397. ret = iomap_readpage_actor(inode, pos + done, length - done,
  398. ctx, iomap);
  399. }
  400. return done;
  401. }
  402. int
  403. iomap_readpages(struct address_space *mapping, struct list_head *pages,
  404. unsigned nr_pages, const struct iomap_ops *ops)
  405. {
  406. struct iomap_readpage_ctx ctx = {
  407. .pages = pages,
  408. .is_readahead = true,
  409. };
  410. loff_t pos = page_offset(list_entry(pages->prev, struct page, lru));
  411. loff_t last = page_offset(list_entry(pages->next, struct page, lru));
  412. loff_t length = last - pos + PAGE_SIZE, ret = 0;
  413. while (length > 0) {
  414. ret = iomap_apply(mapping->host, pos, length, 0, ops,
  415. &ctx, iomap_readpages_actor);
  416. if (ret <= 0) {
  417. WARN_ON_ONCE(ret == 0);
  418. goto done;
  419. }
  420. pos += ret;
  421. length -= ret;
  422. }
  423. ret = 0;
  424. done:
  425. if (ctx.bio)
  426. submit_bio(ctx.bio);
  427. if (ctx.cur_page) {
  428. if (!ctx.cur_page_in_bio)
  429. unlock_page(ctx.cur_page);
  430. put_page(ctx.cur_page);
  431. }
  432. /*
  433. * Check that we didn't lose a page due to the arcane calling
  434. * conventions.
  435. */
  436. WARN_ON_ONCE(!ret && !list_empty(ctx.pages));
  437. return ret;
  438. }
  439. EXPORT_SYMBOL_GPL(iomap_readpages);
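/*
 * Usage sketch (not part of the original file): a filesystem typically wires
 * the read helpers above into its address_space_operations with thin wrappers
 * that supply its iomap_ops (here the hypothetical examplefs_iomap_ops
 * sketched after iomap_apply()).  The other helpers exported further down
 * (iomap_releasepage, iomap_invalidatepage, iomap_set_page_dirty, ...) slot
 * into the same table in the same way.
 */
static int examplefs_readpage(struct file *file, struct page *page)
{
	return iomap_readpage(page, &examplefs_iomap_ops);
}

static int examplefs_readpages(struct file *file, struct address_space *mapping,
		struct list_head *pages, unsigned nr_pages)
{
	return iomap_readpages(mapping, pages, nr_pages, &examplefs_iomap_ops);
}

static const struct address_space_operations examplefs_aops = {
	.readpage	= examplefs_readpage,
	.readpages	= examplefs_readpages,
};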
  440. int
  441. iomap_is_partially_uptodate(struct page *page, unsigned long from,
  442. unsigned long count)
  443. {
  444. struct iomap_page *iop = to_iomap_page(page);
  445. struct inode *inode = page->mapping->host;
  446. unsigned first = from >> inode->i_blkbits;
  447. unsigned last = (from + count - 1) >> inode->i_blkbits;
  448. unsigned i;
  449. if (iop) {
  450. for (i = first; i <= last; i++)
  451. if (!test_bit(i, iop->uptodate))
  452. return 0;
  453. return 1;
  454. }
  455. return 0;
  456. }
  457. EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
  458. int
  459. iomap_releasepage(struct page *page, gfp_t gfp_mask)
  460. {
  461. /*
  462. * mm accommodates an old ext3 case where clean pages might not have had
  463. * the dirty bit cleared. Thus, it can send actual dirty pages to
  464. * ->releasepage() via shrink_active_list(), skip those here.
  465. */
  466. if (PageDirty(page) || PageWriteback(page))
  467. return 0;
  468. iomap_page_release(page);
  469. return 1;
  470. }
  471. EXPORT_SYMBOL_GPL(iomap_releasepage);
  472. void
  473. iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len)
  474. {
  475. /*
  476. * If we are invalidating the entire page, clear the dirty state from it
  477. * and release it to avoid unnecessary buildup of the LRU.
  478. */
  479. if (offset == 0 && len == PAGE_SIZE) {
  480. WARN_ON_ONCE(PageWriteback(page));
  481. cancel_dirty_page(page);
  482. iomap_page_release(page);
  483. }
  484. }
  485. EXPORT_SYMBOL_GPL(iomap_invalidatepage);
  486. #ifdef CONFIG_MIGRATION
  487. int
  488. iomap_migrate_page(struct address_space *mapping, struct page *newpage,
  489. struct page *page, enum migrate_mode mode)
  490. {
  491. int ret;
  492. ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
  493. if (ret != MIGRATEPAGE_SUCCESS)
  494. return ret;
  495. if (page_has_private(page)) {
  496. ClearPagePrivate(page);
  497. set_page_private(newpage, page_private(page));
  498. set_page_private(page, 0);
  499. SetPagePrivate(newpage);
  500. }
  501. if (mode != MIGRATE_SYNC_NO_COPY)
  502. migrate_page_copy(newpage, page);
  503. else
  504. migrate_page_states(newpage, page);
  505. return MIGRATEPAGE_SUCCESS;
  506. }
  507. EXPORT_SYMBOL_GPL(iomap_migrate_page);
  508. #endif /* CONFIG_MIGRATION */
  509. static void
  510. iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
  511. {
  512. loff_t i_size = i_size_read(inode);
  513. /*
  514. * Only truncate newly allocated pages beyond EOF, even if the
  515. * write started inside the existing inode size.
  516. */
  517. if (pos + len > i_size)
  518. truncate_pagecache_range(inode, max(pos, i_size), pos + len);
  519. }
  520. static int
  521. iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page,
  522. unsigned poff, unsigned plen, unsigned from, unsigned to,
  523. struct iomap *iomap)
  524. {
  525. struct bio_vec bvec;
  526. struct bio bio;
  527. if (iomap->type != IOMAP_MAPPED || block_start >= i_size_read(inode)) {
  528. zero_user_segments(page, poff, from, to, poff + plen);
  529. iomap_set_range_uptodate(page, poff, plen);
  530. return 0;
  531. }
  532. bio_init(&bio, &bvec, 1);
  533. bio.bi_opf = REQ_OP_READ;
  534. bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
  535. bio_set_dev(&bio, iomap->bdev);
  536. __bio_add_page(&bio, page, plen, poff);
  537. return submit_bio_wait(&bio);
  538. }
  539. static int
  540. __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
  541. struct page *page, struct iomap *iomap)
  542. {
  543. struct iomap_page *iop = iomap_page_create(inode, page);
  544. loff_t block_size = i_blocksize(inode);
  545. loff_t block_start = pos & ~(block_size - 1);
  546. loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
  547. unsigned from = offset_in_page(pos), to = from + len, poff, plen;
  548. int status = 0;
  549. if (PageUptodate(page))
  550. return 0;
  551. do {
  552. iomap_adjust_read_range(inode, iop, &block_start,
  553. block_end - block_start, &poff, &plen);
  554. if (plen == 0)
  555. break;
  556. if ((from > poff && from < poff + plen) ||
  557. (to > poff && to < poff + plen)) {
  558. status = iomap_read_page_sync(inode, block_start, page,
  559. poff, plen, from, to, iomap);
  560. if (status)
  561. break;
  562. }
  563. } while ((block_start += plen) < block_end);
  564. return status;
  565. }
  566. static int
  567. iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
  568. struct page **pagep, struct iomap *iomap)
  569. {
  570. pgoff_t index = pos >> PAGE_SHIFT;
  571. struct page *page;
  572. int status = 0;
  573. BUG_ON(pos + len > iomap->offset + iomap->length);
  574. if (fatal_signal_pending(current))
  575. return -EINTR;
  576. page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
  577. if (!page)
  578. return -ENOMEM;
  579. if (iomap->type == IOMAP_INLINE)
  580. iomap_read_inline_data(inode, page, iomap);
  581. else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
  582. status = __block_write_begin_int(page, pos, len, NULL, iomap);
  583. else
  584. status = __iomap_write_begin(inode, pos, len, page, iomap);
  585. if (unlikely(status)) {
  586. unlock_page(page);
  587. put_page(page);
  588. page = NULL;
  589. iomap_write_failed(inode, pos, len);
  590. }
  591. *pagep = page;
  592. return status;
  593. }
  594. int
  595. iomap_set_page_dirty(struct page *page)
  596. {
  597. struct address_space *mapping = page_mapping(page);
  598. int newly_dirty;
  599. if (unlikely(!mapping))
  600. return !TestSetPageDirty(page);
  601. /*
  602. * Lock out page->mem_cgroup migration to keep PageDirty
  603. * synchronized with per-memcg dirty page counters.
  604. */
  605. lock_page_memcg(page);
  606. newly_dirty = !TestSetPageDirty(page);
  607. if (newly_dirty)
  608. __set_page_dirty(page, mapping, 0);
  609. unlock_page_memcg(page);
  610. if (newly_dirty)
  611. __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
  612. return newly_dirty;
  613. }
  614. EXPORT_SYMBOL_GPL(iomap_set_page_dirty);
  615. static int
  616. __iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
  617. unsigned copied, struct page *page, struct iomap *iomap)
  618. {
  619. flush_dcache_page(page);
  620. /*
  621. * The blocks that were entirely written will now be uptodate, so we
  622. * don't have to worry about a readpage reading them and overwriting a
  623. * partial write. However if we have encountered a short write and only
  624. * partially written into a block, it will not be marked uptodate, so a
  625. * readpage might come in and destroy our partial write.
  626. *
  627. * Do the simplest thing, and just treat any short write to a non
  628. * uptodate page as a zero-length write, and force the caller to redo
  629. * the whole thing.
  630. */
  631. if (unlikely(copied < len && !PageUptodate(page))) {
  632. copied = 0;
  633. } else {
  634. iomap_set_range_uptodate(page, offset_in_page(pos), len);
  635. iomap_set_page_dirty(page);
  636. }
  637. return __generic_write_end(inode, pos, copied, page);
  638. }
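/*
 * Illustrative note on the short-write rule above: if a 4096-byte copy into a
 * page that was not previously uptodate faults after, say, 100 bytes, there is
 * no way to mark a partially written block uptodate, so copied is forced to 0
 * and iomap_write_actor() below retries the copy with a single-segment length
 * after faulting the user page back in.
 */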
  639. static int
  640. iomap_write_end_inline(struct inode *inode, struct page *page,
  641. struct iomap *iomap, loff_t pos, unsigned copied)
  642. {
  643. void *addr;
  644. WARN_ON_ONCE(!PageUptodate(page));
  645. BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data));
  646. addr = kmap_atomic(page);
  647. memcpy(iomap->inline_data + pos, addr + pos, copied);
  648. kunmap_atomic(addr);
  649. mark_inode_dirty(inode);
  650. __generic_write_end(inode, pos, copied, page);
  651. return copied;
  652. }
  653. static int
  654. iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
  655. unsigned copied, struct page *page, struct iomap *iomap)
  656. {
  657. int ret;
  658. if (iomap->type == IOMAP_INLINE) {
  659. ret = iomap_write_end_inline(inode, page, iomap, pos, copied);
  660. } else if (iomap->flags & IOMAP_F_BUFFER_HEAD) {
  661. ret = generic_write_end(NULL, inode->i_mapping, pos, len,
  662. copied, page, NULL);
  663. } else {
  664. ret = __iomap_write_end(inode, pos, len, copied, page, iomap);
  665. }
  666. if (iomap->page_done)
  667. iomap->page_done(inode, pos, copied, page, iomap);
  668. if (ret < len)
  669. iomap_write_failed(inode, pos, len);
  670. return ret;
  671. }
  672. static loff_t
  673. iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
  674. struct iomap *iomap)
  675. {
  676. struct iov_iter *i = data;
  677. long status = 0;
  678. ssize_t written = 0;
  679. unsigned int flags = AOP_FLAG_NOFS;
  680. do {
  681. struct page *page;
  682. unsigned long offset; /* Offset into pagecache page */
  683. unsigned long bytes; /* Bytes to write to page */
  684. size_t copied; /* Bytes copied from user */
  685. offset = offset_in_page(pos);
  686. bytes = min_t(unsigned long, PAGE_SIZE - offset,
  687. iov_iter_count(i));
  688. again:
  689. if (bytes > length)
  690. bytes = length;
  691. /*
  692. * Bring in the user page that we will copy from _first_.
  693. * Otherwise there's a nasty deadlock on copying from the
  694. * same page as we're writing to, without it being marked
  695. * up-to-date.
  696. *
  697. * Not only is this an optimisation, but it is also required
  698. * to check that the address is actually valid, when atomic
  699. * usercopies are used, below.
  700. */
  701. if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
  702. status = -EFAULT;
  703. break;
  704. }
  705. status = iomap_write_begin(inode, pos, bytes, flags, &page,
  706. iomap);
  707. if (unlikely(status))
  708. break;
  709. if (mapping_writably_mapped(inode->i_mapping))
  710. flush_dcache_page(page);
  711. copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
  712. flush_dcache_page(page);
  713. status = iomap_write_end(inode, pos, bytes, copied, page,
  714. iomap);
  715. if (unlikely(status < 0))
  716. break;
  717. copied = status;
  718. cond_resched();
  719. iov_iter_advance(i, copied);
  720. if (unlikely(copied == 0)) {
  721. /*
  722. * If we were unable to copy any data at all, we must
  723. * fall back to a single segment length write.
  724. *
  725. * If we didn't fall back here, we could livelock
  726. * because not all segments in the iov can be copied at
  727. * once without a pagefault.
  728. */
  729. bytes = min_t(unsigned long, PAGE_SIZE - offset,
  730. iov_iter_single_seg_count(i));
  731. goto again;
  732. }
  733. pos += copied;
  734. written += copied;
  735. length -= copied;
  736. balance_dirty_pages_ratelimited(inode->i_mapping);
  737. } while (iov_iter_count(i) && length);
  738. return written ? written : status;
  739. }
  740. ssize_t
  741. iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
  742. const struct iomap_ops *ops)
  743. {
  744. struct inode *inode = iocb->ki_filp->f_mapping->host;
  745. loff_t pos = iocb->ki_pos, ret = 0, written = 0;
  746. while (iov_iter_count(iter)) {
  747. ret = iomap_apply(inode, pos, iov_iter_count(iter),
  748. IOMAP_WRITE, ops, iter, iomap_write_actor);
  749. if (ret <= 0)
  750. break;
  751. pos += ret;
  752. written += ret;
  753. }
  754. return written ? written : ret;
  755. }
  756. EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
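/*
 * Usage sketch (not part of the original file): the buffered write path is
 * normally driven from a filesystem's ->write_iter.  This is heavily
 * simplified - a real implementation also does generic_write_checks(),
 * permission and limit checks, and O_DIRECT dispatch - and the "examplefs_"
 * names are hypothetical.
 */
static ssize_t examplefs_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	inode_lock(inode);
	ret = iomap_file_buffered_write(iocb, from, &examplefs_iomap_ops);
	if (ret > 0)
		iocb->ki_pos += ret;	/* the helper does not advance ki_pos */
	inode_unlock(inode);

	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}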
  757. static struct page *
  758. __iomap_read_page(struct inode *inode, loff_t offset)
  759. {
  760. struct address_space *mapping = inode->i_mapping;
  761. struct page *page;
  762. page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL);
  763. if (IS_ERR(page))
  764. return page;
  765. if (!PageUptodate(page)) {
  766. put_page(page);
  767. return ERR_PTR(-EIO);
  768. }
  769. return page;
  770. }
  771. static loff_t
  772. iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
  773. struct iomap *iomap)
  774. {
  775. long status = 0;
  776. ssize_t written = 0;
  777. do {
  778. struct page *page, *rpage;
  779. unsigned long offset; /* Offset into pagecache page */
  780. unsigned long bytes; /* Bytes to write to page */
  781. offset = offset_in_page(pos);
  782. bytes = min_t(loff_t, PAGE_SIZE - offset, length);
  783. rpage = __iomap_read_page(inode, pos);
  784. if (IS_ERR(rpage))
  785. return PTR_ERR(rpage);
  786. status = iomap_write_begin(inode, pos, bytes,
  787. AOP_FLAG_NOFS, &page, iomap);
  788. put_page(rpage);
  789. if (unlikely(status))
  790. return status;
  791. WARN_ON_ONCE(!PageUptodate(page));
  792. status = iomap_write_end(inode, pos, bytes, bytes, page, iomap);
  793. if (unlikely(status <= 0)) {
  794. if (WARN_ON_ONCE(status == 0))
  795. return -EIO;
  796. return status;
  797. }
  798. cond_resched();
  799. pos += status;
  800. written += status;
  801. length -= status;
  802. balance_dirty_pages_ratelimited(inode->i_mapping);
  803. } while (length);
  804. return written;
  805. }
  806. int
  807. iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
  808. const struct iomap_ops *ops)
  809. {
  810. loff_t ret;
  811. while (len) {
  812. ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL,
  813. iomap_dirty_actor);
  814. if (ret <= 0)
  815. return ret;
  816. pos += ret;
  817. len -= ret;
  818. }
  819. return 0;
  820. }
  821. EXPORT_SYMBOL_GPL(iomap_file_dirty);
  822. static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
  823. unsigned bytes, struct iomap *iomap)
  824. {
  825. struct page *page;
  826. int status;
  827. status = iomap_write_begin(inode, pos, bytes, AOP_FLAG_NOFS, &page,
  828. iomap);
  829. if (status)
  830. return status;
  831. zero_user(page, offset, bytes);
  832. mark_page_accessed(page);
  833. return iomap_write_end(inode, pos, bytes, bytes, page, iomap);
  834. }
  835. static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
  836. struct iomap *iomap)
  837. {
  838. return __dax_zero_page_range(iomap->bdev, iomap->dax_dev,
  839. iomap_sector(iomap, pos & PAGE_MASK), offset, bytes);
  840. }
  841. static loff_t
  842. iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
  843. void *data, struct iomap *iomap)
  844. {
  845. bool *did_zero = data;
  846. loff_t written = 0;
  847. int status;
  848. /* already zeroed? we're done. */
  849. if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
  850. return count;
  851. do {
  852. unsigned offset, bytes;
  853. offset = offset_in_page(pos);
  854. bytes = min_t(loff_t, PAGE_SIZE - offset, count);
  855. if (IS_DAX(inode))
  856. status = iomap_dax_zero(pos, offset, bytes, iomap);
  857. else
  858. status = iomap_zero(inode, pos, offset, bytes, iomap);
  859. if (status < 0)
  860. return status;
  861. pos += bytes;
  862. count -= bytes;
  863. written += bytes;
  864. if (did_zero)
  865. *did_zero = true;
  866. } while (count > 0);
  867. return written;
  868. }
  869. int
  870. iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
  871. const struct iomap_ops *ops)
  872. {
  873. loff_t ret;
  874. while (len > 0) {
  875. ret = iomap_apply(inode, pos, len, IOMAP_ZERO,
  876. ops, did_zero, iomap_zero_range_actor);
  877. if (ret <= 0)
  878. return ret;
  879. pos += ret;
  880. len -= ret;
  881. }
  882. return 0;
  883. }
  884. EXPORT_SYMBOL_GPL(iomap_zero_range);
  885. int
  886. iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
  887. const struct iomap_ops *ops)
  888. {
  889. unsigned int blocksize = i_blocksize(inode);
  890. unsigned int off = pos & (blocksize - 1);
  891. /* Block boundary? Nothing to do */
  892. if (!off)
  893. return 0;
  894. return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
  895. }
  896. EXPORT_SYMBOL_GPL(iomap_truncate_page);
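/*
 * Worked example for iomap_truncate_page() (illustrative): with a 4096-byte
 * block size and a new EOF at pos = 6144, off = 6144 & 4095 = 2048, so the
 * helper zeroes the remaining 2048 bytes of the EOF block (file offsets
 * 6144-8191) via iomap_zero_range().  If pos were block aligned, off would be
 * 0 and nothing would be zeroed.
 */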
  897. static loff_t
  898. iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
  899. void *data, struct iomap *iomap)
  900. {
  901. struct page *page = data;
  902. int ret;
  903. if (iomap->flags & IOMAP_F_BUFFER_HEAD) {
  904. ret = __block_write_begin_int(page, pos, length, NULL, iomap);
  905. if (ret)
  906. return ret;
  907. block_commit_write(page, 0, length);
  908. } else {
  909. WARN_ON_ONCE(!PageUptodate(page));
  910. iomap_page_create(inode, page);
  911. set_page_dirty(page);
  912. }
  913. return length;
  914. }
  915. vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
  916. {
  917. struct page *page = vmf->page;
  918. struct inode *inode = file_inode(vmf->vma->vm_file);
  919. unsigned long length;
  920. loff_t offset, size;
  921. ssize_t ret;
  922. lock_page(page);
  923. size = i_size_read(inode);
  924. if ((page->mapping != inode->i_mapping) ||
  925. (page_offset(page) > size)) {
  926. /* We overload EFAULT to mean page got truncated */
  927. ret = -EFAULT;
  928. goto out_unlock;
  929. }
  930. /* page is wholly or partially inside EOF */
  931. if (((page->index + 1) << PAGE_SHIFT) > size)
  932. length = offset_in_page(size);
  933. else
  934. length = PAGE_SIZE;
  935. offset = page_offset(page);
  936. while (length > 0) {
  937. ret = iomap_apply(inode, offset, length,
  938. IOMAP_WRITE | IOMAP_FAULT, ops, page,
  939. iomap_page_mkwrite_actor);
  940. if (unlikely(ret <= 0))
  941. goto out_unlock;
  942. offset += ret;
  943. length -= ret;
  944. }
  945. wait_for_stable_page(page);
  946. return VM_FAULT_LOCKED;
  947. out_unlock:
  948. unlock_page(page);
  949. return block_page_mkwrite_return(ret);
  950. }
  951. EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
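/*
 * Usage sketch (not part of the original file): iomap_page_mkwrite() is meant
 * to back a filesystem's vm_operations_struct ->page_mkwrite handler.  The
 * hypothetical "examplefs_" wrapper below omits anything filesystem specific
 * beyond freeze protection and timestamp updates.
 */
static vm_fault_t examplefs_page_mkwrite(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	vm_fault_t ret;

	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);
	ret = iomap_page_mkwrite(vmf, &examplefs_iomap_ops);
	sb_end_pagefault(inode->i_sb);

	return ret;
}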
  952. struct fiemap_ctx {
  953. struct fiemap_extent_info *fi;
  954. struct iomap prev;
  955. };
  956. static int iomap_to_fiemap(struct fiemap_extent_info *fi,
  957. struct iomap *iomap, u32 flags)
  958. {
  959. switch (iomap->type) {
  960. case IOMAP_HOLE:
  961. /* skip holes */
  962. return 0;
  963. case IOMAP_DELALLOC:
  964. flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
  965. break;
  966. case IOMAP_MAPPED:
  967. break;
  968. case IOMAP_UNWRITTEN:
  969. flags |= FIEMAP_EXTENT_UNWRITTEN;
  970. break;
  971. case IOMAP_INLINE:
  972. flags |= FIEMAP_EXTENT_DATA_INLINE;
  973. break;
  974. }
  975. if (iomap->flags & IOMAP_F_MERGED)
  976. flags |= FIEMAP_EXTENT_MERGED;
  977. if (iomap->flags & IOMAP_F_SHARED)
  978. flags |= FIEMAP_EXTENT_SHARED;
  979. return fiemap_fill_next_extent(fi, iomap->offset,
  980. iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0,
  981. iomap->length, flags);
  982. }
  983. static loff_t
  984. iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
  985. struct iomap *iomap)
  986. {
  987. struct fiemap_ctx *ctx = data;
  988. loff_t ret = length;
  989. if (iomap->type == IOMAP_HOLE)
  990. return length;
  991. ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0);
  992. ctx->prev = *iomap;
  993. switch (ret) {
  994. case 0: /* success */
  995. return length;
  996. case 1: /* extent array full */
  997. return 0;
  998. default:
  999. return ret;
  1000. }
  1001. }
  1002. int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
  1003. loff_t start, loff_t len, const struct iomap_ops *ops)
  1004. {
  1005. struct fiemap_ctx ctx;
  1006. loff_t ret;
  1007. memset(&ctx, 0, sizeof(ctx));
  1008. ctx.fi = fi;
  1009. ctx.prev.type = IOMAP_HOLE;
  1010. ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC);
  1011. if (ret)
  1012. return ret;
  1013. if (fi->fi_flags & FIEMAP_FLAG_SYNC) {
  1014. ret = filemap_write_and_wait(inode->i_mapping);
  1015. if (ret)
  1016. return ret;
  1017. }
  1018. while (len > 0) {
  1019. ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx,
  1020. iomap_fiemap_actor);
  1021. /* inode with no (attribute) mapping will give ENOENT */
  1022. if (ret == -ENOENT)
  1023. break;
  1024. if (ret < 0)
  1025. return ret;
  1026. if (ret == 0)
  1027. break;
  1028. start += ret;
  1029. len -= ret;
  1030. }
  1031. if (ctx.prev.type != IOMAP_HOLE) {
  1032. ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST);
  1033. if (ret < 0)
  1034. return ret;
  1035. }
  1036. return 0;
  1037. }
  1038. EXPORT_SYMBOL_GPL(iomap_fiemap);
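/*
 * Usage sketch (not part of the original file): iomap_fiemap() is called from
 * an inode_operations ->fiemap method; the wrapper just passes the
 * filesystem's iomap_ops.  "examplefs_" names are hypothetical.
 */
static int examplefs_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
		u64 start, u64 len)
{
	return iomap_fiemap(inode, fi, start, len, &examplefs_iomap_ops);
}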
  1039. /*
  1040. * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff.
  1041. * Returns true if found and updates @lastoff to the offset in file.
  1042. */
  1043. static bool
  1044. page_seek_hole_data(struct inode *inode, struct page *page, loff_t *lastoff,
  1045. int whence)
  1046. {
  1047. const struct address_space_operations *ops = inode->i_mapping->a_ops;
  1048. unsigned int bsize = i_blocksize(inode), off;
  1049. bool seek_data = whence == SEEK_DATA;
  1050. loff_t poff = page_offset(page);
  1051. if (WARN_ON_ONCE(*lastoff >= poff + PAGE_SIZE))
  1052. return false;
  1053. if (*lastoff < poff) {
  1054. /*
  1055. * Last offset smaller than the start of the page means we found
  1056. * a hole:
  1057. */
  1058. if (whence == SEEK_HOLE)
  1059. return true;
  1060. *lastoff = poff;
  1061. }
  1062. /*
  1063. * Just check the page unless we can and should check block ranges:
  1064. */
  1065. if (bsize == PAGE_SIZE || !ops->is_partially_uptodate)
  1066. return PageUptodate(page) == seek_data;
  1067. lock_page(page);
  1068. if (unlikely(page->mapping != inode->i_mapping))
  1069. goto out_unlock_not_found;
  1070. for (off = 0; off < PAGE_SIZE; off += bsize) {
  1071. if (offset_in_page(*lastoff) >= off + bsize)
  1072. continue;
  1073. if (ops->is_partially_uptodate(page, off, bsize) == seek_data) {
  1074. unlock_page(page);
  1075. return true;
  1076. }
  1077. *lastoff = poff + off + bsize;
  1078. }
  1079. out_unlock_not_found:
  1080. unlock_page(page);
  1081. return false;
  1082. }
  1083. /*
  1084. * Seek for SEEK_DATA / SEEK_HOLE in the page cache.
  1085. *
  1086. * Within unwritten extents, the page cache determines which parts are holes
  1087. * and which are data: uptodate buffer heads count as data; everything else
  1088. * counts as a hole.
  1089. *
  1090. * Returns the resulting offset on success, and -ENOENT otherwise.
  1091. */
  1092. static loff_t
  1093. page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
  1094. int whence)
  1095. {
  1096. pgoff_t index = offset >> PAGE_SHIFT;
  1097. pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE);
  1098. loff_t lastoff = offset;
  1099. struct pagevec pvec;
  1100. if (length <= 0)
  1101. return -ENOENT;
  1102. pagevec_init(&pvec);
  1103. do {
  1104. unsigned nr_pages, i;
  1105. nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
  1106. end - 1);
  1107. if (nr_pages == 0)
  1108. break;
  1109. for (i = 0; i < nr_pages; i++) {
  1110. struct page *page = pvec.pages[i];
  1111. if (page_seek_hole_data(inode, page, &lastoff, whence))
  1112. goto check_range;
  1113. lastoff = page_offset(page) + PAGE_SIZE;
  1114. }
  1115. pagevec_release(&pvec);
  1116. } while (index < end);
  1117. /* When no page at lastoff and we are not done, we found a hole. */
  1118. if (whence != SEEK_HOLE)
  1119. goto not_found;
  1120. check_range:
  1121. if (lastoff < offset + length)
  1122. goto out;
  1123. not_found:
  1124. lastoff = -ENOENT;
  1125. out:
  1126. pagevec_release(&pvec);
  1127. return lastoff;
  1128. }
  1129. static loff_t
  1130. iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length,
  1131. void *data, struct iomap *iomap)
  1132. {
  1133. switch (iomap->type) {
  1134. case IOMAP_UNWRITTEN:
  1135. offset = page_cache_seek_hole_data(inode, offset, length,
  1136. SEEK_HOLE);
  1137. if (offset < 0)
  1138. return length;
  1139. /* fall through */
  1140. case IOMAP_HOLE:
  1141. *(loff_t *)data = offset;
  1142. return 0;
  1143. default:
  1144. return length;
  1145. }
  1146. }
  1147. loff_t
  1148. iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops)
  1149. {
  1150. loff_t size = i_size_read(inode);
  1151. loff_t length = size - offset;
  1152. loff_t ret;
  1153. /* Nothing to be found before or beyond the end of the file. */
  1154. if (offset < 0 || offset >= size)
  1155. return -ENXIO;
  1156. while (length > 0) {
  1157. ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops,
  1158. &offset, iomap_seek_hole_actor);
  1159. if (ret < 0)
  1160. return ret;
  1161. if (ret == 0)
  1162. break;
  1163. offset += ret;
  1164. length -= ret;
  1165. }
  1166. return offset;
  1167. }
  1168. EXPORT_SYMBOL_GPL(iomap_seek_hole);
  1169. static loff_t
  1170. iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length,
  1171. void *data, struct iomap *iomap)
  1172. {
  1173. switch (iomap->type) {
  1174. case IOMAP_HOLE:
  1175. return length;
  1176. case IOMAP_UNWRITTEN:
  1177. offset = page_cache_seek_hole_data(inode, offset, length,
  1178. SEEK_DATA);
  1179. if (offset < 0)
  1180. return length;
  1181. /* fall through */
  1182. default:
  1183. *(loff_t *)data = offset;
  1184. return 0;
  1185. }
  1186. }
  1187. loff_t
  1188. iomap_seek_data(struct inode *inode, loff_t offset, const struct iomap_ops *ops)
  1189. {
  1190. loff_t size = i_size_read(inode);
  1191. loff_t length = size - offset;
  1192. loff_t ret;
  1193. /* Nothing to be found before or beyond the end of the file. */
  1194. if (offset < 0 || offset >= size)
  1195. return -ENXIO;
  1196. while (length > 0) {
  1197. ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops,
  1198. &offset, iomap_seek_data_actor);
  1199. if (ret < 0)
  1200. return ret;
  1201. if (ret == 0)
  1202. break;
  1203. offset += ret;
  1204. length -= ret;
  1205. }
  1206. if (length <= 0)
  1207. return -ENXIO;
  1208. return offset;
  1209. }
  1210. EXPORT_SYMBOL_GPL(iomap_seek_data);
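/*
 * Usage sketch (not part of the original file): the two seek helpers above
 * are typically dispatched from ->llseek for SEEK_HOLE/SEEK_DATA, with all
 * other whence values left to generic_file_llseek().  "examplefs_" names are
 * hypothetical.
 */
static loff_t examplefs_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file_inode(file);

	switch (whence) {
	case SEEK_HOLE:
		offset = iomap_seek_hole(inode, offset, &examplefs_iomap_ops);
		break;
	case SEEK_DATA:
		offset = iomap_seek_data(inode, offset, &examplefs_iomap_ops);
		break;
	default:
		return generic_file_llseek(file, offset, whence);
	}

	if (offset < 0)
		return offset;
	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}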
  1211. /*
  1212. * Private flags for iomap_dio, must not overlap with the public ones in
  1213. * iomap.h:
  1214. */
  1215. #define IOMAP_DIO_WRITE_FUA (1 << 28)
  1216. #define IOMAP_DIO_NEED_SYNC (1 << 29)
  1217. #define IOMAP_DIO_WRITE (1 << 30)
  1218. #define IOMAP_DIO_DIRTY (1 << 31)
  1219. struct iomap_dio {
  1220. struct kiocb *iocb;
  1221. iomap_dio_end_io_t *end_io;
  1222. loff_t i_size;
  1223. loff_t size;
  1224. atomic_t ref;
  1225. unsigned flags;
  1226. int error;
  1227. bool wait_for_completion;
  1228. union {
  1229. /* used during submission and for synchronous completion: */
  1230. struct {
  1231. struct iov_iter *iter;
  1232. struct task_struct *waiter;
  1233. struct request_queue *last_queue;
  1234. blk_qc_t cookie;
  1235. } submit;
  1236. /* used for aio completion: */
  1237. struct {
  1238. struct work_struct work;
  1239. } aio;
  1240. };
  1241. };
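/*
 * Reference counting note (descriptive comment added for this annotated
 * listing): every bio submitted on behalf of a dio takes a reference on
 * dio->ref (see the atomic_inc() calls in iomap_dio_zero() and
 * iomap_dio_bio_actor() below) and drops it in iomap_dio_bio_end_io().
 * Whoever drops the last reference completes the dio: synchronous callers are
 * woken via dio->submit.waiter, async writes are punted to the
 * s_dio_done_wq workqueue, and async reads complete inline.  The submitter's
 * own initial reference is taken where the dio is set up (iomap_dio_rw(),
 * not shown in this excerpt).
 */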
  1242. static ssize_t iomap_dio_complete(struct iomap_dio *dio)
  1243. {
  1244. struct kiocb *iocb = dio->iocb;
  1245. struct inode *inode = file_inode(iocb->ki_filp);
  1246. loff_t offset = iocb->ki_pos;
  1247. ssize_t ret;
  1248. if (dio->end_io) {
  1249. ret = dio->end_io(iocb,
  1250. dio->error ? dio->error : dio->size,
  1251. dio->flags);
  1252. } else {
  1253. ret = dio->error;
  1254. }
  1255. if (likely(!ret)) {
  1256. ret = dio->size;
  1257. /* check for short read */
  1258. if (offset + ret > dio->i_size &&
  1259. !(dio->flags & IOMAP_DIO_WRITE))
  1260. ret = dio->i_size - offset;
  1261. iocb->ki_pos += ret;
  1262. }
  1263. /*
  1264. * Try again to invalidate clean pages which might have been cached by
  1265. * non-direct readahead, or faulted in by get_user_pages() if the source
  1266. * of the write was an mmap'ed region of the file we're writing. Either
  1267. * one is a pretty crazy thing to do, so we don't support it 100%. If
  1268. * this invalidation fails, tough, the write still worked...
  1269. *
  1270. * And this page cache invalidation has to be after dio->end_io(), as
  1271. * some filesystems convert unwritten extents to real allocations in
  1272. * end_io() when necessary, otherwise a racing buffer read would cache
  1273. * zeros from unwritten extents.
  1274. */
  1275. if (!dio->error &&
  1276. (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
  1277. int err;
  1278. err = invalidate_inode_pages2_range(inode->i_mapping,
  1279. offset >> PAGE_SHIFT,
  1280. (offset + dio->size - 1) >> PAGE_SHIFT);
  1281. if (err)
  1282. dio_warn_stale_pagecache(iocb->ki_filp);
  1283. }
  1284. /*
  1285. * If this is a DSYNC write, make sure we push it to stable storage now
  1286. * that we've written data.
  1287. */
  1288. if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
  1289. ret = generic_write_sync(iocb, ret);
  1290. inode_dio_end(file_inode(iocb->ki_filp));
  1291. kfree(dio);
  1292. return ret;
  1293. }
  1294. static void iomap_dio_complete_work(struct work_struct *work)
  1295. {
  1296. struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
  1297. struct kiocb *iocb = dio->iocb;
  1298. iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
  1299. }
  1300. /*
  1301. * Set an error in the dio if none is set yet. We have to use cmpxchg
  1302. * as the submission context and the completion context(s) can race to
  1303. * update the error.
  1304. */
  1305. static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
  1306. {
  1307. cmpxchg(&dio->error, 0, ret);
  1308. }
  1309. static void iomap_dio_bio_end_io(struct bio *bio)
  1310. {
  1311. struct iomap_dio *dio = bio->bi_private;
  1312. bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
  1313. if (bio->bi_status)
  1314. iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
  1315. if (atomic_dec_and_test(&dio->ref)) {
  1316. if (dio->wait_for_completion) {
  1317. struct task_struct *waiter = dio->submit.waiter;
  1318. WRITE_ONCE(dio->submit.waiter, NULL);
  1319. wake_up_process(waiter);
  1320. } else if (dio->flags & IOMAP_DIO_WRITE) {
  1321. struct inode *inode = file_inode(dio->iocb->ki_filp);
  1322. INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
  1323. queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
  1324. } else {
  1325. iomap_dio_complete_work(&dio->aio.work);
  1326. }
  1327. }
  1328. if (should_dirty) {
  1329. bio_check_pages_dirty(bio);
  1330. } else {
  1331. struct bio_vec *bvec;
  1332. int i;
  1333. bio_for_each_segment_all(bvec, bio, i)
  1334. put_page(bvec->bv_page);
  1335. bio_put(bio);
  1336. }
  1337. }
  1338. static blk_qc_t
  1339. iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
  1340. unsigned len)
  1341. {
  1342. struct page *page = ZERO_PAGE(0);
  1343. struct bio *bio;
  1344. bio = bio_alloc(GFP_KERNEL, 1);
  1345. bio_set_dev(bio, iomap->bdev);
  1346. bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
  1347. bio->bi_private = dio;
  1348. bio->bi_end_io = iomap_dio_bio_end_io;
  1349. get_page(page);
  1350. __bio_add_page(bio, page, len, 0);
  1351. bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
  1352. atomic_inc(&dio->ref);
  1353. return submit_bio(bio);
  1354. }
  1355. static loff_t
  1356. iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
  1357. struct iomap_dio *dio, struct iomap *iomap)
  1358. {
  1359. unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
  1360. unsigned int fs_block_size = i_blocksize(inode), pad;
  1361. unsigned int align = iov_iter_alignment(dio->submit.iter);
  1362. struct iov_iter iter;
  1363. struct bio *bio;
  1364. bool need_zeroout = false;
  1365. bool use_fua = false;
  1366. int nr_pages, ret = 0;
  1367. size_t copied = 0;
  1368. if ((pos | length | align) & ((1 << blkbits) - 1))
  1369. return -EINVAL;
  1370. if (iomap->type == IOMAP_UNWRITTEN) {
  1371. dio->flags |= IOMAP_DIO_UNWRITTEN;
  1372. need_zeroout = true;
  1373. }
  1374. if (iomap->flags & IOMAP_F_SHARED)
  1375. dio->flags |= IOMAP_DIO_COW;
  1376. if (iomap->flags & IOMAP_F_NEW) {
  1377. need_zeroout = true;
  1378. } else if (iomap->type == IOMAP_MAPPED) {
  1379. /*
  1380. * Use a FUA write if we need datasync semantics; this is a pure
  1381. * data IO that doesn't require any metadata updates (including
  1382. * after IO completion such as unwritten extent conversion) and
  1383. * the underlying device supports FUA. This allows us to avoid
  1384. * cache flushes on IO completion.
  1385. */
  1386. if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
  1387. (dio->flags & IOMAP_DIO_WRITE_FUA) &&
  1388. blk_queue_fua(bdev_get_queue(iomap->bdev)))
  1389. use_fua = true;
  1390. }
  1391. /*
  1392. * Operate on a partial iter trimmed to the extent we were called for.
  1393. * We'll update the iter in the dio once we're done with this extent.
  1394. */
  1395. iter = *dio->submit.iter;
  1396. iov_iter_truncate(&iter, length);
  1397. nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
  1398. if (nr_pages <= 0)
  1399. return nr_pages;
  1400. if (need_zeroout) {
  1401. /* zero out from the start of the block to the write offset */
  1402. pad = pos & (fs_block_size - 1);
  1403. if (pad)
  1404. iomap_dio_zero(dio, iomap, pos - pad, pad);
  1405. }
  1406. do {
  1407. size_t n;
  1408. if (dio->error) {
  1409. iov_iter_revert(dio->submit.iter, copied);
  1410. return 0;
  1411. }
  1412. bio = bio_alloc(GFP_KERNEL, nr_pages);
  1413. bio_set_dev(bio, iomap->bdev);
  1414. bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
  1415. bio->bi_write_hint = dio->iocb->ki_hint;
  1416. bio->bi_ioprio = dio->iocb->ki_ioprio;
  1417. bio->bi_private = dio;
  1418. bio->bi_end_io = iomap_dio_bio_end_io;
  1419. ret = bio_iov_iter_get_pages(bio, &iter);
  1420. if (unlikely(ret)) {
  1421. /*
  1422. * We have to stop part way through an IO. We must fall
  1423. * through to the sub-block tail zeroing here, otherwise
  1424. * this short IO may expose stale data in the tail of
  1425. * the block we haven't written data to.
  1426. */
  1427. bio_put(bio);
  1428. goto zero_tail;
  1429. }
  1430. n = bio->bi_iter.bi_size;
  1431. if (dio->flags & IOMAP_DIO_WRITE) {
  1432. bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
  1433. if (use_fua)
  1434. bio->bi_opf |= REQ_FUA;
  1435. else
  1436. dio->flags &= ~IOMAP_DIO_WRITE_FUA;
  1437. task_io_account_write(n);
  1438. } else {
  1439. bio->bi_opf = REQ_OP_READ;
  1440. if (dio->flags & IOMAP_DIO_DIRTY)
  1441. bio_set_pages_dirty(bio);
  1442. }
  1443. iov_iter_advance(dio->submit.iter, n);
  1444. dio->size += n;
  1445. pos += n;
  1446. copied += n;
  1447. nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
  1448. atomic_inc(&dio->ref);
  1449. dio->submit.last_queue = bdev_get_queue(iomap->bdev);
  1450. dio->submit.cookie = submit_bio(bio);
  1451. } while (nr_pages);
  1452. /*
  1453. * We need to zeroout the tail of a sub-block write if the extent type
  1454. * requires zeroing or the write extends beyond EOF. If we don't zero
  1455. * the block tail in the latter case, we can expose stale data via mmap
  1456. * reads of the EOF block.
  1457. */
  1458. zero_tail:
  1459. if (need_zeroout ||
  1460. ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
  1461. /* zero out from the end of the write to the end of the block */
  1462. pad = pos & (fs_block_size - 1);
  1463. if (pad)
  1464. iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
  1465. }
  1466. return copied ? copied : ret;
  1467. }
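
/*
 * Illustrative worked example (not part of the upstream code): with a 4096
 * byte fs block size, a 2048 byte write at pos 6144 into a freshly allocated
 * (IOMAP_F_NEW) block gives a head pad of pos & (fs_block_size - 1) =
 * 6144 & 4095 = 2048, so iomap_dio_zero() zeroes [4096, 6144) before the
 * data bio is issued. After the data bio, pos is 8192 and the tail pad is
 * 8192 & 4095 = 0, so no tail zeroing is needed; had the write ended at
 * 7168 instead, pad would be 3072 and the remaining fs_block_size - pad =
 * 1024 bytes up to 8192 would be zeroed.
 */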

static loff_t
iomap_dio_hole_actor(loff_t length, struct iomap_dio *dio)
{
	length = iov_iter_zero(length, dio->submit.iter);
	dio->size += length;
	return length;
}

static loff_t
iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
		struct iomap_dio *dio, struct iomap *iomap)
{
	struct iov_iter *iter = dio->submit.iter;
	size_t copied;

	BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data));

	if (dio->flags & IOMAP_DIO_WRITE) {
		loff_t size = inode->i_size;

		if (pos > size)
			memset(iomap->inline_data + size, 0, pos - size);
		copied = copy_from_iter(iomap->inline_data + pos, length, iter);
		if (copied) {
			if (pos + copied > size)
				i_size_write(inode, pos + copied);
			mark_inode_dirty(inode);
		}
	} else {
		copied = copy_to_iter(iomap->inline_data + pos, length, iter);
	}
	dio->size += copied;
	return copied;
}

static loff_t
iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
		void *data, struct iomap *iomap)
{
	struct iomap_dio *dio = data;

	switch (iomap->type) {
	case IOMAP_HOLE:
		if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
			return -EIO;
		return iomap_dio_hole_actor(length, dio);
	case IOMAP_UNWRITTEN:
		if (!(dio->flags & IOMAP_DIO_WRITE))
			return iomap_dio_hole_actor(length, dio);
		return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
	case IOMAP_MAPPED:
		return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
	case IOMAP_INLINE:
		return iomap_dio_inline_actor(inode, pos, length, dio, iomap);
	default:
		WARN_ON_ONCE(1);
		return -EIO;
	}
}

/*
 * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
 * is being issued as AIO or not. This allows us to optimise pure data writes
 * to use REQ_FUA rather than requiring generic_write_sync() to issue a
 * REQ_FLUSH post write. This is slightly tricky because a single request here
 * can be mapped into multiple disjoint IOs and only a subset of the IOs issued
 * may be pure data writes. In that case, we still need to do a full data sync
 * completion.
 */
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops, iomap_dio_end_io_t end_io)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	struct inode *inode = file_inode(iocb->ki_filp);
	size_t count = iov_iter_count(iter);
	loff_t pos = iocb->ki_pos, start = pos;
	loff_t end = iocb->ki_pos + count - 1, ret = 0;
	unsigned int flags = IOMAP_DIRECT;
	struct blk_plug plug;
	struct iomap_dio *dio;

	lockdep_assert_held(&inode->i_rwsem);

	if (!count)
		return 0;

	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
	if (!dio)
		return -ENOMEM;

	dio->iocb = iocb;
	atomic_set(&dio->ref, 1);
	dio->size = 0;
	dio->i_size = i_size_read(inode);
	dio->end_io = end_io;
	dio->error = 0;
	dio->flags = 0;
	dio->wait_for_completion = is_sync_kiocb(iocb);

	dio->submit.iter = iter;
	dio->submit.waiter = current;
	dio->submit.cookie = BLK_QC_T_NONE;
	dio->submit.last_queue = NULL;

	if (iov_iter_rw(iter) == READ) {
		if (pos >= dio->i_size)
			goto out_free_dio;

		if (iter_is_iovec(iter) && iov_iter_rw(iter) == READ)
			dio->flags |= IOMAP_DIO_DIRTY;
	} else {
		flags |= IOMAP_WRITE;
		dio->flags |= IOMAP_DIO_WRITE;

		/* for data sync or sync, we need sync completion processing */
		if (iocb->ki_flags & IOCB_DSYNC)
			dio->flags |= IOMAP_DIO_NEED_SYNC;

		/*
		 * For datasync only writes, we optimistically try using FUA for
		 * this IO. Any non-FUA write that occurs will clear this flag,
		 * hence we know before completion whether a cache flush is
		 * necessary.
		 */
		if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC)
			dio->flags |= IOMAP_DIO_WRITE_FUA;
	}

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (filemap_range_has_page(mapping, start, end)) {
			ret = -EAGAIN;
			goto out_free_dio;
		}
		flags |= IOMAP_NOWAIT;
	}

	ret = filemap_write_and_wait_range(mapping, start, end);
	if (ret)
		goto out_free_dio;

	/*
	 * Try to invalidate cache pages for the range we're direct
	 * writing. If this invalidation fails, tough, the write will
	 * still work, but racing two incompatible write paths is a
	 * pretty crazy thing to do, so we don't support it 100%.
	 */
	ret = invalidate_inode_pages2_range(mapping,
			start >> PAGE_SHIFT, end >> PAGE_SHIFT);
	if (ret)
		dio_warn_stale_pagecache(iocb->ki_filp);
	ret = 0;

	if (iov_iter_rw(iter) == WRITE && !dio->wait_for_completion &&
	    !inode->i_sb->s_dio_done_wq) {
		ret = sb_init_dio_done_wq(inode->i_sb);
		if (ret < 0)
			goto out_free_dio;
	}

	inode_dio_begin(inode);

	blk_start_plug(&plug);
	do {
		ret = iomap_apply(inode, pos, count, flags, ops, dio,
				iomap_dio_actor);
		if (ret <= 0) {
			/* magic error code to fall back to buffered I/O */
			if (ret == -ENOTBLK) {
				dio->wait_for_completion = true;
				ret = 0;
			}
			break;
		}
		pos += ret;

		if (iov_iter_rw(iter) == READ && pos >= dio->i_size)
			break;
	} while ((count = iov_iter_count(iter)) > 0);
	blk_finish_plug(&plug);

	if (ret < 0)
		iomap_dio_set_error(dio, ret);

	/*
	 * If all the writes we issued were FUA, we don't need to flush the
	 * cache on IO completion. Clear the sync flag for this case.
	 */
	if (dio->flags & IOMAP_DIO_WRITE_FUA)
		dio->flags &= ~IOMAP_DIO_NEED_SYNC;

	if (!atomic_dec_and_test(&dio->ref)) {
		if (!dio->wait_for_completion)
			return -EIOCBQUEUED;

		for (;;) {
			set_current_state(TASK_UNINTERRUPTIBLE);
			if (!READ_ONCE(dio->submit.waiter))
				break;

			if (!(iocb->ki_flags & IOCB_HIPRI) ||
			    !dio->submit.last_queue ||
			    !blk_poll(dio->submit.last_queue,
					dio->submit.cookie))
				io_schedule();
		}
		__set_current_state(TASK_RUNNING);
	}

	ret = iomap_dio_complete(dio);

	return ret;

out_free_dio:
	kfree(dio);
	return ret;
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);
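
/*
 * Example (illustrative sketch, not part of this file): a filesystem's
 * direct I/O read path would typically take the inode's i_rwsem (required by
 * the lockdep assertion above) and then hand the iocb and iov_iter to
 * iomap_dio_rw() together with its own iomap_ops. The "myfs" names below are
 * hypothetical placeholders:
 *
 *	static ssize_t myfs_file_dio_read(struct kiocb *iocb, struct iov_iter *to)
 *	{
 *		struct inode *inode = file_inode(iocb->ki_filp);
 *		ssize_t ret;
 *
 *		inode_lock_shared(inode);
 *		ret = iomap_dio_rw(iocb, to, &myfs_iomap_ops, NULL);
 *		inode_unlock_shared(inode);
 *		return ret;
 *	}
 *
 * Passing a non-NULL iomap_dio_end_io_t instead of NULL lets the filesystem
 * run its own completion work (e.g. unwritten extent conversion) before the
 * common completion code finishes the request.
 */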

/* Swapfile activation */

#ifdef CONFIG_SWAP
struct iomap_swapfile_info {
	struct iomap iomap;		/* accumulated iomap */
	struct swap_info_struct *sis;
	uint64_t lowest_ppage;		/* lowest physical addr seen (pages) */
	uint64_t highest_ppage;		/* highest physical addr seen (pages) */
	unsigned long nr_pages;		/* number of pages collected */
	int nr_extents;			/* extent count */
};

/*
 * Collect physical extents for this swap file. Physical extents reported to
 * the swap code must be trimmed to align to a page boundary. The logical
 * offset within the file is irrelevant since the swapfile code maps logical
 * page numbers of the swap device to the physical page-aligned extents.
 */
static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi)
{
	struct iomap *iomap = &isi->iomap;
	unsigned long nr_pages;
	uint64_t first_ppage;
	uint64_t first_ppage_reported;
	uint64_t next_ppage;
	int error;

	/*
	 * Round the start up and the end down so that the physical
	 * extent aligns to a page boundary.
	 */
	first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT;
	next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >>
			PAGE_SHIFT;

	/* Skip too-short physical extents. */
	if (first_ppage >= next_ppage)
		return 0;
	nr_pages = next_ppage - first_ppage;

	/*
	 * Calculate how much swap space we're adding; the first page contains
	 * the swap header and doesn't count. The mm still wants that first
	 * page fed to add_swap_extent, however.
	 */
	first_ppage_reported = first_ppage;
	if (iomap->offset == 0)
		first_ppage_reported++;
	if (isi->lowest_ppage > first_ppage_reported)
		isi->lowest_ppage = first_ppage_reported;
	if (isi->highest_ppage < (next_ppage - 1))
		isi->highest_ppage = next_ppage - 1;

	/* Add extent, set up for the next call. */
	error = add_swap_extent(isi->sis, isi->nr_pages, nr_pages, first_ppage);
	if (error < 0)
		return error;
	isi->nr_extents += error;
	isi->nr_pages += nr_pages;
	return 0;
}
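
/*
 * Illustrative worked example (not part of the upstream code): with 4096
 * byte pages, an iomap with addr = 0x3200 and length = 0x5000 covers the
 * byte range [0x3200, 0x8200). ALIGN(0x3200, PAGE_SIZE) >> PAGE_SHIFT gives
 * first_ppage = 4 and ALIGN_DOWN(0x8200, PAGE_SIZE) >> PAGE_SHIFT gives
 * next_ppage = 8, so only the fully covered physical pages 4..7 (nr_pages =
 * 4) are handed to add_swap_extent(); the unaligned head and tail bytes are
 * dropped.
 */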

/*
 * Accumulate iomaps for this swap file. We have to accumulate iomaps because
 * swap only cares about contiguous page-aligned physical extents and makes no
 * distinction between written and unwritten extents.
 */
static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos,
		loff_t count, void *data, struct iomap *iomap)
{
	struct iomap_swapfile_info *isi = data;
	int error;

	switch (iomap->type) {
	case IOMAP_MAPPED:
	case IOMAP_UNWRITTEN:
		/* Only real or unwritten extents. */
		break;
	case IOMAP_INLINE:
		/* No inline data. */
		pr_err("swapon: file is inline\n");
		return -EINVAL;
	default:
		pr_err("swapon: file has unallocated extents\n");
		return -EINVAL;
	}

	/* No uncommitted metadata or shared blocks. */
	if (iomap->flags & IOMAP_F_DIRTY) {
		pr_err("swapon: file is not committed\n");
		return -EINVAL;
	}
	if (iomap->flags & IOMAP_F_SHARED) {
		pr_err("swapon: file has shared extents\n");
		return -EINVAL;
	}

	/* Only one bdev per swap file. */
	if (iomap->bdev != isi->sis->bdev) {
		pr_err("swapon: file is on multiple devices\n");
		return -EINVAL;
	}

	if (isi->iomap.length == 0) {
		/* No accumulated extent, so just store it. */
		memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
	} else if (isi->iomap.addr + isi->iomap.length == iomap->addr) {
		/* Append this to the accumulated extent. */
		isi->iomap.length += iomap->length;
	} else {
		/* Otherwise, add the retained iomap and store this one. */
		error = iomap_swapfile_add_extent(isi);
		if (error)
			return error;
		memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
	}
	return count;
}

/*
 * Iterate a swap file's iomaps to construct physical extents that can be
 * passed to the swapfile subsystem.
 */
int iomap_swapfile_activate(struct swap_info_struct *sis,
		struct file *swap_file, sector_t *pagespan,
		const struct iomap_ops *ops)
{
	struct iomap_swapfile_info isi = {
		.sis = sis,
		.lowest_ppage = (sector_t)-1ULL,
	};
	struct address_space *mapping = swap_file->f_mapping;
	struct inode *inode = mapping->host;
	loff_t pos = 0;
	loff_t len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE);
	loff_t ret;

	/*
	 * Persist all file mapping metadata so that we won't have any
	 * IOMAP_F_DIRTY iomaps.
	 */
	ret = vfs_fsync(swap_file, 1);
	if (ret)
		return ret;

	while (len > 0) {
		ret = iomap_apply(inode, pos, len, IOMAP_REPORT,
				ops, &isi, iomap_swapfile_activate_actor);
		if (ret <= 0)
			return ret;

		pos += ret;
		len -= ret;
	}

	if (isi.iomap.length) {
		ret = iomap_swapfile_add_extent(&isi);
		if (ret)
			return ret;
	}

	*pagespan = 1 + isi.highest_ppage - isi.lowest_ppage;
	sis->max = isi.nr_pages;
	sis->pages = isi.nr_pages - 1;
	sis->highest_bit = isi.nr_pages - 1;
	return isi.nr_extents;
}
EXPORT_SYMBOL_GPL(iomap_swapfile_activate);
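
/*
 * Example (illustrative sketch, not part of this file): a filesystem that
 * already provides iomap_ops can wire up swapfile support by pointing its
 * address_space_operations ->swap_activate at a thin wrapper. The "myfs"
 * names are hypothetical placeholders:
 *
 *	static int myfs_swap_activate(struct swap_info_struct *sis,
 *			struct file *swap_file, sector_t *span)
 *	{
 *		sis->bdev = myfs_find_bdev(file_inode(swap_file));
 *		return iomap_swapfile_activate(sis, swap_file, span,
 *				&myfs_iomap_ops);
 *	}
 *
 * sis->bdev must be set before the call so that the actor's "only one bdev
 * per swap file" check above has something to compare against.
 */
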
#endif /* CONFIG_SWAP */

static loff_t
iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length,
		void *data, struct iomap *iomap)
{
	sector_t *bno = data, addr;

	if (iomap->type == IOMAP_MAPPED) {
		addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits;
		if (addr > INT_MAX)
			WARN(1, "would truncate bmap result\n");
		else
			*bno = addr;
	}
	return 0;
}

/* legacy ->bmap interface.  0 is the error return (!) */
sector_t
iomap_bmap(struct address_space *mapping, sector_t bno,
		const struct iomap_ops *ops)
{
	struct inode *inode = mapping->host;
	loff_t pos = bno << inode->i_blkbits;
	unsigned blocksize = i_blocksize(inode);

	if (filemap_write_and_wait(mapping))
		return 0;

	bno = 0;
	iomap_apply(inode, pos, blocksize, 0, ops, &bno, iomap_bmap_actor);
	return bno;
}
EXPORT_SYMBOL_GPL(iomap_bmap);
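
/*
 * Example (illustrative sketch, not part of this file): a filesystem exposes
 * the legacy FIBMAP interface by calling iomap_bmap() from its
 * address_space_operations ->bmap method. "myfs" is a hypothetical
 * placeholder:
 *
 *	static sector_t myfs_bmap(struct address_space *mapping, sector_t block)
 *	{
 *		return iomap_bmap(mapping, block, &myfs_iomap_ops);
 *	}
 *
 * Note the legacy convention documented above: 0 is returned both on error
 * and for an unmapped block, so callers cannot distinguish the two.
 */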