iomap.c

/*
 * Copyright (C) 2010 Red Hat, Inc.
 * Copyright (c) 2016 Christoph Hellwig.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/uaccess.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/dax.h>
#include <linux/sched/signal.h>

#include "internal.h"

/*
 * Execute an iomap write on a segment of the mapping that spans a
 * contiguous range of pages that have identical block mapping state.
 *
 * This avoids the need to map pages individually, do individual allocations
 * for each page and most importantly avoid the need for filesystem specific
 * locking per page. Instead, all the operations are amortised over the entire
 * range of pages. It is assumed that the filesystems will lock whatever
 * resources they require in the iomap_begin call, and release them in the
 * iomap_end call.
 */
loff_t
iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
	const struct iomap_ops *ops, void *data, iomap_actor_t actor)
{
	struct iomap iomap = { 0 };
	loff_t written = 0, ret;

	/*
	 * Need to map a range from start position for length bytes. This can
	 * span multiple pages - it is only guaranteed to return a range of a
	 * single type of pages (e.g. all into a hole, all mapped or all
	 * unwritten). Failure at this point has nothing to undo.
	 *
	 * If allocation is required for this range, reserve the space now so
	 * that the allocation is guaranteed to succeed later on. Once we copy
	 * the data into the page cache pages, then we cannot fail otherwise we
	 * expose transient stale data. If the reserve fails, we can safely
	 * back out at this point as there is nothing to undo.
	 */
	ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
	if (ret)
		return ret;
	if (WARN_ON(iomap.offset > pos))
		return -EIO;

	/*
	 * Cut down the length to the one actually provided by the filesystem,
	 * as it might not be able to give us the whole size that we requested.
	 */
	if (iomap.offset + iomap.length < pos + length)
		length = iomap.offset + iomap.length - pos;

	/*
	 * Now that we have guaranteed that the space allocation will succeed,
	 * we can do the copy-in page by page without having to worry about
	 * failures exposing transient data.
	 */
	written = actor(inode, pos, length, data, &iomap);

	/*
	 * Now the data has been copied, commit the range we've copied. This
	 * should not fail unless the filesystem has had a fatal error.
	 */
	if (ops->iomap_end) {
		ret = ops->iomap_end(inode, pos, length,
				     written > 0 ? written : 0,
				     flags, &iomap);
	}

	return written ? written : ret;
}
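
/*
 * Example (illustrative sketch only, not part of the original file): how a
 * caller typically drives iomap_apply().  The actor below merely counts the
 * bytes it was handed; "example_count_actor" and "example_count_bytes" are
 * hypothetical names, and a real filesystem would supply its own iomap_ops.
 */
static loff_t
example_count_actor(struct inode *inode, loff_t pos, loff_t length,
		void *data, struct iomap *iomap)
{
	loff_t *total = data;

	/* "Process" the extent: here we only account for its length. */
	*total += length;
	return length;		/* tell iomap_apply how much was consumed */
}

static loff_t
example_count_bytes(struct inode *inode, loff_t pos, loff_t len,
		const struct iomap_ops *ops)
{
	loff_t total = 0, ret;

	while (len > 0) {
		ret = iomap_apply(inode, pos, len, IOMAP_REPORT, ops,
				&total, example_count_actor);
		if (ret <= 0)
			return ret;
		pos += ret;
		len -= ret;
	}
	return total;
}
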
static void
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
{
	loff_t i_size = i_size_read(inode);

	/*
	 * Only truncate newly allocated pages beyond EOF, even if the
	 * write started inside the existing inode size.
	 */
	if (pos + len > i_size)
		truncate_pagecache_range(inode, max(pos, i_size), pos + len);
}

static int
iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
		struct page **pagep, struct iomap *iomap)
{
	pgoff_t index = pos >> PAGE_SHIFT;
	struct page *page;
	int status = 0;

	BUG_ON(pos + len > iomap->offset + iomap->length);

	if (fatal_signal_pending(current))
		return -EINTR;

	page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
	if (!page)
		return -ENOMEM;

	status = __block_write_begin_int(page, pos, len, NULL, iomap);
	if (unlikely(status)) {
		unlock_page(page);
		put_page(page);
		page = NULL;

		iomap_write_failed(inode, pos, len);
	}

	*pagep = page;
	return status;
}

static int
iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
		unsigned copied, struct page *page)
{
	int ret;

	ret = generic_write_end(NULL, inode->i_mapping, pos, len,
			copied, page, NULL);
	if (ret < len)
		iomap_write_failed(inode, pos, len);
	return ret;
}

static loff_t
iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap)
{
	struct iov_iter *i = data;
	long status = 0;
	ssize_t written = 0;
	unsigned int flags = AOP_FLAG_NOFS;

	do {
		struct page *page;
		unsigned long offset;	/* Offset into pagecache page */
		unsigned long bytes;	/* Bytes to write to page */
		size_t copied;		/* Bytes copied from user */

		offset = (pos & (PAGE_SIZE - 1));
		bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_count(i));
again:
		if (bytes > length)
			bytes = length;

		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 *
		 * Not only is this an optimisation, but it is also required
		 * to check that the address is actually valid, when atomic
		 * usercopies are used, below.
		 */
		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
			status = -EFAULT;
			break;
		}

		status = iomap_write_begin(inode, pos, bytes, flags, &page,
				iomap);
		if (unlikely(status))
			break;

		if (mapping_writably_mapped(inode->i_mapping))
			flush_dcache_page(page);

		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);

		flush_dcache_page(page);

		status = iomap_write_end(inode, pos, bytes, copied, page);
		if (unlikely(status < 0))
			break;
		copied = status;

		cond_resched();

		iov_iter_advance(i, copied);
		if (unlikely(copied == 0)) {
			/*
			 * If we were unable to copy any data at all, we must
			 * fall back to a single segment length write.
			 *
			 * If we didn't fallback here, we could livelock
			 * because not all segments in the iov can be copied at
			 * once without a pagefault.
			 */
			bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_single_seg_count(i));
			goto again;
		}
		pos += copied;
		written += copied;
		length -= copied;

		balance_dirty_pages_ratelimited(inode->i_mapping);
	} while (iov_iter_count(i) && length);

	return written ? written : status;
}

ssize_t
iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	loff_t pos = iocb->ki_pos, ret = 0, written = 0;

	while (iov_iter_count(iter)) {
		ret = iomap_apply(inode, pos, iov_iter_count(iter),
				IOMAP_WRITE, ops, iter, iomap_write_actor);
		if (ret <= 0)
			break;
		pos += ret;
		written += ret;
	}

	return written ? written : ret;
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
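
/*
 * Example (illustrative sketch only): how a filesystem's ->write_iter might
 * call iomap_file_buffered_write().  "example_iomap_ops" and
 * "example_file_write_iter" are hypothetical names; locking and O_DIRECT
 * handling are deliberately minimal here, and a real implementation
 * (e.g. XFS) does considerably more.
 */
static const struct iomap_ops example_iomap_ops;	/* assumed to exist */

static ssize_t
example_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = iomap_file_buffered_write(iocb, from,
				&example_iomap_ops);
	inode_unlock(inode);

	if (ret > 0) {
		/* this version of iomap does not advance ki_pos itself */
		iocb->ki_pos += ret;
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}
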
static struct page *
__iomap_read_page(struct inode *inode, loff_t offset)
{
	struct address_space *mapping = inode->i_mapping;
	struct page *page;

	page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL);
	if (IS_ERR(page))
		return page;
	if (!PageUptodate(page)) {
		put_page(page);
		return ERR_PTR(-EIO);
	}
	return page;
}

static loff_t
iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap)
{
	long status = 0;
	ssize_t written = 0;

	do {
		struct page *page, *rpage;
		unsigned long offset;	/* Offset into pagecache page */
		unsigned long bytes;	/* Bytes to write to page */

		offset = (pos & (PAGE_SIZE - 1));
		bytes = min_t(unsigned long, PAGE_SIZE - offset, length);

		rpage = __iomap_read_page(inode, pos);
		if (IS_ERR(rpage))
			return PTR_ERR(rpage);

		status = iomap_write_begin(inode, pos, bytes,
					   AOP_FLAG_NOFS, &page, iomap);
		put_page(rpage);
		if (unlikely(status))
			return status;

		WARN_ON_ONCE(!PageUptodate(page));

		status = iomap_write_end(inode, pos, bytes, bytes, page);
		if (unlikely(status <= 0)) {
			if (WARN_ON_ONCE(status == 0))
				return -EIO;
			return status;
		}

		cond_resched();

		pos += status;
		written += status;
		length -= status;

		balance_dirty_pages_ratelimited(inode->i_mapping);
	} while (length);

	return written;
}

int
iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
		const struct iomap_ops *ops)
{
	loff_t ret;

	while (len) {
		ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL,
				iomap_dirty_actor);
		if (ret <= 0)
			return ret;
		pos += ret;
		len -= ret;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(iomap_file_dirty);

static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
		unsigned bytes, struct iomap *iomap)
{
	struct page *page;
	int status;

	status = iomap_write_begin(inode, pos, bytes, AOP_FLAG_NOFS, &page,
				   iomap);
	if (status)
		return status;

	zero_user(page, offset, bytes);
	mark_page_accessed(page);

	return iomap_write_end(inode, pos, bytes, bytes, page);
}

static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
		struct iomap *iomap)
{
	sector_t sector = iomap->blkno +
		(((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);

	return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector,
			offset, bytes);
}

static loff_t
iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
		void *data, struct iomap *iomap)
{
	bool *did_zero = data;
	loff_t written = 0;
	int status;

	/* already zeroed?  we're done. */
	if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
		return count;

	do {
		unsigned offset, bytes;

		offset = pos & (PAGE_SIZE - 1); /* Within page */
		bytes = min_t(unsigned, PAGE_SIZE - offset, count);

		if (IS_DAX(inode))
			status = iomap_dax_zero(pos, offset, bytes, iomap);
		else
			status = iomap_zero(inode, pos, offset, bytes, iomap);
		if (status < 0)
			return status;

		pos += bytes;
		count -= bytes;
		written += bytes;
		if (did_zero)
			*did_zero = true;
	} while (count > 0);

	return written;
}

int
iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
		const struct iomap_ops *ops)
{
	loff_t ret;

	while (len > 0) {
		ret = iomap_apply(inode, pos, len, IOMAP_ZERO,
				ops, did_zero, iomap_zero_range_actor);
		if (ret <= 0)
			return ret;

		pos += ret;
		len -= ret;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(iomap_zero_range);

int
iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
		const struct iomap_ops *ops)
{
	unsigned int blocksize = i_blocksize(inode);
	unsigned int off = pos & (blocksize - 1);

	/* Block boundary? Nothing to do */
	if (!off)
		return 0;
	return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
}
EXPORT_SYMBOL_GPL(iomap_truncate_page);

static loff_t
iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
		void *data, struct iomap *iomap)
{
	struct page *page = data;
	int ret;

	ret = __block_write_begin_int(page, pos, length, NULL, iomap);
	if (ret)
		return ret;

	block_commit_write(page, 0, length);
	return length;
}

int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
{
	struct page *page = vmf->page;
	struct inode *inode = file_inode(vmf->vma->vm_file);
	unsigned long length;
	loff_t offset, size;
	ssize_t ret;

	lock_page(page);
	size = i_size_read(inode);
	if ((page->mapping != inode->i_mapping) ||
	    (page_offset(page) > size)) {
		/* We overload EFAULT to mean page got truncated */
		ret = -EFAULT;
		goto out_unlock;
	}

	/* page is wholly or partially inside EOF */
	if (((page->index + 1) << PAGE_SHIFT) > size)
		length = size & ~PAGE_MASK;
	else
		length = PAGE_SIZE;

	offset = page_offset(page);
	while (length > 0) {
		ret = iomap_apply(inode, offset, length,
				IOMAP_WRITE | IOMAP_FAULT, ops, page,
				iomap_page_mkwrite_actor);
		if (unlikely(ret <= 0))
			goto out_unlock;
		offset += ret;
		length -= ret;
	}

	set_page_dirty(page);
	wait_for_stable_page(page);
	return 0;
out_unlock:
	unlock_page(page);
	return ret;
}
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);

struct fiemap_ctx {
	struct fiemap_extent_info *fi;
	struct iomap prev;
};

static int iomap_to_fiemap(struct fiemap_extent_info *fi,
		struct iomap *iomap, u32 flags)
{
	switch (iomap->type) {
	case IOMAP_HOLE:
		/* skip holes */
		return 0;
	case IOMAP_DELALLOC:
		flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
		break;
	case IOMAP_UNWRITTEN:
		flags |= FIEMAP_EXTENT_UNWRITTEN;
		break;
	case IOMAP_MAPPED:
		break;
	}

	if (iomap->flags & IOMAP_F_MERGED)
		flags |= FIEMAP_EXTENT_MERGED;
	if (iomap->flags & IOMAP_F_SHARED)
		flags |= FIEMAP_EXTENT_SHARED;

	return fiemap_fill_next_extent(fi, iomap->offset,
			iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9 : 0,
			iomap->length, flags);
}

static loff_t
iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap)
{
	struct fiemap_ctx *ctx = data;
	loff_t ret = length;

	if (iomap->type == IOMAP_HOLE)
		return length;

	ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0);
	ctx->prev = *iomap;
	switch (ret) {
	case 0:		/* success */
		return length;
	case 1:		/* extent array full */
		return 0;
	default:
		return ret;
	}
}

int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
		loff_t start, loff_t len, const struct iomap_ops *ops)
{
	struct fiemap_ctx ctx;
	loff_t ret;

	memset(&ctx, 0, sizeof(ctx));
	ctx.fi = fi;
	ctx.prev.type = IOMAP_HOLE;

	ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC);
	if (ret)
		return ret;

	if (fi->fi_flags & FIEMAP_FLAG_SYNC) {
		ret = filemap_write_and_wait(inode->i_mapping);
		if (ret)
			return ret;
	}

	while (len > 0) {
		ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx,
				iomap_fiemap_actor);
		/* inode with no (attribute) mapping will give ENOENT */
		if (ret == -ENOENT)
			break;
		if (ret < 0)
			return ret;
		if (ret == 0)
			break;

		start += ret;
		len -= ret;
	}

	if (ctx.prev.type != IOMAP_HOLE) {
		ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST);
		if (ret < 0)
			return ret;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(iomap_fiemap);
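
/*
 * Example (illustrative sketch only): a filesystem ->fiemap handler built on
 * iomap_fiemap().  "example_iomap_report_ops" is hypothetical and would point
 * at an iomap_begin implementation that reports extents without allocating.
 */
static const struct iomap_ops example_iomap_report_ops;	/* assumed to exist */

static int example_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
		u64 start, u64 len)
{
	return iomap_fiemap(inode, fi, start, len, &example_iomap_report_ops);
}
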
static loff_t
iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length,
		      void *data, struct iomap *iomap)
{
	switch (iomap->type) {
	case IOMAP_UNWRITTEN:
		offset = page_cache_seek_hole_data(inode, offset, length,
						   SEEK_HOLE);
		if (offset < 0)
			return length;
		/* fall through */
	case IOMAP_HOLE:
		*(loff_t *)data = offset;
		return 0;
	default:
		return length;
	}
}

loff_t
iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops)
{
	loff_t size = i_size_read(inode);
	loff_t length = size - offset;
	loff_t ret;

	/* Nothing to be found before or beyond the end of the file. */
	if (offset < 0 || offset >= size)
		return -ENXIO;

	while (length > 0) {
		ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops,
				  &offset, iomap_seek_hole_actor);
		if (ret < 0)
			return ret;
		if (ret == 0)
			break;

		offset += ret;
		length -= ret;
	}

	return offset;
}
EXPORT_SYMBOL_GPL(iomap_seek_hole);

static loff_t
iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length,
		      void *data, struct iomap *iomap)
{
	switch (iomap->type) {
	case IOMAP_HOLE:
		return length;
	case IOMAP_UNWRITTEN:
		offset = page_cache_seek_hole_data(inode, offset, length,
						   SEEK_DATA);
		if (offset < 0)
			return length;
		/*FALLTHRU*/
	default:
		*(loff_t *)data = offset;
		return 0;
	}
}

loff_t
iomap_seek_data(struct inode *inode, loff_t offset, const struct iomap_ops *ops)
{
	loff_t size = i_size_read(inode);
	loff_t length = size - offset;
	loff_t ret;

	/* Nothing to be found before or beyond the end of the file. */
	if (offset < 0 || offset >= size)
		return -ENXIO;

	while (length > 0) {
		ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops,
				  &offset, iomap_seek_data_actor);
		if (ret < 0)
			return ret;
		if (ret == 0)
			break;

		offset += ret;
		length -= ret;
	}

	if (length <= 0)
		return -ENXIO;
	return offset;
}
EXPORT_SYMBOL_GPL(iomap_seek_data);
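
/*
 * Example (illustrative sketch only): wiring iomap_seek_hole() and
 * iomap_seek_data() into a filesystem's ->llseek.  "example_file_llseek" is a
 * hypothetical name and reuses the hypothetical example_iomap_report_ops
 * declared in the fiemap sketch above.
 */
static loff_t
example_file_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file_inode(file);

	switch (whence) {
	case SEEK_HOLE:
		offset = iomap_seek_hole(inode, offset,
					 &example_iomap_report_ops);
		break;
	case SEEK_DATA:
		offset = iomap_seek_data(inode, offset,
					 &example_iomap_report_ops);
		break;
	default:
		return generic_file_llseek(file, offset, whence);
	}

	if (offset < 0)
		return offset;
	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}
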
/*
 * Private flags for iomap_dio, must not overlap with the public ones in
 * iomap.h:
 */
#define IOMAP_DIO_WRITE		(1 << 30)
#define IOMAP_DIO_DIRTY		(1 << 31)

struct iomap_dio {
	struct kiocb		*iocb;
	iomap_dio_end_io_t	*end_io;
	loff_t			i_size;
	loff_t			size;
	atomic_t		ref;
	unsigned		flags;
	int			error;

	union {
		/* used during submission and for synchronous completion: */
		struct {
			struct iov_iter		*iter;
			struct task_struct	*waiter;
			struct request_queue	*last_queue;
			blk_qc_t		cookie;
		} submit;

		/* used for aio completion: */
		struct {
			struct work_struct	work;
		} aio;
	};
};

static ssize_t iomap_dio_complete(struct iomap_dio *dio)
{
	struct kiocb *iocb = dio->iocb;
	ssize_t ret;

	if (dio->end_io) {
		ret = dio->end_io(iocb,
				dio->error ? dio->error : dio->size,
				dio->flags);
	} else {
		ret = dio->error;
	}

	if (likely(!ret)) {
		ret = dio->size;
		/* check for short read */
		if (iocb->ki_pos + ret > dio->i_size &&
		    !(dio->flags & IOMAP_DIO_WRITE))
			ret = dio->i_size - iocb->ki_pos;
		iocb->ki_pos += ret;
	}

	inode_dio_end(file_inode(iocb->ki_filp));
	kfree(dio);

	return ret;
}

static void iomap_dio_complete_work(struct work_struct *work)
{
	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
	struct kiocb *iocb = dio->iocb;
	bool is_write = (dio->flags & IOMAP_DIO_WRITE);
	ssize_t ret;

	ret = iomap_dio_complete(dio);
	if (is_write && ret > 0)
		ret = generic_write_sync(iocb, ret);
	iocb->ki_complete(iocb, ret, 0);
}

/*
 * Set an error in the dio if none is set yet.  We have to use cmpxchg
 * as the submission context and the completion context(s) can race to
 * update the error.
 */
static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
{
	cmpxchg(&dio->error, 0, ret);
}

static void iomap_dio_bio_end_io(struct bio *bio)
{
	struct iomap_dio *dio = bio->bi_private;
	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);

	if (bio->bi_status)
		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));

	if (atomic_dec_and_test(&dio->ref)) {
		if (is_sync_kiocb(dio->iocb)) {
			struct task_struct *waiter = dio->submit.waiter;

			WRITE_ONCE(dio->submit.waiter, NULL);
			wake_up_process(waiter);
		} else if (dio->flags & IOMAP_DIO_WRITE) {
			struct inode *inode = file_inode(dio->iocb->ki_filp);

			INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
			queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
		} else {
			iomap_dio_complete_work(&dio->aio.work);
		}
	}

	if (should_dirty) {
		bio_check_pages_dirty(bio);
	} else {
		struct bio_vec *bvec;
		int i;

		bio_for_each_segment_all(bvec, bio, i)
			put_page(bvec->bv_page);
		bio_put(bio);
	}
}

static blk_qc_t
iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
		unsigned len)
{
	struct page *page = ZERO_PAGE(0);
	struct bio *bio;

	bio = bio_alloc(GFP_KERNEL, 1);
	bio->bi_bdev = iomap->bdev;
	bio->bi_iter.bi_sector =
		iomap->blkno + ((pos - iomap->offset) >> 9);
	bio->bi_private = dio;
	bio->bi_end_io = iomap_dio_bio_end_io;

	get_page(page);
	if (bio_add_page(bio, page, len, 0) != len)
		BUG();
	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);

	atomic_inc(&dio->ref);
	return submit_bio(bio);
}

static loff_t
iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
		void *data, struct iomap *iomap)
{
	struct iomap_dio *dio = data;
	unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
	unsigned int fs_block_size = i_blocksize(inode), pad;
	unsigned int align = iov_iter_alignment(dio->submit.iter);
	struct iov_iter iter;
	struct bio *bio;
	bool need_zeroout = false;
	int nr_pages, ret;

	if ((pos | length | align) & ((1 << blkbits) - 1))
		return -EINVAL;

	switch (iomap->type) {
	case IOMAP_HOLE:
		if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
			return -EIO;
		/*FALLTHRU*/
	case IOMAP_UNWRITTEN:
		if (!(dio->flags & IOMAP_DIO_WRITE)) {
			iov_iter_zero(length, dio->submit.iter);
			dio->size += length;
			return length;
		}
		dio->flags |= IOMAP_DIO_UNWRITTEN;
		need_zeroout = true;
		break;
	case IOMAP_MAPPED:
		if (iomap->flags & IOMAP_F_SHARED)
			dio->flags |= IOMAP_DIO_COW;
		if (iomap->flags & IOMAP_F_NEW)
			need_zeroout = true;
		break;
	default:
		WARN_ON_ONCE(1);
		return -EIO;
	}

	/*
	 * Operate on a partial iter trimmed to the extent we were called for.
	 * We'll update the iter in the dio once we're done with this extent.
	 */
	iter = *dio->submit.iter;
	iov_iter_truncate(&iter, length);

	nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
	if (nr_pages <= 0)
		return nr_pages;

	if (need_zeroout) {
		/* zero out from the start of the block to the write offset */
		pad = pos & (fs_block_size - 1);
		if (pad)
			iomap_dio_zero(dio, iomap, pos - pad, pad);
	}

	do {
		if (dio->error)
			return 0;

		bio = bio_alloc(GFP_KERNEL, nr_pages);
		bio->bi_bdev = iomap->bdev;
		bio->bi_iter.bi_sector =
			iomap->blkno + ((pos - iomap->offset) >> 9);
		bio->bi_write_hint = dio->iocb->ki_hint;
		bio->bi_private = dio;
		bio->bi_end_io = iomap_dio_bio_end_io;

		ret = bio_iov_iter_get_pages(bio, &iter);
		if (unlikely(ret)) {
			bio_put(bio);
			return ret;
		}

		if (dio->flags & IOMAP_DIO_WRITE) {
			bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
			task_io_account_write(bio->bi_iter.bi_size);
		} else {
			bio_set_op_attrs(bio, REQ_OP_READ, 0);
			if (dio->flags & IOMAP_DIO_DIRTY)
				bio_set_pages_dirty(bio);
		}

		dio->size += bio->bi_iter.bi_size;
		pos += bio->bi_iter.bi_size;

		nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);

		atomic_inc(&dio->ref);

		dio->submit.last_queue = bdev_get_queue(iomap->bdev);
		dio->submit.cookie = submit_bio(bio);
	} while (nr_pages);

	if (need_zeroout) {
		/* zero out from the end of the write to the end of the block */
		pad = pos & (fs_block_size - 1);
		if (pad)
			iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
	}

	iov_iter_advance(dio->submit.iter, length);
	return length;
}

ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops, iomap_dio_end_io_t end_io)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	struct inode *inode = file_inode(iocb->ki_filp);
	size_t count = iov_iter_count(iter);
	loff_t pos = iocb->ki_pos, start = pos;
	loff_t end = iocb->ki_pos + count - 1, ret = 0;
	unsigned int flags = IOMAP_DIRECT;
	struct blk_plug plug;
	struct iomap_dio *dio;

	lockdep_assert_held(&inode->i_rwsem);

	if (!count)
		return 0;

	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
	if (!dio)
		return -ENOMEM;

	dio->iocb = iocb;
	atomic_set(&dio->ref, 1);
	dio->size = 0;
	dio->i_size = i_size_read(inode);
	dio->end_io = end_io;
	dio->error = 0;
	dio->flags = 0;

	dio->submit.iter = iter;
	if (is_sync_kiocb(iocb)) {
		dio->submit.waiter = current;
		dio->submit.cookie = BLK_QC_T_NONE;
		dio->submit.last_queue = NULL;
	}

	if (iov_iter_rw(iter) == READ) {
		if (pos >= dio->i_size)
			goto out_free_dio;

		if (iter->type == ITER_IOVEC)
			dio->flags |= IOMAP_DIO_DIRTY;
	} else {
		dio->flags |= IOMAP_DIO_WRITE;
		flags |= IOMAP_WRITE;
	}

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (filemap_range_has_page(mapping, start, end)) {
			ret = -EAGAIN;
			goto out_free_dio;
		}
		flags |= IOMAP_NOWAIT;
	}

	ret = filemap_write_and_wait_range(mapping, start, end);
	if (ret)
		goto out_free_dio;

	ret = invalidate_inode_pages2_range(mapping,
			start >> PAGE_SHIFT, end >> PAGE_SHIFT);
	WARN_ON_ONCE(ret);
	ret = 0;

	inode_dio_begin(inode);

	blk_start_plug(&plug);
	do {
		ret = iomap_apply(inode, pos, count, flags, ops, dio,
				iomap_dio_actor);
		if (ret <= 0) {
			/* magic error code to fall back to buffered I/O */
			if (ret == -ENOTBLK)
				ret = 0;
			break;
		}
		pos += ret;

		if (iov_iter_rw(iter) == READ && pos >= dio->i_size)
			break;
	} while ((count = iov_iter_count(iter)) > 0);
	blk_finish_plug(&plug);

	if (ret < 0)
		iomap_dio_set_error(dio, ret);

	if (ret >= 0 && iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) &&
			!inode->i_sb->s_dio_done_wq) {
		ret = sb_init_dio_done_wq(inode->i_sb);
		if (ret < 0)
			iomap_dio_set_error(dio, ret);
	}

	if (!atomic_dec_and_test(&dio->ref)) {
		if (!is_sync_kiocb(iocb))
			return -EIOCBQUEUED;

		for (;;) {
			set_current_state(TASK_UNINTERRUPTIBLE);
			if (!READ_ONCE(dio->submit.waiter))
				break;

			if (!(iocb->ki_flags & IOCB_HIPRI) ||
			    !dio->submit.last_queue ||
			    !blk_mq_poll(dio->submit.last_queue,
					 dio->submit.cookie))
				io_schedule();
		}
		__set_current_state(TASK_RUNNING);
	}

	ret = iomap_dio_complete(dio);

	/*
	 * Try again to invalidate clean pages which might have been cached by
	 * non-direct readahead, or faulted in by get_user_pages() if the source
	 * of the write was an mmap'ed region of the file we're writing.  Either
	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
	 * this invalidation fails, tough, the write still worked...
	 */
	if (iov_iter_rw(iter) == WRITE) {
		int err = invalidate_inode_pages2_range(mapping,
				start >> PAGE_SHIFT, end >> PAGE_SHIFT);
		WARN_ON_ONCE(err);
	}

	return ret;

out_free_dio:
	kfree(dio);
	return ret;
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);
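
/*
 * Example (illustrative sketch only): a filesystem's O_DIRECT read path built
 * on iomap_dio_rw().  "example_dio_read" is a hypothetical name and reuses the
 * hypothetical example_iomap_ops declared in the buffered-write sketch above;
 * a real filesystem would also pass an end_io callback if it needs to convert
 * unwritten extents on completion.
 */
static ssize_t
example_dio_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	if (!iov_iter_count(to))
		return 0;	/* nothing to do for an empty read */

	/* iomap_dio_rw() asserts that i_rwsem is held. */
	inode_lock_shared(inode);
	ret = iomap_dio_rw(iocb, to, &example_iomap_ops, NULL);
	inode_unlock_shared(inode);

	return ret;
}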