xfs_reflink.c 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330
  1. /*
  2. * Copyright (C) 2016 Oracle. All Rights Reserved.
  3. *
  4. * Author: Darrick J. Wong <darrick.wong@oracle.com>
  5. *
  6. * This program is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU General Public License
  8. * as published by the Free Software Foundation; either version 2
  9. * of the License, or (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it would be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with this program; if not, write the Free Software Foundation,
  18. * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
  19. */
  20. #include "xfs.h"
  21. #include "xfs_fs.h"
  22. #include "xfs_shared.h"
  23. #include "xfs_format.h"
  24. #include "xfs_log_format.h"
  25. #include "xfs_trans_resv.h"
  26. #include "xfs_mount.h"
  27. #include "xfs_defer.h"
  28. #include "xfs_da_format.h"
  29. #include "xfs_da_btree.h"
  30. #include "xfs_inode.h"
  31. #include "xfs_trans.h"
  32. #include "xfs_inode_item.h"
  33. #include "xfs_bmap.h"
  34. #include "xfs_bmap_util.h"
  35. #include "xfs_error.h"
  36. #include "xfs_dir2.h"
  37. #include "xfs_dir2_priv.h"
  38. #include "xfs_ioctl.h"
  39. #include "xfs_trace.h"
  40. #include "xfs_log.h"
  41. #include "xfs_icache.h"
  42. #include "xfs_pnfs.h"
  43. #include "xfs_refcount_btree.h"
  44. #include "xfs_refcount.h"
  45. #include "xfs_bmap_btree.h"
  46. #include "xfs_trans_space.h"
  47. #include "xfs_bit.h"
  48. #include "xfs_alloc.h"
  49. #include "xfs_quota_defs.h"
  50. #include "xfs_quota.h"
  51. #include "xfs_btree.h"
  52. #include "xfs_bmap_btree.h"
  53. #include "xfs_reflink.h"
  54. #include "xfs_iomap.h"
  55. /*
  56. * Copy on Write of Shared Blocks
  57. *
  58. * XFS must preserve "the usual" file semantics even when two files share
  59. * the same physical blocks. This means that a write to one file must not
  60. * alter the blocks in a different file; the way that we'll do that is
  61. * through the use of a copy-on-write mechanism. At a high level, that
  62. * means that when we want to write to a shared block, we allocate a new
  63. * block, write the data to the new block, and if that succeeds we map the
  64. * new block into the file.
  65. *
  66. * XFS provides a "delayed allocation" mechanism that defers the allocation
  67. * of disk blocks to dirty-but-not-yet-mapped file blocks as long as
  68. * possible. This reduces fragmentation by enabling the filesystem to ask
  69. * for bigger chunks less often, which is exactly what we want for CoW.
  70. *
  71. * The delalloc mechanism begins when the kernel wants to make a block
  72. * writable (write_begin or page_mkwrite). If the offset is not mapped, we
  73. * create a delalloc mapping, which is a regular in-core extent, but without
  74. * a real startblock. (For delalloc mappings, the startblock encodes both
  75. * a flag that this is a delalloc mapping, and a worst-case estimate of how
  76. * many blocks might be required to put the mapping into the BMBT.) delalloc
  77. * mappings are a reservation against the free space in the filesystem;
  78. * adjacent mappings can also be combined into fewer larger mappings.
  79. *
  80. * When dirty pages are being written out (typically in writepage), the
  81. * delalloc reservations are converted into real mappings by allocating
  82. * blocks and replacing the delalloc mapping with real ones. A delalloc
  83. * mapping can be replaced by several real ones if the free space is
  84. * fragmented.
  85. *
  86. * We want to adapt the delalloc mechanism for copy-on-write, since the
  87. * write paths are similar. The first two steps (creating the reservation
  88. * and allocating the blocks) are exactly the same as delalloc except that
  89. * the mappings must be stored in a separate CoW fork because we do not want
  90. * to disturb the mapping in the data fork until we're sure that the write
  91. * succeeded. IO completion in this case is the process of removing the old
  92. * mapping from the data fork and moving the new mapping from the CoW fork to
  93. * the data fork. This will be discussed shortly.
  94. *
  95. * For now, unaligned directio writes will be bounced back to the page cache.
  96. * Block-aligned directio writes will use the same mechanism as buffered
  97. * writes.
  98. *
  99. * CoW remapping must be done after the data block write completes,
  100. * because we don't want to destroy the old data fork map until we're sure
  101. * the new block has been written. Since the new mappings are kept in a
  102. * separate fork, we can simply iterate these mappings to find the ones
  103. * that cover the file blocks that we just CoW'd. For each extent, simply
  104. * unmap the corresponding range in the data fork, map the new range into
  105. * the data fork, and remove the extent from the CoW fork.
  106. *
  107. * Since the remapping operation can be applied to an arbitrary file
  108. * range, we record the need for the remap step as a flag in the ioend
  109. * instead of declaring a new IO type. This is required for direct io
  110. * because we only have ioend for the whole dio, and we have to be able to
  111. * remember the presence of unwritten blocks and CoW blocks with a single
  112. * ioend structure. Better yet, the more ground we can cover with one
  113. * ioend, the better.
  114. */
  115. /*
  116. * Given an AG extent, find the lowest-numbered run of shared blocks
  117. * within that range and return the range in fbno/flen. If
  118. * find_end_of_shared is true, return the longest contiguous extent of
  119. * shared blocks. If there are no shared extents, fbno and flen will
  120. * be set to NULLAGBLOCK and 0, respectively.
  121. */
  122. int
  123. xfs_reflink_find_shared(
  124. struct xfs_mount *mp,
  125. xfs_agnumber_t agno,
  126. xfs_agblock_t agbno,
  127. xfs_extlen_t aglen,
  128. xfs_agblock_t *fbno,
  129. xfs_extlen_t *flen,
  130. bool find_end_of_shared)
  131. {
  132. struct xfs_buf *agbp;
  133. struct xfs_btree_cur *cur;
  134. int error;
  135. error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
  136. if (error)
  137. return error;
  138. cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL);
  139. error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
  140. find_end_of_shared);
  141. xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
  142. xfs_buf_relse(agbp);
  143. return error;
  144. }
  145. /*
  146. * Trim the mapping to the next block where there's a change in the
  147. * shared/unshared status. More specifically, this means that we
  148. * find the lowest-numbered extent of shared blocks that coincides with
  149. * the given block mapping. If the shared extent overlaps the start of
  150. * the mapping, trim the mapping to the end of the shared extent. If
  151. * the shared region intersects the mapping, trim the mapping to the
  152. * start of the shared extent. If there are no shared regions that
  153. * overlap, just return the original extent.
  154. */
  155. int
  156. xfs_reflink_trim_around_shared(
  157. struct xfs_inode *ip,
  158. struct xfs_bmbt_irec *irec,
  159. bool *shared,
  160. bool *trimmed)
  161. {
  162. xfs_agnumber_t agno;
  163. xfs_agblock_t agbno;
  164. xfs_extlen_t aglen;
  165. xfs_agblock_t fbno;
  166. xfs_extlen_t flen;
  167. int error = 0;
  168. /* Holes, unwritten, and delalloc extents cannot be shared */
  169. if (!xfs_is_reflink_inode(ip) ||
  170. ISUNWRITTEN(irec) ||
  171. irec->br_startblock == HOLESTARTBLOCK ||
  172. irec->br_startblock == DELAYSTARTBLOCK) {
  173. *shared = false;
  174. return 0;
  175. }
  176. trace_xfs_reflink_trim_around_shared(ip, irec);
  177. agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock);
  178. agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock);
  179. aglen = irec->br_blockcount;
  180. error = xfs_reflink_find_shared(ip->i_mount, agno, agbno,
  181. aglen, &fbno, &flen, true);
  182. if (error)
  183. return error;
  184. *shared = *trimmed = false;
  185. if (fbno == NULLAGBLOCK) {
  186. /* No shared blocks at all. */
  187. return 0;
  188. } else if (fbno == agbno) {
  189. /*
  190. * The start of this extent is shared. Truncate the
  191. * mapping at the end of the shared region so that a
  192. * subsequent iteration starts at the start of the
  193. * unshared region.
  194. */
  195. irec->br_blockcount = flen;
  196. *shared = true;
  197. if (flen != aglen)
  198. *trimmed = true;
  199. return 0;
  200. } else {
  201. /*
  202. * There's a shared extent midway through this extent.
  203. * Truncate the mapping at the start of the shared
  204. * extent so that a subsequent iteration starts at the
  205. * start of the shared region.
  206. */
  207. irec->br_blockcount = fbno - agbno;
  208. *trimmed = true;
  209. return 0;
  210. }
  211. }
  212. /* Create a CoW reservation for a range of blocks within a file. */
  213. static int
  214. __xfs_reflink_reserve_cow(
  215. struct xfs_inode *ip,
  216. xfs_fileoff_t *offset_fsb,
  217. xfs_fileoff_t end_fsb)
  218. {
  219. struct xfs_bmbt_irec got, prev, imap;
  220. xfs_fileoff_t orig_end_fsb;
  221. int nimaps, eof = 0, error = 0;
  222. bool shared = false, trimmed = false;
  223. xfs_extnum_t idx;
  224. /* Already reserved? Skip the refcount btree access. */
  225. xfs_bmap_search_extents(ip, *offset_fsb, XFS_COW_FORK, &eof, &idx,
  226. &got, &prev);
  227. if (!eof && got.br_startoff <= *offset_fsb) {
  228. end_fsb = orig_end_fsb = got.br_startoff + got.br_blockcount;
  229. trace_xfs_reflink_cow_found(ip, &got);
  230. goto done;
  231. }
  232. /* Read extent from the source file. */
  233. nimaps = 1;
  234. error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb,
  235. &imap, &nimaps, 0);
  236. if (error)
  237. goto out_unlock;
  238. ASSERT(nimaps == 1);
  239. /* Trim the mapping to the nearest shared extent boundary. */
  240. error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed);
  241. if (error)
  242. goto out_unlock;
  243. end_fsb = orig_end_fsb = imap.br_startoff + imap.br_blockcount;
  244. /* Not shared? Just report the (potentially capped) extent. */
  245. if (!shared)
  246. goto done;
  247. /*
  248. * Fork all the shared blocks from our write offset until the end of
  249. * the extent.
  250. */
  251. error = xfs_qm_dqattach_locked(ip, 0);
  252. if (error)
  253. goto out_unlock;
  254. retry:
  255. error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, *offset_fsb,
  256. end_fsb - *offset_fsb, &got,
  257. &prev, &idx, eof);
  258. switch (error) {
  259. case 0:
  260. break;
  261. case -ENOSPC:
  262. case -EDQUOT:
  263. /* retry without any preallocation */
  264. trace_xfs_reflink_cow_enospc(ip, &imap);
  265. if (end_fsb != orig_end_fsb) {
  266. end_fsb = orig_end_fsb;
  267. goto retry;
  268. }
  269. /*FALLTHRU*/
  270. default:
  271. goto out_unlock;
  272. }
  273. trace_xfs_reflink_cow_alloc(ip, &got);
  274. done:
  275. *offset_fsb = end_fsb;
  276. out_unlock:
  277. return error;
  278. }
  279. /* Create a CoW reservation for part of a file. */
  280. int
  281. xfs_reflink_reserve_cow_range(
  282. struct xfs_inode *ip,
  283. xfs_off_t offset,
  284. xfs_off_t count)
  285. {
  286. struct xfs_mount *mp = ip->i_mount;
  287. xfs_fileoff_t offset_fsb, end_fsb;
  288. int error;
  289. trace_xfs_reflink_reserve_cow_range(ip, offset, count);
  290. offset_fsb = XFS_B_TO_FSBT(mp, offset);
  291. end_fsb = XFS_B_TO_FSB(mp, offset + count);
  292. xfs_ilock(ip, XFS_ILOCK_EXCL);
  293. while (offset_fsb < end_fsb) {
  294. error = __xfs_reflink_reserve_cow(ip, &offset_fsb, end_fsb);
  295. if (error) {
  296. trace_xfs_reflink_reserve_cow_range_error(ip, error,
  297. _RET_IP_);
  298. break;
  299. }
  300. }
  301. xfs_iunlock(ip, XFS_ILOCK_EXCL);
  302. return error;
  303. }