/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
  18. #include <linux/iomap.h>
  19. #include "xfs.h"
  20. #include "xfs_fs.h"
  21. #include "xfs_shared.h"
  22. #include "xfs_format.h"
  23. #include "xfs_log_format.h"
  24. #include "xfs_trans_resv.h"
  25. #include "xfs_mount.h"
  26. #include "xfs_defer.h"
  27. #include "xfs_inode.h"
  28. #include "xfs_btree.h"
  29. #include "xfs_bmap_btree.h"
  30. #include "xfs_bmap.h"
  31. #include "xfs_bmap_util.h"
  32. #include "xfs_error.h"
  33. #include "xfs_trans.h"
  34. #include "xfs_trans_space.h"
  35. #include "xfs_iomap.h"
  36. #include "xfs_trace.h"
  37. #include "xfs_icache.h"
  38. #include "xfs_quota.h"
  39. #include "xfs_dquot_item.h"
  40. #include "xfs_dquot.h"
/*
 * Round a byte offset down to the filesystem's write I/O alignment.
 * mp->m_writeio_log is the log2 of the write I/O size in bytes.
 */
#define XFS_WRITEIO_ALIGN(mp,off)	(((off) >> mp->m_writeio_log) \
						<< mp->m_writeio_log)
/* Maximum number of mappings requested per xfs_bmapi_* call below. */
#define XFS_WRITE_IMAPS		XFS_BMAP_MAX_NMAP
  44. STATIC int
  45. xfs_iomap_eof_align_last_fsb(
  46. xfs_mount_t *mp,
  47. xfs_inode_t *ip,
  48. xfs_extlen_t extsize,
  49. xfs_fileoff_t *last_fsb)
  50. {
  51. xfs_extlen_t align = 0;
  52. int eof, error;
  53. if (!XFS_IS_REALTIME_INODE(ip)) {
  54. /*
  55. * Round up the allocation request to a stripe unit
  56. * (m_dalign) boundary if the file size is >= stripe unit
  57. * size, and we are allocating past the allocation eof.
  58. *
  59. * If mounted with the "-o swalloc" option the alignment is
  60. * increased from the strip unit size to the stripe width.
  61. */
  62. if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
  63. align = mp->m_swidth;
  64. else if (mp->m_dalign)
  65. align = mp->m_dalign;
  66. if (align && XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, align))
  67. align = 0;
  68. }
  69. /*
  70. * Always round up the allocation request to an extent boundary
  71. * (when file on a real-time subvolume or has di_extsize hint).
  72. */
  73. if (extsize) {
  74. if (align)
  75. align = roundup_64(align, extsize);
  76. else
  77. align = extsize;
  78. }
  79. if (align) {
  80. xfs_fileoff_t new_last_fsb = roundup_64(*last_fsb, align);
  81. error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);
  82. if (error)
  83. return error;
  84. if (eof)
  85. *last_fsb = new_last_fsb;
  86. }
  87. return 0;
  88. }
  89. STATIC int
  90. xfs_alert_fsblock_zero(
  91. xfs_inode_t *ip,
  92. xfs_bmbt_irec_t *imap)
  93. {
  94. xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
  95. "Access to block zero in inode %llu "
  96. "start_block: %llx start_off: %llx "
  97. "blkcnt: %llx extent-state: %x",
  98. (unsigned long long)ip->i_ino,
  99. (unsigned long long)imap->br_startblock,
  100. (unsigned long long)imap->br_startoff,
  101. (unsigned long long)imap->br_blockcount,
  102. imap->br_state);
  103. return -EFSCORRUPTED;
  104. }
/*
 * Allocate real blocks for a direct I/O (or DAX) write of @count bytes at
 * @offset, replacing any hole or delalloc mapping with a real extent.
 *
 * Entered with the ilock held shared (taken by the caller); the lock is
 * dropped to attach dquots and re-acquired exclusive around the allocation
 * transaction.  The ilock is held (exclusive) on return.  On success *imap
 * is overwritten with the extent that was allocated.  Returns 0 or a
 * negative errno.
 */
int
xfs_iomap_write_direct(
	xfs_inode_t	*ip,
	xfs_off_t	offset,
	size_t		count,
	xfs_bmbt_irec_t	*imap,
	int		nmaps)
{
	xfs_mount_t	*mp = ip->i_mount;
	xfs_fileoff_t	offset_fsb;
	xfs_fileoff_t	last_fsb;
	xfs_filblks_t	count_fsb, resaligned;
	xfs_fsblock_t	firstfsb;
	xfs_extlen_t	extsz, temp;
	int		nimaps;
	int		quota_flag;
	int		rt;
	xfs_trans_t	*tp;
	struct xfs_defer_ops dfops;
	uint		qblocks, resblks, resrtextents;
	int		error;
	int		lockmode;
	int		bmapi_flags = XFS_BMAPI_PREALLOC;
	uint		tflags = 0;

	rt = XFS_IS_REALTIME_INODE(ip);
	extsz = xfs_get_extsz_hint(ip);
	lockmode = XFS_ILOCK_SHARED;	/* locked by caller */

	ASSERT(xfs_isilocked(ip, lockmode));

	offset_fsb = XFS_B_TO_FSBT(mp, offset);
	last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
	if ((offset + count) > XFS_ISIZE(ip)) {
		/*
		 * Assert that the in-core extent list is present since this can
		 * call xfs_iread_extents() and we only have the ilock shared.
		 * This should be safe because the lock was held around a bmapi
		 * call in the caller and we only need it to access the in-core
		 * list.
		 */
		ASSERT(XFS_IFORK_PTR(ip, XFS_DATA_FORK)->if_flags &
								XFS_IFEXTENTS);
		/* Extending writes may get their end rounded up for alignment. */
		error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
		if (error)
			goto out_unlock;
	} else {
		/* Don't allocate past the end of a hole the caller mapped. */
		if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
			last_fsb = MIN(last_fsb, (xfs_fileoff_t)
					imap->br_blockcount +
					imap->br_startoff);
	}
	count_fsb = last_fsb - offset_fsb;
	ASSERT(count_fsb > 0);

	/* Round the space reservation out to the extent size hint, if any. */
	resaligned = count_fsb;
	if (unlikely(extsz)) {
		if ((temp = do_mod(offset_fsb, extsz)))
			resaligned += temp;
		if ((temp = do_mod(resaligned, extsz)))
			resaligned += extsz - temp;
	}

	/* Realtime files reserve rt extents; data files reserve fs blocks. */
	if (unlikely(rt)) {
		resrtextents = qblocks = resaligned;
		resrtextents /= mp->m_sb.sb_rextsize;
		resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
		quota_flag = XFS_QMOPT_RES_RTBLKS;
	} else {
		resrtextents = 0;
		resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
		quota_flag = XFS_QMOPT_RES_REGBLKS;
	}

	/*
	 * Drop the shared lock acquired by the caller, attach the dquot if
	 * necessary and move on to transaction setup.
	 */
	xfs_iunlock(ip, lockmode);
	error = xfs_qm_dqattach(ip, 0);
	if (error)
		return error;

	/*
	 * For DAX, we do not allocate unwritten extents, but instead we zero
	 * the block before we commit the transaction. Ideally we'd like to do
	 * this outside the transaction context, but if we commit and then crash
	 * we may not have zeroed the blocks and this will be exposed on
	 * recovery of the allocation. Hence we must zero before commit.
	 *
	 * Further, if we are mapping unwritten extents here, we need to zero
	 * and convert them to written so that we don't need an unwritten extent
	 * callback for DAX. This also means that we need to be able to dip into
	 * the reserve block pool for bmbt block allocation if there is no space
	 * left but we need to do unwritten extent conversion.
	 */
	if (IS_DAX(VFS_I(ip))) {
		bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
		if (ISUNWRITTEN(imap)) {
			tflags |= XFS_TRANS_RESERVE;
			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
		}
	}
	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, resrtextents,
			tflags, &tp);
	if (error)
		return error;

	/* Re-take the ilock exclusive for the allocation itself. */
	lockmode = XFS_ILOCK_EXCL;
	xfs_ilock(ip, lockmode);

	error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
	if (error)
		goto out_trans_cancel;

	xfs_trans_ijoin(tp, ip, 0);

	/*
	 * From this point onwards we overwrite the imap pointer that the
	 * caller gave to us.
	 */
	xfs_defer_init(&dfops, &firstfsb);
	nimaps = 1;
	error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
				bmapi_flags, &firstfsb, resblks, imap,
				&nimaps, &dfops);
	if (error)
		goto out_bmap_cancel;

	/*
	 * Complete the transaction
	 */
	error = xfs_defer_finish(&tp, &dfops, NULL);
	if (error)
		goto out_bmap_cancel;

	error = xfs_trans_commit(tp);
	if (error)
		goto out_unlock;

	/*
	 * Copy any maps to caller's array and return any error.
	 */
	if (nimaps == 0) {
		error = -ENOSPC;
		goto out_unlock;
	}

	/* Block zero is never valid for a data-device mapping. */
	if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
		error = xfs_alert_fsblock_zero(ip, imap);

out_unlock:
	xfs_iunlock(ip, lockmode);
	return error;

out_bmap_cancel:
	xfs_defer_cancel(&dfops);
	/* Undo the quota reservation taken above before cancelling. */
	xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
out_trans_cancel:
	xfs_trans_cancel(tp);
	goto out_unlock;
}
/*
 * If the caller is doing a write at the end of the file, then extend the
 * allocation out to the file system's write iosize. We clean up any extra
 * space left over when the file is closed in xfs_inactive().
 *
 * If we find we already have delalloc preallocation beyond EOF, don't do more
 * preallocation as it is not needed.
 */
  258. STATIC int
  259. xfs_iomap_eof_want_preallocate(
  260. xfs_mount_t *mp,
  261. xfs_inode_t *ip,
  262. xfs_off_t offset,
  263. size_t count,
  264. xfs_bmbt_irec_t *imap,
  265. int nimaps,
  266. int *prealloc)
  267. {
  268. xfs_fileoff_t start_fsb;
  269. xfs_filblks_t count_fsb;
  270. int n, error, imaps;
  271. int found_delalloc = 0;
  272. *prealloc = 0;
  273. if (offset + count <= XFS_ISIZE(ip))
  274. return 0;
  275. /*
  276. * If the file is smaller than the minimum prealloc and we are using
  277. * dynamic preallocation, don't do any preallocation at all as it is
  278. * likely this is the only write to the file that is going to be done.
  279. */
  280. if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) &&
  281. XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks))
  282. return 0;
  283. /*
  284. * If there are any real blocks past eof, then don't
  285. * do any speculative allocation.
  286. */
  287. start_fsb = XFS_B_TO_FSBT(mp, ((xfs_ufsize_t)(offset + count - 1)));
  288. count_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
  289. while (count_fsb > 0) {
  290. imaps = nimaps;
  291. error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps,
  292. 0);
  293. if (error)
  294. return error;
  295. for (n = 0; n < imaps; n++) {
  296. if ((imap[n].br_startblock != HOLESTARTBLOCK) &&
  297. (imap[n].br_startblock != DELAYSTARTBLOCK))
  298. return 0;
  299. start_fsb += imap[n].br_blockcount;
  300. count_fsb -= imap[n].br_blockcount;
  301. if (imap[n].br_startblock == DELAYSTARTBLOCK)
  302. found_delalloc = 1;
  303. }
  304. }
  305. if (!found_delalloc)
  306. *prealloc = 1;
  307. return 0;
  308. }
/*
 * Determine the initial size of the preallocation. We are beyond the current
 * EOF here, but we need to take into account whether this is a sparse write or
 * an extending write when determining the preallocation size. Hence we need to
 * look up the extent that ends at the current write offset and use the result
 * to determine the preallocation size.
 *
 * If the extent is a hole, then preallocation is essentially disabled.
 * Otherwise we take the size of the preceding data extent as the basis for the
 * preallocation size. If the size of the extent is greater than half the
 * maximum extent length, then use the current offset as the basis. This ensures
 * that for large files the preallocation size always extends to MAXEXTLEN
 * rather than falling short due to things like stripe unit/width alignment of
 * real extents.
 */
  324. STATIC xfs_fsblock_t
  325. xfs_iomap_eof_prealloc_initial_size(
  326. struct xfs_mount *mp,
  327. struct xfs_inode *ip,
  328. xfs_off_t offset,
  329. xfs_bmbt_irec_t *imap,
  330. int nimaps)
  331. {
  332. xfs_fileoff_t start_fsb;
  333. int imaps = 1;
  334. int error;
  335. ASSERT(nimaps >= imaps);
  336. /* if we are using a specific prealloc size, return now */
  337. if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
  338. return 0;
  339. /* If the file is small, then use the minimum prealloc */
  340. if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign))
  341. return 0;
  342. /*
  343. * As we write multiple pages, the offset will always align to the
  344. * start of a page and hence point to a hole at EOF. i.e. if the size is
  345. * 4096 bytes, we only have one block at FSB 0, but XFS_B_TO_FSB(4096)
  346. * will return FSB 1. Hence if there are blocks in the file, we want to
  347. * point to the block prior to the EOF block and not the hole that maps
  348. * directly at @offset.
  349. */
  350. start_fsb = XFS_B_TO_FSB(mp, offset);
  351. if (start_fsb)
  352. start_fsb--;
  353. error = xfs_bmapi_read(ip, start_fsb, 1, imap, &imaps, XFS_BMAPI_ENTIRE);
  354. if (error)
  355. return 0;
  356. ASSERT(imaps == 1);
  357. if (imap[0].br_startblock == HOLESTARTBLOCK)
  358. return 0;
  359. if (imap[0].br_blockcount <= (MAXEXTLEN >> 1))
  360. return imap[0].br_blockcount << 1;
  361. return XFS_B_TO_FSB(mp, offset);
  362. }
  363. STATIC bool
  364. xfs_quota_need_throttle(
  365. struct xfs_inode *ip,
  366. int type,
  367. xfs_fsblock_t alloc_blocks)
  368. {
  369. struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
  370. if (!dq || !xfs_this_quota_on(ip->i_mount, type))
  371. return false;
  372. /* no hi watermark, no throttle */
  373. if (!dq->q_prealloc_hi_wmark)
  374. return false;
  375. /* under the lo watermark, no throttle */
  376. if (dq->q_res_bcount + alloc_blocks < dq->q_prealloc_lo_wmark)
  377. return false;
  378. return true;
  379. }
  380. STATIC void
  381. xfs_quota_calc_throttle(
  382. struct xfs_inode *ip,
  383. int type,
  384. xfs_fsblock_t *qblocks,
  385. int *qshift,
  386. int64_t *qfreesp)
  387. {
  388. int64_t freesp;
  389. int shift = 0;
  390. struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
  391. /* no dq, or over hi wmark, squash the prealloc completely */
  392. if (!dq || dq->q_res_bcount >= dq->q_prealloc_hi_wmark) {
  393. *qblocks = 0;
  394. *qfreesp = 0;
  395. return;
  396. }
  397. freesp = dq->q_prealloc_hi_wmark - dq->q_res_bcount;
  398. if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) {
  399. shift = 2;
  400. if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT])
  401. shift += 2;
  402. if (freesp < dq->q_low_space[XFS_QLOWSP_1_PCNT])
  403. shift += 2;
  404. }
  405. if (freesp < *qfreesp)
  406. *qfreesp = freesp;
  407. /* only overwrite the throttle values if we are more aggressive */
  408. if ((freesp >> shift) < (*qblocks >> *qshift)) {
  409. *qblocks = freesp;
  410. *qshift = shift;
  411. }
  412. }
/*
 * If we don't have a user specified preallocation size, dynamically increase
 * the preallocation size as the size of the file grows. Cap the maximum size
 * at a single extent or less if the filesystem is near full. The closer the
 * filesystem is to full, the smaller the maximum preallocation.
 */
/*
 * Compute the speculative EOF preallocation size in filesystem blocks for
 * a delayed allocation write, throttling it against both global free space
 * and each applicable quota.  Never returns less than
 * mp->m_writeio_blocks.
 */
STATIC xfs_fsblock_t
xfs_iomap_prealloc_size(
	struct xfs_mount	*mp,
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	struct xfs_bmbt_irec	*imap,
	int			nimaps)
{
	xfs_fsblock_t		alloc_blocks = 0;
	int			shift = 0;
	int64_t			freesp;
	xfs_fsblock_t		qblocks;
	int			qshift = 0;

	/* Base size from the extent preceding EOF; 0 disables prealloc. */
	alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset,
							   imap, nimaps);
	if (!alloc_blocks)
		goto check_writeio;
	qblocks = alloc_blocks;

	/*
	 * MAXEXTLEN is not a power of two value but we round the prealloc down
	 * to the nearest power of two value after throttling. To prevent the
	 * round down from unconditionally reducing the maximum supported prealloc
	 * size, we round up first, apply appropriate throttling, round down and
	 * cap the value to MAXEXTLEN.
	 */
	alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN),
				       alloc_blocks);

	/* Scale the throttle shift with how close the fs is to full. */
	freesp = percpu_counter_read_positive(&mp->m_fdblocks);
	if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
		shift = 2;
		if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
			shift++;
		if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
			shift++;
		if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
			shift++;
		if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
			shift++;
	}

	/*
	 * Check each quota to cap the prealloc size, provide a shift value to
	 * throttle with and adjust amount of available space.
	 */
	if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks))
		xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift,
					&freesp);
	if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks))
		xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift,
					&freesp);
	if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks))
		xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift,
					&freesp);

	/*
	 * The final prealloc size is set to the minimum of free space available
	 * in each of the quotas and the overall filesystem.
	 *
	 * The shift throttle value is set to the maximum value as determined by
	 * the global low free space values and per-quota low free space values.
	 */
	alloc_blocks = MIN(alloc_blocks, qblocks);
	shift = MAX(shift, qshift);

	if (shift)
		alloc_blocks >>= shift;
	/*
	 * rounddown_pow_of_two() returns an undefined result if we pass in
	 * alloc_blocks = 0.
	 */
	if (alloc_blocks)
		alloc_blocks = rounddown_pow_of_two(alloc_blocks);
	if (alloc_blocks > MAXEXTLEN)
		alloc_blocks = MAXEXTLEN;

	/*
	 * If we are still trying to allocate more space than is
	 * available, squash the prealloc hard. This can happen if we
	 * have a large file on a small filesystem and the above
	 * lowspace thresholds are smaller than MAXEXTLEN.
	 */
	while (alloc_blocks && alloc_blocks >= freesp)
		alloc_blocks >>= 4;

check_writeio:
	/* Never preallocate less than the configured write I/O size. */
	if (alloc_blocks < mp->m_writeio_blocks)
		alloc_blocks = mp->m_writeio_blocks;

	trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift,
				      mp->m_writeio_blocks);
	return alloc_blocks;
}
/*
 * Reserve delayed-allocation blocks for a buffered write of @count bytes
 * at @offset, including speculative EOF preallocation when appropriate.
 * If the reservation fails with ENOSPC/EDQUOT, retry once without the
 * EOF preallocation.  On success *ret_imap describes the (delalloc)
 * extent covering the start of the write.  Called and returns with the
 * ilock held exclusive.  Returns 0 or a negative errno.
 */
int
xfs_iomap_write_delay(
	xfs_inode_t	*ip,
	xfs_off_t	offset,
	size_t		count,
	xfs_bmbt_irec_t	*ret_imap)
{
	xfs_mount_t	*mp = ip->i_mount;
	xfs_fileoff_t	offset_fsb;
	xfs_fileoff_t	last_fsb;
	xfs_off_t	aligned_offset;
	xfs_fileoff_t	ioalign;
	xfs_extlen_t	extsz;
	int		nimaps;
	xfs_bmbt_irec_t	imap[XFS_WRITE_IMAPS];
	int		prealloc;
	int		error;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));

	/*
	 * Make sure that the dquots are there. This doesn't hold
	 * the ilock across a disk read.
	 */
	error = xfs_qm_dqattach_locked(ip, 0);
	if (error)
		return error;

	extsz = xfs_get_extsz_hint(ip);
	offset_fsb = XFS_B_TO_FSBT(mp, offset);

	error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
				imap, XFS_WRITE_IMAPS, &prealloc);
	if (error)
		return error;

retry:
	if (prealloc) {
		xfs_fsblock_t	alloc_blocks;

		/* Extend the request past EOF by the prealloc amount. */
		alloc_blocks = xfs_iomap_prealloc_size(mp, ip, offset, imap,
						       XFS_WRITE_IMAPS);

		aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
		ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
		last_fsb = ioalign + alloc_blocks;
	} else {
		last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
	}

	if (prealloc || extsz) {
		error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
		if (error)
			return error;
	}

	/*
	 * Make sure preallocation does not create extents beyond the range we
	 * actually support in this filesystem.
	 */
	if (last_fsb > XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes))
		last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);

	ASSERT(last_fsb > offset_fsb);

	nimaps = XFS_WRITE_IMAPS;
	error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb,
				imap, &nimaps, XFS_BMAPI_ENTIRE);
	/* ENOSPC/EDQUOT are handled below via the retry path. */
	switch (error) {
	case 0:
	case -ENOSPC:
	case -EDQUOT:
		break;
	default:
		return error;
	}

	/*
	 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
	 * without EOF preallocation.
	 */
	if (nimaps == 0) {
		trace_xfs_delalloc_enospc(ip, offset, count);
		if (prealloc) {
			prealloc = 0;
			error = 0;
			goto retry;
		}
		return error ? error : -ENOSPC;
	}

	/* Block zero is never valid for a data-device mapping. */
	if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
		return xfs_alert_fsblock_zero(ip, &imap[0]);

	/*
	 * Tag the inode as speculatively preallocated so we can reclaim this
	 * space on demand, if necessary.
	 */
	if (prealloc)
		xfs_inode_set_eofblocks_tag(ip);

	*ret_imap = imap[0];
	return 0;
}
  594. /*
  595. * Pass in a delayed allocate extent, convert it to real extents;
  596. * return to the caller the extent we create which maps on top of
  597. * the originating callers request.
  598. *
  599. * Called without a lock on the inode.
  600. *
  601. * We no longer bother to look at the incoming map - all we have to
  602. * guarantee is that whatever we allocate fills the required range.
  603. */
/*
 * Convert the delalloc extent described by *imap into real allocations,
 * looping until the allocation covers @offset.  On success *imap is
 * overwritten with the extent containing @offset.  The ilock is taken and
 * dropped around each allocation transaction.  Returns 0 or a negative
 * errno (-EAGAIN when a racing truncate removed the blocks).
 */
int
xfs_iomap_write_allocate(
	xfs_inode_t	*ip,
	xfs_off_t	offset,
	xfs_bmbt_irec_t	*imap)
{
	xfs_mount_t	*mp = ip->i_mount;
	xfs_fileoff_t	offset_fsb, last_block;
	xfs_fileoff_t	end_fsb, map_start_fsb;
	xfs_fsblock_t	first_block;
	struct xfs_defer_ops	dfops;
	xfs_filblks_t	count_fsb;
	xfs_trans_t	*tp;
	int		nimaps;
	int		error = 0;
	int		nres;

	/*
	 * Make sure that the dquots are there.
	 */
	error = xfs_qm_dqattach(ip, 0);
	if (error)
		return error;

	offset_fsb = XFS_B_TO_FSBT(mp, offset);
	count_fsb = imap->br_blockcount;
	map_start_fsb = imap->br_startoff;

	XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));

	while (count_fsb != 0) {
		/*
		 * Set up a transaction with which to allocate the
		 * backing store for the file. Do allocations in a
		 * loop until we get some space in the range we are
		 * interested in. The other space that might be allocated
		 * is in the delayed allocation extent on which we sit
		 * but before our buffer starts.
		 */
		nimaps = 0;
		while (nimaps == 0) {
			nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
			/*
			 * We have already reserved space for the extent and any
			 * indirect blocks when creating the delalloc extent,
			 * there is no need to reserve space in this transaction
			 * again.
			 */
			error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0,
					0, XFS_TRANS_RESERVE, &tp);
			if (error)
				return error;

			xfs_ilock(ip, XFS_ILOCK_EXCL);
			xfs_trans_ijoin(tp, ip, 0);

			xfs_defer_init(&dfops, &first_block);

			/*
			 * it is possible that the extents have changed since
			 * we did the read call as we dropped the ilock for a
			 * while. We have to be careful about truncates or hole
			 * punches here - we are not allowed to allocate
			 * non-delalloc blocks here.
			 *
			 * The only protection against truncation is the pages
			 * for the range we are being asked to convert are
			 * locked and hence a truncate will block on them
			 * first.
			 *
			 * As a result, if we go beyond the range we really
			 * need and hit a delalloc extent boundary followed by
			 * a hole while we have excess blocks in the map, we
			 * will fill the hole incorrectly and overrun the
			 * transaction reservation.
			 *
			 * Using a single map prevents this as we are forced to
			 * check each map we look for overlap with the desired
			 * range and abort as soon as we find it. Also, given
			 * that we only return a single map, having one beyond
			 * what we can return is probably a bit silly.
			 *
			 * We also need to check that we don't go beyond EOF;
			 * this is a truncate optimisation as a truncate sets
			 * the new file size before blocking on the pages we
			 * currently have locked under writeback. Because they
			 * are about to be tossed, we don't need to write them
			 * back....
			 */
			nimaps = 1;
			end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
			error = xfs_bmap_last_offset(ip, &last_block,
							XFS_DATA_FORK);
			if (error)
				goto trans_cancel;

			last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
			if ((map_start_fsb + count_fsb) > last_block) {
				count_fsb = last_block - map_start_fsb;
				if (count_fsb == 0) {
					/* blocks gone - racing truncate */
					error = -EAGAIN;
					goto trans_cancel;
				}
			}

			/*
			 * From this point onwards we overwrite the imap
			 * pointer that the caller gave to us.
			 */
			error = xfs_bmapi_write(tp, ip, map_start_fsb,
						count_fsb, 0, &first_block,
						nres, imap, &nimaps,
						&dfops);
			if (error)
				goto trans_cancel;

			error = xfs_defer_finish(&tp, &dfops, NULL);
			if (error)
				goto trans_cancel;

			error = xfs_trans_commit(tp);
			if (error)
				goto error0;

			xfs_iunlock(ip, XFS_ILOCK_EXCL);
		}

		/*
		 * See if we were able to allocate an extent that
		 * covers at least part of the callers request
		 */
		if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
			return xfs_alert_fsblock_zero(ip, imap);

		if ((offset_fsb >= imap->br_startoff) &&
		    (offset_fsb < (imap->br_startoff +
				   imap->br_blockcount))) {
			XFS_STATS_INC(mp, xs_xstrat_quick);
			return 0;
		}

		/*
		 * So far we have not mapped the requested part of the
		 * file, just surrounding data, try again.
		 */
		count_fsb -= imap->br_blockcount;
		map_start_fsb = imap->br_startoff + imap->br_blockcount;
	}

trans_cancel:
	xfs_defer_cancel(&dfops);
	xfs_trans_cancel(tp);
error0:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}
/*
 * Convert the unwritten extents backing [@offset, @offset + @count) to
 * written state, one mapping per transaction, logging the growing on-disk
 * inode size as conversions complete.  Typically called on I/O completion,
 * so the transactions must not recurse back into the filesystem
 * (XFS_TRANS_NOFS).  Returns 0 or a negative errno.
 */
int
xfs_iomap_write_unwritten(
	xfs_inode_t	*ip,
	xfs_off_t	offset,
	xfs_off_t	count)
{
	xfs_mount_t	*mp = ip->i_mount;
	xfs_fileoff_t	offset_fsb;
	xfs_filblks_t	count_fsb;
	xfs_filblks_t	numblks_fsb;
	xfs_fsblock_t	firstfsb;
	int		nimaps;
	xfs_trans_t	*tp;
	xfs_bmbt_irec_t	imap;
	struct xfs_defer_ops dfops;
	xfs_fsize_t	i_size;
	uint		resblks;
	int		error;

	trace_xfs_unwritten_convert(ip, offset, count);

	offset_fsb = XFS_B_TO_FSBT(mp, offset);
	count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
	count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb);

	/*
	 * Reserve enough blocks in this transaction for two complete extent
	 * btree splits. We may be converting the middle part of an unwritten
	 * extent and in this case we will insert two new extents in the btree
	 * each of which could cause a full split.
	 *
	 * This reservation amount will be used in the first call to
	 * xfs_bmbt_split() to select an AG with enough space to satisfy the
	 * rest of the operation.
	 */
	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;

	do {
		/*
		 * Set up a transaction to convert the range of extents
		 * from unwritten to real. Do allocations in a loop until
		 * we have covered the range passed in.
		 *
		 * Note that we can't risk to recursing back into the filesystem
		 * here as we might be asked to write out the same inode that we
		 * complete here and might deadlock on the iolock.
		 */
		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
				XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp);
		if (error)
			return error;

		xfs_ilock(ip, XFS_ILOCK_EXCL);
		xfs_trans_ijoin(tp, ip, 0);

		/*
		 * Modify the unwritten extent state of the buffer.
		 */
		xfs_defer_init(&dfops, &firstfsb);
		nimaps = 1;
		error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
					XFS_BMAPI_CONVERT, &firstfsb, resblks,
					&imap, &nimaps, &dfops);
		if (error)
			goto error_on_bmapi_transaction;

		/*
		 * Log the updated inode size as we go. We have to be careful
		 * to only log it up to the actual write offset if it is
		 * halfway into a block.
		 */
		i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb);
		if (i_size > offset + count)
			i_size = offset + count;

		i_size = xfs_new_eof(ip, i_size);
		if (i_size) {
			ip->i_d.di_size = i_size;
			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
		}

		error = xfs_defer_finish(&tp, &dfops, NULL);
		if (error)
			goto error_on_bmapi_transaction;

		error = xfs_trans_commit(tp);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		if (error)
			return error;

		/* Block zero is never valid for a data-device mapping. */
		if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
			return xfs_alert_fsblock_zero(ip, &imap);

		if ((numblks_fsb = imap.br_blockcount) == 0) {
			/*
			 * The numblks_fsb value should always get
			 * smaller, otherwise the loop is stuck.
			 */
			ASSERT(imap.br_blockcount);
			break;
		}
		offset_fsb += numblks_fsb;
		count_fsb -= numblks_fsb;
	} while (count_fsb > 0);

	return 0;

error_on_bmapi_transaction:
	xfs_defer_cancel(&dfops);
	xfs_trans_cancel(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}
  843. void
  844. xfs_bmbt_to_iomap(
  845. struct xfs_inode *ip,
  846. struct iomap *iomap,
  847. struct xfs_bmbt_irec *imap)
  848. {
  849. struct xfs_mount *mp = ip->i_mount;
  850. if (imap->br_startblock == HOLESTARTBLOCK) {
  851. iomap->blkno = IOMAP_NULL_BLOCK;
  852. iomap->type = IOMAP_HOLE;
  853. } else if (imap->br_startblock == DELAYSTARTBLOCK) {
  854. iomap->blkno = IOMAP_NULL_BLOCK;
  855. iomap->type = IOMAP_DELALLOC;
  856. } else {
  857. iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
  858. if (imap->br_state == XFS_EXT_UNWRITTEN)
  859. iomap->type = IOMAP_UNWRITTEN;
  860. else
  861. iomap->type = IOMAP_MAPPED;
  862. }
  863. iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
  864. iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
  865. iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
  866. }
  867. static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps)
  868. {
  869. return !nimaps ||
  870. imap->br_startblock == HOLESTARTBLOCK ||
  871. imap->br_startblock == DELAYSTARTBLOCK;
  872. }
/*
 * iomap_begin handler for file data: look up (and for writes, allocate)
 * the mapping covering [offset, offset + length) and report it in @iomap.
 *
 * Locking is subtle: we take ILOCK_EXCL here, but xfs_iomap_write_direct()
 * expects a demoted shared lock and drops it itself, while the delalloc and
 * read-only paths unlock explicitly.  Every path leaves the ILOCK released
 * on return.
 *
 * Returns 0 on success or a negative errno.
 */
static int
xfs_file_iomap_begin(
	struct inode		*inode,
	loff_t			offset,
	loff_t			length,
	unsigned		flags,	/* IOMAP_* flags from the caller */
	struct iomap		*iomap)	/* out: the resulting mapping */
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_bmbt_irec	imap;
	xfs_fileoff_t		offset_fsb, end_fsb;
	int			nimaps = 1, error = 0;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	xfs_ilock(ip, XFS_ILOCK_EXCL);

	/* Clamp the request so it never maps past s_maxbytes. */
	ASSERT(offset <= mp->m_super->s_maxbytes);
	if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
		length = mp->m_super->s_maxbytes - offset;
	offset_fsb = XFS_B_TO_FSBT(mp, offset);
	end_fsb = XFS_B_TO_FSB(mp, offset + length);

	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
			       &nimaps, XFS_BMAPI_ENTIRE);
	if (error) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		return error;
	}

	if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) {
		/*
		 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
		 * pages to keep the chunks of work done where somewhat symmetric
		 * with the work writeback does. This is a completely arbitrary
		 * number pulled out of thin air as a best guess for initial
		 * testing.
		 *
		 * Note that the values needs to be less than 32-bits wide until
		 * the lower level functions are updated.
		 */
		length = min_t(loff_t, length, 1024 * PAGE_SIZE);
		if (xfs_get_extsz_hint(ip)) {
			/*
			 * xfs_iomap_write_direct() expects the shared lock. It
			 * is unlocked on return.
			 */
			xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
			error = xfs_iomap_write_direct(ip, offset, length, &imap,
					nimaps);
		} else {
			error = xfs_iomap_write_delay(ip, offset, length, &imap);
			xfs_iunlock(ip, XFS_ILOCK_EXCL);
		}

		if (error)
			return error;

		trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
	} else {
		/* Found an existing mapping (or a hole on a read). */
		ASSERT(nimaps);

		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		trace_xfs_iomap_found(ip, offset, length, 0, &imap);
	}

	xfs_bmbt_to_iomap(ip, iomap, &imap);
	return 0;
}
  935. static int
  936. xfs_file_iomap_end_delalloc(
  937. struct xfs_inode *ip,
  938. loff_t offset,
  939. loff_t length,
  940. ssize_t written)
  941. {
  942. struct xfs_mount *mp = ip->i_mount;
  943. xfs_fileoff_t start_fsb;
  944. xfs_fileoff_t end_fsb;
  945. int error = 0;
  946. start_fsb = XFS_B_TO_FSB(mp, offset + written);
  947. end_fsb = XFS_B_TO_FSB(mp, offset + length);
  948. /*
  949. * Trim back delalloc blocks if we didn't manage to write the whole
  950. * range reserved.
  951. *
  952. * We don't need to care about racing delalloc as we hold i_mutex
  953. * across the reserve/allocate/unreserve calls. If there are delalloc
  954. * blocks in the range, they are ours.
  955. */
  956. if (start_fsb < end_fsb) {
  957. xfs_ilock(ip, XFS_ILOCK_EXCL);
  958. error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
  959. end_fsb - start_fsb);
  960. xfs_iunlock(ip, XFS_ILOCK_EXCL);
  961. if (error && !XFS_FORCED_SHUTDOWN(mp)) {
  962. xfs_alert(mp, "%s: unable to clean up ino %lld",
  963. __func__, ip->i_ino);
  964. return error;
  965. }
  966. }
  967. return 0;
  968. }
  969. static int
  970. xfs_file_iomap_end(
  971. struct inode *inode,
  972. loff_t offset,
  973. loff_t length,
  974. ssize_t written,
  975. unsigned flags,
  976. struct iomap *iomap)
  977. {
  978. if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
  979. return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
  980. length, written);
  981. return 0;
  982. }
/* iomap operations for file data, wired into the generic iomap code. */
struct iomap_ops xfs_iomap_ops = {
	.iomap_begin		= xfs_file_iomap_begin,
	.iomap_end		= xfs_file_iomap_end,
};
  987. static int
  988. xfs_xattr_iomap_begin(
  989. struct inode *inode,
  990. loff_t offset,
  991. loff_t length,
  992. unsigned flags,
  993. struct iomap *iomap)
  994. {
  995. struct xfs_inode *ip = XFS_I(inode);
  996. struct xfs_mount *mp = ip->i_mount;
  997. xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
  998. xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length);
  999. struct xfs_bmbt_irec imap;
  1000. int nimaps = 1, error = 0;
  1001. unsigned lockmode;
  1002. if (XFS_FORCED_SHUTDOWN(mp))
  1003. return -EIO;
  1004. lockmode = xfs_ilock_data_map_shared(ip);
  1005. /* if there are no attribute fork or extents, return ENOENT */
  1006. if (XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) {
  1007. error = -ENOENT;
  1008. goto out_unlock;
  1009. }
  1010. ASSERT(ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL);
  1011. error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
  1012. &nimaps, XFS_BMAPI_ENTIRE | XFS_BMAPI_ATTRFORK);
  1013. out_unlock:
  1014. xfs_iunlock(ip, lockmode);
  1015. if (!error) {
  1016. ASSERT(nimaps);
  1017. xfs_bmbt_to_iomap(ip, iomap, &imap);
  1018. }
  1019. return error;
  1020. }
/* iomap operations for the attribute fork (lookup only; no iomap_end). */
struct iomap_ops xfs_xattr_iomap_ops = {
	.iomap_begin		= xfs_xattr_iomap_begin,
};