xfs_bmap_util.c 52 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949
  1. /*
  2. * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  3. * Copyright (c) 2012 Red Hat, Inc.
  4. * All Rights Reserved.
  5. *
  6. * This program is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU General Public License as
  8. * published by the Free Software Foundation.
  9. *
  10. * This program is distributed in the hope that it would be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License
  16. * along with this program; if not, write the Free Software Foundation,
  17. * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  18. */
  19. #include "xfs.h"
  20. #include "xfs_fs.h"
  21. #include "xfs_shared.h"
  22. #include "xfs_format.h"
  23. #include "xfs_log_format.h"
  24. #include "xfs_trans_resv.h"
  25. #include "xfs_bit.h"
  26. #include "xfs_mount.h"
  27. #include "xfs_da_format.h"
  28. #include "xfs_inode.h"
  29. #include "xfs_btree.h"
  30. #include "xfs_trans.h"
  31. #include "xfs_extfree_item.h"
  32. #include "xfs_alloc.h"
  33. #include "xfs_bmap.h"
  34. #include "xfs_bmap_util.h"
  35. #include "xfs_bmap_btree.h"
  36. #include "xfs_rtalloc.h"
  37. #include "xfs_error.h"
  38. #include "xfs_quota.h"
  39. #include "xfs_trans_space.h"
  40. #include "xfs_trace.h"
  41. #include "xfs_icache.h"
  42. #include "xfs_log.h"
  43. /* Kernel only BMAP related definitions and functions */
  44. /*
  45. * Convert the given file system block to a disk block. We have to treat it
  46. * differently based on whether the file is a real time file or not, because the
  47. * bmap code does.
  48. */
  49. xfs_daddr_t
  50. xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
  51. {
  52. return (XFS_IS_REALTIME_INODE(ip) ? \
  53. (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
  54. XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
  55. }
/*
 * Routine to zero an extent on disk allocated to the specific inode.
 *
 * The VFS functions take a linearised filesystem block offset, so we have to
 * convert the sparse xfs fsb to the right format first.
 * VFS types are real funky, too.
 */
int
xfs_zero_extent(
	struct xfs_inode	*ip,
	xfs_fsblock_t		start_fsb,	/* first fs block to zero */
	xfs_off_t		count_fsb)	/* number of fs blocks */
{
	struct xfs_mount	*mp = ip->i_mount;
	/* daddr (512-byte basic blocks) of the start of the extent */
	xfs_daddr_t		sector = xfs_fsb_to_db(ip, start_fsb);
	/* linearised fs block number for the VFS/block layer */
	sector_t		block = XFS_BB_TO_FSBT(mp, sector);
	/* byte length, only used for the DAX path below */
	ssize_t			size = XFS_FSB_TO_B(mp, count_fsb);

	if (IS_DAX(VFS_I(ip)))
		return dax_clear_blocks(VFS_I(ip), block, size);

	/*
	 * let the block layer decide on the fastest method of
	 * implementing the zeroing.
	 *
	 * NOTE(review): sb_issue_zeroout() interprets both the offset and
	 * the count in sb_blocksize units — presumably that matches fs
	 * blocks here; verify block/count_fsb units agree for all configs.
	 */
	return sb_issue_zeroout(mp->m_super, block, count_fsb, GFP_NOFS);
}
/*
 * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
 * caller.  Frees all the extents that need freeing, which must be done
 * last due to locking considerations.  We never free any extents in
 * the first transaction.
 *
 * If an inode *ip is provided, rejoin it to the transaction if
 * the transaction was committed.
 *
 * Returns 0 on success or a negative errno; on error the caller owns
 * cleanup of any remaining entries on @flist.
 */
int						/* error */
xfs_bmap_finish(
	struct xfs_trans		**tp,	/* transaction pointer addr */
	struct xfs_bmap_free		*flist,	/* i/o: list extents to free */
	struct xfs_inode		*ip)
{
	struct xfs_efd_log_item		*efd;	/* extent free data */
	struct xfs_efi_log_item		*efi;	/* extent free intention */
	int				error;	/* error return value */
	int				committed;/* xact committed or not */
	struct xfs_bmap_free_item	*free;	/* free extent item */
	struct xfs_bmap_free_item	*next;	/* next item on free list */

	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
	if (flist->xbf_count == 0)
		return 0;

	/* Log an EFI covering every extent on the list before rolling. */
	efi = xfs_trans_get_efi(*tp, flist->xbf_count);
	for (free = flist->xbf_first; free; free = free->xbfi_next)
		xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock,
			free->xbfi_blockcount);

	/* Commit the EFI and start a new transaction in its place. */
	error = __xfs_trans_roll(tp, ip, &committed);
	if (error) {
		/*
		 * If the transaction was committed, drop the EFD reference
		 * since we're bailing out of here. The other reference is
		 * dropped when the EFI hits the AIL.
		 *
		 * If the transaction was not committed, the EFI is freed by the
		 * EFI item unlock handler on abort. Also, we have a new
		 * transaction so we should return committed=1 even though we're
		 * returning an error.
		 */
		if (committed) {
			xfs_efi_release(efi);
			xfs_force_shutdown((*tp)->t_mountp,
				(error == -EFSCORRUPTED) ?
					SHUTDOWN_CORRUPT_INCORE :
					SHUTDOWN_META_IO_ERROR);
		}
		return error;
	}

	/*
	 * Get an EFD and free each extent in the list, logging to the EFD in
	 * the process. The remaining bmap free list is cleaned up by the caller
	 * on error.
	 */
	efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count);
	for (free = flist->xbf_first; free != NULL; free = next) {
		next = free->xbfi_next;

		error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock,
					      free->xbfi_blockcount);
		if (error)
			return error;

		/* Remove the freed entry from the list as we go. */
		xfs_bmap_del_free(flist, NULL, free);
	}

	return 0;
}
/*
 * Allocate an extent on the realtime device for a bmap allocation request.
 * Aligns the request to the realtime extent size, optionally picks a
 * starting extent for new files, and falls back to an unaligned (prod=1)
 * allocation if the aligned attempt finds nothing.  On success updates the
 * inode block count and quota; on total failure sets ap->length = 0.
 */
int
xfs_bmap_rtalloc(
	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */
{
	xfs_alloctype_t	atype = 0;	/* type for allocation routines */
	int		error;		/* error return value */
	xfs_mount_t	*mp;		/* mount point structure */
	xfs_extlen_t	prod = 0;	/* product factor for allocators */
	xfs_extlen_t	ralen = 0;	/* realtime allocation length */
	xfs_extlen_t	align;		/* minimum allocation alignment */
	xfs_rtblock_t	rtb;

	mp = ap->ip->i_mount;
	align = xfs_get_extsz_hint(ap->ip);
	prod = align / mp->m_sb.sb_rextsize;
	error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
					align, 1, ap->eof, 0,
					ap->conv, &ap->offset, &ap->length);
	if (error)
		return error;
	ASSERT(ap->length);
	ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);

	/*
	 * If the offset & length are not perfectly aligned
	 * then kill prod, it will just get us in trouble.
	 */
	if (do_mod(ap->offset, align) || ap->length % align)
		prod = 1;
	/*
	 * Set ralen to be the actual requested length in rtextents.
	 */
	ralen = ap->length / mp->m_sb.sb_rextsize;
	/*
	 * If the old value was close enough to MAXEXTLEN that
	 * we rounded up to it, cut it back so it's valid again.
	 * Note that if it's a really large request (bigger than
	 * MAXEXTLEN), we don't hear about that number, and can't
	 * adjust the starting point to match it.
	 */
	if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
		ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;

	/*
	 * Lock out other modifications to the RT bitmap inode.
	 * The lock is joined to the transaction, so it is released
	 * when the transaction commits or cancels.
	 */
	xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);

	/*
	 * If it's an allocation to an empty file at offset 0,
	 * pick an extent that will space things out in the rt area.
	 */
	if (ap->eof && ap->offset == 0) {
		xfs_rtblock_t uninitialized_var(rtx);	/* realtime extent no */

		error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
		if (error)
			return error;
		ap->blkno = rtx * mp->m_sb.sb_rextsize;
	} else {
		ap->blkno = 0;
	}

	xfs_bmap_adjacent(ap);

	/*
	 * Realtime allocation, done through xfs_rtallocate_extent.
	 */
	atype = ap->blkno == 0 ?  XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
	do_div(ap->blkno, mp->m_sb.sb_rextsize);
	rtb = ap->blkno;
	ap->length = ralen;
	if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
				&ralen, atype, ap->wasdel, prod, &rtb)))
		return error;
	/* Aligned allocation failed: retry without the alignment factor. */
	if (rtb == NULLFSBLOCK && prod > 1 &&
	    (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1,
					   ap->length, &ralen, atype,
					   ap->wasdel, 1, &rtb)))
		return error;
	ap->blkno = rtb;
	if (ap->blkno != NULLFSBLOCK) {
		/* Convert rtextents back to filesystem blocks. */
		ap->blkno *= mp->m_sb.sb_rextsize;
		ralen *= mp->m_sb.sb_rextsize;
		ap->length = ralen;
		ap->ip->i_d.di_nblocks += ralen;
		xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
		if (ap->wasdel)
			ap->ip->i_delayed_blks -= ralen;
		/*
		 * Adjust the disk quota also. This was reserved
		 * earlier.
		 */
		xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
			ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
					XFS_TRANS_DQ_RTBCOUNT, (long) ralen);

		/* Zero the extent if we were asked to do so */
		if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) {
			error = xfs_zero_extent(ap->ip, ap->blkno, ap->length);
			if (error)
				return error;
		}
	} else {
		ap->length = 0;
	}
	return 0;
}
  247. /*
  248. * Check if the endoff is outside the last extent. If so the caller will grow
  249. * the allocation to a stripe unit boundary. All offsets are considered outside
  250. * the end of file for an empty fork, so 1 is returned in *eof in that case.
  251. */
  252. int
  253. xfs_bmap_eof(
  254. struct xfs_inode *ip,
  255. xfs_fileoff_t endoff,
  256. int whichfork,
  257. int *eof)
  258. {
  259. struct xfs_bmbt_irec rec;
  260. int error;
  261. error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof);
  262. if (error || *eof)
  263. return error;
  264. *eof = endoff >= rec.br_startoff + rec.br_blockcount;
  265. return 0;
  266. }
  267. /*
  268. * Extent tree block counting routines.
  269. */
  270. /*
  271. * Count leaf blocks given a range of extent records.
  272. */
  273. STATIC void
  274. xfs_bmap_count_leaves(
  275. xfs_ifork_t *ifp,
  276. xfs_extnum_t idx,
  277. int numrecs,
  278. int *count)
  279. {
  280. int b;
  281. for (b = 0; b < numrecs; b++) {
  282. xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
  283. *count += xfs_bmbt_get_blockcount(frp);
  284. }
  285. }
  286. /*
  287. * Count leaf blocks given a range of extent records originally
  288. * in btree format.
  289. */
  290. STATIC void
  291. xfs_bmap_disk_count_leaves(
  292. struct xfs_mount *mp,
  293. struct xfs_btree_block *block,
  294. int numrecs,
  295. int *count)
  296. {
  297. int b;
  298. xfs_bmbt_rec_t *frp;
  299. for (b = 1; b <= numrecs; b++) {
  300. frp = XFS_BMBT_REC_ADDR(mp, block, b);
  301. *count += xfs_bmbt_disk_get_blockcount(frp);
  302. }
  303. }
  304. /*
  305. * Recursively walks each level of a btree
  306. * to count total fsblocks in use.
  307. */
  308. STATIC int /* error */
  309. xfs_bmap_count_tree(
  310. xfs_mount_t *mp, /* file system mount point */
  311. xfs_trans_t *tp, /* transaction pointer */
  312. xfs_ifork_t *ifp, /* inode fork pointer */
  313. xfs_fsblock_t blockno, /* file system block number */
  314. int levelin, /* level in btree */
  315. int *count) /* Count of blocks */
  316. {
  317. int error;
  318. xfs_buf_t *bp, *nbp;
  319. int level = levelin;
  320. __be64 *pp;
  321. xfs_fsblock_t bno = blockno;
  322. xfs_fsblock_t nextbno;
  323. struct xfs_btree_block *block, *nextblock;
  324. int numrecs;
  325. error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
  326. &xfs_bmbt_buf_ops);
  327. if (error)
  328. return error;
  329. *count += 1;
  330. block = XFS_BUF_TO_BLOCK(bp);
  331. if (--level) {
  332. /* Not at node above leaves, count this level of nodes */
  333. nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
  334. while (nextbno != NULLFSBLOCK) {
  335. error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
  336. XFS_BMAP_BTREE_REF,
  337. &xfs_bmbt_buf_ops);
  338. if (error)
  339. return error;
  340. *count += 1;
  341. nextblock = XFS_BUF_TO_BLOCK(nbp);
  342. nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
  343. xfs_trans_brelse(tp, nbp);
  344. }
  345. /* Dive to the next level */
  346. pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
  347. bno = be64_to_cpu(*pp);
  348. if (unlikely((error =
  349. xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
  350. xfs_trans_brelse(tp, bp);
  351. XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
  352. XFS_ERRLEVEL_LOW, mp);
  353. return -EFSCORRUPTED;
  354. }
  355. xfs_trans_brelse(tp, bp);
  356. } else {
  357. /* count all level 1 nodes and their leaves */
  358. for (;;) {
  359. nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
  360. numrecs = be16_to_cpu(block->bb_numrecs);
  361. xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
  362. xfs_trans_brelse(tp, bp);
  363. if (nextbno == NULLFSBLOCK)
  364. break;
  365. bno = nextbno;
  366. error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
  367. XFS_BMAP_BTREE_REF,
  368. &xfs_bmbt_buf_ops);
  369. if (error)
  370. return error;
  371. *count += 1;
  372. block = XFS_BUF_TO_BLOCK(bp);
  373. }
  374. }
  375. return 0;
  376. }
/*
 * Count fsblocks of the given fork.
 *
 * For an extents-format fork we just sum the in-core extent records;
 * for a btree-format fork we walk the tree starting from the single
 * root pointer.  The result is accumulated into *count.
 */
int						/* error */
xfs_bmap_count_blocks(
	xfs_trans_t		*tp,		/* transaction pointer */
	xfs_inode_t		*ip,		/* incore inode */
	int			whichfork,	/* data or attr fork */
	int			*count)		/* out: count of blocks */
{
	struct xfs_btree_block	*block;	/* current btree block */
	xfs_fsblock_t		bno;	/* block # of "block" */
	xfs_ifork_t		*ifp;	/* fork structure */
	int			level;	/* btree level, for checking */
	xfs_mount_t		*mp;	/* file system mount structure */
	__be64			*pp;	/* pointer to block address */

	bno = NULLFSBLOCK;
	mp = ip->i_mount;
	ifp = XFS_IFORK_PTR(ip, whichfork);
	if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
		/* All records are in-core; sum them directly. */
		xfs_bmap_count_leaves(ifp, 0,
			ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
			count);
		return 0;
	}

	/*
	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
	 */
	block = ifp->if_broot;
	level = be16_to_cpu(block->bb_level);
	ASSERT(level > 0);
	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
	bno = be64_to_cpu(*pp);
	ASSERT(bno != NULLFSBLOCK);
	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
	ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);

	/* Errors are negative, so < 0 catches any failure in the walk. */
	if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
		XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
				 mp);
		return -EFSCORRUPTED;
	}

	return 0;
}
/*
 * Fix up a getbmapx entry that describes a hole or a real extent.
 *
 * For a hole at EOF on a preallocated file, trim the reported length to
 * the file size; for real/delalloc extents, translate the start block to
 * a disk address and flag the final extent with BMV_OF_LAST.
 *
 * returns 1 for success, 0 if we failed to map the extent.
 */
STATIC int
xfs_getbmapx_fix_eof_hole(
	xfs_inode_t		*ip,		/* xfs incore inode pointer */
	struct getbmapx		*out,		/* output structure */
	int			prealloced,	/* this is a file with
						 * preallocated data space */
	__int64_t		end,		/* last block requested */
	xfs_fsblock_t		startblock)
{
	__int64_t		fixlen;
	xfs_mount_t		*mp;		/* file system mount point */
	xfs_ifork_t		*ifp;		/* inode fork pointer */
	xfs_extnum_t		lastx;		/* last extent pointer */
	xfs_fileoff_t		fileblock;

	if (startblock == HOLESTARTBLOCK) {
		mp = ip->i_mount;
		out->bmv_block = -1;
		/* Length from the hole's start to the end of the file, in BBs. */
		fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
		fixlen -= out->bmv_offset;
		if (prealloced && out->bmv_offset + out->bmv_length == end) {
			/* Came to hole at EOF. Trim it. */
			if (fixlen <= 0)
				return 0;
			out->bmv_length = fixlen;
		}
	} else {
		if (startblock == DELAYSTARTBLOCK)
			out->bmv_block = -2;	/* delalloc sentinel for userspace */
		else
			out->bmv_block = xfs_fsb_to_db(ip, startblock);
		fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
		ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
		/* Mark the extent as the last one in the data fork. */
		if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
		   (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
			out->bmv_oflags |= BMV_OF_LAST;
	}

	return 1;
}
/*
 * Get inode's extents as described in bmv, and format for output.
 * Calls formatter to fill the user's buffer until all extents
 * are mapped, until the passed-in bmv->bmv_count slots have
 * been filled, or until the formatter short-circuits the loop,
 * if it is tracking filled-in extents on its own.
 *
 * Locking: takes the iolock shared for the duration, plus the
 * data/attr map lock (via xfs_ilock_*_map_shared) around the mapping
 * loop; results are formatted to the user only after all locks drop.
 */
int						/* error code */
xfs_getbmap(
	xfs_inode_t		*ip,
	struct getbmapx		*bmv,		/* user bmap structure */
	xfs_bmap_format_t	formatter,	/* format to user */
	void			*arg)		/* formatter arg */
{
	__int64_t		bmvend;		/* last block requested */
	int			error = 0;	/* return value */
	__int64_t		fixlen;		/* length for -1 case */
	int			i;		/* extent number */
	int			lock;		/* lock state */
	xfs_bmbt_irec_t		*map;		/* buffer for user's data */
	xfs_mount_t		*mp;		/* file system mount point */
	int			nex;		/* # of user extents can do */
	int			nexleft;	/* # of user extents left */
	int			subnex;		/* # of bmapi's can do */
	int			nmap;		/* number of map entries */
	struct getbmapx		*out;		/* output structure */
	int			whichfork;	/* data or attr fork */
	int			prealloced;	/* this is a file with
						 * preallocated data space */
	int			iflags;		/* interface flags */
	int			bmapi_flags;	/* flags for xfs_bmapi */
	int			cur_ext = 0;

	mp = ip->i_mount;
	iflags = bmv->bmv_iflags;
	whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;

	/* Validate the fork format before doing anything else. */
	if (whichfork == XFS_ATTR_FORK) {
		if (XFS_IFORK_Q(ip)) {
			if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
			    ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
			    ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
				return -EINVAL;
		} else if (unlikely(
			   ip->i_d.di_aformat != 0 &&
			   ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
			XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
					 ip->i_mount);
			return -EFSCORRUPTED;
		}

		prealloced = 0;
		fixlen = 1LL << 32;
	} else {
		if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
		    ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
		    ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
			return -EINVAL;

		if (xfs_get_extsz_hint(ip) ||
		    ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
			prealloced = 1;
			fixlen = mp->m_super->s_maxbytes;
		} else {
			prealloced = 0;
			fixlen = XFS_ISIZE(ip);
		}
	}

	/* bmv_length == -1 means "to the end of the file". */
	if (bmv->bmv_length == -1) {
		fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
		bmv->bmv_length =
			max_t(__int64_t, fixlen - bmv->bmv_offset, 0);
	} else if (bmv->bmv_length == 0) {
		bmv->bmv_entries = 0;
		return 0;
	} else if (bmv->bmv_length < 0) {
		return -EINVAL;
	}

	nex = bmv->bmv_count - 1;
	if (nex <= 0)
		return -EINVAL;
	bmvend = bmv->bmv_offset + bmv->bmv_length;


	if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
		return -ENOMEM;
	out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
	if (!out)
		return -ENOMEM;

	xfs_ilock(ip, XFS_IOLOCK_SHARED);
	if (whichfork == XFS_DATA_FORK) {
		if (!(iflags & BMV_IF_DELALLOC) &&
		    (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
			/* Flush dirty data so the extent map is up to date. */
			error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
			if (error)
				goto out_unlock_iolock;

			/*
			 * Even after flushing the inode, there can still be
			 * delalloc blocks on the inode beyond EOF due to
			 * speculative preallocation.  These are not removed
			 * until the release function is called or the inode
			 * is inactivated.  Hence we cannot assert here that
			 * ip->i_delayed_blks == 0.
			 */
		}

		lock = xfs_ilock_data_map_shared(ip);
	} else {
		lock = xfs_ilock_attr_map_shared(ip);
	}

	/*
	 * Don't let nex be bigger than the number of extents
	 * we can have assuming alternating holes and real extents.
	 */
	if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
		nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;

	bmapi_flags = xfs_bmapi_aflag(whichfork);
	if (!(iflags & BMV_IF_PREALLOC))
		bmapi_flags |= XFS_BMAPI_IGSTATE;

	/*
	 * Allocate enough space to handle "subnex" maps at a time.
	 */
	error = -ENOMEM;
	subnex = 16;
	map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
	if (!map)
		goto out_unlock_ilock;

	bmv->bmv_entries = 0;

	if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
	    (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
		error = 0;
		goto out_free_map;
	}

	nexleft = nex;

	do {
		nmap = (nexleft > subnex) ? subnex : nexleft;
		error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
				       XFS_BB_TO_FSB(mp, bmv->bmv_length),
				       map, &nmap, bmapi_flags);
		if (error)
			goto out_free_map;
		ASSERT(nmap <= subnex);

		for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
			out[cur_ext].bmv_oflags = 0;
			if (map[i].br_state == XFS_EXT_UNWRITTEN)
				out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
			else if (map[i].br_startblock == DELAYSTARTBLOCK)
				out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
			out[cur_ext].bmv_offset =
				XFS_FSB_TO_BB(mp, map[i].br_startoff);
			out[cur_ext].bmv_length =
				XFS_FSB_TO_BB(mp, map[i].br_blockcount);
			out[cur_ext].bmv_unused1 = 0;
			out[cur_ext].bmv_unused2 = 0;

			/*
			 * delayed allocation extents that start beyond EOF can
			 * occur due to speculative EOF allocation when the
			 * delalloc extent is larger than the largest freespace
			 * extent at conversion time.  These extents cannot be
			 * converted by data writeback, so can exist here even
			 * if we are not supposed to be finding delalloc
			 * extents.
			 */
			if (map[i].br_startblock == DELAYSTARTBLOCK &&
			    map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
				ASSERT((iflags & BMV_IF_DELALLOC) != 0);

			if (map[i].br_startblock == HOLESTARTBLOCK &&
			    whichfork == XFS_ATTR_FORK) {
				/* came to the end of attribute fork */
				out[cur_ext].bmv_oflags |= BMV_OF_LAST;
				goto out_free_map;
			}

			if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext],
					prealloced, bmvend,
					map[i].br_startblock))
				goto out_free_map;

			/* Advance the request window past this extent. */
			bmv->bmv_offset =
				out[cur_ext].bmv_offset +
				out[cur_ext].bmv_length;
			bmv->bmv_length =
				max_t(__int64_t, 0, bmvend - bmv->bmv_offset);

			/*
			 * In case we don't want to return the hole,
			 * don't increase cur_ext so that we can reuse
			 * it in the next loop.
			 */
			if ((iflags & BMV_IF_NO_HOLES) &&
			    map[i].br_startblock == HOLESTARTBLOCK) {
				memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
				continue;
			}

			nexleft--;
			bmv->bmv_entries++;
			cur_ext++;
		}
	} while (nmap && nexleft && bmv->bmv_length);

 out_free_map:
	kmem_free(map);
 out_unlock_ilock:
	xfs_iunlock(ip, lock);
 out_unlock_iolock:
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	/* Format results to the caller only after all locks are dropped. */
	for (i = 0; i < cur_ext; i++) {
		int full = 0;	/* user array is full */

		/* format results & advance arg */
		error = formatter(&arg, &out[i], &full);
		if (error || full)
			break;
	}

	kmem_free(out);
	return error;
}
/*
 * dead simple method of punching delalyed allocation blocks from a range in
 * the inode. Walks a block at a time so will be slow, but is only executed in
 * rare error cases so the overhead is not critical. This will always punch out
 * both the start and end blocks, even if the ranges only partially overlap
 * them, so it is up to the caller to ensure that partial blocks are not
 * passed in.
 *
 * Caller must hold the inode ilock exclusively (asserted below).
 */
int
xfs_bmap_punch_delalloc_range(
	struct xfs_inode	*ip,
	xfs_fileoff_t		start_fsb,
	xfs_fileoff_t		length)
{
	xfs_fileoff_t		remaining = length;
	int			error = 0;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));

	do {
		int		done;
		xfs_bmbt_irec_t	imap;
		int		nimaps = 1;
		xfs_fsblock_t	firstblock;
		xfs_bmap_free_t flist;

		/*
		 * Map the range first and check that it is a delalloc extent
		 * before trying to unmap the range. Otherwise we will be
		 * trying to remove a real extent (which requires a
		 * transaction) or a hole, which is probably a bad idea...
		 */
		error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps,
				       XFS_BMAPI_ENTIRE);

		if (error) {
			/* something screwed, just bail */
			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
				xfs_alert(ip->i_mount,
			"Failed delalloc mapping lookup ino %lld fsb %lld.",
						ip->i_ino, start_fsb);
			}
			break;
		}
		if (!nimaps) {
			/* nothing there */
			goto next_block;
		}
		if (imap.br_startblock != DELAYSTARTBLOCK) {
			/* been converted, ignore */
			goto next_block;
		}
		WARN_ON(imap.br_blockcount == 0);

		/*
		 * Note: while we initialise the firstblock/flist pair, they
		 * should never be used because blocks should never be
		 * allocated or freed for a delalloc extent and hence we need
		 * don't cancel or finish them after the xfs_bunmapi() call.
		 */
		xfs_bmap_init(&flist, &firstblock);
		error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
					&flist, &done);
		if (error)
			break;

		/* No real extents were touched, so the free list stays empty. */
		ASSERT(!flist.xbf_count && !flist.xbf_first);
next_block:
		start_fsb++;
		remaining--;
	} while(remaining > 0);

	return error;
}
  733. /*
  734. * Test whether it is appropriate to check an inode for and free post EOF
  735. * blocks. The 'force' parameter determines whether we should also consider
  736. * regular files that are marked preallocated or append-only.
  737. */
  738. bool
  739. xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
  740. {
  741. /* prealloc/delalloc exists only on regular files */
  742. if (!S_ISREG(ip->i_d.di_mode))
  743. return false;
  744. /*
  745. * Zero sized files with no cached pages and delalloc blocks will not
  746. * have speculative prealloc/delalloc blocks to remove.
  747. */
  748. if (VFS_I(ip)->i_size == 0 &&
  749. VFS_I(ip)->i_mapping->nrpages == 0 &&
  750. ip->i_delayed_blks == 0)
  751. return false;
  752. /* If we haven't read in the extent list, then don't do it now. */
  753. if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
  754. return false;
  755. /*
  756. * Do not free real preallocated or append-only files unless the file
  757. * has delalloc blocks and we are forced to remove them.
  758. */
  759. if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
  760. if (!force || ip->i_delayed_blks == 0)
  761. return false;
  762. return true;
  763. }
  764. /*
  765. * This is called by xfs_inactive to free any blocks beyond eof
  766. * when the link count isn't zero and by xfs_dm_punch_hole() when
  767. * punching a hole to EOF.
  768. */
int
xfs_free_eofblocks(
	xfs_mount_t	*mp,
	xfs_inode_t	*ip,
	bool		need_iolock)
{
	xfs_trans_t	*tp;
	int		error;
	xfs_fileoff_t	end_fsb;
	xfs_fileoff_t	last_fsb;
	xfs_filblks_t	map_len;
	int		nimaps;
	xfs_bmbt_irec_t	imap;

	/*
	 * Figure out if there are any blocks beyond the end
	 * of the file. If not, then there is nothing to do.
	 */
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
	last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
	if (last_fsb <= end_fsb)
		return 0;
	map_len = last_fsb - end_fsb;

	nimaps = 1;
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	/*
	 * Only truncate when the lookup found either a real mapping past
	 * EOF or outstanding delalloc blocks (which don't show up as a
	 * mapped extent but still need trimming).
	 */
	if (!error && (nimaps != 0) &&
	    (imap.br_startblock != HOLESTARTBLOCK ||
	     ip->i_delayed_blks)) {
		/*
		 * Attach the dquots to the inode up front.
		 */
		error = xfs_qm_dqattach(ip, 0);
		if (error)
			return error;

		/*
		 * There are blocks after the end of file.
		 * Free them up now by truncating the file to
		 * its current size.
		 */
		tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);

		if (need_iolock) {
			/*
			 * Trylock only: callers that cannot block on the
			 * IOLOCK here back off with -EAGAIN and retry later.
			 */
			if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
				xfs_trans_cancel(tp);
				return -EAGAIN;
			}
		}

		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
		if (error) {
			/* itruncate reservation only fails on shutdown */
			ASSERT(XFS_FORCED_SHUTDOWN(mp));
			xfs_trans_cancel(tp);
			if (need_iolock)
				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
			return error;
		}

		xfs_ilock(ip, XFS_ILOCK_EXCL);
		xfs_trans_ijoin(tp, ip, 0);

		/*
		 * Do not update the on-disk file size. If we update the
		 * on-disk file size and then the system crashes before the
		 * contents of the file are flushed to disk then the files
		 * may be full of holes (ie NULL files bug).
		 */
		error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
					      XFS_ISIZE(ip));
		if (error) {
			/*
			 * If we get an error at this point we simply don't
			 * bother truncating the file.
			 */
			xfs_trans_cancel(tp);
		} else {
			error = xfs_trans_commit(tp);
			if (!error)
				xfs_inode_clear_eofblocks_tag(ip);
		}

		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		if (need_iolock)
			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
	}
	return error;
}
/*
 * Allocate space for the byte range [offset, offset + len) of @ip.
 * @alloc_type is passed through to xfs_bmapi_write() (e.g.
 * XFS_BMAPI_PREALLOC for fallocate-style unwritten preallocation).
 * Loops transaction-by-transaction until the range is covered, an
 * error occurs, or space runs out (-ENOSPC).
 */
int
xfs_alloc_file_space(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		len,
	int			alloc_type)
{
	xfs_mount_t		*mp = ip->i_mount;
	xfs_off_t		count;
	xfs_filblks_t		allocated_fsb;
	xfs_filblks_t		allocatesize_fsb;
	xfs_extlen_t		extsz, temp;
	xfs_fileoff_t		startoffset_fsb;
	xfs_fsblock_t		firstfsb;
	int			nimaps;
	int			quota_flag;
	int			rt;
	xfs_trans_t		*tp;
	xfs_bmbt_irec_t		imaps[1], *imapp;
	xfs_bmap_free_t		free_list;
	uint			qblocks, resblks, resrtextents;
	int			error;

	trace_xfs_alloc_file_space(ip);

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	error = xfs_qm_dqattach(ip, 0);
	if (error)
		return error;

	if (len <= 0)
		return -EINVAL;

	rt = XFS_IS_REALTIME_INODE(ip);
	extsz = xfs_get_extsz_hint(ip);

	count = len;
	imapp = &imaps[0];
	nimaps = 1;
	startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
	allocatesize_fsb = XFS_B_TO_FSB(mp, count);

	/*
	 * Allocate file space until done or until there is an error
	 */
	while (allocatesize_fsb && !error) {
		xfs_fileoff_t	s, e;

		/*
		 * Determine space reservations for data/realtime.
		 */
		if (unlikely(extsz)) {
			/* round [s, e) out to the extent size hint */
			s = startoffset_fsb;
			do_div(s, extsz);
			s *= extsz;
			e = startoffset_fsb + allocatesize_fsb;
			if ((temp = do_mod(startoffset_fsb, extsz)))
				e += temp;
			if ((temp = do_mod(e, extsz)))
				e += extsz - temp;
		} else {
			s = 0;
			e = allocatesize_fsb;
		}

		/*
		 * The transaction reservation is limited to a 32-bit block
		 * count, hence we need to limit the number of blocks we are
		 * trying to reserve to avoid an overflow. We can't allocate
		 * more than @nimaps extents, and an extent is limited on disk
		 * to MAXEXTLEN (21 bits), so use that to enforce the limit.
		 */
		resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
		if (unlikely(rt)) {
			/* rt data blocks come from the rt device; the trans
			 * block reservation only covers bmbt blocks */
			resrtextents = qblocks = resblks;
			resrtextents /= mp->m_sb.sb_rextsize;
			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
			quota_flag = XFS_QMOPT_RES_RTBLKS;
		} else {
			resrtextents = 0;
			resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
			quota_flag = XFS_QMOPT_RES_REGBLKS;
		}

		/*
		 * Allocate and setup the transaction.
		 */
		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
					  resblks, resrtextents);
		/*
		 * Check for running out of space
		 */
		if (error) {
			/*
			 * Free the transaction structure.
			 */
			ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
			xfs_trans_cancel(tp);
			break;
		}
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
						      0, quota_flag);
		if (error)
			goto error1;

		xfs_trans_ijoin(tp, ip, 0);

		xfs_bmap_init(&free_list, &firstfsb);
		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
					allocatesize_fsb, alloc_type, &firstfsb,
					resblks, imapp, &nimaps, &free_list);
		if (error)
			goto error0;

		/*
		 * Complete the transaction
		 */
		error = xfs_bmap_finish(&tp, &free_list, NULL);
		if (error)
			goto error0;

		error = xfs_trans_commit(tp);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		if (error)
			break;

		/* stale when nimaps == 0, but then we break before using it */
		allocated_fsb = imapp->br_blockcount;

		if (nimaps == 0) {
			error = -ENOSPC;
			break;
		}

		startoffset_fsb += allocated_fsb;
		allocatesize_fsb -= allocated_fsb;
	}

	return error;

error0:	/* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
	xfs_bmap_cancel(&free_list);
	xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);

error1:	/* Just cancel transaction */
	xfs_trans_cancel(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}
  983. /*
  984. * Zero file bytes between startoff and endoff inclusive.
  985. * The iolock is held exclusive and no blocks are buffered.
  986. *
  987. * This function is used by xfs_free_file_space() to zero
  988. * partial blocks when the range to free is not block aligned.
  989. * When unreserving space with boundaries that are not block
  990. * aligned we round up the start and round down the end
  991. * boundaries and then use this function to zero the parts of
  992. * the blocks that got dropped during the rounding.
  993. */
STATIC int
xfs_zero_remaining_bytes(
	xfs_inode_t	*ip,
	xfs_off_t	startoff,
	xfs_off_t	endoff)
{
	xfs_bmbt_irec_t	imap;
	xfs_fileoff_t	offset_fsb;
	xfs_off_t	lastoffset;
	xfs_off_t	offset;
	xfs_buf_t	*bp;
	xfs_mount_t	*mp = ip->i_mount;
	int		nimap;
	int		error = 0;

	/*
	 * Avoid doing I/O beyond eof - it's not necessary
	 * since nothing can read beyond eof.  The space will
	 * be zeroed when the file is extended anyway.
	 */
	if (startoff >= XFS_ISIZE(ip))
		return 0;

	if (endoff > XFS_ISIZE(ip))
		endoff = XFS_ISIZE(ip);

	/* walk the range one mapped block at a time */
	for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
		uint lock_mode;

		offset_fsb = XFS_B_TO_FSBT(mp, offset);
		nimap = 1;

		lock_mode = xfs_ilock_data_map_shared(ip);
		error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
		xfs_iunlock(ip, lock_mode);

		if (error || nimap < 1)
			break;
		ASSERT(imap.br_blockcount >= 1);
		ASSERT(imap.br_startoff == offset_fsb);
		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);

		if (imap.br_startblock == HOLESTARTBLOCK ||
		    imap.br_state == XFS_EXT_UNWRITTEN) {
			/* skip the entire extent - it already reads as zeroes */
			lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff +
					imap.br_blockcount) - 1;
			continue;
		}

		/* zero at most to the end of this block, capped at endoff */
		lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
		if (lastoffset > endoff)
			lastoffset = endoff;

		/* DAX can just zero the backing device directly */
		if (IS_DAX(VFS_I(ip))) {
			error = dax_zero_page_range(VFS_I(ip), offset,
						    lastoffset - offset + 1,
						    xfs_get_blocks_direct);
			if (error)
				return error;
			continue;
		}

		/*
		 * Read-modify-write the block through an uncached buffer,
		 * bypassing the page cache (which was already flushed and
		 * invalidated by the caller).
		 */
		error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ?
				mp->m_rtdev_targp : mp->m_ddev_targp,
				xfs_fsb_to_db(ip, imap.br_startblock),
				BTOBB(mp->m_sb.sb_blocksize),
				0, &bp, NULL);
		if (error)
			return error;

		memset(bp->b_addr +
				(offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
		       0, lastoffset - offset + 1);

		error = xfs_bwrite(bp);
		xfs_buf_relse(bp);
		if (error)
			return error;
	}
	return error;
}
/*
 * Unmap (punch out) the byte range [offset, offset + len) of @ip,
 * zeroing any partial blocks at the edges.  Flushes and invalidates
 * the page cache over the affected range first.
 */
int
xfs_free_file_space(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		len)
{
	int			done;
	xfs_fileoff_t		endoffset_fsb;
	int			error;
	xfs_fsblock_t		firstfsb;
	xfs_bmap_free_t		free_list;
	xfs_bmbt_irec_t		imap;
	xfs_off_t		ioffset;
	xfs_off_t		iendoffset;
	xfs_extlen_t		mod=0;
	xfs_mount_t		*mp;
	int			nimap;
	uint			resblks;
	xfs_off_t		rounding;
	int			rt;
	xfs_fileoff_t		startoffset_fsb;
	xfs_trans_t		*tp;

	mp = ip->i_mount;

	trace_xfs_free_file_space(ip);

	error = xfs_qm_dqattach(ip, 0);
	if (error)
		return error;

	error = 0;
	if (len <= 0)	/* if nothing being freed */
		return error;
	rt = XFS_IS_REALTIME_INODE(ip);
	/* round start up and end down so only whole blocks get unmapped */
	startoffset_fsb	= XFS_B_TO_FSB(mp, offset);
	endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);

	/* wait for the completion of any pending DIOs */
	inode_dio_wait(VFS_I(ip));

	rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
	ioffset = round_down(offset, rounding);
	iendoffset = round_up(offset + len, rounding) - 1;
	error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset,
					     iendoffset);
	if (error)
		goto out;
	truncate_pagecache_range(VFS_I(ip), ioffset, iendoffset);

	/*
	 * Need to zero the stuff we're not freeing, on disk.
	 * If it's a realtime file & can't use unwritten extents then we
	 * actually need to zero the extent edges.  Otherwise xfs_bunmapi
	 * will take care of it for us.
	 */
	if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
		/* align the start up to the next rt extent boundary */
		nimap = 1;
		error = xfs_bmapi_read(ip, startoffset_fsb, 1,
					&imap, &nimap, 0);
		if (error)
			goto out;
		ASSERT(nimap == 0 || nimap == 1);
		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
			xfs_daddr_t	block;

			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
			block = imap.br_startblock;
			mod = do_div(block, mp->m_sb.sb_rextsize);
			if (mod)
				startoffset_fsb += mp->m_sb.sb_rextsize - mod;
		}
		/* pull the end back to the previous rt extent boundary;
		 * note that 'mod' deliberately carries over from above */
		nimap = 1;
		error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
					&imap, &nimap, 0);
		if (error)
			goto out;
		ASSERT(nimap == 0 || nimap == 1);
		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
			mod++;
			if (mod && (mod != mp->m_sb.sb_rextsize))
				endoffset_fsb -= mod;
		}
	}
	if ((done = (endoffset_fsb <= startoffset_fsb)))
		/*
		 * One contiguous piece to clear
		 */
		error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
	else {
		/*
		 * Some full blocks, possibly two pieces to clear
		 */
		if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
			error = xfs_zero_remaining_bytes(ip, offset,
				XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
		if (!error &&
		    XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
			error = xfs_zero_remaining_bytes(ip,
				XFS_FSB_TO_B(mp, endoffset_fsb),
				offset + len - 1);
	}

	/*
	 * free file space until done or until there is an error
	 */
	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
	while (!error && !done) {
		/*
		 * allocate and setup the transaction. Allow this
		 * transaction to dip into the reserve blocks to ensure
		 * the freeing of the space succeeds at ENOSPC.
		 */
		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);

		/*
		 * check for running out of space
		 */
		if (error) {
			/*
			 * Free the transaction structure.
			 */
			ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
			xfs_trans_cancel(tp);
			break;
		}
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		error = xfs_trans_reserve_quota(tp, mp,
				ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
				resblks, 0, XFS_QMOPT_RES_REGBLKS);
		if (error)
			goto error1;

		xfs_trans_ijoin(tp, ip, 0);

		/*
		 * issue the bunmapi() call to free the blocks
		 */
		xfs_bmap_init(&free_list, &firstfsb);
		error = xfs_bunmapi(tp, ip, startoffset_fsb,
				  endoffset_fsb - startoffset_fsb,
				  0, 2, &firstfsb, &free_list, &done);
		if (error)
			goto error0;

		/*
		 * complete the transaction
		 */
		error = xfs_bmap_finish(&tp, &free_list, NULL);
		if (error)
			goto error0;

		error = xfs_trans_commit(tp);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}

 out:
	return error;

 error0:
	xfs_bmap_cancel(&free_list);
 error1:
	xfs_trans_cancel(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	goto out;
}
  1217. /*
  1218. * Preallocate and zero a range of a file. This mechanism has the allocation
  1219. * semantics of fallocate and in addition converts data in the range to zeroes.
  1220. */
  1221. int
  1222. xfs_zero_file_space(
  1223. struct xfs_inode *ip,
  1224. xfs_off_t offset,
  1225. xfs_off_t len)
  1226. {
  1227. struct xfs_mount *mp = ip->i_mount;
  1228. uint blksize;
  1229. int error;
  1230. trace_xfs_zero_file_space(ip);
  1231. blksize = 1 << mp->m_sb.sb_blocklog;
  1232. /*
  1233. * Punch a hole and prealloc the range. We use hole punch rather than
  1234. * unwritten extent conversion for two reasons:
  1235. *
  1236. * 1.) Hole punch handles partial block zeroing for us.
  1237. *
  1238. * 2.) If prealloc returns ENOSPC, the file range is still zero-valued
  1239. * by virtue of the hole punch.
  1240. */
  1241. error = xfs_free_file_space(ip, offset, len);
  1242. if (error)
  1243. goto out;
  1244. error = xfs_alloc_file_space(ip, round_down(offset, blksize),
  1245. round_up(offset + len, blksize) -
  1246. round_down(offset, blksize),
  1247. XFS_BMAPI_PREALLOC);
  1248. out:
  1249. return error;
  1250. }
  1251. /*
  1252. * @next_fsb will keep track of the extent currently undergoing shift.
  1253. * @stop_fsb will keep track of the extent at which we have to stop.
  1254. * If we are shifting left, we will start with block (offset + len) and
  1255. * shift each extent till last extent.
  1256. * If we are shifting right, we will start with last extent inside file space
  1257. * and continue until we reach the block corresponding to offset.
  1258. */
static int
xfs_shift_file_space(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		len,
	enum shift_direction	direction)
{
	int			done = 0;
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			error;
	struct xfs_bmap_free	free_list;
	xfs_fsblock_t		first_block;
	xfs_fileoff_t		stop_fsb;
	xfs_fileoff_t		next_fsb;
	xfs_fileoff_t		shift_fsb;

	ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT);

	if (direction == SHIFT_LEFT) {
		next_fsb = XFS_B_TO_FSB(mp, offset + len);
		stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size);
	} else {
		/*
		 * If right shift, delegate the work of initialization of
		 * next_fsb to xfs_bmap_shift_extent as it has ilock held.
		 */
		next_fsb = NULLFSBLOCK;
		stop_fsb = XFS_B_TO_FSB(mp, offset);
	}

	shift_fsb = XFS_B_TO_FSB(mp, len);

	/*
	 * Trim eofblocks to avoid shifting uninitialized post-eof preallocation
	 * into the accessible region of the file.
	 */
	if (xfs_can_free_eofblocks(ip, true)) {
		error = xfs_free_eofblocks(mp, ip, false);
		if (error)
			return error;
	}

	/*
	 * Writeback and invalidate cache for the remainder of the file as we're
	 * about to shift down every extent from offset to EOF.
	 */
	error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
					     offset, -1);
	if (error)
		return error;
	error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
					offset >> PAGE_CACHE_SHIFT, -1);
	if (error)
		return error;

	/*
	 * The extent shifting code works on extent granularity. So, if
	 * stop_fsb is not the starting block of extent, we need to split
	 * the extent at stop_fsb.
	 */
	if (direction == SHIFT_RIGHT) {
		error = xfs_bmap_split_extent(ip, stop_fsb);
		if (error)
			return error;
	}

	/* shift up to XFS_BMAP_MAX_SHIFT_EXTENTS per transaction until done */
	while (!error && !done) {
		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
		/*
		 * We would need to reserve permanent block for transaction.
		 * This will come into picture when after shifting extent into
		 * hole we found that adjacent extents can be merged which
		 * may lead to freeing of a block during record update.
		 */
		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
				XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
		if (error) {
			xfs_trans_cancel(tp);
			break;
		}

		xfs_ilock(ip, XFS_ILOCK_EXCL);
		error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
				ip->i_gdquot, ip->i_pdquot,
				XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
				XFS_QMOPT_RES_REGBLKS);
		if (error)
			goto out_trans_cancel;

		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);

		xfs_bmap_init(&free_list, &first_block);

		/*
		 * We are using the write transaction in which max 2 bmbt
		 * updates are allowed
		 */
		error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb,
				&done, stop_fsb, &first_block, &free_list,
				direction, XFS_BMAP_MAX_SHIFT_EXTENTS);
		if (error)
			goto out_bmap_cancel;

		error = xfs_bmap_finish(&tp, &free_list, NULL);
		if (error)
			goto out_bmap_cancel;

		error = xfs_trans_commit(tp);
	}

	return error;

out_bmap_cancel:
	xfs_bmap_cancel(&free_list);
out_trans_cancel:
	xfs_trans_cancel(tp);
	return error;
}
/*
 * xfs_collapse_file_space()
 * This routine frees disk space and shifts extents for the given file.
 * The first thing we do is to free data blocks in the specified range
 * by calling xfs_free_file_space(). It would also sync dirty data
 * and invalidate page cache over the region on which collapse range
 * is working. Then we shift extent records to the left to cover the hole.
 * RETURNS:
 * 0 on success
 * errno on error
 *
 */
  1375. int
  1376. xfs_collapse_file_space(
  1377. struct xfs_inode *ip,
  1378. xfs_off_t offset,
  1379. xfs_off_t len)
  1380. {
  1381. int error;
  1382. ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
  1383. trace_xfs_collapse_file_space(ip);
  1384. error = xfs_free_file_space(ip, offset, len);
  1385. if (error)
  1386. return error;
  1387. return xfs_shift_file_space(ip, offset, len, SHIFT_LEFT);
  1388. }
/*
 * xfs_insert_file_space()
 * This routine creates hole space by shifting extents for the given file.
 * The first thing we do is to sync dirty data and invalidate page cache
 * over the region on which insert range is working. Then we split an extent
 * into two extents at the given offset by calling xfs_bmap_split_extent.
 * And we shift all extent records which lie between [offset,
 * last allocated extent] to the right to reserve the hole range.
 * RETURNS:
 * 0 on success
 * errno on error
 */
  1401. int
  1402. xfs_insert_file_space(
  1403. struct xfs_inode *ip,
  1404. loff_t offset,
  1405. loff_t len)
  1406. {
  1407. ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
  1408. trace_xfs_insert_file_space(ip);
  1409. return xfs_shift_file_space(ip, offset, len, SHIFT_RIGHT);
  1410. }
/*
 * We need to check that the format of the data fork in the temporary inode is
 * valid for the target inode before doing the swap. This is not a problem with
 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
 * data fork depending on the space the attribute fork is taking so we can get
 * invalid formats on the target inode.
 *
 * E.g. target has space for 7 extents in extent format, temp inode only has
 * space for 6. If we defragment down to 7 extents, then the tmp format is a
 * btree, but when swapped it needs to be in extent format. Hence we can't just
 * blindly swap data forks on attr2 filesystems.
 *
 * Note that we check the swap in both directions so that we don't end up with
 * a corrupt temporary inode, either.
 *
 * Note that fixing the way xfs_fsr sets up the attribute fork in the source
 * inode will prevent this situation from occurring, so all we do here is
 * reject and log the attempt. Basically, we are putting the responsibility on
 * userspace to get this right.
 */
  1431. static int
  1432. xfs_swap_extents_check_format(
  1433. xfs_inode_t *ip, /* target inode */
  1434. xfs_inode_t *tip) /* tmp inode */
  1435. {
  1436. /* Should never get a local format */
  1437. if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
  1438. tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
  1439. return -EINVAL;
  1440. /*
  1441. * if the target inode has less extents that then temporary inode then
  1442. * why did userspace call us?
  1443. */
  1444. if (ip->i_d.di_nextents < tip->i_d.di_nextents)
  1445. return -EINVAL;
  1446. /*
  1447. * if the target inode is in extent form and the temp inode is in btree
  1448. * form then we will end up with the target inode in the wrong format
  1449. * as we already know there are less extents in the temp inode.
  1450. */
  1451. if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
  1452. tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
  1453. return -EINVAL;
  1454. /* Check temp in extent form to max in target */
  1455. if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
  1456. XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
  1457. XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
  1458. return -EINVAL;
  1459. /* Check target in extent form to max in temp */
  1460. if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
  1461. XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
  1462. XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
  1463. return -EINVAL;
  1464. /*
  1465. * If we are in a btree format, check that the temp root block will fit
  1466. * in the target and that it has enough extents to be in btree format
  1467. * in the target.
  1468. *
  1469. * Note that we have to be careful to allow btree->extent conversions
  1470. * (a common defrag case) which will occur when the temp inode is in
  1471. * extent format...
  1472. */
  1473. if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
  1474. if (XFS_IFORK_BOFF(ip) &&
  1475. XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
  1476. return -EINVAL;
  1477. if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
  1478. XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
  1479. return -EINVAL;
  1480. }
  1481. /* Reciprocal target->temp btree format checks */
  1482. if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
  1483. if (XFS_IFORK_BOFF(tip) &&
  1484. XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
  1485. return -EINVAL;
  1486. if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
  1487. XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
  1488. return -EINVAL;
  1489. }
  1490. return 0;
  1491. }
  1492. static int
  1493. xfs_swap_extent_flush(
  1494. struct xfs_inode *ip)
  1495. {
  1496. int error;
  1497. error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
  1498. if (error)
  1499. return error;
  1500. truncate_pagecache_range(VFS_I(ip), 0, -1);
  1501. /* Verify O_DIRECT for ftmp */
  1502. if (VFS_I(ip)->i_mapping->nrpages)
  1503. return -EINVAL;
  1504. return 0;
  1505. }
  1506. int
  1507. xfs_swap_extents(
  1508. xfs_inode_t *ip, /* target inode */
  1509. xfs_inode_t *tip, /* tmp inode */
  1510. xfs_swapext_t *sxp)
  1511. {
  1512. xfs_mount_t *mp = ip->i_mount;
  1513. xfs_trans_t *tp;
  1514. xfs_bstat_t *sbp = &sxp->sx_stat;
  1515. xfs_ifork_t *tempifp, *ifp, *tifp;
  1516. int src_log_flags, target_log_flags;
  1517. int error = 0;
  1518. int aforkblks = 0;
  1519. int taforkblks = 0;
  1520. __uint64_t tmp;
  1521. int lock_flags;
  1522. tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
  1523. if (!tempifp) {
  1524. error = -ENOMEM;
  1525. goto out;
  1526. }
  1527. /*
  1528. * Lock the inodes against other IO, page faults and truncate to
  1529. * begin with. Then we can ensure the inodes are flushed and have no
  1530. * page cache safely. Once we have done this we can take the ilocks and
  1531. * do the rest of the checks.
  1532. */
  1533. lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
  1534. xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
  1535. xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
  1536. /* Verify that both files have the same format */
  1537. if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
  1538. error = -EINVAL;
  1539. goto out_unlock;
  1540. }
  1541. /* Verify both files are either real-time or non-realtime */
  1542. if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
  1543. error = -EINVAL;
  1544. goto out_unlock;
  1545. }
  1546. error = xfs_swap_extent_flush(ip);
  1547. if (error)
  1548. goto out_unlock;
	/* Flush the target inode before allocating the swap transaction. */
	error = xfs_swap_extent_flush(tip);
	if (error)
		goto out_unlock;

	tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
	if (error) {
		/* Reservation failed: cancel releases the transaction. */
		xfs_trans_cancel(tp);
		goto out_unlock;
	}

	/*
	 * Lock and join the inodes to the transaction so that transaction commit
	 * or cancel will unlock the inodes from this point onwards.
	 */
	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
	lock_flags |= XFS_ILOCK_EXCL;
	xfs_trans_ijoin(tp, ip, lock_flags);
	xfs_trans_ijoin(tp, tip, lock_flags);

	/*
	 * Verify all data are being swapped: the caller must request a
	 * whole-file swap (offset 0, length equal to both inode sizes).
	 */
	if (sxp->sx_offset != 0 ||
	    sxp->sx_length != ip->i_d.di_size ||
	    sxp->sx_length != tip->i_d.di_size) {
		error = -EFAULT;
		goto out_trans_cancel;
	}

	trace_xfs_swap_extent_before(ip, 0);
	trace_xfs_swap_extent_before(tip, 1);

	/* check inode formats now that data is flushed */
	error = xfs_swap_extents_check_format(ip, tip);
	if (error) {
		xfs_notice(mp,
		    "%s: inode 0x%llx format is incompatible for exchanging.",
				__func__, ip->i_ino);
		goto out_trans_cancel;
	}

	/*
	 * Compare the current change & modify times with that
	 * passed in.  If they differ, we abort this swap.
	 * This is the mechanism used to ensure the calling
	 * process that the file was not changed out from
	 * under it.
	 */
	if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
	    (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
	    (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
	    (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
		error = -EBUSY;
		goto out_trans_cancel;
	}

	/*
	 * Count the number of extended attribute blocks on each inode so
	 * the di_nblocks fixup below can exclude attr-fork blocks (only
	 * the data forks are swapped).  Local-format attr forks occupy no
	 * blocks, so they are skipped.
	 */
	if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
	     (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
		error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
		if (error)
			goto out_trans_cancel;
	}
	if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
	     (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
		error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
			&taforkblks);
		if (error)
			goto out_trans_cancel;
	}

	/*
	 * Before we've swapped the forks, lets set the owners of the forks
	 * appropriately. We have to do this as we are demand paging the btree
	 * buffers, and so the validation done on read will expect the owner
	 * field to be correctly set. Once we change the owners, we can swap the
	 * inode forks.
	 *
	 * Note the trickiness in setting the log flags - we set the owner log
	 * flag on the opposite inode (i.e. the inode we are setting the new
	 * owner to be) because once we swap the forks and log that, log
	 * recovery is going to see the fork as owned by the swapped inode,
	 * not the pre-swapped inodes.
	 */
	src_log_flags = XFS_ILOG_CORE;
	target_log_flags = XFS_ILOG_CORE;

	/* Owner change is only needed for v3 (CRC) inodes in btree format. */
	if (ip->i_d.di_version == 3 &&
	    ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
		target_log_flags |= XFS_ILOG_DOWNER;
		error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
					      tip->i_ino, NULL);
		if (error)
			goto out_trans_cancel;
	}

	if (tip->i_d.di_version == 3 &&
	    tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
		src_log_flags |= XFS_ILOG_DOWNER;
		error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
					      ip->i_ino, NULL);
		if (error)
			goto out_trans_cancel;
	}

	/*
	 * Swap the data forks of the inodes (three-way struct copy via the
	 * preallocated *tempifp).
	 */
	ifp = &ip->i_df;
	tifp = &tip->i_df;
	*tempifp = *ifp;	/* struct copy */
	*ifp = *tifp;		/* struct copy */
	*tifp = *tempifp;	/* struct copy */

	/*
	 * Fix the on-disk inode values: swap nblocks (adjusted so each
	 * inode keeps its own attr-fork block count), nextents and format.
	 */
	tmp = (__uint64_t)ip->i_d.di_nblocks;
	ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
	tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;

	tmp = (__uint64_t) ip->i_d.di_nextents;
	ip->i_d.di_nextents = tip->i_d.di_nextents;
	tip->i_d.di_nextents = tmp;

	tmp = (__uint64_t) ip->i_d.di_format;
	ip->i_d.di_format = tip->i_d.di_format;
	tip->i_d.di_format = tmp;

	/*
	 * The extents in the source inode could still contain speculative
	 * preallocation beyond EOF (e.g. the file is open but not modified
	 * while defrag is in progress). In that case, we need to copy over the
	 * number of delalloc blocks the data fork in the source inode is
	 * tracking beyond EOF so that when the fork is truncated away when the
	 * temporary inode is unlinked we don't underrun the i_delayed_blks
	 * counter on that inode.
	 */
	ASSERT(tip->i_delayed_blks == 0);
	tip->i_delayed_blks = ip->i_delayed_blks;
	ip->i_delayed_blks = 0;

	switch (ip->i_d.di_format) {
	case XFS_DINODE_FMT_EXTENTS:
		/* If the extents fit in the inode, fix the
		 * pointer.  Otherwise it's already NULL or
		 * pointing to the extent.
		 */
		if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
			ifp->if_u1.if_extents =
				ifp->if_u2.if_inline_ext;
		}
		src_log_flags |= XFS_ILOG_DEXT;
		break;
	case XFS_DINODE_FMT_BTREE:
		/* v3 btree forks must already carry the owner-change flag. */
		ASSERT(ip->i_d.di_version < 3 ||
		       (src_log_flags & XFS_ILOG_DOWNER));
		src_log_flags |= XFS_ILOG_DBROOT;
		break;
	}

	switch (tip->i_d.di_format) {
	case XFS_DINODE_FMT_EXTENTS:
		/* If the extents fit in the inode, fix the
		 * pointer.  Otherwise it's already NULL or
		 * pointing to the extent.
		 */
		if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
			tifp->if_u1.if_extents =
				tifp->if_u2.if_inline_ext;
		}
		target_log_flags |= XFS_ILOG_DEXT;
		break;
	case XFS_DINODE_FMT_BTREE:
		target_log_flags |= XFS_ILOG_DBROOT;
		/* v3 btree forks must already carry the owner-change flag. */
		ASSERT(tip->i_d.di_version < 3 ||
		       (target_log_flags & XFS_ILOG_DOWNER));
		break;
	}

	xfs_trans_log_inode(tp, ip, src_log_flags);
	xfs_trans_log_inode(tp, tip, target_log_flags);

	/*
	 * If this is a synchronous mount, make sure that the
	 * transaction goes to disk before returning to the user.
	 */
	if (mp->m_flags & XFS_MOUNT_WSYNC)
		xfs_trans_set_sync(tp);

	error = xfs_trans_commit(tp);

	trace_xfs_swap_extent_after(ip, 0);
	trace_xfs_swap_extent_after(tip, 1);

out:
	/* Common exit: free the temporary fork buffer allocated earlier. */
	kmem_free(tempifp);
	return error;

out_unlock:
	/* Failure before the inodes were joined: unlock them by hand. */
	xfs_iunlock(ip, lock_flags);
	xfs_iunlock(tip, lock_flags);
	goto out;

out_trans_cancel:
	/* After ijoin, cancelling the transaction also unlocks the inodes. */
	xfs_trans_cancel(tp);
	goto out;
}