// SPDX-License-Identifier: GPL-2.0
/*
 *  fs/ext4/extents_status.c
 *
 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
 * Modified by
 *	Allison Henderson <achender@linux.vnet.ibm.com>
 *	Hugh Dickins <hughd@google.com>
 *	Zheng Liu <wenqing.lz@taobao.com>
 *
 * Ext4 extents status tree core functions.
 */
#include <linux/list_sort.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include "ext4.h"
#include <trace/events/ext4.h>

/*
 * According to previous discussion in Ext4 Developer Workshop, we
 * will introduce a new structure called io tree to track all extent
 * status in order to solve some problems that we have met
 * (e.g. reservation space warning), and provide extent-level locking.
 * The delay extent tree is the first step to achieve this goal.  It was
 * originally built by Yongqiang Yang.  At that time it was called a
 * delay extent tree, whose goal was only to track delayed extents in
 * memory to simplify the implementation of fiemap and bigalloc, and to
 * introduce lseek SEEK_DATA/SEEK_HOLE support.  That is why it is
 * still called a delay extent tree in the first commit.  But to better
 * describe what it does, it has been renamed to extent status tree.
 *
 * Step1:
 * Currently the first step has been done.  All delayed extents are
 * tracked in the tree.  It maintains the delayed extent when a delayed
 * allocation is issued, and the delayed extent is written out or
 * invalidated.  Therefore the implementations of fiemap and bigalloc
 * are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
 *
 * The following comment describes the implementation of extent
 * status tree and future works.
 *
 * Step2:
 * In this step all extent status is tracked by the extent status tree.
 * Thus, we can first try to look up a block mapping in this tree before
 * searching the extent tree.  Hence, the single extent cache can be
 * removed because the extent status tree can do a better job.  Extents
 * in the status tree are loaded on-demand.  Therefore, the extent
 * status tree may not contain all of the extents in a file.  Meanwhile
 * we define a shrinker to reclaim memory from the extent status tree
 * because a fragmented extent tree will make the status tree cost too
 * much memory.  Written/unwritten/hole extents in the tree will be
 * reclaimed by this shrinker when we are under high memory pressure.
 * Delayed extents will not be reclaimed because fiemap, bigalloc, and
 * seek_data/hole need them.
 */

/*
 * Extent status tree implementation for ext4.
 *
 *
 * ==========================================================================
 * Extent status tree tracks all extent status.
 *
 * 1. Why do we need to implement an extent status tree?
 *
 * Without an extent status tree, ext4 identifies a delayed extent by looking
 * up the page cache; this has several deficiencies: complicated, buggy,
 * and inefficient code.
 *
 * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a
 * block or a range of blocks belongs to a delayed extent.
 *
 * Let us have a look at how they work without the extent status tree.
 *   --	FIEMAP
 *	FIEMAP looks up the page cache to distinguish delayed allocations
 *	from holes.
 *
 *   --	SEEK_HOLE/DATA
 *	SEEK_HOLE/DATA has the same problem as FIEMAP.
 *
 *   --	bigalloc
 *	bigalloc looks up the page cache to figure out if a block is
 *	already under delayed allocation or not in order to determine
 *	whether quota reservation is needed for the cluster.
 *
 *   --	writeout
 *	Writeout looks up the whole page cache to see if a buffer is
 *	mapped.  If there are not very many delayed buffers, this is
 *	time consuming.
 *
 * With the extent status tree implementation, FIEMAP, SEEK_HOLE/DATA,
 * bigalloc and writeout can figure out whether a block or a range of
 * blocks is under delayed allocation (belongs to a delayed extent) or
 * not by searching the status tree.
 *
 *
 * ==========================================================================
 * 2. Ext4 extent status tree implementation
 *
 *   --	extent
 *	An extent is a range of blocks which are contiguous logically and
 *	physically.  Unlike the extent in the extent tree, this extent is
 *	an in-memory struct; there is no corresponding on-disk data.  There
 *	is no limit on the length of an extent, so an extent can contain as
 *	many blocks as are contiguous logically and physically.
 *
 *   --	extent status tree
 *	Every inode has an extent status tree and all allocation blocks
 *	are added to the tree with different status.  The extents in the
 *	tree are ordered by logical block no.
 *
 *   --	operations on an extent status tree
 *	There are three important operations on an extent status tree: find
 *	the next extent, adding an extent (a range of blocks) and removing
 *	an extent.
 *
 *   --	race on an extent status tree
 *	The extent status tree is protected by inode->i_es_lock.
 *
 *   --	memory consumption
 *	A fragmented extent tree will make the extent status tree cost too
 *	much memory.  Hence, we will reclaim written/unwritten/hole extents
 *	from the tree under heavy memory pressure.
 *
 *
 * ==========================================================================
 * 3. Performance analysis
 *
 *   --	overhead
 *	1. There is a cached extent for write access, so if writes are
 *	not very random, adding space operations are in O(1) time.
 *
 *   --	gain
 *	2. Code is much simpler, more readable, more maintainable and
 *	more efficient.
 *
 *
 * ==========================================================================
 * 4. TODO list
 *
 *   -- Refactor delayed space reservation
 *
 *   -- Extent-level locking
 */

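/*
 * Overview of the main entry points defined below:
 *
 *   ext4_es_insert_extent()	-- add/replace status for a range of blocks
 *   ext4_es_cache_extent()	-- add status only if the range is not present
 *   ext4_es_lookup_extent()	-- look up the status of a single block
 *   ext4_es_remove_extent()	-- drop status for a range of blocks
 *   ext4_es_find_delayed_extent_range() -- find the next delayed extent
 *   ext4_es_register_shrinker() -- set up the per-sb shrinker and statistics
 *
 * The per-inode operations take inode->i_es_lock internally.
 */
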
static struct kmem_cache *ext4_es_cachep;

static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
			      ext4_lblk_t end);
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
		       struct ext4_inode_info *locked_ei);

int __init ext4_init_es(void)
{
	ext4_es_cachep = kmem_cache_create("ext4_extent_status",
					   sizeof(struct extent_status),
					   0, (SLAB_RECLAIM_ACCOUNT), NULL);
	if (ext4_es_cachep == NULL)
		return -ENOMEM;
	return 0;
}

void ext4_exit_es(void)
{
	if (ext4_es_cachep)
		kmem_cache_destroy(ext4_es_cachep);
}

void ext4_es_init_tree(struct ext4_es_tree *tree)
{
	tree->root = RB_ROOT;
	tree->cache_es = NULL;
}

#ifdef ES_DEBUG__
static void ext4_es_print_tree(struct inode *inode)
{
	struct ext4_es_tree *tree;
	struct rb_node *node;

	printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
	tree = &EXT4_I(inode)->i_es_tree;
	node = rb_first(&tree->root);
	while (node) {
		struct extent_status *es;

		es = rb_entry(node, struct extent_status, rb_node);
		printk(KERN_DEBUG " [%u/%u) %llu %x",
		       es->es_lblk, es->es_len,
		       ext4_es_pblock(es), ext4_es_status(es));
		node = rb_next(node);
	}
	printk(KERN_DEBUG "\n");
}
#else
#define ext4_es_print_tree(inode)
#endif

static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
{
	BUG_ON(es->es_lblk + es->es_len < es->es_lblk);
	return es->es_lblk + es->es_len - 1;
}

/*
 * search through the tree for a delayed extent with a given offset.  If
 * it can't be found, try to find the next extent.
 */
static struct extent_status *__es_tree_search(struct rb_root *root,
					      ext4_lblk_t lblk)
{
	struct rb_node *node = root->rb_node;
	struct extent_status *es = NULL;

	while (node) {
		es = rb_entry(node, struct extent_status, rb_node);
		if (lblk < es->es_lblk)
			node = node->rb_left;
		else if (lblk > ext4_es_end(es))
			node = node->rb_right;
		else
			return es;
	}

	if (es && lblk < es->es_lblk)
		return es;

	if (es && lblk > ext4_es_end(es)) {
		node = rb_next(&es->rb_node);
		return node ? rb_entry(node, struct extent_status, rb_node) :
			      NULL;
	}

	return NULL;
}

/*
 * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering
 * @es->lblk if it exists, otherwise, the next extent after @es->lblk.
 *
 * @inode: the inode which owns delayed extents
 * @lblk: the offset where we start to search
 * @end: the offset where we stop to search
 * @es: delayed extent that we found
 */
void ext4_es_find_delayed_extent_range(struct inode *inode,
				       ext4_lblk_t lblk, ext4_lblk_t end,
				       struct extent_status *es)
{
	struct ext4_es_tree *tree = NULL;
	struct extent_status *es1 = NULL;
	struct rb_node *node;

	BUG_ON(es == NULL);
	BUG_ON(end < lblk);
	trace_ext4_es_find_delayed_extent_range_enter(inode, lblk);

	read_lock(&EXT4_I(inode)->i_es_lock);
	tree = &EXT4_I(inode)->i_es_tree;

	/* find extent in cache firstly */
	es->es_lblk = es->es_len = es->es_pblk = 0;
	if (tree->cache_es) {
		es1 = tree->cache_es;
		if (in_range(lblk, es1->es_lblk, es1->es_len)) {
			es_debug("%u cached by [%u/%u) %llu %x\n",
				 lblk, es1->es_lblk, es1->es_len,
				 ext4_es_pblock(es1), ext4_es_status(es1));
			goto out;
		}
	}

	es1 = __es_tree_search(&tree->root, lblk);

out:
	if (es1 && !ext4_es_is_delayed(es1)) {
		while ((node = rb_next(&es1->rb_node)) != NULL) {
			es1 = rb_entry(node, struct extent_status, rb_node);
			if (es1->es_lblk > end) {
				es1 = NULL;
				break;
			}
			if (ext4_es_is_delayed(es1))
				break;
		}
	}

	if (es1 && ext4_es_is_delayed(es1)) {
		tree->cache_es = es1;
		es->es_lblk = es1->es_lblk;
		es->es_len = es1->es_len;
		es->es_pblk = es1->es_pblk;
	}

	read_unlock(&EXT4_I(inode)->i_es_lock);

	trace_ext4_es_find_delayed_extent_range_exit(inode, es);
}

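/*
 * Add/remove an inode on the superblock-wide list (sbi->s_es_list) of
 * inodes that own reclaimable (non-delayed) extents.  The shrinker walks
 * this list, so an inode is put on it only while i_es_shk_nr is non-zero.
 */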
static void ext4_es_list_add(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	if (!list_empty(&ei->i_es_list))
		return;

	spin_lock(&sbi->s_es_lock);
	if (list_empty(&ei->i_es_list)) {
		list_add_tail(&ei->i_es_list, &sbi->s_es_list);
		sbi->s_es_nr_inode++;
	}
	spin_unlock(&sbi->s_es_lock);
}

static void ext4_es_list_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	spin_lock(&sbi->s_es_lock);
	if (!list_empty(&ei->i_es_list)) {
		list_del_init(&ei->i_es_list);
		sbi->s_es_nr_inode--;
		WARN_ON_ONCE(sbi->s_es_nr_inode < 0);
	}
	spin_unlock(&sbi->s_es_lock);
}

static struct extent_status *
ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
		     ext4_fsblk_t pblk)
{
	struct extent_status *es;

	es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
	if (es == NULL)
		return NULL;
	es->es_lblk = lblk;
	es->es_len = len;
	es->es_pblk = pblk;

	/*
	 * We don't count delayed extents because we never try to reclaim them
	 */
	if (!ext4_es_is_delayed(es)) {
		if (!EXT4_I(inode)->i_es_shk_nr++)
			ext4_es_list_add(inode);
		percpu_counter_inc(&EXT4_SB(inode->i_sb)->
					s_es_stats.es_stats_shk_cnt);
	}

	EXT4_I(inode)->i_es_all_nr++;
	percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);

	return es;
}

static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
{
	EXT4_I(inode)->i_es_all_nr--;
	percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);

	/* Decrease the shrink counter when this es is not delayed */
	if (!ext4_es_is_delayed(es)) {
		BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
		if (!--EXT4_I(inode)->i_es_shk_nr)
			ext4_es_list_del(inode);
		percpu_counter_dec(&EXT4_SB(inode->i_sb)->
					s_es_stats.es_stats_shk_cnt);
	}

	kmem_cache_free(ext4_es_cachep, es);
}

/*
 * Check whether or not two extents can be merged
 * Condition:
 *  - logical block number is contiguous
 *  - physical block number is contiguous
 *  - status is equal
 */
static int ext4_es_can_be_merged(struct extent_status *es1,
				 struct extent_status *es2)
{
	if (ext4_es_type(es1) != ext4_es_type(es2))
		return 0;

	if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
		pr_warn("ES assertion failed when merging extents. "
			"The sum of lengths of es1 (%d) and es2 (%d) "
			"is bigger than allowed file size (%d)\n",
			es1->es_len, es2->es_len, EXT_MAX_BLOCKS);
		WARN_ON(1);
		return 0;
	}

	if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk)
		return 0;

	if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
	    (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2)))
		return 1;

	if (ext4_es_is_hole(es1))
		return 1;

	/* we need to check that a delayed extent is without unwritten status */
	if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1))
		return 1;

	return 0;
}

static struct extent_status *
ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
{
	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
	struct extent_status *es1;
	struct rb_node *node;

	node = rb_prev(&es->rb_node);
	if (!node)
		return es;

	es1 = rb_entry(node, struct extent_status, rb_node);
	if (ext4_es_can_be_merged(es1, es)) {
		es1->es_len += es->es_len;
		if (ext4_es_is_referenced(es))
			ext4_es_set_referenced(es1);
		rb_erase(&es->rb_node, &tree->root);
		ext4_es_free_extent(inode, es);
		es = es1;
	}

	return es;
}

static struct extent_status *
ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
{
	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
	struct extent_status *es1;
	struct rb_node *node;

	node = rb_next(&es->rb_node);
	if (!node)
		return es;

	es1 = rb_entry(node, struct extent_status, rb_node);
	if (ext4_es_can_be_merged(es, es1)) {
		es->es_len += es1->es_len;
		if (ext4_es_is_referenced(es1))
			ext4_es_set_referenced(es);
		rb_erase(node, &tree->root);
		ext4_es_free_extent(inode, es1);
	}

	return es;
}

#ifdef ES_AGGRESSIVE_TEST
#include "ext4_extents.h"	/* Needed when ES_AGGRESSIVE_TEST is defined */

static void ext4_es_insert_extent_ext_check(struct inode *inode,
					    struct extent_status *es)
{
	struct ext4_ext_path *path = NULL;
	struct ext4_extent *ex;
	ext4_lblk_t ee_block;
	ext4_fsblk_t ee_start;
	unsigned short ee_len;
	int depth, ee_status, es_status;

	path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
	if (IS_ERR(path))
		return;

	depth = ext_depth(inode);
	ex = path[depth].p_ext;

	if (ex) {
		ee_block = le32_to_cpu(ex->ee_block);
		ee_start = ext4_ext_pblock(ex);
		ee_len = ext4_ext_get_actual_len(ex);

		ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0;
		es_status = ext4_es_is_unwritten(es) ? 1 : 0;

		/*
		 * Make sure ex and es do not overlap when we try to insert
		 * a delayed/hole extent.
		 */
		if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
			if (in_range(es->es_lblk, ee_block, ee_len)) {
				pr_warn("ES insert assertion failed for "
					"inode: %lu we can find an extent "
					"at block [%d/%d/%llu/%c], but we "
					"want to add a delayed/hole extent "
					"[%d/%d/%llu/%x]\n",
					inode->i_ino, ee_block, ee_len,
					ee_start, ee_status ? 'u' : 'w',
					es->es_lblk, es->es_len,
					ext4_es_pblock(es), ext4_es_status(es));
			}
			goto out;
		}

		/*
		 * We don't check ee_block == es->es_lblk, etc. because es
		 * might be a part of a whole extent, or vice versa.
		 */
		if (es->es_lblk < ee_block ||
		    ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
			pr_warn("ES insert assertion failed for inode: %lu "
				"ex_status [%d/%d/%llu/%c] != "
				"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
				ee_block, ee_len, ee_start,
				ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
				ext4_es_pblock(es), es_status ? 'u' : 'w');
			goto out;
		}

		if (ee_status ^ es_status) {
			pr_warn("ES insert assertion failed for inode: %lu "
				"ex_status [%d/%d/%llu/%c] != "
				"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
				ee_block, ee_len, ee_start,
				ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
				ext4_es_pblock(es), es_status ? 'u' : 'w');
		}
	} else {
		/*
		 * We can't find an extent on disk.  So we need to make sure
		 * that we don't want to add a written/unwritten extent.
		 */
		if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
			pr_warn("ES insert assertion failed for inode: %lu "
				"can't find an extent at block %d but we want "
				"to add a written/unwritten extent "
				"[%d/%d/%llu/%x]\n", inode->i_ino,
				es->es_lblk, es->es_lblk, es->es_len,
				ext4_es_pblock(es), ext4_es_status(es));
		}
	}

out:
	ext4_ext_drop_refs(path);
	kfree(path);
}

static void ext4_es_insert_extent_ind_check(struct inode *inode,
					    struct extent_status *es)
{
	struct ext4_map_blocks map;
	int retval;

	/*
	 * Here we call ext4_ind_map_blocks to lookup a block mapping because
	 * 'Indirect' structure is defined in indirect.c.  So we couldn't
	 * access direct/indirect tree from outside.  It is too dirty to define
	 * this function in indirect.c file.
	 */
	map.m_lblk = es->es_lblk;
	map.m_len = es->es_len;

	retval = ext4_ind_map_blocks(NULL, inode, &map, 0);
	if (retval > 0) {
		if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) {
			/*
			 * We want to add a delayed/hole extent but this
			 * block has been allocated.
			 */
			pr_warn("ES insert assertion failed for inode: %lu "
				"We can find blocks but we want to add a "
				"delayed/hole extent [%d/%d/%llu/%x]\n",
				inode->i_ino, es->es_lblk, es->es_len,
				ext4_es_pblock(es), ext4_es_status(es));
			return;
		} else if (ext4_es_is_written(es)) {
			if (retval != es->es_len) {
				pr_warn("ES insert assertion failed for "
					"inode: %lu retval %d != es_len %d\n",
					inode->i_ino, retval, es->es_len);
				return;
			}
			if (map.m_pblk != ext4_es_pblock(es)) {
				pr_warn("ES insert assertion failed for "
					"inode: %lu m_pblk %llu != "
					"es_pblk %llu\n",
					inode->i_ino, map.m_pblk,
					ext4_es_pblock(es));
				return;
			}
		} else {
			/*
			 * We don't need to check unwritten extent because
			 * indirect-based file doesn't have it.
			 */
			BUG_ON(1);
		}
	} else if (retval == 0) {
		if (ext4_es_is_written(es)) {
			pr_warn("ES insert assertion failed for inode: %lu "
				"We can't find the block but we want to add "
				"a written extent [%d/%d/%llu/%x]\n",
				inode->i_ino, es->es_lblk, es->es_len,
				ext4_es_pblock(es), ext4_es_status(es));
			return;
		}
	}
}

static inline void ext4_es_insert_extent_check(struct inode *inode,
					       struct extent_status *es)
{
	/*
	 * We don't need to worry about the race condition because
	 * caller takes i_data_sem locking.
	 */
	BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		ext4_es_insert_extent_ext_check(inode, es);
	else
		ext4_es_insert_extent_ind_check(inode, es);
}
#else
static inline void ext4_es_insert_extent_check(struct inode *inode,
					       struct extent_status *es)
{
}
#endif

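/*
 * Insert @newes into the rb-tree, merging it with an existing neighbour
 * whenever ext4_es_can_be_merged() allows it, so that adjacent extents
 * with the same status are kept as a single node.
 */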
static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
{
	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
	struct rb_node **p = &tree->root.rb_node;
	struct rb_node *parent = NULL;
	struct extent_status *es;

	while (*p) {
		parent = *p;
		es = rb_entry(parent, struct extent_status, rb_node);

		if (newes->es_lblk < es->es_lblk) {
			if (ext4_es_can_be_merged(newes, es)) {
				/*
				 * Here we can modify es_lblk directly
				 * because it isn't overlapped.
				 */
				es->es_lblk = newes->es_lblk;
				es->es_len += newes->es_len;
				if (ext4_es_is_written(es) ||
				    ext4_es_is_unwritten(es))
					ext4_es_store_pblock(es,
							     newes->es_pblk);

				es = ext4_es_try_to_merge_left(inode, es);
				goto out;
			}
			p = &(*p)->rb_left;
		} else if (newes->es_lblk > ext4_es_end(es)) {
			if (ext4_es_can_be_merged(es, newes)) {
				es->es_len += newes->es_len;
				es = ext4_es_try_to_merge_right(inode, es);
				goto out;
			}
			p = &(*p)->rb_right;
		} else {
			BUG_ON(1);
			return -EINVAL;
		}
	}

	es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len,
				  newes->es_pblk);
	if (!es)
		return -ENOMEM;
	rb_link_node(&es->rb_node, parent, p);
	rb_insert_color(&es->rb_node, &tree->root);

out:
	tree->cache_es = es;
	return 0;
}

/*
 * ext4_es_insert_extent() adds information to an inode's extent
 * status tree.
 *
 * Return 0 on success, error code on failure.
 */
int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
			  ext4_lblk_t len, ext4_fsblk_t pblk,
			  unsigned int status)
{
	struct extent_status newes;
	ext4_lblk_t end = lblk + len - 1;
	int err = 0;

	es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
		 lblk, len, pblk, status, inode->i_ino);

	if (!len)
		return 0;

	BUG_ON(end < lblk);

	if ((status & EXTENT_STATUS_DELAYED) &&
	    (status & EXTENT_STATUS_WRITTEN)) {
		ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as "
				" delayed and written which can potentially "
				" cause data loss.", lblk, len);
		WARN_ON(1);
	}

	newes.es_lblk = lblk;
	newes.es_len = len;
	ext4_es_store_pblock_status(&newes, pblk, status);
	trace_ext4_es_insert_extent(inode, &newes);

	ext4_es_insert_extent_check(inode, &newes);

	write_lock(&EXT4_I(inode)->i_es_lock);
	err = __es_remove_extent(inode, lblk, end);
	if (err != 0)
		goto error;
retry:
	err = __es_insert_extent(inode, &newes);
	if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
					  128, EXT4_I(inode)))
		goto retry;
	if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
		err = 0;

error:
	write_unlock(&EXT4_I(inode)->i_es_lock);

	ext4_es_print_tree(inode);

	return err;
}

/*
 * ext4_es_cache_extent() inserts information into the extent status
 * tree if and only if there isn't information about the range in
 * question already.
 */
void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
			  ext4_lblk_t len, ext4_fsblk_t pblk,
			  unsigned int status)
{
	struct extent_status *es;
	struct extent_status newes;
	ext4_lblk_t end = lblk + len - 1;

	newes.es_lblk = lblk;
	newes.es_len = len;
	ext4_es_store_pblock_status(&newes, pblk, status);
	trace_ext4_es_cache_extent(inode, &newes);

	if (!len)
		return;

	BUG_ON(end < lblk);

	write_lock(&EXT4_I(inode)->i_es_lock);

	es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
	if (!es || es->es_lblk > end)
		__es_insert_extent(inode, &newes);
	write_unlock(&EXT4_I(inode)->i_es_lock);
}

/*
 * ext4_es_lookup_extent() looks up an extent in the extent status tree.
 *
 * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
 *
 * Return: 1 if found, 0 if not
 */
int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
			  struct extent_status *es)
{
	struct ext4_es_tree *tree;
	struct ext4_es_stats *stats;
	struct extent_status *es1 = NULL;
	struct rb_node *node;
	int found = 0;

	trace_ext4_es_lookup_extent_enter(inode, lblk);
	es_debug("lookup extent in block %u\n", lblk);

	tree = &EXT4_I(inode)->i_es_tree;
	read_lock(&EXT4_I(inode)->i_es_lock);

	/* find extent in cache firstly */
	es->es_lblk = es->es_len = es->es_pblk = 0;
	if (tree->cache_es) {
		es1 = tree->cache_es;
		if (in_range(lblk, es1->es_lblk, es1->es_len)) {
			es_debug("%u cached by [%u/%u)\n",
				 lblk, es1->es_lblk, es1->es_len);
			found = 1;
			goto out;
		}
	}

	node = tree->root.rb_node;
	while (node) {
		es1 = rb_entry(node, struct extent_status, rb_node);
		if (lblk < es1->es_lblk)
			node = node->rb_left;
		else if (lblk > ext4_es_end(es1))
			node = node->rb_right;
		else {
			found = 1;
			break;
		}
	}

out:
	stats = &EXT4_SB(inode->i_sb)->s_es_stats;
	if (found) {
		BUG_ON(!es1);
		es->es_lblk = es1->es_lblk;
		es->es_len = es1->es_len;
		es->es_pblk = es1->es_pblk;
		if (!ext4_es_is_referenced(es1))
			ext4_es_set_referenced(es1);
		stats->es_stats_cache_hits++;
	} else {
		stats->es_stats_cache_misses++;
	}

	read_unlock(&EXT4_I(inode)->i_es_lock);

	trace_ext4_es_lookup_extent_exit(inode, es, found);
	return found;
}

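/*
 * Remove the status for the range [lblk, end] from the tree, splitting an
 * existing extent into two pieces when the range lands in its middle.  The
 * split may need a new node; on -ENOMEM we poke the shrinker and retry.
 */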
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
			      ext4_lblk_t end)
{
	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
	struct rb_node *node;
	struct extent_status *es;
	struct extent_status orig_es;
	ext4_lblk_t len1, len2;
	ext4_fsblk_t block;
	int err;

retry:
	err = 0;
	es = __es_tree_search(&tree->root, lblk);
	if (!es)
		goto out;
	if (es->es_lblk > end)
		goto out;

	/* Simply invalidate cache_es. */
	tree->cache_es = NULL;

	orig_es.es_lblk = es->es_lblk;
	orig_es.es_len = es->es_len;
	orig_es.es_pblk = es->es_pblk;

	len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
	len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0;
	if (len1 > 0)
		es->es_len = len1;
	if (len2 > 0) {
		if (len1 > 0) {
			struct extent_status newes;

			newes.es_lblk = end + 1;
			newes.es_len = len2;
			block = 0x7FDEADBEEFULL;
			if (ext4_es_is_written(&orig_es) ||
			    ext4_es_is_unwritten(&orig_es))
				block = ext4_es_pblock(&orig_es) +
					orig_es.es_len - len2;
			ext4_es_store_pblock_status(&newes, block,
						    ext4_es_status(&orig_es));
			err = __es_insert_extent(inode, &newes);
			if (err) {
				es->es_lblk = orig_es.es_lblk;
				es->es_len = orig_es.es_len;
				if ((err == -ENOMEM) &&
				    __es_shrink(EXT4_SB(inode->i_sb),
						128, EXT4_I(inode)))
					goto retry;
				goto out;
			}
		} else {
			es->es_lblk = end + 1;
			es->es_len = len2;
			if (ext4_es_is_written(es) ||
			    ext4_es_is_unwritten(es)) {
				block = orig_es.es_pblk + orig_es.es_len - len2;
				ext4_es_store_pblock(es, block);
			}
		}
		goto out;
	}

	if (len1 > 0) {
		node = rb_next(&es->rb_node);
		if (node)
			es = rb_entry(node, struct extent_status, rb_node);
		else
			es = NULL;
	}

	while (es && ext4_es_end(es) <= end) {
		node = rb_next(&es->rb_node);
		rb_erase(&es->rb_node, &tree->root);
		ext4_es_free_extent(inode, es);
		if (!node) {
			es = NULL;
			break;
		}
		es = rb_entry(node, struct extent_status, rb_node);
	}

	if (es && es->es_lblk < end + 1) {
		ext4_lblk_t orig_len = es->es_len;

		len1 = ext4_es_end(es) - end;
		es->es_lblk = end + 1;
		es->es_len = len1;
		if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
			block = es->es_pblk + orig_len - len1;
			ext4_es_store_pblock(es, block);
		}
	}

out:
	return err;
}

/*
 * ext4_es_remove_extent() removes a space from an extent status tree.
 *
 * Return 0 on success, error code on failure.
 */
int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
			  ext4_lblk_t len)
{
	ext4_lblk_t end;
	int err = 0;

	trace_ext4_es_remove_extent(inode, lblk, len);
	es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
		 lblk, len, inode->i_ino);

	if (!len)
		return err;

	end = lblk + len - 1;
	BUG_ON(end < lblk);

	/*
	 * ext4_clear_inode() depends on us taking i_es_lock unconditionally
	 * so that we are sure __es_shrink() is done with the inode before it
	 * is reclaimed.
	 */
	write_lock(&EXT4_I(inode)->i_es_lock);
	err = __es_remove_extent(inode, lblk, end);
	write_unlock(&EXT4_I(inode)->i_es_lock);
	ext4_es_print_tree(inode);
	return err;
}

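/*
 * Walk sbi->s_es_list round-robin and reclaim up to @nr_to_scan extents.
 * Precached inodes are skipped on the first pass and retried only if nothing
 * could be reclaimed; @locked_ei (whose i_es_lock the caller already holds)
 * is never walked but is reclaimed directly as a last resort.  Scan time and
 * the number of shrunk objects feed the moving averages in s_es_stats.
 */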
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
		       struct ext4_inode_info *locked_ei)
{
	struct ext4_inode_info *ei;
	struct ext4_es_stats *es_stats;
	ktime_t start_time;
	u64 scan_time;
	int nr_to_walk;
	int nr_shrunk = 0;
	int retried = 0, nr_skipped = 0;

	es_stats = &sbi->s_es_stats;
	start_time = ktime_get();

retry:
	spin_lock(&sbi->s_es_lock);
	nr_to_walk = sbi->s_es_nr_inode;
	while (nr_to_walk-- > 0) {
		if (list_empty(&sbi->s_es_list)) {
			spin_unlock(&sbi->s_es_lock);
			goto out;
		}
		ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
				      i_es_list);
		/* Move the inode to the tail */
		list_move_tail(&ei->i_es_list, &sbi->s_es_list);

		/*
		 * Normally we try hard to avoid shrinking precached inodes,
		 * but we will as a last resort.
		 */
		if (!retried && ext4_test_inode_state(&ei->vfs_inode,
						EXT4_STATE_EXT_PRECACHED)) {
			nr_skipped++;
			continue;
		}

		if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) {
			nr_skipped++;
			continue;
		}
		/*
		 * Now we hold i_es_lock which protects us from inode reclaim
		 * freeing inode under us
		 */
		spin_unlock(&sbi->s_es_lock);

		nr_shrunk += es_reclaim_extents(ei, &nr_to_scan);
		write_unlock(&ei->i_es_lock);

		if (nr_to_scan <= 0)
			goto out;
		spin_lock(&sbi->s_es_lock);
	}
	spin_unlock(&sbi->s_es_lock);

	/*
	 * If we skipped any inodes, and we weren't able to make any
	 * forward progress, try again to scan precached inodes.
	 */
	if ((nr_shrunk == 0) && nr_skipped && !retried) {
		retried++;
		goto retry;
	}

	if (locked_ei && nr_shrunk == 0)
		nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan);

out:
	scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	if (likely(es_stats->es_stats_scan_time))
		es_stats->es_stats_scan_time = (scan_time +
				es_stats->es_stats_scan_time*3) / 4;
	else
		es_stats->es_stats_scan_time = scan_time;
	if (scan_time > es_stats->es_stats_max_scan_time)
		es_stats->es_stats_max_scan_time = scan_time;
	if (likely(es_stats->es_stats_shrunk))
		es_stats->es_stats_shrunk = (nr_shrunk +
				es_stats->es_stats_shrunk*3) / 4;
	else
		es_stats->es_stats_shrunk = nr_shrunk;

	trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time,
			     nr_skipped, retried);
	return nr_shrunk;
}

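/*
 * Shrinker callbacks: ext4_es_count() reports how many reclaimable extents
 * exist (from the es_stats_shk_cnt percpu counter), and ext4_es_scan()
 * actually reclaims up to sc->nr_to_scan of them via __es_shrink().
 */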
static unsigned long ext4_es_count(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	unsigned long nr;
	struct ext4_sb_info *sbi;

	sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
	nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
	trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
	return nr;
}

static unsigned long ext4_es_scan(struct shrinker *shrink,
				  struct shrink_control *sc)
{
	struct ext4_sb_info *sbi = container_of(shrink,
					struct ext4_sb_info, s_es_shrinker);
	int nr_to_scan = sc->nr_to_scan;
	int ret, nr_shrunk;

	ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
	trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);

	if (!nr_to_scan)
		return ret;

	nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL);

	trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
	return nr_shrunk;
}

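/*
 * seq_file show handler that dumps extent status tree statistics: object
 * counts, cache hit/miss numbers, scan times, and the inode currently
 * holding the most objects.
 */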
int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *) seq->private);
	struct ext4_es_stats *es_stats = &sbi->s_es_stats;
	struct ext4_inode_info *ei, *max = NULL;
	unsigned int inode_cnt = 0;

	if (v != SEQ_START_TOKEN)
		return 0;

	/* here we just find an inode that has the max nr. of objects */
	spin_lock(&sbi->s_es_lock);
	list_for_each_entry(ei, &sbi->s_es_list, i_es_list) {
		inode_cnt++;
		if (max && max->i_es_all_nr < ei->i_es_all_nr)
			max = ei;
		else if (!max)
			max = ei;
	}
	spin_unlock(&sbi->s_es_lock);

	seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n",
		   percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
		   percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
	seq_printf(seq, " %lu/%lu cache hits/misses\n",
		   es_stats->es_stats_cache_hits,
		   es_stats->es_stats_cache_misses);
	if (inode_cnt)
		seq_printf(seq, " %d inodes on list\n", inode_cnt);

	seq_printf(seq, "average:\n %llu us scan time\n",
		   div_u64(es_stats->es_stats_scan_time, 1000));
	seq_printf(seq, " %lu shrunk objects\n", es_stats->es_stats_shrunk);
	if (inode_cnt)
		seq_printf(seq,
			   "maximum:\n %lu inode (%u objects, %u reclaimable)\n"
			   " %llu us max scan time\n",
			   max->vfs_inode.i_ino, max->i_es_all_nr,
			   max->i_es_shk_nr,
			   div_u64(es_stats->es_stats_max_scan_time, 1000));

	return 0;
}

int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
{
	int err;

	/* Make sure we have enough bits for physical block number */
	BUILD_BUG_ON(ES_SHIFT < 48);
	INIT_LIST_HEAD(&sbi->s_es_list);
	sbi->s_es_nr_inode = 0;
	spin_lock_init(&sbi->s_es_lock);
	sbi->s_es_stats.es_stats_shrunk = 0;
	sbi->s_es_stats.es_stats_cache_hits = 0;
	sbi->s_es_stats.es_stats_cache_misses = 0;
	sbi->s_es_stats.es_stats_scan_time = 0;
	sbi->s_es_stats.es_stats_max_scan_time = 0;
	err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
	if (err)
		return err;

	err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL);
	if (err)
		goto err1;

	sbi->s_es_shrinker.scan_objects = ext4_es_scan;
	sbi->s_es_shrinker.count_objects = ext4_es_count;
	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
	err = register_shrinker(&sbi->s_es_shrinker);
	if (err)
		goto err2;

	return 0;

err2:
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
err1:
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
	return err;
}

void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
{
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
	unregister_shrinker(&sbi->s_es_shrinker);
}

/*
 * Shrink extents in given inode from ei->i_es_shrink_lblk till end.  Scan at
 * most *nr_to_scan extents, update *nr_to_scan accordingly.
 *
 * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan.
 * Increment *nr_shrunk by the number of reclaimed extents.  Also update
 * ei->i_es_shrink_lblk to where we should continue scanning.
 */
static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
				 int *nr_to_scan, int *nr_shrunk)
{
	struct inode *inode = &ei->vfs_inode;
	struct ext4_es_tree *tree = &ei->i_es_tree;
	struct extent_status *es;
	struct rb_node *node;

	es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
	if (!es)
		goto out_wrap;
	node = &es->rb_node;
	while (*nr_to_scan > 0) {
		if (es->es_lblk > end) {
			ei->i_es_shrink_lblk = end + 1;
			return 0;
		}

		(*nr_to_scan)--;
		node = rb_next(&es->rb_node);
		/*
		 * We can't reclaim delayed extents from the status tree
		 * because fiemap, bigalloc, and seek_data/hole need to use
		 * them.
		 */
		if (ext4_es_is_delayed(es))
			goto next;
		if (ext4_es_is_referenced(es)) {
			ext4_es_clear_referenced(es);
			goto next;
		}

		rb_erase(&es->rb_node, &tree->root);
		ext4_es_free_extent(inode, es);
		(*nr_shrunk)++;
next:
		if (!node)
			goto out_wrap;
		es = rb_entry(node, struct extent_status, rb_node);
	}
	ei->i_es_shrink_lblk = es->es_lblk;
	return 1;
out_wrap:
	ei->i_es_shrink_lblk = 0;
	return 0;
}

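/*
 * Reclaim extents from one inode, resuming from ei->i_es_shrink_lblk and
 * wrapping around to the start of the tree if the end is reached before
 * *nr_to_scan is used up.  Returns the number of extents freed.
 */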
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
{
	struct inode *inode = &ei->vfs_inode;
	int nr_shrunk = 0;
	ext4_lblk_t start = ei->i_es_shrink_lblk;
	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	if (ei->i_es_shk_nr == 0)
		return 0;

	if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
	    __ratelimit(&_rs))
		ext4_warning(inode->i_sb, "forced shrink of precached extents");

	if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) &&
	    start != 0)
		es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk);

	ei->i_es_tree.cache_es = NULL;
	return nr_shrunk;
}