ialloc.c 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400
  1. /*
  2. * linux/fs/ext4/ialloc.c
  3. *
  4. * Copyright (C) 1992, 1993, 1994, 1995
  5. * Remy Card (card@masi.ibp.fr)
  6. * Laboratoire MASI - Institut Blaise Pascal
  7. * Universite Pierre et Marie Curie (Paris VI)
  8. *
  9. * BSD ufs-inspired inode and directory allocation by
  10. * Stephen Tweedie (sct@redhat.com), 1993
  11. * Big-endian to little-endian byte-swapping/bitmaps by
  12. * David S. Miller (davem@caip.rutgers.edu), 1995
  13. */
  14. #include <linux/time.h>
  15. #include <linux/fs.h>
  16. #include <linux/stat.h>
  17. #include <linux/string.h>
  18. #include <linux/quotaops.h>
  19. #include <linux/buffer_head.h>
  20. #include <linux/random.h>
  21. #include <linux/bitops.h>
  22. #include <linux/blkdev.h>
  23. #include <linux/cred.h>
  24. #include <asm/byteorder.h>
  25. #include "ext4.h"
  26. #include "ext4_jbd2.h"
  27. #include "xattr.h"
  28. #include "acl.h"
  29. #include <trace/events/ext4.h>
  30. /*
  31. * ialloc.c contains the inodes allocation and deallocation routines
  32. */
  33. /*
  34. * The free inodes are managed by bitmaps. A file system contains several
  35. * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap
  36. * block for inodes, N blocks for the inode table and data blocks.
  37. *
  38. * The file system contains group descriptors which are located after the
  39. * super block. Each descriptor contains the number of the bitmap block and
  40. * the free blocks count in the block.
  41. */
  42. /*
  43. * To avoid calling the atomic setbit hundreds or thousands of times, we only
  44. * need to use it within a single byte (to ensure we get endianness right).
  45. * We can use memset for the rest of the bitmap as there are no other users.
  46. */
  47. void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
  48. {
  49. int i;
  50. if (start_bit >= end_bit)
  51. return;
  52. ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);
  53. for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++)
  54. ext4_set_bit(i, bitmap);
  55. if (i < end_bit)
  56. memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
  57. }
/*
 * Initializes an uninitialized inode bitmap.
 *
 * Caller must hold the buffer lock on @bh (asserted below).  On success
 * the bitmap is zeroed for the real inodes, the unused tail of the block
 * is marked in-use, and both the bitmap and group-descriptor checksums
 * are recomputed.  Returns 0 on success or -EFSBADCRC if the group
 * descriptor checksum does not verify.
 */
static int ext4_init_inode_bitmap(struct super_block *sb,
				       struct buffer_head *bh,
				       ext4_group_t block_group,
				       struct ext4_group_desc *gdp)
{
	struct ext4_group_info *grp;
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	J_ASSERT_BH(bh, buffer_locked(bh));

	/* If checksum is bad mark all blocks and inodes use to prevent
	 * allocation, essentially implementing a per-group read-only flag. */
	if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
		grp = ext4_get_group_info(sb, block_group);
		/*
		 * Deduct this group's free clusters/inodes from the global
		 * counters exactly once (guarded by the CORRUPT bits) so the
		 * fs-wide free counts stay consistent once the group is
		 * taken offline.
		 */
		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
			percpu_counter_sub(&sbi->s_freeclusters_counter,
					   grp->bb_free);
		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
		if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
			int count;
			count = ext4_free_inodes_count(sb, gdp);
			percpu_counter_sub(&sbi->s_freeinodes_counter,
					   count);
		}
		set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
		return -EFSBADCRC;
	}

	/* All inodes free: clear one bit per inode (rounded up to bytes). */
	memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
	/* Mark the slack between the last inode bit and end of block used. */
	ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
			bh->b_data);
	ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh,
				   EXT4_INODES_PER_GROUP(sb) / 8);
	ext4_group_desc_csum_set(sb, block_group, gdp);

	return 0;
}
/*
 * I/O completion handler for an inode bitmap read (set as b_end_io in
 * ext4_read_inode_bitmap()).  On a successful read, both the generic
 * buffer-uptodate flag and the bitmap-uptodate flag are set before the
 * buffer lock is dropped; the reference taken before submit_bh() is
 * released either way.
 */
void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
{
	if (uptodate) {
		set_buffer_uptodate(bh);
		set_bitmap_uptodate(bh);
	}
	unlock_buffer(bh);
	put_bh(bh);
}
/*
 * Verify the checksum of an inode bitmap buffer for @block_group.
 *
 * Fast paths: a buffer already marked verified returns 0 immediately, and
 * a group already flagged as having a corrupt inode bitmap returns
 * -EFSCORRUPTED without touching the buffer.  Otherwise the checksum is
 * checked under the group lock; on mismatch the group is flagged corrupt,
 * its free-inode count is deducted from the global counter (once), and
 * -EFSBADCRC is returned.
 */
static int ext4_validate_inode_bitmap(struct super_block *sb,
				      struct ext4_group_desc *desc,
				      ext4_group_t block_group,
				      struct buffer_head *bh)
{
	ext4_fsblk_t blk;
	struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (buffer_verified(bh))
		return 0;
	if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
		return -EFSCORRUPTED;

	ext4_lock_group(sb, block_group);
	blk = ext4_inode_bitmap(sb, desc);
	if (!ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
					   EXT4_INODES_PER_GROUP(sb) / 8)) {
		/* Drop the group lock before ext4_error(), which may sleep. */
		ext4_unlock_group(sb, block_group);
		ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
			   "inode_bitmap = %llu", block_group, blk);
		grp = ext4_get_group_info(sb, block_group);
		/* Deduct free inodes from the global counter only once. */
		if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
			int count;
			count = ext4_free_inodes_count(sb, desc);
			percpu_counter_sub(&sbi->s_freeinodes_counter,
					   count);
		}
		set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
		return -EFSBADCRC;
	}
	set_buffer_verified(bh);
	ext4_unlock_group(sb, block_group);
	return 0;
}
/*
 * Read the inode allocation bitmap for a given block_group, reading
 * into the specified slot in the superblock's bitmap cache.
 *
 * Return buffer_head of bitmap on success, or an ERR_PTR on failure
 * (note: despite the older comment text, this never returns NULL).
 */
static struct buffer_head *
ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
{
	struct ext4_group_desc *desc;
	struct buffer_head *bh = NULL;
	ext4_fsblk_t bitmap_blk;
	int err;

	desc = ext4_get_group_desc(sb, block_group, NULL);
	if (!desc)
		return ERR_PTR(-EFSCORRUPTED);

	bitmap_blk = ext4_inode_bitmap(sb, desc);
	bh = sb_getblk(sb, bitmap_blk);
	if (unlikely(!bh)) {
		ext4_error(sb, "Cannot read inode bitmap - "
			    "block_group = %u, inode_bitmap = %llu",
			    block_group, bitmap_blk);
		return ERR_PTR(-EIO);
	}

	/*
	 * Double-checked locking on bitmap_uptodate: test once without the
	 * buffer lock, then re-test under it, since another task may have
	 * initialized or read the bitmap while we waited for the lock.
	 */
	if (bitmap_uptodate(bh))
		goto verify;

	lock_buffer(bh);
	if (bitmap_uptodate(bh)) {
		unlock_buffer(bh);
		goto verify;
	}

	ext4_lock_group(sb, block_group);
	if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
		/* Group never used: build the bitmap in memory, no disk read. */
		err = ext4_init_inode_bitmap(sb, bh, block_group, desc);
		set_bitmap_uptodate(bh);
		set_buffer_uptodate(bh);
		set_buffer_verified(bh);
		ext4_unlock_group(sb, block_group);
		unlock_buffer(bh);
		if (err) {
			ext4_error(sb, "Failed to init inode bitmap for group "
				   "%u: %d", block_group, err);
			goto out;
		}
		return bh;
	}
	ext4_unlock_group(sb, block_group);

	if (buffer_uptodate(bh)) {
		/*
		 * if not uninit if bh is uptodate,
		 * bitmap is also uptodate
		 */
		set_bitmap_uptodate(bh);
		unlock_buffer(bh);
		goto verify;
	}
	/*
	 * submit the buffer_head for reading
	 */
	trace_ext4_load_inode_bitmap(sb, block_group);
	bh->b_end_io = ext4_end_bitmap_read;
	get_bh(bh);	/* extra ref dropped by the completion handler */
	submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
	wait_on_buffer(bh);
	if (!buffer_uptodate(bh)) {
		put_bh(bh);
		ext4_error(sb, "Cannot read inode bitmap - "
			   "block_group = %u, inode_bitmap = %llu",
			   block_group, bitmap_blk);
		return ERR_PTR(-EIO);
	}

verify:
	err = ext4_validate_inode_bitmap(sb, desc, block_group, bh);
	if (err)
		goto out;
	return bh;
out:
	put_bh(bh);
	return ERR_PTR(err);
}
/*
 * NOTE! When we get the inode, we're the only people
 * that have access to it, and as such there are no
 * race conditions we have to worry about. The inode
 * is not on the hash-lists, and it cannot be reached
 * through the filesystem because the directory entry
 * has been deleted earlier.
 *
 * HOWEVER: we must make sure that we get no aliases,
 * which means that we have to call "clear_inode()"
 * _before_ we mark the inode not in use in the inode
 * bitmaps. Otherwise a newly created file might use
 * the same inode number (not actually the same pointer
 * though), and then we'd have two inodes sharing the
 * same inode number and space on the harddisk.
 */
void ext4_free_inode(handle_t *handle, struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	int is_directory;
	unsigned long ino;
	struct buffer_head *bitmap_bh = NULL;
	struct buffer_head *bh2;
	ext4_group_t block_group;
	unsigned long bit;
	struct ext4_group_desc *gdp;
	struct ext4_super_block *es;
	struct ext4_sb_info *sbi;
	int fatal = 0, err, count, cleared;
	struct ext4_group_info *grp;

	/* Sanity checks: freeing a still-referenced or still-linked inode
	 * is a caller bug; log and bail rather than corrupt the bitmap. */
	if (!sb) {
		printk(KERN_ERR "EXT4-fs: %s:%d: inode on "
		       "nonexistent device\n", __func__, __LINE__);
		return;
	}
	if (atomic_read(&inode->i_count) > 1) {
		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d",
			 __func__, __LINE__, inode->i_ino,
			 atomic_read(&inode->i_count));
		return;
	}
	if (inode->i_nlink) {
		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n",
			 __func__, __LINE__, inode->i_ino, inode->i_nlink);
		return;
	}

	sbi = EXT4_SB(sb);

	ino = inode->i_ino;
	ext4_debug("freeing inode %lu\n", ino);
	trace_ext4_free_inode(inode);

	/*
	 * Note: we must free any quota before locking the superblock,
	 * as writing the quota to disk may need the lock as well.
	 */
	dquot_initialize(inode);
	ext4_xattr_delete_inode(handle, inode);
	dquot_free_inode(inode);
	dquot_drop(inode);

	is_directory = S_ISDIR(inode->i_mode);

	/* Do this BEFORE marking the inode not in use or returning an error */
	ext4_clear_inode(inode);

	es = EXT4_SB(sb)->s_es;
	if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
		ext4_error(sb, "reserved or nonexistent inode %lu", ino);
		goto error_return;
	}
	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
	bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
	/* Don't bother if the inode bitmap is corrupt. */
	grp = ext4_get_group_info(sb, block_group);
	if (IS_ERR(bitmap_bh)) {
		fatal = PTR_ERR(bitmap_bh);
		bitmap_bh = NULL;
		goto error_return;
	}
	if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
		fatal = -EFSCORRUPTED;
		goto error_return;
	}

	/* Journal both the bitmap block and the group descriptor block
	 * before modifying either. */
	BUFFER_TRACE(bitmap_bh, "get_write_access");
	fatal = ext4_journal_get_write_access(handle, bitmap_bh);
	if (fatal)
		goto error_return;

	fatal = -ESRCH;
	gdp = ext4_get_group_desc(sb, block_group, &bh2);
	if (gdp) {
		BUFFER_TRACE(bh2, "get_write_access");
		fatal = ext4_journal_get_write_access(handle, bh2);
	}
	ext4_lock_group(sb, block_group);
	/* If the bit was already clear, someone else freed this inode:
	 * that is on-disk corruption, handled in the !cleared path below. */
	cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data);
	if (fatal || !cleared) {
		ext4_unlock_group(sb, block_group);
		goto out;
	}

	/* Update per-group counters and checksums under the group lock. */
	count = ext4_free_inodes_count(sb, gdp) + 1;
	ext4_free_inodes_set(sb, gdp, count);
	if (is_directory) {
		count = ext4_used_dirs_count(sb, gdp) - 1;
		ext4_used_dirs_set(sb, gdp, count);
		percpu_counter_dec(&sbi->s_dirs_counter);
	}
	ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh,
				   EXT4_INODES_PER_GROUP(sb) / 8);
	ext4_group_desc_csum_set(sb, block_group, gdp);
	ext4_unlock_group(sb, block_group);

	percpu_counter_inc(&sbi->s_freeinodes_counter);
	if (sbi->s_log_groups_per_flex) {
		ext4_group_t f = ext4_flex_group(sbi, block_group);

		atomic_inc(&sbi->s_flex_groups[f].free_inodes);
		if (is_directory)
			atomic_dec(&sbi->s_flex_groups[f].used_dirs);
	}
	BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
	fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
out:
	if (cleared) {
		BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
		err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
		if (!fatal)
			fatal = err;
	} else {
		/* Double-free: mark the whole group's inode bitmap corrupt
		 * and deduct its free inodes from the global counter once. */
		ext4_error(sb, "bit already cleared for inode %lu", ino);
		if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
			int count;
			count = ext4_free_inodes_count(sb, gdp);
			percpu_counter_sub(&sbi->s_freeinodes_counter,
					   count);
		}
		set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
	}

error_return:
	brelse(bitmap_bh);
	ext4_std_error(sb, fatal);
}
/*
 * Snapshot of the allocation statistics for one block group or one
 * flex_bg, filled in by get_orlov_stats() for the Orlov allocator.
 */
struct orlov_stats {
	__u64 free_clusters;	/* free clusters in the (flex) group */
	__u32 free_inodes;	/* free inodes in the (flex) group */
	__u32 used_dirs;	/* directories allocated in the (flex) group */
};
  355. /*
  356. * Helper function for Orlov's allocator; returns critical information
  357. * for a particular block group or flex_bg. If flex_size is 1, then g
  358. * is a block group number; otherwise it is flex_bg number.
  359. */
  360. static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
  361. int flex_size, struct orlov_stats *stats)
  362. {
  363. struct ext4_group_desc *desc;
  364. struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
  365. if (flex_size > 1) {
  366. stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
  367. stats->free_clusters = atomic64_read(&flex_group[g].free_clusters);
  368. stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
  369. return;
  370. }
  371. desc = ext4_get_group_desc(sb, g, NULL);
  372. if (desc) {
  373. stats->free_inodes = ext4_free_inodes_count(sb, desc);
  374. stats->free_clusters = ext4_free_group_clusters(sb, desc);
  375. stats->used_dirs = ext4_used_dirs_count(sb, desc);
  376. } else {
  377. stats->free_inodes = 0;
  378. stats->free_clusters = 0;
  379. stats->used_dirs = 0;
  380. }
  381. }
/*
 * Orlov's allocator for directories.
 *
 * We always try to spread first-level directories.
 *
 * If there are blockgroups with both free inodes and free blocks counts
 * not worse than average we return one with smallest directory count.
 * Otherwise we simply return a random group.
 *
 * For the rest rules look so:
 *
 * It's OK to put directory into a group unless
 * it has too many directories already (max_dirs) or
 * it has too few free inodes left (min_inodes) or
 * it has too few free blocks left (min_blocks) or
 * Parent's group is preferred, if it doesn't satisfy these
 * conditions we search cyclically through the rest. If none
 * of the groups look good we just look for a group with more
 * free inodes than average (starting at parent's group).
 *
 * Returns 0 with *group set on success, -1 if no group could be found.
 */
static int find_group_orlov(struct super_block *sb, struct inode *parent,
			    ext4_group_t *group, umode_t mode,
			    const struct qstr *qstr)
{
	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	ext4_group_t real_ngroups = ext4_get_groups_count(sb);
	int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
	unsigned int freei, avefreei, grp_free;
	ext4_fsblk_t freeb, avefreec;
	unsigned int ndirs;
	int max_dirs, min_inodes;
	ext4_grpblk_t min_clusters;
	ext4_group_t i, grp, g, ngroups;
	struct ext4_group_desc *desc;
	struct orlov_stats stats;
	int flex_size = ext4_flex_bg_size(sbi);
	struct dx_hash_info hinfo;

	/* With flex_bg, the search operates on flex groups, not groups. */
	ngroups = real_ngroups;
	if (flex_size > 1) {
		ngroups = (real_ngroups + flex_size - 1) >>
			sbi->s_log_groups_per_flex;
		parent_group >>= sbi->s_log_groups_per_flex;
	}

	/* Filesystem-wide averages used as admission thresholds below. */
	freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
	avefreei = freei / ngroups;
	freeb = EXT4_C2B(sbi,
		percpu_counter_read_positive(&sbi->s_freeclusters_counter));
	avefreec = freeb;
	do_div(avefreec, ngroups);
	ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);

	if (S_ISDIR(mode) &&
	    ((parent == d_inode(sb->s_root)) ||
	     (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
		int best_ndir = inodes_per_group;
		int ret = -1;

		/*
		 * Top-level directories: start at a pseudo-random group
		 * (hashed from the name when available) and pick the
		 * above-average group with the fewest directories.
		 */
		if (qstr) {
			hinfo.hash_version = DX_HASH_HALF_MD4;
			hinfo.seed = sbi->s_hash_seed;
			ext4fs_dirhash(qstr->name, qstr->len, &hinfo);
			grp = hinfo.hash;
		} else
			grp = prandom_u32();
		parent_group = (unsigned)grp % ngroups;
		for (i = 0; i < ngroups; i++) {
			g = (parent_group + i) % ngroups;
			get_orlov_stats(sb, g, flex_size, &stats);
			if (!stats.free_inodes)
				continue;
			if (stats.used_dirs >= best_ndir)
				continue;
			if (stats.free_inodes < avefreei)
				continue;
			if (stats.free_clusters < avefreec)
				continue;
			grp = g;
			ret = 0;
			best_ndir = stats.used_dirs;
		}
		if (ret)
			goto fallback;
	found_flex_bg:
		/* Also reached (via goto) from the non-topdir loop below. */
		if (flex_size == 1) {
			*group = grp;
			return 0;
		}

		/*
		 * We pack inodes at the beginning of the flexgroup's
		 * inode tables.  Block allocation decisions will do
		 * something similar, although regular files will
		 * start at 2nd block group of the flexgroup.  See
		 * ext4_ext_find_goal() and ext4_find_near().
		 */
		grp *= flex_size;
		for (i = 0; i < flex_size; i++) {
			if (grp+i >= real_ngroups)
				break;
			desc = ext4_get_group_desc(sb, grp+i, NULL);
			if (desc && ext4_free_inodes_count(sb, desc)) {
				*group = grp+i;
				return 0;
			}
		}
		goto fallback;
	}

	/* Non-topdir case: derive per-group admission thresholds. */
	max_dirs = ndirs / ngroups + inodes_per_group / 16;
	min_inodes = avefreei - inodes_per_group*flex_size / 4;
	if (min_inodes < 1)
		min_inodes = 1;
	min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4;

	/*
	 * Start looking in the flex group where we last allocated an
	 * inode for this parent directory
	 */
	if (EXT4_I(parent)->i_last_alloc_group != ~0) {
		parent_group = EXT4_I(parent)->i_last_alloc_group;
		if (flex_size > 1)
			parent_group >>= sbi->s_log_groups_per_flex;
	}

	for (i = 0; i < ngroups; i++) {
		grp = (parent_group + i) % ngroups;
		get_orlov_stats(sb, grp, flex_size, &stats);
		if (stats.used_dirs >= max_dirs)
			continue;
		if (stats.free_inodes < min_inodes)
			continue;
		if (stats.free_clusters < min_clusters)
			continue;
		goto found_flex_bg;
	}

fallback:
	/* Last resort: linear scan of real groups for one with at least
	 * average free inodes; retry with avefreei = 0 if that fails. */
	ngroups = real_ngroups;
	avefreei = freei / ngroups;
fallback_retry:
	parent_group = EXT4_I(parent)->i_block_group;
	for (i = 0; i < ngroups; i++) {
		grp = (parent_group + i) % ngroups;
		desc = ext4_get_group_desc(sb, grp, NULL);
		if (desc) {
			grp_free = ext4_free_inodes_count(sb, desc);
			if (grp_free && grp_free >= avefreei) {
				*group = grp;
				return 0;
			}
		}
	}

	if (avefreei) {
		/*
		 * The free-inodes counter is approximate, and for really small
		 * filesystems the above test can fail to find any blockgroups
		 */
		avefreei = 0;
		goto fallback_retry;
	}

	return -1;
}
/*
 * Group-selection policy for non-directory inodes: prefer the parent's
 * (flex) group, then fall back to a quadratic-hash probe, then a plain
 * linear scan.  Returns 0 with *group set, or -1 if no group has a
 * free inode.
 */
static int find_group_other(struct super_block *sb, struct inode *parent,
			    ext4_group_t *group, umode_t mode)
{
	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
	ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
	struct ext4_group_desc *desc;
	int flex_size = ext4_flex_bg_size(EXT4_SB(sb));

	/*
	 * Try to place the inode is the same flex group as its
	 * parent.  If we can't find space, use the Orlov algorithm to
	 * find another flex group, and store that information in the
	 * parent directory's inode information so that use that flex
	 * group for future allocations.
	 */
	if (flex_size > 1) {
		int retry = 0;

	try_again:
		/* Round down to the first group of the flex group. */
		parent_group &= ~(flex_size-1);
		last = parent_group + flex_size;
		if (last > ngroups)
			last = ngroups;
		for (i = parent_group; i < last; i++) {
			desc = ext4_get_group_desc(sb, i, NULL);
			if (desc && ext4_free_inodes_count(sb, desc)) {
				*group = i;
				return 0;
			}
		}
		/* One retry, starting at the parent's remembered
		 * last-allocation group (~0 means "never set"). */
		if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
			retry = 1;
			parent_group = EXT4_I(parent)->i_last_alloc_group;
			goto try_again;
		}
		/*
		 * If this didn't work, use the Orlov search algorithm
		 * to find a new flex group; we pass in the mode to
		 * avoid the topdir algorithms.
		 */
		*group = parent_group + flex_size;
		if (*group > ngroups)
			*group = 0;
		return find_group_orlov(sb, parent, group, mode, NULL);
	}

	/*
	 * Try to place the inode in its parent directory
	 */
	*group = parent_group;
	desc = ext4_get_group_desc(sb, *group, NULL);
	if (desc && ext4_free_inodes_count(sb, desc) &&
	    ext4_free_group_clusters(sb, desc))
		return 0;

	/*
	 * We're going to place this inode in a different blockgroup from its
	 * parent.  We want to cause files in a common directory to all land in
	 * the same blockgroup.  But we want files which are in a different
	 * directory which shares a blockgroup with our parent to land in a
	 * different blockgroup.
	 *
	 * So add our directory's i_ino into the starting point for the hash.
	 */
	*group = (*group + parent->i_ino) % ngroups;

	/*
	 * Use a quadratic hash to find a group with a free inode and some free
	 * blocks.
	 */
	for (i = 1; i < ngroups; i <<= 1) {
		*group += i;
		if (*group >= ngroups)
			*group -= ngroups;
		desc = ext4_get_group_desc(sb, *group, NULL);
		if (desc && ext4_free_inodes_count(sb, desc) &&
		    ext4_free_group_clusters(sb, desc))
			return 0;
	}

	/*
	 * That failed: try linear search for a free inode, even if that group
	 * has no free blocks.
	 */
	*group = parent_group;
	for (i = 0; i < ngroups; i++) {
		if (++*group >= ngroups)
			*group = 0;
		desc = ext4_get_group_desc(sb, *group, NULL);
		if (desc && ext4_free_inodes_count(sb, desc))
			return 0;
	}

	return -1;
}
/*
 * In no journal mode, if an inode has recently been deleted, we want
 * to avoid reusing it until we're reasonably sure the inode table
 * block has been written back to disk.  (Yes, these values are
 * somewhat arbitrary...)
 */
#define RECENTCY_MIN	5
#define RECENTCY_DIRTY	30

/*
 * Returns 1 if inode number @ino in @group looks recently deleted (its
 * dtime falls within the recentcy window, extended when the inode table
 * block is still dirty in the buffer cache), 0 otherwise.  Only consults
 * the buffer cache: a block not cached is assumed already written out.
 */
static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
{
	struct ext4_group_desc	*gdp;
	struct ext4_inode	*raw_inode;
	struct buffer_head	*bh;
	unsigned long		dtime, now;
	int	inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
	int	offset, ret = 0, recentcy = RECENTCY_MIN;

	gdp = ext4_get_group_desc(sb, group, NULL);
	if (unlikely(!gdp))
		return 0;

	bh = sb_getblk(sb, ext4_inode_table(sb, gdp) +
		       (ino / inodes_per_block));
	if (unlikely(!bh) || !buffer_uptodate(bh))
		/*
		 * If the block is not in the buffer cache, then it
		 * must have been written out.
		 */
		goto out;

	/* Locate this inode's on-disk record within the table block. */
	offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb);
	raw_inode = (struct ext4_inode *) (bh->b_data + offset);
	dtime = le32_to_cpu(raw_inode->i_dtime);
	/* NOTE(review): get_seconds() is 32-bit wrap-prone; newer kernels
	 * use ktime_get_real_seconds() here — confirm against tree version. */
	now = get_seconds();
	if (buffer_dirty(bh))
		recentcy += RECENTCY_DIRTY;

	if (dtime && (dtime < now) && (now < dtime + recentcy))
		ret = 1;
out:
	brelse(bh);
	return ret;
}
  665. /*
  666. * There are two policies for allocating an inode. If the new inode is
  667. * a directory, then a forward search is made for a block group with both
  668. * free space and a low directory-to-inode ratio; if that fails, then of
  669. * the groups with above-average free space, that group with the fewest
  670. * directories already is chosen.
  671. *
  672. * For other inodes, search forward from the parent directory's block
  673. * group to find a free inode.
  674. */
  675. struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
  676. umode_t mode, const struct qstr *qstr,
  677. __u32 goal, uid_t *owner, int handle_type,
  678. unsigned int line_no, int nblocks)
  679. {
  680. struct super_block *sb;
  681. struct buffer_head *inode_bitmap_bh = NULL;
  682. struct buffer_head *group_desc_bh;
  683. ext4_group_t ngroups, group = 0;
  684. unsigned long ino = 0;
  685. struct inode *inode;
  686. struct ext4_group_desc *gdp = NULL;
  687. struct ext4_inode_info *ei;
  688. struct ext4_sb_info *sbi;
  689. int ret2, err;
  690. struct inode *ret;
  691. ext4_group_t i;
  692. ext4_group_t flex_group;
  693. struct ext4_group_info *grp;
  694. int encrypt = 0;
  695. /* Cannot create files in a deleted directory */
  696. if (!dir || !dir->i_nlink)
  697. return ERR_PTR(-EPERM);
  698. if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
  699. return ERR_PTR(-EIO);
  700. if ((ext4_encrypted_inode(dir) ||
  701. DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) &&
  702. (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
  703. err = fscrypt_get_encryption_info(dir);
  704. if (err)
  705. return ERR_PTR(err);
  706. if (!fscrypt_has_encryption_key(dir))
  707. return ERR_PTR(-ENOKEY);
  708. if (!handle)
  709. nblocks += EXT4_DATA_TRANS_BLOCKS(dir->i_sb);
  710. encrypt = 1;
  711. }
  712. sb = dir->i_sb;
  713. ngroups = ext4_get_groups_count(sb);
  714. trace_ext4_request_inode(dir, mode);
  715. inode = new_inode(sb);
  716. if (!inode)
  717. return ERR_PTR(-ENOMEM);
  718. ei = EXT4_I(inode);
  719. sbi = EXT4_SB(sb);
  720. /*
  721. * Initialize owners and quota early so that we don't have to account
  722. * for quota initialization worst case in standard inode creating
  723. * transaction
  724. */
  725. if (owner) {
  726. inode->i_mode = mode;
  727. i_uid_write(inode, owner[0]);
  728. i_gid_write(inode, owner[1]);
  729. } else if (test_opt(sb, GRPID)) {
  730. inode->i_mode = mode;
  731. inode->i_uid = current_fsuid();
  732. inode->i_gid = dir->i_gid;
  733. } else
  734. inode_init_owner(inode, dir, mode);
  735. if (ext4_has_feature_project(sb) &&
  736. ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT))
  737. ei->i_projid = EXT4_I(dir)->i_projid;
  738. else
  739. ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID);
  740. err = dquot_initialize(inode);
  741. if (err)
  742. goto out;
  743. if (!goal)
  744. goal = sbi->s_inode_goal;
  745. if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) {
  746. group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
  747. ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
  748. ret2 = 0;
  749. goto got_group;
  750. }
  751. if (S_ISDIR(mode))
  752. ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
  753. else
  754. ret2 = find_group_other(sb, dir, &group, mode);
  755. got_group:
  756. EXT4_I(dir)->i_last_alloc_group = group;
  757. err = -ENOSPC;
  758. if (ret2 == -1)
  759. goto out;
  760. /*
  761. * Normally we will only go through one pass of this loop,
  762. * unless we get unlucky and it turns out the group we selected
  763. * had its last inode grabbed by someone else.
  764. */
  765. for (i = 0; i < ngroups; i++, ino = 0) {
  766. err = -EIO;
  767. gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
  768. if (!gdp)
  769. goto out;
  770. /*
  771. * Check free inodes count before loading bitmap.
  772. */
  773. if (ext4_free_inodes_count(sb, gdp) == 0) {
  774. if (++group == ngroups)
  775. group = 0;
  776. continue;
  777. }
  778. grp = ext4_get_group_info(sb, group);
  779. /* Skip groups with already-known suspicious inode tables */
  780. if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
  781. if (++group == ngroups)
  782. group = 0;
  783. continue;
  784. }
  785. brelse(inode_bitmap_bh);
  786. inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
  787. /* Skip groups with suspicious inode tables */
  788. if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) ||
  789. IS_ERR(inode_bitmap_bh)) {
  790. inode_bitmap_bh = NULL;
  791. if (++group == ngroups)
  792. group = 0;
  793. continue;
  794. }
  795. repeat_in_this_group:
  796. ino = ext4_find_next_zero_bit((unsigned long *)
  797. inode_bitmap_bh->b_data,
  798. EXT4_INODES_PER_GROUP(sb), ino);
  799. if (ino >= EXT4_INODES_PER_GROUP(sb))
  800. goto next_group;
  801. if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) {
  802. ext4_error(sb, "reserved inode found cleared - "
  803. "inode=%lu", ino + 1);
  804. continue;
  805. }
  806. if ((EXT4_SB(sb)->s_journal == NULL) &&
  807. recently_deleted(sb, group, ino)) {
  808. ino++;
  809. goto next_inode;
  810. }
  811. if (!handle) {
  812. BUG_ON(nblocks <= 0);
  813. handle = __ext4_journal_start_sb(dir->i_sb, line_no,
  814. handle_type, nblocks,
  815. 0);
  816. if (IS_ERR(handle)) {
  817. err = PTR_ERR(handle);
  818. ext4_std_error(sb, err);
  819. goto out;
  820. }
  821. }
  822. BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
  823. err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
  824. if (err) {
  825. ext4_std_error(sb, err);
  826. goto out;
  827. }
  828. ext4_lock_group(sb, group);
  829. ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
  830. ext4_unlock_group(sb, group);
  831. ino++; /* the inode bitmap is zero-based */
  832. if (!ret2)
  833. goto got; /* we grabbed the inode! */
  834. next_inode:
  835. if (ino < EXT4_INODES_PER_GROUP(sb))
  836. goto repeat_in_this_group;
  837. next_group:
  838. if (++group == ngroups)
  839. group = 0;
  840. }
  841. err = -ENOSPC;
  842. goto out;
  843. got:
  844. BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
  845. err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
  846. if (err) {
  847. ext4_std_error(sb, err);
  848. goto out;
  849. }
  850. BUFFER_TRACE(group_desc_bh, "get_write_access");
  851. err = ext4_journal_get_write_access(handle, group_desc_bh);
  852. if (err) {
  853. ext4_std_error(sb, err);
  854. goto out;
  855. }
  856. /* We may have to initialize the block bitmap if it isn't already */
  857. if (ext4_has_group_desc_csum(sb) &&
  858. gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
  859. struct buffer_head *block_bitmap_bh;
  860. block_bitmap_bh = ext4_read_block_bitmap(sb, group);
  861. if (IS_ERR(block_bitmap_bh)) {
  862. err = PTR_ERR(block_bitmap_bh);
  863. goto out;
  864. }
  865. BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
  866. err = ext4_journal_get_write_access(handle, block_bitmap_bh);
  867. if (err) {
  868. brelse(block_bitmap_bh);
  869. ext4_std_error(sb, err);
  870. goto out;
  871. }
  872. BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
  873. err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh);
  874. /* recheck and clear flag under lock if we still need to */
  875. ext4_lock_group(sb, group);
  876. if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
  877. gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
  878. ext4_free_group_clusters_set(sb, gdp,
  879. ext4_free_clusters_after_init(sb, group, gdp));
  880. ext4_block_bitmap_csum_set(sb, group, gdp,
  881. block_bitmap_bh);
  882. ext4_group_desc_csum_set(sb, group, gdp);
  883. }
  884. ext4_unlock_group(sb, group);
  885. brelse(block_bitmap_bh);
  886. if (err) {
  887. ext4_std_error(sb, err);
  888. goto out;
  889. }
  890. }
  891. /* Update the relevant bg descriptor fields */
  892. if (ext4_has_group_desc_csum(sb)) {
  893. int free;
  894. struct ext4_group_info *grp = ext4_get_group_info(sb, group);
  895. down_read(&grp->alloc_sem); /* protect vs itable lazyinit */
  896. ext4_lock_group(sb, group); /* while we modify the bg desc */
  897. free = EXT4_INODES_PER_GROUP(sb) -
  898. ext4_itable_unused_count(sb, gdp);
  899. if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
  900. gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
  901. free = 0;
  902. }
  903. /*
  904. * Check the relative inode number against the last used
  905. * relative inode number in this group. if it is greater
  906. * we need to update the bg_itable_unused count
  907. */
  908. if (ino > free)
  909. ext4_itable_unused_set(sb, gdp,
  910. (EXT4_INODES_PER_GROUP(sb) - ino));
  911. up_read(&grp->alloc_sem);
  912. } else {
  913. ext4_lock_group(sb, group);
  914. }
  915. ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
  916. if (S_ISDIR(mode)) {
  917. ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
  918. if (sbi->s_log_groups_per_flex) {
  919. ext4_group_t f = ext4_flex_group(sbi, group);
  920. atomic_inc(&sbi->s_flex_groups[f].used_dirs);
  921. }
  922. }
  923. if (ext4_has_group_desc_csum(sb)) {
  924. ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh,
  925. EXT4_INODES_PER_GROUP(sb) / 8);
  926. ext4_group_desc_csum_set(sb, group, gdp);
  927. }
  928. ext4_unlock_group(sb, group);
  929. BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
  930. err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
  931. if (err) {
  932. ext4_std_error(sb, err);
  933. goto out;
  934. }
  935. percpu_counter_dec(&sbi->s_freeinodes_counter);
  936. if (S_ISDIR(mode))
  937. percpu_counter_inc(&sbi->s_dirs_counter);
  938. if (sbi->s_log_groups_per_flex) {
  939. flex_group = ext4_flex_group(sbi, group);
  940. atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
  941. }
  942. inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
  943. /* This is the optimal IO size (for stat), not the fs block size */
  944. inode->i_blocks = 0;
  945. inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime =
  946. current_time(inode);
  947. memset(ei->i_data, 0, sizeof(ei->i_data));
  948. ei->i_dir_start_lookup = 0;
  949. ei->i_disksize = 0;
  950. /* Don't inherit extent flag from directory, amongst others. */
  951. ei->i_flags =
  952. ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
  953. ei->i_file_acl = 0;
  954. ei->i_dtime = 0;
  955. ei->i_block_group = group;
  956. ei->i_last_alloc_group = ~0;
  957. ext4_set_inode_flags(inode);
  958. if (IS_DIRSYNC(inode))
  959. ext4_handle_sync(handle);
  960. if (insert_inode_locked(inode) < 0) {
  961. /*
  962. * Likely a bitmap corruption causing inode to be allocated
  963. * twice.
  964. */
  965. err = -EIO;
  966. ext4_error(sb, "failed to insert inode %lu: doubly allocated?",
  967. inode->i_ino);
  968. goto out;
  969. }
  970. spin_lock(&sbi->s_next_gen_lock);
  971. inode->i_generation = sbi->s_next_generation++;
  972. spin_unlock(&sbi->s_next_gen_lock);
  973. /* Precompute checksum seed for inode metadata */
  974. if (ext4_has_metadata_csum(sb)) {
  975. __u32 csum;
  976. __le32 inum = cpu_to_le32(inode->i_ino);
  977. __le32 gen = cpu_to_le32(inode->i_generation);
  978. csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
  979. sizeof(inum));
  980. ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
  981. sizeof(gen));
  982. }
  983. ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
  984. ext4_set_inode_state(inode, EXT4_STATE_NEW);
  985. ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
  986. ei->i_inline_off = 0;
  987. if (ext4_has_feature_inline_data(sb))
  988. ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
  989. ret = inode;
  990. err = dquot_alloc_inode(inode);
  991. if (err)
  992. goto fail_drop;
  993. /*
  994. * Since the encryption xattr will always be unique, create it first so
  995. * that it's less likely to end up in an external xattr block and
  996. * prevent its deduplication.
  997. */
  998. if (encrypt) {
  999. err = fscrypt_inherit_context(dir, inode, handle, true);
  1000. if (err)
  1001. goto fail_free_drop;
  1002. }
  1003. err = ext4_init_acl(handle, inode, dir);
  1004. if (err)
  1005. goto fail_free_drop;
  1006. err = ext4_init_security(handle, inode, dir, qstr);
  1007. if (err)
  1008. goto fail_free_drop;
  1009. if (ext4_has_feature_extents(sb)) {
  1010. /* set extent flag only for directory, file and normal symlink*/
  1011. if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
  1012. ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
  1013. ext4_ext_tree_init(handle, inode);
  1014. }
  1015. }
  1016. if (ext4_handle_valid(handle)) {
  1017. ei->i_sync_tid = handle->h_transaction->t_tid;
  1018. ei->i_datasync_tid = handle->h_transaction->t_tid;
  1019. }
  1020. err = ext4_mark_inode_dirty(handle, inode);
  1021. if (err) {
  1022. ext4_std_error(sb, err);
  1023. goto fail_free_drop;
  1024. }
  1025. ext4_debug("allocating inode %lu\n", inode->i_ino);
  1026. trace_ext4_allocate_inode(inode, dir, mode);
  1027. brelse(inode_bitmap_bh);
  1028. return ret;
  1029. fail_free_drop:
  1030. dquot_free_inode(inode);
  1031. fail_drop:
  1032. clear_nlink(inode);
  1033. unlock_new_inode(inode);
  1034. out:
  1035. dquot_drop(inode);
  1036. inode->i_flags |= S_NOQUOTA;
  1037. iput(inode);
  1038. brelse(inode_bitmap_bh);
  1039. return ERR_PTR(err);
  1040. }
/*
 * Verify that we are loading a valid orphan from disk.
 *
 * Returns the inode on success, or an ERR_PTR on failure:
 * -EFSCORRUPTED for an implausible orphan, the bitmap-read error if the
 * inode bitmap could not be read, or the ext4_iget() error.
 */
struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
{
	unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
	ext4_group_t block_group;
	int bit;
	struct buffer_head *bitmap_bh = NULL;
	struct inode *inode = NULL;
	int err = -EFSCORRUPTED;

	/* Reserved inodes and numbers beyond s_inodes_count can't be orphans */
	if (ino < EXT4_FIRST_INO(sb) || ino > max_ino)
		goto bad_orphan;

	/* Locate the bit for @ino inside its group's inode bitmap */
	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
	bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
	if (IS_ERR(bitmap_bh)) {
		ext4_error(sb, "inode bitmap error %ld for orphan %lu",
			   ino, PTR_ERR(bitmap_bh));
		/* Propagate the ERR_PTR from the failed bitmap read */
		return (struct inode *) bitmap_bh;
	}

	/* Having the inode bit set should be a 100% indicator that this
	 * is a valid orphan (no e2fsck run on fs). Orphans also include
	 * inodes that were being truncated, so we can't check i_nlink==0.
	 */
	if (!ext4_test_bit(bit, bitmap_bh->b_data))
		goto bad_orphan;

	inode = ext4_iget(sb, ino);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		ext4_error(sb, "couldn't read orphan inode %lu (err %d)",
			   ino, err);
		return inode;
	}

	/*
	 * If the orphans has i_nlinks > 0 then it should be able to
	 * be truncated, otherwise it won't be removed from the orphan
	 * list during processing and an infinite loop will result.
	 * Similarly, it must not be a bad inode.
	 */
	if ((inode->i_nlink && !ext4_can_truncate(inode)) ||
	    is_bad_inode(inode))
		goto bad_orphan;

	/* The on-disk orphan chain pointer must also be a plausible inode */
	if (NEXT_ORPHAN(inode) > max_ino)
		goto bad_orphan;
	brelse(bitmap_bh);
	return inode;

bad_orphan:
	ext4_error(sb, "bad orphan inode %lu", ino);
	/* Dump whatever state we gathered to help diagnose the corruption */
	if (bitmap_bh)
		printk(KERN_ERR "ext4_test_bit(bit=%d, block=%llu) = %d\n",
		       bit, (unsigned long long)bitmap_bh->b_blocknr,
		       ext4_test_bit(bit, bitmap_bh->b_data));
	if (inode) {
		printk(KERN_ERR "is_bad_inode(inode)=%d\n",
		       is_bad_inode(inode));
		printk(KERN_ERR "NEXT_ORPHAN(inode)=%u\n",
		       NEXT_ORPHAN(inode));
		printk(KERN_ERR "max_ino=%lu\n", max_ino);
		printk(KERN_ERR "i_nlink=%u\n", inode->i_nlink);
		/* Avoid freeing blocks if we got a bad deleted inode */
		if (inode->i_nlink == 0)
			inode->i_blocks = 0;
		iput(inode);
	}
	brelse(bitmap_bh);
	return ERR_PTR(err);
}
  1107. unsigned long ext4_count_free_inodes(struct super_block *sb)
  1108. {
  1109. unsigned long desc_count;
  1110. struct ext4_group_desc *gdp;
  1111. ext4_group_t i, ngroups = ext4_get_groups_count(sb);
  1112. #ifdef EXT4FS_DEBUG
  1113. struct ext4_super_block *es;
  1114. unsigned long bitmap_count, x;
  1115. struct buffer_head *bitmap_bh = NULL;
  1116. es = EXT4_SB(sb)->s_es;
  1117. desc_count = 0;
  1118. bitmap_count = 0;
  1119. gdp = NULL;
  1120. for (i = 0; i < ngroups; i++) {
  1121. gdp = ext4_get_group_desc(sb, i, NULL);
  1122. if (!gdp)
  1123. continue;
  1124. desc_count += ext4_free_inodes_count(sb, gdp);
  1125. brelse(bitmap_bh);
  1126. bitmap_bh = ext4_read_inode_bitmap(sb, i);
  1127. if (IS_ERR(bitmap_bh)) {
  1128. bitmap_bh = NULL;
  1129. continue;
  1130. }
  1131. x = ext4_count_free(bitmap_bh->b_data,
  1132. EXT4_INODES_PER_GROUP(sb) / 8);
  1133. printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
  1134. (unsigned long) i, ext4_free_inodes_count(sb, gdp), x);
  1135. bitmap_count += x;
  1136. }
  1137. brelse(bitmap_bh);
  1138. printk(KERN_DEBUG "ext4_count_free_inodes: "
  1139. "stored = %u, computed = %lu, %lu\n",
  1140. le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
  1141. return desc_count;
  1142. #else
  1143. desc_count = 0;
  1144. for (i = 0; i < ngroups; i++) {
  1145. gdp = ext4_get_group_desc(sb, i, NULL);
  1146. if (!gdp)
  1147. continue;
  1148. desc_count += ext4_free_inodes_count(sb, gdp);
  1149. cond_resched();
  1150. }
  1151. return desc_count;
  1152. #endif
  1153. }
  1154. /* Called at mount-time, super-block is locked */
  1155. unsigned long ext4_count_dirs(struct super_block * sb)
  1156. {
  1157. unsigned long count = 0;
  1158. ext4_group_t i, ngroups = ext4_get_groups_count(sb);
  1159. for (i = 0; i < ngroups; i++) {
  1160. struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
  1161. if (!gdp)
  1162. continue;
  1163. count += ext4_used_dirs_count(sb, gdp);
  1164. }
  1165. return count;
  1166. }
/*
 * Zeroes not yet zeroed inode table - just write zeroes through the whole
 * inode table. Must be called without any spinlock held. The only place
 * where it is called from on active part of filesystem is ext4lazyinit
 * thread, so we do not need any special locks, however we have to prevent
 * inode allocation from the current group, so we take alloc_sem lock, to
 * block ext4_new_inode() until we are finished.
 *
 * @sb:      the filesystem's superblock
 * @group:   block group whose inode table should be zeroed
 * @barrier: when non-zero, flush the device cache after the zeroout
 *
 * Returns 0 on success (or when there is nothing to do), 1 when the fs is
 * read-only or the group's itable accounting looks corrupt, or a negative
 * errno on journal/zeroout failure.
 */
int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
			  int barrier)
{
	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_group_desc *gdp = NULL;
	struct buffer_head *group_desc_bh;
	handle_t *handle;
	ext4_fsblk_t blk;
	int num, ret = 0, used_blks = 0;

	/* This should not happen, but just to be sure check this */
	if (sb->s_flags & MS_RDONLY) {
		ret = 1;
		goto out;
	}

	gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
	if (!gdp)
		goto out;

	/*
	 * We do not need to lock this, because we are the only one
	 * handling this flag.
	 */
	if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
		goto out;

	/* One journal credit: only the group descriptor gets dirtied */
	handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out;
	}

	/* Keep ext4_new_inode() out of this group while we zero its itable */
	down_write(&grp->alloc_sem);
	/*
	 * If inode bitmap was already initialized there may be some
	 * used inodes so we need to skip blocks with used inodes in
	 * inode table.
	 */
	if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
		used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) -
			    ext4_itable_unused_count(sb, gdp)),
			    sbi->s_inodes_per_block);

	/* Sanity check: the used-block count must fit inside the itable */
	if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
		ext4_error(sb, "Something is wrong with group %u: "
			   "used itable blocks: %d; "
			   "itable unused count: %u",
			   group, used_blks,
			   ext4_itable_unused_count(sb, gdp));
		ret = 1;
		goto err_out;
	}

	/* Zero everything past the blocks that may hold in-use inodes */
	blk = ext4_inode_table(sb, gdp) + used_blks;
	num = sbi->s_itb_per_group - used_blks;

	BUFFER_TRACE(group_desc_bh, "get_write_access");
	ret = ext4_journal_get_write_access(handle,
					    group_desc_bh);
	if (ret)
		goto err_out;

	/*
	 * Skip zeroout if the inode table is full. But we set the ZEROED
	 * flag anyway, because obviously, when it is full it does not need
	 * further zeroing.
	 */
	if (unlikely(num == 0))
		goto skip_zeroout;

	ext4_debug("going to zero out inode table in group %d\n",
		   group);
	ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS);
	if (ret < 0)
		goto err_out;
	if (barrier)
		blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL);

skip_zeroout:
	/* Flag + checksum must be updated together under the group lock */
	ext4_lock_group(sb, group);
	gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
	ext4_group_desc_csum_set(sb, group, gdp);
	ext4_unlock_group(sb, group);

	BUFFER_TRACE(group_desc_bh,
		     "call ext4_handle_dirty_metadata");
	ret = ext4_handle_dirty_metadata(handle, NULL,
					 group_desc_bh);

err_out:
	up_write(&grp->alloc_sem);
	ext4_journal_stop(handle);
out:
	return ret;
}