Эх сурвалжийг харах

ext4: make the zero-out chunk size tunable

Currently in ext4 the length of zero-out chunk is set to 7 file system
blocks.  But if an inode has uninitailized extents from using
fallocate to preallocate space, and the workload issues many random
writes, this can cause a fragmented extent tree that will
unnecessarily grow the extent tree.

So create a new sysfs tunable, extent_max_zeroout_kb, which controls
the maximum size where blocks will be zeroed out instead of creating a
new uninitialized extent.  The default of this has been sent to 32kb.

CC: Zach Brown <zab@zabbo.net>
CC: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Zheng Liu 13 жил өмнө
parent
commit
67a5da564f

+ 13 - 0
Documentation/ABI/testing/sysfs-fs-ext4

@@ -96,3 +96,16 @@ Contact:	"Theodore Ts'o" <tytso@mit.edu>
 Description:
 Description:
 		The maximum number of megabytes the writeback code will
 		The maximum number of megabytes the writeback code will
 		try to write out before move on to another inode.
 		try to write out before move on to another inode.
+
+What:		/sys/fs/ext4/<disk>/extent_max_zeroout_kb
+Date:		August 2012
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		The maximum number of kilobytes which will be zeroed
+		out in preference to creating a new uninitialized
+		extent when manipulating an inode's extent tree.  Note
+		that using a larger value will increase the
+		variability of time necessary to complete a random
+		write operation (since a 4k random write might turn
+		into a much larger write due to the zeroout
+		operation).

+ 3 - 0
fs/ext4/ext4.h

@@ -1271,6 +1271,9 @@ struct ext4_sb_info {
 	unsigned long s_sectors_written_start;
 	unsigned long s_sectors_written_start;
 	u64 s_kbytes_written;
 	u64 s_kbytes_written;
 
 
+	/* the size of zero-out chunk */
+	unsigned int s_extent_max_zeroout_kb;
+
 	unsigned int s_log_groups_per_flex;
 	unsigned int s_log_groups_per_flex;
 	struct flex_groups *s_flex_groups;
 	struct flex_groups *s_flex_groups;
 
 

+ 13 - 12
fs/ext4/extents.c

@@ -3085,7 +3085,6 @@ out:
 	return err ? err : map->m_len;
 	return err ? err : map->m_len;
 }
 }
 
 
-#define EXT4_EXT_ZERO_LEN 7
 /*
 /*
  * This function is called by ext4_ext_map_blocks() if someone tries to write
  * This function is called by ext4_ext_map_blocks() if someone tries to write
  * to an uninitialized extent. It may result in splitting the uninitialized
  * to an uninitialized extent. It may result in splitting the uninitialized
@@ -3111,13 +3110,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 					   struct ext4_map_blocks *map,
 					   struct ext4_map_blocks *map,
 					   struct ext4_ext_path *path)
 					   struct ext4_ext_path *path)
 {
 {
+	struct ext4_sb_info *sbi;
 	struct ext4_extent_header *eh;
 	struct ext4_extent_header *eh;
 	struct ext4_map_blocks split_map;
 	struct ext4_map_blocks split_map;
 	struct ext4_extent zero_ex;
 	struct ext4_extent zero_ex;
 	struct ext4_extent *ex;
 	struct ext4_extent *ex;
 	ext4_lblk_t ee_block, eof_block;
 	ext4_lblk_t ee_block, eof_block;
 	unsigned int ee_len, depth;
 	unsigned int ee_len, depth;
-	int allocated;
+	int allocated, max_zeroout = 0;
 	int err = 0;
 	int err = 0;
 	int split_flag = 0;
 	int split_flag = 0;
 
 
@@ -3125,6 +3125,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		"block %llu, max_blocks %u\n", inode->i_ino,
 		"block %llu, max_blocks %u\n", inode->i_ino,
 		(unsigned long long)map->m_lblk, map->m_len);
 		(unsigned long long)map->m_lblk, map->m_len);
 
 
+	sbi = EXT4_SB(inode->i_sb);
 	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
 	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
 		inode->i_sb->s_blocksize_bits;
 		inode->i_sb->s_blocksize_bits;
 	if (eof_block < map->m_lblk + map->m_len)
 	if (eof_block < map->m_lblk + map->m_len)
@@ -3224,9 +3225,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	 */
 	 */
 	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
 	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
 
 
-	/* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
-	if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
-	    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+	if (EXT4_EXT_MAY_ZEROOUT & split_flag)
+		max_zeroout = sbi->s_extent_max_zeroout_kb >>
+			inode->i_sb->s_blocksize_bits;
+
+	/* If extent is less than s_max_zeroout_kb, zeroout directly */
+	if (max_zeroout && (ee_len <= max_zeroout)) {
 		err = ext4_ext_zeroout(inode, ex);
 		err = ext4_ext_zeroout(inode, ex);
 		if (err)
 		if (err)
 			goto out;
 			goto out;
@@ -3250,9 +3254,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	split_map.m_lblk = map->m_lblk;
 	split_map.m_lblk = map->m_lblk;
 	split_map.m_len = map->m_len;
 	split_map.m_len = map->m_len;
 
 
-	if (allocated > map->m_len) {
-		if (allocated <= EXT4_EXT_ZERO_LEN &&
-		    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+	if (max_zeroout && (allocated > map->m_len)) {
+		if (allocated <= max_zeroout) {
 			/* case 3 */
 			/* case 3 */
 			zero_ex.ee_block =
 			zero_ex.ee_block =
 					 cpu_to_le32(map->m_lblk);
 					 cpu_to_le32(map->m_lblk);
@@ -3264,9 +3267,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 				goto out;
 				goto out;
 			split_map.m_lblk = map->m_lblk;
 			split_map.m_lblk = map->m_lblk;
 			split_map.m_len = allocated;
 			split_map.m_len = allocated;
-		} else if ((map->m_lblk - ee_block + map->m_len <
-			   EXT4_EXT_ZERO_LEN) &&
-			   (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+		} else if (map->m_lblk - ee_block + map->m_len < max_zeroout) {
 			/* case 2 */
 			/* case 2 */
 			if (map->m_lblk != ee_block) {
 			if (map->m_lblk != ee_block) {
 				zero_ex.ee_block = ex->ee_block;
 				zero_ex.ee_block = ex->ee_block;
@@ -3286,7 +3287,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	}
 	}
 
 
 	allocated = ext4_split_extent(handle, inode, path,
 	allocated = ext4_split_extent(handle, inode, path,
-				       &split_map, split_flag, 0);
+				      &split_map, split_flag, 0);
 	if (allocated < 0)
 	if (allocated < 0)
 		err = allocated;
 		err = allocated;
 
 

+ 3 - 0
fs/ext4/super.c

@@ -2541,6 +2541,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
 EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
 EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
+EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
 EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
 EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
 
 
 static struct attribute *ext4_attrs[] = {
 static struct attribute *ext4_attrs[] = {
@@ -2556,6 +2557,7 @@ static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(mb_stream_req),
 	ATTR_LIST(mb_stream_req),
 	ATTR_LIST(mb_group_prealloc),
 	ATTR_LIST(mb_group_prealloc),
 	ATTR_LIST(max_writeback_mb_bump),
 	ATTR_LIST(max_writeback_mb_bump),
+	ATTR_LIST(extent_max_zeroout_kb),
 	ATTR_LIST(trigger_fs_error),
 	ATTR_LIST(trigger_fs_error),
 	NULL,
 	NULL,
 };
 };
@@ -3756,6 +3758,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
 	sbi->s_max_writeback_mb_bump = 128;
 	sbi->s_max_writeback_mb_bump = 128;
+	sbi->s_extent_max_zeroout_kb = 32;
 
 
 	/*
 	/*
 	 * set up enough so that it can read an inode
 	 * set up enough so that it can read an inode