Browse Source

Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
  ext4: Patch up how we claim metadata blocks for quota purposes
  ext4: Ensure zeroout blocks have no dirty metadata
  ext4: return correct wbc.nr_to_write in ext4_da_writepages
  ext4: Update documentation to correct the inode_readahead_blks option name
  jbd2: don't use __GFP_NOFAIL in journal_init_common()
  ext4: flush delalloc blocks when space is low
  fs-writeback: Add helper function to start writeback if idle
  ext4: Eliminate potential double free on error path
  ext4: fix unsigned long long printk warning in super.c
  ext4, jbd2: Add barriers for file systems with exernal journals
  ext4: replace BUG() with return -EIO in ext4_ext_get_blocks
  ext4: add module aliases for ext2 and ext3
  ext4: Don't ask about supporting ext2/3 in ext4 if ext4 is not configured
  ext4: remove unused #include <linux/version.h>
Linus Torvalds 16 years ago
parent
commit
1f11abc966

+ 1 - 1
Documentation/filesystems/ext4.txt

@@ -196,7 +196,7 @@ nobarrier		This also requires an IO stack which can support
 			also be used to enable or disable barriers, for
 			also be used to enable or disable barriers, for
 			consistency with other ext4 mount options.
 			consistency with other ext4 mount options.
 
 
-inode_readahead=n	This tuning parameter controls the maximum
+inode_readahead_blks=n	This tuning parameter controls the maximum
 			number of inode table blocks that ext4's inode
 			number of inode table blocks that ext4's inode
 			table readahead algorithm will pre-read into
 			table readahead algorithm will pre-read into
 			the buffer cache.  The default value is 32 blocks.
 			the buffer cache.  The default value is 32 blocks.

+ 1 - 0
fs/ext4/Kconfig

@@ -28,6 +28,7 @@ config EXT4_FS
 
 
 config EXT4_USE_FOR_EXT23
 config EXT4_USE_FOR_EXT23
 	bool "Use ext4 for ext2/ext3 file systems"
 	bool "Use ext4 for ext2/ext3 file systems"
+	depends on EXT4_FS
 	depends on EXT3_FS=n || EXT2_FS=n
 	depends on EXT3_FS=n || EXT2_FS=n
 	default y
 	default y
 	help
 	help

+ 0 - 1
fs/ext4/block_validity.c

@@ -16,7 +16,6 @@
 #include <linux/module.h>
 #include <linux/module.h>
 #include <linux/swap.h>
 #include <linux/swap.h>
 #include <linux/pagemap.h>
 #include <linux/pagemap.h>
-#include <linux/version.h>
 #include <linux/blkdev.h>
 #include <linux/blkdev.h>
 #include <linux/mutex.h>
 #include <linux/mutex.h>
 #include "ext4.h"
 #include "ext4.h"

+ 27 - 1
fs/ext4/extents.c

@@ -3023,6 +3023,14 @@ static int ext4_convert_unwritten_extents_dio(handle_t *handle,
 	return err;
 	return err;
 }
 }
 
 
+static void unmap_underlying_metadata_blocks(struct block_device *bdev,
+			sector_t block, int count)
+{
+	int i;
+	for (i = 0; i < count; i++)
+                unmap_underlying_metadata(bdev, block + i);
+}
+
 static int
 static int
 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 			ext4_lblk_t iblock, unsigned int max_blocks,
 			ext4_lblk_t iblock, unsigned int max_blocks,
@@ -3098,6 +3106,18 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 	} else
 	} else
 		allocated = ret;
 		allocated = ret;
 	set_buffer_new(bh_result);
 	set_buffer_new(bh_result);
+	/*
+	 * if we allocated more blocks than requested
+	 * we need to make sure we unmap the extra block
+	 * allocated. The actual needed block will get
+	 * unmapped later when we find the buffer_head marked
+	 * new.
+	 */
+	if (allocated > max_blocks) {
+		unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
+					newblock + max_blocks,
+					allocated - max_blocks);
+	}
 map_out:
 map_out:
 	set_buffer_mapped(bh_result);
 	set_buffer_mapped(bh_result);
 out1:
 out1:
@@ -3190,7 +3210,13 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	 * this situation is possible, though, _during_ tree modification;
 	 * this situation is possible, though, _during_ tree modification;
 	 * this is why assert can't be put in ext4_ext_find_extent()
 	 * this is why assert can't be put in ext4_ext_find_extent()
 	 */
 	 */
-	BUG_ON(path[depth].p_ext == NULL && depth != 0);
+	if (path[depth].p_ext == NULL && depth != 0) {
+		ext4_error(inode->i_sb, __func__, "bad extent address "
+			   "inode: %lu, iblock: %d, depth: %d",
+			   inode->i_ino, iblock, depth);
+		err = -EIO;
+		goto out2;
+	}
 	eh = path[depth].p_hdr;
 	eh = path[depth].p_hdr;
 
 
 	ex = path[depth].p_ext;
 	ex = path[depth].p_ext;

+ 14 - 2
fs/ext4/fsync.c

@@ -88,9 +88,21 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 		return ext4_force_commit(inode->i_sb);
 		return ext4_force_commit(inode->i_sb);
 
 
 	commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
 	commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
-	if (jbd2_log_start_commit(journal, commit_tid))
+	if (jbd2_log_start_commit(journal, commit_tid)) {
+		/*
+		 * When the journal is on a different device than the
+		 * fs data disk, we need to issue the barrier in
+		 * writeback mode.  (In ordered mode, the jbd2 layer
+		 * will take care of issuing the barrier.  In
+		 * data=journal, all of the data blocks are written to
+		 * the journal device.)
+		 */
+		if (ext4_should_writeback_data(inode) &&
+		    (journal->j_fs_dev != journal->j_dev) &&
+		    (journal->j_flags & JBD2_BARRIER))
+			blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
 		jbd2_log_wait_commit(journal, commit_tid);
 		jbd2_log_wait_commit(journal, commit_tid);
-	else if (journal->j_flags & JBD2_BARRIER)
+	} else if (journal->j_flags & JBD2_BARRIER)
 		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
 		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
 	return ret;
 	return ret;
 }
 }

+ 94 - 77
fs/ext4/inode.c

@@ -1043,43 +1043,47 @@ static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
 	return ext4_indirect_calc_metadata_amount(inode, blocks);
 	return ext4_indirect_calc_metadata_amount(inode, blocks);
 }
 }
 
 
+/*
+ * Called with i_data_sem down, which is important since we can call
+ * ext4_discard_preallocations() from here.
+ */
 static void ext4_da_update_reserve_space(struct inode *inode, int used)
 static void ext4_da_update_reserve_space(struct inode *inode, int used)
 {
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-	int total, mdb, mdb_free, mdb_claim = 0;
-
-	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
-	/* recalculate the number of metablocks still need to be reserved */
-	total = EXT4_I(inode)->i_reserved_data_blocks - used;
-	mdb = ext4_calc_metadata_amount(inode, total);
-
-	/* figure out how many metablocks to release */
-	BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
-	mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
-
-	if (mdb_free) {
-		/* Account for allocated meta_blocks */
-		mdb_claim = EXT4_I(inode)->i_allocated_meta_blocks;
-		BUG_ON(mdb_free < mdb_claim);
-		mdb_free -= mdb_claim;
-
-		/* update fs dirty blocks counter */
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	int mdb_free = 0;
+
+	spin_lock(&ei->i_block_reservation_lock);
+	if (unlikely(used > ei->i_reserved_data_blocks)) {
+		ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
+			 "with only %d reserved data blocks\n",
+			 __func__, inode->i_ino, used,
+			 ei->i_reserved_data_blocks);
+		WARN_ON(1);
+		used = ei->i_reserved_data_blocks;
+	}
+
+	/* Update per-inode reservations */
+	ei->i_reserved_data_blocks -= used;
+	used += ei->i_allocated_meta_blocks;
+	ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
+	ei->i_allocated_meta_blocks = 0;
+	percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
+
+	if (ei->i_reserved_data_blocks == 0) {
+		/*
+		 * We can release all of the reserved metadata blocks
+		 * only when we have written all of the delayed
+		 * allocation blocks.
+		 */
+		mdb_free = ei->i_allocated_meta_blocks;
 		percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
 		percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
-		EXT4_I(inode)->i_allocated_meta_blocks = 0;
-		EXT4_I(inode)->i_reserved_meta_blocks = mdb;
+		ei->i_allocated_meta_blocks = 0;
 	}
 	}
-
-	/* update per-inode reservations */
-	BUG_ON(used  > EXT4_I(inode)->i_reserved_data_blocks);
-	EXT4_I(inode)->i_reserved_data_blocks -= used;
-	percpu_counter_sub(&sbi->s_dirtyblocks_counter, used + mdb_claim);
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 
 
-	vfs_dq_claim_block(inode, used + mdb_claim);
-
-	/*
-	 * free those over-booking quota for metadata blocks
-	 */
+	/* Update quota subsystem */
+	vfs_dq_claim_block(inode, used);
 	if (mdb_free)
 	if (mdb_free)
 		vfs_dq_release_reservation_block(inode, mdb_free);
 		vfs_dq_release_reservation_block(inode, mdb_free);
 
 
@@ -1088,7 +1092,8 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
 	 * there aren't any writers on the inode, we can discard the
 	 * there aren't any writers on the inode, we can discard the
 	 * inode's preallocations.
 	 * inode's preallocations.
 	 */
 	 */
-	if (!total && (atomic_read(&inode->i_writecount) == 0))
+	if ((ei->i_reserved_data_blocks == 0) &&
+	    (atomic_read(&inode->i_writecount) == 0))
 		ext4_discard_preallocations(inode);
 		ext4_discard_preallocations(inode);
 }
 }
 
 
@@ -1801,7 +1806,8 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
 {
 {
 	int retries = 0;
 	int retries = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-	unsigned long md_needed, mdblocks, total = 0;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	unsigned long md_needed, md_reserved, total = 0;
 
 
 	/*
 	/*
 	 * recalculate the amount of metadata blocks to reserve
 	 * recalculate the amount of metadata blocks to reserve
@@ -1809,35 +1815,44 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
 	 * worse case is one extent per block
 	 * worse case is one extent per block
 	 */
 	 */
 repeat:
 repeat:
-	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
-	total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
-	mdblocks = ext4_calc_metadata_amount(inode, total);
-	BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
-
-	md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
+	spin_lock(&ei->i_block_reservation_lock);
+	md_reserved = ei->i_reserved_meta_blocks;
+	md_needed = ext4_calc_metadata_amount(inode, nrblocks);
 	total = md_needed + nrblocks;
 	total = md_needed + nrblocks;
-	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+	spin_unlock(&ei->i_block_reservation_lock);
 
 
 	/*
 	/*
 	 * Make quota reservation here to prevent quota overflow
 	 * Make quota reservation here to prevent quota overflow
 	 * later. Real quota accounting is done at pages writeout
 	 * later. Real quota accounting is done at pages writeout
 	 * time.
 	 * time.
 	 */
 	 */
-	if (vfs_dq_reserve_block(inode, total))
+	if (vfs_dq_reserve_block(inode, total)) {
+		/* 
+		 * We tend to badly over-estimate the amount of
+		 * metadata blocks which are needed, so if we have
+		 * reserved any metadata blocks, try to force out the
+		 * inode and see if we have any better luck.
+		 */
+		if (md_reserved && retries++ <= 3)
+			goto retry;
 		return -EDQUOT;
 		return -EDQUOT;
+	}
 
 
 	if (ext4_claim_free_blocks(sbi, total)) {
 	if (ext4_claim_free_blocks(sbi, total)) {
 		vfs_dq_release_reservation_block(inode, total);
 		vfs_dq_release_reservation_block(inode, total);
 		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
 		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
+		retry:
+			if (md_reserved)
+				write_inode_now(inode, (retries == 3));
 			yield();
 			yield();
 			goto repeat;
 			goto repeat;
 		}
 		}
 		return -ENOSPC;
 		return -ENOSPC;
 	}
 	}
-	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
-	EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
-	EXT4_I(inode)->i_reserved_meta_blocks += md_needed;
-	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+	spin_lock(&ei->i_block_reservation_lock);
+	ei->i_reserved_data_blocks += nrblocks;
+	ei->i_reserved_meta_blocks += md_needed;
+	spin_unlock(&ei->i_block_reservation_lock);
 
 
 	return 0;       /* success */
 	return 0;       /* success */
 }
 }
@@ -1845,49 +1860,45 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
 static void ext4_da_release_space(struct inode *inode, int to_free)
 static void ext4_da_release_space(struct inode *inode, int to_free)
 {
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-	int total, mdb, mdb_free, release;
+	struct ext4_inode_info *ei = EXT4_I(inode);
 
 
 	if (!to_free)
 	if (!to_free)
 		return;		/* Nothing to release, exit */
 		return;		/* Nothing to release, exit */
 
 
 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 
 
-	if (!EXT4_I(inode)->i_reserved_data_blocks) {
+	if (unlikely(to_free > ei->i_reserved_data_blocks)) {
 		/*
 		/*
-		 * if there is no reserved blocks, but we try to free some
-		 * then the counter is messed up somewhere.
-		 * but since this function is called from invalidate
-		 * page, it's harmless to return without any action
+		 * if there aren't enough reserved blocks, then the
+		 * counter is messed up somewhere.  Since this
+		 * function is called from invalidate page, it's
+		 * harmless to return without any action.
 		 */
 		 */
-		printk(KERN_INFO "ext4 delalloc try to release %d reserved "
-			    "blocks for inode %lu, but there is no reserved "
-			    "data blocks\n", to_free, inode->i_ino);
-		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-		return;
+		ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
+			 "ino %lu, to_free %d with only %d reserved "
+			 "data blocks\n", inode->i_ino, to_free,
+			 ei->i_reserved_data_blocks);
+		WARN_ON(1);
+		to_free = ei->i_reserved_data_blocks;
 	}
 	}
+	ei->i_reserved_data_blocks -= to_free;
 
 
-	/* recalculate the number of metablocks still need to be reserved */
-	total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
-	mdb = ext4_calc_metadata_amount(inode, total);
-
-	/* figure out how many metablocks to release */
-	BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
-	mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
-
-	release = to_free + mdb_free;
-
-	/* update fs dirty blocks counter for truncate case */
-	percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
+	if (ei->i_reserved_data_blocks == 0) {
+		/*
+		 * We can release all of the reserved metadata blocks
+		 * only when we have written all of the delayed
+		 * allocation blocks.
+		 */
+		to_free += ei->i_allocated_meta_blocks;
+		ei->i_allocated_meta_blocks = 0;
+	}
 
 
-	/* update per-inode reservations */
-	BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
-	EXT4_I(inode)->i_reserved_data_blocks -= to_free;
+	/* update fs dirty blocks counter */
+	percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
 
 
-	BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
-	EXT4_I(inode)->i_reserved_meta_blocks = mdb;
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 
 
-	vfs_dq_release_reservation_block(inode, release);
+	vfs_dq_release_reservation_block(inode, to_free);
 }
 }
 
 
 static void ext4_da_page_release_reservation(struct page *page,
 static void ext4_da_page_release_reservation(struct page *page,
@@ -2967,8 +2978,7 @@ static int ext4_da_writepages(struct address_space *mapping,
 out_writepages:
 out_writepages:
 	if (!no_nrwrite_index_update)
 	if (!no_nrwrite_index_update)
 		wbc->no_nrwrite_index_update = 0;
 		wbc->no_nrwrite_index_update = 0;
-	if (wbc->nr_to_write > nr_to_writebump)
-		wbc->nr_to_write -= nr_to_writebump;
+	wbc->nr_to_write -= nr_to_writebump;
 	wbc->range_start = range_start;
 	wbc->range_start = range_start;
 	trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
 	trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
 	return ret;
 	return ret;
@@ -2993,11 +3003,18 @@ static int ext4_nonda_switch(struct super_block *sb)
 	if (2 * free_blocks < 3 * dirty_blocks ||
 	if (2 * free_blocks < 3 * dirty_blocks ||
 		free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
 		free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
 		/*
 		/*
-		 * free block count is less that 150% of dirty blocks
-		 * or free blocks is less that watermark
+		 * free block count is less than 150% of dirty blocks
+		 * or free blocks is less than watermark
 		 */
 		 */
 		return 1;
 		return 1;
 	}
 	}
+	/*
+	 * Even if we don't switch but are nearing capacity,
+	 * start pushing delalloc when 1/2 of free blocks are dirty.
+	 */
+	if (free_blocks < 2 * dirty_blocks)
+		writeback_inodes_sb_if_idle(sb);
+
 	return 0;
 	return 0;
 }
 }
 
 

+ 0 - 1
fs/ext4/mballoc.h

@@ -17,7 +17,6 @@
 #include <linux/proc_fs.h>
 #include <linux/proc_fs.h>
 #include <linux/pagemap.h>
 #include <linux/pagemap.h>
 #include <linux/seq_file.h>
 #include <linux/seq_file.h>
-#include <linux/version.h>
 #include <linux/blkdev.h>
 #include <linux/blkdev.h>
 #include <linux/mutex.h>
 #include <linux/mutex.h>
 #include "ext4_jbd2.h"
 #include "ext4_jbd2.h"

+ 4 - 2
fs/ext4/super.c

@@ -2174,9 +2174,9 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
 	struct super_block *sb = sbi->s_buddy_cache->i_sb;
 	struct super_block *sb = sbi->s_buddy_cache->i_sb;
 
 
 	return snprintf(buf, PAGE_SIZE, "%llu\n",
 	return snprintf(buf, PAGE_SIZE, "%llu\n",
-			sbi->s_kbytes_written + 
+			(unsigned long long)(sbi->s_kbytes_written +
 			((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
 			((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
-			  EXT4_SB(sb)->s_sectors_written_start) >> 1));
+			  EXT4_SB(sb)->s_sectors_written_start) >> 1)));
 }
 }
 
 
 static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
 static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
@@ -4005,6 +4005,7 @@ static inline void unregister_as_ext2(void)
 {
 {
 	unregister_filesystem(&ext2_fs_type);
 	unregister_filesystem(&ext2_fs_type);
 }
 }
+MODULE_ALIAS("ext2");
 #else
 #else
 static inline void register_as_ext2(void) { }
 static inline void register_as_ext2(void) { }
 static inline void unregister_as_ext2(void) { }
 static inline void unregister_as_ext2(void) { }
@@ -4031,6 +4032,7 @@ static inline void unregister_as_ext3(void)
 {
 {
 	unregister_filesystem(&ext3_fs_type);
 	unregister_filesystem(&ext3_fs_type);
 }
 }
+MODULE_ALIAS("ext3");
 #else
 #else
 static inline void register_as_ext3(void) { }
 static inline void register_as_ext3(void) { }
 static inline void unregister_as_ext3(void) { }
 static inline void unregister_as_ext3(void) { }

+ 2 - 0
fs/ext4/xattr.c

@@ -1332,6 +1332,8 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 			goto cleanup;
 			goto cleanup;
 		kfree(b_entry_name);
 		kfree(b_entry_name);
 		kfree(buffer);
 		kfree(buffer);
+		b_entry_name = NULL;
+		buffer = NULL;
 		brelse(is->iloc.bh);
 		brelse(is->iloc.bh);
 		kfree(is);
 		kfree(is);
 		kfree(bs);
 		kfree(bs);

+ 17 - 0
fs/fs-writeback.c

@@ -1186,6 +1186,23 @@ void writeback_inodes_sb(struct super_block *sb)
 }
 }
 EXPORT_SYMBOL(writeback_inodes_sb);
 EXPORT_SYMBOL(writeback_inodes_sb);
 
 
+/**
+ * writeback_inodes_sb_if_idle	-	start writeback if none underway
+ * @sb: the superblock
+ *
+ * Invoke writeback_inodes_sb if no writeback is currently underway.
+ * Returns 1 if writeback was started, 0 if not.
+ */
+int writeback_inodes_sb_if_idle(struct super_block *sb)
+{
+	if (!writeback_in_progress(sb->s_bdi)) {
+		writeback_inodes_sb(sb);
+		return 1;
+	} else
+		return 0;
+}
+EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
+
 /**
 /**
  * sync_inodes_sb	-	sync sb inode pages
  * sync_inodes_sb	-	sync sb inode pages
  * @sb: the superblock
  * @sb: the superblock

+ 15 - 0
fs/jbd2/checkpoint.c

@@ -22,6 +22,7 @@
 #include <linux/jbd2.h>
 #include <linux/jbd2.h>
 #include <linux/errno.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/slab.h>
+#include <linux/blkdev.h>
 #include <trace/events/jbd2.h>
 #include <trace/events/jbd2.h>
 
 
 /*
 /*
@@ -515,6 +516,20 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 	journal->j_tail_sequence = first_tid;
 	journal->j_tail_sequence = first_tid;
 	journal->j_tail = blocknr;
 	journal->j_tail = blocknr;
 	spin_unlock(&journal->j_state_lock);
 	spin_unlock(&journal->j_state_lock);
+
+	/*
+	 * If there is an external journal, we need to make sure that
+	 * any data blocks that were recently written out --- perhaps
+	 * by jbd2_log_do_checkpoint() --- are flushed out before we
+	 * drop the transactions from the external journal.  It's
+	 * unlikely this will be necessary, especially with a
+	 * appropriately sized journal, but we need this to guarantee
+	 * correctness.  Fortunately jbd2_cleanup_journal_tail()
+	 * doesn't get called all that often.
+	 */
+	if ((journal->j_fs_dev != journal->j_dev) &&
+	    (journal->j_flags & JBD2_BARRIER))
+		blkdev_issue_flush(journal->j_fs_dev, NULL);
 	if (!(journal->j_flags & JBD2_ABORT))
 	if (!(journal->j_flags & JBD2_ABORT))
 		jbd2_journal_update_superblock(journal, 1);
 		jbd2_journal_update_superblock(journal, 1);
 	return 0;
 	return 0;

+ 11 - 8
fs/jbd2/commit.c

@@ -259,6 +259,7 @@ static int journal_submit_data_buffers(journal_t *journal,
 			ret = err;
 			ret = err;
 		spin_lock(&journal->j_list_lock);
 		spin_lock(&journal->j_list_lock);
 		J_ASSERT(jinode->i_transaction == commit_transaction);
 		J_ASSERT(jinode->i_transaction == commit_transaction);
+		commit_transaction->t_flushed_data_blocks = 1;
 		jinode->i_flags &= ~JI_COMMIT_RUNNING;
 		jinode->i_flags &= ~JI_COMMIT_RUNNING;
 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 	}
 	}
@@ -708,8 +709,17 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		}
 		}
 	}
 	}
 
 
-	/* Done it all: now write the commit record asynchronously. */
+	/* 
+	 * If the journal is not located on the file system device,
+	 * then we must flush the file system device before we issue
+	 * the commit record
+	 */
+	if (commit_transaction->t_flushed_data_blocks &&
+	    (journal->j_fs_dev != journal->j_dev) &&
+	    (journal->j_flags & JBD2_BARRIER))
+		blkdev_issue_flush(journal->j_fs_dev, NULL);
 
 
+	/* Done it all: now write the commit record asynchronously. */
 	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 		err = journal_submit_commit_record(journal, commit_transaction,
 		err = journal_submit_commit_record(journal, commit_transaction,
@@ -720,13 +730,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 			blkdev_issue_flush(journal->j_dev, NULL);
 			blkdev_issue_flush(journal->j_dev, NULL);
 	}
 	}
 
 
-	/*
-	 * This is the right place to wait for data buffers both for ASYNC
-	 * and !ASYNC commit. If commit is ASYNC, we need to wait only after
-	 * the commit block went to disk (which happens above). If commit is
-	 * SYNC, we need to wait for data buffers before we start writing
-	 * commit block, which happens below in such setting.
-	 */
 	err = journal_finish_inode_data_buffers(journal, commit_transaction);
 	err = journal_finish_inode_data_buffers(journal, commit_transaction);
 	if (err) {
 	if (err) {
 		printk(KERN_WARNING
 		printk(KERN_WARNING

+ 1 - 1
fs/jbd2/journal.c

@@ -814,7 +814,7 @@ static journal_t * journal_init_common (void)
 	journal_t *journal;
 	journal_t *journal;
 	int err;
 	int err;
 
 
-	journal = kzalloc(sizeof(*journal), GFP_KERNEL|__GFP_NOFAIL);
+	journal = kzalloc(sizeof(*journal), GFP_KERNEL);
 	if (!journal)
 	if (!journal)
 		goto fail;
 		goto fail;
 
 

+ 1 - 0
include/linux/jbd2.h

@@ -653,6 +653,7 @@ struct transaction_s
 	 * waiting for it to finish.
 	 * waiting for it to finish.
 	 */
 	 */
 	unsigned int t_synchronous_commit:1;
 	unsigned int t_synchronous_commit:1;
+	unsigned int t_flushed_data_blocks:1;
 
 
 	/*
 	/*
 	 * For use by the filesystem to store fs-specific data
 	 * For use by the filesystem to store fs-specific data

+ 1 - 0
include/linux/writeback.h

@@ -70,6 +70,7 @@ struct writeback_control {
 struct bdi_writeback;
 struct bdi_writeback;
 int inode_wait(void *);
 int inode_wait(void *);
 void writeback_inodes_sb(struct super_block *);
 void writeback_inodes_sb(struct super_block *);
+int writeback_inodes_sb_if_idle(struct super_block *);
 void sync_inodes_sb(struct super_block *);
 void sync_inodes_sb(struct super_block *);
 void writeback_inodes_wbc(struct writeback_control *wbc);
 void writeback_inodes_wbc(struct writeback_control *wbc);
 long wb_do_writeback(struct bdi_writeback *wb, int force_wait);
 long wb_do_writeback(struct bdi_writeback *wb, int force_wait);