Преглед изворни кода

Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
  ext4: Remove automatic enabling of the HUGE_FILE feature flag
  ext4: Replace hackish ext4_mb_poll_new_transaction with commit callback
  ext4: Update Documentation/filesystems/ext4.txt
  ext4: Remove unused mount options: nomballoc, mballoc, nocheck
  ext4: Remove compile warnings when building w/o CONFIG_PROC_FS
  ext4: Add missing newlines to printk messages
  ext4: Fix file fragmentation during large file write.
  vfs: Add no_nrwrite_index_update writeback control flag
  vfs: Remove the range_cont writeback mode.
  ext4: Use tag dirty lookup during mpage_da_submit_io
  ext4: let the block device know when unused blocks can be discarded
  ext4: Don't reuse released data blocks until transaction commits
  ext4: Use an rbtree for tracking blocks freed during transaction.
  ext4: Do mballoc init before doing filesystem recovery
  ext4: Free ext4_prealloc_space using kmem_cache_free
  ext4: Fix Kconfig typo for ext4dev
  ext4: Remove an old reference to ext4dev in Makefile comment
Linus Torvalds пре 17 година
родитељ
комит
58617d5e59
15 измењених фајлова са 320 додато и 336 уклоњено
  1. 15 17
      Documentation/filesystems/ext4.txt
  2. 1 1
      fs/Kconfig
  3. 1 1
      fs/Makefile
  4. 10 2
      fs/ext4/balloc.c
  5. 0 1
      fs/ext4/ext4.h
  6. 0 3
      fs/ext4/ext4_sb.h
  7. 76 67
      fs/ext4/inode.c
  8. 135 128
      fs/ext4/mballoc.c
  9. 19 12
      fs/ext4/mballoc.h
  10. 34 98
      fs/ext4/super.c
  11. 3 0
      fs/jbd2/commit.c
  12. 1 0
      fs/jbd2/transaction.c
  13. 9 0
      include/linux/jbd2.h
  14. 9 1
      include/linux/writeback.h
  15. 7 5
      mm/page-writeback.c

+ 15 - 17
Documentation/filesystems/ext4.txt

@@ -2,19 +2,24 @@
 Ext4 Filesystem
 Ext4 Filesystem
 ===============
 ===============
 
 
-This is a development version of the ext4 filesystem, an advanced level
-of the ext3 filesystem which incorporates scalability and reliability
-enhancements for supporting large filesystems (64 bit) in keeping with
-increasing disk capacities and state-of-the-art feature requirements.
+Ext4 is an an advanced level of the ext3 filesystem which incorporates
+scalability and reliability enhancements for supporting large filesystems
+(64 bit) in keeping with increasing disk capacities and state-of-the-art
+feature requirements.
 
 
-Mailing list: linux-ext4@vger.kernel.org
+Mailing list:	linux-ext4@vger.kernel.org
+Web site:	http://ext4.wiki.kernel.org
 
 
 
 
 1. Quick usage instructions:
 1. Quick usage instructions:
 ===========================
 ===========================
 
 
+Note: More extensive information for getting started with ext4 can be
+      found at the ext4 wiki site at the URL:
+      http://ext4.wiki.kernel.org/index.php/Ext4_Howto
+
   - Compile and install the latest version of e2fsprogs (as of this
   - Compile and install the latest version of e2fsprogs (as of this
-    writing version 1.41) from:
+    writing version 1.41.3) from:
 
 
     http://sourceforge.net/project/showfiles.php?group_id=2406
     http://sourceforge.net/project/showfiles.php?group_id=2406
 	
 	
@@ -36,11 +41,9 @@ Mailing list: linux-ext4@vger.kernel.org
 
 
     	# mke2fs -t ext4 /dev/hda1
     	# mke2fs -t ext4 /dev/hda1
 
 
-    Or configure an existing ext3 filesystem to support extents and set
-    the test_fs flag to indicate that it's ok for an in-development
-    filesystem to touch this filesystem:
+    Or to configure an existing ext3 filesystem to support extents: 
 
 
-	# tune2fs -O extents -E test_fs /dev/hda1
+	# tune2fs -O extents /dev/hda1
 
 
     If the filesystem was created with 128 byte inodes, it can be
     If the filesystem was created with 128 byte inodes, it can be
     converted to use 256 byte for greater efficiency via:
     converted to use 256 byte for greater efficiency via:
@@ -104,8 +107,8 @@ exist yet so I'm not sure they're in the near-term roadmap.
 The big performance win will come with mballoc, delalloc and flex_bg
 The big performance win will come with mballoc, delalloc and flex_bg
 grouping of bitmaps and inode tables.  Some test results available here:
 grouping of bitmaps and inode tables.  Some test results available here:
 
 
- - http://www.bullopensource.org/ext4/20080530/ffsb-write-2.6.26-rc2.html
- - http://www.bullopensource.org/ext4/20080530/ffsb-readwrite-2.6.26-rc2.html
+ - http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-write-2.6.27-rc1.html
+ - http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-readwrite-2.6.27-rc1.html
 
 
 3. Options
 3. Options
 ==========
 ==========
@@ -214,9 +217,6 @@ noreservation
 bsddf		(*)	Make 'df' act like BSD.
 bsddf		(*)	Make 'df' act like BSD.
 minixdf			Make 'df' act like Minix.
 minixdf			Make 'df' act like Minix.
 
 
-check=none		Don't do extra checking of bitmaps on mount.
-nocheck
-
 debug			Extra debugging information is sent to syslog.
 debug			Extra debugging information is sent to syslog.
 
 
 errors=remount-ro(*)	Remount the filesystem read-only on an error.
 errors=remount-ro(*)	Remount the filesystem read-only on an error.
@@ -253,8 +253,6 @@ nobh			(a) cache disk block mapping information
 			"nobh" option tries to avoid associating buffer
 			"nobh" option tries to avoid associating buffer
 			heads (supported only for "writeback" mode).
 			heads (supported only for "writeback" mode).
 
 
-mballoc		(*)	Use the multiple block allocator for block allocation
-nomballoc		disabled multiple block allocator for block allocation.
 stripe=n		Number of filesystem blocks that mballoc will try
 stripe=n		Number of filesystem blocks that mballoc will try
 			to use for allocation size and alignment. For RAID5/6
 			to use for allocation size and alignment. For RAID5/6
 			systems this should be the number of data
 			systems this should be the number of data

+ 1 - 1
fs/Kconfig

@@ -160,7 +160,7 @@ config EXT4_FS
 	  filesystem initially.
 	  filesystem initially.
 
 
 	  To compile this file system support as a module, choose M here. The
 	  To compile this file system support as a module, choose M here. The
-	  module will be called ext4dev.
+	  module will be called ext4.
 
 
 	  If unsure, say N.
 	  If unsure, say N.
 
 

+ 1 - 1
fs/Makefile

@@ -71,7 +71,7 @@ obj-$(CONFIG_DLM)		+= dlm/
 # Do not add any filesystems before this line
 # Do not add any filesystems before this line
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
 obj-$(CONFIG_EXT3_FS)		+= ext3/ # Before ext2 so root fs can be ext3
 obj-$(CONFIG_EXT3_FS)		+= ext3/ # Before ext2 so root fs can be ext3
-obj-$(CONFIG_EXT4_FS)		+= ext4/ # Before ext2 so root fs can be ext4dev
+obj-$(CONFIG_EXT4_FS)		+= ext4/ # Before ext2 so root fs can be ext4
 obj-$(CONFIG_JBD)		+= jbd/
 obj-$(CONFIG_JBD)		+= jbd/
 obj-$(CONFIG_JBD2)		+= jbd2/
 obj-$(CONFIG_JBD2)		+= jbd2/
 obj-$(CONFIG_EXT2_FS)		+= ext2/
 obj-$(CONFIG_EXT2_FS)		+= ext2/

+ 10 - 2
fs/ext4/balloc.c

@@ -568,8 +568,16 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 
 
 	/* this isn't the right place to decide whether block is metadata
 	/* this isn't the right place to decide whether block is metadata
 	 * inode.c/extents.c knows better, but for safety ... */
 	 * inode.c/extents.c knows better, but for safety ... */
-	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
-			ext4_should_journal_data(inode))
+	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+		metadata = 1;
+
+	/* We need to make sure we don't reuse
+	 * block released untill the transaction commit.
+	 * writeback mode have weak data consistency so
+	 * don't force data as metadata when freeing block
+	 * for writeback mode.
+	 */
+	if (metadata == 0 && !ext4_should_writeback_data(inode))
 		metadata = 1;
 		metadata = 1;
 
 
 	sb = inode->i_sb;
 	sb = inode->i_sb;

+ 0 - 1
fs/ext4/ext4.h

@@ -511,7 +511,6 @@ do {									       \
 /*
 /*
  * Mount flags
  * Mount flags
  */
  */
-#define EXT4_MOUNT_CHECK		0x00001	/* Do mount-time checks */
 #define EXT4_MOUNT_OLDALLOC		0x00002  /* Don't use the new Orlov allocator */
 #define EXT4_MOUNT_OLDALLOC		0x00002  /* Don't use the new Orlov allocator */
 #define EXT4_MOUNT_GRPID		0x00004	/* Create files with directory's group */
 #define EXT4_MOUNT_GRPID		0x00004	/* Create files with directory's group */
 #define EXT4_MOUNT_DEBUG		0x00008	/* Some debugging messages */
 #define EXT4_MOUNT_DEBUG		0x00008	/* Some debugging messages */

+ 0 - 3
fs/ext4/ext4_sb.h

@@ -99,9 +99,6 @@ struct ext4_sb_info {
 	struct inode *s_buddy_cache;
 	struct inode *s_buddy_cache;
 	long s_blocks_reserved;
 	long s_blocks_reserved;
 	spinlock_t s_reserve_lock;
 	spinlock_t s_reserve_lock;
-	struct list_head s_active_transaction;
-	struct list_head s_closed_transaction;
-	struct list_head s_committed_transaction;
 	spinlock_t s_md_lock;
 	spinlock_t s_md_lock;
 	tid_t s_last_transaction;
 	tid_t s_last_transaction;
 	unsigned short *s_mb_offsets, *s_mb_maxs;
 	unsigned short *s_mb_offsets, *s_mb_maxs;

+ 76 - 67
fs/ext4/inode.c

@@ -1648,6 +1648,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 	int ret = 0, err, nr_pages, i;
 	int ret = 0, err, nr_pages, i;
 	unsigned long index, end;
 	unsigned long index, end;
 	struct pagevec pvec;
 	struct pagevec pvec;
+	long pages_skipped;
 
 
 	BUG_ON(mpd->next_page <= mpd->first_page);
 	BUG_ON(mpd->next_page <= mpd->first_page);
 	pagevec_init(&pvec, 0);
 	pagevec_init(&pvec, 0);
@@ -1655,20 +1656,30 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 	end = mpd->next_page - 1;
 	end = mpd->next_page - 1;
 
 
 	while (index <= end) {
 	while (index <= end) {
-		/* XXX: optimize tail */
-		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+		/*
+		 * We can use PAGECACHE_TAG_DIRTY lookup here because
+		 * even though we have cleared the dirty flag on the page
+		 * We still keep the page in the radix tree with tag
+		 * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io.
+		 * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback
+		 * which is called via the below writepage callback.
+		 */
+		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+					PAGECACHE_TAG_DIRTY,
+					min(end - index,
+					(pgoff_t)PAGEVEC_SIZE-1) + 1);
 		if (nr_pages == 0)
 		if (nr_pages == 0)
 			break;
 			break;
 		for (i = 0; i < nr_pages; i++) {
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
 			struct page *page = pvec.pages[i];
 
 
-			index = page->index;
-			if (index > end)
-				break;
-			index++;
-
+			pages_skipped = mpd->wbc->pages_skipped;
 			err = mapping->a_ops->writepage(page, mpd->wbc);
 			err = mapping->a_ops->writepage(page, mpd->wbc);
-			if (!err)
+			if (!err && (pages_skipped == mpd->wbc->pages_skipped))
+				/*
+				 * have successfully written the page
+				 * without skipping the same
+				 */
 				mpd->pages_written++;
 				mpd->pages_written++;
 			/*
 			/*
 			 * In error case, we have to continue because
 			 * In error case, we have to continue because
@@ -2104,7 +2115,6 @@ static int mpage_da_writepages(struct address_space *mapping,
 			       struct writeback_control *wbc,
 			       struct writeback_control *wbc,
 			       struct mpage_da_data *mpd)
 			       struct mpage_da_data *mpd)
 {
 {
-	long to_write;
 	int ret;
 	int ret;
 
 
 	if (!mpd->get_block)
 	if (!mpd->get_block)
@@ -2119,19 +2129,18 @@ static int mpage_da_writepages(struct address_space *mapping,
 	mpd->pages_written = 0;
 	mpd->pages_written = 0;
 	mpd->retval = 0;
 	mpd->retval = 0;
 
 
-	to_write = wbc->nr_to_write;
-
 	ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
 	ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
-
 	/*
 	/*
 	 * Handle last extent of pages
 	 * Handle last extent of pages
 	 */
 	 */
 	if (!mpd->io_done && mpd->next_page != mpd->first_page) {
 	if (!mpd->io_done && mpd->next_page != mpd->first_page) {
 		if (mpage_da_map_blocks(mpd) == 0)
 		if (mpage_da_map_blocks(mpd) == 0)
 			mpage_da_submit_io(mpd);
 			mpage_da_submit_io(mpd);
-	}
 
 
-	wbc->nr_to_write = to_write - mpd->pages_written;
+		mpd->io_done = 1;
+		ret = MPAGE_DA_EXTENT_TAIL;
+	}
+	wbc->nr_to_write -= mpd->pages_written;
 	return ret;
 	return ret;
 }
 }
 
 
@@ -2360,12 +2369,14 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
 static int ext4_da_writepages(struct address_space *mapping,
 static int ext4_da_writepages(struct address_space *mapping,
 			      struct writeback_control *wbc)
 			      struct writeback_control *wbc)
 {
 {
+	pgoff_t	index;
+	int range_whole = 0;
 	handle_t *handle = NULL;
 	handle_t *handle = NULL;
-	loff_t range_start = 0;
 	struct mpage_da_data mpd;
 	struct mpage_da_data mpd;
 	struct inode *inode = mapping->host;
 	struct inode *inode = mapping->host;
+	int no_nrwrite_index_update;
+	long pages_written = 0, pages_skipped;
 	int needed_blocks, ret = 0, nr_to_writebump = 0;
 	int needed_blocks, ret = 0, nr_to_writebump = 0;
-	long to_write, pages_skipped = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
 	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
 
 
 	/*
 	/*
@@ -2385,23 +2396,26 @@ static int ext4_da_writepages(struct address_space *mapping,
 		nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
 		nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
 		wbc->nr_to_write = sbi->s_mb_stream_request;
 		wbc->nr_to_write = sbi->s_mb_stream_request;
 	}
 	}
+	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+		range_whole = 1;
 
 
-	if (!wbc->range_cyclic)
-		/*
-		 * If range_cyclic is not set force range_cont
-		 * and save the old writeback_index
-		 */
-		wbc->range_cont = 1;
-
-	range_start =  wbc->range_start;
-	pages_skipped = wbc->pages_skipped;
+	if (wbc->range_cyclic)
+		index = mapping->writeback_index;
+	else
+		index = wbc->range_start >> PAGE_CACHE_SHIFT;
 
 
 	mpd.wbc = wbc;
 	mpd.wbc = wbc;
 	mpd.inode = mapping->host;
 	mpd.inode = mapping->host;
 
 
-restart_loop:
-	to_write = wbc->nr_to_write;
-	while (!ret && to_write > 0) {
+	/*
+	 * we don't want write_cache_pages to update
+	 * nr_to_write and writeback_index
+	 */
+	no_nrwrite_index_update = wbc->no_nrwrite_index_update;
+	wbc->no_nrwrite_index_update = 1;
+	pages_skipped = wbc->pages_skipped;
+
+	while (!ret && wbc->nr_to_write > 0) {
 
 
 		/*
 		/*
 		 * we  insert one extent at a time. So we need
 		 * we  insert one extent at a time. So we need
@@ -2422,48 +2436,53 @@ static int ext4_da_writepages(struct address_space *mapping,
 			dump_stack();
 			dump_stack();
 			goto out_writepages;
 			goto out_writepages;
 		}
 		}
-		to_write -= wbc->nr_to_write;
-
 		mpd.get_block = ext4_da_get_block_write;
 		mpd.get_block = ext4_da_get_block_write;
 		ret = mpage_da_writepages(mapping, wbc, &mpd);
 		ret = mpage_da_writepages(mapping, wbc, &mpd);
 
 
 		ext4_journal_stop(handle);
 		ext4_journal_stop(handle);
 
 
-		if (mpd.retval == -ENOSPC)
+		if (mpd.retval == -ENOSPC) {
+			/* commit the transaction which would
+			 * free blocks released in the transaction
+			 * and try again
+			 */
 			jbd2_journal_force_commit_nested(sbi->s_journal);
 			jbd2_journal_force_commit_nested(sbi->s_journal);
-
-		/* reset the retry count */
-		if (ret == MPAGE_DA_EXTENT_TAIL) {
+			wbc->pages_skipped = pages_skipped;
+			ret = 0;
+		} else if (ret == MPAGE_DA_EXTENT_TAIL) {
 			/*
 			/*
 			 * got one extent now try with
 			 * got one extent now try with
 			 * rest of the pages
 			 * rest of the pages
 			 */
 			 */
-			to_write += wbc->nr_to_write;
+			pages_written += mpd.pages_written;
+			wbc->pages_skipped = pages_skipped;
 			ret = 0;
 			ret = 0;
-		} else if (wbc->nr_to_write) {
+		} else if (wbc->nr_to_write)
 			/*
 			/*
 			 * There is no more writeout needed
 			 * There is no more writeout needed
 			 * or we requested for a noblocking writeout
 			 * or we requested for a noblocking writeout
 			 * and we found the device congested
 			 * and we found the device congested
 			 */
 			 */
-			to_write += wbc->nr_to_write;
 			break;
 			break;
-		}
-		wbc->nr_to_write = to_write;
-	}
-
-	if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
-		/* We skipped pages in this loop */
-		wbc->range_start = range_start;
-		wbc->nr_to_write = to_write +
-				wbc->pages_skipped - pages_skipped;
-		wbc->pages_skipped = pages_skipped;
-		goto restart_loop;
 	}
 	}
+	if (pages_skipped != wbc->pages_skipped)
+		printk(KERN_EMERG "This should not happen leaving %s "
+				"with nr_to_write = %ld ret = %d\n",
+				__func__, wbc->nr_to_write, ret);
+
+	/* Update index */
+	index += pages_written;
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+		/*
+		 * set the writeback_index so that range_cyclic
+		 * mode will write it back later
+		 */
+		mapping->writeback_index = index;
 
 
 out_writepages:
 out_writepages:
-	wbc->nr_to_write = to_write - nr_to_writebump;
-	wbc->range_start = range_start;
+	if (!no_nrwrite_index_update)
+		wbc->no_nrwrite_index_update = 0;
+	wbc->nr_to_write -= nr_to_writebump;
 	return ret;
 	return ret;
 }
 }
 
 
@@ -4175,7 +4194,6 @@ static int ext4_inode_blocks_set(handle_t *handle,
 	struct inode *inode = &(ei->vfs_inode);
 	struct inode *inode = &(ei->vfs_inode);
 	u64 i_blocks = inode->i_blocks;
 	u64 i_blocks = inode->i_blocks;
 	struct super_block *sb = inode->i_sb;
 	struct super_block *sb = inode->i_sb;
-	int err = 0;
 
 
 	if (i_blocks <= ~0U) {
 	if (i_blocks <= ~0U) {
 		/*
 		/*
@@ -4185,36 +4203,27 @@ static int ext4_inode_blocks_set(handle_t *handle,
 		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
 		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
 		raw_inode->i_blocks_high = 0;
 		raw_inode->i_blocks_high = 0;
 		ei->i_flags &= ~EXT4_HUGE_FILE_FL;
 		ei->i_flags &= ~EXT4_HUGE_FILE_FL;
-	} else if (i_blocks <= 0xffffffffffffULL) {
+		return 0;
+	}
+	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
+		return -EFBIG;
+
+	if (i_blocks <= 0xffffffffffffULL) {
 		/*
 		/*
 		 * i_blocks can be represented in a 48 bit variable
 		 * i_blocks can be represented in a 48 bit variable
 		 * as multiple of 512 bytes
 		 * as multiple of 512 bytes
 		 */
 		 */
-		err = ext4_update_rocompat_feature(handle, sb,
-					    EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
-		if (err)
-			goto  err_out;
-		/* i_block is stored in the split  48 bit fields */
 		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
 		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
 		raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
 		raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
 		ei->i_flags &= ~EXT4_HUGE_FILE_FL;
 		ei->i_flags &= ~EXT4_HUGE_FILE_FL;
 	} else {
 	} else {
-		/*
-		 * i_blocks should be represented in a 48 bit variable
-		 * as multiple of  file system block size
-		 */
-		err = ext4_update_rocompat_feature(handle, sb,
-					    EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
-		if (err)
-			goto  err_out;
 		ei->i_flags |= EXT4_HUGE_FILE_FL;
 		ei->i_flags |= EXT4_HUGE_FILE_FL;
 		/* i_block is stored in file system block size */
 		/* i_block is stored in file system block size */
 		i_blocks = i_blocks >> (inode->i_blkbits - 9);
 		i_blocks = i_blocks >> (inode->i_blkbits - 9);
 		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
 		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
 		raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
 		raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
 	}
 	}
-err_out:
-	return err;
+	return 0;
 }
 }
 
 
 /*
 /*

+ 135 - 128
fs/ext4/mballoc.c

@@ -2300,6 +2300,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 	}
 	}
 
 
 	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
 	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+	meta_group_info[i]->bb_free_root.rb_node = NULL;;
 
 
 #ifdef DOUBLE_CHECK
 #ifdef DOUBLE_CHECK
 	{
 	{
@@ -2522,9 +2523,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	}
 	}
 
 
 	spin_lock_init(&sbi->s_md_lock);
 	spin_lock_init(&sbi->s_md_lock);
-	INIT_LIST_HEAD(&sbi->s_active_transaction);
-	INIT_LIST_HEAD(&sbi->s_closed_transaction);
-	INIT_LIST_HEAD(&sbi->s_committed_transaction);
 	spin_lock_init(&sbi->s_bal_lock);
 	spin_lock_init(&sbi->s_bal_lock);
 
 
 	sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
 	sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
@@ -2553,6 +2551,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	ext4_mb_init_per_dev_proc(sb);
 	ext4_mb_init_per_dev_proc(sb);
 	ext4_mb_history_init(sb);
 	ext4_mb_history_init(sb);
 
 
+	sbi->s_journal->j_commit_callback = release_blocks_on_commit;
+
 	printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
 	printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
 	return 0;
 	return 0;
 }
 }
@@ -2568,7 +2568,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
 		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
 		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
 		list_del(&pa->pa_group_list);
 		list_del(&pa->pa_group_list);
 		count++;
 		count++;
-		kfree(pa);
+		kmem_cache_free(ext4_pspace_cachep, pa);
 	}
 	}
 	if (count)
 	if (count)
 		mb_debug("mballoc: %u PAs left\n", count);
 		mb_debug("mballoc: %u PAs left\n", count);
@@ -2582,15 +2582,6 @@ int ext4_mb_release(struct super_block *sb)
 	struct ext4_group_info *grinfo;
 	struct ext4_group_info *grinfo;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 
-	/* release freed, non-committed blocks */
-	spin_lock(&sbi->s_md_lock);
-	list_splice_init(&sbi->s_closed_transaction,
-			&sbi->s_committed_transaction);
-	list_splice_init(&sbi->s_active_transaction,
-			&sbi->s_committed_transaction);
-	spin_unlock(&sbi->s_md_lock);
-	ext4_mb_free_committed_blocks(sb);
-
 	if (sbi->s_group_info) {
 	if (sbi->s_group_info) {
 		for (i = 0; i < sbi->s_groups_count; i++) {
 		for (i = 0; i < sbi->s_groups_count; i++) {
 			grinfo = ext4_get_group_info(sb, i);
 			grinfo = ext4_get_group_info(sb, i);
@@ -2644,61 +2635,57 @@ int ext4_mb_release(struct super_block *sb)
 	return 0;
 	return 0;
 }
 }
 
 
-static noinline_for_stack void
-ext4_mb_free_committed_blocks(struct super_block *sb)
+/*
+ * This function is called by the jbd2 layer once the commit has finished,
+ * so we know we can free the blocks that were released with that commit.
+ */
+static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 {
 {
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	int err;
-	int i;
-	int count = 0;
-	int count2 = 0;
-	struct ext4_free_metadata *md;
+	struct super_block *sb = journal->j_private;
 	struct ext4_buddy e4b;
 	struct ext4_buddy e4b;
+	struct ext4_group_info *db;
+	int err, count = 0, count2 = 0;
+	struct ext4_free_data *entry;
+	ext4_fsblk_t discard_block;
+	struct list_head *l, *ltmp;
 
 
-	if (list_empty(&sbi->s_committed_transaction))
-		return;
-
-	/* there is committed blocks to be freed yet */
-	do {
-		/* get next array of blocks */
-		md = NULL;
-		spin_lock(&sbi->s_md_lock);
-		if (!list_empty(&sbi->s_committed_transaction)) {
-			md = list_entry(sbi->s_committed_transaction.next,
-					struct ext4_free_metadata, list);
-			list_del(&md->list);
-		}
-		spin_unlock(&sbi->s_md_lock);
-
-		if (md == NULL)
-			break;
+	list_for_each_safe(l, ltmp, &txn->t_private_list) {
+		entry = list_entry(l, struct ext4_free_data, list);
 
 
 		mb_debug("gonna free %u blocks in group %lu (0x%p):",
 		mb_debug("gonna free %u blocks in group %lu (0x%p):",
-				md->num, md->group, md);
+			 entry->count, entry->group, entry);
 
 
-		err = ext4_mb_load_buddy(sb, md->group, &e4b);
+		err = ext4_mb_load_buddy(sb, entry->group, &e4b);
 		/* we expect to find existing buddy because it's pinned */
 		/* we expect to find existing buddy because it's pinned */
 		BUG_ON(err != 0);
 		BUG_ON(err != 0);
 
 
+		db = e4b.bd_info;
 		/* there are blocks to put in buddy to make them really free */
 		/* there are blocks to put in buddy to make them really free */
-		count += md->num;
+		count += entry->count;
 		count2++;
 		count2++;
-		ext4_lock_group(sb, md->group);
-		for (i = 0; i < md->num; i++) {
-			mb_debug(" %u", md->blocks[i]);
-			mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
+		ext4_lock_group(sb, entry->group);
+		/* Take it out of per group rb tree */
+		rb_erase(&entry->node, &(db->bb_free_root));
+		mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
+
+		if (!db->bb_free_root.rb_node) {
+			/* No more items in the per group rb tree
+			 * balance refcounts from ext4_mb_free_metadata()
+			 */
+			page_cache_release(e4b.bd_buddy_page);
+			page_cache_release(e4b.bd_bitmap_page);
 		}
 		}
-		mb_debug("\n");
-		ext4_unlock_group(sb, md->group);
-
-		/* balance refcounts from ext4_mb_free_metadata() */
-		page_cache_release(e4b.bd_buddy_page);
-		page_cache_release(e4b.bd_bitmap_page);
-
-		kfree(md);
+		ext4_unlock_group(sb, entry->group);
+		discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
+			+ entry->start_blk
+			+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+		trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id,
+			   (unsigned long long) discard_block, entry->count);
+		sb_issue_discard(sb, discard_block, entry->count);
+
+		kmem_cache_free(ext4_free_ext_cachep, entry);
 		ext4_mb_release_desc(&e4b);
 		ext4_mb_release_desc(&e4b);
-
-	} while (md);
+	}
 
 
 	mb_debug("freed %u blocks in %u structures\n", count, count2);
 	mb_debug("freed %u blocks in %u structures\n", count, count2);
 }
 }
@@ -2712,6 +2699,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
 
 
 static int ext4_mb_init_per_dev_proc(struct super_block *sb)
 static int ext4_mb_init_per_dev_proc(struct super_block *sb)
 {
 {
+#ifdef CONFIG_PROC_FS
 	mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
 	mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct proc_dir_entry *proc;
 	struct proc_dir_entry *proc;
@@ -2735,10 +2723,14 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb)
 	remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
 	remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
 	remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
 	remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
 	return -ENOMEM;
 	return -ENOMEM;
+#else
+	return 0;
+#endif
 }
 }
 
 
 static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
 static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
 {
 {
+#ifdef CONFIG_PROC_FS
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 
 	if (sbi->s_proc == NULL)
 	if (sbi->s_proc == NULL)
@@ -2750,7 +2742,7 @@ static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
 	remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
 	remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
 	remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
 	remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
 	remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
 	remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
-
+#endif
 	return 0;
 	return 0;
 }
 }
 
 
@@ -2771,6 +2763,16 @@ int __init init_ext4_mballoc(void)
 		kmem_cache_destroy(ext4_pspace_cachep);
 		kmem_cache_destroy(ext4_pspace_cachep);
 		return -ENOMEM;
 		return -ENOMEM;
 	}
 	}
+
+	ext4_free_ext_cachep =
+		kmem_cache_create("ext4_free_block_extents",
+				     sizeof(struct ext4_free_data),
+				     0, SLAB_RECLAIM_ACCOUNT, NULL);
+	if (ext4_free_ext_cachep == NULL) {
+		kmem_cache_destroy(ext4_pspace_cachep);
+		kmem_cache_destroy(ext4_ac_cachep);
+		return -ENOMEM;
+	}
 	return 0;
 	return 0;
 }
 }
 
 
@@ -2779,6 +2781,7 @@ void exit_ext4_mballoc(void)
 	/* XXX: synchronize_rcu(); */
 	/* XXX: synchronize_rcu(); */
 	kmem_cache_destroy(ext4_pspace_cachep);
 	kmem_cache_destroy(ext4_pspace_cachep);
 	kmem_cache_destroy(ext4_ac_cachep);
 	kmem_cache_destroy(ext4_ac_cachep);
+	kmem_cache_destroy(ext4_free_ext_cachep);
 }
 }
 
 
 
 
@@ -4324,8 +4327,6 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 		goto out1;
 		goto out1;
 	}
 	}
 
 
-	ext4_mb_poll_new_transaction(sb, handle);
-
 	*errp = ext4_mb_initialize_context(ac, ar);
 	*errp = ext4_mb_initialize_context(ac, ar);
 	if (*errp) {
 	if (*errp) {
 		ar->len = 0;
 		ar->len = 0;
@@ -4384,35 +4385,20 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 
 
 	return block;
 	return block;
 }
 }
-static void ext4_mb_poll_new_transaction(struct super_block *sb,
-						handle_t *handle)
-{
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-
-	if (sbi->s_last_transaction == handle->h_transaction->t_tid)
-		return;
-
-	/* new transaction! time to close last one and free blocks for
-	 * committed transaction. we know that only transaction can be
-	 * active, so previos transaction can be being logged and we
-	 * know that transaction before previous is known to be already
-	 * logged. this means that now we may free blocks freed in all
-	 * transactions before previous one. hope I'm clear enough ... */
 
 
-	spin_lock(&sbi->s_md_lock);
-	if (sbi->s_last_transaction != handle->h_transaction->t_tid) {
-		mb_debug("new transaction %lu, old %lu\n",
-				(unsigned long) handle->h_transaction->t_tid,
-				(unsigned long) sbi->s_last_transaction);
-		list_splice_init(&sbi->s_closed_transaction,
-				&sbi->s_committed_transaction);
-		list_splice_init(&sbi->s_active_transaction,
-				&sbi->s_closed_transaction);
-		sbi->s_last_transaction = handle->h_transaction->t_tid;
-	}
-	spin_unlock(&sbi->s_md_lock);
-
-	ext4_mb_free_committed_blocks(sb);
+/*
+ * We can merge two free data extents only if the physical blocks
+ * are contiguous, AND the extents were freed by the same transaction,
+ * AND the blocks are associated with the same group.
+ */
+static int can_merge(struct ext4_free_data *entry1,
+			struct ext4_free_data *entry2)
+{
+	if ((entry1->t_tid == entry2->t_tid) &&
+	    (entry1->group == entry2->group) &&
+	    ((entry1->start_blk + entry1->count) == entry2->start_blk))
+		return 1;
+	return 0;
 }
 }
 
 
 static noinline_for_stack int
 static noinline_for_stack int
@@ -4422,57 +4408,80 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 	struct ext4_group_info *db = e4b->bd_info;
 	struct ext4_group_info *db = e4b->bd_info;
 	struct super_block *sb = e4b->bd_sb;
 	struct super_block *sb = e4b->bd_sb;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct ext4_free_metadata *md;
-	int i;
+	struct ext4_free_data *entry, *new_entry;
+	struct rb_node **n = &db->bb_free_root.rb_node, *node;
+	struct rb_node *parent = NULL, *new_node;
+
 
 
 	BUG_ON(e4b->bd_bitmap_page == NULL);
 	BUG_ON(e4b->bd_bitmap_page == NULL);
 	BUG_ON(e4b->bd_buddy_page == NULL);
 	BUG_ON(e4b->bd_buddy_page == NULL);
 
 
+	new_entry  = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+	new_entry->start_blk = block;
+	new_entry->group  = group;
+	new_entry->count = count;
+	new_entry->t_tid = handle->h_transaction->t_tid;
+	new_node = &new_entry->node;
+
 	ext4_lock_group(sb, group);
 	ext4_lock_group(sb, group);
-	for (i = 0; i < count; i++) {
-		md = db->bb_md_cur;
-		if (md && db->bb_tid != handle->h_transaction->t_tid) {
-			db->bb_md_cur = NULL;
-			md = NULL;
+	if (!*n) {
+		/* first free block exent. We need to
+		   protect buddy cache from being freed,
+		 * otherwise we'll refresh it from
+		 * on-disk bitmap and lose not-yet-available
+		 * blocks */
+		page_cache_get(e4b->bd_buddy_page);
+		page_cache_get(e4b->bd_bitmap_page);
+	}
+	while (*n) {
+		parent = *n;
+		entry = rb_entry(parent, struct ext4_free_data, node);
+		if (block < entry->start_blk)
+			n = &(*n)->rb_left;
+		else if (block >= (entry->start_blk + entry->count))
+			n = &(*n)->rb_right;
+		else {
+			ext4_error(sb, __func__,
+			    "Double free of blocks %d (%d %d)\n",
+			    block, entry->start_blk, entry->count);
+			return 0;
 		}
 		}
+	}
 
 
-		if (md == NULL) {
-			ext4_unlock_group(sb, group);
-			md = kmalloc(sizeof(*md), GFP_NOFS);
-			if (md == NULL)
-				return -ENOMEM;
-			md->num = 0;
-			md->group = group;
-
-			ext4_lock_group(sb, group);
-			if (db->bb_md_cur == NULL) {
-				spin_lock(&sbi->s_md_lock);
-				list_add(&md->list, &sbi->s_active_transaction);
-				spin_unlock(&sbi->s_md_lock);
-				/* protect buddy cache from being freed,
-				 * otherwise we'll refresh it from
-				 * on-disk bitmap and lose not-yet-available
-				 * blocks */
-				page_cache_get(e4b->bd_buddy_page);
-				page_cache_get(e4b->bd_bitmap_page);
-				db->bb_md_cur = md;
-				db->bb_tid = handle->h_transaction->t_tid;
-				mb_debug("new md 0x%p for group %lu\n",
-						md, md->group);
-			} else {
-				kfree(md);
-				md = db->bb_md_cur;
-			}
+	rb_link_node(new_node, parent, n);
+	rb_insert_color(new_node, &db->bb_free_root);
+
+	/* Now try to see the extent can be merged to left and right */
+	node = rb_prev(new_node);
+	if (node) {
+		entry = rb_entry(node, struct ext4_free_data, node);
+		if (can_merge(entry, new_entry)) {
+			new_entry->start_blk = entry->start_blk;
+			new_entry->count += entry->count;
+			rb_erase(node, &(db->bb_free_root));
+			spin_lock(&sbi->s_md_lock);
+			list_del(&entry->list);
+			spin_unlock(&sbi->s_md_lock);
+			kmem_cache_free(ext4_free_ext_cachep, entry);
 		}
 		}
+	}
 
 
-		BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
-		md->blocks[md->num] = block + i;
-		md->num++;
-		if (md->num == EXT4_BB_MAX_BLOCKS) {
-			/* no more space, put full container on a sb's list */
-			db->bb_md_cur = NULL;
+	node = rb_next(new_node);
+	if (node) {
+		entry = rb_entry(node, struct ext4_free_data, node);
+		if (can_merge(new_entry, entry)) {
+			new_entry->count += entry->count;
+			rb_erase(node, &(db->bb_free_root));
+			spin_lock(&sbi->s_md_lock);
+			list_del(&entry->list);
+			spin_unlock(&sbi->s_md_lock);
+			kmem_cache_free(ext4_free_ext_cachep, entry);
 		}
 		}
 	}
 	}
+	/* Add the extent to transaction's private list */
+	spin_lock(&sbi->s_md_lock);
+	list_add(&new_entry->list, &handle->h_transaction->t_private_list);
+	spin_unlock(&sbi->s_md_lock);
 	ext4_unlock_group(sb, group);
 	ext4_unlock_group(sb, group);
 	return 0;
 	return 0;
 }
 }
@@ -4500,8 +4509,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
 
 
 	*freed = 0;
 	*freed = 0;
 
 
-	ext4_mb_poll_new_transaction(sb, handle);
-
 	sbi = EXT4_SB(sb);
 	sbi = EXT4_SB(sb);
 	es = EXT4_SB(sb)->s_es;
 	es = EXT4_SB(sb)->s_es;
 	if (block < le32_to_cpu(es->s_first_data_block) ||
 	if (block < le32_to_cpu(es->s_first_data_block) ||

+ 19 - 12
fs/ext4/mballoc.h

@@ -18,6 +18,8 @@
 #include <linux/pagemap.h>
 #include <linux/pagemap.h>
 #include <linux/seq_file.h>
 #include <linux/seq_file.h>
 #include <linux/version.h>
 #include <linux/version.h>
+#include <linux/blkdev.h>
+#include <linux/marker.h>
 #include "ext4_jbd2.h"
 #include "ext4_jbd2.h"
 #include "ext4.h"
 #include "ext4.h"
 #include "group.h"
 #include "group.h"
@@ -98,23 +100,29 @@
 
 
 static struct kmem_cache *ext4_pspace_cachep;
 static struct kmem_cache *ext4_pspace_cachep;
 static struct kmem_cache *ext4_ac_cachep;
 static struct kmem_cache *ext4_ac_cachep;
+static struct kmem_cache *ext4_free_ext_cachep;
 
 
-#ifdef EXT4_BB_MAX_BLOCKS
-#undef EXT4_BB_MAX_BLOCKS
-#endif
-#define EXT4_BB_MAX_BLOCKS	30
+struct ext4_free_data {
+	/* this links the free block information from group_info */
+	struct rb_node node;
 
 
-struct ext4_free_metadata {
-	ext4_group_t group;
-	unsigned short num;
-	ext4_grpblk_t  blocks[EXT4_BB_MAX_BLOCKS];
+	/* this links the free block information from ext4_sb_info */
 	struct list_head list;
 	struct list_head list;
+
+	/* group which free block extent belongs */
+	ext4_group_t group;
+
+	/* free block extent */
+	ext4_grpblk_t start_blk;
+	ext4_grpblk_t count;
+
+	/* transaction which freed this extent */
+	tid_t	t_tid;
 };
 };
 
 
 struct ext4_group_info {
 struct ext4_group_info {
 	unsigned long	bb_state;
 	unsigned long	bb_state;
-	unsigned long	bb_tid;
-	struct ext4_free_metadata *bb_md_cur;
+	struct rb_root  bb_free_root;
 	unsigned short	bb_first_free;
 	unsigned short	bb_first_free;
 	unsigned short	bb_free;
 	unsigned short	bb_free;
 	unsigned short	bb_fragments;
 	unsigned short	bb_fragments;
@@ -261,8 +269,6 @@ struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
 
 
 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 					ext4_group_t group);
 					ext4_group_t group);
-static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *);
-static void ext4_mb_free_committed_blocks(struct super_block *);
 static void ext4_mb_return_to_preallocation(struct inode *inode,
 static void ext4_mb_return_to_preallocation(struct inode *inode,
 					struct ext4_buddy *e4b, sector_t block,
 					struct ext4_buddy *e4b, sector_t block,
 					int count);
 					int count);
@@ -270,6 +276,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *,
 			struct super_block *, struct ext4_prealloc_space *pa);
 			struct super_block *, struct ext4_prealloc_space *pa);
 static int ext4_mb_init_per_dev_proc(struct super_block *sb);
 static int ext4_mb_init_per_dev_proc(struct super_block *sb);
 static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
 static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
+static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
 
 
 
 
 static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
 static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)

+ 34 - 98
fs/ext4/super.c

@@ -374,66 +374,6 @@ void ext4_update_dynamic_rev(struct super_block *sb)
 	 */
 	 */
 }
 }
 
 
-int ext4_update_compat_feature(handle_t *handle,
-					struct super_block *sb, __u32 compat)
-{
-	int err = 0;
-	if (!EXT4_HAS_COMPAT_FEATURE(sb, compat)) {
-		err = ext4_journal_get_write_access(handle,
-				EXT4_SB(sb)->s_sbh);
-		if (err)
-			return err;
-		EXT4_SET_COMPAT_FEATURE(sb, compat);
-		sb->s_dirt = 1;
-		handle->h_sync = 1;
-		BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
-					"call ext4_journal_dirty_met adata");
-		err = ext4_journal_dirty_metadata(handle,
-				EXT4_SB(sb)->s_sbh);
-	}
-	return err;
-}
-
-int ext4_update_rocompat_feature(handle_t *handle,
-					struct super_block *sb, __u32 rocompat)
-{
-	int err = 0;
-	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, rocompat)) {
-		err = ext4_journal_get_write_access(handle,
-				EXT4_SB(sb)->s_sbh);
-		if (err)
-			return err;
-		EXT4_SET_RO_COMPAT_FEATURE(sb, rocompat);
-		sb->s_dirt = 1;
-		handle->h_sync = 1;
-		BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
-					"call ext4_journal_dirty_met adata");
-		err = ext4_journal_dirty_metadata(handle,
-				EXT4_SB(sb)->s_sbh);
-	}
-	return err;
-}
-
-int ext4_update_incompat_feature(handle_t *handle,
-					struct super_block *sb, __u32 incompat)
-{
-	int err = 0;
-	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, incompat)) {
-		err = ext4_journal_get_write_access(handle,
-				EXT4_SB(sb)->s_sbh);
-		if (err)
-			return err;
-		EXT4_SET_INCOMPAT_FEATURE(sb, incompat);
-		sb->s_dirt = 1;
-		handle->h_sync = 1;
-		BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
-					"call ext4_journal_dirty_met adata");
-		err = ext4_journal_dirty_metadata(handle,
-				EXT4_SB(sb)->s_sbh);
-	}
-	return err;
-}
-
 /*
 /*
  * Open the external journal device
  * Open the external journal device
  */
  */
@@ -904,7 +844,7 @@ static const struct export_operations ext4_export_ops = {
 enum {
 enum {
 	Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
 	Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
 	Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
 	Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
-	Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
+	Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
 	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
 	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
 	Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
 	Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
 	Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
 	Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
@@ -915,7 +855,7 @@ enum {
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
 	Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
 	Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
-	Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+	Opt_stripe, Opt_delalloc, Opt_nodelalloc,
 	Opt_inode_readahead_blks
 	Opt_inode_readahead_blks
 };
 };
 
 
@@ -933,8 +873,6 @@ static const match_table_t tokens = {
 	{Opt_err_panic, "errors=panic"},
 	{Opt_err_panic, "errors=panic"},
 	{Opt_err_ro, "errors=remount-ro"},
 	{Opt_err_ro, "errors=remount-ro"},
 	{Opt_nouid32, "nouid32"},
 	{Opt_nouid32, "nouid32"},
-	{Opt_nocheck, "nocheck"},
-	{Opt_nocheck, "check=none"},
 	{Opt_debug, "debug"},
 	{Opt_debug, "debug"},
 	{Opt_oldalloc, "oldalloc"},
 	{Opt_oldalloc, "oldalloc"},
 	{Opt_orlov, "orlov"},
 	{Opt_orlov, "orlov"},
@@ -973,8 +911,6 @@ static const match_table_t tokens = {
 	{Opt_extents, "extents"},
 	{Opt_extents, "extents"},
 	{Opt_noextents, "noextents"},
 	{Opt_noextents, "noextents"},
 	{Opt_i_version, "i_version"},
 	{Opt_i_version, "i_version"},
-	{Opt_mballoc, "mballoc"},
-	{Opt_nomballoc, "nomballoc"},
 	{Opt_stripe, "stripe=%u"},
 	{Opt_stripe, "stripe=%u"},
 	{Opt_resize, "resize"},
 	{Opt_resize, "resize"},
 	{Opt_delalloc, "delalloc"},
 	{Opt_delalloc, "delalloc"},
@@ -1073,9 +1009,6 @@ static int parse_options(char *options, struct super_block *sb,
 		case Opt_nouid32:
 		case Opt_nouid32:
 			set_opt(sbi->s_mount_opt, NO_UID32);
 			set_opt(sbi->s_mount_opt, NO_UID32);
 			break;
 			break;
-		case Opt_nocheck:
-			clear_opt(sbi->s_mount_opt, CHECK);
-			break;
 		case Opt_debug:
 		case Opt_debug:
 			set_opt(sbi->s_mount_opt, DEBUG);
 			set_opt(sbi->s_mount_opt, DEBUG);
 			break;
 			break;
@@ -1618,14 +1551,14 @@ static int ext4_check_descriptors(struct super_block *sb)
 		if (block_bitmap < first_block || block_bitmap > last_block) {
 		if (block_bitmap < first_block || block_bitmap > last_block) {
 			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
 			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
 			       "Block bitmap for group %lu not in group "
 			       "Block bitmap for group %lu not in group "
-			       "(block %llu)!", i, block_bitmap);
+			       "(block %llu)!\n", i, block_bitmap);
 			return 0;
 			return 0;
 		}
 		}
 		inode_bitmap = ext4_inode_bitmap(sb, gdp);
 		inode_bitmap = ext4_inode_bitmap(sb, gdp);
 		if (inode_bitmap < first_block || inode_bitmap > last_block) {
 		if (inode_bitmap < first_block || inode_bitmap > last_block) {
 			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
 			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
 			       "Inode bitmap for group %lu not in group "
 			       "Inode bitmap for group %lu not in group "
-			       "(block %llu)!", i, inode_bitmap);
+			       "(block %llu)!\n", i, inode_bitmap);
 			return 0;
 			return 0;
 		}
 		}
 		inode_table = ext4_inode_table(sb, gdp);
 		inode_table = ext4_inode_table(sb, gdp);
@@ -1633,7 +1566,7 @@ static int ext4_check_descriptors(struct super_block *sb)
 		    inode_table + sbi->s_itb_per_group - 1 > last_block) {
 		    inode_table + sbi->s_itb_per_group - 1 > last_block) {
 			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
 			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
 			       "Inode table for group %lu not in group "
 			       "Inode table for group %lu not in group "
-			       "(block %llu)!", i, inode_table);
+			       "(block %llu)!\n", i, inode_table);
 			return 0;
 			return 0;
 		}
 		}
 		spin_lock(sb_bgl_lock(sbi, i));
 		spin_lock(sb_bgl_lock(sbi, i));
@@ -1778,13 +1711,13 @@ static void ext4_orphan_cleanup(struct super_block *sb,
  *
  *
  * Note, this does *not* consider any metadata overhead for vfs i_blocks.
  * Note, this does *not* consider any metadata overhead for vfs i_blocks.
  */
  */
-static loff_t ext4_max_size(int blkbits)
+static loff_t ext4_max_size(int blkbits, int has_huge_files)
 {
 {
 	loff_t res;
 	loff_t res;
 	loff_t upper_limit = MAX_LFS_FILESIZE;
 	loff_t upper_limit = MAX_LFS_FILESIZE;
 
 
 	/* small i_blocks in vfs inode? */
 	/* small i_blocks in vfs inode? */
-	if (sizeof(blkcnt_t) < sizeof(u64)) {
+	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
 		/*
 		/*
 		 * CONFIG_LSF is not enabled implies the inode
 		 * CONFIG_LSF is not enabled implies the inode
 		 * i_block represent total blocks in 512 bytes
 		 * i_block represent total blocks in 512 bytes
@@ -1814,7 +1747,7 @@ static loff_t ext4_max_size(int blkbits)
  * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
  * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
  * We need to be 1 filesystem block less than the 2^48 sector limit.
  * We need to be 1 filesystem block less than the 2^48 sector limit.
  */
  */
-static loff_t ext4_max_bitmap_size(int bits)
+static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
 {
 {
 	loff_t res = EXT4_NDIR_BLOCKS;
 	loff_t res = EXT4_NDIR_BLOCKS;
 	int meta_blocks;
 	int meta_blocks;
@@ -1827,11 +1760,11 @@ static loff_t ext4_max_bitmap_size(int bits)
 	 * total number of  512 bytes blocks of the file
 	 * total number of  512 bytes blocks of the file
 	 */
 	 */
 
 
-	if (sizeof(blkcnt_t) < sizeof(u64)) {
+	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
 		/*
 		/*
-		 * CONFIG_LSF is not enabled implies the inode
-		 * i_block represent total blocks in 512 bytes
-		 * 32 == size of vfs inode i_blocks * 8
+		 * !has_huge_files or CONFIG_LSF is not enabled
+		 * implies the inode i_block represent total blocks in
+		 * 512 bytes 32 == size of vfs inode i_blocks * 8
 		 */
 		 */
 		upper_limit = (1LL << 32) - 1;
 		upper_limit = (1LL << 32) - 1;
 
 
@@ -1940,7 +1873,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	int blocksize;
 	int blocksize;
 	int db_count;
 	int db_count;
 	int i;
 	int i;
-	int needs_recovery;
+	int needs_recovery, has_huge_files;
 	__le32 features;
 	__le32 features;
 	__u64 blocks_count;
 	__u64 blocks_count;
 	int err;
 	int err;
@@ -2081,7 +2014,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		       sb->s_id, le32_to_cpu(features));
 		       sb->s_id, le32_to_cpu(features));
 		goto failed_mount;
 		goto failed_mount;
 	}
 	}
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+	has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				    EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
+	if (has_huge_files) {
 		/*
 		/*
 		 * Large file size enabled file system can only be
 		 * Large file size enabled file system can only be
 		 * mount if kernel is build with CONFIG_LSF
 		 * mount if kernel is build with CONFIG_LSF
@@ -2131,8 +2066,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		}
 		}
 	}
 	}
 
 
-	sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits);
-	sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits);
+	sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
+						      has_huge_files);
+	sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
 
 
 	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
 	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
 		sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
 		sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
@@ -2456,6 +2392,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 			"available.\n");
 			"available.\n");
 	}
 	}
 
 
+	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+		printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
+				"requested data journaling mode\n");
+		clear_opt(sbi->s_mount_opt, DELALLOC);
+	} else if (test_opt(sb, DELALLOC))
+		printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
+
+	ext4_ext_init(sb);
+	err = ext4_mb_init(sb, needs_recovery);
+	if (err) {
+		printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
+		       err);
+		goto failed_mount4;
+	}
+
 	/*
 	/*
 	 * akpm: core read_super() calls in here with the superblock locked.
 	 * akpm: core read_super() calls in here with the superblock locked.
 	 * That deadlocks, because orphan cleanup needs to lock the superblock
 	 * That deadlocks, because orphan cleanup needs to lock the superblock
@@ -2475,21 +2426,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	       test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
 	       test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
 	       "writeback");
 	       "writeback");
 
 
-	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
-		printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
-				"requested data journaling mode\n");
-		clear_opt(sbi->s_mount_opt, DELALLOC);
-	} else if (test_opt(sb, DELALLOC))
-		printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
-
-	ext4_ext_init(sb);
-	err = ext4_mb_init(sb, needs_recovery);
-	if (err) {
-		printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
-		       err);
-		goto failed_mount4;
-	}
-
 	lock_kernel();
 	lock_kernel();
 	return 0;
 	return 0;
 
 

+ 3 - 0
fs/jbd2/commit.c

@@ -995,6 +995,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	}
 	}
 	spin_unlock(&journal->j_list_lock);
 	spin_unlock(&journal->j_list_lock);
 
 
+	if (journal->j_commit_callback)
+		journal->j_commit_callback(journal, commit_transaction);
+
 	trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
 	trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
 		   journal->j_devname, commit_transaction->t_tid,
 		   journal->j_devname, commit_transaction->t_tid,
 		   journal->j_tail_sequence);
 		   journal->j_tail_sequence);

+ 1 - 0
fs/jbd2/transaction.c

@@ -52,6 +52,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
 	transaction->t_expires = jiffies + journal->j_commit_interval;
 	transaction->t_expires = jiffies + journal->j_commit_interval;
 	spin_lock_init(&transaction->t_handle_lock);
 	spin_lock_init(&transaction->t_handle_lock);
 	INIT_LIST_HEAD(&transaction->t_inode_list);
 	INIT_LIST_HEAD(&transaction->t_inode_list);
+	INIT_LIST_HEAD(&transaction->t_private_list);
 
 
 	/* Set up the commit timer for the new transaction. */
 	/* Set up the commit timer for the new transaction. */
 	journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
 	journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);

+ 9 - 0
include/linux/jbd2.h

@@ -641,6 +641,11 @@ struct transaction_s
 	 */
 	 */
 	int t_handle_count;
 	int t_handle_count;
 
 
+	/*
+	 * For use by the filesystem to store fs-specific data
+	 * structures associated with the transaction
+	 */
+	struct list_head	t_private_list;
 };
 };
 
 
 struct transaction_run_stats_s {
 struct transaction_run_stats_s {
@@ -935,6 +940,10 @@ struct journal_s
 
 
 	pid_t			j_last_sync_writer;
 	pid_t			j_last_sync_writer;
 
 
+	/* This function is called when a transaction is closed */
+	void			(*j_commit_callback)(journal_t *,
+						     transaction_t *);
+
 	/*
 	/*
 	 * Journal statistics
 	 * Journal statistics
 	 */
 	 */

+ 9 - 1
include/linux/writeback.h

@@ -63,7 +63,15 @@ struct writeback_control {
 	unsigned for_writepages:1;	/* This is a writepages() call */
 	unsigned for_writepages:1;	/* This is a writepages() call */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
 	unsigned more_io:1;		/* more io to be dispatched */
 	unsigned more_io:1;		/* more io to be dispatched */
-	unsigned range_cont:1;
+	/*
+	 * write_cache_pages() won't update wbc->nr_to_write and
+	 * mapping->writeback_index if no_nrwrite_index_update
+	 * is set.  write_cache_pages() may write more than we
+	 * requested and we want to make sure nr_to_write and
+	 * writeback_index are updated in a consistent manner
+	 * so we use a single control to update them
+	 */
+	unsigned no_nrwrite_index_update:1;
 };
 };
 
 
 /*
 /*

+ 7 - 5
mm/page-writeback.c

@@ -876,6 +876,7 @@ int write_cache_pages(struct address_space *mapping,
 	pgoff_t end;		/* Inclusive */
 	pgoff_t end;		/* Inclusive */
 	int scanned = 0;
 	int scanned = 0;
 	int range_whole = 0;
 	int range_whole = 0;
+	long nr_to_write = wbc->nr_to_write;
 
 
 	if (wbc->nonblocking && bdi_write_congested(bdi)) {
 	if (wbc->nonblocking && bdi_write_congested(bdi)) {
 		wbc->encountered_congestion = 1;
 		wbc->encountered_congestion = 1;
@@ -939,7 +940,7 @@ int write_cache_pages(struct address_space *mapping,
 				unlock_page(page);
 				unlock_page(page);
 				ret = 0;
 				ret = 0;
 			}
 			}
-			if (ret || (--(wbc->nr_to_write) <= 0))
+			if (ret || (--nr_to_write <= 0))
 				done = 1;
 				done = 1;
 			if (wbc->nonblocking && bdi_write_congested(bdi)) {
 			if (wbc->nonblocking && bdi_write_congested(bdi)) {
 				wbc->encountered_congestion = 1;
 				wbc->encountered_congestion = 1;
@@ -958,11 +959,12 @@ int write_cache_pages(struct address_space *mapping,
 		index = 0;
 		index = 0;
 		goto retry;
 		goto retry;
 	}
 	}
-	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
-		mapping->writeback_index = index;
+	if (!wbc->no_nrwrite_index_update) {
+		if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
+			mapping->writeback_index = index;
+		wbc->nr_to_write = nr_to_write;
+	}
 
 
-	if (wbc->range_cont)
-		wbc->range_start = index << PAGE_CACHE_SHIFT;
 	return ret;
 	return ret;
 }
 }
 EXPORT_SYMBOL(write_cache_pages);
 EXPORT_SYMBOL(write_cache_pages);