@@ -1849,6 +1849,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
+{
+	int ret;
+
+	atomic_inc(&BTRFS_I(inode)->sync_writers);
+	ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+			     &BTRFS_I(inode)->runtime_flags))
+		ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+	atomic_dec(&BTRFS_I(inode)->sync_writers);
+
+	return ret;
+}
+
 /*
  * fsync call for both files and directories. This logs the inode into
  * the tree log instead of forcing full commits whenever possible.
@@ -1878,30 +1892,64 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	 * multi-task, and make the performance up. See
 	 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
 	 */
-	atomic_inc(&BTRFS_I(inode)->sync_writers);
-	ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
-	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-			     &BTRFS_I(inode)->runtime_flags))
-		ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
-	atomic_dec(&BTRFS_I(inode)->sync_writers);
+	ret = start_ordered_ops(inode, start, end);
 	if (ret)
 		return ret;
 
 	mutex_lock(&inode->i_mutex);
-
-	/*
-	 * We flush the dirty pages again to avoid some dirty pages in the
-	 * range being left.
-	 */
 	atomic_inc(&root->log_batch);
 	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
 			     &BTRFS_I(inode)->runtime_flags);
+	/*
+	 * We might have had more pages made dirty after calling
+	 * start_ordered_ops and before acquiring the inode's i_mutex.
+	 */
 	if (full_sync) {
+		/*
+		 * For a full sync, we need to make sure any ordered operations
+		 * start and finish before we start logging the inode, so that
+		 * all extents are persisted and the respective file extent
+		 * items are in the fs/subvol btree.
+		 */
 		ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
-		if (ret) {
-			mutex_unlock(&inode->i_mutex);
-			goto out;
-		}
+	} else {
+		/*
+		 * Start any new ordered operations before starting to log the
+		 * inode. We will wait for them to finish in btrfs_sync_log().
+		 *
+		 * Right before acquiring the inode's mutex, we might have new
+		 * writes dirtying pages, which won't immediately start the
+		 * respective ordered operations - that is done through the
+		 * fill_delalloc callbacks invoked from the writepage and
+		 * writepages address space operations. So make sure we start
+		 * all ordered operations before starting to log our inode. Not
+		 * doing this means that while logging the inode, writeback
+		 * could start and invoke writepage/writepages, which would call
+		 * the fill_delalloc callbacks (cow_file_range,
+		 * submit_compressed_extents). These callbacks add first an
+		 * extent map to the modified list of extents and then create
+		 * the respective ordered operation, which means in
+		 * tree-log.c:btrfs_log_inode() we might capture all existing
+		 * ordered operations (with btrfs_get_logged_extents()) before
+		 * the fill_delalloc callback adds its ordered operation, and by
+		 * the time we visit the modified list of extent maps (with
+		 * btrfs_log_changed_extents()), we see and process the extent
+		 * map they created. We then use the extent map to construct a
+		 * file extent item for logging without waiting for the
+		 * respective ordered operation to finish - this file extent
+		 * item points to a disk location that might not have yet been
+		 * written to, containing random data - so after a crash a log
+		 * replay will make our inode have file extent items that point
+		 * to disk locations containing invalid data, as we returned
+		 * success to userspace without waiting for the respective
+		 * ordered operation to finish, because it wasn't captured by
+		 * btrfs_get_logged_extents().
+		 */
+		ret = start_ordered_ops(inode, start, end);
+	}
+	if (ret) {
+		mutex_unlock(&inode->i_mutex);
+		goto out;
 	}
 	atomic_inc(&root->log_batch);
 
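The guarantee this patch protects is the one applications rely on from
fsync(2): once it returns success, the written data must survive a crash,
including a crash followed by btrfs log replay. Below is a minimal userspace
sketch of the affected call sequence; the mount point and file path are
hypothetical, and this is an illustration rather than a reproducer taken
from the patch. Before the fix, the fast fsync path could return success
here while the log held a file extent item pointing at not-yet-written disk
space, so a post-crash log replay could expose stale data.

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        const char *path = "/mnt/btrfs/testfile"; /* hypothetical mount */
        char buf[4096];
        int fd;

        memset(buf, 0xab, sizeof(buf));

        fd = open(path, O_CREAT | O_WRONLY | O_TRUNC, 0644);
        if (fd < 0) {
            perror("open");
            return EXIT_FAILURE;
        }

        /*
         * Dirty some pages. btrfs defers extent allocation (delalloc)
         * until writeback, which is when the ordered extent is created.
         */
        if (write(fd, buf, sizeof(buf)) != (ssize_t)sizeof(buf)) {
            perror("write");
            close(fd);
            return EXIT_FAILURE;
        }

        /*
         * A successful fsync() must mean the data survives a crash. The
         * race fixed above could log an extent not yet written to disk.
         */
        if (fsync(fd) != 0) {
            perror("fsync");
            close(fd);
            return EXIT_FAILURE;
        }

        close(fd);
        printf("data written and fsync'ed\n");
        return EXIT_SUCCESS;
    }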