@@ -2068,53 +2068,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	atomic_inc(&root->log_batch);
 	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
			     &BTRFS_I(inode)->runtime_flags);
+
 	/*
-	 * We might have have had more pages made dirty after calling
-	 * start_ordered_ops and before acquiring the inode's i_mutex.
+	 * We have to do this here to avoid the priority inversion of waiting on
+	 * IO of a lower priority task while holding a transaction open.
 	 */
-	if (full_sync) {
-		/*
-		 * For a full sync, we need to make sure any ordered operations
-		 * start and finish before we start logging the inode, so that
-		 * all extents are persisted and the respective file extent
-		 * items are in the fs/subvol btree.
-		 */
-		ret = btrfs_wait_ordered_range(inode, start, len);
-	} else {
-		/*
-		 * Start any new ordered operations before starting to log the
-		 * inode. We will wait for them to finish in btrfs_sync_log().
-		 *
-		 * Right before acquiring the inode's mutex, we might have new
-		 * writes dirtying pages, which won't immediately start the
-		 * respective ordered operations - that is done through the
-		 * fill_delalloc callbacks invoked from the writepage and
-		 * writepages address space operations. So make sure we start
-		 * all ordered operations before starting to log our inode. Not
-		 * doing this means that while logging the inode, writeback
-		 * could start and invoke writepage/writepages, which would call
-		 * the fill_delalloc callbacks (cow_file_range,
-		 * submit_compressed_extents). These callbacks add first an
-		 * extent map to the modified list of extents and then create
-		 * the respective ordered operation, which means in
-		 * tree-log.c:btrfs_log_inode() we might capture all existing
-		 * ordered operations (with btrfs_get_logged_extents()) before
-		 * the fill_delalloc callback adds its ordered operation, and by
-		 * the time we visit the modified list of extent maps (with
-		 * btrfs_log_changed_extents()), we see and process the extent
-		 * map they created. We then use the extent map to construct a
-		 * file extent item for logging without waiting for the
-		 * respective ordered operation to finish - this file extent
-		 * item points to a disk location that might not have yet been
-		 * written to, containing random data - so after a crash a log
-		 * replay will make our inode have file extent items that point
-		 * to disk locations containing invalid data, as we returned
-		 * success to userspace without waiting for the respective
-		 * ordered operation to finish, because it wasn't captured by
-		 * btrfs_get_logged_extents().
-		 */
-		ret = start_ordered_ops(inode, start, end);
-	}
+	ret = btrfs_wait_ordered_range(inode, start, len);
 	if (ret) {
 		inode_unlock(inode);
 		goto out;
@@ -2239,13 +2198,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 				goto out;
 			}
 		}
-		if (!full_sync) {
-			ret = btrfs_wait_ordered_range(inode, start, len);
-			if (ret) {
-				btrfs_end_transaction(trans);
-				goto out;
-			}
-		}
 		ret = btrfs_commit_transaction(trans);
 	} else {
 		ret = btrfs_end_transaction(trans);
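The sketch below (illustrative only; fsync_flush_and_wait() is a made-up name, not something this patch adds) shows the ordering the change enforces in btrfs_sync_file(): start writeback for the range, wait for the resulting ordered extents, and only then join a transaction and log the inode, so fsync never waits on lower-priority IO while a transaction is held. It also shows why the second hunk can drop the !full_sync wait before btrfs_commit_transaction(): the range has already been fully waited on up front.

/*
 * Illustrative sketch only, not part of the patch. start_ordered_ops() and
 * btrfs_wait_ordered_range() are the existing helpers seen in the hunks
 * above; this wrapper and its name are hypothetical.
 */
static int fsync_flush_and_wait(struct inode *inode, loff_t start, loff_t end)
{
	u64 len = (u64)end - (u64)start + 1;
	int ret;

	/* Kick off writeback so dirty pages turn into ordered extents. */
	ret = start_ordered_ops(inode, start, end);
	if (ret)
		return ret;

	/*
	 * Wait for those ordered extents to complete before any transaction
	 * is joined, avoiding the priority inversion the new comment describes.
	 */
	return btrfs_wait_ordered_range(inode, start, len);
}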