@@ -1849,6 +1849,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
+{
+	int ret;
+
+	atomic_inc(&BTRFS_I(inode)->sync_writers);
+	ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+			     &BTRFS_I(inode)->runtime_flags))
+		ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+	atomic_dec(&BTRFS_I(inode)->sync_writers);
+
+	return ret;
+}
+
 /*
  * fsync call for both files and directories. This logs the inode into
  * the tree log instead of forcing full commits whenever possible.
@@ -1878,30 +1892,64 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	 * multi-task, and make the performance up. See
 	 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
 	 */
-	atomic_inc(&BTRFS_I(inode)->sync_writers);
-	ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
-	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-			     &BTRFS_I(inode)->runtime_flags))
-		ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
-	atomic_dec(&BTRFS_I(inode)->sync_writers);
+	ret = start_ordered_ops(inode, start, end);
 	if (ret)
 		return ret;
 
 	mutex_lock(&inode->i_mutex);
-
-	/*
-	 * We flush the dirty pages again to avoid some dirty pages in the
-	 * range being left.
-	 */
 	atomic_inc(&root->log_batch);
 	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
 			     &BTRFS_I(inode)->runtime_flags);
+	/*
+	 * We might have had more pages made dirty after calling
+	 * start_ordered_ops and before acquiring the inode's i_mutex.
+	 */
 	if (full_sync) {
+		/*
+		 * For a full sync, we need to make sure any ordered operations
+		 * start and finish before we start logging the inode, so that
+		 * all extents are persisted and the respective file extent
+		 * items are in the fs/subvol btree.
+		 */
 		ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
-		if (ret) {
-			mutex_unlock(&inode->i_mutex);
-			goto out;
-		}
+	} else {
+		/*
+		 * Start any new ordered operations before starting to log the
+		 * inode. We will wait for them to finish in btrfs_sync_log().
+		 *
+		 * Right before acquiring the inode's mutex, we might have new
+		 * writes dirtying pages, which won't immediately start the
+		 * respective ordered operations - that is done through the
+		 * fill_delalloc callbacks invoked from the writepage and
+		 * writepages address space operations. So make sure we start
+		 * all ordered operations before starting to log our inode. Not
+		 * doing this means that while logging the inode, writeback
+		 * could start and invoke writepage/writepages, which would call
+		 * the fill_delalloc callbacks (cow_file_range,
+		 * submit_compressed_extents). These callbacks add first an
+		 * extent map to the modified list of extents and then create
+		 * the respective ordered operation, which means in
+		 * tree-log.c:btrfs_log_inode() we might capture all existing
+		 * ordered operations (with btrfs_get_logged_extents()) before
+		 * the fill_delalloc callback adds its ordered operation, and by
+		 * the time we visit the modified list of extent maps (with
+		 * btrfs_log_changed_extents()), we see and process the extent
+		 * map they created. We then use the extent map to construct a
+		 * file extent item for logging without waiting for the
+		 * respective ordered operation to finish - this file extent
+		 * item points to a disk location that might not have yet been
+		 * written to, containing random data - so after a crash a log
+		 * replay will make our inode have file extent items that point
+		 * to disk locations containing invalid data, as we returned
+		 * success to userspace without waiting for the respective
+		 * ordered operation to finish, because it wasn't captured by
+		 * btrfs_get_logged_extents().
+		 */
+		ret = start_ordered_ops(inode, start, end);
+	}
+	if (ret) {
+		mutex_unlock(&inode->i_mutex);
+		goto out;
 	}
 	atomic_inc(&root->log_batch);
 
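The guarantee this patch protects is the one applications rely on from
fsync(2): once it returns success, the written data must survive a crash,
including a crash followed by btrfs log replay. Below is a minimal userspace
sketch of the affected call sequence; the mount point and file path are
hypothetical, and this is an illustration rather than a reproducer taken
from the patch. Before the fix, the fast fsync path could return success
here while the log held a file extent item pointing at not-yet-written disk
space, so a post-crash log replay could expose stale data.

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        const char *path = "/mnt/btrfs/testfile"; /* hypothetical mount */
        char buf[4096];
        int fd;

        memset(buf, 0xab, sizeof(buf));

        fd = open(path, O_CREAT | O_WRONLY | O_TRUNC, 0644);
        if (fd < 0) {
            perror("open");
            return EXIT_FAILURE;
        }

        /*
         * Dirty some pages. btrfs defers extent allocation (delalloc)
         * until writeback, which is when the ordered extent is created.
         */
        if (write(fd, buf, sizeof(buf)) != (ssize_t)sizeof(buf)) {
            perror("write");
            close(fd);
            return EXIT_FAILURE;
        }

        /*
         * A successful fsync() must mean the data survives a crash. The
         * race fixed above could log an extent not yet written to disk.
         */
        if (fsync(fd) != 0) {
            perror("fsync");
            close(fd);
            return EXIT_FAILURE;
        }

        close(fd);
        printf("data written and fsync'ed\n");
        return EXIT_SUCCESS;
    }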