|
@@ -66,6 +66,13 @@ struct btrfs_iget_args {
|
|
struct btrfs_root *root;
|
|
struct btrfs_root *root;
|
|
};
|
|
};
|
|
|
|
|
|
|
|
+struct btrfs_dio_data { /* per-call direct IO write state; btrfs_direct_IO() publishes it via current->journal_info */
|
|
|
|
+ u64 outstanding_extents;
|
|
|
|
+ u64 reserve; /* starts as the sector-rounded write size, reduced as ranges are processed; a non-zero remainder is released by btrfs_direct_IO() */
|
|
|
|
+ u64 unsubmitted_oe_range_start; /* start of the file range whose ordered extents have no bio submitted yet */
|
|
|
|
+ u64 unsubmitted_oe_range_end; /* end of that range; start == end means the range is empty */
|
|
|
|
+};
|
|
|
|
+
|
|
static const struct inode_operations btrfs_dir_inode_operations;
|
|
static const struct inode_operations btrfs_dir_inode_operations;
|
|
static const struct inode_operations btrfs_symlink_inode_operations;
|
|
static const struct inode_operations btrfs_symlink_inode_operations;
|
|
static const struct inode_operations btrfs_dir_ro_inode_operations;
|
|
static const struct inode_operations btrfs_dir_ro_inode_operations;
|
|
@@ -7408,25 +7415,21 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
|
|
btrfs_start_ordered_extent(inode, ordered, 1);
|
|
btrfs_start_ordered_extent(inode, ordered, 1);
|
|
btrfs_put_ordered_extent(ordered);
|
|
btrfs_put_ordered_extent(ordered);
|
|
} else {
|
|
} else {
|
|
- /* Screw you mmap */
|
|
|
|
- ret = btrfs_fdatawrite_range(inode, lockstart, lockend);
|
|
|
|
- if (ret)
|
|
|
|
- break;
|
|
|
|
- ret = filemap_fdatawait_range(inode->i_mapping,
|
|
|
|
- lockstart,
|
|
|
|
- lockend);
|
|
|
|
- if (ret)
|
|
|
|
- break;
|
|
|
|
-
|
|
|
|
/*
|
|
/*
|
|
- * If we found a page that couldn't be invalidated just
|
|
|
|
- * fall back to buffered.
|
|
|
|
|
|
+ * We could trigger writeback for this range (and wait
|
|
|
|
+ * for it to complete) and then invalidate the pages for
|
|
|
|
+ * this range (through invalidate_inode_pages2_range()),
|
|
|
|
+ * but that can lead us to a deadlock with a concurrent
|
|
|
|
+ * call to readpages() (a buffered read or a defrag call
|
|
|
|
+ * triggered a readahead) on a page lock due to an
|
|
|
|
+ * ordered dio extent we created before but did not yet
|
|
|
|
+ * have a corresponding bio submitted (hence it cannot
|
|
|
|
+ * complete), which makes readpages() wait for that
|
|
|
|
+ * ordered extent to complete while holding a lock on
|
|
|
|
+ * that page.
|
|
*/
|
|
*/
|
|
- ret = invalidate_inode_pages2_range(inode->i_mapping,
|
|
|
|
- lockstart >> PAGE_CACHE_SHIFT,
|
|
|
|
- lockend >> PAGE_CACHE_SHIFT);
|
|
|
|
- if (ret)
|
|
|
|
- break;
|
|
|
|
|
|
+ ret = -ENOTBLK;
|
|
|
|
+ break;
|
|
}
|
|
}
|
|
|
|
|
|
cond_resched();
|
|
cond_resched();
|
|
@@ -7482,11 +7485,6 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
|
|
return em;
|
|
return em;
|
|
}
|
|
}
|
|
|
|
|
|
-struct btrfs_dio_data {
|
|
|
|
- u64 outstanding_extents;
|
|
|
|
- u64 reserve;
|
|
|
|
-};
|
|
|
|
-
|
|
|
|
static void adjust_dio_outstanding_extents(struct inode *inode,
|
|
static void adjust_dio_outstanding_extents(struct inode *inode,
|
|
struct btrfs_dio_data *dio_data,
|
|
struct btrfs_dio_data *dio_data,
|
|
const u64 len)
|
|
const u64 len)
|
|
@@ -7670,6 +7668,7 @@ unlock:
|
|
btrfs_free_reserved_data_space(inode, start, len);
|
|
btrfs_free_reserved_data_space(inode, start, len);
|
|
WARN_ON(dio_data->reserve < len);
|
|
WARN_ON(dio_data->reserve < len);
|
|
dio_data->reserve -= len;
|
|
dio_data->reserve -= len;
|
|
|
|
+ dio_data->unsubmitted_oe_range_end = start + len;
|
|
current->journal_info = dio_data;
|
|
current->journal_info = dio_data;
|
|
}
|
|
}
|
|
|
|
|
|
@@ -7992,22 +7991,22 @@ static void btrfs_endio_direct_read(struct bio *bio)
|
|
bio_put(bio);
|
|
bio_put(bio);
|
|
}
|
|
}
|
|
|
|
|
|
-static void btrfs_endio_direct_write(struct bio *bio)
|
|
|
|
|
|
+static void btrfs_endio_direct_write_update_ordered(struct inode *inode,
|
|
|
|
+ const u64 offset,
|
|
|
|
+ const u64 bytes,
|
|
|
|
+ const int uptodate)
|
|
{
|
|
{
|
|
- struct btrfs_dio_private *dip = bio->bi_private;
|
|
|
|
- struct inode *inode = dip->inode;
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
struct btrfs_ordered_extent *ordered = NULL;
|
|
struct btrfs_ordered_extent *ordered = NULL;
|
|
- u64 ordered_offset = dip->logical_offset;
|
|
|
|
- u64 ordered_bytes = dip->bytes;
|
|
|
|
- struct bio *dio_bio;
|
|
|
|
|
|
+ u64 ordered_offset = offset;
|
|
|
|
+ u64 ordered_bytes = bytes;
|
|
int ret;
|
|
int ret;
|
|
|
|
|
|
again:
|
|
again:
|
|
ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
|
|
ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
|
|
&ordered_offset,
|
|
&ordered_offset,
|
|
ordered_bytes,
|
|
ordered_bytes,
|
|
- !bio->bi_error);
|
|
|
|
|
|
+ uptodate);
|
|
if (!ret)
|
|
if (!ret)
|
|
goto out_test;
|
|
goto out_test;
|
|
|
|
|
|
@@ -8020,13 +8019,22 @@ out_test:
|
|
* our bio might span multiple ordered extents. If we haven't
|
|
* our bio might span multiple ordered extents. If we haven't
|
|
* completed the accounting for the whole dio, go back and try again
|
|
* completed the accounting for the whole dio, go back and try again
|
|
*/
|
|
*/
|
|
- if (ordered_offset < dip->logical_offset + dip->bytes) {
|
|
|
|
- ordered_bytes = dip->logical_offset + dip->bytes -
|
|
|
|
- ordered_offset;
|
|
|
|
|
|
+ if (ordered_offset < offset + bytes) {
|
|
|
|
+ ordered_bytes = offset + bytes - ordered_offset;
|
|
ordered = NULL;
|
|
ordered = NULL;
|
|
goto again;
|
|
goto again;
|
|
}
|
|
}
|
|
- dio_bio = dip->dio_bio;
|
|
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+static void btrfs_endio_direct_write(struct bio *bio)
|
|
|
|
+{
|
|
|
|
+ struct btrfs_dio_private *dip = bio->bi_private;
|
|
|
|
+ struct bio *dio_bio = dip->dio_bio;
|
|
|
|
+
|
|
|
|
+ btrfs_endio_direct_write_update_ordered(dip->inode,
|
|
|
|
+ dip->logical_offset,
|
|
|
|
+ dip->bytes,
|
|
|
|
+ !bio->bi_error);
|
|
|
|
|
|
kfree(dip);
|
|
kfree(dip);
|
|
|
|
|
|
@@ -8334,6 +8342,21 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
|
|
dip->subio_endio = btrfs_subio_endio_read;
|
|
dip->subio_endio = btrfs_subio_endio_read;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+ /*
|
|
|
|
+ * Reset the range for unsubmitted ordered extents (to a 0 length range)
|
|
|
|
+ * even if we fail to submit a bio, because in such a case we do the
|
|
|
|
+ * corresponding error handling below and it must not be done a second
|
|
|
|
+ * time by btrfs_direct_IO().
|
|
|
|
+ */
|
|
|
|
+ if (write) {
|
|
|
|
+ struct btrfs_dio_data *dio_data = current->journal_info;
|
|
|
|
+
|
|
|
|
+ dio_data->unsubmitted_oe_range_end = dip->logical_offset +
|
|
|
|
+ dip->bytes;
|
|
|
|
+ dio_data->unsubmitted_oe_range_start =
|
|
|
|
+ dio_data->unsubmitted_oe_range_end;
|
|
|
|
+ }
|
|
|
|
+
|
|
ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
|
|
ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
|
|
if (!ret)
|
|
if (!ret)
|
|
return;
|
|
return;
|
|
@@ -8362,24 +8385,15 @@ free_ordered:
|
|
dip = NULL;
|
|
dip = NULL;
|
|
io_bio = NULL;
|
|
io_bio = NULL;
|
|
} else {
|
|
} else {
|
|
- if (write) {
|
|
|
|
- struct btrfs_ordered_extent *ordered;
|
|
|
|
-
|
|
|
|
- ordered = btrfs_lookup_ordered_extent(inode,
|
|
|
|
- file_offset);
|
|
|
|
- set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
|
|
|
|
- /*
|
|
|
|
- * Decrements our ref on the ordered extent and removes
|
|
|
|
- * the ordered extent from the inode's ordered tree,
|
|
|
|
- * doing all the proper resource cleanup such as for the
|
|
|
|
- * reserved space and waking up any waiters for this
|
|
|
|
- * ordered extent (through btrfs_remove_ordered_extent).
|
|
|
|
- */
|
|
|
|
- btrfs_finish_ordered_io(ordered);
|
|
|
|
- } else {
|
|
|
|
|
|
+ if (write)
|
|
|
|
+ btrfs_endio_direct_write_update_ordered(inode,
|
|
|
|
+ file_offset,
|
|
|
|
+ dio_bio->bi_iter.bi_size,
|
|
|
|
+ 0);
|
|
|
|
+ else
|
|
unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
|
|
unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
|
|
file_offset + dio_bio->bi_iter.bi_size - 1);
|
|
file_offset + dio_bio->bi_iter.bi_size - 1);
|
|
- }
|
|
|
|
|
|
+
|
|
dio_bio->bi_error = -EIO;
|
|
dio_bio->bi_error = -EIO;
|
|
/*
|
|
/*
|
|
* Releases and cleans up our dio_bio, no need to bio_put()
|
|
* Releases and cleans up our dio_bio, no need to bio_put()
|
|
@@ -8479,6 +8493,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
|
|
* originally calculated. Abuse current->journal_info for this.
|
|
* originally calculated. Abuse current->journal_info for this.
|
|
*/
|
|
*/
|
|
dio_data.reserve = round_up(count, root->sectorsize);
|
|
dio_data.reserve = round_up(count, root->sectorsize);
|
|
|
|
+ dio_data.unsubmitted_oe_range_start = (u64)offset;
|
|
|
|
+ dio_data.unsubmitted_oe_range_end = (u64)offset;
|
|
current->journal_info = &dio_data;
|
|
current->journal_info = &dio_data;
|
|
} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
|
|
} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
|
|
&BTRFS_I(inode)->runtime_flags)) {
|
|
&BTRFS_I(inode)->runtime_flags)) {
|
|
@@ -8497,6 +8513,19 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
|
|
if (dio_data.reserve)
|
|
if (dio_data.reserve)
|
|
btrfs_delalloc_release_space(inode, offset,
|
|
btrfs_delalloc_release_space(inode, offset,
|
|
dio_data.reserve);
|
|
dio_data.reserve);
|
|
|
|
+ /*
|
|
|
|
+ * On error we might have left some ordered extents
|
|
|
|
+ * without submitting corresponding bios for them, so
|
|
|
|
+ * clean them up to avoid other tasks getting them
|
|
|
|
+ * and waiting for them to complete forever.
|
|
|
|
+ */
|
|
|
|
+ if (dio_data.unsubmitted_oe_range_start <
|
|
|
|
+ dio_data.unsubmitted_oe_range_end)
|
|
|
|
+ btrfs_endio_direct_write_update_ordered(inode,
|
|
|
|
+ dio_data.unsubmitted_oe_range_start,
|
|
|
|
+ dio_data.unsubmitted_oe_range_end -
|
|
|
|
+ dio_data.unsubmitted_oe_range_start,
|
|
|
|
+ 0);
|
|
} else if (ret >= 0 && (size_t)ret < count)
|
|
} else if (ret >= 0 && (size_t)ret < count)
|
|
btrfs_delalloc_release_space(inode, offset,
|
|
btrfs_delalloc_release_space(inode, offset,
|
|
count - (size_t)ret);
|
|
count - (size_t)ret);
|