@@ -623,10 +623,11 @@ _xfs_buf_read(
 	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
 	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
 
-	xfs_buf_iorequest(bp);
-	if (flags & XBF_ASYNC)
+	if (flags & XBF_ASYNC) {
+		xfs_buf_submit(bp);
 		return 0;
-	return xfs_buf_iowait(bp);
+	}
+	return xfs_buf_submit_wait(bp);
 }
 
 xfs_buf_t *
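
The hunk above splits _xfs_buf_read() into two submission modes: an XBF_ASYNC read is handed to xfs_buf_submit(), which takes over the buffer lock and the caller's reference, while a synchronous read goes through xfs_buf_submit_wait() and hands the IO error straight back to the caller, who still owns the buffer. A minimal userspace sketch of that ownership contract follows (illustrative only; the demo_* names are ours, not XFS code):

#include <stdbool.h>
#include <stdlib.h>

struct demo_buf { int error; };

/* async path: submission consumes the caller's handle, so completion
 * (not the caller) is responsible for releasing the buffer later */
static void demo_submit(struct demo_buf *bp)
{
	free(bp);			/* stands in for release at IO completion */
}

/* sync path: the caller keeps the buffer and gets the IO error back */
static int demo_submit_wait(struct demo_buf *bp)
{
	return bp->error;
}

static int demo_read(struct demo_buf *bp, bool async)
{
	if (async) {
		demo_submit(bp);
		return 0;		/* must not touch bp after this point */
	}
	return demo_submit_wait(bp);	/* bp still owned and locked by the caller */
}

int main(void)
{
	struct demo_buf *bp = calloc(1, sizeof(*bp));

	return bp ? demo_read(bp, true) : 1;
}
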
@@ -687,34 +688,39 @@ xfs_buf_readahead_map(
  * Read an uncached buffer from disk. Allocates and returns a locked
  * buffer containing the disk contents or nothing.
  */
-struct xfs_buf *
+int
 xfs_buf_read_uncached(
 	struct xfs_buftarg	*target,
 	xfs_daddr_t		daddr,
 	size_t			numblks,
 	int			flags,
+	struct xfs_buf		**bpp,
 	const struct xfs_buf_ops *ops)
 {
 	struct xfs_buf		*bp;
 
+	*bpp = NULL;
+
 	bp = xfs_buf_get_uncached(target, numblks, flags);
 	if (!bp)
-		return NULL;
+		return -ENOMEM;
 
 	/* set up the buffer for a read IO */
 	ASSERT(bp->b_map_count == 1);
-	bp->b_bn = daddr;
+	bp->b_bn = XFS_BUF_DADDR_NULL;	/* always null for uncached buffers */
 	bp->b_maps[0].bm_bn = daddr;
 	bp->b_flags |= XBF_READ;
 	bp->b_ops = ops;
 
-	if (XFS_FORCED_SHUTDOWN(target->bt_mount)) {
+	xfs_buf_submit_wait(bp);
+	if (bp->b_error) {
+		int	error = bp->b_error;
 		xfs_buf_relse(bp);
-		return NULL;
+		return error;
 	}
-	xfs_buf_iorequest(bp);
-	xfs_buf_iowait(bp);
-	return bp;
+
+	*bpp = bp;
+	return 0;
 }
 
 /*
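
xfs_buf_read_uncached() now returns a negative errno and hands the buffer back through the new bpp out parameter instead of signalling failure with a NULL return, so callers can tell an allocation failure (-ENOMEM) apart from a real IO error. A small runnable model of the new calling convention (the demo_* names are hypothetical, not an actual XFS call site):

#include <errno.h>
#include <stdlib.h>

struct demo_buf { int error; };

/* new-style convention: errno-style return, buffer handed back via *bpp */
static int demo_read_uncached(struct demo_buf **bpp)
{
	struct demo_buf *bp;

	*bpp = NULL;
	bp = calloc(1, sizeof(*bp));
	if (!bp)
		return -ENOMEM;		/* allocation failure, distinct from IO errors */
	if (bp->error) {		/* a failed synchronous read hands its error back */
		int error = bp->error;

		free(bp);
		return error;
	}
	*bpp = bp;
	return 0;
}

int main(void)
{
	struct demo_buf *bp;
	int error = demo_read_uncached(&bp);

	if (error)
		return 1;	/* no buffer to release on failure */
	free(bp);		/* caller owns the buffer on success */
	return 0;
}
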
@@ -998,53 +1004,56 @@ xfs_buf_wait_unpin(
  * Buffer Utility Routines
  */
 
-STATIC void
-xfs_buf_iodone_work(
-	struct work_struct	*work)
+void
+xfs_buf_ioend(
+	struct xfs_buf	*bp)
 {
-	struct xfs_buf		*bp =
-		container_of(work, xfs_buf_t, b_iodone_work);
-	bool			read = !!(bp->b_flags & XBF_READ);
+	bool		read = bp->b_flags & XBF_READ;
+
+	trace_xfs_buf_iodone(bp, _RET_IP_);
 
 	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
 
-	/* only validate buffers that were read without errors */
-	if (read && bp->b_ops && !bp->b_error && (bp->b_flags & XBF_DONE))
+	/*
+	 * Pull in IO completion errors now. We are guaranteed to be running
+	 * single threaded, so we don't need the lock to read b_io_error.
+	 */
+	if (!bp->b_error && bp->b_io_error)
+		xfs_buf_ioerror(bp, bp->b_io_error);
+
+	/* Only validate buffers that were read without errors */
+	if (read && !bp->b_error && bp->b_ops) {
+		ASSERT(!bp->b_iodone);
 		bp->b_ops->verify_read(bp);
+	}
+
+	if (!bp->b_error)
+		bp->b_flags |= XBF_DONE;
 
 	if (bp->b_iodone)
 		(*(bp->b_iodone))(bp);
 	else if (bp->b_flags & XBF_ASYNC)
 		xfs_buf_relse(bp);
-	else {
-		ASSERT(read && bp->b_ops);
+	else
 		complete(&bp->b_iowait);
-	}
 }
 
-void
-xfs_buf_ioend(
-	struct xfs_buf	*bp,
-	int		schedule)
+static void
+xfs_buf_ioend_work(
+	struct work_struct *work)
 {
-	bool		read = !!(bp->b_flags & XBF_READ);
-
-	trace_xfs_buf_iodone(bp, _RET_IP_);
+	struct xfs_buf		*bp =
+		container_of(work, xfs_buf_t, b_iodone_work);
 
-	if (bp->b_error == 0)
-		bp->b_flags |= XBF_DONE;
+	xfs_buf_ioend(bp);
+}
 
-	if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) {
-		if (schedule) {
-			INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
-			queue_work(xfslogd_workqueue, &bp->b_iodone_work);
-		} else {
-			xfs_buf_iodone_work(&bp->b_iodone_work);
-		}
-	} else {
-		bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
-		complete(&bp->b_iowait);
-	}
+void
+xfs_buf_ioend_async(
+	struct xfs_buf	*bp)
+{
+	INIT_WORK(&bp->b_iodone_work, xfs_buf_ioend_work);
+	queue_work(xfslogd_workqueue, &bp->b_iodone_work);
 }
 
 void
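
With this split, xfs_buf_ioend() becomes the single completion routine: it folds b_io_error into b_error, verifies a clean read, sets XBF_DONE, and then either runs the iodone callback, releases an async buffer, or completes the waiter. xfs_buf_ioend_async() merely bounces that work onto the xfslogd workqueue so it runs in process context instead of from bio completion. A userspace model of that deferral, with a thread standing in for the workqueue (illustrative only; demo_* names are not XFS code):

#include <pthread.h>
#include <stdio.h>

struct demo_buf { int error; };

/* the single completion routine: error folding, verification, XBF_DONE and
 * the final iodone/release/complete step would all happen here */
static void demo_ioend(struct demo_buf *bp)
{
	printf("completion ran, error=%d\n", bp->error);
}

/* models xfs_buf_ioend_work(): the deferred work just calls the one routine */
static void *demo_ioend_work(void *arg)
{
	demo_ioend(arg);
	return NULL;
}

int main(void)
{
	struct demo_buf bp = { 0 };
	pthread_t worker;	/* stands in for the xfslogd workqueue */

	/* models xfs_buf_ioend_async(): INIT_WORK() + queue_work() */
	pthread_create(&worker, NULL, demo_ioend_work, &bp);
	pthread_join(worker, NULL);
	return 0;
}
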
@@ -1067,96 +1076,6 @@ xfs_buf_ioerror_alert(
 		(__uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length);
 }
 
-/*
- * Called when we want to stop a buffer from getting written or read.
- * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
- * so that the proper iodone callbacks get called.
- */
-STATIC int
-xfs_bioerror(
-	xfs_buf_t *bp)
-{
-#ifdef XFSERRORDEBUG
-	ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
-#endif
-
-	/*
-	 * No need to wait until the buffer is unpinned, we aren't flushing it.
-	 */
-	xfs_buf_ioerror(bp, -EIO);
-
-	/*
-	 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
-	 */
-	XFS_BUF_UNREAD(bp);
-	XFS_BUF_UNDONE(bp);
-	xfs_buf_stale(bp);
-
-	xfs_buf_ioend(bp, 0);
-
-	return -EIO;
-}
-
-/*
- * Same as xfs_bioerror, except that we are releasing the buffer
- * here ourselves, and avoiding the xfs_buf_ioend call.
- * This is meant for userdata errors; metadata bufs come with
- * iodone functions attached, so that we can track down errors.
- */
-int
-xfs_bioerror_relse(
-	struct xfs_buf	*bp)
-{
-	int64_t		fl = bp->b_flags;
-	/*
-	 * No need to wait until the buffer is unpinned.
-	 * We aren't flushing it.
-	 *
-	 * chunkhold expects B_DONE to be set, whether
-	 * we actually finish the I/O or not. We don't want to
-	 * change that interface.
-	 */
-	XFS_BUF_UNREAD(bp);
-	XFS_BUF_DONE(bp);
-	xfs_buf_stale(bp);
-	bp->b_iodone = NULL;
-	if (!(fl & XBF_ASYNC)) {
-		/*
-		 * Mark b_error and B_ERROR _both_.
-		 * Lot's of chunkcache code assumes that.
-		 * There's no reason to mark error for
-		 * ASYNC buffers.
-		 */
-		xfs_buf_ioerror(bp, -EIO);
-		complete(&bp->b_iowait);
-	} else {
-		xfs_buf_relse(bp);
-	}
-
-	return -EIO;
-}
-
-STATIC int
-xfs_bdstrat_cb(
-	struct xfs_buf	*bp)
-{
-	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
-		trace_xfs_bdstrat_shut(bp, _RET_IP_);
-		/*
-		 * Metadata write that didn't get logged but
-		 * written delayed anyway. These aren't associated
-		 * with a transaction, and can be ignored.
-		 */
-		if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
-			return xfs_bioerror_relse(bp);
-		else
-			return xfs_bioerror(bp);
-	}
-
-	xfs_buf_iorequest(bp);
-	return 0;
-}
-
 int
 xfs_bwrite(
 	struct xfs_buf	*bp)
@@ -1166,11 +1085,10 @@ xfs_bwrite(
 	ASSERT(xfs_buf_islocked(bp));
 
 	bp->b_flags |= XBF_WRITE;
-	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL);
+	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
+			 XBF_WRITE_FAIL | XBF_DONE);
 
-	xfs_bdstrat_cb(bp);
-
-	error = xfs_buf_iowait(bp);
+	error = xfs_buf_submit_wait(bp);
 	if (error) {
 		xfs_force_shutdown(bp->b_target->bt_mount,
 				   SHUTDOWN_META_IO_ERROR);
@@ -1178,15 +1096,6 @@ xfs_bwrite(
 	return error;
 }
 
-STATIC void
-_xfs_buf_ioend(
-	xfs_buf_t	*bp,
-	int		schedule)
-{
-	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
-		xfs_buf_ioend(bp, schedule);
-}
-
 STATIC void
 xfs_buf_bio_end_io(
 	struct bio		*bio,
@@ -1198,13 +1107,18 @@ xfs_buf_bio_end_io(
 	 * don't overwrite existing errors - otherwise we can lose errors on
 	 * buffers that require multiple bios to complete.
 	 */
-	if (!bp->b_error)
-		xfs_buf_ioerror(bp, error);
+	if (error) {
+		spin_lock(&bp->b_lock);
+		if (!bp->b_io_error)
+			bp->b_io_error = error;
+		spin_unlock(&bp->b_lock);
+	}
 
 	if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
 		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
 
-	_xfs_buf_ioend(bp, 1);
+	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
+		xfs_buf_ioend_async(bp);
 	bio_put(bio);
 }
 
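
Because a buffer can be built from several bios that complete concurrently, the bio end_io handler above no longer writes bp->b_error directly: it records only the first failure in the new b_io_error field under b_lock, and xfs_buf_ioend(), which runs single threaded, later folds that value into b_error. A compilable userspace fragment of the "first error wins" logic (a model with hypothetical demo_* names, not the kernel code):

#include <pthread.h>

struct demo_buf {
	pthread_mutex_t	lock;		/* stands in for bp->b_lock */
	int		io_error;	/* stands in for bp->b_io_error */
};

/* called from each "bio" completion, possibly in parallel */
static void demo_record_bio_error(struct demo_buf *bp, int error)
{
	if (!error)
		return;
	pthread_mutex_lock(&bp->lock);
	if (!bp->io_error)		/* keep the first error only */
		bp->io_error = error;
	pthread_mutex_unlock(&bp->lock);
}

/* later, in the single-threaded completion path */
static int demo_fold_io_error(struct demo_buf *bp, int b_error)
{
	if (!b_error && bp->io_error)
		b_error = bp->io_error;
	return b_error;
}

int main(void)
{
	struct demo_buf bp = { PTHREAD_MUTEX_INITIALIZER, 0 };

	demo_record_bio_error(&bp, -5);		/* first failure (modelling -EIO) sticks */
	demo_record_bio_error(&bp, -28);	/* later failures are ignored */
	return demo_fold_io_error(&bp, 0) == -5 ? 0 : 1;
}
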
@@ -1283,7 +1197,7 @@ next_chunk:
 	} else {
 		/*
 		 * This is guaranteed not to be the last io reference count
-		 * because the caller (xfs_buf_iorequest) holds a count itself.
+		 * because the caller (xfs_buf_submit) holds a count itself.
 		 */
 		atomic_dec(&bp->b_io_remaining);
 		xfs_buf_ioerror(bp, -EIO);
@@ -1373,53 +1287,131 @@ _xfs_buf_ioapply(
 	blk_finish_plug(&plug);
 }
 
+/*
+ * Asynchronous IO submission path. This transfers the buffer lock ownership and
+ * the current reference to the IO. It is not safe to reference the buffer after
+ * a call to this function unless the caller holds an additional reference
+ * itself.
+ */
 void
-xfs_buf_iorequest(
-	xfs_buf_t		*bp)
+xfs_buf_submit(
+	struct xfs_buf	*bp)
 {
-	trace_xfs_buf_iorequest(bp, _RET_IP_);
+	trace_xfs_buf_submit(bp, _RET_IP_);
 
 	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
+	ASSERT(bp->b_flags & XBF_ASYNC);
+
+	/* on shutdown we stale and complete the buffer immediately */
+	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+		xfs_buf_ioerror(bp, -EIO);
+		bp->b_flags &= ~XBF_DONE;
+		xfs_buf_stale(bp);
+		xfs_buf_ioend(bp);
+		return;
+	}
 
 	if (bp->b_flags & XBF_WRITE)
 		xfs_buf_wait_unpin(bp);
+
+	/* clear the internal error state to avoid spurious errors */
+	bp->b_io_error = 0;
+
+	/*
+	 * The caller's reference is released during I/O completion.
+	 * This occurs some time after the last b_io_remaining reference is
+	 * released, so after we drop our Io reference we have to have some
+	 * other reference to ensure the buffer doesn't go away from underneath
+	 * us. Take a direct reference to ensure we have safe access to the
+	 * buffer until we are finished with it.
+	 */
 	xfs_buf_hold(bp);
 
 	/*
-	 * Set the count to 1 initially, this will stop an I/O
-	 * completion callout which happens before we have started
-	 * all the I/O from calling xfs_buf_ioend too early.
+	 * Set the count to 1 initially, this will stop an I/O completion
+	 * callout which happens before we have started all the I/O from calling
+	 * xfs_buf_ioend too early.
 	 */
 	atomic_set(&bp->b_io_remaining, 1);
 	_xfs_buf_ioapply(bp);
+
 	/*
-	 * If _xfs_buf_ioapply failed, we'll get back here with
-	 * only the reference we took above. _xfs_buf_ioend will
-	 * drop it to zero, so we'd better not queue it for later,
-	 * or we'll free it before it's done.
+	 * If _xfs_buf_ioapply failed, we can get back here with only the IO
+	 * reference we took above. If we drop it to zero, run completion so
+	 * that we don't return to the caller with completion still pending.
 	 */
-	_xfs_buf_ioend(bp, bp->b_error ? 0 : 1);
+	if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
+		if (bp->b_error)
+			xfs_buf_ioend(bp);
+		else
+			xfs_buf_ioend_async(bp);
+	}
 
 	xfs_buf_rele(bp);
+	/* Note: it is not safe to reference bp now we've dropped our ref */
 }
 
 /*
- * Waits for I/O to complete on the buffer supplied. It returns immediately if
- * no I/O is pending or there is already a pending error on the buffer, in which
- * case nothing will ever complete. It returns the I/O error code, if any, or
- * 0 if there was no error.
+ * Synchronous buffer IO submission path, read or write.
  */
 int
-xfs_buf_iowait(
-	xfs_buf_t		*bp)
+xfs_buf_submit_wait(
+	struct xfs_buf	*bp)
 {
-	trace_xfs_buf_iowait(bp, _RET_IP_);
+	int		error;
 
-	if (!bp->b_error)
-		wait_for_completion(&bp->b_iowait);
+	trace_xfs_buf_submit_wait(bp, _RET_IP_);
+
+	ASSERT(!(bp->b_flags & (_XBF_DELWRI_Q | XBF_ASYNC)));
+
+	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+		xfs_buf_ioerror(bp, -EIO);
+		xfs_buf_stale(bp);
+		bp->b_flags &= ~XBF_DONE;
+		return -EIO;
+	}
+
+	if (bp->b_flags & XBF_WRITE)
+		xfs_buf_wait_unpin(bp);
+
+	/* clear the internal error state to avoid spurious errors */
+	bp->b_io_error = 0;
+
+	/*
+	 * For synchronous IO, the IO does not inherit the submitters reference
+	 * count, nor the buffer lock. Hence we cannot release the reference we
+	 * are about to take until we've waited for all IO completion to occur,
+	 * including any xfs_buf_ioend_async() work that may be pending.
+	 */
+	xfs_buf_hold(bp);
+
+	/*
+	 * Set the count to 1 initially, this will stop an I/O completion
+	 * callout which happens before we have started all the I/O from calling
+	 * xfs_buf_ioend too early.
+	 */
+	atomic_set(&bp->b_io_remaining, 1);
+	_xfs_buf_ioapply(bp);
+
+	/*
+	 * make sure we run completion synchronously if it raced with us and is
+	 * already complete.
+	 */
+	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
+		xfs_buf_ioend(bp);
 
+	/* wait for completion before gathering the error from the buffer */
+	trace_xfs_buf_iowait(bp, _RET_IP_);
+	wait_for_completion(&bp->b_iowait);
 	trace_xfs_buf_iowait_done(bp, _RET_IP_);
-	return bp->b_error;
+	error = bp->b_error;
+
+	/*
+	 * all done now, we can release the hold that keeps the buffer
+	 * referenced for the entire IO.
+	 */
+	xfs_buf_rele(bp);
+	return error;
 }
 
 xfs_caddr_t
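
Both submission paths above rely on the same counting protocol: b_io_remaining starts at 1 so that no bio completion can finish the buffer while _xfs_buf_ioapply() is still issuing bios; the submitter then drops its own count, and whichever context brings the count to zero runs (or queues) xfs_buf_ioend(). The extra xfs_buf_hold()/xfs_buf_rele() pair keeps the buffer alive across that window. A small runnable userspace model of the counting, with threads standing in for bios (demo_* names are ours, not XFS code):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define DEMO_NR_BIOS	4

struct demo_buf {
	atomic_int	io_remaining;	/* stands in for bp->b_io_remaining */
	atomic_bool	done;		/* guards: completion must run exactly once */
};

static void demo_ioend(struct demo_buf *bp)
{
	bool expected = false;

	if (atomic_compare_exchange_strong(&bp->done, &expected, true))
		printf("completion ran exactly once\n");
}

/* models xfs_buf_bio_end_io(): the last one out runs the completion */
static void *demo_bio_end_io(void *arg)
{
	struct demo_buf *bp = arg;

	if (atomic_fetch_sub(&bp->io_remaining, 1) == 1)
		demo_ioend(bp);
	return NULL;
}

int main(void)
{
	struct demo_buf bp = { .io_remaining = 1, .done = false };
	pthread_t bios[DEMO_NR_BIOS];

	/* submission: the initial count of 1 holds completion off until
	 * every "bio" below has been issued */
	for (int i = 0; i < DEMO_NR_BIOS; i++) {
		atomic_fetch_add(&bp.io_remaining, 1);
		pthread_create(&bios[i], NULL, demo_bio_end_io, &bp);
	}

	/* the submitter drops its count; it only runs the completion itself
	 * if every bio has already finished */
	if (atomic_fetch_sub(&bp.io_remaining, 1) == 1)
		demo_ioend(&bp);

	for (int i = 0; i < DEMO_NR_BIOS; i++)
		pthread_join(bios[i], NULL);
	return 0;
}

Built with something like cc -std=c11 -pthread, the completion message prints exactly once regardless of how the threads interleave, which is the property the initial count of 1 exists to guarantee.
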
@@ -1813,13 +1805,19 @@ __xfs_buf_delwri_submit(
 	blk_start_plug(&plug);
 	list_for_each_entry_safe(bp, n, io_list, b_list) {
 		bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
-		bp->b_flags |= XBF_WRITE;
+		bp->b_flags |= XBF_WRITE | XBF_ASYNC;
 
-		if (!wait) {
-			bp->b_flags |= XBF_ASYNC;
+		/*
+		 * we do all Io submission async. This means if we need to wait
+		 * for IO completion we need to take an extra reference so the
+		 * buffer is still valid on the other side.
+		 */
+		if (wait)
+			xfs_buf_hold(bp);
+		else
 			list_del_init(&bp->b_list);
-		}
-		xfs_bdstrat_cb(bp);
+
+		xfs_buf_submit(bp);
 	}
 	blk_finish_plug(&plug);
 
@@ -1866,7 +1864,10 @@ xfs_buf_delwri_submit(
 		bp = list_first_entry(&io_list, struct xfs_buf, b_list);
 
 		list_del_init(&bp->b_list);
-		error2 = xfs_buf_iowait(bp);
+
+		/* locking the buffer will wait for async IO completion. */
+		xfs_buf_lock(bp);
+		error2 = bp->b_error;
 		xfs_buf_relse(bp);
 		if (!error)
 			error = error2;
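
The delwri path now submits every buffer asynchronously; when the caller needs to wait, it takes an extra hold and then simply locks the buffer, because the in-flight IO owns the buffer lock and completion processing releases it only once the IO is done. Since the buffer lock is a semaphore (b_sema), a POSIX semaphore gives a faithful little model of "lock to wait" (illustrative only; demo_* names are not XFS code):

#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>
#include <unistd.h>

static sem_t demo_b_sema;	/* stands in for bp->b_sema, the buffer lock */

/* models the async IO: it holds the lock and releases it at completion */
static void *demo_async_io(void *arg)
{
	(void)arg;
	sleep(1);			/* the IO takes a while */
	sem_post(&demo_b_sema);		/* release at IO completion */
	return NULL;
}

int main(void)
{
	pthread_t io;

	/* value 0: the lock is already held by the in-flight IO */
	sem_init(&demo_b_sema, 0, 0);
	pthread_create(&io, NULL, demo_async_io, NULL);

	sem_wait(&demo_b_sema);		/* like xfs_buf_lock(): blocks until IO done */
	printf("IO complete, waiter now holds the buffer lock\n");

	pthread_join(io, NULL);
	sem_destroy(&demo_b_sema);
	return 0;
}
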