|
@@ -36,6 +36,10 @@
|
|
|
#include <linux/pagevec.h>
|
|
|
#include <linux/writeback.h>
|
|
|
|
|
|
+/* flags for direct write completions */
|
|
|
+#define XFS_DIO_FLAG_UNWRITTEN (1 << 0)
|
|
|
+#define XFS_DIO_FLAG_APPEND (1 << 1)
|
|
|
+
|
|
|
void
|
|
|
xfs_count_page_state(
|
|
|
struct page *page,
|
|
@@ -1238,27 +1242,8 @@ xfs_vm_releasepage(
|
|
|
}
|
|
|
|
|
|
/*
|
|
|
- * When we map a DIO buffer, we may need to attach an ioend that describes the
|
|
|
- * type of write IO we are doing. This passes to the completion function the
|
|
|
- * operations it needs to perform. If the mapping is for an overwrite wholly
|
|
|
- * within the EOF then we don't need an ioend and so we don't allocate one.
|
|
|
- * This avoids the unnecessary overhead of allocating and freeing ioends for
|
|
|
- * workloads that don't require transactions on IO completion.
|
|
|
- *
|
|
|
- * If we get multiple mappings in a single IO, we might be mapping different
|
|
|
- * types. But because the direct IO can only have a single private pointer, we
|
|
|
- * need to ensure that:
|
|
|
- *
|
|
|
- * a) i) the ioend spans the entire region of unwritten mappings; or
|
|
|
- * ii) the ioend spans all the mappings that cross or are beyond EOF; and
|
|
|
- * b) if it contains unwritten extents, it is *permanently* marked as such
|
|
|
- *
|
|
|
- * We could do this by chaining ioends like buffered IO does, but we only
|
|
|
- * actually get one IO completion callback from the direct IO, and that spans
|
|
|
- * the entire IO regardless of how many mappings and IOs are needed to complete
|
|
|
- * the DIO. There is only going to be one reference to the ioend and its life
|
|
|
- * cycle is constrained by the DIO completion code. hence we don't need
|
|
|
- * reference counting here.
|
|
|
+ * When we map a DIO buffer, we may need to pass flags to
|
|
|
+ * xfs_end_io_direct_write to tell it what kind of write IO we are doing.
|
|
|
*
|
|
|
* Note that for DIO, an IO to the highest supported file block offset (i.e.
|
|
|
* 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
|
|
@@ -1266,68 +1251,26 @@ xfs_vm_releasepage(
|
|
|
* extending the file size. We won't know for sure until IO completion is run
|
|
|
* and the actual max write offset is communicated to the IO completion
|
|
|
* routine.
|
|
|
- *
|
|
|
- * For DAX page faults, we are preparing to never see unwritten extents here,
|
|
|
- * nor should we ever extend the inode size. Hence we will soon have nothing to
|
|
|
- * do here for this case, ensuring we don't have to provide an IO completion
|
|
|
- * callback to free an ioend that we don't actually need for a fault into the
|
|
|
- * page at offset (2^63 - 1FSB) bytes.
|
|
|
*/
|
|
|
-
|
|
|
static void
|
|
|
xfs_map_direct(
|
|
|
struct inode *inode,
|
|
|
struct buffer_head *bh_result,
|
|
|
struct xfs_bmbt_irec *imap,
|
|
|
- xfs_off_t offset,
|
|
|
- bool dax_fault)
|
|
|
+ xfs_off_t offset)
|
|
|
{
|
|
|
- struct xfs_ioend *ioend;
|
|
|
+ uintptr_t *flags = (uintptr_t *)&bh_result->b_private;
|
|
|
xfs_off_t size = bh_result->b_size;
|
|
|
- int type;
|
|
|
-
|
|
|
- if (ISUNWRITTEN(imap))
|
|
|
- type = XFS_IO_UNWRITTEN;
|
|
|
- else
|
|
|
- type = XFS_IO_OVERWRITE;
|
|
|
-
|
|
|
- trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
|
|
|
|
|
|
- if (dax_fault) {
|
|
|
- ASSERT(type == XFS_IO_OVERWRITE);
|
|
|
- trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
|
|
|
- imap);
|
|
|
- return;
|
|
|
- }
|
|
|
-
|
|
|
- if (bh_result->b_private) {
|
|
|
- ioend = bh_result->b_private;
|
|
|
- ASSERT(ioend->io_size > 0);
|
|
|
- ASSERT(offset >= ioend->io_offset);
|
|
|
- if (offset + size > ioend->io_offset + ioend->io_size)
|
|
|
- ioend->io_size = offset - ioend->io_offset + size;
|
|
|
-
|
|
|
- if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
|
|
|
- ioend->io_type = XFS_IO_UNWRITTEN;
|
|
|
-
|
|
|
- trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
|
|
|
- ioend->io_size, ioend->io_type,
|
|
|
- imap);
|
|
|
- } else if (type == XFS_IO_UNWRITTEN ||
|
|
|
- offset + size > i_size_read(inode) ||
|
|
|
- offset + size < 0) {
|
|
|
- ioend = xfs_alloc_ioend(inode, type);
|
|
|
- ioend->io_offset = offset;
|
|
|
- ioend->io_size = size;
|
|
|
+ trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size,
|
|
|
+ ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, imap);
|
|
|
|
|
|
- bh_result->b_private = ioend;
|
|
|
+ if (ISUNWRITTEN(imap)) {
|
|
|
+ *flags |= XFS_DIO_FLAG_UNWRITTEN;
|
|
|
+ set_buffer_defer_completion(bh_result);
|
|
|
+ } else if (offset + size > i_size_read(inode) || offset + size < 0) {
|
|
|
+ *flags |= XFS_DIO_FLAG_APPEND;
|
|
|
set_buffer_defer_completion(bh_result);
|
|
|
-
|
|
|
- trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
|
|
|
- imap);
|
|
|
- } else {
|
|
|
- trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
|
|
|
- imap);
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -1498,9 +1441,12 @@ __xfs_get_blocks(
|
|
|
if (ISUNWRITTEN(&imap))
|
|
|
set_buffer_unwritten(bh_result);
|
|
|
/* direct IO needs special help */
|
|
|
- if (create && direct)
|
|
|
- xfs_map_direct(inode, bh_result, &imap, offset,
|
|
|
- dax_fault);
|
|
|
+ if (create && direct) {
|
|
|
+ if (dax_fault)
|
|
|
+ ASSERT(!ISUNWRITTEN(&imap));
|
|
|
+ else
|
|
|
+ xfs_map_direct(inode, bh_result, &imap, offset);
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
/*
|
|
@@ -1570,42 +1516,50 @@ xfs_get_blocks_dax_fault(
|
|
|
return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
|
|
|
}
|
|
|
|
|
|
-static void
|
|
|
-__xfs_end_io_direct_write(
|
|
|
- struct inode *inode,
|
|
|
- struct xfs_ioend *ioend,
|
|
|
+/*
|
|
|
+ * Complete a direct I/O write request.
|
|
|
+ *
|
|
|
+ * xfs_map_direct passes us some flags in the private data to tell us what to
|
|
|
+ * do. If no flags are set, then the write IO is an overwrite wholly within
|
|
|
+ * the existing allocated file size and so there is nothing for us to do.
|
|
|
+ *
|
|
|
+ * Note that in this case the completion can be called in interrupt context,
|
|
|
+ * whereas if we have flags set we will always be called in task context
|
|
|
+ * (i.e. from a workqueue).
|
|
|
+ */
|
|
|
+STATIC int
|
|
|
+xfs_end_io_direct_write(
|
|
|
+ struct kiocb *iocb,
|
|
|
loff_t offset,
|
|
|
- ssize_t size)
|
|
|
+ ssize_t size,
|
|
|
+ void *private)
|
|
|
{
|
|
|
- struct xfs_mount *mp = XFS_I(inode)->i_mount;
|
|
|
+ struct inode *inode = file_inode(iocb->ki_filp);
|
|
|
+ struct xfs_inode *ip = XFS_I(inode);
|
|
|
+ struct xfs_mount *mp = ip->i_mount;
|
|
|
+ uintptr_t flags = (uintptr_t)private;
|
|
|
+ int error = 0;
|
|
|
|
|
|
- if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error)
|
|
|
- goto out_end_io;
|
|
|
+ trace_xfs_end_io_direct_write(ip, offset, size);
|
|
|
|
|
|
- /*
|
|
|
- * dio completion end_io functions are only called on writes if more
|
|
|
- * than 0 bytes was written.
|
|
|
- */
|
|
|
- ASSERT(size > 0);
|
|
|
+ if (XFS_FORCED_SHUTDOWN(mp))
|
|
|
+ return -EIO;
|
|
|
|
|
|
- /*
|
|
|
- * The ioend only maps whole blocks, while the IO may be sector aligned.
|
|
|
- * Hence the ioend offset/size may not match the IO offset/size exactly.
|
|
|
- * Because we don't map overwrites within EOF into the ioend, the offset
|
|
|
- * may not match, but only if the endio spans EOF. Either way, write
|
|
|
- * the IO sizes into the ioend so that completion processing does the
|
|
|
- * right thing.
|
|
|
- */
|
|
|
- ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
|
|
|
- ioend->io_size = size;
|
|
|
- ioend->io_offset = offset;
|
|
|
+ if (size <= 0)
|
|
|
+ return size;
|
|
|
|
|
|
/*
|
|
|
- * The ioend tells us whether we are doing unwritten extent conversion
|
|
|
+ * The flags tell us whether we are doing unwritten extent conversions
|
|
|
* or an append transaction that updates the on-disk file size. These
|
|
|
* cases are the only cases where we should *potentially* be needing
|
|
|
* to update the VFS inode size.
|
|
|
- *
|
|
|
+ */
|
|
|
+ if (flags == 0) {
|
|
|
+ ASSERT(offset + size <= i_size_read(inode));
|
|
|
+ return 0;
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
* We need to update the in-core inode size here so that we don't end up
|
|
|
* with the on-disk inode size being outside the in-core inode size. We
|
|
|
* have no other method of updating EOF for AIO, so always do it here
|
|
@@ -1616,91 +1570,56 @@ __xfs_end_io_direct_write(
|
|
|
* here can result in EOF moving backwards and Bad Things Happen when
|
|
|
* that occurs.
|
|
|
*/
|
|
|
- spin_lock(&XFS_I(inode)->i_flags_lock);
|
|
|
+ spin_lock(&ip->i_flags_lock);
|
|
|
if (offset + size > i_size_read(inode))
|
|
|
i_size_write(inode, offset + size);
|
|
|
- spin_unlock(&XFS_I(inode)->i_flags_lock);
|
|
|
+ spin_unlock(&ip->i_flags_lock);
|
|
|
|
|
|
- /*
|
|
|
- * If we are doing an append IO that needs to update the EOF on disk,
|
|
|
- * do the transaction reserve now so we can use common end io
|
|
|
- * processing. Stashing the error (if there is one) in the ioend will
|
|
|
- * result in the ioend processing passing on the error if it is
|
|
|
- * possible as we can't return it from here.
|
|
|
- */
|
|
|
- if (ioend->io_type == XFS_IO_OVERWRITE)
|
|
|
- ioend->io_error = xfs_setfilesize_trans_alloc(ioend);
|
|
|
+ if (flags & XFS_DIO_FLAG_UNWRITTEN) {
|
|
|
+ trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
|
|
|
|
|
|
-out_end_io:
|
|
|
- xfs_end_io(&ioend->io_work);
|
|
|
- return;
|
|
|
-}
|
|
|
+ error = xfs_iomap_write_unwritten(ip, offset, size);
|
|
|
+ } else if (flags & XFS_DIO_FLAG_APPEND) {
|
|
|
+ struct xfs_trans *tp;
|
|
|
|
|
|
-/*
|
|
|
- * Complete a direct I/O write request.
|
|
|
- *
|
|
|
- * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
|
|
|
- * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
|
|
|
- * wholly within the EOF and so there is nothing for us to do. Note that in this
|
|
|
- * case the completion can be called in interrupt context, whereas if we have an
|
|
|
- * ioend we will always be called in task context (i.e. from a workqueue).
|
|
|
- */
|
|
|
-STATIC void
|
|
|
-xfs_end_io_direct_write(
|
|
|
- struct kiocb *iocb,
|
|
|
- loff_t offset,
|
|
|
- ssize_t size,
|
|
|
- void *private)
|
|
|
-{
|
|
|
- struct inode *inode = file_inode(iocb->ki_filp);
|
|
|
- struct xfs_ioend *ioend = private;
|
|
|
+ trace_xfs_end_io_direct_write_append(ip, offset, size);
|
|
|
|
|
|
- trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
|
|
|
- ioend ? ioend->io_type : 0, NULL);
|
|
|
-
|
|
|
- if (!ioend) {
|
|
|
- ASSERT(offset + size <= i_size_read(inode));
|
|
|
- return;
|
|
|
+ tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
|
|
|
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
|
|
|
+ if (error) {
|
|
|
+ xfs_trans_cancel(tp);
|
|
|
+ return error;
|
|
|
+ }
|
|
|
+ error = xfs_setfilesize(ip, tp, offset, size);
|
|
|
}
|
|
|
|
|
|
- __xfs_end_io_direct_write(inode, ioend, offset, size);
|
|
|
+ return error;
|
|
|
}
|
|
|
|
|
|
-static inline ssize_t
|
|
|
-xfs_vm_do_dio(
|
|
|
- struct inode *inode,
|
|
|
+STATIC ssize_t
|
|
|
+xfs_vm_direct_IO(
|
|
|
struct kiocb *iocb,
|
|
|
struct iov_iter *iter,
|
|
|
- loff_t offset,
|
|
|
- void (*endio)(struct kiocb *iocb,
|
|
|
- loff_t offset,
|
|
|
- ssize_t size,
|
|
|
- void *private),
|
|
|
- int flags)
|
|
|
+ loff_t offset)
|
|
|
{
|
|
|
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
|
|
|
+ dio_iodone_t *endio = NULL;
|
|
|
+ int flags = 0;
|
|
|
struct block_device *bdev;
|
|
|
|
|
|
- if (IS_DAX(inode))
|
|
|
+ if (iov_iter_rw(iter) == WRITE) {
|
|
|
+ endio = xfs_end_io_direct_write;
|
|
|
+ flags = DIO_ASYNC_EXTEND;
|
|
|
+ }
|
|
|
+
|
|
|
+ if (IS_DAX(inode)) {
|
|
|
return dax_do_io(iocb, inode, iter, offset,
|
|
|
xfs_get_blocks_direct, endio, 0);
|
|
|
+ }
|
|
|
|
|
|
bdev = xfs_find_bdev_for_inode(inode);
|
|
|
return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
|
|
|
- xfs_get_blocks_direct, endio, NULL, flags);
|
|
|
-}
|
|
|
-
|
|
|
-STATIC ssize_t
|
|
|
-xfs_vm_direct_IO(
|
|
|
- struct kiocb *iocb,
|
|
|
- struct iov_iter *iter,
|
|
|
- loff_t offset)
|
|
|
-{
|
|
|
- struct inode *inode = iocb->ki_filp->f_mapping->host;
|
|
|
-
|
|
|
- if (iov_iter_rw(iter) == WRITE)
|
|
|
- return xfs_vm_do_dio(inode, iocb, iter, offset,
|
|
|
- xfs_end_io_direct_write, DIO_ASYNC_EXTEND);
|
|
|
- return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0);
|
|
|
+ xfs_get_blocks_direct, endio, NULL, flags);
|
|
|
}
|
|
|
|
|
|
/*
|