@@ -36,6 +36,21 @@
 #include <linux/pagevec.h>
 #include <linux/writeback.h>
 
+/* flags for direct write completions */
+#define XFS_DIO_FLAG_UNWRITTEN	(1 << 0)
+#define XFS_DIO_FLAG_APPEND	(1 << 1)
+
+/*
+ * structure owned by writepages passed to individual writepage calls
+ */
+struct xfs_writepage_ctx {
+	struct xfs_bmbt_irec    imap;
+	bool			imap_valid;
+	unsigned int		io_type;
+	struct xfs_ioend	*ioend;
+	sector_t		last_block;
+};
+
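The context structure added above exists because `write_cache_pages()` makes one callback per dirty page, so any state that should survive from one page to the next (the cached extent mapping, whether it is still valid, the current I/O type, and the ioend being assembled) has to travel behind the iterator's opaque `data` pointer. A rough userspace sketch of that pattern, with invented names (`wp_ctx`, `write_one_page`, `writeback_range`) standing in for the kernel APIs:

```c
#include <stdbool.h>
#include <stdio.h>

/* models struct xfs_writepage_ctx: state cached across per-page calls */
struct wp_ctx {
	long	map_start;	/* models imap */
	long	map_len;
	bool	map_valid;	/* models imap_valid */
	long	last_block;	/* models last_block */
};

/* models xfs_do_writepage(): called once per dirty page */
static int write_one_page(long page, void *data)
{
	struct wp_ctx *ctx = data;

	/* reuse the cached mapping while the page still falls inside it */
	if (!ctx->map_valid ||
	    page < ctx->map_start || page >= ctx->map_start + ctx->map_len) {
		ctx->map_start = page;	/* stands in for xfs_map_blocks() */
		ctx->map_len = 16;
		ctx->map_valid = true;
		printf("new mapping at page %ld\n", page);
	}
	ctx->last_block = page;
	return 0;
}

/* models write_cache_pages(): the iterator is stateless, the ctx is not */
static int writeback_range(long first, long last,
			   int (*fn)(long, void *), void *data)
{
	for (long page = first; page <= last; page++) {
		int error = fn(page, data);

		if (error)
			return error;
	}
	return 0;
}

int main(void)
{
	struct wp_ctx ctx = { .map_valid = false, .last_block = -1 };

	/* 64 pages share four mappings instead of doing 64 lookups */
	return writeback_range(0, 63, write_one_page, &ctx);
}
```

The iterator itself stays stateless; everything that lets consecutive pages share one block mapping lives in the context.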
 void
 xfs_count_page_state(
 	struct page		*page,
@@ -214,10 +229,12 @@ xfs_end_io(
 	struct xfs_inode *ip = XFS_I(ioend->io_inode);
 	int		error = 0;
 
-	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+	/*
+	 * Set an error if the mount has shut down and proceed with end I/O
+	 * processing so it can perform whatever cleanups are necessary.
+	 */
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		ioend->io_error = -EIO;
-		goto done;
-	}
 
 	/*
 	 * For unwritten extents we need to issue transactions to convert a
@@ -265,7 +282,7 @@ xfs_alloc_ioend(
 	 */
 	atomic_set(&ioend->io_remaining, 1);
 	ioend->io_error = 0;
-	ioend->io_list = NULL;
+	INIT_LIST_HEAD(&ioend->io_list);
 	ioend->io_type = type;
 	ioend->io_inode = inode;
 	ioend->io_buffer_head = NULL;
@@ -283,8 +300,7 @@ xfs_map_blocks(
 	struct inode		*inode,
 	loff_t			offset,
 	struct xfs_bmbt_irec	*imap,
-	int			type,
-	int			nonblocking)
+	int			type)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
@@ -300,12 +316,7 @@ xfs_map_blocks(
 	if (type == XFS_IO_UNWRITTEN)
 		bmapi_flags |= XFS_BMAPI_IGSTATE;
 
-	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
-		if (nonblocking)
-			return -EAGAIN;
-		xfs_ilock(ip, XFS_ILOCK_SHARED);
-	}
-
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
 	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
 	       (ip->i_df.if_flags & XFS_IFEXTENTS));
 	ASSERT(offset <= mp->m_super->s_maxbytes);
@@ -341,7 +352,7 @@ xfs_map_blocks(
 	return 0;
 }
 
-STATIC int
+STATIC bool
 xfs_imap_valid(
 	struct inode		*inode,
 	struct xfs_bmbt_irec	*imap,
@@ -414,8 +425,7 @@ xfs_start_buffer_writeback(
 STATIC void
 xfs_start_page_writeback(
 	struct page		*page,
-	int			clear_dirty,
-	int			buffers)
+	int			clear_dirty)
 {
 	ASSERT(PageLocked(page));
 	ASSERT(!PageWriteback(page));
@@ -434,10 +444,6 @@ xfs_start_page_writeback(
 		set_page_writeback_keepwrite(page);
 
 	unlock_page(page);
-
-	/* If no buffers on the page are to be written, finish it here */
-	if (!buffers)
-		end_page_writeback(page);
 }
 
 static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
@@ -446,153 +452,101 @@ static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
 }
 
 /*
- * Submit all of the bios for all of the ioends we have saved up, covering the
- * initial writepage page and also any probed pages.
- *
- * Because we may have multiple ioends spanning a page, we need to start
- * writeback on all the buffers before we submit them for I/O. If we mark the
- * buffers as we got, then we can end up with a page that only has buffers
- * marked async write and I/O complete on can occur before we mark the other
- * buffers async write.
- *
- * The end result of this is that we trip a bug in end_page_writeback() because
- * we call it twice for the one page as the code in end_buffer_async_write()
- * assumes that all buffers on the page are started at the same time.
- *
- * The fix is two passes across the ioend list - one to start writeback on the
- * buffer_heads, and then submit them for I/O on the second pass.
+ * Submit all of the bios for an ioend. We are only passed a single ioend at a
+ * time; the caller is responsible for chaining prior to submission.
  *
  * If @fail is non-zero, it means that we have a situation where some part of
  * the submission process has failed after we have marked paged for writeback
  * and unlocked them. In this situation, we need to fail the ioend chain rather
  * than submit it to IO. This typically only happens on a filesystem shutdown.
  */
-STATIC void
+STATIC int
 xfs_submit_ioend(
 	struct writeback_control *wbc,
 	xfs_ioend_t		*ioend,
-	int			fail)
+	int			status)
 {
-	xfs_ioend_t		*head = ioend;
-	xfs_ioend_t		*next;
 	struct buffer_head	*bh;
 	struct bio		*bio;
 	sector_t		lastblock = 0;
 
-	/* Pass 1 - start writeback */
-	do {
-		next = ioend->io_list;
-		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
-			xfs_start_buffer_writeback(bh);
-	} while ((ioend = next) != NULL);
+	/* Reserve log space if we might write beyond the on-disk inode size. */
+	if (!status &&
+	    ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
+		status = xfs_setfilesize_trans_alloc(ioend);
+	/*
+	 * If we are failing the IO now, just mark the ioend with an
+	 * error and finish it. This will run IO completion immediately
+	 * as there is only one reference to the ioend at this point in
+	 * time.
+	 */
+	if (status) {
+		ioend->io_error = status;
+		xfs_finish_ioend(ioend);
+		return status;
+	}
 
-	/* Pass 2 - submit I/O */
-	ioend = head;
-	do {
-		next = ioend->io_list;
-		bio = NULL;
+	bio = NULL;
+	for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
 
-		/*
-		 * If we are failing the IO now, just mark the ioend with an
-		 * error and finish it. This will run IO completion immediately
-		 * as there is only one reference to the ioend at this point in
-		 * time.
-		 */
-		if (fail) {
-			ioend->io_error = fail;
-			xfs_finish_ioend(ioend);
-			continue;
+		if (!bio) {
+retry:
+			bio = xfs_alloc_ioend_bio(bh);
+		} else if (bh->b_blocknr != lastblock + 1) {
+			xfs_submit_ioend_bio(wbc, ioend, bio);
+			goto retry;
 		}
 
-		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
-
-			if (!bio) {
- retry:
-				bio = xfs_alloc_ioend_bio(bh);
-			} else if (bh->b_blocknr != lastblock + 1) {
-				xfs_submit_ioend_bio(wbc, ioend, bio);
-				goto retry;
-			}
-
-			if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
-				xfs_submit_ioend_bio(wbc, ioend, bio);
-				goto retry;
-			}
-
-			lastblock = bh->b_blocknr;
-		}
-		if (bio)
+		if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
 			xfs_submit_ioend_bio(wbc, ioend, bio);
-		xfs_finish_ioend(ioend);
-	} while ((ioend = next) != NULL);
-}
-
-/*
- * Cancel submission of all buffer_heads so far in this endio.
- * Toss the endio too. Only ever called for the initial page
- * in a writepage request, so only ever one page.
- */
-STATIC void
-xfs_cancel_ioend(
-	xfs_ioend_t		*ioend)
-{
-	xfs_ioend_t		*next;
-	struct buffer_head	*bh, *next_bh;
-
-	do {
-		next = ioend->io_list;
-		bh = ioend->io_buffer_head;
-		do {
-			next_bh = bh->b_private;
-			clear_buffer_async_write(bh);
-			/*
-			 * The unwritten flag is cleared when added to the
-			 * ioend. We're not submitting for I/O so mark the
-			 * buffer unwritten again for next time around.
-			 */
-			if (ioend->io_type == XFS_IO_UNWRITTEN)
-				set_buffer_unwritten(bh);
-			unlock_buffer(bh);
-		} while ((bh = next_bh) != NULL);
+			goto retry;
+		}
 
-		mempool_free(ioend, xfs_ioend_pool);
-	} while ((ioend = next) != NULL);
+		lastblock = bh->b_blocknr;
+	}
+	if (bio)
+		xfs_submit_ioend_bio(wbc, ioend, bio);
+	xfs_finish_ioend(ioend);
+	return 0;
 }
 
 /*
  * Test to see if we've been building up a completion structure for
  * earlier buffers -- if so, we try to append to this ioend if we
  * can, otherwise we finish off any current ioend and start another.
- * Return true if we've finished the given ioend.
+ * Return the ioend we finished off so that the caller can submit it
+ * once it has finished processing the dirty page.
  */
 STATIC void
 xfs_add_to_ioend(
 	struct inode		*inode,
 	struct buffer_head	*bh,
 	xfs_off_t		offset,
-	unsigned int		type,
-	xfs_ioend_t		**result,
-	int			need_ioend)
+	struct xfs_writepage_ctx *wpc,
+	struct list_head	*iolist)
 {
-	xfs_ioend_t		*ioend = *result;
-
-	if (!ioend || need_ioend || type != ioend->io_type) {
-		xfs_ioend_t	*previous = *result;
-
-		ioend = xfs_alloc_ioend(inode, type);
-		ioend->io_offset = offset;
-		ioend->io_buffer_head = bh;
-		ioend->io_buffer_tail = bh;
-		if (previous)
-			previous->io_list = ioend;
-		*result = ioend;
+	if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
+	    bh->b_blocknr != wpc->last_block + 1 ||
+	    offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
+		struct xfs_ioend *new;
+
+		if (wpc->ioend)
+			list_add(&wpc->ioend->io_list, iolist);
+
+		new = xfs_alloc_ioend(inode, wpc->io_type);
+		new->io_offset = offset;
+		new->io_buffer_head = bh;
+		new->io_buffer_tail = bh;
+		wpc->ioend = new;
 	} else {
-		ioend->io_buffer_tail->b_private = bh;
-		ioend->io_buffer_tail = bh;
+		wpc->ioend->io_buffer_tail->b_private = bh;
+		wpc->ioend->io_buffer_tail = bh;
 	}
 
 	bh->b_private = NULL;
-	ioend->io_size += bh->b_size;
+	wpc->ioend->io_size += bh->b_size;
+	wpc->last_block = bh->b_blocknr;
+	xfs_start_buffer_writeback(bh);
 }
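The rewritten `xfs_add_to_ioend()` above appends a buffer to the cached ioend only when three things hold: the I/O type matches, the block is physically contiguous (`last_block + 1`), and the file offset is logically contiguous (`io_offset + io_size`). Otherwise the cached ioend goes onto a caller-supplied list for later submission and a fresh one is started. A standalone model of that decision, with invented types (`struct io_unit`, `add_block`) rather than the real ioend machinery:

```c
#include <stdlib.h>

struct io_unit {
	int	type;
	long	start, nblocks;	/* models io_offset / io_size */
	struct io_unit *next;	/* models the local submit list linkage */
};

struct ctx {
	struct io_unit	*cur;		/* models wpc->ioend */
	long		last_block;	/* models wpc->last_block */
	struct io_unit	**list_tail;	/* models the iolist */
};

/* append if same type and contiguous, else queue the old unit */
static void add_block(struct ctx *c, int type, long block)
{
	struct io_unit *u = c->cur;

	if (!u || u->type != type ||
	    block != c->last_block + 1 ||
	    block != u->start + u->nblocks) {
		if (u) {		/* models list_add(&ioend->io_list, iolist) */
			*c->list_tail = u;
			c->list_tail = &u->next;
		}
		u = calloc(1, sizeof(*u));
		if (!u)
			abort();
		u->type = type;
		u->start = block;
		c->cur = u;
	}
	u->nblocks++;		/* models ioend->io_size += bh->b_size */
	c->last_block = block;	/* models the wpc->last_block update */
}

int main(void)
{
	struct io_unit *list = NULL;
	struct ctx c = { .cur = NULL, .last_block = -1, .list_tail = &list };

	add_block(&c, 0, 10);
	add_block(&c, 0, 11);	/* appends: contiguous, same type */
	add_block(&c, 1, 12);	/* type change: old unit moves to the list */
	return list != NULL ? 0 : 1;
}
```

Keeping exactly one open unit at a time is what lets the real code avoid chaining ioends and hence nesting mempool allocations; everything older is already queued for submission.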
 
 STATIC void
@@ -678,183 +632,6 @@ xfs_check_page_type(
 	return false;
 }
 
-/*
- * Allocate & map buffers for page given the extent map. Write it out.
- * except for the original page of a writepage, this is called on
- * delalloc/unwritten pages only, for the original page it is possible
- * that the page has no mapping at all.
- */
-STATIC int
-xfs_convert_page(
-	struct inode		*inode,
-	struct page		*page,
-	loff_t			tindex,
-	struct xfs_bmbt_irec	*imap,
-	xfs_ioend_t		**ioendp,
-	struct writeback_control *wbc)
-{
-	struct buffer_head	*bh, *head;
-	xfs_off_t		end_offset;
-	unsigned long		p_offset;
-	unsigned int		type;
-	int			len, page_dirty;
-	int			count = 0, done = 0, uptodate = 1;
-	xfs_off_t		offset = page_offset(page);
-
-	if (page->index != tindex)
-		goto fail;
-	if (!trylock_page(page))
-		goto fail;
-	if (PageWriteback(page))
-		goto fail_unlock_page;
-	if (page->mapping != inode->i_mapping)
-		goto fail_unlock_page;
-	if (!xfs_check_page_type(page, (*ioendp)->io_type, false))
-		goto fail_unlock_page;
-
-	/*
-	 * page_dirty is initially a count of buffers on the page before
-	 * EOF and is decremented as we move each into a cleanable state.
-	 *
-	 * Derivation:
-	 *
-	 * End offset is the highest offset that this page should represent.
-	 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
-	 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
-	 * hence give us the correct page_dirty count. On any other page,
-	 * it will be zero and in that case we need page_dirty to be the
-	 * count of buffers on the page.
-	 */
-	end_offset = min_t(unsigned long long,
-			(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
-			i_size_read(inode));
-
-	/*
-	 * If the current map does not span the entire page we are about to try
-	 * to write, then give up. The only way we can write a page that spans
-	 * multiple mappings in a single writeback iteration is via the
-	 * xfs_vm_writepage() function. Data integrity writeback requires the
-	 * entire page to be written in a single attempt, otherwise the part of
-	 * the page we don't write here doesn't get written as part of the data
-	 * integrity sync.
-	 *
-	 * For normal writeback, we also don't attempt to write partial pages
-	 * here as it simply means that write_cache_pages() will see it under
-	 * writeback and ignore the page until some point in the future, at
-	 * which time this will be the only page in the file that needs
-	 * writeback. Hence for more optimal IO patterns, we should always
-	 * avoid partial page writeback due to multiple mappings on a page here.
-	 */
-	if (!xfs_imap_valid(inode, imap, end_offset))
-		goto fail_unlock_page;
-
-	len = 1 << inode->i_blkbits;
-	p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
-			 PAGE_CACHE_SIZE);
-	p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
-	page_dirty = p_offset / len;
-
-	/*
-	 * The moment we find a buffer that doesn't match our current type
-	 * specification or can't be written, abort the loop and start
-	 * writeback. As per the above xfs_imap_valid() check, only
-	 * xfs_vm_writepage() can handle partial page writeback fully - we are
-	 * limited here to the buffers that are contiguous with the current
-	 * ioend, and hence a buffer we can't write breaks that contiguity and
-	 * we have to defer the rest of the IO to xfs_vm_writepage().
-	 */
-	bh = head = page_buffers(page);
-	do {
-		if (offset >= end_offset)
-			break;
-		if (!buffer_uptodate(bh))
-			uptodate = 0;
-		if (!(PageUptodate(page) || buffer_uptodate(bh))) {
-			done = 1;
-			break;
-		}
-
-		if (buffer_unwritten(bh) || buffer_delay(bh) ||
-		    buffer_mapped(bh)) {
-			if (buffer_unwritten(bh))
-				type = XFS_IO_UNWRITTEN;
-			else if (buffer_delay(bh))
-				type = XFS_IO_DELALLOC;
-			else
-				type = XFS_IO_OVERWRITE;
-
-			/*
-			 * imap should always be valid because of the above
-			 * partial page end_offset check on the imap.
-			 */
-			ASSERT(xfs_imap_valid(inode, imap, offset));
-
-			lock_buffer(bh);
-			if (type != XFS_IO_OVERWRITE)
-				xfs_map_at_offset(inode, bh, imap, offset);
-			xfs_add_to_ioend(inode, bh, offset, type,
-					 ioendp, done);
-
-			page_dirty--;
-			count++;
-		} else {
-			done = 1;
-			break;
-		}
-	} while (offset += len, (bh = bh->b_this_page) != head);
-
-	if (uptodate && bh == head)
-		SetPageUptodate(page);
-
-	if (count) {
-		if (--wbc->nr_to_write <= 0 &&
-		    wbc->sync_mode == WB_SYNC_NONE)
-			done = 1;
-	}
-	xfs_start_page_writeback(page, !page_dirty, count);
-
-	return done;
- fail_unlock_page:
-	unlock_page(page);
- fail:
-	return 1;
-}
-
-/*
- * Convert & write out a cluster of pages in the same extent as defined
- * by mp and following the start page.
- */
-STATIC void
-xfs_cluster_write(
-	struct inode		*inode,
-	pgoff_t			tindex,
-	struct xfs_bmbt_irec	*imap,
-	xfs_ioend_t		**ioendp,
-	struct writeback_control *wbc,
-	pgoff_t			tlast)
-{
-	struct pagevec		pvec;
-	int			done = 0, i;
-
-	pagevec_init(&pvec, 0);
-	while (!done && tindex <= tlast) {
-		unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
-
-		if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
-			break;
-
-		for (i = 0; i < pagevec_count(&pvec); i++) {
-			done = xfs_convert_page(inode, pvec.pages[i], tindex++,
-						imap, ioendp, wbc);
-			if (done)
-				break;
-		}
-
-		pagevec_release(&pvec);
-		cond_resched();
-	}
-}
-
 STATIC void
 xfs_vm_invalidatepage(
 	struct page		*page,
@@ -931,6 +708,164 @@ out_invalidate:
 	return;
 }
 
+/*
+ * We implement an immediate ioend submission policy here to avoid needing to
+ * chain multiple ioends and hence nest mempool allocations which can violate
+ * forward progress guarantees we need to provide. The current ioend we are
+ * adding buffers to is cached on the writepage context, and if the new buffer
+ * does not append to the cached ioend it will create a new ioend and cache that
+ * instead.
+ *
+ * If a new ioend is created and cached, the old ioend is returned and queued
+ * locally for submission once the entire page is processed or an error has been
+ * detected. While ioends are submitted immediately after they are completed,
+ * batching optimisations are provided by higher level block plugging.
+ *
+ * At the end of a writeback pass, there will be a cached ioend remaining on the
+ * writepage context that the caller will need to submit.
+ */
+static int
+xfs_writepage_map(
+	struct xfs_writepage_ctx *wpc,
+	struct writeback_control *wbc,
+	struct inode		*inode,
+	struct page		*page,
+	loff_t			offset,
+	__uint64_t		end_offset)
+{
+	LIST_HEAD(submit_list);
+	struct xfs_ioend	*ioend, *next;
+	struct buffer_head	*bh, *head;
+	ssize_t			len = 1 << inode->i_blkbits;
+	int			error = 0;
+	int			count = 0;
+	int			uptodate = 1;
+
+	bh = head = page_buffers(page);
+	offset = page_offset(page);
+	do {
+		if (offset >= end_offset)
+			break;
+		if (!buffer_uptodate(bh))
+			uptodate = 0;
+
+		/*
+		 * set_page_dirty dirties all buffers in a page, independent
+		 * of their state. The dirty state however is entirely
+		 * meaningless for holes (!mapped && uptodate), so skip
+		 * buffers covering holes here.
+		 */
+		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
+			wpc->imap_valid = false;
+			continue;
+		}
+
+		if (buffer_unwritten(bh)) {
+			if (wpc->io_type != XFS_IO_UNWRITTEN) {
+				wpc->io_type = XFS_IO_UNWRITTEN;
+				wpc->imap_valid = false;
+			}
+		} else if (buffer_delay(bh)) {
+			if (wpc->io_type != XFS_IO_DELALLOC) {
+				wpc->io_type = XFS_IO_DELALLOC;
+				wpc->imap_valid = false;
+			}
+		} else if (buffer_uptodate(bh)) {
+			if (wpc->io_type != XFS_IO_OVERWRITE) {
+				wpc->io_type = XFS_IO_OVERWRITE;
+				wpc->imap_valid = false;
+			}
+		} else {
+			if (PageUptodate(page))
+				ASSERT(buffer_mapped(bh));
+			/*
+			 * This buffer is not uptodate and will not be
+			 * written to disk. Ensure that we will put any
+			 * subsequent writeable buffers into a new
+			 * ioend.
+			 */
+			wpc->imap_valid = false;
+			continue;
+		}
+
+		if (wpc->imap_valid)
+			wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
+							 offset);
+		if (!wpc->imap_valid) {
+			error = xfs_map_blocks(inode, offset, &wpc->imap,
+					       wpc->io_type);
+			if (error)
+				goto out;
+			wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
+							 offset);
+		}
+		if (wpc->imap_valid) {
+			lock_buffer(bh);
+			if (wpc->io_type != XFS_IO_OVERWRITE)
+				xfs_map_at_offset(inode, bh, &wpc->imap, offset);
+			xfs_add_to_ioend(inode, bh, offset, wpc, &submit_list);
+			count++;
+		}
+
+	} while (offset += len, ((bh = bh->b_this_page) != head));
+
+	if (uptodate && bh == head)
+		SetPageUptodate(page);
+
+	ASSERT(wpc->ioend || list_empty(&submit_list));
+
+out:
+	/*
+	 * On error, we have to fail the ioend here because we have locked
+	 * buffers in the ioend. If we don't do this, we'll deadlock
+	 * invalidating the page as that tries to lock the buffers on the page.
+	 * Also, because we may have set pages under writeback, we have to make
+	 * sure we run IO completion to mark the error state of the IO
+	 * appropriately, so we can't cancel the ioend directly here. That means
+	 * we have to mark this page as under writeback if we included any
+	 * buffers from it in the ioend chain so that completion treats it
+	 * correctly.
+	 *
+	 * If we didn't include the page in the ioend, then on error we can
+	 * simply discard and unlock it as there are no other users of the page
+	 * or its buffers right now. The caller will still need to trigger
+	 * submission of outstanding ioends on the writepage context so they are
+	 * treated correctly on error.
+	 */
+	if (count) {
+		xfs_start_page_writeback(page, !error);
+
+		/*
+		 * Preserve the original error if there was one, otherwise catch
+		 * submission errors here and propagate into subsequent ioend
+		 * submissions.
+		 */
+		list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
+			int error2;
+
+			list_del_init(&ioend->io_list);
+			error2 = xfs_submit_ioend(wbc, ioend, error);
+			if (error2 && !error)
+				error = error2;
+		}
+	} else if (error) {
+		xfs_aops_discard_page(page);
+		ClearPageUptodate(page);
+		unlock_page(page);
+	} else {
+		/*
+		 * We can end up here with no error and nothing to write if we
+		 * race with a partial page truncate on a sub-page block sized
+		 * filesystem. In that case we need to mark the page clean.
+		 */
+		xfs_start_page_writeback(page, 1);
+		end_page_writeback(page);
+	}
+
+	mapping_set_error(page->mapping, error);
+	return error;
+}
+
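The `out:` block of `xfs_writepage_map()` drains the local submit list while keeping error priority straight: the first error seen (from mapping or from an earlier submission) is preserved and passed into every later `xfs_submit_ioend()` call, so the remaining ioends are failed rather than silently written. A compact model of that propagation rule:

```c
#include <stdio.h>

/* models xfs_submit_ioend(): fails the unit instead of writing it
 * whenever a non-zero status is passed in */
static int submit(int id, int status)
{
	if (status) {
		printf("unit %d: failed with %d\n", id, status);
		return status;
	}
	printf("unit %d: submitted\n", id);
	return id == 2 ? -5 : 0;	/* pretend unit 2 hits an I/O error */
}

int main(void)
{
	int error = 0;

	/* models the list_for_each_entry_safe() drain over submit_list */
	for (int id = 0; id < 4; id++) {
		int error2 = submit(id, error);

		/* keep the first error; later units are failed, not lost */
		if (error2 && !error)
			error = error2;
	}
	printf("first error seen: %d\n", error);
	return 0;
}
```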
 /*
  * Write out a dirty page.
  *
@@ -940,22 +875,16 @@ out_invalidate:
  * For any other dirty buffer heads on the page we should flush them.
  */
 STATIC int
-xfs_vm_writepage(
+xfs_do_writepage(
 	struct page		*page,
-	struct writeback_control *wbc)
+	struct writeback_control *wbc,
+	void			*data)
 {
+	struct xfs_writepage_ctx *wpc = data;
 	struct inode		*inode = page->mapping->host;
-	struct buffer_head	*bh, *head;
-	struct xfs_bmbt_irec	imap;
-	xfs_ioend_t		*ioend = NULL, *iohead = NULL;
 	loff_t			offset;
-	unsigned int		type;
 	__uint64_t		end_offset;
-	pgoff_t			end_index, last_index;
-	ssize_t			len;
-	int			err, imap_valid = 0, uptodate = 1;
-	int			count = 0;
-	int			nonblocking = 0;
+	pgoff_t			end_index;
 
 	trace_xfs_writepage(inode, page, 0, 0);
 
@@ -982,12 +911,9 @@ xfs_vm_writepage(
 	if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
 		goto redirty;
 
-	/* Is this page beyond the end of the file? */
-	offset = i_size_read(inode);
-	end_index = offset >> PAGE_CACHE_SHIFT;
-	last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
-
 	/*
+	 * Is this page beyond the end of the file?
+	 *
 	 * The page index is less than the end_index, adjust the end_offset
 	 * to the highest offset that this page should represent.
 	 * -----------------------------------------------------
@@ -998,6 +924,8 @@
 	 * |     desired writeback range    |      see else    |
 	 * ---------------------------------^------------------|
 	 */
+	offset = i_size_read(inode);
+	end_index = offset >> PAGE_CACHE_SHIFT;
 	if (page->index < end_index)
 		end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT;
 	else {
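The diagram spanning the two hunks above reduces to a small computation: a page wholly below EOF is written in full, while the page containing EOF is clamped to `i_size`. A sketch of just that arithmetic, assuming a 4k page; the kernel's else-branch additionally handles truncate races, which are omitted here:

```c
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

/* highest file offset this page should represent, per the diagram above */
static unsigned long page_end_offset(unsigned long index, unsigned long isize)
{
	unsigned long end_index = isize >> PAGE_SHIFT;

	if (index < end_index)			/* whole page inside i_size */
		return (index + 1) << PAGE_SHIFT;
	return isize;				/* partial page at EOF */
}

int main(void)
{
	unsigned long isize = 3 * PAGE_SIZE + 100;	/* EOF inside page 3 */

	printf("%lu\n", page_end_offset(2, isize));	/* 12288: full page */
	printf("%lu\n", page_end_offset(3, isize));	/* 12388: clamped */
	return 0;
}
```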
@@ -1049,152 +977,7 @@
 		end_offset = offset;
 	}
 
-	len = 1 << inode->i_blkbits;
-
-	bh = head = page_buffers(page);
-	offset = page_offset(page);
-	type = XFS_IO_OVERWRITE;
-
-	if (wbc->sync_mode == WB_SYNC_NONE)
-		nonblocking = 1;
-
-	do {
-		int new_ioend = 0;
-
-		if (offset >= end_offset)
-			break;
-		if (!buffer_uptodate(bh))
-			uptodate = 0;
-
-		/*
-		 * set_page_dirty dirties all buffers in a page, independent
-		 * of their state. The dirty state however is entirely
-		 * meaningless for holes (!mapped && uptodate), so skip
-		 * buffers covering holes here.
-		 */
-		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
-			imap_valid = 0;
-			continue;
-		}
-
-		if (buffer_unwritten(bh)) {
-			if (type != XFS_IO_UNWRITTEN) {
-				type = XFS_IO_UNWRITTEN;
-				imap_valid = 0;
-			}
-		} else if (buffer_delay(bh)) {
-			if (type != XFS_IO_DELALLOC) {
-				type = XFS_IO_DELALLOC;
-				imap_valid = 0;
-			}
-		} else if (buffer_uptodate(bh)) {
-			if (type != XFS_IO_OVERWRITE) {
-				type = XFS_IO_OVERWRITE;
-				imap_valid = 0;
-			}
-		} else {
-			if (PageUptodate(page))
-				ASSERT(buffer_mapped(bh));
-			/*
-			 * This buffer is not uptodate and will not be
-			 * written to disk. Ensure that we will put any
-			 * subsequent writeable buffers into a new
-			 * ioend.
-			 */
-			imap_valid = 0;
-			continue;
-		}
-
-		if (imap_valid)
-			imap_valid = xfs_imap_valid(inode, &imap, offset);
-		if (!imap_valid) {
-			/*
-			 * If we didn't have a valid mapping then we need to
-			 * put the new mapping into a separate ioend structure.
-			 * This ensures non-contiguous extents always have
-			 * separate ioends, which is particularly important
-			 * for unwritten extent conversion at I/O completion
-			 * time.
-			 */
-			new_ioend = 1;
-			err = xfs_map_blocks(inode, offset, &imap, type,
-					     nonblocking);
-			if (err)
-				goto error;
-			imap_valid = xfs_imap_valid(inode, &imap, offset);
-		}
-		if (imap_valid) {
-			lock_buffer(bh);
-			if (type != XFS_IO_OVERWRITE)
-				xfs_map_at_offset(inode, bh, &imap, offset);
-			xfs_add_to_ioend(inode, bh, offset, type, &ioend,
-					 new_ioend);
-			count++;
-		}
-
-		if (!iohead)
-			iohead = ioend;
-
-	} while (offset += len, ((bh = bh->b_this_page) != head));
-
-	if (uptodate && bh == head)
-		SetPageUptodate(page);
-
-	xfs_start_page_writeback(page, 1, count);
-
-	/* if there is no IO to be submitted for this page, we are done */
-	if (!ioend)
-		return 0;
-
-	ASSERT(iohead);
-
-	/*
-	 * Any errors from this point onwards need tobe reported through the IO
-	 * completion path as we have marked the initial page as under writeback
-	 * and unlocked it.
-	 */
-	if (imap_valid) {
-		xfs_off_t		end_index;
-
-		end_index = imap.br_startoff + imap.br_blockcount;
-
-		/* to bytes */
-		end_index <<= inode->i_blkbits;
-
-		/* to pages */
-		end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
-
-		/* check against file size */
-		if (end_index > last_index)
-			end_index = last_index;
-
-		xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
-				  wbc, end_index);
-	}
-
-
-	/*
-	 * Reserve log space if we might write beyond the on-disk inode size.
-	 */
-	err = 0;
-	if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
-		err = xfs_setfilesize_trans_alloc(ioend);
-
-	xfs_submit_ioend(wbc, iohead, err);
-
-	return 0;
-
-error:
-	if (iohead)
-		xfs_cancel_ioend(iohead);
-
-	if (err == -EAGAIN)
-		goto redirty;
-
-	xfs_aops_discard_page(page);
-	ClearPageUptodate(page);
-	unlock_page(page);
-	return err;
+	return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset);
 
 redirty:
 	redirty_page_for_writepage(wbc, page);
@@ -1202,17 +985,41 @@ redirty:
 	return 0;
 }
 
+STATIC int
+xfs_vm_writepage(
+	struct page		*page,
+	struct writeback_control *wbc)
+{
+	struct xfs_writepage_ctx wpc = {
+		.io_type = XFS_IO_INVALID,
+	};
+	int			ret;
+
+	ret = xfs_do_writepage(page, wbc, &wpc);
+	if (wpc.ioend)
+		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
+	return ret;
+}
+
 STATIC int
 xfs_vm_writepages(
 	struct address_space	*mapping,
 	struct writeback_control *wbc)
 {
+	struct xfs_writepage_ctx wpc = {
+		.io_type = XFS_IO_INVALID,
+	};
+	int			ret;
+
 	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
 	if (dax_mapping(mapping))
 		return dax_writeback_mapping_range(mapping,
 				xfs_find_bdev_for_inode(mapping->host), wbc);
 
-	return generic_writepages(mapping, wbc);
+	ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
+	if (wpc.ioend)
+		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
+	return ret;
 }
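Both wrappers above share one obligation: when the iteration ends, the writepage context still caches one unsubmitted ioend, and it must be flushed with the loop's status folded into the final return value. A schematic of that init/iterate/flush shape, with invented names (`wctx`, `do_one`, `flush`):

```c
#include <stdio.h>

struct wctx {
	int open_unit;		/* models wpc.ioend: < 0 means none cached */
};

static int do_one(struct wctx *c, int unit)
{
	c->open_unit = unit;	/* leaves the last unit cached, unsubmitted */
	return 0;
}

/* models the trailing xfs_submit_ioend(wbc, wpc.ioend, ret) call */
static int flush(struct wctx *c, int ret)
{
	if (c->open_unit >= 0) {
		printf("submitting trailing unit %d (status %d)\n",
		       c->open_unit, ret);
		c->open_unit = -1;
	}
	return ret;
}

int main(void)
{
	struct wctx c = { .open_unit = -1 };
	int ret = 0;

	for (int u = 0; u < 3 && !ret; u++)
		ret = do_one(&c, u);

	/* the trailing cached unit must be flushed even on error */
	return flush(&c, ret);
}
```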
 
 /*
@@ -1242,27 +1049,8 @@ xfs_vm_releasepage(
 }
 
 /*
- * When we map a DIO buffer, we may need to attach an ioend that describes the
- * type of write IO we are doing. This passes to the completion function the
- * operations it needs to perform. If the mapping is for an overwrite wholly
- * within the EOF then we don't need an ioend and so we don't allocate one.
- * This avoids the unnecessary overhead of allocating and freeing ioends for
- * workloads that don't require transactions on IO completion.
- *
- * If we get multiple mappings in a single IO, we might be mapping different
- * types. But because the direct IO can only have a single private pointer, we
- * need to ensure that:
- *
- * a) i) the ioend spans the entire region of unwritten mappings; or
- *    ii) the ioend spans all the mappings that cross or are beyond EOF; and
- * b) if it contains unwritten extents, it is *permanently* marked as such
- *
- * We could do this by chaining ioends like buffered IO does, but we only
- * actually get one IO completion callback from the direct IO, and that spans
- * the entire IO regardless of how many mappings and IOs are needed to complete
- * the DIO. There is only going to be one reference to the ioend and its life
- * cycle is constrained by the DIO completion code. hence we don't need
- * reference counting here.
+ * When we map a DIO buffer, we may need to pass flags to
+ * xfs_end_io_direct_write to tell it what kind of write IO we are doing.
  *
  * Note that for DIO, an IO to the highest supported file block offset (i.e.
  * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
@@ -1270,68 +1058,26 @@ xfs_vm_releasepage(
  * extending the file size. We won't know for sure until IO completion is run
  * and the actual max write offset is communicated to the IO completion
  * routine.
- *
- * For DAX page faults, we are preparing to never see unwritten extents here,
- * nor should we ever extend the inode size. Hence we will soon have nothing to
- * do here for this case, ensuring we don't have to provide an IO completion
- * callback to free an ioend that we don't actually need for a fault into the
- * page at offset (2^63 - 1FSB) bytes.
  */
-
 static void
 xfs_map_direct(
 	struct inode		*inode,
 	struct buffer_head	*bh_result,
 	struct xfs_bmbt_irec	*imap,
-	xfs_off_t		offset,
-	bool			dax_fault)
+	xfs_off_t		offset)
 {
-	struct xfs_ioend	*ioend;
+	uintptr_t		*flags = (uintptr_t *)&bh_result->b_private;
 	xfs_off_t		size = bh_result->b_size;
-	int			type;
-
-	if (ISUNWRITTEN(imap))
-		type = XFS_IO_UNWRITTEN;
-	else
-		type = XFS_IO_OVERWRITE;
 
-	trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
-
-	if (dax_fault) {
-		ASSERT(type == XFS_IO_OVERWRITE);
-		trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
-					    imap);
-		return;
-	}
+	trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size,
+		ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, imap);
 
-	if (bh_result->b_private) {
-		ioend = bh_result->b_private;
-		ASSERT(ioend->io_size > 0);
-		ASSERT(offset >= ioend->io_offset);
-		if (offset + size > ioend->io_offset + ioend->io_size)
-			ioend->io_size = offset - ioend->io_offset + size;
-
-		if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
-			ioend->io_type = XFS_IO_UNWRITTEN;
-
-		trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
-					      ioend->io_size, ioend->io_type,
-					      imap);
-	} else if (type == XFS_IO_UNWRITTEN ||
-		   offset + size > i_size_read(inode) ||
-		   offset + size < 0) {
-		ioend = xfs_alloc_ioend(inode, type);
-		ioend->io_offset = offset;
-		ioend->io_size = size;
-
-		bh_result->b_private = ioend;
+	if (ISUNWRITTEN(imap)) {
+		*flags |= XFS_DIO_FLAG_UNWRITTEN;
+		set_buffer_defer_completion(bh_result);
+	} else if (offset + size > i_size_read(inode) || offset + size < 0) {
+		*flags |= XFS_DIO_FLAG_APPEND;
 		set_buffer_defer_completion(bh_result);
-
-		trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
-					   imap);
-	} else {
-		trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
-					    imap);
 	}
 }
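Rather than allocating an ioend per DIO mapping, the rewritten `xfs_map_direct()` stores bit flags directly in `b_private`, which is simply a `void *` wide enough to hold a `uintptr_t`. A self-contained illustration of that encode/decode trick, with generic names in place of the kernel's buffer_head:

```c
#include <assert.h>
#include <stdint.h>

#define DIO_UNWRITTEN	(1 << 0)	/* models XFS_DIO_FLAG_UNWRITTEN */
#define DIO_APPEND	(1 << 1)	/* models XFS_DIO_FLAG_APPEND */

struct buf {
	void *private;		/* models bh_result->b_private */
};

/* write a flag into the pointer-sized field, as xfs_map_direct() does */
static void set_flag(struct buf *b, uintptr_t flag)
{
	uintptr_t *flags = (uintptr_t *)&b->private;

	*flags |= flag;
}

int main(void)
{
	struct buf b = { 0 };

	set_flag(&b, DIO_UNWRITTEN);

	/* completion side: recover the flags from the void pointer */
	uintptr_t flags = (uintptr_t)b.private;

	assert(flags & DIO_UNWRITTEN);
	assert(!(flags & DIO_APPEND));
	return 0;
}
```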
 
@@ -1502,9 +1248,12 @@ __xfs_get_blocks(
 		if (ISUNWRITTEN(&imap))
 			set_buffer_unwritten(bh_result);
 		/* direct IO needs special help */
-		if (create && direct)
-			xfs_map_direct(inode, bh_result, &imap, offset,
-				       dax_fault);
+		if (create && direct) {
+			if (dax_fault)
+				ASSERT(!ISUNWRITTEN(&imap));
+			else
+				xfs_map_direct(inode, bh_result, &imap, offset);
+		}
 	}
 
 	/*
@@ -1574,42 +1323,50 @@ xfs_get_blocks_dax_fault(
 	return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
 }
 
-static void
-__xfs_end_io_direct_write(
-	struct inode		*inode,
-	struct xfs_ioend	*ioend,
+/*
+ * Complete a direct I/O write request.
+ *
+ * xfs_map_direct passes us some flags in the private data to tell us what to
+ * do. If no flags are set, then the write IO is an overwrite wholly within
+ * the existing allocated file size and so there is nothing for us to do.
+ *
+ * Note that in this case the completion can be called in interrupt context,
+ * whereas if we have flags set we will always be called in task context
+ * (i.e. from a workqueue).
+ */
+STATIC int
+xfs_end_io_direct_write(
+	struct kiocb		*iocb,
 	loff_t			offset,
-	ssize_t			size)
+	ssize_t			size,
+	void			*private)
 {
-	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
+	struct inode		*inode = file_inode(iocb->ki_filp);
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	uintptr_t		flags = (uintptr_t)private;
+	int			error = 0;
 
-	if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error)
-		goto out_end_io;
+	trace_xfs_end_io_direct_write(ip, offset, size);
 
-	/*
-	 * dio completion end_io functions are only called on writes if more
-	 * than 0 bytes was written.
-	 */
-	ASSERT(size > 0);
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
 
-	/*
-	 * The ioend only maps whole blocks, while the IO may be sector aligned.
-	 * Hence the ioend offset/size may not match the IO offset/size exactly.
-	 * Because we don't map overwrites within EOF into the ioend, the offset
-	 * may not match, but only if the endio spans EOF. Either way, write
-	 * the IO sizes into the ioend so that completion processing does the
-	 * right thing.
-	 */
-	ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
-	ioend->io_size = size;
-	ioend->io_offset = offset;
+	if (size <= 0)
+		return size;
 
 	/*
-	 * The ioend tells us whether we are doing unwritten extent conversion
+	 * The flags tell us whether we are doing unwritten extent conversions
 	 * or an append transaction that updates the on-disk file size. These
 	 * cases are the only cases where we should *potentially* be needing
 	 * to update the VFS inode size.
-	 *
+	 */
+	if (flags == 0) {
+		ASSERT(offset + size <= i_size_read(inode));
+		return 0;
+	}
+
+	/*
 	 * We need to update the in-core inode size here so that we don't end up
 	 * with the on-disk inode size being outside the in-core inode size. We
 	 * have no other method of updating EOF for AIO, so always do it here
@@ -1620,91 +1377,56 @@ __xfs_end_io_direct_write(
 	 * here can result in EOF moving backwards and Bad Things Happen when
 	 * that occurs.
 	 */
-	spin_lock(&XFS_I(inode)->i_flags_lock);
+	spin_lock(&ip->i_flags_lock);
 	if (offset + size > i_size_read(inode))
 		i_size_write(inode, offset + size);
-	spin_unlock(&XFS_I(inode)->i_flags_lock);
+	spin_unlock(&ip->i_flags_lock);
 
-	/*
-	 * If we are doing an append IO that needs to update the EOF on disk,
-	 * do the transaction reserve now so we can use common end io
-	 * processing. Stashing the error (if there is one) in the ioend will
-	 * result in the ioend processing passing on the error if it is
-	 * possible as we can't return it from here.
-	 */
-	if (ioend->io_type == XFS_IO_OVERWRITE)
-		ioend->io_error = xfs_setfilesize_trans_alloc(ioend);
+	if (flags & XFS_DIO_FLAG_UNWRITTEN) {
+		trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
 
-out_end_io:
-	xfs_end_io(&ioend->io_work);
-	return;
-}
+		error = xfs_iomap_write_unwritten(ip, offset, size);
+	} else if (flags & XFS_DIO_FLAG_APPEND) {
+		struct xfs_trans *tp;
 
-/*
- * Complete a direct I/O write request.
- *
- * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
- * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
- * wholly within the EOF and so there is nothing for us to do. Note that in this
- * case the completion can be called in interrupt context, whereas if we have an
- * ioend we will always be called in task context (i.e. from a workqueue).
- */
-STATIC void
-xfs_end_io_direct_write(
-	struct kiocb		*iocb,
-	loff_t			offset,
-	ssize_t			size,
-	void			*private)
-{
-	struct inode		*inode = file_inode(iocb->ki_filp);
-	struct xfs_ioend	*ioend = private;
-
-	trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
-				     ioend ? ioend->io_type : 0, NULL);
+		trace_xfs_end_io_direct_write_append(ip, offset, size);
 
-	if (!ioend) {
-		ASSERT(offset + size <= i_size_read(inode));
-		return;
+		tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
+		if (error) {
+			xfs_trans_cancel(tp);
+			return error;
+		}
+		error = xfs_setfilesize(ip, tp, offset, size);
 	}
 
-	__xfs_end_io_direct_write(inode, ioend, offset, size);
+	return error;
 }
 
-static inline ssize_t
-xfs_vm_do_dio(
-	struct inode		*inode,
+STATIC ssize_t
+xfs_vm_direct_IO(
 	struct kiocb		*iocb,
 	struct iov_iter		*iter,
-	loff_t			offset,
-	void			(*endio)(struct kiocb	*iocb,
-					 loff_t		offset,
-					 ssize_t	size,
-					 void		*private),
-	int			flags)
+	loff_t			offset)
 {
+	struct inode		*inode = iocb->ki_filp->f_mapping->host;
+	dio_iodone_t		*endio = NULL;
+	int			flags = 0;
 	struct block_device	*bdev;
 
-	if (IS_DAX(inode))
+	if (iov_iter_rw(iter) == WRITE) {
+		endio = xfs_end_io_direct_write;
+		flags = DIO_ASYNC_EXTEND;
+	}
+
+	if (IS_DAX(inode)) {
 		return dax_do_io(iocb, inode, iter, offset,
 				 xfs_get_blocks_direct, endio, 0);
+	}
 
 	bdev = xfs_find_bdev_for_inode(inode);
 	return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
-			xfs_get_blocks_direct, endio, NULL, flags);
-}
-
-STATIC ssize_t
-xfs_vm_direct_IO(
-	struct kiocb		*iocb,
-	struct iov_iter		*iter,
-	loff_t			offset)
-{
-	struct inode		*inode = iocb->ki_filp->f_mapping->host;
-
-	if (iov_iter_rw(iter) == WRITE)
-		return xfs_vm_do_dio(inode, iocb, iter, offset,
-				     xfs_end_io_direct_write, DIO_ASYNC_EXTEND);
-	return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0);
+			xfs_get_blocks_direct, endio, NULL, flags);
 }
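The completion side recovers those flags and dispatches on them: no flags means an overwrite wholly inside EOF with nothing to do, and the unwritten case is checked before the append case, mirroring the if/else chain above. A schematic dispatcher, with stand-in helpers (`convert_unwritten`, `update_disk_size`) in place of `xfs_iomap_write_unwritten()` and `xfs_setfilesize()`:

```c
#include <stdio.h>
#include <stdint.h>

#define DIO_UNWRITTEN	(1 << 0)
#define DIO_APPEND	(1 << 1)

static int convert_unwritten(long off, long len)	/* stand-in helper */
{
	printf("convert unwritten [%ld, +%ld)\n", off, len);
	return 0;
}

static int update_disk_size(long off, long len)		/* stand-in helper */
{
	printf("extend on-disk size to %ld\n", off + len);
	return 0;
}

/* models xfs_end_io_direct_write() after the shutdown and size checks */
static int dio_complete(uintptr_t flags, long off, long len)
{
	if (flags == 0)			/* overwrite inside EOF: nothing to do */
		return 0;
	if (flags & DIO_UNWRITTEN)	/* checked first, as in the patch */
		return convert_unwritten(off, len);
	if (flags & DIO_APPEND)
		return update_disk_size(off, len);
	return 0;
}

int main(void)
{
	return dio_complete(DIO_APPEND, 4096, 4096);
}
```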
 
 /*
@@ -1756,6 +1478,7 @@ xfs_vm_write_failed(
 	loff_t			from = pos & (PAGE_CACHE_SIZE - 1);
 	loff_t			to = from + len;
 	struct buffer_head	*bh, *head;
+	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
 
 	/*
 	 * The request pos offset might be 32 or 64 bit, this is all fine
@@ -1787,14 +1510,23 @@ xfs_vm_write_failed(
 		if (block_start >= to)
 			break;
 
-		if (!buffer_delay(bh))
+		/*
+		 * Process delalloc and unwritten buffers beyond EOF. We can
+		 * encounter unwritten buffers in the event that a file has
+		 * post-EOF unwritten extents and an extending write happens to
+		 * fail (e.g., an unaligned write that also involves a delalloc
+		 * to the same page).
+		 */
+		if (!buffer_delay(bh) && !buffer_unwritten(bh))
 			continue;
 
-		if (!buffer_new(bh) && block_offset < i_size_read(inode))
+		if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
+		    block_offset < i_size_read(inode))
 			continue;
 
-		xfs_vm_kill_delalloc_range(inode, block_offset,
-					   block_offset + bh->b_size);
+		if (buffer_delay(bh))
+			xfs_vm_kill_delalloc_range(inode, block_offset,
+						   block_offset + bh->b_size);
 
 		/*
 		 * This buffer does not contain data anymore. make sure anyone
@@ -1805,6 +1537,7 @@ xfs_vm_write_failed(
 		clear_buffer_mapped(bh);
 		clear_buffer_new(bh);
 		clear_buffer_dirty(bh);
+		clear_buffer_unwritten(bh);
 	}
 
 }
@@ -1828,6 +1561,7 @@ xfs_vm_write_begin(
 	pgoff_t			index = pos >> PAGE_CACHE_SHIFT;
 	struct page		*page;
 	int			status;
+	struct xfs_mount	*mp = XFS_I(mapping->host)->i_mount;
 
 	ASSERT(len <= PAGE_CACHE_SIZE);
 
@@ -1836,6 +1570,8 @@ xfs_vm_write_begin(
 		return -ENOMEM;
 
 	status = __block_write_begin(page, pos, len, xfs_get_blocks);
+	if (xfs_mp_fail_writes(mp))
+		status = -EIO;
 	if (unlikely(status)) {
 		struct inode	*inode = mapping->host;
 		size_t		isize = i_size_read(inode);
@@ -1848,6 +1584,8 @@ xfs_vm_write_begin(
 	 * allocated in this write, not blocks that were previously
 	 * written successfully.
 	 */
+	if (xfs_mp_fail_writes(mp))
+		isize = 0;
 	if (pos + len > isize) {
 		ssize_t start = max_t(ssize_t, pos, isize);
 
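The `xfs_mp_fail_writes()` checks in the last few hunks are a debug-only fault-injection knob: forcing `-EIO` out of write_begin, and treating `isize` as zero, exercises the `xfs_vm_write_failed()` cleanup paths on demand. The general shape of such a knob, reduced to standard C:

```c
#include <stdbool.h>
#include <stdio.h>

static bool fail_writes;	/* models the per-mount debug knob */

static int write_begin(long pos, long len)
{
	int status = 0;		/* pretend the real work succeeded */

	if (fail_writes)	/* injected failure, as in the patch */
		status = -5;	/* stands in for -EIO */
	if (status)
		printf("write_begin(%ld, %ld) -> cleanup path\n", pos, len);
	return status;
}

int main(void)
{
	fail_writes = true;
	return write_begin(0, 4096) ? 1 : 0;
}
```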