@@ -1233,6 +1233,117 @@ xfs_vm_releasepage(
 	return try_to_free_buffers(page);
 }
 
+/*
+ * When we map a DIO buffer, we may need to attach an ioend that describes the
+ * type of write IO we are doing. This passes to the completion function the
+ * operations it needs to perform. If the mapping is for an overwrite wholly
+ * within the EOF then we don't need an ioend and so we don't allocate one.
+ * This avoids the unnecessary overhead of allocating and freeing ioends for
+ * workloads that don't require transactions on IO completion.
+ *
+ * If we get multiple mappings in a single IO, we might be mapping different
+ * types. But because the direct IO can only have a single private pointer, we
+ * need to ensure that:
+ *
+ * a) i)  the ioend spans the entire region of unwritten mappings; or
+ *    ii) the ioend spans all the mappings that cross or are beyond EOF; and
+ * b) if it contains unwritten extents, it is *permanently* marked as such
+ *
+ * We could do this by chaining ioends like buffered IO does, but we only
+ * actually get one IO completion callback from the direct IO, and that spans
+ * the entire IO regardless of how many mappings and IOs are needed to complete
+ * the DIO. There is only going to be one reference to the ioend and its life
+ * cycle is constrained by the DIO completion code. Hence we don't need
+ * reference counting here.
+ */
+static void
+xfs_map_direct(
+	struct inode		*inode,
+	struct buffer_head	*bh_result,
+	struct xfs_bmbt_irec	*imap,
+	xfs_off_t		offset)
+{
+	struct xfs_ioend	*ioend;
+	xfs_off_t		size = bh_result->b_size;
+	int			type;
+
+	if (ISUNWRITTEN(imap))
+		type = XFS_IO_UNWRITTEN;
+	else
+		type = XFS_IO_OVERWRITE;
+
+	trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
+
+	if (bh_result->b_private) {
+		ioend = bh_result->b_private;
+		ASSERT(ioend->io_size > 0);
+		ASSERT(offset >= ioend->io_offset);
+		if (offset + size > ioend->io_offset + ioend->io_size)
+			ioend->io_size = offset - ioend->io_offset + size;
+
+		if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
+			ioend->io_type = XFS_IO_UNWRITTEN;
+
+		trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
+					      ioend->io_size, ioend->io_type,
+					      imap);
+	} else if (type == XFS_IO_UNWRITTEN ||
+		   offset + size > i_size_read(inode)) {
+		ioend = xfs_alloc_ioend(inode, type);
+		ioend->io_offset = offset;
+		ioend->io_size = size;
+
+		bh_result->b_private = ioend;
+		set_buffer_defer_completion(bh_result);
+
+		trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
+					   imap);
+	} else {
+		trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
+					    imap);
+	}
+}
+
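The accumulation rules in the comment above are worth seeing in isolation. The
following is a minimal userspace sketch of the same logic, not kernel code: a
single ioend is stretched to cover each new mapping, and once it has seen an
unwritten mapping it stays marked unwritten. All names and types are
illustrative stand-ins for the kernel's.

#include <assert.h>
#include <stdio.h>

enum io_type { IO_OVERWRITE, IO_UNWRITTEN };

struct ioend {
	long long	io_offset;
	long long	io_size;
	enum io_type	io_type;
};

static void
extend_ioend(struct ioend *ioend, long long offset, long long size,
	     enum io_type type)
{
	assert(ioend->io_size > 0);
	assert(offset >= ioend->io_offset);
	/* grow the ioend so it spans the new mapping as well */
	if (offset + size > ioend->io_offset + ioend->io_size)
		ioend->io_size = offset - ioend->io_offset + size;
	/* once unwritten, permanently unwritten */
	if (type == IO_UNWRITTEN)
		ioend->io_type = IO_UNWRITTEN;
}

int main(void)
{
	struct ioend io = { .io_offset = 0, .io_size = 4096,
			    .io_type = IO_OVERWRITE };

	extend_ioend(&io, 4096, 8192, IO_UNWRITTEN);
	extend_ioend(&io, 12288, 4096, IO_OVERWRITE);
	/* spans all three mappings (16384 bytes) and stays unwritten */
	printf("offset %lld size %lld type %d\n",
	       io.io_offset, io.io_size, io.io_type);
	return 0;
}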
+/*
+ * If this is O_DIRECT or the mpage code calling, tell them how large the
+ * mapping is so that we can avoid repeated get_blocks calls.
+ *
+ * If the mapping spans EOF, then we have to break the mapping up as the
+ * mapping for blocks beyond EOF must be marked new so that sub block regions
+ * can be correctly zeroed. We can't do this for mappings within EOF unless the
+ * mapping was just allocated or is unwritten, otherwise the callers would
+ * overwrite existing data with zeros. Hence we have to split the mapping into
+ * a range up to and including EOF, and a second mapping for beyond EOF.
+ */
+static void
+xfs_map_trim_size(
+	struct inode		*inode,
+	sector_t		iblock,
+	struct buffer_head	*bh_result,
+	struct xfs_bmbt_irec	*imap,
+	xfs_off_t		offset,
+	ssize_t			size)
+{
+	xfs_off_t		mapping_size;
+
+	mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
+	mapping_size <<= inode->i_blkbits;
+
+	ASSERT(mapping_size > 0);
+	if (mapping_size > size)
+		mapping_size = size;
+	if (offset < i_size_read(inode) &&
+	    offset + mapping_size >= i_size_read(inode)) {
+		/* limit mapping to block that spans EOF */
+		mapping_size = roundup_64(i_size_read(inode) - offset,
+					  1 << inode->i_blkbits);
+	}
+	if (mapping_size > LONG_MAX)
+		mapping_size = LONG_MAX;
+
+	bh_result->b_size = mapping_size;
+}
+
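The EOF rounding in xfs_map_trim_size() is easiest to follow with concrete
numbers. Here is a standalone sketch with hypothetical values; roundup_64() is
open-coded because it is kernel-only. With a 4k block size, an in-core file
size of 10000 bytes and a mapping starting at offset 8192, the mapping is
clamped to the single block that spans EOF: roundup(10000 - 8192, 4096) = 4096
bytes.

#include <stdio.h>

static long long
roundup64(long long x, long long y)
{
	return ((x + y - 1) / y) * y;
}

int main(void)
{
	long long blksize = 4096;		/* 1 << inode->i_blkbits */
	long long isize = 10000;		/* i_size_read(inode) */
	long long offset = 8192;		/* start of this mapping */
	long long mapping_size = 3 * blksize;	/* extent extends past EOF */

	/* limit the mapping to the block that spans EOF */
	if (offset < isize && offset + mapping_size >= isize)
		mapping_size = roundup64(isize - offset, blksize);

	printf("trimmed mapping: %lld bytes\n", mapping_size);	/* 4096 */
	return 0;
}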
 STATIC int
 __xfs_get_blocks(
 	struct inode		*inode,
@@ -1321,31 +1432,37 @@ __xfs_get_blocks
 			xfs_iunlock(ip, lockmode);
 		}
-
-		trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
+		trace_xfs_get_blocks_alloc(ip, offset, size,
+				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
+						   : XFS_IO_DELALLOC, &imap);
 	} else if (nimaps) {
-		trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
+		trace_xfs_get_blocks_found(ip, offset, size,
+				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
+						   : XFS_IO_OVERWRITE, &imap);
 		xfs_iunlock(ip, lockmode);
 	} else {
 		trace_xfs_get_blocks_notfound(ip, offset, size);
 		goto out_unlock;
 	}
 
+	/* trim mapping down to size requested */
+	if (direct || size > (1 << inode->i_blkbits))
+		xfs_map_trim_size(inode, iblock, bh_result,
+				  &imap, offset, size);
+
+	/*
+	 * For unwritten extents do not report a disk address in the buffered
+	 * read case (treat as if we're reading into a hole).
+	 */
 	if (imap.br_startblock != HOLESTARTBLOCK &&
-	    imap.br_startblock != DELAYSTARTBLOCK) {
-		/*
-		 * For unwritten extents do not report a disk address on
-		 * the read case (treat as if we're reading into a hole).
-		 */
-		if (create || !ISUNWRITTEN(&imap))
-			xfs_map_buffer(inode, bh_result, &imap, offset);
-		if (create && ISUNWRITTEN(&imap)) {
-			if (direct) {
-				bh_result->b_private = inode;
-				set_buffer_defer_completion(bh_result);
-			}
+	    imap.br_startblock != DELAYSTARTBLOCK &&
+	    (create || !ISUNWRITTEN(&imap))) {
+		xfs_map_buffer(inode, bh_result, &imap, offset);
+		if (ISUNWRITTEN(&imap))
 			set_buffer_unwritten(bh_result);
-		}
+		/* direct IO needs special help */
+		if (create && direct)
+			xfs_map_direct(inode, bh_result, &imap, offset);
 	}
 
 	/*
@@ -1378,39 +1495,6 @@ __xfs_get_blocks
 		}
 	}
 
-	/*
-	 * If this is O_DIRECT or the mpage code calling tell them how large
-	 * the mapping is, so that we can avoid repeated get_blocks calls.
-	 *
-	 * If the mapping spans EOF, then we have to break the mapping up as the
-	 * mapping for blocks beyond EOF must be marked new so that sub block
-	 * regions can be correctly zeroed. We can't do this for mappings within
-	 * EOF unless the mapping was just allocated or is unwritten, otherwise
-	 * the callers would overwrite existing data with zeros. Hence we have
-	 * to split the mapping into a range up to and including EOF, and a
-	 * second mapping for beyond EOF.
-	 */
-	if (direct || size > (1 << inode->i_blkbits)) {
-		xfs_off_t		mapping_size;
-
-		mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
-		mapping_size <<= inode->i_blkbits;
-
-		ASSERT(mapping_size > 0);
-		if (mapping_size > size)
-			mapping_size = size;
-		if (offset < i_size_read(inode) &&
-		    offset + mapping_size >= i_size_read(inode)) {
-			/* limit mapping to block that spans EOF */
-			mapping_size = roundup_64(i_size_read(inode) - offset,
-						  1 << inode->i_blkbits);
-		}
-		if (mapping_size > LONG_MAX)
-			mapping_size = LONG_MAX;
-
-		bh_result->b_size = mapping_size;
-	}
-
 	return 0;
 
 out_unlock:
@@ -1441,9 +1525,11 @@ xfs_get_blocks_direct(
 /*
  * Complete a direct I/O write request.
  *
- * If the private argument is non-NULL __xfs_get_blocks signals us that we
- * need to issue a transaction to convert the range from unwritten to written
- * extents.
+ * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
+ * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
+ * wholly within the EOF and so there is nothing for us to do. Note that in this
+ * case the completion can be called in interrupt context, whereas if we have an
+ * ioend we will always be called in task context (i.e. from a workqueue).
  */
 STATIC void
 xfs_end_io_direct_write(
@@ -1455,43 +1541,71 @@ xfs_end_io_direct_write(
 	struct inode		*inode = file_inode(iocb->ki_filp);
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ioend	*ioend = private;
 
-	if (XFS_FORCED_SHUTDOWN(mp))
+	trace_xfs_gbmap_direct_endio(ip, offset, size,
+				     ioend ? ioend->io_type : 0, NULL);
+
+	if (!ioend) {
+		ASSERT(offset + size <= i_size_read(inode));
 		return;
+	}
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		goto out_end_io;
 
 	/*
-	 * While the generic direct I/O code updates the inode size, it does
-	 * so only after the end_io handler is called, which means our
-	 * end_io handler thinks the on-disk size is outside the in-core
-	 * size. To prevent this just update it a little bit earlier here.
+	 * DIO completion end_io functions are only called on writes if more
+	 * than 0 bytes were written.
 	 */
+	ASSERT(size > 0);
+
+	/*
+	 * The ioend only maps whole blocks, while the IO may be sector aligned.
+	 * Hence the ioend offset/size may not match the IO offset/size exactly.
+	 * Because we don't map overwrites within EOF into the ioend, the offset
+	 * may not match, but only if the endio spans EOF. Either way, write
+	 * the IO sizes into the ioend so that completion processing does the
+	 * right thing.
+	 */
+	ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
+	ioend->io_size = size;
+	ioend->io_offset = offset;
+
+	/*
+	 * The ioend tells us whether we are doing unwritten extent conversion
+	 * or an append transaction that updates the on-disk file size. These
+	 * cases are the only cases where we should *potentially* be needing
+	 * to update the VFS inode size.
+	 *
+	 * We need to update the in-core inode size here so that we don't end up
+	 * with the on-disk inode size being outside the in-core inode size. We
+	 * have no other method of updating EOF for AIO, so always do it here
+	 * if necessary.
+	 *
+	 * We need to lock the test/set EOF update as we can be racing with
+	 * other IO completions here to update the EOF. Failing to serialise
+	 * here can result in EOF moving backwards and Bad Things Happen when
+	 * that occurs.
+	 */
+	spin_lock(&ip->i_flags_lock);
 	if (offset + size > i_size_read(inode))
 		i_size_write(inode, offset + size);
+	spin_unlock(&ip->i_flags_lock);
 
 	/*
-	 * For direct I/O we do not know if we need to allocate blocks or not,
-	 * so we can't preallocate an append transaction, as that results in
-	 * nested reservations and log space deadlocks. Hence allocate the
-	 * transaction here. While this is sub-optimal and can block IO
-	 * completion for some time, we're stuck with doing it this way until
-	 * we can pass the ioend to the direct IO allocation callbacks and
-	 * avoid nesting that way.
+	 * If we are doing an append IO that needs to update the EOF on disk,
+	 * do the transaction reserve now so we can use common end io
+	 * processing. Stashing the error (if there is one) in the ioend will
+	 * result in the ioend processing passing the error on where possible,
+	 * as we can't return it from here.
 	 */
-	if (private && size > 0) {
-		xfs_iomap_write_unwritten(ip, offset, size);
-	} else if (offset + size > ip->i_d.di_size) {
-		struct xfs_trans	*tp;
-		int			error;
-
-		tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
-		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
-		if (error) {
-			xfs_trans_cancel(tp, 0);
-			return;
-		}
+	if (ioend->io_type == XFS_IO_OVERWRITE)
+		ioend->io_error = xfs_setfilesize_trans_alloc(ioend);
 
-		xfs_setfilesize(ip, tp, offset, size);
-	}
+out_end_io:
+	xfs_end_io(&ioend->io_work);
+	return;
 }
 
 STATIC ssize_t
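The locked test-and-set on the in-core size in the hunk above is the subtle
part: two AIO completions finishing out of order could otherwise race on the
unlocked read-check-write and move EOF backwards. Below is a compact userspace
sketch of the safe pattern, with a pthread mutex standing in for
ip->i_flags_lock; all names are illustrative, not the kernel's.

#include <pthread.h>
#include <stdio.h>

static long long i_size;	/* models the in-core inode size */
static pthread_mutex_t size_lock = PTHREAD_MUTEX_INITIALIZER;

/* called once per completed DIO write; may run concurrently */
static void
dio_complete(long long offset, long long size)
{
	/* serialise the test-and-set so EOF only ever moves forwards */
	pthread_mutex_lock(&size_lock);
	if (offset + size > i_size)
		i_size = offset + size;
	pthread_mutex_unlock(&size_lock);
}

int main(void)
{
	/* completions arriving out of order still leave EOF at 16384 */
	dio_complete(8192, 8192);
	dio_complete(0, 8192);
	printf("i_size %lld\n", i_size);
	return 0;
}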