6 년 전 · 4718dcad7d
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1220,35 +1220,92 @@ retry:
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+/* Unlock both inodes after they've been prepped for a range clone. */
			
 
				+STATIC void
			
 
				+xfs_reflink_remap_unlock(
			
 
				+	struct file		*file_in,
			
 
				+	struct file		*file_out)
			
 
				+{
			
 
				+	struct inode		*inode_in = file_inode(file_in);
			
 
				+	struct xfs_inode	*src = XFS_I(inode_in);
			
 
				+	struct inode		*inode_out = file_inode(file_out);
			
 
				+	struct xfs_inode	*dest = XFS_I(inode_out);
			
 
				+	bool			same_inode = (inode_in == inode_out);
			
 
				+
			
 
				+	xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
			
 
				+	if (!same_inode)
			
 
				+		xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
			
 
				+	inode_unlock(inode_out);
			
 
				+	if (!same_inode)
			
 
				+		inode_unlock_shared(inode_in);
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				- * Link a range of blocks from one file to another.
			
 
				+ * If we're reflinking to a point past the destination file's EOF, we must
			
 
				+ * zero any speculative post-EOF preallocations that sit between the old EOF
			
 
				+ * and the destination file offset.
			
 
				  */
			
 
				-int
			
 
				-xfs_reflink_remap_range(
			
 
				+static int
			
 
				+xfs_reflink_zero_posteof(
			
 
				+	struct xfs_inode	*ip,
			
 
				+	loff_t			pos)
			
 
				+{
			
 
				+	loff_t			isize = i_size_read(VFS_I(ip));
			
 
				+
			
 
				+	if (pos <= isize)
			
 
				+		return 0;
			
 
				+
			
 
				+	trace_xfs_zero_eof(ip, isize, pos - isize);
			
 
				+	return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL,
			
 
				+			&xfs_iomap_ops);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Prepare two files for range cloning.  Upon a successful return both inodes
			
 
				+ * will have the iolock and mmaplock held, the page cache of the out file will
			
 
				+ * be truncated, and any leases on the out file will have been broken.  This
			
 
				+ * function borrows heavily from xfs_file_aio_write_checks.
			
 
				+ *
			
 
				+ * The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't
			
 
				+ * checked that the bytes beyond EOF physically match. Hence we cannot use the
			
 
				+ * EOF block in the source dedupe range because it's not a complete block match,
			
 
				+ * hence can introduce a corruption into the file that has it's block replaced.
			
 
				+ *
			
 
				+ * In similar fashion, the VFS file cloning also allows partial EOF blocks to be
			
 
				+ * "block aligned" for the purposes of cloning entire files.  However, if the
			
 
				+ * source file range includes the EOF block and it lands within the existing EOF
			
 
				+ * of the destination file, then we can expose stale data from beyond the source
			
 
				+ * file EOF in the destination file.
			
 
				+ *
			
 
				+ * XFS doesn't support partial block sharing, so in both cases we have check
			
 
				+ * these cases ourselves. For dedupe, we can simply round the length to dedupe
			
 
				+ * down to the previous whole block and ignore the partial EOF block. While this
			
 
				+ * means we can't dedupe the last block of a file, this is an acceptible
			
 
				+ * tradeoff for simplicity on implementation.
			
 
				+ *
			
 
				+ * For cloning, we want to share the partial EOF block if it is also the new EOF
			
 
				+ * block of the destination file. If the partial EOF block lies inside the
			
 
				+ * existing destination EOF, then we have to abort the clone to avoid exposing
			
 
				+ * stale data in the destination file. Hence we reject these clone attempts with
			
 
				+ * -EINVAL in this case.
			
 
				+ */
			
 
				+STATIC int
			
 
				+xfs_reflink_remap_prep(
			
 
				 	struct file		*file_in,
			
 
				 	loff_t			pos_in,
			
 
				 	struct file		*file_out,
			
 
				 	loff_t			pos_out,
			
 
				-	u64			len,
			
 
				+	u64			*len,
			
 
				 	bool			is_dedupe)
			
 
				 {
			
 
				 	struct inode		*inode_in = file_inode(file_in);
			
 
				 	struct xfs_inode	*src = XFS_I(inode_in);
			
 
				 	struct inode		*inode_out = file_inode(file_out);
			
 
				 	struct xfs_inode	*dest = XFS_I(inode_out);
			
 
				-	struct xfs_mount	*mp = src->i_mount;
			
 
				 	bool			same_inode = (inode_in == inode_out);
			
 
				-	xfs_fileoff_t		sfsbno, dfsbno;
			
 
				-	xfs_filblks_t		fsblen;
			
 
				-	xfs_extlen_t		cowextsize;
			
 
				+	u64			blkmask = i_blocksize(inode_in) - 1;
			
 
				 	ssize_t			ret;
			
 
				 
			
 
				-	if (!xfs_sb_version_hasreflink(&mp->m_sb))
			
 
				-		return -EOPNOTSUPP;
			
 
				-
			
 
				-	if (XFS_FORCED_SHUTDOWN(mp))
			
 
				-		return -EIO;
			
 
				-
			
 
				 	/* Lock both files against IO */
			
 
				 	ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out);
			
 
				 	if (ret)
			
@@ -1270,33 +1327,115 @@ xfs_reflink_remap_range(
 
				 		goto out_unlock;
			
 
				 
			
 
				 	ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
			
 
				-			&len, is_dedupe);
			
 
				+			len, is_dedupe);
			
 
				 	if (ret <= 0)
			
 
				 		goto out_unlock;
			
 
				 
			
 
				+	/*
			
 
				+	 * If the dedupe data matches, chop off the partial EOF block
			
 
				+	 * from the source file so we don't try to dedupe the partial
			
 
				+	 * EOF block.
			
 
				+	 */
			
 
				+	if (is_dedupe) {
			
 
				+		*len &= ~blkmask;
			
 
				+	} else if (*len & blkmask) {
			
 
				+		/*
			
 
				+		 * The user is attempting to share a partial EOF block,
			
 
				+		 * if it's inside the destination EOF then reject it.
			
 
				+		 */
			
 
				+		if (pos_out + *len < i_size_read(inode_out)) {
			
 
				+			ret = -EINVAL;
			
 
				+			goto out_unlock;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				 	/* Attach dquots to dest inode before changing block map */
			
 
				 	ret = xfs_qm_dqattach(dest);
			
 
				 	if (ret)
			
 
				 		goto out_unlock;
			
 
				 
			
 
				-	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
			
 
				-
			
 
				 	/*
			
 
				-	 * Clear out post-eof preallocations because we don't have page cache
			
 
				-	 * backing the delayed allocations and they'll never get freed on
			
 
				-	 * their own.
			
 
				+	 * Zero existing post-eof speculative preallocations in the destination
			
 
				+	 * file.
			
 
				 	 */
			
 
				-	if (xfs_can_free_eofblocks(dest, true)) {
			
 
				-		ret = xfs_free_eofblocks(dest);
			
 
				-		if (ret)
			
 
				-			goto out_unlock;
			
 
				-	}
			
 
				+	ret = xfs_reflink_zero_posteof(dest, pos_out);
			
 
				+	if (ret)
			
 
				+		goto out_unlock;
			
 
				 
			
 
				 	/* Set flags and remap blocks. */
			
 
				 	ret = xfs_reflink_set_inode_flag(src, dest);
			
 
				 	if (ret)
			
 
				 		goto out_unlock;
			
 
				 
			
 
				+	/* Zap any page cache for the destination file's range. */
			
 
				+	truncate_inode_pages_range(&inode_out->i_data, pos_out,
			
 
				+				   PAGE_ALIGN(pos_out + *len) - 1);
			
 
				+
			
 
				+	/* If we're altering the file contents... */
			
 
				+	if (!is_dedupe) {
			
 
				+		/*
			
 
				+		 * ...update the timestamps (which will grab the ilock again
			
 
				+		 * from xfs_fs_dirty_inode, so we have to call it before we
			
 
				+		 * take the ilock).
			
 
				+		 */
			
 
				+		if (!(file_out->f_mode & FMODE_NOCMTIME)) {
			
 
				+			ret = file_update_time(file_out);
			
 
				+			if (ret)
			
 
				+				goto out_unlock;
			
 
				+		}
			
 
				+
			
 
				+		/*
			
 
				+		 * ...clear the security bits if the process is not being run
			
 
				+		 * by root.  This keeps people from modifying setuid and setgid
			
 
				+		 * binaries.
			
 
				+		 */
			
 
				+		ret = file_remove_privs(file_out);
			
 
				+		if (ret)
			
 
				+			goto out_unlock;
			
 
				+	}
			
 
				+
			
 
				+	return 1;
			
 
				+out_unlock:
			
 
				+	xfs_reflink_remap_unlock(file_in, file_out);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Link a range of blocks from one file to another.
			
 
				+ */
			
 
				+int
			
 
				+xfs_reflink_remap_range(
			
 
				+	struct file		*file_in,
			
 
				+	loff_t			pos_in,
			
 
				+	struct file		*file_out,
			
 
				+	loff_t			pos_out,
			
 
				+	u64			len,
			
 
				+	bool			is_dedupe)
			
 
				+{
			
 
				+	struct inode		*inode_in = file_inode(file_in);
			
 
				+	struct xfs_inode	*src = XFS_I(inode_in);
			
 
				+	struct inode		*inode_out = file_inode(file_out);
			
 
				+	struct xfs_inode	*dest = XFS_I(inode_out);
			
 
				+	struct xfs_mount	*mp = src->i_mount;
			
 
				+	xfs_fileoff_t		sfsbno, dfsbno;
			
 
				+	xfs_filblks_t		fsblen;
			
 
				+	xfs_extlen_t		cowextsize;
			
 
				+	ssize_t			ret;
			
 
				+
			
 
				+	if (!xfs_sb_version_hasreflink(&mp->m_sb))
			
 
				+		return -EOPNOTSUPP;
			
 
				+
			
 
				+	if (XFS_FORCED_SHUTDOWN(mp))
			
 
				+		return -EIO;
			
 
				+
			
 
				+	/* Prepare and then clone file data. */
			
 
				+	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
			
 
				+			&len, is_dedupe);
			
 
				+	if (ret <= 0)
			
 
				+		return ret;
			
 
				+
			
 
				+	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
			
 
				+
			
 
				 	dfsbno = XFS_B_TO_FSBT(mp, pos_out);
			
 
				 	sfsbno = XFS_B_TO_FSBT(mp, pos_in);
			
 
				 	fsblen = XFS_B_TO_FSB(mp, len);
			
@@ -1305,10 +1444,6 @@ xfs_reflink_remap_range(
 
				 	if (ret)
			
 
				 		goto out_unlock;
			
 
				 
			
 
				-	/* Zap any page cache for the destination file's range. */
			
 
				-	truncate_inode_pages_range(&inode_out->i_data, pos_out,
			
 
				-				   PAGE_ALIGN(pos_out + len) - 1);
			
 
				-
			
 
				 	/*
			
 
				 	 * Carry the cowextsize hint from src to dest if we're sharing the
			
 
				 	 * entire source file to the entire destination file, the source file
			
@@ -1325,12 +1460,7 @@ xfs_reflink_remap_range(
 
				 			is_dedupe);
			
 
				 
			
 
				 out_unlock:
			
 
				-	xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
			
 
				-	if (!same_inode)
			
 
				-		xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
			
 
				-	inode_unlock(inode_out);
			
 
				-	if (!same_inode)
			
 
				-		inode_unlock_shared(inode_in);
			
 
				+	xfs_reflink_remap_unlock(file_in, file_out);
			
 
				 	if (ret)
			
 
				 		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
			
 
				 	return ret;