|
@@ -1220,35 +1220,92 @@ retry:
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
+/* Unlock both inodes after they've been prepped for a range clone. */
|
|
|
+STATIC void
|
|
|
+xfs_reflink_remap_unlock(
|
|
|
+ struct file *file_in,
|
|
|
+ struct file *file_out)
|
|
|
+{
|
|
|
+ struct inode *inode_in = file_inode(file_in);
|
|
|
+ struct xfs_inode *src = XFS_I(inode_in);
|
|
|
+ struct inode *inode_out = file_inode(file_out);
|
|
|
+ struct xfs_inode *dest = XFS_I(inode_out);
|
|
|
+ bool same_inode = (inode_in == inode_out);
|
|
|
+
|
|
|
+ xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
|
|
|
+ if (!same_inode)
|
|
|
+ xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
|
|
|
+ inode_unlock(inode_out);
|
|
|
+ if (!same_inode)
|
|
|
+ inode_unlock_shared(inode_in);
|
|
|
+}
|
|
|
+
|
|
|
/*
|
|
|
- * Link a range of blocks from one file to another.
|
|
|
+ * If we're reflinking to a point past the destination file's EOF, we must
|
|
|
+ * zero any speculative post-EOF preallocations that sit between the old EOF
|
|
|
+ * and the destination file offset.
|
|
|
*/
|
|
|
-int
|
|
|
-xfs_reflink_remap_range(
|
|
|
+static int
|
|
|
+xfs_reflink_zero_posteof(
|
|
|
+ struct xfs_inode *ip,
|
|
|
+ loff_t pos)
|
|
|
+{
|
|
|
+ loff_t isize = i_size_read(VFS_I(ip));
|
|
|
+
|
|
|
+ if (pos <= isize)
|
|
|
+ return 0;
|
|
|
+
|
|
|
+ trace_xfs_zero_eof(ip, isize, pos - isize);
|
|
|
+ return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL,
|
|
|
+ &xfs_iomap_ops);
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ * Prepare two files for range cloning. Upon a successful return both inodes
|
|
|
+ * will have the iolock and mmaplock held, the page cache of the out file will
|
|
|
+ * be truncated, and any leases on the out file will have been broken. This
|
|
|
+ * function borrows heavily from xfs_file_aio_write_checks.
|
|
|
+ *
|
|
|
+ * The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't
|
|
|
+ * checked that the bytes beyond EOF physically match. Hence we cannot use the
|
|
|
+ * EOF block in the source dedupe range because it's not a complete block match,
|
|
|
+ * hence can introduce a corruption into the file that has it's block replaced.
|
|
|
+ *
|
|
|
+ * In similar fashion, the VFS file cloning also allows partial EOF blocks to be
|
|
|
+ * "block aligned" for the purposes of cloning entire files. However, if the
|
|
|
+ * source file range includes the EOF block and it lands within the existing EOF
|
|
|
+ * of the destination file, then we can expose stale data from beyond the source
|
|
|
+ * file EOF in the destination file.
|
|
|
+ *
|
|
|
+ * XFS doesn't support partial block sharing, so in both cases we have check
|
|
|
+ * these cases ourselves. For dedupe, we can simply round the length to dedupe
|
|
|
+ * down to the previous whole block and ignore the partial EOF block. While this
|
|
|
+ * means we can't dedupe the last block of a file, this is an acceptible
|
|
|
+ * tradeoff for simplicity on implementation.
|
|
|
+ *
|
|
|
+ * For cloning, we want to share the partial EOF block if it is also the new EOF
|
|
|
+ * block of the destination file. If the partial EOF block lies inside the
|
|
|
+ * existing destination EOF, then we have to abort the clone to avoid exposing
|
|
|
+ * stale data in the destination file. Hence we reject these clone attempts with
|
|
|
+ * -EINVAL in this case.
|
|
|
+ */
|
|
|
+STATIC int
|
|
|
+xfs_reflink_remap_prep(
|
|
|
struct file *file_in,
|
|
|
loff_t pos_in,
|
|
|
struct file *file_out,
|
|
|
loff_t pos_out,
|
|
|
- u64 len,
|
|
|
+ u64 *len,
|
|
|
bool is_dedupe)
|
|
|
{
|
|
|
struct inode *inode_in = file_inode(file_in);
|
|
|
struct xfs_inode *src = XFS_I(inode_in);
|
|
|
struct inode *inode_out = file_inode(file_out);
|
|
|
struct xfs_inode *dest = XFS_I(inode_out);
|
|
|
- struct xfs_mount *mp = src->i_mount;
|
|
|
bool same_inode = (inode_in == inode_out);
|
|
|
- xfs_fileoff_t sfsbno, dfsbno;
|
|
|
- xfs_filblks_t fsblen;
|
|
|
- xfs_extlen_t cowextsize;
|
|
|
+ u64 blkmask = i_blocksize(inode_in) - 1;
|
|
|
ssize_t ret;
|
|
|
|
|
|
- if (!xfs_sb_version_hasreflink(&mp->m_sb))
|
|
|
- return -EOPNOTSUPP;
|
|
|
-
|
|
|
- if (XFS_FORCED_SHUTDOWN(mp))
|
|
|
- return -EIO;
|
|
|
-
|
|
|
/* Lock both files against IO */
|
|
|
ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out);
|
|
|
if (ret)
|
|
@@ -1270,33 +1327,115 @@ xfs_reflink_remap_range(
|
|
|
goto out_unlock;
|
|
|
|
|
|
ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
|
|
|
- &len, is_dedupe);
|
|
|
+ len, is_dedupe);
|
|
|
if (ret <= 0)
|
|
|
goto out_unlock;
|
|
|
|
|
|
+ /*
|
|
|
+ * If the dedupe data matches, chop off the partial EOF block
|
|
|
+ * from the source file so we don't try to dedupe the partial
|
|
|
+ * EOF block.
|
|
|
+ */
|
|
|
+ if (is_dedupe) {
|
|
|
+ *len &= ~blkmask;
|
|
|
+ } else if (*len & blkmask) {
|
|
|
+ /*
|
|
|
+ * The user is attempting to share a partial EOF block,
|
|
|
+ * if it's inside the destination EOF then reject it.
|
|
|
+ */
|
|
|
+ if (pos_out + *len < i_size_read(inode_out)) {
|
|
|
+ ret = -EINVAL;
|
|
|
+ goto out_unlock;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
/* Attach dquots to dest inode before changing block map */
|
|
|
ret = xfs_qm_dqattach(dest);
|
|
|
if (ret)
|
|
|
goto out_unlock;
|
|
|
|
|
|
- trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
|
|
|
-
|
|
|
/*
|
|
|
- * Clear out post-eof preallocations because we don't have page cache
|
|
|
- * backing the delayed allocations and they'll never get freed on
|
|
|
- * their own.
|
|
|
+ * Zero existing post-eof speculative preallocations in the destination
|
|
|
+ * file.
|
|
|
*/
|
|
|
- if (xfs_can_free_eofblocks(dest, true)) {
|
|
|
- ret = xfs_free_eofblocks(dest);
|
|
|
- if (ret)
|
|
|
- goto out_unlock;
|
|
|
- }
|
|
|
+ ret = xfs_reflink_zero_posteof(dest, pos_out);
|
|
|
+ if (ret)
|
|
|
+ goto out_unlock;
|
|
|
|
|
|
/* Set flags and remap blocks. */
|
|
|
ret = xfs_reflink_set_inode_flag(src, dest);
|
|
|
if (ret)
|
|
|
goto out_unlock;
|
|
|
|
|
|
+ /* Zap any page cache for the destination file's range. */
|
|
|
+ truncate_inode_pages_range(&inode_out->i_data, pos_out,
|
|
|
+ PAGE_ALIGN(pos_out + *len) - 1);
|
|
|
+
|
|
|
+ /* If we're altering the file contents... */
|
|
|
+ if (!is_dedupe) {
|
|
|
+ /*
|
|
|
+ * ...update the timestamps (which will grab the ilock again
|
|
|
+ * from xfs_fs_dirty_inode, so we have to call it before we
|
|
|
+ * take the ilock).
|
|
|
+ */
|
|
|
+ if (!(file_out->f_mode & FMODE_NOCMTIME)) {
|
|
|
+ ret = file_update_time(file_out);
|
|
|
+ if (ret)
|
|
|
+ goto out_unlock;
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ * ...clear the security bits if the process is not being run
|
|
|
+ * by root. This keeps people from modifying setuid and setgid
|
|
|
+ * binaries.
|
|
|
+ */
|
|
|
+ ret = file_remove_privs(file_out);
|
|
|
+ if (ret)
|
|
|
+ goto out_unlock;
|
|
|
+ }
|
|
|
+
|
|
|
+ return 1;
|
|
|
+out_unlock:
|
|
|
+ xfs_reflink_remap_unlock(file_in, file_out);
|
|
|
+ return ret;
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ * Link a range of blocks from one file to another.
|
|
|
+ */
|
|
|
+int
|
|
|
+xfs_reflink_remap_range(
|
|
|
+ struct file *file_in,
|
|
|
+ loff_t pos_in,
|
|
|
+ struct file *file_out,
|
|
|
+ loff_t pos_out,
|
|
|
+ u64 len,
|
|
|
+ bool is_dedupe)
|
|
|
+{
|
|
|
+ struct inode *inode_in = file_inode(file_in);
|
|
|
+ struct xfs_inode *src = XFS_I(inode_in);
|
|
|
+ struct inode *inode_out = file_inode(file_out);
|
|
|
+ struct xfs_inode *dest = XFS_I(inode_out);
|
|
|
+ struct xfs_mount *mp = src->i_mount;
|
|
|
+ xfs_fileoff_t sfsbno, dfsbno;
|
|
|
+ xfs_filblks_t fsblen;
|
|
|
+ xfs_extlen_t cowextsize;
|
|
|
+ ssize_t ret;
|
|
|
+
|
|
|
+ if (!xfs_sb_version_hasreflink(&mp->m_sb))
|
|
|
+ return -EOPNOTSUPP;
|
|
|
+
|
|
|
+ if (XFS_FORCED_SHUTDOWN(mp))
|
|
|
+ return -EIO;
|
|
|
+
|
|
|
+ /* Prepare and then clone file data. */
|
|
|
+ ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
|
|
|
+ &len, is_dedupe);
|
|
|
+ if (ret <= 0)
|
|
|
+ return ret;
|
|
|
+
|
|
|
+ trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
|
|
|
+
|
|
|
dfsbno = XFS_B_TO_FSBT(mp, pos_out);
|
|
|
sfsbno = XFS_B_TO_FSBT(mp, pos_in);
|
|
|
fsblen = XFS_B_TO_FSB(mp, len);
|
|
@@ -1305,10 +1444,6 @@ xfs_reflink_remap_range(
|
|
|
if (ret)
|
|
|
goto out_unlock;
|
|
|
|
|
|
- /* Zap any page cache for the destination file's range. */
|
|
|
- truncate_inode_pages_range(&inode_out->i_data, pos_out,
|
|
|
- PAGE_ALIGN(pos_out + len) - 1);
|
|
|
-
|
|
|
/*
|
|
|
* Carry the cowextsize hint from src to dest if we're sharing the
|
|
|
* entire source file to the entire destination file, the source file
|
|
@@ -1325,12 +1460,7 @@ xfs_reflink_remap_range(
|
|
|
is_dedupe);
|
|
|
|
|
|
out_unlock:
|
|
|
- xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
|
|
|
- if (!same_inode)
|
|
|
- xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
|
|
|
- inode_unlock(inode_out);
|
|
|
- if (!same_inode)
|
|
|
- inode_unlock_shared(inode_in);
|
|
|
+ xfs_reflink_remap_unlock(file_in, file_out);
|
|
|
if (ret)
|
|
|
trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
|
|
|
return ret;
|