@@ -242,19 +242,30 @@ xfs_file_fsync(
 	}
 
 	/*
-	 * All metadata updates are logged, which means that we just have
-	 * to flush the log up to the latest LSN that touched the inode.
+	 * All metadata updates are logged, which means that we just have to
+	 * flush the log up to the latest LSN that touched the inode. If we have
+	 * concurrent fsync/fdatasync() calls, we need them to all block on the
+	 * log force before we clear the ili_fsync_fields field. This ensures
+	 * that we don't get a racing sync operation that does not wait for the
+	 * metadata to hit the journal before returning. If we race with
+	 * clearing the ili_fsync_fields, then all that will happen is the log
+	 * force will do nothing as the lsn will already be on disk. We can't
+	 * race with setting ili_fsync_fields because that is done under
+	 * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
+	 * until after the ili_fsync_fields is cleared.
 	 */
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 	if (xfs_ipincount(ip)) {
 		if (!datasync ||
-		    (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP))
+		    (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
 			lsn = ip->i_itemp->ili_last_lsn;
 	}
-	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
-	if (lsn)
+	if (lsn) {
 		error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
+		ip->i_itemp->ili_fsync_fields = 0;
+	}
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
 	/*
 	 * If we only have a single device, and the log force about was
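
The ili_fsync_fields check above is what lets fdatasync() skip the log force when only timestamps are dirty, while a full fsync() still forces the log. As an illustration only (not part of the patch), a minimal userspace program that exercises both calls; the file path is arbitrary:

/* Illustration only: fsync() vs fdatasync() with timestamp-only metadata. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/testfile", O_CREAT | O_RDWR, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Dirty data and metadata: both sync variants must force the log. */
	if (pwrite(fd, "hello", 5, 0) != 5)
		perror("pwrite");
	fdatasync(fd);

	/* Timestamp-only update afterwards. */
	futimens(fd, NULL);
	fdatasync(fd);	/* datasync && only XFS_ILOG_TIMESTAMP dirty -> no log force */
	fsync(fd);	/* full fsync still forces the log */

	close(fd);
	return 0;
}
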
@@ -287,7 +298,7 @@ xfs_file_read_iter(
 	xfs_fsize_t		n;
 	loff_t			pos = iocb->ki_pos;
 
-	XFS_STATS_INC(xs_read_calls);
+	XFS_STATS_INC(mp, xs_read_calls);
 
 	if (unlikely(iocb->ki_flags & IOCB_DIRECT))
 		ioflags |= XFS_IO_ISDIRECT;
@@ -365,7 +376,7 @@ xfs_file_read_iter(
 
 	ret = generic_file_read_iter(iocb, to);
 	if (ret > 0)
-		XFS_STATS_ADD(xs_read_bytes, ret);
+		XFS_STATS_ADD(mp, xs_read_bytes, ret);
 
 	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 	return ret;
@@ -383,7 +394,7 @@ xfs_file_splice_read(
 	int			ioflags = 0;
 	ssize_t			ret;
 
-	XFS_STATS_INC(xs_read_calls);
+	XFS_STATS_INC(ip->i_mount, xs_read_calls);
 
 	if (infilp->f_mode & FMODE_NOCMTIME)
 		ioflags |= XFS_IO_INVIS;
@@ -401,7 +412,7 @@ xfs_file_splice_read(
 	else
 		ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
 	if (ret > 0)
-		XFS_STATS_ADD(xs_read_bytes, ret);
+		XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret);
 
 	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 	return ret;
@@ -482,6 +493,8 @@ xfs_zero_eof(
 	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
 	ASSERT(offset > isize);
 
+	trace_xfs_zero_eof(ip, isize, offset - isize);
+
 	/*
 	 * First handle zeroing the block on which isize resides.
 	 *
@@ -574,6 +587,7 @@ xfs_file_aio_write_checks(
 	struct xfs_inode	*ip = XFS_I(inode);
 	ssize_t			error = 0;
 	size_t			count = iov_iter_count(from);
+	bool			drained_dio = false;
 
 restart:
 	error = generic_write_checks(iocb, from);
@@ -611,12 +625,13 @@ restart:
 		bool	zero = false;
 
 		spin_unlock(&ip->i_flags_lock);
-		if (*iolock == XFS_IOLOCK_SHARED) {
-			xfs_rw_iunlock(ip, *iolock);
-			*iolock = XFS_IOLOCK_EXCL;
-			xfs_rw_ilock(ip, *iolock);
-			iov_iter_reexpand(from, count);
-
+		if (!drained_dio) {
+			if (*iolock == XFS_IOLOCK_SHARED) {
+				xfs_rw_iunlock(ip, *iolock);
+				*iolock = XFS_IOLOCK_EXCL;
+				xfs_rw_ilock(ip, *iolock);
+				iov_iter_reexpand(from, count);
+			}
 			/*
 			 * We now have an IO submission barrier in place, but
 			 * AIO can do EOF updates during IO completion and hence
@@ -626,6 +641,7 @@ restart:
 			 * no-op.
 			 */
 			inode_dio_wait(inode);
+			drained_dio = true;
 			goto restart;
 		}
 		error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero);
@@ -867,7 +883,7 @@ xfs_file_write_iter(
 	ssize_t			ret;
 	size_t			ocount = iov_iter_count(from);
 
-	XFS_STATS_INC(xs_write_calls);
+	XFS_STATS_INC(ip->i_mount, xs_write_calls);
 
 	if (ocount == 0)
 		return 0;
@@ -883,7 +899,7 @@ xfs_file_write_iter(
 	if (ret > 0) {
 		ssize_t		err;
 
-		XFS_STATS_ADD(xs_write_bytes, ret);
+		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
 
 		/* Handle various SYNC-type writes */
 		err = generic_write_sync(file, iocb->ki_pos - ret, ret);
@@ -1477,7 +1493,7 @@ xfs_file_llseek(
  *
  * mmap_sem (MM)
  *   sb_start_pagefault(vfs, freeze)
- *    i_mmap_lock (XFS - truncate serialisation)
+ *    i_mmaplock (XFS - truncate serialisation)
  *      page_lock (MM)
  *        i_lock (XFS - extent map serialisation)
  */
@@ -1503,8 +1519,7 @@ xfs_filemap_page_mkwrite(
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 	if (IS_DAX(inode)) {
-		ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
-				    xfs_end_io_dax_write);
+		ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL);
 	} else {
 		ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
 		ret = block_page_mkwrite_return(ret);
@@ -1538,7 +1553,7 @@ xfs_filemap_fault(
 		 * changes to xfs_get_blocks_direct() to map unwritten extent
 		 * ioend for conversion on read-only mappings.
 		 */
-		ret = __dax_fault(vma, vmf, xfs_get_blocks_direct, NULL);
+		ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL);
 	} else
 		ret = filemap_fault(vma, vmf);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1546,6 +1561,13 @@ xfs_filemap_fault(
 	return ret;
 }
 
+/*
+ * Similar to xfs_filemap_fault(), the DAX fault path can call into here on
+ * both read and write faults. There is no ->pmd_mkwrite callout for huge
+ * pages, so we have a single function here to handle both cases. @flags
+ * carries the information on the type of fault occurring.
+ */
 STATIC int
 xfs_filemap_pmd_fault(
 	struct vm_area_struct	*vma,
@@ -1562,15 +1584,54 @@ xfs_filemap_pmd_fault(
 
 	trace_xfs_filemap_pmd_fault(ip);
 
-	sb_start_pagefault(inode->i_sb);
-	file_update_time(vma->vm_file);
+	if (flags & FAULT_FLAG_WRITE) {
+		sb_start_pagefault(inode->i_sb);
+		file_update_time(vma->vm_file);
+	}
+
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_direct,
-			      xfs_end_io_dax_write);
+	ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault,
+			      NULL);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	sb_end_pagefault(inode->i_sb);
 
+	if (flags & FAULT_FLAG_WRITE)
+		sb_end_pagefault(inode->i_sb);
+
+	return ret;
+}
+
+/*
+ * pfn_mkwrite was originally intended to ensure we capture timestamp
+ * updates on write faults. In reality, it's needed to serialise against
+ * truncate similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite()
+ * here and cycle the XFS_MMAPLOCK_SHARED to ensure the fault serialisation
+ * barrier is in place.
+ */
+static int
+xfs_filemap_pfn_mkwrite(
+	struct vm_area_struct	*vma,
+	struct vm_fault		*vmf)
+{
+
+	struct inode		*inode = file_inode(vma->vm_file);
+	struct xfs_inode	*ip = XFS_I(inode);
+	int			ret = VM_FAULT_NOPAGE;
+	loff_t			size;
+
+	trace_xfs_filemap_pfn_mkwrite(ip);
+
+	sb_start_pagefault(inode->i_sb);
+	file_update_time(vma->vm_file);
+
+	/* check if the faulting page hasn't raced with truncate */
+	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (vmf->pgoff >= size)
+		ret = VM_FAULT_SIGBUS;
+	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+	sb_end_pagefault(inode->i_sb);
 	return ret;
+
 }
 
 static const struct vm_operations_struct xfs_file_vm_ops = {
@@ -1578,6 +1639,7 @@ static const struct vm_operations_struct xfs_file_vm_ops = {
 	.pmd_fault	= xfs_filemap_pmd_fault,
 	.map_pages	= filemap_map_pages,
 	.page_mkwrite	= xfs_filemap_page_mkwrite,
+	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
 };
 
 STATIC int
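
For reference, the fault callouts wired up in xfs_file_vm_ops above are driven by ordinary stores to a shared, file-backed mapping: the first write fault on a page goes through ->page_mkwrite (or the DAX fault paths), and on a DAX mapping a write fault on a pfn that was first mapped read-only goes through the new ->pfn_mkwrite. There, a fault whose pgoff is at or beyond the rounded-up size in pages gets VM_FAULT_SIGBUS; for example, pgoff 1 (byte offset 4096) against a 4096-byte file fails, since (4096 + PAGE_SIZE - 1) >> PAGE_SHIFT is 1. A minimal userspace sketch that would exercise these paths; the mount point and file name are assumptions, not part of the patch:

/* Illustration only: trigger write faults on a shared mapping of an XFS file. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/xfs/testfile", O_CREAT | O_RDWR, 0644);
	char *p;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ftruncate(fd, 4096) < 0) {
		perror("ftruncate");
		return 1;
	}

	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	p[0] = 'x';		/* write fault: page_mkwrite/pfn_mkwrite path */
	msync(p, 4096, MS_SYNC);

	munmap(p, 4096);
	close(fd);
	return 0;
}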