Skip to content

Commit b01b2d7

Browse files
author
Andreas Gruenbacher
committed
gfs2: Fix mmap + page fault deadlocks for direct I/O
Also disable page faults during direct I/O requests and implement a similar kind of retry logic as in the buffered I/O case. The retry logic in the direct I/O case differs from the buffered I/O case in the following way: direct I/O doesn't provide the kinds of consistency guarantees between concurrent reads and writes that buffered I/O provides, so once we lose the inode glock while faulting in user pages, we always resume the operation. We never need to return a partial read or write. This locking problem was originally reported by Jan Kara. Linus came up with the idea of disabling page faults. Many thanks to Al Viro and Matthew Wilcox for their feedback. Signed-off-by: Andreas Gruenbacher <[email protected]>
1 parent 3337ab0 commit b01b2d7

File tree

1 file changed

+87
-12
lines changed

1 file changed

+87
-12
lines changed

fs/gfs2/file.c

Lines changed: 87 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -811,22 +811,64 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
811811
{
812812
struct file *file = iocb->ki_filp;
813813
struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
814-
size_t count = iov_iter_count(to);
814+
size_t prev_count = 0, window_size = 0;
815+
size_t written = 0;
815816
ssize_t ret;
816817

817-
if (!count)
818+
/*
819+
* In this function, we disable page faults when we're holding the
820+
* inode glock while doing I/O. If a page fault occurs, we indicate
821+
* that the inode glock may be dropped, fault in the pages manually,
822+
* and retry.
823+
*
824+
* Unlike generic_file_read_iter, for reads, iomap_dio_rw can trigger
825+
* physical as well as manual page faults, and we need to disable both
826+
* kinds.
827+
*
828+
* For direct I/O, gfs2 takes the inode glock in deferred mode. This
829+
* locking mode is compatible with other deferred holders, so multiple
830+
* processes and nodes can do direct I/O to a file at the same time.
831+
* There's no guarantee that reads or writes will be atomic. Any
832+
* coordination among readers and writers needs to happen externally.
833+
*/
834+
835+
if (!iov_iter_count(to))
818836
return 0; /* skip atime */
819837

820838
gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
839+
retry:
821840
ret = gfs2_glock_nq(gh);
822841
if (ret)
823842
goto out_uninit;
843+
retry_under_glock:
844+
pagefault_disable();
845+
to->nofault = true;
846+
ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
847+
IOMAP_DIO_PARTIAL, written);
848+
to->nofault = false;
849+
pagefault_enable();
850+
if (ret > 0)
851+
written = ret;
824852

825-
ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0, 0);
826-
gfs2_glock_dq(gh);
853+
if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
854+
size_t leftover;
855+
856+
gfs2_holder_allow_demote(gh);
857+
leftover = fault_in_iov_iter_writeable(to, window_size);
858+
gfs2_holder_disallow_demote(gh);
859+
if (leftover != window_size) {
860+
if (!gfs2_holder_queued(gh))
861+
goto retry;
862+
goto retry_under_glock;
863+
}
864+
}
865+
if (gfs2_holder_queued(gh))
866+
gfs2_glock_dq(gh);
827867
out_uninit:
828868
gfs2_holder_uninit(gh);
829-
return ret;
869+
if (ret < 0)
870+
return ret;
871+
return written;
830872
}
831873

832874
static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
@@ -835,10 +877,20 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
835877
struct file *file = iocb->ki_filp;
836878
struct inode *inode = file->f_mapping->host;
837879
struct gfs2_inode *ip = GFS2_I(inode);
838-
size_t len = iov_iter_count(from);
839-
loff_t offset = iocb->ki_pos;
880+
size_t prev_count = 0, window_size = 0;
881+
size_t read = 0;
840882
ssize_t ret;
841883

884+
/*
885+
* In this function, we disable page faults when we're holding the
886+
* inode glock while doing I/O. If a page fault occurs, we indicate
887+
* that the inode glock may be dropped, fault in the pages manually,
888+
* and retry.
889+
*
890+
* For writes, iomap_dio_rw only triggers manual page faults, so we
891+
* don't need to disable physical ones.
892+
*/
893+
842894
/*
843895
* Deferred lock, even if its a write, since we do no allocation on
844896
* this path. All we need to change is the atime, and this lock mode
@@ -848,22 +900,45 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
848900
* VFS does.
849901
*/
850902
gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
903+
retry:
851904
ret = gfs2_glock_nq(gh);
852905
if (ret)
853906
goto out_uninit;
854-
907+
retry_under_glock:
855908
/* Silently fall back to buffered I/O when writing beyond EOF */
856-
if (offset + len > i_size_read(&ip->i_inode))
909+
if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode))
857910
goto out;
858911

859-
ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0, 0);
912+
from->nofault = true;
913+
ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
914+
IOMAP_DIO_PARTIAL, read);
915+
from->nofault = false;
916+
860917
if (ret == -ENOTBLK)
861918
ret = 0;
919+
if (ret > 0)
920+
read = ret;
921+
922+
if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
923+
size_t leftover;
924+
925+
gfs2_holder_allow_demote(gh);
926+
leftover = fault_in_iov_iter_readable(from, window_size);
927+
gfs2_holder_disallow_demote(gh);
928+
if (leftover != window_size) {
929+
if (!gfs2_holder_queued(gh))
930+
goto retry;
931+
goto retry_under_glock;
932+
}
933+
}
862934
out:
863-
gfs2_glock_dq(gh);
935+
if (gfs2_holder_queued(gh))
936+
gfs2_glock_dq(gh);
864937
out_uninit:
865938
gfs2_holder_uninit(gh);
866-
return ret;
939+
if (ret < 0)
940+
return ret;
941+
return read;
867942
}
868943

869944
static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)

0 commit comments

Comments (0)