Skip to content

Commit 00bfe02

Browse files
author
Andreas Gruenbacher
committed
gfs2: Fix mmap + page fault deadlocks for buffered I/O
In the .read_iter and .write_iter file operations, we're accessing user-space memory while holding the inode glock. There is a possibility that the memory is mapped to the same file, in which case we'd recurse on the same glock. We could detect and work around this simple case of recursive locking, but more complex scenarios exist that involve multiple glocks, processes, and cluster nodes, and working around all of those cases isn't practical or even possible.

Avoid these kinds of problems by disabling page faults while holding the inode glock. If a page fault would occur, we either end up with a partial read or write or with -EFAULT if nothing could be read or written. In either case, we know that we're not done with the operation, so we indicate that we're willing to give up the inode glock and then we fault in the missing pages. If that made us lose the inode glock, we return a partial read or write. Otherwise, we resume the operation.

This locking problem was originally reported by Jan Kara. Linus came up with the idea of disabling page faults. Many thanks to Al Viro and Matthew Wilcox for their feedback.

Signed-off-by: Andreas Gruenbacher <[email protected]>
1 parent 1b223f7 commit 00bfe02

File tree

1 file changed

+94
-5
lines changed

1 file changed

+94
-5
lines changed

fs/gfs2/file.c

Lines changed: 94 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -776,6 +776,36 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
776776
return ret ? ret : ret1;
777777
}
778778

779+
static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i,
780+
size_t *prev_count,
781+
size_t *window_size)
782+
{
783+
char __user *p = i->iov[0].iov_base + i->iov_offset;
784+
size_t count = iov_iter_count(i);
785+
int pages = 1;
786+
787+
if (likely(!count))
788+
return false;
789+
if (ret <= 0 && ret != -EFAULT)
790+
return false;
791+
if (!iter_is_iovec(i))
792+
return false;
793+
794+
if (*prev_count != count || !*window_size) {
795+
int pages, nr_dirtied;
796+
797+
pages = min_t(int, BIO_MAX_VECS,
798+
DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE));
799+
nr_dirtied = max(current->nr_dirtied_pause -
800+
current->nr_dirtied, 1);
801+
pages = min(pages, nr_dirtied);
802+
}
803+
804+
*prev_count = count;
805+
*window_size = (size_t)PAGE_SIZE * pages - offset_in_page(p);
806+
return true;
807+
}
808+
779809
static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
780810
struct gfs2_holder *gh)
781811
{
@@ -840,9 +870,17 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
840870
{
841871
struct gfs2_inode *ip;
842872
struct gfs2_holder gh;
873+
size_t prev_count = 0, window_size = 0;
843874
size_t written = 0;
844875
ssize_t ret;
845876

877+
/*
878+
* In this function, we disable page faults when we're holding the
879+
* inode glock while doing I/O. If a page fault occurs, we indicate
880+
* that the inode glock may be dropped, fault in the pages manually,
881+
* and retry.
882+
*/
883+
846884
if (iocb->ki_flags & IOCB_DIRECT) {
847885
ret = gfs2_file_direct_read(iocb, to, &gh);
848886
if (likely(ret != -ENOTBLK))
@@ -864,13 +902,34 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
864902
}
865903
ip = GFS2_I(iocb->ki_filp->f_mapping->host);
866904
gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
905+
retry:
867906
ret = gfs2_glock_nq(&gh);
868907
if (ret)
869908
goto out_uninit;
909+
retry_under_glock:
910+
pagefault_disable();
870911
ret = generic_file_read_iter(iocb, to);
912+
pagefault_enable();
871913
if (ret > 0)
872914
written += ret;
873-
gfs2_glock_dq(&gh);
915+
916+
if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
917+
size_t leftover;
918+
919+
gfs2_holder_allow_demote(&gh);
920+
leftover = fault_in_iov_iter_writeable(to, window_size);
921+
gfs2_holder_disallow_demote(&gh);
922+
if (leftover != window_size) {
923+
if (!gfs2_holder_queued(&gh)) {
924+
if (written)
925+
goto out_uninit;
926+
goto retry;
927+
}
928+
goto retry_under_glock;
929+
}
930+
}
931+
if (gfs2_holder_queued(&gh))
932+
gfs2_glock_dq(&gh);
874933
out_uninit:
875934
gfs2_holder_uninit(&gh);
876935
return written ? written : ret;
@@ -885,19 +944,29 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
885944
struct gfs2_inode *ip = GFS2_I(inode);
886945
struct gfs2_sbd *sdp = GFS2_SB(inode);
887946
struct gfs2_holder *statfs_gh = NULL;
947+
size_t prev_count = 0, window_size = 0;
948+
size_t read = 0;
888949
ssize_t ret;
889950

951+
/*
952+
* In this function, we disable page faults when we're holding the
953+
* inode glock while doing I/O. If a page fault occurs, we indicate
954+
* that the inode glock may be dropped, fault in the pages manually,
955+
* and retry.
956+
*/
957+
890958
if (inode == sdp->sd_rindex) {
891959
statfs_gh = kmalloc(sizeof(*statfs_gh), GFP_NOFS);
892960
if (!statfs_gh)
893961
return -ENOMEM;
894962
}
895963

896964
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh);
965+
retry:
897966
ret = gfs2_glock_nq(gh);
898967
if (ret)
899968
goto out_uninit;
900-
969+
retry_under_glock:
901970
if (inode == sdp->sd_rindex) {
902971
struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
903972

@@ -908,21 +977,41 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
908977
}
909978

910979
current->backing_dev_info = inode_to_bdi(inode);
980+
pagefault_disable();
911981
ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
982+
pagefault_enable();
912983
current->backing_dev_info = NULL;
913-
if (ret > 0)
984+
if (ret > 0) {
914985
iocb->ki_pos += ret;
986+
read += ret;
987+
}
915988

916989
if (inode == sdp->sd_rindex)
917990
gfs2_glock_dq_uninit(statfs_gh);
918991

992+
if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
993+
size_t leftover;
994+
995+
gfs2_holder_allow_demote(gh);
996+
leftover = fault_in_iov_iter_readable(from, window_size);
997+
gfs2_holder_disallow_demote(gh);
998+
if (leftover != window_size) {
999+
if (!gfs2_holder_queued(gh)) {
1000+
if (read)
1001+
goto out_uninit;
1002+
goto retry;
1003+
}
1004+
goto retry_under_glock;
1005+
}
1006+
}
9191007
out_unlock:
920-
gfs2_glock_dq(gh);
1008+
if (gfs2_holder_queued(gh))
1009+
gfs2_glock_dq(gh);
9211010
out_uninit:
9221011
gfs2_holder_uninit(gh);
9231012
if (statfs_gh)
9241013
kfree(statfs_gh);
925-
return ret;
1014+
return read ? read : ret;
9261015
}
9271016

9281017
/**

0 commit comments

Comments
 (0)