@@ -1912,16 +1912,17 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
 
 static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
 {
+        const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC);
         struct file *file = iocb->ki_filp;
         struct inode *inode = file_inode(file);
         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
         loff_t pos;
         ssize_t written = 0;
         ssize_t written_buffered;
+        size_t prev_left = 0;
         loff_t endbyte;
         ssize_t err;
         unsigned int ilock_flags = 0;
-        struct iomap_dio *dio = NULL;
 
         if (iocb->ki_flags & IOCB_NOWAIT)
                 ilock_flags |= BTRFS_ILOCK_TRY;
@@ -1964,23 +1965,80 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
                 goto buffered;
         }
 
-        dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
-                             0, 0);
+        /*
+         * We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw()
+         * calls generic_write_sync() (through iomap_dio_complete()), because
+         * that results in calling fsync (btrfs_sync_file()) which will try to
+         * lock the inode in exclusive/write mode.
+         */
+        if (is_sync_write)
+                iocb->ki_flags &= ~IOCB_DSYNC;
 
-        btrfs_inode_unlock(inode, ilock_flags);
+        /*
+         * The iov_iter can be mapped to the same file range we are writing to.
+         * If that's the case, then we will deadlock in the iomap code, because
+         * it first calls our callback btrfs_dio_iomap_begin(), which will create
+         * an ordered extent, and after that it will fault in the pages that the
+         * iov_iter refers to. During the fault in we end up in the readahead
+         * pages code (starting at btrfs_readahead()), which will lock the range,
+         * find that ordered extent and then wait for it to complete (at
+         * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
+         * obviously the ordered extent can never complete as we haven't yet
+         * submitted the respective bio(s). This always happens when the buffer
+         * is memory mapped to the same file range, since the iomap DIO code
+         * always invalidates pages in the target file range (after starting
+         * and waiting for any writeback).
+         *
+         * So here we disable page faults in the iov_iter and then retry if we
+         * got -EFAULT, faulting in the pages before the retry.
+         */
+again:
+        from->nofault = true;
+        err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+                           IOMAP_DIO_PARTIAL, written);
+        from->nofault = false;
 
-        if (IS_ERR_OR_NULL(dio)) {
-                err = PTR_ERR_OR_ZERO(dio);
-                if (err < 0 && err != -ENOTBLK)
-                        goto out;
-        } else {
-                written = iomap_dio_complete(dio);
+        /* No increment (+=) because iomap returns a cumulative value. */
+        if (err > 0)
+                written = err;
+
+        if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
+                const size_t left = iov_iter_count(from);
+                /*
+                 * We have more data left to write. Try to fault in as many of
+                 * the remaining pages as possible and retry. We do this without
+                 * releasing and re-locking the inode, to prevent races with
+                 * truncate.
+                 *
+                 * Also, in case the iov refers to pages in the file range of the
+                 * file we want to write to (due to a mmap), we could enter an
+                 * infinite loop if we retry after faulting the pages in, since
+                 * iomap will invalidate any pages in the range early on, before
+                 * it tries to fault in the pages of the iov. So we keep track of
+                 * how much of the iov was left in the previous EFAULT and fall
+                 * back to buffered IO in case we haven't made any progress.
+                 */
+                if (left == prev_left) {
+                        err = -ENOTBLK;
+                } else {
+                        fault_in_iov_iter_readable(from, left);
+                        prev_left = left;
+                        goto again;
+                }
         }
 
-        if (written < 0 || !iov_iter_count(from)) {
-                err = written;
+        btrfs_inode_unlock(inode, ilock_flags);
+
+        /*
+         * Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do
+         * the fsync (call generic_write_sync()).
+         */
+        if (is_sync_write)
+                iocb->ki_flags |= IOCB_DSYNC;
+
+        /* If 'err' is -ENOTBLK then it means we must fall back to buffered IO. */
+        if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
                 goto out;
-        }
 
 buffered:
         pos = iocb->ki_pos;
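
The heart of the write-path change is the retry loop above: disable page faults on the iterator, issue the direct IO with IOMAP_DIO_PARTIAL, and on -EFAULT fault the remaining pages in and retry, giving up with -ENOTBLK (which routes to the buffered: label) when a retry makes no progress. The following is a minimal userspace sketch of just that control flow; struct fake_iter, do_dio_attempt() and fault_in_pages() are stand-ins invented for illustration, not kernel APIs, with the cumulative-return convention of iomap_dio_rw() modeled by hand.

#include <errno.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <sys/types.h>

/* Toy iterator: 'count' is bytes left, 'faults_left' is how many times a
 * page fault would still be needed before the buffer is fully resident. */
struct fake_iter {
        size_t count;
        bool nofault;
        int faults_left;
};

/* Stand-in for iomap_dio_rw() with IOMAP_DIO_PARTIAL: returns a cumulative
 * byte count ('done' plus new progress), or -EFAULT when it would have to
 * fault a page in while faults are disabled. */
static ssize_t do_dio_attempt(struct fake_iter *it, ssize_t done)
{
        if (it->faults_left > 0 && it->nofault)
                return -EFAULT;
        done += (ssize_t)it->count;
        it->count = 0;
        return done;
}

/* Stand-in for fault_in_iov_iter_readable(); faulting in may or may not
 * help (it never does when the buffer is mmap'ed over the target range,
 * since iomap invalidates those pages again on the next attempt). */
static void fault_in_pages(struct fake_iter *it)
{
        if (it->faults_left > 0)
                it->faults_left--;
}

static ssize_t direct_write_like(struct fake_iter *it)
{
        ssize_t written = 0;
        ssize_t err;
        size_t prev_left = 0;

again:
        it->nofault = true;
        err = do_dio_attempt(it, written);
        it->nofault = false;

        if (err > 0)
                written = err;  /* cumulative value: assign, don't add */

        if (it->count > 0 && (err == -EFAULT || err > 0)) {
                const size_t left = it->count;

                if (left == prev_left) {
                        err = -ENOTBLK; /* no progress: buffered fallback */
                } else {
                        fault_in_pages(it);
                        prev_left = left;
                        goto again;
                }
        }
        return err < 0 ? err : written;
}

int main(void)
{
        struct fake_iter ok = { .count = 4096, .faults_left = 1 };
        struct fake_iter stuck = { .count = 4096, .faults_left = 2 };

        printf("fault in once, then done: %zd\n", direct_write_like(&ok));
        printf("no progress -> -ENOTBLK:  %zd\n", direct_write_like(&stuck));
        return 0;
}

In the real function, the -ENOTBLK result is exactly what sends control on to the buffered-IO path.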
@@ -2005,7 +2063,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
         invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
                                  endbyte >> PAGE_SHIFT);
 out:
-        return written ? written : err;
+        return err < 0 ? err : written;
 }
 
 static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
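
One subtlety in the new out: expression: with IOMAP_DIO_PARTIAL in play, 'written' is kept non-negative (it only tracks cumulative progress) and errors now travel exclusively in 'err', so the old 'written ? written : err' test no longer fits the variable roles. A toy comparison of the two conventions, with made-up values for illustration:

#include <errno.h>
#include <stdio.h>
#include <sys/types.h>

static ssize_t ret_old(ssize_t written, ssize_t err)
{
        return written ? written : err;   /* pre-patch convention */
}

static ssize_t ret_new(ssize_t written, ssize_t err)
{
        return err < 0 ? err : written;   /* post-patch convention */
}

int main(void)
{
        /* Some progress was made, then a hard error was recorded in 'err':
         * under the new variable roles the old test would return the byte
         * count and hide the error. */
        printf("old: %zd, new: %zd\n", ret_old(8192, -EIO), ret_new(8192, -EIO));

        /* Nothing written and an error: both conventions agree. */
        printf("old: %zd, new: %zd\n", ret_old(0, -EIO), ret_new(0, -EIO));
        return 0;
}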
@@ -3659,6 +3717,8 @@ static int check_direct_read(struct btrfs_fs_info *fs_info,
 static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
 {
         struct inode *inode = file_inode(iocb->ki_filp);
+        size_t prev_left = 0;
+        ssize_t read = 0;
         ssize_t ret;
 
         if (fsverity_active(inode))
@@ -3668,10 +3728,57 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
                 return 0;
 
         btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
+again:
+        /*
+         * This is similar to what we do for direct IO writes, see the comment
+         * at btrfs_direct_write(), but we also disable page faults in addition
+         * to disabling them only at the iov_iter level. This is because when
+         * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
+         * which can still trigger page fault-ins despite having set ->nofault
+         * to true on our 'to' iov_iter.
+         *
+         * The difference to direct IO writes is that we deadlock when trying
+         * to lock the extent range in the inode's tree during the page reads
+         * triggered by the fault in (while for writes it is due to waiting for
+         * our own ordered extent). This is because for direct IO reads,
+         * btrfs_dio_iomap_begin() returns with the extent range locked, which
+         * is only unlocked in the endio callback (end_bio_extent_readpage()).
+         */
+        pagefault_disable();
+        to->nofault = true;
         ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
-                           0, 0);
+                           IOMAP_DIO_PARTIAL, read);
+        to->nofault = false;
+        pagefault_enable();
+
+        /* No increment (+=) because iomap returns a cumulative value. */
+        if (ret > 0)
+                read = ret;
+
+        if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
+                const size_t left = iov_iter_count(to);
+
+                if (left == prev_left) {
+                        /*
+                         * We didn't make any progress since the last attempt,
+                         * fall back to a buffered read for the remainder of the
+                         * range. This is just to avoid any possibility of looping
+                         * for too long.
+                         */
+                        ret = read;
+                } else {
+                        /*
+                         * We made some progress since the last retry or this is
+                         * the first time we are retrying. Fault in as many pages
+                         * as possible and retry.
+                         */
+                        fault_in_iov_iter_writeable(to, left);
+                        prev_left = left;
+                        goto again;
+                }
+        }
         btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
-        return ret;
+        return ret < 0 ? ret : read;
 }
 
 static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
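
The read side mirrors the write-side retry loop, with two deliberate differences: the iomap_dio_rw() call is additionally bracketed by pagefault_disable()/pagefault_enable(), because iov_iter_zero() (used when reading from holes or prealloc extents) does not honour ->nofault, and the no-progress case returns the bytes read so far instead of -ENOTBLK, leaving the caller (btrfs_file_read_iter()) to finish the remainder through the page cache. A companion sketch of that fallback decision, using the same kind of invented stand-ins as the write sketch above (struct fake_iter, toy_dio_read() and toy_fault_in() are not kernel APIs):

#include <errno.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <sys/types.h>

struct fake_iter {
        size_t count;
        bool nofault;
        int faults_left;
};

/* Same cumulative-return model as in the write sketch. */
static ssize_t toy_dio_read(struct fake_iter *it, ssize_t done)
{
        if (it->faults_left > 0 && it->nofault)
                return -EFAULT;
        done += (ssize_t)it->count;
        it->count = 0;
        return done;
}

static void toy_fault_in(struct fake_iter *it)
{
        if (it->faults_left > 0)
                it->faults_left--;
}

static ssize_t direct_read_like(struct fake_iter *it)
{
        size_t prev_left = 0;
        ssize_t read = 0;
        ssize_t ret;

again:
        /* The kernel additionally wraps this attempt in pagefault_disable()
         * and pagefault_enable(), since iov_iter_zero() ignores ->nofault. */
        it->nofault = true;
        ret = toy_dio_read(it, read);
        it->nofault = false;

        if (ret > 0)
                read = ret;     /* cumulative value: assign, don't add */

        if (it->count > 0 && (ret == -EFAULT || ret > 0)) {
                const size_t left = it->count;

                if (left == prev_left) {
                        ret = read;     /* no progress: return the partial
                                           count; the caller reads the rest
                                           through the page cache */
                } else {
                        toy_fault_in(it);
                        prev_left = left;
                        goto again;
                }
        }
        return ret < 0 ? ret : read;
}

int main(void)
{
        struct fake_iter stuck = { .count = 4096, .faults_left = 2 };

        /* Prints 0, not an error: the caller then falls back to buffered. */
        printf("read result: %zd\n", direct_read_like(&stuck));
        return 0;
}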