Skip to content

Commit 7e64c86

Browse files
author
Kent Overstreet
committed
bcachefs: Buffered write path now can avoid the inode lock
Non-append, non-extending buffered writes can now avoid taking the inode lock. To ensure atomicity of writes w.r.t. other writes, we lock every folio that we'll be writing to, and if this fails we fall back to taking the inode lock. Extensive comments are provided as to corner cases. Link: https://lore.kernel.org/linux-fsdevel/[email protected]/ Signed-off-by: Kent Overstreet <[email protected]>
1 parent 66a67c8 commit 7e64c86

File tree

2 files changed

+111
-41
lines changed

2 files changed

+111
-41
lines changed

fs/bcachefs/errcode.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,8 @@
250250
x(BCH_ERR_nopromote, nopromote_congested) \
251251
x(BCH_ERR_nopromote, nopromote_in_flight) \
252252
x(BCH_ERR_nopromote, nopromote_no_writes) \
253-
x(BCH_ERR_nopromote, nopromote_enomem)
253+
x(BCH_ERR_nopromote, nopromote_enomem) \
254+
x(0, need_inode_lock)
254255

255256
enum bch_errcode {
256257
BCH_ERR_START = 2048,

fs/bcachefs/fs-io-buffered.c

Lines changed: 109 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -810,7 +810,8 @@ static noinline void folios_trunc(folios *fs, struct folio **fi)
810810
static int __bch2_buffered_write(struct bch_inode_info *inode,
811811
struct address_space *mapping,
812812
struct iov_iter *iter,
813-
loff_t pos, unsigned len)
813+
loff_t pos, unsigned len,
814+
bool inode_locked)
814815
{
815816
struct bch_fs *c = inode->v.i_sb->s_fs_info;
816817
struct bch2_folio_reservation res;
@@ -835,6 +836,15 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
835836

836837
BUG_ON(!fs.nr);
837838

839+
/*
840+
* If we're not using the inode lock, we need to lock all the folios for
841+
* atomicity of writes vs. other writes:
842+
*/
843+
if (!inode_locked && folio_end_pos(darray_last(fs)) < end) {
844+
ret = -BCH_ERR_need_inode_lock;
845+
goto out;
846+
}
847+
838848
f = darray_first(fs);
839849
if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
840850
ret = bch2_read_single_folio(f, mapping);
@@ -929,8 +939,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
929939
end = pos + copied;
930940

931941
spin_lock(&inode->v.i_lock);
932-
if (end > inode->v.i_size)
942+
if (end > inode->v.i_size) {
943+
BUG_ON(!inode_locked);
933944
i_size_write(&inode->v, end);
945+
}
934946
spin_unlock(&inode->v.i_lock);
935947

936948
f_pos = pos;
@@ -974,12 +986,68 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
974986
struct file *file = iocb->ki_filp;
975987
struct address_space *mapping = file->f_mapping;
976988
struct bch_inode_info *inode = file_bch_inode(file);
977-
loff_t pos = iocb->ki_pos;
978-
ssize_t written = 0;
979-
int ret = 0;
989+
loff_t pos;
990+
bool inode_locked = false;
991+
ssize_t written = 0, written2 = 0, ret = 0;
992+
993+
/*
994+
* We don't take the inode lock unless i_size will be changing. Folio
995+
* locks provide exclusion with other writes, and the pagecache add lock
996+
* provides exclusion with truncate and hole punching.
997+
*
998+
* There is one nasty corner case where atomicity would be broken
999+
* without great care: when copying data from userspace to the page
1000+
* cache, we do that with faults disabled - a page fault would recurse
1001+
* back into the filesystem, taking filesystem locks again, and
1002+
* deadlock; so it's done with faults disabled, and we fault in the user
1003+
* buffer when we aren't holding locks.
1004+
*
1005+
* If we do part of the write, but we then race and in the userspace
1006+
* buffer have been evicted and are no longer resident, then we have to
1007+
* drop our folio locks to re-fault them in, breaking write atomicity.
1008+
*
1009+
* To fix this, we restart the write from the start, if we weren't
1010+
* holding the inode lock.
1011+
*
1012+
* There is another wrinkle after that; if we restart the write from the
1013+
* start, and then get an unrecoverable error, we _cannot_ claim to
1014+
* userspace that we did not write data we actually did - so we must
1015+
* track (written2) the most we ever wrote.
1016+
*/
1017+
1018+
if ((iocb->ki_flags & IOCB_APPEND) ||
1019+
(iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) {
1020+
inode_lock(&inode->v);
1021+
inode_locked = true;
1022+
}
1023+
1024+
ret = generic_write_checks(iocb, iter);
1025+
if (ret <= 0)
1026+
goto unlock;
1027+
1028+
ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0);
1029+
if (ret) {
1030+
if (!inode_locked) {
1031+
inode_lock(&inode->v);
1032+
inode_locked = true;
1033+
ret = file_remove_privs_flags(file, 0);
1034+
}
1035+
if (ret)
1036+
goto unlock;
1037+
}
1038+
1039+
ret = file_update_time(file);
1040+
if (ret)
1041+
goto unlock;
1042+
1043+
pos = iocb->ki_pos;
9801044

9811045
bch2_pagecache_add_get(inode);
9821046

1047+
if (!inode_locked &&
1048+
(iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v)))
1049+
goto get_inode_lock;
1050+
9831051
do {
9841052
unsigned offset = pos & (PAGE_SIZE - 1);
9851053
unsigned bytes = iov_iter_count(iter);
@@ -1004,12 +1072,17 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
10041072
}
10051073
}
10061074

1075+
if (unlikely(bytes != iov_iter_count(iter) && !inode_locked))
1076+
goto get_inode_lock;
1077+
10071078
if (unlikely(fatal_signal_pending(current))) {
10081079
ret = -EINTR;
10091080
break;
10101081
}
10111082

1012-
ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
1083+
ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked);
1084+
if (ret == -BCH_ERR_need_inode_lock)
1085+
goto get_inode_lock;
10131086
if (unlikely(ret < 0))
10141087
break;
10151088

@@ -1030,50 +1103,46 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
10301103
}
10311104
pos += ret;
10321105
written += ret;
1106+
written2 = max(written, written2);
1107+
1108+
if (ret != bytes && !inode_locked)
1109+
goto get_inode_lock;
10331110
ret = 0;
10341111

10351112
balance_dirty_pages_ratelimited(mapping);
1036-
} while (iov_iter_count(iter));
10371113

1114+
if (0) {
1115+
get_inode_lock:
1116+
bch2_pagecache_add_put(inode);
1117+
inode_lock(&inode->v);
1118+
inode_locked = true;
1119+
bch2_pagecache_add_get(inode);
1120+
1121+
iov_iter_revert(iter, written);
1122+
pos -= written;
1123+
written = 0;
1124+
ret = 0;
1125+
}
1126+
} while (iov_iter_count(iter));
10381127
bch2_pagecache_add_put(inode);
1128+
unlock:
1129+
if (inode_locked)
1130+
inode_unlock(&inode->v);
1131+
1132+
iocb->ki_pos += written;
10391133

1040-
return written ? written : ret;
1134+
ret = max(written, written2) ?: ret;
1135+
if (ret > 0)
1136+
ret = generic_write_sync(iocb, ret);
1137+
return ret;
10411138
}
10421139

1043-
ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
1140+
ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter)
10441141
{
1045-
struct file *file = iocb->ki_filp;
1046-
struct bch_inode_info *inode = file_bch_inode(file);
1047-
ssize_t ret;
1048-
1049-
if (iocb->ki_flags & IOCB_DIRECT) {
1050-
ret = bch2_direct_write(iocb, from);
1051-
goto out;
1052-
}
1053-
1054-
inode_lock(&inode->v);
1055-
1056-
ret = generic_write_checks(iocb, from);
1057-
if (ret <= 0)
1058-
goto unlock;
1059-
1060-
ret = file_remove_privs(file);
1061-
if (ret)
1062-
goto unlock;
1063-
1064-
ret = file_update_time(file);
1065-
if (ret)
1066-
goto unlock;
1067-
1068-
ret = bch2_buffered_write(iocb, from);
1069-
if (likely(ret > 0))
1070-
iocb->ki_pos += ret;
1071-
unlock:
1072-
inode_unlock(&inode->v);
1142+
ssize_t ret = iocb->ki_flags & IOCB_DIRECT
1143+
? bch2_direct_write(iocb, iter)
1144+
: bch2_buffered_write(iocb, iter);
10731145

1074-
if (ret > 0)
1075-
ret = generic_write_sync(iocb, ret);
1076-
out:
10771146
return bch2_err_class(ret);
10781147
}
10791148

0 commit comments

Comments
 (0)