@@ -810,7 +810,8 @@ static noinline void folios_trunc(folios *fs, struct folio **fi)
810
810
static int __bch2_buffered_write (struct bch_inode_info * inode ,
811
811
struct address_space * mapping ,
812
812
struct iov_iter * iter ,
813
- loff_t pos , unsigned len )
813
+ loff_t pos , unsigned len ,
814
+ bool inode_locked )
814
815
{
815
816
struct bch_fs * c = inode -> v .i_sb -> s_fs_info ;
816
817
struct bch2_folio_reservation res ;
@@ -835,6 +836,15 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
835
836
836
837
BUG_ON (!fs .nr );
837
838
839
+ /*
840
+ * If we're not using the inode lock, we need to lock all the folios for
841
+ * atomicity of writes vs. other writes:
842
+ */
843
+ if (!inode_locked && folio_end_pos (darray_last (fs )) < end ) {
844
+ ret = - BCH_ERR_need_inode_lock ;
845
+ goto out ;
846
+ }
847
+
838
848
f = darray_first (fs );
839
849
if (pos != folio_pos (f ) && !folio_test_uptodate (f )) {
840
850
ret = bch2_read_single_folio (f , mapping );
@@ -929,8 +939,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
929
939
end = pos + copied ;
930
940
931
941
spin_lock (& inode -> v .i_lock );
932
- if (end > inode -> v .i_size )
942
+ if (end > inode -> v .i_size ) {
943
+ BUG_ON (!inode_locked );
933
944
i_size_write (& inode -> v , end );
945
+ }
934
946
spin_unlock (& inode -> v .i_lock );
935
947
936
948
f_pos = pos ;
@@ -974,12 +986,68 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
974
986
struct file * file = iocb -> ki_filp ;
975
987
struct address_space * mapping = file -> f_mapping ;
976
988
struct bch_inode_info * inode = file_bch_inode (file );
977
- loff_t pos = iocb -> ki_pos ;
978
- ssize_t written = 0 ;
979
- int ret = 0 ;
989
+ loff_t pos ;
990
+ bool inode_locked = false;
991
+ ssize_t written = 0 , written2 = 0 , ret = 0 ;
992
+
993
+ /*
994
+ * We don't take the inode lock unless i_size will be changing. Folio
995
+ * locks provide exclusion with other writes, and the pagecache add lock
996
+ * provides exclusion with truncate and hole punching.
997
+ *
998
+ * There is one nasty corner case where atomicity would be broken
999
+ * without great care: when copying data from userspace to the page
1000
+ * cache, we do that with faults disabled - a page fault would recurse
1001
+ * back into the filesystem, taking filesystem locks again, and
1002
+ * deadlock; so it's done with faults disabled, and we fault in the user
1003
+ * buffer when we aren't holding locks.
1004
+ *
1005
+ * If we do part of the write, but we then race and pages in the userspace
1006
+ * buffer have been evicted and are no longer resident, then we have to
1007
+ * drop our folio locks to re-fault them in, breaking write atomicity.
1008
+ *
1009
+ * To fix this, we restart the write from the start, if we weren't
1010
+ * holding the inode lock.
1011
+ *
1012
+ * There is another wrinkle after that; if we restart the write from the
1013
+ * start, and then get an unrecoverable error, we _cannot_ claim to
1014
+ * userspace that we did not write data we actually did - so we must
1015
+ * track (written2) the most we ever wrote.
1016
+ */
1017
+
1018
+ if ((iocb -> ki_flags & IOCB_APPEND ) ||
1019
+ (iocb -> ki_pos + iov_iter_count (iter ) > i_size_read (& inode -> v ))) {
1020
+ inode_lock (& inode -> v );
1021
+ inode_locked = true;
1022
+ }
1023
+
1024
+ ret = generic_write_checks (iocb , iter );
1025
+ if (ret <= 0 )
1026
+ goto unlock ;
1027
+
1028
+ ret = file_remove_privs_flags (file , !inode_locked ? IOCB_NOWAIT : 0 );
1029
+ if (ret ) {
1030
+ if (!inode_locked ) {
1031
+ inode_lock (& inode -> v );
1032
+ inode_locked = true;
1033
+ ret = file_remove_privs_flags (file , 0 );
1034
+ }
1035
+ if (ret )
1036
+ goto unlock ;
1037
+ }
1038
+
1039
+ ret = file_update_time (file );
1040
+ if (ret )
1041
+ goto unlock ;
1042
+
1043
+ pos = iocb -> ki_pos ;
980
1044
981
1045
bch2_pagecache_add_get (inode );
982
1046
1047
+ if (!inode_locked &&
1048
+ (iocb -> ki_pos + iov_iter_count (iter ) > i_size_read (& inode -> v )))
1049
+ goto get_inode_lock ;
1050
+
983
1051
do {
984
1052
unsigned offset = pos & (PAGE_SIZE - 1 );
985
1053
unsigned bytes = iov_iter_count (iter );
@@ -1004,12 +1072,17 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
1004
1072
}
1005
1073
}
1006
1074
1075
+ if (unlikely (bytes != iov_iter_count (iter ) && !inode_locked ))
1076
+ goto get_inode_lock ;
1077
+
1007
1078
if (unlikely (fatal_signal_pending (current ))) {
1008
1079
ret = - EINTR ;
1009
1080
break ;
1010
1081
}
1011
1082
1012
- ret = __bch2_buffered_write (inode , mapping , iter , pos , bytes );
1083
+ ret = __bch2_buffered_write (inode , mapping , iter , pos , bytes , inode_locked );
1084
+ if (ret == - BCH_ERR_need_inode_lock )
1085
+ goto get_inode_lock ;
1013
1086
if (unlikely (ret < 0 ))
1014
1087
break ;
1015
1088
@@ -1030,50 +1103,46 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
1030
1103
}
1031
1104
pos += ret ;
1032
1105
written += ret ;
1106
+ written2 = max (written , written2 );
1107
+
1108
+ if (ret != bytes && !inode_locked )
1109
+ goto get_inode_lock ;
1033
1110
ret = 0 ;
1034
1111
1035
1112
balance_dirty_pages_ratelimited (mapping );
1036
- } while (iov_iter_count (iter ));
1037
1113
1114
+ if (0 ) {
1115
+ get_inode_lock :
1116
+ bch2_pagecache_add_put (inode );
1117
+ inode_lock (& inode -> v );
1118
+ inode_locked = true;
1119
+ bch2_pagecache_add_get (inode );
1120
+
1121
+ iov_iter_revert (iter , written );
1122
+ pos -= written ;
1123
+ written = 0 ;
1124
+ ret = 0 ;
1125
+ }
1126
+ } while (iov_iter_count (iter ));
1038
1127
bch2_pagecache_add_put (inode );
1128
+ unlock :
1129
+ if (inode_locked )
1130
+ inode_unlock (& inode -> v );
1131
+
1132
+ iocb -> ki_pos += written ;
1039
1133
1040
- return written ? written : ret ;
1134
+ ret = max (written , written2 ) ?: ret ;
1135
+ if (ret > 0 )
1136
+ ret = generic_write_sync (iocb , ret );
1137
+ return ret ;
1041
1138
}
1042
1139
1043
- ssize_t bch2_write_iter (struct kiocb * iocb , struct iov_iter * from )
1140
+ ssize_t bch2_write_iter (struct kiocb * iocb , struct iov_iter * iter )
1044
1141
{
1045
- struct file * file = iocb -> ki_filp ;
1046
- struct bch_inode_info * inode = file_bch_inode (file );
1047
- ssize_t ret ;
1048
-
1049
- if (iocb -> ki_flags & IOCB_DIRECT ) {
1050
- ret = bch2_direct_write (iocb , from );
1051
- goto out ;
1052
- }
1053
-
1054
- inode_lock (& inode -> v );
1055
-
1056
- ret = generic_write_checks (iocb , from );
1057
- if (ret <= 0 )
1058
- goto unlock ;
1059
-
1060
- ret = file_remove_privs (file );
1061
- if (ret )
1062
- goto unlock ;
1063
-
1064
- ret = file_update_time (file );
1065
- if (ret )
1066
- goto unlock ;
1067
-
1068
- ret = bch2_buffered_write (iocb , from );
1069
- if (likely (ret > 0 ))
1070
- iocb -> ki_pos += ret ;
1071
- unlock :
1072
- inode_unlock (& inode -> v );
1142
+ ssize_t ret = iocb -> ki_flags & IOCB_DIRECT
1143
+ ? bch2_direct_write (iocb , iter )
1144
+ : bch2_buffered_write (iocb , iter );
1073
1145
1074
- if (ret > 0 )
1075
- ret = generic_write_sync (iocb , ret );
1076
- out :
1077
1146
return bch2_err_class (ret );
1078
1147
}
1079
1148
0 commit comments