@@ -1145,10 +1145,36 @@ static void iomap_write_delalloc_scan(struct inode *inode,
11451145}
11461146
11471147/*
1148+ * When a short write occurs, the filesystem might need to use ->iomap_end
1149+ * to remove space reservations created in ->iomap_begin.
1150+ *
1151+ * For filesystems that use delayed allocation, there can be dirty pages over
1152+ * the delalloc extent outside the range of a short write but still within the
1153+ * delalloc extent allocated for this iomap if the write raced with page
1154+ * faults.
1155+ *
11481156 * Punch out all the delalloc blocks in the range given except for those that
11491157 * have dirty data still pending in the page cache - those are going to be
11501158 * written and so must still retain the delalloc backing for writeback.
11511159 *
1160+ * The punch() callback *must* only punch delalloc extents in the range passed
1161+ * to it. It must skip over all other types of extents in the range and leave
1162+ * them completely unchanged. It must do this punch atomically with respect to
1163+ * other extent modifications.
1164+ *
1165+ * The punch() callback may be called with a folio locked to prevent writeback
1166+ * extent allocation racing at the edge of the range we are currently punching.
1167+ * The locked folio may or may not cover the range being punched, so it is not
1168+ * safe for the punch() callback to lock folios itself.
1169+ *
1170+ * Lock order is:
1171+ *
1172+ * inode->i_rwsem (shared or exclusive)
1173+ * inode->i_mapping->invalidate_lock (exclusive)
1174+ * folio_lock()
1175+ * ->punch
1176+ * internal filesystem allocation lock
1177+ *
11521178 * As we are scanning the page cache for data, we don't need to reimplement the
11531179 * wheel - mapping_seek_hole_data() does exactly what we need to identify the
11541180 * start and end of data ranges correctly even for sub-folio block sizes. This
@@ -1177,20 +1203,21 @@ static void iomap_write_delalloc_scan(struct inode *inode,
11771203 * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose
11781204 * the code to subtle off-by-one bugs....
11791205 */
1180- static void iomap_write_delalloc_release (struct inode * inode , loff_t start_byte ,
1206+ void iomap_write_delalloc_release (struct inode * inode , loff_t start_byte ,
11811207 loff_t end_byte , unsigned flags , struct iomap * iomap ,
11821208 iomap_punch_t punch )
11831209{
11841210 loff_t punch_start_byte = start_byte ;
11851211 loff_t scan_end_byte = min (i_size_read (inode ), end_byte );
11861212
11871213 /*
1188- * Lock the mapping to avoid races with page faults re-instantiating
1189- * folios and dirtying them via ->page_mkwrite whilst we walk the
1190- * cache and perform delalloc extent removal. Failing to do this can
1191- * leave dirty pages with no space reservation in the cache.
1214+ * The caller must hold invalidate_lock to avoid races with page faults
1215+ * re-instantiating folios and dirtying them via ->page_mkwrite whilst
1216+ * we walk the cache and perform delalloc extent removal. Failing to do
1217+ * this can leave dirty pages with no space reservation in the cache.
11921218 */
1193- filemap_invalidate_lock (inode -> i_mapping );
1219+ lockdep_assert_held_write (& inode -> i_mapping -> invalidate_lock );
1220+
11941221 while (start_byte < scan_end_byte ) {
11951222 loff_t data_end ;
11961223
@@ -1207,7 +1234,7 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
12071234 if (start_byte == - ENXIO || start_byte == scan_end_byte )
12081235 break ;
12091236 if (WARN_ON_ONCE (start_byte < 0 ))
1210- goto out_unlock ;
1237+ return ;
12111238 WARN_ON_ONCE (start_byte < punch_start_byte );
12121239 WARN_ON_ONCE (start_byte > scan_end_byte );
12131240
@@ -1218,7 +1245,7 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
12181245 data_end = mapping_seek_hole_data (inode -> i_mapping , start_byte ,
12191246 scan_end_byte , SEEK_HOLE );
12201247 if (WARN_ON_ONCE (data_end < 0 ))
1221- goto out_unlock ;
1248+ return ;
12221249
12231250 /*
12241251 * If we race with post-direct I/O invalidation of the page cache,
@@ -1240,74 +1267,8 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
12401267 if (punch_start_byte < end_byte )
12411268 punch (inode , punch_start_byte , end_byte - punch_start_byte ,
12421269 iomap );
1243- out_unlock :
1244- filemap_invalidate_unlock (inode -> i_mapping );
1245- }
1246-
1247- /*
1248- * When a short write occurs, the filesystem may need to remove reserved space
1249- * that was allocated in ->iomap_begin from its ->iomap_end method. For
1250- * filesystems that use delayed allocation, we need to punch out delalloc
1251- * extents from the range that are not dirty in the page cache. As the write can
1252- * race with page faults, there can be dirty pages over the delalloc extent
1253- * outside the range of a short write but still within the delalloc extent
1254- * allocated for this iomap.
1255- *
1256- * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
1257- * simplify range iterations.
1258- *
1259- * The punch() callback *must* only punch delalloc extents in the range passed
1260- * to it. It must skip over all other types of extents in the range and leave
1261- * them completely unchanged. It must do this punch atomically with respect to
1262- * other extent modifications.
1263- *
1264- * The punch() callback may be called with a folio locked to prevent writeback
1265- * extent allocation racing at the edge of the range we are currently punching.
1266- * The locked folio may or may not cover the range being punched, so it is not
1267- * safe for the punch() callback to lock folios itself.
1268- *
1269- * Lock order is:
1270- *
1271- * inode->i_rwsem (shared or exclusive)
1272- * inode->i_mapping->invalidate_lock (exclusive)
1273- * folio_lock()
1274- * ->punch
1275- * internal filesystem allocation lock
1276- */
1277- void iomap_file_buffered_write_punch_delalloc (struct inode * inode ,
1278- loff_t pos , loff_t length , ssize_t written , unsigned flags ,
1279- struct iomap * iomap , iomap_punch_t punch )
1280- {
1281- loff_t start_byte ;
1282- loff_t end_byte ;
1283- unsigned int blocksize = i_blocksize (inode );
1284-
1285- if (iomap -> type != IOMAP_DELALLOC )
1286- return ;
1287-
1288- /* If we didn't reserve the blocks, we're not allowed to punch them. */
1289- if (!(iomap -> flags & IOMAP_F_NEW ))
1290- return ;
1291-
1292- /*
1293- * start_byte refers to the first unused block after a short write. If
1294- * nothing was written, round offset down to point at the first block in
1295- * the range.
1296- */
1297- if (unlikely (!written ))
1298- start_byte = round_down (pos , blocksize );
1299- else
1300- start_byte = round_up (pos + written , blocksize );
1301- end_byte = round_up (pos + length , blocksize );
1302-
1303- /* Nothing to do if we've written the entire delalloc extent */
1304- if (start_byte >= end_byte )
1305- return ;
1306-
1307- iomap_write_delalloc_release (inode , start_byte , end_byte , flags , iomap ,
1308- punch );
13091270}
1310- EXPORT_SYMBOL_GPL (iomap_file_buffered_write_punch_delalloc );
1271+ EXPORT_SYMBOL_GPL (iomap_write_delalloc_release );
13111272
13121273static loff_t iomap_unshare_iter (struct iomap_iter * iter )
13131274{
0 commit comments