@@ -1394,16 +1394,53 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
 }
 EXPORT_SYMBOL_GPL(iomap_file_unshare);
 
-static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
+/*
+ * Flush the remaining range of the iter and mark the current mapping stale.
+ * This is used when zero range sees an unwritten mapping that may have had
+ * dirty pagecache over it.
+ */
+static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i)
+{
+	struct address_space *mapping = i->inode->i_mapping;
+	loff_t end = i->pos + i->len - 1;
+
+	i->iomap.flags |= IOMAP_F_STALE;
+	return filemap_write_and_wait_range(mapping, i->pos, end);
+}
+
+static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
+		bool *range_dirty)
 {
 	const struct iomap *srcmap = iomap_iter_srcmap(iter);
 	loff_t pos = iter->pos;
 	loff_t length = iomap_length(iter);
 	loff_t written = 0;
 
-	/* already zeroed? we're done. */
-	if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
+	/*
+	 * We must zero subranges of unwritten mappings that might be dirty in
+	 * pagecache from previous writes. We only know whether the entire range
+	 * was clean or not, however, and dirty folios may have been written
+	 * back or reclaimed at any point after mapping lookup.
+	 *
+	 * The easiest way to deal with this is to flush pagecache to trigger
+	 * any pending unwritten conversions and then grab the updated extents
+	 * from the fs. The flush may change the current mapping, so mark it
+	 * stale for the iterator to remap it for the next pass to handle
+	 * properly.
+	 *
+	 * Note that holes are treated the same as unwritten because zero range
+	 * is (ab)used for partial folio zeroing in some cases. Hole backed
+	 * post-eof ranges can be dirtied via mapped write and the flush
+	 * triggers writeback time post-eof zeroing.
+	 */
+	if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) {
+		if (*range_dirty) {
+			*range_dirty = false;
+			return iomap_zero_iter_flush_and_stale(iter);
+		}
+		/* range is clean and already zeroed, nothing to do */
 		return length;
+	}
 
 	do {
 		struct folio *folio;
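
For context on this first hunk: the srcmap type tested above is whatever the filesystem's ->iomap_begin hook reported for the range, and setting IOMAP_F_STALE tells iomap_iter() not to advance but to call that hook again for the remaining range, so the post-flush extent state is seen on the next pass. Below is a minimal, hypothetical hook for illustration only; the function name, the ops table, and its hole-only behavior are assumptions, not part of this patch.

#include <linux/fs.h>
#include <linux/iomap.h>

/*
 * Hypothetical ->iomap_begin sketch (not from this patch): report the
 * queried range as a single unallocated hole. With the change above,
 * zero range no longer trusts this "already zero" answer blindly; if the
 * range was dirty it flushes and retries via IOMAP_F_STALE, which lands
 * back here with up-to-date state.
 */
static int example_zero_iomap_begin(struct inode *inode, loff_t pos,
		loff_t length, unsigned flags, struct iomap *iomap,
		struct iomap *srcmap)
{
	iomap->type = IOMAP_HOLE;	/* nothing allocated in this range */
	iomap->addr = IOMAP_NULL_ADDR;
	iomap->offset = pos;
	iomap->length = length;
	iomap->bdev = inode->i_sb->s_bdev;
	return 0;
}

static const struct iomap_ops example_zero_iomap_ops = {
	.iomap_begin	= example_zero_iomap_begin,
};
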
@@ -1451,9 +1488,27 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
 		.flags		= IOMAP_ZERO,
 	};
 	int ret;
+	bool range_dirty;
+
+	/*
+	 * Zero range wants to skip pre-zeroed (i.e. unwritten) mappings, but
+	 * pagecache must be flushed to ensure stale data from previous
+	 * buffered writes is not exposed. A flush is only required for certain
+	 * types of mappings, but checking pagecache after mapping lookup is
+	 * racy with writeback and reclaim.
+	 *
+	 * Therefore, check the entire range first and pass along whether any
+	 * part of it is dirty. If so and an underlying mapping warrants it,
+	 * flush the cache at that point. This trades off the occasional false
+	 * positive (and spurious flush, if the dirty data and mapping don't
+	 * happen to overlap) for simplicity in handling a relatively uncommon
+	 * situation.
+	 */
+	range_dirty = filemap_range_needs_writeback(inode->i_mapping,
+			pos, pos + len - 1);
 
 	while ((ret = iomap_iter(&iter, ops)) > 0)
-		iter.processed = iomap_zero_iter(&iter, did_zero, &range_dirty);
+		iter.processed = iomap_zero_iter(&iter, did_zero, &range_dirty);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(iomap_zero_range);
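
To show how the exported helper is typically driven (nothing in this commit changes its callers), here is a hedged caller-side sketch: a filesystem zeroing the partial block past a new EOF, roughly what iomap_truncate_page() does for callers. The function name and the truncate scenario are illustrative assumptions, not code from this commit; the dirty-range probe and conditional flush added above happen entirely inside iomap_zero_range().

#include <linux/fs.h>
#include <linux/iomap.h>

/*
 * Illustrative caller (not part of this commit): zero the tail of the
 * block containing the new EOF, e.g. during a truncate that shrinks the
 * file into the middle of a block.
 */
static int example_zero_eof_block(struct inode *inode, loff_t newsize,
		const struct iomap_ops *ops)
{
	unsigned int blocksize = i_blocksize(inode);
	loff_t tail = newsize & (blocksize - 1);
	bool did_zero = false;

	if (!tail)
		return 0;	/* new EOF is block aligned, nothing to zero */

	/* zero from the new EOF to the end of that block */
	return iomap_zero_range(inode, newsize, blocksize - tail,
			&did_zero, ops);
}
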