Skip to content

Commit c84042b

Browse files
committed
Merge patch series "further iomap large atomic writes changes"
John Garry <[email protected]> says: These iomap changes are spun-off the XFS large atomic writes series at https://lore.kernel.org/linux-xfs/[email protected]/T/#ma99c763221de9d49ea2ccfca9ff9b8d71c8b2677 The XFS parts there are not ready yet, but it is worth having the iomap changes queued in advance. Some much earlier changes from that same series were already queued in the vfs tree, and these patches rework those changes - specifically the first patch in this series does. The most other significant change is the patch to rework how the bio flags are set in the DIO patch. * patches from https://lore.kernel.org/r/[email protected]: iomap: rework IOMAP atomic flags iomap: comment on atomic write checks in iomap_dio_bio_iter() iomap: inline iomap_dio_bio_opflags() Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Christian Brauner <[email protected]>
2 parents b26816b + 370a6de commit c84042b

File tree

6 files changed

+91
-93
lines changed

6 files changed

+91
-93
lines changed

Documentation/filesystems/iomap/operations.rst

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -514,29 +514,32 @@ IOMAP_WRITE`` with any combination of the following enhancements:
514514
if the mapping is unwritten and the filesystem cannot handle zeroing
515515
the unaligned regions without exposing stale contents.
516516

517-
* ``IOMAP_ATOMIC_HW``: This write is being issued with torn-write
518-
protection based on HW-offload support.
519-
Only a single bio can be created for the write, and the write must
520-
not be split into multiple I/O requests, i.e. flag REQ_ATOMIC must be
521-
set.
517+
* ``IOMAP_ATOMIC``: This write is being issued with torn-write
518+
protection.
519+
Torn-write protection may be provided based on HW-offload or by a
520+
software mechanism provided by the filesystem.
521+
522+
For HW-offload based support, only a single bio can be created for the
523+
write, and the write must not be split into multiple I/O requests, i.e.
524+
flag REQ_ATOMIC must be set.
522525
The file range to write must be aligned to satisfy the requirements
523526
of both the filesystem and the underlying block device's atomic
524527
commit capabilities.
525528
If filesystem metadata updates are required (e.g. unwritten extent
526-
conversion or copy on write), all updates for the entire file range
529+
conversion or copy-on-write), all updates for the entire file range
527530
must be committed atomically as well.
528-
Only one space mapping is allowed per untorn write.
529-
Untorn writes may be longer than a single file block. In all cases,
531+
Untorn-writes may be longer than a single file block. In all cases,
530532
the mapping start disk block must have at least the same alignment as
531533
the write offset.
532-
533-
* ``IOMAP_ATOMIC_SW``: This write is being issued with torn-write
534-
protection via a software mechanism provided by the filesystem.
535-
All the disk block alignment and single bio restrictions which apply
536-
to IOMAP_ATOMIC_HW do not apply here.
537-
SW-based untorn writes would typically be used as a fallback when
538-
HW-based untorn writes may not be issued, e.g. the range of the write
539-
covers multiple extents, meaning that it is not possible to issue
534+
The filesystems must set IOMAP_F_ATOMIC_BIO to inform iomap core of an
535+
untorn-write based on HW-offload.
536+
537+
For untorn-writes based on a software mechanism provided by the
538+
filesystem, all the disk block alignment and single bio restrictions
539+
which apply for HW-offload based untorn-writes do not apply.
540+
The mechanism would typically be used as a fallback for when
541+
HW-offload based untorn-writes may not be issued, e.g. the range of the
542+
write covers multiple extents, meaning that it is not possible to issue
540543
a single bio.
541544
All filesystem metadata updates for the entire file range must be
542545
committed atomically as well.

fs/ext4/inode.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3290,6 +3290,10 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
32903290
if (map->m_flags & EXT4_MAP_NEW)
32913291
iomap->flags |= IOMAP_F_NEW;
32923292

3293+
/* HW-offload atomics are always used */
3294+
if (flags & IOMAP_ATOMIC)
3295+
iomap->flags |= IOMAP_F_ATOMIC_BIO;
3296+
32933297
if (flags & IOMAP_DAX)
32943298
iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
32953299
else
@@ -3467,7 +3471,7 @@ static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written)
34673471
return false;
34683472

34693473
/* atomic writes are all-or-nothing */
3470-
if (flags & IOMAP_ATOMIC_HW)
3474+
if (flags & IOMAP_ATOMIC)
34713475
return false;
34723476

34733477
/* can only try again if we wrote nothing */

fs/iomap/direct-io.c

Lines changed: 57 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -312,80 +312,85 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
312312
}
313313

314314
/*
315-
* Figure out the bio's operation flags from the dio request, the
316-
* mapping, and whether or not we want FUA. Note that we can end up
317-
* clearing the WRITE_THROUGH flag in the dio request.
315+
* Use a FUA write if we need datasync semantics and this is a pure data I/O
316+
* that doesn't require any metadata updates (including after I/O completion
317+
* such as unwritten extent conversion) and the underlying device either
318+
* doesn't have a volatile write cache or supports FUA.
319+
* This allows us to avoid cache flushes on I/O completion.
318320
*/
319-
static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
320-
const struct iomap *iomap, bool use_fua, bool atomic_hw)
321+
static inline bool iomap_dio_can_use_fua(const struct iomap *iomap,
322+
struct iomap_dio *dio)
321323
{
322-
blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
323-
324-
if (!(dio->flags & IOMAP_DIO_WRITE))
325-
return REQ_OP_READ;
326-
327-
opflags |= REQ_OP_WRITE;
328-
if (use_fua)
329-
opflags |= REQ_FUA;
330-
else
331-
dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
332-
if (atomic_hw)
333-
opflags |= REQ_ATOMIC;
334-
335-
return opflags;
324+
if (iomap->flags & (IOMAP_F_SHARED | IOMAP_F_DIRTY))
325+
return false;
326+
if (!(dio->flags & IOMAP_DIO_WRITE_THROUGH))
327+
return false;
328+
return !bdev_write_cache(iomap->bdev) || bdev_fua(iomap->bdev);
336329
}
337330

338331
static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
339332
{
340333
const struct iomap *iomap = &iter->iomap;
341334
struct inode *inode = iter->inode;
342335
unsigned int fs_block_size = i_blocksize(inode), pad;
343-
bool atomic_hw = iter->flags & IOMAP_ATOMIC_HW;
344336
const loff_t length = iomap_length(iter);
345337
loff_t pos = iter->pos;
346-
blk_opf_t bio_opf;
338+
blk_opf_t bio_opf = REQ_SYNC | REQ_IDLE;
347339
struct bio *bio;
348340
bool need_zeroout = false;
349-
bool use_fua = false;
350341
int nr_pages, ret = 0;
351342
u64 copied = 0;
352343
size_t orig_count;
353344

354-
if (atomic_hw && length != iter->len)
355-
return -EINVAL;
356-
357345
if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
358346
!bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
359347
return -EINVAL;
360348

361-
if (iomap->type == IOMAP_UNWRITTEN) {
362-
dio->flags |= IOMAP_DIO_UNWRITTEN;
363-
need_zeroout = true;
364-
}
349+
if (dio->flags & IOMAP_DIO_WRITE) {
350+
bio_opf |= REQ_OP_WRITE;
351+
352+
if (iomap->flags & IOMAP_F_ATOMIC_BIO) {
353+
/*
354+
* Ensure that the mapping covers the full write
355+
* length, otherwise it won't be submitted as a single
356+
* bio, which is required to use hardware atomics.
357+
*/
358+
if (length != iter->len)
359+
return -EINVAL;
360+
bio_opf |= REQ_ATOMIC;
361+
}
365362

366-
if (iomap->flags & IOMAP_F_SHARED)
367-
dio->flags |= IOMAP_DIO_COW;
363+
if (iomap->type == IOMAP_UNWRITTEN) {
364+
dio->flags |= IOMAP_DIO_UNWRITTEN;
365+
need_zeroout = true;
366+
}
367+
368+
if (iomap->flags & IOMAP_F_SHARED)
369+
dio->flags |= IOMAP_DIO_COW;
370+
371+
if (iomap->flags & IOMAP_F_NEW) {
372+
need_zeroout = true;
373+
} else if (iomap->type == IOMAP_MAPPED) {
374+
if (iomap_dio_can_use_fua(iomap, dio))
375+
bio_opf |= REQ_FUA;
376+
else
377+
dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
378+
}
368379

369-
if (iomap->flags & IOMAP_F_NEW) {
370-
need_zeroout = true;
371-
} else if (iomap->type == IOMAP_MAPPED) {
372380
/*
373-
* Use a FUA write if we need datasync semantics, this is a pure
374-
* data IO that doesn't require any metadata updates (including
375-
* after IO completion such as unwritten extent conversion) and
376-
* the underlying device either supports FUA or doesn't have
377-
* a volatile write cache. This allows us to avoid cache flushes
378-
* on IO completion. If we can't use writethrough and need to
379-
* sync, disable in-task completions as dio completion will
380-
* need to call generic_write_sync() which will do a blocking
381-
* fsync / cache flush call.
381+
* We can only do deferred completion for pure overwrites that
382+
* don't require additional I/O at completion time.
383+
*
384+
* This rules out writes that need zeroing or extent conversion,
385+
* extend the file size, or issue metadata I/O or cache flushes
386+
* during completion processing.
382387
*/
383-
if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
384-
(dio->flags & IOMAP_DIO_WRITE_THROUGH) &&
385-
(bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev)))
386-
use_fua = true;
387-
else if (dio->flags & IOMAP_DIO_NEED_SYNC)
388+
if (need_zeroout || (pos >= i_size_read(inode)) ||
389+
((dio->flags & IOMAP_DIO_NEED_SYNC) &&
390+
!(bio_opf & REQ_FUA)))
388391
dio->flags &= ~IOMAP_DIO_CALLER_COMP;
392+
} else {
393+
bio_opf |= REQ_OP_READ;
389394
}
390395

391396
/*
@@ -399,18 +404,6 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
399404
if (!iov_iter_count(dio->submit.iter))
400405
goto out;
401406

402-
/*
403-
* We can only do deferred completion for pure overwrites that
404-
* don't require additional IO at completion. This rules out
405-
* writes that need zeroing or extent conversion, extend
406-
* the file size, or issue journal IO or cache flushes
407-
* during completion processing.
408-
*/
409-
if (need_zeroout ||
410-
((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) ||
411-
((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
412-
dio->flags &= ~IOMAP_DIO_CALLER_COMP;
413-
414407
/*
415408
* The rules for polled IO completions follow the guidelines as the
416409
* ones we set for inline and deferred completions. If none of those
@@ -428,8 +421,6 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
428421
goto out;
429422
}
430423

431-
bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic_hw);
432-
433424
nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
434425
do {
435426
size_t n;
@@ -461,9 +452,9 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
461452
}
462453

463454
n = bio->bi_iter.bi_size;
464-
if (WARN_ON_ONCE(atomic_hw && n != length)) {
455+
if (WARN_ON_ONCE((bio_opf & REQ_ATOMIC) && n != length)) {
465456
/*
466-
* This bio should have covered the complete length,
457+
* An atomic write bio must cover the complete length,
467458
* which it doesn't, so error. We may need to zero out
468459
* the tail (complete FS block), similar to when
469460
* bio_iov_iter_get_pages() returns an error, above.
@@ -686,10 +677,8 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
686677
iomi.flags |= IOMAP_OVERWRITE_ONLY;
687678
}
688679

689-
if (dio_flags & IOMAP_DIO_ATOMIC_SW)
690-
iomi.flags |= IOMAP_ATOMIC_SW;
691-
else if (iocb->ki_flags & IOCB_ATOMIC)
692-
iomi.flags |= IOMAP_ATOMIC_HW;
680+
if (iocb->ki_flags & IOCB_ATOMIC)
681+
iomi.flags |= IOMAP_ATOMIC;
693682

694683
/* for data sync or sync, we need sync completion processing */
695684
if (iocb_is_dsync(iocb)) {

fs/iomap/trace.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
9999
{ IOMAP_FAULT, "FAULT" }, \
100100
{ IOMAP_DIRECT, "DIRECT" }, \
101101
{ IOMAP_NOWAIT, "NOWAIT" }, \
102-
{ IOMAP_ATOMIC_HW, "ATOMIC_HW" }
102+
{ IOMAP_ATOMIC, "ATOMIC" }
103103

104104
#define IOMAP_F_FLAGS_STRINGS \
105105
{ IOMAP_F_NEW, "NEW" }, \

fs/xfs/xfs_iomap.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -828,6 +828,10 @@ xfs_direct_write_iomap_begin(
828828
if (offset + length > i_size_read(inode))
829829
iomap_flags |= IOMAP_F_DIRTY;
830830

831+
/* HW-offload atomics are always used in this path */
832+
if (flags & IOMAP_ATOMIC)
833+
iomap_flags |= IOMAP_F_ATOMIC_BIO;
834+
831835
/*
832836
* COW writes may allocate delalloc space or convert unwritten COW
833837
* extents, so we need to make sure to take the lock exclusively here.

include/linux/iomap.h

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,9 @@ struct vm_fault;
6060
* IOMAP_F_ANON_WRITE indicates that (write) I/O does not have a target block
6161
* assigned to it yet and the file system will do that in the bio submission
6262
* handler, splitting the I/O as needed.
63+
*
64+
* IOMAP_F_ATOMIC_BIO indicates that (write) I/O will be issued as an atomic
65+
* bio, i.e. set REQ_ATOMIC.
6366
*/
6467
#define IOMAP_F_NEW (1U << 0)
6568
#define IOMAP_F_DIRTY (1U << 1)
@@ -73,6 +76,7 @@ struct vm_fault;
7376
#define IOMAP_F_XATTR (1U << 5)
7477
#define IOMAP_F_BOUNDARY (1U << 6)
7578
#define IOMAP_F_ANON_WRITE (1U << 7)
79+
#define IOMAP_F_ATOMIC_BIO (1U << 8)
7680

7781
/*
7882
* Flags set by the core iomap code during operations:
@@ -189,9 +193,8 @@ struct iomap_folio_ops {
189193
#else
190194
#define IOMAP_DAX 0
191195
#endif /* CONFIG_FS_DAX */
192-
#define IOMAP_ATOMIC_HW (1 << 9) /* HW-based torn-write protection */
196+
#define IOMAP_ATOMIC (1 << 9) /* torn-write protection */
193197
#define IOMAP_DONTCACHE (1 << 10)
194-
#define IOMAP_ATOMIC_SW (1 << 11)/* SW-based torn-write protection */
195198

196199
struct iomap_ops {
197200
/*
@@ -503,11 +506,6 @@ struct iomap_dio_ops {
503506
*/
504507
#define IOMAP_DIO_PARTIAL (1 << 2)
505508

506-
/*
507-
* Use software-based torn-write protection.
508-
*/
509-
#define IOMAP_DIO_ATOMIC_SW (1 << 3)
510-
511509
ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
512510
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
513511
unsigned int dio_flags, void *private, size_t done_before);

0 commit comments

Comments
 (0)