Skip to content

Commit 267bf1d

Browse files
committed
Merge tag 'fs-atomic_2024-11-05' of ssh://gitolite.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux into vfs.untorn.writes
Snapshot of untorn@fs-atomic#ritesh.list_ext4-do-not-fallback-to-buffered-io-for-dio-atomic-write at Tue Nov 5 16:20:51 PST 2024 Link: https://lore.kernel.org/r/20241106-zerkleinern-verzweifeln-7ec8173c56ad@brauner Signed-off-by: Christian Brauner <brauner@kernel.org>
2 parents 9852d85 + 299537e commit 267bf1d

File tree

17 files changed

+254
-27
lines changed

17 files changed

+254
-27
lines changed

Documentation/filesystems/iomap/operations.rst

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -513,6 +513,21 @@ IOMAP_WRITE`` with any combination of the following enhancements:
513513
if the mapping is unwritten and the filesystem cannot handle zeroing
514514
the unaligned regions without exposing stale contents.
515515

516+
* ``IOMAP_ATOMIC``: This write is being issued with torn-write
517+
protection.
518+
Only a single bio can be created for the write, and the write must
519+
not be split into multiple I/O requests, i.e. flag REQ_ATOMIC must be
520+
set.
521+
The file range to write must be aligned to satisfy the requirements
522+
of both the filesystem and the underlying block device's atomic
523+
commit capabilities.
524+
If filesystem metadata updates are required (e.g. unwritten extent
525+
conversion or copy on write), all updates for the entire file range
526+
must be committed atomically as well.
527+
Only one space mapping is allowed per untorn write.
528+
Untorn writes must be aligned to, and must not be longer than, a
529+
single file block.
530+
516531
Callers commonly hold ``i_rwsem`` in shared or exclusive mode before
517532
calling this function.
518533

block/fops.c

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,10 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
3535
return opf;
3636
}
3737

38-
static bool blkdev_dio_invalid(struct block_device *bdev, loff_t pos,
39-
struct iov_iter *iter, bool is_atomic)
38+
static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb,
39+
struct iov_iter *iter)
4040
{
41-
if (is_atomic && !generic_atomic_write_valid(iter, pos))
42-
return true;
43-
44-
return pos & (bdev_logical_block_size(bdev) - 1) ||
41+
return iocb->ki_pos & (bdev_logical_block_size(bdev) - 1) ||
4542
!bdev_iter_is_aligned(bdev, iter);
4643
}
4744

@@ -368,13 +365,12 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
368365
static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
369366
{
370367
struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
371-
bool is_atomic = iocb->ki_flags & IOCB_ATOMIC;
372368
unsigned int nr_pages;
373369

374370
if (!iov_iter_count(iter))
375371
return 0;
376372

377-
if (blkdev_dio_invalid(bdev, iocb->ki_pos, iter, is_atomic))
373+
if (blkdev_dio_invalid(bdev, iocb, iter))
378374
return -EINVAL;
379375

380376
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
@@ -383,7 +379,7 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
383379
return __blkdev_direct_IO_simple(iocb, iter, bdev,
384380
nr_pages);
385381
return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages);
386-
} else if (is_atomic) {
382+
} else if (iocb->ki_flags & IOCB_ATOMIC) {
387383
return -EINVAL;
388384
}
389385
return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages));
@@ -625,7 +621,7 @@ static int blkdev_open(struct inode *inode, struct file *filp)
625621
if (!bdev)
626622
return -ENXIO;
627623

628-
if (bdev_can_atomic_write(bdev) && filp->f_flags & O_DIRECT)
624+
if (bdev_can_atomic_write(bdev))
629625
filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;
630626

631627
ret = bdev_open(bdev, mode, filp->private_data, NULL, filp);
@@ -700,6 +696,12 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
700696
if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
701697
return -EOPNOTSUPP;
702698

699+
if (iocb->ki_flags & IOCB_ATOMIC) {
700+
ret = generic_atomic_write_valid(iocb, from);
701+
if (ret)
702+
return ret;
703+
}
704+
703705
size -= iocb->ki_pos;
704706
if (iov_iter_count(from) > size) {
705707
shorted = iov_iter_count(from) - size;

fs/ext4/ext4.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1729,6 +1729,10 @@ struct ext4_sb_info {
17291729
*/
17301730
struct work_struct s_sb_upd_work;
17311731

1732+
/* Atomic write unit values in bytes */
1733+
unsigned int s_awu_min;
1734+
unsigned int s_awu_max;
1735+
17321736
/* Ext4 fast commit sub transaction ID */
17331737
atomic_t s_fc_subtid;
17341738

@@ -3855,6 +3859,12 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh)
38553859
return buffer_uptodate(bh);
38563860
}
38573861

3862+
/*
 * ext4_inode_can_atomic_write: test whether atomic writes are possible on
 * this inode.
 * @inode: inode to test
 *
 * Returns true only when @inode is a regular file (S_ISREG) and the
 * superblock's minimum atomic write unit, s_awu_min, is non-zero.
 */
static inline bool ext4_inode_can_atomic_write(struct inode *inode)
3863+
{
3864+
3865+
return S_ISREG(inode->i_mode) && EXT4_SB(inode->i_sb)->s_awu_min > 0;
3866+
}
3867+
38583868
extern int ext4_block_write_begin(handle_t *handle, struct folio *folio,
38593869
loff_t pos, unsigned len,
38603870
get_block_t *get_block);

fs/ext4/file.c

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -599,6 +599,13 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
599599
ssize_t err;
600600
loff_t endbyte;
601601

602+
/*
603+
* There is no support for atomic writes on buffered-io yet,
604+
* we should never fall back to buffered-io for DIO atomic
605+
* writes.
606+
*/
607+
WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC);
608+
602609
offset = iocb->ki_pos;
603610
err = ext4_buffered_write_iter(iocb, from);
604611
if (err < 0)
@@ -692,6 +699,20 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
692699
if (IS_DAX(inode))
693700
return ext4_dax_write_iter(iocb, from);
694701
#endif
702+
703+
if (iocb->ki_flags & IOCB_ATOMIC) {
704+
size_t len = iov_iter_count(from);
705+
int ret;
706+
707+
if (len < EXT4_SB(inode->i_sb)->s_awu_min ||
708+
len > EXT4_SB(inode->i_sb)->s_awu_max)
709+
return -EINVAL;
710+
711+
ret = generic_atomic_write_valid(iocb, from);
712+
if (ret)
713+
return ret;
714+
}
715+
695716
if (iocb->ki_flags & IOCB_DIRECT)
696717
return ext4_dio_write_iter(iocb, from);
697718
else
@@ -884,6 +905,9 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
884905
return ret;
885906
}
886907

908+
if (ext4_inode_can_atomic_write(inode))
909+
filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;
910+
887911
filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
888912
return dquot_file_open(inode, filp);
889913
}

fs/ext4/inode.c

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3444,17 +3444,34 @@ static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset,
34443444
return ret;
34453445
}
34463446

3447+
/*
 * ext4_want_directio_fallback: decide whether a write that made no progress
 * should be retried through buffered I/O.
 * @flags: IOMAP_* flags describing the operation
 * @written: amount written so far, as passed in by the caller
 *
 * Returns true only when both IOMAP_WRITE and IOMAP_DIRECT are set,
 * IOMAP_ATOMIC is clear, and @written is zero; returns false otherwise.
 */
static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written)
3448+
{
3449+
/* only a direct I/O write (both flags set) may fall back to buffered */
3450+
if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) !=
3451+
(IOMAP_WRITE | IOMAP_DIRECT))
3452+
return false;
3453+
3454+
/* atomic writes are all-or-nothing */
3455+
if (flags & IOMAP_ATOMIC)
3456+
return false;
3457+
3458+
/* can only try again if we wrote nothing */
3459+
return written == 0;
3460+
}
3461+
34473462
static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
34483463
ssize_t written, unsigned flags, struct iomap *iomap)
34493464
{
34503465
/*
34513466
* Check to see whether an error occurred while writing out the data to
3452-
* the allocated blocks. If so, return the magic error code so that we
3453-
* fallback to buffered I/O and attempt to complete the remainder of
3454-
* the I/O. Any blocks that may have been allocated in preparation for
3455-
* the direct I/O will be reused during buffered I/O.
3467+
* the allocated blocks. If so, return the magic error code for
3468+
* non-atomic write so that we fall back to buffered I/O and attempt to
3469+
* complete the remainder of the I/O.
3470+
* For non-atomic writes, any blocks that may have been
3471+
* allocated in preparation for the direct I/O will be reused during
3472+
* buffered I/O. For atomic writes, we never fall back to buffered I/O.
34563473
*/
3457-
if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0)
3474+
if (ext4_want_directio_fallback(flags, written))
34583475
return -ENOTBLK;
34593476

34603477
return 0;
@@ -5578,6 +5595,18 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path,
55785595
}
55795596
}
55805597

5598+
if ((request_mask & STATX_WRITE_ATOMIC) && S_ISREG(inode->i_mode)) {
5599+
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5600+
unsigned int awu_min = 0, awu_max = 0;
5601+
5602+
if (ext4_inode_can_atomic_write(inode)) {
5603+
awu_min = sbi->s_awu_min;
5604+
awu_max = sbi->s_awu_max;
5605+
}
5606+
5607+
generic_fill_statx_atomic_writes(stat, awu_min, awu_max);
5608+
}
5609+
55815610
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
55825611
if (flags & EXT4_APPEND_FL)
55835612
stat->attributes |= STATX_ATTR_APPEND;

fs/ext4/super.c

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4425,6 +4425,36 @@ static int ext4_handle_clustersize(struct super_block *sb)
44254425
return 0;
44264426
}
44274427

4428+
/*
4429+
* ext4_atomic_write_init: Initializes filesystem min & max atomic write units.
4430+
* @sb: super block
 *
 * Fills in s_awu_min and s_awu_max (in bytes) of the ext4 superblock info.
 * Both fields are left as they were when the block device cannot do atomic
 * writes or the filesystem lacks the extents feature, and both are set to
 * zero when the computed range is empty or inverted.
 *
4431+
* TODO: Later add support for bigalloc
4432+
*/
4433+
static void ext4_atomic_write_init(struct super_block *sb)
4434+
{
4435+
struct ext4_sb_info *sbi = EXT4_SB(sb);
4436+
struct block_device *bdev = sb->s_bdev;
4437+
4438+
/* nothing to set up if the block device cannot do atomic writes */
if (!bdev_can_atomic_write(bdev))
4439+
return;
4440+
4441+
/* atomic write units are only set up when the extents feature is on */
if (!ext4_has_feature_extents(sb))
4442+
return;
4443+
4444+
/*
 * awu_min is never below, and awu_max never above, one filesystem
 * block, so the range check below can only pass when both units
 * equal sb->s_blocksize.
 */
sbi->s_awu_min = max(sb->s_blocksize,
4445+
bdev_atomic_write_unit_min_bytes(bdev));
4446+
sbi->s_awu_max = min(sb->s_blocksize,
4447+
bdev_atomic_write_unit_max_bytes(bdev));
4448+
if (sbi->s_awu_min && sbi->s_awu_max &&
4449+
sbi->s_awu_min <= sbi->s_awu_max) {
4450+
ext4_msg(sb, KERN_NOTICE, "Supports (experimental) DIO atomic writes awu_min: %u, awu_max: %u",
4451+
sbi->s_awu_min, sbi->s_awu_max);
4452+
} else {
4453+
/* unusable range: zero both units (s_awu_min == 0 means unsupported) */
sbi->s_awu_min = 0;
4454+
sbi->s_awu_max = 0;
4455+
}
4456+
}
4457+
44284458
static void ext4_fast_commit_init(struct super_block *sb)
44294459
{
44304460
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -5336,6 +5366,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
53365366

53375367
spin_lock_init(&sbi->s_bdev_wb_lock);
53385368

5369+
ext4_atomic_write_init(sb);
53395370
ext4_fast_commit_init(sb);
53405371

53415372
sb->s_root = NULL;

fs/iomap/direct-io.c

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,7 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
271271
* clearing the WRITE_THROUGH flag in the dio request.
272272
*/
273273
static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
274-
const struct iomap *iomap, bool use_fua)
274+
const struct iomap *iomap, bool use_fua, bool atomic)
275275
{
276276
blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
277277

@@ -283,6 +283,8 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
283283
opflags |= REQ_FUA;
284284
else
285285
dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
286+
if (atomic)
287+
opflags |= REQ_ATOMIC;
286288

287289
return opflags;
288290
}
@@ -293,7 +295,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
293295
const struct iomap *iomap = &iter->iomap;
294296
struct inode *inode = iter->inode;
295297
unsigned int fs_block_size = i_blocksize(inode), pad;
296-
loff_t length = iomap_length(iter);
298+
const loff_t length = iomap_length(iter);
299+
bool atomic = iter->flags & IOMAP_ATOMIC;
297300
loff_t pos = iter->pos;
298301
blk_opf_t bio_opf;
299302
struct bio *bio;
@@ -303,6 +306,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
303306
size_t copied = 0;
304307
size_t orig_count;
305308

309+
if (atomic && length != fs_block_size)
310+
return -EINVAL;
311+
306312
if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
307313
!bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
308314
return -EINVAL;
@@ -382,7 +388,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
382388
* can set up the page vector appropriately for a ZONE_APPEND
383389
* operation.
384390
*/
385-
bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
391+
bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic);
386392

387393
nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
388394
do {
@@ -415,6 +421,17 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
415421
}
416422

417423
n = bio->bi_iter.bi_size;
424+
if (WARN_ON_ONCE(atomic && n != length)) {
425+
/*
426+
* This bio should have covered the complete length,
427+
* which it doesn't, so error. We may need to zero out
428+
* the tail (complete FS block), similar to when
429+
* bio_iov_iter_get_pages() returns an error, above.
430+
*/
431+
ret = -EINVAL;
432+
bio_put(bio);
433+
goto zero_tail;
434+
}
418435
if (dio->flags & IOMAP_DIO_WRITE) {
419436
task_io_account_write(n);
420437
} else {
@@ -598,6 +615,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
598615
if (iocb->ki_flags & IOCB_NOWAIT)
599616
iomi.flags |= IOMAP_NOWAIT;
600617

618+
if (iocb->ki_flags & IOCB_ATOMIC)
619+
iomi.flags |= IOMAP_ATOMIC;
620+
601621
if (iov_iter_rw(iter) == READ) {
602622
/* reads can always complete inline */
603623
dio->flags |= IOMAP_DIO_INLINE_COMP;
@@ -659,7 +679,17 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
659679
if (ret != -EAGAIN) {
660680
trace_iomap_dio_invalidate_fail(inode, iomi.pos,
661681
iomi.len);
662-
ret = -ENOTBLK;
682+
if (iocb->ki_flags & IOCB_ATOMIC) {
683+
/*
684+
* folio invalidation failed, maybe
685+
* this is transient, unlock and see if
686+
* the caller tries again.
687+
*/
688+
ret = -EAGAIN;
689+
} else {
690+
/* fall back to buffered write */
691+
ret = -ENOTBLK;
692+
}
663693
}
664694
goto out_free_dio;
665695
}

fs/iomap/trace.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
9898
{ IOMAP_REPORT, "REPORT" }, \
9999
{ IOMAP_FAULT, "FAULT" }, \
100100
{ IOMAP_DIRECT, "DIRECT" }, \
101-
{ IOMAP_NOWAIT, "NOWAIT" }
101+
{ IOMAP_NOWAIT, "NOWAIT" }, \
102+
{ IOMAP_ATOMIC, "ATOMIC" }
102103

103104
#define IOMAP_F_FLAGS_STRINGS \
104105
{ IOMAP_F_NEW, "NEW" }, \

fs/read_write.c

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1830,18 +1830,22 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out)
18301830
return 0;
18311831
}
18321832

1833-
bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos)
1833+
int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter)
18341834
{
18351835
size_t len = iov_iter_count(iter);
18361836

18371837
if (!iter_is_ubuf(iter))
1838-
return false;
1838+
return -EINVAL;
18391839

18401840
if (!is_power_of_2(len))
1841-
return false;
1841+
return -EINVAL;
1842+
1843+
if (!IS_ALIGNED(iocb->ki_pos, len))
1844+
return -EINVAL;
18421845

1843-
if (!IS_ALIGNED(pos, len))
1844-
return false;
1846+
if (!(iocb->ki_flags & IOCB_DIRECT))
1847+
return -EOPNOTSUPP;
18451848

1846-
return true;
1849+
return 0;
18471850
}
1851+
EXPORT_SYMBOL_GPL(generic_atomic_write_valid);

0 commit comments

Comments
 (0)