Skip to content

Commit 241c7ed

Browse files
committed
Merge tag 'vfs-6.13.untorn.writes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs untorn write support from Christian Brauner: "An atomic write is a write issued with torn-write protection. This means for a power failure or any hardware failure all or none of the data from the write will be stored, never a mix of old and new data. This work is already supported for block devices. If a block device is opened with O_DIRECT and the block device supports atomic write, then FMODE_CAN_ATOMIC_WRITE is added to the file of the opened block device. This contains the work to expand atomic write support to filesystems, specifically ext4 and XFS. Currently, only support for writing exactly one filesystem block atomically is added. Since it's now possible to have filesystem block size > page size for XFS, it's possible to write 4K+ blocks atomically on x86" * tag 'vfs-6.13.untorn.writes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: iomap: drop an obsolete comment in iomap_dio_bio_iter ext4: Do not fallback to buffered-io for DIO atomic write ext4: Support setting FMODE_CAN_ATOMIC_WRITE ext4: Check for atomic writes support in write iter ext4: Add statx support for atomic writes xfs: Support setting FMODE_CAN_ATOMIC_WRITE xfs: Validate atomic writes xfs: Support atomic write for statx fs: iomap: Atomic write support fs: Export generic_atomic_write_valid() block: Add bdev atomic write limits helpers fs/block: Check for IOCB_DIRECT in generic_atomic_write_valid() block/fs: Pass an iocb to generic_atomic_write_valid()
2 parents 7956186 + 5407943 commit 241c7ed

File tree

17 files changed

+254
-32
lines changed

17 files changed

+254
-32
lines changed

Documentation/filesystems/iomap/operations.rst

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -513,6 +513,21 @@ IOMAP_WRITE`` with any combination of the following enhancements:
513513
if the mapping is unwritten and the filesystem cannot handle zeroing
514514
the unaligned regions without exposing stale contents.
515515

516+
* ``IOMAP_ATOMIC``: This write is being issued with torn-write
517+
protection.
518+
Only a single bio can be created for the write, and the write must
519+
not be split into multiple I/O requests, i.e. flag REQ_ATOMIC must be
520+
set.
521+
The file range to write must be aligned to satisfy the requirements
522+
of both the filesystem and the underlying block device's atomic
523+
commit capabilities.
524+
If filesystem metadata updates are required (e.g. unwritten extent
525+
conversion or copy on write), all updates for the entire file range
526+
must be committed atomically as well.
527+
Only one space mapping is allowed per untorn write.
528+
Untorn writes must be aligned to, and must not be longer than, a
529+
single file block.
530+
516531
Callers commonly hold ``i_rwsem`` in shared or exclusive mode before
517532
calling this function.
518533

block/fops.c

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,10 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
3535
return opf;
3636
}
3737

38-
static bool blkdev_dio_invalid(struct block_device *bdev, loff_t pos,
39-
struct iov_iter *iter, bool is_atomic)
38+
static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb,
39+
struct iov_iter *iter)
4040
{
41-
if (is_atomic && !generic_atomic_write_valid(iter, pos))
42-
return true;
43-
44-
return pos & (bdev_logical_block_size(bdev) - 1) ||
41+
return iocb->ki_pos & (bdev_logical_block_size(bdev) - 1) ||
4542
!bdev_iter_is_aligned(bdev, iter);
4643
}
4744

@@ -368,13 +365,12 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
368365
static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
369366
{
370367
struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
371-
bool is_atomic = iocb->ki_flags & IOCB_ATOMIC;
372368
unsigned int nr_pages;
373369

374370
if (!iov_iter_count(iter))
375371
return 0;
376372

377-
if (blkdev_dio_invalid(bdev, iocb->ki_pos, iter, is_atomic))
373+
if (blkdev_dio_invalid(bdev, iocb, iter))
378374
return -EINVAL;
379375

380376
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
@@ -383,7 +379,7 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
383379
return __blkdev_direct_IO_simple(iocb, iter, bdev,
384380
nr_pages);
385381
return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages);
386-
} else if (is_atomic) {
382+
} else if (iocb->ki_flags & IOCB_ATOMIC) {
387383
return -EINVAL;
388384
}
389385
return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages));
@@ -625,7 +621,7 @@ static int blkdev_open(struct inode *inode, struct file *filp)
625621
if (!bdev)
626622
return -ENXIO;
627623

628-
if (bdev_can_atomic_write(bdev) && filp->f_flags & O_DIRECT)
624+
if (bdev_can_atomic_write(bdev))
629625
filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;
630626

631627
ret = bdev_open(bdev, mode, filp->private_data, NULL, filp);
@@ -700,6 +696,12 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
700696
if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
701697
return -EOPNOTSUPP;
702698

699+
if (iocb->ki_flags & IOCB_ATOMIC) {
700+
ret = generic_atomic_write_valid(iocb, from);
701+
if (ret)
702+
return ret;
703+
}
704+
703705
size -= iocb->ki_pos;
704706
if (iov_iter_count(from) > size) {
705707
shorted = iov_iter_count(from) - size;

fs/ext4/ext4.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1729,6 +1729,10 @@ struct ext4_sb_info {
17291729
*/
17301730
struct work_struct s_sb_upd_work;
17311731

1732+
/* Atomic write unit values in bytes */
1733+
unsigned int s_awu_min;
1734+
unsigned int s_awu_max;
1735+
17321736
/* Ext4 fast commit sub transaction ID */
17331737
atomic_t s_fc_subtid;
17341738

@@ -3855,6 +3859,12 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh)
38553859
return buffer_uptodate(bh);
38563860
}
38573861

3862+
static inline bool ext4_inode_can_atomic_write(struct inode *inode)
3863+
{
3864+
3865+
return S_ISREG(inode->i_mode) && EXT4_SB(inode->i_sb)->s_awu_min > 0;
3866+
}
3867+
38583868
extern int ext4_block_write_begin(handle_t *handle, struct folio *folio,
38593869
loff_t pos, unsigned len,
38603870
get_block_t *get_block);

fs/ext4/file.c

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -599,6 +599,13 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
599599
ssize_t err;
600600
loff_t endbyte;
601601

602+
/*
603+
* There is no support for atomic writes on buffered-io yet,
604+
* we should never fall back to buffered-io for DIO atomic
605+
* writes.
606+
*/
607+
WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC);
608+
602609
offset = iocb->ki_pos;
603610
err = ext4_buffered_write_iter(iocb, from);
604611
if (err < 0)
@@ -692,6 +699,20 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
692699
if (IS_DAX(inode))
693700
return ext4_dax_write_iter(iocb, from);
694701
#endif
702+
703+
if (iocb->ki_flags & IOCB_ATOMIC) {
704+
size_t len = iov_iter_count(from);
705+
int ret;
706+
707+
if (len < EXT4_SB(inode->i_sb)->s_awu_min ||
708+
len > EXT4_SB(inode->i_sb)->s_awu_max)
709+
return -EINVAL;
710+
711+
ret = generic_atomic_write_valid(iocb, from);
712+
if (ret)
713+
return ret;
714+
}
715+
695716
if (iocb->ki_flags & IOCB_DIRECT)
696717
return ext4_dio_write_iter(iocb, from);
697718
else
@@ -884,6 +905,9 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
884905
return ret;
885906
}
886907

908+
if (ext4_inode_can_atomic_write(inode))
909+
filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;
910+
887911
filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
888912
return dquot_file_open(inode, filp);
889913
}

fs/ext4/inode.c

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3444,17 +3444,34 @@ static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset,
34443444
return ret;
34453445
}
34463446

3447+
static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written)
3448+
{
3449+
/* must be a directio to fall back to buffered */
3450+
if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) !=
3451+
(IOMAP_WRITE | IOMAP_DIRECT))
3452+
return false;
3453+
3454+
/* atomic writes are all-or-nothing */
3455+
if (flags & IOMAP_ATOMIC)
3456+
return false;
3457+
3458+
/* can only try again if we wrote nothing */
3459+
return written == 0;
3460+
}
3461+
34473462
static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
34483463
ssize_t written, unsigned flags, struct iomap *iomap)
34493464
{
34503465
/*
34513466
* Check to see whether an error occurred while writing out the data to
3452-
* the allocated blocks. If so, return the magic error code so that we
3453-
* fallback to buffered I/O and attempt to complete the remainder of
3454-
* the I/O. Any blocks that may have been allocated in preparation for
3455-
* the direct I/O will be reused during buffered I/O.
3467+
* the allocated blocks. If so, return the magic error code for
3468+
* non-atomic write so that we fall back to buffered I/O and attempt to
3469+
* complete the remainder of the I/O.
3470+
* For non-atomic writes, any blocks that may have been
3471+
* allocated in preparation for the direct I/O will be reused during
3472+
* buffered I/O. For atomic write, we never fall back to buffered-io.
34563473
*/
3457-
if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0)
3474+
if (ext4_want_directio_fallback(flags, written))
34583475
return -ENOTBLK;
34593476

34603477
return 0;
@@ -5578,6 +5595,18 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path,
55785595
}
55795596
}
55805597

5598+
if ((request_mask & STATX_WRITE_ATOMIC) && S_ISREG(inode->i_mode)) {
5599+
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5600+
unsigned int awu_min = 0, awu_max = 0;
5601+
5602+
if (ext4_inode_can_atomic_write(inode)) {
5603+
awu_min = sbi->s_awu_min;
5604+
awu_max = sbi->s_awu_max;
5605+
}
5606+
5607+
generic_fill_statx_atomic_writes(stat, awu_min, awu_max);
5608+
}
5609+
55815610
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
55825611
if (flags & EXT4_APPEND_FL)
55835612
stat->attributes |= STATX_ATTR_APPEND;

fs/ext4/super.c

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4425,6 +4425,36 @@ static int ext4_handle_clustersize(struct super_block *sb)
44254425
return 0;
44264426
}
44274427

4428+
/*
4429+
* ext4_atomic_write_init: Initializes filesystem min & max atomic write units.
4430+
* @sb: super block
4431+
* TODO: Later add support for bigalloc
4432+
*/
4433+
static void ext4_atomic_write_init(struct super_block *sb)
4434+
{
4435+
struct ext4_sb_info *sbi = EXT4_SB(sb);
4436+
struct block_device *bdev = sb->s_bdev;
4437+
4438+
if (!bdev_can_atomic_write(bdev))
4439+
return;
4440+
4441+
if (!ext4_has_feature_extents(sb))
4442+
return;
4443+
4444+
sbi->s_awu_min = max(sb->s_blocksize,
4445+
bdev_atomic_write_unit_min_bytes(bdev));
4446+
sbi->s_awu_max = min(sb->s_blocksize,
4447+
bdev_atomic_write_unit_max_bytes(bdev));
4448+
if (sbi->s_awu_min && sbi->s_awu_max &&
4449+
sbi->s_awu_min <= sbi->s_awu_max) {
4450+
ext4_msg(sb, KERN_NOTICE, "Supports (experimental) DIO atomic writes awu_min: %u, awu_max: %u",
4451+
sbi->s_awu_min, sbi->s_awu_max);
4452+
} else {
4453+
sbi->s_awu_min = 0;
4454+
sbi->s_awu_max = 0;
4455+
}
4456+
}
4457+
44284458
static void ext4_fast_commit_init(struct super_block *sb)
44294459
{
44304460
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -5336,6 +5366,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
53365366

53375367
spin_lock_init(&sbi->s_bdev_wb_lock);
53385368

5369+
ext4_atomic_write_init(sb);
53395370
ext4_fast_commit_init(sb);
53405371

53415372
sb->s_root = NULL;

fs/iomap/direct-io.c

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,7 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
271271
* clearing the WRITE_THROUGH flag in the dio request.
272272
*/
273273
static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
274-
const struct iomap *iomap, bool use_fua)
274+
const struct iomap *iomap, bool use_fua, bool atomic)
275275
{
276276
blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
277277

@@ -283,6 +283,8 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
283283
opflags |= REQ_FUA;
284284
else
285285
dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
286+
if (atomic)
287+
opflags |= REQ_ATOMIC;
286288

287289
return opflags;
288290
}
@@ -293,7 +295,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
293295
const struct iomap *iomap = &iter->iomap;
294296
struct inode *inode = iter->inode;
295297
unsigned int fs_block_size = i_blocksize(inode), pad;
296-
loff_t length = iomap_length(iter);
298+
const loff_t length = iomap_length(iter);
299+
bool atomic = iter->flags & IOMAP_ATOMIC;
297300
loff_t pos = iter->pos;
298301
blk_opf_t bio_opf;
299302
struct bio *bio;
@@ -303,6 +306,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
303306
size_t copied = 0;
304307
size_t orig_count;
305308

309+
if (atomic && length != fs_block_size)
310+
return -EINVAL;
311+
306312
if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
307313
!bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
308314
return -EINVAL;
@@ -377,12 +383,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
377383
goto out;
378384
}
379385

380-
/*
381-
* Set the operation flags early so that bio_iov_iter_get_pages
382-
* can set up the page vector appropriately for a ZONE_APPEND
383-
* operation.
384-
*/
385-
bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
386+
bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic);
386387

387388
nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
388389
do {
@@ -415,6 +416,17 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
415416
}
416417

417418
n = bio->bi_iter.bi_size;
419+
if (WARN_ON_ONCE(atomic && n != length)) {
420+
/*
421+
* This bio should have covered the complete length,
422+
* which it doesn't, so error. We may need to zero out
423+
* the tail (complete FS block), similar to when
424+
* bio_iov_iter_get_pages() returns an error, above.
425+
*/
426+
ret = -EINVAL;
427+
bio_put(bio);
428+
goto zero_tail;
429+
}
418430
if (dio->flags & IOMAP_DIO_WRITE) {
419431
task_io_account_write(n);
420432
} else {
@@ -598,6 +610,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
598610
if (iocb->ki_flags & IOCB_NOWAIT)
599611
iomi.flags |= IOMAP_NOWAIT;
600612

613+
if (iocb->ki_flags & IOCB_ATOMIC)
614+
iomi.flags |= IOMAP_ATOMIC;
615+
601616
if (iov_iter_rw(iter) == READ) {
602617
/* reads can always complete inline */
603618
dio->flags |= IOMAP_DIO_INLINE_COMP;
@@ -659,7 +674,17 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
659674
if (ret != -EAGAIN) {
660675
trace_iomap_dio_invalidate_fail(inode, iomi.pos,
661676
iomi.len);
662-
ret = -ENOTBLK;
677+
if (iocb->ki_flags & IOCB_ATOMIC) {
678+
/*
679+
* folio invalidation failed, maybe
680+
* this is transient, unlock and see if
681+
* the caller tries again.
682+
*/
683+
ret = -EAGAIN;
684+
} else {
685+
/* fall back to buffered write */
686+
ret = -ENOTBLK;
687+
}
663688
}
664689
goto out_free_dio;
665690
}

fs/iomap/trace.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
9898
{ IOMAP_REPORT, "REPORT" }, \
9999
{ IOMAP_FAULT, "FAULT" }, \
100100
{ IOMAP_DIRECT, "DIRECT" }, \
101-
{ IOMAP_NOWAIT, "NOWAIT" }
101+
{ IOMAP_NOWAIT, "NOWAIT" }, \
102+
{ IOMAP_ATOMIC, "ATOMIC" }
102103

103104
#define IOMAP_F_FLAGS_STRINGS \
104105
{ IOMAP_F_NEW, "NEW" }, \

fs/read_write.c

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1830,18 +1830,22 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out)
18301830
return 0;
18311831
}
18321832

1833-
bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos)
1833+
int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter)
18341834
{
18351835
size_t len = iov_iter_count(iter);
18361836

18371837
if (!iter_is_ubuf(iter))
1838-
return false;
1838+
return -EINVAL;
18391839

18401840
if (!is_power_of_2(len))
1841-
return false;
1841+
return -EINVAL;
1842+
1843+
if (!IS_ALIGNED(iocb->ki_pos, len))
1844+
return -EINVAL;
18421845

1843-
if (!IS_ALIGNED(pos, len))
1844-
return false;
1846+
if (!(iocb->ki_flags & IOCB_DIRECT))
1847+
return -EOPNOTSUPP;
18451848

1846-
return true;
1849+
return 0;
18471850
}
1851+
EXPORT_SYMBOL_GPL(generic_atomic_write_valid);

0 commit comments

Comments
 (0)