Skip to content

Commit 725737e

Browse files
committed
Merge tag 'statx-dioalign-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux
Pull STATX_DIOALIGN support from Eric Biggers:

"Make statx() support reporting direct I/O (DIO) alignment information. This provides a generic interface for userspace programs to determine whether a file supports DIO, and if so, with what alignment restrictions.

Specifically, STATX_DIOALIGN works on block devices, and on regular files when their containing filesystem has implemented support.

An interface like this has been requested for years, since the conditions for when DIO is supported in Linux have gotten increasingly complex over time. Today, DIO support and alignment requirements can be affected by various filesystem features such as multi-device support, data journalling, inline data, encryption, verity, compression, checkpoint disabling, log-structured mode, etc.

Further complicating things, Linux v6.0 relaxed the traditional rule of DIO needing to be aligned to the block device's logical block size; now user buffers (but not file offsets) only need to be aligned to the DMA alignment.

The approach of uplifting the XFS-specific ioctl XFS_IOC_DIOINFO was discarded in favor of creating a clean new interface with statx().

For more information, see the individual commits and the man page update [1]."

Link: https://lore.kernel.org/r/[message-id hidden by this page's e-mail obfuscation] [1]

* tag 'statx-dioalign-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux:
  - xfs: support STATX_DIOALIGN
  - f2fs: support STATX_DIOALIGN
  - f2fs: simplify f2fs_force_buffered_io()
  - f2fs: move f2fs_force_buffered_io() into file.c
  - ext4: support STATX_DIOALIGN
  - fscrypt: change fscrypt_dio_supported() to prepare for STATX_DIOALIGN
  - vfs: support STATX_DIOALIGN on block devices
  - statx: add direct I/O alignment information
2 parents 5779aa2 + 61a223d commit 725737e

File tree

13 files changed

+188
-83
lines changed

13 files changed

+188
-83
lines changed

block/bdev.c

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include <linux/namei.h>
2727
#include <linux/part_stat.h>
2828
#include <linux/uaccess.h>
29+
#include <linux/stat.h>
2930
#include "../fs/internal.h"
3031
#include "blk.h"
3132

@@ -1069,3 +1070,25 @@ void sync_bdevs(bool wait)
10691070
spin_unlock(&blockdev_superblock->s_inode_list_lock);
10701071
iput(old_inode);
10711072
}
1073+
1074+
/*
1075+
* Handle STATX_DIOALIGN for block devices.
1076+
*
1077+
* Note that the inode passed to this is the inode of a block device node file,
1078+
* not the block device's internal inode. Therefore it is *not* valid to use
1079+
* I_BDEV() here; the block device has to be looked up by i_rdev instead.
1080+
*/
1081+
void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
1082+
{
1083+
struct block_device *bdev;
1084+
/* Look the device up by its device number; paired with blkdev_put_no_open() below. */
1085+
bdev = blkdev_get_no_open(inode->i_rdev);
1086+
if (!bdev)
/* Lookup failed: leave *stat untouched, so STATX_DIOALIGN is not set in result_mask. */
1087+
return;
1088+
/*
 * Memory (user buffer) alignment and file offset alignment are reported
 * separately. NOTE(review): the "+ 1" presumably converts an alignment
 * mask into a byte count -- confirm against bdev_dma_alignment().
 */
1089+
stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
1090+
stat->dio_offset_align = bdev_logical_block_size(bdev);
/* Tell the caller that both dio_* fields above are valid. */
1091+
stat->result_mask |= STATX_DIOALIGN;
1092+
1093+
blkdev_put_no_open(bdev);
1094+
}

fs/crypto/inline_crypt.c

Lines changed: 24 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -396,46 +396,45 @@ bool fscrypt_mergeable_bio_bh(struct bio *bio,
396396
EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio_bh);
397397

398398
/**
399-
* fscrypt_dio_supported() - check whether a DIO (direct I/O) request is
400-
* supported as far as encryption is concerned
401-
* @iocb: the file and position the I/O is targeting
402-
* @iter: the I/O data segment(s)
399+
* fscrypt_dio_supported() - check whether DIO (direct I/O) is supported on an
400+
* inode, as far as encryption is concerned
401+
* @inode: the inode in question
403402
*
404403
* Return: %true if there are no encryption constraints that prevent DIO from
405404
* being supported; %false if DIO is unsupported. (Note that in the
406405
* %true case, the filesystem might have other, non-encryption-related
407-
* constraints that prevent DIO from actually being supported.)
406+
* constraints that prevent DIO from actually being supported. Also, on
407+
* encrypted files the filesystem is still responsible for only allowing
408+
* DIO when requests are filesystem-block-aligned.)
408409
*/
409-
bool fscrypt_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
410+
bool fscrypt_dio_supported(struct inode *inode)
410411
{
411-
const struct inode *inode = file_inode(iocb->ki_filp);
412-
const unsigned int blocksize = i_blocksize(inode);
412+
int err;
413413

414414
/* If the file is unencrypted, no veto from us. */
415415
if (!fscrypt_needs_contents_encryption(inode))
416416
return true;
417417

418-
/* We only support DIO with inline crypto, not fs-layer crypto. */
419-
if (!fscrypt_inode_uses_inline_crypto(inode))
420-
return false;
421-
422418
/*
423-
* Since the granularity of encryption is filesystem blocks, the file
424-
* position and total I/O length must be aligned to the filesystem block
425-
* size -- not just to the block device's logical block size as is
426-
* traditionally the case for DIO on many filesystems.
419+
* We only support DIO with inline crypto, not fs-layer crypto.
427420
*
428-
* We require that the user-provided memory buffers be filesystem block
429-
* aligned too. It is simpler to have a single alignment value required
430-
* for all properties of the I/O, as is normally the case for DIO.
431-
* Also, allowing less aligned buffers would imply that data units could
432-
* cross bvecs, which would greatly complicate the I/O stack, which
433-
* assumes that bios can be split at any bvec boundary.
421+
* To determine whether the inode is using inline crypto, we have to set
422+
* up the key if it wasn't already done. This is because in the current
423+
* design of fscrypt, the decision of whether to use inline crypto or
424+
* not isn't made until the inode's encryption key is being set up. In
425+
* the DIO read/write case, the key will always be set up already, since
426+
* the file will be open. But in the case of statx(), the key might not
427+
* be set up yet, as the file might not have been opened yet.
434428
*/
435-
if (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), blocksize))
429+
err = fscrypt_require_key(inode);
430+
if (err) {
431+
/*
432+
* Key unavailable or couldn't be set up. This edge case isn't
433+
* worth worrying about; just report that DIO is unsupported.
434+
*/
436435
return false;
437-
438-
return true;
436+
}
437+
return fscrypt_inode_uses_inline_crypto(inode);
439438
}
440439
EXPORT_SYMBOL_GPL(fscrypt_dio_supported);
441440

fs/ext4/ext4.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2977,6 +2977,7 @@ extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
29772977
extern int ext4_write_inode(struct inode *, struct writeback_control *);
29782978
extern int ext4_setattr(struct user_namespace *, struct dentry *,
29792979
struct iattr *);
2980+
extern u32 ext4_dio_alignment(struct inode *inode);
29802981
extern int ext4_getattr(struct user_namespace *, const struct path *,
29812982
struct kstat *, u32, unsigned int);
29822983
extern void ext4_evict_inode(struct inode *);

fs/ext4/file.c

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -36,19 +36,34 @@
3636
#include "acl.h"
3737
#include "truncate.h"
3838

39-
static bool ext4_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
39+
/*
40+
* Returns %true if the given DIO request should be attempted with DIO, or
41+
* %false if it should fall back to buffered I/O.
42+
*
43+
* DIO isn't well specified; when it's unsupported (either due to the request
44+
* being misaligned, or due to the file not supporting DIO at all), filesystems
45+
* either fall back to buffered I/O or return EINVAL. For files that don't use
46+
* any special features like encryption or verity, ext4 has traditionally
47+
* returned EINVAL for misaligned DIO. iomap_dio_rw() uses this convention too.
48+
* In this case, we should attempt the DIO, *not* fall back to buffered I/O.
49+
*
50+
* In contrast, in cases where DIO is unsupported due to ext4 features, ext4
51+
* traditionally falls back to buffered I/O.
52+
*
53+
* This function implements the traditional ext4 behavior in all these cases.
54+
*/
55+
static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter)
4056
{
4157
struct inode *inode = file_inode(iocb->ki_filp);
58+
u32 dio_align = ext4_dio_alignment(inode);
4259

43-
if (!fscrypt_dio_supported(iocb, iter))
44-
return false;
45-
if (fsverity_active(inode))
60+
if (dio_align == 0)
4661
return false;
47-
if (ext4_should_journal_data(inode))
48-
return false;
49-
if (ext4_has_inline_data(inode))
50-
return false;
51-
return true;
62+
63+
if (dio_align == 1)
64+
return true;
65+
66+
return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), dio_align);
5267
}
5368

5469
static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -63,7 +78,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
6378
inode_lock_shared(inode);
6479
}
6580

66-
if (!ext4_dio_supported(iocb, to)) {
81+
if (!ext4_should_use_dio(iocb, to)) {
6782
inode_unlock_shared(inode);
6883
/*
6984
* Fallback to buffered I/O if the operation being performed on
@@ -511,7 +526,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
511526
}
512527

513528
/* Fallback to buffered I/O if the inode does not support direct I/O. */
514-
if (!ext4_dio_supported(iocb, from)) {
529+
if (!ext4_should_use_dio(iocb, from)) {
515530
if (ilock_shared)
516531
inode_unlock_shared(inode);
517532
else

fs/ext4/inode.c

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5550,6 +5550,22 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
55505550
return error;
55515551
}
55525552

5553+
/*
 * Report the direct I/O alignment that ext4 itself imposes on @inode.
 *
 * Return values:
 *   0                   - DIO is not supported on this inode (verity is
 *                         active, data is journalled, data is stored inline,
 *                         or the file is encrypted and fscrypt does not
 *                         allow DIO on it).
 *   1                   - no ext4-specific restriction; the caller should
 *                         use the iomap defaults.
 *   i_blocksize(@inode) - encrypted file on which DIO is allowed.
 */
u32 ext4_dio_alignment(struct inode *inode)
5554+
{
5555+
if (fsverity_active(inode))
5556+
return 0;
5557+
if (ext4_should_journal_data(inode))
5558+
return 0;
5559+
if (ext4_has_inline_data(inode))
5560+
return 0;
/* Encrypted files: either no DIO at all, or filesystem-block alignment. */
5561+
if (IS_ENCRYPTED(inode)) {
5562+
if (!fscrypt_dio_supported(inode))
5563+
return 0;
5564+
return i_blocksize(inode);
5565+
}
5566+
return 1; /* use the iomap defaults */
5567+
}
5568+
55535569
int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path,
55545570
struct kstat *stat, u32 request_mask, unsigned int query_flags)
55555571
{
@@ -5565,6 +5581,27 @@ int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path,
55655581
stat->btime.tv_nsec = ei->i_crtime.tv_nsec;
55665582
}
55675583

5584+
/*
5585+
* Return the DIO alignment restrictions if requested. We only return
5586+
* this information when requested, since on encrypted files it might
5587+
* take a fair bit of work to get if the file wasn't opened recently.
5588+
*/
5589+
if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
5590+
u32 dio_align = ext4_dio_alignment(inode);
5591+
5592+
stat->result_mask |= STATX_DIOALIGN;
5593+
if (dio_align == 1) {
5594+
struct block_device *bdev = inode->i_sb->s_bdev;
5595+
5596+
/* iomap defaults */
5597+
stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
5598+
stat->dio_offset_align = bdev_logical_block_size(bdev);
5599+
} else {
5600+
stat->dio_mem_align = dio_align;
5601+
stat->dio_offset_align = dio_align;
5602+
}
5603+
}
5604+
55685605
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
55695606
if (flags & EXT4_APPEND_FL)
55705607
stat->attributes |= STATX_ATTR_APPEND;

fs/f2fs/f2fs.h

Lines changed: 0 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -4471,17 +4471,6 @@ static inline void f2fs_i_compr_blocks_update(struct inode *inode,
44714471
f2fs_mark_inode_dirty_sync(inode, true);
44724472
}
44734473

4474-
static inline int block_unaligned_IO(struct inode *inode,
4475-
struct kiocb *iocb, struct iov_iter *iter)
4476-
{
4477-
unsigned int i_blkbits = READ_ONCE(inode->i_blkbits);
4478-
unsigned int blocksize_mask = (1 << i_blkbits) - 1;
4479-
loff_t offset = iocb->ki_pos;
4480-
unsigned long align = offset | iov_iter_alignment(iter);
4481-
4482-
return align & blocksize_mask;
4483-
}
4484-
44854474
static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi,
44864475
int flag)
44874476
{
@@ -4492,35 +4481,6 @@ static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi,
44924481
return sbi->aligned_blksize;
44934482
}
44944483

4495-
static inline bool f2fs_force_buffered_io(struct inode *inode,
4496-
struct kiocb *iocb, struct iov_iter *iter)
4497-
{
4498-
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
4499-
int rw = iov_iter_rw(iter);
4500-
4501-
if (!fscrypt_dio_supported(iocb, iter))
4502-
return true;
4503-
if (fsverity_active(inode))
4504-
return true;
4505-
if (f2fs_compressed_file(inode))
4506-
return true;
4507-
4508-
/* disallow direct IO if any of devices has unaligned blksize */
4509-
if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize)
4510-
return true;
4511-
4512-
if (f2fs_lfs_mode(sbi) && (rw == WRITE)) {
4513-
if (block_unaligned_IO(inode, iocb, iter))
4514-
return true;
4515-
if (F2FS_IO_ALIGNED(sbi))
4516-
return true;
4517-
}
4518-
if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED))
4519-
return true;
4520-
4521-
return false;
4522-
}
4523-
45244484
static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
45254485
{
45264486
return fsverity_active(inode) &&

fs/f2fs/file.c

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -808,6 +808,29 @@ int f2fs_truncate(struct inode *inode)
808808
return 0;
809809
}
810810

811+
/*
 * Decide whether direct I/O must be refused for @inode in direction @rw.
 *
 * @inode: the file being accessed
 * @rw:    I/O direction; only compared against WRITE below
 *
 * Returns true if any check below rules DIO out, false otherwise. The
 * decision uses only inode and superblock state plus the direction; it does
 * not look at the position or buffer alignment of any particular request.
 */
static bool f2fs_force_buffered_io(struct inode *inode, int rw)
812+
{
813+
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
814+
/* Per-file features that veto DIO: encryption constraints, verity, compression. */
815+
if (!fscrypt_dio_supported(inode))
816+
return true;
817+
if (fsverity_active(inode))
818+
return true;
819+
if (f2fs_compressed_file(inode))
820+
return true;
821+
822+
/* disallow direct IO if any of devices has unaligned blksize */
823+
if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize)
824+
return true;
825+
/* The only direction-dependent check: applies to writes, never to reads. */
826+
if (f2fs_lfs_mode(sbi) && rw == WRITE && F2FS_IO_ALIGNED(sbi))
827+
return true;
/* No DIO while the SBI_CP_DISABLED flag is set on the superblock. */
828+
if (is_sbi_flag_set(sbi, SBI_CP_DISABLED))
829+
return true;
830+
831+
return false;
832+
}
833+
811834
int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
812835
struct kstat *stat, u32 request_mask, unsigned int query_flags)
813836
{
@@ -824,6 +847,24 @@ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
824847
stat->btime.tv_nsec = fi->i_crtime.tv_nsec;
825848
}
826849

850+
/*
851+
* Return the DIO alignment restrictions if requested. We only return
852+
* this information when requested, since on encrypted files it might
853+
* take a fair bit of work to get if the file wasn't opened recently.
854+
*
855+
* f2fs sometimes supports DIO reads but not DIO writes. STATX_DIOALIGN
856+
* cannot represent that, so in that case we report no DIO support.
857+
*/
858+
if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
859+
unsigned int bsize = i_blocksize(inode);
860+
861+
stat->result_mask |= STATX_DIOALIGN;
862+
if (!f2fs_force_buffered_io(inode, WRITE)) {
863+
stat->dio_mem_align = bsize;
864+
stat->dio_offset_align = bsize;
865+
}
866+
}
867+
827868
flags = fi->i_flags;
828869
if (flags & F2FS_COMPR_FL)
829870
stat->attributes |= STATX_ATTR_COMPRESSED;
@@ -4182,7 +4223,7 @@ static bool f2fs_should_use_dio(struct inode *inode, struct kiocb *iocb,
41824223
if (!(iocb->ki_flags & IOCB_DIRECT))
41834224
return false;
41844225

4185-
if (f2fs_force_buffered_io(inode, iocb, iter))
4226+
if (f2fs_force_buffered_io(inode, iov_iter_rw(iter)))
41864227
return false;
41874228

41884229
/*

fs/stat.c

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
* Copyright (C) 1991, 1992 Linus Torvalds
66
*/
77

8+
#include <linux/blkdev.h>
89
#include <linux/export.h>
910
#include <linux/mm.h>
1011
#include <linux/errno.h>
@@ -230,11 +231,22 @@ static int vfs_statx(int dfd, struct filename *filename, int flags,
230231
goto out;
231232

232233
error = vfs_getattr(&path, stat, request_mask, flags);
234+
233235
stat->mnt_id = real_mount(path.mnt)->mnt_id;
234236
stat->result_mask |= STATX_MNT_ID;
237+
235238
if (path.mnt->mnt_root == path.dentry)
236239
stat->attributes |= STATX_ATTR_MOUNT_ROOT;
237240
stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT;
241+
242+
/* Handle STATX_DIOALIGN for block devices. */
243+
if (request_mask & STATX_DIOALIGN) {
244+
struct inode *inode = d_backing_inode(path.dentry);
245+
246+
if (S_ISBLK(inode->i_mode))
247+
bdev_statx_dioalign(inode, stat);
248+
}
249+
238250
path_put(&path);
239251
if (retry_estale(error, lookup_flags)) {
240252
lookup_flags |= LOOKUP_REVAL;
@@ -611,6 +623,8 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
611623
tmp.stx_dev_major = MAJOR(stat->dev);
612624
tmp.stx_dev_minor = MINOR(stat->dev);
613625
tmp.stx_mnt_id = stat->mnt_id;
626+
tmp.stx_dio_mem_align = stat->dio_mem_align;
627+
tmp.stx_dio_offset_align = stat->dio_offset_align;
614628

615629
return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
616630
}

0 commit comments

Comments
 (0)