Skip to content

Commit cf40ebb

Browse files
committed
Merge patch series "add STATX_DIO_READ_ALIGN v3"
Christoph Hellwig <[email protected]> says:

File systems that write out of place usually require different alignment
for direct I/O writes than what they can do for reads. This series tries
to address this by adding yet another statx field.

* patches from https://lore.kernel.org/r/[email protected]:
  xfs: report larger dio alignment for COW inodes
  xfs: report the correct read/write dio alignment for reflinked inodes
  xfs: cleanup xfs_vn_getattr
  fs: add STATX_DIO_READ_ALIGN
  fs: reformat the statx definition

Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Christian Brauner <[email protected]>
2 parents 40384c8 + 468210e commit cf40ebb

File tree

5 files changed

+125
-49
lines changed

5 files changed

+125
-49
lines changed

fs/stat.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -725,6 +725,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
725725
tmp.stx_mnt_id = stat->mnt_id;
726726
tmp.stx_dio_mem_align = stat->dio_mem_align;
727727
tmp.stx_dio_offset_align = stat->dio_offset_align;
728+
tmp.stx_dio_read_offset_align = stat->dio_read_offset_align;
728729
tmp.stx_subvol = stat->subvol;
729730
tmp.stx_atomic_write_unit_min = stat->atomic_write_unit_min;
730731
tmp.stx_atomic_write_unit_max = stat->atomic_write_unit_max;

fs/xfs/xfs_ioctl.c

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1204,7 +1204,16 @@ xfs_file_ioctl(
12041204
struct xfs_buftarg *target = xfs_inode_buftarg(ip);
12051205
struct dioattr da;
12061206

1207-
da.d_mem = da.d_miniosz = target->bt_logical_sectorsize;
1207+
da.d_mem = target->bt_logical_sectorsize;
1208+
1209+
/*
1210+
* See xfs_report_dioalign() for an explanation about why this
1211+
* reports a value larger than the sector size for COW inodes.
1212+
*/
1213+
if (xfs_is_cow_inode(ip))
1214+
da.d_miniosz = xfs_inode_alloc_unitsize(ip);
1215+
else
1216+
da.d_miniosz = target->bt_logical_sectorsize;
12081217
da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
12091218

12101219
if (copy_to_user(arg, &da, sizeof(da)))

fs/xfs/xfs_iops.c

Lines changed: 38 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -573,17 +573,43 @@ xfs_stat_blksize(
573573
}
574574

575575
static void
576-
xfs_get_atomic_write_attr(
576+
xfs_report_dioalign(
577577
struct xfs_inode *ip,
578-
unsigned int *unit_min,
579-
unsigned int *unit_max)
578+
struct kstat *stat)
580579
{
581-
if (!xfs_inode_can_atomicwrite(ip)) {
582-
*unit_min = *unit_max = 0;
583-
return;
584-
}
580+
struct xfs_buftarg *target = xfs_inode_buftarg(ip);
581+
struct block_device *bdev = target->bt_bdev;
582+
583+
stat->result_mask |= STATX_DIOALIGN | STATX_DIO_READ_ALIGN;
584+
stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
585585

586-
*unit_min = *unit_max = ip->i_mount->m_sb.sb_blocksize;
586+
/*
587+
* For COW inodes, we can only perform out of place writes of entire
588+
* allocation units (blocks or RT extents).
589+
* For writes smaller than the allocation unit, we must fall back to
590+
* buffered I/O to perform read-modify-write cycles. At best this is
591+
* highly inefficient; at worst it leads to page cache invalidation
592+
* races. Tell applications to avoid this by reporting the larger write
593+
* alignment in dio_offset_align, and the smaller read alignment in
594+
* dio_read_offset_align.
595+
*/
596+
stat->dio_read_offset_align = bdev_logical_block_size(bdev);
597+
if (xfs_is_cow_inode(ip))
598+
stat->dio_offset_align = xfs_inode_alloc_unitsize(ip);
599+
else
600+
stat->dio_offset_align = stat->dio_read_offset_align;
601+
}
602+
603+
static void
604+
xfs_report_atomic_write(
605+
struct xfs_inode *ip,
606+
struct kstat *stat)
607+
{
608+
unsigned int unit_min = 0, unit_max = 0;
609+
610+
if (xfs_inode_can_atomicwrite(ip))
611+
unit_min = unit_max = ip->i_mount->m_sb.sb_blocksize;
612+
generic_fill_statx_atomic_writes(stat, unit_min, unit_max);
587613
}
588614

589615
STATIC int
@@ -647,22 +673,10 @@ xfs_vn_getattr(
647673
stat->rdev = inode->i_rdev;
648674
break;
649675
case S_IFREG:
650-
if (request_mask & STATX_DIOALIGN) {
651-
struct xfs_buftarg *target = xfs_inode_buftarg(ip);
652-
struct block_device *bdev = target->bt_bdev;
653-
654-
stat->result_mask |= STATX_DIOALIGN;
655-
stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
656-
stat->dio_offset_align = bdev_logical_block_size(bdev);
657-
}
658-
if (request_mask & STATX_WRITE_ATOMIC) {
659-
unsigned int unit_min, unit_max;
660-
661-
xfs_get_atomic_write_attr(ip, &unit_min,
662-
&unit_max);
663-
generic_fill_statx_atomic_writes(stat,
664-
unit_min, unit_max);
665-
}
676+
if (request_mask & (STATX_DIOALIGN | STATX_DIO_READ_ALIGN))
677+
xfs_report_dioalign(ip, stat);
678+
if (request_mask & STATX_WRITE_ATOMIC)
679+
xfs_report_atomic_write(ip, stat);
666680
fallthrough;
667681
default:
668682
stat->blksize = xfs_stat_blksize(ip);

include/linux/stat.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ struct kstat {
5252
u64 mnt_id;
5353
u32 dio_mem_align;
5454
u32 dio_offset_align;
55+
u32 dio_read_offset_align;
5556
u64 change_cookie;
5657
u64 subvol;
5758
u32 atomic_write_unit_min;

include/uapi/linux/stat.h

Lines changed: 75 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -98,43 +98,93 @@ struct statx_timestamp {
9898
*/
9999
struct statx {
100100
/* 0x00 */
101-
__u32 stx_mask; /* What results were written [uncond] */
102-
__u32 stx_blksize; /* Preferred general I/O size [uncond] */
103-
__u64 stx_attributes; /* Flags conveying information about the file [uncond] */
101+
/* What results were written [uncond] */
102+
__u32 stx_mask;
103+
104+
/* Preferred general I/O size [uncond] */
105+
__u32 stx_blksize;
106+
107+
/* Flags conveying information about the file [uncond] */
108+
__u64 stx_attributes;
109+
104110
/* 0x10 */
105-
__u32 stx_nlink; /* Number of hard links */
106-
__u32 stx_uid; /* User ID of owner */
107-
__u32 stx_gid; /* Group ID of owner */
108-
__u16 stx_mode; /* File mode */
111+
/* Number of hard links */
112+
__u32 stx_nlink;
113+
114+
/* User ID of owner */
115+
__u32 stx_uid;
116+
117+
/* Group ID of owner */
118+
__u32 stx_gid;
119+
120+
/* File mode */
121+
__u16 stx_mode;
109122
__u16 __spare0[1];
123+
110124
/* 0x20 */
111-
__u64 stx_ino; /* Inode number */
112-
__u64 stx_size; /* File size */
113-
__u64 stx_blocks; /* Number of 512-byte blocks allocated */
114-
__u64 stx_attributes_mask; /* Mask to show what's supported in stx_attributes */
125+
/* Inode number */
126+
__u64 stx_ino;
127+
128+
/* File size */
129+
__u64 stx_size;
130+
131+
/* Number of 512-byte blocks allocated */
132+
__u64 stx_blocks;
133+
134+
/* Mask to show what's supported in stx_attributes */
135+
__u64 stx_attributes_mask;
136+
115137
/* 0x40 */
116-
struct statx_timestamp stx_atime; /* Last access time */
117-
struct statx_timestamp stx_btime; /* File creation time */
118-
struct statx_timestamp stx_ctime; /* Last attribute change time */
119-
struct statx_timestamp stx_mtime; /* Last data modification time */
138+
/* Last access time */
139+
struct statx_timestamp stx_atime;
140+
141+
/* File creation time */
142+
struct statx_timestamp stx_btime;
143+
144+
/* Last attribute change time */
145+
struct statx_timestamp stx_ctime;
146+
147+
/* Last data modification time */
148+
struct statx_timestamp stx_mtime;
149+
120150
/* 0x80 */
121-
__u32 stx_rdev_major; /* Device ID of special file [if bdev/cdev] */
151+
/* Device ID of special file [if bdev/cdev] */
152+
__u32 stx_rdev_major;
122153
__u32 stx_rdev_minor;
123-
__u32 stx_dev_major; /* ID of device containing file [uncond] */
154+
155+
/* ID of device containing file [uncond] */
156+
__u32 stx_dev_major;
124157
__u32 stx_dev_minor;
158+
125159
/* 0x90 */
126160
__u64 stx_mnt_id;
127-
__u32 stx_dio_mem_align; /* Memory buffer alignment for direct I/O */
128-
__u32 stx_dio_offset_align; /* File offset alignment for direct I/O */
161+
162+
/* Memory buffer alignment for direct I/O */
163+
__u32 stx_dio_mem_align;
164+
165+
/* File offset alignment for direct I/O */
166+
__u32 stx_dio_offset_align;
167+
129168
/* 0xa0 */
130-
__u64 stx_subvol; /* Subvolume identifier */
131-
__u32 stx_atomic_write_unit_min; /* Min atomic write unit in bytes */
132-
__u32 stx_atomic_write_unit_max; /* Max atomic write unit in bytes */
169+
/* Subvolume identifier */
170+
__u64 stx_subvol;
171+
172+
/* Min atomic write unit in bytes */
173+
__u32 stx_atomic_write_unit_min;
174+
175+
/* Max atomic write unit in bytes */
176+
__u32 stx_atomic_write_unit_max;
177+
133178
/* 0xb0 */
134-
__u32 stx_atomic_write_segments_max; /* Max atomic write segment count */
135-
__u32 __spare1[1];
179+
/* Max atomic write segment count */
180+
__u32 stx_atomic_write_segments_max;
181+
182+
/* File offset alignment for direct I/O reads */
183+
__u32 stx_dio_read_offset_align;
184+
136185
/* 0xb8 */
137186
__u64 __spare3[9]; /* Spare space for future expansion */
187+
138188
/* 0x100 */
139189
};
140190

@@ -164,6 +214,7 @@ struct statx {
164214
#define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */
165215
#define STATX_SUBVOL 0x00008000U /* Want/got stx_subvol */
166216
#define STATX_WRITE_ATOMIC 0x00010000U /* Want/got atomic_write_* fields */
217+
#define STATX_DIO_READ_ALIGN 0x00020000U /* Want/got dio read alignment info */
167218

168219
#define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */
169220

0 commit comments

Comments
 (0)