Skip to content

Commit 4ce02c6

Browse files
committed
iomap: Add per-block dirty state tracking to improve performance
When the filesystem blocksize is less than the folio size (either with mapping_large_folio_support() or with blocksize < pagesize) and when the folio is uptodate in the pagecache, then even a single-byte write can cause an entire folio to be written to disk during writeback. This happens because we currently don't have a mechanism to track per-block dirty state within struct iomap_folio_state. We currently only track uptodate state. This patch implements support for tracking per-block dirty state in the iomap_folio_state->state bitmap. This should help improve filesystem write performance and help reduce write amplification. Performance testing of the fio workload below reveals a ~16x performance improvement using nvme with XFS (4k blocksize) on Power (64K pagesize). FIO-reported write bandwidth scores improved from around ~28 MBps to ~452 MBps. 1. <test_randwrite.fio> [global] ioengine=psync rw=randwrite overwrite=1 pre_read=1 direct=0 bs=4k size=1G dir=./ numjobs=8 fdatasync=1 runtime=60 iodepth=64 group_reporting=1 [fio-run] 2. Also, our internal performance team reported that this patch improves their database workload performance by around ~83% (with XFS on Power). Reported-by: Aravinda Herle <[email protected]> Reported-by: Brian Foster <[email protected]> Signed-off-by: Ritesh Harjani (IBM) <[email protected]> Reviewed-by: Darrick J. Wong <[email protected]>
1 parent a01b8f2 commit 4ce02c6

File tree

5 files changed

+154
-14
lines changed

5 files changed

+154
-14
lines changed

fs/gfs2/aops.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -747,7 +747,7 @@ static const struct address_space_operations gfs2_aops = {
747747
.writepages = gfs2_writepages,
748748
.read_folio = gfs2_read_folio,
749749
.readahead = gfs2_readahead,
750-
.dirty_folio = filemap_dirty_folio,
750+
.dirty_folio = iomap_dirty_folio,
751751
.release_folio = iomap_release_folio,
752752
.invalidate_folio = iomap_invalidate_folio,
753753
.bmap = gfs2_bmap,

fs/iomap/buffered-io.c

Lines changed: 150 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,19 @@
2525

2626
typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length);
2727
/*
28-
* Structure allocated for each folio to track per-block uptodate state
28+
* Structure allocated for each folio to track per-block uptodate, dirty state
2929
* and I/O completions.
3030
*/
3131
struct iomap_folio_state {
3232
atomic_t read_bytes_pending;
3333
atomic_t write_bytes_pending;
3434
spinlock_t state_lock;
35+
36+
/*
37+
* Each block has two bits in this bitmap:
38+
* Bits [0..blocks_per_folio) has the uptodate status.
39+
* Bits [b_p_f...(2*b_p_f)) has the dirty status.
40+
*/
3541
unsigned long state[];
3642
};
3743

@@ -78,6 +84,61 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off,
7884
folio_mark_uptodate(folio);
7985
}
8086

87+
static inline bool ifs_block_is_dirty(struct folio *folio,
88+
struct iomap_folio_state *ifs, int block)
89+
{
90+
struct inode *inode = folio->mapping->host;
91+
unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
92+
93+
return test_bit(block + blks_per_folio, ifs->state);
94+
}
95+
96+
static void ifs_clear_range_dirty(struct folio *folio,
97+
struct iomap_folio_state *ifs, size_t off, size_t len)
98+
{
99+
struct inode *inode = folio->mapping->host;
100+
unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
101+
unsigned int first_blk = (off >> inode->i_blkbits);
102+
unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
103+
unsigned int nr_blks = last_blk - first_blk + 1;
104+
unsigned long flags;
105+
106+
spin_lock_irqsave(&ifs->state_lock, flags);
107+
bitmap_clear(ifs->state, first_blk + blks_per_folio, nr_blks);
108+
spin_unlock_irqrestore(&ifs->state_lock, flags);
109+
}
110+
111+
static void iomap_clear_range_dirty(struct folio *folio, size_t off, size_t len)
112+
{
113+
struct iomap_folio_state *ifs = folio->private;
114+
115+
if (ifs)
116+
ifs_clear_range_dirty(folio, ifs, off, len);
117+
}
118+
119+
static void ifs_set_range_dirty(struct folio *folio,
120+
struct iomap_folio_state *ifs, size_t off, size_t len)
121+
{
122+
struct inode *inode = folio->mapping->host;
123+
unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
124+
unsigned int first_blk = (off >> inode->i_blkbits);
125+
unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
126+
unsigned int nr_blks = last_blk - first_blk + 1;
127+
unsigned long flags;
128+
129+
spin_lock_irqsave(&ifs->state_lock, flags);
130+
bitmap_set(ifs->state, first_blk + blks_per_folio, nr_blks);
131+
spin_unlock_irqrestore(&ifs->state_lock, flags);
132+
}
133+
134+
static void iomap_set_range_dirty(struct folio *folio, size_t off, size_t len)
135+
{
136+
struct iomap_folio_state *ifs = folio->private;
137+
138+
if (ifs)
139+
ifs_set_range_dirty(folio, ifs, off, len);
140+
}
141+
81142
static struct iomap_folio_state *ifs_alloc(struct inode *inode,
82143
struct folio *folio, unsigned int flags)
83144
{
@@ -93,14 +154,24 @@ static struct iomap_folio_state *ifs_alloc(struct inode *inode,
93154
else
94155
gfp = GFP_NOFS | __GFP_NOFAIL;
95156

96-
ifs = kzalloc(struct_size(ifs, state, BITS_TO_LONGS(nr_blocks)),
97-
gfp);
98-
if (ifs) {
99-
spin_lock_init(&ifs->state_lock);
100-
if (folio_test_uptodate(folio))
101-
bitmap_fill(ifs->state, nr_blocks);
102-
folio_attach_private(folio, ifs);
103-
}
157+
/*
158+
* ifs->state tracks two sets of state flags when the
159+
* filesystem block size is smaller than the folio size.
160+
* The first state tracks per-block uptodate and the
161+
* second tracks per-block dirty state.
162+
*/
163+
ifs = kzalloc(struct_size(ifs, state,
164+
BITS_TO_LONGS(2 * nr_blocks)), gfp);
165+
if (!ifs)
166+
return ifs;
167+
168+
spin_lock_init(&ifs->state_lock);
169+
if (folio_test_uptodate(folio))
170+
bitmap_set(ifs->state, 0, nr_blocks);
171+
if (folio_test_dirty(folio))
172+
bitmap_set(ifs->state, nr_blocks, nr_blocks);
173+
folio_attach_private(folio, ifs);
174+
104175
return ifs;
105176
}
106177

@@ -519,6 +590,17 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
519590
}
520591
EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
521592

593+
bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio)
594+
{
595+
struct inode *inode = mapping->host;
596+
size_t len = folio_size(folio);
597+
598+
ifs_alloc(inode, folio, 0);
599+
iomap_set_range_dirty(folio, 0, len);
600+
return filemap_dirty_folio(mapping, folio);
601+
}
602+
EXPORT_SYMBOL_GPL(iomap_dirty_folio);
603+
522604
static void
523605
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
524606
{
@@ -723,6 +805,7 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
723805
if (unlikely(copied < len && !folio_test_uptodate(folio)))
724806
return 0;
725807
iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len);
808+
iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied);
726809
filemap_dirty_folio(inode->i_mapping, folio);
727810
return copied;
728811
}
@@ -892,6 +975,43 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
892975
}
893976
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
894977

978+
static int iomap_write_delalloc_ifs_punch(struct inode *inode,
979+
struct folio *folio, loff_t start_byte, loff_t end_byte,
980+
iomap_punch_t punch)
981+
{
982+
unsigned int first_blk, last_blk, i;
983+
loff_t last_byte;
984+
u8 blkbits = inode->i_blkbits;
985+
struct iomap_folio_state *ifs;
986+
int ret = 0;
987+
988+
/*
989+
* When we have per-block dirty tracking, there can be
990+
* blocks within a folio which are marked uptodate
991+
* but not dirty. In that case it is necessary to punch
992+
* out such blocks to avoid leaking any delalloc blocks.
993+
*/
994+
ifs = folio->private;
995+
if (!ifs)
996+
return ret;
997+
998+
last_byte = min_t(loff_t, end_byte - 1,
999+
folio_pos(folio) + folio_size(folio) - 1);
1000+
first_blk = offset_in_folio(folio, start_byte) >> blkbits;
1001+
last_blk = offset_in_folio(folio, last_byte) >> blkbits;
1002+
for (i = first_blk; i <= last_blk; i++) {
1003+
if (!ifs_block_is_dirty(folio, ifs, i)) {
1004+
ret = punch(inode, folio_pos(folio) + (i << blkbits),
1005+
1 << blkbits);
1006+
if (ret)
1007+
return ret;
1008+
}
1009+
}
1010+
1011+
return ret;
1012+
}
1013+
1014+
8951015
static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
8961016
loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
8971017
iomap_punch_t punch)
@@ -909,6 +1029,12 @@ static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
9091029
return ret;
9101030
}
9111031

1032+
/* Punch non-dirty blocks within folio */
1033+
ret = iomap_write_delalloc_ifs_punch(inode, folio, start_byte,
1034+
end_byte, punch);
1035+
if (ret)
1036+
return ret;
1037+
9121038
/*
9131039
* Make sure the next punch start is correctly bound to
9141040
* the end of this data range, not the end of the folio.
@@ -1639,14 +1765,21 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
16391765
struct writeback_control *wbc, struct inode *inode,
16401766
struct folio *folio, u64 end_pos)
16411767
{
1642-
struct iomap_folio_state *ifs = ifs_alloc(inode, folio, 0);
1768+
struct iomap_folio_state *ifs = folio->private;
16431769
struct iomap_ioend *ioend, *next;
16441770
unsigned len = i_blocksize(inode);
16451771
unsigned nblocks = i_blocks_per_folio(inode, folio);
16461772
u64 pos = folio_pos(folio);
16471773
int error = 0, count = 0, i;
16481774
LIST_HEAD(submit_list);
16491775

1776+
WARN_ON_ONCE(end_pos <= pos);
1777+
1778+
if (!ifs && nblocks > 1) {
1779+
ifs = ifs_alloc(inode, folio, 0);
1780+
iomap_set_range_dirty(folio, 0, end_pos - pos);
1781+
}
1782+
16501783
WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);
16511784

16521785
/*
@@ -1655,7 +1788,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
16551788
* invalid, grab a new one.
16561789
*/
16571790
for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
1658-
if (ifs && !ifs_block_is_uptodate(ifs, i))
1791+
if (ifs && !ifs_block_is_dirty(folio, ifs, i))
16591792
continue;
16601793

16611794
error = wpc->ops->map_blocks(wpc, inode, pos);
@@ -1699,6 +1832,12 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
16991832
}
17001833
}
17011834

1835+
/*
1836+
* We can have dirty bits set past end of file in page_mkwrite path
1837+
* while mapping the last partial folio. Hence it's better to clear
1838+
* all the dirty bits in the folio here.
1839+
*/
1840+
iomap_clear_range_dirty(folio, 0, folio_size(folio));
17021841
folio_start_writeback(folio);
17031842
folio_unlock(folio);
17041843

fs/xfs/xfs_aops.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -578,7 +578,7 @@ const struct address_space_operations xfs_address_space_operations = {
578578
.read_folio = xfs_vm_read_folio,
579579
.readahead = xfs_vm_readahead,
580580
.writepages = xfs_vm_writepages,
581-
.dirty_folio = filemap_dirty_folio,
581+
.dirty_folio = iomap_dirty_folio,
582582
.release_folio = iomap_release_folio,
583583
.invalidate_folio = iomap_invalidate_folio,
584584
.bmap = xfs_vm_bmap,

fs/zonefs/file.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ const struct address_space_operations zonefs_file_aops = {
175175
.read_folio = zonefs_read_folio,
176176
.readahead = zonefs_readahead,
177177
.writepages = zonefs_writepages,
178-
.dirty_folio = filemap_dirty_folio,
178+
.dirty_folio = iomap_dirty_folio,
179179
.release_folio = iomap_release_folio,
180180
.invalidate_folio = iomap_invalidate_folio,
181181
.migrate_folio = filemap_migrate_folio,

include/linux/iomap.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,7 @@ bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
264264
struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len);
265265
bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
266266
void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
267+
bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
267268
int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
268269
const struct iomap_ops *ops);
269270
int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,

0 commit comments

Comments
 (0)