Skip to content

Commit d42bd17

Browse files
author
Darrick J. Wong
committed
Merge tag 'large-folio-writes' of git://git.infradead.org/users/willy/pagecache into iomap-6.6-merge
Create large folios in iomap buffered write path

Commit ebb7fb1 limited the length of ioend chains to 4096 entries to
improve worst-case latency. Unfortunately, this had the effect of limiting
the performance of:

fio -name write-bandwidth -rw=write -bs=1024Ki -size=32Gi -runtime=30 \
    -iodepth 1 -ioengine sync -zero_buffers=1 -direct=0 -end_fsync=1 \
    -numjobs=4 -directory=/mnt/test

https://lore.kernel.org/linux-xfs/[email protected]/

The problem ends up being lock contention on the i_pages spinlock as we
clear the writeback bit on each folio (and propagate that up through
the tree). By using larger folios, we decrease the number of folios to
be processed by a factor of 256 for this benchmark, eliminating the
lock contention.

Creating large folios in the buffered write path is also the right
thing to do. It's a project that has been on the back burner for years,
it just hasn't been important enough to do before now.

* tag 'large-folio-writes' of git://git.infradead.org/users/willy/pagecache:
  iomap: Copy larger chunks from userspace
  iomap: Create large folios in the buffered write path
  filemap: Allow __filemap_get_folio to allocate large folios
  filemap: Add fgf_t typedef
  iomap: Remove unnecessary test from iomap_release_folio()
  doc: Correct the description of ->release_folio
  iomap: Remove large folio handling in iomap_invalidate_folio()
  iov_iter: Add copy_folio_from_iter_atomic()
  iov_iter: Handle compound highmem pages in copy_page_from_iter_atomic()
  iov_iter: Map the page later in copy_page_from_iter_atomic()

[djwong: yay amortizations!]

Signed-off-by: Darrick J. Wong <[email protected]>
2 parents 6eaae19 + 5d8edfb commit d42bd17

File tree

13 files changed

+187
-110
lines changed

Documentation/filesystems/locking.rst

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -374,10 +374,17 @@ invalidate_lock before invalidating page cache in truncate / hole punch
374374
path (and thus calling into ->invalidate_folio) to block races between page
375375
cache invalidation and page cache filling functions (fault, read, ...).
376376

377-
->release_folio() is called when the kernel is about to try to drop the
378-
buffers from the folio in preparation for freeing it. It returns false to
379-
indicate that the buffers are (or may be) freeable. If ->release_folio is
380-
NULL, the kernel assumes that the fs has no private interest in the buffers.
377+
->release_folio() is called when the MM wants to make a change to the
378+
folio that would invalidate the filesystem's private data. For example,
379+
it may be about to be removed from the address_space or split. The folio
380+
is locked and not under writeback. It may be dirty. The gfp parameter
381+
is not usually used for allocation, but rather to indicate what the
382+
filesystem may do to attempt to free the private data. The filesystem may
383+
return false to indicate that the folio's private data cannot be freed.
384+
If it returns true, it should have already removed the private data from
385+
the folio. If a filesystem does not provide a ->release_folio method,
386+
the pagecache will assume that private data is buffer_heads and call
387+
try_to_free_buffers().
381388

382389
->free_folio() is called when the kernel has dropped the folio
383390
from the page cache.

fs/btrfs/file.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -876,9 +876,9 @@ static int prepare_uptodate_page(struct inode *inode,
876876
return 0;
877877
}
878878

879-
static unsigned int get_prepare_fgp_flags(bool nowait)
879+
static fgf_t get_prepare_fgp_flags(bool nowait)
880880
{
881-
unsigned int fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
881+
fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
882882

883883
if (nowait)
884884
fgp_flags |= FGP_NOWAIT;
@@ -910,7 +910,7 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages,
910910
int i;
911911
unsigned long index = pos >> PAGE_SHIFT;
912912
gfp_t mask = get_prepare_gfp_flags(inode, nowait);
913-
unsigned int fgp_flags = get_prepare_fgp_flags(nowait);
913+
fgf_t fgp_flags = get_prepare_fgp_flags(nowait);
914914
int err = 0;
915915
int faili;
916916

fs/f2fs/compress.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1045,7 +1045,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc,
10451045
struct address_space *mapping = cc->inode->i_mapping;
10461046
struct page *page;
10471047
sector_t last_block_in_bio;
1048-
unsigned fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT;
1048+
fgf_t fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT;
10491049
pgoff_t start_idx = start_idx_of_cluster(cc);
10501050
int i, ret;
10511051

fs/f2fs/f2fs.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2736,7 +2736,7 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping,
27362736

27372737
static inline struct page *f2fs_pagecache_get_page(
27382738
struct address_space *mapping, pgoff_t index,
2739-
int fgp_flags, gfp_t gfp_mask)
2739+
fgf_t fgp_flags, gfp_t gfp_mask)
27402740
{
27412741
if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET))
27422742
return NULL;

fs/gfs2/bmap.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -971,7 +971,7 @@ gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len)
971971
if (status)
972972
return ERR_PTR(status);
973973

974-
folio = iomap_get_folio(iter, pos);
974+
folio = iomap_get_folio(iter, pos, len);
975975
if (IS_ERR(folio))
976976
gfs2_trans_end(sdp);
977977
return folio;

fs/iomap/buffered-io.c

Lines changed: 26 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -461,16 +461,18 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
461461
* iomap_get_folio - get a folio reference for writing
462462
* @iter: iteration structure
463463
* @pos: start offset of write
464+
* @len: Suggested size of folio to create.
464465
*
465466
* Returns a locked reference to the folio at @pos, or an error pointer if the
466467
* folio could not be obtained.
467468
*/
468-
struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
469+
struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
469470
{
470-
unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
471+
fgf_t fgp = FGP_WRITEBEGIN | FGP_NOFS;
471472

472473
if (iter->flags & IOMAP_NOWAIT)
473474
fgp |= FGP_NOWAIT;
475+
fgp |= fgf_set_order(len);
474476

475477
return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
476478
fgp, mapping_gfp_mask(iter->inode->i_mapping));
@@ -483,12 +485,11 @@ bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags)
483485
folio_size(folio));
484486

485487
/*
486-
* mm accommodates an old ext3 case where clean folios might
487-
* not have had the dirty bit cleared. Thus, it can send actual
488-
* dirty folios to ->release_folio() via shrink_active_list();
489-
* skip those here.
488+
* If the folio is dirty, we refuse to release our metadata because
489+
* it may be partially dirty. Once we track per-block dirty state,
490+
* we can release the metadata if every block is dirty.
490491
*/
491-
if (folio_test_dirty(folio) || folio_test_writeback(folio))
492+
if (folio_test_dirty(folio))
492493
return false;
493494
iomap_page_release(folio);
494495
return true;
@@ -508,11 +509,6 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
508509
WARN_ON_ONCE(folio_test_writeback(folio));
509510
folio_cancel_dirty(folio);
510511
iomap_page_release(folio);
511-
} else if (folio_test_large(folio)) {
512-
/* Must release the iop so the page can be split */
513-
WARN_ON_ONCE(!folio_test_uptodate(folio) &&
514-
folio_test_dirty(folio));
515-
iomap_page_release(folio);
516512
}
517513
}
518514
EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
@@ -603,7 +599,7 @@ static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos,
603599
if (folio_ops && folio_ops->get_folio)
604600
return folio_ops->get_folio(iter, pos, len);
605601
else
606-
return iomap_get_folio(iter, pos);
602+
return iomap_get_folio(iter, pos, len);
607603
}
608604

609605
static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret,
@@ -773,6 +769,7 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
773769
static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
774770
{
775771
loff_t length = iomap_length(iter);
772+
size_t chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
776773
loff_t pos = iter->pos;
777774
ssize_t written = 0;
778775
long status = 0;
@@ -781,15 +778,12 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
781778

782779
do {
783780
struct folio *folio;
784-
struct page *page;
785-
unsigned long offset; /* Offset into pagecache page */
786-
unsigned long bytes; /* Bytes to write to page */
781+
size_t offset; /* Offset into folio */
782+
size_t bytes; /* Bytes to write to folio */
787783
size_t copied; /* Bytes copied from user */
788784

789-
offset = offset_in_page(pos);
790-
bytes = min_t(unsigned long, PAGE_SIZE - offset,
791-
iov_iter_count(i));
792-
again:
785+
offset = pos & (chunk - 1);
786+
bytes = min(chunk - offset, iov_iter_count(i));
793787
status = balance_dirty_pages_ratelimited_flags(mapping,
794788
bdp_flags);
795789
if (unlikely(status))
@@ -819,12 +813,14 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
819813
if (iter->iomap.flags & IOMAP_F_STALE)
820814
break;
821815

822-
page = folio_file_page(folio, pos >> PAGE_SHIFT);
823-
if (mapping_writably_mapped(mapping))
824-
flush_dcache_page(page);
816+
offset = offset_in_folio(folio, pos);
817+
if (bytes > folio_size(folio) - offset)
818+
bytes = folio_size(folio) - offset;
825819

826-
copied = copy_page_from_iter_atomic(page, offset, bytes, i);
820+
if (mapping_writably_mapped(mapping))
821+
flush_dcache_folio(folio);
827822

823+
copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
828824
status = iomap_write_end(iter, pos, bytes, copied, folio);
829825

830826
if (unlikely(copied != status))
@@ -840,11 +836,13 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
840836
*/
841837
if (copied)
842838
bytes = copied;
843-
goto again;
839+
if (chunk > PAGE_SIZE)
840+
chunk /= 2;
841+
} else {
842+
pos += status;
843+
written += status;
844+
length -= status;
844845
}
845-
pos += status;
846-
written += status;
847-
length -= status;
848846
} while (iov_iter_count(i) && length);
849847

850848
if (status == -EAGAIN) {

include/linux/iomap.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -261,7 +261,7 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
261261
int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
262262
void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
263263
bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
264-
struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos);
264+
struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len);
265265
bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
266266
void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
267267
int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,

include/linux/pagemap.h

Lines changed: 71 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -470,6 +470,19 @@ static inline void *detach_page_private(struct page *page)
470470
return folio_detach_private(page_folio(page));
471471
}
472472

473+
/*
474+
* There are some parts of the kernel which assume that PMD entries
475+
* are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then,
476+
* limit the maximum allocation order to PMD size. I'm not aware of any
477+
* assumptions about maximum order if THP are disabled, but 8 seems like
478+
* a good order (that's 1MB if you're using 4kB pages)
479+
*/
480+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
481+
#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER
482+
#else
483+
#define MAX_PAGECACHE_ORDER 8
484+
#endif
485+
473486
#ifdef CONFIG_NUMA
474487
struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order);
475488
#else
@@ -501,22 +514,69 @@ pgoff_t page_cache_next_miss(struct address_space *mapping,
501514
pgoff_t page_cache_prev_miss(struct address_space *mapping,
502515
pgoff_t index, unsigned long max_scan);
503516

504-
#define FGP_ACCESSED 0x00000001
505-
#define FGP_LOCK 0x00000002
506-
#define FGP_CREAT 0x00000004
507-
#define FGP_WRITE 0x00000008
508-
#define FGP_NOFS 0x00000010
509-
#define FGP_NOWAIT 0x00000020
510-
#define FGP_FOR_MMAP 0x00000040
511-
#define FGP_STABLE 0x00000080
517+
/**
518+
* typedef fgf_t - Flags for getting folios from the page cache.
519+
*
520+
* Most users of the page cache will not need to use these flags;
521+
* there are convenience functions such as filemap_get_folio() and
522+
* filemap_lock_folio(). For users which need more control over exactly
523+
* what is done with the folios, these flags to __filemap_get_folio()
524+
* are available.
525+
*
526+
* * %FGP_ACCESSED - The folio will be marked accessed.
527+
* * %FGP_LOCK - The folio is returned locked.
528+
* * %FGP_CREAT - If no folio is present then a new folio is allocated,
529+
* added to the page cache and the VM's LRU list. The folio is
530+
* returned locked.
531+
* * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
532+
* folio is already in cache. If the folio was allocated, unlock it
533+
* before returning so the caller can do the same dance.
534+
* * %FGP_WRITE - The folio will be written to by the caller.
535+
* * %FGP_NOFS - __GFP_FS will get cleared in gfp.
536+
* * %FGP_NOWAIT - Don't block on the folio lock.
537+
* * %FGP_STABLE - Wait for the folio to be stable (finished writeback)
538+
* * %FGP_WRITEBEGIN - The flags to use in a filesystem write_begin()
539+
* implementation.
540+
*/
541+
typedef unsigned int __bitwise fgf_t;
542+
543+
#define FGP_ACCESSED ((__force fgf_t)0x00000001)
544+
#define FGP_LOCK ((__force fgf_t)0x00000002)
545+
#define FGP_CREAT ((__force fgf_t)0x00000004)
546+
#define FGP_WRITE ((__force fgf_t)0x00000008)
547+
#define FGP_NOFS ((__force fgf_t)0x00000010)
548+
#define FGP_NOWAIT ((__force fgf_t)0x00000020)
549+
#define FGP_FOR_MMAP ((__force fgf_t)0x00000040)
550+
#define FGP_STABLE ((__force fgf_t)0x00000080)
551+
#define FGF_GET_ORDER(fgf) (((__force unsigned)fgf) >> 26) /* top 6 bits */
512552

513553
#define FGP_WRITEBEGIN (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE)
514554

555+
/**
556+
* fgf_set_order - Encode a length in the fgf_t flags.
557+
* @size: The suggested size of the folio to create.
558+
*
559+
* The caller of __filemap_get_folio() can use this to suggest a preferred
560+
* size for the folio that is created. If there is already a folio at
561+
* the index, it will be returned, no matter what its size. If a folio
562+
* is freshly created, it may be of a different size than requested
563+
* due to alignment constraints, memory pressure, or the presence of
564+
* other folios at nearby indices.
565+
*/
566+
static inline fgf_t fgf_set_order(size_t size)
567+
{
568+
unsigned int shift = ilog2(size);
569+
570+
if (shift <= PAGE_SHIFT)
571+
return 0;
572+
return (__force fgf_t)((shift - PAGE_SHIFT) << 26);
573+
}
574+
515575
void *filemap_get_entry(struct address_space *mapping, pgoff_t index);
516576
struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
517-
int fgp_flags, gfp_t gfp);
577+
fgf_t fgp_flags, gfp_t gfp);
518578
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
519-
int fgp_flags, gfp_t gfp);
579+
fgf_t fgp_flags, gfp_t gfp);
520580

521581
/**
522582
* filemap_get_folio - Find and get a folio.
@@ -590,7 +650,7 @@ static inline struct page *find_get_page(struct address_space *mapping,
590650
}
591651

592652
static inline struct page *find_get_page_flags(struct address_space *mapping,
593-
pgoff_t offset, int fgp_flags)
653+
pgoff_t offset, fgf_t fgp_flags)
594654
{
595655
return pagecache_get_page(mapping, offset, fgp_flags, 0);
596656
}

include/linux/uio.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs)
163163
return ret;
164164
}
165165

166-
size_t copy_page_from_iter_atomic(struct page *page, unsigned offset,
166+
size_t copy_page_from_iter_atomic(struct page *page, size_t offset,
167167
size_t bytes, struct iov_iter *i);
168168
void iov_iter_advance(struct iov_iter *i, size_t bytes);
169169
void iov_iter_revert(struct iov_iter *i, size_t bytes);
@@ -184,6 +184,13 @@ static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset,
184184
{
185185
return copy_page_to_iter(&folio->page, offset, bytes, i);
186186
}
187+
188+
static inline size_t copy_folio_from_iter_atomic(struct folio *folio,
189+
size_t offset, size_t bytes, struct iov_iter *i)
190+
{
191+
return copy_page_from_iter_atomic(&folio->page, offset, bytes, i);
192+
}
193+
187194
size_t copy_page_to_iter_nofault(struct page *page, unsigned offset,
188195
size_t bytes, struct iov_iter *i);
189196

lib/iov_iter.c

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -566,24 +566,37 @@ size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
566566
}
567567
EXPORT_SYMBOL(iov_iter_zero);
568568

569-
size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes,
570-
struct iov_iter *i)
569+
size_t copy_page_from_iter_atomic(struct page *page, size_t offset,
570+
size_t bytes, struct iov_iter *i)
571571
{
572-
char *kaddr = kmap_atomic(page), *p = kaddr + offset;
573-
if (!page_copy_sane(page, offset, bytes)) {
574-
kunmap_atomic(kaddr);
572+
size_t n, copied = 0;
573+
574+
if (!page_copy_sane(page, offset, bytes))
575575
return 0;
576-
}
577-
if (WARN_ON_ONCE(!i->data_source)) {
578-
kunmap_atomic(kaddr);
576+
if (WARN_ON_ONCE(!i->data_source))
579577
return 0;
580-
}
581-
iterate_and_advance(i, bytes, base, len, off,
582-
copyin(p + off, base, len),
583-
memcpy_from_iter(i, p + off, base, len)
584-
)
585-
kunmap_atomic(kaddr);
586-
return bytes;
578+
579+
do {
580+
char *p;
581+
582+
n = bytes - copied;
583+
if (PageHighMem(page)) {
584+
page += offset / PAGE_SIZE;
585+
offset %= PAGE_SIZE;
586+
n = min_t(size_t, n, PAGE_SIZE - offset);
587+
}
588+
589+
p = kmap_atomic(page) + offset;
590+
iterate_and_advance(i, n, base, len, off,
591+
copyin(p + off, base, len),
592+
memcpy_from_iter(i, p + off, base, len)
593+
)
594+
kunmap_atomic(p);
595+
copied += n;
596+
offset += n;
597+
} while (PageHighMem(page) && copied != bytes && n > 0);
598+
599+
return copied;
587600
}
588601
EXPORT_SYMBOL(copy_page_from_iter_atomic);
589602

0 commit comments

Comments (0)