Skip to content

Commit cc4b2dd

Browse files
committed
erofs: fix infinite loop due to a race of filling compressed_bvecs

I encountered a race issue after lengthy (~594647 secs) stress tests on
a 64k-page arm64 VM with several 4k-block EROFS images. The timing is
like below:

  z_erofs_try_inplace_io          z_erofs_fill_bio_vec
   cmpxchg(&compressed_bvecs[].page,
           NULL, ..)
                                    [access bufvec]
   compressed_bvecs[] = *bvec;

Previously, z_erofs_submit_queue() just accessed bufvec->page only, so
other fields in bufvec didn't matter. After the subpage block support is
landed, .offset and .end can be used too, but filling bufvec isn't an
atomic operation which can cause inconsistency.

Let's use a spinlock to keep the atomicity of each bufvec. More
specifically, just reuse the existing spinlock `pcl->obj.lockref.lock`
since it's rarely used (also it takes a short time if even used) as long
as the pcluster has a reference.

Fixes: 1923516 ("erofs: support I/O submission for sub-page compressed blocks")
Signed-off-by: Gao Xiang <[email protected]>
Reviewed-by: Yue Hu <[email protected]>
Reviewed-by: Sandeep Dhavale <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
1 parent 97cf5d5 commit cc4b2dd

File tree

1 file changed

+38
-36
lines changed

1 file changed

+38
-36
lines changed

fs/erofs/zdata.c

Lines changed: 38 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -563,21 +563,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
563563
__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
564564
unsigned int i;
565565

566-
if (i_blocksize(fe->inode) != PAGE_SIZE)
567-
return;
568-
if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
566+
if (i_blocksize(fe->inode) != PAGE_SIZE ||
567+
fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
569568
return;
570569

571570
for (i = 0; i < pclusterpages; ++i) {
572571
struct page *page, *newpage;
573572
void *t; /* mark pages just found for debugging */
574573

575-
/* the compressed page was loaded before */
574+
/* Inaccurate check w/o locking to avoid unneeded lookups */
576575
if (READ_ONCE(pcl->compressed_bvecs[i].page))
577576
continue;
578577

579578
page = find_get_page(mc, pcl->obj.index + i);
580-
581579
if (page) {
582580
t = (void *)((unsigned long)page | 1);
583581
newpage = NULL;
@@ -597,9 +595,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
597595
set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
598596
t = (void *)((unsigned long)newpage | 1);
599597
}
600-
601-
if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t))
598+
spin_lock(&pcl->obj.lockref.lock);
599+
if (!pcl->compressed_bvecs[i].page) {
600+
pcl->compressed_bvecs[i].page = t;
601+
spin_unlock(&pcl->obj.lockref.lock);
602602
continue;
603+
}
604+
spin_unlock(&pcl->obj.lockref.lock);
603605

604606
if (page)
605607
put_page(page);
@@ -718,31 +720,25 @@ int erofs_init_managed_cache(struct super_block *sb)
718720
return 0;
719721
}
720722

721-
static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
722-
struct z_erofs_bvec *bvec)
723-
{
724-
struct z_erofs_pcluster *const pcl = fe->pcl;
725-
726-
while (fe->icur > 0) {
727-
if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
728-
NULL, bvec->page)) {
729-
pcl->compressed_bvecs[fe->icur] = *bvec;
730-
return true;
731-
}
732-
}
733-
return false;
734-
}
735-
736723
/* callers must be with pcluster lock held */
737724
static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
738725
struct z_erofs_bvec *bvec, bool exclusive)
739726
{
727+
struct z_erofs_pcluster *pcl = fe->pcl;
740728
int ret;
741729

742730
if (exclusive) {
743731
/* give priority for inplaceio to use file pages first */
744-
if (z_erofs_try_inplace_io(fe, bvec))
732+
spin_lock(&pcl->obj.lockref.lock);
733+
while (fe->icur > 0) {
734+
if (pcl->compressed_bvecs[--fe->icur].page)
735+
continue;
736+
pcl->compressed_bvecs[fe->icur] = *bvec;
737+
spin_unlock(&pcl->obj.lockref.lock);
745738
return 0;
739+
}
740+
spin_unlock(&pcl->obj.lockref.lock);
741+
746742
/* otherwise, check if it can be used as a bvpage */
747743
if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
748744
!fe->candidate_bvpage)
@@ -1423,23 +1419,26 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
14231419
{
14241420
gfp_t gfp = mapping_gfp_mask(mc);
14251421
bool tocache = false;
1426-
struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr;
1422+
struct z_erofs_bvec zbv;
14271423
struct address_space *mapping;
1428-
struct page *page, *oldpage;
1424+
struct page *page;
14291425
int justfound, bs = i_blocksize(f->inode);
14301426

14311427
/* Except for inplace pages, the entire page can be used for I/Os */
14321428
bvec->bv_offset = 0;
14331429
bvec->bv_len = PAGE_SIZE;
14341430
repeat:
1435-
oldpage = READ_ONCE(zbv->page);
1436-
if (!oldpage)
1431+
spin_lock(&pcl->obj.lockref.lock);
1432+
zbv = pcl->compressed_bvecs[nr];
1433+
page = zbv.page;
1434+
justfound = (unsigned long)page & 1UL;
1435+
page = (struct page *)((unsigned long)page & ~1UL);
1436+
pcl->compressed_bvecs[nr].page = page;
1437+
spin_unlock(&pcl->obj.lockref.lock);
1438+
if (!page)
14371439
goto out_allocpage;
14381440

1439-
justfound = (unsigned long)oldpage & 1UL;
1440-
page = (struct page *)((unsigned long)oldpage & ~1UL);
14411441
bvec->bv_page = page;
1442-
14431442
DBG_BUGON(z_erofs_is_shortlived_page(page));
14441443
/*
14451444
* Handle preallocated cached pages. We tried to allocate such pages
@@ -1448,7 +1447,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
14481447
*/
14491448
if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
14501449
set_page_private(page, 0);
1451-
WRITE_ONCE(zbv->page, page);
14521450
tocache = true;
14531451
goto out_tocache;
14541452
}
@@ -1459,9 +1457,9 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
14591457
* therefore it is impossible for `mapping` to be NULL.
14601458
*/
14611459
if (mapping && mapping != mc) {
1462-
if (zbv->offset < 0)
1463-
bvec->bv_offset = round_up(-zbv->offset, bs);
1464-
bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset;
1460+
if (zbv.offset < 0)
1461+
bvec->bv_offset = round_up(-zbv.offset, bs);
1462+
bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset;
14651463
return;
14661464
}
14671465

@@ -1471,7 +1469,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
14711469

14721470
/* the cached page is still in managed cache */
14731471
if (page->mapping == mc) {
1474-
WRITE_ONCE(zbv->page, page);
14751472
/*
14761473
* The cached page is still available but without a valid
14771474
* `->private` pcluster hint. Let's reconnect them.
@@ -1503,11 +1500,15 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
15031500
put_page(page);
15041501
out_allocpage:
15051502
page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
1506-
if (oldpage != cmpxchg(&zbv->page, oldpage, page)) {
1503+
spin_lock(&pcl->obj.lockref.lock);
1504+
if (pcl->compressed_bvecs[nr].page) {
15071505
erofs_pagepool_add(&f->pagepool, page);
1506+
spin_unlock(&pcl->obj.lockref.lock);
15081507
cond_resched();
15091508
goto repeat;
15101509
}
1510+
pcl->compressed_bvecs[nr].page = page;
1511+
spin_unlock(&pcl->obj.lockref.lock);
15111512
bvec->bv_page = page;
15121513
out_tocache:
15131514
if (!tocache || bs != PAGE_SIZE ||
@@ -1685,6 +1686,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
16851686

16861687
if (cur + bvec.bv_len > end)
16871688
bvec.bv_len = end - cur;
1689+
DBG_BUGON(bvec.bv_len < sb->s_blocksize);
16881690
if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
16891691
bvec.bv_offset))
16901692
goto submit_bio_retry;

0 commit comments

Comments
 (0)