Skip to content

Commit 967c28b

Browse files
committed
erofs: kill hooked chains to avoid loops on deduplicated compressed images
After heavily stressing EROFS with several images which include a hand-crafted image of repeated patterns for more than 46 days, I found two chains could be linked with each other almost simultaneously and form a loop so that the entire loop won't be submitted. As a consequence, the corresponding file pages will remain locked forever. It can be _only_ observed on data-deduplicated compressed images. For example, consider two chains with five pclusters in total: Chain 1: 2->3->4->5 -- The tail pcluster is 5; Chain 2: 5->1->2 -- The tail pcluster is 2. Chain 2 could link to Chain 1 with pcluster 5; and Chain 1 could link to Chain 2 at the same time with pcluster 2. Since hooked chains are all linked locklessly now, I have no idea how to simply avoid the race. Instead, let's avoid hooked chains completely until I could work out a proper way to fix this and end users finally tell us that it's needed to add it back. Actually, this optimization can be found with multi-threaded workloads (especially even more often on deduplicated compressed images), yet I'm not sure about the overall system impacts of not having this compared with implementation complexity. Fixes: 267f249 ("erofs: introduce multi-reference pclusters (fully-referenced)") Signed-off-by: Gao Xiang <[email protected]> Reviewed-by: Yue Hu <[email protected]> Link: https://lore.kernel.org/r/[email protected]
1 parent 6ab5eed commit 967c28b

File tree

1 file changed

+11
-61
lines changed

1 file changed

+11
-61
lines changed

fs/erofs/zdata.c

Lines changed: 11 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -93,11 +93,8 @@ struct z_erofs_pcluster {
9393

9494
/* let's avoid the valid 32-bit kernel addresses */
9595

96-
/* the chained workgroup has't submitted io (still open) */
96+
/* the end of a chain of pclusters */
9797
#define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE)
98-
/* the chained workgroup has already submitted io */
99-
#define Z_EROFS_PCLUSTER_TAIL_CLOSED ((void *)0x5F0EDEAD)
100-
10198
#define Z_EROFS_PCLUSTER_NIL (NULL)
10299

103100
struct z_erofs_decompressqueue {
@@ -504,20 +501,6 @@ int __init z_erofs_init_zip_subsystem(void)
504501

505502
enum z_erofs_pclustermode {
506503
Z_EROFS_PCLUSTER_INFLIGHT,
507-
/*
508-
* The current pclusters was the tail of an exist chain, in addition
509-
* that the previous processed chained pclusters are all decided to
510-
* be hooked up to it.
511-
* A new chain will be created for the remaining pclusters which are
512-
* not processed yet, so different from Z_EROFS_PCLUSTER_FOLLOWED,
513-
* the next pcluster cannot reuse the whole page safely for inplace I/O
514-
* in the following scenario:
515-
* ________________________________________________________________
516-
* | tail (partial) page | head (partial) page |
517-
* | (belongs to the next pcl) | (belongs to the current pcl) |
518-
* |_______PCLUSTER_FOLLOWED______|________PCLUSTER_HOOKED__________|
519-
*/
520-
Z_EROFS_PCLUSTER_HOOKED,
521504
/*
522505
* a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it
523506
* could be dispatched into bypass queue later due to uptodated managed
@@ -535,8 +518,8 @@ enum z_erofs_pclustermode {
535518
* ________________________________________________________________
536519
* | tail (partial) page | head (partial) page |
537520
* | (of the current cl) | (of the previous collection) |
538-
* | PCLUSTER_FOLLOWED or | |
539-
* |_____PCLUSTER_HOOKED__|___________PCLUSTER_FOLLOWED____________|
521+
* | | |
522+
* |__PCLUSTER_FOLLOWED___|___________PCLUSTER_FOLLOWED____________|
540523
*
541524
* [ (*) the above page can be used as inplace I/O. ]
542525
*/
@@ -550,7 +533,7 @@ struct z_erofs_decompress_frontend {
550533

551534
struct page *pagepool;
552535
struct page *candidate_bvpage;
553-
struct z_erofs_pcluster *pcl, *tailpcl;
536+
struct z_erofs_pcluster *pcl;
554537
z_erofs_next_pcluster_t owned_head;
555538
enum z_erofs_pclustermode mode;
556539

@@ -755,19 +738,7 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
755738
return;
756739
}
757740

758-
/*
759-
* type 2, link to the end of an existing open chain, be careful
760-
* that its submission is controlled by the original attached chain.
761-
*/
762-
if (*owned_head != &pcl->next && pcl != f->tailpcl &&
763-
cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
764-
*owned_head) == Z_EROFS_PCLUSTER_TAIL) {
765-
*owned_head = Z_EROFS_PCLUSTER_TAIL;
766-
f->mode = Z_EROFS_PCLUSTER_HOOKED;
767-
f->tailpcl = NULL;
768-
return;
769-
}
770-
/* type 3, it belongs to a chain, but it isn't the end of the chain */
741+
/* type 2, it belongs to an ongoing chain */
771742
f->mode = Z_EROFS_PCLUSTER_INFLIGHT;
772743
}
773744

@@ -828,9 +799,6 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
828799
goto err_out;
829800
}
830801
}
831-
/* used to check tail merging loop due to corrupted images */
832-
if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
833-
fe->tailpcl = pcl;
834802
fe->owned_head = &pcl->next;
835803
fe->pcl = pcl;
836804
return 0;
@@ -851,7 +819,6 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
851819

852820
/* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */
853821
DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
854-
DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
855822

856823
if (!(map->m_flags & EROFS_MAP_META)) {
857824
grp = erofs_find_workgroup(fe->inode->i_sb,
@@ -870,10 +837,6 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
870837

871838
if (ret == -EEXIST) {
872839
mutex_lock(&fe->pcl->lock);
873-
/* used to check tail merging loop due to corrupted images */
874-
if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
875-
fe->tailpcl = fe->pcl;
876-
877840
z_erofs_try_to_claim_pcluster(fe);
878841
} else if (ret) {
879842
return ret;
@@ -1028,8 +991,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
1028991
* those chains are handled asynchronously thus the page cannot be used
1029992
* for inplace I/O or bvpage (should be processed in a strict order.)
1030993
*/
1031-
tight &= (fe->mode >= Z_EROFS_PCLUSTER_HOOKED &&
1032-
fe->mode != Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
994+
tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
1033995

1034996
cur = end - min_t(unsigned int, offset + end - map->m_la, end);
1035997
if (!(map->m_flags & EROFS_MAP_MAPPED)) {
@@ -1398,10 +1360,7 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
13981360
};
13991361
z_erofs_next_pcluster_t owned = io->head;
14001362

1401-
while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) {
1402-
/* impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */
1403-
DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL);
1404-
/* impossible that 'owned' equals Z_EROFS_PCLUSTER_NIL */
1363+
while (owned != Z_EROFS_PCLUSTER_TAIL) {
14051364
DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);
14061365

14071366
be.pcl = container_of(owned, struct z_erofs_pcluster, next);
@@ -1418,7 +1377,7 @@ static void z_erofs_decompressqueue_work(struct work_struct *work)
14181377
container_of(work, struct z_erofs_decompressqueue, u.work);
14191378
struct page *pagepool = NULL;
14201379

1421-
DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
1380+
DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL);
14221381
z_erofs_decompress_queue(bgq, &pagepool);
14231382
erofs_release_pages(&pagepool);
14241383
kvfree(bgq);
@@ -1606,7 +1565,7 @@ static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb,
16061565
q->sync = true;
16071566
}
16081567
q->sb = sb;
1609-
q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
1568+
q->head = Z_EROFS_PCLUSTER_TAIL;
16101569
return q;
16111570
}
16121571

@@ -1624,11 +1583,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
16241583
z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT];
16251584
z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS];
16261585

1627-
DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
1628-
if (owned_head == Z_EROFS_PCLUSTER_TAIL)
1629-
owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
1630-
1631-
WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED);
1586+
WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL);
16321587

16331588
WRITE_ONCE(*submit_qtail, owned_head);
16341589
WRITE_ONCE(*bypass_qtail, &pcl->next);
@@ -1698,15 +1653,10 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
16981653
unsigned int i = 0;
16991654
bool bypass = true;
17001655

1701-
/* no possible 'owned_head' equals the following */
1702-
DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
17031656
DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);
1704-
17051657
pcl = container_of(owned_head, struct z_erofs_pcluster, next);
1658+
owned_head = READ_ONCE(pcl->next);
17061659

1707-
/* close the main owned chain at first */
1708-
owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
1709-
Z_EROFS_PCLUSTER_TAIL_CLOSED);
17101660
if (z_erofs_is_inline_pcluster(pcl)) {
17111661
move_to_bypass_jobqueue(pcl, qtail, owned_head);
17121662
continue;

0 commit comments

Comments
 (0)