Skip to content

Commit bd22553

Browse files
Yu Zhao authored and akpm00 committed
mm/hugetlb_vmemmap: fix race with speculative PFN walkers
While investigating HVO for THPs [1], it turns out that speculative PFN walkers like compaction can race with vmemmap modifications, e.g., CPU 1 (vmemmap modifier) CPU 2 (speculative PFN walker) ------------------------------- ------------------------------ Allocates an LRU folio page1 Sees page1 Frees page1 Allocates a hugeTLB folio page2 (page1 being a tail of page2) Updates vmemmap mapping page1 get_page_unless_zero(page1) Even though page1->_refcount is zero after HVO, get_page_unless_zero() can still try to modify this read-only field, resulting in a crash. An independent report [2] confirmed this race. There are two discussed approaches to fix this race: 1. Make RO vmemmap RW so that get_page_unless_zero() can fail without triggering a PF. 2. Use RCU to make sure get_page_unless_zero() either sees zero page->_refcount through the old vmemmap or non-zero page->_refcount through the new one. The second approach is preferred here because: 1. It can prevent illegal modifications to struct page[] that has been HVO'ed; 2. It can be generalized, in a way similar to ZERO_PAGE(), to fix similar races in other places, e.g., arch_remove_memory() on x86 [3], which frees vmemmap mapping offlined struct page[]. While adding synchronize_rcu(), the goal is to be surgical, rather than optimized. Specifically, calls to synchronize_rcu() on the error handling paths can be coalesced, but it is not done for the sake of simplicity: noticeably, this fix removes ~50% more lines than it adds. According to the hugetlb_optimize_vmemmap section in Documentation/admin-guide/sysctl/vm.rst, enabling HVO makes allocating or freeing hugeTLB pages "~2x slower than before". Having synchronize_rcu() on top makes those operations even worse, and this also affects the user interface /proc/sys/vm/nr_overcommit_hugepages. This is *very* hard to trigger: 1. Most hugeTLB use cases I know of are static, i.e., reserved at boot time, because allocating at runtime is not reliable at all. 2. 
On top of that, someone has to be very unlucky to get tripped over above, because the race window is so small -- I wasn't able to trigger it with a stress testing that does nothing but that (with THPs though). [1] https://lore.kernel.org/[email protected]/ [2] https://lore.kernel.org/[email protected]/ [3] https://lore.kernel.org/[email protected]/ Link: https://lkml.kernel.org/r/[email protected] Signed-off-by: Yu Zhao <[email protected]> Acked-by: Muchun Song <[email protected]> Cc: David Hildenbrand <[email protected]> Cc: Frank van der Linden <[email protected]> Cc: Matthew Wilcox (Oracle) <[email protected]> Cc: Peter Xu <[email protected]> Cc: Yang Shi <[email protected]> Cc: Yu Zhao <[email protected]> Signed-off-by: Andrew Morton <[email protected]>
1 parent 5a4d894 commit bd22553

File tree

3 files changed

+30
-47
lines changed

3 files changed

+30
-47
lines changed

include/linux/page_ref.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,13 @@ static inline int folio_ref_dec_return(struct folio *folio)
230230

231231
static inline bool page_ref_add_unless(struct page *page, int nr, int u)
232232
{
233-
bool ret = atomic_add_unless(&page->_refcount, nr, u);
233+
bool ret = false;
234+
235+
rcu_read_lock();
236+
/* avoid writing to the vmemmap area being remapped */
237+
if (!page_is_fake_head(page) && page_ref_count(page) != u)
238+
ret = atomic_add_unless(&page->_refcount, nr, u);
239+
rcu_read_unlock();
234240

235241
if (page_ref_tracepoint_active(page_ref_mod_unless))
236242
__page_ref_mod_unless(page, nr, ret);

mm/hugetlb.c

Lines changed: 7 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1625,13 +1625,10 @@ static inline void destroy_compound_gigantic_folio(struct folio *folio,
16251625
* folio appears as just a compound page. Otherwise, wait until after
16261626
* allocating vmemmap to clear the flag.
16271627
*
1628-
* A reference is held on the folio, except in the case of demote.
1629-
*
16301628
* Must be called with hugetlb lock held.
16311629
*/
1632-
static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio,
1633-
bool adjust_surplus,
1634-
bool demote)
1630+
static void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
1631+
bool adjust_surplus)
16351632
{
16361633
int nid = folio_nid(folio);
16371634

@@ -1645,6 +1642,7 @@ static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio,
16451642
list_del(&folio->lru);
16461643

16471644
if (folio_test_hugetlb_freed(folio)) {
1645+
folio_clear_hugetlb_freed(folio);
16481646
h->free_huge_pages--;
16491647
h->free_huge_pages_node[nid]--;
16501648
}
@@ -1661,33 +1659,13 @@ static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio,
16611659
if (!folio_test_hugetlb_vmemmap_optimized(folio))
16621660
__folio_clear_hugetlb(folio);
16631661

1664-
/*
1665-
* In the case of demote we do not ref count the page as it will soon
1666-
* be turned into a page of smaller size.
1667-
*/
1668-
if (!demote)
1669-
folio_ref_unfreeze(folio, 1);
1670-
16711662
h->nr_huge_pages--;
16721663
h->nr_huge_pages_node[nid]--;
16731664
}
16741665

1675-
static void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
1676-
bool adjust_surplus)
1677-
{
1678-
__remove_hugetlb_folio(h, folio, adjust_surplus, false);
1679-
}
1680-
1681-
static void remove_hugetlb_folio_for_demote(struct hstate *h, struct folio *folio,
1682-
bool adjust_surplus)
1683-
{
1684-
__remove_hugetlb_folio(h, folio, adjust_surplus, true);
1685-
}
1686-
16871666
static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
16881667
bool adjust_surplus)
16891668
{
1690-
int zeroed;
16911669
int nid = folio_nid(folio);
16921670

16931671
VM_BUG_ON_FOLIO(!folio_test_hugetlb_vmemmap_optimized(folio), folio);
@@ -1711,21 +1689,6 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
17111689
*/
17121690
folio_set_hugetlb_vmemmap_optimized(folio);
17131691

1714-
/*
1715-
* This folio is about to be managed by the hugetlb allocator and
1716-
* should have no users. Drop our reference, and check for others
1717-
* just in case.
1718-
*/
1719-
zeroed = folio_put_testzero(folio);
1720-
if (unlikely(!zeroed))
1721-
/*
1722-
* It is VERY unlikely soneone else has taken a ref
1723-
* on the folio. In this case, we simply return as
1724-
* free_huge_folio() will be called when this other ref
1725-
* is dropped.
1726-
*/
1727-
return;
1728-
17291692
arch_clear_hugetlb_flags(folio);
17301693
enqueue_hugetlb_folio(h, folio);
17311694
}
@@ -1779,6 +1742,8 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
17791742
spin_unlock_irq(&hugetlb_lock);
17801743
}
17811744

1745+
folio_ref_unfreeze(folio, 1);
1746+
17821747
/*
17831748
* Non-gigantic pages demoted from CMA allocated gigantic pages
17841749
* need to be given back to CMA in free_gigantic_folio.
@@ -3079,11 +3044,8 @@ static int alloc_and_dissolve_hugetlb_folio(struct hstate *h,
30793044

30803045
free_new:
30813046
spin_unlock_irq(&hugetlb_lock);
3082-
if (new_folio) {
3083-
/* Folio has a zero ref count, but needs a ref to be freed */
3084-
folio_ref_unfreeze(new_folio, 1);
3047+
if (new_folio)
30853048
update_and_free_hugetlb_folio(h, new_folio, false);
3086-
}
30873049

30883050
return ret;
30893051
}
@@ -3938,7 +3900,7 @@ static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio)
39383900

39393901
target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
39403902

3941-
remove_hugetlb_folio_for_demote(h, folio, false);
3903+
remove_hugetlb_folio(h, folio, false);
39423904
spin_unlock_irq(&hugetlb_lock);
39433905

39443906
/*
@@ -3952,7 +3914,6 @@ static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio)
39523914
if (rc) {
39533915
/* Allocation of vmemmmap failed, we can not demote folio */
39543916
spin_lock_irq(&hugetlb_lock);
3955-
folio_ref_unfreeze(folio, 1);
39563917
add_hugetlb_folio(h, folio, false);
39573918
return rc;
39583919
}

mm/hugetlb_vmemmap.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -446,6 +446,8 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
446446
unsigned long vmemmap_reuse;
447447

448448
VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
449+
VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);
450+
449451
if (!folio_test_hugetlb_vmemmap_optimized(folio))
450452
return 0;
451453

@@ -481,6 +483,9 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
481483
*/
482484
int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
483485
{
486+
/* avoid writes from page_ref_add_unless() while unfolding vmemmap */
487+
synchronize_rcu();
488+
484489
return __hugetlb_vmemmap_restore_folio(h, folio, 0);
485490
}
486491

@@ -505,6 +510,9 @@ long hugetlb_vmemmap_restore_folios(const struct hstate *h,
505510
long restored = 0;
506511
long ret = 0;
507512

513+
/* avoid writes from page_ref_add_unless() while unfolding vmemmap */
514+
synchronize_rcu();
515+
508516
list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
509517
if (folio_test_hugetlb_vmemmap_optimized(folio)) {
510518
ret = __hugetlb_vmemmap_restore_folio(h, folio,
@@ -550,6 +558,8 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
550558
unsigned long vmemmap_reuse;
551559

552560
VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
561+
VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);
562+
553563
if (!vmemmap_should_optimize_folio(h, folio))
554564
return ret;
555565

@@ -601,6 +611,9 @@ void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
601611
{
602612
LIST_HEAD(vmemmap_pages);
603613

614+
/* avoid writes from page_ref_add_unless() while folding vmemmap */
615+
synchronize_rcu();
616+
604617
__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0);
605618
free_vmemmap_page_list(&vmemmap_pages);
606619
}
@@ -644,6 +657,9 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l
644657

645658
flush_tlb_all();
646659

660+
/* avoid writes from page_ref_add_unless() while folding vmemmap */
661+
synchronize_rcu();
662+
647663
list_for_each_entry(folio, folio_list, lru) {
648664
int ret;
649665

0 commit comments

Comments
 (0)