
Commit 081056d

thejhakpm00
authored and committed
mm/hugetlb: unshare page tables during VMA split, not before
Currently, __split_vma() triggers hugetlb page table unsharing through vm_ops->may_split(). This happens before the VMA lock and rmap locks are taken, which is too early: it allows racing VMA-locked page faults in our process and racing rmap walks from other processes to cause page tables to be shared again before we actually perform the split.

Fix it by explicitly calling into the hugetlb unshare logic from __split_vma() in the same place where THP splitting also happens. At that point, both the VMA and the rmap(s) are write-locked.

An annoying detail is that we can now call into the helper hugetlb_unshare_pmds() from two different locking contexts:

1. from hugetlb_split(), holding:
   - mmap lock (exclusively)
   - VMA lock
   - file rmap lock (exclusively)

2. from hugetlb_unshare_all_pmds(), which I think is designed to be able to call us with only the mmap lock held (in shared mode), but currently only runs while holding the mmap lock (exclusively) and the VMA lock

Backporting note: This commit fixes a racy protection that was introduced in commit b30c14c ("hugetlb: unshare some PMDs when splitting VMAs"); that commit claimed to fix an issue introduced in 5.13, but it should actually also go all the way back.

[[email protected]: v2]
Link: https://lkml.kernel.org/r/[email protected]
Link: https://lkml.kernel.org/r/[email protected]
Link: https://lkml.kernel.org/r/[email protected]
Fixes: 39dde65 ("[PATCH] shared page table for hugetlb page")
Signed-off-by: Jann Horn <[email protected]>
Cc: Liam Howlett <[email protected]>
Reviewed-by: Lorenzo Stoakes <[email protected]>
Reviewed-by: Oscar Salvador <[email protected]>
Cc: Lorenzo Stoakes <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Cc: <[email protected]> [b30c14c: hugetlb: unshare some PMDs when splitting VMAs]
Signed-off-by: Andrew Morton <[email protected]>
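For illustration, a simplified sketch of the ordering change in __split_vma(), based on the commit message and the diff below (not the literal kernel code):

        /*
         * Before (racy):
         *
         *   __split_vma()
         *     vma->vm_ops->may_split(vma, addr)  // hugetlb_vm_op_split()
         *                                        // unshared PMDs here, with no
         *                                        // VMA lock or rmap lock held
         *     ...                                // window: racing VMA-locked
         *                                        // faults and rmap walks can
         *                                        // re-share page tables
         *     vma_prepare(&vp)                   // locks taken only now
         *
         * After (this commit):
         *
         *   __split_vma()
         *     vma->vm_ops->may_split(vma, addr)  // now only checks alignment
         *     vma_prepare(&vp)                   // VMA + file rmap write-locked
         *     vma_adjust_trans_huge(...)
         *     if (is_vm_hugetlb_page(vma))
         *         hugetlb_split(vma, addr)       // unshare under the locks
         */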
1 parent a2946fb commit 081056d

File tree: 4 files changed, +56 -16 lines


include/linux/hugetlb.h
Lines changed: 3 additions & 0 deletions

@@ -279,6 +279,7 @@ bool is_hugetlb_entry_migration(pte_t pte);
 bool is_hugetlb_entry_hwpoisoned(pte_t pte);
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
 void fixup_hugetlb_reservations(struct vm_area_struct *vma);
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);
 
 #else /* !CONFIG_HUGETLB_PAGE */
 
@@ -476,6 +477,8 @@ static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma)
 {
 }
 
+static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}
+
 #endif /* !CONFIG_HUGETLB_PAGE */
 
 #ifndef pgd_write

mm/hugetlb.c
Lines changed: 44 additions & 16 deletions

@@ -121,7 +121,7 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
 static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
 static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
-                unsigned long start, unsigned long end);
+                unsigned long start, unsigned long end, bool take_locks);
 static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
 
 static void hugetlb_free_folio(struct folio *folio)
@@ -5426,26 +5426,40 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
 {
         if (addr & ~(huge_page_mask(hstate_vma(vma))))
                 return -EINVAL;
+        return 0;
+}
 
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
+{
         /*
          * PMD sharing is only possible for PUD_SIZE-aligned address ranges
          * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
          * split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
+         * This function is called in the middle of a VMA split operation, with
+         * MM, VMA and rmap all write-locked to prevent concurrent page table
+         * walks (except hardware and gup_fast()).
          */
+        vma_assert_write_locked(vma);
+        i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
         if (addr & ~PUD_MASK) {
-                /*
-                 * hugetlb_vm_op_split is called right before we attempt to
-                 * split the VMA. We will need to unshare PMDs in the old and
-                 * new VMAs, so let's unshare before we split.
-                 */
                 unsigned long floor = addr & PUD_MASK;
                 unsigned long ceil = floor + PUD_SIZE;
 
-                if (floor >= vma->vm_start && ceil <= vma->vm_end)
-                        hugetlb_unshare_pmds(vma, floor, ceil);
+                if (floor >= vma->vm_start && ceil <= vma->vm_end) {
+                        /*
+                         * Locking:
+                         * Use take_locks=false here.
+                         * The file rmap lock is already held.
+                         * The hugetlb VMA lock can't be taken when we already
+                         * hold the file rmap lock, and we don't need it because
+                         * its purpose is to synchronize against concurrent page
+                         * table walks, which are not possible thanks to the
+                         * locks held by our caller.
+                         */
+                        hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
+                }
         }
-
-        return 0;
 }
 
 static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
@@ -7885,9 +7899,16 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
                 spin_unlock_irq(&hugetlb_lock);
 }
 
+/*
+ * If @take_locks is false, the caller must ensure that no concurrent page table
+ * access can happen (except for gup_fast() and hardware page walks).
+ * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
+ * concurrent page fault handling) and the file rmap lock.
+ */
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
                                  unsigned long start,
-                                 unsigned long end)
+                                 unsigned long end,
+                                 bool take_locks)
 {
         struct hstate *h = hstate_vma(vma);
         unsigned long sz = huge_page_size(h);
@@ -7911,8 +7932,12 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
                                 start, end);
         mmu_notifier_invalidate_range_start(&range);
-        hugetlb_vma_lock_write(vma);
-        i_mmap_lock_write(vma->vm_file->f_mapping);
+        if (take_locks) {
+                hugetlb_vma_lock_write(vma);
+                i_mmap_lock_write(vma->vm_file->f_mapping);
+        } else {
+                i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+        }
         for (address = start; address < end; address += PUD_SIZE) {
                 ptep = hugetlb_walk(vma, address, sz);
                 if (!ptep)
@@ -7922,8 +7947,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
                 spin_unlock(ptl);
         }
         flush_hugetlb_tlb_range(vma, start, end);
-        i_mmap_unlock_write(vma->vm_file->f_mapping);
-        hugetlb_vma_unlock_write(vma);
+        if (take_locks) {
+                i_mmap_unlock_write(vma->vm_file->f_mapping);
+                hugetlb_vma_unlock_write(vma);
+        }
         /*
          * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
          * Documentation/mm/mmu_notifier.rst.
@@ -7938,7 +7965,8 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
 {
         hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
-                        ALIGN_DOWN(vma->vm_end, PUD_SIZE));
+                        ALIGN_DOWN(vma->vm_end, PUD_SIZE),
+                        /* take_locks = */ true);
 }
 
 /*
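As a worked example of the floor/ceil computation in hugetlb_split() above (hypothetical addresses; assumes a configuration where PUD_SIZE is 1 GiB, as on x86-64):

        /*
         * Splitting a hugetlb VMA at addr = 0x40200000 (1 GiB + 2 MiB):
         *
         *   addr & ~PUD_MASK = 0x200000  -> nonzero, so the split would break
         *                                   PUD_SIZE alignment: unshare now
         *   floor = addr & PUD_MASK      = 0x40000000 (1 GiB boundary below)
         *   ceil  = floor + PUD_SIZE     = 0x80000000 (next 1 GiB boundary)
         *
         * PMDs in [floor, ceil) are unshared only if that whole interval lies
         * inside the VMA; a PUD_SIZE range only partially covered by the VMA
         * cannot contain shared PMDs in the first place.
         */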

mm/vma.c
Lines changed: 7 additions & 0 deletions

@@ -539,7 +539,14 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
         init_vma_prep(&vp, vma);
         vp.insert = new;
         vma_prepare(&vp);
+
+        /*
+         * Get rid of huge pages and shared page tables straddling the split
+         * boundary.
+         */
         vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL);
+        if (is_vm_hugetlb_page(vma))
+                hugetlb_split(vma, addr);
 
         if (new_below) {
                 vma->vm_start = addr;
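Taken together, the two locking contexts described in the commit message reach hugetlb_unshare_pmds() roughly like this (a simplified call sketch, not literal kernel code):

        /* 1. VMA split path: mmap lock, VMA lock and file rmap lock are
         *    already write-locked by the caller, so the helper must not
         *    retake them.
         */
        __split_vma()
            -> hugetlb_split(vma, addr)
                -> hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false)

        /* 2. Standalone path: the helper takes the hugetlb VMA lock and the
         *    file rmap lock itself.
         */
        hugetlb_unshare_all_pmds(vma)
            -> hugetlb_unshare_pmds(vma, start, end, /* take_locks = */ true)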

tools/testing/vma/vma_internal.h
Lines changed: 2 additions & 0 deletions

@@ -932,6 +932,8 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
         (void)next;
 }
 
+static inline void hugetlb_split(struct vm_area_struct *, unsigned long) {}
+
 static inline void vma_iter_free(struct vma_iterator *vmi)
 {
         mas_destroy(&vmi->mas);
