
Commit e057f2b

thejhopsiff authored and committed
mm/hugetlb: unshare page tables during VMA split, not before
commit 081056dc00a27bccb55ccc3c6f230a3d5fd3f7e0 upstream.

Currently, __split_vma() triggers hugetlb page table unsharing through
vm_ops->may_split(). This happens before the VMA lock and rmap locks are
taken, which is too early: it allows racing VMA-locked page faults in our
process and racing rmap walks from other processes to cause page tables to
be shared again before we actually perform the split.

Fix it by explicitly calling into the hugetlb unshare logic from
__split_vma() in the same place where THP splitting also happens. At that
point, both the VMA and the rmap(s) are write-locked.

An annoying detail is that we can now call into the helper
hugetlb_unshare_pmds() from two different locking contexts:

1. from hugetlb_split(), holding:
    - mmap lock (exclusively)
    - VMA lock
    - file rmap lock (exclusively)

2. from hugetlb_unshare_all_pmds(), which I think is designed to be able
   to call us with only the mmap lock held (in shared mode), but currently
   only runs while holding the mmap lock (exclusively) and the VMA lock

Backporting note:
This commit fixes a racy protection that was introduced in commit
b30c14c ("hugetlb: unshare some PMDs when splitting VMAs"); that commit
claimed to fix an issue introduced in 5.13, but it should actually also go
all the way back.

[[email protected]: v2]
Link: https://lkml.kernel.org/r/[email protected]
Link: https://lkml.kernel.org/r/[email protected]
Link: https://lkml.kernel.org/r/[email protected]
Fixes: 39dde65 ("[PATCH] shared page table for hugetlb page")
Signed-off-by: Jann Horn <[email protected]>
Cc: Liam Howlett <[email protected]>
Reviewed-by: Lorenzo Stoakes <[email protected]>
Reviewed-by: Oscar Salvador <[email protected]>
Cc: Lorenzo Stoakes <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Cc: <[email protected]> [b30c14c: hugetlb: unshare some PMDs when splitting VMAs]
Cc: <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
[stable backport: code got moved from mmap.c to vma.c]
Signed-off-by: Jann Horn <[email protected]>
Signed-off-by: Greg Kroah-Hartman <[email protected]>
(cherry picked from commit af6cfcd0efb7f051af221c418ec8b37a10211947)
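The two locking contexts described above boil down to a conditional-locking helper. The following is a minimal userspace C analogue of the take_locks pattern that hugetlb_unshare_pmds() adopts in this patch; all names here (rmap_lock, unshare_range, split_path, unshare_all_path) are illustrative stand-ins, not kernel APIs.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t rmap_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Helper callable from two locking contexts, mirroring
 * hugetlb_unshare_pmds(): when take_locks is false, the caller must
 * already hold rmap_lock.
 */
static void unshare_range(unsigned long start, unsigned long end,
			  bool take_locks)
{
	if (take_locks)
		pthread_mutex_lock(&rmap_lock);
	/* ... walk [start, end) and undo sharing here ... */
	printf("unsharing [%#lx, %#lx)\n", start, end);
	if (take_locks)
		pthread_mutex_unlock(&rmap_lock);
}

/* Context 1: analogous to hugetlb_split(); the lock is already held. */
static void split_path(void)
{
	pthread_mutex_lock(&rmap_lock);
	unshare_range(0x40000000UL, 0x80000000UL, /* take_locks = */ false);
	pthread_mutex_unlock(&rmap_lock);
}

/* Context 2: analogous to hugetlb_unshare_all_pmds(); takes its own locks. */
static void unshare_all_path(void)
{
	unshare_range(0x40000000UL, 0x80000000UL, /* take_locks = */ true);
}

int main(void)
{
	split_path();
	unshare_all_path();
	return 0;
}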
1 parent 0c662e2 commit e057f2b

File tree

3 files changed: +53 -16 lines changed

include/linux/hugetlb.h

Lines changed: 3 additions & 0 deletions

@@ -281,6 +281,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
 
 bool is_hugetlb_entry_migration(pte_t pte);
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);
 
 #else /* !CONFIG_HUGETLB_PAGE */
 
@@ -491,6 +492,8 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
 
 static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }
 
+static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}
+
 #endif /* !CONFIG_HUGETLB_PAGE */
 /*
  * hugepages at page global directory. If arch support

mm/hugetlb.c

Lines changed: 44 additions & 16 deletions

@@ -97,7 +97,7 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
 static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
 static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
-		unsigned long start, unsigned long end);
+		unsigned long start, unsigned long end, bool take_locks);
 static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
 
 static inline bool subpool_is_free(struct hugepage_subpool *spool)
@@ -4961,26 +4961,40 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
 {
 	if (addr & ~(huge_page_mask(hstate_vma(vma))))
 		return -EINVAL;
+	return 0;
+}
 
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
+{
 	/*
 	 * PMD sharing is only possible for PUD_SIZE-aligned address ranges
 	 * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
 	 * split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
+	 * This function is called in the middle of a VMA split operation, with
+	 * MM, VMA and rmap all write-locked to prevent concurrent page table
+	 * walks (except hardware and gup_fast()).
 	 */
+	vma_assert_write_locked(vma);
+	i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
 	if (addr & ~PUD_MASK) {
-		/*
-		 * hugetlb_vm_op_split is called right before we attempt to
-		 * split the VMA. We will need to unshare PMDs in the old and
-		 * new VMAs, so let's unshare before we split.
-		 */
 		unsigned long floor = addr & PUD_MASK;
 		unsigned long ceil = floor + PUD_SIZE;
 
-		if (floor >= vma->vm_start && ceil <= vma->vm_end)
-			hugetlb_unshare_pmds(vma, floor, ceil);
+		if (floor >= vma->vm_start && ceil <= vma->vm_end) {
+			/*
+			 * Locking:
+			 * Use take_locks=false here.
+			 * The file rmap lock is already held.
+			 * The hugetlb VMA lock can't be taken when we already
+			 * hold the file rmap lock, and we don't need it because
+			 * its purpose is to synchronize against concurrent page
+			 * table walks, which are not possible thanks to the
+			 * locks held by our caller.
+			 */
+			hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
+		}
 	}
-
-	return 0;
 }
 
 static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
@@ -7363,9 +7377,16 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
 	}
 }
 
+/*
+ * If @take_locks is false, the caller must ensure that no concurrent page table
+ * access can happen (except for gup_fast() and hardware page walks).
+ * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
+ * concurrent page fault handling) and the file rmap lock.
+ */
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 				 unsigned long start,
-				 unsigned long end)
+				 unsigned long end,
+				 bool take_locks)
 {
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
@@ -7389,8 +7410,12 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
 				start, end);
 	mmu_notifier_invalidate_range_start(&range);
-	hugetlb_vma_lock_write(vma);
-	i_mmap_lock_write(vma->vm_file->f_mapping);
+	if (take_locks) {
+		hugetlb_vma_lock_write(vma);
+		i_mmap_lock_write(vma->vm_file->f_mapping);
+	} else {
+		i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+	}
 	for (address = start; address < end; address += PUD_SIZE) {
 		ptep = hugetlb_walk(vma, address, sz);
 		if (!ptep)
@@ -7400,8 +7425,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 		spin_unlock(ptl);
 	}
 	flush_hugetlb_tlb_range(vma, start, end);
-	i_mmap_unlock_write(vma->vm_file->f_mapping);
-	hugetlb_vma_unlock_write(vma);
+	if (take_locks) {
+		i_mmap_unlock_write(vma->vm_file->f_mapping);
+		hugetlb_vma_unlock_write(vma);
+	}
 	/*
 	 * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
 	 * Documentation/mm/mmu_notifier.rst.
@@ -7416,7 +7443,8 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
 {
 	hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
-			ALIGN_DOWN(vma->vm_end, PUD_SIZE));
+			ALIGN_DOWN(vma->vm_end, PUD_SIZE),
+			/* take_locks = */ true);
 }
 
 #ifdef CONFIG_CMA
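For illustration, the PUD-interval arithmetic that hugetlb_split() performs can be checked standalone. This is a minimal sketch assuming a 1 GiB PUD (PUD_SHIFT of 30, as on x86-64 with 4-level paging); the split address is hypothetical.

#include <stdio.h>

/* Assumed geometry: 1 GiB PUD entries, as on x86-64 with 4-level paging. */
#define PUD_SHIFT 30
#define PUD_SIZE  (1UL << PUD_SHIFT)
#define PUD_MASK  (~(PUD_SIZE - 1))

int main(void)
{
	unsigned long addr = 0x7f5a2ab00000UL;	/* hypothetical split address */

	if (addr & ~PUD_MASK) {
		/* Split point is not PUD-aligned: PMD sharing could straddle
		 * it, so unshare the surrounding PUD_SIZE interval.
		 */
		unsigned long floor = addr & PUD_MASK;
		unsigned long ceil = floor + PUD_SIZE;

		printf("unshare PMDs in [%#lx, %#lx)\n", floor, ceil);
	}
	return 0;
}

For the address above this prints the interval [0x7f5a00000000, 0x7f5a40000000): the single PUD-sized region whose shared page tables the split would otherwise cut through.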

mm/mmap.c

Lines changed: 6 additions & 0 deletions

@@ -2402,7 +2402,13 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
 	init_vma_prep(&vp, vma);
 	vp.insert = new;
 	vma_prepare(&vp);
+	/*
+	 * Get rid of huge pages and shared page tables straddling the split
+	 * boundary.
+	 */
 	vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
+	if (is_vm_hugetlb_page(vma))
+		hugetlb_split(vma, addr);
 
 	if (new_below) {
 		vma->vm_start = addr;
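To make the ordering change concrete, here is a simplified, compilable mock of the fixed sequence in __split_vma(): unsharing now happens strictly after the VMA and rmap write locks are taken, so concurrent faults and rmap walks cannot re-share the page tables before the split commits. All function names are stand-ins, not kernel code.

#include <stdio.h>

/* Stand-ins for the kernel steps; each just records that it ran. */
static void take_vma_and_rmap_locks(void)
{
	puts("VMA + file rmap write-locked");
}

static void unshare_hugetlb_pmds(void)
{
	puts("PMDs unshared (walkers locked out, no re-sharing possible)");
}

static void perform_split(void)
{
	puts("VMA split committed");
}

static void drop_locks(void)
{
	puts("locks released");
}

int main(void)
{
	/*
	 * Before this patch, the unshare step ran via vm_ops->may_split()
	 * ahead of the locks, leaving a window for page tables to be
	 * shared again. The patched order closes that window:
	 */
	take_vma_and_rmap_locks();
	unshare_hugetlb_pmds();
	perform_split();
	drop_locks();
	return 0;
}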
