Skip to content

Commit 1afaeb8

Browse files
committed
mm/migrate: Trylock device page in do_swap_page
Avoid multiple CPU page faults to the same device page racing by trying to lock the page in do_swap_page before taking an extra reference to the page. This prevents scenarios where multiple CPU page faults each take an extra reference to a device page, which could abort migration in folio_migrate_mapping. With the device page being locked in do_swap_page, the migrate_vma_* functions need to be updated to avoid locking the fault_page argument. Prior to this change, a livelock scenario could occur in Xe's (Intel GPU DRM driver) SVM implementation if enough threads faulted the same device page. v3: - Put page after unlocking page (Alistair) - Warn on spliting a TPH which is fault page (Alistair) - Warn on dst page == fault page (Alistair) v6: - Add more verbose comment around THP (Alistair) v7: - Fix migrate_device_finalize alignment (Checkpatch) Cc: Alistair Popple <[email protected]> Cc: Philip Yang <[email protected]> Cc: Felix Kuehling <[email protected]> Cc: Christian König <[email protected]> Cc: Andrew Morton <[email protected]> Suggested-by: Simona Vetter <[email protected]> Signed-off-by: Matthew Brost <[email protected]> Reviewed-by: Alistair Popple <[email protected]> Tested-by: Alistair Popple <[email protected]> Link: https://patchwork.freedesktop.org/patch/msgid/[email protected]
1 parent a14fa8e commit 1afaeb8

File tree

2 files changed

+55
-22
lines changed

2 files changed

+55
-22
lines changed

mm/memory.c

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4348,10 +4348,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
43484348
* Get a page reference while we know the page can't be
43494349
* freed.
43504350
*/
4351-
get_page(vmf->page);
4352-
pte_unmap_unlock(vmf->pte, vmf->ptl);
4353-
ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
4354-
put_page(vmf->page);
4351+
if (trylock_page(vmf->page)) {
4352+
get_page(vmf->page);
4353+
pte_unmap_unlock(vmf->pte, vmf->ptl);
4354+
ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
4355+
unlock_page(vmf->page);
4356+
put_page(vmf->page);
4357+
} else {
4358+
pte_unmap_unlock(vmf->pte, vmf->ptl);
4359+
}
43554360
} else if (is_hwpoison_entry(entry)) {
43564361
ret = VM_FAULT_HWPOISON;
43574362
} else if (is_pte_marker_entry(entry)) {

mm/migrate_device.c

Lines changed: 46 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
6060
struct mm_walk *walk)
6161
{
6262
struct migrate_vma *migrate = walk->private;
63+
struct folio *fault_folio = migrate->fault_page ?
64+
page_folio(migrate->fault_page) : NULL;
6365
struct vm_area_struct *vma = walk->vma;
6466
struct mm_struct *mm = vma->vm_mm;
6567
unsigned long addr = start, unmapped = 0;
@@ -88,11 +90,16 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
8890

8991
folio_get(folio);
9092
spin_unlock(ptl);
93+
/* FIXME: we don't expect THP for fault_folio */
94+
if (WARN_ON_ONCE(fault_folio == folio))
95+
return migrate_vma_collect_skip(start, end,
96+
walk);
9197
if (unlikely(!folio_trylock(folio)))
9298
return migrate_vma_collect_skip(start, end,
9399
walk);
94100
ret = split_folio(folio);
95-
folio_unlock(folio);
101+
if (fault_folio != folio)
102+
folio_unlock(folio);
96103
folio_put(folio);
97104
if (ret)
98105
return migrate_vma_collect_skip(start, end,
@@ -192,7 +199,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
192199
* optimisation to avoid walking the rmap later with
193200
* try_to_migrate().
194201
*/
195-
if (folio_trylock(folio)) {
202+
if (fault_folio == folio || folio_trylock(folio)) {
196203
bool anon_exclusive;
197204
pte_t swp_pte;
198205

@@ -204,7 +211,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
204211

205212
if (folio_try_share_anon_rmap_pte(folio, page)) {
206213
set_pte_at(mm, addr, ptep, pte);
207-
folio_unlock(folio);
214+
if (fault_folio != folio)
215+
folio_unlock(folio);
208216
folio_put(folio);
209217
mpfn = 0;
210218
goto next;
@@ -363,6 +371,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
363371
unsigned long npages,
364372
struct page *fault_page)
365373
{
374+
struct folio *fault_folio = fault_page ?
375+
page_folio(fault_page) : NULL;
366376
unsigned long i, restore = 0;
367377
bool allow_drain = true;
368378
unsigned long unmapped = 0;
@@ -427,7 +437,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
427437
remove_migration_ptes(folio, folio, 0);
428438

429439
src_pfns[i] = 0;
430-
folio_unlock(folio);
440+
if (fault_folio != folio)
441+
folio_unlock(folio);
431442
folio_put(folio);
432443
restore--;
433444
}
@@ -536,6 +547,8 @@ int migrate_vma_setup(struct migrate_vma *args)
536547
return -EINVAL;
537548
if (args->fault_page && !is_device_private_page(args->fault_page))
538549
return -EINVAL;
550+
if (args->fault_page && !PageLocked(args->fault_page))
551+
return -EINVAL;
539552

540553
memset(args->src, 0, sizeof(*args->src) * nr_pages);
541554
args->cpages = 0;
@@ -799,19 +812,13 @@ void migrate_vma_pages(struct migrate_vma *migrate)
799812
}
800813
EXPORT_SYMBOL(migrate_vma_pages);
801814

802-
/*
803-
* migrate_device_finalize() - complete page migration
804-
* @src_pfns: src_pfns returned from migrate_device_range()
805-
* @dst_pfns: array of pfns allocated by the driver to migrate memory to
806-
* @npages: number of pages in the range
807-
*
808-
* Completes migration of the page by removing special migration entries.
809-
* Drivers must ensure copying of page data is complete and visible to the CPU
810-
* before calling this.
811-
*/
812-
void migrate_device_finalize(unsigned long *src_pfns,
813-
unsigned long *dst_pfns, unsigned long npages)
815+
static void __migrate_device_finalize(unsigned long *src_pfns,
816+
unsigned long *dst_pfns,
817+
unsigned long npages,
818+
struct page *fault_page)
814819
{
820+
struct folio *fault_folio = fault_page ?
821+
page_folio(fault_page) : NULL;
815822
unsigned long i;
816823

817824
for (i = 0; i < npages; i++) {
@@ -824,6 +831,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
824831

825832
if (!page) {
826833
if (dst) {
834+
WARN_ON_ONCE(fault_folio == dst);
827835
folio_unlock(dst);
828836
folio_put(dst);
829837
}
@@ -834,6 +842,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
834842

835843
if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !dst) {
836844
if (dst) {
845+
WARN_ON_ONCE(fault_folio == dst);
837846
folio_unlock(dst);
838847
folio_put(dst);
839848
}
@@ -843,15 +852,33 @@ void migrate_device_finalize(unsigned long *src_pfns,
843852
if (!folio_is_zone_device(dst))
844853
folio_add_lru(dst);
845854
remove_migration_ptes(src, dst, 0);
846-
folio_unlock(src);
855+
if (fault_folio != src)
856+
folio_unlock(src);
847857
folio_put(src);
848858

849859
if (dst != src) {
860+
WARN_ON_ONCE(fault_folio == dst);
850861
folio_unlock(dst);
851862
folio_put(dst);
852863
}
853864
}
854865
}
866+
867+
/*
868+
* migrate_device_finalize() - complete page migration
869+
* @src_pfns: src_pfns returned from migrate_device_range()
870+
* @dst_pfns: array of pfns allocated by the driver to migrate memory to
871+
* @npages: number of pages in the range
872+
*
873+
* Completes migration of the page by removing special migration entries.
874+
* Drivers must ensure copying of page data is complete and visible to the CPU
875+
* before calling this.
876+
*/
877+
void migrate_device_finalize(unsigned long *src_pfns,
878+
unsigned long *dst_pfns, unsigned long npages)
879+
{
880+
return __migrate_device_finalize(src_pfns, dst_pfns, npages, NULL);
881+
}
855882
EXPORT_SYMBOL(migrate_device_finalize);
856883

857884
/**
@@ -867,7 +894,8 @@ EXPORT_SYMBOL(migrate_device_finalize);
867894
*/
868895
void migrate_vma_finalize(struct migrate_vma *migrate)
869896
{
870-
migrate_device_finalize(migrate->src, migrate->dst, migrate->npages);
897+
__migrate_device_finalize(migrate->src, migrate->dst, migrate->npages,
898+
migrate->fault_page);
871899
}
872900
EXPORT_SYMBOL(migrate_vma_finalize);
873901

0 commit comments

Comments
 (0)