Commit 72e315f

Hugh Dickins authored and akpm00 committed
mempolicy: mmap_lock is not needed while migrating folios
mbind(2) holds down_write of current task's mmap_lock throughout (exclusive
because it needs to set the new mempolicy on the vmas); migrate_pages(2)
holds down_read of pid's mmap_lock throughout.

They both hold mmap_lock across the internal migrate_pages(), under which
all new page allocations (huge or small) are made.  I'm nervous about it;
and migrate_pages() certainly does not need mmap_lock itself.

It's done this way for mbind(2), because its page allocator is
vma_alloc_folio() or alloc_hugetlb_folio_vma(), both of which depend on
vma and address.

Now that we have alloc_pages_mpol(), depending on (refcounted) memory
policy and interleave index, mbind(2) can be modified to use that or
alloc_hugetlb_folio_nodemask(), and then not need mmap_lock across the
internal migrate_pages() at all: add alloc_migration_target_by_mpol() to
replace mbind's new_page().

(After that change, alloc_hugetlb_folio_vma() is used by nothing but a
userfaultfd function: move it out of hugetlb.h and into the #ifdef.)

migrate_pages(2) has chosen its target node before migrating, so can
continue to use the standard alloc_migration_target(); but let it take
and drop mmap_lock just around migrate_to_node()'s queue_pages_range():
neither the node-to-node calculations nor the page migrations need it.

It seems unlikely, but it is conceivable that some userspace depends on
the kernel's mmap_lock exclusion here, instead of doing its own locking:
more likely in a testsuite than in real life.

It is also possible, of course, that some pages on the list will be
munmapped by another thread before they are migrated, or a newer memory
policy applied to the range by that time: but such races could happen
before, as soon as mmap_lock was dropped, so it does not appear to be a
concern.

Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Hugh Dickins <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: David Hildenbrand <[email protected]>
Cc: Greg Kroah-Hartman <[email protected]>
Cc: "Huang, Ying" <[email protected]>
Cc: Kefeng Wang <[email protected]>
Cc: Matthew Wilcox (Oracle) <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Mike Kravetz <[email protected]>
Cc: Nhat Pham <[email protected]>
Cc: Sidhartha Kumar <[email protected]>
Cc: Suren Baghdasaryan <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: Vishal Moola (Oracle) <[email protected]>
Cc: Yang Shi <[email protected]>
Cc: Yosry Ahmed <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
1 parent ddc1a5c commit 72e315f
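
For reference, the two user-facing entry points whose internal locking this
commit narrows are the mbind(2) and migrate_pages(2) syscalls.  The sketch
below is only an illustration of how they are typically invoked from
userspace, not part of the patch: it assumes libnuma's <numaif.h> wrappers
(build with -lnuma) and uses NUMA nodes 0 and 1 purely as examples.

/*
 * Illustration only: exercise mbind(2) and migrate_pages(2), the two
 * syscalls whose internal mmap_lock usage this commit narrows.
 * Assumes <numaif.h> from libnuma (link with -lnuma) and that NUMA
 * nodes 0 and 1 both exist on this machine.
 */
#include <numaif.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 16UL << 20;	/* 16MB of anonymous memory */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(buf, 0, len);		/* fault the pages in somewhere */

	/* Bind the range to node 1 and migrate already-faulted pages there. */
	unsigned long node1 = 1UL << 1;
	if (mbind(buf, len, MPOL_BIND, &node1, 8 * sizeof(node1),
		  MPOL_MF_MOVE | MPOL_MF_STRICT))
		perror("mbind");

	/* Then ask for this process's node-1 pages to move to node 0. */
	unsigned long from = 1UL << 1, to = 1UL << 0;
	if (migrate_pages(getpid(), 8 * sizeof(from), &from, &to) < 0)
		perror("migrate_pages");

	munmap(buf, len);
	return 0;
}

Moving pages of another process generally requires CAP_SYS_NICE or matching
uids; acting on the calling process itself, as above, does not.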

File tree

3 files changed: +63 −67 lines changed

include/linux/hugetlb.h

Lines changed: 0 additions & 9 deletions
@@ -748,8 +748,6 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 				unsigned long addr, int avoid_reserve);
 struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
 				nodemask_t *nmask, gfp_t gfp_mask);
-struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma,
-				unsigned long address);
 int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
 			pgoff_t idx);
 void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
@@ -1072,13 +1070,6 @@ alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
 	return NULL;
 }
 
-static inline struct folio *alloc_hugetlb_folio_vma(struct hstate *h,
-					struct vm_area_struct *vma,
-					unsigned long address)
-{
-	return NULL;
-}
-
 static inline int __alloc_bootmem_huge_page(struct hstate *h)
 {
 	return 0;

mm/hugetlb.c

Lines changed: 20 additions & 18 deletions
@@ -2630,24 +2630,6 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
 	return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask);
 }
 
-/* mempolicy aware migration callback */
-struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma,
-		unsigned long address)
-{
-	struct mempolicy *mpol;
-	nodemask_t *nodemask;
-	struct folio *folio;
-	gfp_t gfp_mask;
-	int node;
-
-	gfp_mask = htlb_alloc_mask(h);
-	node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
-	folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask);
-	mpol_cond_put(mpol);
-
-	return folio;
-}
-
 /*
  * Increase the hugetlb pool such that it can accommodate a reservation
  * of size 'delta'.
@@ -6559,6 +6541,26 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 #ifdef CONFIG_USERFAULTFD
+/*
+ * Can probably be eliminated, but still used by hugetlb_mfill_atomic_pte().
+ */
+static struct folio *alloc_hugetlb_folio_vma(struct hstate *h,
+		struct vm_area_struct *vma, unsigned long address)
+{
+	struct mempolicy *mpol;
+	nodemask_t *nodemask;
+	struct folio *folio;
+	gfp_t gfp_mask;
+	int node;
+
+	gfp_mask = htlb_alloc_mask(h);
+	node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
+	folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask);
+	mpol_cond_put(mpol);
+
+	return folio;
+}
+
 /*
  * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte
  * with modifications for hugetlb pages.

mm/mempolicy.c

Lines changed: 43 additions & 40 deletions
@@ -415,6 +415,8 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 
 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
 				unsigned long flags);
+static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
+				pgoff_t ilx, int *nid);
 
 static bool strictly_unmovable(unsigned long flags)
 {
@@ -1021,6 +1023,8 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest,
 	node_set(source, nmask);
 
 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
+
+	mmap_read_lock(mm);
 	vma = find_vma(mm, 0);
 
 	/*
@@ -1031,6 +1035,7 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest,
 	 */
 	nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+	mmap_read_unlock(mm);
 
 	if (!list_empty(&pagelist)) {
 		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
@@ -1059,8 +1064,6 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 
 	lru_cache_disable();
 
-	mmap_read_lock(mm);
-
 	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
@@ -1140,7 +1143,6 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 		if (err < 0)
 			break;
 	}
-	mmap_read_unlock(mm);
 
 	lru_cache_enable();
 	if (err < 0)
@@ -1149,44 +1151,38 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 }
 
 /*
- * Allocate a new page for page migration based on vma policy.
- * Start by assuming the page is mapped by the same vma as contains @start.
- * Search forward from there, if not. N.B., this assumes that the
- * list of pages handed to migrate_pages()--which is how we get here--
- * is in virtual address order.
+ * Allocate a new folio for page migration, according to NUMA mempolicy.
 */
-static struct folio *new_folio(struct folio *src, unsigned long start)
+static struct folio *alloc_migration_target_by_mpol(struct folio *src,
+						unsigned long private)
 {
-	struct vm_area_struct *vma;
-	unsigned long address;
-	VMA_ITERATOR(vmi, current->mm, start);
-	gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL;
-
-	for_each_vma(vmi, vma) {
-		address = page_address_in_vma(&src->page, vma);
-		if (address != -EFAULT)
-			break;
-	}
+	struct mempolicy *pol = (struct mempolicy *)private;
+	pgoff_t ilx = 0;	/* improve on this later */
+	struct page *page;
+	unsigned int order;
+	int nid = numa_node_id();
+	gfp_t gfp;
 
-	/*
-	 * __get_vma_policy() now expects a genuine non-NULL vma. Return NULL
-	 * when the page can no longer be located in a vma: that is not ideal
-	 * (migrate_pages() will give up early, presuming ENOMEM), but good
-	 * enough to avoid a crash by syzkaller or concurrent holepunch.
-	 */
-	if (!vma)
-		return NULL;
+	order = folio_order(src);
+	ilx += src->index >> order;
 
 	if (folio_test_hugetlb(src)) {
-		return alloc_hugetlb_folio_vma(folio_hstate(src),
-				vma, address);
+		nodemask_t *nodemask;
+		struct hstate *h;
+
+		h = folio_hstate(src);
+		gfp = htlb_alloc_mask(h);
+		nodemask = policy_nodemask(gfp, pol, ilx, &nid);
+		return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp);
 	}
 
 	if (folio_test_large(src))
 		gfp = GFP_TRANSHUGE;
+	else
+		gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;
 
-	return vma_alloc_folio(gfp, folio_order(src), vma, address,
-			folio_test_large(src));
+	page = alloc_pages_mpol(gfp, order, pol, ilx, nid);
+	return page_rmappable_folio(page);
 }
 #else
 
@@ -1202,7 +1198,8 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 	return -ENOSYS;
 }
 
-static struct folio *new_folio(struct folio *src, unsigned long start)
+static struct folio *alloc_migration_target_by_mpol(struct folio *src,
+						unsigned long private)
 {
 	return NULL;
 }
@@ -1276,6 +1273,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 
 	if (nr_failed < 0) {
 		err = nr_failed;
+		nr_failed = 0;
 	} else {
 		vma_iter_init(&vmi, mm, start);
 		prev = vma_prev(&vmi);
@@ -1286,19 +1284,24 @@ static long do_mbind(unsigned long start, unsigned long len,
 		}
 	}
 
-	if (!err) {
-		if (!list_empty(&pagelist)) {
-			nr_failed |= migrate_pages(&pagelist, new_folio, NULL,
-				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
+	mmap_write_unlock(mm);
+
+	if (!err && !list_empty(&pagelist)) {
+		/* Convert MPOL_DEFAULT's NULL to task or default policy */
+		if (!new) {
+			new = get_task_policy(current);
+			mpol_get(new);
 		}
-		if (nr_failed && (flags & MPOL_MF_STRICT))
-			err = -EIO;
+		nr_failed |= migrate_pages(&pagelist,
+				alloc_migration_target_by_mpol, NULL,
+				(unsigned long)new, MIGRATE_SYNC,
+				MR_MEMPOLICY_MBIND, NULL);
 	}
 
+	if (nr_failed && (flags & MPOL_MF_STRICT))
+		err = -EIO;
 	if (!list_empty(&pagelist))
 		putback_movable_pages(&pagelist);
-
-	mmap_write_unlock(mm);
 mpol_out:
 	mpol_put(new);
 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
