Skip to content

Commit 9651fce

Browse files
committed
mm: add MAP_DROPPABLE for designating always lazily freeable mappings
The vDSO getrandom() implementation works with a buffer allocated with a new system call that has certain requirements: - It shouldn't be written to core dumps. * Easy: VM_DONTDUMP. - It should be zeroed on fork. * Easy: VM_WIPEONFORK. - It shouldn't be written to swap. * Uh-oh: mlock is rlimited. * Uh-oh: mlock isn't inherited by forks. - It shouldn't reserve actual memory, but it also shouldn't crash when page faulting in memory if none is available. * Uh-oh: VM_NORESERVE means segfaults. It turns out that the vDSO getrandom() function has three really nice characteristics that we can exploit to solve this problem: 1) Due to being wiped during fork(), the vDSO code is already robust to having the contents of the pages it reads zeroed out midway through the function's execution. 2) In the absolute worst case of whatever contingency we're coding for, we have the option to fall back to the getrandom() syscall, and everything is fine. 3) The buffers the function uses are only ever useful for a maximum of 60 seconds -- a sort of cache, rather than a long-term allocation. These characteristics mean that we can introduce VM_DROPPABLE, which has the following semantics: a) It is never written out to swap. b) Under memory pressure, mm can just drop the pages (so that they're zero when read back again). c) It is inherited by fork. d) It doesn't count against the mlock budget, since nothing is locked. e) If there's not enough memory to service a page fault, it's not fatal, and no signal is sent. This way, allocations used by vDSO getrandom() can use: VM_DROPPABLE | VM_DONTDUMP | VM_WIPEONFORK | VM_NORESERVE And there will be no problem with OOMing, crashing on overcommitment, using memory when not in use, not wiping on fork(), coredumps, or writing out to swap. In order to let vDSO getrandom() use this, expose these via mmap(2) as MAP_DROPPABLE. 
Note that this involves removing the MADV_FREE special case from sort_folio(), which, according to Yu Zhao, is unnecessary; its removal will simply result in an extra call to shrink_folio_list() in the worst case. The chunk removed re-enables the swapbacked flag, which we don't want for VM_DROPPABLE, and we can't conditionalize it here because there isn't a vma reference available. Finally, the provided selftest ensures that this is working as desired. Cc: [email protected] Acked-by: David Hildenbrand <[email protected]> Signed-off-by: Jason A. Donenfeld <[email protected]>
1 parent 8a18fda commit 9651fce

File tree

17 files changed

+146
-15
lines changed

17 files changed

+146
-15
lines changed

fs/proc/task_mmu.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -708,6 +708,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
708708
[ilog2(VM_SHADOW_STACK)] = "ss",
709709
#endif
710710
#ifdef CONFIG_64BIT
711+
[ilog2(VM_DROPPABLE)] = "dp",
711712
[ilog2(VM_SEALED)] = "sl",
712713
#endif
713714
};

include/linux/mm.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -406,6 +406,13 @@ extern unsigned int kobjsize(const void *objp);
406406
#define VM_ALLOW_ANY_UNCACHED VM_NONE
407407
#endif
408408

409+
#ifdef CONFIG_64BIT
410+
#define VM_DROPPABLE_BIT 40
411+
#define VM_DROPPABLE BIT(VM_DROPPABLE_BIT)
412+
#else
413+
#define VM_DROPPABLE VM_NONE
414+
#endif
415+
409416
#ifdef CONFIG_64BIT
410417
/* VM is sealed, in vm_flags */
411418
#define VM_SEALED _BITUL(63)

include/linux/userfaultfd_k.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,9 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
218218
{
219219
vm_flags &= __VM_UFFD_FLAGS;
220220

221+
if (vm_flags & VM_DROPPABLE)
222+
return false;
223+
221224
if ((vm_flags & VM_UFFD_MINOR) &&
222225
(!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
223226
return false;

include/trace/events/mmflags.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,12 @@ IF_HAVE_PG_ARCH_X(arch_3)
165165
# define IF_HAVE_UFFD_MINOR(flag, name)
166166
#endif
167167

168+
#ifdef CONFIG_64BIT
169+
# define IF_HAVE_VM_DROPPABLE(flag, name) {flag, name},
170+
#else
171+
# define IF_HAVE_VM_DROPPABLE(flag, name)
172+
#endif
173+
168174
#define __def_vmaflag_names \
169175
{VM_READ, "read" }, \
170176
{VM_WRITE, "write" }, \
@@ -197,6 +203,7 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \
197203
{VM_MIXEDMAP, "mixedmap" }, \
198204
{VM_HUGEPAGE, "hugepage" }, \
199205
{VM_NOHUGEPAGE, "nohugepage" }, \
206+
IF_HAVE_VM_DROPPABLE(VM_DROPPABLE, "droppable" ) \
200207
{VM_MERGEABLE, "mergeable" } \
201208

202209
#define show_vma_flags(flags) \

include/uapi/linux/mman.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#define MAP_SHARED 0x01 /* Share changes */
1818
#define MAP_PRIVATE 0x02 /* Changes are private */
1919
#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */
20+
#define MAP_DROPPABLE 0x08 /* Zero memory under memory pressure. */
2021

2122
/*
2223
* Huge page size encoding when MAP_HUGETLB is specified, and a huge page

mm/ksm.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -717,7 +717,7 @@ static bool vma_ksm_compatible(struct vm_area_struct *vma)
717717
{
718718
if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE | VM_PFNMAP |
719719
VM_IO | VM_DONTEXPAND | VM_HUGETLB |
720-
VM_MIXEDMAP))
720+
VM_MIXEDMAP| VM_DROPPABLE))
721721
return false; /* just ignore the advice */
722722

723723
if (vma_is_dax(vma))

mm/madvise.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1068,13 +1068,16 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
10681068
new_flags |= VM_WIPEONFORK;
10691069
break;
10701070
case MADV_KEEPONFORK:
1071+
if (vma->vm_flags & VM_DROPPABLE)
1072+
return -EINVAL;
10711073
new_flags &= ~VM_WIPEONFORK;
10721074
break;
10731075
case MADV_DONTDUMP:
10741076
new_flags |= VM_DONTDUMP;
10751077
break;
10761078
case MADV_DODUMP:
1077-
if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL)
1079+
if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) ||
1080+
(vma->vm_flags & VM_DROPPABLE))
10781081
return -EINVAL;
10791082
new_flags &= ~VM_DONTDUMP;
10801083
break;

mm/memory.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5660,6 +5660,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
56605660
/* If the fault handler drops the mmap_lock, vma may be freed */
56615661
struct mm_struct *mm = vma->vm_mm;
56625662
vm_fault_t ret;
5663+
bool is_droppable;
56635664

56645665
__set_current_state(TASK_RUNNING);
56655666

@@ -5674,6 +5675,8 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
56745675
goto out;
56755676
}
56765677

5678+
is_droppable = !!(vma->vm_flags & VM_DROPPABLE);
5679+
56775680
/*
56785681
* Enable the memcg OOM handling for faults triggered in user
56795682
* space. Kernel faults are handled more gracefully.
@@ -5688,8 +5691,18 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
56885691
else
56895692
ret = __handle_mm_fault(vma, address, flags);
56905693

5694+
/*
5695+
* Warning: It is no longer safe to dereference vma-> after this point,
5696+
* because mmap_lock might have been dropped by __handle_mm_fault(), so
5697+
* vma might be destroyed from underneath us.
5698+
*/
5699+
56915700
lru_gen_exit_fault();
56925701

5702+
/* If the mapping is droppable, then errors due to OOM aren't fatal. */
5703+
if (is_droppable)
5704+
ret &= ~VM_FAULT_OOM;
5705+
56935706
if (flags & FAULT_FLAG_USER) {
56945707
mem_cgroup_exit_user_fault();
56955708
/*

mm/mempolicy.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2300,6 +2300,9 @@ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct
23002300
pgoff_t ilx;
23012301
struct page *page;
23022302

2303+
if (vma->vm_flags & VM_DROPPABLE)
2304+
gfp |= __GFP_NOWARN;
2305+
23032306
pol = get_vma_policy(vma, addr, order, &ilx);
23042307
page = alloc_pages_mpol_noprof(gfp | __GFP_COMP, order,
23052308
pol, ilx, numa_node_id());

mm/mlock.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -485,7 +485,7 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
485485

486486
if (newflags == oldflags || (oldflags & VM_SPECIAL) ||
487487
is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
488-
vma_is_dax(vma) || vma_is_secretmem(vma))
488+
vma_is_dax(vma) || vma_is_secretmem(vma) || (oldflags & VM_DROPPABLE))
489489
/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
490490
goto out;
491491

0 commit comments

Comments
 (0)