Skip to content

Commit 8be7258

Browse files
peaktocreek authored and akpm00 committed
mseal: add mseal syscall
The new mseal() is a syscall on 64-bit CPUs, with the following signature: int mseal(void *addr, size_t len, unsigned long flags) addr/len: memory range. flags: reserved. mseal() blocks the following operations for the given memory range. 1> Unmapping, moving to another location, and shrinking the size, via munmap() and mremap(), can leave an empty space, therefore can be replaced with a VMA with a new set of attributes. 2> Moving or expanding a different VMA into the current location, via mremap(). 3> Modifying a VMA via mmap(MAP_FIXED). 4> Size expansion, via mremap(), does not appear to pose any specific risks to sealed VMAs. It is included anyway because the use case is unclear. In any case, users can rely on merging to expand a sealed VMA. 5> mprotect() and pkey_mprotect(). 6> Some destructive madvise() behaviors (e.g. MADV_DONTNEED) for anonymous memory, when users don't have write permission to the memory. Those behaviors can alter region contents by discarding pages, effectively a memset(0) for anonymous memory. The following input during the RFC was incorporated into this patch: Jann Horn: raising awareness and providing valuable insights on the destructive madvise operations. Linus Torvalds: assisting in defining system call signature and scope. Liam R. Howlett: perf optimization. Theo de Raadt: sharing the experiences and insight gained from implementing mimmutable() in OpenBSD. Finally, the idea that inspired this patch comes from Stephen Röttger's work in Chrome V8 CFI. [[email protected]: add branch prediction hint, per Pedro] Link: https://lkml.kernel.org/r/[email protected] Link: https://lkml.kernel.org/r/[email protected] Signed-off-by: Jeff Xu <[email protected]> Reviewed-by: Kees Cook <[email protected]> Reviewed-by: Liam R. 
Howlett <[email protected]> Cc: Pedro Falcato <[email protected]> Cc: Dave Hansen <[email protected]> Cc: Greg Kroah-Hartman <[email protected]> Cc: Guenter Roeck <[email protected]> Cc: Jann Horn <[email protected]> Cc: Jeff Xu <[email protected]> Cc: Jonathan Corbet <[email protected]> Cc: Jorge Lucangeli Obes <[email protected]> Cc: Linus Torvalds <[email protected]> Cc: Matthew Wilcox (Oracle) <[email protected]> Cc: Muhammad Usama Anjum <[email protected]> Cc: Pedro Falcato <[email protected]> Cc: Stephen Röttger <[email protected]> Cc: Suren Baghdasaryan <[email protected]> Cc: Amer Al Shanawany <[email protected]> Cc: Javier Carrasco <[email protected]> Cc: Shuah Khan <[email protected]> Signed-off-by: Andrew Morton <[email protected]>
1 parent ff388fe commit 8be7258

File tree

8 files changed

+432
-1
lines changed

8 files changed

+432
-1
lines changed

include/linux/syscalls.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -821,6 +821,7 @@ asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags);
821821
asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
822822
unsigned long prot, unsigned long pgoff,
823823
unsigned long flags);
824+
asmlinkage long sys_mseal(unsigned long start, size_t len, unsigned long flags);
824825
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
825826
unsigned long mode,
826827
const unsigned long __user *nmask,

mm/Makefile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,10 @@ ifdef CONFIG_CROSS_MEMORY_ATTACH
4343
mmu-$(CONFIG_MMU) += process_vm_access.o
4444
endif
4545

46+
ifdef CONFIG_64BIT
47+
mmu-$(CONFIG_MMU) += mseal.o
48+
endif
49+
4650
obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
4751
maccess.o page-writeback.o folio-compat.o \
4852
readahead.o swap.o truncate.o vmscan.o shrinker.o \

mm/internal.h

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1435,6 +1435,43 @@ void __meminit __init_single_page(struct page *page, unsigned long pfn,
14351435
unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
14361436
int priority);
14371437

1438+
#ifdef CONFIG_64BIT
1439+
/* VM is sealed, in vm_flags */
1440+
#define VM_SEALED _BITUL(63)
1441+
#endif
1442+
1443+
#ifdef CONFIG_64BIT
1444+
static inline int can_do_mseal(unsigned long flags)
1445+
{
1446+
if (flags)
1447+
return -EINVAL;
1448+
1449+
return 0;
1450+
}
1451+
1452+
bool can_modify_mm(struct mm_struct *mm, unsigned long start,
1453+
unsigned long end);
1454+
bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start,
1455+
unsigned long end, int behavior);
1456+
#else
1457+
static inline int can_do_mseal(unsigned long flags)
1458+
{
1459+
return -EPERM;
1460+
}
1461+
1462+
static inline bool can_modify_mm(struct mm_struct *mm, unsigned long start,
1463+
unsigned long end)
1464+
{
1465+
return true;
1466+
}
1467+
1468+
static inline bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start,
1469+
unsigned long end, int behavior)
1470+
{
1471+
return true;
1472+
}
1473+
#endif
1474+
14381475
#ifdef CONFIG_SHRINKER_DEBUG
14391476
static inline __printf(2, 0) int shrinker_debugfs_name_alloc(
14401477
struct shrinker *shrinker, const char *fmt, va_list ap)

mm/madvise.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1401,6 +1401,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
14011401
* -EIO - an I/O error occurred while paging in data.
14021402
* -EBADF - map exists, but area maps something that isn't a file.
14031403
* -EAGAIN - a kernel resource was temporarily unavailable.
1404+
* -EPERM - memory is sealed.
14041405
*/
14051406
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
14061407
{
@@ -1444,6 +1445,15 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
14441445
start = untagged_addr_remote(mm, start);
14451446
end = start + len;
14461447

1448+
/*
1449+
* Check if the address range is sealed for do_madvise().
1450+
* can_modify_mm_madv assumes we have acquired the lock on MM.
1451+
*/
1452+
if (unlikely(!can_modify_mm_madv(mm, start, end, behavior))) {
1453+
error = -EPERM;
1454+
goto out;
1455+
}
1456+
14471457
blk_start_plug(&plug);
14481458
switch (behavior) {
14491459
case MADV_POPULATE_READ:
@@ -1456,6 +1466,8 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
14561466
break;
14571467
}
14581468
blk_finish_plug(&plug);
1469+
1470+
out:
14591471
if (write)
14601472
mmap_write_unlock(mm);
14611473
else

mm/mmap.c

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1255,6 +1255,16 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
12551255
if (mm->map_count > sysctl_max_map_count)
12561256
return -ENOMEM;
12571257

1258+
/*
1259+
* addr is returned from get_unmapped_area,
1260+
* There are two cases:
1261+
* 1> MAP_FIXED == false
1262+
* unallocated memory, no need to check sealing.
1263+
* 2> MAP_FIXED == true
1264+
* sealing is checked inside mmap_region when
1265+
* do_vmi_munmap is called.
1266+
*/
1267+
12581268
if (prot == PROT_EXEC) {
12591269
pkey = execute_only_pkey(mm);
12601270
if (pkey < 0)
@@ -2727,6 +2737,14 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
27272737
if (end == start)
27282738
return -EINVAL;
27292739

2740+
/*
2741+
* Check if memory is sealed before arch_unmap.
2742+
* Prevent unmapping a sealed VMA.
2743+
* can_modify_mm assumes we have acquired the lock on MM.
2744+
*/
2745+
if (unlikely(!can_modify_mm(mm, start, end)))
2746+
return -EPERM;
2747+
27302748
/* arch_unmap() might do unmaps itself. */
27312749
arch_unmap(mm, start, end);
27322750

@@ -2789,7 +2807,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
27892807
}
27902808

27912809
/* Unmap any existing mapping in the area */
2792-
if (do_vmi_munmap(&vmi, mm, addr, len, uf, false))
2810+
error = do_vmi_munmap(&vmi, mm, addr, len, uf, false);
2811+
if (error == -EPERM)
2812+
return error;
2813+
else if (error)
27932814
return -ENOMEM;
27942815

27952816
/*
@@ -3139,6 +3160,14 @@ int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
31393160
{
31403161
struct mm_struct *mm = vma->vm_mm;
31413162

3163+
/*
3164+
* Check if memory is sealed before arch_unmap.
3165+
* Prevent unmapping a sealed VMA.
3166+
* can_modify_mm assumes we have acquired the lock on MM.
3167+
*/
3168+
if (unlikely(!can_modify_mm(mm, start, end)))
3169+
return -EPERM;
3170+
31423171
arch_unmap(mm, start, end);
31433172
return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
31443173
}

mm/mprotect.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include <linux/sched/sysctl.h>
3333
#include <linux/userfaultfd_k.h>
3434
#include <linux/memory-tiers.h>
35+
#include <uapi/linux/mman.h>
3536
#include <asm/cacheflush.h>
3637
#include <asm/mmu_context.h>
3738
#include <asm/tlbflush.h>
@@ -744,6 +745,15 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
744745
}
745746
}
746747

748+
/*
749+
* checking if memory is sealed.
750+
* can_modify_mm assumes we have acquired the lock on MM.
751+
*/
752+
if (unlikely(!can_modify_mm(current->mm, start, end))) {
753+
error = -EPERM;
754+
goto out;
755+
}
756+
747757
prev = vma_prev(&vmi);
748758
if (start > vma->vm_start)
749759
prev = vma;

mm/mremap.c

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -902,7 +902,25 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
902902
if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
903903
return -ENOMEM;
904904

905+
/*
906+
* In mremap_to().
907+
* Move a VMA to another location, check if src addr is sealed.
908+
*
909+
* Place can_modify_mm here because mremap_to()
910+
* does its own checking for address range, and we only
911+
* check the sealing after passing those checks.
912+
*
913+
* can_modify_mm assumes we have acquired the lock on MM.
914+
*/
915+
if (unlikely(!can_modify_mm(mm, addr, addr + old_len)))
916+
return -EPERM;
917+
905918
if (flags & MREMAP_FIXED) {
919+
/*
920+
* In mremap_to().
921+
* VMA is moved to dst address, and munmap dst first.
922+
* do_munmap will check if dst is sealed.
923+
*/
906924
ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
907925
if (ret)
908926
goto out;
@@ -1061,6 +1079,19 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
10611079
goto out;
10621080
}
10631081

1082+
/*
1083+
* Below is shrink/expand case (not mremap_to())
1084+
* Check if src address is sealed, if so, reject.
1085+
* In other words, prevent shrinking or expanding a sealed VMA.
1086+
*
1087+
* Place can_modify_mm here so we can keep the logic related to
1088+
* shrink/expand together.
1089+
*/
1090+
if (unlikely(!can_modify_mm(mm, addr, addr + old_len))) {
1091+
ret = -EPERM;
1092+
goto out;
1093+
}
1094+
10641095
/*
10651096
* Always allow a shrinking remap: that just unmaps
10661097
* the unnecessary pages..

0 commit comments

Comments
 (0)