Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
208 changes: 208 additions & 0 deletions resources/hiding_ci/patches/0002-mm-introduce-AS_NO_DIRECT_MAP.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
From 138b7a4c83c43b42851cb8fec2bbdbaadd960241 Mon Sep 17 00:00:00 2001
From: Patrick Roy <[email protected]>
Date: Fri, 7 Feb 2025 11:16:06 +0000
Subject: [PATCH 1/2] mm: introduce AS_NO_DIRECT_MAP

Add AS_NO_DIRECT_MAP for mappings where direct map entries of folios are
set to not present. Currently, mappings that match this description are
secretmem mappings (memfd_secret()). Later, some guest_memfd
configurations will also fall into this category.

Reject this new type of mappings in all locations that currently reject
secretmem mappings, on the assumption that if secretmem mappings are
rejected somewhere, it is precisely because of an inability to deal with
folios without direct map entries, and then make memfd_secret() use
AS_NO_DIRECT_MAP on its address_space to drop its special
vma_is_secretmem()/secretmem_mapping() checks.

This drops an optimization in gup_fast_folio_allowed() where
secretmem_mapping() was only called if CONFIG_SECRETMEM=y. secretmem is
enabled by default since commit b758fe6df50d ("mm/secretmem: make it on
by default"), so the secretmem check did not actually end up elided in
most cases anymore anyway.

Use a new flag instead of overloading AS_INACCESSIBLE (which is already
set by guest_memfd) because not all guest_memfd mappings will end up
being direct map removed (e.g. in pKVM setups, parts of guest_memfd that
can be mapped to userspace should also be GUP-able, and generally not
have restrictions on who can access it).

Signed-off-by: Patrick Roy <[email protected]>
---
include/linux/pagemap.h | 16 ++++++++++++++++
include/linux/secretmem.h | 18 ------------------
lib/buildid.c | 4 ++--
mm/gup.c | 14 +++-----------
mm/mlock.c | 2 +-
mm/secretmem.c | 6 +-----
6 files changed, 23 insertions(+), 37 deletions(-)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 47bfc6b1b632..903b41e89cf8 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -210,6 +210,7 @@ enum mapping_flags {
AS_STABLE_WRITES = 7, /* must wait for writeback before modifying
folio contents */
AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */
+ AS_NO_DIRECT_MAP = 9, /* Folios in the mapping are not in the direct map */
/* Bits 16-25 are used for FOLIO_ORDER */
AS_FOLIO_ORDER_BITS = 5,
AS_FOLIO_ORDER_MIN = 16,
@@ -335,6 +336,21 @@ static inline bool mapping_inaccessible(struct address_space *mapping)
return test_bit(AS_INACCESSIBLE, &mapping->flags);
}

+static inline void mapping_set_no_direct_map(struct address_space *mapping)
+{
+ set_bit(AS_NO_DIRECT_MAP, &mapping->flags);
+}
+
+static inline bool mapping_no_direct_map(struct address_space *mapping)
+{
+ return test_bit(AS_NO_DIRECT_MAP, &mapping->flags);
+}
+
+static inline bool vma_is_no_direct_map(const struct vm_area_struct *vma)
+{
+ return vma->vm_file && mapping_no_direct_map(vma->vm_file->f_mapping);
+}
+
static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
{
return mapping->gfp_mask;
diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h
index e918f96881f5..0ae1fb057b3d 100644
--- a/include/linux/secretmem.h
+++ b/include/linux/secretmem.h
@@ -4,28 +4,10 @@

#ifdef CONFIG_SECRETMEM

-extern const struct address_space_operations secretmem_aops;
-
-static inline bool secretmem_mapping(struct address_space *mapping)
-{
- return mapping->a_ops == &secretmem_aops;
-}
-
-bool vma_is_secretmem(struct vm_area_struct *vma);
bool secretmem_active(void);

#else

-static inline bool vma_is_secretmem(struct vm_area_struct *vma)
-{
- return false;
-}
-
-static inline bool secretmem_mapping(struct address_space *mapping)
-{
- return false;
-}
-
static inline bool secretmem_active(void)
{
return false;
diff --git a/lib/buildid.c b/lib/buildid.c
index c4b0f376fb34..33f173a607ad 100644
--- a/lib/buildid.c
+++ b/lib/buildid.c
@@ -65,8 +65,8 @@ static int freader_get_folio(struct freader *r, loff_t file_off)

freader_put_folio(r);

- /* reject secretmem folios created with memfd_secret() */
- if (secretmem_mapping(r->file->f_mapping))
+ /* reject secretmem folios created with memfd_secret() or guest_memfd() */
+ if (mapping_no_direct_map(r->file->f_mapping))
return -EFAULT;

r->folio = filemap_get_folio(r->file->f_mapping, file_off >> PAGE_SHIFT);
diff --git a/mm/gup.c b/mm/gup.c
index 3883b307780e..b1483a876740 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1283,7 +1283,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
return -EOPNOTSUPP;

- if (vma_is_secretmem(vma))
+ if (vma_is_no_direct_map(vma))
return -EFAULT;

if (write) {
@@ -2786,7 +2786,6 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
{
bool reject_file_backed = false;
struct address_space *mapping;
- bool check_secretmem = false;
unsigned long mapping_flags;

/*
@@ -2798,14 +2797,6 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
reject_file_backed = true;

/* We hold a folio reference, so we can safely access folio fields. */
-
- /* secretmem folios are always order-0 folios. */
- if (IS_ENABLED(CONFIG_SECRETMEM) && !folio_test_large(folio))
- check_secretmem = true;
-
- if (!reject_file_backed && !check_secretmem)
- return true;
-
if (WARN_ON_ONCE(folio_test_slab(folio)))
return false;

@@ -2847,8 +2838,9 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
* At this point, we know the mapping is non-null and points to an
* address_space object.
*/
- if (check_secretmem && secretmem_mapping(mapping))
+ if (mapping_no_direct_map(mapping))
return false;
+
/* The only remaining allowed file system is shmem. */
return !reject_file_backed || shmem_mapping(mapping);
}
diff --git a/mm/mlock.c b/mm/mlock.c
index cde076fa7d5e..a43f308be70d 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -474,7 +474,7 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,

if (newflags == oldflags || (oldflags & VM_SPECIAL) ||
is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
- vma_is_dax(vma) || vma_is_secretmem(vma) || (oldflags & VM_DROPPABLE))
+ vma_is_dax(vma) || vma_is_no_direct_map(vma) || (oldflags & VM_DROPPABLE))
/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
goto out;

diff --git a/mm/secretmem.c b/mm/secretmem.c
index 1b0a214ee558..ea4c04d469b1 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -136,11 +136,6 @@ static int secretmem_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}

-bool vma_is_secretmem(struct vm_area_struct *vma)
-{
- return vma->vm_ops == &secretmem_vm_ops;
-}
-
static const struct file_operations secretmem_fops = {
.release = secretmem_release,
.mmap = secretmem_mmap,
@@ -214,6 +209,7 @@ static struct file *secretmem_file_create(unsigned long flags)

mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
mapping_set_unevictable(inode->i_mapping);
+ mapping_set_no_direct_map(inode->i_mapping);

inode->i_op = &secretmem_iops;
inode->i_mapping->a_ops = &secretmem_aops;
--
2.48.1

Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
From 9bbc39f9c7622f0060d395b1063a564c24926d8d Mon Sep 17 00:00:00 2001
From: Patrick Roy <[email protected]>
Date: Fri, 7 Feb 2025 14:33:01 +0000
Subject: [PATCH 2/2] KVM: guest_memfd: Add flag to remove from direct map

Add KVM_GMEM_NO_DIRECT_MAP flag for KVM_CREATE_GUEST_MEMFD() ioctl. When
set, guest_memfd folios will be removed from the direct map after
preparation, with direct map entries only restored when the folios are
freed.

To ensure these folios do not end up in places where the kernel cannot
deal with them, set AS_NO_DIRECT_MAP on the guest_memfd's struct
address_space if KVM_GMEM_NO_DIRECT_MAP is requested.

Add KVM_CAP_GMEM_NO_DIRECT_MAP to let userspace discover whether
guest_memfd supports KVM_GMEM_NO_DIRECT_MAP. Support depends on
guest_memfd itself being supported, but also on whether KVM can
manipulate the direct map at page granularity at all (possible most of
the time, just arm64 is a notable outlier where it's impossible if the
direct map has been setup using hugepages, as arm64 cannot break these
apart due to break-before-make semantics).

Note that this flag causes removal of direct map entries for all
guest_memfd folios independent of whether they are "shared" or "private"
(although current guest_memfd only supports either all folios in the
"shared" state, or all folios in the "private" state if
!IS_ENABLED(CONFIG_KVM_GMEM_SHARED_MEM)). The use case for also removing
direct map entries of the shared parts of guest_memfd is a special
type of non-CoCo VM where host userspace is trusted to have access to
all of guest memory, but where Spectre-style transient execution attacks
through the host kernel's direct map should still be mitigated.

Note that KVM retains access to guest memory via userspace
mappings of guest_memfd, which are reflected back into KVM's memslots
via userspace_addr. This is needed for things like MMIO emulation on
x86_64 to work. Previous iterations attempted to instead have KVM
temporarily restore direct map entries whenever such an access to guest
memory was needed, but this turned out to have a significant performance
impact, as well as additional complexity due to needing to refcount
direct map reinsertion operations and making them play nicely with gmem
truncations.

This iteration also doesn't have KVM perform TLB flushes after direct
map manipulations. This is because TLB flushes resulted in an up to 40x
elongation of page faults in guest_memfd (scaling with the number of CPU
cores), or a 5x elongation of memory population. On the one hand, TLB
flushes are not needed for functional correctness (the virt->phys
mapping technically stays "correct", the kernel should simply not use it
for a while), so this is a correct optimization to make. On the other
hand, it means that the desired protection from Spectre-style attacks is
not perfect, as an attacker could try to prevent a stale TLB entry from
getting evicted, keeping it alive until the page it refers to is used by
the guest for some sensitive data, and then targeting it using a
spectre-gadget.

Signed-off-by: Patrick Roy <[email protected]>
---
include/uapi/linux/kvm.h | 3 +++
virt/kvm/guest_memfd.c | 28 +++++++++++++++++++++++++++-
virt/kvm/kvm_main.c | 5 +++++
3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 117937a895da..fb02a93546d8 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -930,6 +930,7 @@ struct kvm_enable_cap {
#define KVM_CAP_X86_APIC_BUS_CYCLES_NS 237
#define KVM_CAP_X86_GUEST_MODE 238
#define KVM_CAP_GMEM_SHARED_MEM 239
+#define KVM_CAP_GMEM_NO_DIRECT_MAP 240

struct kvm_irq_routing_irqchip {
__u32 irqchip;
@@ -1573,6 +1574,8 @@ struct kvm_create_guest_memfd {
__u64 reserved[6];
};

+#define KVM_GMEM_NO_DIRECT_MAP (1ULL << 0)
+
#define KVM_PRE_FAULT_MEMORY _IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory)

struct kvm_pre_fault_memory {
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index fbf89e643add..a2b96bc51391 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -4,6 +4,7 @@
#include <linux/kvm_host.h>
#include <linux/pagemap.h>
#include <linux/anon_inodes.h>
+#include <linux/set_memory.h>

#include "kvm_mm.h"

@@ -50,8 +51,23 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
return 0;
}

+static bool kvm_gmem_test_no_direct_map(struct inode *inode)
+{
+ return ((unsigned long) inode->i_private) & KVM_GMEM_NO_DIRECT_MAP;
+}
+
static inline void kvm_gmem_mark_prepared(struct folio *folio)
{
+ struct inode *inode = folio_inode(folio);
+
+ if (kvm_gmem_test_no_direct_map(inode)) {
+ int r = set_direct_map_valid_noflush(folio_page(folio, 0), folio_nr_pages(folio),
+ false);
+
+ if (!r)
+ folio_set_private(folio);
+ }
+
folio_mark_uptodate(folio);
}

@@ -478,6 +494,10 @@ static void kvm_gmem_free_folio(struct folio *folio)
kvm_pfn_t pfn = page_to_pfn(page);
int order = folio_order(folio);

+ if (folio_test_private(folio))
+ WARN_ON_ONCE(set_direct_map_valid_noflush(folio_page(folio, 0),
+ folio_nr_pages(folio), true));
+
kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
}
#endif
@@ -551,6 +571,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
/* Unmovable mappings are supposed to be marked unevictable as well. */
WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));

+ if (flags & KVM_GMEM_NO_DIRECT_MAP)
+ mapping_set_no_direct_map(inode->i_mapping);
+
kvm_get_kvm(kvm);
gmem->kvm = kvm;
xa_init(&gmem->bindings);
@@ -570,7 +593,10 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
{
loff_t size = args->size;
u64 flags = args->flags;
- u64 valid_flags = 0;
+ u64 valid_flags = KVM_GMEM_NO_DIRECT_MAP;
+
+ if (!can_set_direct_map())
+ valid_flags &= ~KVM_GMEM_NO_DIRECT_MAP;

if (flags & ~valid_flags)
return -EINVAL;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 3e40acb9f5c0..32ca1c921ab0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -65,6 +65,7 @@
#include <trace/events/kvm.h>

#include <linux/kvm_dirty_ring.h>
+#include <linux/set_memory.h>


/* Worst case buffer size needed for holding an integer. */
@@ -4823,6 +4824,10 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
return kvm_supported_mem_attributes(kvm);
#endif
#ifdef CONFIG_KVM_PRIVATE_MEM
+ case KVM_CAP_GMEM_NO_DIRECT_MAP:
+ if (!can_set_direct_map())
+ return false;
+ fallthrough;
case KVM_CAP_GUEST_MEMFD:
return !kvm || kvm_arch_has_private_mem(kvm);
#endif
--
2.48.1