Skip to content

Commit 0bb84fe

Browse files
authored
Merge branch 'feature/secret-hiding' into feature/al2023_kernel_install
2 parents c0cb176 + 002b675 commit 0bb84fe

File tree

3 files changed

+389
-1
lines changed

3 files changed

+389
-1
lines changed

.buildkite/pipeline_pr.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,9 @@
6868
for step in kani_grp["steps"]:
6969
step["label"] = "🔍 Kani"
7070

71-
if not changed_files or (any(x.parent.name == "hiding_ci" for x in changed_files)):
71+
if not changed_files or (
72+
any(parent.name == "hiding_ci" for x in changed_files for parent in x.parents)
73+
):
7274
pipeline.build_group_per_arch(
7375
"🕵️ Build Secret Hiding Kernel",
7476
pipeline.devtool_test(
Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
From 138b7a4c83c43b42851cb8fec2bbdbaadd960241 Mon Sep 17 00:00:00 2001
2+
From: Patrick Roy <[email protected]>
3+
Date: Fri, 7 Feb 2025 11:16:06 +0000
4+
Subject: [PATCH 1/2] mm: introduce AS_NO_DIRECT_MAP
5+
6+
Add AS_NO_DIRECT_MAP for mappings where direct map entries of folios are
7+
set to not present. Currently, mappings that match this description are
8+
secretmem mappings (memfd_secret()). Later, some guest_memfd
9+
configurations will also fall into this category.
10+
11+
Reject this new type of mappings in all locations that currently reject
12+
secretmem mappings, on the assumption that if secretmem mappings are
13+
rejected somewhere, it is precisely because of an inability to deal with
14+
folios without direct map entries, and then make memfd_secret() use
15+
AS_NO_DIRECT_MAP on its address_space to drop its special
16+
vma_is_secretmem()/secretmem_mapping() checks.
17+
18+
This drops an optimization in gup_fast_folio_allowed() where
19+
secretmem_mapping() was only called if CONFIG_SECRETMEM=y. secretmem is
20+
enabled by default since commit b758fe6df50d ("mm/secretmem: make it on
21+
by default"), so the secretmem check did not actually end up elided in
22+
most cases anymore anyway.
23+
24+
Use a new flag instead of overloading AS_INACCESSIBLE (which is already
25+
set by guest_memfd) because not all guest_memfd mappings will end up
26+
being direct map removed (e.g. in pKVM setups, parts of guest_memfd that
27+
can be mapped to userspace should also be GUP-able, and generally not
28+
have restrictions on who can access it).
29+
30+
Signed-off-by: Patrick Roy <[email protected]>
31+
---
32+
include/linux/pagemap.h | 16 ++++++++++++++++
33+
include/linux/secretmem.h | 18 ------------------
34+
lib/buildid.c | 4 ++--
35+
mm/gup.c | 14 +++-----------
36+
mm/mlock.c | 2 +-
37+
mm/secretmem.c | 6 +-----
38+
6 files changed, 23 insertions(+), 37 deletions(-)
39+
40+
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
41+
index 47bfc6b1b632..903b41e89cf8 100644
42+
--- a/include/linux/pagemap.h
43+
+++ b/include/linux/pagemap.h
44+
@@ -210,6 +210,7 @@ enum mapping_flags {
45+
AS_STABLE_WRITES = 7, /* must wait for writeback before modifying
46+
folio contents */
47+
AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */
48+
+ AS_NO_DIRECT_MAP = 9, /* Folios in the mapping are not in the direct map */
49+
/* Bits 16-25 are used for FOLIO_ORDER */
50+
AS_FOLIO_ORDER_BITS = 5,
51+
AS_FOLIO_ORDER_MIN = 16,
52+
@@ -335,6 +336,21 @@ static inline bool mapping_inaccessible(struct address_space *mapping)
53+
return test_bit(AS_INACCESSIBLE, &mapping->flags);
54+
}
55+
56+
+static inline void mapping_set_no_direct_map(struct address_space *mapping)
57+
+{
58+
+ set_bit(AS_NO_DIRECT_MAP, &mapping->flags);
59+
+}
60+
+
61+
+static inline bool mapping_no_direct_map(struct address_space *mapping)
62+
+{
63+
+ return test_bit(AS_NO_DIRECT_MAP, &mapping->flags);
64+
+}
65+
+
66+
+static inline bool vma_is_no_direct_map(const struct vm_area_struct *vma)
67+
+{
68+
+ return vma->vm_file && mapping_no_direct_map(vma->vm_file->f_mapping);
69+
+}
70+
+
71+
static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
72+
{
73+
return mapping->gfp_mask;
74+
diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h
75+
index e918f96881f5..0ae1fb057b3d 100644
76+
--- a/include/linux/secretmem.h
77+
+++ b/include/linux/secretmem.h
78+
@@ -4,28 +4,10 @@
79+
80+
#ifdef CONFIG_SECRETMEM
81+
82+
-extern const struct address_space_operations secretmem_aops;
83+
-
84+
-static inline bool secretmem_mapping(struct address_space *mapping)
85+
-{
86+
- return mapping->a_ops == &secretmem_aops;
87+
-}
88+
-
89+
-bool vma_is_secretmem(struct vm_area_struct *vma);
90+
bool secretmem_active(void);
91+
92+
#else
93+
94+
-static inline bool vma_is_secretmem(struct vm_area_struct *vma)
95+
-{
96+
- return false;
97+
-}
98+
-
99+
-static inline bool secretmem_mapping(struct address_space *mapping)
100+
-{
101+
- return false;
102+
-}
103+
-
104+
static inline bool secretmem_active(void)
105+
{
106+
return false;
107+
diff --git a/lib/buildid.c b/lib/buildid.c
108+
index c4b0f376fb34..33f173a607ad 100644
109+
--- a/lib/buildid.c
110+
+++ b/lib/buildid.c
111+
@@ -65,8 +65,8 @@ static int freader_get_folio(struct freader *r, loff_t file_off)
112+
113+
freader_put_folio(r);
114+
115+
- /* reject secretmem folios created with memfd_secret() */
116+
- if (secretmem_mapping(r->file->f_mapping))
117+
+ /* reject secretmem folios created with memfd_secret() or guest_memfd() */
118+
+ if (mapping_no_direct_map(r->file->f_mapping))
119+
return -EFAULT;
120+
121+
r->folio = filemap_get_folio(r->file->f_mapping, file_off >> PAGE_SHIFT);
122+
diff --git a/mm/gup.c b/mm/gup.c
123+
index 3883b307780e..b1483a876740 100644
124+
--- a/mm/gup.c
125+
+++ b/mm/gup.c
126+
@@ -1283,7 +1283,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
127+
if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
128+
return -EOPNOTSUPP;
129+
130+
- if (vma_is_secretmem(vma))
131+
+ if (vma_is_no_direct_map(vma))
132+
return -EFAULT;
133+
134+
if (write) {
135+
@@ -2786,7 +2786,6 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
136+
{
137+
bool reject_file_backed = false;
138+
struct address_space *mapping;
139+
- bool check_secretmem = false;
140+
unsigned long mapping_flags;
141+
142+
/*
143+
@@ -2798,14 +2797,6 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
144+
reject_file_backed = true;
145+
146+
/* We hold a folio reference, so we can safely access folio fields. */
147+
-
148+
- /* secretmem folios are always order-0 folios. */
149+
- if (IS_ENABLED(CONFIG_SECRETMEM) && !folio_test_large(folio))
150+
- check_secretmem = true;
151+
-
152+
- if (!reject_file_backed && !check_secretmem)
153+
- return true;
154+
-
155+
if (WARN_ON_ONCE(folio_test_slab(folio)))
156+
return false;
157+
158+
@@ -2847,8 +2838,9 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
159+
* At this point, we know the mapping is non-null and points to an
160+
* address_space object.
161+
*/
162+
- if (check_secretmem && secretmem_mapping(mapping))
163+
+ if (mapping_no_direct_map(mapping))
164+
return false;
165+
+
166+
/* The only remaining allowed file system is shmem. */
167+
return !reject_file_backed || shmem_mapping(mapping);
168+
}
169+
diff --git a/mm/mlock.c b/mm/mlock.c
170+
index cde076fa7d5e..a43f308be70d 100644
171+
--- a/mm/mlock.c
172+
+++ b/mm/mlock.c
173+
@@ -474,7 +474,7 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
174+
175+
if (newflags == oldflags || (oldflags & VM_SPECIAL) ||
176+
is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
177+
- vma_is_dax(vma) || vma_is_secretmem(vma) || (oldflags & VM_DROPPABLE))
178+
+ vma_is_dax(vma) || vma_is_no_direct_map(vma) || (oldflags & VM_DROPPABLE))
179+
/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
180+
goto out;
181+
182+
diff --git a/mm/secretmem.c b/mm/secretmem.c
183+
index 1b0a214ee558..ea4c04d469b1 100644
184+
--- a/mm/secretmem.c
185+
+++ b/mm/secretmem.c
186+
@@ -136,11 +136,6 @@ static int secretmem_mmap(struct file *file, struct vm_area_struct *vma)
187+
return 0;
188+
}
189+
190+
-bool vma_is_secretmem(struct vm_area_struct *vma)
191+
-{
192+
- return vma->vm_ops == &secretmem_vm_ops;
193+
-}
194+
-
195+
static const struct file_operations secretmem_fops = {
196+
.release = secretmem_release,
197+
.mmap = secretmem_mmap,
198+
@@ -214,6 +209,7 @@ static struct file *secretmem_file_create(unsigned long flags)
199+
200+
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
201+
mapping_set_unevictable(inode->i_mapping);
202+
+ mapping_set_no_direct_map(inode->i_mapping);
203+
204+
inode->i_op = &secretmem_iops;
205+
inode->i_mapping->a_ops = &secretmem_aops;
206+
--
207+
2.48.1
208+
Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
From 9bbc39f9c7622f0060d395b1063a564c24926d8d Mon Sep 17 00:00:00 2001
2+
From: Patrick Roy <[email protected]>
3+
Date: Fri, 7 Feb 2025 14:33:01 +0000
4+
Subject: [PATCH 2/2] KVM: guest_memfd: Add flag to remove from direct map
5+
6+
Add KVM_GMEM_NO_DIRECT_MAP flag for KVM_CREATE_GUEST_MEMFD() ioctl. When
7+
set, guest_memfd folios will be removed from the direct map after
8+
preparation, with direct map entries only restored when the folios are
9+
freed.
10+
11+
To ensure these folios do not end up in places where the kernel cannot
12+
deal with them, set AS_NO_DIRECT_MAP on the guest_memfd's struct
13+
address_space if KVM_GMEM_NO_DIRECT_MAP is requested.
14+
15+
Add KVM_CAP_GMEM_NO_DIRECT_MAP to let userspace discover whether
16+
guest_memfd supports KVM_GMEM_NO_DIRECT_MAP. Support depends on
17+
guest_memfd itself being supported, but also on whether KVM can
18+
manipulate the direct map at page granularity at all (possible most of
19+
the time, just arm64 is a notable outlier where it's impossible if the
20+
direct map has been setup using hugepages, as arm64 cannot break these
21+
apart due to break-before-make semantics).
22+
23+
Note that this flag causes removal of direct map entries for all
24+
guest_memfd folios independent of whether they are "shared" or "private"
25+
(although current guest_memfd only supports either all folios in the
26+
"shared" state, or all folios in the "private" state if
27+
!IS_ENABLED(CONFIG_KVM_GMEM_SHARED_MEM)). The use case for removing
28+
direct map entries of even the shared parts of guest_memfd is a special
29+
type of non-CoCo VM where host userspace is trusted to have access to
30+
all of guest memory, but where Spectre-style transient execution attacks
31+
through the host kernel's direct map should still be mitigated.
32+
33+
Note that KVM retains access to guest memory via userspace
34+
mappings of guest_memfd, which are reflected back into KVM's memslots
35+
via userspace_addr. This is needed for things like MMIO emulation on
36+
x86_64 to work. Previous iterations attempted to instead have KVM
37+
temporarily restore direct map entries whenever such an access to guest
38+
memory was needed, but this turned out to have a significant performance
39+
impact, as well as additional complexity due to needing to refcount
40+
direct map reinsertion operations and making them play nicely with gmem
41+
truncations.
42+
43+
This iteration also doesn't have KVM perform TLB flushes after direct
44+
map manipulations. This is because TLB flushes resulted in an up to 40x
45+
elongation of page faults in guest_memfd (scaling with the number of CPU
46+
cores), or a 5x elongation of memory population. On the one hand, TLB
47+
flushes are not needed for functional correctness (the virt->phys
48+
mapping technically stays "correct", the kernel should simply not use it
49+
for a while), so this is a correct optimization to make. On the other
50+
hand, it means that the desired protection from Spectre-style attacks is
51+
not perfect, as an attacker could try to prevent a stale TLB entry from
52+
getting evicted, keeping it alive until the page it refers to is used by
53+
the guest for some sensitive data, and then targeting it using a
54+
spectre-gadget.
55+
56+
Signed-off-by: Patrick Roy <[email protected]>
57+
---
58+
include/uapi/linux/kvm.h | 3 +++
59+
virt/kvm/guest_memfd.c | 28 +++++++++++++++++++++++++++-
60+
virt/kvm/kvm_main.c | 5 +++++
61+
3 files changed, 35 insertions(+), 1 deletion(-)
62+
63+
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
64+
index 117937a895da..fb02a93546d8 100644
65+
--- a/include/uapi/linux/kvm.h
66+
+++ b/include/uapi/linux/kvm.h
67+
@@ -930,6 +930,7 @@ struct kvm_enable_cap {
68+
#define KVM_CAP_X86_APIC_BUS_CYCLES_NS 237
69+
#define KVM_CAP_X86_GUEST_MODE 238
70+
#define KVM_CAP_GMEM_SHARED_MEM 239
71+
+#define KVM_CAP_GMEM_NO_DIRECT_MAP 240
72+
73+
struct kvm_irq_routing_irqchip {
74+
__u32 irqchip;
75+
@@ -1573,6 +1574,8 @@ struct kvm_create_guest_memfd {
76+
__u64 reserved[6];
77+
};
78+
79+
+#define KVM_GMEM_NO_DIRECT_MAP (1ULL << 0)
80+
+
81+
#define KVM_PRE_FAULT_MEMORY _IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory)
82+
83+
struct kvm_pre_fault_memory {
84+
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
85+
index fbf89e643add..a2b96bc51391 100644
86+
--- a/virt/kvm/guest_memfd.c
87+
+++ b/virt/kvm/guest_memfd.c
88+
@@ -4,6 +4,7 @@
89+
#include <linux/kvm_host.h>
90+
#include <linux/pagemap.h>
91+
#include <linux/anon_inodes.h>
92+
+#include <linux/set_memory.h>
93+
94+
#include "kvm_mm.h"
95+
96+
@@ -50,8 +51,23 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
97+
return 0;
98+
}
99+
100+
+static bool kvm_gmem_test_no_direct_map(struct inode *inode)
101+
+{
102+
+ return ((unsigned long) inode->i_private) & KVM_GMEM_NO_DIRECT_MAP;
103+
+}
104+
+
105+
static inline void kvm_gmem_mark_prepared(struct folio *folio)
106+
{
107+
+ struct inode *inode = folio_inode(folio);
108+
+
109+
+ if (kvm_gmem_test_no_direct_map(inode)) {
110+
+ int r = set_direct_map_valid_noflush(folio_page(folio, 0), folio_nr_pages(folio),
111+
+ false);
112+
+
113+
+ if (!r)
114+
+ folio_set_private(folio);
115+
+ }
116+
+
117+
folio_mark_uptodate(folio);
118+
}
119+
120+
@@ -478,6 +494,10 @@ static void kvm_gmem_free_folio(struct folio *folio)
121+
kvm_pfn_t pfn = page_to_pfn(page);
122+
int order = folio_order(folio);
123+
124+
+ if (folio_test_private(folio))
125+
+ WARN_ON_ONCE(set_direct_map_valid_noflush(folio_page(folio, 0),
126+
+ folio_nr_pages(folio), true));
127+
+
128+
kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
129+
}
130+
#endif
131+
@@ -551,6 +571,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
132+
/* Unmovable mappings are supposed to be marked unevictable as well. */
133+
WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
134+
135+
+ if (flags & KVM_GMEM_NO_DIRECT_MAP)
136+
+ mapping_set_no_direct_map(inode->i_mapping);
137+
+
138+
kvm_get_kvm(kvm);
139+
gmem->kvm = kvm;
140+
xa_init(&gmem->bindings);
141+
@@ -570,7 +593,10 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
142+
{
143+
loff_t size = args->size;
144+
u64 flags = args->flags;
145+
- u64 valid_flags = 0;
146+
+ u64 valid_flags = KVM_GMEM_NO_DIRECT_MAP;
147+
+
148+
+ if (!can_set_direct_map())
149+
+ valid_flags &= ~KVM_GMEM_NO_DIRECT_MAP;
150+
151+
if (flags & ~valid_flags)
152+
return -EINVAL;
153+
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
154+
index 3e40acb9f5c0..32ca1c921ab0 100644
155+
--- a/virt/kvm/kvm_main.c
156+
+++ b/virt/kvm/kvm_main.c
157+
@@ -65,6 +65,7 @@
158+
#include <trace/events/kvm.h>
159+
160+
#include <linux/kvm_dirty_ring.h>
161+
+#include <linux/set_memory.h>
162+
163+
164+
/* Worst case buffer size needed for holding an integer. */
165+
@@ -4823,6 +4824,10 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
166+
return kvm_supported_mem_attributes(kvm);
167+
#endif
168+
#ifdef CONFIG_KVM_PRIVATE_MEM
169+
+ case KVM_CAP_GMEM_NO_DIRECT_MAP:
170+
+ if (!can_set_direct_map())
171+
+ return false;
172+
+ fallthrough;
173+
case KVM_CAP_GUEST_MEMFD:
174+
return !kvm || kvm_arch_has_private_mem(kvm);
175+
#endif
176+
--
177+
2.48.1
178+

0 commit comments

Comments
 (0)