
Commit 3a7c8fa

x86/kvm: Restrict ASYNC_PF to user space
The async page fault injection into kernel space creates more problems
than it solves. The host has absolutely no knowledge about the state of
the guest if the fault happens in CPL0. The only restriction for the host
is the interrupt-disabled state. If interrupts are enabled in the guest,
then the exception can hit arbitrary code. The HALT-based wait in
non-preemptible code is a hacky replacement for a proper hypercall.

For the ongoing work to restrict instrumentation and make the RCU idle
interaction well defined, the extra work required to support async page
faults in CPL0 is just not justified and creates complexity for a dubious
benefit.

The CPL3 injection is well defined and does not cause any issues, as it
is more or less the same as a regular page fault from CPL3.

Suggested-by: Andy Lutomirski <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Reviewed-by: Alexandre Chartre <[email protected]>
Acked-by: Paolo Bonzini <[email protected]>
Acked-by: Peter Zijlstra <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
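Condensed from the diff below, the resulting guest-side policy in
__kvm_handle_async_pf() looks roughly like this (a sketch; context
outside the changed hunk is elided, not reconstructed):

	if (reason == KVM_PV_REASON_PAGE_NOT_PRESENT) {
		/* "Page not present" is now only acceptable in user mode. */
		if (unlikely(!user_mode(regs)))
			panic("Host injected async #PF in kernel mode\n");

		/* Page is swapped out by the host: sleep until it is ready. */
		kvm_async_pf_task_wait_schedule(token);
	} else {
		/* "Page ready" notification: wake the sleeping task. */
		rcu_irq_enter();
		kvm_async_pf_task_wake(token);
		/* ... */
	}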
1 parent 6bca69a commit 3a7c8fa

File tree: 1 file changed (+7, −93)

arch/x86/kernel/kvm.c

Lines changed: 7 additions & 93 deletions
@@ -75,7 +75,6 @@ struct kvm_task_sleep_node {
 	struct swait_queue_head wq;
 	u32 token;
 	int cpu;
-	bool use_halt;
 };
 
 static struct kvm_task_sleep_head {
@@ -98,8 +97,7 @@ static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
 	return NULL;
 }
 
-static bool kvm_async_pf_queue_task(u32 token, bool use_halt,
-				    struct kvm_task_sleep_node *n)
+static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n)
 {
 	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
 	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
@@ -117,7 +115,6 @@ static bool kvm_async_pf_queue_task(u32 token, bool use_halt,
 
 	n->token = token;
 	n->cpu = smp_processor_id();
-	n->use_halt = use_halt;
 	init_swait_queue_head(&n->wq);
 	hlist_add_head(&n->link, &b->list);
 	raw_spin_unlock(&b->lock);
@@ -138,7 +135,7 @@ void kvm_async_pf_task_wait_schedule(u32 token)
 
 	lockdep_assert_irqs_disabled();
 
-	if (!kvm_async_pf_queue_task(token, false, &n))
+	if (!kvm_async_pf_queue_task(token, &n))
 		return;
 
 	for (;;) {
@@ -154,91 +151,10 @@ void kvm_async_pf_task_wait_schedule(u32 token)
 }
 EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule);
 
-/*
- * Invoked from the async page fault handler.
- */
-static void kvm_async_pf_task_wait_halt(u32 token)
-{
-	struct kvm_task_sleep_node n;
-
-	if (!kvm_async_pf_queue_task(token, true, &n))
-		return;
-
-	for (;;) {
-		if (hlist_unhashed(&n.link))
-			break;
-		/*
-		 * No point in doing anything about RCU here. Any RCU read
-		 * side critical section or RCU watching section can be
-		 * interrupted by VMEXITs and the host is free to keep the
-		 * vCPU scheduled out as long as it sees fit. This is not
-		 * any different just because of the halt induced voluntary
-		 * VMEXIT.
-		 *
-		 * Also the async page fault could have interrupted any RCU
-		 * watching context, so invoking rcu_irq_exit()/enter()
-		 * around this is not gaining anything.
-		 */
-		native_safe_halt();
-		local_irq_disable();
-	}
-}
-
-/* Invoked from the async page fault handler */
-static void kvm_async_pf_task_wait(u32 token, bool usermode)
-{
-	bool can_schedule;
-
-	/*
-	 * No need to check whether interrupts were disabled because the
-	 * host will (hopefully) only inject an async page fault into
-	 * interrupt enabled regions.
-	 *
-	 * If CONFIG_PREEMPTION is enabled then check whether the code
-	 * which triggered the page fault is preemptible. This covers user
-	 * mode as well because preempt_count() is obviously 0 there.
-	 *
-	 * The check for rcu_preempt_depth() is also required because
-	 * voluntary scheduling inside a rcu read locked section is not
-	 * allowed.
-	 *
-	 * The idle task is already covered by this because idle always
-	 * has a preempt count > 0.
-	 *
-	 * If CONFIG_PREEMPTION is disabled only allow scheduling when
-	 * coming from user mode as there is no indication whether the
-	 * context which triggered the page fault could schedule or not.
-	 */
-	if (IS_ENABLED(CONFIG_PREEMPTION))
-		can_schedule = preempt_count() + rcu_preempt_depth() == 0;
-	else
-		can_schedule = usermode;
-
-	/*
-	 * If the kernel context is allowed to schedule then RCU is
-	 * watching because no preemptible code in the kernel is inside RCU
-	 * idle state. So it can be treated like user mode. User mode is
-	 * safe because the #PF entry invoked enter_from_user_mode().
-	 *
-	 * For the non schedulable case invoke rcu_irq_enter() for
-	 * now. This will be moved out to the pagefault entry code later
-	 * and only invoked when really needed.
-	 */
-	if (can_schedule) {
-		kvm_async_pf_task_wait_schedule(token);
-	} else {
-		rcu_irq_enter();
-		kvm_async_pf_task_wait_halt(token);
-		rcu_irq_exit();
-	}
-}
-
 static void apf_task_wake_one(struct kvm_task_sleep_node *n)
 {
 	hlist_del_init(&n->link);
-	if (n->use_halt)
-		smp_send_reschedule(n->cpu);
-	else if (swq_has_sleeper(&n->wq))
+	if (swq_has_sleeper(&n->wq))
 		swake_up_one(&n->wq);
 }
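For context (not part of this diff), the surviving schedule-based wait
path reads roughly as follows; a sketch reconstructed around the
fragments visible in the hunks above, so details may differ from the
actual tree:

	void kvm_async_pf_task_wait_schedule(u32 token)
	{
		struct kvm_task_sleep_node n;
		DECLARE_SWAITQUEUE(wait);

		lockdep_assert_irqs_disabled();

		/* Returns false if the host already reported "page ready". */
		if (!kvm_async_pf_queue_task(token, &n))
			return;

		for (;;) {
			prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
			if (hlist_unhashed(&n.link))
				break;

			/* Sleep until apf_task_wake_one() removes the node. */
			local_irq_enable();
			schedule();
			local_irq_disable();
		}
		finish_swait(&n.wq, &wait);
	}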

@@ -337,8 +253,10 @@ bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
 		panic("Host injected async #PF in interrupt disabled region\n");
 
 	if (reason == KVM_PV_REASON_PAGE_NOT_PRESENT) {
-		/* page is swapped out by the host. */
-		kvm_async_pf_task_wait(token, user_mode(regs));
+		if (unlikely(!(user_mode(regs))))
+			panic("Host injected async #PF in kernel mode\n");
+		/* Page is swapped out by the host. */
+		kvm_async_pf_task_wait_schedule(token);
 	} else {
 		rcu_irq_enter();
 		kvm_async_pf_task_wake(token);
@@ -397,10 +315,6 @@ static void kvm_guest_cpu_init(void)
 	WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));
 
 	pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
-
-#ifdef CONFIG_PREEMPTION
-	pa |= KVM_ASYNC_PF_SEND_ALWAYS;
-#endif
 	pa |= KVM_ASYNC_PF_ENABLED;
 
 	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
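The flag dropped above comes from the KVM paravirt ABI in
arch/x86/include/uapi/asm/kvm_para.h. Assuming the standard
MSR_KVM_ASYNC_PF_EN layout, the per-CPU enablement now amounts to
roughly:

	#define MSR_KVM_ASYNC_PF_EN  0x4b564d02
	#define KVM_ASYNC_PF_ENABLED                  (1 << 0)
	#define KVM_ASYNC_PF_SEND_ALWAYS              (1 << 1)  /* no longer requested */
	#define KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT    (1 << 2)

	/* Condensed from kvm_guest_cpu_init(): without KVM_ASYNC_PF_SEND_ALWAYS
	 * a conforming host injects "page not present" faults only while the
	 * vCPU runs in user mode, so the kernel-mode panic added in
	 * __kvm_handle_async_pf() should never fire in practice. */
	pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
	pa |= KVM_ASYNC_PF_ENABLED;

	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
		pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;

	wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);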
