Skip to content

Commit 9a798b1

Browse files
sean-jcbonzini
authored andcommitted
KVM: Register cpuhp and syscore callbacks when enabling hardware
Register KVM's cpuhp and syscore callback when enabling virtualization in hardware instead of registering the callbacks during initialization, and let the CPU up/down framework invoke the inner enable/disable functions. Registering the callbacks during initialization makes things more complex than they need to be, as KVM needs to be very careful about handling races between enabling CPUs being onlined/offlined and hardware being enabled/disabled. Intel TDX support will require KVM to enable virtualization during KVM initialization, i.e. will add another wrinkle to things, at which point sorting out the potential races with kvm_usage_count would become even more complex. Note, using the cpuhp framework has a subtle behavioral change: enabling will be done serially across all CPUs, whereas KVM currently sends an IPI to all CPUs in parallel. While serializing virtualization enabling could create undesirable latency, the issue is limited to the 0=>1 transition of VM creation. And even that can be mitigated, e.g. by letting userspace force virtualization to be enabled when KVM is initialized. Cc: Chao Gao <[email protected]> Reviewed-by: Kai Huang <[email protected]> Acked-by: Kai Huang <[email protected]> Tested-by: Farrah Chen <[email protected]> Signed-off-by: Sean Christopherson <[email protected]> Message-ID: <[email protected]> Signed-off-by: Paolo Bonzini <[email protected]>
1 parent 44d1745 commit 9a798b1

File tree

2 files changed

+66
-117
lines changed

2 files changed

+66
-117
lines changed

Documentation/virt/kvm/locking.rst

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@ KVM Lock Overview
99

1010
The acquisition orders for mutexes are as follows:
1111

12-
- cpus_read_lock() is taken outside kvm_lock and kvm_usage_lock
12+
- cpus_read_lock() is taken outside kvm_lock
13+
14+
- kvm_usage_lock is taken outside cpus_read_lock()
1315

1416
- kvm->lock is taken outside vcpu->mutex
1517

@@ -241,9 +243,8 @@ time it will be set using the Dirty tracking mechanism described above.
241243
:Arch: any
242244
:Protects: - kvm_usage_count
243245
- hardware virtualization enable/disable
244-
:Comment: Exists because using kvm_lock leads to deadlock (see earlier comment
245-
on cpus_read_lock() vs kvm_lock). Note, KVM also disables CPU hotplug via
246-
cpus_read_lock() when enabling/disabling virtualization.
246+
:Comment: Exists to allow taking cpus_read_lock() while kvm_usage_count is
247+
protected, which simplifies the virtualization enabling logic.
247248

248249
``kvm->mn_invalidate_lock``
249250
^^^^^^^^^^^^^^^^^^^^^^^^^^^

virt/kvm/kvm_main.c

Lines changed: 61 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -5578,7 +5578,7 @@ static DEFINE_PER_CPU(bool, hardware_enabled);
55785578
static DEFINE_MUTEX(kvm_usage_lock);
55795579
static int kvm_usage_count;
55805580

5581-
static int __hardware_enable_nolock(void)
5581+
static int hardware_enable_nolock(void)
55825582
{
55835583
if (__this_cpu_read(hardware_enabled))
55845584
return 0;
@@ -5593,34 +5593,18 @@ static int __hardware_enable_nolock(void)
55935593
return 0;
55945594
}
55955595

5596-
static void hardware_enable_nolock(void *failed)
5597-
{
5598-
if (__hardware_enable_nolock())
5599-
atomic_inc(failed);
5600-
}
5601-
56025596
static int kvm_online_cpu(unsigned int cpu)
56035597
{
5604-
int ret = 0;
5605-
56065598
/*
56075599
* Abort the CPU online process if hardware virtualization cannot
56085600
* be enabled. Otherwise running VMs would encounter unrecoverable
56095601
* errors when scheduled to this CPU.
56105602
*/
5611-
mutex_lock(&kvm_usage_lock);
5612-
if (kvm_usage_count)
5613-
ret = __hardware_enable_nolock();
5614-
mutex_unlock(&kvm_usage_lock);
5615-
return ret;
5603+
return hardware_enable_nolock();
56165604
}
56175605

56185606
static void hardware_disable_nolock(void *junk)
56195607
{
5620-
/*
5621-
* Note, hardware_disable_all_nolock() tells all online CPUs to disable
5622-
* hardware, not just CPUs that successfully enabled hardware!
5623-
*/
56245608
if (!__this_cpu_read(hardware_enabled))
56255609
return;
56265610

@@ -5631,78 +5615,10 @@ static void hardware_disable_nolock(void *junk)
56315615

56325616
static int kvm_offline_cpu(unsigned int cpu)
56335617
{
5634-
mutex_lock(&kvm_usage_lock);
5635-
if (kvm_usage_count)
5636-
hardware_disable_nolock(NULL);
5637-
mutex_unlock(&kvm_usage_lock);
5618+
hardware_disable_nolock(NULL);
56385619
return 0;
56395620
}
56405621

5641-
static void hardware_disable_all_nolock(void)
5642-
{
5643-
BUG_ON(!kvm_usage_count);
5644-
5645-
kvm_usage_count--;
5646-
if (!kvm_usage_count)
5647-
on_each_cpu(hardware_disable_nolock, NULL, 1);
5648-
}
5649-
5650-
static void hardware_disable_all(void)
5651-
{
5652-
cpus_read_lock();
5653-
mutex_lock(&kvm_usage_lock);
5654-
hardware_disable_all_nolock();
5655-
mutex_unlock(&kvm_usage_lock);
5656-
cpus_read_unlock();
5657-
}
5658-
5659-
static int hardware_enable_all(void)
5660-
{
5661-
atomic_t failed = ATOMIC_INIT(0);
5662-
int r;
5663-
5664-
/*
5665-
* Do not enable hardware virtualization if the system is going down.
5666-
* If userspace initiated a forced reboot, e.g. reboot -f, then it's
5667-
* possible for an in-flight KVM_CREATE_VM to trigger hardware enabling
5668-
* after kvm_reboot() is called. Note, this relies on system_state
5669-
* being set _before_ kvm_reboot(), which is why KVM uses a syscore ops
5670-
* hook instead of registering a dedicated reboot notifier (the latter
5671-
* runs before system_state is updated).
5672-
*/
5673-
if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
5674-
system_state == SYSTEM_RESTART)
5675-
return -EBUSY;
5676-
5677-
/*
5678-
* When onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
5679-
* is called, and so on_each_cpu() between them includes the CPU that
5680-
* is being onlined. As a result, hardware_enable_nolock() may get
5681-
* invoked before kvm_online_cpu(), which also enables hardware if the
5682-
* usage count is non-zero. Disable CPU hotplug to avoid attempting to
5683-
* enable hardware multiple times.
5684-
*/
5685-
cpus_read_lock();
5686-
mutex_lock(&kvm_usage_lock);
5687-
5688-
r = 0;
5689-
5690-
kvm_usage_count++;
5691-
if (kvm_usage_count == 1) {
5692-
on_each_cpu(hardware_enable_nolock, &failed, 1);
5693-
5694-
if (atomic_read(&failed)) {
5695-
hardware_disable_all_nolock();
5696-
r = -EBUSY;
5697-
}
5698-
}
5699-
5700-
mutex_unlock(&kvm_usage_lock);
5701-
cpus_read_unlock();
5702-
5703-
return r;
5704-
}
5705-
57065622
static void kvm_shutdown(void)
57075623
{
57085624
/*
@@ -5734,8 +5650,7 @@ static int kvm_suspend(void)
57345650
lockdep_assert_not_held(&kvm_usage_lock);
57355651
lockdep_assert_irqs_disabled();
57365652

5737-
if (kvm_usage_count)
5738-
hardware_disable_nolock(NULL);
5653+
hardware_disable_nolock(NULL);
57395654
return 0;
57405655
}
57415656

@@ -5744,15 +5659,68 @@ static void kvm_resume(void)
57445659
lockdep_assert_not_held(&kvm_usage_lock);
57455660
lockdep_assert_irqs_disabled();
57465661

5747-
if (kvm_usage_count)
5748-
WARN_ON_ONCE(__hardware_enable_nolock());
5662+
WARN_ON_ONCE(hardware_enable_nolock());
57495663
}
57505664

57515665
static struct syscore_ops kvm_syscore_ops = {
57525666
.suspend = kvm_suspend,
57535667
.resume = kvm_resume,
57545668
.shutdown = kvm_shutdown,
57555669
};
5670+
5671+
static int hardware_enable_all(void)
5672+
{
5673+
int r;
5674+
5675+
guard(mutex)(&kvm_usage_lock);
5676+
5677+
if (kvm_usage_count++)
5678+
return 0;
5679+
5680+
r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
5681+
kvm_online_cpu, kvm_offline_cpu);
5682+
if (r)
5683+
goto err_cpuhp;
5684+
5685+
register_syscore_ops(&kvm_syscore_ops);
5686+
5687+
/*
5688+
* Undo virtualization enabling and bail if the system is going down.
5689+
* If userspace initiated a forced reboot, e.g. reboot -f, then it's
5690+
* possible for an in-flight operation to enable virtualization after
5691+
* syscore_shutdown() is called, i.e. without kvm_shutdown() being
5692+
* invoked. Note, this relies on system_state being set _before_
5693+
* kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked
5694+
* or this CPU observes the impending shutdown. Which is why KVM uses
5695+
* a syscore ops hook instead of registering a dedicated reboot
5696+
* notifier (the latter runs before system_state is updated).
5697+
*/
5698+
if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
5699+
system_state == SYSTEM_RESTART) {
5700+
r = -EBUSY;
5701+
goto err_rebooting;
5702+
}
5703+
5704+
return 0;
5705+
5706+
err_rebooting:
5707+
unregister_syscore_ops(&kvm_syscore_ops);
5708+
cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
5709+
err_cpuhp:
5710+
--kvm_usage_count;
5711+
return r;
5712+
}
5713+
5714+
static void hardware_disable_all(void)
5715+
{
5716+
guard(mutex)(&kvm_usage_lock);
5717+
5718+
if (--kvm_usage_count)
5719+
return;
5720+
5721+
unregister_syscore_ops(&kvm_syscore_ops);
5722+
cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
5723+
}
57565724
#else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
57575725
static int hardware_enable_all(void)
57585726
{
@@ -6461,15 +6429,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
64616429
int r;
64626430
int cpu;
64636431

6464-
#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
6465-
r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
6466-
kvm_online_cpu, kvm_offline_cpu);
6467-
if (r)
6468-
return r;
6469-
6470-
register_syscore_ops(&kvm_syscore_ops);
6471-
#endif
6472-
64736432
/* A kmem cache lets us meet the alignment requirements of fx_save. */
64746433
if (!vcpu_align)
64756434
vcpu_align = __alignof__(struct kvm_vcpu);
@@ -6480,10 +6439,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
64806439
offsetofend(struct kvm_vcpu, stats_id)
64816440
- offsetof(struct kvm_vcpu, arch),
64826441
NULL);
6483-
if (!kvm_vcpu_cache) {
6484-
r = -ENOMEM;
6485-
goto err_vcpu_cache;
6486-
}
6442+
if (!kvm_vcpu_cache)
6443+
return -ENOMEM;
64876444

64886445
for_each_possible_cpu(cpu) {
64896446
if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
@@ -6540,11 +6497,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
65406497
for_each_possible_cpu(cpu)
65416498
free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
65426499
kmem_cache_destroy(kvm_vcpu_cache);
6543-
err_vcpu_cache:
6544-
#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
6545-
unregister_syscore_ops(&kvm_syscore_ops);
6546-
cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
6547-
#endif
65486500
return r;
65496501
}
65506502
EXPORT_SYMBOL_GPL(kvm_init);
@@ -6566,10 +6518,6 @@ void kvm_exit(void)
65666518
kmem_cache_destroy(kvm_vcpu_cache);
65676519
kvm_vfio_ops_exit();
65686520
kvm_async_pf_deinit();
6569-
#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
6570-
unregister_syscore_ops(&kvm_syscore_ops);
6571-
cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
6572-
#endif
65736521
kvm_irqfd_exit();
65746522
}
65756523
EXPORT_SYMBOL_GPL(kvm_exit);

0 commit comments

Comments
 (0)