Commit beafd7e

Merge tag 'kvm-x86-sev-6.17' of https://github.com/kvm-x86/linux into HEAD

KVM SEV cache maintenance changes for 6.17

 - Drop a superfluous WBINVD (on all CPUs!) when destroying a VM.

 - Use WBNOINVD instead of WBINVD when possible, for SEV cache maintenance,
   e.g. to minimize collateral damage when reclaiming memory from an SEV guest.

 - When reclaiming memory from an SEV guest, only do cache flushes on CPUs
   that have ever run a vCPU for the guest, i.e. don't flush the caches for
   CPUs that can't possibly have cache lines with dirty, encrypted data.
2 parents a10acca + 6f38f8c commit beafd7e
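
In short, the series replaces VM-wide WBINVD broadcasts with a per-VM cpumask (have_run_cpus) that records which physical CPUs have ever entered the guest, and then flushes only those CPUs when encrypted memory is reclaimed or the VM is torn down. The condensed sketch below is illustrative only, not the literal patch (the real diff also handles intra-host migration and mirror VMs); all identifiers are taken from the sev.c changes that follow.

	/* VM creation (__sev_guest_init): start with an empty mask. */
	if (!zalloc_cpumask_var(&sev->have_run_cpus, GFP_KERNEL_ACCOUNT))
		return -ENOMEM;

	/*
	 * VMRUN path (pre_sev_run): this physical CPU may now hold dirty,
	 * guest-tagged cache lines for the VM's ASID, so record it in the mask.
	 */
	if (!cpumask_test_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus))
		cpumask_set_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus);

	/*
	 * Reclaim/teardown path (sev_writeback_caches): write back caches only
	 * on CPUs that actually ran the guest, preferring WBNOINVD so clean
	 * lines are not needlessly invalidated.
	 */
	if (!cpumask_empty(to_kvm_sev_info(kvm)->have_run_cpus))
		wbnoinvd_on_cpus_mask(to_kvm_sev_info(kvm)->have_run_cpus);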

File tree: 3 files changed (+84, -35 lines)

arch/x86/kvm/svm/sev.c

Lines changed: 82 additions & 28 deletions
@@ -117,6 +117,7 @@ static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid)
 	 */
 	down_write(&sev_deactivate_lock);

+	/* SNP firmware requires use of WBINVD for ASID recycling. */
 	wbinvd_on_all_cpus();

 	if (sev_snp_enabled)
@@ -446,7 +447,12 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
 	init_args.probe = false;
 	ret = sev_platform_init(&init_args);
 	if (ret)
-		goto e_free;
+		goto e_free_asid;
+
+	if (!zalloc_cpumask_var(&sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
+		ret = -ENOMEM;
+		goto e_free_asid;
+	}

 	/* This needs to happen after SEV/SNP firmware initialization. */
 	if (vm_type == KVM_X86_SNP_VM) {
@@ -464,6 +470,8 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
 	return 0;

 e_free:
+	free_cpumask_var(sev->have_run_cpus);
+e_free_asid:
 	argp->error = init_args.error;
 	sev_asid_free(sev);
 	sev->asid = 0;
@@ -708,6 +716,33 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages)
 	}
 }

+static void sev_writeback_caches(struct kvm *kvm)
+{
+	/*
+	 * Note, the caller is responsible for ensuring correctness if the mask
+	 * can be modified, e.g. if a CPU could be doing VMRUN.
+	 */
+	if (cpumask_empty(to_kvm_sev_info(kvm)->have_run_cpus))
+		return;
+
+	/*
+	 * Ensure that all dirty guest tagged cache entries are written back
+	 * before releasing the pages back to the system for use. CLFLUSH will
+	 * not do this without SME_COHERENT, and flushing many cache lines
+	 * individually is slower than blasting WBINVD for large VMs, so issue
+	 * WBNOINVD (or WBINVD if the "no invalidate" variant is unsupported)
+	 * on CPUs that have done VMRUN, i.e. may have dirtied data using the
+	 * VM's ASID.
+	 *
+	 * For simplicity, never remove CPUs from the bitmap. Ideally, KVM
+	 * would clear the mask when flushing caches, but doing so requires
+	 * serializing multiple calls and having responding CPUs (to the IPI)
+	 * mark themselves as still running if they are running (or about to
+	 * run) a vCPU for the VM.
+	 */
+	wbnoinvd_on_cpus_mask(to_kvm_sev_info(kvm)->have_run_cpus);
+}
+
 static unsigned long get_num_contig_pages(unsigned long idx,
 					  struct page **inpages, unsigned long npages)
 {
@@ -2037,6 +2072,17 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
 	if (ret)
 		goto out_source_vcpu;

+	/*
+	 * Allocate a new have_run_cpus for the destination, i.e. don't copy
+	 * the set of CPUs from the source. If a CPU was used to run a vCPU in
+	 * the source VM but is never used for the destination VM, then the CPU
+	 * can only have cached memory that was accessible to the source VM.
+	 */
+	if (!zalloc_cpumask_var(&dst_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
+		ret = -ENOMEM;
+		goto out_source_vcpu;
+	}
+
 	sev_migrate_from(kvm, source_kvm);
 	kvm_vm_dead(source_kvm);
 	cg_cleanup_sev = src_sev;
@@ -2694,12 +2740,7 @@ int sev_mem_enc_unregister_region(struct kvm *kvm,
 		goto failed;
 	}

-	/*
-	 * Ensure that all guest tagged cache entries are flushed before
-	 * releasing the pages back to the system for use. CLFLUSH will
-	 * not do this, so issue a WBINVD.
-	 */
-	wbinvd_on_all_cpus();
+	sev_writeback_caches(kvm);

 	__unregister_enc_region_locked(kvm, region);

@@ -2741,13 +2782,18 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
 		goto e_unlock;
 	}

+	mirror_sev = to_kvm_sev_info(kvm);
+	if (!zalloc_cpumask_var(&mirror_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
+		ret = -ENOMEM;
+		goto e_unlock;
+	}
+
 	/*
 	 * The mirror kvm holds an enc_context_owner ref so its asid can't
 	 * disappear until we're done with it
 	 */
 	source_sev = to_kvm_sev_info(source_kvm);
 	kvm_get_kvm(source_kvm);
-	mirror_sev = to_kvm_sev_info(kvm);
 	list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);

 	/* Set enc_context_owner and copy its encryption context over */
@@ -2809,7 +2855,13 @@ void sev_vm_destroy(struct kvm *kvm)

 	WARN_ON(!list_empty(&sev->mirror_vms));

-	/* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
+	free_cpumask_var(sev->have_run_cpus);
+
+	/*
+	 * If this is a mirror VM, remove it from the owner's list of mirrors
+	 * and skip ASID cleanup (the ASID is tied to the lifetime of the owner).
+	 * Note, mirror VMs don't support registering encrypted regions.
+	 */
 	if (is_mirroring_enc_context(kvm)) {
 		struct kvm *owner_kvm = sev->enc_context_owner;

@@ -2820,12 +2872,6 @@ void sev_vm_destroy(struct kvm *kvm)
 		return;
 	}

-	/*
-	 * Ensure that all guest tagged cache entries are flushed before
-	 * releasing the pages back to the system for use. CLFLUSH will
-	 * not do this, so issue a WBINVD.
-	 */
-	wbinvd_on_all_cpus();

 	/*
 	 * if userspace was terminated before unregistering the memory regions
@@ -3095,30 +3141,29 @@ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)

 	/*
 	 * VM Page Flush takes a host virtual address and a guest ASID. Fall
-	 * back to WBINVD if this faults so as not to make any problems worse
-	 * by leaving stale encrypted data in the cache.
+	 * back to full writeback of caches if this faults so as not to make
+	 * any problems worse by leaving stale encrypted data in the cache.
 	 */
 	if (WARN_ON_ONCE(wrmsrq_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid)))
-		goto do_wbinvd;
+		goto do_sev_writeback_caches;

 	return;

-do_wbinvd:
-	wbinvd_on_all_cpus();
+do_sev_writeback_caches:
+	sev_writeback_caches(vcpu->kvm);
 }

 void sev_guest_memory_reclaimed(struct kvm *kvm)
 {
 	/*
 	 * With SNP+gmem, private/encrypted memory is unreachable via the
-	 * hva-based mmu notifiers, so these events are only actually
-	 * pertaining to shared pages where there is no need to perform
-	 * the WBINVD to flush associated caches.
+	 * hva-based mmu notifiers, i.e. these events are explicitly scoped to
+	 * shared pages, where there's no need to flush caches.
 	 */
 	if (!sev_guest(kvm) || sev_snp_guest(kvm))
 		return;

-	wbinvd_on_all_cpus();
+	sev_writeback_caches(kvm);
 }

 void sev_free_vcpu(struct kvm_vcpu *vcpu)
@@ -3450,6 +3495,15 @@ int pre_sev_run(struct vcpu_svm *svm, int cpu)
 	if (sev_es_guest(kvm) && !VALID_PAGE(svm->vmcb->control.vmsa_pa))
 		return -EINVAL;

+	/*
+	 * To optimize cache flushes when memory is reclaimed from an SEV VM,
+	 * track physical CPUs that enter the guest for SEV VMs and thus can
+	 * have encrypted, dirty data in the cache, and flush caches only for
+	 * CPUs that have entered the guest.
+	 */
+	if (!cpumask_test_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus))
+		cpumask_set_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus);
+
 	/* Assign the asid allocated with this SEV guest */
 	svm->asid = asid;

@@ -3882,9 +3936,9 @@ void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu)
 	 * From this point forward, the VMSA will always be a guest-mapped page
 	 * rather than the initial one allocated by KVM in svm->sev_es.vmsa. In
 	 * theory, svm->sev_es.vmsa could be free'd and cleaned up here, but
-	 * that involves cleanups like wbinvd_on_all_cpus() which would ideally
-	 * be handled during teardown rather than guest boot. Deferring that
-	 * also allows the existing logic for SEV-ES VMSAs to be re-used with
+	 * that involves cleanups like flushing caches, which would ideally be
+	 * handled during teardown rather than guest boot. Deferring that also
+	 * allows the existing logic for SEV-ES VMSAs to be re-used with
 	 * minimal SNP-specific changes.
 	 */
 	svm->sev_es.snp_has_guest_vmsa = true;
@@ -4875,7 +4929,7 @@ void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)

 	/*
 	 * SEV-ES avoids host/guest cache coherency issues through
-	 * WBINVD hooks issued via MMU notifiers during run-time, and
+	 * WBNOINVD hooks issued via MMU notifiers during run-time, and
 	 * KVM's VM destroy path at shutdown. Those MMU notifier events
 	 * don't cover gmem since there is no requirement to map pages
 	 * to a HVA in order to use them for a running guest. While the
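
A note on the WBNOINVD preference above: WBNOINVD is WBINVD with an F3 (REP) prefix, advertised via CPUID leaf 0x80000008 EBX bit 9 (X86_FEATURE_WBNOINVD), and it writes dirty cache lines back without invalidating clean ones. CPUs that lack the feature ignore the prefix and simply execute WBINVD, so emitting it is always functionally safe. The wbnoinvd_on_cpus_mask() plumbing lands in the x86 core patches of this series rather than in this file; the sketch below is only a plausible shape for the underlying instruction wrapper, not the merged helper.

	static __always_inline void wbnoinvd(void)
	{
		/*
		 * F3 0F 09 is WBNOINVD, spelled as raw bytes so assemblers that
		 * don't know the mnemonic still accept it. On CPUs without
		 * X86_FEATURE_WBNOINVD the REP prefix is ignored and this acts
		 * as a plain (more costly) WBINVD, so no feature check is
		 * strictly required for correctness.
		 */
		asm volatile(".byte 0xf3, 0x0f, 0x09" ::: "memory");
	}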

arch/x86/kvm/svm/svm.h

Lines changed: 1 addition & 0 deletions
@@ -110,6 +110,7 @@ struct kvm_sev_info {
 	void *guest_req_buf; /* Bounce buffer for SNP Guest Request input */
 	void *guest_resp_buf; /* Bounce buffer for SNP Guest Request output */
 	struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */
+	cpumask_var_t have_run_cpus; /* CPUs that have done VMRUN for this VM. */
 };

 #define SEV_POLICY_NODBG	BIT_ULL(0)
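
The new field is a cpumask_var_t rather than an embedded struct cpumask: with CONFIG_CPUMASK_OFFSTACK=y the type is just a pointer and the mask storage must be allocated separately, which is why the sev.c changes pair the field with zalloc_cpumask_var() on VM init and free_cpumask_var() in sev_vm_destroy(). A minimal sketch of that standard pairing, independent of this series:

	cpumask_var_t mask;

	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))	/* zeroed on success */
		return -ENOMEM;

	cpumask_set_cpu(cpu, mask);	/* mark a CPU of interest; 'cpu' is caller-provided */

	free_cpumask_var(mask);		/* no-op when !CONFIG_CPUMASK_OFFSTACK */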

arch/x86/kvm/x86.c

Lines changed: 1 addition & 7 deletions
@@ -4994,11 +4994,6 @@ long kvm_arch_dev_ioctl(struct file *filp,
 	return r;
 }

-static void wbinvd_ipi(void *garbage)
-{
-	wbinvd();
-}
-
 static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
 {
 	return kvm_arch_has_noncoherent_dma(vcpu->kvm);
@@ -5022,8 +5017,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		if (kvm_x86_call(has_wbinvd_exit)())
 			cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
 		else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
-			smp_call_function_single(vcpu->cpu,
-						 wbinvd_ipi, NULL, 1);
+			wbinvd_on_cpu(vcpu->cpu);
 	}

 	kvm_x86_call(vcpu_load)(vcpu, cpu);
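
The x86.c change retires the file-local wbinvd_ipi() in favor of a common wbinvd_on_cpu() helper added elsewhere in this series. Functionally it should amount to the same single-target IPI it replaces; the sketch below shows that shape with illustrative names (__wbinvd_ipi is hypothetical, not the merged code).

	static void __wbinvd_ipi(void *unused)
	{
		wbinvd();	/* write back and invalidate this CPU's caches */
	}

	/* Single-CPU counterpart to wbinvd_on_all_cpus(); illustrative shape only. */
	void wbinvd_on_cpu(int cpu)
	{
		smp_call_function_single(cpu, __wbinvd_ipi, NULL, 1);
	}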
