@@ -117,6 +117,7 @@ static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid)
 	 */
 	down_write(&sev_deactivate_lock);
 
+	/* SNP firmware requires use of WBINVD for ASID recycling. */
 	wbinvd_on_all_cpus();
 
 	if (sev_snp_enabled)
@@ -446,7 +447,12 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
 	init_args.probe = false;
 	ret = sev_platform_init(&init_args);
 	if (ret)
-		goto e_free;
+		goto e_free_asid;
+
+	if (!zalloc_cpumask_var(&sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
+		ret = -ENOMEM;
+		goto e_free_asid;
+	}
 
 	/* This needs to happen after SEV/SNP firmware initialization. */
 	if (vm_type == KVM_X86_SNP_VM) {
@@ -464,6 +470,8 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
 	return 0;
 
 e_free:
+	free_cpumask_var(sev->have_run_cpus);
+e_free_asid:
 	argp->error = init_args.error;
 	sev_asid_free(sev);
 	sev->asid = 0;
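
For readers unfamiliar with the cpumask_var_t API used in the error paths above, here is an illustrative sketch (not part of the patch, hypothetical helper names) of its assumed semantics: with CONFIG_CPUMASK_OFFSTACK=y the mask is a real heap allocation, charged to the caller via GFP_KERNEL_ACCOUNT; otherwise it is embedded storage and the allocation cannot fail. In both configurations, freeing a mask that was never successfully allocated is harmless as long as the containing structure was zero-initialized, which is what lets e_free and e_free_asid share the tail of the unwind path.

#include <linux/cpumask.h>
#include <linux/gfp.h>

/* Hypothetical helpers, for illustration only. */
static int example_alloc_run_mask(cpumask_var_t *mask)
{
	/* Can only fail when CONFIG_CPUMASK_OFFSTACK=y and kmalloc fails. */
	if (!zalloc_cpumask_var(mask, GFP_KERNEL_ACCOUNT))
		return -ENOMEM;
	return 0;
}

static void example_free_run_mask(cpumask_var_t mask)
{
	/* No-op without CPUMASK_OFFSTACK; kfree(NULL)-safe otherwise. */
	free_cpumask_var(mask);
}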
@@ -708,6 +716,33 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages)
 	}
 }
 
+static void sev_writeback_caches(struct kvm *kvm)
+{
+	/*
+	 * Note, the caller is responsible for ensuring correctness if the mask
+	 * can be modified, e.g. if a CPU could be doing VMRUN.
+	 */
+	if (cpumask_empty(to_kvm_sev_info(kvm)->have_run_cpus))
+		return;
+
+	/*
+	 * Ensure that all dirty guest tagged cache entries are written back
+	 * before releasing the pages back to the system for use. CLFLUSH will
+	 * not do this without SME_COHERENT, and flushing many cache lines
+	 * individually is slower than blasting WBINVD for large VMs, so issue
+	 * WBNOINVD (or WBINVD if the "no invalidate" variant is unsupported)
+	 * on CPUs that have done VMRUN, i.e. may have dirtied data using the
+	 * VM's ASID.
+	 *
+	 * For simplicity, never remove CPUs from the bitmap. Ideally, KVM
+	 * would clear the mask when flushing caches, but doing so requires
+	 * serializing multiple calls and having responding CPUs (to the IPI)
+	 * mark themselves as still running if they are running (or about to
+	 * run) a vCPU for the VM.
+	 */
+	wbnoinvd_on_cpus_mask(to_kvm_sev_info(kvm)->have_run_cpus);
+}
+
 static unsigned long get_num_contig_pages(unsigned long idx,
 				struct page **inpages, unsigned long npages)
 {
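
The new helper relies on wbnoinvd_on_cpus_mask(), which is introduced elsewhere in this series. As a rough mental model (an assumption about that helper, not its actual arch/x86 implementation), it can be thought of as an IPI-based broadcast of WBNOINVD to exactly the CPUs in the mask, with WBNOINVD itself falling back to WBINVD on parts that lack the instruction:

#include <linux/smp.h>
#include <asm/special_insns.h>

/* Illustrative sketch only; "sketch_" names are hypothetical. */
static void __sketch_wbnoinvd(void *unused)
{
	wbnoinvd();	/* assumed to decay to WBINVD without X86_FEATURE_WBNOINVD */
}

static void sketch_wbnoinvd_on_cpus_mask(struct cpumask *cpus)
{
	/* Run the writeback on each CPU in @cpus and wait for completion. */
	on_each_cpu_mask(cpus, __sketch_wbnoinvd, NULL, 1);
}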
@@ -2037,6 +2072,17 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
 	if (ret)
 		goto out_source_vcpu;
 
+	/*
+	 * Allocate a new have_run_cpus for the destination, i.e. don't copy
+	 * the set of CPUs from the source. If a CPU was used to run a vCPU in
+	 * the source VM but is never used for the destination VM, then the CPU
+	 * can only have cached memory that was accessible to the source VM.
+	 */
+	if (!zalloc_cpumask_var(&dst_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
+		ret = -ENOMEM;
+		goto out_source_vcpu;
+	}
+
 	sev_migrate_from(kvm, source_kvm);
 	kvm_vm_dead(source_kvm);
 	cg_cleanup_sev = src_sev;
@@ -2694,12 +2740,7 @@ int sev_mem_enc_unregister_region(struct kvm *kvm,
 		goto failed;
 	}
 
-	/*
-	 * Ensure that all guest tagged cache entries are flushed before
-	 * releasing the pages back to the system for use. CLFLUSH will
-	 * not do this, so issue a WBINVD.
-	 */
-	wbinvd_on_all_cpus();
+	sev_writeback_caches(kvm);
 
 	__unregister_enc_region_locked(kvm, region);
 
@@ -2741,13 +2782,18 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
 		goto e_unlock;
 	}
 
+	mirror_sev = to_kvm_sev_info(kvm);
+	if (!zalloc_cpumask_var(&mirror_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
+		ret = -ENOMEM;
+		goto e_unlock;
+	}
+
 	/*
 	 * The mirror kvm holds an enc_context_owner ref so its asid can't
 	 * disappear until we're done with it
 	 */
 	source_sev = to_kvm_sev_info(source_kvm);
 	kvm_get_kvm(source_kvm);
-	mirror_sev = to_kvm_sev_info(kvm);
 	list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);
 
 	/* Set enc_context_owner and copy its encryption context over */
@@ -2809,7 +2855,13 @@ void sev_vm_destroy(struct kvm *kvm)
 
 	WARN_ON(!list_empty(&sev->mirror_vms));
 
-	/* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
+	free_cpumask_var(sev->have_run_cpus);
+
+	/*
+	 * If this is a mirror VM, remove it from the owner's list of mirrors
+	 * and skip ASID cleanup (the ASID is tied to the lifetime of the owner).
+	 * Note, mirror VMs don't support registering encrypted regions.
+	 */
 	if (is_mirroring_enc_context(kvm)) {
 		struct kvm *owner_kvm = sev->enc_context_owner;
 
@@ -2820,12 +2872,6 @@ void sev_vm_destroy(struct kvm *kvm)
 		return;
 	}
 
-	/*
-	 * Ensure that all guest tagged cache entries are flushed before
-	 * releasing the pages back to the system for use. CLFLUSH will
-	 * not do this, so issue a WBINVD.
-	 */
-	wbinvd_on_all_cpus();
 
 	/*
 	 * if userspace was terminated before unregistering the memory regions
@@ -3095,30 +3141,29 @@ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
 
 	/*
 	 * VM Page Flush takes a host virtual address and a guest ASID. Fall
-	 * back to WBINVD if this faults so as not to make any problems worse
-	 * by leaving stale encrypted data in the cache.
+	 * back to full writeback of caches if this faults so as not to make
+	 * any problems worse by leaving stale encrypted data in the cache.
 	 */
 	if (WARN_ON_ONCE(wrmsrq_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid)))
-		goto do_wbinvd;
+		goto do_sev_writeback_caches;
 
 	return;
 
-do_wbinvd:
-	wbinvd_on_all_cpus();
+do_sev_writeback_caches:
+	sev_writeback_caches(vcpu->kvm);
 }
 
 void sev_guest_memory_reclaimed(struct kvm *kvm)
 {
 	/*
 	 * With SNP+gmem, private/encrypted memory is unreachable via the
-	 * hva-based mmu notifiers, so these events are only actually
-	 * pertaining to shared pages where there is no need to perform
-	 * the WBINVD to flush associated caches.
+	 * hva-based mmu notifiers, i.e. these events are explicitly scoped to
+	 * shared pages, where there's no need to flush caches.
 	 */
 	if (!sev_guest(kvm) || sev_snp_guest(kvm))
 		return;
 
-	wbinvd_on_all_cpus();
+	sev_writeback_caches(kvm);
 }
 
 void sev_free_vcpu(struct kvm_vcpu *vcpu)
@@ -3450,6 +3495,15 @@ int pre_sev_run(struct vcpu_svm *svm, int cpu)
 	if (sev_es_guest(kvm) && !VALID_PAGE(svm->vmcb->control.vmsa_pa))
 		return -EINVAL;
 
+	/*
+	 * To optimize cache flushes when memory is reclaimed from an SEV VM,
+	 * track physical CPUs that enter the guest for SEV VMs and thus can
+	 * have encrypted, dirty data in the cache, and flush caches only for
+	 * CPUs that have entered the guest.
+	 */
+	if (!cpumask_test_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus))
+		cpumask_set_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus);
+
 	/* Assign the asid allocated with this SEV guest */
 	svm->asid = asid;
 
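
A brief note on the hunk above: cpumask_set_cpu() is an atomic set_bit(), i.e. a locked read-modify-write on a cacheline shared by every vCPU of the VM, so testing the bit first keeps the per-VMRUN fast path read-only once the CPU has been recorded; a lost race is benign because both racing CPUs simply set the same bit. A minimal illustration of the pattern (hypothetical names, not taken from the patch):

#include <linux/cpumask.h>

/* Hot path: conceptually called on every guest entry for this VM. */
static inline void sketch_mark_cpu_ran(struct cpumask *have_run, int cpu)
{
	/* Read-only check avoids a locked RMW once the bit is already set. */
	if (!cpumask_test_cpu(cpu, have_run))
		cpumask_set_cpu(cpu, have_run);
}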
@@ -3882,9 +3936,9 @@ void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu)
 	 * From this point forward, the VMSA will always be a guest-mapped page
 	 * rather than the initial one allocated by KVM in svm->sev_es.vmsa. In
 	 * theory, svm->sev_es.vmsa could be free'd and cleaned up here, but
-	 * that involves cleanups like wbinvd_on_all_cpus() which would ideally
-	 * be handled during teardown rather than guest boot. Deferring that
-	 * also allows the existing logic for SEV-ES VMSAs to be re-used with
+	 * that involves cleanups like flushing caches, which would ideally be
+	 * handled during teardown rather than guest boot. Deferring that also
+	 * allows the existing logic for SEV-ES VMSAs to be re-used with
 	 * minimal SNP-specific changes.
 	 */
 	svm->sev_es.snp_has_guest_vmsa = true;
@@ -4875,7 +4929,7 @@ void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
 
 		/*
 		 * SEV-ES avoids host/guest cache coherency issues through
-		 * WBINVD hooks issued via MMU notifiers during run-time, and
+		 * WBNOINVD hooks issued via MMU notifiers during run-time, and
 		 * KVM's VM destroy path at shutdown. Those MMU notifier events
 		 * don't cover gmem since there is no requirement to map pages
 		 * to a HVA in order to use them for a running guest. While the