@@ -776,7 +776,8 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 	kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K);
 }
 
-void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+				 enum kvm_mmu_type mmu_type)
 {
 	/*
 	 * If it's possible to replace the shadow page with an NX huge page,
@@ -790,8 +791,9 @@ void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 		return;
 
 	++kvm->stat.nx_lpage_splits;
+	++kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages;
 	list_add_tail(&sp->possible_nx_huge_page_link,
-		      &kvm->arch.possible_nx_huge_pages);
+		      &kvm->arch.possible_nx_huge_pages[mmu_type].pages);
 }
 
 static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
@@ -800,7 +802,7 @@ static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 	sp->nx_huge_page_disallowed = true;
 
 	if (nx_huge_page_possible)
-		track_possible_nx_huge_page(kvm, sp);
+		track_possible_nx_huge_page(kvm, sp, KVM_SHADOW_MMU);
 }
 
 static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -819,20 +821,22 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 	kvm_mmu_gfn_allow_lpage(slot, gfn);
 }
 
-void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+				   enum kvm_mmu_type mmu_type)
 {
 	if (list_empty(&sp->possible_nx_huge_page_link))
 		return;
 
 	--kvm->stat.nx_lpage_splits;
+	--kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages;
 	list_del_init(&sp->possible_nx_huge_page_link);
 }
 
 static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	sp->nx_huge_page_disallowed = false;
 
-	untrack_possible_nx_huge_page(kvm, sp);
+	untrack_possible_nx_huge_page(kvm, sp, KVM_SHADOW_MMU);
 }
 
 static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu,
@@ -4663,10 +4667,16 @@ static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
 	/*
 	 * Retry the page fault if the gfn hit a memslot that is being deleted
 	 * or moved. This ensures any existing SPTEs for the old memslot will
-	 * be zapped before KVM inserts a new MMIO SPTE for the gfn.
+	 * be zapped before KVM inserts a new MMIO SPTE for the gfn. Punt the
+	 * error to userspace if this is a prefault, as KVM's prefaulting ABI
+	 * doesn't provide the same forward progress guarantees as KVM_RUN.
 	 */
-	if (slot->flags & KVM_MEMSLOT_INVALID)
+	if (slot->flags & KVM_MEMSLOT_INVALID) {
+		if (fault->prefetch)
+			return -EAGAIN;
+
 		return RET_PF_RETRY;
+	}
 
 	if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) {
 		/*
@@ -6751,11 +6761,12 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 
 int kvm_mmu_init_vm(struct kvm *kvm)
 {
-	int r;
+	int r, i;
 
 	kvm->arch.shadow_mmio_value = shadow_mmio_value;
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
-	INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
+	for (i = 0; i < KVM_NR_MMU_TYPES; ++i)
+		INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages[i].pages);
 	spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
 
 	if (tdp_mmu_enabled) {
@@ -7596,19 +7607,64 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
 	return err;
 }
 
-static void kvm_recover_nx_huge_pages(struct kvm *kvm)
+static unsigned long nx_huge_pages_to_zap(struct kvm *kvm,
+					  enum kvm_mmu_type mmu_type)
+{
+	unsigned long pages = READ_ONCE(kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages);
+	unsigned int ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+
+	return ratio ? DIV_ROUND_UP(pages, ratio) : 0;
+}
+
+static bool kvm_mmu_sp_dirty_logging_enabled(struct kvm *kvm,
+					     struct kvm_mmu_page *sp)
 {
-	unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
 	struct kvm_memory_slot *slot;
-	int rcu_idx;
+
+	/*
+	 * Skip the memslot lookup if dirty tracking can't possibly be enabled,
+	 * as memslot lookups are relatively expensive.
+	 *
+	 * If a memslot update is in progress, reading an incorrect value of
+	 * kvm->nr_memslots_dirty_logging is not a problem: if it is becoming
+	 * zero, KVM will do an unnecessary memslot lookup; if it is becoming
+	 * nonzero, the page will be zapped unnecessarily. Either way, this
+	 * only affects efficiency in racy situations, and not correctness.
+	 */
+	if (!atomic_read(&kvm->nr_memslots_dirty_logging))
+		return false;
+
+	slot = __gfn_to_memslot(kvm_memslots_for_spte_role(kvm, sp->role), sp->gfn);
+	if (WARN_ON_ONCE(!slot))
+		return false;
+
+	return kvm_slot_dirty_track_enabled(slot);
+}
+
+static void kvm_recover_nx_huge_pages(struct kvm *kvm,
+				      const enum kvm_mmu_type mmu_type)
+{
+#ifdef CONFIG_X86_64
+	const bool is_tdp_mmu = mmu_type == KVM_TDP_MMU;
+	spinlock_t *tdp_mmu_pages_lock = &kvm->arch.tdp_mmu_pages_lock;
+#else
+	const bool is_tdp_mmu = false;
+	spinlock_t *tdp_mmu_pages_lock = NULL;
+#endif
+	unsigned long to_zap = nx_huge_pages_to_zap(kvm, mmu_type);
+	struct list_head *nx_huge_pages;
 	struct kvm_mmu_page *sp;
-	unsigned int ratio;
 	LIST_HEAD(invalid_list);
 	bool flush = false;
-	ulong to_zap;
+	int rcu_idx;
+
+	nx_huge_pages = &kvm->arch.possible_nx_huge_pages[mmu_type].pages;
 
 	rcu_idx = srcu_read_lock(&kvm->srcu);
-	write_lock(&kvm->mmu_lock);
+	if (is_tdp_mmu)
+		read_lock(&kvm->mmu_lock);
+	else
+		write_lock(&kvm->mmu_lock);
 
 	/*
 	 * Zapping TDP MMU shadow pages, including the remote TLB flush, must
@@ -7617,11 +7673,15 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)
 	 */
 	rcu_read_lock();
 
-	ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
-	to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
 	for ( ; to_zap; --to_zap) {
-		if (list_empty(&kvm->arch.possible_nx_huge_pages))
+		if (is_tdp_mmu)
+			spin_lock(tdp_mmu_pages_lock);
+
+		if (list_empty(nx_huge_pages)) {
+			if (is_tdp_mmu)
+				spin_unlock(tdp_mmu_pages_lock);
 			break;
+		}
 
 		/*
 		 * We use a separate list instead of just using active_mmu_pages
@@ -7630,64 +7690,55 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)
 		 * the total number of shadow pages. And because the TDP MMU
 		 * doesn't use active_mmu_pages.
 		 */
-		sp = list_first_entry(&kvm->arch.possible_nx_huge_pages,
+		sp = list_first_entry(nx_huge_pages,
 				      struct kvm_mmu_page,
 				      possible_nx_huge_page_link);
 		WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
 		WARN_ON_ONCE(!sp->role.direct);
 
+		unaccount_nx_huge_page(kvm, sp);
+
+		if (is_tdp_mmu)
+			spin_unlock(tdp_mmu_pages_lock);
+
 		/*
-		 * Unaccount and do not attempt to recover any NX Huge Pages
-		 * that are being dirty tracked, as they would just be faulted
-		 * back in as 4KiB pages. The NX Huge Pages in this slot will be
-		 * recovered, along with all the other huge pages in the slot,
-		 * when dirty logging is disabled.
-		 *
-		 * Since gfn_to_memslot() is relatively expensive, it helps to
-		 * skip it if it the test cannot possibly return true. On the
-		 * other hand, if any memslot has logging enabled, chances are
-		 * good that all of them do, in which case unaccount_nx_huge_page()
-		 * is much cheaper than zapping the page.
-		 *
-		 * If a memslot update is in progress, reading an incorrect value
-		 * of kvm->nr_memslots_dirty_logging is not a problem: if it is
-		 * becoming zero, gfn_to_memslot() will be done unnecessarily; if
-		 * it is becoming nonzero, the page will be zapped unnecessarily.
-		 * Either way, this only affects efficiency in racy situations,
-		 * and not correctness.
+		 * Do not attempt to recover any NX Huge Pages that are being
+		 * dirty tracked, as they would just be faulted back in as 4KiB
+		 * pages. The NX Huge Pages in this slot will be recovered,
+		 * along with all the other huge pages in the slot, when dirty
+		 * logging is disabled.
 		 */
-		slot = NULL;
-		if (atomic_read(&kvm->nr_memslots_dirty_logging)) {
-			struct kvm_memslots *slots;
+		if (!kvm_mmu_sp_dirty_logging_enabled(kvm, sp)) {
+			if (is_tdp_mmu)
+				flush |= kvm_tdp_mmu_zap_possible_nx_huge_page(kvm, sp);
+			else
+				kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
 
-			slots = kvm_memslots_for_spte_role(kvm, sp->role);
-			slot = __gfn_to_memslot(slots, sp->gfn);
-			WARN_ON_ONCE(!slot);
 		}
 
-		if (slot && kvm_slot_dirty_track_enabled(slot))
-			unaccount_nx_huge_page(kvm, sp);
-		else if (is_tdp_mmu_page(sp))
-			flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
-		else
-			kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
 		WARN_ON_ONCE(sp->nx_huge_page_disallowed);
 
 		if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
 			kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
 			rcu_read_unlock();
 
-			cond_resched_rwlock_write(&kvm->mmu_lock);
-			flush = false;
+			if (is_tdp_mmu)
+				cond_resched_rwlock_read(&kvm->mmu_lock);
+			else
+				cond_resched_rwlock_write(&kvm->mmu_lock);
 
+			flush = false;
 			rcu_read_lock();
 		}
 	}
 	kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
 
 	rcu_read_unlock();
 
-	write_unlock(&kvm->mmu_lock);
+	if (is_tdp_mmu)
+		read_unlock(&kvm->mmu_lock);
+	else
+		write_unlock(&kvm->mmu_lock);
 	srcu_read_unlock(&kvm->srcu, rcu_idx);
 }
 
@@ -7698,9 +7749,10 @@ static void kvm_nx_huge_page_recovery_worker_kill(void *data)
 static bool kvm_nx_huge_page_recovery_worker(void *data)
 {
 	struct kvm *kvm = data;
+	long remaining_time;
 	bool enabled;
 	uint period;
-	long remaining_time;
+	int i;
 
 	enabled = calc_nx_huge_pages_recovery_period(&period);
 	if (!enabled)
@@ -7715,7 +7767,8 @@ static bool kvm_nx_huge_page_recovery_worker(void *data)
 	}
 
 	__set_current_state(TASK_RUNNING);
-	kvm_recover_nx_huge_pages(kvm);
+	for (i = 0; i < KVM_NR_MMU_TYPES; ++i)
+		kvm_recover_nx_huge_pages(kvm, i);
 	kvm->arch.nx_huge_page_last = get_jiffies_64();
 	return true;
 }
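
Note: the per-MMU-type bookkeeping this patch indexes into (kvm->arch.possible_nx_huge_pages[mmu_type].pages and .nr_pages, together with enum kvm_mmu_type and its KVM_SHADOW_MMU, KVM_TDP_MMU and KVM_NR_MMU_TYPES values) comes from header changes that are not part of this hunk. The following is only a rough sketch of what those definitions plausibly look like, under the assumption that they sit alongside struct kvm_arch in arch/x86/include/asm/kvm_host.h; the exact names, field types and placement in the real series may differ.

/* Sketch only; names and types are assumed for illustration. */
enum kvm_mmu_type {
	KVM_SHADOW_MMU,
#ifdef CONFIG_X86_64
	KVM_TDP_MMU,		/* the TDP MMU is built only on x86-64 */
#endif
	KVM_NR_MMU_TYPES,
};

struct kvm_possible_nx_huge_pages {	/* struct name assumed */
	/*
	 * Shadow pages that could be replaced by an NX huge page; entries are
	 * added and removed by track_possible_nx_huge_page() and
	 * untrack_possible_nx_huge_page() above.
	 */
	struct list_head pages;
	/* Number of entries on @pages; read by nx_huge_pages_to_zap(). */
	unsigned long nr_pages;
};

/* In struct kvm_arch, replacing the old single possible_nx_huge_pages list: */
struct kvm_possible_nx_huge_pages possible_nx_huge_pages[KVM_NR_MMU_TYPES];

Splitting the tracking per MMU type is what lets the recovery worker zap TDP MMU pages with mmu_lock held only for read: in the recovery path shown above, the TDP MMU list and counter accesses are serialized by tdp_mmu_pages_lock rather than by the write lock, while the shadow MMU path continues to take mmu_lock for write as before.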