
Commit 13e2e4f

dmatlack authored and sean-jc committed
KVM: x86/mmu: Recover TDP MMU huge page mappings in-place instead of zapping
Recover TDP MMU huge page mappings in-place instead of zapping them when
dirty logging is disabled, and rename functions that recover huge page
mappings when dirty logging is disabled to move away from the "zap
collapsible spte" terminology.

Before KVM flushes TLBs, guest accesses may be translated through either
the (stale) small SPTE or the (new) huge SPTE. This is already possible
when KVM is doing eager page splitting (where TLB flushes are also
batched), and when vCPUs are faulting in huge mappings (where TLBs are
flushed after the new huge SPTE is installed).

Recovering huge pages reduces the number of page faults when dirty
logging is disabled:

    $ perf stat -e kvm:kvm_page_fault -- ./dirty_log_perf_test -s anonymous_hugetlb_2mb -v 64 -e -b 4g

    Before: 393,599 kvm:kvm_page_fault
    After:  262,575 kvm:kvm_page_fault

vCPU throughput and the latency of disabling dirty-logging are about
equal compared to zapping, but avoiding faults can be beneficial to
remove vCPU jitter in extreme scenarios.

Signed-off-by: David Matlack <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Sean Christopherson <[email protected]>
1 parent dd2e7db commit 13e2e4f

File tree

7 files changed: +91 −30 lines


arch/x86/include/asm/kvm_host.h

Lines changed: 2 additions & 2 deletions
@@ -1955,8 +1955,8 @@ void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
 				  const struct kvm_memory_slot *memslot,
 				  u64 start, u64 end,
 				  int target_level);
-void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
-				   const struct kvm_memory_slot *memslot);
+void kvm_mmu_recover_huge_pages(struct kvm *kvm,
+				const struct kvm_memory_slot *memslot);
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
 				   const struct kvm_memory_slot *memslot);
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);

arch/x86/kvm/mmu/mmu.c

Lines changed: 3 additions & 3 deletions
@@ -6946,8 +6946,8 @@ static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
 		kvm_flush_remote_tlbs_memslot(kvm, slot);
 }
 
-void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
-				   const struct kvm_memory_slot *slot)
+void kvm_mmu_recover_huge_pages(struct kvm *kvm,
+				const struct kvm_memory_slot *slot)
 {
 	if (kvm_memslots_have_rmaps(kvm)) {
 		write_lock(&kvm->mmu_lock);
@@ -6957,7 +6957,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 
 	if (tdp_mmu_enabled) {
 		read_lock(&kvm->mmu_lock);
-		kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
+		kvm_tdp_mmu_recover_huge_pages(kvm, slot);
 		read_unlock(&kvm->mmu_lock);
 	}
 }

arch/x86/kvm/mmu/spte.c

Lines changed: 33 additions & 3 deletions
@@ -262,22 +262,32 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	return wrprot;
 }
 
-static u64 make_spte_executable(u64 spte)
+static u64 modify_spte_protections(u64 spte, u64 set, u64 clear)
 {
 	bool is_access_track = is_access_track_spte(spte);
 
 	if (is_access_track)
 		spte = restore_acc_track_spte(spte);
 
-	spte &= ~shadow_nx_mask;
-	spte |= shadow_x_mask;
+	KVM_MMU_WARN_ON(set & clear);
+	spte = (spte | set) & ~clear;
 
 	if (is_access_track)
 		spte = mark_spte_for_access_track(spte);
 
 	return spte;
 }
 
+static u64 make_spte_executable(u64 spte)
+{
+	return modify_spte_protections(spte, shadow_x_mask, shadow_nx_mask);
+}
+
+static u64 make_spte_nonexecutable(u64 spte)
+{
+	return modify_spte_protections(spte, shadow_nx_mask, shadow_x_mask);
+}
+
 /*
  * Construct an SPTE that maps a sub-page of the given huge page SPTE where
  * `index` identifies which sub-page.
@@ -314,6 +324,26 @@ u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte,
 	return child_spte;
 }
 
+u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level)
+{
+	u64 huge_spte;
+
+	KVM_BUG_ON(!is_shadow_present_pte(small_spte) || level == PG_LEVEL_4K, kvm);
+
+	huge_spte = small_spte | PT_PAGE_SIZE_MASK;
+
+	/*
+	 * huge_spte already has the address of the sub-page being collapsed
+	 * from small_spte, so just clear the lower address bits to create the
+	 * huge page address.
+	 */
+	huge_spte &= KVM_HPAGE_MASK(level) | ~PAGE_MASK;
+
+	if (is_nx_huge_page_enabled(kvm))
+		huge_spte = make_spte_nonexecutable(huge_spte);
+
+	return huge_spte;
+}
+
 u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
 {
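The combined mask in make_huge_spte() is the subtle part: KVM_HPAGE_MASK(level) keeps the huge-page-aligned address bits, while ~PAGE_MASK keeps the low bits that make_spte() already encoded, so only the sub-page offset bits are cleared. Below is a minimal userspace sketch of that arithmetic, assuming x86's 4KiB base pages and a 2MiB huge page; the constants and the example SPTE value are simplified stand-ins, not the kernel's actual definitions.

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the kernel's masks (x86, 4KiB base pages). */
#define TOY_PAGE_MASK      (~((1ULL << 12) - 1))   /* drops bits 0-11 */
#define TOY_HPAGE_2M_MASK  (~((1ULL << 21) - 1))   /* drops bits 0-20 */

int main(void)
{
	/* A made-up 4KiB leaf SPTE: page-frame address plus some low flag bits. */
	uint64_t small_spte = 0x12345000ULL | 0x7ULL;

	/*
	 * Keep the 2MiB-aligned address bits (21 and up) and the low flag
	 * bits (0-11); only the sub-page offset bits (12-20) are cleared.
	 */
	uint64_t huge_spte = small_spte & (TOY_HPAGE_2M_MASK | ~TOY_PAGE_MASK);

	/* Prints: small 0x12345007 -> huge 0x12200007 */
	printf("small 0x%llx -> huge 0x%llx\n",
	       (unsigned long long)small_spte,
	       (unsigned long long)huge_spte);
	return 0;
}

In other words, bits 12-20 (the 4KiB index within the 2MiB region) are dropped, and everything the small SPTE encodes in bits 0-11 carries over to the huge SPTE, so the PT_PAGE_SIZE_MASK bit set just above survives the masking as well.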

arch/x86/kvm/mmu/spte.h

Lines changed: 1 addition & 0 deletions
@@ -506,6 +506,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	       bool host_writable, u64 *new_spte);
 u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte,
 			      union kvm_mmu_page_role role, int index);
+u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level);
 u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled);
 u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access);
 u64 mark_spte_for_access_track(u64 spte);

arch/x86/kvm/mmu/tdp_mmu.c

Lines changed: 43 additions & 9 deletions
@@ -1552,15 +1552,43 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
 	clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
 }
 
-static void zap_collapsible_spte_range(struct kvm *kvm,
-				       struct kvm_mmu_page *root,
-				       const struct kvm_memory_slot *slot)
+static int tdp_mmu_make_huge_spte(struct kvm *kvm,
+				  struct tdp_iter *parent,
+				  u64 *huge_spte)
+{
+	struct kvm_mmu_page *root = spte_to_child_sp(parent->old_spte);
+	gfn_t start = parent->gfn;
+	gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level);
+	struct tdp_iter iter;
+
+	tdp_root_for_each_leaf_pte(iter, root, start, end) {
+		/*
+		 * Use the parent iterator when checking for forward progress so
+		 * that KVM doesn't get stuck continuously trying to yield (i.e.
+		 * returning -EAGAIN here and then failing the forward progress
+		 * check in the caller ad nauseam).
+		 */
+		if (tdp_mmu_iter_need_resched(kvm, parent))
+			return -EAGAIN;
+
+		*huge_spte = make_huge_spte(kvm, iter.old_spte, parent->level);
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+static void recover_huge_pages_range(struct kvm *kvm,
+				     struct kvm_mmu_page *root,
+				     const struct kvm_memory_slot *slot)
 {
 	gfn_t start = slot->base_gfn;
 	gfn_t end = start + slot->npages;
 	struct tdp_iter iter;
 	int max_mapping_level;
 	bool flush = false;
+	u64 huge_spte;
+	int r;
 
 	rcu_read_lock();
 
@@ -1597,7 +1625,13 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
 		if (max_mapping_level < iter.level)
 			continue;
 
-		if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
+		r = tdp_mmu_make_huge_spte(kvm, &iter, &huge_spte);
+		if (r == -EAGAIN)
+			goto retry;
+		else if (r)
+			continue;
+
+		if (tdp_mmu_set_spte_atomic(kvm, &iter, huge_spte))
 			goto retry;
 
 		flush = true;
@@ -1610,17 +1644,17 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
 }
 
 /*
- * Zap non-leaf SPTEs (and free their associated page tables) which could
- * be replaced by huge pages, for GFNs within the slot.
+ * Recover huge page mappings within the slot by replacing non-leaf SPTEs with
+ * huge SPTEs where possible.
  */
-void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
-				       const struct kvm_memory_slot *slot)
+void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm,
+				    const struct kvm_memory_slot *slot)
 {
 	struct kvm_mmu_page *root;
 
 	lockdep_assert_held_read(&kvm->mmu_lock);
 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
-		zap_collapsible_spte_range(kvm, root, slot);
+		recover_huge_pages_range(kvm, root, slot);
 }
 
 /*
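To see the shape of the new helper in isolation: a rough standalone model of tdp_mmu_make_huge_spte(), leaving out the resched/yield check and using a flat 512-entry array in place of the child page table. It picks the first present leaf beneath the parent and collapses it into a huge SPTE (the same address-collapsing idea sketched after the spte.c diff above), or reports -ENOENT so recover_huge_pages_range() simply skips that parent. The bit definitions below are toy stand-ins, not KVM's real SPTE encoding.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_SPTE_PRESENT   (1ULL << 0)    /* toy "present" bit                  */
#define TOY_PAGE_SIZE_BIT  (1ULL << 7)    /* huge-page (PS) bit, as on x86      */
#define TOY_SUBPAGE_MASK   0x1FF000ULL    /* bits 12-20: 4KiB index within 2MiB */

/* Find the first present child SPTE and collapse it into a huge SPTE. */
static int toy_make_huge_spte(const uint64_t child[512], uint64_t *huge_spte)
{
	for (int i = 0; i < 512; i++) {
		if (!(child[i] & TOY_SPTE_PRESENT))
			continue;
		*huge_spte = (child[i] | TOY_PAGE_SIZE_BIT) & ~TOY_SUBPAGE_MASK;
		return 0;
	}
	return -ENOENT;	/* nothing mapped underneath; the caller skips this parent */
}

int main(void)
{
	uint64_t child[512] = { 0 };
	uint64_t huge;

	child[37] = 0x12345000ULL | TOY_SPTE_PRESENT;	/* one mapped 4KiB sub-page */

	if (!toy_make_huge_spte(child, &huge))
		printf("huge SPTE: 0x%llx\n", (unsigned long long)huge);	/* 0x12200081 */
	return 0;
}

In the kernel there is additionally the -EAGAIN case for when the walk needs to yield the MMU lock; recover_huge_pages_range() retries the same parent SPTE in that case rather than skipping it.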

arch/x86/kvm/mmu/tdp_mmu.h

Lines changed: 2 additions & 2 deletions
@@ -40,8 +40,8 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
 				       struct kvm_memory_slot *slot,
 				       gfn_t gfn, unsigned long mask,
 				       bool wrprot);
-void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
-				       const struct kvm_memory_slot *slot);
+void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm,
+				    const struct kvm_memory_slot *slot);
 
 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
 				   struct kvm_memory_slot *slot, gfn_t gfn,

arch/x86/kvm/x86.c

Lines changed: 7 additions & 11 deletions
@@ -13104,19 +13104,15 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 
 	if (!log_dirty_pages) {
 		/*
-		 * Dirty logging tracks sptes in 4k granularity, meaning that
-		 * large sptes have to be split. If live migration succeeds,
-		 * the guest in the source machine will be destroyed and large
-		 * sptes will be created in the destination. However, if the
-		 * guest continues to run in the source machine (for example if
-		 * live migration fails), small sptes will remain around and
-		 * cause bad performance.
+		 * Recover huge page mappings in the slot now that dirty logging
+		 * is disabled, i.e. now that KVM does not have to track guest
+		 * writes at 4KiB granularity.
 		 *
-		 * Scan sptes if dirty logging has been stopped, dropping those
-		 * which can be collapsed into a single large-page spte. Later
-		 * page faults will create the large-page sptes.
+		 * Dirty logging might be disabled by userspace if an ongoing VM
+		 * live migration is cancelled and the VM must continue running
+		 * on the source.
 		 */
-		kvm_mmu_zap_collapsible_sptes(kvm, new);
+		kvm_mmu_recover_huge_pages(kvm, new);
 	} else {
 		/*
 		 * Initially-all-set does not require write protecting any page,
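For reference on how this path is reached: userspace disables dirty logging by re-registering the memslot without KVM_MEM_LOG_DIRTY_PAGES in its flags, which is what lands in kvm_mmu_slot_apply_flags() with !log_dirty_pages and thus triggers kvm_mmu_recover_huge_pages(). A minimal sketch of that call, assuming vm_fd and the slot parameters describe an existing memslot that currently has dirty logging enabled (the helper name here is illustrative):

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

static int disable_dirty_logging(int vm_fd, unsigned int slot,
				 unsigned long long gpa,
				 unsigned long long size,
				 unsigned long long hva)
{
	struct kvm_userspace_memory_region region;

	memset(&region, 0, sizeof(region));
	region.slot = slot;
	region.flags = 0;		/* no KVM_MEM_LOG_DIRTY_PAGES */
	region.guest_phys_addr = gpa;
	region.memory_size = size;
	region.userspace_addr = hva;

	/*
	 * Re-registering the slot without the dirty-logging flag is the
	 * userspace-visible trigger for the huge page recovery above.
	 */
	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}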
