
Commit 8607daa

Merge tag 'kvmarm-fixes-6.3-2' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD
KVM/arm64 fixes for 6.3, part #2

Fixes for a rather interesting set of bugs relating to the MMU:

- Read the MMU notifier seq before dropping the mmap lock to guard against reading a potentially stale VMA

- Disable interrupts when walking user page tables to protect against the page table being freed

- Read the MTE permissions for the VMA within the mmap lock critical section, avoiding the use of a potentially stale VMA pointer

Additionally, some fixes targeting the vPMU:

- Return the sum of the current perf event value and PMC snapshot for reads from userspace

- Don't save the value of guest writes to PMCR_EL0.{C,P}, which could otherwise lead to userspace erroneously resetting the vPMU during VM save/restore
2 parents f3e7074 + 8c2e8ac commit 8607daa
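The vPMU read fix described above amounts to combining two sources of truth. The sketch below is illustrative only: __vcpu_sys_reg() is real, but counter_index_to_reg() and running_perf_event_delta() are hypothetical stand-ins, not the kernel's API. The point is the arithmetic: a userspace read must return the saved shadow-register snapshot plus whatever the backing perf event has counted since.

    /* Illustrative sketch only -- hypothetical helpers, not kernel code. */
    static u64 sketch_read_counter(struct kvm_vcpu *vcpu, u64 idx)
    {
    	u64 snapshot = __vcpu_sys_reg(vcpu, counter_index_to_reg(idx));	/* hypothetical mapping helper */
    	u64 delta = running_perf_event_delta(vcpu, idx);		/* hypothetical: live perf count */

    	return snapshot + delta;
    }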

File tree: 3 files changed (+85, -38 lines)


arch/arm64/kvm/mmu.c

Lines changed: 64 additions & 35 deletions
@@ -666,14 +666,33 @@ static int get_user_mapping_size(struct kvm *kvm, u64 addr)
 				 CONFIG_PGTABLE_LEVELS),
 		.mm_ops	= &kvm_user_mm_ops,
 	};
+	unsigned long flags;
 	kvm_pte_t pte = 0;	/* Keep GCC quiet... */
 	u32 level = ~0;
 	int ret;
 
+	/*
+	 * Disable IRQs so that we hazard against a concurrent
+	 * teardown of the userspace page tables (which relies on
+	 * IPI-ing threads).
+	 */
+	local_irq_save(flags);
 	ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level);
-	VM_BUG_ON(ret);
-	VM_BUG_ON(level >= KVM_PGTABLE_MAX_LEVELS);
-	VM_BUG_ON(!(pte & PTE_VALID));
+	local_irq_restore(flags);
+
+	if (ret)
+		return ret;
+
+	/*
+	 * Not seeing an error, but not updating level? Something went
+	 * deeply wrong...
+	 */
+	if (WARN_ON(level >= KVM_PGTABLE_MAX_LEVELS))
+		return -EFAULT;
+
+	/* Oops, the userspace PTs are gone... Replay the fault */
+	if (!kvm_pte_valid(pte))
+		return -EAGAIN;
 
 	return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level));
 }
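With the VM_BUG_ON()s gone, get_user_mapping_size() now reports failure to its caller instead of panicking. A hypothetical caller (illustration only, not the actual code; the real consumer is transparent_hugepage_adjust() in the next hunk) would treat the new return values like this:

    /* Hypothetical caller, for illustration only. */
    static long sketch_pick_mapping_size(struct kvm *kvm, u64 hva)
    {
    	int sz = get_user_mapping_size(kvm, hva);

    	/*
    	 * Negative values propagate; -EAGAIN means the userspace page
    	 * tables vanished underneath us, so replay the fault.
    	 */
    	if (sz < 0)
    		return sz;

    	return sz >= PMD_SIZE ? PMD_SIZE : PAGE_SIZE;
    }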
@@ -1079,7 +1098,7 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
  *
  * Returns the size of the mapping.
  */
-static unsigned long
+static long
 transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			    unsigned long hva, kvm_pfn_t *pfnp,
 			    phys_addr_t *ipap)
@@ -1091,8 +1110,15 @@ transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
 	 * sure that the HVA and IPA are sufficiently aligned and that the
 	 * block map is contained within the memslot.
 	 */
-	if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
-	    get_user_mapping_size(kvm, hva) >= PMD_SIZE) {
+	if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
+		int sz = get_user_mapping_size(kvm, hva);
+
+		if (sz < 0)
+			return sz;
+
+		if (sz < PMD_SIZE)
+			return PAGE_SIZE;
+
 		/*
 		 * The address we faulted on is backed by a transparent huge
 		 * page. However, because we map the compound huge page and
@@ -1192,7 +1218,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 {
 	int ret = 0;
 	bool write_fault, writable, force_pte = false;
-	bool exec_fault;
+	bool exec_fault, mte_allowed;
 	bool device = false;
 	unsigned long mmu_seq;
 	struct kvm *kvm = vcpu->kvm;
@@ -1203,7 +1229,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	kvm_pfn_t pfn;
 	bool logging_active = memslot_is_logging(memslot);
 	unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
-	unsigned long vma_pagesize, fault_granule;
+	long vma_pagesize, fault_granule;
 	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
 	struct kvm_pgtable *pgt;
 
@@ -1217,6 +1243,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		return -EFAULT;
 	}
 
+	/*
+	 * Permission faults just need to update the existing leaf entry,
+	 * and so normally don't require allocations from the memcache. The
+	 * only exception to this is when dirty logging is enabled at runtime
+	 * and a write fault needs to collapse a block entry into a table.
+	 */
+	if (fault_status != ESR_ELx_FSC_PERM ||
+	    (logging_active && write_fault)) {
+		ret = kvm_mmu_topup_memory_cache(memcache,
+						 kvm_mmu_cache_min_pages(kvm));
+		if (ret)
+			return ret;
+	}
+
 	/*
 	 * Let's check if we will get back a huge page backed by hugetlbfs, or
 	 * get block mapping for device MMIO region.
@@ -1269,37 +1309,21 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	fault_ipa &= ~(vma_pagesize - 1);
 
 	gfn = fault_ipa >> PAGE_SHIFT;
-	mmap_read_unlock(current->mm);
+	mte_allowed = kvm_vma_mte_allowed(vma);
 
-	/*
-	 * Permission faults just need to update the existing leaf entry,
-	 * and so normally don't require allocations from the memcache. The
-	 * only exception to this is when dirty logging is enabled at runtime
-	 * and a write fault needs to collapse a block entry into a table.
-	 */
-	if (fault_status != ESR_ELx_FSC_PERM ||
-	    (logging_active && write_fault)) {
-		ret = kvm_mmu_topup_memory_cache(memcache,
-						 kvm_mmu_cache_min_pages(kvm));
-		if (ret)
-			return ret;
-	}
+	/* Don't use the VMA after the unlock -- it may have vanished */
+	vma = NULL;
 
-	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
 	/*
-	 * Ensure the read of mmu_invalidate_seq happens before we call
-	 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
-	 * the page we just got a reference to gets unmapped before we have a
-	 * chance to grab the mmu_lock, which ensure that if the page gets
-	 * unmapped afterwards, the call to kvm_unmap_gfn will take it away
-	 * from us again properly. This smp_rmb() interacts with the smp_wmb()
-	 * in kvm_mmu_notifier_invalidate_<page|range_end>.
+	 * Read mmu_invalidate_seq so that KVM can detect if the results of
+	 * vma_lookup() or __gfn_to_pfn_memslot() become stale prior to
+	 * acquiring kvm->mmu_lock.
 	 *
-	 * Besides, __gfn_to_pfn_memslot() instead of gfn_to_pfn_prot() is
-	 * used to avoid unnecessary overhead introduced to locate the memory
-	 * slot because it's always fixed even @gfn is adjusted for huge pages.
+	 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
+	 * with the smp_wmb() in kvm_mmu_invalidate_end().
 	 */
-	smp_rmb();
+	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
+	mmap_read_unlock(current->mm);
 
 	pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
 				   write_fault, &writable, NULL);
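For context, this reordering follows KVM's standard invalidation-retry pattern. The condensed sketch below is not a verbatim copy of user_mem_abort() (it assumes the generic mmu_invalidate_retry() helper and elides the VMA handling), but it shows where the sampled sequence number is eventually checked:

    /* Condensed sketch of the retry pattern, not verbatim kernel code. */
    mmap_read_lock(current->mm);
    /* ... vma_lookup(), compute gfn, snapshot kvm_vma_mte_allowed() ... */
    mmu_seq = vcpu->kvm->mmu_invalidate_seq;
    mmap_read_unlock(current->mm);		/* provides the implicit smp_rmb() */

    pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
    			       write_fault, &writable, NULL);

    read_lock(&vcpu->kvm->mmu_lock);
    if (mmu_invalidate_retry(vcpu->kvm, mmu_seq))
    	goto out_unlock;	/* raced with an invalidation: replay the fault */
    /* ... safe to install the stage-2 mapping ... */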
@@ -1350,11 +1374,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
 							   hva, &pfn,
 							   &fault_ipa);
+
+		if (vma_pagesize < 0) {
+			ret = vma_pagesize;
+			goto out_unlock;
+		}
 	}
 
 	if (fault_status != ESR_ELx_FSC_PERM && !device && kvm_has_mte(kvm)) {
 		/* Check the VMM hasn't introduced a new disallowed VMA */
-		if (kvm_vma_mte_allowed(vma)) {
+		if (mte_allowed) {
 			sanitise_mte_tags(kvm, pfn, vma_pagesize);
 		} else {
 			ret = -EFAULT;

arch/arm64/kvm/pmu-emul.c

Lines changed: 2 additions & 1 deletion
@@ -538,7 +538,8 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
 	if (!kvm_pmu_is_3p5(vcpu))
 		val &= ~ARMV8_PMU_PMCR_LP;
 
-	__vcpu_sys_reg(vcpu, PMCR_EL0) = val;
+	/* The reset bits don't indicate any state, and shouldn't be saved. */
+	__vcpu_sys_reg(vcpu, PMCR_EL0) = val & ~(ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_P);
 
 	if (val & ARMV8_PMU_PMCR_E) {
 		kvm_pmu_enable_counter_mask(vcpu,
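To see why the masking matters, the standalone snippet below (illustrative userspace C, not kernel code; PMCR_EL0 bit positions per the architecture: E = bit 0, P = bit 1, C = bit 2) walks a save/restore round trip. If the write-only reset bits were stored verbatim, restoring the saved PMCR_EL0 image would replay the counter reset:

    #include <stdint.h>
    #include <stdio.h>

    #define PMCR_E (1u << 0)	/* enable */
    #define PMCR_P (1u << 1)	/* reset event counters (write-only) */
    #define PMCR_C (1u << 2)	/* reset cycle counter (write-only) */

    int main(void)
    {
    	uint64_t guest_write = PMCR_E | PMCR_C;	/* guest enables PMU, resets cycle counter */

    	uint64_t saved_old = guest_write;			/* previous behaviour */
    	uint64_t saved_new = guest_write & ~(PMCR_C | PMCR_P);	/* patched behaviour */

    	/* Restoring saved_old would reset the cycle counter again; saved_new would not. */
    	printf("restore replays reset? old=%d new=%d\n",
    	       (int)!!(saved_old & PMCR_C), (int)!!(saved_new & PMCR_C));
    	return 0;
    }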

arch/arm64/kvm/sys_regs.c

Lines changed: 19 additions & 2 deletions
@@ -856,6 +856,22 @@ static bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx)
 	return true;
 }
 
+static int get_pmu_evcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+			  u64 *val)
+{
+	u64 idx;
+
+	if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 0)
+		/* PMCCNTR_EL0 */
+		idx = ARMV8_PMU_CYCLE_IDX;
+	else
+		/* PMEVCNTRn_EL0 */
+		idx = ((r->CRm & 3) << 3) | (r->Op2 & 7);
+
+	*val = kvm_pmu_get_counter_value(vcpu, idx);
+	return 0;
+}
+
 static bool access_pmu_evcntr(struct kvm_vcpu *vcpu,
 			      struct sys_reg_params *p,
 			      const struct sys_reg_desc *r)
@@ -1072,7 +1088,7 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
 /* Macro to expand the PMEVCNTRn_EL0 register */
 #define PMU_PMEVCNTR_EL0(n)						\
 	{ PMU_SYS_REG(SYS_PMEVCNTRn_EL0(n)),				\
-	  .reset = reset_pmevcntr,					\
+	  .reset = reset_pmevcntr, .get_user = get_pmu_evcntr,		\
 	  .access = access_pmu_evcntr, .reg = (PMEVCNTR0_EL0 + n), }
 
 /* Macro to expand the PMEVTYPERn_EL0 register */
@@ -1982,7 +1998,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 	{ PMU_SYS_REG(SYS_PMCEID1_EL0),
 	  .access = access_pmceid, .reset = NULL },
 	{ PMU_SYS_REG(SYS_PMCCNTR_EL0),
-	  .access = access_pmu_evcntr, .reset = reset_unknown, .reg = PMCCNTR_EL0 },
+	  .access = access_pmu_evcntr, .reset = reset_unknown,
+	  .reg = PMCCNTR_EL0, .get_user = get_pmu_evcntr},
 	{ PMU_SYS_REG(SYS_PMXEVTYPER_EL0),
 	  .access = access_pmu_evtyper, .reset = NULL },
 	{ PMU_SYS_REG(SYS_PMXEVCNTR_EL0),
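The index decode in get_pmu_evcntr() leans on the architectural encoding of PMEVCNTR<n>_EL0, where CRm carries n<4:3> (as 0b10:n<4:3>) and Op2 carries n<2:0>. The standalone check below (illustrative only, not kernel code) confirms that ((CRm & 3) << 3) | (Op2 & 7) round-trips every valid event-counter index:

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
    	for (unsigned int n = 0; n < 31; n++) {
    		unsigned int crm = 0x8 | (n >> 3);	/* 0b10:n<4:3> */
    		unsigned int op2 = n & 7;		/* n<2:0> */

    		assert((((crm & 3) << 3) | (op2 & 7)) == n);
    	}
    	printf("PMEVCNTRn_EL0 index decode round-trips for n = 0..30\n");
    	return 0;
    }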
