@@ -666,14 +666,33 @@ static int get_user_mapping_size(struct kvm *kvm, u64 addr)
                                        CONFIG_PGTABLE_LEVELS),
                .mm_ops         = &kvm_user_mm_ops,
        };
+       unsigned long flags;
        kvm_pte_t pte = 0;      /* Keep GCC quiet... */
        u32 level = ~0;
        int ret;
 
+       /*
+        * Disable IRQs so that we hazard against a concurrent
+        * teardown of the userspace page tables (which relies on
+        * IPI-ing threads).
+        */
+       local_irq_save(flags);
        ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level);
-       VM_BUG_ON(ret);
-       VM_BUG_ON(level >= KVM_PGTABLE_MAX_LEVELS);
-       VM_BUG_ON(!(pte & PTE_VALID));
+       local_irq_restore(flags);
+
+       if (ret)
+               return ret;
+
+       /*
+        * Not seeing an error, but not updating level? Something went
+        * deeply wrong...
+        */
+       if (WARN_ON(level >= KVM_PGTABLE_MAX_LEVELS))
+               return -EFAULT;
+
+       /* Oops, the userspace PTs are gone... Replay the fault */
+       if (!kvm_pte_valid(pte))
+               return -EAGAIN;
 
        return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level));
 }
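Why masking IRQs is enough here: the teardown side only frees the userspace page tables after IPI-ing every CPU, and an IPI cannot be delivered to a CPU that has IRQs disabled. Below is a toy model of that scheme, not part of the patch. walk_tables(), unhook_tables() and free_tables() are made-up stubs; local_irq_save()/local_irq_restore() and on_each_cpu() are the real kernel primitives.

static void ipi_nop(void *info) { }              /* empty IPI handler */

/* Walker side: the IRQs-off window is the hazard against teardown. */
static void walker(void)
{
        unsigned long flags;

        local_irq_save(flags);                   /* IPIs cannot land on this CPU now */
        walk_tables();                           /* stub: read the userspace PTs     */
        local_irq_restore(flags);                /* window closed, IPIs land again   */
}

/* Teardown side: the IPI broadcast waits out every IRQs-off walker. */
static void teardown(void)
{
        unhook_tables();                         /* stub: new walkers can't reach them */
        on_each_cpu(ipi_nop, NULL, 1);           /* returns only once every CPU has    */
                                                 /* taken the IPI, i.e. no walker is   */
                                                 /* still inside its IRQs-off window   */
        free_tables();                           /* stub: nobody can be walking them   */
}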
@@ -1079,7 +1098,7 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
  *
  * Returns the size of the mapping.
  */
-static unsigned long
+static long
 transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
                             unsigned long hva, kvm_pfn_t *pfnp,
                             phys_addr_t *ipap)
@@ -1091,8 +1110,15 @@ transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
         * sure that the HVA and IPA are sufficiently aligned and that the
         * block map is contained within the memslot.
         */
-       if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
-           get_user_mapping_size(kvm, hva) >= PMD_SIZE) {
+       if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
+               int sz = get_user_mapping_size(kvm, hva);
+
+               if (sz < 0)
+                       return sz;
+
+               if (sz < PMD_SIZE)
+                       return PAGE_SIZE;
+
                /*
                 * The address we faulted on is backed by a transparent huge
                 * page. However, because we map the compound huge page and
@@ -1192,7 +1218,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 {
        int ret = 0;
        bool write_fault, writable, force_pte = false;
-       bool exec_fault;
+       bool exec_fault, mte_allowed;
        bool device = false;
        unsigned long mmu_seq;
        struct kvm *kvm = vcpu->kvm;
@@ -1203,7 +1229,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        kvm_pfn_t pfn;
        bool logging_active = memslot_is_logging(memslot);
        unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
-       unsigned long vma_pagesize, fault_granule;
+       long vma_pagesize, fault_granule;
        enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
        struct kvm_pgtable *pgt;
@@ -1217,6 +1243,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                return -EFAULT;
        }
 
+       /*
+        * Permission faults just need to update the existing leaf entry,
+        * and so normally don't require allocations from the memcache. The
+        * only exception to this is when dirty logging is enabled at runtime
+        * and a write fault needs to collapse a block entry into a table.
+        */
+       if (fault_status != ESR_ELx_FSC_PERM ||
+           (logging_active && write_fault)) {
+               ret = kvm_mmu_topup_memory_cache(memcache,
+                                                kvm_mmu_cache_min_pages(kvm));
+               if (ret)
+                       return ret;
+       }
+
        /*
         * Let's check if we will get back a huge page backed by hugetlbfs, or
         * get block mapping for device MMIO region.
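The condition being hoisted here reads most naturally as a predicate. A restatement for clarity only: needs_stage2_memcache() is not a real function and its parameter types are chosen for the sketch; ESR_ELx_FSC_PERM and the flags are the ones used above.

/* Illustrative only: when does this fault need pre-allocated table pages? */
static bool needs_stage2_memcache(unsigned long fault_status,
                                  bool logging_active, bool write_fault)
{
        /* Anything other than a permission fault may have to install new tables. */
        if (fault_status != ESR_ELx_FSC_PERM)
                return true;

        /* A write fault under dirty logging may collapse a block entry into a table. */
        return logging_active && write_fault;
}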
@@ -1269,37 +1309,21 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        fault_ipa &= ~(vma_pagesize - 1);
 
        gfn = fault_ipa >> PAGE_SHIFT;
-       mmap_read_unlock(current->mm);
+       mte_allowed = kvm_vma_mte_allowed(vma);
 
-       /*
-        * Permission faults just need to update the existing leaf entry,
-        * and so normally don't require allocations from the memcache. The
-        * only exception to this is when dirty logging is enabled at runtime
-        * and a write fault needs to collapse a block entry into a table.
-        */
-       if (fault_status != ESR_ELx_FSC_PERM ||
-           (logging_active && write_fault)) {
-               ret = kvm_mmu_topup_memory_cache(memcache,
-                                                kvm_mmu_cache_min_pages(kvm));
-               if (ret)
-                       return ret;
-       }
+       /* Don't use the VMA after the unlock -- it may have vanished */
+       vma = NULL;
 
-       mmu_seq = vcpu->kvm->mmu_invalidate_seq;
        /*
-        * Ensure the read of mmu_invalidate_seq happens before we call
-        * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
-        * the page we just got a reference to gets unmapped before we have a
-        * chance to grab the mmu_lock, which ensure that if the page gets
-        * unmapped afterwards, the call to kvm_unmap_gfn will take it away
-        * from us again properly. This smp_rmb() interacts with the smp_wmb()
-        * in kvm_mmu_notifier_invalidate_<page|range_end>.
+        * Read mmu_invalidate_seq so that KVM can detect if the results of
+        * vma_lookup() or __gfn_to_pfn_memslot() become stale prior to
+        * acquiring kvm->mmu_lock.
         *
-        * Besides, __gfn_to_pfn_memslot() instead of gfn_to_pfn_prot() is
-        * used to avoid unnecessary overhead introduced to locate the memory
-        * slot because it's always fixed even @gfn is adjusted for huge pages.
+        * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
+        * with the smp_wmb() in kvm_mmu_invalidate_end().
         */
-       smp_rmb();
+       mmu_seq = vcpu->kvm->mmu_invalidate_seq;
+       mmap_read_unlock(current->mm);
 
        pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
                                   write_fault, &writable, NULL);
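The other half of this ordering is not visible in the hunk: after the pfn lookup, user_mem_abort() re-checks the sequence count under kvm->mmu_lock (an rwlock on arm64) before installing anything. A simplified sketch of what that check compares, not a verbatim quote; in the real code the check and the map installation sit in the same mmu_lock critical section, whereas pfn_may_be_stale() is a made-up helper.

/* Sketch only: has an MMU-notifier invalidation run since mmu_seq was read? */
static bool pfn_may_be_stale(struct kvm *kvm, unsigned long mmu_seq)
{
        bool stale;

        read_lock(&kvm->mmu_lock);
        /*
         * mmu_invalidate_retry() compares the sampled sequence against the
         * current one; any invalidation in between bumps it. The smp_wmb()
         * in kvm_mmu_invalidate_end() pairs with the implicit smp_rmb()
         * provided by mmap_read_unlock() above.
         */
        stale = mmu_invalidate_retry(kvm, mmu_seq);
        read_unlock(&kvm->mmu_lock);

        return stale;
}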
@@ -1350,11 +1374,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
                                                           hva, &pfn,
                                                           &fault_ipa);
+
+               if (vma_pagesize < 0) {
+                       ret = vma_pagesize;
+                       goto out_unlock;
+               }
        }
 
        if (fault_status != ESR_ELx_FSC_PERM && !device && kvm_has_mte(kvm)) {
                /* Check the VMM hasn't introduced a new disallowed VMA */
-               if (kvm_vma_mte_allowed(vma)) {
+               if (mte_allowed) {
                        sanitise_mte_tags(kvm, pfn, vma_pagesize);
                } else {
                        ret = -EFAULT;
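The mte_allowed snapshot used here exists because of the rule enforced in the previous hunk: once mmap_read_unlock() runs, the VMA may be freed, so anything KVM needs from it must be copied out while the lock is still held. A minimal sketch of that rule; snapshot_vma_mte() is made up, while vma_lookup(), kvm_vma_mte_allowed() and the mmap lock helpers are the real routines.

/* Sketch only: capture VMA-derived state while mmap_lock is still held. */
static bool snapshot_vma_mte(struct mm_struct *mm, unsigned long hva)
{
        struct vm_area_struct *vma;
        bool mte_allowed;

        mmap_read_lock(mm);
        vma = vma_lookup(mm, hva);
        mte_allowed = vma && kvm_vma_mte_allowed(vma);
        mmap_read_unlock(mm);

        /* 'vma' must not be dereferenced past this point: it may be gone. */
        return mte_allowed;
}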