Skip to content

Commit a78986a

Browse files
Sean Christopherson authored and bonzini
committed
KVM: MMU: Do not treat ZONE_DEVICE pages as being reserved
Explicitly exempt ZONE_DEVICE pages from kvm_is_reserved_pfn() and instead manually handle ZONE_DEVICE on a case-by-case basis. For things like page refcounts, KVM needs to treat ZONE_DEVICE pages like normal pages, e.g. put pages grabbed via gup(). But for flows such as setting A/D bits or shifting refcounts for transparent huge pages, KVM needs to avoid processing ZONE_DEVICE pages as the flows in question lack the underlying machinery for proper handling of ZONE_DEVICE pages. This fixes a hang reported by Adam Borowski[*] in dev_pagemap_cleanup() when running a KVM guest backed with /dev/dax memory, as KVM straight up doesn't put any references to ZONE_DEVICE pages acquired by gup(). Note, Dan Williams proposed an alternative solution of doing put_page() on ZONE_DEVICE pages immediately after gup() in order to simplify the auditing needed to ensure is_zone_device_page() is called if and only if the backing device is pinned (via gup()). But that approach would break kvm_vcpu_{un}map() as KVM requires the page to be pinned from map() 'til unmap() when accessing guest memory, unlike KVM's secondary MMU, which coordinates with mmu_notifier invalidations to avoid creating stale page references, i.e. doesn't rely on pages being pinned. [*] http://lkml.kernel.org/r/[email protected] Reported-by: Adam Borowski <[email protected]> Analyzed-by: David Hildenbrand <[email protected]> Acked-by: Dan Williams <[email protected]> Cc: [email protected] Fixes: 3565fce ("mm, x86: get_user_pages() for dax mappings") Signed-off-by: Sean Christopherson <[email protected]> Signed-off-by: Paolo Bonzini <[email protected]>
1 parent 29881b6 commit a78986a

File tree

3 files changed

+28
-7
lines changed

3 files changed

+28
-7
lines changed

arch/x86/kvm/mmu.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3306,7 +3306,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
33063306
* here.
33073307
*/
33083308
if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
3309-
level == PT_PAGE_TABLE_LEVEL &&
3309+
!kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL &&
33103310
PageTransCompoundMap(pfn_to_page(pfn)) &&
33113311
!mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
33123312
unsigned long mask;
@@ -5914,9 +5914,9 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
59145914
* the guest, and the guest page table is using 4K page size
59155915
* mapping if the indirect sp has level = 1.
59165916
*/
5917-
if (sp->role.direct &&
5918-
!kvm_is_reserved_pfn(pfn) &&
5919-
PageTransCompoundMap(pfn_to_page(pfn))) {
5917+
if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
5918+
!kvm_is_zone_device_pfn(pfn) &&
5919+
PageTransCompoundMap(pfn_to_page(pfn))) {
59205920
pte_list_remove(rmap_head, sptep);
59215921

59225922
if (kvm_available_flush_tlb_with_range())

include/linux/kvm_host.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -966,6 +966,7 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
966966
void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
967967

968968
bool kvm_is_reserved_pfn(kvm_pfn_t pfn);
969+
bool kvm_is_zone_device_pfn(kvm_pfn_t pfn);
969970

970971
struct kvm_irq_ack_notifier {
971972
struct hlist_node link;

virt/kvm/kvm_main.c

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -149,10 +149,30 @@ __weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
149149
return 0;
150150
}
151151

152+
bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
153+
{
154+
/*
155+
* The metadata used by is_zone_device_page() to determine whether or
156+
* not a page is ZONE_DEVICE is guaranteed to be valid if and only if
157+
* the device has been pinned, e.g. by get_user_pages(). WARN if the
158+
* page_count() is zero to help detect bad usage of this helper.
159+
*/
160+
if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
161+
return false;
162+
163+
return is_zone_device_page(pfn_to_page(pfn));
164+
}
165+
152166
bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
153167
{
168+
/*
169+
* ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
170+
* perspective they are "normal" pages, albeit with slightly different
171+
* usage rules.
172+
*/
154173
if (pfn_valid(pfn))
155-
return PageReserved(pfn_to_page(pfn));
174+
return PageReserved(pfn_to_page(pfn)) &&
175+
!kvm_is_zone_device_pfn(pfn);
156176

157177
return true;
158178
}
@@ -1857,7 +1877,7 @@ EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
18571877

18581878
void kvm_set_pfn_dirty(kvm_pfn_t pfn)
18591879
{
1860-
if (!kvm_is_reserved_pfn(pfn)) {
1880+
if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) {
18611881
struct page *page = pfn_to_page(pfn);
18621882

18631883
SetPageDirty(page);
@@ -1867,7 +1887,7 @@ EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
18671887

18681888
void kvm_set_pfn_accessed(kvm_pfn_t pfn)
18691889
{
1870-
if (!kvm_is_reserved_pfn(pfn))
1890+
if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
18711891
mark_page_accessed(pfn_to_page(pfn));
18721892
}
18731893
EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);

0 commit comments

Comments
 (0)