Skip to content

Commit b855243

Browse files
committed
KVM: guest_memfd: delay kvm_gmem_prepare_folio() until the memory is passed to the guest
Initializing the contents of the folio on fallocate() is unnecessarily restrictive. It means that the page is registered with the firmware and then it cannot be touched anymore. In particular, this loses the possibility of using fallocate() to pre-allocate the page for SEV-SNP guests, because kvm_arch_gmem_prepare() then fails. It's only when the guest actually accesses the page (and therefore kvm_gmem_get_pfn() is called) that the page must be cleared from any stale host data and registered with the firmware. The up-to-date flag is clear if this has to be done (i.e. it is the first access and kvm_gmem_populate() has not been called). All in all, there are enough differences between kvm_gmem_get_pfn() and kvm_gmem_populate() that it's better to separate the two flows completely. Extract the bulk of kvm_gmem_get_folio(), which takes a folio and ends up setting its up-to-date flag, to a new function kvm_gmem_prepare_folio(); these steps are now done only by the non-__-prefixed kvm_gmem_get_pfn(). As a bonus, __kvm_gmem_get_pfn() loses its ugly "bool prepare" argument. One difference is that fallocate(PUNCH_HOLE) can now race with a page fault. Potentially this causes a page to be prepared and inserted into the filemap even after fallocate(PUNCH_HOLE). This is harmless, as it can be fixed by another hole-punching operation, and can be avoided by clearing the private-page attribute prior to invoking fallocate(PUNCH_HOLE). This way, the page fault will cause an exit to user space. The previous semantics, where fallocate() could be used to prepare the pages in advance of running the guest, can be accessed with KVM_PRE_FAULT_MEMORY. For now, accessing a page in one VM will attempt to call kvm_arch_gmem_prepare() in all of those that have bound the guest_memfd. Cleaning this up is left to a separate patch. Suggested-by: Sean Christopherson <[email protected]> Signed-off-by: Paolo Bonzini <[email protected]>
1 parent 78c4293 commit b855243

File tree

1 file changed

+66
-44
lines changed

1 file changed

+66
-44
lines changed

virt/kvm/guest_memfd.c

Lines changed: 66 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
2525
return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
2626
}
2727

28-
static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct folio *folio)
28+
static int __kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct folio *folio)
2929
{
3030
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
3131
struct list_head *gmem_list = &inode->i_mapping->i_private_list;
@@ -59,49 +59,63 @@ static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct fol
5959
return 0;
6060
}
6161

62-
/* Returns a locked folio on success. */
63-
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool prepare)
62+
/*
63+
* Process @folio, which contains @gfn, so that the guest can use it.
64+
* The folio must be locked and the gfn must be contained in @slot.
65+
* On successful return the guest sees a zero page so as to avoid
66+
* leaking host data and the up-to-date flag is set.
67+
*/
68+
static int kvm_gmem_prepare_folio(struct file *file, struct kvm_memory_slot *slot,
69+
gfn_t gfn, struct folio *folio)
6470
{
65-
struct folio *folio;
71+
unsigned long nr_pages, i;
72+
pgoff_t index;
73+
int r;
6674

67-
/* TODO: Support huge pages. */
68-
folio = filemap_grab_folio(inode->i_mapping, index);
69-
if (IS_ERR(folio))
70-
return folio;
75+
if (folio_test_uptodate(folio))
76+
return 0;
77+
78+
nr_pages = folio_nr_pages(folio);
79+
for (i = 0; i < nr_pages; i++)
80+
clear_highpage(folio_page(folio, i));
7181

7282
/*
73-
* Use the up-to-date flag to track whether or not the memory has been
74-
* zeroed before being handed off to the guest. There is no backing
75-
* storage for the memory, so the folio will remain up-to-date until
76-
* it's removed.
83+
* Preparing huge folios should always be safe, since it should
84+
* be possible to split them later if needed.
7785
*
78-
* TODO: Skip clearing pages when trusted firmware will do it when
79-
* assigning memory to the guest.
86+
* Right now the folio order is always going to be zero, but the
87+
* code is ready for huge folios. The only assumption is that
88+
* the base pgoff of memslots is naturally aligned with the
89+
* requested page order, ensuring that huge folios can also use
90+
* huge page table entries for GPA->HPA mapping.
91+
*
92+
* The order will be passed when creating the guest_memfd, and
93+
* checked when creating memslots.
8094
*/
81-
if (!folio_test_uptodate(folio)) {
82-
unsigned long nr_pages = folio_nr_pages(folio);
83-
unsigned long i;
84-
85-
for (i = 0; i < nr_pages; i++)
86-
clear_highpage(folio_page(folio, i));
87-
}
88-
89-
if (prepare) {
90-
int r = kvm_gmem_prepare_folio(inode, index, folio);
91-
if (r < 0) {
92-
folio_unlock(folio);
93-
folio_put(folio);
94-
return ERR_PTR(r);
95-
}
95+
WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio)));
96+
index = gfn - slot->base_gfn + slot->gmem.pgoff;
97+
index = ALIGN_DOWN(index, 1 << folio_order(folio));
9698

99+
r = __kvm_gmem_prepare_folio(file_inode(file), index, folio);
100+
if (!r)
97101
folio_mark_uptodate(folio);
98-
}
99102

100-
/*
101-
* Ignore accessed, referenced, and dirty flags. The memory is
102-
* unevictable and there is no storage to write back to.
103-
*/
104-
return folio;
103+
return r;
104+
}
105+
106+
/*
107+
* Returns a locked folio on success. The caller is responsible for
108+
* setting the up-to-date flag before the memory is mapped into the guest.
109+
* There is no backing storage for the memory, so the folio will remain
110+
* up-to-date until it's removed.
111+
*
112+
* Ignore accessed, referenced, and dirty flags. The memory is
113+
* unevictable and there is no storage to write back to.
114+
*/
115+
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
116+
{
117+
/* TODO: Support huge pages. */
118+
return filemap_grab_folio(inode->i_mapping, index);
105119
}
106120

107121
static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
@@ -201,7 +215,7 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
201215
break;
202216
}
203217

204-
folio = kvm_gmem_get_folio(inode, index, true);
218+
folio = kvm_gmem_get_folio(inode, index);
205219
if (IS_ERR(folio)) {
206220
r = PTR_ERR(folio);
207221
break;
@@ -555,7 +569,7 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot)
555569
/* Returns a locked folio on success. */
556570
static struct folio *
557571
__kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot,
558-
gfn_t gfn, kvm_pfn_t *pfn, int *max_order, bool prepare)
572+
gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
559573
{
560574
pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
561575
struct kvm_gmem *gmem = file->private_data;
@@ -572,7 +586,7 @@ __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot,
572586
return ERR_PTR(-EIO);
573587
}
574588

575-
folio = kvm_gmem_get_folio(file_inode(file), index, prepare);
589+
folio = kvm_gmem_get_folio(file_inode(file), index);
576590
if (IS_ERR(folio))
577591
return folio;
578592

@@ -594,17 +608,25 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
594608
{
595609
struct file *file = kvm_gmem_get_file(slot);
596610
struct folio *folio;
611+
int r = 0;
597612

598613
if (!file)
599614
return -EFAULT;
600615

601-
folio = __kvm_gmem_get_pfn(file, slot, gfn, pfn, max_order, true);
602-
fput(file);
603-
if (IS_ERR(folio))
604-
return PTR_ERR(folio);
616+
folio = __kvm_gmem_get_pfn(file, slot, gfn, pfn, max_order);
617+
if (IS_ERR(folio)) {
618+
r = PTR_ERR(folio);
619+
goto out;
620+
}
605621

622+
r = kvm_gmem_prepare_folio(file, slot, gfn, folio);
606623
folio_unlock(folio);
607-
return 0;
624+
if (r < 0)
625+
folio_put(folio);
626+
627+
out:
628+
fput(file);
629+
return r;
608630
}
609631
EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
610632

@@ -643,7 +665,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
643665
break;
644666
}
645667

646-
folio = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &max_order, false);
668+
folio = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &max_order);
647669
if (IS_ERR(folio)) {
648670
ret = PTR_ERR(folio);
649671
break;

0 commit comments

Comments
 (0)