diff --git a/.buildkite/common.py b/.buildkite/common.py index a979638e472..7013406a0f0 100644 --- a/.buildkite/common.py +++ b/.buildkite/common.py @@ -33,6 +33,7 @@ DEFAULT_PLATFORMS = [ ("al2", "linux_5.10"), ("al2023", "linux_6.1"), + ("ubuntu24", "secret_hiding"), ] diff --git a/.buildkite/pipeline_pr.py b/.buildkite/pipeline_pr.py index 7f85f777c6b..17c0df83d94 100755 --- a/.buildkite/pipeline_pr.py +++ b/.buildkite/pipeline_pr.py @@ -68,6 +68,17 @@ for step in kani_grp["steps"]: step["label"] = "🔍 Kani" +if not changed_files or ( + any(parent.name == "hiding_ci" for x in changed_files for parent in x.parents) +): + pipeline.build_group_per_arch( + "🕵️ Build Secret Hiding Kernel", + pipeline.devtool_test( + pytest_opts="-m secret_hiding integration_tests/build/test_hiding_kernel.py", + ), + depends_on_build=False, + ) + if run_all_tests(changed_files): pipeline.build_group( "📦 Build", diff --git a/Cargo.toml b/Cargo.toml index 4f8cc3f5eb5..58a001e202e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,7 @@ exit = "warn" tests_outside_test_module = "warn" assertions_on_result_states = "warn" error_impl_error = "warn" +needless-update = "allow" [profile.dev] panic = "abort" diff --git a/resources/hiding_ci/build_and_install_kernel.sh b/resources/hiding_ci/build_and_install_kernel.sh new file mode 100755 index 00000000000..74bbb979906 --- /dev/null +++ b/resources/hiding_ci/build_and_install_kernel.sh @@ -0,0 +1,214 @@ +#!/bin/bash +# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# fail if we encounter an error, uninitialized variable or a pipe breaks +set -eu -o pipefail + +check_root() { + # We need sudo privileges to install the kernel + if [ "$(id -u)" -ne 0 ]; then + echo "To install, this script must be run as root or with sudo privileges" + exit 1 + fi +} + +check_userspace() { + # Currently this script only works on Ubuntu and AL2023 + if grep -qi 'ubuntu' /etc/os-release; then + USERSPACE="UBUNTU" + return 0 + fi + + if grep -qi 'al2023' /etc/os-release; then + USERSPACE="AL2023" + return 0 + fi + + echo "This script currently only works on Ubuntu and Amazon Linux 2023." + exit 1 +} + +tidy_up() { + # Some cleanup after we are done + echo "Cleaning up.." + cd $START_DIR + rm -rf $TMP_BUILD_DIR +} + +confirm() { + if [[ "$*" == *"--no-install"* ]]; then + echo "Not installing new kernel." + + if [[ "$*" == *"--tidy"* ]]; then + tidy_up + fi + + exit 0 + fi + + if [[ "$*" == *"--install"* ]]; then + return 0 + fi + + while true; do + read -p "Do you want to install the new kernel? (y/n) " yn + case $yn in + [Yy]*) return 0 ;; + [Nn]*) + echo "Exiting..." + exit 1 + ;; + *) echo "Please answer yes or no." ;; + esac + done +} + +apply_patch_file() { + git apply $1 +} + +apply_series_mbox() { + git am $1 --empty=drop +} + +apply_series_link() { + patch_url=$(cat $1) + echo "Fetching mbox from:" $patch_url + curl --output lore.mbox.gz "$patch_url/t.mbox.gz" + gunzip lore.mbox + apply_series_mbox lore.mbox + rm lore.mbox +} + +apply_patch_or_series() { + case "$1" in + *.patch) apply_patch_file $1 ;; + *.mbox) apply_series_mbox $1 ;; + *.lore) apply_series_link $1 ;; + *) echo "Skipping non-patch file" $1 ;; + esac +} + +check_override_presence() { + while IFS= read -r line; do + if ! grep -Fq "$line" .config; then + echo "Missing config: $line" + exit 1 + fi + done <"$KERNEL_CONFIG_OVERRIDES" + + echo "All overrides correctly applied.." 
+} + +ubuntu_update_boot() { + echo "Update initramfs" + update-initramfs -c -k $KERNEL_VERSION + echo "Updating GRUB..." + update-grub +} + +al2023_update_boot() { + echo "Installing ENA driver for AL2023" + $START_DIR/install_ena.sh $KERNEL_VERSION $START_DIR/dkms.conf + + # Just ensure we are back in the build dir + cd $TMP_BUILD_DIR + + echo "Creating the new ram disk" + dracut --kver $KERNEL_VERSION -f -v + + # This varies from x86 and ARM so capture what was generated + # We add the || true here due to the fact that we have pipefail enabled + # this causes a non 0 exit when ls can't find vmlinux or vmlinuz + VM_LINUX_LOCATION=$(ls /boot/vmlinu{x,z}-$KERNEL_VERSION 2>/dev/null | head -n1 || true) + + echo "Updating GRUB..." + grubby --grub2 --add-kernel $VM_LINUX_LOCATION \ + --title="Secret Hiding" \ + --initrd=/boot/initramfs-$KERNEL_VERSION.img --copy-default + grubby --set-default $VM_LINUX_LOCATION +} + +update_boot_config() { + case "$USERSPACE" in + UBUNTU) ubuntu_update_boot ;; + AL2023) al2023_update_boot ;; + *) + echo "Unknown userspace" + exit 1 + ;; + esac +} + +KERNEL_URL=$(cat kernel_url) +KERNEL_COMMIT_HASH=$(cat kernel_commit_hash) +KERNEL_PATCHES_DIR=$(pwd)/linux_patches +KERNEL_CONFIG_OVERRIDES=$(pwd)/kernel_config_overrides + +TMP_BUILD_DIR=$(mktemp -d -t kernel-build-XXXX) + +START_DIR=$(pwd) + +cd $TMP_BUILD_DIR + +echo "Cloning kernel repository into" $TMP_BUILD_DIR + +# We checkout the repository that way to make it as +# small and fast as possible +git init +git remote add origin $KERNEL_URL +git fetch --depth 1 origin $KERNEL_COMMIT_HASH +git checkout FETCH_HEAD + +# Apply our patches on top +for PATCH in $KERNEL_PATCHES_DIR/*.*; do + echo "Applying patch:" $(basename $PATCH) + apply_patch_or_series $PATCH +done + +echo "Making kernel config ready for build" +# We use olddefconfig to automatically pull in the +# config from the AMI and update to the newest +# defaults +make olddefconfig + +# Disable the ubuntu keys +scripts/config 
--disable SYSTEM_TRUSTED_KEYS +scripts/config --disable SYSTEM_REVOCATION_KEYS + +# We run this again to default options now changed by +# the disabling of the ubuntu keys +make olddefconfig + +# Apply our config overrides on top of the config +scripts/kconfig/merge_config.sh -m .config $KERNEL_CONFIG_OVERRIDES + +check_override_presence + +echo "Building kernel this may take a while" +make -s -j $(nproc) +echo "Building kernel modules" +make modules -s -j $(nproc) +echo "Kernel build complete!" + +KERNEL_VERSION=$(KERNELVERSION=$(make -s kernelversion) ./scripts/setlocalversion) + +echo "New kernel version:" $KERNEL_VERSION + +# Make sure a user really wants to install this kernel +confirm "$@" + +check_root +check_userspace + +echo "Installing kernel modules..." +make INSTALL_MOD_STRIP=1 modules_install +echo "Installing kernel..." +make INSTALL_MOD_STRIP=1 install + +update_boot_config + +echo "Kernel built and installed successfully!" + +tidy_up diff --git a/resources/hiding_ci/dkms.conf b/resources/hiding_ci/dkms.conf new file mode 100644 index 00000000000..29f108ba298 --- /dev/null +++ b/resources/hiding_ci/dkms.conf @@ -0,0 +1,10 @@ +PACKAGE_NAME="ena" +PACKAGE_VERSION="1.0.0" +CLEAN="make -C kernel/linux/ena clean" +MAKE="make -C kernel/linux/ena/ BUILD_KERNEL=${kernelver}" +BUILT_MODULE_NAME[0]="ena" +BUILT_MODULE_LOCATION="kernel/linux/ena" +DEST_MODULE_LOCATION[0]="/updates" +DEST_MODULE_NAME[0]="ena" +REMAKE_INITRD="yes" +AUTOINSTALL="yes" diff --git a/resources/hiding_ci/install_ena.sh b/resources/hiding_ci/install_ena.sh new file mode 100755 index 00000000000..7d0fd679395 --- /dev/null +++ b/resources/hiding_ci/install_ena.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# fail if we encounter an error, uninitialized variable or a pipe breaks +set -eu -o pipefail + +AMZN_DRIVER_VERSION="2.13.3" +KERNEL_VERSION=$1 +DKMS_CONF_LOCATION=$2 +START_DIR=$(pwd) + +cd /tmp/ + +git clone --depth=1 https://github.com/amzn/amzn-drivers.git +mv amzn-drivers /usr/src/amzn-drivers-${AMZN_DRIVER_VERSION} + +cp $DKMS_CONF_LOCATION /usr/src/amzn-drivers-${AMZN_DRIVER_VERSION} + +dkms add -m amzn-drivers -v ${AMZN_DRIVER_VERSION} +dkms build -k ${KERNEL_VERSION} -m amzn-drivers -v ${AMZN_DRIVER_VERSION} +dkms install -k ${KERNEL_VERSION} -m amzn-drivers -v ${AMZN_DRIVER_VERSION} + +cd $START_DIR diff --git a/resources/hiding_ci/kernel_commit_hash b/resources/hiding_ci/kernel_commit_hash new file mode 100644 index 00000000000..39d6afaaf51 --- /dev/null +++ b/resources/hiding_ci/kernel_commit_hash @@ -0,0 +1 @@ +4701f33a10702d5fc577c32434eb62adde0a1ae1 diff --git a/resources/hiding_ci/kernel_config_overrides b/resources/hiding_ci/kernel_config_overrides new file mode 100644 index 00000000000..e42464abb89 --- /dev/null +++ b/resources/hiding_ci/kernel_config_overrides @@ -0,0 +1,6 @@ +CONFIG_EXPERT=y +CONFIG_KVM=y +CONFIG_KVM_SW_PROTECTED_VM=y +CONFIG_KVM_PRIVATE_MEM=y +CONFIG_KVM_AMD_SEV=y +CONFIG_DEBUG_INFO=y diff --git a/resources/hiding_ci/kernel_url b/resources/hiding_ci/kernel_url new file mode 100644 index 00000000000..ce6e1a3e6a8 --- --- /dev/null +++ b/resources/hiding_ci/kernel_url @@ -0,0 +1 @@ +git://git.kernel.org/pub/scm/virt/kvm/kvm.git diff --git a/resources/hiding_ci/linux_patches/0001-mm-Consolidate-freeing-of-typed-folios-on-final-foli.patch b/resources/hiding_ci/linux_patches/0001-mm-Consolidate-freeing-of-typed-folios-on-final-foli.patch new file mode 100644 index 00000000000..4d4b5572d8a --- /dev/null +++ b/resources/hiding_ci/linux_patches/0001-mm-Consolidate-freeing-of-typed-folios-on-final-foli.patch @@ -0,0 +1,109 @@ +From f9ca710b51263ce8317cc2fa02232e456fa1f39c Mon Sep 17 00:00:00 2001 
+From: Fuad Tabba +Date: Tue, 18 Mar 2025 16:18:15 +0000 +Subject: [PATCH 01/26] mm: Consolidate freeing of typed folios on final + folio_put() + +Some folio types, such as hugetlb, handle freeing their own +folios. Moreover, guest_memfd will require being notified once a +folio's reference count reaches 0 to facilitate shared to private +folio conversion, without the folio actually being freed at that +point. + +As a first step towards that, this patch consolidates freeing +folios that have a type. The first user is hugetlb folios. Later +in this patch series, guest_memfd will become the second user of +this. + +Suggested-by: David Hildenbrand +Acked-by: Vlastimil Babka +Acked-by: David Hildenbrand +Signed-off-by: Fuad Tabba +--- + include/linux/page-flags.h | 15 +++++++++++++++ + mm/swap.c | 23 ++++++++++++++++++----- + 2 files changed, 33 insertions(+), 5 deletions(-) + +diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h +index 36d283552f80..6dc2494bd002 100644 +--- a/include/linux/page-flags.h ++++ b/include/linux/page-flags.h +@@ -953,6 +953,21 @@ static inline bool page_has_type(const struct page *page) + return page_mapcount_is_type(data_race(page->page_type)); + } + ++static inline int page_get_type(const struct page *page) ++{ ++ return page->page_type >> 24; ++} ++ ++static inline bool folio_has_type(const struct folio *folio) ++{ ++ return page_has_type(&folio->page); ++} ++ ++static inline int folio_get_type(const struct folio *folio) ++{ ++ return page_get_type(&folio->page); ++} ++ + #define FOLIO_TYPE_OPS(lname, fname) \ + static __always_inline bool folio_test_##fname(const struct folio *folio) \ + { \ +diff --git a/mm/swap.c b/mm/swap.c +index fc8281ef4241..47bc1bb919cc 100644 +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -94,6 +94,19 @@ static void page_cache_release(struct folio *folio) + unlock_page_lruvec_irqrestore(lruvec, flags); + } + ++static void free_typed_folio(struct folio *folio) ++{ ++ switch (folio_get_type(folio)) { 
++#ifdef CONFIG_HUGETLBFS ++ case PGTY_hugetlb: ++ free_huge_folio(folio); ++ return; ++#endif ++ default: ++ WARN_ON_ONCE(1); ++ } ++} ++ + void __folio_put(struct folio *folio) + { + if (unlikely(folio_is_zone_device(folio))) { +@@ -101,8 +114,8 @@ void __folio_put(struct folio *folio) + return; + } + +- if (folio_test_hugetlb(folio)) { +- free_huge_folio(folio); ++ if (unlikely(folio_has_type(folio))) { ++ free_typed_folio(folio); + return; + } + +@@ -966,13 +979,13 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) + if (!folio_ref_sub_and_test(folio, nr_refs)) + continue; + +- /* hugetlb has its own memcg */ +- if (folio_test_hugetlb(folio)) { ++ if (unlikely(folio_has_type(folio))) { ++ /* typed folios have their own memcg, if any */ + if (lruvec) { + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = NULL; + } +- free_huge_folio(folio); ++ free_typed_folio(folio); + continue; + } + folio_unqueue_deferred_split(folio); + +base-commit: 4701f33a10702d5fc577c32434eb62adde0a1ae1 +-- +2.47.1 + diff --git a/resources/hiding_ci/linux_patches/0002-KVM-guest_memfd-Handle-final-folio_put-of-guest_memf.patch b/resources/hiding_ci/linux_patches/0002-KVM-guest_memfd-Handle-final-folio_put-of-guest_memf.patch new file mode 100644 index 00000000000..d5778165add --- /dev/null +++ b/resources/hiding_ci/linux_patches/0002-KVM-guest_memfd-Handle-final-folio_put-of-guest_memf.patch @@ -0,0 +1,182 @@ +From 9a4d7cd855d14e1522f363e3e04ebb9fa0a90ff0 Mon Sep 17 00:00:00 2001 +From: Fuad Tabba +Date: Tue, 18 Mar 2025 16:18:16 +0000 +Subject: [PATCH 02/26] KVM: guest_memfd: Handle final folio_put() of + guest_memfd pages + +Before transitioning a guest_memfd folio to unshared, thereby +disallowing access by the host and allowing the hypervisor to +transition its view of the guest page as private, we need to be +sure that the host doesn't have any references to the folio. 
+ +This patch introduces a new type for guest_memfd folios, which +isn't activated in this series but is here as a placeholder and +to facilitate the code in the subsequent patch series. This will +be used in the future to register a callback that informs the +guest_memfd subsystem when the last reference is dropped, +therefore knowing that the host doesn't have any remaining +references. + +This patch also introduces the configuration option, +KVM_GMEM_SHARED_MEM, which toggles support for mapping +guest_memfd shared memory at the host. + +Signed-off-by: Fuad Tabba +Acked-by: Vlastimil Babka +Acked-by: David Hildenbrand +--- + include/linux/kvm_host.h | 4 ++++ + include/linux/page-flags.h | 16 ++++++++++++++++ + mm/debug.c | 1 + + mm/swap.c | 29 +++++++++++++++++++++++++++++ + virt/kvm/Kconfig | 4 ++++ + virt/kvm/guest_memfd.c | 8 ++++++++ + 6 files changed, 62 insertions(+) + +diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h +index f34f4cfaa513..3ad0719bfc4f 100644 +--- a/include/linux/kvm_host.h ++++ b/include/linux/kvm_host.h +@@ -2571,4 +2571,8 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, + struct kvm_pre_fault_memory *range); + #endif + ++#ifdef CONFIG_KVM_GMEM_SHARED_MEM ++void kvm_gmem_handle_folio_put(struct folio *folio); ++#endif ++ + #endif +diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h +index 6dc2494bd002..daeee9a38e4c 100644 +--- a/include/linux/page-flags.h ++++ b/include/linux/page-flags.h +@@ -933,6 +933,7 @@ enum pagetype { + PGTY_slab = 0xf5, + PGTY_zsmalloc = 0xf6, + PGTY_unaccepted = 0xf7, ++ PGTY_guestmem = 0xf8, + + PGTY_mapcount_underflow = 0xff + }; +@@ -1082,6 +1083,21 @@ FOLIO_TYPE_OPS(hugetlb, hugetlb) + FOLIO_TEST_FLAG_FALSE(hugetlb) + #endif + ++/* ++ * guestmem folios are used to back VM memory as managed by guest_memfd. Once ++ * the last reference is put, instead of freeing these folios back to the page ++ * allocator, they are returned to guest_memfd. 
++ * ++ * For now, guestmem will only be set on these folios as long as they cannot be ++ * mapped to user space ("private state"), with the plan of always setting that ++ * type once typed folios can be mapped to user space cleanly. ++ */ ++#ifdef CONFIG_KVM_GMEM_SHARED_MEM ++FOLIO_TYPE_OPS(guestmem, guestmem) ++#else ++FOLIO_TEST_FLAG_FALSE(guestmem) ++#endif ++ + PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) + + /* +diff --git a/mm/debug.c b/mm/debug.c +index 8d2acf432385..08bc42c6cba8 100644 +--- a/mm/debug.c ++++ b/mm/debug.c +@@ -56,6 +56,7 @@ static const char *page_type_names[] = { + DEF_PAGETYPE_NAME(table), + DEF_PAGETYPE_NAME(buddy), + DEF_PAGETYPE_NAME(unaccepted), ++ DEF_PAGETYPE_NAME(guestmem), + }; + + static const char *page_type_name(unsigned int page_type) +diff --git a/mm/swap.c b/mm/swap.c +index 47bc1bb919cc..d8fda3948684 100644 +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -38,6 +38,10 @@ + #include + #include + ++#ifdef CONFIG_KVM_GMEM_SHARED_MEM ++#include ++#endif ++ + #include "internal.h" + + #define CREATE_TRACE_POINTS +@@ -94,6 +98,26 @@ static void page_cache_release(struct folio *folio) + unlock_page_lruvec_irqrestore(lruvec, flags); + } + ++#ifdef CONFIG_KVM_GMEM_SHARED_MEM ++static void gmem_folio_put(struct folio *folio) ++{ ++ /* ++ * Perform the callback only as long as the KVM module is still loaded. ++ * As long as the folio mapping is set, the folio is associated with a ++ * guest_memfd inode. ++ */ ++ if (folio->mapping) ++ kvm_gmem_handle_folio_put(folio); ++ ++ /* ++ * If there are no references to the folio left, it's not associated ++ * with a guest_memfd inode anymore. 
++ */ ++ if (folio_ref_count(folio) == 0) ++ __folio_put(folio); ++} ++#endif /* CONFIG_KVM_GMEM_SHARED_MEM */ ++ + static void free_typed_folio(struct folio *folio) + { + switch (folio_get_type(folio)) { +@@ -101,6 +125,11 @@ static void free_typed_folio(struct folio *folio) + case PGTY_hugetlb: + free_huge_folio(folio); + return; ++#endif ++#ifdef CONFIG_KVM_GMEM_SHARED_MEM ++ case PGTY_guestmem: ++ gmem_folio_put(folio); ++ return; + #endif + default: + WARN_ON_ONCE(1); +diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig +index 54e959e7d68f..4e759e8020c5 100644 +--- a/virt/kvm/Kconfig ++++ b/virt/kvm/Kconfig +@@ -124,3 +124,7 @@ config HAVE_KVM_ARCH_GMEM_PREPARE + config HAVE_KVM_ARCH_GMEM_INVALIDATE + bool + depends on KVM_PRIVATE_MEM ++ ++config KVM_GMEM_SHARED_MEM ++ select KVM_PRIVATE_MEM ++ bool +diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c +index b2aa6bf24d3a..5fc414becae5 100644 +--- a/virt/kvm/guest_memfd.c ++++ b/virt/kvm/guest_memfd.c +@@ -13,6 +13,14 @@ struct kvm_gmem { + struct list_head entry; + }; + ++#ifdef CONFIG_KVM_GMEM_SHARED_MEM ++void kvm_gmem_handle_folio_put(struct folio *folio) ++{ ++ WARN_ONCE(1, "A placeholder that shouldn't trigger. Work in progress."); ++} ++EXPORT_SYMBOL_GPL(kvm_gmem_handle_folio_put); ++#endif /* CONFIG_KVM_GMEM_SHARED_MEM */ ++ + /** + * folio_file_pfn - like folio_file_page, but return a pfn. + * @folio: The folio which contains this index. 
+-- +2.47.1 + diff --git a/resources/hiding_ci/linux_patches/0003-KVM-guest_memfd-Allow-host-to-map-guest_memfd-pages.patch b/resources/hiding_ci/linux_patches/0003-KVM-guest_memfd-Allow-host-to-map-guest_memfd-pages.patch new file mode 100644 index 00000000000..13d7180fa19 --- /dev/null +++ b/resources/hiding_ci/linux_patches/0003-KVM-guest_memfd-Allow-host-to-map-guest_memfd-pages.patch @@ -0,0 +1,193 @@ +From fd39febef2e0d41394e51f5e34f2c8de80b3b4dc Mon Sep 17 00:00:00 2001 +From: Fuad Tabba +Date: Tue, 18 Mar 2025 16:18:17 +0000 +Subject: [PATCH 03/26] KVM: guest_memfd: Allow host to map guest_memfd() pages + +Add support for mmap() and fault() for guest_memfd backed memory +in the host for VMs that support in-place conversion between +shared and private. To that end, this patch adds the ability to +check whether the VM type supports in-place conversion, and only +allows mapping its memory if that's the case. + +Also add the KVM capability KVM_CAP_GMEM_SHARED_MEM, which +indicates that the VM supports shared memory in guest_memfd, or +that the host can create VMs that support shared memory. +Supporting shared memory implies that memory can be mapped when +shared with the host. + +This is controlled by the KVM_GMEM_SHARED_MEM configuration +option. + +Signed-off-by: Fuad Tabba +--- + include/linux/kvm_host.h | 11 +++++ + include/uapi/linux/kvm.h | 1 + + virt/kvm/guest_memfd.c | 101 +++++++++++++++++++++++++++++++++++++++ + virt/kvm/kvm_main.c | 4 ++ + 4 files changed, 117 insertions(+) + +diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h +index 3ad0719bfc4f..601bbcaa5e41 100644 +--- a/include/linux/kvm_host.h ++++ b/include/linux/kvm_host.h +@@ -728,6 +728,17 @@ static inline bool kvm_arch_has_private_mem(struct kvm *kvm) + } + #endif + ++/* ++ * Arch code must define kvm_arch_gmem_supports_shared_mem if support for ++ * private memory is enabled and it supports in-place shared/private conversion. 
++ */ ++#if !defined(kvm_arch_gmem_supports_shared_mem) && !IS_ENABLED(CONFIG_KVM_GMEM_SHARED_MEM) ++static inline bool kvm_arch_gmem_supports_shared_mem(struct kvm *kvm) ++{ ++ return false; ++} ++#endif ++ + #ifndef kvm_arch_has_readonly_mem + static inline bool kvm_arch_has_readonly_mem(struct kvm *kvm) + { +diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h +index 45e6d8fca9b9..117937a895da 100644 +--- a/include/uapi/linux/kvm.h ++++ b/include/uapi/linux/kvm.h +@@ -929,6 +929,7 @@ struct kvm_enable_cap { + #define KVM_CAP_PRE_FAULT_MEMORY 236 + #define KVM_CAP_X86_APIC_BUS_CYCLES_NS 237 + #define KVM_CAP_X86_GUEST_MODE 238 ++#define KVM_CAP_GMEM_SHARED_MEM 239 + + struct kvm_irq_routing_irqchip { + __u32 irqchip; +diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c +index 5fc414becae5..fbf89e643add 100644 +--- a/virt/kvm/guest_memfd.c ++++ b/virt/kvm/guest_memfd.c +@@ -320,7 +320,108 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn) + return gfn - slot->base_gfn + slot->gmem.pgoff; + } + ++#ifdef CONFIG_KVM_GMEM_SHARED_MEM ++static bool kvm_gmem_offset_is_shared(struct file *file, pgoff_t index) ++{ ++ struct kvm_gmem *gmem = file->private_data; ++ ++ ++ /* For now, VMs that support shared memory share all their memory. 
*/ ++ return kvm_arch_gmem_supports_shared_mem(gmem->kvm); ++} ++ ++static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf) ++{ ++ struct inode *inode = file_inode(vmf->vma->vm_file); ++ struct folio *folio; ++ vm_fault_t ret = VM_FAULT_LOCKED; ++ ++ filemap_invalidate_lock_shared(inode->i_mapping); ++ ++ folio = kvm_gmem_get_folio(inode, vmf->pgoff); ++ if (IS_ERR(folio)) { ++ int err = PTR_ERR(folio); ++ ++ if (err == -EAGAIN) ++ ret = VM_FAULT_RETRY; ++ else ++ ret = vmf_error(err); ++ ++ goto out_filemap; ++ } ++ ++ if (folio_test_hwpoison(folio)) { ++ ret = VM_FAULT_HWPOISON; ++ goto out_folio; ++ } ++ ++ if (!kvm_gmem_offset_is_shared(vmf->vma->vm_file, vmf->pgoff)) { ++ ret = VM_FAULT_SIGBUS; ++ goto out_folio; ++ } ++ ++ /* ++ * Shared folios would not be marked as "guestmem" so far, and we only ++ * expect shared folios at this point. ++ */ ++ if (WARN_ON_ONCE(folio_test_guestmem(folio))) { ++ ret = VM_FAULT_SIGBUS; ++ goto out_folio; ++ } ++ ++ /* No support for huge pages. */ ++ if (WARN_ON_ONCE(folio_test_large(folio))) { ++ ret = VM_FAULT_SIGBUS; ++ goto out_folio; ++ } ++ ++ if (!folio_test_uptodate(folio)) { ++ clear_highpage(folio_page(folio, 0)); ++ kvm_gmem_mark_prepared(folio); ++ } ++ ++ vmf->page = folio_file_page(folio, vmf->pgoff); ++ ++out_folio: ++ if (ret != VM_FAULT_LOCKED) { ++ folio_unlock(folio); ++ folio_put(folio); ++ } ++ ++out_filemap: ++ filemap_invalidate_unlock_shared(inode->i_mapping); ++ ++ return ret; ++} ++ ++static const struct vm_operations_struct kvm_gmem_vm_ops = { ++ .fault = kvm_gmem_fault, ++}; ++ ++static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ struct kvm_gmem *gmem = file->private_data; ++ ++ if (!kvm_arch_gmem_supports_shared_mem(gmem->kvm)) ++ return -ENODEV; ++ ++ if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) != ++ (VM_SHARED | VM_MAYSHARE)) { ++ return -EINVAL; ++ } ++ ++ file_accessed(file); ++ vm_flags_set(vma, VM_DONTDUMP); ++ vma->vm_ops = &kvm_gmem_vm_ops; ++ ++ return 0; 
++} ++#else ++#define kvm_gmem_mmap NULL ++#endif /* CONFIG_KVM_GMEM_SHARED_MEM */ ++ + static struct file_operations kvm_gmem_fops = { ++ .mmap = kvm_gmem_mmap, + .open = generic_file_open, + .release = kvm_gmem_release, + .fallocate = kvm_gmem_fallocate, +diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c +index ba0327e2d0d3..38f0f402ea46 100644 +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -4830,6 +4830,10 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) + #ifdef CONFIG_KVM_PRIVATE_MEM + case KVM_CAP_GUEST_MEMFD: + return !kvm || kvm_arch_has_private_mem(kvm); ++#endif ++#ifdef CONFIG_KVM_GMEM_SHARED_MEM ++ case KVM_CAP_GMEM_SHARED_MEM: ++ return !kvm || kvm_arch_gmem_supports_shared_mem(kvm); + #endif + default: + break; +-- +2.47.1 + diff --git a/resources/hiding_ci/linux_patches/0004-KVM-x86-Mark-KVM_X86_SW_PROTECTED_VM-as-supporting-g.patch b/resources/hiding_ci/linux_patches/0004-KVM-x86-Mark-KVM_X86_SW_PROTECTED_VM-as-supporting-g.patch new file mode 100644 index 00000000000..2d32a4cefc2 --- /dev/null +++ b/resources/hiding_ci/linux_patches/0004-KVM-x86-Mark-KVM_X86_SW_PROTECTED_VM-as-supporting-g.patch @@ -0,0 +1,58 @@ +From d16c343f0f95ecd8d2cda2dfba4ac8b7c293f217 Mon Sep 17 00:00:00 2001 +From: Fuad Tabba +Date: Tue, 18 Mar 2025 16:18:19 +0000 +Subject: [PATCH 04/26] KVM: x86: Mark KVM_X86_SW_PROTECTED_VM as supporting + guest_memfd shared memory + +The KVM_X86_SW_PROTECTED_VM type is meant for experimentation and +does not have any underlying support for protected guests. This +makes it a good candidate for testing mapping shared memory. +Therefore, when the kconfig option is enabled, mark +KVM_X86_SW_PROTECTED_VM as supporting shared memory. + +This means that this memory is considered by guest_memfd to be +shared with the host, with the possibility of in-place conversion +between shared and private. This allows the host to map and fault +in guest_memfd memory belonging to this VM type. 
+ +Signed-off-by: Fuad Tabba +--- + arch/x86/include/asm/kvm_host.h | 5 +++++ + arch/x86/kvm/Kconfig | 3 ++- + 2 files changed, 7 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 32ae3aa50c7e..b874e54a5ee4 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -2246,8 +2246,13 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, + + #ifdef CONFIG_KVM_PRIVATE_MEM + #define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem) ++ ++#define kvm_arch_gmem_supports_shared_mem(kvm) \ ++ (IS_ENABLED(CONFIG_KVM_GMEM_SHARED_MEM) && \ ++ ((kvm)->arch.vm_type == KVM_X86_SW_PROTECTED_VM)) + #else + #define kvm_arch_has_private_mem(kvm) false ++#define kvm_arch_gmem_supports_shared_mem(kvm) false + #endif + + #define kvm_arch_has_readonly_mem(kvm) (!(kvm)->arch.has_protected_state) +diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig +index ea2c4f21c1ca..22d1bcdaad58 100644 +--- a/arch/x86/kvm/Kconfig ++++ b/arch/x86/kvm/Kconfig +@@ -45,7 +45,8 @@ config KVM_X86 + select HAVE_KVM_PM_NOTIFIER if PM + select KVM_GENERIC_HARDWARE_ENABLING + select KVM_GENERIC_PRE_FAULT_MEMORY +- select KVM_GENERIC_PRIVATE_MEM if KVM_SW_PROTECTED_VM ++ select KVM_PRIVATE_MEM if KVM_SW_PROTECTED_VM ++ select KVM_GMEM_SHARED_MEM if KVM_SW_PROTECTED_VM + select KVM_WERROR if WERROR + + config KVM +-- +2.47.1 + diff --git a/resources/hiding_ci/linux_patches/0005-KVM-arm64-Refactor-user_mem_abort-calculation-of-for.patch b/resources/hiding_ci/linux_patches/0005-KVM-arm64-Refactor-user_mem_abort-calculation-of-for.patch new file mode 100644 index 00000000000..905c88558d8 --- /dev/null +++ b/resources/hiding_ci/linux_patches/0005-KVM-arm64-Refactor-user_mem_abort-calculation-of-for.patch @@ -0,0 +1,62 @@ +From 483ccb70335cb0c76161caf76c0ccb7c618038e2 Mon Sep 17 00:00:00 2001 +From: Fuad Tabba +Date: Tue, 18 Mar 2025 16:18:20 +0000 +Subject: [PATCH 05/26] KVM: arm64: Refactor 
user_mem_abort() calculation of + force_pte + +To simplify the code and to make the assumptions clearer, +refactor user_mem_abort() by immediately setting force_pte to +true if the conditions are met. Also, remove the comment about +logging_active being guaranteed to never be true for VM_PFNMAP +memslots, since it's not technically correct right now. + +No functional change intended. + +Signed-off-by: Fuad Tabba +--- + arch/arm64/kvm/mmu.c | 13 ++++--------- + 1 file changed, 4 insertions(+), 9 deletions(-) + +diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c +index 1f55b0c7b11d..887ffa1f5b14 100644 +--- a/arch/arm64/kvm/mmu.c ++++ b/arch/arm64/kvm/mmu.c +@@ -1460,7 +1460,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + bool fault_is_perm) + { + int ret = 0; +- bool write_fault, writable, force_pte = false; ++ bool write_fault, writable; + bool exec_fault, mte_allowed; + bool device = false, vfio_allow_any_uc = false; + unsigned long mmu_seq; +@@ -1472,6 +1472,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + gfn_t gfn; + kvm_pfn_t pfn; + bool logging_active = memslot_is_logging(memslot); ++ bool force_pte = logging_active || is_protected_kvm_enabled(); + long vma_pagesize, fault_granule; + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; + struct kvm_pgtable *pgt; +@@ -1521,16 +1522,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + return -EFAULT; + } + +- /* +- * logging_active is guaranteed to never be true for VM_PFNMAP +- * memslots. 
+- */ +- if (logging_active || is_protected_kvm_enabled()) { +- force_pte = true; ++ if (force_pte) + vma_shift = PAGE_SHIFT; +- } else { ++ else + vma_shift = get_vma_page_shift(vma, hva); +- } + + switch (vma_shift) { + #ifndef __PAGETABLE_PMD_FOLDED +-- +2.47.1 + diff --git a/resources/hiding_ci/linux_patches/0006-KVM-guest_memfd-Handle-in-place-shared-memory-as-gue.patch b/resources/hiding_ci/linux_patches/0006-KVM-guest_memfd-Handle-in-place-shared-memory-as-gue.patch new file mode 100644 index 00000000000..3e0dea5a7e6 --- /dev/null +++ b/resources/hiding_ci/linux_patches/0006-KVM-guest_memfd-Handle-in-place-shared-memory-as-gue.patch @@ -0,0 +1,40 @@ +From b1e925d4d5db8513dba67c3a9d40a2b507668f09 Mon Sep 17 00:00:00 2001 +From: Fuad Tabba +Date: Tue, 18 Mar 2025 16:18:18 +0000 +Subject: [PATCH 06/26] KVM: guest_memfd: Handle in-place shared memory as + guest_memfd backed memory + +For VMs that allow sharing guest_memfd backed memory in-place, +handle that memory the same as "private" guest_memfd memory. This +means that faulting that memory in the host or in the guest will +go through the guest_memfd subsystem. + +Note that the word "private" in the name of the function +kvm_mem_is_private() doesn't necessarily indicate that the memory +isn't shared, but is due to the history and evolution of +guest_memfd and the various names it has received. In effect, +this function is used to multiplex between the path of a normal +page fault and the path of a guest_memfd backed page fault. 
+ +Signed-off-by: Fuad Tabba +--- + include/linux/kvm_host.h | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h +index 601bbcaa5e41..3d5595a71a2a 100644 +--- a/include/linux/kvm_host.h ++++ b/include/linux/kvm_host.h +@@ -2521,7 +2521,8 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) + #else + static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) + { +- return false; ++ return kvm_arch_gmem_supports_shared_mem(kvm) && ++ kvm_slot_can_be_private(gfn_to_memslot(kvm, gfn)); + } + #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */ + +-- +2.47.1 + diff --git a/resources/hiding_ci/linux_patches/0007-KVM-arm64-Handle-guest_memfd-backed-guest-page-fault.patch b/resources/hiding_ci/linux_patches/0007-KVM-arm64-Handle-guest_memfd-backed-guest-page-fault.patch new file mode 100644 index 00000000000..5b68d6e183e --- /dev/null +++ b/resources/hiding_ci/linux_patches/0007-KVM-arm64-Handle-guest_memfd-backed-guest-page-fault.patch @@ -0,0 +1,174 @@ +From 996513a423377349767d5cfef87850e80131854f Mon Sep 17 00:00:00 2001 +From: Fuad Tabba +Date: Tue, 18 Mar 2025 16:18:21 +0000 +Subject: [PATCH 07/26] KVM: arm64: Handle guest_memfd()-backed guest page + faults + +Add arm64 support for handling guest page faults on guest_memfd +backed memslots. + +For now, the fault granule is restricted to PAGE_SIZE. 
+ +Signed-off-by: Fuad Tabba +--- + arch/arm64/kvm/mmu.c | 65 +++++++++++++++++++++++++++------------- + include/linux/kvm_host.h | 5 ++++ + virt/kvm/kvm_main.c | 5 ---- + 3 files changed, 50 insertions(+), 25 deletions(-) + +diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c +index 887ffa1f5b14..adb0681fc1c6 100644 +--- a/arch/arm64/kvm/mmu.c ++++ b/arch/arm64/kvm/mmu.c +@@ -1454,6 +1454,30 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) + return vma->vm_flags & VM_MTE_ALLOWED; + } + ++static kvm_pfn_t faultin_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, ++ gfn_t gfn, bool write_fault, bool *writable, ++ struct page **page, bool is_private) ++{ ++ kvm_pfn_t pfn; ++ int ret; ++ ++ if (!is_private) ++ return __kvm_faultin_pfn(slot, gfn, write_fault ? FOLL_WRITE : 0, writable, page); ++ ++ *writable = false; ++ ++ ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, page, NULL); ++ if (!ret) { ++ *writable = !memslot_is_readonly(slot); ++ return pfn; ++ } ++ ++ if (ret == -EHWPOISON) ++ return KVM_PFN_ERR_HWPOISON; ++ ++ return KVM_PFN_ERR_NOSLOT_MASK; ++} ++ + static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + struct kvm_s2_trans *nested, + struct kvm_memory_slot *memslot, unsigned long hva, +@@ -1461,19 +1485,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + { + int ret = 0; + bool write_fault, writable; +- bool exec_fault, mte_allowed; ++ bool exec_fault, mte_allowed = false; + bool device = false, vfio_allow_any_uc = false; + unsigned long mmu_seq; + phys_addr_t ipa = fault_ipa; + struct kvm *kvm = vcpu->kvm; +- struct vm_area_struct *vma; ++ struct vm_area_struct *vma = NULL; + short vma_shift; + void *memcache; +- gfn_t gfn; ++ gfn_t gfn = ipa >> PAGE_SHIFT; + kvm_pfn_t pfn; + bool logging_active = memslot_is_logging(memslot); +- bool force_pte = logging_active || is_protected_kvm_enabled(); +- long vma_pagesize, fault_granule; ++ bool is_gmem = kvm_mem_is_private(kvm, gfn); ++ bool 
force_pte = logging_active || is_gmem || is_protected_kvm_enabled(); ++ long vma_pagesize, fault_granule = PAGE_SIZE; + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; + struct kvm_pgtable *pgt; + struct page *page; +@@ -1510,16 +1535,22 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + return ret; + } + ++ mmap_read_lock(current->mm); ++ + /* + * Let's check if we will get back a huge page backed by hugetlbfs, or + * get block mapping for device MMIO region. + */ +- mmap_read_lock(current->mm); +- vma = vma_lookup(current->mm, hva); +- if (unlikely(!vma)) { +- kvm_err("Failed to find VMA for hva 0x%lx\n", hva); +- mmap_read_unlock(current->mm); +- return -EFAULT; ++ if (!is_gmem) { ++ vma = vma_lookup(current->mm, hva); ++ if (unlikely(!vma)) { ++ kvm_err("Failed to find VMA for hva 0x%lx\n", hva); ++ mmap_read_unlock(current->mm); ++ return -EFAULT; ++ } ++ ++ vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; ++ mte_allowed = kvm_vma_mte_allowed(vma); + } + + if (force_pte) +@@ -1590,18 +1621,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + ipa &= ~(vma_pagesize - 1); + } + +- gfn = ipa >> PAGE_SHIFT; +- mte_allowed = kvm_vma_mte_allowed(vma); +- +- vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; +- + /* Don't use the VMA after the unlock -- it may have vanished */ + vma = NULL; + + /* + * Read mmu_invalidate_seq so that KVM can detect if the results of +- * vma_lookup() or __kvm_faultin_pfn() become stale prior to +- * acquiring kvm->mmu_lock. ++ * vma_lookup() or faultin_pfn() become stale prior to acquiring ++ * kvm->mmu_lock. + * + * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs + * with the smp_wmb() in kvm_mmu_invalidate_end(). +@@ -1609,8 +1635,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + mmu_seq = vcpu->kvm->mmu_invalidate_seq; + mmap_read_unlock(current->mm); + +- pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? 
FOLL_WRITE : 0, +- &writable, &page); ++ pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_gmem); + if (pfn == KVM_PFN_ERR_HWPOISON) { + kvm_send_hwpoison_signal(hva, vma_shift); + return 0; +diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h +index 3d5595a71a2a..ec3bedc18eab 100644 +--- a/include/linux/kvm_host.h ++++ b/include/linux/kvm_host.h +@@ -1882,6 +1882,11 @@ static inline int memslot_id(struct kvm *kvm, gfn_t gfn) + return gfn_to_memslot(kvm, gfn)->id; + } + ++static inline bool memslot_is_readonly(const struct kvm_memory_slot *slot) ++{ ++ return slot->flags & KVM_MEM_READONLY; ++} ++ + static inline gfn_t + hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot) + { +diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c +index 38f0f402ea46..3e40acb9f5c0 100644 +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -2624,11 +2624,6 @@ unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn) + return size; + } + +-static bool memslot_is_readonly(const struct kvm_memory_slot *slot) +-{ +- return slot->flags & KVM_MEM_READONLY; +-} +- + static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn, + gfn_t *nr_pages, bool write) + { +-- +2.47.1 + diff --git a/resources/hiding_ci/linux_patches/0008-KVM-guest_memfd-selftests-guest_memfd-mmap-test-when.patch b/resources/hiding_ci/linux_patches/0008-KVM-guest_memfd-selftests-guest_memfd-mmap-test-when.patch new file mode 100644 index 00000000000..2a5a355a2e1 --- /dev/null +++ b/resources/hiding_ci/linux_patches/0008-KVM-guest_memfd-selftests-guest_memfd-mmap-test-when.patch @@ -0,0 +1,149 @@ +From 1ee5d01987bff47f007fb86ad7738b299816b2ef Mon Sep 17 00:00:00 2001 +From: Fuad Tabba +Date: Tue, 18 Mar 2025 16:18:23 +0000 +Subject: [PATCH 08/26] KVM: guest_memfd: selftests: guest_memfd mmap() test + when mapping is allowed + +Expand the guest_memfd selftests to include testing mapping guest +memory for VM types that support it. 
+ +Also, build the guest_memfd selftest for arm64. + +Signed-off-by: Fuad Tabba +--- + tools/testing/selftests/kvm/Makefile.kvm | 1 + + .../testing/selftests/kvm/guest_memfd_test.c | 75 +++++++++++++++++-- + 2 files changed, 70 insertions(+), 6 deletions(-) + +diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm +index 4277b983cace..c9a3f30e28dd 100644 +--- a/tools/testing/selftests/kvm/Makefile.kvm ++++ b/tools/testing/selftests/kvm/Makefile.kvm +@@ -160,6 +160,7 @@ TEST_GEN_PROGS_arm64 += coalesced_io_test + TEST_GEN_PROGS_arm64 += demand_paging_test + TEST_GEN_PROGS_arm64 += dirty_log_test + TEST_GEN_PROGS_arm64 += dirty_log_perf_test ++TEST_GEN_PROGS_arm64 += guest_memfd_test + TEST_GEN_PROGS_arm64 += guest_print_test + TEST_GEN_PROGS_arm64 += get-reg-list + TEST_GEN_PROGS_arm64 += kvm_create_max_vcpus +diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c +index ce687f8d248f..38c501e49e0e 100644 +--- a/tools/testing/selftests/kvm/guest_memfd_test.c ++++ b/tools/testing/selftests/kvm/guest_memfd_test.c +@@ -34,12 +34,48 @@ static void test_file_read_write(int fd) + "pwrite on a guest_mem fd should fail"); + } + +-static void test_mmap(int fd, size_t page_size) ++static void test_mmap_allowed(int fd, size_t total_size) + { ++ size_t page_size = getpagesize(); ++ const char val = 0xaa; ++ char *mem; ++ int ret; ++ int i; ++ ++ mem = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); ++ TEST_ASSERT(mem != MAP_FAILED, "mmaping() guest memory should pass."); ++ ++ memset(mem, val, total_size); ++ for (i = 0; i < total_size; i++) ++ TEST_ASSERT_EQ(mem[i], val); ++ ++ ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, ++ page_size); ++ TEST_ASSERT(!ret, "fallocate the first page should succeed"); ++ ++ for (i = 0; i < page_size; i++) ++ TEST_ASSERT_EQ(mem[i], 0x00); ++ for (; i < total_size; i++) ++ TEST_ASSERT_EQ(mem[i], val); ++ ++ 
memset(mem, val, total_size); ++ for (i = 0; i < total_size; i++) ++ TEST_ASSERT_EQ(mem[i], val); ++ ++ ret = munmap(mem, total_size); ++ TEST_ASSERT(!ret, "munmap should succeed"); ++} ++ ++static void test_mmap_denied(int fd, size_t total_size) ++{ ++ size_t page_size = getpagesize(); + char *mem; + + mem = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + TEST_ASSERT_EQ(mem, MAP_FAILED); ++ ++ mem = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); ++ TEST_ASSERT_EQ(mem, MAP_FAILED); + } + + static void test_file_size(int fd, size_t page_size, size_t total_size) +@@ -170,19 +206,27 @@ static void test_create_guest_memfd_multiple(struct kvm_vm *vm) + close(fd1); + } + +-int main(int argc, char *argv[]) ++unsigned long get_shared_type(void) + { +- size_t page_size; ++#ifdef __x86_64__ ++ return KVM_X86_SW_PROTECTED_VM; ++#endif ++ return 0; ++} ++ ++void test_vm_type(unsigned long type, bool is_shared) ++{ ++ struct kvm_vm *vm; + size_t total_size; ++ size_t page_size; + int fd; +- struct kvm_vm *vm; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_GUEST_MEMFD)); + + page_size = getpagesize(); + total_size = page_size * 4; + +- vm = vm_create_barebones(); ++ vm = vm_create_barebones_type(type); + + test_create_guest_memfd_invalid(vm); + test_create_guest_memfd_multiple(vm); +@@ -190,10 +234,29 @@ int main(int argc, char *argv[]) + fd = vm_create_guest_memfd(vm, total_size, 0); + + test_file_read_write(fd); +- test_mmap(fd, page_size); ++ ++ if (is_shared) ++ test_mmap_allowed(fd, total_size); ++ else ++ test_mmap_denied(fd, total_size); ++ + test_file_size(fd, page_size, total_size); + test_fallocate(fd, page_size, total_size); + test_invalid_punch_hole(fd, page_size, total_size); + + close(fd); ++ kvm_vm_release(vm); ++} ++ ++int main(int argc, char *argv[]) ++{ ++#ifndef __aarch64__ ++ /* For now, arm64 only supports shared guest memory. 
*/ ++ test_vm_type(VM_TYPE_DEFAULT, false); ++#endif ++ ++ if (kvm_has_cap(KVM_CAP_GMEM_SHARED_MEM)) ++ test_vm_type(get_shared_type(), true); ++ ++ return 0; + } +-- +2.47.1 + diff --git a/resources/hiding_ci/linux_patches/0009-KVM-arm64-Enable-mapping-guest_memfd-in-arm64.patch b/resources/hiding_ci/linux_patches/0009-KVM-arm64-Enable-mapping-guest_memfd-in-arm64.patch new file mode 100644 index 00000000000..a03d592e4b0 --- /dev/null +++ b/resources/hiding_ci/linux_patches/0009-KVM-arm64-Enable-mapping-guest_memfd-in-arm64.patch @@ -0,0 +1,51 @@ +From 3cc51efc17a2c41a480eed36b31c1773936717e0 Mon Sep 17 00:00:00 2001 +From: Fuad Tabba +Date: Tue, 18 Mar 2025 16:18:22 +0000 +Subject: [PATCH 09/26] KVM: arm64: Enable mapping guest_memfd in arm64 + +Enable mapping guest_memfd in arm64. For now, it applies to all +VMs in arm64 that use guest_memfd. In the future, new VM types +can restrict this via kvm_arch_gmem_supports_shared_mem(). + +Signed-off-by: Fuad Tabba +--- + arch/arm64/include/asm/kvm_host.h | 12 ++++++++++++ + arch/arm64/kvm/Kconfig | 1 + + 2 files changed, 13 insertions(+) + +diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h +index d919557af5e5..4440b2334a05 100644 +--- a/arch/arm64/include/asm/kvm_host.h ++++ b/arch/arm64/include/asm/kvm_host.h +@@ -1543,4 +1543,16 @@ void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val); + #define kvm_has_s1poe(k) \ + (kvm_has_feat((k), ID_AA64MMFR3_EL1, S1POE, IMP)) + ++#ifdef CONFIG_KVM_PRIVATE_MEM ++static inline bool kvm_arch_has_private_mem(struct kvm *kvm) ++{ ++ return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM); ++} ++ ++static inline bool kvm_arch_gmem_supports_shared_mem(struct kvm *kvm) ++{ ++ return IS_ENABLED(CONFIG_KVM_GMEM_SHARED_MEM); ++} ++#endif /* CONFIG_KVM_PRIVATE_MEM */ ++ + #endif /* __ARM64_KVM_HOST_H__ */ +diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig +index ead632ad01b4..4830d8805bed 100644 +--- a/arch/arm64/kvm/Kconfig ++++ b/arch/arm64/kvm/Kconfig 
+@@ -38,6 +38,7 @@ menuconfig KVM + select HAVE_KVM_VCPU_RUN_PID_CHANGE + select SCHED_INFO + select GUEST_PERF_EVENTS if PERF_EVENTS ++ select KVM_GMEM_SHARED_MEM + help + Support hosting virtualized guest machines. + +-- +2.47.1 + diff --git a/resources/hiding_ci/linux_patches/0010-mm-introduce-AS_NO_DIRECT_MAP.patch b/resources/hiding_ci/linux_patches/0010-mm-introduce-AS_NO_DIRECT_MAP.patch new file mode 100644 index 00000000000..bd336166268 --- /dev/null +++ b/resources/hiding_ci/linux_patches/0010-mm-introduce-AS_NO_DIRECT_MAP.patch @@ -0,0 +1,208 @@ +From 22ec89c0ff7af3430027cf71cf8bce5c8ed6e402 Mon Sep 17 00:00:00 2001 +From: Patrick Roy +Date: Fri, 7 Feb 2025 11:16:06 +0000 +Subject: [PATCH 10/26] mm: introduce AS_NO_DIRECT_MAP + +Add AS_NO_DIRECT_MAP for mappings where direct map entries of folios are +set to not present . Currently, mappings that match this description are +secretmem mappings (memfd_secret()). Later, some guest_memfd +configurations will also fall into this category. + +Reject this new type of mappings in all locations that currently reject +secretmem mappings, on the assumption that if secretmem mappings are +rejected somewhere, it is precisely because of an inability to deal with +folios without direct map entries, and then make memfd_secret() use +AS_NO_DIRECT_MAP on its address_space to drop its special +vma_is_secretmem()/secretmem_mapping() checks. + +This drops a optimization in gup_fast_folio_allowed() where +secretmem_mapping() was only called if CONFIG_SECRETMEM=y. secretmem is +enabled by default since commit b758fe6df50d ("mm/secretmem: make it on +by default"), so the secretmem check did not actually end up elided in +most cases anymore anyway. + +Use a new flag instead of overloading AS_INACCESSIBLE (which is already +set by guest_memfd) because not all guest_memfd mappings will end up +being direct map removed (e.g. 
in pKVM setups, parts of guest_memfd that +can be mapped to userspace should also be GUP-able, and generally not +have restrictions on who can access it). + +Signed-off-by: Patrick Roy +--- + include/linux/pagemap.h | 16 ++++++++++++++++ + include/linux/secretmem.h | 18 ------------------ + lib/buildid.c | 4 ++-- + mm/gup.c | 14 +++----------- + mm/mlock.c | 2 +- + mm/secretmem.c | 6 +----- + 6 files changed, 23 insertions(+), 37 deletions(-) + +diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h +index 47bfc6b1b632..903b41e89cf8 100644 +--- a/include/linux/pagemap.h ++++ b/include/linux/pagemap.h +@@ -210,6 +210,7 @@ enum mapping_flags { + AS_STABLE_WRITES = 7, /* must wait for writeback before modifying + folio contents */ + AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */ ++ AS_NO_DIRECT_MAP = 9, /* Folios in the mapping are not in the direct map */ + /* Bits 16-25 are used for FOLIO_ORDER */ + AS_FOLIO_ORDER_BITS = 5, + AS_FOLIO_ORDER_MIN = 16, +@@ -335,6 +336,21 @@ static inline bool mapping_inaccessible(struct address_space *mapping) + return test_bit(AS_INACCESSIBLE, &mapping->flags); + } + ++static inline void mapping_set_no_direct_map(struct address_space *mapping) ++{ ++ set_bit(AS_NO_DIRECT_MAP, &mapping->flags); ++} ++ ++static inline bool mapping_no_direct_map(struct address_space *mapping) ++{ ++ return test_bit(AS_NO_DIRECT_MAP, &mapping->flags); ++} ++ ++static inline bool vma_is_no_direct_map(const struct vm_area_struct *vma) ++{ ++ return vma->vm_file && mapping_no_direct_map(vma->vm_file->f_mapping); ++} ++ + static inline gfp_t mapping_gfp_mask(struct address_space * mapping) + { + return mapping->gfp_mask; +diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h +index e918f96881f5..0ae1fb057b3d 100644 +--- a/include/linux/secretmem.h ++++ b/include/linux/secretmem.h +@@ -4,28 +4,10 @@ + + #ifdef CONFIG_SECRETMEM + +-extern const struct address_space_operations secretmem_aops; +- +-static inline 
bool secretmem_mapping(struct address_space *mapping) +-{ +- return mapping->a_ops == &secretmem_aops; +-} +- +-bool vma_is_secretmem(struct vm_area_struct *vma); + bool secretmem_active(void); + + #else + +-static inline bool vma_is_secretmem(struct vm_area_struct *vma) +-{ +- return false; +-} +- +-static inline bool secretmem_mapping(struct address_space *mapping) +-{ +- return false; +-} +- + static inline bool secretmem_active(void) + { + return false; +diff --git a/lib/buildid.c b/lib/buildid.c +index c4b0f376fb34..33f173a607ad 100644 +--- a/lib/buildid.c ++++ b/lib/buildid.c +@@ -65,8 +65,8 @@ static int freader_get_folio(struct freader *r, loff_t file_off) + + freader_put_folio(r); + +- /* reject secretmem folios created with memfd_secret() */ +- if (secretmem_mapping(r->file->f_mapping)) ++ /* reject secretmem folios created with memfd_secret() or guest_memfd() */ ++ if (mapping_no_direct_map(r->file->f_mapping)) + return -EFAULT; + + r->folio = filemap_get_folio(r->file->f_mapping, file_off >> PAGE_SHIFT); +diff --git a/mm/gup.c b/mm/gup.c +index 3883b307780e..b1483a876740 100644 +--- a/mm/gup.c ++++ b/mm/gup.c +@@ -1283,7 +1283,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) + if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma)) + return -EOPNOTSUPP; + +- if (vma_is_secretmem(vma)) ++ if (vma_is_no_direct_map(vma)) + return -EFAULT; + + if (write) { +@@ -2786,7 +2786,6 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags) + { + bool reject_file_backed = false; + struct address_space *mapping; +- bool check_secretmem = false; + unsigned long mapping_flags; + + /* +@@ -2798,14 +2797,6 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags) + reject_file_backed = true; + + /* We hold a folio reference, so we can safely access folio fields. */ +- +- /* secretmem folios are always order-0 folios. 
*/ +- if (IS_ENABLED(CONFIG_SECRETMEM) && !folio_test_large(folio)) +- check_secretmem = true; +- +- if (!reject_file_backed && !check_secretmem) +- return true; +- + if (WARN_ON_ONCE(folio_test_slab(folio))) + return false; + +@@ -2847,8 +2838,9 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags) + * At this point, we know the mapping is non-null and points to an + * address_space object. + */ +- if (check_secretmem && secretmem_mapping(mapping)) ++ if (mapping_no_direct_map(mapping)) + return false; ++ + /* The only remaining allowed file system is shmem. */ + return !reject_file_backed || shmem_mapping(mapping); + } +diff --git a/mm/mlock.c b/mm/mlock.c +index cde076fa7d5e..a43f308be70d 100644 +--- a/mm/mlock.c ++++ b/mm/mlock.c +@@ -474,7 +474,7 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, + + if (newflags == oldflags || (oldflags & VM_SPECIAL) || + is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) || +- vma_is_dax(vma) || vma_is_secretmem(vma) || (oldflags & VM_DROPPABLE)) ++ vma_is_dax(vma) || vma_is_no_direct_map(vma) || (oldflags & VM_DROPPABLE)) + /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ + goto out; + +diff --git a/mm/secretmem.c b/mm/secretmem.c +index 1b0a214ee558..ea4c04d469b1 100644 +--- a/mm/secretmem.c ++++ b/mm/secretmem.c +@@ -136,11 +136,6 @@ static int secretmem_mmap(struct file *file, struct vm_area_struct *vma) + return 0; + } + +-bool vma_is_secretmem(struct vm_area_struct *vma) +-{ +- return vma->vm_ops == &secretmem_vm_ops; +-} +- + static const struct file_operations secretmem_fops = { + .release = secretmem_release, + .mmap = secretmem_mmap, +@@ -214,6 +209,7 @@ static struct file *secretmem_file_create(unsigned long flags) + + mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); + mapping_set_unevictable(inode->i_mapping); ++ mapping_set_no_direct_map(inode->i_mapping); + + inode->i_op = &secretmem_iops; + inode->i_mapping->a_ops = &secretmem_aops; 
+-- +2.47.1 + diff --git a/resources/hiding_ci/linux_patches/0011-KVM-guest_memfd-Add-flag-to-remove-from-direct-map.patch b/resources/hiding_ci/linux_patches/0011-KVM-guest_memfd-Add-flag-to-remove-from-direct-map.patch new file mode 100644 index 00000000000..dcce661a60e --- /dev/null +++ b/resources/hiding_ci/linux_patches/0011-KVM-guest_memfd-Add-flag-to-remove-from-direct-map.patch @@ -0,0 +1,178 @@ +From b1fc478976c93fd42b14e06d2de57e121be03142 Mon Sep 17 00:00:00 2001 +From: Patrick Roy +Date: Fri, 7 Feb 2025 14:33:01 +0000 +Subject: [PATCH 11/26] KVM: guest_memfd: Add flag to remove from direct map + +Add KVM_GMEM_NO_DIRECT_MAP flag for KVM_CREATE_GUEST_MEMFD() ioctl. When +set, guest_memfd folios will be removed from the direct map after +preparation, with direct map entries only restored when the folios are +freed. + +To ensure these folios do not end up in places where the kernel cannot +deal with them, set AS_NO_DIRECT_MAP on the guest_memfd's struct +address_space if KVM_GMEM_NO_DIRECT_MAP is requested. + +Add KVM_CAP_GMEM_NO_DIRECT_MAP to let userspace discover whether +guest_memfd supports KVM_GMEM_NO_DIRECT_MAP. Support depends on +guest_memfd itself being supported, but also on whether KVM can +manipulate the direct map at page granularity at all (possible most of +the time, just arm64 is a notable outlier where its impossible if the +direct map has been setup using hugepages, as arm64 cannot break these +apart due to break-before-make semantics). + +Note that this flag causes removal of direct map entries for all +guest_memfd folios independent of whether they are "shared" or "private" +(although current guest_memfd only supports either all folios in the +"shared" state, or all folios in the "private" state if +!IS_ENABLED(CONFIG_KVM_GMEM_SHARED_MEM)). 
The usecase for removing +direct map entries of also the shared parts of guest_memfd are a special +type of non-CoCo VM where, host userspace is trusted to have access to +all of guest memory, but where Spectre-style transient execution attacks +through the host kernel's direct map should still be mitigated. + +Note that KVM retains access to guest memory via userspace +mappings of guest_memfd, which are reflected back into KVM's memslots +via userspace_addr. This is needed for things like MMIO emulation on +x86_64 to work. Previous iterations attempted to instead have KVM +temporarily restore direct map entries whenever such an access to guest +memory was needed, but this turned out to have a significant performance +impact, as well as additional complexity due to needing to refcount +direct map reinsertion operations and making them play nicely with gmem +truncations. + +This iteration also doesn't have KVM perform TLB flushes after direct +map manipulations. This is because TLB flushes resulted in a up to 40x +elongation of page faults in guest_memfd (scaling with the number of CPU +cores), or a 5x elongation of memory population. On the one hand, TLB +flushes are not needed for functional correctness (the virt->phys +mapping technically stays "correct", the kernel should simply to not it +for a while), so this is a correct optimization to make. On the other +hand, it means that the desired protection from Spectre-style attacks is +not perfect, as an attacker could try to prevent a stale TLB entry from +getting evicted, keeping it alive until the page it refers to is used by +the guest for some sensitive data, and then targeting it using a +spectre-gadget. 
+ +Signed-off-by: Patrick Roy +--- + include/uapi/linux/kvm.h | 3 +++ + virt/kvm/guest_memfd.c | 28 +++++++++++++++++++++++++++- + virt/kvm/kvm_main.c | 5 +++++ + 3 files changed, 35 insertions(+), 1 deletion(-) + +diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h +index 117937a895da..fb02a93546d8 100644 +--- a/include/uapi/linux/kvm.h ++++ b/include/uapi/linux/kvm.h +@@ -930,6 +930,7 @@ struct kvm_enable_cap { + #define KVM_CAP_X86_APIC_BUS_CYCLES_NS 237 + #define KVM_CAP_X86_GUEST_MODE 238 + #define KVM_CAP_GMEM_SHARED_MEM 239 ++#define KVM_CAP_GMEM_NO_DIRECT_MAP 240 + + struct kvm_irq_routing_irqchip { + __u32 irqchip; +@@ -1573,6 +1574,8 @@ struct kvm_create_guest_memfd { + __u64 reserved[6]; + }; + ++#define KVM_GMEM_NO_DIRECT_MAP (1ULL << 0) ++ + #define KVM_PRE_FAULT_MEMORY _IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory) + + struct kvm_pre_fault_memory { +diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c +index fbf89e643add..a2b96bc51391 100644 +--- a/virt/kvm/guest_memfd.c ++++ b/virt/kvm/guest_memfd.c +@@ -4,6 +4,7 @@ + #include + #include + #include ++#include + + #include "kvm_mm.h" + +@@ -50,8 +51,23 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo + return 0; + } + ++static bool kvm_gmem_test_no_direct_map(struct inode *inode) ++{ ++ return ((unsigned long) inode->i_private) & KVM_GMEM_NO_DIRECT_MAP; ++} ++ + static inline void kvm_gmem_mark_prepared(struct folio *folio) + { ++ struct inode *inode = folio_inode(folio); ++ ++ if (kvm_gmem_test_no_direct_map(inode)) { ++ int r = set_direct_map_valid_noflush(folio_page(folio, 0), folio_nr_pages(folio), ++ false); ++ ++ if (!r) ++ folio_set_private(folio); ++ } ++ + folio_mark_uptodate(folio); + } + +@@ -478,6 +494,10 @@ static void kvm_gmem_free_folio(struct folio *folio) + kvm_pfn_t pfn = page_to_pfn(page); + int order = folio_order(folio); + ++ if (folio_test_private(folio)) ++ WARN_ON_ONCE(set_direct_map_valid_noflush(folio_page(folio, 0), ++ 
folio_nr_pages(folio), true)); ++ + kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order)); + } + #endif +@@ -551,6 +571,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) + /* Unmovable mappings are supposed to be marked unevictable as well. */ + WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping)); + ++ if (flags & KVM_GMEM_NO_DIRECT_MAP) ++ mapping_set_no_direct_map(inode->i_mapping); ++ + kvm_get_kvm(kvm); + gmem->kvm = kvm; + xa_init(&gmem->bindings); +@@ -570,7 +593,10 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args) + { + loff_t size = args->size; + u64 flags = args->flags; +- u64 valid_flags = 0; ++ u64 valid_flags = KVM_GMEM_NO_DIRECT_MAP; ++ ++ if (!can_set_direct_map()) ++ valid_flags &= ~KVM_GMEM_NO_DIRECT_MAP; + + if (flags & ~valid_flags) + return -EINVAL; +diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c +index 3e40acb9f5c0..32ca1c921ab0 100644 +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -65,6 +65,7 @@ + #include + + #include ++#include + + + /* Worst case buffer size needed for holding an integer. 
*/ +@@ -4823,6 +4824,10 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) + return kvm_supported_mem_attributes(kvm); + #endif + #ifdef CONFIG_KVM_PRIVATE_MEM ++ case KVM_CAP_GMEM_NO_DIRECT_MAP: ++ if (!can_set_direct_map()) ++ return false; ++ fallthrough; + case KVM_CAP_GUEST_MEMFD: + return !kvm || kvm_arch_has_private_mem(kvm); + #endif +-- +2.47.1 + diff --git a/resources/hiding_ci/linux_patches/0012-fixup-for-direct-map-removal-v4.patch b/resources/hiding_ci/linux_patches/0012-fixup-for-direct-map-removal-v4.patch new file mode 100644 index 00000000000..c54565134f1 --- /dev/null +++ b/resources/hiding_ci/linux_patches/0012-fixup-for-direct-map-removal-v4.patch @@ -0,0 +1,51 @@ +From ab44b2d5bfb7ef9b7bbb156d493f49a4bbebf014 Mon Sep 17 00:00:00 2001 +From: Nikita Kalyazin +Date: Thu, 10 Apr 2025 14:18:39 +0000 +Subject: [PATCH 12/26] fixup for direct map removal v4 + +Do not make kvm_gmem_free_folio dependent on +CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE . +--- + virt/kvm/guest_memfd.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c +index a2b96bc51391..291d647a5c80 100644 +--- a/virt/kvm/guest_memfd.c ++++ b/virt/kvm/guest_memfd.c +@@ -487,28 +487,28 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol + return MF_DELAYED; + } + +-#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE + static void kvm_gmem_free_folio(struct folio *folio) + { ++#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE + struct page *page = folio_page(folio, 0); + kvm_pfn_t pfn = page_to_pfn(page); + int order = folio_order(folio); ++#endif + + if (folio_test_private(folio)) + WARN_ON_ONCE(set_direct_map_valid_noflush(folio_page(folio, 0), + folio_nr_pages(folio), true)); + ++#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE + kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order)); +-} + #endif ++} + + static const struct address_space_operations kvm_gmem_aops = { + .dirty_folio = 
noop_dirty_folio, + .migrate_folio = kvm_gmem_migrate_folio, + .error_remove_folio = kvm_gmem_error_folio, +-#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE + .free_folio = kvm_gmem_free_folio, +-#endif + }; + + static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path, +-- +2.47.1 + diff --git a/resources/hiding_ci/linux_patches/0013-KVM-Add-KVM_MEM_USERFAULT-memslot-flag-and-bitmap.patch b/resources/hiding_ci/linux_patches/0013-KVM-Add-KVM_MEM_USERFAULT-memslot-flag-and-bitmap.patch new file mode 100644 index 00000000000..f4a62443b72 --- /dev/null +++ b/resources/hiding_ci/linux_patches/0013-KVM-Add-KVM_MEM_USERFAULT-memslot-flag-and-bitmap.patch @@ -0,0 +1,161 @@ +From 48a178e27031d5eac97ba0630686fcf3034e88ed Mon Sep 17 00:00:00 2001 +From: James Houghton +Date: Thu, 9 Jan 2025 20:49:17 +0000 +Subject: [PATCH 13/26] KVM: Add KVM_MEM_USERFAULT memslot flag and bitmap + +Use one of the 14 reserved u64s in struct kvm_userspace_memory_region2 +for the user to provide `userfault_bitmap`. + +The memslot flag indicates if KVM should be reading from the +`userfault_bitmap` field from the memslot. The user is permitted to +provide a bogus pointer. If the pointer cannot be read from, we will +return -EFAULT (with no other information) back to the user. 
+ +Signed-off-by: James Houghton +--- + include/linux/kvm_host.h | 14 ++++++++++++++ + include/uapi/linux/kvm.h | 4 +++- + virt/kvm/Kconfig | 3 +++ + virt/kvm/kvm_main.c | 36 ++++++++++++++++++++++++++++++++++++ + 4 files changed, 56 insertions(+), 1 deletion(-) + +diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h +index ec3bedc18eab..6cd0d910678e 100644 +--- a/include/linux/kvm_host.h ++++ b/include/linux/kvm_host.h +@@ -596,6 +596,7 @@ struct kvm_memory_slot { + unsigned long *dirty_bitmap; + struct kvm_arch_memory_slot arch; + unsigned long userspace_addr; ++ unsigned long __user *userfault_bitmap; + u32 flags; + short id; + u16 as_id; +@@ -746,6 +747,11 @@ static inline bool kvm_arch_has_readonly_mem(struct kvm *kvm) + } + #endif + ++static inline bool kvm_has_userfault(struct kvm *kvm) ++{ ++ return IS_ENABLED(CONFIG_HAVE_KVM_USERFAULT); ++} ++ + struct kvm_memslots { + u64 generation; + atomic_long_t last_used_slot; +@@ -2592,4 +2598,12 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, + void kvm_gmem_handle_folio_put(struct folio *folio); + #endif + ++int kvm_gfn_userfault(struct kvm *kvm, struct kvm_memory_slot *memslot, ++ gfn_t gfn); ++ ++static inline bool kvm_memslot_userfault(struct kvm_memory_slot *memslot) ++{ ++ return memslot->flags & KVM_MEM_USERFAULT; ++} ++ + #endif +diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h +index fb02a93546d8..03676746be71 100644 +--- a/include/uapi/linux/kvm.h ++++ b/include/uapi/linux/kvm.h +@@ -40,7 +40,8 @@ struct kvm_userspace_memory_region2 { + __u64 guest_memfd_offset; + __u32 guest_memfd; + __u32 pad1; +- __u64 pad2[14]; ++ __u64 userfault_bitmap; ++ __u64 pad2[13]; + }; + + /* +@@ -51,6 +52,7 @@ struct kvm_userspace_memory_region2 { + #define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) + #define KVM_MEM_READONLY (1UL << 1) + #define KVM_MEM_GUEST_MEMFD (1UL << 2) ++#define KVM_MEM_USERFAULT (1UL << 3) + + /* for KVM_IRQ_LINE */ + struct kvm_irq_level { +diff --git 
a/virt/kvm/Kconfig b/virt/kvm/Kconfig +index 4e759e8020c5..7987fed3f3ec 100644 +--- a/virt/kvm/Kconfig ++++ b/virt/kvm/Kconfig +@@ -128,3 +128,6 @@ config HAVE_KVM_ARCH_GMEM_INVALIDATE + config KVM_GMEM_SHARED_MEM + select KVM_PRIVATE_MEM + bool ++ ++config HAVE_KVM_USERFAULT ++ bool +\ No newline at end of file +diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c +index 32ca1c921ab0..fb3ccf0cbb04 100644 +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -1532,6 +1532,9 @@ static int check_memory_region_flags(struct kvm *kvm, + !(mem->flags & KVM_MEM_GUEST_MEMFD)) + valid_flags |= KVM_MEM_READONLY; + ++ if (kvm_has_userfault(kvm)) ++ valid_flags |= KVM_MEM_USERFAULT; ++ + if (mem->flags & ~valid_flags) + return -EINVAL; + +@@ -1968,6 +1971,13 @@ static int kvm_set_memory_region(struct kvm *kvm, + (mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES) + return -EINVAL; + ++ if (mem->flags & KVM_MEM_USERFAULT && ++ ((mem->userfault_bitmap != untagged_addr(mem->userfault_bitmap)) || ++ !access_ok((void __user *)(unsigned long)mem->userfault_bitmap, ++ DIV_ROUND_UP(mem->memory_size >> PAGE_SHIFT, BITS_PER_LONG) ++ * sizeof(long)))) ++ return -EINVAL; ++ + slots = __kvm_memslots(kvm, as_id); + + /* +@@ -2035,6 +2045,9 @@ static int kvm_set_memory_region(struct kvm *kvm, + if (r) + goto out; + } ++ if (mem->flags & KVM_MEM_USERFAULT) ++ new->userfault_bitmap = ++ (unsigned long __user *)(unsigned long)mem->userfault_bitmap; + + r = kvm_set_memslot(kvm, old, new, change); + if (r) +@@ -6468,3 +6481,26 @@ void kvm_exit(void) + kvm_irqfd_exit(); + } + EXPORT_SYMBOL_GPL(kvm_exit); ++ ++int kvm_gfn_userfault(struct kvm *kvm, struct kvm_memory_slot *memslot, ++ gfn_t gfn) ++{ ++ unsigned long bitmap_chunk = 0; ++ off_t offset; ++ ++ if (!kvm_memslot_userfault(memslot)) ++ return 0; ++ ++ if (WARN_ON_ONCE(!memslot->userfault_bitmap)) ++ return 0; ++ ++ offset = gfn - memslot->base_gfn; ++ ++ if (copy_from_user(&bitmap_chunk, ++ memslot->userfault_bitmap + offset / 
BITS_PER_LONG, ++ sizeof(bitmap_chunk))) ++ return -EFAULT; ++ ++ /* Set in the bitmap means that the gfn is userfault */ ++ return !!(bitmap_chunk & (1ul << (offset % BITS_PER_LONG))); ++} +\ No newline at end of file +-- +2.47.1 + diff --git a/resources/hiding_ci/linux_patches/0014-KVM-Add-KVM_MEMORY_EXIT_FLAG_USERFAULT.patch b/resources/hiding_ci/linux_patches/0014-KVM-Add-KVM_MEMORY_EXIT_FLAG_USERFAULT.patch new file mode 100644 index 00000000000..dddc2b9dbfd --- /dev/null +++ b/resources/hiding_ci/linux_patches/0014-KVM-Add-KVM_MEMORY_EXIT_FLAG_USERFAULT.patch @@ -0,0 +1,28 @@ +From 51a78015a0114ceaf1930739bba6111b1bc09f87 Mon Sep 17 00:00:00 2001 +From: James Houghton +Date: Thu, 9 Jan 2025 20:49:18 +0000 +Subject: [PATCH 14/26] KVM: Add KVM_MEMORY_EXIT_FLAG_USERFAULT + +This flag is used for vCPU memory faults caused by KVM Userfault; i.e., +the bit in `userfault_bitmap` corresponding to the faulting gfn was set. + +Signed-off-by: James Houghton +--- + include/uapi/linux/kvm.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h +index 03676746be71..0e1a2fac5735 100644 +--- a/include/uapi/linux/kvm.h ++++ b/include/uapi/linux/kvm.h +@@ -444,6 +444,7 @@ struct kvm_run { + /* KVM_EXIT_MEMORY_FAULT */ + struct { + #define KVM_MEMORY_EXIT_FLAG_PRIVATE (1ULL << 3) ++#define KVM_MEMORY_EXIT_FLAG_USERFAULT (1ULL << 4) + __u64 flags; + __u64 gpa; + __u64 size; +-- +2.47.1 + diff --git a/resources/hiding_ci/linux_patches/0015-KVM-Allow-late-setting-of-KVM_MEM_USERFAULT-on-guest.patch b/resources/hiding_ci/linux_patches/0015-KVM-Allow-late-setting-of-KVM_MEM_USERFAULT-on-guest.patch new file mode 100644 index 00000000000..7960341db8a --- /dev/null +++ b/resources/hiding_ci/linux_patches/0015-KVM-Allow-late-setting-of-KVM_MEM_USERFAULT-on-guest.patch @@ -0,0 +1,58 @@ +From ed691412fd9414d3b9124e2416f6cae3f21a1071 Mon Sep 17 00:00:00 2001 +From: James Houghton +Date: Thu, 9 Jan 2025 20:49:19 +0000 +Subject: [PATCH 
15/26] KVM: Allow late setting of KVM_MEM_USERFAULT on + guest_memfd memslot + +Currently guest_memfd memslots can only be deleted. Slightly change the +logic to allow KVM_MR_FLAGS_ONLY changes when the only flag being +changed is KVM_MEM_USERFAULT. + +Signed-off-by: James Houghton +--- + virt/kvm/kvm_main.c | 15 +++++++++++---- + 1 file changed, 11 insertions(+), 4 deletions(-) + +diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c +index fb3ccf0cbb04..c60fe692de03 100644 +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -2009,9 +2009,6 @@ static int kvm_set_memory_region(struct kvm *kvm, + if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages) + return -EINVAL; + } else { /* Modify an existing slot. */ +- /* Private memslots are immutable, they can only be deleted. */ +- if (mem->flags & KVM_MEM_GUEST_MEMFD) +- return -EINVAL; + if ((mem->userspace_addr != old->userspace_addr) || + (npages != old->npages) || + ((mem->flags ^ old->flags) & KVM_MEM_READONLY)) +@@ -2025,6 +2022,16 @@ static int kvm_set_memory_region(struct kvm *kvm, + return 0; + } + ++ /* ++ * Except for being able to set KVM_MEM_USERFAULT, private memslots are ++ * immutable, they can only be deleted. 
++ */ ++ if (mem->flags & KVM_MEM_GUEST_MEMFD && ++ !(change == KVM_MR_CREATE || ++ (change == KVM_MR_FLAGS_ONLY && ++ (mem->flags ^ old->flags) == KVM_MEM_USERFAULT))) ++ return -EINVAL; ++ + if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) && + kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages)) + return -EEXIST; +@@ -2040,7 +2047,7 @@ static int kvm_set_memory_region(struct kvm *kvm, + new->npages = npages; + new->flags = mem->flags; + new->userspace_addr = mem->userspace_addr; +- if (mem->flags & KVM_MEM_GUEST_MEMFD) { ++ if (mem->flags & KVM_MEM_GUEST_MEMFD && change == KVM_MR_CREATE) { + r = kvm_gmem_bind(kvm, new, mem->guest_memfd, mem->guest_memfd_offset); + if (r) + goto out; +-- +2.47.1 + diff --git a/resources/hiding_ci/linux_patches/0016-KVM-x86-mmu-Add-support-for-KVM_MEM_USERFAULT.patch b/resources/hiding_ci/linux_patches/0016-KVM-x86-mmu-Add-support-for-KVM_MEM_USERFAULT.patch new file mode 100644 index 00000000000..ca31ca9518b --- /dev/null +++ b/resources/hiding_ci/linux_patches/0016-KVM-x86-mmu-Add-support-for-KVM_MEM_USERFAULT.patch @@ -0,0 +1,217 @@ +From fa324f2e503cd36dc357c3eb9b807e02f9b6206e Mon Sep 17 00:00:00 2001 +From: James Houghton +Date: Thu, 9 Jan 2025 20:49:21 +0000 +Subject: [PATCH 16/26] KVM: x86/mmu: Add support for KVM_MEM_USERFAULT + +Adhering to the requirements of KVM Userfault: + +1. Zap all sptes for the memslot when KVM_MEM_USERFAULT is toggled on + with kvm_arch_flush_shadow_memslot(). +2. Only all PAGE_SIZE sptes when KVM_MEM_USERFAULT is enabled (for both + normal/GUP memory and guest_memfd memory). +3. Reconstruct huge mappings when KVM_MEM_USERFAULT is toggled off with + kvm_mmu_recover_huge_pages(). This is the behavior when dirty logging + is disabled; remain consistent with it. + +With the new logic in kvm_mmu_slot_apply_flags(), I've simplified the +two dirty-logging-toggle checks into one, and I have dropped the +WARN_ON() that was there. 
+ +Signed-off-by: James Houghton +--- + arch/x86/kvm/Kconfig | 1 + + arch/x86/kvm/mmu/mmu.c | 28 +++++++++++++++++++++---- + arch/x86/kvm/mmu/mmu_internal.h | 20 +++++++++++++++--- + arch/x86/kvm/x86.c | 36 ++++++++++++++++++++++++--------- + include/linux/kvm_host.h | 5 ++++- + 5 files changed, 72 insertions(+), 18 deletions(-) + +diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig +index 22d1bcdaad58..6b1ef6402e30 100644 +--- a/arch/x86/kvm/Kconfig ++++ b/arch/x86/kvm/Kconfig +@@ -48,6 +48,7 @@ config KVM_X86 + select KVM_PRIVATE_MEM if KVM_SW_PROTECTED_VM + select KVM_GMEM_SHARED_MEM if KVM_SW_PROTECTED_VM + select KVM_WERROR if WERROR ++ select HAVE_KVM_USERFAULT + + config KVM + tristate "Kernel-based Virtual Machine (KVM) support" +diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c +index 8160870398b9..7ac7dc164522 100644 +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -4292,14 +4292,20 @@ static inline u8 kvm_max_level_for_order(int order) + return PG_LEVEL_4K; + } + +-static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, +- u8 max_level, int gmem_order) ++static u8 kvm_max_private_mapping_level(struct kvm *kvm, ++ struct kvm_memory_slot *slot, ++ kvm_pfn_t pfn, ++ u8 max_level, ++ int gmem_order) + { + u8 req_max_level; + + if (max_level == PG_LEVEL_4K) + return PG_LEVEL_4K; + ++ if (kvm_memslot_userfault(slot)) ++ return PG_LEVEL_4K; ++ + max_level = min(kvm_max_level_for_order(gmem_order), max_level); + if (max_level == PG_LEVEL_4K) + return PG_LEVEL_4K; +@@ -4336,8 +4342,10 @@ static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu, + } + + fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY); +- fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn, +- fault->max_level, max_order); ++ fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->slot, ++ fault->pfn, ++ fault->max_level, ++ max_order); + + return RET_PF_CONTINUE; + } +@@ -4346,6 +4354,18 @@ static int 
__kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) + { + unsigned int foll = fault->write ? FOLL_WRITE : 0; ++ int userfault; ++ ++ userfault = kvm_gfn_userfault(vcpu->kvm, fault->slot, fault->gfn); ++ if (userfault < 0) ++ return userfault; ++ if (userfault) { ++ kvm_mmu_prepare_userfault_exit(vcpu, fault); ++ return -EFAULT; ++ } ++ ++ if (kvm_memslot_userfault(fault->slot)) ++ fault->max_level = PG_LEVEL_4K; + + if (fault->is_private) + return kvm_mmu_faultin_pfn_private(vcpu, fault); +diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h +index 75f00598289d..d1f18dcc18fb 100644 +--- a/arch/x86/kvm/mmu/mmu_internal.h ++++ b/arch/x86/kvm/mmu/mmu_internal.h +@@ -335,12 +335,26 @@ enum { + */ + static_assert(RET_PF_CONTINUE == 0); + +-static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, +- struct kvm_page_fault *fault) ++static inline void __kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, ++ struct kvm_page_fault *fault, ++ bool is_userfault) + { + kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT, + PAGE_SIZE, fault->write, fault->exec, +- fault->is_private); ++ fault->is_private, ++ is_userfault); ++} ++ ++static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, ++ struct kvm_page_fault *fault) ++{ ++ __kvm_mmu_prepare_memory_fault_exit(vcpu, fault, false); ++} ++ ++static inline void kvm_mmu_prepare_userfault_exit(struct kvm_vcpu *vcpu, ++ struct kvm_page_fault *fault) ++{ ++ __kvm_mmu_prepare_memory_fault_exit(vcpu, fault, true); + } + + static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 4b64ab350bcd..04034ca04703 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -13075,12 +13075,36 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, + u32 new_flags = new ? 
new->flags : 0; + bool log_dirty_pages = new_flags & KVM_MEM_LOG_DIRTY_PAGES; + ++ /* ++ * When toggling KVM Userfault on, zap all sptes so that userfault-ness ++ * will be respected at refault time. All new faults will only install ++ * small sptes. Therefore, when toggling it off, recover hugepages. ++ * ++ * For MOVE and DELETE, there will be nothing to do, as the old ++ * mappings will have already been deleted by ++ * kvm_arch_flush_shadow_memslot(). ++ * ++ * For CREATE, no mappings will have been created yet. ++ */ ++ if ((old_flags ^ new_flags) & KVM_MEM_USERFAULT && ++ (change == KVM_MR_FLAGS_ONLY)) { ++ if (old_flags & KVM_MEM_USERFAULT) ++ kvm_mmu_recover_huge_pages(kvm, new); ++ else ++ kvm_arch_flush_shadow_memslot(kvm, old); ++ } ++ ++ /* ++ * Nothing more to do if dirty logging isn't being toggled. ++ */ ++ if (!((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)) ++ return; ++ + /* + * Update CPU dirty logging if dirty logging is being toggled. This + * applies to all operations. + */ +- if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) +- kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages); ++ kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages); + + /* + * Nothing more to do for RO slots (which can't be dirtied and can't be +@@ -13100,14 +13124,6 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, + if ((change != KVM_MR_FLAGS_ONLY) || (new_flags & KVM_MEM_READONLY)) + return; + +- /* +- * READONLY and non-flags changes were filtered out above, and the only +- * other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty +- * logging isn't being toggled on or off. 
+- */ +- if (WARN_ON_ONCE(!((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES))) +- return; +- + if (!log_dirty_pages) { + /* + * Recover huge page mappings in the slot now that dirty logging +diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h +index 6cd0d910678e..4a5379367332 100644 +--- a/include/linux/kvm_host.h ++++ b/include/linux/kvm_host.h +@@ -2499,7 +2499,8 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr) + static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, + gpa_t gpa, gpa_t size, + bool is_write, bool is_exec, +- bool is_private) ++ bool is_private, ++ bool is_userfault) + { + vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT; + vcpu->run->memory_fault.gpa = gpa; +@@ -2509,6 +2510,8 @@ static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, + vcpu->run->memory_fault.flags = 0; + if (is_private) + vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE; ++ if (is_userfault) ++ vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_USERFAULT; + } + + #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES +-- +2.47.1 + diff --git a/resources/hiding_ci/linux_patches/0017-KVM-Advertise-KVM_CAP_USERFAULT-in-KVM_CHECK_EXTENSI.patch b/resources/hiding_ci/linux_patches/0017-KVM-Advertise-KVM_CAP_USERFAULT-in-KVM_CHECK_EXTENSI.patch new file mode 100644 index 00000000000..c89c7c9b262 --- /dev/null +++ b/resources/hiding_ci/linux_patches/0017-KVM-Advertise-KVM_CAP_USERFAULT-in-KVM_CHECK_EXTENSI.patch @@ -0,0 +1,45 @@ +From f0ef961eba32b98755d2bfa5ff684944e3a442fc Mon Sep 17 00:00:00 2001 +From: James Houghton +Date: Thu, 9 Jan 2025 20:49:20 +0000 +Subject: [PATCH 17/26] KVM: Advertise KVM_CAP_USERFAULT in KVM_CHECK_EXTENSION + +Advertise support for KVM_CAP_USERFAULT when kvm_has_userfault() returns +true. Currently this is merely IS_ENABLED(CONFIG_HAVE_KVM_USERFAULT), so +it is somewhat redundant. 
+ +Signed-off-by: James Houghton +--- + include/uapi/linux/kvm.h | 1 + + virt/kvm/kvm_main.c | 4 ++++ + 2 files changed, 5 insertions(+) + +diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h +index 0e1a2fac5735..f5ad5d39c24b 100644 +--- a/include/uapi/linux/kvm.h ++++ b/include/uapi/linux/kvm.h +@@ -934,6 +934,7 @@ struct kvm_enable_cap { + #define KVM_CAP_X86_GUEST_MODE 238 + #define KVM_CAP_GMEM_SHARED_MEM 239 + #define KVM_CAP_GMEM_NO_DIRECT_MAP 240 ++#define KVM_CAP_USERFAULT 241 + + struct kvm_irq_routing_irqchip { + __u32 irqchip; +diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c +index c60fe692de03..bb85ea8d0f85 100644 +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -4854,6 +4854,10 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) + #ifdef CONFIG_KVM_GMEM_SHARED_MEM + case KVM_CAP_GMEM_SHARED_MEM: + return !kvm || kvm_arch_gmem_supports_shared_mem(kvm); ++#endif ++#ifdef CONFIG_HAVE_KVM_USERFAULT ++ case KVM_CAP_USERFAULT: ++ return kvm_has_userfault(kvm); + #endif + default: + break; +-- +2.47.1 + diff --git a/resources/hiding_ci/linux_patches/0018-KVM-arm64-Add-support-for-KVM_MEM_USERFAULT.patch b/resources/hiding_ci/linux_patches/0018-KVM-arm64-Add-support-for-KVM_MEM_USERFAULT.patch new file mode 100644 index 00000000000..58f076e27cb --- /dev/null +++ b/resources/hiding_ci/linux_patches/0018-KVM-arm64-Add-support-for-KVM_MEM_USERFAULT.patch @@ -0,0 +1,87 @@ +From 482a64008a53577da046428922f247dce203113f Mon Sep 17 00:00:00 2001 +From: James Houghton +Date: Thu, 9 Jan 2025 20:49:22 +0000 +Subject: [PATCH 18/26] KVM: arm64: Add support for KVM_MEM_USERFAULT + +Adhering to the requirements of KVM Userfault: +1. When it is toggled on, zap the second stage with + kvm_arch_flush_shadow_memslot(). This is to respect userfault-ness. +2. When KVM_MEM_USERFAULT is enabled, restrict new second-stage mappings + to be PAGE_SIZE, just like when dirty logging is enabled. 
+ +Do not zap the second stage when KVM_MEM_USERFAULT is disabled to remain +consistent with the behavior when dirty logging is disabled. + +Signed-off-by: James Houghton +--- + arch/arm64/kvm/Kconfig | 1 + + arch/arm64/kvm/mmu.c | 27 ++++++++++++++++++++++++++- + 2 files changed, 27 insertions(+), 1 deletion(-) + +diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig +index 4830d8805bed..aa0f438fba1c 100644 +--- a/arch/arm64/kvm/Kconfig ++++ b/arch/arm64/kvm/Kconfig +@@ -39,6 +39,7 @@ menuconfig KVM + select SCHED_INFO + select GUEST_PERF_EVENTS if PERF_EVENTS + select KVM_GMEM_SHARED_MEM ++ select HAVE_KVM_USERFAULT + help + Support hosting virtualized guest machines. + +diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c +index adb0681fc1c6..39d9a02db9e9 100644 +--- a/arch/arm64/kvm/mmu.c ++++ b/arch/arm64/kvm/mmu.c +@@ -1497,7 +1497,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + kvm_pfn_t pfn; + bool logging_active = memslot_is_logging(memslot); + bool is_gmem = kvm_mem_is_private(kvm, gfn); +- bool force_pte = logging_active || is_gmem || is_protected_kvm_enabled(); ++ bool force_pte = logging_active || is_gmem || is_protected_kvm_enabled() || ++ kvm_memslot_userfault(memslot); + long vma_pagesize, fault_granule = PAGE_SIZE; + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; + struct kvm_pgtable *pgt; +@@ -1635,6 +1636,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + mmu_seq = vcpu->kvm->mmu_invalidate_seq; + mmap_read_unlock(current->mm); + ++ if (kvm_gfn_userfault(kvm, memslot, gfn)) { ++ kvm_prepare_memory_fault_exit(vcpu, gfn << PAGE_SHIFT, ++ PAGE_SIZE, write_fault, ++ exec_fault, false, true); ++ return -EFAULT; ++ } ++ + pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_gmem); + if (pfn == KVM_PFN_ERR_HWPOISON) { + kvm_send_hwpoison_signal(hva, vma_shift); +@@ -2125,6 +2133,23 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, + enum kvm_mr_change change) 
+ { + bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES; ++ u32 new_flags = new ? new->flags : 0; ++ u32 changed_flags = (new_flags) ^ (old ? old->flags : 0); ++ ++ /* ++ * If KVM_MEM_USERFAULT has been enabled, drop all the stage-2 mappings ++ * so that we can respect userfault-ness. ++ */ ++ if ((changed_flags & KVM_MEM_USERFAULT) && ++ (new_flags & KVM_MEM_USERFAULT) && ++ change == KVM_MR_FLAGS_ONLY) ++ kvm_arch_flush_shadow_memslot(kvm, old); ++ ++ /* ++ * Nothing left to do if not toggling dirty logging. ++ */ ++ if (!(changed_flags & KVM_MEM_LOG_DIRTY_PAGES)) ++ return; + + /* + * At this point memslot has been committed and there is an +-- +2.47.1 + diff --git a/resources/hiding_ci/linux_patches/0019-KVM-guest_memfd-add-generic-population-via-write.patch b/resources/hiding_ci/linux_patches/0019-KVM-guest_memfd-add-generic-population-via-write.patch new file mode 100644 index 00000000000..0c05129841e --- /dev/null +++ b/resources/hiding_ci/linux_patches/0019-KVM-guest_memfd-add-generic-population-via-write.patch @@ -0,0 +1,141 @@ +From f81fae83d40e1520a0a46afa3473f9fc4c6b7c79 Mon Sep 17 00:00:00 2001 +From: Nikita Kalyazin +Date: Fri, 29 Nov 2024 11:51:02 +0000 +Subject: [PATCH 19/26] KVM: guest_memfd: add generic population via write + +write syscall populates guest_memfd with user-supplied data in a generic +way, ie no vendor-specific preparation is performed. This is supposed +to be used in non-CoCo setups where guest memory is not +hardware-encrypted. 
+ +The following behaviour is implemented: + - only page-aligned count and offset are allowed + - if the memory is already allocated, the call will successfully + populate it + - if the memory is not allocated, the call will both allocate and + populate + - if the memory is already populated, the call will not repopulate it + +Signed-off-by: Nikita Kalyazin +--- + virt/kvm/guest_memfd.c | 94 ++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 91 insertions(+), 3 deletions(-) + +diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c +index 291d647a5c80..5abb6d52a375 100644 +--- a/virt/kvm/guest_memfd.c ++++ b/virt/kvm/guest_memfd.c +@@ -432,12 +432,97 @@ static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma) + + return 0; + } +-#else +-#define kvm_gmem_mmap NULL ++ ++static ssize_t kvm_kmem_gmem_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ pgoff_t start, end, index; ++ ssize_t ret = 0; ++ ++ if (!PAGE_ALIGNED(*offset) || !PAGE_ALIGNED(count)) ++ return -EINVAL; ++ ++ if (*offset + count > i_size_read(file_inode(file))) ++ return -EINVAL; ++ ++ if (!buf) ++ return -EINVAL; ++ ++ start = *offset >> PAGE_SHIFT; ++ end = (*offset + count) >> PAGE_SHIFT; ++ ++ filemap_invalidate_lock_shared(file->f_mapping); ++ ++ for (index = start; index < end; ) { ++ struct folio *folio; ++ void *vaddr; ++ pgoff_t buf_offset = (index - start) << PAGE_SHIFT; ++ ++ if (signal_pending(current)) { ++ ret = -EINTR; ++ goto out; ++ } ++ ++ folio = kvm_gmem_get_folio(file_inode(file), index); ++ if (IS_ERR(folio)) { ++ ret = -EFAULT; ++ goto out; ++ } ++ ++ if (folio_test_hwpoison(folio)) { ++ folio_unlock(folio); ++ folio_put(folio); ++ ret = -EFAULT; ++ goto out; ++ } ++ ++ /* No support for huge pages. 
*/ ++ if (WARN_ON_ONCE(folio_test_large(folio))) { ++ folio_unlock(folio); ++ folio_put(folio); ++ ret = -EFAULT; ++ goto out; ++ } ++ ++ if (folio_test_uptodate(folio)) { ++ folio_unlock(folio); ++ folio_put(folio); ++ ret = -ENOSPC; ++ goto out; ++ } ++ ++ folio_unlock(folio); ++ ++ vaddr = kmap_local_folio(folio, 0); ++ ret = copy_from_user(vaddr, buf + buf_offset, PAGE_SIZE); ++ kunmap_local(vaddr); ++ if (ret) { ++ ret = -EINVAL; ++ folio_put(folio); ++ goto out; ++ } ++ ++ kvm_gmem_mark_prepared(folio); ++ folio_put(folio); ++ ++ index = folio_next_index(folio); ++ *offset += PAGE_SIZE; ++ } ++ ++out: ++ filemap_invalidate_unlock_shared(file->f_mapping); ++ ++ return ret && start == (*offset >> PAGE_SHIFT) ? ++ ret : *offset - (start << PAGE_SHIFT); ++} + #endif /* CONFIG_KVM_GMEM_SHARED_MEM */ + + static struct file_operations kvm_gmem_fops = { +- .mmap = kvm_gmem_mmap, ++#ifdef CONFIG_KVM_GMEM_SHARED_MEM ++ .mmap = kvm_gmem_mmap, ++ .llseek = default_llseek, ++ .write = kvm_kmem_gmem_write, ++#endif /* CONFIG_KVM_GMEM_SHARED_MEM */ + .open = generic_file_open, + .release = kvm_gmem_release, + .fallocate = kvm_gmem_fallocate, +@@ -557,6 +642,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) + } + + file->f_flags |= O_LARGEFILE; ++#ifdef CONFIG_KVM_GMEM_SHARED_MEM ++ file->f_mode |= FMODE_LSEEK | FMODE_PWRITE; ++#endif /* CONFIG_KVM_GMEM_SHARED_MEM */ + + inode = file->f_inode; + WARN_ON(file->f_mapping != inode->i_mapping); +-- +2.47.1 + diff --git a/resources/hiding_ci/linux_patches/0020-KVM-selftests-update-guest_memfd-write-tests.patch b/resources/hiding_ci/linux_patches/0020-KVM-selftests-update-guest_memfd-write-tests.patch new file mode 100644 index 00000000000..869144f63d0 --- /dev/null +++ b/resources/hiding_ci/linux_patches/0020-KVM-selftests-update-guest_memfd-write-tests.patch @@ -0,0 +1,126 @@ +From 3ccb28e0fe31afa8ac626ebd5b957ba9263a68d3 Mon Sep 17 00:00:00 2001 +From: Nikita Kalyazin +Date: Fri, 29 Nov 2024 11:57:58 
+0000 +Subject: [PATCH 20/26] KVM: selftests: update guest_memfd write tests + +This is to reflect that the write syscall is now implemented for +guest_memfd. + +Signed-off-by: Nikita Kalyazin +--- + .../testing/selftests/kvm/guest_memfd_test.c | 85 +++++++++++++++++-- + 1 file changed, 79 insertions(+), 6 deletions(-) + +diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c +index 38c501e49e0e..b07221aa54c9 100644 +--- a/tools/testing/selftests/kvm/guest_memfd_test.c ++++ b/tools/testing/selftests/kvm/guest_memfd_test.c +@@ -20,18 +20,90 @@ + #include "kvm_util.h" + #include "test_util.h" + +-static void test_file_read_write(int fd) ++static void test_file_read(int fd) + { + char buf[64]; + + TEST_ASSERT(read(fd, buf, sizeof(buf)) < 0, + "read on a guest_mem fd should fail"); +- TEST_ASSERT(write(fd, buf, sizeof(buf)) < 0, +- "write on a guest_mem fd should fail"); + TEST_ASSERT(pread(fd, buf, sizeof(buf), 0) < 0, + "pread on a guest_mem fd should fail"); +- TEST_ASSERT(pwrite(fd, buf, sizeof(buf), 0) < 0, +- "pwrite on a guest_mem fd should fail"); ++} ++ ++static void test_file_write(int fd, size_t total_size) ++{ ++ size_t page_size = getpagesize(); ++ void *buf = NULL; ++ int ret; ++ ++ ret = posix_memalign(&buf, page_size, total_size); ++ TEST_ASSERT_EQ(ret, 0); ++ ++ /* Check arguments correctness checks work as expected */ ++ ++ ret = pwrite(fd, buf, page_size - 1, 0); ++ TEST_ASSERT(ret == -1, "write unaligned count on a guest_mem fd should fail"); ++ TEST_ASSERT_EQ(errno, EINVAL); ++ ++ ret = pwrite(fd, buf, page_size, 1); ++ TEST_ASSERT(ret == -1, "write unaligned offset on a guest_mem fd should fail"); ++ TEST_ASSERT_EQ(errno, EINVAL); ++ ++ ret = pwrite(fd, buf, page_size, total_size); ++ TEST_ASSERT(ret == -1, "writing past the file size on a guest_mem fd should fail"); ++ TEST_ASSERT_EQ(errno, EINVAL); ++ ++ ret = pwrite(fd, NULL, page_size, 0); ++ TEST_ASSERT(ret == -1, "supplying a NULL buffer 
when writing a guest_mem fd should fail"); ++ TEST_ASSERT_EQ(errno, EINVAL); ++ ++ /* Check double population is not allowed */ ++ ++ ret = pwrite(fd, buf, page_size, 0); ++ TEST_ASSERT(ret == page_size, "page-aligned write on a guest_mem fd should succeed"); ++ ++ ret = pwrite(fd, buf, page_size, 0); ++ TEST_ASSERT(ret == -1, "write on already populated guest_mem fd should fail"); ++ TEST_ASSERT_EQ(errno, ENOSPC); ++ ++ ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, page_size); ++ TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) should succeed"); ++ ++ /* Check population is allowed again after punching a hole */ ++ ++ ret = pwrite(fd, buf, page_size, 0); ++ TEST_ASSERT(ret == page_size, "page-aligned write on a punched guest_mem fd should succeed"); ++ ++ ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, page_size); ++ TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) should succeed"); ++ ++ /* Check population of already allocated memory is allowed */ ++ ++ ret = fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, page_size); ++ TEST_ASSERT(!ret, "fallocate with aligned offset and size should succeed"); ++ ++ ret = pwrite(fd, buf, page_size, 0); ++ TEST_ASSERT(ret == page_size, "write on a preallocated guest_mem fd should succeed"); ++ ++ ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, page_size); ++ TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) should succeed"); ++ ++ /* Check population works until an already populated page is encountered */ ++ ++ ret = pwrite(fd, buf, total_size, 0); ++ TEST_ASSERT(ret == total_size, "page-aligned write on a guest_mem fd should succeed"); ++ ++ ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, page_size); ++ TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) should succeed"); ++ ++ ret = pwrite(fd, buf, total_size, 0); ++ TEST_ASSERT(ret == page_size, "write on a guest_mem fd should not overwrite data"); ++ ++ ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, total_size); ++ 
TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) should succeed"); ++ ++ ++ free(buf); + } + + static void test_mmap_allowed(int fd, size_t total_size) +@@ -233,7 +305,8 @@ void test_vm_type(unsigned long type, bool is_shared) + + fd = vm_create_guest_memfd(vm, total_size, 0); + +- test_file_read_write(fd); ++ test_file_read(fd); ++ test_file_write(fd, total_size); + + if (is_shared) + test_mmap_allowed(fd, total_size); +-- +2.47.1 + diff --git a/resources/hiding_ci/linux_patches/0021-mm-userfaultfd-generic-continue-for-non-hugetlbfs.patch b/resources/hiding_ci/linux_patches/0021-mm-userfaultfd-generic-continue-for-non-hugetlbfs.patch new file mode 100644 index 00000000000..4818a87a713 --- /dev/null +++ b/resources/hiding_ci/linux_patches/0021-mm-userfaultfd-generic-continue-for-non-hugetlbfs.patch @@ -0,0 +1,153 @@ +From 51dc7d27476d00d96f6f71882a11b5e17e80aa8f Mon Sep 17 00:00:00 2001 +From: Nikita Kalyazin +Date: Mon, 31 Mar 2025 10:15:35 +0000 +Subject: [PATCH 21/26] mm: userfaultfd: generic continue for non hugetlbfs + +Remove shmem-specific code from UFFDIO_CONTINUE implementation for +non-huge pages by calling vm_ops->fault(). A new VMF flag, +FAULT_FLAG_USERFAULT_CONTINUE, is introduced to avoid recursive call to +handle_userfault(). + +Suggested-by: James Houghton +Signed-off-by: Nikita Kalyazin +--- + include/linux/mm_types.h | 4 ++++ + mm/hugetlb.c | 2 +- + mm/shmem.c | 9 ++++++--- + mm/userfaultfd.c | 37 +++++++++++++++++++++++++++---------- + 4 files changed, 38 insertions(+), 14 deletions(-) + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 0234f14f2aa6..2f26ee9742bf 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -1429,6 +1429,9 @@ enum tlb_flush_reason { + * @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached. + * We should only access orig_pte if this flag set. + * @FAULT_FLAG_VMA_LOCK: The fault is handled under VMA lock. 
++ * @FAULT_FLAG_USERFAULT_CONTINUE: The fault handler must not call userfaultfd ++ * minor handler as it is being called by the ++ * userfaultfd code itself. + * + * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify + * whether we would allow page faults to retry by specifying these two +@@ -1467,6 +1470,7 @@ enum fault_flag { + FAULT_FLAG_UNSHARE = 1 << 10, + FAULT_FLAG_ORIG_PTE_VALID = 1 << 11, + FAULT_FLAG_VMA_LOCK = 1 << 12, ++ FAULT_FLAG_USERFAULT_CONTINUE = 1 << 13, + }; + + typedef unsigned int __bitwise zap_flags_t; +diff --git a/mm/hugetlb.c b/mm/hugetlb.c +index 97930d44d460..c004cfdcd4e2 100644 +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -6228,7 +6228,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, + } + + /* Check for page in userfault range. */ +- if (userfaultfd_minor(vma)) { ++ if (userfaultfd_minor(vma) && !(vmf->flags & FAULT_FLAG_USERFAULT_CONTINUE)) { + folio_unlock(folio); + folio_put(folio); + /* See comment in userfaultfd_missing() block above */ +diff --git a/mm/shmem.c b/mm/shmem.c +index 1ede0800e846..b4159303fe59 100644 +--- a/mm/shmem.c ++++ b/mm/shmem.c +@@ -2467,7 +2467,8 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, + fault_mm = vma ? vma->vm_mm : NULL; + + folio = filemap_get_entry(inode->i_mapping, index); +- if (folio && vma && userfaultfd_minor(vma)) { ++ if (folio && vma && userfaultfd_minor(vma) && ++ !(vmf->flags & FAULT_FLAG_USERFAULT_CONTINUE)) { + if (!xa_is_value(folio)) + folio_put(folio); + *fault_type = handle_userfault(vmf, VM_UFFD_MINOR); +@@ -2727,6 +2728,8 @@ static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode) + static vm_fault_t shmem_fault(struct vm_fault *vmf) + { + struct inode *inode = file_inode(vmf->vma->vm_file); ++ enum sgp_type sgp = vmf->flags & FAULT_FLAG_USERFAULT_CONTINUE ? 
++ SGP_NOALLOC : SGP_CACHE; + gfp_t gfp = mapping_gfp_mask(inode->i_mapping); + struct folio *folio = NULL; + vm_fault_t ret = 0; +@@ -2743,8 +2746,8 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) + } + + WARN_ON_ONCE(vmf->page != NULL); +- err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE, +- gfp, vmf, &ret); ++ err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, sgp, gfp, vmf, ++ &ret); + if (err) + return vmf_error(err); + if (folio) { +diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c +index d06453fa8aba..4b3dbc7dac64 100644 +--- a/mm/userfaultfd.c ++++ b/mm/userfaultfd.c +@@ -380,30 +380,47 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, + return ret; + } + +-/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */ ++/* Handles UFFDIO_CONTINUE for all VMAs */ + static int mfill_atomic_pte_continue(pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + uffd_flags_t flags) + { +- struct inode *inode = file_inode(dst_vma->vm_file); +- pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); + struct folio *folio; + struct page *page; + int ret; ++ struct vm_fault vmf = { ++ .vma = dst_vma, ++ .address = dst_addr, ++ .flags = FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE | ++ FAULT_FLAG_USERFAULT_CONTINUE, ++ .pte = NULL, ++ .page = NULL, ++ .pgoff = linear_page_index(dst_vma, dst_addr), ++ }; ++ ++ if (!dst_vma->vm_ops || !dst_vma->vm_ops->fault) ++ return -EINVAL; + +- ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC); +- /* Our caller expects us to return -EFAULT if we failed to find folio */ +- if (ret == -ENOENT) ++retry: ++ ret = dst_vma->vm_ops->fault(&vmf); ++ if (ret & VM_FAULT_ERROR) { + ret = -EFAULT; +- if (ret) + goto out; +- if (!folio) { +- ret = -EFAULT; ++ } ++ ++ if (ret & VM_FAULT_NOPAGE) { ++ ret = -EAGAIN; + goto out; + } + +- page = folio_file_page(folio, pgoff); ++ if (ret & VM_FAULT_RETRY) ++ goto retry; ++ ++ page = vmf.page; ++ folio = page_folio(page); ++ 
BUG_ON(!folio); ++ + if (PageHWPoison(page)) { + ret = -EIO; + goto out_release; +-- +2.47.1 + diff --git a/resources/hiding_ci/linux_patches/0022-mm-provide-can_userfault-vma-operation.patch b/resources/hiding_ci/linux_patches/0022-mm-provide-can_userfault-vma-operation.patch new file mode 100644 index 00000000000..b6bc10178cc --- /dev/null +++ b/resources/hiding_ci/linux_patches/0022-mm-provide-can_userfault-vma-operation.patch @@ -0,0 +1,95 @@ +From 7ed09f6e50ea4e4448e457a7b7712bdf3b38e826 Mon Sep 17 00:00:00 2001 +From: Nikita Kalyazin +Date: Fri, 4 Apr 2025 14:15:18 +0000 +Subject: [PATCH 22/26] mm: provide can_userfault vma operation + +The new operation allows to decouple the userfaulfd code from +dependencies to VMA types, specifically, shmem and hugetlb. The +vm_flags bitmap argument is processed with "any" logic, meaning if the +VMA type supports any of the flags set, it returns true. This is to +avoid multiple calls when checking for __VM_UFFD_FLAGS. + +Signed-off-by: Nikita Kalyazin +--- + include/linux/mm.h | 5 +++++ + mm/hugetlb.c | 7 +++++++ + mm/shmem.c | 8 ++++++++ + 3 files changed, 20 insertions(+) + +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 8483e09aeb2c..488d721d8542 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -680,6 +680,11 @@ struct vm_operations_struct { + */ + struct page *(*find_special_page)(struct vm_area_struct *vma, + unsigned long addr); ++ /* ++ * True if the VMA supports userfault at least for one of the vm_flags ++ */ ++ bool (*can_userfault)(struct vm_area_struct *vma, ++ unsigned long vm_flags); + }; + + #ifdef CONFIG_NUMA_BALANCING +diff --git a/mm/hugetlb.c b/mm/hugetlb.c +index c004cfdcd4e2..f3901c11e1fd 100644 +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -5143,6 +5143,12 @@ static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) + return huge_page_size(hstate_vma(vma)); + } + ++static bool hugetlb_vm_op_can_userfault(struct vm_area_struct *vma, ++ unsigned long vm_flags) ++{ 
++ return true; ++} ++ + /* + * We cannot handle pagefaults against hugetlb pages at all. They cause + * handle_mm_fault() to try to instantiate regular-sized pages in the +@@ -5168,6 +5174,7 @@ const struct vm_operations_struct hugetlb_vm_ops = { + .close = hugetlb_vm_op_close, + .may_split = hugetlb_vm_op_split, + .pagesize = hugetlb_vm_op_pagesize, ++ .can_userfault = hugetlb_vm_op_can_userfault, + }; + + static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, +diff --git a/mm/shmem.c b/mm/shmem.c +index b4159303fe59..0b9e19abd1e9 100644 +--- a/mm/shmem.c ++++ b/mm/shmem.c +@@ -2891,6 +2891,12 @@ static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, + return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); + } + ++static bool shmem_can_userfault(struct vm_area_struct *vma, ++ unsigned long vm_flags) ++{ ++ return true; ++} ++ + static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, + pgoff_t index, unsigned int order, pgoff_t *ilx) + { +@@ -5309,6 +5315,7 @@ static const struct vm_operations_struct shmem_vm_ops = { + .set_policy = shmem_set_policy, + .get_policy = shmem_get_policy, + #endif ++ .can_userfault = shmem_can_userfault, + }; + + static const struct vm_operations_struct shmem_anon_vm_ops = { +@@ -5318,6 +5325,7 @@ static const struct vm_operations_struct shmem_anon_vm_ops = { + .set_policy = shmem_set_policy, + .get_policy = shmem_get_policy, + #endif ++ .can_userfault = shmem_can_userfault, + }; + + int shmem_init_fs_context(struct fs_context *fc) +-- +2.47.1 + diff --git a/resources/hiding_ci/linux_patches/0023-mm-userfaultfd-use-can_userfault-vma-operation.patch b/resources/hiding_ci/linux_patches/0023-mm-userfaultfd-use-can_userfault-vma-operation.patch new file mode 100644 index 00000000000..ce5130bb620 --- /dev/null +++ b/resources/hiding_ci/linux_patches/0023-mm-userfaultfd-use-can_userfault-vma-operation.patch @@ -0,0 +1,79 @@ +From 04555059b68ba6e2aeb678da706a8290e3598df0 Mon 
Sep 17 00:00:00 2001 +From: Nikita Kalyazin +Date: Fri, 4 Apr 2025 14:16:49 +0000 +Subject: [PATCH 23/26] mm: userfaultfd: use can_userfault vma operation + +Signed-off-by: Nikita Kalyazin +--- + include/linux/userfaultfd_k.h | 13 ++++++------- + mm/userfaultfd.c | 10 +++++++--- + 2 files changed, 13 insertions(+), 10 deletions(-) + +diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h +index 75342022d144..64551e8a55fb 100644 +--- a/include/linux/userfaultfd_k.h ++++ b/include/linux/userfaultfd_k.h +@@ -221,8 +221,8 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, + if (vm_flags & VM_DROPPABLE) + return false; + +- if ((vm_flags & VM_UFFD_MINOR) && +- (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) ++ if (!vma->vm_ops->can_userfault || ++ !vma->vm_ops->can_userfault(vma, VM_UFFD_MINOR)) + return false; + + /* +@@ -235,16 +235,15 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, + #ifndef CONFIG_PTE_MARKER_UFFD_WP + /* + * If user requested uffd-wp but not enabled pte markers for +- * uffd-wp, then shmem & hugetlbfs are not supported but only +- * anonymous. ++ * uffd-wp, then only anonymous is supported. 
+ */ + if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma)) + return false; + #endif + +- /* By default, allow any of anon|shmem|hugetlb */ +- return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || +- vma_is_shmem(vma); ++ return vma_is_anonymous(vma) || ++ (vma->vm_ops->can_userfault && ++ vma->vm_ops->can_userfault(vma, vm_flags)); + } + + static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma) +diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c +index 4b3dbc7dac64..0aa82c968e16 100644 +--- a/mm/userfaultfd.c ++++ b/mm/userfaultfd.c +@@ -728,6 +728,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, + unsigned long src_addr, dst_addr; + long copied; + struct folio *folio; ++ bool can_userfault; + + /* + * Sanitize the command parameters: +@@ -787,10 +788,13 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, + return mfill_atomic_hugetlb(ctx, dst_vma, dst_start, + src_start, len, flags); + +- if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) ++ can_userfault = dst_vma->vm_ops->can_userfault && ++ dst_vma->vm_ops->can_userfault(dst_vma, __VM_UFFD_FLAGS); ++ ++ if (!vma_is_anonymous(dst_vma) && !can_userfault) + goto out_unlock; +- if (!vma_is_shmem(dst_vma) && +- uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) ++ ++ if (!can_userfault && uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) + goto out_unlock; + + while (src_addr < src_start + len) { +-- +2.47.1 + diff --git a/resources/hiding_ci/linux_patches/0024-KVM-guest_memfd-add-support-for-userfaultfd-minor.patch b/resources/hiding_ci/linux_patches/0024-KVM-guest_memfd-add-support-for-userfaultfd-minor.patch new file mode 100644 index 00000000000..37dc68e3989 --- /dev/null +++ b/resources/hiding_ci/linux_patches/0024-KVM-guest_memfd-add-support-for-userfaultfd-minor.patch @@ -0,0 +1,44 @@ +From b806003684d08506cb66c664efdfda3d7ff6103e Mon Sep 17 00:00:00 2001 +From: Nikita Kalyazin +Date: Tue, 1 Apr 2025 15:02:56 +0000 
+Subject: [PATCH 24/26] KVM: guest_memfd: add support for userfaultfd minor + +Add support for sending a pagefault event if userfaultfd is registered. +Only page minor event is currently supported. + +Signed-off-by: Nikita Kalyazin +--- + virt/kvm/guest_memfd.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c +index 5abb6d52a375..91ee5dd91c31 100644 +--- a/virt/kvm/guest_memfd.c ++++ b/virt/kvm/guest_memfd.c +@@ -5,6 +5,9 @@ + #include + #include + #include ++#ifdef CONFIG_KVM_PRIVATE_MEM ++#include ++#endif /* CONFIG_KVM_PRIVATE_MEM */ + + #include "kvm_mm.h" + +@@ -396,6 +399,13 @@ static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf) + kvm_gmem_mark_prepared(folio); + } + ++ if (userfaultfd_minor(vmf->vma) && ++ !(vmf->flags & FAULT_FLAG_USERFAULT_CONTINUE)) { ++ folio_unlock(folio); ++ filemap_invalidate_unlock_shared(inode->i_mapping); ++ return handle_userfault(vmf, VM_UFFD_MINOR); ++ } ++ + vmf->page = folio_file_page(folio, vmf->pgoff); + + out_folio: +-- +2.47.1 + diff --git a/resources/hiding_ci/linux_patches/0025-mm-userfaultfd-add-UFFD_FEATURE_MINOR_GUEST_MEMFD.patch b/resources/hiding_ci/linux_patches/0025-mm-userfaultfd-add-UFFD_FEATURE_MINOR_GUEST_MEMFD.patch new file mode 100644 index 00000000000..777a2b05e66 --- /dev/null +++ b/resources/hiding_ci/linux_patches/0025-mm-userfaultfd-add-UFFD_FEATURE_MINOR_GUEST_MEMFD.patch @@ -0,0 +1,61 @@ +From 6c5886204ff8d306cc4ee945235c88eb854ebf7f Mon Sep 17 00:00:00 2001 +From: Nikita Kalyazin +Date: Fri, 4 Apr 2025 14:18:03 +0000 +Subject: [PATCH 25/26] mm: userfaultfd: add UFFD_FEATURE_MINOR_GUEST_MEMFD + +Signed-off-by: Nikita Kalyazin +--- + fs/userfaultfd.c | 3 ++- + include/uapi/linux/userfaultfd.h | 8 +++++++- + 2 files changed, 9 insertions(+), 2 deletions(-) + +diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c +index 97c4d71115d8..32152bfa462a 100644 +--- a/fs/userfaultfd.c ++++ b/fs/userfaultfd.c +@@ -1954,7 +1954,8 @@ static int 
userfaultfd_api(struct userfaultfd_ctx *ctx, + uffdio_api.features = UFFD_API_FEATURES; + #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR + uffdio_api.features &= +- ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM); ++ ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM | ++ UFFD_FEATURE_MINOR_GUEST_MEMFD); + #endif + #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP + uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP; +diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h +index 2841e4ea8f2c..ed688797eba7 100644 +--- a/include/uapi/linux/userfaultfd.h ++++ b/include/uapi/linux/userfaultfd.h +@@ -42,7 +42,8 @@ + UFFD_FEATURE_WP_UNPOPULATED | \ + UFFD_FEATURE_POISON | \ + UFFD_FEATURE_WP_ASYNC | \ +- UFFD_FEATURE_MOVE) ++ UFFD_FEATURE_MOVE | \ ++ UFFD_FEATURE_MINOR_GUEST_MEMFD) + #define UFFD_API_IOCTLS \ + ((__u64)1 << _UFFDIO_REGISTER | \ + (__u64)1 << _UFFDIO_UNREGISTER | \ +@@ -230,6 +231,10 @@ struct uffdio_api { + * + * UFFD_FEATURE_MOVE indicates that the kernel supports moving an + * existing page contents from userspace. ++ * ++ * UFFD_FEATURE_MINOR_GUEST_MEMFD indicates the same support as ++ * UFFD_FEATURE_MINOR_HUGETLBFS, but for guest_memfd-backed pages ++ * instead. 
+ */ + #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) + #define UFFD_FEATURE_EVENT_FORK (1<<1) +@@ -248,6 +253,7 @@ struct uffdio_api { + #define UFFD_FEATURE_POISON (1<<14) + #define UFFD_FEATURE_WP_ASYNC (1<<15) + #define UFFD_FEATURE_MOVE (1<<16) ++#define UFFD_FEATURE_MINOR_GUEST_MEMFD (1<<17) + __u64 features; + + __u64 ioctls; +-- +2.47.1 + diff --git a/resources/hiding_ci/linux_patches/0026-fixup-for-guest_memfd-uffd-v3.patch b/resources/hiding_ci/linux_patches/0026-fixup-for-guest_memfd-uffd-v3.patch new file mode 100644 index 00000000000..2aa0a3bea09 --- /dev/null +++ b/resources/hiding_ci/linux_patches/0026-fixup-for-guest_memfd-uffd-v3.patch @@ -0,0 +1,70 @@ +From d950436a063f021ae0d925509363106625eafe0f Mon Sep 17 00:00:00 2001 +From: Nikita Kalyazin +Date: Thu, 10 Apr 2025 14:18:53 +0000 +Subject: [PATCH 26/26] fixup for guest_memfd uffd v3 + + - implement can_userfault for guest_memfd + - check vma->vm_ops pointer before dereferencing + - proper check for VM_UFFD_MINOR +--- + include/linux/userfaultfd_k.h | 6 ++++-- + mm/userfaultfd.c | 4 +++- + virt/kvm/guest_memfd.c | 9 ++++++++- + 3 files changed, 15 insertions(+), 4 deletions(-) + +diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h +index 64551e8a55fb..080437fa7eab 100644 +--- a/include/linux/userfaultfd_k.h ++++ b/include/linux/userfaultfd_k.h +@@ -221,8 +221,10 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, + if (vm_flags & VM_DROPPABLE) + return false; + +- if (!vma->vm_ops->can_userfault || +- !vma->vm_ops->can_userfault(vma, VM_UFFD_MINOR)) ++ if ((vm_flags & VM_UFFD_MINOR) && ++ (!vma->vm_ops || ++ !vma->vm_ops->can_userfault || ++ !vma->vm_ops->can_userfault(vma, VM_UFFD_MINOR))) + return false; + + /* +diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c +index 0aa82c968e16..638360a78561 100644 +--- a/mm/userfaultfd.c ++++ b/mm/userfaultfd.c +@@ -788,7 +788,9 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, + return 
mfill_atomic_hugetlb(ctx, dst_vma, dst_start, + src_start, len, flags); + +- can_userfault = dst_vma->vm_ops->can_userfault && ++ can_userfault = ++ dst_vma->vm_ops && ++ dst_vma->vm_ops->can_userfault && + dst_vma->vm_ops->can_userfault(dst_vma, __VM_UFFD_FLAGS); + + if (!vma_is_anonymous(dst_vma) && !can_userfault) +diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c +index 91ee5dd91c31..202b12dc4b6f 100644 +--- a/virt/kvm/guest_memfd.c ++++ b/virt/kvm/guest_memfd.c +@@ -420,8 +420,15 @@ static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf) + return ret; + } + ++static bool kvm_gmem_can_userfault(struct vm_area_struct *vma, ++ unsigned long vm_flags) ++{ ++ return vm_flags & VM_UFFD_MINOR; ++} ++ + static const struct vm_operations_struct kvm_gmem_vm_ops = { +- .fault = kvm_gmem_fault, ++ .fault = kvm_gmem_fault, ++ .can_userfault = kvm_gmem_can_userfault, + }; + + static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma) +-- +2.47.1 + diff --git a/resources/hiding_ci/linux_patches/GPL-2.0 b/resources/hiding_ci/linux_patches/GPL-2.0 new file mode 100644 index 00000000000..ff0812fd89c --- /dev/null +++ b/resources/hiding_ci/linux_patches/GPL-2.0 @@ -0,0 +1,359 @@ +Valid-License-Identifier: GPL-2.0 +Valid-License-Identifier: GPL-2.0-only +Valid-License-Identifier: GPL-2.0+ +Valid-License-Identifier: GPL-2.0-or-later +SPDX-URL: https://spdx.org/licenses/GPL-2.0.html +Usage-Guide: + To use this license in source code, put one of the following SPDX + tag/value pairs into a comment according to the placement + guidelines in the licensing rules documentation. 
+ For 'GNU General Public License (GPL) version 2 only' use: + SPDX-License-Identifier: GPL-2.0 + or + SPDX-License-Identifier: GPL-2.0-only + For 'GNU General Public License (GPL) version 2 or any later version' use: + SPDX-License-Identifier: GPL-2.0+ + or + SPDX-License-Identifier: GPL-2.0-or-later +License-Text: + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. 
+ + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) 
Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. 
+ + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. 
diff --git a/resources/hiding_ci/linux_patches/README.md b/resources/hiding_ci/linux_patches/README.md new file mode 100644 index 00000000000..7a119e42452 --- /dev/null +++ b/resources/hiding_ci/linux_patches/README.md @@ -0,0 +1,8 @@ +# Linux kernel patches for direct map removal + +The Linux kernel patches in this directory are distributed under the `GPL-2.0` +licence (see the full licence text at [GPL-2.0](./GPL-2.0)). The patches are +required by Firecracker's "Secret Freedom" feature that removes the VM memory +from the host direct map (see +[lore](https://lore.kernel.org/kvm/20250221160728.1584559-1-roypat@amazon.co.uk/) +for more details). The patches are not yet merged upstream. diff --git a/src/firecracker/src/api_server/request/machine_configuration.rs b/src/firecracker/src/api_server/request/machine_configuration.rs index 2e8addffb74..0edb79f3774 100644 --- a/src/firecracker/src/api_server/request/machine_configuration.rs +++ b/src/firecracker/src/api_server/request/machine_configuration.rs @@ -119,6 +119,7 @@ mod tests { let expected_config = MachineConfigUpdate { vcpu_count: Some(8), mem_size_mib: Some(1024), + secret_free: Some(false), smt: Some(false), cpu_template: None, track_dirty_pages: Some(false), @@ -140,6 +141,7 @@ mod tests { let expected_config = MachineConfigUpdate { vcpu_count: Some(8), mem_size_mib: Some(1024), + secret_free: Some(false), smt: Some(false), cpu_template: Some(StaticCpuTemplate::None), track_dirty_pages: Some(false), @@ -161,6 +163,7 @@ mod tests { let expected_config = MachineConfigUpdate { vcpu_count: Some(8), mem_size_mib: Some(1024), + secret_free: Some(false), smt: Some(false), cpu_template: None, track_dirty_pages: Some(true), @@ -186,6 +189,7 @@ mod tests { let expected_config = MachineConfigUpdate { vcpu_count: Some(8), mem_size_mib: Some(1024), + secret_free: Some(false), smt: Some(false), cpu_template: Some(StaticCpuTemplate::T2), track_dirty_pages: Some(true), @@ -213,6 +217,7 @@ mod tests { let expected_config 
= MachineConfigUpdate { vcpu_count: Some(8), mem_size_mib: Some(1024), + secret_free: Some(false), smt: Some(true), cpu_template: None, track_dirty_pages: Some(true), diff --git a/src/firecracker/swagger/firecracker.yaml b/src/firecracker/swagger/firecracker.yaml index dd834baa785..55e6333931b 100644 --- a/src/firecracker/swagger/firecracker.yaml +++ b/src/firecracker/swagger/firecracker.yaml @@ -1044,6 +1044,11 @@ definitions: mem_size_mib: type: integer description: Memory size of VM + secret_free: + type: boolean + description: + If enabled, guest memory will be unmapped from the host kernel's address space, providing additional + protection against transitive execution issues. All I/O then goes through a bounce buffer. track_dirty_pages: type: boolean description: diff --git a/src/vmm/benches/memory_access.rs b/src/vmm/benches/memory_access.rs index fe4f138db2d..e3b6b656302 100644 --- a/src/vmm/benches/memory_access.rs +++ b/src/vmm/benches/memory_access.rs @@ -9,7 +9,7 @@ fn bench_single_page_fault(c: &mut Criterion, configuration: VmResources) { c.bench_function("page_fault", |b| { b.iter_batched( || { - let memory = configuration.allocate_guest_memory().unwrap(); + let memory = configuration.allocate_guest_memory(None).unwrap(); // Get a pointer to the first memory region (cannot do `.get_slice(GuestAddress(0), // 1)`, because on ARM64 guest memory does not start at physical // address 0). 
diff --git a/src/vmm/src/arch/aarch64/mod.rs b/src/vmm/src/arch/aarch64/mod.rs index ead827c08c4..6bb379b9f9c 100644 --- a/src/vmm/src/arch/aarch64/mod.rs +++ b/src/vmm/src/arch/aarch64/mod.rs @@ -18,11 +18,11 @@ pub mod vm; use std::cmp::min; use std::fmt::Debug; -use std::fs::File; +use std::io::{Read, Seek}; use linux_loader::loader::pe::PE as Loader; use linux_loader::loader::{Cmdline, KernelLoader}; -use vm_memory::GuestMemoryError; +use vm_memory::{GuestMemoryError, ReadVolatile}; use crate::arch::{BootProtocol, EntryPoint}; use crate::cpu_config::aarch64::{CpuConfiguration, CpuConfigurationError}; @@ -187,16 +187,10 @@ fn get_fdt_addr(mem: &GuestMemoryMmap) -> u64 { } /// Load linux kernel into guest memory. -pub fn load_kernel( - kernel: &File, +pub fn load_kernel( + mut kernel_file: R, guest_memory: &GuestMemoryMmap, ) -> Result { - // Need to clone the File because reading from it - // mutates it. - let mut kernel_file = kernel - .try_clone() - .map_err(|_| ConfigurationError::KernelFile)?; - let entry_addr = Loader::load( guest_memory, Some(GuestAddress(get_kernel_start())), diff --git a/src/vmm/src/arch/aarch64/vm.rs b/src/vmm/src/arch/aarch64/vm.rs index e54723f5b6d..44897e42e41 100644 --- a/src/vmm/src/arch/aarch64/vm.rs +++ b/src/vmm/src/arch/aarch64/vm.rs @@ -8,6 +8,14 @@ use crate::arch::aarch64::gic::GicState; use crate::vstate::memory::{GuestMemoryExtension, GuestMemoryState}; use crate::vstate::vm::{VmCommon, VmError}; +/// The VM type for this architecture that allows us to use guest_memfd. On ARM, all VMs +/// support guest_memfd and no special type is needed (in fact, no concept of vm types really +/// exists, and the correspoding field of the CREATE_VM ioctl determines IPA size instead, +/// e.g. the size of the guest physical address space. This value cannot be hardcoded, hence +/// `None` to let the `Vm` constructor now that just normal [`Kvm::create_vm`] should be called, +/// which internally determines the preferred IPA size. 
+pub const VM_TYPE_FOR_SECRET_FREEDOM: Option = None; + /// Structure representing the current architecture's understand of what a "virtual machine" is. #[derive(Debug)] pub struct ArchVm { @@ -30,8 +38,8 @@ pub enum ArchVmError { impl ArchVm { /// Create a new `Vm` struct. - pub fn new(kvm: &Kvm) -> Result { - let common = Self::create_common(kvm)?; + pub fn new(kvm: &Kvm, secret_free: bool) -> Result { + let common = Self::create_common(kvm, secret_free)?; Ok(ArchVm { common, irqchip_handle: None, diff --git a/src/vmm/src/arch/mod.rs b/src/vmm/src/arch/mod.rs index 61d65fea1a5..05f930682ab 100644 --- a/src/vmm/src/arch/mod.rs +++ b/src/vmm/src/arch/mod.rs @@ -17,7 +17,7 @@ pub use aarch64::kvm::{Kvm, KvmArchError, OptionalCapabilities}; #[cfg(target_arch = "aarch64")] pub use aarch64::vcpu::*; #[cfg(target_arch = "aarch64")] -pub use aarch64::vm::{ArchVm, ArchVmError, VmState}; +pub use aarch64::vm::{ArchVm, ArchVmError, VM_TYPE_FOR_SECRET_FREEDOM, VmState}; #[cfg(target_arch = "aarch64")] pub use aarch64::{ ConfigurationError, MMIO_MEM_SIZE, MMIO_MEM_START, arch_memory_regions, @@ -35,7 +35,7 @@ pub use x86_64::kvm::{Kvm, KvmArchError}; #[cfg(target_arch = "x86_64")] pub use x86_64::vcpu::*; #[cfg(target_arch = "x86_64")] -pub use x86_64::vm::{ArchVm, ArchVmError, VmState}; +pub use x86_64::vm::{ArchVm, ArchVmError, VM_TYPE_FOR_SECRET_FREEDOM, VmState}; #[cfg(target_arch = "x86_64")] pub use crate::arch::x86_64::{ diff --git a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs index ca350cbf9af..55bcc544f8d 100644 --- a/src/vmm/src/arch/x86_64/mod.rs +++ b/src/vmm/src/arch/x86_64/mod.rs @@ -31,7 +31,7 @@ pub mod xstate; #[allow(missing_docs)] pub mod generated; -use std::fs::File; +use std::io::{Read, Seek}; use layout::CMDLINE_START; use linux_loader::configurator::linux::LinuxBootConfigurator; @@ -44,6 +44,7 @@ use linux_loader::loader::elf::start_info::{ }; use linux_loader::loader::{Cmdline, KernelLoader, PvhBootCapability, load_cmdline}; use 
log::debug; +use vm_memory::ReadVolatile; use super::EntryPoint; use crate::acpi::create_acpi_tables; @@ -438,20 +439,14 @@ fn add_e820_entry( } /// Load linux kernel into guest memory. -pub fn load_kernel( - kernel: &File, +pub fn load_kernel( + mut kernel: R, guest_memory: &GuestMemoryMmap, ) -> Result { - // Need to clone the File because reading from it - // mutates it. - let mut kernel_file = kernel - .try_clone() - .map_err(|_| ConfigurationError::KernelFile)?; - let entry_addr = Loader::load( guest_memory, None, - &mut kernel_file, + &mut kernel, Some(GuestAddress(get_kernel_start())), ) .map_err(ConfigurationError::KernelLoader)?; diff --git a/src/vmm/src/arch/x86_64/vm.rs b/src/vmm/src/arch/x86_64/vm.rs index e84b4338e35..09a1c03e6dc 100644 --- a/src/vmm/src/arch/x86_64/vm.rs +++ b/src/vmm/src/arch/x86_64/vm.rs @@ -5,7 +5,8 @@ use std::fmt; use kvm_bindings::{ KVM_CLOCK_TSC_STABLE, KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, - KVM_PIT_SPEAKER_DUMMY, MsrList, kvm_clock_data, kvm_irqchip, kvm_pit_config, kvm_pit_state2, + KVM_PIT_SPEAKER_DUMMY, KVM_X86_SW_PROTECTED_VM, MsrList, kvm_clock_data, kvm_irqchip, + kvm_pit_config, kvm_pit_state2, }; use kvm_ioctls::Cap; use serde::{Deserialize, Serialize}; @@ -46,6 +47,9 @@ pub enum ArchVmError { SetTssAddress(kvm_ioctls::Error), } +/// The VM type for this architecture that allows us to use guest_memfd. +pub const VM_TYPE_FOR_SECRET_FREEDOM: Option = Some(KVM_X86_SW_PROTECTED_VM as u64); + /// Structure representing the current architecture's understand of what a "virtual machine" is. #[derive(Debug)] pub struct ArchVm { @@ -60,8 +64,8 @@ pub struct ArchVm { impl ArchVm { /// Create a new `Vm` struct. 
- pub fn new(kvm: &crate::vstate::kvm::Kvm) -> Result { - let common = Self::create_common(kvm)?; + pub fn new(kvm: &crate::vstate::kvm::Kvm, secret_free: bool) -> Result { + let common = Self::create_common(kvm, secret_free)?; let msrs_to_save = kvm.msrs_to_save().map_err(ArchVmError::GetMsrsToSave)?; diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 398c25ba056..e4a9f0eecfc 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -5,6 +5,8 @@ use std::fmt::Debug; use std::io; +use std::os::fd::AsFd; +use std::os::unix::fs::MetadataExt; #[cfg(feature = "gdb")] use std::sync::mpsc; use std::sync::{Arc, Mutex}; @@ -54,12 +56,13 @@ use crate::persist::{MicrovmState, MicrovmStateError}; use crate::resources::VmResources; use crate::seccomp::BpfThreadMap; use crate::snapshot::Persist; +use crate::utils::u64_to_usize; use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::MachineConfigError; use crate::vstate::kvm::Kvm; -use crate::vstate::memory::GuestRegionMmap; +use crate::vstate::memory::{GuestRegionMmap, MaybeBounce}; use crate::vstate::vcpu::{Vcpu, VcpuError}; -use crate::vstate::vm::Vm; +use crate::vstate::vm::{KVM_GMEM_NO_DIRECT_MAP, Vm}; use crate::{EventManager, Vmm, VmmError, device_manager}; /// Errors associated with starting the instance. @@ -136,11 +139,12 @@ fn create_vmm_and_vcpus( event_manager: &mut EventManager, vcpu_count: u8, kvm_capabilities: Vec, + secret_free: bool, ) -> Result<(Vmm, Vec), VmmError> { let kvm = Kvm::new(kvm_capabilities)?; // Set up Kvm Vm and register memory regions. // Build custom CPU config if a custom template is provided. 
- let mut vm = Vm::new(&kvm)?; + let mut vm = Vm::new(&kvm, secret_free)?; let resource_allocator = ResourceAllocator::new()?; @@ -213,10 +217,6 @@ pub fn build_microvm_for_boot( .as_ref() .ok_or(MissingKernelConfig)?; - let guest_memory = vm_resources - .allocate_guest_memory() - .map_err(StartMicrovmError::GuestMemory)?; - // Clone the command-line so that a failed boot doesn't pollute the original. #[allow(unused_mut)] let mut boot_cmdline = boot_config.cmdline.clone(); @@ -226,19 +226,60 @@ pub fn build_microvm_for_boot( .cpu_template .get_cpu_template()?; + let secret_free = vm_resources.machine_config.secret_free; + + #[cfg(target_arch = "x86_64")] + if secret_free { + boot_cmdline.insert_str("no-kvmclock")?; + } + let (mut vmm, mut vcpus) = create_vmm_and_vcpus( instance_info, event_manager, vm_resources.machine_config.vcpu_count, cpu_template.kvm_capabilities.clone(), + vm_resources.machine_config.secret_free, )?; + let guest_memfd = match secret_free { + true => Some( + vmm.vm + .create_guest_memfd(vm_resources.memory_size(), KVM_GMEM_NO_DIRECT_MAP) + .map_err(VmmError::Vm)?, + ), + false => None, + }; + + let guest_memory = vm_resources + .allocate_guest_memory(guest_memfd) + .map_err(StartMicrovmError::GuestMemory)?; + vmm.vm .register_memory_regions(guest_memory) .map_err(VmmError::Vm)?; - let entry_point = load_kernel(&boot_config.kernel_file, vmm.vm.guest_memory())?; - let initrd = InitrdConfig::from_config(boot_config, vmm.vm.guest_memory())?; + #[cfg(target_arch = "x86_64")] + vmm.vm.set_memory_private().map_err(VmmError::Vm)?; + + let entry_point = load_kernel( + MaybeBounce::new(boot_config.kernel_file.try_clone().unwrap(), secret_free), + vmm.vm.guest_memory(), + )?; + let initrd = match &boot_config.initrd_file { + Some(initrd_file) => { + let size = initrd_file + .metadata() + .map_err(InitrdError::Metadata)? 
+ .size(); + + Some(InitrdConfig::from_reader( + vmm.vm.guest_memory(), + MaybeBounce::new(initrd_file.as_fd(), secret_free), + u64_to_usize(size), + )?) + } + None => None, + }; #[cfg(feature = "gdb")] let (gdb_tx, gdb_rx) = mpsc::channel(); @@ -426,6 +467,7 @@ pub fn build_microvm_from_snapshot( event_manager, vm_resources.machine_config.vcpu_count, microvm_state.kvm_state.kvm_cap_modifiers.clone(), + false, ) .map_err(StartMicrovmError::Internal)?; @@ -597,6 +639,10 @@ fn attach_virtio_device( ) -> Result<(), MmioError> { event_manager.add_subscriber(device.clone()); + if vmm.vm.secret_free() { + device.lock().unwrap().force_userspace_bounce_buffers(); + } + // The device mutex mustn't be locked here otherwise it will deadlock. let device = MmioTransport::new(vmm.vm.guest_memory().clone(), device, is_vhost_user); vmm.mmio_device_manager diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 99bde6e2e78..f1b8cf01697 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -600,6 +600,14 @@ mod tests { fn set_acked_features(&mut self, _: u64) {} + fn force_userspace_bounce_buffers(&mut self) { + todo!() + } + + fn userspace_bounce_buffers(&self) -> bool { + todo!() + } + fn device_type(&self) -> u32 { 0 } @@ -651,7 +659,7 @@ mod tests { let start_addr2 = GuestAddress(0x1000); let guest_mem = multi_region_mem_raw(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); - let mut vm = Vm::new(&kvm).unwrap(); + let mut vm = Vm::new(&kvm, false).unwrap(); vm.register_memory_regions(guest_mem).unwrap(); let mut device_manager = MMIODeviceManager::new(); let mut resource_allocator = ResourceAllocator::new().unwrap(); @@ -682,7 +690,7 @@ mod tests { let start_addr2 = GuestAddress(0x1000); let guest_mem = multi_region_mem_raw(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); - let mut vm = 
Vm::new(&kvm).unwrap(); + let mut vm = Vm::new(&kvm, false).unwrap(); vm.register_memory_regions(guest_mem).unwrap(); let mut device_manager = MMIODeviceManager::new(); let mut resource_allocator = ResourceAllocator::new().unwrap(); @@ -738,7 +746,7 @@ mod tests { let start_addr2 = GuestAddress(0x1000); let guest_mem = multi_region_mem_raw(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); - let mut vm = Vm::new(&kvm).unwrap(); + let mut vm = Vm::new(&kvm, false).unwrap(); vm.register_memory_regions(guest_mem).unwrap(); #[cfg(target_arch = "x86_64")] diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 30a6387bc82..a5923dd3624 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -846,6 +846,7 @@ mod tests { "machine-config": {{ "vcpu_count": 1, "mem_size_mib": 128, + "secret_free": false, "smt": false, "track_dirty_pages": false, "huge_pages": "None" diff --git a/src/vmm/src/devices/virtio/balloon/device.rs b/src/vmm/src/devices/virtio/balloon/device.rs index 186f09275bc..f22b220984c 100644 --- a/src/vmm/src/devices/virtio/balloon/device.rs +++ b/src/vmm/src/devices/virtio/balloon/device.rs @@ -557,6 +557,14 @@ impl VirtioDevice for Balloon { self.acked_features = acked_features; } + fn force_userspace_bounce_buffers(&mut self) { + // balloon device doesn't have a need for bounce buffers + } + + fn userspace_bounce_buffers(&self) -> bool { + false + } + fn device_type(&self) -> u32 { TYPE_BALLOON } diff --git a/src/vmm/src/devices/virtio/block/device.rs b/src/vmm/src/devices/virtio/block/device.rs index bf3043bcdd4..a55e0254bec 100644 --- a/src/vmm/src/devices/virtio/block/device.rs +++ b/src/vmm/src/devices/virtio/block/device.rs @@ -148,6 +148,20 @@ impl VirtioDevice for Block { } } + fn force_userspace_bounce_buffers(&mut self) { + match self { + Block::Virtio(b) => b.force_userspace_bounce_buffers(), + 
Block::VhostUser(b) => b.force_userspace_bounce_buffers(), + } + } + + fn userspace_bounce_buffers(&self) -> bool { + match self { + Block::Virtio(b) => b.userspace_bounce_buffers(), + Block::VhostUser(b) => b.userspace_bounce_buffers(), + } + } + fn device_type(&self) -> u32 { TYPE_BLOCK } diff --git a/src/vmm/src/devices/virtio/block/vhost_user/device.rs b/src/vmm/src/devices/virtio/block/vhost_user/device.rs index b0bf5a31e3f..62b0002c371 100644 --- a/src/vmm/src/devices/virtio/block/vhost_user/device.rs +++ b/src/vmm/src/devices/virtio/block/vhost_user/device.rs @@ -294,6 +294,15 @@ impl VirtioDevice for VhostUserBlock self.acked_features = acked_features; } + fn force_userspace_bounce_buffers(&mut self) { + // Nothing Firecracker can do about this, the backend would need to do the bouncing + panic!("vhost-user-blk is incompatible with userspace bounce buffers") + } + + fn userspace_bounce_buffers(&self) -> bool { + false + } + fn device_type(&self) -> u32 { TYPE_BLOCK } diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index b11c757d43c..6ce866806ba 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -578,6 +578,22 @@ impl VirtioDevice for VirtioBlock { self.acked_features = acked_features; } + fn force_userspace_bounce_buffers(&mut self) { + match self.disk.file_engine { + FileEngine::Async(_) => { + panic!("async engine is incompatible with userspace bounce buffers") + } + FileEngine::Sync(ref mut engine) => engine.start_bouncing(), + } + } + + fn userspace_bounce_buffers(&self) -> bool { + match self.disk.file_engine { + FileEngine::Async(_) => false, + FileEngine::Sync(ref engine) => engine.is_bouncing(), + } + } + fn device_type(&self) -> u32 { TYPE_BLOCK } diff --git a/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs b/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs index eec3b3d8b8d..576a0a5b1f2 100644 --- 
a/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs @@ -6,7 +6,7 @@ use std::io::{Seek, SeekFrom, Write}; use vm_memory::{GuestMemoryError, ReadVolatile, WriteVolatile}; -use crate::vstate::memory::{GuestAddress, GuestMemory, GuestMemoryMmap}; +use crate::vstate::memory::{GuestAddress, GuestMemory, GuestMemoryMmap, MaybeBounce}; #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum SyncIoError { @@ -22,7 +22,12 @@ pub enum SyncIoError { #[derive(Debug)] pub struct SyncFileEngine { - file: File, + // 65536 is the largest buffer a linux guest will give us, empirically. Determined by + // having `MaybeBounce` logging scenarios where the fixed size bounce buffer isn't sufficient. + // Note that even if this assumption ever changes, the worse that'll happen is that we do + // multiple roundtrips between guest memory and the bounce buffer, as MaybeBounce would + // just chop larger reads/writes into chunks of 65k. + file: MaybeBounce, } // SAFETY: `File` is send and ultimately a POD. @@ -30,17 +35,27 @@ unsafe impl Send for SyncFileEngine {} impl SyncFileEngine { pub fn from_file(file: File) -> SyncFileEngine { - SyncFileEngine { file } + SyncFileEngine { + file: MaybeBounce::new_persistent(file, false), + } } #[cfg(test)] pub fn file(&self) -> &File { - &self.file + &self.file.target + } + + pub fn start_bouncing(&mut self) { + self.file.activate() + } + + pub fn is_bouncing(&self) -> bool { + self.file.is_activated() } /// Update the backing file of the engine pub fn update_file(&mut self, file: File) { - self.file = file + self.file.target = file } pub fn read( @@ -77,8 +92,8 @@ impl SyncFileEngine { pub fn flush(&mut self) -> Result<(), SyncIoError> { // flush() first to force any cached data out of rust buffers. - self.file.flush().map_err(SyncIoError::Flush)?; + self.file.target.flush().map_err(SyncIoError::Flush)?; // Sync data out to physical media on host. 
- self.file.sync_all().map_err(SyncIoError::SyncAll) + self.file.target.sync_all().map_err(SyncIoError::SyncAll) } } diff --git a/src/vmm/src/devices/virtio/block/virtio/persist.rs b/src/vmm/src/devices/virtio/block/virtio/persist.rs index 8c6f2c2453d..a52f901ebab 100644 --- a/src/vmm/src/devices/virtio/block/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/block/virtio/persist.rs @@ -16,7 +16,7 @@ use crate::devices::virtio::TYPE_BLOCK; use crate::devices::virtio::block::persist::BlockConstructorArgs; use crate::devices::virtio::block::virtio::device::FileEngineType; use crate::devices::virtio::block::virtio::metrics::BlockMetricsPerDevice; -use crate::devices::virtio::device::{DeviceState, IrqTrigger}; +use crate::devices::virtio::device::{DeviceState, IrqTrigger, VirtioDevice}; use crate::devices::virtio::generated::virtio_blk::VIRTIO_BLK_F_RO; use crate::devices::virtio::persist::VirtioDeviceState; use crate::rate_limiter::RateLimiter; @@ -127,7 +127,7 @@ impl Persist<'_> for VirtioBlock { capacity: disk_properties.nsectors.to_le(), }; - Ok(VirtioBlock { + let mut dev = VirtioBlock { avail_features, acked_features, config_space, @@ -148,7 +148,13 @@ impl Persist<'_> for VirtioBlock { rate_limiter, is_io_engine_throttled: false, metrics: BlockMetricsPerDevice::alloc(state.id.clone()), - }) + }; + + if state.virtio_state.bounce_in_userspace { + dev.force_userspace_bounce_buffers() + } + + Ok(dev) } } diff --git a/src/vmm/src/devices/virtio/device.rs b/src/vmm/src/devices/virtio/device.rs index 62131e775f5..8c35e4d2f3c 100644 --- a/src/vmm/src/devices/virtio/device.rs +++ b/src/vmm/src/devices/virtio/device.rs @@ -102,6 +102,12 @@ pub trait VirtioDevice: AsAny + Send { /// - self.avail_features() & self.acked_features() = self.get_acked_features() fn set_acked_features(&mut self, acked_features: u64); + /// Make the virtio device user userspace bounce buffers + fn force_userspace_bounce_buffers(&mut self); + + /// Whether this device is using userspace bounce 
buffers + fn userspace_bounce_buffers(&self) -> bool; + /// Check if virtio device has negotiated given feature. fn has_feature(&self, feature: u64) -> bool { (self.acked_features() & (1 << feature)) != 0 @@ -259,6 +265,14 @@ pub(crate) mod tests { todo!() } + fn force_userspace_bounce_buffers(&mut self) { + todo!() + } + + fn userspace_bounce_buffers(&self) -> bool { + todo!() + } + fn device_type(&self) -> u32 { todo!() } diff --git a/src/vmm/src/devices/virtio/mmio.rs b/src/vmm/src/devices/virtio/mmio.rs index 12ee54bfb0a..c061ad82732 100644 --- a/src/vmm/src/devices/virtio/mmio.rs +++ b/src/vmm/src/devices/virtio/mmio.rs @@ -423,6 +423,14 @@ pub(crate) mod tests { self.acked_features = acked_features; } + fn force_userspace_bounce_buffers(&mut self) { + unimplemented!() + } + + fn userspace_bounce_buffers(&self) -> bool { + false + } + fn device_type(&self) -> u32 { 123 } diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs index fff04d1da1a..b6bd6906b23 100755 --- a/src/vmm/src/devices/virtio/net/device.rs +++ b/src/vmm/src/devices/virtio/net/device.rs @@ -6,12 +6,14 @@ // found in the THIRD-PARTY file. 
use std::collections::VecDeque; +use std::io::{Read, Write}; use std::mem::{self}; use std::net::Ipv4Addr; use std::sync::{Arc, Mutex}; use libc::{EAGAIN, iovec}; use log::error; +use vm_memory::VolatileSlice; use vmm_sys_util::eventfd::EventFd; use super::NET_QUEUE_MAX_SIZE; @@ -245,7 +247,9 @@ pub struct Net { pub(crate) rx_rate_limiter: RateLimiter, pub(crate) tx_rate_limiter: RateLimiter, - rx_frame_buf: [u8; MAX_BUFFER_SIZE], + /// Used both for bounce buffering and for relaying frames to MMDS + userspace_buffer: [u8; MAX_BUFFER_SIZE], + pub(crate) userspace_bouncing: bool, tx_frame_headers: [u8; frame_hdr_len()], @@ -311,8 +315,9 @@ impl Net { queue_evts, rx_rate_limiter, tx_rate_limiter, - rx_frame_buf: [0u8; MAX_BUFFER_SIZE], + userspace_buffer: [0u8; MAX_BUFFER_SIZE], tx_frame_headers: [0u8; frame_hdr_len()], + userspace_bouncing: false, irq_trigger: IrqTrigger::new().map_err(NetError::EventFd)?, config_space, guest_mac, @@ -496,6 +501,7 @@ impl Net { // Tries to detour the frame to MMDS and if MMDS doesn't accept it, sends it on the host TAP. // // Returns whether MMDS consumed the frame. + #[allow(clippy::too_many_arguments)] fn write_to_mmds_or_tap( mmds_ns: Option<&mut MmdsNetworkStack>, rate_limiter: &mut RateLimiter, @@ -504,6 +510,7 @@ impl Net { tap: &mut Tap, guest_mac: Option, net_metrics: &NetDeviceMetrics, + bb: Option<&mut [u8]>, ) -> Result { // Read the frame headers from the IoVecBuffer let max_header_len = headers.len(); @@ -551,7 +558,7 @@ impl Net { } let _metric = net_metrics.tap_write_agg.record_latency_metrics(); - match Self::write_tap(tap, frame_iovec) { + match Self::write_tap(tap, frame_iovec, bb) { Ok(_) => { let len = u64::from(frame_iovec.len()); net_metrics.tx_bytes_count.add(len); @@ -585,15 +592,15 @@ impl Net { if let Some(ns) = self.mmds_ns.as_mut() { if let Some(len) = - ns.write_next_frame(frame_bytes_from_buf_mut(&mut self.rx_frame_buf)?) + ns.write_next_frame(frame_bytes_from_buf_mut(&mut self.userspace_buffer)?) 
{ let len = len.get(); METRICS.mmds.tx_frames.inc(); METRICS.mmds.tx_bytes.add(len as u64); - init_vnet_hdr(&mut self.rx_frame_buf); + init_vnet_hdr(&mut self.userspace_buffer); self.rx_buffer .iovec - .write_all_volatile_at(&self.rx_frame_buf[..vnet_hdr_len() + len], 0)?; + .write_all_volatile_at(&self.userspace_buffer[..vnet_hdr_len() + len], 0)?; // SAFETY: // * len will never be bigger that u32::MAX because mmds is bound // by the size of `self.rx_frame_buf` which is MAX_BUFFER_SIZE size. @@ -734,6 +741,8 @@ impl Net { &mut self.tap, self.guest_mac, &self.metrics, + self.userspace_bouncing + .then_some(self.userspace_buffer.as_mut_slice()), ) .unwrap_or(false); if frame_consumed_by_mmds && self.rx_buffer.used_bytes == 0 { @@ -826,11 +835,57 @@ impl Net { } else { self.rx_buffer.single_chain_slice_mut() }; - self.tap.read_iovec(slice) + + if self.userspace_bouncing { + let how_many = self + .tap + .tap_file + .read(self.userspace_buffer.as_mut_slice())?; + + assert!(how_many <= MAX_BUFFER_SIZE); + + let mut offset = 0; + for iov in slice { + assert!( + offset <= how_many, + "copied more bytes into guest memory than read from tap" + ); + + let to_copy = (how_many - offset).min(iov.iov_len); + + if to_copy == 0 { + break; + } + + // SAFETY: the iovec comes from an `IoVecBufferMut`, which upholds the invariant + // that all contained iovecs are covering valid ranges of guest memory. 
+ // Particularly, to_copy <= iov.iov_len + let vslice = unsafe { VolatileSlice::new(iov.iov_base.cast(), to_copy) }; + + vslice.copy_from(&self.userspace_buffer[offset..]); + + offset += to_copy; + } + + Ok(how_many) + } else { + self.tap.read_iovec(slice) + } } - fn write_tap(tap: &mut Tap, buf: &IoVecBuffer) -> std::io::Result { - tap.write_iovec(buf) + fn write_tap( + tap: &mut Tap, + buf: &IoVecBuffer, + bounce_buffer: Option<&mut [u8]>, + ) -> std::io::Result { + if let Some(bb) = bounce_buffer { + let how_many = buf.len() as usize; + let copied = buf.read_volatile_at(&mut &mut *bb, 0, how_many).unwrap(); + assert_eq!(copied, how_many); + tap.tap_file.write(&bb[..copied]) + } else { + tap.write_iovec(buf) + } } /// Process a single RX queue event. @@ -946,6 +1001,14 @@ impl VirtioDevice for Net { self.acked_features = acked_features; } + fn force_userspace_bounce_buffers(&mut self) { + self.userspace_bouncing = true + } + + fn userspace_bounce_buffers(&self) -> bool { + self.userspace_bouncing + } + fn device_type(&self) -> u32 { TYPE_NET } @@ -1931,6 +1994,7 @@ pub mod tests { &mut net.tap, Some(src_mac), &net.metrics, + None ) .unwrap() ) @@ -1970,6 +2034,7 @@ pub mod tests { &mut net.tap, Some(guest_mac), &net.metrics, + None ) ); @@ -1985,6 +2050,7 @@ pub mod tests { &mut net.tap, Some(not_guest_mac), &net.metrics, + None ) ); } diff --git a/src/vmm/src/devices/virtio/net/persist.rs b/src/vmm/src/devices/virtio/net/persist.rs index 5f2d6f560b4..cbb5c8f52a7 100644 --- a/src/vmm/src/devices/virtio/net/persist.rs +++ b/src/vmm/src/devices/virtio/net/persist.rs @@ -152,6 +152,8 @@ impl Persist<'_> for Net { net.avail_features = state.virtio_state.avail_features; net.acked_features = state.virtio_state.acked_features; + net.userspace_bouncing = state.virtio_state.bounce_in_userspace; + if state.virtio_state.activated { let supported_flags: u32 = Net::build_tap_offload_features(net.acked_features); net.tap diff --git a/src/vmm/src/devices/virtio/net/tap.rs 
b/src/vmm/src/devices/virtio/net/tap.rs index c516705af31..30a499489b0 100644 --- a/src/vmm/src/devices/virtio/net/tap.rs +++ b/src/vmm/src/devices/virtio/net/tap.rs @@ -49,7 +49,7 @@ ioctl_iow_nr!(TUNSETVNETHDRSZ, TUNTAP, 216, ::std::os::raw::c_int); /// Tap goes out of scope, and the kernel will clean up the interface automatically. #[derive(Debug)] pub struct Tap { - tap_file: File, + pub(crate) tap_file: File, pub(crate) if_name: [u8; IFACE_NAME_MAX_LEN], } diff --git a/src/vmm/src/devices/virtio/persist.rs b/src/vmm/src/devices/virtio/persist.rs index 7c861352317..ba365617abf 100644 --- a/src/vmm/src/devices/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/persist.rs @@ -125,11 +125,13 @@ pub struct VirtioDeviceState { pub interrupt_status: u32, /// Flag for activated status. pub activated: bool, + /// Whether this device has to use userspace bounce buffers + pub bounce_in_userspace: bool, } impl VirtioDeviceState { /// Construct the virtio state of a device. - pub fn from_device(device: &dyn VirtioDevice) -> Self { + pub fn from_device(device: &impl VirtioDevice) -> Self { VirtioDeviceState { device_type: device.device_type(), avail_features: device.avail_features(), @@ -137,6 +139,7 @@ impl VirtioDeviceState { queues: device.queues().iter().map(Persist::save).collect(), interrupt_status: device.interrupt_status().load(Ordering::Relaxed), activated: device.is_activated(), + bounce_in_userspace: device.userspace_bounce_buffers(), } } diff --git a/src/vmm/src/devices/virtio/rng/device.rs b/src/vmm/src/devices/virtio/rng/device.rs index 97ac8676e0a..50fb1e4ee23 100644 --- a/src/vmm/src/devices/virtio/rng/device.rs +++ b/src/vmm/src/devices/virtio/rng/device.rs @@ -303,6 +303,14 @@ impl VirtioDevice for Entropy { self.device_state = DeviceState::Activated(mem); Ok(()) } + + fn force_userspace_bounce_buffers(&mut self) { + // rng device works with only userspace accesses + } + + fn userspace_bounce_buffers(&self) -> bool { + false + } } #[cfg(test)] diff --git 
a/src/vmm/src/devices/virtio/vsock/csm/connection.rs b/src/vmm/src/devices/virtio/vsock/csm/connection.rs index c9bd5b2c0f7..6f39f8b3079 100644 --- a/src/vmm/src/devices/virtio/vsock/csm/connection.rs +++ b/src/vmm/src/devices/virtio/vsock/csm/connection.rs @@ -95,6 +95,7 @@ use crate::devices::virtio::vsock::metrics::METRICS; use crate::devices::virtio::vsock::packet::{VsockPacketHeader, VsockPacketRx, VsockPacketTx}; use crate::logger::IncMetric; use crate::utils::wrap_usize_to_u32; +use crate::vstate::memory::MaybeBounce; /// Trait that vsock connection backends need to implement. /// @@ -118,7 +119,7 @@ pub struct VsockConnection { /// The peer (guest) port. peer_port: u32, /// The (connected) host-side stream. - stream: S, + pub(crate) stream: MaybeBounce, /// The TX buffer for this connection. tx_buf: TxBuf, /// Total number of bytes that have been successfully written to `self.stream`, either @@ -414,7 +415,7 @@ where /// The connection is interested in being notified about EPOLLIN / EPOLLOUT events on the /// host stream. 
fn as_raw_fd(&self) -> RawFd { - self.stream.as_raw_fd() + self.stream.target.as_raw_fd() } } @@ -509,13 +510,14 @@ where local_port: u32, peer_port: u32, peer_buf_alloc: u32, + bounce: bool, ) -> Self { Self { local_cid, peer_cid, local_port, peer_port, - stream, + stream: MaybeBounce::new_persistent(stream, bounce), state: ConnState::PeerInit, tx_buf: TxBuf::new(), fwd_cnt: Wrapping(0), @@ -535,13 +537,14 @@ where peer_cid: u64, local_port: u32, peer_port: u32, + bounce: bool, ) -> Self { Self { local_cid, peer_cid, local_port, peer_port, - stream, + stream: MaybeBounce::new_persistent(stream, bounce), state: ConnState::LocalInit, tx_buf: TxBuf::new(), fwd_cnt: Wrapping(0), @@ -882,9 +885,10 @@ mod tests { LOCAL_PORT, PEER_PORT, PEER_BUF_ALLOC, + false, ), ConnState::LocalInit => VsockConnection::::new_local_init( - stream, LOCAL_CID, PEER_CID, LOCAL_PORT, PEER_PORT, + stream, LOCAL_CID, PEER_CID, LOCAL_PORT, PEER_PORT, false, ), ConnState::Established => { let mut conn = VsockConnection::::new_peer_init( @@ -894,6 +898,7 @@ mod tests { LOCAL_PORT, PEER_PORT, PEER_BUF_ALLOC, + false, ); assert!(conn.has_pending_rx()); conn.recv_pkt(&mut rx_pkt).unwrap(); @@ -912,7 +917,7 @@ mod tests { } fn set_stream(&mut self, stream: TestStream) { - self.conn.stream = stream; + self.conn.stream = MaybeBounce::new_persistent(stream, false); } fn set_peer_credit(&mut self, credit: u32) { @@ -1014,7 +1019,7 @@ mod tests { let mut ctx = CsmTestContext::new_established(); let data = &[1, 2, 3, 4]; ctx.set_stream(TestStream::new_with_read_buf(data)); - assert_eq!(ctx.conn.as_raw_fd(), ctx.conn.stream.as_raw_fd()); + assert_eq!(ctx.conn.as_raw_fd(), ctx.conn.stream.target.as_raw_fd()); ctx.notify_epollin(); ctx.recv(); assert_eq!(ctx.rx_pkt.hdr.op(), uapi::VSOCK_OP_RW); @@ -1098,7 +1103,7 @@ mod tests { ctx.init_data_tx_pkt(data); ctx.send(); - assert_eq!(ctx.conn.stream.write_buf.len(), 0); + assert_eq!(ctx.conn.stream.target.write_buf.len(), 0); assert!(ctx.conn.tx_buf.is_empty()); 
} @@ -1113,7 +1118,7 @@ mod tests { let data = &[1, 2, 3, 4]; ctx.init_data_tx_pkt(data); ctx.send(); - assert_eq!(ctx.conn.stream.write_buf, data.to_vec()); + assert_eq!(ctx.conn.stream.target.write_buf, data.to_vec()); ctx.notify_epollin(); ctx.recv(); @@ -1233,7 +1238,7 @@ mod tests { ctx.set_stream(TestStream::new()); ctx.conn.notify(EventSet::OUT); assert!(ctx.conn.tx_buf.is_empty()); - assert_eq!(ctx.conn.stream.write_buf, data); + assert_eq!(ctx.conn.stream.target.write_buf, data); } } diff --git a/src/vmm/src/devices/virtio/vsock/device.rs b/src/vmm/src/devices/virtio/vsock/device.rs index aa114f6cccb..55bc97bc7ff 100644 --- a/src/vmm/src/devices/virtio/vsock/device.rs +++ b/src/vmm/src/devices/virtio/vsock/device.rs @@ -280,6 +280,14 @@ where self.acked_features = acked_features } + fn force_userspace_bounce_buffers(&mut self) { + self.backend.start_bouncing() + } + + fn userspace_bounce_buffers(&self) -> bool { + self.backend.is_bouncing() + } + fn device_type(&self) -> u32 { uapi::VIRTIO_ID_VSOCK } diff --git a/src/vmm/src/devices/virtio/vsock/mod.rs b/src/vmm/src/devices/virtio/vsock/mod.rs index 859e198860b..54c9eeef3b9 100644 --- a/src/vmm/src/devices/virtio/vsock/mod.rs +++ b/src/vmm/src/devices/virtio/vsock/mod.rs @@ -185,4 +185,7 @@ pub trait VsockChannel { /// The vsock backend, which is basically an epoll-event-driven vsock channel. /// Currently, the only implementation we have is `crate::devices::virtio::unix::muxer::VsockMuxer`, /// which translates guest-side vsock connections to host-side Unix domain socket connections. 
-pub trait VsockBackend: VsockChannel + VsockEpollListener + Send {} +pub trait VsockBackend: VsockChannel + VsockEpollListener + Send { + fn start_bouncing(&mut self); + fn is_bouncing(&self) -> bool; +} diff --git a/src/vmm/src/devices/virtio/vsock/persist.rs b/src/vmm/src/devices/virtio/vsock/persist.rs index fce6affae69..6128090b601 100644 --- a/src/vmm/src/devices/virtio/vsock/persist.rs +++ b/src/vmm/src/devices/virtio/vsock/persist.rs @@ -10,7 +10,7 @@ use std::sync::atomic::AtomicU32; use serde::{Deserialize, Serialize}; use super::*; -use crate::devices::virtio::device::DeviceState; +use crate::devices::virtio::device::{DeviceState, VirtioDevice}; use crate::devices::virtio::persist::VirtioDeviceState; use crate::devices::virtio::queue::FIRECRACKER_MAX_QUEUE_SIZE; use crate::devices::virtio::vsock::TYPE_VSOCK; @@ -128,6 +128,11 @@ where } else { DeviceState::Inactive }; + + if state.virtio_state.bounce_in_userspace { + vsock.force_userspace_bounce_buffers(); + } + Ok(vsock) } } diff --git a/src/vmm/src/devices/virtio/vsock/test_utils.rs b/src/vmm/src/devices/virtio/vsock/test_utils.rs index 804f0442559..391d543537f 100644 --- a/src/vmm/src/devices/virtio/vsock/test_utils.rs +++ b/src/vmm/src/devices/virtio/vsock/test_utils.rs @@ -111,7 +111,15 @@ impl VsockEpollListener for TestBackend { self.evset = Some(evset); } } -impl VsockBackend for TestBackend {} +impl VsockBackend for TestBackend { + fn start_bouncing(&mut self) { + unimplemented!() + } + + fn is_bouncing(&self) -> bool { + false + } +} #[derive(Debug)] pub struct TestContext { diff --git a/src/vmm/src/devices/virtio/vsock/unix/muxer.rs b/src/vmm/src/devices/virtio/vsock/unix/muxer.rs index 478d5c7318d..5585761af8f 100644 --- a/src/vmm/src/devices/virtio/vsock/unix/muxer.rs +++ b/src/vmm/src/devices/virtio/vsock/unix/muxer.rs @@ -108,6 +108,7 @@ pub struct VsockMuxer { local_port_set: HashSet, /// The last used host-side port. 
local_port_last: u32, + bounce: bool, } impl VsockChannel for VsockMuxer { @@ -299,7 +300,19 @@ impl VsockEpollListener for VsockMuxer { } } -impl VsockBackend for VsockMuxer {} +impl VsockBackend for VsockMuxer { + fn start_bouncing(&mut self) { + self.bounce = true; + + for conn in self.conn_map.values_mut() { + conn.stream.activate() + } + } + + fn is_bouncing(&self) -> bool { + self.bounce + } +} impl VsockMuxer { /// Muxer constructor. @@ -321,6 +334,7 @@ impl VsockMuxer { killq: MuxerKillQ::new(), local_port_last: (1u32 << 30) - 1, local_port_set: HashSet::with_capacity(defs::MAX_CONNECTIONS), + bounce: false, }; // Listen on the host initiated socket, for incoming connections. @@ -402,6 +416,7 @@ impl VsockMuxer { self.cid, local_port, peer_port, + self.bounce, ), ) }) @@ -629,6 +644,7 @@ impl VsockMuxer { pkt.hdr.dst_port(), pkt.hdr.src_port(), pkt.hdr.buf_alloc(), + self.bounce, ), ) }) diff --git a/src/vmm/src/initrd.rs b/src/vmm/src/initrd.rs index 9dfcd8bc16e..624ec397f73 100644 --- a/src/vmm/src/initrd.rs +++ b/src/vmm/src/initrd.rs @@ -1,14 +1,9 @@ // Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 -use std::fs::File; -use std::os::unix::fs::MetadataExt; - use vm_memory::{GuestAddress, GuestMemory, ReadVolatile, VolatileMemoryError}; use crate::arch::initrd_load_addr; -use crate::utils::u64_to_usize; -use crate::vmm_config::boot_source::BootConfig; use crate::vstate::memory::GuestMemoryMmap; /// Errors associated with initrd loading. @@ -20,8 +15,6 @@ pub enum InitrdError { Load, /// Cannot image metadata: {0} Metadata(std::io::Error), - /// Cannot copy initrd file fd: {0} - CloneFd(std::io::Error), /// Cannot load initrd due to an invalid image: {0} Read(VolatileMemoryError), } @@ -36,31 +29,20 @@ pub struct InitrdConfig { } impl InitrdConfig { - /// Load initrd into guest memory based on the boot config. 
- pub fn from_config( - boot_cfg: &BootConfig, - vm_memory: &GuestMemoryMmap, - ) -> Result, InitrdError> { - Ok(match &boot_cfg.initrd_file { - Some(f) => { - let f = f.try_clone().map_err(InitrdError::CloneFd)?; - Some(Self::from_file(vm_memory, f)?) - } - None => None, - }) - } - /// Loads the initrd from a file into guest memory. - pub fn from_file(vm_memory: &GuestMemoryMmap, mut file: File) -> Result { - let size = file.metadata().map_err(InitrdError::Metadata)?.size(); - let size = u64_to_usize(size); + pub fn from_reader( + vm_memory: &GuestMemoryMmap, + mut reader: R, + size: usize, + ) -> Result { let Some(address) = initrd_load_addr(vm_memory, size) else { return Err(InitrdError::Address); }; let mut slice = vm_memory .get_slice(GuestAddress(address), size) .map_err(|_| InitrdError::Load)?; - file.read_exact_volatile(&mut slice) + reader + .read_exact_volatile(&mut slice) .map_err(InitrdError::Read)?; Ok(InitrdConfig { @@ -105,7 +87,7 @@ mod tests { // Need to reset the cursor to read initrd properly. tempfile.seek(SeekFrom::Start(0)).unwrap(); - let initrd = InitrdConfig::from_file(&gm, tempfile).unwrap(); + let initrd = InitrdConfig::from_reader(&gm, tempfile, image.len()).unwrap(); assert!(gm.address_in_range(initrd.address)); assert_eq!(initrd.size, image.len()); } @@ -120,7 +102,7 @@ mod tests { // Need to reset the cursor to read initrd properly. tempfile.seek(SeekFrom::Start(0)).unwrap(); - let res = InitrdConfig::from_file(&gm, tempfile); + let res = InitrdConfig::from_reader(&gm, tempfile, image.len()); assert!(matches!(res, Err(InitrdError::Address)), "{:?}", res); } @@ -134,7 +116,7 @@ mod tests { // Need to reset the cursor to read initrd properly. 
tempfile.seek(SeekFrom::Start(0)).unwrap(); - let res = InitrdConfig::from_file(&gm, tempfile); + let res = InitrdConfig::from_reader(&gm, tempfile, image.len()); assert!(matches!(res, Err(InitrdError::Address)), "{:?}", res); } } diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index aeacadeb66e..5ab446e572d 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -47,6 +47,8 @@ use crate::{EventManager, Vmm, vstate}; pub struct VmInfo { /// Guest memory size. pub mem_size_mib: u64, + /// Memory config + pub secret_free: bool, /// smt information pub smt: bool, /// CPU template type @@ -61,6 +63,7 @@ impl From<&VmResources> for VmInfo { fn from(value: &VmResources) -> Self { Self { mem_size_mib: value.machine_config.mem_size_mib as u64, + secret_free: value.machine_config.secret_free, smt: value.machine_config.smt, cpu_template: StaticCpuTemplate::from(&value.machine_config.cpu_template), boot_source: value.boot_source.config.clone(), @@ -360,6 +363,7 @@ pub fn restore_from_snapshot( .update_machine_config(&MachineConfigUpdate { vcpu_count: Some(vcpu_count), mem_size_mib: Some(u64_to_usize(microvm_state.vm_info.mem_size_mib)), + secret_free: Some(microvm_state.vm_info.secret_free), smt: Some(microvm_state.vm_info.smt), cpu_template: Some(microvm_state.vm_info.cpu_template), track_dirty_pages: Some(track_dirty_pages), @@ -453,7 +457,7 @@ fn guest_memory_from_file( track_dirty_pages: bool, ) -> Result, GuestMemoryFromFileError> { let mem_file = File::open(mem_file_path)?; - let guest_mem = memory::snapshot_file(mem_file, mem_state.regions(), track_dirty_pages)?; + let guest_mem = memory::file_private(mem_file, mem_state.regions(), track_dirty_pages)?; Ok(guest_mem) } diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index 097e2041b55..2d5062faf61 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use std::convert::From; +use std::fs::File; use 
std::path::PathBuf; use std::sync::{Arc, Mutex, MutexGuard}; @@ -9,6 +10,7 @@ use serde::{Deserialize, Serialize}; use crate::cpu_config::templates::CustomCpuTemplate; use crate::device_manager::persist::SharedDeviceType; +use crate::devices::virtio::block::device::Block; use crate::logger::info; use crate::mmds; use crate::mmds::data_store::{Mmds, MmdsVersion}; @@ -30,7 +32,7 @@ use crate::vmm_config::mmds::{MmdsConfig, MmdsConfigError}; use crate::vmm_config::net::*; use crate::vmm_config::vsock::*; use crate::vstate::memory; -use crate::vstate::memory::{GuestRegionMmap, MemoryError}; +use crate::vstate::memory::{GuestRegionMmap, MemoryError, create_memfd}; /// Errors encountered when configuring microVM resources. #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -213,7 +215,14 @@ impl VmResources { self.balloon.set_device(balloon); if self.machine_config.huge_pages != HugePageConfig::None { - return Err(ResourcesError::BalloonDevice(BalloonConfigError::HugePages)); + return Err(ResourcesError::BalloonDevice( + BalloonConfigError::IncompatibleWith("huge pages"), + )); + } + if self.machine_config.secret_free { + return Err(ResourcesError::BalloonDevice( + BalloonConfigError::IncompatibleWith("secret freedom"), + )); } } @@ -255,7 +264,31 @@ impl VmResources { } if self.balloon.get().is_some() && updated.huge_pages != HugePageConfig::None { - return Err(MachineConfigError::BalloonAndHugePages); + return Err(MachineConfigError::Incompatible( + "balloon device", + "huge pages", + )); + } + if self.balloon.get().is_some() && updated.secret_free { + return Err(MachineConfigError::Incompatible( + "balloon device", + "secret freedom", + )); + } + if updated.secret_free { + if self.vhost_user_devices_used() { + return Err(MachineConfigError::Incompatible( + "vhost-user devices", + "userspace bounce buffers", + )); + } + + if self.async_block_engine_used() { + return Err(MachineConfigError::Incompatible( + "async block engine", + "userspace bounce buffers", + 
)); + } } self.machine_config = updated; @@ -312,7 +345,11 @@ impl VmResources { } if self.machine_config.huge_pages != HugePageConfig::None { - return Err(BalloonConfigError::HugePages); + return Err(BalloonConfigError::IncompatibleWith("huge pages")); + } + + if self.machine_config.secret_free { + return Err(BalloonConfigError::IncompatibleWith("secret freedom")); } self.balloon.set(config) @@ -338,6 +375,17 @@ impl VmResources { &mut self, block_device_config: BlockDeviceConfig, ) -> Result<(), DriveError> { + if self.machine_config.secret_free { + if block_device_config.file_engine_type == Some(FileEngineType::Async) { + return Err(DriveError::IncompatibleWithSecretFreedom( + "async file engine", + )); + } + + if block_device_config.socket.is_some() { + return Err(DriveError::IncompatibleWithSecretFreedom("vhost-user-blk")); + } + } self.block.insert(block_device_config) } @@ -437,18 +485,37 @@ impl VmResources { Ok(()) } + /// Returns true if any vhost user devices are configured int his [`VmResources`] object + pub fn vhost_user_devices_used(&self) -> bool { + self.block + .devices + .iter() + .any(|b| b.lock().expect("Poisoned lock").is_vhost_user()) + } + + fn async_block_engine_used(&self) -> bool { + self.block + .devices + .iter() + .any(|b| match &*b.lock().unwrap() { + Block::Virtio(b) => b.file_engine_type() == FileEngineType::Async, + Block::VhostUser(_) => false, + }) + } + + /// Gets the size of the guest memory, in bytes + pub fn memory_size(&self) -> usize { + mib_to_bytes(self.machine_config.mem_size_mib) + } + /// Allocates guest memory in a configuration most appropriate for these [`VmResources`]. /// /// If vhost-user-blk devices are in use, allocates memfd-backed shared memory, otherwise /// prefers anonymous memory for performance reasons. 
- pub fn allocate_guest_memory(&self) -> Result, MemoryError> { - let vhost_user_device_used = self - .block - .devices - .iter() - .any(|b| b.lock().expect("Poisoned lock").is_vhost_user()); - - // Page faults are more expensive for shared memory mapping, including memfd. + pub fn allocate_guest_memory( + &self, + guest_memfd: Option, + ) -> Result, MemoryError> { // For this reason, we only back guest memory with a memfd // if a vhost-user-blk device is configured in the VM, otherwise we fall back to // an anonymous private memory. @@ -457,20 +524,35 @@ impl VmResources { // because that would require running a backend process. If in the future we converge to // a single way of backing guest memory for vhost-user and non-vhost-user cases, // that would not be worth the effort. - let regions = - crate::arch::arch_memory_regions(0, mib_to_bytes(self.machine_config.mem_size_mib)); - if vhost_user_device_used { - memory::memfd_backed( - regions.as_ref(), + let regions = crate::arch::arch_memory_regions(0, self.memory_size()).into_iter(); + match guest_memfd { + Some(file) => memory::file_shared( + file, + regions, self.machine_config.track_dirty_pages, self.machine_config.huge_pages, - ) - } else { - memory::anonymous( - regions.into_iter(), - self.machine_config.track_dirty_pages, - self.machine_config.huge_pages, - ) + ), + None => { + if self.vhost_user_devices_used() { + let memfd = create_memfd( + self.memory_size() as u64, + self.machine_config.huge_pages.into(), + )? 
+ .into_file(); + memory::file_shared( + memfd, + regions, + self.machine_config.track_dirty_pages, + self.machine_config.huge_pages, + ) + } else { + memory::anonymous( + regions.into_iter(), + self.machine_config.track_dirty_pages, + self.machine_config.huge_pages, + ) + } + } } } } @@ -1302,6 +1384,7 @@ mod tests { let mut aux_vm_config = MachineConfigUpdate { vcpu_count: Some(32), mem_size_mib: Some(512), + secret_free: Some(false), smt: Some(false), #[cfg(target_arch = "x86_64")] cpu_template: Some(StaticCpuTemplate::T2), @@ -1323,44 +1406,6 @@ mod tests { aux_vm_config ); - // Invalid vcpu count. - aux_vm_config.vcpu_count = Some(0); - assert_eq!( - vm_resources.update_machine_config(&aux_vm_config), - Err(MachineConfigError::InvalidVcpuCount) - ); - aux_vm_config.vcpu_count = Some(33); - assert_eq!( - vm_resources.update_machine_config(&aux_vm_config), - Err(MachineConfigError::InvalidVcpuCount) - ); - - // Check that SMT is not supported on aarch64, and that on x86_64 enabling it requires vcpu - // count to be even. - aux_vm_config.smt = Some(true); - #[cfg(target_arch = "aarch64")] - assert_eq!( - vm_resources.update_machine_config(&aux_vm_config), - Err(MachineConfigError::SmtNotSupported) - ); - aux_vm_config.vcpu_count = Some(3); - #[cfg(target_arch = "x86_64")] - assert_eq!( - vm_resources.update_machine_config(&aux_vm_config), - Err(MachineConfigError::InvalidVcpuCount) - ); - aux_vm_config.vcpu_count = Some(32); - #[cfg(target_arch = "x86_64")] - vm_resources.update_machine_config(&aux_vm_config).unwrap(); - aux_vm_config.smt = Some(false); - - // Invalid mem_size_mib. - aux_vm_config.mem_size_mib = Some(0); - assert_eq!( - vm_resources.update_machine_config(&aux_vm_config), - Err(MachineConfigError::InvalidMemorySize) - ); - // Incompatible mem_size_mib with balloon size. vm_resources.machine_config.mem_size_mib = 128; vm_resources @@ -1379,23 +1424,6 @@ mod tests { // mem_size_mib compatible with balloon size. 
aux_vm_config.mem_size_mib = Some(256); vm_resources.update_machine_config(&aux_vm_config).unwrap(); - - // mem_size_mib incompatible with huge pages configuration - aux_vm_config.mem_size_mib = Some(129); - aux_vm_config.huge_pages = Some(HugePageConfig::Hugetlbfs2M); - assert_eq!( - vm_resources - .update_machine_config(&aux_vm_config) - .unwrap_err(), - MachineConfigError::InvalidMemorySize - ); - - // mem_size_mib compatible with huge page configuration - aux_vm_config.mem_size_mib = Some(2048); - // Remove the balloon device config that's added by `default_vm_resources` as it would - // trigger the "ballooning incompatible with huge pages" check. - vm_resources.balloon = BalloonBuilder::new(); - vm_resources.update_machine_config(&aux_vm_config).unwrap(); } #[test] @@ -1449,7 +1477,7 @@ mod tests { assert!( matches!( err, - ResourcesError::BalloonDevice(BalloonConfigError::HugePages) + ResourcesError::BalloonDevice(BalloonConfigError::IncompatibleWith("huge pages")) ), "{:?}", err diff --git a/src/vmm/src/vmm_config/balloon.rs b/src/vmm/src/vmm_config/balloon.rs index 6ac2fb34ecf..a6fccfe2b4b 100644 --- a/src/vmm/src/vmm_config/balloon.rs +++ b/src/vmm/src/vmm_config/balloon.rs @@ -28,8 +28,8 @@ pub enum BalloonConfigError { CreateFailure(crate::devices::virtio::balloon::BalloonError), /// Error updating the balloon device configuration: {0} UpdateFailure(std::io::Error), - /// Firecracker's huge pages support is incompatible with memory ballooning. - HugePages, + /// Memory ballooning is incompatible with {0}. + IncompatibleWith(&'static str), } /// This struct represents the strongly typed equivalent of the json body diff --git a/src/vmm/src/vmm_config/drive.rs b/src/vmm/src/vmm_config/drive.rs index 9e301eff751..88a9b813874 100644 --- a/src/vmm/src/vmm_config/drive.rs +++ b/src/vmm/src/vmm_config/drive.rs @@ -24,6 +24,8 @@ pub enum DriveError { DeviceUpdate(VmmError), /// A root block device already exists! 
RootBlockDeviceAlreadyAdded, + /// {0} is incompatible with secret freedom. + IncompatibleWithSecretFreedom(&'static str), } /// Use this structure to set up the Block Device before booting the kernel. diff --git a/src/vmm/src/vmm_config/machine_config.rs b/src/vmm/src/vmm_config/machine_config.rs index cfe7105fdf8..3d30860144e 100644 --- a/src/vmm/src/vmm_config/machine_config.rs +++ b/src/vmm/src/vmm_config/machine_config.rs @@ -27,10 +27,8 @@ pub enum MachineConfigError { /// Enabling simultaneous multithreading is not supported on aarch64. #[cfg(target_arch = "aarch64")] SmtNotSupported, - /// Could not determine host kernel version when checking hugetlbfs compatibility - KernelVersion, - /// Firecracker's huge pages support is incompatible with memory ballooning. - BalloonAndHugePages, + /// '{0}' and '{1}' are mutually exclusive and cannot be used together. + Incompatible(&'static str, &'static str) } /// Describes the possible (huge)page configurations for a microVM's memory. @@ -97,6 +95,11 @@ pub struct MachineConfig { pub vcpu_count: u8, /// The memory size in MiB. pub mem_size_mib: usize, + /// Whether guest_memfd should be used to back normal guest memory. If this is enabled + /// and any devices are attached to the VM, userspace bounce buffers will be used + /// as I/O into secret free memory is not possible. + #[serde(default)] + pub secret_free: bool, /// Enables or disabled SMT. #[serde(default)] pub smt: bool, @@ -153,6 +156,7 @@ impl Default for MachineConfig { Self { vcpu_count: 1, mem_size_mib: DEFAULT_MEM_SIZE_MIB, + secret_free: false, smt: false, cpu_template: None, track_dirty_pages: false, @@ -178,6 +182,9 @@ pub struct MachineConfigUpdate { /// The memory size in MiB. #[serde(default)] pub mem_size_mib: Option, + /// Whether secret freedom should be enabled + #[serde(default)] + pub secret_free: Option, /// Enables or disabled SMT. 
#[serde(default)] pub smt: Option, @@ -210,6 +217,7 @@ impl From for MachineConfigUpdate { MachineConfigUpdate { vcpu_count: Some(cfg.vcpu_count), mem_size_mib: Some(cfg.mem_size_mib), + secret_free: Some(cfg.secret_free), smt: Some(cfg.smt), cpu_template: cfg.static_template(), track_dirty_pages: Some(cfg.track_dirty_pages), @@ -263,11 +271,27 @@ impl MachineConfig { let mem_size_mib = update.mem_size_mib.unwrap_or(self.mem_size_mib); let page_config = update.huge_pages.unwrap_or(self.huge_pages); + let secret_free = update.secret_free.unwrap_or(self.secret_free); + let track_dirty_pages = update.track_dirty_pages.unwrap_or(self.track_dirty_pages); if mem_size_mib == 0 || !page_config.is_valid_mem_size(mem_size_mib) { return Err(MachineConfigError::InvalidMemorySize); } + if secret_free && page_config != HugePageConfig::None { + return Err(MachineConfigError::Incompatible( + "secret freedom", + "huge pages", + )); + } + + if secret_free && track_dirty_pages { + return Err(MachineConfigError::Incompatible( + "secret freedom", + "diff snapshots", + )); + } + let cpu_template = match update.cpu_template { None => self.cpu_template.clone(), Some(StaticCpuTemplate::None) => None, @@ -277,9 +301,10 @@ impl MachineConfig { Ok(MachineConfig { vcpu_count, mem_size_mib, + secret_free, smt, cpu_template, - track_dirty_pages: update.track_dirty_pages.unwrap_or(self.track_dirty_pages), + track_dirty_pages, huge_pages: page_config, #[cfg(feature = "gdb")] gdb_socket_path: update.gdb_socket_path.clone(), @@ -290,7 +315,126 @@ impl MachineConfig { #[cfg(test)] mod tests { use crate::cpu_config::templates::{CpuTemplateType, CustomCpuTemplate, StaticCpuTemplate}; - use crate::vmm_config::machine_config::MachineConfig; + use crate::vmm_config::machine_config::{ + HugePageConfig, MachineConfig, MachineConfigError, MachineConfigUpdate, + }; + + #[test] + #[allow(unused)] // some assertions exist only on specific architectures. 
+ fn test_machine_config_update() { + let mconf = MachineConfig::default(); + + // Assert that the default machine config is valid + assert_eq!( + mconf + .update(&MachineConfigUpdate::from(mconf.clone())) + .unwrap(), + mconf + ); + + // Invalid vCPU counts + let res = mconf.update(&MachineConfigUpdate { + vcpu_count: Some(0), + ..Default::default() + }); + assert_eq!(res, Err(MachineConfigError::InvalidVcpuCount)); + + let res = mconf.update(&MachineConfigUpdate { + vcpu_count: Some(33), + ..Default::default() + }); + assert_eq!(res, Err(MachineConfigError::InvalidVcpuCount)); + + // Invalid memory size + let res = mconf.update(&MachineConfigUpdate { + mem_size_mib: Some(0), + ..Default::default() + }); + assert_eq!(res, Err(MachineConfigError::InvalidMemorySize)); + + // Memory Size incompatible with huge page configuration + let res = mconf.update(&MachineConfigUpdate { + mem_size_mib: Some(31), + huge_pages: Some(HugePageConfig::Hugetlbfs2M), + ..Default::default() + }); + assert_eq!(res, Err(MachineConfigError::InvalidMemorySize)); + + // works if the memory size is a multiple of huge page size indeed + let updated = mconf + .update(&MachineConfigUpdate { + mem_size_mib: Some(32), + huge_pages: Some(HugePageConfig::Hugetlbfs2M), + ..Default::default() + }) + .unwrap(); + assert_eq!(updated.huge_pages, HugePageConfig::Hugetlbfs2M); + assert_eq!(updated.mem_size_mib, 32); + + let res = mconf.update(&MachineConfigUpdate { + huge_pages: Some(HugePageConfig::Hugetlbfs2M), + secret_free: Some(true), + ..Default::default() + }); + assert_eq!( + res, + Err(MachineConfigError::Incompatible( + "secret freedom", + "huge pages" + )) + ); + + let res = mconf.update(&MachineConfigUpdate { + track_dirty_pages: Some(true), + secret_free: Some(true), + ..Default::default() + }); + assert_eq!( + res, + Err(MachineConfigError::Incompatible( + "secret freedom", + "diff snapshots" + )) + ); + } + + #[test] + #[cfg(target_arch = "aarch64")] + fn 
test_machine_config_update_aarch64() { + let mconf = MachineConfig::default(); + + // Check that SMT is not supported on aarch64 + let res = mconf.update(&MachineConfigUpdate { + smt: Some(true), + ..Default::default() + }); + assert_eq!(res, Err(MachineConfigError::SmtNotSupported)); + } + + #[test] + #[cfg(target_arch = "x86_64")] + fn test_machine_config_update_x86_64() { + let mconf = MachineConfig::default(); + + // Test that SMT requires an even vcpu count + let res = mconf.update(&MachineConfigUpdate { + vcpu_count: Some(3), + smt: Some(true), + ..Default::default() + }); + assert_eq!(res, Err(MachineConfigError::InvalidVcpuCount)); + + // Works if the vcpu count is even indeed + let updated = mconf + .update(&MachineConfigUpdate { + vcpu_count: Some(32), + smt: Some(true), + ..Default::default() + }) + .unwrap(); + assert_eq!(updated.vcpu_count, 32); + assert!(updated.smt); + } // Ensure the special (de)serialization logic for the cpu_template field works: // only static cpu templates can be specified via the machine-config endpoint, but diff --git a/src/vmm/src/vstate/memory.rs b/src/vmm/src/vstate/memory.rs index 19367f7f997..005b4f7d38c 100644 --- a/src/vmm/src/vstate/memory.rs +++ b/src/vmm/src/vstate/memory.rs @@ -6,7 +6,9 @@ // found in the THIRD-PARTY file. 
use std::fs::File; -use std::io::SeekFrom; +use std::io::{Read, Seek, SeekFrom, Write}; +use std::os::fd::AsRawFd; +use std::ptr::null_mut; use std::sync::Arc; use serde::{Deserialize, Serialize}; @@ -17,7 +19,10 @@ pub use vm_memory::{ Address, ByteValued, Bytes, FileOffset, GuestAddress, GuestMemory, GuestMemoryRegion, GuestUsize, MemoryRegionAddress, MmapRegion, address, }; -use vm_memory::{Error as VmMemoryError, GuestMemoryError, WriteVolatile}; +use vm_memory::{ + Error as VmMemoryError, GuestMemoryError, ReadVolatile, VolatileMemoryError, VolatileSlice, + WriteVolatile, +}; use vmm_sys_util::errno; use crate::DirtyBitmap; @@ -48,6 +53,144 @@ pub enum MemoryError { MemfdSetLen(std::io::Error), /// Total sum of memory regions exceeds largest possible file offset OffsetTooLarge, + /// Error calling mmap: {0} + Mmap(std::io::Error), +} + +/// Newtype that implements [`ReadVolatile`] and [`WriteVolatile`] if `T` implements `Read` or +/// `Write` respectively, by reading/writing using a bounce buffer, and memcpy-ing into the +/// [`VolatileSlice`]. +/// +/// Bounce buffers are allocated on the heap, as on-stack bounce buffers could cause stack +/// overflows. If `N == 0` then bounce buffers will be allocated on demand. +#[derive(Debug)] +pub struct MaybeBounce { + pub(crate) target: T, + persistent_buffer: Option>, +} + +impl MaybeBounce { + /// Creates a new `MaybeBounce` that always allocates a bounce + /// buffer on-demand + pub fn new(target: T, should_bounce: bool) -> Self { + MaybeBounce::new_persistent(target, should_bounce) + } +} + +impl MaybeBounce { + /// Creates a new `MaybeBounce` that uses a persistent, fixed size bounce buffer + /// of size `N`. If a read/write request exceeds the size of this bounce buffer, it + /// is split into multiple, `<= N`-size read/writes. 
+ pub fn new_persistent(target: T, should_bounce: bool) -> Self { + let mut bounce = MaybeBounce { + target, + persistent_buffer: None, + }; + + if should_bounce { + bounce.activate() + } + + bounce + } + + /// Activates this [`MaybeBounce`] to start doing reads/writes via a bounce buffer, + /// which is allocated on the heap by this function (e.g. if `activate()` is never called, + /// no bounce buffer is ever allocated). + pub fn activate(&mut self) { + self.persistent_buffer = Some(vec![0u8; N].into_boxed_slice().try_into().unwrap()) + } + + /// Returns `true` if this `MaybeBounce` is actually bouncing buffers. + pub fn is_activated(&self) -> bool { + self.persistent_buffer.is_some() + } +} + +impl ReadVolatile for MaybeBounce { + fn read_volatile( + &mut self, + buf: &mut VolatileSlice, + ) -> Result { + if let Some(ref mut persistent) = self.persistent_buffer { + let mut bbuf = (N == 0).then(|| vec![0u8; buf.len()]); + let bbuf = bbuf.as_deref_mut().unwrap_or(persistent.as_mut_slice()); + + let mut buf = buf.offset(0)?; + let mut total = 0; + while !buf.is_empty() { + let how_much = buf.len().min(bbuf.len()); + let n = self + .target + .read_volatile(&mut VolatileSlice::from(&mut bbuf[..how_much]))?; + buf.copy_from(&bbuf[..n]); + + buf = buf.offset(n)?; + total += n; + + if n < how_much { + break; + } + } + + Ok(total) + } else { + self.target.read_volatile(buf) + } + } +} + +impl WriteVolatile for MaybeBounce { + fn write_volatile( + &mut self, + buf: &VolatileSlice, + ) -> Result { + if let Some(ref mut persistent) = self.persistent_buffer { + let mut bbuf = (N == 0).then(|| vec![0u8; buf.len()]); + let bbuf = bbuf.as_deref_mut().unwrap_or(persistent.as_mut_slice()); + + let mut buf = buf.offset(0)?; + let mut total = 0; + while !buf.is_empty() { + let how_much = buf.copy_to(bbuf); + let n = self + .target + .write_volatile(&VolatileSlice::from(&mut bbuf[..how_much]))?; + buf = buf.offset(n)?; + total += n; + + if n < how_much { + break; + } + } + + 
Ok(total) + } else { + self.target.write_volatile(buf) + } + } +} + +impl Read for MaybeBounce { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + self.target.read(buf) + } +} + +impl Write for MaybeBounce { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.target.write(buf) + } + + fn flush(&mut self) -> std::io::Result<()> { + self.target.flush() + } +} + +impl Seek for MaybeBounce { + fn seek(&mut self, pos: SeekFrom) -> std::io::Result { + self.target.seek(pos) + } } /// Creates a `Vec` of `GuestRegionMmap` with the given configuration @@ -64,16 +207,40 @@ pub fn create( let mut builder = MmapRegionBuilder::new_with_bitmap( size, track_dirty_pages.then(|| AtomicBitmap::with_len(size)), - ) - .with_mmap_prot(libc::PROT_READ | libc::PROT_WRITE) - .with_mmap_flags(libc::MAP_NORESERVE | mmap_flags); + ); - if let Some(ref file) = file { + // when computing offset below we ensure it fits into i64 + #[allow(clippy::cast_possible_wrap)] + let (fd, fd_off) = if let Some(ref file) = file { let file_offset = FileOffset::from_arc(Arc::clone(file), offset); builder = builder.with_file_offset(file_offset); + + (file.as_raw_fd(), offset as libc::off_t) + } else { + (-1, 0) + }; + + // SAFETY: the arguments to mmap cannot cause any memory unsafety in the rust sense + let ptr = unsafe { + libc::mmap( + null_mut(), + size, + libc::PROT_READ | libc::PROT_WRITE, + libc::MAP_NORESERVE | mmap_flags, + fd, + fd_off, + ) + }; + + if ptr == libc::MAP_FAILED { + return Err(MemoryError::Mmap(std::io::Error::last_os_error())); } + // SAFETY: we check above that mmap succeeded, and the size we passed to builder is the + // same as the size of the mmap area. 
+ let builder = unsafe { builder.with_raw_mmap_pointer(ptr.cast()) }; + offset = match offset.checked_add(size as u64) { None => return Err(MemoryError::OffsetTooLarge), Some(new_off) if new_off >= i64::MAX as u64 => { @@ -92,18 +259,16 @@ pub fn create( } /// Creates a GuestMemoryMmap with `size` in MiB backed by a memfd. -pub fn memfd_backed( - regions: &[(GuestAddress, usize)], +pub fn file_shared( + file: File, + regions: impl Iterator, track_dirty_pages: bool, huge_pages: HugePageConfig, ) -> Result, MemoryError> { - let size = regions.iter().map(|&(_, size)| size as u64).sum(); - let memfd_file = create_memfd(size, huge_pages.into())?.into_file(); - create( - regions.iter().copied(), + regions, libc::MAP_SHARED | huge_pages.mmap_flags(), - Some(memfd_file), + Some(file), track_dirty_pages, ) } @@ -124,7 +289,7 @@ pub fn anonymous( /// Creates a GuestMemoryMmap given a `file` containing the data /// and a `state` containing mapping information. -pub fn snapshot_file( +pub fn file_private( file: File, regions: impl Iterator, track_dirty_pages: bool, @@ -310,7 +475,8 @@ impl GuestMemoryExtension for GuestMemoryMmap { } } -fn create_memfd( +/// Creates a memfd of the given size and huge pages configuration +pub fn create_memfd( mem_size: u64, hugetlb_size: Option, ) -> Result { @@ -346,6 +512,7 @@ mod tests { use std::collections::HashMap; use std::io::{Read, Seek}; + use std::os::fd::AsFd; use vmm_sys_util::tempfile::TempFile; @@ -563,7 +730,7 @@ mod tests { guest_memory.dump(&mut memory_file).unwrap(); let restored_guest_memory = GuestMemoryMmap::from_regions( - snapshot_file(memory_file, memory_state.regions(), false).unwrap(), + file_private(memory_file, memory_state.regions(), false).unwrap(), ) .unwrap(); @@ -625,7 +792,7 @@ mod tests { // We can restore from this because this is the first dirty dump. 
let restored_guest_memory = GuestMemoryMmap::from_regions( - snapshot_file(file, memory_state.regions(), false).unwrap(), + file_private(file, memory_state.regions(), false).unwrap(), ) .unwrap(); @@ -722,4 +889,50 @@ mod tests { seals.insert(memfd::FileSeal::SealGrow); memfd.add_seals(&seals).unwrap_err(); } + + #[test] + fn test_bounce() { + let file_direct = TempFile::new().unwrap(); + let file_bounced = TempFile::new().unwrap(); + let file_persistent_bounced = TempFile::new().unwrap(); + + let mut data = (0..=255).collect::>(); + + MaybeBounce::new(file_direct.as_file().as_fd(), false) + .write_all_volatile(&VolatileSlice::from(data.as_mut_slice())) + .unwrap(); + MaybeBounce::new(file_bounced.as_file().as_fd(), true) + .write_all_volatile(&VolatileSlice::from(data.as_mut_slice())) + .unwrap(); + MaybeBounce::<_, 7>::new_persistent(file_persistent_bounced.as_file().as_fd(), true) + .write_all_volatile(&VolatileSlice::from(data.as_mut_slice())) + .unwrap(); + + let mut data_direct = vec![0u8; 256]; + let mut data_bounced = vec![0u8; 256]; + let mut data_persistent_bounced = vec![0u8; 256]; + + file_direct.as_file().seek(SeekFrom::Start(0)).unwrap(); + file_bounced.as_file().seek(SeekFrom::Start(0)).unwrap(); + file_persistent_bounced + .as_file() + .seek(SeekFrom::Start(0)) + .unwrap(); + + MaybeBounce::new(file_direct.as_file().as_fd(), false) + .read_exact_volatile(&mut VolatileSlice::from(data_direct.as_mut_slice())) + .unwrap(); + MaybeBounce::new(file_bounced.as_file().as_fd(), true) + .read_exact_volatile(&mut VolatileSlice::from(data_bounced.as_mut_slice())) + .unwrap(); + MaybeBounce::<_, 7>::new_persistent(file_persistent_bounced.as_file().as_fd(), true) + .read_exact_volatile(&mut VolatileSlice::from( + data_persistent_bounced.as_mut_slice(), + )) + .unwrap(); + + assert_eq!(data_direct, data_bounced); + assert_eq!(data_direct, data); + assert_eq!(data_persistent_bounced, data); + } } diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs 
index 7a8965a4b9a..3119bc4cbb6 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -6,26 +6,35 @@ // found in the THIRD-PARTY file. use std::collections::HashMap; -use std::fs::OpenOptions; +use std::fs::{File, OpenOptions}; use std::io::Write; +use std::os::fd::{AsFd, AsRawFd, FromRawFd}; use std::path::Path; use std::sync::Arc; -use kvm_bindings::{KVM_MEM_LOG_DIRTY_PAGES, kvm_userspace_memory_region}; -use kvm_ioctls::VmFd; +use kvm_bindings::{ + KVM_MEM_GUEST_MEMFD, KVM_MEM_LOG_DIRTY_PAGES, KVM_MEMORY_ATTRIBUTE_PRIVATE, + kvm_create_guest_memfd, kvm_memory_attributes, kvm_userspace_memory_region, + kvm_userspace_memory_region2, +}; +use kvm_ioctls::{Cap, VmFd}; use vmm_sys_util::eventfd::EventFd; pub use crate::arch::{ArchVm as Vm, ArchVmError, VmState}; +use crate::arch::{VM_TYPE_FOR_SECRET_FREEDOM, host_page_size}; use crate::logger::info; use crate::persist::CreateSnapshotError; use crate::utils::u64_to_usize; use crate::vmm_config::snapshot::SnapshotType; use crate::vstate::memory::{ - Address, GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion, GuestRegionMmap, + Address, GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion, + GuestRegionMmap, MaybeBounce, }; use crate::vstate::vcpu::VcpuError; use crate::{DirtyBitmap, Vcpu, mem_size_mib}; +pub(crate) const KVM_GMEM_NO_DIRECT_MAP: u64 = 1; + /// Architecture independent parts of a VM. #[derive(Debug)] pub struct VmCommon { @@ -34,6 +43,7 @@ pub struct VmCommon { max_memslots: usize, /// The guest memory of this Vm. pub guest_memory: GuestMemoryMmap, + secret_free: bool, } /// Errors associated with the wrappers over KVM ioctls. @@ -55,12 +65,25 @@ pub enum VmError { NotEnoughMemorySlots, /// Memory Error: {0} VmMemory(#[from] vm_memory::Error), + /// Failure to create guest_memfd: {0} + GuestMemfd(kvm_ioctls::Error), + /// guest_memfd is not supported on this host kernel. 
+ GuestMemfdNotSupported, + /// Failed to set memory attributes to private: {0} + SetMemoryAttributes(kvm_ioctls::Error), } /// Contains Vm functions that are usable across CPU architectures impl Vm { /// Create a KVM VM - pub fn create_common(kvm: &crate::vstate::kvm::Kvm) -> Result { + pub fn create_common( + kvm: &crate::vstate::kvm::Kvm, + secret_free: bool, + ) -> Result { + if secret_free && !kvm.fd.check_extension(Cap::GuestMemfd) { + return Err(VmError::GuestMemfdNotSupported); + } + // It is known that KVM_CREATE_VM occasionally fails with EINTR on heavily loaded machines // with many VMs. // @@ -84,7 +107,14 @@ impl Vm { const MAX_ATTEMPTS: u32 = 5; let mut attempt = 1; let fd = loop { - match kvm.fd.create_vm() { + let create_result = if secret_free && VM_TYPE_FOR_SECRET_FREEDOM.is_some() { + kvm.fd + .create_vm_with_type(VM_TYPE_FOR_SECRET_FREEDOM.unwrap()) + } else { + kvm.fd.create_vm() + }; + + match create_result { Ok(fd) => break fd, Err(e) if e.errno() == libc::EINTR && attempt < MAX_ATTEMPTS => { info!("Attempt #{attempt} of KVM_CREATE_VM returned EINTR"); @@ -101,6 +131,7 @@ impl Vm { fd, max_memslots: kvm.max_nr_memslots(), guest_memory: GuestMemoryMmap::default(), + secret_free, }) } @@ -124,6 +155,28 @@ impl Vm { Ok((vcpus, exit_evt)) } + /// Create a guest_memfd of the specified size + pub fn create_guest_memfd(&self, size: usize, flags: u64) -> Result { + assert_eq!( + size & (host_page_size() - 1), + 0, + "guest_memfd size must be page aligned" + ); + + let kvm_gmem = kvm_create_guest_memfd { + size: size as u64, + flags, + ..Default::default() + }; + + self.fd() + .create_guest_memfd(kvm_gmem) + .map_err(VmError::GuestMemfd) + // SAFETY: We know rawfd is a valid fd because create_guest_memfd didn't return an + // error. + .map(|rawfd| unsafe { File::from_raw_fd(rawfd) }) + } + /// Register a list of new memory regions to this [`Vm`]. 
pub fn register_memory_regions( &mut self, @@ -147,27 +200,63 @@ impl Vm { return Err(VmError::NotEnoughMemorySlots); } - let flags = if region.bitmap().is_some() { - KVM_MEM_LOG_DIRTY_PAGES + let mut flags = 0; + if region.bitmap().is_some() { + flags |= KVM_MEM_LOG_DIRTY_PAGES; + } + + #[allow(clippy::cast_sign_loss)] + let (guest_memfd, guest_memfd_offset) = if self.secret_free() { + flags |= KVM_MEM_GUEST_MEMFD; + + let fo = region + .file_offset() + .expect("secret hidden VMs must mmap guest_memfd for memslots"); + + (fo.file().as_raw_fd() as u32, fo.start()) } else { - 0 + (0, 0) }; - let memory_region = kvm_userspace_memory_region { + let memory_region = kvm_userspace_memory_region2 { slot: next_slot, guest_phys_addr: region.start_addr().raw_value(), memory_size: region.len(), userspace_addr: region.as_ptr() as u64, flags, + guest_memfd, + guest_memfd_offset, + ..Default::default() }; let new_guest_memory = self.common.guest_memory.insert_region(Arc::new(region))?; - // SAFETY: Safe because the fd is a valid KVM file descriptor. - unsafe { - self.fd() - .set_user_memory_region(memory_region) - .map_err(VmError::SetUserMemoryRegion)?; + if self.fd().check_extension(Cap::UserMemory2) { + // SAFETY: We are passing a valid memory region and operate on a valid KVM FD. + unsafe { + self.fd() + .set_user_memory_region2(memory_region) + .map_err(VmError::SetUserMemoryRegion)?; + } + } else { + // Something is seriously wrong if we manage to set these fields on a host that doesn't + // even allow creation of guest_memfds! + assert_eq!(memory_region.guest_memfd, 0); + assert_eq!(memory_region.guest_memfd_offset, 0); + assert_eq!(memory_region.flags & KVM_MEM_GUEST_MEMFD, 0); + + // SAFETY: We are passing a valid memory region and operate on a valid KVM FD. 
+ unsafe { + self.fd() + .set_user_memory_region(kvm_userspace_memory_region { + slot: memory_region.slot, + flags: memory_region.flags, + guest_phys_addr: memory_region.guest_phys_addr, + memory_size: memory_region.memory_size, + userspace_addr: memory_region.userspace_addr, + }) + .map_err(VmError::SetUserMemoryRegion)?; + } } self.common.guest_memory = new_guest_memory; @@ -175,6 +264,11 @@ impl Vm { Ok(()) } + /// Whether this VM is secret free + pub fn secret_free(&self) -> bool { + self.common.secret_free + } + /// Gets a reference to the kvm file descriptor owned by this VM. pub fn fd(&self) -> &VmFd { &self.common.fd @@ -185,6 +279,28 @@ impl Vm { &self.common.guest_memory } + /// Sets the memory attributes on all guest_memfd-backed regions to private + pub fn set_memory_private(&self) -> Result<(), VmError> { + if !self.secret_free() { + return Ok(()); + } + + for region in self.guest_memory().iter() { + let attr = kvm_memory_attributes { + address: region.start_addr().0, + size: region.len(), + attributes: KVM_MEMORY_ATTRIBUTE_PRIVATE as u64, + ..Default::default() + }; + + self.fd() + .set_memory_attributes(attr) + .map_err(VmError::SetMemoryAttributes)? + } + + Ok(()) + } + /// Resets the KVM dirty bitmap for each of the guest's memory regions. pub fn reset_dirty_bitmap(&self) { self.guest_memory() @@ -265,7 +381,8 @@ impl Vm { self.guest_memory().dump_dirty(&mut file, &dirty_bitmap)?; } SnapshotType::Full => { - self.guest_memory().dump(&mut file)?; + self.guest_memory() + .dump(&mut MaybeBounce::new(file.as_fd(), self.secret_free()))?; self.reset_dirty_bitmap(); self.guest_memory().reset_dirty(); } @@ -292,7 +409,7 @@ pub(crate) mod tests { // Auxiliary function being used throughout the tests. 
pub(crate) fn setup_vm() -> (Kvm, Vm) { let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); - let vm = Vm::new(&kvm).expect("Cannot create new vm"); + let vm = Vm::new(&kvm, false).expect("Cannot create new vm"); (kvm, vm) } @@ -308,7 +425,19 @@ pub(crate) mod tests { fn test_new() { // Testing with a valid /dev/kvm descriptor. let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); - Vm::new(&kvm).unwrap(); + Vm::new(&kvm, false).unwrap(); + } + + #[test] + fn test_new_secret_free() { + let kvm = Kvm::new(vec![]).unwrap(); + + if !kvm.fd.check_extension(Cap::GuestMemfd) { + return; + } + + Vm::new(&kvm, true) + .expect("should be able to create secret free VMs if guest_memfd is supported"); } #[test] diff --git a/tests/README.md b/tests/README.md index c306566392f..8e93ebac4be 100644 --- a/tests/README.md +++ b/tests/README.md @@ -340,6 +340,8 @@ which tests are run in which context: in separate pipelines according to various cron schedules. - Tests marked as `no_block_pr` are run in the "optional" PR CI pipeline. This pipeline is not required to pass for merging a PR. +- Tests marked as `secret_hiding` are secret hiding specifc tests. They don't + run by default. All tests without markers are run for every pull request, and are required to pass for the PR to be merged. 
diff --git a/tests/conftest.py b/tests/conftest.py index 99d2e5c4344..c94e3bb8f31 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -134,7 +134,7 @@ def pytest_runtest_logreport(report): "test": report.nodeid, "instance": global_props.instance, "cpu_model": global_props.cpu_model, - "host_kernel": "linux-" + global_props.host_linux_version, + "host_kernel": "linux-" + global_props.host_linux_version_metrics, "phase": report.when, }, # per test @@ -142,12 +142,12 @@ def pytest_runtest_logreport(report): "test": report.nodeid, "instance": global_props.instance, "cpu_model": global_props.cpu_model, - "host_kernel": "linux-" + global_props.host_linux_version, + "host_kernel": "linux-" + global_props.host_linux_version_metrics, }, # per phase {"phase": report.when}, # per host kernel - {"host_kernel": "linux-" + global_props.host_linux_version}, + {"host_kernel": "linux-" + global_props.host_linux_version_metrics}, # per CPU {"cpu_model": global_props.cpu_model}, # and global @@ -386,6 +386,20 @@ def io_engine(request): return request.param +secret_free_test_cases = [False] +if ( + global_props.host_linux_version_metrics == "next" + and global_props.instance != "m6g.metal" +): + secret_free_test_cases.append(True) + + +@pytest.fixture(params=secret_free_test_cases) +def secret_free(request): + """Supported secret hiding configuration, based on hardware""" + return request.param + + @pytest.fixture def results_dir(request): """ diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index 6c550e8c687..40d677752aa 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -249,6 +249,7 @@ def __init__( self.disks_vhost_user = {} self.vcpus_count = None self.mem_size_bytes = None + self.secret_free = False self.cpu_template_name = "None" # The given custom CPU template will be set in basic_config() but could # be overwritten via set_cpu_template(). 
@@ -458,11 +459,12 @@ def dimensions(self): return { "instance": global_props.instance, "cpu_model": global_props.cpu_model, - "host_kernel": f"linux-{global_props.host_linux_version}", + "host_kernel": f"linux-{global_props.host_linux_version_metrics}", "guest_kernel": self.kernel_file.stem[2:], "rootfs": self.rootfs_file.name, "vcpus": str(self.vcpus_count), "guest_memory": f"{self.mem_size_bytes / (1024 * 1024)}MB", + "secret_free": str(self.secret_free or False), } @property @@ -730,6 +732,7 @@ def basic_config( rootfs_io_engine=None, cpu_template: Optional[str] = None, enable_entropy_device=False, + secret_free=None, ): """Shortcut for quickly configuring a microVM. @@ -748,15 +751,23 @@ def basic_config( Reference: file:../../src/vmm/src/vmm_config/boot_source.rs::DEFAULT_KERNEL_CMDLINE """ + # Have to do it this way as otherwise A/B-tests fail if the 'A' revision + # of Firecracker doesn't know about the secret_free parameter. + kwargs = {} + if secret_free: + kwargs["secret_free"] = True + self.api.machine_config.put( vcpu_count=vcpu_count, smt=smt, mem_size_mib=mem_size_mib, track_dirty_pages=track_dirty_pages, huge_pages=huge_pages, + **kwargs, ) self.vcpus_count = vcpu_count self.mem_size_bytes = mem_size_mib * 2**20 + self.secret_free = secret_free or False if self.custom_cpu_template is not None: self.set_cpu_template(self.custom_cpu_template) diff --git a/tests/framework/properties.py b/tests/framework/properties.py index c7c9dfe789d..83ff9dcdce2 100644 --- a/tests/framework/properties.py +++ b/tests/framework/properties.py @@ -102,6 +102,13 @@ def host_linux_version_tpl(self): """Host Linux version major.minor, as a tuple for easy comparison""" return tuple(int(x) for x in self.host_linux_version.split(".")) + @property + def host_linux_version_metrics(self): + """Host Linux version to be reported in metrics""" + return ( + "next" if self.host_linux_version_tpl > (6, 12) else self.host_linux_version + ) + @property def is_ec2(self): """Are we running 
on an EC2 instance?""" diff --git a/tests/framework/vm_config.json b/tests/framework/vm_config.json index 5df673308d9..a026f8a7571 100644 --- a/tests/framework/vm_config.json +++ b/tests/framework/vm_config.json @@ -20,6 +20,7 @@ "machine-config": { "vcpu_count": 2, "mem_size_mib": 1024, + "secret_free": false, "smt": false, "track_dirty_pages": false, "huge_pages": "None" diff --git a/tests/host_tools/fcmetrics.py b/tests/host_tools/fcmetrics.py index 47661d5b27d..5aa247f40b7 100644 --- a/tests/host_tools/fcmetrics.py +++ b/tests/host_tools/fcmetrics.py @@ -508,7 +508,7 @@ def __init__(self, vm, timer=60): self.metrics_logger.set_dimensions( { "instance": global_props.instance, - "host_kernel": "linux-" + global_props.host_linux_version, + "host_kernel": "linux-" + global_props.host_linux_version_metrics, "guest_kernel": vm.kernel_file.stem[2:], } ) diff --git a/tests/integration_tests/build/test_hiding_kernel.py b/tests/integration_tests/build/test_hiding_kernel.py new file mode 100644 index 00000000000..1d76b31260f --- /dev/null +++ b/tests/integration_tests/build/test_hiding_kernel.py @@ -0,0 +1,30 @@ +# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 +"""A test which checks that the secret hiding enable kernel builds successfully.""" + +import pytest + +from framework import utils + + +@pytest.mark.timeout(600) +@pytest.mark.secret_hiding +def test_build_hiding_kernel(): + """ + In the test we will run our kernel build script to check it succeeds and builds the hidden kernel + """ + + # We have some extra deps for building the kernel that are not in the dev container + utils.check_output("apt update") + utils.check_output( + "apt install -y build-essential libncurses-dev bison flex libssl-dev libelf-dev bc dwarves libncurses5-dev kmod fakeroot" + ) + + # We have to configure git otherwise patch application fails + # the git log still credits the original author + utils.check_output('git config --global user.name "Firecracker CI"') + utils.check_output('git config --global user.email "ci@email.com"') + + utils.check_output( + "cd ../resources/hiding_ci; ./build_and_install_kernel.sh --no-install --tidy" + ) diff --git a/tests/integration_tests/functional/test_api.py b/tests/integration_tests/functional/test_api.py index 864c6d5eda9..1e5ec6fc473 100644 --- a/tests/integration_tests/functional/test_api.py +++ b/tests/integration_tests/functional/test_api.py @@ -375,9 +375,7 @@ def test_api_machine_config(uvm_plain): bad_size = (1 << 64) - 1 test_microvm.api.machine_config.patch(mem_size_mib=bad_size) - fail_msg = re.escape( - "Invalid Memory Configuration: Cannot create mmap region: Out of memory (os error 12)" - ) + fail_msg = re.escape("Out of memory (os error 12)") with pytest.raises(RuntimeError, match=fail_msg): test_microvm.start() @@ -1062,6 +1060,7 @@ def test_get_full_config_after_restoring_snapshot(microvm_factory, uvm_nano): setup_cfg["machine-config"] = { "vcpu_count": 2, "mem_size_mib": 256, + "secret_free": False, "smt": True, "track_dirty_pages": False, "huge_pages": "None", @@ -1175,6 +1174,7 @@ def test_get_full_config(uvm_plain): 
expected_cfg["machine-config"] = { "vcpu_count": 2, "mem_size_mib": 256, + "secret_free": False, "smt": False, "track_dirty_pages": False, "huge_pages": "None", diff --git a/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py b/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py index 4b66b077839..86c7e384b58 100644 --- a/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py +++ b/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py @@ -15,6 +15,8 @@ import os +import pytest + from framework import utils from framework.properties import global_props from framework.utils_cpuid import CPU_FEATURES_CMD, CpuModel @@ -157,6 +159,10 @@ } +@pytest.mark.skipif( + global_props.host_linux_version_tpl > (6, 1), + reason="We don't currently track features for host kernels above 6.1.", +) def test_host_vs_guest_cpu_features(uvm_plain_any): """Check CPU features host vs guest""" diff --git a/tests/integration_tests/functional/test_secret_freedom.py b/tests/integration_tests/functional/test_secret_freedom.py new file mode 100644 index 00000000000..5f9758bb88c --- /dev/null +++ b/tests/integration_tests/functional/test_secret_freedom.py @@ -0,0 +1,75 @@ +# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 +"""Test secret-freedom related functionality.""" + +import pytest + +from framework import defs +from framework.microvm import Serial +from framework.properties import global_props +from integration_tests.performance.test_initrd import INITRD_FILESYSTEM + +pytestmark = [ + pytest.mark.skipif( + global_props.host_linux_version_metrics != "next", + reason="Secret Freedom is only supported on the in-dev upstream kernels for now", + ), + pytest.mark.skipif( + global_props.instance == "m6g.metal", + reason="Secret Freedom currently only works on ARM hardware conforming to at least ARMv8.4 as absense of ARM64_HAS_STAGE2_FWB causes kernel panics because of dcache flushing during stage2 page table entry installation", + ), +] + + +def test_secret_free_boot(microvm_factory, guest_kernel, rootfs): + """Tests that a VM can boot, e.g. some basic I/O works through userspace bounce buffers""" + vm = microvm_factory.build(guest_kernel, rootfs) + vm.spawn() + vm.memory_monitor = None + vm.basic_config(secret_free=True) + vm.add_net_iface() + vm.start() + + +def test_secret_free_initrd(microvm_factory, guest_kernel): + """ + Test that we can boot a secret hidden initrd (e.g. 
a VM with no I/O devices) + """ + fs = defs.ARTIFACT_DIR / "initramfs.cpio" + uvm = microvm_factory.build(guest_kernel) + uvm.initrd_file = fs + uvm.help.enable_console() + uvm.spawn() + uvm.memory_monitor = None + + uvm.basic_config( + add_root_device=False, + vcpu_count=1, + boot_args="console=ttyS0 reboot=k panic=1 pci=off", + use_initrd=True, + secret_free=True, + ) + + uvm.start() + serial = Serial(uvm) + serial.open() + serial.rx(token="# ") + serial.tx("mount |grep rootfs") + serial.rx(token=f"rootfs on / type {INITRD_FILESYSTEM}") + + +def test_secret_free_snapshot_creation(microvm_factory, guest_kernel, rootfs): + """Test that snapshot creation works for secret hidden VMs""" + vm = microvm_factory.build(guest_kernel, rootfs) + vm.spawn() + vm.memory_monitor = None + vm.basic_config(secret_free=True) + vm.add_net_iface() + vm.start() + + snapshot = vm.snapshot_full() + + # After restoration, the VM will not be secret hidden anymore, as that's not supported yet. + # But we can at least test that in principle, the snapshot creation worked. + vm = microvm_factory.build_from_snapshot(snapshot) + vm.ssh.check_output("true") diff --git a/tests/integration_tests/functional/test_shut_down.py b/tests/integration_tests/functional/test_shut_down.py index 591f04e4593..2f9bcd6572d 100644 --- a/tests/integration_tests/functional/test_shut_down.py +++ b/tests/integration_tests/functional/test_shut_down.py @@ -4,11 +4,17 @@ import platform +import pytest from packaging import version from framework import utils +from framework.properties import global_props +@pytest.mark.skipif( + global_props.host_linux_version_tpl > (6, 1), + reason="The number of threads associated to firecracker changes in newer kernels", +) def test_reboot(uvm_plain_any): """ Test reboot from guest. 
diff --git a/tests/integration_tests/performance/test_block_ab.py b/tests/integration_tests/performance/test_block_ab.py index f38872cdc94..aa1facb634c 100644 --- a/tests/integration_tests/performance/test_block_ab.py +++ b/tests/integration_tests/performance/test_block_ab.py @@ -163,14 +163,20 @@ def test_block_performance( fio_block_size, fio_engine, io_engine, + secret_free, metrics, ): """ Execute block device emulation benchmarking scenarios. """ + if secret_free and io_engine == "Async": + pytest.skip("userspace bounce buffers not supported with async block engine") + vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False) vm.spawn(log_level="Info", emit_metrics=True) - vm.basic_config(vcpu_count=vcpus, mem_size_mib=GUEST_MEM_MIB) + vm.basic_config( + vcpu_count=vcpus, mem_size_mib=GUEST_MEM_MIB, secret_free=secret_free + ) vm.add_net_iface() # Add a secondary block device for benchmark tests. fs = drive_tools.FilesystemFile( diff --git a/tests/integration_tests/performance/test_boottime.py b/tests/integration_tests/performance/test_boottime.py index 3bf74e3607a..30568dea1e9 100644 --- a/tests/integration_tests/performance/test_boottime.py +++ b/tests/integration_tests/performance/test_boottime.py @@ -8,8 +8,6 @@ import pytest -from framework.properties import global_props - # Regex for obtaining boot time from some string. 
TIMESTAMP_LOG_REGEX = r"Guest-boot-time\s+\=\s+(\d+)\s+us" @@ -19,14 +17,6 @@ ) -DIMENSIONS = { - "instance": global_props.instance, - "cpu_model": global_props.cpu_model, - "host_os": global_props.host_os, - "host_kernel": "linux-" + global_props.host_linux_version, -} - - def _get_microvm_boottime(vm): """Auxiliary function for asserting the expected boot time.""" boot_time_us = None @@ -75,22 +65,18 @@ def find_events(log_data): ) @pytest.mark.nonci def test_boottime( - microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib, metrics + microvm_factory, + guest_kernel_acpi, + rootfs_rw, + vcpu_count, + mem_size_mib, + secret_free, + metrics, ): """Test boot time with different guest configurations""" - metrics.set_dimensions( - { - **DIMENSIONS, - "performance_test": "test_boottime", - "guest_kernel": guest_kernel_acpi.name, - "vcpus": str(vcpu_count), - "mem_size_mib": str(mem_size_mib), - } - ) - for _ in range(10): - vm = microvm_factory.build(guest_kernel_acpi, rootfs_rw) + vm = microvm_factory.build(guest_kernel_acpi, rootfs_rw, monitor_memory=False) vm.jailer.extra_args.update({"boot-timer": None}) vm.spawn() vm.basic_config( @@ -98,10 +84,14 @@ def test_boottime( mem_size_mib=mem_size_mib, boot_args=DEFAULT_BOOT_ARGS + " init=/usr/local/bin/init", enable_entropy_device=True, + secret_free=secret_free, ) vm.add_net_iface() vm.start() vm.pin_threads(0) + + metrics.set_dimensions({"performance_test": "test_boottime", **vm.dimensions}) + boottime_us = _get_microvm_boottime(vm) metrics.put_metric("boot_time", boottime_us, unit="Microseconds") timestamps = find_events(vm.log_data) diff --git a/tests/integration_tests/performance/test_huge_pages.py b/tests/integration_tests/performance/test_huge_pages.py index 6015cf6032b..f5ddfe23786 100644 --- a/tests/integration_tests/performance/test_huge_pages.py +++ b/tests/integration_tests/performance/test_huge_pages.py @@ -55,6 +55,11 @@ def check_hugetlbfs_in_use(pid: int, allocation_name: str): assert 
kernel_page_size_kib > 4 +@pytest.mark.skipif( + global_props.host_linux_version_tpl > (6, 1) + and global_props.cpu_architecture == "aarch64", + reason="Huge page tests with secret hidden kernels on ARM currently fail", +) def test_hugetlbfs_boot(uvm_plain): """Tests booting a microvm with guest memory backed by 2MB hugetlbfs pages""" @@ -69,6 +74,11 @@ def test_hugetlbfs_boot(uvm_plain): ) +@pytest.mark.skipif( + global_props.host_linux_version_tpl > (6, 1) + and global_props.cpu_architecture == "aarch64", + reason="Huge page tests with secret hidden kernels on ARM currently fail", +) def test_hugetlbfs_snapshot(microvm_factory, guest_kernel_linux_5_10, rootfs): """ Test hugetlbfs snapshot restore via uffd @@ -100,6 +110,11 @@ def test_hugetlbfs_snapshot(microvm_factory, guest_kernel_linux_5_10, rootfs): check_hugetlbfs_in_use(vm.firecracker_pid, "/anon_hugepage") +@pytest.mark.skipif( + global_props.host_linux_version_tpl > (6, 1) + and global_props.cpu_architecture == "aarch64", + reason="Huge page tests with secret hidden kernels on ARM currently fail", +) def test_hugetlbfs_diff_snapshot(microvm_factory, uvm_plain): """ Test hugetlbfs differential snapshot support. @@ -142,6 +157,11 @@ def test_hugetlbfs_diff_snapshot(microvm_factory, uvm_plain): # Verify if the restored microvm works. 
+@pytest.mark.skipif( + global_props.host_linux_version_tpl > (6, 1) + and global_props.cpu_architecture == "aarch64", + reason="Huge page tests with secret hidden kernels on ARM currently fail", +) @pytest.mark.parametrize("huge_pages", HugePagesConfig) def test_ept_violation_count( microvm_factory, @@ -221,6 +241,11 @@ def test_ept_violation_count( metrics.put_metric(metric, int(metric_value), "Count") +@pytest.mark.skipif( + global_props.host_linux_version_tpl > (6, 1) + and global_props.cpu_architecture == "aarch64", + reason="Huge page tests with secret hidden kernels on ARM currently fail", +) def test_negative_huge_pages_plus_balloon(uvm_plain): """Tests that huge pages and memory ballooning cannot be used together""" uvm_plain.memory_monitor = None @@ -230,7 +255,7 @@ def test_negative_huge_pages_plus_balloon(uvm_plain): uvm_plain.basic_config(huge_pages=HugePagesConfig.HUGETLBFS_2MB) with pytest.raises( RuntimeError, - match="Firecracker's huge pages support is incompatible with memory ballooning.", + match="Memory ballooning is incompatible with huge pages.", ): uvm_plain.api.balloon.put(amount_mib=0, deflate_on_oom=False) @@ -239,6 +264,6 @@ def test_negative_huge_pages_plus_balloon(uvm_plain): uvm_plain.api.balloon.put(amount_mib=0, deflate_on_oom=False) with pytest.raises( RuntimeError, - match="Machine config error: Firecracker's huge pages support is incompatible with memory ballooning.", + match="Machine config error: 'balloon device' and 'huge pages' are mutually exclusive and cannot be used together.", ): uvm_plain.basic_config(huge_pages=HugePagesConfig.HUGETLBFS_2MB) diff --git a/tests/integration_tests/performance/test_initrd.py b/tests/integration_tests/performance/test_initrd.py index 3845e5610c0..28df8159155 100644 --- a/tests/integration_tests/performance/test_initrd.py +++ b/tests/integration_tests/performance/test_initrd.py @@ -4,6 +4,7 @@ import pytest from framework.microvm import HugePagesConfig, Serial +from framework.properties 
import global_props INITRD_FILESYSTEM = "rootfs" @@ -20,6 +21,11 @@ def uvm_with_initrd(microvm_factory, guest_kernel, record_property, artifact_dir yield uvm +@pytest.mark.skipif( + global_props.host_linux_version_tpl > (6, 1) + and global_props.cpu_architecture == "aarch64", + reason="Huge page tests with secret hidden kernels on ARM currently fail", +) @pytest.mark.parametrize("huge_pages", HugePagesConfig) def test_microvm_initrd_with_serial(uvm_with_initrd, huge_pages): """ diff --git a/tests/integration_tests/performance/test_network_ab.py b/tests/integration_tests/performance/test_network_ab.py index 3a50d864544..3ac14a2c16f 100644 --- a/tests/integration_tests/performance/test_network_ab.py +++ b/tests/integration_tests/performance/test_network_ab.py @@ -36,7 +36,7 @@ def consume_ping_output(ping_putput, request_per_round): @pytest.fixture -def network_microvm(request, microvm_factory, guest_kernel_acpi, rootfs): +def network_microvm(request, microvm_factory, guest_kernel_acpi, rootfs, secret_free): """Creates a microvm with the networking setup used by the performance tests in this file. This fixture receives its vcpu count via indirect parameterization""" @@ -45,7 +45,9 @@ def network_microvm(request, microvm_factory, guest_kernel_acpi, rootfs): vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False) vm.spawn(log_level="Info", emit_metrics=True) - vm.basic_config(vcpu_count=guest_vcpus, mem_size_mib=guest_mem_mib) + vm.basic_config( + vcpu_count=guest_vcpus, mem_size_mib=guest_mem_mib, secret_free=secret_free + ) vm.add_net_iface() vm.start() vm.pin_threads(0) diff --git a/tests/integration_tests/performance/test_snapshot_ab.py b/tests/integration_tests/performance/test_snapshot_ab.py index b4f1b8f15dc..fb8d7eee880 100644 --- a/tests/integration_tests/performance/test_snapshot_ab.py +++ b/tests/integration_tests/performance/test_snapshot_ab.py @@ -1,6 +1,7 @@ # Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
# SPDX-License-Identifier: Apache-2.0 """Performance benchmark for snapshot restore.""" + import re import signal import tempfile @@ -12,6 +13,7 @@ import host_tools.drive as drive_tools from framework.microvm import HugePagesConfig, Microvm +from framework.properties import global_props USEC_IN_MSEC = 1000 NS_IN_MSEC = 1_000_000 @@ -105,6 +107,15 @@ def test_restore_latency( We only test a single guest kernel, as the guest kernel does not "participate" in snapshot restore. """ + if ( + test_setup.huge_pages != HugePagesConfig.NONE + and global_props.host_linux_version_tpl > (6, 1) + and global_props.cpu_architecture == "aarch64" + ): + pytest.skip( + "huge pages with secret hiding kernels on ARM are currently failing" + ) + vm = test_setup.boot_vm(microvm_factory, guest_kernel_linux_5_10, rootfs) metrics.set_dimensions( @@ -153,6 +164,15 @@ def test_post_restore_latency( if huge_pages != HugePagesConfig.NONE and uffd_handler is None: pytest.skip("huge page snapshots can only be restored using uffd") + if ( + huge_pages != HugePagesConfig.NONE + and global_props.host_linux_version_tpl > (6, 1) + and global_props.cpu_architecture == "aarch64" + ): + pytest.skip( + "huge pages with secret hiding kernels on ARM are currently failing" + ) + test_setup = SnapshotRestoreTest(mem=1024, vcpus=2, huge_pages=huge_pages) vm = test_setup.boot_vm(microvm_factory, guest_kernel_linux_5_10, rootfs) @@ -208,6 +228,15 @@ def test_population_latency( mem, ): """Collects population latency metrics (e.g. 
how long it takes UFFD handler to fault in all memory)""" + if ( + huge_pages != HugePagesConfig.NONE + and global_props.host_linux_version_tpl > (6, 1) + and global_props.cpu_architecture == "aarch64" + ): + pytest.skip( + "huge pages with secret hiding kernels on ARM are currently failing" + ) + test_setup = SnapshotRestoreTest(mem=mem, vcpus=vcpus, huge_pages=huge_pages) vm = test_setup.boot_vm(microvm_factory, guest_kernel_linux_5_10, rootfs) diff --git a/tests/integration_tests/performance/test_vsock_ab.py b/tests/integration_tests/performance/test_vsock_ab.py index bcee05528af..9cd08e312d5 100644 --- a/tests/integration_tests/performance/test_vsock_ab.py +++ b/tests/integration_tests/performance/test_vsock_ab.py @@ -73,7 +73,14 @@ def guest_command(self, port_offset): @pytest.mark.parametrize("payload_length", ["64K", "1024K"], ids=["p64K", "p1024K"]) @pytest.mark.parametrize("mode", ["g2h", "h2g", "bd"]) def test_vsock_throughput( - microvm_factory, guest_kernel_acpi, rootfs, vcpus, payload_length, mode, metrics + microvm_factory, + guest_kernel_acpi, + rootfs, + vcpus, + payload_length, + mode, + metrics, + secret_free, ): """ Test vsock throughput for multiple vm configurations. 
@@ -87,7 +94,9 @@ def test_vsock_throughput( mem_size_mib = 1024 vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False) vm.spawn(log_level="Info", emit_metrics=True) - vm.basic_config(vcpu_count=vcpus, mem_size_mib=mem_size_mib) + vm.basic_config( + vcpu_count=vcpus, mem_size_mib=mem_size_mib, secret_free=secret_free + ) vm.add_net_iface() # Create a vsock device vm.api.vsock.put(vsock_id="vsock0", guest_cid=3, uds_path="/" + VSOCK_UDS_PATH) diff --git a/tests/pytest.ini b/tests/pytest.ini index 5656c8eee4d..930c4891814 100644 --- a/tests/pytest.ini +++ b/tests/pytest.ini @@ -5,12 +5,13 @@ addopts = -vv --durations=10 --showlocals - -m 'not nonci and not no_block_pr' + -m 'not nonci and not no_block_pr and not secret_hiding' --json-report --json-report-file=../test_results/test-report.json markers = no_block_pr: tests whose failure does not block PR merging. nonci: mark test as nonci. + secret_hiding: tests related to secret hiding. ; Overwrite the default norecursedirs, which includes 'build'. norecursedirs = .* diff --git a/tools/setup-ci-artifacts.sh b/tools/setup-ci-artifacts.sh index 0d524658b51..079eda888d8 100755 --- a/tools/setup-ci-artifacts.sh +++ b/tools/setup-ci-artifacts.sh @@ -12,7 +12,7 @@ say "Setup CI artifacts" cd build/img/$(uname -m) say "Fix executable permissions" -find "firecracker" -type f |xargs chmod -c 755 +find "firecracker" -type f |xargs chmod -c 755 || true say "Generate SSH key to connect from host" if [ ! -s id_rsa ]; then