From d43d3bdfd8ee66056434ba538ae59a495d042e7f Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Wed, 19 Mar 2025 15:26:17 +0000 Subject: [PATCH 01/40] ci: Create script for installing custom kernel Creating a script to build and install a modified kernel with patches applied. Signed-off-by: Jack Thomson --- .../hiding_ci/build_and_install_kernel.sh | 170 ++++++++++++++++++ resources/hiding_ci/kernel_commit_hash | 1 + resources/hiding_ci/kernel_config_overrides | 6 + resources/hiding_ci/kernel_url | 1 + resources/hiding_ci/patches/0001.lore | 1 + 5 files changed, 179 insertions(+) create mode 100755 resources/hiding_ci/build_and_install_kernel.sh create mode 100644 resources/hiding_ci/kernel_commit_hash create mode 100644 resources/hiding_ci/kernel_config_overrides create mode 100644 resources/hiding_ci/kernel_url create mode 100644 resources/hiding_ci/patches/0001.lore diff --git a/resources/hiding_ci/build_and_install_kernel.sh b/resources/hiding_ci/build_and_install_kernel.sh new file mode 100755 index 00000000000..c898a581384 --- /dev/null +++ b/resources/hiding_ci/build_and_install_kernel.sh @@ -0,0 +1,170 @@ +#!/bin/bash +# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +# fail if we encounter an error, uninitialized variable or a pipe breaks +set -eu -o pipefail + +check_root() { + # We need sudo privileges to install the kernel + if [ "$(id -u)" -ne 0 ]; then + echo "To install, this script must be run as root or with sudo privileges" + exit 1 + fi +} + +check_ubuntu() { + # Currently this script only works on Ubuntu instances + if ! grep -qi 'ubuntu' /etc/os-release; then + echo "This script currently only works on Ubuntu." + exit 1 + fi +} + +tidy_up() { + # Some cleanup after we are done + echo "Cleaning up.." + popd + rm -rf $TMP_BUILD_DIR +} + +confirm() { + if [[ "$*" == *"--no-install"* ]]; then + echo "Not installing new kernel." 
+ + if [[ "$*" == *"--tidy"* ]]; then + tidy_up + fi + + exit 0 + fi + + if [[ "$*" == *"--install"* ]]; then + return 0 + fi + + while true; do + read -p "Do you want to install the new kernel? (y/n) " yn + case $yn in + [Yy]*) return 0 ;; + [Nn]*) + echo "Exiting..." + exit 1 + ;; + *) echo "Please answer yes or no." ;; + esac + done +} + +apply_patch_file() { + git apply $1 +} + +apply_series_mbox() { + git am $1 --empty=drop +} + +apply_series_link() { + patch_url=$(cat $1) + echo "Fetching mbox from:" $patch_url + curl --output lore.mbox.gz "$patch_url/t.mbox.gz" + gunzip lore.mbox + apply_series_mbox lore.mbox + rm lore.mbox +} + +apply_patch_or_series() { + case "$1" in + *.patch) apply_patch_file $1 ;; + *.mbox) apply_series_mbox $1 ;; + *.lore) apply_series_link $1 ;; + *) + echo "Uknown patch file: "$1 + exit 1 + ;; + esac +} + +check_override_presence() { + while IFS= read -r line; do + if ! grep -Fq "$line" .config; then + echo "Missing config: $line" + exit 1 + fi + done <"$KERNEL_CONFIG_OVERRIDES" + + echo "All overrides correctly applied.." +} + +KERNEL_URL=$(cat kernel_url) +KERNEL_COMMIT_HASH=$(cat kernel_commit_hash) +KERNEL_PATCHES_DIR=$(pwd)/patches +KERNEL_CONFIG_OVERRIDES=$(pwd)/kernel_config_overrides + +TMP_BUILD_DIR=$(mktemp -d -t kernel-build-XXXX) + +pushd . 
+cd $TMP_BUILD_DIR + +echo "Cloning kernel repository into" $TMP_BUILD_DIR + +# We checkout the repository that way to make it as +# small and fast as possible +git init +git remote add origin $KERNEL_URL +git fetch --depth 1 origin $KERNEL_COMMIT_HASH +git checkout FETCH_HEAD + +# Apply our patches on top +for PATCH in $KERNEL_PATCHES_DIR/*.*; do + echo "Applying patch:" $(basename $PATCH) + apply_patch_or_series $PATCH +done + +echo "Making kernel config ready for build" +# We use olddefconfig to automatically pull in the +# config from the AMI and update to the newest +# defaults +make olddefconfig + +# Disable the ubuntu keys +scripts/config --disable SYSTEM_TRUSTED_KEYS +scripts/config --disable SYSTEM_REVOCATION_KEYS + +# We run this again to default options now changed by +# the disabling of the ubuntu keys +make olddefconfig + +# Apply our config overrides on top of the config +scripts/kconfig/merge_config.sh -m .config $KERNEL_CONFIG_OVERRIDES + +check_override_presence + +echo "Building kernel this may take a while" +make -s -j $(nproc) +echo "Building kernel modules" +make modules -s -j $(nproc) +echo "Kernel build complete!" + +KERNEL_VERSION=$(KERNELVERSION=$(make -s kernelversion) ./scripts/setlocalversion) + +echo "New kernel version:" $KERNEL_VERSION + +# Make sure a user really wants to install this kernel +confirm "$@" + +check_root +check_ubuntu + +echo "Installing kernel modules..." +make INSTALL_MOD_STRIP=1 modules_install +echo "Installing kernel..." +make INSTALL_MOD_STRIP=1 install +echo "Update initramfs" +update-initramfs -c -k $KERNEL_VERSION +echo "Updating GRUB..." +update-grub + +echo "Kernel built and installed successfully!" 
+ +tidy_up diff --git a/resources/hiding_ci/kernel_commit_hash b/resources/hiding_ci/kernel_commit_hash new file mode 100644 index 00000000000..39d6afaaf51 --- /dev/null +++ b/resources/hiding_ci/kernel_commit_hash @@ -0,0 +1 @@ +4701f33a10702d5fc577c32434eb62adde0a1ae1 diff --git a/resources/hiding_ci/kernel_config_overrides b/resources/hiding_ci/kernel_config_overrides new file mode 100644 index 00000000000..e42464abb89 --- /dev/null +++ b/resources/hiding_ci/kernel_config_overrides @@ -0,0 +1,6 @@ +CONFIG_EXPERT=y +CONFIG_KVM=y +CONFIG_KVM_SW_PROTECTED_VM=y +CONFIG_KVM_PRIVATE_MEM=y +CONFIG_KVM_AMD_SEV=y +CONFIG_DEBUG_INFO=y diff --git a/resources/hiding_ci/kernel_url b/resources/hiding_ci/kernel_url new file mode 100644 index 00000000000..ce6e1a3e6a8 --- /dev/null +++ b/resources/hiding_ci/kernel_url @@ -0,0 +1 @@ +git://git.kernel.org/pub/scm/virt/kvm/kvm.git diff --git a/resources/hiding_ci/patches/0001.lore b/resources/hiding_ci/patches/0001.lore new file mode 100644 index 00000000000..7663841026d --- /dev/null +++ b/resources/hiding_ci/patches/0001.lore @@ -0,0 +1 @@ +https://lore.kernel.org/kvm/20250318161823.4005529-1-tabba@google.com From 130080c0fb707d83faa2fbbe3475e52db005c95b Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Mon, 24 Mar 2025 15:56:05 +0000 Subject: [PATCH 02/40] test: Add test for kernel build Adding a new integration test to assert that the kernel build script will succeed. 
Signed-off-by: Jack Thomson --- .buildkite/pipeline_pr.py | 9 ++++++ tests/README.md | 2 ++ .../build/test_hiding_kernel.py | 29 +++++++++++++++++++ tests/pytest.ini | 3 +- 4 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 tests/integration_tests/build/test_hiding_kernel.py diff --git a/.buildkite/pipeline_pr.py b/.buildkite/pipeline_pr.py index 7f85f777c6b..f5818f8bd02 100755 --- a/.buildkite/pipeline_pr.py +++ b/.buildkite/pipeline_pr.py @@ -68,6 +68,15 @@ for step in kani_grp["steps"]: step["label"] = "🔍 Kani" +if any(x.parent.name == "hiding_ci" for x in changed_files): + pipeline.build_group_per_arch( + "🕵️ Build Secret Hiding Kernel", + pipeline.devtool_test( + pytest_opts="-m secret_hiding integration_tests/build/test_hiding_kernel.py", + ), + depends_on_build=False, + ) + if run_all_tests(changed_files): pipeline.build_group( "📦 Build", diff --git a/tests/README.md b/tests/README.md index c306566392f..8e93ebac4be 100644 --- a/tests/README.md +++ b/tests/README.md @@ -340,6 +340,8 @@ which tests are run in which context: in separate pipelines according to various cron schedules. - Tests marked as `no_block_pr` are run in the "optional" PR CI pipeline. This pipeline is not required to pass for merging a PR. +- Tests marked as `secret_hiding` are secret hiding specifc tests. They don't + run by default. All tests without markers are run for every pull request, and are required to pass for the PR to be merged. diff --git a/tests/integration_tests/build/test_hiding_kernel.py b/tests/integration_tests/build/test_hiding_kernel.py new file mode 100644 index 00000000000..a85a73143cb --- /dev/null +++ b/tests/integration_tests/build/test_hiding_kernel.py @@ -0,0 +1,29 @@ +# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 +"""A test which checks that the secret hiding enable kernel builds successfully.""" + +import pytest + +from framework import utils + + +@pytest.mark.timeout(600) +@pytest.mark.secret_hiding +def test_build_hiding_kernel(): + """ + In the test we will run our kernel build script to check it succeeds and builds the hidden kernel + """ + + # We have some extra deps for building the kernel that are not in the dev contaner + utils.check_output( + "apt install -y build-essential libncurses-dev bison flex libssl-dev libelf-dev bc dwarves libncurses5-dev kmod fakeroot" + ) + + # We have to configure git otherwise patch application fails + # the git log still credits the original author + utils.check_output('git config --global user.name "Firecracker CI"') + utils.check_output('git config --global user.email "ci@email.com"') + + utils.check_output( + "cd ../resources/hiding_ci; ./build_and_install_kernel.sh --no-install --tidy" + ) diff --git a/tests/pytest.ini b/tests/pytest.ini index 5656c8eee4d..930c4891814 100644 --- a/tests/pytest.ini +++ b/tests/pytest.ini @@ -5,12 +5,13 @@ addopts = -vv --durations=10 --showlocals - -m 'not nonci and not no_block_pr' + -m 'not nonci and not no_block_pr and not secret_hiding' --json-report --json-report-file=../test_results/test-report.json markers = no_block_pr: tests whose failure does not block PR merging. nonci: mark test as nonci. + secret_hiding: tests related to secret hiding. ; Overwrite the default norecursedirs, which includes 'build'. norecursedirs = .* From 1ea914a925e2e2a58082f375efe95bfe83e4907a Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Tue, 25 Mar 2025 13:15:29 +0000 Subject: [PATCH 03/40] ci: Add secret hiding kernel to defaults buildkite Adding the secret hiding kernel as a default for the buildkite pipeline, this will mean that PR's made against the branch will now be run with the new secret hiding enabled amis. 
Some tests have been marked to skip as they are kernel dependent so while we are compiling our kernel in CI these could change again. Signed-off-by: Jack Thomson --- .buildkite/common.py | 1 + .../test_cpu_features_host_vs_guest.py | 6 +++++ .../functional/test_shut_down.py | 6 +++++ .../performance/test_huge_pages.py | 25 +++++++++++++++++++ .../performance/test_initrd.py | 6 +++++ 5 files changed, 44 insertions(+) diff --git a/.buildkite/common.py b/.buildkite/common.py index a979638e472..7013406a0f0 100644 --- a/.buildkite/common.py +++ b/.buildkite/common.py @@ -33,6 +33,7 @@ DEFAULT_PLATFORMS = [ ("al2", "linux_5.10"), ("al2023", "linux_6.1"), + ("ubuntu24", "secret_hiding"), ] diff --git a/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py b/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py index 4b66b077839..86c7e384b58 100644 --- a/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py +++ b/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py @@ -15,6 +15,8 @@ import os +import pytest + from framework import utils from framework.properties import global_props from framework.utils_cpuid import CPU_FEATURES_CMD, CpuModel @@ -157,6 +159,10 @@ } +@pytest.mark.skipif( + global_props.host_linux_version_tpl > (6, 1), + reason="We don't currently track features for host kernels above 6.1.", +) def test_host_vs_guest_cpu_features(uvm_plain_any): """Check CPU features host vs guest""" diff --git a/tests/integration_tests/functional/test_shut_down.py b/tests/integration_tests/functional/test_shut_down.py index 591f04e4593..2f9bcd6572d 100644 --- a/tests/integration_tests/functional/test_shut_down.py +++ b/tests/integration_tests/functional/test_shut_down.py @@ -4,11 +4,17 @@ import platform +import pytest from packaging import version from framework import utils +from framework.properties import global_props +@pytest.mark.skipif( + global_props.host_linux_version_tpl > (6, 1), + reason="The number 
of threads associated to firecracker changes in newer kernels", +) def test_reboot(uvm_plain_any): """ Test reboot from guest. diff --git a/tests/integration_tests/performance/test_huge_pages.py b/tests/integration_tests/performance/test_huge_pages.py index 6015cf6032b..d683afe065e 100644 --- a/tests/integration_tests/performance/test_huge_pages.py +++ b/tests/integration_tests/performance/test_huge_pages.py @@ -55,6 +55,11 @@ def check_hugetlbfs_in_use(pid: int, allocation_name: str): assert kernel_page_size_kib > 4 +@pytest.mark.skipif( + global_props.host_linux_version_tpl > (6, 1) + and global_props.cpu_architecture == "aarch64", + reason="Huge page tests with secret hidden kernels on ARM currently fail", +) def test_hugetlbfs_boot(uvm_plain): """Tests booting a microvm with guest memory backed by 2MB hugetlbfs pages""" @@ -69,6 +74,11 @@ def test_hugetlbfs_boot(uvm_plain): ) +@pytest.mark.skipif( + global_props.host_linux_version_tpl > (6, 1) + and global_props.cpu_architecture == "aarch64", + reason="Huge page tests with secret hidden kernels on ARM currently fail", +) def test_hugetlbfs_snapshot(microvm_factory, guest_kernel_linux_5_10, rootfs): """ Test hugetlbfs snapshot restore via uffd @@ -100,6 +110,11 @@ def test_hugetlbfs_snapshot(microvm_factory, guest_kernel_linux_5_10, rootfs): check_hugetlbfs_in_use(vm.firecracker_pid, "/anon_hugepage") +@pytest.mark.skipif( + global_props.host_linux_version_tpl > (6, 1) + and global_props.cpu_architecture == "aarch64", + reason="Huge page tests with secret hidden kernels on ARM currently fail", +) def test_hugetlbfs_diff_snapshot(microvm_factory, uvm_plain): """ Test hugetlbfs differential snapshot support. @@ -142,6 +157,11 @@ def test_hugetlbfs_diff_snapshot(microvm_factory, uvm_plain): # Verify if the restored microvm works. 
+@pytest.mark.skipif( + global_props.host_linux_version_tpl > (6, 1) + and global_props.cpu_architecture == "aarch64", + reason="Huge page tests with secret hidden kernels on ARM currently fail", +) @pytest.mark.parametrize("huge_pages", HugePagesConfig) def test_ept_violation_count( microvm_factory, @@ -221,6 +241,11 @@ def test_ept_violation_count( metrics.put_metric(metric, int(metric_value), "Count") +@pytest.mark.skipif( + global_props.host_linux_version_tpl > (6, 1) + and global_props.cpu_architecture == "aarch64", + reason="Huge page tests with secret hidden kernels on ARM currently fail", +) def test_negative_huge_pages_plus_balloon(uvm_plain): """Tests that huge pages and memory ballooning cannot be used together""" uvm_plain.memory_monitor = None diff --git a/tests/integration_tests/performance/test_initrd.py b/tests/integration_tests/performance/test_initrd.py index 3845e5610c0..28df8159155 100644 --- a/tests/integration_tests/performance/test_initrd.py +++ b/tests/integration_tests/performance/test_initrd.py @@ -4,6 +4,7 @@ import pytest from framework.microvm import HugePagesConfig, Serial +from framework.properties import global_props INITRD_FILESYSTEM = "rootfs" @@ -20,6 +21,11 @@ def uvm_with_initrd(microvm_factory, guest_kernel, record_property, artifact_dir yield uvm +@pytest.mark.skipif( + global_props.host_linux_version_tpl > (6, 1) + and global_props.cpu_architecture == "aarch64", + reason="Huge page tests with secret hidden kernels on ARM currently fail", +) @pytest.mark.parametrize("huge_pages", HugePagesConfig) def test_microvm_initrd_with_serial(uvm_with_initrd, huge_pages): """ From 421da221134104218e9eb9db0a4829dd04d2ff87 Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Wed, 26 Mar 2025 14:43:55 +0000 Subject: [PATCH 04/40] tests: Mark kernels newer than 6.12 as next To make it easier to track the upstream kernels which may change as we rebase, let's mark kernels newer than 6.12 as next for now to make dashboarding easier. 
Signed-off-by: Jack Thomson --- tests/conftest.py | 6 +++--- tests/framework/microvm.py | 2 +- tests/framework/properties.py | 7 +++++++ tests/host_tools/fcmetrics.py | 2 +- tests/integration_tests/performance/test_boottime.py | 2 +- 5 files changed, 13 insertions(+), 6 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 99d2e5c4344..4591cd8112d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -134,7 +134,7 @@ def pytest_runtest_logreport(report): "test": report.nodeid, "instance": global_props.instance, "cpu_model": global_props.cpu_model, - "host_kernel": "linux-" + global_props.host_linux_version, + "host_kernel": "linux-" + global_props.host_linux_version_metrics, "phase": report.when, }, # per test @@ -142,12 +142,12 @@ def pytest_runtest_logreport(report): "test": report.nodeid, "instance": global_props.instance, "cpu_model": global_props.cpu_model, - "host_kernel": "linux-" + global_props.host_linux_version, + "host_kernel": "linux-" + global_props.host_linux_version_metrics, }, # per phase {"phase": report.when}, # per host kernel - {"host_kernel": "linux-" + global_props.host_linux_version}, + {"host_kernel": "linux-" + global_props.host_linux_version_metrics}, # per CPU {"cpu_model": global_props.cpu_model}, # and global diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index 6c550e8c687..00ceaac82c2 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -458,7 +458,7 @@ def dimensions(self): return { "instance": global_props.instance, "cpu_model": global_props.cpu_model, - "host_kernel": f"linux-{global_props.host_linux_version}", + "host_kernel": f"linux-{global_props.host_linux_version_metrics}", "guest_kernel": self.kernel_file.stem[2:], "rootfs": self.rootfs_file.name, "vcpus": str(self.vcpus_count), diff --git a/tests/framework/properties.py b/tests/framework/properties.py index c7c9dfe789d..83ff9dcdce2 100644 --- a/tests/framework/properties.py +++ b/tests/framework/properties.py @@ 
-102,6 +102,13 @@ def host_linux_version_tpl(self): """Host Linux version major.minor, as a tuple for easy comparison""" return tuple(int(x) for x in self.host_linux_version.split(".")) + @property + def host_linux_version_metrics(self): + """Host Linux version to be reported in metrics""" + return ( + "next" if self.host_linux_version_tpl > (6, 12) else self.host_linux_version + ) + @property def is_ec2(self): """Are we running on an EC2 instance?""" diff --git a/tests/host_tools/fcmetrics.py b/tests/host_tools/fcmetrics.py index 47661d5b27d..5aa247f40b7 100644 --- a/tests/host_tools/fcmetrics.py +++ b/tests/host_tools/fcmetrics.py @@ -508,7 +508,7 @@ def __init__(self, vm, timer=60): self.metrics_logger.set_dimensions( { "instance": global_props.instance, - "host_kernel": "linux-" + global_props.host_linux_version, + "host_kernel": "linux-" + global_props.host_linux_version_metrics, "guest_kernel": vm.kernel_file.stem[2:], } ) diff --git a/tests/integration_tests/performance/test_boottime.py b/tests/integration_tests/performance/test_boottime.py index 3bf74e3607a..4a2e6b61b70 100644 --- a/tests/integration_tests/performance/test_boottime.py +++ b/tests/integration_tests/performance/test_boottime.py @@ -23,7 +23,7 @@ "instance": global_props.instance, "cpu_model": global_props.cpu_model, "host_os": global_props.host_os, - "host_kernel": "linux-" + global_props.host_linux_version, + "host_kernel": "linux-" + global_props.host_linux_version_metrics, } From d59cbb74eafbdb791053af2645a8cb1a297d0f3e Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Thu, 27 Mar 2025 13:43:56 +0000 Subject: [PATCH 05/40] tests: Skip more huge page tests on ARM This test is failing for ARM instances in our performance pipeline, skipping this for now until we resolve the issue. 
Signed-off-by: Jack Thomson --- .../integration_tests/performance/test_snapshot_ab.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/integration_tests/performance/test_snapshot_ab.py b/tests/integration_tests/performance/test_snapshot_ab.py index b4f1b8f15dc..e8a24a43b50 100644 --- a/tests/integration_tests/performance/test_snapshot_ab.py +++ b/tests/integration_tests/performance/test_snapshot_ab.py @@ -12,6 +12,7 @@ import host_tools.drive as drive_tools from framework.microvm import HugePagesConfig, Microvm +from framework.properties import global_props USEC_IN_MSEC = 1000 NS_IN_MSEC = 1_000_000 @@ -153,6 +154,15 @@ def test_post_restore_latency( if huge_pages != HugePagesConfig.NONE and uffd_handler is None: pytest.skip("huge page snapshots can only be restored using uffd") + if ( + huge_pages != HugePagesConfig.NONE + and global_props.host_linux_version_tpl > (6, 1) + and global_props.cpu_architecture == "aarch64" + ): + pytest.skip( + "huge pages with secret hiding kernels on ARM are currently failing" + ) + test_setup = SnapshotRestoreTest(mem=1024, vcpus=2, huge_pages=huge_pages) vm = test_setup.boot_vm(microvm_factory, guest_kernel_linux_5_10, rootfs) From 312976817018e2bef73e4e9be2385735fd8bd4f2 Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Thu, 27 Mar 2025 13:56:03 +0000 Subject: [PATCH 06/40] ci: Move away from using dir stacks Addressing a comment to move away from dir stacks in our install scripts. We now store the start directly before we move the build directory and cd back to that explicitly. 
Signed-off-by: Jack Thomson --- resources/hiding_ci/build_and_install_kernel.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/resources/hiding_ci/build_and_install_kernel.sh b/resources/hiding_ci/build_and_install_kernel.sh index c898a581384..7d27f3a3f86 100755 --- a/resources/hiding_ci/build_and_install_kernel.sh +++ b/resources/hiding_ci/build_and_install_kernel.sh @@ -24,7 +24,7 @@ check_ubuntu() { tidy_up() { # Some cleanup after we are done echo "Cleaning up.." - popd + cd $START_DIR rm -rf $TMP_BUILD_DIR } @@ -103,7 +103,8 @@ KERNEL_CONFIG_OVERRIDES=$(pwd)/kernel_config_overrides TMP_BUILD_DIR=$(mktemp -d -t kernel-build-XXXX) -pushd . +START_DIR=$(pwd) + cd $TMP_BUILD_DIR echo "Cloning kernel repository into" $TMP_BUILD_DIR From cfbe7f280dd32bc730f157ba7e7f8aa283a7a814 Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Fri, 28 Mar 2025 10:40:47 +0000 Subject: [PATCH 07/40] test: Skip final huge page tests We previously skipped a huge page test which stoppped the test timing out, but now we get stuck on another. Skipping this and the other huge page snapshot tests in the file. Signed-off-by: Jack Thomson --- .../performance/test_snapshot_ab.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/integration_tests/performance/test_snapshot_ab.py b/tests/integration_tests/performance/test_snapshot_ab.py index e8a24a43b50..fb8d7eee880 100644 --- a/tests/integration_tests/performance/test_snapshot_ab.py +++ b/tests/integration_tests/performance/test_snapshot_ab.py @@ -1,6 +1,7 @@ # Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. # SPDX-License-Identifier: Apache-2.0 """Performance benchmark for snapshot restore.""" + import re import signal import tempfile @@ -106,6 +107,15 @@ def test_restore_latency( We only test a single guest kernel, as the guest kernel does not "participate" in snapshot restore. 
""" + if ( + test_setup.huge_pages != HugePagesConfig.NONE + and global_props.host_linux_version_tpl > (6, 1) + and global_props.cpu_architecture == "aarch64" + ): + pytest.skip( + "huge pages with secret hiding kernels on ARM are currently failing" + ) + vm = test_setup.boot_vm(microvm_factory, guest_kernel_linux_5_10, rootfs) metrics.set_dimensions( @@ -218,6 +228,15 @@ def test_population_latency( mem, ): """Collects population latency metrics (e.g. how long it takes UFFD handler to fault in all memory)""" + if ( + huge_pages != HugePagesConfig.NONE + and global_props.host_linux_version_tpl > (6, 1) + and global_props.cpu_architecture == "aarch64" + ): + pytest.skip( + "huge pages with secret hiding kernels on ARM are currently failing" + ) + test_setup = SnapshotRestoreTest(mem=mem, vcpus=vcpus, huge_pages=huge_pages) vm = test_setup.boot_vm(microvm_factory, guest_kernel_linux_5_10, rootfs) From 7e34e207477be17c0ac313eeedb46f3d281b10c2 Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Mon, 31 Mar 2025 09:31:46 +0000 Subject: [PATCH 08/40] tests(bk): Run the kernel build in our nightly PR Run the kernel build as part of our nightly tests so we can monitor it's success. 
Signed-off-by: Jack Thomson --- .buildkite/pipeline_pr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/pipeline_pr.py b/.buildkite/pipeline_pr.py index f5818f8bd02..be77a3fafd0 100755 --- a/.buildkite/pipeline_pr.py +++ b/.buildkite/pipeline_pr.py @@ -68,7 +68,7 @@ for step in kani_grp["steps"]: step["label"] = "🔍 Kani" -if any(x.parent.name == "hiding_ci" for x in changed_files): +if not changed_files or (any(x.parent.name == "hiding_ci" for x in changed_files)): pipeline.build_group_per_arch( "🕵️ Build Secret Hiding Kernel", pipeline.devtool_test( From 7c5afcfa7295a3eb82ae9faed3da31ba83dea003 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Thu, 3 Apr 2025 10:16:41 +0100 Subject: [PATCH 09/40] add direct map removal patches to secret hiding CI Add an updated version of relevant patches from my v4 direct map removal series [1]. Updated here means - Drop all selftests patches, as they are irrelevant for our CI - Address comments from David about squashing commits - Rebase on top of Fuad's v7 [1]: https://lore.kernel.org/kvm/20250221160728.1584559-1-roypat@amazon.co.uk/ Signed-off-by: Patrick Roy --- .../0002-mm-introduce-AS_NO_DIRECT_MAP.patch | 208 ++++++++++++++++++ ...d-Add-flag-to-remove-from-direct-map.patch | 178 +++++++++++++++ 2 files changed, 386 insertions(+) create mode 100644 resources/hiding_ci/patches/0002-mm-introduce-AS_NO_DIRECT_MAP.patch create mode 100644 resources/hiding_ci/patches/0003-KVM-guest_memfd-Add-flag-to-remove-from-direct-map.patch diff --git a/resources/hiding_ci/patches/0002-mm-introduce-AS_NO_DIRECT_MAP.patch b/resources/hiding_ci/patches/0002-mm-introduce-AS_NO_DIRECT_MAP.patch new file mode 100644 index 00000000000..53dfc236022 --- /dev/null +++ b/resources/hiding_ci/patches/0002-mm-introduce-AS_NO_DIRECT_MAP.patch @@ -0,0 +1,208 @@ +From 138b7a4c83c43b42851cb8fec2bbdbaadd960241 Mon Sep 17 00:00:00 2001 +From: Patrick Roy +Date: Fri, 7 Feb 2025 11:16:06 +0000 +Subject: [PATCH 1/2] mm: introduce 
AS_NO_DIRECT_MAP + +Add AS_NO_DIRECT_MAP for mappings where direct map entries of folios are +set to not present . Currently, mappings that match this description are +secretmem mappings (memfd_secret()). Later, some guest_memfd +configurations will also fall into this category. + +Reject this new type of mappings in all locations that currently reject +secretmem mappings, on the assumption that if secretmem mappings are +rejected somewhere, it is precisely because of an inability to deal with +folios without direct map entries, and then make memfd_secret() use +AS_NO_DIRECT_MAP on its address_space to drop its special +vma_is_secretmem()/secretmem_mapping() checks. + +This drops a optimization in gup_fast_folio_allowed() where +secretmem_mapping() was only called if CONFIG_SECRETMEM=y. secretmem is +enabled by default since commit b758fe6df50d ("mm/secretmem: make it on +by default"), so the secretmem check did not actually end up elided in +most cases anymore anyway. + +Use a new flag instead of overloading AS_INACCESSIBLE (which is already +set by guest_memfd) because not all guest_memfd mappings will end up +being direct map removed (e.g. in pKVM setups, parts of guest_memfd that +can be mapped to userspace should also be GUP-able, and generally not +have restrictions on who can access it). 
+ +Signed-off-by: Patrick Roy +--- + include/linux/pagemap.h | 16 ++++++++++++++++ + include/linux/secretmem.h | 18 ------------------ + lib/buildid.c | 4 ++-- + mm/gup.c | 14 +++----------- + mm/mlock.c | 2 +- + mm/secretmem.c | 6 +----- + 6 files changed, 23 insertions(+), 37 deletions(-) + +diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h +index 47bfc6b1b632..903b41e89cf8 100644 +--- a/include/linux/pagemap.h ++++ b/include/linux/pagemap.h +@@ -210,6 +210,7 @@ enum mapping_flags { + AS_STABLE_WRITES = 7, /* must wait for writeback before modifying + folio contents */ + AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */ ++ AS_NO_DIRECT_MAP = 9, /* Folios in the mapping are not in the direct map */ + /* Bits 16-25 are used for FOLIO_ORDER */ + AS_FOLIO_ORDER_BITS = 5, + AS_FOLIO_ORDER_MIN = 16, +@@ -335,6 +336,21 @@ static inline bool mapping_inaccessible(struct address_space *mapping) + return test_bit(AS_INACCESSIBLE, &mapping->flags); + } + ++static inline void mapping_set_no_direct_map(struct address_space *mapping) ++{ ++ set_bit(AS_NO_DIRECT_MAP, &mapping->flags); ++} ++ ++static inline bool mapping_no_direct_map(struct address_space *mapping) ++{ ++ return test_bit(AS_NO_DIRECT_MAP, &mapping->flags); ++} ++ ++static inline bool vma_is_no_direct_map(const struct vm_area_struct *vma) ++{ ++ return vma->vm_file && mapping_no_direct_map(vma->vm_file->f_mapping); ++} ++ + static inline gfp_t mapping_gfp_mask(struct address_space * mapping) + { + return mapping->gfp_mask; +diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h +index e918f96881f5..0ae1fb057b3d 100644 +--- a/include/linux/secretmem.h ++++ b/include/linux/secretmem.h +@@ -4,28 +4,10 @@ + + #ifdef CONFIG_SECRETMEM + +-extern const struct address_space_operations secretmem_aops; +- +-static inline bool secretmem_mapping(struct address_space *mapping) +-{ +- return mapping->a_ops == &secretmem_aops; +-} +- +-bool vma_is_secretmem(struct vm_area_struct 
*vma); + bool secretmem_active(void); + + #else + +-static inline bool vma_is_secretmem(struct vm_area_struct *vma) +-{ +- return false; +-} +- +-static inline bool secretmem_mapping(struct address_space *mapping) +-{ +- return false; +-} +- + static inline bool secretmem_active(void) + { + return false; +diff --git a/lib/buildid.c b/lib/buildid.c +index c4b0f376fb34..33f173a607ad 100644 +--- a/lib/buildid.c ++++ b/lib/buildid.c +@@ -65,8 +65,8 @@ static int freader_get_folio(struct freader *r, loff_t file_off) + + freader_put_folio(r); + +- /* reject secretmem folios created with memfd_secret() */ +- if (secretmem_mapping(r->file->f_mapping)) ++ /* reject secretmem folios created with memfd_secret() or guest_memfd() */ ++ if (mapping_no_direct_map(r->file->f_mapping)) + return -EFAULT; + + r->folio = filemap_get_folio(r->file->f_mapping, file_off >> PAGE_SHIFT); +diff --git a/mm/gup.c b/mm/gup.c +index 3883b307780e..b1483a876740 100644 +--- a/mm/gup.c ++++ b/mm/gup.c +@@ -1283,7 +1283,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) + if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma)) + return -EOPNOTSUPP; + +- if (vma_is_secretmem(vma)) ++ if (vma_is_no_direct_map(vma)) + return -EFAULT; + + if (write) { +@@ -2786,7 +2786,6 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags) + { + bool reject_file_backed = false; + struct address_space *mapping; +- bool check_secretmem = false; + unsigned long mapping_flags; + + /* +@@ -2798,14 +2797,6 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags) + reject_file_backed = true; + + /* We hold a folio reference, so we can safely access folio fields. */ +- +- /* secretmem folios are always order-0 folios. 
*/ +- if (IS_ENABLED(CONFIG_SECRETMEM) && !folio_test_large(folio)) +- check_secretmem = true; +- +- if (!reject_file_backed && !check_secretmem) +- return true; +- + if (WARN_ON_ONCE(folio_test_slab(folio))) + return false; + +@@ -2847,8 +2838,9 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags) + * At this point, we know the mapping is non-null and points to an + * address_space object. + */ +- if (check_secretmem && secretmem_mapping(mapping)) ++ if (mapping_no_direct_map(mapping)) + return false; ++ + /* The only remaining allowed file system is shmem. */ + return !reject_file_backed || shmem_mapping(mapping); + } +diff --git a/mm/mlock.c b/mm/mlock.c +index cde076fa7d5e..a43f308be70d 100644 +--- a/mm/mlock.c ++++ b/mm/mlock.c +@@ -474,7 +474,7 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, + + if (newflags == oldflags || (oldflags & VM_SPECIAL) || + is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) || +- vma_is_dax(vma) || vma_is_secretmem(vma) || (oldflags & VM_DROPPABLE)) ++ vma_is_dax(vma) || vma_is_no_direct_map(vma) || (oldflags & VM_DROPPABLE)) + /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ + goto out; + +diff --git a/mm/secretmem.c b/mm/secretmem.c +index 1b0a214ee558..ea4c04d469b1 100644 +--- a/mm/secretmem.c ++++ b/mm/secretmem.c +@@ -136,11 +136,6 @@ static int secretmem_mmap(struct file *file, struct vm_area_struct *vma) + return 0; + } + +-bool vma_is_secretmem(struct vm_area_struct *vma) +-{ +- return vma->vm_ops == &secretmem_vm_ops; +-} +- + static const struct file_operations secretmem_fops = { + .release = secretmem_release, + .mmap = secretmem_mmap, +@@ -214,6 +209,7 @@ static struct file *secretmem_file_create(unsigned long flags) + + mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); + mapping_set_unevictable(inode->i_mapping); ++ mapping_set_no_direct_map(inode->i_mapping); + + inode->i_op = &secretmem_iops; + inode->i_mapping->a_ops = &secretmem_aops; 
+-- +2.48.1 + diff --git a/resources/hiding_ci/patches/0003-KVM-guest_memfd-Add-flag-to-remove-from-direct-map.patch b/resources/hiding_ci/patches/0003-KVM-guest_memfd-Add-flag-to-remove-from-direct-map.patch new file mode 100644 index 00000000000..c46e04e8543 --- /dev/null +++ b/resources/hiding_ci/patches/0003-KVM-guest_memfd-Add-flag-to-remove-from-direct-map.patch @@ -0,0 +1,178 @@ +From 9bbc39f9c7622f0060d395b1063a564c24926d8d Mon Sep 17 00:00:00 2001 +From: Patrick Roy +Date: Fri, 7 Feb 2025 14:33:01 +0000 +Subject: [PATCH 2/2] KVM: guest_memfd: Add flag to remove from direct map + +Add KVM_GMEM_NO_DIRECT_MAP flag for KVM_CREATE_GUEST_MEMFD() ioctl. When +set, guest_memfd folios will be removed from the direct map after +preparation, with direct map entries only restored when the folios are +freed. + +To ensure these folios do not end up in places where the kernel cannot +deal with them, set AS_NO_DIRECT_MAP on the guest_memfd's struct +address_space if KVM_GMEM_NO_DIRECT_MAP is requested. + +Add KVM_CAP_GMEM_NO_DIRECT_MAP to let userspace discover whether +guest_memfd supports KVM_GMEM_NO_DIRECT_MAP. Support depends on +guest_memfd itself being supported, but also on whether KVM can +manipulate the direct map at page granularity at all (possible most of +the time, just arm64 is a notable outlier where its impossible if the +direct map has been setup using hugepages, as arm64 cannot break these +apart due to break-before-make semantics). + +Note that this flag causes removal of direct map entries for all +guest_memfd folios independent of whether they are "shared" or "private" +(although current guest_memfd only supports either all folios in the +"shared" state, or all folios in the "private" state if +!IS_ENABLED(CONFIG_KVM_GMEM_SHARED_MEM)). 
The use case for removing +direct map entries of also the shared parts of guest_memfd is a special +type of non-CoCo VM where host userspace is trusted to have access to +all of guest memory, but where Spectre-style transient execution attacks +through the host kernel's direct map should still be mitigated. + +Note that KVM retains access to guest memory via userspace +mappings of guest_memfd, which are reflected back into KVM's memslots +via userspace_addr. This is needed for things like MMIO emulation on +x86_64 to work. Previous iterations attempted to instead have KVM +temporarily restore direct map entries whenever such an access to guest +memory was needed, but this turned out to have a significant performance +impact, as well as additional complexity due to needing to refcount +direct map reinsertion operations and making them play nicely with gmem +truncations. + +This iteration also doesn't have KVM perform TLB flushes after direct +map manipulations. This is because TLB flushes resulted in an up to 40x +elongation of page faults in guest_memfd (scaling with the number of CPU +cores), or a 5x elongation of memory population. On the one hand, TLB +flushes are not needed for functional correctness (the virt->phys +mapping technically stays "correct", the kernel should simply not use it +for a while), so this is a correct optimization to make. On the other +hand, it means that the desired protection from Spectre-style attacks is +not perfect, as an attacker could try to prevent a stale TLB entry from +getting evicted, keeping it alive until the page it refers to is used by +the guest for some sensitive data, and then targeting it using a +spectre-gadget. 
+ +Signed-off-by: Patrick Roy +--- + include/uapi/linux/kvm.h | 3 +++ + virt/kvm/guest_memfd.c | 28 +++++++++++++++++++++++++++- + virt/kvm/kvm_main.c | 5 +++++ + 3 files changed, 35 insertions(+), 1 deletion(-) + +diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h +index 117937a895da..fb02a93546d8 100644 +--- a/include/uapi/linux/kvm.h ++++ b/include/uapi/linux/kvm.h +@@ -930,6 +930,7 @@ struct kvm_enable_cap { + #define KVM_CAP_X86_APIC_BUS_CYCLES_NS 237 + #define KVM_CAP_X86_GUEST_MODE 238 + #define KVM_CAP_GMEM_SHARED_MEM 239 ++#define KVM_CAP_GMEM_NO_DIRECT_MAP 240 + + struct kvm_irq_routing_irqchip { + __u32 irqchip; +@@ -1573,6 +1574,8 @@ struct kvm_create_guest_memfd { + __u64 reserved[6]; + }; + ++#define KVM_GMEM_NO_DIRECT_MAP (1ULL << 0) ++ + #define KVM_PRE_FAULT_MEMORY _IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory) + + struct kvm_pre_fault_memory { +diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c +index fbf89e643add..a2b96bc51391 100644 +--- a/virt/kvm/guest_memfd.c ++++ b/virt/kvm/guest_memfd.c +@@ -4,6 +4,7 @@ + #include + #include + #include ++#include + + #include "kvm_mm.h" + +@@ -50,8 +51,23 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo + return 0; + } + ++static bool kvm_gmem_test_no_direct_map(struct inode *inode) ++{ ++ return ((unsigned long) inode->i_private) & KVM_GMEM_NO_DIRECT_MAP; ++} ++ + static inline void kvm_gmem_mark_prepared(struct folio *folio) + { ++ struct inode *inode = folio_inode(folio); ++ ++ if (kvm_gmem_test_no_direct_map(inode)) { ++ int r = set_direct_map_valid_noflush(folio_page(folio, 0), folio_nr_pages(folio), ++ false); ++ ++ if (!r) ++ folio_set_private(folio); ++ } ++ + folio_mark_uptodate(folio); + } + +@@ -478,6 +494,10 @@ static void kvm_gmem_free_folio(struct folio *folio) + kvm_pfn_t pfn = page_to_pfn(page); + int order = folio_order(folio); + ++ if (folio_test_private(folio)) ++ WARN_ON_ONCE(set_direct_map_valid_noflush(folio_page(folio, 0), ++ 
folio_nr_pages(folio), true)); ++ + kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order)); + } + #endif +@@ -551,6 +571,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) + /* Unmovable mappings are supposed to be marked unevictable as well. */ + WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping)); + ++ if (flags & KVM_GMEM_NO_DIRECT_MAP) ++ mapping_set_no_direct_map(inode->i_mapping); ++ + kvm_get_kvm(kvm); + gmem->kvm = kvm; + xa_init(&gmem->bindings); +@@ -570,7 +593,10 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args) + { + loff_t size = args->size; + u64 flags = args->flags; +- u64 valid_flags = 0; ++ u64 valid_flags = KVM_GMEM_NO_DIRECT_MAP; ++ ++ if (!can_set_direct_map()) ++ valid_flags &= ~KVM_GMEM_NO_DIRECT_MAP; + + if (flags & ~valid_flags) + return -EINVAL; +diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c +index 3e40acb9f5c0..32ca1c921ab0 100644 +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -65,6 +65,7 @@ + #include + + #include ++#include + + + /* Worst case buffer size needed for holding an integer. */ +@@ -4823,6 +4824,10 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) + return kvm_supported_mem_attributes(kvm); + #endif + #ifdef CONFIG_KVM_PRIVATE_MEM ++ case KVM_CAP_GMEM_NO_DIRECT_MAP: ++ if (!can_set_direct_map()) ++ return false; ++ fallthrough; + case KVM_CAP_GUEST_MEMFD: + return !kvm || kvm_arch_has_private_mem(kvm); + #endif +-- +2.48.1 + From 606d6dca61c58770794b1fcc88407702351a20f3 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Thu, 3 Apr 2025 15:42:46 +0100 Subject: [PATCH 10/40] fix(ci): actually test kernel builds if patches are added The patches are in the `patches` subdirectory of `hiding_ci`, so if only patches were added, then the check of "any files with parent directory `hiding_ci`" would be false, and the CI step for testing the build of patches wouldn't actually run. 
Fix this by updating the check to be "any files where any parent directory is `hiding_ci`", which will also catch patches. Reported-by: Jack Thomson Signed-off-by: Patrick Roy --- .buildkite/pipeline_pr.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.buildkite/pipeline_pr.py b/.buildkite/pipeline_pr.py index be77a3fafd0..17c0df83d94 100755 --- a/.buildkite/pipeline_pr.py +++ b/.buildkite/pipeline_pr.py @@ -68,7 +68,9 @@ for step in kani_grp["steps"]: step["label"] = "🔍 Kani" -if not changed_files or (any(x.parent.name == "hiding_ci" for x in changed_files)): +if not changed_files or ( + any(parent.name == "hiding_ci" for x in changed_files for parent in x.parents) +): pipeline.build_group_per_arch( "🕵️ Build Secret Hiding Kernel", pipeline.devtool_test( From 7caefb3537fa6e5302c34163fd5604daf9f61499 Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Tue, 1 Apr 2025 15:21:33 +0000 Subject: [PATCH 11/40] ci: Update script to install for AL23 Update the build script to allow us to install the secret hidden kernels onto Amazon Linux 2023 instances. We have to as part of this include a script to download and install ena drivers for the instance to allow us to boot. Signed-off-by: Jack Thomson --- .../hiding_ci/build_and_install_kernel.sh | 61 ++++++++++++++++--- resources/hiding_ci/dkms.conf | 10 +++ resources/hiding_ci/install_ena.sh | 24 ++++++++ 3 files changed, 85 insertions(+), 10 deletions(-) create mode 100644 resources/hiding_ci/dkms.conf create mode 100755 resources/hiding_ci/install_ena.sh diff --git a/resources/hiding_ci/build_and_install_kernel.sh b/resources/hiding_ci/build_and_install_kernel.sh index 7d27f3a3f86..949d7fdac4a 100755 --- a/resources/hiding_ci/build_and_install_kernel.sh +++ b/resources/hiding_ci/build_and_install_kernel.sh @@ -13,12 +13,20 @@ check_root() { fi } -check_ubuntu() { - # Currently this script only works on Ubuntu instances - if ! 
grep -qi 'ubuntu' /etc/os-release; then - echo "This script currently only works on Ubuntu." - exit 1 +check_userspace() { + # Currently this script only works on Ubuntu and AL2023 + if grep -qi 'ubuntu' /etc/os-release; then + USERSPACE="UBUNTU" + return 0 + fi + + if grep -qi 'al2023' /etc/os-release; then + USERSPACE="AL2023" + return 0 fi + + echo "This script currently only works on Ubuntu and Amazon Linux 2023." + exit 1 } tidy_up() { @@ -96,6 +104,41 @@ check_override_presence() { echo "All overrides correctly applied.." } +ubuntu_update_boot() { + echo "Update initramfs" + update-initramfs -c -k $KERNEL_VERSION + echo "Updating GRUB..." + update-grub +} + +al2023_update_boot() { + echo "Installing ENA driver for AL2023" + $START_DIR/install_ena.sh $KERNEL_VERSION $START_DIR/dkms.conf + + # Just ensure we are back in the build dir + cd $TMP_BUILD_DIR + + echo "Creating the new ram disk" + dracut --kver $KERNEL_VERSION -f -v + + echo "Updating GRUB..." + grubby --grub2 --add-kernel /boot/vmlinux-$KERNEL_VERSION \ + --title="Secret Hiding" \ + --initrd=/boot/initramfs-$KERNEL_VERSION.img --copy-default + grubby --set-default /boot/vmlinux-$KERNEL_VERSION +} + +update_boot_config() { + case "$USERSPACE" in + UBUNTU) ubuntu_update_boot ;; + AL2023) al2023_update_boot ;; + *) + echo "Unknown userspace" + exit 1 + ;; + esac +} + KERNEL_URL=$(cat kernel_url) KERNEL_COMMIT_HASH=$(cat kernel_commit_hash) KERNEL_PATCHES_DIR=$(pwd)/patches @@ -155,16 +198,14 @@ echo "New kernel version:" $KERNEL_VERSION confirm "$@" check_root -check_ubuntu +check_userspace echo "Installing kernel modules..." make INSTALL_MOD_STRIP=1 modules_install echo "Installing kernel..." make INSTALL_MOD_STRIP=1 install -echo "Update initramfs" -update-initramfs -c -k $KERNEL_VERSION -echo "Updating GRUB..." -update-grub + +update_boot_config echo "Kernel built and installed successfully!" 
diff --git a/resources/hiding_ci/dkms.conf b/resources/hiding_ci/dkms.conf new file mode 100644 index 00000000000..29f108ba298 --- /dev/null +++ b/resources/hiding_ci/dkms.conf @@ -0,0 +1,10 @@ +PACKAGE_NAME="ena" +PACKAGE_VERSION="1.0.0" +CLEAN="make -C kernel/linux/ena clean" +MAKE="make -C kernel/linux/ena/ BUILD_KERNEL=${kernelver}" +BUILT_MODULE_NAME[0]="ena" +BUILT_MODULE_LOCATION="kernel/linux/ena" +DEST_MODULE_LOCATION[0]="/updates" +DEST_MODULE_NAME[0]="ena" +REMAKE_INITRD="yes" +AUTOINSTALL="yes" diff --git a/resources/hiding_ci/install_ena.sh b/resources/hiding_ci/install_ena.sh new file mode 100755 index 00000000000..7d0fd679395 --- /dev/null +++ b/resources/hiding_ci/install_ena.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# # SPDX-License-Identifier: Apache-2.0 + +# fail if we encounter an error, uninitialized variable or a pipe breaks +set -eu -o pipefail + +AMZN_DRIVER_VERSION="2.13.3" +KERNEL_VERSION=$1 +DKMS_CONF_LOCATION=$2 +START_DIR=$(pwd) + +cd /tmp/ + +git clone --depth=1 https://github.com/amzn/amzn-drivers.git +mv amzn-drivers /usr/src/amzn-drivers-${AMZN_DRIVER_VERSION} + +cp $DKMS_CONF_LOCATION /usr/src/amzn-drivers-${AMZN_DRIVER_VERSION} + +dkms add -m amzn-drivers -v ${AMZN_DRIVER_VERSION} +dkms build -k ${KERNEL_VERSION} -m amzn-drivers -v ${AMZN_DRIVER_VERSION} +dkms install -k ${KERNEL_VERSION} -m amzn-drivers -v ${AMZN_DRIVER_VERSION} + +cd $START_DIR From b41077015cac4724765981b21ee59efacbeb85b2 Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Fri, 4 Apr 2025 13:26:06 +0000 Subject: [PATCH 12/40] ci: Update the script to support x86 on AL23 The output from the build in x86 is archived so updated the script to support installing either output type from the build Signed-off-by: Jack Thomson --- resources/hiding_ci/build_and_install_kernel.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/resources/hiding_ci/build_and_install_kernel.sh 
b/resources/hiding_ci/build_and_install_kernel.sh index 949d7fdac4a..79d6129f6d0 100755 --- a/resources/hiding_ci/build_and_install_kernel.sh +++ b/resources/hiding_ci/build_and_install_kernel.sh @@ -121,11 +121,14 @@ al2023_update_boot() { echo "Creating the new ram disk" dracut --kver $KERNEL_VERSION -f -v + # This varies from x86 and ARM so capture what was generated + VM_LINUX_LOCATION=$(ls /boot/vmlinu{x,z}-$KERNEL_VERSION 2>/dev/null | head -n1) + echo "Updating GRUB..." - grubby --grub2 --add-kernel /boot/vmlinux-$KERNEL_VERSION \ + grubby --grub2 --add-kernel $VM_LINUX_LOCATION \ --title="Secret Hiding" \ --initrd=/boot/initramfs-$KERNEL_VERSION.img --copy-default - grubby --set-default /boot/vmlinux-$KERNEL_VERSION + grubby --set-default $VM_LINUX_LOCATION } update_boot_config() { From dcf08bb61574beead79d0635b74522eca9aede7d Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 7 Apr 2025 09:32:59 +0200 Subject: [PATCH 13/40] fix: test_hiding_kernel.py Add an 'apt update' before `apt install`. Otherwise, we might hold an old view of the package versions and installation might fail. 
Signed-off-by: Babis Chalios --- tests/integration_tests/build/test_hiding_kernel.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/integration_tests/build/test_hiding_kernel.py b/tests/integration_tests/build/test_hiding_kernel.py index a85a73143cb..1d76b31260f 100644 --- a/tests/integration_tests/build/test_hiding_kernel.py +++ b/tests/integration_tests/build/test_hiding_kernel.py @@ -14,7 +14,8 @@ def test_build_hiding_kernel(): In the test we will run our kernel build script to check it succeeds and builds the hidden kernel """ - # We have some extra deps for building the kernel that are not in the dev contaner + # We have some extra deps for building the kernel that are not in the dev container + utils.check_output("apt update") utils.check_output( "apt install -y build-essential libncurses-dev bison flex libssl-dev libelf-dev bc dwarves libncurses5-dev kmod fakeroot" ) From 2cd78bac223c62b987b2717da8fe3120f9ac1290 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Mon, 7 Apr 2025 13:05:12 +0100 Subject: [PATCH 14/40] chore: allow clippy::needless_update This lint forbids using `..Default::default()` in struct initializers after all fields have already been initialized, but this is a useful pattern if you know you want to add more fields to a struct in a future PR without needing to touch a ton of initializers in unittests again (_heavy foreshadowing_). So silence the paperclip. 
Signed-off-by: Patrick Roy --- Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.toml b/Cargo.toml index 4f8cc3f5eb5..58a001e202e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,7 @@ exit = "warn" tests_outside_test_module = "warn" assertions_on_result_states = "warn" error_impl_error = "warn" +needless-update = "allow" [profile.dev] panic = "abort" From d44b5cfbcffd7b1d4fff1a9a8b0cd69daf3ceb26 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Thu, 3 Apr 2025 13:49:44 +0100 Subject: [PATCH 15/40] refactor(test): Move MachineConfig::update tests to machine_config.rs There's no need to test this through VmResources when it can be tested in isolation. Also, everytime I touch MachineConfig I get confsued by where the hell the tests are, cuz not only are they in a different module, they're also one directory level away. So move the tests into machine_config.rs, where it makes sense to have them. Signed-off-by: Patrick Roy --- src/vmm/src/resources.rs | 55 -------------- src/vmm/src/vmm_config/machine_config.rs | 95 +++++++++++++++++++++++- 2 files changed, 94 insertions(+), 56 deletions(-) diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index 097e2041b55..2ecd5139f8a 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -1323,44 +1323,6 @@ mod tests { aux_vm_config ); - // Invalid vcpu count. - aux_vm_config.vcpu_count = Some(0); - assert_eq!( - vm_resources.update_machine_config(&aux_vm_config), - Err(MachineConfigError::InvalidVcpuCount) - ); - aux_vm_config.vcpu_count = Some(33); - assert_eq!( - vm_resources.update_machine_config(&aux_vm_config), - Err(MachineConfigError::InvalidVcpuCount) - ); - - // Check that SMT is not supported on aarch64, and that on x86_64 enabling it requires vcpu - // count to be even. 
- aux_vm_config.smt = Some(true); - #[cfg(target_arch = "aarch64")] - assert_eq!( - vm_resources.update_machine_config(&aux_vm_config), - Err(MachineConfigError::SmtNotSupported) - ); - aux_vm_config.vcpu_count = Some(3); - #[cfg(target_arch = "x86_64")] - assert_eq!( - vm_resources.update_machine_config(&aux_vm_config), - Err(MachineConfigError::InvalidVcpuCount) - ); - aux_vm_config.vcpu_count = Some(32); - #[cfg(target_arch = "x86_64")] - vm_resources.update_machine_config(&aux_vm_config).unwrap(); - aux_vm_config.smt = Some(false); - - // Invalid mem_size_mib. - aux_vm_config.mem_size_mib = Some(0); - assert_eq!( - vm_resources.update_machine_config(&aux_vm_config), - Err(MachineConfigError::InvalidMemorySize) - ); - // Incompatible mem_size_mib with balloon size. vm_resources.machine_config.mem_size_mib = 128; vm_resources @@ -1379,23 +1341,6 @@ mod tests { // mem_size_mib compatible with balloon size. aux_vm_config.mem_size_mib = Some(256); vm_resources.update_machine_config(&aux_vm_config).unwrap(); - - // mem_size_mib incompatible with huge pages configuration - aux_vm_config.mem_size_mib = Some(129); - aux_vm_config.huge_pages = Some(HugePageConfig::Hugetlbfs2M); - assert_eq!( - vm_resources - .update_machine_config(&aux_vm_config) - .unwrap_err(), - MachineConfigError::InvalidMemorySize - ); - - // mem_size_mib compatible with huge page configuration - aux_vm_config.mem_size_mib = Some(2048); - // Remove the balloon device config that's added by `default_vm_resources` as it would - // trigger the "ballooning incompatible with huge pages" check. 
- vm_resources.balloon = BalloonBuilder::new(); - vm_resources.update_machine_config(&aux_vm_config).unwrap(); } #[test] diff --git a/src/vmm/src/vmm_config/machine_config.rs b/src/vmm/src/vmm_config/machine_config.rs index cfe7105fdf8..125ee047e2d 100644 --- a/src/vmm/src/vmm_config/machine_config.rs +++ b/src/vmm/src/vmm_config/machine_config.rs @@ -290,7 +290,100 @@ impl MachineConfig { #[cfg(test)] mod tests { use crate::cpu_config::templates::{CpuTemplateType, CustomCpuTemplate, StaticCpuTemplate}; - use crate::vmm_config::machine_config::MachineConfig; + use crate::vmm_config::machine_config::{ + HugePageConfig, MachineConfig, MachineConfigError, MachineConfigUpdate, + }; + + #[test] + #[allow(unused)] // some assertions exist only on specific architectures. + fn test_machine_config_update() { + let mconf = MachineConfig::default(); + + // Assert that the default machine config is valid + assert_eq!( + mconf + .update(&MachineConfigUpdate::from(mconf.clone())) + .unwrap(), + mconf + ); + + // Invalid vCPU counts + let res = mconf.update(&MachineConfigUpdate { + vcpu_count: Some(0), + ..Default::default() + }); + assert_eq!(res, Err(MachineConfigError::InvalidVcpuCount)); + + let res = mconf.update(&MachineConfigUpdate { + vcpu_count: Some(33), + ..Default::default() + }); + assert_eq!(res, Err(MachineConfigError::InvalidVcpuCount)); + + // Invalid memory size + let res = mconf.update(&MachineConfigUpdate { + mem_size_mib: Some(0), + ..Default::default() + }); + assert_eq!(res, Err(MachineConfigError::InvalidMemorySize)); + + // Memory Size incompatible with huge page configuration + let res = mconf.update(&MachineConfigUpdate { + mem_size_mib: Some(31), + huge_pages: Some(HugePageConfig::Hugetlbfs2M), + ..Default::default() + }); + assert_eq!(res, Err(MachineConfigError::InvalidMemorySize)); + + // works if the memory size is a multiple of huge page size indeed + let updated = mconf + .update(&MachineConfigUpdate { + mem_size_mib: Some(32), + huge_pages: 
Some(HugePageConfig::Hugetlbfs2M), + ..Default::default() + }) + .unwrap(); + assert_eq!(updated.huge_pages, HugePageConfig::Hugetlbfs2M); + assert_eq!(updated.mem_size_mib, 32); + } + + #[test] + #[cfg(target_arch = "aarch64")] + fn test_machine_config_update_aarch64() { + let mconf = MachineConfig::default(); + + // Check that SMT is not supported on aarch64 + let res = mconf.update(&MachineConfigUpdate { + smt: Some(true), + ..Default::default() + }); + assert_eq!(res, Err(MachineConfigError::SmtNotSupported)); + } + + #[test] + #[cfg(target_arch = "x86_64")] + fn test_machine_config_update_x86_64() { + let mconf = MachineConfig::default(); + + // Test that SMT requires an even vcpu count + let res = mconf.update(&MachineConfigUpdate { + vcpu_count: Some(3), + smt: Some(true), + ..Default::default() + }); + assert_eq!(res, Err(MachineConfigError::InvalidVcpuCount)); + + // Works if the vcpu count is even indeed + let updated = mconf + .update(&MachineConfigUpdate { + vcpu_count: Some(32), + smt: Some(true), + ..Default::default() + }) + .unwrap(); + assert_eq!(updated.vcpu_count, 32); + assert!(updated.smt); + } // Ensure the special (de)serialization logic for the cpu_template field works: // only static cpu templates can be specified via the machine-config endpoint, but From 448ea07c90697653c77a7132ddc081c28254d6b9 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Fri, 25 Apr 2025 10:04:47 +0100 Subject: [PATCH 16/40] add helper for Read/Write[Volatile] through bounce buffer With secret freedom, direct accesses to guest memory from the context of the host kernel are no longer possible. This particularly means that we cannot pass pointers to guest memory to the host kernel anymore (at least if the host kernel tries to GUP them). 
For these scenarios, introduce a utility decorator struct `MaybeBounce` that can optionally do indirect read and write syscalls on guest memory by first memcpy-ing to firecracker userspace, and passing a pointer to firecracker heap memory into the kernel instead. Signed-off-by: Patrick Roy --- src/vmm/src/vstate/memory.rs | 91 +++++++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 2 deletions(-) diff --git a/src/vmm/src/vstate/memory.rs b/src/vmm/src/vstate/memory.rs index 19367f7f997..3138a8026e6 100644 --- a/src/vmm/src/vstate/memory.rs +++ b/src/vmm/src/vstate/memory.rs @@ -6,7 +6,7 @@ // found in the THIRD-PARTY file. use std::fs::File; -use std::io::SeekFrom; +use std::io::{Read, Seek, SeekFrom}; use std::sync::Arc; use serde::{Deserialize, Serialize}; @@ -17,7 +17,10 @@ pub use vm_memory::{ Address, ByteValued, Bytes, FileOffset, GuestAddress, GuestMemory, GuestMemoryRegion, GuestUsize, MemoryRegionAddress, MmapRegion, address, }; -use vm_memory::{Error as VmMemoryError, GuestMemoryError, WriteVolatile}; +use vm_memory::{ + Error as VmMemoryError, GuestMemoryError, ReadVolatile, VolatileMemoryError, VolatileSlice, + WriteVolatile, +}; use vmm_sys_util::errno; use crate::DirtyBitmap; @@ -50,6 +53,58 @@ pub enum MemoryError { OffsetTooLarge, } +/// Newtype that implements [`ReadVolatile`] and [`WriteVolatile`] if `T` implements `Read` or +/// `Write` respectively, by reading/writing using a bounce buffer, and memcpy-ing into the +/// [`VolatileSlice`]. 
+#[derive(Debug)] +pub struct MaybeBounce(pub T, pub bool); + +impl ReadVolatile for MaybeBounce { + fn read_volatile( + &mut self, + buf: &mut VolatileSlice, + ) -> Result { + if self.1 { + let mut bbuf = vec![0; buf.len()]; + let n = self + .0 + .read_volatile(&mut VolatileSlice::from(bbuf.as_mut_slice()))?; + buf.copy_from(&bbuf[..n]); + Ok(n) + } else { + self.0.read_volatile(buf) + } + } +} + +impl WriteVolatile for MaybeBounce { + fn write_volatile( + &mut self, + buf: &VolatileSlice, + ) -> Result { + if self.1 { + let mut bbuf = vec![0; buf.len()]; + buf.copy_to(bbuf.as_mut_slice()); + self.0 + .write_volatile(&VolatileSlice::from(bbuf.as_mut_slice())) + } else { + self.0.write_volatile(buf) + } + } +} + +impl Read for MaybeBounce { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + self.0.read(buf) + } +} + +impl Seek for MaybeBounce { + fn seek(&mut self, pos: SeekFrom) -> std::io::Result { + self.0.seek(pos) + } +} + /// Creates a `Vec` of `GuestRegionMmap` with the given configuration pub fn create( regions: impl Iterator, @@ -346,6 +401,7 @@ mod tests { use std::collections::HashMap; use std::io::{Read, Seek}; + use std::os::fd::AsFd; use vmm_sys_util::tempfile::TempFile; @@ -722,4 +778,35 @@ mod tests { seals.insert(memfd::FileSeal::SealGrow); memfd.add_seals(&seals).unwrap_err(); } + + #[test] + fn test_bounce() { + let file_direct = TempFile::new().unwrap(); + let file_bounced = TempFile::new().unwrap(); + + let mut data = (0..=255).collect::>(); + + MaybeBounce(file_direct.as_file().as_fd(), false) + .write_all_volatile(&VolatileSlice::from(data.as_mut_slice())) + .unwrap(); + MaybeBounce(file_bounced.as_file().as_fd(), true) + .write_all_volatile(&VolatileSlice::from(data.as_mut_slice())) + .unwrap(); + + let mut data_direct = vec![0u8; 256]; + let mut data_bounced = vec![0u8; 256]; + + file_direct.as_file().seek(SeekFrom::Start(0)).unwrap(); + file_bounced.as_file().seek(SeekFrom::Start(0)).unwrap(); + + 
MaybeBounce(file_direct.as_file().as_fd(), false) + .read_exact_volatile(&mut VolatileSlice::from(data_direct.as_mut_slice())) + .unwrap(); + MaybeBounce(file_bounced.as_file().as_fd(), true) + .read_exact_volatile(&mut VolatileSlice::from(data_bounced.as_mut_slice())) + .unwrap(); + + assert_eq!(data_direct, data_bounced); + assert_eq!(data_direct, data); + } } From 82b2f47471e8c1b1fe56673d13e6538b3eaf3d4a Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Mon, 14 Apr 2025 11:57:51 +0100 Subject: [PATCH 17/40] allow persistent bounce buffers in MaybeBounce This is particularly useful for virtio devices, where on-demand allocation of bounce buffers leads to sever performance impacts (~80%) in synthetic throughput tests. Additionally, for virtio devices we can know approximately what the optimal size of a statically allocated bounce buffer is. Allocate bounce buffers on the heap, as trying to even temporarily place a 65k bounce buffer on the stack can lead to stack overflow errors. Signed-off-by: Patrick Roy --- src/vmm/src/vstate/memory.rs | 146 ++++++++++++++++++++++++++++------- 1 file changed, 120 insertions(+), 26 deletions(-) diff --git a/src/vmm/src/vstate/memory.rs b/src/vmm/src/vstate/memory.rs index 3138a8026e6..f9206bdc414 100644 --- a/src/vmm/src/vstate/memory.rs +++ b/src/vmm/src/vstate/memory.rs @@ -6,7 +6,7 @@ // found in the THIRD-PARTY file. use std::fs::File; -use std::io::{Read, Seek, SeekFrom}; +use std::io::{Read, Seek, SeekFrom, Write}; use std::sync::Arc; use serde::{Deserialize, Serialize}; @@ -56,52 +56,131 @@ pub enum MemoryError { /// Newtype that implements [`ReadVolatile`] and [`WriteVolatile`] if `T` implements `Read` or /// `Write` respectively, by reading/writing using a bounce buffer, and memcpy-ing into the /// [`VolatileSlice`]. +/// +/// Bounce buffers are allocated on the heap, as on-stack bounce buffers could cause stack +/// overflows. If `N == 0` then bounce buffers will be allocated on demand. 
#[derive(Debug)] -pub struct MaybeBounce(pub T, pub bool); +pub struct MaybeBounce { + pub(crate) target: T, + persistent_buffer: Option>, +} + +impl MaybeBounce { + /// Creates a new `MaybeBounce` that always allocates a bounce + /// buffer on-demand + pub fn new(target: T, should_bounce: bool) -> Self { + MaybeBounce::new_persistent(target, should_bounce) + } +} + +impl MaybeBounce { + /// Creates a new `MaybeBounce` that uses a persistent, fixed size bounce buffer + /// of size `N`. If a read/write request exceeds the size of this bounce buffer, it + /// is split into multiple, `<= N`-size read/writes. + pub fn new_persistent(target: T, should_bounce: bool) -> Self { + let mut bounce = MaybeBounce { + target, + persistent_buffer: None, + }; + + if should_bounce { + bounce.activate() + } + + bounce + } -impl ReadVolatile for MaybeBounce { + /// Activates this [`MaybeBounce`] to start doing reads/writes via a bounce buffer, + /// which is allocated on the heap by this function (e.g. if `activate()` is never called, + /// no bounce buffer is ever allocated). 
+ pub fn activate(&mut self) { + self.persistent_buffer = Some(vec![0u8; N].into_boxed_slice().try_into().unwrap()) + } +} + +impl ReadVolatile for MaybeBounce { fn read_volatile( &mut self, buf: &mut VolatileSlice, ) -> Result { - if self.1 { - let mut bbuf = vec![0; buf.len()]; - let n = self - .0 - .read_volatile(&mut VolatileSlice::from(bbuf.as_mut_slice()))?; - buf.copy_from(&bbuf[..n]); - Ok(n) + if let Some(ref mut persistent) = self.persistent_buffer { + let mut bbuf = (N == 0).then(|| vec![0u8; buf.len()]); + let bbuf = bbuf.as_deref_mut().unwrap_or(persistent.as_mut_slice()); + + let mut buf = buf.offset(0)?; + let mut total = 0; + while !buf.is_empty() { + let how_much = buf.len().min(bbuf.len()); + let n = self + .target + .read_volatile(&mut VolatileSlice::from(&mut bbuf[..how_much]))?; + buf.copy_from(&bbuf[..n]); + + buf = buf.offset(n)?; + total += n; + + if n < how_much { + break; + } + } + + Ok(total) } else { - self.0.read_volatile(buf) + self.target.read_volatile(buf) } } } -impl WriteVolatile for MaybeBounce { +impl WriteVolatile for MaybeBounce { fn write_volatile( &mut self, buf: &VolatileSlice, ) -> Result { - if self.1 { - let mut bbuf = vec![0; buf.len()]; - buf.copy_to(bbuf.as_mut_slice()); - self.0 - .write_volatile(&VolatileSlice::from(bbuf.as_mut_slice())) + if let Some(ref mut persistent) = self.persistent_buffer { + let mut bbuf = (N == 0).then(|| vec![0u8; buf.len()]); + let bbuf = bbuf.as_deref_mut().unwrap_or(persistent.as_mut_slice()); + + let mut buf = buf.offset(0)?; + let mut total = 0; + while !buf.is_empty() { + let how_much = buf.copy_to(bbuf); + let n = self + .target + .write_volatile(&VolatileSlice::from(&mut bbuf[..how_much]))?; + buf = buf.offset(n)?; + total += n; + + if n < how_much { + break; + } + } + + Ok(total) } else { - self.0.write_volatile(buf) + self.target.write_volatile(buf) } } } -impl Read for MaybeBounce { +impl Read for MaybeBounce { fn read(&mut self, buf: &mut [u8]) -> std::io::Result { - 
self.0.read(buf) + self.target.read(buf) + } +} + +impl Write for MaybeBounce { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.target.write(buf) + } + + fn flush(&mut self) -> std::io::Result<()> { + self.target.flush() } } -impl Seek for MaybeBounce { +impl Seek for MaybeBounce { fn seek(&mut self, pos: SeekFrom) -> std::io::Result { - self.0.seek(pos) + self.target.seek(pos) } } @@ -783,30 +862,45 @@ mod tests { fn test_bounce() { let file_direct = TempFile::new().unwrap(); let file_bounced = TempFile::new().unwrap(); + let file_persistent_bounced = TempFile::new().unwrap(); let mut data = (0..=255).collect::>(); - MaybeBounce(file_direct.as_file().as_fd(), false) + MaybeBounce::new(file_direct.as_file().as_fd(), false) .write_all_volatile(&VolatileSlice::from(data.as_mut_slice())) .unwrap(); - MaybeBounce(file_bounced.as_file().as_fd(), true) + MaybeBounce::new(file_bounced.as_file().as_fd(), true) + .write_all_volatile(&VolatileSlice::from(data.as_mut_slice())) + .unwrap(); + MaybeBounce::<_, 7>::new_persistent(file_persistent_bounced.as_file().as_fd(), true) .write_all_volatile(&VolatileSlice::from(data.as_mut_slice())) .unwrap(); let mut data_direct = vec![0u8; 256]; let mut data_bounced = vec![0u8; 256]; + let mut data_persistent_bounced = vec![0u8; 256]; file_direct.as_file().seek(SeekFrom::Start(0)).unwrap(); file_bounced.as_file().seek(SeekFrom::Start(0)).unwrap(); + file_persistent_bounced + .as_file() + .seek(SeekFrom::Start(0)) + .unwrap(); - MaybeBounce(file_direct.as_file().as_fd(), false) + MaybeBounce::new(file_direct.as_file().as_fd(), false) .read_exact_volatile(&mut VolatileSlice::from(data_direct.as_mut_slice())) .unwrap(); - MaybeBounce(file_bounced.as_file().as_fd(), true) + MaybeBounce::new(file_bounced.as_file().as_fd(), true) .read_exact_volatile(&mut VolatileSlice::from(data_bounced.as_mut_slice())) .unwrap(); + MaybeBounce::<_, 7>::new_persistent(file_persistent_bounced.as_file().as_fd(), true) + .read_exact_volatile(&mut 
VolatileSlice::from( + data_persistent_bounced.as_mut_slice(), + )) + .unwrap(); assert_eq!(data_direct, data_bounced); assert_eq!(data_direct, data); + assert_eq!(data_persistent_bounced, data); } } From 120c0cefb0987946a0d974ca36172c39de2592bc Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Fri, 11 Apr 2025 13:11:14 +0100 Subject: [PATCH 18/40] implement userspace bounce buffering support Add support to our virtio devices to allow userspace bounce buffering of virtio buffers. This is an alternative to swiotlb. Don't implement it for vhost-user-blk and for virtio-block with async engine, because I have no idea how that would even work. Signed-off-by: Patrick Roy --- src/vmm/src/device_manager/mmio.rs | 8 ++ src/vmm/src/devices/virtio/balloon/device.rs | 8 ++ src/vmm/src/devices/virtio/block/device.rs | 14 ++++ .../devices/virtio/block/vhost_user/device.rs | 8 ++ .../src/devices/virtio/block/virtio/device.rs | 14 ++++ .../devices/virtio/block/virtio/io/sync_io.rs | 29 +++++-- .../devices/virtio/block/virtio/persist.rs | 12 ++- src/vmm/src/devices/virtio/device.rs | 14 ++++ src/vmm/src/devices/virtio/mmio.rs | 8 ++ src/vmm/src/devices/virtio/net/device.rs | 84 +++++++++++++++++-- src/vmm/src/devices/virtio/net/persist.rs | 2 + src/vmm/src/devices/virtio/net/tap.rs | 2 +- src/vmm/src/devices/virtio/persist.rs | 5 +- src/vmm/src/devices/virtio/rng/device.rs | 8 ++ .../devices/virtio/vsock/csm/connection.rs | 25 +++--- src/vmm/src/devices/virtio/vsock/device.rs | 8 ++ src/vmm/src/devices/virtio/vsock/mod.rs | 5 +- src/vmm/src/devices/virtio/vsock/persist.rs | 7 +- .../src/devices/virtio/vsock/test_utils.rs | 10 ++- .../src/devices/virtio/vsock/unix/muxer.rs | 18 +++- src/vmm/src/vstate/memory.rs | 5 ++ 21 files changed, 259 insertions(+), 35 deletions(-) diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 99bde6e2e78..7425fd56945 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -600,6 
+600,14 @@ mod tests { fn set_acked_features(&mut self, _: u64) {} + fn force_userspace_bounce_buffers(&mut self) { + todo!() + } + + fn userspace_bounce_buffers(&self) -> bool { + todo!() + } + fn device_type(&self) -> u32 { 0 } diff --git a/src/vmm/src/devices/virtio/balloon/device.rs b/src/vmm/src/devices/virtio/balloon/device.rs index 186f09275bc..f22b220984c 100644 --- a/src/vmm/src/devices/virtio/balloon/device.rs +++ b/src/vmm/src/devices/virtio/balloon/device.rs @@ -557,6 +557,14 @@ impl VirtioDevice for Balloon { self.acked_features = acked_features; } + fn force_userspace_bounce_buffers(&mut self) { + // balloon device doesn't have a need for bounce buffers + } + + fn userspace_bounce_buffers(&self) -> bool { + false + } + fn device_type(&self) -> u32 { TYPE_BALLOON } diff --git a/src/vmm/src/devices/virtio/block/device.rs b/src/vmm/src/devices/virtio/block/device.rs index bf3043bcdd4..a55e0254bec 100644 --- a/src/vmm/src/devices/virtio/block/device.rs +++ b/src/vmm/src/devices/virtio/block/device.rs @@ -148,6 +148,20 @@ impl VirtioDevice for Block { } } + fn force_userspace_bounce_buffers(&mut self) { + match self { + Block::Virtio(b) => b.force_userspace_bounce_buffers(), + Block::VhostUser(b) => b.force_userspace_bounce_buffers(), + } + } + + fn userspace_bounce_buffers(&self) -> bool { + match self { + Block::Virtio(b) => b.userspace_bounce_buffers(), + Block::VhostUser(b) => b.userspace_bounce_buffers(), + } + } + fn device_type(&self) -> u32 { TYPE_BLOCK } diff --git a/src/vmm/src/devices/virtio/block/vhost_user/device.rs b/src/vmm/src/devices/virtio/block/vhost_user/device.rs index b0bf5a31e3f..014693d3f5e 100644 --- a/src/vmm/src/devices/virtio/block/vhost_user/device.rs +++ b/src/vmm/src/devices/virtio/block/vhost_user/device.rs @@ -294,6 +294,14 @@ impl VirtioDevice for VhostUserBlock self.acked_features = acked_features; } + fn force_userspace_bounce_buffers(&mut self) { + // Nothing Firecracker can do about this, the backend would need to do 
the bouncing + } + + fn userspace_bounce_buffers(&self) -> bool { + false + } + fn device_type(&self) -> u32 { TYPE_BLOCK } diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index b11c757d43c..7a913d912c1 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -578,6 +578,20 @@ impl VirtioDevice for VirtioBlock { self.acked_features = acked_features; } + fn force_userspace_bounce_buffers(&mut self) { + match self.disk.file_engine { + FileEngine::Async(_) => panic!("No idea how this is supposed to work for io_uring"), + FileEngine::Sync(ref mut engine) => engine.start_bouncing(), + } + } + + fn userspace_bounce_buffers(&self) -> bool { + match self.disk.file_engine { + FileEngine::Async(_) => false, + FileEngine::Sync(ref engine) => engine.is_bouncing(), + } + } + fn device_type(&self) -> u32 { TYPE_BLOCK } diff --git a/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs b/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs index eec3b3d8b8d..576a0a5b1f2 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs @@ -6,7 +6,7 @@ use std::io::{Seek, SeekFrom, Write}; use vm_memory::{GuestMemoryError, ReadVolatile, WriteVolatile}; -use crate::vstate::memory::{GuestAddress, GuestMemory, GuestMemoryMmap}; +use crate::vstate::memory::{GuestAddress, GuestMemory, GuestMemoryMmap, MaybeBounce}; #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum SyncIoError { @@ -22,7 +22,12 @@ pub enum SyncIoError { #[derive(Debug)] pub struct SyncFileEngine { - file: File, + // 65536 is the largest buffer a linux guest will give us, empirically. Determined by + // having `MaybeBounce` logging scenarios where the fixed size bounce buffer isn't sufficient. 
+ // Note that even if this assumption ever changes, the worst that'll happen is that we do + // multiple roundtrips between guest memory and the bounce buffer, as MaybeBounce would + // just chop larger reads/writes into chunks of 65k. + file: MaybeBounce, } // SAFETY: `File` is send and ultimately a POD. @@ -30,17 +35,27 @@ unsafe impl Send for SyncFileEngine {} impl SyncFileEngine { pub fn from_file(file: File) -> SyncFileEngine { - SyncFileEngine { file } + SyncFileEngine { + file: MaybeBounce::new_persistent(file, false), + } } #[cfg(test)] pub fn file(&self) -> &File { - &self.file + &self.file.target + } + + pub fn start_bouncing(&mut self) { + self.file.activate() + } + + pub fn is_bouncing(&self) -> bool { + self.file.is_activated() } /// Update the backing file of the engine pub fn update_file(&mut self, file: File) { - self.file = file + self.file.target = file } pub fn read( @@ -77,8 +92,8 @@ impl SyncFileEngine { pub fn flush(&mut self) -> Result<(), SyncIoError> { // flush() first to force any cached data out of rust buffers. - self.file.flush().map_err(SyncIoError::Flush)?; + self.file.target.flush().map_err(SyncIoError::Flush)?; // Sync data out to physical media on host. 
- self.file.sync_all().map_err(SyncIoError::SyncAll) + self.file.target.sync_all().map_err(SyncIoError::SyncAll) } } diff --git a/src/vmm/src/devices/virtio/block/virtio/persist.rs b/src/vmm/src/devices/virtio/block/virtio/persist.rs index 8c6f2c2453d..a52f901ebab 100644 --- a/src/vmm/src/devices/virtio/block/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/block/virtio/persist.rs @@ -16,7 +16,7 @@ use crate::devices::virtio::TYPE_BLOCK; use crate::devices::virtio::block::persist::BlockConstructorArgs; use crate::devices::virtio::block::virtio::device::FileEngineType; use crate::devices::virtio::block::virtio::metrics::BlockMetricsPerDevice; -use crate::devices::virtio::device::{DeviceState, IrqTrigger}; +use crate::devices::virtio::device::{DeviceState, IrqTrigger, VirtioDevice}; use crate::devices::virtio::generated::virtio_blk::VIRTIO_BLK_F_RO; use crate::devices::virtio::persist::VirtioDeviceState; use crate::rate_limiter::RateLimiter; @@ -127,7 +127,7 @@ impl Persist<'_> for VirtioBlock { capacity: disk_properties.nsectors.to_le(), }; - Ok(VirtioBlock { + let mut dev = VirtioBlock { avail_features, acked_features, config_space, @@ -148,7 +148,13 @@ impl Persist<'_> for VirtioBlock { rate_limiter, is_io_engine_throttled: false, metrics: BlockMetricsPerDevice::alloc(state.id.clone()), - }) + }; + + if state.virtio_state.bounce_in_userspace { + dev.force_userspace_bounce_buffers() + } + + Ok(dev) } } diff --git a/src/vmm/src/devices/virtio/device.rs b/src/vmm/src/devices/virtio/device.rs index 62131e775f5..8c35e4d2f3c 100644 --- a/src/vmm/src/devices/virtio/device.rs +++ b/src/vmm/src/devices/virtio/device.rs @@ -102,6 +102,12 @@ pub trait VirtioDevice: AsAny + Send { /// - self.avail_features() & self.acked_features() = self.get_acked_features() fn set_acked_features(&mut self, acked_features: u64); + /// Make the virtio device use userspace bounce buffers + fn force_userspace_bounce_buffers(&mut self); + + /// Whether this device is using userspace bounce 
buffers + fn userspace_bounce_buffers(&self) -> bool; + /// Check if virtio device has negotiated given feature. fn has_feature(&self, feature: u64) -> bool { (self.acked_features() & (1 << feature)) != 0 @@ -259,6 +265,14 @@ pub(crate) mod tests { todo!() } + fn force_userspace_bounce_buffers(&mut self) { + todo!() + } + + fn userspace_bounce_buffers(&self) -> bool { + todo!() + } + fn device_type(&self) -> u32 { todo!() } diff --git a/src/vmm/src/devices/virtio/mmio.rs b/src/vmm/src/devices/virtio/mmio.rs index 12ee54bfb0a..c061ad82732 100644 --- a/src/vmm/src/devices/virtio/mmio.rs +++ b/src/vmm/src/devices/virtio/mmio.rs @@ -423,6 +423,14 @@ pub(crate) mod tests { self.acked_features = acked_features; } + fn force_userspace_bounce_buffers(&mut self) { + unimplemented!() + } + + fn userspace_bounce_buffers(&self) -> bool { + false + } + fn device_type(&self) -> u32 { 123 } diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs index fff04d1da1a..b6bd6906b23 100755 --- a/src/vmm/src/devices/virtio/net/device.rs +++ b/src/vmm/src/devices/virtio/net/device.rs @@ -6,12 +6,14 @@ // found in the THIRD-PARTY file. 
use std::collections::VecDeque; +use std::io::{Read, Write}; use std::mem::{self}; use std::net::Ipv4Addr; use std::sync::{Arc, Mutex}; use libc::{EAGAIN, iovec}; use log::error; +use vm_memory::VolatileSlice; use vmm_sys_util::eventfd::EventFd; use super::NET_QUEUE_MAX_SIZE; @@ -245,7 +247,9 @@ pub struct Net { pub(crate) rx_rate_limiter: RateLimiter, pub(crate) tx_rate_limiter: RateLimiter, - rx_frame_buf: [u8; MAX_BUFFER_SIZE], + /// Used both for bounce buffering and for relaying frames to MMDS + userspace_buffer: [u8; MAX_BUFFER_SIZE], + pub(crate) userspace_bouncing: bool, tx_frame_headers: [u8; frame_hdr_len()], @@ -311,8 +315,9 @@ impl Net { queue_evts, rx_rate_limiter, tx_rate_limiter, - rx_frame_buf: [0u8; MAX_BUFFER_SIZE], + userspace_buffer: [0u8; MAX_BUFFER_SIZE], tx_frame_headers: [0u8; frame_hdr_len()], + userspace_bouncing: false, irq_trigger: IrqTrigger::new().map_err(NetError::EventFd)?, config_space, guest_mac, @@ -496,6 +501,7 @@ impl Net { // Tries to detour the frame to MMDS and if MMDS doesn't accept it, sends it on the host TAP. // // Returns whether MMDS consumed the frame. + #[allow(clippy::too_many_arguments)] fn write_to_mmds_or_tap( mmds_ns: Option<&mut MmdsNetworkStack>, rate_limiter: &mut RateLimiter, @@ -504,6 +510,7 @@ impl Net { tap: &mut Tap, guest_mac: Option, net_metrics: &NetDeviceMetrics, + bb: Option<&mut [u8]>, ) -> Result { // Read the frame headers from the IoVecBuffer let max_header_len = headers.len(); @@ -551,7 +558,7 @@ impl Net { } let _metric = net_metrics.tap_write_agg.record_latency_metrics(); - match Self::write_tap(tap, frame_iovec) { + match Self::write_tap(tap, frame_iovec, bb) { Ok(_) => { let len = u64::from(frame_iovec.len()); net_metrics.tx_bytes_count.add(len); @@ -585,15 +592,15 @@ impl Net { if let Some(ns) = self.mmds_ns.as_mut() { if let Some(len) = - ns.write_next_frame(frame_bytes_from_buf_mut(&mut self.rx_frame_buf)?) + ns.write_next_frame(frame_bytes_from_buf_mut(&mut self.userspace_buffer)?) 
{ let len = len.get(); METRICS.mmds.tx_frames.inc(); METRICS.mmds.tx_bytes.add(len as u64); - init_vnet_hdr(&mut self.rx_frame_buf); + init_vnet_hdr(&mut self.userspace_buffer); self.rx_buffer .iovec - .write_all_volatile_at(&self.rx_frame_buf[..vnet_hdr_len() + len], 0)?; + .write_all_volatile_at(&self.userspace_buffer[..vnet_hdr_len() + len], 0)?; // SAFETY: // * len will never be bigger that u32::MAX because mmds is bound // by the size of `self.rx_frame_buf` which is MAX_BUFFER_SIZE size. @@ -734,6 +741,8 @@ impl Net { &mut self.tap, self.guest_mac, &self.metrics, + self.userspace_bouncing + .then_some(self.userspace_buffer.as_mut_slice()), ) .unwrap_or(false); if frame_consumed_by_mmds && self.rx_buffer.used_bytes == 0 { @@ -826,11 +835,57 @@ impl Net { } else { self.rx_buffer.single_chain_slice_mut() }; - self.tap.read_iovec(slice) + + if self.userspace_bouncing { + let how_many = self + .tap + .tap_file + .read(self.userspace_buffer.as_mut_slice())?; + + assert!(how_many <= MAX_BUFFER_SIZE); + + let mut offset = 0; + for iov in slice { + assert!( + offset <= how_many, + "copied more bytes into guest memory than read from tap" + ); + + let to_copy = (how_many - offset).min(iov.iov_len); + + if to_copy == 0 { + break; + } + + // SAFETY: the iovec comes from an `IoVecBufferMut`, which upholds the invariant + // that all contained iovecs are covering valid ranges of guest memory. 
+ // Particularly, to_copy <= iov.iov_len + let vslice = unsafe { VolatileSlice::new(iov.iov_base.cast(), to_copy) }; + + vslice.copy_from(&self.userspace_buffer[offset..]); + + offset += to_copy; + } + + Ok(how_many) + } else { + self.tap.read_iovec(slice) + } } - fn write_tap(tap: &mut Tap, buf: &IoVecBuffer) -> std::io::Result { - tap.write_iovec(buf) + fn write_tap( + tap: &mut Tap, + buf: &IoVecBuffer, + bounce_buffer: Option<&mut [u8]>, + ) -> std::io::Result { + if let Some(bb) = bounce_buffer { + let how_many = buf.len() as usize; + let copied = buf.read_volatile_at(&mut &mut *bb, 0, how_many).unwrap(); + assert_eq!(copied, how_many); + tap.tap_file.write(&bb[..copied]) + } else { + tap.write_iovec(buf) + } } /// Process a single RX queue event. @@ -946,6 +1001,14 @@ impl VirtioDevice for Net { self.acked_features = acked_features; } + fn force_userspace_bounce_buffers(&mut self) { + self.userspace_bouncing = true + } + + fn userspace_bounce_buffers(&self) -> bool { + self.userspace_bouncing + } + fn device_type(&self) -> u32 { TYPE_NET } @@ -1931,6 +1994,7 @@ pub mod tests { &mut net.tap, Some(src_mac), &net.metrics, + None ) .unwrap() ) @@ -1970,6 +2034,7 @@ pub mod tests { &mut net.tap, Some(guest_mac), &net.metrics, + None ) ); @@ -1985,6 +2050,7 @@ pub mod tests { &mut net.tap, Some(not_guest_mac), &net.metrics, + None ) ); } diff --git a/src/vmm/src/devices/virtio/net/persist.rs b/src/vmm/src/devices/virtio/net/persist.rs index 5f2d6f560b4..cbb5c8f52a7 100644 --- a/src/vmm/src/devices/virtio/net/persist.rs +++ b/src/vmm/src/devices/virtio/net/persist.rs @@ -152,6 +152,8 @@ impl Persist<'_> for Net { net.avail_features = state.virtio_state.avail_features; net.acked_features = state.virtio_state.acked_features; + net.userspace_bouncing = state.virtio_state.bounce_in_userspace; + if state.virtio_state.activated { let supported_flags: u32 = Net::build_tap_offload_features(net.acked_features); net.tap diff --git a/src/vmm/src/devices/virtio/net/tap.rs 
b/src/vmm/src/devices/virtio/net/tap.rs index c516705af31..30a499489b0 100644 --- a/src/vmm/src/devices/virtio/net/tap.rs +++ b/src/vmm/src/devices/virtio/net/tap.rs @@ -49,7 +49,7 @@ ioctl_iow_nr!(TUNSETVNETHDRSZ, TUNTAP, 216, ::std::os::raw::c_int); /// Tap goes out of scope, and the kernel will clean up the interface automatically. #[derive(Debug)] pub struct Tap { - tap_file: File, + pub(crate) tap_file: File, pub(crate) if_name: [u8; IFACE_NAME_MAX_LEN], } diff --git a/src/vmm/src/devices/virtio/persist.rs b/src/vmm/src/devices/virtio/persist.rs index 7c861352317..ba365617abf 100644 --- a/src/vmm/src/devices/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/persist.rs @@ -125,11 +125,13 @@ pub struct VirtioDeviceState { pub interrupt_status: u32, /// Flag for activated status. pub activated: bool, + /// Whether this device has to use userspace bounce buffers + pub bounce_in_userspace: bool, } impl VirtioDeviceState { /// Construct the virtio state of a device. - pub fn from_device(device: &dyn VirtioDevice) -> Self { + pub fn from_device(device: &impl VirtioDevice) -> Self { VirtioDeviceState { device_type: device.device_type(), avail_features: device.avail_features(), @@ -137,6 +139,7 @@ impl VirtioDeviceState { queues: device.queues().iter().map(Persist::save).collect(), interrupt_status: device.interrupt_status().load(Ordering::Relaxed), activated: device.is_activated(), + bounce_in_userspace: device.userspace_bounce_buffers(), } } diff --git a/src/vmm/src/devices/virtio/rng/device.rs b/src/vmm/src/devices/virtio/rng/device.rs index 97ac8676e0a..50fb1e4ee23 100644 --- a/src/vmm/src/devices/virtio/rng/device.rs +++ b/src/vmm/src/devices/virtio/rng/device.rs @@ -303,6 +303,14 @@ impl VirtioDevice for Entropy { self.device_state = DeviceState::Activated(mem); Ok(()) } + + fn force_userspace_bounce_buffers(&mut self) { + // rng device works with only userspace accesses + } + + fn userspace_bounce_buffers(&self) -> bool { + false + } } #[cfg(test)] diff --git 
a/src/vmm/src/devices/virtio/vsock/csm/connection.rs b/src/vmm/src/devices/virtio/vsock/csm/connection.rs index c9bd5b2c0f7..6f39f8b3079 100644 --- a/src/vmm/src/devices/virtio/vsock/csm/connection.rs +++ b/src/vmm/src/devices/virtio/vsock/csm/connection.rs @@ -95,6 +95,7 @@ use crate::devices::virtio::vsock::metrics::METRICS; use crate::devices::virtio::vsock::packet::{VsockPacketHeader, VsockPacketRx, VsockPacketTx}; use crate::logger::IncMetric; use crate::utils::wrap_usize_to_u32; +use crate::vstate::memory::MaybeBounce; /// Trait that vsock connection backends need to implement. /// @@ -118,7 +119,7 @@ pub struct VsockConnection { /// The peer (guest) port. peer_port: u32, /// The (connected) host-side stream. - stream: S, + pub(crate) stream: MaybeBounce, /// The TX buffer for this connection. tx_buf: TxBuf, /// Total number of bytes that have been successfully written to `self.stream`, either @@ -414,7 +415,7 @@ where /// The connection is interested in being notified about EPOLLIN / EPOLLOUT events on the /// host stream. 
fn as_raw_fd(&self) -> RawFd { - self.stream.as_raw_fd() + self.stream.target.as_raw_fd() } } @@ -509,13 +510,14 @@ where local_port: u32, peer_port: u32, peer_buf_alloc: u32, + bounce: bool, ) -> Self { Self { local_cid, peer_cid, local_port, peer_port, - stream, + stream: MaybeBounce::new_persistent(stream, bounce), state: ConnState::PeerInit, tx_buf: TxBuf::new(), fwd_cnt: Wrapping(0), @@ -535,13 +537,14 @@ where peer_cid: u64, local_port: u32, peer_port: u32, + bounce: bool, ) -> Self { Self { local_cid, peer_cid, local_port, peer_port, - stream, + stream: MaybeBounce::new_persistent(stream, bounce), state: ConnState::LocalInit, tx_buf: TxBuf::new(), fwd_cnt: Wrapping(0), @@ -882,9 +885,10 @@ mod tests { LOCAL_PORT, PEER_PORT, PEER_BUF_ALLOC, + false, ), ConnState::LocalInit => VsockConnection::::new_local_init( - stream, LOCAL_CID, PEER_CID, LOCAL_PORT, PEER_PORT, + stream, LOCAL_CID, PEER_CID, LOCAL_PORT, PEER_PORT, false, ), ConnState::Established => { let mut conn = VsockConnection::::new_peer_init( @@ -894,6 +898,7 @@ mod tests { LOCAL_PORT, PEER_PORT, PEER_BUF_ALLOC, + false, ); assert!(conn.has_pending_rx()); conn.recv_pkt(&mut rx_pkt).unwrap(); @@ -912,7 +917,7 @@ mod tests { } fn set_stream(&mut self, stream: TestStream) { - self.conn.stream = stream; + self.conn.stream = MaybeBounce::new_persistent(stream, false); } fn set_peer_credit(&mut self, credit: u32) { @@ -1014,7 +1019,7 @@ mod tests { let mut ctx = CsmTestContext::new_established(); let data = &[1, 2, 3, 4]; ctx.set_stream(TestStream::new_with_read_buf(data)); - assert_eq!(ctx.conn.as_raw_fd(), ctx.conn.stream.as_raw_fd()); + assert_eq!(ctx.conn.as_raw_fd(), ctx.conn.stream.target.as_raw_fd()); ctx.notify_epollin(); ctx.recv(); assert_eq!(ctx.rx_pkt.hdr.op(), uapi::VSOCK_OP_RW); @@ -1098,7 +1103,7 @@ mod tests { ctx.init_data_tx_pkt(data); ctx.send(); - assert_eq!(ctx.conn.stream.write_buf.len(), 0); + assert_eq!(ctx.conn.stream.target.write_buf.len(), 0); assert!(ctx.conn.tx_buf.is_empty()); 
} @@ -1113,7 +1118,7 @@ mod tests { let data = &[1, 2, 3, 4]; ctx.init_data_tx_pkt(data); ctx.send(); - assert_eq!(ctx.conn.stream.write_buf, data.to_vec()); + assert_eq!(ctx.conn.stream.target.write_buf, data.to_vec()); ctx.notify_epollin(); ctx.recv(); @@ -1233,7 +1238,7 @@ mod tests { ctx.set_stream(TestStream::new()); ctx.conn.notify(EventSet::OUT); assert!(ctx.conn.tx_buf.is_empty()); - assert_eq!(ctx.conn.stream.write_buf, data); + assert_eq!(ctx.conn.stream.target.write_buf, data); } } diff --git a/src/vmm/src/devices/virtio/vsock/device.rs b/src/vmm/src/devices/virtio/vsock/device.rs index aa114f6cccb..55bc97bc7ff 100644 --- a/src/vmm/src/devices/virtio/vsock/device.rs +++ b/src/vmm/src/devices/virtio/vsock/device.rs @@ -280,6 +280,14 @@ where self.acked_features = acked_features } + fn force_userspace_bounce_buffers(&mut self) { + self.backend.start_bouncing() + } + + fn userspace_bounce_buffers(&self) -> bool { + self.backend.is_bouncing() + } + fn device_type(&self) -> u32 { uapi::VIRTIO_ID_VSOCK } diff --git a/src/vmm/src/devices/virtio/vsock/mod.rs b/src/vmm/src/devices/virtio/vsock/mod.rs index 859e198860b..54c9eeef3b9 100644 --- a/src/vmm/src/devices/virtio/vsock/mod.rs +++ b/src/vmm/src/devices/virtio/vsock/mod.rs @@ -185,4 +185,7 @@ pub trait VsockChannel { /// The vsock backend, which is basically an epoll-event-driven vsock channel. /// Currently, the only implementation we have is `crate::devices::virtio::unix::muxer::VsockMuxer`, /// which translates guest-side vsock connections to host-side Unix domain socket connections. 
-pub trait VsockBackend: VsockChannel + VsockEpollListener + Send {} +pub trait VsockBackend: VsockChannel + VsockEpollListener + Send { + fn start_bouncing(&mut self); + fn is_bouncing(&self) -> bool; +} diff --git a/src/vmm/src/devices/virtio/vsock/persist.rs b/src/vmm/src/devices/virtio/vsock/persist.rs index fce6affae69..6128090b601 100644 --- a/src/vmm/src/devices/virtio/vsock/persist.rs +++ b/src/vmm/src/devices/virtio/vsock/persist.rs @@ -10,7 +10,7 @@ use std::sync::atomic::AtomicU32; use serde::{Deserialize, Serialize}; use super::*; -use crate::devices::virtio::device::DeviceState; +use crate::devices::virtio::device::{DeviceState, VirtioDevice}; use crate::devices::virtio::persist::VirtioDeviceState; use crate::devices::virtio::queue::FIRECRACKER_MAX_QUEUE_SIZE; use crate::devices::virtio::vsock::TYPE_VSOCK; @@ -128,6 +128,11 @@ where } else { DeviceState::Inactive }; + + if state.virtio_state.bounce_in_userspace { + vsock.force_userspace_bounce_buffers(); + } + Ok(vsock) } } diff --git a/src/vmm/src/devices/virtio/vsock/test_utils.rs b/src/vmm/src/devices/virtio/vsock/test_utils.rs index 804f0442559..391d543537f 100644 --- a/src/vmm/src/devices/virtio/vsock/test_utils.rs +++ b/src/vmm/src/devices/virtio/vsock/test_utils.rs @@ -111,7 +111,15 @@ impl VsockEpollListener for TestBackend { self.evset = Some(evset); } } -impl VsockBackend for TestBackend {} +impl VsockBackend for TestBackend { + fn start_bouncing(&mut self) { + unimplemented!() + } + + fn is_bouncing(&self) -> bool { + false + } +} #[derive(Debug)] pub struct TestContext { diff --git a/src/vmm/src/devices/virtio/vsock/unix/muxer.rs b/src/vmm/src/devices/virtio/vsock/unix/muxer.rs index 478d5c7318d..5585761af8f 100644 --- a/src/vmm/src/devices/virtio/vsock/unix/muxer.rs +++ b/src/vmm/src/devices/virtio/vsock/unix/muxer.rs @@ -108,6 +108,7 @@ pub struct VsockMuxer { local_port_set: HashSet, /// The last used host-side port. 
local_port_last: u32, + bounce: bool, } impl VsockChannel for VsockMuxer { @@ -299,7 +300,19 @@ impl VsockEpollListener for VsockMuxer { } } -impl VsockBackend for VsockMuxer {} +impl VsockBackend for VsockMuxer { + fn start_bouncing(&mut self) { + self.bounce = true; + + for conn in self.conn_map.values_mut() { + conn.stream.activate() + } + } + + fn is_bouncing(&self) -> bool { + self.bounce + } +} impl VsockMuxer { /// Muxer constructor. @@ -321,6 +334,7 @@ impl VsockMuxer { killq: MuxerKillQ::new(), local_port_last: (1u32 << 30) - 1, local_port_set: HashSet::with_capacity(defs::MAX_CONNECTIONS), + bounce: false, }; // Listen on the host initiated socket, for incoming connections. @@ -402,6 +416,7 @@ impl VsockMuxer { self.cid, local_port, peer_port, + self.bounce, ), ) }) @@ -629,6 +644,7 @@ impl VsockMuxer { pkt.hdr.dst_port(), pkt.hdr.src_port(), pkt.hdr.buf_alloc(), + self.bounce, ), ) }) diff --git a/src/vmm/src/vstate/memory.rs b/src/vmm/src/vstate/memory.rs index f9206bdc414..03f0783500f 100644 --- a/src/vmm/src/vstate/memory.rs +++ b/src/vmm/src/vstate/memory.rs @@ -96,6 +96,11 @@ impl MaybeBounce { pub fn activate(&mut self) { self.persistent_buffer = Some(vec![0u8; N].into_boxed_slice().try_into().unwrap()) } + + /// Returns `true` if this `MaybeBounce` is actually bouncing buffers. + pub fn is_activated(&self) -> bool { + self.persistent_buffer.is_some() + } } impl ReadVolatile for MaybeBounce { From 6da974c0cdc48ec58f82307317c649c66482c134 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Mon, 24 Mar 2025 14:36:35 +0000 Subject: [PATCH 19/40] ci: dont fail downloading artifacts if no firecracker binaries exist If the CI artifacts dont contain old firecracker releases, still succeed at setting them up after downloading them. 
Signed-off-by: Patrick Roy --- tools/setup-ci-artifacts.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/setup-ci-artifacts.sh b/tools/setup-ci-artifacts.sh index 0d524658b51..079eda888d8 100755 --- a/tools/setup-ci-artifacts.sh +++ b/tools/setup-ci-artifacts.sh @@ -12,7 +12,7 @@ say "Setup CI artifacts" cd build/img/$(uname -m) say "Fix executable permissions" -find "firecracker" -type f |xargs chmod -c 755 +find "firecracker" -type f |xargs chmod -c 755 || true say "Generate SSH key to connect from host" if [ ! -s id_rsa ]; then From 24610fc60aaf8def85ebe97e25caf5c69a39574a Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Mon, 31 Mar 2025 13:52:05 +0100 Subject: [PATCH 20/40] add Vm::create_guest_memfd Add a utility function for creating a guest_memfd and wrapping it into a `File` object. Signed-off-by: Patrick Roy --- src/vmm/src/vstate/vm.rs | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 7a8965a4b9a..a19c7ce96aa 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -6,15 +6,17 @@ // found in the THIRD-PARTY file. 
use std::collections::HashMap; -use std::fs::OpenOptions; +use std::fs::{File, OpenOptions}; use std::io::Write; +use std::os::fd::FromRawFd; use std::path::Path; use std::sync::Arc; -use kvm_bindings::{KVM_MEM_LOG_DIRTY_PAGES, kvm_userspace_memory_region}; -use kvm_ioctls::VmFd; +use kvm_bindings::{KVM_MEM_LOG_DIRTY_PAGES, kvm_create_guest_memfd, kvm_userspace_memory_region}; +use kvm_ioctls::{Cap, VmFd}; use vmm_sys_util::eventfd::EventFd; +use crate::arch::host_page_size; pub use crate::arch::{ArchVm as Vm, ArchVmError, VmState}; use crate::logger::info; use crate::persist::CreateSnapshotError; @@ -55,6 +57,10 @@ pub enum VmError { NotEnoughMemorySlots, /// Memory Error: {0} VmMemory(#[from] vm_memory::Error), + /// Failure to create guest_memfd: {0} + GuestMemfd(kvm_ioctls::Error), + /// guest_memfd is not supported on this host kernel. + GuestMemfdNotSupported, } /// Contains Vm functions that are usable across CPU architectures @@ -124,6 +130,32 @@ impl Vm { Ok((vcpus, exit_evt)) } + /// Create a guest_memfd of the specified size + pub fn create_guest_memfd(&self, size: usize, flags: u64) -> Result { + assert_eq!( + size & (host_page_size() - 1), + 0, + "guest_memfd size must be page aligned" + ); + + if !self.fd().check_extension(Cap::GuestMemfd) { + return Err(VmError::GuestMemfdNotSupported); + } + + let kvm_gmem = kvm_create_guest_memfd { + size: size as u64, + flags, + ..Default::default() + }; + + self.fd() + .create_guest_memfd(kvm_gmem) + .map_err(VmError::GuestMemfd) + // SAFETY: We know rawfd is a valid fd because create_guest_memfd didn't return an + // error. + .map(|rawfd| unsafe { File::from_raw_fd(rawfd) }) + } + /// Register a list of new memory regions to this [`Vm`]. 
pub fn register_memory_regions( &mut self, From c8fd23e4431a256a2f0cbdde99d09a95cdee9252 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Mon, 31 Mar 2025 14:05:59 +0100 Subject: [PATCH 21/40] refactor: generify "these features are incompatible" error variants There'll be a lot more things that are incompatible going forward (mostly related to secret freedom), so instead of adding a ton of error variants for each pair of incompatible features, let's just have a single one where we can insert arbitrary features via a string argument. Signed-off-by: Patrick Roy --- src/vmm/src/resources.rs | 13 +++++++++---- src/vmm/src/vmm_config/balloon.rs | 4 ++-- src/vmm/src/vmm_config/machine_config.rs | 6 ++---- .../performance/test_huge_pages.py | 4 ++-- 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index 2ecd5139f8a..03183f20fc7 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -213,7 +213,9 @@ impl VmResources { self.balloon.set_device(balloon); if self.machine_config.huge_pages != HugePageConfig::None { - return Err(ResourcesError::BalloonDevice(BalloonConfigError::HugePages)); + return Err(ResourcesError::BalloonDevice( + BalloonConfigError::IncompatibleWith("huge pages"), + )); } } @@ -255,7 +257,10 @@ impl VmResources { } if self.balloon.get().is_some() && updated.huge_pages != HugePageConfig::None { - return Err(MachineConfigError::BalloonAndHugePages); + return Err(MachineConfigError::Incompatible( + "balloon device", + "huge pages", + )); } self.machine_config = updated; @@ -312,7 +317,7 @@ impl VmResources { } if self.machine_config.huge_pages != HugePageConfig::None { - return Err(BalloonConfigError::HugePages); + return Err(BalloonConfigError::IncompatibleWith("huge pages")); } self.balloon.set(config) @@ -1394,7 +1399,7 @@ mod tests { assert!( matches!( err, - ResourcesError::BalloonDevice(BalloonConfigError::HugePages) + 
ResourcesError::BalloonDevice(BalloonConfigError::IncompatibleWith("huge pages")) ), "{:?}", err diff --git a/src/vmm/src/vmm_config/balloon.rs b/src/vmm/src/vmm_config/balloon.rs index 6ac2fb34ecf..a6fccfe2b4b 100644 --- a/src/vmm/src/vmm_config/balloon.rs +++ b/src/vmm/src/vmm_config/balloon.rs @@ -28,8 +28,8 @@ pub enum BalloonConfigError { CreateFailure(crate::devices::virtio::balloon::BalloonError), /// Error updating the balloon device configuration: {0} UpdateFailure(std::io::Error), - /// Firecracker's huge pages support is incompatible with memory ballooning. - HugePages, + /// Memory ballooning is incompatible with {0}. + IncompatibleWith(&'static str), } /// This struct represents the strongly typed equivalent of the json body diff --git a/src/vmm/src/vmm_config/machine_config.rs b/src/vmm/src/vmm_config/machine_config.rs index 125ee047e2d..39952d7fa0e 100644 --- a/src/vmm/src/vmm_config/machine_config.rs +++ b/src/vmm/src/vmm_config/machine_config.rs @@ -27,10 +27,8 @@ pub enum MachineConfigError { /// Enabling simultaneous multithreading is not supported on aarch64. #[cfg(target_arch = "aarch64")] SmtNotSupported, - /// Could not determine host kernel version when checking hugetlbfs compatibility - KernelVersion, - /// Firecracker's huge pages support is incompatible with memory ballooning. - BalloonAndHugePages, + /// '{0}' and '{1}' are mutually exclusive and cannot be used together. + Incompatible(&'static str, &'static str) } /// Describes the possible (huge)page configurations for a microVM's memory. 
diff --git a/tests/integration_tests/performance/test_huge_pages.py b/tests/integration_tests/performance/test_huge_pages.py index d683afe065e..f5ddfe23786 100644 --- a/tests/integration_tests/performance/test_huge_pages.py +++ b/tests/integration_tests/performance/test_huge_pages.py @@ -255,7 +255,7 @@ def test_negative_huge_pages_plus_balloon(uvm_plain): uvm_plain.basic_config(huge_pages=HugePagesConfig.HUGETLBFS_2MB) with pytest.raises( RuntimeError, - match="Firecracker's huge pages support is incompatible with memory ballooning.", + match="Memory ballooning is incompatible with huge pages.", ): uvm_plain.api.balloon.put(amount_mib=0, deflate_on_oom=False) @@ -264,6 +264,6 @@ def test_negative_huge_pages_plus_balloon(uvm_plain): uvm_plain.api.balloon.put(amount_mib=0, deflate_on_oom=False) with pytest.raises( RuntimeError, - match="Machine config error: Firecracker's huge pages support is incompatible with memory ballooning.", + match="Machine config error: 'balloon device' and 'huge pages' are mutually exclusive and cannot be used together.", ): uvm_plain.basic_config(huge_pages=HugePagesConfig.HUGETLBFS_2MB) From 59cca1631b6a5d1153925bb989916bbf1a8d4851 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Thu, 20 Mar 2025 15:37:50 +0000 Subject: [PATCH 22/40] add "secret_free" parameter to /machine-config endpoint This will later indicate to Firecracker that guest memory should be backed by guest_memfd. Mark vhost-user and async block engine as incompatible, as I/O will require userspace bounce buffers. For vhost-user-blk, we would need to communicate the need for bounce buffers to the backend somehow, and for the async block engine we would need to somehow keep the bounce buffers around until io_uring finishes requests (which is not impossible, but complicated and not needed for now). 
Signed-off-by: Patrick Roy --- .../request/machine_configuration.rs | 5 ++ src/firecracker/swagger/firecracker.yaml | 5 ++ src/vmm/src/device_manager/persist.rs | 1 + src/vmm/src/persist.rs | 4 ++ src/vmm/src/resources.rs | 69 +++++++++++++++++-- src/vmm/src/vmm_config/drive.rs | 2 + src/vmm/src/vmm_config/machine_config.rs | 55 ++++++++++++++- tests/framework/vm_config.json | 1 + .../integration_tests/functional/test_api.py | 2 + 9 files changed, 136 insertions(+), 8 deletions(-) diff --git a/src/firecracker/src/api_server/request/machine_configuration.rs b/src/firecracker/src/api_server/request/machine_configuration.rs index 2e8addffb74..0edb79f3774 100644 --- a/src/firecracker/src/api_server/request/machine_configuration.rs +++ b/src/firecracker/src/api_server/request/machine_configuration.rs @@ -119,6 +119,7 @@ mod tests { let expected_config = MachineConfigUpdate { vcpu_count: Some(8), mem_size_mib: Some(1024), + secret_free: Some(false), smt: Some(false), cpu_template: None, track_dirty_pages: Some(false), @@ -140,6 +141,7 @@ mod tests { let expected_config = MachineConfigUpdate { vcpu_count: Some(8), mem_size_mib: Some(1024), + secret_free: Some(false), smt: Some(false), cpu_template: Some(StaticCpuTemplate::None), track_dirty_pages: Some(false), @@ -161,6 +163,7 @@ mod tests { let expected_config = MachineConfigUpdate { vcpu_count: Some(8), mem_size_mib: Some(1024), + secret_free: Some(false), smt: Some(false), cpu_template: None, track_dirty_pages: Some(true), @@ -186,6 +189,7 @@ mod tests { let expected_config = MachineConfigUpdate { vcpu_count: Some(8), mem_size_mib: Some(1024), + secret_free: Some(false), smt: Some(false), cpu_template: Some(StaticCpuTemplate::T2), track_dirty_pages: Some(true), @@ -213,6 +217,7 @@ mod tests { let expected_config = MachineConfigUpdate { vcpu_count: Some(8), mem_size_mib: Some(1024), + secret_free: Some(false), smt: Some(true), cpu_template: None, track_dirty_pages: Some(true), diff --git 
a/src/firecracker/swagger/firecracker.yaml b/src/firecracker/swagger/firecracker.yaml index dd834baa785..55e6333931b 100644 --- a/src/firecracker/swagger/firecracker.yaml +++ b/src/firecracker/swagger/firecracker.yaml @@ -1044,6 +1044,11 @@ definitions: mem_size_mib: type: integer description: Memory size of VM + secret_free: + type: boolean + description: + If enabled, guest memory will be unmapped from the host kernel's address space, providing additional + protection against transitive execution issues. All I/O then goes through a bounce buffer. track_dirty_pages: type: boolean description: diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 30a6387bc82..a5923dd3624 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -846,6 +846,7 @@ mod tests { "machine-config": {{ "vcpu_count": 1, "mem_size_mib": 128, + "secret_free": false, "smt": false, "track_dirty_pages": false, "huge_pages": "None" diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index aeacadeb66e..0e66b184f44 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -47,6 +47,8 @@ use crate::{EventManager, Vmm, vstate}; pub struct VmInfo { /// Guest memory size. 
pub mem_size_mib: u64, + /// Memory config + pub secret_free: bool, /// smt information pub smt: bool, /// CPU template type @@ -61,6 +63,7 @@ impl From<&VmResources> for VmInfo { fn from(value: &VmResources) -> Self { Self { mem_size_mib: value.machine_config.mem_size_mib as u64, + secret_free: value.machine_config.secret_free, smt: value.machine_config.smt, cpu_template: StaticCpuTemplate::from(&value.machine_config.cpu_template), boot_source: value.boot_source.config.clone(), @@ -360,6 +363,7 @@ pub fn restore_from_snapshot( .update_machine_config(&MachineConfigUpdate { vcpu_count: Some(vcpu_count), mem_size_mib: Some(u64_to_usize(microvm_state.vm_info.mem_size_mib)), + secret_free: Some(microvm_state.vm_info.secret_free), smt: Some(microvm_state.vm_info.smt), cpu_template: Some(microvm_state.vm_info.cpu_template), track_dirty_pages: Some(track_dirty_pages), diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index 03183f20fc7..ac4adc22a56 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -9,6 +9,7 @@ use serde::{Deserialize, Serialize}; use crate::cpu_config::templates::CustomCpuTemplate; use crate::device_manager::persist::SharedDeviceType; +use crate::devices::virtio::block::device::Block; use crate::logger::info; use crate::mmds; use crate::mmds::data_store::{Mmds, MmdsVersion}; @@ -217,6 +218,11 @@ impl VmResources { BalloonConfigError::IncompatibleWith("huge pages"), )); } + if self.machine_config.secret_free { + return Err(ResourcesError::BalloonDevice( + BalloonConfigError::IncompatibleWith("secret freedom"), + )); + } } SharedDeviceType::Vsock(vsock) => { @@ -262,6 +268,27 @@ impl VmResources { "huge pages", )); } + if self.balloon.get().is_some() && updated.secret_free { + return Err(MachineConfigError::Incompatible( + "balloon device", + "secret freedom", + )); + } + if updated.secret_free { + if self.vhost_user_devices_used() { + return Err(MachineConfigError::Incompatible( + "vhost-user devices", + "userspace 
bounce buffers", + )); + } + + if self.async_block_engine_used() { + return Err(MachineConfigError::Incompatible( + "async block engine", + "userspace bounce buffers", + )); + } + } self.machine_config = updated; Ok(()) @@ -320,6 +347,10 @@ impl VmResources { return Err(BalloonConfigError::IncompatibleWith("huge pages")); } + if self.machine_config.secret_free { + return Err(BalloonConfigError::IncompatibleWith("secret freedom")); + } + self.balloon.set(config) } @@ -343,6 +374,17 @@ impl VmResources { &mut self, block_device_config: BlockDeviceConfig, ) -> Result<(), DriveError> { + if self.machine_config.secret_free { + if block_device_config.file_engine_type == Some(FileEngineType::Async) { + return Err(DriveError::IncompatibleWithSecretFreedom( + "async file engine", + )); + } + + if block_device_config.socket.is_some() { + return Err(DriveError::IncompatibleWithSecretFreedom("vhost-user-blk")); + } + } self.block.insert(block_device_config) } @@ -442,17 +484,29 @@ impl VmResources { Ok(()) } + /// Returns true if any vhost user devices are configured in this [`VmResources`] object + pub fn vhost_user_devices_used(&self) -> bool { + self.block + .devices + .iter() + .any(|b| b.lock().expect("Poisoned lock").is_vhost_user()) + } + + fn async_block_engine_used(&self) -> bool { + self.block + .devices + .iter() + .any(|b| match &*b.lock().unwrap() { + Block::Virtio(b) => b.file_engine_type() == FileEngineType::Async, + Block::VhostUser(_) => false, + }) + } + /// Allocates guest memory in a configuration most appropriate for these [`VmResources`]. /// /// If vhost-user-blk devices are in use, allocates memfd-backed shared memory, otherwise /// prefers anonymous memory for performance reasons. pub fn allocate_guest_memory(&self) -> Result, MemoryError> { - let vhost_user_device_used = self - .block - .devices - .iter() - .any(|b| b.lock().expect("Poisoned lock").is_vhost_user()); - // Page faults are more expensive for shared memory mapping, including memfd. 
// For this reason, we only back guest memory with a memfd // if a vhost-user-blk device is configured in the VM, otherwise we fall back to @@ -464,7 +518,7 @@ impl VmResources { // that would not be worth the effort. let regions = crate::arch::arch_memory_regions(0, mib_to_bytes(self.machine_config.mem_size_mib)); - if vhost_user_device_used { + if self.vhost_user_devices_used() { memory::memfd_backed( regions.as_ref(), self.machine_config.track_dirty_pages, @@ -1307,6 +1361,7 @@ mod tests { let mut aux_vm_config = MachineConfigUpdate { vcpu_count: Some(32), mem_size_mib: Some(512), + secret_free: Some(false), smt: Some(false), #[cfg(target_arch = "x86_64")] cpu_template: Some(StaticCpuTemplate::T2), diff --git a/src/vmm/src/vmm_config/drive.rs b/src/vmm/src/vmm_config/drive.rs index 9e301eff751..88a9b813874 100644 --- a/src/vmm/src/vmm_config/drive.rs +++ b/src/vmm/src/vmm_config/drive.rs @@ -24,6 +24,8 @@ pub enum DriveError { DeviceUpdate(VmmError), /// A root block device already exists! RootBlockDeviceAlreadyAdded, + /// {0} is incompatible with secret freedom. + IncompatibleWithSecretFreedom(&'static str), } /// Use this structure to set up the Block Device before booting the kernel. diff --git a/src/vmm/src/vmm_config/machine_config.rs b/src/vmm/src/vmm_config/machine_config.rs index 39952d7fa0e..3d30860144e 100644 --- a/src/vmm/src/vmm_config/machine_config.rs +++ b/src/vmm/src/vmm_config/machine_config.rs @@ -95,6 +95,11 @@ pub struct MachineConfig { pub vcpu_count: u8, /// The memory size in MiB. pub mem_size_mib: usize, + /// Whether guest_memfd should be used to back normal guest memory. If this is enabled + /// and any devices are attached to the VM, userspace bounce buffers will be used + /// as I/O into secret free memory is not possible. + #[serde(default)] + pub secret_free: bool, /// Enables or disabled SMT. 
#[serde(default)] pub smt: bool, @@ -151,6 +156,7 @@ impl Default for MachineConfig { Self { vcpu_count: 1, mem_size_mib: DEFAULT_MEM_SIZE_MIB, + secret_free: false, smt: false, cpu_template: None, track_dirty_pages: false, @@ -176,6 +182,9 @@ pub struct MachineConfigUpdate { /// The memory size in MiB. #[serde(default)] pub mem_size_mib: Option, + /// Whether secret freedom should be enabled + #[serde(default)] + pub secret_free: Option, /// Enables or disabled SMT. #[serde(default)] pub smt: Option, @@ -208,6 +217,7 @@ impl From for MachineConfigUpdate { MachineConfigUpdate { vcpu_count: Some(cfg.vcpu_count), mem_size_mib: Some(cfg.mem_size_mib), + secret_free: Some(cfg.secret_free), smt: Some(cfg.smt), cpu_template: cfg.static_template(), track_dirty_pages: Some(cfg.track_dirty_pages), @@ -261,11 +271,27 @@ impl MachineConfig { let mem_size_mib = update.mem_size_mib.unwrap_or(self.mem_size_mib); let page_config = update.huge_pages.unwrap_or(self.huge_pages); + let secret_free = update.secret_free.unwrap_or(self.secret_free); + let track_dirty_pages = update.track_dirty_pages.unwrap_or(self.track_dirty_pages); if mem_size_mib == 0 || !page_config.is_valid_mem_size(mem_size_mib) { return Err(MachineConfigError::InvalidMemorySize); } + if secret_free && page_config != HugePageConfig::None { + return Err(MachineConfigError::Incompatible( + "secret freedom", + "huge pages", + )); + } + + if secret_free && track_dirty_pages { + return Err(MachineConfigError::Incompatible( + "secret freedom", + "diff snapshots", + )); + } + let cpu_template = match update.cpu_template { None => self.cpu_template.clone(), Some(StaticCpuTemplate::None) => None, @@ -275,9 +301,10 @@ impl MachineConfig { Ok(MachineConfig { vcpu_count, mem_size_mib, + secret_free, smt, cpu_template, - track_dirty_pages: update.track_dirty_pages.unwrap_or(self.track_dirty_pages), + track_dirty_pages, huge_pages: page_config, #[cfg(feature = "gdb")] gdb_socket_path: update.gdb_socket_path.clone(), @@ -343,6 
+370,32 @@ mod tests { .unwrap(); assert_eq!(updated.huge_pages, HugePageConfig::Hugetlbfs2M); assert_eq!(updated.mem_size_mib, 32); + + let res = mconf.update(&MachineConfigUpdate { + huge_pages: Some(HugePageConfig::Hugetlbfs2M), + secret_free: Some(true), + ..Default::default() + }); + assert_eq!( + res, + Err(MachineConfigError::Incompatible( + "secret freedom", + "huge pages" + )) + ); + + let res = mconf.update(&MachineConfigUpdate { + track_dirty_pages: Some(true), + secret_free: Some(true), + ..Default::default() + }); + assert_eq!( + res, + Err(MachineConfigError::Incompatible( + "secret freedom", + "diff snapshots" + )) + ); } #[test] diff --git a/tests/framework/vm_config.json b/tests/framework/vm_config.json index 5df673308d9..a026f8a7571 100644 --- a/tests/framework/vm_config.json +++ b/tests/framework/vm_config.json @@ -20,6 +20,7 @@ "machine-config": { "vcpu_count": 2, "mem_size_mib": 1024, + "secret_free": false, "smt": false, "track_dirty_pages": false, "huge_pages": "None" diff --git a/tests/integration_tests/functional/test_api.py b/tests/integration_tests/functional/test_api.py index 864c6d5eda9..d1c9d4c9581 100644 --- a/tests/integration_tests/functional/test_api.py +++ b/tests/integration_tests/functional/test_api.py @@ -1062,6 +1062,7 @@ def test_get_full_config_after_restoring_snapshot(microvm_factory, uvm_nano): setup_cfg["machine-config"] = { "vcpu_count": 2, "mem_size_mib": 256, + "secret_free": False, "smt": True, "track_dirty_pages": False, "huge_pages": "None", @@ -1175,6 +1176,7 @@ def test_get_full_config(uvm_plain): expected_cfg["machine-config"] = { "vcpu_count": 2, "mem_size_mib": 256, + "secret_free": False, "smt": False, "track_dirty_pages": False, "huge_pages": "None", From 841ac045422066fff36d3562e617aca02ae3a6fe Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Fri, 25 Apr 2025 10:08:17 +0100 Subject: [PATCH 23/40] use bounce buffers for loading kernel if secret freedom is enabled If secret freedom is enabled, the guest kernel 
and potentially the initrd need to be loaded via bounce buffer, as we cannot directly do `read` syscalls that target guest memory. Signed-off-by: Patrick Roy --- src/vmm/src/arch/aarch64/mod.rs | 14 ++++-------- src/vmm/src/arch/x86_64/mod.rs | 15 +++++-------- src/vmm/src/builder.rs | 29 ++++++++++++++++++++++--- src/vmm/src/initrd.rs | 38 +++++++++------------------------ 4 files changed, 45 insertions(+), 51 deletions(-) diff --git a/src/vmm/src/arch/aarch64/mod.rs b/src/vmm/src/arch/aarch64/mod.rs index ead827c08c4..6bb379b9f9c 100644 --- a/src/vmm/src/arch/aarch64/mod.rs +++ b/src/vmm/src/arch/aarch64/mod.rs @@ -18,11 +18,11 @@ pub mod vm; use std::cmp::min; use std::fmt::Debug; -use std::fs::File; +use std::io::{Read, Seek}; use linux_loader::loader::pe::PE as Loader; use linux_loader::loader::{Cmdline, KernelLoader}; -use vm_memory::GuestMemoryError; +use vm_memory::{GuestMemoryError, ReadVolatile}; use crate::arch::{BootProtocol, EntryPoint}; use crate::cpu_config::aarch64::{CpuConfiguration, CpuConfigurationError}; @@ -187,16 +187,10 @@ fn get_fdt_addr(mem: &GuestMemoryMmap) -> u64 { } /// Load linux kernel into guest memory. -pub fn load_kernel( - kernel: &File, +pub fn load_kernel( + mut kernel_file: R, guest_memory: &GuestMemoryMmap, ) -> Result { - // Need to clone the File because reading from it - // mutates it. 
- let mut kernel_file = kernel - .try_clone() - .map_err(|_| ConfigurationError::KernelFile)?; - let entry_addr = Loader::load( guest_memory, Some(GuestAddress(get_kernel_start())), diff --git a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs index ca350cbf9af..55bcc544f8d 100644 --- a/src/vmm/src/arch/x86_64/mod.rs +++ b/src/vmm/src/arch/x86_64/mod.rs @@ -31,7 +31,7 @@ pub mod xstate; #[allow(missing_docs)] pub mod generated; -use std::fs::File; +use std::io::{Read, Seek}; use layout::CMDLINE_START; use linux_loader::configurator::linux::LinuxBootConfigurator; @@ -44,6 +44,7 @@ use linux_loader::loader::elf::start_info::{ }; use linux_loader::loader::{Cmdline, KernelLoader, PvhBootCapability, load_cmdline}; use log::debug; +use vm_memory::ReadVolatile; use super::EntryPoint; use crate::acpi::create_acpi_tables; @@ -438,20 +439,14 @@ fn add_e820_entry( } /// Load linux kernel into guest memory. -pub fn load_kernel( - kernel: &File, +pub fn load_kernel( + mut kernel: R, guest_memory: &GuestMemoryMmap, ) -> Result { - // Need to clone the File because reading from it - // mutates it. 
- let mut kernel_file = kernel - .try_clone() - .map_err(|_| ConfigurationError::KernelFile)?; - let entry_addr = Loader::load( guest_memory, None, - &mut kernel_file, + &mut kernel, Some(GuestAddress(get_kernel_start())), ) .map_err(ConfigurationError::KernelLoader)?; diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 398c25ba056..5b54b01f561 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -5,6 +5,8 @@ use std::fmt::Debug; use std::io; +use std::os::fd::AsFd; +use std::os::unix::fs::MetadataExt; #[cfg(feature = "gdb")] use std::sync::mpsc; use std::sync::{Arc, Mutex}; @@ -54,10 +56,11 @@ use crate::persist::{MicrovmState, MicrovmStateError}; use crate::resources::VmResources; use crate::seccomp::BpfThreadMap; use crate::snapshot::Persist; +use crate::utils::u64_to_usize; use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::MachineConfigError; use crate::vstate::kvm::Kvm; -use crate::vstate::memory::GuestRegionMmap; +use crate::vstate::memory::{GuestRegionMmap, MaybeBounce}; use crate::vstate::vcpu::{Vcpu, VcpuError}; use crate::vstate::vm::Vm; use crate::{EventManager, Vmm, VmmError, device_manager}; @@ -237,8 +240,28 @@ pub fn build_microvm_for_boot( .register_memory_regions(guest_memory) .map_err(VmmError::Vm)?; - let entry_point = load_kernel(&boot_config.kernel_file, vmm.vm.guest_memory())?; - let initrd = InitrdConfig::from_config(boot_config, vmm.vm.guest_memory())?; + let entry_point = load_kernel( + MaybeBounce::new( + boot_config.kernel_file.try_clone().unwrap(), + vm_resources.machine_config.secret_free, + ), + vmm.vm.guest_memory(), + )?; + let initrd = match &boot_config.initrd_file { + Some(initrd_file) => { + let size = initrd_file + .metadata() + .map_err(InitrdError::Metadata)? + .size(); + + Some(InitrdConfig::from_reader( + vmm.vm.guest_memory(), + MaybeBounce::new(initrd_file.as_fd(), vm_resources.machine_config.secret_free), + u64_to_usize(size), + )?) 
+ } + None => None, + }; #[cfg(feature = "gdb")] let (gdb_tx, gdb_rx) = mpsc::channel(); diff --git a/src/vmm/src/initrd.rs b/src/vmm/src/initrd.rs index 9dfcd8bc16e..624ec397f73 100644 --- a/src/vmm/src/initrd.rs +++ b/src/vmm/src/initrd.rs @@ -1,14 +1,9 @@ // Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 -use std::fs::File; -use std::os::unix::fs::MetadataExt; - use vm_memory::{GuestAddress, GuestMemory, ReadVolatile, VolatileMemoryError}; use crate::arch::initrd_load_addr; -use crate::utils::u64_to_usize; -use crate::vmm_config::boot_source::BootConfig; use crate::vstate::memory::GuestMemoryMmap; /// Errors associated with initrd loading. @@ -20,8 +15,6 @@ pub enum InitrdError { Load, /// Cannot image metadata: {0} Metadata(std::io::Error), - /// Cannot copy initrd file fd: {0} - CloneFd(std::io::Error), /// Cannot load initrd due to an invalid image: {0} Read(VolatileMemoryError), } @@ -36,31 +29,20 @@ pub struct InitrdConfig { } impl InitrdConfig { - /// Load initrd into guest memory based on the boot config. - pub fn from_config( - boot_cfg: &BootConfig, - vm_memory: &GuestMemoryMmap, - ) -> Result, InitrdError> { - Ok(match &boot_cfg.initrd_file { - Some(f) => { - let f = f.try_clone().map_err(InitrdError::CloneFd)?; - Some(Self::from_file(vm_memory, f)?) - } - None => None, - }) - } - /// Loads the initrd from a file into guest memory. 
- pub fn from_file(vm_memory: &GuestMemoryMmap, mut file: File) -> Result { - let size = file.metadata().map_err(InitrdError::Metadata)?.size(); - let size = u64_to_usize(size); + pub fn from_reader( + vm_memory: &GuestMemoryMmap, + mut reader: R, + size: usize, + ) -> Result { let Some(address) = initrd_load_addr(vm_memory, size) else { return Err(InitrdError::Address); }; let mut slice = vm_memory .get_slice(GuestAddress(address), size) .map_err(|_| InitrdError::Load)?; - file.read_exact_volatile(&mut slice) + reader + .read_exact_volatile(&mut slice) .map_err(InitrdError::Read)?; Ok(InitrdConfig { @@ -105,7 +87,7 @@ mod tests { // Need to reset the cursor to read initrd properly. tempfile.seek(SeekFrom::Start(0)).unwrap(); - let initrd = InitrdConfig::from_file(&gm, tempfile).unwrap(); + let initrd = InitrdConfig::from_reader(&gm, tempfile, image.len()).unwrap(); assert!(gm.address_in_range(initrd.address)); assert_eq!(initrd.size, image.len()); } @@ -120,7 +102,7 @@ mod tests { // Need to reset the cursor to read initrd properly. tempfile.seek(SeekFrom::Start(0)).unwrap(); - let res = InitrdConfig::from_file(&gm, tempfile); + let res = InitrdConfig::from_reader(&gm, tempfile, image.len()); assert!(matches!(res, Err(InitrdError::Address)), "{:?}", res); } @@ -134,7 +116,7 @@ mod tests { // Need to reset the cursor to read initrd properly. tempfile.seek(SeekFrom::Start(0)).unwrap(); - let res = InitrdConfig::from_file(&gm, tempfile); + let res = InitrdConfig::from_reader(&gm, tempfile, image.len()); assert!(matches!(res, Err(InitrdError::Address)), "{:?}", res); } } From c9a9cfb2b9d222876fea1fb9ffb494ff75398f1b Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Fri, 11 Apr 2025 13:24:45 +0100 Subject: [PATCH 24/40] use userspace bounce buffers if secret freedom is enabled Needed because we cannot do I/O straight into secret hidden memory - the host kernel cannot access it. 
Signed-off-by: Patrick Roy --- src/vmm/src/builder.rs | 57 ++++++++++++++++--- .../devices/virtio/block/vhost_user/device.rs | 1 + .../src/devices/virtio/block/virtio/device.rs | 4 +- 3 files changed, 54 insertions(+), 8 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 5b54b01f561..a31a66f4d72 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -291,16 +291,24 @@ pub fn build_microvm_for_boot( &mut boot_cmdline, vm_resources.block.devices.iter(), event_manager, + vm_resources.machine_config.secret_free, )?; attach_net_devices( &mut vmm, &mut boot_cmdline, vm_resources.net_builder.iter(), event_manager, + vm_resources.machine_config.secret_free, )?; if let Some(unix_vsock) = vm_resources.vsock.get() { - attach_unixsock_vsock_device(&mut vmm, &mut boot_cmdline, unix_vsock, event_manager)?; + attach_unixsock_vsock_device( + &mut vmm, + &mut boot_cmdline, + unix_vsock, + event_manager, + vm_resources.machine_config.secret_free, + )?; } if let Some(entropy) = vm_resources.entropy.get() { @@ -617,9 +625,14 @@ fn attach_virtio_device( device: Arc>, cmdline: &mut LoaderKernelCmdline, is_vhost_user: bool, + secret_free: bool, ) -> Result<(), MmioError> { event_manager.add_subscriber(device.clone()); + if secret_free { + device.lock().unwrap().force_userspace_bounce_buffers(); + } + // The device mutex mustn't be locked here otherwise it will deadlock. 
let device = MmioTransport::new(vmm.vm.guest_memory().clone(), device, is_vhost_user); vmm.mmio_device_manager @@ -675,6 +688,7 @@ fn attach_entropy_device( entropy_device.clone(), cmdline, false, + false, ) } @@ -683,6 +697,7 @@ fn attach_block_devices<'a, I: Iterator>> + Debug>( cmdline: &mut LoaderKernelCmdline, blocks: I, event_manager: &mut EventManager, + secret_free: bool, ) -> Result<(), StartMicrovmError> { for block in blocks { let (id, is_vhost_user) = { @@ -707,6 +722,7 @@ fn attach_block_devices<'a, I: Iterator>> + Debug>( block.clone(), cmdline, is_vhost_user, + secret_free, )?; } Ok(()) @@ -717,11 +733,20 @@ fn attach_net_devices<'a, I: Iterator>> + Debug>( cmdline: &mut LoaderKernelCmdline, net_devices: I, event_manager: &mut EventManager, + secret_free: bool, ) -> Result<(), StartMicrovmError> { for net_device in net_devices { let id = net_device.lock().expect("Poisoned lock").id().clone(); // The device mutex mustn't be locked here otherwise it will deadlock. - attach_virtio_device(event_manager, vmm, id, net_device.clone(), cmdline, false)?; + attach_virtio_device( + event_manager, + vmm, + id, + net_device.clone(), + cmdline, + false, + secret_free, + )?; } Ok(()) } @@ -731,10 +756,19 @@ fn attach_unixsock_vsock_device( cmdline: &mut LoaderKernelCmdline, unix_vsock: &Arc>>, event_manager: &mut EventManager, + secret_free: bool, ) -> Result<(), MmioError> { let id = String::from(unix_vsock.lock().expect("Poisoned lock").id()); // The device mutex mustn't be locked here otherwise it will deadlock. - attach_virtio_device(event_manager, vmm, id, unix_vsock.clone(), cmdline, false) + attach_virtio_device( + event_manager, + vmm, + id, + unix_vsock.clone(), + cmdline, + false, + secret_free, + ) } fn attach_balloon_device( @@ -745,7 +779,15 @@ fn attach_balloon_device( ) -> Result<(), MmioError> { let id = String::from(balloon.lock().expect("Poisoned lock").id()); // The device mutex mustn't be locked here otherwise it will deadlock. 
- attach_virtio_device(event_manager, vmm, id, balloon.clone(), cmdline, false) + attach_virtio_device( + event_manager, + vmm, + id, + balloon.clone(), + cmdline, + false, + false, + ) } // Adds `O_NONBLOCK` to the stdout flags. @@ -921,6 +963,7 @@ pub(crate) mod tests { cmdline, block_dev_configs.devices.iter(), event_manager, + false, ) .unwrap(); block_files @@ -935,7 +978,7 @@ pub(crate) mod tests { let mut net_builder = NetBuilder::new(); net_builder.build(net_config).unwrap(); - let res = attach_net_devices(vmm, cmdline, net_builder.iter(), event_manager); + let res = attach_net_devices(vmm, cmdline, net_builder.iter(), event_manager, false); res.unwrap(); } @@ -956,7 +999,7 @@ pub(crate) mod tests { Arc::new(Mutex::new(mmds)), ); - attach_net_devices(vmm, cmdline, net_builder.iter(), event_manager).unwrap(); + attach_net_devices(vmm, cmdline, net_builder.iter(), event_manager, false).unwrap(); } pub(crate) fn insert_vsock_device( @@ -969,7 +1012,7 @@ pub(crate) mod tests { let vsock = VsockBuilder::create_unixsock_vsock(vsock_config).unwrap(); let vsock = Arc::new(Mutex::new(vsock)); - attach_unixsock_vsock_device(vmm, cmdline, &vsock, event_manager).unwrap(); + attach_unixsock_vsock_device(vmm, cmdline, &vsock, event_manager, false).unwrap(); assert!( vmm.mmio_device_manager diff --git a/src/vmm/src/devices/virtio/block/vhost_user/device.rs b/src/vmm/src/devices/virtio/block/vhost_user/device.rs index 014693d3f5e..62b0002c371 100644 --- a/src/vmm/src/devices/virtio/block/vhost_user/device.rs +++ b/src/vmm/src/devices/virtio/block/vhost_user/device.rs @@ -296,6 +296,7 @@ impl VirtioDevice for VhostUserBlock fn force_userspace_bounce_buffers(&mut self) { // Nothing Firecracker can do about this, the backend would need to do the bouncing + panic!("vhost-user-blk is incompatible with userspace bounce buffers") } fn userspace_bounce_buffers(&self) -> bool { diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs 
b/src/vmm/src/devices/virtio/block/virtio/device.rs
index 7a913d912c1..6ce866806ba 100644
--- a/src/vmm/src/devices/virtio/block/virtio/device.rs
+++ b/src/vmm/src/devices/virtio/block/virtio/device.rs
@@ -580,7 +580,9 @@ impl VirtioDevice for VirtioBlock {
 
     fn force_userspace_bounce_buffers(&mut self) {
         match self.disk.file_engine {
-            FileEngine::Async(_) => panic!("No idea how this is supposed to work for io_uring"),
+            FileEngine::Async(_) => {
+                panic!("async engine is incompatible with userspace bounce buffers")
+            }
             FileEngine::Sync(ref mut engine) => engine.start_bouncing(),
         }
     }

From 3c68e6370c9c937d9eb4829e5a812363d2a61604 Mon Sep 17 00:00:00 2001
From: Patrick Roy
Date: Tue, 1 Apr 2025 12:36:55 +0100
Subject: [PATCH 25/40] switch to using kvm_userspace_memory_region2

Fall back to kvm_userspace_memory_region in case version 2 of the
struct isn't supported.

Signed-off-by: Patrick Roy
---
 src/vmm/src/vstate/vm.rs | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs
index a19c7ce96aa..153d1839522 100644
--- a/src/vmm/src/vstate/vm.rs
+++ b/src/vmm/src/vstate/vm.rs
@@ -12,7 +12,10 @@ use std::os::fd::FromRawFd;
 use std::path::Path;
 use std::sync::Arc;
 
-use kvm_bindings::{KVM_MEM_LOG_DIRTY_PAGES, kvm_create_guest_memfd, kvm_userspace_memory_region};
+use kvm_bindings::{
+    KVM_MEM_LOG_DIRTY_PAGES, kvm_create_guest_memfd, kvm_userspace_memory_region,
+    kvm_userspace_memory_region2,
+};
 use kvm_ioctls::{Cap, VmFd};
 use vmm_sys_util::eventfd::EventFd;
 
@@ -185,21 +188,37 @@ impl Vm {
             0
         };
 
-        let memory_region = kvm_userspace_memory_region {
+        let memory_region = kvm_userspace_memory_region2 {
             slot: next_slot,
             guest_phys_addr: region.start_addr().raw_value(),
             memory_size: region.len(),
             userspace_addr: region.as_ptr() as u64,
             flags,
+            ..Default::default()
         };
 
         let new_guest_memory = self.common.guest_memory.insert_region(Arc::new(region))?;
 
-        // SAFETY: Safe because the fd is a valid
KVM file descriptor.
-        unsafe {
-            self.fd()
-                .set_user_memory_region(memory_region)
-                .map_err(VmError::SetUserMemoryRegion)?;
+        if self.fd().check_extension(Cap::UserMemory2) {
+            // SAFETY: We are passing a valid memory region and operate on a valid KVM FD.
+            unsafe {
+                self.fd()
+                    .set_user_memory_region2(memory_region)
+                    .map_err(VmError::SetUserMemoryRegion)?;
+            }
+        } else {
+            // SAFETY: We are passing a valid memory region and operate on a valid KVM FD.
+            unsafe {
+                self.fd()
+                    .set_user_memory_region(kvm_userspace_memory_region {
+                        slot: memory_region.slot,
+                        flags: memory_region.flags,
+                        guest_phys_addr: memory_region.guest_phys_addr,
+                        memory_size: memory_region.memory_size,
+                        userspace_addr: memory_region.userspace_addr,
+                    })
+                    .map_err(VmError::SetUserMemoryRegion)?;
+            }
         }
 
         self.common.guest_memory = new_guest_memory;

From 0f49046ac732f3aabc0cbf9b6d87842a106e2c46 Mon Sep 17 00:00:00 2001
From: Patrick Roy
Date: Wed, 2 Apr 2025 14:54:48 +0100
Subject: [PATCH 26/40] tmp: call mmap ourselves

vm-memory has faulty validation logic that prevents us from mmap-ing
guest_memfds, so just bypass that by calling mmap ourselves for the
time being.
See also https://github.com/rust-vmm/vm-memory/pull/320 Signed-off-by: Patrick Roy --- src/vmm/src/vstate/memory.rs | 36 ++++++++++++++++--- .../integration_tests/functional/test_api.py | 4 +-- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/src/vmm/src/vstate/memory.rs b/src/vmm/src/vstate/memory.rs index 03f0783500f..dd99e6b9e1b 100644 --- a/src/vmm/src/vstate/memory.rs +++ b/src/vmm/src/vstate/memory.rs @@ -7,6 +7,8 @@ use std::fs::File; use std::io::{Read, Seek, SeekFrom, Write}; +use std::os::fd::AsRawFd; +use std::ptr::null_mut; use std::sync::Arc; use serde::{Deserialize, Serialize}; @@ -51,6 +53,8 @@ pub enum MemoryError { MemfdSetLen(std::io::Error), /// Total sum of memory regions exceeds largest possible file offset OffsetTooLarge, + /// Error calling mmap: {0} + Mmap(std::io::Error), } /// Newtype that implements [`ReadVolatile`] and [`WriteVolatile`] if `T` implements `Read` or @@ -203,16 +207,40 @@ pub fn create( let mut builder = MmapRegionBuilder::new_with_bitmap( size, track_dirty_pages.then(|| AtomicBitmap::with_len(size)), - ) - .with_mmap_prot(libc::PROT_READ | libc::PROT_WRITE) - .with_mmap_flags(libc::MAP_NORESERVE | mmap_flags); + ); - if let Some(ref file) = file { + // when computing offset below we ensure it fits into i64 + #[allow(clippy::cast_possible_wrap)] + let (fd, fd_off) = if let Some(ref file) = file { let file_offset = FileOffset::from_arc(Arc::clone(file), offset); builder = builder.with_file_offset(file_offset); + + (file.as_raw_fd(), offset as libc::off_t) + } else { + (-1, 0) + }; + + // SAFETY: the arguments to mmap cannot cause any memory unsafety in the rust sense + let ptr = unsafe { + libc::mmap( + null_mut(), + size, + libc::PROT_READ | libc::PROT_WRITE, + libc::MAP_NORESERVE | mmap_flags, + fd, + fd_off, + ) + }; + + if ptr == libc::MAP_FAILED { + return Err(MemoryError::Mmap(std::io::Error::last_os_error())); } + // SAFETY: we check above that mmap succeeded, and the size we passed to builder is the + 
// same as the size of the mmap area. + let builder = unsafe { builder.with_raw_mmap_pointer(ptr.cast()) }; + offset = match offset.checked_add(size as u64) { None => return Err(MemoryError::OffsetTooLarge), Some(new_off) if new_off >= i64::MAX as u64 => { diff --git a/tests/integration_tests/functional/test_api.py b/tests/integration_tests/functional/test_api.py index d1c9d4c9581..1e5ec6fc473 100644 --- a/tests/integration_tests/functional/test_api.py +++ b/tests/integration_tests/functional/test_api.py @@ -375,9 +375,7 @@ def test_api_machine_config(uvm_plain): bad_size = (1 << 64) - 1 test_microvm.api.machine_config.patch(mem_size_mib=bad_size) - fail_msg = re.escape( - "Invalid Memory Configuration: Cannot create mmap region: Out of memory (os error 12)" - ) + fail_msg = re.escape("Out of memory (os error 12)") with pytest.raises(RuntimeError, match=fail_msg): test_microvm.start() From 5fba4eaa021bd9e6206a29cac472c65f552d1556 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Fri, 25 Apr 2025 14:04:15 +0100 Subject: [PATCH 27/40] add concept of "secret free" VMs Have the `struct Vm` constructor take an argument to indicate whether the VM should be secret free. Use this to determine the correct vm type for guest_memfd support, and store it inside the VM so that we don't have to pass bools to various functions. 
Signed-off-by: Patrick Roy
---
 src/vmm/src/arch/aarch64/vm.rs     | 12 +++++-
 src/vmm/src/arch/mod.rs            |  4 +-
 src/vmm/src/arch/x86_64/vm.rs      | 10 +++--
 src/vmm/src/builder.rs             | 64 +++++++-----------------------
 src/vmm/src/device_manager/mmio.rs |  6 +--
 src/vmm/src/vstate/vm.rs           | 43 +++++++++++++++++---
 6 files changed, 74 insertions(+), 65 deletions(-)

diff --git a/src/vmm/src/arch/aarch64/vm.rs b/src/vmm/src/arch/aarch64/vm.rs
index e54723f5b6d..44897e42e41 100644
--- a/src/vmm/src/arch/aarch64/vm.rs
+++ b/src/vmm/src/arch/aarch64/vm.rs
@@ -8,6 +8,14 @@ use crate::arch::aarch64::gic::GicState;
 use crate::vstate::memory::{GuestMemoryExtension, GuestMemoryState};
 use crate::vstate::vm::{VmCommon, VmError};
 
+/// The VM type for this architecture that allows us to use guest_memfd. On ARM, all VMs
+/// support guest_memfd and no special type is needed (in fact, no concept of vm types really
+/// exists, and the corresponding field of the CREATE_VM ioctl determines IPA size instead,
+/// e.g. the size of the guest physical address space). This value cannot be hardcoded, hence
+/// `None` to let the `Vm` constructor know that just normal [`Kvm::create_vm`] should be called,
+/// which internally determines the preferred IPA size.
+pub const VM_TYPE_FOR_SECRET_FREEDOM: Option<u64> = None;
+
 /// Structure representing the current architecture's understand of what a "virtual machine" is.
 #[derive(Debug)]
 pub struct ArchVm {
@@ -30,8 +38,8 @@ pub enum ArchVmError {
 
 impl ArchVm {
     /// Create a new `Vm` struct.
- pub fn new(kvm: &Kvm) -> Result { - let common = Self::create_common(kvm)?; + pub fn new(kvm: &Kvm, secret_free: bool) -> Result { + let common = Self::create_common(kvm, secret_free)?; Ok(ArchVm { common, irqchip_handle: None, diff --git a/src/vmm/src/arch/mod.rs b/src/vmm/src/arch/mod.rs index 61d65fea1a5..05f930682ab 100644 --- a/src/vmm/src/arch/mod.rs +++ b/src/vmm/src/arch/mod.rs @@ -17,7 +17,7 @@ pub use aarch64::kvm::{Kvm, KvmArchError, OptionalCapabilities}; #[cfg(target_arch = "aarch64")] pub use aarch64::vcpu::*; #[cfg(target_arch = "aarch64")] -pub use aarch64::vm::{ArchVm, ArchVmError, VmState}; +pub use aarch64::vm::{ArchVm, ArchVmError, VM_TYPE_FOR_SECRET_FREEDOM, VmState}; #[cfg(target_arch = "aarch64")] pub use aarch64::{ ConfigurationError, MMIO_MEM_SIZE, MMIO_MEM_START, arch_memory_regions, @@ -35,7 +35,7 @@ pub use x86_64::kvm::{Kvm, KvmArchError}; #[cfg(target_arch = "x86_64")] pub use x86_64::vcpu::*; #[cfg(target_arch = "x86_64")] -pub use x86_64::vm::{ArchVm, ArchVmError, VmState}; +pub use x86_64::vm::{ArchVm, ArchVmError, VM_TYPE_FOR_SECRET_FREEDOM, VmState}; #[cfg(target_arch = "x86_64")] pub use crate::arch::x86_64::{ diff --git a/src/vmm/src/arch/x86_64/vm.rs b/src/vmm/src/arch/x86_64/vm.rs index e84b4338e35..09a1c03e6dc 100644 --- a/src/vmm/src/arch/x86_64/vm.rs +++ b/src/vmm/src/arch/x86_64/vm.rs @@ -5,7 +5,8 @@ use std::fmt; use kvm_bindings::{ KVM_CLOCK_TSC_STABLE, KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, - KVM_PIT_SPEAKER_DUMMY, MsrList, kvm_clock_data, kvm_irqchip, kvm_pit_config, kvm_pit_state2, + KVM_PIT_SPEAKER_DUMMY, KVM_X86_SW_PROTECTED_VM, MsrList, kvm_clock_data, kvm_irqchip, + kvm_pit_config, kvm_pit_state2, }; use kvm_ioctls::Cap; use serde::{Deserialize, Serialize}; @@ -46,6 +47,9 @@ pub enum ArchVmError { SetTssAddress(kvm_ioctls::Error), } +/// The VM type for this architecture that allows us to use guest_memfd. 
+pub const VM_TYPE_FOR_SECRET_FREEDOM: Option = Some(KVM_X86_SW_PROTECTED_VM as u64); + /// Structure representing the current architecture's understand of what a "virtual machine" is. #[derive(Debug)] pub struct ArchVm { @@ -60,8 +64,8 @@ pub struct ArchVm { impl ArchVm { /// Create a new `Vm` struct. - pub fn new(kvm: &crate::vstate::kvm::Kvm) -> Result { - let common = Self::create_common(kvm)?; + pub fn new(kvm: &crate::vstate::kvm::Kvm, secret_free: bool) -> Result { + let common = Self::create_common(kvm, secret_free)?; let msrs_to_save = kvm.msrs_to_save().map_err(ArchVmError::GetMsrsToSave)?; diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index a31a66f4d72..43db6d8c35d 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -139,11 +139,12 @@ fn create_vmm_and_vcpus( event_manager: &mut EventManager, vcpu_count: u8, kvm_capabilities: Vec, + secret_free: bool, ) -> Result<(Vmm, Vec), VmmError> { let kvm = Kvm::new(kvm_capabilities)?; // Set up Kvm Vm and register memory regions. // Build custom CPU config if a custom template is provided. - let mut vm = Vm::new(&kvm)?; + let mut vm = Vm::new(&kvm, secret_free)?; let resource_allocator = ResourceAllocator::new()?; @@ -234,6 +235,7 @@ pub fn build_microvm_for_boot( event_manager, vm_resources.machine_config.vcpu_count, cpu_template.kvm_capabilities.clone(), + vm_resources.machine_config.secret_free, )?; vmm.vm @@ -243,7 +245,7 @@ pub fn build_microvm_for_boot( let entry_point = load_kernel( MaybeBounce::new( boot_config.kernel_file.try_clone().unwrap(), - vm_resources.machine_config.secret_free, + vmm.vm.secret_free(), ), vmm.vm.guest_memory(), )?; @@ -256,7 +258,7 @@ pub fn build_microvm_for_boot( Some(InitrdConfig::from_reader( vmm.vm.guest_memory(), - MaybeBounce::new(initrd_file.as_fd(), vm_resources.machine_config.secret_free), + MaybeBounce::new(initrd_file.as_fd(), vmm.vm.secret_free()), u64_to_usize(size), )?) 
} @@ -291,24 +293,16 @@ pub fn build_microvm_for_boot( &mut boot_cmdline, vm_resources.block.devices.iter(), event_manager, - vm_resources.machine_config.secret_free, )?; attach_net_devices( &mut vmm, &mut boot_cmdline, vm_resources.net_builder.iter(), event_manager, - vm_resources.machine_config.secret_free, )?; if let Some(unix_vsock) = vm_resources.vsock.get() { - attach_unixsock_vsock_device( - &mut vmm, - &mut boot_cmdline, - unix_vsock, - event_manager, - vm_resources.machine_config.secret_free, - )?; + attach_unixsock_vsock_device(&mut vmm, &mut boot_cmdline, unix_vsock, event_manager)?; } if let Some(entropy) = vm_resources.entropy.get() { @@ -457,6 +451,7 @@ pub fn build_microvm_from_snapshot( event_manager, vm_resources.machine_config.vcpu_count, microvm_state.kvm_state.kvm_cap_modifiers.clone(), + false, ) .map_err(StartMicrovmError::Internal)?; @@ -625,11 +620,10 @@ fn attach_virtio_device( device: Arc>, cmdline: &mut LoaderKernelCmdline, is_vhost_user: bool, - secret_free: bool, ) -> Result<(), MmioError> { event_manager.add_subscriber(device.clone()); - if secret_free { + if vmm.vm.secret_free() { device.lock().unwrap().force_userspace_bounce_buffers(); } @@ -688,7 +682,6 @@ fn attach_entropy_device( entropy_device.clone(), cmdline, false, - false, ) } @@ -697,7 +690,6 @@ fn attach_block_devices<'a, I: Iterator>> + Debug>( cmdline: &mut LoaderKernelCmdline, blocks: I, event_manager: &mut EventManager, - secret_free: bool, ) -> Result<(), StartMicrovmError> { for block in blocks { let (id, is_vhost_user) = { @@ -722,7 +714,6 @@ fn attach_block_devices<'a, I: Iterator>> + Debug>( block.clone(), cmdline, is_vhost_user, - secret_free, )?; } Ok(()) @@ -733,20 +724,11 @@ fn attach_net_devices<'a, I: Iterator>> + Debug>( cmdline: &mut LoaderKernelCmdline, net_devices: I, event_manager: &mut EventManager, - secret_free: bool, ) -> Result<(), StartMicrovmError> { for net_device in net_devices { let id = net_device.lock().expect("Poisoned lock").id().clone(); 
// The device mutex mustn't be locked here otherwise it will deadlock. - attach_virtio_device( - event_manager, - vmm, - id, - net_device.clone(), - cmdline, - false, - secret_free, - )?; + attach_virtio_device(event_manager, vmm, id, net_device.clone(), cmdline, false)?; } Ok(()) } @@ -756,19 +738,10 @@ fn attach_unixsock_vsock_device( cmdline: &mut LoaderKernelCmdline, unix_vsock: &Arc>>, event_manager: &mut EventManager, - secret_free: bool, ) -> Result<(), MmioError> { let id = String::from(unix_vsock.lock().expect("Poisoned lock").id()); // The device mutex mustn't be locked here otherwise it will deadlock. - attach_virtio_device( - event_manager, - vmm, - id, - unix_vsock.clone(), - cmdline, - false, - secret_free, - ) + attach_virtio_device(event_manager, vmm, id, unix_vsock.clone(), cmdline, false) } fn attach_balloon_device( @@ -779,15 +752,7 @@ fn attach_balloon_device( ) -> Result<(), MmioError> { let id = String::from(balloon.lock().expect("Poisoned lock").id()); // The device mutex mustn't be locked here otherwise it will deadlock. - attach_virtio_device( - event_manager, - vmm, - id, - balloon.clone(), - cmdline, - false, - false, - ) + attach_virtio_device(event_manager, vmm, id, balloon.clone(), cmdline, false) } // Adds `O_NONBLOCK` to the stdout flags. 
@@ -963,7 +928,6 @@ pub(crate) mod tests { cmdline, block_dev_configs.devices.iter(), event_manager, - false, ) .unwrap(); block_files @@ -978,7 +942,7 @@ pub(crate) mod tests { let mut net_builder = NetBuilder::new(); net_builder.build(net_config).unwrap(); - let res = attach_net_devices(vmm, cmdline, net_builder.iter(), event_manager, false); + let res = attach_net_devices(vmm, cmdline, net_builder.iter(), event_manager); res.unwrap(); } @@ -999,7 +963,7 @@ pub(crate) mod tests { Arc::new(Mutex::new(mmds)), ); - attach_net_devices(vmm, cmdline, net_builder.iter(), event_manager, false).unwrap(); + attach_net_devices(vmm, cmdline, net_builder.iter(), event_manager).unwrap(); } pub(crate) fn insert_vsock_device( @@ -1012,7 +976,7 @@ pub(crate) mod tests { let vsock = VsockBuilder::create_unixsock_vsock(vsock_config).unwrap(); let vsock = Arc::new(Mutex::new(vsock)); - attach_unixsock_vsock_device(vmm, cmdline, &vsock, event_manager, false).unwrap(); + attach_unixsock_vsock_device(vmm, cmdline, &vsock, event_manager).unwrap(); assert!( vmm.mmio_device_manager diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 7425fd56945..f1b8cf01697 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -659,7 +659,7 @@ mod tests { let start_addr2 = GuestAddress(0x1000); let guest_mem = multi_region_mem_raw(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); - let mut vm = Vm::new(&kvm).unwrap(); + let mut vm = Vm::new(&kvm, false).unwrap(); vm.register_memory_regions(guest_mem).unwrap(); let mut device_manager = MMIODeviceManager::new(); let mut resource_allocator = ResourceAllocator::new().unwrap(); @@ -690,7 +690,7 @@ mod tests { let start_addr2 = GuestAddress(0x1000); let guest_mem = multi_region_mem_raw(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); - let mut vm = Vm::new(&kvm).unwrap(); + 
let mut vm = Vm::new(&kvm, false).unwrap(); vm.register_memory_regions(guest_mem).unwrap(); let mut device_manager = MMIODeviceManager::new(); let mut resource_allocator = ResourceAllocator::new().unwrap(); @@ -746,7 +746,7 @@ mod tests { let start_addr2 = GuestAddress(0x1000); let guest_mem = multi_region_mem_raw(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); - let mut vm = Vm::new(&kvm).unwrap(); + let mut vm = Vm::new(&kvm, false).unwrap(); vm.register_memory_regions(guest_mem).unwrap(); #[cfg(target_arch = "x86_64")] diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 153d1839522..3210d8cb43a 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -19,8 +19,8 @@ use kvm_bindings::{ use kvm_ioctls::{Cap, VmFd}; use vmm_sys_util::eventfd::EventFd; -use crate::arch::host_page_size; pub use crate::arch::{ArchVm as Vm, ArchVmError, VmState}; +use crate::arch::{VM_TYPE_FOR_SECRET_FREEDOM, host_page_size}; use crate::logger::info; use crate::persist::CreateSnapshotError; use crate::utils::u64_to_usize; @@ -39,6 +39,7 @@ pub struct VmCommon { max_memslots: usize, /// The guest memory of this Vm. pub guest_memory: GuestMemoryMmap, + secret_free: bool, } /// Errors associated with the wrappers over KVM ioctls. @@ -69,7 +70,14 @@ pub enum VmError { /// Contains Vm functions that are usable across CPU architectures impl Vm { /// Create a KVM VM - pub fn create_common(kvm: &crate::vstate::kvm::Kvm) -> Result { + pub fn create_common( + kvm: &crate::vstate::kvm::Kvm, + secret_free: bool, + ) -> Result { + if secret_free && !kvm.fd.check_extension(Cap::GuestMemfd) { + return Err(VmError::GuestMemfdNotSupported); + } + // It is known that KVM_CREATE_VM occasionally fails with EINTR on heavily loaded machines // with many VMs. 
// @@ -93,7 +101,14 @@ impl Vm { const MAX_ATTEMPTS: u32 = 5; let mut attempt = 1; let fd = loop { - match kvm.fd.create_vm() { + let create_result = if secret_free && VM_TYPE_FOR_SECRET_FREEDOM.is_some() { + kvm.fd + .create_vm_with_type(VM_TYPE_FOR_SECRET_FREEDOM.unwrap()) + } else { + kvm.fd.create_vm() + }; + + match create_result { Ok(fd) => break fd, Err(e) if e.errno() == libc::EINTR && attempt < MAX_ATTEMPTS => { info!("Attempt #{attempt} of KVM_CREATE_VM returned EINTR"); @@ -110,6 +125,7 @@ impl Vm { fd, max_memslots: kvm.max_nr_memslots(), guest_memory: GuestMemoryMmap::default(), + secret_free, }) } @@ -226,6 +242,11 @@ impl Vm { Ok(()) } + /// Whether this VM is secret free + pub fn secret_free(&self) -> bool { + self.common.secret_free + } + /// Gets a reference to the kvm file descriptor owned by this VM. pub fn fd(&self) -> &VmFd { &self.common.fd @@ -343,7 +364,7 @@ pub(crate) mod tests { // Auxiliary function being used throughout the tests. pub(crate) fn setup_vm() -> (Kvm, Vm) { let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); - let vm = Vm::new(&kvm).expect("Cannot create new vm"); + let vm = Vm::new(&kvm, false).expect("Cannot create new vm"); (kvm, vm) } @@ -359,7 +380,19 @@ pub(crate) mod tests { fn test_new() { // Testing with a valid /dev/kvm descriptor. 
let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); - Vm::new(&kvm).unwrap(); + Vm::new(&kvm, false).unwrap(); + } + + #[test] + fn test_new_secret_free() { + let kvm = Kvm::new(vec![]).unwrap(); + + if !kvm.fd.check_extension(Cap::GuestMemfd) { + return; + } + + Vm::new(&kvm, true) + .expect("should be able to create secret free VMs if guest_memfd is supported"); } #[test] From c1b0a9ca198cdb7632915dd984d8dedc52825cc9 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Wed, 2 Apr 2025 07:00:51 +0100 Subject: [PATCH 28/40] Use guest_memfd to back memory if secret freedom is enabled If the `secret_free` field of the memory_config is set to true in the /machine-config endpoint, back all memory regions using guest_memfd. For our setup, this means both setting the guest_memfd[_offset] fields in kvm_user_memory_region2, as well as mmaping the guest memory and reflecting this VMA back into the memslot's userspace_addr (which is how kvm internal accesses to guest memory will work for these guest_memfd regions, such as mmio emulation on x86). 
Signed-off-by: Patrick Roy --- src/vmm/benches/memory_access.rs | 2 +- src/vmm/src/builder.rs | 28 +++++++++++------ src/vmm/src/persist.rs | 2 +- src/vmm/src/resources.rs | 53 +++++++++++++++++++++++--------- src/vmm/src/vstate/memory.rs | 21 ++++++------- src/vmm/src/vstate/vm.rs | 38 +++++++++++++++++------ 6 files changed, 96 insertions(+), 48 deletions(-) diff --git a/src/vmm/benches/memory_access.rs b/src/vmm/benches/memory_access.rs index fe4f138db2d..e3b6b656302 100644 --- a/src/vmm/benches/memory_access.rs +++ b/src/vmm/benches/memory_access.rs @@ -9,7 +9,7 @@ fn bench_single_page_fault(c: &mut Criterion, configuration: VmResources) { c.bench_function("page_fault", |b| { b.iter_batched( || { - let memory = configuration.allocate_guest_memory().unwrap(); + let memory = configuration.allocate_guest_memory(None).unwrap(); // Get a pointer to the first memory region (cannot do `.get_slice(GuestAddress(0), // 1)`, because on ARM64 guest memory does not start at physical // address 0). diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 43db6d8c35d..cb60b16b166 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -62,7 +62,7 @@ use crate::vmm_config::machine_config::MachineConfigError; use crate::vstate::kvm::Kvm; use crate::vstate::memory::{GuestRegionMmap, MaybeBounce}; use crate::vstate::vcpu::{Vcpu, VcpuError}; -use crate::vstate::vm::Vm; +use crate::vstate::vm::{KVM_GMEM_NO_DIRECT_MAP, Vm}; use crate::{EventManager, Vmm, VmmError, device_manager}; /// Errors associated with starting the instance. @@ -217,10 +217,6 @@ pub fn build_microvm_for_boot( .as_ref() .ok_or(MissingKernelConfig)?; - let guest_memory = vm_resources - .allocate_guest_memory() - .map_err(StartMicrovmError::GuestMemory)?; - // Clone the command-line so that a failed boot doesn't pollute the original. 
#[allow(unused_mut)] let mut boot_cmdline = boot_config.cmdline.clone(); @@ -230,6 +226,8 @@ pub fn build_microvm_for_boot( .cpu_template .get_cpu_template()?; + let secret_free = vm_resources.machine_config.secret_free; + let (mut vmm, mut vcpus) = create_vmm_and_vcpus( instance_info, event_manager, @@ -238,15 +236,25 @@ pub fn build_microvm_for_boot( vm_resources.machine_config.secret_free, )?; + let guest_memfd = match secret_free { + true => Some( + vmm.vm + .create_guest_memfd(vm_resources.memory_size(), KVM_GMEM_NO_DIRECT_MAP) + .map_err(VmmError::Vm)?, + ), + false => None, + }; + + let guest_memory = vm_resources + .allocate_guest_memory(guest_memfd) + .map_err(StartMicrovmError::GuestMemory)?; + vmm.vm .register_memory_regions(guest_memory) .map_err(VmmError::Vm)?; let entry_point = load_kernel( - MaybeBounce::new( - boot_config.kernel_file.try_clone().unwrap(), - vmm.vm.secret_free(), - ), + MaybeBounce::new(boot_config.kernel_file.try_clone().unwrap(), secret_free), vmm.vm.guest_memory(), )?; let initrd = match &boot_config.initrd_file { @@ -258,7 +266,7 @@ pub fn build_microvm_for_boot( Some(InitrdConfig::from_reader( vmm.vm.guest_memory(), - MaybeBounce::new(initrd_file.as_fd(), vmm.vm.secret_free()), + MaybeBounce::new(initrd_file.as_fd(), secret_free), u64_to_usize(size), )?) 
} diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 0e66b184f44..5ab446e572d 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -457,7 +457,7 @@ fn guest_memory_from_file( track_dirty_pages: bool, ) -> Result, GuestMemoryFromFileError> { let mem_file = File::open(mem_file_path)?; - let guest_mem = memory::snapshot_file(mem_file, mem_state.regions(), track_dirty_pages)?; + let guest_mem = memory::file_private(mem_file, mem_state.regions(), track_dirty_pages)?; Ok(guest_mem) } diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index ac4adc22a56..2d5062faf61 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use std::convert::From; +use std::fs::File; use std::path::PathBuf; use std::sync::{Arc, Mutex, MutexGuard}; @@ -31,7 +32,7 @@ use crate::vmm_config::mmds::{MmdsConfig, MmdsConfigError}; use crate::vmm_config::net::*; use crate::vmm_config::vsock::*; use crate::vstate::memory; -use crate::vstate::memory::{GuestRegionMmap, MemoryError}; +use crate::vstate::memory::{GuestRegionMmap, MemoryError, create_memfd}; /// Errors encountered when configuring microVM resources. #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -502,12 +503,19 @@ impl VmResources { }) } + /// Gets the size of the guest memory, in bytes + pub fn memory_size(&self) -> usize { + mib_to_bytes(self.machine_config.mem_size_mib) + } + /// Allocates guest memory in a configuration most appropriate for these [`VmResources`]. /// /// If vhost-user-blk devices are in use, allocates memfd-backed shared memory, otherwise /// prefers anonymous memory for performance reasons. - pub fn allocate_guest_memory(&self) -> Result, MemoryError> { - // Page faults are more expensive for shared memory mapping, including memfd. 
+ pub fn allocate_guest_memory( + &self, + guest_memfd: Option, + ) -> Result, MemoryError> { // For this reason, we only back guest memory with a memfd // if a vhost-user-blk device is configured in the VM, otherwise we fall back to // an anonymous private memory. @@ -516,20 +524,35 @@ impl VmResources { // because that would require running a backend process. If in the future we converge to // a single way of backing guest memory for vhost-user and non-vhost-user cases, // that would not be worth the effort. - let regions = - crate::arch::arch_memory_regions(0, mib_to_bytes(self.machine_config.mem_size_mib)); - if self.vhost_user_devices_used() { - memory::memfd_backed( - regions.as_ref(), - self.machine_config.track_dirty_pages, - self.machine_config.huge_pages, - ) - } else { - memory::anonymous( - regions.into_iter(), + let regions = crate::arch::arch_memory_regions(0, self.memory_size()).into_iter(); + match guest_memfd { + Some(file) => memory::file_shared( + file, + regions, self.machine_config.track_dirty_pages, self.machine_config.huge_pages, - ) + ), + None => { + if self.vhost_user_devices_used() { + let memfd = create_memfd( + self.memory_size() as u64, + self.machine_config.huge_pages.into(), + )? + .into_file(); + memory::file_shared( + memfd, + regions, + self.machine_config.track_dirty_pages, + self.machine_config.huge_pages, + ) + } else { + memory::anonymous( + regions.into_iter(), + self.machine_config.track_dirty_pages, + self.machine_config.huge_pages, + ) + } + } } } } diff --git a/src/vmm/src/vstate/memory.rs b/src/vmm/src/vstate/memory.rs index dd99e6b9e1b..005b4f7d38c 100644 --- a/src/vmm/src/vstate/memory.rs +++ b/src/vmm/src/vstate/memory.rs @@ -259,18 +259,16 @@ pub fn create( } /// Creates a GuestMemoryMmap with `size` in MiB backed by a memfd. 
-pub fn memfd_backed( - regions: &[(GuestAddress, usize)], +pub fn file_shared( + file: File, + regions: impl Iterator, track_dirty_pages: bool, huge_pages: HugePageConfig, ) -> Result, MemoryError> { - let size = regions.iter().map(|&(_, size)| size as u64).sum(); - let memfd_file = create_memfd(size, huge_pages.into())?.into_file(); - create( - regions.iter().copied(), + regions, libc::MAP_SHARED | huge_pages.mmap_flags(), - Some(memfd_file), + Some(file), track_dirty_pages, ) } @@ -291,7 +289,7 @@ pub fn anonymous( /// Creates a GuestMemoryMmap given a `file` containing the data /// and a `state` containing mapping information. -pub fn snapshot_file( +pub fn file_private( file: File, regions: impl Iterator, track_dirty_pages: bool, @@ -477,7 +475,8 @@ impl GuestMemoryExtension for GuestMemoryMmap { } } -fn create_memfd( +/// Creates a memfd of the given size and huge pages configuration +pub fn create_memfd( mem_size: u64, hugetlb_size: Option, ) -> Result { @@ -731,7 +730,7 @@ mod tests { guest_memory.dump(&mut memory_file).unwrap(); let restored_guest_memory = GuestMemoryMmap::from_regions( - snapshot_file(memory_file, memory_state.regions(), false).unwrap(), + file_private(memory_file, memory_state.regions(), false).unwrap(), ) .unwrap(); @@ -793,7 +792,7 @@ mod tests { // We can restore from this because this is the first dirty dump. 
let restored_guest_memory = GuestMemoryMmap::from_regions( - snapshot_file(file, memory_state.regions(), false).unwrap(), + file_private(file, memory_state.regions(), false).unwrap(), ) .unwrap(); diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 3210d8cb43a..473c9d30cb6 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -8,13 +8,13 @@ use std::collections::HashMap; use std::fs::{File, OpenOptions}; use std::io::Write; -use std::os::fd::FromRawFd; +use std::os::fd::{AsRawFd, FromRawFd}; use std::path::Path; use std::sync::Arc; use kvm_bindings::{ - KVM_MEM_LOG_DIRTY_PAGES, kvm_create_guest_memfd, kvm_userspace_memory_region, - kvm_userspace_memory_region2, + KVM_MEM_GUEST_MEMFD, KVM_MEM_LOG_DIRTY_PAGES, kvm_create_guest_memfd, + kvm_userspace_memory_region, kvm_userspace_memory_region2, }; use kvm_ioctls::{Cap, VmFd}; use vmm_sys_util::eventfd::EventFd; @@ -31,6 +31,8 @@ use crate::vstate::memory::{ use crate::vstate::vcpu::VcpuError; use crate::{DirtyBitmap, Vcpu, mem_size_mib}; +pub(crate) const KVM_GMEM_NO_DIRECT_MAP: u64 = 1; + /// Architecture independent parts of a VM. 
#[derive(Debug)] pub struct VmCommon { @@ -157,10 +159,6 @@ impl Vm { "guest_memfd size must be page aligned" ); - if !self.fd().check_extension(Cap::GuestMemfd) { - return Err(VmError::GuestMemfdNotSupported); - } - let kvm_gmem = kvm_create_guest_memfd { size: size as u64, flags, @@ -198,10 +196,22 @@ impl Vm { return Err(VmError::NotEnoughMemorySlots); } - let flags = if region.bitmap().is_some() { - KVM_MEM_LOG_DIRTY_PAGES + let mut flags = 0; + if region.bitmap().is_some() { + flags |= KVM_MEM_LOG_DIRTY_PAGES; + } + + #[allow(clippy::cast_sign_loss)] + let (guest_memfd, guest_memfd_offset) = if self.secret_free() { + flags |= KVM_MEM_GUEST_MEMFD; + + let fo = region + .file_offset() + .expect("secret hidden VMs must mmap guest_memfd for memslots"); + + (fo.file().as_raw_fd() as u32, fo.start()) } else { - 0 + (0, 0) }; let memory_region = kvm_userspace_memory_region2 { @@ -210,6 +220,8 @@ impl Vm { memory_size: region.len(), userspace_addr: region.as_ptr() as u64, flags, + guest_memfd, + guest_memfd_offset, ..Default::default() }; @@ -223,6 +235,12 @@ impl Vm { .map_err(VmError::SetUserMemoryRegion)?; } } else { + // Something is seriously wrong if we manage to set these fields on a host that doesn't + // even allow creation of guest_memfds! + assert_eq!(memory_region.guest_memfd, 0); + assert_eq!(memory_region.guest_memfd_offset, 0); + assert_eq!(memory_region.flags & KVM_MEM_GUEST_MEMFD, 0); + // SAFETY: We are passing a valid memory region and operate on a valid KVM FD. unsafe { self.fd() From 1349cf5fba2a97523096aebd4f96427f9f8bfe04 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Tue, 8 Apr 2025 15:07:01 +0100 Subject: [PATCH 29/40] allow creation of snapshots of secret hidden VMs To take snapshots of secret hidden VMs, we need to bounce guest memory through a userspace buffer. Reuse the `Bounce` wrapper type that is already in use for loading the guest kernel / initrd. 
Signed-off-by: Patrick Roy --- src/vmm/src/vstate/vm.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 473c9d30cb6..4d5e4226799 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -8,7 +8,7 @@ use std::collections::HashMap; use std::fs::{File, OpenOptions}; use std::io::Write; -use std::os::fd::{AsRawFd, FromRawFd}; +use std::os::fd::{AsFd, AsRawFd, FromRawFd}; use std::path::Path; use std::sync::Arc; @@ -26,7 +26,8 @@ use crate::persist::CreateSnapshotError; use crate::utils::u64_to_usize; use crate::vmm_config::snapshot::SnapshotType; use crate::vstate::memory::{ - Address, GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion, GuestRegionMmap, + Address, GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion, + GuestRegionMmap, MaybeBounce, }; use crate::vstate::vcpu::VcpuError; use crate::{DirtyBitmap, Vcpu, mem_size_mib}; @@ -355,7 +356,8 @@ impl Vm { self.guest_memory().dump_dirty(&mut file, &dirty_bitmap)?; } SnapshotType::Full => { - self.guest_memory().dump(&mut file)?; + self.guest_memory() + .dump(&mut MaybeBounce::new(file.as_fd(), self.secret_free()))?; self.reset_dirty_bitmap(); self.guest_memory().reset_dirty(); } From 7a8f8d9b7c2d4c8f14cc07056b64a60e31d1311f Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Tue, 8 Apr 2025 15:20:19 +0100 Subject: [PATCH 30/40] tmp: set memory attributes to private on x86 The current version of the mmap-support patches require that on x86, memory attributes have to be set to private even if the guest_memfd VMA is short-circuited back into the memslot (on ARM, memory attributes are not even supported in this scenario). 
Signed-off-by: Patrick Roy --- src/vmm/src/builder.rs | 3 +++ src/vmm/src/vstate/vm.rs | 29 +++++++++++++++++++++++++++-- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index cb60b16b166..7c4d8516e7b 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -253,6 +253,9 @@ pub fn build_microvm_for_boot( .register_memory_regions(guest_memory) .map_err(VmmError::Vm)?; + #[cfg(target_arch = "x86_64")] + vmm.vm.set_memory_private().map_err(VmmError::Vm)?; + let entry_point = load_kernel( MaybeBounce::new(boot_config.kernel_file.try_clone().unwrap(), secret_free), vmm.vm.guest_memory(), diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 4d5e4226799..3119bc4cbb6 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -13,8 +13,9 @@ use std::path::Path; use std::sync::Arc; use kvm_bindings::{ - KVM_MEM_GUEST_MEMFD, KVM_MEM_LOG_DIRTY_PAGES, kvm_create_guest_memfd, - kvm_userspace_memory_region, kvm_userspace_memory_region2, + KVM_MEM_GUEST_MEMFD, KVM_MEM_LOG_DIRTY_PAGES, KVM_MEMORY_ATTRIBUTE_PRIVATE, + kvm_create_guest_memfd, kvm_memory_attributes, kvm_userspace_memory_region, + kvm_userspace_memory_region2, }; use kvm_ioctls::{Cap, VmFd}; use vmm_sys_util::eventfd::EventFd; @@ -68,6 +69,8 @@ pub enum VmError { GuestMemfd(kvm_ioctls::Error), /// guest_memfd is not supported on this host kernel. 
GuestMemfdNotSupported, + /// Failed to set memory attributes to private: {0} + SetMemoryAttributes(kvm_ioctls::Error), } /// Contains Vm functions that are usable across CPU architectures @@ -276,6 +279,28 @@ impl Vm { &self.common.guest_memory } + /// Sets the memory attributes on all guest_memfd-backed regions to private + pub fn set_memory_private(&self) -> Result<(), VmError> { + if !self.secret_free() { + return Ok(()); + } + + for region in self.guest_memory().iter() { + let attr = kvm_memory_attributes { + address: region.start_addr().0, + size: region.len(), + attributes: KVM_MEMORY_ATTRIBUTE_PRIVATE as u64, + ..Default::default() + }; + + self.fd() + .set_memory_attributes(attr) + .map_err(VmError::SetMemoryAttributes)? + } + + Ok(()) + } + /// Resets the KVM dirty bitmap for each of the guest's memory regions. pub fn reset_dirty_bitmap(&self) { self.guest_memory() From 4a618fd152aecedb3d6222d2e7369bf83255c6ae Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Wed, 16 Apr 2025 17:35:42 +0100 Subject: [PATCH 31/40] x86: force no-kvmclock on cmdline if secret_free=True kvm-clock is incompatible with direct map removal for now. This is because kvm-clock tries to access guest memory through the direct map. Additionally, it does not handle failures during guest-attempted activations of kvm-clock gracefully (e.g. it cannot/does not communicate these back to the guest). This means a guest will unconditionally assume that if it wrote to the kvm-clock MSR to activate kvm-clock, it will work. But if KVM internally fails to activate kvm-clock, KVM will never write the information the guest expects into guest memory, resulting in the guest reading garbage data (generally, zeros), resulting in division by zero panics in the guest. Hence, explicitly tells guests that they shouldn't even try to enable kvm-clock, if they value their lives. 
Signed-off-by: Patrick Roy --- src/vmm/src/builder.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 7c4d8516e7b..e4a9f0eecfc 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -228,6 +228,11 @@ pub fn build_microvm_for_boot( let secret_free = vm_resources.machine_config.secret_free; + #[cfg(target_arch = "x86_64")] + if secret_free { + boot_cmdline.insert_str("no-kvmclock")?; + } + let (mut vmm, mut vcpus) = create_vmm_and_vcpus( instance_info, event_manager, From 8c6ccd0284926977b2860865059f828ec0de5429 Mon Sep 17 00:00:00 2001 From: Jack Thomson Date: Wed, 9 Apr 2025 16:24:28 +0000 Subject: [PATCH 32/40] fix: Stop the scan for vmlinux failing Previously this would fail on x86 as we set -e. By adding || true, the script will continue. The grubby step next will fail if it failed to find the image. Signed-off-by: Jack Thomson --- resources/hiding_ci/build_and_install_kernel.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/resources/hiding_ci/build_and_install_kernel.sh b/resources/hiding_ci/build_and_install_kernel.sh index 79d6129f6d0..967f644e35a 100755 --- a/resources/hiding_ci/build_and_install_kernel.sh +++ b/resources/hiding_ci/build_and_install_kernel.sh @@ -122,7 +122,9 @@ al2023_update_boot() { dracut --kver $KERNEL_VERSION -f -v # This varies from x86 and ARM so capture what was generated - VM_LINUX_LOCATION=$(ls /boot/vmlinu{x,z}-$KERNEL_VERSION 2>/dev/null | head -n1) + # We add the || true here due to the fact that we have pipefail enabled + # this causes a non-zero exit when ls can't find vmlinux or vmlinuz + VM_LINUX_LOCATION=$(ls /boot/vmlinu{x,z}-$KERNEL_VERSION 2>/dev/null | head -n1 || true) echo "Updating GRUB..." 
grubby --grub2 --add-kernel $VM_LINUX_LOCATION \ From 5df1386e1beb8fa5021c3f8e933fda07033480c6 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Tue, 22 Apr 2025 15:11:19 +0000 Subject: [PATCH 33/40] chore(hiding_ci): skip non-patch files when applying This is to allow to keep the licence and readme files in the patches directory. Signed-off-by: Nikita Kalyazin --- resources/hiding_ci/build_and_install_kernel.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/resources/hiding_ci/build_and_install_kernel.sh b/resources/hiding_ci/build_and_install_kernel.sh index 967f644e35a..68c0cca872b 100755 --- a/resources/hiding_ci/build_and_install_kernel.sh +++ b/resources/hiding_ci/build_and_install_kernel.sh @@ -86,10 +86,7 @@ apply_patch_or_series() { *.patch) apply_patch_file $1 ;; *.mbox) apply_series_mbox $1 ;; *.lore) apply_series_link $1 ;; - *) - echo "Uknown patch file: "$1 - exit 1 - ;; + *) echo "Skipping non-patch file" $1 ;; esac } From b968ca56b7a5fa8f6ffec850a2f0c8db062f6ef6 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Tue, 22 Apr 2025 14:32:49 +0000 Subject: [PATCH 34/40] doc(hiding_ci): add readme and GPL-2.0 text for Linux patches Explicitly mention that Linux patches are distributed under the GPL-2.0 licence. 
Signed-off-by: Nikita Kalyazin --- resources/hiding_ci/patches/GPL-2.0 | 359 ++++++++++++++++++++++++++ resources/hiding_ci/patches/README.md | 8 + 2 files changed, 367 insertions(+) create mode 100644 resources/hiding_ci/patches/GPL-2.0 create mode 100644 resources/hiding_ci/patches/README.md diff --git a/resources/hiding_ci/patches/GPL-2.0 b/resources/hiding_ci/patches/GPL-2.0 new file mode 100644 index 00000000000..ff0812fd89c --- /dev/null +++ b/resources/hiding_ci/patches/GPL-2.0 @@ -0,0 +1,359 @@ +Valid-License-Identifier: GPL-2.0 +Valid-License-Identifier: GPL-2.0-only +Valid-License-Identifier: GPL-2.0+ +Valid-License-Identifier: GPL-2.0-or-later +SPDX-URL: https://spdx.org/licenses/GPL-2.0.html +Usage-Guide: + To use this license in source code, put one of the following SPDX + tag/value pairs into a comment according to the placement + guidelines in the licensing rules documentation. + For 'GNU General Public License (GPL) version 2 only' use: + SPDX-License-Identifier: GPL-2.0 + or + SPDX-License-Identifier: GPL-2.0-only + For 'GNU General Public License (GPL) version 2 or any later version' use: + SPDX-License-Identifier: GPL-2.0+ + or + SPDX-License-Identifier: GPL-2.0-or-later +License-Text: + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. 
(Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. 
To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. 
You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. 
+ +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. 
However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. 
+You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. 
diff --git a/resources/hiding_ci/patches/README.md b/resources/hiding_ci/patches/README.md new file mode 100644 index 00000000000..7a119e42452 --- /dev/null +++ b/resources/hiding_ci/patches/README.md @@ -0,0 +1,8 @@ +# Linux kernel patches for direct map removal + +The Linux kernel patches in this directory are distributed under the `GPL-2.0` +licence (see the full licence text at [GPL-2.0](./GPL-2.0)). The patches are +required by Firecracker's "Secret Freedom" feature that removes the VM memory +from the host direct map (see +[lore](https://lore.kernel.org/kvm/20250221160728.1584559-1-roypat@amazon.co.uk/) +for more details). The patches are not yet merged upstream. From e45177909e595273aa6a9725da5d12c2b300156a Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Tue, 22 Apr 2025 14:35:49 +0000 Subject: [PATCH 35/40] chore(hiding_ci): add userfault Linux patches Include the following patch series rebased on top of the v7 of "KVM: Mapping guest_memfd backed memory at the host for software protected VMs" (https:///kvm/20250318161823.4005529-1-tabba@google.com/, replace "" with "lore.kernel.org" in this and the following links): - v4 "Direct Map Removal for guest_memfd" (https:///kvm/20250221160728.1584559-1-roypat@amazon.co.uk/), with fixups - v2 "KVM: Introduce KVM Userfault" (https:///kvm/20250109204929.1106563-1-jthoughton@google.com/) - v3 "KVM: guest_memfd: use write for population" (https:///kvm/20250303130838.28812-1-kalyazin@amazon.com/) - v3 "KVM: guest_memfd: support for uffd minor" (https:///kvm/20250404154352.23078-1-kalyazin@amazon.com/), with fixups After this change all patches are represented as plain text files, meaning no patches are required to be fetched via a lore link. 
Signed-off-by: Nikita Kalyazin --- ...reeing-of-typed-folios-on-final-foli.patch | 109 +++++ resources/hiding_ci/patches/0001.lore | 1 - ...Handle-final-folio_put-of-guest_memf.patch | 182 +++++++++ ...-Allow-host-to-map-guest_memfd-pages.patch | 193 +++++++++ ..._X86_SW_PROTECTED_VM-as-supporting-g.patch | 58 +++ ...or-user_mem_abort-calculation-of-for.patch | 62 +++ ...Handle-in-place-shared-memory-as-gue.patch | 40 ++ ...-guest_memfd-backed-guest-page-fault.patch | 174 ++++++++ ...selftests-guest_memfd-mmap-test-when.patch | 149 +++++++ ...-Enable-mapping-guest_memfd-in-arm64.patch | 51 +++ ... 0010-mm-introduce-AS_NO_DIRECT_MAP.patch} | 6 +- ...-Add-flag-to-remove-from-direct-map.patch} | 6 +- .../patches/0012-patrick-v4-fixup.patch | 51 +++ ...EM_USERFAULT-memslot-flag-and-bitmap.patch | 161 ++++++++ ...M-Add-KVM_MEMORY_EXIT_FLAG_USERFAULT.patch | 28 ++ ...etting-of-KVM_MEM_USERFAULT-on-guest.patch | 58 +++ ...mu-Add-support-for-KVM_MEM_USERFAULT.patch | 217 ++++++++++ ...M_CAP_USERFAULT-in-KVM_CHECK_EXTENSI.patch | 45 +++ ...64-Add-support-for-KVM_MEM_USERFAULT.patch | 87 ++++ ...ix-vm_mem_region_set_flags-docstring.patch | 28 ++ ...KVM-selftests-Fix-prefault_mem-logic.patch | 37 ++ ...ests-Add-va_start-end-into-uffd_desc.patch | 44 ++ ...form-set_memory_region_test-of-KVM_M.patch | 31 ++ ...d-KVM-Userfault-mode-to-demand_pagin.patch | 381 ++++++++++++++++++ ...d-KVM_MEM_USERFAULT-guest_memfd-togg.patch | 65 +++ ...n-Add-KVM_CAP_USERFAULT-and-KVM_MEM_.patch | 76 ++++ ...mfd-add-generic-population-via-write.patch | 141 +++++++ ...tests-update-guest_memfd-write-tests.patch | 126 ++++++ ...d-generic-continue-for-non-hugetlbfs.patch | 153 +++++++ ...-provide-can_userfault-vma-operation.patch | 95 +++++ ...ltfd-use-can_userfault-vma-operation.patch | 79 ++++ ...fd-add-support-for-userfaultfd-minor.patch | 44 ++ ...d-add-UFFD_FEATURE_MINOR_GUEST_MEMFD.patch | 61 +++ ...st-userfaultfd-minor-for-guest_memfd.patch | 146 +++++++ .../patches/0034-uffd-v3-fixup.patch | 
50 +++ 35 files changed, 3228 insertions(+), 7 deletions(-) create mode 100644 resources/hiding_ci/patches/0001-mm-Consolidate-freeing-of-typed-folios-on-final-foli.patch delete mode 100644 resources/hiding_ci/patches/0001.lore create mode 100644 resources/hiding_ci/patches/0002-KVM-guest_memfd-Handle-final-folio_put-of-guest_memf.patch create mode 100644 resources/hiding_ci/patches/0003-KVM-guest_memfd-Allow-host-to-map-guest_memfd-pages.patch create mode 100644 resources/hiding_ci/patches/0004-KVM-x86-Mark-KVM_X86_SW_PROTECTED_VM-as-supporting-g.patch create mode 100644 resources/hiding_ci/patches/0005-KVM-arm64-Refactor-user_mem_abort-calculation-of-for.patch create mode 100644 resources/hiding_ci/patches/0006-KVM-guest_memfd-Handle-in-place-shared-memory-as-gue.patch create mode 100644 resources/hiding_ci/patches/0007-KVM-arm64-Handle-guest_memfd-backed-guest-page-fault.patch create mode 100644 resources/hiding_ci/patches/0008-KVM-guest_memfd-selftests-guest_memfd-mmap-test-when.patch create mode 100644 resources/hiding_ci/patches/0009-KVM-arm64-Enable-mapping-guest_memfd-in-arm64.patch rename resources/hiding_ci/patches/{0002-mm-introduce-AS_NO_DIRECT_MAP.patch => 0010-mm-introduce-AS_NO_DIRECT_MAP.patch} (98%) rename resources/hiding_ci/patches/{0003-KVM-guest_memfd-Add-flag-to-remove-from-direct-map.patch => 0011-KVM-guest_memfd-Add-flag-to-remove-from-direct-map.patch} (97%) create mode 100644 resources/hiding_ci/patches/0012-patrick-v4-fixup.patch create mode 100644 resources/hiding_ci/patches/0013-KVM-Add-KVM_MEM_USERFAULT-memslot-flag-and-bitmap.patch create mode 100644 resources/hiding_ci/patches/0014-KVM-Add-KVM_MEMORY_EXIT_FLAG_USERFAULT.patch create mode 100644 resources/hiding_ci/patches/0015-KVM-Allow-late-setting-of-KVM_MEM_USERFAULT-on-guest.patch create mode 100644 resources/hiding_ci/patches/0016-KVM-x86-mmu-Add-support-for-KVM_MEM_USERFAULT.patch create mode 100644 
resources/hiding_ci/patches/0017-KVM-Advertise-KVM_CAP_USERFAULT-in-KVM_CHECK_EXTENSI.patch create mode 100644 resources/hiding_ci/patches/0018-KVM-arm64-Add-support-for-KVM_MEM_USERFAULT.patch create mode 100644 resources/hiding_ci/patches/0019-KVM-selftests-Fix-vm_mem_region_set_flags-docstring.patch create mode 100644 resources/hiding_ci/patches/0020-KVM-selftests-Fix-prefault_mem-logic.patch create mode 100644 resources/hiding_ci/patches/0021-KVM-selftests-Add-va_start-end-into-uffd_desc.patch create mode 100644 resources/hiding_ci/patches/0022-KVM-selftests-Inform-set_memory_region_test-of-KVM_M.patch create mode 100644 resources/hiding_ci/patches/0023-KVM-selftests-Add-KVM-Userfault-mode-to-demand_pagin.patch create mode 100644 resources/hiding_ci/patches/0024-KVM-selftests-Add-KVM_MEM_USERFAULT-guest_memfd-togg.patch create mode 100644 resources/hiding_ci/patches/0025-KVM-Documentation-Add-KVM_CAP_USERFAULT-and-KVM_MEM_.patch create mode 100644 resources/hiding_ci/patches/0026-KVM-guest_memfd-add-generic-population-via-write.patch create mode 100644 resources/hiding_ci/patches/0027-KVM-selftests-update-guest_memfd-write-tests.patch create mode 100644 resources/hiding_ci/patches/0028-mm-userfaultfd-generic-continue-for-non-hugetlbfs.patch create mode 100644 resources/hiding_ci/patches/0029-mm-provide-can_userfault-vma-operation.patch create mode 100644 resources/hiding_ci/patches/0030-mm-userfaultfd-use-can_userfault-vma-operation.patch create mode 100644 resources/hiding_ci/patches/0031-KVM-guest_memfd-add-support-for-userfaultfd-minor.patch create mode 100644 resources/hiding_ci/patches/0032-mm-userfaultfd-add-UFFD_FEATURE_MINOR_GUEST_MEMFD.patch create mode 100644 resources/hiding_ci/patches/0033-KVM-selftests-test-userfaultfd-minor-for-guest_memfd.patch create mode 100644 resources/hiding_ci/patches/0034-uffd-v3-fixup.patch diff --git a/resources/hiding_ci/patches/0001-mm-Consolidate-freeing-of-typed-folios-on-final-foli.patch 
b/resources/hiding_ci/patches/0001-mm-Consolidate-freeing-of-typed-folios-on-final-foli.patch new file mode 100644 index 00000000000..0986dfacfeb --- /dev/null +++ b/resources/hiding_ci/patches/0001-mm-Consolidate-freeing-of-typed-folios-on-final-foli.patch @@ -0,0 +1,109 @@ +From f9ca710b51263ce8317cc2fa02232e456fa1f39c Mon Sep 17 00:00:00 2001 +From: Fuad Tabba +Date: Tue, 18 Mar 2025 16:18:15 +0000 +Subject: [PATCH 01/34] mm: Consolidate freeing of typed folios on final + folio_put() + +Some folio types, such as hugetlb, handle freeing their own +folios. Moreover, guest_memfd will require being notified once a +folio's reference count reaches 0 to facilitate shared to private +folio conversion, without the folio actually being freed at that +point. + +As a first step towards that, this patch consolidates freeing +folios that have a type. The first user is hugetlb folios. Later +in this patch series, guest_memfd will become the second user of +this. + +Suggested-by: David Hildenbrand +Acked-by: Vlastimil Babka +Acked-by: David Hildenbrand +Signed-off-by: Fuad Tabba +--- + include/linux/page-flags.h | 15 +++++++++++++++ + mm/swap.c | 23 ++++++++++++++++++----- + 2 files changed, 33 insertions(+), 5 deletions(-) + +diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h +index 36d283552f80..6dc2494bd002 100644 +--- a/include/linux/page-flags.h ++++ b/include/linux/page-flags.h +@@ -953,6 +953,21 @@ static inline bool page_has_type(const struct page *page) + return page_mapcount_is_type(data_race(page->page_type)); + } + ++static inline int page_get_type(const struct page *page) ++{ ++ return page->page_type >> 24; ++} ++ ++static inline bool folio_has_type(const struct folio *folio) ++{ ++ return page_has_type(&folio->page); ++} ++ ++static inline int folio_get_type(const struct folio *folio) ++{ ++ return page_get_type(&folio->page); ++} ++ + #define FOLIO_TYPE_OPS(lname, fname) \ + static __always_inline bool folio_test_##fname(const struct folio 
*folio) \ + { \ +diff --git a/mm/swap.c b/mm/swap.c +index fc8281ef4241..47bc1bb919cc 100644 +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -94,6 +94,19 @@ static void page_cache_release(struct folio *folio) + unlock_page_lruvec_irqrestore(lruvec, flags); + } + ++static void free_typed_folio(struct folio *folio) ++{ ++ switch (folio_get_type(folio)) { ++#ifdef CONFIG_HUGETLBFS ++ case PGTY_hugetlb: ++ free_huge_folio(folio); ++ return; ++#endif ++ default: ++ WARN_ON_ONCE(1); ++ } ++} ++ + void __folio_put(struct folio *folio) + { + if (unlikely(folio_is_zone_device(folio))) { +@@ -101,8 +114,8 @@ void __folio_put(struct folio *folio) + return; + } + +- if (folio_test_hugetlb(folio)) { +- free_huge_folio(folio); ++ if (unlikely(folio_has_type(folio))) { ++ free_typed_folio(folio); + return; + } + +@@ -966,13 +979,13 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) + if (!folio_ref_sub_and_test(folio, nr_refs)) + continue; + +- /* hugetlb has its own memcg */ +- if (folio_test_hugetlb(folio)) { ++ if (unlikely(folio_has_type(folio))) { ++ /* typed folios have their own memcg, if any */ + if (lruvec) { + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = NULL; + } +- free_huge_folio(folio); ++ free_typed_folio(folio); + continue; + } + folio_unqueue_deferred_split(folio); + +base-commit: 4701f33a10702d5fc577c32434eb62adde0a1ae1 +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0001.lore b/resources/hiding_ci/patches/0001.lore deleted file mode 100644 index 7663841026d..00000000000 --- a/resources/hiding_ci/patches/0001.lore +++ /dev/null @@ -1 +0,0 @@ -https://lore.kernel.org/kvm/20250318161823.4005529-1-tabba@google.com diff --git a/resources/hiding_ci/patches/0002-KVM-guest_memfd-Handle-final-folio_put-of-guest_memf.patch b/resources/hiding_ci/patches/0002-KVM-guest_memfd-Handle-final-folio_put-of-guest_memf.patch new file mode 100644 index 00000000000..b9f4f83f442 --- /dev/null +++ 
b/resources/hiding_ci/patches/0002-KVM-guest_memfd-Handle-final-folio_put-of-guest_memf.patch @@ -0,0 +1,182 @@ +From 9a4d7cd855d14e1522f363e3e04ebb9fa0a90ff0 Mon Sep 17 00:00:00 2001 +From: Fuad Tabba +Date: Tue, 18 Mar 2025 16:18:16 +0000 +Subject: [PATCH 02/34] KVM: guest_memfd: Handle final folio_put() of + guest_memfd pages + +Before transitioning a guest_memfd folio to unshared, thereby +disallowing access by the host and allowing the hypervisor to +transition its view of the guest page as private, we need to be +sure that the host doesn't have any references to the folio. + +This patch introduces a new type for guest_memfd folios, which +isn't activated in this series but is here as a placeholder and +to facilitate the code in the subsequent patch series. This will +be used in the future to register a callback that informs the +guest_memfd subsystem when the last reference is dropped, +therefore knowing that the host doesn't have any remaining +references. + +This patch also introduces the configuration option, +KVM_GMEM_SHARED_MEM, which toggles support for mapping +guest_memfd shared memory at the host. 
+ +Signed-off-by: Fuad Tabba +Acked-by: Vlastimil Babka +Acked-by: David Hildenbrand +--- + include/linux/kvm_host.h | 4 ++++ + include/linux/page-flags.h | 16 ++++++++++++++++ + mm/debug.c | 1 + + mm/swap.c | 29 +++++++++++++++++++++++++++++ + virt/kvm/Kconfig | 4 ++++ + virt/kvm/guest_memfd.c | 8 ++++++++ + 6 files changed, 62 insertions(+) + +diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h +index f34f4cfaa513..3ad0719bfc4f 100644 +--- a/include/linux/kvm_host.h ++++ b/include/linux/kvm_host.h +@@ -2571,4 +2571,8 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, + struct kvm_pre_fault_memory *range); + #endif + ++#ifdef CONFIG_KVM_GMEM_SHARED_MEM ++void kvm_gmem_handle_folio_put(struct folio *folio); ++#endif ++ + #endif +diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h +index 6dc2494bd002..daeee9a38e4c 100644 +--- a/include/linux/page-flags.h ++++ b/include/linux/page-flags.h +@@ -933,6 +933,7 @@ enum pagetype { + PGTY_slab = 0xf5, + PGTY_zsmalloc = 0xf6, + PGTY_unaccepted = 0xf7, ++ PGTY_guestmem = 0xf8, + + PGTY_mapcount_underflow = 0xff + }; +@@ -1082,6 +1083,21 @@ FOLIO_TYPE_OPS(hugetlb, hugetlb) + FOLIO_TEST_FLAG_FALSE(hugetlb) + #endif + ++/* ++ * guestmem folios are used to back VM memory as managed by guest_memfd. Once ++ * the last reference is put, instead of freeing these folios back to the page ++ * allocator, they are returned to guest_memfd. ++ * ++ * For now, guestmem will only be set on these folios as long as they cannot be ++ * mapped to user space ("private state"), with the plan of always setting that ++ * type once typed folios can be mapped to user space cleanly. 
++ */ ++#ifdef CONFIG_KVM_GMEM_SHARED_MEM ++FOLIO_TYPE_OPS(guestmem, guestmem) ++#else ++FOLIO_TEST_FLAG_FALSE(guestmem) ++#endif ++ + PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) + + /* +diff --git a/mm/debug.c b/mm/debug.c +index 8d2acf432385..08bc42c6cba8 100644 +--- a/mm/debug.c ++++ b/mm/debug.c +@@ -56,6 +56,7 @@ static const char *page_type_names[] = { + DEF_PAGETYPE_NAME(table), + DEF_PAGETYPE_NAME(buddy), + DEF_PAGETYPE_NAME(unaccepted), ++ DEF_PAGETYPE_NAME(guestmem), + }; + + static const char *page_type_name(unsigned int page_type) +diff --git a/mm/swap.c b/mm/swap.c +index 47bc1bb919cc..d8fda3948684 100644 +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -38,6 +38,10 @@ + #include + #include + ++#ifdef CONFIG_KVM_GMEM_SHARED_MEM ++#include ++#endif ++ + #include "internal.h" + + #define CREATE_TRACE_POINTS +@@ -94,6 +98,26 @@ static void page_cache_release(struct folio *folio) + unlock_page_lruvec_irqrestore(lruvec, flags); + } + ++#ifdef CONFIG_KVM_GMEM_SHARED_MEM ++static void gmem_folio_put(struct folio *folio) ++{ ++ /* ++ * Perform the callback only as long as the KVM module is still loaded. ++ * As long as the folio mapping is set, the folio is associated with a ++ * guest_memfd inode. ++ */ ++ if (folio->mapping) ++ kvm_gmem_handle_folio_put(folio); ++ ++ /* ++ * If there are no references to the folio left, it's not associated ++ * with a guest_memfd inode anymore. 
++ */ ++ if (folio_ref_count(folio) == 0) ++ __folio_put(folio); ++} ++#endif /* CONFIG_KVM_GMEM_SHARED_MEM */ ++ + static void free_typed_folio(struct folio *folio) + { + switch (folio_get_type(folio)) { +@@ -101,6 +125,11 @@ static void free_typed_folio(struct folio *folio) + case PGTY_hugetlb: + free_huge_folio(folio); + return; ++#endif ++#ifdef CONFIG_KVM_GMEM_SHARED_MEM ++ case PGTY_guestmem: ++ gmem_folio_put(folio); ++ return; + #endif + default: + WARN_ON_ONCE(1); +diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig +index 54e959e7d68f..4e759e8020c5 100644 +--- a/virt/kvm/Kconfig ++++ b/virt/kvm/Kconfig +@@ -124,3 +124,7 @@ config HAVE_KVM_ARCH_GMEM_PREPARE + config HAVE_KVM_ARCH_GMEM_INVALIDATE + bool + depends on KVM_PRIVATE_MEM ++ ++config KVM_GMEM_SHARED_MEM ++ select KVM_PRIVATE_MEM ++ bool +diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c +index b2aa6bf24d3a..5fc414becae5 100644 +--- a/virt/kvm/guest_memfd.c ++++ b/virt/kvm/guest_memfd.c +@@ -13,6 +13,14 @@ struct kvm_gmem { + struct list_head entry; + }; + ++#ifdef CONFIG_KVM_GMEM_SHARED_MEM ++void kvm_gmem_handle_folio_put(struct folio *folio) ++{ ++ WARN_ONCE(1, "A placeholder that shouldn't trigger. Work in progress."); ++} ++EXPORT_SYMBOL_GPL(kvm_gmem_handle_folio_put); ++#endif /* CONFIG_KVM_GMEM_SHARED_MEM */ ++ + /** + * folio_file_pfn - like folio_file_page, but return a pfn. + * @folio: The folio which contains this index. 
+-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0003-KVM-guest_memfd-Allow-host-to-map-guest_memfd-pages.patch b/resources/hiding_ci/patches/0003-KVM-guest_memfd-Allow-host-to-map-guest_memfd-pages.patch new file mode 100644 index 00000000000..8fb306b257b --- /dev/null +++ b/resources/hiding_ci/patches/0003-KVM-guest_memfd-Allow-host-to-map-guest_memfd-pages.patch @@ -0,0 +1,193 @@ +From fd39febef2e0d41394e51f5e34f2c8de80b3b4dc Mon Sep 17 00:00:00 2001 +From: Fuad Tabba +Date: Tue, 18 Mar 2025 16:18:17 +0000 +Subject: [PATCH 03/34] KVM: guest_memfd: Allow host to map guest_memfd() pages + +Add support for mmap() and fault() for guest_memfd backed memory +in the host for VMs that support in-place conversion between +shared and private. To that end, this patch adds the ability to +check whether the VM type supports in-place conversion, and only +allows mapping its memory if that's the case. + +Also add the KVM capability KVM_CAP_GMEM_SHARED_MEM, which +indicates that the VM supports shared memory in guest_memfd, or +that the host can create VMs that support shared memory. +Supporting shared memory implies that memory can be mapped when +shared with the host. + +This is controlled by the KVM_GMEM_SHARED_MEM configuration +option. + +Signed-off-by: Fuad Tabba +--- + include/linux/kvm_host.h | 11 +++++ + include/uapi/linux/kvm.h | 1 + + virt/kvm/guest_memfd.c | 101 +++++++++++++++++++++++++++++++++++++++ + virt/kvm/kvm_main.c | 4 ++ + 4 files changed, 117 insertions(+) + +diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h +index 3ad0719bfc4f..601bbcaa5e41 100644 +--- a/include/linux/kvm_host.h ++++ b/include/linux/kvm_host.h +@@ -728,6 +728,17 @@ static inline bool kvm_arch_has_private_mem(struct kvm *kvm) + } + #endif + ++/* ++ * Arch code must define kvm_arch_gmem_supports_shared_mem if support for ++ * private memory is enabled and it supports in-place shared/private conversion. 
++ */ ++#if !defined(kvm_arch_gmem_supports_shared_mem) && !IS_ENABLED(CONFIG_KVM_GMEM_SHARED_MEM) ++static inline bool kvm_arch_gmem_supports_shared_mem(struct kvm *kvm) ++{ ++ return false; ++} ++#endif ++ + #ifndef kvm_arch_has_readonly_mem + static inline bool kvm_arch_has_readonly_mem(struct kvm *kvm) + { +diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h +index 45e6d8fca9b9..117937a895da 100644 +--- a/include/uapi/linux/kvm.h ++++ b/include/uapi/linux/kvm.h +@@ -929,6 +929,7 @@ struct kvm_enable_cap { + #define KVM_CAP_PRE_FAULT_MEMORY 236 + #define KVM_CAP_X86_APIC_BUS_CYCLES_NS 237 + #define KVM_CAP_X86_GUEST_MODE 238 ++#define KVM_CAP_GMEM_SHARED_MEM 239 + + struct kvm_irq_routing_irqchip { + __u32 irqchip; +diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c +index 5fc414becae5..fbf89e643add 100644 +--- a/virt/kvm/guest_memfd.c ++++ b/virt/kvm/guest_memfd.c +@@ -320,7 +320,108 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn) + return gfn - slot->base_gfn + slot->gmem.pgoff; + } + ++#ifdef CONFIG_KVM_GMEM_SHARED_MEM ++static bool kvm_gmem_offset_is_shared(struct file *file, pgoff_t index) ++{ ++ struct kvm_gmem *gmem = file->private_data; ++ ++ ++ /* For now, VMs that support shared memory share all their memory. 
*/ ++ return kvm_arch_gmem_supports_shared_mem(gmem->kvm); ++} ++ ++static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf) ++{ ++ struct inode *inode = file_inode(vmf->vma->vm_file); ++ struct folio *folio; ++ vm_fault_t ret = VM_FAULT_LOCKED; ++ ++ filemap_invalidate_lock_shared(inode->i_mapping); ++ ++ folio = kvm_gmem_get_folio(inode, vmf->pgoff); ++ if (IS_ERR(folio)) { ++ int err = PTR_ERR(folio); ++ ++ if (err == -EAGAIN) ++ ret = VM_FAULT_RETRY; ++ else ++ ret = vmf_error(err); ++ ++ goto out_filemap; ++ } ++ ++ if (folio_test_hwpoison(folio)) { ++ ret = VM_FAULT_HWPOISON; ++ goto out_folio; ++ } ++ ++ if (!kvm_gmem_offset_is_shared(vmf->vma->vm_file, vmf->pgoff)) { ++ ret = VM_FAULT_SIGBUS; ++ goto out_folio; ++ } ++ ++ /* ++ * Shared folios would not be marked as "guestmem" so far, and we only ++ * expect shared folios at this point. ++ */ ++ if (WARN_ON_ONCE(folio_test_guestmem(folio))) { ++ ret = VM_FAULT_SIGBUS; ++ goto out_folio; ++ } ++ ++ /* No support for huge pages. */ ++ if (WARN_ON_ONCE(folio_test_large(folio))) { ++ ret = VM_FAULT_SIGBUS; ++ goto out_folio; ++ } ++ ++ if (!folio_test_uptodate(folio)) { ++ clear_highpage(folio_page(folio, 0)); ++ kvm_gmem_mark_prepared(folio); ++ } ++ ++ vmf->page = folio_file_page(folio, vmf->pgoff); ++ ++out_folio: ++ if (ret != VM_FAULT_LOCKED) { ++ folio_unlock(folio); ++ folio_put(folio); ++ } ++ ++out_filemap: ++ filemap_invalidate_unlock_shared(inode->i_mapping); ++ ++ return ret; ++} ++ ++static const struct vm_operations_struct kvm_gmem_vm_ops = { ++ .fault = kvm_gmem_fault, ++}; ++ ++static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ struct kvm_gmem *gmem = file->private_data; ++ ++ if (!kvm_arch_gmem_supports_shared_mem(gmem->kvm)) ++ return -ENODEV; ++ ++ if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) != ++ (VM_SHARED | VM_MAYSHARE)) { ++ return -EINVAL; ++ } ++ ++ file_accessed(file); ++ vm_flags_set(vma, VM_DONTDUMP); ++ vma->vm_ops = &kvm_gmem_vm_ops; ++ ++ return 0; 
++} ++#else ++#define kvm_gmem_mmap NULL ++#endif /* CONFIG_KVM_GMEM_SHARED_MEM */ ++ + static struct file_operations kvm_gmem_fops = { ++ .mmap = kvm_gmem_mmap, + .open = generic_file_open, + .release = kvm_gmem_release, + .fallocate = kvm_gmem_fallocate, +diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c +index ba0327e2d0d3..38f0f402ea46 100644 +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -4830,6 +4830,10 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) + #ifdef CONFIG_KVM_PRIVATE_MEM + case KVM_CAP_GUEST_MEMFD: + return !kvm || kvm_arch_has_private_mem(kvm); ++#endif ++#ifdef CONFIG_KVM_GMEM_SHARED_MEM ++ case KVM_CAP_GMEM_SHARED_MEM: ++ return !kvm || kvm_arch_gmem_supports_shared_mem(kvm); + #endif + default: + break; +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0004-KVM-x86-Mark-KVM_X86_SW_PROTECTED_VM-as-supporting-g.patch b/resources/hiding_ci/patches/0004-KVM-x86-Mark-KVM_X86_SW_PROTECTED_VM-as-supporting-g.patch new file mode 100644 index 00000000000..33bf24f9415 --- /dev/null +++ b/resources/hiding_ci/patches/0004-KVM-x86-Mark-KVM_X86_SW_PROTECTED_VM-as-supporting-g.patch @@ -0,0 +1,58 @@ +From d16c343f0f95ecd8d2cda2dfba4ac8b7c293f217 Mon Sep 17 00:00:00 2001 +From: Fuad Tabba +Date: Tue, 18 Mar 2025 16:18:19 +0000 +Subject: [PATCH 04/34] KVM: x86: Mark KVM_X86_SW_PROTECTED_VM as supporting + guest_memfd shared memory + +The KVM_X86_SW_PROTECTED_VM type is meant for experimentation and +does not have any underlying support for protected guests. This +makes it a good candidate for testing mapping shared memory. +Therefore, when the kconfig option is enabled, mark +KVM_X86_SW_PROTECTED_VM as supporting shared memory. + +This means that this memory is considered by guest_memfd to be +shared with the host, with the possibility of in-place conversion +between shared and private. This allows the host to map and fault +in guest_memfd memory belonging to this VM type. 
+ +Signed-off-by: Fuad Tabba +--- + arch/x86/include/asm/kvm_host.h | 5 +++++ + arch/x86/kvm/Kconfig | 3 ++- + 2 files changed, 7 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 32ae3aa50c7e..b874e54a5ee4 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -2246,8 +2246,13 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, + + #ifdef CONFIG_KVM_PRIVATE_MEM + #define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem) ++ ++#define kvm_arch_gmem_supports_shared_mem(kvm) \ ++ (IS_ENABLED(CONFIG_KVM_GMEM_SHARED_MEM) && \ ++ ((kvm)->arch.vm_type == KVM_X86_SW_PROTECTED_VM)) + #else + #define kvm_arch_has_private_mem(kvm) false ++#define kvm_arch_gmem_supports_shared_mem(kvm) false + #endif + + #define kvm_arch_has_readonly_mem(kvm) (!(kvm)->arch.has_protected_state) +diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig +index ea2c4f21c1ca..22d1bcdaad58 100644 +--- a/arch/x86/kvm/Kconfig ++++ b/arch/x86/kvm/Kconfig +@@ -45,7 +45,8 @@ config KVM_X86 + select HAVE_KVM_PM_NOTIFIER if PM + select KVM_GENERIC_HARDWARE_ENABLING + select KVM_GENERIC_PRE_FAULT_MEMORY +- select KVM_GENERIC_PRIVATE_MEM if KVM_SW_PROTECTED_VM ++ select KVM_PRIVATE_MEM if KVM_SW_PROTECTED_VM ++ select KVM_GMEM_SHARED_MEM if KVM_SW_PROTECTED_VM + select KVM_WERROR if WERROR + + config KVM +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0005-KVM-arm64-Refactor-user_mem_abort-calculation-of-for.patch b/resources/hiding_ci/patches/0005-KVM-arm64-Refactor-user_mem_abort-calculation-of-for.patch new file mode 100644 index 00000000000..38b3292884d --- /dev/null +++ b/resources/hiding_ci/patches/0005-KVM-arm64-Refactor-user_mem_abort-calculation-of-for.patch @@ -0,0 +1,62 @@ +From 483ccb70335cb0c76161caf76c0ccb7c618038e2 Mon Sep 17 00:00:00 2001 +From: Fuad Tabba +Date: Tue, 18 Mar 2025 16:18:20 +0000 +Subject: [PATCH 05/34] KVM: arm64: Refactor user_mem_abort() 
calculation of + force_pte + +To simplify the code and to make the assumptions clearer, +refactor user_mem_abort() by immediately setting force_pte to +true if the conditions are met. Also, remove the comment about +logging_active being guaranteed to never be true for VM_PFNMAP +memslots, since it's not technically correct right now. + +No functional change intended. + +Signed-off-by: Fuad Tabba +--- + arch/arm64/kvm/mmu.c | 13 ++++--------- + 1 file changed, 4 insertions(+), 9 deletions(-) + +diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c +index 1f55b0c7b11d..887ffa1f5b14 100644 +--- a/arch/arm64/kvm/mmu.c ++++ b/arch/arm64/kvm/mmu.c +@@ -1460,7 +1460,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + bool fault_is_perm) + { + int ret = 0; +- bool write_fault, writable, force_pte = false; ++ bool write_fault, writable; + bool exec_fault, mte_allowed; + bool device = false, vfio_allow_any_uc = false; + unsigned long mmu_seq; +@@ -1472,6 +1472,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + gfn_t gfn; + kvm_pfn_t pfn; + bool logging_active = memslot_is_logging(memslot); ++ bool force_pte = logging_active || is_protected_kvm_enabled(); + long vma_pagesize, fault_granule; + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; + struct kvm_pgtable *pgt; +@@ -1521,16 +1522,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + return -EFAULT; + } + +- /* +- * logging_active is guaranteed to never be true for VM_PFNMAP +- * memslots. 
+- */ +- if (logging_active || is_protected_kvm_enabled()) { +- force_pte = true; ++ if (force_pte) + vma_shift = PAGE_SHIFT; +- } else { ++ else + vma_shift = get_vma_page_shift(vma, hva); +- } + + switch (vma_shift) { + #ifndef __PAGETABLE_PMD_FOLDED +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0006-KVM-guest_memfd-Handle-in-place-shared-memory-as-gue.patch b/resources/hiding_ci/patches/0006-KVM-guest_memfd-Handle-in-place-shared-memory-as-gue.patch new file mode 100644 index 00000000000..05d9e08b1fc --- /dev/null +++ b/resources/hiding_ci/patches/0006-KVM-guest_memfd-Handle-in-place-shared-memory-as-gue.patch @@ -0,0 +1,40 @@ +From b1e925d4d5db8513dba67c3a9d40a2b507668f09 Mon Sep 17 00:00:00 2001 +From: Fuad Tabba +Date: Tue, 18 Mar 2025 16:18:18 +0000 +Subject: [PATCH 06/34] KVM: guest_memfd: Handle in-place shared memory as + guest_memfd backed memory + +For VMs that allow sharing guest_memfd backed memory in-place, +handle that memory the same as "private" guest_memfd memory. This +means that faulting that memory in the host or in the guest will +go through the guest_memfd subsystem. + +Note that the word "private" in the name of the function +kvm_mem_is_private() doesn't necessarily indicate that the memory +isn't shared, but is due to the history and evolution of +guest_memfd and the various names it has received. In effect, +this function is used to multiplex between the path of a normal +page fault and the path of a guest_memfd backed page fault. 
+ +Signed-off-by: Fuad Tabba +--- + include/linux/kvm_host.h | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h +index 601bbcaa5e41..3d5595a71a2a 100644 +--- a/include/linux/kvm_host.h ++++ b/include/linux/kvm_host.h +@@ -2521,7 +2521,8 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) + #else + static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) + { +- return false; ++ return kvm_arch_gmem_supports_shared_mem(kvm) && ++ kvm_slot_can_be_private(gfn_to_memslot(kvm, gfn)); + } + #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */ + +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0007-KVM-arm64-Handle-guest_memfd-backed-guest-page-fault.patch b/resources/hiding_ci/patches/0007-KVM-arm64-Handle-guest_memfd-backed-guest-page-fault.patch new file mode 100644 index 00000000000..1ce256e6f57 --- /dev/null +++ b/resources/hiding_ci/patches/0007-KVM-arm64-Handle-guest_memfd-backed-guest-page-fault.patch @@ -0,0 +1,174 @@ +From 996513a423377349767d5cfef87850e80131854f Mon Sep 17 00:00:00 2001 +From: Fuad Tabba +Date: Tue, 18 Mar 2025 16:18:21 +0000 +Subject: [PATCH 07/34] KVM: arm64: Handle guest_memfd()-backed guest page + faults + +Add arm64 support for handling guest page faults on guest_memfd +backed memslots. + +For now, the fault granule is restricted to PAGE_SIZE. 
+ +Signed-off-by: Fuad Tabba +--- + arch/arm64/kvm/mmu.c | 65 +++++++++++++++++++++++++++------------- + include/linux/kvm_host.h | 5 ++++ + virt/kvm/kvm_main.c | 5 ---- + 3 files changed, 50 insertions(+), 25 deletions(-) + +diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c +index 887ffa1f5b14..adb0681fc1c6 100644 +--- a/arch/arm64/kvm/mmu.c ++++ b/arch/arm64/kvm/mmu.c +@@ -1454,6 +1454,30 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) + return vma->vm_flags & VM_MTE_ALLOWED; + } + ++static kvm_pfn_t faultin_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, ++ gfn_t gfn, bool write_fault, bool *writable, ++ struct page **page, bool is_private) ++{ ++ kvm_pfn_t pfn; ++ int ret; ++ ++ if (!is_private) ++ return __kvm_faultin_pfn(slot, gfn, write_fault ? FOLL_WRITE : 0, writable, page); ++ ++ *writable = false; ++ ++ ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, page, NULL); ++ if (!ret) { ++ *writable = !memslot_is_readonly(slot); ++ return pfn; ++ } ++ ++ if (ret == -EHWPOISON) ++ return KVM_PFN_ERR_HWPOISON; ++ ++ return KVM_PFN_ERR_NOSLOT_MASK; ++} ++ + static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + struct kvm_s2_trans *nested, + struct kvm_memory_slot *memslot, unsigned long hva, +@@ -1461,19 +1485,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + { + int ret = 0; + bool write_fault, writable; +- bool exec_fault, mte_allowed; ++ bool exec_fault, mte_allowed = false; + bool device = false, vfio_allow_any_uc = false; + unsigned long mmu_seq; + phys_addr_t ipa = fault_ipa; + struct kvm *kvm = vcpu->kvm; +- struct vm_area_struct *vma; ++ struct vm_area_struct *vma = NULL; + short vma_shift; + void *memcache; +- gfn_t gfn; ++ gfn_t gfn = ipa >> PAGE_SHIFT; + kvm_pfn_t pfn; + bool logging_active = memslot_is_logging(memslot); +- bool force_pte = logging_active || is_protected_kvm_enabled(); +- long vma_pagesize, fault_granule; ++ bool is_gmem = kvm_mem_is_private(kvm, gfn); ++ bool 
force_pte = logging_active || is_gmem || is_protected_kvm_enabled(); ++ long vma_pagesize, fault_granule = PAGE_SIZE; + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; + struct kvm_pgtable *pgt; + struct page *page; +@@ -1510,16 +1535,22 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + return ret; + } + ++ mmap_read_lock(current->mm); ++ + /* + * Let's check if we will get back a huge page backed by hugetlbfs, or + * get block mapping for device MMIO region. + */ +- mmap_read_lock(current->mm); +- vma = vma_lookup(current->mm, hva); +- if (unlikely(!vma)) { +- kvm_err("Failed to find VMA for hva 0x%lx\n", hva); +- mmap_read_unlock(current->mm); +- return -EFAULT; ++ if (!is_gmem) { ++ vma = vma_lookup(current->mm, hva); ++ if (unlikely(!vma)) { ++ kvm_err("Failed to find VMA for hva 0x%lx\n", hva); ++ mmap_read_unlock(current->mm); ++ return -EFAULT; ++ } ++ ++ vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; ++ mte_allowed = kvm_vma_mte_allowed(vma); + } + + if (force_pte) +@@ -1590,18 +1621,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + ipa &= ~(vma_pagesize - 1); + } + +- gfn = ipa >> PAGE_SHIFT; +- mte_allowed = kvm_vma_mte_allowed(vma); +- +- vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; +- + /* Don't use the VMA after the unlock -- it may have vanished */ + vma = NULL; + + /* + * Read mmu_invalidate_seq so that KVM can detect if the results of +- * vma_lookup() or __kvm_faultin_pfn() become stale prior to +- * acquiring kvm->mmu_lock. ++ * vma_lookup() or faultin_pfn() become stale prior to acquiring ++ * kvm->mmu_lock. + * + * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs + * with the smp_wmb() in kvm_mmu_invalidate_end(). +@@ -1609,8 +1635,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + mmu_seq = vcpu->kvm->mmu_invalidate_seq; + mmap_read_unlock(current->mm); + +- pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? 
FOLL_WRITE : 0, +- &writable, &page); ++ pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_gmem); + if (pfn == KVM_PFN_ERR_HWPOISON) { + kvm_send_hwpoison_signal(hva, vma_shift); + return 0; +diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h +index 3d5595a71a2a..ec3bedc18eab 100644 +--- a/include/linux/kvm_host.h ++++ b/include/linux/kvm_host.h +@@ -1882,6 +1882,11 @@ static inline int memslot_id(struct kvm *kvm, gfn_t gfn) + return gfn_to_memslot(kvm, gfn)->id; + } + ++static inline bool memslot_is_readonly(const struct kvm_memory_slot *slot) ++{ ++ return slot->flags & KVM_MEM_READONLY; ++} ++ + static inline gfn_t + hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot) + { +diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c +index 38f0f402ea46..3e40acb9f5c0 100644 +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -2624,11 +2624,6 @@ unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn) + return size; + } + +-static bool memslot_is_readonly(const struct kvm_memory_slot *slot) +-{ +- return slot->flags & KVM_MEM_READONLY; +-} +- + static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn, + gfn_t *nr_pages, bool write) + { +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0008-KVM-guest_memfd-selftests-guest_memfd-mmap-test-when.patch b/resources/hiding_ci/patches/0008-KVM-guest_memfd-selftests-guest_memfd-mmap-test-when.patch new file mode 100644 index 00000000000..872ee55d789 --- /dev/null +++ b/resources/hiding_ci/patches/0008-KVM-guest_memfd-selftests-guest_memfd-mmap-test-when.patch @@ -0,0 +1,149 @@ +From 1ee5d01987bff47f007fb86ad7738b299816b2ef Mon Sep 17 00:00:00 2001 +From: Fuad Tabba +Date: Tue, 18 Mar 2025 16:18:23 +0000 +Subject: [PATCH 08/34] KVM: guest_memfd: selftests: guest_memfd mmap() test + when mapping is allowed + +Expand the guest_memfd selftests to include testing mapping guest +memory for VM types that support it. 
+ +Also, build the guest_memfd selftest for arm64. + +Signed-off-by: Fuad Tabba +--- + tools/testing/selftests/kvm/Makefile.kvm | 1 + + .../testing/selftests/kvm/guest_memfd_test.c | 75 +++++++++++++++++-- + 2 files changed, 70 insertions(+), 6 deletions(-) + +diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm +index 4277b983cace..c9a3f30e28dd 100644 +--- a/tools/testing/selftests/kvm/Makefile.kvm ++++ b/tools/testing/selftests/kvm/Makefile.kvm +@@ -160,6 +160,7 @@ TEST_GEN_PROGS_arm64 += coalesced_io_test + TEST_GEN_PROGS_arm64 += demand_paging_test + TEST_GEN_PROGS_arm64 += dirty_log_test + TEST_GEN_PROGS_arm64 += dirty_log_perf_test ++TEST_GEN_PROGS_arm64 += guest_memfd_test + TEST_GEN_PROGS_arm64 += guest_print_test + TEST_GEN_PROGS_arm64 += get-reg-list + TEST_GEN_PROGS_arm64 += kvm_create_max_vcpus +diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c +index ce687f8d248f..38c501e49e0e 100644 +--- a/tools/testing/selftests/kvm/guest_memfd_test.c ++++ b/tools/testing/selftests/kvm/guest_memfd_test.c +@@ -34,12 +34,48 @@ static void test_file_read_write(int fd) + "pwrite on a guest_mem fd should fail"); + } + +-static void test_mmap(int fd, size_t page_size) ++static void test_mmap_allowed(int fd, size_t total_size) + { ++ size_t page_size = getpagesize(); ++ const char val = 0xaa; ++ char *mem; ++ int ret; ++ int i; ++ ++ mem = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); ++ TEST_ASSERT(mem != MAP_FAILED, "mmaping() guest memory should pass."); ++ ++ memset(mem, val, total_size); ++ for (i = 0; i < total_size; i++) ++ TEST_ASSERT_EQ(mem[i], val); ++ ++ ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, ++ page_size); ++ TEST_ASSERT(!ret, "fallocate the first page should succeed"); ++ ++ for (i = 0; i < page_size; i++) ++ TEST_ASSERT_EQ(mem[i], 0x00); ++ for (; i < total_size; i++) ++ TEST_ASSERT_EQ(mem[i], val); ++ ++ 
memset(mem, val, total_size); ++ for (i = 0; i < total_size; i++) ++ TEST_ASSERT_EQ(mem[i], val); ++ ++ ret = munmap(mem, total_size); ++ TEST_ASSERT(!ret, "munmap should succeed"); ++} ++ ++static void test_mmap_denied(int fd, size_t total_size) ++{ ++ size_t page_size = getpagesize(); + char *mem; + + mem = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + TEST_ASSERT_EQ(mem, MAP_FAILED); ++ ++ mem = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); ++ TEST_ASSERT_EQ(mem, MAP_FAILED); + } + + static void test_file_size(int fd, size_t page_size, size_t total_size) +@@ -170,19 +206,27 @@ static void test_create_guest_memfd_multiple(struct kvm_vm *vm) + close(fd1); + } + +-int main(int argc, char *argv[]) ++unsigned long get_shared_type(void) + { +- size_t page_size; ++#ifdef __x86_64__ ++ return KVM_X86_SW_PROTECTED_VM; ++#endif ++ return 0; ++} ++ ++void test_vm_type(unsigned long type, bool is_shared) ++{ ++ struct kvm_vm *vm; + size_t total_size; ++ size_t page_size; + int fd; +- struct kvm_vm *vm; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_GUEST_MEMFD)); + + page_size = getpagesize(); + total_size = page_size * 4; + +- vm = vm_create_barebones(); ++ vm = vm_create_barebones_type(type); + + test_create_guest_memfd_invalid(vm); + test_create_guest_memfd_multiple(vm); +@@ -190,10 +234,29 @@ int main(int argc, char *argv[]) + fd = vm_create_guest_memfd(vm, total_size, 0); + + test_file_read_write(fd); +- test_mmap(fd, page_size); ++ ++ if (is_shared) ++ test_mmap_allowed(fd, total_size); ++ else ++ test_mmap_denied(fd, total_size); ++ + test_file_size(fd, page_size, total_size); + test_fallocate(fd, page_size, total_size); + test_invalid_punch_hole(fd, page_size, total_size); + + close(fd); ++ kvm_vm_release(vm); ++} ++ ++int main(int argc, char *argv[]) ++{ ++#ifndef __aarch64__ ++ /* For now, arm64 only supports shared guest memory. 
*/ ++ test_vm_type(VM_TYPE_DEFAULT, false); ++#endif ++ ++ if (kvm_has_cap(KVM_CAP_GMEM_SHARED_MEM)) ++ test_vm_type(get_shared_type(), true); ++ ++ return 0; + } +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0009-KVM-arm64-Enable-mapping-guest_memfd-in-arm64.patch b/resources/hiding_ci/patches/0009-KVM-arm64-Enable-mapping-guest_memfd-in-arm64.patch new file mode 100644 index 00000000000..eb2cb3fd4e8 --- /dev/null +++ b/resources/hiding_ci/patches/0009-KVM-arm64-Enable-mapping-guest_memfd-in-arm64.patch @@ -0,0 +1,51 @@ +From 3cc51efc17a2c41a480eed36b31c1773936717e0 Mon Sep 17 00:00:00 2001 +From: Fuad Tabba +Date: Tue, 18 Mar 2025 16:18:22 +0000 +Subject: [PATCH 09/34] KVM: arm64: Enable mapping guest_memfd in arm64 + +Enable mapping guest_memfd in arm64. For now, it applies to all +VMs in arm64 that use guest_memfd. In the future, new VM types +can restrict this via kvm_arch_gmem_supports_shared_mem(). + +Signed-off-by: Fuad Tabba +--- + arch/arm64/include/asm/kvm_host.h | 12 ++++++++++++ + arch/arm64/kvm/Kconfig | 1 + + 2 files changed, 13 insertions(+) + +diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h +index d919557af5e5..4440b2334a05 100644 +--- a/arch/arm64/include/asm/kvm_host.h ++++ b/arch/arm64/include/asm/kvm_host.h +@@ -1543,4 +1543,16 @@ void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val); + #define kvm_has_s1poe(k) \ + (kvm_has_feat((k), ID_AA64MMFR3_EL1, S1POE, IMP)) + ++#ifdef CONFIG_KVM_PRIVATE_MEM ++static inline bool kvm_arch_has_private_mem(struct kvm *kvm) ++{ ++ return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM); ++} ++ ++static inline bool kvm_arch_gmem_supports_shared_mem(struct kvm *kvm) ++{ ++ return IS_ENABLED(CONFIG_KVM_GMEM_SHARED_MEM); ++} ++#endif /* CONFIG_KVM_PRIVATE_MEM */ ++ + #endif /* __ARM64_KVM_HOST_H__ */ +diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig +index ead632ad01b4..4830d8805bed 100644 +--- a/arch/arm64/kvm/Kconfig ++++ b/arch/arm64/kvm/Kconfig +@@ -38,6 +38,7 @@ 
menuconfig KVM + select HAVE_KVM_VCPU_RUN_PID_CHANGE + select SCHED_INFO + select GUEST_PERF_EVENTS if PERF_EVENTS ++ select KVM_GMEM_SHARED_MEM + help + Support hosting virtualized guest machines. + +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0002-mm-introduce-AS_NO_DIRECT_MAP.patch b/resources/hiding_ci/patches/0010-mm-introduce-AS_NO_DIRECT_MAP.patch similarity index 98% rename from resources/hiding_ci/patches/0002-mm-introduce-AS_NO_DIRECT_MAP.patch rename to resources/hiding_ci/patches/0010-mm-introduce-AS_NO_DIRECT_MAP.patch index 53dfc236022..0ec11b19bb7 100644 --- a/resources/hiding_ci/patches/0002-mm-introduce-AS_NO_DIRECT_MAP.patch +++ b/resources/hiding_ci/patches/0010-mm-introduce-AS_NO_DIRECT_MAP.patch @@ -1,7 +1,7 @@ -From 138b7a4c83c43b42851cb8fec2bbdbaadd960241 Mon Sep 17 00:00:00 2001 +From 22ec89c0ff7af3430027cf71cf8bce5c8ed6e402 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Fri, 7 Feb 2025 11:16:06 +0000 -Subject: [PATCH 1/2] mm: introduce AS_NO_DIRECT_MAP +Subject: [PATCH 10/34] mm: introduce AS_NO_DIRECT_MAP Add AS_NO_DIRECT_MAP for mappings where direct map entries of folios are set to not present . 
Currently, mappings that match this description are @@ -204,5 +204,5 @@ index 1b0a214ee558..ea4c04d469b1 100644 inode->i_op = &secretmem_iops; inode->i_mapping->a_ops = &secretmem_aops; -- -2.48.1 +2.47.1 diff --git a/resources/hiding_ci/patches/0003-KVM-guest_memfd-Add-flag-to-remove-from-direct-map.patch b/resources/hiding_ci/patches/0011-KVM-guest_memfd-Add-flag-to-remove-from-direct-map.patch similarity index 97% rename from resources/hiding_ci/patches/0003-KVM-guest_memfd-Add-flag-to-remove-from-direct-map.patch rename to resources/hiding_ci/patches/0011-KVM-guest_memfd-Add-flag-to-remove-from-direct-map.patch index c46e04e8543..0c8b984a2e6 100644 --- a/resources/hiding_ci/patches/0003-KVM-guest_memfd-Add-flag-to-remove-from-direct-map.patch +++ b/resources/hiding_ci/patches/0011-KVM-guest_memfd-Add-flag-to-remove-from-direct-map.patch @@ -1,7 +1,7 @@ -From 9bbc39f9c7622f0060d395b1063a564c24926d8d Mon Sep 17 00:00:00 2001 +From b1fc478976c93fd42b14e06d2de57e121be03142 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Fri, 7 Feb 2025 14:33:01 +0000 -Subject: [PATCH 2/2] KVM: guest_memfd: Add flag to remove from direct map +Subject: [PATCH 11/34] KVM: guest_memfd: Add flag to remove from direct map Add KVM_GMEM_NO_DIRECT_MAP flag for KVM_CREATE_GUEST_MEMFD() ioctl. 
When set, guest_memfd folios will be removed from the direct map after @@ -174,5 +174,5 @@ index 3e40acb9f5c0..32ca1c921ab0 100644 return !kvm || kvm_arch_has_private_mem(kvm); #endif -- -2.48.1 +2.47.1 diff --git a/resources/hiding_ci/patches/0012-patrick-v4-fixup.patch b/resources/hiding_ci/patches/0012-patrick-v4-fixup.patch new file mode 100644 index 00000000000..7532bcbe975 --- /dev/null +++ b/resources/hiding_ci/patches/0012-patrick-v4-fixup.patch @@ -0,0 +1,51 @@ +From 098a8167ad6c55336cad9abb808ebdc105784278 Mon Sep 17 00:00:00 2001 +From: Nikita Kalyazin +Date: Thu, 10 Apr 2025 14:18:39 +0000 +Subject: [PATCH 12/34] patrick v4 fixup + +Do not make kvm_gmem_free_folio dependent on +CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE . +--- + virt/kvm/guest_memfd.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c +index a2b96bc51391..291d647a5c80 100644 +--- a/virt/kvm/guest_memfd.c ++++ b/virt/kvm/guest_memfd.c +@@ -487,28 +487,28 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol + return MF_DELAYED; + } + +-#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE + static void kvm_gmem_free_folio(struct folio *folio) + { ++#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE + struct page *page = folio_page(folio, 0); + kvm_pfn_t pfn = page_to_pfn(page); + int order = folio_order(folio); ++#endif + + if (folio_test_private(folio)) + WARN_ON_ONCE(set_direct_map_valid_noflush(folio_page(folio, 0), + folio_nr_pages(folio), true)); + ++#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE + kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order)); +-} + #endif ++} + + static const struct address_space_operations kvm_gmem_aops = { + .dirty_folio = noop_dirty_folio, + .migrate_folio = kvm_gmem_migrate_folio, + .error_remove_folio = kvm_gmem_error_folio, +-#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE + .free_folio = kvm_gmem_free_folio, +-#endif + }; + + static int kvm_gmem_getattr(struct mnt_idmap 
*idmap, const struct path *path, +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0013-KVM-Add-KVM_MEM_USERFAULT-memslot-flag-and-bitmap.patch b/resources/hiding_ci/patches/0013-KVM-Add-KVM_MEM_USERFAULT-memslot-flag-and-bitmap.patch new file mode 100644 index 00000000000..263752c8c6e --- /dev/null +++ b/resources/hiding_ci/patches/0013-KVM-Add-KVM_MEM_USERFAULT-memslot-flag-and-bitmap.patch @@ -0,0 +1,161 @@ +From abfa51cb95feaae899254453788c6db1c70d0189 Mon Sep 17 00:00:00 2001 +From: James Houghton +Date: Thu, 9 Jan 2025 20:49:17 +0000 +Subject: [PATCH 13/34] KVM: Add KVM_MEM_USERFAULT memslot flag and bitmap + +Use one of the 14 reserved u64s in struct kvm_userspace_memory_region2 +for the user to provide `userfault_bitmap`. + +The memslot flag indicates if KVM should be reading from the +`userfault_bitmap` field from the memslot. The user is permitted to +provide a bogus pointer. If the pointer cannot be read from, we will +return -EFAULT (with no other information) back to the user. 
+ +Signed-off-by: James Houghton +--- + include/linux/kvm_host.h | 14 ++++++++++++++ + include/uapi/linux/kvm.h | 4 +++- + virt/kvm/Kconfig | 3 +++ + virt/kvm/kvm_main.c | 36 ++++++++++++++++++++++++++++++++++++ + 4 files changed, 56 insertions(+), 1 deletion(-) + +diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h +index ec3bedc18eab..6cd0d910678e 100644 +--- a/include/linux/kvm_host.h ++++ b/include/linux/kvm_host.h +@@ -596,6 +596,7 @@ struct kvm_memory_slot { + unsigned long *dirty_bitmap; + struct kvm_arch_memory_slot arch; + unsigned long userspace_addr; ++ unsigned long __user *userfault_bitmap; + u32 flags; + short id; + u16 as_id; +@@ -746,6 +747,11 @@ static inline bool kvm_arch_has_readonly_mem(struct kvm *kvm) + } + #endif + ++static inline bool kvm_has_userfault(struct kvm *kvm) ++{ ++ return IS_ENABLED(CONFIG_HAVE_KVM_USERFAULT); ++} ++ + struct kvm_memslots { + u64 generation; + atomic_long_t last_used_slot; +@@ -2592,4 +2598,12 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, + void kvm_gmem_handle_folio_put(struct folio *folio); + #endif + ++int kvm_gfn_userfault(struct kvm *kvm, struct kvm_memory_slot *memslot, ++ gfn_t gfn); ++ ++static inline bool kvm_memslot_userfault(struct kvm_memory_slot *memslot) ++{ ++ return memslot->flags & KVM_MEM_USERFAULT; ++} ++ + #endif +diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h +index fb02a93546d8..03676746be71 100644 +--- a/include/uapi/linux/kvm.h ++++ b/include/uapi/linux/kvm.h +@@ -40,7 +40,8 @@ struct kvm_userspace_memory_region2 { + __u64 guest_memfd_offset; + __u32 guest_memfd; + __u32 pad1; +- __u64 pad2[14]; ++ __u64 userfault_bitmap; ++ __u64 pad2[13]; + }; + + /* +@@ -51,6 +52,7 @@ struct kvm_userspace_memory_region2 { + #define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) + #define KVM_MEM_READONLY (1UL << 1) + #define KVM_MEM_GUEST_MEMFD (1UL << 2) ++#define KVM_MEM_USERFAULT (1UL << 3) + + /* for KVM_IRQ_LINE */ + struct kvm_irq_level { +diff --git 
a/virt/kvm/Kconfig b/virt/kvm/Kconfig +index 4e759e8020c5..7987fed3f3ec 100644 +--- a/virt/kvm/Kconfig ++++ b/virt/kvm/Kconfig +@@ -128,3 +128,6 @@ config HAVE_KVM_ARCH_GMEM_INVALIDATE + config KVM_GMEM_SHARED_MEM + select KVM_PRIVATE_MEM + bool ++ ++config HAVE_KVM_USERFAULT ++ bool +\ No newline at end of file +diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c +index 32ca1c921ab0..fb3ccf0cbb04 100644 +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -1532,6 +1532,9 @@ static int check_memory_region_flags(struct kvm *kvm, + !(mem->flags & KVM_MEM_GUEST_MEMFD)) + valid_flags |= KVM_MEM_READONLY; + ++ if (kvm_has_userfault(kvm)) ++ valid_flags |= KVM_MEM_USERFAULT; ++ + if (mem->flags & ~valid_flags) + return -EINVAL; + +@@ -1968,6 +1971,13 @@ static int kvm_set_memory_region(struct kvm *kvm, + (mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES) + return -EINVAL; + ++ if (mem->flags & KVM_MEM_USERFAULT && ++ ((mem->userfault_bitmap != untagged_addr(mem->userfault_bitmap)) || ++ !access_ok((void __user *)(unsigned long)mem->userfault_bitmap, ++ DIV_ROUND_UP(mem->memory_size >> PAGE_SHIFT, BITS_PER_LONG) ++ * sizeof(long)))) ++ return -EINVAL; ++ + slots = __kvm_memslots(kvm, as_id); + + /* +@@ -2035,6 +2045,9 @@ static int kvm_set_memory_region(struct kvm *kvm, + if (r) + goto out; + } ++ if (mem->flags & KVM_MEM_USERFAULT) ++ new->userfault_bitmap = ++ (unsigned long __user *)(unsigned long)mem->userfault_bitmap; + + r = kvm_set_memslot(kvm, old, new, change); + if (r) +@@ -6468,3 +6481,26 @@ void kvm_exit(void) + kvm_irqfd_exit(); + } + EXPORT_SYMBOL_GPL(kvm_exit); ++ ++int kvm_gfn_userfault(struct kvm *kvm, struct kvm_memory_slot *memslot, ++ gfn_t gfn) ++{ ++ unsigned long bitmap_chunk = 0; ++ off_t offset; ++ ++ if (!kvm_memslot_userfault(memslot)) ++ return 0; ++ ++ if (WARN_ON_ONCE(!memslot->userfault_bitmap)) ++ return 0; ++ ++ offset = gfn - memslot->base_gfn; ++ ++ if (copy_from_user(&bitmap_chunk, ++ memslot->userfault_bitmap + offset / 
BITS_PER_LONG, ++ sizeof(bitmap_chunk))) ++ return -EFAULT; ++ ++ /* Set in the bitmap means that the gfn is userfault */ ++ return !!(bitmap_chunk & (1ul << (offset % BITS_PER_LONG))); ++} +\ No newline at end of file +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0014-KVM-Add-KVM_MEMORY_EXIT_FLAG_USERFAULT.patch b/resources/hiding_ci/patches/0014-KVM-Add-KVM_MEMORY_EXIT_FLAG_USERFAULT.patch new file mode 100644 index 00000000000..11d3f4d0618 --- /dev/null +++ b/resources/hiding_ci/patches/0014-KVM-Add-KVM_MEMORY_EXIT_FLAG_USERFAULT.patch @@ -0,0 +1,28 @@ +From 0a91075ecff4f60404bc05da0d10d41e1b33fcec Mon Sep 17 00:00:00 2001 +From: James Houghton +Date: Thu, 9 Jan 2025 20:49:18 +0000 +Subject: [PATCH 14/34] KVM: Add KVM_MEMORY_EXIT_FLAG_USERFAULT + +This flag is used for vCPU memory faults caused by KVM Userfault; i.e., +the bit in `userfault_bitmap` corresponding to the faulting gfn was set. + +Signed-off-by: James Houghton +--- + include/uapi/linux/kvm.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h +index 03676746be71..0e1a2fac5735 100644 +--- a/include/uapi/linux/kvm.h ++++ b/include/uapi/linux/kvm.h +@@ -444,6 +444,7 @@ struct kvm_run { + /* KVM_EXIT_MEMORY_FAULT */ + struct { + #define KVM_MEMORY_EXIT_FLAG_PRIVATE (1ULL << 3) ++#define KVM_MEMORY_EXIT_FLAG_USERFAULT (1ULL << 4) + __u64 flags; + __u64 gpa; + __u64 size; +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0015-KVM-Allow-late-setting-of-KVM_MEM_USERFAULT-on-guest.patch b/resources/hiding_ci/patches/0015-KVM-Allow-late-setting-of-KVM_MEM_USERFAULT-on-guest.patch new file mode 100644 index 00000000000..38f7624d108 --- /dev/null +++ b/resources/hiding_ci/patches/0015-KVM-Allow-late-setting-of-KVM_MEM_USERFAULT-on-guest.patch @@ -0,0 +1,58 @@ +From 7a626f99566e7ab7fce004fcf4041d87e75512cc Mon Sep 17 00:00:00 2001 +From: James Houghton +Date: Thu, 9 Jan 2025 20:49:19 +0000 +Subject: [PATCH 15/34] KVM: Allow late setting of 
KVM_MEM_USERFAULT on + guest_memfd memslot + +Currently guest_memfd memslots can only be deleted. Slightly change the +logic to allow KVM_MR_FLAGS_ONLY changes when the only flag being +changed is KVM_MEM_USERFAULT. + +Signed-off-by: James Houghton +--- + virt/kvm/kvm_main.c | 15 +++++++++++---- + 1 file changed, 11 insertions(+), 4 deletions(-) + +diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c +index fb3ccf0cbb04..c60fe692de03 100644 +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -2009,9 +2009,6 @@ static int kvm_set_memory_region(struct kvm *kvm, + if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages) + return -EINVAL; + } else { /* Modify an existing slot. */ +- /* Private memslots are immutable, they can only be deleted. */ +- if (mem->flags & KVM_MEM_GUEST_MEMFD) +- return -EINVAL; + if ((mem->userspace_addr != old->userspace_addr) || + (npages != old->npages) || + ((mem->flags ^ old->flags) & KVM_MEM_READONLY)) +@@ -2025,6 +2022,16 @@ static int kvm_set_memory_region(struct kvm *kvm, + return 0; + } + ++ /* ++ * Except for being able to set KVM_MEM_USERFAULT, private memslots are ++ * immutable, they can only be deleted. 
++ */ ++ if (mem->flags & KVM_MEM_GUEST_MEMFD && ++ !(change == KVM_MR_CREATE || ++ (change == KVM_MR_FLAGS_ONLY && ++ (mem->flags ^ old->flags) == KVM_MEM_USERFAULT))) ++ return -EINVAL; ++ + if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) && + kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages)) + return -EEXIST; +@@ -2040,7 +2047,7 @@ static int kvm_set_memory_region(struct kvm *kvm, + new->npages = npages; + new->flags = mem->flags; + new->userspace_addr = mem->userspace_addr; +- if (mem->flags & KVM_MEM_GUEST_MEMFD) { ++ if (mem->flags & KVM_MEM_GUEST_MEMFD && change == KVM_MR_CREATE) { + r = kvm_gmem_bind(kvm, new, mem->guest_memfd, mem->guest_memfd_offset); + if (r) + goto out; +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0016-KVM-x86-mmu-Add-support-for-KVM_MEM_USERFAULT.patch b/resources/hiding_ci/patches/0016-KVM-x86-mmu-Add-support-for-KVM_MEM_USERFAULT.patch new file mode 100644 index 00000000000..5895c60b415 --- /dev/null +++ b/resources/hiding_ci/patches/0016-KVM-x86-mmu-Add-support-for-KVM_MEM_USERFAULT.patch @@ -0,0 +1,217 @@ +From 7e9f22f83ca191df8deab84d7c8c9d64a1b5b10f Mon Sep 17 00:00:00 2001 +From: James Houghton +Date: Thu, 9 Jan 2025 20:49:21 +0000 +Subject: [PATCH 16/34] KVM: x86/mmu: Add support for KVM_MEM_USERFAULT + +Adhering to the requirements of KVM Userfault: + +1. Zap all sptes for the memslot when KVM_MEM_USERFAULT is toggled on + with kvm_arch_flush_shadow_memslot(). +2. Only all PAGE_SIZE sptes when KVM_MEM_USERFAULT is enabled (for both + normal/GUP memory and guest_memfd memory). +3. Reconstruct huge mappings when KVM_MEM_USERFAULT is toggled off with + kvm_mmu_recover_huge_pages(). This is the behavior when dirty logging + is disabled; remain consistent with it. + +With the new logic in kvm_mmu_slot_apply_flags(), I've simplified the +two dirty-logging-toggle checks into one, and I have dropped the +WARN_ON() that was there. 
+ +Signed-off-by: James Houghton +--- + arch/x86/kvm/Kconfig | 1 + + arch/x86/kvm/mmu/mmu.c | 28 +++++++++++++++++++++---- + arch/x86/kvm/mmu/mmu_internal.h | 20 +++++++++++++++--- + arch/x86/kvm/x86.c | 36 ++++++++++++++++++++++++--------- + include/linux/kvm_host.h | 5 ++++- + 5 files changed, 72 insertions(+), 18 deletions(-) + +diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig +index 22d1bcdaad58..6b1ef6402e30 100644 +--- a/arch/x86/kvm/Kconfig ++++ b/arch/x86/kvm/Kconfig +@@ -48,6 +48,7 @@ config KVM_X86 + select KVM_PRIVATE_MEM if KVM_SW_PROTECTED_VM + select KVM_GMEM_SHARED_MEM if KVM_SW_PROTECTED_VM + select KVM_WERROR if WERROR ++ select HAVE_KVM_USERFAULT + + config KVM + tristate "Kernel-based Virtual Machine (KVM) support" +diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c +index 8160870398b9..7ac7dc164522 100644 +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -4292,14 +4292,20 @@ static inline u8 kvm_max_level_for_order(int order) + return PG_LEVEL_4K; + } + +-static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, +- u8 max_level, int gmem_order) ++static u8 kvm_max_private_mapping_level(struct kvm *kvm, ++ struct kvm_memory_slot *slot, ++ kvm_pfn_t pfn, ++ u8 max_level, ++ int gmem_order) + { + u8 req_max_level; + + if (max_level == PG_LEVEL_4K) + return PG_LEVEL_4K; + ++ if (kvm_memslot_userfault(slot)) ++ return PG_LEVEL_4K; ++ + max_level = min(kvm_max_level_for_order(gmem_order), max_level); + if (max_level == PG_LEVEL_4K) + return PG_LEVEL_4K; +@@ -4336,8 +4342,10 @@ static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu, + } + + fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY); +- fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn, +- fault->max_level, max_order); ++ fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->slot, ++ fault->pfn, ++ fault->max_level, ++ max_order); + + return RET_PF_CONTINUE; + } +@@ -4346,6 +4354,18 @@ static int 
__kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) + { + unsigned int foll = fault->write ? FOLL_WRITE : 0; ++ int userfault; ++ ++ userfault = kvm_gfn_userfault(vcpu->kvm, fault->slot, fault->gfn); ++ if (userfault < 0) ++ return userfault; ++ if (userfault) { ++ kvm_mmu_prepare_userfault_exit(vcpu, fault); ++ return -EFAULT; ++ } ++ ++ if (kvm_memslot_userfault(fault->slot)) ++ fault->max_level = PG_LEVEL_4K; + + if (fault->is_private) + return kvm_mmu_faultin_pfn_private(vcpu, fault); +diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h +index 75f00598289d..d1f18dcc18fb 100644 +--- a/arch/x86/kvm/mmu/mmu_internal.h ++++ b/arch/x86/kvm/mmu/mmu_internal.h +@@ -335,12 +335,26 @@ enum { + */ + static_assert(RET_PF_CONTINUE == 0); + +-static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, +- struct kvm_page_fault *fault) ++static inline void __kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, ++ struct kvm_page_fault *fault, ++ bool is_userfault) + { + kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT, + PAGE_SIZE, fault->write, fault->exec, +- fault->is_private); ++ fault->is_private, ++ is_userfault); ++} ++ ++static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, ++ struct kvm_page_fault *fault) ++{ ++ __kvm_mmu_prepare_memory_fault_exit(vcpu, fault, false); ++} ++ ++static inline void kvm_mmu_prepare_userfault_exit(struct kvm_vcpu *vcpu, ++ struct kvm_page_fault *fault) ++{ ++ __kvm_mmu_prepare_memory_fault_exit(vcpu, fault, true); + } + + static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 4b64ab350bcd..04034ca04703 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -13075,12 +13075,36 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, + u32 new_flags = new ? 
new->flags : 0; + bool log_dirty_pages = new_flags & KVM_MEM_LOG_DIRTY_PAGES; + ++ /* ++ * When toggling KVM Userfault on, zap all sptes so that userfault-ness ++ * will be respected at refault time. All new faults will only install ++ * small sptes. Therefore, when toggling it off, recover hugepages. ++ * ++ * For MOVE and DELETE, there will be nothing to do, as the old ++ * mappings will have already been deleted by ++ * kvm_arch_flush_shadow_memslot(). ++ * ++ * For CREATE, no mappings will have been created yet. ++ */ ++ if ((old_flags ^ new_flags) & KVM_MEM_USERFAULT && ++ (change == KVM_MR_FLAGS_ONLY)) { ++ if (old_flags & KVM_MEM_USERFAULT) ++ kvm_mmu_recover_huge_pages(kvm, new); ++ else ++ kvm_arch_flush_shadow_memslot(kvm, old); ++ } ++ ++ /* ++ * Nothing more to do if dirty logging isn't being toggled. ++ */ ++ if (!((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)) ++ return; ++ + /* + * Update CPU dirty logging if dirty logging is being toggled. This + * applies to all operations. + */ +- if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) +- kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages); ++ kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages); + + /* + * Nothing more to do for RO slots (which can't be dirtied and can't be +@@ -13100,14 +13124,6 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, + if ((change != KVM_MR_FLAGS_ONLY) || (new_flags & KVM_MEM_READONLY)) + return; + +- /* +- * READONLY and non-flags changes were filtered out above, and the only +- * other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty +- * logging isn't being toggled on or off. 
+- */ +- if (WARN_ON_ONCE(!((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES))) +- return; +- + if (!log_dirty_pages) { + /* + * Recover huge page mappings in the slot now that dirty logging +diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h +index 6cd0d910678e..4a5379367332 100644 +--- a/include/linux/kvm_host.h ++++ b/include/linux/kvm_host.h +@@ -2499,7 +2499,8 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr) + static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, + gpa_t gpa, gpa_t size, + bool is_write, bool is_exec, +- bool is_private) ++ bool is_private, ++ bool is_userfault) + { + vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT; + vcpu->run->memory_fault.gpa = gpa; +@@ -2509,6 +2510,8 @@ static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, + vcpu->run->memory_fault.flags = 0; + if (is_private) + vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE; ++ if (is_userfault) ++ vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_USERFAULT; + } + + #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0017-KVM-Advertise-KVM_CAP_USERFAULT-in-KVM_CHECK_EXTENSI.patch b/resources/hiding_ci/patches/0017-KVM-Advertise-KVM_CAP_USERFAULT-in-KVM_CHECK_EXTENSI.patch new file mode 100644 index 00000000000..bdbcb9117ba --- /dev/null +++ b/resources/hiding_ci/patches/0017-KVM-Advertise-KVM_CAP_USERFAULT-in-KVM_CHECK_EXTENSI.patch @@ -0,0 +1,45 @@ +From fe6a44733d6384057fd68732ebf3aeb612443d14 Mon Sep 17 00:00:00 2001 +From: James Houghton +Date: Thu, 9 Jan 2025 20:49:20 +0000 +Subject: [PATCH 17/34] KVM: Advertise KVM_CAP_USERFAULT in KVM_CHECK_EXTENSION + +Advertise support for KVM_CAP_USERFAULT when kvm_has_userfault() returns +true. Currently this is merely IS_ENABLED(CONFIG_HAVE_KVM_USERFAULT), so +it is somewhat redundant. 
+ +Signed-off-by: James Houghton +--- + include/uapi/linux/kvm.h | 1 + + virt/kvm/kvm_main.c | 4 ++++ + 2 files changed, 5 insertions(+) + +diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h +index 0e1a2fac5735..f5ad5d39c24b 100644 +--- a/include/uapi/linux/kvm.h ++++ b/include/uapi/linux/kvm.h +@@ -934,6 +934,7 @@ struct kvm_enable_cap { + #define KVM_CAP_X86_GUEST_MODE 238 + #define KVM_CAP_GMEM_SHARED_MEM 239 + #define KVM_CAP_GMEM_NO_DIRECT_MAP 240 ++#define KVM_CAP_USERFAULT 241 + + struct kvm_irq_routing_irqchip { + __u32 irqchip; +diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c +index c60fe692de03..bb85ea8d0f85 100644 +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -4854,6 +4854,10 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) + #ifdef CONFIG_KVM_GMEM_SHARED_MEM + case KVM_CAP_GMEM_SHARED_MEM: + return !kvm || kvm_arch_gmem_supports_shared_mem(kvm); ++#endif ++#ifdef CONFIG_HAVE_KVM_USERFAULT ++ case KVM_CAP_USERFAULT: ++ return kvm_has_userfault(kvm); + #endif + default: + break; +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0018-KVM-arm64-Add-support-for-KVM_MEM_USERFAULT.patch b/resources/hiding_ci/patches/0018-KVM-arm64-Add-support-for-KVM_MEM_USERFAULT.patch new file mode 100644 index 00000000000..bc9d608d15b --- /dev/null +++ b/resources/hiding_ci/patches/0018-KVM-arm64-Add-support-for-KVM_MEM_USERFAULT.patch @@ -0,0 +1,87 @@ +From 86888e840dce0193dfb14916857df1c09749b618 Mon Sep 17 00:00:00 2001 +From: James Houghton +Date: Thu, 9 Jan 2025 20:49:22 +0000 +Subject: [PATCH 18/34] KVM: arm64: Add support for KVM_MEM_USERFAULT + +Adhering to the requirements of KVM Userfault: +1. When it is toggled on, zap the second stage with + kvm_arch_flush_shadow_memslot(). This is to respect userfault-ness. +2. When KVM_MEM_USERFAULT is enabled, restrict new second-stage mappings + to be PAGE_SIZE, just like when dirty logging is enabled. 
+ +Do not zap the second stage when KVM_MEM_USERFAULT is disabled to remain +consistent with the behavior when dirty logging is disabled. + +Signed-off-by: James Houghton +--- + arch/arm64/kvm/Kconfig | 1 + + arch/arm64/kvm/mmu.c | 27 ++++++++++++++++++++++++++- + 2 files changed, 27 insertions(+), 1 deletion(-) + +diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig +index 4830d8805bed..aa0f438fba1c 100644 +--- a/arch/arm64/kvm/Kconfig ++++ b/arch/arm64/kvm/Kconfig +@@ -39,6 +39,7 @@ menuconfig KVM + select SCHED_INFO + select GUEST_PERF_EVENTS if PERF_EVENTS + select KVM_GMEM_SHARED_MEM ++ select HAVE_KVM_USERFAULT + help + Support hosting virtualized guest machines. + +diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c +index adb0681fc1c6..39d9a02db9e9 100644 +--- a/arch/arm64/kvm/mmu.c ++++ b/arch/arm64/kvm/mmu.c +@@ -1497,7 +1497,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + kvm_pfn_t pfn; + bool logging_active = memslot_is_logging(memslot); + bool is_gmem = kvm_mem_is_private(kvm, gfn); +- bool force_pte = logging_active || is_gmem || is_protected_kvm_enabled(); ++ bool force_pte = logging_active || is_gmem || is_protected_kvm_enabled() || ++ kvm_memslot_userfault(memslot); + long vma_pagesize, fault_granule = PAGE_SIZE; + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; + struct kvm_pgtable *pgt; +@@ -1635,6 +1636,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + mmu_seq = vcpu->kvm->mmu_invalidate_seq; + mmap_read_unlock(current->mm); + ++ if (kvm_gfn_userfault(kvm, memslot, gfn)) { ++ kvm_prepare_memory_fault_exit(vcpu, gfn << PAGE_SHIFT, ++ PAGE_SIZE, write_fault, ++ exec_fault, false, true); ++ return -EFAULT; ++ } ++ + pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_gmem); + if (pfn == KVM_PFN_ERR_HWPOISON) { + kvm_send_hwpoison_signal(hva, vma_shift); +@@ -2125,6 +2133,23 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, + enum kvm_mr_change change) 
+ { + bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES; ++ u32 new_flags = new ? new->flags : 0; ++ u32 changed_flags = (new_flags) ^ (old ? old->flags : 0); ++ ++ /* ++ * If KVM_MEM_USERFAULT has been enabled, drop all the stage-2 mappings ++ * so that we can respect userfault-ness. ++ */ ++ if ((changed_flags & KVM_MEM_USERFAULT) && ++ (new_flags & KVM_MEM_USERFAULT) && ++ change == KVM_MR_FLAGS_ONLY) ++ kvm_arch_flush_shadow_memslot(kvm, old); ++ ++ /* ++ * Nothing left to do if not toggling dirty logging. ++ */ ++ if (!(changed_flags & KVM_MEM_LOG_DIRTY_PAGES)) ++ return; + + /* + * At this point memslot has been committed and there is an +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0019-KVM-selftests-Fix-vm_mem_region_set_flags-docstring.patch b/resources/hiding_ci/patches/0019-KVM-selftests-Fix-vm_mem_region_set_flags-docstring.patch new file mode 100644 index 00000000000..effde901d5b --- /dev/null +++ b/resources/hiding_ci/patches/0019-KVM-selftests-Fix-vm_mem_region_set_flags-docstring.patch @@ -0,0 +1,28 @@ +From 398111201a62de89a4973295512b303d74ea0662 Mon Sep 17 00:00:00 2001 +From: James Houghton +Date: Thu, 9 Jan 2025 20:49:23 +0000 +Subject: [PATCH 19/34] KVM: selftests: Fix vm_mem_region_set_flags docstring + +`flags` is what region->region.flags gets set to. 
+ +Signed-off-by: James Houghton +--- + tools/testing/selftests/kvm/lib/kvm_util.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c +index 33fefeb3ca44..a87988a162f1 100644 +--- a/tools/testing/selftests/kvm/lib/kvm_util.c ++++ b/tools/testing/selftests/kvm/lib/kvm_util.c +@@ -1124,7 +1124,7 @@ memslot2region(struct kvm_vm *vm, uint32_t memslot) + * + * Input Args: + * vm - Virtual Machine +- * flags - Starting guest physical address ++ * flags - Flags for the memslot + * + * Output Args: None + * +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0020-KVM-selftests-Fix-prefault_mem-logic.patch b/resources/hiding_ci/patches/0020-KVM-selftests-Fix-prefault_mem-logic.patch new file mode 100644 index 00000000000..e97c522033e --- /dev/null +++ b/resources/hiding_ci/patches/0020-KVM-selftests-Fix-prefault_mem-logic.patch @@ -0,0 +1,37 @@ +From 281292095132694847d44d12de0268045ae727ec Mon Sep 17 00:00:00 2001 +From: James Houghton +Date: Thu, 9 Jan 2025 20:49:24 +0000 +Subject: [PATCH 20/34] KVM: selftests: Fix prefault_mem logic + +The previous logic didn't handle the case where memory was partitioned +AND we were using a single userfaultfd. It would only prefault the first +vCPU's memory and not the rest. + +Signed-off-by: James Houghton +--- + tools/testing/selftests/kvm/demand_paging_test.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c +index 0202b78f8680..315f5c9037b4 100644 +--- a/tools/testing/selftests/kvm/demand_paging_test.c ++++ b/tools/testing/selftests/kvm/demand_paging_test.c +@@ -172,11 +172,13 @@ static void run_test(enum vm_guest_mode mode, void *arg) + memset(guest_data_prototype, 0xAB, demand_paging_size); + + if (p->uffd_mode == UFFDIO_REGISTER_MODE_MINOR) { +- num_uffds = p->single_uffd ? 
1 : nr_vcpus; +- for (i = 0; i < num_uffds; i++) { ++ for (i = 0; i < nr_vcpus; i++) { + vcpu_args = &memstress_args.vcpu_args[i]; + prefault_mem(addr_gpa2alias(vm, vcpu_args->gpa), + vcpu_args->pages * memstress_args.guest_page_size); ++ if (!p->partition_vcpu_memory_access) ++ /* We prefaulted everything */ ++ break; + } + } + +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0021-KVM-selftests-Add-va_start-end-into-uffd_desc.patch b/resources/hiding_ci/patches/0021-KVM-selftests-Add-va_start-end-into-uffd_desc.patch new file mode 100644 index 00000000000..67bf2b8c9d8 --- /dev/null +++ b/resources/hiding_ci/patches/0021-KVM-selftests-Add-va_start-end-into-uffd_desc.patch @@ -0,0 +1,44 @@ +From c64583011616045a4b70e34aeef6fd77e6f23ccc Mon Sep 17 00:00:00 2001 +From: James Houghton +Date: Thu, 9 Jan 2025 20:49:25 +0000 +Subject: [PATCH 21/34] KVM: selftests: Add va_start/end into uffd_desc + +This will be used for the self-test to look up which userfaultfd we +should be using when handling a KVM Userfault (in the event KVM +Userfault and userfaultfd are being used together). 
+ +Signed-off-by: James Houghton +--- + tools/testing/selftests/kvm/include/userfaultfd_util.h | 2 ++ + tools/testing/selftests/kvm/lib/userfaultfd_util.c | 2 ++ + 2 files changed, 4 insertions(+) + +diff --git a/tools/testing/selftests/kvm/include/userfaultfd_util.h b/tools/testing/selftests/kvm/include/userfaultfd_util.h +index 60f7f9d435dc..b62fecdfe745 100644 +--- a/tools/testing/selftests/kvm/include/userfaultfd_util.h ++++ b/tools/testing/selftests/kvm/include/userfaultfd_util.h +@@ -30,6 +30,8 @@ struct uffd_desc { + int *pipefds; + pthread_t *readers; + struct uffd_reader_args *reader_args; ++ void *va_start; ++ void *va_end; + }; + + struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, +diff --git a/tools/testing/selftests/kvm/lib/userfaultfd_util.c b/tools/testing/selftests/kvm/lib/userfaultfd_util.c +index 7c9de8414462..93004c85bcdc 100644 +--- a/tools/testing/selftests/kvm/lib/userfaultfd_util.c ++++ b/tools/testing/selftests/kvm/lib/userfaultfd_util.c +@@ -152,6 +152,8 @@ struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, + expected_ioctls, "missing userfaultfd ioctls"); + + uffd_desc->uffd = uffd; ++ uffd_desc->va_start = hva; ++ uffd_desc->va_end = (char *)hva + len; + for (i = 0; i < uffd_desc->num_readers; ++i) { + int pipes[2]; + +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0022-KVM-selftests-Inform-set_memory_region_test-of-KVM_M.patch b/resources/hiding_ci/patches/0022-KVM-selftests-Inform-set_memory_region_test-of-KVM_M.patch new file mode 100644 index 00000000000..22ec8dbcc83 --- /dev/null +++ b/resources/hiding_ci/patches/0022-KVM-selftests-Inform-set_memory_region_test-of-KVM_M.patch @@ -0,0 +1,31 @@ +From 73cef3464706c3665efcbac533979e83716b0d86 Mon Sep 17 00:00:00 2001 +From: James Houghton +Date: Thu, 9 Jan 2025 20:49:27 +0000 +Subject: [PATCH 22/34] KVM: selftests: Inform set_memory_region_test of + KVM_MEM_USERFAULT + +The KVM_MEM_USERFAULT flag is supported iff KVM_CAP_USERFAULT 
is +available. + +Signed-off-by: James Houghton +--- + tools/testing/selftests/kvm/set_memory_region_test.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c +index bc440d5aba57..56231c02d88c 100644 +--- a/tools/testing/selftests/kvm/set_memory_region_test.c ++++ b/tools/testing/selftests/kvm/set_memory_region_test.c +@@ -364,6 +364,9 @@ static void test_invalid_memory_region_flags(void) + if (kvm_check_cap(KVM_CAP_MEMORY_ATTRIBUTES) & KVM_MEMORY_ATTRIBUTE_PRIVATE) + supported_flags |= KVM_MEM_GUEST_MEMFD; + ++ if (kvm_check_cap(KVM_CAP_USERFAULT)) ++ supported_flags |= KVM_MEM_USERFAULT; ++ + for (i = 0; i < 32; i++) { + if ((supported_flags & BIT(i)) && !(v2_only_flags & BIT(i))) + continue; +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0023-KVM-selftests-Add-KVM-Userfault-mode-to-demand_pagin.patch b/resources/hiding_ci/patches/0023-KVM-selftests-Add-KVM-Userfault-mode-to-demand_pagin.patch new file mode 100644 index 00000000000..d0b3c5e7489 --- /dev/null +++ b/resources/hiding_ci/patches/0023-KVM-selftests-Add-KVM-Userfault-mode-to-demand_pagin.patch @@ -0,0 +1,381 @@ +From 6aa865fbd91cb293da61db4e11f0ce1a6de36cae Mon Sep 17 00:00:00 2001 +From: James Houghton +Date: Thu, 9 Jan 2025 20:49:26 +0000 +Subject: [PATCH 23/34] KVM: selftests: Add KVM Userfault mode to + demand_paging_test + +Add a way for the KVM_RUN loop to handle -EFAULT exits when they are for +KVM_MEMORY_EXIT_FLAG_USERFAULT. In this case, preemptively handle the +UFFDIO_COPY or UFFDIO_CONTINUE if userfaultfd is also in use. This saves +the trip through the userfaultfd poll/read/WAKE loop. + +When preemptively handling UFFDIO_COPY/CONTINUE, do so with +MODE_DONTWAKE, as there will not be a thread to wake. If a thread *does* +take the userfaultfd slow path, we will get a regular userfault, and we +will call handle_uffd_page_request() which will do a full wake-up. 
In +the EEXIST case, a wake-up will not occur. Make sure to call UFFDIO_WAKE +explicitly in this case. + +When handling KVM userfaults, make sure to set the bitmap with +memory_order_release. Although it wouldn't affect the functionality of +the test (because memstress doesn't actually require any particular +guest memory contents), it is what userspace normally needs to do. + +Add `-k` to set the test to use KVM Userfault. + +Add the vm_mem_region_set_flags_userfault() helper for setting +`userfault_bitmap` and KVM_MEM_USERFAULT at the same time. + +Signed-off-by: James Houghton +--- + .../selftests/kvm/demand_paging_test.c | 139 +++++++++++++++++- + .../testing/selftests/kvm/include/kvm_util.h | 5 + + tools/testing/selftests/kvm/lib/kvm_util.c | 40 ++++- + 3 files changed, 176 insertions(+), 8 deletions(-) + +diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c +index 315f5c9037b4..183c70731093 100644 +--- a/tools/testing/selftests/kvm/demand_paging_test.c ++++ b/tools/testing/selftests/kvm/demand_paging_test.c +@@ -12,7 +12,9 @@ + #include + #include + #include ++#include + #include ++#include + + #include "kvm_util.h" + #include "test_util.h" +@@ -24,11 +26,21 @@ + #ifdef __NR_userfaultfd + + static int nr_vcpus = 1; ++static int num_uffds; + static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; + + static size_t demand_paging_size; ++static size_t host_page_size; + static char *guest_data_prototype; + ++static struct { ++ bool enabled; ++ int uffd_mode; /* set if userfaultfd is also in use */ ++ struct uffd_desc **uffd_descs; ++} kvm_userfault_data; ++ ++static void resolve_kvm_userfault(u64 gpa, u64 size); ++ + static void vcpu_worker(struct memstress_vcpu_args *vcpu_args) + { + struct kvm_vcpu *vcpu = vcpu_args->vcpu; +@@ -41,8 +53,22 @@ static void vcpu_worker(struct memstress_vcpu_args *vcpu_args) + clock_gettime(CLOCK_MONOTONIC, &start); + + /* Let the guest access its memory */ 
++restart: + ret = _vcpu_run(vcpu); +- TEST_ASSERT(ret == 0, "vcpu_run failed: %d", ret); ++ if (ret < 0 && errno == EFAULT && kvm_userfault_data.enabled) { ++ /* Check for userfault. */ ++ TEST_ASSERT(run->exit_reason == KVM_EXIT_MEMORY_FAULT, ++ "Got invalid exit reason: %x", run->exit_reason); ++ TEST_ASSERT(run->memory_fault.flags == ++ KVM_MEMORY_EXIT_FLAG_USERFAULT, ++ "Got invalid memory fault exit: %llx", ++ run->memory_fault.flags); ++ resolve_kvm_userfault(run->memory_fault.gpa, ++ run->memory_fault.size); ++ goto restart; ++ } else ++ TEST_ASSERT(ret == 0, "vcpu_run failed: %d", ret); ++ + if (get_ucall(vcpu, NULL) != UCALL_SYNC) { + TEST_ASSERT(false, + "Invalid guest sync status: exit_reason=%s", +@@ -54,11 +80,10 @@ static void vcpu_worker(struct memstress_vcpu_args *vcpu_args) + ts_diff.tv_sec, ts_diff.tv_nsec); + } + +-static int handle_uffd_page_request(int uffd_mode, int uffd, +- struct uffd_msg *msg) ++static int resolve_uffd_page_request(int uffd_mode, int uffd, uint64_t addr, ++ bool wake) + { + pid_t tid = syscall(__NR_gettid); +- uint64_t addr = msg->arg.pagefault.address; + struct timespec start; + struct timespec ts_diff; + int r; +@@ -71,7 +96,7 @@ static int handle_uffd_page_request(int uffd_mode, int uffd, + copy.src = (uint64_t)guest_data_prototype; + copy.dst = addr; + copy.len = demand_paging_size; +- copy.mode = 0; ++ copy.mode = wake ? 0 : UFFDIO_COPY_MODE_DONTWAKE; + + r = ioctl(uffd, UFFDIO_COPY, ©); + /* +@@ -96,6 +121,7 @@ static int handle_uffd_page_request(int uffd_mode, int uffd, + + cont.range.start = addr; + cont.range.len = demand_paging_size; ++ cont.mode = wake ? 0 : UFFDIO_CONTINUE_MODE_DONTWAKE; + + r = ioctl(uffd, UFFDIO_CONTINUE, &cont); + /* +@@ -119,6 +145,20 @@ static int handle_uffd_page_request(int uffd_mode, int uffd, + TEST_FAIL("Invalid uffd mode %d", uffd_mode); + } + ++ if (r < 0 && wake) { ++ /* ++ * No wake-up occurs when UFFDIO_COPY/CONTINUE fails, but we ++ * have a thread waiting. Wake it up. 
++ */ ++ struct uffdio_range range = {0}; ++ ++ range.start = addr; ++ range.len = demand_paging_size; ++ ++ TEST_ASSERT(ioctl(uffd, UFFDIO_WAKE, &range) == 0, ++ "UFFDIO_WAKE failed: 0x%lx", addr); ++ } ++ + ts_diff = timespec_elapsed(start); + + PER_PAGE_DEBUG("UFFD page-in %d \t%ld ns\n", tid, +@@ -129,6 +169,58 @@ static int handle_uffd_page_request(int uffd_mode, int uffd, + return 0; + } + ++static int handle_uffd_page_request(int uffd_mode, int uffd, ++ struct uffd_msg *msg) ++{ ++ uint64_t addr = msg->arg.pagefault.address; ++ ++ return resolve_uffd_page_request(uffd_mode, uffd, addr, true); ++} ++ ++static void resolve_kvm_userfault(u64 gpa, u64 size) ++{ ++ struct kvm_vm *vm = memstress_args.vm; ++ struct userspace_mem_region *region; ++ unsigned long *bitmap_chunk; ++ u64 page, gpa_offset; ++ ++ region = (struct userspace_mem_region *) userspace_mem_region_find( ++ vm, gpa, (gpa + size - 1)); ++ ++ if (kvm_userfault_data.uffd_mode) { ++ /* ++ * Resolve userfaults early, without needing to read them ++ * off the userfaultfd. 
++ */ ++ uint64_t hva = (uint64_t)addr_gpa2hva(vm, gpa); ++ struct uffd_desc **descs = kvm_userfault_data.uffd_descs; ++ int i, fd; ++ ++ for (i = 0; i < num_uffds; ++i) ++ if (hva >= (uint64_t)descs[i]->va_start && ++ hva < (uint64_t)descs[i]->va_end) ++ break; ++ ++ TEST_ASSERT(i < num_uffds, ++ "Did not find userfaultfd for hva: %lx", hva); ++ ++ fd = kvm_userfault_data.uffd_descs[i]->uffd; ++ resolve_uffd_page_request(kvm_userfault_data.uffd_mode, fd, ++ hva, false); ++ } else { ++ uint64_t hva = (uint64_t)addr_gpa2hva(vm, gpa); ++ ++ memcpy((char *)hva, guest_data_prototype, demand_paging_size); ++ } ++ ++ gpa_offset = gpa - region->region.guest_phys_addr; ++ page = gpa_offset / host_page_size; ++ bitmap_chunk = (unsigned long *)region->region.userfault_bitmap + ++ page / BITS_PER_LONG; ++ atomic_fetch_and_explicit((_Atomic unsigned long *)bitmap_chunk, ++ ~(1ul << (page % BITS_PER_LONG)), memory_order_release); ++} ++ + struct test_params { + int uffd_mode; + bool single_uffd; +@@ -136,6 +228,7 @@ struct test_params { + int readers_per_uffd; + enum vm_mem_backing_src_type src_type; + bool partition_vcpu_memory_access; ++ bool kvm_userfault; + }; + + static void prefault_mem(void *alias, uint64_t len) +@@ -149,6 +242,25 @@ static void prefault_mem(void *alias, uint64_t len) + } + } + ++static void enable_userfault(struct kvm_vm *vm, int slots) ++{ ++ for (int i = 0; i < slots; ++i) { ++ int slot = MEMSTRESS_MEM_SLOT_INDEX + i; ++ struct userspace_mem_region *region; ++ unsigned long *userfault_bitmap; ++ int flags = KVM_MEM_USERFAULT; ++ ++ region = memslot2region(vm, slot); ++ userfault_bitmap = bitmap_zalloc(region->mmap_size / ++ host_page_size); ++ /* everything is userfault initially */ ++ memset(userfault_bitmap, -1, region->mmap_size / host_page_size / CHAR_BIT); ++ printf("Setting bitmap: %p\n", userfault_bitmap); ++ vm_mem_region_set_flags_userfault(vm, slot, flags, ++ userfault_bitmap); ++ } ++} ++ + static void run_test(enum vm_guest_mode mode, void 
*arg) + { + struct memstress_vcpu_args *vcpu_args; +@@ -159,12 +271,13 @@ static void run_test(enum vm_guest_mode mode, void *arg) + struct timespec ts_diff; + double vcpu_paging_rate; + struct kvm_vm *vm; +- int i, num_uffds = 0; ++ int i; + + vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, + p->src_type, p->partition_vcpu_memory_access); + + demand_paging_size = get_backing_src_pagesz(p->src_type); ++ host_page_size = getpagesize(); + + guest_data_prototype = malloc(demand_paging_size); + TEST_ASSERT(guest_data_prototype, +@@ -208,6 +321,14 @@ static void run_test(enum vm_guest_mode mode, void *arg) + } + } + ++ if (p->kvm_userfault) { ++ TEST_REQUIRE(kvm_has_cap(KVM_CAP_USERFAULT)); ++ kvm_userfault_data.enabled = true; ++ kvm_userfault_data.uffd_mode = p->uffd_mode; ++ kvm_userfault_data.uffd_descs = uffd_descs; ++ enable_userfault(vm, 1); ++ } ++ + pr_info("Finished creating vCPUs and starting uffd threads\n"); + + clock_gettime(CLOCK_MONOTONIC, &start); +@@ -265,6 +386,7 @@ static void help(char *name) + printf(" -v: specify the number of vCPUs to run.\n"); + printf(" -o: Overlap guest memory accesses instead of partitioning\n" + " them into a separate region of memory for each vCPU.\n"); ++ printf(" -k: Use KVM Userfault\n"); + puts(""); + exit(0); + } +@@ -283,7 +405,7 @@ int main(int argc, char *argv[]) + + guest_modes_append_default(); + +- while ((opt = getopt(argc, argv, "ahom:u:d:b:s:v:c:r:")) != -1) { ++ while ((opt = getopt(argc, argv, "ahokm:u:d:b:s:v:c:r:")) != -1) { + switch (opt) { + case 'm': + guest_modes_cmdline(optarg); +@@ -326,6 +448,9 @@ int main(int argc, char *argv[]) + "Invalid number of readers per uffd %d: must be >=1", + p.readers_per_uffd); + break; ++ case 'k': ++ p.kvm_userfault = true; ++ break; + case 'h': + default: + help(argv[0]); +diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h +index 4c4e5a847f67..0d49a9ce832a 100644 +--- 
a/tools/testing/selftests/kvm/include/kvm_util.h ++++ b/tools/testing/selftests/kvm/include/kvm_util.h +@@ -582,6 +582,8 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, + void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, + uint64_t guest_paddr, uint32_t slot, uint64_t npages, + uint32_t flags, int guest_memfd_fd, uint64_t guest_memfd_offset); ++struct userspace_mem_region * ++userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end); + + #ifndef vm_arch_has_protected_memory + static inline bool vm_arch_has_protected_memory(struct kvm_vm *vm) +@@ -591,6 +593,9 @@ static inline bool vm_arch_has_protected_memory(struct kvm_vm *vm) + #endif + + void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); ++void vm_mem_region_set_flags_userfault(struct kvm_vm *vm, uint32_t slot, ++ uint32_t flags, ++ unsigned long *userfault_bitmap); + void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); + void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); + struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id); +diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c +index a87988a162f1..a8f6b949ac59 100644 +--- a/tools/testing/selftests/kvm/lib/kvm_util.c ++++ b/tools/testing/selftests/kvm/lib/kvm_util.c +@@ -634,7 +634,7 @@ void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[], + * of the regions is returned. Null is returned only when no overlapping + * region exists. 
+ */ +-static struct userspace_mem_region * ++struct userspace_mem_region * + userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end) + { + struct rb_node *node; +@@ -1149,6 +1149,44 @@ void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags) + ret, errno, slot, flags); + } + ++/* ++ * VM Memory Region Flags Set with a userfault bitmap ++ * ++ * Input Args: ++ * vm - Virtual Machine ++ * flags - Flags for the memslot ++ * userfault_bitmap - The bitmap to use for KVM_MEM_USERFAULT ++ * ++ * Output Args: None ++ * ++ * Return: None ++ * ++ * Sets the flags of the memory region specified by the value of slot, ++ * to the values given by flags. This helper adds a way to provide a ++ * userfault_bitmap. ++ */ ++void vm_mem_region_set_flags_userfault(struct kvm_vm *vm, uint32_t slot, ++ uint32_t flags, ++ unsigned long *userfault_bitmap) ++{ ++ int ret; ++ struct userspace_mem_region *region; ++ ++ region = memslot2region(vm, slot); ++ ++ TEST_ASSERT(!userfault_bitmap ^ (flags & KVM_MEM_USERFAULT), ++ "KVM_MEM_USERFAULT must be specified with a bitmap"); ++ ++ region->region.flags = flags; ++ region->region.userfault_bitmap = (__u64)userfault_bitmap; ++ ++ ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); ++ ++ TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n" ++ " rc: %i errno: %i slot: %u flags: 0x%x", ++ ret, errno, slot, flags); ++} ++ + /* + * VM Memory Region Move + * +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0024-KVM-selftests-Add-KVM_MEM_USERFAULT-guest_memfd-togg.patch b/resources/hiding_ci/patches/0024-KVM-selftests-Add-KVM_MEM_USERFAULT-guest_memfd-togg.patch new file mode 100644 index 00000000000..36b151a0f3c --- /dev/null +++ b/resources/hiding_ci/patches/0024-KVM-selftests-Add-KVM_MEM_USERFAULT-guest_memfd-togg.patch @@ -0,0 +1,65 @@ +From be1d7a3ce1b177d64198b8e060bc9a3844f462cd Mon Sep 17 00:00:00 2001 +From: James Houghton +Date: Thu, 9 Jan 2025 20:49:28 +0000 
+Subject: [PATCH 24/34] KVM: selftests: Add KVM_MEM_USERFAULT + guest_memfd + toggle tests + +Make sure KVM_MEM_USERFAULT can be toggled on and off for +KVM_MEM_GUEST_MEMFD memslots. + +Signed-off-by: James Houghton +--- + .../selftests/kvm/set_memory_region_test.c | 30 +++++++++++++++++++ + 1 file changed, 30 insertions(+) + +diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c +index 56231c02d88c..95d315b976df 100644 +--- a/tools/testing/selftests/kvm/set_memory_region_test.c ++++ b/tools/testing/selftests/kvm/set_memory_region_test.c +@@ -608,6 +608,35 @@ static void test_mmio_during_vectoring(void) + } + #endif + ++static void test_private_memory_region_userfault(void) ++{ ++ struct kvm_vm *vm; ++ int memfd; ++ ++ pr_info("Testing toggling KVM_MEM_USERFAULT on KVM_MEM_GUEST_MEMFD memory regions\n"); ++ ++ vm = vm_create_barebones_type(KVM_X86_SW_PROTECTED_VM); ++ ++ test_invalid_guest_memfd(vm, vm->kvm_fd, 0, "KVM fd should fail"); ++ test_invalid_guest_memfd(vm, vm->fd, 0, "VM's fd should fail"); ++ ++ memfd = vm_create_guest_memfd(vm, MEM_REGION_SIZE, 0); ++ ++ vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, ++ MEM_REGION_GPA, MEM_REGION_SIZE, 0, memfd, 0); ++ ++ vm_set_user_memory_region2(vm, MEM_REGION_SLOT, ++ KVM_MEM_GUEST_MEMFD | KVM_MEM_USERFAULT, ++ MEM_REGION_GPA, MEM_REGION_SIZE, 0, memfd, 0); ++ ++ vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, ++ MEM_REGION_GPA, MEM_REGION_SIZE, 0, memfd, 0); ++ ++ close(memfd); ++ ++ kvm_vm_free(vm); ++} ++ + int main(int argc, char *argv[]) + { + #ifdef __x86_64__ +@@ -633,6 +662,7 @@ int main(int argc, char *argv[]) + (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM))) { + test_add_private_memory_region(); + test_add_overlapping_private_memory_regions(); ++ test_private_memory_region_userfault(); + } else { + pr_info("Skipping tests for KVM_MEM_GUEST_MEMFD memory regions\n"); + } +-- 
+2.47.1 + diff --git a/resources/hiding_ci/patches/0025-KVM-Documentation-Add-KVM_CAP_USERFAULT-and-KVM_MEM_.patch b/resources/hiding_ci/patches/0025-KVM-Documentation-Add-KVM_CAP_USERFAULT-and-KVM_MEM_.patch new file mode 100644 index 00000000000..68aa5a42ad1 --- /dev/null +++ b/resources/hiding_ci/patches/0025-KVM-Documentation-Add-KVM_CAP_USERFAULT-and-KVM_MEM_.patch @@ -0,0 +1,76 @@ +From fb4c74191df7821bf047af099473dd8f20948b43 Mon Sep 17 00:00:00 2001 +From: James Houghton +Date: Thu, 9 Jan 2025 20:49:29 +0000 +Subject: [PATCH 25/34] KVM: Documentation: Add KVM_CAP_USERFAULT and + KVM_MEM_USERFAULT details + +Include the note about memory ordering when clearing bits in +userfault_bitmap, as it may not be obvious for users. + +Signed-off-by: James Houghton +Reviewed-by: Bagas Sanjaya +--- + Documentation/virt/kvm/api.rst | 33 ++++++++++++++++++++++++++++++++- + 1 file changed, 32 insertions(+), 1 deletion(-) + +diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst +index 2b52eb77e29c..3ec3d0bdb18a 100644 +--- a/Documentation/virt/kvm/api.rst ++++ b/Documentation/virt/kvm/api.rst +@@ -6287,7 +6287,8 @@ bounds checks apply (use common sense). + __u64 guest_memfd_offset; + __u32 guest_memfd; + __u32 pad1; +- __u64 pad2[14]; ++ __u64 userfault_bitmap; ++ __u64 pad2[13]; + }; + + A KVM_MEM_GUEST_MEMFD region _must_ have a valid guest_memfd (private memory) and +@@ -6303,6 +6304,25 @@ state. At VM creation time, all memory is shared, i.e. the PRIVATE attribute + is '0' for all gfns. Userspace can control whether memory is shared/private by + toggling KVM_MEMORY_ATTRIBUTE_PRIVATE via KVM_SET_MEMORY_ATTRIBUTES as needed. + ++When the KVM_MEM_USERFAULT flag is set, userfault_bitmap points to the starting ++address for the bitmap that controls if vCPU memory faults should immediately ++exit to userspace. If an invalid pointer is provided, at fault time, KVM_RUN ++will return -EFAULT. 
KVM_MEM_USERFAULT is only supported when ++KVM_CAP_USERFAULT is supported. ++ ++userfault_bitmap should point to an array of longs where each bit in the array ++linearly corresponds to a single gfn. Bit 0 in userfault_bitmap corresponds to ++guest_phys_addr, bit 1 corresponds to guest_phys_addr + PAGE_SIZE, etc. If the ++bit for a page is set, any vCPU access to that page will exit to userspace with ++KVM_MEMORY_EXIT_FLAG_USERFAULT. ++ ++Setting bits in userfault_bitmap has no effect on pages that have already been ++mapped by KVM until KVM_MEM_USERFAULT is disabled and re-enabled again. ++ ++Clearing bits in userfault_bitmap should usually be done with a store-release ++if changes to guest memory are being made available to the guest via ++userfault_bitmap. ++ + S390: + ^^^^^ + +@@ -8258,6 +8278,17 @@ KVM exits with the register state of either the L1 or L2 guest + depending on which executed at the time of an exit. Userspace must + take care to differentiate between these cases. + ++7.37 KVM_CAP_USERFAULT ++---------------------- ++ ++:Architectures: x86, arm64 ++:Returns: Informational only, -EINVAL on direct KVM_ENABLE_CAP. ++ ++The presence of this capability indicates that KVM_SET_USER_MEMORY_REGION2 will ++accept KVM_MEM_USERFAULT as a valid memslot flag. ++ ++See KVM_SET_USER_MEMORY_REGION2 for more details. ++ + 8. Other capabilities. 
+ ====================== + +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0026-KVM-guest_memfd-add-generic-population-via-write.patch b/resources/hiding_ci/patches/0026-KVM-guest_memfd-add-generic-population-via-write.patch new file mode 100644 index 00000000000..f7c1cce60f2 --- /dev/null +++ b/resources/hiding_ci/patches/0026-KVM-guest_memfd-add-generic-population-via-write.patch @@ -0,0 +1,141 @@ +From dda48e07b48f344c57d8ed90ddad7e01c26f7952 Mon Sep 17 00:00:00 2001 +From: Nikita Kalyazin +Date: Fri, 29 Nov 2024 11:51:02 +0000 +Subject: [PATCH 26/34] KVM: guest_memfd: add generic population via write + +write syscall populates guest_memfd with user-supplied data in a generic +way, ie no vendor-specific preparation is performed. This is supposed +to be used in non-CoCo setups where guest memory is not +hardware-encrypted. + +The following behaviour is implemented: + - only page-aligned count and offset are allowed + - if the memory is already allocated, the call will successfully + populate it + - if the memory is not allocated, the call will both allocate and + populate + - if the memory is already populated, the call will not repopulate it + +Signed-off-by: Nikita Kalyazin +--- + virt/kvm/guest_memfd.c | 94 ++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 91 insertions(+), 3 deletions(-) + +diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c +index 291d647a5c80..5abb6d52a375 100644 +--- a/virt/kvm/guest_memfd.c ++++ b/virt/kvm/guest_memfd.c +@@ -432,12 +432,97 @@ static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma) + + return 0; + } +-#else +-#define kvm_gmem_mmap NULL ++ ++static ssize_t kvm_kmem_gmem_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ pgoff_t start, end, index; ++ ssize_t ret = 0; ++ ++ if (!PAGE_ALIGNED(*offset) || !PAGE_ALIGNED(count)) ++ return -EINVAL; ++ ++ if (*offset + count > i_size_read(file_inode(file))) ++ return -EINVAL; ++ ++ if (!buf) ++ return 
-EINVAL; ++ ++ start = *offset >> PAGE_SHIFT; ++ end = (*offset + count) >> PAGE_SHIFT; ++ ++ filemap_invalidate_lock_shared(file->f_mapping); ++ ++ for (index = start; index < end; ) { ++ struct folio *folio; ++ void *vaddr; ++ pgoff_t buf_offset = (index - start) << PAGE_SHIFT; ++ ++ if (signal_pending(current)) { ++ ret = -EINTR; ++ goto out; ++ } ++ ++ folio = kvm_gmem_get_folio(file_inode(file), index); ++ if (IS_ERR(folio)) { ++ ret = -EFAULT; ++ goto out; ++ } ++ ++ if (folio_test_hwpoison(folio)) { ++ folio_unlock(folio); ++ folio_put(folio); ++ ret = -EFAULT; ++ goto out; ++ } ++ ++ /* No support for huge pages. */ ++ if (WARN_ON_ONCE(folio_test_large(folio))) { ++ folio_unlock(folio); ++ folio_put(folio); ++ ret = -EFAULT; ++ goto out; ++ } ++ ++ if (folio_test_uptodate(folio)) { ++ folio_unlock(folio); ++ folio_put(folio); ++ ret = -ENOSPC; ++ goto out; ++ } ++ ++ folio_unlock(folio); ++ ++ vaddr = kmap_local_folio(folio, 0); ++ ret = copy_from_user(vaddr, buf + buf_offset, PAGE_SIZE); ++ kunmap_local(vaddr); ++ if (ret) { ++ ret = -EINVAL; ++ folio_put(folio); ++ goto out; ++ } ++ ++ kvm_gmem_mark_prepared(folio); ++ folio_put(folio); ++ ++ index = folio_next_index(folio); ++ *offset += PAGE_SIZE; ++ } ++ ++out: ++ filemap_invalidate_unlock_shared(file->f_mapping); ++ ++ return ret && start == (*offset >> PAGE_SHIFT) ? 
++ ret : *offset - (start << PAGE_SHIFT); ++} + #endif /* CONFIG_KVM_GMEM_SHARED_MEM */ + + static struct file_operations kvm_gmem_fops = { +- .mmap = kvm_gmem_mmap, ++#ifdef CONFIG_KVM_GMEM_SHARED_MEM ++ .mmap = kvm_gmem_mmap, ++ .llseek = default_llseek, ++ .write = kvm_kmem_gmem_write, ++#endif /* CONFIG_KVM_GMEM_SHARED_MEM */ + .open = generic_file_open, + .release = kvm_gmem_release, + .fallocate = kvm_gmem_fallocate, +@@ -557,6 +642,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) + } + + file->f_flags |= O_LARGEFILE; ++#ifdef CONFIG_KVM_GMEM_SHARED_MEM ++ file->f_mode |= FMODE_LSEEK | FMODE_PWRITE; ++#endif /* CONFIG_KVM_GMEM_SHARED_MEM */ + + inode = file->f_inode; + WARN_ON(file->f_mapping != inode->i_mapping); +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0027-KVM-selftests-update-guest_memfd-write-tests.patch b/resources/hiding_ci/patches/0027-KVM-selftests-update-guest_memfd-write-tests.patch new file mode 100644 index 00000000000..6062f6fd982 --- /dev/null +++ b/resources/hiding_ci/patches/0027-KVM-selftests-update-guest_memfd-write-tests.patch @@ -0,0 +1,126 @@ +From 120635067e2d910bf96a53e3b7e2f2d5be19af7e Mon Sep 17 00:00:00 2001 +From: Nikita Kalyazin +Date: Fri, 29 Nov 2024 11:57:58 +0000 +Subject: [PATCH 27/34] KVM: selftests: update guest_memfd write tests + +This is to reflect that the write syscall is now implemented for +guest_memfd. 
+ +Signed-off-by: Nikita Kalyazin +--- + .../testing/selftests/kvm/guest_memfd_test.c | 85 +++++++++++++++++-- + 1 file changed, 79 insertions(+), 6 deletions(-) + +diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c +index 38c501e49e0e..b07221aa54c9 100644 +--- a/tools/testing/selftests/kvm/guest_memfd_test.c ++++ b/tools/testing/selftests/kvm/guest_memfd_test.c +@@ -20,18 +20,90 @@ + #include "kvm_util.h" + #include "test_util.h" + +-static void test_file_read_write(int fd) ++static void test_file_read(int fd) + { + char buf[64]; + + TEST_ASSERT(read(fd, buf, sizeof(buf)) < 0, + "read on a guest_mem fd should fail"); +- TEST_ASSERT(write(fd, buf, sizeof(buf)) < 0, +- "write on a guest_mem fd should fail"); + TEST_ASSERT(pread(fd, buf, sizeof(buf), 0) < 0, + "pread on a guest_mem fd should fail"); +- TEST_ASSERT(pwrite(fd, buf, sizeof(buf), 0) < 0, +- "pwrite on a guest_mem fd should fail"); ++} ++ ++static void test_file_write(int fd, size_t total_size) ++{ ++ size_t page_size = getpagesize(); ++ void *buf = NULL; ++ int ret; ++ ++ ret = posix_memalign(&buf, page_size, total_size); ++ TEST_ASSERT_EQ(ret, 0); ++ ++ /* Check arguments correctness checks work as expected */ ++ ++ ret = pwrite(fd, buf, page_size - 1, 0); ++ TEST_ASSERT(ret == -1, "write unaligned count on a guest_mem fd should fail"); ++ TEST_ASSERT_EQ(errno, EINVAL); ++ ++ ret = pwrite(fd, buf, page_size, 1); ++ TEST_ASSERT(ret == -1, "write unaligned offset on a guest_mem fd should fail"); ++ TEST_ASSERT_EQ(errno, EINVAL); ++ ++ ret = pwrite(fd, buf, page_size, total_size); ++ TEST_ASSERT(ret == -1, "writing past the file size on a guest_mem fd should fail"); ++ TEST_ASSERT_EQ(errno, EINVAL); ++ ++ ret = pwrite(fd, NULL, page_size, 0); ++ TEST_ASSERT(ret == -1, "supplying a NULL buffer when writing a guest_mem fd should fail"); ++ TEST_ASSERT_EQ(errno, EINVAL); ++ ++ /* Check double population is not allowed */ ++ ++ ret = pwrite(fd, buf, 
page_size, 0); ++ TEST_ASSERT(ret == page_size, "page-aligned write on a guest_mem fd should succeed"); ++ ++ ret = pwrite(fd, buf, page_size, 0); ++ TEST_ASSERT(ret == -1, "write on already populated guest_mem fd should fail"); ++ TEST_ASSERT_EQ(errno, ENOSPC); ++ ++ ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, page_size); ++ TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) should succeed"); ++ ++ /* Check population is allowed again after punching a hole */ ++ ++ ret = pwrite(fd, buf, page_size, 0); ++ TEST_ASSERT(ret == page_size, "page-aligned write on a punched guest_mem fd should succeed"); ++ ++ ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, page_size); ++ TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) should succeed"); ++ ++ /* Check population of already allocated memory is allowed */ ++ ++ ret = fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, page_size); ++ TEST_ASSERT(!ret, "fallocate with aligned offset and size should succeed"); ++ ++ ret = pwrite(fd, buf, page_size, 0); ++ TEST_ASSERT(ret == page_size, "write on a preallocated guest_mem fd should succeed"); ++ ++ ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, page_size); ++ TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) should succeed"); ++ ++ /* Check population works until an already populated page is encountered */ ++ ++ ret = pwrite(fd, buf, total_size, 0); ++ TEST_ASSERT(ret == total_size, "page-aligned write on a guest_mem fd should succeed"); ++ ++ ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, page_size); ++ TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) should succeed"); ++ ++ ret = pwrite(fd, buf, total_size, 0); ++ TEST_ASSERT(ret == page_size, "write on a guest_mem fd should not overwrite data"); ++ ++ ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, total_size); ++ TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) should succeed"); ++ ++ ++ free(buf); + } + + static void test_mmap_allowed(int fd, size_t total_size) +@@ -233,7 +305,8 
@@ void test_vm_type(unsigned long type, bool is_shared) + + fd = vm_create_guest_memfd(vm, total_size, 0); + +- test_file_read_write(fd); ++ test_file_read(fd); ++ test_file_write(fd, total_size); + + if (is_shared) + test_mmap_allowed(fd, total_size); +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0028-mm-userfaultfd-generic-continue-for-non-hugetlbfs.patch b/resources/hiding_ci/patches/0028-mm-userfaultfd-generic-continue-for-non-hugetlbfs.patch new file mode 100644 index 00000000000..e610f502a9b --- /dev/null +++ b/resources/hiding_ci/patches/0028-mm-userfaultfd-generic-continue-for-non-hugetlbfs.patch @@ -0,0 +1,153 @@ +From 38fb84551a238b98a622433157fb2537ecb5611e Mon Sep 17 00:00:00 2001 +From: Nikita Kalyazin +Date: Mon, 31 Mar 2025 10:15:35 +0000 +Subject: [PATCH 28/34] mm: userfaultfd: generic continue for non hugetlbfs + +Remove shmem-specific code from UFFDIO_CONTINUE implementation for +non-huge pages by calling vm_ops->fault(). A new VMF flag, +FAULT_FLAG_USERFAULT_CONTINUE, is introduced to avoid recursive call to +handle_userfault(). + +Suggested-by: James Houghton +Signed-off-by: Nikita Kalyazin +--- + include/linux/mm_types.h | 4 ++++ + mm/hugetlb.c | 2 +- + mm/shmem.c | 9 ++++++--- + mm/userfaultfd.c | 37 +++++++++++++++++++++++++++---------- + 4 files changed, 38 insertions(+), 14 deletions(-) + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 0234f14f2aa6..2f26ee9742bf 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -1429,6 +1429,9 @@ enum tlb_flush_reason { + * @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached. + * We should only access orig_pte if this flag set. + * @FAULT_FLAG_VMA_LOCK: The fault is handled under VMA lock. ++ * @FAULT_FLAG_USERFAULT_CONTINUE: The fault handler must not call userfaultfd ++ * minor handler as it is being called by the ++ * userfaultfd code itself. 
+ * + * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify + * whether we would allow page faults to retry by specifying these two +@@ -1467,6 +1470,7 @@ enum fault_flag { + FAULT_FLAG_UNSHARE = 1 << 10, + FAULT_FLAG_ORIG_PTE_VALID = 1 << 11, + FAULT_FLAG_VMA_LOCK = 1 << 12, ++ FAULT_FLAG_USERFAULT_CONTINUE = 1 << 13, + }; + + typedef unsigned int __bitwise zap_flags_t; +diff --git a/mm/hugetlb.c b/mm/hugetlb.c +index 97930d44d460..c004cfdcd4e2 100644 +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -6228,7 +6228,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, + } + + /* Check for page in userfault range. */ +- if (userfaultfd_minor(vma)) { ++ if (userfaultfd_minor(vma) && !(vmf->flags & FAULT_FLAG_USERFAULT_CONTINUE)) { + folio_unlock(folio); + folio_put(folio); + /* See comment in userfaultfd_missing() block above */ +diff --git a/mm/shmem.c b/mm/shmem.c +index 1ede0800e846..b4159303fe59 100644 +--- a/mm/shmem.c ++++ b/mm/shmem.c +@@ -2467,7 +2467,8 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, + fault_mm = vma ? vma->vm_mm : NULL; + + folio = filemap_get_entry(inode->i_mapping, index); +- if (folio && vma && userfaultfd_minor(vma)) { ++ if (folio && vma && userfaultfd_minor(vma) && ++ !(vmf->flags & FAULT_FLAG_USERFAULT_CONTINUE)) { + if (!xa_is_value(folio)) + folio_put(folio); + *fault_type = handle_userfault(vmf, VM_UFFD_MINOR); +@@ -2727,6 +2728,8 @@ static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode) + static vm_fault_t shmem_fault(struct vm_fault *vmf) + { + struct inode *inode = file_inode(vmf->vma->vm_file); ++ enum sgp_type sgp = vmf->flags & FAULT_FLAG_USERFAULT_CONTINUE ? 
++ SGP_NOALLOC : SGP_CACHE; + gfp_t gfp = mapping_gfp_mask(inode->i_mapping); + struct folio *folio = NULL; + vm_fault_t ret = 0; +@@ -2743,8 +2746,8 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) + } + + WARN_ON_ONCE(vmf->page != NULL); +- err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE, +- gfp, vmf, &ret); ++ err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, sgp, gfp, vmf, ++ &ret); + if (err) + return vmf_error(err); + if (folio) { +diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c +index d06453fa8aba..4b3dbc7dac64 100644 +--- a/mm/userfaultfd.c ++++ b/mm/userfaultfd.c +@@ -380,30 +380,47 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, + return ret; + } + +-/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */ ++/* Handles UFFDIO_CONTINUE for all VMAs */ + static int mfill_atomic_pte_continue(pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + uffd_flags_t flags) + { +- struct inode *inode = file_inode(dst_vma->vm_file); +- pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); + struct folio *folio; + struct page *page; + int ret; ++ struct vm_fault vmf = { ++ .vma = dst_vma, ++ .address = dst_addr, ++ .flags = FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE | ++ FAULT_FLAG_USERFAULT_CONTINUE, ++ .pte = NULL, ++ .page = NULL, ++ .pgoff = linear_page_index(dst_vma, dst_addr), ++ }; ++ ++ if (!dst_vma->vm_ops || !dst_vma->vm_ops->fault) ++ return -EINVAL; + +- ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC); +- /* Our caller expects us to return -EFAULT if we failed to find folio */ +- if (ret == -ENOENT) ++retry: ++ ret = dst_vma->vm_ops->fault(&vmf); ++ if (ret & VM_FAULT_ERROR) { + ret = -EFAULT; +- if (ret) + goto out; +- if (!folio) { +- ret = -EFAULT; ++ } ++ ++ if (ret & VM_FAULT_NOPAGE) { ++ ret = -EAGAIN; + goto out; + } + +- page = folio_file_page(folio, pgoff); ++ if (ret & VM_FAULT_RETRY) ++ goto retry; ++ ++ page = vmf.page; ++ folio = page_folio(page); ++ 
BUG_ON(!folio); ++ + if (PageHWPoison(page)) { + ret = -EIO; + goto out_release; +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0029-mm-provide-can_userfault-vma-operation.patch b/resources/hiding_ci/patches/0029-mm-provide-can_userfault-vma-operation.patch new file mode 100644 index 00000000000..0f939d066d1 --- /dev/null +++ b/resources/hiding_ci/patches/0029-mm-provide-can_userfault-vma-operation.patch @@ -0,0 +1,95 @@ +From bcfff7f58b747aac6f27a51ce54efe5eae4b02f9 Mon Sep 17 00:00:00 2001 +From: Nikita Kalyazin +Date: Fri, 4 Apr 2025 14:15:18 +0000 +Subject: [PATCH 29/34] mm: provide can_userfault vma operation + +The new operation allows to decouple the userfaulfd code from +dependencies to VMA types, specifically, shmem and hugetlb. The +vm_flags bitmap argument is processed with "any" logic, meaning if the +VMA type supports any of the flags set, it returns true. This is to +avoid multiple calls when checking for __VM_UFFD_FLAGS. + +Signed-off-by: Nikita Kalyazin +--- + include/linux/mm.h | 5 +++++ + mm/hugetlb.c | 7 +++++++ + mm/shmem.c | 8 ++++++++ + 3 files changed, 20 insertions(+) + +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 8483e09aeb2c..488d721d8542 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -680,6 +680,11 @@ struct vm_operations_struct { + */ + struct page *(*find_special_page)(struct vm_area_struct *vma, + unsigned long addr); ++ /* ++ * True if the VMA supports userfault at least for one of the vm_flags ++ */ ++ bool (*can_userfault)(struct vm_area_struct *vma, ++ unsigned long vm_flags); + }; + + #ifdef CONFIG_NUMA_BALANCING +diff --git a/mm/hugetlb.c b/mm/hugetlb.c +index c004cfdcd4e2..f3901c11e1fd 100644 +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -5143,6 +5143,12 @@ static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) + return huge_page_size(hstate_vma(vma)); + } + ++static bool hugetlb_vm_op_can_userfault(struct vm_area_struct *vma, ++ unsigned long vm_flags) ++{ ++ return true; 
++} ++ + /* + * We cannot handle pagefaults against hugetlb pages at all. They cause + * handle_mm_fault() to try to instantiate regular-sized pages in the +@@ -5168,6 +5174,7 @@ const struct vm_operations_struct hugetlb_vm_ops = { + .close = hugetlb_vm_op_close, + .may_split = hugetlb_vm_op_split, + .pagesize = hugetlb_vm_op_pagesize, ++ .can_userfault = hugetlb_vm_op_can_userfault, + }; + + static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, +diff --git a/mm/shmem.c b/mm/shmem.c +index b4159303fe59..0b9e19abd1e9 100644 +--- a/mm/shmem.c ++++ b/mm/shmem.c +@@ -2891,6 +2891,12 @@ static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, + return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); + } + ++static bool shmem_can_userfault(struct vm_area_struct *vma, ++ unsigned long vm_flags) ++{ ++ return true; ++} ++ + static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, + pgoff_t index, unsigned int order, pgoff_t *ilx) + { +@@ -5309,6 +5315,7 @@ static const struct vm_operations_struct shmem_vm_ops = { + .set_policy = shmem_set_policy, + .get_policy = shmem_get_policy, + #endif ++ .can_userfault = shmem_can_userfault, + }; + + static const struct vm_operations_struct shmem_anon_vm_ops = { +@@ -5318,6 +5325,7 @@ static const struct vm_operations_struct shmem_anon_vm_ops = { + .set_policy = shmem_set_policy, + .get_policy = shmem_get_policy, + #endif ++ .can_userfault = shmem_can_userfault, + }; + + int shmem_init_fs_context(struct fs_context *fc) +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0030-mm-userfaultfd-use-can_userfault-vma-operation.patch b/resources/hiding_ci/patches/0030-mm-userfaultfd-use-can_userfault-vma-operation.patch new file mode 100644 index 00000000000..3344989cb31 --- /dev/null +++ b/resources/hiding_ci/patches/0030-mm-userfaultfd-use-can_userfault-vma-operation.patch @@ -0,0 +1,79 @@ +From 2c19b37bc94ef338ec540424a9a1eee95ffbdc3c Mon Sep 17 00:00:00 2001 +From: Nikita 
Kalyazin +Date: Fri, 4 Apr 2025 14:16:49 +0000 +Subject: [PATCH 30/34] mm: userfaultfd: use can_userfault vma operation + +Signed-off-by: Nikita Kalyazin +--- + include/linux/userfaultfd_k.h | 13 ++++++------- + mm/userfaultfd.c | 10 +++++++--- + 2 files changed, 13 insertions(+), 10 deletions(-) + +diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h +index 75342022d144..64551e8a55fb 100644 +--- a/include/linux/userfaultfd_k.h ++++ b/include/linux/userfaultfd_k.h +@@ -221,8 +221,8 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, + if (vm_flags & VM_DROPPABLE) + return false; + +- if ((vm_flags & VM_UFFD_MINOR) && +- (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) ++ if (!vma->vm_ops->can_userfault || ++ !vma->vm_ops->can_userfault(vma, VM_UFFD_MINOR)) + return false; + + /* +@@ -235,16 +235,15 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, + #ifndef CONFIG_PTE_MARKER_UFFD_WP + /* + * If user requested uffd-wp but not enabled pte markers for +- * uffd-wp, then shmem & hugetlbfs are not supported but only +- * anonymous. ++ * uffd-wp, then only anonymous is supported. 
+ */ + if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma)) + return false; + #endif + +- /* By default, allow any of anon|shmem|hugetlb */ +- return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || +- vma_is_shmem(vma); ++ return vma_is_anonymous(vma) || ++ (vma->vm_ops->can_userfault && ++ vma->vm_ops->can_userfault(vma, vm_flags)); + } + + static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma) +diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c +index 4b3dbc7dac64..0aa82c968e16 100644 +--- a/mm/userfaultfd.c ++++ b/mm/userfaultfd.c +@@ -728,6 +728,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, + unsigned long src_addr, dst_addr; + long copied; + struct folio *folio; ++ bool can_userfault; + + /* + * Sanitize the command parameters: +@@ -787,10 +788,13 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, + return mfill_atomic_hugetlb(ctx, dst_vma, dst_start, + src_start, len, flags); + +- if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) ++ can_userfault = dst_vma->vm_ops->can_userfault && ++ dst_vma->vm_ops->can_userfault(dst_vma, __VM_UFFD_FLAGS); ++ ++ if (!vma_is_anonymous(dst_vma) && !can_userfault) + goto out_unlock; +- if (!vma_is_shmem(dst_vma) && +- uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) ++ ++ if (!can_userfault && uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) + goto out_unlock; + + while (src_addr < src_start + len) { +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0031-KVM-guest_memfd-add-support-for-userfaultfd-minor.patch b/resources/hiding_ci/patches/0031-KVM-guest_memfd-add-support-for-userfaultfd-minor.patch new file mode 100644 index 00000000000..4e544677625 --- /dev/null +++ b/resources/hiding_ci/patches/0031-KVM-guest_memfd-add-support-for-userfaultfd-minor.patch @@ -0,0 +1,44 @@ +From 140a906e90e2ba2092148d80e0764e54802c947c Mon Sep 17 00:00:00 2001 +From: Nikita Kalyazin +Date: Tue, 1 Apr 2025 15:02:56 +0000 +Subject: [PATCH 31/34] 
KVM: guest_memfd: add support for userfaultfd minor + +Add support for sending a pagefault event if userfaultfd is registered. +Only page minor event is currently supported. + +Signed-off-by: Nikita Kalyazin +--- + virt/kvm/guest_memfd.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c +index 5abb6d52a375..91ee5dd91c31 100644 +--- a/virt/kvm/guest_memfd.c ++++ b/virt/kvm/guest_memfd.c +@@ -5,6 +5,9 @@ + #include + #include + #include ++#ifdef CONFIG_KVM_PRIVATE_MEM ++#include ++#endif /* CONFIG_KVM_PRIVATE_MEM */ + + #include "kvm_mm.h" + +@@ -396,6 +399,13 @@ static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf) + kvm_gmem_mark_prepared(folio); + } + ++ if (userfaultfd_minor(vmf->vma) && ++ !(vmf->flags & FAULT_FLAG_USERFAULT_CONTINUE)) { ++ folio_unlock(folio); ++ filemap_invalidate_unlock_shared(inode->i_mapping); ++ return handle_userfault(vmf, VM_UFFD_MINOR); ++ } ++ + vmf->page = folio_file_page(folio, vmf->pgoff); + + out_folio: +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0032-mm-userfaultfd-add-UFFD_FEATURE_MINOR_GUEST_MEMFD.patch b/resources/hiding_ci/patches/0032-mm-userfaultfd-add-UFFD_FEATURE_MINOR_GUEST_MEMFD.patch new file mode 100644 index 00000000000..3700d496c49 --- /dev/null +++ b/resources/hiding_ci/patches/0032-mm-userfaultfd-add-UFFD_FEATURE_MINOR_GUEST_MEMFD.patch @@ -0,0 +1,61 @@ +From 27e27a59e6139f780439f13cf7180f06c5b0d518 Mon Sep 17 00:00:00 2001 +From: Nikita Kalyazin +Date: Fri, 4 Apr 2025 14:18:03 +0000 +Subject: [PATCH 32/34] mm: userfaultfd: add UFFD_FEATURE_MINOR_GUEST_MEMFD + +Signed-off-by: Nikita Kalyazin +--- + fs/userfaultfd.c | 3 ++- + include/uapi/linux/userfaultfd.h | 8 +++++++- + 2 files changed, 9 insertions(+), 2 deletions(-) + +diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c +index 97c4d71115d8..32152bfa462a 100644 +--- a/fs/userfaultfd.c ++++ b/fs/userfaultfd.c +@@ -1954,7 +1954,8 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, + 
uffdio_api.features = UFFD_API_FEATURES; + #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR + uffdio_api.features &= +- ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM); ++ ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM | ++ UFFD_FEATURE_MINOR_GUEST_MEMFD); + #endif + #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP + uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP; +diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h +index 2841e4ea8f2c..ed688797eba7 100644 +--- a/include/uapi/linux/userfaultfd.h ++++ b/include/uapi/linux/userfaultfd.h +@@ -42,7 +42,8 @@ + UFFD_FEATURE_WP_UNPOPULATED | \ + UFFD_FEATURE_POISON | \ + UFFD_FEATURE_WP_ASYNC | \ +- UFFD_FEATURE_MOVE) ++ UFFD_FEATURE_MOVE | \ ++ UFFD_FEATURE_MINOR_GUEST_MEMFD) + #define UFFD_API_IOCTLS \ + ((__u64)1 << _UFFDIO_REGISTER | \ + (__u64)1 << _UFFDIO_UNREGISTER | \ +@@ -230,6 +231,10 @@ struct uffdio_api { + * + * UFFD_FEATURE_MOVE indicates that the kernel supports moving an + * existing page contents from userspace. ++ * ++ * UFFD_FEATURE_MINOR_GUEST_MEMFD indicates the same support as ++ * UFFD_FEATURE_MINOR_HUGETLBFS, but for guest_memfd-backed pages ++ * instead. 
+ */ + #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) + #define UFFD_FEATURE_EVENT_FORK (1<<1) +@@ -248,6 +253,7 @@ struct uffdio_api { + #define UFFD_FEATURE_POISON (1<<14) + #define UFFD_FEATURE_WP_ASYNC (1<<15) + #define UFFD_FEATURE_MOVE (1<<16) ++#define UFFD_FEATURE_MINOR_GUEST_MEMFD (1<<17) + __u64 features; + + __u64 ioctls; +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0033-KVM-selftests-test-userfaultfd-minor-for-guest_memfd.patch b/resources/hiding_ci/patches/0033-KVM-selftests-test-userfaultfd-minor-for-guest_memfd.patch new file mode 100644 index 00000000000..bfade8ef68a --- /dev/null +++ b/resources/hiding_ci/patches/0033-KVM-selftests-test-userfaultfd-minor-for-guest_memfd.patch @@ -0,0 +1,146 @@ +From 4647a42c34b3896fd872f8b2991b55827d084a38 Mon Sep 17 00:00:00 2001 +From: Nikita Kalyazin +Date: Fri, 28 Feb 2025 16:17:41 +0000 +Subject: [PATCH 33/34] KVM: selftests: test userfaultfd minor for guest_memfd + +The test demonstrates that a minor userfaultfd event in guest_memfd can +be resolved via a memcpy followed by a UFFDIO_CONTINUE ioctl. 
+ +Signed-off-by: Nikita Kalyazin +--- + .../testing/selftests/kvm/guest_memfd_test.c | 99 +++++++++++++++++++ + 1 file changed, 99 insertions(+) + +diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c +index b07221aa54c9..c9578f0ce46f 100644 +--- a/tools/testing/selftests/kvm/guest_memfd_test.c ++++ b/tools/testing/selftests/kvm/guest_memfd_test.c +@@ -10,12 +10,16 @@ + #include + #include + #include ++#include + + #include + #include ++#include + #include + #include + #include ++#include ++#include + + #include "kvm_util.h" + #include "test_util.h" +@@ -278,6 +282,98 @@ static void test_create_guest_memfd_multiple(struct kvm_vm *vm) + close(fd1); + } + ++struct fault_args { ++ char *addr; ++ volatile char value; ++}; ++ ++static void *fault_thread_fn(void *arg) ++{ ++ struct fault_args *args = arg; ++ ++ /* Trigger page fault */ ++ args->value = *args->addr; ++ return NULL; ++} ++ ++static void test_uffd_minor(int fd, size_t page_size, size_t total_size) ++{ ++ struct uffdio_register uffd_reg; ++ struct uffdio_continue uffd_cont; ++ struct uffd_msg msg; ++ struct fault_args args; ++ pthread_t fault_thread; ++ void *mem, *mem_nofault, *buf = NULL; ++ int uffd, ret; ++ off_t offset = page_size; ++ void *fault_addr; ++ ++ ret = posix_memalign(&buf, page_size, total_size); ++ TEST_ASSERT_EQ(ret, 0); ++ ++ uffd = syscall(__NR_userfaultfd, O_CLOEXEC); ++ TEST_ASSERT(uffd != -1, "userfaultfd creation should succeed"); ++ ++ struct uffdio_api uffdio_api = { ++ .api = UFFD_API, ++ .features = UFFD_FEATURE_MINOR_GUEST_MEMFD, ++ }; ++ ret = ioctl(uffd, UFFDIO_API, &uffdio_api); ++ TEST_ASSERT(ret != -1, "ioctl(UFFDIO_API) should succeed"); ++ ++ mem = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); ++ TEST_ASSERT(mem != MAP_FAILED, "mmap should succeed"); ++ ++ mem_nofault = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); ++ TEST_ASSERT(mem_nofault != MAP_FAILED, "mmap should 
succeed"); ++ ++ uffd_reg.range.start = (unsigned long)mem; ++ uffd_reg.range.len = total_size; ++ uffd_reg.mode = UFFDIO_REGISTER_MODE_MINOR; ++ ret = ioctl(uffd, UFFDIO_REGISTER, &uffd_reg); ++ TEST_ASSERT(ret != -1, "ioctl(UFFDIO_REGISTER) should succeed"); ++ ++ ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, ++ offset, page_size); ++ TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) should succeed"); ++ ++ fault_addr = mem + offset; ++ args.addr = fault_addr; ++ ++ ret = pthread_create(&fault_thread, NULL, fault_thread_fn, &args); ++ TEST_ASSERT(ret == 0, "pthread_create should succeed"); ++ ++ ret = read(uffd, &msg, sizeof(msg)); ++ TEST_ASSERT(ret != -1, "read from userfaultfd should succeed"); ++ TEST_ASSERT(msg.event == UFFD_EVENT_PAGEFAULT, "event type should be pagefault"); ++ TEST_ASSERT((void *)(msg.arg.pagefault.address & ~(page_size - 1)) == fault_addr, ++ "pagefault should occur at expected address"); ++ ++ memcpy(mem_nofault + offset, buf + offset, page_size); ++ ++ uffd_cont.range.start = (unsigned long)fault_addr; ++ uffd_cont.range.len = page_size; ++ uffd_cont.mode = 0; ++ ret = ioctl(uffd, UFFDIO_CONTINUE, &uffd_cont); ++ TEST_ASSERT(ret != -1, "ioctl(UFFDIO_CONTINUE) should succeed"); ++ ++ TEST_ASSERT(args.value == *(char *)(mem_nofault + offset), ++ "memory should contain the value that was copied"); ++ TEST_ASSERT(args.value == *(char *)(mem + offset), ++ "no further fault is expected"); ++ ++ ret = pthread_join(fault_thread, NULL); ++ TEST_ASSERT(ret == 0, "pthread_join should succeed"); ++ ++ ret = munmap(mem_nofault, total_size); ++ TEST_ASSERT(!ret, "munmap should succeed"); ++ ++ ret = munmap(mem, total_size); ++ TEST_ASSERT(!ret, "munmap should succeed"); ++ free(buf); ++ close(uffd); ++} ++ + unsigned long get_shared_type(void) + { + #ifdef __x86_64__ +@@ -317,6 +413,9 @@ void test_vm_type(unsigned long type, bool is_shared) + test_fallocate(fd, page_size, total_size); + test_invalid_punch_hole(fd, page_size, total_size); + 
++ if (is_shared) ++ test_uffd_minor(fd, page_size, total_size); ++ + close(fd); + kvm_vm_release(vm); + } +-- +2.47.1 + diff --git a/resources/hiding_ci/patches/0034-uffd-v3-fixup.patch b/resources/hiding_ci/patches/0034-uffd-v3-fixup.patch new file mode 100644 index 00000000000..41df6ff2428 --- /dev/null +++ b/resources/hiding_ci/patches/0034-uffd-v3-fixup.patch @@ -0,0 +1,50 @@ +From a03bf9042094cc0fd1b2a71307b6e0b02e8500d8 Mon Sep 17 00:00:00 2001 +From: Nikita Kalyazin +Date: Thu, 10 Apr 2025 14:18:53 +0000 +Subject: [PATCH 34/34] uffd v3 fixup + + - implement can_userfault for guest_memfd + - check vma->vm_ops pointer before dereferencing +--- + include/linux/userfaultfd_k.h | 3 ++- + virt/kvm/guest_memfd.c | 9 ++++++++- + 2 files changed, 10 insertions(+), 2 deletions(-) + +diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h +index 64551e8a55fb..92fb5372bea5 100644 +--- a/include/linux/userfaultfd_k.h ++++ b/include/linux/userfaultfd_k.h +@@ -221,7 +221,8 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, + if (vm_flags & VM_DROPPABLE) + return false; + +- if (!vma->vm_ops->can_userfault || ++ if (!vma->vm_ops || ++ !vma->vm_ops->can_userfault || + !vma->vm_ops->can_userfault(vma, VM_UFFD_MINOR)) + return false; + +diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c +index 91ee5dd91c31..202b12dc4b6f 100644 +--- a/virt/kvm/guest_memfd.c ++++ b/virt/kvm/guest_memfd.c +@@ -420,8 +420,15 @@ static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf) + return ret; + } + ++static bool kvm_gmem_can_userfault(struct vm_area_struct *vma, ++ unsigned long vm_flags) ++{ ++ return vm_flags & VM_UFFD_MINOR; ++} ++ + static const struct vm_operations_struct kvm_gmem_vm_ops = { +- .fault = kvm_gmem_fault, ++ .fault = kvm_gmem_fault, ++ .can_userfault = kvm_gmem_can_userfault, + }; + + static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma) +-- +2.47.1 + From 51593f3b3fdcc8cb61eb11835d95d8e7802e24c4 Mon Sep 17 
00:00:00 2001 From: Nikita Kalyazin Date: Wed, 23 Apr 2025 07:10:02 +0000 Subject: [PATCH 36/40] chore(hiding_ci): rename patches dir to linux_patches This is to keep Linux patches separate in case we need to store some other patches at some point. Signed-off-by: Nikita Kalyazin --- resources/hiding_ci/build_and_install_kernel.sh | 2 +- ...1-mm-Consolidate-freeing-of-typed-folios-on-final-foli.patch | 0 ...2-KVM-guest_memfd-Handle-final-folio_put-of-guest_memf.patch | 0 ...03-KVM-guest_memfd-Allow-host-to-map-guest_memfd-pages.patch | 0 ...4-KVM-x86-Mark-KVM_X86_SW_PROTECTED_VM-as-supporting-g.patch | 0 ...5-KVM-arm64-Refactor-user_mem_abort-calculation-of-for.patch | 0 ...6-KVM-guest_memfd-Handle-in-place-shared-memory-as-gue.patch | 0 ...7-KVM-arm64-Handle-guest_memfd-backed-guest-page-fault.patch | 0 ...8-KVM-guest_memfd-selftests-guest_memfd-mmap-test-when.patch | 0 .../0009-KVM-arm64-Enable-mapping-guest_memfd-in-arm64.patch | 0 .../0010-mm-introduce-AS_NO_DIRECT_MAP.patch | 0 ...011-KVM-guest_memfd-Add-flag-to-remove-from-direct-map.patch | 0 .../{patches => linux_patches}/0012-patrick-v4-fixup.patch | 0 ...0013-KVM-Add-KVM_MEM_USERFAULT-memslot-flag-and-bitmap.patch | 0 .../0014-KVM-Add-KVM_MEMORY_EXIT_FLAG_USERFAULT.patch | 0 ...5-KVM-Allow-late-setting-of-KVM_MEM_USERFAULT-on-guest.patch | 0 .../0016-KVM-x86-mmu-Add-support-for-KVM_MEM_USERFAULT.patch | 0 ...7-KVM-Advertise-KVM_CAP_USERFAULT-in-KVM_CHECK_EXTENSI.patch | 0 .../0018-KVM-arm64-Add-support-for-KVM_MEM_USERFAULT.patch | 0 ...19-KVM-selftests-Fix-vm_mem_region_set_flags-docstring.patch | 0 .../0020-KVM-selftests-Fix-prefault_mem-logic.patch | 0 .../0021-KVM-selftests-Add-va_start-end-into-uffd_desc.patch | 0 ...2-KVM-selftests-Inform-set_memory_region_test-of-KVM_M.patch | 0 ...3-KVM-selftests-Add-KVM-Userfault-mode-to-demand_pagin.patch | 0 ...4-KVM-selftests-Add-KVM_MEM_USERFAULT-guest_memfd-togg.patch | 0 ...5-KVM-Documentation-Add-KVM_CAP_USERFAULT-and-KVM_MEM_.patch | 0 
.../0026-KVM-guest_memfd-add-generic-population-via-write.patch | 0 .../0027-KVM-selftests-update-guest_memfd-write-tests.patch | 0 ...0028-mm-userfaultfd-generic-continue-for-non-hugetlbfs.patch | 0 .../0029-mm-provide-can_userfault-vma-operation.patch | 0 .../0030-mm-userfaultfd-use-can_userfault-vma-operation.patch | 0 ...0031-KVM-guest_memfd-add-support-for-userfaultfd-minor.patch | 0 ...0032-mm-userfaultfd-add-UFFD_FEATURE_MINOR_GUEST_MEMFD.patch | 0 ...3-KVM-selftests-test-userfaultfd-minor-for-guest_memfd.patch | 0 .../{patches => linux_patches}/0034-uffd-v3-fixup.patch | 0 resources/hiding_ci/{patches => linux_patches}/GPL-2.0 | 0 resources/hiding_ci/{patches => linux_patches}/README.md | 0 37 files changed, 1 insertion(+), 1 deletion(-) rename resources/hiding_ci/{patches => linux_patches}/0001-mm-Consolidate-freeing-of-typed-folios-on-final-foli.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0002-KVM-guest_memfd-Handle-final-folio_put-of-guest_memf.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0003-KVM-guest_memfd-Allow-host-to-map-guest_memfd-pages.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0004-KVM-x86-Mark-KVM_X86_SW_PROTECTED_VM-as-supporting-g.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0005-KVM-arm64-Refactor-user_mem_abort-calculation-of-for.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0006-KVM-guest_memfd-Handle-in-place-shared-memory-as-gue.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0007-KVM-arm64-Handle-guest_memfd-backed-guest-page-fault.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0008-KVM-guest_memfd-selftests-guest_memfd-mmap-test-when.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0009-KVM-arm64-Enable-mapping-guest_memfd-in-arm64.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0010-mm-introduce-AS_NO_DIRECT_MAP.patch (100%) rename 
resources/hiding_ci/{patches => linux_patches}/0011-KVM-guest_memfd-Add-flag-to-remove-from-direct-map.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0012-patrick-v4-fixup.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0013-KVM-Add-KVM_MEM_USERFAULT-memslot-flag-and-bitmap.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0014-KVM-Add-KVM_MEMORY_EXIT_FLAG_USERFAULT.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0015-KVM-Allow-late-setting-of-KVM_MEM_USERFAULT-on-guest.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0016-KVM-x86-mmu-Add-support-for-KVM_MEM_USERFAULT.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0017-KVM-Advertise-KVM_CAP_USERFAULT-in-KVM_CHECK_EXTENSI.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0018-KVM-arm64-Add-support-for-KVM_MEM_USERFAULT.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0019-KVM-selftests-Fix-vm_mem_region_set_flags-docstring.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0020-KVM-selftests-Fix-prefault_mem-logic.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0021-KVM-selftests-Add-va_start-end-into-uffd_desc.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0022-KVM-selftests-Inform-set_memory_region_test-of-KVM_M.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0023-KVM-selftests-Add-KVM-Userfault-mode-to-demand_pagin.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0024-KVM-selftests-Add-KVM_MEM_USERFAULT-guest_memfd-togg.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0025-KVM-Documentation-Add-KVM_CAP_USERFAULT-and-KVM_MEM_.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0026-KVM-guest_memfd-add-generic-population-via-write.patch (100%) rename resources/hiding_ci/{patches => 
linux_patches}/0027-KVM-selftests-update-guest_memfd-write-tests.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0028-mm-userfaultfd-generic-continue-for-non-hugetlbfs.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0029-mm-provide-can_userfault-vma-operation.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0030-mm-userfaultfd-use-can_userfault-vma-operation.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0031-KVM-guest_memfd-add-support-for-userfaultfd-minor.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0032-mm-userfaultfd-add-UFFD_FEATURE_MINOR_GUEST_MEMFD.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0033-KVM-selftests-test-userfaultfd-minor-for-guest_memfd.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/0034-uffd-v3-fixup.patch (100%) rename resources/hiding_ci/{patches => linux_patches}/GPL-2.0 (100%) rename resources/hiding_ci/{patches => linux_patches}/README.md (100%) diff --git a/resources/hiding_ci/build_and_install_kernel.sh b/resources/hiding_ci/build_and_install_kernel.sh index 68c0cca872b..74bbb979906 100755 --- a/resources/hiding_ci/build_and_install_kernel.sh +++ b/resources/hiding_ci/build_and_install_kernel.sh @@ -143,7 +143,7 @@ update_boot_config() { KERNEL_URL=$(cat kernel_url) KERNEL_COMMIT_HASH=$(cat kernel_commit_hash) -KERNEL_PATCHES_DIR=$(pwd)/patches +KERNEL_PATCHES_DIR=$(pwd)/linux_patches KERNEL_CONFIG_OVERRIDES=$(pwd)/kernel_config_overrides TMP_BUILD_DIR=$(mktemp -d -t kernel-build-XXXX) diff --git a/resources/hiding_ci/patches/0001-mm-Consolidate-freeing-of-typed-folios-on-final-foli.patch b/resources/hiding_ci/linux_patches/0001-mm-Consolidate-freeing-of-typed-folios-on-final-foli.patch similarity index 100% rename from resources/hiding_ci/patches/0001-mm-Consolidate-freeing-of-typed-folios-on-final-foli.patch rename to 
resources/hiding_ci/linux_patches/0001-mm-Consolidate-freeing-of-typed-folios-on-final-foli.patch diff --git a/resources/hiding_ci/patches/0002-KVM-guest_memfd-Handle-final-folio_put-of-guest_memf.patch b/resources/hiding_ci/linux_patches/0002-KVM-guest_memfd-Handle-final-folio_put-of-guest_memf.patch similarity index 100% rename from resources/hiding_ci/patches/0002-KVM-guest_memfd-Handle-final-folio_put-of-guest_memf.patch rename to resources/hiding_ci/linux_patches/0002-KVM-guest_memfd-Handle-final-folio_put-of-guest_memf.patch diff --git a/resources/hiding_ci/patches/0003-KVM-guest_memfd-Allow-host-to-map-guest_memfd-pages.patch b/resources/hiding_ci/linux_patches/0003-KVM-guest_memfd-Allow-host-to-map-guest_memfd-pages.patch similarity index 100% rename from resources/hiding_ci/patches/0003-KVM-guest_memfd-Allow-host-to-map-guest_memfd-pages.patch rename to resources/hiding_ci/linux_patches/0003-KVM-guest_memfd-Allow-host-to-map-guest_memfd-pages.patch diff --git a/resources/hiding_ci/patches/0004-KVM-x86-Mark-KVM_X86_SW_PROTECTED_VM-as-supporting-g.patch b/resources/hiding_ci/linux_patches/0004-KVM-x86-Mark-KVM_X86_SW_PROTECTED_VM-as-supporting-g.patch similarity index 100% rename from resources/hiding_ci/patches/0004-KVM-x86-Mark-KVM_X86_SW_PROTECTED_VM-as-supporting-g.patch rename to resources/hiding_ci/linux_patches/0004-KVM-x86-Mark-KVM_X86_SW_PROTECTED_VM-as-supporting-g.patch diff --git a/resources/hiding_ci/patches/0005-KVM-arm64-Refactor-user_mem_abort-calculation-of-for.patch b/resources/hiding_ci/linux_patches/0005-KVM-arm64-Refactor-user_mem_abort-calculation-of-for.patch similarity index 100% rename from resources/hiding_ci/patches/0005-KVM-arm64-Refactor-user_mem_abort-calculation-of-for.patch rename to resources/hiding_ci/linux_patches/0005-KVM-arm64-Refactor-user_mem_abort-calculation-of-for.patch diff --git a/resources/hiding_ci/patches/0006-KVM-guest_memfd-Handle-in-place-shared-memory-as-gue.patch 
b/resources/hiding_ci/linux_patches/0006-KVM-guest_memfd-Handle-in-place-shared-memory-as-gue.patch similarity index 100% rename from resources/hiding_ci/patches/0006-KVM-guest_memfd-Handle-in-place-shared-memory-as-gue.patch rename to resources/hiding_ci/linux_patches/0006-KVM-guest_memfd-Handle-in-place-shared-memory-as-gue.patch diff --git a/resources/hiding_ci/patches/0007-KVM-arm64-Handle-guest_memfd-backed-guest-page-fault.patch b/resources/hiding_ci/linux_patches/0007-KVM-arm64-Handle-guest_memfd-backed-guest-page-fault.patch similarity index 100% rename from resources/hiding_ci/patches/0007-KVM-arm64-Handle-guest_memfd-backed-guest-page-fault.patch rename to resources/hiding_ci/linux_patches/0007-KVM-arm64-Handle-guest_memfd-backed-guest-page-fault.patch diff --git a/resources/hiding_ci/patches/0008-KVM-guest_memfd-selftests-guest_memfd-mmap-test-when.patch b/resources/hiding_ci/linux_patches/0008-KVM-guest_memfd-selftests-guest_memfd-mmap-test-when.patch similarity index 100% rename from resources/hiding_ci/patches/0008-KVM-guest_memfd-selftests-guest_memfd-mmap-test-when.patch rename to resources/hiding_ci/linux_patches/0008-KVM-guest_memfd-selftests-guest_memfd-mmap-test-when.patch diff --git a/resources/hiding_ci/patches/0009-KVM-arm64-Enable-mapping-guest_memfd-in-arm64.patch b/resources/hiding_ci/linux_patches/0009-KVM-arm64-Enable-mapping-guest_memfd-in-arm64.patch similarity index 100% rename from resources/hiding_ci/patches/0009-KVM-arm64-Enable-mapping-guest_memfd-in-arm64.patch rename to resources/hiding_ci/linux_patches/0009-KVM-arm64-Enable-mapping-guest_memfd-in-arm64.patch diff --git a/resources/hiding_ci/patches/0010-mm-introduce-AS_NO_DIRECT_MAP.patch b/resources/hiding_ci/linux_patches/0010-mm-introduce-AS_NO_DIRECT_MAP.patch similarity index 100% rename from resources/hiding_ci/patches/0010-mm-introduce-AS_NO_DIRECT_MAP.patch rename to resources/hiding_ci/linux_patches/0010-mm-introduce-AS_NO_DIRECT_MAP.patch diff --git 
a/resources/hiding_ci/patches/0011-KVM-guest_memfd-Add-flag-to-remove-from-direct-map.patch b/resources/hiding_ci/linux_patches/0011-KVM-guest_memfd-Add-flag-to-remove-from-direct-map.patch similarity index 100% rename from resources/hiding_ci/patches/0011-KVM-guest_memfd-Add-flag-to-remove-from-direct-map.patch rename to resources/hiding_ci/linux_patches/0011-KVM-guest_memfd-Add-flag-to-remove-from-direct-map.patch diff --git a/resources/hiding_ci/patches/0012-patrick-v4-fixup.patch b/resources/hiding_ci/linux_patches/0012-patrick-v4-fixup.patch similarity index 100% rename from resources/hiding_ci/patches/0012-patrick-v4-fixup.patch rename to resources/hiding_ci/linux_patches/0012-patrick-v4-fixup.patch diff --git a/resources/hiding_ci/patches/0013-KVM-Add-KVM_MEM_USERFAULT-memslot-flag-and-bitmap.patch b/resources/hiding_ci/linux_patches/0013-KVM-Add-KVM_MEM_USERFAULT-memslot-flag-and-bitmap.patch similarity index 100% rename from resources/hiding_ci/patches/0013-KVM-Add-KVM_MEM_USERFAULT-memslot-flag-and-bitmap.patch rename to resources/hiding_ci/linux_patches/0013-KVM-Add-KVM_MEM_USERFAULT-memslot-flag-and-bitmap.patch diff --git a/resources/hiding_ci/patches/0014-KVM-Add-KVM_MEMORY_EXIT_FLAG_USERFAULT.patch b/resources/hiding_ci/linux_patches/0014-KVM-Add-KVM_MEMORY_EXIT_FLAG_USERFAULT.patch similarity index 100% rename from resources/hiding_ci/patches/0014-KVM-Add-KVM_MEMORY_EXIT_FLAG_USERFAULT.patch rename to resources/hiding_ci/linux_patches/0014-KVM-Add-KVM_MEMORY_EXIT_FLAG_USERFAULT.patch diff --git a/resources/hiding_ci/patches/0015-KVM-Allow-late-setting-of-KVM_MEM_USERFAULT-on-guest.patch b/resources/hiding_ci/linux_patches/0015-KVM-Allow-late-setting-of-KVM_MEM_USERFAULT-on-guest.patch similarity index 100% rename from resources/hiding_ci/patches/0015-KVM-Allow-late-setting-of-KVM_MEM_USERFAULT-on-guest.patch rename to resources/hiding_ci/linux_patches/0015-KVM-Allow-late-setting-of-KVM_MEM_USERFAULT-on-guest.patch diff --git 
a/resources/hiding_ci/patches/0016-KVM-x86-mmu-Add-support-for-KVM_MEM_USERFAULT.patch b/resources/hiding_ci/linux_patches/0016-KVM-x86-mmu-Add-support-for-KVM_MEM_USERFAULT.patch similarity index 100% rename from resources/hiding_ci/patches/0016-KVM-x86-mmu-Add-support-for-KVM_MEM_USERFAULT.patch rename to resources/hiding_ci/linux_patches/0016-KVM-x86-mmu-Add-support-for-KVM_MEM_USERFAULT.patch diff --git a/resources/hiding_ci/patches/0017-KVM-Advertise-KVM_CAP_USERFAULT-in-KVM_CHECK_EXTENSI.patch b/resources/hiding_ci/linux_patches/0017-KVM-Advertise-KVM_CAP_USERFAULT-in-KVM_CHECK_EXTENSI.patch similarity index 100% rename from resources/hiding_ci/patches/0017-KVM-Advertise-KVM_CAP_USERFAULT-in-KVM_CHECK_EXTENSI.patch rename to resources/hiding_ci/linux_patches/0017-KVM-Advertise-KVM_CAP_USERFAULT-in-KVM_CHECK_EXTENSI.patch diff --git a/resources/hiding_ci/patches/0018-KVM-arm64-Add-support-for-KVM_MEM_USERFAULT.patch b/resources/hiding_ci/linux_patches/0018-KVM-arm64-Add-support-for-KVM_MEM_USERFAULT.patch similarity index 100% rename from resources/hiding_ci/patches/0018-KVM-arm64-Add-support-for-KVM_MEM_USERFAULT.patch rename to resources/hiding_ci/linux_patches/0018-KVM-arm64-Add-support-for-KVM_MEM_USERFAULT.patch diff --git a/resources/hiding_ci/patches/0019-KVM-selftests-Fix-vm_mem_region_set_flags-docstring.patch b/resources/hiding_ci/linux_patches/0019-KVM-selftests-Fix-vm_mem_region_set_flags-docstring.patch similarity index 100% rename from resources/hiding_ci/patches/0019-KVM-selftests-Fix-vm_mem_region_set_flags-docstring.patch rename to resources/hiding_ci/linux_patches/0019-KVM-selftests-Fix-vm_mem_region_set_flags-docstring.patch diff --git a/resources/hiding_ci/patches/0020-KVM-selftests-Fix-prefault_mem-logic.patch b/resources/hiding_ci/linux_patches/0020-KVM-selftests-Fix-prefault_mem-logic.patch similarity index 100% rename from resources/hiding_ci/patches/0020-KVM-selftests-Fix-prefault_mem-logic.patch rename to 
resources/hiding_ci/linux_patches/0020-KVM-selftests-Fix-prefault_mem-logic.patch diff --git a/resources/hiding_ci/patches/0021-KVM-selftests-Add-va_start-end-into-uffd_desc.patch b/resources/hiding_ci/linux_patches/0021-KVM-selftests-Add-va_start-end-into-uffd_desc.patch similarity index 100% rename from resources/hiding_ci/patches/0021-KVM-selftests-Add-va_start-end-into-uffd_desc.patch rename to resources/hiding_ci/linux_patches/0021-KVM-selftests-Add-va_start-end-into-uffd_desc.patch diff --git a/resources/hiding_ci/patches/0022-KVM-selftests-Inform-set_memory_region_test-of-KVM_M.patch b/resources/hiding_ci/linux_patches/0022-KVM-selftests-Inform-set_memory_region_test-of-KVM_M.patch similarity index 100% rename from resources/hiding_ci/patches/0022-KVM-selftests-Inform-set_memory_region_test-of-KVM_M.patch rename to resources/hiding_ci/linux_patches/0022-KVM-selftests-Inform-set_memory_region_test-of-KVM_M.patch diff --git a/resources/hiding_ci/patches/0023-KVM-selftests-Add-KVM-Userfault-mode-to-demand_pagin.patch b/resources/hiding_ci/linux_patches/0023-KVM-selftests-Add-KVM-Userfault-mode-to-demand_pagin.patch similarity index 100% rename from resources/hiding_ci/patches/0023-KVM-selftests-Add-KVM-Userfault-mode-to-demand_pagin.patch rename to resources/hiding_ci/linux_patches/0023-KVM-selftests-Add-KVM-Userfault-mode-to-demand_pagin.patch diff --git a/resources/hiding_ci/patches/0024-KVM-selftests-Add-KVM_MEM_USERFAULT-guest_memfd-togg.patch b/resources/hiding_ci/linux_patches/0024-KVM-selftests-Add-KVM_MEM_USERFAULT-guest_memfd-togg.patch similarity index 100% rename from resources/hiding_ci/patches/0024-KVM-selftests-Add-KVM_MEM_USERFAULT-guest_memfd-togg.patch rename to resources/hiding_ci/linux_patches/0024-KVM-selftests-Add-KVM_MEM_USERFAULT-guest_memfd-togg.patch diff --git a/resources/hiding_ci/patches/0025-KVM-Documentation-Add-KVM_CAP_USERFAULT-and-KVM_MEM_.patch 
b/resources/hiding_ci/linux_patches/0025-KVM-Documentation-Add-KVM_CAP_USERFAULT-and-KVM_MEM_.patch similarity index 100% rename from resources/hiding_ci/patches/0025-KVM-Documentation-Add-KVM_CAP_USERFAULT-and-KVM_MEM_.patch rename to resources/hiding_ci/linux_patches/0025-KVM-Documentation-Add-KVM_CAP_USERFAULT-and-KVM_MEM_.patch diff --git a/resources/hiding_ci/patches/0026-KVM-guest_memfd-add-generic-population-via-write.patch b/resources/hiding_ci/linux_patches/0026-KVM-guest_memfd-add-generic-population-via-write.patch similarity index 100% rename from resources/hiding_ci/patches/0026-KVM-guest_memfd-add-generic-population-via-write.patch rename to resources/hiding_ci/linux_patches/0026-KVM-guest_memfd-add-generic-population-via-write.patch diff --git a/resources/hiding_ci/patches/0027-KVM-selftests-update-guest_memfd-write-tests.patch b/resources/hiding_ci/linux_patches/0027-KVM-selftests-update-guest_memfd-write-tests.patch similarity index 100% rename from resources/hiding_ci/patches/0027-KVM-selftests-update-guest_memfd-write-tests.patch rename to resources/hiding_ci/linux_patches/0027-KVM-selftests-update-guest_memfd-write-tests.patch diff --git a/resources/hiding_ci/patches/0028-mm-userfaultfd-generic-continue-for-non-hugetlbfs.patch b/resources/hiding_ci/linux_patches/0028-mm-userfaultfd-generic-continue-for-non-hugetlbfs.patch similarity index 100% rename from resources/hiding_ci/patches/0028-mm-userfaultfd-generic-continue-for-non-hugetlbfs.patch rename to resources/hiding_ci/linux_patches/0028-mm-userfaultfd-generic-continue-for-non-hugetlbfs.patch diff --git a/resources/hiding_ci/patches/0029-mm-provide-can_userfault-vma-operation.patch b/resources/hiding_ci/linux_patches/0029-mm-provide-can_userfault-vma-operation.patch similarity index 100% rename from resources/hiding_ci/patches/0029-mm-provide-can_userfault-vma-operation.patch rename to resources/hiding_ci/linux_patches/0029-mm-provide-can_userfault-vma-operation.patch diff --git 
a/resources/hiding_ci/patches/0030-mm-userfaultfd-use-can_userfault-vma-operation.patch b/resources/hiding_ci/linux_patches/0030-mm-userfaultfd-use-can_userfault-vma-operation.patch similarity index 100% rename from resources/hiding_ci/patches/0030-mm-userfaultfd-use-can_userfault-vma-operation.patch rename to resources/hiding_ci/linux_patches/0030-mm-userfaultfd-use-can_userfault-vma-operation.patch diff --git a/resources/hiding_ci/patches/0031-KVM-guest_memfd-add-support-for-userfaultfd-minor.patch b/resources/hiding_ci/linux_patches/0031-KVM-guest_memfd-add-support-for-userfaultfd-minor.patch similarity index 100% rename from resources/hiding_ci/patches/0031-KVM-guest_memfd-add-support-for-userfaultfd-minor.patch rename to resources/hiding_ci/linux_patches/0031-KVM-guest_memfd-add-support-for-userfaultfd-minor.patch diff --git a/resources/hiding_ci/patches/0032-mm-userfaultfd-add-UFFD_FEATURE_MINOR_GUEST_MEMFD.patch b/resources/hiding_ci/linux_patches/0032-mm-userfaultfd-add-UFFD_FEATURE_MINOR_GUEST_MEMFD.patch similarity index 100% rename from resources/hiding_ci/patches/0032-mm-userfaultfd-add-UFFD_FEATURE_MINOR_GUEST_MEMFD.patch rename to resources/hiding_ci/linux_patches/0032-mm-userfaultfd-add-UFFD_FEATURE_MINOR_GUEST_MEMFD.patch diff --git a/resources/hiding_ci/patches/0033-KVM-selftests-test-userfaultfd-minor-for-guest_memfd.patch b/resources/hiding_ci/linux_patches/0033-KVM-selftests-test-userfaultfd-minor-for-guest_memfd.patch similarity index 100% rename from resources/hiding_ci/patches/0033-KVM-selftests-test-userfaultfd-minor-for-guest_memfd.patch rename to resources/hiding_ci/linux_patches/0033-KVM-selftests-test-userfaultfd-minor-for-guest_memfd.patch diff --git a/resources/hiding_ci/patches/0034-uffd-v3-fixup.patch b/resources/hiding_ci/linux_patches/0034-uffd-v3-fixup.patch similarity index 100% rename from resources/hiding_ci/patches/0034-uffd-v3-fixup.patch rename to resources/hiding_ci/linux_patches/0034-uffd-v3-fixup.patch diff --git 
a/resources/hiding_ci/patches/GPL-2.0 b/resources/hiding_ci/linux_patches/GPL-2.0 similarity index 100% rename from resources/hiding_ci/patches/GPL-2.0 rename to resources/hiding_ci/linux_patches/GPL-2.0 diff --git a/resources/hiding_ci/patches/README.md b/resources/hiding_ci/linux_patches/README.md similarity index 100% rename from resources/hiding_ci/patches/README.md rename to resources/hiding_ci/linux_patches/README.md From deb856ca7794c2972c5201013b65f0acb1136572 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Thu, 24 Apr 2025 14:49:19 +0000 Subject: [PATCH 37/40] chore(hiding_ci): fix up guest_memfd UFFD patches Also strip doc and test patches committed by mistake last time. Signed-off-by: Nikita Kalyazin --- ...reeing-of-typed-folios-on-final-foli.patch | 2 +- ...Handle-final-folio_put-of-guest_memf.patch | 2 +- ...-Allow-host-to-map-guest_memfd-pages.patch | 2 +- ..._X86_SW_PROTECTED_VM-as-supporting-g.patch | 2 +- ...or-user_mem_abort-calculation-of-for.patch | 2 +- ...Handle-in-place-shared-memory-as-gue.patch | 2 +- ...-guest_memfd-backed-guest-page-fault.patch | 2 +- ...selftests-guest_memfd-mmap-test-when.patch | 2 +- ...-Enable-mapping-guest_memfd-in-arm64.patch | 2 +- .../0010-mm-introduce-AS_NO_DIRECT_MAP.patch | 2 +- ...d-Add-flag-to-remove-from-direct-map.patch | 2 +- ...012-fixup-for-direct-map-removal-v4.patch} | 4 +- ...EM_USERFAULT-memslot-flag-and-bitmap.patch | 4 +- ...M-Add-KVM_MEMORY_EXIT_FLAG_USERFAULT.patch | 4 +- ...etting-of-KVM_MEM_USERFAULT-on-guest.patch | 4 +- ...mu-Add-support-for-KVM_MEM_USERFAULT.patch | 4 +- ...M_CAP_USERFAULT-in-KVM_CHECK_EXTENSI.patch | 4 +- ...64-Add-support-for-KVM_MEM_USERFAULT.patch | 4 +- ...fd-add-generic-population-via-write.patch} | 4 +- ...ix-vm_mem_region_set_flags-docstring.patch | 28 -- ...KVM-selftests-Fix-prefault_mem-logic.patch | 37 -- ...ests-update-guest_memfd-write-tests.patch} | 4 +- ...ests-Add-va_start-end-into-uffd_desc.patch | 44 -- ...-generic-continue-for-non-hugetlbfs.patch} | 
4 +- ...form-set_memory_region_test-of-KVM_M.patch | 31 -- ...provide-can_userfault-vma-operation.patch} | 4 +- ...d-KVM-Userfault-mode-to-demand_pagin.patch | 381 ------------------ ...tfd-use-can_userfault-vma-operation.patch} | 4 +- ...d-add-support-for-userfaultfd-minor.patch} | 4 +- ...d-KVM_MEM_USERFAULT-guest_memfd-togg.patch | 65 --- ...n-Add-KVM_CAP_USERFAULT-and-KVM_MEM_.patch | 76 ---- ...-add-UFFD_FEATURE_MINOR_GUEST_MEMFD.patch} | 4 +- .../0026-fixup-for-guest_memfd-uffd-v3.patch | 70 ++++ ...st-userfaultfd-minor-for-guest_memfd.patch | 146 ------- .../linux_patches/0034-uffd-v3-fixup.patch | 50 --- 35 files changed, 109 insertions(+), 897 deletions(-) rename resources/hiding_ci/linux_patches/{0012-patrick-v4-fixup.patch => 0012-fixup-for-direct-map-removal-v4.patch} (92%) rename resources/hiding_ci/linux_patches/{0026-KVM-guest_memfd-add-generic-population-via-write.patch => 0019-KVM-guest_memfd-add-generic-population-via-write.patch} (96%) delete mode 100644 resources/hiding_ci/linux_patches/0019-KVM-selftests-Fix-vm_mem_region_set_flags-docstring.patch delete mode 100644 resources/hiding_ci/linux_patches/0020-KVM-selftests-Fix-prefault_mem-logic.patch rename resources/hiding_ci/linux_patches/{0027-KVM-selftests-update-guest_memfd-write-tests.patch => 0020-KVM-selftests-update-guest_memfd-write-tests.patch} (97%) delete mode 100644 resources/hiding_ci/linux_patches/0021-KVM-selftests-Add-va_start-end-into-uffd_desc.patch rename resources/hiding_ci/linux_patches/{0028-mm-userfaultfd-generic-continue-for-non-hugetlbfs.patch => 0021-mm-userfaultfd-generic-continue-for-non-hugetlbfs.patch} (97%) delete mode 100644 resources/hiding_ci/linux_patches/0022-KVM-selftests-Inform-set_memory_region_test-of-KVM_M.patch rename resources/hiding_ci/linux_patches/{0029-mm-provide-can_userfault-vma-operation.patch => 0022-mm-provide-can_userfault-vma-operation.patch} (95%) delete mode 100644 
resources/hiding_ci/linux_patches/0023-KVM-selftests-Add-KVM-Userfault-mode-to-demand_pagin.patch rename resources/hiding_ci/linux_patches/{0030-mm-userfaultfd-use-can_userfault-vma-operation.patch => 0023-mm-userfaultfd-use-can_userfault-vma-operation.patch} (95%) rename resources/hiding_ci/linux_patches/{0031-KVM-guest_memfd-add-support-for-userfaultfd-minor.patch => 0024-KVM-guest_memfd-add-support-for-userfaultfd-minor.patch} (90%) delete mode 100644 resources/hiding_ci/linux_patches/0024-KVM-selftests-Add-KVM_MEM_USERFAULT-guest_memfd-togg.patch delete mode 100644 resources/hiding_ci/linux_patches/0025-KVM-Documentation-Add-KVM_CAP_USERFAULT-and-KVM_MEM_.patch rename resources/hiding_ci/linux_patches/{0032-mm-userfaultfd-add-UFFD_FEATURE_MINOR_GUEST_MEMFD.patch => 0025-mm-userfaultfd-add-UFFD_FEATURE_MINOR_GUEST_MEMFD.patch} (94%) create mode 100644 resources/hiding_ci/linux_patches/0026-fixup-for-guest_memfd-uffd-v3.patch delete mode 100644 resources/hiding_ci/linux_patches/0033-KVM-selftests-test-userfaultfd-minor-for-guest_memfd.patch delete mode 100644 resources/hiding_ci/linux_patches/0034-uffd-v3-fixup.patch diff --git a/resources/hiding_ci/linux_patches/0001-mm-Consolidate-freeing-of-typed-folios-on-final-foli.patch b/resources/hiding_ci/linux_patches/0001-mm-Consolidate-freeing-of-typed-folios-on-final-foli.patch index 0986dfacfeb..4d4b5572d8a 100644 --- a/resources/hiding_ci/linux_patches/0001-mm-Consolidate-freeing-of-typed-folios-on-final-foli.patch +++ b/resources/hiding_ci/linux_patches/0001-mm-Consolidate-freeing-of-typed-folios-on-final-foli.patch @@ -1,7 +1,7 @@ From f9ca710b51263ce8317cc2fa02232e456fa1f39c Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 18 Mar 2025 16:18:15 +0000 -Subject: [PATCH 01/34] mm: Consolidate freeing of typed folios on final +Subject: [PATCH 01/26] mm: Consolidate freeing of typed folios on final folio_put() Some folio types, such as hugetlb, handle freeing their own diff --git 
a/resources/hiding_ci/linux_patches/0002-KVM-guest_memfd-Handle-final-folio_put-of-guest_memf.patch b/resources/hiding_ci/linux_patches/0002-KVM-guest_memfd-Handle-final-folio_put-of-guest_memf.patch index b9f4f83f442..d5778165add 100644 --- a/resources/hiding_ci/linux_patches/0002-KVM-guest_memfd-Handle-final-folio_put-of-guest_memf.patch +++ b/resources/hiding_ci/linux_patches/0002-KVM-guest_memfd-Handle-final-folio_put-of-guest_memf.patch @@ -1,7 +1,7 @@ From 9a4d7cd855d14e1522f363e3e04ebb9fa0a90ff0 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 18 Mar 2025 16:18:16 +0000 -Subject: [PATCH 02/34] KVM: guest_memfd: Handle final folio_put() of +Subject: [PATCH 02/26] KVM: guest_memfd: Handle final folio_put() of guest_memfd pages Before transitioning a guest_memfd folio to unshared, thereby diff --git a/resources/hiding_ci/linux_patches/0003-KVM-guest_memfd-Allow-host-to-map-guest_memfd-pages.patch b/resources/hiding_ci/linux_patches/0003-KVM-guest_memfd-Allow-host-to-map-guest_memfd-pages.patch index 8fb306b257b..13d7180fa19 100644 --- a/resources/hiding_ci/linux_patches/0003-KVM-guest_memfd-Allow-host-to-map-guest_memfd-pages.patch +++ b/resources/hiding_ci/linux_patches/0003-KVM-guest_memfd-Allow-host-to-map-guest_memfd-pages.patch @@ -1,7 +1,7 @@ From fd39febef2e0d41394e51f5e34f2c8de80b3b4dc Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 18 Mar 2025 16:18:17 +0000 -Subject: [PATCH 03/34] KVM: guest_memfd: Allow host to map guest_memfd() pages +Subject: [PATCH 03/26] KVM: guest_memfd: Allow host to map guest_memfd() pages Add support for mmap() and fault() for guest_memfd backed memory in the host for VMs that support in-place conversion between diff --git a/resources/hiding_ci/linux_patches/0004-KVM-x86-Mark-KVM_X86_SW_PROTECTED_VM-as-supporting-g.patch b/resources/hiding_ci/linux_patches/0004-KVM-x86-Mark-KVM_X86_SW_PROTECTED_VM-as-supporting-g.patch index 33bf24f9415..2d32a4cefc2 100644 --- 
a/resources/hiding_ci/linux_patches/0004-KVM-x86-Mark-KVM_X86_SW_PROTECTED_VM-as-supporting-g.patch +++ b/resources/hiding_ci/linux_patches/0004-KVM-x86-Mark-KVM_X86_SW_PROTECTED_VM-as-supporting-g.patch @@ -1,7 +1,7 @@ From d16c343f0f95ecd8d2cda2dfba4ac8b7c293f217 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 18 Mar 2025 16:18:19 +0000 -Subject: [PATCH 04/34] KVM: x86: Mark KVM_X86_SW_PROTECTED_VM as supporting +Subject: [PATCH 04/26] KVM: x86: Mark KVM_X86_SW_PROTECTED_VM as supporting guest_memfd shared memory The KVM_X86_SW_PROTECTED_VM type is meant for experimentation and diff --git a/resources/hiding_ci/linux_patches/0005-KVM-arm64-Refactor-user_mem_abort-calculation-of-for.patch b/resources/hiding_ci/linux_patches/0005-KVM-arm64-Refactor-user_mem_abort-calculation-of-for.patch index 38b3292884d..905c88558d8 100644 --- a/resources/hiding_ci/linux_patches/0005-KVM-arm64-Refactor-user_mem_abort-calculation-of-for.patch +++ b/resources/hiding_ci/linux_patches/0005-KVM-arm64-Refactor-user_mem_abort-calculation-of-for.patch @@ -1,7 +1,7 @@ From 483ccb70335cb0c76161caf76c0ccb7c618038e2 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 18 Mar 2025 16:18:20 +0000 -Subject: [PATCH 05/34] KVM: arm64: Refactor user_mem_abort() calculation of +Subject: [PATCH 05/26] KVM: arm64: Refactor user_mem_abort() calculation of force_pte To simplify the code and to make the assumptions clearer, diff --git a/resources/hiding_ci/linux_patches/0006-KVM-guest_memfd-Handle-in-place-shared-memory-as-gue.patch b/resources/hiding_ci/linux_patches/0006-KVM-guest_memfd-Handle-in-place-shared-memory-as-gue.patch index 05d9e08b1fc..3e0dea5a7e6 100644 --- a/resources/hiding_ci/linux_patches/0006-KVM-guest_memfd-Handle-in-place-shared-memory-as-gue.patch +++ b/resources/hiding_ci/linux_patches/0006-KVM-guest_memfd-Handle-in-place-shared-memory-as-gue.patch @@ -1,7 +1,7 @@ From b1e925d4d5db8513dba67c3a9d40a2b507668f09 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 18 Mar 2025 
16:18:18 +0000 -Subject: [PATCH 06/34] KVM: guest_memfd: Handle in-place shared memory as +Subject: [PATCH 06/26] KVM: guest_memfd: Handle in-place shared memory as guest_memfd backed memory For VMs that allow sharing guest_memfd backed memory in-place, diff --git a/resources/hiding_ci/linux_patches/0007-KVM-arm64-Handle-guest_memfd-backed-guest-page-fault.patch b/resources/hiding_ci/linux_patches/0007-KVM-arm64-Handle-guest_memfd-backed-guest-page-fault.patch index 1ce256e6f57..5b68d6e183e 100644 --- a/resources/hiding_ci/linux_patches/0007-KVM-arm64-Handle-guest_memfd-backed-guest-page-fault.patch +++ b/resources/hiding_ci/linux_patches/0007-KVM-arm64-Handle-guest_memfd-backed-guest-page-fault.patch @@ -1,7 +1,7 @@ From 996513a423377349767d5cfef87850e80131854f Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 18 Mar 2025 16:18:21 +0000 -Subject: [PATCH 07/34] KVM: arm64: Handle guest_memfd()-backed guest page +Subject: [PATCH 07/26] KVM: arm64: Handle guest_memfd()-backed guest page faults Add arm64 support for handling guest page faults on guest_memfd diff --git a/resources/hiding_ci/linux_patches/0008-KVM-guest_memfd-selftests-guest_memfd-mmap-test-when.patch b/resources/hiding_ci/linux_patches/0008-KVM-guest_memfd-selftests-guest_memfd-mmap-test-when.patch index 872ee55d789..2a5a355a2e1 100644 --- a/resources/hiding_ci/linux_patches/0008-KVM-guest_memfd-selftests-guest_memfd-mmap-test-when.patch +++ b/resources/hiding_ci/linux_patches/0008-KVM-guest_memfd-selftests-guest_memfd-mmap-test-when.patch @@ -1,7 +1,7 @@ From 1ee5d01987bff47f007fb86ad7738b299816b2ef Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 18 Mar 2025 16:18:23 +0000 -Subject: [PATCH 08/34] KVM: guest_memfd: selftests: guest_memfd mmap() test +Subject: [PATCH 08/26] KVM: guest_memfd: selftests: guest_memfd mmap() test when mapping is allowed Expand the guest_memfd selftests to include testing mapping guest diff --git 
a/resources/hiding_ci/linux_patches/0009-KVM-arm64-Enable-mapping-guest_memfd-in-arm64.patch b/resources/hiding_ci/linux_patches/0009-KVM-arm64-Enable-mapping-guest_memfd-in-arm64.patch index eb2cb3fd4e8..a03d592e4b0 100644 --- a/resources/hiding_ci/linux_patches/0009-KVM-arm64-Enable-mapping-guest_memfd-in-arm64.patch +++ b/resources/hiding_ci/linux_patches/0009-KVM-arm64-Enable-mapping-guest_memfd-in-arm64.patch @@ -1,7 +1,7 @@ From 3cc51efc17a2c41a480eed36b31c1773936717e0 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 18 Mar 2025 16:18:22 +0000 -Subject: [PATCH 09/34] KVM: arm64: Enable mapping guest_memfd in arm64 +Subject: [PATCH 09/26] KVM: arm64: Enable mapping guest_memfd in arm64 Enable mapping guest_memfd in arm64. For now, it applies to all VMs in arm64 that use guest_memfd. In the future, new VM types diff --git a/resources/hiding_ci/linux_patches/0010-mm-introduce-AS_NO_DIRECT_MAP.patch b/resources/hiding_ci/linux_patches/0010-mm-introduce-AS_NO_DIRECT_MAP.patch index 0ec11b19bb7..bd336166268 100644 --- a/resources/hiding_ci/linux_patches/0010-mm-introduce-AS_NO_DIRECT_MAP.patch +++ b/resources/hiding_ci/linux_patches/0010-mm-introduce-AS_NO_DIRECT_MAP.patch @@ -1,7 +1,7 @@ From 22ec89c0ff7af3430027cf71cf8bce5c8ed6e402 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Fri, 7 Feb 2025 11:16:06 +0000 -Subject: [PATCH 10/34] mm: introduce AS_NO_DIRECT_MAP +Subject: [PATCH 10/26] mm: introduce AS_NO_DIRECT_MAP Add AS_NO_DIRECT_MAP for mappings where direct map entries of folios are set to not present . 
Currently, mappings that match this description are diff --git a/resources/hiding_ci/linux_patches/0011-KVM-guest_memfd-Add-flag-to-remove-from-direct-map.patch b/resources/hiding_ci/linux_patches/0011-KVM-guest_memfd-Add-flag-to-remove-from-direct-map.patch index 0c8b984a2e6..dcce661a60e 100644 --- a/resources/hiding_ci/linux_patches/0011-KVM-guest_memfd-Add-flag-to-remove-from-direct-map.patch +++ b/resources/hiding_ci/linux_patches/0011-KVM-guest_memfd-Add-flag-to-remove-from-direct-map.patch @@ -1,7 +1,7 @@ From b1fc478976c93fd42b14e06d2de57e121be03142 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Fri, 7 Feb 2025 14:33:01 +0000 -Subject: [PATCH 11/34] KVM: guest_memfd: Add flag to remove from direct map +Subject: [PATCH 11/26] KVM: guest_memfd: Add flag to remove from direct map Add KVM_GMEM_NO_DIRECT_MAP flag for KVM_CREATE_GUEST_MEMFD() ioctl. When set, guest_memfd folios will be removed from the direct map after diff --git a/resources/hiding_ci/linux_patches/0012-patrick-v4-fixup.patch b/resources/hiding_ci/linux_patches/0012-fixup-for-direct-map-removal-v4.patch similarity index 92% rename from resources/hiding_ci/linux_patches/0012-patrick-v4-fixup.patch rename to resources/hiding_ci/linux_patches/0012-fixup-for-direct-map-removal-v4.patch index 7532bcbe975..c54565134f1 100644 --- a/resources/hiding_ci/linux_patches/0012-patrick-v4-fixup.patch +++ b/resources/hiding_ci/linux_patches/0012-fixup-for-direct-map-removal-v4.patch @@ -1,7 +1,7 @@ -From 098a8167ad6c55336cad9abb808ebdc105784278 Mon Sep 17 00:00:00 2001 +From ab44b2d5bfb7ef9b7bbb156d493f49a4bbebf014 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Thu, 10 Apr 2025 14:18:39 +0000 -Subject: [PATCH 12/34] patrick v4 fixup +Subject: [PATCH 12/26] fixup for direct map removal v4 Do not make kvm_gmem_free_folio dependent on CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE . 
diff --git a/resources/hiding_ci/linux_patches/0013-KVM-Add-KVM_MEM_USERFAULT-memslot-flag-and-bitmap.patch b/resources/hiding_ci/linux_patches/0013-KVM-Add-KVM_MEM_USERFAULT-memslot-flag-and-bitmap.patch index 263752c8c6e..f4a62443b72 100644 --- a/resources/hiding_ci/linux_patches/0013-KVM-Add-KVM_MEM_USERFAULT-memslot-flag-and-bitmap.patch +++ b/resources/hiding_ci/linux_patches/0013-KVM-Add-KVM_MEM_USERFAULT-memslot-flag-and-bitmap.patch @@ -1,7 +1,7 @@ -From abfa51cb95feaae899254453788c6db1c70d0189 Mon Sep 17 00:00:00 2001 +From 48a178e27031d5eac97ba0630686fcf3034e88ed Mon Sep 17 00:00:00 2001 From: James Houghton Date: Thu, 9 Jan 2025 20:49:17 +0000 -Subject: [PATCH 13/34] KVM: Add KVM_MEM_USERFAULT memslot flag and bitmap +Subject: [PATCH 13/26] KVM: Add KVM_MEM_USERFAULT memslot flag and bitmap Use one of the 14 reserved u64s in struct kvm_userspace_memory_region2 for the user to provide `userfault_bitmap`. diff --git a/resources/hiding_ci/linux_patches/0014-KVM-Add-KVM_MEMORY_EXIT_FLAG_USERFAULT.patch b/resources/hiding_ci/linux_patches/0014-KVM-Add-KVM_MEMORY_EXIT_FLAG_USERFAULT.patch index 11d3f4d0618..dddc2b9dbfd 100644 --- a/resources/hiding_ci/linux_patches/0014-KVM-Add-KVM_MEMORY_EXIT_FLAG_USERFAULT.patch +++ b/resources/hiding_ci/linux_patches/0014-KVM-Add-KVM_MEMORY_EXIT_FLAG_USERFAULT.patch @@ -1,7 +1,7 @@ -From 0a91075ecff4f60404bc05da0d10d41e1b33fcec Mon Sep 17 00:00:00 2001 +From 51a78015a0114ceaf1930739bba6111b1bc09f87 Mon Sep 17 00:00:00 2001 From: James Houghton Date: Thu, 9 Jan 2025 20:49:18 +0000 -Subject: [PATCH 14/34] KVM: Add KVM_MEMORY_EXIT_FLAG_USERFAULT +Subject: [PATCH 14/26] KVM: Add KVM_MEMORY_EXIT_FLAG_USERFAULT This flag is used for vCPU memory faults caused by KVM Userfault; i.e., the bit in `userfault_bitmap` corresponding to the faulting gfn was set. 
diff --git a/resources/hiding_ci/linux_patches/0015-KVM-Allow-late-setting-of-KVM_MEM_USERFAULT-on-guest.patch b/resources/hiding_ci/linux_patches/0015-KVM-Allow-late-setting-of-KVM_MEM_USERFAULT-on-guest.patch index 38f7624d108..7960341db8a 100644 --- a/resources/hiding_ci/linux_patches/0015-KVM-Allow-late-setting-of-KVM_MEM_USERFAULT-on-guest.patch +++ b/resources/hiding_ci/linux_patches/0015-KVM-Allow-late-setting-of-KVM_MEM_USERFAULT-on-guest.patch @@ -1,7 +1,7 @@ -From 7a626f99566e7ab7fce004fcf4041d87e75512cc Mon Sep 17 00:00:00 2001 +From ed691412fd9414d3b9124e2416f6cae3f21a1071 Mon Sep 17 00:00:00 2001 From: James Houghton Date: Thu, 9 Jan 2025 20:49:19 +0000 -Subject: [PATCH 15/34] KVM: Allow late setting of KVM_MEM_USERFAULT on +Subject: [PATCH 15/26] KVM: Allow late setting of KVM_MEM_USERFAULT on guest_memfd memslot Currently guest_memfd memslots can only be deleted. Slightly change the diff --git a/resources/hiding_ci/linux_patches/0016-KVM-x86-mmu-Add-support-for-KVM_MEM_USERFAULT.patch b/resources/hiding_ci/linux_patches/0016-KVM-x86-mmu-Add-support-for-KVM_MEM_USERFAULT.patch index 5895c60b415..ca31ca9518b 100644 --- a/resources/hiding_ci/linux_patches/0016-KVM-x86-mmu-Add-support-for-KVM_MEM_USERFAULT.patch +++ b/resources/hiding_ci/linux_patches/0016-KVM-x86-mmu-Add-support-for-KVM_MEM_USERFAULT.patch @@ -1,7 +1,7 @@ -From 7e9f22f83ca191df8deab84d7c8c9d64a1b5b10f Mon Sep 17 00:00:00 2001 +From fa324f2e503cd36dc357c3eb9b807e02f9b6206e Mon Sep 17 00:00:00 2001 From: James Houghton Date: Thu, 9 Jan 2025 20:49:21 +0000 -Subject: [PATCH 16/34] KVM: x86/mmu: Add support for KVM_MEM_USERFAULT +Subject: [PATCH 16/26] KVM: x86/mmu: Add support for KVM_MEM_USERFAULT Adhering to the requirements of KVM Userfault: diff --git a/resources/hiding_ci/linux_patches/0017-KVM-Advertise-KVM_CAP_USERFAULT-in-KVM_CHECK_EXTENSI.patch b/resources/hiding_ci/linux_patches/0017-KVM-Advertise-KVM_CAP_USERFAULT-in-KVM_CHECK_EXTENSI.patch index bdbcb9117ba..c89c7c9b262 100644 
--- a/resources/hiding_ci/linux_patches/0017-KVM-Advertise-KVM_CAP_USERFAULT-in-KVM_CHECK_EXTENSI.patch +++ b/resources/hiding_ci/linux_patches/0017-KVM-Advertise-KVM_CAP_USERFAULT-in-KVM_CHECK_EXTENSI.patch @@ -1,7 +1,7 @@ -From fe6a44733d6384057fd68732ebf3aeb612443d14 Mon Sep 17 00:00:00 2001 +From f0ef961eba32b98755d2bfa5ff684944e3a442fc Mon Sep 17 00:00:00 2001 From: James Houghton Date: Thu, 9 Jan 2025 20:49:20 +0000 -Subject: [PATCH 17/34] KVM: Advertise KVM_CAP_USERFAULT in KVM_CHECK_EXTENSION +Subject: [PATCH 17/26] KVM: Advertise KVM_CAP_USERFAULT in KVM_CHECK_EXTENSION Advertise support for KVM_CAP_USERFAULT when kvm_has_userfault() returns true. Currently this is merely IS_ENABLED(CONFIG_HAVE_KVM_USERFAULT), so diff --git a/resources/hiding_ci/linux_patches/0018-KVM-arm64-Add-support-for-KVM_MEM_USERFAULT.patch b/resources/hiding_ci/linux_patches/0018-KVM-arm64-Add-support-for-KVM_MEM_USERFAULT.patch index bc9d608d15b..58f076e27cb 100644 --- a/resources/hiding_ci/linux_patches/0018-KVM-arm64-Add-support-for-KVM_MEM_USERFAULT.patch +++ b/resources/hiding_ci/linux_patches/0018-KVM-arm64-Add-support-for-KVM_MEM_USERFAULT.patch @@ -1,7 +1,7 @@ -From 86888e840dce0193dfb14916857df1c09749b618 Mon Sep 17 00:00:00 2001 +From 482a64008a53577da046428922f247dce203113f Mon Sep 17 00:00:00 2001 From: James Houghton Date: Thu, 9 Jan 2025 20:49:22 +0000 -Subject: [PATCH 18/34] KVM: arm64: Add support for KVM_MEM_USERFAULT +Subject: [PATCH 18/26] KVM: arm64: Add support for KVM_MEM_USERFAULT Adhering to the requirements of KVM Userfault: 1. 
When it is toggled on, zap the second stage with diff --git a/resources/hiding_ci/linux_patches/0026-KVM-guest_memfd-add-generic-population-via-write.patch b/resources/hiding_ci/linux_patches/0019-KVM-guest_memfd-add-generic-population-via-write.patch similarity index 96% rename from resources/hiding_ci/linux_patches/0026-KVM-guest_memfd-add-generic-population-via-write.patch rename to resources/hiding_ci/linux_patches/0019-KVM-guest_memfd-add-generic-population-via-write.patch index f7c1cce60f2..0c05129841e 100644 --- a/resources/hiding_ci/linux_patches/0026-KVM-guest_memfd-add-generic-population-via-write.patch +++ b/resources/hiding_ci/linux_patches/0019-KVM-guest_memfd-add-generic-population-via-write.patch @@ -1,7 +1,7 @@ -From dda48e07b48f344c57d8ed90ddad7e01c26f7952 Mon Sep 17 00:00:00 2001 +From f81fae83d40e1520a0a46afa3473f9fc4c6b7c79 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Fri, 29 Nov 2024 11:51:02 +0000 -Subject: [PATCH 26/34] KVM: guest_memfd: add generic population via write +Subject: [PATCH 19/26] KVM: guest_memfd: add generic population via write write syscall populates guest_memfd with user-supplied data in a generic way, ie no vendor-specific preparation is performed. This is supposed diff --git a/resources/hiding_ci/linux_patches/0019-KVM-selftests-Fix-vm_mem_region_set_flags-docstring.patch b/resources/hiding_ci/linux_patches/0019-KVM-selftests-Fix-vm_mem_region_set_flags-docstring.patch deleted file mode 100644 index effde901d5b..00000000000 --- a/resources/hiding_ci/linux_patches/0019-KVM-selftests-Fix-vm_mem_region_set_flags-docstring.patch +++ /dev/null @@ -1,28 +0,0 @@ -From 398111201a62de89a4973295512b303d74ea0662 Mon Sep 17 00:00:00 2001 -From: James Houghton -Date: Thu, 9 Jan 2025 20:49:23 +0000 -Subject: [PATCH 19/34] KVM: selftests: Fix vm_mem_region_set_flags docstring - -`flags` is what region->region.flags gets set to. 
- -Signed-off-by: James Houghton ---- - tools/testing/selftests/kvm/lib/kvm_util.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c -index 33fefeb3ca44..a87988a162f1 100644 ---- a/tools/testing/selftests/kvm/lib/kvm_util.c -+++ b/tools/testing/selftests/kvm/lib/kvm_util.c -@@ -1124,7 +1124,7 @@ memslot2region(struct kvm_vm *vm, uint32_t memslot) - * - * Input Args: - * vm - Virtual Machine -- * flags - Starting guest physical address -+ * flags - Flags for the memslot - * - * Output Args: None - * --- -2.47.1 - diff --git a/resources/hiding_ci/linux_patches/0020-KVM-selftests-Fix-prefault_mem-logic.patch b/resources/hiding_ci/linux_patches/0020-KVM-selftests-Fix-prefault_mem-logic.patch deleted file mode 100644 index e97c522033e..00000000000 --- a/resources/hiding_ci/linux_patches/0020-KVM-selftests-Fix-prefault_mem-logic.patch +++ /dev/null @@ -1,37 +0,0 @@ -From 281292095132694847d44d12de0268045ae727ec Mon Sep 17 00:00:00 2001 -From: James Houghton -Date: Thu, 9 Jan 2025 20:49:24 +0000 -Subject: [PATCH 20/34] KVM: selftests: Fix prefault_mem logic - -The previous logic didn't handle the case where memory was partitioned -AND we were using a single userfaultfd. It would only prefault the first -vCPU's memory and not the rest. - -Signed-off-by: James Houghton ---- - tools/testing/selftests/kvm/demand_paging_test.c | 6 ++++-- - 1 file changed, 4 insertions(+), 2 deletions(-) - -diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c -index 0202b78f8680..315f5c9037b4 100644 ---- a/tools/testing/selftests/kvm/demand_paging_test.c -+++ b/tools/testing/selftests/kvm/demand_paging_test.c -@@ -172,11 +172,13 @@ static void run_test(enum vm_guest_mode mode, void *arg) - memset(guest_data_prototype, 0xAB, demand_paging_size); - - if (p->uffd_mode == UFFDIO_REGISTER_MODE_MINOR) { -- num_uffds = p->single_uffd ? 
1 : nr_vcpus; -- for (i = 0; i < num_uffds; i++) { -+ for (i = 0; i < nr_vcpus; i++) { - vcpu_args = &memstress_args.vcpu_args[i]; - prefault_mem(addr_gpa2alias(vm, vcpu_args->gpa), - vcpu_args->pages * memstress_args.guest_page_size); -+ if (!p->partition_vcpu_memory_access) -+ /* We prefaulted everything */ -+ break; - } - } - --- -2.47.1 - diff --git a/resources/hiding_ci/linux_patches/0027-KVM-selftests-update-guest_memfd-write-tests.patch b/resources/hiding_ci/linux_patches/0020-KVM-selftests-update-guest_memfd-write-tests.patch similarity index 97% rename from resources/hiding_ci/linux_patches/0027-KVM-selftests-update-guest_memfd-write-tests.patch rename to resources/hiding_ci/linux_patches/0020-KVM-selftests-update-guest_memfd-write-tests.patch index 6062f6fd982..869144f63d0 100644 --- a/resources/hiding_ci/linux_patches/0027-KVM-selftests-update-guest_memfd-write-tests.patch +++ b/resources/hiding_ci/linux_patches/0020-KVM-selftests-update-guest_memfd-write-tests.patch @@ -1,7 +1,7 @@ -From 120635067e2d910bf96a53e3b7e2f2d5be19af7e Mon Sep 17 00:00:00 2001 +From 3ccb28e0fe31afa8ac626ebd5b957ba9263a68d3 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Fri, 29 Nov 2024 11:57:58 +0000 -Subject: [PATCH 27/34] KVM: selftests: update guest_memfd write tests +Subject: [PATCH 20/26] KVM: selftests: update guest_memfd write tests This is to reflect that the write syscall is now implemented for guest_memfd. 
diff --git a/resources/hiding_ci/linux_patches/0021-KVM-selftests-Add-va_start-end-into-uffd_desc.patch b/resources/hiding_ci/linux_patches/0021-KVM-selftests-Add-va_start-end-into-uffd_desc.patch deleted file mode 100644 index 67bf2b8c9d8..00000000000 --- a/resources/hiding_ci/linux_patches/0021-KVM-selftests-Add-va_start-end-into-uffd_desc.patch +++ /dev/null @@ -1,44 +0,0 @@ -From c64583011616045a4b70e34aeef6fd77e6f23ccc Mon Sep 17 00:00:00 2001 -From: James Houghton -Date: Thu, 9 Jan 2025 20:49:25 +0000 -Subject: [PATCH 21/34] KVM: selftests: Add va_start/end into uffd_desc - -This will be used for the self-test to look up which userfaultfd we -should be using when handling a KVM Userfault (in the event KVM -Userfault and userfaultfd are being used together). - -Signed-off-by: James Houghton ---- - tools/testing/selftests/kvm/include/userfaultfd_util.h | 2 ++ - tools/testing/selftests/kvm/lib/userfaultfd_util.c | 2 ++ - 2 files changed, 4 insertions(+) - -diff --git a/tools/testing/selftests/kvm/include/userfaultfd_util.h b/tools/testing/selftests/kvm/include/userfaultfd_util.h -index 60f7f9d435dc..b62fecdfe745 100644 ---- a/tools/testing/selftests/kvm/include/userfaultfd_util.h -+++ b/tools/testing/selftests/kvm/include/userfaultfd_util.h -@@ -30,6 +30,8 @@ struct uffd_desc { - int *pipefds; - pthread_t *readers; - struct uffd_reader_args *reader_args; -+ void *va_start; -+ void *va_end; - }; - - struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, -diff --git a/tools/testing/selftests/kvm/lib/userfaultfd_util.c b/tools/testing/selftests/kvm/lib/userfaultfd_util.c -index 7c9de8414462..93004c85bcdc 100644 ---- a/tools/testing/selftests/kvm/lib/userfaultfd_util.c -+++ b/tools/testing/selftests/kvm/lib/userfaultfd_util.c -@@ -152,6 +152,8 @@ struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, - expected_ioctls, "missing userfaultfd ioctls"); - - uffd_desc->uffd = uffd; -+ uffd_desc->va_start = hva; -+ 
uffd_desc->va_end = (char *)hva + len; - for (i = 0; i < uffd_desc->num_readers; ++i) { - int pipes[2]; - --- -2.47.1 - diff --git a/resources/hiding_ci/linux_patches/0028-mm-userfaultfd-generic-continue-for-non-hugetlbfs.patch b/resources/hiding_ci/linux_patches/0021-mm-userfaultfd-generic-continue-for-non-hugetlbfs.patch similarity index 97% rename from resources/hiding_ci/linux_patches/0028-mm-userfaultfd-generic-continue-for-non-hugetlbfs.patch rename to resources/hiding_ci/linux_patches/0021-mm-userfaultfd-generic-continue-for-non-hugetlbfs.patch index e610f502a9b..4818a87a713 100644 --- a/resources/hiding_ci/linux_patches/0028-mm-userfaultfd-generic-continue-for-non-hugetlbfs.patch +++ b/resources/hiding_ci/linux_patches/0021-mm-userfaultfd-generic-continue-for-non-hugetlbfs.patch @@ -1,7 +1,7 @@ -From 38fb84551a238b98a622433157fb2537ecb5611e Mon Sep 17 00:00:00 2001 +From 51dc7d27476d00d96f6f71882a11b5e17e80aa8f Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Mon, 31 Mar 2025 10:15:35 +0000 -Subject: [PATCH 28/34] mm: userfaultfd: generic continue for non hugetlbfs +Subject: [PATCH 21/26] mm: userfaultfd: generic continue for non hugetlbfs Remove shmem-specific code from UFFDIO_CONTINUE implementation for non-huge pages by calling vm_ops->fault(). 
A new VMF flag, diff --git a/resources/hiding_ci/linux_patches/0022-KVM-selftests-Inform-set_memory_region_test-of-KVM_M.patch b/resources/hiding_ci/linux_patches/0022-KVM-selftests-Inform-set_memory_region_test-of-KVM_M.patch deleted file mode 100644 index 22ec8dbcc83..00000000000 --- a/resources/hiding_ci/linux_patches/0022-KVM-selftests-Inform-set_memory_region_test-of-KVM_M.patch +++ /dev/null @@ -1,31 +0,0 @@ -From 73cef3464706c3665efcbac533979e83716b0d86 Mon Sep 17 00:00:00 2001 -From: James Houghton -Date: Thu, 9 Jan 2025 20:49:27 +0000 -Subject: [PATCH 22/34] KVM: selftests: Inform set_memory_region_test of - KVM_MEM_USERFAULT - -The KVM_MEM_USERFAULT flag is supported iff KVM_CAP_USERFAULT is -available. - -Signed-off-by: James Houghton ---- - tools/testing/selftests/kvm/set_memory_region_test.c | 3 +++ - 1 file changed, 3 insertions(+) - -diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c -index bc440d5aba57..56231c02d88c 100644 ---- a/tools/testing/selftests/kvm/set_memory_region_test.c -+++ b/tools/testing/selftests/kvm/set_memory_region_test.c -@@ -364,6 +364,9 @@ static void test_invalid_memory_region_flags(void) - if (kvm_check_cap(KVM_CAP_MEMORY_ATTRIBUTES) & KVM_MEMORY_ATTRIBUTE_PRIVATE) - supported_flags |= KVM_MEM_GUEST_MEMFD; - -+ if (kvm_check_cap(KVM_CAP_USERFAULT)) -+ supported_flags |= KVM_MEM_USERFAULT; -+ - for (i = 0; i < 32; i++) { - if ((supported_flags & BIT(i)) && !(v2_only_flags & BIT(i))) - continue; --- -2.47.1 - diff --git a/resources/hiding_ci/linux_patches/0029-mm-provide-can_userfault-vma-operation.patch b/resources/hiding_ci/linux_patches/0022-mm-provide-can_userfault-vma-operation.patch similarity index 95% rename from resources/hiding_ci/linux_patches/0029-mm-provide-can_userfault-vma-operation.patch rename to resources/hiding_ci/linux_patches/0022-mm-provide-can_userfault-vma-operation.patch index 0f939d066d1..b6bc10178cc 100644 --- 
a/resources/hiding_ci/linux_patches/0029-mm-provide-can_userfault-vma-operation.patch +++ b/resources/hiding_ci/linux_patches/0022-mm-provide-can_userfault-vma-operation.patch @@ -1,7 +1,7 @@ -From bcfff7f58b747aac6f27a51ce54efe5eae4b02f9 Mon Sep 17 00:00:00 2001 +From 7ed09f6e50ea4e4448e457a7b7712bdf3b38e826 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Fri, 4 Apr 2025 14:15:18 +0000 -Subject: [PATCH 29/34] mm: provide can_userfault vma operation +Subject: [PATCH 22/26] mm: provide can_userfault vma operation The new operation allows to decouple the userfaulfd code from dependencies to VMA types, specifically, shmem and hugetlb. The diff --git a/resources/hiding_ci/linux_patches/0023-KVM-selftests-Add-KVM-Userfault-mode-to-demand_pagin.patch b/resources/hiding_ci/linux_patches/0023-KVM-selftests-Add-KVM-Userfault-mode-to-demand_pagin.patch deleted file mode 100644 index d0b3c5e7489..00000000000 --- a/resources/hiding_ci/linux_patches/0023-KVM-selftests-Add-KVM-Userfault-mode-to-demand_pagin.patch +++ /dev/null @@ -1,381 +0,0 @@ -From 6aa865fbd91cb293da61db4e11f0ce1a6de36cae Mon Sep 17 00:00:00 2001 -From: James Houghton -Date: Thu, 9 Jan 2025 20:49:26 +0000 -Subject: [PATCH 23/34] KVM: selftests: Add KVM Userfault mode to - demand_paging_test - -Add a way for the KVM_RUN loop to handle -EFAULT exits when they are for -KVM_MEMORY_EXIT_FLAG_USERFAULT. In this case, preemptively handle the -UFFDIO_COPY or UFFDIO_CONTINUE if userfaultfd is also in use. This saves -the trip through the userfaultfd poll/read/WAKE loop. - -When preemptively handling UFFDIO_COPY/CONTINUE, do so with -MODE_DONTWAKE, as there will not be a thread to wake. If a thread *does* -take the userfaultfd slow path, we will get a regular userfault, and we -will call handle_uffd_page_request() which will do a full wake-up. In -the EEXIST case, a wake-up will not occur. Make sure to call UFFDIO_WAKE -explicitly in this case. 
- -When handling KVM userfaults, make sure to set the bitmap with -memory_order_release. Although it wouldn't affect the functionality of -the test (because memstress doesn't actually require any particular -guest memory contents), it is what userspace normally needs to do. - -Add `-k` to set the test to use KVM Userfault. - -Add the vm_mem_region_set_flags_userfault() helper for setting -`userfault_bitmap` and KVM_MEM_USERFAULT at the same time. - -Signed-off-by: James Houghton ---- - .../selftests/kvm/demand_paging_test.c | 139 +++++++++++++++++- - .../testing/selftests/kvm/include/kvm_util.h | 5 + - tools/testing/selftests/kvm/lib/kvm_util.c | 40 ++++- - 3 files changed, 176 insertions(+), 8 deletions(-) - -diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c -index 315f5c9037b4..183c70731093 100644 ---- a/tools/testing/selftests/kvm/demand_paging_test.c -+++ b/tools/testing/selftests/kvm/demand_paging_test.c -@@ -12,7 +12,9 @@ - #include - #include - #include -+#include - #include -+#include - - #include "kvm_util.h" - #include "test_util.h" -@@ -24,11 +26,21 @@ - #ifdef __NR_userfaultfd - - static int nr_vcpus = 1; -+static int num_uffds; - static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; - - static size_t demand_paging_size; -+static size_t host_page_size; - static char *guest_data_prototype; - -+static struct { -+ bool enabled; -+ int uffd_mode; /* set if userfaultfd is also in use */ -+ struct uffd_desc **uffd_descs; -+} kvm_userfault_data; -+ -+static void resolve_kvm_userfault(u64 gpa, u64 size); -+ - static void vcpu_worker(struct memstress_vcpu_args *vcpu_args) - { - struct kvm_vcpu *vcpu = vcpu_args->vcpu; -@@ -41,8 +53,22 @@ static void vcpu_worker(struct memstress_vcpu_args *vcpu_args) - clock_gettime(CLOCK_MONOTONIC, &start); - - /* Let the guest access its memory */ -+restart: - ret = _vcpu_run(vcpu); -- TEST_ASSERT(ret == 0, "vcpu_run failed: %d", ret); -+ if (ret < 0 && 
errno == EFAULT && kvm_userfault_data.enabled) { -+ /* Check for userfault. */ -+ TEST_ASSERT(run->exit_reason == KVM_EXIT_MEMORY_FAULT, -+ "Got invalid exit reason: %x", run->exit_reason); -+ TEST_ASSERT(run->memory_fault.flags == -+ KVM_MEMORY_EXIT_FLAG_USERFAULT, -+ "Got invalid memory fault exit: %llx", -+ run->memory_fault.flags); -+ resolve_kvm_userfault(run->memory_fault.gpa, -+ run->memory_fault.size); -+ goto restart; -+ } else -+ TEST_ASSERT(ret == 0, "vcpu_run failed: %d", ret); -+ - if (get_ucall(vcpu, NULL) != UCALL_SYNC) { - TEST_ASSERT(false, - "Invalid guest sync status: exit_reason=%s", -@@ -54,11 +80,10 @@ static void vcpu_worker(struct memstress_vcpu_args *vcpu_args) - ts_diff.tv_sec, ts_diff.tv_nsec); - } - --static int handle_uffd_page_request(int uffd_mode, int uffd, -- struct uffd_msg *msg) -+static int resolve_uffd_page_request(int uffd_mode, int uffd, uint64_t addr, -+ bool wake) - { - pid_t tid = syscall(__NR_gettid); -- uint64_t addr = msg->arg.pagefault.address; - struct timespec start; - struct timespec ts_diff; - int r; -@@ -71,7 +96,7 @@ static int handle_uffd_page_request(int uffd_mode, int uffd, - copy.src = (uint64_t)guest_data_prototype; - copy.dst = addr; - copy.len = demand_paging_size; -- copy.mode = 0; -+ copy.mode = wake ? 0 : UFFDIO_COPY_MODE_DONTWAKE; - - r = ioctl(uffd, UFFDIO_COPY, ©); - /* -@@ -96,6 +121,7 @@ static int handle_uffd_page_request(int uffd_mode, int uffd, - - cont.range.start = addr; - cont.range.len = demand_paging_size; -+ cont.mode = wake ? 0 : UFFDIO_CONTINUE_MODE_DONTWAKE; - - r = ioctl(uffd, UFFDIO_CONTINUE, &cont); - /* -@@ -119,6 +145,20 @@ static int handle_uffd_page_request(int uffd_mode, int uffd, - TEST_FAIL("Invalid uffd mode %d", uffd_mode); - } - -+ if (r < 0 && wake) { -+ /* -+ * No wake-up occurs when UFFDIO_COPY/CONTINUE fails, but we -+ * have a thread waiting. Wake it up. 
-+ */ -+ struct uffdio_range range = {0}; -+ -+ range.start = addr; -+ range.len = demand_paging_size; -+ -+ TEST_ASSERT(ioctl(uffd, UFFDIO_WAKE, &range) == 0, -+ "UFFDIO_WAKE failed: 0x%lx", addr); -+ } -+ - ts_diff = timespec_elapsed(start); - - PER_PAGE_DEBUG("UFFD page-in %d \t%ld ns\n", tid, -@@ -129,6 +169,58 @@ static int handle_uffd_page_request(int uffd_mode, int uffd, - return 0; - } - -+static int handle_uffd_page_request(int uffd_mode, int uffd, -+ struct uffd_msg *msg) -+{ -+ uint64_t addr = msg->arg.pagefault.address; -+ -+ return resolve_uffd_page_request(uffd_mode, uffd, addr, true); -+} -+ -+static void resolve_kvm_userfault(u64 gpa, u64 size) -+{ -+ struct kvm_vm *vm = memstress_args.vm; -+ struct userspace_mem_region *region; -+ unsigned long *bitmap_chunk; -+ u64 page, gpa_offset; -+ -+ region = (struct userspace_mem_region *) userspace_mem_region_find( -+ vm, gpa, (gpa + size - 1)); -+ -+ if (kvm_userfault_data.uffd_mode) { -+ /* -+ * Resolve userfaults early, without needing to read them -+ * off the userfaultfd. 
-+ */ -+ uint64_t hva = (uint64_t)addr_gpa2hva(vm, gpa); -+ struct uffd_desc **descs = kvm_userfault_data.uffd_descs; -+ int i, fd; -+ -+ for (i = 0; i < num_uffds; ++i) -+ if (hva >= (uint64_t)descs[i]->va_start && -+ hva < (uint64_t)descs[i]->va_end) -+ break; -+ -+ TEST_ASSERT(i < num_uffds, -+ "Did not find userfaultfd for hva: %lx", hva); -+ -+ fd = kvm_userfault_data.uffd_descs[i]->uffd; -+ resolve_uffd_page_request(kvm_userfault_data.uffd_mode, fd, -+ hva, false); -+ } else { -+ uint64_t hva = (uint64_t)addr_gpa2hva(vm, gpa); -+ -+ memcpy((char *)hva, guest_data_prototype, demand_paging_size); -+ } -+ -+ gpa_offset = gpa - region->region.guest_phys_addr; -+ page = gpa_offset / host_page_size; -+ bitmap_chunk = (unsigned long *)region->region.userfault_bitmap + -+ page / BITS_PER_LONG; -+ atomic_fetch_and_explicit((_Atomic unsigned long *)bitmap_chunk, -+ ~(1ul << (page % BITS_PER_LONG)), memory_order_release); -+} -+ - struct test_params { - int uffd_mode; - bool single_uffd; -@@ -136,6 +228,7 @@ struct test_params { - int readers_per_uffd; - enum vm_mem_backing_src_type src_type; - bool partition_vcpu_memory_access; -+ bool kvm_userfault; - }; - - static void prefault_mem(void *alias, uint64_t len) -@@ -149,6 +242,25 @@ static void prefault_mem(void *alias, uint64_t len) - } - } - -+static void enable_userfault(struct kvm_vm *vm, int slots) -+{ -+ for (int i = 0; i < slots; ++i) { -+ int slot = MEMSTRESS_MEM_SLOT_INDEX + i; -+ struct userspace_mem_region *region; -+ unsigned long *userfault_bitmap; -+ int flags = KVM_MEM_USERFAULT; -+ -+ region = memslot2region(vm, slot); -+ userfault_bitmap = bitmap_zalloc(region->mmap_size / -+ host_page_size); -+ /* everything is userfault initially */ -+ memset(userfault_bitmap, -1, region->mmap_size / host_page_size / CHAR_BIT); -+ printf("Setting bitmap: %p\n", userfault_bitmap); -+ vm_mem_region_set_flags_userfault(vm, slot, flags, -+ userfault_bitmap); -+ } -+} -+ - static void run_test(enum vm_guest_mode mode, void 
*arg) - { - struct memstress_vcpu_args *vcpu_args; -@@ -159,12 +271,13 @@ static void run_test(enum vm_guest_mode mode, void *arg) - struct timespec ts_diff; - double vcpu_paging_rate; - struct kvm_vm *vm; -- int i, num_uffds = 0; -+ int i; - - vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, - p->src_type, p->partition_vcpu_memory_access); - - demand_paging_size = get_backing_src_pagesz(p->src_type); -+ host_page_size = getpagesize(); - - guest_data_prototype = malloc(demand_paging_size); - TEST_ASSERT(guest_data_prototype, -@@ -208,6 +321,14 @@ static void run_test(enum vm_guest_mode mode, void *arg) - } - } - -+ if (p->kvm_userfault) { -+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_USERFAULT)); -+ kvm_userfault_data.enabled = true; -+ kvm_userfault_data.uffd_mode = p->uffd_mode; -+ kvm_userfault_data.uffd_descs = uffd_descs; -+ enable_userfault(vm, 1); -+ } -+ - pr_info("Finished creating vCPUs and starting uffd threads\n"); - - clock_gettime(CLOCK_MONOTONIC, &start); -@@ -265,6 +386,7 @@ static void help(char *name) - printf(" -v: specify the number of vCPUs to run.\n"); - printf(" -o: Overlap guest memory accesses instead of partitioning\n" - " them into a separate region of memory for each vCPU.\n"); -+ printf(" -k: Use KVM Userfault\n"); - puts(""); - exit(0); - } -@@ -283,7 +405,7 @@ int main(int argc, char *argv[]) - - guest_modes_append_default(); - -- while ((opt = getopt(argc, argv, "ahom:u:d:b:s:v:c:r:")) != -1) { -+ while ((opt = getopt(argc, argv, "ahokm:u:d:b:s:v:c:r:")) != -1) { - switch (opt) { - case 'm': - guest_modes_cmdline(optarg); -@@ -326,6 +448,9 @@ int main(int argc, char *argv[]) - "Invalid number of readers per uffd %d: must be >=1", - p.readers_per_uffd); - break; -+ case 'k': -+ p.kvm_userfault = true; -+ break; - case 'h': - default: - help(argv[0]); -diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h -index 4c4e5a847f67..0d49a9ce832a 100644 ---- 
a/tools/testing/selftests/kvm/include/kvm_util.h -+++ b/tools/testing/selftests/kvm/include/kvm_util.h -@@ -582,6 +582,8 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, - void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, - uint64_t guest_paddr, uint32_t slot, uint64_t npages, - uint32_t flags, int guest_memfd_fd, uint64_t guest_memfd_offset); -+struct userspace_mem_region * -+userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end); - - #ifndef vm_arch_has_protected_memory - static inline bool vm_arch_has_protected_memory(struct kvm_vm *vm) -@@ -591,6 +593,9 @@ static inline bool vm_arch_has_protected_memory(struct kvm_vm *vm) - #endif - - void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); -+void vm_mem_region_set_flags_userfault(struct kvm_vm *vm, uint32_t slot, -+ uint32_t flags, -+ unsigned long *userfault_bitmap); - void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); - void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); - struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id); -diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c -index a87988a162f1..a8f6b949ac59 100644 ---- a/tools/testing/selftests/kvm/lib/kvm_util.c -+++ b/tools/testing/selftests/kvm/lib/kvm_util.c -@@ -634,7 +634,7 @@ void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[], - * of the regions is returned. Null is returned only when no overlapping - * region exists. 
- */ --static struct userspace_mem_region * -+struct userspace_mem_region * - userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end) - { - struct rb_node *node; -@@ -1149,6 +1149,44 @@ void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags) - ret, errno, slot, flags); - } - -+/* -+ * VM Memory Region Flags Set with a userfault bitmap -+ * -+ * Input Args: -+ * vm - Virtual Machine -+ * flags - Flags for the memslot -+ * userfault_bitmap - The bitmap to use for KVM_MEM_USERFAULT -+ * -+ * Output Args: None -+ * -+ * Return: None -+ * -+ * Sets the flags of the memory region specified by the value of slot, -+ * to the values given by flags. This helper adds a way to provide a -+ * userfault_bitmap. -+ */ -+void vm_mem_region_set_flags_userfault(struct kvm_vm *vm, uint32_t slot, -+ uint32_t flags, -+ unsigned long *userfault_bitmap) -+{ -+ int ret; -+ struct userspace_mem_region *region; -+ -+ region = memslot2region(vm, slot); -+ -+ TEST_ASSERT(!userfault_bitmap ^ (flags & KVM_MEM_USERFAULT), -+ "KVM_MEM_USERFAULT must be specified with a bitmap"); -+ -+ region->region.flags = flags; -+ region->region.userfault_bitmap = (__u64)userfault_bitmap; -+ -+ ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); -+ -+ TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n" -+ " rc: %i errno: %i slot: %u flags: 0x%x", -+ ret, errno, slot, flags); -+} -+ - /* - * VM Memory Region Move - * --- -2.47.1 - diff --git a/resources/hiding_ci/linux_patches/0030-mm-userfaultfd-use-can_userfault-vma-operation.patch b/resources/hiding_ci/linux_patches/0023-mm-userfaultfd-use-can_userfault-vma-operation.patch similarity index 95% rename from resources/hiding_ci/linux_patches/0030-mm-userfaultfd-use-can_userfault-vma-operation.patch rename to resources/hiding_ci/linux_patches/0023-mm-userfaultfd-use-can_userfault-vma-operation.patch index 3344989cb31..ce5130bb620 100644 --- 
a/resources/hiding_ci/linux_patches/0030-mm-userfaultfd-use-can_userfault-vma-operation.patch +++ b/resources/hiding_ci/linux_patches/0023-mm-userfaultfd-use-can_userfault-vma-operation.patch @@ -1,7 +1,7 @@ -From 2c19b37bc94ef338ec540424a9a1eee95ffbdc3c Mon Sep 17 00:00:00 2001 +From 04555059b68ba6e2aeb678da706a8290e3598df0 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Fri, 4 Apr 2025 14:16:49 +0000 -Subject: [PATCH 30/34] mm: userfaultfd: use can_userfault vma operation +Subject: [PATCH 23/26] mm: userfaultfd: use can_userfault vma operation Signed-off-by: Nikita Kalyazin --- diff --git a/resources/hiding_ci/linux_patches/0031-KVM-guest_memfd-add-support-for-userfaultfd-minor.patch b/resources/hiding_ci/linux_patches/0024-KVM-guest_memfd-add-support-for-userfaultfd-minor.patch similarity index 90% rename from resources/hiding_ci/linux_patches/0031-KVM-guest_memfd-add-support-for-userfaultfd-minor.patch rename to resources/hiding_ci/linux_patches/0024-KVM-guest_memfd-add-support-for-userfaultfd-minor.patch index 4e544677625..37dc68e3989 100644 --- a/resources/hiding_ci/linux_patches/0031-KVM-guest_memfd-add-support-for-userfaultfd-minor.patch +++ b/resources/hiding_ci/linux_patches/0024-KVM-guest_memfd-add-support-for-userfaultfd-minor.patch @@ -1,7 +1,7 @@ -From 140a906e90e2ba2092148d80e0764e54802c947c Mon Sep 17 00:00:00 2001 +From b806003684d08506cb66c664efdfda3d7ff6103e Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Tue, 1 Apr 2025 15:02:56 +0000 -Subject: [PATCH 31/34] KVM: guest_memfd: add support for userfaultfd minor +Subject: [PATCH 24/26] KVM: guest_memfd: add support for userfaultfd minor Add support for sending a pagefault event if userfaultfd is registered. Only page minor event is currently supported. 
diff --git a/resources/hiding_ci/linux_patches/0024-KVM-selftests-Add-KVM_MEM_USERFAULT-guest_memfd-togg.patch b/resources/hiding_ci/linux_patches/0024-KVM-selftests-Add-KVM_MEM_USERFAULT-guest_memfd-togg.patch deleted file mode 100644 index 36b151a0f3c..00000000000 --- a/resources/hiding_ci/linux_patches/0024-KVM-selftests-Add-KVM_MEM_USERFAULT-guest_memfd-togg.patch +++ /dev/null @@ -1,65 +0,0 @@ -From be1d7a3ce1b177d64198b8e060bc9a3844f462cd Mon Sep 17 00:00:00 2001 -From: James Houghton -Date: Thu, 9 Jan 2025 20:49:28 +0000 -Subject: [PATCH 24/34] KVM: selftests: Add KVM_MEM_USERFAULT + guest_memfd - toggle tests - -Make sure KVM_MEM_USERFAULT can be toggled on and off for -KVM_MEM_GUEST_MEMFD memslots. - -Signed-off-by: James Houghton ---- - .../selftests/kvm/set_memory_region_test.c | 30 +++++++++++++++++++ - 1 file changed, 30 insertions(+) - -diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c -index 56231c02d88c..95d315b976df 100644 ---- a/tools/testing/selftests/kvm/set_memory_region_test.c -+++ b/tools/testing/selftests/kvm/set_memory_region_test.c -@@ -608,6 +608,35 @@ static void test_mmio_during_vectoring(void) - } - #endif - -+static void test_private_memory_region_userfault(void) -+{ -+ struct kvm_vm *vm; -+ int memfd; -+ -+ pr_info("Testing toggling KVM_MEM_USERFAULT on KVM_MEM_GUEST_MEMFD memory regions\n"); -+ -+ vm = vm_create_barebones_type(KVM_X86_SW_PROTECTED_VM); -+ -+ test_invalid_guest_memfd(vm, vm->kvm_fd, 0, "KVM fd should fail"); -+ test_invalid_guest_memfd(vm, vm->fd, 0, "VM's fd should fail"); -+ -+ memfd = vm_create_guest_memfd(vm, MEM_REGION_SIZE, 0); -+ -+ vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, -+ MEM_REGION_GPA, MEM_REGION_SIZE, 0, memfd, 0); -+ -+ vm_set_user_memory_region2(vm, MEM_REGION_SLOT, -+ KVM_MEM_GUEST_MEMFD | KVM_MEM_USERFAULT, -+ MEM_REGION_GPA, MEM_REGION_SIZE, 0, memfd, 0); -+ -+ vm_set_user_memory_region2(vm, 
MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, -+ MEM_REGION_GPA, MEM_REGION_SIZE, 0, memfd, 0); -+ -+ close(memfd); -+ -+ kvm_vm_free(vm); -+} -+ - int main(int argc, char *argv[]) - { - #ifdef __x86_64__ -@@ -633,6 +662,7 @@ int main(int argc, char *argv[]) - (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM))) { - test_add_private_memory_region(); - test_add_overlapping_private_memory_regions(); -+ test_private_memory_region_userfault(); - } else { - pr_info("Skipping tests for KVM_MEM_GUEST_MEMFD memory regions\n"); - } --- -2.47.1 - diff --git a/resources/hiding_ci/linux_patches/0025-KVM-Documentation-Add-KVM_CAP_USERFAULT-and-KVM_MEM_.patch b/resources/hiding_ci/linux_patches/0025-KVM-Documentation-Add-KVM_CAP_USERFAULT-and-KVM_MEM_.patch deleted file mode 100644 index 68aa5a42ad1..00000000000 --- a/resources/hiding_ci/linux_patches/0025-KVM-Documentation-Add-KVM_CAP_USERFAULT-and-KVM_MEM_.patch +++ /dev/null @@ -1,76 +0,0 @@ -From fb4c74191df7821bf047af099473dd8f20948b43 Mon Sep 17 00:00:00 2001 -From: James Houghton -Date: Thu, 9 Jan 2025 20:49:29 +0000 -Subject: [PATCH 25/34] KVM: Documentation: Add KVM_CAP_USERFAULT and - KVM_MEM_USERFAULT details - -Include the note about memory ordering when clearing bits in -userfault_bitmap, as it may not be obvious for users. - -Signed-off-by: James Houghton -Reviewed-by: Bagas Sanjaya ---- - Documentation/virt/kvm/api.rst | 33 ++++++++++++++++++++++++++++++++- - 1 file changed, 32 insertions(+), 1 deletion(-) - -diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst -index 2b52eb77e29c..3ec3d0bdb18a 100644 ---- a/Documentation/virt/kvm/api.rst -+++ b/Documentation/virt/kvm/api.rst -@@ -6287,7 +6287,8 @@ bounds checks apply (use common sense). - __u64 guest_memfd_offset; - __u32 guest_memfd; - __u32 pad1; -- __u64 pad2[14]; -+ __u64 userfault_bitmap; -+ __u64 pad2[13]; - }; - - A KVM_MEM_GUEST_MEMFD region _must_ have a valid guest_memfd (private memory) and -@@ -6303,6 +6304,25 @@ state. 
At VM creation time, all memory is shared, i.e. the PRIVATE attribute - is '0' for all gfns. Userspace can control whether memory is shared/private by - toggling KVM_MEMORY_ATTRIBUTE_PRIVATE via KVM_SET_MEMORY_ATTRIBUTES as needed. - -+When the KVM_MEM_USERFAULT flag is set, userfault_bitmap points to the starting -+address for the bitmap that controls if vCPU memory faults should immediately -+exit to userspace. If an invalid pointer is provided, at fault time, KVM_RUN -+will return -EFAULT. KVM_MEM_USERFAULT is only supported when -+KVM_CAP_USERFAULT is supported. -+ -+userfault_bitmap should point to an array of longs where each bit in the array -+linearly corresponds to a single gfn. Bit 0 in userfault_bitmap corresponds to -+guest_phys_addr, bit 1 corresponds to guest_phys_addr + PAGE_SIZE, etc. If the -+bit for a page is set, any vCPU access to that page will exit to userspace with -+KVM_MEMORY_EXIT_FLAG_USERFAULT. -+ -+Setting bits in userfault_bitmap has no effect on pages that have already been -+mapped by KVM until KVM_MEM_USERFAULT is disabled and re-enabled again. -+ -+Clearing bits in userfault_bitmap should usually be done with a store-release -+if changes to guest memory are being made available to the guest via -+userfault_bitmap. -+ - S390: - ^^^^^ - -@@ -8258,6 +8278,17 @@ KVM exits with the register state of either the L1 or L2 guest - depending on which executed at the time of an exit. Userspace must - take care to differentiate between these cases. - -+7.37 KVM_CAP_USERFAULT -+---------------------- -+ -+:Architectures: x86, arm64 -+:Returns: Informational only, -EINVAL on direct KVM_ENABLE_CAP. -+ -+The presence of this capability indicates that KVM_SET_USER_MEMORY_REGION2 will -+accept KVM_MEM_USERFAULT as a valid memslot flag. -+ -+See KVM_SET_USER_MEMORY_REGION2 for more details. -+ - 8. Other capabilities. 
- ====================== - --- -2.47.1 - diff --git a/resources/hiding_ci/linux_patches/0032-mm-userfaultfd-add-UFFD_FEATURE_MINOR_GUEST_MEMFD.patch b/resources/hiding_ci/linux_patches/0025-mm-userfaultfd-add-UFFD_FEATURE_MINOR_GUEST_MEMFD.patch similarity index 94% rename from resources/hiding_ci/linux_patches/0032-mm-userfaultfd-add-UFFD_FEATURE_MINOR_GUEST_MEMFD.patch rename to resources/hiding_ci/linux_patches/0025-mm-userfaultfd-add-UFFD_FEATURE_MINOR_GUEST_MEMFD.patch index 3700d496c49..777a2b05e66 100644 --- a/resources/hiding_ci/linux_patches/0032-mm-userfaultfd-add-UFFD_FEATURE_MINOR_GUEST_MEMFD.patch +++ b/resources/hiding_ci/linux_patches/0025-mm-userfaultfd-add-UFFD_FEATURE_MINOR_GUEST_MEMFD.patch @@ -1,7 +1,7 @@ -From 27e27a59e6139f780439f13cf7180f06c5b0d518 Mon Sep 17 00:00:00 2001 +From 6c5886204ff8d306cc4ee945235c88eb854ebf7f Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Fri, 4 Apr 2025 14:18:03 +0000 -Subject: [PATCH 32/34] mm: userfaultfd: add UFFD_FEATURE_MINOR_GUEST_MEMFD +Subject: [PATCH 25/26] mm: userfaultfd: add UFFD_FEATURE_MINOR_GUEST_MEMFD Signed-off-by: Nikita Kalyazin --- diff --git a/resources/hiding_ci/linux_patches/0026-fixup-for-guest_memfd-uffd-v3.patch b/resources/hiding_ci/linux_patches/0026-fixup-for-guest_memfd-uffd-v3.patch new file mode 100644 index 00000000000..2aa0a3bea09 --- /dev/null +++ b/resources/hiding_ci/linux_patches/0026-fixup-for-guest_memfd-uffd-v3.patch @@ -0,0 +1,70 @@ +From d950436a063f021ae0d925509363106625eafe0f Mon Sep 17 00:00:00 2001 +From: Nikita Kalyazin +Date: Thu, 10 Apr 2025 14:18:53 +0000 +Subject: [PATCH 26/26] fixup for guest_memfd uffd v3 + + - implement can_userfault for guest_memfd + - check vma->vm_ops pointer before dereferencing + - proper check for VM_UFFD_MINOR +--- + include/linux/userfaultfd_k.h | 6 ++++-- + mm/userfaultfd.c | 4 +++- + virt/kvm/guest_memfd.c | 9 ++++++++- + 3 files changed, 15 insertions(+), 4 deletions(-) + +diff --git a/include/linux/userfaultfd_k.h 
b/include/linux/userfaultfd_k.h +index 64551e8a55fb..080437fa7eab 100644 +--- a/include/linux/userfaultfd_k.h ++++ b/include/linux/userfaultfd_k.h +@@ -221,8 +221,10 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, + if (vm_flags & VM_DROPPABLE) + return false; + +- if (!vma->vm_ops->can_userfault || +- !vma->vm_ops->can_userfault(vma, VM_UFFD_MINOR)) ++ if ((vm_flags & VM_UFFD_MINOR) && ++ (!vma->vm_ops || ++ !vma->vm_ops->can_userfault || ++ !vma->vm_ops->can_userfault(vma, VM_UFFD_MINOR))) + return false; + + /* +diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c +index 0aa82c968e16..638360a78561 100644 +--- a/mm/userfaultfd.c ++++ b/mm/userfaultfd.c +@@ -788,7 +788,9 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, + return mfill_atomic_hugetlb(ctx, dst_vma, dst_start, + src_start, len, flags); + +- can_userfault = dst_vma->vm_ops->can_userfault && ++ can_userfault = ++ dst_vma->vm_ops && ++ dst_vma->vm_ops->can_userfault && + dst_vma->vm_ops->can_userfault(dst_vma, __VM_UFFD_FLAGS); + + if (!vma_is_anonymous(dst_vma) && !can_userfault) +diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c +index 91ee5dd91c31..202b12dc4b6f 100644 +--- a/virt/kvm/guest_memfd.c ++++ b/virt/kvm/guest_memfd.c +@@ -420,8 +420,15 @@ static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf) + return ret; + } + ++static bool kvm_gmem_can_userfault(struct vm_area_struct *vma, ++ unsigned long vm_flags) ++{ ++ return vm_flags & VM_UFFD_MINOR; ++} ++ + static const struct vm_operations_struct kvm_gmem_vm_ops = { +- .fault = kvm_gmem_fault, ++ .fault = kvm_gmem_fault, ++ .can_userfault = kvm_gmem_can_userfault, + }; + + static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma) +-- +2.47.1 + diff --git a/resources/hiding_ci/linux_patches/0033-KVM-selftests-test-userfaultfd-minor-for-guest_memfd.patch b/resources/hiding_ci/linux_patches/0033-KVM-selftests-test-userfaultfd-minor-for-guest_memfd.patch deleted file mode 100644 index 
bfade8ef68a..00000000000 --- a/resources/hiding_ci/linux_patches/0033-KVM-selftests-test-userfaultfd-minor-for-guest_memfd.patch +++ /dev/null @@ -1,146 +0,0 @@ -From 4647a42c34b3896fd872f8b2991b55827d084a38 Mon Sep 17 00:00:00 2001 -From: Nikita Kalyazin -Date: Fri, 28 Feb 2025 16:17:41 +0000 -Subject: [PATCH 33/34] KVM: selftests: test userfaultfd minor for guest_memfd - -The test demonstrates that a minor userfaultfd event in guest_memfd can -be resolved via a memcpy followed by a UFFDIO_CONTINUE ioctl. - -Signed-off-by: Nikita Kalyazin ---- - .../testing/selftests/kvm/guest_memfd_test.c | 99 +++++++++++++++++++ - 1 file changed, 99 insertions(+) - -diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c -index b07221aa54c9..c9578f0ce46f 100644 ---- a/tools/testing/selftests/kvm/guest_memfd_test.c -+++ b/tools/testing/selftests/kvm/guest_memfd_test.c -@@ -10,12 +10,16 @@ - #include - #include - #include -+#include - - #include - #include -+#include - #include - #include - #include -+#include -+#include - - #include "kvm_util.h" - #include "test_util.h" -@@ -278,6 +282,98 @@ static void test_create_guest_memfd_multiple(struct kvm_vm *vm) - close(fd1); - } - -+struct fault_args { -+ char *addr; -+ volatile char value; -+}; -+ -+static void *fault_thread_fn(void *arg) -+{ -+ struct fault_args *args = arg; -+ -+ /* Trigger page fault */ -+ args->value = *args->addr; -+ return NULL; -+} -+ -+static void test_uffd_minor(int fd, size_t page_size, size_t total_size) -+{ -+ struct uffdio_register uffd_reg; -+ struct uffdio_continue uffd_cont; -+ struct uffd_msg msg; -+ struct fault_args args; -+ pthread_t fault_thread; -+ void *mem, *mem_nofault, *buf = NULL; -+ int uffd, ret; -+ off_t offset = page_size; -+ void *fault_addr; -+ -+ ret = posix_memalign(&buf, page_size, total_size); -+ TEST_ASSERT_EQ(ret, 0); -+ -+ uffd = syscall(__NR_userfaultfd, O_CLOEXEC); -+ TEST_ASSERT(uffd != -1, "userfaultfd creation should 
succeed"); -+ -+ struct uffdio_api uffdio_api = { -+ .api = UFFD_API, -+ .features = UFFD_FEATURE_MINOR_GUEST_MEMFD, -+ }; -+ ret = ioctl(uffd, UFFDIO_API, &uffdio_api); -+ TEST_ASSERT(ret != -1, "ioctl(UFFDIO_API) should succeed"); -+ -+ mem = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); -+ TEST_ASSERT(mem != MAP_FAILED, "mmap should succeed"); -+ -+ mem_nofault = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); -+ TEST_ASSERT(mem_nofault != MAP_FAILED, "mmap should succeed"); -+ -+ uffd_reg.range.start = (unsigned long)mem; -+ uffd_reg.range.len = total_size; -+ uffd_reg.mode = UFFDIO_REGISTER_MODE_MINOR; -+ ret = ioctl(uffd, UFFDIO_REGISTER, &uffd_reg); -+ TEST_ASSERT(ret != -1, "ioctl(UFFDIO_REGISTER) should succeed"); -+ -+ ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, -+ offset, page_size); -+ TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) should succeed"); -+ -+ fault_addr = mem + offset; -+ args.addr = fault_addr; -+ -+ ret = pthread_create(&fault_thread, NULL, fault_thread_fn, &args); -+ TEST_ASSERT(ret == 0, "pthread_create should succeed"); -+ -+ ret = read(uffd, &msg, sizeof(msg)); -+ TEST_ASSERT(ret != -1, "read from userfaultfd should succeed"); -+ TEST_ASSERT(msg.event == UFFD_EVENT_PAGEFAULT, "event type should be pagefault"); -+ TEST_ASSERT((void *)(msg.arg.pagefault.address & ~(page_size - 1)) == fault_addr, -+ "pagefault should occur at expected address"); -+ -+ memcpy(mem_nofault + offset, buf + offset, page_size); -+ -+ uffd_cont.range.start = (unsigned long)fault_addr; -+ uffd_cont.range.len = page_size; -+ uffd_cont.mode = 0; -+ ret = ioctl(uffd, UFFDIO_CONTINUE, &uffd_cont); -+ TEST_ASSERT(ret != -1, "ioctl(UFFDIO_CONTINUE) should succeed"); -+ -+ TEST_ASSERT(args.value == *(char *)(mem_nofault + offset), -+ "memory should contain the value that was copied"); -+ TEST_ASSERT(args.value == *(char *)(mem + offset), -+ "no further fault is expected"); -+ -+ ret = pthread_join(fault_thread, 
NULL); -+ TEST_ASSERT(ret == 0, "pthread_join should succeed"); -+ -+ ret = munmap(mem_nofault, total_size); -+ TEST_ASSERT(!ret, "munmap should succeed"); -+ -+ ret = munmap(mem, total_size); -+ TEST_ASSERT(!ret, "munmap should succeed"); -+ free(buf); -+ close(uffd); -+} -+ - unsigned long get_shared_type(void) - { - #ifdef __x86_64__ -@@ -317,6 +413,9 @@ void test_vm_type(unsigned long type, bool is_shared) - test_fallocate(fd, page_size, total_size); - test_invalid_punch_hole(fd, page_size, total_size); - -+ if (is_shared) -+ test_uffd_minor(fd, page_size, total_size); -+ - close(fd); - kvm_vm_release(vm); - } --- -2.47.1 - diff --git a/resources/hiding_ci/linux_patches/0034-uffd-v3-fixup.patch b/resources/hiding_ci/linux_patches/0034-uffd-v3-fixup.patch deleted file mode 100644 index 41df6ff2428..00000000000 --- a/resources/hiding_ci/linux_patches/0034-uffd-v3-fixup.patch +++ /dev/null @@ -1,50 +0,0 @@ -From a03bf9042094cc0fd1b2a71307b6e0b02e8500d8 Mon Sep 17 00:00:00 2001 -From: Nikita Kalyazin -Date: Thu, 10 Apr 2025 14:18:53 +0000 -Subject: [PATCH 34/34] uffd v3 fixup - - - implement can_userfault for guest_memfd - - check vma->vm_ops pointer before dereferencing ---- - include/linux/userfaultfd_k.h | 3 ++- - virt/kvm/guest_memfd.c | 9 ++++++++- - 2 files changed, 10 insertions(+), 2 deletions(-) - -diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h -index 64551e8a55fb..92fb5372bea5 100644 ---- a/include/linux/userfaultfd_k.h -+++ b/include/linux/userfaultfd_k.h -@@ -221,7 +221,8 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, - if (vm_flags & VM_DROPPABLE) - return false; - -- if (!vma->vm_ops->can_userfault || -+ if (!vma->vm_ops || -+ !vma->vm_ops->can_userfault || - !vma->vm_ops->can_userfault(vma, VM_UFFD_MINOR)) - return false; - -diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c -index 91ee5dd91c31..202b12dc4b6f 100644 ---- a/virt/kvm/guest_memfd.c -+++ b/virt/kvm/guest_memfd.c -@@ -420,8 +420,15 
@@ static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf) - return ret; - } - -+static bool kvm_gmem_can_userfault(struct vm_area_struct *vma, -+ unsigned long vm_flags) -+{ -+ return vm_flags & VM_UFFD_MINOR; -+} -+ - static const struct vm_operations_struct kvm_gmem_vm_ops = { -- .fault = kvm_gmem_fault, -+ .fault = kvm_gmem_fault, -+ .can_userfault = kvm_gmem_can_userfault, - }; - - static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma) --- -2.47.1 - From b8c1f7babe764630487c59916d42547883eb5823 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Mon, 24 Mar 2025 12:34:56 +0000 Subject: [PATCH 38/40] test: run throughput perf tests with secret freedom enabled Additionally parametrize some of our throughput performance tests (network, block and vsock) by memory config, so that they run with secret freedom (and hence bounce buffering) enabled. Also add it to the boottime test, because bouncing can impact the time taken to read the rootfs. Skip them on m6g.metal because secret freedom does not work here for architectural reasons (and our patches do not take this into account, so trying to use secret freedom here would result in host kernel panics). 
Signed-off-by: Patrick Roy --- tests/conftest.py | 14 ++++++++ tests/framework/microvm.py | 11 +++++++ .../performance/test_block_ab.py | 8 ++++- .../performance/test_boottime.py | 32 +++++++------------ .../performance/test_network_ab.py | 6 ++-- .../performance/test_vsock_ab.py | 13 ++++++-- 6 files changed, 58 insertions(+), 26 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 4591cd8112d..c94e3bb8f31 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -386,6 +386,20 @@ def io_engine(request): return request.param +secret_free_test_cases = [False] +if ( + global_props.host_linux_version_metrics == "next" + and global_props.instance != "m6g.metal" +): + secret_free_test_cases.append(True) + + +@pytest.fixture(params=secret_free_test_cases) +def secret_free(request): + """Supported secret hiding configuration, based on hardware""" + return request.param + + @pytest.fixture def results_dir(request): """ diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index 00ceaac82c2..40d677752aa 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -249,6 +249,7 @@ def __init__( self.disks_vhost_user = {} self.vcpus_count = None self.mem_size_bytes = None + self.secret_free = False self.cpu_template_name = "None" # The given custom CPU template will be set in basic_config() but could # be overwritten via set_cpu_template(). @@ -463,6 +464,7 @@ def dimensions(self): "rootfs": self.rootfs_file.name, "vcpus": str(self.vcpus_count), "guest_memory": f"{self.mem_size_bytes / (1024 * 1024)}MB", + "secret_free": str(self.secret_free or False), } @property @@ -730,6 +732,7 @@ def basic_config( rootfs_io_engine=None, cpu_template: Optional[str] = None, enable_entropy_device=False, + secret_free=None, ): """Shortcut for quickly configuring a microVM. 
@@ -748,15 +751,23 @@ def basic_config( Reference: file:../../src/vmm/src/vmm_config/boot_source.rs::DEFAULT_KERNEL_CMDLINE """ + # Have to do it this way as otherwise A/B-tests fail if the 'A' revision + # of Firecracker doesn't know about the secret_free parameter. + kwargs = {} + if secret_free: + kwargs["secret_free"] = True + self.api.machine_config.put( vcpu_count=vcpu_count, smt=smt, mem_size_mib=mem_size_mib, track_dirty_pages=track_dirty_pages, huge_pages=huge_pages, + **kwargs, ) self.vcpus_count = vcpu_count self.mem_size_bytes = mem_size_mib * 2**20 + self.secret_free = secret_free or False if self.custom_cpu_template is not None: self.set_cpu_template(self.custom_cpu_template) diff --git a/tests/integration_tests/performance/test_block_ab.py b/tests/integration_tests/performance/test_block_ab.py index f38872cdc94..aa1facb634c 100644 --- a/tests/integration_tests/performance/test_block_ab.py +++ b/tests/integration_tests/performance/test_block_ab.py @@ -163,14 +163,20 @@ def test_block_performance( fio_block_size, fio_engine, io_engine, + secret_free, metrics, ): """ Execute block device emulation benchmarking scenarios. """ + if secret_free and io_engine == "Async": + pytest.skip("userspace bounce buffers not supported with async block engine") + vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False) vm.spawn(log_level="Info", emit_metrics=True) - vm.basic_config(vcpu_count=vcpus, mem_size_mib=GUEST_MEM_MIB) + vm.basic_config( + vcpu_count=vcpus, mem_size_mib=GUEST_MEM_MIB, secret_free=secret_free + ) vm.add_net_iface() # Add a secondary block device for benchmark tests. 
fs = drive_tools.FilesystemFile( diff --git a/tests/integration_tests/performance/test_boottime.py b/tests/integration_tests/performance/test_boottime.py index 4a2e6b61b70..264996cf7dc 100644 --- a/tests/integration_tests/performance/test_boottime.py +++ b/tests/integration_tests/performance/test_boottime.py @@ -8,8 +8,6 @@ import pytest -from framework.properties import global_props - # Regex for obtaining boot time from some string. TIMESTAMP_LOG_REGEX = r"Guest-boot-time\s+\=\s+(\d+)\s+us" @@ -19,14 +17,6 @@ ) -DIMENSIONS = { - "instance": global_props.instance, - "cpu_model": global_props.cpu_model, - "host_os": global_props.host_os, - "host_kernel": "linux-" + global_props.host_linux_version_metrics, -} - - def _get_microvm_boottime(vm): """Auxiliary function for asserting the expected boot time.""" boot_time_us = None @@ -75,20 +65,16 @@ def find_events(log_data): ) @pytest.mark.nonci def test_boottime( - microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib, metrics + microvm_factory, + guest_kernel_acpi, + rootfs_rw, + vcpu_count, + mem_size_mib, + secret_free, + metrics, ): """Test boot time with different guest configurations""" - metrics.set_dimensions( - { - **DIMENSIONS, - "performance_test": "test_boottime", - "guest_kernel": guest_kernel_acpi.name, - "vcpus": str(vcpu_count), - "mem_size_mib": str(mem_size_mib), - } - ) - for _ in range(10): vm = microvm_factory.build(guest_kernel_acpi, rootfs_rw) vm.jailer.extra_args.update({"boot-timer": None}) @@ -98,10 +84,14 @@ def test_boottime( mem_size_mib=mem_size_mib, boot_args=DEFAULT_BOOT_ARGS + " init=/usr/local/bin/init", enable_entropy_device=True, + secret_free=secret_free, ) vm.add_net_iface() vm.start() vm.pin_threads(0) + + metrics.set_dimensions({"performance_test": "test_boottime", **vm.dimensions}) + boottime_us = _get_microvm_boottime(vm) metrics.put_metric("boot_time", boottime_us, unit="Microseconds") timestamps = find_events(vm.log_data) diff --git 
a/tests/integration_tests/performance/test_network_ab.py b/tests/integration_tests/performance/test_network_ab.py index 3a50d864544..3ac14a2c16f 100644 --- a/tests/integration_tests/performance/test_network_ab.py +++ b/tests/integration_tests/performance/test_network_ab.py @@ -36,7 +36,7 @@ def consume_ping_output(ping_putput, request_per_round): @pytest.fixture -def network_microvm(request, microvm_factory, guest_kernel_acpi, rootfs): +def network_microvm(request, microvm_factory, guest_kernel_acpi, rootfs, secret_free): """Creates a microvm with the networking setup used by the performance tests in this file. This fixture receives its vcpu count via indirect parameterization""" @@ -45,7 +45,9 @@ def network_microvm(request, microvm_factory, guest_kernel_acpi, rootfs): vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False) vm.spawn(log_level="Info", emit_metrics=True) - vm.basic_config(vcpu_count=guest_vcpus, mem_size_mib=guest_mem_mib) + vm.basic_config( + vcpu_count=guest_vcpus, mem_size_mib=guest_mem_mib, secret_free=secret_free + ) vm.add_net_iface() vm.start() vm.pin_threads(0) diff --git a/tests/integration_tests/performance/test_vsock_ab.py b/tests/integration_tests/performance/test_vsock_ab.py index bcee05528af..9cd08e312d5 100644 --- a/tests/integration_tests/performance/test_vsock_ab.py +++ b/tests/integration_tests/performance/test_vsock_ab.py @@ -73,7 +73,14 @@ def guest_command(self, port_offset): @pytest.mark.parametrize("payload_length", ["64K", "1024K"], ids=["p64K", "p1024K"]) @pytest.mark.parametrize("mode", ["g2h", "h2g", "bd"]) def test_vsock_throughput( - microvm_factory, guest_kernel_acpi, rootfs, vcpus, payload_length, mode, metrics + microvm_factory, + guest_kernel_acpi, + rootfs, + vcpus, + payload_length, + mode, + metrics, + secret_free, ): """ Test vsock throughput for multiple vm configurations. 
@@ -87,7 +94,9 @@ def test_vsock_throughput( mem_size_mib = 1024 vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False) vm.spawn(log_level="Info", emit_metrics=True) - vm.basic_config(vcpu_count=vcpus, mem_size_mib=mem_size_mib) + vm.basic_config( + vcpu_count=vcpus, mem_size_mib=mem_size_mib, secret_free=secret_free + ) vm.add_net_iface() # Create a vsock device vm.api.vsock.put(vsock_id="vsock0", guest_cid=3, uds_path="/" + VSOCK_UDS_PATH) From 92a2b7839f68f62acb17aaf1b5a98f8ed59aa2e3 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Fri, 11 Apr 2025 15:16:10 +0100 Subject: [PATCH 39/40] test: add functional tests for booting secret free VMs Add a test that we can boot VMs and initrds with secret freedom enabled. Signed-off-by: Patrick Roy --- .../functional/test_secret_freedom.py | 75 +++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 tests/integration_tests/functional/test_secret_freedom.py diff --git a/tests/integration_tests/functional/test_secret_freedom.py b/tests/integration_tests/functional/test_secret_freedom.py new file mode 100644 index 00000000000..5f9758bb88c --- /dev/null +++ b/tests/integration_tests/functional/test_secret_freedom.py @@ -0,0 +1,75 @@ +# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 +"""Test secret-freedom related functionality.""" + +import pytest + +from framework import defs +from framework.microvm import Serial +from framework.properties import global_props +from integration_tests.performance.test_initrd import INITRD_FILESYSTEM + +pytestmark = [ + pytest.mark.skipif( + global_props.host_linux_version_metrics != "next", + reason="Secret Freedom is only supported on the in-dev upstream kernels for now", + ), + pytest.mark.skipif( + global_props.instance == "m6g.metal", + reason="Secret Freedom currently only works on ARM hardware conforming to at least ARMv8.4 as absence of ARM64_HAS_STAGE2_FWB causes kernel panics because of dcache flushing during stage2 page table entry installation", + ), +] + + +def test_secret_free_boot(microvm_factory, guest_kernel, rootfs): + """Tests that a VM can boot, e.g. some basic I/O works through userspace bounce buffers""" + vm = microvm_factory.build(guest_kernel, rootfs) + vm.spawn() + vm.memory_monitor = None + vm.basic_config(secret_free=True) + vm.add_net_iface() + vm.start() + + +def test_secret_free_initrd(microvm_factory, guest_kernel): + """ + Test that we can boot a secret hidden initrd (e.g. 
a VM with no I/O devices) + """ + fs = defs.ARTIFACT_DIR / "initramfs.cpio" + uvm = microvm_factory.build(guest_kernel) + uvm.initrd_file = fs + uvm.help.enable_console() + uvm.spawn() + uvm.memory_monitor = None + + uvm.basic_config( + add_root_device=False, + vcpu_count=1, + boot_args="console=ttyS0 reboot=k panic=1 pci=off", + use_initrd=True, + secret_free=True, + ) + + uvm.start() + serial = Serial(uvm) + serial.open() + serial.rx(token="# ") + serial.tx("mount |grep rootfs") + serial.rx(token=f"rootfs on / type {INITRD_FILESYSTEM}") + + +def test_secret_free_snapshot_creation(microvm_factory, guest_kernel, rootfs): + """Test that snapshot creation works for secret hidden VMs""" + vm = microvm_factory.build(guest_kernel, rootfs) + vm.spawn() + vm.memory_monitor = None + vm.basic_config(secret_free=True) + vm.add_net_iface() + vm.start() + + snapshot = vm.snapshot_full() + + # After restoration, the VM will not be secret hidden anymore, as that's not supported yet. + # But we can at least test that in principle, the snapshot creation worked. + vm = microvm_factory.build_from_snapshot(snapshot) + vm.ssh.check_output("true") From dc53e8d4dc10c91c8e7863aab5b7d54cb5df78cc Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Tue, 8 Apr 2025 17:04:25 +0100 Subject: [PATCH 40/40] test: disable memory monitor in boottime tests Since we load the kernel using bounce buffers now, it will give us false-positives. 
Signed-off-by: Patrick Roy --- tests/integration_tests/performance/test_boottime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration_tests/performance/test_boottime.py b/tests/integration_tests/performance/test_boottime.py index 264996cf7dc..30568dea1e9 100644 --- a/tests/integration_tests/performance/test_boottime.py +++ b/tests/integration_tests/performance/test_boottime.py @@ -76,7 +76,7 @@ def test_boottime( """Test boot time with different guest configurations""" for _ in range(10): - vm = microvm_factory.build(guest_kernel_acpi, rootfs_rw) + vm = microvm_factory.build(guest_kernel_acpi, rootfs_rw, monitor_memory=False) vm.jailer.extra_args.update({"boot-timer": None}) vm.spawn() vm.basic_config(