diff --git a/.github/workflows/audit.yaml b/.github/workflows/audit.yaml deleted file mode 100644 index cfc21696e1..0000000000 --- a/.github/workflows/audit.yaml +++ /dev/null @@ -1,16 +0,0 @@ -name: Cloud Hypervisor Dependency Audit -on: - pull_request: - paths: - - '**/Cargo.toml' - - '**/Cargo.lock' - -jobs: - security_audit: - name: Audit - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v5 - - uses: actions-rust-lang/audit@v1 - with: - token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 286c2af548..b368247da4 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -13,21 +13,16 @@ jobs: matrix: rust: - stable - - beta - nightly - "1.88.0" target: - x86_64-unknown-linux-gnu - - x86_64-unknown-linux-musl steps: - name: Code checkout uses: actions/checkout@v5 with: fetch-depth: 0 - - name: Install musl-gcc - run: sudo apt install -y musl-tools - - name: Install Rust toolchain (${{ matrix.rust }}) uses: dtolnay/rust-toolchain@stable with: @@ -35,40 +30,40 @@ jobs: target: ${{ matrix.target }} - name: Build (default features) - run: cargo rustc --locked --bin cloud-hypervisor -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + run: cargo build --locked --bin cloud-hypervisor - name: Build (kvm) - run: cargo rustc --locked --bin cloud-hypervisor --no-default-features --features "kvm" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + run: cargo build --locked --bin cloud-hypervisor --no-default-features --features "kvm" - name: Build (default features + tdx) - run: cargo rustc --locked --bin cloud-hypervisor --features "tdx" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + run: cargo build --locked --bin cloud-hypervisor --features "tdx" - name: Build (default features + dbus_api) - run: cargo rustc --locked --bin cloud-hypervisor --features "dbus_api" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + run: cargo build --locked --bin cloud-hypervisor --features "dbus_api" - name: Build (default features + guest_debug) - run: cargo rustc --locked --bin cloud-hypervisor --features "guest_debug" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + run: cargo build --locked --bin cloud-hypervisor --features "guest_debug" - name: Build (default features + pvmemcontrol) - run: cargo rustc --locked --bin cloud-hypervisor --features "pvmemcontrol" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + run: cargo build --locked --bin cloud-hypervisor --features "pvmemcontrol" - name: Build (default features + fw_cfg) - run: cargo rustc --locked --bin cloud-hypervisor --features "fw_cfg" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + run: cargo build --locked --bin cloud-hypervisor --features "fw_cfg" - name: Build (default features + ivshmem) - run: cargo rustc --locked --bin cloud-hypervisor --features "ivshmem" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + run: cargo build --locked --bin cloud-hypervisor --features "ivshmem" - name: Build (mshv) - run: cargo rustc --locked --bin cloud-hypervisor --no-default-features --features "mshv" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + run: cargo build --locked --bin 
cloud-hypervisor --no-default-features --features "mshv" - name: Build (sev_snp) - run: cargo rustc --locked --bin cloud-hypervisor --no-default-features --features "sev_snp" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + run: cargo build --locked --bin cloud-hypervisor --no-default-features --features "sev_snp" - name: Build (igvm) - run: cargo rustc --locked --bin cloud-hypervisor --no-default-features --features "igvm" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + run: cargo build --locked --bin cloud-hypervisor --no-default-features --features "igvm" - name: Build (mshv + kvm) - run: cargo rustc --locked --bin cloud-hypervisor --no-default-features --features "mshv,kvm" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + run: cargo build --locked --bin cloud-hypervisor --no-default-features --features "mshv,kvm" - name: Release Build (default features) run: cargo build --locked --all --release --target=${{ matrix.target }} diff --git a/.github/workflows/commit-lint.yml b/.github/workflows/commit-lint.yml new file mode 100644 index 0000000000..ec2dfec7ac --- /dev/null +++ b/.github/workflows/commit-lint.yml @@ -0,0 +1,23 @@ +name: Commit Lint +on: [ pull_request ] +jobs: + gitlint: + name: Check commit messages + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + fetch-depth: 0 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: "3.10" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install --upgrade gitlint + - name: Lint git commit messages + run: | + gitlint --commits origin/$GITHUB_BASE_REF.. 
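The new commit-lint.yml job above installs gitlint with pip and lints every commit between the pull request's base branch and HEAD. For reference, a rough local equivalent is sketched below; this is illustrative only, since `$GITHUB_BASE_REF` is defined only inside CI, so the base branch is assumed to be `main` here.

```sh
# Local approximation of the commit-lint job above; adjust the base branch as needed.
python3 -m pip install --upgrade gitlint
git fetch origin main
gitlint --commits origin/main..HEAD
```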
diff --git a/.github/workflows/dco.yaml b/.github/workflows/dco.yaml deleted file mode 100644 index daf21315e0..0000000000 --- a/.github/workflows/dco.yaml +++ /dev/null @@ -1,20 +0,0 @@ -name: DCO -on: [pull_request, merge_group] - -jobs: - check: - name: DCO Check ("Signed-Off-By") - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v5 - - name: Set up Python 3.x - uses: actions/setup-python@v6 - with: - python-version: '3.x' - - name: Check DCO - if: ${{ github.event_name == 'pull_request' }} - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - pip3 install -U dco-check - dco-check -e "49699333+dependabot[bot]@users.noreply.github.com" diff --git a/.github/workflows/docker-image.yaml b/.github/workflows/docker-image.yaml deleted file mode 100644 index a026eac2c6..0000000000 --- a/.github/workflows/docker-image.yaml +++ /dev/null @@ -1,65 +0,0 @@ -name: Cloud Hypervisor's Docker image update -on: - push: - branches: main - paths: resources/Dockerfile - pull_request: - paths: resources/Dockerfile -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -env: - REGISTRY: ghcr.io - IMAGE_NAME: ${{ github.repository }} - -jobs: - main: - runs-on: ubuntu-latest - steps: - - name: Code checkout - uses: actions/checkout@v5 - - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Login to ghcr - uses: docker/login-action@v3 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - - name: Docker meta - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} - # generate Docker tags based on the following events/attributes - tags: | - type=raw,value=20250815-0 - type=sha - - - name: Build and push - if: ${{ github.event_name == 'push' }} - uses: docker/build-push-action@v6 - with: - file: ./resources/Dockerfile - platforms: linux/amd64,linux/arm64 - push: true - tags: ${{ steps.meta.outputs.tags }} - - - name: Build only - if: ${{ github.event_name == 'pull_request' }} - uses: docker/build-push-action@v6 - with: - file: ./resources/Dockerfile - platforms: linux/amd64,linux/arm64 - tags: ${{ steps.meta.outputs.tags }} - - - name: Image digest - run: echo ${{ steps.docker_build.outputs.digest }} diff --git a/.github/workflows/formatting.yaml b/.github/workflows/formatting.yaml index 75e4492559..37a0b3e6b5 100644 --- a/.github/workflows/formatting.yaml +++ b/.github/workflows/formatting.yaml @@ -14,7 +14,6 @@ jobs: - nightly target: - x86_64-unknown-linux-gnu - - aarch64-unknown-linux-musl env: RUSTFLAGS: -D warnings steps: diff --git a/.github/workflows/fuzz-build.yaml b/.github/workflows/fuzz-build.yaml deleted file mode 100644 index 427189b01e..0000000000 --- a/.github/workflows/fuzz-build.yaml +++ /dev/null @@ -1,32 +0,0 @@ -name: Cloud Hypervisor Cargo Fuzz Build -on: [pull_request, merge_group] -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build: - name: Cargo Fuzz Build - runs-on: ubuntu-latest - strategy: - matrix: - rust: - - nightly - target: - - x86_64-unknown-linux-gnu - env: - RUSTFLAGS: -D warnings - steps: - - name: Code checkout - uses: actions/checkout@v5 - - name: Install Rust toolchain (${{ matrix.rust }}) - uses: dtolnay/rust-toolchain@stable - with: - toolchain: ${{ matrix.rust }} - target: ${{ matrix.target }} - - name: Install Cargo fuzz - run: cargo install cargo-fuzz - - name: 
Fuzz Build - run: cargo fuzz build - - name: Fuzz Check - run: cargo fuzz check diff --git a/.github/workflows/gitlint.yaml b/.github/workflows/gitlint.yaml deleted file mode 100644 index 7c3c4f7e45..0000000000 --- a/.github/workflows/gitlint.yaml +++ /dev/null @@ -1,25 +0,0 @@ -name: Commit messages check -on: - pull_request: - -jobs: - gitlint: - name: Check commit messages - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v5 - with: - ref: ${{ github.event.pull_request.head.sha }} - fetch-depth: 0 - - name: Set up Python 3.10 - uses: actions/setup-python@v6 - with: - python-version: "3.10" - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install --upgrade gitlint - - name: Lint git commit messages - run: | - gitlint --commits origin/$GITHUB_BASE_REF.. diff --git a/.github/workflows/hadolint.yaml b/.github/workflows/hadolint.yaml deleted file mode 100644 index 641d911c0c..0000000000 --- a/.github/workflows/hadolint.yaml +++ /dev/null @@ -1,25 +0,0 @@ -name: Lint Dockerfile -on: - push: - paths: - - resources/Dockerfile - pull_request: - paths: - - resources/Dockerfile - -jobs: - hadolint: - name: Run Hadolint Dockerfile Linter - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v5 - - - name: Lint Dockerfile - uses: hadolint/hadolint-action@master - with: - dockerfile: ./resources/Dockerfile - format: tty - no-fail: false - verbose: true - failure-threshold: info diff --git a/.github/workflows/integration-arm64.yaml b/.github/workflows/integration-arm64.yaml deleted file mode 100644 index 41a7bc824a..0000000000 --- a/.github/workflows/integration-arm64.yaml +++ /dev/null @@ -1,54 +0,0 @@ -name: Cloud Hypervisor Tests (ARM64) -on: [pull_request, merge_group] -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build: - timeout-minutes: 120 - name: Tests (ARM64) - runs-on: bookworm-arm64 - steps: - - name: Fix workspace permissions - run: sudo chown -R runner:runner ${GITHUB_WORKSPACE} - - name: Code checkout - uses: actions/checkout@v5 - with: - fetch-depth: 0 - - name: Run unit tests (musl) - run: scripts/dev_cli.sh tests --unit --libc musl - - name: Load openvswitch module - run: sudo modprobe openvswitch - - name: Run integration tests (musl) - timeout-minutes: 60 - run: scripts/dev_cli.sh tests --integration --libc musl - - name: Install Azure CLI - if: ${{ github.event_name != 'pull_request' }} - run: | - sudo apt install -y ca-certificates curl apt-transport-https lsb-release gnupg - curl -sL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/microsoft.gpg > /dev/null - echo "deb [arch=arm64] https://packages.microsoft.com/repos/azure-cli/ bookworm main" | sudo tee /etc/apt/sources.list.d/azure-cli.list - sudo apt update - sudo apt install -y azure-cli - - name: Download Windows image - if: ${{ github.event_name != 'pull_request' }} - shell: bash - run: | - IMG_BASENAME=windows-11-iot-enterprise-aarch64.raw - IMG_PATH=$HOME/workloads/$IMG_BASENAME - IMG_GZ_PATH=$HOME/workloads/$IMG_BASENAME.gz - IMG_GZ_BLOB_NAME=windows-11-iot-enterprise-aarch64-9-min.raw.gz - cp "scripts/$IMG_BASENAME.sha1" "$HOME/workloads/" - pushd "$HOME/workloads" - if sha1sum "$IMG_BASENAME.sha1" --check; then - exit - fi - popd - mkdir -p "$HOME/workloads" - az storage blob download --container-name private-images --file "$IMG_GZ_PATH" --name "$IMG_GZ_BLOB_NAME" --connection-string "${{ 
secrets.CH_PRIVATE_IMAGES }}" - gzip -d $IMG_GZ_PATH - - name: Run Windows guest integration tests - if: ${{ github.event_name != 'pull_request' }} - timeout-minutes: 30 - run: scripts/dev_cli.sh tests --integration-windows --libc musl diff --git a/.github/workflows/integration-metrics.yaml b/.github/workflows/integration-metrics.yaml deleted file mode 100644 index e8dd72ea84..0000000000 --- a/.github/workflows/integration-metrics.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: Cloud Hypervisor Tests (Metrics) -on: - push: - branches: - - main - -jobs: - build: - name: Tests (Metrics) - runs-on: bare-metal-9950x - env: - METRICS_PUBLISH_KEY: ${{ secrets.METRICS_PUBLISH_KEY }} - steps: - - name: Code checkout - uses: actions/checkout@v5 - with: - fetch-depth: 0 - - name: Run metrics tests - timeout-minutes: 60 - run: scripts/dev_cli.sh tests --metrics -- -- --report-file /root/workloads/metrics.json - - name: Upload metrics report - run: 'curl -X PUT https://ch-metrics.azurewebsites.net/api/publishmetrics -H "x-functions-key: $METRICS_PUBLISH_KEY" -T ~/workloads/metrics.json' diff --git a/.github/workflows/integration-rate-limiter.yaml b/.github/workflows/integration-rate-limiter.yaml deleted file mode 100644 index 91682f77f8..0000000000 --- a/.github/workflows/integration-rate-limiter.yaml +++ /dev/null @@ -1,25 +0,0 @@ -name: Cloud Hypervisor Tests (Rate-Limiter) -on: [merge_group, pull_request] -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build: - name: Tests (Rate-Limiter) - runs-on: ${{ github.event_name == 'pull_request' && 'ubuntu-latest' || 'bare-metal-9950x' }} - env: - AUTH_DOWNLOAD_TOKEN: ${{ secrets.AUTH_DOWNLOAD_TOKEN }} - steps: - - name: Code checkout - if: ${{ github.event_name != 'pull_request' }} - uses: actions/checkout@v5 - with: - fetch-depth: 0 - - name: Run rate-limiter integration tests - if: ${{ github.event_name != 'pull_request' }} - timeout-minutes: 20 - run: scripts/dev_cli.sh tests --integration-rate-limiter - - name: Skipping build for PR - if: ${{ github.event_name == 'pull_request' }} - run: echo "Skipping build for PR" diff --git a/.github/workflows/integration-vfio.yaml b/.github/workflows/integration-vfio.yaml deleted file mode 100644 index edd7399b15..0000000000 --- a/.github/workflows/integration-vfio.yaml +++ /dev/null @@ -1,33 +0,0 @@ -name: Cloud Hypervisor Tests (VFIO) -on: [merge_group, pull_request] -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build: - name: Tests (VFIO) - runs-on: ${{ github.event_name == 'pull_request' && 'ubuntu-latest' || 'vfio-nvidia' }} - env: - AUTH_DOWNLOAD_TOKEN: ${{ secrets.AUTH_DOWNLOAD_TOKEN }} - steps: - - name: Fix workspace permissions - if: ${{ github.event_name != 'pull_request' }} - run: sudo chown -R runner:runner ${GITHUB_WORKSPACE} - - name: Code checkout - if: ${{ github.event_name != 'pull_request' }} - uses: actions/checkout@v5 - with: - fetch-depth: 0 - - name: Run VFIO integration tests - if: ${{ github.event_name != 'pull_request' }} - timeout-minutes: 15 - run: scripts/dev_cli.sh tests --integration-vfio - # Most tests are failing with musl see #6790 - # - name: Run VFIO integration tests for musl - # if: ${{ github.event_name != 'pull_request' }} - # timeout-minutes: 15 - # run: scripts/dev_cli.sh tests --integration-vfio --libc musl - - name: Skipping build for PR - if: ${{ github.event_name == 'pull_request' }} - run: echo "Skipping build for PR" diff --git 
a/.github/workflows/integration-windows.yaml b/.github/workflows/integration-windows.yaml deleted file mode 100644 index 0769789a9d..0000000000 --- a/.github/workflows/integration-windows.yaml +++ /dev/null @@ -1,50 +0,0 @@ -name: Cloud Hypervisor Tests (Windows Guest) -on: [merge_group, pull_request] -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build: - name: Tests (Windows Guest) - runs-on: ${{ github.event_name == 'pull_request' && 'ubuntu-latest' || 'garm-jammy-16' }} - steps: - - name: Code checkout - if: ${{ github.event_name != 'pull_request' }} - uses: actions/checkout@v5 - with: - fetch-depth: 0 - - name: Install Docker - if: ${{ github.event_name != 'pull_request' }} - run: | - sudo apt-get update - sudo apt-get -y install ca-certificates curl gnupg - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg - sudo chmod a+r /usr/share/keyrings/docker-archive-keyring.gpg - echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null - sudo apt-get update - sudo apt install -y docker-ce docker-ce-cli - - name: Install Azure CLI - if: ${{ github.event_name != 'pull_request' }} - run: | - sudo apt install -y ca-certificates curl apt-transport-https lsb-release gnupg - curl -sL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/microsoft.gpg > /dev/null - echo "deb [arch=amd64] https://packages.microsoft.com/repos/azure-cli/ jammy main" | sudo tee /etc/apt/sources.list.d/azure-cli.list - sudo apt update - sudo apt install -y azure-cli - - name: Download Windows image - if: ${{ github.event_name != 'pull_request' }} - run: | - mkdir $HOME/workloads - az storage blob download --container-name private-images --file "$HOME/workloads/windows-server-2022-amd64-2.raw" --name windows-server-2022-amd64-2.raw --connection-string "${{ secrets.CH_PRIVATE_IMAGES }}" - - name: Run Windows guest integration tests - if: ${{ github.event_name != 'pull_request' }} - timeout-minutes: 15 - run: scripts/dev_cli.sh tests --integration-windows - - name: Run Windows guest integration tests for musl - if: ${{ github.event_name != 'pull_request' }} - timeout-minutes: 15 - run: scripts/dev_cli.sh tests --integration-windows --libc musl - - name: Skipping build for PR - if: ${{ github.event_name == 'pull_request' }} - run: echo "Skipping build for PR" \ No newline at end of file diff --git a/.github/workflows/integration-x86-64.yaml b/.github/workflows/integration-x86-64.yaml deleted file mode 100644 index 8ed76f16a1..0000000000 --- a/.github/workflows/integration-x86-64.yaml +++ /dev/null @@ -1,52 +0,0 @@ -name: Cloud Hypervisor Tests (x86-64) -on: [pull_request, merge_group] -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build: - timeout-minutes: 60 - strategy: - fail-fast: false - matrix: - runner: ['garm-jammy', "garm-jammy-amd"] - libc: ["musl", 'gnu'] - name: Tests (x86-64) - runs-on: ${{ github.event_name == 'pull_request' && !(matrix.runner == 'garm-jammy' && matrix.libc == 'gnu') && 'ubuntu-latest' || format('{0}-16', matrix.runner) }} - steps: - - name: Code checkout - if: ${{ github.event_name != 'pull_request' || (matrix.runner == 'garm-jammy' && matrix.libc == 'gnu') }} - uses: actions/checkout@v5 - with: - 
fetch-depth: 0 - - name: Install Docker - if: ${{ github.event_name != 'pull_request' || (matrix.runner == 'garm-jammy' && matrix.libc == 'gnu') }} - run: | - sudo apt-get update - sudo apt-get -y install ca-certificates curl gnupg - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg - sudo chmod a+r /usr/share/keyrings/docker-archive-keyring.gpg - echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null - sudo apt-get update - sudo apt install -y docker-ce docker-ce-cli - - name: Prepare for VDPA - if: ${{ github.event_name != 'pull_request' || (matrix.runner == 'garm-jammy' && matrix.libc == 'gnu') }} - run: scripts/prepare_vdpa.sh - - name: Run unit tests - if: ${{ github.event_name != 'pull_request' || (matrix.runner == 'garm-jammy' && matrix.libc == 'gnu') }} - run: scripts/dev_cli.sh tests --unit --libc ${{ matrix.libc }} - - name: Load openvswitch module - if: ${{ github.event_name != 'pull_request' || (matrix.runner == 'garm-jammy' && matrix.libc == 'gnu') }} - run: sudo modprobe openvswitch - - name: Run integration tests - if: ${{ github.event_name != 'pull_request' || (matrix.runner == 'garm-jammy' && matrix.libc == 'gnu') }} - timeout-minutes: 40 - run: scripts/dev_cli.sh tests --integration --libc ${{ matrix.libc }} - - name: Run live-migration integration tests - if: ${{ github.event_name != 'pull_request' || (matrix.runner == 'garm-jammy' && matrix.libc == 'gnu') }} - timeout-minutes: 20 - run: scripts/dev_cli.sh tests --integration-live-migration --libc ${{ matrix.libc }} - - name: Skipping build for PR - if: ${{ github.event_name == 'pull_request' && matrix.runner != 'garm-jammy' && matrix.libc != 'gnu' }} - run: echo "Skipping build for PR" diff --git a/.github/workflows/lychee.yaml b/.github/workflows/lychee.yaml deleted file mode 100644 index 8d1d3927c0..0000000000 --- a/.github/workflows/lychee.yaml +++ /dev/null @@ -1,45 +0,0 @@ -name: Link Check (lychee) -on: pull_request -jobs: - link_check: - name: Link Check - runs-on: ubuntu-latest - steps: - - name: Code checkout - uses: actions/checkout@v5 - with: - # Fetch the entire history so git diff can compare against the base branch - fetch-depth: 0 - - name: Get changed files in PR - id: changed-files - uses: tj-actions/changed-files@v47 # Using a dedicated action for robustness - with: - # Compare the HEAD of the PR with the merge-base (where the PR branches off) - base_sha: ${{ github.event.pull_request.base.sha }} - - # NEW STEP: Print all changed-files outputs for verification - - name: Verify Changed Files - run: | - echo "--- tj-actions/changed-files Outputs ---" - echo "any_changed: ${{ steps.changed-files.outputs.any_changed }}" - echo "all_changed_files: ${{ steps.changed-files.outputs.all_changed_files }}" - echo "added_files: ${{ steps.changed-files.outputs.added_files }}" - echo "modified_files: ${{ steps.changed-files.outputs.modified_files }}" - echo "deleted_files: ${{ steps.changed-files.outputs.deleted_files }}" - echo "renamed_files: ${{ steps.changed-files.outputs.renamed_files }}" - echo "----------------------------------------" - # This will also show if the all_changed_files string is empty or not - if [ -n "${{ steps.changed-files.outputs.all_changed_files }}" ]; then - echo "Detected changes: all_changed_files output is NOT empty." 
- else - echo "No changes detected: all_changed_files output IS empty." - fi - - name: Link Availability Check (Diff Only) - # MODIFIED: Only run lychee if the 'all_changed_files' output is not an empty string - if: ${{ steps.changed-files.outputs.all_changed_files != '' }} - uses: lycheeverse/lychee-action@master - with: - # Pass the space-separated list of changed files to lychee - args: --verbose --config .lychee.toml ${{ steps.changed-files.outputs.all_changed_files }} - failIfEmpty: false - fail: true \ No newline at end of file diff --git a/.github/workflows/mshv-infra.yaml b/.github/workflows/mshv-infra.yaml deleted file mode 100644 index 72a90548a7..0000000000 --- a/.github/workflows/mshv-infra.yaml +++ /dev/null @@ -1,237 +0,0 @@ -name: MSHV Infra Setup -on: - workflow_call: - inputs: - ARCH: - description: 'Architecture for the VM' - required: true - type: string - KEY: - description: 'SSH Key Name' - required: true - type: string - OS_DISK_SIZE: - description: 'OS Disk Size in GB' - required: true - type: string - RG: - description: 'Resource Group Name' - required: true - type: string - VM_SKU: - description: 'VM SKU' - required: true - type: string - secrets: - MI_CLIENT_ID: - required: true - RUNNER_RG: - required: true - STORAGE_ACCOUNT_PATHS: - required: true - ARCH_SOURCE_PATH: - required: true - USERNAME: - required: true - outputs: - PRIVATE_IP: - description: 'Private IP of the VM' - value: ${{ jobs.infra-setup.outputs.PRIVATE_IP }} -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true -jobs: - infra-setup: - name: ${{ inputs.ARCH }} VM Provision - runs-on: mshv - continue-on-error: true - outputs: - PRIVATE_IP: ${{ steps.get-vm-ip.outputs.PRIVATE_IP }} - steps: - - name: Install & login to AZ CLI - env: - MI_CLIENT_ID: ${{ secrets.MI_CLIENT_ID }} - run: | - set -e - echo "Installing Azure CLI if not already installed" - if ! 
command -v az &>/dev/null; then - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - else - echo "Azure CLI already installed" - fi - az --version - echo "Logging into Azure CLI using Managed Identity" - az login --identity --client-id ${MI_CLIENT_ID} - - - name: Get Location - id: get-location - env: - SKU: ${{ inputs.VM_SKU }} - STORAGE_ACCOUNT_PATHS: ${{ secrets.STORAGE_ACCOUNT_PATHS }} - run: | - set -e - # Extract vCPU count from SKU (e.g., "Standard_D2s_v3" => 2) - vcpu=$(echo "$SKU" | sed -n 's/^Standard_[A-Za-z]\+\([0-9]\+\).*/\1/p') - if [[ -z "$vcpu" ]]; then - echo "Cannot extract vCPU count from SKU: $SKU" - exit 1 - fi - - SUPPORTED_LOCATIONS=$(echo "$STORAGE_ACCOUNT_PATHS" | jq -r 'to_entries[] | .key') - - for location in $SUPPORTED_LOCATIONS; do - family=$(az vm list-skus --size "$SKU" --location "$location" --resource-type "virtualMachines" --query '[0].family' -o tsv) - if [[ -z "$family" ]]; then - echo "Cannot determine VM family for SKU: $SKU in $location" - continue - fi - - usage=$(az vm list-usage --location "$location" --query "[?name.value=='$family'] | [0]" -o json) - current=$(echo "$usage" | jq -r '.currentValue') - limit=$(echo "$usage" | jq -r '.limit') - - if [[ $((limit - current)) -ge $vcpu ]]; then - echo "Sufficient quota found in $location" - echo "location=$location" >> "$GITHUB_OUTPUT" - exit 0 - fi - done - - echo "No location found with sufficient vCPU quota for SKU: $SKU" - exit 1 - - - name: Create Resource Group - id: rg-setup - env: - LOCATION: ${{ steps.get-location.outputs.location }} - RG: ${{ inputs.RG }} - STORAGE_ACCOUNT_PATHS: ${{ secrets.STORAGE_ACCOUNT_PATHS }} - run: | - set -e - echo "Creating Resource Group: $RG" - # Create the resource group - echo "Creating resource group in location: ${LOCATION}" - az group create --name ${RG} --location ${LOCATION} - echo "Resource group created successfully." - - - name: Generate SSH Key - id: generate-ssh-key - env: - KEY: ${{ inputs.KEY }} - run: | - set -e - echo "Generating SSH key: $KEY" - mkdir -p ~/.ssh - ssh-keygen -t rsa -b 4096 -f ~/.ssh/${KEY} -N "" - - - name: Create VM - id: vm-setup - env: - KEY: ${{ inputs.KEY }} - LOCATION: ${{ steps.get-location.outputs.location }} - OS_DISK_SIZE: ${{ inputs.OS_DISK_SIZE }} - RG: ${{ inputs.RG }} - RUNNER_RG: ${{ secrets.RUNNER_RG }} - USERNAME: ${{ secrets.USERNAME }} - VM_SKU: ${{ inputs.VM_SKU }} - VM_IMAGE_NAME: ${{ inputs.ARCH }}_${{ steps.get-location.outputs.location }}_image - VM_NAME: ${{ inputs.ARCH }}_${{ steps.get-location.outputs.location }}_${{ github.run_id }} - run: | - set -e - echo "Creating $VM_SKU VM: $VM_NAME" - - # Extract subnet ID from the runner VM - echo "Retrieving subnet ID..." - SUBNET_ID=$(az network vnet list --resource-group ${RUNNER_RG} --query "[?contains(location, '${LOCATION}')].{SUBNETS:subnets}" | jq -r ".[0].SUBNETS[0].id") - if [[ -z "${SUBNET_ID}" ]]; then - echo "ERROR: Failed to retrieve Subnet ID." - exit 1 - fi - - # Extract image ID from the runner VM - echo "Retrieving image ID..." - IMAGE_ID=$(az image show --resource-group ${RUNNER_RG} --name ${VM_IMAGE_NAME} --query "id" -o tsv) - if [[ -z "${IMAGE_ID}" ]]; then - echo "ERROR: Failed to retrieve Image ID." 
- exit 1 - fi - - # Create VM - az vm create \ - --resource-group ${RG} \ - --name ${VM_NAME} \ - --subnet ${SUBNET_ID} \ - --size ${VM_SKU} \ - --location ${LOCATION} \ - --image ${IMAGE_ID} \ - --os-disk-size-gb ${OS_DISK_SIZE} \ - --public-ip-sku Standard \ - --storage-sku Premium_LRS \ - --public-ip-address "" \ - --admin-username ${USERNAME} \ - --ssh-key-value ~/.ssh/${KEY}.pub \ - --security-type Standard \ - --output json - - echo "VM creation process completed successfully." - - - name: Get VM Private IP - id: get-vm-ip - env: - RG: ${{ inputs.RG }} - VM_NAME: ${{ inputs.ARCH }}_${{ steps.get-location.outputs.location }}_${{ github.run_id }} - run: | - set -e - echo "Retrieving VM Private IP address..." - # Retrieve VM Private IP address - PRIVATE_IP=$(az vm show -g ${RG} -n ${VM_NAME} -d --query privateIps -o tsv) - if [[ -z "$PRIVATE_IP" ]]; then - echo "ERROR: Failed to retrieve private IP address." - exit 1 - fi - echo "PRIVATE_IP=$PRIVATE_IP" >> $GITHUB_OUTPUT - - - name: Wait for SSH availability - env: - KEY: ${{ inputs.KEY }} - PRIVATE_IP: ${{ steps.get-vm-ip.outputs.PRIVATE_IP }} - USERNAME: ${{ secrets.USERNAME }} - run: | - echo "Waiting for SSH to be accessible..." - timeout 120 bash -c 'until ssh -o StrictHostKeyChecking=no -i ~/.ssh/${KEY} ${USERNAME}@${PRIVATE_IP} "exit" 2>/dev/null; do sleep 5; done' - echo "VM is accessible!" - - - name: Remove Old Host Key - env: - PRIVATE_IP: ${{ steps.get-vm-ip.outputs.PRIVATE_IP }} - run: | - set -e - echo "Removing the old host key" - ssh-keygen -R $PRIVATE_IP - - - name: SSH into VM and Install Dependencies - env: - KEY: ${{ inputs.KEY }} - PRIVATE_IP: ${{ steps.get-vm-ip.outputs.PRIVATE_IP }} - USERNAME: ${{ secrets.USERNAME }} - run: | - set -e - ssh -i ~/.ssh/${KEY} -o StrictHostKeyChecking=no ${USERNAME}@${PRIVATE_IP} << EOF - set -e - echo "Logged in successfully." - echo "Installing dependencies..." - sudo tdnf install -y git moby-engine moby-cli clang llvm pkg-config make gcc glibc-devel - echo "Installing Rust..." 
- curl -sSf https://sh.rustup.rs | sh -s -- --default-toolchain stable --profile default -y - export PATH="\$HOME/.cargo/bin:\$PATH" - cargo --version - sudo mkdir -p /etc/docker/ - echo '{"default-ulimits":{"nofile":{"Hard":65535,"Name":"nofile","Soft":65535}}}' | sudo tee /etc/docker/daemon.json - sudo systemctl stop docker - sudo systemctl enable docker.service - sudo systemctl enable containerd.service - sudo systemctl start docker - sudo groupadd -f docker - sudo usermod -a -G docker ${USERNAME} - sudo systemctl restart docker - EOF \ No newline at end of file diff --git a/.github/workflows/mshv-integration.yaml b/.github/workflows/mshv-integration.yaml deleted file mode 100644 index b14cc5603b..0000000000 --- a/.github/workflows/mshv-integration.yaml +++ /dev/null @@ -1,109 +0,0 @@ -name: Cloud Hypervisor Tests (MSHV) (x86_64) -on: [pull_request_target, merge_group] - -jobs: - infra-setup: - name: MSHV Infra Setup (x86_64) - uses: ./.github/workflows/mshv-infra.yaml - with: - ARCH: x86_64 - KEY: azure_key_${{ github.run_id }} - OS_DISK_SIZE: 512 - RG: MSHV-INTEGRATION-${{ github.run_id }} - VM_SKU: Standard_D16s_v5 - secrets: - MI_CLIENT_ID: ${{ secrets.MSHV_MI_CLIENT_ID }} - RUNNER_RG: ${{ secrets.MSHV_RUNNER_RG }} - STORAGE_ACCOUNT_PATHS: ${{ secrets.MSHV_STORAGE_ACCOUNT_PATHS }} - ARCH_SOURCE_PATH: ${{ secrets.MSHV_X86_SOURCE_PATH }} - USERNAME: ${{ secrets.MSHV_USERNAME }} - - run-tests: - name: Integration Tests (x86_64) - needs: infra-setup - if: ${{ always() && needs.infra-setup.result == 'success' }} - runs-on: mshv - continue-on-error: true - steps: - - name: Run integration tests - timeout-minutes: 60 - env: - KEY: azure_key_${{ github.run_id }} - PR_NUMBER: ${{ github.event.pull_request.number }} - REPO_URL: https://github.com/cloud-hypervisor/cloud-hypervisor.git - REPO_DIR: cloud-hypervisor - PRIVATE_IP: ${{ needs.infra-setup.outputs.PRIVATE_IP }} - RG: MSHV-${{ github.run_id }} - USERNAME: ${{ secrets.MSHV_USERNAME }} - run: | - set -e - echo "Connecting to the VM via SSH..." - ssh -i ~/.ssh/${KEY} -o StrictHostKeyChecking=no ${USERNAME}@${PRIVATE_IP} << EOF - set -e - echo "Logged in successfully." - export PATH="\$HOME/.cargo/bin:\$PATH" - - if [[ "${{ github.event_name }}" == "pull_request_target" ]]; then - git clone --depth 1 "$REPO_URL" "$REPO_DIR" - cd "$REPO_DIR" - git fetch origin pull/${{ github.event.pull_request.number }}/merge - git checkout FETCH_HEAD - else - git clone --depth 1 --single-branch --branch "${{ github.ref_name }}" "$REPO_URL" "$REPO_DIR" - cd "$REPO_DIR" - fi - - echo "Loading VDPA kernel modules..." - sudo modprobe vdpa - sudo modprobe vhost_vdpa - sudo modprobe vdpa_sim - sudo modprobe vdpa_sim_blk - sudo modprobe vdpa_sim_net - - echo "Creating VDPA devices..." - sudo vdpa dev add name vdpa-blk0 mgmtdev vdpasim_blk - sudo vdpa dev add name vdpa-blk1 mgmtdev vdpasim_blk - sudo vdpa dev add name vdpa-blk2 mgmtdev vdpasim_net - - echo "Setting permissions..." - for i in 0 1 2; do - dev="/dev/vhost-vdpa-$i" - if [ -e "$dev" ]; then - sudo chown $USER:$USER "$dev" - sudo chmod 660 "$dev" - else - echo "Warning: Device $dev not found" - fi - done - - sudo ./scripts/dev_cli.sh tests --hypervisor mshv --integration - EOF - - cleanup: - name: Cleanup - needs: run-tests - if: always() - runs-on: mshv - steps: - - name: Delete RG - env: - RG: MSHV-INTEGRATION-${{ github.run_id }} - run: | - if az group exists --name ${RG}; then - az group delete --name ${RG} --yes --no-wait - else - echo "Resource Group ${RG} does not exist. Skipping deletion." 
- fi - echo "Cleanup process completed." - - - name: Delete SSH Key - env: - KEY: azure_key_${{ github.run_id }} - run: | - if [ -f ~/.ssh/${KEY} ]; then - rm -f ~/.ssh/${KEY} ~/.ssh/${KEY}.pub - echo "SSH key deleted successfully." - else - echo "SSH key does not exist. Skipping deletion." - fi - echo "Cleanup process completed." \ No newline at end of file diff --git a/.github/workflows/package-consistency.yaml b/.github/workflows/package-consistency.yaml deleted file mode 100644 index 719aa3d8df..0000000000 --- a/.github/workflows/package-consistency.yaml +++ /dev/null @@ -1,32 +0,0 @@ -name: Cloud Hypervisor Consistency -on: [pull_request, merge_group] -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build: - name: Rust VMM Consistency Check - runs-on: ubuntu-latest - steps: - - name: Code checkout - uses: actions/checkout@v5 - with: - fetch-depth: 0 - - - name: Install dependencies - run: sudo apt install -y python3 - - - name: Install Rust toolchain stable - uses: dtolnay/rust-toolchain@stable - with: - toolchain: stable - - - name: Check Rust VMM Package Consistency of root Workspace - run: python3 scripts/package-consistency-check.py github.com/rust-vmm - - - name: Check Rust VMM Package Consistency of fuzz Workspace - run: | - pushd fuzz - python3 ../scripts/package-consistency-check.py github.com/rust-vmm - popd diff --git a/.github/workflows/preview-riscv64-build.yaml b/.github/workflows/preview-riscv64-build.yaml deleted file mode 100644 index 929a60147a..0000000000 --- a/.github/workflows/preview-riscv64-build.yaml +++ /dev/null @@ -1,30 +0,0 @@ -name: Cloud Hypervisor RISC-V 64-bit kvm build Preview -on: [pull_request, merge_group] -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build: - name: Cargo - runs-on: riscv64-qemu-host - strategy: - fail-fast: false - - steps: - - name: Code checkout - uses: actions/checkout@v5 - with: - fetch-depth: 0 - - - name: Install Rust toolchain - run: /opt/scripts/exec-in-qemu.sh rustup default 1.88.0 - - - name: Build test (kvm) - run: /opt/scripts/exec-in-qemu.sh cargo rustc --locked --no-default-features --features "kvm" - - - name: Clippy test (kvm) - run: /opt/scripts/exec-in-qemu.sh cargo clippy --locked --no-default-features --features "kvm" - - - name: Check no files were modified - run: test -z "$(git status --porcelain)" diff --git a/.github/workflows/preview-riscv64-modules.yaml b/.github/workflows/preview-riscv64-modules.yaml deleted file mode 100644 index 767d9779a8..0000000000 --- a/.github/workflows/preview-riscv64-modules.yaml +++ /dev/null @@ -1,39 +0,0 @@ -name: Cloud Hypervisor RISC-V 64-bit Preview -on: [pull_request, merge_group] -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build: - name: Cargo - runs-on: riscv64-qemu-host - strategy: - fail-fast: false - matrix: - module: - - hypervisor - - arch - - vm-allocator - - devices - - steps: - - name: Code checkout - uses: actions/checkout@v5 - with: - fetch-depth: 0 - - - name: Install Rust toolchain - run: /opt/scripts/exec-in-qemu.sh rustup default 1.88.0 - - - name: Build ${{ matrix.module }} Module (kvm) - run: /opt/scripts/exec-in-qemu.sh cargo rustc --locked -p ${{ matrix.module }} --no-default-features --features "kvm" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states - - - name: Clippy ${{ matrix.module }} Module (kvm) - run: /opt/scripts/exec-in-qemu.sh cargo 
clippy --locked -p ${{ matrix.module }} --no-default-features --features "kvm" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states - - - name: Test ${{ matrix.module }} Module (kvm) - run: /opt/scripts/exec-in-qemu.sh cargo test --locked -p ${{ matrix.module }} --no-default-features --features "kvm" - - - name: Check no files were modified - run: test -z "$(git status --porcelain)" diff --git a/.github/workflows/quality.yaml b/.github/workflows/quality.yaml index f8d5ca41fa..bf2a908878 100644 --- a/.github/workflows/quality.yaml +++ b/.github/workflows/quality.yaml @@ -13,17 +13,11 @@ jobs: fail-fast: false matrix: rust: - - beta - stable target: - - aarch64-unknown-linux-gnu - - aarch64-unknown-linux-musl - x86_64-unknown-linux-gnu - - x86_64-unknown-linux-musl include: - - rust: beta - experimental: true - rust: stable experimental: false @@ -56,25 +50,7 @@ jobs: cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} - args: --locked --all --all-targets --no-default-features --tests --examples --features "kvm" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states - - - name: Clippy (mshv) - uses: houseabsolute/actions-rust-cross@v1 - with: - command: clippy - cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 - toolchain: ${{ matrix.rust }} - target: ${{ matrix.target }} - args: --locked --all --all-targets --no-default-features --tests --examples --features "mshv" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states - - - name: Clippy (mshv + kvm) - uses: houseabsolute/actions-rust-cross@v1 - with: - command: clippy - cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 - toolchain: ${{ matrix.rust }} - target: ${{ matrix.target }} - args: --locked --all --all-targets --no-default-features --tests --examples --features "mshv,kvm" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + args: --locked --all --all-targets --no-default-features --tests --examples --features "kvm" -- -D warnings - name: Clippy (default features) uses: houseabsolute/actions-rust-cross@v1 @@ -83,7 +59,7 @@ jobs: cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} - args: --locked --all --all-targets --tests --examples -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + args: --locked --all --all-targets --tests --examples -- -D warnings - name: Clippy (default features + guest_debug) uses: houseabsolute/actions-rust-cross@v1 @@ -92,7 +68,7 @@ jobs: cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} - args: --locked --all --all-targets --tests --examples --features "guest_debug" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + args: --locked --all --all-targets --tests --examples --features "guest_debug" -- -D warnings - name: Clippy (default features + pvmemcontrol) uses: houseabsolute/actions-rust-cross@v1 @@ -101,7 +77,7 @@ jobs: cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} - args: --locked --all --all-targets --tests --examples --features "pvmemcontrol" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + args: --locked --all --all-targets --tests --examples --features 
"pvmemcontrol" -- -D warnings - name: Clippy (default features + tracing) uses: houseabsolute/actions-rust-cross@v1 @@ -110,13 +86,13 @@ jobs: cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} - args: --locked --all --all-targets --tests --examples --features "tracing" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + args: --locked --all --all-targets --tests --examples --features "tracing" -- -D warnings - name: Clippy (default features + fw_cfg) uses: actions-rs/cargo@v1 with: use-cross: ${{ matrix.target != 'x86_64-unknown-linux-gnu' }} command: clippy - args: --target=${{ matrix.target }} --locked --all --all-targets --tests --examples --features "fw_cfg" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + args: --target=${{ matrix.target }} --locked --all --all-targets --tests --examples --features "fw_cfg" -- -D warnings - name: Clippy (default features + ivshmem) uses: houseabsolute/actions-rust-cross@v1 @@ -125,7 +101,7 @@ jobs: cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} - args: --locked --all --all-targets --tests --examples --features "ivshmem" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + args: --locked --all --all-targets --tests --examples --features "ivshmem" -- -D warnings - name: Clippy (sev_snp) if: ${{ matrix.target == 'x86_64-unknown-linux-gnu' }} @@ -135,7 +111,7 @@ jobs: cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} - args: --locked --all --all-targets --no-default-features --tests --examples --features "sev_snp" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + args: --locked --all --all-targets --no-default-features --tests --examples --features "sev_snp" -- -D warnings - name: Clippy (igvm) if: ${{ matrix.target == 'x86_64-unknown-linux-gnu' }} @@ -145,7 +121,7 @@ jobs: cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} - args: --locked --all --all-targets --no-default-features --tests --examples --features "igvm" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + args: --locked --all --all-targets --no-default-features --tests --examples --features "igvm" -- -D warnings - name: Clippy (kvm + tdx) if: ${{ matrix.target == 'x86_64-unknown-linux-gnu' }} @@ -155,7 +131,7 @@ jobs: cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 toolchain: ${{ matrix.rust }} target: ${{ matrix.target }} - args: --locked --all --all-targets --no-default-features --tests --examples --features "tdx,kvm" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + args: --locked --all --all-targets --no-default-features --tests --examples --features "tdx,kvm" -- -D warnings - name: Check build did not modify any files run: test -z "$(git status --porcelain)" diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml deleted file mode 100644 index 0ab1ea813f..0000000000 --- a/.github/workflows/release.yaml +++ /dev/null @@ -1,95 +0,0 @@ -name: Cloud Hypervisor Release -on: [create, merge_group] -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} - cancel-in-progress: true -env: - GITHUB_TOKEN: ${{ github.token }} - 
-jobs: - release: - if: (github.event_name == 'create' && github.event.ref_type == 'tag') || github.event_name == 'merge_group' - name: Release ${{ matrix.platform.target }} - strategy: - fail-fast: false - matrix: - platform: - - target: x86_64-unknown-linux-gnu - args: --all --release --features mshv - name_ch: cloud-hypervisor - name_ch_remote: ch-remote - - target: x86_64-unknown-linux-musl - args: --all --release --features mshv - name_ch: cloud-hypervisor-static - name_ch_remote: ch-remote-static - - target: aarch64-unknown-linux-musl - args: --all --release - name_ch: cloud-hypervisor-static-aarch64 - name_ch_remote: ch-remote-static-aarch64 - runs-on: ubuntu-latest - steps: - - name: Code checkout - uses: actions/checkout@v5 - - name: Install musl-gcc - if: contains(matrix.platform.target, 'musl') - run: sudo apt install -y musl-tools - - name: Create release directory - if: | - github.event_name == 'create' && github.event.ref_type == 'tag' && - matrix.platform.target == 'x86_64-unknown-linux-gnu' - run: rsync -rv --exclude=.git . ../cloud-hypervisor-${{ github.event.ref }} - - name: Build ${{ matrix.platform.target }} - uses: houseabsolute/actions-rust-cross@v1 - with: - command: build - target: ${{ matrix.platform.target }} - args: ${{ matrix.platform.args }} - strip: true - toolchain: "1.88.0" - - name: Copy Release Binaries - if: github.event_name == 'create' && github.event.ref_type == 'tag' - shell: bash - run: | - cp target/${{ matrix.platform.target }}/release/cloud-hypervisor ./${{ matrix.platform.name_ch }} - cp target/${{ matrix.platform.target }}/release/ch-remote ./${{ matrix.platform.name_ch_remote }} - - name: Upload Release Artifacts - if: github.event_name == 'create' && github.event.ref_type == 'tag' - uses: actions/upload-artifact@v5 - with: - name: Artifacts for ${{ matrix.platform.target }} - path: | - ./${{ matrix.platform.name_ch }} - ./${{ matrix.platform.name_ch_remote }} - - name: Vendor - if: | - github.event_name == 'create' && github.event.ref_type == 'tag' && - matrix.platform.target == 'x86_64-unknown-linux-gnu' - working-directory: ../cloud-hypervisor-${{ github.event.ref }} - run: | - mkdir ../vendor-cargo-home - export CARGO_HOME=$(realpath ../vendor-cargo-home) - mkdir .cargo - cargo vendor > .cargo/config.toml - - name: Create vendored source archive - if: | - github.event_name == 'create' && github.event.ref_type == 'tag' && - matrix.platform.target == 'x86_64-unknown-linux-gnu' - run: tar cJf cloud-hypervisor-${{ github.event.ref }}.tar.xz ../cloud-hypervisor-${{ github.event.ref }} - - name: Upload cloud-hypervisor vendored source archive - if: | - github.event_name == 'create' && github.event.ref_type == 'tag' && - matrix.platform.target == 'x86_64-unknown-linux-gnu' - id: upload-release-cloud-hypervisor-vendored-sources - uses: actions/upload-artifact@v5 - with: - path: cloud-hypervisor-${{ github.event.ref }}.tar.xz - name: cloud-hypervisor-${{ github.event.ref }}.tar.xz - - name: Create GitHub Release - if: github.event_name == 'create' && github.event.ref_type == 'tag' - uses: softprops/action-gh-release@v2 - with: - draft: true - files: | - ./${{ matrix.platform.name_ch }} - ./${{ matrix.platform.name_ch_remote }} - ./cloud-hypervisor-${{ github.event.ref }}.tar.xz diff --git a/.gitlint b/.gitlint index 455dd0281a..d65c4ab73e 100644 --- a/.gitlint +++ b/.gitlint @@ -1,7 +1,7 @@ [general] extra-path=scripts/gitlint/rules regex-style-search=true -ignore=body-max-line-length +ignore=body-max-line-length,body-hard-tab 
[ignore-by-author-name] regex=dependabot diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c77d3e36da..779514e6b8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -19,6 +19,28 @@ for each submitted Pull Request (PR). ## Basic Checks +```sh +# We currently rely on nightly-only formatting features +cargo +nightly fmt --all +cargo check --all-targets --tests +cargo clippy --all-targets --tests +# Please note that this will not execute integration tests. +cargo test --all-targets --tests + +# To lint your last three commits +gitlint --commits "HEAD~3..HEAD" +``` + +### \[Optional\] Run Integration Tests + +_Caution: These tests are taking a long time to complete (40+ mins) and need special setup._ + +```sh + bash ./scripts/dev_cli.sh tests --integration -- --test-filter '' +``` + +### Setup Commit Hook + Please consider creating the following hook as `.git/hooks/pre-commit` in order to ensure basic correctness of your code. You can extend this further if you have specific features that you regularly develop against. @@ -26,9 +48,9 @@ have specific features that you regularly develop against. ```sh #!/bin/sh -cargo fmt -- --check || exit 1 -cargo check --locked --all --all-targets --tests || exit 1 -cargo clippy --locked --all --all-targets --tests -- -D warnings || exit 1 +cargo +nightly fmt --all -- --check || exit 1 +cargo check --locked --all-targets --tests || exit 1 +cargo clippy --locked --all-targets --tests -- -D warnings || exit 1 ``` You will need to `chmod +x .git/hooks/pre-commit` to have it run on every diff --git a/Cargo.lock b/Cargo.lock index 118b60cea6..ebe81c7d95 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -266,6 +266,29 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "aws-lc-rs" +version = "1.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "879b6c89592deb404ba4dc0ae6b58ffd1795c78991cbb5b8bc441c48a070440d" +dependencies = [ + "aws-lc-sys", + "zeroize", +] + +[[package]] +name = "aws-lc-sys" +version = "0.32.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "107a4e9d9cab9963e04e84bb8dee0e25f2a987f9a8bad5ed054abd439caa8f8c" +dependencies = [ + "bindgen", + "cc", + "cmake", + "dunce", + "fs_extra", +] + [[package]] name = "backtrace" version = "0.3.76" @@ -281,6 +304,26 @@ dependencies = [ "windows-link", ] +[[package]] +name = "bindgen" +version = "0.72.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" +dependencies = [ + "bitflags 2.9.4", + "cexpr", + "clang-sys", + "itertools 0.13.0", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash 2.1.1", + "shlex", + "syn", +] + [[package]] name = "bitfield-struct" version = "0.10.1" @@ -368,9 +411,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac9fe6cdbb24b6ade63616c0a0688e45bb56732262c158df3c0c4bea4ca47cb7" dependencies = [ "find-msvc-tools", + "jobserver", + "libc", "shlex", ] +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + [[package]] name = "cfg-if" version = "1.0.3" @@ -383,6 +437,17 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading", +] + [[package]] name = "clap" version = "4.5.49" @@ -442,6 +507,15 @@ dependencies = [ "zbus", ] +[[package]] +name = "cmake" +version = "0.1.54" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7caa3f9de89ddbe2c607f4101924c5abec803763ae9534e4f4d7d8f84aa81f0" +dependencies = [ + "cc", +] + [[package]] name = "colorchoice" version = "1.0.4" @@ -570,7 +644,7 @@ dependencies = [ "lazy_static", "mintex", "parking_lot", - "rustc-hash", + "rustc-hash 1.1.0", "serde", "serde_json", "thousands", @@ -597,6 +671,18 @@ dependencies = [ "windows-sys 0.61.0", ] +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + [[package]] name = "endi" version = "1.1.0" @@ -740,6 +826,12 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + [[package]] name = "futures" version = "0.3.31" @@ -1031,6 +1123,24 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.15" @@ -1061,6 +1171,16 @@ dependencies = [ "syn", ] +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.3", + "libc", +] + [[package]] name = "js-sys" version = "0.3.77" @@ -1117,6 +1237,16 @@ version = "0.2.177" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" +[[package]] +name = "libloading" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +dependencies = [ + "cfg-if", + "windows-link", +] + [[package]] name = "libredox" version = "0.1.10" @@ -1213,6 +1343,12 @@ dependencies = [ "vmm-sys-util", ] +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "miniz_oxide" version = "0.8.9" @@ -1311,6 +1447,16 @@ version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43794a0ace135be66a25d3ae77d41b91615fb68ae937f904090203e81f755b65" +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -1654,6 +1800,16 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn", +] + [[package]] name = "proc-macro-crate" version = "3.4.0" @@ -1793,6 +1949,20 @@ dependencies = [ "syn", ] +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.15", + "libc", + "untrusted", + "windows-sys 0.52.0", +] + [[package]] name = "rustc-demangle" version = "0.1.26" @@ -1805,6 +1975,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "rustc-hash" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" + [[package]] name = "rustix" version = "1.1.2" @@ -1818,6 +1994,42 @@ dependencies = [ "windows-sys 0.61.0", ] +[[package]] +name = "rustls" +version = "0.23.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a9586e9ee2b4f8fab52a0048ca7334d7024eef48e2cb9407e3497bb7cab7fa7" +dependencies = [ + "aws-lc-rs", + "log", + "once_cell", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-pki-types" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "229a4a4c221013e7e1f1a043678c5cc39fe5171437c88fb47151a21e6f5b5c79" +dependencies = [ + "zeroize", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10b3f4191e8a80e6b43eebabfac91e5dcecebb27a71f04e820c47ec41d314bf" +dependencies = [ + "aws-lc-rs", + "ring", + "rustls-pki-types", + "untrusted", +] + [[package]] name = "rustversion" version = "1.0.22" @@ -1995,6 +2207,12 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + [[package]] name = "syn" version = "2.0.106" @@ -2189,6 +2407,12 @@ version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + [[package]] name = "utf8parse" version = "0.2.2" @@ -2418,6 +2642,8 @@ name = "vm-migration" version = "0.1.0" dependencies = [ "anyhow", + "itertools 0.14.0", + "rustls", "serde", "serde_json", "thiserror 2.0.17", @@ -2456,6 +2682,7 @@ dependencies = [ "hypervisor", "igvm", "igvm_defs", + "kvm-bindings", "landlock", "libc", "linux-loader", @@ -2612,13 +2839,22 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-sys" version = "0.60.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" dependencies = [ - "windows-targets", + "windows-targets 0.53.5", ] [[package]] @@ -2630,6 +2866,22 @@ dependencies = [ "windows-link", ] +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + [[package]] name = "windows-targets" version = "0.53.5" @@ -2637,58 +2889,106 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" dependencies = [ "windows-link", - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm 0.53.1", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + "windows_x86_64_msvc 0.53.1", ] +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + [[package]] name = "windows_aarch64_gnullvm" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + [[package]] name = "windows_aarch64_msvc" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + [[package]] name = "windows_i686_gnu" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + [[package]] name = "windows_i686_gnullvm" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + [[package]] name = "windows_i686_msvc" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + [[package]] name = "windows_x86_64_gnu" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + [[package]] name = "windows_x86_64_gnullvm" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + [[package]] name = "windows_x86_64_msvc" version = "0.53.1" @@ -2793,6 +3093,12 @@ dependencies = [ "syn", ] +[[package]] +name = "zeroize" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" + [[package]] name = "zvariant" version = "5.7.0" diff --git a/Cargo.toml b/Cargo.toml index 1d03f39688..eb2e57c67f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,86 +1,37 @@ -[package] -authors = ["The Cloud Hypervisor Authors"] -build = "build.rs" -default-run = "cloud-hypervisor" -description = "Open source Virtual Machine Monitor (VMM) that runs on top of KVM & MSHV" -edition = "2024" -homepage = "https://github.com/cloud-hypervisor/cloud-hypervisor" -license = "Apache-2.0 AND BSD-3-Clause" -name = "cloud-hypervisor" -version = "49.0.0" -# Minimum buildable version: -# Keep in sync with version in .github/workflows/build.yaml -# Policy on MSRV (see #4318): -# Can only be bumped if satisfying any of the following: -# a.) A dependency requires it, -# b.) If we want to use a new feature and that MSRV is at least 6 months old, -# c.) There is a security issue that is addressed by the toolchain update. -rust-version = "1.88.0" +# Cloud Hypervisor Workspace +# +# The main crate producing the binaries is in `./cloud-hypervisor`. [profile.release] codegen-units = 1 lto = true opt-level = "s" -strip = true -[profile.profiling] -debug = true +# Tradeof between performance and fast compilation times for local testing and +# development with frequent rebuilds. 
+[profile.optimized-dev] +codegen-units = 16 inherits = "release" +lto = false +opt-level = 2 strip = false -[dependencies] -anyhow = { workspace = true } -api_client = { path = "api_client" } -clap = { workspace = true, features = ["string"] } -dhat = { workspace = true, optional = true } -env_logger = { workspace = true } -epoll = { workspace = true } -event_monitor = { path = "event_monitor" } -hypervisor = { path = "hypervisor" } -libc = { workspace = true } -log = { workspace = true, features = ["std"] } -option_parser = { path = "option_parser" } -seccompiler = { workspace = true } -serde_json = { workspace = true } -signal-hook = { workspace = true } -thiserror = { workspace = true } -tpm = { path = "tpm" } -tracer = { path = "tracer" } -vm-memory = { workspace = true } -vmm = { path = "vmm" } -vmm-sys-util = { workspace = true } -zbus = { version = "5.7.1", optional = true } - -[dev-dependencies] -dirs = { workspace = true } -net_util = { path = "net_util" } -serde_json = { workspace = true } -test_infra = { path = "test_infra" } -wait-timeout = { workspace = true } +# Optimize more for dependencies: They don't require frequent rebuilds. +[profile.optimized-dev.package."*"] +codegen-units = 1 +opt-level = 3 -# Please adjust `vmm::feature_list()` accordingly when changing the -# feature list below -[features] -dbus_api = ["vmm/dbus_api", "zbus"] -default = ["io_uring", "kvm"] -dhat-heap = ["dhat", "vmm/dhat-heap"] # For heap profiling -fw_cfg = ["vmm/fw_cfg"] -guest_debug = ["vmm/guest_debug"] -igvm = ["mshv", "vmm/igvm"] -io_uring = ["vmm/io_uring"] -ivshmem = ["vmm/ivshmem"] -kvm = ["vmm/kvm"] -mshv = ["vmm/mshv"] -pvmemcontrol = ["vmm/pvmemcontrol"] -sev_snp = ["igvm", "mshv", "vmm/sev_snp"] -tdx = ["vmm/tdx"] -tracing = ["tracer/tracing", "vmm/tracing"] +[profile.profiling] +debug = true +inherits = "release" +strip = false [workspace] members = [ "api_client", "arch", "block", + "cloud-hypervisor", "devices", "event_monitor", "hypervisor", @@ -103,6 +54,7 @@ members = [ "vmm", ] package.edition = "2024" +resolver = "3" [workspace.dependencies] # rust-vmm crates @@ -145,10 +97,32 @@ dirs = "6.0.0" env_logger = "0.11.8" epoll = "4.4.0" flume = "0.11.1" +itertools = "0.14.0" libc = "0.2.177" log = "0.4.28" +rustls = "0.23.34" signal-hook = "0.3.18" thiserror = "2.0.17" uuid = { version = "1.18.1" } wait-timeout = "0.2.1" zerocopy = { version = "0.8.27", default-features = false } + +[workspace.lints.rust] +# `level = warn` is irrelevant here but mandatory for rustc/cargo +unexpected_cfgs = { level = "warn", check-cfg = ['cfg(devcli_testenv)'] } + +[workspace.lints.clippy] +# Any clippy lint (group) in alphabetical order: +# https://rust-lang.github.io/rust-clippy/master/index.html + +# Groups +all = "deny" # shorthand for the other groups but here for compleness +complexity = "deny" +correctness = "deny" +perf = "deny" +style = "deny" +suspicious = "deny" + +# Individual Lints +assertions_on_result_states = "deny" +undocumented_unsafe_blocks = "deny" diff --git a/api_client/Cargo.toml b/api_client/Cargo.toml index 429ecbf927..b8791dfc3d 100644 --- a/api_client/Cargo.toml +++ b/api_client/Cargo.toml @@ -7,3 +7,6 @@ version = "0.1.0" [dependencies] thiserror = { workspace = true } vmm-sys-util = { workspace = true } + +[lints] +workspace = true diff --git a/arch/Cargo.toml b/arch/Cargo.toml index 3bd32affb1..804be793d0 100644 --- a/arch/Cargo.toml +++ b/arch/Cargo.toml @@ -27,3 +27,6 @@ vmm-sys-util = { workspace = true, features = ["with-serde"] } [target.'cfg(any(target_arch = 
"aarch64", target_arch = "riscv64"))'.dependencies] fdt_parser = { version = "0.1.5", package = "fdt" } vm-fdt = { workspace = true } + +[lints] +workspace = true diff --git a/block/Cargo.toml b/block/Cargo.toml index db4ac9a6b3..9823c1f818 100644 --- a/block/Cargo.toml +++ b/block/Cargo.toml @@ -28,3 +28,6 @@ vm-memory = { workspace = true, features = [ ] } vm-virtio = { path = "../vm-virtio" } vmm-sys-util = { workspace = true } + +[lints] +workspace = true diff --git a/block/src/async_io.rs b/block/src/async_io.rs index e4d8aaa256..dae94cbd2e 100644 --- a/block/src/async_io.rs +++ b/block/src/async_io.rs @@ -18,6 +18,14 @@ pub enum DiskFileError { /// Failed creating a new AsyncIo. #[error("Failed creating a new AsyncIo")] NewAsyncIo(#[source] std::io::Error), + + /// Unsupported operation. + #[error("Unsupported operation")] + Unsupported, + + /// Resize failed + #[error("Resize failed")] + ResizeError, } pub type DiskFileResult = std::result::Result; @@ -61,6 +69,8 @@ pub trait DiskFile: Send { fn topology(&mut self) -> DiskTopology { DiskTopology::default() } + fn resize(&mut self, size: u64) -> DiskFileResult<()>; + /// Returns the file descriptor of the underlying disk image file. /// /// The file descriptor is supposed to be used for `fcntl()` calls but no diff --git a/block/src/fcntl.rs b/block/src/fcntl.rs index 2e34de1d6a..3687288a6b 100644 --- a/block/src/fcntl.rs +++ b/block/src/fcntl.rs @@ -101,13 +101,52 @@ impl LockState { } } +/// The granularity of the advisory lock. +/// +/// The granularity has significant implications in typical cloud deployments +/// with network storage. The Linux kernel will sync advisory locks to network +/// file systems, but these backends may have different policies and handle +/// locks differently. For example, Netapp speaks a NFS API but will treat +/// advisory OFD locks for the whole file as mandatory locks, whereas byte-range +/// locks for the whole file will remain advisory [0]. +/// +/// As it is a valid use case to prevent multiple CHV instances from accessing +/// the same disk but disk management software (e.g., Cinder in OpenStack) +/// should be able to snapshot disks while VMs are running, we need special +/// control over the lock granularity. Therefore, it is a valid use case to lock +/// the whole byte range of a disk image without technically locking the whole +/// file - to get the best of both worlds. +/// +/// [0] https://kb.netapp.com/on-prem/ontap/da/NAS/NAS-KBs/How_is_Mandatory_Locking_supported_for_NFSv4_on_ONTAP_9 +#[derive(Clone, Copy, Debug)] +pub enum LockGranularity { + WholeFile, + ByteRange(u64 /* from, inclusive */, u64 /* len */), +} + +impl LockGranularity { + const fn l_start(self) -> u64 { + match self { + LockGranularity::WholeFile => 0, + LockGranularity::ByteRange(start, _) => start, + } + } + + const fn l_len(self) -> u64 { + match self { + LockGranularity::WholeFile => 0, /* EOF */ + LockGranularity::ByteRange(_, len) => len, + } + } +} + /// Returns a [`struct@libc::flock`] structure for the whole file. 
-const fn get_flock(lock_type: LockType) -> libc::flock { +const fn get_flock(lock_type: LockType, granularity: LockGranularity) -> libc::flock { libc::flock { l_type: lock_type.to_libc_val() as libc::c_short, l_whence: libc::SEEK_SET as libc::c_short, - l_start: 0, - l_len: 0, /* EOF */ + l_start: granularity.l_start() as libc::c_long, + l_len: granularity.l_len() as libc::c_long, l_pid: 0, /* filled by callee */ } } @@ -122,8 +161,13 @@ const fn get_flock(lock_type: LockType) -> libc::flock { /// - `file`: The file to acquire a lock for [`LockType`]. The file's state will /// be logically mutated, but not technically. /// - `lock_type`: The [`LockType`] -pub fn try_acquire_lock(file: Fd, lock_type: LockType) -> Result<(), LockError> { - let flock = get_flock(lock_type); +/// - `granularity`: The [`LockGranularity`]. +pub fn try_acquire_lock( + file: Fd, + lock_type: LockType, + granularity: LockGranularity, +) -> Result<(), LockError> { + let flock = get_flock(lock_type, granularity); let res = fcntl(file.as_raw_fd(), FcntlArg::F_OFD_SETLK(&flock)); match res { @@ -146,8 +190,9 @@ pub fn try_acquire_lock(file: Fd, lock_type: LockType) -> Result<() /// /// # Parameters /// - `file`: The file to clear all locks for [`LockType`]. -pub fn clear_lock(file: Fd) -> Result<(), LockError> { - try_acquire_lock(file, LockType::Unlock) +/// - `granularity`: The [`LockGranularity`]. +pub fn clear_lock(file: Fd, granularity: LockGranularity) -> Result<(), LockError> { + try_acquire_lock(file, LockType::Unlock, granularity) } /// Returns the current lock state using [`fcntl`] with respect to the given @@ -155,8 +200,12 @@ pub fn clear_lock(file: Fd) -> Result<(), LockError> { /// /// # Parameters /// - `file`: The file for which to get the lock state. -pub fn get_lock_state(file: Fd) -> Result { - let mut flock = get_flock(LockType::Write); +/// - `granularity`: The [`LockGranularity`]. 
+pub fn get_lock_state( + file: Fd, + granularity: LockGranularity, +) -> Result { + let mut flock = get_flock(LockType::Write, granularity); let res = fcntl(file.as_raw_fd(), FcntlArg::F_OFD_GETLK(&mut flock)); match res { 0 => { diff --git a/block/src/fixed_vhd_async.rs b/block/src/fixed_vhd_async.rs index ac02e21bf3..07ad258c4a 100644 --- a/block/src/fixed_vhd_async.rs +++ b/block/src/fixed_vhd_async.rs @@ -34,6 +34,10 @@ impl DiskFile for FixedVhdDiskAsync { ) as Box) } + fn resize(&mut self, _size: u64) -> DiskFileResult<()> { + Err(DiskFileError::Unsupported) + } + fn fd(&mut self) -> BorrowedDiskFd<'_> { BorrowedDiskFd::new(self.0.as_raw_fd()) } diff --git a/block/src/fixed_vhd_sync.rs b/block/src/fixed_vhd_sync.rs index c125710698..0f05c66ad7 100644 --- a/block/src/fixed_vhd_sync.rs +++ b/block/src/fixed_vhd_sync.rs @@ -34,6 +34,10 @@ impl DiskFile for FixedVhdDiskSync { ) as Box) } + fn resize(&mut self, _size: u64) -> DiskFileResult<()> { + Err(DiskFileError::Unsupported) + } + fn fd(&mut self) -> BorrowedDiskFd<'_> { BorrowedDiskFd::new(self.0.as_raw_fd()) } diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index cd6a1fb774..36b82e81cc 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -41,6 +41,9 @@ impl DiskFile for QcowDiskSync { fn fd(&mut self) -> BorrowedDiskFd<'_> { BorrowedDiskFd::new(self.qcow_file.as_raw_fd()) } + fn resize(&mut self, _size: u64) -> DiskFileResult<()> { + Err(DiskFileError::Unsupported) + } } pub struct QcowSync { diff --git a/block/src/raw_async.rs b/block/src/raw_async.rs index 1a582073b0..a982623a59 100644 --- a/block/src/raw_async.rs +++ b/block/src/raw_async.rs @@ -47,6 +47,19 @@ impl DiskFile for RawFileDisk { } } + fn resize(&mut self, size: u64) -> DiskFileResult<()> { + let borrowed_fd = self.fd(); + let raw_fd = borrowed_fd.as_raw_fd(); + + // SAFETY: FFI call into libc, trivially safe + let rc = unsafe { libc::ftruncate(raw_fd, size as libc::off_t) }; + if rc == 0 { + Ok(()) + } else { + Err(DiskFileError::ResizeError) + } + } + fn fd(&mut self) -> BorrowedDiskFd<'_> { BorrowedDiskFd::new(self.file.as_raw_fd()) } diff --git a/block/src/raw_async_aio.rs b/block/src/raw_async_aio.rs index 9a74fa41d7..7404e81c81 100644 --- a/block/src/raw_async_aio.rs +++ b/block/src/raw_async_aio.rs @@ -50,6 +50,10 @@ impl DiskFile for RawFileDiskAio { } } + fn resize(&mut self, _size: u64) -> DiskFileResult<()> { + Err(DiskFileError::Unsupported) + } + fn fd(&mut self) -> BorrowedDiskFd<'_> { BorrowedDiskFd::new(self.file.as_raw_fd()) } diff --git a/block/src/raw_sync.rs b/block/src/raw_sync.rs index 6b98147e19..43a9a5b3f0 100644 --- a/block/src/raw_sync.rs +++ b/block/src/raw_sync.rs @@ -47,6 +47,10 @@ impl DiskFile for RawFileDiskSync { fn fd(&mut self) -> BorrowedDiskFd<'_> { BorrowedDiskFd::new(self.file.as_raw_fd()) } + + fn resize(&mut self, _size: u64) -> DiskFileResult<()> { + Err(DiskFileError::Unsupported) + } } pub struct RawFileSync { diff --git a/block/src/vhdx_sync.rs b/block/src/vhdx_sync.rs index 01bcbf5e7f..0028672d36 100644 --- a/block/src/vhdx_sync.rs +++ b/block/src/vhdx_sync.rs @@ -38,6 +38,10 @@ impl DiskFile for VhdxDiskSync { ) } + fn resize(&mut self, _size: u64) -> DiskFileResult<()> { + Err(DiskFileError::Unsupported) + } + fn fd(&mut self) -> BorrowedDiskFd<'_> { BorrowedDiskFd::new(self.vhdx_file.as_raw_fd()) } diff --git a/cloud-hypervisor/Cargo.toml b/cloud-hypervisor/Cargo.toml new file mode 100644 index 0000000000..579bae8a27 --- /dev/null +++ b/cloud-hypervisor/Cargo.toml @@ -0,0 +1,69 @@ 
+[package] +authors = ["The Cloud Hypervisor Authors"] +build = "build.rs" +default-run = "cloud-hypervisor" +description = "Open source Virtual Machine Monitor (VMM) that runs on top of KVM & MSHV" +edition = "2024" +homepage = "https://github.com/cloud-hypervisor/cloud-hypervisor" +license = "Apache-2.0 AND BSD-3-Clause" +name = "cloud-hypervisor" +version = "49.0.0" +# Minimum buildable version: +# Keep in sync with version in .github/workflows/build.yaml +# Policy on MSRV (see #4318): +# Can only be bumped if satisfying any of the following: +# a.) A dependency requires it, +# b.) If we want to use a new feature and that MSRV is at least 6 months old, +# c.) There is a security issue that is addressed by the toolchain update. +rust-version = "1.89.0" + +[dependencies] +anyhow = { workspace = true } +api_client = { path = "../api_client" } +clap = { workspace = true, features = ["string"] } +dhat = { workspace = true, optional = true } +env_logger = { workspace = true } +epoll = { workspace = true } +event_monitor = { path = "../event_monitor" } +hypervisor = { path = "../hypervisor" } +libc = { workspace = true } +log = { workspace = true, features = ["std"] } +option_parser = { path = "../option_parser" } +seccompiler = { workspace = true } +serde_json = { workspace = true } +signal-hook = { workspace = true } +thiserror = { workspace = true } +tpm = { path = "../tpm" } +tracer = { path = "../tracer" } +vm-memory = { workspace = true } +vmm = { path = "../vmm" } +vmm-sys-util = { workspace = true } +zbus = { version = "5.7.1", optional = true } + +[dev-dependencies] +dirs = { workspace = true } +net_util = { path = "../net_util" } +serde_json = { workspace = true } +test_infra = { path = "../test_infra" } +wait-timeout = { workspace = true } + +# Please adjust `vmm::feature_list()` accordingly when changing the +# feature list below +[features] +dbus_api = ["vmm/dbus_api", "zbus"] +default = ["io_uring", "kvm"] +dhat-heap = ["dhat", "vmm/dhat-heap"] # For heap profiling +fw_cfg = ["vmm/fw_cfg"] +guest_debug = ["vmm/guest_debug"] +igvm = ["mshv", "vmm/igvm"] +io_uring = ["vmm/io_uring"] +ivshmem = ["vmm/ivshmem"] +kvm = ["vmm/kvm"] +mshv = ["vmm/mshv"] +pvmemcontrol = ["vmm/pvmemcontrol"] +sev_snp = ["igvm", "mshv", "vmm/sev_snp"] +tdx = ["vmm/tdx"] +tracing = ["tracer/tracing", "vmm/tracing"] + +[lints] +workspace = true diff --git a/build.rs b/cloud-hypervisor/build.rs similarity index 100% rename from build.rs rename to cloud-hypervisor/build.rs diff --git a/src/bin/ch-remote.rs b/cloud-hypervisor/src/bin/ch-remote.rs similarity index 89% rename from src/bin/ch-remote.rs rename to cloud-hypervisor/src/bin/ch-remote.rs index 803ffc7ee9..9e042ebe5b 100644 --- a/src/bin/ch-remote.rs +++ b/cloud-hypervisor/src/bin/ch-remote.rs @@ -9,7 +9,9 @@ mod test_util; use std::io::Read; use std::marker::PhantomData; +use std::num::NonZeroU32; use std::os::unix::net::UnixStream; +use std::path::PathBuf; use std::process; use api_client::{ @@ -320,6 +322,22 @@ fn rest_api_do_command(matches: &ArgMatches, socket: &mut UnixStream) -> ApiResu )?; simple_api_command(socket, "PUT", "resize", Some(&resize)).map_err(Error::HttpApiClient) } + Some("resize-disk") => { + let resize_disk = resize_disk_config( + matches + .subcommand_matches("resize-disk") + .unwrap() + .get_one::("disk") + .unwrap(), + matches + .subcommand_matches("resize-disk") + .unwrap() + .get_one::("size") + .unwrap(), + )?; + simple_api_command(socket, "PUT", "resize-disk", Some(&resize_disk)) + .map_err(Error::HttpApiClient) + } 
Some("resize-zone") => { let resize_zone = resize_zone_config( matches @@ -474,11 +492,34 @@ fn rest_api_do_command(matches: &ArgMatches, socket: &mut UnixStream) -> ApiResu .subcommand_matches("send-migration") .unwrap() .get_one::("send_migration_config") - .unwrap(), + .unwrap() + .to_owned(), matches .subcommand_matches("send-migration") .unwrap() .get_flag("send_migration_local"), + *matches + .subcommand_matches("send-migration") + .unwrap() + .get_one::("downtime-ms") + .unwrap_or(&300), + *matches + .subcommand_matches("send-migration") + .unwrap() + .get_one::("migration-timeout-s") + .unwrap_or(&3600), + matches + .subcommand_matches("send-migration") + .unwrap() + .get_one::("connections") + .copied() + .and_then(NonZeroU32::new) + .unwrap_or(NonZeroU32::new(1).unwrap()), + matches + .subcommand_matches("send-migration") + .unwrap() + .get_one::("tls-dir") + .cloned(), ); simple_api_command(socket, "PUT", "send-migration", Some(&send_migration_data)) .map_err(Error::HttpApiClient) @@ -489,7 +530,13 @@ fn rest_api_do_command(matches: &ArgMatches, socket: &mut UnixStream) -> ApiResu .subcommand_matches("receive-migration") .unwrap() .get_one::("receive_migration_config") - .unwrap(), + .unwrap() + .to_owned(), + matches + .subcommand_matches("receive-migration") + .unwrap() + .get_one::("tls-dir") + .cloned(), ); simple_api_command( socket, @@ -693,6 +740,16 @@ fn dbus_api_do_command(matches: &ArgMatches, proxy: &DBusApi1ProxyBlocking<'_>) .subcommand_matches("send-migration") .unwrap() .get_flag("send_migration_local"), + *matches + .subcommand_matches("send-migration") + .unwrap() + .get_one::("downtime-ms") + .unwrap_or(&300), + *matches + .subcommand_matches("send-migration") + .unwrap() + .get_one::("migration-timeout-s") + .unwrap_or(&3600), ); proxy.api_vm_send_migration(&send_migration_data) } @@ -762,6 +819,18 @@ fn resize_config( Ok(serde_json::to_string(&resize).unwrap()) } +fn resize_disk_config(id: &str, size: &str) -> Result { + let resize_zone = vmm::api::VmResizeDiskData { + id: id.to_owned(), + desired_size: size + .parse::() + .map_err(Error::InvalidMemorySize)? + .0, + }; + + Ok(serde_json::to_string(&resize_zone).unwrap()) +} + fn resize_zone_config(id: &str, size: &str) -> Result { let resize_zone = vmm::api::VmResizeZoneData { id: id.to_owned(), @@ -874,18 +943,35 @@ fn coredump_config(destination_url: &str) -> String { serde_json::to_string(&coredump_config).unwrap() } -fn receive_migration_data(url: &str) -> String { +fn receive_migration_data(url: String, tls_dir: Option) -> String { let receive_migration_data = vmm::api::VmReceiveMigrationData { - receiver_url: url.to_owned(), + receiver_url: url, + tcp_serial_url: None, + // Only FDs transmitted via an SCM_RIGHTS UNIX Domain Socket message + // are valid. Transmitting specific FD nums via the HTTP API is + // almost always invalid. 
+ net_fds: None, + tls_dir, }; serde_json::to_string(&receive_migration_data).unwrap() } -fn send_migration_data(url: &str, local: bool) -> String { +fn send_migration_data( + url: String, + local: bool, + downtime: u64, + migration_timeout: u64, + connections: NonZeroU32, + tls_dir: Option, +) -> String { let send_migration_data = vmm::api::VmSendMigrationData { - destination_url: url.to_owned(), + destination_url: url, local, + downtime, + migration_timeout, + connections, + tls_dir, }; serde_json::to_string(&send_migration_data).unwrap() @@ -997,7 +1083,14 @@ fn get_cli_commands_sorted() -> Box<[Command]> { .arg( Arg::new("receive_migration_config") .index(1) + // Live migration with net_fds not supported in ch-remote. .help(""), + ) + .arg( + Arg::new("tls-dir") + .long("tls-dir") + .help("directory with TLS certificates") + .num_args(1), ), Command::new("remove-device") .about("Remove VFIO and PCI device") @@ -1022,6 +1115,20 @@ fn get_cli_commands_sorted() -> Box<[Command]> { .help("New memory size in bytes (supports K/M/G suffix)") .num_args(1), ), + Command::new("resize-disk") + .about("grows/shrinks an attached disk") + .arg( + Arg::new("disk") + .long("disk") + .help("disk identifier") + .num_args(1), + ) + .arg( + Arg::new("size") + .long("size") + .help("new disk size") + .num_args(1), + ), Command::new("resize-zone") .about("Resize a memory zone") .arg( @@ -1046,6 +1153,32 @@ fn get_cli_commands_sorted() -> Box<[Command]> { Command::new("resume").about("Resume the VM"), Command::new("send-migration") .about("Initiate a VM migration") + .arg( + Arg::new("connections") + .long("connections") + .help("The number of connections to use for the migration") + .num_args(1) + .value_parser(clap::value_parser!(u32)) + .default_value("1"), + ) + .arg( + Arg::new("downtime-ms") + .long("downtime-ms") + .visible_alias("downtime") + .help("Set the expected maximum downtime in milliseconds") + .num_args(1) + .value_parser(clap::value_parser!(u64)) + .default_value("300"), + ) + .arg( + Arg::new("migration-timeout-s") + .long("migration-timeout-s") + .visible_alias("migration-timeout") + .help("Set the maximum allowed migration time in seconds") + .num_args(1) + .value_parser(clap::value_parser!(u64)) + .default_value("3600"), + ) .arg( Arg::new("send_migration_config") .index(1) @@ -1056,6 +1189,12 @@ fn get_cli_commands_sorted() -> Box<[Command]> { .long("local") .num_args(0) .action(ArgAction::SetTrue), + ) + .arg( + Arg::new("tls-dir") + .long("tls-dir") + .help("directory with TLS certificates") + .num_args(1), ), Command::new("shutdown").about("Shutdown the VM"), Command::new("shutdown-vmm").about("Shutdown the VMM"), diff --git a/src/lib.rs b/cloud-hypervisor/src/lib.rs similarity index 100% rename from src/lib.rs rename to cloud-hypervisor/src/lib.rs diff --git a/src/main.rs b/cloud-hypervisor/src/main.rs similarity index 97% rename from src/main.rs rename to cloud-hypervisor/src/main.rs index 32dad8dffd..0de3cfcb89 100644 --- a/src/main.rs +++ b/cloud-hypervisor/src/main.rs @@ -420,7 +420,7 @@ fn get_cli_options_sorted( .default_value("true"), Arg::new("serial") .long("serial") - .help("Control serial port: off|null|pty|tty|file=|socket=") + .help("Control serial port: off|null|pty|tty|file=|socket=|tcp=") .default_value("null") .group("vm-config"), Arg::new("tpm") @@ -993,6 +993,7 @@ mod unit_tests { rng: RngConfig { src: PathBuf::from("/dev/urandom"), iommu: false, + bdf_device: None, }, balloon: None, fs: None, @@ -1002,12 +1003,16 @@ mod unit_tests { mode: 
ConsoleOutputMode::Null, iommu: false, socket: None, + url: None, + bdf_device: None, }, console: ConsoleConfig { file: None, mode: ConsoleOutputMode::Tty, iommu: false, socket: None, + url: None, + bdf_device: None, }, #[cfg(target_arch = "x86_64")] debug_console: DebugConsoleConfig::default(), @@ -1209,6 +1214,24 @@ mod unit_tests { }"#, true, ), + ( + vec![ + "cloud-hypervisor", + "--kernel", + "/path/to/kernel", + "--disk", + "path=/path/to/disk/1,addr=15.0", + "path=/path/to/disk/2", + ], + r#"{ + "payload": {"kernel": "/path/to/kernel"}, + "disks": [ + {"path": "/path/to/disk/1", "bdf_device": 21}, + {"path": "/path/to/disk/2"} + ] + }"#, + true, + ), ( vec![ "cloud-hypervisor", @@ -1406,6 +1429,20 @@ mod unit_tests { }"#, true, ), + ( + vec![ + "cloud-hypervisor", "--kernel", "/path/to/kernel", + "--net", + "mac=12:34:56:78:90:ab,host_mac=34:56:78:90:ab:cd,tap=tap0,ip=1.2.3.4,mask=5.6.7.8,addr=08.0", + ], + r#"{ + "payload": {"kernel": "/path/to/kernel"}, + "net": [ + {"mac": "12:34:56:78:90:ab", "host_mac": "34:56:78:90:ab:cd", "tap": "tap0", "ip": "1.2.3.4", "mask": "5.6.7.8", "num_queues": 2, "queue_size": 256, "bdf_device": 8} + ] + }"#, + true, + ), #[cfg(target_arch = "x86_64")] ( vec![ @@ -1477,11 +1514,11 @@ mod unit_tests { "--kernel", "/path/to/kernel", "--rng", - "src=/path/to/entropy/source", + "src=/path/to/entropy/source,addr=11.0", ], r#"{ "payload": {"kernel": "/path/to/kernel"}, - "rng": {"src": "/path/to/entropy/source"} + "rng": {"src": "/path/to/entropy/source", "bdf_device": 17} }"#, true, )] @@ -1498,14 +1535,14 @@ mod unit_tests { "cloud-hypervisor", "--kernel", "/path/to/kernel", "--memory", "shared=true", "--fs", - "tag=virtiofs1,socket=/path/to/sock1", + "tag=virtiofs1,socket=/path/to/sock1,addr=10.0", "tag=virtiofs2,socket=/path/to/sock2", ], r#"{ "payload": {"kernel": "/path/to/kernel"}, "memory" : { "shared": true, "size": 536870912 }, "fs": [ - {"tag": "virtiofs1", "socket": "/path/to/sock1"}, + {"tag": "virtiofs1", "socket": "/path/to/sock1", "bdf_device": 16}, {"tag": "virtiofs2", "socket": "/path/to/sock2"} ] }"#, @@ -1577,13 +1614,13 @@ mod unit_tests { "--kernel", "/path/to/kernel", "--pmem", - "file=/path/to/img/1,size=1G", + "file=/path/to/img/1,size=1G,addr=1F.0", "file=/path/to/img/2,size=2G", ], r#"{ "payload": {"kernel": "/path/to/kernel"}, "pmem": [ - {"file": "/path/to/img/1", "size": 1073741824}, + {"file": "/path/to/img/1", "size": 1073741824,"bdf_device": 31}, {"file": "/path/to/img/2", "size": 2147483648} ] }"#, @@ -1861,13 +1898,13 @@ mod unit_tests { "--kernel", "/path/to/kernel", "--vdpa", - "path=/path/to/device/1", + "path=/path/to/device/1,addr=18.0", "path=/path/to/device/2,num_queues=2", ], r#"{ "payload": {"kernel": "/path/to/kernel"}, "vdpa": [ - {"path": "/path/to/device/1", "num_queues": 1}, + {"path": "/path/to/device/1", "num_queues": 1, "bdf_device": 24}, {"path": "/path/to/device/2", "num_queues": 2} ] }"#, @@ -1906,11 +1943,11 @@ mod unit_tests { "--kernel", "/path/to/kernel", "--vsock", - "cid=123,socket=/path/to/sock/1", + "cid=123,socket=/path/to/sock/1,addr=0F.0", ], r#"{ "payload": {"kernel": "/path/to/kernel"}, - "vsock": {"cid": 123, "socket": "/path/to/sock/1"} + "vsock": {"cid": 123, "socket": "/path/to/sock/1", "bdf_device": 15} }"#, true, ), diff --git a/src/test_util.rs b/cloud-hypervisor/src/test_util.rs similarity index 100% rename from src/test_util.rs rename to cloud-hypervisor/src/test_util.rs diff --git a/tests/integration.rs b/cloud-hypervisor/tests/integration.rs similarity index 99% rename from 
tests/integration.rs rename to cloud-hypervisor/tests/integration.rs index 53a464e8e7..fb2d15783a 100644 --- a/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 // +#![cfg(devcli_testenv)] #![allow(clippy::undocumented_unsafe_blocks)] // When enabling the `mshv` feature, we skip quite some tests and // hence have known dead-code. This annotation silences dead-code @@ -7657,7 +7658,9 @@ mod ivshmem { &migration_socket, &src_api_socket, &dest_api_socket, - local + local, + 300, + 60 ), "Unsuccessful command: 'send-migration' or 'receive-migration'." ); @@ -9828,6 +9831,8 @@ mod live_migration { src_api_socket: &str, dest_api_socket: &str, local: bool, + downtime: u64, + timeout: u64, ) -> bool { // Start to receive migration from the destination VM let mut receive_migration = Command::new(clh_command("ch-remote")) @@ -9848,6 +9853,10 @@ mod live_migration { format!("--api-socket={}", &src_api_socket), "send-migration".to_string(), format! {"unix:{migration_socket}"}, + "--downtime".to_string(), + format!("{downtime}"), + "--migration-timeout".to_string(), + format!("{timeout}"), ] .to_vec(); @@ -10072,8 +10081,18 @@ mod live_migration { .unwrap(), ); + let downtime = 100_000; // 100s + let migration_timeout = 1000; // 1000s + assert!( - start_live_migration(&migration_socket, &src_api_socket, &dest_api_socket, local), + start_live_migration( + &migration_socket, + &src_api_socket, + &dest_api_socket, + local, + downtime, + migration_timeout + ), "Unsuccessful command: 'send-migration' or 'receive-migration'." ); }); @@ -10246,8 +10265,18 @@ mod live_migration { .unwrap(), ); + let downtime = 100000; + let migration_timeout = 1000; + assert!( - start_live_migration(&migration_socket, &src_api_socket, &dest_api_socket, local), + start_live_migration( + &migration_socket, + &src_api_socket, + &dest_api_socket, + local, + downtime, + migration_timeout + ), "Unsuccessful command: 'send-migration' or 'receive-migration'." ); }); @@ -10464,8 +10493,18 @@ mod live_migration { .unwrap(), ); + let downtime = 100000; + let migration_timeout = 1000; + assert!( - start_live_migration(&migration_socket, &src_api_socket, &dest_api_socket, local), + start_live_migration( + &migration_socket, + &src_api_socket, + &dest_api_socket, + local, + downtime, + migration_timeout + ), "Unsuccessful command: 'send-migration' or 'receive-migration'." ); }); @@ -10680,8 +10719,18 @@ mod live_migration { .unwrap(), ); + let downtime = 100000; + let migration_timeout = 1000; + assert!( - start_live_migration(&migration_socket, &src_api_socket, &dest_api_socket, local), + start_live_migration( + &migration_socket, + &src_api_socket, + &dest_api_socket, + local, + downtime, + migration_timeout + ), "Unsuccessful command: 'send-migration' or 'receive-migration'." ); }); @@ -10790,8 +10839,18 @@ mod live_migration { .unwrap(), ); + let downtime = 100000; + let migration_timeout = 1000; + assert!( - start_live_migration(&migration_socket, &src_api_socket, &dest_api_socket, local), + start_live_migration( + &migration_socket, + &src_api_socket, + &dest_api_socket, + local, + downtime, + migration_timeout + ), "Unsuccessful command: 'send-migration' or 'receive-migration'." 
); }); @@ -10937,8 +10996,18 @@ mod live_migration { .unwrap(), ); + let downtime = 100000; + let migration_timeout = 1000; + assert!( - start_live_migration(&migration_socket, &src_api_socket, &dest_api_socket, true), + start_live_migration( + &migration_socket, + &src_api_socket, + &dest_api_socket, + true, + downtime, + migration_timeout + ), "Unsuccessful command: 'send-migration' or 'receive-migration'." ); }); @@ -10993,7 +11062,12 @@ mod live_migration { .port() } - fn start_live_migration_tcp(src_api_socket: &str, dest_api_socket: &str) -> bool { + fn start_live_migration_tcp( + src_api_socket: &str, + dest_api_socket: &str, + downtime: u64, + timeout: u64, + ) -> bool { // Get an available TCP port let migration_port = get_available_port(); let host_ip = "127.0.0.1"; @@ -11020,6 +11094,10 @@ mod live_migration { &format!("--api-socket={src_api_socket}"), "send-migration", &format!("tcp:{host_ip}:{migration_port}"), + "--downtime", + &format!("{downtime}"), + "--migration-timeout", + &format!("{timeout}"), ]) .stdin(Stdio::null()) .stderr(Stdio::piped()) @@ -11090,6 +11168,8 @@ mod live_migration { .output() .expect("Expect creating disk image to succeed"); let pmem_path = String::from("/dev/pmem0"); + let downtime = 100000; + let timeout = 1000; // Start the source VM let src_vm_path = clh_command("cloud-hypervisor"); @@ -11152,7 +11232,7 @@ mod live_migration { } // Start TCP live migration assert!( - start_live_migration_tcp(&src_api_socket, &dest_api_socket), + start_live_migration_tcp(&src_api_socket, &dest_api_socket, downtime, timeout), "Unsuccessful command: 'send-migration' or 'receive-migration'." ); }); diff --git a/devices/Cargo.toml b/devices/Cargo.toml index 201d5284fc..f635e66a49 100644 --- a/devices/Cargo.toml +++ b/devices/Cargo.toml @@ -48,3 +48,6 @@ fw_cfg = ["arch/fw_cfg", "bitfield-struct", "linux-loader", "zerocopy"] ivshmem = [] kvm = ["arch/kvm"] pvmemcontrol = [] + +[lints] +workspace = true diff --git a/devices/src/ioapic.rs b/devices/src/ioapic.rs index 3ce827bd18..af4a36bcb3 100644 --- a/devices/src/ioapic.rs +++ b/devices/src/ioapic.rs @@ -171,7 +171,7 @@ impl BusDevice for Ioapic { return None; } - debug!("IOAPIC_W @ offset 0x{offset:x}"); + trace!("IOAPIC_W @ offset 0x{offset:x}"); let value = LittleEndian::read_u32(data); @@ -249,7 +249,7 @@ impl Ioapic { } fn ioapic_write(&mut self, val: u32) { - debug!("IOAPIC_W reg 0x{:x}, val 0x{:x}", self.reg_sel, val); + trace!("IOAPIC_W reg 0x{:x}, val 0x{:x}", self.reg_sel, val); match self.reg_sel as u8 { IOAPIC_REG_VERSION => { diff --git a/docs/live_migration.md b/docs/live_migration.md index 94c9afc236..5c77d2625f 100644 --- a/docs/live_migration.md +++ b/docs/live_migration.md @@ -171,7 +171,13 @@ After a few seconds the VM should be up and you can interact with it. Initiate the Migration over TCP: ```console -src $ ch-remote --api-socket=/tmp/api send-migration tcp:{dst}:{port} +src $ ch-remote --api-socket=/tmp/api send-migration tcp:{dst}:{port} +``` + +With migration parameters: + +```console +src $ ch-remote --api-socket=/tmp/api send-migration tcp:{dst}:{port} --migration-timeout 60 --downtime 5000 ``` > Replace {dst}:{port} with the actual IP address and port of your destination host. @@ -180,3 +186,24 @@ After completing the above commands, the source VM will be migrated to the destination host and continue running there. The source VM instance will terminate normally. All ongoing processes and connections within the VM should remain intact after the migration. 
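A hypothetical end-to-end run combining the new options (addresses, port and certificate directory are placeholders):

```console
dst $ ch-remote --api-socket=/tmp/api-dst receive-migration tcp:0.0.0.0:6000 --tls-dir /etc/ch-migration-certs
src $ ch-remote --api-socket=/tmp/api-src send-migration tcp:192.0.2.10:6000 \
        --connections 4 --downtime-ms 300 --migration-timeout-s 600 --tls-dir /etc/ch-migration-certs
```

`--downtime` and `--migration-timeout` remain available as aliases for the two timing options.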
+ +#### Migration Parameters + +Cloud Hypervisor supports additional parameters to control the +migration process: + +- `migration-timeout ` +Sets the maximum time (in seconds) allowed for the migration process. +If the migration takes longer than this timeout, it will be aborted. A +value of 0 means no timeout limit. +- `downtime ` +Sets the maximum acceptable downtime (in milliseconds) during the +migration. This parameter helps control the trade-off between migration +time and VM downtime. + +> The downtime limit is related to the cost of serialization +(deserialization) of vCPU and device state. Therefore, the expected +downtime is always shorter than the actual downtime. + +These parameters can be used with the `send-migration` command to +fine-tune the migration behavior according to your requirements. \ No newline at end of file diff --git a/event_monitor/Cargo.toml b/event_monitor/Cargo.toml index b2b7a4e48d..41d3102807 100644 --- a/event_monitor/Cargo.toml +++ b/event_monitor/Cargo.toml @@ -9,3 +9,6 @@ flume = { workspace = true } libc = { workspace = true } serde = { workspace = true, features = ["derive", "rc"] } serde_json = { workspace = true } + +[lints] +workspace = true diff --git a/hypervisor/Cargo.toml b/hypervisor/Cargo.toml index 93343c931f..70d5502145 100644 --- a/hypervisor/Cargo.toml +++ b/hypervisor/Cargo.toml @@ -21,7 +21,10 @@ cfg-if = { workspace = true } concat-idents = "1.1.5" igvm = { workspace = true, optional = true } igvm_defs = { workspace = true, optional = true } -kvm-bindings = { workspace = true, optional = true, features = ["serde"] } +kvm-bindings = { workspace = true, optional = true, features = [ + "fam-wrappers", + "serde", +] } kvm-ioctls = { workspace = true, optional = true } libc = { workspace = true } log = { workspace = true } @@ -63,3 +66,6 @@ version = "1.21.0" [dev-dependencies] env_logger = { workspace = true } + +[lints] +workspace = true diff --git a/hypervisor/src/arch/x86/mod.rs b/hypervisor/src/arch/x86/mod.rs index c337624621..56d1e98a24 100644 --- a/hypervisor/src/arch/x86/mod.rs +++ b/hypervisor/src/arch/x86/mod.rs @@ -12,6 +12,12 @@ // use core::fmt; +#[cfg(feature = "kvm")] +use std::sync::OnceLock; + +use thiserror::Error; + +use crate::{CpuVendor, Hypervisor}; #[cfg(all(feature = "mshv_emulator", target_arch = "x86_64"))] pub mod emulator; @@ -306,16 +312,156 @@ pub struct MsrEntry { pub data: u64, } -#[serde_with::serde_as] -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] -pub struct XsaveState { - #[serde_as(as = "[_; 1024usize]")] - pub region: [u32; 1024usize], +/// Error that may be returned when attempting to enable AMX state components for guests +#[derive(Debug, Error)] +pub enum AmxGuestSupportError { + /// Attempted to enable AMX on a CPU from a vendor that is not known to support AMX features. + #[error("The host CPU's vendor does not support AMX features. Only Intel provides such CPUs.")] + VendorDoesNotSupportAmx, + /// Unable to verify that the host supports AMX. + #[error("The host does not support AMX tile state components: errno={errno}")] + AmxNotSupported { errno: i64 }, + /// The syscall to check for AMX tile state support succeeded, but the returned + /// features did not match our expectations. + #[error( + "Could not verify AMX support. These are the supported features that were reported: features={features}" + )] + InvalidAmxTileFeatureCheck { features: usize }, + /// The request to enable AMX related state components for guests failed. 
+ #[error("Failed to enable AMX tile state components for guests: errno={errno}")] + AmxGuestTileRequest { errno: i64 }, } -impl Default for XsaveState { - fn default() -> Self { - // SAFETY: this is plain old data structure - unsafe { ::std::mem::zeroed() } +/// The length of the XSAVE flexible array member (FAM). +/// This length increases when arch_prctl is utilized to dynamically add state components. +/// +/// IMPORTANT: This static should only be updated via methods on [`XsaveState`]. +#[cfg(feature = "kvm")] +static XSAVE_FAM_LENGTH: OnceLock = OnceLock::new(); + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct XsaveState(#[cfg(feature = "kvm")] pub(crate) kvm_bindings::Xsave); + +impl XsaveState { + const ARCH_GET_XCOMP_SUPP: usize = 0x1021; + const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025; + const ARCH_XCOMP_TILECFG: usize = 17; + const ARCH_XCOMP_TILEDATA: usize = 18; + + /// Construct an instance via the given initializer. + /// + /// As long as dynamically enabled state components have only been enabled + /// through static methods on this struct it is guaranteed that the + /// initialization routine is given an Xsave struct of the expected size. + #[cfg(feature = "kvm")] + pub(crate) fn with_initializer( + mut init: F, + ) -> Result> + where + F: FnMut(&mut kvm_bindings::Xsave) -> Result<(), E>, + E: Into>, + { + let fam_length = XSAVE_FAM_LENGTH.get().unwrap_or(&0); + + let mut xsave = kvm_bindings::Xsave::new(*fam_length)?; + + init(&mut xsave).map_err(Into::into)?; + Ok(Self(xsave)) + } + + /// This function enables the AMX related TILECFG and TILEDATA state components for guests. + /// + /// # Background + /// AMX uses a concept of tiles which are small 2D blocks of data stored in registers on the CPU, + /// where the TILECFG state component defines the shape and size of each tile (rows and columns), + /// and the TILEDATA state component holds the actual elements of these tiles used by matrix operations. + pub fn enable_amx_state_components( + hypervisor: &dyn Hypervisor, + ) -> Result<(), AmxGuestSupportError> { + Self::amx_supported(hypervisor)?; + Self::request_guest_amx_support()?; + + // If we are using the KVM hypervisor we meed to query for the new xsave2 size and update + // `XSAVE_FAM_LENGTH` accordingly. + #[cfg(feature = "kvm")] + { + // Obtain the number of bytes the kvm_xsave struct requires. + // This number is documented to always be at least 4096 bytes, but + let size = hypervisor.check_extension_int(kvm_ioctls::Cap::Xsave2); + // Reality check: We should at least have this number of bytes and probably more as we have enabled + // AMX tiles. If this is not the case, it is probably best to panic. + assert!(size >= 4096); + let fam_length = { + // Computation is documented in `[kvm_bindings::kvm_xsave2::len]` + ((size as usize) - size_of::()) + .div_ceil(size_of::()) + }; + XSAVE_FAM_LENGTH + .set(fam_length) + .expect("This should only be set once"); + } + + Ok(()) + } + + /// Checks whether the host supports AMX. + /// + /// The `hypervisor` is used to inform us about the + /// CPU vendor (AMX is currently only available on Intel CPUs). + /// + /// Returns `Ok` if AMX is supported on the host and `Err` otherwise. 
+ fn amx_supported(hypervisor: &dyn Hypervisor) -> Result<(), AmxGuestSupportError> { + if !matches!(hypervisor.get_cpu_vendor(), CpuVendor::Intel) { + return Err(AmxGuestSupportError::VendorDoesNotSupportAmx); + } + // We make a syscall to get information about which dynamically enabled + // XSAVE state components are supported. The corresponding state + // component bits will get set in `features` + let mut features: usize = 0; + // SAFETY: Syscall with valid parameters + let result = unsafe { + libc::syscall( + libc::SYS_arch_prctl, + Self::ARCH_GET_XCOMP_SUPP, + &raw mut features, + ) + }; + // Ensure that both the TILECFG and TILEDATA state components are supported + let mask = (1 << Self::ARCH_XCOMP_TILECFG) | (1 << Self::ARCH_XCOMP_TILEDATA); + if result != 0 { + return Err(AmxGuestSupportError::AmxNotSupported { errno: result }); + } + + if (features & mask) == mask { + Ok(()) + } else { + Err(AmxGuestSupportError::InvalidAmxTileFeatureCheck { features }) + } + } + + /// Asks the kernel to provide AMX support for guests. + fn request_guest_amx_support() -> Result<(), AmxGuestSupportError> { + // Make a syscall to request permission for guests to use the TILECFG + // and TILEDATA state components. Note that as per the kernel + // [documentation](https://docs.kernel.org/arch/x86/xstate.html#dynamic-features-for-virtual-machines) + // we need to pass in the number of the highest XSTATE component which is required for + // the facility to work which in this case is TILEDATA. + // + // This syscall will alter the size of `kvm_xsave` when KVM is used as the hypervisor. + // + // SAFETY: Syscall with valid parameters + let result = unsafe { + libc::syscall( + libc::SYS_arch_prctl, + Self::ARCH_REQ_XCOMP_GUEST_PERM, + Self::ARCH_XCOMP_TILEDATA, + ) + }; + if result == 0 { + Ok(()) + } else { + // Unwrap is OK because we verified that `result` is not zero + Err(AmxGuestSupportError::AmxGuestTileRequest { errno: result }) + } } } diff --git a/hypervisor/src/cpu.rs b/hypervisor/src/cpu.rs index 763eaa4558..57f43b32df 100644 --- a/hypervisor/src/cpu.rs +++ b/hypervisor/src/cpu.rs @@ -10,6 +10,9 @@ // // +#[cfg(feature = "kvm")] +use std::os::fd::RawFd; + use thiserror::Error; #[cfg(not(target_arch = "riscv64"))] use vm_memory::GuestAddress; @@ -603,4 +606,11 @@ pub trait Vcpu: Send + Sync { /// Trigger NMI interrupt /// fn nmi(&self) -> Result<()>; + /// Returns the underlying vCPU FD of KVM. + /// + /// # SAFETY + /// This is safe as we only use this to map the KVM_RUN structure for the + /// signal handler and only use it from there. + #[cfg(feature = "kvm")] + unsafe fn get_kvm_vcpu_raw_fd(&self) -> RawFd; } diff --git a/hypervisor/src/hypervisor.rs b/hypervisor/src/hypervisor.rs index 5db46d9908..eec249c71d 100644 --- a/hypervisor/src/hypervisor.rs +++ b/hypervisor/src/hypervisor.rs @@ -111,6 +111,34 @@ pub trait Hypervisor: Send + Sync { /// Return a hypervisor-agnostic Vm trait object /// fn create_vm(&self, config: HypervisorVmConfig) -> Result>; + + /// Query the hypervisor for the availability of an extension. + /// + /// + /// Generally 0 means no and 1 means yes, but some extensions may report + /// additional information in the integer return value. 
+ /// + #[cfg(feature = "kvm")] + fn check_extension_int(&self, capability: kvm_ioctls::Cap) -> i32; + + /// + /// Create a Vm of a specific type using the underlying hypervisor + /// Return a hypervisor-agnostic Vm trait object + /// + fn create_vm_with_type(&self, _vm_type: u64) -> Result> { + unreachable!() + } + /// + /// Create a Vm of a specific type using the underlying hypervisor, passing memory size + /// Return a hypervisor-agnostic Vm trait object + /// + fn create_vm_with_type_and_memory( + &self, + _vm_type: u64, + #[cfg(feature = "sev_snp")] _mem_size: u64, + ) -> Result> { + unreachable!() + } #[cfg(target_arch = "x86_64")] /// /// Get the supported CpuID diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index 5ce110d89a..9972c28715 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -14,10 +14,8 @@ use std::any::Any; use std::collections::HashMap; #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] use std::mem::offset_of; -#[cfg(feature = "tdx")] -use std::os::unix::io::AsRawFd; -#[cfg(feature = "tdx")] -use std::os::unix::io::RawFd; +#[cfg(any(feature = "tdx", feature = "kvm"))] +use std::os::unix::io::{AsRawFd, RawFd}; use std::result; #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] use std::sync::Mutex; @@ -1263,6 +1261,10 @@ impl hypervisor::Hypervisor for KvmHypervisor { } } + fn check_extension_int(&self, capability: kvm_ioctls::Cap) -> i32 { + self.kvm.check_extension_int(capability) + } + fn check_required_extensions(&self) -> hypervisor::Result<()> { check_required_kvm_extensions(&self.kvm) .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into())) @@ -2009,7 +2011,11 @@ impl cpu::Vcpu for KvmVcpu { }, Err(ref e) => match e.errno() { - libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore), + libc::EINTR => { + self.fd.set_kvm_immediate_exit(0); + Ok(cpu::VmExit::Ignore) + } + libc::EAGAIN => Ok(cpu::VmExit::Ignore), _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!( "VCPU error {e:?}" ))), @@ -2679,6 +2685,11 @@ impl cpu::Vcpu for KvmVcpu { self.fd.set_kvm_immediate_exit(exit.into()); } + #[cfg(feature = "kvm")] + unsafe fn get_kvm_vcpu_raw_fd(&self) -> RawFd { + self.fd.as_raw_fd() + } + /// /// Returns the details about TDX exit reason /// @@ -2845,11 +2856,11 @@ impl KvmVcpu { /// X86 specific call that returns the vcpu's current "xsave struct". /// fn get_xsave(&self) -> cpu::Result { - Ok(self - .fd - .get_xsave() - .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))? - .into()) + XsaveState::with_initializer(|state| + // SAFETY: Any configured dynamically enabled state components are always enabled via + // static methods on `XsaveState` hence we know that `state` has the expected size. + unsafe { self.fd.get_xsave2(state) }) + .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(anyhow::Error::from_boxed(e))) } #[cfg(target_arch = "x86_64")] @@ -2857,14 +2868,11 @@ impl KvmVcpu { /// X86 specific call that sets the vcpu's current "xsave struct". /// fn set_xsave(&self, xsave: &XsaveState) -> cpu::Result<()> { - let xsave: kvm_bindings::kvm_xsave = (*xsave).clone().into(); - // SAFETY: Here we trust the kernel not to read past the end of the kvm_xsave struct - // when calling the kvm-ioctl library function. 
- unsafe { - self.fd - .set_xsave(&xsave) - .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into())) - } + // SAFETY: Any configured dynamically enabled state components are always enabled via + // static methods on `XsaveState` hence we know that the wrapped instance has the + // expected size. + unsafe { self.fd.set_xsave2(&xsave.0) } + .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into())) } #[cfg(target_arch = "x86_64")] diff --git a/hypervisor/src/kvm/x86_64/mod.rs b/hypervisor/src/kvm/x86_64/mod.rs index c1bda9d9be..2efd12b0e4 100644 --- a/hypervisor/src/kvm/x86_64/mod.rs +++ b/hypervisor/src/kvm/x86_64/mod.rs @@ -291,18 +291,3 @@ impl From for kvm_msr_entry { } } } - -impl From for XsaveState { - fn from(s: kvm_xsave) -> Self { - Self { region: s.region } - } -} - -impl From for kvm_xsave { - fn from(s: XsaveState) -> Self { - Self { - region: s.region, - extra: Default::default(), - } - } -} diff --git a/hypervisor/src/mshv/mod.rs b/hypervisor/src/mshv/mod.rs index 46a6103b52..baeb03d73b 100644 --- a/hypervisor/src/mshv/mod.rs +++ b/hypervisor/src/mshv/mod.rs @@ -41,6 +41,8 @@ pub mod x86_64; // aarch64 dependencies #[cfg(target_arch = "aarch64")] pub mod aarch64; +#[cfg(feature = "kvm")] +use std::os::fd::RawFd; use std::os::unix::io::AsRawFd; #[cfg(target_arch = "aarch64")] use std::sync::Mutex; @@ -395,6 +397,12 @@ impl hypervisor::Hypervisor for MshvHypervisor { })) } } + + #[cfg(feature = "kvm")] + fn check_extension_int(&self, _capability: kvm_ioctls::Cap) -> i32 { + unimplemented!() + } + #[cfg(target_arch = "x86_64")] /// /// Get the supported CpuID @@ -1560,6 +1568,11 @@ impl cpu::Vcpu for MshvVcpu { Ok(()) } + + #[cfg(feature = "kvm")] + unsafe fn get_kvm_vcpu_raw_fd(&self) -> RawFd { + unimplemented!() + } } impl MshvVcpu { diff --git a/net_gen/Cargo.toml b/net_gen/Cargo.toml index b1443c1f29..a99c7c995d 100644 --- a/net_gen/Cargo.toml +++ b/net_gen/Cargo.toml @@ -7,3 +7,6 @@ version = "0.1.0" [dependencies] vmm-sys-util = { workspace = true } + +[lints] +workspace = true diff --git a/net_util/Cargo.toml b/net_util/Cargo.toml index 8f5df72225..a55db49f8a 100644 --- a/net_util/Cargo.toml +++ b/net_util/Cargo.toml @@ -27,3 +27,6 @@ vmm-sys-util = { workspace = true } pnet = "0.35.0" pnet_datalink = "0.35.0" serde_json = { workspace = true } + +[lints] +workspace = true diff --git a/net_util/src/open_tap.rs b/net_util/src/open_tap.rs index 61e763ba20..95924d6df9 100644 --- a/net_util/src/open_tap.rs +++ b/net_util/src/open_tap.rs @@ -76,7 +76,14 @@ fn open_tap_rx_q_0( let tap = match if_name { Some(name) => Tap::open_named(name, num_rx_q, flags).map_err(Error::TapOpen)?, // Create a new Tap device in Linux, if none was specified. 
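For reference, a small sketch of the `net_util` tap API touched here (interface names are placeholders; signatures are taken from this diff). Creating taps needs CAP_NET_ADMIN, which is why the tap tests below are gated behind the dedicated test environment:

```rust
use net_util::Tap;

fn create_taps() -> Result<(), Box<dyn std::error::Error>> {
    // Kernel-assigned name following the default "vmtap%d" scheme.
    let auto = Tap::new(2)?;
    // Explicitly named tap, as used when a tap name is given on the command line.
    let named = Tap::open_named("chvtap0", 2, None)?;
    println!("created {} and {}", auto.if_name_as_str(), named.if_name_as_str());
    Ok(())
}
```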
- None => Tap::new(num_rx_q).map_err(Error::TapOpen)?, + None => { + let tap = Tap::new(num_rx_q).map_err(Error::TapOpen)?; + log::info!( + "Created tap device: name={}, num_rx_q={num_rx_q}", + tap.if_name_as_str() + ); + tap + } }; // Don't overwrite ip configuration of existing interfaces: if !tap_exists { diff --git a/net_util/src/tap.rs b/net_util/src/tap.rs index 2916e66cf6..222ca115d0 100644 --- a/net_util/src/tap.rs +++ b/net_util/src/tap.rs @@ -65,6 +65,16 @@ pub struct Tap { if_name: Vec, } +impl Drop for Tap { + fn drop(&mut self) { + debug!( + "Dropping Tap: if_name={}, FD={}", + self.if_name_as_str(), + self.tap_file.as_raw_fd() + ); + } +} + impl PartialEq for Tap { fn eq(&self, other: &Tap) -> bool { self.if_name == other.if_name @@ -129,6 +139,9 @@ fn ipv6_mask_to_prefix(mask: Ipv6Addr) -> Result { } impl Tap { + /// The default naming scheme for Tap devices that are created by Cloud Hypervisor. + pub const DEFAULT_NAME_SCHEME: &'static str = "vmtap%d"; + /// # Safety /// The caller should ensure to pass a valid file descriptor and valid /// arguments for the `ioctl()` syscall. @@ -183,6 +196,7 @@ impl Tap { if fd < 0 { return Err(Error::OpenTun(IoError::last_os_error())); } + debug!("Opening Tap device with given name: ifname={if_name}, fd={fd}"); // SAFETY: We just checked that the fd is valid. let tuntap = unsafe { File::from_raw_fd(fd) }; @@ -236,7 +250,7 @@ impl Tap { /// Create a new tap interface. pub fn new(num_queue_pairs: usize) -> Result { - Self::open_named("vmtap%d", num_queue_pairs, None) + Self::open_named(Self::DEFAULT_NAME_SCHEME, num_queue_pairs, None) } pub fn from_tap_fd(fd: RawFd, num_queue_pairs: usize) -> Result { @@ -549,6 +563,7 @@ impl AsRawFd for Tap { } #[cfg(test)] +#[cfg(devcli_testenv)] // we need special permissions in the ENV to create Tap devices mod tests { use std::net::Ipv4Addr; use std::sync::{LazyLock, Mutex, mpsc}; diff --git a/option_parser/Cargo.toml b/option_parser/Cargo.toml index abacf51ddd..3d76690b41 100644 --- a/option_parser/Cargo.toml +++ b/option_parser/Cargo.toml @@ -6,3 +6,6 @@ version = "0.1.0" [dependencies] thiserror = { workspace = true } + +[lints] +workspace = true diff --git a/option_parser/src/lib.rs b/option_parser/src/lib.rs index d28c2db5ed..6da24afa87 100644 --- a/option_parser/src/lib.rs +++ b/option_parser/src/lib.rs @@ -46,6 +46,8 @@ pub enum OptionParserError { Conversion(String /* field */, String /* value */), #[error("invalid value: {0}")] InvalidValue(String), + #[error("failed to convert {1}")] + NumberConversion(#[source] ParseIntError, String), } type OptionParserResult = std::result::Result; @@ -167,6 +169,42 @@ impl OptionParser { .is_some() } + /// Parses the `addr` option of PCI devices and returns the PCI device as well as the function ID + /// + /// Returns a tuple consisting of the parsed IDs for device and function in this order. Returns an error if the + /// supplied `addr` values cannot be parsed to [`u8`]. The tuple might consist of two times [`None`] if `addr` was + /// not provided. + pub fn get_pci_device_function( + &self, + ) -> OptionParserResult<(Option, Option)> { + if let Some(addr_str) = self.get("addr") { + let (device_str, function_str) = addr_str + .split_once('.') + .ok_or(OptionParserError::InvalidValue(addr_str.to_owned()))?; + + // We also accept hex number with `0x` prefix, but need to strip it before conversion in case it's present. 
+ let device_str = device_str.strip_prefix("0x").unwrap_or(device_str); + let device_id = u8::from_str_radix(device_str, 16) + .map_err(|e| OptionParserError::NumberConversion(e, addr_str.to_owned()))?; + + let function_str = function_str.strip_prefix("0x").unwrap_or(function_str); + let function_id = u8::from_str_radix(function_str, 16) + .map_err(|e| OptionParserError::NumberConversion(e, addr_str.to_owned()))?; + + // Currently CHV only support single-function devices. Those are mapped to function ID 0 in all cases, so we + // disallow the assignment of any other function ID. + if function_id != 0 { + return Err(OptionParserError::InvalidValue(format!( + "multi-function devices currently not supported; expected 0 got {}", + function_id + ))); + } + Ok((Some(device_id), Some(function_id))) + } else { + Ok((None, None)) + } + } + pub fn convert(&self, option: &str) -> OptionParserResult> { match self.options.get(option).and_then(|v| v.value.as_ref()) { None => Ok(None), @@ -454,7 +492,8 @@ mod tests { .add("hotplug_method") .add("hotplug_size") .add("topology") - .add("cmdline"); + .add("cmdline") + .add("addr"); assert_eq!(split_commas("\"\"").unwrap(), vec!["\"\""]); parser.parse("size=128M,hanging_param").unwrap_err(); @@ -506,6 +545,22 @@ mod tests { ); parser.parse("cmdline=\"").unwrap_err(); parser.parse("cmdline=\"\"\"").unwrap_err(); + + parser.parse("addr=0A.0").unwrap(); + assert_eq!( + (Some(0xa_u8), Some(0)), + parser.get_pci_device_function().expect("should be valid") + ); + parser.parse("addr=0A.1").unwrap(); + assert!(matches!( + parser.get_pci_device_function(), + Err(OptionParserError::InvalidValue(_)) + )); + parser.parse("addr=1g.0").unwrap(); + assert!(matches!( + parser.get_pci_device_function(), + Err(OptionParserError::NumberConversion(_, _)) + )); } #[test] diff --git a/pci/Cargo.toml b/pci/Cargo.toml index e1d631c348..760baae03d 100644 --- a/pci/Cargo.toml +++ b/pci/Cargo.toml @@ -29,3 +29,6 @@ vm-memory = { workspace = true, features = [ ] } vm-migration = { path = "../vm-migration" } vmm-sys-util = { workspace = true } + +[lints] +workspace = true diff --git a/pci/src/bus.rs b/pci/src/bus.rs index fd19321de5..5b10788ede 100644 --- a/pci/src/bus.rs +++ b/pci/src/bus.rs @@ -45,7 +45,7 @@ pub enum PciRootError { #[error("Invalid PCI device identifier provided")] InvalidPciDeviceSlot(usize), /// Valid PCI device identifier but already used. - #[error("Valid PCI device identifier but already used")] + #[error("Valid PCI device identifier but already used: {0}")] AlreadyInUsePciDeviceSlot(usize), } pub type Result = std::result::Result; @@ -166,15 +166,42 @@ impl PciBus { Ok(()) } - pub fn next_device_id(&mut self) -> Result { - for (idx, device_id) in self.device_ids.iter_mut().enumerate() { - if !(*device_id) { - *device_id = true; - return Ok(idx as u32); + /// Allocates a PCI device ID on the bus. + /// + /// - `id`: ID to allocate on the bus. If [`None`], the next free + /// device ID on the bus is allocated, else the ID given is + /// allocated + /// + /// ## Errors + /// * Returns [`PciRootError::AlreadyInUsePciDeviceSlot`] in case + /// the ID requested is already allocated. + /// * Returns [`PciRootError::InvalidPciDeviceSlot`] in case the + /// requested ID exceeds the maximum number of devices allowed per + /// bus (see [`NUM_DEVICE_IDS`]). + /// * If `id` is [`None`]: Returns + /// [`PciRootError::NoPciDeviceSlotAvailable`] if no free device + /// slot is available on the bus. 
+ pub fn allocate_device_id(&mut self, id: Option) -> Result { + if let Some(id) = id { + if (id as usize) < NUM_DEVICE_IDS { + if !self.device_ids[id as usize] { + self.device_ids[id as usize] = true; + Ok(id as u32) + } else { + Err(PciRootError::AlreadyInUsePciDeviceSlot(id as usize)) + } + } else { + Err(PciRootError::InvalidPciDeviceSlot(id as usize)) + } + } else { + for (idx, device_id) in self.device_ids.iter_mut().enumerate() { + if !(*device_id) { + *device_id = true; + return Ok(idx as u32); + } } + Err(PciRootError::NoPciDeviceSlotAvailable) } - - Err(PciRootError::NoPciDeviceSlotAvailable) } pub fn get_device_id(&mut self, id: usize) -> Result<()> { @@ -484,3 +511,110 @@ fn parse_io_config_address(config_address: u32) -> (usize, usize, usize, usize) shift_and_mask(config_address, REGISTER_NUMBER_OFFSET, REGISTER_NUMBER_MASK), ) } + +#[cfg(test)] +mod unit_tests { + use std::error::Error; + use std::result::Result; + + use super::*; + + #[derive(Debug)] + struct MocRelocDevice; + + impl DeviceRelocation for MocRelocDevice { + fn move_bar( + &self, + _old_base: u64, + _new_base: u64, + _len: u64, + _pci_dev: &mut dyn PciDevice, + _region_type: PciBarRegionType, + ) -> Result<(), std::io::Error> { + Ok(()) + } + } + + fn setup_bus() -> PciBus { + let pci_root = PciRoot::new(None); + let moc_device_reloc = Arc::new(MocRelocDevice {}); + PciBus::new(pci_root, moc_device_reloc) + } + + #[test] + // Test to acquire all IDs that can be acquired + fn allocate_device_id_next_free() { + // The first address is occupied by the root + let mut bus = setup_bus(); + for expected_id in 1..NUM_DEVICE_IDS { + assert_eq!(expected_id as u32, bus.allocate_device_id(None).unwrap()); + } + } + + #[test] + // Test that requesting specific ID work + fn allocate_device_id_request_id() -> Result<(), Box> { + // The first address is occupied by the root + let mut bus = setup_bus(); + let max_id = (NUM_DEVICE_IDS - 1).try_into()?; + assert_eq!(0x01_u32, bus.allocate_device_id(Some(0x01))?); + assert_eq!(0x10_u32, bus.allocate_device_id(Some(0x10))?); + assert_eq!(max_id as u32, bus.allocate_device_id(Some(max_id))?); + Ok(()) + } + + #[test] + // Test that gaps resulting from explicit allocations are filled by implicit ones, + // beginning with the first free slot + fn allocate_device_id_fills_gaps() -> Result<(), Box> { + // The first address is occupied by the root + let mut bus = setup_bus(); + assert_eq!(0x01_u32, bus.allocate_device_id(Some(0x01))?); + assert_eq!(0x03_u32, bus.allocate_device_id(Some(0x03))?); + assert_eq!(0x06_u32, bus.allocate_device_id(Some(0x06))?); + assert_eq!(0x02_u32, bus.allocate_device_id(None)?); + assert_eq!(0x04_u32, bus.allocate_device_id(None)?); + assert_eq!(0x05_u32, bus.allocate_device_id(None)?); + assert_eq!(0x07_u32, bus.allocate_device_id(None)?); + Ok(()) + } + + #[test] + // Test that requesting the same ID twice fails + fn allocate_device_id_request_id_twice_fails() -> Result<(), Box> { + let mut bus = setup_bus(); + let max_id = (NUM_DEVICE_IDS - 1).try_into()?; + bus.allocate_device_id(Some(max_id))?; + let _result = bus.allocate_device_id(Some(max_id)); + assert!(matches!( + PciRootError::AlreadyInUsePciDeviceSlot(max_id.into()), + _result + )); + Ok(()) + } + + #[test] + // Test to request an invalid ID + fn allocate_device_id_request_invalid_id_fails() -> Result<(), Box> { + let mut bus = setup_bus(); + let max_id = (NUM_DEVICE_IDS + 1).try_into()?; + let _result = bus.allocate_device_id(Some(max_id)); + assert!(matches!( + 
PciRootError::InvalidPciDeviceSlot(max_id.into()), + _result + )); + Ok(()) + } + + #[test] + // Test to acquire an ID when all IDs were already acquired + fn allocate_device_id_none_left() { + // The first address is occupied by the root + let mut bus = setup_bus(); + for expected_id in 1..NUM_DEVICE_IDS { + assert_eq!(expected_id as u32, bus.allocate_device_id(None).unwrap()); + } + let _result = bus.allocate_device_id(None); + assert!(matches!(PciRootError::NoPciDeviceSlotAvailable, _result)); + } +} diff --git a/performance-metrics/Cargo.toml b/performance-metrics/Cargo.toml index 00e6e74682..472f1159b3 100644 --- a/performance-metrics/Cargo.toml +++ b/performance-metrics/Cargo.toml @@ -1,6 +1,5 @@ [package] authors = ["The Cloud Hypervisor Authors"] -build = "../build.rs" edition.workspace = true name = "performance-metrics" version = "0.1.0" @@ -12,3 +11,6 @@ serde = { workspace = true, features = ["derive", "rc"] } serde_json = { workspace = true } test_infra = { path = "../test_infra" } thiserror = { workspace = true } + +[lints] +workspace = true diff --git a/rate_limiter/Cargo.toml b/rate_limiter/Cargo.toml index 3067c695bb..206ec7b7f8 100644 --- a/rate_limiter/Cargo.toml +++ b/rate_limiter/Cargo.toml @@ -9,3 +9,6 @@ libc = { workspace = true } log = { workspace = true } thiserror = { workspace = true } vmm-sys-util = { workspace = true } + +[lints] +workspace = true diff --git a/scripts/dev_cli.sh b/scripts/dev_cli.sh index 92ded5d017..6cd69ec358 100755 --- a/scripts/dev_cli.sh +++ b/scripts/dev_cli.sh @@ -47,6 +47,9 @@ CARGO_GIT_REGISTRY_DIR="${CLH_BUILD_DIR}/cargo_git_registry" # Full path to the cargo target dir on the host. CARGO_TARGET_DIR="${CLH_BUILD_DIR}/cargo_target" +# Let tests know that the special environment is set up. +RUSTFLAGS="${RUSTFLAGS} --cfg devcli_testenv" + # Send a decorated message to stdout, followed by a new line # say() { diff --git a/scripts/gitlint/rules/BodyMaxLineLengthEx.py b/scripts/gitlint/rules/BodyMaxLineLengthEx.py index 88314fc529..60b555d83a 100644 --- a/scripts/gitlint/rules/BodyMaxLineLengthEx.py +++ b/scripts/gitlint/rules/BodyMaxLineLengthEx.py @@ -1,11 +1,52 @@ # SPDX-License-Identifier: Apache-2.0 from gitlint.rules import LineRule, RuleViolation, CommitMessageBody +from typing import List, Optional import re +IGNORE_PREFIXES = [ + # Please sort alphabetically + " ", + "Acked-by: ", + "Co-authored-by: ", + "Co-developed-by: ", + "Debugged-by: ", + "Diagnosed-by: ", + "Explained-by: ", + "Fixed-by: ", + "Fixes: ", + "Helped-by: ", + "Inspired-by: ", + "On-behalf-of: ", + "Originally-by: ", + "Reported-by: ", + "Reviewed-and-tested-by: ", + "Reviewed-by: ", + "Signed-off-by: ", + "Suggested-by: ", + "Tested-by: ", + "Triggered-by: ", + "\t", +] + +# Pattern allowing: +# - [0]: https://example.com +# - [0] https://example.com +# - https://example.com +LINK_REGEX = re.compile(r"^(([\[0-9]+]:?\s?)?https?://).*$") + +MAX_LEN = 72 + + class BodyMaxLineLengthEx(LineRule): - """A rule to enforce a line limit of 72 characters, except for valid cases.""" + """ + A rule to enforce a line limit of 72 characters, except for valid cases: + + - Markdown-style code blocks + - Commit tags, such as Signed-off-by + - Links + """ # A rule MUST have a human friendly name name = "body-max-line-length-ex" @@ -17,33 +58,34 @@ class BodyMaxLineLengthEx(LineRule): # A line-rule MUST have a target (not required for CommitRules). target = CommitMessageBody - max_len = 72 - # Updated property as the commit messages is validated line by line. 
inside_open_codeblock = False - def validate(self, line, commit): - # Pattern allowing: - # - [0]: https://foobar - # - [0] https://foobar - # - https://foobar - link_regex = re.compile(r"^((\[[0-9]+\]:?\s?)?https?:\/\/).*$") - + def validate(self, line, commit) -> Optional[List[RuleViolation]]: + # We keep track of whether we are in an open code block. is_codeblock_marker = line.startswith("```") - inside_open_codeblock_ = self.inside_open_codeblock if is_codeblock_marker: self.inside_open_codeblock = not self.inside_open_codeblock - if len(line) > self.max_len: - is_link = link_regex.match(line) + # Begin checks + if len(line) <= MAX_LEN: + return None + + if inside_open_codeblock_: + return None - if inside_open_codeblock_: - return + if None is not LINK_REGEX.match(line): + return None - if is_link: - return + # Don't check lines with allowed prefixes + for prefix in IGNORE_PREFIXES: + if line.startswith(prefix): + return None - return [ - RuleViolation(self.id, f"Line '{line}' exceeds limit of {self.max_len}") - ] + return [ + RuleViolation( + self.id, + f"Line '{line}' exceeds limit of {MAX_LEN}: {len(line)}", + ) + ] diff --git a/scripts/gitlint/rules/on-behalf-of-marker.py b/scripts/gitlint/rules/on-behalf-of-marker.py new file mode 100644 index 0000000000..d08e334b17 --- /dev/null +++ b/scripts/gitlint/rules/on-behalf-of-marker.py @@ -0,0 +1,36 @@ +from gitlint.rules import LineRule, RuleViolation, CommitMessageTitle, CommitRule + +class BodyContainsOnBehalfOfSAPMarker(CommitRule): + """Enforce that each commit coming from an SAP contractor contains an + "On-behalf-of SAP user@sap.com" marker. + """ + + # A rule MUST have a human friendly name + name = "body-requires-on-behalf-of-sap" + + # A rule MUST have a *unique* id + # We recommend starting with UC (for User-defined Commit-rule). + id = "UC-sap" + + # Lower-case list of contractors + contractors = [ + "@cyberus-technology.de" + ] + + # Marker followed by " name.surname@sap.com" + marker = "On-behalf-of: SAP" + + def validate(self, commit): + if "@sap.com" in commit.author_email.lower(): + return + + # Allow third-party open-source contributions + if not any(contractor in commit.author_email.lower() for contractor in self.contractors): + return + + for line in commit.message.body: + if line.startswith(self.marker) and "@sap.com" in line.lower(): + return + + msg = f"Body does not contain a '{self.marker} user@sap.com' line" + return [RuleViolation(self.id, msg, line_nr=1)] diff --git a/scripts/run_integration_tests_aarch64.sh b/scripts/run_integration_tests_aarch64.sh index 483c9e8d37..bc1312f58e 100755 --- a/scripts/run_integration_tests_aarch64.sh +++ b/scripts/run_integration_tests_aarch64.sh @@ -191,7 +191,9 @@ if [ $RES -ne 0 ]; then exit 1 fi +# Common configuration for every test run export RUST_BACKTRACE=1 +export RUSTFLAGS="$RUSTFLAGS" cargo build --features mshv --all --release --target "$BUILD_TARGET" diff --git a/scripts/run_integration_tests_live_migration.sh b/scripts/run_integration_tests_live_migration.sh index fa0b3dcf45..f191b0baa1 100755 --- a/scripts/run_integration_tests_live_migration.sh +++ b/scripts/run_integration_tests_live_migration.sh @@ -83,7 +83,10 @@ PAGE_NUM=$((12288 * 1024 / HUGEPAGESIZE)) echo "$PAGE_NUM" | sudo tee /proc/sys/vm/nr_hugepages sudo chmod a+rwX /dev/hugepages +# Common configuration for every test run export RUST_BACKTRACE=1 +export RUSTFLAGS="$RUSTFLAGS" + time cargo test $test_features "live_migration_parallel::$test_filter" -- ${test_binary_args[*]} RES=$? 
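The `--cfg devcli_testenv` flag exported via `RUSTFLAGS` in `scripts/dev_cli.sh` and re-exported by the integration-test scripts above pairs with the new `#[cfg(devcli_testenv)]` gate on the Tap test module. A minimal sketch of how such a gate behaves (not part of the patch; the module name is hypothetical):

```rust
// Sketch only: compiled and run when RUSTFLAGS contains `--cfg devcli_testenv`
// (as exported by scripts/dev_cli.sh), skipped entirely in a plain `cargo test`.
#[cfg(test)]
#[cfg(devcli_testenv)] // needs the privileged dev-CLI container environment
mod devcli_only_tests {
    #[test]
    fn runs_only_inside_dev_cli() {
        // Tap device creation requires CAP_NET_ADMIN, which the dev-CLI
        // container grants; outside of it this whole module is not built.
    }
}
```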
diff --git a/scripts/run_integration_tests_rate_limiter.sh b/scripts/run_integration_tests_rate_limiter.sh index fd9f689a3f..f750e1aa63 100755 --- a/scripts/run_integration_tests_rate_limiter.sh +++ b/scripts/run_integration_tests_rate_limiter.sh @@ -55,7 +55,10 @@ fi cargo build --features mshv --all --release --target "$BUILD_TARGET" +# Common configuration for every test run export RUST_BACKTRACE=1 +export RUSTFLAGS="$RUSTFLAGS" + time cargo test $test_features "rate_limiter::$test_filter" -- --test-threads=1 ${test_binary_args[*]} RES=$? diff --git a/scripts/run_integration_tests_vfio.sh b/scripts/run_integration_tests_vfio.sh index 4d7bac60a4..b182c6612c 100755 --- a/scripts/run_integration_tests_vfio.sh +++ b/scripts/run_integration_tests_vfio.sh @@ -26,7 +26,10 @@ fi cargo build --features mshv --all --release --target "$BUILD_TARGET" +# Common configuration for every test run export RUST_BACKTRACE=1 +export RUSTFLAGS="$RUSTFLAGS" + time cargo test "vfio::test_nvidia" -- --test-threads=1 ${test_binary_args[*]} RES=$? diff --git a/scripts/run_integration_tests_windows_aarch64.sh b/scripts/run_integration_tests_windows_aarch64.sh index 92d66f805d..0190aa1c6c 100755 --- a/scripts/run_integration_tests_windows_aarch64.sh +++ b/scripts/run_integration_tests_windows_aarch64.sh @@ -36,7 +36,9 @@ dmsetup mknodes dmsetup create windows-snapshot-base --table "0 $img_blk_size snapshot-origin /dev/mapper/windows-base" dmsetup mknodes +# Common configuration for every test run export RUST_BACKTRACE=1 +export RUSTFLAGS="$RUSTFLAGS" cargo build --all --release --target "$BUILD_TARGET" diff --git a/scripts/run_integration_tests_windows_x86_64.sh b/scripts/run_integration_tests_windows_x86_64.sh index 2b11a6e687..714f6e4788 100755 --- a/scripts/run_integration_tests_windows_x86_64.sh +++ b/scripts/run_integration_tests_windows_x86_64.sh @@ -41,7 +41,9 @@ dmsetup mknodes cargo build --features mshv --all --release --target "$BUILD_TARGET" +# Common configuration for every test run export RUST_BACKTRACE=1 +export RUSTFLAGS="$RUSTFLAGS" # Only run with 1 thread to avoid tests interfering with one another because # Windows has a static IP configured diff --git a/scripts/run_integration_tests_x86_64.sh b/scripts/run_integration_tests_x86_64.sh index 1d9c56be78..ca5ed6f836 100755 --- a/scripts/run_integration_tests_x86_64.sh +++ b/scripts/run_integration_tests_x86_64.sh @@ -177,14 +177,16 @@ ulimit -l unlimited # Set number of open descriptors high enough for VFIO tests to run ulimit -n 4096 +# Common configuration for every test run export RUST_BACKTRACE=1 +export RUSTFLAGS="$RUSTFLAGS" + time cargo test --release --target "$BUILD_TARGET" $test_features "common_parallel::$test_filter" -- ${test_binary_args[*]} --test-threads=$((($(nproc) * 3) / 4)) RES=$? # Run some tests in sequence since the result could be affected by other tests # running in parallel. if [ $RES -eq 0 ]; then - export RUST_BACKTRACE=1 time cargo test --release --target "$BUILD_TARGET" $test_features "common_sequential::$test_filter" -- --test-threads=1 ${test_binary_args[*]} RES=$? fi @@ -192,7 +194,6 @@ fi # Run tests on dbus_api if [ $RES -eq 0 ]; then cargo build --features "mshv,dbus_api" --all --release --target "$BUILD_TARGET" - export RUST_BACKTRACE=1 # integration tests now do not reply on build feature "dbus_api" time cargo test $test_features "dbus_api::$test_filter" -- ${test_binary_args[*]} RES=$? 
@@ -201,14 +202,12 @@ fi # Run tests on fw_cfg if [ $RES -eq 0 ]; then cargo build --features "mshv,fw_cfg" --all --release --target "$BUILD_TARGET" - export RUST_BACKTRACE=1 time cargo test $test_features "fw_cfg::$test_filter" --target "$BUILD_TARGET" -- ${test_binary_args[*]} RES=$? fi if [ $RES -eq 0 ]; then cargo build --features "mshv,ivshmem" --all --release --target "$BUILD_TARGET" - export RUST_BACKTRACE=1 time cargo test $test_features "ivshmem::$test_filter" --target "$BUILD_TARGET" -- ${test_binary_args[*]} RES=$? fi diff --git a/serial_buffer/Cargo.toml b/serial_buffer/Cargo.toml index 0691b8a3b7..767c8a97ff 100644 --- a/serial_buffer/Cargo.toml +++ b/serial_buffer/Cargo.toml @@ -3,3 +3,6 @@ authors = ["The Cloud Hypervisor Authors"] edition.workspace = true name = "serial_buffer" version = "0.1.0" + +[lints] +workspace = true diff --git a/test_infra/Cargo.toml b/test_infra/Cargo.toml index 8fdefed15a..e6ea592c39 100644 --- a/test_infra/Cargo.toml +++ b/test_infra/Cargo.toml @@ -13,3 +13,6 @@ ssh2 = { version = "0.9.5", features = ["vendored-openssl"] } thiserror = { workspace = true } vmm-sys-util = { workspace = true } wait-timeout = { workspace = true } + +[lints] +workspace = true diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index 24a818c194..03e2656737 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -11,7 +11,7 @@ use std::io::{Read, Write}; use std::net::{TcpListener, TcpStream}; use std::os::unix::fs::PermissionsExt; use std::os::unix::io::{AsRawFd, FromRawFd}; -use std::path::Path; +use std::path::{Path, PathBuf}; use std::process::{Child, Command, ExitStatus, Output, Stdio}; use std::str::FromStr; use std::sync::{LazyLock, Mutex}; @@ -249,6 +249,39 @@ impl Drop for WindowsDiskConfig { } } +/// Returns the workspace root directory. +/// +/// As we don't have packages in the workspace root, +/// we walk up until we found the main Cargo.toml file. +fn workspace_root() -> PathBuf { + // The directory of the current crate (integration test). + let mut dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + + // Currently we have one level of nesting and we probably never change it. 
+ let max_levels = 2; + + eprintln!( + "Looking for workspace root: starting with dir={}", + dir.to_str().unwrap() + ); + + // walk up + for _ in 0..max_levels { + dir = dir.parent().unwrap().to_path_buf(); + eprintln!("Checking parent dir: {}", dir.to_str().unwrap()); + let maybe_manifest_file = dir.join("Cargo.toml"); + if maybe_manifest_file.exists() { + let content = fs::read_to_string(&maybe_manifest_file).unwrap(); + if content.contains("[workspace]") && content.contains("Cloud Hypervisor Workspace") { + eprintln!("INFO: Found workspace root: {}", dir.to_str().unwrap()); + return dir; + } + } + } + + panic!("Could not find workspace root"); +} + impl DiskConfig for UbuntuDiskConfig { fn prepare_cloudinit(&self, tmp_dir: &TempDir, network: &GuestNetworkConfig) -> String { let cloudinit_file_path = @@ -259,15 +292,16 @@ impl DiskConfig for UbuntuDiskConfig { fs::create_dir_all(&cloud_init_directory) .expect("Expect creating cloud-init directory to succeed"); - let source_file_dir = std::env::current_dir() - .unwrap() + let source_file_dir = workspace_root() .join("test_data") .join("cloud-init") .join("ubuntu") .join("ci"); ["meta-data"].iter().for_each(|x| { - rate_limited_copy(source_file_dir.join(x), cloud_init_directory.join(x)) + let source_file = source_file_dir.join(x); + let cloud_init = cloud_init_directory.join(x); + rate_limited_copy(source_file, cloud_init) .expect("Expect copying cloud-init meta-data to succeed"); }); @@ -1402,11 +1436,19 @@ impl<'a> GuestCommand<'a> { } } +/// Returns the absolute path into the workspaces target directory to locate the desired +/// executable. +/// +/// # Arguments +/// - `cmd`: workspace binary, e.g. `ch-remote` or `cloud-hypervisor` pub fn clh_command(cmd: &str) -> String { - env::var("BUILD_TARGET").map_or( - format!("target/x86_64-unknown-linux-gnu/release/{cmd}"), - |target| format!("target/{target}/release/{cmd}"), - ) + let workspace_root = workspace_root(); + let rustc_target = env::var("BUILD_TARGET").unwrap_or("x86_64-unknown-linux-gnu".to_string()); + let target_artifact_dir = format!("target/{rustc_target}/release"); + let target_cmd_path = format!("{target_artifact_dir}/{cmd}"); + + let full_path = workspace_root.join(&target_cmd_path); + String::from(full_path.to_str().unwrap()) } pub fn parse_iperf3_output(output: &[u8], sender: bool, bandwidth: bool) -> Result { diff --git a/tests/readme.md b/tests/readme.md new file mode 100644 index 0000000000..9bff6a994a --- /dev/null +++ b/tests/readme.md @@ -0,0 +1 @@ +The integration tests have been moved to `./cloud-hypervisor/tests`. 
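As a quick illustration of the reworked path resolution in `test_infra` (a sketch, not part of the patch): with `BUILD_TARGET` unset, `clh_command` now anchors the artifact path at the detected workspace root instead of the current working directory.

```rust
// Sketch only, assuming BUILD_TARGET is unset in the environment:
// clh_command() falls back to the x86_64-unknown-linux-gnu triple and joins it
// onto the directory returned by workspace_root().
let ch = clh_command("cloud-hypervisor");
assert!(ch.ends_with("target/x86_64-unknown-linux-gnu/release/cloud-hypervisor"));
// Unlike before, the result no longer depends on the directory the test
// binary was started from.
```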
diff --git a/tpm/Cargo.toml b/tpm/Cargo.toml index cf03968cde..82dc8f79be 100644 --- a/tpm/Cargo.toml +++ b/tpm/Cargo.toml @@ -12,3 +12,6 @@ log = { workspace = true } net_gen = { path = "../net_gen" } thiserror = { workspace = true } vmm-sys-util = { workspace = true } + +[lints] +workspace = true diff --git a/tracer/Cargo.toml b/tracer/Cargo.toml index bdcf559695..1ac9f4e393 100644 --- a/tracer/Cargo.toml +++ b/tracer/Cargo.toml @@ -12,3 +12,6 @@ serde_json = { workspace = true } [features] tracing = [] + +[lints] +workspace = true diff --git a/vhost_user_block/Cargo.toml b/vhost_user_block/Cargo.toml index c2e7385668..051210cb43 100644 --- a/vhost_user_block/Cargo.toml +++ b/vhost_user_block/Cargo.toml @@ -1,6 +1,5 @@ [package] authors = ["The Cloud Hypervisor Authors"] -build = "../build.rs" edition.workspace = true name = "vhost_user_block" version = "0.1.0" @@ -19,3 +18,6 @@ virtio-bindings = { workspace = true } virtio-queue = { workspace = true } vm-memory = { workspace = true } vmm-sys-util = { workspace = true } + +[lints] +workspace = true diff --git a/vhost_user_net/Cargo.toml b/vhost_user_net/Cargo.toml index 6cd316e9fe..2edf00db95 100644 --- a/vhost_user_net/Cargo.toml +++ b/vhost_user_net/Cargo.toml @@ -1,6 +1,5 @@ [package] authors = ["The Cloud Hypervisor Authors"] -build = "../build.rs" edition.workspace = true name = "vhost_user_net" version = "0.1.0" @@ -19,3 +18,6 @@ vhost-user-backend = { workspace = true } virtio-bindings = { workspace = true } vm-memory = { workspace = true } vmm-sys-util = { workspace = true } + +[lints] +workspace = true diff --git a/virtio-devices/Cargo.toml b/virtio-devices/Cargo.toml index 64a60910ca..5cbfe145f4 100644 --- a/virtio-devices/Cargo.toml +++ b/virtio-devices/Cargo.toml @@ -47,3 +47,6 @@ vm-memory = { workspace = true, features = [ vm-migration = { path = "../vm-migration" } vm-virtio = { path = "../vm-virtio" } vmm-sys-util = { workspace = true } + +[lints] +workspace = true diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index ff8adf3b4e..5c0ed6f411 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -18,8 +18,8 @@ use std::sync::{Arc, Barrier}; use std::{io, result}; use anyhow::anyhow; -use block::async_io::{AsyncIo, AsyncIoError, DiskFile}; -use block::fcntl::{LockError, LockType, get_lock_state}; +use block::async_io::{AsyncIo, AsyncIoError, DiskFile, DiskFileError}; +use block::fcntl::{LockError, LockGranularity, LockType, get_lock_state}; use block::{ ExecuteAsync, ExecuteError, Request, RequestType, VirtioBlockConfig, build_serial, fcntl, }; @@ -93,6 +93,14 @@ pub enum Error { /// The path of the disk image. 
path: PathBuf, }, + #[error("disk image size is not a multiple of {}", SECTOR_SIZE)] + InvalidSize, + #[error("Failed to pause/resume vcpus")] + FailedPauseResume(#[source] MigratableError), + #[error("Failed signal config interrupt")] + FailedSignalingConfigChange(#[source] io::Error), + #[error("Disk resize failed")] + FailedDiskResize(#[source] DiskFileError), } pub type Result = result::Result; @@ -135,7 +143,7 @@ struct BlockEpollHandler { queue: Queue, mem: GuestMemoryAtomic, disk_image: Box, - disk_nsectors: u64, + disk_nsectors: Arc, interrupt_cb: Arc, serial: Vec, kill_evt: EventFd, @@ -230,7 +238,7 @@ impl BlockEpollHandler { let result = request.execute_async( desc_chain.memory(), - self.disk_nsectors, + self.disk_nsectors.load(Ordering::SeqCst), self.disk_image.as_mut(), &self.serial, desc_chain.head_index() as u64, @@ -621,7 +629,7 @@ pub struct Block { id: String, disk_image: Box, disk_path: PathBuf, - disk_nsectors: u64, + disk_nsectors: Arc, config: VirtioBlockConfig, writeback: Arc, counters: BlockCounters, @@ -751,7 +759,7 @@ impl Block { id, disk_image, disk_path, - disk_nsectors, + disk_nsectors: Arc::new(AtomicU64::new(disk_nsectors)), config, writeback: Arc::new(AtomicBool::new(true)), counters: BlockCounters::default(), @@ -767,20 +775,42 @@ impl Block { has_feature(self.features(), VIRTIO_BLK_F_RO.into()) } + /// Returns the granularity for the advisory lock for this disk. + // TODO In future, we could add a `lock_granularity=` configuration to the CLI. + // For now, we stick to QEMU behavior. + fn lock_granularity(&mut self) -> LockGranularity { + let fallback = LockGranularity::WholeFile; + + self.disk_image + .size() + .map(|size| LockGranularity::ByteRange(0, size)) + // use a safe fallback + .unwrap_or_else(|e| { + log::warn!( + "Can't get disk size for id={},path={}, falling back to {:?}: error: {e}", + self.id, + self.disk_path.display(), + fallback + ); + fallback + }) + } + /// Tries to set an advisory lock for the corresponding disk image. pub fn try_lock_image(&mut self) -> Result<()> { let lock_type = match self.read_only() { true => LockType::Read, false => LockType::Write, }; + let granularity = self.lock_granularity(); log::debug!( - "Attempting to acquire {lock_type:?} lock for disk image id={},path={}", + "Attempting to acquire {lock_type:?} lock for disk image: id={},path={},granularity={granularity:?}", self.id, self.disk_path.display() ); let fd = self.disk_image.fd(); - fcntl::try_acquire_lock(fd, lock_type).map_err(|error| { - let current_lock = get_lock_state(fd); + fcntl::try_acquire_lock(fd, lock_type, granularity).map_err(|error| { + let current_lock = get_lock_state(fd, granularity); // Don't propagate the error to the outside, as it is not useful at all. Instead, // we try to log additional help to the user. if let Ok(current_lock) = current_lock { @@ -804,10 +834,12 @@ impl Block { /// Releases the advisory lock held for the corresponding disk image. pub fn unlock_image(&mut self) -> Result<()> { + let granularity = self.lock_granularity(); + // It is very unlikely that this fails; // Should we remove the Result to simplify the error propagation on // higher levels? 
- fcntl::clear_lock(self.disk_image.fd()).map_err(|error| Error::LockDiskImage { + fcntl::clear_lock(self.disk_image.fd(), granularity).map_err(|error| Error::LockDiskImage { path: self.disk_path.clone(), error, lock_type: LockType::Unlock, @@ -817,7 +849,7 @@ impl Block { fn state(&self) -> BlockState { BlockState { disk_path: self.disk_path.to_str().unwrap().to_owned(), - disk_nsectors: self.disk_nsectors, + disk_nsectors: self.disk_nsectors.load(Ordering::SeqCst), avail_features: self.common.avail_features, acked_features: self.common.acked_features, config: self.config, @@ -844,6 +876,34 @@ impl Block { self.writeback.store(writeback, Ordering::Release); } + pub fn resize(&mut self, new_size: u64) -> Result<()> { + if !new_size.is_multiple_of(SECTOR_SIZE) { + return Err(Error::InvalidSize); + } + + self.disk_image + .resize(new_size) + .map_err(Error::FailedDiskResize)?; + + let nsectors = new_size / SECTOR_SIZE; + + self.common.pause().map_err(Error::FailedPauseResume)?; + + self.disk_nsectors.store(nsectors, Ordering::SeqCst); + self.config.capacity = nsectors; + self.state().disk_nsectors = nsectors; + + self.common.resume().map_err(Error::FailedPauseResume)?; + + if let Some(interrupt_cb) = self.common.interrupt_cb.as_ref() { + interrupt_cb + .trigger(VirtioInterruptType::Config) + .map_err(Error::FailedSignalingConfigChange) + } else { + Ok(()) + } + } + #[cfg(fuzzing)] pub fn wait_for_epoll_threads(&mut self) { self.common.wait_for_epoll_threads(); @@ -931,7 +991,7 @@ impl VirtioDevice for Block { error!("failed to create new AsyncIo: {e}"); ActivateError::BadActivate })?, - disk_nsectors: self.disk_nsectors, + disk_nsectors: self.disk_nsectors.clone(), interrupt_cb: interrupt_cb.clone(), serial: self.serial.clone(), kill_evt, diff --git a/virtio-devices/src/mem.rs b/virtio-devices/src/mem.rs index 6620a5edf4..3378498b1a 100644 --- a/virtio-devices/src/mem.rs +++ b/virtio-devices/src/mem.rs @@ -391,6 +391,8 @@ impl BlocksState { } } + // TODO We can avoid creating a new bitmap here, if we switch the code + // to use Vec to keep dirty bits and just pass it as is. MemoryRangeTable::from_bitmap(bitmap, start_addr, VIRTIO_MEM_DEFAULT_BLOCK_SIZE) } } diff --git a/virtio-devices/src/net.rs b/virtio-devices/src/net.rs index fd6a4a3b07..0322181d8a 100644 --- a/virtio-devices/src/net.rs +++ b/virtio-devices/src/net.rs @@ -255,9 +255,9 @@ impl NetEpollHandler { || !self.driver_awake { self.signal_used_queue(self.queue_index_base)?; - debug!("Signalling RX queue"); + trace!("Signalling RX queue"); } else { - debug!("Not signalling RX queue"); + trace!("Not signalling RX queue"); } Ok(()) } @@ -608,11 +608,12 @@ impl Net { for fd in fds.iter() { // Duplicate so that it can survive reboots // SAFETY: FFI call to dup. Trivially safe. - let fd = unsafe { libc::dup(*fd) }; - if fd < 0 { + let fd_duped = unsafe { libc::dup(*fd) }; + if fd_duped < 0 { return Err(Error::DuplicateTapFd(std::io::Error::last_os_error())); } - let tap = Tap::from_tap_fd(fd, num_queue_pairs).map_err(Error::TapError)?; + debug!("dup'ed fd {fd} => {fd_duped} for virtio-net device {id}"); + let tap = Tap::from_tap_fd(fd_duped, num_queue_pairs).map_err(Error::TapError)?; taps.push(tap); } @@ -656,6 +657,19 @@ impl Net { impl Drop for Net { fn drop(&mut self) { + // Get a comma-separated list of the interface names of the tap devices + // associated with this network device. 
+ let ifnames_str = self + .taps + .iter() + .map(|tap| tap.if_name_as_str()) + .collect::>(); + let ifnames_str = ifnames_str.join(","); + debug!( + "virtio-net device closed: id={}, ifnames=[{ifnames_str}]", + self.id + ); + if let Some(kill_evt) = self.common.kill_evt.take() { // Ignore the result because there is nothing we can do about it. let _ = kill_evt.write(1); diff --git a/virtio-devices/src/transport/pci_common_config.rs b/virtio-devices/src/transport/pci_common_config.rs index 0680b4a303..bf0ae55e01 100644 --- a/virtio-devices/src/transport/pci_common_config.rs +++ b/virtio-devices/src/transport/pci_common_config.rs @@ -234,7 +234,7 @@ impl VirtioPciCommonConfig { } fn read_common_config_word(&self, offset: u64, queues: &[Queue]) -> u16 { - debug!("read_common_config_word: offset 0x{offset:x}"); + trace!("read_common_config_word: offset 0x{offset:x}"); match offset { 0x10 => self.msix_config.load(Ordering::Acquire), 0x12 => queues.len() as u16, // num_queues diff --git a/virtio-devices/src/vhost_user/vu_common_ctrl.rs b/virtio-devices/src/vhost_user/vu_common_ctrl.rs index f1255c7f7a..b61b2aad3c 100644 --- a/virtio-devices/src/vhost_user/vu_common_ctrl.rs +++ b/virtio-devices/src/vhost_user/vu_common_ctrl.rs @@ -570,12 +570,16 @@ impl VhostUserHandle { // divide it by 8. let len = region.size() / 8; // SAFETY: region is of size len - let bitmap = unsafe { + let bitmap: &[u64] = unsafe { // Cast the pointer to u64 let ptr = region.as_ptr() as *const u64; - std::slice::from_raw_parts(ptr, len).to_vec() + std::slice::from_raw_parts(ptr, len) }; - Ok(MemoryRangeTable::from_bitmap(bitmap, 0, 4096)) + Ok(MemoryRangeTable::from_bitmap( + bitmap.iter().copied(), + 0, + 4096, + )) } else { Err(Error::MissingShmLogRegion) } diff --git a/virtio-devices/src/vsock/unix/muxer.rs b/virtio-devices/src/vsock/unix/muxer.rs index e2e9fe2e22..645cc847a9 100644 --- a/virtio-devices/src/vsock/unix/muxer.rs +++ b/virtio-devices/src/vsock/unix/muxer.rs @@ -869,6 +869,7 @@ impl VsockMuxer { #[cfg(test)] mod tests { use std::cmp::min; + use std::fs; use std::io::Write; use std::path::{Path, PathBuf}; @@ -919,6 +920,8 @@ mod tests { ) .unwrap(); let uds_path = format!("test_vsock_{name}.sock"); + // Clear in case it is still there from a previous run + let _ = fs::remove_file(&uds_path); let muxer = VsockMuxer::new(PEER_CID, uds_path).unwrap(); Self { @@ -1047,6 +1050,9 @@ mod tests { } impl LocalListener { fn new + Clone>(path: P) -> Self { + // Clear in case it is still there from a previous run + let _ = fs::remove_file(path.as_ref()); + let path_buf = path.as_ref().to_path_buf(); let sock = UnixListener::bind(path).unwrap(); sock.set_nonblocking(true).unwrap(); diff --git a/vm-allocator/Cargo.toml b/vm-allocator/Cargo.toml index e77e877917..a4996d6dc3 100644 --- a/vm-allocator/Cargo.toml +++ b/vm-allocator/Cargo.toml @@ -14,3 +14,6 @@ vm-memory = { workspace = true } [target.'cfg(any(target_arch = "aarch64", target_arch = "riscv64"))'.dependencies] arch = { path = "../arch" } + +[lints] +workspace = true diff --git a/vm-device/Cargo.toml b/vm-device/Cargo.toml index 80ed1489a3..a57ea57f5b 100644 --- a/vm-device/Cargo.toml +++ b/vm-device/Cargo.toml @@ -16,3 +16,6 @@ thiserror = { workspace = true } vfio-ioctls = { workspace = true, default-features = false } vm-memory = { workspace = true, features = ["backend-mmap"] } vmm-sys-util = { workspace = true } + +[lints] +workspace = true diff --git a/vm-migration/Cargo.toml b/vm-migration/Cargo.toml index 7a8c9337b3..2053afc472 100644 --- 
a/vm-migration/Cargo.toml +++ b/vm-migration/Cargo.toml @@ -6,7 +6,12 @@ version = "0.1.0" [dependencies] anyhow = { workspace = true } +itertools = { workspace = true } +rustls = { workspace = true } serde = { workspace = true, features = ["derive", "rc"] } serde_json = { workspace = true } thiserror = { workspace = true } vm-memory = { workspace = true, features = ["backend-atomic", "backend-mmap"] } + +[lints] +workspace = true diff --git a/vm-migration/src/bitpos_iterator.rs b/vm-migration/src/bitpos_iterator.rs new file mode 100644 index 0000000000..8d70c7ff6b --- /dev/null +++ b/vm-migration/src/bitpos_iterator.rs @@ -0,0 +1,88 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 + +/// An iterator that turns a sequence of u64s into a sequence of bit positions +/// that are set. +/// +/// This is useful to iterate over dirty memory bitmaps. +struct BitposIterator { + underlying_it: I, + + /// How many u64's we've already consumed. + word_pos: usize, + + /// If we already started working on a u64, it's here. Together with the bit + /// position where we have to continue. + current_word: Option<(u64, u32)>, +} + +impl Iterator for BitposIterator +where + I: Iterator, +{ + type Item = u64; + + fn next(&mut self) -> Option { + loop { + if self.current_word.is_none() { + self.current_word = self.underlying_it.next().map(|w| (w, 0)); + } + + let (word, word_bit) = self.current_word?; + + // Continue early if there is no chance to find something. + if word != 0 && word_bit < 64 { + let shifted_word = word >> word_bit; + if shifted_word != 0 { + let zeroes = shifted_word.trailing_zeros(); + + self.current_word = Some((word, zeroes + word_bit + 1)); + let next_bitpos = + u64::try_from(self.word_pos).unwrap() * 64 + u64::from(word_bit + zeroes); + + return Some(next_bitpos); + } + } + + self.current_word = None; + self.word_pos += 1; + } + } +} + +pub trait BitposIteratorExt: Iterator + Sized { + /// Turn an iterator over `u64` into an iterator over the bit positions of + /// all 1s. We basically treat the incoming `u64` as one gigantic integer + /// and just spit out which bits are set. + fn bit_positions(self) -> impl Iterator { + BitposIterator { + underlying_it: self, + word_pos: 0, + current_word: None, + } + } +} + +impl + Sized> BitposIteratorExt for I {} + +#[cfg(test)] +mod tests { + use super::*; + + fn bitpos_check(inp: &[u64], out: &[u64]) { + assert_eq!(inp.iter().copied().bit_positions().collect::>(), out); + } + + #[test] + fn bitpos_iterator_works() { + bitpos_check(&[], &[]); + bitpos_check(&[0], &[]); + bitpos_check(&[1], &[0]); + bitpos_check(&[5], &[0, 2]); + bitpos_check(&[3 + 32], &[0, 1, 5]); + bitpos_check(&[1 << 63], &[63]); + + bitpos_check(&[1, 1 + 32], &[0, 64, 69]); + } +} diff --git a/vm-migration/src/lib.rs b/vm-migration/src/lib.rs index 05bcc3131e..2fc7ffbdb7 100644 --- a/vm-migration/src/lib.rs +++ b/vm-migration/src/lib.rs @@ -9,7 +9,9 @@ use thiserror::Error; use crate::protocol::MemoryRangeTable; +mod bitpos_iterator; pub mod protocol; +pub mod tls; #[derive(Error, Debug)] pub enum MigratableError { @@ -49,8 +51,11 @@ pub enum MigratableError { #[error("Failed to complete migration for migratable component")] CompleteMigration(#[source] anyhow::Error), - #[error("Failed to release a disk lock before the migration")] + #[error("Failed to release a disk lock")] UnlockError(#[source] anyhow::Error), + + #[error("TLS error")] + Tls(#[from] tls::TlsError), } /// A Pausable component can be paused and resumed. 
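To illustrate the dirty-bitmap walk that `bit_positions()` provides, the same `trailing_zeros`-based scan can be written as a plain loop over a single `u64` word. This is a sketch only, with a hypothetical helper name, and is not part of the patch:

```rust
// Sketch only: per-word equivalent of what BitposIterator does.
// `word_index` is the position of this u64 within the bitmap.
fn set_bit_positions(mut word: u64, word_index: u64) -> Vec<u64> {
    let mut positions = Vec::new();
    while word != 0 {
        let bit = word.trailing_zeros() as u64; // index of the lowest set bit
        positions.push(word_index * 64 + bit);
        word &= word - 1; // clear that bit and continue
    }
    positions
}

// For word 5 (0b101) at index 0 this yields [0, 2], matching the
// `bitpos_check(&[5], &[0, 2])` case in the patch's tests.
```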
diff --git a/vm-migration/src/protocol.rs b/vm-migration/src/protocol.rs index 2ed782ae39..f9cb3e4188 100644 --- a/vm-migration/src/protocol.rs +++ b/vm-migration/src/protocol.rs @@ -5,10 +5,12 @@ use std::io::{Read, Write}; +use itertools::Itertools; use serde::{Deserialize, Serialize}; use vm_memory::ByteValued; use crate::MigratableError; +use crate::bitpos_iterator::BitposIteratorExt; // Migration protocol // 1: Source establishes communication with destination (file socket or TCP connection.) @@ -50,7 +52,7 @@ use crate::MigratableError; // The source can at any time send an "abandon request" to cancel #[repr(u16)] -#[derive(Copy, Clone, Default)] +#[derive(Debug, Copy, Clone, Default, PartialEq, Eq)] pub enum Command { #[default] Invalid, @@ -174,6 +176,10 @@ impl Response { self.status } + pub fn length(&self) -> u64 { + self.length + } + pub fn read_from(fd: &mut dyn Read) -> Result { let mut response = Response::default(); fd.read_exact(Self::as_mut_slice(&mut response)) @@ -205,44 +211,130 @@ impl Response { } #[repr(C)] -#[derive(Clone, Default, Serialize, Deserialize)] +#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)] pub struct MemoryRange { pub gpa: u64, pub length: u64, } -#[derive(Clone, Default, Serialize, Deserialize)] +impl MemoryRange { + /// Turn an iterator over the dirty bitmap into an iterator of dirty ranges. + pub fn dirty_ranges( + bitmap: impl IntoIterator, + start_addr: u64, + page_size: u64, + ) -> impl Iterator { + bitmap + .into_iter() + .bit_positions() + // Turn them into single-element ranges for coalesce. + .map(|b| b..(b + 1)) + // Merge adjacent ranges. + .coalesce(|prev, curr| { + if prev.end == curr.start { + Ok(prev.start..curr.end) + } else { + Err((prev, curr)) + } + }) + .map(move |r| Self { + gpa: start_addr + r.start * page_size, + length: (r.end - r.start) * page_size, + }) + } +} + +#[derive(Debug, Clone, Default, Serialize, Deserialize)] pub struct MemoryRangeTable { data: Vec, } -impl MemoryRangeTable { - pub fn from_bitmap(bitmap: Vec, start_addr: u64, page_size: u64) -> Self { - let mut table = MemoryRangeTable::default(); - let mut entry: Option = None; - for (i, block) in bitmap.iter().enumerate() { - for j in 0..64 { - let is_page_dirty = ((block >> j) & 1u64) != 0u64; - let page_offset = ((i * 64) + j) as u64 * page_size; - if is_page_dirty { - if let Some(entry) = &mut entry { - entry.length += page_size; - } else { - entry = Some(MemoryRange { - gpa: start_addr + page_offset, - length: page_size, - }); +#[derive(Debug, Clone, Default)] +struct MemoryRangeTableIterator { + chunk_size: u64, + data: Vec, +} + +impl MemoryRangeTableIterator { + pub fn new(table: &MemoryRangeTable, chunk_size: u64) -> Self { + MemoryRangeTableIterator { + chunk_size, + data: table.data.clone(), + } + } +} + +impl Iterator for MemoryRangeTableIterator { + type Item = MemoryRangeTable; + + /// Return the next memory range in the table, making sure that + /// the returned range is not larger than `chunk_size`. + /// + /// **Note**: Do not rely on the order of the ranges returned by this + /// iterator. This allows for a more efficient implementation. 
+ fn next(&mut self) -> Option { + let mut ranges: Vec = vec![]; + let mut ranges_size: u64 = 0; + + loop { + assert!(ranges_size <= self.chunk_size); + + if ranges_size == self.chunk_size || self.data.is_empty() { + break; + } + + if let Some(range) = self.data.pop() { + let next_range: MemoryRange = if ranges_size + range.length > self.chunk_size { + // How many bytes we need to put back into the table. + let leftover_bytes = ranges_size + range.length - self.chunk_size; + assert!(leftover_bytes <= range.length); + let returned_bytes = range.length - leftover_bytes; + assert!(returned_bytes <= range.length); + assert!(leftover_bytes + returned_bytes == range.length); + + self.data.push(MemoryRange { + gpa: range.gpa + returned_bytes, + length: leftover_bytes, + }); + MemoryRange { + gpa: range.gpa, + length: returned_bytes, } - } else if let Some(entry) = entry.take() { - table.push(entry); - } + } else { + range + }; + + ranges_size += next_range.length; + ranges.push(next_range); } } - if let Some(entry) = entry.take() { - table.push(entry); + + if ranges.is_empty() { + None + } else { + Some(MemoryRangeTable { data: ranges }) } + } +} - table +impl MemoryRangeTable { + pub fn ranges(&self) -> &[MemoryRange] { + &self.data + } + + /// Partitions the table into chunks of at most `chunk_size` bytes. + pub fn partition(&self, chunk_size: u64) -> impl Iterator { + MemoryRangeTableIterator::new(self, chunk_size) + } + + pub fn from_bitmap( + bitmap: impl IntoIterator, + start_addr: u64, + page_size: u64, + ) -> Self { + Self { + data: MemoryRange::dirty_ranges(bitmap, start_addr, page_size).collect(), + } } pub fn regions(&self) -> &[MemoryRange] { @@ -301,3 +393,62 @@ impl MemoryRangeTable { Self { data } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_memory_range_table() { + let mut table = MemoryRangeTable::default(); + // Test blocks that are shorter than the chunk size. + table.push(MemoryRange { + gpa: 0, + length: 1 << 10, + }); + // Test blocks that are longer than the chunk size. + table.push(MemoryRange { + gpa: 0x1000, + length: 3 << 20, + }); + // And add another blocks, so we get a chunk that spans two memory + // ranges. + table.push(MemoryRange { + gpa: 4 << 20, + length: 1 << 20, + }); + + let table = table; // drop mut + + let chunks = table + .partition(2 << 20) + .map(|table| table.data) + .collect::>(); + + // The implementation currently returns the ranges in reverse order. If + // this tests becomes more complex, we can compare everything as sets. 
+ assert_eq!( + chunks, + vec![ + vec![ + MemoryRange { + gpa: 4 << 20, + length: 1 << 20 + }, + MemoryRange { + gpa: 0x1000, + length: 1 << 20 + } + ], + vec![MemoryRange { + gpa: 0x1000 + (1 << 20), + length: 2 << 20 + },], + vec![MemoryRange { + gpa: 0, + length: 1 << 10 + }] + ] + ); + } +} diff --git a/vm-migration/src/tls.rs b/vm-migration/src/tls.rs new file mode 100644 index 0000000000..a44a76ebc8 --- /dev/null +++ b/vm-migration/src/tls.rs @@ -0,0 +1,261 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// +use std::io::{self, Read, Write}; +use std::net::TcpStream; +use std::os::fd::{AsFd, BorrowedFd}; +use std::path::Path; +use std::sync::Arc; + +use rustls::pki_types::pem::PemObject; +use rustls::pki_types::{CertificateDer, InvalidDnsNameError, PrivateKeyDer, ServerName}; +use rustls::{ + ClientConfig, ClientConnection, RootCertStore, ServerConfig, ServerConnection, StreamOwned, +}; +use thiserror::Error; +use vm_memory::bitmap::BitmapSlice; +use vm_memory::io::{ReadVolatile, WriteVolatile}; +use vm_memory::{VolatileMemoryError, VolatileSlice}; + +use crate::MigratableError; + +#[derive(Error, Debug)] +pub enum TlsError { + #[error( + "The provided input could not be parsed because it is not a syntactically-valid DNS Name." + )] + InvalidDnsName(#[source] InvalidDnsNameError), + + #[error("Rustls protocol error")] + RustlsError(#[from] rustls::Error), + + #[error("Rustls protocol IO error")] + RustlsIoError(#[from] std::io::Error), + + #[error("Error during TLS handshake: {0}")] + HandshakeError(String), +} + +// This TlsStream will be later encapsulated in a SocketStream. Thus it has to +// implement the same traits. It is important that we never directly read from +// or write to the TcpStream encapsulated in StreamOwned. +#[derive(Debug)] +pub enum TlsStream { + Client(StreamOwned), + Server(StreamOwned), +} + +// The TLS-Stream objects cannot read or write volatile, thus we need a buffer +// between the VolatileSlice and the TLS stream (see ReadVolatile and +// WriteVolatile implementations below). Allocating this buffer in these +// function calls would make it very slow, thus we tie the buffer to the stream +// with this wrapper. +pub struct TlsStreamWrapper { + stream: TlsStream, + // Used only in ReadVolatile and WriteVolatile + buf: Vec, +} + +static MAX_CHUNK: usize = 1024 * 64; + +impl TlsStreamWrapper { + pub fn new(stream: TlsStream) -> Self { + Self { + stream, + buf: Vec::new(), + } + } +} + +impl Read for TlsStream { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + match self { + TlsStream::Client(s) => s.read(buf), + TlsStream::Server(s) => s.read(buf), + } + } +} + +impl Read for TlsStreamWrapper { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + Read::read(&mut self.stream, buf) + } +} + +impl Write for TlsStream { + fn write(&mut self, buf: &[u8]) -> io::Result { + match self { + TlsStream::Client(s) => s.write(buf), + TlsStream::Server(s) => s.write(buf), + } + } + fn flush(&mut self) -> io::Result<()> { + match self { + TlsStream::Client(s) => s.flush(), + TlsStream::Server(s) => s.flush(), + } + } +} + +impl Write for TlsStreamWrapper { + fn write(&mut self, buf: &[u8]) -> io::Result { + Write::write(&mut self.stream, buf) + } + fn flush(&mut self) -> io::Result<()> { + Write::flush(&mut self.stream) + } +} + +// Reading from or writing to these FDs would break the connection, because +// those reads or writes wouldn't go through rustls. But the FD is used to wait +// until it becomes readable. 
+impl AsFd for TlsStreamWrapper { + fn as_fd(&self) -> BorrowedFd<'_> { + match &self.stream { + TlsStream::Client(s) => s.get_ref().as_fd(), + TlsStream::Server(s) => s.get_ref().as_fd(), + } + } +} + +impl ReadVolatile for TlsStreamWrapper { + fn read_volatile( + &mut self, + vs: &mut VolatileSlice, + ) -> std::result::Result { + let len = vs.len().min(MAX_CHUNK); + + if len == 0 { + return Ok(0); + } + + if self.buf.len() < len { + self.buf.resize(len, 0); + } + + let buf = &mut self.buf[..len]; + let n = + Read::read(&mut self.stream, &mut buf[..len]).map_err(VolatileMemoryError::IOError)?; + + if n == 0 { + return Ok(0); + } + + vs.copy_from(&buf[..n]); + self.buf.clear(); + + Ok(n) + } +} + +impl WriteVolatile for TlsStreamWrapper { + fn write_volatile( + &mut self, + vs: &VolatileSlice, + ) -> std::result::Result { + let len = vs.len().min(MAX_CHUNK); + if len == 0 { + return Ok(0); + } + + if self.buf.len() < len { + self.buf.resize(len, 0); + } + + let buf = &mut self.buf[..len]; + let n = vs.copy_to(&mut buf[..len]); + + if n == 0 { + return Ok(0); + } + + let n = Write::write(&mut self.stream, &buf[..n]).map_err(VolatileMemoryError::IOError)?; + self.buf.clear(); + + Ok(n) + } +} + +// A small wrapper to be put into ReceiveListener::Tls. It carries the +// TLS-Config and creates a TlsStream after the TcpConnection accepted a +// connection. +#[derive(Debug, Clone)] +pub struct TlsConnectionWrapper { + config: Arc, +} + +impl TlsConnectionWrapper { + pub fn new(cert_dir: &Path) -> Self { + let certs = CertificateDer::pem_file_iter(cert_dir.join("server-cert.pem")) + .unwrap() + .map(|cert| cert.unwrap()) + .collect(); + let key = PrivateKeyDer::from_pem_file(cert_dir.join("server-key.pem")).unwrap(); + let config = ServerConfig::builder() + .with_no_client_auth() + .with_single_cert(certs, key) + .map_err(TlsError::RustlsError) + .unwrap(); + let config = Arc::new(config); + Self { config } + } + + pub fn wrap( + &self, + socket: TcpStream, + ) -> std::result::Result { + let conn = ServerConnection::new(self.config.clone()).map_err(TlsError::RustlsError)?; + + let mut tls = StreamOwned::new(conn, socket); + while tls.conn.is_handshaking() { + let (rd, wr) = tls + .conn + .complete_io(&mut tls.sock) + .map_err(TlsError::RustlsIoError)?; + if rd == 0 && wr == 0 { + Err(TlsError::HandshakeError( + "EOF during TLS handshake".to_string(), + ))?; + } + } + + Ok(TlsStreamWrapper::new(TlsStream::Server(tls))) + } +} + +pub fn client_stream( + socket: TcpStream, + cert_dir: &Path, + hostname: &str, +) -> std::result::Result, MigratableError> { + let mut root_store = RootCertStore::empty(); + root_store.add_parsable_certificates( + CertificateDer::pem_file_iter(cert_dir.join("ca-cert.pem")) + .expect("Cannot open CA file") + .map(|result| result.unwrap()), + ); + let config = ClientConfig::builder() + .with_root_certificates(root_store) + .with_no_client_auth(); + let config = Arc::new(config); + let server_name = + ServerName::try_from(hostname.to_string()).map_err(TlsError::InvalidDnsName)?; + let conn = ClientConnection::new(config.clone(), server_name.clone()) + .map_err(TlsError::RustlsError)?; + + let mut tls = StreamOwned::new(conn, socket); + while tls.conn.is_handshaking() { + let (rd, wr) = tls + .conn + .complete_io(&mut tls.sock) + .map_err(TlsError::RustlsIoError)?; + if rd == 0 && wr == 0 { + Err(TlsError::HandshakeError( + "EOF during TLS handshake".to_string(), + ))?; + } + } + + Ok(tls) +} diff --git a/vm-virtio/Cargo.toml b/vm-virtio/Cargo.toml index 
5f195af492..228f552416 100644 --- a/vm-virtio/Cargo.toml +++ b/vm-virtio/Cargo.toml @@ -14,3 +14,6 @@ vm-memory = { workspace = true, features = [ "backend-bitmap", "backend-mmap", ] } + +[lints] +workspace = true diff --git a/vmm/Cargo.toml b/vmm/Cargo.toml index b7035f030c..658f9b36b6 100644 --- a/vmm/Cargo.toml +++ b/vmm/Cargo.toml @@ -54,10 +54,13 @@ hex = { version = "0.4.3", optional = true } hypervisor = { path = "../hypervisor" } igvm = { workspace = true, optional = true } igvm_defs = { workspace = true, optional = true } +kvm-bindings = { workspace = true } landlock = "0.4.3" libc = { workspace = true } linux-loader = { workspace = true, features = ["bzimage", "elf", "pe"] } log = { workspace = true } +# Special fork of micro_http that combines HTTP traffic over a UNIX domain +# socket with UNIX' SCM_RIGHTS mechanism for transferring file descriptors. micro_http = { git = "https://github.com/firecracker-microvm/micro-http", branch = "main" } mshv-bindings = { workspace = true, features = [ "fam-wrappers", @@ -93,3 +96,6 @@ vm-virtio = { path = "../vm-virtio" } vmm-sys-util = { workspace = true, features = ["with-serde"] } zbus = { version = "5.11.0", optional = true } zerocopy = { workspace = true, features = ["alloc", "derive"] } + +[lints] +workspace = true diff --git a/vmm/src/api/http/http_endpoint.rs b/vmm/src/api/http/http_endpoint.rs index b25b7e9ab8..0b9a924658 100644 --- a/vmm/src/api/http/http_endpoint.rs +++ b/vmm/src/api/http/http_endpoint.rs @@ -6,11 +6,11 @@ //! # HTTP Endpoints of the Cloud Hypervisor API //! -//! ## Special Handling for Devices Backed by Network File Descriptors (FDs) (e.g., virtio-net) +//! ## Special Handling for Externally Provided File Descriptors (FDs) (e.g., virtio-net) //! //! Some of the HTTP handlers here implement special logic for devices -//! **backed by network FDs** to enable live-migration, state save/resume -//! (restore), and similar VM lifecycle events. +//! **backed by externally opened FDs** to enable live-migration, +//! state save/resume (restore), and similar VM lifecycle events. //! //! The utilized mechanism requires that the control software (e.g., libvirt) //! connects to Cloud Hypervisor by using a UNIX domain socket and that it @@ -35,11 +35,22 @@ //! [special HTTP library]: https://github.com/firecracker-microvm/micro-http use std::fs::File; -use std::sync::mpsc::Sender; +use std::sync::mpsc::{Receiver, Sender, SyncSender}; +use std::sync::{LazyLock, Mutex}; use micro_http::{Body, Method, Request, Response, StatusCode, Version}; use vmm_sys_util::eventfd::EventFd; +/// Helper to make the VmSendMigration call blocking as long as a migration is ongoing. 
+#[allow(clippy::type_complexity)] +pub static ONGOING_LIVEMIGRATION: LazyLock<( + SyncSender>, + Mutex>>, +)> = LazyLock::new(|| { + let (sender, receiver) = std::sync::mpsc::sync_channel(0); + (sender, Mutex::new(receiver)) +}); + #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::api::VmCoredump; use crate::api::http::http_endpoint::fds_helper::{attach_fds_to_cfg, attach_fds_to_cfgs}; @@ -47,8 +58,8 @@ use crate::api::http::{EndpointHandler, HttpError, error_response}; use crate::api::{ AddDisk, ApiAction, ApiError, ApiRequest, NetConfig, VmAddDevice, VmAddFs, VmAddNet, VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, VmConfig, VmCounters, VmDelete, VmNmi, VmPause, - VmPowerButton, VmReboot, VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeZone, VmRestore, - VmResume, VmSendMigration, VmShutdown, VmSnapshot, + VmPowerButton, VmReboot, VmReceiveMigration, VmReceiveMigrationData, VmRemoveDevice, VmResize, + VmResizeDisk, VmResizeZone, VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, }; use crate::config::RestoreConfig; use crate::cpu::Error as CpuError; @@ -422,15 +433,14 @@ vm_action_put_handler_body!(VmAddVdpa); vm_action_put_handler_body!(VmAddVsock); vm_action_put_handler_body!(VmAddUserDevice); vm_action_put_handler_body!(VmRemoveDevice); +vm_action_put_handler_body!(VmResizeDisk); vm_action_put_handler_body!(VmResizeZone); vm_action_put_handler_body!(VmSnapshot); -vm_action_put_handler_body!(VmReceiveMigration); -vm_action_put_handler_body!(VmSendMigration); #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] vm_action_put_handler_body!(VmCoredump); -// Special handling for virtio-net devices backed by network FDs. +// Special handling for externally provided FDs. // See module description for more info. impl PutHandler for VmAddNet { fn handle_request( @@ -454,6 +464,74 @@ impl PutHandler for VmAddNet { impl GetHandler for VmAddNet {} +// Special handling for externally provided FDs. +// See module description for more info. +impl PutHandler for VmReceiveMigration { + fn handle_request( + &'static self, + api_notifier: EventFd, + api_sender: Sender, + body: &Option, + files: Vec, + ) -> std::result::Result, HttpError> { + if let Some(body) = body { + let mut net_cfg: VmReceiveMigrationData = serde_json::from_slice(body.raw())?; + if let Some(cfgs) = &mut net_cfg.net_fds { + let mut cfgs = cfgs.iter_mut().collect::>(); + let cfgs = cfgs.as_mut_slice(); + attach_fds_to_cfgs(files, cfgs)?; + } + + self.send(api_notifier, api_sender, net_cfg) + .map_err(HttpError::ApiError) + } else { + Err(HttpError::BadRequest) + } + } +} + +impl GetHandler for VmReceiveMigration {} + +// Special Handling for virtio-net Devices Backed by Network File Descriptors +// +// See above. 
+impl PutHandler for VmSendMigration { + fn handle_request( + &'static self, + api_notifier: EventFd, + api_sender: Sender, + body: &Option, + _files: Vec, + ) -> std::result::Result, HttpError> { + if let Some(body) = body { + let res = self + .send( + api_notifier, + api_sender, + serde_json::from_slice(body.raw())?, + ) + .map_err(HttpError::ApiError)?; + + info!("live migration started"); + + let (_, receiver) = &*ONGOING_LIVEMIGRATION; + + info!("waiting for live migration result"); + let mig_res = receiver.lock().unwrap().recv().unwrap(); + info!("received live migration result"); + + // We forward the migration error here to the guest + mig_res + .map(|_| res) + .map_err(|e| HttpError::ApiError(ApiError::VmSendMigration(e))) + } else { + Err(HttpError::BadRequest) + } + } +} + +impl GetHandler for VmSendMigration {} + impl PutHandler for VmResize { fn handle_request( &'static self, @@ -482,7 +560,7 @@ impl PutHandler for VmResize { impl GetHandler for VmResize {} -// Special handling for virtio-net devices backed by network FDs. +// Special handling for externally provided FDs. // See module description for more info. impl PutHandler for VmRestore { fn handle_request( diff --git a/vmm/src/api/http/mod.rs b/vmm/src/api/http/mod.rs index 456610ce90..5662986368 100644 --- a/vmm/src/api/http/mod.rs +++ b/vmm/src/api/http/mod.rs @@ -6,21 +6,24 @@ use std::collections::BTreeMap; use std::error::Error; use std::fs::File; +use std::os::fd::AsRawFd; use std::os::unix::io::{IntoRawFd, RawFd}; use std::os::unix::net::UnixListener; use std::panic::AssertUnwindSafe; use std::path::PathBuf; -use std::sync::LazyLock; -use std::sync::mpsc::Sender; +use std::sync::mpsc::{Receiver, Sender, channel, sync_channel}; +use std::sync::{Arc, LazyLock, Mutex}; use std::thread; use hypervisor::HypervisorType; use micro_http::{ - Body, HttpServer, MediaType, Method, Request, Response, ServerError, StatusCode, Version, + Body, HttpServer, MediaType, Method, Request, Response, ServerError, ServerRequest, + ServerResponse, StatusCode, Version, }; use seccompiler::{SeccompAction, apply_filter}; use serde_json::Error as SerdeError; use thiserror::Error; +use vmm_sys_util::epoll::{ControlOperation, Epoll, EpollEvent, EventSet}; use vmm_sys_util::eventfd::EventFd; use self::http_endpoint::{VmActionHandler, VmCreate, VmInfo, VmmPing, VmmShutdown}; @@ -29,7 +32,7 @@ use crate::api::VmCoredump; use crate::api::{ AddDisk, ApiError, ApiRequest, VmAddDevice, VmAddFs, VmAddNet, VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, VmCounters, VmDelete, VmNmi, VmPause, VmPowerButton, VmReboot, - VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeZone, VmRestore, VmResume, + VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeDisk, VmResizeZone, VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, }; use crate::landlock::Landlock; @@ -249,6 +252,10 @@ pub static HTTP_ROUTES: LazyLock = LazyLock::new(|| { endpoint!("/vm.resize"), Box::new(VmActionHandler::new(&VmResize)), ); + r.routes.insert( + endpoint!("/vm.resize-disk"), + Box::new(VmActionHandler::new(&VmResizeDisk)), + ); r.routes.insert( endpoint!("/vm.resize-zone"), Box::new(VmActionHandler::new(&VmResizeZone)), @@ -310,6 +317,152 @@ fn handle_http_request( response } +/// Keeps track of the worker threads, and the resources needed to interact +/// with them. +#[derive(Debug)] +struct HttpWorkerThreads { + // The worker threads themselves. + threads: Vec>>, + // An MPSC channel to send server requests to the workers. 
We put it into + // an option so we can easily drop it in the destructor. + request_tx: Option>, + // An MPSC channel that the workers use to send responses to the HTTP + // server thread. + response_rx: Receiver, + // Workers signal this eventfd when they have a response for the HTTP + // server thread. + response_event: EventFd, +} + +impl HttpWorkerThreads { + fn new( + thread_count: usize, + api_notifier: EventFd, + api_sender: Sender, + seccomp_action: &SeccompAction, + hypervisor_type: HypervisorType, + landlock_enable: bool, + exit_evt: EventFd, + ) -> Result { + let response_event = EventFd::new(libc::EFD_NONBLOCK).map_err(VmmError::EventFdCreate)?; + let (response_tx, response_rx) = sync_channel::(thread_count); + + let mut threads = Vec::new(); + let (request_tx, request_rx) = channel::(); + + let request_rx = Arc::new(Mutex::new(request_rx)); + + // We use the same seccomp filter that we already use for the HTTP server thread. + let api_seccomp_filter = + get_seccomp_filter(seccomp_action, Thread::HttpApi, hypervisor_type) + .map_err(VmmError::CreateSeccompFilter)?; + + for n in 0..thread_count { + let response_event = response_event.try_clone().map_err(VmmError::EventFdClone)?; + + let response_tx = response_tx.clone(); + let request_rx = request_rx.clone(); + + let api_notifier = api_notifier.try_clone().map_err(VmmError::EventFdClone)?; + let api_sender = api_sender.clone(); + + let api_seccomp_filter = api_seccomp_filter.clone(); + let exit_evt = exit_evt.try_clone().map_err(VmmError::EventFdClone)?; + + let thread = thread::Builder::new() + .name(format!("http-worker-{n}").to_string()) + .spawn(move || { + debug!("Spawned HTTP worker thread with id {n}",); + if !api_seccomp_filter.is_empty() { + apply_filter(&api_seccomp_filter) + .map_err(VmmError::ApplySeccompFilter) + .map_err(|e| { + error!("Error applying seccomp filter: {:?}", e); + exit_evt.write(1).ok(); + e + })?; + } + + if landlock_enable { + Landlock::new() + .map_err(VmmError::CreateLandlock)? + .restrict_self() + .map_err(VmmError::ApplyLandlock) + .map_err(|e| { + error!("Error applying landlock to http-worker thread: {:?}", e); + exit_evt.write(1).ok(); + e + })?; + } + + std::panic::catch_unwind(AssertUnwindSafe(move || { + let id = n; + loop { + let request = request_rx.lock().unwrap().recv(); + match request { + Ok(msg) => { + // Process the server request + let response = msg.process(|request| { + handle_http_request(request, &api_notifier, &api_sender) + }); + + // Send the response to the HTTP server thread together with this + // threads id. + if let Err(e) = response_tx.send(response) { + error!( + "HTTP worker thread {id}: error sending response {}", + e + ); + break; + } + + // Notify the HTTP server thread. + response_event.write(1).ok(); + } + Err(e) => { + error!( + "HTTP worker thread {id}: error receiving request {}", + e + ); + break; + } + } + } + })) + .map_err(|_| { + error!("http-worker thread {n} panicked"); + exit_evt.write(1).ok() + }) + .ok(); + + Ok(()) + }) + .map_err(VmmError::HttpThreadSpawn)?; + + threads.push(thread); + } + + Ok(Self { + threads, + request_tx: Some(request_tx), + response_rx, + response_event, + }) + } +} + +impl Drop for HttpWorkerThreads { + fn drop(&mut self) { + // Dropping the Sender side of the request channels to throw the worker + // threads out of their loops. + drop(self.request_tx.take()); + // Now we can join each thread. + self.threads + .drain(..) 
+ .for_each(|thread| thread.join().unwrap().unwrap()); + } +} + fn start_http_thread( mut server: HttpServer, api_notifier: EventFd, @@ -330,6 +483,42 @@ fn start_http_thread( .add_kill_switch(api_shutdown_fd_clone) .map_err(VmmError::CreateApiServer)?; + // We use the epoll mechanism to parallelize this. The epoll tokens are + // attached when registering the FDs with epoll. That way we can later + // check why we were notified. + const HTTP_EPOLL_TOKEN: u64 = 1; + const WORKER_EPOLL_TOKEN: u64 = 2; + + // The epoll instance our HTTP server thread will wait on. + let outer_epoll = Epoll::new().unwrap(); + let worker_threads = HttpWorkerThreads::new( + 2, + api_notifier, + api_sender, + seccomp_action, + hypervisor_type, + landlock_enable, + exit_evt.try_clone().unwrap(), + )?; + + // Register the fd that the worker threads will signal. + outer_epoll + .ctl( + ControlOperation::Add, + worker_threads.response_event.as_raw_fd(), + EpollEvent::new(EventSet::IN, WORKER_EPOLL_TOKEN), + ) + .unwrap(); + + // Register the HttpServer's fd. + outer_epoll + .ctl( + ControlOperation::Add, + server.epoll().as_raw_fd(), + EpollEvent::new(EventSet::IN, HTTP_EPOLL_TOKEN), + ) + .unwrap(); + let thread = thread::Builder::new() .name("http-server".to_string()) .spawn(move || { @@ -357,24 +546,42 @@ fn start_http_thread( } std::panic::catch_unwind(AssertUnwindSafe(move || { + let mut events = vec![EpollEvent::default(); 32]; server.start_server().unwrap(); + loop { - match server.requests() { - Ok(request_vec) => { - for server_request in request_vec { - if let Err(e) = server.respond(server_request.process(|request| { - handle_http_request(request, &api_notifier, &api_sender) - })) { + let n = outer_epoll.wait(-1, &mut events).unwrap(); + for ev in events.iter().take(n) { + match ev.data() { + HTTP_EPOLL_TOKEN => { + // The HttpServer got a request, handle that. + match server.requests() { + Ok(request_vec) => { + for server_request in request_vec { + worker_threads.request_tx.as_ref().unwrap().send(server_request).unwrap(); + } + } + Err(ServerError::ShutdownEvent) => { + server.flush_outgoing_writes(); + return; + } + Err(e) => { + error!( + "HTTP server error on retrieving incoming request. Error: {e}" + ); + } + } + } + WORKER_EPOLL_TOKEN => { + // One of the worker threads has a response. + // We clear the eventfd first. + let _ = worker_threads.response_event.read().unwrap(); + let response = worker_threads.response_rx.recv().unwrap(); + if let Err(e) = server.respond(response){ error!("HTTP server error on response: {e}"); } } - } - Err(ServerError::ShutdownEvent) => { - server.flush_outgoing_writes(); - return; - } - Err(e) => { - error!("HTTP server error on retrieving incoming request. 
Error: {e}"); + _ => { } } } } diff --git a/vmm/src/api/mod.rs b/vmm/src/api/mod.rs index e0ffc2f8e1..4d681d035f 100644 --- a/vmm/src/api/mod.rs +++ b/vmm/src/api/mod.rs @@ -34,6 +34,8 @@ pub mod dbus; pub mod http; use std::io; +use std::num::NonZeroU32; +use std::path::PathBuf; use std::sync::mpsc::{RecvError, SendError, Sender, channel}; use micro_http::Body; @@ -46,7 +48,7 @@ use vmm_sys_util::eventfd::EventFd; pub use self::dbus::start_dbus_thread; pub use self::http::{start_http_fd_thread, start_http_path_thread}; use crate::Error as VmmError; -use crate::config::RestoreConfig; +use crate::config::{RestoreConfig, RestoredNetConfig}; use crate::device_tree::DeviceTree; use crate::vm::{Error as VmError, VmState}; use crate::vm_config::{ @@ -117,8 +119,8 @@ pub enum ApiError { #[error("The VM could not be snapshotted")] VmSnapshot(#[source] VmError), - /// The VM could not restored. - #[error("The VM could not restored")] + /// The VM could not be restored. + #[error("The VM could not be restored")] VmRestore(#[source] VmError), /// The VM could not be coredumped. @@ -133,6 +135,10 @@ pub enum ApiError { #[error("The VM could not be resized")] VmResize(#[source] VmError), + /// The disk could not be resized. + #[error("The disk could not be resized")] + VmResizeDisk(#[source] VmError), + /// The memory zone could not be resized. #[error("The memory zone could not be resized")] VmResizeZone(#[source] VmError), @@ -222,6 +228,12 @@ pub struct VmResizeData { pub desired_balloon: Option, } +#[derive(Clone, Deserialize, Serialize, Default, Debug)] +pub struct VmResizeDiskData { + pub id: String, + pub desired_size: u64, +} + #[derive(Clone, Deserialize, Serialize, Default, Debug)] pub struct VmResizeZoneData { pub id: String, @@ -245,19 +257,53 @@ pub struct VmCoredumpData { pub destination_url: String, } -#[derive(Clone, Deserialize, Serialize, Default, Debug)] +#[derive(Clone, Deserialize, Serialize, Debug)] pub struct VmReceiveMigrationData { /// URL for the reception of migration state pub receiver_url: String, + /// Optional URL if the TCP serial configuration must be changed during + /// migration. Example: "192.168.1.1:2222". + pub tcp_serial_url: Option, + /// Map with new network FDs on the new host. + pub net_fds: Option>, + /// Directory containing the TLS server certificate (server-cert.pem) and TLS server key (server-key.pem). + #[serde(default)] + pub tls_dir: Option, } -#[derive(Clone, Deserialize, Serialize, Default, Debug)] +#[derive(Clone, Deserialize, Serialize, Debug)] pub struct VmSendMigrationData { - /// URL to migrate the VM to + /// URL to migrate the VM to. + /// + /// This is not actually a URL, but we are stuck with the name, because it's + /// part of the HTTP API. The destination is a string, such as + /// tcp:: or unix:/path/to/socket. pub destination_url: String, /// Send memory across socket without copying #[serde(default)] pub local: bool, + /// Microsecond level downtime + #[serde(default = "default_downtime")] + pub downtime: u64, + /// Second level migration timeout + #[serde(default)] + pub migration_timeout: u64, + /// The number of parallel connections for migration + #[serde(default = "default_connections")] + pub connections: NonZeroU32, + /// Directory containing the TLS root CA certificate (ca-cert.pem) + #[serde(default)] + pub tls_dir: Option, +} + +// Default value for downtime the same as qemu. +fn default_downtime() -> u64 { + 300 +} + +// We use a single connection for backward compatibility as default. 
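Together with default_downtime above and default_connections just below, these serde attributes mean a send-migration request body only has to name the destination. A quick illustrative check (not part of the patch; the URL value is invented):

    let body = r#"{"destination_url": "tcp:192.168.1.2:4000"}"#;
    let data: VmSendMigrationData = serde_json::from_str(body).unwrap();
    assert!(!data.local);                  // #[serde(default)]
    assert_eq!(data.downtime, 300);        // default_downtime()
    assert_eq!(data.migration_timeout, 0); // #[serde(default)]
    assert_eq!(data.connections.get(), 1); // default_connections()
    assert!(data.tls_dir.is_none());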
+fn default_connections() -> NonZeroU32 { + NonZeroU32::new(1).unwrap() } pub enum ApiResponsePayload { @@ -314,6 +360,8 @@ pub trait RequestHandler { fn vm_resize_zone(&mut self, id: String, desired_ram: u64) -> Result<(), VmError>; + fn vm_resize_disk(&mut self, id: String, desired_size: u64) -> Result<(), VmError>; + fn vm_add_device(&mut self, device_cfg: DeviceConfig) -> Result>, VmError>; fn vm_add_user_device( @@ -1135,6 +1183,44 @@ impl ApiAction for VmResize { } } +pub struct VmResizeDisk; + +impl ApiAction for VmResizeDisk { + type RequestBody = VmResizeDiskData; + type ResponseBody = Option; + + fn request( + &self, + resize_disk_data: Self::RequestBody, + response_sender: Sender, + ) -> ApiRequest { + Box::new(move |vmm| { + info!("API request event: VmResizeDisk {:?}", resize_disk_data); + println!("xxxxxx"); + + let response = vmm + .vm_resize_disk(resize_disk_data.id, resize_disk_data.desired_size) + .map_err(ApiError::VmResizeDisk) + .map(|_| ApiResponsePayload::Empty); + + response_sender + .send(response) + .map_err(VmmError::ApiResponseSend)?; + + Ok(false) + }) + } + + fn send( + &self, + api_evt: EventFd, + api_sender: Sender, + data: Self::RequestBody, + ) -> ApiResult { + get_response_body(self, api_evt, api_sender, data) + } +} + pub struct VmResizeZone; impl ApiAction for VmResizeZone { diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml index d87573c558..aa87790a6f 100644 --- a/vmm/src/api/openapi/cloud-hypervisor.yaml +++ b/vmm/src/api/openapi/cloud-hypervisor.yaml @@ -1254,10 +1254,24 @@ components: - destination_url type: object properties: + connections: + type: integer + format: int64 + default: 1 destination_url: type: string local: type: boolean + downtime: + type: integer + format: int64 + description: Maximum downtime in milliseconds during migration + default: 500 + migration_timeout: + type: integer + format: int64 + description: Total timeout for migration in milliseconds (0 = no limit) + default: 0 VmAddUserDevice: required: diff --git a/vmm/src/config.rs b/vmm/src/config.rs index 6ecdd3c800..c15d696dc5 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -6,6 +6,7 @@ use std::collections::{BTreeSet, HashMap}; #[cfg(feature = "ivshmem")] use std::fs; +use std::os::fd::RawFd; use std::path::PathBuf; use std::result; use std::str::FromStr; @@ -174,6 +175,9 @@ pub enum Error { /// Failed Parsing FwCfgItem config #[error("Error parsing --fw-cfg-config items")] ParseFwCfgItem(#[source] OptionParserError), + /// Failed parsing addr option + #[error("Error parsing --addr")] + ParsePciAddr(#[source] OptionParserError), } #[derive(Debug, PartialEq, Eq, Error)] @@ -181,6 +185,9 @@ pub enum ValidationError { /// Missing file value for console #[error("Path missing when using file console mode")] ConsoleFileMissing, + /// Missing TCP address for console + #[error("Address missing when using TCP console mode")] + ConsoleTcpAddressMissing, /// Missing socket path for console #[error("Path missing when using socket console mode")] ConsoleSocketPathMissing, @@ -226,8 +233,8 @@ pub enum ValidationError { #[error("Number of queues to virtio_net does not match the number of input FDs")] VnetQueueFdMismatch, /// Using reserved fd - #[error("Reserved fd number (<= 2)")] - VnetReservedFd, + #[error("Reserved fd number (fd={0} <= 2)")] + VnetReservedFd(RawFd), /// Hardware checksum offload is disabled. 
#[error("\"offload_tso\" and \"offload_ufo\" depend on \"offload_csum\"")] NoHardwareChecksumOffload, @@ -1077,7 +1084,7 @@ impl DiskConfig { ops_size=,ops_one_time_burst=,ops_refill_time=,\ id=,pci_segment=,rate_limit_group=,\ queue_affinity=,\ - serial="; + serial=,addr="; pub fn parse(disk: &str) -> Result { let mut parser = OptionParser::new(); @@ -1102,7 +1109,8 @@ impl DiskConfig { .add("pci_segment") .add("serial") .add("rate_limit_group") - .add("queue_affinity"); + .add("queue_affinity") + .add("addr"); parser.parse(disk).map_err(Error::ParseDisk)?; let path = parser.get("path").map(PathBuf::from); @@ -1214,6 +1222,10 @@ impl DiskConfig { None }; + let (bdf_device, _bdf_function) = parser + .get_pci_device_function() + .map_err(Error::ParsePciAddr)?; + Ok(DiskConfig { path, readonly, @@ -1231,6 +1243,7 @@ impl DiskConfig { pci_segment, serial, queue_affinity, + bdf_device, }) } @@ -1302,7 +1315,7 @@ impl NetConfig { vhost_user=,socket=,vhost_mode=client|server,\ bw_size=,bw_one_time_burst=,bw_refill_time=,\ ops_size=,ops_one_time_burst=,ops_refill_time=,pci_segment=\ - offload_tso=on|off,offload_ufo=on|off,offload_csum=on|off\""; + offload_tso=on|off,offload_ufo=on|off,offload_csum=on|off,addr=DD.F\""; pub fn parse(net: &str) -> Result { let mut parser = OptionParser::new(); @@ -1331,7 +1344,8 @@ impl NetConfig { .add("ops_size") .add("ops_one_time_burst") .add("ops_refill_time") - .add("pci_segment"); + .add("pci_segment") + .add("addr"); parser.parse(net).map_err(Error::ParseNetwork)?; let tap = parser.get("tap"); @@ -1442,6 +1456,10 @@ impl NetConfig { None }; + let (bdf_device, _bdf_function) = parser + .get_pci_device_function() + .map_err(Error::ParsePciAddr)?; + let config = NetConfig { tap, ip, @@ -1462,6 +1480,7 @@ impl NetConfig { offload_tso, offload_ufo, offload_csum, + bdf_device, }; Ok(config) } @@ -1478,7 +1497,12 @@ impl NetConfig { if let Some(fds) = self.fds.as_ref() { for fd in fds { if *fd <= 2 { - return Err(ValidationError::VnetReservedFd); + // If we see this, most likely our live migration path for network FDs failed. 
+ log::debug!( + "virtio-net devices {:?} unexpectedly reports invalid FD", + self.id + ); + return Err(ValidationError::VnetReservedFd(*fd)); } } } @@ -1529,7 +1553,7 @@ impl NetConfig { impl RngConfig { pub fn parse(rng: &str) -> Result { let mut parser = OptionParser::new(); - parser.add("src").add("iommu"); + parser.add("src").add("iommu").add("addr"); parser.parse(rng).map_err(Error::ParseRng)?; let src = PathBuf::from( @@ -1543,19 +1567,27 @@ impl RngConfig { .unwrap_or(Toggle(false)) .0; - Ok(RngConfig { src, iommu }) + let (bdf_device, _bdf_function) = parser + .get_pci_device_function() + .map_err(Error::ParsePciAddr)?; + + Ok(RngConfig { + src, + iommu, + bdf_device, + }) } } impl BalloonConfig { pub const SYNTAX: &'static str = "Balloon parameters \"size=,deflate_on_oom=on|off,\ - free_page_reporting=on|off\""; + free_page_reporting=on|off,addr=\""; pub fn parse(balloon: &str) -> Result { let mut parser = OptionParser::new(); parser.add("size"); parser.add("deflate_on_oom"); - parser.add("free_page_reporting"); + parser.add("free_page_reporting").add("addr"); parser.parse(balloon).map_err(Error::ParseBalloon)?; let size = parser @@ -1576,10 +1608,15 @@ impl BalloonConfig { .unwrap_or(Toggle(false)) .0; + let (bdf_device, _bdf_function) = parser + .get_pci_device_function() + .map_err(Error::ParsePciAddr)?; + Ok(BalloonConfig { size, deflate_on_oom, free_page_reporting, + bdf_device, }) } } @@ -1587,7 +1624,8 @@ impl BalloonConfig { impl FsConfig { pub const SYNTAX: &'static str = "virtio-fs parameters \ \"tag=,socket=,num_queues=,\ - queue_size=,id=,pci_segment=\""; + queue_size=,id=,pci_segment=,\ + addr=\""; pub fn parse(fs: &str) -> Result { let mut parser = OptionParser::new(); @@ -1597,7 +1635,8 @@ impl FsConfig { .add("num_queues") .add("socket") .add("id") - .add("pci_segment"); + .add("pci_segment") + .add("addr"); parser.parse(fs).map_err(Error::ParseFileSystem)?; let tag = parser.get("tag").ok_or(Error::ParseFsTagMissing)?; @@ -1622,6 +1661,10 @@ impl FsConfig { .map_err(Error::ParseFileSystem)? .unwrap_or_default(); + let (bdf_device, _bdf_function) = parser + .get_pci_device_function() + .map_err(Error::ParsePciAddr)?; + Ok(FsConfig { tag, socket, @@ -1629,6 +1672,7 @@ impl FsConfig { queue_size, id, pci_segment, + bdf_device, }) } @@ -1754,7 +1798,7 @@ impl FwCfgItem { impl PmemConfig { pub const SYNTAX: &'static str = "Persistent memory parameters \ \"file=,size=,iommu=on|off,\ - discard_writes=on|off,id=,pci_segment=\""; + discard_writes=on|off,id=,pci_segment=,addr=\""; pub fn parse(pmem: &str) -> Result { let mut parser = OptionParser::new(); @@ -1764,7 +1808,8 @@ impl PmemConfig { .add("iommu") .add("discard_writes") .add("id") - .add("pci_segment"); + .add("pci_segment") + .add("addr"); parser.parse(pmem).map_err(Error::ParsePersistentMemory)?; let file = PathBuf::from(parser.get("file").ok_or(Error::ParsePmemFileMissing)?); @@ -1788,6 +1833,10 @@ impl PmemConfig { .map_err(Error::ParsePersistentMemory)? 
.unwrap_or_default(); + let (bdf_device, _bdf_function) = parser + .get_pci_device_function() + .map_err(Error::ParsePciAddr)?; + Ok(PmemConfig { file, size, @@ -1795,6 +1844,7 @@ impl PmemConfig { discard_writes, id, pci_segment, + bdf_device, }) } @@ -1826,11 +1876,14 @@ impl ConsoleConfig { .add_valueless("null") .add("file") .add("iommu") - .add("socket"); + .add("tcp") + .add("socket") + .add("addr"); parser.parse(console).map_err(Error::ParseConsole)?; let mut file: Option = default_consoleconfig_file(); let mut socket: Option = None; + let mut url: Option = None; let mut mode: ConsoleOutputMode = ConsoleOutputMode::Off; if parser.is_set("off") { @@ -1840,6 +1893,19 @@ impl ConsoleConfig { mode = ConsoleOutputMode::Tty } else if parser.is_set("null") { mode = ConsoleOutputMode::Null + } else if parser.is_set("tcp") { + mode = ConsoleOutputMode::Tcp; + url = Some( + parser + .get("tcp") + .ok_or(Error::Validation(ValidationError::ConsoleTcpAddressMissing))?, + ); + if parser.is_set("file") { + file = + Some(PathBuf::from(parser.get("file").ok_or( + Error::Validation(ValidationError::ConsoleFileMissing), + )?)); + } } else if parser.is_set("file") { mode = ConsoleOutputMode::File; file = @@ -1860,11 +1926,17 @@ impl ConsoleConfig { .unwrap_or(Toggle(false)) .0; + let (bdf_device, _bdf_function) = parser + .get_pci_device_function() + .map_err(Error::ParsePciAddr)?; + Ok(Self { file, mode, iommu, socket, + url, + bdf_device, }) } } @@ -1924,7 +1996,8 @@ impl DebugConsoleConfig { } impl DeviceConfig { - pub const SYNTAX: &'static str = "Direct device assignment parameters \"path=,iommu=on|off,id=,pci_segment=\""; + pub const SYNTAX: &'static str = "Direct device assignment parameters \"\ + path=,iommu=on|off,id=,pci_segment=\""; pub fn parse(device: &str) -> Result { let mut parser = OptionParser::new(); @@ -2028,7 +2101,7 @@ impl UserDeviceConfig { impl VdpaConfig { pub const SYNTAX: &'static str = "vDPA device \ \"path=,num_queues=,iommu=on|off,\ - id=,pci_segment=\""; + id=,pci_segment=,addr=\""; pub fn parse(vdpa: &str) -> Result { let mut parser = OptionParser::new(); @@ -2037,7 +2110,8 @@ impl VdpaConfig { .add("num_queues") .add("iommu") .add("id") - .add("pci_segment"); + .add("pci_segment") + .add("addr"); parser.parse(vdpa).map_err(Error::ParseVdpa)?; let path = parser @@ -2059,12 +2133,17 @@ impl VdpaConfig { .map_err(Error::ParseVdpa)? .unwrap_or_default(); + let (bdf_device, _bdf_function) = parser + .get_pci_device_function() + .map_err(Error::ParsePciAddr)?; + Ok(VdpaConfig { path, num_queues, iommu, id, pci_segment, + bdf_device, }) } @@ -2088,7 +2167,8 @@ impl VdpaConfig { impl VsockConfig { pub const SYNTAX: &'static str = "Virtio VSOCK parameters \ - \"cid=,socket=,iommu=on|off,id=,pci_segment=\""; + \"cid=,socket=,iommu=on|off,id=,\ + pci_segment=,addr=\""; pub fn parse(vsock: &str) -> Result { let mut parser = OptionParser::new(); @@ -2097,7 +2177,8 @@ impl VsockConfig { .add("cid") .add("iommu") .add("id") - .add("pci_segment"); + .add("pci_segment") + .add("addr"); parser.parse(vsock).map_err(Error::ParseVsock)?; let socket = parser @@ -2119,12 +2200,17 @@ impl VsockConfig { .map_err(Error::ParseVsock)? 
.unwrap_or_default(); + let (bdf_device, _bdf_function) = parser + .get_pci_device_function() + .map_err(Error::ParsePciAddr)?; + Ok(VsockConfig { cid, socket, iommu, id, pci_segment, + bdf_device, }) } @@ -2216,6 +2302,27 @@ pub struct RestoredNetConfig { pub fds: Option>, } +impl RestoredNetConfig { + // Ensure all net devices from 'VmConfig' backed by FDs have a + // corresponding 'RestoreNetConfig' with a matched 'id' and expected + // number of FDs. + pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { + let found = vm_config + .net + .iter() + .flatten() + .any(|net| net.id.as_ref() == Some(&self.id)); + + if !found { + Err(ValidationError::RestoreMissingRequiredNetId( + self.id.clone(), + )) + } else { + Ok(()) + } + } +} + fn deserialize_restorednetconfig_fds<'de, D>( d: D, ) -> std::result::Result>, D::Error> @@ -3096,6 +3203,8 @@ impl VmConfig { /// To use this safely, the caller must guarantee that the input /// fds are all valid. pub unsafe fn add_preserved_fds(&mut self, mut fds: Vec) { + debug!("adding preserved FDs to VM list: {fds:?}"); + if fds.is_empty() { return; } @@ -3149,7 +3258,16 @@ impl Clone for VmConfig { .preserved_fds .as_ref() // SAFETY: FFI call with valid FDs - .map(|fds| fds.iter().map(|fd| unsafe { libc::dup(*fd) }).collect()), + .map(|fds| { + fds.iter() + .map(|fd| { + // SAFETY: Trivially safe. + let fd_duped = unsafe { libc::dup(*fd) }; + warn!("Cloning VM config: duping preserved FD {fd} => {fd_duped}"); + fd_duped + }) + .collect() + }), landlock_rules: self.landlock_rules.clone(), #[cfg(feature = "ivshmem")] ivshmem: self.ivshmem.clone(), @@ -3161,6 +3279,7 @@ impl Clone for VmConfig { impl Drop for VmConfig { fn drop(&mut self) { if let Some(mut fds) = self.preserved_fds.take() { + debug!("Closing preserved FDs from VM: fds={fds:?}"); for fd in fds.drain(..) 
{ // SAFETY: FFI call with valid FDs unsafe { libc::close(fd) }; @@ -3401,6 +3520,7 @@ mod tests { pci_segment: 0, serial: None, queue_affinity: None, + bdf_device: None, } } @@ -3495,6 +3615,13 @@ mod tests { ..disk_fixture() } ); + assert_eq!( + DiskConfig::parse("path=/path/to_file,addr=15.0")?, + DiskConfig { + bdf_device: Some(21), + ..disk_fixture() + } + ); Ok(()) } @@ -3519,6 +3646,7 @@ mod tests { offload_tso: true, offload_ufo: true, offload_csum: true, + bdf_device: None, } } @@ -3583,6 +3711,14 @@ mod tests { } ); + assert_eq!( + NetConfig::parse("mac=de:ad:be:ef:12:34,host_mac=12:34:de:ad:be:ef,addr=08.0")?, + NetConfig { + bdf_device: Some(8), + ..net_fixture() + } + ); + assert_eq!( NetConfig::parse("mac=de:ad:be:ef:12:34,mask=255.255.255.0")?, NetConfig { @@ -3610,6 +3746,7 @@ mod tests { RngConfig { src: PathBuf::from("/dev/random"), iommu: true, + bdf_device: None, } ); assert_eq!( @@ -3619,6 +3756,13 @@ mod tests { ..Default::default() } ); + assert_eq!( + RngConfig::parse("addr=10.0")?, + RngConfig { + bdf_device: Some(16), + ..Default::default() + } + ); Ok(()) } @@ -3630,6 +3774,7 @@ mod tests { queue_size: 1024, id: None, pci_segment: 0, + bdf_device: None, } } @@ -3649,6 +3794,14 @@ mod tests { } ); + assert_eq!( + FsConfig::parse("tag=mytag,socket=/tmp/sock,addr=0F.0")?, + FsConfig { + bdf_device: Some(15), + ..fs_fixture() + } + ); + Ok(()) } @@ -3660,6 +3813,7 @@ mod tests { discard_writes: false, id: None, pci_segment: 0, + bdf_device: None, } } @@ -3687,6 +3841,13 @@ mod tests { ..pmem_fixture() } ); + assert_eq!( + PmemConfig::parse("file=/tmp/pmem,size=128M,addr=1F.0")?, + PmemConfig { + bdf_device: Some(31), + ..pmem_fixture() + } + ); Ok(()) } @@ -3702,6 +3863,8 @@ mod tests { iommu: false, file: None, socket: None, + url: None, + bdf_device: None, } ); assert_eq!( @@ -3711,6 +3874,8 @@ mod tests { iommu: false, file: None, socket: None, + url: None, + bdf_device: None, } ); assert_eq!( @@ -3720,6 +3885,8 @@ mod tests { iommu: false, file: None, socket: None, + url: None, + bdf_device: None, } ); assert_eq!( @@ -3729,6 +3896,8 @@ mod tests { iommu: false, file: None, socket: None, + url: None, + bdf_device: None, } ); assert_eq!( @@ -3738,6 +3907,8 @@ mod tests { iommu: false, file: Some(PathBuf::from("/tmp/console")), socket: None, + url: None, + bdf_device: None, } ); assert_eq!( @@ -3747,6 +3918,8 @@ mod tests { iommu: true, file: None, socket: None, + url: None, + bdf_device: None, } ); assert_eq!( @@ -3756,6 +3929,8 @@ mod tests { iommu: true, file: Some(PathBuf::from("/tmp/console")), socket: None, + url: None, + bdf_device: None, } ); assert_eq!( @@ -3765,6 +3940,8 @@ mod tests { iommu: true, file: None, socket: Some(PathBuf::from("/tmp/serial.sock")), + url: None, + bdf_device: None, } ); Ok(()) @@ -3816,6 +3993,7 @@ mod tests { iommu: false, id: None, pci_segment: 0, + bdf_device: None, } } @@ -3832,6 +4010,13 @@ mod tests { ..vdpa_fixture() } ); + assert_eq!( + VdpaConfig::parse("path=/dev/vhost-vdpa,addr=0A.0")?, + VdpaConfig { + bdf_device: Some(10), + ..vdpa_fixture() + } + ); Ok(()) } @@ -3860,6 +4045,7 @@ mod tests { iommu: false, id: None, pci_segment: 0, + bdf_device: None, } ); assert_eq!( @@ -3870,6 +4056,19 @@ mod tests { iommu: true, id: None, pci_segment: 0, + bdf_device: None, + } + ); + + assert_eq!( + VsockConfig::parse("socket=/tmp/sock,cid=3,iommu=on,addr=08.0")?, + VsockConfig { + cid: 3, + socket: PathBuf::from("/tmp/sock"), + iommu: true, + id: None, + pci_segment: 0, + bdf_device: Some(8), } ); Ok(()) @@ -3949,6 +4148,7 @@ mod 
tests { id: Some("net0".to_owned()), num_queues: 2, fds: Some(vec![-1, -1, -1, -1]), + bdf_device: Some(15), ..net_fixture() }, NetConfig { @@ -4122,6 +4322,7 @@ mod tests { rng: RngConfig { src: PathBuf::from("/dev/urandom"), iommu: false, + bdf_device: None, }, balloon: None, fs: None, @@ -4131,12 +4332,16 @@ mod tests { mode: ConsoleOutputMode::Null, iommu: false, socket: None, + url: None, + bdf_device: None, }, console: ConsoleConfig { file: None, mode: ConsoleOutputMode::Tty, iommu: false, socket: None, + url: None, + bdf_device: None, }, #[cfg(target_arch = "x86_64")] debug_console: DebugConsoleConfig::default(), @@ -4279,7 +4484,7 @@ mod tests { }]); assert_eq!( invalid_config.validate(), - Err(ValidationError::VnetReservedFd) + Err(ValidationError::VnetReservedFd(0)) ); let mut invalid_config = valid_config.clone(); @@ -4454,6 +4659,7 @@ mod tests { id: None, iommu: true, pci_segment: 1, + bdf_device: None, }); still_valid_config.validate().unwrap(); @@ -4530,6 +4736,7 @@ mod tests { id: None, iommu: false, pci_segment: 1, + bdf_device: None, }); assert_eq!( invalid_config.validate(), diff --git a/vmm/src/console_devices.rs b/vmm/src/console_devices.rs index 9f8d18ae7c..19ac18e3ef 100644 --- a/vmm/src/console_devices.rs +++ b/vmm/src/console_devices.rs @@ -12,6 +12,7 @@ use std::fs::{File, OpenOptions, read_link}; use std::mem::zeroed; +use std::net::TcpListener; use std::os::fd::{AsRawFd, FromRawFd, RawFd}; use std::os::unix::fs::OpenOptionsExt; use std::os::unix::net::UnixListener; @@ -40,6 +41,10 @@ pub enum ConsoleDeviceError { #[error("No socket option support for console device")] NoSocketOptionSupportForConsoleDevice, + /// Error parsing the TCP address + #[error("Wrong TCP address format: {0}")] + WrongTcpAddressFormat(std::string::String), + /// Error setting pty raw mode #[error("Error setting pty raw mode")] SetPtyRaw(#[source] vmm_sys_util::errno::Error), @@ -62,6 +67,7 @@ pub enum ConsoleOutput { Tty(Arc), Null, Socket(Arc), + Tcp(Arc, Option>), Off, } @@ -227,6 +233,7 @@ pub(crate) fn pre_create_console_devices(vmm: &mut Vmm) -> ConsoleDeviceResult { return Err(ConsoleDeviceError::NoSocketOptionSupportForConsoleDevice); } + ConsoleOutputMode::Tcp => ConsoleOutput::Null, ConsoleOutputMode::Null => ConsoleOutput::Null, ConsoleOutputMode::Off => ConsoleOutput::Off, }, @@ -264,6 +271,21 @@ pub(crate) fn pre_create_console_devices(vmm: &mut Vmm) -> ConsoleDeviceResult { + let url = vmconfig.serial.url.as_ref().unwrap(); + let socket_addr: std::net::SocketAddr = url + .parse() + .map_err(|_| ConsoleDeviceError::WrongTcpAddressFormat(url.to_string()))?; + let listener = TcpListener::bind(socket_addr) + .map_err(ConsoleDeviceError::CreateConsoleDevice)?; + + let mut f = None; + if let Some(p) = &vmconfig.serial.file { + let file = File::create(p).map_err(ConsoleDeviceError::CreateConsoleDevice)?; + f = Some(Arc::new(file)); + } + ConsoleOutput::Tcp(Arc::new(listener), f) + } ConsoleOutputMode::Null => ConsoleOutput::Null, ConsoleOutputMode::Off => ConsoleOutput::Off, }, @@ -290,6 +312,7 @@ pub(crate) fn pre_create_console_devices(vmm: &mut Vmm) -> ConsoleDeviceResult { return Err(ConsoleDeviceError::NoSocketOptionSupportForConsoleDevice); } + ConsoleOutputMode::Tcp => ConsoleOutput::Null, ConsoleOutputMode::Null => ConsoleOutput::Null, ConsoleOutputMode::Off => ConsoleOutput::Off, }, diff --git a/vmm/src/cpu.rs b/vmm/src/cpu.rs index 01e12e807f..e4c86ec2fa 100644 --- a/vmm/src/cpu.rs +++ b/vmm/src/cpu.rs @@ -74,6 +74,8 @@ use vm_migration::{ use 
vmm_sys_util::eventfd::EventFd; use vmm_sys_util::signal::{SIGRTMIN, register_signal_handler}; use zerocopy::{FromBytes, Immutable, IntoBytes}; +#[cfg(feature = "kvm")] +use {kvm_bindings::kvm_run, std::cell::Cell, std::os::fd::RawFd, std::sync::RwLock}; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::coredump::{ @@ -89,6 +91,16 @@ use crate::vm::physical_bits; use crate::vm_config::CpusConfig; use crate::{CPU_MANAGER_SNAPSHOT_ID, GuestMemoryMmap}; +#[cfg(feature = "kvm")] +thread_local! { + static KVM_RUN: Cell<*mut kvm_run> = const {Cell::new(core::ptr::null_mut())}; +} +#[cfg(feature = "kvm")] +/// Tell signal handler to not access certain stuff anymore during shutdown. +/// Otherwise => panics. +/// Better alternative would be to prevent signals there at all. +pub static IS_IN_SHUTDOWN: RwLock = RwLock::new(false); + #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] /// Extract the specified bits of a 64-bit integer. /// For example, to extrace 2 bits from offset 1 (zero based) of `6u64`, @@ -543,6 +555,13 @@ impl Vcpu { .map_err(Error::VcpuSetGicrBaseAddr)?; Ok(()) } + + #[cfg(feature = "kvm")] + pub fn get_kvm_vcpu_raw_fd(&self) -> RawFd { + // SAFETY: We happen to know that all current uses respect the safety contract. + // TODO find a better way to keep this safe and/or express its fragile state. + unsafe { self.vcpu.get_kvm_vcpu_raw_fd() } + } } impl Pausable for Vcpu {} @@ -771,39 +790,8 @@ impl CpuManager { #[cfg(target_arch = "x86_64")] if config.features.amx { - const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024; - const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025; - const XFEATURE_XTILEDATA: usize = 18; - const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA; - - // SAFETY: the syscall is only modifying kernel internal - // data structures that the kernel is itself expected to safeguard. - let amx_tile = unsafe { - libc::syscall( - libc::SYS_arch_prctl, - ARCH_REQ_XCOMP_GUEST_PERM, - XFEATURE_XTILEDATA, - ) - }; - - if amx_tile != 0 { - return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); - } else { - let mut mask: usize = 0; - // SAFETY: Syscall with valid parameters. We use a raw mutable pointer to - // the `mask` place in order to ensure that we do not violate Rust's - // aliasing rules. - let result = unsafe { - libc::syscall( - libc::SYS_arch_prctl, - ARCH_GET_XCOMP_GUEST_PERM, - &raw mut mask, - ) - }; - if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK { - return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); - } - } + hypervisor::arch::x86::XsaveState::enable_amx_state_components(hypervisor.as_ref()) + .map_err(|e| crate::cpu::Error::AmxEnable(e.into()))?; } let proximity_domain_per_cpu: BTreeMap = { @@ -1099,6 +1087,28 @@ impl CpuManager { thread::Builder::new() .name(format!("vcpu{vcpu_id}")) .spawn(move || { + // init thread-local kvm_run structure + #[cfg(feature = "kvm")] + { + let raw_kvm_fd = vcpu.lock().unwrap().get_kvm_vcpu_raw_fd(); + + // SAFETY: We know the FD is valid and have the proper args. 
+ let buffer = unsafe { + libc::mmap( + core::ptr::null_mut(), + 4096, + libc::PROT_WRITE | libc::PROT_READ, + libc::MAP_SHARED, + raw_kvm_fd, + 0, + ) + }; + assert!(!buffer.is_null()); + assert_ne!(buffer, libc::MAP_FAILED); + let kvm_run = buffer.cast::(); + KVM_RUN.set(kvm_run); + } + // Schedule the thread to run on the expected CPU set if let Some(cpuset) = cpuset.as_ref() { // SAFETY: FFI call with correct arguments @@ -1128,7 +1138,35 @@ impl CpuManager { return; } + #[cfg(not(feature = "kvm"))] extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {} + #[cfg(feature = "kvm")] + extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) { + // We do not need a self-pipe for safe UNIX signal handling here as in this + // signal handler, we only expect the same signal over and over again. While + // different signals can interrupt a signal being handled, the same signal + // again can't by default. Therefore, this is safe. + + // This lock prevents accessing thread locals when a signal is received + // in the teardown phase of the Rust standard library. Otherwise, we would + // panic. + // + // Masking signals would be a nicer approach but this is the pragmatic + // solution. + // + // We don't have lock contention in normal operation. When the writer + // sets the bool to true, the lock is only held for a couple of µs. + let lock = IS_IN_SHUTDOWN.read().unwrap(); + if *lock { + return; + } + + let kvm_run = KVM_RUN.get(); + // SAFETY: the mapping is valid + let kvm_run = unsafe { + kvm_run.as_mut().expect("kvm_run should have been mapped as part of vCPU setup") }; + kvm_run.immediate_exit = 1; + } // This uses an async signal safe handler to kill the vcpu handles. register_signal_handler(SIGRTMIN(), handle_signal) .expect("Failed to register vcpu signal handler"); @@ -1167,12 +1205,14 @@ impl CpuManager { #[cfg(feature = "kvm")] if matches!(hypervisor_type, HypervisorType::Kvm) { - vcpu.lock().unwrap().vcpu.set_immediate_exit(true); - if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) { + let lock = vcpu.lock(); + let mut lock = lock.unwrap(); + lock.vcpu.set_immediate_exit(true); + if !matches!(lock.run(), Ok(VmExit::Ignore)) { error!("Unexpected VM exit on \"immediate_exit\" run"); break; } - vcpu.lock().unwrap().vcpu.set_immediate_exit(false); + lock.vcpu.set_immediate_exit(false); } vcpu_run_interrupted.store(true, Ordering::SeqCst); diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index e52c5900ba..fbf407e3a6 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -9,7 +9,7 @@ // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause // -use std::collections::{BTreeMap, BTreeSet, HashMap}; +use std::collections::{BTreeMap, BTreeSet, HashMap, VecDeque}; use std::fs::{File, OpenOptions}; use std::io::{self, IsTerminal, Seek, SeekFrom, stdout}; use std::num::Wrapping; @@ -466,7 +466,7 @@ pub enum DeviceManagerError { /// Failed to find an available PCI device ID. #[error("Failed to find an available PCI device ID")] - NextPciDeviceId(#[source] pci::PciRootError), + AllocatePciDeviceId(#[source] pci::PciRootError), /// Could not reserve the PCI device ID. #[error("Could not reserve the PCI device ID")] @@ -666,6 +666,10 @@ pub enum DeviceManagerError { /// Error adding fw_cfg to bus. #[error("Error adding fw_cfg to bus")] ErrorAddingFwCfgToBus(#[source] vm_device::BusError), + + /// Disk resizing failed. 
+ #[error("Disk resize error")] + DiskResizeError(#[source] virtio_devices::block::Error), } pub type DeviceManagerResult = result::Result; @@ -891,6 +895,7 @@ struct MetaVirtioDevice { iommu: bool, id: String, pci_segment: u16, + bdf_device: Option, dma_handler: Option>, } @@ -976,7 +981,7 @@ pub struct DeviceManager { cpu_manager: Arc>, // The virtio devices on the system - virtio_devices: Vec, + virtio_devices: VecDeque, /// All disks. Needed for locking and unlocking the images. block_devices: Vec>>, @@ -1313,7 +1318,7 @@ impl DeviceManager { config, memory_manager, cpu_manager, - virtio_devices: Vec::new(), + virtio_devices: VecDeque::new(), block_devices: vec![], bus_devices: Vec::new(), device_id_cnt, @@ -1397,8 +1402,6 @@ impl DeviceManager { ) -> DeviceManagerResult<()> { trace_scoped!("create_devices"); - let mut virtio_devices: Vec = Vec::new(); - self.cpu_manager .lock() .unwrap() @@ -1451,7 +1454,6 @@ impl DeviceManager { self.console = self.add_console_devices( legacy_interrupt_manager.as_ref(), - &mut virtio_devices, console_info, console_resize_pipe, )?; @@ -1464,11 +1466,8 @@ impl DeviceManager { } self.legacy_interrupt_manager = Some(legacy_interrupt_manager); - virtio_devices.append(&mut self.make_virtio_devices()?); - - self.add_pci_devices(virtio_devices.clone())?; - - self.virtio_devices = virtio_devices; + self.make_virtio_devices()?; + self.add_pci_devices()?; // Add pvmemcontrol if required #[cfg(feature = "pvmemcontrol")] @@ -1577,10 +1576,7 @@ impl DeviceManager { } #[allow(unused_variables)] - fn add_pci_devices( - &mut self, - virtio_devices: Vec, - ) -> DeviceManagerResult<()> { + fn add_pci_devices(&mut self) -> DeviceManagerResult<()> { let iommu_id = String::from(IOMMU_DEVICE_NAME); let iommu_address_width_bits = @@ -1622,7 +1618,7 @@ impl DeviceManager { let mut iommu_attached_devices = Vec::new(); { - for handle in virtio_devices { + for handle in self.virtio_devices.clone() { let mapping: Option> = if handle.iommu { self.iommu_mapping.clone() } else { @@ -1635,6 +1631,7 @@ impl DeviceManager { handle.id, handle.pci_segment, handle.dma_handler, + handle.bdf_device, )?; if handle.iommu { @@ -1663,7 +1660,8 @@ impl DeviceManager { } if let Some(iommu_device) = iommu_device { - let dev_id = self.add_virtio_pci_device(iommu_device, &None, iommu_id, 0, None)?; + let dev_id = + self.add_virtio_pci_device(iommu_device, &None, iommu_id, 0, None, None)?; self.iommu_attached_devices = Some((dev_id, iommu_attached_devices)); } } @@ -2310,7 +2308,6 @@ impl DeviceManager { fn add_virtio_console_device( &mut self, - virtio_devices: &mut Vec, console_fd: ConsoleOutput, resize_pipe: Option>, ) -> DeviceManagerResult>> { @@ -2345,6 +2342,9 @@ impl DeviceManager { ConsoleOutput::Socket(_) => { return Err(DeviceManagerError::NoSocketOptionSupportForConsoleDevice); } + ConsoleOutput::Tcp(_, _) => { + return Err(DeviceManagerError::NoSocketOptionSupportForConsoleDevice); + } ConsoleOutput::Null => Endpoint::Null, ConsoleOutput::Off => return Ok(None), }; @@ -2366,14 +2366,21 @@ impl DeviceManager { ) .map_err(DeviceManagerError::CreateVirtioConsole)?; let virtio_console_device = Arc::new(Mutex::new(virtio_console_device)); - virtio_devices.push(MetaVirtioDevice { + let device = MetaVirtioDevice { virtio_device: Arc::clone(&virtio_console_device) as Arc>, iommu: console_config.iommu, id: id.clone(), pci_segment: 0, dma_handler: None, - }); + bdf_device: console_config.bdf_device, + }; + + if console_config.bdf_device.is_some() { + self.virtio_devices.push_front(device); + } 
else { + self.virtio_devices.push_back(device); + } // Fill the device tree with a new node. In case of restore, we // know there is nothing to do, so we can simply override the @@ -2399,7 +2406,6 @@ impl DeviceManager { fn add_console_devices( &mut self, interrupt_manager: &dyn InterruptManager, - virtio_devices: &mut Vec, console_info: Option, console_resize_pipe: Option>, ) -> DeviceManagerResult> { @@ -2419,12 +2425,16 @@ impl DeviceManager { | ConsoleOutput::Null | ConsoleOutput::Pty(_) | ConsoleOutput::Socket(_) => None, + ConsoleOutput::Tcp(_, _) => None, }; if !matches!(console_info.serial_main_fd, ConsoleOutput::Off) { let serial = self.add_serial_device(interrupt_manager, serial_writer)?; self.serial_manager = match console_info.serial_main_fd { - ConsoleOutput::Pty(_) | ConsoleOutput::Tty(_) | ConsoleOutput::Socket(_) => { + ConsoleOutput::Pty(_) + | ConsoleOutput::Tty(_) + | ConsoleOutput::Socket(_) + | ConsoleOutput::Tcp(_, _) => { let serial_manager = SerialManager::new( serial, console_info.serial_main_fd, @@ -2457,17 +2467,15 @@ impl DeviceManager { | ConsoleOutput::Null | ConsoleOutput::Pty(_) | ConsoleOutput::Socket(_) => None, + ConsoleOutput::Tcp(_, _) => None, }; if let Some(writer) = debug_console_writer { let _ = self.add_debug_console_device(writer)?; } } - let console_resizer = self.add_virtio_console_device( - virtio_devices, - console_info.console_main_fd, - console_resize_pipe, - )?; + let console_resizer = + self.add_virtio_console_device(console_info.console_main_fd, console_resize_pipe)?; Ok(Arc::new(Console { console_resizer })) } @@ -2525,35 +2533,33 @@ impl DeviceManager { Ok(()) } - fn make_virtio_devices(&mut self) -> DeviceManagerResult> { - let mut devices: Vec = Vec::new(); - + fn make_virtio_devices(&mut self) -> DeviceManagerResult<()> { // Create "standard" virtio devices (net/block/rng) - devices.append(&mut self.make_virtio_block_devices()?); - devices.append(&mut self.make_virtio_net_devices()?); - devices.append(&mut self.make_virtio_rng_devices()?); + self.make_virtio_block_devices()?; + self.make_virtio_net_devices()?; + self.make_virtio_rng_devices()?; // Add virtio-fs if required - devices.append(&mut self.make_virtio_fs_devices()?); + self.make_virtio_fs_devices()?; // Add virtio-pmem if required - devices.append(&mut self.make_virtio_pmem_devices()?); + self.make_virtio_pmem_devices()?; // Add virtio-vsock if required - devices.append(&mut self.make_virtio_vsock_devices()?); + self.make_virtio_vsock_devices()?; - devices.append(&mut self.make_virtio_mem_devices()?); + self.make_virtio_mem_devices()?; // Add virtio-balloon if required - devices.append(&mut self.make_virtio_balloon_devices()?); + self.make_virtio_balloon_devices()?; // Add virtio-watchdog device - devices.append(&mut self.make_virtio_watchdog_devices()?); + self.make_virtio_watchdog_devices()?; // Add vDPA devices if required - devices.append(&mut self.make_vdpa_devices()?); + self.make_vdpa_devices()?; - Ok(devices) + Ok(()) } // Cache whether aio is supported to avoid checking for very block device @@ -2821,21 +2827,25 @@ impl DeviceManager { id, pci_segment: disk_cfg.pci_segment, dma_handler: None, + bdf_device: disk_cfg.bdf_device, }) } - fn make_virtio_block_devices(&mut self) -> DeviceManagerResult> { - let mut devices = Vec::new(); - + fn make_virtio_block_devices(&mut self) -> DeviceManagerResult<()> { let mut block_devices = self.config.lock().unwrap().disks.clone(); if let Some(disk_list_cfg) = &mut block_devices { for disk_cfg in disk_list_cfg.iter_mut() { - 
devices.push(self.make_virtio_block_device(disk_cfg, false)?); + let device = self.make_virtio_block_device(disk_cfg, false)?; + if disk_cfg.bdf_device.is_some() { + self.virtio_devices.push_front(device); + } else { + self.virtio_devices.push_back(device); + } } } self.config.lock().unwrap().disks = block_devices; - Ok(devices) + Ok(()) } fn make_virtio_net_device( @@ -2853,6 +2863,7 @@ impl DeviceManager { let (virtio_device, migratable_device) = if net_cfg.vhost_user { let socket = net_cfg.vhost_socket.as_ref().unwrap().clone(); + debug!("Creating virtio-net device with vhost-user backend: {socket}"); let vu_cfg = VhostUserConfig { socket, num_queues: net_cfg.num_queues, @@ -2895,6 +2906,7 @@ impl DeviceManager { let state = state_from_id(self.snapshot.as_ref(), id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?; let virtio_net = if let Some(ref tap_if_name) = net_cfg.tap { + debug!("Creating virtio-net device from Tap device: {tap_if_name}"); Arc::new(Mutex::new( virtio_devices::Net::new( id.clone(), @@ -2920,6 +2932,7 @@ impl DeviceManager { .map_err(DeviceManagerError::CreateVirtioNet)?, )) } else if let Some(fds) = &net_cfg.fds { + debug!("Creating virtio-net device from network FDs: {fds:?}"); let net = virtio_devices::Net::from_tap_fds( id.clone(), fds, @@ -2946,6 +2959,9 @@ impl DeviceManager { Arc::new(Mutex::new(net)) } else { + debug!( + "Creating virtio-net device: no ifname or FDs given, creating new Tap device" + ); Arc::new(Mutex::new( virtio_devices::Net::new( id.clone(), @@ -2992,26 +3008,29 @@ impl DeviceManager { id, pci_segment: net_cfg.pci_segment, dma_handler: None, + bdf_device: net_cfg.bdf_device, }) } /// Add virto-net and vhost-user-net devices - fn make_virtio_net_devices(&mut self) -> DeviceManagerResult> { - let mut devices = Vec::new(); + fn make_virtio_net_devices(&mut self) -> DeviceManagerResult<()> { let mut net_devices = self.config.lock().unwrap().net.clone(); if let Some(net_list_cfg) = &mut net_devices { for net_cfg in net_list_cfg.iter_mut() { - devices.push(self.make_virtio_net_device(net_cfg)?); + let device = self.make_virtio_net_device(net_cfg)?; + if net_cfg.bdf_device.is_some() { + self.virtio_devices.push_front(device); + } else { + self.virtio_devices.push_back(device); + } } } self.config.lock().unwrap().net = net_devices; - Ok(devices) + Ok(()) } - fn make_virtio_rng_devices(&mut self) -> DeviceManagerResult> { - let mut devices = Vec::new(); - + fn make_virtio_rng_devices(&mut self) -> DeviceManagerResult<()> { // Add virtio-rng if required let rng_config = self.config.lock().unwrap().rng.clone(); if let Some(rng_path) = rng_config.src.to_str() { @@ -3032,14 +3051,20 @@ impl DeviceManager { ) .map_err(DeviceManagerError::CreateVirtioRng)?, )); - devices.push(MetaVirtioDevice { + let device = MetaVirtioDevice { virtio_device: Arc::clone(&virtio_rng_device) as Arc>, iommu: rng_config.iommu, id: id.clone(), pci_segment: 0, dma_handler: None, - }); + bdf_device: rng_config.bdf_device, + }; + if rng_config.bdf_device.is_some() { + self.virtio_devices.push_front(device); + } else { + self.virtio_devices.push_back(device); + } // Fill the device tree with a new node. 
In case of restore, we // know there is nothing to do, so we can simply override the @@ -3050,7 +3075,7 @@ impl DeviceManager { .insert(id.clone(), device_node!(id, virtio_rng_device)); } - Ok(devices) + Ok(()) } fn make_virtio_fs_device( @@ -3100,24 +3125,28 @@ impl DeviceManager { id, pci_segment: fs_cfg.pci_segment, dma_handler: None, + bdf_device: fs_cfg.bdf_device, }) } else { Err(DeviceManagerError::NoVirtioFsSock) } } - fn make_virtio_fs_devices(&mut self) -> DeviceManagerResult> { - let mut devices = Vec::new(); - + fn make_virtio_fs_devices(&mut self) -> DeviceManagerResult<()> { let mut fs_devices = self.config.lock().unwrap().fs.clone(); if let Some(fs_list_cfg) = &mut fs_devices { for fs_cfg in fs_list_cfg.iter_mut() { - devices.push(self.make_virtio_fs_device(fs_cfg)?); + let device = self.make_virtio_fs_device(fs_cfg)?; + if fs_cfg.bdf_device.is_some() { + self.virtio_devices.push_front(device); + } else { + self.virtio_devices.push_back(device); + } } } self.config.lock().unwrap().fs = fs_devices; - Ok(devices) + Ok(()) } fn make_virtio_pmem_device( @@ -3289,21 +3318,26 @@ impl DeviceManager { id, pci_segment: pmem_cfg.pci_segment, dma_handler: None, + bdf_device: pmem_cfg.bdf_device, }) } - fn make_virtio_pmem_devices(&mut self) -> DeviceManagerResult> { - let mut devices = Vec::new(); + fn make_virtio_pmem_devices(&mut self) -> DeviceManagerResult<()> { // Add virtio-pmem if required let mut pmem_devices = self.config.lock().unwrap().pmem.clone(); if let Some(pmem_list_cfg) = &mut pmem_devices { for pmem_cfg in pmem_list_cfg.iter_mut() { - devices.push(self.make_virtio_pmem_device(pmem_cfg)?); + let device = self.make_virtio_pmem_device(pmem_cfg)?; + if pmem_cfg.bdf_device.is_some() { + self.virtio_devices.push_front(device); + } else { + self.virtio_devices.push_back(device); + } } } self.config.lock().unwrap().pmem = pmem_devices; - Ok(devices) + Ok(()) } fn make_virtio_vsock_device( @@ -3360,24 +3394,26 @@ impl DeviceManager { id, pci_segment: vsock_cfg.pci_segment, dma_handler: None, + bdf_device: vsock_cfg.bdf_device, }) } - fn make_virtio_vsock_devices(&mut self) -> DeviceManagerResult> { - let mut devices = Vec::new(); - + fn make_virtio_vsock_devices(&mut self) -> DeviceManagerResult<()> { let mut vsock = self.config.lock().unwrap().vsock.clone(); if let Some(vsock_cfg) = &mut vsock { - devices.push(self.make_virtio_vsock_device(vsock_cfg)?); + let device = self.make_virtio_vsock_device(vsock_cfg)?; + if vsock_cfg.bdf_device.is_some() { + self.virtio_devices.push_front(device); + } else { + self.virtio_devices.push_back(device); + } } self.config.lock().unwrap().vsock = vsock; - Ok(devices) + Ok(()) } - fn make_virtio_mem_devices(&mut self) -> DeviceManagerResult> { - let mut devices = Vec::new(); - + fn make_virtio_mem_devices(&mut self) -> DeviceManagerResult<()> { let mm = self.memory_manager.clone(); let mut mm = mm.lock().unwrap(); for (memory_zone_id, memory_zone) in mm.memory_zones_mut().iter_mut() { @@ -3412,13 +3448,14 @@ impl DeviceManager { self.virtio_mem_devices.push(Arc::clone(&virtio_mem_device)); - devices.push(MetaVirtioDevice { + self.virtio_devices.push_back(MetaVirtioDevice { virtio_device: Arc::clone(&virtio_mem_device) as Arc>, iommu: false, id: memory_zone_id.clone(), pci_segment: 0, dma_handler: None, + bdf_device: None, }); // Fill the device tree with a new node. 
In case of restore, we @@ -3431,7 +3468,7 @@ impl DeviceManager { } } - Ok(devices) + Ok(()) } #[cfg(feature = "pvmemcontrol")] @@ -3445,7 +3482,7 @@ impl DeviceManager { let pci_segment_id = 0x0_u16; let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&id, pci_segment_id)?; + self.pci_resources(&id, pci_segment_id, None)?; info!("Creating pvmemcontrol device: id = {}", id); let (pvmemcontrol_pci_device, pvmemcontrol_bus_device) = @@ -3476,9 +3513,7 @@ impl DeviceManager { Ok((pvmemcontrol_bus_device, pvmemcontrol_pci_device)) } - fn make_virtio_balloon_devices(&mut self) -> DeviceManagerResult> { - let mut devices = Vec::new(); - + fn make_virtio_balloon_devices(&mut self) -> DeviceManagerResult<()> { if let Some(balloon_config) = &self.config.lock().unwrap().balloon { let id = String::from(BALLOON_DEVICE_NAME); info!("Creating virtio-balloon device: id = {id}"); @@ -3501,14 +3536,21 @@ impl DeviceManager { self.balloon = Some(virtio_balloon_device.clone()); - devices.push(MetaVirtioDevice { + let device = MetaVirtioDevice { virtio_device: Arc::clone(&virtio_balloon_device) as Arc>, iommu: false, id: id.clone(), pci_segment: 0, dma_handler: None, - }); + bdf_device: balloon_config.bdf_device, + }; + + if balloon_config.bdf_device.is_some() { + self.virtio_devices.push_front(device); + } else { + self.virtio_devices.push_back(device); + } self.device_tree .lock() @@ -3516,14 +3558,12 @@ impl DeviceManager { .insert(id.clone(), device_node!(id, virtio_balloon_device)); } - Ok(devices) + Ok(()) } - fn make_virtio_watchdog_devices(&mut self) -> DeviceManagerResult> { - let mut devices = Vec::new(); - + fn make_virtio_watchdog_devices(&mut self) -> DeviceManagerResult<()> { if !self.config.lock().unwrap().watchdog { - return Ok(devices); + return Ok(()); } let id = String::from(WATCHDOG_DEVICE_NAME); @@ -3542,13 +3582,14 @@ impl DeviceManager { ) .map_err(DeviceManagerError::CreateVirtioWatchdog)?, )); - devices.push(MetaVirtioDevice { + self.virtio_devices.push_back(MetaVirtioDevice { virtio_device: Arc::clone(&virtio_watchdog_device) as Arc>, iommu: false, id: id.clone(), pci_segment: 0, dma_handler: None, + bdf_device: None, }); self.device_tree @@ -3556,7 +3597,7 @@ impl DeviceManager { .unwrap() .insert(id.clone(), device_node!(id, virtio_watchdog_device)); - Ok(devices) + Ok(()) } fn make_vdpa_device( @@ -3607,21 +3648,26 @@ impl DeviceManager { id, pci_segment: vdpa_cfg.pci_segment, dma_handler: Some(vdpa_mapping), + bdf_device: vdpa_cfg.bdf_device, }) } - fn make_vdpa_devices(&mut self) -> DeviceManagerResult> { - let mut devices = Vec::new(); + fn make_vdpa_devices(&mut self) -> DeviceManagerResult<()> { // Add vdpa if required let mut vdpa_devices = self.config.lock().unwrap().vdpa.clone(); if let Some(vdpa_list_cfg) = &mut vdpa_devices { for vdpa_cfg in vdpa_list_cfg.iter_mut() { - devices.push(self.make_vdpa_device(vdpa_cfg)?); + let device = self.make_vdpa_device(vdpa_cfg)?; + if vdpa_cfg.bdf_device.is_some() { + self.virtio_devices.push_front(device); + } else { + self.virtio_devices.push_back(device); + } } } self.config.lock().unwrap().vdpa = vdpa_devices; - Ok(devices) + Ok(()) } fn next_device_name(&mut self, prefix: &str) -> DeviceManagerResult { @@ -3693,7 +3739,7 @@ impl DeviceManager { }; let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&vfio_name, device_cfg.pci_segment)?; + self.pci_resources(&vfio_name, device_cfg.pci_segment, None)?; let mut needs_dma_mapping = false; @@ -3930,7 +3976,7 @@ impl DeviceManager { }; let 
(pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&vfio_user_name, device_cfg.pci_segment)?; + self.pci_resources(&vfio_user_name, device_cfg.pci_segment, None)?; let legacy_interrupt_group = if let Some(legacy_interrupt_manager) = &self.legacy_interrupt_manager { @@ -4042,6 +4088,7 @@ impl DeviceManager { virtio_device_id: String, pci_segment_id: u16, dma_handler: Option>, + bdf_device: Option, ) -> DeviceManagerResult { let id = format!("{VIRTIO_PCI_DEVICE_NAME_PREFIX}-{virtio_device_id}"); @@ -4050,7 +4097,7 @@ impl DeviceManager { node.children = vec![virtio_device_id.clone()]; let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&id, pci_segment_id)?; + self.pci_resources(&id, pci_segment_id, bdf_device)?; // Update the existing virtio node by setting the parent. if let Some(node) = self.device_tree.lock().unwrap().get_mut(&virtio_device_id) { @@ -4187,7 +4234,7 @@ impl DeviceManager { info!("Creating pvpanic device {id}"); let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&id, pci_segment_id)?; + self.pci_resources(&id, pci_segment_id, None)?; let snapshot = snapshot_from_id(self.snapshot.as_ref(), id.as_str()); @@ -4225,7 +4272,7 @@ impl DeviceManager { info!("Creating ivshmem device {}", id); let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&id, pci_segment_id)?; + self.pci_resources(&id, pci_segment_id, None)?; let snapshot = snapshot_from_id(self.snapshot.as_ref(), id.as_str()); let ivshmem_ops = Arc::new(Mutex::new(IvshmemHandler { @@ -4270,6 +4317,7 @@ impl DeviceManager { &self, id: &str, pci_segment_id: u16, + pci_device_id: Option, ) -> DeviceManagerResult<(u16, PciBdf, Option>)> { // Look for the id in the device tree. If it can be found, that means // the device is being restored, otherwise it's created from scratch. @@ -4296,7 +4344,8 @@ impl DeviceManager { (pci_segment_id, pci_device_bdf, resources) } else { - let pci_device_bdf = self.pci_segments[pci_segment_id as usize].next_device_bdf()?; + let pci_device_bdf = + self.pci_segments[pci_segment_id as usize].allocate_device_bdf(pci_device_id)?; (pci_segment_id, pci_device_bdf, None) }) @@ -4396,6 +4445,10 @@ impl DeviceManager { Ok(()) } + /// Notifies the VM for a hotplug. + /// + /// This call doesn't wait for the vCPU receiving the + /// interrupt to acknowledge. pub fn notify_hotplug( &self, _notification_type: AcpiNotificationFlags, @@ -4764,7 +4817,7 @@ impl DeviceManager { // Add the virtio device to the device manager list. This is important // as the list is used to notify virtio devices about memory updates // for instance. 
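Throughout this patch the device list is a VecDeque: at boot, devices carrying an explicit addr= are pushed to the front, presumably so their fixed PCI slots are claimed before automatic BDF allocation runs, while devices without one, and hot-added devices such as the one below, are appended to the back. A toy illustration of that ordering rule (not part of the patch; device names invented):

    use std::collections::VecDeque;

    let mut order: VecDeque<(&str, Option<u8>)> = VecDeque::new();
    order.push_back(("balloon", None));      // no addr= -> automatic placement
    order.push_front(("disk0", Some(0x15))); // addr=15.0 -> claims slot 0x15 first
    order.push_back(("net0", None));         // hot-added -> back of the queue
    assert_eq!(order.front().unwrap().0, "disk0");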
- self.virtio_devices.push(handle.clone()); + self.virtio_devices.push_back(handle.clone()); let mapping: Option> = if handle.iommu { self.iommu_mapping.clone() @@ -4778,6 +4831,7 @@ impl DeviceManager { handle.id.clone(), handle.pci_segment, handle.dma_handler, + handle.bdf_device, )?; // Update the PCIU bitmap @@ -4898,6 +4952,18 @@ impl DeviceManager { 0 } + pub fn resize_disk(&mut self, device_id: &str, new_size: u64) -> DeviceManagerResult<()> { + for dev in &self.block_devices { + let mut disk = dev.lock().unwrap(); + if disk.id() == device_id { + return disk + .resize(new_size) + .map_err(DeviceManagerError::DiskResizeError); + } + } + Err(DeviceManagerError::UnknownDeviceId(device_id.to_string())) + } + pub fn device_tree(&self) -> Arc> { self.device_tree.clone() } diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 9137ba1805..cf00f1654d 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -8,25 +8,38 @@ extern crate event_monitor; #[macro_use] extern crate log; +/// Amount of iterations before auto-converging starts. +const AUTO_CONVERGE_ITERATION_DELAY: u64 = 2; +/// Step size in percent to increase the vCPU throttling. +const AUTO_CONVERGE_STEP_SIZE: u8 = 10; +/// Amount of iterations after that we increase vCPU throttling. +const AUTO_CONVERGE_ITERATION_INCREASE: u64 = 2; +/// Maximum vCPU throttling value. +const AUTO_CONVERGE_MAX: u8 = 99; + use std::collections::HashMap; use std::fs::File; -use std::io::{Read, Write, stdout}; +use std::io::{ErrorKind, Read, Write, stdout}; use std::net::{TcpListener, TcpStream}; +use std::os::fd::{AsFd, BorrowedFd}; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; use std::os::unix::net::{UnixListener, UnixStream}; use std::panic::AssertUnwindSafe; use std::path::PathBuf; use std::rc::Rc; -use std::sync::mpsc::{Receiver, RecvError, SendError, Sender}; -use std::sync::{Arc, Mutex}; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::mpsc::{Receiver, RecvError, SendError, Sender, TrySendError}; +use std::sync::{Arc, Barrier, Mutex}; +use std::thread::JoinHandle; #[cfg(not(target_arch = "riscv64"))] -use std::time::Instant; -use std::{io, result, thread}; +use std::time::{Duration, Instant}; +use std::{io, mem, result, thread}; use anyhow::anyhow; #[cfg(feature = "dbus_api")] use api::dbus::{DBusApiOptions, DBusApiShutdownChannels}; use api::http::HttpApiHandle; +use arch::PAGE_SIZE; #[cfg(all(feature = "kvm", target_arch = "x86_64"))] use arch::x86_64::MAX_SUPPORTED_CPUS_LEGACY; use console_devices::{ConsoleInfo, pre_create_console_devices}; @@ -41,13 +54,20 @@ use signal_hook::iterator::{Handle, Signals}; use thiserror::Error; use tracer::trace_scoped; use vm_memory::bitmap::{AtomicBitmap, BitmapSlice}; -use vm_memory::{ReadVolatile, VolatileMemoryError, VolatileSlice, WriteVolatile}; +use vm_memory::{ + GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, ReadVolatile, + VolatileMemoryError, VolatileSlice, WriteVolatile, +}; use vm_migration::protocol::*; -use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; +use vm_migration::tls::{TlsConnectionWrapper, TlsStream, TlsStreamWrapper}; +use vm_migration::{ + Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, tls, +}; use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::signal::unblock_signal; use vmm_sys_util::sock_ctrl_msg::ScmSocket; +use crate::api::http::http_endpoint::ONGOING_LIVEMIGRATION; use crate::api::{ ApiRequest, ApiResponse, RequestHandler, VmInfoResponse, VmReceiveMigrationData, 
VmSendMigrationData, VmmPingResponse, @@ -55,6 +75,8 @@ use crate::api::{ use crate::config::{RestoreConfig, add_to_config}; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::coredump::GuestDebuggable; +#[cfg(feature = "kvm")] +use crate::cpu::IS_IN_SHUTDOWN; use crate::landlock::Landlock; use crate::memory_manager::MemoryManager; #[cfg(all(feature = "kvm", target_arch = "x86_64"))] @@ -89,6 +111,7 @@ mod pci_segment; pub mod seccomp_filters; mod serial_manager; mod sigwinch_listener; +mod vcpu_throttling; pub mod vm; pub mod vm_config; @@ -224,6 +247,7 @@ pub enum EpollDispatch { Api = 2, ActivateVirtioDevices = 3, Debug = 4, + CheckMigration = 5, Unknown, } @@ -236,6 +260,7 @@ impl From for EpollDispatch { 2 => Api, 3 => ActivateVirtioDevices, 4 => Debug, + 5 => CheckMigration, _ => Unknown, } } @@ -244,6 +269,7 @@ impl From for EpollDispatch { enum SocketStream { Unix(UnixStream), Tcp(TcpStream), + Tls(Box), } impl Read for SocketStream { @@ -251,6 +277,7 @@ impl Read for SocketStream { match self { SocketStream::Unix(stream) => stream.read(buf), SocketStream::Tcp(stream) => stream.read(buf), + SocketStream::Tls(stream) => stream.read(buf), } } } @@ -260,6 +287,7 @@ impl Write for SocketStream { match self { SocketStream::Unix(stream) => stream.write(buf), SocketStream::Tcp(stream) => stream.write(buf), + SocketStream::Tls(stream) => stream.write(buf), } } @@ -267,15 +295,17 @@ impl Write for SocketStream { match self { SocketStream::Unix(stream) => stream.flush(), SocketStream::Tcp(stream) => stream.flush(), + SocketStream::Tls(stream) => stream.flush(), } } } -impl AsRawFd for SocketStream { - fn as_raw_fd(&self) -> RawFd { +impl AsFd for SocketStream { + fn as_fd(&self) -> BorrowedFd<'_> { match self { - SocketStream::Unix(s) => s.as_raw_fd(), - SocketStream::Tcp(s) => s.as_raw_fd(), + SocketStream::Unix(s) => s.as_fd(), + SocketStream::Tcp(s) => s.as_fd(), + SocketStream::Tls(s) => s.as_fd(), } } } @@ -288,6 +318,7 @@ impl ReadVolatile for SocketStream { match self { SocketStream::Unix(s) => s.read_volatile(buf), SocketStream::Tcp(s) => s.read_volatile(buf), + SocketStream::Tls(s) => s.read_volatile(buf), } } @@ -298,6 +329,7 @@ impl ReadVolatile for SocketStream { match self { SocketStream::Unix(s) => s.read_exact_volatile(buf), SocketStream::Tcp(s) => s.read_exact_volatile(buf), + SocketStream::Tls(s) => s.read_exact_volatile(buf), } } } @@ -310,6 +342,7 @@ impl WriteVolatile for SocketStream { match self { SocketStream::Unix(s) => s.write_volatile(buf), SocketStream::Tcp(s) => s.write_volatile(buf), + SocketStream::Tls(s) => s.write_volatile(buf), } } @@ -320,6 +353,7 @@ impl WriteVolatile for SocketStream { match self { SocketStream::Unix(s) => s.write_all_volatile(buf), SocketStream::Tcp(s) => s.write_all_volatile(buf), + SocketStream::Tls(s) => s.write_all_volatile(buf), } } } @@ -645,6 +679,101 @@ impl VmmVersionInfo { } } +#[derive(Debug, Clone)] +struct MigrationState { + current_dirty_pages: u64, + downtime: Duration, + downtime_start: Instant, + iteration: u64, + iteration_cost_time: Duration, + iteration_start_time: Instant, + mb_per_sec: f64, + pages_per_second: u64, + pending_size: u64, + start_time: Instant, + threshold_size: u64, + total_time: Duration, + total_transferred_bytes: u64, + total_transferred_dirty_pages: u64, +} + +impl MigrationState { + pub fn new() -> Self { + Self { + current_dirty_pages: 0, + downtime: Duration::default(), + downtime_start: Instant::now(), + iteration: 0, + iteration_cost_time: Duration::default(), + 
iteration_start_time: Instant::now(), + mb_per_sec: 0.0, + pages_per_second: 0, + pending_size: 0, + start_time: Instant::now(), + threshold_size: 0, + total_time: Duration::default(), + total_transferred_bytes: 0, + total_transferred_dirty_pages: 0, + } + } +} + +/// Abstraction for the thread controlling and performing the live migration. +/// +/// The migration thread also takes ownership of the [`Vm`] from the [`Vmm`]. +struct MigrationWorker { + vm: Vm, + check_migration_evt: EventFd, + config: VmSendMigrationData, + #[cfg(all(feature = "kvm", target_arch = "x86_64"))] + hypervisor: Arc, +} + +impl MigrationWorker { + /// Performs any final cleanup after failed live migrations. + /// + /// Helper for [`Self::migrate`]. + fn migrate_error_cleanup(&mut self) -> result::Result<(), MigratableError> { + // Stop logging dirty pages only for non-local migrations + if !self.config.local { + self.vm.stop_dirty_log()?; + } + + Ok(()) + } + + /// Migrate and cleanup. + fn migrate(&mut self) -> result::Result<(), MigratableError> { + debug!("start sending migration"); + Vmm::send_migration( + &mut self.vm, + #[cfg(all(feature = "kvm", target_arch = "x86_64"))] + self.hypervisor.clone(), + self.config.clone(), + ).inspect_err(|_| { + let e = self.migrate_error_cleanup(); + if let Err(e) = e { + error!("Failed to clean up after a failed live migration. VM might keep running but in an odd or possibly slowed-down state: {e}"); + } + })?; + + Ok(()) + } + + /// Perform the migration and communicate with the [`Vmm`] thread. + fn run(mut self) -> (Vm, result::Result<(), MigratableError>) { + debug!("migration thread is starting"); + + let res = self.migrate().inspect_err(|e| error!("migrate error: {e}")); + + // Notify VMM thread to get migration result by joining this thread. + self.check_migration_evt.write(1).unwrap(); + + debug!("migration thread is finished"); + (self.vm, res) + } +} + pub struct VmmThreadHandle { pub thread_handle: thread::JoinHandle>, #[cfg(feature = "dbus_api")] @@ -652,6 +781,41 @@ pub struct VmmThreadHandle { pub http_api_handle: Option, } +/// Describes the current ownership of a running VM. +#[allow(clippy::large_enum_variant)] +pub enum MaybeVmOwnership { + /// The VMM holds the ownership of the VM. + Vmm(Vm), + /// The VM is temporarily blocked by the current ongoing migration. + Migration, + /// No VM is running. + None, +} + +impl MaybeVmOwnership { + /// Takes the VM and replaces it with [`Self::Migration`]. + /// + /// # Panics + /// This method panics if `self` is not [`Self::Vmm`]. + fn take_vm_for_migration(&mut self) -> Vm { + if !matches!(self, Self::Vmm(_)) { + panic!("should only be called when a migration can start"); + } + + match mem::replace(self, Self::Migration) { + MaybeVmOwnership::Vmm(vm) => vm, + _ => unreachable!(), + } + } + + fn vm_mut(&mut self) -> Option<&mut Vm> { + match self { + MaybeVmOwnership::Vmm(vm) => Some(vm), + _ => None, + } + } +} + pub struct Vmm { epoll: EpollContext, exit_evt: EventFd, @@ -662,7 +826,7 @@ pub struct Vmm { #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, version: VmmVersionInfo, - vm: Option, + vm: MaybeVmOwnership, vm_config: Option>>, seccomp_action: SeccompAction, hypervisor: Arc, @@ -672,6 +836,706 @@ pub struct Vmm { original_termios_opt: Arc>>, console_resize_pipe: Option>, console_info: Option, + check_migration_evt: EventFd, + /// Handle to the [`MigrationWorker`] thread. + /// + /// The handle will return the [`Vm`] back in any case. Further, the underlying error (if any) is returned. 
+ migration_thread_handle: Option)>>, +} + +/// Wait for a file descriptor to become readable. In this case, we return +/// true. In case, the eventfd was signaled, return false. +fn wait_for_readable( + fd: &impl AsFd, + eventfd: &EventFd, +) -> std::result::Result { + let fd_event = eventfd.as_raw_fd().as_raw_fd(); + let fd_io = fd.as_fd().as_raw_fd(); + let mut poll_fds = [ + libc::pollfd { + fd: fd_event, + events: libc::POLLIN, + revents: 0, + }, + libc::pollfd { + fd: fd_io, + events: libc::POLLIN, + revents: 0, + }, + ]; + + // SAFETY: This is safe, because the file descriptors are valid and the + // poll_fds array is properly initialized. + let ret = unsafe { libc::poll(poll_fds.as_mut_ptr(), poll_fds.len() as libc::nfds_t, -1) }; + + if ret < 0 { + return Err(std::io::Error::last_os_error()); + } + + if poll_fds[0].revents & libc::POLLIN != 0 { + return Ok(false); + } + if poll_fds[1].revents & libc::POLLIN != 0 { + return Ok(true); + } + + panic!("Poll returned, but neither file descriptor is readable?"); +} + +/// Abstract over the different types of listeners that can be used to receive connections. +#[derive(Debug)] +enum ReceiveListener { + Tcp(TcpListener), + Unix(UnixListener, Option), + Tls(TcpListener, TlsConnectionWrapper), +} + +impl AsFd for ReceiveListener { + fn as_fd(&self) -> BorrowedFd<'_> { + match self { + ReceiveListener::Tcp(listener) => listener.as_fd(), + ReceiveListener::Unix(listener, _) => listener.as_fd(), + ReceiveListener::Tls(listener, _) => listener.as_fd(), + } + } +} + +impl ReceiveListener { + /// Block until a connection is accepted. + fn accept(&mut self) -> std::result::Result { + match self { + ReceiveListener::Tcp(listener) => listener + .accept() + .map(|(socket, _)| SocketStream::Tcp(socket)), + ReceiveListener::Unix(listener, opt_path) => { + let socket = listener + .accept() + .map(|(socket, _)| SocketStream::Unix(socket))?; + + // Remove the UNIX socket file after accepting the connection. Is this actually safe? If a user + // moves the file and creates a new one with the same name, we will delete the wrong file. + // Sounds like a confused deputy to me. + // + // TODO Don't do this? + if let Some(path) = opt_path.take() { + std::fs::remove_file(&path)?; + } + + Ok(socket) + } + ReceiveListener::Tls(listener, conn) => listener.accept().map(|(socket, _)| { + conn.wrap(socket) + .map(|s| SocketStream::Tls(Box::new(s))) + .map_err(std::io::Error::other) + })?, + } + } + + /// Same as accept(), but returns None if the eventfd is signaled. + fn abortable_accept( + &mut self, + eventfd: &EventFd, + ) -> std::result::Result, std::io::Error> { + wait_for_readable(&self, eventfd)? + .then(|| self.accept()) + .transpose() + } + + fn try_clone(&self) -> std::result::Result { + match self { + ReceiveListener::Tcp(listener) => listener.try_clone().map(ReceiveListener::Tcp), + ReceiveListener::Unix(listener, opt_path) => listener + .try_clone() + .map(|listener| ReceiveListener::Unix(listener, opt_path.clone())), + ReceiveListener::Tls(listener, conn) => listener + .try_clone() + .map(|listener| ReceiveListener::Tls(listener, conn.clone())), + } + } +} + +/// Handles a `Memory` request by writing its payload to the VM memory. 
+fn vm_receive_memory( + req: &Request, + socket: &mut T, + guest_mem: &GuestMemoryAtomic, +) -> std::result::Result<(), MigratableError> +where + T: Read + ReadVolatile, +{ + assert_eq!(req.command(), Command::Memory); + + // Read table + let ranges = MemoryRangeTable::read_from(socket, req.length())?; + let mem = guest_mem.memory(); + + for range in ranges.regions() { + let mut offset: u64 = 0; + // Here we are manually handling the retry in case we can't the + // whole region at once because we can't use the implementation + // from vm-memory::GuestMemory of read_exact_from() as it is not + // following the correct behavior. For more info about this issue + // see: https://github.com/rust-vmm/vm-memory/issues/174 + loop { + let bytes_read = mem + .read_volatile_from( + GuestAddress(range.gpa + offset), + socket, + (range.length - offset) as usize, + ) + .map_err(|e| { + MigratableError::MigrateReceive(anyhow!( + "Error receiving memory from socket: {}", + e + )) + })?; + offset += bytes_read as u64; + + if offset == range.length { + break; + } + } + } + + Ok(()) +} + +/// We keep track of additional connections for receiving VM migration data +/// here. +struct ReceiveAdditionalConnections { + terminate_fd: EventFd, + + // This is only an option to be able to join it in the destructor. + accept_thread: Option>, +} + +impl ReceiveAdditionalConnections { + /// Create a pair of file descriptors that map to the same underlying event_fd. + fn event_fd_pair() -> std::result::Result<(EventFd, EventFd), std::io::Error> { + let event_fd = EventFd::new(0)?; + Ok((event_fd.try_clone()?, event_fd)) + } + + /// Handle incoming requests. + /// + /// For now we only handle `Command::Memory` requests here. Everything else + /// needs to come via the main connection. This function returns when the + /// abort_event_fd is triggered or the connection is closed or encountered + /// an error. + fn handle_requests( + socket: &mut SocketStream, + abort_event_fd: &EventFd, + guest_memory: &GuestMemoryAtomic, + ) -> std::result::Result<(), MigratableError> { + loop { + if !wait_for_readable(socket, abort_event_fd).map_err(|e| { + MigratableError::MigrateReceive(anyhow!("Failed to poll descriptors: {e}")) + })? { + info!("Got signal to tear down connection."); + return Ok(()); + } + + // TODO We only check whether we should abort when waiting for a new + // request. If the sender just stops sending data mid-request, we + // should still be abortable, but we are not... In this case, we + // will hang forever. But given that the sender is also in charge of + // driving the migration to completion, this is not a major concern. + // In the long run, it would be preferable to move I/O to + // asynchronous tasks to be able to handle aborts more gracefully. + + let req = match Request::read_from(socket) { + Ok(req) => req, + Err(MigratableError::MigrateSocket(io_error)) + if io_error.kind() == ErrorKind::UnexpectedEof => + { + debug!("Connection closed by peer"); + return Ok(()); + } + Err(e) => return Err(e), + }; + + if req.command() != Command::Memory { + return Err(MigratableError::MigrateReceive(anyhow!( + "Dropping connection. Only Memory commands are allowed on additional connections, but got {:?}", + req.command() + ))); + } + + vm_receive_memory(&req, socket, guest_memory)?; + Response::ok().write_to(socket)?; + } + } + + /// Starts a thread to accept incoming connections and handle them. These + /// additional connections are used to receive additional memory regions + /// during VM migration. 
+ fn new( + listener: ReceiveListener, + guest_memory: GuestMemoryAtomic, + ) -> std::result::Result { + let (terminate_fd1, terminate_fd2) = Self::event_fd_pair()?; + + let accept_thread = std::thread::spawn(move || { + let terminate_fd = terminate_fd2; + let mut listener = listener; + let mut threads: Vec> = Vec::new(); + while let Ok(Some(mut socket)) = listener.abortable_accept(&terminate_fd) { + let guest_memory = guest_memory.clone(); + let terminate_fd = terminate_fd.try_clone().unwrap(); + + // We handle errors locally and log them. Passing them along is + // painful with little value. + threads.push(std::thread::spawn(move || { + if let Err(e) = Self::handle_requests(&mut socket, &terminate_fd, &guest_memory) + { + error!( + "Failed to read more requests on additional receive connection: {}", + e + ); + } + })); + } + + info!("Stopped accepting additional connections. Cleaning up threads."); + threads.into_iter().for_each(|thread| { + thread.join().unwrap(); + }); + }); + + Ok(Self { + accept_thread: Some(accept_thread), + terminate_fd: terminate_fd1, + }) + } + + /// Stop accepting additional connections and tear down all connections. + /// + /// This function does not wait for the operation to complete. + fn signal_termination(&self) { + // It's not really worth propagating this error, because it only happens if + // something hit the fan and we can't really do anything about it. + if let Err(e) = self.terminate_fd.write(1) { + error!("Failed to wake up other threads: {}", e); + } + } +} + +impl Drop for ReceiveAdditionalConnections { + fn drop(&mut self) { + self.signal_termination(); + // This unwrap is safe, because we never write a None into + // self.accept_thread in other places. + let _accept_thread = self.accept_thread.take().unwrap(); + + // TODO The accept thread tries to join all threads it started, but we + // haven't implemented tearing them down yet. + // accept_thread.join().unwrap(); + } +} + +/// The receiver's state machine behind the migration protocol. +enum ReceiveMigrationState { + /// The connection is established and we haven't received any commands yet. + Established, + + /// We received the start command. + Started, + + /// We received file descriptors for memory. This can only happen on UNIX domain sockets. + MemoryFdsReceived(Vec<(u32, File)>), + + /// We received the VM configuration. We keep the memory configuration around to populate guest memory. + /// From this point on, the sender can start sending memory updates. + /// + /// While the memory manager can also be used to populate guest memory, we keep a direct reference to + /// the memory around to populate guest memory without having to acquire a lock. + Configured( + Arc>, + GuestMemoryAtomic, + ReceiveAdditionalConnections, + ), + + /// Memory is populated and we received the state. The VM is ready to go. + StateReceived, + + /// The migration is successful. + Completed, + + /// The migration couldn't complete, either due to an error or because the sender abandoned the migration. + Aborted, +} + +impl ReceiveMigrationState { + fn finished(&self) -> bool { + matches!( + self, + ReceiveMigrationState::Completed | ReceiveMigrationState::Aborted + ) + } +} + +/// The different kinds of messages we can send to memory sending threads. +#[derive(Debug)] +enum SendMemoryThreadMessage { + Memory(Arc), + Barrier(Arc), + Disconnect, +} + +/// This struct keeps track of additional threads we use to send VM memory. 
+struct SendAdditionalConnections { + guest_memory: GuestMemoryAtomic, + threads: Vec>, + channels: Vec>, + // If an error occurs in one of the worker threads, the worker signals this + // using this flag. Only the main thread checks this variable, the other + // workers will be stopped in the destructor. + cancel: Arc, + // The first worker encountering an error will transmit the error using + // this channel. + error_rx: std::sync::mpsc::Receiver, +} + +/// Send memory from the given table. +fn vm_send_memory( + guest_memory: &GuestMemoryAtomic, + socket: &mut SocketStream, + table: &MemoryRangeTable, +) -> result::Result<(), MigratableError> { + if table.regions().is_empty() { + return Ok(()); + } + + Request::memory(table.length()).write_to(socket)?; + table.write_to(socket)?; + // And then the memory itself + send_memory_regions(guest_memory, table, socket)?; + Response::read_from(socket)?.ok_or_abandon( + socket, + MigratableError::MigrateSend(anyhow!("Error during dirty memory migration")), + )?; + + Ok(()) +} + +impl SendAdditionalConnections { + /// How many requests can be waiting to be sent for each connection. This + /// can be set to zero to disable buffering. Whether we need to buffer + /// requests is currently unclear. If this is set too high, some connections + /// might go unused, because work pools up on some connections. + const BUFFERED_REQUESTS_PER_THREAD: usize = 1; + + /// The size of each chunk of memory to send. + /// + /// We want to make this large, because each chunk is acknowledged and we + /// wait for the ack before sending the next chunk. The challenge is that if + /// it is _too_ large, we become more sensitive to network issues, like + /// packet drops in individual connections, because large amounts of data + /// can pool when throughput on one connection is temporarily reduced. + /// + /// We can consider making this configurable, but a better network protocol + /// that doesn't require ACKs would be more efficient. + /// + /// The best-case throughput per connection can be estimated via: + /// effective_throughput = chunk_size / (chunk_size / throughput_per_connection + round_trip_time) + const CHUNK_SIZE: u64 = 64 /* MiB */ << 20; + + fn new( + send_data_migration: &VmSendMigrationData, + guest_mem: &GuestMemoryAtomic, + ) -> std::result::Result { + let mut threads = Vec::new(); + let mut channels = Vec::new(); + let cancel = Arc::new(AtomicBool::new(false)); + let (error_tx, error_rx) = std::sync::mpsc::channel::(); + + let additional_connections = send_data_migration.connections.get() - 1; + for n in 0..(additional_connections) { + let socket = (match send_migration_socket(send_data_migration) { + Err(e) if n == 0 => { + // If we encounter a problem on the first additional + // connection, we just assume the other side doesn't support + // multiple connections and carry on. + info!( + "Couldn't establish additional connections for sending VM memory: {e}, ignoring!" 
+ ); + break; + } + otherwise => otherwise, + })?; + let guest_mem = guest_mem.clone(); + let (send, recv) = std::sync::mpsc::sync_channel::( + Self::BUFFERED_REQUESTS_PER_THREAD, + ); + let cancel = cancel.clone(); + let err_tx = error_tx.clone(); + + let thread = thread::spawn(move || { + info!("Spawned thread to send VM memory."); + + let mut total_sent = 0; + let mut socket = socket; + + for msg in recv { + match msg { + SendMemoryThreadMessage::Memory(table) => { + match vm_send_memory(&guest_mem, &mut socket, &table) { + Ok(()) => { + total_sent += table + .ranges() + .iter() + .map(|range| range.length) + .sum::(); + } + Err(e) => { + // Only the first thread that encounters an + // error sends it to the main thread. + if cancel.swap(true, Ordering::AcqRel) + && let Err(e) = err_tx.send(e) + { + error!("Could not send error to main thread: {e}"); + } + // After that we exit gracefully. Note that + // this also closes our mpsc channel. + break; + } + }; + } + SendMemoryThreadMessage::Barrier(barrier) => { + barrier.wait(); + } + SendMemoryThreadMessage::Disconnect => { + break; + } + } + } + info!("Sent {} MiB via additional connection.", total_sent >> 20); + }); + + threads.push(thread); + channels.push(send); + } + + Ok(Self { + guest_memory: guest_mem.clone(), + threads, + channels, + cancel, + error_rx, + }) + } + + /// Wait until all data that is in-flight has actually been sent and acknowledged. + fn wait_for_pending_data(&self) { + assert_eq!(self.channels.len(), self.threads.len()); + + // TODO We don't actually need the threads to block at the barrier. We + // can probably find a better implementation that involves less + // synchronization. + + let barrier = Arc::new(Barrier::new(self.channels.len() + 1)); + + for channel in &self.channels { + channel + .send(SendMemoryThreadMessage::Barrier(barrier.clone())) + // The unwrap only fails fi + .unwrap(); + } + + barrier.wait(); + } + + /// Send memory via all connections that we have. This may be just one. + /// `socket` is the original socket that was used to connect to the + /// destination. + /// + /// When this function returns, all memory has been sent and acknowledged. + fn send_memory( + &self, + table: &MemoryRangeTable, + socket: &mut SocketStream, + ) -> std::result::Result<(), MigratableError> { + let thread_len = self.threads.len(); + assert_eq!(thread_len, self.channels.len()); + + // In case, we didn't manage to establish additional connections, don't + // bother sending memory in chunks. This would just lower throughput, + // because we wait for a response after each chunk instead of sending + // everything in one go. + if thread_len == 0 { + vm_send_memory(&self.guest_memory, socket, table)?; + return Ok(()); + } + + // The chunk size is chosen to be big enough so that even very fast + // links need some milliseconds to send it. + 'next_partition: for chunk in table.partition(Self::CHUNK_SIZE) { + // If one of the workers encountered an error, we return it. + if self.cancel.load(Ordering::Acquire) { + return Err(self.error_rx.recv().unwrap()); + } + + let chunk = Arc::new(chunk); + + // Find the first free channel and send the chunk via it. + // + // TODO A better implementation wouldn't always start at the + // first thread, but go round-robin. + for channel in &self.channels { + match channel.try_send(SendMemoryThreadMessage::Memory(chunk.clone())) { + Ok(()) => continue 'next_partition, + Err(TrySendError::Full(_)) => { + // Try next channel. 
+ } + Err(TrySendError::Disconnected(_)) => { + return Err(MigratableError::MigrateSend(anyhow!( + "Sending thread died?" + ))); + } + } + } + + // Fallback to sending the chunk via the control connection. + vm_send_memory(&self.guest_memory, socket, &chunk)?; + } + + self.wait_for_pending_data(); + + Ok(()) + } +} + +impl Drop for SendAdditionalConnections { + fn drop(&mut self) { + info!("Sending disconnect message to channels"); + self.channels.drain(..).for_each(|channel| { + // One of the workers may have died and thus closed the channel. + // Thus we cannot simply do send().unwrap(). + let e = channel.send(SendMemoryThreadMessage::Disconnect); + if let Err(e) = e { + error!("Could not send disconnect message to worker thread: {e}"); + } + }); + + info!("Waiting for threads to finish"); + self.threads + .drain(..) + .for_each(|thread| thread.join().unwrap()); + info!("Threads finished"); + } +} + +/// Establishes a connection to a migration destination socket (TCP or UNIX). +fn send_migration_socket( + send_data_migration: &VmSendMigrationData, +) -> std::result::Result { + if let Some(address) = send_data_migration.destination_url.strip_prefix("tcp:") { + info!("Connecting to TCP socket at {}", address); + + let socket = TcpStream::connect(address).map_err(|e| { + MigratableError::MigrateSend(anyhow!("Error connecting to TCP socket: {}", e)) + })?; + + if send_data_migration.tls_dir.is_none() { + Ok(SocketStream::Tcp(socket)) + } else { + info!("Live Migration will be encrypted using TLS."); + // The address may still contain a port. I think we should build something more robust to also handle IPv6. + let tls_stream = tls::client_stream( + socket, + send_data_migration.tls_dir.as_ref().unwrap(), + address + .split_once(':') + .map(|(host, _)| host) + .unwrap_or(address), + )?; + Ok(SocketStream::Tls(Box::new(TlsStreamWrapper::new( + TlsStream::Client(tls_stream), + )))) + } + } else if let Some(path) = &send_data_migration.destination_url.strip_prefix("unix:") { + info!("Connecting to UNIX socket at {:?}", path); + + let socket = UnixStream::connect(path).map_err(|e| { + MigratableError::MigrateSend(anyhow!("Error connecting to UNIX socket: {}", e)) + })?; + + Ok(SocketStream::Unix(socket)) + } else { + Err(MigratableError::MigrateSend(anyhow!( + "Invalid destination: {}", + send_data_migration.destination_url + ))) + } +} + +/// Creates a listener socket for receiving incoming migration connections (TCP or UNIX). 
+fn receive_migration_listener( + receiver_data_migration: &VmReceiveMigrationData, +) -> std::result::Result { + if let Some(address) = receiver_data_migration.receiver_url.strip_prefix("tcp:") { + let listener = TcpListener::bind(address).map_err(|e| { + MigratableError::MigrateReceive(anyhow!("Error binding to TCP socket: {}", e)) + })?; + + if receiver_data_migration.tls_dir.is_none() { + Ok(ReceiveListener::Tcp(listener)) + } else { + Ok(ReceiveListener::Tls( + listener, + TlsConnectionWrapper::new(receiver_data_migration.tls_dir.as_ref().unwrap()), + )) + } + } else if let Some(path) = receiver_data_migration.receiver_url.strip_prefix("unix:") { + UnixListener::bind(path) + .map_err(|e| { + MigratableError::MigrateReceive(anyhow!("Error binding to UNIX socket: {}", e)) + }) + .map(|listener| ReceiveListener::Unix(listener, Some(path.into()))) + } else { + Err(MigratableError::MigrateSend(anyhow!( + "Invalid source: {}", + receiver_data_migration.receiver_url + ))) + } +} + +fn send_memory_regions( + guest_memory: &GuestMemoryAtomic, + ranges: &MemoryRangeTable, + fd: &mut SocketStream, +) -> std::result::Result<(), MigratableError> { + let mem = guest_memory.memory(); + + for range in ranges.regions() { + let mut offset: u64 = 0; + // Here we are manually handling the retry in case we can't the + // whole region at once because we can't use the implementation + // from vm-memory::GuestMemory of write_all_to() as it is not + // following the correct behavior. For more info about this issue + // see: https://github.com/rust-vmm/vm-memory/issues/174 + loop { + let bytes_written = mem + .write_volatile_to( + GuestAddress(range.gpa + offset), + fd, + (range.length - offset) as usize, + ) + .map_err(|e| { + MigratableError::MigrateSend(anyhow!( + "Error transferring memory to socket: {}", + e + )) + })?; + offset += bytes_written as u64; + + if offset == range.length { + break; + } + } + } + + Ok(()) } impl Vmm { @@ -729,14 +1593,14 @@ impl Vmm { .name("vmm_signal_handler".to_string()) .spawn(move || { if !signal_handler_seccomp_filter.is_empty() && let Err(e) = apply_filter(&signal_handler_seccomp_filter) - .map_err(Error::ApplySeccompFilter) - { - error!("Error applying seccomp filter: {e:?}"); - exit_evt.write(1).ok(); - return; - } + .map_err(Error::ApplySeccompFilter) + { + error!("Error applying seccomp filter: {e:?}"); + exit_evt.write(1).ok(); + return; + } - if landlock_enable{ + if landlock_enable { match Landlock::new() { Ok(landlock) => { let _ = landlock.restrict_self().map_err(Error::ApplyLandlock).map_err(|e| { @@ -754,11 +1618,11 @@ impl Vmm { std::panic::catch_unwind(AssertUnwindSafe(|| { Vmm::signal_handler(signals, original_termios_opt, &exit_evt); })) - .map_err(|_| { - error!("vmm signal_handler thread panicked"); - exit_evt.write(1).ok() - }) - .ok(); + .map_err(|_| { + error!("vmm signal_handler thread panicked"); + exit_evt.write(1).ok() + }) + .ok(); }) .map_err(Error::SignalHandlerSpawn)?, ); @@ -781,6 +1645,7 @@ impl Vmm { let mut epoll = EpollContext::new().map_err(Error::Epoll)?; let reset_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFdCreate)?; let activate_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFdCreate)?; + let check_migration_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFdCreate)?; epoll .add_event(&exit_evt, EpollDispatch::Exit) @@ -803,6 +1668,10 @@ impl Vmm { .add_event(&debug_evt, EpollDispatch::Debug) .map_err(Error::Epoll)?; + epoll + .add_event(&check_migration_evt, EpollDispatch::CheckMigration) + 
.map_err(Error::Epoll)?; + Ok(Vmm { epoll, exit_evt, @@ -813,7 +1682,7 @@ impl Vmm { #[cfg(feature = "guest_debug")] vm_debug_evt, version: vmm_version, - vm: None, + vm: MaybeVmOwnership::None, vm_config: None, seccomp_action, hypervisor, @@ -823,29 +1692,183 @@ impl Vmm { original_termios_opt: Arc::new(Mutex::new(None)), console_resize_pipe: None, console_info: None, + check_migration_evt, + migration_thread_handle: None, }) } - fn vm_receive_config( + /// Try to receive a file descriptor from a socket. Returns the slot number and the file descriptor. + fn vm_receive_memory_fd( + socket: &mut SocketStream, + ) -> std::result::Result<(u32, File), MigratableError> { + if let SocketStream::Unix(unix_socket) = socket { + let mut buf = [0u8; 4]; + let (_, file) = unix_socket.recv_with_fd(&mut buf).map_err(|e| { + MigratableError::MigrateReceive(anyhow!("Error receiving slot from socket: {}", e)) + })?; + + file.ok_or_else(|| MigratableError::MigrateReceive(anyhow!("Failed to receive socket"))) + .map(|file| (u32::from_le_bytes(buf), file)) + } else { + Err(MigratableError::MigrateReceive(anyhow!( + "Unsupported socket type" + ))) + } + } + + /// Handle a migration command and advance the protocol state machine. + /// + /// **Note**: This function is responsible for consuming any payloads! It also must + /// _not_ write any response to the socket. + fn vm_receive_migration_step( &mut self, + listener: &ReceiveListener, + socket: &mut SocketStream, + state: ReceiveMigrationState, req: &Request, - socket: &mut T, - existing_memory_files: Option>, - ) -> std::result::Result>, MigratableError> - where - T: Read + Write, - { - // Read in config data along with memory manager data - let mut data: Vec = Vec::new(); - data.resize_with(req.length() as usize, Default::default); - socket - .read_exact(&mut data) - .map_err(MigratableError::MigrateSocket)?; + receive_data_migration: &VmReceiveMigrationData, + ) -> std::result::Result { + use ReceiveMigrationState::*; + + let invalid_command = || { + Err(MigratableError::MigrateReceive(anyhow!( + "Can't handle command in current state" + ))) + }; - let vm_migration_config: VmMigrationConfig = - serde_json::from_slice(&data).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error deserialising config: {e}")) - })?; + let mut configure_vm = + |socket: &mut SocketStream, + memory_files: HashMap| + -> std::result::Result { + let memory_manager = self.vm_receive_config( + req, + socket, + memory_files, + receive_data_migration.tcp_serial_url.clone(), + )?; + + if let Some(ref restored_net_configs) = receive_data_migration.net_fds { + // TODO do some validation + //restored_net_config.validate(); + // Update VM's net configurations with new fds received for restore operation + + let mut vm_config = self.vm_config.as_mut().unwrap().lock().unwrap(); + + for net in restored_net_configs { + for net_config in vm_config.net.iter_mut().flatten() { + // update only if the net dev is backed by FDs + if net_config.id.as_ref() == Some(&net.id) && net_config.fds.is_some() { + log::debug!( + "overwriting net fds: id={}, old={:?}, new={:?}", + net.id, + &net_config.fds, + &net.fds + ); + net_config.fds.clone_from(&net.fds); + } + } + } + } + + let guest_memory = memory_manager.lock().unwrap().guest_memory(); + Ok(Configured( + memory_manager, + guest_memory.clone(), + listener + .try_clone() + .and_then(|l| ReceiveAdditionalConnections::new(l, guest_memory)) + .map_err(|e| { + MigratableError::MigrateReceive(anyhow!( + "Failed to create receive additional connections: 
{}", + e + )) + })?, + )) + }; + + let recv_memory_fd = + |socket: &mut SocketStream, + mut memory_files: Vec<(u32, File)>| + -> std::result::Result { + let (slot, file) = Self::vm_receive_memory_fd(socket)?; + + memory_files.push((slot, file)); + Ok(MemoryFdsReceived(memory_files)) + }; + + if req.command() == Command::Abandon { + return Ok(Aborted); + } + + match state { + Established => match req.command() { + Command::Start => Ok(Started), + _ => invalid_command(), + }, + Started => match req.command() { + Command::MemoryFd => recv_memory_fd(socket, Vec::new()), + Command::Config => configure_vm(socket, Default::default()), + _ => invalid_command(), + }, + MemoryFdsReceived(memory_files) => match req.command() { + Command::MemoryFd => recv_memory_fd(socket, memory_files), + Command::Config => configure_vm(socket, HashMap::from_iter(memory_files)), + _ => invalid_command(), + }, + Configured(memory_manager, guest_memory, receive_additional_connections) => { + match req.command() { + Command::Memory => { + vm_receive_memory(req, socket, &guest_memory)?; + Ok(Configured( + memory_manager, + guest_memory, + receive_additional_connections, + )) + } + Command::State => { + self.vm_receive_state(req, socket, memory_manager)?; + Ok(StateReceived) + } + _ => invalid_command(), + } + } + StateReceived => match req.command() { + Command::Complete => { + // The unwrap is safe, because the state machine makes sure we called + // vm_receive_state before, which creates the VM. + let vm = self.vm.vm_mut().unwrap(); + vm.resume()?; + Ok(Completed) + } + _ => invalid_command(), + }, + Completed | Aborted => { + unreachable!("Performed a step on the finished state machine") + } + } + } + + fn vm_receive_config( + &mut self, + req: &Request, + socket: &mut T, + existing_memory_files: HashMap, + tcp_serial_url: Option, + ) -> std::result::Result>, MigratableError> + where + T: Read, + { + // Read in config data along with memory manager data + let mut data: Vec = Vec::new(); + data.resize_with(req.length() as usize, Default::default); + socket + .read_exact(&mut data) + .map_err(MigratableError::MigrateSocket)?; + + let vm_migration_config: VmMigrationConfig = + serde_json::from_slice(&data).map_err(|e| { + MigratableError::MigrateReceive(anyhow!("Error deserialising config: {e}")) + })?; #[cfg(all(feature = "kvm", target_arch = "x86_64"))] self.vm_check_cpuid_compatibility( @@ -855,6 +1878,12 @@ impl Vmm { let config = vm_migration_config.vm_config.clone(); self.vm_config = Some(vm_migration_config.vm_config); + + if let Some(tcp_serial_url) = tcp_serial_url { + let mut vm_config = self.vm_config.as_mut().unwrap().lock().unwrap(); + vm_config.serial.url = Some(tcp_serial_url); + } + self.console_info = Some(pre_create_console_devices(self).map_err(|e| { MigratableError::MigrateReceive(anyhow!("Error creating console devices: {e:?}")) })?); @@ -913,8 +1942,6 @@ impl Vmm { )) })?; - Response::ok().write_to(socket)?; - Ok(memory_manager) } @@ -925,7 +1952,7 @@ impl Vmm { mm: Arc>, ) -> std::result::Result<(), MigratableError> where - T: Read + Write, + T: Read, { // Read in state data let mut data: Vec = Vec::new(); @@ -978,130 +2005,195 @@ impl Vmm { // Create VM vm.restore().map_err(|e| { - Response::error().write_to(socket).ok(); - MigratableError::MigrateReceive(anyhow!("Failed restoring the Vm: {e}")) + MigratableError::MigrateReceive(anyhow!("Failed restoring the Vm: {}", e)) })?; - self.vm = Some(vm); - - Response::ok().write_to(socket)?; + self.vm = MaybeVmOwnership::Vmm(vm); Ok(()) } - fn 
vm_receive_memory( - &mut self, - req: &Request, - socket: &mut T, - memory_manager: &mut MemoryManager, - ) -> std::result::Result<(), MigratableError> - where - T: Read + ReadVolatile + Write, - { - // Read table - let table = MemoryRangeTable::read_from(socket, req.length())?; - - // And then read the memory itself - memory_manager - .receive_memory_regions(&table, socket) - .inspect_err(|_| { - Response::error().write_to(socket).ok(); - })?; - Response::ok().write_to(socket)?; - Ok(()) - } - - fn socket_url_to_path(url: &str) -> result::Result { - url.strip_prefix("unix:") - .ok_or_else(|| { - MigratableError::MigrateSend(anyhow!("Could not extract path from URL: {url}")) - }) - .map(|s| s.into()) + fn can_increase_autoconverge_step(s: &MigrationState) -> bool { + if s.iteration < AUTO_CONVERGE_ITERATION_DELAY { + false + } else { + let iteration = s.iteration - AUTO_CONVERGE_ITERATION_DELAY; + iteration.is_multiple_of(AUTO_CONVERGE_ITERATION_INCREASE) + } } - fn send_migration_socket( - destination_url: &str, - ) -> std::result::Result { - if let Some(address) = destination_url.strip_prefix("tcp:") { - info!("Connecting to TCP socket at {address}"); + fn memory_copy_iterations( + vm: &mut Vm, + mem_send: &SendAdditionalConnections, + socket: &mut SocketStream, + s: &mut MigrationState, + migration_timeout: Duration, + migrate_downtime_limit: Duration, + ) -> result::Result { + let mut bandwidth = 0.0; + let mut iteration_table; - let socket = TcpStream::connect(address).map_err(|e| { - MigratableError::MigrateSend(anyhow!("Error connecting to TCP socket: {e}")) - })?; + loop { + // todo: check if auto-converge is enabled at all? + if Self::can_increase_autoconverge_step(s) && vm.throttle_percent() < AUTO_CONVERGE_MAX + { + let current_throttle = vm.throttle_percent(); + let new_throttle = current_throttle + AUTO_CONVERGE_STEP_SIZE; + let new_throttle = std::cmp::min(new_throttle, AUTO_CONVERGE_MAX); + log::info!("Increasing auto-converge: {new_throttle}%"); + if new_throttle != current_throttle { + vm.set_throttle_percent(new_throttle); + } + } - Ok(SocketStream::Tcp(socket)) - } else { - let path = Vmm::socket_url_to_path(destination_url)?; - info!("Connecting to UNIX socket at {path:?}"); + // Update the start time of the iteration + s.iteration_start_time = Instant::now(); - let socket = UnixStream::connect(&path).map_err(|e| { - MigratableError::MigrateSend(anyhow!("Error connecting to UNIX socket: {e}")) - })?; + // Increment iteration counter + s.iteration += 1; - Ok(SocketStream::Unix(socket)) - } - } + // Check if migration has timed out + // migration_timeout > 0 means enabling the timeout check, 0 means disabling the timeout check + if !migration_timeout.is_zero() && s.start_time.elapsed() > migration_timeout { + warn!("Migration timed out after {:?}", migration_timeout); + Request::abandon().write_to(socket)?; + Response::read_from(socket)?.ok_or_abandon( + socket, + MigratableError::MigrateSend(anyhow!("Migration timed out")), + )?; + } - fn receive_migration_socket( - receiver_url: &str, - ) -> std::result::Result { - if let Some(address) = receiver_url.strip_prefix("tcp:") { - let listener = TcpListener::bind(address).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error binding to TCP socket: {e}")) - })?; + // Get the dirty page table + iteration_table = vm.dirty_log()?; - let (socket, _addr) = listener.accept().map_err(|e| { - MigratableError::MigrateReceive(anyhow!( - "Error accepting connection on TCP socket: {e}" - )) - })?; + // Update the pending size 
(amount of data to transfer) + s.pending_size = iteration_table + .regions() + .iter() + .map(|range| range.length) + .sum(); - Ok(SocketStream::Tcp(socket)) - } else { - let path = Vmm::socket_url_to_path(receiver_url)?; - let listener = UnixListener::bind(&path).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error binding to UNIX socket: {e}")) - })?; + // Update thresholds + if bandwidth > 0.0 { + s.threshold_size = bandwidth as u64 * migrate_downtime_limit.as_millis() as u64; + } - let (socket, _addr) = listener.accept().map_err(|e| { - MigratableError::MigrateReceive(anyhow!( - "Error accepting connection on UNIX socket: {e}" - )) - })?; + // Enter the final stage of migration when the suspension conditions are met + if s.iteration > 1 && s.pending_size <= s.threshold_size { + break; + } - // Remove the UNIX socket file after accepting the connection - std::fs::remove_file(&path).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error removing UNIX socket file: {e}")) - })?; + // Update the number of dirty pages + s.total_transferred_bytes += s.pending_size; + s.current_dirty_pages = s.pending_size.div_ceil(PAGE_SIZE as u64); + s.total_transferred_dirty_pages += s.current_dirty_pages; + + // Send the current dirty pages + let transfer_start = Instant::now(); + mem_send.send_memory(&iteration_table, socket)?; + let transfer_time = transfer_start.elapsed().as_millis() as f64; + + // Update bandwidth + if transfer_time > 0.0 && s.pending_size > 0 { + bandwidth = s.pending_size as f64 / transfer_time; + // Convert bandwidth to MB/s + s.mb_per_sec = (bandwidth * 1000.0) / (1024.0 * 1024.0); + } - Ok(SocketStream::Unix(socket)) + // Update iteration cost time + s.iteration_cost_time = s.iteration_start_time.elapsed(); + if s.iteration_cost_time.as_millis() > 0 { + s.pages_per_second = + s.current_dirty_pages * 1000 / s.iteration_cost_time.as_millis() as u64; + } + debug!( + "iteration {}: cost={}ms, throttle={}%", + s.iteration, + s.iteration_cost_time.as_millis(), + vm.throttle_percent() + ); } + + Ok(iteration_table) } - // Returns true if there were dirty pages to send - fn vm_maybe_send_dirty_pages( + fn do_memory_migration( vm: &mut Vm, socket: &mut SocketStream, - ) -> result::Result { - // Send (dirty) memory table - let table = vm.dirty_log()?; + s: &mut MigrationState, + send_data_migration: &VmSendMigrationData, + ) -> result::Result<(), MigratableError> { + let mem_send = SendAdditionalConnections::new(send_data_migration, &vm.guest_memory())?; - // But if there are no regions go straight to pause - if table.regions().is_empty() { - return Ok(false); + // Start logging dirty pages + vm.start_dirty_log()?; + + mem_send.send_memory(&vm.memory_range_table()?, socket)?; + + // Define the maximum allowed downtime 2000 seconds(2000000 milliseconds) + const MAX_MIGRATE_DOWNTIME: u64 = 2000000; + + // Verify that downtime must be between 1 and MAX_MIGRATE_DOWNTIME + if send_data_migration.downtime == 0 || send_data_migration.downtime > MAX_MIGRATE_DOWNTIME + { + return Err(MigratableError::MigrateSend(anyhow!( + "downtime_limit must be an integer in the range of 1 to {} ms", + MAX_MIGRATE_DOWNTIME + ))); + } + + let migration_timeout = Duration::from_secs(send_data_migration.migration_timeout); + let migrate_downtime_limit = Duration::from_millis(send_data_migration.downtime); + + // Verify that downtime must be less than the migration timeout + if !migration_timeout.is_zero() && migrate_downtime_limit >= migration_timeout { + return 
Err(MigratableError::MigrateSend(anyhow!( + "downtime_limit {}ms must be less than migration_timeout {}ms", + send_data_migration.downtime, + send_data_migration.migration_timeout * 1000 + ))); } - Request::memory(table.length()).write_to(socket).unwrap(); - table.write_to(socket)?; - // And then the memory itself - vm.send_memory_regions(&table, socket)?; - Response::read_from(socket)?.ok_or_abandon( + let iteration_table = Self::memory_copy_iterations( + vm, + &mem_send, socket, - MigratableError::MigrateSend(anyhow!("Error during dirty memory migration")), + s, + migration_timeout, + migrate_downtime_limit, )?; - Ok(true) + info!("Entering downtime phase"); + s.downtime_start = Instant::now(); + // End throttle thread + info!("stopping vcpu thread"); + vm.stop_vcpu_throttling(); + info!("stopped vcpu thread"); + info!("pausing VM"); + vm.pause()?; + info!("paused VM"); + + // Send last batch of dirty pages + let mut final_table = vm.dirty_log()?; + final_table.extend(iteration_table.clone()); + mem_send.send_memory(&final_table, socket)?; + // Update statistics + s.pending_size = final_table.regions().iter().map(|range| range.length).sum(); + s.total_transferred_bytes += s.pending_size; + s.current_dirty_pages = s.pending_size.div_ceil(PAGE_SIZE as u64); + s.total_transferred_dirty_pages += s.current_dirty_pages; + + // Stop logging dirty pages + vm.stop_dirty_log()?; + + Ok(()) } + /// Performs a live-migration. + /// + /// This function performs necessary after-migration cleanup only in the + /// good case. Callers are responsible for properly handling failed + /// migrations. + #[allow(unused_assignments)] // TODO remove fn send_migration( vm: &mut Vm, #[cfg(all(feature = "kvm", target_arch = "x86_64"))] hypervisor: Arc< @@ -1109,8 +2201,10 @@ impl Vmm { >, send_data_migration: VmSendMigrationData, ) -> result::Result<(), MigratableError> { + let mut s = MigrationState::new(); + // Set up the socket connection - let mut socket = Self::send_migration_socket(&send_data_migration.destination_url)?; + let mut socket = send_migration_socket(&send_data_migration)?; // Start the migration Request::start().write_to(&mut socket)?; @@ -1161,6 +2255,11 @@ impl Vmm { "--local option is not supported with TCP sockets", ))); } + SocketStream::Tls(_tls_socket) => { + return Err(MigratableError::MigrateSend(anyhow!( + "--local option is not supported with TCP sockets", + ))); + } } } @@ -1187,36 +2286,7 @@ impl Vmm { // Now pause VM vm.pause()?; } else { - // Start logging dirty pages - vm.start_dirty_log()?; - - // Send memory table - let table = vm.memory_range_table()?; - Request::memory(table.length()) - .write_to(&mut socket) - .unwrap(); - table.write_to(&mut socket)?; - // And then the memory itself - vm.send_memory_regions(&table, &mut socket)?; - Response::read_from(&mut socket)?.ok_or_abandon( - &mut socket, - MigratableError::MigrateSend(anyhow!("Error during dirty memory migration")), - )?; - - // Try at most 5 passes of dirty memory sending - const MAX_DIRTY_MIGRATIONS: usize = 5; - for i in 0..MAX_DIRTY_MIGRATIONS { - info!("Dirty memory migration {i} of {MAX_DIRTY_MIGRATIONS}"); - if !Self::vm_maybe_send_dirty_pages(vm, &mut socket)? { - break; - } - } - - // Now pause VM - vm.pause()?; - - // Send last batch of dirty pages - Self::vm_maybe_send_dirty_pages(vm, &mut socket)?; + Self::do_memory_migration(vm, &mut socket, &mut s, &send_data_migration)?; } // We release the locks early to enable locking them on the destination host. 
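The pre-copy loop in memory_copy_iterations/do_memory_migration above stops iterating once pending_size drops to or below threshold_size = bandwidth (bytes/ms) * downtime_limit (ms), and it relies on the AUTO_CONVERGE_* throttling schedule to get there when the guest dirties memory faster than the link can carry it. The following is a minimal, self-contained sketch of that interaction; it is not part of the patch, and the bandwidth, dirty-rate and downtime figures are assumptions chosen only to make the convergence visible:

// Minimal sketch of the pre-copy convergence logic above (not part of the
// patch). The constants mirror the AUTO_CONVERGE_* values introduced in
// vmm/src/lib.rs; all other numbers are illustrative assumptions.

const AUTO_CONVERGE_ITERATION_DELAY: u64 = 2;
const AUTO_CONVERGE_STEP_SIZE: u8 = 10;
const AUTO_CONVERGE_ITERATION_INCREASE: u64 = 2;
const AUTO_CONVERGE_MAX: u8 = 99;

/// Same schedule as `Vmm::can_increase_autoconverge_step`: wait a couple of
/// iterations, then allow a throttle bump every other iteration.
fn can_increase_autoconverge_step(iteration: u64) -> bool {
    iteration >= AUTO_CONVERGE_ITERATION_DELAY
        && (iteration - AUTO_CONVERGE_ITERATION_DELAY) % AUTO_CONVERGE_ITERATION_INCREASE == 0
}

fn main() {
    // Assumed per-connection bandwidth (~10 Gbit/s) and guest dirty rate,
    // both in bytes per millisecond.
    let bandwidth: u64 = 1_250_000;
    let dirty_rate: u64 = 2_000_000; // guest dirties memory faster than we copy
    let downtime_limit_ms: u64 = 300;

    // Same stop condition as `memory_copy_iterations`: the remaining dirty
    // data must fit into the downtime budget at the measured bandwidth.
    let threshold_size = bandwidth * downtime_limit_ms;

    let mut throttle: u8 = 0;
    let mut pending: u64 = 8 << 30; // 8 GiB of guest RAM to copy initially

    for iteration in 1..=20_u64 {
        if can_increase_autoconverge_step(iteration) && throttle < AUTO_CONVERGE_MAX {
            throttle = (throttle + AUTO_CONVERGE_STEP_SIZE).min(AUTO_CONVERGE_MAX);
        }

        // Simplified model: vCPU throttling scales the dirty rate linearly.
        let effective_dirty_rate = dirty_rate * u64::from(100 - throttle) / 100;

        // Time to send this iteration's data, and what gets dirtied meanwhile.
        let transfer_ms = pending / bandwidth;
        pending = effective_dirty_rate * transfer_ms;

        println!(
            "iteration {iteration}: throttle={throttle}% pending={} MiB",
            pending >> 20
        );

        if pending <= threshold_size {
            println!("converged: final pass fits into the {downtime_limit_ms} ms downtime budget");
            break;
        }
    }
}

For the CHUNK_SIZE comment earlier in the patch, plugging assumed numbers into the quoted formula shows why 64 MiB chunks keep the ACK-per-chunk cost small: with roughly 1.25 GB/s per connection and a 1 ms round trip, 64 MiB / (64 MiB / 1.25 GB/s + 1 ms) ≈ 67.1 MB / 54.7 ms ≈ 1.23 GB/s, i.e. about a 2% throughput penalty.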
@@ -1224,6 +2294,14 @@ impl Vmm { vm.release_disk_locks() .map_err(|e| MigratableError::UnlockError(anyhow!("{e}")))?; + #[cfg(feature = "kvm")] + // Prevent signal handler to access thread local storage when signals are received + // close to the end when thread-local storage is already destroyed. + { + let mut lock = IS_IN_SHUTDOWN.write().unwrap(); + *lock = true; + } + // Capture snapshot and send it let vm_snapshot = vm.snapshot()?; let snapshot_data = serde_json::to_vec(&vm_snapshot).unwrap(); @@ -1243,11 +2321,17 @@ impl Vmm { MigratableError::MigrateSend(anyhow!("Error completing migration")), )?; + // Record downtime + s.downtime = s.downtime_start.elapsed(); + // Stop logging dirty pages if !send_data_migration.local { vm.stop_dirty_log()?; } + // Record total migration time + s.total_time = s.start_time.elapsed(); + info!("Migration complete"); // Let every Migratable object know about the migration being complete @@ -1301,6 +2385,10 @@ impl Vmm { vm_config: Arc>, prefault: bool, ) -> std::result::Result<(), VmError> { + if matches!(self.vm, MaybeVmOwnership::Migration) { + return Err(VmError::VmMigrating); + } + let snapshot = recv_vm_state(source_url).map_err(VmError::Restore)?; #[cfg(all(feature = "kvm", target_arch = "x86_64"))] let vm_snapshot = get_vm_snapshot(&snapshot).map_err(VmError::Restore)?; @@ -1343,7 +2431,7 @@ impl Vmm { Some(source_url), Some(prefault), )?; - self.vm = Some(vm); + self.vm = MaybeVmOwnership::Vmm(vm); if self .vm_config @@ -1358,10 +2446,53 @@ impl Vmm { } // Now we can restore the rest of the VM. - if let Some(ref mut vm) = self.vm { - vm.restore() - } else { - Err(VmError::VmNotCreated) + // PANIC: won't panic, we just checked that the VM is there. + self.vm.vm_mut().unwrap().restore() + } + + /// Checks the migration result. + /// + /// This should be called when the migration thread indicated a state + /// change (and therefore, its termination). The function checks the result + /// of that thread and either shuts down the VMM on success or keeps the VM + /// and the VMM running on migration failure. + fn check_migration_result(&mut self) { + // At this point, the thread must be finished. + // If we fail here, we have lost anyway. Just panic. + let (vm, migration_res) = self + .migration_thread_handle + .take() + .expect("should have thread") + .join() + .expect("should have joined"); + + // Give VMM back control. 
+ self.vm = MaybeVmOwnership::Vmm(vm); + + match migration_res { + Ok(()) => { + { + info!("Sending Receiver in HTTP thread that migration succeeded"); + let (sender, _) = &*ONGOING_LIVEMIGRATION; + // unblock API call; propagate migration result + sender.send(Ok(())).unwrap(); + } + + // Shutdown the VM after the migration succeeded + if let Err(e) = self.exit_evt.write(1) { + error!("Failed shutting down the VM after migration: {}", e); + } + } + Err(e) => { + error!("Migration failed: {}", e); + { + info!("Sending Receiver in HTTP thread that migration failed"); + let (sender, _) = &*ONGOING_LIVEMIGRATION; + // unblock API call; propagate migration result + sender.send(Err(e)).unwrap(); + } + // we don't fail the VMM here, it just continues running its VM + } } } @@ -1415,7 +2546,7 @@ impl Vmm { self.vm_reboot().map_err(Error::VmReboot)?; } EpollDispatch::ActivateVirtioDevices => { - if let Some(ref vm) = self.vm { + if let MaybeVmOwnership::Vmm(ref vm) = self.vm { let count = self.activate_evt.read().map_err(Error::EventFdRead)?; info!("Trying to activate pending virtio devices: count = {count}"); vm.activate_virtio_devices() @@ -1440,7 +2571,7 @@ impl Vmm { // Read from the API receiver channel let gdb_request = gdb_receiver.recv().map_err(Error::GdbRequestRecv)?; - let response = if let Some(ref mut vm) = self.vm { + let response = if let MaybeVmOwnership::Vmm(ref mut vm) = self.vm { vm.debug_request(&gdb_request.payload, gdb_request.cpu_id) } else { Err(VmError::VmNotRunning) @@ -1455,6 +2586,14 @@ impl Vmm { } #[cfg(not(feature = "guest_debug"))] EpollDispatch::Debug => {} + EpollDispatch::CheckMigration => { + info!("VM migration check event"); + // Consume the event. + self.check_migration_evt + .read() + .map_err(Error::EventFdRead)?; + self.check_migration_result(); + } } } } @@ -1508,102 +2647,116 @@ impl RequestHandler for Vmm { tracer::start(); info!("Booting VM"); event!("vm", "booting"); - let r = { - trace_scoped!("vm_boot"); - // If we don't have a config, we cannot boot a VM. - if self.vm_config.is_none() { - return Err(VmError::VmMissingConfig); - }; - // console_info is set to None in vm_shutdown. re-populate here if empty - if self.console_info.is_none() { - self.console_info = - Some(pre_create_console_devices(self).map_err(VmError::CreateConsoleDevices)?); - } + if matches!(self.vm, MaybeVmOwnership::Migration) { + return Err(VmError::VmMigrating); + } - // Create a new VM if we don't have one yet. - if self.vm.is_none() { - let exit_evt = self.exit_evt.try_clone().map_err(VmError::EventFdClone)?; - let reset_evt = self.reset_evt.try_clone().map_err(VmError::EventFdClone)?; - #[cfg(feature = "guest_debug")] - let vm_debug_evt = self - .vm_debug_evt - .try_clone() - .map_err(VmError::EventFdClone)?; - let activate_evt = self - .activate_evt - .try_clone() - .map_err(VmError::EventFdClone)?; - - if let Some(ref vm_config) = self.vm_config { - let vm = Vm::new( - Arc::clone(vm_config), - exit_evt, - reset_evt, - #[cfg(feature = "guest_debug")] - vm_debug_evt, - &self.seccomp_action, - self.hypervisor.clone(), - activate_evt, - self.console_info.clone(), - self.console_resize_pipe.clone(), - Arc::clone(&self.original_termios_opt), - None, - None, - None, - )?; - - self.vm = Some(vm); - } + trace_scoped!("vm_boot"); + // If we don't have a config, we cannot boot a VM. + if self.vm_config.is_none() { + return Err(VmError::VmMissingConfig); + }; + + // console_info is set to None in vm_shutdown. 
re-populate here if empty + if self.console_info.is_none() { + self.console_info = + Some(pre_create_console_devices(self).map_err(VmError::CreateConsoleDevices)?); + } + + // Create a new VM if we don't have one yet. + if matches!(self.vm, MaybeVmOwnership::None) { + let exit_evt = self.exit_evt.try_clone().map_err(VmError::EventFdClone)?; + let reset_evt = self.reset_evt.try_clone().map_err(VmError::EventFdClone)?; + #[cfg(feature = "guest_debug")] + let vm_debug_evt = self + .vm_debug_evt + .try_clone() + .map_err(VmError::EventFdClone)?; + let activate_evt = self + .activate_evt + .try_clone() + .map_err(VmError::EventFdClone)?; + + if let Some(ref vm_config) = self.vm_config { + let vm = Vm::new( + Arc::clone(vm_config), + exit_evt, + reset_evt, + #[cfg(feature = "guest_debug")] + vm_debug_evt, + &self.seccomp_action, + self.hypervisor.clone(), + activate_evt, + self.console_info.clone(), + self.console_resize_pipe.clone(), + Arc::clone(&self.original_termios_opt), + None, + None, + None, + )?; + + self.vm = MaybeVmOwnership::Vmm(vm); } + } - // Now we can boot the VM. - if let Some(ref mut vm) = self.vm { - vm.boot() - } else { - Err(VmError::VmNotCreated) + // Now we can boot the VM. + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + vm.boot()?; + event!("vm", "booted"); } - }; - tracer::end(); - if r.is_ok() { - event!("vm", "booted"); + MaybeVmOwnership::None => { + return Err(VmError::VmNotCreated); + } + _ => unreachable!(), } - r + + tracer::end(); + Ok(()) } fn vm_pause(&mut self) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - vm.pause().map_err(VmError::Pause) - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm.pause().map_err(VmError::Pause), + MaybeVmOwnership::Migration => Err(VmError::VmMigrating)?, + MaybeVmOwnership::None => Err(VmError::VmNotRunning)?, } } fn vm_resume(&mut self) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - vm.resume().map_err(VmError::Resume) - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm.resume().map_err(VmError::Resume), + MaybeVmOwnership::Migration => Err(VmError::VmMigrating)?, + MaybeVmOwnership::None => Err(VmError::VmNotRunning)?, } } fn vm_snapshot(&mut self, destination_url: &str) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - // Drain console_info so that FDs are not reused - let _ = self.console_info.take(); - vm.snapshot() - .map_err(VmError::Snapshot) - .and_then(|snapshot| { - vm.send(&snapshot, destination_url) - .map_err(VmError::SnapshotSend) - }) - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + // Drain console_info so that FDs are not reused + let _ = self.console_info.take(); + vm.snapshot() + .map_err(VmError::Snapshot) + .and_then(|snapshot| { + vm.send(&snapshot, destination_url) + .map_err(VmError::SnapshotSend) + }) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating)?, + MaybeVmOwnership::None => Err(VmError::VmNotRunning)?, } } fn vm_restore(&mut self, restore_cfg: RestoreConfig) -> result::Result<(), VmError> { - if self.vm.is_some() || self.vm_config.is_some() { + match &self.vm { + MaybeVmOwnership::Vmm(_vm) => return Err(VmError::VmAlreadyCreated), + MaybeVmOwnership::Migration => return Err(VmError::VmMigrating), + MaybeVmOwnership::None => (), + }; + + if self.vm_config.is_some() { return Err(VmError::VmAlreadyCreated); } @@ -1628,7 +2781,7 @@ impl RequestHandler for 
Vmm { for net in restored_nets.iter() { for net_config in vm_net_configs.iter_mut() { // update only if the net dev is backed by FDs - if net_config.id == Some(net.id.clone()) && net_config.fds.is_some() { + if net_config.id.as_ref() == Some(&net.id) && net_config.fds.is_some() { net_config.fds.clone_from(&net.fds); } } @@ -1650,21 +2803,25 @@ impl RequestHandler for Vmm { #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] fn vm_coredump(&mut self, destination_url: &str) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - vm.coredump(destination_url).map_err(VmError::Coredump) - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + vm.coredump(destination_url).map_err(VmError::Coredump) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => Err(VmError::VmNotRunning), } } fn vm_shutdown(&mut self) -> result::Result<(), VmError> { - let r = if let Some(ref mut vm) = self.vm.take() { - // Drain console_info so that the FDs are not reused - let _ = self.console_info.take(); - vm.shutdown() - } else { - Err(VmError::VmNotRunning) + let vm = match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm, + MaybeVmOwnership::Migration => return Err(VmError::VmMigrating), + MaybeVmOwnership::None => return Err(VmError::VmNotRunning), }; + // Drain console_info so that the FDs are not reused + let _ = self.console_info.take(); + let r = vm.shutdown(); + self.vm = MaybeVmOwnership::None; if r.is_ok() { event!("vm", "shutdown"); @@ -1677,13 +2834,14 @@ impl RequestHandler for Vmm { event!("vm", "rebooting"); // First we stop the current VM - let config = if let Some(mut vm) = self.vm.take() { - let config = vm.get_config(); - vm.shutdown()?; - config - } else { - return Err(VmError::VmNotCreated); + let vm = match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm, + MaybeVmOwnership::Migration => return Err(VmError::VmMigrating), + MaybeVmOwnership::None => return Err(VmError::VmNotRunning), }; + let config = vm.get_config(); + vm.shutdown()?; + self.vm = MaybeVmOwnership::None; // vm.shutdown() closes all the console devices, so set console_info to None // so that the closed FD #s are not reused. @@ -1732,7 +2890,7 @@ impl RequestHandler for Vmm { // And we boot it vm.boot()?; - self.vm = Some(vm); + self.vm = MaybeVmOwnership::Vmm(vm); event!("vm", "rebooted"); @@ -1740,33 +2898,38 @@ impl RequestHandler for Vmm { } fn vm_info(&self) -> result::Result { - match &self.vm_config { - Some(vm_config) => { - let state = match &self.vm { - Some(vm) => vm.get_state()?, - None => VmState::Created, - }; - let config = vm_config.lock().unwrap().clone(); - - let mut memory_actual_size = config.memory.total_size(); - if let Some(vm) = &self.vm { - memory_actual_size -= vm.balloon_size(); - } + let vm_config = self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; + let vm_config = vm_config.lock().unwrap().clone(); + + let state = match &self.vm { + MaybeVmOwnership::Vmm(vm) => vm.get_state()?, + // TODO in theory one could live-migrate a non-running VM .. 
+ MaybeVmOwnership::Migration => VmState::Running, + MaybeVmOwnership::None => VmState::Created, + }; - let device_tree = self - .vm - .as_ref() - .map(|vm| vm.device_tree().lock().unwrap().clone()); - - Ok(VmInfoResponse { - config: Box::new(config), - state, - memory_actual_size, - device_tree, - }) + let mut memory_actual_size = vm_config.memory.total_size(); + match &self.vm { + MaybeVmOwnership::Vmm(vm) => { + memory_actual_size -= vm.balloon_size(); } - None => Err(VmError::VmNotCreated), + MaybeVmOwnership::Migration => {} + MaybeVmOwnership::None => {} } + + let device_tree = match &self.vm { + MaybeVmOwnership::Vmm(vm) => Some(vm.device_tree().lock().unwrap().clone()), + // TODO we need to fix this + MaybeVmOwnership::Migration => None, + MaybeVmOwnership::None => None, + }; + + Ok(VmInfoResponse { + config: Box::new(vm_config), + state, + memory_actual_size, + device_tree, + }) } fn vmm_ping(&self) -> VmmPingResponse { @@ -1788,14 +2951,19 @@ impl RequestHandler for Vmm { return Ok(()); } - // If a VM is booted, we first try to shut it down. - if self.vm.is_some() { - self.vm_shutdown()?; - } - - self.vm_config = None; + match &self.vm { + MaybeVmOwnership::Vmm(_vm) => { + event!("vm", "deleted"); - event!("vm", "deleted"); + // If a VM is booted, we first try to shut it down. + self.vm_shutdown()?; + self.vm_config = None; + } + MaybeVmOwnership::None => { + self.vm_config = None; + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating)?, + } Ok(()) } @@ -1814,55 +2982,85 @@ impl RequestHandler for Vmm { ) -> result::Result<(), VmError> { self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; - if let Some(ref mut vm) = self.vm { - if let Err(e) = vm.resize(desired_vcpus, desired_ram, desired_balloon) { - error!("Error when resizing VM: {e:?}"); - Err(e) - } else { - Ok(()) - } - } else { - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - if let Some(desired_vcpus) = desired_vcpus { - config.cpus.boot_vcpus = desired_vcpus; - } - if let Some(desired_ram) = desired_ram { - config.memory.size = desired_ram; + if desired_vcpus.is_some() { + todo!("doesn't work currently with our thread-local KVM_RUN approach"); + } + + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + if let Err(e) = vm.resize(desired_vcpus, desired_ram, desired_balloon) { + error!("Error when resizing VM: {:?}", e); + Err(e) + } else { + Ok(()) + } } - if let Some(desired_balloon) = desired_balloon - && let Some(balloon_config) = &mut config.balloon - { - balloon_config.size = desired_balloon; + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + if let Some(desired_vcpus) = desired_vcpus { + config.cpus.boot_vcpus = desired_vcpus; + } + if let Some(desired_ram) = desired_ram { + config.memory.size = desired_ram; + } + if let Some(desired_balloon) = desired_balloon + && let Some(balloon_config) = &mut config.balloon + { + balloon_config.size = desired_balloon; + } + + Ok(()) } - Ok(()) } } + fn vm_resize_disk(&mut self, id: String, desired_size: u64) -> result::Result<(), VmError> { + info!("request to resize disk: id={id}"); + self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; + + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + if let Err(e) = vm.resize_disk(id, desired_size) { + error!("Error when resizing disk: {:?}", e); + Err(e) + } else { + Ok(()) + } + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => 
Err(VmError::ResizeDisk), + } + } fn vm_resize_zone(&mut self, id: String, desired_ram: u64) -> result::Result<(), VmError> { self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; - if let Some(ref mut vm) = self.vm { - if let Err(e) = vm.resize_zone(id, desired_ram) { - error!("Error when resizing VM: {e:?}"); - Err(e) - } else { - Ok(()) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + if let Err(e) = vm.resize_zone(id, desired_ram) { + error!("Error when resizing VM: {e:?}"); + Err(e) + } else { + Ok(()) + } } - } else { - // Update VmConfig by setting the new desired ram. - let memory_config = &mut self.vm_config.as_ref().unwrap().lock().unwrap().memory; - - if let Some(zones) = &mut memory_config.zones { - for zone in zones.iter_mut() { - if zone.id == id { - zone.size = desired_ram; - return Ok(()); + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by setting the new desired ram. + let memory_config = &mut self.vm_config.as_ref().unwrap().lock().unwrap().memory; + + if let Some(zones) = &mut memory_config.zones { + for zone in zones.iter_mut() { + if zone.id == id { + zone.size = desired_ram; + return Ok(()); + } } } - } - error!("Could not find the memory zone {id} for the resize"); - Err(VmError::ResizeZone) + error!("Could not find the memory zone {id} for the resize"); + Err(VmError::ResizeZone) + } } } @@ -1879,19 +3077,23 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_device(device_cfg).map_err(|e| { - error!("Error when adding new device to the VM: {e:?}"); - e - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.devices, device_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_device(device_cfg).map_err(|e| { + error!("Error when adding new device to the VM: {e:?}"); + e + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.devices, device_cfg); + Ok(None) + } } } @@ -1908,39 +3110,49 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_user_device(device_cfg).map_err(|e| { - error!("Error when adding new user device to the VM: {e:?}"); - e - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.user_devices, device_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_user_device(device_cfg).map_err(|e| { + error!("Error when adding new user device to the VM: {e:?}"); + e + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. 
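+ // No VM exists yet, so the device is only recorded in the config and gets created once the VM boots.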
+ let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.user_devices, device_cfg); + Ok(None) + } } } fn vm_remove_device(&mut self, id: String) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - if let Err(e) = vm.remove_device(id) { - error!("Error when removing device from the VM: {e:?}"); - Err(e) - } else { - Ok(()) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + if let Err(e) = vm.remove_device(id) { + error!("Error when removing device from the VM: {e:?}"); + Err(e) + } else { + Ok(()) + } } - } else if let Some(ref config) = self.vm_config { - let mut config = config.lock().unwrap(); - if config.remove_device(&id) { - Ok(()) - } else { - Err(VmError::NoDeviceToRemove(id)) + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + if let Some(ref config) = self.vm_config { + let mut config = config.lock().unwrap(); + if config.remove_device(&id) { + Ok(()) + } else { + Err(VmError::NoDeviceToRemove(id)) + } + } else { + Err(VmError::VmNotCreated) + } } - } else { - Err(VmError::VmNotCreated) } } @@ -1954,19 +3166,23 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_disk(disk_cfg).map_err(|e| { - error!("Error when adding new disk to the VM: {e:?}"); - e - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.disks, disk_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_disk(disk_cfg).map_err(|e| { + error!("Error when adding new disk to the VM: {e:?}"); + e + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.disks, disk_cfg); + Ok(None) + } } } @@ -1980,19 +3196,23 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_fs(fs_cfg).map_err(|e| { - error!("Error when adding new fs to the VM: {e:?}"); - e - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.fs, fs_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_fs(fs_cfg).map_err(|e| { + error!("Error when adding new fs to the VM: {e:?}"); + e + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. 
+ let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.fs, fs_cfg); + Ok(None) + } } } @@ -2006,19 +3226,23 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_pmem(pmem_cfg).map_err(|e| { - error!("Error when adding new pmem device to the VM: {e:?}"); - e - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.pmem, pmem_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_pmem(pmem_cfg).map_err(|e| { + error!("Error when adding new pmem device to the VM: {e:?}"); + e + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.pmem, pmem_cfg); + Ok(None) + } } } @@ -2032,19 +3256,23 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_net(net_cfg).map_err(|e| { - error!("Error when adding new network device to the VM: {e:?}"); - e - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.net, net_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_net(net_cfg).map_err(|e| { + error!("Error when adding new network device to the VM: {e:?}"); + e + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.net, net_cfg); + Ok(None) + } } } @@ -2058,19 +3286,23 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_vdpa(vdpa_cfg).map_err(|e| { - error!("Error when adding new vDPA device to the VM: {e:?}"); - e - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.vdpa, vdpa_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_vdpa(vdpa_cfg).map_err(|e| { + error!("Error when adding new vDPA device to the VM: {e:?}"); + e + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. 
+ let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.vdpa, vdpa_cfg); + Ok(None) + } } } @@ -2089,49 +3321,55 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_vsock(vsock_cfg).map_err(|e| { - error!("Error when adding new vsock device to the VM: {e:?}"); - e - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - config.vsock = Some(vsock_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_vsock(vsock_cfg).map_err(|e| { + error!("Error when adding new vsock device to the VM: {e:?}"); + e + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + config.vsock = Some(vsock_cfg); + Ok(None) + } } } fn vm_counters(&mut self) -> result::Result>, VmError> { - if let Some(ref mut vm) = self.vm { - let info = vm.counters().map_err(|e| { - error!("Error when getting counters from the VM: {e:?}"); - e - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.counters().map_err(|e| { + error!("Error when getting counters from the VM: {e:?}"); + e + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => Err(VmError::VmNotRunning), } } fn vm_power_button(&mut self) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - vm.power_button() - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm.power_button(), + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => Err(VmError::VmNotRunning), } } fn vm_nmi(&mut self) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - vm.nmi() - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm.nmi(), + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => Err(VmError::VmNotRunning), } } @@ -2140,127 +3378,45 @@ impl RequestHandler for Vmm { receive_data_migration: VmReceiveMigrationData, ) -> result::Result<(), MigratableError> { info!( - "Receiving migration: receiver_url = {}", - receive_data_migration.receiver_url + "Receiving migration: receiver_url = {}, net_fds={:?}", + receive_data_migration.receiver_url, &receive_data_migration.net_fds ); + let mut listener = receive_migration_listener(&receive_data_migration)?; // Accept the connection and get the socket - let mut socket = Vmm::receive_migration_socket(&receive_data_migration.receiver_url)?; - - let mut started = false; - let mut memory_manager: Option>> = None; - let mut existing_memory_files = None; - loop { - let req = Request::read_from(&mut socket)?; - match req.command() { - Command::Invalid => info!("Invalid Command Received"), - Command::Start => { - info!("Start Command Received"); - started = true; - - Response::ok().write_to(&mut socket)?; - } - Command::Config => { - info!("Config Command 
Received"); + let mut socket = listener.accept().map_err(|e| { + warn!("Failed to accept migration connection: {}", e); + MigratableError::MigrateReceive(anyhow!("Failed to accept migration connection: {}", e)) + })?; - if !started { - warn!("Migration not started yet"); - Response::error().write_to(&mut socket)?; - continue; - } - memory_manager = Some(self.vm_receive_config( - &req, - &mut socket, - existing_memory_files.take(), - )?); - } - Command::State => { - info!("State Command Received"); + let mut state = ReceiveMigrationState::Established; - if !started { - warn!("Migration not started yet"); - Response::error().write_to(&mut socket)?; - continue; - } - if let Some(mm) = memory_manager.take() { - self.vm_receive_state(&req, &mut socket, mm)?; - } else { - warn!("Configuration not sent yet"); - Response::error().write_to(&mut socket)?; - } - } - Command::Memory => { - info!("Memory Command Received"); + while !state.finished() { + let req = Request::read_from(&mut socket)?; + trace!("Command {:?} received", req.command()); - if !started { - warn!("Migration not started yet"); - Response::error().write_to(&mut socket)?; - continue; - } - if let Some(mm) = memory_manager.as_ref() { - self.vm_receive_memory(&req, &mut socket, &mut mm.lock().unwrap())?; - } else { - warn!("Configuration not sent yet"); - Response::error().write_to(&mut socket)?; - } + let (response, new_state) = match self.vm_receive_migration_step( + &listener, + &mut socket, + state, + &req, + &receive_data_migration, + ) { + Ok(next_state) => (Response::ok(), next_state), + Err(err) => { + warn!("Migration command {:?} failed: {}", req.command(), err); + (Response::error(), ReceiveMigrationState::Aborted) } - Command::MemoryFd => { - info!("MemoryFd Command Received"); - - if !started { - warn!("Migration not started yet"); - Response::error().write_to(&mut socket)?; - continue; - } - - match &mut socket { - SocketStream::Unix(unix_socket) => { - let mut buf = [0u8; 4]; - let (_, file) = unix_socket.recv_with_fd(&mut buf).map_err(|e| { - MigratableError::MigrateReceive(anyhow!( - "Error receiving slot from socket: {e}" - )) - })?; - - if existing_memory_files.is_none() { - existing_memory_files = Some(HashMap::default()) - } + }; - if let Some(ref mut existing_memory_files) = existing_memory_files { - let slot = u32::from_le_bytes(buf); - existing_memory_files.insert(slot, file.unwrap()); - } + state = new_state; + assert_eq!(response.length(), 0); + response.write_to(&mut socket)?; + } - Response::ok().write_to(&mut socket)?; - } - SocketStream::Tcp(_tcp_socket) => { - // For TCP sockets, we cannot transfer file descriptors - warn!( - "MemoryFd command received over TCP socket, which is not supported" - ); - Response::error().write_to(&mut socket)?; - } - } - } - Command::Complete => { - info!("Complete Command Received"); - if let Some(ref mut vm) = self.vm.as_mut() { - vm.resume()?; - Response::ok().write_to(&mut socket)?; - } else { - warn!("VM not created yet"); - Response::error().write_to(&mut socket)?; - } - break; - } - Command::Abandon => { - info!("Abandon Command Received"); - self.vm = None; - self.vm_config = None; - Response::ok().write_to(&mut socket).ok(); - break; - } - } + if let ReceiveMigrationState::Aborted = state { + self.vm = MaybeVmOwnership::None; + self.vm_config = None; } Ok(()) @@ -2270,6 +3426,18 @@ impl RequestHandler for Vmm { &mut self, send_data_migration: VmSendMigrationData, ) -> result::Result<(), MigratableError> { + match self.vm { + MaybeVmOwnership::Vmm(_) => (), + 
MaybeVmOwnership::Migration => { + return Err(MigratableError::MigrateSend(anyhow!( + "There is already an ongoing migration" + ))); + } + MaybeVmOwnership::None => { + return Err(MigratableError::MigrateSend(anyhow!("VM is not running"))); + } + }; + info!( "Sending migration: destination_url = {}, local = {}", send_data_migration.destination_url, send_data_migration.local @@ -2289,41 +3457,29 @@ impl RequestHandler for Vmm { ))); } - if let Some(vm) = self.vm.as_mut() { - Self::send_migration( - vm, - #[cfg(all(feature = "kvm", target_arch = "x86_64"))] - self.hypervisor.clone(), - send_data_migration.clone(), - ) - .map_err(|migration_err| { - error!("Migration failed: {migration_err:?}"); - - // Stop logging dirty pages only for non-local migrations - if !send_data_migration.local - && let Err(e) = vm.stop_dirty_log() - { - return e; - } + // Take VM ownership. This also means that API events can no longer + // change the VM (e.g. net device hotplug). + let vm = self.vm.take_vm_for_migration(); - if vm.get_state().unwrap() == VmState::Paused - && let Err(e) = vm.resume() - { - return e; - } + // Start migration thread + let worker = MigrationWorker { + vm, + check_migration_evt: self.check_migration_evt.try_clone().unwrap(), + config: send_data_migration, + #[cfg(all(feature = "kvm", target_arch = "x86_64"))] + hypervisor: self.hypervisor.clone(), + }; - migration_err - })?; + self.migration_thread_handle = Some( + thread::Builder::new() + .name("migration".into()) + .spawn(move || worker.run()) + // For upstreaming, we should simply continue and return an + // error when this fails. For our PoC, this is fine. + .unwrap(), + ); - // Shutdown the VM after the migration succeeded - self.exit_evt.write(1).map_err(|e| { - MigratableError::MigrateSend(anyhow!( - "Failed shutting down the VM after migration: {e:?}" - )) - }) - } else { - Err(MigratableError::MigrateSend(anyhow!("VM is not running"))) - } + Ok(()) } } @@ -2398,6 +3554,7 @@ mod unit_tests { rng: RngConfig { src: PathBuf::from("/dev/urandom"), iommu: false, + bdf_device: None, }, balloon: None, fs: None, @@ -2407,12 +3564,17 @@ mod unit_tests { mode: ConsoleOutputMode::Null, iommu: false, socket: None, + url: None, + bdf_device: None, }, console: ConsoleConfig { file: None, - mode: ConsoleOutputMode::Tty, + // Caution: Don't use `Tty` to not mess with users terminal + mode: ConsoleOutputMode::Off, iommu: false, socket: None, + url: None, + bdf_device: None, }, #[cfg(target_arch = "x86_64")] debug_console: DebugConsoleConfig::default(), diff --git a/vmm/src/memory_manager.rs b/vmm/src/memory_manager.rs index 9a9b7f23e6..43d2b6e7cb 100644 --- a/vmm/src/memory_manager.rs +++ b/vmm/src/memory_manager.rs @@ -38,7 +38,7 @@ use vm_memory::guest_memory::FileOffset; use vm_memory::mmap::MmapRegionError; use vm_memory::{ Address, Error as MmapError, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, - GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion, ReadVolatile, + GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion, }; use vm_migration::protocol::{MemoryRange, MemoryRangeTable}; use vm_migration::{ @@ -986,7 +986,7 @@ impl MemoryManager { phys_bits: u8, #[cfg(feature = "tdx")] tdx_enabled: bool, restore_data: Option<&MemoryManagerSnapshotData>, - existing_memory_files: Option>, + existing_memory_files: HashMap, ) -> Result>, Error> { trace_scoped!("MemoryManager::new"); @@ -1021,7 +1021,7 @@ impl MemoryManager { &data.guest_ram_mappings, &zones, prefault, - existing_memory_files.unwrap_or_default(), + 
existing_memory_files, config.thp, )?; let guest_memory = @@ -1254,7 +1254,7 @@ impl MemoryManager { #[cfg(feature = "tdx")] false, Some(&mem_snapshot), - None, + Default::default(), )?; mm.lock() @@ -2077,47 +2077,6 @@ impl MemoryManager { debug!("coredump total bytes {}", total_bytes); Ok(()) } - - pub fn receive_memory_regions( - &mut self, - ranges: &MemoryRangeTable, - fd: &mut F, - ) -> std::result::Result<(), MigratableError> - where - F: ReadVolatile, - { - let guest_memory = self.guest_memory(); - let mem = guest_memory.memory(); - - for range in ranges.regions() { - let mut offset: u64 = 0; - // Here we are manually handling the retry in case we can't the - // whole region at once because we can't use the implementation - // from vm-memory::GuestMemory of read_exact_from() as it is not - // following the correct behavior. For more info about this issue - // see: https://github.com/rust-vmm/vm-memory/issues/174 - loop { - let bytes_read = mem - .read_volatile_from( - GuestAddress(range.gpa + offset), - fd, - (range.length - offset) as usize, - ) - .map_err(|e| { - MigratableError::MigrateReceive(anyhow!( - "Error receiving memory from socket: {e}" - )) - })?; - offset += bytes_read as u64; - - if offset == range.length { - break; - } - } - } - - Ok(()) - } } struct MemoryNotify { @@ -2600,20 +2559,19 @@ impl Migratable for MemoryManager { } }; - let dirty_bitmap: Vec = vm_dirty_bitmap + let dirty_bitmap = vm_dirty_bitmap .iter() .zip(vmm_dirty_bitmap.iter()) - .map(|(x, y)| x | y) - .collect(); + .map(|(x, y)| x | y); let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096); if sub_table.regions().is_empty() { - info!("Dirty Memory Range Table is empty"); + debug!("Dirty Memory Range Table is empty"); } else { - info!("Dirty Memory Range Table:"); + debug!("Dirty Memory Range Table:"); for range in sub_table.regions() { - info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024); + trace!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024); } } diff --git a/vmm/src/pci_segment.rs b/vmm/src/pci_segment.rs index 345869c1da..289ae853c7 100644 --- a/vmm/src/pci_segment.rs +++ b/vmm/src/pci_segment.rs @@ -163,15 +163,22 @@ impl PciSegment { ) } - pub(crate) fn next_device_bdf(&self) -> DeviceManagerResult { + /// Allocates a device BDF on this PCI segment. + /// + /// - `device_id`: Device ID to request for BDF allocation + /// + /// ## Errors + /// * [`DeviceManagerError::AllocatePciDeviceId`] if device ID + /// allocation on the bus fails. + pub(crate) fn allocate_device_bdf(&self, device_id: Option) -> DeviceManagerResult { Ok(PciBdf::new( self.id, 0, self.pci_bus .lock() .unwrap() - .next_device_id() - .map_err(DeviceManagerError::NextPciDeviceId)? as u8, + .allocate_device_id(device_id) + .map_err(DeviceManagerError::AllocatePciDeviceId)? as u8, 0, )) } @@ -201,6 +208,65 @@ impl PciSegment { Ok(()) } + + #[cfg(test)] + /// Creates a PciSegment without the need for an [`AddressManager`] + /// for testing purpose. + /// + /// An [`AddressManager`] would otherwise be required to create + /// [`PciBus`] instances. Instead, we use any struct that implements + /// [`DeviceRelocation`] to instantiate a [`PciBus`]. 
+ pub(crate) fn new_without_address_manager( + id: u16, + numa_node: u32, + mem32_allocator: Arc>, + mem64_allocator: Arc>, + pci_irq_slots: &[u8; 32], + device_reloc: Arc, + ) -> DeviceManagerResult { + let pci_root = PciRoot::new(None); + let pci_bus = Arc::new(Mutex::new(PciBus::new(pci_root, device_reloc.clone()))); + + let pci_config_mmio = Arc::new(Mutex::new(PciConfigMmio::new(Arc::clone(&pci_bus)))); + let mmio_config_address = + layout::PCI_MMCONFIG_START.0 + layout::PCI_MMIO_CONFIG_SIZE_PER_SEGMENT * id as u64; + + let start_of_mem32_area = mem32_allocator.lock().unwrap().base().0; + let end_of_mem32_area = mem32_allocator.lock().unwrap().end().0; + + let start_of_mem64_area = mem64_allocator.lock().unwrap().base().0; + let end_of_mem64_area = mem64_allocator.lock().unwrap().end().0; + + let segment = PciSegment { + id, + pci_bus, + pci_config_mmio, + mmio_config_address, + proximity_domain: numa_node, + pci_devices_up: 0, + pci_devices_down: 0, + #[cfg(target_arch = "x86_64")] + pci_config_io: None, + mem32_allocator, + mem64_allocator, + start_of_mem32_area, + end_of_mem32_area, + start_of_mem64_area, + end_of_mem64_area, + pci_irq_slots: *pci_irq_slots, + }; + + info!( + "Adding PCI segment: id={}, PCI MMIO config address: 0x{:x}, mem32 area [0x{:x}-0x{:x}, mem64 area [0x{:x}-0x{:x}", + segment.id, + segment.mmio_config_address, + segment.start_of_mem32_area, + segment.end_of_mem32_area, + segment.start_of_mem64_area, + segment.end_of_mem64_area + ); + Ok(segment) + } } struct PciDevSlot { @@ -473,3 +539,96 @@ impl Aml for PciSegment { .to_aml_bytes(sink) } } + +#[cfg(test)] +mod unit_tests { + use std::result::Result; + + use vm_memory::GuestAddress; + + use super::*; + + #[derive(Debug)] + struct MocRelocDevice; + impl DeviceRelocation for MocRelocDevice { + fn move_bar( + &self, + _old_base: u64, + _new_base: u64, + _len: u64, + _pci_dev: &mut dyn pci::PciDevice, + _region_type: pci::PciBarRegionType, + ) -> Result<(), std::io::Error> { + Ok(()) + } + } + + fn setup() -> PciSegment { + let guest_addr = 0_u64; + let guest_size = 0x1000_usize; + let allocator_1 = Arc::new(Mutex::new( + AddressAllocator::new(GuestAddress(guest_addr), guest_size as u64).unwrap(), + )); + let allocator_2 = Arc::new(Mutex::new( + AddressAllocator::new(GuestAddress(guest_addr), guest_size as u64).unwrap(), + )); + let moc_device_reloc = Arc::new(MocRelocDevice {}); + let arr = [0_u8; 32]; + + PciSegment::new_without_address_manager( + 0, + 0, + allocator_1, + allocator_2, + &arr, + moc_device_reloc, + ) + .unwrap() + } + + #[test] + // Test the default bdf for a segment with an empty bus (except for the root device) + fn allocate_device_bdf_default() { + // The first address is occupied by the root + let segment = setup(); + let bdf = segment.allocate_device_bdf(None).unwrap(); + assert_eq!(bdf.segment(), segment.id); + assert_eq!(bdf.bus(), 0); + assert_eq!(bdf.device(), 1); + assert_eq!(bdf.function(), 0); + } + + #[test] + // Test to acquire a bdf with s specific device ID + fn allocate_device_bdf_fixed_device_id() { + // The first address is occupied by the root + let expect_device_id = 0x10_u8; + let segment = setup(); + let bdf = segment.allocate_device_bdf(Some(expect_device_id)).unwrap(); + assert_eq!(bdf.segment(), segment.id); + assert_eq!(bdf.bus(), 0); + assert_eq!(bdf.device(), expect_device_id); + assert_eq!(bdf.function(), 0); + } + + #[test] + // Test to acquire a bdf with invalid device id, one already + // taken and the other being greater then the number of allowed + // devices 
per bus. + fn allocate_device_bdf_invalid_device_id() { + // The first address is occupied by the root + let already_taken_device_id = 0x0_u8; + let overflow_device_id = 0xff_u8; + let segment = setup(); + let bdf_res = segment.allocate_device_bdf(Some(already_taken_device_id)); + assert!(matches!( + bdf_res, + Err(DeviceManagerError::AllocatePciDeviceId(_)) + )); + let bdf_res = segment.allocate_device_bdf(Some(overflow_device_id)); + assert!(matches!( + bdf_res, + Err(DeviceManagerError::AllocatePciDeviceId(_)) + )); + } +} diff --git a/vmm/src/seccomp_filters.rs b/vmm/src/seccomp_filters.rs index afb304c066..a359a929e5 100644 --- a/vmm/src/seccomp_filters.rs +++ b/vmm/src/seccomp_filters.rs @@ -376,7 +376,7 @@ fn create_vmm_ioctl_seccomp_rule_kvm() -> Result, BackendError> const KVM_GET_SREGS: u64 = 0x8138_ae83; const KVM_GET_TSC_KHZ: u64 = 0xaea3; const KVM_GET_XCRS: u64 = 0x8188_aea6; - const KVM_GET_XSAVE: u64 = 0x9000_aea4; + const KVM_GET_XSAVE2: u64 = 0x9000_aecf; const KVM_KVMCLOCK_CTRL: u64 = 0xaead; const KVM_SET_CLOCK: u64 = 0x4030_ae7b; const KVM_SET_CPUID2: u64 = 0x4008_ae90; @@ -404,7 +404,7 @@ fn create_vmm_ioctl_seccomp_rule_kvm() -> Result, BackendError> and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_SREGS)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_TSC_KHZ)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_XCRS,)?], - and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_XSAVE,)?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_XSAVE2,)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_KVMCLOCK_CTRL)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_CLOCK)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_CPUID2)?], @@ -550,6 +550,8 @@ fn vmm_thread_rules( (libc::SYS_accept4, vec![]), #[cfg(target_arch = "x86_64")] (libc::SYS_access, vec![]), + #[cfg(target_arch = "x86_64")] + (libc::SYS_arch_prctl, vec![]), (libc::SYS_bind, vec![]), (libc::SYS_brk, vec![]), (libc::SYS_clock_gettime, vec![]), @@ -815,6 +817,7 @@ fn vcpu_thread_rules( (libc::SYS_rt_sigreturn, vec![]), (libc::SYS_sched_yield, vec![]), (libc::SYS_sendmsg, vec![]), + (libc::SYS_sendto, vec![]), (libc::SYS_shutdown, vec![]), (libc::SYS_sigaltstack, vec![]), (libc::SYS_tgkill, vec![]), @@ -864,6 +867,8 @@ fn http_api_thread_rules() -> Result)>, BackendError> (libc::SYS_write, vec![]), (libc::SYS_rt_sigprocmask, vec![]), (libc::SYS_getcwd, vec![]), + (libc::SYS_clock_nanosleep, vec![]), + (libc::SYS_read, vec![]), ]) } diff --git a/vmm/src/serial_manager.rs b/vmm/src/serial_manager.rs index 2df1ba3ca6..4bc4e447dc 100644 --- a/vmm/src/serial_manager.rs +++ b/vmm/src/serial_manager.rs @@ -3,9 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // +use std::any::TypeId; +use std::collections::HashMap; use std::fs::File; -use std::io::Read; -use std::net::Shutdown; +use std::io::{Read, Write}; +use std::net::{Shutdown, TcpStream}; use std::os::unix::io::{AsRawFd, FromRawFd, IntoRawFd}; use std::os::unix::net::UnixStream; use std::panic::AssertUnwindSafe; @@ -67,9 +69,9 @@ pub enum Error { #[error("Error accepting connection")] AcceptConnection(#[source] io::Error), - /// Cannot clone the UnixStream - #[error("Error cloning UnixStream")] - CloneUnixStream(#[source] io::Error), + /// Cannot clone the Stream + #[error("Error cloning Stream")] + CloneStream(#[source] io::Error), /// Cannot shutdown the connection #[error("Error shutting down a connection")] @@ -91,9 +93,10 @@ pub enum EpollDispatch { File = 0, Kill = 1, Socket = 2, + Tcp = 3, Unknown, } -const EPOLL_EVENTS_LEN: usize = 4; +const EPOLL_EVENTS_LEN: usize = 5; impl From for 
EpollDispatch { fn from(v: u64) -> Self { @@ -102,11 +105,64 @@ impl From for EpollDispatch { 0 => File, 1 => Kill, 2 => Socket, + 3 => Tcp, _ => Unknown, } } } +/// A thread-safe writer that fans out to multiple keyed writers. Allows for +/// bundling different kinds of writers for the serial device, e.g. writing to +/// a TCP socket and a file. +#[derive(Clone)] +pub struct FanoutWriter { + writers: Arc>>>, +} + +impl FanoutWriter { + pub fn new() -> Self { + FanoutWriter { + writers: Arc::new(Mutex::new(HashMap::new())), + } + } + + pub fn add_writer(&self, writer: W) { + let mut writers = self.writers.lock().unwrap(); + writers.insert(TypeId::of::(), Box::new(writer)); + } + + pub fn remove_writer(&self, id: TypeId) -> Option> { + let mut writers = self.writers.lock().unwrap(); + writers.remove(&id) + } +} + +impl Write for FanoutWriter { + fn write(&mut self, buf: &[u8]) -> io::Result { + let mut writers = self.writers.lock().unwrap(); + let mut result: io::Result = Ok(buf.len()); + + for (i, w) in writers.values_mut().enumerate() { + let r = w.write(buf); + if i == 0 { + result = r; + } else if let Err(e) = r { + return Err(e); + } + } + + result + } + + fn flush(&mut self) -> io::Result<()> { + let mut writers = self.writers.lock().unwrap(); + for w in writers.values_mut() { + w.flush()?; + } + Ok(()) + } +} + pub struct SerialManager { #[cfg(any(target_arch = "x86_64", target_arch = "riscv64"))] serial: Arc>, @@ -165,6 +221,7 @@ impl SerialManager { } fd.as_raw_fd() } + ConsoleOutput::Tcp(ref fd, _) => fd.as_raw_fd(), _ => return Ok(None), }; @@ -179,10 +236,14 @@ impl SerialManager { ) .map_err(Error::Epoll)?; - let epoll_fd_data = if let ConsoleOutput::Socket(_) = output { - EpollDispatch::Socket - } else { - EpollDispatch::File + let epoll_fd_data = match output { + ConsoleOutput::File(_) => EpollDispatch::File, + ConsoleOutput::Pty(_) => EpollDispatch::File, + ConsoleOutput::Tty(_) => EpollDispatch::File, + ConsoleOutput::Null => EpollDispatch::File, + ConsoleOutput::Off => EpollDispatch::File, + ConsoleOutput::Socket(_) => EpollDispatch::Socket, + ConsoleOutput::Tcp(_, _) => EpollDispatch::Tcp, }; epoll::ctl( @@ -259,6 +320,7 @@ impl SerialManager { let serial = self.serial.clone(); let pty_write_out = self.pty_write_out.clone(); let mut reader: Option = None; + let mut reader_tcp: Option = None; // In case of PTY, we want to be able to detect a connection on the // other end of the PTY. 
This is done by detecting there's no event @@ -272,6 +334,17 @@ impl SerialManager { .name("serial-manager".to_string()) .spawn(move || { std::panic::catch_unwind(AssertUnwindSafe(move || { + let write_distributor = FanoutWriter::new(); + + if let ConsoleOutput::Tcp(_, Some(f)) = &in_file { + write_distributor.add_writer(f.clone()); + serial + .as_ref() + .lock() + .unwrap() + .set_out(Some(Box::new(write_distributor.clone()))); + } + let mut events = [epoll::Event::new(epoll::Events::empty(), 0); EPOLL_EVENTS_LEN]; @@ -328,10 +401,9 @@ impl SerialManager { let (unix_stream, _) = listener.accept().map_err(Error::AcceptConnection)?; let writer = - unix_stream.try_clone().map_err(Error::CloneUnixStream)?; - reader = Some( - unix_stream.try_clone().map_err(Error::CloneUnixStream)?, - ); + unix_stream.try_clone().map_err(Error::CloneStream)?; + reader = + Some(unix_stream.try_clone().map_err(Error::CloneStream)?); epoll::ctl( epoll_fd, @@ -345,6 +417,41 @@ impl SerialManager { .map_err(Error::Epoll)?; serial.lock().unwrap().set_out(Some(Box::new(writer))); } + EpollDispatch::Tcp => { + // New connection request arrived. + // Shutdown the previous connection, if any + if let Some(ref previous_reader) = reader_tcp { + previous_reader + .shutdown(Shutdown::Both) + .map_err(Error::AcceptConnection)?; + write_distributor.remove_writer(TypeId::of::()); + } + + let ConsoleOutput::Tcp(ref listener, _) = in_file else { + unreachable!(); + }; + + // Events on the listening socket will be connection requests. + // Accept them, create a reader and a writer. + let (tcp_stream, _) = + listener.accept().map_err(Error::AcceptConnection)?; + let writer = + tcp_stream.try_clone().map_err(Error::CloneStream)?; + reader_tcp = + Some(tcp_stream.try_clone().map_err(Error::CloneStream)?); + + epoll::ctl( + epoll_fd, + epoll::ControlOptions::EPOLL_CTL_ADD, + tcp_stream.into_raw_fd(), + epoll::Event::new( + epoll::Events::EPOLLIN, + EpollDispatch::File as u64, + ), + ) + .map_err(Error::Epoll)?; + write_distributor.add_writer(writer); + } EpollDispatch::File => { if event.events & libc::EPOLLIN as u32 != 0 { let mut input = [0u8; 64]; @@ -371,6 +478,27 @@ impl SerialManager { 0 } } + ConsoleOutput::Tcp(_, _) => { + if let Some(mut serial_reader) = reader_tcp.as_ref() + { + let count = serial_reader + .read(&mut input) + .map_err(Error::ReadInput)?; + if count == 0 { + info!("Remote end closed serial socket"); + serial_reader + .shutdown(Shutdown::Both) + .map_err(Error::ShutdownConnection)?; + reader_tcp = None; + write_distributor.remove_writer( + TypeId::of::(), + ); + } + count + } else { + 0 + } + } ConsoleOutput::Pty(file) | ConsoleOutput::Tty(file) => { (&**file) .read(&mut input) diff --git a/vmm/src/vcpu_throttling.rs b/vmm/src/vcpu_throttling.rs new file mode 100644 index 0000000000..7e74b702d5 --- /dev/null +++ b/vmm/src/vcpu_throttling.rs @@ -0,0 +1,604 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 + +//! # vCPU throttling for Auto Converging +//! +//! vCPU throttling is crucial to reach a reasonable downtime when using a +//! precopy strategy for live-migration of VMs with memory-intensive workloads. +//! Auto converge means an increasing vCPU throttling over time until the memory +//! delta is small enough for the migration thread(s) to perform the switch-over +//! to the new host. +//! +//! Therefore, the migration thread(s) use this thread to help them reach their +//! goal. Next to typical lifecycle management, this thread must fulfill various +//! 
requirements to ensure a minimal downtime. +//! +//! ## Thread Requirements +//! - Needs to be able to gracefully wait for work. +//! - Must be able to exit gracefully. +//! - Must be able to cancel any work and return to its init state to support +//! live-migration cancellation and restart of live-migrations. +//! - Must not block the migration thread(s) whenever possible, to facilitate +//! fast live-migrations with short downtimes. +//! - Must be interruptible during a sleep phase to not block the migration +//! thread(s). +//! - Must not confuse or hinder the migration thread(s) regarding +//! pause()/resume() operations. Context: migration thread shuts down the +//! vCPUs for the handover. The throttle thread must not restart the vCPUs +//! again. + +use std::cell::Cell; +use std::cmp::min; +use std::sync::mpsc::RecvTimeoutError; +use std::sync::{Arc, Mutex, mpsc}; +use std::thread; +use std::thread::JoinHandle; +use std::time::{Duration, Instant}; + +use vm_migration::Pausable; + +use crate::cpu::CpuManager; + +/// The possible command of the thread, i.e., the current state. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +enum ThrottleCommand { + /// Waiting for next event. + Waiting, + /// Ongoing vCPU throttling. + /// + /// The inner value shows the current throttling percentage in range `1..=99`. + Throttling(u8 /* `1..=99` */), + /// Thread is shutting down gracefully. + Exiting, +} + +/// Helper to adapt the throttling timeslice as we go, depending on the time it +/// takes to pause() and resume() all vCPUs. +#[derive(Debug)] +struct TimesliceContext { + current_timeslice: Duration, + /// Duration it took to pause() all vCPUs on the previous iteration. + previous_pause_duration: Duration, + /// Duration it took to resume() all vCPUs on the previous iteration. + previous_resume_duration: Duration, +} + +impl TimesliceContext { + /// The initial timeslice for a throttling cycle (vCPU pause & resume). + const INITIAL_TIMESLICE: Duration = Duration::from_millis(100); + + /// The minimal value for the operations. + /// + /// Any value smaller than this is upgraded to this to prevent math + /// exceptions during timing calculations. + const MIN_DURATION: Duration = Duration::from_millis(1); + + /// Maximum time slice. This should not be too big. + /// + /// Otherwise, for example: Assuming we have 10% throttling and + /// 2000ms time slice, then the WM will be unresponsive for + /// 200ms every 1800ms. This is not convenient. /// + const MAX_TIMESLICE: Duration = Duration::from_millis(800); + + /// Creates a new instance with [`Self::INITIAL_TIMESLICE`]. + fn new() -> Self { + Self { + current_timeslice: Self::INITIAL_TIMESLICE, + previous_pause_duration: Self::MIN_DURATION, + previous_resume_duration: Self::MIN_DURATION, + } + } + + /// Updates the timeslice. + fn update_timeslice(&mut self) { + // CpuManager::pause() plus CpuManager::resume() without additional delay is the shortest + // we can get. + let one_percent = self.previous_pause_duration + self.previous_resume_duration; + self.current_timeslice = one_percent * 100; + self.current_timeslice = min(self.current_timeslice, Self::MAX_TIMESLICE); + } + + /// Calculates the sleep durations for after the `pause()` and `resume()` operations with + /// the current `timeslice`. + /// + /// It uses the `timeslice` that was calculated on the previous + /// invocation of [`Self::update_timeslice`]. 
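+    ///
+    /// For example, with a 100 ms timeslice and 30 % throttling, the vCPUs stay
+    /// paused for roughly 30 ms and run for roughly 70 ms per cycle, with each
+    /// sleep shortened by the time the previous pause()/resume() calls themselves took.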
+ fn calc_sleep_durations( + &mut self, + percentage: u64, + ) -> ( + Duration, /* after pause */ + Duration, /* after resume */ + ) { + assert!(percentage <= 100); + assert!(percentage > 0); + + let timeslice_ms = self.current_timeslice.as_millis() as u64; + let wait_ms_after_pause_ms = timeslice_ms * percentage / 100; + let wait_ms_after_resume_ms = timeslice_ms - wait_ms_after_pause_ms; + + let wait_ms_after_pause_ms = + wait_ms_after_pause_ms.saturating_sub(self.previous_pause_duration.as_millis() as u64); + let wait_ms_after_resume_ms = wait_ms_after_resume_ms + .saturating_sub(self.previous_resume_duration.as_millis() as u64); + + ( + Duration::from_millis(wait_ms_after_pause_ms), + Duration::from_millis(wait_ms_after_resume_ms), + ) + } + + /// Set the previous pause duration. + /// + /// In case this is below [`Self::MIN_DURATION`], we upgrade it to [`Self::MIN_DURATION`]. + pub fn set_previous_pause_duration(&mut self, mut duration: Duration) { + if duration < Self::MIN_DURATION { + duration = Self::MIN_DURATION + } + + self.previous_pause_duration = duration; + } + + /// Set the duration it took to `resume()` all vCPUs on the previous iteration. + /// + /// In case this is below [`Self::MIN_DURATION`], we upgrade it to [`Self::MIN_DURATION`]. + pub fn set_previous_resume_duration(&mut self, mut duration: Duration) { + if duration < Self::MIN_DURATION { + duration = Self::MIN_DURATION + } + self.previous_resume_duration = duration; + } +} + +/// Context of the vCPU throttle thread. +// The main justification for this dedicated type is to split the thread +// functions from the higher-level control API. +// TODO seccomp is missing +pub struct ThrottleWorker { + handle: Option>, +} + +impl ThrottleWorker { + /// This should not be named "vcpu*" as libvirt fails when + /// iterating the vCPU threads then. Fix this first in libvirt! + const THREAD_NAME: &'static str = "throttle-vcpu"; + + /// Executes the provided callback and goes to sleep until the specified + /// `sleep_duration` passed. + /// + /// The time to execute the callback itself is not taken into account + /// when sleeping for `sleep_duration`. Therefore, the callback is + /// supposed to be quick (a couple of milliseconds). + /// + /// The thread is interruptible during the sleep phase when the `receiver` + /// receives a new [`ThrottleCommand`]. + /// + /// # Arguments + /// - `callback`: Function to run + /// - `set_callback_duration`: Set the duration to execute the callback. + /// - `sleep_duration`: Duration this function takes at most, including + /// running the `callback`. + /// - `receiver`: Receiving end of the channel to the migration managing + /// thread. + fn execute_and_wait_interruptible( + callback: &impl Fn(), + mut set_callback_duration: impl FnMut(Duration), + sleep_duration: Duration, + receiver: &mpsc::Receiver, + ) -> Option { + let begin = Instant::now(); + callback(); + let cb_duration = begin.elapsed(); + // Help to adjust the timeslice in the next cycle. + set_callback_duration(cb_duration); + + // It might happen that sometimes we get interrupted during a sleep phase + // with a new higher throttle percentage but this is negligible. For an + // auto-converge cycle, there are typically only ~10 steps involved over + // a time frame from a couple of seconds up to a couple of minutes. 
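+        // The recv_timeout() below doubles as the interruptible sleep: it only
+        // returns early when a new command arrives on the channel.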
+ match receiver.recv_timeout(sleep_duration) { + Ok(next_task) => Some(next_task), + Err(RecvTimeoutError::Timeout) => None, + Err(RecvTimeoutError::Disconnected) => { + panic!("thread and channel should exit gracefully") + } + } + } + + /// Executes one throttling step: either pause or resume of vCPUs. + /// + /// Runs the given callback, then waits for the specified duration, unless + /// interrupted by a new [`ThrottleCommand`]. + /// + /// # Behavior + /// - Runs the provided `callback` immediately. + /// - Waits up to `duration` for new commands on the `receiver`. + /// - If no command arrives before the timeout, this step completes + /// normally and returns `None`. + /// - If a [`ThrottleCommand::Throttling`] arrives, updates the current + /// throttle percentage in `current_throttle` and continues with the + /// loop. Returns `None`. + /// - If a [`ThrottleCommand::Waiting`] or [`ThrottleCommand::Exiting`] + /// arrives, this command is forwarded to the caller. + /// + /// # Arguments + /// - `callback`: Function to run (e.g., pause or resume vCPUs). + /// - `set_callback_duration`: Set the duration to execute the callback. + /// - `receiver`: Channel for receiving new [`ThrottleCommand`]s. + /// - `current_throttle`: Mutable reference to the current throttle + /// percentage (updated on [`ThrottleCommand::Throttling`]). + /// + /// # Returns + /// - `None` if the throttling cycle should continue. + /// - `Some(ThrottleCommand::Waiting | ThrottleCommand::Exiting)` if + /// throttling should stop. + fn throttle_step( + callback: &F, + set_callback_duration: impl FnMut(Duration), + duration: Duration, + receiver: &mpsc::Receiver, + current_throttle: &mut u64, + ) -> Option + where + F: Fn(), + { + let maybe_task = Self::execute_and_wait_interruptible( + callback, + set_callback_duration, + duration, + receiver, + ); + match maybe_task { + None => None, + Some(ThrottleCommand::Throttling(next)) => { + // A new throttle value is only applied at the end of a full + // throttling cycle. This is fine and negligible in a series of + // (tens of) thousands of cycles. + *current_throttle = next as u64; + None + } + Some(cmd @ (ThrottleCommand::Exiting | ThrottleCommand::Waiting)) => Some(cmd), + } + } + + /// Helper for [`Self::control_loop`] that runs the actual throttling loop. + /// + /// This function returns the next [`ThrottleCommand`] **only** if the thread + /// stopped the vCPU throttling. + fn throttle_loop( + receiver: &mpsc::Receiver, + initial_throttle: u8, + callback_pause_vcpus: &impl Fn(), + callback_resume_vcpus: &impl Fn(), + ) -> ThrottleCommand { + // The current throttle value, as long as the thread is throttling. + let mut current_throttle = initial_throttle as u64; + let mut timeslice_ctx = TimesliceContext::new(); + + loop { + // Catch logic bug: We should have exited in this case already. + assert_ne!(current_throttle, 0); + assert!(current_throttle < 100); + + let (wait_ms_after_pause, wait_ms_after_resume) = + timeslice_ctx.calc_sleep_durations(current_throttle); + + // pause vCPUs + if let Some(cmd) = Self::throttle_step( + callback_pause_vcpus, + |new_duration| timeslice_ctx.set_previous_pause_duration(new_duration), + wait_ms_after_pause, + receiver, + &mut current_throttle, + ) { + // TODO: future optimization + // Prevent unnecessary resume() here when the migration thread + // performs .pause() right after anyway. We could make .pause() and + // .resume() idempotent. 
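+                // Resume the vCPUs before leaving the throttle loop so the guest is not left paused.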
+                callback_resume_vcpus();
+                // We only exit here in case of ThrottleCommand::Waiting or ::Exiting
+                return cmd;
+            }
+
+            // resume vCPUs
+            if let Some(cmd) = Self::throttle_step(
+                callback_resume_vcpus,
+                |new_duration| timeslice_ctx.set_previous_resume_duration(new_duration),
+                wait_ms_after_resume,
+                receiver,
+                &mut current_throttle,
+            ) {
+                // We only exit here in case of ThrottleCommand::Waiting or ::Exiting
+                return cmd;
+            }
+
+            // Update timeslice for next cycle. This way, we can closely match the expected
+            // percentage for pause() and resume().
+            timeslice_ctx.update_timeslice();
+        }
+    }
+
+    /// Implements the control loop of the thread.
+    ///
+    /// It wraps the actual throttling with the necessary thread lifecycle
+    /// management.
+    fn control_loop(
+        receiver: mpsc::Receiver<ThrottleCommand>,
+        callback_pause_vcpus: impl Fn() + Send + 'static,
+        callback_resume_vcpus: impl Fn() + Send + 'static,
+    ) -> impl Fn() {
+        move || {
+            // In the outer loop, we gracefully wait for commands.
+            'control: loop {
+                let thread_task = receiver.recv().expect("channel should not be closed");
+                match thread_task {
+                    ThrottleCommand::Exiting => {
+                        break 'control;
+                    }
+                    ThrottleCommand::Waiting => {
+                        continue 'control;
+                    }
+                    ThrottleCommand::Throttling(initial_throttle) => {
+                        let next_task = Self::throttle_loop(
+                            &receiver,
+                            initial_throttle,
+                            &callback_pause_vcpus,
+                            &callback_resume_vcpus,
+                        );
+                        if next_task == ThrottleCommand::Exiting {
+                            break 'control;
+                        }
+                        // else: thread is in Waiting state
+                    }
+                }
+            }
+            debug!("thread exited gracefully");
+        }
+    }
+
+    /// Spawns a new thread.
+    fn spawn(
+        receiver: mpsc::Receiver<ThrottleCommand>,
+        callback_pause_vcpus: impl Fn() + Send + 'static,
+        callback_resume_vcpus: impl Fn() + Send + 'static,
+    ) -> Self {
+        let handle = {
+            let thread_fn =
+                Self::control_loop(receiver, callback_pause_vcpus, callback_resume_vcpus);
+            thread::Builder::new()
+                .name(String::from(Self::THREAD_NAME))
+                .spawn(thread_fn)
+                .expect("should spawn thread")
+        };
+
+        Self {
+            handle: Some(handle),
+        }
+    }
+}
+
+impl Drop for ThrottleWorker {
+    fn drop(&mut self) {
+        // Note: The thread handle must send the shutdown command first.
+        if let Some(handle) = self.handle.take() {
+            handle.join().expect("thread should have succeeded");
+        }
+    }
+}
+
+/// Handler for controlling the vCPU throttle thread.
+///
+/// vCPU throttling is needed for live-migration of memory-intensive workloads.
+/// The current design assumes that all vCPUs are throttled equally.
+///
+/// # Transitions
+/// - `Waiting` -> `Throttling(x %)`, `Exiting`
+/// - `Throttling(x %)` -> `Exiting`, `Waiting`, `Throttling(y %)`
+/// - `Exiting`
+pub struct ThrottleThreadHandle {
+    /// Thread state wrapped by synchronization primitives.
+    state_sender: mpsc::Sender<ThrottleCommand>,
+    /// Current throttle value.
+    ///
+    /// This is the last throttle value that was sent to the
+    /// thread.
+    current_throttle: Cell<u8>,
+    /// The underlying thread handle. Option to have more control over when it is dropped.
+    throttle_thread: Option<ThrottleWorker>,
+}
+
+impl ThrottleThreadHandle {
+    /// Spawns a new thread and returns a handle to it.
+    ///
+    /// # Parameters
+    /// - `cpu_manager`: CPU manager to pause and resume vCPUs
+    pub fn new_from_cpu_manager(cpu_manager: &Arc<Mutex<CpuManager>>) -> Self {
+        let callback_pause_vcpus = {
+            let cpu_manager = cpu_manager.clone();
+            Box::new(move || cpu_manager.lock().unwrap().pause().unwrap())
+        };
+
+        let callback_resume_vcpus = {
+            let cpu_manager = cpu_manager.clone();
+            Box::new(move || cpu_manager.lock().unwrap().resume().unwrap())
+        };
+
+        Self::new(callback_pause_vcpus, callback_resume_vcpus)
+    }
+
+    /// Spawns a new thread and returns a handle to it.
+    ///
+    /// This function returns when the thread gracefully arrived in
+    /// [`ThrottleCommand::Waiting`].
+    ///
+    /// # Parameters
+    /// - `callback_pause_vcpus`: Function putting all vCPUs into pause state. The
+    ///   function must not perform any artificial delay itself.
+    /// - `callback_resume_vcpus`: Function putting all vCPUs back into running
+    ///   state. The function must not perform any artificial delay itself.
+    fn new(
+        callback_pause_vcpus: Box<dyn Fn() + Send + 'static>,
+        callback_resume_vcpus: Box<dyn Fn() + Send + 'static>,
+    ) -> Self {
+        // Channel used for synchronization.
+        let (sender, receiver) = mpsc::channel::<ThrottleCommand>();
+
+        let thread = ThrottleWorker::spawn(receiver, callback_pause_vcpus, callback_resume_vcpus);
+
+        Self {
+            state_sender: sender,
+            current_throttle: Cell::new(0),
+            throttle_thread: Some(thread),
+        }
+    }
+
+    /// Sets the throttle percentage to a value in range `0..=99` and updates
+    /// the thread's state.
+    ///
+    /// Setting the value back to `0` equals setting the thread back into
+    /// [`ThrottleCommand::Waiting`].
+    ///
+    /// In case of an ongoing throttling cycle (vCPU pause & resume), any new
+    /// throttling percentage will be applied no later than when the current cycle
+    /// ends.
+    ///
+    /// # Panic
+    /// Panics if `percent_new` is not in range `0..=99`.
+    pub fn set_throttle_percent(&self, percent_new: u8) {
+        assert!(
+            percent_new < 100,
+            "setting a percentage of 100 or above is not allowed: {percent_new}%"
+        );
+
+        // We have no problematic race condition here as in normal operation
+        // there is exactly one thread calling these functions.
+        let percent_old = self.throttle_percent();
+
+        // Return early, no action needed.
+        if percent_old == percent_new {
+            return;
+        }
+
+        if percent_new == 0 {
+            self.state_sender
+                .send(ThrottleCommand::Waiting)
+                .expect("channel should not be closed");
+        } else {
+            self.state_sender
+                .send(ThrottleCommand::Throttling(percent_new))
+                .expect("channel should not be closed");
+        };
+
+        self.current_throttle.set(percent_new);
+    }
+
+    /// Get the current throttle percentage in range `0..=99`.
+    ///
+    /// Please note that the value is not synchronized.
+    pub fn throttle_percent(&self) -> u8 {
+        self.current_throttle.get()
+    }
+
+    /// Stops and terminates the thread gracefully.
+    ///
+    /// Waits for the thread to finish. This function **must** be called before
+    /// the migration thread(s) do anything with the CPU manager to prevent
+    /// odd states.
+    pub fn shutdown(&mut self) {
+        let begin = Instant::now();
+
+        {
+            // drop thread; ensure that the channel is still alive when it is dropped
+            if let Some(worker) = self.throttle_thread.take() {
+                self.state_sender
+                    .send(ThrottleCommand::Exiting)
+                    .expect("channel should not be closed");
+
+                // Ensure the sender is still living when this is dropped.
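+                // Dropping the worker joins the throttle thread (see `impl Drop for ThrottleWorker`).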
+ drop(worker); + } + } + + let elapsed = begin.elapsed(); + if elapsed > Duration::from_millis(20) { + warn!( + "shutting down thread takes too long ({} ms): this increases the downtime!", + elapsed.as_millis() + ); + } + } +} + +impl Drop for ThrottleThreadHandle { + fn drop(&mut self) { + self.shutdown(); + } +} + +#[cfg(test)] +mod tests { + use std::sync::atomic::{AtomicBool, Ordering}; + use std::thread::sleep; + + use super::*; + + // The test is successful if it does not get stuck. Then, the thread exits + // gracefully. + #[test] + fn test_vcpu_throttling_thread_lifecycle() { + for _ in 0..5 { + // State transitions: Waiting -> Exit + { + let mut handler = ThrottleThreadHandle::new(Box::new(|| {}), Box::new(|| {})); + + // The test is successful if it does not get stuck. + handler.shutdown(); + } + + // Dummy CpuManager + let cpus_throttled = Arc::new(AtomicBool::new(false)); + let callback_pause_vcpus = { + let cpus_running = cpus_throttled.clone(); + Box::new(move || { + let old = cpus_running.swap(true, Ordering::SeqCst); + assert!(!old); + }) + }; + let callback_resume_vcpus = { + let cpus_running = cpus_throttled.clone(); + Box::new(move || { + let old = cpus_running.swap(false, Ordering::SeqCst); + assert!(old); + }) + }; + + // State transitions: Waiting -> Throttle -> Waiting -> Throttle -> Exit + { + let mut handler = + ThrottleThreadHandle::new(callback_pause_vcpus, callback_resume_vcpus); + handler.set_throttle_percent(5); + sleep(TimesliceContext::INITIAL_TIMESLICE); + handler.set_throttle_percent(10); + sleep(TimesliceContext::INITIAL_TIMESLICE); + + // Assume we aborted vCPU throttling (or the live-migration at all). + handler.set_throttle_percent(0 /* reset to waiting */); + handler.set_throttle_percent(5); + sleep(TimesliceContext::INITIAL_TIMESLICE); + handler.set_throttle_percent(10); + sleep(TimesliceContext::INITIAL_TIMESLICE); + + // The test is successful if we don't have a panic here due to a + // closed channel. + for _ in 0..10 { + handler.shutdown(); + sleep(Duration::from_millis(1)); + } + + // The test is successful if it does not get stuck. 
+                drop(handler);
+            }
+        }
+    }
+}
diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs
index 2271c65fd9..2093312c6c 100644
--- a/vmm/src/vm.rs
+++ b/vmm/src/vm.rs
@@ -64,9 +64,7 @@ use tracer::trace_scoped;
 use vm_device::Bus;
 #[cfg(feature = "tdx")]
 use vm_memory::{Address, ByteValued, GuestMemoryRegion, ReadVolatile};
-use vm_memory::{
-    Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, WriteVolatile,
-};
+use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic};
 use vm_migration::protocol::{MemoryRangeTable, Request, Response};
 use vm_migration::{
     Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, snapshot_from_id,
@@ -95,6 +93,7 @@ use crate::migration::get_vm_snapshot;
 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
 use crate::migration::url_to_file;
 use crate::migration::{SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE, url_to_path};
+use crate::vcpu_throttling::ThrottleThreadHandle;
 #[cfg(feature = "fw_cfg")]
 use crate::vm_config::FwCfgConfig;
 use crate::vm_config::{
@@ -178,6 +177,9 @@ pub enum Error {
     #[error("VM is not running")]
     VmNotRunning,
 
+    #[error("VM is currently migrating and can't be modified")]
+    VmMigrating,
+
     #[error("Cannot clone EventFd")]
     EventFdClone(#[source] io::Error),
 
@@ -244,6 +246,9 @@ pub enum Error {
     #[error("Failed resizing a memory zone")]
     ResizeZone,
 
+    #[error("Failed resizing a disk image")]
+    ResizeDisk,
+
     #[error("Cannot activate virtio devices")]
     ActivateVirtioDevices(#[source] DeviceManagerError),
 
@@ -521,6 +526,7 @@ pub struct Vm {
     hypervisor: Arc<dyn hypervisor::Hypervisor>,
     stop_on_boot: bool,
     load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>,
+    vcpu_throttler: ThrottleThreadHandle,
 }
 
 impl Vm {
@@ -807,6 +813,10 @@ impl Vm {
             VmState::Created
         };
 
+        // TODO: We could also spawn the thread only when a migration with
+        // auto-converge starts. That is probably the better design.
+        let vcpu_throttler = ThrottleThreadHandle::new_from_cpu_manager(&cpu_manager);
+
         Ok(Vm {
             #[cfg(feature = "tdx")]
             kernel,
@@ -826,6 +836,7 @@
             hypervisor,
             stop_on_boot,
             load_payload_handle,
+            vcpu_throttler,
         })
     }
 
@@ -979,6 +990,31 @@ impl Vm {
         Ok(numa_nodes)
     }
 
+    /// Sets the throttle percentage to a value in range `0..=99`.
+    ///
+    /// Setting the value back to `0` brings the thread back into a waiting
+    /// state.
+    ///
+    /// # Panic
+    /// Panics if `percent` is not in range `0..=99`.
+    pub fn set_throttle_percent(&self, percent: u8 /* 0..=99 */) {
+        self.vcpu_throttler.set_throttle_percent(percent);
+    }
+
+    /// Gets the current throttle percentage in range `0..=99`.
+    ///
+    /// Please note that the value is not synchronized.
+    pub fn throttle_percent(&self) -> u8 {
+        self.vcpu_throttler.throttle_percent()
+    }
+
+    /// Stops and terminates the thread gracefully.
+    ///
+    /// Waits for the thread to finish.
+    pub fn stop_vcpu_throttling(&mut self) {
+        self.vcpu_throttler.shutdown();
+    }
+
     #[allow(clippy::too_many_arguments)]
     pub fn new(
         vm_config: Arc<Mutex<VmConfig>>,
@@ -1055,7 +1091,7 @@
                 #[cfg(feature = "tdx")]
                 tdx_enabled,
                 None,
-                None,
+                Default::default(),
             )
             .map_err(Error::MemoryManager)?
         };
@@ -1722,6 +1758,16 @@ impl Vm {
         Ok(())
     }
 
+    pub fn resize_disk(&mut self, id: String, desired_size: u64) -> Result<()> {
+        self.device_manager
+            .lock()
+            .unwrap()
+            .resize_disk(&id, desired_size)
+            .map_err(Error::DeviceManager)?;
+
+        Ok(())
+    }
+
     pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
         let memory_config = &mut self.config.lock().unwrap().memory;
 
@@ -2552,45 +2598,8 @@ impl Vm {
         Ok(())
     }
 
-    pub fn send_memory_regions<F>(
-        &mut self,
-        ranges: &MemoryRangeTable,
-        fd: &mut F,
-    ) -> std::result::Result<(), MigratableError>
-    where
-        F: WriteVolatile,
-    {
-        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
-        let mem = guest_memory.memory();
-
-        for range in ranges.regions() {
-            let mut offset: u64 = 0;
-            // Here we are manually handling the retry in case we can't the
-            // whole region at once because we can't use the implementation
-            // from vm-memory::GuestMemory of write_all_to() as it is not
-            // following the correct behavior. For more info about this issue
-            // see: https://github.com/rust-vmm/vm-memory/issues/174
-            loop {
-                let bytes_written = mem
-                    .write_volatile_to(
-                        GuestAddress(range.gpa + offset),
-                        fd,
-                        (range.length - offset) as usize,
-                    )
-                    .map_err(|e| {
-                        MigratableError::MigrateSend(anyhow!(
-                            "Error transferring memory to socket: {e}"
-                        ))
-                    })?;
-                offset += bytes_written as u64;
-
-                if offset == range.length {
-                    break;
-                }
-            }
-        }
-
-        Ok(())
+    pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
+        self.memory_manager.lock().unwrap().guest_memory()
     }
 
     pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs
index 70c3579025..4ea0aa5214 100644
--- a/vmm/src/vm_config.rs
+++ b/vmm/src/vm_config.rs
@@ -276,6 +276,8 @@ pub struct DiskConfig {
     pub serial: Option<String>,
     #[serde(default)]
     pub queue_affinity: Option<Vec<VirtQueueAffinity>>,
+    #[serde(default)]
+    pub bdf_device: Option,
 }
 
 impl ApplyLandlock for DiskConfig {
@@ -324,12 +326,6 @@ pub struct NetConfig {
     pub vhost_mode: VhostMode,
     #[serde(default)]
     pub id: Option<String>,
-    // Special deserialize handling:
-    // Therefore, we don't serialize FDs, and whatever value is here after
-    // deserialization is invalid.
-    //
-    // Valid FDs are transmitted via a different channel (SCM_RIGHTS message)
-    // and will be populated into this struct on the destination VMM eventually.
     #[serde(default, deserialize_with = "deserialize_netconfig_fds")]
     pub fds: Option<Vec<i32>>,
     #[serde(default)]
@@ -342,6 +338,8 @@ pub struct NetConfig {
     pub offload_ufo: bool,
     #[serde(default = "default_netconfig_true")]
     pub offload_csum: bool,
+    #[serde(default)]
+    pub bdf_device: Option,
 }
 
 pub fn default_netconfig_true() -> bool {
@@ -388,6 +386,8 @@ pub struct RngConfig {
     pub src: PathBuf,
     #[serde(default)]
     pub iommu: bool,
+    #[serde(default)]
+    pub bdf_device: Option,
 }
 
 pub const DEFAULT_RNG_SOURCE: &str = "/dev/urandom";
@@ -397,6 +397,7 @@ impl Default for RngConfig {
         RngConfig {
             src: PathBuf::from(DEFAULT_RNG_SOURCE),
             iommu: false,
+            bdf_device: None,
         }
     }
 }
@@ -418,6 +419,8 @@ pub struct BalloonConfig {
     /// Option to enable free page reporting from the guest.
     #[serde(default)]
     pub free_page_reporting: bool,
+    #[serde(default)]
+    pub bdf_device: Option,
 }
 
 #[cfg(feature = "pvmemcontrol")]
@@ -436,6 +439,8 @@ pub struct FsConfig {
     pub id: Option<String>,
     #[serde(default)]
     pub pci_segment: u16,
+    #[serde(default)]
+    pub bdf_device: Option,
 }
 
 pub fn default_fsconfig_num_queues() -> usize {
@@ -466,6 +471,8 @@ pub struct PmemConfig {
     pub id: Option<String>,
     #[serde(default)]
     pub pci_segment: u16,
+    #[serde(default)]
+    pub bdf_device: Option,
 }
 
 impl ApplyLandlock for PmemConfig {
@@ -483,6 +490,7 @@ pub enum ConsoleOutputMode {
     Tty,
     File,
     Socket,
+    Tcp,
     Null,
 }
 
@@ -494,6 +502,10 @@ pub struct ConsoleConfig {
     #[serde(default)]
     pub iommu: bool,
     pub socket: Option<PathBuf>,
+    pub url: Option<String>,
+    /// PCI BDF to attach the console in the guest to
+    #[serde(default)]
+    pub bdf_device: Option,
 }
 
 pub fn default_consoleconfig_file() -> Option<String> {
@@ -599,6 +611,8 @@ pub struct VdpaConfig {
     pub id: Option<String>,
     #[serde(default)]
     pub pci_segment: u16,
+    #[serde(default)]
+    pub bdf_device: Option,
 }
 
 pub fn default_vdpaconfig_num_queues() -> usize {
@@ -622,6 +636,8 @@ pub struct VsockConfig {
     pub id: Option<String>,
     #[serde(default)]
     pub pci_segment: u16,
+    #[serde(default)]
+    pub bdf_device: Option,
 }
 
 impl ApplyLandlock for VsockConfig {
@@ -856,6 +872,8 @@ pub fn default_serial() -> ConsoleConfig {
         mode: ConsoleOutputMode::Null,
         iommu: false,
         socket: None,
+        url: None,
+        bdf_device: None,
     }
 }
 
@@ -865,6 +883,8 @@ pub fn default_console() -> ConsoleConfig {
         mode: ConsoleOutputMode::Tty,
        iommu: false,
         socket: None,
+        url: None,
+        bdf_device: None,
     }
 }
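
For readers skimming the diff, here is a minimal, hypothetical caller-side sketch (not part of the patch) of how the new throttling API on `Vm` could be driven by a migration loop with auto-converge. The 20-point step, the 99% cap, and the `guest_still_too_dirty` flag are illustrative assumptions only, and the code is assumed to live inside the `vmm` crate next to `vm.rs`:

fn auto_converge_step(vm: &mut Vm, guest_still_too_dirty: bool) {
    if guest_still_too_dirty {
        // Tighten throttling gradually; set_throttle_percent() only accepts 0..=99.
        let next = vm.throttle_percent().saturating_add(20).min(99);
        vm.set_throttle_percent(next);
    } else {
        // Converged (or aborted): stop throttling and join the worker thread
        // before the migration code touches the CPU manager again.
        vm.set_throttle_percent(0);
        vm.stop_vcpu_throttling();
    }
}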
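Since `send_memory_regions()` is removed from `Vm` while `guest_memory()` is added, the memory transfer is presumably performed at the call site now. Below is a hypothetical sketch of such a caller (not part of the patch) that mirrors the deleted loop; the name `send_ranges`, its placement in the `vmm` crate, and the re-import of `WriteVolatile` (which this patch drops from `vm.rs`) are assumptions:

// Assumes: use anyhow::anyhow;
//          use vm_memory::{GuestAddress, GuestAddressSpace, GuestMemory, WriteVolatile};
//          use vm_migration::{MigratableError, protocol::MemoryRangeTable};
fn send_ranges<F: WriteVolatile>(
    vm: &Vm,
    ranges: &MemoryRangeTable,
    fd: &mut F,
) -> std::result::Result<(), MigratableError> {
    let guest_memory = vm.guest_memory();
    let mem = guest_memory.memory();

    for range in ranges.regions() {
        let mut offset: u64 = 0;
        // write_volatile_to() may write fewer bytes than requested, so retry
        // until the whole range has been transferred.
        while offset < range.length {
            let written = mem
                .write_volatile_to(
                    GuestAddress(range.gpa + offset),
                    fd,
                    (range.length - offset) as usize,
                )
                .map_err(|e| {
                    MigratableError::MigrateSend(anyhow!("error transferring memory: {e}"))
                })?;
            offset += written as u64;
        }
    }

    Ok(())
}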
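Finally, a hypothetical sketch (not part of the patch) of building a serial console configuration that uses the new `ConsoleOutputMode::Tcp` variant. The listen address and the assumption that `url` is an `Option<String>` are guesses, since the field's exact type is not recoverable from this hunk:

// Start from the patch's default_serial() and only override the new fields.
let serial = ConsoleConfig {
    mode: ConsoleOutputMode::Tcp,
    url: Some("0.0.0.0:4555".to_string()),
    ..default_serial()
};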