diff --git a/.github/workflows/audit.yaml b/.github/workflows/audit.yaml deleted file mode 100644 index bab8eaa145..0000000000 --- a/.github/workflows/audit.yaml +++ /dev/null @@ -1,16 +0,0 @@ -name: Cloud Hypervisor Dependency Audit -on: - pull_request: - paths: - - '**/Cargo.toml' - - '**/Cargo.lock' - -jobs: - security_audit: - name: Audit - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v6 - - uses: actions-rust-lang/audit@v1 - with: - token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 8528d54299..3da0ff9288 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -13,21 +13,16 @@ jobs: matrix: rust: - stable - - beta - nightly - "1.89.0" target: - x86_64-unknown-linux-gnu - - x86_64-unknown-linux-musl steps: - name: Code checkout uses: actions/checkout@v6 with: fetch-depth: 0 - - name: Install musl-gcc - run: sudo apt install -y musl-tools - - name: Install Rust toolchain (${{ matrix.rust }}) uses: dtolnay/rust-toolchain@stable with: diff --git a/.github/workflows/dco.yaml b/.github/workflows/dco.yaml deleted file mode 100644 index 655c0b5e2f..0000000000 --- a/.github/workflows/dco.yaml +++ /dev/null @@ -1,20 +0,0 @@ -name: DCO -on: [pull_request, merge_group] - -jobs: - check: - name: DCO Check ("Signed-off-by") - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v6 - - name: Set up Python 3.x - uses: actions/setup-python@v6 - with: - python-version: '3.x' - - name: Check DCO - if: ${{ github.event_name == 'pull_request' }} - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - pip3 install -U dco-check - dco-check -e "49699333+dependabot[bot]@users.noreply.github.com" diff --git a/.github/workflows/docker-image.yaml b/.github/workflows/docker-image.yaml deleted file mode 100644 index da2bbf5d3a..0000000000 --- a/.github/workflows/docker-image.yaml +++ /dev/null @@ -1,65 +0,0 @@ -name: Cloud Hypervisor's Docker image update -on: - push: - branches: main - paths: resources/Dockerfile - pull_request: - paths: resources/Dockerfile -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -env: - REGISTRY: ghcr.io - IMAGE_NAME: ${{ github.repository }} - -jobs: - main: - runs-on: ubuntu-latest - steps: - - name: Code checkout - uses: actions/checkout@v6 - - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Login to ghcr - uses: docker/login-action@v3 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - - name: Docker meta - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} - # generate Docker tags based on the following events/attributes - tags: | - type=raw,value=20251114-0 - type=sha - - - name: Build and push - if: ${{ github.event_name == 'push' }} - uses: docker/build-push-action@v6 - with: - file: ./resources/Dockerfile - platforms: linux/amd64,linux/arm64 - push: true - tags: ${{ steps.meta.outputs.tags }} - - - name: Build only - if: ${{ github.event_name == 'pull_request' }} - uses: docker/build-push-action@v6 - with: - file: ./resources/Dockerfile - platforms: linux/amd64,linux/arm64 - tags: ${{ steps.meta.outputs.tags }} - - - name: Image digest - run: echo ${{ steps.docker_build.outputs.digest }} diff --git a/.github/workflows/formatting.yaml b/.github/workflows/formatting.yaml index 00cd322797..d852eafe2c 100644 --- 
a/.github/workflows/formatting.yaml +++ b/.github/workflows/formatting.yaml @@ -14,7 +14,6 @@ jobs: - nightly target: - x86_64-unknown-linux-gnu - - aarch64-unknown-linux-musl env: RUSTFLAGS: -D warnings steps: diff --git a/.github/workflows/fuzz-build.yaml b/.github/workflows/fuzz-build.yaml deleted file mode 100644 index b97796893d..0000000000 --- a/.github/workflows/fuzz-build.yaml +++ /dev/null @@ -1,32 +0,0 @@ -name: Cloud Hypervisor Cargo Fuzz Build -on: [pull_request, merge_group] -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build: - name: Cargo Fuzz Build - runs-on: ubuntu-latest - strategy: - matrix: - rust: - - nightly - target: - - x86_64-unknown-linux-gnu - env: - RUSTFLAGS: -D warnings - steps: - - name: Code checkout - uses: actions/checkout@v6 - - name: Install Rust toolchain (${{ matrix.rust }}) - uses: dtolnay/rust-toolchain@stable - with: - toolchain: ${{ matrix.rust }} - target: ${{ matrix.target }} - - name: Install Cargo fuzz - run: cargo install cargo-fuzz - - name: Fuzz Build - run: cargo fuzz build - - name: Fuzz Check - run: cargo fuzz check diff --git a/.github/workflows/hadolint.yaml b/.github/workflows/hadolint.yaml deleted file mode 100644 index 631c50eefd..0000000000 --- a/.github/workflows/hadolint.yaml +++ /dev/null @@ -1,25 +0,0 @@ -name: Lint Dockerfile -on: - push: - paths: - - resources/Dockerfile - pull_request: - paths: - - resources/Dockerfile - -jobs: - hadolint: - name: Run Hadolint Dockerfile Linter - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v6 - - - name: Lint Dockerfile - uses: hadolint/hadolint-action@master - with: - dockerfile: ./resources/Dockerfile - format: tty - no-fail: false - verbose: true - failure-threshold: info diff --git a/.github/workflows/integration-arm64.yaml b/.github/workflows/integration-arm64.yaml deleted file mode 100644 index f48c0ec126..0000000000 --- a/.github/workflows/integration-arm64.yaml +++ /dev/null @@ -1,54 +0,0 @@ -name: Cloud Hypervisor Tests (ARM64) -on: [pull_request, merge_group] -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build: - timeout-minutes: 120 - name: Tests (ARM64) - runs-on: bookworm-arm64 - steps: - - name: Fix workspace permissions - run: sudo chown -R runner:runner ${GITHUB_WORKSPACE} - - name: Code checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Run unit tests (musl) - run: scripts/dev_cli.sh tests --unit --libc musl - - name: Load openvswitch module - run: sudo modprobe openvswitch - - name: Run integration tests (musl) - timeout-minutes: 60 - run: scripts/dev_cli.sh tests --integration --libc musl - - name: Install Azure CLI - if: ${{ github.event_name != 'pull_request' }} - run: | - sudo apt install -y ca-certificates curl apt-transport-https lsb-release gnupg - curl -sL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/microsoft.gpg > /dev/null - echo "deb [arch=arm64] https://packages.microsoft.com/repos/azure-cli/ bookworm main" | sudo tee /etc/apt/sources.list.d/azure-cli.list - sudo apt update - sudo apt install -y azure-cli - - name: Download Windows image - if: ${{ github.event_name != 'pull_request' }} - shell: bash - run: | - IMG_BASENAME=windows-11-iot-enterprise-aarch64.raw - IMG_PATH=$HOME/workloads/$IMG_BASENAME - IMG_GZ_PATH=$HOME/workloads/$IMG_BASENAME.gz - IMG_GZ_BLOB_NAME=windows-11-iot-enterprise-aarch64-9-min.raw.gz - cp 
"scripts/$IMG_BASENAME.sha1" "$HOME/workloads/" - pushd "$HOME/workloads" - if sha1sum "$IMG_BASENAME.sha1" --check; then - exit - fi - popd - mkdir -p "$HOME/workloads" - az storage blob download --container-name private-images --file "$IMG_GZ_PATH" --name "$IMG_GZ_BLOB_NAME" --connection-string "${{ secrets.CH_PRIVATE_IMAGES }}" - gzip -d $IMG_GZ_PATH - - name: Run Windows guest integration tests - if: ${{ github.event_name != 'pull_request' }} - timeout-minutes: 30 - run: scripts/dev_cli.sh tests --integration-windows --libc musl diff --git a/.github/workflows/integration-metrics.yaml b/.github/workflows/integration-metrics.yaml deleted file mode 100644 index 952e938fdf..0000000000 --- a/.github/workflows/integration-metrics.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: Cloud Hypervisor Tests (Metrics) -on: - push: - branches: - - main - -jobs: - build: - name: Tests (Metrics) - runs-on: bare-metal-9950x - env: - METRICS_PUBLISH_KEY: ${{ secrets.METRICS_PUBLISH_KEY }} - steps: - - name: Code checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Run metrics tests - timeout-minutes: 60 - run: scripts/dev_cli.sh tests --metrics -- -- --report-file /root/workloads/metrics.json - - name: Upload metrics report - run: 'curl -X PUT https://ch-metrics.azurewebsites.net/api/publishmetrics -H "x-functions-key: $METRICS_PUBLISH_KEY" -T ~/workloads/metrics.json' diff --git a/.github/workflows/integration-rate-limiter.yaml b/.github/workflows/integration-rate-limiter.yaml deleted file mode 100644 index 94497f47bd..0000000000 --- a/.github/workflows/integration-rate-limiter.yaml +++ /dev/null @@ -1,25 +0,0 @@ -name: Cloud Hypervisor Tests (Rate-Limiter) -on: [merge_group, pull_request] -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build: - name: Tests (Rate-Limiter) - runs-on: ${{ github.event_name == 'pull_request' && 'ubuntu-latest' || 'bare-metal-9950x' }} - env: - AUTH_DOWNLOAD_TOKEN: ${{ secrets.AUTH_DOWNLOAD_TOKEN }} - steps: - - name: Code checkout - if: ${{ github.event_name != 'pull_request' }} - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Run rate-limiter integration tests - if: ${{ github.event_name != 'pull_request' }} - timeout-minutes: 20 - run: scripts/dev_cli.sh tests --integration-rate-limiter - - name: Skipping build for PR - if: ${{ github.event_name == 'pull_request' }} - run: echo "Skipping build for PR" diff --git a/.github/workflows/integration-vfio.yaml b/.github/workflows/integration-vfio.yaml deleted file mode 100644 index 416e99ee95..0000000000 --- a/.github/workflows/integration-vfio.yaml +++ /dev/null @@ -1,33 +0,0 @@ -name: Cloud Hypervisor Tests (VFIO) -on: [merge_group, pull_request] -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build: - name: Tests (VFIO) - runs-on: ${{ github.event_name == 'pull_request' && 'ubuntu-latest' || 'vfio-nvidia' }} - env: - AUTH_DOWNLOAD_TOKEN: ${{ secrets.AUTH_DOWNLOAD_TOKEN }} - steps: - - name: Fix workspace permissions - if: ${{ github.event_name != 'pull_request' }} - run: sudo chown -R runner:runner ${GITHUB_WORKSPACE} - - name: Code checkout - if: ${{ github.event_name != 'pull_request' }} - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Run VFIO integration tests - if: ${{ github.event_name != 'pull_request' }} - timeout-minutes: 15 - run: scripts/dev_cli.sh tests --integration-vfio - # Most tests are failing with musl see #6790 - # - name: Run VFIO integration tests for musl - 
# if: ${{ github.event_name != 'pull_request' }} - # timeout-minutes: 15 - # run: scripts/dev_cli.sh tests --integration-vfio --libc musl - - name: Skipping build for PR - if: ${{ github.event_name == 'pull_request' }} - run: echo "Skipping build for PR" diff --git a/.github/workflows/integration-windows.yaml b/.github/workflows/integration-windows.yaml deleted file mode 100644 index 81ed017550..0000000000 --- a/.github/workflows/integration-windows.yaml +++ /dev/null @@ -1,50 +0,0 @@ -name: Cloud Hypervisor Tests (Windows Guest) -on: [merge_group, pull_request] -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build: - name: Tests (Windows Guest) - runs-on: ${{ github.event_name == 'pull_request' && 'ubuntu-latest' || 'garm-jammy-16' }} - steps: - - name: Code checkout - if: ${{ github.event_name != 'pull_request' }} - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Install Docker - if: ${{ github.event_name != 'pull_request' }} - run: | - sudo apt-get update - sudo apt-get -y install ca-certificates curl gnupg - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg - sudo chmod a+r /usr/share/keyrings/docker-archive-keyring.gpg - echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null - sudo apt-get update - sudo apt install -y docker-ce docker-ce-cli - - name: Install Azure CLI - if: ${{ github.event_name != 'pull_request' }} - run: | - sudo apt install -y ca-certificates curl apt-transport-https lsb-release gnupg - curl -sL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/microsoft.gpg > /dev/null - echo "deb [arch=amd64] https://packages.microsoft.com/repos/azure-cli/ jammy main" | sudo tee /etc/apt/sources.list.d/azure-cli.list - sudo apt update - sudo apt install -y azure-cli - - name: Download Windows image - if: ${{ github.event_name != 'pull_request' }} - run: | - mkdir $HOME/workloads - az storage blob download --container-name private-images --file "$HOME/workloads/windows-server-2022-amd64-2.raw" --name windows-server-2022-amd64-2.raw --connection-string "${{ secrets.CH_PRIVATE_IMAGES }}" - - name: Run Windows guest integration tests - if: ${{ github.event_name != 'pull_request' }} - timeout-minutes: 15 - run: scripts/dev_cli.sh tests --integration-windows - - name: Run Windows guest integration tests for musl - if: ${{ github.event_name != 'pull_request' }} - timeout-minutes: 15 - run: scripts/dev_cli.sh tests --integration-windows --libc musl - - name: Skipping build for PR - if: ${{ github.event_name == 'pull_request' }} - run: echo "Skipping build for PR" \ No newline at end of file diff --git a/.github/workflows/integration-x86-64.yaml b/.github/workflows/integration-x86-64.yaml deleted file mode 100644 index 453d7c2d32..0000000000 --- a/.github/workflows/integration-x86-64.yaml +++ /dev/null @@ -1,52 +0,0 @@ -name: Cloud Hypervisor Tests (x86-64) -on: [pull_request, merge_group] -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build: - timeout-minutes: 60 - strategy: - fail-fast: false - matrix: - runner: ['garm-jammy', "garm-jammy-amd"] - libc: ["musl", 'gnu'] - name: Tests (x86-64) - runs-on: ${{ github.event_name == 'pull_request' && !(matrix.runner == 
'garm-jammy' && matrix.libc == 'gnu') && 'ubuntu-latest' || format('{0}-16', matrix.runner) }} - steps: - - name: Code checkout - if: ${{ github.event_name != 'pull_request' || (matrix.runner == 'garm-jammy' && matrix.libc == 'gnu') }} - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Install Docker - if: ${{ github.event_name != 'pull_request' || (matrix.runner == 'garm-jammy' && matrix.libc == 'gnu') }} - run: | - sudo apt-get update - sudo apt-get -y install ca-certificates curl gnupg - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg - sudo chmod a+r /usr/share/keyrings/docker-archive-keyring.gpg - echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null - sudo apt-get update - sudo apt install -y docker-ce docker-ce-cli - - name: Prepare for VDPA - if: ${{ github.event_name != 'pull_request' || (matrix.runner == 'garm-jammy' && matrix.libc == 'gnu') }} - run: scripts/prepare_vdpa.sh - - name: Run unit tests - if: ${{ github.event_name != 'pull_request' || (matrix.runner == 'garm-jammy' && matrix.libc == 'gnu') }} - run: scripts/dev_cli.sh tests --unit --libc ${{ matrix.libc }} - - name: Load openvswitch module - if: ${{ github.event_name != 'pull_request' || (matrix.runner == 'garm-jammy' && matrix.libc == 'gnu') }} - run: sudo modprobe openvswitch - - name: Run integration tests - if: ${{ github.event_name != 'pull_request' || (matrix.runner == 'garm-jammy' && matrix.libc == 'gnu') }} - timeout-minutes: 40 - run: scripts/dev_cli.sh tests --integration --libc ${{ matrix.libc }} - - name: Run live-migration integration tests - if: ${{ github.event_name != 'pull_request' || (matrix.runner == 'garm-jammy' && matrix.libc == 'gnu') }} - timeout-minutes: 20 - run: scripts/dev_cli.sh tests --integration-live-migration --libc ${{ matrix.libc }} - - name: Skipping build for PR - if: ${{ github.event_name == 'pull_request' && matrix.runner != 'garm-jammy' && matrix.libc != 'gnu' }} - run: echo "Skipping build for PR" diff --git a/.github/workflows/lychee.yaml b/.github/workflows/lychee.yaml deleted file mode 100644 index e77c595ed3..0000000000 --- a/.github/workflows/lychee.yaml +++ /dev/null @@ -1,45 +0,0 @@ -name: Link Check (lychee) -on: pull_request -jobs: - link_check: - name: Link Check - runs-on: ubuntu-latest - steps: - - name: Code checkout - uses: actions/checkout@v6 - with: - # Fetch the entire history so git diff can compare against the base branch - fetch-depth: 0 - - name: Get changed files in PR - id: changed-files - uses: tj-actions/changed-files@v47 # Using a dedicated action for robustness - with: - # Compare the HEAD of the PR with the merge-base (where the PR branches off) - base_sha: ${{ github.event.pull_request.base.sha }} - - # NEW STEP: Print all changed-files outputs for verification - - name: Verify Changed Files - run: | - echo "--- tj-actions/changed-files Outputs ---" - echo "any_changed: ${{ steps.changed-files.outputs.any_changed }}" - echo "all_changed_files: ${{ steps.changed-files.outputs.all_changed_files }}" - echo "added_files: ${{ steps.changed-files.outputs.added_files }}" - echo "modified_files: ${{ steps.changed-files.outputs.modified_files }}" - echo "deleted_files: ${{ steps.changed-files.outputs.deleted_files }}" - echo "renamed_files: ${{ steps.changed-files.outputs.renamed_files }}" - echo 
"----------------------------------------" - # This will also show if the all_changed_files string is empty or not - if [ -n "${{ steps.changed-files.outputs.all_changed_files }}" ]; then - echo "Detected changes: all_changed_files output is NOT empty." - else - echo "No changes detected: all_changed_files output IS empty." - fi - - name: Link Availability Check (Diff Only) - # MODIFIED: Only run lychee if the 'all_changed_files' output is not an empty string - if: ${{ steps.changed-files.outputs.all_changed_files != '' }} - uses: lycheeverse/lychee-action@master - with: - # Pass the space-separated list of changed files to lychee - args: --verbose --config .lychee.toml ${{ steps.changed-files.outputs.all_changed_files }} - failIfEmpty: false - fail: true \ No newline at end of file diff --git a/.github/workflows/mshv-infra.yaml b/.github/workflows/mshv-infra.yaml deleted file mode 100644 index ffd72713e1..0000000000 --- a/.github/workflows/mshv-infra.yaml +++ /dev/null @@ -1,249 +0,0 @@ -name: MSHV Infra Setup -on: - workflow_call: - inputs: - ARCH: - description: 'Architecture for the VM' - required: true - type: string - KEY: - description: 'SSH Key Name' - required: true - type: string - OS_DISK_SIZE: - description: 'OS Disk Size in GB' - required: true - type: string - RG: - description: 'Resource Group Name' - required: true - type: string - VM_SKU: - description: 'VM SKU' - required: true - type: string - secrets: - MI_CLIENT_ID: - required: true - RUNNER_RG: - required: true - STORAGE_ACCOUNT_PATHS: - required: true - ARCH_SOURCE_PATH: - required: true - USERNAME: - required: true - outputs: - RG_NAME: - description: 'Resource group of the VM' - value: ${{ jobs.infra-setup.outputs.RG_NAME }} - VM_NAME: - description: 'Name of the VM' - value: ${{ jobs.infra-setup.outputs.VM_NAME }} - PRIVATE_IP: - description: 'Private IP of the VM' - value: ${{ jobs.infra-setup.outputs.PRIVATE_IP }} -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true -jobs: - infra-setup: - name: ${{ inputs.ARCH }} VM Provision - runs-on: mshv - continue-on-error: true - outputs: - RG_NAME: ${{ steps.rg-setup.outputs.RG_NAME }} - VM_NAME: ${{ steps.vm-setup.outputs.VM_NAME }} - PRIVATE_IP: ${{ steps.get-vm-ip.outputs.PRIVATE_IP }} - steps: - - name: Install & login to AZ CLI - env: - MI_CLIENT_ID: ${{ secrets.MI_CLIENT_ID }} - run: | - set -e - echo "Installing Azure CLI if not already installed" - if ! 
command -v az &>/dev/null; then - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - else - echo "Azure CLI already installed" - fi - az --version - echo "Logging into Azure CLI using Managed Identity" - az login --identity --client-id ${MI_CLIENT_ID} - - - name: Get Location - id: get-location - env: - SKU: ${{ inputs.VM_SKU }} - STORAGE_ACCOUNT_PATHS: ${{ secrets.STORAGE_ACCOUNT_PATHS }} - run: | - set -e - # Extract vCPU count from SKU (e.g., "Standard_D2s_v3" => 2) - vcpu=$(echo "$SKU" | sed -n 's/^Standard_[A-Za-z]\+\([0-9]\+\).*/\1/p') - if [[ -z "$vcpu" ]]; then - echo "Cannot extract vCPU count from SKU: $SKU" - exit 1 - fi - - SUPPORTED_LOCATIONS=$(echo "$STORAGE_ACCOUNT_PATHS" | jq -r 'to_entries[] | .key') - - for location in $SUPPORTED_LOCATIONS; do - family=$(az vm list-skus --size "$SKU" --location "$location" --resource-type "virtualMachines" --query '[0].family' -o tsv) - if [[ -z "$family" ]]; then - echo "Cannot determine VM family for SKU: $SKU in $location" - continue - fi - - usage=$(az vm list-usage --location "$location" --query "[?name.value=='$family'] | [0]" -o json) - current=$(echo "$usage" | jq -r '.currentValue') - limit=$(echo "$usage" | jq -r '.limit') - - if [[ $((limit - current)) -ge $vcpu ]]; then - echo "Sufficient quota found in $location" - echo "location=$location" >> "$GITHUB_OUTPUT" - exit 0 - fi - done - - echo "No location found with sufficient vCPU quota for SKU: $SKU" - exit 1 - - - name: Create Resource Group - id: rg-setup - env: - LOCATION: ${{ steps.get-location.outputs.location }} - RG: ${{ inputs.RG }} - STORAGE_ACCOUNT_PATHS: ${{ secrets.STORAGE_ACCOUNT_PATHS }} - run: | - set -e - echo "Creating Resource Group: $RG" - # Create the resource group - echo "Creating resource group in location: ${LOCATION}" - az group create --name ${RG} --location ${LOCATION} - echo "RG_NAME=${RG}" >> $GITHUB_OUTPUT - echo "Resource group created successfully." - - - name: Generate SSH Key - id: generate-ssh-key - env: - KEY: ${{ inputs.KEY }} - run: | - set -e - echo "Generating SSH key: $KEY" - mkdir -p ~/.ssh - ssh-keygen -t rsa -b 4096 -f ~/.ssh/${KEY} -N "" - - - name: Create VM - id: vm-setup - env: - KEY: ${{ inputs.KEY }} - LOCATION: ${{ steps.get-location.outputs.location }} - OS_DISK_SIZE: ${{ inputs.OS_DISK_SIZE }} - RG: ${{ inputs.RG }} - RUNNER_RG: ${{ secrets.RUNNER_RG }} - USERNAME: ${{ secrets.USERNAME }} - VM_SKU: ${{ inputs.VM_SKU }} - VM_IMAGE_NAME: ${{ inputs.ARCH }}_${{ steps.get-location.outputs.location }}_image - VM_NAME: ${{ inputs.ARCH }}_${{ steps.get-location.outputs.location }}_${{ github.run_id }} - run: | - set -e - echo "Creating $VM_SKU VM: $VM_NAME" - - # Extract subnet ID from the runner VM - echo "Retrieving subnet ID..." - SUBNET_ID=$(az network vnet list --resource-group ${RUNNER_RG} --query "[?contains(location, '${LOCATION}')].{SUBNETS:subnets}" | jq -r ".[0].SUBNETS[0].id") - if [[ -z "${SUBNET_ID}" ]]; then - echo "ERROR: Failed to retrieve Subnet ID." - exit 1 - fi - - # Extract image ID from the runner VM - echo "Retrieving image ID..." - IMAGE_ID=$(az image show --resource-group ${RUNNER_RG} --name ${VM_IMAGE_NAME} --query "id" -o tsv) - if [[ -z "${IMAGE_ID}" ]]; then - echo "ERROR: Failed to retrieve Image ID." 
- exit 1 - fi - - # Create VM - az vm create \ - --resource-group ${RG} \ - --name ${VM_NAME} \ - --subnet ${SUBNET_ID} \ - --size ${VM_SKU} \ - --location ${LOCATION} \ - --image ${IMAGE_ID} \ - --os-disk-size-gb ${OS_DISK_SIZE} \ - --public-ip-sku Standard \ - --storage-sku Premium_LRS \ - --public-ip-address "" \ - --admin-username ${USERNAME} \ - --ssh-key-value ~/.ssh/${KEY}.pub \ - --security-type Standard \ - --output json - - az vm boot-diagnostics enable --name ${VM_NAME} --resource-group ${RG} - - echo "VM_NAME=${VM_NAME}" >> $GITHUB_OUTPUT - echo "VM creation process completed successfully." - - - name: Get VM Private IP - id: get-vm-ip - env: - RG: ${{ inputs.RG }} - VM_NAME: ${{ inputs.ARCH }}_${{ steps.get-location.outputs.location }}_${{ github.run_id }} - run: | - set -e - echo "Retrieving VM Private IP address..." - # Retrieve VM Private IP address - PRIVATE_IP=$(az vm show -g ${RG} -n ${VM_NAME} -d --query privateIps -o tsv) - if [[ -z "$PRIVATE_IP" ]]; then - echo "ERROR: Failed to retrieve private IP address." - exit 1 - fi - echo "PRIVATE_IP=$PRIVATE_IP" >> $GITHUB_OUTPUT - - - name: Wait for SSH availability - env: - KEY: ${{ inputs.KEY }} - PRIVATE_IP: ${{ steps.get-vm-ip.outputs.PRIVATE_IP }} - USERNAME: ${{ secrets.USERNAME }} - run: | - echo "Waiting for SSH to be accessible..." - timeout 120 bash -c 'until ssh -o StrictHostKeyChecking=no -i ~/.ssh/${KEY} ${USERNAME}@${PRIVATE_IP} "exit" 2>/dev/null; do sleep 5; done' - echo "VM is accessible!" - - - name: Remove Old Host Key - env: - PRIVATE_IP: ${{ steps.get-vm-ip.outputs.PRIVATE_IP }} - run: | - set -e - echo "Removing the old host key" - ssh-keygen -R $PRIVATE_IP - - - name: SSH into VM and Install Dependencies - env: - KEY: ${{ inputs.KEY }} - PRIVATE_IP: ${{ steps.get-vm-ip.outputs.PRIVATE_IP }} - USERNAME: ${{ secrets.USERNAME }} - run: | - set -e - ssh -i ~/.ssh/${KEY} -o StrictHostKeyChecking=no ${USERNAME}@${PRIVATE_IP} << EOF - set -e - echo "Logged in successfully." - echo "Installing dependencies..." - sudo tdnf install -y git moby-engine moby-cli clang llvm pkg-config make gcc glibc-devel - echo "Installing Rust..." 
- curl -sSf https://sh.rustup.rs | sh -s -- --default-toolchain stable --profile default -y - export PATH="\$HOME/.cargo/bin:\$PATH" - cargo --version - sudo mkdir -p /etc/docker/ - echo '{"default-ulimits":{"nofile":{"Hard":65535,"Name":"nofile","Soft":65535}}}' | sudo tee /etc/docker/daemon.json - sudo systemctl stop docker - sudo systemctl enable docker.service - sudo systemctl enable containerd.service - sudo systemctl start docker - sudo groupadd -f docker - sudo usermod -a -G docker ${USERNAME} - sudo systemctl restart docker - EOF diff --git a/.github/workflows/mshv-integration.yaml b/.github/workflows/mshv-integration.yaml deleted file mode 100644 index 2083c54361..0000000000 --- a/.github/workflows/mshv-integration.yaml +++ /dev/null @@ -1,131 +0,0 @@ -name: Cloud Hypervisor Tests (MSHV) (x86_64) -on: [pull_request_target, merge_group] - -jobs: - infra-setup: - name: MSHV Infra Setup (x86_64) - uses: ./.github/workflows/mshv-infra.yaml - with: - ARCH: x86_64 - KEY: azure_key_${{ github.run_id }} - OS_DISK_SIZE: 512 - RG: MSHV-INTEGRATION-${{ github.run_id }} - VM_SKU: Standard_D16s_v5 - secrets: - MI_CLIENT_ID: ${{ secrets.MSHV_MI_CLIENT_ID }} - RUNNER_RG: ${{ secrets.MSHV_RUNNER_RG }} - STORAGE_ACCOUNT_PATHS: ${{ secrets.MSHV_STORAGE_ACCOUNT_PATHS }} - ARCH_SOURCE_PATH: ${{ secrets.MSHV_X86_SOURCE_PATH }} - USERNAME: ${{ secrets.MSHV_USERNAME }} - - run-tests: - name: Integration Tests (x86_64) - needs: infra-setup - if: ${{ always() && needs.infra-setup.result == 'success' }} - runs-on: mshv - continue-on-error: true - steps: - - name: Run integration tests - timeout-minutes: 60 - env: - KEY: azure_key_${{ github.run_id }} - PR_NUMBER: ${{ github.event.pull_request.number }} - REPO_URL: https://github.com/cloud-hypervisor/cloud-hypervisor.git - REPO_DIR: cloud-hypervisor - PRIVATE_IP: ${{ needs.infra-setup.outputs.PRIVATE_IP }} - RG: MSHV-${{ github.run_id }} - USERNAME: ${{ secrets.MSHV_USERNAME }} - run: | - set -e - echo "Connecting to the VM via SSH..." - ssh -i ~/.ssh/${KEY} -o StrictHostKeyChecking=no ${USERNAME}@${PRIVATE_IP} << EOF - set -e - echo "Logged in successfully." - export PATH="\$HOME/.cargo/bin:\$PATH" - - if [[ "${{ github.event_name }}" == "pull_request_target" ]]; then - git clone --depth 1 "$REPO_URL" "$REPO_DIR" - cd "$REPO_DIR" - git fetch origin pull/${{ github.event.pull_request.number }}/merge - git checkout FETCH_HEAD - else - git clone --depth 1 --single-branch --branch "${{ github.ref_name }}" "$REPO_URL" "$REPO_DIR" - cd "$REPO_DIR" - fi - - echo "Loading VDPA kernel modules..." - sudo modprobe vdpa - sudo modprobe vhost_vdpa - sudo modprobe vdpa_sim - sudo modprobe vdpa_sim_blk - sudo modprobe vdpa_sim_net - - echo "Creating VDPA devices..." - sudo vdpa dev add name vdpa-blk0 mgmtdev vdpasim_blk - sudo vdpa dev add name vdpa-blk1 mgmtdev vdpasim_blk - sudo vdpa dev add name vdpa-blk2 mgmtdev vdpasim_net - - echo "Setting permissions..." 
- for i in 0 1 2; do - dev="/dev/vhost-vdpa-$i" - if [ -e "$dev" ]; then - sudo chown $USER:$USER "$dev" - sudo chmod 660 "$dev" - else - echo "Warning: Device $dev not found" - fi - done - - sudo ./scripts/dev_cli.sh tests --hypervisor mshv --integration - EOF - - - name: Dump dmesg - if: always() - continue-on-error: true - env: - KEY: azure_key_${{ github.run_id }} - PRIVATE_IP: ${{ needs.infra-setup.outputs.PRIVATE_IP }} - USERNAME: ${{ secrets.MSHV_USERNAME }} - run: | - ssh -i ~/.ssh/${KEY} -o StrictHostKeyChecking=no ${USERNAME}@${PRIVATE_IP} << EOF - sudo dmesg - EOF - - - name: Dump serial console logs - if: always() - continue-on-error: true - env: - RG_NAME: ${{ needs.infra-setup.outputs.RG_NAME }} - VM_NAME: ${{ needs.infra-setup.outputs.VM_NAME }} - run: | - set -e - az vm boot-diagnostics get-boot-log --name "${VM_NAME}" --resource-group "${RG_NAME}" | jq -r - - cleanup: - name: Cleanup - needs: run-tests - if: always() - runs-on: mshv - steps: - - name: Delete RG - env: - RG: MSHV-INTEGRATION-${{ github.run_id }} - run: | - if az group exists --name ${RG}; then - az group delete --name ${RG} --yes --no-wait - else - echo "Resource Group ${RG} does not exist. Skipping deletion." - fi - echo "Cleanup process completed." - - - name: Delete SSH Key - env: - KEY: azure_key_${{ github.run_id }} - run: | - if [ -f ~/.ssh/${KEY} ]; then - rm -f ~/.ssh/${KEY} ~/.ssh/${KEY}.pub - echo "SSH key deleted successfully." - else - echo "SSH key does not exist. Skipping deletion." - fi - echo "Cleanup process completed." diff --git a/.github/workflows/package-consistency.yaml b/.github/workflows/package-consistency.yaml deleted file mode 100644 index 9c5eb5c0e4..0000000000 --- a/.github/workflows/package-consistency.yaml +++ /dev/null @@ -1,32 +0,0 @@ -name: Cloud Hypervisor Consistency -on: [pull_request, merge_group] -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build: - name: Rust VMM Consistency Check - runs-on: ubuntu-latest - steps: - - name: Code checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - - name: Install dependencies - run: sudo apt install -y python3 - - - name: Install Rust toolchain stable - uses: dtolnay/rust-toolchain@stable - with: - toolchain: stable - - - name: Check Rust VMM Package Consistency of root Workspace - run: python3 scripts/package-consistency-check.py github.com/rust-vmm - - - name: Check Rust VMM Package Consistency of fuzz Workspace - run: | - pushd fuzz - python3 ../scripts/package-consistency-check.py github.com/rust-vmm - popd diff --git a/.github/workflows/preview-riscv64-build.yaml b/.github/workflows/preview-riscv64-build.yaml deleted file mode 100644 index e5fbf5ee27..0000000000 --- a/.github/workflows/preview-riscv64-build.yaml +++ /dev/null @@ -1,30 +0,0 @@ -name: Cloud Hypervisor RISC-V 64-bit kvm build Preview -on: [pull_request, merge_group] -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build: - name: Cargo - runs-on: riscv64-qemu-host - strategy: - fail-fast: false - - steps: - - name: Code checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - - name: Install Rust toolchain - run: /opt/scripts/exec-in-qemu.sh rustup default 1.89.0 - - - name: Build test (kvm) - run: /opt/scripts/exec-in-qemu.sh cargo build --locked --no-default-features --features "kvm" -p cloud-hypervisor - - - name: Clippy test (kvm) - run: /opt/scripts/exec-in-qemu.sh cargo clippy --locked --no-default-features --features "kvm" -p 
cloud-hypervisor - - - name: Check no files were modified - run: test -z "$(git status --porcelain)" diff --git a/.github/workflows/preview-riscv64-modules.yaml b/.github/workflows/preview-riscv64-modules.yaml deleted file mode 100644 index 1ae1c63758..0000000000 --- a/.github/workflows/preview-riscv64-modules.yaml +++ /dev/null @@ -1,39 +0,0 @@ -name: Cloud Hypervisor RISC-V 64-bit Preview -on: [pull_request, merge_group] -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build: - name: Cargo - runs-on: riscv64-qemu-host - strategy: - fail-fast: false - matrix: - module: - - hypervisor - - arch - - vm-allocator - - devices - - steps: - - name: Code checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - - name: Install Rust toolchain - run: /opt/scripts/exec-in-qemu.sh rustup default 1.89.0 - - - name: Build ${{ matrix.module }} Module (kvm) - run: /opt/scripts/exec-in-qemu.sh cargo build --locked -p ${{ matrix.module }} --no-default-features --features "kvm" - - - name: Clippy ${{ matrix.module }} Module (kvm) - run: /opt/scripts/exec-in-qemu.sh cargo clippy --locked -p ${{ matrix.module }} --no-default-features --features "kvm" -- -D warnings - - - name: Test ${{ matrix.module }} Module (kvm) - run: /opt/scripts/exec-in-qemu.sh cargo test --locked -p ${{ matrix.module }} --no-default-features --features "kvm" - - - name: Check no files were modified - run: test -z "$(git status --porcelain)" diff --git a/.github/workflows/quality.yaml b/.github/workflows/quality.yaml index c97487d198..8ff031920a 100644 --- a/.github/workflows/quality.yaml +++ b/.github/workflows/quality.yaml @@ -13,17 +13,11 @@ jobs: fail-fast: false matrix: rust: - - beta - stable target: - - aarch64-unknown-linux-gnu - - aarch64-unknown-linux-musl - x86_64-unknown-linux-gnu - - x86_64-unknown-linux-musl include: - - rust: beta - experimental: true - rust: stable experimental: false @@ -58,24 +52,6 @@ jobs: target: ${{ matrix.target }} args: --locked --all --all-targets --no-default-features --tests --examples --features "kvm" -- -D warnings - - name: Clippy (mshv) - uses: houseabsolute/actions-rust-cross@v1 - with: - command: clippy - cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 - toolchain: ${{ matrix.rust }} - target: ${{ matrix.target }} - args: --locked --all --all-targets --no-default-features --tests --examples --features "mshv" -- -D warnings - - - name: Clippy (mshv + kvm) - uses: houseabsolute/actions-rust-cross@v1 - with: - command: clippy - cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 - toolchain: ${{ matrix.rust }} - target: ${{ matrix.target }} - args: --locked --all --all-targets --no-default-features --tests --examples --features "mshv,kvm" -- -D warnings - - name: Clippy (default features) uses: houseabsolute/actions-rust-cross@v1 with: diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml deleted file mode 100644 index 18317aeae5..0000000000 --- a/.github/workflows/release.yaml +++ /dev/null @@ -1,95 +0,0 @@ -name: Cloud Hypervisor Release -on: [create, merge_group] -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} - cancel-in-progress: true -env: - GITHUB_TOKEN: ${{ github.token }} - -jobs: - release: - if: (github.event_name == 'create' && github.event.ref_type == 'tag') || github.event_name == 'merge_group' - name: Release ${{ matrix.platform.target }} - strategy: - fail-fast: false - matrix: - platform: - - target: x86_64-unknown-linux-gnu - args: --all 
--release --features mshv - name_ch: cloud-hypervisor - name_ch_remote: ch-remote - - target: x86_64-unknown-linux-musl - args: --all --release --features mshv - name_ch: cloud-hypervisor-static - name_ch_remote: ch-remote-static - - target: aarch64-unknown-linux-musl - args: --all --release - name_ch: cloud-hypervisor-static-aarch64 - name_ch_remote: ch-remote-static-aarch64 - runs-on: ubuntu-latest - steps: - - name: Code checkout - uses: actions/checkout@v6 - - name: Install musl-gcc - if: contains(matrix.platform.target, 'musl') - run: sudo apt install -y musl-tools - - name: Create release directory - if: | - github.event_name == 'create' && github.event.ref_type == 'tag' && - matrix.platform.target == 'x86_64-unknown-linux-gnu' - run: rsync -rv --exclude=.git . ../cloud-hypervisor-${{ github.event.ref }} - - name: Build ${{ matrix.platform.target }} - uses: houseabsolute/actions-rust-cross@v1 - with: - command: build - target: ${{ matrix.platform.target }} - args: ${{ matrix.platform.args }} - strip: true - toolchain: "1.89.0" - - name: Copy Release Binaries - if: github.event_name == 'create' && github.event.ref_type == 'tag' - shell: bash - run: | - cp target/${{ matrix.platform.target }}/release/cloud-hypervisor ./${{ matrix.platform.name_ch }} - cp target/${{ matrix.platform.target }}/release/ch-remote ./${{ matrix.platform.name_ch_remote }} - - name: Upload Release Artifacts - if: github.event_name == 'create' && github.event.ref_type == 'tag' - uses: actions/upload-artifact@v6 - with: - name: Artifacts for ${{ matrix.platform.target }} - path: | - ./${{ matrix.platform.name_ch }} - ./${{ matrix.platform.name_ch_remote }} - - name: Vendor - if: | - github.event_name == 'create' && github.event.ref_type == 'tag' && - matrix.platform.target == 'x86_64-unknown-linux-gnu' - working-directory: ../cloud-hypervisor-${{ github.event.ref }} - run: | - mkdir ../vendor-cargo-home - export CARGO_HOME=$(realpath ../vendor-cargo-home) - mkdir .cargo - cargo vendor > .cargo/config.toml - - name: Create vendored source archive - if: | - github.event_name == 'create' && github.event.ref_type == 'tag' && - matrix.platform.target == 'x86_64-unknown-linux-gnu' - run: tar cJf cloud-hypervisor-${{ github.event.ref }}.tar.xz ../cloud-hypervisor-${{ github.event.ref }} - - name: Upload cloud-hypervisor vendored source archive - if: | - github.event_name == 'create' && github.event.ref_type == 'tag' && - matrix.platform.target == 'x86_64-unknown-linux-gnu' - id: upload-release-cloud-hypervisor-vendored-sources - uses: actions/upload-artifact@v6 - with: - path: cloud-hypervisor-${{ github.event.ref }}.tar.xz - name: cloud-hypervisor-${{ github.event.ref }}.tar.xz - - name: Create GitHub Release - if: github.event_name == 'create' && github.event.ref_type == 'tag' - uses: softprops/action-gh-release@v2 - with: - draft: true - files: | - ./${{ matrix.platform.name_ch }} - ./${{ matrix.platform.name_ch_remote }} - ./cloud-hypervisor-${{ github.event.ref }}.tar.xz diff --git a/.github/workflows/shlint.yaml b/.github/workflows/shlint.yaml deleted file mode 100644 index 068b9930ed..0000000000 --- a/.github/workflows/shlint.yaml +++ /dev/null @@ -1,20 +0,0 @@ -name: Shell scripts check -on: - pull_request: - merge_group: - push: - branches: - - main - -jobs: - sh-checker: - name: Check shell scripts - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v6 - - name: Run the shell script checkers - uses: luizm/action-sh-checker@master - env: - SHFMT_OPTS: -i 4 -d - 
SHELLCHECK_OPTS: -x --source-path scripts diff --git a/Cargo.lock b/Cargo.lock index 62bbf709fe..288bc9f800 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -266,6 +266,29 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "aws-lc-rs" +version = "1.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "879b6c89592deb404ba4dc0ae6b58ffd1795c78991cbb5b8bc441c48a070440d" +dependencies = [ + "aws-lc-sys", + "zeroize", +] + +[[package]] +name = "aws-lc-sys" +version = "0.32.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "107a4e9d9cab9963e04e84bb8dee0e25f2a987f9a8bad5ed054abd439caa8f8c" +dependencies = [ + "bindgen", + "cc", + "cmake", + "dunce", + "fs_extra", +] + [[package]] name = "backtrace" version = "0.3.76" @@ -281,6 +304,26 @@ dependencies = [ "windows-link", ] +[[package]] +name = "bindgen" +version = "0.72.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" +dependencies = [ + "bitflags 2.10.0", + "cexpr", + "clang-sys", + "itertools 0.13.0", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash 2.1.1", + "shlex", + "syn", +] + [[package]] name = "bitfield-struct" version = "0.10.1" @@ -375,6 +418,15 @@ dependencies = [ "shlex", ] +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + [[package]] name = "cfg-if" version = "1.0.4" @@ -387,6 +439,17 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading", +] + [[package]] name = "clap" version = "4.5.53" @@ -446,6 +509,15 @@ dependencies = [ "zbus", ] +[[package]] +name = "cmake" +version = "0.1.54" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7caa3f9de89ddbe2c607f4101924c5abec803763ae9534e4f4d7d8f84aa81f0" +dependencies = [ + "cc", +] + [[package]] name = "colorchoice" version = "1.0.4" @@ -574,7 +646,7 @@ dependencies = [ "lazy_static", "mintex", "parking_lot", - "rustc-hash", + "rustc-hash 1.1.0", "serde", "serde_json", "thousands", @@ -601,6 +673,12 @@ dependencies = [ "windows-sys 0.61.0", ] +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + [[package]] name = "either" version = "1.15.0" @@ -764,6 +842,12 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + [[package]] name = "futures" version = "0.3.31" @@ -1055,6 +1139,15 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.14.0" @@ -1160,6 +1253,16 @@ version = "0.2.178" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37c93d8daa9d8a012fd8ab92f088405fb202ea0b6ab73ee2482ae66af4f42091" +[[package]] +name = "libloading" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +dependencies = [ + "cfg-if", + "windows-link", +] + [[package]] name = "libredox" version = "0.1.10" @@ -1256,6 +1359,12 @@ dependencies = [ "vmm-sys-util", ] +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "miniz_oxide" version = "0.8.9" @@ -1346,6 +1455,16 @@ version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43794a0ace135be66a25d3ae77d41b91615fb68ae937f904090203e81f755b65" +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -1689,6 +1808,16 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn", +] + [[package]] name = "proc-macro-crate" version = "3.4.0" @@ -1828,6 +1957,20 @@ dependencies = [ "syn", ] +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.15", + "libc", + "untrusted", + "windows-sys 0.52.0", +] + [[package]] name = "rustc-demangle" version = "0.1.26" @@ -1840,6 +1983,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "rustc-hash" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" + [[package]] name = "rustix" version = "1.1.2" @@ -1853,6 +2002,42 @@ dependencies = [ "windows-sys 0.61.0", ] +[[package]] +name = "rustls" +version = "0.23.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a9586e9ee2b4f8fab52a0048ca7334d7024eef48e2cb9407e3497bb7cab7fa7" +dependencies = [ + "aws-lc-rs", + "log", + "once_cell", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-pki-types" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "229a4a4c221013e7e1f1a043678c5cc39fe5171437c88fb47151a21e6f5b5c79" +dependencies = [ + "zeroize", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.7" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10b3f4191e8a80e6b43eebabfac91e5dcecebb27a71f04e820c47ec41d314bf" +dependencies = [ + "aws-lc-rs", + "ring", + "rustls-pki-types", + "untrusted", +] + [[package]] name = "rustversion" version = "1.0.22" @@ -2036,6 +2221,12 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + [[package]] name = "syn" version = "2.0.111" @@ -2230,6 +2421,12 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + [[package]] name = "utf8parse" version = "0.2.2" @@ -2460,7 +2657,8 @@ name = "vm-migration" version = "0.1.0" dependencies = [ "anyhow", - "itertools", + "itertools 0.14.0", + "rustls", "serde", "serde_json", "thiserror 2.0.17", @@ -2499,6 +2697,7 @@ dependencies = [ "hypervisor", "igvm", "igvm_defs", + "kvm-bindings", "landlock", "libc", "linux-loader", @@ -2655,13 +2854,22 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-sys" version = "0.60.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" dependencies = [ - "windows-targets", + "windows-targets 0.53.5", ] [[package]] @@ -2673,6 +2881,22 @@ dependencies = [ "windows-link", ] +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + [[package]] name = "windows-targets" version = "0.53.5" @@ -2680,58 +2904,106 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" dependencies = [ "windows-link", - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm 0.53.1", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + "windows_x86_64_msvc 0.53.1", ] +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + [[package]] name = "windows_aarch64_gnullvm" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + [[package]] name = "windows_aarch64_msvc" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + [[package]] name = "windows_i686_gnu" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + [[package]] name = "windows_i686_gnullvm" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + [[package]] name = "windows_i686_msvc" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + [[package]] name = "windows_x86_64_gnu" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + [[package]] name = "windows_x86_64_gnullvm" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + [[package]] name = "windows_x86_64_msvc" version = "0.53.1" @@ -2834,6 +3106,12 @@ dependencies = [ "syn", ] +[[package]] +name = "zeroize" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" + [[package]] name = "zstd" version = "0.13.3" diff --git a/Cargo.toml b/Cargo.toml index b738dde5bf..433b75d4c1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,7 +6,20 @@ codegen-units = 1 lto = true opt-level = "s" -strip = true + +# Tradeof between performance and fast 
compilation times for local testing and +# development with frequent rebuilds. +[profile.optimized-dev] +codegen-units = 16 +inherits = "release" +lto = false +opt-level = 2 +strip = false + +# Optimize more for dependencies: They don't require frequent rebuilds. +[profile.optimized-dev.package."*"] +codegen-units = 1 +opt-level = 3 [profile.profiling] debug = true @@ -87,6 +100,7 @@ flume = "0.12.0" itertools = "0.14.0" libc = "0.2.178" log = "0.4.29" +rustls = "0.23.34" signal-hook = "0.3.18" thiserror = "2.0.17" uuid = { version = "1.19.0" } diff --git a/README.md b/README.md index fdb18255f0..874c4944ce 100644 --- a/README.md +++ b/README.md @@ -1,402 +1,27 @@ -- [1. What is Cloud Hypervisor?](#1-what-is-cloud-hypervisor) - - [Objectives](#objectives) - - [High Level](#high-level) - - [Architectures](#architectures) - - [Guest OS](#guest-os) -- [2. Getting Started](#2-getting-started) - - [Host OS](#host-os) - - [Use Pre-built Binaries](#use-pre-built-binaries) - - [Packages](#packages) - - [Building from Source](#building-from-source) - - [Booting Linux](#booting-linux) - - [Firmware Booting](#firmware-booting) - - [Custom Kernel and Disk Image](#custom-kernel-and-disk-image) - - [Building your Kernel](#building-your-kernel) - - [Disk image](#disk-image) - - [Booting the guest VM](#booting-the-guest-vm) -- [3. Status](#3-status) - - [Hot Plug](#hot-plug) - - [Device Model](#device-model) - - [Roadmap](#roadmap) -- [4. Relationship with _Rust VMM_ Project](#4-relationship-with-rust-vmm-project) - - [Differences with Firecracker and crosvm](#differences-with-firecracker-and-crosvm) -- [5. Community](#5-community) - - [Contribute](#contribute) - - [Slack](#slack) - - [Mailing list](#mailing-list) - - [Security issues](#security-issues) - -# 1. What is Cloud Hypervisor? - -Cloud Hypervisor is an open source Virtual Machine Monitor (VMM) that runs on -top of the [KVM](https://www.kernel.org/doc/Documentation/virtual/kvm/api.txt) -hypervisor and the Microsoft Hypervisor (MSHV). - -The project focuses on running modern, _Cloud Workloads_, on specific, common, -hardware architectures. In this case _Cloud Workloads_ refers to those that are -run by customers inside a Cloud Service Provider. This means modern operating -systems with most I/O handled by -paravirtualised devices (e.g. _virtio_), no requirement for legacy devices, and -64-bit CPUs. - -Cloud Hypervisor is implemented in [Rust](https://www.rust-lang.org/) and is -based on the [Rust VMM](https://github.com/rust-vmm) crates. - -## Objectives - -### High Level - -- Runs on KVM or MSHV -- Minimal emulation -- Low latency -- Low memory footprint -- Low complexity -- High performance -- Small attack surface -- 64-bit support only -- CPU, memory, PCI hotplug -- Machine to machine migration - -### Architectures - -Cloud Hypervisor supports the `x86-64`, `AArch64` and `riscv64` -architectures, with functionality varying across these platforms. The -functionality differences between `x86-64` and `AArch64` are documented -in [#1125](https://github.com/cloud-hypervisor/cloud-hypervisor/issues/1125). -The `riscv64` architecture support is experimental and offers limited -functionality. For more details and instructions, please refer to [riscv -documentation](docs/riscv.md). - -### Guest OS - -Cloud Hypervisor supports `64-bit Linux` and Windows 10/Windows Server 2019. - -# 2. Getting Started - -The following sections describe how to build and run Cloud Hypervisor. 
- -## Prerequisites for AArch64 - -- AArch64 servers (recommended) or development boards equipped with the GICv3 - interrupt controller. - -## Host OS - -For required KVM functionality and adequate performance the recommended host -kernel version is 5.13. The majority of the CI currently tests with kernel -version 5.15. - -## Use Pre-built Binaries - -The recommended approach to getting started with Cloud Hypervisor is by using a -pre-built binary. Binaries are available for the [latest -release](https://github.com/cloud-hypervisor/cloud-hypervisor/releases/latest). -Use `cloud-hypervisor-static` for `x86-64` or `cloud-hypervisor-static-aarch64` -for `AArch64` platform. - -## Packages - -For convenience, packages are also available targeting some popular Linux -distributions. This is thanks to the [Open Build -Service](https://build.opensuse.org). The [OBS -README](https://github.com/cloud-hypervisor/obs-packaging) explains how to -enable the repository in a supported Linux distribution and install Cloud Hypervisor -and accompanying packages. Please report any packaging issues in the -[obs-packaging](https://github.com/cloud-hypervisor/obs-packaging) repository. - -## Building from Source - -Please see the [instructions for building from source](docs/building.md) if you -do not wish to use the pre-built binaries. - -## Booting Linux - -Cloud Hypervisor supports direct kernel boot (the x86-64 kernel requires the kernel -built with PVH support or a bzImage) or booting via a firmware (either [Rust Hypervisor -Firmware](https://github.com/cloud-hypervisor/rust-hypervisor-firmware) or an -edk2 UEFI firmware called `CLOUDHV` / `CLOUDHV_EFI`.) - -Binary builds of the firmware files are available for the latest release of -[Rust Hypervisor -Firmware](https://github.com/cloud-hypervisor/rust-hypervisor-firmware/releases/latest) -and [our edk2 -repository](https://github.com/cloud-hypervisor/edk2/releases/latest) - -The choice of firmware depends on your guest OS choice; some experimentation -may be required. - -### Firmware Booting - -Cloud Hypervisor supports booting disk images containing all needed components -to run cloud workloads, a.k.a. cloud images. - -The following sample commands will download an Ubuntu Cloud image, converting -it into a format that Cloud Hypervisor can use and a firmware to boot the image -with. - -```shell -$ wget https://cloud-images.ubuntu.com/focal/current/focal-server-cloudimg-amd64.img -$ qemu-img convert -p -f qcow2 -O raw focal-server-cloudimg-amd64.img focal-server-cloudimg-amd64.raw -$ wget https://github.com/cloud-hypervisor/rust-hypervisor-firmware/releases/download/0.4.2/hypervisor-fw -``` - -The Ubuntu cloud images do not ship with a default password so it necessary to -use a `cloud-init` disk image to customise the image on the first boot. A basic -`cloud-init` image is generated by this [script](scripts/create-cloud-init.sh). -This seeds the image with a default username/password of `cloud/cloud123`. It -is only necessary to add this disk image on the first boot. Script also assigns -default IP address using `test_data/cloud-init/ubuntu/local/network-config` details -with `--net "mac=12:34:56:78:90:ab,tap="` option. Then the matching mac address -interface will be enabled as per `network-config` details. 
- -```shell -$ sudo setcap cap_net_admin+ep ./cloud-hypervisor -$ ./create-cloud-init.sh -$ ./cloud-hypervisor \ - --firmware ./hypervisor-fw \ - --disk path=focal-server-cloudimg-amd64.raw path=/tmp/ubuntu-cloudinit.img \ - --cpus boot=4 \ - --memory size=1024M \ - --net "tap=,mac=,ip=,mask=" -``` - -If access to the firmware messages or interaction with the boot loader (e.g. -GRUB) is required then it necessary to switch to the serial console instead of -`virtio-console`. - -```shell -$ ./cloud-hypervisor \ - --kernel ./hypervisor-fw \ - --disk path=focal-server-cloudimg-amd64.raw path=/tmp/ubuntu-cloudinit.img \ - --cpus boot=4 \ - --memory size=1024M \ - --net "tap=,mac=,ip=,mask=" \ - --serial tty \ - --console off -``` - -## Booting: `--firmware` vs `--kernel` - -The following scenarios are supported by Cloud Hypervisor to bootstrap a VM, i.e., -to load a payload/bootitem(s): - -- Provide firmware -- Provide kernel \[+ cmdline\]\ [+ initrd\] - -Please note that our Cloud Hypervisor firmware (`hypervisor-fw`) has a Xen PVH -boot entry, therefore it can also be booted via the `--kernel` parameter, as -seen in some examples. - -### Custom Kernel and Disk Image - -#### Building your Kernel - -Cloud Hypervisor also supports direct kernel boot. For x86-64, a `vmlinux` ELF kernel (compiled with PVH support) or a regular bzImage are supported. In order to support development there is a custom branch; however provided the required options are enabled any recent kernel will suffice. - -To build the kernel: - -```shell -# Clone the Cloud Hypervisor Linux branch -$ git clone --depth 1 https://github.com/cloud-hypervisor/linux.git -b ch-6.12.8 linux-cloud-hypervisor -$ pushd linux-cloud-hypervisor -$ make ch_defconfig -# Do native build of the x86-64 kernel -$ KCFLAGS="-Wa,-mx86-used-note=no" make bzImage -j `nproc` -# Do native build of the AArch64 kernel -$ make -j `nproc` -$ popd -``` - -For x86-64, the `vmlinux` kernel image will then be located at -`linux-cloud-hypervisor/arch/x86/boot/compressed/vmlinux.bin`. -For AArch64, the `Image` kernel image will then be located at -`linux-cloud-hypervisor/arch/arm64/boot/Image`. - -#### Disk image - -For the disk image the same Ubuntu image as before can be used. This contains -an `ext4` root filesystem. - -```shell -$ wget https://cloud-images.ubuntu.com/focal/current/focal-server-cloudimg-amd64.img # x86-64 -$ wget https://cloud-images.ubuntu.com/focal/current/focal-server-cloudimg-arm64.img # AArch64 -$ qemu-img convert -p -f qcow2 -O raw focal-server-cloudimg-amd64.img focal-server-cloudimg-amd64.raw # x86-64 -$ qemu-img convert -p -f qcow2 -O raw focal-server-cloudimg-arm64.img focal-server-cloudimg-arm64.raw # AArch64 -``` - -#### Booting the guest VM - -These sample commands boot the disk image using the custom kernel whilst also -supplying the desired kernel command line. 
- -- x86-64 - -```shell -$ sudo setcap cap_net_admin+ep ./cloud-hypervisor -$ ./create-cloud-init.sh -$ ./cloud-hypervisor \ - --kernel ./linux-cloud-hypervisor/arch/x86/boot/compressed/vmlinux.bin \ - --disk path=focal-server-cloudimg-amd64.raw path=/tmp/ubuntu-cloudinit.img \ - --cmdline "console=hvc0 root=/dev/vda1 rw" \ - --cpus boot=4 \ - --memory size=1024M \ - --net "tap=,mac=,ip=,mask=" -``` - -- AArch64 - -```shell -$ sudo setcap cap_net_admin+ep ./cloud-hypervisor -$ ./create-cloud-init.sh -$ ./cloud-hypervisor \ - --kernel ./linux-cloud-hypervisor/arch/arm64/boot/Image \ - --disk path=focal-server-cloudimg-arm64.raw path=/tmp/ubuntu-cloudinit.img \ - --cmdline "console=hvc0 root=/dev/vda1 rw" \ - --cpus boot=4 \ - --memory size=1024M \ - --net "tap=,mac=,ip=,mask=" -``` - -If earlier kernel messages are required the serial console should be used instead of `virtio-console`. - -- x86-64 - -```shell -$ ./cloud-hypervisor \ - --kernel ./linux-cloud-hypervisor/arch/x86/boot/compressed/vmlinux.bin \ - --console off \ - --serial tty \ - --disk path=focal-server-cloudimg-amd64.raw \ - --cmdline "console=ttyS0 root=/dev/vda1 rw" \ - --cpus boot=4 \ - --memory size=1024M \ - --net "tap=,mac=,ip=,mask=" -``` - -- AArch64 - -```shell -$ ./cloud-hypervisor \ - --kernel ./linux-cloud-hypervisor/arch/arm64/boot/Image \ - --console off \ - --serial tty \ - --disk path=focal-server-cloudimg-arm64.raw \ - --cmdline "console=ttyAMA0 root=/dev/vda1 rw" \ - --cpus boot=4 \ - --memory size=1024M \ - --net "tap=,mac=,ip=,mask=" -``` - -# 3. Status - -Cloud Hypervisor is under active development. The following stability -guarantees are currently made: - -* The API (including command line options) will not be removed or changed in a - breaking way without a minimum of 2 major releases notice. Where possible - warnings will be given about the use of deprecated functionality and the - deprecations will be documented in the release notes. - -* Point releases will be made between individual releases where there are - substantial bug fixes or security issues that need to be fixed. These point - releases will only include bug fixes. - -Currently the following items are **not** guaranteed across updates: - -* Snapshot/restore is not supported across different versions -* Live migration is not supported across different versions -* The following features are considered experimental and may change - substantially between releases: TDX, vfio-user, vDPA. - -Further details can be found in the [release documentation](docs/releases.md). - -As of 2023-01-03, the following cloud images are supported: - -- [Ubuntu Focal](https://cloud-images.ubuntu.com/focal/current/) (focal-server-cloudimg-{amd64,arm64}.img) -- [Ubuntu Jammy](https://cloud-images.ubuntu.com/jammy/current/) (jammy-server-cloudimg-{amd64,arm64}.img) -- [Ubuntu Noble](https://cloud-images.ubuntu.com/noble/current/) (noble-server-cloudimg-{amd64,arm64}.img) -- [Fedora 36](https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/36/Cloud/) ([Fedora-Cloud-Base-36-1.5.x86_64.raw.xz](https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/36/Cloud/x86_64/images/) / [Fedora-Cloud-Base-36-1.5.aarch64.raw.xz](https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/36/Cloud/aarch64/images/)) - -Direct kernel boot to userspace should work with a rootfs from most -distributions although you may need to enable exotic filesystem types in the -reference kernel configuration (e.g. XFS or btrfs.) 
- -## Hot Plug - -Cloud Hypervisor supports hotplug of CPUs, passthrough devices (VFIO), -`virtio-{net,block,pmem,fs,vsock}` and memory resizing. This -[document](docs/hotplug.md) details how to add devices to a running VM. - -## Device Model - -Details of the device model can be found in this -[documentation](docs/device_model.md). - -## Roadmap - -The project roadmap is tracked through a [GitHub -project](https://github.com/orgs/cloud-hypervisor/projects/6). - -# 4. Relationship with _Rust VMM_ Project - -In order to satisfy the design goal of having a high-performance, -security-focused hypervisor the decision was made to use the -[Rust](https://www.rust-lang.org/) programming language. The language's strong -focus on memory and thread safety makes it an ideal candidate for implementing -VMMs. - -Instead of implementing the VMM components from scratch, Cloud Hypervisor is -importing the [Rust VMM](https://github.com/rust-vmm) crates, and sharing code -and architecture together with other VMMs like e.g. Amazon's -[Firecracker](https://firecracker-microvm.github.io/) and Google's -[crosvm](https://chromium.googlesource.com/chromiumos/platform/crosvm/). - -Cloud Hypervisor embraces the _Rust VMM_ project's goals, which is to be able -to share and re-use as many virtualization crates as possible. - -## Differences with Firecracker and crosvm - -A large part of the Cloud Hypervisor code is based on either the Firecracker or -the crosvm project's implementations. Both of these are VMMs written in Rust -with a focus on safety and security, like Cloud Hypervisor. - -The goal of the Cloud Hypervisor project differs from the aforementioned -projects in that it aims to be a general purpose VMM for _Cloud Workloads_ and -not limited to container/serverless or client workloads. - -The Cloud Hypervisor community thanks the communities of both the Firecracker -and crosvm projects for their excellent work. - -# 5. Community - -The Cloud Hypervisor project follows the governance, and community guidelines -described in the [Community](https://github.com/cloud-hypervisor/community) -repository. - -## Contribute - -The project strongly believes in building a global, diverse and collaborative -community around the Cloud Hypervisor project. Anyone who is interested in -[contributing](CONTRIBUTING.md) to the project is welcome to participate. - -Contributing to a open source project like Cloud Hypervisor covers a lot more -than just sending code. Testing, documentation, pull request -reviews, bug reports, feature requests, project improvement suggestions, etc, -are all equal and welcome means of contribution. See the -[CONTRIBUTING](CONTRIBUTING.md) document for more details. - -## Slack - -Get an [invite to our Slack channel](https://join.slack.com/t/cloud-hypervisor/shared_invite/enQtNjY3MTE3MDkwNDQ4LWQ1MTA1ZDVmODkwMWQ1MTRhYzk4ZGNlN2UwNTI3ZmFlODU0OTcwOWZjMTkwZDExYWE3YjFmNzgzY2FmNDAyMjI), - [join us on Slack](https://cloud-hypervisor.slack.com/), and [participate in our community activities](https://cloud-hypervisor.slack.com/archives/C04R5DUQVBN). - -## Mailing list - -Please report bugs using the [GitHub issue -tracker](https://github.com/cloud-hypervisor/cloud-hypervisor/issues) but for -broader community discussions you may use our [mailing -list](https://lists.cloudhypervisor.org/g/dev/). - -## Security issues - -Please contact the maintainers listed in the MAINTAINERS.md file with security issues. 
+# Cloud Hypervisor Fork for SAP gardenlinux + +The `gardenlinux` branch is the branch from that our SAP colleagues [build] +[sap-gl-ci] their Cloud Hypervisor packages. + +## Development Model + +- The `gardenlinux` branch is always what SAP builds. From SAPs side, we can + force push or rewrite history on that branch. +- We use branch protection for `gradenlinux`, PRs, CI, and code reviews +- With every new CHV release, we rename `gardenlinux` to `gardenlinux-vXX` and + create a new `gardenlinux` branch manually: + - use release as base and push it into the repo + - cherry-pick all commits from `gardenlinux-vXX` that are still relevant onto a + new branch and create a pull request against this fork + - adapt git commit history +- PoC Development: + - happens here (in [cyberus-technology/cloud-hypervisor](https://github.com/cyberus-technology/cloud-hypervisor)) + - open PR against `gardenlinux` + - Branch name patterns **must not** follow `gardenlinux-*` pattern + - We recommend `cyberus-fork-*` as branch pattern to better keep the overview. +- Productization: + - happens upstream (in [cloud-hypervisor/cloud-hypervisor](https://github.com/cloud-hypervisor/cloud-hypervisor)) + - We recommend `productize-*` as branch pattern to better keep the overview. + + +[sap-gl-ci]: https://github.com/gardenlinux/package-cloud-hypervisor-gl/blob/main/prepare_source#L1 \ No newline at end of file diff --git a/cloud-hypervisor/src/bin/ch-remote.rs b/cloud-hypervisor/src/bin/ch-remote.rs index fd48ffab1f..fa4350a960 100644 --- a/cloud-hypervisor/src/bin/ch-remote.rs +++ b/cloud-hypervisor/src/bin/ch-remote.rs @@ -9,7 +9,9 @@ mod test_util; use std::io::Read; use std::marker::PhantomData; +use std::num::NonZeroU32; use std::os::unix::net::UnixStream; +use std::path::PathBuf; use std::process; use api_client::{ @@ -492,11 +494,34 @@ fn rest_api_do_command(matches: &ArgMatches, socket: &mut UnixStream) -> ApiResu .subcommand_matches("send-migration") .unwrap() .get_one::("send_migration_config") - .unwrap(), + .unwrap() + .to_owned(), matches .subcommand_matches("send-migration") .unwrap() .get_flag("send_migration_local"), + *matches + .subcommand_matches("send-migration") + .unwrap() + .get_one::("downtime-ms") + .unwrap_or(&300), + *matches + .subcommand_matches("send-migration") + .unwrap() + .get_one::("migration-timeout-s") + .unwrap_or(&3600), + matches + .subcommand_matches("send-migration") + .unwrap() + .get_one::("connections") + .copied() + .and_then(NonZeroU32::new) + .unwrap_or(NonZeroU32::new(1).unwrap()), + matches + .subcommand_matches("send-migration") + .unwrap() + .get_one::("tls-dir") + .cloned(), ); simple_api_command(socket, "PUT", "send-migration", Some(&send_migration_data)) .map_err(Error::HttpApiClient) @@ -507,7 +532,13 @@ fn rest_api_do_command(matches: &ArgMatches, socket: &mut UnixStream) -> ApiResu .subcommand_matches("receive-migration") .unwrap() .get_one::("receive_migration_config") - .unwrap(), + .unwrap() + .to_owned(), + matches + .subcommand_matches("receive-migration") + .unwrap() + .get_one::("tls-dir") + .cloned(), ); simple_api_command( socket, @@ -711,6 +742,16 @@ fn dbus_api_do_command(matches: &ArgMatches, proxy: &DBusApi1ProxyBlocking<'_>) .subcommand_matches("send-migration") .unwrap() .get_flag("send_migration_local"), + *matches + .subcommand_matches("send-migration") + .unwrap() + .get_one::("downtime-ms") + .unwrap_or(&300), + *matches + .subcommand_matches("send-migration") + .unwrap() + .get_one::("migration-timeout-s") + .unwrap_or(&3600), ); 
proxy.api_vm_send_migration(&send_migration_data) } @@ -901,18 +942,35 @@ fn coredump_config(destination_url: &str) -> String { serde_json::to_string(&coredump_config).unwrap() } -fn receive_migration_data(url: &str) -> String { +fn receive_migration_data(url: String, tls_dir: Option) -> String { let receive_migration_data = vmm::api::VmReceiveMigrationData { - receiver_url: url.to_owned(), + receiver_url: url, + tcp_serial_url: None, + // Only FDs transmitted via an SCM_RIGHTS UNIX Domain Socket message + // are valid. Transmitting specific FD nums via the HTTP API is + // almost always invalid. + net_fds: None, + tls_dir, }; serde_json::to_string(&receive_migration_data).unwrap() } -fn send_migration_data(url: &str, local: bool) -> String { +fn send_migration_data( + url: String, + local: bool, + downtime: u64, + migration_timeout: u64, + connections: NonZeroU32, + tls_dir: Option, +) -> String { let send_migration_data = vmm::api::VmSendMigrationData { - destination_url: url.to_owned(), + destination_url: url, local, + downtime, + migration_timeout, + connections, + tls_dir, }; serde_json::to_string(&send_migration_data).unwrap() @@ -1024,7 +1082,14 @@ fn get_cli_commands_sorted() -> Box<[Command]> { .arg( Arg::new("receive_migration_config") .index(1) + // Live migration with net_fds not supported in ch-remote. .help(""), + ) + .arg( + Arg::new("tls-dir") + .long("tls-dir") + .help("directory with TLS certificates") + .num_args(1), ), Command::new("remove-device") .about("Remove VFIO and PCI device") @@ -1087,6 +1152,32 @@ fn get_cli_commands_sorted() -> Box<[Command]> { Command::new("resume").about("Resume the VM"), Command::new("send-migration") .about("Initiate a VM migration") + .arg( + Arg::new("connections") + .long("connections") + .help("The number of connections to use for the migration") + .num_args(1) + .value_parser(clap::value_parser!(u32)) + .default_value("1"), + ) + .arg( + Arg::new("downtime-ms") + .long("downtime-ms") + .visible_alias("downtime") + .help("Set the expected maximum downtime in milliseconds") + .num_args(1) + .value_parser(clap::value_parser!(u64)) + .default_value("300"), + ) + .arg( + Arg::new("migration-timeout-s") + .long("migration-timeout-s") + .visible_alias("migration-timeout") + .help("Set the maximum allowed migration time in seconds") + .num_args(1) + .value_parser(clap::value_parser!(u64)) + .default_value("3600"), + ) .arg( Arg::new("send_migration_config") .index(1) @@ -1097,6 +1188,12 @@ fn get_cli_commands_sorted() -> Box<[Command]> { .long("local") .num_args(0) .action(ArgAction::SetTrue), + ) + .arg( + Arg::new("tls-dir") + .long("tls-dir") + .help("directory with TLS certificates") + .num_args(1), ), Command::new("shutdown").about("Shutdown the VM"), Command::new("shutdown-vmm").about("Shutdown the VMM"), diff --git a/cloud-hypervisor/src/main.rs b/cloud-hypervisor/src/main.rs index d08293b6e4..d3ebac9dd5 100644 --- a/cloud-hypervisor/src/main.rs +++ b/cloud-hypervisor/src/main.rs @@ -421,7 +421,7 @@ fn get_cli_options_sorted( .default_value("true"), Arg::new("serial") .long("serial") - .help("Control serial port: off|null|pty|tty|file=|socket=") + .help("Control serial port: off|null|pty|tty|file=|socket=|tcp=") .default_value("null") .group("vm-config"), Arg::new("tpm") @@ -995,6 +995,7 @@ mod unit_tests { rng: RngConfig { src: PathBuf::from("/dev/urandom"), iommu: false, + bdf_device: None, }, balloon: None, fs: None, @@ -1004,12 +1005,16 @@ mod unit_tests { mode: ConsoleOutputMode::Null, iommu: false, socket: None, + url: None, + 
bdf_device: None, }, console: ConsoleConfig { file: None, mode: ConsoleOutputMode::Tty, iommu: false, socket: None, + url: None, + bdf_device: None, }, #[cfg(target_arch = "x86_64")] debug_console: DebugConsoleConfig::default(), @@ -1211,6 +1216,24 @@ mod unit_tests { }"#, true, ), + ( + vec![ + "cloud-hypervisor", + "--kernel", + "/path/to/kernel", + "--disk", + "path=/path/to/disk/1,addr=15.0", + "path=/path/to/disk/2", + ], + r#"{ + "payload": {"kernel": "/path/to/kernel"}, + "disks": [ + {"path": "/path/to/disk/1", "bdf_device": 21}, + {"path": "/path/to/disk/2"} + ] + }"#, + true, + ), ( vec![ "cloud-hypervisor", @@ -1408,6 +1431,20 @@ mod unit_tests { }"#, true, ), + ( + vec![ + "cloud-hypervisor", "--kernel", "/path/to/kernel", + "--net", + "mac=12:34:56:78:90:ab,host_mac=34:56:78:90:ab:cd,tap=tap0,ip=1.2.3.4,mask=5.6.7.8,addr=08.0", + ], + r#"{ + "payload": {"kernel": "/path/to/kernel"}, + "net": [ + {"mac": "12:34:56:78:90:ab", "host_mac": "34:56:78:90:ab:cd", "tap": "tap0", "ip": "1.2.3.4", "mask": "5.6.7.8", "num_queues": 2, "queue_size": 256, "bdf_device": 8} + ] + }"#, + true, + ), #[cfg(target_arch = "x86_64")] ( vec![ @@ -1479,11 +1516,11 @@ mod unit_tests { "--kernel", "/path/to/kernel", "--rng", - "src=/path/to/entropy/source", + "src=/path/to/entropy/source,addr=11.0", ], r#"{ "payload": {"kernel": "/path/to/kernel"}, - "rng": {"src": "/path/to/entropy/source"} + "rng": {"src": "/path/to/entropy/source", "bdf_device": 17} }"#, true, )] @@ -1500,14 +1537,14 @@ mod unit_tests { "cloud-hypervisor", "--kernel", "/path/to/kernel", "--memory", "shared=true", "--fs", - "tag=virtiofs1,socket=/path/to/sock1", + "tag=virtiofs1,socket=/path/to/sock1,addr=10.0", "tag=virtiofs2,socket=/path/to/sock2", ], r#"{ "payload": {"kernel": "/path/to/kernel"}, "memory" : { "shared": true, "size": 536870912 }, "fs": [ - {"tag": "virtiofs1", "socket": "/path/to/sock1"}, + {"tag": "virtiofs1", "socket": "/path/to/sock1", "bdf_device": 16}, {"tag": "virtiofs2", "socket": "/path/to/sock2"} ] }"#, @@ -1579,13 +1616,13 @@ mod unit_tests { "--kernel", "/path/to/kernel", "--pmem", - "file=/path/to/img/1,size=1G", + "file=/path/to/img/1,size=1G,addr=1F.0", "file=/path/to/img/2,size=2G", ], r#"{ "payload": {"kernel": "/path/to/kernel"}, "pmem": [ - {"file": "/path/to/img/1", "size": 1073741824}, + {"file": "/path/to/img/1", "size": 1073741824,"bdf_device": 31}, {"file": "/path/to/img/2", "size": 2147483648} ] }"#, @@ -1863,13 +1900,13 @@ mod unit_tests { "--kernel", "/path/to/kernel", "--vdpa", - "path=/path/to/device/1", + "path=/path/to/device/1,addr=18.0", "path=/path/to/device/2,num_queues=2", ], r#"{ "payload": {"kernel": "/path/to/kernel"}, "vdpa": [ - {"path": "/path/to/device/1", "num_queues": 1}, + {"path": "/path/to/device/1", "num_queues": 1, "bdf_device": 24}, {"path": "/path/to/device/2", "num_queues": 2} ] }"#, @@ -1908,11 +1945,11 @@ mod unit_tests { "--kernel", "/path/to/kernel", "--vsock", - "cid=123,socket=/path/to/sock/1", + "cid=123,socket=/path/to/sock/1,addr=0F.0", ], r#"{ "payload": {"kernel": "/path/to/kernel"}, - "vsock": {"cid": 123, "socket": "/path/to/sock/1"} + "vsock": {"cid": 123, "socket": "/path/to/sock/1", "bdf_device": 15} }"#, true, ), diff --git a/cloud-hypervisor/tests/integration.rs b/cloud-hypervisor/tests/integration.rs index 2164437954..336b11776c 100644 --- a/cloud-hypervisor/tests/integration.rs +++ b/cloud-hypervisor/tests/integration.rs @@ -7866,7 +7866,9 @@ mod ivshmem { &migration_socket, &src_api_socket, &dest_api_socket, - local + local, + 300, + 60 ), 
"Unsuccessful command: 'send-migration' or 'receive-migration'." ); @@ -10037,6 +10039,8 @@ mod live_migration { src_api_socket: &str, dest_api_socket: &str, local: bool, + downtime: u64, + timeout: u64, ) -> bool { // Start to receive migration from the destination VM let mut receive_migration = Command::new(clh_command("ch-remote")) @@ -10057,6 +10061,10 @@ mod live_migration { format!("--api-socket={}", &src_api_socket), "send-migration".to_string(), format! {"unix:{migration_socket}"}, + "--downtime".to_string(), + format!("{downtime}"), + "--migration-timeout".to_string(), + format!("{timeout}"), ] .to_vec(); @@ -10281,8 +10289,18 @@ mod live_migration { .unwrap(), ); + let downtime = 100_000; // 100s + let migration_timeout = 1000; // 1000s + assert!( - start_live_migration(&migration_socket, &src_api_socket, &dest_api_socket, local), + start_live_migration( + &migration_socket, + &src_api_socket, + &dest_api_socket, + local, + downtime, + migration_timeout + ), "Unsuccessful command: 'send-migration' or 'receive-migration'." ); }); @@ -10455,8 +10473,18 @@ mod live_migration { .unwrap(), ); + let downtime = 100000; + let migration_timeout = 1000; + assert!( - start_live_migration(&migration_socket, &src_api_socket, &dest_api_socket, local), + start_live_migration( + &migration_socket, + &src_api_socket, + &dest_api_socket, + local, + downtime, + migration_timeout + ), "Unsuccessful command: 'send-migration' or 'receive-migration'." ); }); @@ -10673,8 +10701,18 @@ mod live_migration { .unwrap(), ); + let downtime = 100000; + let migration_timeout = 1000; + assert!( - start_live_migration(&migration_socket, &src_api_socket, &dest_api_socket, local), + start_live_migration( + &migration_socket, + &src_api_socket, + &dest_api_socket, + local, + downtime, + migration_timeout + ), "Unsuccessful command: 'send-migration' or 'receive-migration'." ); }); @@ -10889,8 +10927,18 @@ mod live_migration { .unwrap(), ); + let downtime = 100000; + let migration_timeout = 1000; + assert!( - start_live_migration(&migration_socket, &src_api_socket, &dest_api_socket, local), + start_live_migration( + &migration_socket, + &src_api_socket, + &dest_api_socket, + local, + downtime, + migration_timeout + ), "Unsuccessful command: 'send-migration' or 'receive-migration'." ); }); @@ -10999,8 +11047,18 @@ mod live_migration { .unwrap(), ); + let downtime = 100000; + let migration_timeout = 1000; + assert!( - start_live_migration(&migration_socket, &src_api_socket, &dest_api_socket, local), + start_live_migration( + &migration_socket, + &src_api_socket, + &dest_api_socket, + local, + downtime, + migration_timeout + ), "Unsuccessful command: 'send-migration' or 'receive-migration'." ); }); @@ -11146,8 +11204,18 @@ mod live_migration { .unwrap(), ); + let downtime = 100000; + let migration_timeout = 1000; + assert!( - start_live_migration(&migration_socket, &src_api_socket, &dest_api_socket, true), + start_live_migration( + &migration_socket, + &src_api_socket, + &dest_api_socket, + true, + downtime, + migration_timeout + ), "Unsuccessful command: 'send-migration' or 'receive-migration'." 
); }); @@ -11202,7 +11270,12 @@ mod live_migration { .port() } - fn start_live_migration_tcp(src_api_socket: &str, dest_api_socket: &str) -> bool { + fn start_live_migration_tcp( + src_api_socket: &str, + dest_api_socket: &str, + downtime: u64, + timeout: u64, + ) -> bool { // Get an available TCP port let migration_port = get_available_port(); let host_ip = "127.0.0.1"; @@ -11229,6 +11302,10 @@ mod live_migration { &format!("--api-socket={src_api_socket}"), "send-migration", &format!("tcp:{host_ip}:{migration_port}"), + "--downtime", + &format!("{downtime}"), + "--migration-timeout", + &format!("{timeout}"), ]) .stdin(Stdio::null()) .stderr(Stdio::piped()) @@ -11299,6 +11376,8 @@ mod live_migration { .output() .expect("Expect creating disk image to succeed"); let pmem_path = String::from("/dev/pmem0"); + let downtime = 100000; + let timeout = 1000; // Start the source VM let src_vm_path = clh_command("cloud-hypervisor"); @@ -11361,7 +11440,7 @@ mod live_migration { } // Start TCP live migration assert!( - start_live_migration_tcp(&src_api_socket, &dest_api_socket), + start_live_migration_tcp(&src_api_socket, &dest_api_socket, downtime, timeout), "Unsuccessful command: 'send-migration' or 'receive-migration'." ); }); diff --git a/devices/src/ioapic.rs b/devices/src/ioapic.rs index ba05c1ed5b..9312ab1156 100644 --- a/devices/src/ioapic.rs +++ b/devices/src/ioapic.rs @@ -172,7 +172,7 @@ impl BusDevice for Ioapic { return None; } - debug!("IOAPIC_W @ offset 0x{offset:x}"); + trace!("IOAPIC_W @ offset 0x{offset:x}"); let value = LittleEndian::read_u32(data); @@ -250,7 +250,7 @@ impl Ioapic { } fn ioapic_write(&mut self, val: u32) { - debug!("IOAPIC_W reg 0x{:x}, val 0x{:x}", self.reg_sel, val); + trace!("IOAPIC_W reg 0x{:x}, val 0x{:x}", self.reg_sel, val); match self.reg_sel as u8 { IOAPIC_REG_VERSION => { diff --git a/docs/live_migration.md b/docs/live_migration.md index 94c9afc236..5c77d2625f 100644 --- a/docs/live_migration.md +++ b/docs/live_migration.md @@ -171,7 +171,13 @@ After a few seconds the VM should be up and you can interact with it. Initiate the Migration over TCP: ```console -src $ ch-remote --api-socket=/tmp/api send-migration tcp:{dst}:{port} +src $ ch-remote --api-socket=/tmp/api send-migration tcp:{dst}:{port} +``` + +With migration parameters: + +```console +src $ ch-remote --api-socket=/tmp/api send-migration tcp:{dst}:{port} --migration-timeout 60 --downtime 5000 ``` > Replace {dst}:{port} with the actual IP address and port of your destination host. @@ -180,3 +186,24 @@ After completing the above commands, the source VM will be migrated to the destination host and continue running there. The source VM instance will terminate normally. All ongoing processes and connections within the VM should remain intact after the migration. + +#### Migration Parameters + +Cloud Hypervisor supports additional parameters to control the +migration process: + +- `migration-timeout ` +Sets the maximum time (in seconds) allowed for the migration process. +If the migration takes longer than this timeout, it will be aborted. A +value of 0 means no timeout limit. +- `downtime ` +Sets the maximum acceptable downtime (in milliseconds) during the +migration. This parameter helps control the trade-off between migration +time and VM downtime. + +> The downtime limit is related to the cost of serialization +(deserialization) of vCPU and device state. Therefore, the expected +downtime is always shorter than the actual downtime. 
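To make the trade-off concrete, the sketch below shows the arithmetic typically behind a pre-copy
migration's downtime decision: the remaining dirty memory must fit into the configured downtime
budget at the estimated transfer bandwidth before the final stop-and-copy phase can begin. This is
an illustration only, not Cloud Hypervisor's actual heuristic; the function name and the bandwidth
estimate are assumptions.

```rust
// Illustrative sketch only — not Cloud Hypervisor's implementation.
// With a downtime budget (--downtime-ms) and an estimated bandwidth, the
// remaining dirty memory must fit into the budget before stop-and-copy.
fn can_enter_stop_and_copy(dirty_bytes: u64, bandwidth_bytes_per_s: u64, downtime_ms: u64) -> bool {
    // Bytes that can still be transferred while the guest is paused.
    let budget_bytes = bandwidth_bytes_per_s.saturating_mul(downtime_ms) / 1000;
    dirty_bytes <= budget_bytes
}

// Example: 256 MiB still dirty over a 10 GiB/s link with the default 300 ms
// budget gives roughly a 3 GiB budget, so the final phase may begin.
```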
+ +These parameters can be used with the `send-migration` command to +fine-tune the migration behavior according to your requirements. \ No newline at end of file diff --git a/hypervisor/src/cpu.rs b/hypervisor/src/cpu.rs index 4bc348a98d..d377c4a4bc 100644 --- a/hypervisor/src/cpu.rs +++ b/hypervisor/src/cpu.rs @@ -10,6 +10,9 @@ // // +#[cfg(feature = "kvm")] +use std::os::fd::RawFd; + use thiserror::Error; #[cfg(not(target_arch = "riscv64"))] use {anyhow::anyhow, vm_memory::GuestAddress}; @@ -603,4 +606,11 @@ pub trait Vcpu: Send + Sync { /// Trigger NMI interrupt /// fn nmi(&self) -> Result<()>; + /// Returns the underlying vCPU FD of KVM. + /// + /// # SAFETY + /// This is safe as we only use this to map the KVM_RUN structure for the + /// signal handler and only use it from there. + #[cfg(feature = "kvm")] + unsafe fn get_kvm_vcpu_raw_fd(&self) -> RawFd; } diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index 259009151e..394743d1c1 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -14,10 +14,8 @@ use std::any::Any; use std::collections::HashMap; #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] use std::mem::offset_of; -#[cfg(feature = "tdx")] -use std::os::unix::io::AsRawFd; -#[cfg(feature = "tdx")] -use std::os::unix::io::RawFd; +#[cfg(any(feature = "tdx", feature = "kvm"))] +use std::os::unix::io::{AsRawFd, RawFd}; use std::result; #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] use std::sync::Mutex; @@ -2018,7 +2016,11 @@ impl cpu::Vcpu for KvmVcpu { }, Err(ref e) => match e.errno() { - libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore), + libc::EINTR => { + self.fd.set_kvm_immediate_exit(0); + Ok(cpu::VmExit::Ignore) + } + libc::EAGAIN => Ok(cpu::VmExit::Ignore), _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!( "VCPU error {e:?}" ))), @@ -2707,6 +2709,11 @@ impl cpu::Vcpu for KvmVcpu { self.fd.set_kvm_immediate_exit(exit.into()); } + #[cfg(feature = "kvm")] + unsafe fn get_kvm_vcpu_raw_fd(&self) -> RawFd { + self.fd.as_raw_fd() + } + /// /// Returns the details about TDX exit reason /// diff --git a/hypervisor/src/mshv/mod.rs b/hypervisor/src/mshv/mod.rs index 40d8796e9a..650345b487 100644 --- a/hypervisor/src/mshv/mod.rs +++ b/hypervisor/src/mshv/mod.rs @@ -48,6 +48,8 @@ pub mod x86_64; // aarch64 dependencies #[cfg(target_arch = "aarch64")] pub mod aarch64; +#[cfg(feature = "kvm")] +use std::os::fd::RawFd; use std::os::unix::io::AsRawFd; #[cfg(target_arch = "aarch64")] use std::sync::Mutex; @@ -1524,6 +1526,11 @@ impl cpu::Vcpu for MshvVcpu { Ok(()) } + + #[cfg(feature = "kvm")] + unsafe fn get_kvm_vcpu_raw_fd(&self) -> RawFd { + unimplemented!() + } } impl MshvVcpu { diff --git a/net_util/src/open_tap.rs b/net_util/src/open_tap.rs index 39d4285df3..b2d82bc876 100644 --- a/net_util/src/open_tap.rs +++ b/net_util/src/open_tap.rs @@ -77,7 +77,14 @@ fn open_tap_rx_q_0( let tap = match if_name { Some(name) => Tap::open_named(name, num_rx_q, flags).map_err(Error::TapOpen)?, // Create a new Tap device in Linux, if none was specified. 
- None => Tap::new(num_rx_q).map_err(Error::TapOpen)?, + None => { + let tap = Tap::new(num_rx_q).map_err(Error::TapOpen)?; + log::info!( + "Created tap device: name={}, num_rx_q={num_rx_q}", + tap.if_name_as_str() + ); + tap + } }; // Don't overwrite ip configuration of existing interfaces: if tap_exists { diff --git a/net_util/src/tap.rs b/net_util/src/tap.rs index 36f0e2ba33..e761c7332e 100644 --- a/net_util/src/tap.rs +++ b/net_util/src/tap.rs @@ -11,6 +11,7 @@ use std::net::{IpAddr, Ipv6Addr}; use std::os::raw::*; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; +use log::debug; use thiserror::Error; use vmm_sys_util::ioctl::{ioctl_with_mut_ref, ioctl_with_ref, ioctl_with_val}; @@ -65,6 +66,16 @@ pub struct Tap { if_name: Vec, } +impl Drop for Tap { + fn drop(&mut self) { + debug!( + "Dropping Tap: if_name={}, FD={}", + self.if_name_as_str(), + self.tap_file.as_raw_fd() + ); + } +} + impl PartialEq for Tap { fn eq(&self, other: &Tap) -> bool { self.if_name == other.if_name @@ -129,6 +140,9 @@ fn ipv6_mask_to_prefix(mask: Ipv6Addr) -> Result { } impl Tap { + /// The default naming scheme for Tap devices that are created by Cloud Hypervisor. + pub const DEFAULT_NAME_SCHEME: &'static str = "vmtap%d"; + /// # Safety /// The caller should ensure to pass a valid file descriptor and valid /// arguments for the `ioctl()` syscall. @@ -183,6 +197,7 @@ impl Tap { if fd < 0 { return Err(Error::OpenTun(IoError::last_os_error())); } + debug!("Opening Tap device with given name: ifname={if_name}, fd={fd}"); // SAFETY: We just checked that the fd is valid. let tuntap = unsafe { File::from_raw_fd(fd) }; @@ -236,7 +251,7 @@ impl Tap { /// Create a new tap interface. pub fn new(num_queue_pairs: usize) -> Result { - Self::open_named("vmtap%d", num_queue_pairs, None) + Self::open_named(Self::DEFAULT_NAME_SCHEME, num_queue_pairs, None) } pub fn from_tap_fd(fd: RawFd, num_queue_pairs: usize) -> Result { diff --git a/option_parser/src/lib.rs b/option_parser/src/lib.rs index 1722da39f2..313423bd34 100644 --- a/option_parser/src/lib.rs +++ b/option_parser/src/lib.rs @@ -46,6 +46,8 @@ pub enum OptionParserError { Conversion(String /* field */, String /* value */), #[error("invalid value: {0}")] InvalidValue(String), + #[error("failed to convert {1}")] + NumberConversion(#[source] ParseIntError, String), } type OptionParserResult = std::result::Result; @@ -167,6 +169,41 @@ impl OptionParser { .is_some() } + /// Parses the `addr` option of PCI devices and returns the PCI device as well as the function ID + /// + /// Returns a tuple consisting of the parsed IDs for device and function in this order. Returns an error if the + /// supplied `addr` values cannot be parsed to [`u8`]. The tuple might consist of two times [`None`] if `addr` was + /// not provided. + pub fn get_pci_device_function( + &self, + ) -> OptionParserResult<(Option, Option)> { + if let Some(addr_str) = self.get("addr") { + let (device_str, function_str) = addr_str + .split_once('.') + .ok_or(OptionParserError::InvalidValue(addr_str.to_owned()))?; + + // We also accept hex number with `0x` prefix, but need to strip it before conversion in case it's present. 
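            // Editorial illustration (not part of this change): with the parsing in
            // this function, `addr=0A.0` and `addr=0x0a.0` both resolve to device 0x0a
            // and function 0 (i.e. bdf_device 10), `addr=0A.1` is rejected because only
            // function 0 (single-function devices) is supported, and `addr=1g.0` fails
            // with a NumberConversion error.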
+ let device_str = device_str.strip_prefix("0x").unwrap_or(device_str); + let device_id = u8::from_str_radix(device_str, 16) + .map_err(|e| OptionParserError::NumberConversion(e, addr_str.to_owned()))?; + + let function_str = function_str.strip_prefix("0x").unwrap_or(function_str); + let function_id = u8::from_str_radix(function_str, 16) + .map_err(|e| OptionParserError::NumberConversion(e, addr_str.to_owned()))?; + + // Currently CHV only support single-function devices. Those are mapped to function ID 0 in all cases, so we + // disallow the assignment of any other function ID. + if function_id != 0 { + return Err(OptionParserError::InvalidValue(format!( + "multi-function devices currently not supported; expected 0 got {function_id}", + ))); + } + Ok((Some(device_id), Some(function_id))) + } else { + Ok((None, None)) + } + } + pub fn convert(&self, option: &str) -> OptionParserResult> { match self.options.get(option).and_then(|v| v.value.as_ref()) { None => Ok(None), @@ -454,7 +491,8 @@ mod unit_tests { .add("hotplug_method") .add("hotplug_size") .add("topology") - .add("cmdline"); + .add("cmdline") + .add("addr"); assert_eq!(split_commas("\"\"").unwrap(), vec!["\"\""]); parser.parse("size=128M,hanging_param").unwrap_err(); @@ -506,6 +544,22 @@ mod unit_tests { ); parser.parse("cmdline=\"").unwrap_err(); parser.parse("cmdline=\"\"\"").unwrap_err(); + + parser.parse("addr=0A.0").unwrap(); + assert_eq!( + (Some(0xa_u8), Some(0)), + parser.get_pci_device_function().expect("should be valid") + ); + parser.parse("addr=0A.1").unwrap(); + assert!(matches!( + parser.get_pci_device_function(), + Err(OptionParserError::InvalidValue(_)) + )); + parser.parse("addr=1g.0").unwrap(); + assert!(matches!( + parser.get_pci_device_function(), + Err(OptionParserError::NumberConversion(_, _)) + )); } #[test] diff --git a/pci/src/bus.rs b/pci/src/bus.rs index eaae23a4d8..ae4235151c 100644 --- a/pci/src/bus.rs +++ b/pci/src/bus.rs @@ -46,7 +46,7 @@ pub enum PciRootError { #[error("Invalid PCI device identifier provided")] InvalidPciDeviceSlot(usize), /// Valid PCI device identifier but already used. - #[error("Valid PCI device identifier but already used")] + #[error("Valid PCI device identifier but already used: {0}")] AlreadyInUsePciDeviceSlot(usize), } pub type Result = std::result::Result; @@ -168,15 +168,42 @@ impl PciBus { Ok(()) } - pub fn next_device_id(&mut self) -> Result { - for (idx, device_id) in self.device_ids.iter_mut().enumerate() { - if !(*device_id) { - *device_id = true; - return Ok(idx as u32); + /// Allocates a PCI device ID on the bus. + /// + /// - `id`: ID to allocate on the bus. If [`None`], the next free + /// device ID on the bus is allocated, else the ID given is + /// allocated + /// + /// ## Errors + /// * Returns [`PciRootError::AlreadyInUsePciDeviceSlot`] in case + /// the ID requested is already allocated. + /// * Returns [`PciRootError::InvalidPciDeviceSlot`] in case the + /// requested ID exceeds the maximum number of devices allowed per + /// bus (see [`NUM_DEVICE_IDS`]). + /// * If `id` is [`None`]: Returns + /// [`PciRootError::NoPciDeviceSlotAvailable`] if no free device + /// slot is available on the bus. 
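    /// ## Example (editorial sketch, not part of the original change)
    ///
    /// ```ignore
    /// // Pin a device to slot 0x10, then let the bus pick the next free slot.
    /// let pinned = bus.allocate_device_id(Some(0x10))?;
    /// let auto = bus.allocate_device_id(None)?;
    /// assert_eq!(pinned, 0x10);
    /// ```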
+ pub fn allocate_device_id(&mut self, id: Option) -> Result { + if let Some(id) = id { + if (id as usize) < NUM_DEVICE_IDS { + if self.device_ids[id as usize] { + Err(PciRootError::AlreadyInUsePciDeviceSlot(id as usize)) + } else { + self.device_ids[id as usize] = true; + Ok(id as u32) + } + } else { + Err(PciRootError::InvalidPciDeviceSlot(id as usize)) + } + } else { + for (idx, device_id) in self.device_ids.iter_mut().enumerate() { + if !(*device_id) { + *device_id = true; + return Ok(idx as u32); + } } + Err(PciRootError::NoPciDeviceSlotAvailable) } - - Err(PciRootError::NoPciDeviceSlotAvailable) } pub fn get_device_id(&mut self, id: usize) -> Result<()> { @@ -486,3 +513,110 @@ fn parse_io_config_address(config_address: u32) -> (usize, usize, usize, usize) shift_and_mask(config_address, REGISTER_NUMBER_OFFSET, REGISTER_NUMBER_MASK), ) } + +#[cfg(test)] +mod unit_tests { + use std::error::Error; + use std::result::Result; + + use super::*; + + #[derive(Debug)] + struct MocRelocDevice; + + impl DeviceRelocation for MocRelocDevice { + fn move_bar( + &self, + _old_base: u64, + _new_base: u64, + _len: u64, + _pci_dev: &mut dyn PciDevice, + _region_type: PciBarRegionType, + ) -> Result<(), std::io::Error> { + Ok(()) + } + } + + fn setup_bus() -> PciBus { + let pci_root = PciRoot::new(None); + let moc_device_reloc = Arc::new(MocRelocDevice {}); + PciBus::new(pci_root, moc_device_reloc) + } + + #[test] + // Test to acquire all IDs that can be acquired + fn allocate_device_id_next_free() { + // The first address is occupied by the root + let mut bus = setup_bus(); + for expected_id in 1..NUM_DEVICE_IDS { + assert_eq!(expected_id as u32, bus.allocate_device_id(None).unwrap()); + } + } + + #[test] + // Test that requesting specific ID work + fn allocate_device_id_request_id() -> Result<(), Box> { + // The first address is occupied by the root + let mut bus = setup_bus(); + let max_id = (NUM_DEVICE_IDS - 1).try_into()?; + assert_eq!(0x01_u32, bus.allocate_device_id(Some(0x01))?); + assert_eq!(0x10_u32, bus.allocate_device_id(Some(0x10))?); + assert_eq!(max_id as u32, bus.allocate_device_id(Some(max_id))?); + Ok(()) + } + + #[test] + // Test that gaps resulting from explicit allocations are filled by implicit ones, + // beginning with the first free slot + fn allocate_device_id_fills_gaps() -> Result<(), Box> { + // The first address is occupied by the root + let mut bus = setup_bus(); + assert_eq!(0x01_u32, bus.allocate_device_id(Some(0x01))?); + assert_eq!(0x03_u32, bus.allocate_device_id(Some(0x03))?); + assert_eq!(0x06_u32, bus.allocate_device_id(Some(0x06))?); + assert_eq!(0x02_u32, bus.allocate_device_id(None)?); + assert_eq!(0x04_u32, bus.allocate_device_id(None)?); + assert_eq!(0x05_u32, bus.allocate_device_id(None)?); + assert_eq!(0x07_u32, bus.allocate_device_id(None)?); + Ok(()) + } + + #[test] + // Test that requesting the same ID twice fails + fn allocate_device_id_request_id_twice_fails() -> Result<(), Box> { + let mut bus = setup_bus(); + let max_id = (NUM_DEVICE_IDS - 1).try_into()?; + bus.allocate_device_id(Some(max_id))?; + let _result = bus.allocate_device_id(Some(max_id)); + assert!(matches!( + PciRootError::AlreadyInUsePciDeviceSlot(max_id.into()), + _result + )); + Ok(()) + } + + #[test] + // Test to request an invalid ID + fn allocate_device_id_request_invalid_id_fails() -> Result<(), Box> { + let mut bus = setup_bus(); + let max_id = (NUM_DEVICE_IDS + 1).try_into()?; + let _result = bus.allocate_device_id(Some(max_id)); + assert!(matches!( + 
PciRootError::InvalidPciDeviceSlot(max_id.into()), + _result + )); + Ok(()) + } + + #[test] + // Test to acquire an ID when all IDs were already acquired + fn allocate_device_id_none_left() { + // The first address is occupied by the root + let mut bus = setup_bus(); + for expected_id in 1..NUM_DEVICE_IDS { + assert_eq!(expected_id as u32, bus.allocate_device_id(None).unwrap()); + } + let _result = bus.allocate_device_id(None); + assert!(matches!(PciRootError::NoPciDeviceSlotAvailable, _result)); + } +} diff --git a/scripts/gitlint/rules/on-behalf-of-marker.py b/scripts/gitlint/rules/on-behalf-of-marker.py new file mode 100644 index 0000000000..d08e334b17 --- /dev/null +++ b/scripts/gitlint/rules/on-behalf-of-marker.py @@ -0,0 +1,36 @@ +from gitlint.rules import LineRule, RuleViolation, CommitMessageTitle, CommitRule + +class BodyContainsOnBehalfOfSAPMarker(CommitRule): + """Enforce that each commit coming from an SAP contractor contains an + "On-behalf-of SAP user@sap.com" marker. + """ + + # A rule MUST have a human friendly name + name = "body-requires-on-behalf-of-sap" + + # A rule MUST have a *unique* id + # We recommend starting with UC (for User-defined Commit-rule). + id = "UC-sap" + + # Lower-case list of contractors + contractors = [ + "@cyberus-technology.de" + ] + + # Marker followed by " name.surname@sap.com" + marker = "On-behalf-of: SAP" + + def validate(self, commit): + if "@sap.com" in commit.author_email.lower(): + return + + # Allow third-party open-source contributions + if not any(contractor in commit.author_email.lower() for contractor in self.contractors): + return + + for line in commit.message.body: + if line.startswith(self.marker) and "@sap.com" in line.lower(): + return + + msg = f"Body does not contain a '{self.marker} user@sap.com' line" + return [RuleViolation(self.id, msg, line_nr=1)] diff --git a/virtio-devices/src/net.rs b/virtio-devices/src/net.rs index 8eee661341..9b52786ca4 100644 --- a/virtio-devices/src/net.rs +++ b/virtio-devices/src/net.rs @@ -16,7 +16,7 @@ use std::{result, thread}; use anyhow::anyhow; use event_monitor::event; -use log::{debug, error, info}; +use log::{debug, error, info, trace}; #[cfg(not(fuzzing))] use net_util::virtio_features_to_tap_offload; use net_util::{ @@ -257,9 +257,9 @@ impl NetEpollHandler { || !self.driver_awake { self.signal_used_queue(self.queue_index_base)?; - debug!("Signalling RX queue"); + trace!("Signalling RX queue"); } else { - debug!("Not signalling RX queue"); + trace!("Not signalling RX queue"); } Ok(()) } @@ -610,11 +610,12 @@ impl Net { for fd in fds.iter() { // Duplicate so that it can survive reboots // SAFETY: FFI call to dup. Trivially safe. - let fd = unsafe { libc::dup(*fd) }; - if fd < 0 { + let fd_duped = unsafe { libc::dup(*fd) }; + if fd_duped < 0 { return Err(Error::DuplicateTapFd(std::io::Error::last_os_error())); } - let tap = Tap::from_tap_fd(fd, num_queue_pairs).map_err(Error::TapError)?; + debug!("dup'ed fd {fd} => {fd_duped} for virtio-net device {id}"); + let tap = Tap::from_tap_fd(fd_duped, num_queue_pairs).map_err(Error::TapError)?; taps.push(tap); } @@ -658,6 +659,19 @@ impl Net { impl Drop for Net { fn drop(&mut self) { + // Get a comma-separated list of the interface names of the tap devices + // associated with this network device. 
+ let ifnames_str = self + .taps + .iter() + .map(|tap| tap.if_name_as_str()) + .collect::>(); + let ifnames_str = ifnames_str.join(","); + debug!( + "virtio-net device closed: id={}, ifnames=[{ifnames_str}]", + self.id + ); + if let Some(kill_evt) = self.common.kill_evt.take() { // Ignore the result because there is nothing we can do about it. let _ = kill_evt.write(1); diff --git a/virtio-devices/src/transport/pci_common_config.rs b/virtio-devices/src/transport/pci_common_config.rs index 5a7b5f57a4..07f5b4fc0f 100644 --- a/virtio-devices/src/transport/pci_common_config.rs +++ b/virtio-devices/src/transport/pci_common_config.rs @@ -10,7 +10,7 @@ use std::sync::atomic::{AtomicU16, Ordering}; use std::sync::{Arc, Mutex}; use byteorder::{ByteOrder, LittleEndian}; -use log::{debug, error, warn}; +use log::{debug, error, trace, warn}; use serde::{Deserialize, Serialize}; use virtio_queue::{Queue, QueueT}; use vm_migration::{MigratableError, Pausable, Snapshot, Snapshottable}; @@ -243,7 +243,7 @@ impl VirtioPciCommonConfig { } fn read_common_config_word(&self, offset: u64, queues: &[Queue]) -> u16 { - debug!("read_common_config_word: offset 0x{offset:x}"); + trace!("read_common_config_word: offset 0x{offset:x}"); match offset { 0x10 => self.msix_config.load(Ordering::Acquire), 0x12 => queues.len() as u16, // num_queues diff --git a/vm-migration/Cargo.toml b/vm-migration/Cargo.toml index b17475065c..2053afc472 100644 --- a/vm-migration/Cargo.toml +++ b/vm-migration/Cargo.toml @@ -7,6 +7,7 @@ version = "0.1.0" [dependencies] anyhow = { workspace = true } itertools = { workspace = true } +rustls = { workspace = true } serde = { workspace = true, features = ["derive", "rc"] } serde_json = { workspace = true } thiserror = { workspace = true } diff --git a/vm-migration/src/lib.rs b/vm-migration/src/lib.rs index 921ae5b3db..7ae78eaf24 100644 --- a/vm-migration/src/lib.rs +++ b/vm-migration/src/lib.rs @@ -11,6 +11,7 @@ use crate::protocol::MemoryRangeTable; mod bitpos_iterator; pub mod protocol; +pub mod tls; #[derive(Error, Debug)] pub enum MigratableError { @@ -52,6 +53,9 @@ pub enum MigratableError { #[error("Failed to release a disk lock")] UnlockError(#[source] anyhow::Error), + + #[error("TLS error")] + Tls(#[from] tls::TlsError), } /// A Pausable component can be paused and resumed. diff --git a/vm-migration/src/protocol.rs b/vm-migration/src/protocol.rs index 3ae226ece2..5eac87cb8a 100644 --- a/vm-migration/src/protocol.rs +++ b/vm-migration/src/protocol.rs @@ -266,18 +266,95 @@ impl Response { } #[repr(C)] -#[derive(Clone, Default, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] pub struct MemoryRange { pub gpa: u64, pub length: u64, } -#[derive(Clone, Default, Serialize, Deserialize)] +#[derive(Debug, Clone, Default, Serialize, Deserialize)] pub struct MemoryRangeTable { data: Vec, } +#[derive(Debug, Clone, Default)] +struct MemoryRangeTableIterator { + chunk_size: u64, + data: Vec, +} + +impl MemoryRangeTableIterator { + pub fn new(table: &MemoryRangeTable, chunk_size: u64) -> Self { + MemoryRangeTableIterator { + chunk_size, + data: table.data.clone(), + } + } +} + +impl Iterator for MemoryRangeTableIterator { + type Item = MemoryRangeTable; + + /// Return the next memory range in the table, making sure that + /// the returned range is not larger than `chunk_size`. + /// + /// **Note**: Do not rely on the order of the ranges returned by this + /// iterator. This allows for a more efficient implementation. 
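    /// ## Example (editorial sketch, not part of the original change)
    ///
    /// This iterator is normally consumed through [`MemoryRangeTable::partition`],
    /// defined below; chunk order must not be relied upon.
    ///
    /// ```ignore
    /// // Split the dirty-memory table into chunks of at most 4 MiB and walk
    /// // every range, regardless of the order in which chunks are produced.
    /// for chunk in table.partition(4 << 20) {
    ///     for range in chunk.ranges() {
    ///         // transfer guest memory [range.gpa, range.gpa + range.length)
    ///     }
    /// }
    /// ```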
+ fn next(&mut self) -> Option { + let mut ranges: Vec = vec![]; + let mut ranges_size: u64 = 0; + + loop { + assert!(ranges_size <= self.chunk_size); + + if ranges_size == self.chunk_size || self.data.is_empty() { + break; + } + + if let Some(range) = self.data.pop() { + let next_range: MemoryRange = if ranges_size + range.length > self.chunk_size { + // How many bytes we need to put back into the table. + let leftover_bytes = ranges_size + range.length - self.chunk_size; + assert!(leftover_bytes <= range.length); + let returned_bytes = range.length - leftover_bytes; + assert!(returned_bytes <= range.length); + assert_eq!(leftover_bytes + returned_bytes, range.length); + + self.data.push(MemoryRange { + gpa: range.gpa + returned_bytes, + length: leftover_bytes, + }); + MemoryRange { + gpa: range.gpa, + length: returned_bytes, + } + } else { + range + }; + + ranges_size += next_range.length; + ranges.push(next_range); + } + } + + if ranges.is_empty() { + None + } else { + Some(MemoryRangeTable { data: ranges }) + } + } +} + impl MemoryRangeTable { + pub fn ranges(&self) -> &[MemoryRange] { + &self.data + } + + /// Partitions the table into chunks of at most `chunk_size` bytes. + pub fn partition(&self, chunk_size: u64) -> impl Iterator { + MemoryRangeTableIterator::new(self, chunk_size) + } + /// Converts an iterator over a dirty bitmap into an iterator of dirty /// [`MemoryRange`]s, merging consecutive dirty pages into contiguous ranges. /// @@ -408,4 +485,107 @@ mod unit_tests { ] ); } + + #[test] + fn test_memory_range_table_partition() { + // We start the test similar as the one above, but with a input that is simpler to parse for + // developers. + let input = [0b11_0011_0011_0011]; + + let start_gpa = 0x1000; + let page_size = 0x1000; + + let table = MemoryRangeTable::from_dirty_bitmap(input, start_gpa, page_size); + let expected_regions = [ + MemoryRange { + gpa: start_gpa, + length: page_size * 2, + }, + MemoryRange { + gpa: start_gpa + 4 * page_size, + length: page_size * 2, + }, + MemoryRange { + gpa: start_gpa + 8 * page_size, + length: page_size * 2, + }, + MemoryRange { + gpa: start_gpa + 12 * page_size, + length: page_size * 2, + }, + ]; + assert_eq!(table.regions(), &expected_regions); + + // In the first test, we expect to see the exact same result as above, as we use the length + // of every region (which is fixed!). + { + let chunks = table + .partition(page_size * 2) + .map(|table| table.data) + .collect::>(); + + // The implementation currently returns the ranges in reverse order. + // For better testability, we reverse it. + let chunks = chunks + .into_iter() + .map(|vec| vec.into_iter().rev().collect::>()) + .rev() + .collect::>(); + + assert_eq!( + chunks, + &[ + [expected_regions[0].clone()].to_vec(), + [expected_regions[1].clone()].to_vec(), + [expected_regions[2].clone()].to_vec(), + [expected_regions[3].clone()].to_vec(), + ] + ); + } + + // Next, we have a more sophisticated test with a chunk size of 5 pages. + { + let chunks = table + .partition(page_size * 5) + .map(|table| table.data) + .collect::>(); + + // The implementation currently returns the ranges in reverse order. + // For better testability, we reverse it. 
+ let chunks = chunks + .into_iter() + .map(|vec| vec.into_iter().rev().collect::>()) + .collect::>(); + + assert_eq!( + chunks, + &[ + vec![ + MemoryRange { + gpa: start_gpa + 4 * page_size, + length: page_size + }, + MemoryRange { + gpa: start_gpa + 8 * page_size, + length: 2 * page_size + }, + MemoryRange { + gpa: start_gpa + 12 * page_size, + length: 2 * page_size + } + ], + vec![ + MemoryRange { + gpa: start_gpa, + length: 2 * page_size + }, + MemoryRange { + gpa: start_gpa + 5 * page_size, + length: page_size + } + ] + ] + ); + } + } } diff --git a/vm-migration/src/tls.rs b/vm-migration/src/tls.rs new file mode 100644 index 0000000000..b3e1579475 --- /dev/null +++ b/vm-migration/src/tls.rs @@ -0,0 +1,265 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// +use std::io::{self, Read, Write}; +use std::net::TcpStream; +use std::os::fd::{AsFd, BorrowedFd}; +use std::path::Path; +use std::sync::Arc; + +use rustls::pki_types::pem::PemObject; +use rustls::pki_types::{CertificateDer, InvalidDnsNameError, PrivateKeyDer, ServerName}; +use rustls::{ + ClientConfig, ClientConnection, RootCertStore, ServerConfig, ServerConnection, StreamOwned, +}; +use thiserror::Error; +use vm_memory::bitmap::BitmapSlice; +use vm_memory::io::{ReadVolatile, WriteVolatile}; +use vm_memory::{VolatileMemoryError, VolatileSlice}; + +use crate::MigratableError; + +#[derive(Error, Debug)] +pub enum TlsError { + #[error( + "The provided input could not be parsed because it is not a syntactically-valid DNS Name." + )] + InvalidDnsName(#[source] InvalidDnsNameError), + + #[error("Rustls protocol error")] + RustlsError(#[from] rustls::Error), + + #[error("Rustls protocol IO error")] + RustlsIoError(#[from] std::io::Error), + + #[error("Error during TLS handshake: {0}")] + HandshakeError(String), + + #[error("Error handling PEM file")] + RustlsPemError(#[from] rustls::pki_types::pem::Error), +} + +// This TlsStream will be later encapsulated in a SocketStream. Thus it has to +// implement the same traits. It is important that we never directly read from +// or write to the TcpStream encapsulated in StreamOwned. +#[derive(Debug)] +pub enum TlsStream { + Client(StreamOwned), + Server(StreamOwned), +} + +// The TLS-Stream objects cannot read or write volatile, thus we need a buffer +// between the VolatileSlice and the TLS stream (see ReadVolatile and +// WriteVolatile implementations below). Allocating this buffer in these +// function calls would make it very slow, thus we tie the buffer to the stream +// with this wrapper. 
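// Editorial sketch of the resulting data path (orientation only, not part of
// the original change):
//
//   guest memory (VolatileSlice) <-copy-> self.buf <-Read/Write-> rustls StreamOwned <-> TcpStream
//
// The ReadVolatile/WriteVolatile implementations below therefore move at most
// MAX_CHUNK bytes per call through the intermediate buffer.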
+pub struct TlsStreamWrapper { + stream: TlsStream, + // Used only in ReadVolatile and WriteVolatile + buf: Vec, +} + +static MAX_CHUNK: usize = 1024 * 64; + +impl TlsStreamWrapper { + pub fn new(stream: TlsStream) -> Self { + Self { + stream, + buf: Vec::new(), + } + } +} + +impl Read for TlsStream { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + match self { + TlsStream::Client(s) => s.read(buf), + TlsStream::Server(s) => s.read(buf), + } + } +} + +impl Read for TlsStreamWrapper { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + Read::read(&mut self.stream, buf) + } +} + +impl Write for TlsStream { + fn write(&mut self, buf: &[u8]) -> io::Result { + match self { + TlsStream::Client(s) => s.write(buf), + TlsStream::Server(s) => s.write(buf), + } + } + fn flush(&mut self) -> io::Result<()> { + match self { + TlsStream::Client(s) => s.flush(), + TlsStream::Server(s) => s.flush(), + } + } +} + +impl Write for TlsStreamWrapper { + fn write(&mut self, buf: &[u8]) -> io::Result { + Write::write(&mut self.stream, buf) + } + fn flush(&mut self) -> io::Result<()> { + Write::flush(&mut self.stream) + } +} + +// Reading from or writing to these FDs would break the connection, because +// those reads or writes wouldn't go through rustls. But the FD is used to wait +// until it becomes readable. +impl AsFd for TlsStreamWrapper { + fn as_fd(&self) -> BorrowedFd<'_> { + match &self.stream { + TlsStream::Client(s) => s.get_ref().as_fd(), + TlsStream::Server(s) => s.get_ref().as_fd(), + } + } +} + +impl ReadVolatile for TlsStreamWrapper { + fn read_volatile( + &mut self, + vs: &mut VolatileSlice, + ) -> std::result::Result { + let len = vs.len().min(MAX_CHUNK); + + if len == 0 { + return Ok(0); + } + + if self.buf.len() < len { + self.buf.resize(len, 0); + } + + let buf = &mut self.buf[..len]; + let n = + Read::read(&mut self.stream, &mut buf[..len]).map_err(VolatileMemoryError::IOError)?; + + if n == 0 { + return Ok(0); + } + + vs.copy_from(&buf[..n]); + self.buf.clear(); + + Ok(n) + } +} + +impl WriteVolatile for TlsStreamWrapper { + fn write_volatile( + &mut self, + vs: &VolatileSlice, + ) -> std::result::Result { + let len = vs.len().min(MAX_CHUNK); + if len == 0 { + return Ok(0); + } + + if self.buf.len() < len { + self.buf.resize(len, 0); + } + + let buf = &mut self.buf[..len]; + let n = vs.copy_to(&mut buf[..len]); + + if n == 0 { + return Ok(0); + } + + let n = Write::write(&mut self.stream, &buf[..n]).map_err(VolatileMemoryError::IOError)?; + self.buf.clear(); + + Ok(n) + } +} + +// A small wrapper to be put into ReceiveListener::Tls. It carries the +// TLS-Config and creates a TlsStream after the TcpConnection accepted a +// connection. +#[derive(Debug, Clone)] +pub struct TlsConnectionWrapper { + config: Arc, +} + +impl TlsConnectionWrapper { + pub fn new(cert_dir: &Path) -> Result { + let certs = CertificateDer::pem_file_iter(cert_dir.join("server-cert.pem")) + .map_err(TlsError::RustlsPemError)? 
+ .map(|cert| cert.map_err(TlsError::RustlsPemError)) + .collect::>, TlsError>>()?; + let key = PrivateKeyDer::from_pem_file(cert_dir.join("server-key.pem")) + .map_err(TlsError::RustlsPemError)?; + let config = ServerConfig::builder() + .with_no_client_auth() + .with_single_cert(certs, key) + .map_err(TlsError::RustlsError)?; + let config = Arc::new(config); + Ok(Self { config }) + } + + pub fn wrap( + &self, + socket: TcpStream, + ) -> std::result::Result { + let conn = ServerConnection::new(self.config.clone()).map_err(TlsError::RustlsError)?; + + let mut tls = StreamOwned::new(conn, socket); + while tls.conn.is_handshaking() { + let (rd, wr) = tls + .conn + .complete_io(&mut tls.sock) + .map_err(TlsError::RustlsIoError)?; + if rd == 0 && wr == 0 { + Err(TlsError::HandshakeError( + "EOF during TLS handshake".to_string(), + ))?; + } + } + + Ok(TlsStreamWrapper::new(TlsStream::Server(tls))) + } +} + +pub fn client_stream( + socket: TcpStream, + cert_dir: &Path, + hostname: &str, +) -> std::result::Result, MigratableError> { + let mut root_store = RootCertStore::empty(); + root_store.add_parsable_certificates( + CertificateDer::pem_file_iter(cert_dir.join("ca-cert.pem")) + .map_err(TlsError::RustlsPemError)? + .map(|cert| cert.map_err(TlsError::RustlsPemError)) + .collect::>, TlsError>>()?, + ); + let config = ClientConfig::builder() + .with_root_certificates(root_store) + .with_no_client_auth(); + let config = Arc::new(config); + let server_name = + ServerName::try_from(hostname.to_string()).map_err(TlsError::InvalidDnsName)?; + let conn = ClientConnection::new(config.clone(), server_name.clone()) + .map_err(TlsError::RustlsError)?; + + let mut tls = StreamOwned::new(conn, socket); + while tls.conn.is_handshaking() { + let (rd, wr) = tls + .conn + .complete_io(&mut tls.sock) + .map_err(TlsError::RustlsIoError)?; + if rd == 0 && wr == 0 { + Err(TlsError::HandshakeError( + "EOF during TLS handshake".to_string(), + ))?; + } + } + + Ok(tls) +} diff --git a/vmm/Cargo.toml b/vmm/Cargo.toml index a2fadfbd82..85fed684bd 100644 --- a/vmm/Cargo.toml +++ b/vmm/Cargo.toml @@ -54,10 +54,13 @@ hex = { version = "0.4.3", optional = true } hypervisor = { path = "../hypervisor" } igvm = { workspace = true, optional = true } igvm_defs = { workspace = true, optional = true } +kvm-bindings = { workspace = true } landlock = "0.4.4" libc = { workspace = true } linux-loader = { workspace = true, features = ["bzimage", "elf", "pe"] } log = { workspace = true } +# Special fork of micro_http that combines HTTP traffic over a UNIX domain +# socket with UNIX' SCM_RIGHTS mechanism for transferring file descriptors. micro_http = { git = "https://github.com/firecracker-microvm/micro-http", branch = "main" } mshv-bindings = { workspace = true, features = [ "fam-wrappers", diff --git a/vmm/src/api/http/http_endpoint.rs b/vmm/src/api/http/http_endpoint.rs index e463a20819..7a2070f28e 100644 --- a/vmm/src/api/http/http_endpoint.rs +++ b/vmm/src/api/http/http_endpoint.rs @@ -6,11 +6,11 @@ //! # HTTP Endpoints of the Cloud Hypervisor API //! -//! ## Special Handling for Devices Backed by Network File Descriptors (FDs) (e.g., virtio-net) +//! ## Special Handling for Externally Provided File Descriptors (FDs) (e.g., virtio-net) //! //! Some of the HTTP handlers here implement special logic for devices -//! **backed by network FDs** to enable live-migration, state save/resume -//! (restore), and similar VM lifecycle events. +//! **backed by externally opened FDs** to enable live-migration, +//! 
state save/resume (restore), and similar VM lifecycle events. //! //! The utilized mechanism requires that the control software (e.g., libvirt) //! connects to Cloud Hypervisor by using a UNIX domain socket and that it @@ -35,11 +35,23 @@ //! [special HTTP library]: https://github.com/firecracker-microvm/micro-http use std::fs::File; -use std::sync::mpsc::Sender; +use std::sync::mpsc::{Receiver, Sender, SyncSender}; +use std::sync::{LazyLock, Mutex}; +use log::info; use micro_http::{Body, Method, Request, Response, StatusCode, Version}; use vmm_sys_util::eventfd::EventFd; +/// Helper to make the VmSendMigration call blocking as long as a migration is ongoing. +#[allow(clippy::type_complexity)] +pub static ONGOING_LIVEMIGRATION: LazyLock<( + SyncSender>, + Mutex>>, +)> = LazyLock::new(|| { + let (sender, receiver) = std::sync::mpsc::sync_channel(0); + (sender, Mutex::new(receiver)) +}); + #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::api::VmCoredump; use crate::api::http::http_endpoint::fds_helper::{attach_fds_to_cfg, attach_fds_to_cfgs}; @@ -47,8 +59,8 @@ use crate::api::http::{EndpointHandler, HttpError, error_response}; use crate::api::{ AddDisk, ApiAction, ApiError, ApiRequest, NetConfig, VmAddDevice, VmAddFs, VmAddNet, VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, VmConfig, VmCounters, VmDelete, VmNmi, VmPause, - VmPowerButton, VmReboot, VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeDisk, - VmResizeZone, VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, + VmPowerButton, VmReboot, VmReceiveMigration, VmReceiveMigrationData, VmRemoveDevice, VmResize, + VmResizeDisk, VmResizeZone, VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, }; use crate::config::RestoreConfig; use crate::cpu::Error as CpuError; @@ -427,13 +439,11 @@ vm_action_put_handler_body!(VmRemoveDevice); vm_action_put_handler_body!(VmResizeDisk); vm_action_put_handler_body!(VmResizeZone); vm_action_put_handler_body!(VmSnapshot); -vm_action_put_handler_body!(VmReceiveMigration); -vm_action_put_handler_body!(VmSendMigration); #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] vm_action_put_handler_body!(VmCoredump); -// Special handling for virtio-net devices backed by network FDs. +// Special handling for externally provided FDs. // See module description for more info. impl PutHandler for VmAddNet { fn handle_request( @@ -457,6 +467,74 @@ impl PutHandler for VmAddNet { impl GetHandler for VmAddNet {} +// Special handling for externally provided FDs. +// See module description for more info. +impl PutHandler for VmReceiveMigration { + fn handle_request( + &'static self, + api_notifier: EventFd, + api_sender: Sender, + body: &Option, + files: Vec, + ) -> std::result::Result, HttpError> { + if let Some(body) = body { + let mut net_cfg: VmReceiveMigrationData = serde_json::from_slice(body.raw())?; + if let Some(cfgs) = &mut net_cfg.net_fds { + let mut cfgs = cfgs.iter_mut().collect::>(); + let cfgs = cfgs.as_mut_slice(); + attach_fds_to_cfgs(files, cfgs)?; + } + + self.send(api_notifier, api_sender, net_cfg) + .map_err(HttpError::ApiError) + } else { + Err(HttpError::BadRequest) + } + } +} + +impl GetHandler for VmReceiveMigration {} + +// Special Handling for virtio-net Devices Backed by Network File Descriptors +// +// See above. 
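// Editor's sketch, not part of this patch: the handler below blocks on the
// receiver half of ONGOING_LIVEMIGRATION, a rendezvous channel created with
// sync_channel(0), so the HTTP response is only written once the migration
// has actually finished. The VMM side (outside this hunk) is assumed to hand
// the final result over roughly like this; the function name and the exact
// error type are illustrative only.
fn report_migration_result(result: Result<(), MigratableError>) {
    let (sender, _receiver) = &*ONGOING_LIVEMIGRATION;
    // send() on a zero-capacity channel blocks until the worker thread that
    // serves VmSendMigration calls recv() below.
    sender
        .send(result)
        .expect("VmSendMigration handler dropped the receiver");
}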
+impl PutHandler for VmSendMigration { + fn handle_request( + &'static self, + api_notifier: EventFd, + api_sender: Sender, + body: &Option, + _files: Vec, + ) -> std::result::Result, HttpError> { + if let Some(body) = body { + let res = self + .send( + api_notifier, + api_sender, + serde_json::from_slice(body.raw())?, + ) + .map_err(HttpError::ApiError)?; + + info!("live migration started"); + + let (_, receiver) = &*ONGOING_LIVEMIGRATION; + + info!("waiting for live migration result"); + let mig_res = receiver.lock().unwrap().recv().unwrap(); + info!("received live migration result"); + + // We forward the migration error here to the guest + mig_res + .map(|_| res) + .map_err(|e| HttpError::ApiError(ApiError::VmSendMigration(e))) + } else { + Err(HttpError::BadRequest) + } + } +} + +impl GetHandler for VmSendMigration {} + impl PutHandler for VmResize { fn handle_request( &'static self, @@ -485,7 +563,7 @@ impl PutHandler for VmResize { impl GetHandler for VmResize {} -// Special handling for virtio-net devices backed by network FDs. +// Special handling for externally provided FDs. // See module description for more info. impl PutHandler for VmRestore { fn handle_request( diff --git a/vmm/src/api/http/mod.rs b/vmm/src/api/http/mod.rs index 2aa52e8e37..a3e1bd3f76 100644 --- a/vmm/src/api/http/mod.rs +++ b/vmm/src/api/http/mod.rs @@ -6,22 +6,25 @@ use std::collections::BTreeMap; use std::error::Error; use std::fs::File; +use std::os::fd::AsRawFd; use std::os::unix::io::{IntoRawFd, RawFd}; use std::os::unix::net::UnixListener; use std::panic::AssertUnwindSafe; use std::path::PathBuf; -use std::sync::LazyLock; -use std::sync::mpsc::Sender; +use std::sync::mpsc::{Receiver, Sender, channel, sync_channel}; +use std::sync::{Arc, LazyLock, Mutex}; use std::thread; use hypervisor::HypervisorType; -use log::error; +use log::{debug, error}; use micro_http::{ - Body, HttpServer, MediaType, Method, Request, Response, ServerError, StatusCode, Version, + Body, HttpServer, MediaType, Method, Request, Response, ServerError, ServerRequest, + ServerResponse, StatusCode, Version, }; use seccompiler::{SeccompAction, apply_filter}; use serde_json::Error as SerdeError; use thiserror::Error; +use vmm_sys_util::epoll::{ControlOperation, Epoll, EpollEvent, EventSet}; use vmm_sys_util::eventfd::EventFd; use self::http_endpoint::{VmActionHandler, VmCreate, VmInfo, VmmPing, VmmShutdown}; @@ -316,10 +319,152 @@ fn handle_http_request( response } +/// Keeps track of the worker threads, and the resources needed to interact +/// with them. +#[derive(Debug)] +struct HttpWorkerThreads { + // The worker threads themselves. + threads: Vec>>, + // An MPSC channel to send server requests to the workers. We put it into + // an option so we can easily drop it in the destructor. + request_tx: Option>, + // An MPSC channel that the workers use to send responses to the HTTP + // server thread. + response_rx: Receiver, + // Workers signal this eventfd when they have a response for the HTTP + // server thread. 
+ response_event: EventFd, +} + +impl HttpWorkerThreads { + fn new( + thread_count: usize, + api_notifier: &EventFd, + api_sender: &Sender, + seccomp_action: &SeccompAction, + hypervisor_type: HypervisorType, + landlock_enable: bool, + exit_evt: &EventFd, + ) -> Result { + let response_event = EventFd::new(libc::EFD_NONBLOCK).map_err(VmmError::EventFdCreate)?; + let (response_tx, response_rx) = sync_channel::(thread_count); + + let mut threads = Vec::new(); + let (request_tx, request_rx) = channel::(); + + let request_rx = Arc::new(Mutex::new(request_rx)); + + // We use the same seccomp filter that we already use for the HTTP server thread. + let api_seccomp_filter = + get_seccomp_filter(seccomp_action, Thread::HttpApi, hypervisor_type) + .map_err(VmmError::CreateSeccompFilter)?; + + for n in 0..thread_count { + let response_event = response_event.try_clone().map_err(VmmError::EventFdClone)?; + + let response_tx = response_tx.clone(); + let request_rx = request_rx.clone(); + + let api_notifier = api_notifier.try_clone().map_err(VmmError::EventFdClone)?; + let api_sender = api_sender.clone(); + + let api_seccomp_filter = api_seccomp_filter.clone(); + let exit_evt = exit_evt.try_clone().map_err(VmmError::EventFdClone)?; + + let thread = thread::Builder::new() + .name(format!("http-worker-{n}").to_string()) + .spawn(move || { + debug!("Spawned HTTP worker thread with id {n}",); + if !api_seccomp_filter.is_empty() { + apply_filter(&api_seccomp_filter) + .map_err(VmmError::ApplySeccompFilter) + .map_err(|e| { + error!("Error applying seccomp filter: {e:?}"); + exit_evt.write(1).ok(); + e + })?; + } + + if landlock_enable { + Landlock::new() + .map_err(VmmError::CreateLandlock)? + .restrict_self() + .map_err(VmmError::ApplyLandlock) + .map_err(|e| { + error!("Error applying landlock to http-worker thread: {e:?}"); + exit_evt.write(1).ok(); + e + })?; + } + + std::panic::catch_unwind(AssertUnwindSafe(move || { + let id = n; + loop { + let request = request_rx.lock().unwrap().recv(); + match request { + Ok(msg) => { + // Process the server request + let response = msg.process(|request| { + handle_http_request(request, &api_notifier, &api_sender) + }); + + // Send the response to the HTTP server thread together with this + // threads id. + if let Err(e) = response_tx.send(response) { + error!( + "HTTP worker thread {id}: error sending response {e}" + ); + break; + } + + // Notify the HTTP server thread. + response_event.write(1).ok(); + } + Err(e) => { + error!("HTTP worker thread {id}: error receiving request {e}"); + break; + } + } + } + })) + .map_err(|_| { + error!("http-worker thread {n} panicked"); + exit_evt.write(1).ok() + }) + .ok(); + + Ok(()) + }) + .map_err(VmmError::HttpThreadSpawn)?; + + threads.push(thread); + } + + Ok(Self { + threads, + request_tx: Some(request_tx), + response_rx, + response_event, + }) + } +} + +impl Drop for HttpWorkerThreads { + fn drop(&mut self) { + // Dropping the Sender side of the request channels to throw the worker + // threads out of their loops. + drop(self.request_tx.take()); + // Now we can join each thread. + self.threads + .drain(..) 
+ .for_each(|thread| thread.join().unwrap().unwrap()); + } +} + fn start_http_thread( mut server: HttpServer, - api_notifier: EventFd, - api_sender: Sender, + api_notifier: &EventFd, + api_sender: &Sender, seccomp_action: &SeccompAction, exit_evt: EventFd, hypervisor_type: HypervisorType, @@ -336,6 +481,42 @@ fn start_http_thread( .add_kill_switch(api_shutdown_fd_clone) .map_err(VmmError::CreateApiServer)?; + // We use the epoll mechanism to parallelize this. The epoll tokens are + // attached when registering the FDs with epoll. That way we can later + // check why we were notified. + const HTTP_EPOLL_TOKEN: u64 = 1; + const WORKER_EPOLL_TOKEN: u64 = 2; + + // The epoll instance our HTTP server thread will wait on. + let outer_epoll = Epoll::new().unwrap(); + let worker_threads = HttpWorkerThreads::new( + 2, + api_notifier, + api_sender, + seccomp_action, + hypervisor_type, + landlock_enable, + &exit_evt, + )?; + + // Register the fd that the worker threads will signal. + outer_epoll + .ctl( + ControlOperation::Add, + worker_threads.response_event.as_raw_fd(), + EpollEvent::new(EventSet::IN, WORKER_EPOLL_TOKEN), + ) + .unwrap(); + + // Register the HttpServer's fd. + outer_epoll + .ctl( + ControlOperation::Add, + server.epoll().as_raw_fd(), + EpollEvent::new(EventSet::IN, HTTP_EPOLL_TOKEN), + ) + .unwrap(); + let thread = thread::Builder::new() .name("http-server".to_string()) .spawn(move || { @@ -363,24 +544,42 @@ fn start_http_thread( } std::panic::catch_unwind(AssertUnwindSafe(move || { + let mut events = vec![EpollEvent::default(); 32]; server.start_server().unwrap(); + loop { - match server.requests() { - Ok(request_vec) => { - for server_request in request_vec { - if let Err(e) = server.respond(server_request.process(|request| { - handle_http_request(request, &api_notifier, &api_sender) - })) { + let n = outer_epoll.wait(-1, &mut events).unwrap(); + for ev in events.iter().take(n) { + match ev.data() { + HTTP_EPOLL_TOKEN => { + // The HttpServer got a request, handle that. + match server.requests() { + Ok(request_vec) => { + for server_request in request_vec { + worker_threads.request_tx.as_ref().unwrap().send(server_request).unwrap(); + } + } + Err(ServerError::ShutdownEvent) => { + server.flush_outgoing_writes(); + return; + } + Err(e) => { + error!( + "HTTP server error on retrieving incoming request. Error: {e}" + ); + } + } + } + WORKER_EPOLL_TOKEN => { + // One of the worker threads has a response. + // We clear the eventfd first. + let _ = worker_threads.response_event.read().unwrap(); + let response = worker_threads.response_rx.recv().unwrap(); + if let Err(e) = server.respond(response){ error!("HTTP server error on response: {e}"); } } - } - Err(ServerError::ShutdownEvent) => { - server.flush_outgoing_writes(); - return; - } - Err(e) => { - error!("HTTP server error on retrieving incoming request. 
Error: {e}"); + _ => { } } } } @@ -398,6 +597,7 @@ fn start_http_thread( Ok((thread, api_shutdown_fd)) } +#[allow(clippy::needless_pass_by_value)] pub fn start_http_path_thread( path: &str, api_notifier: EventFd, @@ -415,8 +615,8 @@ pub fn start_http_path_thread( start_http_thread( server, - api_notifier, - api_sender, + &api_notifier, + &api_sender, seccomp_action, exit_evt, hypervisor_type, @@ -424,6 +624,7 @@ pub fn start_http_path_thread( ) } +#[allow(clippy::needless_pass_by_value)] pub fn start_http_fd_thread( fd: RawFd, api_notifier: EventFd, @@ -437,8 +638,8 @@ pub fn start_http_fd_thread( let server = unsafe { HttpServer::new_from_fd(fd) }.map_err(VmmError::CreateApiServer)?; start_http_thread( server, - api_notifier, - api_sender, + &api_notifier, + &api_sender, seccomp_action, exit_evt, hypervisor_type, diff --git a/vmm/src/api/mod.rs b/vmm/src/api/mod.rs index 12ca6b9877..e56b01e67a 100644 --- a/vmm/src/api/mod.rs +++ b/vmm/src/api/mod.rs @@ -34,6 +34,8 @@ pub mod dbus; pub mod http; use std::io; +use std::num::NonZeroU32; +use std::path::PathBuf; use std::sync::mpsc::{RecvError, SendError, Sender, channel}; use log::info; @@ -47,7 +49,7 @@ use vmm_sys_util::eventfd::EventFd; pub use self::dbus::start_dbus_thread; pub use self::http::{start_http_fd_thread, start_http_path_thread}; use crate::Error as VmmError; -use crate::config::RestoreConfig; +use crate::config::{RestoreConfig, RestoredNetConfig}; use crate::device_tree::DeviceTree; use crate::vm::{Error as VmError, VmState}; use crate::vm_config::{ @@ -256,19 +258,53 @@ pub struct VmCoredumpData { pub destination_url: String, } -#[derive(Clone, Deserialize, Serialize, Default, Debug)] +#[derive(Clone, Deserialize, Serialize, Debug)] pub struct VmReceiveMigrationData { /// URL for the reception of migration state pub receiver_url: String, + /// Optional URL if the TCP serial configuration must be changed during + /// migration. Example: "192.168.1.1:2222". + pub tcp_serial_url: Option, + /// Map with new network FDs on the new host. + pub net_fds: Option>, + /// Directory containing the TLS server certificate (server-cert.pem) and TLS server key (server-key.pem). + #[serde(default)] + pub tls_dir: Option, } -#[derive(Clone, Deserialize, Serialize, Default, Debug)] +#[derive(Clone, Deserialize, Serialize, Debug)] pub struct VmSendMigrationData { - /// URL to migrate the VM to + /// URL to migrate the VM to. + /// + /// This is not actually a URL, but we are stuck with the name, because it's + /// part of the HTTP API. The destination is a string, such as + /// tcp:: or unix:/path/to/socket. pub destination_url: String, /// Send memory across socket without copying #[serde(default)] pub local: bool, + /// Microsecond level downtime + #[serde(default = "default_downtime")] + pub downtime: u64, + /// Second level migration timeout + #[serde(default)] + pub migration_timeout: u64, + /// The number of parallel connections for migration + #[serde(default = "default_connections")] + pub connections: NonZeroU32, + /// Directory containing the TLS root CA certificate (ca-cert.pem) + #[serde(default)] + pub tls_dir: Option, +} + +// Default value for downtime the same as qemu. +fn default_downtime() -> u64 { + 300 +} + +// We use a single connection for backward compatibility as default. 
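// Editor's sketch, not part of this patch: constructing a
// PUT /vm.send-migration body that exercises the new knobs. Values are
// examples only; omitted fields fall back to the serde defaults declared
// above (local = false, downtime = 300, migration_timeout = 0,
// connections = 1, tls_dir = None).
fn example_send_migration_body() -> VmSendMigrationData {
    serde_json::from_str(
        r#"{
            "destination_url": "tcp:192.168.1.10:6000",
            "connections": 4,
            "downtime": 300,
            "migration_timeout": 120,
            "tls_dir": "/etc/cloud-hypervisor/migration-tls"
        }"#,
    )
    .expect("valid VmSendMigrationData request body")
}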
+fn default_connections() -> NonZeroU32 { + NonZeroU32::new(1).unwrap() } pub enum ApiResponsePayload { diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml index 1fa3d9b517..04079e36f3 100644 --- a/vmm/src/api/openapi/cloud-hypervisor.yaml +++ b/vmm/src/api/openapi/cloud-hypervisor.yaml @@ -1281,10 +1281,24 @@ components: - destination_url type: object properties: + connections: + type: integer + format: int64 + default: 1 destination_url: type: string local: type: boolean + downtime: + type: integer + format: int64 + description: Maximum downtime in milliseconds during migration + default: 500 + migration_timeout: + type: integer + format: int64 + description: Total timeout for migration in milliseconds (0 = no limit) + default: 0 VmAddUserDevice: required: diff --git a/vmm/src/config.rs b/vmm/src/config.rs index 78d6f9f1e1..5cde35a799 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -6,6 +6,7 @@ use std::collections::{BTreeSet, HashMap}; #[cfg(feature = "ivshmem")] use std::fs; +use std::os::fd::RawFd; use std::path::PathBuf; use std::result; use std::str::FromStr; @@ -175,6 +176,9 @@ pub enum Error { /// Failed Parsing FwCfgItem config #[error("Error parsing --fw-cfg-config items")] ParseFwCfgItem(#[source] OptionParserError), + /// Failed parsing addr option + #[error("Error parsing --addr")] + ParsePciAddr(#[source] OptionParserError), } #[derive(Debug, PartialEq, Eq, Error)] @@ -182,6 +186,9 @@ pub enum ValidationError { /// Missing file value for console #[error("Path missing when using file console mode")] ConsoleFileMissing, + /// Missing TCP address for console + #[error("Address missing when using TCP console mode")] + ConsoleTcpAddressMissing, /// Missing socket path for console #[error("Path missing when using socket console mode")] ConsoleSocketPathMissing, @@ -227,8 +234,8 @@ pub enum ValidationError { #[error("Number of queues to virtio_net does not match the number of input FDs")] VnetQueueFdMismatch, /// Using reserved fd - #[error("Reserved fd number (<= 2)")] - VnetReservedFd, + #[error("Reserved fd number (fd={0} <= 2)")] + VnetReservedFd(RawFd), /// Hardware checksum offload is disabled. 
#[error("\"offload_tso\" and \"offload_ufo\" depend on \"offload_csum\"")] NoHardwareChecksumOffload, @@ -1093,7 +1100,7 @@ impl DiskConfig { ops_size=,ops_one_time_burst=,ops_refill_time=,\ id=,pci_segment=,rate_limit_group=,\ queue_affinity=,\ - serial="; + serial=,addr="; pub fn parse(disk: &str) -> Result { let mut parser = OptionParser::new(); @@ -1118,7 +1125,8 @@ impl DiskConfig { .add("pci_segment") .add("serial") .add("rate_limit_group") - .add("queue_affinity"); + .add("queue_affinity") + .add("addr"); parser.parse(disk).map_err(Error::ParseDisk)?; let path = parser.get("path").map(PathBuf::from); @@ -1230,6 +1238,10 @@ impl DiskConfig { None }; + let (bdf_device, _bdf_function) = parser + .get_pci_device_function() + .map_err(Error::ParsePciAddr)?; + Ok(DiskConfig { path, readonly, @@ -1247,6 +1259,7 @@ impl DiskConfig { pci_segment, serial, queue_affinity, + bdf_device, }) } @@ -1318,7 +1331,7 @@ impl NetConfig { vhost_user=,socket=,vhost_mode=client|server,\ bw_size=,bw_one_time_burst=,bw_refill_time=,\ ops_size=,ops_one_time_burst=,ops_refill_time=,pci_segment=\ - offload_tso=on|off,offload_ufo=on|off,offload_csum=on|off\""; + offload_tso=on|off,offload_ufo=on|off,offload_csum=on|off,addr=DD.F\""; pub fn parse(net: &str) -> Result { let mut parser = OptionParser::new(); @@ -1347,7 +1360,8 @@ impl NetConfig { .add("ops_size") .add("ops_one_time_burst") .add("ops_refill_time") - .add("pci_segment"); + .add("pci_segment") + .add("addr"); parser.parse(net).map_err(Error::ParseNetwork)?; let tap = parser.get("tap"); @@ -1458,6 +1472,10 @@ impl NetConfig { None }; + let (bdf_device, _bdf_function) = parser + .get_pci_device_function() + .map_err(Error::ParsePciAddr)?; + let config = NetConfig { tap, ip, @@ -1478,6 +1496,7 @@ impl NetConfig { offload_tso, offload_ufo, offload_csum, + bdf_device, }; Ok(config) } @@ -1494,7 +1513,12 @@ impl NetConfig { if let Some(fds) = self.fds.as_ref() { for fd in fds { if *fd <= 2 { - return Err(ValidationError::VnetReservedFd); + // If we see this, most likely our live migration path for network FDs failed. 
+ log::debug!( + "virtio-net devices {:?} unexpectedly reports invalid FD", + self.id + ); + return Err(ValidationError::VnetReservedFd(*fd)); } } } @@ -1545,7 +1569,7 @@ impl NetConfig { impl RngConfig { pub fn parse(rng: &str) -> Result { let mut parser = OptionParser::new(); - parser.add("src").add("iommu"); + parser.add("src").add("iommu").add("addr"); parser.parse(rng).map_err(Error::ParseRng)?; let src = PathBuf::from( @@ -1559,19 +1583,27 @@ impl RngConfig { .unwrap_or(Toggle(false)) .0; - Ok(RngConfig { src, iommu }) + let (bdf_device, _bdf_function) = parser + .get_pci_device_function() + .map_err(Error::ParsePciAddr)?; + + Ok(RngConfig { + src, + iommu, + bdf_device, + }) } } impl BalloonConfig { pub const SYNTAX: &'static str = "Balloon parameters \"size=,deflate_on_oom=on|off,\ - free_page_reporting=on|off\""; + free_page_reporting=on|off,addr=\""; pub fn parse(balloon: &str) -> Result { let mut parser = OptionParser::new(); parser.add("size"); parser.add("deflate_on_oom"); - parser.add("free_page_reporting"); + parser.add("free_page_reporting").add("addr"); parser.parse(balloon).map_err(Error::ParseBalloon)?; let size = parser @@ -1591,10 +1623,15 @@ impl BalloonConfig { .unwrap_or(Toggle(false)) .0; + let (bdf_device, _bdf_function) = parser + .get_pci_device_function() + .map_err(Error::ParsePciAddr)?; + Ok(BalloonConfig { size, deflate_on_oom, free_page_reporting, + bdf_device, }) } } @@ -1602,7 +1639,8 @@ impl BalloonConfig { impl FsConfig { pub const SYNTAX: &'static str = "virtio-fs parameters \ \"tag=,socket=,num_queues=,\ - queue_size=,id=,pci_segment=\""; + queue_size=,id=,pci_segment=,\ + addr=\""; pub fn parse(fs: &str) -> Result { let mut parser = OptionParser::new(); @@ -1612,7 +1650,8 @@ impl FsConfig { .add("num_queues") .add("socket") .add("id") - .add("pci_segment"); + .add("pci_segment") + .add("addr"); parser.parse(fs).map_err(Error::ParseFileSystem)?; let tag = parser.get("tag").ok_or(Error::ParseFsTagMissing)?; @@ -1637,6 +1676,10 @@ impl FsConfig { .map_err(Error::ParseFileSystem)? .unwrap_or_default(); + let (bdf_device, _bdf_function) = parser + .get_pci_device_function() + .map_err(Error::ParsePciAddr)?; + Ok(FsConfig { tag, socket, @@ -1644,6 +1687,7 @@ impl FsConfig { queue_size, id, pci_segment, + bdf_device, }) } @@ -1769,7 +1813,7 @@ impl FwCfgItem { impl PmemConfig { pub const SYNTAX: &'static str = "Persistent memory parameters \ \"file=,size=,iommu=on|off,\ - discard_writes=on|off,id=,pci_segment=\""; + discard_writes=on|off,id=,pci_segment=,addr=\""; pub fn parse(pmem: &str) -> Result { let mut parser = OptionParser::new(); @@ -1779,7 +1823,8 @@ impl PmemConfig { .add("iommu") .add("discard_writes") .add("id") - .add("pci_segment"); + .add("pci_segment") + .add("addr"); parser.parse(pmem).map_err(Error::ParsePersistentMemory)?; let file = PathBuf::from(parser.get("file").ok_or(Error::ParsePmemFileMissing)?); @@ -1803,6 +1848,10 @@ impl PmemConfig { .map_err(Error::ParsePersistentMemory)? 
.unwrap_or_default(); + let (bdf_device, _bdf_function) = parser + .get_pci_device_function() + .map_err(Error::ParsePciAddr)?; + Ok(PmemConfig { file, size, @@ -1810,6 +1859,7 @@ impl PmemConfig { discard_writes, id, pci_segment, + bdf_device, }) } @@ -1841,11 +1891,14 @@ impl ConsoleConfig { .add_valueless("null") .add("file") .add("iommu") - .add("socket"); + .add("tcp") + .add("socket") + .add("addr"); parser.parse(console).map_err(Error::ParseConsole)?; let mut file: Option = default_consoleconfig_file(); let mut socket: Option = None; + let mut url: Option = None; let mut mode: ConsoleOutputMode = ConsoleOutputMode::Off; if parser.is_set("off") { @@ -1861,6 +1914,25 @@ impl ConsoleConfig { Some(PathBuf::from(parser.get("file").ok_or( Error::Validation(ValidationError::ConsoleFileMissing), )?)); + } else if parser.is_set("tcp") { + mode = ConsoleOutputMode::Tcp; + url = Some( + parser + .get("tcp") + .ok_or(Error::Validation(ValidationError::ConsoleTcpAddressMissing))?, + ); + if parser.is_set("file") { + file = + Some(PathBuf::from(parser.get("file").ok_or( + Error::Validation(ValidationError::ConsoleFileMissing), + )?)); + } + } else if parser.is_set("file") { + mode = ConsoleOutputMode::File; + file = + Some(PathBuf::from(parser.get("file").ok_or( + Error::Validation(ValidationError::ConsoleFileMissing), + )?)); } else if parser.is_set("socket") { mode = ConsoleOutputMode::Socket; socket = Some(PathBuf::from(parser.get("socket").ok_or( @@ -1875,11 +1947,17 @@ impl ConsoleConfig { .unwrap_or(Toggle(false)) .0; + let (bdf_device, _bdf_function) = parser + .get_pci_device_function() + .map_err(Error::ParsePciAddr)?; + Ok(Self { file, mode, iommu, socket, + url, + bdf_device, }) } } @@ -1939,7 +2017,8 @@ impl DebugConsoleConfig { } impl DeviceConfig { - pub const SYNTAX: &'static str = "Direct device assignment parameters \"path=,iommu=on|off,id=,pci_segment=\""; + pub const SYNTAX: &'static str = "Direct device assignment parameters \"\ + path=,iommu=on|off,id=,pci_segment=\""; pub fn parse(device: &str) -> Result { let mut parser = OptionParser::new(); @@ -2043,7 +2122,7 @@ impl UserDeviceConfig { impl VdpaConfig { pub const SYNTAX: &'static str = "vDPA device \ \"path=,num_queues=,iommu=on|off,\ - id=,pci_segment=\""; + id=,pci_segment=,addr=\""; pub fn parse(vdpa: &str) -> Result { let mut parser = OptionParser::new(); @@ -2052,7 +2131,8 @@ impl VdpaConfig { .add("num_queues") .add("iommu") .add("id") - .add("pci_segment"); + .add("pci_segment") + .add("addr"); parser.parse(vdpa).map_err(Error::ParseVdpa)?; let path = parser @@ -2074,12 +2154,17 @@ impl VdpaConfig { .map_err(Error::ParseVdpa)? .unwrap_or_default(); + let (bdf_device, _bdf_function) = parser + .get_pci_device_function() + .map_err(Error::ParsePciAddr)?; + Ok(VdpaConfig { path, num_queues, iommu, id, pci_segment, + bdf_device, }) } @@ -2103,7 +2188,8 @@ impl VdpaConfig { impl VsockConfig { pub const SYNTAX: &'static str = "Virtio VSOCK parameters \ - \"cid=,socket=,iommu=on|off,id=,pci_segment=\""; + \"cid=,socket=,iommu=on|off,id=,\ + pci_segment=,addr=\""; pub fn parse(vsock: &str) -> Result { let mut parser = OptionParser::new(); @@ -2112,7 +2198,8 @@ impl VsockConfig { .add("cid") .add("iommu") .add("id") - .add("pci_segment"); + .add("pci_segment") + .add("addr"); parser.parse(vsock).map_err(Error::ParseVsock)?; let socket = parser @@ -2134,12 +2221,17 @@ impl VsockConfig { .map_err(Error::ParseVsock)? 
.unwrap_or_default(); + let (bdf_device, _bdf_function) = parser + .get_pci_device_function() + .map_err(Error::ParsePciAddr)?; + Ok(VsockConfig { cid, socket, iommu, id, pci_segment, + bdf_device, }) } @@ -2231,6 +2323,27 @@ pub struct RestoredNetConfig { pub fds: Option>, } +impl RestoredNetConfig { + // Ensure all net devices from 'VmConfig' backed by FDs have a + // corresponding 'RestoreNetConfig' with a matched 'id' and expected + // number of FDs. + pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { + let found = vm_config + .net + .iter() + .flatten() + .any(|net| net.id.as_ref() == Some(&self.id)); + + if found { + Ok(()) + } else { + Err(ValidationError::RestoreMissingRequiredNetId( + self.id.clone(), + )) + } + } +} + fn deserialize_restorednetconfig_fds<'de, D>( d: D, ) -> std::result::Result>, D::Error> @@ -3109,6 +3222,8 @@ impl VmConfig { /// To use this safely, the caller must guarantee that the input /// fds are all valid. pub unsafe fn add_preserved_fds(&mut self, mut fds: Vec) { + debug!("adding preserved FDs to VM list: {fds:?}"); + if fds.is_empty() { return; } @@ -3162,7 +3277,16 @@ impl Clone for VmConfig { .preserved_fds .as_ref() // SAFETY: FFI call with valid FDs - .map(|fds| fds.iter().map(|fd| unsafe { libc::dup(*fd) }).collect()), + .map(|fds| { + fds.iter() + .map(|fd| { + // SAFETY: Trivially safe. + let fd_duped = unsafe { libc::dup(*fd) }; + warn!("Cloning VM config: duping preserved FD {fd} => {fd_duped}"); + fd_duped + }) + .collect() + }), landlock_rules: self.landlock_rules.clone(), #[cfg(feature = "ivshmem")] ivshmem: self.ivshmem.clone(), @@ -3174,6 +3298,7 @@ impl Clone for VmConfig { impl Drop for VmConfig { fn drop(&mut self) { if let Some(mut fds) = self.preserved_fds.take() { + debug!("Closing preserved FDs from VM: fds={fds:?}"); for fd in fds.drain(..) 
{ // SAFETY: FFI call with valid FDs unsafe { libc::close(fd) }; @@ -3414,6 +3539,7 @@ mod unit_tests { pci_segment: 0, serial: None, queue_affinity: None, + bdf_device: None, } } @@ -3508,6 +3634,13 @@ mod unit_tests { ..disk_fixture() } ); + assert_eq!( + DiskConfig::parse("path=/path/to_file,addr=15.0")?, + DiskConfig { + bdf_device: Some(21), + ..disk_fixture() + } + ); Ok(()) } @@ -3532,6 +3665,7 @@ mod unit_tests { offload_tso: true, offload_ufo: true, offload_csum: true, + bdf_device: None, } } @@ -3596,6 +3730,14 @@ mod unit_tests { } ); + assert_eq!( + NetConfig::parse("mac=de:ad:be:ef:12:34,host_mac=12:34:de:ad:be:ef,addr=08.0")?, + NetConfig { + bdf_device: Some(8), + ..net_fixture() + } + ); + assert_eq!( NetConfig::parse("mac=de:ad:be:ef:12:34,mask=255.255.255.0")?, NetConfig { @@ -3623,6 +3765,7 @@ mod unit_tests { RngConfig { src: PathBuf::from("/dev/random"), iommu: true, + bdf_device: None, } ); assert_eq!( @@ -3632,6 +3775,13 @@ mod unit_tests { ..Default::default() } ); + assert_eq!( + RngConfig::parse("addr=10.0")?, + RngConfig { + bdf_device: Some(16), + ..Default::default() + } + ); Ok(()) } @@ -3643,6 +3793,7 @@ mod unit_tests { queue_size: 1024, id: None, pci_segment: 0, + bdf_device: None, } } @@ -3662,6 +3813,14 @@ mod unit_tests { } ); + assert_eq!( + FsConfig::parse("tag=mytag,socket=/tmp/sock,addr=0F.0")?, + FsConfig { + bdf_device: Some(15), + ..fs_fixture() + } + ); + Ok(()) } @@ -3673,6 +3832,7 @@ mod unit_tests { discard_writes: false, id: None, pci_segment: 0, + bdf_device: None, } } @@ -3700,6 +3860,13 @@ mod unit_tests { ..pmem_fixture() } ); + assert_eq!( + PmemConfig::parse("file=/tmp/pmem,size=128M,addr=1F.0")?, + PmemConfig { + bdf_device: Some(31), + ..pmem_fixture() + } + ); Ok(()) } @@ -3715,6 +3882,8 @@ mod unit_tests { iommu: false, file: None, socket: None, + url: None, + bdf_device: None, } ); assert_eq!( @@ -3724,6 +3893,8 @@ mod unit_tests { iommu: false, file: None, socket: None, + url: None, + bdf_device: None, } ); assert_eq!( @@ -3733,6 +3904,8 @@ mod unit_tests { iommu: false, file: None, socket: None, + url: None, + bdf_device: None, } ); assert_eq!( @@ -3742,6 +3915,8 @@ mod unit_tests { iommu: false, file: None, socket: None, + url: None, + bdf_device: None, } ); assert_eq!( @@ -3751,6 +3926,8 @@ mod unit_tests { iommu: false, file: Some(PathBuf::from("/tmp/console")), socket: None, + url: None, + bdf_device: None, } ); assert_eq!( @@ -3760,6 +3937,8 @@ mod unit_tests { iommu: true, file: None, socket: None, + url: None, + bdf_device: None, } ); assert_eq!( @@ -3769,6 +3948,8 @@ mod unit_tests { iommu: true, file: Some(PathBuf::from("/tmp/console")), socket: None, + url: None, + bdf_device: None, } ); assert_eq!( @@ -3778,6 +3959,8 @@ mod unit_tests { iommu: true, file: None, socket: Some(PathBuf::from("/tmp/serial.sock")), + url: None, + bdf_device: None, } ); Ok(()) @@ -3829,6 +4012,7 @@ mod unit_tests { iommu: false, id: None, pci_segment: 0, + bdf_device: None, } } @@ -3845,6 +4029,13 @@ mod unit_tests { ..vdpa_fixture() } ); + assert_eq!( + VdpaConfig::parse("path=/dev/vhost-vdpa,addr=0A.0")?, + VdpaConfig { + bdf_device: Some(10), + ..vdpa_fixture() + } + ); Ok(()) } @@ -3873,6 +4064,7 @@ mod unit_tests { iommu: false, id: None, pci_segment: 0, + bdf_device: None, } ); assert_eq!( @@ -3883,6 +4075,19 @@ mod unit_tests { iommu: true, id: None, pci_segment: 0, + bdf_device: None, + } + ); + + assert_eq!( + VsockConfig::parse("socket=/tmp/sock,cid=3,iommu=on,addr=08.0")?, + VsockConfig { + cid: 3, + socket: 
PathBuf::from("/tmp/sock"), + iommu: true, + id: None, + pci_segment: 0, + bdf_device: Some(8), } ); Ok(()) @@ -3962,6 +4167,7 @@ mod unit_tests { id: Some("net0".to_owned()), num_queues: 2, fds: Some(vec![-1, -1, -1, -1]), + bdf_device: Some(15), ..net_fixture() }, NetConfig { @@ -4135,6 +4341,7 @@ mod unit_tests { rng: RngConfig { src: PathBuf::from("/dev/urandom"), iommu: false, + bdf_device: None, }, balloon: None, fs: None, @@ -4144,12 +4351,16 @@ mod unit_tests { mode: ConsoleOutputMode::Null, iommu: false, socket: None, + url: None, + bdf_device: None, }, console: ConsoleConfig { file: None, mode: ConsoleOutputMode::Tty, iommu: false, socket: None, + url: None, + bdf_device: None, }, #[cfg(target_arch = "x86_64")] debug_console: DebugConsoleConfig::default(), @@ -4292,7 +4503,7 @@ mod unit_tests { }]); assert_eq!( invalid_config.validate(), - Err(ValidationError::VnetReservedFd) + Err(ValidationError::VnetReservedFd(0)) ); let mut invalid_config = valid_config.clone(); @@ -4467,6 +4678,7 @@ mod unit_tests { id: None, iommu: true, pci_segment: 1, + bdf_device: None, }); still_valid_config.validate().unwrap(); @@ -4543,6 +4755,7 @@ mod unit_tests { id: None, iommu: false, pci_segment: 1, + bdf_device: None, }); assert_eq!( invalid_config.validate(), diff --git a/vmm/src/console_devices.rs b/vmm/src/console_devices.rs index 76655d6c16..bbab9ada9f 100644 --- a/vmm/src/console_devices.rs +++ b/vmm/src/console_devices.rs @@ -12,6 +12,7 @@ use std::fs::{File, OpenOptions, read_link}; use std::mem::zeroed; +use std::net::TcpListener; use std::os::fd::{AsRawFd, FromRawFd, RawFd}; use std::os::unix::fs::OpenOptionsExt; use std::os::unix::net::UnixListener; @@ -40,6 +41,10 @@ pub enum ConsoleDeviceError { #[error("No socket option support for console device")] NoSocketOptionSupportForConsoleDevice, + /// Error parsing the TCP address + #[error("Wrong TCP address format: {0}")] + WrongTcpAddressFormat(std::string::String), + /// Error setting pty raw mode #[error("Error setting pty raw mode")] SetPtyRaw(#[source] vmm_sys_util::errno::Error), @@ -62,6 +67,7 @@ pub enum ConsoleOutput { Tty(Arc), Null, Socket(Arc), + Tcp(Arc, Option>), Off, } @@ -227,6 +233,7 @@ pub(crate) fn pre_create_console_devices(vmm: &mut Vmm) -> ConsoleDeviceResult { return Err(ConsoleDeviceError::NoSocketOptionSupportForConsoleDevice); } + ConsoleOutputMode::Tcp => ConsoleOutput::Null, ConsoleOutputMode::Null => ConsoleOutput::Null, ConsoleOutputMode::Off => ConsoleOutput::Off, }, @@ -264,6 +271,21 @@ pub(crate) fn pre_create_console_devices(vmm: &mut Vmm) -> ConsoleDeviceResult { + let url = vmconfig.serial.url.as_ref().unwrap(); + let socket_addr: std::net::SocketAddr = url + .parse() + .map_err(|_| ConsoleDeviceError::WrongTcpAddressFormat(url.to_string()))?; + let listener = TcpListener::bind(socket_addr) + .map_err(ConsoleDeviceError::CreateConsoleDevice)?; + + let mut f = None; + if let Some(p) = &vmconfig.serial.file { + let file = File::create(p).map_err(ConsoleDeviceError::CreateConsoleDevice)?; + f = Some(Arc::new(file)); + } + ConsoleOutput::Tcp(Arc::new(listener), f) + } ConsoleOutputMode::Null => ConsoleOutput::Null, ConsoleOutputMode::Off => ConsoleOutput::Off, }, @@ -290,6 +312,7 @@ pub(crate) fn pre_create_console_devices(vmm: &mut Vmm) -> ConsoleDeviceResult { return Err(ConsoleDeviceError::NoSocketOptionSupportForConsoleDevice); } + ConsoleOutputMode::Tcp => ConsoleOutput::Null, ConsoleOutputMode::Null => ConsoleOutput::Null, ConsoleOutputMode::Off => ConsoleOutput::Off, }, diff --git a/vmm/src/cpu.rs 
b/vmm/src/cpu.rs index ea030fb0cf..5c972f3f64 100644 --- a/vmm/src/cpu.rs +++ b/vmm/src/cpu.rs @@ -80,6 +80,8 @@ use vm_migration::{ use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::signal::{SIGRTMIN, register_signal_handler}; use zerocopy::{FromBytes, Immutable, IntoBytes}; +#[cfg(feature = "kvm")] +use {kvm_bindings::kvm_run, std::cell::Cell, std::os::fd::RawFd, std::sync::RwLock}; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::coredump::{ @@ -95,6 +97,16 @@ use crate::vm::physical_bits; use crate::vm_config::CpusConfig; use crate::{CPU_MANAGER_SNAPSHOT_ID, GuestMemoryMmap}; +#[cfg(feature = "kvm")] +thread_local! { + static KVM_RUN: Cell<*mut kvm_run> = const {Cell::new(core::ptr::null_mut())}; +} +#[cfg(feature = "kvm")] +/// Tell signal handler to not access certain stuff anymore during shutdown. +/// Otherwise => panics. +/// Better alternative would be to prevent signals there at all. +pub static IS_IN_SHUTDOWN: RwLock = RwLock::new(false); + #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] /// Extract the specified bits of a 64-bit integer. /// For example, to extrace 2 bits from offset 1 (zero based) of `6u64`, @@ -554,6 +566,13 @@ impl Vcpu { .map_err(Error::VcpuSetGicrBaseAddr)?; Ok(()) } + + #[cfg(feature = "kvm")] + pub fn get_kvm_vcpu_raw_fd(&self) -> RawFd { + // SAFETY: We happen to know that all current uses respect the safety contract. + // TODO find a better way to keep this safe and/or express its fragile state. + unsafe { self.vcpu.get_kvm_vcpu_raw_fd() } + } } impl Pausable for Vcpu {} @@ -1092,6 +1111,28 @@ impl CpuManager { thread::Builder::new() .name(format!("vcpu{vcpu_id}")) .spawn(move || { + // init thread-local kvm_run structure + #[cfg(feature = "kvm")] + { + let raw_kvm_fd = vcpu.lock().unwrap().get_kvm_vcpu_raw_fd(); + + // SAFETY: We know the FD is valid and have the proper args. + let buffer = unsafe { + libc::mmap( + core::ptr::null_mut(), + 4096, + libc::PROT_WRITE | libc::PROT_READ, + libc::MAP_SHARED, + raw_kvm_fd, + 0, + ) + }; + assert!(!buffer.is_null()); + assert_ne!(buffer, libc::MAP_FAILED); + let kvm_run = buffer.cast::(); + KVM_RUN.set(kvm_run); + } + // Schedule the thread to run on the expected CPU set if let Some(cpuset) = cpuset.as_ref() { // SAFETY: FFI call with correct arguments @@ -1121,7 +1162,35 @@ impl CpuManager { return; } + #[cfg(not(feature = "kvm"))] extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {} + #[cfg(feature = "kvm")] + extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) { + // We do not need a self-pipe for safe UNIX signal handling here as in this + // signal handler, we only expect the same signal over and over again. While + // different signals can interrupt a signal being handled, the same signal + // again can't by default. Therefore, this is safe. + + // This lock prevents accessing thread locals when a signal is received + // in the teardown phase of the Rust standard library. Otherwise, we would + // panic. + // + // Masking signals would be a nicer approach but this is the pragmatic + // solution. + // + // We don't have lock contention in normal operation. When the writer + // sets the bool to true, the lock is only held for a couple of µs. 
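                // Editor's note (assumption, the writer side is outside this
                // hunk): some VMM teardown path is expected to set the flag
                // before the vCPU threads and their thread-locals are torn
                // down, e.g. `*IS_IN_SHUTDOWN.write().unwrap() = true;`. From
                // that point on this handler returns early instead of touching
                // the KVM_RUN thread-local.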
+ let lock = IS_IN_SHUTDOWN.read().unwrap(); + if *lock { + return; + } + + let kvm_run = KVM_RUN.get(); + // SAFETY: the mapping is valid + let kvm_run = unsafe { + kvm_run.as_mut().expect("kvm_run should have been mapped as part of vCPU setup") }; + kvm_run.immediate_exit = 1; + } // This uses an async signal safe handler to kill the vcpu handles. register_signal_handler(SIGRTMIN(), handle_signal) .expect("Failed to register vcpu signal handler"); @@ -1160,12 +1229,14 @@ impl CpuManager { #[cfg(feature = "kvm")] if matches!(hypervisor_type, HypervisorType::Kvm) { - vcpu.lock().unwrap().vcpu.set_immediate_exit(true); - if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) { + let lock = vcpu.lock(); + let mut lock = lock.unwrap(); + lock.vcpu.set_immediate_exit(true); + if !matches!(lock.run(), Ok(VmExit::Ignore)) { error!("Unexpected VM exit on \"immediate_exit\" run"); break; } - vcpu.lock().unwrap().vcpu.set_immediate_exit(false); + lock.vcpu.set_immediate_exit(false); } vcpu_run_interrupted.store(true, Ordering::SeqCst); diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 6d465047b1..6ffeadf4cd 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -9,7 +9,7 @@ // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause // -use std::collections::{BTreeMap, BTreeSet, HashMap}; +use std::collections::{BTreeMap, BTreeSet, HashMap, VecDeque}; use std::fs::{File, OpenOptions}; use std::io::{self, IsTerminal, Seek, SeekFrom, stdout}; use std::num::Wrapping; @@ -470,7 +470,7 @@ pub enum DeviceManagerError { /// Failed to find an available PCI device ID. #[error("Failed to find an available PCI device ID")] - NextPciDeviceId(#[source] pci::PciRootError), + AllocatePciDeviceId(#[source] pci::PciRootError), /// Could not reserve the PCI device ID. #[error("Could not reserve the PCI device ID")] @@ -907,6 +907,7 @@ struct MetaVirtioDevice { iommu: bool, id: String, pci_segment: u16, + bdf_device: Option, dma_handler: Option>, } @@ -992,7 +993,7 @@ pub struct DeviceManager { cpu_manager: Arc>, // The virtio devices on the system - virtio_devices: Vec, + virtio_devices: VecDeque, /// All disks. Needed for locking and unlocking the images. 
block_devices: Vec>>, @@ -1329,7 +1330,7 @@ impl DeviceManager { config, memory_manager, cpu_manager, - virtio_devices: Vec::new(), + virtio_devices: VecDeque::new(), block_devices: vec![], bus_devices: Vec::new(), device_id_cnt, @@ -1643,6 +1644,7 @@ impl DeviceManager { &handle.id, handle.pci_segment, handle.dma_handler, + handle.bdf_device, )?; if handle.iommu { @@ -1671,7 +1673,8 @@ impl DeviceManager { } if let Some(iommu_device) = iommu_device { - let dev_id = self.add_virtio_pci_device(iommu_device, &None, &iommu_id, 0, None)?; + let dev_id = + self.add_virtio_pci_device(iommu_device, &None, &iommu_id, 0, None, None)?; self.iommu_attached_devices = Some((dev_id, iommu_attached_devices)); } } @@ -2352,6 +2355,9 @@ impl DeviceManager { ConsoleOutput::Socket(_) => { return Err(DeviceManagerError::NoSocketOptionSupportForConsoleDevice); } + ConsoleOutput::Tcp(_, _) => { + return Err(DeviceManagerError::NoSocketOptionSupportForConsoleDevice); + } ConsoleOutput::Null => Endpoint::Null, ConsoleOutput::Off => return Ok(None), }; @@ -2373,14 +2379,21 @@ impl DeviceManager { ) .map_err(DeviceManagerError::CreateVirtioConsole)?; let virtio_console_device = Arc::new(Mutex::new(virtio_console_device)); - self.virtio_devices.push(MetaVirtioDevice { + let device = MetaVirtioDevice { virtio_device: Arc::clone(&virtio_console_device) as Arc>, iommu: console_config.iommu, id: id.clone(), pci_segment: 0, dma_handler: None, - }); + bdf_device: console_config.bdf_device, + }; + + if console_config.bdf_device.is_some() { + self.virtio_devices.push_front(device); + } else { + self.virtio_devices.push_back(device); + } // Fill the device tree with a new node. In case of restore, we // know there is nothing to do, so we can simply override the @@ -2425,12 +2438,16 @@ impl DeviceManager { | ConsoleOutput::Null | ConsoleOutput::Pty(_) | ConsoleOutput::Socket(_) => None, + ConsoleOutput::Tcp(_, _) => None, }; if !matches!(console_info.serial_main_fd, ConsoleOutput::Off) { let serial = self.add_serial_device(interrupt_manager, serial_writer)?; self.serial_manager = match console_info.serial_main_fd { - ConsoleOutput::Pty(_) | ConsoleOutput::Tty(_) | ConsoleOutput::Socket(_) => { + ConsoleOutput::Pty(_) + | ConsoleOutput::Tty(_) + | ConsoleOutput::Socket(_) + | ConsoleOutput::Tcp(_, _) => { let serial_manager = SerialManager::new( serial, console_info.serial_main_fd, @@ -2463,6 +2480,7 @@ impl DeviceManager { | ConsoleOutput::Null | ConsoleOutput::Pty(_) | ConsoleOutput::Socket(_) => None, + ConsoleOutput::Tcp(_, _) => None, }; if let Some(writer) = debug_console_writer { let _ = self.add_debug_console_device(writer)?; @@ -2822,6 +2840,7 @@ impl DeviceManager { id, pci_segment: disk_cfg.pci_segment, dma_handler: None, + bdf_device: disk_cfg.bdf_device, }) } @@ -2830,7 +2849,11 @@ impl DeviceManager { if let Some(disk_list_cfg) = &mut block_devices { for disk_cfg in disk_list_cfg.iter_mut() { let device = self.make_virtio_block_device(disk_cfg, false)?; - self.virtio_devices.push(device); + if disk_cfg.bdf_device.is_some() { + self.virtio_devices.push_front(device); + } else { + self.virtio_devices.push_back(device); + } } } self.config.lock().unwrap().disks = block_devices; @@ -2853,6 +2876,7 @@ impl DeviceManager { let (virtio_device, migratable_device) = if net_cfg.vhost_user { let socket = net_cfg.vhost_socket.as_ref().unwrap().clone(); + debug!("Creating virtio-net device with vhost-user backend: {socket}"); let vu_cfg = VhostUserConfig { socket, num_queues: net_cfg.num_queues, @@ -2895,6 +2919,7 @@ impl 
DeviceManager { let state = state_from_id(self.snapshot.as_ref(), id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?; let virtio_net = if let Some(ref tap_if_name) = net_cfg.tap { + debug!("Creating virtio-net device from Tap device: {tap_if_name}"); Arc::new(Mutex::new( virtio_devices::Net::new( id.clone(), @@ -2920,6 +2945,7 @@ impl DeviceManager { .map_err(DeviceManagerError::CreateVirtioNet)?, )) } else if let Some(fds) = &net_cfg.fds { + debug!("Creating virtio-net device from network FDs: {fds:?}"); let net = virtio_devices::Net::from_tap_fds( id.clone(), fds, @@ -2946,6 +2972,9 @@ impl DeviceManager { Arc::new(Mutex::new(net)) } else { + debug!( + "Creating virtio-net device: no ifname or FDs given, creating new Tap device" + ); Arc::new(Mutex::new( virtio_devices::Net::new( id.clone(), @@ -2992,6 +3021,7 @@ impl DeviceManager { id, pci_segment: net_cfg.pci_segment, dma_handler: None, + bdf_device: net_cfg.bdf_device, }) } @@ -3001,7 +3031,11 @@ impl DeviceManager { if let Some(net_list_cfg) = &mut net_devices { for net_cfg in net_list_cfg.iter_mut() { let device = self.make_virtio_net_device(net_cfg)?; - self.virtio_devices.push(device); + if net_cfg.bdf_device.is_some() { + self.virtio_devices.push_front(device); + } else { + self.virtio_devices.push_back(device); + } } } self.config.lock().unwrap().net = net_devices; @@ -3030,14 +3064,20 @@ impl DeviceManager { ) .map_err(DeviceManagerError::CreateVirtioRng)?, )); - self.virtio_devices.push(MetaVirtioDevice { + let device = MetaVirtioDevice { virtio_device: Arc::clone(&virtio_rng_device) as Arc>, iommu: rng_config.iommu, id: id.clone(), pci_segment: 0, dma_handler: None, - }); + bdf_device: rng_config.bdf_device, + }; + if rng_config.bdf_device.is_some() { + self.virtio_devices.push_front(device); + } else { + self.virtio_devices.push_back(device); + } // Fill the device tree with a new node. 
In case of restore, we // know there is nothing to do, so we can simply override the @@ -3098,6 +3138,7 @@ impl DeviceManager { id, pci_segment: fs_cfg.pci_segment, dma_handler: None, + bdf_device: fs_cfg.bdf_device, }) } else { Err(DeviceManagerError::NoVirtioFsSock) @@ -3109,7 +3150,11 @@ impl DeviceManager { if let Some(fs_list_cfg) = &mut fs_devices { for fs_cfg in fs_list_cfg.iter_mut() { let device = self.make_virtio_fs_device(fs_cfg)?; - self.virtio_devices.push(device); + if fs_cfg.bdf_device.is_some() { + self.virtio_devices.push_front(device); + } else { + self.virtio_devices.push_back(device); + } } } self.config.lock().unwrap().fs = fs_devices; @@ -3287,6 +3332,7 @@ impl DeviceManager { id, pci_segment: pmem_cfg.pci_segment, dma_handler: None, + bdf_device: pmem_cfg.bdf_device, }) } @@ -3296,7 +3342,11 @@ impl DeviceManager { if let Some(pmem_list_cfg) = &mut pmem_devices { for pmem_cfg in pmem_list_cfg.iter_mut() { let device = self.make_virtio_pmem_device(pmem_cfg)?; - self.virtio_devices.push(device); + if pmem_cfg.bdf_device.is_some() { + self.virtio_devices.push_front(device); + } else { + self.virtio_devices.push_back(device); + } } } self.config.lock().unwrap().pmem = pmem_devices; @@ -3358,6 +3408,7 @@ impl DeviceManager { id, pci_segment: vsock_cfg.pci_segment, dma_handler: None, + bdf_device: vsock_cfg.bdf_device, }) } @@ -3365,7 +3416,11 @@ impl DeviceManager { let mut vsock = self.config.lock().unwrap().vsock.clone(); if let Some(vsock_cfg) = &mut vsock { let device = self.make_virtio_vsock_device(vsock_cfg)?; - self.virtio_devices.push(device); + if vsock_cfg.bdf_device.is_some() { + self.virtio_devices.push_front(device); + } else { + self.virtio_devices.push_back(device); + } } self.config.lock().unwrap().vsock = vsock; @@ -3407,13 +3462,14 @@ impl DeviceManager { self.virtio_mem_devices.push(Arc::clone(&virtio_mem_device)); - self.virtio_devices.push(MetaVirtioDevice { + self.virtio_devices.push_back(MetaVirtioDevice { virtio_device: Arc::clone(&virtio_mem_device) as Arc>, iommu: false, id: memory_zone_id.clone(), pci_segment: 0, dma_handler: None, + bdf_device: None, }); // Fill the device tree with a new node. 
In case of restore, we @@ -3440,7 +3496,7 @@ impl DeviceManager { let pci_segment_id = 0x0_u16; let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&id, pci_segment_id)?; + self.pci_resources(&id, pci_segment_id, None)?; info!("Creating pvmemcontrol device: id = {id}"); let (pvmemcontrol_pci_device, pvmemcontrol_bus_device) = @@ -3494,14 +3550,21 @@ impl DeviceManager { self.balloon = Some(virtio_balloon_device.clone()); - self.virtio_devices.push(MetaVirtioDevice { + let device = MetaVirtioDevice { virtio_device: Arc::clone(&virtio_balloon_device) as Arc>, iommu: false, id: id.clone(), pci_segment: 0, dma_handler: None, - }); + bdf_device: balloon_config.bdf_device, + }; + + if balloon_config.bdf_device.is_some() { + self.virtio_devices.push_front(device); + } else { + self.virtio_devices.push_back(device); + } self.device_tree .lock() @@ -3533,13 +3596,14 @@ impl DeviceManager { ) .map_err(DeviceManagerError::CreateVirtioWatchdog)?, )); - self.virtio_devices.push(MetaVirtioDevice { + self.virtio_devices.push_back(MetaVirtioDevice { virtio_device: Arc::clone(&virtio_watchdog_device) as Arc>, iommu: false, id: id.clone(), pci_segment: 0, dma_handler: None, + bdf_device: None, }); self.device_tree @@ -3598,6 +3662,7 @@ impl DeviceManager { id, pci_segment: vdpa_cfg.pci_segment, dma_handler: Some(vdpa_mapping), + bdf_device: vdpa_cfg.bdf_device, }) } @@ -3607,7 +3672,11 @@ impl DeviceManager { if let Some(vdpa_list_cfg) = &mut vdpa_devices { for vdpa_cfg in vdpa_list_cfg.iter_mut() { let device = self.make_vdpa_device(vdpa_cfg)?; - self.virtio_devices.push(device); + if vdpa_cfg.bdf_device.is_some() { + self.virtio_devices.push_front(device); + } else { + self.virtio_devices.push_back(device); + } } } self.config.lock().unwrap().vdpa = vdpa_devices; @@ -3684,7 +3753,7 @@ impl DeviceManager { }; let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&vfio_name, device_cfg.pci_segment)?; + self.pci_resources(&vfio_name, device_cfg.pci_segment, None)?; let mut needs_dma_mapping = false; @@ -3927,7 +3996,7 @@ impl DeviceManager { }; let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&vfio_user_name, device_cfg.pci_segment)?; + self.pci_resources(&vfio_user_name, device_cfg.pci_segment, None)?; let legacy_interrupt_group = if let Some(legacy_interrupt_manager) = &self.legacy_interrupt_manager { @@ -4039,6 +4108,7 @@ impl DeviceManager { virtio_device_id: &str, pci_segment_id: u16, dma_handler: Option>, + bdf_device: Option, ) -> DeviceManagerResult { let id = format!("{VIRTIO_PCI_DEVICE_NAME_PREFIX}-{virtio_device_id}"); @@ -4047,7 +4117,7 @@ impl DeviceManager { node.children = vec![virtio_device_id.to_string()]; let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&id, pci_segment_id)?; + self.pci_resources(&id, pci_segment_id, bdf_device)?; // Update the existing virtio node by setting the parent. 
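        // Editor's note (inference from the call sites in this patch): when
        // the user passed an explicit `addr=DD.F` option, `bdf_device` is
        // Some(device) and allocate_device_bdf() is expected to reserve
        // exactly that device slot on the segment; with None it behaves like
        // the former next_device_bdf() and hands out the next free slot.
        // Presumably this is also why devices with a fixed address are
        // push_front()'d onto `virtio_devices`: they claim their slots before
        // the auto-allocated ones.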
if let Some(node) = self.device_tree.lock().unwrap().get_mut(virtio_device_id) { @@ -4184,7 +4254,7 @@ impl DeviceManager { info!("Creating pvpanic device {id}"); let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&id, pci_segment_id)?; + self.pci_resources(&id, pci_segment_id, None)?; let snapshot = snapshot_from_id(self.snapshot.as_ref(), id.as_str()); @@ -4222,7 +4292,7 @@ impl DeviceManager { info!("Creating ivshmem device {id}"); let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&id, pci_segment_id)?; + self.pci_resources(&id, pci_segment_id, None)?; let snapshot = snapshot_from_id(self.snapshot.as_ref(), id.as_str()); let ivshmem_ops = Arc::new(Mutex::new(IvshmemHandler { @@ -4267,6 +4337,7 @@ impl DeviceManager { &self, id: &str, pci_segment_id: u16, + pci_device_id: Option, ) -> DeviceManagerResult<(u16, PciBdf, Option>)> { // Look for the id in the device tree. If it can be found, that means // the device is being restored, otherwise it's created from scratch. @@ -4293,7 +4364,8 @@ impl DeviceManager { (pci_segment_id, pci_device_bdf, resources) } else { - let pci_device_bdf = self.pci_segments[pci_segment_id as usize].next_device_bdf()?; + let pci_device_bdf = + self.pci_segments[pci_segment_id as usize].allocate_device_bdf(pci_device_id)?; (pci_segment_id, pci_device_bdf, None) }) @@ -4399,6 +4471,10 @@ impl DeviceManager { Ok(()) } + /// Notifies the VM for a hotplug. + /// + /// This call doesn't wait for the vCPU receiving the + /// interrupt to acknowledge. pub fn notify_hotplug( &self, _notification_type: AcpiNotificationFlags, @@ -4771,7 +4847,7 @@ impl DeviceManager { // Add the virtio device to the device manager list. This is important // as the list is used to notify virtio devices about memory updates // for instance. - self.virtio_devices.push(handle.clone()); + self.virtio_devices.push_back(handle.clone()); let mapping: Option> = if handle.iommu { self.iommu_mapping.clone() @@ -4785,6 +4861,7 @@ impl DeviceManager { &handle.id, handle.pci_segment, handle.dma_handler, + handle.bdf_device, )?; // Update the PCIU bitmap diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 6917e005e7..dd437e4694 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -3,31 +3,44 @@ // SPDX-License-Identifier: Apache-2.0 // +/// Amount of iterations before auto-converging starts. +const AUTO_CONVERGE_ITERATION_DELAY: u64 = 2; +/// Step size in percent to increase the vCPU throttling. +const AUTO_CONVERGE_STEP_SIZE: u8 = 10; +/// Amount of iterations after that we increase vCPU throttling. +const AUTO_CONVERGE_ITERATION_INCREASE: u64 = 2; +/// Maximum vCPU throttling value. 
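// Editor's note (assumption; the actual logic lives in the new
// vcpu_throttling module, which is not part of this hunk): taken together the
// constants suggest a schedule along these lines, with throttling starting
// after two memory-copy iterations and growing by 10 % every second
// iteration, capped just below 100 %:
//
//     let ramp = iteration.saturating_sub(AUTO_CONVERGE_ITERATION_DELAY)
//         / AUTO_CONVERGE_ITERATION_INCREASE;
//     let throttle_pct =
//         (ramp * u64::from(AUTO_CONVERGE_STEP_SIZE)).min(u64::from(AUTO_CONVERGE_MAX));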
+const AUTO_CONVERGE_MAX: u8 = 99; + use std::collections::HashMap; use std::fs::File; -use std::io::{Read, Write, stdout}; +use std::io::{ErrorKind, Read, Write, stdout}; use std::net::{TcpListener, TcpStream}; +use std::os::fd::{AsFd, BorrowedFd}; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; use std::os::unix::net::{UnixListener, UnixStream}; use std::panic::AssertUnwindSafe; use std::path::PathBuf; -use std::sync::mpsc::{Receiver, RecvError, SendError, Sender}; -use std::sync::{Arc, Mutex}; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::mpsc::{Receiver, RecvError, SendError, Sender, TrySendError}; +use std::sync::{Arc, Barrier, Mutex}; +use std::thread::JoinHandle; #[cfg(not(target_arch = "riscv64"))] -use std::time::Instant; -use std::{io, result, thread}; +use std::time::{Duration, Instant}; +use std::{io, mem, result, thread}; use anyhow::anyhow; #[cfg(feature = "dbus_api")] use api::dbus::{DBusApiOptions, DBusApiShutdownChannels}; use api::http::HttpApiHandle; +use arch::PAGE_SIZE; #[cfg(all(feature = "kvm", target_arch = "x86_64"))] use arch::x86_64::MAX_SUPPORTED_CPUS_LEGACY; use console_devices::{ConsoleInfo, pre_create_console_devices}; use event_monitor::event; use landlock::LandlockError; use libc::{EFD_NONBLOCK, SIGINT, SIGTERM, TCSANOW, tcsetattr, termios}; -use log::{error, info, trace, warn}; +use log::{debug, error, info, trace, warn}; use memory_manager::MemoryManagerSnapshotData; use pci::PciBdf; use seccompiler::{SeccompAction, apply_filter}; @@ -37,13 +50,20 @@ use signal_hook::iterator::{Handle, Signals}; use thiserror::Error; use tracer::trace_scoped; use vm_memory::bitmap::{AtomicBitmap, BitmapSlice}; -use vm_memory::{ReadVolatile, VolatileMemoryError, VolatileSlice, WriteVolatile}; +use vm_memory::{ + GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, ReadVolatile, + VolatileMemoryError, VolatileSlice, WriteVolatile, +}; use vm_migration::protocol::*; -use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; +use vm_migration::tls::{TlsConnectionWrapper, TlsStream, TlsStreamWrapper}; +use vm_migration::{ + Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, tls, +}; use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::signal::unblock_signal; use vmm_sys_util::sock_ctrl_msg::ScmSocket; +use crate::api::http::http_endpoint::ONGOING_LIVEMIGRATION; use crate::api::{ ApiRequest, ApiResponse, RequestHandler, VmInfoResponse, VmReceiveMigrationData, VmSendMigrationData, VmmPingResponse, @@ -51,6 +71,8 @@ use crate::api::{ use crate::config::{RestoreConfig, add_to_config}; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::coredump::GuestDebuggable; +#[cfg(feature = "kvm")] +use crate::cpu::IS_IN_SHUTDOWN; use crate::landlock::Landlock; use crate::memory_manager::MemoryManager; #[cfg(all(feature = "kvm", target_arch = "x86_64"))] @@ -85,6 +107,7 @@ mod pci_segment; pub mod seccomp_filters; mod serial_manager; mod sigwinch_listener; +mod vcpu_throttling; pub mod vm; pub mod vm_config; @@ -235,6 +258,7 @@ pub enum EpollDispatch { Api = 2, ActivateVirtioDevices = 3, Debug = 4, + CheckMigration = 5, Unknown, } @@ -247,6 +271,7 @@ impl From for EpollDispatch { 2 => Api, 3 => ActivateVirtioDevices, 4 => Debug, + 5 => CheckMigration, _ => Unknown, } } @@ -255,6 +280,7 @@ impl From for EpollDispatch { enum SocketStream { Unix(UnixStream), Tcp(TcpStream), + Tls(Box), } impl Read for SocketStream { @@ -262,6 +288,7 @@ impl Read for SocketStream { match 
self { SocketStream::Unix(stream) => stream.read(buf), SocketStream::Tcp(stream) => stream.read(buf), + SocketStream::Tls(stream) => stream.read(buf), } } } @@ -271,6 +298,7 @@ impl Write for SocketStream { match self { SocketStream::Unix(stream) => stream.write(buf), SocketStream::Tcp(stream) => stream.write(buf), + SocketStream::Tls(stream) => stream.write(buf), } } @@ -278,15 +306,17 @@ impl Write for SocketStream { match self { SocketStream::Unix(stream) => stream.flush(), SocketStream::Tcp(stream) => stream.flush(), + SocketStream::Tls(stream) => stream.flush(), } } } -impl AsRawFd for SocketStream { - fn as_raw_fd(&self) -> RawFd { +impl AsFd for SocketStream { + fn as_fd(&self) -> BorrowedFd<'_> { match self { - SocketStream::Unix(s) => s.as_raw_fd(), - SocketStream::Tcp(s) => s.as_raw_fd(), + SocketStream::Unix(s) => s.as_fd(), + SocketStream::Tcp(s) => s.as_fd(), + SocketStream::Tls(s) => s.as_fd(), } } } @@ -299,6 +329,7 @@ impl ReadVolatile for SocketStream { match self { SocketStream::Unix(s) => s.read_volatile(buf), SocketStream::Tcp(s) => s.read_volatile(buf), + SocketStream::Tls(s) => s.read_volatile(buf), } } @@ -309,6 +340,7 @@ impl ReadVolatile for SocketStream { match self { SocketStream::Unix(s) => s.read_exact_volatile(buf), SocketStream::Tcp(s) => s.read_exact_volatile(buf), + SocketStream::Tls(s) => s.read_exact_volatile(buf), } } } @@ -321,6 +353,7 @@ impl WriteVolatile for SocketStream { match self { SocketStream::Unix(s) => s.write_volatile(buf), SocketStream::Tcp(s) => s.write_volatile(buf), + SocketStream::Tls(s) => s.write_volatile(buf), } } @@ -331,6 +364,7 @@ impl WriteVolatile for SocketStream { match self { SocketStream::Unix(s) => s.write_all_volatile(buf), SocketStream::Tcp(s) => s.write_all_volatile(buf), + SocketStream::Tls(s) => s.write_all_volatile(buf), } } } @@ -654,6 +688,101 @@ impl VmmVersionInfo { } } +#[derive(Debug, Clone)] +struct MigrationState { + current_dirty_pages: u64, + downtime: Duration, + downtime_start: Instant, + iteration: u64, + iteration_cost_time: Duration, + iteration_start_time: Instant, + mb_per_sec: f64, + pages_per_second: u64, + pending_size: u64, + start_time: Instant, + threshold_size: u64, + total_time: Duration, + total_transferred_bytes: u64, + total_transferred_dirty_pages: u64, +} + +impl MigrationState { + pub fn new() -> Self { + Self { + current_dirty_pages: 0, + downtime: Duration::default(), + downtime_start: Instant::now(), + iteration: 0, + iteration_cost_time: Duration::default(), + iteration_start_time: Instant::now(), + mb_per_sec: 0.0, + pages_per_second: 0, + pending_size: 0, + start_time: Instant::now(), + threshold_size: 0, + total_time: Duration::default(), + total_transferred_bytes: 0, + total_transferred_dirty_pages: 0, + } + } +} + +/// Abstraction for the thread controlling and performing the live migration. +/// +/// The migration thread also takes ownership of the [`Vm`] from the [`Vmm`]. +struct MigrationWorker { + vm: Vm, + check_migration_evt: EventFd, + config: VmSendMigrationData, + #[cfg(all(feature = "kvm", target_arch = "x86_64"))] + hypervisor: Arc, +} + +impl MigrationWorker { + /// Performs any final cleanup after failed live migrations. + /// + /// Helper for [`Self::migrate`]. + fn migrate_error_cleanup(&mut self) -> result::Result<(), MigratableError> { + // Stop logging dirty pages only for non-local migrations + if !self.config.local { + self.vm.stop_dirty_log()?; + } + + Ok(()) + } + + /// Migrate and cleanup. 
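+ ///
+ /// On failure, [`Self::migrate_error_cleanup`] runs before the error is
+ /// propagated, so dirty-page logging is not left enabled on the source VM
+ /// (the cleanup is only needed for non-local migrations).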
+ fn migrate(&mut self) -> result::Result<(), MigratableError> { + debug!("start sending migration"); + Vmm::send_migration( + &mut self.vm, + #[cfg(all(feature = "kvm", target_arch = "x86_64"))] + self.hypervisor.as_ref(), + &self.config, + ).inspect_err(|_| { + let e = self.migrate_error_cleanup(); + if let Err(e) = e { + error!("Failed to clean up after a failed live migration. VM might keep running but in an odd or possibly slowed-down state: {e}"); + } + })?; + + Ok(()) + } + + /// Perform the migration and communicate with the [`Vmm`] thread. + fn run(mut self) -> (Vm, result::Result<(), MigratableError>) { + debug!("migration thread is starting"); + + let res = self.migrate().inspect_err(|e| error!("migrate error: {e}")); + + // Notify VMM thread to get migration result by joining this thread. + self.check_migration_evt.write(1).unwrap(); + + debug!("migration thread is finished"); + (self.vm, res) + } +} + pub struct VmmThreadHandle { pub thread_handle: thread::JoinHandle>, #[cfg(feature = "dbus_api")] @@ -661,6 +790,41 @@ pub struct VmmThreadHandle { pub http_api_handle: Option, } +/// Describes the current ownership of a running VM. +#[allow(clippy::large_enum_variant)] +pub enum MaybeVmOwnership { + /// The VMM holds the ownership of the VM. + Vmm(Vm), + /// The VM is temporarily blocked by the current ongoing migration. + Migration, + /// No VM is running. + None, +} + +impl MaybeVmOwnership { + /// Takes the VM and replaces it with [`Self::Migration`]. + /// + /// # Panics + /// This method panics if `self` is not [`Self::Vmm`]. + fn take_vm_for_migration(&mut self) -> Vm { + if !matches!(self, Self::Vmm(_)) { + panic!("should only be called when a migration can start"); + } + + match mem::replace(self, Self::Migration) { + MaybeVmOwnership::Vmm(vm) => vm, + _ => unreachable!(), + } + } + + fn vm_mut(&mut self) -> Option<&mut Vm> { + match self { + MaybeVmOwnership::Vmm(vm) => Some(vm), + _ => None, + } + } +} + pub struct Vmm { epoll: EpollContext, exit_evt: EventFd, @@ -671,7 +835,7 @@ pub struct Vmm { #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, version: VmmVersionInfo, - vm: Option, + vm: MaybeVmOwnership, vm_config: Option>>, seccomp_action: SeccompAction, hypervisor: Arc, @@ -681,6 +845,300 @@ pub struct Vmm { original_termios_opt: Arc>>, console_resize_pipe: Option>, console_info: Option, + check_migration_evt: EventFd, + /// Handle to the [`MigrationWorker`] thread. + /// + /// The handle will return the [`Vm`] back in any case. Further, the underlying error (if any) is returned. + migration_thread_handle: Option)>>, +} + +/// Wait for a file descriptor to become readable. In this case, we return +/// true. In case, the eventfd was signaled, return false. +fn wait_for_readable( + fd: &impl AsFd, + eventfd: &EventFd, +) -> std::result::Result { + let fd_event = eventfd.as_raw_fd().as_raw_fd(); + let fd_io = fd.as_fd().as_raw_fd(); + let mut poll_fds = [ + libc::pollfd { + fd: fd_event, + events: libc::POLLIN, + revents: 0, + }, + libc::pollfd { + fd: fd_io, + events: libc::POLLIN, + revents: 0, + }, + ]; + + // SAFETY: This is safe, because the file descriptors are valid and the + // poll_fds array is properly initialized. 
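+ // A timeout of -1 makes poll(2) block indefinitely, so the call only
+ // returns once one of the two descriptors becomes readable or an error occurs.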
+ let ret = unsafe { libc::poll(poll_fds.as_mut_ptr(), poll_fds.len() as libc::nfds_t, -1) }; + + if ret < 0 { + return Err(std::io::Error::last_os_error()); + } + + if poll_fds[0].revents & libc::POLLIN != 0 { + return Ok(false); + } + if poll_fds[1].revents & libc::POLLIN != 0 { + return Ok(true); + } + + panic!("Poll returned, but neither file descriptor is readable?"); +} + +/// Abstract over the different types of listeners that can be used to receive connections. +#[derive(Debug)] +enum ReceiveListener { + Tcp(TcpListener), + Unix(UnixListener, Option), + Tls(TcpListener, TlsConnectionWrapper), +} + +impl AsFd for ReceiveListener { + fn as_fd(&self) -> BorrowedFd<'_> { + match self { + ReceiveListener::Tcp(listener) => listener.as_fd(), + ReceiveListener::Unix(listener, _) => listener.as_fd(), + ReceiveListener::Tls(listener, _) => listener.as_fd(), + } + } +} + +impl ReceiveListener { + /// Block until a connection is accepted. + fn accept(&mut self) -> std::result::Result { + match self { + ReceiveListener::Tcp(listener) => listener + .accept() + .map(|(socket, _)| SocketStream::Tcp(socket)), + ReceiveListener::Unix(listener, opt_path) => { + let socket = listener + .accept() + .map(|(socket, _)| SocketStream::Unix(socket))?; + + // Remove the UNIX socket file after accepting the connection. Is this actually safe? If a user + // moves the file and creates a new one with the same name, we will delete the wrong file. + // Sounds like a confused deputy to me. + // + // TODO Don't do this? + if let Some(path) = opt_path.take() { + std::fs::remove_file(&path)?; + } + + Ok(socket) + } + ReceiveListener::Tls(listener, conn) => listener.accept().map(|(socket, _)| { + conn.wrap(socket) + .map(Box::new) + .map(SocketStream::Tls) + .map_err(std::io::Error::other) + })?, + } + } + + /// Same as accept(), but returns None if the eventfd is signaled. + fn abortable_accept( + &mut self, + eventfd: &EventFd, + ) -> std::result::Result, std::io::Error> { + wait_for_readable(&self, eventfd)? + .then(|| self.accept()) + .transpose() + } + + fn try_clone(&self) -> std::result::Result { + match self { + ReceiveListener::Tcp(listener) => listener.try_clone().map(ReceiveListener::Tcp), + ReceiveListener::Unix(listener, opt_path) => listener + .try_clone() + .map(|listener| ReceiveListener::Unix(listener, opt_path.clone())), + ReceiveListener::Tls(listener, conn) => listener + .try_clone() + .map(|listener| ReceiveListener::Tls(listener, conn.clone())), + } + } +} + +/// Handles a `Memory` request by writing its payload to the VM memory. +fn vm_receive_memory( + req: &Request, + socket: &mut T, + guest_mem: &GuestMemoryAtomic, +) -> std::result::Result<(), MigratableError> +where + T: Read + ReadVolatile, +{ + assert_eq!(req.command(), Command::Memory); + + // Read table + let ranges = MemoryRangeTable::read_from(socket, req.length())?; + let mem = guest_mem.memory(); + + for range in ranges.regions() { + let mut offset: u64 = 0; + // Here we are manually handling the retry in case we can't the + // whole region at once because we can't use the implementation + // from vm-memory::GuestMemory of read_exact_from() as it is not + // following the correct behavior. 
For more info about this issue + // see: https://github.com/rust-vmm/vm-memory/issues/174 + loop { + let bytes_read = mem + .read_volatile_from( + GuestAddress(range.gpa + offset), + socket, + (range.length - offset) as usize, + ) + .map_err(|e| { + MigratableError::MigrateReceive(anyhow!( + "Error receiving memory from socket: {e}" + )) + })?; + offset += bytes_read as u64; + + if offset == range.length { + break; + } + } + } + + Ok(()) +} + +/// We keep track of additional connections for receiving VM migration data +/// here. +struct ReceiveAdditionalConnections { + terminate_fd: EventFd, + + // This is only an option to be able to join it in the destructor. + accept_thread: Option>, +} + +impl ReceiveAdditionalConnections { + /// Create a pair of file descriptors that map to the same underlying event_fd. + fn event_fd_pair() -> std::result::Result<(EventFd, EventFd), std::io::Error> { + let event_fd = EventFd::new(0)?; + Ok((event_fd.try_clone()?, event_fd)) + } + + /// Handle incoming requests. + /// + /// For now we only handle `Command::Memory` requests here. Everything else + /// needs to come via the main connection. This function returns when the + /// abort_event_fd is triggered or the connection is closed or encountered + /// an error. + fn handle_requests( + socket: &mut SocketStream, + abort_event_fd: &EventFd, + guest_memory: &GuestMemoryAtomic, + ) -> std::result::Result<(), MigratableError> { + loop { + if !wait_for_readable(socket, abort_event_fd).map_err(|e| { + MigratableError::MigrateReceive(anyhow!("Failed to poll descriptors: {e}")) + })? { + info!("Got signal to tear down connection."); + return Ok(()); + } + + // TODO We only check whether we should abort when waiting for a new + // request. If the sender just stops sending data mid-request, we + // should still be abortable, but we are not... In this case, we + // will hang forever. But given that the sender is also in charge of + // driving the migration to completion, this is not a major concern. + // In the long run, it would be preferable to move I/O to + // asynchronous tasks to be able to handle aborts more gracefully. + + let req = match Request::read_from(socket) { + Ok(req) => req, + Err(MigratableError::MigrateSocket(io_error)) + if io_error.kind() == ErrorKind::UnexpectedEof => + { + debug!("Connection closed by peer"); + return Ok(()); + } + Err(e) => return Err(e), + }; + + if req.command() != Command::Memory { + return Err(MigratableError::MigrateReceive(anyhow!( + "Dropping connection. Only Memory commands are allowed on additional connections, but got {:?}", + req.command() + ))); + } + + vm_receive_memory(&req, socket, guest_memory)?; + Response::ok().write_to(socket)?; + } + } + + /// Starts a thread to accept incoming connections and handle them. These + /// additional connections are used to receive additional memory regions + /// during VM migration. + fn new( + listener: ReceiveListener, + guest_memory: GuestMemoryAtomic, + ) -> std::result::Result { + let (terminate_fd1, terminate_fd2) = Self::event_fd_pair()?; + + let accept_thread = std::thread::spawn(move || { + let terminate_fd = terminate_fd2; + let mut listener = listener; + let mut threads: Vec> = Vec::new(); + while let Ok(Some(mut socket)) = listener.abortable_accept(&terminate_fd) { + let guest_memory = guest_memory.clone(); + let terminate_fd = terminate_fd.try_clone().unwrap(); + + // We handle errors locally and log them. Passing them along is + // painful with little value. 
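+ // Each accepted connection gets its own handler thread below; the accept
+ // loop ends once terminate_fd is signalled, and all handler threads are
+ // joined afterwards.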
+ threads.push(std::thread::spawn(move || { + if let Err(e) = Self::handle_requests(&mut socket, &terminate_fd, &guest_memory) + { + error!( + "Failed to read more requests on additional receive connection: {e}" + ); + } + })); + } + + info!("Stopped accepting additional connections. Cleaning up threads."); + threads.into_iter().for_each(|thread| { + thread.join().unwrap(); + }); + }); + + Ok(Self { + accept_thread: Some(accept_thread), + terminate_fd: terminate_fd1, + }) + } + + /// Stop accepting additional connections and tear down all connections. + /// + /// This function does not wait for the operation to complete. + fn signal_termination(&self) { + // It's not really worth propagating this error, because it only happens if + // something hit the fan and we can't really do anything about it. + if let Err(e) = self.terminate_fd.write(1) { + error!("Failed to wake up other threads: {e}"); + } + } +} + +impl Drop for ReceiveAdditionalConnections { + fn drop(&mut self) { + self.signal_termination(); + // This unwrap is safe, because we never write a None into + // self.accept_thread in other places. + let _accept_thread = self.accept_thread.take().unwrap(); + + // TODO The accept thread tries to join all threads it started, but we + // haven't implemented tearing them down yet. + // accept_thread.join().unwrap(); + } } /// The receiver's state machine behind the migration protocol. @@ -694,8 +1152,16 @@ enum ReceiveMigrationState { /// We received file descriptors for memory. This can only happen on UNIX domain sockets. MemoryFdsReceived(Vec<(u32, File)>), - /// We received the VM configuration. We keep the memory configuration around to populate guest memory. From this point on, the sender can start sending memory updates. - Configured(Arc>), + /// We received the VM configuration. We keep the memory configuration around to populate guest memory. + /// From this point on, the sender can start sending memory updates. + /// + /// While the memory manager can also be used to populate guest memory, we keep a direct reference to + /// the memory around to populate guest memory without having to acquire a lock. + Configured( + Arc>, + GuestMemoryAtomic, + ReceiveAdditionalConnections, + ), /// Memory is populated and we received the state. The VM is ready to go. StateReceived, @@ -716,6 +1182,366 @@ impl ReceiveMigrationState { } } +/// The different kinds of messages we can send to memory sending threads. +#[derive(Debug)] +enum SendMemoryThreadMessage { + Memory(Arc), + Barrier(Arc), + Disconnect, +} + +/// This struct keeps track of additional threads we use to send VM memory. +struct SendAdditionalConnections { + guest_memory: GuestMemoryAtomic, + threads: Vec>, + channels: Vec>, + // If an error occurs in one of the worker threads, the worker signals this + // using this flag. Only the main thread checks this variable, the other + // workers will be stopped in the destructor. + cancel: Arc, + // The first worker encountering an error will transmit the error using + // this channel. + error_rx: std::sync::mpsc::Receiver, +} + +/// Send memory from the given table. 
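+///
+/// Wire framing, in order: a `Command::Memory` header carrying the table
+/// payload length, the serialized [`MemoryRangeTable`], the raw bytes of
+/// every listed range, and finally the receiver's acknowledgement. Sender-side
+/// sketch mirroring the body below (illustration only, not compiled):
+///
+/// ```ignore
+/// Request::memory(table.length()).write_to(socket)?;  // 1. header
+/// table.write_to(socket)?;                            // 2. range table
+/// send_memory_regions(guest_memory, table, socket)?;  // 3. raw pages
+/// Response::read_from(socket)?.ok_or_abandon(         // 4. wait for the ack
+///     socket,
+///     MigratableError::MigrateSend(anyhow!("Error during dirty memory migration")),
+/// )?;
+/// ```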
+fn vm_send_memory( + guest_memory: &GuestMemoryAtomic, + socket: &mut SocketStream, + table: &MemoryRangeTable, +) -> result::Result<(), MigratableError> { + if table.regions().is_empty() { + return Ok(()); + } + + Request::memory(table.length()).write_to(socket)?; + table.write_to(socket)?; + // And then the memory itself + send_memory_regions(guest_memory, table, socket)?; + Response::read_from(socket)?.ok_or_abandon( + socket, + MigratableError::MigrateSend(anyhow!("Error during dirty memory migration")), + )?; + + Ok(()) +} + +impl SendAdditionalConnections { + /// How many requests can be waiting to be sent for each connection. This + /// can be set to zero to disable buffering. Whether we need to buffer + /// requests is currently unclear. If this is set too high, some connections + /// might go unused, because work pools up on some connections. + const BUFFERED_REQUESTS_PER_THREAD: usize = 1; + + /// The size of each chunk of memory to send. + /// + /// We want to make this large, because each chunk is acknowledged and we + /// wait for the ack before sending the next chunk. The challenge is that if + /// it is _too_ large, we become more sensitive to network issues, like + /// packet drops in individual connections, because large amounts of data + /// can pool when throughput on one connection is temporarily reduced. + /// + /// We can consider making this configurable, but a better network protocol + /// that doesn't require ACKs would be more efficient. + /// + /// The best-case throughput per connection can be estimated via: + /// effective_throughput = chunk_size / (chunk_size / throughput_per_connection + round_trip_time) + const CHUNK_SIZE: u64 = 64 /* MiB */ << 20; + + fn new( + send_data_migration: &VmSendMigrationData, + guest_mem: &GuestMemoryAtomic, + ) -> std::result::Result { + let mut threads = Vec::new(); + let mut channels = Vec::new(); + let cancel = Arc::new(AtomicBool::new(false)); + let (error_tx, error_rx) = std::sync::mpsc::channel::(); + + let additional_connections = send_data_migration.connections.get() - 1; + for n in 0..(additional_connections) { + let socket = (match send_migration_socket(send_data_migration) { + Err(e) if n == 0 => { + // If we encounter a problem on the first additional + // connection, we just assume the other side doesn't support + // multiple connections and carry on. + info!( + "Couldn't establish additional connections for sending VM memory: {e}, ignoring!" + ); + break; + } + otherwise => otherwise, + })?; + let guest_mem = guest_mem.clone(); + let (send, recv) = std::sync::mpsc::sync_channel::( + Self::BUFFERED_REQUESTS_PER_THREAD, + ); + let cancel = cancel.clone(); + let err_tx = error_tx.clone(); + + let thread = thread::spawn(move || { + info!("Spawned thread to send VM memory."); + + let mut total_sent = 0; + let mut socket = socket; + + for msg in recv { + match msg { + SendMemoryThreadMessage::Memory(table) => { + match vm_send_memory(&guest_mem, &mut socket, &table) { + Ok(()) => { + total_sent += table + .ranges() + .iter() + .map(|range| range.length) + .sum::(); + } + Err(e) => { + // Only the first thread that encounters an + // error sends it to the main thread. + if cancel.swap(true, Ordering::AcqRel) + && let Err(e) = err_tx.send(e) + { + error!("Could not send error to main thread: {e}"); + } + // After that we exit gracefully. Note that + // this also closes our mpsc channel. 
+ break; + } + } + } + SendMemoryThreadMessage::Barrier(barrier) => { + barrier.wait(); + } + SendMemoryThreadMessage::Disconnect => { + break; + } + } + } + info!("Sent {} MiB via additional connection.", total_sent >> 20); + }); + + threads.push(thread); + channels.push(send); + } + + Ok(Self { + guest_memory: guest_mem.clone(), + threads, + channels, + cancel, + error_rx, + }) + } + + /// Wait until all data that is in-flight has actually been sent and acknowledged. + fn wait_for_pending_data(&self) { + assert_eq!(self.channels.len(), self.threads.len()); + + // TODO We don't actually need the threads to block at the barrier. We + // can probably find a better implementation that involves less + // synchronization. + + let barrier = Arc::new(Barrier::new(self.channels.len() + 1)); + + for channel in &self.channels { + channel + .send(SendMemoryThreadMessage::Barrier(barrier.clone())) + // The unwrap only fails fi + .unwrap(); + } + + barrier.wait(); + } + + /// Send memory via all connections that we have. This may be just one. + /// `socket` is the original socket that was used to connect to the + /// destination. + /// + /// When this function returns, all memory has been sent and acknowledged. + fn send_memory( + &self, + table: &MemoryRangeTable, + socket: &mut SocketStream, + ) -> std::result::Result<(), MigratableError> { + let thread_len = self.threads.len(); + assert_eq!(thread_len, self.channels.len()); + + // In case, we didn't manage to establish additional connections, don't + // bother sending memory in chunks. This would just lower throughput, + // because we wait for a response after each chunk instead of sending + // everything in one go. + if thread_len == 0 { + vm_send_memory(&self.guest_memory, socket, table)?; + return Ok(()); + } + + // The chunk size is chosen to be big enough so that even very fast + // links need some milliseconds to send it. + 'next_partition: for chunk in table.partition(Self::CHUNK_SIZE) { + // If one of the workers encountered an error, we return it. + if self.cancel.load(Ordering::Acquire) { + return Err(self.error_rx.recv().unwrap()); + } + + let chunk = Arc::new(chunk); + + // Find the first free channel and send the chunk via it. + // + // TODO A better implementation wouldn't always start at the + // first thread, but go round-robin. + for channel in &self.channels { + match channel.try_send(SendMemoryThreadMessage::Memory(chunk.clone())) { + Ok(()) => continue 'next_partition, + Err(TrySendError::Full(_)) => { + // Try next channel. + } + Err(TrySendError::Disconnected(_)) => { + return Err(MigratableError::MigrateSend(anyhow!( + "Sending thread died?" + ))); + } + } + } + + // Fallback to sending the chunk via the control connection. + vm_send_memory(&self.guest_memory, socket, &chunk)?; + } + + self.wait_for_pending_data(); + + Ok(()) + } +} + +impl Drop for SendAdditionalConnections { + fn drop(&mut self) { + info!("Sending disconnect message to channels"); + self.channels.drain(..).for_each(|channel| { + // One of the workers may have died and thus closed the channel. + // Thus we cannot simply do send().unwrap(). + let e = channel.send(SendMemoryThreadMessage::Disconnect); + if let Err(e) = e { + error!("Could not send disconnect message to worker thread: {e}"); + } + }); + + info!("Waiting for threads to finish"); + self.threads + .drain(..) + .for_each(|thread| thread.join().unwrap()); + info!("Threads finished"); + } +} + +/// Establishes a connection to a migration destination socket (TCP or UNIX). 
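+///
+/// The destination URL selects the transport: `tcp:<address>:<port>` opens a
+/// TCP connection (wrapped in TLS when `tls_dir` is set), while `unix:<path>`
+/// connects to a UNIX domain socket. Made-up examples for illustration:
+///
+/// ```ignore
+/// "tcp:192.0.2.10:6000"        // plain TCP, or TLS over TCP when tls_dir is set
+/// "unix:/tmp/ch-migrate.sock"  // UNIX domain socket
+/// ```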
+fn send_migration_socket( + send_data_migration: &VmSendMigrationData, +) -> std::result::Result { + if let Some(address) = send_data_migration.destination_url.strip_prefix("tcp:") { + info!("Connecting to TCP socket at {address}"); + + let socket = TcpStream::connect(address).map_err(|e| { + MigratableError::MigrateSend(anyhow!("Error connecting to TCP socket: {e}")) + })?; + + if send_data_migration.tls_dir.is_none() { + Ok(SocketStream::Tcp(socket)) + } else { + info!("Live Migration will be encrypted using TLS."); + // The address may still contain a port. I think we should build something more robust to also handle IPv6. + let tls_stream = tls::client_stream( + socket, + send_data_migration.tls_dir.as_ref().unwrap(), + address.split_once(':').map_or(address, |(host, _)| host), + )?; + Ok(SocketStream::Tls(Box::new(TlsStreamWrapper::new( + TlsStream::Client(tls_stream), + )))) + } + } else if let Some(path) = &send_data_migration.destination_url.strip_prefix("unix:") { + info!("Connecting to UNIX socket at {path:?}"); + + let socket = UnixStream::connect(path).map_err(|e| { + MigratableError::MigrateSend(anyhow!("Error connecting to UNIX socket: {e}")) + })?; + + Ok(SocketStream::Unix(socket)) + } else { + Err(MigratableError::MigrateSend(anyhow!( + "Invalid destination: {}", + send_data_migration.destination_url + ))) + } +} + +/// Creates a listener socket for receiving incoming migration connections (TCP or UNIX). +fn receive_migration_listener( + receiver_data_migration: &VmReceiveMigrationData, +) -> std::result::Result { + if let Some(address) = receiver_data_migration.receiver_url.strip_prefix("tcp:") { + let listener = TcpListener::bind(address).map_err(|e| { + MigratableError::MigrateReceive(anyhow!("Error binding to TCP socket: {e}")) + })?; + + if receiver_data_migration.tls_dir.is_none() { + Ok(ReceiveListener::Tcp(listener)) + } else { + Ok(ReceiveListener::Tls( + listener, + TlsConnectionWrapper::new(receiver_data_migration.tls_dir.as_ref().unwrap())?, + )) + } + } else if let Some(path) = receiver_data_migration.receiver_url.strip_prefix("unix:") { + UnixListener::bind(path) + .map_err(|e| { + MigratableError::MigrateReceive(anyhow!("Error binding to UNIX socket: {e}")) + }) + .map(|listener| ReceiveListener::Unix(listener, Some(path.into()))) + } else { + Err(MigratableError::MigrateSend(anyhow!( + "Invalid source: {}", + receiver_data_migration.receiver_url + ))) + } +} + +fn send_memory_regions( + guest_memory: &GuestMemoryAtomic, + ranges: &MemoryRangeTable, + fd: &mut SocketStream, +) -> std::result::Result<(), MigratableError> { + let mem = guest_memory.memory(); + + for range in ranges.regions() { + let mut offset: u64 = 0; + // Here we are manually handling the retry in case we can't the + // whole region at once because we can't use the implementation + // from vm-memory::GuestMemory of write_all_to() as it is not + // following the correct behavior. 
For more info about this issue + // see: https://github.com/rust-vmm/vm-memory/issues/174 + loop { + let bytes_written = mem + .write_volatile_to( + GuestAddress(range.gpa + offset), + fd, + (range.length - offset) as usize, + ) + .map_err(|e| { + MigratableError::MigrateSend(anyhow!( + "Error transferring memory to socket: {e}" + )) + })?; + offset += bytes_written as u64; + + if offset == range.length { + break; + } + } + } + + Ok(()) +} + impl Vmm { pub const HANDLED_SIGNALS: [i32; 2] = [SIGTERM, SIGINT]; @@ -771,14 +1597,14 @@ impl Vmm { .name("vmm_signal_handler".to_string()) .spawn(move || { if !signal_handler_seccomp_filter.is_empty() && let Err(e) = apply_filter(&signal_handler_seccomp_filter) - .map_err(Error::ApplySeccompFilter) - { - error!("Error applying seccomp filter: {e:?}"); - exit_evt.write(1).ok(); - return; - } + .map_err(Error::ApplySeccompFilter) + { + error!("Error applying seccomp filter: {e:?}"); + exit_evt.write(1).ok(); + return; + } - if landlock_enable{ + if landlock_enable { match Landlock::new() { Ok(landlock) => { let _ = landlock.restrict_self().map_err(Error::ApplyLandlock).map_err(|e| { @@ -796,11 +1622,11 @@ impl Vmm { std::panic::catch_unwind(AssertUnwindSafe(|| { Vmm::signal_handler(signals, original_termios_opt.as_ref(), &exit_evt); })) - .map_err(|_| { - error!("vmm signal_handler thread panicked"); - exit_evt.write(1).ok() - }) - .ok(); + .map_err(|_| { + error!("vmm signal_handler thread panicked"); + exit_evt.write(1).ok() + }) + .ok(); }) .map_err(Error::SignalHandlerSpawn)?, ); @@ -823,6 +1649,7 @@ impl Vmm { let mut epoll = EpollContext::new().map_err(Error::Epoll)?; let reset_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFdCreate)?; let activate_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFdCreate)?; + let check_migration_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFdCreate)?; epoll .add_event(&exit_evt, EpollDispatch::Exit) @@ -845,6 +1672,10 @@ impl Vmm { .add_event(&debug_evt, EpollDispatch::Debug) .map_err(Error::Epoll)?; + epoll + .add_event(&check_migration_evt, EpollDispatch::CheckMigration) + .map_err(Error::Epoll)?; + Ok(Vmm { epoll, exit_evt, @@ -855,7 +1686,7 @@ impl Vmm { #[cfg(feature = "guest_debug")] vm_debug_evt, version: vmm_version, - vm: None, + vm: MaybeVmOwnership::None, vm_config: None, seccomp_action, hypervisor, @@ -865,6 +1696,8 @@ impl Vmm { original_termios_opt: Arc::new(Mutex::new(None)), console_resize_pipe: None, console_info: None, + check_migration_evt, + migration_thread_handle: None, }) } @@ -893,10 +1726,11 @@ impl Vmm { /// _not_ write any response to the socket. 
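+ ///
+ /// State transitions driven by the commands handled below (any other
+ /// command in a given state is rejected as invalid and aborts the
+ /// migration):
+ ///
+ /// ```ignore
+ /// Started           --MemoryFd--> MemoryFdsReceived
+ /// Started           --Config----> Configured
+ /// MemoryFdsReceived --MemoryFd--> MemoryFdsReceived
+ /// MemoryFdsReceived --Config----> Configured
+ /// Configured        --Memory----> Configured      // guest RAM is populated
+ /// Configured        --State-----> StateReceived
+ /// StateReceived     --Complete--> Completed        // VM is resumed
+ /// ```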
fn vm_receive_migration_step( &mut self, + listener: &ReceiveListener, socket: &mut SocketStream, state: ReceiveMigrationState, req: &Request, - _receive_data_migration: &VmReceiveMigrationData, + receive_data_migration: &VmReceiveMigrationData, ) -> std::result::Result { use ReceiveMigrationState::*; @@ -906,14 +1740,59 @@ impl Vmm { ))) }; - let mut configure_vm = - |socket: &mut SocketStream, - memory_files: HashMap| - -> std::result::Result>, MigratableError> { - let memory_manager = self.vm_receive_config(req, socket, memory_files)?; + #[allow(clippy::type_complexity)] + let mut configure_vm = |socket: &mut SocketStream, + memory_files: HashMap| + -> std::result::Result< + ( + Arc>, + GuestMemoryAtomic, + ReceiveAdditionalConnections, + ), + MigratableError, + > { + let memory_manager = self.vm_receive_config( + req, + socket, + memory_files, + receive_data_migration.tcp_serial_url.clone(), + )?; - Ok(memory_manager) - }; + if let Some(ref restored_net_configs) = receive_data_migration.net_fds { + // TODO do some validation + //restored_net_config.validate(); + // Update VM's net configurations with new fds received for restore operation + + let mut vm_config = self.vm_config.as_mut().unwrap().lock().unwrap(); + + for net in restored_net_configs { + for net_config in vm_config.net.iter_mut().flatten() { + // update only if the net dev is backed by FDs + if net_config.id.as_ref() == Some(&net.id) && net_config.fds.is_some() { + debug!( + "overwriting net fds: id={}, old={:?}, new={:?}", + net.id, &net_config.fds, &net.fds + ); + net_config.fds.clone_from(&net.fds); + } + } + } + } + + let guest_memory = memory_manager.lock().unwrap().guest_memory(); + Ok(( + memory_manager, + guest_memory.clone(), + listener + .try_clone() + .and_then(|l| ReceiveAdditionalConnections::new(l, guest_memory)) + .map_err(|e| { + MigratableError::MigrateReceive(anyhow!( + "Failed to create receive additional connections: {e}" + )) + })?, + )) + }; let recv_memory_fd = |socket: &mut SocketStream, mut memory_files: Vec<(u32, File)>| @@ -936,32 +1815,38 @@ impl Vmm { }, Started => match req.command() { Command::MemoryFd => recv_memory_fd(socket, Vec::new()).map(MemoryFdsReceived), - Command::Config => configure_vm(socket, Default::default()).map(Configured), + Command::Config => configure_vm(socket, Default::default()) + .map(|res| Configured(res.0, res.1, res.2)), _ => invalid_command(), }, MemoryFdsReceived(memory_files) => match req.command() { Command::MemoryFd => recv_memory_fd(socket, memory_files).map(MemoryFdsReceived), - Command::Config => { - configure_vm(socket, HashMap::from_iter(memory_files)).map(Configured) - } + Command::Config => configure_vm(socket, HashMap::from_iter(memory_files)) + .map(|res| Configured(res.0, res.1, res.2)), _ => invalid_command(), }, - Configured(memory_manager) => match req.command() { - Command::Memory => { - self.vm_receive_memory(req, socket, &mut memory_manager.lock().unwrap())?; - Ok(Configured(memory_manager)) - } - Command::State => { - self.vm_receive_state(req, socket, memory_manager.clone())?; - Ok(StateReceived) + Configured(memory_manager, guest_memory, receive_additional_connections) => { + match req.command() { + Command::Memory => { + vm_receive_memory(req, socket, &guest_memory)?; + Ok(Configured( + memory_manager, + guest_memory, + receive_additional_connections, + )) + } + Command::State => { + self.vm_receive_state(req, socket, memory_manager)?; + Ok(StateReceived) + } + _ => invalid_command(), } - _ => invalid_command(), - }, + } StateReceived => 
match req.command() { Command::Complete => { // The unwrap is safe, because the state machine makes sure we called // vm_receive_state before, which creates the VM. - let vm = self.vm.as_mut().unwrap(); + let vm = self.vm.vm_mut().unwrap(); vm.resume()?; Ok(Completed) } @@ -978,6 +1863,7 @@ impl Vmm { req: &Request, socket: &mut T, existing_memory_files: HashMap, + tcp_serial_url: Option, ) -> std::result::Result>, MigratableError> where T: Read, @@ -1002,6 +1888,12 @@ impl Vmm { let config = vm_migration_config.vm_config.clone(); self.vm_config = Some(vm_migration_config.vm_config); + + if let Some(tcp_serial_url) = tcp_serial_url { + let mut vm_config = self.vm_config.as_mut().unwrap().lock().unwrap(); + vm_config.serial.url = Some(tcp_serial_url); + } + self.console_info = Some(pre_create_console_devices(self).map_err(|e| { MigratableError::MigrateReceive(anyhow!("Error creating console devices: {e:?}")) })?); @@ -1121,128 +2013,202 @@ impl Vmm { vm.restore().map_err(|e| { MigratableError::MigrateReceive(anyhow!("Failed restoring the Vm: {e}")) })?; - self.vm = Some(vm); - - Ok(()) - } - - fn vm_receive_memory( - &mut self, - req: &Request, - socket: &mut T, - memory_manager: &mut MemoryManager, - ) -> std::result::Result<(), MigratableError> - where - T: Read + ReadVolatile, - { - // Read table - let table = MemoryRangeTable::read_from(socket, req.length())?; + self.vm = MaybeVmOwnership::Vmm(vm); - // And then read the memory itself - memory_manager.receive_memory_regions(&table, socket)?; Ok(()) } - fn socket_url_to_path(url: &str) -> result::Result { - url.strip_prefix("unix:") - .ok_or_else(|| { - MigratableError::MigrateSend(anyhow!("Could not extract path from URL: {url}")) - }) - .map(|s| s.into()) + fn can_increase_autoconverge_step(s: &MigrationState) -> bool { + if s.iteration < AUTO_CONVERGE_ITERATION_DELAY { + false + } else { + let iteration = s.iteration - AUTO_CONVERGE_ITERATION_DELAY; + iteration.is_multiple_of(AUTO_CONVERGE_ITERATION_INCREASE) + } } - fn send_migration_socket( - destination_url: &str, - ) -> std::result::Result { - if let Some(address) = destination_url.strip_prefix("tcp:") { - info!("Connecting to TCP socket at {address}"); - - let socket = TcpStream::connect(address).map_err(|e| { - MigratableError::MigrateSend(anyhow!("Error connecting to TCP socket: {e}")) - })?; + fn memory_copy_iterations( + vm: &mut Vm, + mem_send: &SendAdditionalConnections, + socket: &mut SocketStream, + s: &mut MigrationState, + migration_timeout: Duration, + migrate_downtime_limit: Duration, + ) -> result::Result { + let mut bandwidth = 0.0; + let mut iteration_table; + + loop { + // todo: check if auto-converge is enabled at all? 
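+ // With the defaults above (delay of 2 iterations, +10% every 2 iterations,
+ // cap at 99%) the throttle evolves roughly like this when no iteration
+ // converges early (worked example, not measured data):
+ //
+ //   s.iteration : 0   1   2    3    4    5   ...  18   19   20
+ //   throttle    : 0%  0%  10%  10%  20%  20% ...  90%  90%  99%
+ //
+ // Further down, the loop exits once the remaining dirty bytes are at or
+ // below threshold_size = bandwidth [bytes/ms] * downtime_limit [ms]; e.g.
+ // at a measured ~1 GiB/s and a 300 ms downtime limit, the final copy round
+ // starts once roughly 300 MiB or less is still dirty.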
+ if Self::can_increase_autoconverge_step(s) && vm.throttle_percent() < AUTO_CONVERGE_MAX + { + let current_throttle = vm.throttle_percent(); + let new_throttle = current_throttle + AUTO_CONVERGE_STEP_SIZE; + let new_throttle = std::cmp::min(new_throttle, AUTO_CONVERGE_MAX); + log::info!("Increasing auto-converge: {new_throttle}%"); + if new_throttle != current_throttle { + vm.set_throttle_percent(new_throttle); + } + } - Ok(SocketStream::Tcp(socket)) - } else { - let path = Vmm::socket_url_to_path(destination_url)?; - info!("Connecting to UNIX socket at {path:?}"); + // Update the start time of the iteration + s.iteration_start_time = Instant::now(); - let socket = UnixStream::connect(&path).map_err(|e| { - MigratableError::MigrateSend(anyhow!("Error connecting to UNIX socket: {e}")) - })?; + // Increment iteration counter + s.iteration += 1; - Ok(SocketStream::Unix(socket)) - } - } + // Check if migration has timed out + // migration_timeout > 0 means enabling the timeout check, 0 means disabling the timeout check + if !migration_timeout.is_zero() && s.start_time.elapsed() > migration_timeout { + warn!("Migration timed out after {migration_timeout:?}"); + Request::abandon().write_to(socket)?; + Response::read_from(socket)?.ok_or_abandon( + socket, + MigratableError::MigrateSend(anyhow!("Migration timed out")), + )?; + } - fn receive_migration_socket( - receiver_url: &str, - ) -> std::result::Result { - if let Some(address) = receiver_url.strip_prefix("tcp:") { - let listener = TcpListener::bind(address).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error binding to TCP socket: {e}")) - })?; + // Get the dirty page table + iteration_table = vm.dirty_log()?; - let (socket, _addr) = listener.accept().map_err(|e| { - MigratableError::MigrateReceive(anyhow!( - "Error accepting connection on TCP socket: {e}" - )) - })?; + // Update the pending size (amount of data to transfer) + s.pending_size = iteration_table + .regions() + .iter() + .map(|range| range.length) + .sum(); - Ok(SocketStream::Tcp(socket)) - } else { - let path = Vmm::socket_url_to_path(receiver_url)?; - let listener = UnixListener::bind(&path).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error binding to UNIX socket: {e}")) - })?; + // Update thresholds + if bandwidth > 0.0 { + s.threshold_size = bandwidth as u64 * migrate_downtime_limit.as_millis() as u64; + } - let (socket, _addr) = listener.accept().map_err(|e| { - MigratableError::MigrateReceive(anyhow!( - "Error accepting connection on UNIX socket: {e}" - )) - })?; + // Enter the final stage of migration when the suspension conditions are met + if s.iteration > 1 && s.pending_size <= s.threshold_size { + break; + } - // Remove the UNIX socket file after accepting the connection - std::fs::remove_file(&path).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error removing UNIX socket file: {e}")) - })?; + // Update the number of dirty pages + s.total_transferred_bytes += s.pending_size; + s.current_dirty_pages = s.pending_size.div_ceil(PAGE_SIZE as u64); + s.total_transferred_dirty_pages += s.current_dirty_pages; + + // Send the current dirty pages + let transfer_start = Instant::now(); + mem_send.send_memory(&iteration_table, socket)?; + let transfer_time = transfer_start.elapsed().as_millis() as f64; + + // Update bandwidth + if transfer_time > 0.0 && s.pending_size > 0 { + bandwidth = s.pending_size as f64 / transfer_time; + // Convert bandwidth to MB/s + s.mb_per_sec = (bandwidth * 1000.0) / (1024.0 * 1024.0); + } - 
Ok(SocketStream::Unix(socket)) + // Update iteration cost time + s.iteration_cost_time = s.iteration_start_time.elapsed(); + if s.iteration_cost_time.as_millis() > 0 { + s.pages_per_second = + s.current_dirty_pages * 1000 / s.iteration_cost_time.as_millis() as u64; + } + debug!( + "iteration {}: cost={}ms, throttle={}%", + s.iteration, + s.iteration_cost_time.as_millis(), + vm.throttle_percent() + ); } + + Ok(iteration_table) } - // Returns true if there were dirty pages to send - fn vm_maybe_send_dirty_pages( + fn do_memory_migration( vm: &mut Vm, socket: &mut SocketStream, - ) -> result::Result { - // Send (dirty) memory table - let table = vm.dirty_log()?; + s: &mut MigrationState, + send_data_migration: &VmSendMigrationData, + ) -> result::Result<(), MigratableError> { + let mem_send = SendAdditionalConnections::new(send_data_migration, &vm.guest_memory())?; + + // Start logging dirty pages + vm.start_dirty_log()?; + + mem_send.send_memory(&vm.memory_range_table()?, socket)?; + + // Define the maximum allowed downtime 2000 seconds(2000000 milliseconds) + const MAX_MIGRATE_DOWNTIME: u64 = 2000000; - // But if there are no regions go straight to pause - if table.regions().is_empty() { - return Ok(false); + // Verify that downtime must be between 1 and MAX_MIGRATE_DOWNTIME + if send_data_migration.downtime == 0 || send_data_migration.downtime > MAX_MIGRATE_DOWNTIME + { + return Err(MigratableError::MigrateSend(anyhow!( + "downtime_limit must be an integer in the range of 1 to {MAX_MIGRATE_DOWNTIME} ms", + ))); } - Request::memory(table.length()).write_to(socket).unwrap(); - table.write_to(socket)?; - // And then the memory itself - vm.send_memory_regions(&table, socket)?; - Response::read_from(socket)?.ok_or_abandon( + let migration_timeout = Duration::from_secs(send_data_migration.migration_timeout); + let migrate_downtime_limit = Duration::from_millis(send_data_migration.downtime); + + // Verify that downtime must be less than the migration timeout + if !migration_timeout.is_zero() && migrate_downtime_limit >= migration_timeout { + return Err(MigratableError::MigrateSend(anyhow!( + "downtime_limit {}ms must be less than migration_timeout {}ms", + send_data_migration.downtime, + send_data_migration.migration_timeout * 1000 + ))); + } + + let iteration_table = Self::memory_copy_iterations( + vm, + &mem_send, socket, - MigratableError::MigrateSend(anyhow!("Error during dirty memory migration")), + s, + migration_timeout, + migrate_downtime_limit, )?; - Ok(true) + info!("Entering downtime phase"); + s.downtime_start = Instant::now(); + // End throttle thread + info!("stopping vcpu thread"); + vm.stop_vcpu_throttling(); + info!("stopped vcpu thread"); + info!("pausing VM"); + vm.pause()?; + info!("paused VM"); + + // Send last batch of dirty pages + let mut final_table = vm.dirty_log()?; + final_table.extend(iteration_table.clone()); + mem_send.send_memory(&final_table, socket)?; + // Update statistics + s.pending_size = final_table.regions().iter().map(|range| range.length).sum(); + s.total_transferred_bytes += s.pending_size; + s.current_dirty_pages = s.pending_size.div_ceil(PAGE_SIZE as u64); + s.total_transferred_dirty_pages += s.current_dirty_pages; + + // Stop logging dirty pages + vm.stop_dirty_log()?; + + Ok(()) } + /// Performs a live-migration. + /// + /// This function performs necessary after-migration cleanup only in the + /// good case. Callers are responsible for properly handling failed + /// migrations. 
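+ ///
+ /// High-level flow on the sender (doc sketch of the steps below; for
+ /// `--local` migrations the iterative memory phase is replaced by a plain
+ /// pause of the VM):
+ ///
+ /// ```ignore
+ /// Request::start().write_to(&mut socket)?;                  // handshake
+ /// /* ... send the VM configuration ... */
+ /// Self::do_memory_migration(vm, &mut socket, &mut s, send_data_migration)?;
+ /// let vm_snapshot = vm.snapshot()?;                         // device/vCPU state
+ /// /* send the snapshot, then the Complete command; the destination resumes the VM */
+ /// ```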
+ #[allow(unused_assignments)] // TODO remove fn send_migration( vm: &mut Vm, #[cfg(all(feature = "kvm", target_arch = "x86_64"))] hypervisor: &dyn hypervisor::Hypervisor, send_data_migration: &VmSendMigrationData, ) -> result::Result<(), MigratableError> { + let mut s = MigrationState::new(); + // Set up the socket connection - let mut socket = Self::send_migration_socket(&send_data_migration.destination_url)?; + let mut socket = send_migration_socket(send_data_migration)?; // Start the migration Request::start().write_to(&mut socket)?; @@ -1291,6 +2257,11 @@ impl Vmm { "--local option is not supported with TCP sockets", ))); } + SocketStream::Tls(_tls_socket) => { + return Err(MigratableError::MigrateSend(anyhow!( + "--local option is not supported with TCP sockets", + ))); + } } } @@ -1317,36 +2288,7 @@ impl Vmm { // Now pause VM vm.pause()?; } else { - // Start logging dirty pages - vm.start_dirty_log()?; - - // Send memory table - let table = vm.memory_range_table()?; - Request::memory(table.length()) - .write_to(&mut socket) - .unwrap(); - table.write_to(&mut socket)?; - // And then the memory itself - vm.send_memory_regions(&table, &mut socket)?; - Response::read_from(&mut socket)?.ok_or_abandon( - &mut socket, - MigratableError::MigrateSend(anyhow!("Error during dirty memory migration")), - )?; - - // Try at most 5 passes of dirty memory sending - const MAX_DIRTY_MIGRATIONS: usize = 5; - for i in 0..MAX_DIRTY_MIGRATIONS { - info!("Dirty memory migration {i} of {MAX_DIRTY_MIGRATIONS}"); - if !Self::vm_maybe_send_dirty_pages(vm, &mut socket)? { - break; - } - } - - // Now pause VM - vm.pause()?; - - // Send last batch of dirty pages - Self::vm_maybe_send_dirty_pages(vm, &mut socket)?; + Self::do_memory_migration(vm, &mut socket, &mut s, send_data_migration)?; } // We release the locks early to enable locking them on the destination host. @@ -1354,6 +2296,14 @@ impl Vmm { vm.release_disk_locks() .map_err(|e| MigratableError::UnlockError(anyhow!("{e}")))?; + #[cfg(feature = "kvm")] + // Prevent signal handler to access thread local storage when signals are received + // close to the end when thread-local storage is already destroyed. + { + let mut lock = IS_IN_SHUTDOWN.write().unwrap(); + *lock = true; + } + // Capture snapshot and send it let vm_snapshot = vm.snapshot()?; let snapshot_data = serde_json::to_vec(&vm_snapshot).unwrap(); @@ -1373,11 +2323,17 @@ impl Vmm { MigratableError::MigrateSend(anyhow!("Error completing migration")), )?; + // Record downtime + s.downtime = s.downtime_start.elapsed(); + // Stop logging dirty pages if !send_data_migration.local { vm.stop_dirty_log()?; } + // Record total migration time + s.total_time = s.start_time.elapsed(); + info!("Migration complete"); // Let every Migratable object know about the migration being complete @@ -1431,6 +2387,10 @@ impl Vmm { vm_config: Arc>, prefault: bool, ) -> std::result::Result<(), VmError> { + if matches!(self.vm, MaybeVmOwnership::Migration) { + return Err(VmError::VmMigrating); + } + let snapshot = recv_vm_state(source_url).map_err(VmError::Restore)?; #[cfg(all(feature = "kvm", target_arch = "x86_64"))] let vm_snapshot = get_vm_snapshot(&snapshot).map_err(VmError::Restore)?; @@ -1473,7 +2433,7 @@ impl Vmm { Some(source_url), Some(prefault), )?; - self.vm = Some(vm); + self.vm = MaybeVmOwnership::Vmm(vm); if self .vm_config @@ -1488,10 +2448,53 @@ impl Vmm { } // Now we can restore the rest of the VM. 
- if let Some(ref mut vm) = self.vm { - vm.restore() - } else { - Err(VmError::VmNotCreated) + // PANIC: won't panic, we just checked that the VM is there. + self.vm.vm_mut().unwrap().restore() + } + + /// Checks the migration result. + /// + /// This should be called when the migration thread indicated a state + /// change (and therefore, its termination). The function checks the result + /// of that thread and either shuts down the VMM on success or keeps the VM + /// and the VMM running on migration failure. + fn check_migration_result(&mut self) { + // At this point, the thread must be finished. + // If we fail here, we have lost anyway. Just panic. + let (vm, migration_res) = self + .migration_thread_handle + .take() + .expect("should have thread") + .join() + .expect("should have joined"); + + // Give VMM back control. + self.vm = MaybeVmOwnership::Vmm(vm); + + match migration_res { + Ok(()) => { + { + info!("Sending Receiver in HTTP thread that migration succeeded"); + let (sender, _) = &*ONGOING_LIVEMIGRATION; + // unblock API call; propagate migration result + sender.send(Ok(())).unwrap(); + } + + // Shutdown the VM after the migration succeeded + if let Err(e) = self.exit_evt.write(1) { + error!("Failed shutting down the VM after migration: {e}"); + } + } + Err(e) => { + error!("Migration failed: {e}"); + { + info!("Sending Receiver in HTTP thread that migration failed"); + let (sender, _) = &*ONGOING_LIVEMIGRATION; + // unblock API call; propagate migration result + sender.send(Err(e)).unwrap(); + } + // we don't fail the VMM here, it just continues running its VM + } } } @@ -1545,7 +2548,7 @@ impl Vmm { self.vm_reboot().map_err(Error::VmReboot)?; } EpollDispatch::ActivateVirtioDevices => { - if let Some(ref vm) = self.vm { + if let MaybeVmOwnership::Vmm(ref vm) = self.vm { let count = self.activate_evt.read().map_err(Error::EventFdRead)?; info!("Trying to activate pending virtio devices: count = {count}"); vm.activate_virtio_devices() @@ -1570,7 +2573,7 @@ impl Vmm { // Read from the API receiver channel let gdb_request = gdb_receiver.recv().map_err(Error::GdbRequestRecv)?; - let response = if let Some(ref mut vm) = self.vm { + let response = if let MaybeVmOwnership::Vmm(ref mut vm) = self.vm { vm.debug_request(&gdb_request.payload, gdb_request.cpu_id) } else { Err(VmError::VmNotRunning) @@ -1585,6 +2588,14 @@ impl Vmm { } #[cfg(not(feature = "guest_debug"))] EpollDispatch::Debug => {} + EpollDispatch::CheckMigration => { + info!("VM migration check event"); + // Consume the event. + self.check_migration_evt + .read() + .map_err(Error::EventFdRead)?; + self.check_migration_result(); + } } } } @@ -1635,102 +2646,116 @@ impl RequestHandler for Vmm { tracer::start(); info!("Booting VM"); event!("vm", "booting"); - let r = { - trace_scoped!("vm_boot"); - // If we don't have a config, we cannot boot a VM. - if self.vm_config.is_none() { - return Err(VmError::VmMissingConfig); - } - - // console_info is set to None in vm_shutdown. re-populate here if empty - if self.console_info.is_none() { - self.console_info = - Some(pre_create_console_devices(self).map_err(VmError::CreateConsoleDevices)?); - } - - // Create a new VM if we don't have one yet. 
- if self.vm.is_none() { - let exit_evt = self.exit_evt.try_clone().map_err(VmError::EventFdClone)?; - let reset_evt = self.reset_evt.try_clone().map_err(VmError::EventFdClone)?; - #[cfg(feature = "guest_debug")] - let vm_debug_evt = self - .vm_debug_evt - .try_clone() - .map_err(VmError::EventFdClone)?; - let activate_evt = self - .activate_evt - .try_clone() - .map_err(VmError::EventFdClone)?; - - if let Some(ref vm_config) = self.vm_config { - let vm = Vm::new( - Arc::clone(vm_config), - exit_evt, - reset_evt, - #[cfg(feature = "guest_debug")] - vm_debug_evt, - &self.seccomp_action, - self.hypervisor.clone(), - activate_evt, - self.console_info.clone(), - self.console_resize_pipe.clone(), - Arc::clone(&self.original_termios_opt), - None, - None, - None, - )?; - - self.vm = Some(vm); - } + + if matches!(self.vm, MaybeVmOwnership::Migration) { + return Err(VmError::VmMigrating); + } + + trace_scoped!("vm_boot"); + // If we don't have a config, we cannot boot a VM. + if self.vm_config.is_none() { + return Err(VmError::VmMissingConfig); + } + + // console_info is set to None in vm_shutdown. re-populate here if empty + if self.console_info.is_none() { + self.console_info = + Some(pre_create_console_devices(self).map_err(VmError::CreateConsoleDevices)?); + } + + // Create a new VM if we don't have one yet. + if matches!(self.vm, MaybeVmOwnership::None) { + let exit_evt = self.exit_evt.try_clone().map_err(VmError::EventFdClone)?; + let reset_evt = self.reset_evt.try_clone().map_err(VmError::EventFdClone)?; + #[cfg(feature = "guest_debug")] + let vm_debug_evt = self + .vm_debug_evt + .try_clone() + .map_err(VmError::EventFdClone)?; + let activate_evt = self + .activate_evt + .try_clone() + .map_err(VmError::EventFdClone)?; + + if let Some(ref vm_config) = self.vm_config { + let vm = Vm::new( + Arc::clone(vm_config), + exit_evt, + reset_evt, + #[cfg(feature = "guest_debug")] + vm_debug_evt, + &self.seccomp_action, + self.hypervisor.clone(), + activate_evt, + self.console_info.clone(), + self.console_resize_pipe.clone(), + Arc::clone(&self.original_termios_opt), + None, + None, + None, + )?; + + self.vm = MaybeVmOwnership::Vmm(vm); } + } - // Now we can boot the VM. - if let Some(ref mut vm) = self.vm { - vm.boot() - } else { - Err(VmError::VmNotCreated) + // Now we can boot the VM. 
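+ // The match distinguishes the three ownership states: a VM owned by the
+ // VMM is booted, a missing VM is an error, and the Migration variant is
+ // unreachable here because it was already rejected at the top of this
+ // function.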
+ match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + vm.boot()?; + event!("vm", "booted"); } - }; - tracer::end(); - if r.is_ok() { - event!("vm", "booted"); + MaybeVmOwnership::None => { + return Err(VmError::VmNotCreated); + } + _ => unreachable!(), } - r + + tracer::end(); + Ok(()) } fn vm_pause(&mut self) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - vm.pause().map_err(VmError::Pause) - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm.pause().map_err(VmError::Pause), + MaybeVmOwnership::Migration => Err(VmError::VmMigrating)?, + MaybeVmOwnership::None => Err(VmError::VmNotRunning)?, } } fn vm_resume(&mut self) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - vm.resume().map_err(VmError::Resume) - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm.resume().map_err(VmError::Resume), + MaybeVmOwnership::Migration => Err(VmError::VmMigrating)?, + MaybeVmOwnership::None => Err(VmError::VmNotRunning)?, } } fn vm_snapshot(&mut self, destination_url: &str) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - // Drain console_info so that FDs are not reused - let _ = self.console_info.take(); - vm.snapshot() - .map_err(VmError::Snapshot) - .and_then(|snapshot| { - vm.send(&snapshot, destination_url) - .map_err(VmError::SnapshotSend) - }) - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + // Drain console_info so that FDs are not reused + let _ = self.console_info.take(); + vm.snapshot() + .map_err(VmError::Snapshot) + .and_then(|snapshot| { + vm.send(&snapshot, destination_url) + .map_err(VmError::SnapshotSend) + }) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating)?, + MaybeVmOwnership::None => Err(VmError::VmNotRunning)?, } } fn vm_restore(&mut self, restore_cfg: RestoreConfig) -> result::Result<(), VmError> { - if self.vm.is_some() || self.vm_config.is_some() { + match &self.vm { + MaybeVmOwnership::Vmm(_vm) => return Err(VmError::VmAlreadyCreated), + MaybeVmOwnership::Migration => return Err(VmError::VmMigrating), + MaybeVmOwnership::None => (), + } + + if self.vm_config.is_some() { return Err(VmError::VmAlreadyCreated); } @@ -1777,21 +2802,25 @@ impl RequestHandler for Vmm { #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] fn vm_coredump(&mut self, destination_url: &str) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - vm.coredump(destination_url).map_err(VmError::Coredump) - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + vm.coredump(destination_url).map_err(VmError::Coredump) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => Err(VmError::VmNotRunning), } } fn vm_shutdown(&mut self) -> result::Result<(), VmError> { - let r = if let Some(ref mut vm) = self.vm.take() { - // Drain console_info so that the FDs are not reused - let _ = self.console_info.take(); - vm.shutdown() - } else { - Err(VmError::VmNotRunning) + let vm = match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm, + MaybeVmOwnership::Migration => return Err(VmError::VmMigrating), + MaybeVmOwnership::None => return Err(VmError::VmNotRunning), }; + // Drain console_info so that the FDs are not reused + let _ = self.console_info.take(); + let r = vm.shutdown(); + self.vm = MaybeVmOwnership::None; if r.is_ok() { event!("vm", "shutdown"); @@ -1804,13 +2833,14 @@ impl 
RequestHandler for Vmm { event!("vm", "rebooting"); // First we stop the current VM - let config = if let Some(mut vm) = self.vm.take() { - let config = vm.get_config(); - vm.shutdown()?; - config - } else { - return Err(VmError::VmNotCreated); + let vm = match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm, + MaybeVmOwnership::Migration => return Err(VmError::VmMigrating), + MaybeVmOwnership::None => return Err(VmError::VmNotRunning), }; + let config = vm.get_config(); + vm.shutdown()?; + self.vm = MaybeVmOwnership::None; // vm.shutdown() closes all the console devices, so set console_info to None // so that the closed FD #s are not reused. @@ -1859,7 +2889,7 @@ impl RequestHandler for Vmm { // And we boot it vm.boot()?; - self.vm = Some(vm); + self.vm = MaybeVmOwnership::Vmm(vm); event!("vm", "rebooted"); @@ -1867,33 +2897,38 @@ impl RequestHandler for Vmm { } fn vm_info(&self) -> result::Result { - match &self.vm_config { - Some(vm_config) => { - let state = match &self.vm { - Some(vm) => vm.get_state()?, - None => VmState::Created, - }; - let config = vm_config.lock().unwrap().clone(); - - let mut memory_actual_size = config.memory.total_size(); - if let Some(vm) = &self.vm { - memory_actual_size -= vm.balloon_size(); - } - - let device_tree = self - .vm - .as_ref() - .map(|vm| vm.device_tree().lock().unwrap().clone()); + let vm_config = self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; + let vm_config = vm_config.lock().unwrap().clone(); + + let state = match &self.vm { + MaybeVmOwnership::Vmm(vm) => vm.get_state()?, + // TODO in theory one could live-migrate a non-running VM .. + MaybeVmOwnership::Migration => VmState::Running, + MaybeVmOwnership::None => VmState::Created, + }; - Ok(VmInfoResponse { - config: Box::new(config), - state, - memory_actual_size, - device_tree, - }) + let mut memory_actual_size = vm_config.memory.total_size(); + match &self.vm { + MaybeVmOwnership::Vmm(vm) => { + memory_actual_size -= vm.balloon_size(); } - None => Err(VmError::VmNotCreated), + MaybeVmOwnership::Migration => {} + MaybeVmOwnership::None => {} } + + let device_tree = match &self.vm { + MaybeVmOwnership::Vmm(vm) => Some(vm.device_tree().lock().unwrap().clone()), + // TODO we need to fix this + MaybeVmOwnership::Migration => None, + MaybeVmOwnership::None => None, + }; + + Ok(VmInfoResponse { + config: Box::new(vm_config), + state, + memory_actual_size, + device_tree, + }) } fn vmm_ping(&self) -> VmmPingResponse { @@ -1915,14 +2950,19 @@ impl RequestHandler for Vmm { return Ok(()); } - // If a VM is booted, we first try to shut it down. - if self.vm.is_some() { - self.vm_shutdown()?; - } - - self.vm_config = None; + match &self.vm { + MaybeVmOwnership::Vmm(_vm) => { + event!("vm", "deleted"); - event!("vm", "deleted"); + // If a VM is booted, we first try to shut it down. 
+ self.vm_shutdown()?; + self.vm_config = None; + } + MaybeVmOwnership::None => { + self.vm_config = None; + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating)?, + } Ok(()) } @@ -1941,59 +2981,80 @@ impl RequestHandler for Vmm { ) -> result::Result<(), VmError> { self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; - if let Some(ref mut vm) = self.vm { - vm.resize(desired_vcpus, desired_ram, desired_balloon) - .inspect_err(|e| error!("Error when resizing VM: {e:?}"))?; - Ok(()) - } else { - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - if let Some(desired_vcpus) = desired_vcpus { - config.cpus.boot_vcpus = desired_vcpus; - } - if let Some(desired_ram) = desired_ram { - config.memory.size = desired_ram; + if desired_vcpus.is_some() { + todo!("doesn't work currently with our thread-local KVM_RUN approach"); + } + + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + vm.resize(desired_vcpus, desired_ram, desired_balloon) + .inspect_err(|e| error!("Error when resizing VM: {e:?}"))?; + Ok(()) } - if let Some(desired_balloon) = desired_balloon - && let Some(balloon_config) = &mut config.balloon - { - balloon_config.size = desired_balloon; + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + if let Some(desired_vcpus) = desired_vcpus { + config.cpus.boot_vcpus = desired_vcpus; + } + if let Some(desired_ram) = desired_ram { + config.memory.size = desired_ram; + } + if let Some(desired_balloon) = desired_balloon + && let Some(balloon_config) = &mut config.balloon + { + balloon_config.size = desired_balloon; + } + + Ok(()) } - Ok(()) } } fn vm_resize_disk(&mut self, id: String, desired_size: u64) -> result::Result<(), VmError> { + info!("request to resize disk: id={id}"); self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; - if let Some(ref mut vm) = self.vm { - return vm.resize_disk(&id, desired_size); + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + if let Err(e) = vm.resize_disk(&id, desired_size) { + error!("Error when resizing disk: {e:?}"); + Err(e) + } else { + Ok(()) + } + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => Err(VmError::ResizeDisk), } - - Err(VmError::ResizeDisk) } fn vm_resize_zone(&mut self, id: String, desired_ram: u64) -> result::Result<(), VmError> { self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; - if let Some(ref mut vm) = self.vm { - vm.resize_zone(&id, desired_ram) - .inspect_err(|e| error!("Error when resizing zone: {e:?}"))?; - Ok(()) - } else { - // Update VmConfig by setting the new desired ram. - let memory_config = &mut self.vm_config.as_ref().unwrap().lock().unwrap().memory; - - if let Some(zones) = &mut memory_config.zones { - for zone in zones.iter_mut() { - if zone.id == id { - zone.size = desired_ram; - return Ok(()); + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + vm.resize_zone(&id, desired_ram) + .inspect_err(|e| error!("Error when resizing zone: {e:?}"))?; + Ok(()) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by setting the new desired ram. 
+ let memory_config = &mut self.vm_config.as_ref().unwrap().lock().unwrap().memory; + + if let Some(zones) = &mut memory_config.zones { + for zone in zones.iter_mut() { + if zone.id == id { + zone.size = desired_ram; + return Ok(()); + } } } - } - error!("Could not find the memory zone {id} for the resize"); - Err(VmError::ResizeZone) + error!("Could not find the memory zone {id} for the resize"); + Err(VmError::ResizeZone) + } } } @@ -2010,18 +3071,22 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_device(device_cfg).inspect_err(|e| { - error!("Error when adding new device to the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.devices, device_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_device(device_cfg).inspect_err(|e| { + error!("Error when adding new device to the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.devices, device_cfg); + Ok(None) + } } } @@ -2038,35 +3103,45 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_user_device(device_cfg).inspect_err(|e| { - error!("Error when adding new user device to the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.user_devices, device_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_user_device(device_cfg).inspect_err(|e| { + error!("Error when adding new user device to the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. 
+ let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.user_devices, device_cfg); + Ok(None) + } } } fn vm_remove_device(&mut self, id: String) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - vm.remove_device(&id) - .inspect_err(|e| error!("Error when removing device from the VM: {e:?}"))?; - Ok(()) - } else if let Some(ref config) = self.vm_config { - let mut config = config.lock().unwrap(); - if config.remove_device(&id) { + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + vm.remove_device(&id) + .inspect_err(|e| error!("Error when removing device from the VM: {e:?}"))?; Ok(()) - } else { - Err(VmError::NoDeviceToRemove(id)) } - } else { - Err(VmError::VmNotCreated) + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + if let Some(ref config) = self.vm_config { + let mut config = config.lock().unwrap(); + if config.remove_device(&id) { + Ok(()) + } else { + Err(VmError::NoDeviceToRemove(id)) + } + } else { + Err(VmError::VmNotCreated) + } + } } } @@ -2080,18 +3155,22 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_disk(disk_cfg).inspect_err(|e| { - error!("Error when adding new disk to the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.disks, disk_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_disk(disk_cfg).inspect_err(|e| { + error!("Error when adding new disk to the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.disks, disk_cfg); + Ok(None) + } } } @@ -2105,18 +3184,22 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_fs(fs_cfg).inspect_err(|e| { - error!("Error when adding new fs to the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.fs, fs_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_fs(fs_cfg).inspect_err(|e| { + error!("Error when adding new fs to the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. 
+ let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.fs, fs_cfg); + Ok(None) + } } } @@ -2130,18 +3213,22 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_pmem(pmem_cfg).inspect_err(|e| { - error!("Error when adding new pmem device to the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.pmem, pmem_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_pmem(pmem_cfg).inspect_err(|e| { + error!("Error when adding new pmem device to the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.pmem, pmem_cfg); + Ok(None) + } } } @@ -2155,18 +3242,22 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_net(net_cfg).inspect_err(|e| { - error!("Error when adding new network device to the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.net, net_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_net(net_cfg).inspect_err(|e| { + error!("Error when adding new network device to the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.net, net_cfg); + Ok(None) + } } } @@ -2180,18 +3271,22 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_vdpa(vdpa_cfg).inspect_err(|e| { - error!("Error when adding new vDPA device to the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.vdpa, vdpa_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_vdpa(vdpa_cfg).inspect_err(|e| { + error!("Error when adding new vDPA device to the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. 
+ let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.vdpa, vdpa_cfg); + Ok(None) + } } } @@ -2210,47 +3305,53 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_vsock(vsock_cfg).inspect_err(|e| { - error!("Error when adding new vsock device to the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - config.vsock = Some(vsock_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_vsock(vsock_cfg).inspect_err(|e| { + error!("Error when adding new vsock device to the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + config.vsock = Some(vsock_cfg); + Ok(None) + } } } fn vm_counters(&mut self) -> result::Result>, VmError> { - if let Some(ref mut vm) = self.vm { - let info = vm.counters().inspect_err(|e| { - error!("Error when getting counters from the VM: {e:?}"); - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.counters().inspect_err(|e| { + error!("Error when getting counters from the VM: {e:?}"); + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => Err(VmError::VmNotRunning), } } fn vm_power_button(&mut self) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - vm.power_button() - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm.power_button(), + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => Err(VmError::VmNotRunning), } } fn vm_nmi(&mut self) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - vm.nmi() - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm.nmi(), + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => Err(VmError::VmNotRunning), } } @@ -2259,12 +3360,16 @@ impl RequestHandler for Vmm { receive_data_migration: VmReceiveMigrationData, ) -> result::Result<(), MigratableError> { info!( - "Receiving migration: receiver_url = {}", - receive_data_migration.receiver_url + "Receiving migration: receiver_url = {}, net_fds={:?}", + receive_data_migration.receiver_url, &receive_data_migration.net_fds ); + let mut listener = receive_migration_listener(&receive_data_migration)?; // Accept the connection and get the socket - let mut socket = Vmm::receive_migration_socket(&receive_data_migration.receiver_url)?; + let mut socket = listener.accept().map_err(|e| { + warn!("Failed to accept migration connection: {e}"); + MigratableError::MigrateReceive(anyhow!("Failed to accept migration connection: {e}")) + })?; let mut state = ReceiveMigrationState::Established; @@ -2273,6 +3378,7 @@ impl RequestHandler for Vmm { trace!("Command {:?} received", req.command()); let (response, new_state) = match 
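Each hotplug handler in this hunk repeats the same `Vmm`/`Migration`/`None` arms for the error cases. A small accessor could centralize that mapping; this is a hypothetical refactor sketch, not part of the patch, and it only helps the call sites that fail outright when no VM exists (handlers that fall back to editing `vm_config` would keep their `None` branch).

```rust
// Hypothetical helper, not in the patch: centralizes the ownership check
// that every hotplug handler currently spells out by hand.
impl Vmm {
    fn running_vm_mut(&mut self) -> Result<&mut Vm, VmError> {
        match self.vm {
            MaybeVmOwnership::Vmm(ref mut vm) => Ok(vm),
            MaybeVmOwnership::Migration => Err(VmError::VmMigrating),
            MaybeVmOwnership::None => Err(VmError::VmNotRunning),
        }
    }
}
```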
self.vm_receive_migration_step( + &listener, &mut socket, state, &req, @@ -2295,7 +3401,7 @@ impl RequestHandler for Vmm { } if let ReceiveMigrationState::Aborted = state { - self.vm = None; + self.vm = MaybeVmOwnership::None; self.vm_config = None; } @@ -2306,6 +3412,18 @@ impl RequestHandler for Vmm { &mut self, send_data_migration: VmSendMigrationData, ) -> result::Result<(), MigratableError> { + match self.vm { + MaybeVmOwnership::Vmm(_) => (), + MaybeVmOwnership::Migration => { + return Err(MigratableError::MigrateSend(anyhow!( + "There is already an ongoing migration" + ))); + } + MaybeVmOwnership::None => { + return Err(MigratableError::MigrateSend(anyhow!("VM is not running"))); + } + } + info!( "Sending migration: destination_url = {}, local = {}", send_data_migration.destination_url, send_data_migration.local @@ -2325,41 +3443,29 @@ impl RequestHandler for Vmm { ))); } - if let Some(vm) = self.vm.as_mut() { - Self::send_migration( - vm, - #[cfg(all(feature = "kvm", target_arch = "x86_64"))] - self.hypervisor.as_ref(), - &send_data_migration, - ) - .map_err(|migration_err| { - error!("Migration failed: {migration_err:?}"); + // Take VM ownership. This also means that API events can no longer + // change the VM (e.g. net device hotplug). + let vm = self.vm.take_vm_for_migration(); - // Stop logging dirty pages only for non-local migrations - if !send_data_migration.local - && let Err(e) = vm.stop_dirty_log() - { - return e; - } - - if vm.get_state().unwrap() == VmState::Paused - && let Err(e) = vm.resume() - { - return e; - } + // Start migration thread + let worker = MigrationWorker { + vm, + check_migration_evt: self.check_migration_evt.try_clone().unwrap(), + config: send_data_migration, + #[cfg(all(feature = "kvm", target_arch = "x86_64"))] + hypervisor: self.hypervisor.clone(), + }; - migration_err - })?; + self.migration_thread_handle = Some( + thread::Builder::new() + .name("migration".into()) + .spawn(move || worker.run()) + // For upstreaming, we should simply continue and return an + // error when this fails. For our PoC, this is fine. 
+ .unwrap(), + ); - // Shutdown the VM after the migration succeeded - self.exit_evt.write(1).map_err(|e| { - MigratableError::MigrateSend(anyhow!( - "Failed shutting down the VM after migration: {e:?}" - )) - }) - } else { - Err(MigratableError::MigrateSend(anyhow!("VM is not running"))) - } + Ok(()) } } @@ -2435,6 +3541,7 @@ mod unit_tests { rng: RngConfig { src: PathBuf::from("/dev/urandom"), iommu: false, + bdf_device: None, }, balloon: None, fs: None, @@ -2444,6 +3551,8 @@ mod unit_tests { mode: ConsoleOutputMode::Null, iommu: false, socket: None, + url: None, + bdf_device: None, }, console: ConsoleConfig { file: None, @@ -2451,6 +3560,8 @@ mod unit_tests { mode: ConsoleOutputMode::Off, iommu: false, socket: None, + url: None, + bdf_device: None, }, #[cfg(target_arch = "x86_64")] debug_console: DebugConsoleConfig::default(), diff --git a/vmm/src/memory_manager.rs b/vmm/src/memory_manager.rs index 8b0cdff6c1..c4fcceb8b3 100644 --- a/vmm/src/memory_manager.rs +++ b/vmm/src/memory_manager.rs @@ -25,9 +25,7 @@ use devices::ioapic; #[cfg(target_arch = "aarch64")] use hypervisor::HypervisorVmError; use libc::_SC_NPROCESSORS_ONLN; -#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] -use log::debug; -use log::{error, info, warn}; +use log::{debug, error, info, trace, warn}; use serde::{Deserialize, Serialize}; use thiserror::Error; use tracer::trace_scoped; @@ -41,7 +39,7 @@ use vm_memory::guest_memory::FileOffset; use vm_memory::mmap::MmapRegionError; use vm_memory::{ Address, Error as MmapError, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, - GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion, ReadVolatile, + GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion, }; use vm_migration::protocol::{MemoryRange, MemoryRangeTable}; use vm_migration::{ @@ -2137,47 +2135,6 @@ impl MemoryManager { debug!("coredump total bytes {total_bytes}"); Ok(()) } - - pub fn receive_memory_regions( - &mut self, - ranges: &MemoryRangeTable, - fd: &mut F, - ) -> std::result::Result<(), MigratableError> - where - F: ReadVolatile, - { - let guest_memory = self.guest_memory(); - let mem = guest_memory.memory(); - - for range in ranges.regions() { - let mut offset: u64 = 0; - // Here we are manually handling the retry in case we can't the - // whole region at once because we can't use the implementation - // from vm-memory::GuestMemory of read_exact_from() as it is not - // following the correct behavior. 
For more info about this issue - // see: https://github.com/rust-vmm/vm-memory/issues/174 - loop { - let bytes_read = mem - .read_volatile_from( - GuestAddress(range.gpa + offset), - fd, - (range.length - offset) as usize, - ) - .map_err(|e| { - MigratableError::MigrateReceive(anyhow!( - "Error receiving memory from socket: {e}" - )) - })?; - offset += bytes_read as u64; - - if offset == range.length { - break; - } - } - } - - Ok(()) - } } struct MemoryNotify { @@ -2648,8 +2605,8 @@ impl Migratable for MemoryManager { let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa)) { Some(region) => { - assert!(region.start_addr().raw_value() == r.gpa); - assert!(region.len() == r.size); + assert_eq!(region.start_addr().raw_value(), r.gpa); + assert_eq!(region.len(), r.size); (**region).bitmap().get_and_reset() } None => { @@ -2668,11 +2625,11 @@ impl Migratable for MemoryManager { let sub_table = MemoryRangeTable::from_dirty_bitmap(dirty_bitmap, r.gpa, 4096); if sub_table.regions().is_empty() { - info!("Dirty Memory Range Table is empty"); + debug!("Dirty Memory Range Table is empty"); } else { - info!("Dirty Memory Range Table:"); + debug!("Dirty Memory Range Table:"); for range in sub_table.regions() { - info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024); + trace!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024); } } diff --git a/vmm/src/pci_segment.rs b/vmm/src/pci_segment.rs index b334ddb5d6..462d6b7111 100644 --- a/vmm/src/pci_segment.rs +++ b/vmm/src/pci_segment.rs @@ -164,15 +164,22 @@ impl PciSegment { ) } - pub(crate) fn next_device_bdf(&self) -> DeviceManagerResult { + /// Allocates a device BDF on this PCI segment. + /// + /// - `device_id`: Device ID to request for BDF allocation + /// + /// ## Errors + /// * [`DeviceManagerError::AllocatePciDeviceId`] if device ID + /// allocation on the bus fails. + pub(crate) fn allocate_device_bdf(&self, device_id: Option) -> DeviceManagerResult { Ok(PciBdf::new( self.id, 0, self.pci_bus .lock() .unwrap() - .next_device_id() - .map_err(DeviceManagerError::NextPciDeviceId)? as u8, + .allocate_device_id(device_id) + .map_err(DeviceManagerError::AllocatePciDeviceId)? as u8, 0, )) } @@ -202,6 +209,65 @@ impl PciSegment { Ok(()) } + + #[cfg(test)] + /// Creates a PciSegment without the need for an [`AddressManager`] + /// for testing purpose. + /// + /// An [`AddressManager`] would otherwise be required to create + /// [`PciBus`] instances. Instead, we use any struct that implements + /// [`DeviceRelocation`] to instantiate a [`PciBus`]. 
+ pub(crate) fn new_without_address_manager( + id: u16, + numa_node: u32, + mem32_allocator: Arc>, + mem64_allocator: Arc>, + pci_irq_slots: &[u8; 32], + device_reloc: Arc, + ) -> DeviceManagerResult { + let pci_root = PciRoot::new(None); + let pci_bus = Arc::new(Mutex::new(PciBus::new(pci_root, device_reloc))); + + let pci_config_mmio = Arc::new(Mutex::new(PciConfigMmio::new(Arc::clone(&pci_bus)))); + let mmio_config_address = + layout::PCI_MMCONFIG_START.0 + layout::PCI_MMIO_CONFIG_SIZE_PER_SEGMENT * id as u64; + + let start_of_mem32_area = mem32_allocator.lock().unwrap().base().0; + let end_of_mem32_area = mem32_allocator.lock().unwrap().end().0; + + let start_of_mem64_area = mem64_allocator.lock().unwrap().base().0; + let end_of_mem64_area = mem64_allocator.lock().unwrap().end().0; + + let segment = PciSegment { + id, + pci_bus, + pci_config_mmio, + mmio_config_address, + proximity_domain: numa_node, + pci_devices_up: 0, + pci_devices_down: 0, + #[cfg(target_arch = "x86_64")] + pci_config_io: None, + mem32_allocator, + mem64_allocator, + start_of_mem32_area, + end_of_mem32_area, + start_of_mem64_area, + end_of_mem64_area, + pci_irq_slots: *pci_irq_slots, + }; + + info!( + "Adding PCI segment: id={}, PCI MMIO config address: 0x{:x}, mem32 area [0x{:x}-0x{:x}, mem64 area [0x{:x}-0x{:x}", + segment.id, + segment.mmio_config_address, + segment.start_of_mem32_area, + segment.end_of_mem32_area, + segment.start_of_mem64_area, + segment.end_of_mem64_area + ); + Ok(segment) + } } struct PciDevSlot { @@ -474,3 +540,96 @@ impl Aml for PciSegment { .to_aml_bytes(sink); } } + +#[cfg(test)] +mod unit_tests { + use std::result::Result; + + use vm_memory::GuestAddress; + + use super::*; + + #[derive(Debug)] + struct MocRelocDevice; + impl DeviceRelocation for MocRelocDevice { + fn move_bar( + &self, + _old_base: u64, + _new_base: u64, + _len: u64, + _pci_dev: &mut dyn pci::PciDevice, + _region_type: pci::PciBarRegionType, + ) -> Result<(), std::io::Error> { + Ok(()) + } + } + + fn setup() -> PciSegment { + let guest_addr = 0_u64; + let guest_size = 0x1000_usize; + let allocator_1 = Arc::new(Mutex::new( + AddressAllocator::new(GuestAddress(guest_addr), guest_size as u64).unwrap(), + )); + let allocator_2 = Arc::new(Mutex::new( + AddressAllocator::new(GuestAddress(guest_addr), guest_size as u64).unwrap(), + )); + let moc_device_reloc = Arc::new(MocRelocDevice {}); + let arr = [0_u8; 32]; + + PciSegment::new_without_address_manager( + 0, + 0, + allocator_1, + allocator_2, + &arr, + moc_device_reloc, + ) + .unwrap() + } + + #[test] + // Test the default bdf for a segment with an empty bus (except for the root device) + fn allocate_device_bdf_default() { + // The first address is occupied by the root + let segment = setup(); + let bdf = segment.allocate_device_bdf(None).unwrap(); + assert_eq!(bdf.segment(), segment.id); + assert_eq!(bdf.bus(), 0); + assert_eq!(bdf.device(), 1); + assert_eq!(bdf.function(), 0); + } + + #[test] + // Test to acquire a bdf with s specific device ID + fn allocate_device_bdf_fixed_device_id() { + // The first address is occupied by the root + let expect_device_id = 0x10_u8; + let segment = setup(); + let bdf = segment.allocate_device_bdf(Some(expect_device_id)).unwrap(); + assert_eq!(bdf.segment(), segment.id); + assert_eq!(bdf.bus(), 0); + assert_eq!(bdf.device(), expect_device_id); + assert_eq!(bdf.function(), 0); + } + + #[test] + // Test to acquire a bdf with invalid device id, one already + // taken and the other being greater then the number of allowed + // devices per 
bus. + fn allocate_device_bdf_invalid_device_id() { + // The first address is occupied by the root + let already_taken_device_id = 0x0_u8; + let overflow_device_id = 0xff_u8; + let segment = setup(); + let bdf_res = segment.allocate_device_bdf(Some(already_taken_device_id)); + assert!(matches!( + bdf_res, + Err(DeviceManagerError::AllocatePciDeviceId(_)) + )); + let bdf_res = segment.allocate_device_bdf(Some(overflow_device_id)); + assert!(matches!( + bdf_res, + Err(DeviceManagerError::AllocatePciDeviceId(_)) + )); + } +} diff --git a/vmm/src/seccomp_filters.rs b/vmm/src/seccomp_filters.rs index 8c0071fc74..e30f2fab70 100644 --- a/vmm/src/seccomp_filters.rs +++ b/vmm/src/seccomp_filters.rs @@ -865,6 +865,8 @@ fn http_api_thread_rules() -> Result)>, BackendError> (libc::SYS_write, vec![]), (libc::SYS_rt_sigprocmask, vec![]), (libc::SYS_getcwd, vec![]), + (libc::SYS_clock_nanosleep, vec![]), + (libc::SYS_read, vec![]), ]) } diff --git a/vmm/src/serial_manager.rs b/vmm/src/serial_manager.rs index 5f8de1874a..8c48ba73a2 100644 --- a/vmm/src/serial_manager.rs +++ b/vmm/src/serial_manager.rs @@ -3,9 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 // +use std::any::TypeId; +use std::collections::HashMap; use std::fs::File; -use std::io::Read; -use std::net::Shutdown; +use std::io::{Read, Write}; +use std::net::{Shutdown, TcpStream}; +use std::os::fd::IntoRawFd; use std::os::unix::io::{AsRawFd, FromRawFd}; use std::os::unix::net::UnixStream; use std::panic::AssertUnwindSafe; @@ -68,9 +71,9 @@ pub enum Error { #[error("Error accepting connection")] AcceptConnection(#[source] io::Error), - /// Cannot clone the UnixStream - #[error("Error cloning UnixStream")] - CloneUnixStream(#[source] io::Error), + /// Cannot clone the Stream + #[error("Error cloning Stream")] + CloneStream(#[source] io::Error), /// Cannot shutdown the connection #[error("Error shutting down a connection")] @@ -92,9 +95,10 @@ pub enum EpollDispatch { File = 0, Kill = 1, Socket = 2, + Tcp = 3, Unknown, } -const EPOLL_EVENTS_LEN: usize = 4; +const EPOLL_EVENTS_LEN: usize = 5; impl From for EpollDispatch { fn from(v: u64) -> Self { @@ -103,11 +107,64 @@ impl From for EpollDispatch { 0 => File, 1 => Kill, 2 => Socket, + 3 => Tcp, _ => Unknown, } } } +/// A thread-safe writer that fans out to multiple keyed writers. Allows for +/// bundling different kinds of writers for the serial device, e.g. writing to +/// a TCP socket and a file. 
+#[derive(Clone)] +pub struct FanoutWriter { + writers: Arc>>>, +} + +impl FanoutWriter { + pub fn new() -> Self { + FanoutWriter { + writers: Arc::new(Mutex::new(HashMap::new())), + } + } + + pub fn add_writer(&self, writer: W) { + let mut writers = self.writers.lock().unwrap(); + writers.insert(TypeId::of::(), Box::new(writer)); + } + + pub fn remove_writer(&self, id: TypeId) -> Option> { + let mut writers = self.writers.lock().unwrap(); + writers.remove(&id) + } +} + +impl Write for FanoutWriter { + fn write(&mut self, buf: &[u8]) -> io::Result { + let mut writers = self.writers.lock().unwrap(); + let mut result: io::Result = Ok(buf.len()); + + for (i, w) in writers.values_mut().enumerate() { + let r = w.write(buf); + if i == 0 { + result = r; + } else if let Err(e) = r { + return Err(e); + } + } + + result + } + + fn flush(&mut self) -> io::Result<()> { + let mut writers = self.writers.lock().unwrap(); + for w in writers.values_mut() { + w.flush()?; + } + Ok(()) + } +} + pub struct SerialManager { #[cfg(any(target_arch = "x86_64", target_arch = "riscv64"))] serial: Arc>, @@ -166,6 +223,7 @@ impl SerialManager { } fd.as_raw_fd() } + ConsoleOutput::Tcp(ref fd, _) => fd.as_raw_fd(), _ => return Ok(None), }; @@ -180,10 +238,14 @@ impl SerialManager { ) .map_err(Error::Epoll)?; - let epoll_fd_data = if let ConsoleOutput::Socket(_) = output { - EpollDispatch::Socket - } else { - EpollDispatch::File + let epoll_fd_data = match output { + ConsoleOutput::File(_) => EpollDispatch::File, + ConsoleOutput::Pty(_) => EpollDispatch::File, + ConsoleOutput::Tty(_) => EpollDispatch::File, + ConsoleOutput::Null => EpollDispatch::File, + ConsoleOutput::Off => EpollDispatch::File, + ConsoleOutput::Socket(_) => EpollDispatch::Socket, + ConsoleOutput::Tcp(_, _) => EpollDispatch::Tcp, }; epoll::ctl( @@ -260,6 +322,7 @@ impl SerialManager { let serial = self.serial.clone(); let pty_write_out = self.pty_write_out.clone(); let mut reader: Option = None; + let mut reader_tcp: Option = None; // In case of PTY, we want to be able to detect a connection on the // other end of the PTY. This is done by detecting there's no event @@ -273,6 +336,17 @@ impl SerialManager { .name("serial-manager".to_string()) .spawn(move || { std::panic::catch_unwind(AssertUnwindSafe(move || { + let write_distributor = FanoutWriter::new(); + + if let ConsoleOutput::Tcp(_, Some(f)) = &in_file { + write_distributor.add_writer(f.clone()); + serial + .as_ref() + .lock() + .unwrap() + .set_out(Some(Box::new(write_distributor.clone()))); + } + let mut events = [epoll::Event::new(epoll::Events::empty(), 0); EPOLL_EVENTS_LEN]; @@ -328,7 +402,7 @@ impl SerialManager { let (unix_stream, _) = listener.accept().map_err(Error::AcceptConnection)?; let writer = - unix_stream.try_clone().map_err(Error::CloneUnixStream)?; + unix_stream.try_clone().map_err(Error::CloneStream)?; epoll::ctl( epoll_fd, @@ -344,6 +418,41 @@ impl SerialManager { reader = Some(unix_stream); serial.lock().unwrap().set_out(Some(Box::new(writer))); } + EpollDispatch::Tcp => { + // New connection request arrived. + // Shutdown the previous connection, if any + if let Some(ref previous_reader) = reader_tcp { + previous_reader + .shutdown(Shutdown::Both) + .map_err(Error::AcceptConnection)?; + write_distributor.remove_writer(TypeId::of::()); + } + + let ConsoleOutput::Tcp(ref listener, _) = in_file else { + unreachable!(); + }; + + // Events on the listening socket will be connection requests. + // Accept them, create a reader and a writer. 
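The `FanoutWriter` introduced in this hunk keys writers by `TypeId`, so at most one writer per concrete type can be attached or detached at runtime. A rough usage sketch follows; it assumes `add_writer` is generic over `W: Write + Send + 'static`, which the `TypeId`-keyed map of boxed writers implies but which is not fully visible in the collapsed diff text.

```rust
use std::any::TypeId;
use std::fs::File;
use std::io::Write;

// Sketch only: `FanoutWriter` comes from the serial_manager module above.
fn demo(fanout: &mut FanoutWriter) -> std::io::Result<()> {
    // Attach a log file and stdout; each concrete type occupies one slot.
    fanout.add_writer(File::create("/tmp/serial.log")?);
    fanout.add_writer(std::io::stdout());

    // A single write is fanned out to every attached writer.
    fanout.write_all(b"guest serial output\n")?;
    fanout.flush()?;

    // Detach the file again, e.g. when a console consumer goes away.
    let _ = fanout.remove_writer(TypeId::of::<File>());
    Ok(())
}
```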
+ let (tcp_stream, _) = + listener.accept().map_err(Error::AcceptConnection)?; + let writer = + tcp_stream.try_clone().map_err(Error::CloneStream)?; + reader_tcp = + Some(tcp_stream.try_clone().map_err(Error::CloneStream)?); + + epoll::ctl( + epoll_fd, + epoll::ControlOptions::EPOLL_CTL_ADD, + tcp_stream.into_raw_fd(), + epoll::Event::new( + epoll::Events::EPOLLIN, + EpollDispatch::File as u64, + ), + ) + .map_err(Error::Epoll)?; + write_distributor.add_writer(writer); + } EpollDispatch::File => { if event.events & libc::EPOLLIN as u32 != 0 { let mut input = [0u8; 64]; @@ -370,6 +479,27 @@ impl SerialManager { 0 } } + ConsoleOutput::Tcp(_, _) => { + if let Some(mut serial_reader) = reader_tcp.as_ref() + { + let count = serial_reader + .read(&mut input) + .map_err(Error::ReadInput)?; + if count == 0 { + info!("Remote end closed serial socket"); + serial_reader + .shutdown(Shutdown::Both) + .map_err(Error::ShutdownConnection)?; + reader_tcp = None; + write_distributor.remove_writer( + TypeId::of::(), + ); + } + count + } else { + 0 + } + } ConsoleOutput::Pty(file) | ConsoleOutput::Tty(file) => { (&**file) .read(&mut input) diff --git a/vmm/src/vcpu_throttling.rs b/vmm/src/vcpu_throttling.rs new file mode 100644 index 0000000000..e8fd0d3b12 --- /dev/null +++ b/vmm/src/vcpu_throttling.rs @@ -0,0 +1,605 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 + +//! # vCPU throttling for Auto Converging +//! +//! vCPU throttling is crucial to reach a reasonable downtime when using a +//! precopy strategy for live-migration of VMs with memory-intensive workloads. +//! Auto converge means an increasing vCPU throttling over time until the memory +//! delta is small enough for the migration thread(s) to perform the switch-over +//! to the new host. +//! +//! Therefore, the migration thread(s) use this thread to help them reach their +//! goal. Next to typical lifecycle management, this thread must fulfill various +//! requirements to ensure a minimal downtime. +//! +//! ## Thread Requirements +//! - Needs to be able to gracefully wait for work. +//! - Must be able to exit gracefully. +//! - Must be able to cancel any work and return to its init state to support +//! live-migration cancellation and restart of live-migrations. +//! - Must not block the migration thread(s) whenever possible, to facilitate +//! fast live-migrations with short downtimes. +//! - Must be interruptible during a sleep phase to not block the migration +//! thread(s). +//! - Must not confuse or hinder the migration thread(s) regarding +//! pause()/resume() operations. Context: migration thread shuts down the +//! vCPUs for the handover. The throttle thread must not restart the vCPUs +//! again. + +use std::cell::Cell; +use std::cmp::min; +use std::sync::mpsc::RecvTimeoutError; +use std::sync::{Arc, Mutex, mpsc}; +use std::thread; +use std::thread::JoinHandle; +use std::time::{Duration, Instant}; + +use log::{debug, warn}; +use vm_migration::Pausable; + +use crate::cpu::CpuManager; + +/// The possible command of the thread, i.e., the current state. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +enum ThrottleCommand { + /// Waiting for next event. + Waiting, + /// Ongoing vCPU throttling. + /// + /// The inner value shows the current throttling percentage in range `1..=99`. + Throttling(u8 /* `1..=99` */), + /// Thread is shutting down gracefully. + Exiting, +} + +/// Helper to adapt the throttling timeslice as we go, depending on the time it +/// takes to pause() and resume() all vCPUs. 
+#[derive(Debug)] +struct TimesliceContext { + current_timeslice: Duration, + /// Duration it took to pause() all vCPUs on the previous iteration. + previous_pause_duration: Duration, + /// Duration it took to resume() all vCPUs on the previous iteration. + previous_resume_duration: Duration, +} + +impl TimesliceContext { + /// The initial timeslice for a throttling cycle (vCPU pause & resume). + const INITIAL_TIMESLICE: Duration = Duration::from_millis(100); + + /// The minimal value recorded for the measured pause() and resume() durations. + /// + /// Any value smaller than this is upgraded to this to prevent math + /// exceptions during timing calculations. + const MIN_DURATION: Duration = Duration::from_millis(1); + + /// Maximum time slice. This should not be too big. + /// + /// Otherwise, for example: assuming we have 10% throttling and a + /// 2000ms time slice, the VM will be unresponsive for 200ms out of + /// every 2000ms cycle. This is not convenient. + const MAX_TIMESLICE: Duration = Duration::from_millis(800); + + /// Creates a new instance with [`Self::INITIAL_TIMESLICE`]. + fn new() -> Self { + Self { + current_timeslice: Self::INITIAL_TIMESLICE, + previous_pause_duration: Self::MIN_DURATION, + previous_resume_duration: Self::MIN_DURATION, + } + } + + /// Updates the timeslice. + fn update_timeslice(&mut self) { + // CpuManager::pause() plus CpuManager::resume() without additional delay is the shortest + // we can get. + let one_percent = self.previous_pause_duration + self.previous_resume_duration; + self.current_timeslice = one_percent * 100; + self.current_timeslice = min(self.current_timeslice, Self::MAX_TIMESLICE); + } + + /// Calculates the sleep durations for after the `pause()` and `resume()` operations with + /// the current `timeslice`. + /// + /// It uses the `timeslice` that was calculated on the previous + /// invocation of [`Self::update_timeslice`]. + fn calc_sleep_durations( + &mut self, + percentage: u64, + ) -> ( + Duration, /* after pause */ + Duration, /* after resume */ + ) { + assert!(percentage <= 100); + assert!(percentage > 0); + + let timeslice_ms = self.current_timeslice.as_millis() as u64; + let wait_ms_after_pause_ms = timeslice_ms * percentage / 100; + let wait_ms_after_resume_ms = timeslice_ms - wait_ms_after_pause_ms; + + let wait_ms_after_pause_ms = + wait_ms_after_pause_ms.saturating_sub(self.previous_pause_duration.as_millis() as u64); + let wait_ms_after_resume_ms = wait_ms_after_resume_ms + .saturating_sub(self.previous_resume_duration.as_millis() as u64); + + ( + Duration::from_millis(wait_ms_after_pause_ms), + Duration::from_millis(wait_ms_after_resume_ms), + ) + } + + /// Set the previous pause duration. + /// + /// In case this is below [`Self::MIN_DURATION`], we upgrade it to [`Self::MIN_DURATION`]. + pub fn set_previous_pause_duration(&mut self, mut duration: Duration) { + if duration < Self::MIN_DURATION { + duration = Self::MIN_DURATION; + } + + self.previous_pause_duration = duration; + } + + /// Set the duration it took to `resume()` all vCPUs on the previous iteration. + /// + /// In case this is below [`Self::MIN_DURATION`], we upgrade it to [`Self::MIN_DURATION`]. + pub fn set_previous_resume_duration(&mut self, mut duration: Duration) { + if duration < Self::MIN_DURATION { + duration = Self::MIN_DURATION; + } + self.previous_resume_duration = duration; + } +} + +/// Context of the vCPU throttle thread. +// The main justification for this dedicated type is to split the thread +// functions from the higher-level control API. 
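A worked example of the timeslice math above may help. With the initial 100 ms timeslice and a 30% throttle, roughly 30 ms of each cycle is spent paused and 70 ms running, each reduced by the measured cost of the pause()/resume() calls themselves; `update_timeslice` then rescales the slice so that one percent of throttle corresponds to one pause-plus-resume overhead, capped at 800 ms. The numbers below are illustrative, mirroring `calc_sleep_durations()` and `update_timeslice()`.

```rust
fn main() {
    let timeslice_ms: u64 = 100; // INITIAL_TIMESLICE
    let throttle_pct: u64 = 30;

    let pause_cost_ms: u64 = 2;  // measured duration of CpuManager::pause()
    let resume_cost_ms: u64 = 3; // measured duration of CpuManager::resume()

    // Raw split of the timeslice between paused and running phases.
    let sleep_after_pause = timeslice_ms * throttle_pct / 100; // 30 ms paused
    let sleep_after_resume = timeslice_ms - sleep_after_pause; // 70 ms running

    // The pause()/resume() calls already consume part of each phase.
    let sleep_after_pause = sleep_after_pause.saturating_sub(pause_cost_ms);    // 28 ms
    let sleep_after_resume = sleep_after_resume.saturating_sub(resume_cost_ms); // 67 ms

    // Next cycle: one "percent" equals one pause+resume overhead, capped at 800 ms.
    let next_timeslice = ((pause_cost_ms + resume_cost_ms) * 100).min(800);     // 500 ms

    assert_eq!((sleep_after_pause, sleep_after_resume, next_timeslice), (28, 67, 500));
}
```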
+// TODO seccomp is missing +pub struct ThrottleWorker { + handle: Option>, +} + +impl ThrottleWorker { + /// This should not be named "vcpu*" as libvirt fails when + /// iterating the vCPU threads then. Fix this first in libvirt! + const THREAD_NAME: &'static str = "throttle-vcpu"; + + /// Executes the provided callback and goes to sleep until the specified + /// `sleep_duration` passed. + /// + /// The time to execute the callback itself is not taken into account + /// when sleeping for `sleep_duration`. Therefore, the callback is + /// supposed to be quick (a couple of milliseconds). + /// + /// The thread is interruptible during the sleep phase when the `receiver` + /// receives a new [`ThrottleCommand`]. + /// + /// # Arguments + /// - `callback`: Function to run + /// - `set_callback_duration`: Set the duration to execute the callback. + /// - `sleep_duration`: Duration this function takes at most, including + /// running the `callback`. + /// - `receiver`: Receiving end of the channel to the migration managing + /// thread. + fn execute_and_wait_interruptible( + callback: &impl Fn(), + mut set_callback_duration: impl FnMut(Duration), + sleep_duration: Duration, + receiver: &mpsc::Receiver, + ) -> Option { + let begin = Instant::now(); + callback(); + let cb_duration = begin.elapsed(); + // Help to adjust the timeslice in the next cycle. + set_callback_duration(cb_duration); + + // It might happen that sometimes we get interrupted during a sleep phase + // with a new higher throttle percentage but this is negligible. For an + // auto-converge cycle, there are typically only ~10 steps involved over + // a time frame from a couple of seconds up to a couple of minutes. + match receiver.recv_timeout(sleep_duration) { + Ok(next_task) => Some(next_task), + Err(RecvTimeoutError::Timeout) => None, + Err(RecvTimeoutError::Disconnected) => { + panic!("thread and channel should exit gracefully") + } + } + } + + /// Executes one throttling step: either pause or resume of vCPUs. + /// + /// Runs the given callback, then waits for the specified duration, unless + /// interrupted by a new [`ThrottleCommand`]. + /// + /// # Behavior + /// - Runs the provided `callback` immediately. + /// - Waits up to `duration` for new commands on the `receiver`. + /// - If no command arrives before the timeout, this step completes + /// normally and returns `None`. + /// - If a [`ThrottleCommand::Throttling`] arrives, updates the current + /// throttle percentage in `current_throttle` and continues with the + /// loop. Returns `None`. + /// - If a [`ThrottleCommand::Waiting`] or [`ThrottleCommand::Exiting`] + /// arrives, this command is forwarded to the caller. + /// + /// # Arguments + /// - `callback`: Function to run (e.g., pause or resume vCPUs). + /// - `set_callback_duration`: Set the duration to execute the callback. + /// - `receiver`: Channel for receiving new [`ThrottleCommand`]s. + /// - `current_throttle`: Mutable reference to the current throttle + /// percentage (updated on [`ThrottleCommand::Throttling`]). + /// + /// # Returns + /// - `None` if the throttling cycle should continue. + /// - `Some(ThrottleCommand::Waiting | ThrottleCommand::Exiting)` if + /// throttling should stop. 
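The interruptible wait used by `execute_and_wait_interruptible` above is built on `mpsc::Receiver::recv_timeout`: the sleep window doubles as the point where new commands are picked up, so the worker never blocks longer than one timeslice. A minimal, standalone sketch of that pattern (types simplified, not the patch's actual worker):

```rust
use std::sync::mpsc::{self, RecvTimeoutError};
use std::thread;
use std::time::Duration;

#[derive(Debug)]
enum Cmd {
    SetPercent(u8),
    Stop,
}

fn main() {
    let (tx, rx) = mpsc::channel::<Cmd>();

    let worker = thread::spawn(move || {
        loop {
            // Sleep for the throttle window, but wake early if a command arrives.
            match rx.recv_timeout(Duration::from_millis(50)) {
                Ok(Cmd::SetPercent(p)) => println!("new throttle: {p}%"),
                Ok(Cmd::Stop) => break,
                Err(RecvTimeoutError::Timeout) => { /* window elapsed, run next cycle */ }
                Err(RecvTimeoutError::Disconnected) => break,
            }
        }
    });

    tx.send(Cmd::SetPercent(30)).unwrap();
    tx.send(Cmd::Stop).unwrap();
    worker.join().unwrap();
}
```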
+ fn throttle_step( + callback: &F, + set_callback_duration: impl FnMut(Duration), + duration: Duration, + receiver: &mpsc::Receiver, + current_throttle: &mut u64, + ) -> Option + where + F: Fn(), + { + let maybe_task = Self::execute_and_wait_interruptible( + callback, + set_callback_duration, + duration, + receiver, + ); + match maybe_task { + None => None, + Some(ThrottleCommand::Throttling(next)) => { + // A new throttle value is only applied at the end of a full + // throttling cycle. This is fine and negligible in a series of + // (tens of) thousands of cycles. + *current_throttle = next as u64; + None + } + Some(cmd @ (ThrottleCommand::Exiting | ThrottleCommand::Waiting)) => Some(cmd), + } + } + + /// Helper for [`Self::control_loop`] that runs the actual throttling loop. + /// + /// This function returns the next [`ThrottleCommand`] **only** if the thread + /// stopped the vCPU throttling. + fn throttle_loop( + receiver: &mpsc::Receiver, + initial_throttle: u8, + callback_pause_vcpus: &impl Fn(), + callback_resume_vcpus: &impl Fn(), + ) -> ThrottleCommand { + // The current throttle value, as long as the thread is throttling. + let mut current_throttle = initial_throttle as u64; + let mut timeslice_ctx = TimesliceContext::new(); + + loop { + // Catch logic bug: We should have exited in this case already. + assert_ne!(current_throttle, 0); + assert!(current_throttle < 100); + + let (wait_ms_after_pause, wait_ms_after_resume) = + timeslice_ctx.calc_sleep_durations(current_throttle); + + // pause vCPUs + if let Some(cmd) = Self::throttle_step( + callback_pause_vcpus, + |new_duration| timeslice_ctx.set_previous_pause_duration(new_duration), + wait_ms_after_pause, + receiver, + &mut current_throttle, + ) { + // TODO: future optimization + // Prevent unnecessary resume() here when the migration thread + // performs .pause() right after anyway. We could make .pause() and + // .resume() idempotent. + callback_resume_vcpus(); + // We only exit here in case if ThrottleCommand::Waiting or ::Exiting + return cmd; + } + + // resume vCPUs + if let Some(cmd) = Self::throttle_step( + callback_resume_vcpus, + |new_duration| timeslice_ctx.set_previous_resume_duration(new_duration), + wait_ms_after_resume, + receiver, + &mut current_throttle, + ) { + // We only exit here in case if ThrottleCommand::Waiting or ::Exiting + return cmd; + } + + // Update timeslice for next cycle. This way, we can closely match the expected + // percentage for pause() and resume(). + timeslice_ctx.update_timeslice(); + } + } + + /// Implements the control loop of the thread. + /// + /// It wraps the actual throttling with the necessary thread lifecycle + /// management. + fn control_loop( + receiver: mpsc::Receiver, + callback_pause_vcpus: impl Fn() + Send + 'static, + callback_resume_vcpus: impl Fn() + Send + 'static, + ) -> impl Fn() { + move || { + // In the outer loop, we gracefully wait for commands. + 'control: loop { + let thread_task = receiver.recv().expect("channel should not be closed"); + match thread_task { + ThrottleCommand::Exiting => { + break 'control; + } + ThrottleCommand::Waiting => { + continue 'control; + } + ThrottleCommand::Throttling(initial_throttle) => { + let next_task = Self::throttle_loop( + &receiver, + initial_throttle, + &callback_pause_vcpus, + &callback_resume_vcpus, + ); + if next_task == ThrottleCommand::Exiting { + break 'control; + } + // else: thread is in Waiting state + } + } + } + debug!("thread exited gracefully"); + } + } + + /// Spawns a new thread. 
+ fn spawn( + receiver: mpsc::Receiver<ThrottleCommand>, + callback_pause_vcpus: impl Fn() + Send + 'static, + callback_resume_vcpus: impl Fn() + Send + 'static, + ) -> Self { + let handle = { + let thread_fn = + Self::control_loop(receiver, callback_pause_vcpus, callback_resume_vcpus); + thread::Builder::new() + .name(String::from(Self::THREAD_NAME)) + .spawn(thread_fn) + .expect("should spawn thread") + }; + + Self { + handle: Some(handle), + } + } +} + +impl Drop for ThrottleWorker { + fn drop(&mut self) { + // Note: The thread handle must send the shutdown command first. + if let Some(handle) = self.handle.take() { + handle.join().expect("thread should have succeeded"); + } + } +} + +/// Handler for controlling the vCPU throttle thread. +/// +/// vCPU throttling is needed for live-migration of memory-intensive workloads. +/// The current design assumes that all vCPUs are throttled equally. +/// +/// # Transitions +/// - `Waiting` -> `Throttling(x %)`, `Exit` +/// - `Throttling(x %)` -> `Exit`, `Waiting`, `Throttling(y %)` +/// - `Exiting` +pub struct ThrottleThreadHandle { + /// Thread state wrapped by synchronization primitives. + state_sender: mpsc::Sender<ThrottleCommand>, + /// Current throttle value. + /// + /// This is the last throttle value that was sent to the + /// thread. + current_throttle: Cell<u8>, + /// The underlying thread handle. Option to have more control over when it is dropped. + throttle_thread: Option<ThrottleWorker>, +} + +impl ThrottleThreadHandle { + /// Spawns a new thread and returns a handle to it. + /// + /// # Parameters + /// - `cpu_manager`: CPU manager to pause and resume vCPUs + pub fn new_from_cpu_manager(cpu_manager: &Arc<Mutex<CpuManager>>) -> Self { + let callback_pause_vcpus = { + let cpu_manager = cpu_manager.clone(); + Box::new(move || cpu_manager.lock().unwrap().pause().unwrap()) + }; + + let callback_resume_vcpus = { + let cpu_manager = cpu_manager.clone(); + Box::new(move || cpu_manager.lock().unwrap().resume().unwrap()) + }; + + Self::new(callback_pause_vcpus, callback_resume_vcpus) + } + + /// Spawns a new thread and returns a handle to it. + /// + /// This function returns once the thread has gracefully arrived in + /// [`ThrottleCommand::Waiting`]. + /// + /// # Parameters + /// - `callback_pause_vcpus`: Function putting all vCPUs into pause state. The + /// function must not perform any artificial delay itself. + /// - `callback_resume_vcpus`: Function putting all vCPUs back into running + /// state. The function must not perform any artificial delay itself. + fn new( + callback_pause_vcpus: Box<dyn Fn() + Send + 'static>, + callback_resume_vcpus: Box<dyn Fn() + Send + 'static>, + ) -> Self { + // Channel used for synchronization. + let (sender, receiver) = mpsc::channel::<ThrottleCommand>(); + + let thread = ThrottleWorker::spawn(receiver, callback_pause_vcpus, callback_resume_vcpus); + + Self { + state_sender: sender, + current_throttle: Cell::new(0), + throttle_thread: Some(thread), + } + } + + /// Sets the throttle percentage to a value in range `0..=99` and updates + /// the thread's state. + /// + /// Setting the value back to `0` equals setting the thread back into + /// [`ThrottleCommand::Waiting`]. + /// + /// In case of an ongoing throttling cycle (vCPU pause & resume), any new + /// throttling percentage will be applied no later than when the current cycle + /// ends. + /// + /// # Panic + /// Panics if `percent_new` is not in range `0..=99`. 
+ pub fn set_throttle_percent(&self, percent_new: u8) { + assert!( + percent_new <= 100, + "setting a percentage of 100 or above is not allowed: {percent_new}%" + ); + + // We have no problematic race condition here as in normal operation + // there is exactly one thread calling these functions. + let percent_old = self.throttle_percent(); + + // Return early, no action needed. + if percent_old == percent_new { + return; + } + + if percent_new == 0 { + self.state_sender + .send(ThrottleCommand::Waiting) + .expect("channel should not be closed"); + } else { + self.state_sender + .send(ThrottleCommand::Throttling(percent_new)) + .expect("channel should not be closed"); + } + + self.current_throttle.set(percent_new); + } + + /// Get the current throttle percentage in range `0..=99`. + /// + /// Please note that the value is not synchronized. + pub fn throttle_percent(&self) -> u8 { + self.current_throttle.get() + } + + /// Stops and terminates the thread gracefully. + /// + /// Waits for the thread to finish. This function **must** be called before + /// the migration thread(s) do anything with the CPU manager to prevent + /// odd states. + pub fn shutdown(&mut self) { + let begin = Instant::now(); + + { + // drop thread; ensure that the channel is still alive when it is dropped + if let Some(worker) = self.throttle_thread.take() { + self.state_sender + .send(ThrottleCommand::Exiting) + .expect("channel should not be closed"); + + // Ensure the sender is still living when this is dropped. + drop(worker); + } + } + + let elapsed = begin.elapsed(); + if elapsed > Duration::from_millis(20) { + warn!( + "shutting down thread takes too long ({} ms): this increases the downtime!", + elapsed.as_millis() + ); + } + } +} + +impl Drop for ThrottleThreadHandle { + fn drop(&mut self) { + self.shutdown(); + } +} + +#[cfg(test)] +mod tests { + use std::sync::atomic::{AtomicBool, Ordering}; + use std::thread::sleep; + + use super::*; + + // The test is successful if it does not get stuck. Then, the thread exits + // gracefully. + #[test] + fn test_vcpu_throttling_thread_lifecycle() { + for _ in 0..5 { + // State transitions: Waiting -> Exit + { + let mut handler = ThrottleThreadHandle::new(Box::new(|| {}), Box::new(|| {})); + + // The test is successful if it does not get stuck. + handler.shutdown(); + } + + // Dummy CpuManager + let cpus_throttled = Arc::new(AtomicBool::new(false)); + let callback_pause_vcpus = { + let cpus_running = cpus_throttled.clone(); + Box::new(move || { + let old = cpus_running.swap(true, Ordering::SeqCst); + assert!(!old); + }) + }; + let callback_resume_vcpus = { + let cpus_running = cpus_throttled.clone(); + Box::new(move || { + let old = cpus_running.swap(false, Ordering::SeqCst); + assert!(old); + }) + }; + + // State transitions: Waiting -> Throttle -> Waiting -> Throttle -> Exit + { + let mut handler = + ThrottleThreadHandle::new(callback_pause_vcpus, callback_resume_vcpus); + handler.set_throttle_percent(5); + sleep(TimesliceContext::INITIAL_TIMESLICE); + handler.set_throttle_percent(10); + sleep(TimesliceContext::INITIAL_TIMESLICE); + + // Assume we aborted vCPU throttling (or the live-migration at all). + handler.set_throttle_percent(0 /* reset to waiting */); + handler.set_throttle_percent(5); + sleep(TimesliceContext::INITIAL_TIMESLICE); + handler.set_throttle_percent(10); + sleep(TimesliceContext::INITIAL_TIMESLICE); + + // The test is successful if we don't have a panic here due to a + // closed channel. 
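The lifecycle test here drives the handle manually; the intended consumer is the migration path, which during auto-converge ratchets the percentage up between dirty-page passes and drops it back to 0 once the switch-over happens or the migration is aborted. A rough, hypothetical sketch of such a driver loop (not part of the patch; `dirty_pages_remaining` and the thresholds are invented placeholders):

```rust
// Hypothetical auto-converge driver, for illustration only.
fn auto_converge(handle: &ThrottleThreadHandle, mut dirty_pages_remaining: impl FnMut() -> u64) {
    let mut throttle = 20u8;
    while dirty_pages_remaining() > 10_000 {
        handle.set_throttle_percent(throttle.min(99));
        // Give the guest one throttled pass before re-measuring dirty pages.
        std::thread::sleep(std::time::Duration::from_millis(500));
        throttle = throttle.saturating_add(10);
    }
    // Convergence reached (or migration aborted): stop throttling.
    handle.set_throttle_percent(0);
}
```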
+ for _ in 0..10 { + handler.shutdown(); + sleep(Duration::from_millis(1)); + } + + // The test is successful if it does not get stuck. + drop(handler); + } + } + } +} diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 5364818855..aab2f00309 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -66,9 +66,7 @@ use tracer::trace_scoped; use vm_device::Bus; #[cfg(feature = "tdx")] use vm_memory::{Address, ByteValued, GuestMemoryRegion, ReadVolatile}; -use vm_memory::{ - Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, WriteVolatile, -}; +use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic}; use vm_migration::protocol::{MemoryRangeTable, Request, Response}; use vm_migration::{ Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, snapshot_from_id, @@ -97,6 +95,7 @@ use crate::migration::get_vm_snapshot; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::migration::url_to_file; use crate::migration::{SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE, url_to_path}; +use crate::vcpu_throttling::ThrottleThreadHandle; #[cfg(feature = "fw_cfg")] use crate::vm_config::FwCfgConfig; use crate::vm_config::{ @@ -180,6 +179,9 @@ pub enum Error { #[error("VM is not running")] VmNotRunning, + #[error("VM is currently migrating and can't be modified")] + VmMigrating, + #[error("Cannot clone EventFd")] EventFdClone(#[source] io::Error), @@ -526,6 +528,7 @@ pub struct Vm { hypervisor: Arc, stop_on_boot: bool, load_payload_handle: Option>>, + vcpu_throttler: ThrottleThreadHandle, } impl Vm { @@ -812,6 +815,10 @@ impl Vm { VmState::Created }; + // TODO we could also spawn the thread when a migration with auto-converge starts. + // Probably this is the better design. + let vcpu_throttler = ThrottleThreadHandle::new_from_cpu_manager(&cpu_manager); + Ok(Vm { #[cfg(feature = "tdx")] kernel, @@ -831,6 +838,7 @@ impl Vm { hypervisor, stop_on_boot, load_payload_handle, + vcpu_throttler, }) } @@ -984,6 +992,31 @@ impl Vm { Ok(numa_nodes) } + /// Set's the throttle percentage to a value in range `0..=99`. + /// + /// Setting the value back to `0` brings the thread back into a waiting + /// state. + /// + /// # Panic + /// Panics, if `percent_new` is not in range `0..=99`. + pub fn set_throttle_percent(&self, percent: u8 /* 1..=99 */) { + self.vcpu_throttler.set_throttle_percent(percent); + } + + /// Get the current throttle percentage in range `0..=99`. + /// + /// Please note that the value is not synchronized. + pub fn throttle_percent(&self) -> u8 { + self.vcpu_throttler.throttle_percent() + } + + /// Stops and terminates the thread gracefully. + /// + /// Waits for the thread to finish. + pub fn stop_vcpu_throttling(&mut self) { + self.vcpu_throttler.shutdown(); + } + #[allow(clippy::too_many_arguments)] pub fn new( vm_config: Arc>, @@ -2553,45 +2586,8 @@ impl Vm { Ok(()) } - pub fn send_memory_regions( - &mut self, - ranges: &MemoryRangeTable, - fd: &mut F, - ) -> std::result::Result<(), MigratableError> - where - F: WriteVolatile, - { - let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); - let mem = guest_memory.memory(); - - for range in ranges.regions() { - let mut offset: u64 = 0; - // Here we are manually handling the retry in case we can't the - // whole region at once because we can't use the implementation - // from vm-memory::GuestMemory of write_all_to() as it is not - // following the correct behavior. 
For more info about this issue - // see: https://github.com/rust-vmm/vm-memory/issues/174 - loop { - let bytes_written = mem - .write_volatile_to( - GuestAddress(range.gpa + offset), - fd, - (range.length - offset) as usize, - ) - .map_err(|e| { - MigratableError::MigrateSend(anyhow!( - "Error transferring memory to socket: {e}" - )) - })?; - offset += bytes_written as u64; - - if offset == range.length { - break; - } - } - } - - Ok(()) + pub fn guest_memory(&self) -> GuestMemoryAtomic { + self.memory_manager.lock().unwrap().guest_memory() } pub fn memory_range_table(&self) -> std::result::Result { diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index 9c28e536da..6bb5b7ca70 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -284,6 +284,8 @@ pub struct DiskConfig { pub serial: Option, #[serde(default)] pub queue_affinity: Option>, + #[serde(default)] + pub bdf_device: Option, } impl ApplyLandlock for DiskConfig { @@ -332,12 +334,6 @@ pub struct NetConfig { pub vhost_mode: VhostMode, #[serde(default)] pub id: Option, - // Special deserialize handling: - // Therefore, we don't serialize FDs, and whatever value is here after - // deserialization is invalid. - // - // Valid FDs are transmitted via a different channel (SCM_RIGHTS message) - // and will be populated into this struct on the destination VMM eventually. #[serde(default, deserialize_with = "deserialize_netconfig_fds")] pub fds: Option>, #[serde(default)] @@ -350,6 +346,8 @@ pub struct NetConfig { pub offload_ufo: bool, #[serde(default = "default_netconfig_true")] pub offload_csum: bool, + #[serde(default)] + pub bdf_device: Option, } pub fn default_netconfig_true() -> bool { @@ -396,6 +394,8 @@ pub struct RngConfig { pub src: PathBuf, #[serde(default)] pub iommu: bool, + #[serde(default)] + pub bdf_device: Option, } pub const DEFAULT_RNG_SOURCE: &str = "/dev/urandom"; @@ -405,6 +405,7 @@ impl Default for RngConfig { RngConfig { src: PathBuf::from(DEFAULT_RNG_SOURCE), iommu: false, + bdf_device: None, } } } @@ -426,6 +427,8 @@ pub struct BalloonConfig { /// Option to enable free page reporting from the guest. 
#[serde(default)] pub free_page_reporting: bool, + #[serde(default)] + pub bdf_device: Option, } #[cfg(feature = "pvmemcontrol")] @@ -444,6 +447,8 @@ pub struct FsConfig { pub id: Option, #[serde(default)] pub pci_segment: u16, + #[serde(default)] + pub bdf_device: Option, } pub fn default_fsconfig_num_queues() -> usize { @@ -474,6 +479,8 @@ pub struct PmemConfig { pub id: Option, #[serde(default)] pub pci_segment: u16, + #[serde(default)] + pub bdf_device: Option, } impl ApplyLandlock for PmemConfig { @@ -491,6 +498,7 @@ pub enum ConsoleOutputMode { Tty, File, Socket, + Tcp, Null, } @@ -502,6 +510,10 @@ pub struct ConsoleConfig { #[serde(default)] pub iommu: bool, pub socket: Option, + pub url: Option, + /// PCI BDF to attach the console in the guest to + #[serde(default)] + pub bdf_device: Option, } pub fn default_consoleconfig_file() -> Option { @@ -607,6 +619,8 @@ pub struct VdpaConfig { pub id: Option, #[serde(default)] pub pci_segment: u16, + #[serde(default)] + pub bdf_device: Option, } pub fn default_vdpaconfig_num_queues() -> usize { @@ -630,6 +644,8 @@ pub struct VsockConfig { pub id: Option, #[serde(default)] pub pci_segment: u16, + #[serde(default)] + pub bdf_device: Option, } impl ApplyLandlock for VsockConfig { @@ -862,6 +878,8 @@ pub fn default_serial() -> ConsoleConfig { mode: ConsoleOutputMode::Null, iommu: false, socket: None, + url: None, + bdf_device: None, } } @@ -871,6 +889,8 @@ pub fn default_console() -> ConsoleConfig { mode: ConsoleOutputMode::Tty, iommu: false, socket: None, + url: None, + bdf_device: None, } }
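Most of the config structs in this last hunk gain new optional fields (`bdf_device`, `url`) guarded by `#[serde(default)]`, so configurations serialized before this change keep deserializing unchanged. A minimal, self-contained illustration of that serde behavior follows; the struct is a stand-in that mirrors the `RngConfig` pattern, and `Option<u8>` for the BDF field is an assumption since the real field type is not visible here. It relies on `serde`/`serde_json`, which the crate already uses.

```rust
use serde::Deserialize;

// Stand-in struct, not the real RngConfig: shows how a newly added
// optional field with #[serde(default)] stays backward compatible.
#[derive(Debug, Deserialize)]
struct ExampleConfig {
    src: String,
    #[serde(default)]
    iommu: bool,
    // Newly added field: old payloads that lack it still parse.
    #[serde(default)]
    bdf_device: Option<u8>,
}

fn main() {
    // A payload written before `bdf_device` existed.
    let old = r#"{"src": "/dev/urandom", "iommu": false}"#;
    let cfg: ExampleConfig = serde_json::from_str(old).unwrap();
    assert_eq!(cfg.bdf_device, None);

    // A payload that pins the device slot.
    let new = r#"{"src": "/dev/urandom", "iommu": false, "bdf_device": 3}"#;
    let cfg: ExampleConfig = serde_json::from_str(new).unwrap();
    assert_eq!(cfg.bdf_device, Some(3));
}
```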