Skip to content

Add weekly forward compatibility testing #2

Add weekly forward compatibility testing

Add weekly forward compatibility testing #2

Workflow file for this run

# Copyright NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: E2E Tests

# Three entry points share this workflow:
#   * push              - pull-request mirror branches, main and release branches
#   * workflow_call     - invoked from another workflow with explicit images
#   * workflow_dispatch - manual run with optional overrides
on:
  push:
    branches:
      - 'pull-request/[0-9]+'
      - 'main'
      - 'release-*'

  workflow_call:
    inputs:
      # The operator image and version are mandatory for callers.
      operator_image:
        type: string
        required: true
      operator_version:
        type: string
        required: true
      # Component image overrides are optional.
      toolkit_image:
        description: "Full container-toolkit image path (e.g., ghcr.io/nvidia/container-toolkit:v1.18.0)"
        type: string
        required: false
      device_plugin_image:
        description: "Full device-plugin image path"
        type: string
        required: false
      mig_manager_image:
        description: "Full mig-manager image path"
        type: string
        required: false
    secrets:
      # AWS credentials and SSH key are mandatory for callers.
      AWS_ACCESS_KEY_ID:
        required: true
      AWS_SECRET_ACCESS_KEY:
        required: true
      AWS_SSH_KEY:
        required: true
      # Slack secrets are optional.
      SLACK_BOT_TOKEN:
        required: false
      SLACK_CHANNEL_ID:
        required: false

  workflow_dispatch:
    inputs:
      # Every manual input is optional.
      operator_image:
        description: "Operator image to test (override)"
        type: string
        required: false
      operator_version:
        description: "Operator version to test (override)"
        type: string
        required: false
      toolkit_image:
        description: "Override container-toolkit image"
        type: string
        required: false
      device_plugin_image:
        description: "Override device-plugin image"
        type: string
        required: false
      mig_manager_image:
        description: "Override mig-manager image"
        type: string
        required: false
# Allow one active run per workflow and ref; a newer run cancels the older one.
#
# The static "e2e-tests-" prefix matters when this file runs through
# `workflow_call`: the `github` context then belongs to the caller, so
# `github.workflow` and `github.ref` are the caller's values. Without the
# prefix, a caller that builds its own group from the same template would
# share this group, and `cancel-in-progress` would cancel the run.
concurrency:
  group: e2e-tests-${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
variables:
  # Resolves the operator image/version and the optional component image
  # overrides once, and publishes them as job outputs for the e2e jobs.
  runs-on: ubuntu-latest
  outputs:
    operator_version: ${{ steps.vars.outputs.operator_version }}
    operator_image: ${{ steps.vars.outputs.operator_image }}
    toolkit_image: ${{ steps.vars.outputs.toolkit_image }}
    device_plugin_image: ${{ steps.vars.outputs.device_plugin_image }}
    mig_manager_image: ${{ steps.vars.outputs.mig_manager_image }}
  steps:
    # This step used to be guarded by `github.event_name != 'workflow_call'`.
    # In a called workflow `github.event_name` is the caller's event, never
    # `workflow_call`, so that guard was always true and has been dropped.
    - name: Checkout code
      uses: actions/checkout@v5

    - name: Calculate test variables
      id: vars
      # Inputs reach the script through the environment instead of being
      # expanded into the script text, so their content cannot be interpreted
      # as shell code.
      env:
        OPERATOR_IMAGE_INPUT: ${{ inputs.operator_image }}
        OPERATOR_VERSION_INPUT: ${{ inputs.operator_version }}
        TOOLKIT_IMAGE: ${{ inputs.toolkit_image }}
        DEVICE_PLUGIN_IMAGE: ${{ inputs.device_plugin_image }}
        MIG_MANAGER_IMAGE: ${{ inputs.mig_manager_image }}
      run: |
        # `inputs` is populated for both workflow_call and workflow_dispatch,
        # so one check covers both. The override applies only when image and
        # version are given together.
        if [[ -n "${OPERATOR_VERSION_INPUT}" && -n "${OPERATOR_IMAGE_INPUT}" ]]; then
          OPERATOR_VERSION="${OPERATOR_VERSION_INPUT}"
          OPERATOR_IMAGE="${OPERATOR_IMAGE_INPUT}"
        else
          # Report a half-specified override instead of dropping it silently.
          if [[ -n "${OPERATOR_VERSION_INPUT}${OPERATOR_IMAGE_INPUT}" ]]; then
            echo "::warning::operator_image and operator_version must be provided together; ignoring the partial override"
          fi
          # Standalone run: test the image tagged with the short commit SHA.
          OPERATOR_VERSION="${GITHUB_SHA:0:8}"
          OPERATOR_IMAGE="ghcr.io/nvidia/gpu-operator"
        fi

        # Publish all values; the component images may be empty strings.
        {
          echo "operator_version=${OPERATOR_VERSION}"
          echo "operator_image=${OPERATOR_IMAGE}"
          echo "toolkit_image=${TOOLKIT_IMAGE}"
          echo "device_plugin_image=${DEVICE_PLUGIN_IMAGE}"
          echo "mig_manager_image=${MIG_MANAGER_IMAGE}"
        } >> "${GITHUB_OUTPUT}"

        # Display for debugging
        echo "::notice::Testing operator: ${OPERATOR_IMAGE}:${OPERATOR_VERSION}"
        if [[ -n "${TOOLKIT_IMAGE}" ]]; then
          echo "::notice::Using custom toolkit: ${TOOLKIT_IMAGE}"
        fi
        if [[ -n "${DEVICE_PLUGIN_IMAGE}" ]]; then
          echo "::notice::Using custom device-plugin: ${DEVICE_PLUGIN_IMAGE}"
        fi
        if [[ -n "${MIG_MANAGER_IMAGE}" ]]; then
          echo "::notice::Using custom mig-manager: ${MIG_MANAGER_IMAGE}"
        fi
e2e-tests-containerd:
  # Sets up the Holodeck environment described by tests/holodeck.yaml and runs
  # the `defaults.sh` test case with the containerd runtime, using the images
  # resolved by the `variables` job.
  needs: [variables]
  runs-on: linux-amd64-cpu4
  permissions:
    contents: read
    id-token: write
  steps:
    - name: Check out code
      uses: actions/checkout@v5

    - name: Set up Holodeck
      # NOTE(review): the reference below looks like an email-obfuscation
      # artifact of `NVIDIA/holodeck@<tag>`. It is left unchanged because the
      # original tag cannot be recovered from this file; restore it before use.
      uses: NVIDIA/[email protected]
      with:
        aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        aws_ssh_key: ${{ secrets.AWS_SSH_KEY }}
        holodeck_config: "tests/holodeck.yaml"

    - name: Get public dns name
      id: get_public_dns_name
      # NOTE(review): `@master` is a mutable ref; consider pinning to a
      # release tag or commit SHA.
      uses: mikefarah/yq@master
      with:
        cmd: >-
          yq '.status.properties[] | select(.name == "public-dns-name") | .value'
          /github/workspace/.cache/holodeck.yaml

    - name: Set test environment
      env:
        # Passed through the environment so the value is never parsed as
        # script text.
        PUBLIC_DNS_NAME: ${{ steps.get_public_dns_name.outputs.result }}
      run: |
        {
          echo "instance_hostname=ubuntu@${PUBLIC_DNS_NAME}"
          echo "private_key=${GITHUB_WORKSPACE}/key.pem"
        } >> "${GITHUB_ENV}"

    - name: Write SSH private key
      env:
        # The secret is scoped to this step and is not expanded into the
        # script source.
        AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
      run: |
        # Create the file owner-only from the start rather than tightening
        # the mode after the key has already been written.
        umask 077
        printf '%s\n' "${AWS_SSH_KEY}" > "${private_key}"
        chmod 400 "${private_key}"

    - name: Run e2e tests
      env:
        OPERATOR_VERSION: ${{ needs.variables.outputs.operator_version }}
        OPERATOR_IMAGE: ${{ needs.variables.outputs.operator_image }}
        TOOLKIT_CONTAINER_IMAGE: ${{ needs.variables.outputs.toolkit_image }}
        DEVICE_PLUGIN_IMAGE: ${{ needs.variables.outputs.device_plugin_image }}
        MIG_MANAGER_IMAGE: ${{ needs.variables.outputs.mig_manager_image }}
        GPU_PRODUCT_NAME: "Tesla-T4"
        SKIP_LAUNCH: "true"
        CONTAINER_RUNTIME: "containerd"
        TEST_CASE: "./tests/cases/defaults.sh"
      run: |
        # Remember the test status so the log pull still runs after a test
        # failure, then report that status as the step result.
        rc=0
        ./tests/ci-run-e2e.sh "${OPERATOR_IMAGE}" "${OPERATOR_VERSION}" "${GPU_PRODUCT_NAME}" "${TEST_CASE}" || rc=$?
        ./tests/scripts/pull.sh /tmp/logs logs
        exit "${rc}"

    - name: Archive test logs
      # Logs are uploaded only when an earlier step failed.
      if: ${{ failure() }}
      uses: actions/upload-artifact@v5
      with:
        name: containerd-e2e-test-logs
        path: ./logs/
        retention-days: 15
e2e-tests-nvidiadriver:
  # Sets up the Holodeck environment described by tests/holodeck.yaml and runs
  # the `nvidia-driver.sh` test case, using the images resolved by the
  # `variables` job.
  needs: [variables]
  runs-on: linux-amd64-cpu4
  permissions:
    contents: read
    id-token: write
  steps:
    - name: Check out code
      uses: actions/checkout@v5

    - name: Set up Holodeck
      # NOTE(review): the reference below looks like an email-obfuscation
      # artifact of `NVIDIA/holodeck@<tag>`. It is left unchanged because the
      # original tag cannot be recovered from this file; restore it before use.
      uses: NVIDIA/[email protected]
      with:
        aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        aws_ssh_key: ${{ secrets.AWS_SSH_KEY }}
        holodeck_config: "tests/holodeck.yaml"

    - name: Get public dns name
      id: get_public_dns_name
      # NOTE(review): `@master` is a mutable ref; consider pinning to a
      # release tag or commit SHA.
      uses: mikefarah/yq@master
      with:
        cmd: >-
          yq '.status.properties[] | select(.name == "public-dns-name") | .value'
          /github/workspace/.cache/holodeck.yaml

    - name: Set test environment
      env:
        # Passed through the environment so the value is never parsed as
        # script text.
        PUBLIC_DNS_NAME: ${{ steps.get_public_dns_name.outputs.result }}
      run: |
        {
          echo "instance_hostname=ubuntu@${PUBLIC_DNS_NAME}"
          echo "private_key=${GITHUB_WORKSPACE}/key.pem"
        } >> "${GITHUB_ENV}"

    - name: Write SSH private key
      env:
        # The secret is scoped to this step and is not expanded into the
        # script source.
        AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
      run: |
        # Create the file owner-only from the start rather than tightening
        # the mode after the key has already been written.
        umask 077
        printf '%s\n' "${AWS_SSH_KEY}" > "${private_key}"
        chmod 400 "${private_key}"

    - name: Run e2e tests
      env:
        OPERATOR_VERSION: ${{ needs.variables.outputs.operator_version }}
        OPERATOR_IMAGE: ${{ needs.variables.outputs.operator_image }}
        TOOLKIT_CONTAINER_IMAGE: ${{ needs.variables.outputs.toolkit_image }}
        DEVICE_PLUGIN_IMAGE: ${{ needs.variables.outputs.device_plugin_image }}
        MIG_MANAGER_IMAGE: ${{ needs.variables.outputs.mig_manager_image }}
        GPU_PRODUCT_NAME: "Tesla-T4"
        SKIP_LAUNCH: "true"
        CONTAINER_RUNTIME: "containerd"
        TEST_CASE: "./tests/cases/nvidia-driver.sh"
      run: |
        # Remember the test status so the log pull still runs after a test
        # failure, then report that status as the step result.
        rc=0
        ./tests/ci-run-e2e.sh "${OPERATOR_IMAGE}" "${OPERATOR_VERSION}" "${GPU_PRODUCT_NAME}" "${TEST_CASE}" || rc=$?
        ./tests/scripts/pull.sh /tmp/logs logs
        exit "${rc}"

    - name: Archive test logs
      # Logs are uploaded only when an earlier step failed.
      if: ${{ failure() }}
      uses: actions/upload-artifact@v5
      with:
        name: nvidiadriver-e2e-test-logs
        path: ./logs/
        retention-days: 15