# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: "Test Template"
description: "Template for running NeMo tests in a containerized environment"

# All inputs are compared as strings by the steps of this action, so boolean
# style flags must be passed as the literal strings "true" / "false".
inputs:
  # NOTE(review): not referenced by any step in this file; presumably kept so
  # callers can pass the same argument set everywhere - confirm before removing.
  runner:
    description: "Runner to use for test"
    required: true
  # Used both as the retry timeout of the test script and (plus 60 seconds)
  # as the lifetime of the background container.
  timeout:
    description: "Max runtime of test in minutes"
    required: false
    default: "10"
  # Base name of the script; resolved to tests/<unit_tests|functional_tests>/<script>.sh
  script:
    description: "Test script to execute"
    required: true
  is_optional:
    description: "If set to true, a failing test is reported as a warning instead of failing the step"
    required: false
    default: "false"
  # Also selects the unit_tests script folder and skips the Azure file share mount.
  is_unit_test:
    description: "Upload coverage as unit test"
    required: false
    default: "false"
  # Pulled as nemoci.azurecr.io/<image>:<github.run_id>.
  image:
    description: "Image to use for test"
    required: false
    default: "llm_shower"
  # When "false" the container is started with the NVIDIA runtime and all GPUs.
  cpu-only:
    description: "Run tests on CPU only"
    required: false
    default: "false"
  # The three Azure identifiers are only consumed by the Azure Login step,
  # which runs when has-azure-credentials is "true".
  azure-client-id:
    description: "Azure Client ID"
    required: true
  azure-tenant-id:
    description: "Azure Tenant ID"
    required: true
  azure-subscription-id:
    description: "Azure Subscription ID"
    required: true
  has-azure-credentials:
    description: "Has Azure credentials"
    required: false
    default: "false"

runs:
  using: "composite"
  steps:
    # The Azure steps below are skipped unless the caller passes
    # has-azure-credentials == 'true'.
    - name: Install Azure CLI
      if: ${{ inputs.has-azure-credentials == 'true' }}
      shell: bash
      run: |
        curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash

    - name: Azure Login
      if: ${{ inputs.has-azure-credentials == 'true' }}
      uses: azure/login@v2
      with:
        client-id: ${{ inputs.azure-client-id }}
        tenant-id: ${{ inputs.azure-tenant-id }}
        subscription-id: ${{ inputs.azure-subscription-id }}

    - name: Azure ACR Login
      if: ${{ inputs.has-azure-credentials == 'true' }}
      shell: bash
      run: |
        az acr login --name nemoci

    # Mounts the Azure file share over CIFS and publishes the mount point as
    # the MNT_PATH step output. Not run for unit tests.
    - name: Azure Fileshare
      if: ${{ inputs.has-azure-credentials == 'true' && inputs.is_unit_test == 'false' }}
      shell: bash
      id: azure-fileshare
      run: |
        sudo apt update
        sudo apt install -y cifs-utils

        RESOURCE_GROUP_NAME="azure-gpu-vm-runner_group"
        STORAGE_ACCOUNT_NAME="nemocistorageaccount2"
        FILE_SHARE_NAME="fileshare"

        MNT_ROOT="/media"
        MNT_PATH="$MNT_ROOT/$STORAGE_ACCOUNT_NAME/$FILE_SHARE_NAME"

        echo "MNT_PATH=$MNT_PATH" | tee -a "$GITHUB_OUTPUT"

        sudo mkdir -p "$MNT_PATH"

        # Create a folder to store the credentials for this storage account and
        # any other that you might set up.
        CREDENTIAL_ROOT="/etc/smbcredentials"
        sudo mkdir -p "$CREDENTIAL_ROOT"

        # Get the storage account key for the indicated storage account.
        # You must be logged in with az login and your user identity must have
        # permissions to list the storage account keys for this command to work.
        STORAGE_ACCOUNT_KEY=$(az storage account keys list \
          --resource-group "$RESOURCE_GROUP_NAME" \
          --account-name "$STORAGE_ACCOUNT_NAME" \
          --query "[0].value" --output tsv | tr -d '"')

        # Create the credential file for this individual storage account.
        # An existing file is left untouched.
        SMB_CREDENTIAL_FILE="$CREDENTIAL_ROOT/$STORAGE_ACCOUNT_NAME.cred"
        if [ ! -f "$SMB_CREDENTIAL_FILE" ]; then
          echo "username=$STORAGE_ACCOUNT_NAME" | sudo tee "$SMB_CREDENTIAL_FILE" > /dev/null
          echo "password=$STORAGE_ACCOUNT_KEY" | sudo tee -a "$SMB_CREDENTIAL_FILE" > /dev/null
        else
          echo "The credential file $SMB_CREDENTIAL_FILE already exists, and was not modified."
        fi

        # Change permissions on the credential file so only root can read or modify the password file.
        sudo chmod 600 "$SMB_CREDENTIAL_FILE"

        # This command assumes you have logged in with az login
        HTTP_ENDPOINT=$(az storage account show \
          --resource-group "$RESOURCE_GROUP_NAME" \
          --name "$STORAGE_ACCOUNT_NAME" \
          --query "primaryEndpoints.file" --output tsv | tr -d '"')
        # Drop the first six characters of the endpoint (the URL scheme and
        # colon) so that the remainder starts with "//host/", then append the share.
        SMB_PATH=$(echo "$HTTP_ENDPOINT" | cut -c7-${#HTTP_ENDPOINT})$FILE_SHARE_NAME

        sudo mount -t cifs "$SMB_PATH" "$MNT_PATH" -o credentials=$SMB_CREDENTIAL_FILE,serverino,nosharesock,actimeo=30,mfsymlinks

        # Sanity check: fails the step if the expected data folder is missing.
        ls -al "$MNT_PATH/TestData"

    - name: Docker pull image
      shell: bash
      run: |
        docker pull nemoci.azurecr.io/${{ inputs.image }}:${{ github.run_id }}

    - name: Checkout repository
      uses: actions/checkout@v4
      with:
        path: LLM-Shower

    # Generates retry_job.sh, which (re)creates a detached container that only
    # sleeps for timeout + 60 seconds; the tests are later exec'd into it. The
    # same script is reused by the retry step to get a fresh container.
    - name: Start container
      shell: bash
      run: |
        MNT_PATH=${{ steps.azure-fileshare.outputs.mnt_path }}

        ARG=("")
        if [[ "${{ inputs.cpu-only }}" == "false" ]]; then
          ARG=("--runtime=nvidia --gpus all")
        fi

        cmd=$(cat <<RUN_TEST_EOF
        #!/bin/bash
        docker container rm -f nemo_container_${{ github.run_id }} || true
        docker run \
          --rm \
          -d \
          --name nemo_container_${{ github.run_id }} ${ARG[@]} \
          --shm-size=64g \
          --env TRANSFORMERS_OFFLINE=0 \
          --env HYDRA_FULL_ERROR=1 \
          --env HF_HOME=/home/TestData/HF_HOME \
          --env RUN_ID=${{ github.run_id }} \
          --volume $(pwd)/LLM-Shower:/workspace \
          --volume $MNT_PATH/TestData:/home/TestData \
          nemoci.azurecr.io/${{ inputs.image }}:${{ github.run_id }} \
          bash -c "sleep $(( ${{ inputs.timeout }} * 60 + 60))"
        RUN_TEST_EOF
        )

        echo "$cmd" | tee "retry_job.sh"
        bash retry_job.sh

    # Generates job.sh, which installs the checkout into the container and runs
    # the requested test script. Its combined output goes to err.log, whose last
    # line is inspected by the "Check result" step. Also publishes the coverage
    # prefix and the timeout converted to seconds as step outputs.
    - name: Create run-script
      id: create
      shell: bash
      run: |
        COVERAGE_PREFIX=$([[ "${{ inputs.is_unit_test }}" == "true" ]] && echo "unit-test" || echo "e2e")
        echo "coverage-prefix=$COVERAGE_PREFIX" | tee -a "$GITHUB_OUTPUT"

        cmd=$(cat <<RUN_TEST_EOF
        #!/bin/bash

        (
          set -e

          docker exec -t nemo_container_${{ github.run_id }} bash -c '\
            uv pip install --no-deps -e . && \
            bash tests/${{ inputs.is_unit_test == 'true' && 'unit_tests' || 'functional_tests' }}/${{ inputs.script }}.sh && \
            echo "Finished successfully." || echo "Did not finish."'
        ) 2>&1 | tee err.log

        RUN_TEST_EOF
        )

        echo "timeout_in_seconds=$(( ${{ inputs.timeout }} * 60 ))" | tee -a "$GITHUB_OUTPUT"
        echo "$cmd" | tee "job.sh"

    # Runs job.sh; on a timeout the container is recreated via retry_job.sh and
    # the script is attempted again, up to three attempts in total.
    - name: Run main script
      uses: nick-fields/retry@v3
      with:
        timeout_seconds: ${{ steps.create.outputs.timeout_in_seconds }}
        max_attempts: 3
        shell: bash
        retry_on: timeout
        command: /bin/bash job.sh
        on_retry_command: /bin/bash retry_job.sh

    # Collects coverage files from the container, publishes the artifact name
    # as the coverage_report output, and decides pass/fail from the last line
    # of err.log.
    - name: Check result
      id: check
      shell: bash
      run: |
        docker exec nemo_container_${{ github.run_id }} coverage combine || true
        docker exec nemo_container_${{ github.run_id }} coverage xml
        docker cp nemo_container_${{ github.run_id }}:/workspace/.coverage .coverage
        docker cp nemo_container_${{ github.run_id }}:/workspace/coverage.xml coverage.xml

        coverage_report=coverage-${{ steps.create.outputs.coverage-prefix }}-${{ github.run_id }}-$(uuidgen)
        echo "coverage_report=$coverage_report" >> "$GITHUB_OUTPUT"

        IS_SUCCESS=$(tail -n 1 err.log | grep -q "Finished successfully." && echo "true" || echo "false")

        # Optional tests are allowed to fail: surface a warning and carry on.
        if [[ "$IS_SUCCESS" == "false" && "${{ inputs.is_optional }}" == "true" ]]; then
          echo "::warning:: Test failed, but displayed as successful because it is marked as optional."
          IS_SUCCESS=true
        fi

        if [[ "$IS_SUCCESS" == "false" ]]; then
          echo Test did not finish successfully.
          exit 1
        fi

        # Reaching this point means success; the script ends with status 0.

    - name: Test coverage
      shell: bash -x -e -u -o pipefail {0}
      run: |
        docker exec -t nemo_container_${{ github.run_id }} coverage report -i

    - name: Upload artifacts
      uses: actions/upload-artifact@v4
      if: ${{ steps.check.outputs.coverage_report != 'none' }}
      with:
        name: ${{ steps.check.outputs.coverage_report }}
        path: |
          coverage.xml
          .coverage
        include-hidden-files: true
0 commit comments