Skip to content

Commit 79743b6

Browse files
authored
Merge pull request #1 from NVIDIA-NeMo/chtruong/initial-template
Add initial repo template
2 parents 2a64592 + e5b2c6f commit 79743b6

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+3735
-1
lines changed

.dockerignore

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
__pycache__
2+
*.pyc
3+
*.pyo
4+
*.pyd
5+
.Python
6+
env
7+
pip-log.txt
8+
pip-delete-this-directory.txt
9+
.tox
10+
.coverage
11+
.coverage.*
12+
.cache
13+
nosetests.xml
14+
coverage.xml
15+
*,cover
16+
*.log
17+
.git
18+
**/*.nemo
19+
**/*.ckpt

.github/CODEOWNERS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
.github/ @nvidia-nemo/automation
2+
docker/ @nvidia-nemo/automation
3+
pyproject.toml @nvidia-nemo/automation
4+
uv.lock @nvidia-nemo/automation
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
---
2+
name: Bug report
3+
about: Create a report to help us improve
4+
title: ''
5+
labels: bug
6+
assignees: ''
7+
8+
---
9+
10+
**Describe the bug**
11+
12+
A clear and concise description of what the bug is.
13+
14+
**Steps/Code to reproduce bug**
15+
16+
Please list *minimal* steps or code snippet for us to be able to reproduce the bug.
17+
18+
A helpful guide on how to craft a minimal bug report: <http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports>
19+
20+
21+
**Expected behavior**
22+
23+
A clear and concise description of what you expected to happen.
24+
25+
**Environment overview (please complete the following information)**
26+
27+
- Environment location: [Bare-metal, Docker, Cloud (specify cloud provider - AWS, Azure, GCP, Colab)]
28+
- Method of install: [pip install or from source]. Please specify exact commands you used to install.
29+
- If method of install is [Docker], provide `docker pull` & `docker run` commands used
30+
31+
**Environment details**
32+
33+
If NVIDIA docker image is used you don't need to specify these.
34+
Otherwise, please provide:
35+
- OS version
36+
- PyTorch version
37+
- Python version
38+
39+
**Additional context**
40+
41+
Add any other context about the problem here.
42+
Example: GPU model
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
---
2+
name: Feature request
3+
about: Suggest an idea for this project
4+
title: ''
5+
labels: feature request
6+
assignees: ''
7+
8+
---
9+
10+
**Is your feature request related to a problem? Please describe.**
11+
12+
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
13+
14+
**Describe the solution you'd like**
15+
16+
A clear and concise description of what you want to happen.
17+
Provide a code snippet on how new APIs/changes would be used by others.
18+
19+
**Describe alternatives you've considered**
20+
21+
A clear and concise description of any alternative solutions or features you've considered.
22+
23+
**Additional context**
24+
25+
Add any other context or screenshots about the feature request here.
Lines changed: 251 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,251 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
name: "Test Template"
description: "Template for running NeMo tests in a containerized environment"

# All inputs arrive as strings; the steps below compare the boolean-like
# ones against the literal strings 'true' / 'false'.
inputs:
  # NOTE(review): no step in this action reads `runner`; presumably it is
  # kept for interface compatibility with the calling workflow - confirm.
  runner:
    description: "Runner to use for test"
    required: true
  # Used both as the retry timeout (minutes * 60) and to size the
  # container's keep-alive sleep (minutes * 60 + 60 seconds).
  timeout:
    description: "Max runtime of test in minutes"
    required: false
    default: "10"
  # Resolved as tests/unit_tests/<script>.sh or
  # tests/functional_tests/<script>.sh depending on `is_unit_test`.
  script:
    description: "Test script to execute"
    required: true
  # When 'true', the "Check result" step downgrades a failed test to a
  # warning instead of failing the job.
  is_optional:
    description: "If set to true, a failure of this test is reported as a warning and does not fail the job"
    required: false
    default: "false"
  # Selects the test directory and the coverage artifact prefix
  # ("unit-test" vs "e2e"); the Azure file share is only mounted when 'false'.
  is_unit_test:
    description: "Upload coverage as unit test"
    required: false
    default: "false"
  # Pulled as nemoci.azurecr.io/<image>:<github.run_id>.
  image:
    description: "Image to use for test"
    required: false
    default: "llm_shower"
  # When 'false', the container is started with the NVIDIA runtime and all GPUs.
  cpu-only:
    description: "Run tests on CPU only"
    required: false
    default: "false"
  azure-client-id:
    description: "Azure Client ID"
    required: true
  azure-tenant-id:
    description: "Azure Tenant ID"
    required: true
  azure-subscription-id:
    description: "Azure Subscription ID"
    required: true
  # Gates every Azure-related step (CLI install, login, ACR login, file share).
  has-azure-credentials:
    description: "Has Azure credentials"
    required: false
    default: "false"
58+
runs:
59+
using: "composite"
60+
steps:
61+
# Installs the Azure CLI via Microsoft's Debian install script; skipped
# when the caller did not provide Azure credentials.
- name: Install Azure CLI
  shell: bash
  if: inputs.has-azure-credentials == 'true'
  run: curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
66+
67+
# Authenticates against Azure using the client / tenant / subscription IDs
# supplied by the caller; later `az` commands rely on this login.
- name: Azure Login
  uses: azure/login@v2
  if: inputs.has-azure-credentials == 'true'
  with:
    subscription-id: ${{ inputs.azure-subscription-id }}
    tenant-id: ${{ inputs.azure-tenant-id }}
    client-id: ${{ inputs.azure-client-id }}
74+
75+
# Logs Docker in to the `nemoci` container registry so the test image
# can be pulled from nemoci.azurecr.io.
- name: Azure ACR Login
  shell: bash
  if: inputs.has-azure-credentials == 'true'
  run: az acr login --name nemoci
80+
81+
# Mounts the Azure file share that holds the test data over SMB (CIFS) and
# publishes the mount point as the `MNT_PATH` step output. Only runs for
# non-unit tests when Azure credentials are available.
- name: Azure Fileshare
  if: ${{ inputs.has-azure-credentials == 'true' && inputs.is_unit_test == 'false' }}
  shell: bash
  id: azure-fileshare
  run: |
    sudo apt update
    sudo apt install -y cifs-utils

    RESOURCE_GROUP_NAME="azure-gpu-vm-runner_group"
    STORAGE_ACCOUNT_NAME="nemocistorageaccount2"
    FILE_SHARE_NAME="fileshare"

    MNT_ROOT="/media"
    MNT_PATH="$MNT_ROOT/$STORAGE_ACCOUNT_NAME/$FILE_SHARE_NAME"

    # Expose the mount point to later steps.
    echo "MNT_PATH=$MNT_PATH" | tee -a "$GITHUB_OUTPUT"

    sudo mkdir -p "$MNT_PATH"

    # Create a folder to store the credentials for this storage account and
    # any other that you might set up.
    CREDENTIAL_ROOT="/etc/smbcredentials"
    sudo mkdir -p "$CREDENTIAL_ROOT"

    # Get the storage account key for the indicated storage account.
    # You must be logged in with az login and your user identity must have
    # permissions to list the storage account keys for this command to work.
    STORAGE_ACCOUNT_KEY=$(az storage account keys list \
      --resource-group "$RESOURCE_GROUP_NAME" \
      --account-name "$STORAGE_ACCOUNT_NAME" \
      --query "[0].value" --output tsv | tr -d '"')

    # Create the credential file for this individual storage account.
    # An existing file is deliberately left untouched.
    SMB_CREDENTIAL_FILE="$CREDENTIAL_ROOT/$STORAGE_ACCOUNT_NAME.cred"
    if [ ! -f "$SMB_CREDENTIAL_FILE" ]; then
      echo "username=$STORAGE_ACCOUNT_NAME" | sudo tee "$SMB_CREDENTIAL_FILE" > /dev/null
      echo "password=$STORAGE_ACCOUNT_KEY" | sudo tee -a "$SMB_CREDENTIAL_FILE" > /dev/null
    else
      echo "The credential file $SMB_CREDENTIAL_FILE already exists, and was not modified."
    fi

    # Change permissions on the credential file so only root can read or modify the password file.
    sudo chmod 600 "$SMB_CREDENTIAL_FILE"

    # This command assumes you have logged in with az login
    HTTP_ENDPOINT=$(az storage account show \
      --resource-group "$RESOURCE_GROUP_NAME" \
      --name "$STORAGE_ACCOUNT_NAME" \
      --query "primaryEndpoints.file" --output tsv | tr -d '"')

    # Drop the first six characters of the endpoint (presumably the "https:"
    # scheme - confirm) and append the share name to form the SMB path.
    SMB_PATH="$(echo "$HTTP_ENDPOINT" | cut -c7-)$FILE_SHARE_NAME"

    sudo mount -t cifs "$SMB_PATH" "$MNT_PATH" \
      -o "credentials=$SMB_CREDENTIAL_FILE,serverino,nosharesock,actimeo=30,mfsymlinks"

    # Sanity check: the step fails here if the test data directory is not visible.
    ls -al "$MNT_PATH/TestData"
134+
135+
# Pulls the test image for this workflow run; the tag is the run id.
- name: Docker pull image
  shell: bash
  run: docker pull nemoci.azurecr.io/${{ inputs.image }}:${{ github.run_id }}
139+
140+
# Checks the repository out into ./LLM-Shower, which the "Start container"
# step bind-mounts as /workspace.
# actions/checkout@v2 targets a deprecated Node runtime; v4 is the supported
# major version and accepts the same `path` input.
- name: Checkout repository
  uses: actions/checkout@v4
  with:
    path: LLM-Shower
144+
145+
# Writes retry_job.sh, a script that (re)creates the detached test container,
# and runs it once. The same script is reused by the "Run main script" step
# as its on-retry command.
- name: Start container
  shell: bash
  run: |
    # Mount point published by the "Azure Fileshare" step; that step is
    # conditional, so this value may be empty.
    MNT_PATH=${{ steps.azure-fileshare.outputs.mnt_path }}

    # GPU flags are only added when the test is not CPU-only.
    GPU_FLAGS=""
    if [[ "${{ inputs.cpu-only }}" == "false" ]]; then
      GPU_FLAGS="--runtime=nvidia --gpus all"
    fi

    # The heredoc delimiter is unquoted, so \$(pwd), the variables and the
    # sleep arithmetic are expanded now, while the script text is generated.
    # The container just sleeps for the timeout plus 60 seconds; tests are
    # executed inside it later via `docker exec`.
    launch_script=$(cat <<RUN_TEST_EOF
    #!/bin/bash
    docker container rm -f nemo_container_${{ github.run_id }} || true
    docker run \
      --rm \
      -d \
      --name nemo_container_${{ github.run_id }} $GPU_FLAGS \
      --shm-size=64g \
      --env TRANSFORMERS_OFFLINE=0 \
      --env HYDRA_FULL_ERROR=1 \
      --env HF_HOME=/home/TestData/HF_HOME \
      --env RUN_ID=${{ github.run_id }} \
      --volume $(pwd)/LLM-Shower:/workspace \
      --volume $MNT_PATH/TestData:/home/TestData \
      nemoci.azurecr.io/${{ inputs.image }}:${{ github.run_id }} \
      bash -c "sleep $(( ${{ inputs.timeout }} * 60 + 60))"
    RUN_TEST_EOF
    )

    echo "$launch_script" | tee "retry_job.sh"
    bash retry_job.sh
176+
177+
# Generates job.sh (the command that runs the selected test script inside the
# container) and publishes two outputs: `coverage-prefix` and
# `timeout_in_seconds`.
- name: Create run-script
  id: create
  shell: bash
  run: |
    # Coverage artifacts are labelled by test type.
    if [[ "${{ inputs.is_unit_test }}" == "true" ]]; then
      COVERAGE_PREFIX="unit-test"
    else
      COVERAGE_PREFIX="e2e"
    fi
    echo "coverage-prefix=$COVERAGE_PREFIX" | tee -a "$GITHUB_OUTPUT"

    # job.sh installs the checked-out package in editable mode, runs the test
    # script, and prints a final status line. All output is captured in
    # err.log, whose last line the "Check result" step inspects.
    job_script=$(cat <<RUN_TEST_EOF
    #!/bin/bash

    (
      set -e

      docker exec -t nemo_container_${{ github.run_id }} bash -c '\
        uv pip install --no-deps -e . && \
        bash tests/${{ inputs.is_unit_test == 'true' && 'unit_tests' || 'functional_tests' }}/${{ inputs.script }}.sh && \
        echo "Finished successfully." || echo "Did not finish."'
    ) 2>&1 | tee err.log

    RUN_TEST_EOF
    )

    echo "timeout_in_seconds=$(( ${{ inputs.timeout }} * 60 ))" | tee -a "$GITHUB_OUTPUT"
    echo "$job_script" | tee "job.sh"
201+
202+
# Runs job.sh with a timeout. Only a timeout triggers a retry (at most three
# attempts in total); before each retry, retry_job.sh removes and restarts
# the test container.
- name: Run main script
  uses: nick-fields/retry@v3
  with:
    shell: bash
    command: /bin/bash job.sh
    timeout_seconds: ${{ steps.create.outputs.timeout_in_seconds }}
    max_attempts: 3
    retry_on: timeout
    on_retry_command: /bin/bash retry_job.sh
211+
212+
# Collects coverage files from the container, publishes the artifact name as
# the `coverage_report` output, and decides whether the test passed by
# inspecting the last line of err.log (written by job.sh).
- name: Check result
  id: check
  shell: bash
  run: |
    # `coverage combine` is allowed to fail; `coverage xml` is not.
    docker exec nemo_container_${{ github.run_id }} coverage combine || true
    docker exec nemo_container_${{ github.run_id }} coverage xml
    docker cp nemo_container_${{ github.run_id }}:/workspace/.coverage .coverage
    docker cp nemo_container_${{ github.run_id }}:/workspace/coverage.xml coverage.xml

    # The uuid suffix keeps artifact names unique across jobs of the same run.
    coverage_report=coverage-${{ steps.create.outputs.coverage-prefix }}-${{ github.run_id }}-$(uuidgen)
    echo "coverage_report=$coverage_report" >> "$GITHUB_OUTPUT"

    # job.sh prints "Finished successfully." as its final line only when the
    # install and the test script both succeeded.
    IS_SUCCESS=$(tail -n 1 err.log | grep -q "Finished successfully." && echo "true" || echo "false")

    # Optional tests are reported as a warning instead of failing the job.
    # (A stray "{% raw %}" template tag previously sat inside this comparison
    # and made it impossible for it to ever match "true".)
    if [[ "$IS_SUCCESS" == "false" && "${{ inputs.is_optional }}" == "true" ]]; then
      echo "::warning:: Test failed, but displayed as successful because it is marked as optional."
      IS_SUCCESS=true
    fi

    if [[ "$IS_SUCCESS" == "false" ]]; then
      echo Test did not finish successfully.
      exit 1
    fi
237+
238+
# Prints the coverage report from inside the container. The shell runs with
# tracing (-x) and strict error handling (-e -u -o pipefail).
- name: Test coverage
  shell: bash -x -e -u -o pipefail {0}
  run: docker exec -t nemo_container_${{ github.run_id }} coverage report -i
242+
243+
# Uploads the coverage files copied out of the container, under the artifact
# name produced by the "Check result" step. `.coverage` is a dotfile, so
# hidden files must be included explicitly.
- name: Upload artifacts
  if: steps.check.outputs.coverage_report != 'none'
  uses: actions/upload-artifact@v4
  with:
    name: ${{ steps.check.outputs.coverage_report }}
    include-hidden-files: true
    path: |
      coverage.xml
      .coverage

0 commit comments

Comments
 (0)