Skip to content

Commit 7490df3

Browse files
authored
Merge branch 'main' into askmanu/docs-release-0.2.0-2
2 parents fbc702a + 46e6253 commit 7490df3

File tree

172 files changed

+17197
-4131
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

172 files changed

+17197
-4131
lines changed
Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
name: Build container
16+
17+
# Inputs accepted by the "Build container" composite action.
# Action metadata inputs support only `description`, `required`, `default`
# and `deprecationMessage`; there is no `type` key (that belongs to
# workflow_call / workflow_dispatch inputs), so none is declared here.
inputs:
  azure-client-id:
    description: "Azure Client ID"
    required: true
  azure-tenant-id:
    description: "Azure Tenant ID"
    required: true
  azure-subscription-id:
    description: "Azure Subscription ID"
    required: true
  dockerfile-path:
    description: "Path to dockerfile to build"
    required: true
  # NOTE(review): no step visible in this action reads this input; confirm
  # whether it is still needed.
  has-azure-credentials:
    description: "Has Azure credentials"
    required: false
    default: "false"
  PAT:
    description: "GitHub Personal Access Token"
    required: true
  # Used for the GraphQL repository lookup and, lower-cased, as the image
  # repository name in the registry.
  repo-name:
    description: "The name of the repo to build container"
    required: true
41+
42+
# NOTE(review): `env` is not among the documented top-level keys of action
# metadata; confirm that `env.container-registry` used by the steps resolves
# from here and not only from the calling workflow's own `env` block.
env:
43+
container-registry: nemoci.azurecr.io
44+
45+
# Composite action: the entries under `steps` run inside the calling job.
runs:
46+
using: "composite"
47+
steps:
48+
# Clone the repository into the NeMo-Automodel/ subdirectory of the workspace.
- uses: actions/checkout@v4
  name: Checkout
  with:
    path: "NeMo-Automodel"
52+
53+
# Install the Python interpreter used by later steps.
- name: Setup python
  uses: actions/setup-python@v5
  with:
    # Quoted so YAML keeps the version a string instead of parsing a float
    # (an unquoted 3.10, for example, would silently become 3.1).
    python-version: "3.12"
57+
58+
# Look up pull-request metadata; only runs for refs under
# refs/heads/pull-request/. Later steps read the `pr-info` output of this step.
# NOTE(review): the action is referenced by the mutable `@main` ref — consider
# pinning to a tag or commit SHA.
- id: get-pr-info
  name: Get PR info
  if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') }}
  uses: nv-gha-runners/get-pr-info@main
62+
63+
# Install the `az` command-line tool needed by the Azure login steps.
- name: Install Azure CLI
  shell: bash
  run: |
    echo "::group::Install Azure CLI"
    # Download the installer script from aka.ms/InstallAzureCLIDeb and run it
    # with root privileges.
    curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
    echo "::endgroup::"
70+
71+
# Sign in to Azure with the identity supplied by the caller.
# The `secrets` context is not available inside a composite action, so the
# credentials are taken from this action's declared inputs instead.
- name: Azure Login
  uses: azure/login@v2
  with:
    client-id: ${{ inputs.azure-client-id }}
    tenant-id: ${{ inputs.azure-tenant-id }}
    subscription-id: ${{ inputs.azure-subscription-id }}
77+
78+
# Log in to the `nemoci` Azure Container Registry via the Azure CLI.
- name: Azure ACR Login
  shell: bash
  run: az acr login --name nemoci
82+
83+
# Install the GitHub CLI (`gh`), used by the step that queries merged PRs.
- name: Install GH CLI
  shell: bash
  run: |
    # Package installation needs root; the Azure CLI install step already
    # relies on sudo being present on the runner, so use it here as well.
    sudo apt-get update
    sudo apt-get install -y gh
88+
89+
# Export REPO_LOWER (the repo-name input, lower-cased) to the job environment
# so later steps can use it when building image references.
- name: Normalize repo name to lowercase
  env:
    REPO: ${{ inputs.repo-name }}
  shell: bash
  run: |
    # ${VAR,,} is bash's lower-casing parameter expansion.
    printf '%s\n' "REPO_LOWER=${REPO,,}" >> "$GITHUB_ENV"
95+
96+
# Build a list of registry cache sources, one per recently merged PR, and
# expose it as the multi-line step output `LAST_PRS`.
- name: Get last merged PR
  shell: bash
  id: cache_from
  env:
    GH_TOKEN: ${{ github.token }}
    # Passed through the environment rather than interpolated into the script,
    # so the input value can never be interpreted as shell syntax.
    REPO: ${{ inputs.repo-name }}
  run: |
    # Ask for the 100 most recently updated merged PRs and emit one
    # "<number>-buildcache" cache reference per PR.
    LAST_PRS=$(gh api graphql \
      -F owner="NVIDIA-NeMo" \
      -F name="$REPO" \
      -f query='
        query($owner: String!, $name: String!) {
          repository(owner: $owner, name: $name) {
            pullRequests(states: MERGED, first: 100, orderBy: {field: UPDATED_AT, direction: DESC}) {
              nodes {
                number
              }
            }
          }
        }' | jq -r '.data.repository.pullRequests.nodes[].number' | while read -r number; do
          echo "type=registry,ref=${{ env.container-registry }}/${{ env.REPO_LOWER }}:$number-buildcache,mode=max"
        done)

    # Multi-line outputs use the NAME<<DELIMITER ... DELIMITER form; tee also
    # echoes the value into the job log.
    echo "LAST_PRS<<EOF" | tee -a "$GITHUB_OUTPUT"
    echo "$LAST_PRS" | tee -a "$GITHUB_OUTPUT"
    echo "EOF" | tee -a "$GITHUB_OUTPUT"
121+
122+
# Set up Docker Buildx ahead of the build-and-push step.
- uses: docker/setup-buildx-action@v3
  name: Set up Docker Buildx
124+
125+
# Build the image from the requested Dockerfile and push it to the registry,
# tagged with both the PR number and the commit SHA.
# Wherever the PR number is used, it falls back to 0 when the get-pr-info step
# produced no output (for example, when that step was skipped).
- name: Build and push
  uses: docker/build-push-action@v5
  with:
    file: ${{ inputs.dockerfile-path }}
    push: true
    context: .
    build-args: |
      BASE_IMAGE=pytorch
      INSTALL_TE=True
    # Cache sources: this PR's cache, the main-branch cache, then the caches
    # of recently merged PRs computed by the `cache_from` step.
    cache-from: |
      type=registry,ref=${{ env.container-registry }}/${{ env.REPO_LOWER }}:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}-buildcache,mode=max
      type=registry,ref=${{ env.container-registry }}/${{ env.REPO_LOWER }}:main-buildcache,mode=max
      ${{ steps.cache_from.outputs.LAST_PRS }}
    cache-to: |
      type=registry,ref=${{ env.container-registry }}/${{ env.REPO_LOWER }}:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}-buildcache,mode=max
    no-cache: false
    tags: |
      ${{ env.container-registry }}/${{ env.REPO_LOWER }}:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}
      ${{ env.container-registry }}/${{ env.REPO_LOWER }}:${{ github.sha }}
    # The `secrets` context is not available inside a composite action; the
    # token reaches this action through its `PAT` input.
    secrets: |
      GH_TOKEN=${{ inputs.PAT }}

.github/actions/test-template/action.yml

Lines changed: 16 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,18 @@ inputs:
5050
PAT:
5151
description: "GitHub Personal Access Token"
5252
required: true
53+
container-image:
54+
description: "Container image to use for test"
55+
required: true
56+
5357
runs:
5458
using: "composite"
5559
steps:
60+
# Clone the repository into NeMo-Automodel/ before the Azure and container steps.
# Uses checkout v4, matching the other checkout steps introduced in this
# change, instead of the outdated v2 release.
- name: Checkout repository
  uses: actions/checkout@v4
  with:
    path: NeMo-Automodel
64+
5665
- name: Install Azure CLI
5766
if: ${{ inputs.has-azure-credentials == 'true' }}
5867
shell: bash
@@ -69,6 +78,11 @@ runs:
6978
tenant-id: ${{ inputs.azure-tenant-id }}
7079
subscription-id: ${{ inputs.azure-subscription-id }}
7180

81+
# Log in to the `nemoci` Azure Container Registry via the Azure CLI.
# NOTE(review): the neighbouring Azure steps are gated on
# `inputs.has-azure-credentials == 'true'`; confirm this step is meant to run
# unconditionally.
- name: Azure ACR Login
  shell: bash
  run: az acr login --name nemoci
85+
7286
- name: Azure Fileshare
7387
if: ${{ inputs.has-azure-credentials == 'true' && inputs.is-unit-test == 'false' }}
7488
shell: bash
@@ -125,23 +139,6 @@ runs:
125139
ls -al $MNT_PATH/TestData
126140
echo "::endgroup::"
127141
128-
- name: Checkout repository
129-
uses: actions/checkout@v2
130-
with:
131-
path: NeMo-Automodel
132-
133-
- name: Build container
134-
shell: bash
135-
env:
136-
GH_TOKEN: ${{ inputs.PAT }}
137-
run: |
138-
echo "::group::Build test container"
139-
docker system prune -af
140-
docker build -f docker/Dockerfile \
141-
--build-arg BASE_IMAGE=pytorch \
142-
--target automodel_final -t automodel .
143-
echo "::endgroup::"
144-
145142
- name: Start container
146143
shell: bash
147144
run: |
@@ -170,7 +167,7 @@ runs:
170167
--volume $(pwd)/NeMo-Automodel:/workspace \
171168
--workdir /workspace \
172169
--volume $MNT_PATH/TestData:/home/TestData \
173-
automodel \
170+
${{ inputs.container-image }} \
174171
bash -c "sleep $(( ${{ inputs.timeout }} * 60 + 60 ))"
175172
RUN_TEST_EOF
176173
)
@@ -257,4 +254,4 @@ runs:
257254
path: |
258255
coverage.xml
259256
.coverage
260-
include-hidden-files: true
257+
include-hidden-files: true

.github/workflows/cicd-main.yml

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ permissions:
2929
id-token: write
3030
contents: read
3131

32+
env:
33+
container-registry: nemoci.azurecr.io
34+
3235
jobs:
3336
pre-flight:
3437
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.64.2
@@ -129,6 +132,31 @@ jobs:
129132
run: |
130133
echo "Running CI tests"
131134
135+
# Builds the CI container image by invoking the repository-local
# build-container action. Runs after the pre-flight and queue jobs.
cicd-container-build:
  needs:
    - pre-flight
    - cicd-wait-in-queue
  # Run when the upstream jobs succeeded, or when pre-flight flags the run as
  # a CI workload / forced full run — but never once the run was cancelled.
  if: |
    (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
    )
    && !cancelled()
  runs-on: self-hosted-nemo
  environment: nemo-ci
  steps:
    # The repository must be checked out before a `./` action path can resolve.
    - uses: actions/checkout@v4
      name: Checkout
    - uses: ./.github/actions/build-container
      name: main
      with:
        azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
        azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
        azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
        dockerfile-path: './docker/Dockerfile'
        has-azure-credentials: 'true'
        PAT: ${{ secrets.PAT }}
        repo-name: 'Automodel'
159+
132160
cicd-unit-tests:
133161
strategy:
134162
fail-fast: false
@@ -142,7 +170,7 @@ jobs:
142170
runner: self-hosted-nemo
143171
cpu-only: false
144172
timeout: 30
145-
needs: [pre-flight, cicd-wait-in-queue]
173+
needs: [pre-flight, cicd-wait-in-queue, cicd-container-build]
146174
runs-on: ${{ matrix.runner }}
147175
name: ${{ matrix.test-name }}
148176
environment: nemo-ci
@@ -167,6 +195,7 @@ jobs:
167195
azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
168196
azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
169197
PAT: ${{ secrets.PAT }}
198+
container-image: ${{ env.container-registry }}/automodel:${{ github.sha }}
170199

171200
cicd-e2e-tests:
172201
strategy:
@@ -231,6 +260,8 @@ jobs:
231260
azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
232261
azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
233262
PAT: ${{ secrets.PAT }}
263+
container-image: ${{ env.container-registry }}/automodel:${{ github.sha }}
264+
234265

235266
Nemo_CICD_Test:
236267
needs:

.github/workflows/install-test.yml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -92,15 +92,15 @@ jobs:
9292
- name: Set up UV
9393
uses: astral-sh/setup-uv@v1
9494
with:
95-
version: 0.7.2
95+
version: 0.9.26
9696

9797
- name: Set up yq
9898
shell: bash
9999
run: |
100100
wget https://github.com/mikefarah/yq/releases/download/v4.45.4/yq_linux_amd64.tar.gz
101101
tar -xzf yq_linux_amd64.tar.gz
102102
mkdir -p ./bin
103-
mv yq_linux_amd64 ./bin/yq
103+
mv yq_linux_amd64 ./bin/yq
104104
chmod +x ./bin/yq
105105
106106
- name: Install project
@@ -110,7 +110,7 @@ jobs:
110110
shell: bash
111111
run: |
112112
echo -e "machine github.com\n login token\n password ${{ secrets.PAT }}" > ~/.netrc
113-
chmod 600 ~/.netrc
113+
chmod 600 ~/.netrc
114114
115115
uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages
116116
source ./venv/bin/activate
@@ -208,7 +208,7 @@ jobs:
208208
- name: Set up UV
209209
uses: astral-sh/setup-uv@v1
210210
with:
211-
version: 0.7.2
211+
version: 0.9.26
212212

213213
- name: Install dependencies with UV
214214
env:
@@ -217,7 +217,7 @@ jobs:
217217
run: |
218218
219219
echo -e "machine github.com\n login token\n password ${{ secrets.PAT }}" > ~/.netrc
220-
chmod 600 ~/.netrc
220+
chmod 600 ~/.netrc
221221
222222
export PATH="${UV_PROJECT_ENVIRONMENT}/bin/:$PATH"
223223

.github/workflows/release-docs.yml

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,21 +13,24 @@
1313
# limitations under the License.
1414
name: Release docs
1515
on:
16+
push:
17+
branches:
18+
- main
19+
paths:
20+
- 'docs/**'
21+
1622
workflow_dispatch:
1723
inputs:
1824
dry-run:
1925
description: Whether to run the workflow in dry-run mode
2026
required: true
2127
type: boolean
22-
default: true
23-
version-number:
24-
description: Version number to release this as (use `latest` for main branch)
25-
required: true
26-
type: string
28+
default: false
2729
notify-emails:
2830
description: Email addresses to send the notification to. Format as "me@me.com,you@you.com".
2931
required: true
3032
type: string
33+
default: "akoumparouli@nvidia.com"
3134
aws-region:
3235
description: AWS region
3336
required: false

0 commit comments

Comments
 (0)