Skip to content

Commit 465d71c

Browse files
authored
Merge branch 'main' into akoumparouli/fix_lowercase_automodel_dir
2 parents 244aa1b + 3cb35cc commit 465d71c

File tree

505 files changed

+6650
-136299
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

505 files changed

+6650
-136299
lines changed

.github/actions/test-template/action.yml

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,8 @@ runs:
140140
- name: Checkout repository
141141
uses: actions/checkout@v2
142142
with:
143-
path: VFM
143+
path: DFM
144+
submodules: recursive
144145

145146
- name: Start container
146147
shell: bash
@@ -164,7 +165,7 @@ runs:
164165
--env HYDRA_FULL_ERROR=1 \
165166
--env HF_HOME=/home/TestData/HF_HOME \
166167
--env RUN_ID=${{ github.run_id }} \
167-
--volume $(pwd)/VFM:/workspace \
168+
--volume $(pwd)/DFM:/opt/DFM \
168169
--volume $MNT_PATH/TestData:/home/TestData \
169170
nemoci.azurecr.io/${{ inputs.image }}:${{ github.run_id }} \
170171
bash -c "sleep $(( ${{ inputs.timeout }} * 60 + 60))"
@@ -181,17 +182,22 @@ runs:
181182
COVERAGE_PREFIX=$([[ "${{ inputs.is_unit_test }}" == "true" ]] && echo "unit-test" || echo "e2e")
182183
echo "coverage-prefix=$COVERAGE_PREFIX" | tee -a "$GITHUB_OUTPUT"
183184
184-
cmd=$(cat <<RUN_TEST_EOF
185+
cmd=$(cat <<'RUN_TEST_EOF'
185186
#!/bin/bash
186187
187-
(
188+
docker exec -t nemo_container_${{ github.run_id }} bash -c '
188189
set -e
190+
source /opt/venv/bin/activate
191+
uv pip install --no-deps -e .
192+
193+
timeout $(( ${{ inputs.timeout }} * 60 ))s bash tests/${{ inputs.is_unit_test == 'true' && 'unit_tests' || 'functional_tests' }}/${{ inputs.script }}.sh || EXIT_CODE=$?
194+
195+
if [[ "$EXIT_CODE" -eq 124 ]]; then
196+
echo "::error:: Test timed out after ${{ inputs.timeout }} minutes."
197+
fi
189198
190-
docker exec -t nemo_container_${{ github.run_id }} bash -c '\
191-
uv pip install --no-deps -e . && \
192-
bash tests/${{ inputs.is_unit_test == 'true' && 'unit_tests' || 'functional_tests' }}/${{ inputs.script }}.sh && \
193-
echo "Finished successfully." || echo "Did not finish."'
194-
) 2>&1 | tee err.log
199+
exit $EXIT_CODE
200+
'
195201
196202
RUN_TEST_EOF
197203
)
@@ -200,6 +206,7 @@ runs:
200206
echo "$cmd" | tee "job.sh"
201207
202208
- name: Run main script
209+
id: run-main-script
203210
uses: nick-fields/retry@v3
204211
with:
205212
timeout_seconds: ${{ steps.create.outputs.timeout_in_seconds }}
@@ -213,15 +220,16 @@ runs:
213220
id: check
214221
shell: bash
215222
run: |
216-
docker exec nemo_container_${{ github.run_id }} coverage combine || true
217-
docker exec nemo_container_${{ github.run_id }} coverage xml
218-
docker cp nemo_container_${{ github.run_id }}:/workspace/.coverage .coverage
219-
docker cp nemo_container_${{ github.run_id }}:/workspace/coverage.xml coverage.xml
223+
docker exec nemo_container_${{ github.run_id }} /opt/venv/bin/coverage combine || true
224+
docker exec nemo_container_${{ github.run_id }} /opt/venv/bin/coverage xml
225+
docker cp nemo_container_${{ github.run_id }}:/opt/DFM/.coverage .coverage
226+
docker cp nemo_container_${{ github.run_id }}:/opt/DFM/coverage.xml coverage.xml
220227
221228
coverage_report=coverage-${{ steps.create.outputs.coverage-prefix }}-${{ github.run_id }}-$(uuidgen)
222229
echo "coverage_report=$coverage_report" >> "$GITHUB_OUTPUT"
223230
224-
IS_SUCCESS=$(tail -n 1 err.log | grep -q "Finished successfully." && echo "true" || echo "false")
231+
EXIT_CODE=${{ steps.run-main-script.outputs.exit_code }}
232+
IS_SUCCESS=$([[ "$EXIT_CODE" -eq 0 ]] && echo "true" || echo "false")
225233
226234
if [[ "$IS_SUCCESS" == "false" && "${{ inputs.is_optional }}" == "true" ]]; then
227235
echo "::warning:: Test failed, but displayed as successful because it is marked as optional."
@@ -231,15 +239,12 @@ runs:
231239
if [[ "$IS_SUCCESS" == "false" ]]; then
232240
echo Test did not finish successfully.
233241
exit 1
242+
else
243+
docker exec -t nemo_container_${{ github.run_id }} /opt/venv/bin/coverage report -i
234244
fi
235245
236246
exit $EXIT_CODE
237247
238-
- name: Test coverage
239-
shell: bash -x -e -u -o pipefail {0}
240-
run: |
241-
docker exec -t nemo_container_${{ github.run_id }} coverage report -i
242-
243248
- name: Upload artifacts
244249
uses: actions/upload-artifact@v4
245250
if: ${{ steps.check.outputs.coverage_report != 'none' }}

.github/workflows/build-docs.yml

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,4 +21,41 @@ on:
2121

2222
jobs:
2323
build-docs:
24-
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_docs.yml@v0.57.0
24+
runs-on: ubuntu-latest
25+
steps:
26+
- name: Checkout repository
27+
uses: actions/checkout@v4
28+
29+
- name: Set up Python
30+
uses: actions/setup-python@v5
31+
with:
32+
python-version: '3.12'
33+
34+
- name: Install uv
35+
run: |
36+
curl -LsSf https://astral.sh/uv/0.8.22/install.sh | sh
37+
echo "$HOME/.local/bin" >> $GITHUB_PATH
38+
39+
- name: Create virtual environment
40+
run: uv venv
41+
42+
- name: Install docs dependencies (skip project deps to avoid CUDA)
43+
run: |
44+
uv pip install \
45+
"myst-parser>=4.0.1" \
46+
"nvidia-sphinx-theme>=0.0.8" \
47+
"sphinx>=8.1.3" \
48+
"sphinx-autobuild>=2024.10.3" \
49+
"sphinx-autodoc2>=0.5.0" \
50+
"sphinx-copybutton>=0.5.2"
51+
52+
- name: Build documentation
53+
run: |
54+
source .venv/bin/activate
55+
sphinx-build -b html docs docs/build/html
56+
57+
- name: Upload docs artifact
58+
uses: actions/upload-artifact@v4
59+
with:
60+
name: documentation
61+
path: docs/build/html/

.github/workflows/cicd-main.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@ jobs:
6868
steps:
6969
- name: Checkout
7070
uses: actions/checkout@v4
71+
with:
72+
submodules: recursive
7173
- name: main
7274
uses: ./.github/actions/test-template
7375
with:
@@ -97,6 +99,8 @@ jobs:
9799
steps:
98100
- name: Checkout
99101
uses: actions/checkout@v4
102+
with:
103+
submodules: recursive
100104
- name: main
101105
uses: ./.github/actions/test-template
102106
with:
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
name: Generate UV Lockfile
16+
17+
on:
18+
workflow_dispatch: # Manual trigger only
19+
20+
jobs:
21+
generate-lockfile:
22+
runs-on: ubuntu-latest
23+
steps:
24+
- name: Free up massive disk space BEFORE pulling container
25+
run: |
26+
echo "Disk space before cleanup:"
27+
df -h
28+
# Remove EVERYTHING unnecessary from the runner
29+
sudo rm -rf /usr/share/dotnet
30+
sudo rm -rf /usr/local/lib/android
31+
sudo rm -rf /opt/ghc
32+
sudo rm -rf /opt/hostedtoolcache/CodeQL
33+
sudo rm -rf /usr/local/share/boost
34+
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
35+
sudo apt-get remove -y '^dotnet-.*' '^llvm-.*' 'php.*' '^mongodb-.*' '^mysql-.*' azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel libgl1-mesa-dri || true
36+
sudo apt-get autoremove -y
37+
sudo apt-get clean
38+
sudo docker system prune -a -f
39+
echo "Disk space after cleanup:"
40+
df -h
41+
42+
- name: Checkout repository
43+
uses: actions/checkout@v4
44+
with:
45+
submodules: recursive
46+
47+
- name: Generate lockfile in NGC container
48+
run: |
49+
docker run --rm \
50+
-v $PWD:/workspace \
51+
-w /workspace \
52+
nvcr.io/nvidia/pytorch:25.09-py3 \
53+
bash -c '
54+
# Install uv
55+
curl -LsSf https://astral.sh/uv/0.8.22/install.sh | sh
56+
export PATH="$HOME/.local/bin:$PATH"
57+
58+
# Create venv with system packages (this makes container torch available)
59+
uv venv /opt/venv --system-site-packages
60+
61+
# Set environment variables like Megatron-Bridge does
62+
export UV_PROJECT_ENVIRONMENT=/opt/venv
63+
export VIRTUAL_ENV=/opt/venv
64+
export PATH="$VIRTUAL_ENV/bin:$PATH"
65+
66+
# Pre-install build dependencies before any sync/lock operation
67+
uv pip install setuptools wheel pybind11 "Cython>=3.0.0" "numpy<2.0.0" ninja packaging poetry
68+
69+
# Generate lockfile with the EXACT configuration from pyproject.toml
70+
# No modifications! This ensures lockfile matches what Dockerfile will use
71+
uv lock --no-build-isolation || { echo "uv lock failed!"; exit 1; }
72+
73+
# Show disk usage
74+
df -h
75+
'
76+
77+
- name: Check for lockfile changes
78+
id: check_changes
79+
run: |
80+
if git diff --quiet uv.lock; then
81+
echo "changed=false" >> $GITHUB_OUTPUT
82+
else
83+
echo "changed=true" >> $GITHUB_OUTPUT
84+
fi
85+
86+
- name: Upload lockfile artifact
87+
uses: actions/upload-artifact@v4
88+
with:
89+
name: uv-lockfile
90+
path: uv.lock
91+
retention-days: 7
92+
93+
- name: Commit lockfile (if changed)
94+
if: steps.check_changes.outputs.changed == 'true' && github.event_name == 'workflow_dispatch'
95+
run: |
96+
git config --global user.name "github-actions[bot]"
97+
git config --global user.email "github-actions[bot]@users.noreply.github.com"
98+
git add uv.lock
99+
git commit -m "Update uv.lock [skip ci]"
100+
git push origin HEAD:${{ github.ref_name }}
101+
102+
- name: Comment on PR with lockfile status
103+
if: github.event_name == 'pull_request'
104+
uses: actions/github-script@v7
105+
with:
106+
script: |
107+
const changed = '${{ steps.check_changes.outputs.changed }}';
108+
const message = changed === 'true'
109+
? '⚠️ **uv.lock needs to be regenerated**\n\nThe lockfile is out of sync with pyproject.toml. Please run the "Generate UV Lockfile" workflow manually or regenerate locally on Linux.'
110+
: '✅ **uv.lock is up to date**\n\nThe lockfile is in sync with pyproject.toml.';
111+
112+
github.rest.issues.createComment({
113+
issue_number: context.issue.number,
114+
owner: context.repo.owner,
115+
repo: context.repo.repo,
116+
body: message
117+
});

.gitmodules

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
[submodule "3rdparty/Automodel"]
2+
path = 3rdparty/Automodel
3+
url = https://github.com/NVIDIA-NeMo/Automodel.git
4+
[submodule "3rdparty/Megatron-Bridge"]
5+
path = 3rdparty/Megatron-Bridge
6+
url = https://github.com/NVIDIA-NeMo/Megatron-Bridge.git

3rdparty/Automodel

Submodule Automodel added at a5f0652

3rdparty/Megatron-Bridge

Submodule Megatron-Bridge added at 8e21f81

dfm/__init__.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
from dfm.package_info import (
15+
__contact_emails__,
16+
__contact_names__,
17+
__description__,
18+
__download_url__,
19+
__homepage__,
20+
__keywords__,
21+
__license__,
22+
__package_name__,
23+
__repository_url__,
24+
__shortversion__,
25+
__version__,
26+
)
Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,12 @@
2424
__shortversion__ = ".".join(map(str, VERSION[:3]))
2525
__version__ = ".".join(map(str, VERSION[:3])) + "".join(VERSION[3:])
2626

27-
__package_name__ = "nemo_vfm"
27+
__package_name__ = "dfm"
2828
__contact_names__ = "NVIDIA"
2929
__contact_emails__ = "nemo-toolkit@nvidia.com"
30-
__homepage__ = "https://github.com/NVIDIA-NeMo/NeMo-VFM"
31-
__repository_url__ = "https://github.com/NVIDIA-NeMo/NeMo-VFM"
32-
__download_url__ = "https://github.com/NVIDIA-NeMo/NeMo-VFM/releases"
33-
__description__ = "NeMo VFM"
30+
__homepage__ = "https://github.com/NVIDIA-NeMo/NeMo-DFM"
31+
__repository_url__ = "https://github.com/NVIDIA-NeMo/NeMo-DFM"
32+
__download_url__ = "https://github.com/NVIDIA-NeMo/NeMo-DFM/releases"
33+
__description__ = "NeMo DFM"
3434
__license__ = "Apache2"
3535
__keywords__ = "deep learning, machine learning, gpu, NLP, pytorch, torch"
File renamed without changes.

0 commit comments

Comments
 (0)