Skip to content

Commit 0b0058f

Browse files
author
Huy Vu2
committed
update uv.lock, merge main
2 parents f5c10a1 + 3cb35cc commit 0b0058f

File tree

13 files changed

+4268
-828
lines changed

13 files changed

+4268
-828
lines changed

.github/actions/test-template/action.yml

Lines changed: 23 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@ runs:
141141
uses: actions/checkout@v2
142142
with:
143143
path: DFM
144+
submodules: recursive
144145

145146
- name: Start container
146147
shell: bash
@@ -164,7 +165,7 @@ runs:
164165
--env HYDRA_FULL_ERROR=1 \
165166
--env HF_HOME=/home/TestData/HF_HOME \
166167
--env RUN_ID=${{ github.run_id }} \
167-
--volume $(pwd)/DFM:/workspace \
168+
--volume $(pwd)/DFM:/opt/DFM \
168169
--volume $MNT_PATH/TestData:/home/TestData \
169170
nemoci.azurecr.io/${{ inputs.image }}:${{ github.run_id }} \
170171
bash -c "sleep $(( ${{ inputs.timeout }} * 60 + 60))"
@@ -181,17 +182,22 @@ runs:
181182
COVERAGE_PREFIX=$([[ "${{ inputs.is_unit_test }}" == "true" ]] && echo "unit-test" || echo "e2e")
182183
echo "coverage-prefix=$COVERAGE_PREFIX" | tee -a "$GITHUB_OUTPUT"
183184
184-
cmd=$(cat <<RUN_TEST_EOF
185+
cmd=$(cat <<'RUN_TEST_EOF'
185186
#!/bin/bash
186187
187-
(
188+
docker exec -t nemo_container_${{ github.run_id }} bash -c '
188189
set -e
190+
source /opt/venv/bin/activate
191+
uv pip install --no-deps -e .
192+
193+
timeout $(( ${{ inputs.timeout }} * 60 ))s bash tests/${{ inputs.is_unit_test == 'true' && 'unit_tests' || 'functional_tests' }}/${{ inputs.script }}.sh || EXIT_CODE=$?
194+
195+
if [[ "$EXIT_CODE" -eq 124 ]]; then
196+
echo "::error:: Test timed out after ${{ inputs.timeout }} minutes."
197+
fi
189198
190-
docker exec -t nemo_container_${{ github.run_id }} bash -c '\
191-
uv pip install --no-deps -e . && \
192-
bash tests/${{ inputs.is_unit_test == 'true' && 'unit_tests' || 'functional_tests' }}/${{ inputs.script }}.sh && \
193-
echo "Finished successfully." || echo "Did not finish."'
194-
) 2>&1 | tee err.log
199+
exit $EXIT_CODE
200+
'
195201
196202
RUN_TEST_EOF
197203
)
@@ -200,6 +206,7 @@ runs:
200206
echo "$cmd" | tee "job.sh"
201207
202208
- name: Run main script
209+
id: run-main-script
203210
uses: nick-fields/retry@v3
204211
with:
205212
timeout_seconds: ${{ steps.create.outputs.timeout_in_seconds }}
@@ -213,15 +220,16 @@ runs:
213220
id: check
214221
shell: bash
215222
run: |
216-
docker exec nemo_container_${{ github.run_id }} coverage combine || true
217-
docker exec nemo_container_${{ github.run_id }} coverage xml
218-
docker cp nemo_container_${{ github.run_id }}:/workspace/.coverage .coverage
219-
docker cp nemo_container_${{ github.run_id }}:/workspace/coverage.xml coverage.xml
223+
docker exec nemo_container_${{ github.run_id }} /opt/venv/bin/coverage combine || true
224+
docker exec nemo_container_${{ github.run_id }} /opt/venv/bin/coverage xml
225+
docker cp nemo_container_${{ github.run_id }}:/opt/DFM/.coverage .coverage
226+
docker cp nemo_container_${{ github.run_id }}:/opt/DFM/coverage.xml coverage.xml
220227
221228
coverage_report=coverage-${{ steps.create.outputs.coverage-prefix }}-${{ github.run_id }}-$(uuidgen)
222229
echo "coverage_report=$coverage_report" >> "$GITHUB_OUTPUT"
223230
224-
IS_SUCCESS=$(tail -n 1 err.log | grep -q "Finished successfully." && echo "true" || echo "false")
231+
EXIT_CODE=${{ steps.run-main-script.outputs.exit_code }}
232+
IS_SUCCESS=$([[ "$EXIT_CODE" -eq 0 ]] && echo "true" || echo "false")
225233
226234
if [[ "$IS_SUCCESS" == "false" && "${{ inputs.is_optional }}" == "true" ]]; then
227235
echo "::warning:: Test failed, but displayed as successful because it is marked as optional."
@@ -231,15 +239,12 @@ runs:
231239
if [[ "$IS_SUCCESS" == "false" ]]; then
232240
echo Test did not finish successfully.
233241
exit 1
242+
else
243+
docker exec -t nemo_container_${{ github.run_id }} /opt/venv/bin/coverage report -i
234244
fi
235245
236246
exit $EXIT_CODE
237247
238-
- name: Test coverage
239-
shell: bash -x -e -u -o pipefail {0}
240-
run: |
241-
docker exec -t nemo_container_${{ github.run_id }} coverage report -i
242-
243248
- name: Upload artifacts
244249
uses: actions/upload-artifact@v4
245250
if: ${{ steps.check.outputs.coverage_report != 'none' }}

.github/workflows/build-docs.yml

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,4 +21,41 @@ on:
2121

2222
jobs:
2323
build-docs:
24-
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_docs.yml@v0.57.0
24+
runs-on: ubuntu-latest
25+
steps:
26+
- name: Checkout repository
27+
uses: actions/checkout@v4
28+
29+
- name: Set up Python
30+
uses: actions/setup-python@v5
31+
with:
32+
python-version: '3.12'
33+
34+
- name: Install uv
35+
run: |
36+
curl -LsSf https://astral.sh/uv/0.8.22/install.sh | sh
37+
echo "$HOME/.local/bin" >> $GITHUB_PATH
38+
39+
- name: Create virtual environment
40+
run: uv venv
41+
42+
- name: Install docs dependencies (skip project deps to avoid CUDA)
43+
run: |
44+
uv pip install \
45+
"myst-parser>=4.0.1" \
46+
"nvidia-sphinx-theme>=0.0.8" \
47+
"sphinx>=8.1.3" \
48+
"sphinx-autobuild>=2024.10.3" \
49+
"sphinx-autodoc2>=0.5.0" \
50+
"sphinx-copybutton>=0.5.2"
51+
52+
- name: Build documentation
53+
run: |
54+
source .venv/bin/activate
55+
sphinx-build -b html docs docs/build/html
56+
57+
- name: Upload docs artifact
58+
uses: actions/upload-artifact@v4
59+
with:
60+
name: documentation
61+
path: docs/build/html/

.github/workflows/cicd-main.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@ jobs:
6868
steps:
6969
- name: Checkout
7070
uses: actions/checkout@v4
71+
with:
72+
submodules: recursive
7173
- name: main
7274
uses: ./.github/actions/test-template
7375
with:
@@ -97,6 +99,8 @@ jobs:
9799
steps:
98100
- name: Checkout
99101
uses: actions/checkout@v4
102+
with:
103+
submodules: recursive
100104
- name: main
101105
uses: ./.github/actions/test-template
102106
with:
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
name: Generate UV Lockfile
16+
17+
on:
18+
workflow_dispatch: # Manual trigger only
19+
20+
jobs:
21+
generate-lockfile:
22+
runs-on: ubuntu-latest
23+
steps:
24+
- name: Free up massive disk space BEFORE pulling container
25+
run: |
26+
echo "Disk space before cleanup:"
27+
df -h
28+
# Remove EVERYTHING unnecessary from the runner
29+
sudo rm -rf /usr/share/dotnet
30+
sudo rm -rf /usr/local/lib/android
31+
sudo rm -rf /opt/ghc
32+
sudo rm -rf /opt/hostedtoolcache/CodeQL
33+
sudo rm -rf /usr/local/share/boost
34+
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
35+
sudo apt-get remove -y '^dotnet-.*' '^llvm-.*' 'php.*' '^mongodb-.*' '^mysql-.*' azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel libgl1-mesa-dri || true
36+
sudo apt-get autoremove -y
37+
sudo apt-get clean
38+
sudo docker system prune -a -f
39+
echo "Disk space after cleanup:"
40+
df -h
41+
42+
- name: Checkout repository
43+
uses: actions/checkout@v4
44+
with:
45+
submodules: recursive
46+
47+
- name: Generate lockfile in NGC container
48+
run: |
49+
docker run --rm \
50+
-v $PWD:/workspace \
51+
-w /workspace \
52+
nvcr.io/nvidia/pytorch:25.09-py3 \
53+
bash -c '
54+
# Install uv
55+
curl -LsSf https://astral.sh/uv/0.8.22/install.sh | sh
56+
export PATH="$HOME/.local/bin:$PATH"
57+
58+
# Create venv with system packages (this makes container torch available)
59+
uv venv /opt/venv --system-site-packages
60+
61+
# Set environment variables like Megatron-Bridge does
62+
export UV_PROJECT_ENVIRONMENT=/opt/venv
63+
export VIRTUAL_ENV=/opt/venv
64+
export PATH="$VIRTUAL_ENV/bin:$PATH"
65+
66+
# Pre-install build dependencies before any sync/lock operation
67+
uv pip install setuptools wheel pybind11 "Cython>=3.0.0" "numpy<2.0.0" ninja packaging poetry
68+
69+
# Generate lockfile with the EXACT configuration from pyproject.toml
70+
# No modifications! This ensures lockfile matches what Dockerfile will use
71+
uv lock --no-build-isolation || { echo "uv lock failed!"; exit 1; }
72+
73+
# Show disk usage
74+
df -h
75+
'
76+
77+
- name: Check for lockfile changes
78+
id: check_changes
79+
run: |
80+
if git diff --quiet uv.lock; then
81+
echo "changed=false" >> $GITHUB_OUTPUT
82+
else
83+
echo "changed=true" >> $GITHUB_OUTPUT
84+
fi
85+
86+
- name: Upload lockfile artifact
87+
uses: actions/upload-artifact@v4
88+
with:
89+
name: uv-lockfile
90+
path: uv.lock
91+
retention-days: 7
92+
93+
- name: Commit lockfile (if changed)
94+
if: steps.check_changes.outputs.changed == 'true' && github.event_name == 'workflow_dispatch'
95+
run: |
96+
git config --global user.name "github-actions[bot]"
97+
git config --global user.email "github-actions[bot]@users.noreply.github.com"
98+
git add uv.lock
99+
git commit -m "Update uv.lock [skip ci]"
100+
git push origin HEAD:${{ github.ref_name }}
101+
102+
- name: Comment on PR with lockfile status
103+
if: github.event_name == 'pull_request'
104+
uses: actions/github-script@v7
105+
with:
106+
script: |
107+
const changed = '${{ steps.check_changes.outputs.changed }}';
108+
const message = changed === 'true'
109+
? '⚠️ **uv.lock needs to be regenerated**\n\nThe lockfile is out of sync with pyproject.toml. Please run the "Generate UV Lockfile" workflow manually or regenerate locally on Linux.'
110+
: '✅ **uv.lock is up to date**\n\nThe lockfile is in sync with pyproject.toml.';
111+
112+
github.rest.issues.createComment({
113+
issue_number: context.issue.number,
114+
owner: context.repo.owner,
115+
repo: context.repo.repo,
116+
body: message
117+
});

.gitmodules

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
[submodule "3rdparty/Automodel"]
2+
path = 3rdparty/Automodel
3+
url = https://github.com/NVIDIA-NeMo/Automodel.git
4+
[submodule "3rdparty/Megatron-Bridge"]
5+
path = 3rdparty/Megatron-Bridge
6+
url = https://github.com/NVIDIA-NeMo/Megatron-Bridge.git
7+
branch = main

3rdparty/Automodel

Submodule Automodel added at 8134b0c

3rdparty/Megatron-Bridge

Submodule Megatron-Bridge added at 9744106

docker/Dockerfile.ci

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
# limitations under the License.
1414
FROM nvcr.io/nvidia/pytorch:25.09-py3
1515

16-
WORKDIR /workspace
16+
WORKDIR /opt/DFM
1717

1818
# Install uv
1919
ENV UV_VERSION="0.8.22"
@@ -32,6 +32,14 @@ RUN uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages
3232
# Copy dependency files and source code (needed for dynamic version resolution)
3333
COPY pyproject.toml uv.lock ./
3434
COPY dfm ./dfm
35+
COPY 3rdparty ./3rdparty
3536

36-
# Install all dependencies using uv sync (without installing the project itself)
37-
RUN uv sync --link-mode copy --locked --all-groups --no-install-project
37+
# Install dependencies in two steps:
38+
# 1. Install build dependencies first (required for packages with no-build-isolation)
39+
# 2. Install all other dependencies
40+
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
41+
--mount=type=cache,target=/var/lib/apt,sharing=locked \
42+
--mount=type=cache,target=/root/.cache/uv \
43+
uv sync --locked --only-group build && \
44+
uv sync --link-mode copy --locked --all-extras --all-groups --no-install-project && \
45+
uv cache prune

pyproject.toml

Lines changed: 35 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -55,13 +55,13 @@ classifiers = [
5555
"Topic :: Utilities",
5656
]
5757
dependencies = [
58-
"nemo-automodel @ git+https://github.com/NVIDIA-NeMo/Automodel@main",
5958
"diffusers==0.35.1",
6059
"easydict",
6160
"ftfy",
6261
"imageio",
6362
"imageio-ffmpeg",
6463
"opencv-python-headless==4.10.0.84",
64+
"megatron-energon",
6565
]
6666

6767
[build-system]
@@ -90,6 +90,15 @@ dev = [
9090
"ruff>=0.9.9",
9191
"mypy>=1.8.0",
9292
]
93+
build = ["setuptools", "wheel", "torch", "pybind11", "Cython>=3.0.0", "numpy<2.0.0", "ninja", "packaging", "nvidia-mathdx"]
94+
automodel = [
95+
"nemo-automodel",
96+
"diffusers",
97+
"ftfy",
98+
"imageio-ffmpeg",
99+
"opencv-python-headless==4.10.0.84",
100+
]
101+
megatron-bridge = ["megatron-bridge"]
93102

94103
[tool.setuptools]
95104
packages = ["dfm"]
@@ -98,21 +107,36 @@ packages = ["dfm"]
98107
version = {attr = "dfm.__version__"}
99108
readme = {file = "README.md", content-type = "text/markdown"}
100109

110+
[tool.uv]
111+
prerelease = "allow"
112+
# These packages require torch during build, so disable build isolation
113+
no-build-isolation-package = [
114+
"transformer-engine",
115+
"transformer-engine-torch",
116+
"mamba-ssm",
117+
"causal-conv1d",
118+
"nv-grouped-gemm",
119+
"flash_mla",
120+
"flash-linear-attention",
121+
]
122+
override-dependencies = [
123+
"nvidia-modelopt[torch]>=0.37.0",
124+
"torch; sys_platform == 'never'",
125+
"torchvision; sys_platform == 'never'",
126+
"triton; sys_platform == 'never'",
127+
"transformer-engine[pytorch]>=2.9.0a0,<2.10.0",
128+
]
129+
101130
[[tool.uv.index]]
102131
name = "pypi"
103132
url = "https://pypi.org/simple"
104133
explicit = true
105134

106-
[[tool.uv.index]]
107-
name = "pytorch"
108-
url = "https://download.pytorch.org/whl/cu128"
109-
explicit = true
110-
111135
[tool.uv.sources]
112-
torch = [
113-
{ index = "pytorch", marker = "sys_platform != 'darwin'" },
114-
{ index = "pypi", marker = "sys_platform == 'darwin'" },
115-
]
136+
nemo-automodel = { path = "3rdparty/Automodel" }
137+
megatron-bridge = { path = "3rdparty/Megatron-Bridge" }
138+
transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.9" }
139+
nvidia-resiliency-ext = { index = "pypi" }
116140

117141
[project.urls]
118142
Download = "https://github.com/NVIDIA-NeMo/DFM/releases"
@@ -220,4 +244,4 @@ concurrency = ["thread", "multiprocessing"]
220244
omit = ["/tmp/*"]
221245

222246
[tool.coverage.paths]
223-
source = ["dfm/", "/workspace/dfm"]
247+
source = ["dfm/", "/opt/DFM/dfm"]

0 commit comments

Comments
 (0)