Skip to content

Commit f053d84

Browse files
Initial CI/CD setup for GPU tests (#242)
Signed-off-by: Keval Morabia <[email protected]>
1 parent 917a5d4 commit f053d84

File tree

4 files changed

+113
-9
lines changed

4 files changed

+113
-9
lines changed

.github/workflows/gpu_tests.yml

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
# Runs the GPU test suite (tox env `py312-cuda12-gpu`) inside an NVIDIA PyTorch
# container whenever a `pull-request/<number>` branch receives a push that
# touches the workflow itself, the library, the tests, or the build config.
name: GPU tests

on:
  push:
    branches: ["pull-request/[0-9]+"]
    paths:
      - ".github/workflows/gpu_tests.yml"
      - "modelopt/**"
      - "tests/**"
      - "setup.py"
      - "tox.ini"

# Cancel previous runs if new commit is pushed to the same PR.
# This workflow is triggered by `push`, so `github.event.pull_request` is not
# populated; keying the group on it would collapse every branch into a single
# group and let unrelated PRs cancel each other. `github.ref` is the pushed
# `refs/heads/pull-request/<number>` branch and is unique per PR.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  gpu-tests:
    # Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
    runs-on: linux-amd64-gpu-h100-latest-1
    timeout-minutes: 60
    container:
      image: nvcr.io/nvidia/pytorch:25.04-py3
      env:
        GIT_DEPTH: 1000  # For correct version for tests/gpu/torch/quantization/plugins/test_megatron.py
        # NOTE(review): workflow `env` values are not passed through a shell, so
        # `${LD_LIBRARY_PATH}` may reach the container as a literal string - confirm.
        LD_LIBRARY_PATH: "/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}"  # Add libcudnn*.so and libnv*.so to path.
        PIP_CONSTRAINT: ""  # Disable pip constraint for upgrading packages
    steps:
      - uses: actions/checkout@v4
      - name: Install dependencies
        # tox-current-env provides the `--current-env` flag used below, so tests
        # run against the packages already present in the container image.
        run: pip install tox-current-env
      - name: Run gpu tests
        run: tox -e py312-cuda12-gpu --current-env

.github/workflows/unit_tests.yml

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,7 @@ jobs:
2424
- uses: actions/setup-python@v5
2525
with:
2626
python-version: "3.12"
27-
# Build onnxsim from sdists for Python 3.12 until http://github.com/daquexian/onnx-simplifier/pull/353
2827
- name: Install dependencies
29-
run: |
30-
pip install onnxsim
31-
pip install tox
28+
run: pip install tox
3229
- name: Run unit tests
3330
run: tox -e py312-torch27-unit

tests/unit/torch/quantization/test_quant_rnn.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ def test_fake_quant_per_channel(self, original_cls, bidirectional):
211211

212212
out1 = quant_rnn_object(test_input)[0]
213213
out2 = rnn_object_original(test_input)[0]
214-
assert torch.allclose(out1, out2)
214+
assert torch.allclose(out1, out2, atol=1e-5)
215215

216216
@pytest.mark.parametrize(
217217
("original_cls", "bidirectional"),

tox.ini

Lines changed: 77 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[tox]
22
envlist=
33
pre-commit-all
4-
py312-torch27-unit
4+
py312-{torch27-unit,cuda12-gpu}
55
skipsdist = True
66
toxworkdir = /tmp/{env:USER}-modelopt-tox
77

@@ -11,6 +11,9 @@ toxworkdir = /tmp/{env:USER}-modelopt-tox
1111
############################
1212
[testenv:{py39,py310,py311,py312}-torch{24,25,26,27}-unit]
1313
deps =
14+
# Build onnxsim from sdists for Python 3.12 until http://github.com/daquexian/onnx-simplifier/pull/353
15+
py312: onnxsim
16+
1417
# torch version auto-selected based on torchvision version
1518
torch24: torchvision~=0.19.0
1619
torch25: torchvision~=0.20.0
@@ -22,6 +25,53 @@ commands =
2225
python -m pytest tests/unit --cov
2326

2427

28+
#####################################################################
29+
# Environment to run unit tests with subset of dependencies installed
30+
#####################################################################
31+
[testenv:{py39,py310,py311,py312}-ext-unit-{onnx,torch,torch_deploy}]
32+
allowlist_externals =
33+
bash, rm
34+
deps =
35+
# Build onnxsim from sdists for Python 3.12 until http://github.com/daquexian/onnx-simplifier/pull/353
36+
py312: onnxsim
37+
38+
# ONNX unit tests heavily rely on torch / torchvision
39+
onnx: .[onnx,dev-test]
40+
onnx: torchvision
41+
42+
# Install megatron-core to test torch-only install can still import plugins
43+
torch: megatron-core
44+
torch: .[dev-test]
45+
46+
torch_deploy: .[onnx,torch,dev-test]
47+
commands =
48+
onnx: python -m pytest tests/unit/onnx
49+
torch: python -m pytest tests/unit/torch --ignore tests/unit/torch/deploy
50+
torch_deploy: python -m pytest tests/unit/torch/deploy
51+
52+
53+
########################################################
54+
# GPU test environments (Can be used with --current-env)
55+
########################################################
56+
[testenv:{py39,py310,py311,py312}-cuda12-gpu]
57+
commands_pre =
58+
# Install deps here so that it gets installed even in --current-env
59+
pip install -U megatron-core
60+
pip install git+https://github.com/Dao-AILab/fast-hadamard-transform.git
61+
62+
# Install Eagle-3 test dependencies
63+
pip install tiktoken blobfile sentencepiece
64+
65+
# Build onnxsim from sdists for Python 3.12 until http://github.com/daquexian/onnx-simplifier/pull/353
66+
py312: pip install onnxsim
67+
68+
# NOTE: User is expected to have correct torch-cuda version pre-installed if using --current-env
69+
# to avoid possible CUDA version mismatch
70+
pip install -e .[all,dev-test]
71+
commands =
72+
# Coverage fails with "Can't combine line data with arc data" error so not using "--cov"
73+
python -m pytest tests/gpu
74+
2575
#############################################
2676
# Code quality checks on all files or on diff
2777
#############################################
@@ -33,9 +83,9 @@ commands =
3383
diff: pre-commit run --from-ref origin/main --to-ref HEAD {posargs}
3484

3585

36-
#####################
37-
# Documentation build
38-
#####################
86+
#########################
87+
# Run documentation build
88+
#########################
3989
[testenv:{build,debug}-docs]
4090
allowlist_externals =
4191
rm
@@ -50,3 +100,26 @@ commands_pre =
50100
commands =
51101
sphinx-build source build/html --fail-on-warning --show-traceback --keep-going
52102
debug: sphinx-autobuild source build/html --host 0.0.0.0
103+
104+
105+
#################
106+
# Run wheel build
107+
#################
108+
[testenv:build-wheel]
109+
allowlist_externals =
110+
bash, cd, rm
111+
passenv =
112+
SETUPTOOLS_SCM_PRETEND_VERSION
113+
deps =
114+
twine
115+
commands =
116+
# Clean build directory to avoid any stale files getting into the wheel
117+
rm -rf build
118+
119+
# Build and check wheel
120+
pip wheel --no-deps --wheel-dir=dist .
121+
twine check dist/*
122+
123+
# Install and test the wheel
124+
bash -c "find dist -name 'nvidia_modelopt-*.whl' | xargs pip install -f dist"
125+
bash -c "cd dist; python -c 'import modelopt; print(modelopt.__version__);'"

0 commit comments

Comments
 (0)