diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml
new file mode 100644
index 000000000..327e6633c
--- /dev/null
+++ b/.github/workflows/gpu_tests.yml
@@ -0,0 +1,38 @@
+name: GPU tests
+
+on:
+  push:
+    branches: ["pull-request/[0-9]+"]
+    paths:
+      - ".github/workflows/gpu_tests.yml"
+      - "modelopt/**"
+      - "tests/**"
+      - "setup.py"
+      - "tox.ini"
+
+# Cancel previous runs if a new commit is pushed to the same pull-request/<N> branch.
+# Keyed on github.ref because push events do not populate github.event.pull_request.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  gpu-tests:
+    # Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
+    runs-on: linux-amd64-gpu-h100-latest-1
+    timeout-minutes: 60
+    container:
+      image: nvcr.io/nvidia/pytorch:25.04-py3
+      env:
+        GIT_DEPTH: 1000  # For correct version for tests/gpu/torch/quantization/plugins/test_megatron.py
+        PIP_CONSTRAINT: ""  # Disable pip constraint for upgrading packages
+    steps:
+      - uses: actions/checkout@v4
+      # Add libcudnn*.so and libnv*.so to path. Done in a step because `env:` values are
+      # passed through literally, so `${LD_LIBRARY_PATH}` would not be expanded there.
+      - name: Setup environment variables
+        run: echo "LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" >> "$GITHUB_ENV"
+      - name: Install dependencies
+        run: pip install tox-current-env
+      - name: Run gpu tests
+        run: tox -e py312-cuda12-gpu --current-env
diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index 181225a25..b279a2764 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -24,10 +24,7 @@ jobs:
       - uses: actions/setup-python@v5
         with:
           python-version: "3.12"
-      # Build onnxsim from sdists for Python 3.12 until http://github.com/daquexian/onnx-simplifier/pull/353
       - name: Install dependencies
-        run: |
-          pip install onnxsim
-          pip install tox
+        run: pip install tox
       - name: Run unit tests
         run: tox -e py312-torch27-unit
diff --git a/tests/unit/torch/quantization/test_quant_rnn.py b/tests/unit/torch/quantization/test_quant_rnn.py
index b297028b7..284ba4c70 100644
--- a/tests/unit/torch/quantization/test_quant_rnn.py
+++ b/tests/unit/torch/quantization/test_quant_rnn.py
@@ -211,7 +211,7 @@ def test_fake_quant_per_channel(self, original_cls, bidirectional):
 
         out1 = quant_rnn_object(test_input)[0]
         out2 = rnn_object_original(test_input)[0]
-        assert torch.allclose(out1, out2)
+        assert torch.allclose(out1, out2, atol=1e-5)
 
     @pytest.mark.parametrize(
         ("original_cls", "bidirectional"),
diff --git a/tox.ini b/tox.ini
index e5fd614e3..aa5223938 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,7 +1,8 @@
 [tox]
 envlist=
     pre-commit-all
     py312-torch27-unit
+    py312-cuda12-gpu
 skipsdist = True
 toxworkdir = /tmp/{env:USER}-modelopt-tox
 
@@ -11,6 +12,9 @@ toxworkdir = /tmp/{env:USER}-modelopt-tox
 ############################
 [testenv:{py39,py310,py311,py312}-torch{24,25,26,27}-unit]
 deps =
+    # Build onnxsim from sdists for Python 3.12 until http://github.com/daquexian/onnx-simplifier/pull/353
+    py312: onnxsim
+
     # torch version auto-selected based on torchvision version
     torch24: torchvision~=0.19.0
     torch25: torchvision~=0.20.0
@@ -22,6 +26,53 @@ commands =
     python -m pytest tests/unit --cov
 
 
+#####################################################################
+# Environment to run unit tests with subset of dependencies installed
+#####################################################################
+[testenv:{py39,py310,py311,py312}-ext-unit-{onnx,torch,torch_deploy}]
+allowlist_externals =
+    bash, rm
+deps =
+    # Build onnxsim from sdists for Python 3.12 until http://github.com/daquexian/onnx-simplifier/pull/353
+    py312: onnxsim
+
+    # ONNX unit tests heavily rely on torch / torchvision
+    onnx: .[onnx,dev-test]
+    onnx: torchvision
+
+    # Install megatron-core to test torch-only install can still import plugins
+    torch: megatron-core
+    torch: .[dev-test]
+
+    torch_deploy: .[onnx,torch,dev-test]
+commands =
+    onnx: python -m pytest tests/unit/onnx
+    torch: python -m pytest tests/unit/torch --ignore tests/unit/torch/deploy
+    torch_deploy: python -m pytest tests/unit/torch/deploy
+
+
+########################################################
+# GPU test environments (Can be used with --current-env)
+########################################################
+[testenv:{py39,py310,py311,py312}-cuda12-gpu]
+commands_pre =
+    # Install deps here so that it gets installed even in --current-env
+    pip install -U megatron-core
+    pip install git+https://github.com/Dao-AILab/fast-hadamard-transform.git
+
+    # Install Eagle-3 test dependencies
+    pip install tiktoken blobfile sentencepiece
+
+    # Build onnxsim from sdists for Python 3.12 until http://github.com/daquexian/onnx-simplifier/pull/353
+    py312: pip install onnxsim
+
+    # NOTE: User is expected to have correct torch-cuda version pre-installed if using --current-env
+    # to avoid possible CUDA version mismatch
+    pip install -e .[all,dev-test]
+commands =
+    # Coverage fails with "Can't combine line data with arc data" error so not using "--cov"
+    python -m pytest tests/gpu
+
 #############################################
 # Code quality checks on all files or on diff
 #############################################
@@ -33,9 +84,9 @@ commands =
     diff: pre-commit run --from-ref origin/main --to-ref HEAD {posargs}
 
 
-#####################
-# Documentation build
-#####################
+#########################
+# Run documentation build
+#########################
 [testenv:{build,debug}-docs]
 allowlist_externals =
     rm
@@ -50,3 +101,26 @@ commands_pre =
 commands =
     sphinx-build source build/html --fail-on-warning --show-traceback --keep-going
     debug: sphinx-autobuild source build/html --host 0.0.0.0
+
+
+#################
+# Run wheel build
+#################
+[testenv:build-wheel]
+allowlist_externals =
+    bash, cd, rm
+passenv =
+    SETUPTOOLS_SCM_PRETEND_VERSION
+deps =
+    twine
+commands =
+    # Clean build directory to avoid any stale files getting into the wheel
+    rm -rf build
+
+    # Build and check wheel
+    pip wheel --no-deps --wheel-dir=dist .
+    twine check dist/*
+
+    # Install and test the wheel
+    bash -c "find dist -name 'nvidia_modelopt-*.whl' | xargs pip install -f dist"
+    bash -c "cd dist; python -c 'import modelopt; print(modelopt.__version__);'"