Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/gpu_tests.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# NOTE: Make sure this file is consistent with .gitlab/tests.yml
name: GPU tests

on:
Expand Down Expand Up @@ -46,7 +47,7 @@ jobs:
if: needs.check-file-changes.outputs.any_changed == 'true'
# Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
runs-on: linux-amd64-gpu-h100-latest-1
timeout-minutes: 60
timeout-minutes: 90
container:
image: nvcr.io/nvidia/pytorch:25.06-py3
env:
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/unit_tests.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# NOTE: Make sure this file is consistent with .gitlab/tests.yml
name: Unit tests

on:
Expand Down Expand Up @@ -84,7 +85,7 @@ jobs:
timeout-minutes: 30
strategy:
matrix:
torch: [25, 26, 27]
torch: [26, 27]
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
Expand Down
15 changes: 15 additions & 0 deletions .gitlab/.gitlab-ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Top-level GitLab CI pipeline configuration.
workflow:
  auto_cancel:
    # Cancel redundant pipelines for the same ref, but only jobs marked interruptible.
    on_new_commit: interruptible

default:
  image: python:3.12
  tags: [type/docker, os/linux, cpu] # Use a runner with these tags

stages: # List of stages for jobs, and their order of execution
  - tests
  - release

include:
  - .gitlab/tests.yml
  - .gitlab/release.yml
53 changes: 53 additions & 0 deletions .gitlab/release.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Upload to PyPI. For external releases with KitMaker, we need to check compliance and use jfrog cli
build-and-upload-wheels:
  variables:
    GIT_DEPTH: "1000" # For correct version naming (e.g. 0.1.dev20) of nightly builds
  stage: release
  timeout: 15m
  tags: [type/docker, os/linux] # Use a runner with these tags
  rules:
    # Skip entirely when running JET-only pipelines.
    - if: $JET_ONLY != null
      when: never
    # External release: triggered by a semver tag (e.g. 1.2.3) -> upload to Artifactory.
    - if: $CI_COMMIT_TAG =~ /^\d+\.\d+\.\d+$/
      variables:
        RELEASE: "true"
        TWINE_USERNAME: svc-dl-algo-ammo
        TWINE_PASSWORD: $ARTIFACTORY_TOKEN # Configured in GitLab > Settings > CI/CD
        REPO_URL: https://urm.nvidia.com/artifactory/api/pypi/sw-dl-algo-ammo-pypi-local
    # Nightly build: scheduled pipeline -> upload to the GitLab package registry.
    - if: $CI_PIPELINE_SOURCE == "schedule"
      variables:
        RELEASE: "false"
        TWINE_USERNAME: gitlab-ci-token
        TWINE_PASSWORD: $CI_JOB_TOKEN
        REPO_URL: $CI_API_V4_URL/projects/$CI_PROJECT_ID/packages/pypi
  script:
    - pip install tox
    - tox -e build-wheel
    # KitMaker compliance checker: https://gitlab-master.nvidia.com/dl/pypi/Wheel-CI-CD/
    # - |
    #   if [[ $RELEASE == "true" ]]; then
    #     curl -fsSL https://get.docker.com | sh
    #     docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
    #     docker run --pull=always --rm --network=host \
    #       -e IGNORE_FAILED_PIP_INSTALL="1" \
    #       -e EXPECTED_PKG_LICENSE="Apache 2.0" \
    #       -e SKIPPED_SECURITY_RULES="" \
    #       -e ALLOWED_NOSEC_COUNT="0" \
    #       -v dist:/workspace/ \
    #       gitlab-master.nvidia.com:5005/dl/pypi/wheel-ci-cd:wheeltamer
    #   fi
    - |
      set -ex
      if [[ $RELEASE == "true" ]]; then
        curl -fL https://install-cli.jfrog.io | sh
        jf rt upload "dist/*.whl" sw-dl-algo-ammo-pypi-local/nvidia-modelopt/release/$CI_COMMIT_TAG/ \
          --url=https://urm.nvidia.com/artifactory --user=$TWINE_USERNAME --password=$TWINE_PASSWORD \
          --target-props="component_name=nvidia-modelopt;os=any;arch=any;version=$CI_COMMIT_TAG;branch=release;release_approver=kmorabia;release_status=ready" \
          --flat --detailed-summary
      else
        pip install twine
        twine upload --repository-url $REPO_URL dist/*.whl
      fi
  artifacts:
    paths:
      - dist/
90 changes: 90 additions & 0 deletions .gitlab/tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# NOTE: Make sure this file is consistent with .github/workflows/{unit,gpu}_tests.yml
# Hidden base job: shared stage, default interpreter/torch versions, and run rules.
.tests-default:
  stage: tests
  variables:
    # Quoted so "12"/"28" stay strings when interpolated into tox env names (py3$PYTHON).
    PYTHON: "12"
    TORCH: "28"
  rules:
    - if: $JET_ONLY != null
      when: never
    - if: $CI_COMMIT_TAG =~ /^\d+\.\d+\.\d+$/
    - if: $CI_PIPELINE_SOURCE == "web" || $CI_PIPELINE_SOURCE == "schedule"

##### Unit Tests #####
unit:
  extends: .tests-default
  timeout: 30m
  image: python:3.$PYTHON
  before_script:
    # Install cmake to build onnxsim from sdists for Python 3.12 until https://github.com/daquexian/onnx-simplifier/pull/353
    - if [ "$PYTHON" = "12" ]; then apt-get update && apt-get install -y cmake; fi
    - pip install tox
  script:
    - tox -e py3$PYTHON-torch$TORCH-unit

multi-py-unit:
  extends: unit
  parallel:
    matrix:
      - PYTHON: ["10", "11"]

multi-torch-unit:
  extends: unit
  parallel:
    matrix:
      - TORCH: ["26", "27"]

##### GPU Tests #####
gpu:
  extends: .tests-default
  timeout: 60m
  image: nvcr.io/nvidia/pytorch:25.06-py3
  variables:
    GIT_DEPTH: "1000" # For correct version for tests/gpu/torch/quantization/plugins/test_megatron.py
    LD_LIBRARY_PATH: "/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" # Add libcudnn*.so and libnv*.so to path.
    PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
  tags: [docker, linux, 2-gpu]
  script:
    # Use pre-installed packages without a new venv with tox-current-env
    - pip install tox-current-env
    - tox -e py312-cuda12-gpu --current-env

##### Example Tests #####
example:
  extends: .tests-default # stage: tests is inherited from the base job
  timeout: 45m
  image: gitlab-master.nvidia.com:5005/omniml/modelopt/modelopt_examples:latest
  variables:
    TEST_TYPE: pytest
  # Quoted: "<" in a plain scalar is legal but easy to misread; keep it explicit.
  tags: [docker, linux, 2-gpu, "sm<89"]
  parallel:
    matrix:
      - TEST: [diffusers, llm_distill, llm_qat, llm_sparsity, onnx_ptq, speculative_decoding]
  allow_failure: true # Allow to continue next stages even if job is canceled (e.g. during release)
  before_script:
    - pip install ".[all]" -U
  script:
    # Uninstall apex since T5 Int8 (PixArt) + Apex is not supported as per https://github.com/huggingface/transformers/issues/21391
    - if [ "$TEST" = "diffusers" ]; then pip uninstall -y apex; fi
    - if [ "$TEST_TYPE" = "pytest" ]; then pytest -s tests/examples/$TEST; else bash tests/examples/test_$TEST.sh; fi

example-ada:
  extends: example
  timeout: 60m
  tags: [docker, linux, 2-gpu, "sm>=89"]
  parallel:
    matrix:
      - TEST: [llm_eval, llm_ptq, vlm_ptq, llm_autodeploy]
      - TEST: [onnx_ptq]
        TEST_TYPE: bash

##### Megatron / NeMo Integration Tests #####
megatron-nemo-integration:
  extends: .tests-default
  variables:
    UPSTREAM_REF: $CI_COMMIT_REF_NAME
  trigger:
    project: omniml/integration/nmm-sandbox
    branch: main
    strategy: depend # Make sure the upstream task is waiting for the downstream task
3 changes: 1 addition & 2 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,12 @@ toxworkdir = /tmp/{env:USER}-modelopt-tox
############################
# CPU Unit test environments
############################
[testenv:{py310,py311,py312}-torch{25,26,27,28}-unit]
[testenv:{py310,py311,py312}-torch{26,27,28}-unit]
deps =
# Build onnxsim from sdists for Python 3.12 until http://github.com/daquexian/onnx-simplifier/pull/353
py312: onnxsim

# torch version auto-selected based on torchvision version
torch25: torchvision~=0.20.0
torch26: torchvision~=0.21.0
torch27: torchvision~=0.22.0
torch28: torchvision~=0.23.0
Expand Down
Loading