diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml
index b9b2ce540..987e6ad57 100644
--- a/.github/workflows/gpu_tests.yml
+++ b/.github/workflows/gpu_tests.yml
@@ -1,3 +1,4 @@
+# NOTE: Make sure this file is consistent with .gitlab/tests.yml
 name: GPU tests
 
 on:
@@ -46,7 +47,7 @@ jobs:
     if: needs.check-file-changes.outputs.any_changed == 'true'
     # Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
     runs-on: linux-amd64-gpu-h100-latest-1
-    timeout-minutes: 60
+    timeout-minutes: 90
     container:
       image: nvcr.io/nvidia/pytorch:25.06-py3
       env:
diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index de3568158..fb3e67788 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -1,3 +1,4 @@
+# NOTE: Make sure this file is consistent with .gitlab/tests.yml
 name: Unit tests
 
 on:
@@ -84,7 +85,7 @@ jobs:
     timeout-minutes: 30
     strategy:
       matrix:
-        torch: [25, 26, 27]
+        torch: [26, 27]
     steps:
       - uses: actions/checkout@v4
       - uses: actions/setup-python@v5
diff --git a/.gitlab/.gitlab-ci.yml b/.gitlab/.gitlab-ci.yml
new file mode 100644
index 000000000..49393b141
--- /dev/null
+++ b/.gitlab/.gitlab-ci.yml
@@ -0,0 +1,15 @@
+workflow:
+  auto_cancel:
+    on_new_commit: interruptible
+
+default:
+  image: python:3.12
+  tags: [type/docker, os/linux, cpu] # Use a runner with these tags
+
+stages: # List of stages for jobs, and their order of execution
+  - tests
+  - release
+
+include:
+  - .gitlab/tests.yml
+  - .gitlab/release.yml
diff --git a/.gitlab/release.yml b/.gitlab/release.yml
new file mode 100644
index 000000000..4bf5480e0
--- /dev/null
+++ b/.gitlab/release.yml
@@ -0,0 +1,53 @@
+# Upload to PyPI. For external releases with KitMaker, we need to run the compliance checker and use the JFrog CLI
+build-and-upload-wheels:
+  variables:
+    GIT_DEPTH: 1000 # For correct version naming (e.g. 0.1.dev20) of nightly builds
+  stage: release
+  timeout: 15m
+  tags: [type/docker, os/linux] # Use a runner with these tags
+  rules:
+    - if: $JET_ONLY != null
+      when: never
+    - if: $CI_COMMIT_TAG =~ /^\d+\.\d+\.\d+$/
+      variables:
+        RELEASE: "true"
+        TWINE_USERNAME: svc-dl-algo-ammo
+        TWINE_PASSWORD: $ARTIFACTORY_TOKEN # Configured in GitLab > Settings > CI/CD
+        REPO_URL: https://urm.nvidia.com/artifactory/api/pypi/sw-dl-algo-ammo-pypi-local
+    - if: $CI_PIPELINE_SOURCE == "schedule"
+      variables:
+        RELEASE: "false"
+        TWINE_USERNAME: gitlab-ci-token
+        TWINE_PASSWORD: $CI_JOB_TOKEN
+        REPO_URL: $CI_API_V4_URL/projects/$CI_PROJECT_ID/packages/pypi
+  script:
+    - pip install tox
+    - tox -e build-wheel
+    # KitMaker compliance checker: https://gitlab-master.nvidia.com/dl/pypi/Wheel-CI-CD/
+    # - |
+    #   if [[ $RELEASE == "true" ]]; then
+    #     curl -fsSL https://get.docker.com | sh
+    #     docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
+    #     docker run --pull=always --rm --network=host \
+    #       -e IGNORE_FAILED_PIP_INSTALL="1" \
+    #       -e EXPECTED_PKG_LICENSE="Apache 2.0" \
+    #       -e SKIPPED_SECURITY_RULES="" \
+    #       -e ALLOWED_NOSEC_COUNT="0" \
+    #       -v dist:/workspace/ \
+    #       gitlab-master.nvidia.com:5005/dl/pypi/wheel-ci-cd:wheeltamer
+    #   fi
+    - |
+      set -ex
+      if [[ $RELEASE == "true" ]]; then
+        curl -fL https://install-cli.jfrog.io | sh
+        jf rt upload "dist/*.whl" sw-dl-algo-ammo-pypi-local/nvidia-modelopt/release/$CI_COMMIT_TAG/ \
+          --url=https://urm.nvidia.com/artifactory --user=$TWINE_USERNAME --password=$TWINE_PASSWORD \
+          --target-props="component_name=nvidia-modelopt;os=any;arch=any;version=$CI_COMMIT_TAG;branch=release;release_approver=kmorabia;release_status=ready" \
+          --flat --detailed-summary
+      else
+        pip install twine
+        twine upload --repository-url $REPO_URL dist/*.whl
+      fi
+  artifacts:
+    paths:
+      - dist/
diff --git a/.gitlab/tests.yml b/.gitlab/tests.yml
new file mode 100644
index 000000000..4525e764f
--- /dev/null
+++ b/.gitlab/tests.yml
@@ -0,0 +1,90 @@
+# NOTE: Make sure this file is consistent with .github/workflows/{unit,gpu}_tests.yml
+.tests-default:
+  stage: tests
+  variables:
+    PYTHON: 12
+    TORCH: 28
+  rules:
+    - if: $JET_ONLY != null
+      when: never
+    - if: $CI_COMMIT_TAG =~ /^\d+\.\d+\.\d+$/
+    - if: $CI_PIPELINE_SOURCE == "web" || $CI_PIPELINE_SOURCE == "schedule"
+
+##### Unit Tests #####
+unit:
+  extends: .tests-default
+  timeout: 30m
+  image: python:3.$PYTHON
+  before_script:
+    # Install cmake to build onnxsim from sdists for Python 3.12 until http://github.com/daquexian/onnx-simplifier/pull/353
+    - if [ "$PYTHON" = "12" ]; then apt-get update && apt-get install -y cmake; fi
+    - pip install tox
+  script:
+    - tox -e py3$PYTHON-torch$TORCH-unit
+
+multi-py-unit:
+  extends: unit
+  parallel:
+    matrix:
+      - PYTHON: [10, 11]
+
+multi-torch-unit:
+  extends: unit
+  parallel:
+    matrix:
+      - TORCH: [26, 27]
+
+##### GPU Tests #####
+gpu:
+  extends: .tests-default
+  timeout: 60m
+  image: nvcr.io/nvidia/pytorch:25.06-py3
+  variables:
+    GIT_DEPTH: 1000 # For correct version naming, needed by tests/gpu/torch/quantization/plugins/test_megatron.py
+    LD_LIBRARY_PATH: "/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" # Add libcudnn*.so and libnv*.so to path.
+    PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
+  tags: [docker, linux, 2-gpu]
+  script:
+    # Use tox-current-env to reuse pre-installed packages instead of creating a new venv
+    - pip install tox-current-env
+    - tox -e py312-cuda12-gpu --current-env
+
+##### Example Tests #####
+example:
+  extends: .tests-default
+  stage: tests
+  timeout: 45m
+  image: gitlab-master.nvidia.com:5005/omniml/modelopt/modelopt_examples:latest
+  variables:
+    TEST_TYPE: pytest
+  tags: [docker, linux, 2-gpu, sm<89]
+  parallel:
+    matrix:
+      - TEST: [diffusers, llm_distill, llm_qat, llm_sparsity, onnx_ptq, speculative_decoding]
+  allow_failure: true # Allow subsequent stages to continue even if this job is canceled (e.g. during a release)
+  before_script:
+    - pip install ".[all]" -U
+  script:
+    # Uninstall apex since T5 Int8 (PixArt) + Apex is not supported as per https://github.com/huggingface/transformers/issues/21391
+    - if [ "$TEST" = "diffusers" ]; then pip uninstall -y apex; fi
+    - if [ "$TEST_TYPE" = "pytest" ]; then pytest -s tests/examples/$TEST; else bash tests/examples/test_$TEST.sh; fi
+
+example-ada:
+  extends: example
+  timeout: 60m
+  tags: [docker, linux, 2-gpu, sm>=89]
+  parallel:
+    matrix:
+      - TEST: [llm_eval, llm_ptq, vlm_ptq, llm_autodeploy]
+      - TEST: [onnx_ptq]
+        TEST_TYPE: bash
+
+##### Megatron / NeMo Integration Tests #####
+megatron-nemo-integration:
+  extends: .tests-default
+  variables:
+    UPSTREAM_REF: $CI_COMMIT_REF_NAME
+  trigger:
+    project: omniml/integration/nmm-sandbox
+    branch: main
+    strategy: depend # Make the upstream trigger job wait for the downstream pipeline to finish
diff --git a/tox.ini b/tox.ini
index 4ae9bac82..554b310f7 100644
--- a/tox.ini
+++ b/tox.ini
@@ -9,13 +9,12 @@ toxworkdir = /tmp/{env:USER}-modelopt-tox
 ############################
 # CPU Unit test environments
 ############################
-[testenv:{py310,py311,py312}-torch{25,26,27,28}-unit]
+[testenv:{py310,py311,py312}-torch{26,27,28}-unit]
 deps =
     # Build onnxsim from sdists for Python 3.12 until http://github.com/daquexian/onnx-simplifier/pull/353
     py312: onnxsim
     # torch version auto-selected based on torchvision version
-    torch25: torchvision~=0.20.0
     torch26: torchvision~=0.21.0
     torch27: torchvision~=0.22.0
     torch28: torchvision~=0.23.0
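
The GitLab jobs above are thin wrappers around tox environments, so they can be reproduced outside CI. A minimal sketch, assuming tox is installed locally and using the default matrix values (PYTHON=12, TORCH=28); the last command mirrors the gpu job and is only meaningful inside the nvcr.io/nvidia/pytorch:25.06-py3 container, where tox-current-env reuses the pre-installed packages instead of creating a fresh venv:

    pip install tox tox-current-env
    tox -e py312-torch28-unit              # what the `unit` job runs
    tox -e py310-torch26-unit              # one of the multi-py-unit / multi-torch-unit variants
    tox -e py312-cuda12-gpu --current-env  # what the `gpu` job runs (NGC container only)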
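
Likewise, the scheduled (nightly) branch of build-and-upload-wheels reduces to a wheel build plus a twine upload to the project's GitLab package registry. A hedged sketch of that path, assuming the CI-provided variables ($CI_API_V4_URL, $CI_PROJECT_ID, $CI_JOB_TOKEN) or local equivalents are set; tagged releases instead take the JFrog CLI upload path to Artifactory shown in the job script:

    pip install tox twine
    tox -e build-wheel
    TWINE_USERNAME=gitlab-ci-token TWINE_PASSWORD=$CI_JOB_TOKEN \
      twine upload --repository-url "$CI_API_V4_URL/projects/$CI_PROJECT_ID/packages/pypi" dist/*.whl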
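
One caveat: GitLab reads .gitlab-ci.yml from the repository root by default, so the new .gitlab/.gitlab-ci.yml entry point only takes effect once the project's CI/CD configuration file path (Settings > CI/CD > General pipelines) is pointed at it. Once that is set, the merged configuration can be validated without pushing a commit; a sketch against the project CI lint endpoint, where $GITLAB_TOKEN (an API-scoped token) and $PROJECT_ID (the numeric project ID) are placeholders, not values from this diff:

    curl --header "PRIVATE-TOKEN: $GITLAB_TOKEN" \
      "https://gitlab-master.nvidia.com/api/v4/projects/$PROJECT_ID/ci/lint?dry_run=true"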