Skip to content

Commit 0d8538a

Browse files
Add Internal GitLab CI config
Signed-off-by: Keval Morabia <[email protected]>
1 parent 73d2610 commit 0d8538a

File tree

6 files changed

+163
-4
lines changed

6 files changed

+163
-4
lines changed

.github/workflows/gpu_tests.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# NOTE: Make sure this file is consistent with .gitlab/tests.yml
12
name: GPU tests
23

34
on:
@@ -46,7 +47,7 @@ jobs:
4647
if: needs.check-file-changes.outputs.any_changed == 'true'
4748
# Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
4849
runs-on: linux-amd64-gpu-h100-latest-1
49-
timeout-minutes: 60
50+
timeout-minutes: 90
5051
container:
5152
image: nvcr.io/nvidia/pytorch:25.06-py3
5253
env:

.github/workflows/unit_tests.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# NOTE: Make sure this file is consistent with .gitlab/tests.yml
12
name: Unit tests
23

34
on:
@@ -84,7 +85,7 @@ jobs:
8485
timeout-minutes: 30
8586
strategy:
8687
matrix:
87-
torch: [25, 26, 27]
88+
torch: [26, 27]
8889
steps:
8990
- uses: actions/checkout@v4
9091
- uses: actions/setup-python@v5

.gitlab/.gitlab-ci.yml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Root pipeline configuration: pipeline-wide behaviour, job defaults, stage
# order, and the files that define the actual jobs.
workflow:
  auto_cancel:
    # NOTE(review): per the GitLab CI reference this mode only cancels jobs
    # flagged `interruptible: true` -- confirm the included jobs set that flag.
    on_new_commit: interruptible

# Settings every job inherits unless it overrides them.
default:
  image: python:3.12
  # Use a runner with these tags
  tags:
    - type/docker
    - os/linux
    - cpu

# List of stages for jobs, and their order of execution
stages:
  - tests
  - release

# Job definitions live in these files.
include:
  - .gitlab/tests.yml
  - .gitlab/release.yml

.gitlab/release.yml

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# Upload to PyPI. For external releases with KitMaker, we need to check compliance and use jfrog cli
2+
# Builds the wheel with tox and uploads it. Which destination and which
# credentials are used is decided by the rule that matched (see `rules`).
build-and-upload-wheels:
  stage: release
  timeout: 15m
  # Use a runner with these tags
  tags:
    - type/docker
    - os/linux
  variables:
    GIT_DEPTH: "1000"  # For correct version naming (e.g. 0.1.dev20) of nightly builds
  rules:
    # Never run this job when JET_ONLY is set.
    - if: $JET_ONLY != null
      when: never
    # Tag of the form X.Y.Z: release upload to Artifactory.
    - if: $CI_COMMIT_TAG =~ /^\d+\.\d+\.\d+$/
      variables:
        RELEASE: "true"
        TWINE_USERNAME: svc-dl-algo-ammo
        TWINE_PASSWORD: $ARTIFACTORY_TOKEN  # Configured in GitLab > Settings > CI/CD
        REPO_URL: https://urm.nvidia.com/artifactory/api/pypi/sw-dl-algo-ammo-pypi-local
    # Scheduled pipeline: upload to this project's GitLab package registry.
    - if: $CI_PIPELINE_SOURCE == "schedule"
      variables:
        RELEASE: "false"
        TWINE_USERNAME: gitlab-ci-token
        TWINE_PASSWORD: $CI_JOB_TOKEN
        REPO_URL: $CI_API_V4_URL/projects/$CI_PROJECT_ID/packages/pypi
  script:
    - pip install tox
    - tox -e build-wheel
    # KitMaker compliance checker: https://gitlab-master.nvidia.com/dl/pypi/Wheel-CI-CD/
    # - |
    #   if [[ $RELEASE == "true" ]]; then
    #     curl -fsSL https://get.docker.com | sh
    #     docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
    #     docker run --pull=always --rm --network=host \
    #       -e IGNORE_FAILED_PIP_INSTALL="1" \
    #       -e EXPECTED_PKG_LICENSE="Apache 2.0" \
    #       -e SKIPPED_SECURITY_RULES="" \
    #       -e ALLOWED_NOSEC_COUNT="0" \
    #       -v dist:/workspace/ \
    #       gitlab-master.nvidia.com:5005/dl/pypi/wheel-ci-cd:wheeltamer
    #   fi
    # Shell tracing (`set -x`) is NOT enabled in the release branch: the
    # `jf rt upload` command line carries --password, and xtrace would print
    # the expanded token into the job log. Tracing is switched on only in the
    # twine branch, where credentials are read from the TWINE_* environment
    # variables and never appear on a command line.
    - |
      set -e
      if [[ $RELEASE == "true" ]]; then
        curl -fL https://install-cli.jfrog.io | sh
        jf rt upload "dist/*.whl" sw-dl-algo-ammo-pypi-local/nvidia-modelopt/release/$CI_COMMIT_TAG/ \
          --url=https://urm.nvidia.com/artifactory --user=$TWINE_USERNAME --password=$TWINE_PASSWORD \
          --target-props="component_name=nvidia-modelopt;os=any;arch=any;version=$CI_COMMIT_TAG;branch=release;release_approver=kmorabia;release_status=ready" \
          --flat --detailed-summary
      else
        set -x
        pip install twine
        twine upload --repository-url $REPO_URL dist/*.whl
      fi
  # Keep the built wheel(s) as job artifacts.
  artifacts:
    paths:
      - dist/

.gitlab/tests.yml

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
# NOTE: Make sure this file is consistent with .github/workflows/{unit,gpu}_tests.yml
2+
# Template for the test jobs in this file; jobs pull it in via `extends`.
.tests-default:
  stage: tests
  rules:
    # Never run when JET_ONLY is set.
    - if: $JET_ONLY != null
      when: never
    # Run for tags of the form X.Y.Z ...
    - if: $CI_COMMIT_TAG =~ /^\d+\.\d+\.\d+$/
    # ... and for pipelines started from the web UI or by a schedule.
    - if: '$CI_PIPELINE_SOURCE == "web" || $CI_PIPELINE_SOURCE == "schedule"'
  # Default Python minor version and torch version used by the unit jobs.
  variables:
    PYTHON: "12"
    TORCH: "28"
12+
13+
##### Unit Tests #####
14+
# CPU unit tests for one Python/torch combination, selected through the
# PYTHON and TORCH variables.
unit:
  extends: .tests-default
  image: python:3.$PYTHON
  timeout: 30m
  before_script:
    # Install cmake to build onnxsim from sdists for Python 3.12 until http://github.com/daquexian/onnx-simplifier/pull/353
    - |
      if [ "$PYTHON" = "12" ]; then
        apt-get update && apt-get install -y cmake
      fi
    - pip install tox
  script:
    - tox -e py3$PYTHON-torch$TORCH-unit
24+
25+
# Same as `unit`, fanned out over additional Python minor versions.
multi-py-unit:
  extends: unit
  parallel:
    matrix:
      - PYTHON:
          - "10"
          - "11"
30+
31+
# Same as `unit`, fanned out over additional torch versions.
multi-torch-unit:
  extends: unit
  parallel:
    matrix:
      - TORCH:
          - "26"
          - "27"
36+
37+
##### GPU Tests #####
38+
# GPU test suite, run inside the NVIDIA PyTorch container.
gpu:
  extends: .tests-default
  # Matches `timeout-minutes: 90` in .github/workflows/gpu_tests.yml, which
  # this file is required to stay consistent with.
  timeout: 90m
  image: nvcr.io/nvidia/pytorch:25.06-py3
  variables:
    GIT_DEPTH: "1000"  # For correct version for tests/gpu/torch/quantization/plugins/test_megatron.py
    LD_LIBRARY_PATH: "/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}"  # Add libcudnn*.so and libnv*.so to path.
    PIP_CONSTRAINT: ""  # Disable pip constraint for upgrading packages
  # Use a runner with these tags
  tags:
    - docker
    - linux
    - 2-gpu
  script:
    # Use pre-installed packages without a new venv with tox-current-env
    - pip install tox-current-env
    - tox -e py312-cuda12-gpu --current-env
51+
52+
##### Example Tests #####
53+
# Example tests: one parallel job per entry in the TEST matrix. The stage is
# inherited from .tests-default.
example:
  extends: .tests-default
  image: gitlab-master.nvidia.com:5005/omniml/modelopt/modelopt_examples:latest
  timeout: 45m
  allow_failure: true  # Allow to continue next stages even if job is canceled (e.g. during release)
  tags:
    - docker
    - linux
    - 2-gpu
    - "sm<89"
  variables:
    # Chooses between the pytest and the bash entry point in `script`.
    TEST_TYPE: pytest
  parallel:
    matrix:
      - TEST:
          - diffusers
          - llm_distill
          - llm_qat
          - llm_sparsity
          - onnx_ptq
          - speculative_decoding
  before_script:
    - pip install ".[all]" -U
  script:
    # Uninstall apex since T5 Int8 (PixArt) + Apex is not supported as per https://github.com/huggingface/transformers/issues/21391
    - |
      if [ "$TEST" = "diffusers" ]; then
        pip uninstall -y apex
      fi
    - |
      if [ "$TEST_TYPE" = "pytest" ]; then
        pytest -s tests/examples/$TEST
      else
        bash tests/examples/test_$TEST.sh
      fi
71+
72+
# Variant of `example` with its own timeout, runner tags and test matrix.
# NOTE(review): the `sm>=89` tag presumably selects Ada-or-newer GPUs -- confirm.
example-ada:
  extends: example
  timeout: 60m
  tags:
    - docker
    - linux
    - 2-gpu
    - "sm>=89"
  parallel:
    matrix:
      - TEST:
          - llm_eval
          - llm_ptq
          - vlm_ptq
          - llm_autodeploy
      # onnx_ptq is run with TEST_TYPE set to bash here.
      - TEST:
          - onnx_ptq
        TEST_TYPE: bash
81+
82+
##### Megatron / NeMo Integration Tests #####
83+
# Triggers a pipeline on the `main` branch of the nmm-sandbox project and
# passes the current ref name along as UPSTREAM_REF.
megatron-nemo-integration:
  extends: .tests-default
  trigger:
    project: omniml/integration/nmm-sandbox
    branch: main
    strategy: depend  # Make sure the upstream task is waiting for the downstream task
  variables:
    UPSTREAM_REF: "$CI_COMMIT_REF_NAME"

tox.ini

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,12 @@ toxworkdir = /tmp/{env:USER}-modelopt-tox
99
############################
1010
# CPU Unit test environments
1111
############################
12-
[testenv:{py310,py311,py312}-torch{25,26,27,28}-unit]
12+
[testenv:{py310,py311,py312}-torch{26,27,28}-unit]
1313
deps =
1414
# Build onnxsim from sdists for Python 3.12 until http://github.com/daquexian/onnx-simplifier/pull/353
1515
py312: onnxsim
1616

1717
# torch version auto-selected based on torchvision version
18-
torch25: torchvision~=0.20.0
1918
torch26: torchvision~=0.21.0
2019
torch27: torchvision~=0.22.0
2120
torch28: torchvision~=0.23.0

0 commit comments

Comments
 (0)