benchmarking: replace nvidia-smi with gpustat-based GPU stats, add Sl… #1690
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| # This workflow validates NeMo Curator installation for each optional dependency. | |
| # Tests both pip and uv (lockfile) installations. | |
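| # - CPU extras (pip + uv): CPU runner, install + import validation | |
| # - GPU extras (pip + uv): CPU runner inside the CUDA base image, install-only validation | |
| # - ALL extras (uv, Py3.12): GPU runner, install + import validation | |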
# Workflow identity, triggers, run de-duplication and shared environment.
name: Installation Test

on:
  # Pushes to main and to branches matching pull-request/<digits>.
  push:
    branches: [main, "pull-request/[0-9]+"]
  # Once a day at minute 0 of hour 0.
  schedule:
    - cron: "0 0 * * *"
  # Manual runs.
  workflow_dispatch:

# One active run per workflow and PR number (or ref when the event carries no PR number);
# a newer run cancels the one still in progress.
concurrency:
  cancel-in-progress: true
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}

env:
  UV_HTTP_TIMEOUT: 300  # 300s timeout needed for large GPU packages (cudf, vllm, flash-attn, torch)

jobs:
pre-flight:
  # Calls the shared pre-flight reusable workflow. Later jobs in this file read its
  # `docs_only` and `runner_prefix` outputs.
  uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.80.1
  secrets:
    NVIDIA_MANAGEMENT_ORG_PAT: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }}
  with:
    # Runner prefixes
    default_runner_prefix: ${{ vars.DEFAULT_RUNNER_PREFIX }}
    non_nvidia_runner_prefix: ${{ vars.NON_NVIDIA_RUNNER_PREFIX }}
    # Test data locations
    default_test_data_path: ${{ vars.DEFAULT_TEST_DATA_PATH }}
    non_nvidia_test_data_path: ${{ vars.NON_NVIDIA_TEST_DATA_PATH }}
    # SSO user list
    sso_users_filename: ${{ vars.SSO_USERS_FILENAME }}
cuda-pre-flight:
  # Builds the CUDA base image name from the CUDA_VER / LINUX_VER ARG defaults in
  # docker/Dockerfile and exposes it as the `cuda_base_image` output, which the
  # container-based jobs in this workflow consume.
  runs-on: ubuntu-latest
  outputs:
    cuda_base_image: ${{ steps.cuda_config.outputs.base_image }}
  steps:
    - uses: actions/checkout@v6
      with:
        fetch-depth: 0
    - name: Extract CUDA config
      id: cuda_config
      run: |
        CUDA_VER=$(grep -E "^ARG CUDA_VER=" docker/Dockerfile | cut -d'=' -f2)
        LINUX_VER=$(grep -E "^ARG LINUX_VER=" docker/Dockerfile | cut -d'=' -f2)
        # Fail here, with a clear message, instead of emitting a malformed image name
        # (e.g. "nvidia/cuda:-cudnn-devel-") that would only break the downstream jobs.
        if [ -z "$CUDA_VER" ] || [ -z "$LINUX_VER" ]; then
          echo "::error::Could not read CUDA_VER and/or LINUX_VER from docker/Dockerfile (CUDA_VER='${CUDA_VER}', LINUX_VER='${LINUX_VER}')"
          exit 1
        fi
        echo "base_image=nvidia/cuda:${CUDA_VER}-cudnn-devel-${LINUX_VER}" >> "$GITHUB_OUTPUT"
| # ============================================================================ | |
| # CPU Extras - Install + Import (CPU Runner) | |
| # ============================================================================ | |
cpu-install-test:
  # Installs every CPU extra on its own (once per installer in the matrix), runs a
  # nemo_curator import check after each successful install, and publishes the
  # per-installer outcome as a `cpu-status-<installer>` artifact.
  needs: [pre-flight]
  if: needs.pre-flight.outputs.docs_only != 'true'
  runs-on: ubuntu-latest
  name: "CPU Extras Individual - Install + Import (${{ matrix.installer }}) - Py3.12"
  strategy:
    fail-fast: false
    matrix:
      installer: [pip, uv]
  steps:
    - uses: actions/checkout@v6
      with:
        submodules: recursive
    - name: Free disk space
      run: sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/share/boost
    - name: Set up Python
      uses: actions/setup-python@v6
      with:
        python-version: "3.12"
    - name: Set up uv
      if: matrix.installer == 'uv'
      uses: astral-sh/setup-uv@v6
      with:
        python-version: "3.12"
        enable-cache: false # We use isolated venvs with uv cache clean
    - name: Test each CPU extra (${{ matrix.installer }})
      id: test
      run: |
        # pipefail: the install/import commands are piped through tee, and their exit
        # status (not tee's) must decide success.
        set -o pipefail
        INSTALLER="${{ matrix.installer }}"
        CPU_EXTRAS=("audio_cpu" "image_cpu" "sdg_cpu" "text_cpu" "video_cpu")
        ROWS=()
        FAILED=0

        # mark BOOL: print the table icon for a "true"/"false" flag.
        mark() { if [ "$1" = "true" ]; then echo '✅'; else echo '❌'; fi; }

        for extra in "${CPU_EXTRAS[@]}"; do
          echo "::group::📦 Testing: $extra ($INSTALLER)"
          LOG="install-$extra.log"

          # Each extra gets a fresh environment: a new venv for pip, or a removed
          # .venv plus cleaned cache for uv. Commands are argv arrays (no eval).
          if [ "$INSTALLER" = "pip" ]; then
            python -m venv "venv_$extra"
            source "venv_$extra/bin/activate"
            pip install --upgrade pip
            INSTALL_CMD=(pip install ".[$extra]")
            RUN_CMD=(python)
          else
            rm -rf .venv
            uv cache clean
            INSTALL_CMD=(uv sync --locked --link-mode copy --extra "$extra")
            RUN_CMD=(uv run python)
          fi

          INSTALL_OK=false
          IMPORT_OK=false
          if "${INSTALL_CMD[@]}" 2>&1 | tee "$LOG"; then
            echo "✅ $extra: INSTALL SUCCESS"
            INSTALL_OK=true
            # Import test
            if "${RUN_CMD[@]}" -c "from nemo_curator import package_info; print(f' nemo_curator v{package_info.__version__}')" 2>&1 | tee -a "$LOG"; then
              echo "✅ $extra: IMPORT SUCCESS"
              IMPORT_OK=true
            else
              echo "❌ $extra: IMPORT FAILED"
            fi
          else
            echo "❌ $extra: INSTALL FAILED"
            echo ""
            echo "=== ERROR for $extra ==="
            # Show the recognisable error lines; if none match, fall back to the log tail.
            grep -E "(ERROR:|error:|Cannot install|conflicting dependencies|ResolutionImpossible|No space left|Failed to install)" "$LOG" | tail -15 || tail -30 "$LOG"
            echo "========================="
          fi

          ROWS+=("| $extra | $(mark "$INSTALL_OK") | $(mark "$IMPORT_OK") |")
          if [ "$INSTALL_OK" != "true" ] || [ "$IMPORT_OK" != "true" ]; then
            FAILED=$((FAILED + 1))
          fi

          # Cleanup
          rm -f "$LOG"
          if [ "$INSTALLER" = "pip" ]; then
            deactivate
            rm -rf "venv_$extra"
          else
            rm -rf .venv
            uv cache clean
          fi
          echo "::endgroup::"
        done

        echo ""
        echo "## CPU Installation Results ($INSTALLER)"
        echo "| Extra | Install | Import |"
        echo "|-------|---------|--------|"
        # One row per line, directly under the header (no blank line in between).
        printf '%s\n' "${ROWS[@]}"

        if [ "$FAILED" -gt 0 ]; then
          echo "test_status=failure" >> "$GITHUB_OUTPUT"
          exit 1
        fi
        echo "test_status=success" >> "$GITHUB_OUTPUT"
    - name: Save status for summary
      if: always()
      run: |
        # Falls back to 'failure' when the test step never wrote an output.
        mkdir -p cpu-status
        echo "${{ steps.test.outputs.test_status || 'failure' }}" > cpu-status/${{ matrix.installer }}_status.txt
    - name: Upload status artifact
      if: always()
      uses: actions/upload-artifact@v6
      with:
        name: cpu-status-${{ matrix.installer }}
        path: cpu-status/
        retention-days: 1
| # ============================================================================ | |
| # GPU Extras - Install Only (pip=informational, uv=required) | |
| # ============================================================================ | |
gpu-install-test:
  # Installs every GPU extra on its own inside the CUDA base image (once per installer
  # in the matrix). Install-only: no import check is run in this job. The per-installer
  # outcome is published as a `gpu-status-<installer>` artifact.
  needs: [pre-flight, cuda-pre-flight]
  if: needs.pre-flight.outputs.docs_only != 'true'
  runs-on: linux-amd64-cpu16
  container:
    image: ${{ needs.cuda-pre-flight.outputs.cuda_base_image }}
  strategy:
    fail-fast: false
    matrix:
      installer: [pip, uv]
  name: "GPU Extras Individual - Install Only (${{ matrix.installer }}) - Py3.12"
  steps:
    - name: Install dependencies for setup-python
      run: apt-get update && apt-get install -y --no-install-recommends git curl
    - uses: actions/checkout@v6
      with:
        submodules: recursive
    - uses: actions/setup-python@v6
      with:
        python-version: "3.12"
    - name: Setup uv
      if: matrix.installer == 'uv'
      uses: astral-sh/setup-uv@v6
      with:
        python-version: "3.12"
        enable-cache: false
    - name: Test each GPU extra (${{ matrix.installer }})
      id: test
      shell: bash
      run: |
        # pipefail: the install command is piped through tee, and its exit status
        # (not tee's) must decide success.
        set -o pipefail
        INSTALLER="${{ matrix.installer }}"
        # Each extra is listed exactly once.
        # NOTE(review): this list previously contained "inference_server" twice; confirm
        # whether a different extra was meant to occupy the second slot.
        GPU_EXTRAS=("cuda12" "deduplication_cuda12" "audio_cuda12" "image_cuda12" "inference_server" "text_cuda12")
        ROWS=()
        FAILED=0

        for extra in "${GPU_EXTRAS[@]}"; do
          echo "::group::🎮 Testing: $extra ($INSTALLER) - Install Only"
          LOG="install-$extra.log"

          # Setup isolated environment and pick the install command (argv array, no eval).
          if [ "$INSTALLER" = "pip" ]; then
            python -m venv "venv_$extra"
            source "venv_$extra/bin/activate"
            pip install --upgrade pip || true
            INSTALL_CMD=(pip install --no-cache-dir ".[$extra]")
          else
            rm -rf .venv
            uv cache clean
            INSTALL_CMD=(uv sync --locked --link-mode copy --extra "$extra")
          fi

          if "${INSTALL_CMD[@]}" 2>&1 | tee "$LOG"; then
            echo "✅ $extra: INSTALL SUCCESS"
            ROWS+=("| $extra | ✅ |")
          else
            echo "❌ $extra: INSTALL FAILED"
            echo ""
            echo "=== ERROR for $extra ==="
            # Show installer-specific error lines; if none match, fall back to the log tail.
            if [ "$INSTALLER" = "pip" ]; then
              grep -E "(ERROR:|error:|Cannot install|conflicting dependencies|ResolutionImpossible|versions have conflicting)" "$LOG" | tail -15 || tail -30 "$LOG"
            else
              grep -E "(error:|Error:|No space left|Failed to install)" "$LOG" | tail -15 || tail -30 "$LOG"
            fi
            echo "========================="
            ROWS+=("| $extra | ❌ |")
            FAILED=$((FAILED + 1))
          fi

          # Cleanup
          rm -f "$LOG"
          if [ "$INSTALLER" = "pip" ]; then
            deactivate || true
            rm -rf "venv_$extra"
          else
            rm -rf .venv
            uv cache clean
          fi
          echo "::endgroup::"
        done

        echo ""
        echo "## GPU Installation Results ($INSTALLER)"
        echo "| Extra | Status |"
        echo "|-------|--------|"
        # One row per line, directly under the header (no blank line in between).
        printf '%s\n' "${ROWS[@]}"

        if [ "$FAILED" -gt 0 ]; then
          echo "test_status=failure" >> "$GITHUB_OUTPUT"
          exit 1
        fi
        echo "test_status=success" >> "$GITHUB_OUTPUT"
      env:
        UV_HTTP_TIMEOUT: 300
        PIP_BREAK_SYSTEM_PACKAGES: 1
    - name: Save status for summary
      if: always()
      run: |
        # Falls back to 'failure' when the test step never wrote an output.
        mkdir -p gpu-status
        echo "${{ steps.test.outputs.test_status || 'failure' }}" > gpu-status/${{ matrix.installer }}_status.txt
    - name: Upload status artifact
      if: always()
      uses: actions/upload-artifact@v6
      with:
        name: gpu-status-${{ matrix.installer }}
        path: gpu-status/
        retention-days: 1
| # ============================================================================ | |
| # ALL Extras - Install Only (CPU Runner) | |
| # ============================================================================ | |
| # all-install-only-test: | |
| # needs: [pre-flight, cuda-pre-flight] | |
| # if: needs.pre-flight.outputs.docs_only != 'true' | |
| # runs-on: linux-amd64-cpu16 | |
| # container: | |
| # image: ${{ needs.cuda-pre-flight.outputs.cuda_base_image }} | |
| # name: "ALL Extras - Install Only (${{ matrix.installer }}) - Py${{ matrix.python-version }}" | |
| # strategy: | |
| # fail-fast: false | |
| # matrix: | |
| # installer: [uv] | |
| # python-version: ["3.12", "3.10"] | |
| # exclude: | |
| # - installer: uv | |
| # python-version: "3.12" # This combo runs on GPU with import checks | |
| # steps: | |
| # - name: Install dependencies for setup-python | |
| # run: apt-get update && apt-get install -y --no-install-recommends git curl | |
| # - uses: actions/checkout@v6 | |
| # with: | |
| # submodules: recursive | |
| # - uses: actions/setup-python@v6 | |
| # with: | |
| # python-version: "${{ matrix.python-version }}" | |
| # - name: Setup uv | |
| # if: matrix.installer == 'uv' | |
| # uses: astral-sh/setup-uv@v6 | |
| # with: | |
| # python-version: "${{ matrix.python-version }}" | |
| # enable-cache: false | |
| # - name: Install ALL extras (${{ matrix.installer }}) | |
| # id: install | |
| # shell: bash | |
| # run: | | |
| # set -o pipefail | |
| # echo "🚀 Installing ALL extras (${{ matrix.installer }}, Py${{ matrix.python-version }})" | |
| # if [ "${{ matrix.installer }}" = "pip" ]; then | |
| # if pip install --no-cache-dir ".[all]" 2>&1 | tee install.log; then | |
| # echo "install_status=success" >> $GITHUB_OUTPUT | |
| # echo "✅ INSTALL SUCCESS" | |
| # else | |
| # echo "install_status=failure" >> $GITHUB_OUTPUT | |
| # echo "❌ INSTALL FAILED" | |
| # grep -E "(ERROR|error:|failed|conflict)" install.log | head -20 || true | |
| # fi | |
| # else | |
| # export PATH="/root/.local/bin:$PATH" | |
| # if uv sync --locked --link-mode copy --extra all 2>&1 | tee install.log; then | |
| # echo "install_status=success" >> $GITHUB_OUTPUT | |
| # echo "✅ INSTALL SUCCESS" | |
| # else | |
| # echo "install_status=failure" >> $GITHUB_OUTPUT | |
| # echo "❌ INSTALL FAILED" | |
| # grep -E "(error:|Error:|failed)" install.log | head -20 || true | |
| # fi | |
| # fi | |
| # env: | |
| # UV_HTTP_TIMEOUT: 300 | |
| # PIP_BREAK_SYSTEM_PACKAGES: 1 | |
| # - name: Report | |
| # if: always() | |
| # run: | | |
| # echo "📊 ALL Extras (${{ matrix.installer }}, Py${{ matrix.python-version }}) - Install Only" | |
| # if [ "${{ steps.install.outputs.install_status }}" = "success" ]; then | |
| # echo "✅ Installation succeeded" | |
| # else | |
| # echo "❌ Installation failed (expected - known conflicts)" | |
| # echo "" | |
| # echo "Known conflicts that may cause this:" | |
| # echo " - nemo_toolkit[asr] requires transformers<4.55 but vllm requires transformers>=4.55" | |
| # echo " - flash-attn build requires packaging module" | |
| # echo " - flash-attn build requires torch pre-installed (pip can't handle build-time deps)" | |
| # exit 1 | |
| # fi | |
| # ============================================================================ | |
| # ALL Extras - Install + Import (GPU Runner, uv Py3.12 only) | |
| # ============================================================================ | |
all-install-import-test:
  # Installs the `all` extra with uv (Python 3.12) inside the CUDA base image on a GPU
  # runner, then imports nemo_curator, cudf/cuml, torch and torchvision to verify the
  # environment.
  needs: [pre-flight, cuda-pre-flight]
  if: needs.pre-flight.outputs.docs_only != 'true'
  runs-on: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2
  name: "ALL Extras - Install + Import (uv) - Py3.12"
  steps:
    - uses: actions/checkout@v6
      with:
        submodules: recursive
    - name: Install ALL extras (uv)
      id: install
      shell: bash
      run: |
        set -o pipefail
        echo "🚀 Installing ALL extras (uv, Py3.12) - Install + Import"

        # The script below runs inside the container. The quoted heredoc delimiter keeps
        # its content literal (nothing is expanded by the outer shell).
        cat > "${{ github.workspace }}/.run_install.sh" << 'SCRIPT'
        # -e alone is not enough: `curl | sh` and `uv sync | tee` would report the exit
        # status of the last pipeline stage and hide a failed download or install.
        set -eo pipefail
        apt-get update && apt-get install -y --no-install-recommends git curl
        curl -LsSf https://astral.sh/uv/install.sh | sh
        export PATH="/root/.local/bin:$PATH"
        uv sync --locked --link-mode copy --extra all --python 3.12 2>&1 | tee install.log
        echo "::group::Import Verification"
        uv run python -c "import sys; print('Python:', sys.version);"
        uv run python -c "from nemo_curator import package_info; print(f'nemo_curator v{package_info.__version__}')"
        uv run python -c "import cudf, cuml; print('cudf, cuml');"
        uv run python -c "import torch; print(f'torch {torch.__version__} (CUDA: {torch.cuda.is_available()})');"
        uv run python -c "import torchvision; print(f'torchvision {torchvision.__version__}')"
        echo "::endgroup::"
        SCRIPT

        # The workspace is mounted at /workspace, so install.log written by the script
        # is readable from the host afterwards.
        if docker run --rm --gpus all \
          -v "${{ github.workspace }}:/workspace" \
          -w /workspace \
          -e UV_HTTP_TIMEOUT=300 \
          "${{ needs.cuda-pre-flight.outputs.cuda_base_image }}" \
          bash /workspace/.run_install.sh; then
          echo "install_status=success" >> "$GITHUB_OUTPUT"
          echo "✅ INSTALL SUCCESS"
        else
          echo "install_status=failure" >> "$GITHUB_OUTPUT"
          echo "❌ INSTALL FAILED"
          echo ""
          echo "Known conflicts that may cause this:"
          echo " - nemo_toolkit[asr] requires transformers<4.55 but vllm requires transformers>=4.55"
          echo ""
          echo "Install log errors:"
          # Patterns cover the uv-style messages used by the GPU extras job as well as the
          # pip-style ones. Best effort: a missing log or no match must not change the exit path.
          grep -E "(error:|Error:|ERROR:|ResolutionImpossible|versions have conflicting|incompatible|No space left|Failed to install)" "${{ github.workspace }}/install.log" 2>/dev/null | head -20 || true
          exit 1
        fi
| # ============================================================================ | |
| # Summary | |
| # ============================================================================ | |
install-test-summary:
  # Prints a summary table for all installation jobs and fails when a required check
  # did not pass. Required: CPU extras (pip and uv), GPU extras (uv), ALL extras (uv).
  # Note: all jobs listed to access their .result status
  needs: [pre-flight, cpu-install-test, gpu-install-test, all-install-import-test]
  runs-on: ubuntu-latest
  if: ${{ always() && needs.pre-flight.outputs.docs_only != 'true' }}
  steps:
    - name: Download CPU status artifacts
      continue-on-error: true # May not exist if job was skipped
      uses: actions/download-artifact@v7
      with:
        pattern: cpu-status-*
        merge-multiple: true
        path: cpu-status/
    - name: Download GPU status artifacts
      continue-on-error: true # May not exist if job was skipped
      uses: actions/download-artifact@v7
      with:
        pattern: gpu-status-*
        merge-multiple: true
        path: gpu-status/
    - name: Summary
      run: |
        # read_status JOB_RESULT STATUS_FILE
        # Prints the status of one matrix leg. The per-installer artifact is preferred
        # because `needs.<job>.result` covers the whole matrix: if one installer fails,
        # the job result is "failure" even for the installer that passed. The job result
        # (or "failure" when it is empty) is only the fallback for a missing artifact.
        read_status() {
          local job_result="$1" status_file="$2"
          if [ -s "$status_file" ]; then
            cat "$status_file"
          else
            echo "${job_result:-failure}"
          fi
        }

        # icon STATUS [FAIL_TEXT]: table cell for a status; FAIL_TEXT defaults to ❌.
        icon() {
          if [ "$1" = "success" ]; then
            echo '✅'
          else
            echo "${2:-❌}"
          fi
        }

        CPU_PIP=$(read_status "${{ needs.cpu-install-test.result }}" cpu-status/pip_status.txt)
        CPU_UV=$(read_status "${{ needs.cpu-install-test.result }}" cpu-status/uv_status.txt)
        GPU_PIP=$(read_status "${{ needs.gpu-install-test.result }}" gpu-status/pip_status.txt)
        GPU_UV=$(read_status "${{ needs.gpu-install-test.result }}" gpu-status/uv_status.txt)

        echo "# 📊 Installation Test Summary"
        echo ""
        echo "## Individual Extras"
        echo "| Job | Runner | Installer | Validation | Status |"
        echo "|-----|--------|-----------|------------|--------|"
        echo "| CPU Extras | CPU | pip | Install+Import | $(icon "$CPU_PIP") |"
        echo "| CPU Extras | CPU | uv | Install+Import | $(icon "$CPU_UV") |"
        echo "| GPU Extras | CPU | pip | Install Only | $(icon "$GPU_PIP" '❌ (expected)') |"
        echo "| GPU Extras | CPU | uv | Install Only | $(icon "$GPU_UV") |"
        echo ""
        echo "## ALL Extras (Combined)"
        echo "| Job | Runner | Validation | Status |"
        echo "|-----|--------|------------|--------|"
        # echo "| ALL (pip/uv, Py3.10/3.12) | CPU | Install Only | ${{ needs.all-install-only-test.result == 'success' && '✅' || '⚠️ (expected)' }} |"
        echo "| ALL (uv, Py3.12) | **GPU** | **Install+Import** | ${{ needs.all-install-import-test.result == 'success' && '✅' || '❌' }} |"
        echo ""

        # Core tests: CPU extras (both), GPU extras (uv), and ALL extras (uv) must pass.
        # GPU extras via pip are reported above but do not gate the result.
        if [ "$CPU_PIP" = "success" ] && [ "$CPU_UV" = "success" ] && \
           [ "$GPU_UV" = "success" ] && [ "${{ needs.all-install-import-test.result }}" = "success" ]; then
          echo "✅ All required tests passed"
        else
          echo "❌ Core installation tests failed"
          exit 1
        fi