Skip to content

Make HQQ default PTQ quantization in ExecuTorch #262

Make HQQ default PTQ quantization in ExecuTorch

Make HQQ default PTQ quantization in ExecuTorch #262

Workflow file for this run

# Test ExecuTorch CUDA Build Compatibility
# This workflow tests whether ExecuTorch can be successfully built with CUDA support
# across different CUDA versions (12.6, 12.8, 12.9) using the command:
# CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh
#
# Note: ExecuTorch automatically detects the system CUDA version using nvcc and
# installs the appropriate PyTorch wheel. No manual CUDA/PyTorch installation needed.
name: Test CUDA Builds
on:
pull_request:
push:
branches:
- main
- release/*
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: false
jobs:
test-cuda-builds:
strategy:
fail-fast: false
matrix:
cuda-version: ["12.6", "12.8", "13.0"]
name: test-executorch-cuda-build-${{ matrix.cuda-version }}
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
contents: read
with:
timeout: 90
runner: linux.g5.4xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: ${{ matrix.cuda-version }}
use-custom-docker-registry: false
submodules: recursive
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
set -eux
# Test ExecuTorch CUDA build - ExecuTorch will automatically detect CUDA version
# and install the appropriate PyTorch wheel when CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON"
source .ci/scripts/test-cuda-build.sh "${{ matrix.cuda-version }}"
# This job will fail if any of the CUDA versions fail
check-all-cuda-builds:
needs: test-cuda-builds
runs-on: ubuntu-latest
if: always()
steps:
- name: Check if all CUDA builds succeeded
run: |
if [[ "${{ needs.test-cuda-builds.result }}" != "success" ]]; then
echo "ERROR: One or more ExecuTorch CUDA builds failed!"
echo "CUDA build results: ${{ needs.test-cuda-builds.result }}"
exit 1
else
echo "SUCCESS: All ExecuTorch CUDA builds (12.6, 12.8, 12.9) completed successfully!"
fi
test-models-cuda:
name: test-models-cuda
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
contents: read
strategy:
fail-fast: false
matrix:
model: [linear, add, add_mul, resnet18]
with:
timeout: 90
runner: linux.g5.4xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: 12.6
use-custom-docker-registry: false
submodules: recursive
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
set -eux
PYTHON_EXECUTABLE=python CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda
test-voxtral-cuda-e2e:
name: test-voxtral-cuda-e2e
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
contents: read
secrets: inherit
strategy:
fail-fast: false
with:
timeout: 90
secrets-env: EXECUTORCH_HF_TOKEN
runner: linux.g5.4xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: 12.6
use-custom-docker-registry: false
submodules: recursive
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
set -eux
echo "::group::Setup ExecuTorch"
CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh
echo "::endgroup::"
echo "::group::Setup Huggingface"
pip install -U "huggingface_hub[cli]" accelerate
huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
pip install mistral-common librosa
echo "::endgroup::"
echo "::group::Export Voxtral"
optimum-cli export executorch \
--model "mistralai/Voxtral-Mini-3B-2507" \
--task "multimodal-text-to-text" \
--recipe "cuda" \
--dtype bfloat16 \
--device cuda \
--max_seq_len 1024 \
--output_dir ./
echo "::endgroup::"
echo "::group::Build Voxtral Runner"
cmake -DCMAKE_BUILD_TYPE=Release \
-DEXECUTORCH_BUILD_CUDA=ON \
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
-DEXECUTORCH_BUILD_TESTS=ON \
-Bcmake-out .
cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner
echo "::endgroup::"
echo "::group::Run Voxtral Runner"
# Capture output and allow exit code 139 if we have the expected printout
set +e
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
OUTPUT=$(cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd 2>&1)
EXIT_CODE=$?
set -e
echo "$OUTPUT"
# Check if the output contains "Run latency (ms):"
if echo "$OUTPUT" | grep -q "Run latency (ms):"; then
echo "Found expected output: 'Run latency (ms):'"
if [ $EXIT_CODE -eq 139 ]; then
echo "Exit code 139 (segfault) detected, but passing since we have the expected output"
exit 0
elif [ $EXIT_CODE -ne 0 ]; then
echo "Unexpected exit code: $EXIT_CODE"
exit $EXIT_CODE
else
echo "Command succeeded with exit code 0"
exit 0
fi
else
echo "Expected output 'Run latency (ms):' not found in output"
exit 1
fi
echo "::endgroup::"