Skip to content

Add Metal backend documentation to Voxtral README #1374

Add Metal backend documentation to Voxtral README

Add Metal backend documentation to Voxtral README #1374

Workflow file for this run

# Test ExecuTorch CUDA Build Compatibility
# This workflow tests whether ExecuTorch can be successfully built with CUDA support
# across different CUDA versions (12.6, 12.8, 13.0) using the command:
# ./install_executorch.sh
#
# Note: ExecuTorch automatically detects the system CUDA version using nvcc and
# installs the appropriate PyTorch wheel. No manual CUDA/PyTorch installation needed.
# Workflow identity and triggers: runs for every pull request, and for pushes
# to main and to release/* branches.
name: Test CUDA Builds

on:
  pull_request:
  push:
    branches: [main, 'release/*']

# The group key combines the workflow name, the PR number (falling back to the
# commit SHA when there is no PR), and two flags for the workflow_dispatch and
# schedule events. Runs that are already in progress are not cancelled.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: false
jobs:
# Runs the ExecuTorch CUDA build script once per CUDA version in the matrix,
# delegating to the reusable pytorch/test-infra Linux job.
test-cuda-builds:
  name: test-executorch-cuda-build-${{ matrix.cuda-version }}
  uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
  permissions:
    id-token: write
    contents: read
  strategy:
    # Let every CUDA version run to completion even if another one fails.
    fail-fast: false
    matrix:
      # Quoted so versions such as 13.0 stay strings instead of floats.
      cuda-version:
        - "12.6"
        - "12.8"
        - "13.0"
  with:
    timeout: 90
    runner: linux.g5.4xlarge.nvidia.gpu
    gpu-arch-type: cuda
    gpu-arch-version: ${{ matrix.cuda-version }}
    use-custom-docker-registry: false
    submodules: recursive
    # Pull requests check out the PR head commit; other events use github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    script: |
      set -eux
      # Test ExecuTorch CUDA build - ExecuTorch will automatically detect CUDA version
      # and install the appropriate PyTorch wheel
      source .ci/scripts/test-cuda-build.sh "${{ matrix.cuda-version }}"
# Aggregate gate for the test-cuda-builds matrix: this job fails whenever the
# combined result of the matrix job is anything other than "success".
check-all-cuda-builds:
  needs: test-cuda-builds
  runs-on: ubuntu-latest
  # always() makes the gate run even when the job it depends on has failed, so
  # the failure is reported here instead of this job being skipped.
  if: always()
  steps:
    - name: Check if all CUDA builds succeeded
      run: |
        if [[ "${{ needs.test-cuda-builds.result }}" != "success" ]]; then
          echo "ERROR: One or more ExecuTorch CUDA builds failed!"
          echo "CUDA build results: ${{ needs.test-cuda-builds.result }}"
          exit 1
        else
          # Keep this version list in sync with the test-cuda-builds matrix.
          echo "SUCCESS: All ExecuTorch CUDA builds (12.6, 12.8, 13.0) completed successfully!"
        fi
# Installs ExecuTorch and runs .ci/scripts/test_model.sh with the "cmake" and
# "cuda" arguments for each model in the matrix.
test-models-cuda:
  name: test-models-cuda
  uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
  permissions:
    id-token: write
    contents: read
  strategy:
    # One failing model does not cancel the remaining models.
    fail-fast: false
    matrix:
      model: [linear, add, add_mul, resnet18, conv1d]
  with:
    timeout: 90
    runner: linux.g5.4xlarge.nvidia.gpu
    gpu-arch-type: cuda
    # Quoted so the version is passed as a string rather than parsed as a float.
    gpu-arch-version: "12.6"
    use-custom-docker-registry: false
    submodules: recursive
    # Pull requests check out the PR head commit; other events use github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    script: |
      set -eux
      PYTHON_EXECUTABLE=python ./install_executorch.sh
      # Prepend the conda lib directory. The ":+" expansion keeps `set -u` from
      # aborting when LD_LIBRARY_PATH is unset and avoids an empty path entry.
      export LD_LIBRARY_PATH="/opt/conda/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
      PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda
# Exports Voxtral-Mini-3B-2507 with the "cuda" recipe once per quantization
# variant, generates the mel-spectrogram preprocessor, and copies model.pte,
# aoti_cuda_blob.ptd and voxtral_preprocessor.pte into RUNNER_ARTIFACT_DIR
# under the artifact name of the matrix entry.
export-voxtral-cuda-artifact:
  name: export-voxtral-cuda-${{ matrix.quant.name }}
  uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
  permissions:
    id-token: write
    contents: read
  secrets: inherit
  strategy:
    # One failing variant does not cancel the other exports.
    fail-fast: false
    matrix:
      # name:       suffix of the job name and log group titles
      # artifact:   name the exported files are uploaded under
      # extra_args: additional flags appended to `optimum-cli export executorch`
      quant:
        - name: "non-quantized"
          artifact: "voxtral-cuda-export"
          extra_args: ""
        - name: "quantized-int4-tile-packed"
          artifact: "voxtral-cuda-quantized-int4-tile-packed"
          extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
        - name: "quantized-int4-weight-only"
          artifact: "voxtral-cuda-quantized-int4-weight-only"
          # TODO: adding "--qlinear 4w" produces invalid results. Need further investigation.
          extra_args: "--qlinear_encoder 4w"
  with:
    timeout: 90
    # The script below reads this secret as $SECRET_EXECUTORCH_HF_TOKEN.
    secrets-env: EXECUTORCH_HF_TOKEN
    runner: linux.g5.4xlarge.nvidia.gpu
    gpu-arch-type: cuda
    # Quoted so the version is passed as a string rather than parsed as a float.
    gpu-arch-version: "12.6"
    use-custom-docker-registry: false
    submodules: recursive
    upload-artifact: ${{ matrix.quant.artifact }}
    # Pull requests check out the PR head commit; other events use github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    script: |
      set -eux

      echo "::group::Setup ExecuTorch"
      ./install_executorch.sh
      echo "::endgroup::"

      echo "::group::Setup Huggingface"
      pip install -U "huggingface_hub[cli]" accelerate
      # Turn xtrace off around the login so `set -x` does not echo the token.
      set +x
      huggingface-cli login --token "${SECRET_EXECUTORCH_HF_TOKEN}"
      set -x
      # Install optimum-executorch at the commit pinned in the repository.
      OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
      pip install "git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}"
      pip install mistral-common librosa
      pip list
      echo "::endgroup::"

      echo "::group::Export Voxtral (${{ matrix.quant.name }})"
      EXTRA_ARGS="${{ matrix.quant.extra_args }}"
      # EXTRA_ARGS is intentionally left unquoted below so that it word-splits
      # into separate flags (and disappears entirely when it is empty).
      optimum-cli export executorch \
          --model "mistralai/Voxtral-Mini-3B-2507" \
          --task "multimodal-text-to-text" \
          --recipe "cuda" \
          --dtype bfloat16 \
          --device cuda \
          --max_seq_len 1024 \
          ${EXTRA_ARGS} \
          --output_dir ./
      python -m executorch.extension.audio.mel_spectrogram \
          --feature_size 128 \
          --stack_output \
          --max_audio_len 300 \
          --output_file voxtral_preprocessor.pte
      # Fail here, before the upload, if any expected output file is missing.
      test -f model.pte
      test -f aoti_cuda_blob.ptd
      test -f voxtral_preprocessor.pte
      echo "::endgroup::"

      echo "::group::Store Voxtral Artifacts (${{ matrix.quant.name }})"
      mkdir -p "${RUNNER_ARTIFACT_DIR}"
      cp model.pte "${RUNNER_ARTIFACT_DIR}/"
      cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
      cp voxtral_preprocessor.pte "${RUNNER_ARTIFACT_DIR}/"
      ls -al "${RUNNER_ARTIFACT_DIR}"
      echo "::endgroup::"
# Builds the voxtral_runner benchmark target with the CUDA backend and runs it
# against the "voxtral-cuda-export" artifact produced by
# export-voxtral-cuda-artifact.
benchmark-voxtral-cuda:
  name: benchmark-voxtral-cuda
  needs: export-voxtral-cuda-artifact
  uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
  permissions:
    id-token: write
    contents: read
  # No strategy block: this job has no matrix, so fail-fast has nothing to act on.
  with:
    timeout: 90
    runner: linux.g5.4xlarge.nvidia.gpu
    gpu-arch-type: cuda
    # Quoted so the version is passed as a string rather than parsed as a float.
    gpu-arch-version: "12.6"
    use-custom-docker-registry: false
    submodules: recursive
    download-artifact: voxtral-cuda-export
    # Pull requests check out the PR head commit; other events use github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    script: |
      set -eux

      echo "::group::Setup ExecuTorch Requirements"
      CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
      pip list
      echo "::endgroup::"

      echo "::group::Prepare Voxtral Artifacts"
      cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
      cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
      ls -al model.pte aoti_cuda_blob.ptd
      echo "::endgroup::"

      echo "::group::Build Voxtral Benchmark"
      cmake -DCMAKE_BUILD_TYPE=Release \
            -DEXECUTORCH_BUILD_CUDA=ON \
            -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
            -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
            -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
            -DEXECUTORCH_BUILD_TESTS=ON \
            -Bcmake-out .
      # Build with one job fewer than the number of available processors.
      cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner
      echo "::endgroup::"

      echo "::group::Run Voxtral Benchmark"
      # Prepend the conda lib directory. The ":+" expansion keeps `set -u` from
      # aborting when LD_LIBRARY_PATH is unset and avoids an empty path entry.
      export LD_LIBRARY_PATH="/opt/conda/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
      cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd
      echo "::endgroup::"
# End-to-end check per exported variant: downloads the variant's artifact, the
# tokenizer and a test audio clip, builds voxtral_runner, runs it, and requires
# both a zero exit code and the word "poem" (case-insensitive) in the output.
test-voxtral-cuda-e2e:
  name: test-voxtral-cuda-e2e-${{ matrix.format.name }}
  needs: export-voxtral-cuda-artifact
  uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
  permissions:
    id-token: write
    contents: read
  strategy:
    # One failing variant does not cancel the others.
    fail-fast: false
    matrix:
      # The artifact names match the ones uploaded by export-voxtral-cuda-artifact.
      format:
        - name: "non-quantized"
          artifact: "voxtral-cuda-export"
        - name: "quantized-int4-tile-packed"
          artifact: "voxtral-cuda-quantized-int4-tile-packed"
        - name: "quantized-int4-weight-only"
          artifact: "voxtral-cuda-quantized-int4-weight-only"
  with:
    timeout: 90
    runner: linux.g5.4xlarge.nvidia.gpu
    gpu-arch-type: cuda
    # Quoted so the version is passed as a string rather than parsed as a float.
    gpu-arch-version: "12.6"
    use-custom-docker-registry: false
    submodules: recursive
    download-artifact: ${{ matrix.format.artifact }}
    # Pull requests check out the PR head commit; other events use github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    script: |
      set -eux

      echo "::group::Setup ExecuTorch Requirements"
      CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
      pip list
      echo "::endgroup::"

      echo "::group::Prepare Voxtral Artifacts (${{ matrix.format.name }})"
      cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
      cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
      cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" .
      TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json"
      # --fail makes an HTTP error abort the step instead of saving the error
      # page as tekken.json.
      curl --fail -L "$TOKENIZER_URL" -o tekken.json
      ls -al model.pte aoti_cuda_blob.ptd voxtral_preprocessor.pte tekken.json
      echo "::endgroup::"

      echo "::group::Download Test Audio File"
      AUDIO_URL="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav"
      curl --fail -L "$AUDIO_URL" -o poem.wav
      echo "::endgroup::"

      echo "::group::Build Voxtral Runner"
      # First configure and install ExecuTorch into cmake-out, then configure
      # and build the Voxtral example in a second CMake invocation.
      cmake --preset llm \
            -DEXECUTORCH_BUILD_CUDA=ON \
            -DCMAKE_INSTALL_PREFIX=cmake-out \
            -DCMAKE_BUILD_TYPE=Release \
            -Bcmake-out -S.
      cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release
      cmake -DEXECUTORCH_BUILD_CUDA=ON \
            -DCMAKE_BUILD_TYPE=Release \
            -Sexamples/models/voxtral \
            -Bcmake-out/examples/models/voxtral/
      cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release
      echo "::endgroup::"

      echo "::group::Run Voxtral Runner (${{ matrix.format.name }})"
      # Prepend the conda lib directory. The ":+" expansion keeps `set -u` from
      # aborting when LD_LIBRARY_PATH is unset and avoids an empty path entry.
      export LD_LIBRARY_PATH="/opt/conda/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
      # Suspend errexit so the runner's output and exit code can both be
      # captured and inspected below instead of aborting immediately.
      set +e
      OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \
            --model_path model.pte \
            --data_path aoti_cuda_blob.ptd \
            --tokenizer_path tekken.json \
            --audio_path poem.wav \
            --processor_path voxtral_preprocessor.pte \
            --temperature 0 2>&1)
      EXIT_CODE=$?
      set -e

      echo "$OUTPUT"

      # The captured output must mention "poem" (matched case-insensitively).
      if ! echo "$OUTPUT" | grep -iq "poem"; then
        echo "Expected output 'poem' not found in output"
        exit 1
      fi

      if [ $EXIT_CODE -ne 0 ]; then
        echo "Unexpected exit code: $EXIT_CODE"
        exit $EXIT_CODE
      fi
      echo "::endgroup::"