2 changes: 1 addition & 1 deletion .ci/docker/ci_commit_pins/optimum-executorch.txt
@@ -1 +1 @@
44d8d54e38c0258357d4e92e1fefe21e845947a3
09fdbd0a0639b128f712a4f5202ed42ca4c60957
153 changes: 148 additions & 5 deletions .github/workflows/cuda.yml
@@ -200,23 +200,166 @@ jobs:

echo "::endgroup::"

export-voxtral-cuda-quantized-int4-tile-packed:
name: export-voxtral-cuda-quantized-int4-tile-packed
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
contents: read
secrets: inherit
strategy:
fail-fast: false
with:
timeout: 90
secrets-env: EXECUTORCH_HF_TOKEN
runner: linux.g5.4xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: 12.6
use-custom-docker-registry: false
submodules: recursive
upload-artifact: voxtral-cuda-quantized-int4-tile-packed
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
set -eux

echo "::group::Setup ExecuTorch"
./install_executorch.sh
echo "::endgroup::"

echo "::group::Setup Huggingface"
pip install -U "huggingface_hub[cli]" accelerate
huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
pip install mistral-common librosa
pip list
echo "::endgroup::"

echo "::group::Export Voxtral with Quantization (int4-tile-packed)"
optimum-cli export executorch \
--model "mistralai/Voxtral-Mini-3B-2507" \
--task "multimodal-text-to-text" \
--recipe "cuda" \
--dtype bfloat16 \
--device cuda \
--max_seq_len 1024 \
--qlinear 4w \
--qlinear_encoder 4w \
--qlinear_packing_format tile_packed_to_4d \
--qlinear_encoder_packing_format tile_packed_to_4d \
--output_dir ./
python -m executorch.extension.audio.mel_spectrogram \
--feature_size 128 \
--stack_output \
--max_audio_len 300 \
--output_file voxtral_preprocessor.pte

test -f model.pte
test -f aoti_cuda_blob.ptd
test -f voxtral_preprocessor.pte
echo "::endgroup::"

echo "::group::Store Voxtral Quantized Artifacts (int4-tile-packed)"
mkdir -p "${RUNNER_ARTIFACT_DIR}"
cp model.pte "${RUNNER_ARTIFACT_DIR}/"
cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
cp voxtral_preprocessor.pte "${RUNNER_ARTIFACT_DIR}/"
ls -al "${RUNNER_ARTIFACT_DIR}"
echo "::endgroup::"

  export-voxtral-cuda-quantized-int4-weight-only:
    name: export-voxtral-cuda-quantized-int4-weight-only
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    permissions:
      id-token: write
      contents: read
    secrets: inherit
    strategy:
      fail-fast: false
    with:
      timeout: 90
      secrets-env: EXECUTORCH_HF_TOKEN
      runner: linux.g5.4xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: 12.6
      use-custom-docker-registry: false
      submodules: recursive
      upload-artifact: voxtral-cuda-quantized-int4-weight-only
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      script: |
        set -eux

        echo "::group::Setup ExecuTorch"
        ./install_executorch.sh
        echo "::endgroup::"

        echo "::group::Setup Huggingface"
        pip install -U "huggingface_hub[cli]" accelerate
        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
        pip install mistral-common librosa
        pip list
        echo "::endgroup::"

        echo "::group::Export Voxtral with Quantization (int4-weight-only)"
        optimum-cli export executorch \
          --model "mistralai/Voxtral-Mini-3B-2507" \
          --task "multimodal-text-to-text" \
          --recipe "cuda" \
          --dtype bfloat16 \
          --device cuda \
          --max_seq_len 1024 \
          --qlinear 4w \
          --qlinear_encoder 4w \
          --output_dir ./
        python -m executorch.extension.audio.mel_spectrogram \
          --feature_size 128 \
          --stack_output \
          --max_audio_len 300 \
          --output_file voxtral_preprocessor.pte

        test -f model.pte
        test -f aoti_cuda_blob.ptd
        test -f voxtral_preprocessor.pte
        echo "::endgroup::"

        echo "::group::Store Voxtral Quantized Artifacts (int4-weight-only)"
        mkdir -p "${RUNNER_ARTIFACT_DIR}"
        cp model.pte "${RUNNER_ARTIFACT_DIR}/"
        cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
        cp voxtral_preprocessor.pte "${RUNNER_ARTIFACT_DIR}/"
        ls -al "${RUNNER_ARTIFACT_DIR}"
        echo "::endgroup::"

  test-voxtral-cuda-e2e:
    name: test-voxtral-cuda-e2e
    needs: export-voxtral-cuda-artifact
    name: test-voxtral-cuda-e2e-${{ matrix.format.name }}
    needs:
      - export-voxtral-cuda-artifact
      - export-voxtral-cuda-quantized-int4-tile-packed
      - export-voxtral-cuda-quantized-int4-weight-only
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    permissions:
      id-token: write
      contents: read
    strategy:
      fail-fast: false
      matrix:
        format:
          - name: "non-quantized"
            artifact: "voxtral-cuda-export"
          - name: "quantized-int4-tile-packed"
            artifact: "voxtral-cuda-quantized-int4-tile-packed"
          - name: "quantized-int4-weight-only"
            artifact: "voxtral-cuda-quantized-int4-weight-only"
    with:
      timeout: 90
      runner: linux.g5.4xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: 12.6
      use-custom-docker-registry: false
      submodules: recursive
      download-artifact: voxtral-cuda-export
      download-artifact: ${{ matrix.format.artifact }}
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      script: |
        set -eux
@@ -226,7 +369,7 @@ jobs:
        pip list
        echo "::endgroup::"

        echo "::group::Prepare Voxtral Artifacts"
        echo "::group::Prepare Voxtral Artifacts (${{ matrix.format.name }})"
        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
        cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" .
@@ -255,7 +398,7 @@ jobs:
        cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release
        echo "::endgroup::"

        echo "::group::Run Voxtral Runner"
        echo "::group::Run Voxtral Runner (${{ matrix.format.name }})"
        set +e
        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
        OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \
18 changes: 16 additions & 2 deletions examples/models/voxtral/README.md
@@ -39,8 +39,6 @@ This exports Voxtral with XNNPack backend acceleration and 4-bit weight/8-bit ac
## CUDA Support
If your environment has CUDA support, you can enable the runner to run on CUDA for improved performance. Follow the export and runtime commands below:

**Note:** We are currently working on quantization support for CUDA. Currently, only bfloat16 dtype is supported for CUDA execution.

### Exporting with CUDA
```
optimum-cli export executorch \
@@ -57,6 +55,22 @@ This will generate:
- `model.pte` - The exported model
- `aoti_cuda_blob.ptd` - The CUDA kernel blob required for runtime
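
To confirm the export succeeded, you can check that both files exist, mirroring the assertions in the CI workflow above (a minimal sketch, assuming the files were written to the current directory):
```
# Sanity check copied from the CI job's assertions; adjust paths if you used a different --output_dir
test -f model.pte
test -f aoti_cuda_blob.ptd
```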

Furthermore, we support several quantization formats on CUDA.
To export Voxtral with int4 weight-only quantization, use:
```
optimum-cli export executorch \
--model "mistralai/Voxtral-Mini-3B-2507" \
--task "multimodal-text-to-text" \
--recipe "cuda" \
--dtype bfloat16 \
--device cuda \
--max_seq_len 1024 \
--qlinear 4w \
--qlinear_encoder 4w \
--qembedding 4w \
--output_dir="voxtral"
```

> **Contributor** (review comment on this export command): if you're removing from CI, don't forget to update here as well
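
The CI workflow above also exports an int4 variant with a tile-packed weight layout. Below is a sketch of that export with the packing-format flags copied from the CI job; the exact flag set may evolve with optimum-executorch:
```
optimum-cli export executorch \
  --model "mistralai/Voxtral-Mini-3B-2507" \
  --task "multimodal-text-to-text" \
  --recipe "cuda" \
  --dtype bfloat16 \
  --device cuda \
  --max_seq_len 1024 \
  --qlinear 4w \
  --qlinear_encoder 4w \
  --qlinear_packing_format tile_packed_to_4d \
  --qlinear_encoder_packing_format tile_packed_to_4d \
  --output_dir="voxtral"
```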

See the "Building the multimodal runner" section below for instructions on building with CUDA support, and the "Running the model" section for runtime instructions.

# Running the model