Skip to content

Commit af97aa8

Browse files
authored
Add int4mm test to the CUDA CI flow (pytorch#15181)
Tested in CI
1 parent 98baab7 commit af97aa8

File tree

3 files changed

+48
-11
lines changed

3 files changed

+48
-11
lines changed
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
44d8d54e38c0258357d4e92e1fefe21e845947a3
1+
09fdbd0a0639b128f712a4f5202ed42ca4c60957

.github/workflows/cuda.yml

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -88,14 +88,26 @@ jobs:
8888
PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda
8989
9090
export-voxtral-cuda-artifact:
91-
name: export-voxtral-cuda-artifact
91+
name: export-voxtral-cuda-${{ matrix.quant.name }}
9292
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
9393
permissions:
9494
id-token: write
9595
contents: read
9696
secrets: inherit
9797
strategy:
9898
fail-fast: false
99+
matrix:
100+
quant:
101+
- name: "non-quantized"
102+
artifact: "voxtral-cuda-export"
103+
extra_args: ""
104+
- name: "quantized-int4-tile-packed"
105+
artifact: "voxtral-cuda-quantized-int4-tile-packed"
106+
extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
107+
- name: "quantized-int4-weight-only"
108+
artifact: "voxtral-cuda-quantized-int4-weight-only"
109+
# TODO: adding "--qlinear 4w" produces invalid results. Need further investigation.
110+
extra_args: "--qlinear_encoder 4w"
99111
with:
100112
timeout: 90
101113
secrets-env: EXECUTORCH_HF_TOKEN
@@ -104,7 +116,7 @@ jobs:
104116
gpu-arch-version: 12.6
105117
use-custom-docker-registry: false
106118
submodules: recursive
107-
upload-artifact: voxtral-cuda-export
119+
upload-artifact: ${{ matrix.quant.artifact }}
108120
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
109121
script: |
110122
set -eux
@@ -122,14 +134,16 @@ jobs:
122134
pip list
123135
echo "::endgroup::"
124136
125-
echo "::group::Export Voxtral"
137+
echo "::group::Export Voxtral (${{ matrix.quant.name }})"
138+
EXTRA_ARGS="${{ matrix.quant.extra_args }}"
126139
optimum-cli export executorch \
127140
--model "mistralai/Voxtral-Mini-3B-2507" \
128141
--task "multimodal-text-to-text" \
129142
--recipe "cuda" \
130143
--dtype bfloat16 \
131144
--device cuda \
132145
--max_seq_len 1024 \
146+
${EXTRA_ARGS} \
133147
--output_dir ./
134148
python -m executorch.extension.audio.mel_spectrogram \
135149
--feature_size 128 \
@@ -142,7 +156,7 @@ jobs:
142156
test -f voxtral_preprocessor.pte
143157
echo "::endgroup::"
144158
145-
echo "::group::Store Voxtral Artifacts"
159+
echo "::group::Store Voxtral Artifacts (${{ matrix.quant.name }})"
146160
mkdir -p "${RUNNER_ARTIFACT_DIR}"
147161
cp model.pte "${RUNNER_ARTIFACT_DIR}/"
148162
cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
@@ -201,22 +215,30 @@ jobs:
201215
echo "::endgroup::"
202216
203217
test-voxtral-cuda-e2e:
204-
name: test-voxtral-cuda-e2e
218+
name: test-voxtral-cuda-e2e-${{ matrix.format.name }}
205219
needs: export-voxtral-cuda-artifact
206220
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
207221
permissions:
208222
id-token: write
209223
contents: read
210224
strategy:
211225
fail-fast: false
226+
matrix:
227+
format:
228+
- name: "non-quantized"
229+
artifact: "voxtral-cuda-export"
230+
- name: "quantized-int4-tile-packed"
231+
artifact: "voxtral-cuda-quantized-int4-tile-packed"
232+
- name: "quantized-int4-weight-only"
233+
artifact: "voxtral-cuda-quantized-int4-weight-only"
212234
with:
213235
timeout: 90
214236
runner: linux.g5.4xlarge.nvidia.gpu
215237
gpu-arch-type: cuda
216238
gpu-arch-version: 12.6
217239
use-custom-docker-registry: false
218240
submodules: recursive
219-
download-artifact: voxtral-cuda-export
241+
download-artifact: ${{ matrix.format.artifact }}
220242
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
221243
script: |
222244
set -eux
@@ -226,7 +248,7 @@ jobs:
226248
pip list
227249
echo "::endgroup::"
228250
229-
echo "::group::Prepare Voxtral Artifacts"
251+
echo "::group::Prepare Voxtral Artifacts (${{ matrix.format.name }})"
230252
cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
231253
cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
232254
cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" .
@@ -255,7 +277,7 @@ jobs:
255277
cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release
256278
echo "::endgroup::"
257279
258-
echo "::group::Run Voxtral Runner"
280+
echo "::group::Run Voxtral Runner (${{ matrix.format.name }})"
259281
set +e
260282
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
261283
OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \

examples/models/voxtral/README.md

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,6 @@ This exports Voxtral with XNNPack backend acceleration and 4-bit weight/8-bit ac
3939
## CUDA Support
4040
If your environment has CUDA support, you can enable the runner to run on CUDA for improved performance. Follow the export and runtime commands below:
4141

42-
**Note:** We are currently working on quantization support for CUDA. Currently, only bfloat16 dtype is supported for CUDA execution.
43-
4442
### Exporting with CUDA
4543
```
4644
optimum-cli export executorch \
@@ -57,6 +55,23 @@ This will generate:
5755
- `model.pte` - The exported model
5856
- `aoti_cuda_blob.ptd` - The CUDA kernel blob required for runtime
5957

58+
Furthermore, we support several quantization formats on CUDA.
59+
For example, to export Voxtral with int4 weights and int4mm for linear layers, you can use the following command:
60+
```
61+
optimum-cli export executorch \
62+
--model "mistralai/Voxtral-Mini-3B-2507" \
63+
--task "multimodal-text-to-text" \
64+
--recipe "cuda" \
65+
--dtype bfloat16 \
66+
--device cuda \
67+
--max_seq_len 1024 \
68+
--qlinear 4w \
69+
--qlinear_encoder 4w \
70+
--qlinear_packing_format tile_packed_to_4d \
71+
--qlinear_encoder_packing_format tile_packed_to_4d \
72+
--output_dir="voxtral"
73+
```
74+
6075
See the "Building the multimodal runner" section below for instructions on building with CUDA support, and the "Running the model" section for runtime instructions.
6176

6277
# Running the model

0 commit comments

Comments
 (0)