@@ -88,14 +88,26 @@ jobs:
         PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda

   export-voxtral-cuda-artifact:
-    name: export-voxtral-cuda-artifact
+    name: export-voxtral-cuda-${{ matrix.quant.name }}
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
       contents: read
     secrets: inherit
     strategy:
       fail-fast: false
+      matrix:
+        quant:
+          - name: "non-quantized"
+            artifact: "voxtral-cuda-export"
+            extra_args: ""
+          - name: "quantized-int4-tile-packed"
+            artifact: "voxtral-cuda-quantized-int4-tile-packed"
+            extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
+          - name: "quantized-int4-weight-only"
+            artifact: "voxtral-cuda-quantized-int4-weight-only"
+            # TODO: adding "--qlinear 4w" produces invalid results. Need further investigation.
+            extra_args: "--qlinear_encoder 4w"
     with:
       timeout: 90
       secrets-env: EXECUTORCH_HF_TOKEN
@@ -104,7 +116,7 @@ jobs:
       gpu-arch-version: 12.6
       use-custom-docker-registry: false
       submodules: recursive
-      upload-artifact: voxtral-cuda-export
+      upload-artifact: ${{ matrix.quant.artifact }}
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
         set -eux
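With the quant matrix above, this single job definition fans out into three export jobs. A sketch of the effective values for the tile-packed entry (assembled from the matrix entries above, not an additional change in this diff; the other two entries substitute their own name/artifact pair the same way):

    name: export-voxtral-cuda-quantized-int4-tile-packed
    upload-artifact: voxtral-cuda-quantized-int4-tile-packed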
@@ -122,14 +134,16 @@ jobs:
         pip list
         echo "::endgroup::"

-        echo "::group::Export Voxtral"
+        echo "::group::Export Voxtral (${{ matrix.quant.name }})"
+        EXTRA_ARGS="${{ matrix.quant.extra_args }}"
         optimum-cli export executorch \
           --model "mistralai/Voxtral-Mini-3B-2507" \
           --task "multimodal-text-to-text" \
           --recipe "cuda" \
           --dtype bfloat16 \
           --device cuda \
           --max_seq_len 1024 \
+          ${EXTRA_ARGS} \
           --output_dir ./
         python -m executorch.extension.audio.mel_spectrogram \
           --feature_size 128 \
@@ -142,7 +156,7 @@ jobs:
         test -f voxtral_preprocessor.pte
         echo "::endgroup::"

-        echo "::group::Store Voxtral Artifacts"
+        echo "::group::Store Voxtral Artifacts (${{ matrix.quant.name }})"
         mkdir -p "${RUNNER_ARTIFACT_DIR}"
         cp model.pte "${RUNNER_ARTIFACT_DIR}/"
         cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
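For reference, a sketch of what the export invocation above expands to for the quantized-int4-tile-packed entry once EXTRA_ARGS is substituted (assembled from that entry's extra_args value; the non-quantized entry leaves EXTRA_ARGS empty, so its command is unchanged):

    optimum-cli export executorch \
      --model "mistralai/Voxtral-Mini-3B-2507" \
      --task "multimodal-text-to-text" \
      --recipe "cuda" \
      --dtype bfloat16 \
      --device cuda \
      --max_seq_len 1024 \
      --qlinear 4w --qlinear_encoder 4w \
      --qlinear_packing_format tile_packed_to_4d \
      --qlinear_encoder_packing_format tile_packed_to_4d \
      --output_dir ./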
@@ -201,22 +215,30 @@ jobs:
         echo "::endgroup::"

   test-voxtral-cuda-e2e:
-    name: test-voxtral-cuda-e2e
+    name: test-voxtral-cuda-e2e-${{ matrix.format.name }}
     needs: export-voxtral-cuda-artifact
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
       contents: read
     strategy:
       fail-fast: false
+      matrix:
+        format:
+          - name: "non-quantized"
+            artifact: "voxtral-cuda-export"
+          - name: "quantized-int4-tile-packed"
+            artifact: "voxtral-cuda-quantized-int4-tile-packed"
+          - name: "quantized-int4-weight-only"
+            artifact: "voxtral-cuda-quantized-int4-weight-only"
     with:
       timeout: 90
       runner: linux.g5.4xlarge.nvidia.gpu
       gpu-arch-type: cuda
       gpu-arch-version: 12.6
       use-custom-docker-registry: false
       submodules: recursive
-      download-artifact: voxtral-cuda-export
+      download-artifact: ${{ matrix.format.artifact }}
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
         set -eux
@@ -226,7 +248,7 @@ jobs:
         pip list
         echo "::endgroup::"

-        echo "::group::Prepare Voxtral Artifacts"
+        echo "::group::Prepare Voxtral Artifacts (${{ matrix.format.name }})"
         cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
         cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
         cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" .
@@ -255,7 +277,7 @@ jobs:
         cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release
         echo "::endgroup::"

-        echo "::group::Run Voxtral Runner"
+        echo "::group::Run Voxtral Runner (${{ matrix.format.name }})"
         set +e
         export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
         OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \