@@ -164,6 +164,75 @@ jobs:
         ls -al "${RUNNER_ARTIFACT_DIR}"
         echo "::endgroup::"
 
+  export-gemma3-cuda-artifact:
+    name: export-gemma3-cuda-${{ matrix.quant.name }}
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    secrets: inherit
+    strategy:
+      fail-fast: false
+      matrix:
+        quant:
+          - name: "non-quantized"
+            artifact: "gemma3-cuda-export"
+            extra_args: ""
+          # TODO: enable gemma3 quantization
+          # - name: "quantized-int4-tile-packed"
+          #   artifact: "gemma3-cuda-quantized-int4-tile-packed"
+          #   extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
+          # - name: "quantized-int4-weight-only"
+          #   artifact: "gemma3-cuda-quantized-int4-weight-only"
+          #   # TODO: adding "--qlinear 4w" produces invalid results. Needs further investigation.
+          #   extra_args: "--qlinear_encoder 4w"
+    with:
+      timeout: 90
+      secrets-env: EXECUTORCH_HF_TOKEN
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      upload-artifact: gemma3-cuda-export
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
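+      # The ref above resolves to the PR head SHA on pull_request events and to
+      # the pushed SHA otherwise, so CI always tests the contributor's commit.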
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch"
+        ./install_executorch.sh
+        echo "::endgroup::"
+
+        echo "::group::Setup Huggingface"
+        pip install -U "huggingface_hub[cli]" accelerate
+        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
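+        # The reusable workflow exposes the secret requested via secrets-env
+        # under a SECRET_ prefix, hence SECRET_EXECUTORCH_HF_TOKEN here.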
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
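+        # Installing optimum-executorch at the CI commit pin keeps the exporter
+        # in lockstep with the ExecuTorch revision under test.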
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Export Gemma3 (${{ matrix.quant.name }})"
+        EXTRA_ARGS="${{ matrix.quant.extra_args }}"
+        optimum-cli export executorch \
+            --model "google/gemma-3-4b-it" \
+            --task "multimodal-text-to-text" \
+            --recipe "cuda" \
+            --dtype bfloat16 \
+            --device cuda \
+            --max_seq_len 64 \
+            ${EXTRA_ARGS} \
+            --output_dir ./
+
+        test -f model.pte
+        test -f aoti_cuda_blob.ptd
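+        # model.pte is the exported ExecuTorch program; aoti_cuda_blob.ptd
+        # presumably carries the AOTInductor CUDA payload loaded through the
+        # named data map at runtime.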
+        echo "::endgroup::"
+
+        echo "::group::Store Gemma3 Artifacts (${{ matrix.quant.name }})"
+        mkdir -p "${RUNNER_ARTIFACT_DIR}/"
+        cp model.pte "${RUNNER_ARTIFACT_DIR}/"
+        cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
+        ls -al "${RUNNER_ARTIFACT_DIR}/"
+        echo "::endgroup::"
+
   benchmark-voxtral-cuda:
     name: benchmark-voxtral-cuda
     needs: export-voxtral-cuda-artifact
@@ -204,13 +273,63 @@ jobs:
               -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
               -DEXECUTORCH_BUILD_TESTS=ON \
               -Bcmake-out .
-        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark
         echo "::endgroup::"
 
         echo "::group::Run Voxtral Benchmark"
 
         export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
-        cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd
+        cmake-out/backends/cuda/multimodal_benchmark voxtral model.pte aoti_cuda_blob.ptd
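+        # multimodal_benchmark replaces the model-specific voxtral_runner; its
+        # first argument selects the model, so one binary serves both jobs.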
+
+        echo "::endgroup::"
+
+  benchmark-gemma3-cuda:
+    name: benchmark-gemma3-cuda
+    needs: export-gemma3-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: gemma3-cuda-export
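+      # Matches the upload-artifact name in export-gemma3-cuda-artifact; the
+      # downloaded files are exposed to the script via ${RUNNER_ARTIFACT_DIR}.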
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
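+        # CMAKE_ARGS is forwarded to the pip build, presumably so ExecuTorch
+        # itself is compiled with the CUDA backend enabled.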
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Gemma3 Artifacts"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        ls -al model.pte aoti_cuda_blob.ptd
+        echo "::endgroup::"
+
+        echo "::group::Build Gemma3 Benchmark"
+        cmake -DCMAKE_BUILD_TYPE=Release \
+              -DEXECUTORCH_BUILD_CUDA=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
+              -DEXECUTORCH_BUILD_TESTS=ON \
+              -Bcmake-out .
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark
+        echo "::endgroup::"
+
+        echo "::group::Run Gemma3 Benchmark"
+
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
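+        # /opt/conda/lib is prepended so the benchmark binary can resolve the
+        # conda-provided CUDA shared libraries at load time.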
+        cmake-out/backends/cuda/multimodal_benchmark gemma3 model.pte aoti_cuda_blob.ptd
 
         echo "::endgroup::"
 
@@ -302,3 +421,87 @@ jobs:
           exit $EXIT_CODE
         fi
         echo "::endgroup::"
+
+  test-gemma3-cuda-e2e:
+    name: test-gemma3-cuda-e2e-${{ matrix.format.name }}
+    needs: export-gemma3-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+      matrix:
+        format:
+          - name: "non-quantized"
+            artifact: "gemma3-cuda-export"
+          # TODO: enable quantized gemma3.
+          # - name: "quantized-int4-tile-packed"
+          #   artifact: "gemma3-cuda-quantized-int4-tile-packed"
+          # - name: "quantized-int4-weight-only"
+          #   artifact: "gemma3-cuda-quantized-int4-weight-only"
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: ${{ matrix.format.artifact }}
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Gemma3 Artifacts (${{ matrix.format.name }})"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        TOKENIZER_URL="https://huggingface.co/unsloth/gemma-3-1b-it/resolve/main/tokenizer.json"
+        curl -L $TOKENIZER_URL -o tokenizer.json
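+        # Gemma 3 sizes share a tokenizer, so the 1b-it tokenizer.json should
+        # also serve the exported gemma-3-4b-it model.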
+        ls -al model.pte aoti_cuda_blob.ptd tokenizer.json
+        IMAGE_PATH="docs/source/_static/img/et-logo.png"
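+        # Downstream assertion: the runner output is grepped for "chip", the
+        # expected description of this logo image.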
+        echo "::endgroup::"
+
+        echo "::group::Build Gemma3 Runner"
+        cmake --preset llm \
+              -DEXECUTORCH_BUILD_CUDA=ON \
+              -DCMAKE_INSTALL_PREFIX=cmake-out \
+              -DCMAKE_BUILD_TYPE=Release \
+              -Bcmake-out -S.
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release
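+        # The llm preset presumably builds and installs the LLM runner
+        # libraries that the gemma3 example below links against.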
+
+        cmake -DEXECUTORCH_BUILD_CUDA=ON \
+              -DCMAKE_BUILD_TYPE=Release \
+              -Sexamples/models/gemma3 \
+              -Bcmake-out/examples/models/gemma3/
+        cmake --build cmake-out/examples/models/gemma3 --target gemma3_e2e_runner --config Release
+        echo "::endgroup::"
+
+        echo "::group::Run Gemma3 Runner (${{ matrix.format.name }})"
+        set +e
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        OUTPUT=$(cmake-out/examples/models/gemma3/gemma3_e2e_runner \
+            --model_path model.pte \
+            --data_path aoti_cuda_blob.ptd \
+            --tokenizer_path tokenizer.json \
+            --image_path $IMAGE_PATH \
+            --temperature 0 2>&1)
+        EXIT_CODE=$?
+        set -e
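+        # The set +e / set -e bracket captures the combined output and exit
+        # code without -e aborting the script before the checks below run.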
+
+        echo "$OUTPUT"
+
+        if ! echo "$OUTPUT" | grep -iq "chip"; then
+          echo "Expected output 'chip' not found in output"
+          exit 1
+        fi
+
+        if [ $EXIT_CODE -ne 0 ]; then
+          echo "Unexpected exit code: $EXIT_CODE"
+          exit $EXIT_CODE
+        fi
+        echo "::endgroup::"