 
   export-voxtral-cuda-artifact:
     name: export-voxtral-cuda-${{ matrix.quant.name }}
+    # Skip this job if the pull request is from a fork (HuggingFace secrets are not available)
+    if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request'
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
@@ -164,6 +166,77 @@ jobs:
         ls -al "${RUNNER_ARTIFACT_DIR}"
         echo "::endgroup::"
 
+  export-gemma3-cuda-artifact:
+    name: export-gemma3-cuda-${{ matrix.quant.name }}
+    # Skip this job if the pull request is from a fork (HuggingFace secrets are not available)
+    if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request'
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    secrets: inherit
+    strategy:
+      fail-fast: false
+      matrix:
+        quant:
+          - name: "non-quantized"
+            artifact: "gemma3-cuda-export"
+            extra_args: ""
+          - name: "quantized-int4-tile-packed"
+            artifact: "gemma3-cuda-quantized-int4-tile-packed"
+            extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
+          # TODO: enable int4-weight-only on gemma3.
+          # - name: "quantized-int4-weight-only"
+          #   artifact: "gemma3-cuda-quantized-int4-weight-only"
+          #   # TODO: adding "--qlinear 4w" produces invalid results. Need further investigation.
+          #   extra_args: "--qlinear_encoder 4w"
+    with:
+      timeout: 90
+      secrets-env: EXECUTORCH_HF_TOKEN
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      upload-artifact: ${{ matrix.quant.artifact }}
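+      # Check out the PR head commit for pull_request events, the pushed commit otherwise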
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch"
+        ./install_executorch.sh
+        echo "::endgroup::"
+
+        echo "::group::Setup Huggingface"
+        pip install -U "huggingface_hub[cli]" accelerate
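+        # Gemma checkpoints are gated on the Hub, so authenticate with the CI HuggingFace token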
+        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Export Gemma3 (${{ matrix.quant.name }})"
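+        # Per-variant quantization flags come from the matrix entry (empty for the non-quantized build)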
+        EXTRA_ARGS="${{ matrix.quant.extra_args }}"
+        optimum-cli export executorch \
+            --model "google/gemma-3-4b-it" \
+            --task "multimodal-text-to-text" \
+            --recipe "cuda" \
+            --dtype bfloat16 \
+            --device cuda \
+            --max_seq_len 64 \
+            ${EXTRA_ARGS} \
+            --output_dir ./
+
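+        # Confirm the export produced both the program (.pte) and the AOTI CUDA data blob (.ptd)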
+        test -f model.pte
+        test -f aoti_cuda_blob.ptd
+        echo "::endgroup::"
+
+        echo "::group::Store Gemma3 Artifacts (${{ matrix.quant.name }})"
+        mkdir -p "${RUNNER_ARTIFACT_DIR}/"
+        cp model.pte "${RUNNER_ARTIFACT_DIR}/"
+        cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
+        ls -al "${RUNNER_ARTIFACT_DIR}/"
+        echo "::endgroup::"
+
   benchmark-voxtral-cuda:
     name: benchmark-voxtral-cuda
     needs: export-voxtral-cuda-artifact
@@ -186,7 +259,7 @@ jobs:
         set -eux
 
         echo "::group::Setup ExecuTorch Requirements"
-        CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
+        ./install_requirements.sh
         pip list
         echo "::endgroup::"
 
@@ -204,13 +277,63 @@ jobs:
             -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
             -DEXECUTORCH_BUILD_TESTS=ON \
             -Bcmake-out .
-        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark
         echo "::endgroup::"
 
         echo "::group::Run Voxtral Benchmark"
 
         export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
-        cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd
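+        # multimodal_benchmark expects: <model-name> <program.pte> <cuda-blob.ptd>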
+        cmake-out/backends/cuda/multimodal_benchmark voxtral model.pte aoti_cuda_blob.ptd
+
+        echo "::endgroup::"
+
+  benchmark-gemma3-cuda:
+    name: benchmark-gemma3-cuda
+    needs: export-gemma3-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
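+      # Benchmark the non-quantized export only; quantized variants are exercised in test-gemma3-cuda-e2e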
+      download-artifact: gemma3-cuda-export
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Gemma3 Artifacts"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        ls -al model.pte aoti_cuda_blob.ptd
+        echo "::endgroup::"
+
+        echo "::group::Build Gemma3 Benchmark"
+        cmake -DCMAKE_BUILD_TYPE=Release \
+            -DEXECUTORCH_BUILD_CUDA=ON \
+            -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+            -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+            -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
+            -DEXECUTORCH_BUILD_TESTS=ON \
+            -Bcmake-out .
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark
+        echo "::endgroup::"
+
+        echo "::group::Run Gemma3 Benchmark"
+
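+        # Make the conda environment's shared libraries visible to the dynamic loader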
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        cmake-out/backends/cuda/multimodal_benchmark gemma3 model.pte aoti_cuda_blob.ptd
 
         echo "::endgroup::"
 
@@ -244,7 +367,7 @@ jobs:
         set -eux
 
         echo "::group::Setup ExecuTorch Requirements"
-        CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
+        ./install_requirements.sh
         pip list
         echo "::endgroup::"
 
@@ -302,3 +425,87 @@ jobs:
           exit $EXIT_CODE
         fi
         echo "::endgroup::"
+
+  test-gemma3-cuda-e2e:
+    name: test-gemma3-cuda-e2e-${{ matrix.format.name }}
+    needs: export-gemma3-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+      matrix:
+        format:
+          - name: "non-quantized"
+            artifact: "gemma3-cuda-export"
+          - name: "quantized-int4-tile-packed"
+            artifact: "gemma3-cuda-quantized-int4-tile-packed"
+          # TODO: enable int4-weight-only on gemma3.
+          # - name: "quantized-int4-weight-only"
+          #   artifact: "gemma3-cuda-quantized-int4-weight-only"
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: ${{ matrix.format.artifact }}
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Gemma3 Artifacts (${{ matrix.format.name }})"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
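+        # NOTE: tokenizer fetched from the ungated unsloth gemma-3-1b-it mirror; Gemma 3 sizes are assumed to share a tokenizer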
+        TOKENIZER_URL="https://huggingface.co/unsloth/gemma-3-1b-it/resolve/main/tokenizer.json"
+        curl -L $TOKENIZER_URL -o tokenizer.json
+        ls -al model.pte aoti_cuda_blob.ptd tokenizer.json
+        IMAGE_PATH="docs/source/_static/img/et-logo.png"
+        echo "::endgroup::"
+
+        echo "::group::Build Gemma3 Runner"
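+        # Two-stage build: install the core ExecuTorch libraries, then build the example runner against them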
+        cmake --preset llm \
+            -DEXECUTORCH_BUILD_CUDA=ON \
+            -DCMAKE_INSTALL_PREFIX=cmake-out \
+            -DCMAKE_BUILD_TYPE=Release \
+            -Bcmake-out -S.
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release
+
+        cmake -DEXECUTORCH_BUILD_CUDA=ON \
+            -DCMAKE_BUILD_TYPE=Release \
+            -Sexamples/models/gemma3 \
+            -Bcmake-out/examples/models/gemma3/
+        cmake --build cmake-out/examples/models/gemma3 --target gemma3_e2e_runner --config Release
+        echo "::endgroup::"
+
+        echo "::group::Run Gemma3 Runner (${{ matrix.format.name }})"
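+        # Disable exit-on-error so the runner's output and exit code can both be captured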
+        set +e
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        OUTPUT=$(cmake-out/examples/models/gemma3/gemma3_e2e_runner \
+            --model_path model.pte \
+            --data_path aoti_cuda_blob.ptd \
+            --tokenizer_path tokenizer.json \
+            --image_path $IMAGE_PATH \
+            --temperature 0 2>&1)
+        EXIT_CODE=$?
+        set -e
+
+        echo "$OUTPUT"
+
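+        # With --temperature 0 the description of the ExecuTorch logo is expected to mention "chip"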
+        if ! echo "$OUTPUT" | grep -iq "chip"; then
+          echo "Expected output 'chip' not found in output"
+          exit 1
+        fi
+
+        if [ $EXIT_CODE -ne 0 ]; then
+          echo "Unexpected exit code: $EXIT_CODE"
+          exit $EXIT_CODE
+        fi
+        echo "::endgroup::"