@@ -164,6 +164,61 @@ jobs:
         ls -al "${RUNNER_ARTIFACT_DIR}"
         echo "::endgroup::"
 
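+  # Exports Gemma3 to a CUDA-backed ExecuTorch program, mirroring the
+  # export-voxtral-cuda-artifact job above: same reusable linux_job_v2
+  # workflow, same g5 GPU runner, with the result published under the
+  # upload-artifact name for the benchmark job further down.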
+  export-gemma3-cuda-artifact:
+    name: export-gemma3-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    secrets: inherit
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      secrets-env: EXECUTORCH_HF_TOKEN
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      upload-artifact: gemma3-cuda-export
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch"
+        ./install_executorch.sh
+        echo "::endgroup::"
+
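+        # google/gemma-3-4b-it is a gated checkpoint on the Hugging Face Hub,
+        # hence the login with the EXECUTORCH_HF_TOKEN secret exposed through
+        # secrets-env above. optimum-executorch is installed at the commit
+        # pinned in .ci/docker/ci_commit_pins/ to keep exports reproducible.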
+        echo "::group::Setup Huggingface"
+        pip install -U "huggingface_hub[cli]" accelerate
+        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+        pip list
+        echo "::endgroup::"
+
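+        # The cuda recipe lowers the model through AOTInductor for the GPU.
+        # max_seq_len is presumably kept at 64 only to make the CI export and
+        # benchmark cheap; a real deployment would use a larger context.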
+        echo "::group::Export Gemma3"
+        optimum-cli export executorch \
+            --model "google/gemma-3-4b-it" \
+            --task "multimodal-text-to-text" \
+            --recipe "cuda" \
+            --dtype bfloat16 \
+            --device cuda \
+            --max_seq_len 64 \
+            --output_dir ./
+
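+        # The export is expected to produce two files: model.pte (the
+        # ExecuTorch program) and aoti_cuda_blob.ptd (the AOTInductor-compiled
+        # CUDA kernels and weights as named data). Fail fast if either is
+        # missing rather than uploading a broken artifact.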
+        test -f model.pte
+        test -f aoti_cuda_blob.ptd
+        echo "::endgroup::"
+
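+        # linux_job_v2 uploads everything placed in ${RUNNER_ARTIFACT_DIR}
+        # as the gemma3-cuda-export artifact named above.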
+        echo "::group::Store Gemma3 Artifacts"
+        mkdir -p "${RUNNER_ARTIFACT_DIR}/"
+        cp model.pte "${RUNNER_ARTIFACT_DIR}/"
+        cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
+        ls -al "${RUNNER_ARTIFACT_DIR}/"
+        echo "::endgroup::"
+
   benchmark-voxtral-cuda:
     name: benchmark-voxtral-cuda
     needs: export-voxtral-cuda-artifact
@@ -204,13 +259,63 @@ jobs:
               -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
               -DEXECUTORCH_BUILD_TESTS=ON \
               -Bcmake-out .
-        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark
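+        # One shared multimodal_benchmark binary now replaces the old
+        # voxtral_runner target; the model name is passed as its first
+        # argument (see the invocation below).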
         echo "::endgroup::"
 
         echo "::group::Run Voxtral Benchmark"
 
         export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
-        cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd
+        cmake-out/backends/cuda/multimodal_benchmark voxtral model.pte aoti_cuda_blob.ptd
+
+        echo "::endgroup::"
+
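+  # Benchmarks the exported Gemma3 program: `needs` guarantees the export job
+  # finished successfully, and download-artifact stages its gemma3-cuda-export
+  # output on this runner before the script starts.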
+  benchmark-gemma3-cuda:
+    name: benchmark-gemma3-cuda
+    needs: export-gemma3-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: gemma3-cuda-export
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
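+        # The downloaded artifact lands in ${RUNNER_ARTIFACT_DIR}; copy the
+        # two files into the working directory for the benchmark binary.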
+        echo "::group::Prepare Gemma3 Artifacts"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        ls -al model.pte aoti_cuda_blob.ptd
+        echo "::endgroup::"
+
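+        # Build just what the benchmark needs: the CUDA backend plus the
+        # module/tensor/named-data-map extensions used to load model.pte and
+        # the .ptd blob. EXECUTORCH_BUILD_TESTS appears to be what makes the
+        # multimodal_benchmark target available.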
+        echo "::group::Build Gemma3 Benchmark"
+        cmake -DCMAKE_BUILD_TYPE=Release \
+              -DEXECUTORCH_BUILD_CUDA=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
+              -DEXECUTORCH_BUILD_TESTS=ON \
+              -Bcmake-out .
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark
+        echo "::endgroup::"
+
+        echo "::group::Run Gemma3 Benchmark"
+
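+        # /opt/conda/lib is prepended so the binary can find the CUDA runtime
+        # libraries shipped with the conda toolchain; the leading "gemma3"
+        # argument selects the model family inside the shared benchmark binary.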
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        cmake-out/backends/cuda/multimodal_benchmark gemma3 model.pte aoti_cuda_blob.ptd
 
         echo "::endgroup::"
 