
Commit 4b3de68

add module level benchmark for gemma3 model
Differential Revision: D84958564
Pull Request resolved: #15241
1 parent ed47836 commit 4b3de68

File tree

5 files changed: 578 additions & 270 deletions


.github/workflows/cuda.yml

Lines changed: 107 additions & 2 deletions
@@ -164,6 +164,61 @@ jobs:
         ls -al "${RUNNER_ARTIFACT_DIR}"
         echo "::endgroup::"

+  export-gemma3-cuda-artifact:
+    name: export-gemma3-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    secrets: inherit
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      secrets-env: EXECUTORCH_HF_TOKEN
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      upload-artifact: gemma3-cuda-export
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch"
+        ./install_executorch.sh
+        echo "::endgroup::"
+
+        echo "::group::Setup Huggingface"
+        pip install -U "huggingface_hub[cli]" accelerate
+        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Export Gemma3"
+        optimum-cli export executorch \
+          --model "google/gemma-3-4b-it" \
+          --task "multimodal-text-to-text" \
+          --recipe "cuda" \
+          --dtype bfloat16 \
+          --device cuda \
+          --max_seq_len 64 \
+          --output_dir ./
+
+        test -f model.pte
+        test -f aoti_cuda_blob.ptd
+        echo "::endgroup::"
+
+        echo "::group::Store Gemma3 Artifacts"
+        mkdir -p "${RUNNER_ARTIFACT_DIR}/"
+        cp model.pte "${RUNNER_ARTIFACT_DIR}/"
+        cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
+        ls -al "${RUNNER_ARTIFACT_DIR}/"
+        echo "::endgroup::"
+
   benchmark-voxtral-cuda:
     name: benchmark-voxtral-cuda
     needs: export-voxtral-cuda-artifact
@@ -204,13 +259,63 @@ jobs:
           -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
           -DEXECUTORCH_BUILD_TESTS=ON \
           -Bcmake-out .
-        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark
         echo "::endgroup::"

         echo "::group::Run Voxtral Benchmark"

         export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
-        cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd
+        cmake-out/backends/cuda/multimodal_benchmark voxtral model.pte aoti_cuda_blob.ptd
+
+        echo "::endgroup::"
+
+  benchmark-gemma3-cuda:
+    name: benchmark-gemma3-cuda
+    needs: export-gemma3-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: gemma3-cuda-export
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Gemma3 Artifacts"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        ls -al model.pte aoti_cuda_blob.ptd
+        echo "::endgroup::"
+
+        echo "::group::Build Gemma3 Benchmark"
+        cmake -DCMAKE_BUILD_TYPE=Release \
+          -DEXECUTORCH_BUILD_CUDA=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
+          -DEXECUTORCH_BUILD_TESTS=ON \
+          -Bcmake-out .
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark
+        echo "::endgroup::"
+
+        echo "::group::Run Gemma3 Benchmark"
+
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        cmake-out/backends/cuda/multimodal_benchmark gemma3 model.pte aoti_cuda_blob.ptd

         echo "::endgroup::"

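Note the CLI shape visible in the run steps above: the old voxtral_runner took only the .pte program and the .ptd CUDA blob, while the consolidated multimodal_benchmark binary takes the model family (voxtral or gemma3) as its first positional argument, followed by the same two artifacts. As a minimal local driver sketch around that interface (not part of this commit, and assuming the binary was built into cmake-out exactly as in the workflow):

    import subprocess

    # Binary produced by `cmake --build ... --target multimodal_benchmark` above.
    BENCHMARK_BIN = "cmake-out/backends/cuda/multimodal_benchmark"

    def run_benchmark(model_family: str,
                      pte: str = "model.pte",
                      ptd: str = "aoti_cuda_blob.ptd") -> None:
        # model_family is "voxtral" or "gemma3", mirroring the CI invocations.
        subprocess.run([BENCHMARK_BIN, model_family, pte, ptd], check=True)

    if __name__ == "__main__":
        run_benchmark("gemma3")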

backends/cuda/CMakeLists.txt

Lines changed: 3 additions & 4 deletions
@@ -105,11 +105,10 @@ target_link_libraries(
 executorch_target_link_options_shared_lib(aoti_cuda)

 if(BUILD_TESTING)
-  # Add runtime
-  add_executable(voxtral_runner tests/voxtral_runner.cpp)
+  add_executable(multimodal_benchmark tests/multimodal_benchmark.cpp)
   target_link_libraries(
-    voxtral_runner PUBLIC aoti_cuda extension_module_static
-    extension_flat_tensor portable_ops_lib
+    multimodal_benchmark PUBLIC aoti_cuda extension_module_static
+    extension_flat_tensor portable_ops_lib
   )
 endif()


backends/cuda/cuda_backend.py

Lines changed: 3 additions & 0 deletions
@@ -140,6 +140,9 @@ def preprocess(
             user_input_placeholders.append(node.meta["val"])

         options: dict[str, typing.Any] = {
+            # Disable this to support sdpa decomposition
+            # TODO(gasoonjia): remove it after pin bump to latest pytorch
+            "loop_ordering_after_fusion": False,
             # Better model precision
             "emulate_precision_casts": True,
             # Embed CUDA kernel binaries directly into the compiled shared object
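For context on the new entry: loop_ordering_after_fusion and emulate_precision_casts are names of torch._inductor config flags, and this options dict is applied when the backend lowers the model through AOT Inductor. A hedged standalone sketch of how such flags take effect, using only public torch.compile machinery (an illustration, not the backend's actual call path):

    import torch
    from torch._inductor import config as inductor_config

    options = {
        # Same flags as the backend's options dict above.
        "loop_ordering_after_fusion": False,  # keep the sdpa decomposition intact
        "emulate_precision_casts": True,  # better model precision
    }

    def compile_with_options(model: torch.nn.Module, example_inputs: tuple):
        # config.patch applies the flags only for the duration of this block;
        # torch.compile compiles on first call, so the call stays inside it.
        with inductor_config.patch(options):
            compiled = torch.compile(model)
            return compiled(*example_inputs)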
