Skip to content

Commit b325341

Browse files
committed
add module level benchmark for gemma3 model
This diff adds a module-level benchmark for the GEMMA3 model. It also introduces multimodal_benchmark.cpp, replacing the original voxtral_runner.cpp, to benchmark both the gemma3 and voxtral models at module level. Differential Revision: [D84958564](https://our.internmc.facebook.com/intern/diff/D84958564/) [ghstack-poisoned]
1 parent 2cacc74 commit b325341

File tree

4 files changed

+575
-270
lines changed

4 files changed

+575
-270
lines changed

.github/workflows/cuda.yml

Lines changed: 107 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,61 @@ jobs:
150150
ls -al "${RUNNER_ARTIFACT_DIR}"
151151
echo "::endgroup::"
152152
153+
export-gemma3-cuda-artifact:
154+
name: export-gemma3-cuda-artifact
155+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
156+
permissions:
157+
id-token: write
158+
contents: read
159+
secrets: inherit
160+
strategy:
161+
fail-fast: false
162+
with:
163+
timeout: 90
164+
secrets-env: EXECUTORCH_HF_TOKEN
165+
runner: linux.g5.4xlarge.nvidia.gpu
166+
gpu-arch-type: cuda
167+
gpu-arch-version: 12.6
168+
use-custom-docker-registry: false
169+
submodules: recursive
170+
upload-artifact: gemma3-cuda-export
171+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
172+
script: |
173+
set -eux
174+
175+
echo "::group::Setup ExecuTorch"
176+
./install_executorch.sh
177+
echo "::endgroup::"
178+
179+
echo "::group::Setup Huggingface"
180+
pip install -U "huggingface_hub[cli]" accelerate
181+
huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
182+
OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
183+
pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
184+
pip list
185+
echo "::endgroup::"
186+
187+
echo "::group::Export Gemma3"
188+
optimum-cli export executorch \
189+
--model "google/gemma-3-4b-it" \
190+
--task "multimodal-text-to-text" \
191+
--recipe "cuda" \
192+
--dtype bfloat16 \
193+
--device cuda \
194+
--max_seq_len 64 \
195+
--output_dir ./
196+
197+
test -f model.pte
198+
test -f aoti_cuda_blob.ptd
199+
echo "::endgroup::"
200+
201+
echo "::group::Store Gemma3 Artifacts"
202+
mkdir -p "${RUNNER_ARTIFACT_DIR}/"
203+
cp model.pte "${RUNNER_ARTIFACT_DIR}/"
204+
cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
205+
ls -al "${RUNNER_ARTIFACT_DIR}/"
206+
echo "::endgroup::"
207+
153208
benchmark-voxtral-cuda:
154209
name: benchmark-voxtral-cuda
155210
needs: export-voxtral-cuda-artifact
@@ -190,13 +245,63 @@ jobs:
190245
-DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
191246
-DEXECUTORCH_BUILD_TESTS=ON \
192247
-Bcmake-out .
193-
cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner
248+
cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark
194249
echo "::endgroup::"
195250
196251
echo "::group::Run Voxtral Benchmark"
197252
198253
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
199-
cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd
254+
cmake-out/backends/cuda/multimodal_benchmark voxtral model.pte aoti_cuda_blob.ptd
255+
256+
echo "::endgroup::"
257+
258+
benchmark-gemma3-cuda:
259+
name: benchmark-gemma3-cuda
260+
needs: export-gemma3-cuda-artifact
261+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
262+
permissions:
263+
id-token: write
264+
contents: read
265+
strategy:
266+
fail-fast: false
267+
with:
268+
timeout: 90
269+
runner: linux.g5.4xlarge.nvidia.gpu
270+
gpu-arch-type: cuda
271+
gpu-arch-version: 12.6
272+
use-custom-docker-registry: false
273+
submodules: recursive
274+
download-artifact: gemma3-cuda-export
275+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
276+
script: |
277+
set -eux
278+
279+
echo "::group::Setup ExecuTorch Requirements"
280+
CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
281+
pip list
282+
echo "::endgroup::"
283+
284+
echo "::group::Prepare Gemma3 Artifacts"
285+
cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
286+
cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
287+
ls -al model.pte aoti_cuda_blob.ptd
288+
echo "::endgroup::"
289+
290+
echo "::group::Build Gemma3 Benchmark"
291+
cmake -DCMAKE_BUILD_TYPE=Release \
292+
-DEXECUTORCH_BUILD_CUDA=ON \
293+
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
294+
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
295+
-DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
296+
-DEXECUTORCH_BUILD_TESTS=ON \
297+
-Bcmake-out .
298+
cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark
299+
echo "::endgroup::"
300+
301+
echo "::group::Run Gemma3 Benchmark"
302+
303+
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
304+
cmake-out/backends/cuda/multimodal_benchmark gemma3 model.pte aoti_cuda_blob.ptd
200305
201306
echo "::endgroup::"
202307

backends/cuda/CMakeLists.txt

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -71,11 +71,10 @@ target_link_libraries(
7171
executorch_target_link_options_shared_lib(aoti_cuda)
7272

7373
if(BUILD_TESTING)
74-
# Add runtime
75-
add_executable(voxtral_runner tests/voxtral_runner.cpp)
74+
add_executable(multimodal_benchmark tests/multimodal_benchmark.cpp)
7675
target_link_libraries(
77-
voxtral_runner PUBLIC aoti_cuda extension_module_static
78-
extension_flat_tensor portable_ops_lib
76+
multimodal_benchmark PUBLIC aoti_cuda extension_module_static
77+
extension_flat_tensor portable_ops_lib
7978
)
8079
endif()
8180

0 commit comments

Comments
 (0)