
Commit bc931ba

Merge branch 'main' into pin-bump-oct17
2 parents dd8e85a + 7f68c4f commit bc931ba


47 files changed: +1828 −364 lines

.github/workflows/cuda.yml

Lines changed: 205 additions & 2 deletions
@@ -164,6 +164,75 @@ jobs:
         ls -al "${RUNNER_ARTIFACT_DIR}"
         echo "::endgroup::"
 
+  export-gemma3-cuda-artifact:
+    name: export-gemma3-cuda-${{ matrix.quant.name }}
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    secrets: inherit
+    strategy:
+      fail-fast: false
+      matrix:
+        quant:
+          - name: "non-quantized"
+            artifact: "voxtral-cuda-export"
+            extra_args: ""
+          # TODO: enable gemma3 quantization
+          # - name: "quantized-int4-tile-packed"
+          #   artifact: "voxtral-cuda-quantized-int4-tile-packed"
+          #   extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
+          # - name: "quantized-int4-weight-only"
+          #   artifact: "voxtral-cuda-quantized-int4-weight-only"
+          #   # TODO: adding "--qlinear 4w" produces invalid results. Need further investigation.
+          #   extra_args: "--qlinear_encoder 4w"
+    with:
+      timeout: 90
+      secrets-env: EXECUTORCH_HF_TOKEN
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      upload-artifact: gemma3-cuda-export
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch"
+        ./install_executorch.sh
+        echo "::endgroup::"
+
+        echo "::group::Setup Huggingface"
+        pip install -U "huggingface_hub[cli]" accelerate
+        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Export Gemma3 (${{ matrix.quant.name }})"
+        EXTRA_ARGS="${{ matrix.quant.extra_args }}"
+        optimum-cli export executorch \
+          --model "google/gemma-3-4b-it" \
+          --task "multimodal-text-to-text" \
+          --recipe "cuda" \
+          --dtype bfloat16 \
+          --device cuda \
+          --max_seq_len 64 \
+          --output_dir ./
+
+        test -f model.pte
+        test -f aoti_cuda_blob.ptd
+        echo "::endgroup::"
+
+        echo "::group::Store Gemma3 Artifacts (${{ matrix.quant.name }})"
+        mkdir -p "${RUNNER_ARTIFACT_DIR}/"
+        cp model.pte "${RUNNER_ARTIFACT_DIR}/"
+        cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
+        ls -al "${RUNNER_ARTIFACT_DIR}/"
+        echo "::endgroup::"
+
   benchmark-voxtral-cuda:
     name: benchmark-voxtral-cuda
     needs: export-voxtral-cuda-artifact
@@ -204,13 +273,63 @@ jobs:
           -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
           -DEXECUTORCH_BUILD_TESTS=ON \
           -Bcmake-out .
-        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark
         echo "::endgroup::"
 
         echo "::group::Run Voxtral Benchmark"
 
         export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
-        cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd
+        cmake-out/backends/cuda/multimodal_benchmark voxtral model.pte aoti_cuda_blob.ptd
+
+        echo "::endgroup::"
+
+  benchmark-gemma3-cuda:
+    name: benchmark-gemma3-cuda
+    needs: export-gemma3-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: gemma3-cuda-export
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Gemma3 Artifacts"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        ls -al model.pte aoti_cuda_blob.ptd
+        echo "::endgroup::"
+
+        echo "::group::Build Gemma3 Benchmark"
+        cmake -DCMAKE_BUILD_TYPE=Release \
+          -DEXECUTORCH_BUILD_CUDA=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
+          -DEXECUTORCH_BUILD_TESTS=ON \
+          -Bcmake-out .
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark
+        echo "::endgroup::"
+
+        echo "::group::Run Gemma3 Benchmark"
+
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        cmake-out/backends/cuda/multimodal_benchmark gemma3 model.pte aoti_cuda_blob.ptd
 
         echo "::endgroup::"
 
@@ -302,3 +421,87 @@ jobs:
           exit $EXIT_CODE
         fi
         echo "::endgroup::"
+
+  test-gemma3-cuda-e2e:
+    name: test-gemma3-cuda-e2e-${{ matrix.format.name }}
+    needs: export-gemma3-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+      matrix:
+        format:
+          - name: "non-quantized"
+            artifact: "gemma3-cuda-export"
+          # TODO: enable quantized gemma3.
+          # - name: "quantized-int4-tile-packed"
+          #   artifact: "gemma3-cuda-quantized-int4-tile-packed"
+          # - name: "quantized-int4-weight-only"
+          #   artifact: "gemma3-cuda-quantized-int4-weight-only"
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: ${{ matrix.format.artifact }}
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Gemma3 Artifacts (${{ matrix.format.name }})"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        TOKENIZER_URL="https://huggingface.co/unsloth/gemma-3-1b-it/resolve/main/tokenizer.json"
+        curl -L $TOKENIZER_URL -o tokenizer.json
+        ls -al model.pte aoti_cuda_blob.ptd tokenizer.json
+        IMAGE_PATH="docs/source/_static/img/et-logo.png"
+        echo "::endgroup::"
+
+        echo "::group::Build Gemma3 Runner"
+        cmake --preset llm \
+          -DEXECUTORCH_BUILD_CUDA=ON \
+          -DCMAKE_INSTALL_PREFIX=cmake-out \
+          -DCMAKE_BUILD_TYPE=Release \
+          -Bcmake-out -S.
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release
+
+        cmake -DEXECUTORCH_BUILD_CUDA=ON \
+          -DCMAKE_BUILD_TYPE=Release \
+          -Sexamples/models/gemma3 \
+          -Bcmake-out/examples/models/gemma3/
+        cmake --build cmake-out/examples/models/gemma3 --target gemma3_e2e_runner --config Release
+        echo "::endgroup::"
+
+        echo "::group::Run Gemma3 Runner (${{ matrix.format.name }})"
+        set +e
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        OUTPUT=$(cmake-out/examples/models/gemma3/gemma3_e2e_runner \
+          --model_path model.pte \
+          --data_path aoti_cuda_blob.ptd \
+          --tokenizer_path tokenizer.json \
+          --image_path $IMAGE_PATH \
+          --temperature 0 2>&1)
+        EXIT_CODE=$?
+        set -e
+
+        echo "$OUTPUT"
+
+        if ! echo "$OUTPUT" | grep -iq "chip"; then
+          echo "Expected output 'chip' not found in output"
+          exit 1
+        fi
+
+        if [ $EXIT_CODE -ne 0 ]; then
+          echo "Unexpected exit code: $EXIT_CODE"
+          exit $EXIT_CODE
+        fi
+        echo "::endgroup::"
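A side note on the tokenizer step in test-gemma3-cuda-e2e: the curl download of tokenizer.json could equally be done through the huggingface_hub Python API that the export job already installs. A minimal sketch, not part of this commit and shown only for illustration:

# Illustrative only: fetch the same tokenizer.json that the e2e job pulls via curl.
# Assumes huggingface_hub is installed (the export job installs huggingface_hub[cli]).
from huggingface_hub import hf_hub_download

tokenizer_path = hf_hub_download(
    repo_id="unsloth/gemma-3-1b-it",  # same repo as TOKENIZER_URL in the workflow
    filename="tokenizer.json",
)
print(tokenizer_path)  # local cache path that could be passed as --tokenizer_path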

backends/cadence/aot/ops_registrations.py

Lines changed: 6 additions & 6 deletions
@@ -53,7 +53,6 @@ def _validate_ref_impl_exists() -> None:
 # 1. be removed
 # 2. have a reference implementation added to ref_implementations.py
 _WARN_ONLY = {
-    "cadence::_softmax_f32_f32",
     "cadence::quantized_softmax.per_tensor",
     "cadence::quantized_softmax",
     "cadence::quantized_w8a32_gru",
@@ -640,10 +639,10 @@ def register_fake(
     "int sampling_ratio, bool aligned) -> (Tensor out)"
 )
 lib.define(
-    "_softmax_f32_f32(Tensor self, int dim, bool? half_to_float) -> (Tensor out)"
+    "_softmax_f32_f32(Tensor self, int dim, bool? half_to_float = None) -> (Tensor out)"
 )
 lib.define(
-    "_softmax_f32_f32.out(Tensor self, int dim, bool? half_to_float, *, Tensor(a!) out) -> Tensor(a!)"
+    "_softmax_f32_f32.out(Tensor self, int dim, bool? half_to_float = None, *, Tensor(a!) out) -> Tensor(a!)"
 )
 
 lib.define(
@@ -2652,12 +2651,13 @@ def quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_meta(
 
 @register_fake("cadence::_softmax_f32_f32")
 def softmax_f32_f32_meta(
-    self: torch.Tensor,
+    input_tensor: torch.Tensor,
     dim: int,
-    dtype: torch.dtype,
     half_to_float: Optional[bool] = None,
 ) -> torch.Tensor:
-    return self.new_empty(self.size(), dtype=self.dtype)
+    assert input_tensor.dtype == torch.float32, "input_tensor must be float32"
+    assert not half_to_float, "half_to_float is not supported"
+    return input_tensor.new_empty(input_tensor.size(), dtype=torch.float32)
 
 
 @register_fake("cadence::quantized_softmax")
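For context on what the updated fake kernel buys: with half_to_float defaulting to None and the stray dtype parameter dropped, shape and dtype propagation for cadence::_softmax_f32_f32 can be exercised without a device. A minimal sketch, assuming the op registrations from this repo are importable as the Python module path shown below:

# Illustrative sketch: run the fake (meta) kernel under FakeTensorMode to check
# that cadence::_softmax_f32_f32 propagates shape and dtype as expected.
# The import path is an assumption about how this repo packages ops_registrations.
import torch
from torch._subclasses.fake_tensor import FakeTensorMode

import executorch.backends.cadence.aot.ops_registrations  # noqa: F401  # registers the op

with FakeTensorMode():
    x = torch.empty(2, 3, dtype=torch.float32)
    out = torch.ops.cadence._softmax_f32_f32(x, dim=1)  # half_to_float now defaults to None
    assert out.shape == x.shape and out.dtype == torch.float32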

backends/cadence/aot/ref_implementations.py

Lines changed: 11 additions & 0 deletions
@@ -1979,3 +1979,14 @@ def linalg_svd(
     assert compute_uv
     U, S, Vh = torch.linalg.svd(A, full_matrices=full_matrices, driver=driver)
     return U.contiguous(), S.contiguous(), Vh.contiguous()
+
+
+@impl_tracked(m, "_softmax_f32_f32")
+def softmax_f32_f32(
+    input_tensor: torch.Tensor,
+    dim: int,
+    half_to_float: bool | None = None,
+) -> torch.Tensor:
+    assert input_tensor.dtype == torch.float32, "input_tensor must be float32"
+    assert not half_to_float, "half_to_float is not supported"
+    return torch.nn.functional.softmax(input_tensor, dim=dim, dtype=torch.float32)

backends/cadence/aot/tests/test_ref_implementations.py

Lines changed: 9 additions & 0 deletions
@@ -2885,3 +2885,12 @@ def test_quantized_layer_norm(self) -> None:
             output_scale,
             output_zero_point,
         )
+
+    def test_softmax_f32_f32(self) -> None:
+        # Just a wrapper around torch.nn.functional.softmax, so just ensure that it runs
+        input_tensor = torch.tensor(
+            [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=torch.float32
+        )
+        output = torch.ops.cadence._softmax_f32_f32(input_tensor, dim=1)
+        self.assertEqual(output.dtype, torch.float32)
+        self.assertEqual(output.shape, input_tensor.shape)

backends/cuda/CMakeLists.txt

Lines changed: 39 additions & 6 deletions
@@ -34,6 +34,39 @@ find_package(CUDAToolkit REQUIRED)
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 find_package_torch()
 
+# CUDA tensor maker for backends that support incontiguous tensors
+set(_tensor_maker_sources runtime/tensor/tensor_maker.cpp)
+add_library(cuda_tensor_maker STATIC ${_tensor_maker_sources})
+target_include_directories(
+  cuda_tensor_maker
+  PUBLIC $<BUILD_INTERFACE:${EXECUTORCH_ROOT}> $<INSTALL_INTERFACE:include>
+         $<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
+)
+target_compile_options(
+  cuda_tensor_maker
+  PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/EHsc /GR>
+         $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-fexceptions -frtti -fPIC>
+)
+# Ensure symbols are exported properly
+if(APPLE)
+  target_link_options(cuda_tensor_maker PUBLIC -Wl,-export_dynamic)
+else()
+  target_link_options(
+    cuda_tensor_maker PUBLIC
+    $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wl,--export-dynamic>
+  )
+endif()
+
+# Link against ExecuTorch core libraries
+target_link_libraries(cuda_tensor_maker PUBLIC executorch ${CMAKE_DL_LIBS})
+executorch_target_link_options_shared_lib(cuda_tensor_maker)
+
+install(
+  TARGETS cuda_tensor_maker
+  EXPORT ExecuTorchTargets
+  DESTINATION lib
+)
+
 # CUDA-specific AOTI functionality
 set(_aoti_cuda_sources
   runtime/cuda_backend.cpp
@@ -62,20 +95,20 @@ target_link_options(
   aoti_cuda PUBLIC $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wl,--export-dynamic>
 )
 
-# Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries
+# Link against CUDA::cudart, common AOTI library, cuda_tensor_maker, and PyTorch
+# CUDA libraries
 target_link_libraries(
-  aoti_cuda PUBLIC aoti_common CUDA::cudart ${CMAKE_DL_LIBS}
+  aoti_cuda PUBLIC aoti_common cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS}
 )
 # If you need other CUDA libraries, link them similarly:
 # target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...)
 executorch_target_link_options_shared_lib(aoti_cuda)
 
 if(BUILD_TESTING)
-  # Add runtime
-  add_executable(voxtral_runner tests/voxtral_runner.cpp)
+  add_executable(multimodal_benchmark tests/multimodal_benchmark.cpp)
   target_link_libraries(
-    voxtral_runner PUBLIC aoti_cuda extension_module_static
-    extension_flat_tensor portable_ops_lib
+    multimodal_benchmark PUBLIC aoti_cuda extension_module_static
+    extension_flat_tensor portable_ops_lib
   )
 endif()
 

backends/cuda/cuda_backend.py

Lines changed: 3 additions & 0 deletions
@@ -140,6 +140,9 @@ def preprocess(
         user_input_placeholders.append(node.meta["val"])
 
         options: dict[str, typing.Any] = {
+            # Disable this to support sdpa decomposition
+            # TODO(gasoonjia): remove it after pin bump to latest pytorch
+            "loop_ordering_after_fusion": False,
             # Better model precision
             "emulate_precision_casts": True,
             # Embed CUDA kernel binaries directly into the compiled shared object
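The two flags added or already present in this options dict match torch._inductor config settings of the same names, so the behavior can be reproduced outside the backend when debugging. A minimal, hypothetical sketch; how cuda_backend.py actually forwards options to Inductor is not shown in this hunk:

# Illustrative only: apply the same Inductor settings that the CUDA backend's
# options dict requests, using torch._inductor.config directly.
import torch._inductor.config as inductor_config

with inductor_config.patch(
    {
        "loop_ordering_after_fusion": False,  # keep the SDPA decomposition working (see TODO above)
        "emulate_precision_casts": True,  # better model precision
    }
):
    # ... compile or export the model here (e.g. torch.export followed by AOTInductor) ...
    pass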
