Commit a5bc7c8

Update on "adding cuda memory estimation support"

Differential Revision: [D85119089](https://our.internmc.facebook.com/intern/diff/D85119089/) [ghstack-poisoned]

2 parents: 0d58b63 + c959fe9

54 files changed: +2076 −307 lines

.github/workflows/cuda.yml

Lines changed: 101 additions & 3 deletions
@@ -165,14 +165,27 @@ jobs:
         echo "::endgroup::"

   export-gemma3-cuda-artifact:
-    name: export-gemma3-cuda-artifact
+    name: export-gemma3-cuda-${{ matrix.quant.name }}
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
       contents: read
     secrets: inherit
     strategy:
       fail-fast: false
+      matrix:
+        quant:
+          - name: "non-quantized"
+            artifact: "voxtral-cuda-export"
+            extra_args: ""
+          # TODO: enable gemma3 quantization
+          # - name: "quantized-int4-tile-packed"
+          #   artifact: "voxtral-cuda-quantized-int4-tile-packed"
+          #   extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
+          # - name: "quantized-int4-weight-only"
+          #   artifact: "voxtral-cuda-quantized-int4-weight-only"
+          #   # TODO: adding "--qlinear 4w" produces invalid results. Need further investigation.
+          #   extra_args: "--qlinear_encoder 4w"
     with:
       timeout: 90
       secrets-env: EXECUTORCH_HF_TOKEN

@@ -198,7 +211,8 @@ jobs:
         pip list
         echo "::endgroup::"

-        echo "::group::Export Gemma3"
+        echo "::group::Export Gemma3 (${{ matrix.quant.name }})"
+        EXTRA_ARGS="${{ matrix.quant.extra_args }}"
         optimum-cli export executorch \
           --model "google/gemma-3-4b-it" \
           --task "multimodal-text-to-text" \

@@ -212,7 +226,7 @@ jobs:
         test -f aoti_cuda_blob.ptd
         echo "::endgroup::"

-        echo "::group::Store Gemma3 Artifacts"
+        echo "::group::Store Gemma3 Artifacts (${{ matrix.quant.name }})"
         mkdir -p "${RUNNER_ARTIFACT_DIR}/"
         cp model.pte "${RUNNER_ARTIFACT_DIR}/"
         cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"

@@ -407,3 +421,87 @@ jobs:
           exit $EXIT_CODE
         fi
         echo "::endgroup::"
+
+  test-gemma3-cuda-e2e:
+    name: test-gemma3-cuda-e2e-${{ matrix.format.name }}
+    needs: export-gemma3-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+      matrix:
+        format:
+          - name: "non-quantized"
+            artifact: "gemma3-cuda-export"
+          # TODO: enable quantized gemma3.
+          # - name: "quantized-int4-tile-packed"
+          #   artifact: "gemma3-cuda-quantized-int4-tile-packed"
+          # - name: "quantized-int4-weight-only"
+          #   artifact: "gemma3-cuda-quantized-int4-weight-only"
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: ${{ matrix.format.artifact }}
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Gemma3 Artifacts (${{ matrix.format.name }})"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        TOKENIZER_URL="https://huggingface.co/unsloth/gemma-3-1b-it/resolve/main/tokenizer.json"
+        curl -L $TOKENIZER_URL -o tokenizer.json
+        ls -al model.pte aoti_cuda_blob.ptd tokenizer.json
+        IMAGE_PATH="docs/source/_static/img/et-logo.png"
+        echo "::endgroup::"
+
+        echo "::group::Build Gemma3 Runner"
+        cmake --preset llm \
+          -DEXECUTORCH_BUILD_CUDA=ON \
+          -DCMAKE_INSTALL_PREFIX=cmake-out \
+          -DCMAKE_BUILD_TYPE=Release \
+          -Bcmake-out -S.
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release
+
+        cmake -DEXECUTORCH_BUILD_CUDA=ON \
+          -DCMAKE_BUILD_TYPE=Release \
+          -Sexamples/models/gemma3 \
+          -Bcmake-out/examples/models/gemma3/
+        cmake --build cmake-out/examples/models/gemma3 --target gemma3_e2e_runner --config Release
+        echo "::endgroup::"
+
+        echo "::group::Run Gemma3 Runner (${{ matrix.format.name }})"
+        set +e
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        OUTPUT=$(cmake-out/examples/models/gemma3/gemma3_e2e_runner \
+          --model_path model.pte \
+          --data_path aoti_cuda_blob.ptd \
+          --tokenizer_path tokenizer.json \
+          --image_path $IMAGE_PATH \
+          --temperature 0 2>&1)
+        EXIT_CODE=$?
+        set -e
+
+        echo "$OUTPUT"
+
+        if ! echo "$OUTPUT" | grep -iq "chip"; then
+          echo "Expected output 'chip' not found in output"
+          exit 1
+        fi
+
+        if [ $EXIT_CODE -ne 0 ]; then
+          echo "Unexpected exit code: $EXIT_CODE"
+          exit $EXIT_CODE
+        fi
+        echo "::endgroup::"

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ executorch
 │ ├── <a href="backends/openvino">openvino</a> - OpenVINO backend for Intel hardware.
 │ ├── <a href="backends/qualcomm">qualcomm</a> - Qualcomm-specific backends. See <a href="docs/source/backends-qualcomm.md">doc</a>.
 │ ├── <a href="backends/transforms">transforms</a> - Transformations for backend optimization.
-│ ├── <a href="backends/vulkan">vulkan</a> - Vulkan backend for cross-platform GPU support. See <a href="docs/source/backends-vulkan.md">doc</a>.
+│ ├── <a href="backends/vulkan">vulkan</a> - Vulkan backend for cross-platform GPU support. See <a href="docs/source/backends/vulkan/vulkan-overview.md">doc</a>.
 │ └── <a href="backends/xnnpack">xnnpack</a> - XNNPACK backend for optimized neural network operations. See <a href="docs/source/backends/xnnpack/xnnpack-overview.md">doc</a>.
 ├── <a href="codegen">codegen</a> - Tooling to autogenerate bindings between kernels and the runtime.
 ├── <a href="configurations">configurations</a> - Configuration files.

backends/arm/test/models/test_nn_modules.py

Lines changed: 77 additions & 18 deletions
@@ -17,32 +17,91 @@
 - Transformer
 """

+from typing import Callable
+
 import torch
 from executorch.backends.arm.test.common import parametrize
 from executorch.backends.arm.test.tester.test_pipeline import (
     TosaPipelineFP,
     TosaPipelineINT,
 )

+
+def make_module_wrapper(
+    name: str, module_factory: Callable[[], torch.nn.Module]
+) -> torch.nn.Module:
+    class ModuleWrapper(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self._module = module_factory()
+
+        def forward(self, *args, **kwargs):
+            return self._module(*args, **kwargs)
+
+    ModuleWrapper.__name__ = name
+    ModuleWrapper.__qualname__ = name
+    return ModuleWrapper()
+
+
 example_input = torch.rand(1, 6, 16, 16)

 module_tests = [
-    (torch.nn.Embedding(10, 10), (torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]]),)),
-    (torch.nn.LeakyReLU(), (example_input,)),
-    (torch.nn.BatchNorm1d(16), (torch.rand(6, 16, 16),)),
-    (torch.nn.AdaptiveAvgPool2d((12, 12)), (example_input,)),
-    (torch.nn.ConvTranspose2d(6, 3, 2), (example_input,)),
-    (torch.nn.GRU(10, 20, 2), (torch.randn(5, 3, 10), torch.randn(2, 3, 20))),
-    (torch.nn.GroupNorm(2, 6), (example_input,)),
-    (torch.nn.InstanceNorm2d(16), (example_input,)),
-    (torch.nn.PReLU(), (example_input,)),
     (
-        torch.nn.Transformer(
-            d_model=64,
-            nhead=1,
-            num_encoder_layers=1,
-            num_decoder_layers=1,
-            dtype=torch.float32,
+        make_module_wrapper(
+            "EmbeddingModule",
+            lambda: torch.nn.Embedding(10, 10),
+        ),
+        (torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]]),),
+    ),
+    (
+        make_module_wrapper("LeakyReLUModule", torch.nn.LeakyReLU),
+        (example_input,),
+    ),
+    (
+        make_module_wrapper("BatchNorm1dModule", lambda: torch.nn.BatchNorm1d(16)),
+        (torch.rand(6, 16, 16),),
+    ),
+    (
+        make_module_wrapper(
+            "AdaptiveAvgPool2dModule",
+            lambda: torch.nn.AdaptiveAvgPool2d((12, 12)),
+        ),
+        (example_input,),
+    ),
+    (
+        make_module_wrapper(
+            "ConvTranspose2dModule", lambda: torch.nn.ConvTranspose2d(6, 3, 2)
+        ),
+        (example_input,),
+    ),
+    (
+        make_module_wrapper("GRUModule", lambda: torch.nn.GRU(10, 20, 2)),
+        (torch.randn(5, 3, 10), torch.randn(2, 3, 20)),
+    ),
+    (
+        make_module_wrapper("GroupNormModule", lambda: torch.nn.GroupNorm(2, 6)),
+        (example_input,),
+    ),
+    (
+        make_module_wrapper(
+            "InstanceNorm2dModule", lambda: torch.nn.InstanceNorm2d(16)
+        ),
+        (example_input,),
+    ),
+    (
+        make_module_wrapper("PReLUModule", torch.nn.PReLU),
+        (example_input,),
+    ),
+    (
+        make_module_wrapper(
+            "TransformerModule",
+            lambda: torch.nn.Transformer(
+                d_model=64,
+                nhead=1,
+                num_encoder_layers=1,
+                num_decoder_layers=1,
+                dtype=torch.float32,
+            ),
         ),
         (torch.rand((10, 32, 64)), torch.rand((20, 32, 64))),
     ),

@@ -78,9 +137,9 @@ def test_nn_Modules_FP(test_data):
     "test_data",
     test_parameters,
     xfails={
-        "GRU": "RuntimeError: Node aten_linear_default with op <EdgeOpOverload: aten.linear[...]> was not decomposed or delegated.",
-        "PReLU": "RuntimeError: mul(): functions with out=... arguments don't support automatic differentiation, but one of the arguments requires grad.",
-        "Transformer": "AssertionError: Output 0 does not match reference output.",
+        "GRUModule": "RuntimeError: Node aten_linear_default with op <EdgeOpOverload: aten.linear[...]> was not decomposed or delegated.",
+        "PReLUModule": "RuntimeError: mul(): functions with out=... arguments don't support automatic differentiation, but one of the arguments requires grad.",
+        "TransformerModule": "AssertionError: Output 0 does not match reference output.",
     },
 )
 def test_nn_Modules_INT(test_data):
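Each entry in `module_tests` now carries a uniquely named wrapper class instead of a bare `torch.nn` module; the suite's `parametrize` helper appears to derive test ids, and therefore `xfails` keys, from that class name, which is why the keys change from "GRU" to "GRUModule" above. A short usage sketch, assuming the `make_module_wrapper` defined in this diff:

```python
import torch

# Illustrative sketch: the wrapper instance reports the name passed to the
# factory, giving otherwise generically named module types distinct test ids.
gru = make_module_wrapper("GRUModule", lambda: torch.nn.GRU(10, 20, 2))
assert type(gru).__name__ == "GRUModule"

# Calls are forwarded unchanged to the wrapped module.
out, hidden = gru(torch.randn(5, 3, 10), torch.randn(2, 3, 20))
print(out.shape)  # torch.Size([5, 3, 20])
```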

backends/arm/test/models/test_resnet18.py

Lines changed: 0 additions & 3 deletions
@@ -79,9 +79,6 @@ def test_resnet_u55_INT(per_channel_quantization):


 @pytest.mark.slow
-@pytest.mark.xfail(
-    reason="For resnet18 for Ethos-U85, the SRAM memory footprint is very high. The compiler team is investigating."
-)
 @common.XfailIfNoCorstone320
 @common.parametrize("per_channel_quantization", quant_test_data)
 def test_resnet_u85_INT(per_channel_quantization):
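With the decorator gone, `test_resnet_u85_INT` is expected to pass on Corstone-320 rather than being tolerated as a known failure. For reference, a minimal standalone sketch of the non-strict `pytest.mark.xfail` behavior that was removed (not code from this repo):

```python
import pytest

# With the marker, a failing test is reported as "xfail" and does not fail
# the suite; once the marker is removed, the same failure fails the suite.
@pytest.mark.xfail(reason="illustrative: known issue under investigation")
def test_known_issue():
    assert False
```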

backends/arm/test/models/test_torch_functions.py

Lines changed: 0 additions & 1 deletion
@@ -128,7 +128,6 @@ def test_torch_fns_FP(test_data):
         "Requires dynamic output shape.",
         "topk": "NotImplementedError: No registered serialization name for <class 'torch.return_types.topk'> found",
         "sort": "NotImplementedError: No registered serialization name for <class 'torch.return_types.sort'> found",
-        "t": "MLETORCH-855: Issue with Quantization folding.",
     },
     strict=False,
 )
