Skip to content

Commit e56b81f

Browse files
author
ssjia
committed
Update on "[ET-VK] Add mechanism to trigger command buffer re-encode only when necessary"
## Context Dynamic shape models currently will require the command buffer to be re-encoded every inference. However, this introduces a significant overhead when running models that require dynamic shapes. The reality is that a command buffer re-encode may not be needed every frame. A command buffer re-encode will only be needed when: 1. Shader dispatch parameters change; i.e. new tensor sizes require a completely different compute shader, require new local work group sizing, or require new work group grid size (i.e. global work group size / local work group size) 2. Push constants containing tensor metadata need to be updated This diff aims to reduce the overhead of triggering tensor shape change by detecting when a command buffer re-encode is actually needed. ## Changes `ComputeGraph`: * Introduce `requires_reencode` flag to `ComputeGraph` to indicate when a command buffer re-encode is needed. * Introduce a `std::set<ValueRef>` tracking which values were updated when propagating tensor sizes * "update" can be one of two things: 1) tensor sizes changed 2) symint value changed `DispatchNode`: * When propagating new tensor sizes, only execute the resize function if any of the values participating in the computation have been updated * Mark `requires_reencode` if any push constants associated with tensor metadata need to be updated `DynamicDispatchNode`: * Only recompute compute shader dispatch params if any of the values participating in the computation have been updated * Mark `requires_reencode` if 1) a new compute shader is required, 2) local work group size changed, 3) work group grid size changed Differential Revision: [D79813237](https://our.internmc.facebook.com/intern/diff/D79813237/) [ghstack-poisoned]
2 parents 9152a97 + 4363f93 commit e56b81f

File tree

167 files changed

+3091
-1238
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

167 files changed

+3091
-1238
lines changed

.ci/scripts/test_llama_lora.sh

Lines changed: 51 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,17 @@ DOWNLOADED_PATH=$(
4848
--model_id "${HF_MODEL_REPO}" \
4949
--files "adapter_config.json" "adapter_model.pt" "consolidated.00.pth" "params.json" "tokenizer.model"
5050
)
51-
EXPORTED_MODEL_NAME="llama_3_2_1B_lora.pte"
52-
# Export model.
51+
# Build llama runner.
52+
cmake_install_executorch_libraries
53+
cmake_build_llama_runner
54+
55+
# Constants.
56+
RUNTIME_ARGS="--tokenizer_path=${DOWNLOADED_PATH}/tokenizer.model --temperature=0 --seq_len=20 --warmup=1"
57+
PROMPT="What happens if you eat watermelon seeds?"
58+
EXPECTED_PREFIX="What happens if you eat watermelon seeds? Watermelon seeds are a good source of vitamin C,"
59+
60+
# Export LoRA PTE file.
61+
MODEL_NAME="llama_3_2_1B_lora"
5362
$PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
5463
base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
5564
base.params="${DOWNLOADED_PATH}/params.json" \
@@ -61,36 +70,64 @@ $PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
6170
model.dtype_override="fp32" \
6271
backend.xnnpack.enabled=true \
6372
backend.xnnpack.extended_ops=true \
64-
export.output_name="${EXPORTED_MODEL_NAME}"
65-
66-
# Build llama runner.
67-
cmake_install_executorch_libraries
68-
cmake_build_llama_runner
73+
export.output_name="${MODEL_NAME}.pte"
6974

70-
PROMPT="What happens if you eat watermelon seeds?"
7175
# Run llama runner
72-
RUNTIME_ARGS="--model_path=${EXPORTED_MODEL_NAME} --tokenizer_path=${DOWNLOADED_PATH}/tokenizer.model --temperature=0 --seq_len=20 --warmup=1"
73-
7476
NOW=$(date +"%H:%M:%S")
7577
echo "Starting to run llama runner at ${NOW}"
7678
# shellcheck source=/dev/null
77-
cmake-out/examples/models/llama/llama_main --prompt="${PROMPT}" ${RUNTIME_ARGS} > result.txt
79+
cmake-out/examples/models/llama/llama_main --model_path=${MODEL_NAME}.pte --prompt="${PROMPT}" ${RUNTIME_ARGS} > result.txt
7880
NOW=$(date +"%H:%M:%S")
7981
echo "Finished at ${NOW}"
8082

8183
RESULT=$(cat result.txt)
82-
EXPECTED_PREFIX="What happens if you eat watermelon seeds? Watermelon seeds are a good source of vitamin C,"
83-
8484
if [[ "${RESULT}" == "${EXPECTED_PREFIX}"* ]]; then
8585
echo "Expected result prefix: ${EXPECTED_PREFIX}"
8686
echo "Actual result: ${RESULT}"
87+
# Do not clean up files if test passes, as they're re-used in the next test.
8788
echo "Success"
88-
cleanup_files
8989
else
9090
echo "Expected result prefix: ${EXPECTED_PREFIX}"
9191
echo "Actual result: ${RESULT}"
9292
echo "Failure; results not the same"
93+
cleanup_files
94+
exit 1
95+
fi
9396

97+
# Export LoRA PTE, PTD file.
98+
MODEL_SEPARATE="${MODEL_NAME}_separate"
99+
$PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
100+
base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
101+
base.params="${DOWNLOADED_PATH}/params.json" \
102+
base.adapter_checkpoint="${DOWNLOADED_PATH}/adapter_model.pt" \
103+
base.adapter_config="${DOWNLOADED_PATH}/adapter_config.json" \
104+
base.tokenizer_path="${DOWNLOADED_PATH}/tokenizer.model" \
105+
model.use_kv_cache=true \
106+
model.use_sdpa_with_kv_cache=true \
107+
model.dtype_override="fp32" \
108+
backend.xnnpack.enabled=true \
109+
backend.xnnpack.extended_ops=true \
110+
export.output_name="${MODEL_SEPARATE}.pte" \
111+
export.foundation_weights_file="${MODEL_SEPARATE}.ptd"
112+
113+
# Run llama runner.
114+
NOW=$(date +"%H:%M:%S")
115+
echo "Starting to run llama runner at ${NOW}"
116+
# shellcheck source=/dev/null
117+
cmake-out/examples/models/llama/llama_main --model_path=${MODEL_SEPARATE}.pte --data_path=${MODEL_SEPARATE}.ptd --prompt="${PROMPT}" ${RUNTIME_ARGS} > result2.txt
118+
NOW=$(date +"%H:%M:%S")
119+
echo "Finished at ${NOW}"
120+
121+
RESULT2=$(cat result2.txt)
122+
if [[ "${RESULT2}" == "${EXPECTED_PREFIX}"* ]]; then
123+
echo "Expected result prefix: ${EXPECTED_PREFIX}"
124+
echo "Actual result: ${RESULT2}"
125+
echo "Success"
126+
cleanup_files
127+
else
128+
echo "Expected result prefix: ${EXPECTED_PREFIX}"
129+
echo "Actual result: ${RESULT2}"
130+
echo "Failure; results not the same"
94131
cleanup_files
95132
exit 1
96133
fi

.github/workflows/build-presets.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@ on:
66
branches:
77
- main
88
- release/*
9-
paths:
10-
- .github/workflows/build-presets.yml
119
workflow_dispatch:
1210

1311
concurrency:

.github/workflows/trunk.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,7 @@ jobs:
288288
- test_arm_baremetal: test_models_tosa
289289
- test_arm_baremetal: test_models_ethos-u55
290290
- test_arm_baremetal: test_models_ethos-u85
291+
- test_arm_baremetal: test_smaller_stories_llama
291292
fail-fast: false
292293
with:
293294
runner: linux.2xlarge.memory

.lintrunner.toml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,36 @@ init_command = [
136136
'--requirement=requirements-lintrunner.txt',
137137
]
138138

139+
[[linter]]
140+
code = 'CMAKEFORMAT'
141+
include_patterns = [
142+
"**/*.cmake",
143+
"**/*.cmake.in",
144+
"**/CMakeLists.txt",
145+
]
146+
exclude_patterns = [
147+
'third-party/**',
148+
'**/third-party/**',
149+
]
150+
command = [
151+
'python',
152+
'-m',
153+
'lintrunner_adapters',
154+
'run',
155+
'cmake_format_linter',
156+
'--',
157+
'@{{PATHSFILE}}',
158+
]
159+
init_command = [
160+
'python',
161+
'-m',
162+
'lintrunner_adapters',
163+
'run',
164+
'pip_init',
165+
'--dry-run={{DRYRUN}}',
166+
'--requirement=requirements-lintrunner.txt',
167+
]
168+
139169
[[linter]]
140170
code = 'ETCAPITAL'
141171
include_patterns = [

CMakeLists.txt

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -284,15 +284,19 @@ if(EXECUTORCH_BUILD_KERNELS_TORCHAO)
284284
set(TORCHAO_BUILD_CPU_AARCH64 ON)
285285
set(TORCHAO_ENABLE_ARM_NEON_DOT ON)
286286

287-
list(APPEND TORCHAO_INCLUDE_DIRS
287+
list(
288+
APPEND
289+
TORCHAO_INCLUDE_DIRS
288290
${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include
289291
${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include
290292
${EXECUTORCH_ROOT}/third-party/ao
291293
)
292294

293295
set(EXECUTORCH_INCLUDE_DIRS ${TORCHAO_INCLUDE_DIRS})
294296

295-
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/ao/torchao/experimental)
297+
add_subdirectory(
298+
${CMAKE_CURRENT_SOURCE_DIR}/third-party/ao/torchao/experimental
299+
)
296300
executorch_target_link_options_shared_lib(torchao_ops_executorch)
297301
list(APPEND _executorch_kernels torchao_ops_executorch)
298302
endif()

backends/arm/CMakeLists.txt

Lines changed: 55 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -19,69 +19,71 @@ set(_common_include_directories
1919
)
2020
add_compile_definitions(C10_USING_CUSTOM_GENERATED_MACROS)
2121

22-
2322
# bare metal backend builds
2423
if(EXECUTORCH_BUILD_ARM_BAREMETAL)
2524

26-
add_compile_options("-Wall" "-Werror")
25+
add_compile_options("-Wall" "-Werror")
2726

28-
# Third-party folder and Ethos-U driver inclued
29-
set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party")
30-
set(DRIVER_ETHOSU_INCLUDE_DIR "${THIRD_PARTY_ROOT}/ethos-u-core-driver/include")
31-
include_directories(${DRIVER_ETHOSU_INCLUDE_DIR})
27+
# Third-party folder and Ethos-U driver inclued
28+
set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party")
29+
set(DRIVER_ETHOSU_INCLUDE_DIR
30+
"${THIRD_PARTY_ROOT}/ethos-u-core-driver/include"
31+
)
32+
include_directories(${DRIVER_ETHOSU_INCLUDE_DIR})
3233

33-
set(_arm_baremetal_sources backends/arm/runtime/EthosUBackend.cpp
34-
backends/arm/runtime/VelaBinStream.cpp
35-
)
36-
list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/")
34+
set(_arm_baremetal_sources backends/arm/runtime/EthosUBackend.cpp
35+
backends/arm/runtime/VelaBinStream.cpp
36+
)
37+
list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/")
3738

38-
add_library(executorch_delegate_ethos_u STATIC ${_arm_baremetal_sources})
39-
target_link_libraries(
40-
executorch_delegate_ethos_u PUBLIC executorch_core ethosu_core_driver
41-
)
39+
add_library(executorch_delegate_ethos_u STATIC ${_arm_baremetal_sources})
40+
target_link_libraries(
41+
executorch_delegate_ethos_u PUBLIC executorch_core ethosu_core_driver
42+
)
4243

43-
install(TARGETS executorch_delegate_ethos_u EXPORT ExecuTorchTargets)
44+
install(TARGETS executorch_delegate_ethos_u EXPORT ExecuTorchTargets)
4445

45-
# end config for bare metal builds
46+
# end config for bare metal builds
4647
endif()
4748

48-
49-
# VGF backend builds
49+
# VGF backend builds
5050
if(EXECUTORCH_BUILD_VGF)
5151

52-
# include libvgf
53-
set(LIBVGF_PATH "${EXECUTORCH_ROOT}/examples/arm/ethos-u-scratch/ml-sdk-for-vulkan-manifest/sw/vgf-lib/")
54-
55-
set(VULKAN_THIRD_PARTY_PATH ${EXECUTORCH_ROOT}/backends/vulkan/third-party)
56-
set(VULKAN_HEADERS_PATH ${VULKAN_THIRD_PARTY_PATH}/Vulkan-Headers/include)
57-
set(VOLK_HEADERS_PATH ${VULKAN_THIRD_PARTY_PATH}/volk)
58-
59-
set(LIBVGF_STATIC "${LIBVGF_PATH}/build/src/libvgf.a")
60-
set(LIBVGF_INCLUDE "${LIBVGF_PATH}/include/")
61-
62-
add_library(vgf STATIC IMPORTED)
63-
set_property( TARGET vgf PROPERTY IMPORTED_LOCATION "${LIBVGF_STATIC}" )
64-
target_include_directories(vgf INTERFACE "${LIBVGF_INCLUDE}")
65-
66-
# Add backend delegate for VGF
67-
set(_vgf_backend_sources backends/arm/runtime/VGFBackend.cpp
68-
backends/arm/runtime/VGFSetup.cpp )
69-
70-
# vgf backend
71-
list(TRANSFORM _vgf_backend_sources PREPEND "${EXECUTORCH_ROOT}/")
72-
add_library(vgf_backend ${_vgf_backend_sources})
73-
target_include_directories(
74-
vgf_backend PUBLIC
75-
${_common_include_directories}
76-
${VULKAN_HEADERS_PATH}
77-
${VOLK_HEADERS_PATH}
78-
)
79-
target_compile_options(vgf_backend PRIVATE -DUSE_VULKAN_WRAPPER -DUSE_VULKAN_VOLK)
80-
81-
82-
target_link_libraries(vgf_backend PRIVATE executorch_core)
83-
target_link_libraries(vgf_backend PRIVATE vgf)
84-
executorch_target_link_options_shared_lib(vgf_backend)
85-
86-
# end config for VGF builds
52+
# include libvgf
53+
set(LIBVGF_PATH
54+
"${EXECUTORCH_ROOT}/examples/arm/ethos-u-scratch/ml-sdk-for-vulkan-manifest/sw/vgf-lib/"
55+
)
56+
57+
set(VULKAN_THIRD_PARTY_PATH ${EXECUTORCH_ROOT}/backends/vulkan/third-party)
58+
set(VULKAN_HEADERS_PATH ${VULKAN_THIRD_PARTY_PATH}/Vulkan-Headers/include)
59+
set(VOLK_HEADERS_PATH ${VULKAN_THIRD_PARTY_PATH}/volk)
60+
61+
set(LIBVGF_STATIC "${LIBVGF_PATH}/build/src/libvgf.a")
62+
set(LIBVGF_INCLUDE "${LIBVGF_PATH}/include/")
63+
64+
add_library(vgf STATIC IMPORTED)
65+
set_property(TARGET vgf PROPERTY IMPORTED_LOCATION "${LIBVGF_STATIC}")
66+
target_include_directories(vgf INTERFACE "${LIBVGF_INCLUDE}")
67+
68+
# Add backend delegate for VGF
69+
set(_vgf_backend_sources backends/arm/runtime/VGFBackend.cpp
70+
backends/arm/runtime/VGFSetup.cpp
71+
)
72+
73+
# vgf backend
74+
list(TRANSFORM _vgf_backend_sources PREPEND "${EXECUTORCH_ROOT}/")
75+
add_library(vgf_backend ${_vgf_backend_sources})
76+
target_include_directories(
77+
vgf_backend PUBLIC ${_common_include_directories} ${VULKAN_HEADERS_PATH}
78+
${VOLK_HEADERS_PATH}
79+
)
80+
target_compile_options(
81+
vgf_backend PRIVATE -DUSE_VULKAN_WRAPPER -DUSE_VULKAN_VOLK
82+
)
83+
84+
target_link_libraries(vgf_backend PRIVATE executorch_core)
85+
target_link_libraries(vgf_backend PRIVATE vgf)
86+
executorch_target_link_options_shared_lib(vgf_backend)
87+
88+
# end config for VGF builds
8789
endif()

backends/arm/TARGETS

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,19 @@ python_library(
2121
"//executorch/exir/dialects:lib",
2222
],
2323
)
24+
python_library(
25+
name = "common",
26+
srcs = [
27+
"common/__init__.py",
28+
"common/debug.py",
29+
],
30+
deps = [
31+
"fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/serializer:serializer",
32+
"fbsource//third-party/tosa_tools/v1.00/serialization_lib/python/serializer:serializer",
33+
"//caffe2:torch",
34+
"//executorch/exir:lib",
35+
],
36+
)
2437
python_library(
2538
name = "arm_partitioner",
2639
srcs = [

backends/arm/_passes/TARGETS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ python_library(
44
name = "passes",
55
srcs = glob(["*.py"]),
66
deps = [
7+
"//executorch/backends/arm:common",
78
"//executorch/backends/arm:constants",
89
"//executorch/backends/arm:tosa_quant_utils",
910
"//executorch/backends/arm:tosa_utils",

backends/arm/_passes/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,9 @@
3535
from .decompose_cosine_similarity_pass import DecomposeCosineSimilarityPass # noqa
3636
from .decompose_div_pass import DecomposeDivPass # noqa
3737
from .decompose_embedding_pass import DecomposeEmbeddingPass # noqa # noqa
38+
from .decompose_expm1_pass import DecomposeExpm1Pass # noqa
3839
from .decompose_gelu_pass import DecomposeGeluPass # noqa
40+
from .decompose_glu_pass import DecomposeGluPass # noqa
3941
from .decompose_grouped_conv import DecomposeGroupedConv # noqa
4042
from .decompose_groupnorm_pass import DecomposeGroupNormPass # noqa
4143
from .decompose_layernorm_pass import DecomposeLayerNormPass # noqa

backends/arm/_passes/arm_pass_manager.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,9 @@
4040
DecomposeCosineSimilarityPass,
4141
DecomposeDivPass,
4242
DecomposeEmbeddingPass,
43+
DecomposeExpm1Pass,
4344
DecomposeGeluPass,
45+
DecomposeGluPass,
4446
DecomposeGroupedConv,
4547
DecomposeGroupNormPass,
4648
DecomposeLayerNormPass,
@@ -163,6 +165,7 @@ def _tosa_INT_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
163165
return self._transform(exported_program.graph_module)
164166

165167
def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
168+
self.add_pass(DecomposeExpm1Pass())
166169
self.add_pass(DecomposeMaskedFill())
167170
self.add_pass(DecomposeRoundPass())
168171
self.add_pass(DecomposeAcoshPass())
@@ -184,6 +187,7 @@ def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
184187
self.add_pass(ConvertSplitToSlicePass())
185188
self.add_pass(FuseBatchnorm2DPass(exported_program))
186189
self.add_pass(ConvertMmToBmmPass())
190+
self.add_pass(DecomposeGluPass())
187191
self.add_pass(DecomposeLinearPass())
188192
self.add_pass(DecomposeLeakyReLUPass())
189193
self.add_pass(DecomposeGroupNormPass())
@@ -264,6 +268,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
264268
self.add_pass(DecomposeMeanDimPass(graph_module, self.tosa_spec))
265269
self.add_pass(DecomposeNotEqualPass())
266270
self.add_pass(DecomposeCosineSimilarityPass())
271+
self.add_pass(DecomposeGluPass())
267272
self.add_pass(DecomposeDivPass())
268273
self.add_pass(DecomposeLeakyReLUPass())
269274
self.add_pass(DecomposeLinearVectorNormPass())

0 commit comments

Comments
 (0)