
Commit d7f7d8d

Update on "[ET-VK] Adding reserve and append functions to SpecVarList"
This diff adds two new functions to the SpecVarList class in the Vulkan runtime library. The first, reserve, reserves capacity for a given number of elements in the SpecVarList before any are added. The second, append, adds a single SpecVar to the end of the SpecVarList. Together they let callers size the list once and fill it incrementally, reducing memory churn and improving performance in the Vulkan runtime.

Differential Revision: [D70021782](https://our.internmc.facebook.com/intern/diff/D70021782/)

[ghstack-poisoned]
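For illustration, here is a minimal sketch of what the two functions might look like, assuming SpecVarList stores its entries in a std::vector<SpecVar>. The member name vars_ and the placeholder SpecVar definition are assumptions for the sketch, not the actual runtime source.

// Hypothetical sketch of the new SpecVarList API; not the actual
// ExecuTorch Vulkan runtime implementation.
#include <cstddef>
#include <cstdint>
#include <vector>

// Stand-in for the runtime's SpecVar type, defined here only so the
// sketch is self-contained; the real type carries specialization-constant
// payloads for Vulkan pipelines.
struct SpecVar {
  uint32_t value;
};

class SpecVarList {
 public:
  // reserve: pre-allocate space for `capacity` entries so that later
  // append() calls do not reallocate the underlying buffer.
  void reserve(const size_t capacity) {
    vars_.reserve(capacity);
  }

  // append: add a single SpecVar to the end of the list.
  void append(const SpecVar& spec_var) {
    vars_.push_back(spec_var);
  }

 private:
  std::vector<SpecVar> vars_;  // assumed backing storage
};

A caller that knows the final count up front can reserve once and then append in a loop, avoiding the intermediate reallocations that the commit message's memory and performance claims refer to.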
2 parents 86c6dbb + a886eb2 · commit d7f7d8d

19 files changed: +1,243 −300 lines

.github/workflows/trunk.yml

Lines changed: 32 additions & 53 deletions
@@ -374,7 +374,13 @@ jobs:
     secrets: inherit
     strategy:
       matrix:
-        hf_model_repo: [google/gemma-2-2b]
+        hf_model_id: [
+          google/gemma-2-2b,
+          Qwen/Qwen2.5-0.5B,
+          HuggingFaceTB/SmolLM2-135M,
+          meta-llama/Llama-3.2-1B,
+          allenai/OLMo-1B-hf
+        ]
       fail-fast: false
     with:
       secrets-env: EXECUTORCH_HF_TOKEN
@@ -389,66 +395,39 @@ jobs:
       CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
       conda activate "${CONDA_ENV}"
       PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake
-
-      echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a"
-      rm -rf cmake-out
-      cmake \
-        -DCMAKE_INSTALL_PREFIX=cmake-out \
-        -DCMAKE_BUILD_TYPE=Release \
-        -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-        -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-        -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-        -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
-        -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-        -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-        -DEXECUTORCH_BUILD_XNNPACK=ON \
-        -DPYTHON_EXECUTABLE=python \
-        -Bcmake-out .
-      cmake --build cmake-out -j9 --target install --config Release
-
-      echo "Build llama runner"
-      dir="examples/models/llama"
-      cmake \
-        -DCMAKE_INSTALL_PREFIX=cmake-out \
-        -DCMAKE_BUILD_TYPE=Release \
-        -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
-        -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-        -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-        -DEXECUTORCH_BUILD_XNNPACK=ON \
-        -DPYTHON_EXECUTABLE=python \
-        -Bcmake-out/${dir} \
-        ${dir}
-      cmake --build cmake-out/${dir} -j9 --config Release
       echo "::endgroup::"

-      echo "::group::Set up HuggingFace Dependencies"
-      if [ -z "$SECRET_EXECUTORCH_HF_TOKEN" ]; then
-        echo "::error::SECRET_EXECUTORCH_HF_TOKEN is empty. For security reason secrets won't be accessible on forked PRs. Please make sure you submit a non-forked PR."
-        exit 1
-      fi
+      echo "::group::Set up Hugging Face"
       pip install -U "huggingface_hub[cli]"
       huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+      git clone https://github.com/huggingface/optimum-executorch
+      cd optimum-executorch
+      # There is no release yet, for CI stability, always test from the same commit on main
+      git checkout 6a7e83f3eee2976fa809335bfb78a45b1ea1cb25
+      pip install .
       pip install accelerate sentencepiece
       pip list
       echo "::endgroup::"

-      echo "::group::Export to ExecuTorch"
-      TOKENIZER_FILE=tokenizer.model
-      TOKENIZER_BIN_FILE=tokenizer.bin
-      ET_MODEL_NAME=et_model
-      DOWNLOADED_TOKENIZER_FILE_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${{ matrix.hf_model_repo }}" --files "${TOKENIZER_FILE}")
-      if [ -f "$DOWNLOADED_TOKENIZER_FILE_PATH/$TOKENIZER_FILE" ]; then
-        echo "${TOKENIZER_FILE} downloaded successfully at: $DOWNLOADED_TOKENIZER_FILE_PATH"
-        python -m extension.llm.tokenizer.tokenizer -t "$DOWNLOADED_TOKENIZER_FILE_PATH/$TOKENIZER_FILE" -o ./${TOKENIZER_BIN_FILE}
-        ls ./tokenizer.bin
-      else
-        echo "Failed to download ${TOKENIZER_FILE} from ${{ matrix.hf_model_repo }}."
-        exit 1
-      fi
-
-      python -m extension.export_util.export_hf_model -hfm=${{ matrix.hf_model_repo }} -o ${ET_MODEL_NAME}
-
-      cmake-out/examples/models/llama/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} --prompt="My name is"
+      echo "::group::Export and Run ${{ matrix.hf_model_id }}"
+      # Pass matrix variable as environment variable
+      export MODEL_ID="${{ matrix.hf_model_id }}"
+      python -c "
+      import os
+      from optimum.executorch import ExecuTorchModelForCausalLM
+      from transformers import AutoTokenizer
+
+      model_id = os.getenv('MODEL_ID')
+      print(f'Loading model: {model_id}')
+      model = ExecuTorchModelForCausalLM.from_pretrained(model_id, recipe='xnnpack')
+      tokenizer = AutoTokenizer.from_pretrained(model_id)
+      generated_text = model.text_generation(
+          tokenizer=tokenizer,
+          prompt='Simply put, the theory of relativity states that',
+          max_seq_len=64
+      )
+      print(generated_text)
+      "
       echo "::endgroup::"

backends/apple/coreml/TARGETS

Lines changed: 1 addition & 0 deletions
@@ -76,6 +76,7 @@ runtime.cxx_python_extension(
     base_module = "",
     visibility = [
         "//executorch/examples/apple/coreml/...",
+        "@EXECUTORCH_CLIENTS",
     ],
     external_deps = [
         "pybind11",

backends/arm/operator_support/TARGETS

Lines changed: 2 additions & 1 deletion
@@ -5,8 +5,9 @@ python_library(
     srcs = glob(["*.py"]),
     typing = True,
     deps = [
+        "//executorch/backends/arm/_passes:passes",
+        "//executorch/backends/arm:tosa_specification",
         "//executorch/backends/xnnpack/_passes:xnnpack_passes",
         "//executorch/exir:lib",
-        "//executorch/backends/arm:tosa_specification"
     ],
 )

backends/cadence/aot/memory_planning.py

Lines changed: 22 additions & 6 deletions
@@ -46,6 +46,7 @@ def get_aligned_offset(pre_aligned_offset: int, alignment: int) -> int:

 def collect_specs_from_graph_module(
     graph_module: torch.fx.GraphModule,
+    graph_signature: ExportGraphSignature,
     alloc_graph_input: bool,
     alloc_graph_output: bool,
 ) -> Iterable[TensorSpec]:
@@ -56,6 +57,7 @@ def collect_specs_from_graph_module(
     # Collect the specs from all the nodes in the graph module, and return it
     return collect_specs_from_nodes(
         graph_module.graph.nodes,
+        graph_signature,
         ignore_graph_input=not alloc_graph_input,
         ignore_graph_output=not alloc_graph_output,
     )
@@ -107,7 +109,7 @@ def memory_available(spec: TensorSpec) -> bool:
     # Iterate over all the specs in sorted order
     for spec in sorted(
         collect_specs_from_graph_module(
-            graph_module, alloc_graph_input, alloc_graph_output
+            graph_module, graph_signature, alloc_graph_input, alloc_graph_output
         ),
         key=lambda spec: spec.allocated_memory,
         reverse=True,
@@ -182,7 +184,7 @@ def greedy_by_size_for_offset_calculation_with_hierarchy(
     # Iterate over all the specs in sorted order
     for spec in sorted(
         collect_specs_from_graph_module(
-            graph_module, alloc_graph_input, alloc_graph_output
+            graph_module, graph_signature, alloc_graph_input, alloc_graph_output
         ),
         key=lambda spec: spec.allocated_memory,
         reverse=True,
@@ -250,6 +252,7 @@ def greedy_by_size_for_offset_calculation_with_hierarchy(

 def find_peak_memory_usages_per_memory(
     graph_module: torch.fx.GraphModule,
+    graph_signature: ExportGraphSignature,
     alloc_graph_input: bool,
     alloc_graph_output: bool,
     mem_constraints: Optional[MemConstraints] = None,
@@ -265,7 +268,7 @@ def find_peak_memory_usages_per_memory(

     # go through all nodes in the graph, collect memory usage per spec.mem_id
     for spec in collect_specs_from_graph_module(
-        graph_module, alloc_graph_input, alloc_graph_output
+        graph_module, graph_signature, alloc_graph_input, alloc_graph_output
     ):
         if mem_constraints is not None and mem_constraints.skipped_spec(spec):
             continue
@@ -288,6 +291,7 @@ def find_peak_memory_usages_per_memory(

 def find_peak_memory_usage(
     graph_module: torch.fx.GraphModule,
+    graph_signature: ExportGraphSignature,
     alloc_graph_input: bool,
     alloc_graph_output: bool,
     mem_constraints: Optional[MemConstraints] = None,
@@ -303,7 +307,7 @@ def find_peak_memory_usage(

     # Iterate over all the node specs
     for spec in collect_specs_from_graph_module(
-        graph_module, alloc_graph_input, alloc_graph_output
+        graph_module, graph_signature, alloc_graph_input, alloc_graph_output
     ):
         if spec.lifetime[0] is None or (
             mem_constraints is not None and mem_constraints.skipped_spec(spec)
@@ -358,6 +362,7 @@ def print_memory_planning_info(
     # Get the peak memory usages per memory space
     peak_memory_usages_per_memory = find_peak_memory_usages_per_memory(
         executorch_prog.exported_program().graph_module,
+        executorch_prog.exported_program().graph_signature,
         alloc_graph_input,
         alloc_graph_output,
         mem_constraints,
@@ -393,6 +398,7 @@ def print_memory_planning_info(
     # Get the total peak memory usage across all memory spaces
     total_peak_memory_usage = find_peak_memory_usage(
         executorch_prog.exported_program().graph_module,
+        executorch_prog.exported_program().graph_signature,
         alloc_graph_input,
         alloc_graph_output,
         mem_constraints,
@@ -453,7 +459,17 @@ def _init_mem_algos(self) -> None:
             greedy_by_size_for_offset_calculation_with_hierarchy,
         ]

-    def __call__(self, graph_module: torch.fx.GraphModule) -> PassResult:
+    def __call__(
+        self,
+        graph_module: torch.fx.GraphModule,
+    ) -> PassResult:
+        return self.run(graph_module)
+
+    def run(
+        self,
+        graph_module: torch.fx.GraphModule,
+        graph_signature: Optional[ExportGraphSignature] = None,
+    ) -> PassResult:
         mem_constraints = MemConstraints(
             opt_level=self.opt_level,
             alloc_graph_input=self.alloc_graph_input,
@@ -475,6 +491,6 @@ def __call__(self, graph_module: torch.fx.GraphModule) -> PassResult:
             alloc_graph_output=self.alloc_graph_output,
             alignment=self.mem_alignment,
         )
-        mem_planning(graph_module)
+        mem_planning.run(graph_module, graph_signature)

         return PassResult(graph_module, True)

backends/cadence/aot/tests/test_memory_passes.py

Lines changed: 11 additions & 12 deletions
@@ -46,14 +46,13 @@ def calculate_aligned_num_bytes(num: int, alignment: int = 16) -> int:
         inputs = (torch.ones(batch_size, input_dim),)
         model = PeakMemoryTestModel(input_dim, hidden_dim, output_dim)

-        graph_module = (
-            compiler.export_to_executorch_gen_etrecord(model, inputs)
-            .exported_program()
-            .graph_module
-        )
+        exported_program = compiler.export_to_executorch_gen_etrecord(
+            model, inputs
+        ).exported_program()

         peak_usage, _ = find_peak_memory_usage(
-            graph_module,
+            exported_program.graph_module,
+            exported_program.graph_signature,
             mem_constraints=None,
             alloc_graph_input=True,
             alloc_graph_output=True,
@@ -73,14 +72,13 @@ def calculate_aligned_num_bytes(num: int, alignment: int = 16) -> int:
             input_dim, hidden_dim, hidden_dim, hidden_dim, output_dim
         )

-        graph_module = (
-            compiler.export_to_executorch_gen_etrecord(model, inputs)
-            .exported_program()
-            .graph_module
-        )
+        exported_program = compiler.export_to_executorch_gen_etrecord(
+            model, inputs
+        ).exported_program()

         peak_usage, _ = find_peak_memory_usage(
-            graph_module,
+            exported_program.graph_module,
+            exported_program.graph_signature,
             mem_constraints=None,
             alloc_graph_input=True,
             alloc_graph_output=True,
@@ -111,6 +109,7 @@ def forward(self, x):
         graph_module.graph.eliminate_dead_code()
         peak_usage, _ = find_peak_memory_usage(
             graph_module,
+            executorch_prog.exported_program().graph_signature,
             alloc_graph_input=False,
             alloc_graph_output=False,
             mem_constraints=None,

backends/vulkan/runtime/graph/ops/impl/Convolution.cpp

Lines changed: 6 additions & 1 deletion
@@ -475,7 +475,12 @@ void add_conv1d_node(
     const ValueRef out,
     const bool clamp_out) {
   ValueRef arg_weight = prepack_standard(
-      graph, weight, graph.storage_type_of(out), utils::kChannelsPacked);
+      graph,
+      weight,
+      graph.storage_type_of(out),
+      utils::kChannelsPacked,
+      /* passthrough = */ false,
+      utils::kOptimizedAxisMap);
   ValueRef arg_bias = prepack_biases(
       graph,
       bias,

build/Utils.cmake

Lines changed: 1 addition & 1 deletion
@@ -357,7 +357,7 @@ function(add_torch_to_cmake_prefix_path)
   endif()
   execute_process(
     COMMAND "${PYTHON_EXECUTABLE}" -c
-            "import torch as _; print(_.__path__[0], end='')"
+            "import importlib.util; print(importlib.util.find_spec('torch').submodule_search_locations[0])"
     OUTPUT_VARIABLE _tmp_torch_path
     ERROR_VARIABLE _tmp_torch_path_error
     RESULT_VARIABLE _tmp_torch_path_result COMMAND_ECHO STDERR
