
Commit 9e739bf

Update base for Update on "[ET-VK] Adding a workgroup class to VecUtils"
This diff adds a new class called `WorkgroupSize` to the `VecUtils` header file. The `WorkgroupSize` class takes three `uint32_t` values as parameters and stores them in a single `uint32_t` variable using bitwise operations. This class is used in the Vulkan backend to specify the size of a workgroup for a given operation.

Differential Revision: [D70021019](https://our.internmc.facebook.com/intern/diff/D70021019/)

[ghstack-poisoned]
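For illustration, a minimal sketch of the kind of packing the commit message describes, assuming each workgroup dimension fits in 10 bits (Vulkan local workgroup sizes are small); the field widths, member names, and absence of bounds checks are assumptions made for this example, not the actual `VecUtils` implementation:

    #include <cstdint>

    // Hypothetical sketch: pack three small workgroup dimensions into a single
    // uint32_t, 10 bits per dimension (each value assumed to be < 1024).
    class WorkgroupSize {
     public:
      WorkgroupSize(uint32_t x, uint32_t y, uint32_t z)
          : packed_((x & 0x3FFu) | ((y & 0x3FFu) << 10) | ((z & 0x3FFu) << 20)) {}

      // Recover dimension 0 (x), 1 (y), or 2 (z) from the packed value.
      uint32_t operator[](const int dim) const {
        return (packed_ >> (10 * dim)) & 0x3FFu;
      }

     private:
      uint32_t packed_;
    };

Under this assumed layout, WorkgroupSize(64, 4, 1)[0] would return 64 and WorkgroupSize(64, 4, 1)[2] would return 1.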
2 parents: f87940d + abe8834 · commit 9e739bf

File tree

19 files changed, +1243 -300 lines


.github/workflows/trunk.yml

Lines changed: 32 additions & 53 deletions
@@ -374,7 +374,13 @@ jobs:
     secrets: inherit
     strategy:
       matrix:
-        hf_model_repo: [google/gemma-2-2b]
+        hf_model_id: [
+          google/gemma-2-2b,
+          Qwen/Qwen2.5-0.5B,
+          HuggingFaceTB/SmolLM2-135M,
+          meta-llama/Llama-3.2-1B,
+          allenai/OLMo-1B-hf
+        ]
       fail-fast: false
     with:
       secrets-env: EXECUTORCH_HF_TOKEN
@@ -389,66 +395,39 @@ jobs:
       CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
       conda activate "${CONDA_ENV}"
       PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake
-
-      echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a"
-      rm -rf cmake-out
-      cmake \
-        -DCMAKE_INSTALL_PREFIX=cmake-out \
-        -DCMAKE_BUILD_TYPE=Release \
-        -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-        -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-        -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-        -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
-        -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-        -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-        -DEXECUTORCH_BUILD_XNNPACK=ON \
-        -DPYTHON_EXECUTABLE=python \
-        -Bcmake-out .
-      cmake --build cmake-out -j9 --target install --config Release
-
-      echo "Build llama runner"
-      dir="examples/models/llama"
-      cmake \
-        -DCMAKE_INSTALL_PREFIX=cmake-out \
-        -DCMAKE_BUILD_TYPE=Release \
-        -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
-        -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-        -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-        -DEXECUTORCH_BUILD_XNNPACK=ON \
-        -DPYTHON_EXECUTABLE=python \
-        -Bcmake-out/${dir} \
-        ${dir}
-      cmake --build cmake-out/${dir} -j9 --config Release
       echo "::endgroup::"

-      echo "::group::Set up HuggingFace Dependencies"
-      if [ -z "$SECRET_EXECUTORCH_HF_TOKEN" ]; then
-        echo "::error::SECRET_EXECUTORCH_HF_TOKEN is empty. For security reason secrets won't be accessible on forked PRs. Please make sure you submit a non-forked PR."
-        exit 1
-      fi
+      echo "::group::Set up Hugging Face"
       pip install -U "huggingface_hub[cli]"
       huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+      git clone https://github.com/huggingface/optimum-executorch
+      cd optimum-executorch
+      # There is no release yet, for CI stability, always test from the same commit on main
+      git checkout 6a7e83f3eee2976fa809335bfb78a45b1ea1cb25
+      pip install .
       pip install accelerate sentencepiece
       pip list
       echo "::endgroup::"

-      echo "::group::Export to ExecuTorch"
-      TOKENIZER_FILE=tokenizer.model
-      TOKENIZER_BIN_FILE=tokenizer.bin
-      ET_MODEL_NAME=et_model
-      DOWNLOADED_TOKENIZER_FILE_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${{ matrix.hf_model_repo }}" --files "${TOKENIZER_FILE}")
-      if [ -f "$DOWNLOADED_TOKENIZER_FILE_PATH/$TOKENIZER_FILE" ]; then
-        echo "${TOKENIZER_FILE} downloaded successfully at: $DOWNLOADED_TOKENIZER_FILE_PATH"
-        python -m extension.llm.tokenizer.tokenizer -t "$DOWNLOADED_TOKENIZER_FILE_PATH/$TOKENIZER_FILE" -o ./${TOKENIZER_BIN_FILE}
-        ls ./tokenizer.bin
-      else
-        echo "Failed to download ${TOKENIZER_FILE} from ${{ matrix.hf_model_repo }}."
-        exit 1
-      fi
-
-      python -m extension.export_util.export_hf_model -hfm=${{ matrix.hf_model_repo }} -o ${ET_MODEL_NAME}
-
-      cmake-out/examples/models/llama/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} --prompt="My name is"
+      echo "::group::Export and Run ${{ matrix.hf_model_id }}"
+      # Pass matrix variable as environment variable
+      export MODEL_ID="${{ matrix.hf_model_id }}"
+      python -c "
+      import os
+      from optimum.executorch import ExecuTorchModelForCausalLM
+      from transformers import AutoTokenizer
+
+      model_id = os.getenv('MODEL_ID')
+      print(f'Loading model: {model_id}')
+      model = ExecuTorchModelForCausalLM.from_pretrained(model_id, recipe='xnnpack')
+      tokenizer = AutoTokenizer.from_pretrained(model_id)
+      generated_text = model.text_generation(
+          tokenizer=tokenizer,
+          prompt='Simply put, the theory of relativity states that',
+          max_seq_len=64
+      )
+      print(generated_text)
+      "
       echo "::endgroup::"


backends/apple/coreml/TARGETS

Lines changed: 1 addition & 0 deletions
@@ -76,6 +76,7 @@ runtime.cxx_python_extension(
     base_module = "",
     visibility = [
         "//executorch/examples/apple/coreml/...",
+        "@EXECUTORCH_CLIENTS",
     ],
     external_deps = [
         "pybind11",

backends/arm/operator_support/TARGETS

Lines changed: 2 additions & 1 deletion
@@ -5,8 +5,9 @@ python_library(
     srcs = glob(["*.py"]),
     typing = True,
     deps = [
+        "//executorch/backends/arm/_passes:passes",
+        "//executorch/backends/arm:tosa_specification",
         "//executorch/backends/xnnpack/_passes:xnnpack_passes",
         "//executorch/exir:lib",
-        "//executorch/backends/arm:tosa_specification"
     ],
 )

backends/cadence/aot/memory_planning.py

Lines changed: 22 additions & 6 deletions
@@ -46,6 +46,7 @@ def get_aligned_offset(pre_aligned_offset: int, alignment: int) -> int:

 def collect_specs_from_graph_module(
     graph_module: torch.fx.GraphModule,
+    graph_signature: ExportGraphSignature,
     alloc_graph_input: bool,
     alloc_graph_output: bool,
 ) -> Iterable[TensorSpec]:
@@ -56,6 +57,7 @@ def collect_specs_from_graph_module(
     # Collect the specs from all the nodes in the graph module, and return it
     return collect_specs_from_nodes(
         graph_module.graph.nodes,
+        graph_signature,
         ignore_graph_input=not alloc_graph_input,
         ignore_graph_output=not alloc_graph_output,
     )
@@ -107,7 +109,7 @@ def memory_available(spec: TensorSpec) -> bool:
     # Iterate over all the specs in sorted order
     for spec in sorted(
         collect_specs_from_graph_module(
-            graph_module, alloc_graph_input, alloc_graph_output
+            graph_module, graph_signature, alloc_graph_input, alloc_graph_output
         ),
         key=lambda spec: spec.allocated_memory,
         reverse=True,
@@ -182,7 +184,7 @@ greedy_by_size_for_offset_calculation_with_hierarchy(
     # Iterate over all the specs in sorted order
     for spec in sorted(
         collect_specs_from_graph_module(
-            graph_module, alloc_graph_input, alloc_graph_output
+            graph_module, graph_signature, alloc_graph_input, alloc_graph_output
         ),
         key=lambda spec: spec.allocated_memory,
         reverse=True,
@@ -250,6 +252,7 @@ greedy_by_size_for_offset_calculation_with_hierarchy(

 def find_peak_memory_usages_per_memory(
     graph_module: torch.fx.GraphModule,
+    graph_signature: ExportGraphSignature,
     alloc_graph_input: bool,
     alloc_graph_output: bool,
     mem_constraints: Optional[MemConstraints] = None,
@@ -265,7 +268,7 @@ find_peak_memory_usages_per_memory(

     # go through all nodes in the graph, collect memory usage per spec.mem_id
     for spec in collect_specs_from_graph_module(
-        graph_module, alloc_graph_input, alloc_graph_output
+        graph_module, graph_signature, alloc_graph_input, alloc_graph_output
     ):
         if mem_constraints is not None and mem_constraints.skipped_spec(spec):
             continue
@@ -288,6 +291,7 @@ find_peak_memory_usages_per_memory(

 def find_peak_memory_usage(
     graph_module: torch.fx.GraphModule,
+    graph_signature: ExportGraphSignature,
     alloc_graph_input: bool,
     alloc_graph_output: bool,
     mem_constraints: Optional[MemConstraints] = None,
@@ -303,7 +307,7 @@ find_peak_memory_usage(

     # Iterate over all the node specs
     for spec in collect_specs_from_graph_module(
-        graph_module, alloc_graph_input, alloc_graph_output
+        graph_module, graph_signature, alloc_graph_input, alloc_graph_output
     ):
         if spec.lifetime[0] is None or (
             mem_constraints is not None and mem_constraints.skipped_spec(spec)
@@ -358,6 +362,7 @@ print_memory_planning_info(
     # Get the peak memory usages per memory space
     peak_memory_usages_per_memory = find_peak_memory_usages_per_memory(
         executorch_prog.exported_program().graph_module,
+        executorch_prog.exported_program().graph_signature,
         alloc_graph_input,
         alloc_graph_output,
         mem_constraints,
@@ -393,6 +398,7 @@
     # Get the total peak memory usage across all memory spaces
     total_peak_memory_usage = find_peak_memory_usage(
         executorch_prog.exported_program().graph_module,
+        executorch_prog.exported_program().graph_signature,
         alloc_graph_input,
         alloc_graph_output,
         mem_constraints,
@@ -453,7 +459,17 @@ def _init_mem_algos(self) -> None:
             greedy_by_size_for_offset_calculation_with_hierarchy,
         ]

-    def __call__(self, graph_module: torch.fx.GraphModule) -> PassResult:
+    def __call__(
+        self,
+        graph_module: torch.fx.GraphModule,
+    ) -> PassResult:
+        return self.run(graph_module)
+
+    def run(
+        self,
+        graph_module: torch.fx.GraphModule,
+        graph_signature: Optional[ExportGraphSignature] = None,
+    ) -> PassResult:
         mem_constraints = MemConstraints(
             opt_level=self.opt_level,
             alloc_graph_input=self.alloc_graph_input,
@@ -475,6 +491,6 @@ def __call__(self, graph_module: torch.fx.GraphModule) -> PassResult:
             alloc_graph_output=self.alloc_graph_output,
             alignment=self.mem_alignment,
         )
-        mem_planning(graph_module)
+        mem_planning.run(graph_module, graph_signature)

         return PassResult(graph_module, True)

backends/cadence/aot/tests/test_memory_passes.py

Lines changed: 11 additions & 12 deletions
@@ -46,14 +46,13 @@ def calculate_aligned_num_bytes(num: int, alignment: int = 16) -> int:
         inputs = (torch.ones(batch_size, input_dim),)
         model = PeakMemoryTestModel(input_dim, hidden_dim, output_dim)

-        graph_module = (
-            compiler.export_to_executorch_gen_etrecord(model, inputs)
-            .exported_program()
-            .graph_module
-        )
+        exported_program = compiler.export_to_executorch_gen_etrecord(
+            model, inputs
+        ).exported_program()

         peak_usage, _ = find_peak_memory_usage(
-            graph_module,
+            exported_program.graph_module,
+            exported_program.graph_signature,
             mem_constraints=None,
             alloc_graph_input=True,
             alloc_graph_output=True,
@@ -73,14 +72,13 @@ def calculate_aligned_num_bytes(num: int, alignment: int = 16) -> int:
             input_dim, hidden_dim, hidden_dim, hidden_dim, output_dim
         )

-        graph_module = (
-            compiler.export_to_executorch_gen_etrecord(model, inputs)
-            .exported_program()
-            .graph_module
-        )
+        exported_program = compiler.export_to_executorch_gen_etrecord(
+            model, inputs
+        ).exported_program()

         peak_usage, _ = find_peak_memory_usage(
-            graph_module,
+            exported_program.graph_module,
+            exported_program.graph_signature,
             mem_constraints=None,
             alloc_graph_input=True,
             alloc_graph_output=True,
@@ -111,6 +109,7 @@ def forward(self, x):
         graph_module.graph.eliminate_dead_code()
         peak_usage, _ = find_peak_memory_usage(
             graph_module,
+            executorch_prog.exported_program().graph_signature,
             alloc_graph_input=False,
             alloc_graph_output=False,
             mem_constraints=None,

backends/vulkan/runtime/graph/ops/impl/Convolution.cpp

Lines changed: 6 additions & 1 deletion
@@ -475,7 +475,12 @@ void add_conv1d_node(
     const ValueRef out,
     const bool clamp_out) {
   ValueRef arg_weight = prepack_standard(
-      graph, weight, graph.storage_type_of(out), utils::kChannelsPacked);
+      graph,
+      weight,
+      graph.storage_type_of(out),
+      utils::kChannelsPacked,
+      /* passthrough = */ false,
+      utils::kOptimizedAxisMap);
   ValueRef arg_bias = prepack_biases(
       graph,
       bias,

build/Utils.cmake

Lines changed: 1 addition & 1 deletion
@@ -357,7 +357,7 @@ function(add_torch_to_cmake_prefix_path)
   endif()
   execute_process(
     COMMAND "${PYTHON_EXECUTABLE}" -c
-            "import torch as _; print(_.__path__[0], end='')"
+            "import importlib.util; print(importlib.util.find_spec('torch').submodule_search_locations[0])"
     OUTPUT_VARIABLE _tmp_torch_path
     ERROR_VARIABLE _tmp_torch_path_error
     RESULT_VARIABLE _tmp_torch_path_result COMMAND_ECHO STDERR
