
Commit 88a34db

Update on "[ET-VK] 7/n Split dispatches between multiple command buffers. Split execute dispatch into multiple commands based on dispatch count."
Differential Revision: [D78360039](https://our.internmc.facebook.com/intern/diff/D78360039/) [ghstack-poisoned]
2 parents 0c2f76b + bdc7dc2 commit 88a34db

55 files changed: +2183, -1154 lines

.ci/scripts/test_llama_lora.sh

Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -exu
+# shellcheck source=/dev/null
+source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
+
+cmake_install_executorch_libraries() {
+  echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a"
+  rm -rf cmake-out
+  retry cmake --preset llm \
+    -DCMAKE_INSTALL_PREFIX=cmake-out \
+    -DCMAKE_BUILD_TYPE=Release
+  cmake --build cmake-out -j9 --target install --config Release
+}
+
+cmake_build_llama_runner() {
+  echo "Building llama runner"
+  pushd extension/llm/tokenizers
+  echo "Updating tokenizers submodule"
+  git submodule update --init
+  popd
+  dir="examples/models/llama"
+  retry cmake \
+    -DBUILD_TESTING=OFF \
+    -DCMAKE_INSTALL_PREFIX=cmake-out \
+    -DCMAKE_BUILD_TYPE=Release \
+    -Bcmake-out/${dir} \
+    ${dir}
+  cmake --build cmake-out/${dir} -j9 --config Release
+}
+
+cleanup_files() {
+  echo "Deleting downloaded and generated files"
+  rm -rf "${DOWNLOADED_PATH}/"
+  rm result.txt
+}
+
+# Download model artifacts from HF Hub.
+# Hosting in personal repo for now.
+HF_MODEL_REPO="lucylq/llama3_1B_lora"
+DOWNLOADED_PATH=$(
+  bash "$(dirname "${BASH_SOURCE[0]}")/download_hf_hub.sh" \
+    --model_id "${HF_MODEL_REPO}" \
+    --files "adapter_config.json" "adapter_model.pt" "consolidated.00.pth" "params.json" "tokenizer.model"
+)
+EXPORTED_MODEL_NAME="llama_3_2_1B_lora.pte"
+# Export model.
+$PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
+  base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+  base.params="${DOWNLOADED_PATH}/params.json" \
+  base.adapter_checkpoint="${DOWNLOADED_PATH}/adapter_model.pt" \
+  base.adapter_config="${DOWNLOADED_PATH}/adapter_config.json" \
+  base.tokenizer_path="${DOWNLOADED_PATH}/tokenizer.model" \
+  model.use_kv_cache=true \
+  model.use_sdpa_with_kv_cache=true \
+  model.dtype_override="fp32" \
+  backend.xnnpack.enabled=true \
+  backend.xnnpack.extended_ops=true \
+  export.output_name="${EXPORTED_MODEL_NAME}"
+
+# Build llama runner.
+cmake_install_executorch_libraries
+cmake_build_llama_runner
+
+PROMPT="What happens if you eat watermelon seeds?"
+# Run llama runner
+RUNTIME_ARGS="--model_path=${EXPORTED_MODEL_NAME} --tokenizer_path=${DOWNLOADED_PATH}/tokenizer.model --temperature=0 --seq_len=20 --warmup=1"
+
+NOW=$(date +"%H:%M:%S")
+echo "Starting to run llama runner at ${NOW}"
+# shellcheck source=/dev/null
+cmake-out/examples/models/llama/llama_main --prompt="${PROMPT}" ${RUNTIME_ARGS} > result.txt
+NOW=$(date +"%H:%M:%S")
+echo "Finished at ${NOW}"
+
+RESULT=$(cat result.txt)
+EXPECTED_PREFIX="What happens if you eat watermelon seeds? Watermelon seeds are a good source of vitamin C,"
+
+if [[ "${RESULT}" == "${EXPECTED_PREFIX}"* ]]; then
+  echo "Expected result prefix: ${EXPECTED_PREFIX}"
+  echo "Actual result: ${RESULT}"
+  echo "Success"
+  cleanup_files
+else
+  echo "Expected result prefix: ${EXPECTED_PREFIX}"
+  echo "Actual result: ${RESULT}"
+  echo "Failure; results not the same"
+
+  cleanup_files
+  exit 1
+fi
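
For local reproduction, a minimal sketch of how this script is driven, mirroring the CI job added to .github/workflows/pull.yml below; the requirement install and the torchtune pin are taken from that job, and an already set up ExecuTorch checkout is assumed.

# Sketch: run the LoRA export + runner smoke test from the repo root.
bash examples/models/llama/install_requirements.sh
python -m pip install torchtune==0.7.0.dev20250730 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama_lora.sh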

.github/workflows/pull.yml

Lines changed: 30 additions & 0 deletions
@@ -687,6 +687,36 @@ jobs:
         # run llama runner in eager mode
         PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama_runner_eager.sh
 
+  test-llama-lora-linux:
+    name: test-llama-lora-linux
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+    with:
+      runner: linux.24xlarge
+      docker-image: ci-image:executorch-ubuntu-22.04-clang12
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake"
+
+        # Install llama requirements
+        bash examples/models/llama/install_requirements.sh
+
+        # install a recent version of torchtune.
+        PYTHON_EXECUTABLE=python python -m pip install torchtune==0.7.0.dev20250730 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+
+        # run the llama LoRA export and runner test
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama_lora.sh
+
   test-mediatek-models-linux:
     name: test-mediatek-models-linux
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -849,7 +849,7 @@ if(NOT EXECUTORCH_SELECT_OPS_YAML STREQUAL ""
     LIB_NAME
     "executorch_selected_kernels"
     OPS_SCHEMA_YAML
-    "${EXECUTORCH_SELECT_OPS_LIB}"
+    "${EXECUTORCH_SELECT_OPS_YAML}"
     ROOT_OPS
     "${EXECUTORCH_SELECT_OPS_LIST}"
     INCLUDE_ALL_OPS
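
The fix passes the variable that the surrounding if() actually checks. As a hedged sketch of how this path is exercised, assuming EXECUTORCH_SELECT_OPS_YAML holds the path to an ops schema YAML (the OPS_SCHEMA_YAML argument and the STREQUAL "" guard suggest this); the YAML path below is illustrative only.

# Hypothetical selective-build configure with a custom ops schema YAML.
cmake -S . -B cmake-out \
  -DEXECUTORCH_SELECT_OPS_YAML=/path/to/custom_ops.yaml \
  -DCMAKE_BUILD_TYPE=Release
cmake --build cmake-out -j9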

backends/apple/coreml/partition/coreml_partitioner.py

Lines changed: 80 additions & 27 deletions
@@ -20,6 +20,7 @@
     PartitionResult,
 )
 from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer
+from executorch.exir.dialects._ops import ops as exir_ops
 from torch.export.exported_program import ExportedProgram
 from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
 from torch.fx.passes.operator_support import OperatorSupportBase
@@ -56,6 +57,80 @@ def log_once(self, msg: str) -> None:
         logger.info(msg)
         self._logged_msgs.add(msg)
 
+    def should_skip_op_for_delegation(self, node_target_name: str) -> bool:
+        skipped_ops = self.skip_ops_for_coreml_delegation or []
+        if node_target_name in skipped_ops:
+            assert (
+                not self.lower_full_graph
+            ), f"Cannot skip {node_target_name} because lower_full_graph is True. Please set skip_ops_for_coreml_delegation=None or lower_full_graph=False in the CoreMLPartitioner"
+            self.log_once(
+                "Skipping op for CoreML delegation because it is in skip_ops_for_coreml_delegation: "
+                + node_target_name
+            )
+            return True
+        return False
+
+    def should_override_support(self, node) -> bool:
+        # https://github.com/apple/coremltools/issues/2573
+        if (
+            node.target
+            in [
+                torch.ops.aten.sub.Tensor,
+                exir_ops.edge.aten.sub.Tensor,
+                torch.ops.aten.add.Tensor,
+                exir_ops.edge.aten.add.Tensor,
+            ]
+            and "alpha" in node.kwargs
+            and node.kwargs["alpha"] != 1
+        ):
+            self.log_once(
+                "torch.ops.aten.{sub, add}.Tensor with alpha != 1 is not supported by CoreML. Overriding support."
+            )
+            return True
+
+        # https://github.com/apple/coremltools/issues/2565
+        if node.target in [
+            torch.ops.aten.diagonal.default,
+            torch.ops.aten.diagonal_copy.default,
+            exir_ops.edge.aten.diagonal.default,
+            exir_ops.edge.aten.diagonal_copy.default,
+        ]:
+            self.log_once(
+                "torch.ops.aten.diagonal.default has a bug in CoreML. Overriding op support."
+            )
+            return True
+
+        # https://github.com/apple/coremltools/issues/2569
+        if node.target in [
+            torch.ops.aten.acosh.default,
+            exir_ops.edge.aten.acosh.default,
+            torch.ops.aten.asinh.default,
+            exir_ops.edge.aten.asinh.default,
+        ]:
+            self.log_once(
+                "torch.ops.aten.{acosh, asinh}.default is not supported by CoreML. Overriding op support."
+            )
+            return True
+
+        # TODO: enable this after bugs in ExecuTorch's partitioner are fixed
+        # # If lower_full_graph=False, do not partition nodes with symbolic args because it can result in symbolic args
+        # # in the placeholders due to partitioning, which CoreML does not support
+        # if not self.lower_full_graph and any(
+        #     isinstance(arg, torch.fx.Node)
+        #     and isinstance(
+        #         arg.meta.get("val", None),
+        #         (torch.SymInt, torch.SymBool, torch.SymFloat),
+        #     )
+        #     for arg in node.args
+        # ):
+        #     self.log_once(
+        #         "Skipping op for CoreML delegation because it contains symbolic args: "
+        #         + node_target_name
+        #     )
+        #     return True
+
+        return False
+
     def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
         # get_attr node can always be supported on any backend
         if node.op == "get_attr":
@@ -64,38 +139,17 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
         elif node.op == "call_function":
             # skip ops if specified by user
             node_target_name = getattr(node.target, "__name__", "").lower()
-            if node_target_name in (self.skip_ops_for_coreml_delegation or []):
-                self.log_once(
-                    "Skipping op for CoreML delegation because it is in skip_ops_for_coreml_delegation: "
-                    + node_target_name
-                )
-                assert (
-                    not self.lower_full_graph
-                ), "Cannot have skip_ops_for_coreml_delegation when lower_full_graph is True"
-                return False
 
-            # TODO: enable this after bugs in ExecuTorch's partitioner are fixed
-            # # If lower_full_graph=False, do not partition nodes with symbolic args because it can result in symbolic args
-            # # in the placeholders due to partitioning, which CoreML does not support
-            # if not self.lower_full_graph and any(
-            #     isinstance(arg, torch.fx.Node)
-            #     and isinstance(
-            #         arg.meta.get("val", None),
-            #         (torch.SymInt, torch.SymBool, torch.SymFloat),
-            #     )
-            #     for arg in node.args
-            # ):
-            #     self.log_once(
-            #         "Skipping op for CoreML delegation because it contains symbolic args: "
-            #         + node_target_name
-            #     )
-            #     assert not self.lower_full_graph
-            #     return False
+            if self.should_skip_op_for_delegation(node_target_name):
+                return False
 
             # query coremltools to see if node is supported
             is_supported = ct.converters.mil.frontend.torch.is_torch_fx_node_supported(
                 node
            )
+            if self.should_override_support(node):
+                is_supported = False
+
             if not is_supported:
                 if self.lower_full_graph:
                     raise NotImplementedError(
@@ -126,7 +180,6 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
 
 
 class CoreMLPartitioner(Partitioner):
-
     def __init__(
         self,
         *,
backends/apple/coreml/runtime/delegate/multiarray.mm

Lines changed: 3 additions & 0 deletions
@@ -123,6 +123,9 @@ bool init_bnns_descriptor(BNNSNDArrayDescriptor& bnns_descriptor, const MultiArr
 }
 
 bool copy_using_bnns(const MultiArray& src, MultiArray& dst) {
+    if (src.layout().dataType() != dst.layout().dataType()) {
+        return false;
+    }
     if (dst.layout().num_bytes() < src.layout().num_bytes()) {
         return false;
     }

backends/arm/test/ops/test_asinh.py

Lines changed: 12 additions & 12 deletions
@@ -9,10 +9,10 @@
 
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.test_pipeline import (
-    EthosU55PipelineBI,
-    EthosU85PipelineBI,
-    TosaPipelineBI,
-    TosaPipelineMI,
+    EthosU55PipelineINT,
+    EthosU85PipelineINT,
+    TosaPipelineFP,
+    TosaPipelineINT,
 )
 
 input_t = Tuple[torch.Tensor]  # Input x
@@ -36,8 +36,8 @@ def forward(self, x):
 
 
 @common.parametrize("test_data", test_data_suite)
-def test_asin_tosa_MI(test_data: Tuple):
-    pipeline = TosaPipelineMI[input_t](
+def test_asinh_tosa_FP(test_data: Tuple):
+    pipeline = TosaPipelineFP[input_t](
         Asinh(),
         (test_data(),),
         aten_op,
@@ -47,8 +47,8 @@ def test_asin_tosa_MI(test_data: Tuple):
 
 
 @common.parametrize("test_data", test_data_suite)
-def test_asin_tosa_BI(test_data: Tuple):
-    pipeline = TosaPipelineBI[input_t](
+def test_asinh_tosa_INT(test_data: Tuple):
+    pipeline = TosaPipelineINT[input_t](
         Asinh(),
         (test_data(),),
         aten_op=[],
@@ -59,8 +59,8 @@ def test_asin_tosa_BI(test_data: Tuple):
 
 @common.parametrize("test_data", test_data_suite)
 @common.XfailIfNoCorstone300
-def test_asin_u55_BI(test_data: Tuple):
-    pipeline = EthosU55PipelineBI[input_t](
+def test_asinh_u55_INT(test_data: Tuple):
+    pipeline = EthosU55PipelineINT[input_t](
         Asinh(),
         (test_data(),),
         aten_ops=[],
@@ -70,8 +70,8 @@ def test_asin_u55_BI(test_data: Tuple):
 
 @common.parametrize("test_data", test_data_suite)
 @common.XfailIfNoCorstone320
-def test_asin_u85_BI(test_data: Tuple):
-    pipeline = EthosU85PipelineBI[input_t](
+def test_asinh_u85_INT(test_data: Tuple):
+    pipeline = EthosU85PipelineINT[input_t](
         Asinh(),
         (test_data(),),
         aten_ops=[],
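
The tests are renamed from asin to asinh and migrated to the FP/INT pipeline naming. A hedged sketch of running just the renamed TOSA variants; the Arm test suite may require additional environment setup (for example Corstone FVPs for the U55/U85 variants) that is not shown here.

# Hypothetical minimal invocation of the renamed asinh TOSA tests.
python -m pytest backends/arm/test/ops/test_asinh.py -k "test_asinh_tosa_FP or test_asinh_tosa_INT" -v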
