Skip to content

Commit 5cc215e

Browse files
committed
Update
[ghstack-poisoned]
2 parents 8c36fd6 + 7b04d3b commit 5cc215e

File tree

167 files changed

+7983
-1507
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

167 files changed

+7983
-1507
lines changed

CMakeLists.txt

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -309,9 +309,15 @@ set(_common_include_directories
309309
)
310310

311311
#
312-
# The `_<target>_srcs` lists are defined by including ${EXECUTORCH_SRCS_FILE}.
312+
# The `_<target>_srcs` lists are defined by executorch_load_build_variables.
313313
#
314-
314+
if(EXECUTORCH_SRCS_FILE)
315+
message(
316+
WARNING
317+
"EXECUTORCH_SRCS_FILE is no longer necessary and will not affect the build."
318+
)
319+
endif()
320+
executorch_load_build_variables()
315321
if(NOT EXECUTORCH_SRCS_FILE)
316322
# A file wasn't provided. Run a script to extract the source lists from the
317323
# buck2 build system and write them to a file we can include.
@@ -324,10 +330,6 @@ if(NOT EXECUTORCH_SRCS_FILE)
324330
executorch_validate_build_variables()
325331
endif()
326332

327-
# This file defines the `_<target>__srcs` variables used below.
328-
message(STATUS "executorch: Using sources file ${EXECUTORCH_SRCS_FILE}")
329-
include(${EXECUTORCH_SRCS_FILE})
330-
331333
# Detect if an iOS toolchain is set.
332334
if(CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$")
333335
set(CMAKE_TOOLCHAIN_IOS ON)

backends/apple/coreml/runtime/delegate/multiarray.mm

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,10 @@ bool init_bnns_descriptor(BNNSNDArrayDescriptor& bnns_descriptor, const MultiArr
124124

125125
bool copy_using_bnns(const MultiArray& src, MultiArray& dst) {
126126
if (src.layout().dataType() != dst.layout().dataType()) {
127-
return false;
127+
// Copying from FP16 to FP32 is supported and this is a common use case
128+
if (!(src.layout().dataType() == MultiArray::DataType::Float16 && dst.layout().dataType() == MultiArray::DataType::Float32)) {
129+
return false;
130+
}
128131
}
129132
if (dst.layout().num_bytes() < src.layout().num_bytes()) {
130133
return false;

backends/arm/_passes/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
from .decompose_batch_norm_no_stats import DecomposeBatchNormNoStatsPass # noqa
3434
from .decompose_cosh_pass import DecomposeCoshPass # noqa
3535
from .decompose_cosine_similarity_pass import DecomposeCosineSimilarityPass # noqa
36+
from .decompose_cumsum_pass import DecomposeCumsumPass # noqa
3637
from .decompose_div_pass import DecomposeDivPass # noqa
3738
from .decompose_embedding_pass import DecomposeEmbeddingPass # noqa # noqa
3839
from .decompose_expm1_pass import DecomposeExpm1Pass # noqa

backends/arm/_passes/annotate_channels_last_dim_order_pass.py

Lines changed: 3 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -14,36 +14,12 @@
1414
from executorch.backends.arm.tosa_utils import is_consumer_node_depthwise_conv2d
1515
from executorch.exir.dialects._ops import ops as exir_ops
1616
from executorch.exir.pass_base import ExportPass, PassResult
17-
from torch.library import impl, Library
18-
19-
# Define lib with passthrough operators. The operators have no real meaning in edge IR
20-
# except for argument validaiton and a passthrough output. The operators will be used
21-
# when lowering to TOSA, e.g. a passthrough_to_tosa._transpose will not affect
22-
# the edge IR graph but will be lowered to a TOSA-TRANSPOSE.
23-
lib = Library("passthrough_to_tosa", "DEF")
24-
# For certain operators we need the data in a specific data format. Changing tosa_dim_order
25-
# is not sufficient as we also need transpose the data.
26-
# By utilizing an edge IR passthrough operator we can keep the edge program in
27-
# channels-first/contiguous and get the desired behavior in the TOSA lowering.
28-
lib.define("_transpose(Tensor self, int[] dim_order) -> Tensor")
29-
30-
31-
@impl(lib, "_transpose")
32-
def _transpose_impl(*args, **kwargs):
33-
# Validate length of dim_order array
34-
dim = args[1]
35-
if len(dim) != 4 and len(dim) != 5:
36-
raise ValueError(
37-
f"Dim order length must be either 4 or 5, got {len(dim)}: {dim}"
38-
)
39-
# Pass-through in edge-IR
40-
return args[0]
4117

4218

4319
class AnnotateChannelsLastDimOrder(ExportPass):
4420
"""
4521
Annotates each node with a tosa_dim_order. tosa_dim_order can be seen as a channels-last dim-order
46-
that in most cases will be (0, 2, 3, 1) for nodes with 4D-shapes. The pass also inserts passthrough_to_tosa._transpose
22+
that in most cases will be (0, 2, 3, 1) for nodes with 4D-shapes. The pass also inserts backend.tosa.TRANSPOSE
4723
when a transition between 3D and 4D/5D tensors happens.
4824
The annotated tosa_dim_order is used to permute the node's shape such that it gives a TOSA-compliant shape.
4925
"""
@@ -119,7 +95,7 @@ def insert_input_transpose(node, input_node, graph_module):
11995
with graph_module.graph.inserting_before(node):
12096
permute_node = create_node(
12197
graph_module.graph,
122-
torch.ops.passthrough_to_tosa._transpose.default,
98+
exir_ops.backend.tosa.TRANSPOSE.default,
12399
args=(
124100
input_node,
125101
list(
@@ -141,7 +117,7 @@ def insert_output_transpose(node, graph_module):
141117
with graph_module.graph.inserting_after(node):
142118
permute_node = create_node(
143119
graph_module.graph,
144-
torch.ops.passthrough_to_tosa._transpose.default,
120+
exir_ops.backend.tosa.TRANSPOSE.default,
145121
args=(
146122
node,
147123
list(

backends/arm/_passes/arm_pass_manager.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
DecomposeBatchNormNoStatsPass,
3939
DecomposeCoshPass,
4040
DecomposeCosineSimilarityPass,
41+
DecomposeCumsumPass,
4142
DecomposeDivPass,
4243
DecomposeEmbeddingPass,
4344
DecomposeExpm1Pass,
@@ -148,6 +149,7 @@ def _tosa_INT_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
148149
self.add_pass(UnsqueezeBeforeRepeatPass())
149150
self.add_pass(CastInt64BuffersToInt32Pass(exported_program))
150151
self.add_pass(DecomposeSumPass())
152+
self.add_pass(DecomposeCumsumPass(exported_program))
151153
self.add_pass(Conv1dUnsqueezePass())
152154
self.add_pass(DecomposeMaxPool2DPass())
153155
self.add_pass(SizeAdjustInputPass())
@@ -227,6 +229,7 @@ def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
227229
self.add_pass(UnsqueezeBeforeRepeatPass())
228230
self.add_pass(CastInt64BuffersToInt32Pass(exported_program))
229231
self.add_pass(DecomposeSumPass())
232+
self.add_pass(DecomposeCumsumPass(exported_program))
230233
self.add_pass(Conv1dUnsqueezePass())
231234
self.add_pass(DecomposeMaxPool2DPass())
232235
self.add_pass(SizeAdjustInputPass())
Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
# Copyright 2025 Arm Limited and/or its affiliates.
2+
#
3+
# This source code is licensed under the BSD-style license found in the
4+
# LICENSE file in the root directory of this source tree.
5+
6+
from math import prod
7+
8+
import torch
9+
from executorch.backends.arm._passes import ArmPass
10+
from executorch.backends.arm._passes.arm_pass_utils import create_node
11+
from executorch.backends.arm._passes.quant_args import QuantArgs
12+
13+
from executorch.backends.transforms.utils import create_constant_placeholder
14+
from executorch.exir.dialects._ops import ops as exir_ops
15+
from executorch.exir.pass_base import PassResult
16+
from torch.export.graph_signature import InputKind
17+
18+
19+
class DecomposeCumsumPass(ArmPass):
20+
"""
21+
Decomposes cumsum into a 1D convolution with a kernel of ones.
22+
23+
For example, the cumsum of an input tensor [1, 1] is [1, 1 + 1] = [1, 2].
24+
To decompose this, the input tensor is pre-padded with len(input)-1 zeros and
25+
a kernel [1, 1], of length len(input), is slid over it:
26+
27+
Input: [0, 1, 1]
28+
Kernel: [1, 1] = [1]
29+
[1, 1] = [2]
30+
31+
Since pytorch only supports symmetric padding, in reality the result will have
32+
an additional 1 calculated at the end, which leads to a required extra slice op.
33+
34+
To extend this to higher dimensions, the input is reshaped to [N, C, H, W] with
35+
N = <dims before cumsum dim>
36+
C = 1
37+
H = <cumsum dim>
38+
W = <dims after cumsum dim>
39+
And the convolution is applied over dimension H.
40+
"""
41+
42+
def call(self, graph_module):
43+
graph = graph_module.graph
44+
targets = (exir_ops.edge.aten.cumsum.default, torch.ops.aten.cumsum.default)
45+
modified = False
46+
for node in list(graph.nodes):
47+
if node.op != "call_function" or node.target not in targets:
48+
continue
49+
50+
if len(node.args) != 2:
51+
raise ValueError(
52+
"Cumsum node should have exactly two arguments: input and dim."
53+
)
54+
55+
# Get node data
56+
input_node, dim = node.args
57+
val = node.meta.get("val")
58+
original_shape = list(val.shape)
59+
dtype = input_node.meta.get("val").dtype
60+
dim = dim % len(original_shape)
61+
62+
# Compute shapes
63+
pre_cumsum_dim = prod(original_shape[:dim]) if dim > 0 else 1
64+
cumsum_dim = original_shape[dim]
65+
post_cumsum_dim = (
66+
prod(original_shape[dim + 1 :]) if dim < len(original_shape) - 1 else 1
67+
)
68+
conv_shape = [
69+
pre_cumsum_dim,
70+
1,
71+
cumsum_dim,
72+
post_cumsum_dim,
73+
]
74+
pad_shape = [original_shape[dim] - 1, 0]
75+
weight_shape = [1, 1, original_shape[dim], 1]
76+
77+
# Create convolution weight
78+
with graph.inserting_before(list(graph.nodes)[0]):
79+
weight_data = torch.ones(size=weight_shape, dtype=dtype)
80+
weight_node = create_constant_placeholder(
81+
self.exported_program,
82+
graph,
83+
node.name + "_kernel",
84+
InputKind.PARAMETER,
85+
weight_data,
86+
)
87+
88+
# Create decomposed nodes
89+
view_op = exir_ops.edge.aten.view_copy.default
90+
conv_op = exir_ops.edge.aten.convolution.default
91+
slice_op = exir_ops.edge.aten.slice_copy.Tensor
92+
with graph.inserting_before(node):
93+
# Reshape to 4D with shape [N, 1, H, W] (conv_shape)
94+
view_args = (input_node, conv_shape)
95+
view_node = create_node(graph, view_op, args=view_args, from_node=node)
96+
97+
conv_args = (
98+
view_node,
99+
weight_node,
100+
None,
101+
[1, 1],
102+
pad_shape,
103+
[1, 1],
104+
False,
105+
[0],
106+
1,
107+
)
108+
conv_node = create_node(graph, conv_op, args=conv_args, from_node=node)
109+
110+
# The convolution is inserted after quantization, so we need to set our
111+
# own quantization parameters for the weights here. However since the
112+
# data is ones directly created as int8, they already have correct scale
113+
# and so no scaling needs to be done, i.e. set scale=1.0, zero_point=0.0
114+
if (
115+
"input_qparams" in conv_node.meta
116+
and len(conv_node.meta["input_qparams"]) > 0
117+
):
118+
qparams = QuantArgs(1.0, 0.0, -128, 127, torch.int8)
119+
conv_node.meta["input_qparams"][1] = qparams
120+
121+
slice_args = (conv_node, 2, 0, original_shape[dim])
122+
slice_node = create_node(
123+
graph, slice_op, args=slice_args, from_node=node
124+
)
125+
126+
view_original_args = (slice_node, original_shape)
127+
view_original_node = create_node(
128+
graph, view_op, args=view_original_args, from_node=node
129+
)
130+
131+
# Replace and remove original
132+
node.replace_all_uses_with(view_original_node)
133+
graph.erase_node(node)
134+
modified = True
135+
136+
if modified:
137+
# Cleanup
138+
graph.eliminate_dead_code()
139+
graph_module.recompile()
140+
# Apply any operator-level transforms
141+
graph_module = super().call(graph_module).graph_module
142+
return PassResult(graph_module, modified)

backends/arm/_passes/fuse_constant_ops_pass.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ def call(self, graph_module):
107107
for node in graph_module.graph.nodes:
108108
if node.op != "call_function":
109109
continue
110-
if node.target == torch.ops.tosa._table.default:
110+
if node.target == exir_ops.backend.tosa.TABLE.default:
111111
continue
112112

113113
input_nodes = node.all_input_nodes

backends/arm/_passes/insert_rescales_pass.py

Lines changed: 5 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -3,70 +3,25 @@
33
# This source code is licensed under the BSD-style license found in the
44
# LICENSE file in the root directory of this source tree.
55

6-
import logging
76
from copy import copy
87
from typing import cast
98

10-
import torch
119
from executorch.backends.arm._passes.arm_pass_utils import create_node
1210
from executorch.backends.arm._passes.quant_args import QuantArgs
1311
from executorch.backends.arm.constants import DQ_OPS, Q_OPS
12+
from executorch.exir.dialects._ops import ops as exir_ops
1413
from executorch.exir.pass_base import ExportPass, PassResult
15-
from torch import Tensor
1614
from torch.fx import GraphModule, Node
17-
from torch.library import custom_op, register_fake
18-
19-
logger = logging.getLogger(__name__)
20-
21-
22-
@custom_op("tosa::_rescale", mutates_args=()) # type: ignore[misc]
23-
def rescale(
24-
x: Tensor, dtype: torch.dtype, scale: float, in_zp: int, out_zp: int
25-
) -> Tensor:
26-
logger.warning(
27-
"Ran default implementation of tosa::_rescale."
28-
"This op is meant to always be inserted inside a partition and a correct default implementation is not implemented."
29-
)
30-
# Clone is needed to not return reference when rescaling to same dtype.
31-
# This is a neccessary requirement for non-mutating custom ops.
32-
return x.to(dtype=dtype).clone()
33-
34-
35-
@register_fake("tosa::_rescale") # type: ignore[misc]
36-
def rescale_fake(
37-
x: Tensor, dtype: torch.dtype, scale: float, in_zp: int, out_zp: int
38-
) -> Tensor:
39-
"""Casts the input tensor to dtype `dtype` to produce the correct tensor meta for a _rescale op.
40-
Additionally validates TOSA constraints of a RESCALE op.
41-
"""
42-
if dtype not in (torch.int32, torch.int8, torch.int16):
43-
raise NotImplementedError(
44-
f"tosa::rescale currently only supports int32, int16 and int8, not {dtype}"
45-
)
46-
if dtype in (torch.int32, torch.int16) and out_zp != 0:
47-
raise ValueError(
48-
f"TOSA requires output_zp to be zero when the output dtype is {dtype}."
49-
)
50-
if x.dtype in (torch.int32, torch.int16) and in_zp != 0:
51-
raise ValueError(
52-
f"TOSA requires input_zp to be zero when the input dtype is {dtype}"
53-
)
54-
if x.dtype == torch.int8 and not -128 <= in_zp <= 127:
55-
raise ValueError(f"{in_zp=} outside valid range (-128,127) for int8.")
56-
if dtype == torch.int8 and not -128 <= out_zp <= 127:
57-
raise ValueError(f"{out_zp=} outside valid range (-128,127) for int8.")
58-
59-
return x.to(dtype=dtype).clone()
6015

6116

6217
class InsertRescalePass(ExportPass):
6318
"""Finds patterns of dq -> q, and replaces them
64-
with passthrough_to_tosa::rescales.
19+
with backend dialect tosa::RESCALE op.
6520
66-
Does not garantuee that the dtypes and zero points are valid
21+
Does not guarantee that the dtypes and zero points are valid
6722
in TOSA, that is the job of the quantization annotator that
6823
produced the dq and q nodes. The TOSA constraints are validated
69-
in the fake implementation of passthrough_to_tosa:rescale.
24+
in the fake implementation of tosa::RESCALE.
7025
"""
7126

7227
def fold_dq_q_to_rescale(self, node: Node, user: Node, graph_module: GraphModule):
@@ -77,7 +32,7 @@ def fold_dq_q_to_rescale(self, node: Node, user: Node, graph_module: GraphModule
7732
with graph_module.graph.inserting_before(node):
7833
rescale_node = create_node(
7934
graph_module.graph,
80-
torch.ops.tosa._rescale.default,
35+
exir_ops.backend.tosa.RESCALE.default,
8136
(
8237
node.all_input_nodes[0],
8338
q_args.dtype,

0 commit comments

Comments
 (0)