Skip to content

Commit c30cb75

Browse files
authored
Merge branch 'pytorch:main' into Arm-backend-Updated-toolchain-to-arm-gnu-toolchain-13.3.rel1
2 parents 03c39c3 + a64ed1b commit c30cb75

File tree

13 files changed

+99
-126
lines changed

13 files changed

+99
-126
lines changed

backends/arm/_passes/cast_int64_pass.py

Lines changed: 32 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,15 @@
55

66
# pyre-unsafe
77

8+
import logging
9+
810
import torch
11+
from executorch.backends.arm._passes.arm_pass_utils import is_param_node
912
from executorch.exir.pass_base import ExportPass, PassResult
13+
from torch._export.utils import is_buffer
14+
15+
logger = logging.getLogger(__name__)
16+
logger.setLevel(logging.WARNING)
1017

1118

1219
class CastInt64ToInt32Pass(ExportPass):
@@ -18,17 +25,31 @@ def _to_int32(self, graph_module: torch.fx.GraphModule):
1825
for node in graph_module.graph.nodes:
1926
fake_tensor = node.meta["val"]
2027
if isinstance(fake_tensor, torch._subclasses.fake_tensor.FakeTensor):
21-
if node.meta["val"].dtype == torch.int64:
22-
node.meta["val"] = node.meta["val"].to(torch.int32)
23-
buffer_name = (
24-
self.exported_program.graph_signature.inputs_to_buffers[
25-
node.name
26-
]
27-
)
28-
new_tensor = self.exported_program.state_dict[buffer_name].to(
29-
torch.int32
30-
)
31-
self.exported_program.state_dict[buffer_name] = new_tensor
28+
if node.meta["val"].dtype == torch.int64 and is_param_node(
29+
self.exported_program, node
30+
):
31+
if is_buffer(self.exported_program, node):
32+
node.meta["val"] = node.meta["val"].to(torch.int32)
33+
buffer_name = (
34+
self.exported_program.graph_signature.inputs_to_buffers[
35+
node.name
36+
]
37+
)
38+
buffer = self.exported_program.state_dict[node.name]
39+
logger.warning(
40+
f"Casting buffer {node.name} from torch.int64 to torch.int32"
41+
f" defined in {node.meta['stack_trace']}"
42+
)
43+
if torch.min(buffer) < torch.iinfo(torch.int32).min:
44+
raise RuntimeError(
45+
f"Buffer {node.name} has value < {torch.iinfo(torch.int32).min}"
46+
)
47+
if torch.max(buffer) > torch.iinfo(torch.int32).max:
48+
raise RuntimeError(
49+
f"Buffer {node.name} has value > {torch.iinfo(torch.int32).max}"
50+
)
51+
buffer_int32 = buffer.to(torch.int32)
52+
self.exported_program.state_dict[buffer_name] = buffer_int32
3253

3354
def call(self, graph_module: torch.fx.GraphModule):
3455
self._to_int32(graph_module)

backends/arm/_passes/scalars_to_attribute_pass.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,11 @@ def call(self, graph_module: GraphModule) -> PassResult:
5151
if isinstance(arg, Node):
5252
new_args.append(arg)
5353
continue
54+
if isinstance(arg, int) and not torch.is_floating_point(
55+
get_first_fake_tensor(n)
56+
):
57+
new_args.append(arg)
58+
continue
5459

5560
prefix = "_tensor_constant_"
5661
get_new_attr_name = get_new_attr_name_with_prefix(prefix)

backends/arm/test/ops/test_scalars.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,12 @@ def forward(self, x):
7575
x = 1.0 + x
7676
return x
7777

78+
class ShiftInplaceSub(torch.nn.Module):
79+
def forward(self, x):
80+
x = x >> 4
81+
x -= 10
82+
return x
83+
7884
# Inplace ops end with '_' (from aten naming)
7985
ops = [
8086
("Add", Add()),
@@ -160,3 +166,6 @@ def test_MI_const(self, test_name: str, op: torch.nn.Module, x):
160166
@parameterized.expand(tensor_scalar_tests)
161167
def test_BI(self, test_name: str, op: torch.nn.Module, x, y):
162168
self._test_add_tosa_BI_pipeline(op, (x, y))
169+
170+
def test_shift_sub_inplace_tosa_MI(self):
171+
self._test_add_tosa_MI_pipeline(self.ShiftInplaceSub(), (torch.IntTensor(5),))

backends/arm/test/runner_utils.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -266,8 +266,6 @@ def run_corstone(
266266
"-C",
267267
"mps3_board.uart0.out_file='-'",
268268
"-C",
269-
"cpu0.CFGITCMSZ=11",
270-
"-C",
271269
"cpu0.semihosting-enable=1",
272270
"-C",
273271
"cpu0.semihosting-stack_base=0",

backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -101,23 +101,25 @@ void main() {
101101
// "k" tracks the kernel's index for our input-kernel computation.
102102
// It reads out-of-bound zeros, but trying to avoid them complicates
103103
// for-loop conditions, which results in worse performance.
104-
for (int k = 0; k < kernel_size; k += 4) {
105-
// Since the weight tensor is width-packed, which is along the length
106-
// dimension, we can batch-read four elements at a time.
107-
const ivec3 w_lpos = ivec3(k / 4, in_c % in_group_size, out_c);
108-
const VEC4_T weight = load_texel_lpos(kernel_in, w_lpos, kernel_axis_map);
109104

110-
ivec3 in_pos = lpos_to_pos(ivec3(in_l + k * dilation, in_c, n / 4), in_axis_map);
111-
sum = fma(weight.xxxx, load_texel(t_in, in_pos), sum);
112-
113-
in_pos[in_axis_map.x] += dilation;
114-
sum = fma(weight.yyyy, load_texel(t_in, in_pos), sum);
105+
// The weight tensor is channel-packed. It may not be a trivial choice for
106+
// performance reasons, since it requires more data fetches. The reason is
107+
// that, for some sequence models, we found that the weight tensor
108+
// (out_channel, in_channel / group, kernel) often has a large
109+
// out_channel >> kernel, leading to non-optimal use of memory as the
110+
// weight tensor gets very deep. As a mitigation, we use channel-packing
111+
// for the weight tensor, yielding a 75% reduction in weight-tensor
112+
// memory.
113+
114+
// It is possible to further reduce the memory footprint by swapping the
115+
// dimensions, using x extent for out_channel, and y for kernel.
116+
for (int k = 0; k < kernel_size; k += 1) {
117+
const ivec3 w_lpos = ivec3(k, in_c % in_group_size, out_c / 4);
118+
const VEC4_T weight_texel = load_texel_lpos(kernel_in, w_lpos, kernel_axis_map);
119+
VEC4_T weight = VEC4_T(weight_texel[out_c % 4]);
115120

116-
in_pos[in_axis_map.x] += dilation;
117-
sum = fma(weight.zzzz, load_texel(t_in, in_pos), sum);
118-
119-
in_pos[in_axis_map.x] += dilation;
120-
sum = fma(weight.wwww, load_texel(t_in, in_pos), sum);
121+
ivec3 in_pos = lpos_to_pos(ivec3(in_l + k * dilation, in_c, n / 4), in_axis_map);
122+
sum = fma(weight, load_texel(t_in, in_pos), sum);
121123
}
122124
}
123125

backends/vulkan/runtime/graph/ops/impl/Convolution.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -407,7 +407,7 @@ void add_conv1d_node(
407407
const ValueRef out,
408408
const bool clamp_out) {
409409
ValueRef arg_weight = prepack_standard(
410-
graph, weight, graph.storage_type_of(out), utils::kWidthPacked);
410+
graph, weight, graph.storage_type_of(out), utils::kChannelsPacked);
411411
ValueRef arg_bias = prepack_biases(
412412
graph,
413413
bias,

docs/source/executorch-arm-delegate-tutorial.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -322,7 +322,6 @@ ethos_u_build_dir=examples/arm/executor_runner/
322322
elf=$(find ${ethos_u_build_dir} -name "arm_executor_runner")
323323

324324
FVP_Corstone_SSE-320_Ethos-U85 \
325-
-C mps4_board.subsystem.cpu0.CFGITCMSZ=11 \
326325
-C mps4_board.subsystem.ethosu.num_macs=${num_macs} \
327326
-C mps4_board.visualisation.disable-visualisation=1 \
328327
-C vis_hdlcd.disable_visualisation=1 \

examples/arm/ethos-u-setup/core_platform/patches/0001-Add-.data-fixup-from-Corestone-300.patch

Lines changed: 0 additions & 24 deletions
This file was deleted.
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
From 0fb46c2fe4a072546f87c6cb9202d5001f1eb9c5 Mon Sep 17 00:00:00 2001
2+
From: George Gekov <[email protected]>
3+
Date: Mon, 18 Nov 2024 11:24:11 +0000
4+
Subject: [PATCH] Move rodata to the DDR
5+
6+
---
7+
targets/corstone-300/platform.ld | 4 ++--
8+
1 file changed, 2 insertions(+), 2 deletions(-)
9+
10+
diff --git a/targets/corstone-300/platform.ld b/targets/corstone-300/platform.ld
11+
index b458fc6..8d4bc73 100644
12+
--- a/targets/corstone-300/platform.ld
13+
+++ b/targets/corstone-300/platform.ld
14+
@@ -154,7 +154,7 @@ SECTIONS
15+
*(SORT(.dtors.*))
16+
*(.dtors)
17+
18+
- *(.rodata*)
19+
+
20+
21+
KEEP(*(.eh_frame*))
22+
} > ITCM :rom_exec
23+
@@ -280,7 +280,7 @@ SECTIONS
24+
#endif
25+
* (expected_output_data_sec)
26+
* (sec_command_stream, sec_weight_data, sec_input_data)
27+
-
28+
+ *(.rodata*)
29+
* (ethosu_core_in_queue)
30+
* (ethosu_core_out_queue)
31+
. = ALIGN(4);
32+
--
33+
2.25.1
34+

examples/arm/ethos-u-setup/core_platform/patches/0001-New-phdr-for-.data-section.patch

Lines changed: 0 additions & 33 deletions
This file was deleted.

0 commit comments

Comments
 (0)