Merge branch 'pytorch:main' into Arm-backend-Updated-toolchain-to-arm-gnu-toolchain-13.3.rel1

zingo · web-flow · commit c30cb7528581 · 2024-11-26T16:10:06.000+01:00
diff --git a/backends/arm/_passes/cast_int64_pass.py b/backends/arm/_passes/cast_int64_pass.py
@@ -5,8 +5,15 @@
 
 # pyre-unsafe
 
+import logging
+
 import torch
+from executorch.backends.arm._passes.arm_pass_utils import is_param_node
 from executorch.exir.pass_base import ExportPass, PassResult
+from torch._export.utils import is_buffer
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.WARNING)
 
 
 class CastInt64ToInt32Pass(ExportPass):
@@ -18,17 +25,34 @@ def _to_int32(self, graph_module: torch.fx.GraphModule):
         for node in graph_module.graph.nodes:
             fake_tensor = node.meta["val"]
             if isinstance(fake_tensor, torch._subclasses.fake_tensor.FakeTensor):
-                if node.meta["val"].dtype == torch.int64:
-                    node.meta["val"] = node.meta["val"].to(torch.int32)
-                    buffer_name = (
-                        self.exported_program.graph_signature.inputs_to_buffers[
-                            node.name
-                        ]
-                    )
-                    new_tensor = self.exported_program.state_dict[buffer_name].to(
-                        torch.int32
-                    )
-                    self.exported_program.state_dict[buffer_name] = new_tensor
+                if node.meta["val"].dtype == torch.int64 and is_param_node(
+                    self.exported_program, node
+                ):
+                    if is_buffer(self.exported_program, node):
+                        node.meta["val"] = node.meta["val"].to(torch.int32)
+                        buffer_name = (
+                            self.exported_program.graph_signature.inputs_to_buffers[
+                                node.name
+                            ]
+                        )
+                        # Read the tensor under the same state_dict key it is
+                        # written back to below (the buffer name, not node.name).
+                        buffer = self.exported_program.state_dict[buffer_name]
+                        logger.warning(
+                            f"Casting buffer {node.name} from torch.int64 to torch.int32"
+                            f" defined in {node.meta['stack_trace']}"
+                        )
+                        # Refuse the cast when a value does not fit in int32.
+                        if torch.min(buffer) < torch.iinfo(torch.int32).min:
+                            raise RuntimeError(
+                                f"Buffer {node.name} has value < {torch.iinfo(torch.int32).min}"
+                            )
+                        if torch.max(buffer) > torch.iinfo(torch.int32).max:
+                            raise RuntimeError(
+                                f"Buffer {node.name} has value > {torch.iinfo(torch.int32).max}"
+                            )
+                        buffer_int32 = buffer.to(torch.int32)
+                        self.exported_program.state_dict[buffer_name] = buffer_int32
 
     def call(self, graph_module: torch.fx.GraphModule):
         self._to_int32(graph_module)
diff --git a/backends/arm/_passes/scalars_to_attribute_pass.py b/backends/arm/_passes/scalars_to_attribute_pass.py
@@ -51,6 +51,11 @@ def call(self, graph_module: GraphModule) -> PassResult:
                 if isinstance(arg, Node):
                     new_args.append(arg)
                     continue
+                if isinstance(arg, int) and not torch.is_floating_point(
+                    get_first_fake_tensor(n)
+                ):
+                    new_args.append(arg)
+                    continue
 
                 prefix = "_tensor_constant_"
                 get_new_attr_name = get_new_attr_name_with_prefix(prefix)
diff --git a/backends/arm/test/ops/test_scalars.py b/backends/arm/test/ops/test_scalars.py
@@ -75,6 +75,12 @@ def forward(self, x):
             x = 1.0 + x
             return x
 
+    class ShiftInplaceSub(torch.nn.Module):
+        def forward(self, x):
+            x = x >> 4
+            x -= 10
+            return x
+
     # Inplace ops end with '_' (from aten naming)
     ops = [
         ("Add", Add()),
@@ -160,3 +166,6 @@ def test_MI_const(self, test_name: str, op: torch.nn.Module, x):
     @parameterized.expand(tensor_scalar_tests)
     def test_BI(self, test_name: str, op: torch.nn.Module, x, y):
         self._test_add_tosa_BI_pipeline(op, (x, y))
+
+    def test_shift_sub_inplace_tosa_MI(self):
+        self._test_add_tosa_MI_pipeline(self.ShiftInplaceSub(), (torch.IntTensor(5),))
diff --git a/backends/arm/test/runner_utils.py b/backends/arm/test/runner_utils.py
@@ -266,8 +266,6 @@ def run_corstone(
                 "-C",
                 "mps3_board.uart0.out_file='-'",
                 "-C",
-                "cpu0.CFGITCMSZ=11",
-                "-C",
                 "cpu0.semihosting-enable=1",
                 "-C",
                 "cpu0.semihosting-stack_base=0",
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl
@@ -101,23 +101,25 @@ void main() {
         // "k" tracks the kernel's index for our input-kernel computation.
         // It reads out-of-bound zeros, but trying to avoid them complicates
         // for-loop conditions, which results in worse performance.
-        for (int k = 0; k < kernel_size; k += 4) {
-          // Since the weight tensor is width-packed, which is along the length
-          // dimension, we can batch-read four elements at a time.
-          const ivec3 w_lpos = ivec3(k / 4, in_c % in_group_size, out_c);
-          const VEC4_T weight = load_texel_lpos(kernel_in, w_lpos, kernel_axis_map);
 
-          ivec3 in_pos = lpos_to_pos(ivec3(in_l + k * dilation, in_c, n / 4), in_axis_map);
-          sum = fma(weight.xxxx, load_texel(t_in, in_pos), sum);
-
-          in_pos[in_axis_map.x] += dilation;
-          sum = fma(weight.yyyy, load_texel(t_in, in_pos), sum);
+        // The weight tensor is channel-packed. This is not an obvious choice
+        // for performance, since it requires more data fetches. The reason is
+        // that for some sequence models, we found that the weight tensor
+        // (out_channel, in_channel / group, kernel) often has a large
+        // out_channel >> kernel, leading to non-optimal use of memory as the
+        // weight tensor gets very deep. As a mitigation, we use channel-packing
+        // for the weight tensor, yielding a 75% reduction in weight-tensor
+        // memory.
+
+        // It is possible to further reduce the memory footprint by swapping the
+        // dimensions, using x extent for out_channel, and y for kernel.
+        for (int k = 0; k < kernel_size; k += 1) {
+          const ivec3 w_lpos = ivec3(k, in_c % in_group_size, out_c / 4);
+          const VEC4_T weight_texel = load_texel_lpos(kernel_in, w_lpos, kernel_axis_map);
+          VEC4_T weight = VEC4_T(weight_texel[out_c % 4]);
 
-          in_pos[in_axis_map.x] += dilation;
-          sum = fma(weight.zzzz, load_texel(t_in, in_pos), sum);
-
-          in_pos[in_axis_map.x] += dilation;
-          sum = fma(weight.wwww, load_texel(t_in, in_pos), sum);
+          ivec3 in_pos = lpos_to_pos(ivec3(in_l + k * dilation, in_c, n / 4), in_axis_map);
+          sum = fma(weight, load_texel(t_in, in_pos), sum);
         }
       }
 
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -407,7 +407,7 @@ void add_conv1d_node(
     const ValueRef out,
     const bool clamp_out) {
   ValueRef arg_weight = prepack_standard(
-      graph, weight, graph.storage_type_of(out), utils::kWidthPacked);
+      graph, weight, graph.storage_type_of(out), utils::kChannelsPacked);
   ValueRef arg_bias = prepack_biases(
       graph,
       bias,
diff --git a/docs/source/executorch-arm-delegate-tutorial.md b/docs/source/executorch-arm-delegate-tutorial.md
@@ -322,7 +322,6 @@ ethos_u_build_dir=examples/arm/executor_runner/
 elf=$(find ${ethos_u_build_dir} -name "arm_executor_runner")
 
 FVP_Corstone_SSE-320_Ethos-U85                          \
-    -C mps4_board.subsystem.cpu0.CFGITCMSZ=11           \
     -C mps4_board.subsystem.ethosu.num_macs=${num_macs} \
     -C mps4_board.visualisation.disable-visualisation=1 \
     -C vis_hdlcd.disable_visualisation=1                \
diff --git a/examples/arm/ethos-u-setup/core_platform/patches/0001-Add-.data-fixup-from-Corestone-300.patch b/examples/arm/ethos-u-setup/core_platform/patches/0001-Add-.data-fixup-from-Corestone-300.patch
diff --git a/examples/arm/ethos-u-setup/core_platform/patches/0001-Move-rodata-to-the-DDR.patch b/examples/arm/ethos-u-setup/core_platform/patches/0001-Move-rodata-to-the-DDR.patch
@@ -0,0 +1,34 @@
+From 0fb46c2fe4a072546f87c6cb9202d5001f1eb9c5 Mon Sep 17 00:00:00 2001
+From: George Gekov <george.gekov@arm.com>
+Date: Mon, 18 Nov 2024 11:24:11 +0000
+Subject: [PATCH] Move rodata to the DDR
+
+---
+ targets/corstone-300/platform.ld | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/targets/corstone-300/platform.ld b/targets/corstone-300/platform.ld
+index b458fc6..8d4bc73 100644
+--- a/targets/corstone-300/platform.ld
++++ b/targets/corstone-300/platform.ld
+@@ -154,7 +154,7 @@ SECTIONS
+     *(SORT(.dtors.*))
+     *(.dtors)
+ 
+-    *(.rodata*)
++
+ 
+     KEEP(*(.eh_frame*))
+   } > ITCM :rom_exec
+@@ -280,7 +280,7 @@ SECTIONS
+ #endif
+     * (expected_output_data_sec)
+     * (sec_command_stream, sec_weight_data, sec_input_data)
+-
++    *(.rodata*)
+     * (ethosu_core_in_queue)
+     * (ethosu_core_out_queue)
+     . = ALIGN(4);
+-- 
+2.25.1
+
diff --git a/examples/arm/ethos-u-setup/core_platform/patches/0001-New-phdr-for-.data-section.patch b/examples/arm/ethos-u-setup/core_platform/patches/0001-New-phdr-for-.data-section.patch
diff --git a/examples/arm/ethos-u-setup/core_platform/patches/0003-Make-ITCM-1MB.patch b/examples/arm/ethos-u-setup/core_platform/patches/0003-Make-ITCM-1MB.patch
diff --git a/examples/arm/executor_runner/CMakeLists.txt b/examples/arm/executor_runner/CMakeLists.txt
@@ -234,6 +234,7 @@ target_link_libraries(
   quantized_kernels
   portable_kernels
   "-Wl,--no-whole-archive"
+  -Xlinker -Map=arm_executor_runner.map
 )
 
 # ET headers and generated headers includes
diff --git a/examples/arm/run.sh b/examples/arm/run.sh
@@ -229,7 +229,6 @@ function run_fvp() {
     if [[ ${target} == *"ethos-u55"*  ]]; then
         echo "Running ${elf} for ${target} run with FVP:${fvp_model} num_macs:${num_macs}"
         ${fvp_model}                                            \
-            -C cpu0.CFGITCMSZ=11                                \
             -C ethosu.num_macs=${num_macs}                      \
             -C mps3_board.visualisation.disable-visualisation=1 \
             -C mps3_board.telnetterminal0.start_telnet=0        \
@@ -241,7 +240,6 @@ function run_fvp() {
     elif [[ ${target} == *"ethos-u85"*  ]]; then
         echo "Running ${elf} for ${target} run with FVP:${fvp_model} num_macs:${num_macs}"
     	${fvp_model}                                            \
-            -C mps4_board.subsystem.cpu0.CFGITCMSZ=11           \
             -C mps4_board.subsystem.ethosu.num_macs=${num_macs} \
             -C mps4_board.visualisation.disable-visualisation=1 \
             -C vis_hdlcd.disable_visualisation=1                \