
Commit 2060434

SS-JIA authored and facebook-github-bot committed
Implement slice as a view (pytorch#5590)
Summary: Pull Request resolved: pytorch#5590

## Context

Implement slice as a view operator. This is only valid under the following conditions:

* All dims preceding the sliced dim in the dim order have a size of 1
* start is 0
* step is 1

These restrictions ensure that the slice view has zero offset with respect to the source buffer; more details are in the code comments. To test the operator effectively, this diff also extends the test codegen to handle multiple test suites for one operator, each with a different configuration.

ghstack-source-id: 244431147
exported-using-ghexport

Reviewed By: jorgep31415

Differential Revision: D61666462

fbshipit-source-id: e4645ec672be0699c88eb1bb88fdef5b4e5cfdb1
1 parent 3f447d7 commit 2060434
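The conditions above amount to requiring that the sliced region be a zero-offset, contiguous prefix of the source buffer. A minimal standalone sketch of that check, using plain Python lists and a hypothetical function name rather than the ComputeGraph API:

```python
# Illustrative only: mirrors the slice-as-view conditions from the summary,
# not the backend's actual validation code.
def can_slice_as_view(sizes, dim_order, dim, start, step):
    if start != 0:  # a nonzero start would give the view a buffer offset
        return False
    if step != 1:   # a step > 1 would make the sliced region non-contiguous
        return False
    # All dims preceding `dim` in the dim order must have size 1 so that the
    # sliced region is a contiguous prefix of the source buffer.
    for d in dim_order:
        if d == dim:
            break
        if sizes[d] != 1:
            return False
    return True

# Channel slice of a [1, 17, 1, 10] tensor with a contiguous dim order: OK.
print(can_slice_as_view([1, 17, 1, 10], [0, 1, 2, 3], dim=1, start=0, step=1))  # True
# With batch size 2 the same slice cannot be a view.
print(can_slice_as_view([2, 17, 1, 10], [0, 1, 2, 3], dim=1, start=0, step=1))  # False
```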

9 files changed: +239 −13 lines changed


backends/vulkan/runtime/graph/ops/impl/Slice.cpp

Lines changed: 152 additions & 5 deletions
@@ -10,6 +10,8 @@
 
 #include <executorch/backends/vulkan/runtime/graph/Logging.h>
 
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Slice.h>
+
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
@@ -31,7 +33,7 @@ inline int64_t normalize_idx(
   return normalize(index, max);
 }
 
-void add_slice_tensor_out_node(
+void add_slice_tensor_copy_node(
     ComputeGraph& graph,
     ValueRef in,
     ValueRef dim_ref,
@@ -149,8 +151,126 @@ void add_slice_tensor_out_node(
   }
 }
 
-void slice_tensor_out(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  return add_slice_tensor_out_node(
+std::vector<int64_t> get_slice_sizes(
+    ComputeGraph& graph,
+    ValueRef in_ref,
+    ValueRef dim_ref,
+    ValueRef opt_start_ref,
+    ValueRef opt_end_ref) {
+  const int64_t dim = graph.extract_scalar<int64_t>(dim_ref);
+  std::optional<int64_t> opt_start =
+      graph.extract_optional_scalar<int64_t>(opt_start_ref);
+  std::optional<int64_t> opt_end =
+      graph.extract_optional_scalar<int64_t>(opt_end_ref);
+
+  int64_t dim_size = graph.size_at<int64_t>(dim, in_ref);
+  int64_t start = opt_start.value_or(0);
+  int64_t end = opt_end.value_or(dim_size);
+
+  start = normalize_idx(start, dim_size, 0);
+  end = normalize_idx(end, dim_size, dim_size);
+
+  std::vector<int64_t> new_out_sizes = graph.sizes_of(in_ref);
+  new_out_sizes.at(dim) = end - start;
+
+  return new_out_sizes;
+}
+
+void resize_slice_view_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args) {
+  (void)args;
+  vTensorPtr out = graph->get_tensor(extra_args[0]);
+
+  std::vector<int64_t> new_out_sizes = get_slice_sizes(
+      *graph,
+      extra_args[1], // input
+      extra_args[2], // dim
+      extra_args[3], // optional start
+      extra_args[4]); // optional end
+
+  out->virtual_resize(new_out_sizes);
+}
+
+void check_slice_view_args(
+    ComputeGraph& graph,
+    ValueRef in_ref,
+    ValueRef dim_ref,
+    ValueRef opt_start_ref,
+    ValueRef opt_end_ref,
+    ValueRef opt_step_ref,
+    ValueRef out_ref) {
+  VK_CHECK_COND(
+      graph.val_is_view_of(out_ref, in_ref),
+      "output must be a view of the input");
+
+  const int64_t dim = graph.extract_scalar<int64_t>(dim_ref);
+  const int64_t dim_size = graph.size_at<int64_t>(dim, in_ref);
+
+  int64_t start =
+      graph.extract_optional_scalar<int64_t>(opt_start_ref).value_or(0);
+  int64_t end = graph.extract_optional_scalar<int64_t>(opt_end_ref).value_or(0);
+  int64_t step =
+      graph.extract_optional_scalar<int64_t>(opt_step_ref).value_or(1);
+
+  start = normalize_idx(start, dim_size, 0);
+  end = normalize_idx(end, dim_size, dim_size);
+
+  // The start idx must be 0; this is to ensure that the start of the slice view
+  // does not have any offset with respect to the base buffer storage. If the
+  // offset is nonzero, then it will potentially change upon a resize; however
+  // the buffer offset of the view tensor will have been "locked in" when the
+  // descriptor for its buffer storage is bound to a compute shader. Therefore
+  // there is no way to update the offset of the view once it has been bound.
+  VK_CHECK_COND(start == 0, "start must be 0 for slice view");
+  VK_CHECK_COND(step == 1, "step must be 1 for slice view");
+
+  VK_CHECK_COND(
+      end < dim_size, "end must be less than dim size for slice view");
+
+  // We must also check that all earlier dims in the dim order have a size of 1.
+  // This ensures that the slice view encompasses a contiguous memory region of
+  // the source tensor's memory buffer.
+  std::vector<int64_t> in_sizes = graph.sizes_of(in_ref);
+  std::vector<int64_t> in_dim_order = graph.dim_order_of(in_ref);
+  for (int i = 0; i < in_dim_order.size(); ++i) {
+    if (in_dim_order[i] == dim) {
+      break;
+    }
+    VK_CHECK_COND(in_sizes[in_dim_order[i]] == 1);
+  }
+}
+
+void add_slice_view_node(
+    ComputeGraph& graph,
+    ValueRef in_ref,
+    ValueRef dim_ref,
+    ValueRef opt_start_ref,
+    ValueRef opt_end_ref,
+    ValueRef opt_step_ref,
+    ValueRef out_ref) {
+  check_slice_view_args(
+      graph,
+      in_ref,
+      dim_ref,
+      opt_start_ref,
+      opt_end_ref,
+      opt_step_ref,
+      out_ref);
+
+  std::vector<int64_t> new_out_sizes =
+      get_slice_sizes(graph, in_ref, dim_ref, opt_start_ref, opt_end_ref);
+
+  graph.get_tensor(out_ref)->virtual_resize(new_out_sizes);
+
+  graph.execute_nodes().emplace_back(new ExecuteNode(
+      resize_slice_view_node,
+      {out_ref, in_ref, dim_ref, opt_start_ref, opt_end_ref, opt_step_ref}));
+}
+
+void slice_tensor_copy(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  return add_slice_tensor_copy_node(
       graph,
       args[0],
       args[1], // dim
@@ -160,9 +280,36 @@ void slice_tensor_out(ComputeGraph& graph, const std::vector<ValueRef>& args) {
       args[5]);
 }
 
+void slice_tensor(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  ValueRef in = args[0];
+  ValueRef out = args[5];
+
+  // Special case if out is a view of in
+  if (graph.val_is_view_of(out, in)) {
+    add_slice_view_node(
+        graph,
+        in,
+        args[1], // dim
+        args[2], // optional start
+        args[3], // optional end
+        args[4], // step
+        out);
+    return;
+  }
+
+  add_slice_tensor_copy_node(
+      graph,
+      in,
+      args[1], // dim
+      args[2], // optional start
+      args[3], // optional end
+      args[4], // step
+      out);
+}
+
 REGISTER_OPERATORS {
-  VK_REGISTER_OP(aten.slice_copy.Tensor, slice_tensor_out);
-  VK_REGISTER_OP(aten.slice.Tensor, slice_tensor_out);
+  VK_REGISTER_OP(aten.slice_copy.Tensor, slice_tensor_copy);
+  VK_REGISTER_OP(aten.slice.Tensor, slice_tensor);
 }
 
 } // namespace vkcompute
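The reasoning behind check_slice_view_args can be worked through numerically. Assuming strides derived from the dim order (last dim in the order densest), the slice is representable as a view only when it maps to a zero-offset, contiguous range of the source buffer; the sketch below, with hypothetical helper names, shows that arithmetic for one of the shapes used in the tests:

```python
# Illustration of why the slice-view checks imply a zero-offset, contiguous
# region. Helper name and layout assumptions are illustrative, not the
# backend's actual stride computation.
def contiguous_strides(sizes, dim_order):
    # The last dim in dim_order is densest (stride 1); accumulate backwards.
    strides = [0] * len(sizes)
    acc = 1
    for d in reversed(dim_order):
        strides[d] = acc
        acc *= sizes[d]
    return strides

sizes = [1, 17, 1, 10]            # N, C, H, W
dim_order = [0, 1, 2, 3]          # assumed contiguous dim order
strides = contiguous_strides(sizes, dim_order)  # [170, 10, 10, 1]

dim, start, end = 1, 0, 4         # slice channels [0, 4)

offset = start * strides[dim]          # 0, because start == 0
extent = (end - start) * strides[dim]  # 40 contiguous elements

# Every dim before `dim` in the dim order has size 1, so the slice covers
# buffer elements [0, 40) out of 170 -- a zero-offset contiguous prefix.
print(offset, extent)  # 0 40
```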
backends/vulkan/runtime/graph/ops/impl/Slice.h

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/backends/vulkan/runtime/api/api.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>
+
+#include <vector>
+
+namespace vkcompute {
+
+void add_slice_view_node(
+    ComputeGraph& graph,
+    ValueRef in_ref,
+    ValueRef dim_ref,
+    ValueRef opt_start_ref,
+    ValueRef opt_end_ref,
+    ValueRef opt_step_ref,
+    ValueRef out_ref);
+
+} // namespace vkcompute

backends/vulkan/test/glsl/scalar_add_texture.glsl

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@
 layout(std430) buffer;
 
 ${layout_declare_tensor(0, "rw", "t_in", "float", "texture3d")}
-${layout_declare_ubo(1, "uvec3", "extents")}
+${layout_declare_ubo(1, "ivec3", "extents")}
 ${layout_declare_ubo(2, "int", "scalar")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

backends/vulkan/test/op_tests/cases.py

Lines changed: 35 additions & 2 deletions
@@ -466,8 +466,8 @@ def get_view_inputs():
     return test_suite
 
 
-@register_test_suite(["aten.slice.Tensor", "aten.slice_copy.Tensor"])
-def get_slice_inputs():
+@register_test_suite("aten.slice_copy.Tensor")
+def get_slice_out_inputs():
     Test = namedtuple("VkSliceTest", ["self", "dim", "start", "end", "step"])
     Test.__new__.__defaults__ = (None, 0, None, None, 1)
 
@@ -549,6 +549,39 @@ def get_slice_inputs():
     return test_suite
 
 
+def get_slice_view_inputs():
+    Test = namedtuple("VkSliceTest", ["self", "dim", "start", "end", "step"])
+    Test.__new__.__defaults__ = (None, 0, None, None, 1)
+
+    # Slice by channel
+    test_cases = [
+        Test(self=[1, 17, 1, 10], dim=1, start=0, end=4),
+        Test(self=[1, 17, 1, 10], dim=1, start=0, end=8),
+        Test(self=[1, 17, 3, 7], dim=1, start=0, end=12),
+    ]
+
+    test_suite = VkTestSuite([tuple(tc) for tc in test_cases])
+
+    test_suite.dtypes = ["at::kFloat"]
+    test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"]
+    test_suite.layouts = ["utils::kWidthPacked"]
+    test_suite.data_gen = "make_seq_tensor"
+    test_suite.is_view_op = True
+
+    return test_suite
+
+
+@register_test_suite(["aten.slice.Tensor"])
+def get_slice_inputs():
+    texture_test_suite = get_slice_out_inputs()
+    texture_test_suite.test_name_suffix = "no_view"
+
+    view_test_suite = get_slice_view_inputs()
+    view_test_suite.test_name_suffix = "view"
+
+    return [view_test_suite, texture_test_suite]
+
+
 @register_test_suite(["aten.transpose.int"])
 def get_transpose_inputs():
     Test = namedtuple("VkTransposeViewTest", ["self", "dim0", "dim1"])
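A note on the namedtuple pattern used in these suites: assigning to `Test.__new__.__defaults__` supplies a default for every field, so each case only spells out the fields it cares about, and `tuple(tc)` flattens the case into the positional form the suite stores. A small self-contained example:

```python
from collections import namedtuple

# Same pattern as the VkSliceTest cases above.
Test = namedtuple("VkSliceTest", ["self", "dim", "start", "end", "step"])
Test.__new__.__defaults__ = (None, 0, None, None, 1)

# `step` is omitted, so it falls back to the default of 1.
tc = Test(self=[1, 17, 1, 10], dim=1, start=0, end=4)
print(tuple(tc))  # ([1, 17, 1, 10], 1, 0, 4, 1)
```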

backends/vulkan/test/op_tests/generate_op_benchmarks.py

Lines changed: 6 additions & 2 deletions
@@ -43,9 +43,13 @@ def process_test_suites(
     f_map: Dict[str, NativeFunction],
     test_suites: Dict[str, TestSuite],
 ) -> None:
-    for registry_name, op_test_suite in test_suites.items():
+    for registry_name, op_test_suites in test_suites.items():
         f = f_map[registry_name]
-        cpp_generator.add_suite(registry_name, f, op_test_suite)
+        if isinstance(op_test_suites, list):
+            for suite in op_test_suites:
+                cpp_generator.add_suite(registry_name, f, suite)
+        else:
+            cpp_generator.add_suite(registry_name, f, op_test_suites)
 
 
 @local.parametrize(

backends/vulkan/test/op_tests/generate_op_correctness_tests.py

Lines changed: 6 additions & 2 deletions
@@ -43,9 +43,13 @@ def process_test_suites(
     f_map: Dict[str, NativeFunction],
     test_suites: Dict[str, TestSuite],
 ) -> None:
-    for registry_name, op_test_suite in test_suites.items():
+    for registry_name, op_test_suites in test_suites.items():
         f = f_map[registry_name]
-        cpp_generator.add_suite(registry_name, f, op_test_suite)
+        if isinstance(op_test_suites, list):
+            for suite in op_test_suites:
+                cpp_generator.add_suite(registry_name, f, suite)
+        else:
+            cpp_generator.add_suite(registry_name, f, op_test_suites)
 
 
 @local.parametrize(
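The same change lands in both generators: a registered suite function may now return either a single TestSuite or a list of them (as get_slice_inputs does above), so the processing loop branches on the value's type. An equivalent formulation normalizes to a list first; a sketch of that idiom with simplified types:

```python
# Sketch only: `add_suite` stands in for cpp_generator.add_suite, and the
# dict values are either a single suite object or a list of suite objects.
def process_test_suites(test_suites, f_map, add_suite):
    for registry_name, op_test_suites in test_suites.items():
        f = f_map[registry_name]
        if not isinstance(op_test_suites, list):
            op_test_suites = [op_test_suites]  # normalize a single suite to a list
        for suite in op_test_suites:
            add_suite(registry_name, f, suite)
```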

backends/vulkan/test/op_tests/utils/gen_computegraph.py

Lines changed: 7 additions & 0 deletions
@@ -658,6 +658,9 @@ def gen_conditional_skips(self, skip_str: str = "GTEST_SKIP();") -> str:
 
     def gen_op_check_fn(self) -> str:
         op_name = self.f.func.name.unambiguous_name()
+        if self.suite_def.test_name_suffix is not None:
+            op_name += "_" + self.suite_def.test_name_suffix
+
         op_check_fn = self.gen_decl(f"check_{op_name}") + " {\n"
         if self.should_prepack:
             op_check_fn = self.gen_decl(f"prepacked_check_{op_name}") + " {\n"
@@ -676,6 +679,8 @@ def gen_op_check_fn(self) -> str:
 
     def gen_build_graph_fn(self, include_declarations: bool = False) -> str:
         op_name = self.f.func.name.unambiguous_name()
+        if self.suite_def.test_name_suffix is not None:
+            op_name += "_" + self.suite_def.test_name_suffix
         op_build_graph_fn = self.gen_decl(f"build_graph_{op_name}") + " {\n"
         if self.should_prepack:
             op_build_graph_fn = (
@@ -691,6 +696,8 @@ def gen_build_graph_fn(self, include_declarations: bool = False) -> str:
 
     def gen_op_exec_graph_fn(self) -> str:
         op_name = self.f.func.name.unambiguous_name()
+        if self.suite_def.test_name_suffix is not None:
+            op_name += "_" + self.suite_def.test_name_suffix
         op_benchmark_fn = self.gen_decl(f"benchmark_{op_name}") + " {\n"
         if self.should_prepack:
             op_benchmark_fn = self.gen_decl(f"prepacked_benchmark_{op_name}") + " {\n"
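The suffix matters because one operator can now map to several test suites; without it, the generated check_*/build_graph_*/benchmark_* functions for aten.slice.Tensor would collide. A rough sketch of the resulting naming scheme (the base name "slice_Tensor" is an assumed example of what unambiguous_name() returns, not verified here):

```python
# Illustrates how test_name_suffix keeps generated function names unique when
# one operator registers multiple suites. "slice_Tensor" is an assumed base name.
def generated_names(base_op_name, test_name_suffix=None):
    op_name = base_op_name
    if test_name_suffix is not None:
        op_name += "_" + test_name_suffix
    return [f"check_{op_name}", f"build_graph_{op_name}", f"benchmark_{op_name}"]

print(generated_names("slice_Tensor", "view"))
# ['check_slice_Tensor_view', 'build_graph_slice_Tensor_view', 'benchmark_slice_Tensor_view']
print(generated_names("slice_Tensor", "no_view"))
# ['check_slice_Tensor_no_view', 'build_graph_slice_Tensor_no_view', 'benchmark_slice_Tensor_no_view']
```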

backends/vulkan/test/op_tests/utils/gen_correctness_base.py

Lines changed: 2 additions & 0 deletions
@@ -79,6 +79,8 @@ def __init__(self, f: NativeFunction, test_suite: TestSuite):
         self.f = f
         self.suite_def = test_suite
         self.op_name = f.func.name.unambiguous_name()
+        if test_suite.test_name_suffix is not None:
+            self.op_name += f"_{test_suite.test_name_suffix}"
 
         self.f_sig = CppSignatureGroup.from_native_function(
             self.f, method=False, fallback_binding=self.f.manual_cpp_binding

backends/vulkan/test/op_tests/utils/test_suite.py

Lines changed: 2 additions & 1 deletion
@@ -5,7 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 
 from dataclasses import dataclass
-from typing import Any, List
+from typing import Any, List, Optional
 
 ###################################
 ## Generic Test Suite definition ##
@@ -29,6 +29,7 @@ def __init__(self, input_cases: List[Any]):
         self.rtol: str = "1e-5"
 
         self.is_view_op: bool = False
+        self.test_name_suffix: Optional[str] = None
 
     def supports_prepack(self):
         return len(self.prepacked_args) > 0
