
Commit e2e2129

SS-JIA authored and facebook-github-bot committed

Implement repeat_interleave (pytorch#5830)

Summary:
Pull Request resolved: pytorch#5830

As title; implement the `repeat_interleave` operator. The current implementation has some limitations, which are documented in the code.

ghstack-source-id: 246028695
exported-using-ghexport

Reviewed By: jorgep31415

Differential Revision: D63790717

fbshipit-source-id: 090c9fc77d160619def1d0a2acd01d88185a311e

1 parent 79b7896 commit e2e2129

File tree

6 files changed: +228 −0 lines changed

backends/vulkan/runtime/graph/ComputeGraph.h
backends/vulkan/runtime/graph/ops/glsl/repeat_interleave.glsl
backends/vulkan/runtime/graph/ops/glsl/repeat_interleave.yaml
backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.cpp
backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.h
backends/vulkan/test/op_tests/cases.py

backends/vulkan/runtime/graph/ComputeGraph.h

Lines changed: 13 additions & 0 deletions
```diff
@@ -378,6 +378,19 @@ class ComputeGraph final {
     return values_.at(idx).toString();
   }
 
+  template <
+      typename T,
+      typename std::enable_if<
+          std::is_integral<T>::value && std::is_signed<T>::value,
+          int>::type = 0>
+  T extract_whcn_dim(const ValueRef idx, const int64_t ndim) {
+    T dim = extract_scalar<T>(idx);
+    // Normalize dim to account for negative indexing
+    dim = (dim % ndim + ndim) % ndim;
+    // Assume original value is NCHW ordering, obtain the WHCN ordering
+    return ndim - 1 - dim;
+  }
+
   //
   // Utility functions
   //
```
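The new helper maps a user-facing NCHW dim index (possibly negative) to the WHCN dim order used by the Vulkan backend: the dim is first normalized into `[0, ndim)`, then flipped so that the innermost NCHW dim becomes WHCN dim 0. A minimal standalone sketch of the same arithmetic (the `whcn_dim` function is illustrative, not part of the commit):

```cpp
#include <cassert>
#include <cstdint>

// Standalone copy of the extract_whcn_dim arithmetic, for illustration.
int64_t whcn_dim(int64_t dim, const int64_t ndim) {
  dim = (dim % ndim + ndim) % ndim; // normalize, e.g. -1 with ndim=4 -> 3
  return ndim - 1 - dim;            // flip the NCHW index to a WHCN index
}

int main() {
  assert(whcn_dim(-1, 4) == 0); // innermost NCHW dim (W) -> WHCN dim 0
  assert(whcn_dim(1, 4) == 2);  // NCHW channels dim -> WHCN dim 2
  assert(whcn_dim(-2, 3) == 1); // CHW height dim -> WHCN dim 1
  return 0;
}
```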
backends/vulkan/runtime/graph/ops/glsl/repeat_interleave.glsl

Lines changed: 47 additions & 0 deletions
```glsl
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#version 450 core

#define PRECISION ${PRECISION}

#define VEC4_T ${texel_load_type(DTYPE, STORAGE)}

${define_required_extensions(DTYPE)}

layout(std430) buffer;

${layout_declare_tensor(B, "w", "tout", DTYPE, STORAGE)}
${layout_declare_tensor(B, "r", "tin", DTYPE, STORAGE)}
${layout_declare_ubo(B, "ivec3", "tin_limits")}
${layout_declare_ubo(B, "ivec4", "tin_axis_map")}
${layout_declare_ubo(B, "ivec4", "tout_axis_map")}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

layout(constant_id = 3) const int nrepeats = 1;
layout(constant_id = 4) const int repeat_dim = 1;

#include "indexing_utils.h"

void main() {
  const ivec3 tin_lpos = ivec3(gl_GlobalInvocationID);

  if (any(greaterThanEqual(tin_lpos, tin_limits))) {
    return;
  }

  const VEC4_T intex = load_texel_lpos(tin, tin_lpos, tin_axis_map);

  ivec3 tout_lpos = tin_lpos;
  tout_lpos[repeat_dim] *= nrepeats;

  for (int i = 0; i < nrepeats; ++i, tout_lpos[repeat_dim]++) {
    write_texel_lpos(tout, tout_lpos, intex, tout_axis_map);
  }
}
```
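Each invocation reads one input texel and writes it unchanged to `nrepeats` consecutive output positions along `repeat_dim`, with `nrepeats` and `repeat_dim` baked in as specialization constants (constant_id 3 and 4). Because whole texels are copied, repeating along the packed dim (where each texel holds four elements) is not supported; the C++ op below guards against it. A 1-D CPU analogue of the index mapping (hypothetical helper, not from the commit):

```cpp
#include <vector>

// CPU analogue of the shader's mapping: input position p along the repeat
// dim maps to output positions [p * nrepeats, (p + 1) * nrepeats).
std::vector<float> repeat_interleave_1d(
    const std::vector<float>& in,
    const int nrepeats) {
  std::vector<float> out(in.size() * nrepeats);
  for (size_t p = 0; p < in.size(); ++p) {
    for (int i = 0; i < nrepeats; ++i) {
      out[p * nrepeats + i] = in[p]; // same value written nrepeats times
    }
  }
  return out;
}
// e.g. {1, 2} with nrepeats = 3 -> {1, 1, 1, 2, 2, 2}
```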
backends/vulkan/runtime/graph/ops/glsl/repeat_interleave.yaml

Lines changed: 10 additions & 0 deletions
```yaml
repeat_interleave:
  parameter_names_with_default_values:
    DTYPE: float
    STORAGE: texture3d
  generate_variant_forall:
    DTYPE:
      - VALUE: half
      - VALUE: float
  shader_variants:
    - NAME: repeat_interleave
```
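The `generate_variant_forall` block expands the shader template once per listed `DTYPE`, and the C++ op selects a variant at runtime by appending a dtype suffix to the base kernel name via `add_dtype_suffix`. A tiny sketch of the resulting variant names (the exact suffix scheme is an assumption based on the YAML values):

```cpp
#include <iostream>
#include <string>
#include <vector>

// Illustrative only: one shader variant per DTYPE listed in the YAML.
int main() {
  const std::string base = "repeat_interleave";
  for (const std::string& dtype : std::vector<std::string>{"half", "float"}) {
    std::cout << base + "_" + dtype << "\n"; // repeat_interleave_half, repeat_interleave_float
  }
  return 0;
}
```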
backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.cpp

Lines changed: 94 additions & 0 deletions
```cpp
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>

namespace vkcompute {

void resize_repeat_interleave_node(
    ComputeGraph* graph,
    const std::vector<ArgGroup>& args,
    const std::vector<ValueRef>& extra_args) {
  (void)extra_args;
  vTensorPtr out = graph->get_tensor(args[0].refs[0]);
  vTensorPtr in = graph->get_tensor(args[1].refs[0]);

  const int64_t nrepeats = graph->extract_scalar<int64_t>(extra_args[0]);
  int64_t repeat_dim = graph->extract_scalar<int64_t>(extra_args[1]);

  std::vector<int64_t> new_sizes = in->sizes();
  repeat_dim = normalize(repeat_dim, new_sizes.size());
  new_sizes.at(repeat_dim) *= nrepeats;

  out->virtual_resize(new_sizes);
}

void add_repeat_interleave_node(
    ComputeGraph& graph,
    const ValueRef in,
    const ValueRef num_repeats,
    const ValueRef dim,
    const ValueRef out) {
  const int32_t nrepeats = graph.extract_scalar<int32_t>(num_repeats);
  const int32_t repeat_dim =
      graph.extract_whcn_dim<int32_t>(dim, graph.dim_of(in));

  VK_CHECK_COND(repeat_dim != graph.packed_dim_of(out));
  VK_CHECK_COND(repeat_dim != graph.packed_dim_of(in));

  std::string kernel_name = "repeat_interleave";
  add_dtype_suffix(kernel_name, graph.dtype_of(out));

  const utils::uvec3 global_wg_size = graph.logical_limits_of(in);
  const utils::uvec3 local_wg_size = graph.create_local_wg_size(global_wg_size);

  graph.execute_nodes().emplace_back(new ExecuteNode(
      graph,
      // Shader
      VK_KERNEL_FROM_STR(kernel_name),
      // Workgroup sizes
      global_wg_size,
      local_wg_size,
      // Inputs and Outputs
      {{out, vkapi::MemoryAccessType::WRITE},
       {in, vkapi::MemoryAccessType::READ}},
      // Parameter buffers
      {graph.logical_limits_ubo(in),
       graph.axis_map_ubo(in),
       graph.axis_map_ubo(out)},
      // Specialization Constants
      {nrepeats, repeat_dim},
      // Resizing Logic
      resize_repeat_interleave_node,
      {num_repeats, dim}));
}

void repeat_interleave(ComputeGraph& graph, const std::vector<ValueRef>& args) {
  int args_i = 0;
  const ValueRef in = args[args_i++];
  const ValueRef num_repeats = args[args_i++];
  const ValueRef dim = args[args_i++];
  const ValueRef output_size = args[args_i++];
  const ValueRef out = args[args_i++];

  // Output size is not used in the kernel
  (void)output_size;

  add_repeat_interleave_node(graph, in, num_repeats, dim, out);
}

REGISTER_OPERATORS {
  VK_REGISTER_OP(aten.repeat_interleave.self_int, repeat_interleave);
}

} // namespace vkcompute
```
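The resize logic recomputes the output shape whenever input shapes change: the size along the (normalized) repeat dim is multiplied by `nrepeats`; all other dims are unchanged. A standalone sketch of that shape rule (the function name and inline normalization are illustrative; the real code calls a `normalize` helper):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Shape rule used by resize_repeat_interleave_node, in isolation.
std::vector<int64_t> repeat_interleave_sizes(
    std::vector<int64_t> sizes,
    const int64_t nrepeats,
    int64_t dim) {
  const int64_t ndim = static_cast<int64_t>(sizes.size());
  dim = (dim % ndim + ndim) % ndim; // normalize negative dims
  sizes.at(dim) *= nrepeats;
  return sizes;
}

int main() {
  // e.g. a (4, 32, 256) input with 3 repeats along dim -2 -> (4, 96, 256)
  const std::vector<int64_t> out =
      repeat_interleave_sizes({4, 32, 256}, 3, -2);
  assert((out == std::vector<int64_t>{4, 96, 256}));
  return 0;
}
```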
backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.h

Lines changed: 24 additions & 0 deletions
```cpp
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <executorch/backends/vulkan/runtime/api/api.h>

#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>

namespace vkcompute {

void add_repeat_interleave_node(
    ComputeGraph& graph,
    const ValueRef in,
    const ValueRef num_repeats,
    const ValueRef dim,
    const ValueRef out);

} // namespace vkcompute
```

backends/vulkan/test/op_tests/cases.py

Lines changed: 40 additions & 0 deletions
```diff
@@ -747,6 +747,46 @@ def get_repeat_inputs():
     return test_suite
 
 
+@register_test_suite("aten.repeat_interleave.self_int")
+def get_repeat_interleave_inputs():
+    test_suite_W = VkTestSuite(
+        [
+            ((4, 32, 256), 3, -2),
+            # Test repeat on each non-packed dim
+            ((16, 32, 64), 5, -2),
+            ((16, 32, 64), 5, -3),
+            # Test batched inputs
+            ((3, 5, 32, 64), 4, -2),
+            ((3, 5, 32, 64), 4, -3),
+        ]
+    )
+    test_suite_W.layouts = [
+        "utils::kWidthPacked",
+    ]
+    test_suite_W.data_gen = "make_seq_tensor"
+    test_suite_W.dtypes = ["at::kFloat"]
+    test_suite_W.test_name_suffix = "W_packed"
+
+    test_suite_C = VkTestSuite(
+        [
+            # Test repeat on each non-packed dim
+            ((32, 32, 16), 5, -1),
+            ((32, 32, 16), 5, -2),
+            # Test batched inputs
+            ((3, 16, 8, 64), 4, -1),
+            ((3, 16, 8, 64), 4, -2),
+        ]
+    )
+    test_suite_C.layouts = [
+        "utils::kChannelsPacked",
+    ]
+    test_suite_C.data_gen = "make_seq_tensor"
+    test_suite_C.dtypes = ["at::kFloat"]
+    test_suite_C.test_name_suffix = "C_packed"
+
+    return [test_suite_W, test_suite_C]
+
+
 @register_test_suite("aten.cat.default")
 def get_cat_inputs():
     # TensorList must be specified as list of tuples
```
