Commit c5ff517

Update on "[ET-VK] Allow clone op to transfer between memory layouts and storage types"
## Changes

As title. Extend the functionality of the `aten.clone` operator to allow transitioning the storage type and memory layout between the input and output tensors.

## Context

This functionality will be used to transition input tensors to the optimal storage type and memory layout before entering the execution of an op. The transition nodes will be added by a memory metadata tagging pass that will be introduced in a subsequent diff.

Differential Revision: [D65277710](https://our.internmc.facebook.com/intern/diff/D65277710/)

[ghstack-poisoned]
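For illustration only, a minimal sketch of how a storage-type transition could be dispatched inside the Vulkan graph builder. The helper name, the argument order of `add_clone_node`, and `graph.storage_type_of` are assumptions made for this sketch; only `add_clone_node`, `add_image_to_buffer_node`, and `add_buffer_to_image_node` appear in this diff (see Clone.cpp below).

```cpp
// Hypothetical dispatch helper (not part of this diff): route a clone between
// two tensors to a kernel based on their storage types.
void add_clone_transfer_node(
    ComputeGraph& graph,
    const ValueRef in,
    const ValueRef out) {
  // graph.storage_type_of(...) is assumed here for illustration.
  const bool in_is_buffer = graph.storage_type_of(in) == utils::kBuffer;
  const bool out_is_buffer = graph.storage_type_of(out) == utils::kBuffer;

  if (!in_is_buffer && out_is_buffer) {
    // Texture -> buffer transfer; uses the renamed clone_image_to_buffer shader.
    add_image_to_buffer_node(graph, in, out);
  } else if (in_is_buffer && !out_is_buffer) {
    // Buffer -> texture transfer; uses the renamed clone_buffer_to_image shader.
    add_buffer_to_image_node(graph, in, out);
  } else {
    // Same storage type; a regular clone can still change the memory layout.
    add_clone_node(graph, in, out);  // argument order (in, out) assumed
  }
}
```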
2 parents 7513dfa + d2cd73d commit c5ff517

Showing 11 changed files with 46 additions and 263 deletions.


backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml

Lines changed: 1 addition & 1 deletion
@@ -19,5 +19,5 @@ image_to_nchw:
   - NAME: image_to_nchw_texture3d
   - NAME: image_to_nchw_texture2d
     STORAGE: texture2d
-  - NAME: image_to_buffer
+  - NAME: clone_image_to_buffer
     TO_STAGING: False

backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml

Lines changed: 1 addition & 1 deletion
@@ -19,5 +19,5 @@ nchw_to_image:
   - NAME: nchw_to_image_texture3d
   - NAME: nchw_to_image_texture2d
     STORAGE: texture2d
-  - NAME: buffer_to_image
+  - NAME: clone_buffer_to_image
     FROM_STAGING: False

backends/vulkan/runtime/graph/ops/impl/Clone.cpp

Lines changed: 7 additions & 3 deletions
@@ -25,7 +25,11 @@ void resize_clone_node(
   (void)extra_args;
   vTensorPtr out = graph->get_tensor(args[0].refs[0]);
   vTensorPtr in = graph->get_tensor(args[1].refs[0]);
-  out->virtual_resize(in->sizes());
+  // TODO: support for when dimensionality doesn't match, i.e. clone is used to
+  // implement squeeze.
+  if (out->dim() == in->dim()) {
+    out->virtual_resize(in->sizes());
+  }
 }
 
 void add_clone_node(
@@ -56,7 +60,7 @@ void add_image_to_buffer_node(
     ComputeGraph& graph,
     const ValueRef image,
     const ValueRef buffer) {
-  std::string kernel_name = "image_to_buffer";
+  std::string kernel_name = "clone_image_to_buffer";
   add_dtype_suffix(kernel_name, graph.dtype_of(image));
   vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name);
 
@@ -80,7 +84,7 @@ void add_buffer_to_image_node(
     ComputeGraph& graph,
     const ValueRef buffer,
     const ValueRef image) {
-  std::string kernel_name = "buffer_to_image";
+  std::string kernel_name = "clone_buffer_to_image";
   add_dtype_suffix(kernel_name, graph.dtype_of(image));
   vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name);

backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp

Lines changed: 1 addition & 1 deletion
@@ -56,8 +56,8 @@ vkapi::ShaderInfo get_tensor_to_nchw_shader(
   if (is_bitw8(v_src.dtype()) && v_src.storage_type() != utils::kBuffer &&
       !int8_buffer_enabled) {
     kernel_name = "bitw8_image_to_nchw_nobitw8buffer";
-    add_dtype_suffix(kernel_name, v_src);
     add_storage_type_suffix(kernel_name, v_src);
+    add_dtype_suffix(kernel_name, v_src);
     return VK_KERNEL_FROM_STR(kernel_name);
   }
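The only change here (and in test_utils.cpp below) is the order in which the shader-name suffixes are appended: storage type first, then dtype. A sketch of the effect, assuming the helpers append suffixes such as `_texture3d` and `_int8` (the exact suffix strings are assumptions for illustration):

```cpp
std::string kernel_name = "bitw8_image_to_nchw_nobitw8buffer";
add_storage_type_suffix(kernel_name, v_src);  // e.g. "bitw8_image_to_nchw_nobitw8buffer_texture3d"
add_dtype_suffix(kernel_name, v_src);         // e.g. "bitw8_image_to_nchw_nobitw8buffer_texture3d_int8"
// Before this change the helpers ran in the opposite order, producing
// e.g. "bitw8_image_to_nchw_nobitw8buffer_int8_texture3d".
```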

backends/vulkan/test/utils/test_utils.cpp

Lines changed: 1 addition & 1 deletion
@@ -118,8 +118,8 @@ void record_bitw8_image_to_nchw_nobitw8buffer_op(
   utils::uvec3 global_wg_size = {buffer_len, 1, 1};
 
   std::string kernel_name = "bitw8_image_to_nchw_nobitw8buffer";
-  add_dtype_suffix(kernel_name, v_src);
   add_storage_type_suffix(kernel_name, v_src);
+  add_dtype_suffix(kernel_name, v_src);
 
   context->submit_compute_job(
       VK_KERNEL_FROM_STR(kernel_name),

extension/llm/custom_ops/targets.bzl

Lines changed: 1 addition & 6 deletions
@@ -1,9 +1,4 @@
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
-load(
-    "@fbsource//xplat/executorch/kernels/portable:op_registration_util.bzl",
-    "get_compiler_optimization_flags",
-)
-
 
 def define_common_targets():
     """Defines targets that should be shared between fbcode and xplat.
@@ -39,7 +34,7 @@ def define_common_targets():
             "//executorch/kernels/portable/cpu/util:reduce_util",
             "//executorch/extension/llm/custom_ops/spinquant:fast_hadamard_transform",
         ],
-        compiler_flags = ["-Wno-missing-prototypes", "-Wno-global-constructors"] + get_compiler_optimization_flags(),
+        compiler_flags = ["-Wno-missing-prototypes", "-Wno-global-constructors"],
         visibility = [
            "//executorch/...",
            "//executorch/extension/llm/custom_ops/...",

kernels/optimized/cpu/binary_ops.h

Lines changed: 2 additions & 86 deletions
@@ -41,62 +41,10 @@ enum class ElementwiseOptimizedPath {
   kTreatAs1d,
   kBroadcast2dBy1d,
   kBroadcast2dBy1dReverseArguments,
-  kBroadcastNdByNd,
-  kBroadcastNdByNdReverseArguments,
 };
 
 namespace internal {
-
-// Find the single broadcast dimension if it exists.
-// This path aims to handle broadcast of the following form
-// A = [a1, a2,., 1, .., an]
-// B = [b1, b2,., bm, .., bn]
-// OR
-// A = [a1, a2,., am, .., an]
-// B = [b1, b2,., 1, .., bn]
-int32_t inline get_broadcast_dim(const Tensor& lhs, const Tensor& rhs) {
-  auto lhs_begin = arrayref_begin_ignoring_leading_1s(lhs.sizes());
-  auto lhs_end = lhs.sizes().end();
-
-  auto rhs_begin = arrayref_begin_ignoring_leading_1s(rhs.sizes());
-  auto rhs_end = rhs.sizes().end();
-
-  const auto lhs_size = lhs_end - lhs_begin;
-  const auto rhs_size = rhs_end - rhs_begin;
-
-  // Following example is not handled at the moment
-  // [1, 3, 4, 5]
-  // [2, 3, 4, 5]
-  if (lhs_size != rhs_size) {
-    return 0;
-  }
-
-  int32_t broadcast_dim = 0;
-  // Check
-  // 1. if any dim value is 1 (it constitutes a broadcast dim)
-  // 2. If more than one dim value is 1 (we cannot handle)
-  // 3. If non-1 dim values are equal
-  lhs_end--;
-  rhs_end--;
-  while (lhs_end != lhs_begin) {
-    if (*lhs_end == 1 || *rhs_end == 1) {
-      // If more than one broadcast dim is found, return 0.
-      if (broadcast_dim != 0) {
-        return 0;
-      }
-      // negative index is used
-      broadcast_dim = lhs_end - lhs.sizes().end();
-    } else if (*lhs_end != *rhs_end) {
-      // If non-1 dim values are not equal, return 0.
-      return 0;
-    }
-    lhs_end--;
-    rhs_end--;
-  }
-  return broadcast_dim;
-}
-
-inline ElementwiseOptimizedPath select_broadcast_optimized_path(
+inline ElementwiseOptimizedPath select_broadcast_2d_by_1d_optimized_path(
     const Tensor& lhs,
     const Tensor& rhs) {
   auto lhs_begin = arrayref_begin_ignoring_leading_1s(lhs.sizes());
@@ -115,17 +63,6 @@ inline ElementwiseOptimizedPath select_broadcast_optimized_path(
     return ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments;
   }
 
-  int32_t broadcast_dim = get_broadcast_dim(lhs, rhs);
-  // Right now we dont handle last dim broadcast
-  if (broadcast_dim < -1) {
-    if (std::count_if(rhs_begin, rhs_end, [](Tensor::SizesType x) {
-          return x == 1;
-        }) == 1) {
-      return ElementwiseOptimizedPath::kBroadcastNdByNd;
-    } else {
-      return ElementwiseOptimizedPath::kBroadcastNdByNdReverseArguments;
-    }
-  }
   return ElementwiseOptimizedPath::kNone;
 }
 } // namespace internal
@@ -148,28 +85,7 @@ ElementwiseOptimizedPath inline select_optimized_path(
       internal::sizes_match_ignoring_leading_1s(a.sizes(), b.sizes())))) {
     return ElementwiseOptimizedPath::kTreatAs1d;
   }
-  return internal::select_broadcast_optimized_path(a, b);
-}
-
-std::array<int32_t, 3> inline get_normalized_tensor_size(
-    const Tensor& a,
-    const int32_t broadcast_dim) {
-  ET_CHECK_MSG(
-      a.dim() > broadcast_dim,
-      "Size of tensor: %zd, must be larger than broadcast_dim: %d",
-      a.dim(),
-      broadcast_dim);
-  std::array<int32_t, 3> normalized_tensor_size;
-  normalized_tensor_size[0] = 1;
-  normalized_tensor_size[1] = a.size(broadcast_dim);
-  normalized_tensor_size[2] = 1;
-  for (size_t i = 0; i < broadcast_dim; i++) {
-    normalized_tensor_size[0] *= a.size(i);
-  }
-  for (size_t i = broadcast_dim + 1; i < a.dim(); i++) {
-    normalized_tensor_size[2] *= a.size(i);
-  }
-  return normalized_tensor_size;
+  return internal::select_broadcast_2d_by_1d_optimized_path(a, b);
 }
 
 } // namespace executor

kernels/optimized/cpu/op_mul.cpp

Lines changed: 7 additions & 30 deletions
@@ -130,19 +130,15 @@ Tensor& opt_mul_out(
   } else if (selected_optimized_path != ElementwiseOptimizedPath::kNone) {
     const Tensor* lhs;
     const Tensor* rhs;
-    if ((selected_optimized_path ==
-         ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) ||
-        (selected_optimized_path ==
-         ElementwiseOptimizedPath::kBroadcastNdByNdReverseArguments)) {
+    if (selected_optimized_path ==
+        ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) {
       lhs = &b;
       rhs = &a;
     } else {
       // Catch failure to update logic when adding new broadcasting possibility.
       ET_DCHECK(
-          (selected_optimized_path ==
-           ElementwiseOptimizedPath::kBroadcast2dBy1d) ||
-          (selected_optimized_path ==
-           ElementwiseOptimizedPath::kBroadcastNdByNd));
+          selected_optimized_path ==
+          ElementwiseOptimizedPath::kBroadcast2dBy1d);
       lhs = &a;
       rhs = &b;
     }
@@ -153,34 +149,15 @@ Tensor& opt_mul_out(
         InvalidArgument,
         out,
         "Failed to resize output tensor.");
-    int64_t outer_size = 1;
-    int64_t broadcast_size;
-    int64_t inner_size;
-    if ((selected_optimized_path ==
-         ElementwiseOptimizedPath::kBroadcastNdByNd) ||
-        (selected_optimized_path ==
-         ElementwiseOptimizedPath::kBroadcastNdByNdReverseArguments)) {
-      int32_t broadcast_dim = internal::get_broadcast_dim(*lhs, *rhs);
-      int32_t broadcast_dim_lhs = lhs->dim() + broadcast_dim;
-      auto normalized_tensor_size_lhs =
-          get_normalized_tensor_size(*lhs, broadcast_dim_lhs);
-      outer_size = normalized_tensor_size_lhs[0];
-      broadcast_size = normalized_tensor_size_lhs[1];
-      inner_size = normalized_tensor_size_lhs[2];
-    } else {
-      broadcast_size = lhs->sizes()[lhs->dim() - 2];
-      inner_size = lhs->sizes()[lhs->dim() - 1];
-    }
     ET_SWITCH_REALB_TYPES(out_type, ctx, "mul.out", CTYPE, [&]() {
       using Vec = executorch::vec::Vectorized<CTYPE>;
-      executorch::vec::broadcasting_map_3d_and_unsqueezed_3d<CTYPE>(
+      executorch::vec::broadcasting_map_2d_by_1d<CTYPE>(
           [](Vec x, Vec y) { return x * y; },
          out.mutable_data_ptr<CTYPE>(),
          lhs->const_data_ptr<CTYPE>(),
          rhs->const_data_ptr<CTYPE>(),
-          outer_size,
-          broadcast_size,
-          inner_size);
+          lhs->sizes()[lhs->dim() - 2],
+          lhs->sizes()[lhs->dim() - 1]);
     });
   } else {
     ScalarType common_type =

kernels/optimized/vec/functional_base.h

Lines changed: 24 additions & 44 deletions
@@ -326,49 +326,10 @@ inline void map4(
 }
 
 
-// This function implements broadcasting binary operation on two tensors
-// where lhs tensor is treated to be of shape [outer_size, broadcast_size, inner_size]
-// and rhs tensor is treated to be of shape [outer_size, 1, inner_size]
-// And this 1st dimension is considered broadcasting dimension
-// This formula can map broadcasting on any dim=broadcast_dim
-// for any two N dimensional tensors, where 0 < braodcast_dim < N-1
-template <typename scalar_t, typename Op>
-inline void broadcasting_map_3d_and_unsqueezed_3d(
-    const Op& vec_fun,
-    scalar_t* output_data,
-    const scalar_t* lhs,
-    const scalar_t* rhs,
-    int64_t outer_size,
-    int64_t broadcast_size,
-    int64_t inner_size) {
-  using Vec = vec::Vectorized<scalar_t>;
-  int64_t outer_stride_lhs = inner_size * broadcast_size;
-  int64_t outer_stride_rhs = inner_size;
-  int64_t broadcast_stride_lhs = inner_size;
-  for (int64_t outer_idx = 0; outer_idx < outer_size; ++outer_idx) {
-    const scalar_t* lhs_outer = lhs + outer_idx * outer_stride_lhs;
-    scalar_t* output_data_row = output_data + outer_idx * outer_stride_lhs;
-    const scalar_t* rhs_outer = rhs + outer_idx * outer_stride_rhs;
-    for (int64_t broadcast_idx = 0; broadcast_idx < broadcast_size; ++broadcast_idx) {
-      const scalar_t* lhs_outer_2 = lhs_outer + broadcast_idx * broadcast_stride_lhs;
-      scalar_t* output_data_row_2 = output_data_row + broadcast_idx * broadcast_stride_lhs;
-      int64_t inner_idx = 0;
-      for (; inner_idx < inner_size - (inner_size % Vec::size()); inner_idx += Vec::size()) {
-        Vec data_vec = Vec::loadu(lhs_outer_2 + inner_idx);
-        Vec data_vec2 = Vec::loadu(rhs_outer + inner_idx);
-        Vec output_vec = vec_fun(data_vec, data_vec2);
-        output_vec.store(output_data_row_2 + inner_idx);
-      }
-      if (inner_size - inner_idx > 0) {
-        Vec data_vec = Vec::loadu(lhs_outer_2 + inner_idx, inner_size - inner_idx);
-        Vec data_vec2 = Vec::loadu(rhs_outer + inner_idx, inner_size - inner_idx);
-        Vec output_vec = vec_fun(data_vec, data_vec2);
-        output_vec.store(output_data_row_2 + inner_idx, inner_size - inner_idx);
-      }
-    }
-  }
-}
-
+// Map vec_fun across input_data and input_data2, where input_data is
+// a two-dimensional array of size (size, size2), input_data2 is a
+// one-dimensional array of size size2, and input_data2 is broadcast
+// to be of size (size, size2).
 template <typename scalar_t, typename Op>
 inline void broadcasting_map_2d_by_1d(
     const Op& vec_fun,
@@ -377,8 +338,27 @@ inline void broadcasting_map_2d_by_1d(
     const scalar_t* input_data2,
     int64_t size,
     int64_t size2) {
-  broadcasting_map_3d_and_unsqueezed_3d(vec_fun, output_data, input_data, input_data2, 1, size, size2);
+  using Vec = vec::Vectorized<scalar_t>;
+  for (int64_t outer_idx = 0; outer_idx < size; ++outer_idx) {
+    const scalar_t* input_data_row = input_data + outer_idx * size2;
+    scalar_t* output_data_row = output_data + outer_idx * size2;
+    int64_t inner_idx = 0;
+    for (; inner_idx < size2 - (size2 % Vec::size()); inner_idx += Vec::size()) {
+      Vec data_vec = Vec::loadu(input_data_row + inner_idx);
+      Vec data_vec2 = Vec::loadu(input_data2 + inner_idx);
+      Vec output_vec = vec_fun(data_vec, data_vec2);
+      output_vec.store(output_data_row + inner_idx);
+    }
+    if (size2 - inner_idx > 0) {
+      Vec data_vec = Vec::loadu(input_data_row + inner_idx, size2 - inner_idx);
+      Vec data_vec2 = Vec::loadu(input_data2 + inner_idx, size2 - inner_idx);
+      Vec output_vec = vec_fun(data_vec, data_vec2);
+      output_vec.store(output_data_row + inner_idx, size2 - inner_idx);
+    }
+  }
 }
 
+
+
 } // namespace vec
 } // namespace executorch
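A usage sketch of the restored `broadcasting_map_2d_by_1d`, with illustrative values; the include path is an assumption, and the call mirrors the one in op_mul.cpp above.

```cpp
#include <executorch/kernels/optimized/vec/functional.h>  // assumed include path

void example() {
  constexpr int64_t size = 2;   // number of rows in the 2-D input
  constexpr int64_t size2 = 4;  // row length, and length of the 1-D input
  float lhs[size * size2] = {1, 2, 3, 4, 5, 6, 7, 8};
  float rhs[size2] = {10, 20, 30, 40};
  float out[size * size2];

  using Vec = executorch::vec::Vectorized<float>;
  executorch::vec::broadcasting_map_2d_by_1d<float>(
      [](Vec x, Vec y) { return x * y; },  // elementwise multiply
      out,
      lhs,
      rhs,
      size,
      size2);
  // out is now {10, 40, 90, 160, 50, 120, 210, 320}: rhs is broadcast
  // across each row of lhs.
}
```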
