
Commit 83f0eb3

Update on "[ET-VK] Enable buffer implementation of aten.linear"
## Changes

As title. Extend the existing buffer implementation of `matmul` to support the linear operator as well.

Differential Revision: [D65277712](https://our.internmc.facebook.com/intern/diff/D65277712/)

[ghstack-poisoned]
2 parents 6f9a236 + 972a62b commit 83f0eb3
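
As context for the change: aten.linear is essentially the matmul that the buffer shader already implements, with the second operand transposed, out = x @ Wᵀ (+ bias). Below is a minimal plain-C++ reference of that relationship; it is an illustrative sketch only, not the Vulkan implementation, and the function name and flat row-major layout are assumptions.

#include <cstddef>
#include <vector>

// Reference semantics of aten.linear on flat row-major buffers:
// out[m][n] = sum_k x[m][k] * W[n][k] + b[n], i.e. a matmul with mat2 transposed.
void linear_reference(
    const std::vector<float>& x,  // M x K input
    const std::vector<float>& W,  // N x K weight, used as if transposed
    const std::vector<float>& b,  // N bias (may be empty)
    std::vector<float>& out,      // M x N output
    size_t M, size_t K, size_t N) {
  for (size_t m = 0; m < M; ++m) {
    for (size_t n = 0; n < N; ++n) {
      float acc = b.empty() ? 0.0f : b[n];
      for (size_t k = 0; k < K; ++k) {
        acc += x[m * K + k] * W[n * K + k];
      }
      out[m * N + n] = acc;
    }
  }
}

This is why the existing buffer matmul path, with its mat2_is_transposed handling (see MatMul.cpp below), can be reused for linear.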

12 files changed: +50, -268 lines

backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml

Lines changed: 1 addition & 1 deletion
@@ -19,5 +19,5 @@ image_to_nchw:
   - NAME: image_to_nchw_texture3d
   - NAME: image_to_nchw_texture2d
     STORAGE: texture2d
-  - NAME: image_to_buffer
+  - NAME: clone_image_to_buffer
     TO_STAGING: False

backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml

Lines changed: 1 addition & 1 deletion
@@ -19,5 +19,5 @@ nchw_to_image:
   - NAME: nchw_to_image_texture3d
   - NAME: nchw_to_image_texture2d
     STORAGE: texture2d
-  - NAME: buffer_to_image
+  - NAME: clone_buffer_to_image
     FROM_STAGING: False

backends/vulkan/runtime/graph/ops/impl/Clone.cpp

Lines changed: 7 additions & 3 deletions
@@ -25,7 +25,11 @@ void resize_clone_node(
   (void)extra_args;
   vTensorPtr out = graph->get_tensor(args[0].refs[0]);
   vTensorPtr in = graph->get_tensor(args[1].refs[0]);
-  out->virtual_resize(in->sizes());
+  // TODO: support for when dimensionality doesn't match, i.e. clone is used to
+  // implement squeeze.
+  if (out->dim() == in->dim()) {
+    out->virtual_resize(in->sizes());
+  }
 }
 
 void add_clone_node(
@@ -56,7 +60,7 @@ void add_image_to_buffer_node(
     ComputeGraph& graph,
     const ValueRef image,
     const ValueRef buffer) {
-  std::string kernel_name = "image_to_buffer";
+  std::string kernel_name = "clone_image_to_buffer";
   add_dtype_suffix(kernel_name, graph.dtype_of(image));
   vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name);
 
@@ -80,7 +84,7 @@ void add_buffer_to_image_node(
     ComputeGraph& graph,
     const ValueRef buffer,
     const ValueRef image) {
-  std::string kernel_name = "buffer_to_image";
+  std::string kernel_name = "clone_buffer_to_image";
   add_dtype_suffix(kernel_name, graph.dtype_of(image));
   vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name);
 

backends/vulkan/runtime/graph/ops/impl/MatMul.cpp

Lines changed: 4 additions & 5 deletions
@@ -77,11 +77,10 @@ void add_matmul_naive_buffer_node(
       graph.size_at<uint32_t>(-2, out),
       graph.size_at<uint32_t>(-3, out) * graph.size_at<uint32_t>(-4, out)};
 
-  int mat2_is_transposed_val = 0;
-  if (mat2_is_transposed != kDummyValueRef &&
-      graph.get_bool(mat2_is_transposed)) {
-    mat2_is_transposed_val = 1;
-  }
+  int mat2_is_transposed_val = (mat2_is_transposed != kDummyValueRef &&
+                                graph.get_bool(mat2_is_transposed))
+      ? 1
+      : 0;
 
   graph.execute_nodes().emplace_back(new DispatchNode(
       graph,

backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp

Lines changed: 1 addition & 1 deletion
@@ -56,8 +56,8 @@ vkapi::ShaderInfo get_tensor_to_nchw_shader(
   if (is_bitw8(v_src.dtype()) && v_src.storage_type() != utils::kBuffer &&
       !int8_buffer_enabled) {
     kernel_name = "bitw8_image_to_nchw_nobitw8buffer";
-    add_dtype_suffix(kernel_name, v_src);
     add_storage_type_suffix(kernel_name, v_src);
+    add_dtype_suffix(kernel_name, v_src);
     return VK_KERNEL_FROM_STR(kernel_name);
   }
 

backends/vulkan/test/utils/test_utils.cpp

Lines changed: 1 addition & 1 deletion
@@ -118,8 +118,8 @@ void record_bitw8_image_to_nchw_nobitw8buffer_op(
   utils::uvec3 global_wg_size = {buffer_len, 1, 1};
 
   std::string kernel_name = "bitw8_image_to_nchw_nobitw8buffer";
-  add_dtype_suffix(kernel_name, v_src);
   add_storage_type_suffix(kernel_name, v_src);
+  add_dtype_suffix(kernel_name, v_src);
 
   context->submit_compute_job(
       VK_KERNEL_FROM_STR(kernel_name),

extension/llm/custom_ops/targets.bzl

Lines changed: 1 addition & 6 deletions
@@ -1,9 +1,4 @@
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
-load(
-    "@fbsource//xplat/executorch/kernels/portable:op_registration_util.bzl",
-    "get_compiler_optimization_flags",
-)
-
 
 def define_common_targets():
     """Defines targets that should be shared between fbcode and xplat.
@@ -39,7 +34,7 @@ def define_common_targets():
             "//executorch/kernels/portable/cpu/util:reduce_util",
             "//executorch/extension/llm/custom_ops/spinquant:fast_hadamard_transform",
         ],
-        compiler_flags = ["-Wno-missing-prototypes", "-Wno-global-constructors"] + get_compiler_optimization_flags(),
+        compiler_flags = ["-Wno-missing-prototypes", "-Wno-global-constructors"],
        visibility = [
            "//executorch/...",
            "//executorch/extension/llm/custom_ops/...",

kernels/optimized/cpu/binary_ops.h

Lines changed: 2 additions & 86 deletions
@@ -41,62 +41,10 @@ enum class ElementwiseOptimizedPath {
   kTreatAs1d,
   kBroadcast2dBy1d,
   kBroadcast2dBy1dReverseArguments,
-  kBroadcastNdByNd,
-  kBroadcastNdByNdReverseArguments,
 };
 
 namespace internal {
-
-// Find the single broadcast dimension if it exists.
-// This path aims to handle broadcast of the following form
-// A = [a1, a2,., 1, .., an]
-// B = [b1, b2,., bm, .., bn]
-// OR
-// A = [a1, a2,., am, .., an]
-// B = [b1, b2,., 1, .., bn]
-int32_t inline get_broadcast_dim(const Tensor& lhs, const Tensor& rhs) {
-  auto lhs_begin = arrayref_begin_ignoring_leading_1s(lhs.sizes());
-  auto lhs_end = lhs.sizes().end();
-
-  auto rhs_begin = arrayref_begin_ignoring_leading_1s(rhs.sizes());
-  auto rhs_end = rhs.sizes().end();
-
-  const auto lhs_size = lhs_end - lhs_begin;
-  const auto rhs_size = rhs_end - rhs_begin;
-
-  // Following example is not handled at the moment
-  // [1, 3, 4, 5]
-  // [2, 3, 4, 5]
-  if (lhs_size != rhs_size) {
-    return 0;
-  }
-
-  int32_t broadcast_dim = 0;
-  // Check
-  // 1. if any dim value is 1 (it constitutes a broadcast dim)
-  // 2. If more than one dim value is 1 (we cannot handle)
-  // 3. If non-1 dim values are equal
-  lhs_end--;
-  rhs_end--;
-  while (lhs_end != lhs_begin) {
-    if (*lhs_end == 1 || *rhs_end == 1) {
-      // If more than one broadcast dim is found, return 0.
-      if (broadcast_dim != 0) {
-        return 0;
-      }
-      // negative index is used
-      broadcast_dim = lhs_end - lhs.sizes().end();
-    } else if (*lhs_end != *rhs_end) {
-      // If non-1 dim values are not equal, return 0.
-      return 0;
-    }
-    lhs_end--;
-    rhs_end--;
-  }
-  return broadcast_dim;
-}
-
-inline ElementwiseOptimizedPath select_broadcast_optimized_path(
+inline ElementwiseOptimizedPath select_broadcast_2d_by_1d_optimized_path(
     const Tensor& lhs,
     const Tensor& rhs) {
   auto lhs_begin = arrayref_begin_ignoring_leading_1s(lhs.sizes());
@@ -115,17 +63,6 @@ inline ElementwiseOptimizedPath select_broadcast_optimized_path(
     return ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments;
   }
 
-  int32_t broadcast_dim = get_broadcast_dim(lhs, rhs);
-  // Right now we dont handle last dim broadcast
-  if (broadcast_dim < -1) {
-    if (std::count_if(rhs_begin, rhs_end, [](Tensor::SizesType x) {
-          return x == 1;
-        }) == 1) {
-      return ElementwiseOptimizedPath::kBroadcastNdByNd;
-    } else {
-      return ElementwiseOptimizedPath::kBroadcastNdByNdReverseArguments;
-    }
-  }
   return ElementwiseOptimizedPath::kNone;
 }
 } // namespace internal
@@ -148,28 +85,7 @@ ElementwiseOptimizedPath inline select_optimized_path(
        internal::sizes_match_ignoring_leading_1s(a.sizes(), b.sizes())))) {
     return ElementwiseOptimizedPath::kTreatAs1d;
   }
-  return internal::select_broadcast_optimized_path(a, b);
-}
-
-std::array<int32_t, 3> inline get_normalized_tensor_size(
-    const Tensor& a,
-    const int32_t broadcast_dim) {
-  ET_CHECK_MSG(
-      a.dim() > broadcast_dim,
-      "Size of tensor: %zd, must be larger than broadcast_dim: %d",
-      a.dim(),
-      broadcast_dim);
-  std::array<int32_t, 3> normalized_tensor_size;
-  normalized_tensor_size[0] = 1;
-  normalized_tensor_size[1] = a.size(broadcast_dim);
-  normalized_tensor_size[2] = 1;
-  for (size_t i = 0; i < broadcast_dim; i++) {
-    normalized_tensor_size[0] *= a.size(i);
-  }
-  for (size_t i = broadcast_dim + 1; i < a.dim(); i++) {
-    normalized_tensor_size[2] *= a.size(i);
-  }
-  return normalized_tensor_size;
+  return internal::select_broadcast_2d_by_1d_optimized_path(a, b);
 }
 
 } // namespace executor

kernels/optimized/cpu/op_mul.cpp

Lines changed: 7 additions & 30 deletions
@@ -130,19 +130,15 @@ Tensor& opt_mul_out(
   } else if (selected_optimized_path != ElementwiseOptimizedPath::kNone) {
     const Tensor* lhs;
     const Tensor* rhs;
-    if ((selected_optimized_path ==
-         ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) ||
-        (selected_optimized_path ==
-         ElementwiseOptimizedPath::kBroadcastNdByNdReverseArguments)) {
+    if (selected_optimized_path ==
+        ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) {
       lhs = &b;
       rhs = &a;
     } else {
       // Catch failure to update logic when adding new broadcasting possibility.
       ET_DCHECK(
-          (selected_optimized_path ==
-           ElementwiseOptimizedPath::kBroadcast2dBy1d) ||
-          (selected_optimized_path ==
-           ElementwiseOptimizedPath::kBroadcastNdByNd));
+          selected_optimized_path ==
+          ElementwiseOptimizedPath::kBroadcast2dBy1d);
       lhs = &a;
       rhs = &b;
     }
@@ -153,34 +149,15 @@ Tensor& opt_mul_out(
         InvalidArgument,
         out,
         "Failed to resize output tensor.");
-    int64_t outer_size = 1;
-    int64_t broadcast_size;
-    int64_t inner_size;
-    if ((selected_optimized_path ==
-         ElementwiseOptimizedPath::kBroadcastNdByNd) ||
-        (selected_optimized_path ==
-         ElementwiseOptimizedPath::kBroadcastNdByNdReverseArguments)) {
-      int32_t broadcast_dim = internal::get_broadcast_dim(*lhs, *rhs);
-      int32_t broadcast_dim_lhs = lhs->dim() + broadcast_dim;
-      auto normalized_tensor_size_lhs =
-          get_normalized_tensor_size(*lhs, broadcast_dim_lhs);
-      outer_size = normalized_tensor_size_lhs[0];
-      broadcast_size = normalized_tensor_size_lhs[1];
-      inner_size = normalized_tensor_size_lhs[2];
-    } else {
-      broadcast_size = lhs->sizes()[lhs->dim() - 2];
-      inner_size = lhs->sizes()[lhs->dim() - 1];
-    }
     ET_SWITCH_REALB_TYPES(out_type, ctx, "mul.out", CTYPE, [&]() {
       using Vec = executorch::vec::Vectorized<CTYPE>;
-      executorch::vec::broadcasting_map_3d_and_unsqueezed_3d<CTYPE>(
+      executorch::vec::broadcasting_map_2d_by_1d<CTYPE>(
          [](Vec x, Vec y) { return x * y; },
          out.mutable_data_ptr<CTYPE>(),
          lhs->const_data_ptr<CTYPE>(),
          rhs->const_data_ptr<CTYPE>(),
-          outer_size,
-          broadcast_size,
-          inner_size);
+          lhs->sizes()[lhs->dim() - 2],
+          lhs->sizes()[lhs->dim() - 1]);
     });
   } else {
     ScalarType common_type =

kernels/optimized/vec/functional_base.h

Lines changed: 24 additions & 44 deletions
@@ -326,49 +326,10 @@ inline void map4(
 }
 
 
-// This function implements broadcasting binary operation on two tensors
-// where lhs tensor is treated to be of shape [outer_size, broadcast_size, inner_size]
-// and rhs tensor is treated to be of shape [outer_size, 1, inner_size]
-// And this 1st dimension is considered broadcasting dimension
-// This formula can map broadcasting on any dim=broadcast_dim
-// for any two N dimensional tensors, where 0 < braodcast_dim < N-1
-template <typename scalar_t, typename Op>
-inline void broadcasting_map_3d_and_unsqueezed_3d(
-    const Op& vec_fun,
-    scalar_t* output_data,
-    const scalar_t* lhs,
-    const scalar_t* rhs,
-    int64_t outer_size,
-    int64_t broadcast_size,
-    int64_t inner_size) {
-  using Vec = vec::Vectorized<scalar_t>;
-  int64_t outer_stride_lhs = inner_size * broadcast_size;
-  int64_t outer_stride_rhs = inner_size;
-  int64_t broadcast_stride_lhs = inner_size;
-  for (int64_t outer_idx = 0; outer_idx < outer_size; ++outer_idx) {
-    const scalar_t* lhs_outer = lhs + outer_idx * outer_stride_lhs;
-    scalar_t* output_data_row = output_data + outer_idx * outer_stride_lhs;
-    const scalar_t* rhs_outer = rhs + outer_idx * outer_stride_rhs;
-    for (int64_t broadcast_idx = 0; broadcast_idx < broadcast_size; ++broadcast_idx) {
-      const scalar_t* lhs_outer_2 = lhs_outer + broadcast_idx * broadcast_stride_lhs;
-      scalar_t* output_data_row_2 = output_data_row + broadcast_idx * broadcast_stride_lhs;
-      int64_t inner_idx = 0;
-      for (; inner_idx < inner_size - (inner_size % Vec::size()); inner_idx += Vec::size()) {
-        Vec data_vec = Vec::loadu(lhs_outer_2 + inner_idx);
-        Vec data_vec2 = Vec::loadu(rhs_outer + inner_idx);
-        Vec output_vec = vec_fun(data_vec, data_vec2);
-        output_vec.store(output_data_row_2 + inner_idx);
-      }
-      if (inner_size - inner_idx > 0) {
-        Vec data_vec = Vec::loadu(lhs_outer_2 + inner_idx, inner_size - inner_idx);
-        Vec data_vec2 = Vec::loadu(rhs_outer + inner_idx, inner_size - inner_idx);
-        Vec output_vec = vec_fun(data_vec, data_vec2);
-        output_vec.store(output_data_row_2 + inner_idx, inner_size - inner_idx);
-      }
-    }
-  }
-}
-
+// Map vec_fun across input_data and input_data2, where input_data is
+// a two-dimensional array of size (size, size2), input_data2 is a
+// one-dimensional array of size size2, and input_data2 is broadcast
+// to be of size (size, size2).
 template <typename scalar_t, typename Op>
 inline void broadcasting_map_2d_by_1d(
     const Op& vec_fun,
@@ -377,8 +338,27 @@ inline void broadcasting_map_2d_by_1d(
     const scalar_t* input_data2,
     int64_t size,
     int64_t size2) {
-  broadcasting_map_3d_and_unsqueezed_3d(vec_fun, output_data, input_data, input_data2, 1, size, size2);
+  using Vec = vec::Vectorized<scalar_t>;
+  for (int64_t outer_idx = 0; outer_idx < size; ++outer_idx) {
+    const scalar_t* input_data_row = input_data + outer_idx * size2;
+    scalar_t* output_data_row = output_data + outer_idx * size2;
+    int64_t inner_idx = 0;
+    for (; inner_idx < size2 - (size2 % Vec::size()); inner_idx += Vec::size()) {
+      Vec data_vec = Vec::loadu(input_data_row + inner_idx);
+      Vec data_vec2 = Vec::loadu(input_data2 + inner_idx);
+      Vec output_vec = vec_fun(data_vec, data_vec2);
+      output_vec.store(output_data_row + inner_idx);
+    }
+    if (size2 - inner_idx > 0) {
+      Vec data_vec = Vec::loadu(input_data_row + inner_idx, size2 - inner_idx);
+      Vec data_vec2 = Vec::loadu(input_data2 + inner_idx, size2 - inner_idx);
+      Vec output_vec = vec_fun(data_vec, data_vec2);
+      output_vec.store(output_data_row + inner_idx, size2 - inner_idx);
+    }
+  }
 }
 
+
+
 } // namespace vec
 } // namespace executorch
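
For reference, the restored broadcasting_map_2d_by_1d combines each row of a (size, size2) input with a single length-size2 vector. A scalar sketch of the same computation, without the Vectorized fast path used above (illustrative only; this helper name is hypothetical and not part of the library):

#include <cstdint>

// Scalar equivalent of broadcasting_map_2d_by_1d: out[i][j] = op(a[i][j], v[j]).
template <typename scalar_t, typename Op>
void broadcasting_map_2d_by_1d_scalar(
    const Op& op,
    scalar_t* output_data,
    const scalar_t* input_data,   // size x size2, row-major
    const scalar_t* input_data2,  // length size2, broadcast across rows
    int64_t size,
    int64_t size2) {
  for (int64_t i = 0; i < size; ++i) {
    for (int64_t j = 0; j < size2; ++j) {
      output_data[i * size2 + j] = op(input_data[i * size2 + j], input_data2[j]);
    }
  }
}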
