@@ -41,10 +41,62 @@ enum class ElementwiseOptimizedPath {
   kTreatAs1d,
   kBroadcast2dBy1d,
   kBroadcast2dBy1dReverseArguments,
+  kBroadcastNdByNd,
+  kBroadcastNdByNdReverseArguments,
 };
 
 namespace internal {
-inline ElementwiseOptimizedPath select_broadcast_2d_by_1d_optimized_path(
+
+// Find the single broadcast dimension if it exists.
+// This path aims to handle broadcasts of the following form:
+// A = [a1, a2, ..., 1, ..., an]
+// B = [b1, b2, ..., bm, ..., bn]
+// OR
+// A = [a1, a2, ..., am, ..., an]
+// B = [b1, b2, ..., 1, ..., bn]
+int32_t inline get_broadcast_dim(const Tensor& lhs, const Tensor& rhs) {
+  auto lhs_begin = arrayref_begin_ignoring_leading_1s(lhs.sizes());
+  auto lhs_end = lhs.sizes().end();
+
+  auto rhs_begin = arrayref_begin_ignoring_leading_1s(rhs.sizes());
+  auto rhs_end = rhs.sizes().end();
+
+  const auto lhs_size = lhs_end - lhs_begin;
+  const auto rhs_size = rhs_end - rhs_begin;
+
+  // The following example is not handled at the moment:
+  // [1, 3, 4, 5]
+  // [2, 3, 4, 5]
+  if (lhs_size != rhs_size) {
+    return 0;
+  }
+
+  int32_t broadcast_dim = 0;
+  // Check:
+  // 1. if any dim value is 1 (it constitutes a broadcast dim)
+  // 2. if more than one dim value is 1 (we cannot handle that)
+  // 3. if the non-1 dim values are equal
+  lhs_end--;
+  rhs_end--;
+  while (lhs_end != lhs_begin) {
+    if (*lhs_end == 1 || *rhs_end == 1) {
+      // If more than one broadcast dim is found, return 0.
+      if (broadcast_dim != 0) {
+        return 0;
+      }
+      // A negative index is used.
+      broadcast_dim = lhs_end - lhs.sizes().end();
+    } else if (*lhs_end != *rhs_end) {
+      // If the non-1 dim values are not equal, return 0.
+      return 0;
+    }
+    lhs_end--;
+    rhs_end--;
+  }
+  return broadcast_dim;
+}
+
+inline ElementwiseOptimizedPath select_broadcast_optimized_path(
     const Tensor& lhs,
     const Tensor& rhs) {
   auto lhs_begin = arrayref_begin_ignoring_leading_1s(lhs.sizes());
@@ -63,6 +115,17 @@ inline ElementwiseOptimizedPath select_broadcast_2d_by_1d_optimized_path(
     return ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments;
   }
 
+  int32_t broadcast_dim = get_broadcast_dim(lhs, rhs);
+  // Right now we don't handle last-dim broadcast.
+  if (broadcast_dim < -1) {
+    if (std::count_if(rhs_begin, rhs_end, [](Tensor::SizesType x) {
+          return x == 1;
+        }) == 1) {
+      return ElementwiseOptimizedPath::kBroadcastNdByNd;
+    } else {
+      return ElementwiseOptimizedPath::kBroadcastNdByNdReverseArguments;
+    }
+  }
   return ElementwiseOptimizedPath::kNone;
 }
 } // namespace internal
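
For reference, here is a minimal standalone sketch of the broadcast-dim detection that `get_broadcast_dim()` and `select_broadcast_optimized_path()` implement above. It works on plain shape vectors rather than ExecuTorch `Tensor`s (so leading 1s are assumed to be stripped already); the function name `find_broadcast_dim` and the example shapes are purely illustrative and not part of the change:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Sketch of the get_broadcast_dim() logic on plain shape vectors.
// Returns the single size-1 dim as a negative index from the end,
// or 0 when the ranks differ, more than one candidate dim is found,
// or the non-1 dims do not match.
int32_t find_broadcast_dim(
    const std::vector<int64_t>& lhs,
    const std::vector<int64_t>& rhs) {
  if (lhs.size() != rhs.size()) {
    return 0; // e.g. [1, 3, 4, 5] vs. [2, 3, 4, 5] is not handled
  }
  int32_t broadcast_dim = 0;
  const int64_t n = static_cast<int64_t>(lhs.size());
  // Scan from the last dim down to (but not including) the first dim,
  // mirroring the pointer walk in the kernel.
  for (int64_t i = n - 1; i > 0; i--) {
    if (lhs[i] == 1 || rhs[i] == 1) {
      if (broadcast_dim != 0) {
        return 0; // more than one broadcast dim: bail out
      }
      broadcast_dim = static_cast<int32_t>(i - n); // negative index
    } else if (lhs[i] != rhs[i]) {
      return 0; // non-1 dims must match
    }
  }
  return broadcast_dim;
}

int main() {
  // [2, 3, 4, 5] * [2, 1, 4, 5]: the size-1 dim sits at negative index -3.
  printf("%d\n", find_broadcast_dim({2, 3, 4, 5}, {2, 1, 4, 5})); // -3
  // [2, 3, 4, 5] * [2, 3, 4, 1]: last-dim broadcast returns -1, which the
  // selector above deliberately rejects (it requires broadcast_dim < -1).
  printf("%d\n", find_broadcast_dim({2, 3, 4, 5}, {2, 3, 4, 1})); // -1
  // Two size-1 dims: not handled, returns 0.
  printf("%d\n", find_broadcast_dim({2, 1, 4, 1}, {2, 3, 4, 5})); // 0
  return 0;
}
```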
@@ -85,7 +148,28 @@ ElementwiseOptimizedPath inline select_optimized_path(
         internal::sizes_match_ignoring_leading_1s(a.sizes(), b.sizes())))) {
     return ElementwiseOptimizedPath::kTreatAs1d;
   }
-  return internal::select_broadcast_2d_by_1d_optimized_path(a, b);
+  return internal::select_broadcast_optimized_path(a, b);
+}
+
+std::array<int32_t, 3> inline get_normalized_tensor_size(
+    const Tensor& a,
+    const int32_t broadcast_dim) {
+  ET_CHECK_MSG(
+      a.dim() > broadcast_dim,
+      "Size of tensor: %zd, must be larger than broadcast_dim: %d",
+      a.dim(),
+      broadcast_dim);
+  std::array<int32_t, 3> normalized_tensor_size;
+  normalized_tensor_size[0] = 1;
+  normalized_tensor_size[1] = a.size(broadcast_dim);
+  normalized_tensor_size[2] = 1;
+  for (size_t i = 0; i < broadcast_dim; i++) {
+    normalized_tensor_size[0] *= a.size(i);
+  }
+  for (size_t i = broadcast_dim + 1; i < a.dim(); i++) {
+    normalized_tensor_size[2] *= a.size(i);
+  }
+  return normalized_tensor_size;
 }
 
 } // namespace executor
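
Similarly, a sketch of what `get_normalized_tensor_size()` computes: an N-d shape collapsed to `[outer, size(broadcast_dim), inner]`, presumably so the NdByNd path can operate on a 3-d view of both tensors. This version again uses plain vectors instead of `Tensor`, takes `broadcast_dim` as a non-negative dim index (the negative index from `get_broadcast_dim` would have to be converted by the caller), and the name `normalize_size` and the shapes are illustrative only:

```cpp
#include <array>
#include <cstdint>
#include <cstdio>
#include <vector>

// Collapse an N-d shape into [outer, size(broadcast_dim), inner], where
// "outer" is the product of all dims before broadcast_dim and "inner" the
// product of all dims after it.
std::array<int32_t, 3> normalize_size(
    const std::vector<int64_t>& sizes,
    int32_t broadcast_dim) {
  std::array<int32_t, 3> out = {
      1, static_cast<int32_t>(sizes[broadcast_dim]), 1};
  for (int32_t i = 0; i < broadcast_dim; i++) {
    out[0] *= static_cast<int32_t>(sizes[i]);
  }
  for (size_t i = broadcast_dim + 1; i < sizes.size(); i++) {
    out[2] *= static_cast<int32_t>(sizes[i]);
  }
  return out;
}

int main() {
  // [2, 3, 4, 5] broadcast against [2, 1, 4, 5] (broadcast_dim == 1) is seen
  // by the kernel as a [2, 3, 20] tensor broadcast against [2, 1, 20].
  auto a = normalize_size({2, 3, 4, 5}, 1);
  auto b = normalize_size({2, 1, 4, 5}, 1);
  printf("[%d, %d, %d]\n", a[0], a[1], a[2]); // [2, 3, 20]
  printf("[%d, %d, %d]\n", b[0], b[1], b[2]); // [2, 1, 20]
  return 0;
}
```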