
Commit 7ea55eb

Update on "[ExecuTorch] Add broadcast support for optimized add op"

Summary: This brings the add op to feature parity with the mul op, with respect to broadcasting, in the optimized kernels lib.

Test Plan: tests added.

cc larryliu0820 manuelcandales

[ghstack-poisoned]

2 parents: 0ce8fd7 + 00e54b8
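Editor's note: the core of this refactor is replacing the BinaryOpType enum and its BinaryOpTypeName trait with a const char* non-type template parameter carrying the op name. A minimal standalone sketch of that C++ mechanism follows — illustrative names (kAddName, log_op), not the ExecuTorch source:

#include <cstdio>

// A string literal cannot be a template argument directly, but an array
// with static storage duration can: a pointer to its first element is a
// valid non-type template argument. Function-local "static constexpr"
// arrays (as op_add.cpp uses below) also qualify since C++17.
static constexpr const char kAddName[] = "add.out";

template <const char* op_name>
void log_op() {
  // op_name is usable wherever a compile-time const char* is expected.
  std::printf("running %s\n", op_name);
}

int main() {
  log_op<kAddName>();  // prints "running add.out"
  return 0;
}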

File tree

4 files changed: +24 -60 lines changed

kernels/optimized/cpu/binary_ops.h

Lines changed: 10 additions & 40 deletions
@@ -49,38 +49,8 @@ enum class ElementwiseOptimizedPath {
   kBroadcastLastDimReverseArguments,
 };
 
-enum class BinaryOpType {
-  kAdd,
-  kSub,
-  kMul,
-  kDiv,
-};
-
 namespace internal {
 
-template <BinaryOpType op_type>
-struct BinaryOpTypeName;
-
-template <>
-struct BinaryOpTypeName<BinaryOpType::kAdd> {
-  static constexpr char kName[] = "add.out";
-};
-
-template <>
-struct BinaryOpTypeName<BinaryOpType::kSub> {
-  static constexpr char kName[] = "sub.out";
-};
-
-template <>
-struct BinaryOpTypeName<BinaryOpType::kMul> {
-  static constexpr char kName[] = "mul.out";
-};
-
-template <>
-struct BinaryOpTypeName<BinaryOpType::kDiv> {
-  static constexpr char kName[] = "div.out";
-};
-
 /*
 Given two tensors, this function returns the broadcast dim if it exists.
 Returns 0 if no broadcast dim is found.
@@ -222,15 +192,15 @@ std::array<int32_t, 3> inline get_normalized_tensor_size(
   return normalized_tensor_size;
 }
 
-template <BinaryOpType op_type, typename Op>
+template <const char* op_name, typename Op>
 Tensor& handle_last_dim_broadcast_elementwise(
     KernelRuntimeContext& ctx,
     const Op& vec_fun,
     const Tensor& a,
     const Tensor& b,
     Tensor& out,
     const ElementwiseOptimizedPath selected_optimized_path,
-    executorch::aten::optional<Scalar>& alpha = {}) {
+    const executorch::aten::optional<Scalar>& alpha = {}) {
   ScalarType out_type = out.scalar_type();
   const Tensor* lhs;
   const Tensor* rhs;
@@ -251,11 +221,11 @@ Tensor& handle_last_dim_broadcast_elementwise(
       "Failed to resize output tensor.");
   const size_t outer_size = getLeadingDims(out, out.dim() - 1);
   const auto broadcast_size = out.size(out.dim() - 1);
-  ET_SWITCH_REALB_TYPES(out_type, ctx, internal::BinaryOpTypeName<op_type>::kName, CTYPE, [&]() {
+  ET_SWITCH_REALB_TYPES(out_type, ctx, op_name, CTYPE, [&]() {
     using Vec = executorch::vec::Vectorized<CTYPE>;
-    CTYPE alpha_val;
-    Vec alpha_val_vec(alpha_val);
+    Vec alpha_val_vec;
     if (alpha.has_value()) {
+      CTYPE alpha_val;
       ET_KERNEL_CHECK(
           ctx,
           native::utils::extract_scalar(alpha.value(), &alpha_val),
@@ -276,20 +246,20 @@ Tensor& handle_last_dim_broadcast_elementwise(
   return out;
 }
 
-template <BinaryOpType op_type, typename Op>
+template <const char* op_name, typename Op>
 Tensor& handle_broadcast_elementwise(
     KernelRuntimeContext& ctx,
     const Op& vec_fun,
     const Tensor& a,
     const Tensor& b,
     Tensor& out,
     const ElementwiseOptimizedPath selected_optimized_path,
-    executorch::aten::optional<Scalar> alpha = {}) {
+    const executorch::aten::optional<Scalar>& alpha = {}) {
   if ((selected_optimized_path ==
        ElementwiseOptimizedPath::kBroadcastLastDim) ||
       (selected_optimized_path ==
        ElementwiseOptimizedPath::kBroadcastLastDimReverseArguments)) {
-    return handle_last_dim_broadcast_elementwise<op_type>(
+    return handle_last_dim_broadcast_elementwise<op_name>(
        ctx, vec_fun, a, b, out, selected_optimized_path, alpha);
  }
 
@@ -336,11 +306,11 @@ Tensor& handle_broadcast_elementwise(
     broadcast_size = lhs->sizes()[lhs->dim() - 2];
     inner_size = lhs->sizes()[lhs->dim() - 1];
   }
-  ET_SWITCH_REALB_TYPES(out_type, ctx, internal::BinaryOpTypeName<op_type>::kName, CTYPE, [&]() {
+  ET_SWITCH_REALB_TYPES(out_type, ctx, op_name, CTYPE, [&]() {
     using Vec = executorch::vec::Vectorized<CTYPE>;
-    CTYPE alpha_val;
     Vec alpha_val_vec;
     if (alpha.has_value()) {
+      CTYPE alpha_val;
       ET_KERNEL_CHECK(
           ctx,
           native::utils::extract_scalar(alpha.value(), &alpha_val),

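Editor's note: besides the template change, these hunks fix a latent bug — the old code constructed Vec alpha_val_vec(alpha_val) from an alpha_val that was never initialized when alpha was empty. An illustrative sketch of the before/after pattern, using a stand-in Vec type rather than the real executorch::vec::Vectorized:

#include <optional>

struct Vec {
  float v;
  Vec() : v(0.0f) {}
  explicit Vec(float x) : v(x) {}  // broadcasts a scalar across all lanes
};

void sketch(std::optional<float> alpha) {
  // Before: alpha_val was read while still indeterminate whenever
  // alpha was empty:
  //   CTYPE alpha_val;
  //   Vec alpha_val_vec(alpha_val);  // reads an uninitialized value
  //
  // After: default-construct the vector, and only declare and extract
  // alpha_val on the branch that actually has a value.
  Vec alpha_val_vec;
  if (alpha.has_value()) {
    float alpha_val = *alpha;
    alpha_val_vec = Vec(alpha_val);
  }
}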
kernels/optimized/cpu/op_add.cpp

Lines changed: 8 additions & 7 deletions
@@ -140,29 +140,30 @@ Tensor& opt_add_out(
         out.numel());
       });
   } else if (selected_optimized_path != ElementwiseOptimizedPath::kNone) {
+    static constexpr const char op_name[] = "add.out";
     if (selected_optimized_path ==
             ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments ||
         selected_optimized_path ==
             ElementwiseOptimizedPath::kBroadcastLastDimReverseArguments ||
         selected_optimized_path ==
             ElementwiseOptimizedPath::kBroadcastNdByNdReverseArguments) {
-      // This behavior is a bit confusing.
       // Reason we swap out args here is because handle_broadcast_elementwise
       // handles this selected_optimized_path option a bit differently.
-      // This should really be resoled in handle_broadcast_elementwise.
-      // However, the current blocker is that handle_broadcast_elementwise tries to
-      // be agnostic of op. This should be fixed, likely by moving lambda creation
-      // to handle_broadcast_elementwise and it be aware of which op is being executed.
+      // This should really be resolved in handle_broadcast_elementwise.
+      // However, the current blocker is that handle_broadcast_elementwise tries
+      // to be agnostic of op. This should be fixed, likely by moving lambda
+      // creation to handle_broadcast_elementwise and it be aware of which op is
+      // being executed.
       auto add_lambda = [](auto x, auto y, auto alpha_val) {
         return y + alpha_val * x;
       };
-      return torch::executor::handle_broadcast_elementwise<BinaryOpType::kAdd>(
+      return torch::executor::handle_broadcast_elementwise<op_name>(
           ctx, add_lambda, a, b, out, selected_optimized_path, alpha);
     } else {
       auto add_lambda = [](auto x, auto y, auto alpha_val) {
         return x + alpha_val * y;
       };
-      return torch::executor::handle_broadcast_elementwise<BinaryOpType::kAdd>(
+      return torch::executor::handle_broadcast_elementwise<op_name>(
          ctx, add_lambda, a, b, out, selected_optimized_path, alpha);
    }
  } else {

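Editor's note: the comment in this hunk explains the argument swap; here is a small self-contained sketch of why the two lambdas differ, assuming add.out computes out = a + alpha * b, as the lambdas above imply:

#include <cassert>

int main() {
  int a = 10, b = 3, alpha = 2;

  // Forward path: the helper passes (x = a, y = b).
  auto add_lambda = [](auto x, auto y, auto alpha_val) {
    return x + alpha_val * y;
  };

  // Reverse-arguments path: the helper passes (x = b, y = a), so the
  // lambda swaps the roles of x and y to keep alpha attached to b.
  auto add_lambda_rev = [](auto x, auto y, auto alpha_val) {
    return y + alpha_val * x;
  };

  assert(add_lambda(a, b, alpha) == a + alpha * b);      // 10 + 2*3 = 16
  assert(add_lambda_rev(b, a, alpha) == a + alpha * b);  // 10 + 2*3 = 16
  return 0;
}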
kernels/optimized/cpu/op_mul.cpp

Lines changed: 6 additions & 3 deletions
@@ -130,12 +130,15 @@ Tensor& opt_mul_out(
         out.numel());
       });
   } else if (selected_optimized_path != ElementwiseOptimizedPath::kNone) {
-    // Reason for using alpha:
+    // Reason for using alpha even when used for mul is because
+    // handle_broadcast_elementwise is used for add and sub as well
+    // and it uses alpha.
     auto mul_lambda = [](auto x, auto y, auto alpha) {
-      (void)alpha;
+      [[maybe_unused]] alpha;
       return x * y;
     };
-    return torch::executor::handle_broadcast_elementwise<BinaryOpType::kMul>(
+    static constexpr const char op_name[] = "mul.out";
+    return torch::executor::handle_broadcast_elementwise<op_name>(
        ctx, mul_lambda, a, b, out, selected_optimized_path);
  } else {
    ScalarType common_type =

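Editor's note: because handle_broadcast_elementwise is shared with add and sub, every op lambda takes an alpha argument even when, as for mul, it is ignored. A sketch of the more idiomatic placement for the attribute — on the parameter itself — since an attribute placed on an expression statement, as in the diff's added line, is typically ignored by compilers:

// Mark the unused parameter directly so the compiler knows it is
// intentionally ignored; the body stays a plain multiply.
auto mul_lambda = [](auto x, auto y, [[maybe_unused]] auto alpha) {
  return x * y;
};

int main() {
  return mul_lambda(6, 7, 0) == 42 ? 0 : 1;
}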
kernels/test/op_mul_test.cpp

Lines changed: 0 additions & 10 deletions
@@ -417,16 +417,6 @@ TEST_F(OpMulOutTest, BroadcastA2BTest) {
   test_broadcast_a2b<ScalarType::Int>();
   test_broadcast_a2b<ScalarType::Half>();
   test_broadcast_a2b<ScalarType::BFloat16>();
-
-  // Test 3D tensors
-  test_broadcast_3D<ScalarType::Float>();
-  test_broadcast_3D<ScalarType::Half>();
-  test_broadcast_3D<ScalarType::BFloat16>();
-
-  // Test 4D tensors
-  test_broadcast_4D<ScalarType::Float>();
-  test_broadcast_4D<ScalarType::Half>();
-  test_broadcast_4D<ScalarType::BFloat16>();
 }
 
 // Broadcast tensor a's size to tensor b's size
