Skip to content

Commit bf761db

Browse files
committed
Update on "[ExecuTorch] Add broadcast support for optimized add op"
Summary: This brings the add op to feature parity, with respect to broadcasting, with the mul op in the optimized kernels lib Test Plan: tests added Reviewers: Subscribers: Tasks: Tags: [ghstack-poisoned]
1 parent dbe3e8a commit bf761db

File tree

4 files changed

+65
-12
lines changed

4 files changed

+65
-12
lines changed

kernels/optimized/cpu/binary_ops.h

Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,38 @@ enum class ElementwiseOptimizedPath {
4949
kBroadcastLastDimReverseArguments,
5050
};
5151

52+
enum class BinaryOpType {
53+
kAdd,
54+
kSub,
55+
kMul,
56+
kDiv,
57+
};
58+
5259
namespace internal {
5360

61+
template <BinaryOpType op_type>
62+
struct BinaryOpTypeName;
63+
64+
template <>
65+
struct BinaryOpTypeName<BinaryOpType::kAdd> {
66+
static constexpr char kName[] = "add.out";
67+
};
68+
69+
template <>
70+
struct BinaryOpTypeName<BinaryOpType::kSub> {
71+
static constexpr char kName[] = "sub.out";
72+
};
73+
74+
template <>
75+
struct BinaryOpTypeName<BinaryOpType::kMul> {
76+
static constexpr char kName[] = "mul.out";
77+
};
78+
79+
template <>
80+
struct BinaryOpTypeName<BinaryOpType::kDiv> {
81+
static constexpr char kName[] = "div.out";
82+
};
83+
5484
/*
5585
Given two tensors, this function returns the broadcast dim if it exists.
5686
Returns 0 if no broadcast dim is found.
@@ -192,7 +222,7 @@ std::array<int32_t, 3> inline get_normalized_tensor_size(
192222
return normalized_tensor_size;
193223
}
194224

195-
template <typename Op>
225+
template <BinaryOpType op_type, typename Op>
196226
Tensor& handle_last_dim_broadcast_elementwise(
197227
KernelRuntimeContext& ctx,
198228
const Op& vec_fun,
@@ -221,7 +251,7 @@ Tensor& handle_last_dim_broadcast_elementwise(
221251
"Failed to resize output tensor.");
222252
const size_t outer_size = getLeadingDims(out, out.dim() - 1);
223253
const auto broadcast_size = out.size(out.dim() - 1);
224-
ET_SWITCH_REALB_TYPES(out_type, ctx, "mul.out", CTYPE, [&]() {
254+
ET_SWITCH_REALB_TYPES(out_type, ctx, internal::BinaryOpTypeName<op_type>::kName, CTYPE, [&]() {
225255
using Vec = executorch::vec::Vectorized<CTYPE>;
226256
CTYPE alpha_val;
227257
Vec alpha_val_vec(alpha_val);
@@ -246,7 +276,7 @@ Tensor& handle_last_dim_broadcast_elementwise(
246276
return out;
247277
}
248278

249-
template <typename Op>
279+
template <BinaryOpType op_type, typename Op>
250280
Tensor& handle_broadcast_elementwise(
251281
KernelRuntimeContext& ctx,
252282
const Op& vec_fun,
@@ -259,7 +289,7 @@ Tensor& handle_broadcast_elementwise(
259289
ElementwiseOptimizedPath::kBroadcastLastDim) ||
260290
(selected_optimized_path ==
261291
ElementwiseOptimizedPath::kBroadcastLastDimReverseArguments)) {
262-
return handle_last_dim_broadcast_elementwise(
292+
return handle_last_dim_broadcast_elementwise<op_type>(
263293
ctx, vec_fun, a, b, out, selected_optimized_path, alpha);
264294
}
265295

@@ -306,7 +336,7 @@ Tensor& handle_broadcast_elementwise(
306336
broadcast_size = lhs->sizes()[lhs->dim() - 2];
307337
inner_size = lhs->sizes()[lhs->dim() - 1];
308338
}
309-
ET_SWITCH_REALB_TYPES(out_type, ctx, "mul.out", CTYPE, [&]() {
339+
ET_SWITCH_REALB_TYPES(out_type, ctx, internal::BinaryOpTypeName<op_type>::kName, CTYPE, [&]() {
310340
using Vec = executorch::vec::Vectorized<CTYPE>;
311341
CTYPE alpha_val;
312342
Vec alpha_val_vec;

kernels/optimized/cpu/op_add.cpp

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -140,11 +140,31 @@ Tensor& opt_add_out(
140140
out.numel());
141141
});
142142
} else if (selected_optimized_path != ElementwiseOptimizedPath::kNone) {
143-
auto add_lambda = [](auto x, auto y, auto alpha_val) {
144-
return x + alpha_val * y;
145-
};
146-
return torch::executor::handle_broadcast_elementwise(
147-
ctx, add_lambda, a, b, out, selected_optimized_path, alpha);
143+
if (selected_optimized_path ==
144+
ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments ||
145+
selected_optimized_path ==
146+
ElementwiseOptimizedPath::kBroadcastLastDimReverseArguments ||
147+
selected_optimized_path ==
148+
ElementwiseOptimizedPath::kBroadcastNdByNdReverseArguments) {
149+
// This behavior is a bit confusing.
150+
// The reason we swap the args here is that handle_broadcast_elementwise
151+
// handles this selected_optimized_path option a bit differently.
152+
// This should really be resolved in handle_broadcast_elementwise.
153+
// However, the current blocker is that handle_broadcast_elementwise tries to
154+
// be agnostic of op. This should be fixed, likely by moving lambda creation
155+
// to handle_broadcast_elementwise and making it aware of which op is being executed.
156+
auto add_lambda = [](auto x, auto y, auto alpha_val) {
157+
return y + alpha_val * x;
158+
};
159+
return torch::executor::handle_broadcast_elementwise<BinaryOpType::kAdd>(
160+
ctx, add_lambda, a, b, out, selected_optimized_path, alpha);
161+
} else {
162+
auto add_lambda = [](auto x, auto y, auto alpha_val) {
163+
return x + alpha_val * y;
164+
};
165+
return torch::executor::handle_broadcast_elementwise<BinaryOpType::kAdd>(
166+
ctx, add_lambda, a, b, out, selected_optimized_path, alpha);
167+
}
148168
} else {
149169
ScalarType common_type =
150170
promoteTypes(a_type, b_type, /*half_to_float*/ true);

kernels/optimized/cpu/op_mul.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ Tensor& opt_mul_out(
135135
(void)alpha;
136136
return x * y;
137137
};
138-
return torch::executor::handle_broadcast_elementwise(
138+
return torch::executor::handle_broadcast_elementwise<BinaryOpType::kMul>(
139139
ctx, mul_lambda, a, b, out, selected_optimized_path);
140140
} else {
141141
ScalarType common_type =

kernels/test/op_add_test.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,10 @@ class OpAddOutKernelTest : public OperatorTest {
129129

130130
// Check that it matches the expected output.
131131
EXPECT_TENSOR_CLOSE(op_add_out(a, b, 1.0, out), expected);
132-
EXPECT_TENSOR_CLOSE(op_add_out(b, a, 1.0, out), expected);
132+
expected = tf_a.make(
133+
{2, 2, 3},
134+
/*data=*/{3.5, 6, 8.5, 8, 10.5, 13, 15.5, 18, 20.5, 20, 22.5, 25});
135+
EXPECT_TENSOR_CLOSE(op_add_out(b, a, 1.5, out), expected);
133136
}
134137

135138
template <ScalarType DTYPE>

0 commit comments

Comments
 (0)