
Commit baa8660

Update on "[Executorch] Re-enable operator optimization flags"

The previous attempt at this was reverted due to an app-size increase, much of which came from op_div exploding in size. The two diffs underneath this one solve that issue.

Differential Revision: [D65606666](https://our.internmc.facebook.com/intern/diff/D65606666/)

[ghstack-poisoned]
2 parents cceac32 + 820d440 commit baa8660
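
The size win comes from the pattern visible in the diffs below: the optimized kernel keeps its fast path but delegates its fallback to a single shared implementation, which the portable kernel also wraps, so the heavily templated slow-path code is compiled once instead of twice. A minimal sketch of that structure, with scalar floats standing in for the real Tensor/Scalar types:

```cpp
#include <cstdio>

// Shared fallback: the heavily templated slow path lives here, compiled once.
float sub_impl(float a, float b, float alpha) {
  return a - alpha * b;
}

// "Optimized" kernel: fast path when possible, shared fallback otherwise.
float opt_sub(float a, float b, float alpha, bool fast_path_ok) {
  if (fast_path_ok) {
    return a - alpha * b;  // stand-in for the vectorized path
  }
  return sub_impl(a, b, alpha);  // delegate instead of duplicating the slow path
}

// "Portable" kernel: a thin wrapper over the same shared implementation.
float portable_sub(float a, float b, float alpha) {
  return sub_impl(a, b, alpha);
}

int main() {
  std::printf("%g\n", opt_sub(5.0f, 2.0f, 1.5f, false));  // prints 2
  std::printf("%g\n", portable_sub(5.0f, 2.0f, 1.5f));    // prints 2
}
```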

File tree: 7 files changed (+196, −152)


kernels/optimized/cpu/op_sub.cpp (3 additions, 54 deletions)
```diff
@@ -9,6 +9,7 @@
 #include <executorch/kernels/optimized/cpu/binary_ops.h>
 #include <executorch/kernels/optimized/vec/functional.h>
 #include <executorch/kernels/optimized/vec/vec.h>
+#include <executorch/kernels/portable/cpu/op_sub_impl.h>
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
 #include <executorch/kernels/portable/cpu/util/broadcast_util.h>
 #include <executorch/runtime/core/exec_aten/util/tensor_util.h>
@@ -210,35 +211,7 @@ Tensor& opt_sub_out(
       }
     });
   } else {
-    ScalarType common_type =
-        promoteTypes(a_type, b_type, /*half_to_float*/ true);
-    ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out);
-
-    ET_KERNEL_CHECK(
-        ctx,
-        resize_to_broadcast_target_size(a, b, out) == Error::Ok,
-        InvalidArgument,
-        out);
-
-    ET_SWITCH_REALH_TYPES(a_type, ctx, "sub.out", CTYPE_A, [&]() {
-      ET_SWITCH_REALH_TYPES(b_type, ctx, "sub.out", CTYPE_B, [&]() {
-        using CTYPE_IN = typename torch::executor::
-            promote_types<CTYPE_A, CTYPE_B, /*half_to_float*/ true>::type;
-        ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
-        ET_SWITCH_REALH_TYPES(out_type, ctx, "sub.out", CTYPE_OUT, [&]() {
-          CTYPE_IN alpha_val;
-          ET_KERNEL_CHECK(
-              ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, );
-
-          SubInner<
-              can_cast<CTYPE_IN, CTYPE_OUT>::value,
-              CTYPE_A,
-              CTYPE_B,
-              CTYPE_IN,
-              CTYPE_OUT>::run(a, b, alpha_val, out);
-        });
-      });
-    });
+    sub_out_impl(ctx, a, b, alpha, out);
   }
 
   return out;
@@ -290,31 +263,7 @@ Tensor& opt_sub_scalar_out(
       });
     });
   } else {
-    ET_SWITCH_REALH_TYPES(a_type, ctx, "sub.Scalar_out", CTYPE_A, [&]() {
-      ET_SWITCH_SCALAR_OBJ_REAL_TYPES(
-          b_type, ctx, "sub.Scalar_out", CTYPE_B, [&]() {
-            ET_SWITCH_REAL_TYPES(
-                common_type, ctx, "sub.Scalar_out", CTYPE_IN, [&]() {
-                  ET_SWITCH_REALH_TYPES(
-                      out_type, ctx, "sub.Scalar_out", CTYPE_OUT, [&]() {
-                        CTYPE_B b_val;
-                        ET_EXTRACT_SCALAR(b, b_val);
-                        CTYPE_IN b_casted = static_cast<CTYPE_IN>(b_val);
-                        CTYPE_IN alpha_val;
-                        ET_EXTRACT_SCALAR(alpha, alpha_val);
-
-                        const size_t n = a.numel();
-                        const CTYPE_A* a_data = a.const_data_ptr<CTYPE_A>();
-                        CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
-                        for (auto i = 0; i < n; ++i) {
-                          out_data[i] = static_cast<CTYPE_OUT>(
-                              static_cast<CTYPE_IN>(a_data[i]) -
-                              alpha_val * b_casted);
-                        }
-                      });
-                });
-          });
-    });
+    sub_scalar_out_impl(ctx, a, b, alpha, out);
   }
 
   return out;
```
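
The deleted fallback selected an implementation at compile time via can_cast<CTYPE_IN, CTYPE_OUT>::value, a common trick for keeping the full ET_SWITCH dtype cross-product compiling even for combinations that are rejected at runtime. A simplified sketch of that idiom, with hypothetical scalar types in place of the real tensor machinery:

```cpp
#include <cstdio>
#include <type_traits>

// Real work: chosen when the In -> Out conversion is allowed.
template <bool kCanCast, typename In, typename Out>
struct SubInner {
  static void run(In a, In b, In alpha, Out* out) {
    *out = static_cast<Out>(a - alpha * b);
  }
};

// Stub for invalid In -> Out combinations: keeps every instantiation in the
// type cross-product compiling while staying unreachable at runtime.
template <typename In, typename Out>
struct SubInner<false, In, Out> {
  static void run(In, In, In, Out*) {}
};

int main() {
  int out = 0;
  SubInner<std::is_convertible<float, int>::value, float, int>::run(
      5.0f, 2.0f, 1.5f, &out);
  std::printf("%d\n", out);  // prints 2
}
```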

kernels/optimized/cpu/targets.bzl (1 addition, 0 deletions)
```diff
@@ -88,6 +88,7 @@ _OPTIMIZED_ATEN_OPS = (
         name = "op_sub",
         deps = [
             ":binary_ops",
+            "//executorch/kernels/portable/cpu:op_sub_impl",
             "//executorch/kernels/portable/cpu:scalar_utils",
             "//executorch/kernels/portable/cpu/util:broadcast_util",
         ],
```
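
For this new dependency to resolve, the portable cpu targets.bzl presumably also gains an op_sub_impl library target that compiles op_sub_impl.cpp and exports op_sub_impl.h; that part of the change is not shown in this capture.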

kernels/portable/cpu/op_sub.cpp (4 additions, 93 deletions)
```diff
@@ -11,6 +11,8 @@
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <executorch/runtime/platform/assert.h>
 
+#include <executorch/kernels/portable/cpu/op_sub_impl.h>
+
 namespace torch {
 namespace executor {
 namespace native {
@@ -21,55 +23,7 @@ Tensor& sub_out(
     const Tensor& b,
     const Scalar& alpha,
     Tensor& out) {
-  ScalarType alpha_type = utils::get_scalar_dtype(alpha);
-
-  // Check alpha type
-  ET_KERNEL_CHECK(ctx, alpha_type != ScalarType::Bool, InvalidArgument, out);
-
-  // Common Dtype
-  ScalarType common_type = promoteTypes(a.scalar_type(), b.scalar_type());
-
-  // Check Common Dtype
-  ET_KERNEL_CHECK(
-      ctx,
-      (canCast(common_type, out.scalar_type()) &&
-       canCast(alpha_type, common_type)),
-      InvalidArgument,
-      out);
-
-  // Check Dim Order
-  ET_KERNEL_CHECK(
-      ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out);
-
-  // Resize
-  ET_KERNEL_CHECK(
-      ctx,
-      resize_to_broadcast_target_size(a, b, out) == Error::Ok,
-      InvalidArgument,
-      out);
-
-  // Compute Dtype
-  ScalarType compute_type = utils::get_compute_type(common_type);
-
-  // @lint-ignore CLANGTIDY facebook-hte-CArray
-  static constexpr const char op_name[] = "sub.out";
-
-  ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
-    const CTYPE_COMPUTE val_alpha = utils::scalar_to<CTYPE_COMPUTE>(alpha);
-    utils::apply_bitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
-        [val_alpha](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
-          return val_a - val_alpha * val_b;
-        },
-        ctx,
-        a,
-        utils::SupportedTensorDtypes::REALHBF16,
-        b,
-        utils::SupportedTensorDtypes::REALHBF16,
-        out,
-        utils::SupportedTensorDtypes::REALHBF16);
-  });
-
-  return out;
+  return sub_out_impl(ctx, a, b, alpha, out);
 }
 
 Tensor& sub_scalar_out(
@@ -78,50 +32,7 @@ Tensor& sub_scalar_out(
     const Scalar& b,
     const Scalar& alpha,
     Tensor& out) {
-  ScalarType alpha_type = utils::get_scalar_dtype(alpha);
-
-  // Check alpha type
-  ET_KERNEL_CHECK(ctx, alpha_type != ScalarType::Bool, InvalidArgument, out);
-
-  // Common Dtype
-  ScalarType common_type = utils::promote_type_with_scalar(a.scalar_type(), b);
-
-  // Check Common Dtype
-  ET_KERNEL_CHECK(
-      ctx,
-      (common_type == out.scalar_type() && canCast(alpha_type, common_type)),
-      InvalidArgument,
-      out);
-
-  // Check Dim Order
-  ET_KERNEL_CHECK(
-      ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out);
-
-  // Resize
-  ET_KERNEL_CHECK(
-      ctx, resize_tensor(out, a.sizes()) == Error::Ok, InvalidArgument, out);
-
-  // Compute Dtype
-  ScalarType compute_type = utils::get_compute_type(common_type);
-
-  // @lint-ignore CLANGTIDY facebook-hte-CArray
-  static constexpr const char op_name[] = "sub.Scalar_out";
-
-  ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
-    const CTYPE_COMPUTE val_b = utils::scalar_to<CTYPE_COMPUTE>(b);
-    const CTYPE_COMPUTE val_alpha = utils::scalar_to<CTYPE_COMPUTE>(alpha);
-    utils::apply_unitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
-        [val_b, val_alpha](const CTYPE_COMPUTE val_a) {
-          return val_a - val_alpha * val_b;
-        },
-        ctx,
-        a,
-        utils::SupportedTensorDtypes::REALHBF16,
-        out,
-        utils::SupportedTensorDtypes::SAME_AS_COMMON);
-  });
-
-  return out;
+  return sub_scalar_out_impl(ctx, a, b, alpha, out);
 }
 
 } // namespace native
```
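
Both entry points are now one-line wrappers; the math they compute is unchanged: out[i] = a[i] - alpha * b[i], with broadcasting in the tensor variant. A tiny illustrative reference loop, not the actual kernel:

```cpp
#include <array>
#include <cstddef>
#include <cstdio>

int main() {
  // Reference semantics of sub.out: out[i] = a[i] - alpha * b[i].
  const std::array<float, 4> a{1.0f, 2.0f, 3.0f, 4.0f};
  const std::array<float, 4> b{0.5f, 0.5f, 0.5f, 0.5f};
  const float alpha = 2.0f;
  std::array<float, 4> out{};
  for (std::size_t i = 0; i < a.size(); ++i) {
    out[i] = a[i] - alpha * b[i];
  }
  for (float v : out) {
    std::printf("%g ", v);  // prints: 0 1 2 3
  }
  std::printf("\n");
}
```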
kernels/portable/cpu/op_sub_impl.cpp (new file, 131 additions; file name not shown in the capture, inferred from op_sub_impl.h and the new build dependency)
```cpp
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>

#include <executorch/kernels/portable/cpu/op_sub_impl.h>

namespace torch {
namespace executor {
namespace native {

Tensor& sub_out_impl(
    KernelRuntimeContext& ctx,
    const Tensor& a,
    const Tensor& b,
    const Scalar& alpha,
    Tensor& out) {
  ScalarType alpha_type = utils::get_scalar_dtype(alpha);

  // Check alpha type
  ET_KERNEL_CHECK(ctx, alpha_type != ScalarType::Bool, InvalidArgument, out);

  // Common Dtype
  ScalarType common_type = promoteTypes(a.scalar_type(), b.scalar_type());

  // Check Common Dtype
  ET_KERNEL_CHECK(
      ctx,
      (canCast(common_type, out.scalar_type()) &&
       canCast(alpha_type, common_type)),
      InvalidArgument,
      out);

  // Check Dim Order
  ET_KERNEL_CHECK(
      ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out);

  // Resize
  ET_KERNEL_CHECK(
      ctx,
      resize_to_broadcast_target_size(a, b, out) == Error::Ok,
      InvalidArgument,
      out);

  // Compute Dtype
  ScalarType compute_type = utils::get_compute_type(common_type);

  // @lint-ignore CLANGTIDY facebook-hte-CArray
  static constexpr const char op_name[] = "sub.out";

  ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
    const CTYPE_COMPUTE val_alpha = utils::scalar_to<CTYPE_COMPUTE>(alpha);
    utils::apply_bitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
        [val_alpha](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
          return val_a - val_alpha * val_b;
        },
        ctx,
        a,
        utils::SupportedTensorDtypes::REALHBF16,
        b,
        utils::SupportedTensorDtypes::REALHBF16,
        out,
        utils::SupportedTensorDtypes::REALHBF16);
  });

  return out;
}

Tensor& sub_scalar_out_impl(
    KernelRuntimeContext& ctx,
    const Tensor& a,
    const Scalar& b,
    const Scalar& alpha,
    Tensor& out) {
  ScalarType alpha_type = utils::get_scalar_dtype(alpha);

  // Check alpha type
  ET_KERNEL_CHECK(ctx, alpha_type != ScalarType::Bool, InvalidArgument, out);

  // Common Dtype
  ScalarType common_type = utils::promote_type_with_scalar(a.scalar_type(), b);

  // Check Common Dtype
  ET_KERNEL_CHECK(
      ctx,
      (common_type == out.scalar_type() && canCast(alpha_type, common_type)),
      InvalidArgument,
      out);

  // Check Dim Order
  ET_KERNEL_CHECK(
      ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out);

  // Resize
  ET_KERNEL_CHECK(
      ctx, resize_tensor(out, a.sizes()) == Error::Ok, InvalidArgument, out);

  // Compute Dtype
  ScalarType compute_type = utils::get_compute_type(common_type);

  // @lint-ignore CLANGTIDY facebook-hte-CArray
  static constexpr const char op_name[] = "sub.Scalar_out";

  ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
    const CTYPE_COMPUTE val_b = utils::scalar_to<CTYPE_COMPUTE>(b);
    const CTYPE_COMPUTE val_alpha = utils::scalar_to<CTYPE_COMPUTE>(alpha);
    utils::apply_unitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
        [val_b, val_alpha](const CTYPE_COMPUTE val_a) {
          return val_a - val_alpha * val_b;
        },
        ctx,
        a,
        utils::SupportedTensorDtypes::REALHBF16,
        out,
        utils::SupportedTensorDtypes::SAME_AS_COMMON);
  });

  return out;
}

} // namespace native
} // namespace executor
} // namespace torch
```
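
The shared implementation routes all arithmetic through a per-element lambda handed to utils::apply_bitensor_elementwise_fn, which owns the dtype-conversion and broadcasting logic. A stripped-down sketch of that style of helper over plain arrays, with a hypothetical signature rather than the real utility's:

```cpp
#include <cstddef>
#include <cstdio>

// Minimal sketch: apply a binary op elementwise over two equal-length arrays.
// The real utility additionally handles broadcasting and mixed dtypes.
template <typename T, typename Op>
void apply_bitensor_elementwise(
    Op op, const T* a, const T* b, T* out, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) {
    out[i] = op(a[i], b[i]);
  }
}

int main() {
  const float a[] = {1.0f, 2.0f, 3.0f};
  const float b[] = {4.0f, 5.0f, 6.0f};
  float out[3];
  const float alpha = 0.5f;
  // Same lambda shape as in sub_out_impl: a - alpha * b.
  apply_bitensor_elementwise(
      [alpha](float va, float vb) { return va - alpha * vb; }, a, b, out, 3);
  for (float v : out) {
    std::printf("%g ", v);  // prints: -1 -0.5 0
  }
  std::printf("\n");
}
```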

kernels/portable/cpu/op_sub_impl.h (new file, 34 additions)
```cpp
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>

namespace torch {
namespace executor {
namespace native {

Tensor& sub_out_impl(
    KernelRuntimeContext& ctx,
    const Tensor& a,
    const Tensor& b,
    const Scalar& alpha,
    Tensor& out);

Tensor& sub_scalar_out_impl(
    KernelRuntimeContext& ctx,
    const Tensor& a,
    const Scalar& b,
    const Scalar& alpha,
    Tensor& out);

} // namespace native
} // namespace executor
} // namespace torch
```
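
Both kernel libraries now compile against these two declarations. Inside the shared implementation, ET_SWITCH_REAL_TYPES performs the runtime dtype dispatch; conceptually it expands to a switch over ScalarType that instantiates the lambda once per concrete C type. A simplified model, not the actual macro:

```cpp
#include <cstdio>

enum class ScalarType { Float, Double };

// Simplified model of ET_SWITCH_REAL_TYPES: dispatch a generic lambda on a
// runtime dtype tag. The real macro covers many more types and error paths.
template <typename Fn>
void switch_real_types(ScalarType t, Fn fn) {
  switch (t) {
    case ScalarType::Float:
      fn(float{});
      break;
    case ScalarType::Double:
      fn(double{});
      break;
  }
}

int main() {
  switch_real_types(ScalarType::Float, [](auto tag) {
    using CTYPE_COMPUTE = decltype(tag);
    const CTYPE_COMPUTE out =
        CTYPE_COMPUTE(5) - CTYPE_COMPUTE(1.5) * CTYPE_COMPUTE(2);
    std::printf("%g\n", static_cast<double>(out));  // prints 2
  });
}
```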
