Commit bc93992

[Executorch][Optimized] Use portable's impl for optimized op_sub's fallback
This way we can take advantage of the size and build optimization efforts in the portable kernels. On one of the builds it saves X KB in app size when the optimized lib is built with -Os.

Differential Revision: [D65666037](https://our.internmc.facebook.com/intern/diff/D65666037/)

ghstack-source-id: 252537302
Pull Request resolved: #6737
1 parent 35348d8 commit bc93992
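
The change follows a simple pattern: keep the optimized kernel's vectorized fast path, and route every other case through the portable kernel's shared implementation, so the general-case dtype-dispatch code is compiled into the binary once instead of twice. Below is a minimal sketch of the idea in plain C++, with hypothetical names (the real code operates on ExecuTorch Tensors and uses the ET_SWITCH_* dispatch macros visible in the diff below):

```cpp
#include <cstddef>
#include <vector>

// "Portable" reference implementation: slower, but handles every case,
// including broadcasting a size-1 b across a. Hypothetical stand-in for
// the real sub_out_impl in executorch/kernels/portable/cpu/op_sub_impl.h.
void portable_sub_impl(const std::vector<float>& a,
                       const std::vector<float>& b,
                       float alpha,
                       std::vector<float>& out) {
  for (std::size_t i = 0; i < a.size(); ++i) {
    const float bi = (b.size() == 1) ? b[0] : b[i];
    out[i] = a[i] - alpha * bi;
  }
}

// "Optimized" kernel: keeps only its same-shape fast path and delegates the
// rest, so the general-case code exists once in the binary; that is the
// app-size win the commit message describes.
void optimized_sub(const std::vector<float>& a,
                   const std::vector<float>& b,
                   float alpha,
                   std::vector<float>& out) {
  if (a.size() == b.size()) {
    // Stand-in for the vectorized fast path in opt_sub_out.
    for (std::size_t i = 0; i < a.size(); ++i) {
      out[i] = a[i] - alpha * b[i];
    }
  } else {
    portable_sub_impl(a, b, alpha, out);  // shared fallback
  }
}
```

A side benefit of this structure: any size or build optimization applied to the portable implementation is inherited by the optimized kernel's fallback path for free.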

File tree

kernels/optimized/cpu/op_sub.cpp
kernels/optimized/cpu/targets.bzl

2 files changed: +4 -54 lines

kernels/optimized/cpu/op_sub.cpp

Lines changed: 3 additions & 54 deletions
@@ -9,6 +9,7 @@
 #include <executorch/kernels/optimized/cpu/binary_ops.h>
 #include <executorch/kernels/optimized/vec/functional.h>
 #include <executorch/kernels/optimized/vec/vec.h>
+#include <executorch/kernels/portable/cpu/op_sub_impl.h>
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
 #include <executorch/kernels/portable/cpu/util/broadcast_util.h>
 #include <executorch/runtime/core/exec_aten/util/tensor_util.h>
@@ -210,35 +211,7 @@ Tensor& opt_sub_out(
       }
     });
   } else {
-    ScalarType common_type =
-        promoteTypes(a_type, b_type, /*half_to_float*/ true);
-    ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out);
-
-    ET_KERNEL_CHECK(
-        ctx,
-        resize_to_broadcast_target_size(a, b, out) == Error::Ok,
-        InvalidArgument,
-        out);
-
-    ET_SWITCH_REALH_TYPES(a_type, ctx, "sub.out", CTYPE_A, [&]() {
-      ET_SWITCH_REALH_TYPES(b_type, ctx, "sub.out", CTYPE_B, [&]() {
-        using CTYPE_IN = typename torch::executor::
-            promote_types<CTYPE_A, CTYPE_B, /*half_to_float*/ true>::type;
-        ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
-        ET_SWITCH_REALH_TYPES(out_type, ctx, "sub.out", CTYPE_OUT, [&]() {
-          CTYPE_IN alpha_val;
-          ET_KERNEL_CHECK(
-              ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, );
-
-          SubInner<
-              can_cast<CTYPE_IN, CTYPE_OUT>::value,
-              CTYPE_A,
-              CTYPE_B,
-              CTYPE_IN,
-              CTYPE_OUT>::run(a, b, alpha_val, out);
-        });
-      });
-    });
+    sub_out_impl(ctx, a, b, alpha, out);
   }
 
   return out;
@@ -290,31 +263,7 @@ Tensor& opt_sub_scalar_out(
       });
     });
   } else {
-    ET_SWITCH_REALH_TYPES(a_type, ctx, "sub.Scalar_out", CTYPE_A, [&]() {
-      ET_SWITCH_SCALAR_OBJ_REAL_TYPES(
-          b_type, ctx, "sub.Scalar_out", CTYPE_B, [&]() {
-            ET_SWITCH_REAL_TYPES(
-                common_type, ctx, "sub.Scalar_out", CTYPE_IN, [&]() {
-                  ET_SWITCH_REALH_TYPES(
-                      out_type, ctx, "sub.Scalar_out", CTYPE_OUT, [&]() {
-                        CTYPE_B b_val;
-                        ET_EXTRACT_SCALAR(b, b_val);
-                        CTYPE_IN b_casted = static_cast<CTYPE_IN>(b_val);
-                        CTYPE_IN alpha_val;
-                        ET_EXTRACT_SCALAR(alpha, alpha_val);
-
-                        const size_t n = a.numel();
-                        const CTYPE_A* a_data = a.const_data_ptr<CTYPE_A>();
-                        CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
-                        for (auto i = 0; i < n; ++i) {
-                          out_data[i] = static_cast<CTYPE_OUT>(
-                              static_cast<CTYPE_IN>(a_data[i]) -
-                              alpha_val * b_casted);
-                        }
-                      });
-                });
-          });
-    });
+    sub_scalar_out_impl(ctx, a, b, alpha, out);
   }
 
   return out;
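
For reference, the two calls introduced above (sub_out_impl and sub_scalar_out_impl) come from the newly included portable header. The following is a sketch of the declarations as inferred from the call sites in this diff; the parameter types are an assumption based on opt_sub_out's own signature, and the authoritative versions live in executorch/kernels/portable/cpu/op_sub_impl.h:

```cpp
// Assumed declarations, inferred from the call sites above; not copied
// from op_sub_impl.h.
Tensor& sub_out_impl(        // out = a - alpha * b, with b a Tensor
    KernelRuntimeContext& ctx,
    const Tensor& a,
    const Tensor& b,
    const Scalar& alpha,
    Tensor& out);

Tensor& sub_scalar_out_impl( // out = a - alpha * b, with b a Scalar
    KernelRuntimeContext& ctx,
    const Tensor& a,
    const Scalar& b,
    const Scalar& alpha,
    Tensor& out);
```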

kernels/optimized/cpu/targets.bzl

Lines changed: 1 addition & 0 deletions
@@ -88,6 +88,7 @@ _OPTIMIZED_ATEN_OPS = (
         name = "op_sub",
         deps = [
             ":binary_ops",
+            "//executorch/kernels/portable/cpu:op_sub_impl",
             "//executorch/kernels/portable/cpu:scalar_utils",
             "//executorch/kernels/portable/cpu/util:broadcast_util",
         ],
