
Commit 2f09fcd

swolchok authored and hinriksnaer committed
Use elementwise_util instead of functional_util in op_neg, delete optimized op (pytorch#11660)
Allows vectorization (checked assembly to make sure; it's different from the old optimized op because the optimized op wasn't parallelized), so we can delete the optimized op as well.
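
For intuition, here is a minimal standalone C++ sketch of the pattern the commit message alludes to: an elementwise op written as a tight loop over contiguous data with an inlinable per-element lambda, which an optimizing compiler can typically auto-vectorize. This is not ExecuTorch code; apply_elementwise and the sample data below are illustrative assumptions only.

#include <cstddef>
#include <cstdio>

// Illustrative only -- not ExecuTorch code. An elementwise negation written as
// a tight loop over contiguous data with an inlinable per-element lambda.
// Each iteration is independent, so an optimizing compiler can usually
// auto-vectorize the loop (e.g. into NEON/AVX instructions), which is the kind
// of codegen improvement the commit message describes checking in the assembly.
template <typename T, typename Op>
void apply_elementwise(const T* in, T* out, std::size_t n, Op op) {
  for (std::size_t i = 0; i < n; ++i) {
    out[i] = op(in[i]);
  }
}

int main() {
  float in[8] = {1.f, -2.f, 3.f, -4.f, 5.f, -6.f, 7.f, -8.f};
  float out[8];
  apply_elementwise(in, out, 8, [](float v) { return -v; });
  std::printf("out[0] = %f\n", out[0]);  // prints out[0] = -1.000000
  return 0;
}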
1 parent 6a72709 commit 2f09fcd

File tree

6 files changed: +14, -62 lines


kernels/optimized/cpu/op_neg.cpp

Lines changed: 0 additions & 42 deletions
This file was deleted.

kernels/optimized/optimized.yaml

Lines changed: 0 additions & 5 deletions
@@ -92,11 +92,6 @@
     - arg_meta: null
       kernel_name: torch::executor::opt_native_layer_norm_out
 
-- op: neg.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::opt_neg_out
-
 - op: sub.out
   kernels:
     - arg_meta: null

kernels/portable/cpu/op_neg.cpp

Lines changed: 12 additions & 7 deletions
@@ -6,7 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include <executorch/kernels/portable/cpu/util/functional_util.h>
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <executorch/runtime/platform/assert.h>
 
@@ -33,12 +33,17 @@ Tensor& neg_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
   ET_KERNEL_CHECK(
       ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
 
-  ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, "neg.out", CTYPE, [&] {
-    apply_unary_map_fn(
-        [](const CTYPE val_in) { return static_cast<CTYPE>(-val_in); },
-        in.const_data_ptr<CTYPE>(),
-        out.mutable_data_ptr<CTYPE>(),
-        in.numel());
+  static constexpr const char op_name[] = "neg.out";
+  ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&] {
+    utils::internal::apply_unitensor_elementwise_fn<
+        CTYPE,
+        op_name,
+        utils::SupportedTensorDtypes::SAME_AS_COMMON>(
+        [](const auto val_in) { return -val_in; },
+        ctx,
+        in,
+        utils::SupportedTensorDtypes::REALHBF16,
+        out);
   });
 
   return out;

kernels/test/targets.bzl

Lines changed: 1 addition & 1 deletion
@@ -276,7 +276,7 @@ def define_common_targets():
     _common_op_test("op_native_group_norm_test", ["aten", "portable"])
     _common_op_test("op_native_layer_norm_test", ["aten", "portable", "optimized"])
     _common_op_test("op_ne_test", ["aten", "portable"])
-    _common_op_test("op_neg_test", ["aten", "portable", "optimized"])
+    _common_op_test("op_neg_test", ["aten", "portable"])
     _common_op_test("op_nonzero_test", ["aten", "portable"])
     _common_op_test("op_ones_test", ["aten", "portable"])
     _common_op_test("op_pdist_forward_test", ["aten", "portable"])

shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl

Lines changed: 0 additions & 6 deletions
@@ -259,12 +259,6 @@ OPTIMIZED_ATEN_OPS = (
             "//executorch/runtime/core/portable_type/c10/c10:aten_headers_for_executorch",
         ],
     ),
-    op_target(
-        name = "op_neg",
-        deps = [
-            "//executorch/runtime/core/portable_type/c10/c10:aten_headers_for_executorch",
-        ],
-    ),
     op_target(
         name = "op_sub",
         deps = [

shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl

Lines changed: 1 addition & 1 deletion
@@ -918,7 +918,7 @@ ATEN_OPS = (
     op_target(
         name = "op_neg",
         deps = [
-            "//executorch/kernels/portable/cpu/util:functional_util",
+            "//executorch/kernels/portable/cpu/util:elementwise_util",
         ],
     ),
     op_target(
