From 119cc80a7b19d7a8f33caa8d92f58783d70ca732 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 24 Feb 2025 17:03:46 -0800 Subject: [PATCH 1/3] Fix log_softmax along non-contiguous dim Pull Request resolved: https://github.com/pytorch/executorch/pull/8595 #8382 certainly didn't fix this problem (and added it on x86), but I don't think it was correct on ARM prior to that either. Added a regression test. ghstack-source-id: 268149462 @exported-using-ghexport Differential Revision: [D69928884](https://our.internmc.facebook.com/intern/diff/D69928884/) --- kernels/optimized/cpu/op_log_softmax.cpp | 19 ++++---- kernels/test/op_log_softmax_test.cpp | 57 ++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 8 deletions(-) diff --git a/kernels/optimized/cpu/op_log_softmax.cpp b/kernels/optimized/cpu/op_log_softmax.cpp index 1d2467bca5f..1822a06f29f 100644 --- a/kernels/optimized/cpu/op_log_softmax.cpp +++ b/kernels/optimized/cpu/op_log_softmax.cpp @@ -75,17 +75,20 @@ void log_softmax_kernel(const Tensor& input, int64_t dim, Tensor& out) { static_assert( std::is_same_v, "Below loop actually only supports float."); - const VecIn max_input_vec(max_input); - for (; d + VecOut::size() < dim_size; d += VecOut::size()) { - auto index = d * dim_stride; - auto in = VecIn::loadu(&input_data[index]); - auto out_ = (in - max_input_vec).exp(); - out_.store(&output_data[index]); + // It is not correct to vectorize if dim is not contiguous! + if (dim_stride == 1) { + const VecIn max_input_vec(max_input); + for (; d + VecOut::size() < dim_size; d += VecOut::size()) { + auto index = d * dim_stride; + auto in = VecIn::loadu(&input_data[index]); + auto out_ = (in - max_input_vec).exp(); + out_.store(&output_data[index]); #if defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE) - temp_sum += vaddvq_f32(out_); + temp_sum += vaddvq_f32(out_); #else - temp_sum += at::vec::vec_reduce_all(std::plus(), out_); + temp_sum += at::vec::vec_reduce_all(std::plus(), out_); #endif + } } for (; d < dim_size; ++d) { output_data[d * dim_stride] = diff --git a/kernels/test/op_log_softmax_test.cpp b/kernels/test/op_log_softmax_test.cpp index 94047592a80..1b01ff8a78d 100644 --- a/kernels/test/op_log_softmax_test.cpp +++ b/kernels/test/op_log_softmax_test.cpp @@ -72,6 +72,59 @@ class OpLogSoftmaxOutTest : public OperatorTest { EXPECT_TENSOR_CLOSE(out, expected); } } + + template + void test_dtype_noncontiguous_dim() { + TensorFactory tf; + + // Dim 0 must be longer than the vector width of the machine (for + // float, this is 4 for ARM64 and 8 for AVX2) to exhibit problems. 
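+    // With dim=0 on a contiguous {9, 3} tensor, dim_stride is 3, so the
+    // old vectorized path loaded contiguous runs of elements spanning
+    // three different softmax slices instead of stepping by the stride.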
+ // clang-format off + Tensor x = tf.make( + {9, 3}, + { + 0, 9, 18, + 1, 10, 19, + 2, 11, 20, + 3, 12, 21, + 4, 13, 22, + 5, 14, 23, + 6, 15, 24, + 7, 16, 25, + 8, 17, 26, + }); + // clang-format on + + Tensor out = tf.zeros({9, 3}); + + op_log_softmax_out(x, /*dim=*/0, /*half_to_float*/ false, out); + + // clang-format off + Tensor expected = tf.make( + {9, 3}, + { + -8.45855, -8.45855, -8.45855, + -7.45855, -7.45855, -7.45855, + -6.45855, -6.45855, -6.45855, + -5.45855, -5.45855, -5.45855, + -4.45855, -4.45855, -4.45855, + -3.45855, -3.45855, -3.45855, + -2.45855, -2.45855, -2.45855, + -1.45855, -1.45855, -1.45855, + -0.458552, -0.458552, -0.458552 + }); + // clang-format on + + if constexpr (DTYPE == ScalarType::BFloat16) { + EXPECT_TENSOR_CLOSE_WITH_TOL( + out, + expected, + 1e-2, + executorch::runtime::testing::internal::kDefaultAtol); + } else { + EXPECT_TENSOR_CLOSE(out, expected); + } + } }; TEST_F(OpLogSoftmaxOutTest, Smoke) { @@ -101,6 +154,10 @@ TEST_F(OpLogSoftmaxOutTest, AllDtypesSupported) { #undef TEST_ENTRY } +TEST_F(OpLogSoftmaxOutTest, NonContiguous) { + test_dtype_noncontiguous_dim(); +} + TEST_F(OpLogSoftmaxOutTest, MismatchedDimensionsDies) { if (SupportedFeatures::get()->is_aten) { GTEST_SKIP() << "ATen currently supports mismatched dimensions"; From 8184d04402ff35d88e4928610152b69a93a403f1 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 24 Feb 2025 17:03:46 -0800 Subject: [PATCH 2/3] Re-enable optimized gelu test in CMake Pull Request resolved: https://github.com/pytorch/executorch/pull/8597 I missed this line disabling the test. (Splitting out re-enable of log_softmax because I think that one needs fixes.) ghstack-source-id: 268149463 @exported-using-ghexport Differential Revision: [D69929122](https://our.internmc.facebook.com/intern/diff/D69929122/) --- kernels/test/CMakeLists.txt | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/kernels/test/CMakeLists.txt b/kernels/test/CMakeLists.txt index 4250f1f7581..63b92ab525b 100644 --- a/kernels/test/CMakeLists.txt +++ b/kernels/test/CMakeLists.txt @@ -66,7 +66,7 @@ foreach(kernel ${_kernels}) cp "${CMAKE_CURRENT_BINARY_DIR}/../../kernels/${kernel}/${kernel}_ops_lib/*.h" "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/" - DEPENDS "${kernel}_ops_lib" + DEPENDS "${kernel}_ops_lib" ) endforeach() @@ -278,10 +278,8 @@ set(_optimized_kernels_test_sources ${CMAKE_CURRENT_BINARY_DIR}/include/portable/executorch/kernels/test/supported_features.cpp ) -# We don't have sleef on OSS so we don't have gelu and log_softmax -list(REMOVE_ITEM _optimized_kernels_test_sources "op_gelu_test.cpp" - "op_log_softmax_test.cpp" -) +# We don't have sleef on OSS so we don't have log_softmax +list(REMOVE_ITEM _optimized_kernels_test_sources "op_log_softmax_test.cpp") et_cxx_test( optimized_kernels_test From f28abf2d57bd70e18435b7b148371f7c7bed0d5d Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 24 Feb 2025 17:03:47 -0800 Subject: [PATCH 3/3] Clean up optimized-oss.yaml Pull Request resolved: https://github.com/pytorch/executorch/pull/8549 We don't need this now that we support log_softmax and gelu in OSS! 
This deletes optimized-oss.yaml and the optimized_native_cpu_ops_oss target, and points the OSS CMake and Buck builds at optimized.yaml / optimized_native_cpu_ops instead.
ghstack-source-id: 268149465 @exported-using-ghexport Differential Revision: [D69475020](https://our.internmc.facebook.com/intern/diff/D69475020/) --- build/cmake_deps.toml | 6 +- configurations/CMakeLists.txt | 2 +- configurations/targets.bzl | 18 ---- examples/models/llama/runner/targets.bzl | 5 +- kernels/optimized/CMakeLists.txt | 4 +- kernels/optimized/cpu/targets.bzl | 8 +- kernels/optimized/op_registration_util.bzl | 6 +- kernels/optimized/optimized-oss.yaml | 96 ------------------- kernels/optimized/targets.bzl | 8 -- kernels/test/CMakeLists.txt | 6 +- .../optimized/op_registration_util.bzl | 23 +++-- 11 files changed, 27 insertions(+), 155 deletions(-) delete mode 100644 kernels/optimized/optimized-oss.yaml diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index c44fcf92ea6..21a8e282929 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -117,9 +117,9 @@ deps = [ "executorch", ] -[targets.optimized_native_cpu_ops_oss] +[targets.optimized_native_cpu_ops] buck_targets = [ - "//configurations:optimized_native_cpu_ops_oss", + "//configurations:optimized_native_cpu_ops", ] filters = [ ".cpp$", @@ -437,6 +437,6 @@ deps = [ "portable_kernels", "quantized_kernels", "xnnpack_backend", - "optimized_native_cpu_ops_oss", + "optimized_native_cpu_ops", ] # ---------------------------------- LLama end ---------------------------------- diff --git a/configurations/CMakeLists.txt b/configurations/CMakeLists.txt index eddb8b2a12c..462124a6ea6 100644 --- a/configurations/CMakeLists.txt +++ b/configurations/CMakeLists.txt @@ -30,7 +30,7 @@ include(${EXECUTORCH_ROOT}/build/Codegen.cmake) if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) # Merge optimized and portable definitions, taking optimized where available. merge_yaml( - FUNCTIONS_YAML ${EXECUTORCH_ROOT}/kernels/optimized/optimized-oss.yaml + FUNCTIONS_YAML ${EXECUTORCH_ROOT}/kernels/optimized/optimized.yaml FALLBACK_YAML ${EXECUTORCH_ROOT}/kernels/portable/functions.yaml OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR} ) diff --git a/configurations/targets.bzl b/configurations/targets.bzl index 6a5341c2904..5a39f7301ec 100644 --- a/configurations/targets.bzl +++ b/configurations/targets.bzl @@ -50,21 +50,3 @@ def define_common_targets(): "@EXECUTORCH_CLIENTS", ], ) - - # TODO(T183193812): delete this target after optimized-oss.yaml is gone - executorch_generated_lib( - name = "optimized_native_cpu_ops_oss", - deps = [ - "//executorch/kernels/optimized:optimized_operators", - "//executorch/kernels/optimized:optimized_oplist", - "//executorch/kernels/portable:executorch_aten_ops", - "//executorch/kernels/portable:operators", - ], - functions_yaml_target = "//executorch/kernels/optimized:optimized-oss.yaml", - fallback_yaml_target = "//executorch/kernels/portable:functions.yaml", - define_static_targets = True, - visibility = [ - "//executorch/examples/...", - "@EXECUTORCH_CLIENTS", - ], - ) diff --git a/examples/models/llama/runner/targets.bzl b/examples/models/llama/runner/targets.bzl index 9f095b93970..37827bb78a5 100644 --- a/examples/models/llama/runner/targets.bzl +++ b/examples/models/llama/runner/targets.bzl @@ -3,9 +3,6 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") def _get_operator_lib(aten = False): if aten: return ["//executorch/kernels/aten:generated_lib"] - elif runtime.is_oss: - # TODO(T183193812): delete this path after optimized-oss.yaml is no more. 
- return ["//executorch/configurations:optimized_native_cpu_ops_oss", "//executorch/extension/llm/custom_ops:custom_ops"] else: return ["//executorch/configurations:optimized_native_cpu_ops", "//executorch/extension/llm/custom_ops:custom_ops"] @@ -13,7 +10,7 @@ def get_qnn_dependency(): # buck build -c executorch.enable_qnn=true //executorch/examples/models/llama/runner:runner # Check if QNN is enabled before including the dependency if native.read_config("executorch", "enable_qnn", "false") == "true": - # //executorch/backends/qualcomm:qnn_executorch_backend doesn't work, + # //executorch/backends/qualcomm:qnn_executorch_backend doesn't work, # likely due to it's an empty library with dependency only return [ "//executorch/backends/qualcomm/runtime:runtime", diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt index 1f3aff57ecf..235c6738d9a 100644 --- a/kernels/optimized/CMakeLists.txt +++ b/kernels/optimized/CMakeLists.txt @@ -49,12 +49,12 @@ target_compile_options(cpublas PUBLIC ${_common_compile_options}) # Generate C++ bindings to register kernels into both PyTorch (for AOT) and # Executorch (for runtime). Here select all ops in optimized.yaml -set(_yaml "${CMAKE_CURRENT_LIST_DIR}/optimized-oss.yaml") +set(_yaml "${CMAKE_CURRENT_LIST_DIR}/optimized.yaml") gen_selected_ops(LIB_NAME "optimized_ops_lib" OPS_SCHEMA_YAML "${_yaml}") generate_bindings_for_kernels( LIB_NAME "optimized_ops_lib" FUNCTIONS_YAML - ${CMAKE_CURRENT_SOURCE_DIR}/optimized-oss.yaml + ${CMAKE_CURRENT_SOURCE_DIR}/optimized.yaml ADD_EXCEPTION_BOUNDARY ) message("Generated files ${gen_command_sources}") diff --git a/kernels/optimized/cpu/targets.bzl b/kernels/optimized/cpu/targets.bzl index 2a66407a5ce..83b2c320266 100644 --- a/kernels/optimized/cpu/targets.bzl +++ b/kernels/optimized/cpu/targets.bzl @@ -1,5 +1,5 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") -load("@fbsource//xplat/executorch/kernels/optimized:op_registration_util.bzl", "define_op_target", "is_op_disabled", "op_target") +load("@fbsource//xplat/executorch/kernels/optimized:op_registration_util.bzl", "define_op_target", "op_target") _OPTIMIZED_ATEN_OPS = ( op_target( @@ -111,13 +111,11 @@ def define_common_targets(): TARGETS and BUCK files that call this function. """ - enabled_ops = [op for op in _OPTIMIZED_ATEN_OPS if not is_op_disabled(op["name"])] - # Define build targets for all operators registered in the tables above. - for op in enabled_ops: + for op in _OPTIMIZED_ATEN_OPS: define_op_target(**op) - aten_op_targets = [":{}".format(op["name"]) for op in enabled_ops] + aten_op_targets = [":{}".format(op["name"]) for op in _OPTIMIZED_ATEN_OPS] all_op_targets = aten_op_targets runtime.cxx_library( diff --git a/kernels/optimized/op_registration_util.bzl b/kernels/optimized/op_registration_util.bzl index 12a5f012a38..3ac89132380 100644 --- a/kernels/optimized/op_registration_util.bzl +++ b/kernels/optimized/op_registration_util.bzl @@ -2,8 +2,8 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") load("@fbsource//xplat/executorch/build:selects.bzl", "selects") load( "@fbsource//xplat/executorch/kernels/optimized:lib_defs.bzl", - "get_vec_preprocessor_flags", "get_vec_deps", + "get_vec_preprocessor_flags", ) load( "@fbsource//xplat/executorch/kernels/portable:op_registration_util.bzl", @@ -137,7 +137,3 @@ def define_op_target(name, compiler_flags, deps): compiler_flags = compiler_flags, deps = deps, ) - -def is_op_disabled(name): - # All ops are enabled for internal builds. 
- return False diff --git a/kernels/optimized/optimized-oss.yaml b/kernels/optimized/optimized-oss.yaml deleted file mode 100644 index a24aa9ca173..00000000000 --- a/kernels/optimized/optimized-oss.yaml +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This yaml file contains operators that have optimized kernels available. -# Note that this is a copy of optimized.yaml that does not include log_softmax, -# due to the OSS build not currently including sleef. -# TODO (T183193812) - -- op: _fft_r2c.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_fft_r2c_out - -- op: add.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_add_out - -- op: add.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_add_scalar_out - -- op: bmm.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_bmm_out - -- op: div.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_div_out - -- op: div.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_div_scalar_out - -- op: exp.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_exp_out - -- op: sigmoid.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_sigmoid_out - -- op: gelu.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_gelu_out - -- op: le.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_le_scalar_out - -- op: le.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_le_tensor_out - -- op: linear.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_linear_out - -- op: mul.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_mul_out - -- op: mul.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_mul_scalar_out - -- op: native_layer_norm.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_native_layer_norm_out - -- op: neg.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_neg_out - -- op: sub.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_sub_out - -- op: sub.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_sub_scalar_out diff --git a/kernels/optimized/targets.bzl b/kernels/optimized/targets.bzl index 9978d4196dd..c655cb149a3 100644 --- a/kernels/optimized/targets.bzl +++ b/kernels/optimized/targets.bzl @@ -19,14 +19,6 @@ def define_common_targets(is_fbcode=False): ], ) - runtime.export_file( - name = "optimized-oss.yaml", - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - ) - runtime.cxx_library( name = "optimized_operators", srcs = [], diff --git a/kernels/test/CMakeLists.txt b/kernels/test/CMakeLists.txt index 63b92ab525b..24adb8d9c80 100644 --- a/kernels/test/CMakeLists.txt +++ b/kernels/test/CMakeLists.txt @@ -270,17 +270,15 @@ set(_optimized_kernels_test_sources "op_le_test.cpp" "op_linear_test.cpp" "op_log_softmax_test.cpp" + "op_mm_test.cpp" "op_mul_test.cpp" "op_native_layer_norm_test.cpp" "op_neg_test.cpp" "op_sub_test.cpp" "UnaryUfuncRealHBBF16ToFloatHBF16Test.cpp" - ${CMAKE_CURRENT_BINARY_DIR}/include/portable/executorch/kernels/test/supported_features.cpp + ${CMAKE_CURRENT_BINARY_DIR}/include/optimized/executorch/kernels/test/supported_features.cpp ) -# We don't have sleef on OSS so we don't have log_softmax -list(REMOVE_ITEM _optimized_kernels_test_sources "op_log_softmax_test.cpp") - et_cxx_test( optimized_kernels_test 
SOURCES diff --git a/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl index c70757e29b9..d48a22cee37 100644 --- a/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl +++ b/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl @@ -9,8 +9,13 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") load("@fbsource//xplat/executorch/build:selects.bzl", "selects") load( "@fbsource//xplat/executorch/kernels/optimized:lib_defs.bzl", + "get_vec_deps", "get_vec_preprocessor_flags", ) +load( + "@fbsource//xplat/executorch/kernels/portable:op_registration_util.bzl", + "get_compiler_optimization_flags", +) def op_target(name, deps = [], compiler_flags = []): """Registers an optimized implementation for an operator overload group. @@ -94,12 +99,17 @@ def define_op_library(name, compiler_flags, deps): "//executorch/kernels/test/...", "@EXECUTORCH_CLIENTS", ], - # kernels often have helpers with no prototypes just disabling the warning here as the headers - # are codegend and linked in later - compiler_flags = ["-Wno-missing-prototypes"], + compiler_flags = [ + # kernels often have helpers with no prototypes just disabling the warning here as the headers + # are codegend and linked in later + "-Wno-missing-prototypes", + # pragma unroll fails with -Os, don't need to warn us and + # fail Werror builds; see https://godbolt.org/z/zvf85vTsr + "-Wno-pass-failed", + ] + get_compiler_optimization_flags(), deps = [ "//executorch/runtime/kernel:kernel_includes", - ] + augmented_deps, + ] + augmented_deps + get_vec_deps(), preprocessor_flags = get_vec_preprocessor_flags(), # sleef needs to be added as a direct dependency of the operator target when building for Android, # or a linker error may occur. Not sure why this happens; it seems that fbandroid_platform_deps of @@ -134,8 +144,3 @@ def define_op_target(name, compiler_flags, deps): compiler_flags = compiler_flags, deps = deps, ) - -def is_op_disabled(name): - # TODO (gjcomer) Enable ops with sleef dependency in OSS - disabled_ops = ["op_log_softmax"] - return name in disabled_ops
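For illustration only, and not part of the patch series: a minimal, hypothetical standalone C++ sketch of the scalar, stride-aware path that the fixed kernel falls back to when dim_stride != 1. It reuses the 9x3 input from the new regression test (element (r, c) holds r + 9 * c) and should reproduce the expected -8.45855 ... -0.458552 column values; the dim_size/dim_stride names mirror the kernel, but the program below is self-contained and makes no use of ExecuTorch APIs.

// Hypothetical sketch, not part of the patch: log_softmax along dim 0 of a
// row-major 9x3 float matrix, where dim_stride == 3. The fixed kernel only
// vectorizes when dim_stride == 1; for any other stride each element of a
// slice has to be addressed as d * dim_stride, as done here.
#include <cmath>
#include <cstdio>

int main() {
  constexpr int dim_size = 9;   // length of the softmax dim (dim 0)
  constexpr int dim_stride = 3; // stride of dim 0 in a contiguous {9, 3} tensor
  float in[dim_size * dim_stride];
  float out[dim_size * dim_stride];
  // Same values as the regression test: element (r, c) holds r + 9 * c.
  for (int r = 0; r < dim_size; ++r) {
    for (int c = 0; c < dim_stride; ++c) {
      in[r * dim_stride + c] = static_cast<float>(r + 9 * c);
    }
  }
  // One softmax slice per column; slice c starts at offset c and its elements
  // are dim_stride apart, so contiguous vector loads would mix elements from
  // different slices.
  for (int c = 0; c < dim_stride; ++c) {
    const float* slice_in = in + c;
    float* slice_out = out + c;
    float max_input = slice_in[0];
    for (int d = 1; d < dim_size; ++d) {
      max_input = std::fmax(max_input, slice_in[d * dim_stride]);
    }
    float temp_sum = 0.0f;
    for (int d = 0; d < dim_size; ++d) {
      temp_sum += std::exp(slice_in[d * dim_stride] - max_input);
    }
    const float log_sum = std::log(temp_sum);
    for (int d = 0; d < dim_size; ++d) {
      slice_out[d * dim_stride] =
          slice_in[d * dim_stride] - max_input - log_sum;
    }
  }
  // Each column should read -8.45855, -7.45855, ..., -0.458552 top to bottom.
  for (int r = 0; r < dim_size; ++r) {
    std::printf("%9.5f %9.5f %9.5f\n",
                out[r * dim_stride + 0],
                out[r * dim_stride + 1],
                out[r * dim_stride + 2]);
  }
  return 0;
}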