6 changes: 3 additions & 3 deletions build/cmake_deps.toml
@@ -117,9 +117,9 @@ deps = [
"executorch",
]

[targets.optimized_native_cpu_ops_oss]
[targets.optimized_native_cpu_ops]
buck_targets = [
"//configurations:optimized_native_cpu_ops_oss",
"//configurations:optimized_native_cpu_ops",
]
filters = [
".cpp$",
@@ -437,6 +437,6 @@ deps = [
"portable_kernels",
"quantized_kernels",
"xnnpack_backend",
"optimized_native_cpu_ops_oss",
"optimized_native_cpu_ops",
]
# ---------------------------------- LLama end ----------------------------------
2 changes: 1 addition & 1 deletion configurations/CMakeLists.txt
@@ -30,7 +30,7 @@ include(${EXECUTORCH_ROOT}/build/Codegen.cmake)
if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
# Merge optimized and portable definitions, taking optimized where available.
merge_yaml(
FUNCTIONS_YAML ${EXECUTORCH_ROOT}/kernels/optimized/optimized-oss.yaml
FUNCTIONS_YAML ${EXECUTORCH_ROOT}/kernels/optimized/optimized.yaml
FALLBACK_YAML ${EXECUTORCH_ROOT}/kernels/portable/functions.yaml OUTPUT_DIR
${CMAKE_CURRENT_BINARY_DIR}
)
18 changes: 0 additions & 18 deletions configurations/targets.bzl
@@ -50,21 +50,3 @@ def define_common_targets():
"@EXECUTORCH_CLIENTS",
],
)

# TODO(T183193812): delete this target after optimized-oss.yaml is gone
executorch_generated_lib(
name = "optimized_native_cpu_ops_oss",
deps = [
"//executorch/kernels/optimized:optimized_operators",
"//executorch/kernels/optimized:optimized_oplist",
"//executorch/kernels/portable:executorch_aten_ops",
"//executorch/kernels/portable:operators",
],
functions_yaml_target = "//executorch/kernels/optimized:optimized-oss.yaml",
fallback_yaml_target = "//executorch/kernels/portable:functions.yaml",
define_static_targets = True,
visibility = [
"//executorch/examples/...",
"@EXECUTORCH_CLIENTS",
],
)
5 changes: 1 addition & 4 deletions examples/models/llama/runner/targets.bzl
@@ -3,17 +3,14 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
def _get_operator_lib(aten = False):
if aten:
return ["//executorch/kernels/aten:generated_lib"]
elif runtime.is_oss:
# TODO(T183193812): delete this path after optimized-oss.yaml is no more.
return ["//executorch/configurations:optimized_native_cpu_ops_oss", "//executorch/extension/llm/custom_ops:custom_ops"]
else:
return ["//executorch/configurations:optimized_native_cpu_ops", "//executorch/extension/llm/custom_ops:custom_ops"]

def get_qnn_dependency():
# buck build -c executorch.enable_qnn=true //executorch/examples/models/llama/runner:runner
# Check if QNN is enabled before including the dependency
if native.read_config("executorch", "enable_qnn", "false") == "true":
# //executorch/backends/qualcomm:qnn_executorch_backend doesn't work,
# //executorch/backends/qualcomm:qnn_executorch_backend doesn't work,
# likely because it is an empty library that only carries dependencies
return [
"//executorch/backends/qualcomm/runtime:runtime",
4 changes: 2 additions & 2 deletions kernels/optimized/CMakeLists.txt
@@ -49,12 +49,12 @@ target_compile_options(cpublas PUBLIC ${_common_compile_options})

# Generate C++ bindings to register kernels into both PyTorch (for AOT) and
# Executorch (for runtime). Here select all ops in optimized.yaml
set(_yaml "${CMAKE_CURRENT_LIST_DIR}/optimized-oss.yaml")
set(_yaml "${CMAKE_CURRENT_LIST_DIR}/optimized.yaml")
gen_selected_ops(LIB_NAME "optimized_ops_lib" OPS_SCHEMA_YAML "${_yaml}")

generate_bindings_for_kernels(
LIB_NAME "optimized_ops_lib" FUNCTIONS_YAML
${CMAKE_CURRENT_SOURCE_DIR}/optimized-oss.yaml
${CMAKE_CURRENT_SOURCE_DIR}/optimized.yaml
ADD_EXCEPTION_BOUNDARY
)
message("Generated files ${gen_command_sources}")
19 changes: 11 additions & 8 deletions kernels/optimized/cpu/op_log_softmax.cpp
@@ -75,17 +75,20 @@ void log_softmax_kernel(const Tensor& input, int64_t dim, Tensor& out) {
static_assert(
std::is_same_v<OUT_T, float>,
"Below loop actually only supports float.");
const VecIn max_input_vec(max_input);
for (; d + VecOut::size() < dim_size; d += VecOut::size()) {
auto index = d * dim_stride;
auto in = VecIn::loadu(&input_data[index]);
auto out_ = (in - max_input_vec).exp();
out_.store(&output_data[index]);
// It is not correct to vectorize if dim is not contiguous!
if (dim_stride == 1) {
const VecIn max_input_vec(max_input);
for (; d + VecOut::size() < dim_size; d += VecOut::size()) {
auto index = d * dim_stride;
auto in = VecIn::loadu(&input_data[index]);
auto out_ = (in - max_input_vec).exp();
out_.store(&output_data[index]);
#if defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE)
temp_sum += vaddvq_f32(out_);
temp_sum += vaddvq_f32(out_);
#else
temp_sum += at::vec::vec_reduce_all<float>(std::plus<VecOut>(), out_);
temp_sum += at::vec::vec_reduce_all<float>(std::plus<VecOut>(), out_);
#endif
}
}
for (; d < dim_size; ++d) {
output_data[d * dim_stride] =
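The new `dim_stride == 1` guard exists because `VecIn::loadu` reads a packed run of consecutive floats; when the softmax dim is not the innermost one, the elements being reduced sit `dim_stride` floats apart, so a packed load mixes values from unrelated slices. A minimal standalone sketch of the indexing problem (plain C++ with a hypothetical row-major {9, 3} buffer; illustrative only, not the ExecuTorch kernel):

// Sketch: why a packed vector load is wrong when dim_stride != 1.
// Assumptions (hypothetical): a row-major {9, 3} float buffer, reducing over dim 0.
#include <array>
#include <cstdio>

int main() {
  constexpr int rows = 9, cols = 3;
  std::array<float, rows * cols> data{};
  for (int i = 0; i < rows * cols; ++i) {
    data[i] = static_cast<float>(i);
  }

  // Walking dim 0 of a row-major {9, 3} tensor means a stride of `cols` floats.
  const int dim_stride = cols;

  // Correct: gather the strided elements of column 0 one at a time.
  std::printf("strided column 0:        ");
  for (int d = 0; d < 4; ++d) {
    std::printf("%g ", data[d * dim_stride]);  // 0 3 6 9
  }

  // Wrong: a packed 4-wide load at &data[0] (what a loadu-style call would do)
  // touches four consecutive floats, i.e. entries of three different columns.
  std::printf("\npacked load at &data[0]: ");
  for (int i = 0; i < 4; ++i) {
    std::printf("%g ", data[i]);  // 0 1 2 3 -- mixes columns
  }
  std::printf("\n");
  return 0;
}

With the guard in place, the non-contiguous case simply skips the vectorized loop and falls through to the scalar loop that follows it.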
8 changes: 3 additions & 5 deletions kernels/optimized/cpu/targets.bzl
@@ -1,5 +1,5 @@
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
load("@fbsource//xplat/executorch/kernels/optimized:op_registration_util.bzl", "define_op_target", "is_op_disabled", "op_target")
load("@fbsource//xplat/executorch/kernels/optimized:op_registration_util.bzl", "define_op_target", "op_target")

_OPTIMIZED_ATEN_OPS = (
op_target(
@@ -111,13 +111,11 @@ def define_common_targets():
TARGETS and BUCK files that call this function.
"""

enabled_ops = [op for op in _OPTIMIZED_ATEN_OPS if not is_op_disabled(op["name"])]

# Define build targets for all operators registered in the tables above.
for op in enabled_ops:
for op in _OPTIMIZED_ATEN_OPS:
define_op_target(**op)

aten_op_targets = [":{}".format(op["name"]) for op in enabled_ops]
aten_op_targets = [":{}".format(op["name"]) for op in _OPTIMIZED_ATEN_OPS]
all_op_targets = aten_op_targets

runtime.cxx_library(
6 changes: 1 addition & 5 deletions kernels/optimized/op_registration_util.bzl
@@ -2,8 +2,8 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
load("@fbsource//xplat/executorch/build:selects.bzl", "selects")
load(
"@fbsource//xplat/executorch/kernels/optimized:lib_defs.bzl",
"get_vec_preprocessor_flags",
"get_vec_deps",
"get_vec_preprocessor_flags",
)
load(
"@fbsource//xplat/executorch/kernels/portable:op_registration_util.bzl",
@@ -137,7 +137,3 @@ def define_op_target(name, compiler_flags, deps):
compiler_flags = compiler_flags,
deps = deps,
)

def is_op_disabled(name):
# All ops are enabled for internal builds.
return False
96 changes: 0 additions & 96 deletions kernels/optimized/optimized-oss.yaml

This file was deleted.

8 changes: 0 additions & 8 deletions kernels/optimized/targets.bzl
@@ -19,14 +19,6 @@ def define_common_targets(is_fbcode=False):
],
)

runtime.export_file(
name = "optimized-oss.yaml",
visibility = [
"//executorch/...",
"@EXECUTORCH_CLIENTS",
],
)

runtime.cxx_library(
name = "optimized_operators",
srcs = [],
10 changes: 3 additions & 7 deletions kernels/test/CMakeLists.txt
@@ -66,7 +66,7 @@ foreach(kernel ${_kernels})
cp
"${CMAKE_CURRENT_BINARY_DIR}/../../kernels/${kernel}/${kernel}_ops_lib/*.h"
"${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/"
DEPENDS "${kernel}_ops_lib"
DEPENDS "${kernel}_ops_lib"
)
endforeach()

@@ -270,17 +270,13 @@ set(_optimized_kernels_test_sources
"op_le_test.cpp"
"op_linear_test.cpp"
"op_log_softmax_test.cpp"
"op_mm_test.cpp"
"op_mul_test.cpp"
"op_native_layer_norm_test.cpp"
"op_neg_test.cpp"
"op_sub_test.cpp"
"UnaryUfuncRealHBBF16ToFloatHBF16Test.cpp"
${CMAKE_CURRENT_BINARY_DIR}/include/portable/executorch/kernels/test/supported_features.cpp
)

# We don't have sleef on OSS so we don't have gelu and log_softmax
list(REMOVE_ITEM _optimized_kernels_test_sources "op_gelu_test.cpp"
"op_log_softmax_test.cpp"
${CMAKE_CURRENT_BINARY_DIR}/include/optimized/executorch/kernels/test/supported_features.cpp
)

et_cxx_test(
Expand Down
57 changes: 57 additions & 0 deletions kernels/test/op_log_softmax_test.cpp
@@ -72,6 +72,59 @@ class OpLogSoftmaxOutTest : public OperatorTest {
EXPECT_TENSOR_CLOSE(out, expected);
}
}

template <class CTYPE, executorch::aten::ScalarType DTYPE>
void test_dtype_noncontiguous_dim() {
TensorFactory<DTYPE> tf;

// Dim 0 must be longer than the vector width of the machine (for
// float, this is 4 for ARM64 and 8 for AVX2) to exhibit problems.
// clang-format off
Tensor x = tf.make(
{9, 3},
{
0, 9, 18,
1, 10, 19,
2, 11, 20,
3, 12, 21,
4, 13, 22,
5, 14, 23,
6, 15, 24,
7, 16, 25,
8, 17, 26,
});
// clang-format on

Tensor out = tf.zeros({9, 3});

op_log_softmax_out(x, /*dim=*/0, /*half_to_float*/ false, out);

// clang-format off
Tensor expected = tf.make(
{9, 3},
{
-8.45855, -8.45855, -8.45855,
-7.45855, -7.45855, -7.45855,
-6.45855, -6.45855, -6.45855,
-5.45855, -5.45855, -5.45855,
-4.45855, -4.45855, -4.45855,
-3.45855, -3.45855, -3.45855,
-2.45855, -2.45855, -2.45855,
-1.45855, -1.45855, -1.45855,
-0.458552, -0.458552, -0.458552
});
// clang-format on

if constexpr (DTYPE == ScalarType::BFloat16) {
EXPECT_TENSOR_CLOSE_WITH_TOL(
out,
expected,
1e-2,
executorch::runtime::testing::internal::kDefaultAtol);
} else {
EXPECT_TENSOR_CLOSE(out, expected);
}
}
};

TEST_F(OpLogSoftmaxOutTest, Smoke) {
@@ -101,6 +154,10 @@ TEST_F(OpLogSoftmaxOutTest, AllDtypesSupported) {
#undef TEST_ENTRY
}

TEST_F(OpLogSoftmaxOutTest, NonContiguous) {
test_dtype_noncontiguous_dim<float, ScalarType::Float>();
}

TEST_F(OpLogSoftmaxOutTest, MismatchedDimensionsDies) {
if (SupportedFeatures::get()->is_aten) {
GTEST_SKIP() << "ATen currently supports mismatched dimensions";
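For reference, the `expected` values in the NonContiguous test follow directly from log_softmax(x)_d = x_d - log Σ_k exp(x_k) taken down dim 0: each column of the {9, 3} input is {0, …, 8} plus a per-column constant, the constant cancels, so every column yields the same outputs, from -8.45855 at row 0 to -0.458552 at row 8. The dim-0 length of 9 also exceeds the 4- and 8-wide float vectors mentioned in the test comment, so the (previously buggy) vectorized path is actually exercised. A small self-contained check (a sketch using plain loops with double accumulation; not the actual test harness):

// Reproduce the expected log-softmax values used in the NonContiguous test.
// Sketch only: plain loops, double accumulation, not the ExecuTorch op.
#include <cmath>
#include <cstdio>

int main() {
  constexpr int rows = 9, cols = 3;
  float x[rows][cols];
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < cols; ++c) {
      x[r][c] = static_cast<float>(c * rows + r);  // column c holds {9c, ..., 9c + 8}
    }
  }

  // log_softmax over dim 0: subtract logsumexp down each column (max-shifted
  // for numerical stability, matching the kernel's approach).
  for (int c = 0; c < cols; ++c) {
    const double max = x[rows - 1][c];  // columns are increasing, so the last row is the max
    double sum = 0.0;
    for (int r = 0; r < rows; ++r) {
      sum += std::exp(static_cast<double>(x[r][c]) - max);
    }
    const double lse = max + std::log(sum);
    // Prints -8.45855 and -0.458552 for every column, matching `expected`.
    std::printf("column %d: row 0 -> %.5f, row 8 -> %.6f\n",
                c, x[0][c] - lse, x[rows - 1][c] - lse);
  }
  return 0;
}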