From 119cc80a7b19d7a8f33caa8d92f58783d70ca732 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 24 Feb 2025 17:03:46 -0800 Subject: [PATCH 1/3] Fix log_softmax along non-contiguous dim Pull Request resolved: https://github.com/pytorch/executorch/pull/8595 #8382 certainly didn't fix this problem (and added it on x86), but I don't think it was correct on ARM prior to that either. Added a regression test. ghstack-source-id: 268149462 @exported-using-ghexport Differential Revision: [D69928884](https://our.internmc.facebook.com/intern/diff/D69928884/) --- kernels/optimized/cpu/op_log_softmax.cpp | 19 ++++---- kernels/test/op_log_softmax_test.cpp | 57 ++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 8 deletions(-) diff --git a/kernels/optimized/cpu/op_log_softmax.cpp b/kernels/optimized/cpu/op_log_softmax.cpp index 1d2467bca5f..1822a06f29f 100644 --- a/kernels/optimized/cpu/op_log_softmax.cpp +++ b/kernels/optimized/cpu/op_log_softmax.cpp @@ -75,17 +75,20 @@ void log_softmax_kernel(const Tensor& input, int64_t dim, Tensor& out) { static_assert( std::is_same_v, "Below loop actually only supports float."); - const VecIn max_input_vec(max_input); - for (; d + VecOut::size() < dim_size; d += VecOut::size()) { - auto index = d * dim_stride; - auto in = VecIn::loadu(&input_data[index]); - auto out_ = (in - max_input_vec).exp(); - out_.store(&output_data[index]); + // It is not correct to vectorize if dim is not contiguous! + if (dim_stride == 1) { + const VecIn max_input_vec(max_input); + for (; d + VecOut::size() < dim_size; d += VecOut::size()) { + auto index = d * dim_stride; + auto in = VecIn::loadu(&input_data[index]); + auto out_ = (in - max_input_vec).exp(); + out_.store(&output_data[index]); #if defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE) - temp_sum += vaddvq_f32(out_); + temp_sum += vaddvq_f32(out_); #else - temp_sum += at::vec::vec_reduce_all(std::plus(), out_); + temp_sum += at::vec::vec_reduce_all(std::plus(), out_); #endif + } } for (; d < dim_size; ++d) { output_data[d * dim_stride] = diff --git a/kernels/test/op_log_softmax_test.cpp b/kernels/test/op_log_softmax_test.cpp index 94047592a80..1b01ff8a78d 100644 --- a/kernels/test/op_log_softmax_test.cpp +++ b/kernels/test/op_log_softmax_test.cpp @@ -72,6 +72,59 @@ class OpLogSoftmaxOutTest : public OperatorTest { EXPECT_TENSOR_CLOSE(out, expected); } } + + template + void test_dtype_noncontiguous_dim() { + TensorFactory tf; + + // Dim 0 must be longer than the vector width of the machine (for + // float, this is 4 for ARM64 and 8 for AVX2) to exhibit problems. 
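+    // With dim=0 on a contiguous {9, 3} tensor, dim_stride is 3, so the
+    // old vectorized path loaded contiguous runs of elements spanning
+    // three different softmax slices instead of stepping by the stride.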
+ // clang-format off + Tensor x = tf.make( + {9, 3}, + { + 0, 9, 18, + 1, 10, 19, + 2, 11, 20, + 3, 12, 21, + 4, 13, 22, + 5, 14, 23, + 6, 15, 24, + 7, 16, 25, + 8, 17, 26, + }); + // clang-format on + + Tensor out = tf.zeros({9, 3}); + + op_log_softmax_out(x, /*dim=*/0, /*half_to_float*/ false, out); + + // clang-format off + Tensor expected = tf.make( + {9, 3}, + { + -8.45855, -8.45855, -8.45855, + -7.45855, -7.45855, -7.45855, + -6.45855, -6.45855, -6.45855, + -5.45855, -5.45855, -5.45855, + -4.45855, -4.45855, -4.45855, + -3.45855, -3.45855, -3.45855, + -2.45855, -2.45855, -2.45855, + -1.45855, -1.45855, -1.45855, + -0.458552, -0.458552, -0.458552 + }); + // clang-format on + + if constexpr (DTYPE == ScalarType::BFloat16) { + EXPECT_TENSOR_CLOSE_WITH_TOL( + out, + expected, + 1e-2, + executorch::runtime::testing::internal::kDefaultAtol); + } else { + EXPECT_TENSOR_CLOSE(out, expected); + } + } }; TEST_F(OpLogSoftmaxOutTest, Smoke) { @@ -101,6 +154,10 @@ TEST_F(OpLogSoftmaxOutTest, AllDtypesSupported) { #undef TEST_ENTRY } +TEST_F(OpLogSoftmaxOutTest, NonContiguous) { + test_dtype_noncontiguous_dim(); +} + TEST_F(OpLogSoftmaxOutTest, MismatchedDimensionsDies) { if (SupportedFeatures::get()->is_aten) { GTEST_SKIP() << "ATen currently supports mismatched dimensions"; From 8184d04402ff35d88e4928610152b69a93a403f1 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 24 Feb 2025 17:03:46 -0800 Subject: [PATCH 2/3] Re-enable optimized gelu test in CMake Pull Request resolved: https://github.com/pytorch/executorch/pull/8597 I missed this line disabling the test. (Splitting out re-enable of log_softmax because I think that one needs fixes.) ghstack-source-id: 268149463 @exported-using-ghexport Differential Revision: [D69929122](https://our.internmc.facebook.com/intern/diff/D69929122/) --- kernels/test/CMakeLists.txt | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/kernels/test/CMakeLists.txt b/kernels/test/CMakeLists.txt index 4250f1f7581..63b92ab525b 100644 --- a/kernels/test/CMakeLists.txt +++ b/kernels/test/CMakeLists.txt @@ -66,7 +66,7 @@ foreach(kernel ${_kernels}) cp "${CMAKE_CURRENT_BINARY_DIR}/../../kernels/${kernel}/${kernel}_ops_lib/*.h" "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/" - DEPENDS "${kernel}_ops_lib" + DEPENDS "${kernel}_ops_lib" ) endforeach() @@ -278,10 +278,8 @@ set(_optimized_kernels_test_sources ${CMAKE_CURRENT_BINARY_DIR}/include/portable/executorch/kernels/test/supported_features.cpp ) -# We don't have sleef on OSS so we don't have gelu and log_softmax -list(REMOVE_ITEM _optimized_kernels_test_sources "op_gelu_test.cpp" - "op_log_softmax_test.cpp" -) +# We don't have sleef on OSS so we don't have log_softmax +list(REMOVE_ITEM _optimized_kernels_test_sources "op_log_softmax_test.cpp") et_cxx_test( optimized_kernels_test From f28abf2d57bd70e18435b7b148371f7c7bed0d5d Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 24 Feb 2025 17:03:47 -0800 Subject: [PATCH 3/3] Clean up optimized-oss.yaml Pull Request resolved: https://github.com/pytorch/executorch/pull/8549 We don't need this now that we support log_softmax and gelu in OSS! 
This deletes optimized-oss.yaml and the optimized_native_cpu_ops_oss target, and points the OSS CMake and Buck builds at optimized.yaml / optimized_native_cpu_ops instead.
ghstack-source-id: 268149465 @exported-using-ghexport Differential Revision: [D69475020](https://our.internmc.facebook.com/intern/diff/D69475020/) --- build/cmake_deps.toml | 6 +- configurations/CMakeLists.txt | 2 +- configurations/targets.bzl | 18 ---- examples/models/llama/runner/targets.bzl | 5 +- kernels/optimized/CMakeLists.txt | 4 +- kernels/optimized/cpu/targets.bzl | 8 +- kernels/optimized/op_registration_util.bzl | 6 +- kernels/optimized/optimized-oss.yaml | 96 ------------------- kernels/optimized/targets.bzl | 8 -- kernels/test/CMakeLists.txt | 6 +- .../optimized/op_registration_util.bzl | 23 +++-- 11 files changed, 27 insertions(+), 155 deletions(-) delete mode 100644 kernels/optimized/optimized-oss.yaml diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index c44fcf92ea6..21a8e282929 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -117,9 +117,9 @@ deps = [ "executorch", ] -[targets.optimized_native_cpu_ops_oss] +[targets.optimized_native_cpu_ops] buck_targets = [ - "//configurations:optimized_native_cpu_ops_oss", + "//configurations:optimized_native_cpu_ops", ] filters = [ ".cpp$", @@ -437,6 +437,6 @@ deps = [ "portable_kernels", "quantized_kernels", "xnnpack_backend", - "optimized_native_cpu_ops_oss", + "optimized_native_cpu_ops", ] # ---------------------------------- LLama end ---------------------------------- diff --git a/configurations/CMakeLists.txt b/configurations/CMakeLists.txt index eddb8b2a12c..462124a6ea6 100644 --- a/configurations/CMakeLists.txt +++ b/configurations/CMakeLists.txt @@ -30,7 +30,7 @@ include(${EXECUTORCH_ROOT}/build/Codegen.cmake) if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) # Merge optimized and portable definitions, taking optimized where available. merge_yaml( - FUNCTIONS_YAML ${EXECUTORCH_ROOT}/kernels/optimized/optimized-oss.yaml + FUNCTIONS_YAML ${EXECUTORCH_ROOT}/kernels/optimized/optimized.yaml FALLBACK_YAML ${EXECUTORCH_ROOT}/kernels/portable/functions.yaml OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR} ) diff --git a/configurations/targets.bzl b/configurations/targets.bzl index 6a5341c2904..5a39f7301ec 100644 --- a/configurations/targets.bzl +++ b/configurations/targets.bzl @@ -50,21 +50,3 @@ def define_common_targets(): "@EXECUTORCH_CLIENTS", ], ) - - # TODO(T183193812): delete this target after optimized-oss.yaml is gone - executorch_generated_lib( - name = "optimized_native_cpu_ops_oss", - deps = [ - "//executorch/kernels/optimized:optimized_operators", - "//executorch/kernels/optimized:optimized_oplist", - "//executorch/kernels/portable:executorch_aten_ops", - "//executorch/kernels/portable:operators", - ], - functions_yaml_target = "//executorch/kernels/optimized:optimized-oss.yaml", - fallback_yaml_target = "//executorch/kernels/portable:functions.yaml", - define_static_targets = True, - visibility = [ - "//executorch/examples/...", - "@EXECUTORCH_CLIENTS", - ], - ) diff --git a/examples/models/llama/runner/targets.bzl b/examples/models/llama/runner/targets.bzl index 9f095b93970..37827bb78a5 100644 --- a/examples/models/llama/runner/targets.bzl +++ b/examples/models/llama/runner/targets.bzl @@ -3,9 +3,6 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") def _get_operator_lib(aten = False): if aten: return ["//executorch/kernels/aten:generated_lib"] - elif runtime.is_oss: - # TODO(T183193812): delete this path after optimized-oss.yaml is no more. 
- return ["//executorch/configurations:optimized_native_cpu_ops_oss", "//executorch/extension/llm/custom_ops:custom_ops"] else: return ["//executorch/configurations:optimized_native_cpu_ops", "//executorch/extension/llm/custom_ops:custom_ops"] @@ -13,7 +10,7 @@ def get_qnn_dependency(): # buck build -c executorch.enable_qnn=true //executorch/examples/models/llama/runner:runner # Check if QNN is enabled before including the dependency if native.read_config("executorch", "enable_qnn", "false") == "true": - # //executorch/backends/qualcomm:qnn_executorch_backend doesn't work, + # //executorch/backends/qualcomm:qnn_executorch_backend doesn't work, # likely due to it's an empty library with dependency only return [ "//executorch/backends/qualcomm/runtime:runtime", diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt index 1f3aff57ecf..235c6738d9a 100644 --- a/kernels/optimized/CMakeLists.txt +++ b/kernels/optimized/CMakeLists.txt @@ -49,12 +49,12 @@ target_compile_options(cpublas PUBLIC ${_common_compile_options}) # Generate C++ bindings to register kernels into both PyTorch (for AOT) and # Executorch (for runtime). Here select all ops in optimized.yaml -set(_yaml "${CMAKE_CURRENT_LIST_DIR}/optimized-oss.yaml") +set(_yaml "${CMAKE_CURRENT_LIST_DIR}/optimized.yaml") gen_selected_ops(LIB_NAME "optimized_ops_lib" OPS_SCHEMA_YAML "${_yaml}") generate_bindings_for_kernels( LIB_NAME "optimized_ops_lib" FUNCTIONS_YAML - ${CMAKE_CURRENT_SOURCE_DIR}/optimized-oss.yaml + ${CMAKE_CURRENT_SOURCE_DIR}/optimized.yaml ADD_EXCEPTION_BOUNDARY ) message("Generated files ${gen_command_sources}") diff --git a/kernels/optimized/cpu/targets.bzl b/kernels/optimized/cpu/targets.bzl index 2a66407a5ce..83b2c320266 100644 --- a/kernels/optimized/cpu/targets.bzl +++ b/kernels/optimized/cpu/targets.bzl @@ -1,5 +1,5 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") -load("@fbsource//xplat/executorch/kernels/optimized:op_registration_util.bzl", "define_op_target", "is_op_disabled", "op_target") +load("@fbsource//xplat/executorch/kernels/optimized:op_registration_util.bzl", "define_op_target", "op_target") _OPTIMIZED_ATEN_OPS = ( op_target( @@ -111,13 +111,11 @@ def define_common_targets(): TARGETS and BUCK files that call this function. """ - enabled_ops = [op for op in _OPTIMIZED_ATEN_OPS if not is_op_disabled(op["name"])] - # Define build targets for all operators registered in the tables above. - for op in enabled_ops: + for op in _OPTIMIZED_ATEN_OPS: define_op_target(**op) - aten_op_targets = [":{}".format(op["name"]) for op in enabled_ops] + aten_op_targets = [":{}".format(op["name"]) for op in _OPTIMIZED_ATEN_OPS] all_op_targets = aten_op_targets runtime.cxx_library( diff --git a/kernels/optimized/op_registration_util.bzl b/kernels/optimized/op_registration_util.bzl index 12a5f012a38..3ac89132380 100644 --- a/kernels/optimized/op_registration_util.bzl +++ b/kernels/optimized/op_registration_util.bzl @@ -2,8 +2,8 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") load("@fbsource//xplat/executorch/build:selects.bzl", "selects") load( "@fbsource//xplat/executorch/kernels/optimized:lib_defs.bzl", - "get_vec_preprocessor_flags", "get_vec_deps", + "get_vec_preprocessor_flags", ) load( "@fbsource//xplat/executorch/kernels/portable:op_registration_util.bzl", @@ -137,7 +137,3 @@ def define_op_target(name, compiler_flags, deps): compiler_flags = compiler_flags, deps = deps, ) - -def is_op_disabled(name): - # All ops are enabled for internal builds. 
- return False diff --git a/kernels/optimized/optimized-oss.yaml b/kernels/optimized/optimized-oss.yaml deleted file mode 100644 index a24aa9ca173..00000000000 --- a/kernels/optimized/optimized-oss.yaml +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This yaml file contains operators that have optimized kernels available. -# Note that this is a copy of optimized.yaml that does not include log_softmax, -# due to the OSS build not currently including sleef. -# TODO (T183193812) - -- op: _fft_r2c.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_fft_r2c_out - -- op: add.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_add_out - -- op: add.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_add_scalar_out - -- op: bmm.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_bmm_out - -- op: div.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_div_out - -- op: div.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_div_scalar_out - -- op: exp.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_exp_out - -- op: sigmoid.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_sigmoid_out - -- op: gelu.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_gelu_out - -- op: le.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_le_scalar_out - -- op: le.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_le_tensor_out - -- op: linear.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_linear_out - -- op: mul.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_mul_out - -- op: mul.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_mul_scalar_out - -- op: native_layer_norm.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_native_layer_norm_out - -- op: neg.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_neg_out - -- op: sub.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_sub_out - -- op: sub.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_sub_scalar_out diff --git a/kernels/optimized/targets.bzl b/kernels/optimized/targets.bzl index 9978d4196dd..c655cb149a3 100644 --- a/kernels/optimized/targets.bzl +++ b/kernels/optimized/targets.bzl @@ -19,14 +19,6 @@ def define_common_targets(is_fbcode=False): ], ) - runtime.export_file( - name = "optimized-oss.yaml", - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - ) - runtime.cxx_library( name = "optimized_operators", srcs = [], diff --git a/kernels/test/CMakeLists.txt b/kernels/test/CMakeLists.txt index 63b92ab525b..24adb8d9c80 100644 --- a/kernels/test/CMakeLists.txt +++ b/kernels/test/CMakeLists.txt @@ -270,17 +270,15 @@ set(_optimized_kernels_test_sources "op_le_test.cpp" "op_linear_test.cpp" "op_log_softmax_test.cpp" + "op_mm_test.cpp" "op_mul_test.cpp" "op_native_layer_norm_test.cpp" "op_neg_test.cpp" "op_sub_test.cpp" "UnaryUfuncRealHBBF16ToFloatHBF16Test.cpp" - ${CMAKE_CURRENT_BINARY_DIR}/include/portable/executorch/kernels/test/supported_features.cpp + ${CMAKE_CURRENT_BINARY_DIR}/include/optimized/executorch/kernels/test/supported_features.cpp ) -# We don't have sleef on OSS so we don't have log_softmax -list(REMOVE_ITEM _optimized_kernels_test_sources "op_log_softmax_test.cpp") - et_cxx_test( optimized_kernels_test 
SOURCES diff --git a/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl index c70757e29b9..d48a22cee37 100644 --- a/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl +++ b/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl @@ -9,8 +9,13 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") load("@fbsource//xplat/executorch/build:selects.bzl", "selects") load( "@fbsource//xplat/executorch/kernels/optimized:lib_defs.bzl", + "get_vec_deps", "get_vec_preprocessor_flags", ) +load( + "@fbsource//xplat/executorch/kernels/portable:op_registration_util.bzl", + "get_compiler_optimization_flags", +) def op_target(name, deps = [], compiler_flags = []): """Registers an optimized implementation for an operator overload group. @@ -94,12 +99,17 @@ def define_op_library(name, compiler_flags, deps): "//executorch/kernels/test/...", "@EXECUTORCH_CLIENTS", ], - # kernels often have helpers with no prototypes just disabling the warning here as the headers - # are codegend and linked in later - compiler_flags = ["-Wno-missing-prototypes"], + compiler_flags = [ + # kernels often have helpers with no prototypes just disabling the warning here as the headers + # are codegend and linked in later + "-Wno-missing-prototypes", + # pragma unroll fails with -Os, don't need to warn us and + # fail Werror builds; see https://godbolt.org/z/zvf85vTsr + "-Wno-pass-failed", + ] + get_compiler_optimization_flags(), deps = [ "//executorch/runtime/kernel:kernel_includes", - ] + augmented_deps, + ] + augmented_deps + get_vec_deps(), preprocessor_flags = get_vec_preprocessor_flags(), # sleef needs to be added as a direct dependency of the operator target when building for Android, # or a linker error may occur. Not sure why this happens; it seems that fbandroid_platform_deps of @@ -134,8 +144,3 @@ def define_op_target(name, compiler_flags, deps): compiler_flags = compiler_flags, deps = deps, ) - -def is_op_disabled(name): - # TODO (gjcomer) Enable ops with sleef dependency in OSS - disabled_ops = ["op_log_softmax"] - return name in disabled_ops
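For illustration only, and not part of the patch series: a minimal, hypothetical standalone C++ sketch of the scalar, stride-aware path that the fixed kernel falls back to when dim_stride != 1. It reuses the 9x3 input from the new regression test (element (r, c) holds r + 9 * c) and should reproduce the expected -8.45855 ... -0.458552 column values; the dim_size/dim_stride names mirror the kernel, but the program below is self-contained and makes no use of ExecuTorch APIs.

// Hypothetical sketch, not part of the patch: log_softmax along dim 0 of a
// row-major 9x3 float matrix, where dim_stride == 3. The fixed kernel only
// vectorizes when dim_stride == 1; for any other stride each element of a
// slice has to be addressed as d * dim_stride, as done here.
#include <cmath>
#include <cstdio>

int main() {
  constexpr int dim_size = 9;   // length of the softmax dim (dim 0)
  constexpr int dim_stride = 3; // stride of dim 0 in a contiguous {9, 3} tensor
  float in[dim_size * dim_stride];
  float out[dim_size * dim_stride];
  // Same values as the regression test: element (r, c) holds r + 9 * c.
  for (int r = 0; r < dim_size; ++r) {
    for (int c = 0; c < dim_stride; ++c) {
      in[r * dim_stride + c] = static_cast<float>(r + 9 * c);
    }
  }
  // One softmax slice per column; slice c starts at offset c and its elements
  // are dim_stride apart, so contiguous vector loads would mix elements from
  // different slices.
  for (int c = 0; c < dim_stride; ++c) {
    const float* slice_in = in + c;
    float* slice_out = out + c;
    float max_input = slice_in[0];
    for (int d = 1; d < dim_size; ++d) {
      max_input = std::fmax(max_input, slice_in[d * dim_stride]);
    }
    float temp_sum = 0.0f;
    for (int d = 0; d < dim_size; ++d) {
      temp_sum += std::exp(slice_in[d * dim_stride] - max_input);
    }
    const float log_sum = std::log(temp_sum);
    for (int d = 0; d < dim_size; ++d) {
      slice_out[d * dim_stride] =
          slice_in[d * dim_stride] - max_input - log_sum;
    }
  }
  // Each column should read -8.45855, -7.45855, ..., -0.458552 top to bottom.
  for (int r = 0; r < dim_size; ++r) {
    std::printf("%9.5f %9.5f %9.5f\n",
                out[r * dim_stride + 0],
                out[r * dim_stride + 1],
                out[r * dim_stride + 2]);
  }
  return 0;
}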