From 9bc4b60079760f647ddd2fda5651293f2d46e4a7 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Mon, 27 Oct 2025 16:24:01 -0700 Subject: [PATCH 1/5] Fix warning when building kernel libraries Getting warnings like the following: ``` /home/larryliu/executorch/kernels/portable/cpu/op_maximum.cpp: In lambda function: /home/larryliu/executorch/kernels/portable/cpu/op_maximum.cpp:52:9: note: the ABI for passing parameters with 32-byte alignment has changed in GCC 4.6 52 | [](const auto val_a, const auto val_b) { | ^ /home/larryliu/executorch/../executorch/runtime/core/exec_aten/util/scalar_type_util.h:919:7: note: in definition of macro 'ET_INTERNAL_SWITCH' 919 | __VA_ARGS__ \ | ^~~~~~~~~~~ /home/larryliu/executorch/../executorch/runtime/core/exec_aten/util/scalar_type_util.h:931:3: note: in expansion of macro 'ET_INTERNAL_SWITCH_CASE' 931 | ET_INTERNAL_SWITCH_CASE( \ | ^~~~~~~~~~~~~~~~~~~~~~~ /home/larryliu/executorch/../executorch/runtime/core/exec_aten/util/scalar_type_util.h:957:3: note: in expansion of macro 'ET_INTERNAL_SWITCH_CASE_INT_TYPES' 957 | ET_INTERNAL_SWITCH_CASE_INT_TYPES(CTYPE_ALIAS, __VA_ARGS__) \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /home/larryliu/executorch/../executorch/runtime/core/exec_aten/util/scalar_type_util.h:1008:3: note: in expansion of macro 'ET_INTERNAL_SWITCH_CASE_REAL_TYPES' 1008 | ET_INTERNAL_SWITCH_CASE_REAL_TYPES(CTYPE_ALIAS, __VA_ARGS__) \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /home/larryliu/executorch/../executorch/runtime/core/exec_aten/util/scalar_type_util.h:1136:7: note: in expansion of macro 'ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND' 1136 | ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND( \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /home/larryliu/executorch/../executorch/runtime/core/exec_aten/util/scalar_type_util.h:1172:3: note: in expansion of macro 'ET_SWITCH_REAL_TYPES_AND' 1172 | ET_SWITCH_REAL_TYPES_AND(Bool, TYPE, CONTEXT, NAME, CTYPE_ALIAS, __VA_ARGS__) | ^~~~~~~~~~~~~~~~~~~~~~~~ /home/larryliu/executorch/kernels/portable/cpu/op_maximum.cpp:47:3: note: in expansion of macro 'ET_SWITCH_REALB_TYPES' 47 | ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { | ^~~~~~~~~~~~~~~~~~~~~ ``` Fixing them in this PR --- kernels/optimized/cpu/op_add.cpp | 2 +- kernels/optimized/cpu/op_div.cpp | 4 ++-- kernels/optimized/cpu/op_exp.cpp | 2 +- kernels/optimized/cpu/op_le.cpp | 2 +- kernels/optimized/cpu/op_log_softmax.cpp | 2 +- kernels/optimized/cpu/op_mul.cpp | 2 +- kernels/optimized/cpu/op_native_layer_norm.cpp | 2 +- kernels/optimized/cpu/op_sub.cpp | 4 ++-- kernels/portable/cpu/op_add.cpp | 4 ++-- kernels/portable/cpu/op_addmm.cpp | 2 +- kernels/portable/cpu/op_atan2.cpp | 2 +- kernels/portable/cpu/op_clamp.cpp | 2 +- kernels/portable/cpu/op_div.cpp | 4 ++-- kernels/portable/cpu/op_fmod.cpp | 2 +- kernels/portable/cpu/op_isinf.cpp | 13 ++++++++++++- kernels/portable/cpu/op_isnan.cpp | 12 +++++++++++- kernels/portable/cpu/op_maximum.cpp | 2 +- kernels/portable/cpu/op_mul.cpp | 4 ++-- kernels/portable/cpu/op_neg.cpp | 2 +- kernels/portable/cpu/op_pow.cpp | 2 +- kernels/portable/cpu/op_rsub.cpp | 2 +- kernels/portable/cpu/op_sigmoid.cpp | 2 +- kernels/portable/cpu/op_sub.cpp | 2 +- 23 files changed, 49 insertions(+), 28 deletions(-) diff --git a/kernels/optimized/cpu/op_add.cpp b/kernels/optimized/cpu/op_add.cpp index 562d4e227dd..2b7d6bbfbd1 100644 --- a/kernels/optimized/cpu/op_add.cpp +++ b/kernels/optimized/cpu/op_add.cpp @@ -68,7 +68,7 @@ Tensor& opt_add_out( using Vec = at::vec::Vectorized; at::vec::map( - [alpha_val, b_val](Vec x) { return x + Vec(alpha_val * b_val); }, + [alpha_val, b_val](Vec& x) { return x + Vec(alpha_val * b_val); }, out.mutable_data_ptr(), a.const_data_ptr(), out.numel()); diff --git a/kernels/optimized/cpu/op_div.cpp b/kernels/optimized/cpu/op_div.cpp index d74a293af8a..bc958c67e8b 100644 --- a/kernels/optimized/cpu/op_div.cpp +++ b/kernels/optimized/cpu/op_div.cpp @@ -87,14 +87,14 @@ Tensor& opt_div_out( using Vec = at::vec::Vectorized; if (a.numel() == 1) { at::vec::map( - [scalar_casted](Vec x) { return Vec(scalar_casted) / x; }, + [scalar_casted](Vec& x) { return Vec(scalar_casted) / x; }, out.mutable_data_ptr(), tensor->const_data_ptr(), out.numel()); } else { Vec inv_scalar_casted_vec(CTYPE(1) / scalar_casted); at::vec::map( - [inv_scalar_casted_vec](Vec x) { + [inv_scalar_casted_vec](Vec& x) { return x * inv_scalar_casted_vec; }, out.mutable_data_ptr(), diff --git a/kernels/optimized/cpu/op_exp.cpp b/kernels/optimized/cpu/op_exp.cpp index 0798d1e2b25..8968326c327 100644 --- a/kernels/optimized/cpu/op_exp.cpp +++ b/kernels/optimized/cpu/op_exp.cpp @@ -36,7 +36,7 @@ void exp_data( CTYPE_OUT* out_data) { using Vec = at::vec::Vectorized; at::vec::map( - [](Vec x) { return x.exp(); }, out_data, in_data, numel); + [](Vec& x) { return x.exp(); }, out_data, in_data, numel); } /** diff --git a/kernels/optimized/cpu/op_le.cpp b/kernels/optimized/cpu/op_le.cpp index 60696f1d2f1..521db4531b0 100644 --- a/kernels/optimized/cpu/op_le.cpp +++ b/kernels/optimized/cpu/op_le.cpp @@ -48,7 +48,7 @@ Tensor& opt_le_tensor_out( ET_SWITCH_REALB_TYPES(a_type, ctx, op_name, CTYPE, [&]() { using Vec = at::vec::Vectorized; at::vec::map2( - [](Vec x, Vec y) { return x.le(y); }, + [](Vec& x, Vec& y) { return x.le(y); }, out.mutable_data_ptr(), a.const_data_ptr(), b.const_data_ptr(), diff --git a/kernels/optimized/cpu/op_log_softmax.cpp b/kernels/optimized/cpu/op_log_softmax.cpp index 629a81a6429..256a4079a58 100644 --- a/kernels/optimized/cpu/op_log_softmax.cpp +++ b/kernels/optimized/cpu/op_log_softmax.cpp @@ -55,7 +55,7 @@ void log_softmax_kernel(const Tensor& input, int64_t dim, Tensor& out) { 0, outer_size, ::executorch::extension::internal::GRAIN_SIZE, - [&](const auto begin, const auto end) { + [&](const auto& begin, const auto& end) { at::native::serial_vec_log_softmax_lastdim_range( input_data_base, output_data_base, diff --git a/kernels/optimized/cpu/op_mul.cpp b/kernels/optimized/cpu/op_mul.cpp index 48670b7441b..61aa5469a2b 100644 --- a/kernels/optimized/cpu/op_mul.cpp +++ b/kernels/optimized/cpu/op_mul.cpp @@ -56,7 +56,7 @@ Tensor& opt_mul_out( using Vec = at::vec::Vectorized; at::vec::map( - [b_casted](Vec x) { return x * Vec(b_casted); }, + [b_casted](Vec& x) { return x * Vec(b_casted); }, out.mutable_data_ptr(), a.const_data_ptr(), out.numel()); diff --git a/kernels/optimized/cpu/op_native_layer_norm.cpp b/kernels/optimized/cpu/op_native_layer_norm.cpp index 8d5410cb581..c583393ba8e 100644 --- a/kernels/optimized/cpu/op_native_layer_norm.cpp +++ b/kernels/optimized/cpu/op_native_layer_norm.cpp @@ -92,7 +92,7 @@ void layer_norm( } } else { at::vec::map3( - [scale, offset](auto x, auto gamma, auto beta) { + [scale, offset](auto& x, auto& gamma, auto& beta) { using Vec = decltype(x); return (x * Vec(scale) + Vec(offset)) * gamma + beta; }, diff --git a/kernels/optimized/cpu/op_sub.cpp b/kernels/optimized/cpu/op_sub.cpp index 41d46d1661e..feac11789e5 100644 --- a/kernels/optimized/cpu/op_sub.cpp +++ b/kernels/optimized/cpu/op_sub.cpp @@ -86,7 +86,7 @@ Tensor& opt_sub_out( using Vec = at::vec::Vectorized; if (a.numel() == 1) { at::vec::map( - [alpha_val, scalar_casted](Vec x) { + [alpha_val, scalar_casted](Vec& x) { return Vec(scalar_casted) - Vec(alpha_val) * x; }, out.mutable_data_ptr(), @@ -94,7 +94,7 @@ Tensor& opt_sub_out( out.numel()); } else { at::vec::map( - [alpha_val, scalar_casted](Vec x) { + [alpha_val, scalar_casted](Vec& x) { return x - Vec(alpha_val * scalar_casted); }, out.mutable_data_ptr(), diff --git a/kernels/portable/cpu/op_add.cpp b/kernels/portable/cpu/op_add.cpp index 122b2a2c97e..c7782e4276b 100644 --- a/kernels/portable/cpu/op_add.cpp +++ b/kernels/portable/cpu/op_add.cpp @@ -80,7 +80,7 @@ Tensor& add_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::REALHBBF16>( - [val_alpha](const auto val_a, const auto val_b) { + [val_alpha](const auto& val_a, const auto& val_b) { return val_a + val_alpha * val_b; }, ctx, @@ -136,7 +136,7 @@ Tensor& add_scalar_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::SAME_AS_COMMON>( - [val_alpha_times_b](const auto val_a) { + [val_alpha_times_b](const auto& val_a) { // Cast here supports vectorization; either it does nothing // or it casts from CTYPE_COMPUTE to // Vectorized. diff --git a/kernels/portable/cpu/op_addmm.cpp b/kernels/portable/cpu/op_addmm.cpp index 440a8b2c0fa..3e53100d356 100644 --- a/kernels/portable/cpu/op_addmm.cpp +++ b/kernels/portable/cpu/op_addmm.cpp @@ -92,7 +92,7 @@ Tensor& addmm_out( CTYPE, op_name, utils::SupportedTensorDtypes::REALHBF16>( - [alpha_val, beta_val](const auto val_a, const auto val_b) { + [alpha_val, beta_val](const auto& val_a, const auto& val_b) { return val_a * alpha_val + val_b * beta_val; }, ctx, diff --git a/kernels/portable/cpu/op_atan2.cpp b/kernels/portable/cpu/op_atan2.cpp index 5390eb52820..d5ec2516bd4 100644 --- a/kernels/portable/cpu/op_atan2.cpp +++ b/kernels/portable/cpu/op_atan2.cpp @@ -59,7 +59,7 @@ Tensor& atan2_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::FLOATHBF16>( - [](const auto val_a, const auto val_b) { + [](const auto& val_a, const auto& val_b) { return executorch::math::atan2(val_a, val_b); }, ctx, diff --git a/kernels/portable/cpu/op_clamp.cpp b/kernels/portable/cpu/op_clamp.cpp index 8ac78fd5477..72134ae9ff8 100644 --- a/kernels/portable/cpu/op_clamp.cpp +++ b/kernels/portable/cpu/op_clamp.cpp @@ -139,7 +139,7 @@ Tensor& clamp_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::SAME_AS_COMMON>( - [has_min, min_opt, has_max, max_opt](const auto val_in) { + [has_min, min_opt, has_max, max_opt](const auto& val_in) { auto val_out = val_in; if (has_min) { val_out = utils::max_override( diff --git a/kernels/portable/cpu/op_div.cpp b/kernels/portable/cpu/op_div.cpp index 51a65747b33..f94f219d853 100644 --- a/kernels/portable/cpu/op_div.cpp +++ b/kernels/portable/cpu/op_div.cpp @@ -62,7 +62,7 @@ Tensor& div_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::FLOATHBF16>( - [](const auto val_a, const auto val_b) { return val_a / val_b; }, + [](const auto& val_a, const auto& val_b) { return val_a / val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, @@ -195,7 +195,7 @@ Tensor& div_scalar_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::SAME_AS_COMMON>( - [val_b](const auto val_a) { return val_a / val_b; }, + [val_b](const auto& val_a) { return val_a / val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, diff --git a/kernels/portable/cpu/op_fmod.cpp b/kernels/portable/cpu/op_fmod.cpp index 40bb4a5e94c..05bb4f9e553 100644 --- a/kernels/portable/cpu/op_fmod.cpp +++ b/kernels/portable/cpu/op_fmod.cpp @@ -138,7 +138,7 @@ Tensor& fmod_Scalar_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::REALHBF16>( - [val_b](const auto val_a) { + [val_b](const auto& val_a) { return executorch::math::fmod(val_a, (decltype(val_a))val_b); }, ctx, diff --git a/kernels/portable/cpu/op_isinf.cpp b/kernels/portable/cpu/op_isinf.cpp index ac0c19f0f7a..d6fab50ec29 100644 --- a/kernels/portable/cpu/op_isinf.cpp +++ b/kernels/portable/cpu/op_isinf.cpp @@ -14,7 +14,18 @@ namespace torch { namespace executor { namespace native { -DEFINE_UNARY_UFUNC_REALHBBF16_TO_BOOL(isinf_out, std::isinf) +bool isinf_float(float x) { + return std::isinf(x); +} + +bool isinf_double(double x) { + return std::isinf(x); +} + +Tensor& isinf_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { + return internal::unary_ufunc_realhbbf16_to_bool( + isinf_float, isinf_double, ctx, in, out); +} } // namespace native } // namespace executor diff --git a/kernels/portable/cpu/op_isnan.cpp b/kernels/portable/cpu/op_isnan.cpp index dad38a2619a..bd3aaf7806a 100644 --- a/kernels/portable/cpu/op_isnan.cpp +++ b/kernels/portable/cpu/op_isnan.cpp @@ -13,8 +13,18 @@ namespace torch { namespace executor { namespace native { +bool isnan_float(float x) { + return std::isnan(x); +} -DEFINE_UNARY_UFUNC_REALHBBF16_TO_BOOL(isnan_out, std::isnan) +bool isnan_double(double x) { + return std::isnan(x); +} + +Tensor& isnan_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { + return internal::unary_ufunc_realhbbf16_to_bool( + isnan_float, isnan_double, ctx, in, out); +} } // namespace native } // namespace executor diff --git a/kernels/portable/cpu/op_maximum.cpp b/kernels/portable/cpu/op_maximum.cpp index c7979e40d7c..3da154ede82 100644 --- a/kernels/portable/cpu/op_maximum.cpp +++ b/kernels/portable/cpu/op_maximum.cpp @@ -49,7 +49,7 @@ Tensor& maximum_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::REALHBBF16>( - [](const auto val_a, const auto val_b) { + [](const auto& val_a, const auto& val_b) { return utils::max_override(val_a, val_b); }, ctx, diff --git a/kernels/portable/cpu/op_mul.cpp b/kernels/portable/cpu/op_mul.cpp index 6d4f30106ca..58172c249d4 100644 --- a/kernels/portable/cpu/op_mul.cpp +++ b/kernels/portable/cpu/op_mul.cpp @@ -72,7 +72,7 @@ Tensor& mul_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::REALHBBF16>( - [](const auto val_a, const auto val_b) { return val_a * val_b; }, + [](const auto& val_a, const auto& val_b) { return val_a * val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, @@ -116,7 +116,7 @@ Tensor& mul_scalar_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::SAME_AS_COMMON>( - [val_b](const auto val_a) { return val_a * val_b; }, + [val_b](const auto& val_a) { return val_a * val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, diff --git a/kernels/portable/cpu/op_neg.cpp b/kernels/portable/cpu/op_neg.cpp index d184eb873d5..4d7a9284e4c 100644 --- a/kernels/portable/cpu/op_neg.cpp +++ b/kernels/portable/cpu/op_neg.cpp @@ -39,7 +39,7 @@ Tensor& neg_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { CTYPE, op_name, utils::SupportedTensorDtypes::SAME_AS_COMMON>( - [](const auto val_in) { return -val_in; }, + [](const auto& val_in) { return -val_in; }, ctx, in, utils::SupportedTensorDtypes::REALHBF16, diff --git a/kernels/portable/cpu/op_pow.cpp b/kernels/portable/cpu/op_pow.cpp index aaf934b9adf..31085165dde 100644 --- a/kernels/portable/cpu/op_pow.cpp +++ b/kernels/portable/cpu/op_pow.cpp @@ -57,7 +57,7 @@ Tensor& pow_Tensor_Tensor_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::REALHBF16>( - [](const auto val_a, const auto val_b) { + [](const auto& val_a, const auto& val_b) { return executorch::math::pow(val_a, val_b); }, ctx, diff --git a/kernels/portable/cpu/op_rsub.cpp b/kernels/portable/cpu/op_rsub.cpp index 6a0a77b6596..17faed95c52 100644 --- a/kernels/portable/cpu/op_rsub.cpp +++ b/kernels/portable/cpu/op_rsub.cpp @@ -56,7 +56,7 @@ Tensor& rsub_scalar_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::SAME_AS_COMMON>( - [val_b, val_alpha](const auto val_a) { + [val_b, val_alpha](const auto& val_a) { return val_b - val_alpha * val_a; }, ctx, diff --git a/kernels/portable/cpu/op_sigmoid.cpp b/kernels/portable/cpu/op_sigmoid.cpp index 0578c846ab7..08c85e8fd01 100644 --- a/kernels/portable/cpu/op_sigmoid.cpp +++ b/kernels/portable/cpu/op_sigmoid.cpp @@ -47,7 +47,7 @@ Tensor& sigmoid_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::FLOATHBF16>( - [](const auto val_in) { + [](const auto& val_in) { const auto one = static_cast(1.0); auto out_val = one / (one + executorch::math::exp(-val_in)); return out_val; diff --git a/kernels/portable/cpu/op_sub.cpp b/kernels/portable/cpu/op_sub.cpp index b914c411303..32322aa90cd 100644 --- a/kernels/portable/cpu/op_sub.cpp +++ b/kernels/portable/cpu/op_sub.cpp @@ -60,7 +60,7 @@ Tensor& sub_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::REALHBF16>( - [val_alpha](const auto val_a, const auto val_b) { + [val_alpha](const auto& val_a, const auto& val_b) { return val_a - (decltype(val_b))(val_alpha)*val_b; }, ctx, From ad3f2b064926034bb3d7d560f28fc1cbc3d2939b Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Tue, 28 Oct 2025 14:56:18 -0700 Subject: [PATCH 2/5] Fix more warnings on GCC --- kernels/optimized/CMakeLists.txt | 1 + kernels/optimized/cpu/op_add.cpp | 8 +-- kernels/optimized/cpu/op_div.cpp | 12 ++--- kernels/optimized/cpu/op_elu.cpp | 51 +++++++------------ kernels/optimized/cpu/op_exp.cpp | 3 +- kernels/optimized/cpu/op_le.cpp | 6 +-- kernels/optimized/cpu/op_mul.cpp | 10 ++-- .../optimized/cpu/op_native_layer_norm.cpp | 4 +- kernels/optimized/cpu/op_sub.cpp | 10 ++-- 9 files changed, 46 insertions(+), 59 deletions(-) diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt index 01a10f77846..2c340b70b7d 100644 --- a/kernels/optimized/CMakeLists.txt +++ b/kernels/optimized/CMakeLists.txt @@ -24,6 +24,7 @@ endif() set(_common_compile_options $<$:/wd4996> $<$>:-Wno-deprecated-declarations> + $<$:-Wno-psabi> ) # Note for apple platform we can rely on Accelerate framework Will come back to diff --git a/kernels/optimized/cpu/op_add.cpp b/kernels/optimized/cpu/op_add.cpp index 2b7d6bbfbd1..43774a9497b 100644 --- a/kernels/optimized/cpu/op_add.cpp +++ b/kernels/optimized/cpu/op_add.cpp @@ -67,8 +67,8 @@ Tensor& opt_add_out( CTYPE b_val = *b.const_data_ptr(); using Vec = at::vec::Vectorized; - at::vec::map( - [alpha_val, b_val](Vec& x) { return x + Vec(alpha_val * b_val); }, + at::vec::map( + [alpha_val, b_val](Vec x) { return x + Vec(alpha_val * b_val); }, out.mutable_data_ptr(), a.const_data_ptr(), out.numel()); @@ -86,7 +86,7 @@ Tensor& opt_add_out( CTYPE b_casted = static_cast(b_val); using Vec = at::vec::Vectorized; - at::vec::map( + at::vec::map( [alpha_val, b_casted](Vec x) { return x + Vec(alpha_val * b_casted); }, @@ -140,7 +140,7 @@ Tensor& opt_add_scalar_out( ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, ); using Vec = at::vec::Vectorized; - at::vec::map( + at::vec::map( [alpha_val, b_casted](Vec x) { return x + Vec(alpha_val * b_casted); }, diff --git a/kernels/optimized/cpu/op_div.cpp b/kernels/optimized/cpu/op_div.cpp index bc958c67e8b..d5f5f4f35e6 100644 --- a/kernels/optimized/cpu/op_div.cpp +++ b/kernels/optimized/cpu/op_div.cpp @@ -86,15 +86,15 @@ Tensor& opt_div_out( using Vec = at::vec::Vectorized; if (a.numel() == 1) { - at::vec::map( - [scalar_casted](Vec& x) { return Vec(scalar_casted) / x; }, + at::vec::map( + [scalar_casted](Vec x) { return Vec(scalar_casted) / x; }, out.mutable_data_ptr(), tensor->const_data_ptr(), out.numel()); } else { Vec inv_scalar_casted_vec(CTYPE(1) / scalar_casted); - at::vec::map( - [inv_scalar_casted_vec](Vec& x) { + at::vec::map( + [inv_scalar_casted_vec](Vec x) { return x * inv_scalar_casted_vec; }, out.mutable_data_ptr(), @@ -111,7 +111,7 @@ Tensor& opt_div_out( if (selected_optimized_path == ElementwiseOptimizedPath::kTreatAs1d) { ET_SWITCH_REALB_TYPES(out_type, ctx, op_name, CTYPE, [&]() { using Vec = at::vec::Vectorized; - at::vec::map2( + at::vec::map2( [](Vec x, Vec y) { return x / y; }, out.mutable_data_ptr(), a.const_data_ptr(), @@ -193,7 +193,7 @@ Tensor& opt_div_scalar_out( using Vec = at::vec::Vectorized; Vec inv_b_casted_vec(CTYPE(1) / b_casted); - at::vec::map( + at::vec::map( [inv_b_casted_vec](Vec x) { return x * inv_b_casted_vec; }, out.mutable_data_ptr(), a.const_data_ptr(), diff --git a/kernels/optimized/cpu/op_elu.cpp b/kernels/optimized/cpu/op_elu.cpp index 30f7ff442d8..89258de2fb0 100644 --- a/kernels/optimized/cpu/op_elu.cpp +++ b/kernels/optimized/cpu/op_elu.cpp @@ -6,11 +6,12 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include +#include +#include #include #include -#include #include namespace torch::executor::native { @@ -31,38 +32,24 @@ void elu( const auto math_alpha = utils::scalar_to(alpha); const auto math_scale = utils::scalar_to(scale); const auto math_input_scale = utils::scalar_to(input_scale); - const auto scalar_func = - at::native::get_scalar_elu_elementwise_func( - math_alpha, math_scale, math_input_scale); - const auto vec_func = at::native::get_vectorized_elu_elementwise_func( - math_alpha, math_scale, math_input_scale); - ::executorch::extension::parallel_for( - 0, - out.numel(), - ::executorch::extension::internal::GRAIN_SIZE, - [&](const auto begin, const auto end) { - using Vec = at::vec::Vectorized; - const auto vectorized_begin = - begin + (Vec::size() - begin % Vec::size()) % Vec::size(); - const auto vectorized_end = end - (end % Vec::size()); - // Scalar prologue. - for (const auto idx : c10::irange(begin, vectorized_begin)) { - out_data[idx] = scalar_func(in_data[idx]); - } + using Vec = at::vec::Vectorized; + at::vec::map( + [math_alpha, math_scale, math_input_scale](Vec x) { + auto scaled_input = x * Vec(static_cast(math_input_scale)); + auto zero = Vec(static_cast(0)); + auto one = Vec(static_cast(1)); + auto alpha_vec = Vec(static_cast(math_alpha)); + auto scale_vec = Vec(static_cast(math_scale)); - // Main vectorized loop. - for (auto idx = vectorized_begin; idx < vectorized_end; - idx += Vec::size()) { - auto result_vec = vec_func(Vec::loadu(&in_data[idx])); - result_vec.store(&out_data[idx]); - } - - // Scalar epilogue. - for (const auto idx : c10::irange(vectorized_end, end)) { - out_data[idx] = scalar_func(in_data[idx]); - } - }); + auto pos_mask = scaled_input > zero; + auto neg_result = alpha_vec * ((scaled_input.exp()) - one); + auto result = Vec::blendv(neg_result, scaled_input, pos_mask); + return result * scale_vec; + }, + out_data, + in_data, + out.numel()); } } // namespace diff --git a/kernels/optimized/cpu/op_exp.cpp b/kernels/optimized/cpu/op_exp.cpp index 8968326c327..2f09af77a74 100644 --- a/kernels/optimized/cpu/op_exp.cpp +++ b/kernels/optimized/cpu/op_exp.cpp @@ -35,8 +35,7 @@ void exp_data( const size_t numel, CTYPE_OUT* out_data) { using Vec = at::vec::Vectorized; - at::vec::map( - [](Vec& x) { return x.exp(); }, out_data, in_data, numel); + at::vec::map([](Vec x) { return x.exp(); }, out_data, in_data, numel); } /** diff --git a/kernels/optimized/cpu/op_le.cpp b/kernels/optimized/cpu/op_le.cpp index 521db4531b0..0234922eab9 100644 --- a/kernels/optimized/cpu/op_le.cpp +++ b/kernels/optimized/cpu/op_le.cpp @@ -47,8 +47,8 @@ Tensor& opt_le_tensor_out( if (selected_optimized_path == ElementwiseOptimizedPath::kTreatAs1d) { ET_SWITCH_REALB_TYPES(a_type, ctx, op_name, CTYPE, [&]() { using Vec = at::vec::Vectorized; - at::vec::map2( - [](Vec& x, Vec& y) { return x.le(y); }, + at::vec::map2( + [](Vec x, Vec y) { return x.le(y); }, out.mutable_data_ptr(), a.const_data_ptr(), b.const_data_ptr(), @@ -95,7 +95,7 @@ Tensor& opt_le_scalar_out( ET_EXTRACT_SCALAR(b, b_val); CTYPE b_casted = static_cast(b_val); using Vec = at::vec::Vectorized; - at::vec::map( + at::vec::map( [b_casted](Vec x) { return x.le(Vec(b_casted)); }, out.mutable_data_ptr(), a.const_data_ptr(), diff --git a/kernels/optimized/cpu/op_mul.cpp b/kernels/optimized/cpu/op_mul.cpp index 61aa5469a2b..2fd195e0cd3 100644 --- a/kernels/optimized/cpu/op_mul.cpp +++ b/kernels/optimized/cpu/op_mul.cpp @@ -55,8 +55,8 @@ Tensor& opt_mul_out( CTYPE b_casted = static_cast(b_val); using Vec = at::vec::Vectorized; - at::vec::map( - [b_casted](Vec& x) { return x * Vec(b_casted); }, + at::vec::map( + [b_casted](Vec x) { return x * Vec(b_casted); }, out.mutable_data_ptr(), a.const_data_ptr(), out.numel()); @@ -76,7 +76,7 @@ Tensor& opt_mul_out( ET_SWITCH_COMPLEXH_TYPES(out_type, ctx, op_name, CTYPE, [&]() { using Vec = at::vec::Vectorized; - at::vec::map2( + at::vec::map2( [](Vec x, Vec y) { return x * y; }, out.mutable_data_ptr(), a.const_data_ptr(), @@ -86,7 +86,7 @@ Tensor& opt_mul_out( } else { ET_SWITCH_REALB_TYPES(out_type, ctx, op_name, CTYPE, [&]() { using Vec = at::vec::Vectorized; - at::vec::map2( + at::vec::map2( [](Vec x, Vec y) { return x * y; }, out.mutable_data_ptr(), a.const_data_ptr(), @@ -173,7 +173,7 @@ Tensor& opt_mul_scalar_out( CTYPE b_casted = utils::scalar_to(b); using Vec = at::vec::Vectorized; - at::vec::map( + at::vec::map( [b_casted](Vec x) { return x * Vec(b_casted); }, out.mutable_data_ptr(), a.const_data_ptr(), diff --git a/kernels/optimized/cpu/op_native_layer_norm.cpp b/kernels/optimized/cpu/op_native_layer_norm.cpp index c583393ba8e..1bb7b72506c 100644 --- a/kernels/optimized/cpu/op_native_layer_norm.cpp +++ b/kernels/optimized/cpu/op_native_layer_norm.cpp @@ -91,8 +91,8 @@ void layer_norm( dst_ptr[j] = (src_ptr[j] * scale + offset) * gamma_v + beta_v; } } else { - at::vec::map3( - [scale, offset](auto& x, auto& gamma, auto& beta) { + at::vec::map3( + [scale, offset](auto x, auto gamma, auto beta) { using Vec = decltype(x); return (x * Vec(scale) + Vec(offset)) * gamma + beta; }, diff --git a/kernels/optimized/cpu/op_sub.cpp b/kernels/optimized/cpu/op_sub.cpp index feac11789e5..648e9b015ea 100644 --- a/kernels/optimized/cpu/op_sub.cpp +++ b/kernels/optimized/cpu/op_sub.cpp @@ -85,16 +85,16 @@ Tensor& opt_sub_out( using Vec = at::vec::Vectorized; if (a.numel() == 1) { - at::vec::map( - [alpha_val, scalar_casted](Vec& x) { + at::vec::map( + [alpha_val, scalar_casted](Vec x) { return Vec(scalar_casted) - Vec(alpha_val) * x; }, out.mutable_data_ptr(), tensor->const_data_ptr(), out.numel()); } else { - at::vec::map( - [alpha_val, scalar_casted](Vec& x) { + at::vec::map( + [alpha_val, scalar_casted](Vec x) { return x - Vec(alpha_val * scalar_casted); }, out.mutable_data_ptr(), @@ -148,7 +148,7 @@ Tensor& opt_sub_scalar_out( ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, ); using Vec = at::vec::Vectorized; - at::vec::map( + at::vec::map( [alpha_val, b_casted](Vec x) { return x - Vec(alpha_val * b_casted); }, From af0cff85e07530de17c4c58c6efd78f9e2653aa0 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Tue, 28 Oct 2025 15:44:41 -0700 Subject: [PATCH 3/5] Fix elu --- kernels/optimized/cpu/op_elu.cpp | 51 ++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/kernels/optimized/cpu/op_elu.cpp b/kernels/optimized/cpu/op_elu.cpp index 89258de2fb0..c3ce156b31c 100644 --- a/kernels/optimized/cpu/op_elu.cpp +++ b/kernels/optimized/cpu/op_elu.cpp @@ -6,12 +6,11 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include -#include -#include #include #include +#include #include namespace torch::executor::native { @@ -32,24 +31,38 @@ void elu( const auto math_alpha = utils::scalar_to(alpha); const auto math_scale = utils::scalar_to(scale); const auto math_input_scale = utils::scalar_to(input_scale); + const auto scalar_func = + at::native::get_scalar_elu_elementwise_func( + math_alpha, math_scale, math_input_scale); + const auto vec_func = at::native::get_vectorized_elu_elementwise_func( + math_alpha, math_scale, math_input_scale); - using Vec = at::vec::Vectorized; - at::vec::map( - [math_alpha, math_scale, math_input_scale](Vec x) { - auto scaled_input = x * Vec(static_cast(math_input_scale)); - auto zero = Vec(static_cast(0)); - auto one = Vec(static_cast(1)); - auto alpha_vec = Vec(static_cast(math_alpha)); - auto scale_vec = Vec(static_cast(math_scale)); + ::executorch::extension::parallel_for( + 0, + out.numel(), + ::executorch::extension::internal::GRAIN_SIZE, + [&](const auto& begin, const auto& end) { + using Vec = at::vec::Vectorized; + const auto vectorized_begin = + begin + (Vec::size() - begin % Vec::size()) % Vec::size(); + const auto vectorized_end = end - (end % Vec::size()); + // Scalar prologue. + for (const auto idx : c10::irange(begin, vectorized_begin)) { + out_data[idx] = scalar_func(in_data[idx]); + } - auto pos_mask = scaled_input > zero; - auto neg_result = alpha_vec * ((scaled_input.exp()) - one); - auto result = Vec::blendv(neg_result, scaled_input, pos_mask); - return result * scale_vec; - }, - out_data, - in_data, - out.numel()); + // Main vectorized loop. + for (auto idx = vectorized_begin; idx < vectorized_end; + idx += Vec::size()) { + auto result_vec = vec_func(Vec::loadu(&in_data[idx])); + result_vec.store(&out_data[idx]); + } + + // Scalar epilogue. + for (const auto idx : c10::irange(vectorized_end, end)) { + out_data[idx] = scalar_func(in_data[idx]); + } + }); } } // namespace From 801a9e0094402309294d7b609b3aba02418647c6 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Wed, 29 Oct 2025 11:05:07 -0700 Subject: [PATCH 4/5] Add explicit template variable to map --- kernels/optimized/cpu/op_add.cpp | 6 +++--- kernels/optimized/cpu/op_div.cpp | 6 +++--- kernels/optimized/cpu/op_exp.cpp | 3 ++- kernels/optimized/cpu/op_le.cpp | 2 +- kernels/optimized/cpu/op_mul.cpp | 4 ++-- kernels/optimized/cpu/op_sub.cpp | 6 +++--- 6 files changed, 14 insertions(+), 13 deletions(-) diff --git a/kernels/optimized/cpu/op_add.cpp b/kernels/optimized/cpu/op_add.cpp index 43774a9497b..562d4e227dd 100644 --- a/kernels/optimized/cpu/op_add.cpp +++ b/kernels/optimized/cpu/op_add.cpp @@ -67,7 +67,7 @@ Tensor& opt_add_out( CTYPE b_val = *b.const_data_ptr(); using Vec = at::vec::Vectorized; - at::vec::map( + at::vec::map( [alpha_val, b_val](Vec x) { return x + Vec(alpha_val * b_val); }, out.mutable_data_ptr(), a.const_data_ptr(), @@ -86,7 +86,7 @@ Tensor& opt_add_out( CTYPE b_casted = static_cast(b_val); using Vec = at::vec::Vectorized; - at::vec::map( + at::vec::map( [alpha_val, b_casted](Vec x) { return x + Vec(alpha_val * b_casted); }, @@ -140,7 +140,7 @@ Tensor& opt_add_scalar_out( ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, ); using Vec = at::vec::Vectorized; - at::vec::map( + at::vec::map( [alpha_val, b_casted](Vec x) { return x + Vec(alpha_val * b_casted); }, diff --git a/kernels/optimized/cpu/op_div.cpp b/kernels/optimized/cpu/op_div.cpp index d5f5f4f35e6..23fb0547807 100644 --- a/kernels/optimized/cpu/op_div.cpp +++ b/kernels/optimized/cpu/op_div.cpp @@ -86,14 +86,14 @@ Tensor& opt_div_out( using Vec = at::vec::Vectorized; if (a.numel() == 1) { - at::vec::map( + at::vec::map( [scalar_casted](Vec x) { return Vec(scalar_casted) / x; }, out.mutable_data_ptr(), tensor->const_data_ptr(), out.numel()); } else { Vec inv_scalar_casted_vec(CTYPE(1) / scalar_casted); - at::vec::map( + at::vec::map( [inv_scalar_casted_vec](Vec x) { return x * inv_scalar_casted_vec; }, @@ -193,7 +193,7 @@ Tensor& opt_div_scalar_out( using Vec = at::vec::Vectorized; Vec inv_b_casted_vec(CTYPE(1) / b_casted); - at::vec::map( + at::vec::map( [inv_b_casted_vec](Vec x) { return x * inv_b_casted_vec; }, out.mutable_data_ptr(), a.const_data_ptr(), diff --git a/kernels/optimized/cpu/op_exp.cpp b/kernels/optimized/cpu/op_exp.cpp index 2f09af77a74..0798d1e2b25 100644 --- a/kernels/optimized/cpu/op_exp.cpp +++ b/kernels/optimized/cpu/op_exp.cpp @@ -35,7 +35,8 @@ void exp_data( const size_t numel, CTYPE_OUT* out_data) { using Vec = at::vec::Vectorized; - at::vec::map([](Vec x) { return x.exp(); }, out_data, in_data, numel); + at::vec::map( + [](Vec x) { return x.exp(); }, out_data, in_data, numel); } /** diff --git a/kernels/optimized/cpu/op_le.cpp b/kernels/optimized/cpu/op_le.cpp index 0234922eab9..3ab54f77785 100644 --- a/kernels/optimized/cpu/op_le.cpp +++ b/kernels/optimized/cpu/op_le.cpp @@ -95,7 +95,7 @@ Tensor& opt_le_scalar_out( ET_EXTRACT_SCALAR(b, b_val); CTYPE b_casted = static_cast(b_val); using Vec = at::vec::Vectorized; - at::vec::map( + at::vec::map( [b_casted](Vec x) { return x.le(Vec(b_casted)); }, out.mutable_data_ptr(), a.const_data_ptr(), diff --git a/kernels/optimized/cpu/op_mul.cpp b/kernels/optimized/cpu/op_mul.cpp index 2fd195e0cd3..ff5642de930 100644 --- a/kernels/optimized/cpu/op_mul.cpp +++ b/kernels/optimized/cpu/op_mul.cpp @@ -55,7 +55,7 @@ Tensor& opt_mul_out( CTYPE b_casted = static_cast(b_val); using Vec = at::vec::Vectorized; - at::vec::map( + at::vec::map( [b_casted](Vec x) { return x * Vec(b_casted); }, out.mutable_data_ptr(), a.const_data_ptr(), @@ -173,7 +173,7 @@ Tensor& opt_mul_scalar_out( CTYPE b_casted = utils::scalar_to(b); using Vec = at::vec::Vectorized; - at::vec::map( + at::vec::map( [b_casted](Vec x) { return x * Vec(b_casted); }, out.mutable_data_ptr(), a.const_data_ptr(), diff --git a/kernels/optimized/cpu/op_sub.cpp b/kernels/optimized/cpu/op_sub.cpp index 648e9b015ea..41d46d1661e 100644 --- a/kernels/optimized/cpu/op_sub.cpp +++ b/kernels/optimized/cpu/op_sub.cpp @@ -85,7 +85,7 @@ Tensor& opt_sub_out( using Vec = at::vec::Vectorized; if (a.numel() == 1) { - at::vec::map( + at::vec::map( [alpha_val, scalar_casted](Vec x) { return Vec(scalar_casted) - Vec(alpha_val) * x; }, @@ -93,7 +93,7 @@ Tensor& opt_sub_out( tensor->const_data_ptr(), out.numel()); } else { - at::vec::map( + at::vec::map( [alpha_val, scalar_casted](Vec x) { return x - Vec(alpha_val * scalar_casted); }, @@ -148,7 +148,7 @@ Tensor& opt_sub_scalar_out( ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, ); using Vec = at::vec::Vectorized; - at::vec::map( + at::vec::map( [alpha_val, b_casted](Vec x) { return x - Vec(alpha_val * b_casted); }, From c23e44f5fd2477dbeb956c29f6ba1e1933457ded Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Wed, 29 Oct 2025 11:18:49 -0700 Subject: [PATCH 5/5] More --- kernels/optimized/cpu/op_div.cpp | 2 +- kernels/optimized/cpu/op_le.cpp | 2 +- kernels/optimized/cpu/op_mul.cpp | 4 ++-- kernels/optimized/cpu/op_native_layer_norm.cpp | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kernels/optimized/cpu/op_div.cpp b/kernels/optimized/cpu/op_div.cpp index 23fb0547807..d74a293af8a 100644 --- a/kernels/optimized/cpu/op_div.cpp +++ b/kernels/optimized/cpu/op_div.cpp @@ -111,7 +111,7 @@ Tensor& opt_div_out( if (selected_optimized_path == ElementwiseOptimizedPath::kTreatAs1d) { ET_SWITCH_REALB_TYPES(out_type, ctx, op_name, CTYPE, [&]() { using Vec = at::vec::Vectorized; - at::vec::map2( + at::vec::map2( [](Vec x, Vec y) { return x / y; }, out.mutable_data_ptr(), a.const_data_ptr(), diff --git a/kernels/optimized/cpu/op_le.cpp b/kernels/optimized/cpu/op_le.cpp index 3ab54f77785..60696f1d2f1 100644 --- a/kernels/optimized/cpu/op_le.cpp +++ b/kernels/optimized/cpu/op_le.cpp @@ -47,7 +47,7 @@ Tensor& opt_le_tensor_out( if (selected_optimized_path == ElementwiseOptimizedPath::kTreatAs1d) { ET_SWITCH_REALB_TYPES(a_type, ctx, op_name, CTYPE, [&]() { using Vec = at::vec::Vectorized; - at::vec::map2( + at::vec::map2( [](Vec x, Vec y) { return x.le(y); }, out.mutable_data_ptr(), a.const_data_ptr(), diff --git a/kernels/optimized/cpu/op_mul.cpp b/kernels/optimized/cpu/op_mul.cpp index ff5642de930..48670b7441b 100644 --- a/kernels/optimized/cpu/op_mul.cpp +++ b/kernels/optimized/cpu/op_mul.cpp @@ -76,7 +76,7 @@ Tensor& opt_mul_out( ET_SWITCH_COMPLEXH_TYPES(out_type, ctx, op_name, CTYPE, [&]() { using Vec = at::vec::Vectorized; - at::vec::map2( + at::vec::map2( [](Vec x, Vec y) { return x * y; }, out.mutable_data_ptr(), a.const_data_ptr(), @@ -86,7 +86,7 @@ Tensor& opt_mul_out( } else { ET_SWITCH_REALB_TYPES(out_type, ctx, op_name, CTYPE, [&]() { using Vec = at::vec::Vectorized; - at::vec::map2( + at::vec::map2( [](Vec x, Vec y) { return x * y; }, out.mutable_data_ptr(), a.const_data_ptr(), diff --git a/kernels/optimized/cpu/op_native_layer_norm.cpp b/kernels/optimized/cpu/op_native_layer_norm.cpp index 1bb7b72506c..8d5410cb581 100644 --- a/kernels/optimized/cpu/op_native_layer_norm.cpp +++ b/kernels/optimized/cpu/op_native_layer_norm.cpp @@ -91,7 +91,7 @@ void layer_norm( dst_ptr[j] = (src_ptr[j] * scale + offset) * gamma_v + beta_v; } } else { - at::vec::map3( + at::vec::map3( [scale, offset](auto x, auto gamma, auto beta) { using Vec = decltype(x); return (x * Vec(scale) + Vec(offset)) * gamma + beta;