
Commit 5914f8f

Fix warning when building kernel libraries (pytorch#15405)

Getting warnings like the following:

```
/home/larryliu/executorch/kernels/portable/cpu/op_maximum.cpp: In lambda function:
/home/larryliu/executorch/kernels/portable/cpu/op_maximum.cpp:52:9: note: the ABI for passing parameters with 32-byte alignment has changed in GCC 4.6
   52 |         [](const auto val_a, const auto val_b) {
      |         ^
/home/larryliu/executorch/../executorch/runtime/core/exec_aten/util/scalar_type_util.h:919:7: note: in definition of macro 'ET_INTERNAL_SWITCH'
  919 |       __VA_ARGS__ \
      |       ^~~~~~~~~~~
/home/larryliu/executorch/../executorch/runtime/core/exec_aten/util/scalar_type_util.h:931:3: note: in expansion of macro 'ET_INTERNAL_SWITCH_CASE'
  931 |   ET_INTERNAL_SWITCH_CASE( \
      |   ^~~~~~~~~~~~~~~~~~~~~~~
/home/larryliu/executorch/../executorch/runtime/core/exec_aten/util/scalar_type_util.h:957:3: note: in expansion of macro 'ET_INTERNAL_SWITCH_CASE_INT_TYPES'
  957 |   ET_INTERNAL_SWITCH_CASE_INT_TYPES(CTYPE_ALIAS, __VA_ARGS__) \
      |   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/larryliu/executorch/../executorch/runtime/core/exec_aten/util/scalar_type_util.h:1008:3: note: in expansion of macro 'ET_INTERNAL_SWITCH_CASE_REAL_TYPES'
 1008 |   ET_INTERNAL_SWITCH_CASE_REAL_TYPES(CTYPE_ALIAS, __VA_ARGS__) \
      |   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/larryliu/executorch/../executorch/runtime/core/exec_aten/util/scalar_type_util.h:1136:7: note: in expansion of macro 'ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND'
 1136 |       ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND( \
      |       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/larryliu/executorch/../executorch/runtime/core/exec_aten/util/scalar_type_util.h:1172:3: note: in expansion of macro 'ET_SWITCH_REAL_TYPES_AND'
 1172 |   ET_SWITCH_REAL_TYPES_AND(Bool, TYPE, CONTEXT, NAME, CTYPE_ALIAS, __VA_ARGS__)
      |   ^~~~~~~~~~~~~~~~~~~~~~~~
/home/larryliu/executorch/kernels/portable/cpu/op_maximum.cpp:47:3: note: in expansion of macro 'ET_SWITCH_REALB_TYPES'
   47 |   ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
      |   ^~~~~~~~~~~~~~~~~~~~~
```

Fixing them in this PR.

## Overview

This branch addresses GCC compiler warnings about the 32-byte-alignment ABI change that fire when SIMD vector types are passed to lambda functions, in both the portable and optimized kernel implementations.

**Key Changes:**

* **CMakeLists.txt**: Added the `-Wno-psabi` compiler flag to suppress the ABI warnings for the GNU compiler: `$<$<CXX_COMPILER_ID:GNU>:-Wno-psabi>`
* **Removed the explicit template parameter `<CTYPE>` and changed lambda parameters from reference (`&`) to pass-by-value** in `at::vec::map()` and `at::vec::map3()` calls:

```cpp
// Before:
at::vec::map<CTYPE>([alpha_val, b_val](Vec& x) { return x + Vec(alpha_val * b_val); }, ...);

// After:
at::vec::map([alpha_val, b_val](Vec x) { return x + Vec(alpha_val * b_val); }, ...);
```

* **Changed lambda parameters from pass-by-value to reference (`&`)** for lambdas passed to `apply_bitensor_elementwise_fn()` and similar functions:

```cpp
// Before:
[val_alpha](const auto val_a, const auto val_b) { return val_a + val_alpha * val_b; }

// After:
[val_alpha](const auto& val_a, const auto& val_b) { return val_a + val_alpha * val_b; }
```

## Problem Solved

**Before:** GCC produced numerous warnings like:

`note: the ABI for passing parameters with 32-byte alignment has changed in GCC 4.6`

These warnings appeared whenever SIMD vector types (e.g., `__m256` from AVX) were passed to lambda functions by value, cluttering build output and making it difficult to spot real issues.

**After:** Clean builds with no ABI warnings, while maintaining identical functionality and performance characteristics.
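
For context, here is a minimal, self-contained sketch (not part of this commit) of the warning class involved: on GCC, passing a 32-byte-aligned type by value can emit the `-Wpsabi` notes quoted above, because the calling convention for such parameters changed in GCC 4.6, while passing by `const` reference keeps the alignment out of the call ABI entirely. The `Vec8f` type below is a hypothetical stand-in for a real SIMD type like `__m256`.

```cpp
// Hypothetical reproduction sketch, e.g. `g++ -Wall -c psabi_demo.cpp` on
// x86-64. Depending on target and flags, the by-value function can draw
// "the ABI for passing parameters with 32-byte alignment has changed in
// GCC 4.6"; the by-reference one does not.
struct alignas(32) Vec8f {
  float lanes[8];  // stand-in for an AVX-sized SIMD payload such as __m256
};

// By value: the 32-byte-aligned argument participates in the changed
// calling convention, so GCC may emit -Wpsabi notes here.
float sum_by_value(Vec8f v) {
  float acc = 0.0f;
  for (float lane : v.lanes) acc += lane;
  return acc;
}

// By const reference: only a pointer crosses the call boundary, so the
// alignment ABI never comes into play.
float sum_by_ref(const Vec8f& v) {
  float acc = 0.0f;
  for (float lane : v.lanes) acc += lane;
  return acc;
}

int main() {
  Vec8f v{{1, 2, 3, 4, 5, 6, 7, 8}};
  return static_cast<int>(sum_by_value(v) - sum_by_ref(v));  // 0
}
```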
1 parent 216f69d commit 5914f8f

18 files changed: +42 / -20 lines changed

kernels/optimized/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

```diff
@@ -24,6 +24,7 @@ endif()
 set(_common_compile_options
   $<$<CXX_COMPILER_ID:MSVC>:/wd4996>
   $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-deprecated-declarations>
+  $<$<CXX_COMPILER_ID:GNU>:-Wno-psabi>
 )

 # Note for apple platform we can rely on Accelerate framework Will come back to
```

kernels/optimized/cpu/op_elu.cpp

Lines changed: 1 addition & 1 deletion

```diff
@@ -41,7 +41,7 @@ void elu(
       0,
       out.numel(),
       ::executorch::extension::internal::GRAIN_SIZE,
-      [&](const auto begin, const auto end) {
+      [&](const auto& begin, const auto& end) {
         using Vec = at::vec::Vectorized<CTYPE>;
         const auto vectorized_begin =
             begin + (Vec::size() - begin % Vec::size()) % Vec::size();
```

kernels/optimized/cpu/op_log_softmax.cpp

Lines changed: 1 addition & 1 deletion

```diff
@@ -55,7 +55,7 @@ void log_softmax_kernel(const Tensor& input, int64_t dim, Tensor& out) {
       0,
       outer_size,
       ::executorch::extension::internal::GRAIN_SIZE,
-      [&](const auto begin, const auto end) {
+      [&](const auto& begin, const auto& end) {
         at::native::serial_vec_log_softmax_lastdim_range(
             input_data_base,
             output_data_base,
```

kernels/portable/cpu/op_add.cpp

Lines changed: 2 additions & 2 deletions

```diff
@@ -80,7 +80,7 @@ Tensor& add_out(
           CTYPE_COMPUTE,
           op_name,
           utils::SupportedTensorDtypes::REALHBBF16>(
-          [val_alpha](const auto val_a, const auto val_b) {
+          [val_alpha](const auto& val_a, const auto& val_b) {
             return val_a + val_alpha * val_b;
           },
           ctx,
@@ -136,7 +136,7 @@ Tensor& add_scalar_out(
           CTYPE_COMPUTE,
           op_name,
           utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-          [val_alpha_times_b](const auto val_a) {
+          [val_alpha_times_b](const auto& val_a) {
             // Cast here supports vectorization; either it does nothing
             // or it casts from CTYPE_COMPUTE to
             // Vectorized<CTYPE_COMPUTE>.
```
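
A side note on the `const auto&` pattern above: the generic lambda handed to `apply_bitensor_elementwise_fn()` can be instantiated both with scalar `CTYPE_COMPUTE` values and with `at::vec::Vectorized<CTYPE_COMPUTE>` on the vectorized path, so taking the parameters by `const` reference avoids passing the wide, over-aligned type by value in either case. A minimal sketch of that double instantiation, using a hypothetical `FakeVec` stand-in rather than the real `Vectorized` class:

```cpp
#include <cstdio>

// Stand-in for Vectorized<float>: 32-byte-aligned, supports operator+.
struct alignas(32) FakeVec {
  float lanes[8];
  friend FakeVec operator+(const FakeVec& a, const FakeVec& b) {
    FakeVec r{};
    for (int i = 0; i < 8; ++i) {
      r.lanes[i] = a.lanes[i] + b.lanes[i];
    }
    return r;
  }
};

int main() {
  // One generic lambda, two instantiations: scalar and SIMD-width. Taking
  // the arguments by const reference keeps the aligned type out of the
  // by-value calling convention.
  auto add = [](const auto& a, const auto& b) { return a + b; };

  float s = add(1.0f, 2.0f);  // scalar path
  FakeVec v = add(FakeVec{{1, 2, 3, 4, 5, 6, 7, 8}},
                  FakeVec{{8, 7, 6, 5, 4, 3, 2, 1}});  // vectorized path
  std::printf("%f %f\n", s, v.lanes[0]);
  return 0;
}
```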

kernels/portable/cpu/op_addmm.cpp

Lines changed: 1 addition & 1 deletion

```diff
@@ -92,7 +92,7 @@ Tensor& addmm_out(
           CTYPE,
           op_name,
           utils::SupportedTensorDtypes::REALHBF16>(
-          [alpha_val, beta_val](const auto val_a, const auto val_b) {
+          [alpha_val, beta_val](const auto& val_a, const auto& val_b) {
             return val_a * alpha_val + val_b * beta_val;
           },
           ctx,
```

kernels/portable/cpu/op_atan2.cpp

Lines changed: 1 addition & 1 deletion

```diff
@@ -59,7 +59,7 @@ Tensor& atan2_out(
           CTYPE_COMPUTE,
           op_name,
           utils::SupportedTensorDtypes::FLOATHBF16>(
-          [](const auto val_a, const auto val_b) {
+          [](const auto& val_a, const auto& val_b) {
             return executorch::math::atan2(val_a, val_b);
           },
           ctx,
```

kernels/portable/cpu/op_clamp.cpp

Lines changed: 1 addition & 1 deletion

```diff
@@ -139,7 +139,7 @@ Tensor& clamp_out(
           CTYPE_COMPUTE,
           op_name,
           utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-          [has_min, min_opt, has_max, max_opt](const auto val_in) {
+          [has_min, min_opt, has_max, max_opt](const auto& val_in) {
             auto val_out = val_in;
             if (has_min) {
               val_out = utils::max_override(
```

kernels/portable/cpu/op_div.cpp

Lines changed: 2 additions & 2 deletions

```diff
@@ -62,7 +62,7 @@ Tensor& div_out(
           CTYPE_COMPUTE,
           op_name,
           utils::SupportedTensorDtypes::FLOATHBF16>(
-          [](const auto val_a, const auto val_b) { return val_a / val_b; },
+          [](const auto& val_a, const auto& val_b) { return val_a / val_b; },
           ctx,
           a,
           utils::SupportedTensorDtypes::REALHBBF16,
@@ -195,7 +195,7 @@ Tensor& div_scalar_out(
           CTYPE_COMPUTE,
           op_name,
           utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-          [val_b](const auto val_a) { return val_a / val_b; },
+          [val_b](const auto& val_a) { return val_a / val_b; },
           ctx,
           a,
           utils::SupportedTensorDtypes::REALHBBF16,
```

kernels/portable/cpu/op_fmod.cpp

Lines changed: 1 addition & 1 deletion

```diff
@@ -138,7 +138,7 @@ Tensor& fmod_Scalar_out(
           CTYPE_COMPUTE,
           op_name,
           utils::SupportedTensorDtypes::REALHBF16>(
-          [val_b](const auto val_a) {
+          [val_b](const auto& val_a) {
             return executorch::math::fmod(val_a, (decltype(val_a))val_b);
           },
           ctx,
```

kernels/portable/cpu/op_isinf.cpp

Lines changed: 12 additions & 1 deletion

```diff
@@ -14,7 +14,18 @@ namespace torch {
 namespace executor {
 namespace native {

-DEFINE_UNARY_UFUNC_REALHBBF16_TO_BOOL(isinf_out, std::isinf)
+bool isinf_float(float x) {
+  return std::isinf(x);
+}
+
+bool isinf_double(double x) {
+  return std::isinf(x);
+}
+
+Tensor& isinf_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
+  return internal::unary_ufunc_realhbbf16_to_bool(
+      isinf_float, isinf_double, ctx, in, out);
+}

 } // namespace native
 } // namespace executor
```
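
A note on why explicit wrappers help here: `std::isinf` names an overload set (`float`, `double`, `long double`), so it has no single function type and cannot be passed directly to a helper whose callable parameters are deduced, the way `unary_ufunc_realhbbf16_to_bool` receives `isinf_float` and `isinf_double` above. A minimal sketch of that constraint, with a hypothetical `check()` helper standing in for the real utility (whose actual signature may differ):

```cpp
#include <cmath>
#include <cstdio>
#include <limits>

// Named wrappers pin each call down to a single overload, giving them a
// concrete function type that template deduction can latch onto.
bool isinf_float(float x) { return std::isinf(x); }
bool isinf_double(double x) { return std::isinf(x); }

// Hypothetical stand-in for a helper that takes one callable per dtype.
template <typename FloatFn, typename DoubleFn>
bool check(FloatFn f, DoubleFn d, float a, double b) {
  return f(a) || d(b);
}

int main() {
  // check(std::isinf, std::isinf, ...) would fail to compile: an overload
  // set has no single type for FloatFn/DoubleFn to deduce to.
  bool any = check(isinf_float, isinf_double,
                   std::numeric_limits<float>::infinity(), 0.0);
  std::printf("%d\n", any ? 1 : 0);  // prints 1
  return 0;
}
```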
