diff --git a/kernels/optimized/cpu/op_log_softmax.cpp b/kernels/optimized/cpu/op_log_softmax.cpp
index 1d2467bca5f..1822a06f29f 100644
--- a/kernels/optimized/cpu/op_log_softmax.cpp
+++ b/kernels/optimized/cpu/op_log_softmax.cpp
@@ -75,17 +75,20 @@ void log_softmax_kernel(const Tensor& input, int64_t dim, Tensor& out) {
     static_assert(
         std::is_same_v<OUT_T, float>,
         "Below loop actually only supports float.");
-    const VecIn max_input_vec(max_input);
-    for (; d + VecOut::size() < dim_size; d += VecOut::size()) {
-      auto index = d * dim_stride;
-      auto in = VecIn::loadu(&input_data[index]);
-      auto out_ = (in - max_input_vec).exp();
-      out_.store(&output_data[index]);
+    // It is not correct to vectorize if dim is not contiguous!
+    if (dim_stride == 1) {
+      const VecIn max_input_vec(max_input);
+      for (; d + VecOut::size() < dim_size; d += VecOut::size()) {
+        auto index = d * dim_stride;
+        auto in = VecIn::loadu(&input_data[index]);
+        auto out_ = (in - max_input_vec).exp();
+        out_.store(&output_data[index]);
 #if defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE)
-      temp_sum += vaddvq_f32(out_);
+        temp_sum += vaddvq_f32(out_);
 #else
-      temp_sum += at::vec::vec_reduce_all<float>(std::plus<VecOut>(), out_);
+        temp_sum += at::vec::vec_reduce_all<float>(std::plus<VecOut>(), out_);
 #endif
+      }
     }
     for (; d < dim_size; ++d) {
       output_data[d * dim_stride] =
diff --git a/kernels/test/op_log_softmax_test.cpp b/kernels/test/op_log_softmax_test.cpp
index 94047592a80..1b01ff8a78d 100644
--- a/kernels/test/op_log_softmax_test.cpp
+++ b/kernels/test/op_log_softmax_test.cpp
@@ -72,6 +72,59 @@ class OpLogSoftmaxOutTest : public OperatorTest {
       EXPECT_TENSOR_CLOSE(out, expected);
     }
   }
+
+  template <class CTYPE, ScalarType DTYPE>
+  void test_dtype_noncontiguous_dim() {
+    TensorFactory<DTYPE> tf;
+
+    // Dim 0 must be longer than the vector width of the machine (for
+    // float, this is 4 for ARM64 and 8 for AVX2) to exhibit problems.
+    // clang-format off
+    Tensor x = tf.make(
+        {9, 3},
+        {
+          0, 9, 18,
+          1, 10, 19,
+          2, 11, 20,
+          3, 12, 21,
+          4, 13, 22,
+          5, 14, 23,
+          6, 15, 24,
+          7, 16, 25,
+          8, 17, 26,
+        });
+    // clang-format on
+
+    Tensor out = tf.zeros({9, 3});
+
+    op_log_softmax_out(x, /*dim=*/0, /*half_to_float*/ false, out);
+
+    // clang-format off
+    Tensor expected = tf.make(
+        {9, 3},
+        {
+          -8.45855, -8.45855, -8.45855,
+          -7.45855, -7.45855, -7.45855,
+          -6.45855, -6.45855, -6.45855,
+          -5.45855, -5.45855, -5.45855,
+          -4.45855, -4.45855, -4.45855,
+          -3.45855, -3.45855, -3.45855,
+          -2.45855, -2.45855, -2.45855,
+          -1.45855, -1.45855, -1.45855,
+          -0.458552, -0.458552, -0.458552
+        });
+    // clang-format on
+
+    if constexpr (DTYPE == ScalarType::BFloat16) {
+      EXPECT_TENSOR_CLOSE_WITH_TOL(
+          out,
+          expected,
+          1e-2,
+          executorch::runtime::testing::internal::kDefaultAtol);
+    } else {
+      EXPECT_TENSOR_CLOSE(out, expected);
+    }
+  }
 };
 
 TEST_F(OpLogSoftmaxOutTest, Smoke) {
@@ -101,6 +154,10 @@ TEST_F(OpLogSoftmaxOutTest, AllDtypesSupported) {
 #undef TEST_ENTRY
 }
 
+TEST_F(OpLogSoftmaxOutTest, NonContiguous) {
+  test_dtype_noncontiguous_dim<float, ScalarType::Float>();
+}
+
 TEST_F(OpLogSoftmaxOutTest, MismatchedDimensionsDies) {
   if (SupportedFeatures::get()->is_aten) {
     GTEST_SKIP() << "ATen currently supports mismatched dimensions";
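
Not part of the PR: a minimal standalone sketch of the bug the kernel change guards against. VecIn::loadu performs a contiguous load, so when the softmax dim has dim_stride != 1 (e.g. dim 0 of a row-major {9, 3} tensor, as in the new test), the loaded lanes straddle several softmax slices instead of stepping along one. The names kRows, kCols, and kVecWidth below are hypothetical and exist only for illustration.

#include <cstdio>

int main() {
  // A 9x3 row-major tensor like the one in the new test; softmax over
  // dim 0 means successive elements of that dim are kCols floats apart.
  constexpr int kRows = 9;
  constexpr int kCols = 3;
  constexpr int kVecWidth = 4; // float lanes on ARM64 NEON
  float input[kRows * kCols];
  for (int i = 0; i < kRows * kCols; ++i) {
    input[i] = static_cast<float>(i);
  }
  const int dim_stride = kCols; // stride of dim 0 in a row-major layout

  // What a contiguous SIMD load at &input[d * dim_stride] reads for d == 0:
  // four adjacent floats, which straddle several columns and rows.
  std::printf("contiguous load: ");
  for (int lane = 0; lane < kVecWidth; ++lane) {
    std::printf("%g ", input[0 * dim_stride + lane]); // prints 0 1 2 3
  }
  // What the scalar fallback correctly visits: every dim_stride-th float,
  // i.e. column 0 walked along dim 0.
  std::printf("\nstrided elements: ");
  for (int d = 0; d < kVecWidth; ++d) {
    std::printf("%g ", input[d * dim_stride]); // prints 0 3 6 9
  }
  std::printf("\n");
  return 0;
}

The fix takes the simple route: the dim_stride == 1 guard keeps the vectorized path for the contiguous case only, and strided cases fall through to the existing (correct) scalar loop. A gather-based vectorization of the strided case would be possible in principle, but is not what this diff does.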