
Commit 6ab0b50
Author: Github Executorch

Fix log_softmax along non-contiguous dim

#8382 certainly didn't fix this problem (and added it on x86), but I don't think it was correct on ARM prior to that either. Added a regression test.

Differential Revision: [D69928884](https://our.internmc.facebook.com/intern/diff/D69928884/)
ghstack-source-id: 267433500
Pull Request resolved: #8595

1 parent 2fff01a

2 files changed: +69, -8 lines

kernels/optimized/cpu/op_log_softmax.cpp

Lines changed: 11 additions & 8 deletions
@@ -75,17 +75,20 @@ void log_softmax_kernel(const Tensor& input, int64_t dim, Tensor& out) {
     static_assert(
         std::is_same_v<OUT_T, float>,
         "Below loop actually only supports float.");
-    const VecIn max_input_vec(max_input);
-    for (; d + VecOut::size() < dim_size; d += VecOut::size()) {
-      auto index = d * dim_stride;
-      auto in = VecIn::loadu(&input_data[index]);
-      auto out_ = (in - max_input_vec).exp();
-      out_.store(&output_data[index]);
+    // It is not correct to vectorize if dim is not contiguous!
+    if (dim_stride == 1) {
+      const VecIn max_input_vec(max_input);
+      for (; d + VecOut::size() < dim_size; d += VecOut::size()) {
+        auto index = d * dim_stride;
+        auto in = VecIn::loadu(&input_data[index]);
+        auto out_ = (in - max_input_vec).exp();
+        out_.store(&output_data[index]);
 #if defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE)
-      temp_sum += vaddvq_f32(out_);
+        temp_sum += vaddvq_f32(out_);
 #else
-      temp_sum += at::vec::vec_reduce_all<float>(std::plus<VecOut>(), out_);
+        temp_sum += at::vec::vec_reduce_all<float>(std::plus<VecOut>(), out_);
 #endif
+      }
     }
     for (; d < dim_size; ++d) {
       output_data[d * dim_stride] =
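For readers unfamiliar with the layout here, the following is a minimal standalone sketch (illustrative assumptions only, not ExecuTorch code) of why the removed loop was incorrect when dim_stride != 1: VecIn::loadu reads consecutive floats starting at &input_data[d * dim_stride], whereas the elements that belong to the softmax dim are spaced dim_stride apart, so a contiguous vector load mixes in values from other slices.

// Illustrative sketch, not ExecuTorch code: shows which flat indices a
// contiguous vector load touches versus which indices actually belong to
// the softmax dim when dim_stride != 1. Sizes and names are assumptions.
#include <cstdio>

int main() {
  const int dim_size = 9;    // length of the softmax dim (dim 0 of a {9, 3} tensor)
  const int dim_stride = 3;  // stride of dim 0 in a row-major {9, 3} float tensor
  const int vec_width = 4;   // e.g. 4 float lanes on ARM64 NEON

  // The d-th element of one softmax slice lives at flat index d * dim_stride.
  for (int d = 0; d < dim_size; ++d) {
    std::printf("dim element %d -> flat index %d\n", d, d * dim_stride);
  }

  // A contiguous load at &data[d * dim_stride] instead reads flat indices
  // d * dim_stride .. d * dim_stride + vec_width - 1, which straddles other
  // slices unless dim_stride == 1 -- exactly the condition the fix checks.
  const int d = 0;
  std::printf("loadu at d=%d reads flat indices %d..%d\n",
              d, d * dim_stride, d * dim_stride + vec_width - 1);
  return 0;
}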

kernels/test/op_log_softmax_test.cpp

Lines changed: 58 additions & 0 deletions
@@ -72,6 +72,60 @@ class OpLogSoftmaxOutTest : public OperatorTest {
       EXPECT_TENSOR_CLOSE(out, expected);
     }
   }
+
+  template <class CTYPE, executorch::aten::ScalarType DTYPE>
+  void test_dtype_noncontiguous_dim() {
+    TensorFactory<DTYPE> tf;
+
+    // clang-format off Dim 0 must be longer than the vector width of
+    // the machine (for float, this is 4 for ARM64 and 8 for AVX2) to
+    // exhibit problems.
+    Tensor x = tf.make(
+        {9, 3},
+        {
+            0, 9, 18,
+            1, 10, 19,
+            2, 11, 20,
+            3, 12, 21,
+            4, 13, 22,
+            5, 14, 23,
+            6, 15, 24,
+            7, 16, 25,
+            8, 17, 26,
+        });
+    // clang-format on
+
+    Tensor out = tf.zeros({9, 3});
+
+    op_log_softmax_out(x, /*dim=*/0, /*half_to_float*/ false, out);
+
+    // clang-format off
+    Tensor expected = tf.make(
+        {9, 3},
+        {
+            -8.45855, -8.45855, -8.45855,
+            -7.45855, -7.45855, -7.45855,
+            -6.45855, -6.45855, -6.45855,
+            -5.45855, -5.45855, -5.45855,
+            -4.45855, -4.45855, -4.45855,
+            -3.45855, -3.45855, -3.45855,
+            -2.45855, -2.45855, -2.45855,
+            -1.45855, -1.45855, -1.45855,
+            -0.458552, -0.458552, -0.458552
+        });
+    // clang-format on
+
+    if constexpr (DTYPE == ScalarType::BFloat16) {
+      EXPECT_TENSOR_CLOSE_WITH_TOL(
+          out,
+          expected,
+          1e-2,
+          executorch::runtime::testing::internal::kDefaultAtol);
+    } else {
+      EXPECT_TENSOR_CLOSE(out, expected);
+    }
+  }
+
 };

 TEST_F(OpLogSoftmaxOutTest, Smoke) {
@@ -101,6 +155,10 @@ TEST_F(OpLogSoftmaxOutTest, AllDtypesSupported) {
 #undef TEST_ENTRY
 }

+TEST_F(OpLogSoftmaxOutTest, NonContiguous) {
+  test_dtype_noncontiguous_dim<float, ScalarType::Float>();
+}
+
 TEST_F(OpLogSoftmaxOutTest, MismatchedDimensionsDies) {
   if (SupportedFeatures::get()->is_aten) {
     GTEST_SKIP() << "ATen currently supports mismatched dimensions";
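As a cross-check on the expected values in the regression test: each column of the input is 0..8 plus a constant offset, and adding a constant shifts both x_i and the column's log-sum-exp equally, so all three columns of `expected` are identical and equal to i - log(sum over k of exp(k)) for k = 0..8, i.e. i - 8.45855. Below is a minimal standalone sketch (plain C++, illustrative only, not part of the test suite) that reproduces that column.

// Illustrative sketch, not ExecuTorch test code: computes log_softmax over
// the column {0, 1, ..., 8} with the usual max-subtraction trick and prints
// values matching the `expected` tensor above (-8.45855 .. -0.458552).
#include <cmath>
#include <cstdio>

int main() {
  const int n = 9;
  double x[n];
  for (int i = 0; i < n; ++i) {
    x[i] = i;  // each column of the {9, 3} input is 0..8 plus a constant
  }

  // log_softmax(x_i) = x_i - (max + log(sum_j exp(x_j - max)))
  const double max_x = x[n - 1];
  double sum = 0.0;
  for (int i = 0; i < n; ++i) {
    sum += std::exp(x[i] - max_x);
  }
  const double log_sum_exp = max_x + std::log(sum);  // ~8.45855

  for (int i = 0; i < n; ++i) {
    std::printf("log_softmax(%d) = %f\n", i, x[i] - log_sum_exp);
  }
  return 0;
}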
