From ceb8d36b762f03bd954f96a9abe7aa7ff0c9ad22 Mon Sep 17 00:00:00 2001
From: Matthias Cremon
Date: Mon, 18 Aug 2025 16:34:45 -0700
Subject: [PATCH] Improve softmax perf when transpose is not needed (#13081)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/13081

When the supplied dimension is the last dim of the tensor, we don't need
to permute anything and can call the nnlib kernel directly.

Differential Revision: D79514231
---
 .../cadence/hifi/operators/op_softmax.cpp     | 36 ++++++++++++++++---
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/backends/cadence/hifi/operators/op_softmax.cpp b/backends/cadence/hifi/operators/op_softmax.cpp
index 645b9febef0..be496813ce8 100644
--- a/backends/cadence/hifi/operators/op_softmax.cpp
+++ b/backends/cadence/hifi/operators/op_softmax.cpp
@@ -72,7 +72,6 @@ Tensor& _softmax_out(
   if (optimized) {
     int* p_inp = (int*)in.const_data_ptr<float>();
     int* out_data = (int*)out.mutable_data_ptr<float>();
-
     int num_inp_dims = in.dim();
     int num_out_dims = num_inp_dims;
 
@@ -99,6 +98,37 @@ Tensor& _softmax_out(
 
     outer_stride = size;
 
+    WORD32 ret_val = 0;
+
+    // Check if the input is permuted. If not, then we don't need to transpose
+    bool is_permuted = false;
+    for (int i = 0; i < num_inp_dims; i++) {
+      if (p_permute_vec[i] != i) {
+        is_permuted = true;
+        break;
+      }
+    }
+
+    if (!is_permuted) {
+      const float* p_inpf = in.const_data_ptr<float>();
+      float* out_dataf = out.mutable_data_ptr<float>();
+
+      for (size_t outer_idx = 0; outer_idx < outer_size; ++outer_idx) {
+        size_t outer = outer_idx * outer_stride;
+        for (size_t inner_idx = 0; inner_idx < stride; ++inner_idx) {
+          size_t base = outer + inner_idx;
+
+          float* p_in_data = (float*)&p_inpf[base];
+          float* p_out_data = (float*)&out_dataf[base];
+
+          ret_val = xa_nn_vec_softmax_f32_f32(p_out_data, p_in_data, size);
+
+          ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
+        }
+      }
+      return out;
+    }
+
     int* p_out = (int*)kernels::allocate_temp_memory(
         ctx, out.numel() * sizeof(int));
 
@@ -109,7 +139,7 @@ Tensor& _softmax_out(
 
     ET_KERNEL_CHECK(ctx, p_out1 != nullptr, MemoryAllocationFailed, out);
 
-    WORD32 ret_val = xa_nn_transpose_32_32(
+    ret_val = xa_nn_transpose_32_32(
         p_out,
         p_out_shape,
         p_inp,
@@ -142,9 +172,7 @@ Tensor& _softmax_out(
         p_permute_vec,
         num_out_dims,
         num_inp_dims);
-
     ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
-
     return out;
   }
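
For context, here is a minimal portable sketch of the fast path this patch adds, outside the ExecuTorch kernel plumbing: when the softmax dimension is the last (innermost) dimension, the permutation vector is the identity, each row of length `size` is contiguous, and the row kernel can be applied directly. `vec_softmax_f32` below is a hypothetical stand-in for nnlib's `xa_nn_vec_softmax_f32_f32` (assuming the same contract: fills `size` outputs, returns 0 on success); it is not the Cadence implementation.

```cpp
#include <cmath>
#include <cstddef>

// Hypothetical stand-in for xa_nn_vec_softmax_f32_f32: numerically stable
// softmax over one contiguous row of `size` floats; returns 0 on success.
static int vec_softmax_f32(float* out, const float* in, int size) {
  if (size <= 0) {
    return -1;
  }
  float max_val = in[0];
  for (int i = 1; i < size; ++i) {
    max_val = std::fmax(max_val, in[i]);
  }
  float sum = 0.0f;
  for (int i = 0; i < size; ++i) {
    out[i] = std::exp(in[i] - max_val);  // subtract the max to avoid overflow
    sum += out[i];
  }
  for (int i = 0; i < size; ++i) {
    out[i] /= sum;
  }
  return 0;
}

// Fast path mirrored from the patch: with an identity permutation the input
// is outer_size contiguous rows of length size (stride == 1, outer_stride ==
// size), so each row feeds the vector kernel directly: no transpose and no
// temporary buffers.
static int softmax_last_dim(
    float* out, const float* in, size_t outer_size, int size) {
  for (size_t row = 0; row < outer_size; ++row) {
    const int ret = vec_softmax_f32(out + row * size, in + row * size, size);
    if (ret != 0) {
      return ret;
    }
  }
  return 0;
}
```

The win over the pre-existing path is that the transpose-in, softmax, transpose-out sequence, along with its two temporary buffer allocations, disappears entirely whenever the reduction dim is already innermost, which is the common case for softmax.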