From ceb8d36b762f03bd954f96a9abe7aa7ff0c9ad22 Mon Sep 17 00:00:00 2001
From: Matthias Cremon
Date: Mon, 18 Aug 2025 16:34:45 -0700
Subject: [PATCH] Improve softmax perf when transpose is not needed (#13081)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/13081

When the supplied dimension is the last dim of the tensor, we don't need
to permute anything and can call the nnlib kernel directly.

Differential Revision: D79514231
---
 .../cadence/hifi/operators/op_softmax.cpp     | 36 ++++++++++++++++---
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/backends/cadence/hifi/operators/op_softmax.cpp b/backends/cadence/hifi/operators/op_softmax.cpp
index 645b9febef0..be496813ce8 100644
--- a/backends/cadence/hifi/operators/op_softmax.cpp
+++ b/backends/cadence/hifi/operators/op_softmax.cpp
@@ -72,7 +72,6 @@ Tensor& _softmax_out(
   if (optimized) {
     int* p_inp = (int*)in.const_data_ptr<float>();
     int* out_data = (int*)out.mutable_data_ptr<float>();
-
     int num_inp_dims = in.dim();
     int num_out_dims = num_inp_dims;
 
@@ -99,6 +98,37 @@ Tensor& _softmax_out(
 
     outer_stride = size;
 
+    WORD32 ret_val = 0;
+
+    // Check if the input is permuted. If not, then we don't need to transpose
+    bool is_permuted = false;
+    for (int i = 0; i < num_inp_dims; i++) {
+      if (p_permute_vec[i] != i) {
+        is_permuted = true;
+        break;
+      }
+    }
+
+    if (!is_permuted) {
+      const float* p_inpf = in.const_data_ptr<float>();
+      float* out_dataf = out.mutable_data_ptr<float>();
+
+      for (size_t outer_idx = 0; outer_idx < outer_size; ++outer_idx) {
+        size_t outer = outer_idx * outer_stride;
+        for (size_t inner_idx = 0; inner_idx < stride; ++inner_idx) {
+          size_t base = outer + inner_idx;
+
+          float* p_in_data = (float*)&p_inpf[base];
+          float* p_out_data = (float*)&out_dataf[base];
+
+          ret_val = xa_nn_vec_softmax_f32_f32(p_out_data, p_in_data, size);
+
+          ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
+        }
+      }
+      return out;
+    }
+
     int* p_out = (int*)kernels::allocate_temp_memory(
         ctx, out.numel() * sizeof(int));
 
@@ -109,7 +139,7 @@ Tensor& _softmax_out(
 
     ET_KERNEL_CHECK(ctx, p_out1 != nullptr, MemoryAllocationFailed, out);
 
-    WORD32 ret_val = xa_nn_transpose_32_32(
+    ret_val = xa_nn_transpose_32_32(
         p_out,
         p_out_shape,
         p_inp,
@@ -142,9 +172,7 @@ Tensor& _softmax_out(
         p_permute_vec,
         num_out_dims,
         num_inp_dims);
-
     ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
-
     return out;
   }
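
For context, here is a minimal portable sketch of the fast path this patch adds, outside the ExecuTorch kernel plumbing: when the softmax dimension is the last (innermost) dimension, the permutation vector is the identity, each row of length `size` is contiguous, and the row kernel can be applied directly. `vec_softmax_f32` below is a hypothetical stand-in for nnlib's `xa_nn_vec_softmax_f32_f32` (assuming the same contract: fills `size` outputs, returns 0 on success); it is not the Cadence implementation.

```cpp
#include <cmath>
#include <cstddef>

// Hypothetical stand-in for xa_nn_vec_softmax_f32_f32: numerically stable
// softmax over one contiguous row of `size` floats; returns 0 on success.
static int vec_softmax_f32(float* out, const float* in, int size) {
  if (size <= 0) {
    return -1;
  }
  float max_val = in[0];
  for (int i = 1; i < size; ++i) {
    max_val = std::fmax(max_val, in[i]);
  }
  float sum = 0.0f;
  for (int i = 0; i < size; ++i) {
    out[i] = std::exp(in[i] - max_val);  // subtract the max to avoid overflow
    sum += out[i];
  }
  for (int i = 0; i < size; ++i) {
    out[i] /= sum;
  }
  return 0;
}

// Fast path mirrored from the patch: with an identity permutation the input
// is outer_size contiguous rows of length size (stride == 1, outer_stride ==
// size), so each row feeds the vector kernel directly: no transpose and no
// temporary buffers.
static int softmax_last_dim(
    float* out, const float* in, size_t outer_size, int size) {
  for (size_t row = 0; row < outer_size; ++row) {
    const int ret = vec_softmax_f32(out + row * size, in + row * size, size);
    if (ret != 0) {
      return ret;
    }
  }
  return 0;
}
```

The win over the pre-existing path is that the transpose-in, softmax, transpose-out sequence, along with its two temporary buffer allocations, disappears entirely whenever the reduction dim is already innermost, which is the common case for softmax.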