
Commit 0409c63

[js/webgpu] Optimize MultiHeadAttention|Transpose (#22420)
### Description

With this optimization, the 96 MultiHeadAttention|Transpose ops in phi3 disappear, and phi3 improves from 107 tokens to 113 tokens on my dGPUs. The optimization skips the transpose op when one of the transposed dims is 1, because a reshape is sufficient in that case.
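For context (not part of the commit itself), here is a minimal TypeScript sketch of why the transpose can be skipped: permuting a [batchSize, sequenceLength, numHeads, headSize] tensor to [batchSize, numHeads, sequenceLength, headSize] moves no data when numHeads === 1 or sequenceLength === 1, because every element's row-major offset is unchanged, so only the shape metadata needs to change, which is exactly what reshape does. The helper names below (offsetBSNH, offsetBNSH) are illustrative, not from the ONNX Runtime source.

```ts
// Sketch: the BSNH -> BNSH transpose moves no data when numHeads === 1 or
// sequenceLength === 1, so a reshape (shape-metadata change only) suffices.
// Helper names are illustrative, not from the ONNX Runtime source.

// Row-major offset of element (b, s, n, h) in a [B, S, N, H] tensor.
const offsetBSNH = (b: number, s: number, n: number, h: number, S: number, N: number, H: number) =>
  ((b * S + s) * N + n) * H + h;

// Row-major offset of the same logical element after permuting to [B, N, S, H].
const offsetBNSH = (b: number, s: number, n: number, h: number, S: number, N: number, H: number) =>
  ((b * N + n) * S + s) * H + h;

// With S === 1 the s index is always 0 (symmetrically for N === 1), and both
// formulas collapse to (b * N + n) * H + h: every element already sits at its
// post-transpose offset, so the transpose program can be skipped.
const B = 2, S = 1, N = 32, H = 96;
for (let b = 0; b < B; b++) {
  for (let n = 0; n < N; n++) {
    for (let h = 0; h < H; h++) {
      console.assert(
        offsetBSNH(b, 0, n, h, S, N, H) === offsetBNSH(b, 0, n, h, S, N, H),
        'offsets must match when sequenceLength === 1',
      );
    }
  }
}
```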
1 parent de93f40 commit 0409c63

File tree

1 file changed: +6 −0 lines changed


js/web/lib/wasm/jsep/webgpu/ops/multihead-attention.ts

Lines changed: 6 additions & 0 deletions
@@ -338,6 +338,9 @@ export const maybeTransposeToBNSHAndAddBias = (
     if (input.dims.length === 3) {
       reshapedInput = input.reshape([batchSize, sequenceLength, numHeads, headSize]);
     }
+    if (numHeads === 1 || sequenceLength === 1) {
+      return reshapedInput;
+    }
     return context.compute(createTransposeProgramInfo(reshapedInput, weightTransposeAttribute.perm), {
       inputs: [reshapedInput],
       outputs: [-1],
@@ -356,6 +359,9 @@ export const maybeTransposeToBNSHAndAddBias = (
       biasOffset!,
     );
     reshapedInput = reshapedInput.reshape([batchSize, sequenceLength, numHeads, headSize]);
+    if (numHeads === 1 || sequenceLength === 1) {
+      return reshapedInput;
+    }
     return context.compute(createTransposeProgramInfo(reshapedInput, weightTransposeAttribute.perm), {
       inputs: [reshapedInput],
       outputs: [-1],
