
Commit 1936d64

xhcao and wp authored
webgpu: fix dispatch size issue of Transpose operator (#26501)
Co-authored-by: wp <[email protected]>
1 parent b39e144 commit 1936d64

File tree

1 file changed: +6 −8 lines changed


onnxruntime/core/providers/webgpu/tensor/transpose.cc

Lines changed: 6 additions & 8 deletions
```diff
@@ -162,18 +162,16 @@ Status Transpose::DoTranspose(onnxruntime::webgpu::ComputeContext& context,
   uint32_t dispatch_z = 1;
 
   // This temporary workaround addresses a significant performance bottleneck
-  // (~12x slower) for the shape (3, 3, 2560, 1280) due to an issue with Intel's
+  // (~12x slower) for the input shape (1280, 2560, 3, 3) due to an issue with Intel's
   // GPU drivers. We manually normalize the dispatch group size to restore
   // performance.
   //
   // TODO: Revert this change once the driver issue is fixed.
-  if (context.AdapterInfo().vendor == std::string_view{"intel"}) {
-    // Only adjusted the dispatch size when rank is 4 yet.
-    if (rank == static_cast<size_t>(4)) {
-      dispatch_x = ceil_div(input_shape[0] * input_shape[1], 2);
-      dispatch_y = ceil_div(input_shape[2], 4);
-      dispatch_z = ceil_div(input_shape[3], 8);
-    }
+  if (context.AdapterInfo().vendor == std::string_view{"intel"} && rank == 4) {
+    uint32_t dispatch_size = dispatch_x;
+    dispatch_x = 4;
+    dispatch_y = 8;
+    dispatch_z = ceil_div(dispatch_size, dispatch_x * dispatch_y);
   }
   program.SetDispatchGroupSize(dispatch_x, dispatch_y, dispatch_z);
 }
```
