[webgpu] Add dispatchWorkgroupsIndirect support (#25934)

qjia7 · github-actions[bot] · web-flow · commit f6b405c9c50f · 2025-09-19T06:03:42.000+08:00
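### Description: This PR adds `dispatchWorkgroupsIndirect` support to WebGPU programs: a program can now supply an indirect dispatch tensor (via `SetIndirectDispatchTensor`) whose buffer determines the dispatch dimensions, instead of setting explicit dispatch group sizes. It is part of the work to enable graph capture for phi4 (#25868). --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>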
diff --git a/onnxruntime/core/providers/webgpu/allocator.cc b/onnxruntime/core/providers/webgpu/allocator.cc
@@ -27,7 +27,7 @@ void* GpuBufferAllocator::Alloc(size_t size) {
   stats_.num_allocs++;
 
   wgpu::BufferUsage usage = mapped_at_creation_ ? wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapWrite
-                                                : wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst;
+                                                : wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Indirect;
 
   return buffer_manager_.Create(size, usage);
 }
diff --git a/onnxruntime/core/providers/webgpu/program.cc b/onnxruntime/core/providers/webgpu/program.cc
@@ -319,6 +319,7 @@ ProgramBase::ProgramBase(std::string_view name, ProgramMetadata&& metadata)
       dispatch_group_size_x_{0},
       dispatch_group_size_y_{0},
       dispatch_group_size_z_{0},
+      indirect_dispatch_tensor_{nullptr},
       workgroup_size_x_{0},
       workgroup_size_y_{0},
       workgroup_size_z_{0} {
@@ -359,6 +360,11 @@ ProgramBase& ProgramBase::SetDispatchGroupSize(uint32_t x, uint32_t y, uint32_t
   return *this;
 }
 
+ProgramBase& ProgramBase::SetIndirectDispatchTensor(const Tensor* indirect_dispatch_tensor) {
+  indirect_dispatch_tensor_ = indirect_dispatch_tensor;
+  return *this;
+}
+
 ProgramBase& ProgramBase::SetWorkgroupSize(uint32_t x) {
   return SetWorkgroupSize(x, 1, 1);
 }
diff --git a/onnxruntime/core/providers/webgpu/program.h b/onnxruntime/core/providers/webgpu/program.h
@@ -305,6 +305,9 @@ class ProgramBase {
   // set the size of dispatch groups.
   ProgramBase& SetDispatchGroupSize(uint32_t x, uint32_t y, uint32_t z);
 
+  // set indirect dispatch tensor for indirect dispatch
+  ProgramBase& SetIndirectDispatchTensor(const Tensor* indirect_dispatch_tensor);
+
   // set the size of a workgroup grid. Y and Z are 1 if not specified.
   ProgramBase& SetWorkgroupSize(uint32_t x);
   // set the size of a workgroup grid. Z is 1 if not specified.
@@ -348,6 +351,7 @@ class ProgramBase {
   inline uint32_t DispatchGroupSizeX() const { return dispatch_group_size_x_; }
   inline uint32_t DispatchGroupSizeY() const { return dispatch_group_size_y_; }
   inline uint32_t DispatchGroupSizeZ() const { return dispatch_group_size_z_; }
+  inline const Tensor* IndirectDispatchTensor() const { return indirect_dispatch_tensor_; }
   inline uint32_t WorkgroupSizeX() const { return workgroup_size_x_; }
   inline uint32_t WorkgroupSizeY() const { return workgroup_size_y_; }
   inline uint32_t WorkgroupSizeZ() const { return workgroup_size_z_; }
@@ -374,6 +378,8 @@ class ProgramBase {
   uint32_t dispatch_group_size_y_;
   uint32_t dispatch_group_size_z_;
 
+  const Tensor* indirect_dispatch_tensor_;
+
   uint32_t workgroup_size_x_;
   uint32_t workgroup_size_y_;
   uint32_t workgroup_size_z_;
diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc
@@ -255,7 +255,14 @@ Status WebGpuContext::Run(ComputeContext& context, const ProgramBase& program) {
   uint32_t x = program.DispatchGroupSizeX();
   uint32_t y = program.DispatchGroupSizeY();
   uint32_t z = program.DispatchGroupSizeZ();
-  ORT_RETURN_IF_ERROR(program_mgr_->NormalizeDispatchGroupSize(x, y, z));
+
+  // Skip normalization for indirect dispatch since dimensions are determined by the indirect buffer
+  if (program.IndirectDispatchTensor() == nullptr) {
+    ORT_RETURN_IF_ERROR(program_mgr_->NormalizeDispatchGroupSize(x, y, z));
+  } else {
+    ORT_ENFORCE(x == 0 && y == 0 && z == 0,
+                "Only one of SetIndirectDispatchTensor and SetDispatchGroupSize should be called for program", program.Name());
+  }
 
   bool is_1d_dispatch = (y == 1 && z == 1);
 
@@ -442,7 +449,7 @@ Status WebGpuContext::Run(ComputeContext& context, const ProgramBase& program) {
     bind_buffers.push_back(uniform_buffer);
   }
 
-  LaunchComputePipeline(compute_pass_encoder, bind_buffers, *program_artifact, x, y, z);
+  LaunchComputePipeline(compute_pass_encoder, bind_buffers, *program_artifact, x, y, z, program.IndirectDispatchTensor());
   if (uniform_buffer) {
     buffer_mgr.Release(uniform_buffer);
   }
@@ -722,7 +729,8 @@ void WebGpuContext::OnRunEnd() {
 void WebGpuContext::LaunchComputePipeline(const wgpu::ComputePassEncoder& compute_pass_encoder,
                                           const std::vector<WGPUBuffer>& bind_buffers,
                                           const ProgramArtifact& program_artifact,
-                                          uint32_t x, uint32_t y, uint32_t z) {
+                                          uint32_t x, uint32_t y, uint32_t z,
+                                          const Tensor* indirect_dispatch_tensor) {
   uint32_t entry_index = 0;
   std::vector<WGPUBindGroupEntry> bind_group_entries;
   for (WGPUBuffer buffer : bind_buffers) {
@@ -738,14 +746,27 @@ void WebGpuContext::LaunchComputePipeline(const wgpu::ComputePassEncoder& comput
 
   auto bind_group = wgpuDeviceCreateBindGroup(Device().Get(), &bind_group_desc);
   if (graph_capture_state_ == GraphCaptureState::Capturing) {
+    WGPUBuffer indirect_buffer = nullptr;
+    if (indirect_dispatch_tensor != nullptr) {
+      indirect_buffer = reinterpret_cast<WGPUBuffer>(const_cast<void*>(indirect_dispatch_tensor->DataRaw()));
+    }
     external_captured_commands_->push_back({program_artifact.compute_pipeline,
                                             bind_group,
                                             bind_group_layout,
-                                            {x, y, z}});
+                                            {x, y, z},
+                                            indirect_buffer});
   } else {
     compute_pass_encoder.SetPipeline(program_artifact.compute_pipeline);
     wgpuComputePassEncoderSetBindGroup(compute_pass_encoder.Get(), 0, bind_group, 0, nullptr);
-    compute_pass_encoder.DispatchWorkgroups(x, y, z);
+
+    if (indirect_dispatch_tensor != nullptr) {
+      // Use indirect dispatch
+      WGPUBuffer indirect_buffer = reinterpret_cast<WGPUBuffer>(const_cast<void*>(indirect_dispatch_tensor->DataRaw()));
+      compute_pass_encoder.DispatchWorkgroupsIndirect(indirect_buffer, 0);
+    } else {
+      // Use direct dispatch
+      compute_pass_encoder.DispatchWorkgroups(x, y, z);
+    }
 
     wgpuBindGroupRelease(bind_group);
     wgpuBindGroupLayoutRelease(bind_group_layout);
@@ -781,7 +802,15 @@ void WebGpuContext::Replay(const std::vector<webgpu::CapturedCommandInfo>& captu
     WriteTimestamp(num_pending_dispatches_ * 2);
     compute_pass_encoder.SetPipeline(command.compute_pipeline);
     wgpuComputePassEncoderSetBindGroup(compute_pass_encoder.Get(), 0, command.bind_group, 0, nullptr);
-    compute_pass_encoder.DispatchWorkgroups(command.dispatch_group[0], command.dispatch_group[1], command.dispatch_group[2]);
+
+    if (command.indirect_buffer != nullptr) {
+      // Use indirect dispatch
+      compute_pass_encoder.DispatchWorkgroupsIndirect(command.indirect_buffer, 0);
+    } else {
+      // Use direct dispatch
+      compute_pass_encoder.DispatchWorkgroups(command.dispatch_group[0], command.dispatch_group[1], command.dispatch_group[2]);
+    }
+
     WriteTimestamp(num_pending_dispatches_ * 2 + 1);
     ++num_pending_dispatches_;
     if (num_pending_dispatches_ >= max_num_pending_dispatches_ ||
diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.h b/onnxruntime/core/providers/webgpu/webgpu_context.h
@@ -30,6 +30,7 @@ struct CapturedCommandInfo {
   WGPUBindGroup bind_group;
   WGPUBindGroupLayout bind_group_layout;
   std::array<uint32_t, 3> dispatch_group;
+  WGPUBuffer indirect_buffer;  // WGPUBuffer for indirect dispatch, nullptr if not using indirect dispatch
 };
 
 struct WebGpuContextConfig {
@@ -182,7 +183,8 @@ class WebGpuContext final {
   void LaunchComputePipeline(const wgpu::ComputePassEncoder& compute_pass_encoder,
                              const std::vector<WGPUBuffer>& bind_buffers,
                              const ProgramArtifact& program_artifact,
-                             uint32_t x, uint32_t y, uint32_t z);
+                             uint32_t x, uint32_t y, uint32_t z,
+                             const Tensor* indirect_dispatch_tensor = nullptr);
 
   std::vector<const char*> GetEnabledAdapterToggles() const;
   std::vector<const char*> GetEnabledDeviceToggles() const;

Original file line number	Diff line number	Diff line change
`@@ -27,7 +27,7 @@ void* GpuBufferAllocator::Alloc(size_t size) {`
`27`	`27`	`stats_.num_allocs++;`
`28`	`28`
`29`	`29`	`wgpu::BufferUsage usage = mapped_at_creation_ ? wgpu::BufferUsage::Storage \| wgpu::BufferUsage::CopySrc \| wgpu::BufferUsage::CopyDst \| wgpu::BufferUsage::MapWrite`
`30`		`- : wgpu::BufferUsage::Storage \| wgpu::BufferUsage::CopySrc \| wgpu::BufferUsage::CopyDst;`
	`30`	`+ : wgpu::BufferUsage::Storage \| wgpu::BufferUsage::CopySrc \| wgpu::BufferUsage::CopyDst \| wgpu::BufferUsage::Indirect;`
`31`	`31`
`32`	`32`	`return buffer_manager_.Create(size, usage);`
`33`	`33`	`}`