upgrade dawn version to 4cb1f9be152a4fa6bb695c08cd707ab078a1e2fb (microsoft#24247)

fs-eire · web-flow · commit 30115cfe4d13 · 2025-03-31T12:24:56.000-07:00
### Description

Bump version of Dawn to 4cb1f9be152a4fa6bb695c08cd707ab078a1e2fb.

### Changes to the patches to Dawn:

Patches removed because they have already been merged upstream or
resolved in a different way:
- (public) CMake fix to support Emscripten v4.0.3+
- (private) Fix external ref count for "external" device in emwgpu C++
implementation
- (private) Allow "external" buffer in emwgpu C++ implementation

Patches kept unchanged:
- (private) Remove hard-coded CMAKE_OSX_DEPLOYMENT_TARGET in Dawn's
CMake files

Rewritten patches:
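- (public) Fix emwgpu C++ implementation for buffer destroy — rewritten as
a (private) patch that adds a destructor to `WGPUBufferImpl`, which destroys
the buffer for non-external buffers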

### Corresponding changes in ORT

- Dawn API changes
  - follow changes to `wgpu::Limits` (it replaces both `wgpu::RequiredLimits`
    and `wgpu::SupportedLimits`)
- remove the usage of `DAWN_EMSCRIPTEN_TOOLCHAIN`
- use `wgpu::InstanceDescriptor` when creating the `wgpu::Instance` in WASM
builds as well, since it is now supported there (the `__wasm__` special case
is removed).
diff --git a/cmake/deps.txt b/cmake/deps.txt
@@ -57,5 +57,5 @@ extensions;https://github.com/microsoft/onnxruntime-extensions/archive/c24b7bab0
 composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/204da9c522cebec5220bba52cd3542ebcaf99e7a.zip;1827348efd47831c13074245274d41b7cae8a557
 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e
 cudnn_frontend;https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.7.0.zip;d0753d8d5b39947ca0729d7773cb84653a129eb1
-dawn;https://github.com/google/dawn/archive/40a9fa79f76e6c76cca9e2fa69ea07f202f1d2e6.zip;e224563d5ab4a8e53a517b06f721242533bce722
+dawn;https://github.com/google/dawn/archive/4cb1f9be152a4fa6bb695c08cd707ab078a1e2fb.zip;de39336b7715f53c14eec61072293b85cc73b691
 kleidiai;https://github.com/ARM-software/kleidiai/archive/refs/tags/v1.4.0.tar.gz;22d3b57b54a61c194ab256ff11b0353a3b220244
diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
@@ -623,9 +623,7 @@ if (onnxruntime_USE_WEBGPU)
   set(DAWN_ENABLE_NULL OFF CACHE BOOL "" FORCE)
   set(DAWN_FETCH_DEPENDENCIES ON CACHE BOOL "" FORCE)
   set(DAWN_BUILD_TESTS OFF CACHE BOOL "" FORCE)
-  if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
-    set(DAWN_EMSCRIPTEN_TOOLCHAIN "${REPO_ROOT}/cmake/external/emsdk/upstream/emscripten" CACHE STRING "" FORCE)
-  else()
+  if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
     if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY)
       set(DAWN_BUILD_MONOLITHIC_LIBRARY ON CACHE BOOL "" FORCE)
       set(DAWN_ENABLE_INSTALL ON CACHE BOOL "" FORCE)
@@ -713,27 +711,14 @@ if (onnxruntime_USE_WEBGPU)
       #
       # The dawn.patch contains the following changes:
       #
-      # - (public) CMake fix to support Emscripten v4.0.3+
-      #   This change allows Dawn to find the file "gen_struct_info.py" in the correct location.
-      #   https://dawn-review.googlesource.com/c/dawn/+/225514
-      #
-      # - (public) Fix emwgpu C++ implementation for buffer destroy
+      # - (private) Allow WGPUBufferImpl class to destroy the buffer in the destructor
       #   In native implementation, wgpuBufferRelease will trigger the buffer destroy (if refcount decreased to 0). But
-      #   in emwgpu implementation, the buffer destroy won't happen. This change fixes the bug.
-      #   https://dawn-review.googlesource.com/c/dawn/+/226315
-      #
-      # - (private) Allow "external" buffer in emwgpu C++ implementation
-      #   This change allows WGPUBufferImpl to destroy the buffer when the refcount decreased to 0 only for non-external
-      #   buffer.
-      #   "external buffer" means the GPUBuffer instance created in JavaScript and imported to C++ by `importJsBuffer`.
+      #   in emwgpu implementation, the buffer destroy won't happen. This change adds a destructor to the buffer class
+      #   to destroy the buffer when the refcount is 0 for non-external buffers.
       #
       # - (private) Remove hard-coded CMAKE_OSX_DEPLOYMENT_TARGET in Dawn's CMake files
       #   https://github.com/microsoft/onnxruntime/pull/23729
       #
-      # - (private) Fix external ref count for "external" device in emwgpu C++ implementation
-      #   This change fixes the incorrect external ref count for class WGPUDeviceImpl when used with "external" device.
-      #   "external device" means the GPUDevice instance created in JavaScript and imported to C++ by `importJsDevice`.
-      #
       #
       PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/dawn/dawn.patch
       EXCLUDE_FROM_ALL
diff --git a/cmake/patches/dawn/dawn.patch b/cmake/patches/dawn/dawn.patch
@@ -11,137 +11,28 @@ index 50638e2456..efa42711e6 100644
 -    set(CMAKE_OSX_DEPLOYMENT_TARGET "11.0" CACHE STRING "Minimum macOS version" FORCE)
 -endif ()
 \ No newline at end of file
-diff --git a/src/emdawnwebgpu/CMakeLists.txt b/src/emdawnwebgpu/CMakeLists.txt
-index 6e8ae37593..633af91eef 100644
---- a/src/emdawnwebgpu/CMakeLists.txt
-+++ b/src/emdawnwebgpu/CMakeLists.txt
-@@ -77,9 +77,17 @@ if (${DAWN_ENABLE_EMSCRIPTEN})
-                 "${arg_UNPARSED_ARGUMENTS}")
-         endif()
- 
-+        # since Emscripten 4.0.3, file gen_struct_info.py is moved to outside of directory maint.
-+        if (EXISTS "${DAWN_EMSCRIPTEN_TOOLCHAIN}/tools/gen_struct_info.py")
-+            set(EM_GEN_STRUCT_INFO_SCRIPT "${DAWN_EMSCRIPTEN_TOOLCHAIN}/tools/gen_struct_info.py")
-+        elseif (EXISTS "${DAWN_EMSCRIPTEN_TOOLCHAIN}/tools/maint/gen_struct_info.py")
-+            set(EM_GEN_STRUCT_INFO_SCRIPT "${DAWN_EMSCRIPTEN_TOOLCHAIN}/tools/maint/gen_struct_info.py")
-+        else()
-+            message(FATAL_ERROR "Dawn: Failed to locate file gen_struct_info.py from Emscripten.")
-+        endif()
-         set(ARGS
-             ${Python3_EXECUTABLE}
--            "${DAWN_EMSCRIPTEN_TOOLCHAIN}/tools/maint/gen_struct_info.py"
-+            "${EM_GEN_STRUCT_INFO_SCRIPT}"
-             -q
-             "${EM_BUILD_GEN_DIR}/struct_info_webgpu.json"
-             "-I=${EM_BUILD_GEN_DIR}/include"
-diff --git a/src/emdawnwebgpu/README.md b/src/emdawnwebgpu/README.md
-index efd6491cd6..8ebc5d28b6 100644
---- a/src/emdawnwebgpu/README.md
-+++ b/src/emdawnwebgpu/README.md
-@@ -56,7 +56,7 @@ Set up the build directory using emcmake
- mkdir out/cmake-wasm
- cd out/cmake-wasm
- 
--# Make sure the path is to the source checkout of Emscripten, not emsdk's release.
-+# If using Emscripten v4.0.2 or lower, make sure the path is to the source checkout of Emscripten, not emsdk's release.
- emcmake cmake -GNinja -DDAWN_EMSCRIPTEN_TOOLCHAIN="path/to/emscripten" ../..
- 
- ninja
 diff --git a/third_party/emdawnwebgpu/webgpu.cpp b/third_party/emdawnwebgpu/webgpu.cpp
-index f1c5a7d50e..16f2495712 100644
+index 5bfac41dcc..71a153daaa 100644
 --- a/third_party/emdawnwebgpu/webgpu.cpp
 +++ b/third_party/emdawnwebgpu/webgpu.cpp
-@@ -131,7 +131,6 @@ class RefCounted : NonMovable {
-   bool Release() {
-     if (mRefCount.fetch_sub(1u, std::memory_order_release) == 1u) {
-       std::atomic_thread_fence(std::memory_order_acquire);
--      emwgpuDelete(this);
-       return true;
-     }
-     return false;
-@@ -234,6 +233,7 @@ class Ref {
-   static void Release(T value) {
-     if (value != nullptr && value->RefCounted::Release()) {
-       delete value;
-+      emwgpuDelete(value);
-     }
-   }
- 
-@@ -641,7 +641,8 @@ struct WGPUAdapterImpl final : public EventSource, public RefCounted {
- struct WGPUBufferImpl final : public EventSource,
-                               public RefCountedWithExternalCount {
-  public:
--  WGPUBufferImpl(const EventSource* source, bool mappedAtCreation);
-+  WGPUBufferImpl(const EventSource* source, bool mappedAtCreation, bool isExternal);
+@@ -692,6 +692,7 @@ struct WGPUBufferImpl final : public EventSource,
+   WGPUBufferImpl(const EventSource* source, bool mappedAtCreation);
+   // Injection constructor used when we already have a backing Buffer.
+   WGPUBufferImpl(const EventSource* source, WGPUBufferMapState mapState);
 +  ~WGPUBufferImpl();
  
    void Destroy();
    const void* GetConstMappedRange(size_t offset, size_t size);
-@@ -671,6 +672,7 @@ struct WGPUBufferImpl final : public EventSource,
-   };
-   MapRequest mPendingMapRequest;
-   WGPUBufferMapState mMapState;
-+  bool mIsExternal;
- };
- 
- struct WGPUQueueImpl final : public EventSource, public RefCounted {
-@@ -1164,11 +1166,15 @@ WGPUAdapter emwgpuCreateAdapter(const EventSource* source) {
- 
- WGPUBuffer emwgpuCreateBuffer(const EventSource* source,
-                               bool mappedAtCreation = false) {
--  return new WGPUBufferImpl(source, mappedAtCreation);
-+  return new WGPUBufferImpl(source, mappedAtCreation, true);
- }
- 
- WGPUDevice emwgpuCreateDevice(const EventSource* source, WGPUQueue queue) {
--  return new WGPUDeviceImpl(source, queue);
-+  // This function is only called from JS via `importJsDevice()`, which
-+  // needs to increment the external ref count to fix the behavior.
-+  WGPUDeviceImpl* device = new WGPUDeviceImpl(source, queue);
-+  device->AddExternalRef();
-+  return device;
- }
- 
- WGPUQueue emwgpuCreateQueue(const EventSource* source) {
-@@ -1275,15 +1281,22 @@ WGPUAdapterImpl::WGPUAdapterImpl(const EventSource* source)
- // WGPUBuffer implementations.
- // ----------------------------------------------------------------------------
- 
--WGPUBufferImpl::WGPUBufferImpl(const EventSource* source, bool mappedAtCreation)
-+WGPUBufferImpl::WGPUBufferImpl(const EventSource* source, bool mappedAtCreation, bool isExternal)
-     : EventSource(source),
-       mMapState(mappedAtCreation ? WGPUBufferMapState_Mapped
--                                 : WGPUBufferMapState_Unmapped) {
-+                                 : WGPUBufferMapState_Unmapped),
-+      mIsExternal(isExternal) {
-   if (mappedAtCreation) {
-     mPendingMapRequest = {kNullFutureId, WGPUMapMode_Write};
-   }
- }
+@@ -1361,6 +1362,12 @@ WGPUBufferImpl::WGPUBufferImpl(const EventSource* source,
+       RefCountedWithExternalCount(kImportedFromJS),
+       mMapState(mapState) {}
  
 +WGPUBufferImpl::~WGPUBufferImpl() {
-+  if (!mIsExternal) {
++  if (!IsImported()) {
 +    Destroy();
 +  }
 +}
 +
  void WGPUBufferImpl::Destroy() {
    emwgpuBufferDestroy(this);
    AbortPendingMap("Buffer was destroyed before mapping was resolved.");
-@@ -1504,6 +1517,7 @@ WGPUFuture WGPUShaderModuleImpl::GetCompilationInfo(
-   void wgpu##Name##Release(WGPU##Name o) {       \
-     if (o->Release()) {                          \
-       delete o;                                  \
-+      emwgpuDelete(o);                           \
-     }                                            \
-   }
- WGPU_OBJECTS(DEFINE_WGPU_DEFAULT_ADDREF_RELEASE)
-@@ -1638,7 +1652,7 @@ void wgpuBufferUnmap(WGPUBuffer buffer) {
- 
- WGPUBuffer wgpuDeviceCreateBuffer(WGPUDevice device,
-                                   const WGPUBufferDescriptor* descriptor) {
--  WGPUBuffer buffer = new WGPUBufferImpl(device, descriptor->mappedAtCreation);
-+  WGPUBuffer buffer = new WGPUBufferImpl(device, descriptor->mappedAtCreation, false);
-   emwgpuDeviceCreateBuffer(device, descriptor, buffer);
-   return buffer;
- }
diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc
@@ -110,7 +110,7 @@ void WebGpuContext::Initialize(const WebGpuBufferCacheConfig& buffer_cache_confi
         device_desc.requiredFeatures = required_features.data();
         device_desc.requiredFeatureCount = required_features.size();
       }
-      wgpu::RequiredLimits required_limits = GetRequiredLimits(adapter);
+      wgpu::Limits required_limits = GetRequiredLimits(adapter);
       device_desc.requiredLimits = &required_limits;
 
       // TODO: revise temporary error handling
@@ -139,9 +139,7 @@ void WebGpuContext::Initialize(const WebGpuBufferCacheConfig& buffer_cache_confi
     // cache adapter info
     ORT_ENFORCE(Device().GetAdapterInfo(&adapter_info_));
     // cache device limits
-    wgpu::SupportedLimits device_supported_limits;
-    ORT_ENFORCE(Device().GetLimits(&device_supported_limits));
-    device_limits_ = device_supported_limits.limits;
+    ORT_ENFORCE(Device().GetLimits(&device_limits_));
 
 #if !defined(__wasm__)
     supports_buffer_map_extended_usages_ = device_.HasFeature(wgpu::FeatureName::BufferMapExtendedUsages);
@@ -508,20 +506,20 @@ std::vector<wgpu::FeatureName> WebGpuContext::GetAvailableRequiredFeatures(const
   return required_features;
 }
 
-wgpu::RequiredLimits WebGpuContext::GetRequiredLimits(const wgpu::Adapter& adapter) const {
-  wgpu::RequiredLimits required_limits{};
-  wgpu::SupportedLimits adapter_limits;
+wgpu::Limits WebGpuContext::GetRequiredLimits(const wgpu::Adapter& adapter) const {
+  wgpu::Limits required_limits{};
+  wgpu::Limits adapter_limits;
   ORT_ENFORCE(adapter.GetLimits(&adapter_limits));
 
-  required_limits.limits.maxBindGroups = adapter_limits.limits.maxBindGroups;
-  required_limits.limits.maxComputeWorkgroupStorageSize = adapter_limits.limits.maxComputeWorkgroupStorageSize;
-  required_limits.limits.maxComputeWorkgroupsPerDimension = adapter_limits.limits.maxComputeWorkgroupsPerDimension;
-  required_limits.limits.maxStorageBufferBindingSize = adapter_limits.limits.maxStorageBufferBindingSize;
-  required_limits.limits.maxBufferSize = adapter_limits.limits.maxBufferSize;
-  required_limits.limits.maxComputeInvocationsPerWorkgroup = adapter_limits.limits.maxComputeInvocationsPerWorkgroup;
-  required_limits.limits.maxComputeWorkgroupSizeX = adapter_limits.limits.maxComputeWorkgroupSizeX;
-  required_limits.limits.maxComputeWorkgroupSizeY = adapter_limits.limits.maxComputeWorkgroupSizeY;
-  required_limits.limits.maxComputeWorkgroupSizeZ = adapter_limits.limits.maxComputeWorkgroupSizeZ;
+  required_limits.maxBindGroups = adapter_limits.maxBindGroups;
+  required_limits.maxComputeWorkgroupStorageSize = adapter_limits.maxComputeWorkgroupStorageSize;
+  required_limits.maxComputeWorkgroupsPerDimension = adapter_limits.maxComputeWorkgroupsPerDimension;
+  required_limits.maxStorageBufferBindingSize = adapter_limits.maxStorageBufferBindingSize;
+  required_limits.maxBufferSize = adapter_limits.maxBufferSize;
+  required_limits.maxComputeInvocationsPerWorkgroup = adapter_limits.maxComputeInvocationsPerWorkgroup;
+  required_limits.maxComputeWorkgroupSizeX = adapter_limits.maxComputeWorkgroupSizeX;
+  required_limits.maxComputeWorkgroupSizeY = adapter_limits.maxComputeWorkgroupSizeY;
+  required_limits.maxComputeWorkgroupSizeZ = adapter_limits.maxComputeWorkgroupSizeZ;
 
   return required_limits;
 }
@@ -740,13 +738,9 @@ WebGpuContext& WebGpuContextFactory::CreateContext(const WebGpuContextConfig& co
 #endif
 
     // Step.2 - Create wgpu::Instance
-#if !defined(__wasm__)
     wgpu::InstanceDescriptor instance_desc{};
     instance_desc.capabilities.timedWaitAnyEnable = true;
     default_instance_ = wgpu::CreateInstance(&instance_desc);
-#else
-    default_instance_ = wgpu::CreateInstance(nullptr);
-#endif
 
     ORT_ENFORCE(default_instance_ != nullptr, "Failed to create wgpu::Instance.");
   });
diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.h b/onnxruntime/core/providers/webgpu/webgpu_context.h
@@ -161,7 +161,7 @@ class WebGpuContext final {
   std::vector<const char*> GetEnabledDeviceToggles() const;
   std::vector<const char*> GetDisabledDeviceToggles() const;
   std::vector<wgpu::FeatureName> GetAvailableRequiredFeatures(const wgpu::Adapter& adapter) const;
-  wgpu::RequiredLimits GetRequiredLimits(const wgpu::Adapter& adapter) const;
+  wgpu::Limits GetRequiredLimits(const wgpu::Adapter& adapter) const;
   void WriteTimestamp(uint32_t query_index);
 
   struct PendingKernelInfo {