 # define WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES 16 // e.g. enough for two timestamps
 #endif

-// TODO: The WebGPU backend can deadlock in multi-threaded scenarios if the parameter buffer pool
-// is exhausted and the command submit batch size is too high, or in cases where the underlying
-// WebGPU implementation has bugs in handling concurrent operations. Serializing command submission
-// is a workaround, but we should also investigate better solutions.
-#ifdef GGML_WEBGPU_SERIALIZE_SUBMIT
-# define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 1u
-# define WEBGPU_WAIT_ANY_TIMEOUT_MS UINT64_MAX
-#else
-# define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 8u
-# define WEBGPU_WAIT_ANY_TIMEOUT_MS 0
-#endif
-
 /* Constants */

 #define WEBGPU_MUL_MAT_WG_SIZE 256
 #define WEBGPU_NUM_PARAM_BUFS 32u
+#define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 8u
+#define WEBGPU_WAIT_ANY_TIMEOUT_MS 0
 // Maximum number of in-flight submissions per-thread, to avoid exhausting the parameter buffer pool
 #define WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD WEBGPU_NUM_PARAM_BUFS / WEBGPU_COMMAND_SUBMIT_BATCH_SIZE
 #define WEBGPU_PARAMS_BUF_SIZE_BYTES 128 // enough for 32 parameters
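
Worked through with the values above (a quick illustrative sketch, not backend code): WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD comes out to 32u / 8u = 4, and ggml_backend_webgpu_wait further divides that cap by the number of threads currently recording work, so under heavy concurrency the per-thread cap can drop to 0 and force a full wait. The thread counts below are hypothetical inputs:

// Illustrative arithmetic only; mirrors the macro values above, not the backend itself.
#include <algorithm>
#include <cstdio>
#include <initializer_list>

int main() {
    const unsigned num_param_bufs = 32; // WEBGPU_NUM_PARAM_BUFS
    const unsigned batch_size     = 8;  // WEBGPU_COMMAND_SUBMIT_BATCH_SIZE
    const unsigned max_inflight   = num_param_bufs / batch_size; // WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD == 4

    // Hypothetical counts of threads recording graphs at the same time.
    for (unsigned inflight_threads : { 1u, 2u, 4u, 8u }) {
        const unsigned inflight_max = max_inflight / std::max(inflight_threads, 1u);
        // At 8 threads the cap is 0, so a thread must wait on all of its futures
        // before queuing another batch.
        std::printf("threads=%u -> per-thread in-flight cap=%u\n", inflight_threads, inflight_max);
    }
    return 0;
}
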
@@ -376,11 +366,12 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device,
 // Wait for the queue to finish processing all submitted work
 static void ggml_backend_webgpu_wait(webgpu_context & ctx,
                                      std::vector<webgpu_submission_futures> & futures,
-                                     uint64_t timeout_ms = UINT64_MAX) {
+                                     bool block = true) {
     // If we have too many in-flight submissions, wait on the oldest one first. If there are many threads,
     // inflight_max may be 0, meaning that we must wait on all futures.
-    uint inflight_threads = ctx->inflight_threads;
-    uint inflight_max = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / std::max(inflight_threads, 1u);
+    uint64_t timeout_ms = block ? UINT64_MAX : 0;
+    uint inflight_threads = ctx->inflight_threads;
+    uint inflight_max = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / std::max(inflight_threads, 1u);
     while (futures.size() >= inflight_max && futures.size() > 0) {
         ctx->instance.WaitAny(futures[0].futures.size(), futures[0].futures.data(), UINT64_MAX);
         futures.erase(futures.begin());
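
The new block flag folds the old timeout parameter into two modes: block = true waits with an unbounded timeout, while block = false (timeout 0) presumably only reaps submissions that have already completed; either way, the loop above still blocks on the oldest submission whenever the per-thread in-flight cap is exceeded. A self-contained toy of that throttling pattern, with plain flags standing in for wgpu futures (fake_wait and fake_future are made-up names, and the drain/poll split for the remaining futures is an assumption about code not shown in this hunk):

// Toy model of the throttled wait: a deque of "futures" (plain flags here),
// a per-thread cap, and a block/poll mode. Purely illustrative; the real
// backend waits on wgpu futures via ctx->instance.WaitAny().
#include <cstdio>
#include <deque>

struct fake_future {
    bool done = false;
};

static void fake_wait(std::deque<fake_future> & futures, unsigned inflight_max, bool block) {
    // Over the cap (or cap == 0): always block on the oldest submission first.
    while (futures.size() >= inflight_max && !futures.empty()) {
        futures.front().done = true; // stands in for a blocking WaitAny on futures[0]
        futures.pop_front();
    }
    // Remaining futures: drain everything when blocking, otherwise reap only the
    // ones that already completed (a timeout of 0 behaves like a poll).
    for (auto it = futures.begin(); it != futures.end();) {
        if (block || it->done) {
            it = futures.erase(it);
        } else {
            ++it;
        }
    }
}

int main() {
    std::deque<fake_future> futures(3); // three batches in flight, cap of 4
    futures[0].done = true;             // pretend the oldest batch already finished
    fake_wait(futures, 4, /*block=*/false);
    std::printf("in flight after poll: %zu\n", futures.size());          // prints 2
    fake_wait(futures, 4, /*block=*/true);
    std::printf("in flight after blocking wait: %zu\n", futures.size()); // prints 0
    return 0;
}
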
@@ -1287,7 +1278,7 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
             futures.push_back(ggml_backend_webgpu_submit(ctx, commands));
             // Process events and check for completed submissions
             ctx->instance.ProcessEvents();
-            ggml_backend_webgpu_wait(ctx, futures, WEBGPU_WAIT_ANY_TIMEOUT_MS);
+            ggml_backend_webgpu_wait(ctx, futures, false);
             commands.clear();
         }
     }
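
For context at the call site: commands are accumulated per graph node and submitted in batches, and the per-batch wait is the non-blocking form, so it enforces the in-flight cap and reaps finished work without stalling the whole graph. A rough, self-contained sketch of that flow, assuming a per-node loop and a final blocking wait after all nodes are encoded (neither is shown in this excerpt):

// Rough shape of the batched submit loop (illustrative; the real code encodes
// WebGPU commands per graph node and tracks webgpu_submission_futures).
#include <cstdio>
#include <vector>

constexpr unsigned BATCH_SIZE = 8; // WEBGPU_COMMAND_SUBMIT_BATCH_SIZE

int main() {
    std::vector<int> commands; // stand-in for encoded command buffers
    std::vector<int> futures;  // stand-in for in-flight submission futures

    for (int node = 0; node < 20; node++) {
        commands.push_back(node); // "encode" one graph node
        if (commands.size() >= BATCH_SIZE) {
            futures.push_back(node); // "submit" the batch and keep its future
            // Here the real backend calls ProcessEvents() and the non-blocking
            // wait: reap finished submissions and enforce the in-flight cap
            // without stalling the rest of the graph.
            commands.clear();
        }
    }
    // Assumption (not shown in this excerpt): any leftover commands are submitted
    // after the loop, followed by a blocking wait before graph_compute returns.
    std::printf("batches submitted: %zu, leftover commands: %zu\n", futures.size(), commands.size());
    return 0;
}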