
Commit 40dd1f0

Fix formatting and add GitHub Actions workflows for the Vulkan and Metal (M-series) WebGPU backends
1 parent e0d8a71 commit 40dd1f0

File tree: 6 files changed, +163 −27 lines

.github/workflows/build.yml

Lines changed: 130 additions & 0 deletions

@@ -135,6 +135,70 @@ jobs:
           cd build
           ctest -L main --verbose --timeout 900
 
+  macOS-latest-cmake-arm64-webgpu:
+    runs-on: macos-14
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: hendrikmuhs/[email protected]
+        with:
+          key: macOS-latest-cmake-arm64-webgpu
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+          brew install curl
+
+      - name: Dawn Dependency
+        id: dawn-depends
+        run: |
+          ARTIFACTS_JSON=$(curl -s -L \
+            -H "Accept: application/vnd.github+json" \
+            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            "https://api.github.com/repos/google/dawn/actions/artifacts")
+          echo "Finding latest macos-latest-Release artifact..."
+          DOWNLOAD_URL=$(echo "$ARTIFACTS_JSON" | jq -r '.artifacts
+            | sort_by(.created_at)
+            | reverse
+            | map(select(.name | test("macos-latest-Release$")))
+            | .[0].archive_download_url')
+          if [ "$DOWNLOAD_URL" = "null" ] || [ -z "$DOWNLOAD_URL" ]; then
+            echo "No suitable Dawn artifact found!"
+            exit 1
+          fi
+          echo "Downloading from: $DOWNLOAD_URL"
+          curl -L \
+            -H "Accept: application/vnd.github+json" \
+            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
+            -o artifact.zip "$DOWNLOAD_URL"
+          unzip artifact.zip
+          mkdir dawn
+          tar_file=$(find . -name '*.tar.gz' | head -n 1)
+          echo "Extracting: $tar_file"
+          tar -xvf "$tar_file" -C dawn --strip-components=1
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          export Dawn_DIR=dawn/lib64/cmake/Dawn
+          cmake -B build -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
   ubuntu-cpu-cmake:
     strategy:
       matrix:
@@ -344,6 +408,72 @@ jobs:
           # This is using llvmpipe and runs slower than other backends
           ctest -L main --verbose --timeout 3600
 
+  ubuntu-22-cmake-webgpu:
+    runs-on: ubuntu-22.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: hendrikmuhs/[email protected]
+        with:
+          key: ubuntu-22-cmake-webgpu
+          evict-old-files: 1d
+
+      - name: Vulkan SDK Dependencies
+        id: vulkan-depends
+        run: |
+          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
+          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
+          sudo apt-get update -y
+          sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev
+
+      - name: Dawn Dependency
+        id: dawn-depends
+        run: |
+          sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
+          ARTIFACTS_JSON=$(curl -s -L \
+            -H "Accept: application/vnd.github+json" \
+            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            "https://api.github.com/repos/google/dawn/actions/artifacts")
+          echo "Finding latest ubuntu-latest-Release artifact..."
+          DOWNLOAD_URL=$(echo "$ARTIFACTS_JSON" | jq -r '.artifacts
+            | sort_by(.created_at)
+            | reverse
+            | map(select(.name | test("ubuntu-latest-Release$")))
+            | .[0].archive_download_url')
+          if [ "$DOWNLOAD_URL" = "null" ] || [ -z "$DOWNLOAD_URL" ]; then
+            echo "No suitable Dawn artifact found!"
+            exit 1
+          fi
+          echo "Downloading from: $DOWNLOAD_URL"
+          curl -L \
+            -H "Accept: application/vnd.github+json" \
+            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
+            -o artifact.zip "$DOWNLOAD_URL"
+          unzip artifact.zip
+          mkdir dawn
+          tar_file=$(find . -name '*.tar.gz' | head -n 1)
+          echo "Extracting: $tar_file"
+          tar -xvf "$tar_file" -C dawn --strip-components=1
+
+      - name: Build
+        id: cmake_build
+        run: |
+          export Dawn_DIR=dawn/lib64/cmake/Dawn
+          cmake -B build -DGGML_WEBGPU=ON
+          cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          # This is using llvmpipe and runs slower than other backends
+          ctest -L main --verbose --timeout 3600
+
   ubuntu-22-cmake-hip:
     runs-on: ubuntu-22.04
     container: rocm/dev-ubuntu-22.04:6.0.2
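
Both new jobs select the Dawn build to link against purely by artifact-name suffix, so they always pull the newest CI upload. The selection logic can be sanity-checked outside CI; a minimal sketch using the `gh` CLI (a local-use assumption — the workflow itself authenticates with raw `curl` and `secrets.GITHUB_TOKEN`):

```sh
# Pick the newest macOS Release artifact from Dawn's CI, mirroring the
# jq filter used in the workflow above. Requires an authenticated `gh`.
gh api repos/google/dawn/actions/artifacts | jq -r '.artifacts
  | sort_by(.created_at)
  | reverse
  | map(select(.name | test("macos-latest-Release$")))
  | .[0].archive_download_url'
```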

docs/build.md

Lines changed: 2 additions & 2 deletions

@@ -568,9 +568,9 @@ cmake -B build -DGGML_WEBGPU=ON
 cmake --build build --config Release
 ```
 
-### Browser Support
+### Browser Support
 
-WebGPU allows cross-platform access to the GPU from supported browsers. We utilize [Emscripten](https://emscripten.org/) to compile ggml's WebGPU backend to WebAssembly. Emscripten does not officially support WebGPU bindings yet, but Dawn currently maintains its own WebGPU bindings called emdawnwebgpu.
+WebGPU allows cross-platform access to the GPU from supported browsers. We utilize [Emscripten](https://emscripten.org/) to compile ggml's WebGPU backend to WebAssembly. Emscripten does not officially support WebGPU bindings yet, but Dawn currently maintains its own WebGPU bindings called emdawnwebgpu.
 
 Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/src/emdawnwebgpu/) to download or build the emdawnwebgpu package (Note that it might be safer to build the emdawnwebgpu package locally, so that it stays in sync with the version of Dawn you have installed above). When building using CMake, the path to the emdawnwebgpu port file needs to be set with the flag `EMDAWNWEBGPU_DIR`.
 
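
The paragraph above names the `EMDAWNWEBGPU_DIR` CMake flag; as a sketch only, an Emscripten configure step might look like this (the port-file path is hypothetical and depends on where the emdawnwebgpu package was unpacked):

```sh
# Hypothetical sketch: configure ggml's WebGPU backend for WebAssembly.
# /path/to/emdawnwebgpu stands in for wherever the package was extracted.
emcmake cmake -B build-web -DGGML_WEBGPU=ON \
    -DEMDAWNWEBGPU_DIR=/path/to/emdawnwebgpu
cmake --build build-web --config Release
```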

ggml/src/ggml-webgpu/CMakeLists.txt

Lines changed: 1 addition & 1 deletion

@@ -51,4 +51,4 @@ if (GGML_WEBGPU_DEBUG)
 endif()
 
 target_include_directories(ggml-webgpu PRIVATE ${SHADER_OUTPUT_DIR})
-target_link_libraries(ggml-webgpu PRIVATE ${DawnWebGPU_TARGET})
+target_link_libraries(ggml-webgpu PRIVATE ${DawnWebGPU_TARGET})

ggml/src/ggml-webgpu/ggml-webgpu.cpp

Lines changed: 24 additions & 22 deletions

@@ -7,7 +7,9 @@
 
 #include "ggml-wgsl-shaders.hpp"
 
+#include <cstring>
 #include <iostream>
+#include <mutex>
 #include <vector>
 
 #ifdef GGML_WEBGPU_DEBUG
@@ -131,7 +133,7 @@ static void ggml_webgpu_create_buffer(wgpu::Device &device, wgpu::Buffer &buffer
     buffer_desc.size = size;
     buffer_desc.usage = usage;
     buffer_desc.label = label;
-    buffer_desc.mappedAtCreation = false;
+    buffer_desc.mappedAtCreation = false;
     // TODO: error handling
     buffer = device.CreateBuffer(&buffer_desc);
 }
@@ -161,7 +163,7 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer b
     uint32_t * params = (uint32_t *) ctx->memset_params_host_buf.GetMappedRange();
 
     params[0] = (uint32_t)offset;
-    params[1] = (uint32_t)size;
+    params[1] = (uint32_t)size;
     params[2] = value;
     ctx->memset_params_host_buf.Unmap();
 
@@ -184,8 +186,8 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer b
 
     wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
     encoder.CopyBufferToBuffer(
-        ctx->memset_params_host_buf, 0,
-        ctx->memset_params_dev_buf, 0,
+        ctx->memset_params_host_buf, 0,
+        ctx->memset_params_dev_buf, 0,
         ctx->memset_params_dev_buf.GetSize()
     );
     wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
@@ -206,7 +208,7 @@ static void ggml_backend_webgpu_wait_on_submission(webgpu_context ctx) {
             if (status != wgpu::QueueWorkDoneStatus::Success) {
                 GGML_LOG_ERROR("ggml_webgpu: Failed to wait on queue: %s\n", message.data);
             }
-        }),
+        }),
         UINT64_MAX
     );
 }
@@ -243,7 +245,7 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
             return false;
-
+
         case GGML_OP_CPY: {
             std::lock_guard<std::mutex> lock(ctx->mutex);
             const ggml_tensor * src = node->src[0];
@@ -259,7 +261,7 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){
             dst_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
 
             wgpu::Device device = ctx->device;
-            ggml_backend_webgpu_map_buffer(ctx, ctx->cpy_params_host_buf,
+            ggml_backend_webgpu_map_buffer(ctx, ctx->cpy_params_host_buf,
                 wgpu::MapMode::Write, 0, ctx->cpy_params_host_buf.GetSize());
             uint32_t * params = (uint32_t *) ctx->cpy_params_host_buf.GetMappedRange();
             uint32_t ne = (uint32_t)ggml_nelements(node);
@@ -309,8 +311,8 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){
 
             wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
             encoder.CopyBufferToBuffer(
-                ctx->cpy_params_host_buf, 0,
-                ctx->cpy_params_dev_buf, 0,
+                ctx->cpy_params_host_buf, 0,
+                ctx->cpy_params_dev_buf, 0,
                 ctx->cpy_params_dev_buf.GetSize()
             );
             wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
@@ -343,7 +345,7 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){
             wgpu::Device device = ctx->device;
 
             // map the host parameters buffer
-            ggml_backend_webgpu_map_buffer(ctx, ctx->mul_mat_params_host_buf,
+            ggml_backend_webgpu_map_buffer(ctx, ctx->mul_mat_params_host_buf,
                 wgpu::MapMode::Write, 0, ctx->mul_mat_params_host_buf.GetSize());
             uint32_t * params = (uint32_t *) ctx->mul_mat_params_host_buf.GetMappedRange();
 
@@ -371,7 +373,7 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){
             entries[0].offset = src0_offset;
             entries[0].size = ggml_nbytes(src0);
 
-            entries[1].binding = 1;
+            entries[1].binding = 1;
             entries[1].buffer = src1_ctx->buffer;
             entries[1].offset = src1_offset;
             entries[1].size = ggml_nbytes(src1);
@@ -395,8 +397,8 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){
 
             wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
             encoder.CopyBufferToBuffer(
-                ctx->mul_mat_params_host_buf, 0,
-                ctx->mul_mat_params_dev_buf, 0,
+                ctx->mul_mat_params_host_buf, 0,
+                ctx->mul_mat_params_dev_buf, 0,
                 ctx->mul_mat_params_dev_buf.GetSize()
             );
             wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
@@ -417,7 +419,7 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){
             return false;
     }
 }
-
+
 static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     WEBGPU_LOG_DEBUG("ggml_backend_webgpu_graph_compute(" << cgraph->n_nodes << " nodes)");
 
@@ -517,13 +519,13 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer,
 
     std::lock_guard<std::mutex> lock(webgpu_ctx->mutex);
 
-    if (webgpu_ctx->get_tensor_staging_buf == nullptr ||
+    if (webgpu_ctx->get_tensor_staging_buf == nullptr ||
         webgpu_ctx->get_tensor_staging_buf.GetSize() < final_size) {
         // Create a new staging buffer if it doesn't exist or is too small
         if (webgpu_ctx->get_tensor_staging_buf) {
             webgpu_ctx->get_tensor_staging_buf.Destroy();
         }
-        ggml_webgpu_create_buffer(device, webgpu_ctx->get_tensor_staging_buf, final_size,
+        ggml_webgpu_create_buffer(device, webgpu_ctx->get_tensor_staging_buf, final_size,
            wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "get_tensor_staging_buf");
     }
 
@@ -577,7 +579,7 @@ static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_b
     ggml_backend_webgpu_device_context * ctx = static_cast<ggml_backend_webgpu_device_context *>(buft->device->context);
 
     wgpu::Buffer buf;
-    ggml_webgpu_create_buffer(ctx->webgpu_ctx->device, buf, size,
+    ggml_webgpu_create_buffer(ctx->webgpu_ctx->device, buf, size,
         wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst, "allocated_buffer");
 
     ggml_backend_webgpu_buffer_context * buf_ctx = new ggml_backend_webgpu_buffer_context(ctx->webgpu_ctx, buf);
@@ -652,7 +654,7 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context webgpu_ctx) {
     constants[1].key = "bytes_per_thread";
     constants[1].value = webgpu_ctx->memset_bytes_per_thread;
     ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->memset_pipeline, wgsl_memset, "memset", constants);
-    ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->memset_params_dev_buf,
+    ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->memset_params_dev_buf,
         3 * sizeof(uint32_t), // 3 parameters: buffer size, offset, value
         wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst, "memset_params_dev_buf");
     ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->memset_params_host_buf,
@@ -679,7 +681,7 @@ static void ggml_webgpu_init_cpy_pipeline(webgpu_context webgpu_ctx) {
         wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc, "cpy_params_host_buf");
 }
 
-// TODO: Does this need to be thread safe? Is it only called once?
+// TODO: Make thread safe if multiple devices are used
 static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char * params) {
     GGML_UNUSED(params);
 
@@ -696,7 +698,7 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co
     dev_desc.requiredLimits = &webgpu_ctx->limits;
     dev_desc.requiredFeatures = webgpu_ctx->features.features;
     dev_desc.requiredFeatureCount = webgpu_ctx->features.featureCount;
-    dev_desc.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous,
+    dev_desc.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous,
         [](const wgpu::Device& device, wgpu::DeviceLostReason reason, wgpu::StringView message) {
             GGML_UNUSED(device);
             GGML_LOG_ERROR("ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast<int>(reason), message.data);
@@ -847,7 +849,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
     device_ctx.device_name = std::string(info.device.data);
     device_ctx.device_desc = std::string(info.description.data);
 
-    GGML_LOG_INFO("ggml_webgpu: adapter_info: vendor_id: %u | vendor: %s | architecture: %s | device_id: %u | name: %s | device_desc: %s\n",
+    GGML_LOG_INFO("ggml_webgpu: adapter_info: vendor_id: %u | vendor: %s | architecture: %s | device_id: %u | name: %s | device_desc: %s\n",
         info.vendorID, info.vendor.data, info.architecture.data, info.deviceID, info.device.data, info.description.data);
 
     // See GGML Backend Device Interface section
@@ -902,4 +904,4 @@ ggml_backend_t ggml_backend_webgpu_init(void) {
     return ggml_backend_webgpu_device_init(dev, nullptr);
 }
 
-GGML_BACKEND_DL_IMPL(ggml_backend_webgpu_reg)
+GGML_BACKEND_DL_IMPL(ggml_backend_webgpu_reg)

ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py

Lines changed: 4 additions & 0 deletions

@@ -1,13 +1,16 @@
 import os
 import argparse
 
+
 def escape_triple_quotes(wgsl):
     # Simple defense in case of embedded """
     return wgsl.replace('"""', '\\"""')
 
+
 def to_cpp_string_literal(varname, content):
     return f'const char* wgsl_{varname} = R"({content})";\n'
 
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('--input', required=True)
@@ -27,5 +30,6 @@ def main():
         out.write(to_cpp_string_literal(varname, content))
         out.write('\n')
 
+
 if __name__ == '__main__':
     main()
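
This script generates the `ggml-wgsl-shaders.hpp` header included by `ggml-webgpu.cpp` above, turning each WGSL source into a `const char* wgsl_<name>` raw string literal. It is normally driven by the CMake build, but a hand-run sketch might look like this — note that only `--input` is visible in the hunk above, so the `--output` flag and both paths are assumptions for illustration:

```sh
# Assumed invocation: embed the WGSL sources as C++ raw string literals.
# --output is inferred by symmetry with --input and may differ in reality.
python3 ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py \
    --input ggml/src/ggml-webgpu/wgsl-shaders \
    --output build/ggml-wgsl-shaders.hpp
```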

ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl

Lines changed: 2 additions & 2 deletions

@@ -41,7 +41,7 @@ fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
     let src02_idx = dst2_idx / params.broadcast2; // src0 may also be broadcast along the second dimension
     let src12_idx = dst2_idx; // src1 is not broadcast
 
-    let dst2_rem = dst3_rem % dst2_stride;
+    let dst2_rem = dst3_rem % dst2_stride;
 
     let row = dst2_rem / params.n; // output row
     let col = dst2_rem % params.n; // output column
@@ -53,4 +53,4 @@ fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
         sum = sum + src0[src0_idx] * src1[src1_idx];
     }
     dst[dst3_idx * dst3_stride + dst2_idx * dst2_stride + row * params.n + col] = sum;
-}
+}
