Skip to content
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
30ba139
Add parameter buffer pool, batching of submissions, refactor command …
reeselevine Jul 30, 2025
04d7b27
Add header for linux builds
reeselevine Jul 30, 2025
01c8ced
Free staged parameter buffers at once
reeselevine Jul 30, 2025
bfff27f
Format with clang-format
reeselevine Jul 30, 2025
b8012ec
Fix thread-safe implementation
reeselevine Jul 31, 2025
cddda7e
Use device implicit synchronization
reeselevine Jul 31, 2025
1d5726a
Merge remote-tracking branch 'upstream/master' into fixes
reeselevine Jul 31, 2025
6a20e39
Update workflow to use custom release
reeselevine Aug 1, 2025
ea39068
Remove testing branch workflow
reeselevine Aug 1, 2025
4c58742
Merge branch 'ggml-org:master' into master
reeselevine Aug 4, 2025
ae8edbf
Disable set_rows until it's implemented
reeselevine Aug 4, 2025
75eb99b
Merge branch 'ggml-org:master' into master
reeselevine Aug 5, 2025
bfc6930
Fix potential issue around empty queue submission
reeselevine Aug 5, 2025
69965a8
Try synchronous submission
reeselevine Aug 5, 2025
c773e2f
Try waiting on all futures explicitly
reeselevine Aug 5, 2025
5aeab73
Add debug
reeselevine Aug 5, 2025
d4af0d6
Add more debug messages
reeselevine Aug 5, 2025
320f679
Work on getting ssh access for debugging
reeselevine Aug 5, 2025
f422911
Debug on failure
reeselevine Aug 5, 2025
0feece5
Disable other tests
reeselevine Aug 5, 2025
0512d66
Remove extra if
reeselevine Aug 5, 2025
9335adf
Try more locking
reeselevine Aug 5, 2025
fc9e99d
maybe passes?
reeselevine Aug 5, 2025
7d9807e
test
reeselevine Aug 5, 2025
f7745c4
Some cleanups
reeselevine Aug 5, 2025
4dc409a
Restore build file
reeselevine Aug 5, 2025
3b81c99
Remove extra testing branch ci
reeselevine Aug 5, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ jobs:
- name: Test
id: cmake_test
run: |
export LLAMA_SET_ROWS=0
cd build
ctest -L main --verbose --timeout 900

Expand Down Expand Up @@ -431,12 +432,13 @@ jobs:
id: cmake_build
run: |
export Dawn_DIR=dawn/lib64/cmake/Dawn
cmake -B build -DGGML_WEBGPU=ON
cmake -B build -DGGML_WEBGPU=ON -DGGML_WEBGPU_DEBUG=ON
cmake --build build --config Release -j $(nproc)

- name: Test
id: cmake_test
run: |
export LLAMA_SET_ROWS=0
cd build
# This is using llvmpipe and runs slower than other backends
ctest -L main --verbose --timeout 3600
Expand Down
36 changes: 24 additions & 12 deletions ggml/src/ggml-webgpu/ggml-webgpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

/* Constants */

#define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 16
#define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 1
#define WEBGPU_MUL_MAT_WG_SIZE 64
#define WEBGPU_NUM_PARAM_BUFS 100
#define WEBGPU_PARAMS_BUF_SIZE_BYTES 256
Expand Down Expand Up @@ -139,6 +139,8 @@ struct webgpu_context_struct {

// Parameter buffers associated with the staged command buffers
std::vector<webgpu_param_bufs> staged_param_bufs;

std::vector<wgpu::FutureWaitInfo> callback_futures;
};

typedef std::shared_ptr<webgpu_context_struct> webgpu_context;
Expand Down Expand Up @@ -221,25 +223,29 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device,

/** WebGPU Actions */

// Wait for the queue to finish processing all submitted work
static void ggml_backend_webgpu_wait_on_submission(webgpu_context & ctx) {
// Wait for the queue to finish processing all commands
ctx->instance.WaitAny(ctx->queue.OnSubmittedWorkDone(
wgpu::CallbackMode::AllowSpontaneous,
[](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
if (status != wgpu::QueueWorkDoneStatus::Success) {
GGML_LOG_ERROR("ggml_webgpu: Failed to wait on queue: %s\n", message.data);
}
}),
UINT64_MAX);
WEBGPU_LOG_DEBUG("ggml_backend_webgpu_wait_on_submission()");
std::lock_guard<std::recursive_mutex> lock(ctx->mutex);
if (ctx->callback_futures.empty()) {
return;
}
ctx->instance.WaitAny(ctx->callback_futures.size(), ctx->callback_futures.data(), UINT64_MAX);
ctx->callback_futures.clear();
}

static void ggml_backend_webgpu_submit_queue(webgpu_context & ctx) {
std::lock_guard<std::recursive_mutex> lock(ctx->mutex);
WEBGPU_LOG_DEBUG("ggml_backend_webgpu_submit_queue()");
if (ctx->staged_command_bufs.empty()) {
// Nothing to submit
return;
}
ctx->queue.Submit(ctx->staged_command_bufs.size(), ctx->staged_command_bufs.data());
ctx->staged_command_bufs.clear();
std::vector<webgpu_param_bufs> staged_param_bufs = std::move(ctx->staged_param_bufs);
// Free the staged parameter buffers once the submission completes
ctx->queue.OnSubmittedWorkDone(
wgpu::Future f = ctx->queue.OnSubmittedWorkDone(
wgpu::CallbackMode::AllowSpontaneous,
[ctx, staged_param_bufs](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
if (status != wgpu::QueueWorkDoneStatus::Success) {
Expand All @@ -248,6 +254,7 @@ static void ggml_backend_webgpu_submit_queue(webgpu_context & ctx) {
// Free the staged parameter buffers
ctx->param_buf_pool.free_bufs(staged_param_bufs);
});
ctx->callback_futures.push_back({ f });
}

static void ggml_backend_webgpu_map_buffer(webgpu_context & ctx,
Expand Down Expand Up @@ -307,14 +314,16 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context &
if (submit_imm) {
// Submit immediately
ctx->queue.Submit(1, &commands);
ctx->queue.OnSubmittedWorkDone(wgpu::CallbackMode::AllowSpontaneous,
wgpu::Future f = ctx->queue.OnSubmittedWorkDone(wgpu::CallbackMode::AllowSpontaneous,
[ctx, params_bufs](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
if (status != wgpu::QueueWorkDoneStatus::Success) {
GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n",
message.data);
}
ctx->param_buf_pool.free_bufs({ params_bufs });
});
std::lock_guard<std::recursive_mutex> lock(ctx->mutex);
ctx->callback_futures.push_back({ f });
} else {
// Lock the context mutex when pushing to the staging vectors.
std::lock_guard<std::recursive_mutex> lock(ctx->mutex);
Expand All @@ -325,6 +334,7 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context &
ggml_backend_webgpu_submit_queue(ctx);
}
}
ggml_backend_webgpu_wait_on_submission(ctx);
}

static void ggml_backend_webgpu_buffer_memset(webgpu_context & ctx,
Expand Down Expand Up @@ -368,6 +378,8 @@ static void ggml_backend_webgpu_free(ggml_backend_t backend) {
}

static void ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
WEBGPU_LOG_DEBUG("ggml_webgpu_cpy(" << src << ", " << dst << ")");

size_t src_offset = ggml_backend_webgpu_tensor_offset(src);
// assumes power of 2 offset alignment
size_t src_misalignment = src_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1);
Expand Down
Loading