Skip to content

Commit 73676fc

Browse files
authored
[webgpu][dawn API optimization] reduce number of calls to buffer APIs (microsoft#24315)
### Description
This PR is one of a series of changes that optimize Dawn API usage (see microsoft#24281). Part 1: reduce the number of calls to wgpuBufferAddRef and wgpuBufferRelease.
1 parent a1186f6 commit 73676fc

File tree

5 files changed

+80
-45
lines changed

5 files changed

+80
-45
lines changed

onnxruntime/core/providers/webgpu/allocator.cc

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,15 @@ void* GpuBufferAllocator::Alloc(size_t size) {
1313
return nullptr;
1414
}
1515

16-
WGPUBuffer buffer;
17-
if (!session_initialized_ && context_.SupportsBufferMapExtendedUsages()) {
18-
buffer = context_.BufferManager().CreateUMA(size);
19-
} else {
20-
buffer = context_.BufferManager().Create(size);
16+
stats_.num_allocs++;
17+
18+
#if !defined(__wasm__)
19+
if (!session_initialized_ && context_.DeviceHasFeature(wgpu::FeatureName::BufferMapExtendedUsages)) {
20+
return context_.BufferManager().CreateUMA(size);
2121
}
22+
#endif // !defined(__wasm__)
2223

23-
stats_.num_allocs++;
24-
return buffer;
24+
return context_.BufferManager().Create(size);
2525
}
2626

2727
void GpuBufferAllocator::Free(void* p) {

onnxruntime/core/providers/webgpu/buffer_manager.cc

Lines changed: 72 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -56,14 +56,27 @@ class LazyReleaseCacheManager : public IBufferCacheManager {
5656
}
5757

5858
void ReleaseBuffer(WGPUBuffer buffer) override {
59-
pending_buffers_.emplace_back(wgpu::Buffer::Acquire(buffer));
59+
pending_buffers_.emplace_back(buffer);
6060
}
6161

6262
void OnRefresh() override {
63+
Release();
6364
pending_buffers_.clear();
6465
}
6566

66-
std::vector<wgpu::Buffer> pending_buffers_;
67+
public:
68+
~LazyReleaseCacheManager() {
69+
Release();
70+
}
71+
72+
protected:
73+
void Release() {
74+
for (auto& buffer : pending_buffers_) {
75+
wgpuBufferRelease(buffer);
76+
}
77+
}
78+
79+
std::vector<WGPUBuffer> pending_buffers_;
6780
};
6881

6982
class SimpleCacheManager : public IBufferCacheManager {
@@ -74,7 +87,7 @@ class SimpleCacheManager : public IBufferCacheManager {
7487
WGPUBuffer TryAcquireCachedBuffer(size_t buffer_size) override {
7588
auto it = buffers_.find(buffer_size);
7689
if (it != buffers_.end() && !it->second.empty()) {
77-
auto buffer = it->second.back().MoveToCHandle();
90+
auto buffer = it->second.back();
7891
it->second.pop_back();
7992
return buffer;
8093
}
@@ -87,18 +100,31 @@ class SimpleCacheManager : public IBufferCacheManager {
87100
}
88101

89102
void ReleaseBuffer(WGPUBuffer buffer) override {
90-
pending_buffers_.emplace_back(wgpu::Buffer::Acquire(buffer));
103+
pending_buffers_.emplace_back(buffer);
91104
}
92105

93106
void OnRefresh() override {
94107
for (auto& buffer : pending_buffers_) {
95-
buffers_[static_cast<size_t>(buffer.GetSize())].emplace_back(std::move(buffer));
108+
buffers_[static_cast<size_t>(wgpuBufferGetSize(buffer))].emplace_back(buffer);
96109
}
97110
pending_buffers_.clear();
98111
}
99112

100-
std::map<size_t, std::vector<wgpu::Buffer>> buffers_;
101-
std::vector<wgpu::Buffer> pending_buffers_;
113+
public:
114+
~SimpleCacheManager() {
115+
for (auto& buffer : pending_buffers_) {
116+
wgpuBufferRelease(buffer);
117+
}
118+
for (auto& pair : buffers_) {
119+
for (auto& buffer : pair.second) {
120+
wgpuBufferRelease(buffer);
121+
}
122+
}
123+
}
124+
125+
protected:
126+
std::map<size_t, std::vector<WGPUBuffer>> buffers_;
127+
std::vector<WGPUBuffer> pending_buffers_;
102128
};
103129

104130
// TODO: maybe use different bucket size for storage and uniform buffers?
@@ -155,7 +181,7 @@ class BucketCacheManager : public IBufferCacheManager {
155181
WGPUBuffer TryAcquireCachedBuffer(size_t buffer_size) override {
156182
auto it = buckets_.find(buffer_size);
157183
if (it != buckets_.end() && !it->second.empty()) {
158-
auto buffer = it->second.back().MoveToCHandle();
184+
auto buffer = it->second.back();
159185
it->second.pop_back();
160186
return buffer;
161187
}
@@ -167,31 +193,44 @@ class BucketCacheManager : public IBufferCacheManager {
167193
}
168194

169195
void ReleaseBuffer(WGPUBuffer buffer) override {
170-
pending_buffers_.emplace_back(wgpu::Buffer::Acquire(buffer));
196+
pending_buffers_.emplace_back(buffer);
171197
}
172198

173199
void OnRefresh() override {
174200
// TODO: consider graph capture. currently not supported
175201

176202
for (auto& buffer : pending_buffers_) {
177-
auto buffer_size = static_cast<size_t>(buffer.GetSize());
203+
auto buffer_size = static_cast<size_t>(wgpuBufferGetSize(buffer));
178204

179205
auto it = buckets_.find(buffer_size);
180206
if (it != buckets_.end() && it->second.size() < buckets_limit_[buffer_size]) {
181-
it->second.emplace_back(std::move(buffer));
207+
it->second.emplace_back(buffer);
208+
} else {
209+
wgpuBufferRelease(buffer);
182210
}
183211
}
184212

185213
pending_buffers_.clear();
186214
}
187215

216+
~BucketCacheManager() {
217+
for (auto& buffer : pending_buffers_) {
218+
wgpuBufferRelease(buffer);
219+
}
220+
for (auto& pair : buckets_) {
221+
for (auto& buffer : pair.second) {
222+
wgpuBufferRelease(buffer);
223+
}
224+
}
225+
}
226+
188227
protected:
189228
void Initialize() {
190229
buckets_keys_.reserve(buckets_limit_.size());
191230
buckets_.reserve(buckets_limit_.size());
192231
for (const auto& pair : buckets_limit_) {
193232
buckets_keys_.push_back(pair.first);
194-
buckets_.emplace(pair.first, std::vector<wgpu::Buffer>());
233+
buckets_.emplace(pair.first, std::vector<WGPUBuffer>());
195234
}
196235
std::sort(buckets_keys_.begin(), buckets_keys_.end());
197236

@@ -205,8 +244,8 @@ class BucketCacheManager : public IBufferCacheManager {
205244
#endif
206245
}
207246
std::unordered_map<size_t, size_t> buckets_limit_;
208-
std::unordered_map<size_t, std::vector<wgpu::Buffer>> buckets_;
209-
std::vector<wgpu::Buffer> pending_buffers_;
247+
std::unordered_map<size_t, std::vector<WGPUBuffer>> buckets_;
248+
std::vector<WGPUBuffer> pending_buffers_;
210249
std::vector<size_t> buckets_keys_;
211250
};
212251

@@ -255,11 +294,10 @@ BufferManager::BufferManager(WebGpuContext& context, BufferCacheMode storage_buf
255294

256295
void BufferManager::Upload(void* src, WGPUBuffer dst, size_t size) {
257296
// If the buffer is mapped, we can directly write to it.
258-
wgpu::Buffer dst_buffer = dst;
259-
auto mapped_data = dst_buffer.GetMappedRange();
297+
void* mapped_data = wgpuBufferGetMappedRange(dst, 0, WGPU_WHOLE_MAP_SIZE); // ensure the buffer is mapped
260298
if (mapped_data) {
261299
memcpy(mapped_data, src, size);
262-
dst_buffer.Unmap();
300+
wgpuBufferUnmap(dst);
263301
return;
264302
}
265303

@@ -288,17 +326,19 @@ void BufferManager::MemCpy(WGPUBuffer src, WGPUBuffer dst, size_t size) {
288326
EnforceBufferUnmapped(context_, dst);
289327

290328
auto buffer_size = NormalizeBufferSize(size);
291-
ORT_ENFORCE(buffer_size <= wgpuBufferGetSize(src) && buffer_size <= wgpuBufferGetSize(dst),
329+
auto src_size = static_cast<size_t>(wgpuBufferGetSize(src));
330+
auto dst_size = static_cast<size_t>(wgpuBufferGetSize(dst));
331+
ORT_ENFORCE(buffer_size <= src_size && buffer_size <= dst_size,
292332
"Source and destination buffers must have enough space for the copy operation. src_size=",
293-
wgpuBufferGetSize(src), ", dst_size=", wgpuBufferGetSize(dst), ", copy_size=", buffer_size, ".");
333+
src_size, ", dst_size=", dst_size, ", copy_size=", buffer_size, ".");
294334

295335
auto& command_encoder = context_.GetCommandEncoder();
296336
context_.EndComputePass();
297337
command_encoder.CopyBufferToBuffer(src, 0, dst, 0, buffer_size);
298338
}
299339

300340
WGPUBuffer BufferManager::Create(size_t size, wgpu::BufferUsage usage) {
301-
auto& cache = GetCacheManager(static_cast<WGPUBufferUsage>(usage));
341+
auto& cache = GetCacheManager(usage);
302342
auto buffer_size = cache.CalculateBufferSize(size);
303343

304344
auto buffer = cache.TryAcquireCachedBuffer(buffer_size);
@@ -310,7 +350,6 @@ WGPUBuffer BufferManager::Create(size_t size, wgpu::BufferUsage usage) {
310350
wgpu::BufferDescriptor desc{};
311351
desc.size = buffer_size;
312352
desc.usage = usage;
313-
// desc.label = std::to_string(xx++).c_str();
314353
buffer = context_.Device().CreateBuffer(&desc).MoveToCHandle();
315354

316355
ORT_ENFORCE(buffer, "Failed to create GPU buffer: size=", buffer_size, ", usage=", uint64_t(usage), ".");
@@ -320,14 +359,16 @@ WGPUBuffer BufferManager::Create(size_t size, wgpu::BufferUsage usage) {
320359
}
321360

322361
WGPUBuffer BufferManager::CreateUMA(size_t size, wgpu::BufferUsage usage) {
323-
ORT_ENFORCE(usage & wgpu::BufferUsage::Storage, "UMA buffer must have storage usage.");
324-
auto& cache = GetCacheManager(static_cast<WGPUBufferUsage>(usage));
362+
ORT_ENFORCE(usage & wgpu::BufferUsage::Storage, "UMA buffer must be a storage buffer.");
363+
auto& cache = GetCacheManager(usage);
325364
auto buffer_size = cache.CalculateBufferSize(size);
326365

366+
// Ensure the buffer is mapped for writing at creation.
367+
usage |= wgpu::BufferUsage::MapWrite;
368+
327369
wgpu::BufferDescriptor desc{};
328370
desc.size = buffer_size;
329-
// Ensure the buffer is mapped for writing at creation.
330-
desc.usage = usage | wgpu::BufferUsage::MapWrite;
371+
desc.usage = usage;
331372
desc.mappedAtCreation = true;
332373
auto buffer = context_.Device().CreateBuffer(&desc).MoveToCHandle();
333374

@@ -373,20 +414,21 @@ void BufferManager::RefreshPendingBuffers() {
373414
default_cache_->OnRefresh();
374415
}
375416

376-
IBufferCacheManager& BufferManager::GetCacheManager(WGPUBufferUsage usage) const {
377-
if (usage & WGPUBufferUsage_Storage) {
417+
IBufferCacheManager& BufferManager::GetCacheManager(wgpu::BufferUsage usage) const {
418+
if (usage & wgpu::BufferUsage::Storage) {
378419
return *storage_cache_;
379-
} else if (usage & WGPUBufferUsage_Uniform) {
420+
} else if (usage & wgpu::BufferUsage::Uniform) {
380421
return *uniform_cache_;
381-
} else if (usage & WGPUBufferUsage_QueryResolve) {
422+
} else if (usage & wgpu::BufferUsage::QueryResolve) {
382423
return *query_resolve_cache_;
383424
} else {
384425
return *default_cache_;
385426
}
386427
}
387428

388429
IBufferCacheManager& BufferManager::GetCacheManager(WGPUBuffer buffer) const {
389-
return GetCacheManager(wgpuBufferGetUsage(buffer));
430+
auto usage = static_cast<wgpu::BufferUsage>(wgpuBufferGetUsage(buffer));
431+
return GetCacheManager(usage);
390432
}
391433

392434
std::unique_ptr<BufferManager> BufferManagerFactory::Create(WebGpuContext& context, BufferCacheMode storage_buffer_cache_mode, BufferCacheMode uniform_buffer_cache_mode, BufferCacheMode query_resolve_buffer_cache_mode) {

onnxruntime/core/providers/webgpu/buffer_manager.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ class BufferManager {
7070
void RefreshPendingBuffers();
7171

7272
private:
73-
IBufferCacheManager& GetCacheManager(WGPUBufferUsage usage) const;
73+
IBufferCacheManager& GetCacheManager(wgpu::BufferUsage usage) const;
7474
IBufferCacheManager& GetCacheManager(WGPUBuffer buffer) const;
7575

7676
WebGpuContext& context_;

onnxruntime/core/providers/webgpu/webgpu_context.cc

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -149,10 +149,6 @@ void WebGpuContext::Initialize(const WebGpuBufferCacheConfig& buffer_cache_confi
149149
device_features_.insert(supported_features.features[i]);
150150
}
151151

152-
#if !defined(__wasm__)
153-
supports_buffer_map_extended_usages_ = device_.HasFeature(wgpu::FeatureName::BufferMapExtendedUsages);
154-
#endif
155-
156152
// create buffer manager
157153
buffer_mgr_ = BufferManagerFactory::Create(*this,
158154
buffer_cache_config.storage.mode,

onnxruntime/core/providers/webgpu/webgpu_context.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -145,8 +145,6 @@ class WebGpuContext final {
145145
Status Run(ComputeContext& context, const ProgramBase& program);
146146
void OnRunEnd();
147147

148-
bool SupportsBufferMapExtendedUsages() const { return supports_buffer_map_extended_usages_; }
149-
150148
private:
151149
enum class TimestampQueryType {
152150
None = 0,
@@ -238,7 +236,6 @@ class WebGpuContext final {
238236
#if defined(ENABLE_PIX_FOR_WEBGPU_EP)
239237
std::unique_ptr<WebGpuPIXFrameGenerator> pix_frame_generator_ = nullptr;
240238
#endif // ENABLE_PIX_FOR_WEBGPU_EP
241-
bool supports_buffer_map_extended_usages_ = false;
242239
};
243240

244241
} // namespace webgpu

0 commit comments

Comments
 (0)