Skip to content

Commit 9af4ec6

Browse files
committed
[WebGPU] small writeBuffer copies re-trigger index validation unnecessarily
https://bugs.webkit.org/show_bug.cgi?id=291497 rdar://149172455 Reviewed by Tadeu Zagallo. GPUQueue.writeBuffer on an index buffer would require re-validation of draw calls, this is suboptimal when the indices in the writeBuffer call are within the same range as previous values. clampIndexBufferToValidValues was previously >5% of samples in some traces. Test: no regression to CTS or validation regression suites. * Source/WebGPU/WebGPU/Buffer.h: * Source/WebGPU/WebGPU/Buffer.mm: (WebGPU::Buffer::indirectBufferInvalidated): (WebGPU::Buffer::needsIndexValidation): * Source/WebGPU/WebGPU/Queue.mm: (WebGPU::maxIndexValueSlow): (WebGPU::maxIndexValue): (WebGPU::Queue::writeBuffer): Canonical link: https://commits.webkit.org/294070@main
1 parent 9a7661e commit 9af4ec6

File tree

3 files changed

+78
-2
lines changed

3 files changed

+78
-2
lines changed

Source/WebGPU/WebGPU/Buffer.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ class Buffer : public WGPUBufferImpl, public ThreadSafeRefCountedAndCanMakeThrea
130130
bool mustTakeSlowIndexValidationPath() const { return m_mustTakeSlowIndexValidationPath; }
131131
void clearMustTakeSlowIndexValidationPath() { m_mustTakeSlowIndexValidationPath = false; }
132132
void takeSlowIndexValidationPath(CommandBuffer&, uint32_t firstIndex, uint32_t indexCount, uint32_t vertexCount, uint32_t instanceCount, MTLIndexType, uint32_t firstInstance, uint32_t baseVertex, uint32_t minInstanceCount, uint32_t primitiveOffset);
133+
bool needsIndexValidation(uint32_t, uint16_t);
133134

134135
private:
135136
Buffer(id<MTLBuffer>, uint64_t initialSize, WGPUBufferUsageFlags, State initialState, MappingRange initialMappingRange, Device&);
@@ -166,6 +167,9 @@ private PUBLIC_IN_WEBGPU_SWIFT:
166167
MappedRanges m_mappedRanges;
167168
private:
168169
WGPUMapModeFlags m_mapMode { WGPUMapMode_None };
170+
uint32_t m_maxUnsignedIndex { 0 };
171+
uint16_t m_maxUshortIndex { 0 };
172+
169173
struct IndirectArgsCache {
170174
uint64_t indirectOffset { UINT64_MAX };
171175
uint64_t indexBufferOffsetInBytes { UINT64_MAX };

Source/WebGPU/WebGPU/Buffer.mm

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,7 @@ static size_t computeRangeSize(uint64_t size, size_t offset)
424424
return;
425425

426426
decrementBufferMapCount();
427+
m_maxUnsignedIndex = m_maxUshortIndex = 0;
427428
indirectBufferInvalidated();
428429

429430
#if CPU(X86_64) && (PLATFORM(MAC) || PLATFORM(MACCATALYST))
@@ -701,13 +702,15 @@ static bool verifyIndirectBufferData(MTLDrawPrimitivesIndirectArguments& input,
701702

702703
void Buffer::indirectBufferInvalidated(CommandEncoder& commandEncoder)
703704
{
705+
m_maxUnsignedIndex = m_maxUshortIndex = 0;
704706
indirectBufferInvalidated();
705707

706708
commandEncoder.addOnCommitHandler([weakThis = ThreadSafeWeakPtr { *this }, weakCommandEncoder = WeakPtr { commandEncoder }](CommandBuffer&, CommandEncoder&) {
707709
if (!weakThis.get() || !weakCommandEncoder)
708710
return true;
709711

710712
RefPtr protectedThis = weakThis.get();
713+
protectedThis->m_maxUnsignedIndex = protectedThis->m_maxUshortIndex = 0;
711714
RefPtr commandEncoder = weakCommandEncoder.get();
712715
protectedThis->indirectBufferInvalidated(commandEncoder.get());
713716
return true;
@@ -722,6 +725,21 @@ static size_t computeSize(HashSet<uint64_t, DefaultHash<uint64_t>, WTF::Unsigned
722725
return encoders.size();
723726
}
724727

728+
bool Buffer::needsIndexValidation(uint32_t maxUnsignedIndex, uint16_t maxUshortIndex)
729+
{
730+
bool needsUpdate = false;
731+
if (maxUnsignedIndex > m_maxUnsignedIndex) {
732+
m_maxUnsignedIndex = maxUnsignedIndex;
733+
needsUpdate = true;
734+
}
735+
if (m_maxUshortIndex > maxUshortIndex) {
736+
m_maxUshortIndex = maxUshortIndex;
737+
needsUpdate = true;
738+
}
739+
740+
return needsUpdate;
741+
}
742+
725743
void Buffer::indirectBufferInvalidated(CommandEncoder* commandEncoder)
726744
{
727745
if (!(m_usage & (WGPUBufferUsage_Indirect | WGPUBufferUsage_Index)))

Source/WebGPU/WebGPU/Queue.mm

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
#if ENABLE(WEBGPU_SWIFT)
3939
#import "WebGPUSwiftInternal.h"
4040
#endif
41+
#import <simd/simd.h>
4142
#import <wtf/CheckedArithmetic.h>
4243
#import <wtf/StdLibExtras.h>
4344
#import <wtf/TZoneMallocInlines.h>
@@ -464,6 +465,50 @@ static void invalidateCommandBuffers(Vector<Ref<WebGPU::CommandBuffer>>&& comman
464465
#endif
465466
}
466467

468+
static std::pair<uint32_t, uint16_t> maxIndexValueSlow(std::span<uint8_t> data)
469+
{
470+
auto lengthUint32 = data.size() / 4;
471+
std::span<uint32_t> dataUint = unsafeMakeSpan(static_cast<uint32_t*>(static_cast<void*>(data.data())), lengthUint32);
472+
std::span<uint16_t> dataUshort = unsafeMakeSpan(static_cast<uint16_t*>(static_cast<void*>(data.data())), lengthUint32 * 2);
473+
uint32_t maxValue = 0;
474+
for (uint32_t dataUintV : dataUint) {
475+
if (maxValue < dataUintV)
476+
maxValue = dataUintV;
477+
}
478+
uint16_t maxUshort = 0;
479+
for (uint16_t dataUshortV : dataUshort) {
480+
if (maxUshort < dataUshortV)
481+
maxUshort = dataUshortV;
482+
}
483+
return std::make_pair(maxValue, maxUshort);
484+
}
485+
486+
static std::pair<uint32_t, uint16_t> maxIndexValue(std::span<uint8_t> data)
487+
{
488+
constexpr auto blockSize = 64;
489+
auto divResult = std::div(data.size(), blockSize);
490+
auto lengthUint32 = divResult.quot;
491+
if (!lengthUint32 || reinterpret_cast<uint64_t>(data.data()) % 64)
492+
return maxIndexValueSlow(data);
493+
494+
std::span<simd::uint16> dataUint = unsafeMakeSpan(static_cast<simd::uint16*>(static_cast<void*>(data.data())), lengthUint32);
495+
std::span<simd::ushort32> dataUshort = unsafeMakeSpan(static_cast<simd::ushort32*>(static_cast<void*>(data.data())), lengthUint32);
496+
simd::uint16 maxValue = dataUint.front();
497+
simd::ushort32 maxUshort = dataUshort.front();
498+
for (auto dataUintV : dataUint)
499+
maxValue = simd_max(maxValue, dataUintV);
500+
for (auto dataUshortV : dataUshort)
501+
maxUshort = simd_max(maxUshort, dataUshortV);
502+
503+
auto result = std::make_pair(simd_reduce_max(maxValue), simd_reduce_max(maxUshort));
504+
if (divResult.rem) {
505+
auto slowResult = maxIndexValueSlow(data.subspan(blockSize * divResult.quot));
506+
result.first = std::max(result.first, slowResult.first);
507+
result.second = std::max(result.second, slowResult.second);
508+
}
509+
return result;
510+
}
511+
467512
void Queue::writeBuffer(Buffer& buffer, uint64_t bufferOffset, std::span<uint8_t> data)
468513
{
469514
auto device = m_device.get();
@@ -472,7 +517,8 @@ static void invalidateCommandBuffers(Vector<Ref<WebGPU::CommandBuffer>>&& comman
472517

473518
// https://gpuweb.github.io/gpuweb/#dom-gpuqueue-writebuffer
474519

475-
if (!validateWriteBuffer(buffer, bufferOffset, data.size()) || !isValidToUseWith(buffer, *this)) {
520+
auto dataSize = data.size();
521+
if (!validateWriteBuffer(buffer, bufferOffset, dataSize) || !isValidToUseWith(buffer, *this)) {
476522
device->generateAValidationError("Validation failure."_s);
477523
return;
478524
}
@@ -487,7 +533,15 @@ static void invalidateCommandBuffers(Vector<Ref<WebGPU::CommandBuffer>>&& comman
487533

488534
// FIXME(PERFORMANCE): Instead of checking whether or not the whole queue is idle,
489535
// we could detect whether this specific resource is idle, if we tracked every resource.
490-
buffer.indirectBufferInvalidated();
536+
bool needsInvalidation = true;
537+
538+
if (dataSize < 16*KB && (buffer.usage() & WGPUBufferUsage_Index) && !(buffer.usage() & WGPUBufferUsage_Indirect)) {
539+
auto maxUnsignedUshortValue = maxIndexValue(data);
540+
if (!buffer.needsIndexValidation(maxUnsignedUshortValue.first, maxUnsignedUshortValue.second))
541+
needsInvalidation = false;
542+
}
543+
if (needsInvalidation)
544+
buffer.indirectBufferInvalidated();
491545
if (isIdle()) {
492546
switch (buffer.buffer().storageMode) {
493547
case MTLStorageModeShared:

0 commit comments

Comments
 (0)