[WebGPU] small writeBuffer copies re-trigger index validation unnecessarily

mwyrzykowski · mwyrzykowski · commit 9af4ec625dd7 · 2025-04-24T10:06:36.000-07:00
https://bugs.webkit.org/show_bug.cgi?id=291497 rdar://149172455 Reviewed by Tadeu Zagallo. GPUQueue.writeBuffer on an index buffer would require re-validation of draw calls; this is suboptimal when the indices in the writeBuffer call are within the same range as previous values. clampIndexBufferToValidValues was previously >5% of samples in some traces. Test: no regression to CTS or validation regression suites. * Source/WebGPU/WebGPU/Buffer.h: * Source/WebGPU/WebGPU/Buffer.mm: (WebGPU::Buffer::indirectBufferInvalidated): (WebGPU::Buffer::needsIndexValidation): * Source/WebGPU/WebGPU/Queue.mm: (WebGPU::maxIndexValueSlow): (WebGPU::maxIndexValue): (WebGPU::Queue::writeBuffer): Canonical link: https://commits.webkit.org/294070@main
diff --git a/Source/WebGPU/WebGPU/Buffer.h b/Source/WebGPU/WebGPU/Buffer.h
@@ -130,6 +130,7 @@ class Buffer : public WGPUBufferImpl, public ThreadSafeRefCountedAndCanMakeThrea
     bool mustTakeSlowIndexValidationPath() const { return m_mustTakeSlowIndexValidationPath; }
     void clearMustTakeSlowIndexValidationPath() { m_mustTakeSlowIndexValidationPath = false; }
     void takeSlowIndexValidationPath(CommandBuffer&, uint32_t firstIndex, uint32_t indexCount, uint32_t vertexCount, uint32_t instanceCount, MTLIndexType, uint32_t firstInstance, uint32_t baseVertex, uint32_t minInstanceCount, uint32_t primitiveOffset);
+    bool needsIndexValidation(uint32_t, uint16_t);
 
 private:
     Buffer(id<MTLBuffer>, uint64_t initialSize, WGPUBufferUsageFlags, State initialState, MappingRange initialMappingRange, Device&);
@@ -166,6 +167,9 @@ private PUBLIC_IN_WEBGPU_SWIFT:
     MappedRanges m_mappedRanges;
 private:
     WGPUMapModeFlags m_mapMode { WGPUMapMode_None };
+    uint32_t m_maxUnsignedIndex { 0 };
+    uint16_t m_maxUshortIndex { 0 };
+
     struct IndirectArgsCache {
         uint64_t indirectOffset { UINT64_MAX };
         uint64_t indexBufferOffsetInBytes { UINT64_MAX };
diff --git a/Source/WebGPU/WebGPU/Buffer.mm b/Source/WebGPU/WebGPU/Buffer.mm
@@ -424,6 +424,7 @@ static size_t computeRangeSize(uint64_t size, size_t offset)
         return;
 
     decrementBufferMapCount();
+    m_maxUnsignedIndex = m_maxUshortIndex = 0;
     indirectBufferInvalidated();
 
 #if CPU(X86_64) && (PLATFORM(MAC) || PLATFORM(MACCATALYST))
@@ -701,13 +702,15 @@ static bool verifyIndirectBufferData(MTLDrawPrimitivesIndirectArguments& input,
 
 void Buffer::indirectBufferInvalidated(CommandEncoder& commandEncoder)
 {
+    m_maxUnsignedIndex = m_maxUshortIndex = 0;
     indirectBufferInvalidated();
 
     commandEncoder.addOnCommitHandler([weakThis = ThreadSafeWeakPtr { *this }, weakCommandEncoder = WeakPtr { commandEncoder }](CommandBuffer&, CommandEncoder&) {
         if (!weakThis.get() || !weakCommandEncoder)
             return true;
 
         RefPtr protectedThis = weakThis.get();
+        protectedThis->m_maxUnsignedIndex = protectedThis->m_maxUshortIndex = 0;
         RefPtr commandEncoder = weakCommandEncoder.get();
         protectedThis->indirectBufferInvalidated(commandEncoder.get());
         return true;
@@ -722,6 +725,21 @@ static size_t computeSize(HashSet<uint64_t, DefaultHash<uint64_t>, WTF::Unsigned
     return encoders.size();
 }
 
+// Records the largest 32-bit and 16-bit index values handed to this buffer so far.
+// Returns true when either incoming maximum exceeds its cached counterpart, in which case the
+// cached value is raised to match; returns false when both are already covered by the cache.
+bool Buffer::needsIndexValidation(uint32_t maxUnsignedIndex, uint16_t maxUshortIndex)
+{
+    bool unsignedGrew = maxUnsignedIndex > m_maxUnsignedIndex;
+    bool ushortGrew = maxUshortIndex > m_maxUshortIndex;
+    if (unsignedGrew)
+        m_maxUnsignedIndex = maxUnsignedIndex;
+    if (ushortGrew)
+        m_maxUshortIndex = maxUshortIndex;
+
+    return unsignedGrew || ushortGrew;
+}
+
 void Buffer::indirectBufferInvalidated(CommandEncoder* commandEncoder)
 {
     if (!(m_usage & (WGPUBufferUsage_Indirect | WGPUBufferUsage_Index)))
diff --git a/Source/WebGPU/WebGPU/Queue.mm b/Source/WebGPU/WebGPU/Queue.mm
@@ -38,6 +38,7 @@
 #if ENABLE(WEBGPU_SWIFT)
 #import "WebGPUSwiftInternal.h"
 #endif
+#import <simd/simd.h>
 #import <wtf/CheckedArithmetic.h>
 #import <wtf/StdLibExtras.h>
 #import <wtf/TZoneMallocInlines.h>
@@ -464,6 +465,50 @@ static void invalidateCommandBuffers(Vector<Ref<WebGPU::CommandBuffer>>&& comman
 #endif
 }
 
+// Scalar scan of |data|, viewed both as packed uint32_t values and as packed uint16_t values.
+// Returns { largest uint32_t, largest uint16_t }; each is 0 when |data| holds fewer than 4 bytes.
+// Only the first (data.size() / 4) * 4 bytes are examined.
+static std::pair<uint32_t, uint16_t> maxIndexValueSlow(std::span<uint8_t> data)
+{
+    auto wordCount = data.size() / 4;
+    auto* base = static_cast<void*>(data.data());
+    std::span<uint32_t> words = unsafeMakeSpan(static_cast<uint32_t*>(base), wordCount);
+    std::span<uint16_t> halfWords = unsafeMakeSpan(static_cast<uint16_t*>(base), wordCount * 2);
+    uint32_t largestWord = 0;
+    for (uint32_t word : words)
+        largestWord = std::max(largestWord, word);
+    uint16_t largestHalfWord = 0;
+    for (uint16_t halfWord : halfWords)
+        largestHalfWord = std::max(largestHalfWord, halfWord);
+    return { largestWord, largestHalfWord };
+}
+
+// Returns { largest uint32_t, largest uint16_t } in |data|, reducing whole 64-byte blocks with simd_max.
+static std::pair<uint32_t, uint16_t> maxIndexValue(std::span<uint8_t> data)
+{
+    constexpr auto blockSize = 64;
+    auto blocks = std::div(data.size(), blockSize);
+    if (reinterpret_cast<uint64_t>(data.data()) % 64 || !blocks.quot)
+        return maxIndexValueSlow(data); // No whole block, or the pointer is not 64-byte aligned.
+
+    std::span<simd::uint16> wordBlocks = unsafeMakeSpan(static_cast<simd::uint16*>(static_cast<void*>(data.data())), blocks.quot);
+    std::span<simd::ushort32> halfWordBlocks = unsafeMakeSpan(static_cast<simd::ushort32*>(static_cast<void*>(data.data())), blocks.quot);
+    simd::uint16 wordLanes = wordBlocks.front();
+    for (auto block : wordBlocks)
+        wordLanes = simd_max(wordLanes, block);
+    simd::ushort32 halfWordLanes = halfWordBlocks.front();
+    for (auto block : halfWordBlocks)
+        halfWordLanes = simd_max(halfWordLanes, block);
+
+    std::pair<uint32_t, uint16_t> maxima(simd_reduce_max(wordLanes), simd_reduce_max(halfWordLanes));
+    if (blocks.rem) {
+        auto tail = maxIndexValueSlow(data.subspan(blockSize * blocks.quot));
+        maxima.first = std::max(maxima.first, tail.first);
+        maxima.second = std::max(maxima.second, tail.second);
+    }
+    return maxima;
+}
+
 void Queue::writeBuffer(Buffer& buffer, uint64_t bufferOffset, std::span<uint8_t> data)
 {
     auto device = m_device.get();
@@ -472,7 +517,8 @@ static void invalidateCommandBuffers(Vector<Ref<WebGPU::CommandBuffer>>&& comman
 
     // https://gpuweb.github.io/gpuweb/#dom-gpuqueue-writebuffer
 
-    if (!validateWriteBuffer(buffer, bufferOffset, data.size()) || !isValidToUseWith(buffer, *this)) {
+    auto dataSize = data.size();
+    if (!validateWriteBuffer(buffer, bufferOffset, dataSize) || !isValidToUseWith(buffer, *this)) {
         device->generateAValidationError("Validation failure."_s);
         return;
     }
@@ -487,7 +533,15 @@ static void invalidateCommandBuffers(Vector<Ref<WebGPU::CommandBuffer>>&& comman
 
     // FIXME(PERFORMANCE): Instead of checking whether or not the whole queue is idle,
     // we could detect whether this specific resource is idle, if we tracked every resource.
-    buffer.indirectBufferInvalidated();
+    bool needsInvalidation = true;
+
+    if (dataSize < 16*KB && (buffer.usage() & WGPUBufferUsage_Index) && !(buffer.usage() & WGPUBufferUsage_Indirect)) {
+        auto maxUnsignedUshortValue = maxIndexValue(data);
+        if (!buffer.needsIndexValidation(maxUnsignedUshortValue.first, maxUnsignedUshortValue.second))
+            needsInvalidation = false;
+    }
+    if (needsInvalidation)
+        buffer.indirectBufferInvalidated();
     if (isIdle()) {
         switch (buffer.buffer().storageMode) {
         case MTLStorageModeShared: