@@ -18,7 +18,6 @@ namespace sort
 template<
     uint16_t GroupSize,
     uint16_t KeyBucketCount,
-    typename Key,
     typename KeyAccessor,
     typename ValueAccessor,
     typename HistogramAccessor,
@@ -27,35 +26,33 @@ template<
 >
 struct counting
 {
-    uint32_t inclusive_scan(uint32_t value, NBL_REF_ARG(SharedAccessor) sdata)
+    using key_t = decltype(impl::declval<KeyAccessor>().get(0));
+    using this_t = counting<GroupSize, KeyBucketCount, KeyAccessor, ValueAccessor, HistogramAccessor, SharedAccessor>;
+
+    static this_t create(const uint32_t workGroupIndex)
     {
-        return workgroup::inclusive_scan<plus<uint32_t>, GroupSize>::
-            template __call<SharedAccessor>(value, sdata);
+        this_t retval;
+        retval.workGroupIndex = workGroupIndex;
+        return retval;
     }

-    uint32_t toroidal_histogram_add(uint32_t tid, uint32_t sum, NBL_REF_ARG(HistogramAccessor) histogram, NBL_REF_ARG(SharedAccessor) sdata, const CountingParameters<Key> params)
+    uint32_t inclusive_scan(uint32_t value, NBL_REF_ARG(SharedAccessor) sdata)
     {
-        sdata.workgroupExecutionAndMemoryBarrier();
-
-        sdata.set(tid % GroupSize, sum);
-        uint32_t shifted_tid = (tid + glsl::gl_SubgroupSize() * params.workGroupIndex) % GroupSize;
-
-        sdata.workgroupExecutionAndMemoryBarrier();
-
-        return histogram.atomicAdd((tid / GroupSize) * GroupSize + shifted_tid, sdata.get(shifted_tid));
+        return workgroup::inclusive_scan<plus<uint32_t>, GroupSize>::
+            template __call<SharedAccessor>(value, sdata);
     }

-    void build_histogram(NBL_REF_ARG(KeyAccessor) key, NBL_REF_ARG(SharedAccessor) sdata, const CountingParameters<Key> params)
+    void build_histogram(NBL_REF_ARG(KeyAccessor) key, NBL_REF_ARG(SharedAccessor) sdata, const CountingParameters<key_t> params)
     {
         uint32_t tid = workgroup::SubgroupContiguousIndex();

-        for (; tid < KeyBucketCount; tid += GroupSize) {
-            sdata.set(tid, 0);
+        for (uint32_t vid = tid; vid < KeyBucketCount; vid += GroupSize) {
+            sdata.set(vid, 0);
         }

         sdata.workgroupExecutionAndMemoryBarrier();

-        uint32_t index = params.workGroupIndex * GroupSize * params.elementsPerWT + tid % GroupSize;
+        uint32_t index = workGroupIndex * GroupSize * params.elementsPerWT + tid;
         uint32_t endIndex = min(params.dataElementCount, index + GroupSize * params.elementsPerWT);

         for (; index < endIndex; index += GroupSize)
@@ -69,7 +66,12 @@ struct counting
         sdata.workgroupExecutionAndMemoryBarrier();
     }

-    void histogram(NBL_REF_ARG(KeyAccessor) key, NBL_REF_ARG(HistogramAccessor) histogram, NBL_REF_ARG(SharedAccessor) sdata, const CountingParameters<Key> params)
+    void histogram(
+        NBL_REF_ARG(KeyAccessor) key,
+        NBL_REF_ARG(HistogramAccessor) histogram,
+        NBL_REF_ARG(SharedAccessor) sdata,
+        const CountingParameters<key_t> params
+    )
     {
         build_histogram(key, sdata, params);

@@ -79,30 +81,36 @@ struct counting
         sdata.workgroupExecutionAndMemoryBarrier();

         uint32_t sum = inclusive_scan(histogram_value, sdata);
-        toroidal_histogram_add(tid, sum, histogram, sdata, params);
+        histogram.atomicAdd(tid, sum);

         const bool is_last_wg_invocation = tid == (GroupSize - 1);
         const uint16_t adjusted_key_bucket_count = ((KeyBucketCount - 1) / GroupSize + 1) * GroupSize;

-        for (tid += GroupSize; tid < adjusted_key_bucket_count; tid += GroupSize)
+        for (uint32_t vid = tid + GroupSize; vid < adjusted_key_bucket_count; vid += GroupSize)
         {
             if (is_last_wg_invocation)
             {
-                uint32_t startIndex = tid - tid % GroupSize;
+                uint32_t startIndex = vid - tid;
                 sdata.set(startIndex, sdata.get(startIndex) + sum);
             }

-            sum = inclusive_scan(sdata.get(tid), sdata);
-            toroidal_histogram_add(tid, sum, histogram, sdata, params);
+            sum = inclusive_scan(sdata.get(vid), sdata);
+            histogram.atomicAdd(vid, sum);
         }
     }

-    void scatter(NBL_REF_ARG(KeyAccessor) key, NBL_REF_ARG(ValueAccessor) val, NBL_REF_ARG(HistogramAccessor) histogram, NBL_REF_ARG(SharedAccessor) sdata, const CountingParameters<Key> params)
+    void scatter(
+        NBL_REF_ARG(KeyAccessor) key,
+        NBL_REF_ARG(ValueAccessor) val,
+        NBL_REF_ARG(HistogramAccessor) histogram,
+        NBL_REF_ARG(SharedAccessor) sdata,
+        const CountingParameters<key_t> params
+    )
     {
         build_histogram(key, sdata, params);

         uint32_t tid = workgroup::SubgroupContiguousIndex();
-        uint32_t shifted_tid = (tid + glsl::gl_SubgroupSize() * params.workGroupIndex) % GroupSize;
+        uint32_t shifted_tid = (tid + glsl::gl_SubgroupSize() * workGroupIndex) % GroupSize;

         for (; shifted_tid < KeyBucketCount; shifted_tid += GroupSize)
         {
@@ -114,13 +122,13 @@ struct counting

         sdata.workgroupExecutionAndMemoryBarrier();

-        uint32_t index = params.workGroupIndex * GroupSize * params.elementsPerWT + tid;
+        uint32_t index = workGroupIndex * GroupSize * params.elementsPerWT + tid;
         uint32_t endIndex = min(params.dataElementCount, index + GroupSize * params.elementsPerWT);

         [unroll]
         for (; index < endIndex; index += GroupSize)
         {
-            const Key k = key.get(index);
+            const key_t k = key.get(index);
             if (robust && (k < params.minimum || k > params.maximum))
                 continue;
             const uint32_t v = val.get(index);
@@ -129,6 +137,8 @@ struct counting
             val.set(sortedIx, v);
         }
     }
+
+    uint32_t workGroupIndex;
 };

 }
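For context, a minimal usage sketch of the interface after this change. Only the counting<...> members (create, histogram, scatter, key_t, workGroupIndex) and the CountingParameters fields referenced in the diff come from the source; the accessor types, push-constant layout, constants, entry-point wiring, and the enclosing nbl::hlsl namespace path are assumptions for illustration.

// Hypothetical wiring of the refactored counting-sort pass; MyKeyAccessor,
// MyValueAccessor, MyHistogramAccessor, MySharedAccessor, WorkgroupSize,
// BucketCount and the 'pc' push constants are assumed, not part of this diff.
using counter_t = nbl::hlsl::sort::counting<
    WorkgroupSize, BucketCount,
    MyKeyAccessor, MyValueAccessor, MyHistogramAccessor, MySharedAccessor>;

[numthreads(WorkgroupSize, 1, 1)]
void main(uint3 groupID : SV_GroupID)
{
    // the workgroup index now lives on the struct (set via create)
    // instead of being carried in CountingParameters
    counter_t counter = counter_t::create(groupID.x);

    MyKeyAccessor keys;
    MyValueAccessor values;
    MyHistogramAccessor histogram;
    MySharedAccessor sharedMem;

    // the key type is no longer a template argument; it is deduced inside the
    // struct as key_t from what KeyAccessor::get() returns (uint32_t assumed here)
    nbl::hlsl::sort::CountingParameters<uint32_t> params;
    params.dataElementCount = pc.dataElementCount;
    params.elementsPerWT = pc.elementsPerWT;
    params.minimum = pc.minimum;
    params.maximum = pc.maximum;

    // histogram pass; a later dispatch would call
    // counter.scatter(keys, values, histogram, sharedMem, params)
    counter.histogram(keys, histogram, sharedMem, params);
}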