Commit 350c6a3

more util funcs in config, fix some calculations
1 parent 27d84c8 commit 350c6a3

3 files changed (+50 −52 lines)

include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl

Lines changed: 23 additions & 25 deletions
@@ -19,9 +19,9 @@ template<uint16_t WorkgroupSizeLog2, uint16_t SubgroupSizeLog2>
 struct virtual_wg_size_log2
 {
     static_assert(WorkgroupSizeLog2>=SubgroupSizeLog2, "WorkgroupSize cannot be smaller than SubgroupSize");
-    // static_assert(WorkgroupSizeLog2<=SubgroupSizeLog2+4, "WorkgroupSize cannot be larger than SubgroupSize*16");
+    static_assert(WorkgroupSizeLog2<=SubgroupSizeLog2*3+4, "WorkgroupSize cannot be larger than (SubgroupSize^3)*16");
     NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2),uint16_t,conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2*2+2),uint16_t,3,2>::value,1>::value;
-    NBL_CONSTEXPR_STATIC_INLINE uint16_t value = mpl::max_v<uint32_t, WorkgroupSizeLog2-SubgroupSizeLog2, SubgroupSizeLog2>+SubgroupSizeLog2;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t value = mpl::max_v<uint32_t, SubgroupSizeLog2*levels, WorkgroupSizeLog2>;
     // must have at least enough level 0 outputs to feed a single subgroup
 };

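Worked example of the fixed `value` calculation: with SubgroupSizeLog2=4 and WorkgroupSizeLog2=11, `levels` evaluates to 3 (since 11 > 4*2+2), so the new value is max(4*3, 11) = 12, i.e. a virtual workgroup of 2^12 invocations. The old expression gave max(11-4, 4)+4 = 11, under-sizing the virtual workgroup for exactly the configuration that previously needed an explicit specialization (removed in the next hunk) — presumably among the "fix some calculations" the commit message refers to.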
@@ -33,24 +33,6 @@ struct items_per_invocation
     NBL_CONSTEXPR_STATIC_INLINE uint16_t value1 = uint16_t(0x1u) << conditional_value<VirtualWorkgroup::levels==3, uint16_t,mpl::min_v<uint16_t,ItemsPerInvocationProductLog2,2>, ItemsPerInvocationProductLog2>::value;
     NBL_CONSTEXPR_STATIC_INLINE uint16_t value2 = uint16_t(0x1u) << mpl::max_v<int16_t,ItemsPerInvocationProductLog2-2,0>;
 };
-
-// explicit specializations for cases that don't fit
-#define SPECIALIZE_VIRTUAL_WG_SIZE_CASE(WGLOG2, SGLOG2, LEVELS, VALUE) template<>\
-struct virtual_wg_size_log2<WGLOG2, SGLOG2>\
-{\
-    NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = LEVELS;\
-    NBL_CONSTEXPR_STATIC_INLINE uint16_t value = VALUE;\
-};\
-
-SPECIALIZE_VIRTUAL_WG_SIZE_CASE(11,4,3,12);
-SPECIALIZE_VIRTUAL_WG_SIZE_CASE(7,7,1,7);
-SPECIALIZE_VIRTUAL_WG_SIZE_CASE(6,6,1,6);
-SPECIALIZE_VIRTUAL_WG_SIZE_CASE(5,5,1,5);
-SPECIALIZE_VIRTUAL_WG_SIZE_CASE(4,4,1,4);
-SPECIALIZE_VIRTUAL_WG_SIZE_CASE(3,3,1,3);
-SPECIALIZE_VIRTUAL_WG_SIZE_CASE(2,2,1,2);
-
-#undef SPECIALIZE_VIRTUAL_WG_SIZE_CASE
 }

 template<uint16_t _WorkgroupSizeLog2, uint16_t _SubgroupSizeLog2, uint16_t _ItemsPerInvocation>
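With the explicit specializations deleted, the generalized formulas have to reproduce them. A minimal host-side C++ mirror of the compile-time math (illustrative only, not part of the commit) checks every removed case:

    #include <algorithm>
    #include <cstdint>

    // mirrors virtual_wg_size_log2::levels and ::value from the hunk above
    constexpr uint16_t levels(uint16_t wgLog2, uint16_t sgLog2)
    {
        return wgLog2 > sgLog2 ? (wgLog2 > sgLog2*2+2 ? 3 : 2) : 1;
    }
    constexpr uint16_t value(uint16_t wgLog2, uint16_t sgLog2)
    {
        return std::max<uint16_t>(sgLog2*levels(wgLog2,sgLog2), wgLog2);
    }

    // every removed SPECIALIZE_VIRTUAL_WG_SIZE_CASE(WGLOG2,SGLOG2,LEVELS,VALUE)
    // now falls out of the general formulas
    static_assert(levels(11,4)==3 && value(11,4)==12, "");
    static_assert(levels(7,7)==1 && value(7,7)==7, "");
    static_assert(levels(6,6)==1 && value(6,6)==6, "");
    static_assert(levels(5,5)==1 && value(5,5)==5, "");
    static_assert(levels(4,4)==1 && value(4,4)==4, "");
    static_assert(levels(3,3)==1 && value(3,3)==3, "");
    static_assert(levels(2,2)==1 && value(2,2)==2, "");

    int main() {}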
@@ -71,16 +53,32 @@ struct ArithmeticConfiguration
     NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = items_per_invoc_t::value2;
     static_assert(ItemsPerInvocation_1<=4, "3 level scan would have been needed with this config!");

-    NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementCount = conditional_value<LevelCount==1,uint16_t,0,conditional_value<LevelCount==3,uint16_t,SubgroupSize*ItemsPerInvocation_2,0>::value + SubgroupSize*ItemsPerInvocation_1>::value;
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = conditional_value<LevelCount==1,uint16_t,
+        0,
+        conditional_value<LevelCount==3,uint16_t,
+            SubgroupSize*ItemsPerInvocation_2,
+            0
+        >::value + SubgroupSize*ItemsPerInvocation_1
+    >::value;
+
+    static bool electLast()
+    {
+        return glsl::gl_SubgroupInvocationID()==SubgroupSize-1;
+    }
+
+    static uint32_t virtualSubgroupID(const uint32_t subgroupID, const uint32_t virtualIdx)
+    {
+        return virtualIdx * (WorkgroupSize >> SubgroupSizeLog2) + subgroupID;
+    }

-    static uint32_t virtualSubgroupID(const uint32_t id, const uint32_t offset)
+    static uint32_t sharedCoalescedIndexNextLevel(const uint32_t subgroupID, const uint32_t itemsPerInvocation)
     {
-        return offset * (WorkgroupSize >> SubgroupSizeLog2) + id;
+        return (subgroupID & (itemsPerInvocation-1)) * SubgroupSize + (subgroupID/itemsPerInvocation);
     }

-    static uint32_t sharedMemCoalescedIndex(const uint32_t id, const uint32_t itemsPerInvocation)
+    static uint32_t sharedCoalescedIndexByComponent(const uint32_t invocationIndex, const uint32_t component)
     {
-        return (id & (itemsPerInvocation-1)) * SubgroupSize + (id/itemsPerInvocation);
+        return component * SubgroupSize + invocationIndex;
     }
 };

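The rename splits the old sharedMemCoalescedIndex into a writer-side mapping (sharedCoalescedIndexNextLevel, keyed by the producing subgroup's ID) and a reader-side one (sharedCoalescedIndexByComponent, keyed by invocation index and vector component). A host-side C++ sketch (illustrative, fixed SubgroupSize, not part of the commit) verifying that subgroup j's partial lands exactly where component j%IPI of invocation j/IPI reads it on the next level:

    #include <cassert>
    #include <cstdint>

    constexpr uint32_t SubgroupSize = 32; // assumed for illustration

    constexpr uint32_t sharedCoalescedIndexNextLevel(uint32_t subgroupID, uint32_t itemsPerInvocation)
    {
        // itemsPerInvocation is a power of two, so `& (n-1)` is `% n`
        return (subgroupID & (itemsPerInvocation-1)) * SubgroupSize + subgroupID/itemsPerInvocation;
    }

    constexpr uint32_t sharedCoalescedIndexByComponent(uint32_t invocationIndex, uint32_t component)
    {
        return component * SubgroupSize + invocationIndex;
    }

    int main()
    {
        constexpr uint32_t IPI = 4; // e.g. ItemsPerInvocation_1
        for (uint32_t j = 0; j < SubgroupSize*IPI; j++)
            assert(sharedCoalescedIndexNextLevel(j, IPI) ==
                   sharedCoalescedIndexByComponent(j/IPI, j%IPI));
    }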
include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl

Lines changed: 26 additions & 26 deletions
@@ -104,10 +104,10 @@ struct reduce<Config, BinOp, 2, device_capabilities>
     vector_lv0_t scan_local;
     dataAccessor.template get<vector_lv0_t>(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local);
     scan_local = reduction0(scan_local);
-    if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1)
+    if (Config::electLast())
     {
         const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx);
-        const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1);
+        const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1);
         scratchAccessor.template set<scalar_t>(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan
     }
 }
@@ -120,10 +120,10 @@ struct reduce<Config, BinOp, 2, device_capabilities>
     vector_lv1_t lv1_val;
     [unroll]
     for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++)
-        scratchAccessor.template get<scalar_t>(i*Config::SubgroupSize+invocationIndex,lv1_val[i]);
+        scratchAccessor.template get<scalar_t>(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv1_val[i]);
     lv1_val = reduction1(lv1_val);

-    if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1)
+    if (Config::electLast())
         scratchAccessor.template set<scalar_t>(0, lv1_val[Config::ItemsPerInvocation_1-1]);
 }
 scratchAccessor.workgroupExecutionAndMemoryBarrier();
@@ -159,10 +159,10 @@ struct scan<Config, BinOp, Exclusive, 2, device_capabilities>
     dataAccessor.template get<vector_lv0_t>(idx * Config::WorkgroupSize + virtualInvocationIndex, value);
     value = inclusiveScan0(value);
     dataAccessor.template set<vector_lv0_t>(idx * Config::WorkgroupSize + virtualInvocationIndex, value);
-    if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1)
+    if (Config::electLast())
     {
         const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx);
-        const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1);
+        const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1);
         scratchAccessor.template set<scalar_t>(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan
     }
 }
@@ -176,12 +176,12 @@ struct scan<Config, BinOp, Exclusive, 2, device_capabilities>
     const uint32_t prevIndex = invocationIndex-1;
     [unroll]
     for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++)
-        scratchAccessor.template get<scalar_t>(i*Config::SubgroupSize+prevIndex,lv1_val[i]);
+        scratchAccessor.template get<scalar_t>(Config::sharedCoalescedIndexByComponent(prevIndex, i),lv1_val[i]);
     lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex));
     lv1_val = inclusiveScan1(lv1_val);
     [unroll]
     for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++)
-        scratchAccessor.template set<scalar_t>(i*Config::SubgroupSize+invocationIndex,lv1_val[i]);
+        scratchAccessor.template set<scalar_t>(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv1_val[i]);
 }
 scratchAccessor.workgroupExecutionAndMemoryBarrier();

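The prevIndex read combined with hlsl::mix is the standard shift trick: an exclusive scan equals an inclusive scan of the input shifted right by one, with BinOp::identity injected at slot 0. A host-side C++ check of that identity (illustrative, addition as the binary op):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main()
    {
        const std::vector<uint32_t> in{3,1,4,1,5};

        // "read prevIndex, mix in the identity at index 0" from the hunk above
        std::vector<uint32_t> shifted(in.size());
        for (size_t i = 0; i < in.size(); i++)
            shifted[i] = (i == 0) ? 0u : in[i-1]; // 0 is the identity for +

        uint32_t run = 0;
        for (size_t i = 0; i < in.size(); i++)
        {
            run += shifted[i];             // inclusive scan of the shifted input
            uint32_t exclusive = 0;        // reference exclusive scan of `in`
            for (size_t j = 0; j < i; j++)
                exclusive += in[j];
            assert(run == exclusive);
        }
    }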
@@ -193,7 +193,7 @@ struct scan<Config, BinOp, Exclusive, 2, device_capabilities>
     dataAccessor.template get<vector_lv0_t>(idx * Config::WorkgroupSize + virtualInvocationIndex, value);

     const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx);
-    const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1);
+    const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1);
     scalar_t left;
     scratchAccessor.template get<scalar_t>(bankedIndex,left);
     if (Exclusive)
@@ -242,10 +242,10 @@ struct reduce<Config, BinOp, 3, device_capabilities>
     vector_lv0_t scan_local;
     dataAccessor.template get<vector_lv0_t>(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local);
     scan_local = reduction0(scan_local);
-    if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1)
+    if (Config::electLast())
     {
         const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx);
-        const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1);
+        const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1); // (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1);
         scratchAccessor.template set<scalar_t>(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan
     }
 }
@@ -258,11 +258,11 @@ struct reduce<Config, BinOp, 3, device_capabilities>
     vector_lv1_t lv1_val;
     [unroll]
     for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++)
-        scratchAccessor.template get<scalar_t>(i*Config::SubgroupSize+invocationIndex,lv1_val[i]);
+        scratchAccessor.template get<scalar_t>(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv1_val[i]);
     lv1_val = reduction1(lv1_val);
-    if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1)
+    if (Config::electLast())
     {
-        const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(invocationIndex, Config::ItemsPerInvocation_2); // (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2);
+        const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(invocationIndex, Config::ItemsPerInvocation_2); // (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2);
         scratchAccessor.template set<scalar_t>(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]);
     }
 }
@@ -275,7 +275,7 @@ struct reduce<Config, BinOp, 3, device_capabilities>
     vector_lv2_t lv2_val;
     [unroll]
     for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++)
-        scratchAccessor.template get<scalar_t>(i*Config::SubgroupSize+invocationIndex,lv2_val[i]);
+        scratchAccessor.template get<scalar_t>(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv2_val[i]);
     lv2_val = reduction2(lv2_val);
     scratchAccessor.template set<scalar_t>(invocationIndex, lv2_val[Config::ItemsPerInvocation_2-1]);
 }
@@ -314,10 +314,10 @@ struct scan<Config, BinOp, Exclusive, 3, device_capabilities>
     dataAccessor.template get<vector_lv0_t>(idx * Config::WorkgroupSize + virtualInvocationIndex, value);
     value = inclusiveScan0(value);
     dataAccessor.template set<vector_lv0_t>(idx * Config::WorkgroupSize + virtualInvocationIndex, value);
-    if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1)
+    if (Config::electLast())
     {
         const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx);
-        const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1);
+        const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1);
         scratchAccessor.template set<scalar_t>(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan
     }
 }
@@ -332,15 +332,15 @@ struct scan<Config, BinOp, Exclusive, 3, device_capabilities>
     const uint32_t prevIndex = invocationIndex-1;
     [unroll]
     for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++)
-        scratchAccessor.template get<scalar_t>(i*Config::SubgroupSize+prevIndex,lv1_val[i]);
+        scratchAccessor.template get<scalar_t>(Config::sharedCoalescedIndexByComponent(prevIndex, i),lv1_val[i]);
     lv1_val[0] = hlsl::mix(BinOp::identity, lv1_val[0], bool(invocationIndex));
     lv1_val = inclusiveScan1(lv1_val);
     [unroll]
     for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++)
-        scratchAccessor.template set<scalar_t>(i*Config::SubgroupSize+invocationIndex,lv1_val[i]);
-    if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1)
+        scratchAccessor.template set<scalar_t>(Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv1_val[i]);
+    if (Config::electLast())
     {
-        const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2);
+        const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2);
         scratchAccessor.template set<scalar_t>(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]);
     }
 }
@@ -354,12 +354,12 @@ struct scan<Config, BinOp, Exclusive, 3, device_capabilities>
     const uint32_t prevIndex = invocationIndex-1;
     [unroll]
     for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++)
-        scratchAccessor.template get<scalar_t>(lv1_smem_size+i*Config::SubgroupSize+prevIndex,lv2_val[i]);
+        scratchAccessor.template get<scalar_t>(lv1_smem_size+Config::sharedCoalescedIndexByComponent(prevIndex, i),lv2_val[i]);
     lv2_val[0] = hlsl::mix(hlsl::promote<vector_lv2_t>(BinOp::identity), lv2_val[0], bool(invocationIndex));
     lv2_val = inclusiveScan2(lv2_val);
     [unroll]
     for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++)
-        scratchAccessor.template set<scalar_t>(lv1_smem_size+i*Config::SubgroupSize+invocationIndex,lv2_val[i]);
+        scratchAccessor.template set<scalar_t>(lv1_smem_size+Config::sharedCoalescedIndexByComponent(invocationIndex, i),lv2_val[i]);
 }
 scratchAccessor.workgroupExecutionAndMemoryBarrier();

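Throughout the 3-level path the level-2 partials live at offset lv1_smem_size inside the same scratch buffer. That two-region layout appears to be what the reworked SharedScratchElementCount in arithmetic_config.hlsl accounts for: zero for a single-level config, SubgroupSize*ItemsPerInvocation_1 elements for level 1, plus another SubgroupSize*ItemsPerInvocation_2 only when LevelCount==3.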
@@ -372,12 +372,12 @@ struct scan<Config, BinOp, Exclusive, 3, device_capabilities>
         scratchAccessor.template get<scalar_t>(i*Config::SubgroupSize+invocationIndex,lv1_val[i]);

     scalar_t lv2_scan;
-    const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2);
+    const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(glsl::gl_SubgroupID(), Config::ItemsPerInvocation_2); // (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2);
     scratchAccessor.template set<scalar_t>(lv1_smem_size+bankedIndex, lv2_scan);

     [unroll]
     for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++)
-        scratchAccessor.template set<scalar_t>(i*Config::SubgroupSize+invocationIndex, binop(lv1_val[i],lv2_scan));
+        scratchAccessor.template set<scalar_t>(Config::sharedCoalescedIndexByComponent(invocationIndex, i), binop(lv1_val[i],lv2_scan));
 }

 // combine with level 0
@@ -388,7 +388,7 @@ struct scan<Config, BinOp, Exclusive, 3, device_capabilities>
     dataAccessor.template get<vector_lv0_t>(idx * Config::WorkgroupSize + virtualInvocationIndex, value);

     const uint32_t virtualSubgroupID = Config::virtualSubgroupID(glsl::gl_SubgroupID(), idx);
-    const uint32_t bankedIndex = Config::sharedMemCoalescedIndex(virtualSubgroupID, Config::ItemsPerInvocation_1);
+    const uint32_t bankedIndex = Config::sharedCoalescedIndexNextLevel(virtualSubgroupID, Config::ItemsPerInvocation_1);
     scalar_t left;
     scratchAccessor.template get<scalar_t>(bankedIndex,left);
     if (Exclusive)
