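Adjust arithmetic config and its usages: constrain the workgroup2 reduction/scan templates with is_configuration_v, move the ItemsPerInvocation tuple into impl::items_per_invocation, express SharedScratchElementCount via LevelInputCount_1/LevelInputCount_2, and fold the level-2 scratch offset into sharedStoreIndex so call sites drop lv1_smem_size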

keptsecret · keptsecret · commit 37aa99baee12 · 2025-06-03T16:46:32.000+07:00
diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl
@@ -17,7 +17,7 @@ namespace hlsl
 namespace workgroup2
 {
 
-template<class Config, class BinOp, class device_capabilities=void>
+template<class Config, class BinOp, class device_capabilities=void NBL_PRIMARY_REQUIRES(is_configuration_v<Config>)
 struct reduction
 {
     using scalar_t = typename BinOp::type_t;
@@ -30,7 +30,7 @@ struct reduction
     }
 };
 
-template<class Config, class BinOp, class device_capabilities=void>
+template<class Config, class BinOp, class device_capabilities=void NBL_PRIMARY_REQUIRES(is_configuration_v<Config>)
 struct inclusive_scan
 {
     using scalar_t = typename BinOp::type_t;
@@ -43,7 +43,7 @@ struct inclusive_scan
     }
 };
 
-template<class Config, class BinOp, class device_capabilities=void>
+template<class Config, class BinOp, class device_capabilities=void NBL_PRIMARY_REQUIRES(is_configuration_v<Config>)
 struct exclusive_scan
 {
     using scalar_t = typename BinOp::type_t;
diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl
@@ -36,6 +36,8 @@ struct items_per_invocation
     NBL_CONSTEXPR_STATIC_INLINE uint16_t value0 = BaseItemsPerInvocation;
     NBL_CONSTEXPR_STATIC_INLINE uint16_t value1 = uint16_t(0x1u) << conditional_value<VirtualWorkgroup::levels==3, uint16_t,mpl::min_v<uint16_t,ItemsPerInvocationProductLog2,2>, ItemsPerInvocationProductLog2>::value;
     NBL_CONSTEXPR_STATIC_INLINE uint16_t value2 = uint16_t(0x1u) << mpl::max_v<int16_t,ItemsPerInvocationProductLog2-2,0>;
+
+    using ItemsPerInvocation = tuple<integral_constant<uint16_t,value0>,integral_constant<uint16_t,value1>,integral_constant<uint16_t,value2> >;
 };
 }
 
@@ -53,26 +55,24 @@ struct ArithmeticConfiguration
     static_assert(VirtualWorkgroupSize<=WorkgroupSize*SubgroupSize);
 
     using items_per_invoc_t = impl::items_per_invocation<virtual_wg_t, _ItemsPerInvocation>;
-    using ItemsPerInvocation = tuple<integral_constant<uint16_t,items_per_invoc_t::value0>,integral_constant<uint16_t,items_per_invoc_t::value1>,integral_constant<uint16_t,items_per_invoc_t::value2> >;
     NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_0 = items_per_invoc_t::value0;
     NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_1 = items_per_invoc_t::value1;
     NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = items_per_invoc_t::value2;
+    static_assert(ItemsPerInvocation_2<=4, "4 level scan would have been needed with this config!");
 
-    // NBL_CONSTEXPR_STATIC_INLINE uint16_t __ItemsPerVirtualWorkgroupLog2 = mpl::max_v<uint16_t, WorkgroupSizeLog2-SubgroupSizeLog2, SubgroupSizeLog2>;
-    // NBL_CONSTEXPR_STATIC_INLINE uint16_t __ItemsPerVirtualWorkgroup = uint16_t(0x1u) << __ItemsPerVirtualWorkgroupLog2;
     NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelInputCount_1 = conditional_value<LevelCount==3,uint16_t,
         mpl::max_v<uint16_t, (VirtualWorkgroupSize>>SubgroupSizeLog2), SubgroupSize>,
         SubgroupSize*ItemsPerInvocation_1>::value;
     NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelInputCount_2 = conditional_value<LevelCount==3,uint16_t,SubgroupSize*ItemsPerInvocation_2,0>::value;
     NBL_CONSTEXPR_STATIC_INLINE uint16_t __SubgroupsPerVirtualWorkgroup = LevelInputCount_1 / ItemsPerInvocation_1;
 
-    // user specified the shared mem size of uint32_ts
+    // user specified the shared mem size of Scalars
     NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = conditional_value<LevelCount==1,uint16_t,
         0,
         conditional_value<LevelCount==3,uint16_t,
-            SubgroupSize*ItemsPerInvocation_2+LevelInputCount_1,
-            SubgroupSize*ItemsPerInvocation_1
-            >::value
+            LevelInputCount_2,
+            0
+            >::value + LevelInputCount_1
         >::value;
 
     static bool electLast()
@@ -90,30 +90,30 @@ struct ArithmeticConfiguration
     // get a coalesced index to store for the next level in shared mem, e.g. level 0 -> level 1
     // specify the next level to store values for in template param
     // at level==LevelCount-1, it is guaranteed to have SubgroupSize elements
-    template<uint16_t level>
-    static uint16_t sharedStoreIndex(const uint16_t subgroupID)
+    template<uint16_t level NBL_FUNC_REQUIRES(level>0 && level<LevelCount)
+    static uint16_t sharedStoreIndex(const uint16_t virtualSubgroupID)
     {
         uint16_t offsetBySubgroup;
         if (level == LevelCount-1)
             offsetBySubgroup = SubgroupSize;
         else
             offsetBySubgroup = __SubgroupsPerVirtualWorkgroup;
 
-        if (level<2)
-            return (subgroupID & (ItemsPerInvocation_1-uint16_t(1u))) * offsetBySubgroup + (subgroupID/ItemsPerInvocation_1);
+        if (level==2)
+            return LevelInputCount_1 + (virtualSubgroupID & (ItemsPerInvocation_2-uint16_t(1u))) * offsetBySubgroup + (virtualSubgroupID/ItemsPerInvocation_2);
         else
-            return (subgroupID & (ItemsPerInvocation_2-uint16_t(1u))) * offsetBySubgroup + (subgroupID/ItemsPerInvocation_2);
+            return (virtualSubgroupID & (ItemsPerInvocation_1-uint16_t(1u))) * offsetBySubgroup + (virtualSubgroupID/ItemsPerInvocation_1);
     }
 
-    template<uint16_t level>
+    template<uint16_t level NBL_FUNC_REQUIRES(level>0 && level<LevelCount)
     static uint16_t sharedStoreIndexFromVirtualIndex(const uint16_t subgroupID, const uint16_t workgroupInVirtualIndex)
     {
         const uint16_t virtualID = virtualSubgroupID(subgroupID, workgroupInVirtualIndex);
         return sharedStoreIndex<level>(virtualID);
     }
 
     // get the coalesced index in shared mem at the current level
-    template<uint16_t level>
+    template<uint16_t level NBL_FUNC_REQUIRES(level>0 && level<LevelCount)
     static uint16_t sharedLoadIndex(const uint16_t invocationIndex, const uint16_t component)
     {
         if (level == LevelCount-1)
diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl
@@ -247,7 +247,6 @@ struct reduce<Config, BinOp, 3, device_capabilities>
 
         const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex();
         // level 1 scan
-        const uint32_t lv1_smem_size = Config::LevelInputCount_1;
         subgroup2::reduction<params_lv1_t> reduction1;
         if (glsl::gl_SubgroupID() < Config::LevelInputCount_2)
         {
@@ -259,7 +258,7 @@ struct reduce<Config, BinOp, 3, device_capabilities>
             if (Config::electLast())
             {
                 const uint16_t bankedIndex = Config::template sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID()));
-                scratchAccessor.template set<scalar_t, uint16_t>(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]);
+                scratchAccessor.template set<scalar_t, uint16_t>(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]);
             }
         }
         scratchAccessor.workgroupExecutionAndMemoryBarrier();
@@ -271,7 +270,7 @@ struct reduce<Config, BinOp, 3, device_capabilities>
             vector_lv2_t lv2_val;
             [unroll]
             for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++)
-                scratchAccessor.template get<scalar_t, uint16_t>(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]);
+                scratchAccessor.template get<scalar_t, uint16_t>(Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]);
             lv2_val = reduction2(lv2_val);
             if (Config::electLast())
                 scratchAccessor.template set<scalar_t, uint16_t>(0, lv2_val[Config::ItemsPerInvocation_2-1]);
@@ -305,7 +304,6 @@ struct scan<Config, BinOp, Exclusive, 3, device_capabilities>
 
         const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex();
         // level 1 scan
-        const uint32_t lv1_smem_size = Config::LevelInputCount_1;
         subgroup2::inclusive_scan<params_lv1_t> inclusiveScan1;
         if (glsl::gl_SubgroupID() < Config::LevelInputCount_2)
         {
@@ -320,7 +318,7 @@ struct scan<Config, BinOp, Exclusive, 3, device_capabilities>
             if (Config::electLast())
             {
                 const uint16_t bankedIndex = Config::template sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID()));
-                scratchAccessor.template set<scalar_t, uint16_t>(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]);
+                scratchAccessor.template set<scalar_t, uint16_t>(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]);
             }
         }
         scratchAccessor.workgroupExecutionAndMemoryBarrier();
@@ -332,11 +330,11 @@ struct scan<Config, BinOp, Exclusive, 3, device_capabilities>
             vector_lv2_t lv2_val;
             [unroll]
             for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++)
-                scratchAccessor.template get<scalar_t, uint16_t>(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]);
+                scratchAccessor.template get<scalar_t, uint16_t>(Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]);
             lv2_val = inclusiveScan2(lv2_val);
             [unroll]
             for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++)
-                scratchAccessor.template set<scalar_t, uint16_t>(lv1_smem_size+Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]);
+                scratchAccessor.template set<scalar_t, uint16_t>(Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]);
         }
         scratchAccessor.workgroupExecutionAndMemoryBarrier();
 
@@ -351,7 +349,7 @@ struct scan<Config, BinOp, Exclusive, 3, device_capabilities>
 
             scalar_t lv2_scan;
             const uint16_t bankedIndex = Config::template sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID()-1u));
-            scratchAccessor.template get<scalar_t, uint16_t>(lv1_smem_size+bankedIndex, lv2_scan);
+            scratchAccessor.template get<scalar_t, uint16_t>(bankedIndex, lv2_scan);
 
             [unroll]
             for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i--)

Original file line number	Diff line number	Diff line change
`@@ -17,7 +17,7 @@ namespace hlsl`
`17`	`17`	`namespace workgroup2`
`18`	`18`	`{`
`19`	`19`
`20`		`-template<class Config, class BinOp, class device_capabilities=void>`
	`20`	`+template<class Config, class BinOp, class device_capabilities=void NBL_PRIMARY_REQUIRES(is_configuration_v<Config>)`
`21`	`21`	`struct reduction`
`22`	`22`	`{`
`23`	`23`	`using scalar_t = typename BinOp::type_t;`
`@@ -30,7 +30,7 @@ struct reduction`
`30`	`30`	`}`
`31`	`31`	`};`
`32`	`32`
`33`		`-template<class Config, class BinOp, class device_capabilities=void>`
	`33`	`+template<class Config, class BinOp, class device_capabilities=void NBL_PRIMARY_REQUIRES(is_configuration_v<Config>)`
`34`	`34`	`struct inclusive_scan`
`35`	`35`	`{`
`36`	`36`	`using scalar_t = typename BinOp::type_t;`
`@@ -43,7 +43,7 @@ struct inclusive_scan`
`43`	`43`	`}`
`44`	`44`	`};`
`45`	`45`
`46`		`-template<class Config, class BinOp, class device_capabilities=void>`
	`46`	`+template<class Config, class BinOp, class device_capabilities=void NBL_PRIMARY_REQUIRES(is_configuration_v<Config>)`
`47`	`47`	`struct exclusive_scan`
`48`	`48`	`{`
`49`	`49`	`using scalar_t = typename BinOp::type_t;`