@@ -93,15 +93,19 @@ struct exchangeValues<SharedMemoryAdaptor, float64_t>
    }
};

+ // Get the required size (in number of uint32_t elements) of the workgroup shared memory array needed for the FFT
+ template <typename scalar_t, uint32_t WorkgroupSize>
+ NBL_CONSTEXPR uint32_t sharedMemSize = 2 * WorkgroupSize * (sizeof(scalar_t) / sizeof(uint32_t));
+
} //namespace fft

// ----------------------------------- End Utils -----------------------------------------------

- template<uint16_t ElementsPerInvocation, bool Inverse, typename Scalar, class device_capabilities=void>
+ template<uint16_t ElementsPerInvocation, bool Inverse, uint32_t WorkgroupSize, typename Scalar, class device_capabilities=void>
struct FFT;

// For the FFT methods below, we assume:
- // - Accessor is a global memory accessor to an array fitting 2 * _NBL_HLSL_WORKGROUP_SIZE_ elements of type complex_t<Scalar>, used to get inputs / set outputs of the FFT,
+ // - Accessor is a global memory accessor to an array fitting 2 * WorkgroupSize elements of type complex_t<Scalar>, used to get inputs / set outputs of the FFT,
//   that is, one "lo" and one "hi" complex numbers per thread, essentially 4 Scalars per thread. The arrays it accesses with `get` and `set` can optionally be
//   different, if you don't want the FFT to be done in-place.
// The Accessor MUST provide the following methods:
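Aside (not part of the diff): the new `sharedMemSize` variable template makes the shared-memory budget explicit at compile time. A minimal caller-side sketch, assuming the helper is reachable as `fft::sharedMemSize` and that `scalar_t` and `WorkgroupSize` are compile-time constants chosen by the calling shader:

// Backing array for the SharedMemoryAccessor described below.
// For scalar_t = float32_t and WorkgroupSize = 256 this is 2 * 256 * 1 = 512 uint32_t;
// for float64_t it doubles to 1024, since each scalar occupies two uint32_t.
groupshared uint32_t sharedmem[fft::sharedMemSize<scalar_t, WorkgroupSize>];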
@@ -110,15 +114,15 @@ struct FFT;
//     * void memoryBarrier();
//   You might optionally want to provide a `workgroupExecutionAndMemoryBarrier()` method on it to wait on to be sure the whole FFT pass is done

- // - SharedMemoryAccessor accesses a workgroup-shared memory array of size `2 * sizeof(Scalar) * _NBL_HLSL_WORKGROUP_SIZE_`.
+ // - SharedMemoryAccessor accesses a workgroup-shared memory array of size `2 * sizeof(Scalar) * WorkgroupSize`.
// The SharedMemoryAccessor MUST provide the following methods:
//     * void get(uint32_t index, inout uint32_t value);
//     * void set(uint32_t index, in uint32_t value);
//     * void workgroupExecutionAndMemoryBarrier();

// 2 items per invocation forward specialization
- template<typename Scalar, class device_capabilities>
- struct FFT<2, false, Scalar, device_capabilities>
+ template<uint32_t WorkgroupSize, typename Scalar, class device_capabilities>
+ struct FFT<2, false, WorkgroupSize, Scalar, device_capabilities>
{
    template<typename SharedMemoryAdaptor>
    static void FFT_loop(uint32_t stride, NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi, uint32_t threadID, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor)
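For reference, a bare-bones type satisfying the SharedMemoryAccessor contract listed above could look like the sketch below. This is illustrative only; it assumes the `groupshared` array from the earlier aside and the `glsl::barrier()` compatibility helper:

// Hypothetical SharedMemoryAccessor over the `sharedmem` array declared earlier
struct SharedMemoryAccessor
{
    void set(uint32_t idx, in uint32_t value)
    {
        sharedmem[idx] = value;
    }

    void get(uint32_t idx, inout uint32_t value)
    {
        value = sharedmem[idx];
    }

    void workgroupExecutionAndMemoryBarrier()
    {
        glsl::barrier(); // execution + shared memory barrier across the workgroup
    }
};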
@@ -136,27 +140,27 @@ struct FFT<2,false, Scalar, device_capabilities>
        // Compute the indices only once
        const uint32_t threadID = uint32_t(SubgroupContiguousIndex());
        const uint32_t loIx = threadID;
-         const uint32_t hiIx = _NBL_HLSL_WORKGROUP_SIZE_ | loIx;
+         const uint32_t hiIx = WorkgroupSize | loIx;

        // Read lo, hi values from global memory
        complex_t<Scalar> lo, hi;
        accessor.get(loIx, lo);
        accessor.get(hiIx, hi);

        // If for some reason you're running a small FFT, skip all the bigger-than-subgroup steps
-         if (_NBL_HLSL_WORKGROUP_SIZE_ > glsl::gl_SubgroupSize())
+         if (WorkgroupSize > glsl::gl_SubgroupSize())
        {
            // Set up the memory adaptor
-             using adaptor_t = accessor_adaptors::StructureOfArrays<SharedMemoryAccessor,uint32_t,uint32_t,1,_NBL_HLSL_WORKGROUP_SIZE_>;
+             using adaptor_t = accessor_adaptors::StructureOfArrays<SharedMemoryAccessor,uint32_t,uint32_t,1,WorkgroupSize>;
            adaptor_t sharedmemAdaptor;
            sharedmemAdaptor.accessor = sharedmemAccessor;

            // special first iteration
-             hlsl::fft::DIF<Scalar>::radix2(hlsl::fft::twiddle<false, Scalar>(threadID, _NBL_HLSL_WORKGROUP_SIZE_), lo, hi);
+             hlsl::fft::DIF<Scalar>::radix2(hlsl::fft::twiddle<false, Scalar>(threadID, WorkgroupSize), lo, hi);

            // Run bigger steps until Subgroup-sized
            [unroll]
-             for (uint32_t stride = _NBL_HLSL_WORKGROUP_SIZE_ >> 1; stride > glsl::gl_SubgroupSize(); stride >>= 1)
+             for (uint32_t stride = WorkgroupSize >> 1; stride > glsl::gl_SubgroupSize(); stride >>= 1)
            {
                FFT_loop< adaptor_t >(stride, lo, hi, threadID, sharedmemAdaptor);
                sharedmemAdaptor.workgroupExecutionAndMemoryBarrier();
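A quick sanity check on the new indexing (numbers are purely illustrative): with WorkgroupSize = 256 the workgroup-level FFT spans 2 * WorkgroupSize = 512 complex elements, and thread 5 reads loIx = 5 and hiIx = 256 | 5 = 261. Since WorkgroupSize is a power of two and loIx < WorkgroupSize, the OR is equivalent to adding WorkgroupSize, so each thread owns two elements exactly half the transform apart, which is what the first DIF radix-2 butterfly with twiddle(threadID, WorkgroupSize) expects.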
@@ -181,8 +185,8 @@ struct FFT<2,false, Scalar, device_capabilities>


// 2 items per invocation inverse specialization
- template<typename Scalar, class device_capabilities>
- struct FFT<2, true, Scalar, device_capabilities>
+ template<uint32_t WorkgroupSize, typename Scalar, class device_capabilities>
+ struct FFT<2, true, WorkgroupSize, Scalar, device_capabilities>
{
    template<typename SharedMemoryAdaptor>
    static void FFT_loop(uint32_t stride, NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi, uint32_t threadID, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor)
@@ -200,7 +204,7 @@ struct FFT<2,true, Scalar, device_capabilities>
        // Compute the indices only once
        const uint32_t threadID = uint32_t(SubgroupContiguousIndex());
        const uint32_t loIx = threadID;
-         const uint32_t hiIx = _NBL_HLSL_WORKGROUP_SIZE_ | loIx;
+         const uint32_t hiIx = WorkgroupSize | loIx;

        // Read lo, hi values from global memory
        complex_t<Scalar> lo, hi;
@@ -211,10 +215,10 @@ struct FFT<2,true, Scalar, device_capabilities>
        subgroup::FFT<true, Scalar, device_capabilities>::__call(lo, hi);

        // If for some reason you're running a small FFT, skip all the bigger-than-subgroup steps
-         if (_NBL_HLSL_WORKGROUP_SIZE_ > glsl::gl_SubgroupSize())
+         if (WorkgroupSize > glsl::gl_SubgroupSize())
        {
            // Set up the memory adaptor
-             using adaptor_t = accessor_adaptors::StructureOfArrays<SharedMemoryAccessor,uint32_t,uint32_t,1,_NBL_HLSL_WORKGROUP_SIZE_>;
+             using adaptor_t = accessor_adaptors::StructureOfArrays<SharedMemoryAccessor,uint32_t,uint32_t,1,WorkgroupSize>;
            adaptor_t sharedmemAdaptor;
            sharedmemAdaptor.accessor = sharedmemAccessor;

@@ -223,18 +227,18 @@ struct FFT<2,true, Scalar, device_capabilities>

            // The bigger steps
            [unroll]
-             for (uint32_t stride = glsl::gl_SubgroupSize() << 1; stride < _NBL_HLSL_WORKGROUP_SIZE_; stride <<= 1)
+             for (uint32_t stride = glsl::gl_SubgroupSize() << 1; stride < WorkgroupSize; stride <<= 1)
            {
                // Order of waiting for shared mem writes is also reversed here, since the shuffle came earlier
                sharedmemAdaptor.workgroupExecutionAndMemoryBarrier();
                FFT_loop< adaptor_t >(stride, lo, hi, threadID, sharedmemAdaptor);
            }

            // special last iteration
-             hlsl::fft::DIT<Scalar>::radix2(hlsl::fft::twiddle<true, Scalar>(threadID, _NBL_HLSL_WORKGROUP_SIZE_), lo, hi);
+             hlsl::fft::DIT<Scalar>::radix2(hlsl::fft::twiddle<true, Scalar>(threadID, WorkgroupSize), lo, hi);
            divides_assign< complex_t<Scalar> > divAss;
-             divAss(lo, Scalar(_NBL_HLSL_WORKGROUP_SIZE_ / glsl::gl_SubgroupSize()));
-             divAss(hi, Scalar(_NBL_HLSL_WORKGROUP_SIZE_ / glsl::gl_SubgroupSize()));
+             divAss(lo, Scalar(WorkgroupSize / glsl::gl_SubgroupSize()));
+             divAss(hi, Scalar(WorkgroupSize / glsl::gl_SubgroupSize()));

            // Remember to update the accessor's state
            sharedmemAccessor = sharedmemAdaptor.accessor;
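A note on the division above (the subgroup-level scaling factor is an inference, not something visible in this diff): with WorkgroupSize = 256 and a subgroup size of 32, divAss scales lo and hi by 1/8. Assuming subgroup::FFT<true, ...> already applies a 1/(2 * SubgroupSize) = 1/64 factor, the combined scale is 1/512 = 1/(2 * WorkgroupSize), i.e. the standard 1/N normalization for an inverse FFT of length N = 2 * WorkgroupSize.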
@@ -247,17 +251,17 @@ struct FFT<2,true, Scalar, device_capabilities>
};

// Forward FFT
- template<uint32_t K, typename Scalar, class device_capabilities>
- struct FFT<K, false, Scalar, device_capabilities>
+ template<uint32_t K, uint32_t WorkgroupSize, typename Scalar, class device_capabilities>
+ struct FFT<K, false, WorkgroupSize, Scalar, device_capabilities>
{
    template<typename Accessor, typename SharedMemoryAccessor>
    static enable_if_t<(mpl::is_pot_v<K> && K > 2), void> __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
    {
        [unroll]
-         for (uint32_t stride = (K / 2) * _NBL_HLSL_WORKGROUP_SIZE_; stride > _NBL_HLSL_WORKGROUP_SIZE_; stride >>= 1)
+         for (uint32_t stride = (K / 2) * WorkgroupSize; stride > WorkgroupSize; stride >>= 1)
        {
            [unroll]
-             for (uint32_t virtualThreadID = SubgroupContiguousIndex(); virtualThreadID < (K / 2) * _NBL_HLSL_WORKGROUP_SIZE_; virtualThreadID += _NBL_HLSL_WORKGROUP_SIZE_)
+             for (uint32_t virtualThreadID = SubgroupContiguousIndex(); virtualThreadID < (K / 2) * WorkgroupSize; virtualThreadID += WorkgroupSize)
            {
                const uint32_t loIx = ((virtualThreadID & (~(stride - 1))) << 1) | (virtualThreadID & (stride - 1));
                const uint32_t hiIx = loIx | stride;
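Worked example of the virtual-thread indexing (illustrative numbers only): take K = 4 and WorkgroupSize = 256, so the transform spans K * WorkgroupSize = 1024 complex elements, there are (K / 2) * WorkgroupSize = 512 virtual threads, and each real thread handles two of them (e.g. thread 44 takes virtualThreadID 44 and 300). For K = 4 the outer loop runs a single stride of 512, for which the bit manipulation degenerates to loIx = virtualThreadID and hiIx = virtualThreadID + 512; virtualThreadID = 300 therefore pairs elements 300 and 812. The remaining, smaller strides are handled by delegating to the 2-item specialization, as the next hunk shows.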
@@ -282,16 +286,16 @@ struct FFT<K, false, Scalar, device_capabilities>
        {
            if (k)
                sharedmemAccessor.workgroupExecutionAndMemoryBarrier();
-             offsetAccessor.offset = _NBL_HLSL_WORKGROUP_SIZE_*k;
-             FFT<2, false, Scalar, device_capabilities>::template __call(offsetAccessor,sharedmemAccessor);
+             offsetAccessor.offset = WorkgroupSize*k;
+             FFT<2, false, WorkgroupSize, Scalar, device_capabilities>::template __call(offsetAccessor,sharedmemAccessor);
        }
        accessor = offsetAccessor.accessor;
    }
};

// Inverse FFT
- template<uint32_t K, typename Scalar, class device_capabilities>
- struct FFT<K, true, Scalar, device_capabilities>
+ template<uint32_t K, uint32_t WorkgroupSize, typename Scalar, class device_capabilities>
+ struct FFT<K, true, WorkgroupSize, Scalar, device_capabilities>
{
    template<typename Accessor, typename SharedMemoryAccessor>
    static enable_if_t<(mpl::is_pot_v<K> && K > 2), void> __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
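Putting the forward path together: the K-item specialization runs the above-WorkgroupSize strides with virtual threads, then delegates the remaining strides to the 2-item specialization through the offset accessor. A hedged usage sketch follows; the enclosing workgroup namespace, the scalar_t alias and the user-side Accessor / SharedMemoryAccessor types are assumptions of this sketch, not spelled out in the diff:

// 1024-point forward FFT: 256 threads, K = 4 complex elements per invocation.
NBL_CONSTEXPR uint32_t WorkgroupSize = 256;
Accessor accessor;                      // wraps a buffer of K * WorkgroupSize complex_t<scalar_t>
SharedMemoryAccessor sharedmemAccessor; // wraps the groupshared array sized with fft::sharedMemSize
workgroup::FFT<4, false, WorkgroupSize, scalar_t>::template __call(accessor, sharedmemAccessor);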
@@ -304,17 +308,17 @@ struct FFT<K, true, Scalar, device_capabilities>
        {
            if (k)
                sharedmemAccessor.workgroupExecutionAndMemoryBarrier();
-             offsetAccessor.offset = _NBL_HLSL_WORKGROUP_SIZE_*k;
-             FFT<2, true, Scalar, device_capabilities>::template __call(offsetAccessor,sharedmemAccessor);
+             offsetAccessor.offset = WorkgroupSize*k;
+             FFT<2, true, WorkgroupSize, Scalar, device_capabilities>::template __call(offsetAccessor,sharedmemAccessor);
        }
        accessor = offsetAccessor.accessor;

        [unroll]
-         for (uint32_t stride = 2 * _NBL_HLSL_WORKGROUP_SIZE_; stride < K * _NBL_HLSL_WORKGROUP_SIZE_; stride <<= 1)
+         for (uint32_t stride = 2 * WorkgroupSize; stride < K * WorkgroupSize; stride <<= 1)
        {
            accessor.memoryBarrier(); // no execution barrier just making sure writes propagate to accessor
            [unroll]
-             for (uint32_t virtualThreadID = SubgroupContiguousIndex(); virtualThreadID < (K / 2) * _NBL_HLSL_WORKGROUP_SIZE_; virtualThreadID += _NBL_HLSL_WORKGROUP_SIZE_)
+             for (uint32_t virtualThreadID = SubgroupContiguousIndex(); virtualThreadID < (K / 2) * WorkgroupSize; virtualThreadID += WorkgroupSize)
            {
                const uint32_t loIx = ((virtualThreadID & (~(stride - 1))) << 1) | (virtualThreadID & (stride - 1));
                const uint32_t hiIx = loIx | stride;
@@ -326,7 +330,7 @@ struct FFT<K, true, Scalar, device_capabilities>
                hlsl::fft::DIT<Scalar>::radix2(hlsl::fft::twiddle<true,Scalar>(virtualThreadID & (stride - 1), stride), lo,hi);

                // Divide by special factor at the end
-                 if ((K / 2) * _NBL_HLSL_WORKGROUP_SIZE_ == stride)
+                 if ((K / 2) * WorkgroupSize == stride)
                {
                    divides_assign< complex_t<Scalar> > divAss;
                    divAss(lo, K / 2);
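Closing note on the inverse normalization (same caveat as before about the subgroup-level factor being an assumption): the K / 2 divide at the largest stride composes with the WorkgroupSize / SubgroupSize divide inside FFT<2, true, ...> and the assumed 1/(2 * SubgroupSize) scaling of the subgroup pass. For K = 4, WorkgroupSize = 256 and a subgroup size of 32 that is (1/2) * (1/8) * (1/64) = 1/1024 = 1/(K * WorkgroupSize), again the expected 1/N for an inverse transform of length N.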