More changes following Bloom PR review

Fletterio · Fletterio · commit dcc537e3e018 · 2024-11-13T16:30:53.000-03:00
diff --git a/examples_tests b/examples_tests
@@ -1 +1 @@
-Subproject commit 0df6008c923712ed729b47cf7711c6301622fdb3
+Subproject commit e946590f686875e4bf0262b738e776fc668b58da
diff --git a/include/nbl/builtin/hlsl/workgroup/fft.hlsl b/include/nbl/builtin/hlsl/workgroup/fft.hlsl
@@ -8,6 +8,7 @@
 #include "nbl/builtin/hlsl/mpl.hlsl"
 #include "nbl/builtin/hlsl/memory_accessor.hlsl"
 #include "nbl/builtin/hlsl/bit.hlsl"
+#include "nbl/builtin/hlsl/concepts.hlsl"
 
 // Caveats
 // - Sin and Cos in HLSL take 32-bit floats. Using this library with 64-bit floats works perfectly fine, but DXC will emit warnings
@@ -90,10 +91,6 @@ namespace impl
     }
 } //namespace impl
 
-// Get the required size (in number of uint32_t elements) of the workgroup shared memory array needed for the FFT
-template <typename scalar_t, uint16_t WorkgroupSize>
-NBL_CONSTEXPR uint32_t SharedMemoryDWORDs = (sizeof(complex_t<scalar_t>) / sizeof(uint32_t))  * WorkgroupSize;
-
 // Util to unpack two values from the packed FFT X + iY - get outputs in the same input arguments, storing x to lo and y to hi
 template<typename Scalar>
 void unpack(NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi)
@@ -103,7 +100,7 @@ void unpack(NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi
     lo = x;
 }
 
-template<uint16_t ElementsPerInvocation, uint16_t WorkgroupSize>
+template<uint16_t ElementsPerInvocationLog2, uint16_t WorkgroupSizeLog2>
 struct FFTIndexingUtils
 {
     // This function maps the index `idx` in the output array of a Nabla FFT to the index `freqIdx` in the DFT such that `DFT[freqIdx] = NablaFFT[idx]`
@@ -132,16 +129,36 @@ struct FFTIndexingUtils
         return getNablaIndex(getDFTMirrorIndex(getDFTIndex(idx)));
     }
 
-    NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementsPerInvocationLog2 = mpl::log2<ElementsPerInvocation>::value;
-    NBL_CONSTEXPR_STATIC_INLINE uint16_t FFTSizeLog2 = ElementsPerInvocationLog2 + mpl::log2<WorkgroupSize>::value;
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t FFTSize = uint32_t(WorkgroupSize) * uint32_t(ElementsPerInvocation);
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t FFTSizeLog2 = ElementsPerInvocationLog2 + WorkgroupSizeLog2;
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t FFTSize = uint32_t(1) << FFTSizeLog2;
 };
 
 } //namespace fft
 
-// ----------------------------------- End Utils -----------------------------------------------
+// ----------------------------------- End Utils --------------------------------------------------------------
+
+namespace fft
+{
+
+template<uint16_t _ElementsPerInvocationLog2, uint16_t _WorkgroupSizeLog2, typename _Scalar NBL_PRIMARY_REQUIRES(_ElementsPerInvocationLog2 > 0 && _WorkgroupSizeLog2 >= 5)
+struct ConstevalParameters
+{
+    using scalar_t = _Scalar;
+
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementsPerInvocationLog2 = _ElementsPerInvocationLog2;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = _WorkgroupSizeLog2;
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t TotalSize = uint32_t(1) << (ElementsPerInvocationLog2 + WorkgroupSizeLog2);
 
-template<uint16_t ElementsPerInvocation, bool Inverse, uint16_t WorkgroupSize, typename Scalar, class device_capabilities=void>
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementsPerInvocation = uint16_t(1) << ElementsPerInvocationLog2;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(1) << WorkgroupSizeLog2;
+
+    // Required size (in number of uint32_t elements) of the workgroup shared memory array needed for the FFT
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedMemoryDWORDs = (sizeof(complex_t<scalar_t>) / sizeof(uint32_t)) << WorkgroupSizeLog2;
+};
+
+} //namespace fft
+
+template<bool Inverse, typename consteval_params_t, class device_capabilities=void>
 struct FFT;
 
 // For the FFT methods below, we assume:
@@ -161,9 +178,11 @@ struct FFT;
 //             * void workgroupExecutionAndMemoryBarrier();
 
 // 2 items per invocation forward specialization
-template<uint16_t WorkgroupSize, typename Scalar, class device_capabilities>
-struct FFT<2,false, WorkgroupSize, Scalar, device_capabilities>
+template<uint16_t WorkgroupSizeLog2, typename Scalar, class device_capabilities>
+struct FFT<false, fft::ConstevalParameters<1, WorkgroupSizeLog2, Scalar>, device_capabilities>
 {
+    using consteval_params_t = fft::ConstevalParameters<1, WorkgroupSizeLog2, Scalar>;
+
     template<typename SharedMemoryAdaptor>
     static void FFT_loop(uint32_t stride, NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi, uint32_t threadID, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor)
     {
@@ -177,6 +196,8 @@ struct FFT<2,false, WorkgroupSize, Scalar, device_capabilities>
     template<typename Accessor, typename SharedMemoryAccessor>
     static void __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
     {
+        NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = consteval_params_t::WorkgroupSize;
+
         // Compute the indices only once
         const uint32_t threadID = uint32_t(SubgroupContiguousIndex());
 		const uint32_t loIx = threadID;
@@ -222,12 +243,12 @@ struct FFT<2,false, WorkgroupSize, Scalar, device_capabilities>
     }
 };
 
-
-
 // 2 items per invocation inverse specialization
-template<uint16_t WorkgroupSize, typename Scalar, class device_capabilities>
-struct FFT<2,true, WorkgroupSize, Scalar, device_capabilities>
+template<uint16_t WorkgroupSizeLog2, typename Scalar, class device_capabilities>
+struct FFT<true, fft::ConstevalParameters<1, WorkgroupSizeLog2, Scalar>, device_capabilities>
 {
+    using consteval_params_t = fft::ConstevalParameters<1, WorkgroupSizeLog2, Scalar>;
+
     template<typename SharedMemoryAdaptor>
     static void FFT_loop(uint32_t stride, NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi, uint32_t threadID, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor)
     {
@@ -241,6 +262,8 @@ struct FFT<2,true, WorkgroupSize, Scalar, device_capabilities>
     template<typename Accessor, typename SharedMemoryAccessor>
     static void __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
     {
+        NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = consteval_params_t::WorkgroupSize;
+
         // Compute the indices only once
         const uint32_t threadID = uint32_t(SubgroupContiguousIndex());
         const uint32_t loIx = threadID;
@@ -291,17 +314,23 @@ struct FFT<2,true, WorkgroupSize, Scalar, device_capabilities>
 };
 
 // Forward FFT
-template<uint32_t K, uint16_t WorkgroupSize, typename Scalar, class device_capabilities>
-struct FFT<K, false, WorkgroupSize, Scalar, device_capabilities>
+template<uint16_t ElementsPerInvocationLog2, uint16_t WorkgroupSizeLog2, typename Scalar, class device_capabilities>
+struct FFT<false, fft::ConstevalParameters<ElementsPerInvocationLog2, WorkgroupSizeLog2, Scalar>, device_capabilities>
 {
+    using consteval_params_t = fft::ConstevalParameters<ElementsPerInvocationLog2, WorkgroupSizeLog2, Scalar>;
+    using small_fft_consteval_params_t = fft::ConstevalParameters<1, WorkgroupSizeLog2, Scalar>;
+
     template<typename Accessor, typename SharedMemoryAccessor>
-    static enable_if_t< (mpl::is_pot_v<K> && K > 2), void > __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
+    static void __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
     {
+        NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = consteval_params_t::WorkgroupSize;
+        NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementsPerInvocation = consteval_params_t::ElementsPerInvocation;
+
         [unroll]
-        for (uint32_t stride = (K / 2) * WorkgroupSize; stride > WorkgroupSize; stride >>= 1)
+        for (uint32_t stride = (ElementsPerInvocation / 2) * WorkgroupSize; stride > WorkgroupSize; stride >>= 1)
         {
             [unroll]
-            for (uint32_t virtualThreadID = SubgroupContiguousIndex(); virtualThreadID < (K / 2) * WorkgroupSize; virtualThreadID += WorkgroupSize)
+            for (uint32_t virtualThreadID = SubgroupContiguousIndex(); virtualThreadID < (ElementsPerInvocation / 2) * WorkgroupSize; virtualThreadID += WorkgroupSize)
             {
                 const uint32_t loIx = ((virtualThreadID & (~(stride - 1))) << 1) | (virtualThreadID & (stride - 1));
                 const uint32_t hiIx = loIx | stride;
@@ -318,47 +347,53 @@ struct FFT<K, false, WorkgroupSize, Scalar, device_capabilities>
             accessor.memoryBarrier(); // no execution barrier just making sure writes propagate to accessor
         }
 
-        // do K/2 small workgroup FFTs
+        // do ElementsPerInvocation/2 small workgroup FFTs
         accessor_adaptors::Offset<Accessor> offsetAccessor;
         offsetAccessor.accessor = accessor;
         [unroll]
-        for (uint32_t k = 0; k < K; k += 2)
+        for (uint32_t k = 0; k < ElementsPerInvocation; k += 2)
         {
             if (k)
                 sharedmemAccessor.workgroupExecutionAndMemoryBarrier();
             offsetAccessor.offset = WorkgroupSize*k;
-            FFT<2,false, WorkgroupSize, Scalar, device_capabilities>::template __call(offsetAccessor,sharedmemAccessor);
+            FFT<false, small_fft_consteval_params_t, device_capabilities>::template __call(offsetAccessor,sharedmemAccessor);
         }
         accessor = offsetAccessor.accessor;
     }
 };
 
 // Inverse FFT
-template<uint32_t K, uint16_t WorkgroupSize, typename Scalar, class device_capabilities>
-struct FFT<K, true, WorkgroupSize, Scalar, device_capabilities>
+template<uint16_t ElementsPerInvocationLog2, uint16_t WorkgroupSizeLog2, typename Scalar, class device_capabilities>
+struct FFT<true, fft::ConstevalParameters<ElementsPerInvocationLog2, WorkgroupSizeLog2, Scalar>, device_capabilities>
 {
+    using consteval_params_t = fft::ConstevalParameters<ElementsPerInvocationLog2, WorkgroupSizeLog2, Scalar>;
+    using small_fft_consteval_params_t = fft::ConstevalParameters<1, WorkgroupSizeLog2, Scalar>;
+
     template<typename Accessor, typename SharedMemoryAccessor>
-    static enable_if_t< (mpl::is_pot_v<K> && K > 2), void > __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
+    static void __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
     {
+        NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = consteval_params_t::WorkgroupSize;
+        NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementsPerInvocation = consteval_params_t::ElementsPerInvocation;
+
         // do K/2 small workgroup FFTs
         accessor_adaptors::Offset<Accessor> offsetAccessor;
         offsetAccessor.accessor = accessor;
         [unroll]
-        for (uint32_t k = 0; k < K; k += 2)
+        for (uint32_t k = 0; k < ElementsPerInvocation; k += 2)
         {
             if (k)
                 sharedmemAccessor.workgroupExecutionAndMemoryBarrier();
             offsetAccessor.offset = WorkgroupSize*k;
-            FFT<2,true, WorkgroupSize, Scalar, device_capabilities>::template __call(offsetAccessor,sharedmemAccessor);
+            FFT<true, small_fft_consteval_params_t, device_capabilities>::template __call(offsetAccessor,sharedmemAccessor);
         }
         accessor = offsetAccessor.accessor;
         
         [unroll]
-        for (uint32_t stride = 2 * WorkgroupSize; stride < K * WorkgroupSize; stride <<= 1)
+        for (uint32_t stride = 2 * WorkgroupSize; stride < ElementsPerInvocation * WorkgroupSize; stride <<= 1)
         {
             accessor.memoryBarrier(); // no execution barrier just making sure writes propagate to accessor
             [unroll]
-            for (uint32_t virtualThreadID = SubgroupContiguousIndex(); virtualThreadID < (K / 2) * WorkgroupSize; virtualThreadID += WorkgroupSize)
+            for (uint32_t virtualThreadID = SubgroupContiguousIndex(); virtualThreadID < (ElementsPerInvocation / 2) * WorkgroupSize; virtualThreadID += WorkgroupSize)
             {
                 const uint32_t loIx = ((virtualThreadID & (~(stride - 1))) << 1) | (virtualThreadID & (stride - 1));
                 const uint32_t hiIx = loIx | stride;
@@ -370,11 +405,11 @@ struct FFT<K, true, WorkgroupSize, Scalar, device_capabilities>
                 hlsl::fft::DIT<Scalar>::radix2(hlsl::fft::twiddle<true,Scalar>(virtualThreadID & (stride - 1), stride), lo,hi);
                 
                 // Divide by special factor at the end
-                if ( (K / 2) * WorkgroupSize == stride)
+                if ( (ElementsPerInvocation / 2) * WorkgroupSize == stride)
                 {
                     divides_assign< complex_t<Scalar> > divAss;
-                    divAss(lo, K / 2);
-                    divAss(hi, K / 2);  
+                    divAss(lo, ElementsPerInvocation / 2);
+                    divAss(hi, ElementsPerInvocation / 2);  
                 }
 
                 accessor.set(loIx, lo);