Refactor following PR review

Fletterio · Fletterio · commit a5360cafed3d · 2024-07-04T16:55:10.000-03:00
diff --git a/include/nbl/builtin/hlsl/subgroup/fft.hlsl b/include/nbl/builtin/hlsl/subgroup/fft.hlsl
@@ -11,8 +11,6 @@ namespace hlsl
 {
 namespace subgroup
 {
-namespace fft
-{
 
 // -----------------------------------------------------------------------------------------------------------------------------------------------------------------
 template<bool Inverse, typename Scalar, class device_capabilities=void>
@@ -42,15 +40,15 @@ struct FFT<false, Scalar, device_capabilities>
             hi.imag(exchanged.y);
         }
         // Get twiddle with k = subgroupInvocation mod stride, halfN = stride
-        hlsl::fft::DIF<Scalar>::radix2(hlsl::fft::twiddle<false, Scalar>(glsl::gl_SubgroupInvocationID() & (stride - 1), stride), lo, hi);
+        fft::DIF<Scalar>::radix2(fft::twiddle<false, Scalar>(glsl::gl_SubgroupInvocationID() & (stride - 1), stride), lo, hi);
     }
 
     static void __call(NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi) 
     {
         const uint32_t subgroupSize = glsl::gl_SubgroupSize();  //This is N/2
 
         // special first iteration
-        hlsl::fft::DIF<Scalar>::radix2(hlsl::fft::twiddle<false, Scalar>(glsl::gl_SubgroupInvocationID(), subgroupSize), lo, hi);                                                                                   
+        fft::DIF<Scalar>::radix2(fft::twiddle<false, Scalar>(glsl::gl_SubgroupInvocationID(), subgroupSize), lo, hi);                                                                                   
         
         // Decimation in Frequency
         for (uint32_t stride = subgroupSize >> 1; stride > 0; stride >>= 1)
@@ -67,7 +65,7 @@ struct FFT<true, Scalar, device_capabilities>
     static void FFT_loop(uint32_t stride, NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi)
     {
         // Get twiddle with k = subgroupInvocation mod stride, halfN = stride
-        hlsl::fft::DIT<Scalar>::radix2(hlsl::fft::twiddle<true, Scalar>(glsl::gl_SubgroupInvocationID() & (stride - 1), stride), lo, hi);   
+        fft::DIT<Scalar>::radix2(fft::twiddle<true, Scalar>(glsl::gl_SubgroupInvocationID() & (stride - 1), stride), lo, hi);   
 
         const bool topHalf = bool(glsl::gl_SubgroupInvocationID() & stride);
         const vector <Scalar, 2> toTrade = topHalf ? vector <Scalar, 2>(lo.real(), lo.imag()) : vector <Scalar, 2>(hi.real(), hi.imag());
@@ -94,7 +92,7 @@ struct FFT<true, Scalar, device_capabilities>
             FFT_loop(stride, lo, hi);
         
         // special last iteration 
-        hlsl::fft::DIT<Scalar>::radix2(hlsl::fft::twiddle<true, Scalar>(glsl::gl_SubgroupInvocationID(), subgroupSize), lo, hi);
+        fft::DIT<Scalar>::radix2(fft::twiddle<true, Scalar>(glsl::gl_SubgroupInvocationID(), subgroupSize), lo, hi);
         divides_assign< complex_t<Scalar> > divAss;
         divAss(lo, doubleSubgroupSize);
         divAss(hi, doubleSubgroupSize);
@@ -105,6 +103,5 @@ struct FFT<true, Scalar, device_capabilities>
 }
 }
 }
-}
 
 #endif
diff --git a/include/nbl/builtin/hlsl/workgroup/fft.hlsl b/include/nbl/builtin/hlsl/workgroup/fft.hlsl
@@ -4,7 +4,6 @@
 #include "nbl/builtin/hlsl/subgroup/fft.hlsl"
 #include "nbl/builtin/hlsl/workgroup/basic.hlsl"
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
-#include "nbl/builtin/hlsl/memory_accessor.hlsl"
 #include "nbl/builtin/hlsl/workgroup/shuffle.hlsl"
 
 namespace nbl 
@@ -18,43 +17,41 @@ namespace fft
 // ---------------------------------- Utils -----------------------------------------------
 
 template<typename SharedMemoryAccessor, typename Scalar>
-void exchangeValues(NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi, uint32_t threadID, uint32_t stride, NBL_REF_ARG(MemoryAdaptor<SharedMemoryAccessor>) sharedmemAdaptor)
+void exchangeValues(NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi, uint32_t threadID, uint32_t stride, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
 {
     const bool topHalf = bool(threadID & stride);
+    // The ternary operator does not accept struct operands, so select into an auxiliary vector first and build the complex value from it
     vector <Scalar, 2> toExchange = topHalf ? vector <Scalar, 2>(lo.real(), lo.imag()) : vector <Scalar, 2>(hi.real(), hi.imag());
-    shuffleXor<SharedMemoryAccessor, vector <Scalar, 2> >::__call(toExchange, stride, sharedmemAdaptor);
+    complex_t<Scalar> toExchangeComplex = {toExchange.x, toExchange.y};
+    shuffleXor<SharedMemoryAccessor, complex_t<Scalar> >::__call(toExchangeComplex, stride, sharedmemAccessor);
     if (topHalf)
-    {
-        lo.real(toExchange.x);
-        lo.imag(toExchange.y);
-    }
+        lo = toExchangeComplex;
     else
-    {
-        hi.real(toExchange.x);
-        hi.imag(toExchange.y);
-    }   
+        hi = toExchangeComplex;
 }
 
+} //namespace fft
+
 // ----------------------------------- End Utils -----------------------------------------------
 
 template<uint16_t ElementsPerInvocation, bool Inverse, typename Scalar, class device_capabilities=void>
 struct FFT;
 
 // For the FFT methods below, we assume:
 //      - Accessor is a global memory accessor to an array fitting 2 * _NBL_HLSL_WORKGROUP_SIZE_ elements of type complex_t<Scalar>, used to get inputs / set outputs of the FFT,
-//        that is, one "lo" and one "hi" complex numbers per thread, essentially 4 Scalars per thread. The data layout is assumed to be a whole array of real parts 
-//        followed by a whole array of imaginary parts. So it would be something like
-//        [x_0, x_1, ..., x_{2 * _NBL_HLSL_WORKGROUP_SIZE_}, y_0, y_1, ..., y_{2 * _NBL_HLSL_WORKGROUP_SIZE_}] 
-//      - SharedMemoryAccessor accesses a shared memory array that can fit _NBL_HLSL_WORKGROUP_SIZE_ elements of type complex_t<Scalar>, so 2 * _NBL_HLSL_WORKGROUP_SIZE_ Scalars 
+//        that is, one "lo" and one "hi" complex number per thread, i.e. 4 Scalars per thread.
+//        There are no assumptions on the data layout: we just require the accessor to provide get and set methods for complex_t<Scalar>.
+//      - SharedMemoryAccessor accesses a shared memory array that can fit _NBL_HLSL_WORKGROUP_SIZE_ elements of type complex_t<Scalar>, with get and set 
+//        methods for complex_t<Scalar>. The accessor benefits from coalesced accesses.
 
 // 2 items per invocation forward specialization
 template<typename Scalar, class device_capabilities>
 struct FFT<2,false, Scalar, device_capabilities>
 {
     template<typename SharedMemoryAccessor>
-    static void FFT_loop(uint32_t stride, NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi, uint32_t threadID, NBL_REF_ARG(MemoryAdaptor<SharedMemoryAccessor>) sharedmemAdaptor)
+    static void FFT_loop(uint32_t stride, NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi, uint32_t threadID, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
     {
-        exchangeValues<SharedMemoryAccessor, Scalar>(lo, hi, threadID, stride, sharedmemAdaptor);
+        fft::exchangeValues<SharedMemoryAccessor, Scalar>(lo, hi, threadID, stride, sharedmemAccessor);
         
         // Get twiddle with k = threadID mod stride, halfN = stride
         hlsl::fft::DIF<Scalar>::radix2(hlsl::fft::twiddle<false, Scalar>(threadID & (stride - 1), stride), lo, hi);    
@@ -64,25 +61,14 @@ struct FFT<2,false, Scalar, device_capabilities>
     template<typename Accessor, typename SharedMemoryAccessor>
     static void __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
     {
-        // Set up the MemAdaptors
-        MemoryAdaptor<Accessor, 1> memAdaptor;
-        memAdaptor.accessor = accessor;
-        MemoryAdaptor<SharedMemoryAccessor> sharedmemAdaptor;
-        sharedmemAdaptor.accessor = sharedmemAccessor;
-
         // Compute the indices only once
         const uint32_t threadID = uint32_t(SubgroupContiguousIndex());
 		const uint32_t loIx = threadID;
 		const uint32_t hiIx = loIx + _NBL_HLSL_WORKGROUP_SIZE_;
 
         // Read lo, hi values from global memory
-        vector <Scalar, 2> loVec;
-        vector <Scalar, 2> hiVec;
-        // TODO: if we get rid of the Memory Adaptor on the accessor and require comples getters and setters, then no `2*`
-        memAdaptor.get(2 * loIx , loVec);
-        memAdaptor.get(2 * hiIx, hiVec);
-        complex_t<Scalar> lo = {loVec.x, loVec.y};  
-        complex_t<Scalar> hi = {hiVec.x, hiVec.y};
+        complex_t<Scalar> lo = accessor.get(loIx);
+        complex_t<Scalar> hi = accessor.get(hiIx);
 
         // If for some reason you're running a small FFT, skip all the bigger-than-subgroup steps
         if (_NBL_HLSL_WORKGROUP_SIZE_ > glsl::gl_SubgroupSize())
@@ -93,27 +79,20 @@ struct FFT<2,false, Scalar, device_capabilities>
             // Run bigger steps until Subgroup-sized
             for (uint32_t stride = _NBL_HLSL_WORKGROUP_SIZE_ >> 1; stride > glsl::gl_SubgroupSize(); stride >>= 1)
             {   
-                FFT_loop<SharedMemoryAccessor>(stride, lo, hi, threadID, sharedmemAdaptor);
-                sharedmemAdaptor.workgroupExecutionAndMemoryBarrier(); 
+                FFT_loop<SharedMemoryAccessor>(stride, lo, hi, threadID, sharedmemAccessor);
+                sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); 
             }
 
             // special last workgroup-shuffle     
-            exchangeValues<SharedMemoryAccessor, Scalar>(lo, hi, threadID, glsl::gl_SubgroupSize(), sharedmemAdaptor);  
+            fft::exchangeValues<SharedMemoryAccessor, Scalar>(lo, hi, threadID, glsl::gl_SubgroupSize(), sharedmemAccessor);  
         }
 
         // Subgroup-sized FFT
-        subgroup::fft::FFT<false, Scalar, device_capabilities>::__call(lo, hi);
+        subgroup::FFT<false, Scalar, device_capabilities>::__call(lo, hi);
 
         // Put values back in global mem
-        loVec = vector <Scalar, 2>(lo.real(), lo.imag());
-        hiVec = vector <Scalar, 2>(hi.real(), hi.imag());
-
-        memAdaptor.set(2 * loIx, loVec);
-        memAdaptor.set(2 * hiIx, hiVec);
-
-        // Update state for accessors
-        accessor = memAdaptor.accessor;
-        sharedmemAccessor = sharedmemAdaptor.accessor;
+        accessor.set(loIx, lo);
+        accessor.set(hiIx, hi);
     }
 };
 
@@ -124,53 +103,42 @@ template<typename Scalar, class device_capabilities>
 struct FFT<2,true, Scalar, device_capabilities>
 {
     template<typename SharedMemoryAccessor>
-    static void FFT_loop(uint32_t stride, NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi, uint32_t threadID, NBL_REF_ARG(MemoryAdaptor<SharedMemoryAccessor>) sharedmemAdaptor)
+    static void FFT_loop(uint32_t stride, NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi, uint32_t threadID, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
     {
         // Get twiddle with k = threadID mod stride, halfN = stride
         hlsl::fft::DIT<Scalar>::radix2(hlsl::fft::twiddle<true, Scalar>(threadID & (stride - 1), stride), lo, hi);     
     
-        exchangeValues<SharedMemoryAccessor, Scalar>(lo, hi, threadID, stride, sharedmemAdaptor);
+        fft::exchangeValues<SharedMemoryAccessor, Scalar>(lo, hi, threadID, stride, sharedmemAccessor);
     }
 
 
     template<typename Accessor, typename SharedMemoryAccessor>
     static void __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
     {
-        // Set up the MemAdaptors
-        MemoryAdaptor<Accessor, 1> memAdaptor;
-        memAdaptor.accessor = accessor;
-        MemoryAdaptor<SharedMemoryAccessor> sharedmemAdaptor;
-        sharedmemAdaptor.accessor = sharedmemAccessor;
-
         // Compute the indices only once
         const uint32_t threadID = uint32_t(SubgroupContiguousIndex());
         const uint32_t loIx = (glsl::gl_SubgroupID()<<(glsl::gl_SubgroupSizeLog2()+1))+glsl::gl_SubgroupInvocationID();
 		const uint32_t hiIx = loIx+glsl::gl_SubgroupSize();
 
         // Read lo, hi values from global memory
-        vector <Scalar, 2> loVec;
-        vector <Scalar, 2> hiVec;
-        memAdaptor.get(2 * loIx , loVec);
-        memAdaptor.get(2 * hiIx, hiVec);
-        complex_t<Scalar> lo = {loVec.x, loVec.y};  
-        complex_t<Scalar> hi = {hiVec.x, hiVec.y}; 
+        complex_t<Scalar> lo = accessor.get(loIx);
+        complex_t<Scalar> hi = accessor.get(hiIx);
 
         // Run a subgroup-sized FFT, then continue with bigger steps
-        subgroup::fft::FFT<true, Scalar, device_capabilities>::__call(lo, hi);
+        subgroup::FFT<true, Scalar, device_capabilities>::__call(lo, hi);
         
         // If for some reason you're running a small FFT, skip all the bigger-than-subgroup steps
-
         if (_NBL_HLSL_WORKGROUP_SIZE_ > glsl::gl_SubgroupSize()) 
         { 
             // special first workgroup-shuffle
-            exchangeValues<SharedMemoryAccessor, Scalar>(lo, hi, threadID, glsl::gl_SubgroupSize(), sharedmemAdaptor);
+            fft::exchangeValues<SharedMemoryAccessor, Scalar>(lo, hi, threadID, glsl::gl_SubgroupSize(), sharedmemAccessor);
         
             // The bigger steps
             for (uint32_t stride = glsl::gl_SubgroupSize() << 1; stride < _NBL_HLSL_WORKGROUP_SIZE_; stride <<= 1)
             {   
                 // Order of waiting for shared mem writes is also reversed here, since the shuffle came earlier
-                sharedmemAdaptor.workgroupExecutionAndMemoryBarrier(); 
-                FFT_loop<SharedMemoryAccessor>(stride, lo, hi, threadID, sharedmemAdaptor);
+                sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); 
+                FFT_loop<SharedMemoryAccessor>(stride, lo, hi, threadID, sharedmemAccessor);
             }
 
             // special last iteration 
@@ -181,14 +149,8 @@ struct FFT<2,true, Scalar, device_capabilities>
         }   
         
         // Put values back in global mem
-        loVec = vector <Scalar, 2>(lo.real(), lo.imag());
-        hiVec = vector <Scalar, 2>(hi.real(), hi.imag());
-        memAdaptor.set(2 * loIx, loVec);
-        memAdaptor.set(2 * hiIx, hiVec);
-
-        // Update state for accessors
-        accessor = memAdaptor.accessor;
-        sharedmemAccessor = sharedmemAdaptor.accessor;
+        accessor.set(loIx, lo);
+        accessor.set(hiIx, hi);
     }
 };
 
@@ -221,6 +183,5 @@ struct FFT
 }
 }
 }
-}
 
 #endif
diff --git a/include/nbl/builtin/hlsl/workgroup/shuffle.hlsl b/include/nbl/builtin/hlsl/workgroup/shuffle.hlsl
@@ -4,10 +4,7 @@
 #include "nbl/builtin/hlsl/memory_accessor.hlsl"
 
 // TODO: Add other shuffles
-// TODO: Consider adding an enable_if or static assert that 1 <= N <= 4 and that Scalar is a proper scalar type
-// TODO: Consider adding version that doesn't take a precomputed threadID and instead calls workgroup::SubgroupContiguousIndex
 
-// Unlike subgroups we pass a precomputed threadID so we don't go around recomputing it every time
 // We assume the accessor in the adaptor is clean and unaliased when calling this function, but we don't enforce this after the shuffle
 
 namespace nbl 
@@ -17,41 +14,25 @@ namespace hlsl
 namespace workgroup
 {
 
-
 template<typename SharedMemoryAccessor, typename T>
 struct shuffleXor
 {
-    static void __call(NBL_REF_ARG(T) value, uint32_t mask, uint32_t threadID, NBL_REF_ARG(MemoryAdaptor<SharedMemoryAccessor>) sharedmemAdaptor)
+    static void __call(NBL_REF_ARG(T) value, uint32_t mask, uint32_t threadID, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
     {
-        sharedmemAdaptor.set(threadID, value);
+        sharedmemAccessor.set(threadID, value);
         
         // Wait until all writes are done before reading
-        sharedmemAdaptor.workgroupExecutionAndMemoryBarrier();
+        sharedmemAccessor.workgroupExecutionAndMemoryBarrier();
     
-        sharedmemAdaptor.get(threadID ^ mask, value);
+        value = sharedmemAccessor.get(threadID ^ mask);
     }
 
-    static void __call(NBL_REF_ARG(T) value, uint32_t mask, NBL_REF_ARG(MemoryAdaptor<SharedMemoryAccessor>) sharedmemAdaptor)
+    static void __call(NBL_REF_ARG(T) value, uint32_t mask, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
     {
-        __call(value, mask, uint32_t(SubgroupContiguousIndex()), sharedmemAdaptor);
+        __call(value, mask, uint32_t(SubgroupContiguousIndex()), sharedmemAccessor);
     }
 };
 
-/*
-
-template<typename SharedMemoryAccessor, typename T>
-void shuffleXor(NBL_REF_ARG(T) value, uint32_t mask, uint32_t threadID, NBL_REF_ARG(MemoryAdaptor<SharedMemoryAccessor>) sharedmemAdaptor)
-{
-    sharedmemAdaptor.set(threadID, value);
-        
-    // Wait until all writes are done before reading
-    sharedmemAdaptor.workgroupExecutionAndMemoryBarrier();
-    
-    sharedmemAdaptor.get(threadID ^ mask, value);
-}
-
-*/
-
 }
 }
 }

Original file line number	Diff line number	Diff line change
`@@ -11,8 +11,6 @@ namespace hlsl`
`11`	`11`	`{`
`12`	`12`	`namespace subgroup`
`13`	`13`	`{`
`14`		`-namespace fft`
`15`		`-{`
`16`	`14`
`17`	`15`	`// -----------------------------------------------------------------------------------------------------------------------------------------------------------------`
`18`	`16`	`template<bool Inverse, typename Scalar, class device_capabilities=void>`
`@@ -42,15 +40,15 @@ struct FFT<false, Scalar, device_capabilities>`
`42`	`40`	`hi.imag(exchanged.y);`
`43`	`41`	`}`
`44`	`42`	`// Get twiddle with k = subgroupInvocation mod stride, halfN = stride`
`45`		`- hlsl::fft::DIF<Scalar>::radix2(hlsl::fft::twiddle<false, Scalar>(glsl::gl_SubgroupInvocationID() & (stride - 1), stride), lo, hi);`
	`43`	`+ fft::DIF<Scalar>::radix2(fft::twiddle<false, Scalar>(glsl::gl_SubgroupInvocationID() & (stride - 1), stride), lo, hi);`
`46`	`44`	`}`
`47`	`45`
`48`	`46`	`static void __call(NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi)`
`49`	`47`	`{`
`50`	`48`	`const uint32_t subgroupSize = glsl::gl_SubgroupSize(); //This is N/2`
`51`	`49`
`52`	`50`	`// special first iteration`
`53`		`- hlsl::fft::DIF<Scalar>::radix2(hlsl::fft::twiddle<false, Scalar>(glsl::gl_SubgroupInvocationID(), subgroupSize), lo, hi);`
	`51`	`+ fft::DIF<Scalar>::radix2(fft::twiddle<false, Scalar>(glsl::gl_SubgroupInvocationID(), subgroupSize), lo, hi);`
`54`	`52`
`55`	`53`	`// Decimation in Frequency`
`56`	`54`	`for (uint32_t stride = subgroupSize >> 1; stride > 0; stride >>= 1)`
`@@ -67,7 +65,7 @@ struct FFT<true, Scalar, device_capabilities>`
`67`	`65`	`static void FFT_loop(uint32_t stride, NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi)`
`68`	`66`	`{`
`69`	`67`	`// Get twiddle with k = subgroupInvocation mod stride, halfN = stride`
`70`		`- hlsl::fft::DIT<Scalar>::radix2(hlsl::fft::twiddle<true, Scalar>(glsl::gl_SubgroupInvocationID() & (stride - 1), stride), lo, hi);`
	`68`	`+ fft::DIT<Scalar>::radix2(fft::twiddle<true, Scalar>(glsl::gl_SubgroupInvocationID() & (stride - 1), stride), lo, hi);`
`71`	`69`
`72`	`70`	`const bool topHalf = bool(glsl::gl_SubgroupInvocationID() & stride);`
`73`	`71`	`const vector <Scalar, 2> toTrade = topHalf ? vector <Scalar, 2>(lo.real(), lo.imag()) : vector <Scalar, 2>(hi.real(), hi.imag());`
`@@ -94,7 +92,7 @@ struct FFT<true, Scalar, device_capabilities>`
`94`	`92`	`FFT_loop(stride, lo, hi);`
`95`	`93`
`96`	`94`	`// special last iteration`
`97`		`- hlsl::fft::DIT<Scalar>::radix2(hlsl::fft::twiddle<true, Scalar>(glsl::gl_SubgroupInvocationID(), subgroupSize), lo, hi);`
	`95`	`+ fft::DIT<Scalar>::radix2(fft::twiddle<true, Scalar>(glsl::gl_SubgroupInvocationID(), subgroupSize), lo, hi);`
`98`	`96`	`divides_assign< complex_t<Scalar> > divAss;`
`99`	`97`	`divAss(lo, doubleSubgroupSize);`
`100`	`98`	`divAss(hi, doubleSubgroupSize);`
`@@ -105,6 +103,5 @@ struct FFT<true, Scalar, device_capabilities>`
`105`	`103`	`}`
`106`	`104`	`}`
`107`	`105`	`}`
`108`		`-}`
`109`	`106`
`110`	`107`	`#endif`