Commit e98ef60

2-element per invocation workgroup FFT working fine
1 parent 188e63e commit e98ef60

File tree

4 files changed (+84, -83 lines)

3rdparty/dxc/dxc

Submodule dxc updated 257 files

include/nbl/builtin/hlsl/subgroup/fft.hlsl

Lines changed: 7 additions & 4 deletions
@@ -11,6 +11,8 @@ namespace hlsl
 {
 namespace subgroup
 {
+namespace fft
+{

 // -----------------------------------------------------------------------------------------------------------------------------------------------------------------
 template<bool Inverse, typename Scalar, class device_capabilities=void>
@@ -40,15 +42,15 @@ struct FFT<false, Scalar, device_capabilities>
 hi.imag(exchanged.y);
 }
 // Get twiddle with k = subgroupInvocation mod stride, halfN = stride
-fft::DIF<Scalar>::radix2(fft::twiddle<false, Scalar>(glsl::gl_SubgroupInvocationID() & (stride - 1), stride), lo, hi);
+hlsl::fft::DIF<Scalar>::radix2(hlsl::fft::twiddle<false, Scalar>(glsl::gl_SubgroupInvocationID() & (stride - 1), stride), lo, hi);
 }

 static void __call(NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi)
 {
 const uint32_t subgroupSize = glsl::gl_SubgroupSize(); //This is N/2

 // special first iteration
-fft::DIF<Scalar>::radix2(fft::twiddle<false, Scalar>(glsl::gl_SubgroupInvocationID(), subgroupSize), lo, hi);
+hlsl::fft::DIF<Scalar>::radix2(hlsl::fft::twiddle<false, Scalar>(glsl::gl_SubgroupInvocationID(), subgroupSize), lo, hi);

 // Decimation in Frequency
 for (uint32_t stride = subgroupSize >> 1; stride > 0; stride >>= 1)
@@ -65,7 +67,7 @@ struct FFT<true, Scalar, device_capabilities>
 static void FFT_loop(uint32_t stride, NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi)
 {
 // Get twiddle with k = subgroupInvocation mod stride, halfN = stride
-fft::DIT<Scalar>::radix2(fft::twiddle<true, Scalar>(glsl::gl_SubgroupInvocationID() & (stride - 1), stride), lo, hi);
+hlsl::fft::DIT<Scalar>::radix2(hlsl::fft::twiddle<true, Scalar>(glsl::gl_SubgroupInvocationID() & (stride - 1), stride), lo, hi);

 const bool topHalf = bool(glsl::gl_SubgroupInvocationID() & stride);
 const vector <Scalar, 2> toTrade = topHalf ? vector <Scalar, 2>(lo.real(), lo.imag()) : vector <Scalar, 2>(hi.real(), hi.imag());
@@ -92,7 +94,7 @@ struct FFT<true, Scalar, device_capabilities>
 FFT_loop(stride, lo, hi);

 // special last iteration
-fft::DIT<Scalar>::radix2(fft::twiddle<true, Scalar>(glsl::gl_SubgroupInvocationID(), subgroupSize), lo, hi);
+hlsl::fft::DIT<Scalar>::radix2(hlsl::fft::twiddle<true, Scalar>(glsl::gl_SubgroupInvocationID(), subgroupSize), lo, hi);
 divides_assign< complex_t<Scalar> > divAss;
 divAss(lo, doubleSubgroupSize);
 divAss(hi, doubleSubgroupSize);
@@ -103,5 +105,6 @@ struct FFT<true, Scalar, device_capabilities>
 }
 }
 }
+}

 #endif
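The forward and inverse structs now live in the new subgroup::fft namespace, while the shared radix-2 butterflies and twiddle factors stay in hlsl::fft, which is why the bodies above spell out hlsl::fft::DIF, hlsl::fft::DIT and hlsl::fft::twiddle in full. A minimal call-site sketch under that layout follows; the scalar type, wrapper function and include style are illustrative assumptions, not part of this commit:

// Hypothetical call site (illustration only): forward subgroup FFT on the lo/hi pair each invocation owns
#include "nbl/builtin/hlsl/subgroup/fft.hlsl"

void forwardSubgroupFFT(inout nbl::hlsl::complex_t<float> lo, inout nbl::hlsl::complex_t<float> hi)
{
    // Inverse = false selects the DIF specialization; device_capabilities defaults to void
    nbl::hlsl::subgroup::fft::FFT<false, float>::__call(lo, hi);
}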

include/nbl/builtin/hlsl/workgroup/fft.hlsl

Lines changed: 52 additions & 76 deletions
@@ -11,28 +11,18 @@ namespace nbl
 {
 namespace hlsl
 {
-
-namespace glsl
-{
-
-// Define this method from glsl_compat/core.hlsl
-uint32_t3 gl_WorkGroupSize() {
-return uint32_t3(_NBL_HLSL_WORKGROUP_SIZE_, 1, 1);
-}
-
-} //namespace glsl
-
 namespace workgroup
 {
-
+namespace fft
+{
 // ---------------------------------- Utils -----------------------------------------------

 template<typename SharedMemoryAccessor, typename Scalar>
 void exchangeValues(NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi, uint32_t threadID, uint32_t stride, NBL_REF_ARG(MemoryAdaptor<SharedMemoryAccessor>) sharedmemAdaptor)
 {
 const bool topHalf = bool(threadID & stride);
 vector <Scalar, 2> toExchange = topHalf ? vector <Scalar, 2>(lo.real(), lo.imag()) : vector <Scalar, 2>(hi.real(), hi.imag());
-shuffleXor<SharedMemoryAccessor, Scalar, 2>(toExchange, stride, threadID, sharedmemAdaptor);
+shuffleXor<SharedMemoryAccessor, vector <Scalar, 2> >::__call(toExchange, stride, sharedmemAdaptor);
 if (topHalf)
 {
 lo.real(toExchange.x);
@@ -67,60 +57,59 @@ struct FFT<2,false, Scalar, device_capabilities>
 exchangeValues<SharedMemoryAccessor, Scalar>(lo, hi, threadID, stride, sharedmemAdaptor);

 // Get twiddle with k = threadID mod stride, halfN = stride
-fft::DIF<Scalar>::radix2(fft::twiddle<false, Scalar>(threadID & (stride - 1), stride), lo, hi);
+hlsl::fft::DIF<Scalar>::radix2(hlsl::fft::twiddle<false, Scalar>(threadID & (stride - 1), stride), lo, hi);
 }


 template<typename Accessor, typename SharedMemoryAccessor>
 static void __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
 {
 // Set up the MemAdaptors
-MemoryAdaptor<Accessor, _NBL_HLSL_WORKGROUP_SIZE_ << 1> memAdaptor;
+MemoryAdaptor<Accessor, 1> memAdaptor;
 memAdaptor.accessor = accessor;
 MemoryAdaptor<SharedMemoryAccessor> sharedmemAdaptor;
 sharedmemAdaptor.accessor = sharedmemAccessor;

-// Compute the SubgroupContiguousIndex only once
+// Compute the indices only once
 const uint32_t threadID = uint32_t(SubgroupContiguousIndex());
+const uint32_t loIx = threadID;
+const uint32_t hiIx = loIx + _NBL_HLSL_WORKGROUP_SIZE_;

 // Read lo, hi values from global memory
 vector <Scalar, 2> loVec;
 vector <Scalar, 2> hiVec;
-memAdaptor.get(threadID, loVec);
-memAdaptor.get(threadID + _NBL_HLSL_WORKGROUP_SIZE_, hiVec);
+// TODO: if we get rid of the Memory Adaptor on the accessor and require complex getters and setters, then no `2*`
+memAdaptor.get(2 * loIx, loVec);
+memAdaptor.get(2 * hiIx, hiVec);
 complex_t<Scalar> lo = {loVec.x, loVec.y};
 complex_t<Scalar> hi = {hiVec.x, hiVec.y};

-// special first iteration - only if workgroupsize > subgroupsize
+// If for some reason you're running a small FFT, skip all the bigger-than-subgroup steps
 if (_NBL_HLSL_WORKGROUP_SIZE_ > glsl::gl_SubgroupSize())
-fft::DIF<Scalar>::radix2(fft::twiddle<false, Scalar>(threadID, _NBL_HLSL_WORKGROUP_SIZE_), lo, hi);
+{
+// special first iteration
+hlsl::fft::DIF<Scalar>::radix2(hlsl::fft::twiddle<false, Scalar>(threadID, _NBL_HLSL_WORKGROUP_SIZE_), lo, hi);

-// Run bigger steps until Subgroup-sized
-for (uint32_t stride = _NBL_HLSL_WORKGROUP_SIZE_ >> 1; stride > glsl::gl_SubgroupSize(); stride >>= 1)
-{
-// If at least one loop was executed, we must wait for all threads to get their values before we write to shared mem again
-if ( !(stride & (_NBL_HLSL_WORKGROUP_SIZE_ >> 1)) )
+// Run bigger steps until Subgroup-sized
+for (uint32_t stride = _NBL_HLSL_WORKGROUP_SIZE_ >> 1; stride > glsl::gl_SubgroupSize(); stride >>= 1)
+{
+FFT_loop<SharedMemoryAccessor>(stride, lo, hi, threadID, sharedmemAdaptor);
 sharedmemAdaptor.workgroupExecutionAndMemoryBarrier();
-FFT_loop<SharedMemoryAccessor>(stride, lo, hi, threadID, sharedmemAdaptor);
-}
+}

-// special last workgroup-shuffle - only if workgroupsize > subgroupsize
-if (_NBL_HLSL_WORKGROUP_SIZE_ > glsl::gl_SubgroupSize())
-{
-// Wait for all threads to be done with reads in the last loop before writing to shared mem
-sharedmemAdaptor.workgroupExecutionAndMemoryBarrier();
-exchangeValues<SharedMemoryAccessor, Scalar>(lo, hi, threadID, glsl::gl_SubgroupSize(), sharedmemAdaptor);
-}
+// special last workgroup-shuffle
+exchangeValues<SharedMemoryAccessor, Scalar>(lo, hi, threadID, glsl::gl_SubgroupSize(), sharedmemAdaptor);
+}

 // Subgroup-sized FFT
-subgroup::FFT<false, Scalar, device_capabilities>::__call(lo, hi);
+subgroup::fft::FFT<false, Scalar, device_capabilities>::__call(lo, hi);

 // Put values back in global mem
 loVec = vector <Scalar, 2>(lo.real(), lo.imag());
 hiVec = vector <Scalar, 2>(hi.real(), hi.imag());

-memAdaptor.set(threadID, loVec);
-memAdaptor.set(threadID + _NBL_HLSL_WORKGROUP_SIZE_, hiVec);
+memAdaptor.set(2 * loIx, loVec);
+memAdaptor.set(2 * hiIx, hiVec);

 // Update state for accessors
 accessor = memAdaptor.accessor;
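Spelling out the forward path's new addressing: each invocation owns the complex elements loIx = threadID and hiIx = threadID + _NBL_HLSL_WORKGROUP_SIZE_, and because the adaptor is now the scalar-granular MemoryAdaptor<Accessor, 1> while every complex element presumably occupies a (real, imag) scalar pair, loads and stores go through offsets 2 * loIx and 2 * hiIx, exactly as the TODO above notes. A tiny sketch of that arithmetic; the interleaved layout and a user-defined _NBL_HLSL_WORKGROUP_SIZE_ are assumptions:

// Illustration only: scalar offsets the forward path feeds to memAdaptor.get()/set(),
// assuming element k is stored as the scalar pair { [2k] = real, [2k + 1] = imag }
void forwardScalarOffsets(uint32_t threadID, out uint32_t loScalar, out uint32_t hiScalar)
{
    const uint32_t loIx = threadID;                              // first element owned by this invocation
    const uint32_t hiIx = threadID + _NBL_HLSL_WORKGROUP_SIZE_;  // second element, half the array away
    loScalar = 2 * loIx;
    hiScalar = 2 * hiIx;
}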
@@ -138,7 +127,7 @@ struct FFT<2,true, Scalar, device_capabilities>
 static void FFT_loop(uint32_t stride, NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi, uint32_t threadID, NBL_REF_ARG(MemoryAdaptor<SharedMemoryAccessor>) sharedmemAdaptor)
 {
 // Get twiddle with k = threadID mod stride, halfN = stride
-fft::DIF<Scalar>::radix2(fft::twiddle<true, Scalar>(threadID & (stride - 1), stride), lo, hi);
+hlsl::fft::DIT<Scalar>::radix2(hlsl::fft::twiddle<true, Scalar>(threadID & (stride - 1), stride), lo, hi);

 exchangeValues<SharedMemoryAccessor, Scalar>(lo, hi, threadID, stride, sharedmemAdaptor);
 }
@@ -148,53 +137,54 @@ struct FFT<2,true, Scalar, device_capabilities>
 static void __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
 {
 // Set up the MemAdaptors
-MemoryAdaptor<Accessor, _NBL_HLSL_WORKGROUP_SIZE_ << 1> memAdaptor;
+MemoryAdaptor<Accessor, 1> memAdaptor;
 memAdaptor.accessor = accessor;
 MemoryAdaptor<SharedMemoryAccessor> sharedmemAdaptor;
 sharedmemAdaptor.accessor = sharedmemAccessor;

-// Compute the SubgroupContiguousIndex only once
+// Compute the indices only once
 const uint32_t threadID = uint32_t(SubgroupContiguousIndex());
+const uint32_t loIx = (glsl::gl_SubgroupID()<<(glsl::gl_SubgroupSizeLog2()+1))+glsl::gl_SubgroupInvocationID();
+const uint32_t hiIx = loIx+glsl::gl_SubgroupSize();

 // Read lo, hi values from global memory
 vector <Scalar, 2> loVec;
 vector <Scalar, 2> hiVec;
-memAdaptor.get(threadID, loVec);
-memAdaptor.get(threadID + _NBL_HLSL_WORKGROUP_SIZE_, hiVec);
+memAdaptor.get(2 * loIx, loVec);
+memAdaptor.get(2 * hiIx, hiVec);
 complex_t<Scalar> lo = {loVec.x, loVec.y};
 complex_t<Scalar> hi = {hiVec.x, hiVec.y};

 // Run a subgroup-sized FFT, then continue with bigger steps
-subgroup::FFT<true, Scalar, device_capabilities>::__call(lo, hi);
+subgroup::fft::FFT<true, Scalar, device_capabilities>::__call(lo, hi);

-// special first workgroup-shuffle - only if workgroupsize > subgroupsize
+// If for some reason you're running a small FFT, skip all the bigger-than-subgroup steps
+
 if (_NBL_HLSL_WORKGROUP_SIZE_ > glsl::gl_SubgroupSize())
 {
+// special first workgroup-shuffle
 exchangeValues<SharedMemoryAccessor, Scalar>(lo, hi, threadID, glsl::gl_SubgroupSize(), sharedmemAdaptor);
-}
-
-// The bigger steps
-for (uint32_t stride = glsl::gl_SubgroupSize() << 1; stride < _NBL_HLSL_WORKGROUP_SIZE_; stride <<= 1)
-{
-// If we enter this for loop, then the special first workgroup shuffle went through, so wait on that
-sharedmemAdaptor.workgroupExecutionAndMemoryBarrier();
-FFT_loop<SharedMemoryAccessor>(stride, lo, hi, threadID, sharedmemAdaptor);
-}
+
+// The bigger steps
+for (uint32_t stride = glsl::gl_SubgroupSize() << 1; stride < _NBL_HLSL_WORKGROUP_SIZE_; stride <<= 1)
+{
+// Order of waiting for shared mem writes is also reversed here, since the shuffle came earlier
+sharedmemAdaptor.workgroupExecutionAndMemoryBarrier();
+FFT_loop<SharedMemoryAccessor>(stride, lo, hi, threadID, sharedmemAdaptor);
+}

-// special last iteration - only if workgroupsize > subgroupsize
-if (_NBL_HLSL_WORKGROUP_SIZE_ > glsl::gl_SubgroupSize())
-{
-fft::DIT<Scalar>::radix2(fft::twiddle<true, Scalar>(threadID, _NBL_HLSL_WORKGROUP_SIZE_), lo, hi);
+// special last iteration
+hlsl::fft::DIT<Scalar>::radix2(hlsl::fft::twiddle<true, Scalar>(threadID, _NBL_HLSL_WORKGROUP_SIZE_), lo, hi);
 divides_assign< complex_t<Scalar> > divAss;
 divAss(lo, _NBL_HLSL_WORKGROUP_SIZE_ / glsl::gl_SubgroupSize());
-divAss(hi, _NBL_HLSL_WORKGROUP_SIZE_ / glsl::gl_SubgroupSize());
-}
+divAss(hi, _NBL_HLSL_WORKGROUP_SIZE_ / glsl::gl_SubgroupSize());
+}

 // Put values back in global mem
 loVec = vector <Scalar, 2>(lo.real(), lo.imag());
 hiVec = vector <Scalar, 2>(hi.real(), hi.imag());
-memAdaptor.set(threadID, loVec);
-memAdaptor.set(threadID + _NBL_HLSL_WORKGROUP_SIZE_, hiVec);
+memAdaptor.set(2 * loIx, loVec);
+memAdaptor.set(2 * hiIx, hiVec);

 // Update state for accessors
 accessor = memAdaptor.accessor;
@@ -203,21 +193,6 @@ struct FFT<2,true, Scalar, device_capabilities>
 };


-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
 // ---------------------------- Below pending --------------------------------------------------

 /*
@@ -246,5 +221,6 @@ struct FFT
 }
 }
 }
+}

 #endif
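Note that the inverse path derives its indices from subgroup built-ins rather than from threadID: loIx = (gl_SubgroupID() << (gl_SubgroupSizeLog2() + 1)) + gl_SubgroupInvocationID() and hiIx = loIx + gl_SubgroupSize(), so every subgroup addresses a contiguous block of 2 * SubgroupSize complex elements and each invocation's pair sits a subgroup-size apart inside it. A worked sketch of that arithmetic, with concrete values assumed purely for illustration:

// Illustration only: with gl_SubgroupSize() == 32 (so gl_SubgroupSizeLog2() == 5),
// gl_SubgroupID() == 1 and gl_SubgroupInvocationID() == 5:
//   loIx = (1 << (5 + 1)) + 5 = 69
//   hiIx = 69 + 32 = 101
// and the scalar offsets handed to the adaptor are 2 * loIx and 2 * hiIx, as in the forward path
uint32_t2 inverseElementIndices()
{
    const uint32_t loIx = (glsl::gl_SubgroupID() << (glsl::gl_SubgroupSizeLog2() + 1)) + glsl::gl_SubgroupInvocationID();
    const uint32_t hiIx = loIx + glsl::gl_SubgroupSize();
    return uint32_t2(loIx, hiIx);
}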

include/nbl/builtin/hlsl/workgroup/shuffle.hlsl

Lines changed: 24 additions & 2 deletions
@@ -18,8 +18,29 @@ namespace workgroup
 {


-template<typename SharedMemoryAccessor, typename Scalar, uint32_t N = 1>
-void shuffleXor(NBL_REF_ARG(vector <Scalar, N>) value, uint32_t mask, uint32_t threadID, NBL_REF_ARG(MemoryAdaptor<SharedMemoryAccessor>) sharedmemAdaptor)
+template<typename SharedMemoryAccessor, typename T>
+struct shuffleXor
+{
+static void __call(NBL_REF_ARG(T) value, uint32_t mask, uint32_t threadID, NBL_REF_ARG(MemoryAdaptor<SharedMemoryAccessor>) sharedmemAdaptor)
+{
+sharedmemAdaptor.set(threadID, value);
+
+// Wait until all writes are done before reading
+sharedmemAdaptor.workgroupExecutionAndMemoryBarrier();
+
+sharedmemAdaptor.get(threadID ^ mask, value);
+}
+
+static void __call(NBL_REF_ARG(T) value, uint32_t mask, NBL_REF_ARG(MemoryAdaptor<SharedMemoryAccessor>) sharedmemAdaptor)
+{
+__call(value, mask, uint32_t(SubgroupContiguousIndex()), sharedmemAdaptor);
+}
+};
+
+/*
+
+template<typename SharedMemoryAccessor, typename T>
+void shuffleXor(NBL_REF_ARG(T) value, uint32_t mask, uint32_t threadID, NBL_REF_ARG(MemoryAdaptor<SharedMemoryAccessor>) sharedmemAdaptor)
 {
 sharedmemAdaptor.set(threadID, value);

@@ -29,6 +50,7 @@ void shuffleXor(NBL_REF_ARG(vector <Scalar, N>) value, uint32_t mask, uint32_t t
 sharedmemAdaptor.get(threadID ^ mask, value);
 }

+*/

 }
 }
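The free function becomes a struct so the exchanged type T is arbitrary (the FFT now trades a whole vector<Scalar, 2>) and so the thread index can default to SubgroupContiguousIndex(), which is the overload exchangeValues() in workgroup/fft.hlsl now calls. A minimal usage sketch, assuming a shared-memory accessor already wrapped in a MemoryAdaptor as the FFT does; the function name and payload type are illustrative only:

// Illustration only: trade a (real, imag) pair with the invocation whose workgroup index differs by `mask`
template<typename SharedMemoryAccessor>
void tradeWithPartner(inout vector<float, 2> payload, uint32_t mask, inout MemoryAdaptor<SharedMemoryAccessor> sharedmemAdaptor)
{
    // Overload that defaults the thread index to SubgroupContiguousIndex(), as exchangeValues() does
    shuffleXor<SharedMemoryAccessor, vector<float, 2> >::__call(payload, mask, sharedmemAdaptor);

    // An explicit index can also be passed when it was already computed:
    // shuffleXor<SharedMemoryAccessor, vector<float, 2> >::__call(payload, mask, uint32_t(SubgroupContiguousIndex()), sharedmemAdaptor);
}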
