
Commit ee4474f

Changed the memory accessor pattern for the FFT to hide coalescing from the user. Fixed bit_cast and a bunch of warnings emitted when using half floats.
1 parent 3eb1826 commit ee4474f

File tree

5 files changed: +145 -43 lines

include/nbl/builtin/hlsl/bit.hlsl
include/nbl/builtin/hlsl/complex.hlsl
include/nbl/builtin/hlsl/subgroup/fft.hlsl
include/nbl/builtin/hlsl/workgroup/fft.hlsl
include/nbl/builtin/hlsl/workgroup/shuffle.hlsl


include/nbl/builtin/hlsl/bit.hlsl

Lines changed: 1 addition & 2 deletions
@@ -34,9 +34,8 @@ namespace hlsl
 {

 template<class T, class U>
-T bit_cast(U val)
+enable_if_t<sizeof(T) <= sizeof(U), T> bit_cast(U val)
 {
-    static_assert(sizeof(T) <= sizeof(U));
     return spirv::bitcast<T, U>(val);
 }
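A note on what the new signature buys: the size check moves from a static_assert inside the body into SFINAE on the return type, so an oversized destination type makes the overload disappear from overload resolution instead of erroring mid-instantiation. A minimal usage sketch (hypothetical values; the half-pair packing mirrors what workgroup/fft.hlsl does below):

// OK: both types are 4 bytes, so sizeof(T) <= sizeof(U) holds
vector <float16_t, 2> halves = vector <float16_t, 2>(float16_t(1.0), float16_t(-1.0));
uint32_t packed = bit_cast<uint32_t, vector <float16_t, 2> >(halves);

// No longer even a viable overload, since sizeof(float64_t) > sizeof(uint32_t):
// float64_t wrong = bit_cast<float64_t, uint32_t>(packed);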

include/nbl/builtin/hlsl/complex.hlsl

Lines changed: 8 additions & 0 deletions
@@ -191,6 +191,10 @@ const static complex_t< SCALAR > multiplies< complex_t< SCALAR > >::identity = {
 template<> \
 const static complex_t< SCALAR > divides< complex_t< SCALAR > >::identity = { promote< SCALAR , uint32_t>(1), promote< SCALAR , uint32_t>(0)};

+COMPLEX_ARITHMETIC_IDENTITIES(float16_t)
+COMPLEX_ARITHMETIC_IDENTITIES(float16_t2)
+COMPLEX_ARITHMETIC_IDENTITIES(float16_t3)
+COMPLEX_ARITHMETIC_IDENTITIES(float16_t4)
 COMPLEX_ARITHMETIC_IDENTITIES(float32_t)
 COMPLEX_ARITHMETIC_IDENTITIES(float32_t2)
 COMPLEX_ARITHMETIC_IDENTITIES(float32_t3)

@@ -287,6 +291,10 @@ COMPLEX_COMPOUND_ASSIGN_IDENTITY(minus, SCALAR) \
 COMPLEX_COMPOUND_ASSIGN_IDENTITY(multiplies, SCALAR) \
 COMPLEX_COMPOUND_ASSIGN_IDENTITY(divides, SCALAR)

+COMPLEX_COMPOUND_ASSIGN_IDENTITIES(float16_t)
+COMPLEX_COMPOUND_ASSIGN_IDENTITIES(float16_t2)
+COMPLEX_COMPOUND_ASSIGN_IDENTITIES(float16_t3)
+COMPLEX_COMPOUND_ASSIGN_IDENTITIES(float16_t4)
 COMPLEX_COMPOUND_ASSIGN_IDENTITIES(float32_t)
 COMPLEX_COMPOUND_ASSIGN_IDENTITIES(float32_t2)
 COMPLEX_COMPOUND_ASSIGN_IDENTITIES(float32_t3)
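These additions only instantiate the existing identity constants for half-precision complex types. Going by the macro body visible in the context lines above, the divides case for float16_t should expand to roughly the following (plus/minus/multiplies are analogous, and the float16_t2/3/4 variants substitute the vector types):

template<>
const static complex_t<float16_t> divides< complex_t<float16_t> >::identity = { promote<float16_t, uint32_t>(1), promote<float16_t, uint32_t>(0) };

That is, the identity 1 + 0i, with promote<> producing the right scalar or vector value for each instantiation.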

include/nbl/builtin/hlsl/subgroup/fft.hlsl

Lines changed: 2 additions & 2 deletions
@@ -94,8 +94,8 @@ struct FFT<true, Scalar, device_capabilities>
         // special last iteration
         fft::DIT<Scalar>::radix2(fft::twiddle<true, Scalar>(glsl::gl_SubgroupInvocationID(), subgroupSize), lo, hi);
         divides_assign< complex_t<Scalar> > divAss;
-        divAss(lo, doubleSubgroupSize);
-        divAss(hi, doubleSubgroupSize);
+        divAss(lo, Scalar(doubleSubgroupSize));
+        divAss(hi, Scalar(doubleSubgroupSize));
     }
 };
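The only change here is wrapping the divisor in Scalar(...): doubleSubgroupSize is an integer, and when Scalar is float16_t the implicit integer-to-half conversion at the call site is among the warnings the commit message mentions, so the explicit constructor makes the narrowing intentional. What each call amounts to, assuming the natural in-place semantics of divides_assign:

// lo.real(lo.real() / Scalar(doubleSubgroupSize));
// lo.imag(lo.imag() / Scalar(doubleSubgroupSize));
// i.e. the usual 1/N normalization of an inverse FFT, with N = 2 * subgroupSize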

include/nbl/builtin/hlsl/workgroup/fft.hlsl

Lines changed: 107 additions & 32 deletions
@@ -7,6 +7,10 @@
 #include "nbl/builtin/hlsl/workgroup/shuffle.hlsl"
 #include "nbl/builtin/hlsl/mpl.hlsl"
 #include "nbl/builtin/hlsl/memory_accessor.hlsl"
+#include "nbl/builtin/hlsl/bit.hlsl"
+
+// Caveats
+// - Sin and Cos in HLSL take 32-bit floats. Using this library with 64-bit floats works perfectly fine, but DXC will emit warnings

 namespace nbl
 {
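To make the caveat concrete, a hypothetical snippet (not part of the diff) showing why 64-bit scalars compile but warn: cos() is only declared for 32-bit floats, so the angle is converted implicitly and the math runs at fp32 precision.

// Hypothetical illustration of the caveat above
float64_t angle = 0.5;
float64_t c = cos(angle); // DXC warning: implicit float64_t -> float32_t conversion; computed at fp32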
@@ -18,20 +22,77 @@ namespace fft
 {

 // ---------------------------------- Utils -----------------------------------------------
+template<typename SharedMemoryAdaptor, typename Scalar>
+struct exchangeValues;

-template<typename SharedMemoryAccessor, typename Scalar>
-void exchangeValues(NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi, uint32_t threadID, uint32_t stride, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
+template<typename SharedMemoryAdaptor>
+struct exchangeValues<SharedMemoryAdaptor, float16_t>
 {
-    const bool topHalf = bool(threadID & stride);
-    // Ternary won't take structs so we use this aux variable
-    vector <Scalar, 2> toExchange = topHalf ? vector <Scalar, 2>(lo.real(), lo.imag()) : vector <Scalar, 2>(hi.real(), hi.imag());
-    complex_t<Scalar> toExchangeComplex = {toExchange.x, toExchange.y};
-    shuffleXor<SharedMemoryAccessor, complex_t<Scalar> >::__call(toExchangeComplex, stride, sharedmemAccessor);
-    if (topHalf)
-        lo = toExchangeComplex;
-    else
-        hi = toExchangeComplex;
-}
+    static void __call(NBL_REF_ARG(complex_t<float16_t>) lo, NBL_REF_ARG(complex_t<float16_t>) hi, uint32_t threadID, uint32_t stride, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor)
+    {
+        const bool topHalf = bool(threadID & stride);
+        // Ternary won't take structs so we use this aux variable
+        uint32_t toExchange = bit_cast<uint32_t, vector <float16_t, 2> >(topHalf ? vector <float16_t, 2>(lo.real(), lo.imag()) : vector <float16_t, 2>(hi.real(), hi.imag()));
+        shuffleXor<SharedMemoryAdaptor, uint32_t>::__call(toExchange, stride, sharedmemAdaptor);
+        vector <float16_t, 2> exchanged = bit_cast<vector <float16_t, 2>, uint32_t>(toExchange);
+        if (topHalf)
+        {
+            lo.real(exchanged.x);
+            lo.imag(exchanged.y);
+        }
+        else
+        {
+            hi.real(exchanged.x);
+            hi.imag(exchanged.y);
+        }
+    }
+};
+
+template<typename SharedMemoryAdaptor>
+struct exchangeValues<SharedMemoryAdaptor, float32_t>
+{
+    static void __call(NBL_REF_ARG(complex_t<float32_t>) lo, NBL_REF_ARG(complex_t<float32_t>) hi, uint32_t threadID, uint32_t stride, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor)
+    {
+        const bool topHalf = bool(threadID & stride);
+        // Ternary won't take structs so we use this aux variable
+        vector <uint32_t, 2> toExchange = bit_cast<vector <uint32_t, 2>, vector <float32_t, 2> >(topHalf ? vector <float32_t, 2>(lo.real(), lo.imag()) : vector <float32_t, 2>(hi.real(), hi.imag()));
+        shuffleXor<SharedMemoryAdaptor, vector <uint32_t, 2> >::__call(toExchange, stride, sharedmemAdaptor);
+        vector <float32_t, 2> exchanged = bit_cast<vector <float32_t, 2>, vector <uint32_t, 2> >(toExchange);
+        if (topHalf)
+        {
+            lo.real(exchanged.x);
+            lo.imag(exchanged.y);
+        }
+        else
+        {
+            hi.real(exchanged.x);
+            hi.imag(exchanged.y);
+        }
+    }
+};
+
+template<typename SharedMemoryAdaptor>
+struct exchangeValues<SharedMemoryAdaptor, float64_t>
+{
+    static void __call(NBL_REF_ARG(complex_t<float64_t>) lo, NBL_REF_ARG(complex_t<float64_t>) hi, uint32_t threadID, uint32_t stride, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor)
+    {
+        const bool topHalf = bool(threadID & stride);
+        // Ternary won't take structs so we use this aux variable
+        vector <uint32_t, 4> toExchange = bit_cast<vector <uint32_t, 4>, vector <float64_t, 2> >(topHalf ? vector <float64_t, 2>(lo.real(), lo.imag()) : vector <float64_t, 2>(hi.real(), hi.imag()));
+        shuffleXor<SharedMemoryAdaptor, vector <uint32_t, 4> >::__call(toExchange, stride, sharedmemAdaptor);
+        vector <float64_t, 2> exchanged = bit_cast<vector <float64_t, 2>, vector <uint32_t, 4> >(toExchange);
+        if (topHalf)
+        {
+            lo.real(exchanged.x);
+            lo.imag(exchanged.y);
+        }
+        else
+        {
+            hi.real(exchanged.x);
+            hi.imag(exchanged.y);
+        }
+    }
+};

 } //namespace fft
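Why three hand-written specializations instead of the old generic function: a complex pair of each width packs exactly into 32-bit words (one uint32_t for two halves, a uint2 for two floats, a uint4 for two doubles), so every workgroup shuffle now moves plain uint vectors through shared memory. That is what lets the MemoryAdaptor own the layout and hide coalescing from the user, as the commit message says; presumably the backing shared array can then be declared once in terms of uints regardless of Scalar. A hedged call-site sketch, with names matching the FFT bodies below:

// One butterfly exchange at a given stride. Each thread pairs with the
// thread at threadID ^ stride; threads whose stride bit is set trade away
// their lo, the others trade away their hi.
fft::exchangeValues<MemoryAdaptor<SharedMemoryAccessor>, float32_t>::__call(lo, hi, threadID, stride, sharedmemAdaptor);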

@@ -51,10 +112,10 @@ struct FFT;
 template<typename Scalar, class device_capabilities>
 struct FFT<2,false, Scalar, device_capabilities>
 {
-    template<typename SharedMemoryAccessor>
-    static void FFT_loop(uint32_t stride, NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi, uint32_t threadID, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
+    template<typename SharedMemoryAdaptor>
+    static void FFT_loop(uint32_t stride, NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi, uint32_t threadID, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor)
     {
-        fft::exchangeValues<SharedMemoryAccessor, Scalar>(lo, hi, threadID, stride, sharedmemAccessor);
+        fft::exchangeValues<SharedMemoryAdaptor, Scalar>::__call(lo, hi, threadID, stride, sharedmemAdaptor);

         // Get twiddle with k = threadID mod stride, halfN = stride
         hlsl::fft::DIF<Scalar>::radix2(hlsl::fft::twiddle<false, Scalar>(threadID & (stride - 1), stride), lo, hi);
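One detail in the unchanged context line worth spelling out: the AND is a cheap modulus, valid because stride is always a power of two in this loop.

// threadID & (stride - 1) == threadID % stride for power-of-two stride
// e.g. stride = 8, threadID = 13: 13 & 7 = 5 = 13 % 8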
@@ -77,18 +138,25 @@ struct FFT<2,false, Scalar, device_capabilities>
         // If for some reason you're running a small FFT, skip all the bigger-than-subgroup steps
         if (_NBL_HLSL_WORKGROUP_SIZE_ > glsl::gl_SubgroupSize())
         {
+            // Set up the memory adaptor
+            MemoryAdaptor<SharedMemoryAccessor> sharedmemAdaptor;
+            sharedmemAdaptor.accessor = sharedmemAccessor;
+
             // special first iteration
             hlsl::fft::DIF<Scalar>::radix2(hlsl::fft::twiddle<false, Scalar>(threadID, _NBL_HLSL_WORKGROUP_SIZE_), lo, hi);

             // Run bigger steps until Subgroup-sized
             for (uint32_t stride = _NBL_HLSL_WORKGROUP_SIZE_ >> 1; stride > glsl::gl_SubgroupSize(); stride >>= 1)
             {
-                FFT_loop<SharedMemoryAccessor>(stride, lo, hi, threadID, sharedmemAccessor);
-                sharedmemAccessor.workgroupExecutionAndMemoryBarrier();
+                FFT_loop< MemoryAdaptor<SharedMemoryAccessor> >(stride, lo, hi, threadID, sharedmemAdaptor);
+                sharedmemAdaptor.workgroupExecutionAndMemoryBarrier();
             }

             // special last workgroup-shuffle
-            fft::exchangeValues<SharedMemoryAccessor, Scalar>(lo, hi, threadID, glsl::gl_SubgroupSize(), sharedmemAccessor);
+            fft::exchangeValues<MemoryAdaptor<SharedMemoryAccessor>, Scalar>::__call(lo, hi, threadID, glsl::gl_SubgroupSize(), sharedmemAdaptor);
+
+            // Remember to update the accessor's state
+            sharedmemAccessor = sharedmemAdaptor.accessor;
         }

         // Subgroup-sized FFT
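This setup/copy-back pair is the heart of the new pattern: the FFT wraps the caller's plain SharedMemoryAccessor in a MemoryAdaptor internally, runs all shuffles through the adaptor, then writes the accessor back out. The copy-back is needed because the adaptor holds the accessor by value (HLSL structs cannot hold references), so any state the accessor carries would otherwise be lost when the adaptor goes out of scope. A hypothetical caller sketch, with accessor type names invented for illustration and the __call signature assumed to match the K > 2 specialization shown further down:

// Only plain accessors cross the API boundary; the adaptor never leaks out.
MyGlobalAccessor accessor;
MySharedMemAccessor sharedmem;
FFT<2, false, float32_t, device_capabilities>::__call(accessor, sharedmem);
// sharedmem now reflects whatever state the internal adaptor mutated.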
@@ -106,13 +174,13 @@ struct FFT<2,false, Scalar, device_capabilities>
 template<typename Scalar, class device_capabilities>
 struct FFT<2,true, Scalar, device_capabilities>
 {
-    template<typename SharedMemoryAccessor>
-    static void FFT_loop(uint32_t stride, NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi, uint32_t threadID, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
+    template<typename SharedMemoryAdaptor>
+    static void FFT_loop(uint32_t stride, NBL_REF_ARG(complex_t<Scalar>) lo, NBL_REF_ARG(complex_t<Scalar>) hi, uint32_t threadID, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor)
     {
         // Get twiddle with k = threadID mod stride, halfN = stride
         hlsl::fft::DIT<Scalar>::radix2(hlsl::fft::twiddle<true, Scalar>(threadID & (stride - 1), stride), lo, hi);

-        fft::exchangeValues<SharedMemoryAccessor, Scalar>(lo, hi, threadID, stride, sharedmemAccessor);
+        fft::exchangeValues<SharedMemoryAdaptor, Scalar>::__call(lo, hi, threadID, stride, sharedmemAdaptor);
     }
@@ -135,22 +203,29 @@ struct FFT<2,true, Scalar, device_capabilities>
         // If for some reason you're running a small FFT, skip all the bigger-than-subgroup steps
         if (_NBL_HLSL_WORKGROUP_SIZE_ > glsl::gl_SubgroupSize())
         {
+            // Set up the memory adaptor
+            MemoryAdaptor<SharedMemoryAccessor> sharedmemAdaptor;
+            sharedmemAdaptor.accessor = sharedmemAccessor;
+
             // special first workgroup-shuffle
-            fft::exchangeValues<SharedMemoryAccessor, Scalar>(lo, hi, threadID, glsl::gl_SubgroupSize(), sharedmemAccessor);
+            fft::exchangeValues<MemoryAdaptor<SharedMemoryAccessor>, Scalar>::__call(lo, hi, threadID, glsl::gl_SubgroupSize(), sharedmemAdaptor);

             // The bigger steps
             for (uint32_t stride = glsl::gl_SubgroupSize() << 1; stride < _NBL_HLSL_WORKGROUP_SIZE_; stride <<= 1)
             {
                 // Order of waiting for shared mem writes is also reversed here, since the shuffle came earlier
-                sharedmemAccessor.workgroupExecutionAndMemoryBarrier();
-                FFT_loop<SharedMemoryAccessor>(stride, lo, hi, threadID, sharedmemAccessor);
+                sharedmemAdaptor.workgroupExecutionAndMemoryBarrier();
+                FFT_loop< MemoryAdaptor<SharedMemoryAccessor> >(stride, lo, hi, threadID, sharedmemAdaptor);
             }

             // special last iteration
             hlsl::fft::DIT<Scalar>::radix2(hlsl::fft::twiddle<true, Scalar>(threadID, _NBL_HLSL_WORKGROUP_SIZE_), lo, hi);
             divides_assign< complex_t<Scalar> > divAss;
-            divAss(lo, _NBL_HLSL_WORKGROUP_SIZE_ / glsl::gl_SubgroupSize());
-            divAss(hi, _NBL_HLSL_WORKGROUP_SIZE_ / glsl::gl_SubgroupSize());
+            divAss(lo, Scalar(_NBL_HLSL_WORKGROUP_SIZE_ / glsl::gl_SubgroupSize()));
+            divAss(hi, Scalar(_NBL_HLSL_WORKGROUP_SIZE_ / glsl::gl_SubgroupSize()));
+
+            // Remember to update the accessor's state
+            sharedmemAccessor = sharedmemAdaptor.accessor;
         }

         // Put values back in global mem
@@ -166,10 +241,10 @@ struct FFT<K, false, Scalar, device_capabilities>
     template<typename Accessor, typename SharedMemoryAccessor>
     static enable_if_t< (mpl::is_pot_v<K> && K > 2), void > __call(NBL_REF_ARG(Accessor) accessor, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
     {
-        for (uint32_t stride = (K >> 1) * _NBL_HLSL_WORKGROUP_SIZE_; stride > _NBL_HLSL_WORKGROUP_SIZE_; stride >>= 1)
+        for (uint32_t stride = (K / 2) * _NBL_HLSL_WORKGROUP_SIZE_; stride > _NBL_HLSL_WORKGROUP_SIZE_; stride >>= 1)
         {
             //[unroll(K/2)]
-            for (uint32_t virtualThreadID = SubgroupContiguousIndex(); virtualThreadID < (K >> 1) * _NBL_HLSL_WORKGROUP_SIZE_; virtualThreadID += _NBL_HLSL_WORKGROUP_SIZE_)
+            for (uint32_t virtualThreadID = SubgroupContiguousIndex(); virtualThreadID < (K / 2) * _NBL_HLSL_WORKGROUP_SIZE_; virtualThreadID += _NBL_HLSL_WORKGROUP_SIZE_)
             {
                 const uint32_t loIx = ((virtualThreadID & (~(stride - 1))) << 1) | (virtualThreadID & (stride - 1));
                 const uint32_t hiIx = loIx | stride;
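The loIx/hiIx lines in the context are the standard butterfly indexing: split virtualThreadID into its high bits (shifted up by one) and its low stride-sized bits, and the partner element then sits exactly stride away. A worked example:

// stride = 4, virtualThreadID = 5:
// loIx = ((5 & ~3) << 1) | (5 & 3) = (4 << 1) | 1 = 9
// hiIx = 9 | 4 = 13
// so this virtual thread owns elements 9 and 13, exactly stride apart.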
@@ -223,7 +298,7 @@ struct FFT<K, true, Scalar, device_capabilities>
     {
         accessor.memoryBarrier(); // no execution barrier just making sure writes propagate to accessor
         //[unroll(K/2)]
-        for (uint32_t virtualThreadID = SubgroupContiguousIndex(); virtualThreadID < (K >> 1) * _NBL_HLSL_WORKGROUP_SIZE_; virtualThreadID += _NBL_HLSL_WORKGROUP_SIZE_)
+        for (uint32_t virtualThreadID = SubgroupContiguousIndex(); virtualThreadID < (K / 2) * _NBL_HLSL_WORKGROUP_SIZE_; virtualThreadID += _NBL_HLSL_WORKGROUP_SIZE_)
         {
             const uint32_t loIx = ((virtualThreadID & (~(stride - 1))) << 1) | (virtualThreadID & (stride - 1));
             const uint32_t hiIx = loIx | stride;
@@ -235,11 +310,11 @@ struct FFT<K, true, Scalar, device_capabilities>
             hlsl::fft::DIT<Scalar>::radix2(hlsl::fft::twiddle<true,Scalar>(virtualThreadID & (stride - 1), stride), lo,hi);

             // Divide by special factor at the end
-            if ( (K >> 1) * _NBL_HLSL_WORKGROUP_SIZE_ == stride)
+            if ( (K / 2) * _NBL_HLSL_WORKGROUP_SIZE_ == stride)
             {
                 divides_assign< complex_t<Scalar> > divAss;
-                divAss(lo, K >> 1);
-                divAss(hi, K >> 1);
+                divAss(lo, K / 2);
+                divAss(hi, K / 2);
             }

             accessor.set(loIx, lo);

include/nbl/builtin/hlsl/workgroup/shuffle.hlsl

Lines changed: 27 additions & 7 deletions
@@ -14,22 +14,42 @@ namespace hlsl
 namespace workgroup
 {

-template<typename SharedMemoryAccessor, typename T>
+template<typename SharedMemoryAdaptor, typename T>
 struct shuffleXor
 {
-    static void __call(NBL_REF_ARG(T) value, uint32_t mask, uint32_t threadID, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
+    static void __call(NBL_REF_ARG(T) value, uint32_t mask, uint32_t threadID, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor)
     {
-        sharedmemAccessor.set(threadID, value);
+        sharedmemAdaptor.template set<T>(threadID, value);

         // Wait until all writes are done before reading
-        sharedmemAccessor.workgroupExecutionAndMemoryBarrier();
+        sharedmemAdaptor.workgroupExecutionAndMemoryBarrier();

-        sharedmemAccessor.get(threadID ^ mask, value);
+        sharedmemAdaptor.template get<T>(threadID ^ mask, value);
     }

-    static void __call(NBL_REF_ARG(T) value, uint32_t mask, NBL_REF_ARG(SharedMemoryAccessor) sharedmemAccessor)
+    static void __call(NBL_REF_ARG(T) value, uint32_t mask, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor)
     {
-        __call(value, mask, uint32_t(SubgroupContiguousIndex()), sharedmemAccessor);
+        __call(value, mask, uint32_t(SubgroupContiguousIndex()), sharedmemAdaptor);
+    }
+};
+
+// Vector specialization
+template<typename SharedMemoryAdaptor, typename T, uint32_t N>
+struct shuffleXor<SharedMemoryAdaptor, vector <T, N> >
+{
+    static enable_if_t<N <= 4> __call(NBL_REF_ARG(vector <T, N>) value, uint32_t mask, uint32_t threadID, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor)
+    {
+        sharedmemAdaptor.template set<T>(threadID, value);
+
+        // Wait until all writes are done before reading
+        sharedmemAdaptor.workgroupExecutionAndMemoryBarrier();
+
+        sharedmemAdaptor.template get<T>(threadID ^ mask, value);
+    }
+
+    static enable_if_t<N <= 4> __call(NBL_REF_ARG(vector <T, N>) value, uint32_t mask, NBL_REF_ARG(SharedMemoryAdaptor) sharedmemAdaptor)
+    {
+        __call(value, mask, uint32_t(SubgroupContiguousIndex()), sharedmemAdaptor);
     }
 };
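The vector specialization exists because the adaptor's set/get are templated on the element type: the generic overload hands the whole T to a single call, while the vector form passes the scalar T so the adaptor can place the N lanes itself, which is presumably where the coalescing mentioned in the commit message gets handled. Usage mirrors the fft::exchangeValues specializations above, e.g. for a float32_t complex packed as a uint pair (names taken from that code):

shuffleXor<SharedMemoryAdaptor, vector <uint32_t, 2> >::__call(toExchange, stride, sharedmemAdaptor);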
