plan for real DFTs

devshgraphicsprogramming · devshgraphicsprogramming · commit 48e514b7db4f · 2021-03-03T00:25:19.000+01:00
diff --git a/examples_tests/49.ComputeFFT/fft_convolve_ifft.comp b/examples_tests/49.ComputeFFT/fft_convolve_ifft.comp
@@ -1,10 +1,4 @@
 // WorkGroup Size
-
-
-#ifndef _NBL_GLSL_EXT_FFT_MAX_CHANNELS
-#define _NBL_GLSL_EXT_FFT_MAX_CHANNELS 4
-#endif
-
 #ifndef _NBL_GLSL_WORKGROUP_SIZE_
 #define _NBL_GLSL_WORKGROUP_SIZE_ 256
 #endif
diff --git a/examples_tests/49.ComputeFFT/last_fft.comp b/examples_tests/49.ComputeFFT/last_fft.comp
@@ -1,12 +1,3 @@
-#ifndef _NBL_GLSL_EXT_DEFAULT_COMPUTE_FFT_INCLUDED_
-#define _NBL_GLSL_EXT_DEFAULT_COMPUTE_FFT_INCLUDED_
-
-// WorkGroup Size
-
-#ifndef _NBL_GLSL_EXT_FFT_MAX_CHANNELS
-#define _NBL_GLSL_EXT_FFT_MAX_CHANNELS 4
-#endif
-
 #ifndef _NBL_GLSL_WORKGROUP_SIZE_
 #define _NBL_GLSL_WORKGROUP_SIZE_ 256
 #endif
@@ -83,6 +74,4 @@ void main()
 	{
 		nbl_glsl_ext_FFT(nbl_glsl_ext_FFT_Parameters_t_getIsInverse(), ch);
 	}
-}
-
-#endif
+}
diff --git a/examples_tests/49.ComputeFFT/main.cpp b/examples_tests/49.ComputeFFT/main.cpp
@@ -74,7 +74,6 @@ R"===(#version 430 core
 
 #define _NBL_GLSL_WORKGROUP_SIZE_ %u
 #define _NBL_GLSL_EXT_FFT_MAX_DIM_SIZE_ %u
-#define _NBL_GLSL_EXT_FFT_MAX_ITEMS_PER_THREAD %u
  
 #include "../fft_convolve_ifft.comp"
 
@@ -83,13 +82,11 @@ R"===(#version 430 core
 	const size_t extraSize = 32 + 32 + 32 + 32;
 	
 	constexpr uint32_t DEFAULT_WORK_GROUP_SIZE = 256u;
-	const uint32_t maxItemsPerThread = (maxPaddedDimensionSize - 1u) / (DEFAULT_WORK_GROUP_SIZE) + 1u;
 	auto shader = core::make_smart_refctd_ptr<ICPUBuffer>(strlen(sourceFmt)+extraSize+1u);
 	snprintf(
 		reinterpret_cast<char*>(shader->getPointer()),shader->getSize(), sourceFmt,
 		DEFAULT_WORK_GROUP_SIZE,
-		maxPaddedDimensionSize,
-		maxItemsPerThread
+		maxPaddedDimensionSize
 	);
 
 	auto cpuSpecializedShader = core::make_smart_refctd_ptr<ICPUSpecializedShader>(
@@ -196,7 +193,6 @@ R"===(#version 430 core
 
 #define _NBL_GLSL_WORKGROUP_SIZE_ %u
 #define _NBL_GLSL_EXT_FFT_MAX_DIM_SIZE_ %u
-#define _NBL_GLSL_EXT_FFT_MAX_ITEMS_PER_THREAD %u
 
 #include "../last_fft.comp"
 
@@ -205,13 +201,11 @@ R"===(#version 430 core
 	const size_t extraSize = 32 + 32 + 32 + 32;
 	
 	constexpr uint32_t DEFAULT_WORK_GROUP_SIZE = 256u;
-	const uint32_t maxItemsPerThread = (maxPaddedDimensionSize - 1u) / (DEFAULT_WORK_GROUP_SIZE) + 1u;
 	auto shader = core::make_smart_refctd_ptr<ICPUBuffer>(strlen(sourceFmt)+extraSize+1u);
 	snprintf(
 		reinterpret_cast<char*>(shader->getPointer()),shader->getSize(), sourceFmt,
 		DEFAULT_WORK_GROUP_SIZE,
-		maxPaddedDimensionSize,
-		maxItemsPerThread
+		maxPaddedDimensionSize
 	);
 
 	auto cpuSpecializedShader = core::make_smart_refctd_ptr<ICPUSpecializedShader>(
diff --git a/include/nbl/builtin/glsl/ext/FFT/default_compute_fft.comp b/include/nbl/builtin/glsl/ext/FFT/default_compute_fft.comp
@@ -1,16 +1,7 @@
-#ifndef _NBL_GLSL_EXT_DEFAULT_COMPUTE_FFT_INCLUDED_
-#define _NBL_GLSL_EXT_DEFAULT_COMPUTE_FFT_INCLUDED_
-
-// WorkGroup Size
-
 #ifndef USE_SSBO_FOR_INPUT
 #error "USE_SSBO_FOR_INPUT should be defined."
 #endif
 
-#ifndef _NBL_GLSL_EXT_FFT_MAX_CHANNELS
-#define _NBL_GLSL_EXT_FFT_MAX_CHANNELS 4
-#endif
-
 #ifndef _NBL_GLSL_WORKGROUP_SIZE_
 #define _NBL_GLSL_WORKGROUP_SIZE_ 256
 #endif
@@ -127,6 +118,4 @@ void main()
 	{
 		nbl_glsl_ext_FFT(nbl_glsl_ext_FFT_Parameters_t_getIsInverse(), ch);
 	}
-}
-
-#endif
+}
diff --git a/include/nbl/builtin/glsl/ext/FFT/fft.glsl b/include/nbl/builtin/glsl/ext/FFT/fft.glsl
@@ -8,18 +8,10 @@
 #include <nbl/builtin/glsl/math/complex.glsl>
 #include <nbl/builtin/glsl/ext/FFT/parameters.glsl>
 
-#ifndef _NBL_GLSL_EXT_FFT_MAX_CHANNELS
-#error "_NBL_GLSL_EXT_FFT_MAX_CHANNELS should be defined."
-#endif
-
 #ifndef _NBL_GLSL_EXT_FFT_MAX_DIM_SIZE_
 #error "_NBL_GLSL_EXT_FFT_MAX_DIM_SIZE_ should be defined."
 #endif
 
-#ifndef _NBL_GLSL_EXT_FFT_MAX_ITEMS_PER_THREAD
-#error "_NBL_GLSL_EXT_FFT_MAX_ITEMS_PER_THREAD should be defined."
-#endif
-
 #include "nbl/builtin/glsl/workgroup/shared_fft.glsl"
 
 // Push Constants
@@ -72,7 +64,8 @@ uvec3 nbl_glsl_ext_FFT_getCoordinates(in uint tidx)
 #include "nbl/builtin/glsl/workgroup/fft.glsl"
 
 
-nbl_glsl_complex nbl_glsl_ext_FFT_impl_values[_NBL_GLSL_EXT_FFT_MAX_ITEMS_PER_THREAD*2u]; // TODO: redo later
+nbl_glsl_complex nbl_glsl_ext_FFT_impl_values[(_NBL_GLSL_EXT_FFT_MAX_DIM_SIZE_-1u)/_NBL_GLSL_WORKGROUP_SIZE_+1u];
+
 void nbl_glsl_ext_FFT_loop(in bool is_inverse, in uint virtual_thread_count, in uint step)
 {
     for(uint t=0u; t<virtual_thread_count; t++)
@@ -90,7 +83,7 @@ void nbl_glsl_ext_FFT_loop(in bool is_inverse, in uint virtual_thread_count, in
             nbl_glsl_FFT_DIF_radix2(twiddle,nbl_glsl_ext_FFT_impl_values[lo_ix],nbl_glsl_ext_FFT_impl_values[hi_ix]);
     }
 }
-// TODO: try radix-4 or even radix-8 for perf
+
 void nbl_glsl_ext_FFT(bool is_inverse, uint channel)
 {
     // Virtual Threads Calculation
diff --git a/include/nbl/builtin/glsl/ext/FFT/normalization.comp b/include/nbl/builtin/glsl/ext/FFT/normalization.comp
@@ -1,11 +1,7 @@
 #version 430 core
-
-#ifndef _NBL_GLSL_EXT_FFT_NORMALIZATION_INCLUDED_
-#define _NBL_GLSL_EXT_FFT_NORMALIZATION_INCLUDED_
-
 layout(local_size_x=256, local_size_y=1, local_size_z=1) in;
  
- #define complex_value vec2
+#include <nbl/builtin/glsl/math/complex.glsl>
 
 layout(set=0, binding=0) restrict readonly buffer InBuffer
 {
@@ -20,8 +16,6 @@ layout(set=0, binding=1) restrict buffer OutBuffer
 void main()
 {
 	float power = length(in_data[0]);
-	vec2 normalized_data = in_data[gl_GlobalInvocationID.x];// / power;
+	vec2 normalized_data = in_data[gl_GlobalInvocationID.x]/power;
 	out_data[gl_GlobalInvocationID.x] = normalized_data;
-}
-
-#endif
+}
diff --git a/include/nbl/builtin/glsl/math/complex.glsl b/include/nbl/builtin/glsl/math/complex.glsl
@@ -8,16 +8,18 @@
 #include <nbl/builtin/glsl/math/constants.glsl>
 #include <nbl/builtin/glsl/math/functions.glsl>
 
+#define nbl_glsl_complex16_t uint
+
 #define nbl_glsl_complex vec2
 #define nbl_glsl_cvec2 mat2
 #define nbl_glsl_cvec3 mat3x2
 #define nbl_glsl_cvec4 mat4x2
 
 nbl_glsl_complex nbl_glsl_expImaginary(in float _theta)
 {
-    float r = cos(_theta);
-    float i = sin(_theta);
-    return vec2(r, i);
+    nbl_glsl_complex retval;
+    nbl_glsl_sincos(_theta,retval.y,retval.x);
+    return retval;
 }
 
 nbl_glsl_complex nbl_glsl_complex_mul(in nbl_glsl_complex rhs, in nbl_glsl_complex lhs)
@@ -32,16 +34,47 @@ nbl_glsl_complex nbl_glsl_complex_add(in nbl_glsl_complex rhs, in nbl_glsl_compl
     return rhs + lhs;
 }
 
+nbl_glsl_complex16_t nbl_glsl_complex16_t_conjugate(in nbl_glsl_complex16_t complex) {
+    return complex^0x80000000u;
+}
 nbl_glsl_complex nbl_glsl_complex_conjugate(in nbl_glsl_complex complex) {
-    return complex * vec2(1, -1);
+    return nbl_glsl_complex(complex.x,-complex.y);
 }
 
 
 // FFT
-#include <nbl/builtin/glsl/math/complex.glsl>
+nbl_glsl_complex nbl_glsl_FFT_half_twiddle(in uint k, in float N)
+{
+    const float arg = -2.f*nbl_glsl_PI*float(k)/N;
+    nbl_glsl_complex retval;
+    retval.x = cos(arg);
+    retval.y = sqrt(1.f-retval.x*retval.x); // twiddle is always half the range, so no conditional -1.f needed
+    return retval;
+}
+nbl_glsl_complex nbl_glsl_FFT_half_twiddle(in uint k, in uint logTwoN)
+{
+    return nbl_glsl_FFT_half_twiddle(k,float(1<<logTwoN));
+}
+
+nbl_glsl_complex nbl_glsl_FFT_half_twiddle(in bool is_inverse, in uint k, in float N)
+{
+    nbl_glsl_complex twiddle = nbl_glsl_FFT_half_twiddle(k,N);
+    if (is_inverse)
+        return nbl_glsl_complex_conjugate(twiddle);
+    return twiddle;
+}
+nbl_glsl_complex nbl_glsl_FFT_half_twiddle(in bool is_inverse, in uint k, in uint logTwoN)
+{
+    return nbl_glsl_FFT_half_twiddle(is_inverse,k,float(1<<logTwoN));
+}
+
+
 nbl_glsl_complex nbl_glsl_FFT_twiddle(in uint k, in float N)
 {
-    return nbl_glsl_expImaginary(-2.f*nbl_glsl_PI*float(k)/N);
+    nbl_glsl_complex retval;
+    retval.x = cos(-2.f*nbl_glsl_PI*float(k)/N);
+    retval.y = sqrt(1.f-retval.x*retval.x); // twiddle is always half the range, so no conditional -1.f needed
+    return retval;
 }
 nbl_glsl_complex nbl_glsl_FFT_twiddle(in uint k, in uint logTwoN)
 {
@@ -50,7 +83,7 @@ nbl_glsl_complex nbl_glsl_FFT_twiddle(in uint k, in uint logTwoN)
 
 nbl_glsl_complex nbl_glsl_FFT_twiddle(in bool is_inverse, in uint k, in float N)
 {
-    nbl_glsl_complex twiddle = nbl_glsl_FFT_twiddle(k, N);
+    nbl_glsl_complex twiddle = nbl_glsl_FFT_twiddle(k,N);
     if (is_inverse)
         return nbl_glsl_complex_conjugate(twiddle);
     return twiddle;
@@ -60,6 +93,8 @@ nbl_glsl_complex nbl_glsl_FFT_twiddle(in bool is_inverse, in uint k, in uint log
     return nbl_glsl_FFT_twiddle(is_inverse,k,float(1<<logTwoN));
 }
 
+
+
 // decimation in time
 void nbl_glsl_FFT_DIT_radix2(in nbl_glsl_complex twiddle, inout nbl_glsl_complex lo, inout nbl_glsl_complex hi)
 {
diff --git a/include/nbl/builtin/glsl/math/functions.glsl b/include/nbl/builtin/glsl/math/functions.glsl
@@ -191,7 +191,8 @@ void nbl_glsl_sincos(in float theta, out float s, out float c)
 {
     c = cos(theta);
     s = sqrt(1.0-c*c);
-    s = theta<0.0 ? -s:s; // TODO: do with XOR
+    s = theta<0.0 ? -s:s; // TODO: test with XOR
+    //s = uintBitsToFloat(floatBitsToUint(s)^(floatBitsToUint(theta)&0x80000000u));
 }
 
 mat2x3 nbl_glsl_frisvad(in vec3 n)
diff --git a/include/nbl/builtin/glsl/subgroup/fft.glsl b/include/nbl/builtin/glsl/subgroup/fft.glsl
@@ -15,6 +15,8 @@
 //TODO: optimization for DFT of real signal
 
 // TODO: with stockham or something that does not require stupid shuffles to extract and pack
+// https://cnx.org/contents/8D0YvnW1@7.1:1aiTU8is@6/Alternate-FFT-Structures
+// These twiddle factors can be precomputed once and stored in an array in computer memory, and accessed in the FFT algorithm by table lookup. This simple technique yields very substantial savings and is almost always used in practice.
 void nbl_glsl_subgroupFFT_loop(in bool is_inverse, in uint stride, inout nbl_glsl_complex lo, inout nbl_glsl_complex hi)
 {
     const uint sub_ix = nbl_glsl_SubgroupInvocationID&(stride-1u);
diff --git a/include/nbl/builtin/glsl/workgroup/fft.glsl b/include/nbl/builtin/glsl/workgroup/fft.glsl
@@ -27,8 +27,7 @@
 #endif
 
 
-//TODO: optimization for DFT of real signal
-
+//TODO: try radix-4 or even radix-8 for perf
 
 void nbl_glsl_workgroupFFT_loop(in bool is_inverse, in uint stride)
 {
@@ -42,7 +41,10 @@ void nbl_glsl_workgroupFFT_loop(in bool is_inverse, in uint stride)
     nbl_glsl_complex low = nbl_glsl_complex(uintBitsToFloat(_NBL_GLSL_SCRATCH_SHARED_DEFINED_[lo_x_ix]),uintBitsToFloat(_NBL_GLSL_SCRATCH_SHARED_DEFINED_[lo_y_ix]));
     nbl_glsl_complex high = nbl_glsl_complex(uintBitsToFloat(_NBL_GLSL_SCRATCH_SHARED_DEFINED_[hi_x_ix]),uintBitsToFloat(_NBL_GLSL_SCRATCH_SHARED_DEFINED_[hi_y_ix]));
 
-    nbl_glsl_complex twiddle = nbl_glsl_FFT_twiddle(is_inverse,sub_ix,float(stride<<1u));
+    nbl_glsl_complex twiddle = nbl_glsl_complex(1.f,0.f);
+    if (stride!=1u)
+        twiddle = nbl_glsl_FFT_twiddle(is_inverse,sub_ix,float(stride<<1u));
+
     if (is_inverse)
         nbl_glsl_FFT_DIT_radix2(twiddle,low,high);
     else
@@ -97,16 +99,22 @@ void nbl_glsl_workgroupFFT(in bool is_inverse, inout nbl_glsl_complex lo, inout
     if (is_inverse)
     {
         nbl_glsl_FFT_DIT_radix2(nbl_glsl_FFT_twiddle(true,gl_LocalInvocationIndex,doubleWorkgroupSize),lo,hi);
-        
-    const float doubleSubgroupSize = float(nbl_glsl_SubgroupSize<<1u);
-    lo /= doubleSubgroupSize;
-    hi /= doubleSubgroupSize;
-    const float scaleFactor = float(nbl_glsl_SubgroupSize<<1u)/doubleWorkgroupSize;
-        lo *= scaleFactor;
-        hi *= scaleFactor;
+
+        lo /= doubleWorkgroupSize;
+        hi /= doubleWorkgroupSize;
     }
 }
 
+#if 0 // TODO
+// Computes Forward FFT of two real signals
+void nbl_glsl_workgroupRealFFT(in bool is_inverse, in float sequenceALo, in float sequenceAHi, in float sequenceBLo, in float sequenceBHi)
+{
+    nbl_glsl_complex lo = nbl_glsl_complex(sequenceALo,sequenceBLo);
+    nbl_glsl_complex hi = nbl_glsl_complex(sequenceAHi,sequenceBHi);
+    nbl_glsl_workgroupFFT(false,lo,hi);
+    // extract aDFT and bDFT by using sorensens method
+}
+#endif
 
 
 #endif
diff --git a/include/nbl/ext/FFT/FFT.h b/include/nbl/ext/FFT/FFT.h
@@ -41,11 +41,12 @@ class FFT : public core::TotalInterface
 		enum class PaddingType : uint8_t {
 			CLAMP_TO_EDGE = 0,
 			FILL_WITH_ZERO = 1,
+			// TODO: mirror?
 		};
 
 		enum class DataType {
 			SSBO,
-			TEXTURE2D,
+			TEXTURE2D
 		};
 
 		struct DispatchInfo_t
diff --git a/src/nbl/ext/FFT/FFT.cpp b/src/nbl/ext/FFT/FFT.cpp
@@ -88,23 +88,20 @@ R"===(#version 430 core
 #define USE_SSBO_FOR_INPUT %u
 #define _NBL_GLSL_WORKGROUP_SIZE_ %u
 #define _NBL_GLSL_EXT_FFT_MAX_DIM_SIZE_ %u
-#define _NBL_GLSL_EXT_FFT_MAX_ITEMS_PER_THREAD %u
  
 #include "nbl/builtin/glsl/ext/FFT/default_compute_fft.comp"
 
 )===";
 
 	const size_t extraSize = 32 + 32 + 32 + 32;
 
-	const uint32_t maxItemsPerThread = ((maxPaddedDimensionSize >> 1) - 1u) / (DEFAULT_WORK_GROUP_SIZE) + 1u;
 	const uint32_t useSSBOforInput = (DataType::SSBO == inputType) ? 1 : 0;
 	auto shader = core::make_smart_refctd_ptr<ICPUBuffer>(strlen(sourceFmt)+extraSize+1u);
 	snprintf(
 		reinterpret_cast<char*>(shader->getPointer()),shader->getSize(), sourceFmt,
 		useSSBOforInput,
 		DEFAULT_WORK_GROUP_SIZE,
-		maxPaddedDimensionSize,
-		maxItemsPerThread
+		maxPaddedDimensionSize
 	);
 
 	auto cpuSpecializedShader = core::make_smart_refctd_ptr<ICPUSpecializedShader>(

Original file line number	Diff line number	Diff line change
`@@ -191,7 +191,8 @@ void nbl_glsl_sincos(in float theta, out float s, out float c)`
`191`	`191`	`{`
`192`	`192`	`c = cos(theta);`
`193`	`193`	`s = sqrt(1.0-c*c);`
`194`		`- s = theta<0.0 ? -s:s; // TODO: do with XOR`
	`194`	`+ s = theta<0.0 ? -s:s; // TODO: test with XOR`
	`195`	`+ //s = uintBitsToFloat(floatBitsToUint(s)^(floatBitsToUint(theta)&0x80000000u));`
`195`	`196`	`}`
`196`	`197`
`197`	`198`	`mat2x3 nbl_glsl_frisvad(in vec3 n)`
Original file line number	Diff line number	Diff line change
`@@ -15,6 +15,8 @@`
`15`	`15`	`//TODO: optimization for DFT of real signal`
`16`	`16`
`17`	`17`	`// TODO: with stockham or something that does not require stupid shuffles to extract and pack`
	`18`	`+// https://cnx.org/contents/8D0YvnW1@7.1:1aiTU8is@6/Alternate-FFT-Structures`
	`19`	`+// These twiddle factors can be precomputed once and stored in an array in computer memory, and accessed in the FFT algorithm by table lookup. This simple technique yields very substantial savings and is almost always used in practice.`
`18`	`20`	`void nbl_glsl_subgroupFFT_loop(in bool is_inverse, in uint stride, inout nbl_glsl_complex lo, inout nbl_glsl_complex hi)`
`19`	`21`	`{`
`20`	`22`	`const uint sub_ix = nbl_glsl_SubgroupInvocationID&(stride-1u);`