@@ -18,25 +18,46 @@ layout(set=0, binding=1) uniform sampler2D NormalizedKernel[3];
18
18
#define _NBL_GLSL_EXT_FFT_MAIN_DEFINED_
19
19
#include "nbl/builtin/glsl/ext/FFT/default_compute_fft.comp"
20
20
21
// Workgroup-shared array of 1024 vec2 elements added by this change.
// NOTE(review): `scratch` is not referenced by name in the visible hunks; convolve() stores
// uint bit patterns into _NBL_GLSL_SCRATCH_SHARED_DEFINED_ instead - confirm this declaration is still needed.
+ shared vec2 scratch[1024];
22
+
21
23
// Multiplies every value this invocation holds in nbl_glsl_ext_FFT_impl_values by a
// sample fetched from NormalizedKernel[ch] (complex multiply via nbl_glsl_complex_mul).
//
// item_per_thread_count - number of entries of nbl_glsl_ext_FFT_impl_values processed per invocation.
// ch                    - index into the NormalizedKernel sampler array.
//
// Diff reading guide: lines prefixed with '-' are the previous implementation (one textureLod
// per value, done directly in the per-item loop); lines prefixed with '+' are the replacement,
// which first stages texture samples in _NBL_GLSL_SCRATCH_SHARED_DEFINED_ between barrier()
// calls and then reads them back for the multiply.
void convolve(in uint item_per_thread_count, in uint ch)
22
24
{
23
25
// TODO: decouple kernel size from image size (can't get the math to work in my head)
24
- uvec3 dimension = nbl_glsl_ext_FFT_Parameters_t_getDimensions();
25
-
26
- for(uint t=0u; t<item_per_thread_count; t++)
27
- {
28
- uint tid = gl_LocalInvocationIndex + t * _NBL_GLSL_WORKGROUP_SIZE_;
29
- // TODO: refactor for smem usage
30
- uvec3 coords = nbl_glsl_ext_FFT_getCoordinates(tid);
31
- const uvec3 log2_size = uvec3(11u, 10u, 0u);
32
- coords = bitfieldReverse(coords)>>(uvec3(32u)-log2_size);
33
-
34
- nbl_glsl_complex sourceSpectrum = nbl_glsl_ext_FFT_impl_values[t];
26
// New code: the u coordinate is fixed per workgroup, taken from the 11-bit bit-reversal of gl_WorkGroupID.x.
// NOTE(review): the 2048 divisor and the 0.5/512 offset are hard-coded here, whereas the removed code
// derived the half-texel offset from textureSize(NormalizedKernel[ch],0) - confirm they match the kernel texture.
+ const uint i = bitfieldReverse(gl_WorkGroupID.x)>>(32u-11u);
27
+ const float u = float(i)/2048.f+0.5f/512.f;
35
28
36
- vec2 uv = (vec2(coords.xy))/vec2(uvec2(1u)<<log2_size.xy)+vec2(0.5f)/vec2(textureSize(NormalizedKernel[ch],0));
37
- //
38
- nbl_glsl_complex convSpectrum = textureLod(NormalizedKernel[ch],uv,0).xy;
39
- nbl_glsl_ext_FFT_impl_values[t] = nbl_glsl_complex_mul(sourceSpectrum,convSpectrum);
29
+ // compile time constants
30
// HALF_SIZE splits the scratch array in two: .x components are stored at [tid], .y components at [HALF_SIZE+tid].
+ const uint HALF_SIZE = _NBL_GLSL_SCRATCH_SHARED_SIZE_DEFINED_>>1u;
31
// Number of texture fetches each invocation performs per step
// (assuming _NBL_GLSL_WORKGROUP_SIZE_LOG2_ is log2 of _NBL_GLSL_WORKGROUP_SIZE_ - TODO confirm).
+ const uint ITEMS_PER_STEP = HALF_SIZE>>_NBL_GLSL_WORKGROUP_SIZE_LOG2_;
32
// NOTE(review): depends on the item_per_thread_count argument, so it is only a compile-time
// constant if the caller passes one.
+ const uint ITEM_MASK = (item_per_thread_count-1u)>>findMSB(ITEMS_PER_STEP);
33
+ //
34
// Split the local invocation index into the bits outside / inside ITEM_MASK.
+ const uint hiInvocation = gl_LocalInvocationIndex&(~ITEM_MASK);
35
+ const uint loInvocation = gl_LocalInvocationIndex&ITEM_MASK;
36
+ uint base = 0;
37
// Runs ITEM_MASK+1 steps; `base` advances by _NBL_GLSL_WORKGROUP_SIZE_*ITEMS_PER_STEP each step.
// The loop counter `i` shadows the `i` declared above, which is only used to compute `u`.
+ for(uint i=0u; i<=ITEM_MASK; i++,base+=_NBL_GLSL_WORKGROUP_SIZE_*ITEMS_PER_STEP)
38
+ {
39
+ const float base_v = float(base)/1024.f+0.5f/512.f;
40
// Presumably keeps the previous step's scratch reads from racing with the writes below - TODO confirm.
+ barrier();
41
// Stage this step's kernel samples: each invocation fetches ITEMS_PER_STEP texels along v at the fixed u.
+ for (uint j=0u; j<ITEMS_PER_STEP; j++)
42
+ {
43
+ const uint tid = _NBL_GLSL_WORKGROUP_SIZE_*j+gl_LocalInvocationIndex;
44
+ float v = float(tid)/1024.f+base_v;
45
+ nbl_glsl_complex tmp = textureLod(NormalizedKernel[ch],vec2(u,v),0).xy;
46
// Floats are stored as raw bit patterns (floatBitsToUint) and decoded with uintBitsToFloat on read.
+ _NBL_GLSL_SCRATCH_SHARED_DEFINED_[tid] = floatBitsToUint(tmp.x);
47
+ _NBL_GLSL_SCRATCH_SHARED_DEFINED_[HALF_SIZE+tid] = floatBitsToUint(tmp.y);
48
+ }
49
// Presumably makes the scratch writes above visible to every invocation before the reads below - TODO confirm.
+ barrier();
50
// Only invocations whose loInvocation equals the current step index multiply during this step.
+ if(loInvocation==i)
51
+ for(uint t=0u; t<item_per_thread_count; t++)
52
+ {
53
// j is the 10-bit bit-reversal of (_NBL_GLSL_WORKGROUP_SIZE_*t+hiInvocation), i.e. in [0,1023].
// NOTE(review): j is used unoffset as a scratch index - confirm HALF_SIZE covers that range.
+ const uint j = bitfieldReverse(_NBL_GLSL_WORKGROUP_SIZE_*t+hiInvocation)>>(32u-10u);
54
+ nbl_glsl_complex sourceSpectrum = nbl_glsl_ext_FFT_impl_values[t];
55
+ nbl_glsl_complex convSpectrum = nbl_glsl_complex(
56
+ uintBitsToFloat(_NBL_GLSL_SCRATCH_SHARED_DEFINED_[j]),
57
+ uintBitsToFloat(_NBL_GLSL_SCRATCH_SHARED_DEFINED_[HALF_SIZE+j])
58
+ );
59
+ nbl_glsl_ext_FFT_impl_values[t] = nbl_glsl_complex_mul(sourceSpectrum,convSpectrum);
60
+ }
40
61
}
41
62
}
42
63
@@ -54,7 +75,6 @@ void main()
54
75
nbl_glsl_ext_FFT_impl_values[t] = nbl_glsl_ext_FFT_getPaddedData(nbl_glsl_ext_FFT_getCoordinates(tid),ch);
55
76
}
56
77
nbl_glsl_ext_FFT_preloaded(false,log2FFTSize);
57
- barrier();
58
78
59
79
convolve(item_per_thread_count,ch);
60
80
0 commit comments