@@ -12,52 +12,31 @@ layout(set=0, binding=0) buffer restrict InputOutputBuffer
12
12
13
13
// Array of three 2D samplers bound at set 0, binding 1.
// convolve() picks one element with its `ch` argument and reads the .xy components of the sampled texel.
layout(set=0, binding=1) uniform sampler2D NormalizedKernel[3];
14
14
15
+ #include "convolve_parameters.glsl"
15
16
16
17
#define inData inoutData
17
18
#define outData inoutData
18
19
#define _NBL_GLSL_EXT_FFT_MAIN_DEFINED_
19
20
#include "nbl/builtin/glsl/ext/FFT/default_compute_fft.comp"
20
21
21
- shared vec2 scratch[1024];
22
-
23
22
// convolve: multiplies each of this invocation's FFT values by a sample of the kernel texture.
//
// NOTE(review): this listing appears to be a rendered diff (see the "@@" hunk headers): lines starting
// with "-" are the removed implementation, lines starting with "+" are its replacement, and the bare
// integers look like old/new line numbers. The removed path staged kernel texels in
// _NBL_GLSL_SCRATCH_SHARED_DEFINED_ between barrier() calls before multiplying; the added path samples
// the texture directly for every element. The notes below describe the added ("+") path.
//
// Parameters:
//   item_per_thread_count - loop bound; entries [0, item_per_thread_count) of
//                           nbl_glsl_ext_FFT_impl_values are processed.
//   ch                    - index into NormalizedKernel selecting which texture is sampled.
//
// Effect: every processed nbl_glsl_ext_FFT_impl_values[t] is overwritten with
// nbl_glsl_complex_mul(previous value, sampled texel .xy). Nothing is returned.
void convolve(in uint item_per_thread_count, in uint ch)
24
23
{
25
24
// TODO: decouple kernel size from image size (can't get the math to work in my head)
26
- const uint i = bitfieldReverse(gl_WorkGroupID.x)>>(32u-11u);
27
- const float u = float(i)/2048.f+0.5f/512.f;
28
-
29
- // compile time constants
30
- const uint HALF_SIZE = _NBL_GLSL_SCRATCH_SHARED_SIZE_DEFINED_>>1u;
31
- const uint ITEMS_PER_STEP = HALF_SIZE>>_NBL_GLSL_WORKGROUP_SIZE_LOG2_;
32
- const uint ITEM_MASK = (item_per_thread_count-1u)>>findMSB(ITEMS_PER_STEP);
33
- //
34
- const uint hiInvocation = gl_LocalInvocationIndex&(~ITEM_MASK);
35
- const uint loInvocation = gl_LocalInvocationIndex&ITEM_MASK;
36
- uint base = 0;
37
- for(uint i=0u; i<=ITEM_MASK; i++,base+=_NBL_GLSL_WORKGROUP_SIZE_*ITEMS_PER_STEP)
25
+ for(uint t=0u; t<item_per_thread_count; t++)
38
26
{
39
- const float base_v = float(base)/1024.f+0.5f/512.f;
40
- barrier();
41
- for (uint j=0u; j<ITEMS_PER_STEP; j++)
42
- {
43
- const uint tid = _NBL_GLSL_WORKGROUP_SIZE_*j+gl_LocalInvocationIndex;
44
- float v = float(tid)/1024.f+base_v;
45
- nbl_glsl_complex tmp = textureLod(NormalizedKernel[ch],vec2(u,v),0).xy;
46
- _NBL_GLSL_SCRATCH_SHARED_DEFINED_[tid] = floatBitsToUint(tmp.x);
47
- _NBL_GLSL_SCRATCH_SHARED_DEFINED_[HALF_SIZE+tid] = floatBitsToUint(tmp.y);
48
- }
49
- barrier();
50
- if(loInvocation==i)
51
- for(uint t=0u; t<item_per_thread_count; t++)
52
- {
53
- const uint j = bitfieldReverse(_NBL_GLSL_WORKGROUP_SIZE_*t+hiInvocation)>>(32u-10u);
54
- nbl_glsl_complex sourceSpectrum = nbl_glsl_ext_FFT_impl_values[t];
55
- nbl_glsl_complex convSpectrum = nbl_glsl_complex(
56
- uintBitsToFloat(_NBL_GLSL_SCRATCH_SHARED_DEFINED_[j]),
57
- uintBitsToFloat(_NBL_GLSL_SCRATCH_SHARED_DEFINED_[HALF_SIZE+j])
58
- );
59
- nbl_glsl_ext_FFT_impl_values[t] = nbl_glsl_complex_mul(sourceSpectrum,convSpectrum);
60
- }
// Index handed to nbl_glsl_ext_FFT_getCoordinates for entry t of this invocation;
// consecutive t values are spaced _NBL_GLSL_WORKGROUP_SIZE_ apart.
27
+ const uint tid = _NBL_GLSL_WORKGROUP_SIZE_*t+gl_LocalInvocationIndex;
28
+
29
+ // TODO: do push constants here
30
+ uvec3 coords = nbl_glsl_ext_FFT_getCoordinates(tid);
// Hard-coded per-axis bit counts: 11 for x, 10 for y, 0 for z.
31
+ const uvec3 log2_size = uvec3(11u, 10u, 0u);
// bitfieldReverse(c)>>(32-k) yields the low k bits of c in reversed order, per component.
// NOTE(review): log2_size.z is 0, so the z component is shifted by 32. The GLSL spec leaves a shift
// by an amount >= the operand's bit width undefined. coords.z is not read below (only coords.xy is),
// so this looks harmless - confirm, or restrict the reversal to the .xy components.
32
+ coords = bitfieldReverse(coords)>>(uvec3(32u)-log2_size); // reverse_shifts
33
+
34
+ nbl_glsl_complex sourceSpectrum = nbl_glsl_ext_FFT_impl_values[t];
35
+
// uv = bit-reversed (x,y) divided by (2^11, 2^10), plus half a texel of the selected kernel
// texture (0.5 / textureSize at level 0).
// NOTE(review): the half-texel term depends only on ch, so it could be computed once before the loop.
36
+ vec2 uv = (vec2(coords.xy))/vec2(uvec2(1u)<<log2_size.xy)+vec2(0.5f)/vec2(textureSize(NormalizedKernel[ch],0)); //kernel_half_pixel_size
37
+ //
// Sample the selected kernel texture at LOD 0; its .xy components are the complex factor.
38
+ nbl_glsl_complex convSpectrum = textureLod(NormalizedKernel[ch],uv,0).xy;
39
+ nbl_glsl_ext_FFT_impl_values[t] = nbl_glsl_complex_mul(sourceSpectrum,convSpectrum);
61
40
}
62
41
}
63
42
@@ -75,6 +54,7 @@ void main()
75
54
nbl_glsl_ext_FFT_impl_values[t] = nbl_glsl_ext_FFT_getPaddedData(nbl_glsl_ext_FFT_getCoordinates(tid),ch);
76
55
}
77
56
nbl_glsl_ext_FFT_preloaded(false,log2FFTSize);
57
+ barrier();
78
58
79
59
convolve(item_per_thread_count,ch);
80
60
0 commit comments