Skip to content

Commit c2fef50

Browse files
achieve cache coherency
1 parent 5d89705 commit c2fef50

File tree

1 file changed

+36
-16
lines changed

1 file changed

+36
-16
lines changed

examples_tests/49.ComputeFFT/fft_convolve_ifft.comp

Lines changed: 36 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -18,25 +18,46 @@ layout(set=0, binding=1) uniform sampler2D NormalizedKernel[3];
1818
#define _NBL_GLSL_EXT_FFT_MAIN_DEFINED_
1919
#include "nbl/builtin/glsl/ext/FFT/default_compute_fft.comp"
2020

21+
shared vec2 scratch[1024];
22+
2123
// convolve: multiplies every spectrum value this invocation holds in
// nbl_glsl_ext_FFT_impl_values[] by the matching sample of NormalizedKernel[ch].
//   item_per_thread_count - number of entries of nbl_glsl_ext_FFT_impl_values[] to process
//   ch                    - index into the NormalizedKernel[] sampler array
// This span is a diff rendering: a gutter row ending in '-' precedes a removed line,
// one ending in '+' precedes an added line, and a plain number precedes unchanged context.
// Removed version: each invocation sampled the kernel texture itself, once per value.
// Added version: the workgroup first copies kernel samples into
// _NBL_GLSL_SCRATCH_SHARED_DEFINED_ (presumably the workgroup-shared scratch array - confirm
// in the included FFT header) and then reads its multiplicands back from there.
void convolve(in uint item_per_thread_count, in uint ch)
2224
{
2325
// TODO: decouple kernel size from image size (can't get the math to work in my head)
24-
uvec3 dimension = nbl_glsl_ext_FFT_Parameters_t_getDimensions();
25-
26-
for(uint t=0u; t<item_per_thread_count; t++)
27-
{
28-
uint tid = gl_LocalInvocationIndex + t * _NBL_GLSL_WORKGROUP_SIZE_;
29-
// TODO: refactor for smem usage
30-
uvec3 coords = nbl_glsl_ext_FFT_getCoordinates(tid);
31-
const uvec3 log2_size = uvec3(11u, 10u, 0u);
32-
coords = bitfieldReverse(coords)>>(uvec3(32u)-log2_size);
33-
34-
nbl_glsl_complex sourceSpectrum = nbl_glsl_ext_FFT_impl_values[t];
26+
// 11-bit bit-reversal of the workgroup's X index (keeps the top 11 bits of the 32-bit reversal)
const uint i = bitfieldReverse(gl_WorkGroupID.x)>>(32u-11u);
27+
// horizontal texture coordinate: depends only on gl_WorkGroupID.x, so it is the same for the whole workgroup.
// NOTE(review): the removed code offset by 0.5/textureSize(NormalizedKernel[ch],0); here the offset is
// hard-coded as 0.5/512 while the index is divided by 2048 - confirm against the kernel texture size.
const float u = float(i)/2048.f+0.5f/512.f;
3528

36-
vec2 uv = (vec2(coords.xy))/vec2(uvec2(1u)<<log2_size.xy)+vec2(0.5f)/vec2(textureSize(NormalizedKernel[ch],0));
37-
//
38-
nbl_glsl_complex convSpectrum = textureLod(NormalizedKernel[ch],uv,0).xy;
39-
nbl_glsl_ext_FFT_impl_values[t] = nbl_glsl_complex_mul(sourceSpectrum,convSpectrum);
29+
// constants derived from preprocessor macros (ITEM_MASK additionally depends on the item_per_thread_count parameter)
30+
// scratch is split in two halves: [0,HALF_SIZE) receives tmp.x, [HALF_SIZE,2*HALF_SIZE) receives tmp.y
const uint HALF_SIZE = _NBL_GLSL_SCRATCH_SHARED_SIZE_DEFINED_>>1u;
31+
// number of kernel samples each invocation fetches per outer-loop step
const uint ITEMS_PER_STEP = HALF_SIZE>>_NBL_GLSL_WORKGROUP_SIZE_LOG2_;
32+
// the outer loop below runs ITEM_MASK+1 steps; the same mask splits the invocation index into lo/hi parts
const uint ITEM_MASK = (item_per_thread_count-1u)>>findMSB(ITEMS_PER_STEP);
33+
//
34+
// invocation index with the ITEM_MASK bits cleared; used to build the scratch read index
const uint hiInvocation = gl_LocalInvocationIndex&(~ITEM_MASK);
35+
// ITEM_MASK bits of the invocation index; selects the single step in which this invocation multiplies
const uint loInvocation = gl_LocalInvocationIndex&ITEM_MASK;
36+
// running sample offset of the current step, advanced by _NBL_GLSL_WORKGROUP_SIZE_*ITEMS_PER_STEP per step
uint base = 0;
37+
// NOTE: this loop counter `i` shadows the bit-reversed workgroup index `i` declared above
for(uint i=0u; i<=ITEM_MASK; i++,base+=_NBL_GLSL_WORKGROUP_SIZE_*ITEMS_PER_STEP)
38+
{
39+
// vertical texture coordinate of the first sample of this step (index divided by 1024, same 0.5/512 offset as `u`)
const float base_v = float(base)/1024.f+0.5f/512.f;
40+
// executed before scratch is overwritten below
barrier();
41+
for (uint j=0u; j<ITEMS_PER_STEP; j++)
42+
{
43+
// scratch slot written by this invocation in this pass; consecutive invocations write consecutive slots
const uint tid = _NBL_GLSL_WORKGROUP_SIZE_*j+gl_LocalInvocationIndex;
44+
float v = float(tid)/1024.f+base_v;
45+
// fetch one complex kernel sample from mip level 0
nbl_glsl_complex tmp = textureLod(NormalizedKernel[ch],vec2(u,v),0).xy;
46+
// scratch stores uints, so the float bit patterns are reinterpreted rather than converted
_NBL_GLSL_SCRATCH_SHARED_DEFINED_[tid] = floatBitsToUint(tmp.x);
47+
_NBL_GLSL_SCRATCH_SHARED_DEFINED_[HALF_SIZE+tid] = floatBitsToUint(tmp.y);
48+
}
49+
// executed after the scratch writes above and before the scratch reads below
barrier();
50+
// unbraced `if`: it guards the whole `for` statement that follows
if(loInvocation==i)
51+
for(uint t=0u; t<item_per_thread_count; t++)
52+
{
53+
// 10-bit bit-reversal of (_NBL_GLSL_WORKGROUP_SIZE_*t+hiInvocation), used as the scratch read index
const uint j = bitfieldReverse(_NBL_GLSL_WORKGROUP_SIZE_*t+hiInvocation)>>(32u-10u);
54+
nbl_glsl_complex sourceSpectrum = nbl_glsl_ext_FFT_impl_values[t];
55+
// rebuild the complex kernel sample from the two scratch halves
nbl_glsl_complex convSpectrum = nbl_glsl_complex(
56+
uintBitsToFloat(_NBL_GLSL_SCRATCH_SHARED_DEFINED_[j]),
57+
uintBitsToFloat(_NBL_GLSL_SCRATCH_SHARED_DEFINED_[HALF_SIZE+j])
58+
);
59+
// complex product of the held value and the kernel sample, written back in place
nbl_glsl_ext_FFT_impl_values[t] = nbl_glsl_complex_mul(sourceSpectrum,convSpectrum);
60+
}
4061
}
4162
}
4263

@@ -54,7 +75,6 @@ void main()
5475
nbl_glsl_ext_FFT_impl_values[t] = nbl_glsl_ext_FFT_getPaddedData(nbl_glsl_ext_FFT_getCoordinates(tid),ch);
5576
}
5677
nbl_glsl_ext_FFT_preloaded(false,log2FFTSize);
57-
barrier();
5878

5979
convolve(item_per_thread_count,ch);
6080

0 commit comments

Comments
 (0)