Skip to content

Commit ad5b850

Browse files
Revert the shared memory coherency change (it was slower)
1 parent c2fef50 commit ad5b850

File tree

3 files changed

+35
-37
lines changed

3 files changed

+35
-37
lines changed
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
2+
// This file is part of the "Nabla Engine".
3+
// For conditions of distribution and use, see copyright notice in nabla.h
4+
5+
#include "nbl/builtin/glsl/ext/FFT/parameters_struct.glsl"
6+
struct convolve_parameters_t // Parameter struct shared by inclusion: pulled into fft_convolve_ifft.comp and into main.cpp.
7+
{
8+
nbl_glsl_ext_FFT_Parameters_t fft_params; // FFT parameters; on the C++ side this type name is aliased to ext::FFT::FFT::Parameters_t before inclusion.
9+
vec2 bitreversed_to_normalized; // NOTE(review): presumably the scale that maps bit-reversed integer coordinates to normalized UVs - confirm against the shader.
10+
vec2 kernel_half_pixel_size; // NOTE(review): presumably 0.5/textureSize of the kernel, per the "kernel_half_pixel_size" remark in fft_convolve_ifft.comp - confirm.
11+
};

examples_tests/49.ComputeFFT/fft_convolve_ifft.comp

Lines changed: 16 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -12,52 +12,31 @@ layout(set=0, binding=0) buffer restrict InputOutputBuffer
1212

1313
layout(set=0, binding=1) uniform sampler2D NormalizedKernel[3];
1414

15+
#include "convolve_parameters.glsl"
1516

1617
#define inData inoutData
1718
#define outData inoutData
1819
#define _NBL_GLSL_EXT_FFT_MAIN_DEFINED_
1920
#include "nbl/builtin/glsl/ext/FFT/default_compute_fft.comp"
2021

21-
shared vec2 scratch[1024];
22-
2322
void convolve(in uint item_per_thread_count, in uint ch)
2423
{
2524
// TODO: decouple kernel size from image size (can't get the math to work in my head)
26-
const uint i = bitfieldReverse(gl_WorkGroupID.x)>>(32u-11u);
27-
const float u = float(i)/2048.f+0.5f/512.f;
28-
29-
// compile time constants
30-
const uint HALF_SIZE = _NBL_GLSL_SCRATCH_SHARED_SIZE_DEFINED_>>1u;
31-
const uint ITEMS_PER_STEP = HALF_SIZE>>_NBL_GLSL_WORKGROUP_SIZE_LOG2_;
32-
const uint ITEM_MASK = (item_per_thread_count-1u)>>findMSB(ITEMS_PER_STEP);
33-
//
34-
const uint hiInvocation = gl_LocalInvocationIndex&(~ITEM_MASK);
35-
const uint loInvocation = gl_LocalInvocationIndex&ITEM_MASK;
36-
uint base = 0;
37-
for(uint i=0u; i<=ITEM_MASK; i++,base+=_NBL_GLSL_WORKGROUP_SIZE_*ITEMS_PER_STEP)
25+
for(uint t=0u; t<item_per_thread_count; t++)
3826
{
39-
const float base_v = float(base)/1024.f+0.5f/512.f;
40-
barrier();
41-
for (uint j=0u; j<ITEMS_PER_STEP; j++)
42-
{
43-
const uint tid = _NBL_GLSL_WORKGROUP_SIZE_*j+gl_LocalInvocationIndex;
44-
float v = float(tid)/1024.f+base_v;
45-
nbl_glsl_complex tmp = textureLod(NormalizedKernel[ch],vec2(u,v),0).xy;
46-
_NBL_GLSL_SCRATCH_SHARED_DEFINED_[tid] = floatBitsToUint(tmp.x);
47-
_NBL_GLSL_SCRATCH_SHARED_DEFINED_[HALF_SIZE+tid] = floatBitsToUint(tmp.y);
48-
}
49-
barrier();
50-
if(loInvocation==i)
51-
for(uint t=0u; t<item_per_thread_count; t++)
52-
{
53-
const uint j = bitfieldReverse(_NBL_GLSL_WORKGROUP_SIZE_*t+hiInvocation)>>(32u-10u);
54-
nbl_glsl_complex sourceSpectrum = nbl_glsl_ext_FFT_impl_values[t];
55-
nbl_glsl_complex convSpectrum = nbl_glsl_complex(
56-
uintBitsToFloat(_NBL_GLSL_SCRATCH_SHARED_DEFINED_[j]),
57-
uintBitsToFloat(_NBL_GLSL_SCRATCH_SHARED_DEFINED_[HALF_SIZE+j])
58-
);
59-
nbl_glsl_ext_FFT_impl_values[t] = nbl_glsl_complex_mul(sourceSpectrum,convSpectrum);
60-
}
27+
const uint tid = _NBL_GLSL_WORKGROUP_SIZE_*t+gl_LocalInvocationIndex;
28+
29+
// TODO: do push constants here
30+
uvec3 coords = nbl_glsl_ext_FFT_getCoordinates(tid);
31+
const uvec3 log2_size = uvec3(11u, 10u, 0u);
32+
coords = bitfieldReverse(coords)>>(uvec3(32u)-log2_size); // reverse_shifts
33+
34+
nbl_glsl_complex sourceSpectrum = nbl_glsl_ext_FFT_impl_values[t];
35+
36+
vec2 uv = (vec2(coords.xy))/vec2(uvec2(1u)<<log2_size.xy)+vec2(0.5f)/vec2(textureSize(NormalizedKernel[ch],0)); //kernel_half_pixel_size
37+
//
38+
nbl_glsl_complex convSpectrum = textureLod(NormalizedKernel[ch],uv,0).xy;
39+
nbl_glsl_ext_FFT_impl_values[t] = nbl_glsl_complex_mul(sourceSpectrum,convSpectrum);
6140
}
6241
}
6342

@@ -75,6 +54,7 @@ void main()
7554
nbl_glsl_ext_FFT_impl_values[t] = nbl_glsl_ext_FFT_getPaddedData(nbl_glsl_ext_FFT_getCoordinates(tid),ch);
7655
}
7756
nbl_glsl_ext_FFT_preloaded(false,log2FFTSize);
57+
barrier();
7858

7959
convolve(item_per_thread_count,ch);
8060

examples_tests/49.ComputeFFT/main.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,13 @@ inline void updateDescriptorSet_LastFFT (
172172
driver->updateDescriptorSets(2u, pWrites, 0u, nullptr);
173173
}
174174

175+
using nbl_glsl_ext_FFT_Parameters_t = ext::FFT::FFT::Parameters_t;
176+
struct vec2 // Host-side two-float stand-in for the GLSL vec2 type, declared just before convolve_parameters.glsl is included into this C++ file.
177+
{
178+
float x; // first component
179+
float y; // second component; NOTE(review): layout compatibility with the GLSL-side vec2 is assumed, not checked here.
180+
};
181+
#include "convolve_parameters.glsl"
175182

176183

177184
int main()
@@ -560,7 +567,7 @@ int main()
560567

561568
// pipelines
562569
auto fftPipeline_ImageInput = driver->createGPUComputePipeline(nullptr,core::smart_refctd_ptr(imageFirstFFTPipelineLayout),createShader(driver, paddedDim.width, "../image_first_fft.comp"));
563-
auto convolvePipeline = driver->createGPUComputePipeline(nullptr, core::smart_refctd_ptr(convolvePipelineLayout), createShader(driver, paddedDim.height, "../fft_convolve_ifft.comp"));
570+
auto convolvePipeline = driver->createGPUComputePipeline(nullptr, std::move(convolvePipelineLayout), createShader(driver, paddedDim.height, "../fft_convolve_ifft.comp"));
564571
auto lastFFTPipeline = driver->createGPUComputePipeline(nullptr, getPipelineLayout_LastFFT(driver), createShader(driver, paddedDim.width, "../last_fft.comp"));
565572

566573
// Src FFT X

0 commit comments

Comments
 (0)