Skip to content

Commit 902b15f

Browse files
switch to using images to store the blur kernel spectrum
1 parent 52c2cdd commit 902b15f

File tree

3 files changed

+102
-58
lines changed

3 files changed

+102
-58
lines changed

examples_tests/49.ComputeFFT/fft_convolve_ifft.comp

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,7 @@ layout(set=0, binding=0) buffer restrict InputOutputBuffer
1818
nbl_glsl_complex inoutData[];
1919
};
2020

21-
layout(set=0, binding=1) restrict readonly buffer KernelBuffer
22-
{
23-
nbl_glsl_complex kerData[];
24-
};
21+
layout(set=0, binding=1) uniform sampler2D NormalizedKernel[3];
2522

2623
// Get/Set Data Function
2724
layout(push_constant) uniform PushConstants
@@ -78,8 +75,7 @@ void convolve(in uint item_per_thread_count, in uint ch)
7875
uint tid = gl_LocalInvocationIndex + t * _NBL_GLSL_WORKGROUP_SIZE_;
7976
uvec3 coords = nbl_glsl_ext_FFT_getCoordinates(tid);
8077
//coords &= uvec3(0xffeu);
81-
uint idx = ch * (dimension.x * dimension.y * dimension.z) + coords.z * (dimension.x * dimension.y) + coords.y * (dimension.x) + coords.x;
82-
nbl_glsl_ext_FFT_impl_values[t] = nbl_glsl_complex_mul(nbl_glsl_ext_FFT_impl_values[t],kerData[idx]);
78+
nbl_glsl_ext_FFT_impl_values[t] = nbl_glsl_complex_mul(nbl_glsl_ext_FFT_impl_values[t],texelFetch(NormalizedKernel[ch],ivec2(coords.xy),0).xy);
8379
}
8480
}
8581

examples_tests/49.ComputeFFT/main.cpp

Lines changed: 90 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -27,47 +27,63 @@ struct DispatchInfo_t
2727
uint32_t workGroupCount[3];
2828
};
2929

30-
static inline core::smart_refctd_ptr<video::IGPUPipelineLayout> getPipelineLayout_Convolution(video::IVideoDriver* driver) {
31-
static IGPUDescriptorSetLayout::SBinding bnd[] =
30+
constexpr uint32_t channelCountOverride = 3u;
31+
32+
inline smart_refctd_ptr<IGPUPipelineLayout> getPipelineLayout_Convolution(IVideoDriver* driver)
33+
{
34+
IGPUSampler::SParams params =
3235
{
3336
{
37+
ISampler::ETC_CLAMP_TO_BORDER,
38+
ISampler::ETC_CLAMP_TO_BORDER,
39+
ISampler::ETC_CLAMP_TO_BORDER,
40+
ISampler::ETBC_FLOAT_OPAQUE_BLACK,
41+
ISampler::ETF_LINEAR, // TODO: filtering is probably unnecessary, the convolution shader reads NormalizedKernel via texelFetch
42+
ISampler::ETF_LINEAR,
43+
ISampler::ESMM_NEAREST,
3444
0u,
35-
EDT_STORAGE_BUFFER,
36-
1u,
37-
ISpecializedShader::ESS_COMPUTE,
38-
nullptr
39-
},
45+
0u,
46+
ISampler::ECO_ALWAYS
47+
}
48+
};
49+
auto sampler = driver->createGPUSampler(std::move(params));
50+
smart_refctd_ptr<IGPUSampler> samplers[channelCountOverride];
51+
std::fill_n(samplers,channelCountOverride,sampler);
52+
53+
IGPUDescriptorSetLayout::SBinding bnd[] =
54+
{
4055
{
41-
1u,
56+
0u,
4257
EDT_STORAGE_BUFFER,
4358
1u,
4459
ISpecializedShader::ESS_COMPUTE,
4560
nullptr
4661
},
4762
{
48-
2u,
49-
EDT_STORAGE_BUFFER,
5063
1u,
64+
EDT_COMBINED_IMAGE_SAMPLER,
65+
channelCountOverride,
5166
ISpecializedShader::ESS_COMPUTE,
52-
nullptr
53-
},
67+
samplers
68+
}
5469
};
5570

5671
using FFTClass = ext::FFT::FFT;
5772
core::SRange<const asset::SPushConstantRange> pcRange = FFTClass::getDefaultPushConstantRanges();
58-
core::SRange<const video::IGPUDescriptorSetLayout::SBinding> bindings = {bnd, bnd+sizeof(bnd)/sizeof(IGPUDescriptorSetLayout::SBinding)};;
73+
core::SRange<const video::IGPUDescriptorSetLayout::SBinding> bindings = {bnd,bnd+sizeof(bnd)/sizeof(IGPUDescriptorSetLayout::SBinding)};
5974

6075
return driver->createGPUPipelineLayout(
6176
pcRange.begin(),pcRange.end(),
6277
driver->createGPUDescriptorSetLayout(bindings.begin(),bindings.end()),nullptr,nullptr,nullptr
6378
);
6479
}
65-
static inline core::smart_refctd_ptr<video::IGPUSpecializedShader> createShader_Convolution(
80+
81+
inline core::smart_refctd_ptr<video::IGPUSpecializedShader> createShader_Convolution(
6682
video::IVideoDriver* driver,
6783
IAssetManager* am,
6884
uint32_t maxDimensionSize)
6985
{
70-
uint32_t const maxPaddedDimensionSize = core::roundUpToPoT(maxDimensionSize);
86+
const uint32_t maxPaddedDimensionSize = core::roundUpToPoT(maxDimensionSize);
7187

7288
const char* sourceFmt =
7389
R"===(#version 430 core
@@ -100,21 +116,20 @@ R"===(#version 430 core
100116

101117
return gpuSpecializedShader;
102118
}
103-
static inline void updateDescriptorSet_Convolution (
119+
inline void updateDescriptorSet_Convolution (
104120
video::IVideoDriver * driver,
105121
video::IGPUDescriptorSet * set,
106122
core::smart_refctd_ptr<video::IGPUBuffer> inputOutputBufferDescriptor,
107-
core::smart_refctd_ptr<video::IGPUBuffer> kernelBufferDescriptor)
123+
const core::smart_refctd_ptr<video::IGPUImageView>* kernelNormalizedSpectrumImageDescriptors)
108124
{
109125
constexpr uint32_t descCount = 2u;
110-
video::IGPUDescriptorSet::SDescriptorInfo pInfos[descCount];
126+
video::IGPUDescriptorSet::SDescriptorInfo pInfos[1u+channelCountOverride];
111127
video::IGPUDescriptorSet::SWriteDescriptorSet pWrites[descCount];
112128

113129
for (auto i = 0; i < descCount; i++)
114130
{
115131
pWrites[i].dstSet = set;
116132
pWrites[i].arrayElement = 0u;
117-
pWrites[i].count = 1u;
118133
pWrites[i].info = pInfos+i;
119134
}
120135

@@ -128,11 +143,15 @@ static inline void updateDescriptorSet_Convolution (
128143

129144
// Kernel spectrum images (one combined image sampler per channel)
130145
pWrites[1].binding = 1;
131-
pWrites[1].descriptorType = asset::EDT_STORAGE_BUFFER;
132-
pWrites[1].count = 1;
133-
pInfos[1].desc = kernelBufferDescriptor;
134-
pInfos[1].buffer.size = kernelBufferDescriptor->getSize();
135-
pInfos[1].buffer.offset = 0u;
146+
pWrites[1].descriptorType = asset::EDT_COMBINED_IMAGE_SAMPLER;
147+
pWrites[1].count = channelCountOverride;
148+
for (uint32_t i=0u; i<channelCountOverride; i++)
149+
{
150+
auto& info = pInfos[1u+i];
151+
info.desc = kernelNormalizedSpectrumImageDescriptors[i];
152+
//info.image.imageLayout = ;
153+
info.image.sampler = nullptr;
154+
}
136155

137156
driver->updateDescriptorSets(descCount, pWrites, 0u, nullptr);
138157
}
@@ -339,6 +358,9 @@ int main()
339358
VkExtent3D kerDim = kerImgInfo.extent;
340359
uint32_t srcNumChannels = getFormatChannelCount(srcFormat);
341360
uint32_t kerNumChannels = getFormatChannelCount(kerFormat);
361+
//! OVERRIDE (we don't need alpha)
362+
srcNumChannels = channelCountOverride;
363+
kerNumChannels = channelCountOverride;
342364
assert(srcNumChannels == kerNumChannels); // Just to make sure, because the other case is not handled in this example
343365

344366
VkExtent3D paddedDim = FFTClass::padDimensionToNextPOT(srcDim, kerDim);
@@ -370,21 +392,21 @@ int main()
370392
auto fftPipelineLayout_ImageInput = FFTClass::getDefaultPipelineLayout(driver, FFTClass::DataType::TEXTURE2D);
371393
auto fftPipelineLayout_KernelNormalization = [&]() -> auto
372394
{
373-
static IGPUDescriptorSetLayout::SBinding bnd[] =
395+
IGPUDescriptorSetLayout::SBinding bnd[] =
374396
{
375397
{
376398
0u,
377399
EDT_STORAGE_BUFFER,
378400
1u,
379401
ISpecializedShader::ESS_COMPUTE,
380-
nullptr,
402+
nullptr
381403
},
382404
{
383405
1u,
384-
EDT_STORAGE_BUFFER,
385-
1u,
406+
EDT_STORAGE_IMAGE,
407+
channelCountOverride,
386408
ISpecializedShader::ESS_COMPUTE,
387-
nullptr,
409+
nullptr
388410
},
389411
};
390412
return driver->createGPUPipelineLayout(
@@ -411,8 +433,31 @@ int main()
411433
// Allocate Output Buffer
412434
auto fftOutputBuffer_0 = driver->createDeviceLocalGPUBufferOnDedMem(FFTClass::getOutputBufferSize(paddedDim, srcNumChannels)); // result of: srcFFTX and kerFFTX and Convolution and IFFTY
413435
auto fftOutputBuffer_1 = driver->createDeviceLocalGPUBufferOnDedMem(FFTClass::getOutputBufferSize(paddedDim, srcNumChannels)); // result of: srcFFTY and IFFTX
414-
auto fftOutputBuffer_KernelNormalized = driver->createDeviceLocalGPUBufferOnDedMem(FFTClass::getOutputBufferSize(paddedDim, srcNumChannels)); // result of: kerFFTY
415-
436+
auto createKernelSpectrum = [&]() -> auto
437+
{
438+
video::IGPUImage::SCreationParams imageParams;
439+
imageParams.flags = static_cast<asset::IImage::E_CREATE_FLAGS>(0u);
440+
imageParams.type = asset::IImage::ET_2D;
441+
imageParams.format = asset::EF_R16G16_SFLOAT;
442+
imageParams.extent = {paddedDim.width,paddedDim.height,1u};
443+
imageParams.mipLevels = 1u;
444+
imageParams.arrayLayers = 1u;
445+
imageParams.samples = asset::IImage::ESCF_1_BIT;
446+
447+
video::IGPUImageView::SCreationParams viewParams;
448+
viewParams.flags = static_cast<video::IGPUImageView::E_CREATE_FLAGS>(0u);
449+
viewParams.image = driver->createGPUImageOnDedMem(std::move(imageParams),driver->getDeviceLocalGPUMemoryReqs());
450+
viewParams.viewType = video::IGPUImageView::ET_2D;
451+
viewParams.format = asset::EF_R16G16_SFLOAT;
452+
viewParams.components = {};
453+
viewParams.subresourceRange = {};
454+
viewParams.subresourceRange.levelCount = 1u;
455+
viewParams.subresourceRange.layerCount = 1u;
456+
return driver->createGPUImageView(std::move(viewParams));
457+
};
458+
core::smart_refctd_ptr<IGPUImageView> kernelNormalizedSpectrums[channelCountOverride];
459+
for (uint32_t i=0u; i<channelCountOverride; i++)
460+
kernelNormalizedSpectrums[i] = createKernelSpectrum();
416461

417462
// Precompute Kernel FFT
418463
{
@@ -429,7 +474,7 @@ int main()
429474
{
430475
auto dset = driver->createGPUDescriptorSet(core::smart_refctd_ptr<const IGPUDescriptorSetLayout>(fftPipelineLayout_KernelNormalization->getDescriptorSetLayout(0u)));
431476

432-
video::IGPUDescriptorSet::SDescriptorInfo pInfos[2];
477+
video::IGPUDescriptorSet::SDescriptorInfo pInfos[1+channelCountOverride];
433478
video::IGPUDescriptorSet::SWriteDescriptorSet pWrites[2];
434479

435480
for (auto i = 0; i < 2; i++)
@@ -450,11 +495,15 @@ int main()
450495

451496
// Out images (one storage image per channel)
452497
pWrites[1].binding = 1;
453-
pWrites[1].descriptorType = asset::EDT_STORAGE_BUFFER;
454-
pWrites[1].count = 1;
455-
pInfos[1].desc = fftOutputBuffer_KernelNormalized;
456-
pInfos[1].buffer.size = fftOutputBuffer_KernelNormalized->getSize();
457-
pInfos[1].buffer.offset = 0u;
498+
pWrites[1].descriptorType = asset::EDT_STORAGE_IMAGE;
499+
pWrites[1].count = channelCountOverride;
500+
for (uint32_t i=0u; i<channelCountOverride; i++)
501+
{
502+
auto& info = pInfos[1u+i];
503+
info.desc = kernelNormalizedSpectrums[i];
504+
//info.image.imageLayout = ;
505+
info.image.sampler = nullptr;
506+
}
458507

459508
driver->updateDescriptorSets(2u, pWrites, 0u, nullptr);
460509
return dset;
@@ -472,18 +521,13 @@ int main()
472521
FFTClass::pushConstants(driver, fftPipelineLayout_SSBOInput.get(), paddedDim, paddedDim, FFTClass::Direction::Y, false, srcNumChannels);
473522
FFTClass::dispatchHelper(driver, fftDispatchInfo_Vertical);
474523

475-
// Ker Image FFT Y
476-
driver->bindComputePipeline(fftPipeline_SSBOInput.get());
477-
driver->bindDescriptorSets(EPBP_COMPUTE, fftPipelineLayout_SSBOInput.get(), 0u, 1u, &fftDescriptorSet_Ker_FFT_Y.get(), nullptr);
478-
FFTClass::pushConstants(driver, fftPipelineLayout_SSBOInput.get(), paddedDim, paddedDim, FFTClass::Direction::Y, false, srcNumChannels);
479-
FFTClass::dispatchHelper(driver, fftDispatchInfo_Vertical);
480-
481524
// Ker Normalization
482525
driver->bindComputePipeline(fftPipeline_KernelNormalization.get());
483526
driver->bindDescriptorSets(EPBP_COMPUTE, fftPipelineLayout_KernelNormalization.get(), 0u, 1u, &fftDescriptorSet_KernelNormalization.get(), nullptr);
484527
{
485-
const uint32_t dispatchSizeX = (paddedDim.width*paddedDim.height*paddedDim.depth*srcNumChannels-1u)/FFTClass::DEFAULT_WORK_GROUP_SIZE+1u;
486-
driver->dispatch(dispatchSizeX,1,1);
528+
const uint32_t dispatchSizeX = (paddedDim.width-1u)/16u+1u;
529+
const uint32_t dispatchSizeY = (paddedDim.height-1u)/16u+1u;
530+
driver->dispatch(dispatchSizeX,dispatchSizeY,kerNumChannels);
487531
FFTClass::defaultBarrier();
488532
}
489533
}
@@ -494,7 +538,7 @@ int main()
494538

495539
// Convolution
496540
auto convolveDescriptorSet = driver->createGPUDescriptorSet(core::smart_refctd_ptr<const IGPUDescriptorSetLayout>(convolvePipelineLayout->getDescriptorSetLayout(0u)));
497-
updateDescriptorSet_Convolution(driver, convolveDescriptorSet.get(), fftOutputBuffer_0, fftOutputBuffer_KernelNormalized);
541+
updateDescriptorSet_Convolution(driver, convolveDescriptorSet.get(), fftOutputBuffer_0, kernelNormalizedSpectrums);
498542

499543
// Last IFFTX
500544
auto lastFFTDescriptorSet = driver->createGPUDescriptorSet(core::smart_refctd_ptr<const IGPUDescriptorSetLayout>(lastFFTPipelineLayout->getDescriptorSetLayout(0u)));
Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#version 430 core
2-
layout(local_size_x=256, local_size_y=1, local_size_z=1) in;
2+
layout(local_size_x=16, local_size_y=16, local_size_z=1) in;
33

44
#include "nbl/builtin/glsl/math/complex.glsl"
55

@@ -8,14 +8,19 @@ layout(set=0, binding=0) restrict readonly buffer InBuffer
88
nbl_glsl_complex in_data[];
99
};
1010

11-
layout(set=0, binding=1) restrict buffer OutBuffer
12-
{
13-
nbl_glsl_complex out_data[];
14-
};
11+
layout(set=0, binding=1, rg16f) uniform image2D NormalizedKernel[3];
1512

1613
void main()
1714
{
15+
const uvec3 sizes = gl_WorkGroupSize*gl_NumWorkGroups;
16+
const uvec3 strides = uvec3(1u,sizes.x,sizes.y*sizes.x);
17+
1818
const float power = length(in_data[0]);
19+
20+
nbl_glsl_complex value = in_data[gl_GlobalInvocationID.x*strides.x+gl_GlobalInvocationID.y*strides.y];
21+
value /= power;
22+
imageStore(NormalizedKernel[gl_WorkGroupID.z],ivec2(gl_GlobalInvocationID),vec4(value,0.0,0.0));
23+
1924
#if 0
2025
const uint k = bitfieldReverse(gl_GlobalInvocationID.x%2048u)>>21u;
2126
const uint l = bitfieldReverse(gl_GlobalInvocationID.x/2048u)>>21u;
@@ -24,5 +29,4 @@ void main()
2429
//shift.y = 0.f;
2530
out_data[gl_GlobalInvocationID.x] = nbl_glsl_complex_mul(in_data[gl_GlobalInvocationID.x],shift)/power;
2631
#endif
27-
out_data[gl_GlobalInvocationID.x] = in_data[gl_GlobalInvocationID.x]/power;
2832
}

0 commit comments

Comments
 (0)