Skip to content

Commit afd9cb4

Browse files
more preparation for half-floats
1 parent c3326d2 commit afd9cb4

File tree

3 files changed

+130
-120
lines changed

3 files changed

+130
-120
lines changed

examples_tests/49.ComputeFFT/main.cpp

Lines changed: 20 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -18,18 +18,15 @@ using namespace nbl::core;
1818
using namespace nbl::asset;
1919
using namespace nbl::video;
2020

21-
#include "nbl/core/math/intutil.h"
22-
#include "nbl/core/math/glslFunctions.h"
21+
using FFTClass = ext::FFT::FFT;
2322

2423
constexpr uint32_t channelCountOverride = 3u;
2524

2625
inline core::smart_refctd_ptr<video::IGPUSpecializedShader> createShader(
2726
video::IVideoDriver* driver,
28-
uint32_t maxDimensionSize,
27+
const FFTClass* fft,
2928
const char* includeMainName)
3029
{
31-
const uint32_t maxPaddedDimensionSize = core::roundUpToPoT(maxDimensionSize);
32-
3330
const char* sourceFmt =
3431
R"===(#version 430 core
3532
@@ -48,8 +45,8 @@ R"===(#version 430 core
4845
snprintf(
4946
reinterpret_cast<char*>(shader->getPointer()),shader->getSize(), sourceFmt,
5047
DEFAULT_WORK_GROUP_SIZE,
51-
maxPaddedDimensionSize,
52-
0u,
48+
fft->getMaxFFTLength(),
49+
fft->usesHalfFloatStorage() ? 1u:0u,
5350
includeMainName
5451
);
5552

@@ -107,10 +104,8 @@ inline void updateDescriptorSet_Convolution (
107104
driver->updateDescriptorSets(descCount, pWrites, 0u, nullptr);
108105
}
109106

110-
static inline core::smart_refctd_ptr<video::IGPUPipelineLayout> getPipelineLayout_LastFFT(video::IVideoDriver* driver) {
111-
112-
using FFTClass = ext::FFT::FFT;
113-
107+
static inline core::smart_refctd_ptr<video::IGPUPipelineLayout> getPipelineLayout_LastFFT(video::IVideoDriver* driver)
108+
{
114109
static IGPUDescriptorSetLayout::SBinding bnd[] =
115110
{
116111
{
@@ -247,8 +242,6 @@ int main()
247242
}
248243

249244
// agree on formats
250-
using FFTClass = ext::FFT::FFT;
251-
252245
const E_FORMAT srcFormat = srcImageView->getCreationParameters().format;
253246
uint32_t srcNumChannels = getFormatChannelCount(srcFormat);
254247
uint32_t kerNumChannels = getFormatChannelCount(kerImageView->getCreationParameters().format);
@@ -293,7 +286,6 @@ int main()
293286
}
294287
};
295288

296-
using FFTClass = ext::FFT::FFT;
297289
core::SRange<const asset::SPushConstantRange> pcRange = FFTClass::getDefaultPushConstantRanges();
298290
core::SRange<const video::IGPUDescriptorSetLayout::SBinding> bindings = {bnd,bnd+sizeof(bnd)/sizeof(IGPUDescriptorSetLayout::SBinding)};
299291

@@ -350,10 +342,10 @@ int main()
350342
);
351343
}();
352344

353-
const VkExtent3D paddedDim = FFTClass::padDimensionToNextPOT(srcDim);
345+
constexpr bool useHalfFloats = false;
354346
// Allocate Output Buffer
355-
auto fftOutputBuffer_0 = driver->createDeviceLocalGPUBufferOnDedMem(FFTClass::getOutputBufferSize(paddedDim, srcNumChannels)); // result of: srcFFTX and kerFFTX and Convolution and IFFTY
356-
auto fftOutputBuffer_1 = driver->createDeviceLocalGPUBufferOnDedMem(FFTClass::getOutputBufferSize(paddedDim, srcNumChannels)); // result of: srcFFTY and IFFTX
347+
auto fftOutputBuffer_0 = driver->createDeviceLocalGPUBufferOnDedMem(FFTClass::getOutputBufferSize(useHalfFloats,srcDim,srcNumChannels));
348+
auto fftOutputBuffer_1 = driver->createDeviceLocalGPUBufferOnDedMem(FFTClass::getOutputBufferSize(useHalfFloats,srcDim,srcNumChannels));
357349
core::smart_refctd_ptr<IGPUImageView> kernelNormalizedSpectrums[channelCountOverride];
358350

359351
auto updateDescriptorSet = [driver](video::IGPUDescriptorSet* set, core::smart_refctd_ptr<IGPUImageView> inputImageDescriptor, asset::ISampler::E_TEXTURE_CLAMP textureWrap, core::smart_refctd_ptr<IGPUBuffer> outputBufferDescriptor) -> void
@@ -409,7 +401,7 @@ int main()
409401
// Precompute Kernel FFT
410402
{
411403
const auto kerDim = kerImageView->getCreationParameters().image->getCreationParameters().extent;
412-
const VkExtent3D paddedKerDim = FFTClass::padDimensionToNextPOT(kerDim);
404+
const VkExtent3D paddedKerDim = FFTClass::padDimensions(kerDim);
413405

414406
// create kernel spectrums
415407
auto createKernelSpectrum = [&]() -> auto
@@ -442,7 +434,8 @@ int main()
442434
updateDescriptorSet(fftDescriptorSet_Ker_FFT_X.get(), kerImageView, ISampler::ETC_CLAMP_TO_BORDER, fftOutputBuffer_0);
443435

444436
// Ker FFT Y
445-
auto fftPipeline_SSBOInput = FFTClass::getDefaultPipeline(driver,kerDim.height);
437+
auto fft_y = core::make_smart_refctd_ptr<FFTClass>(driver,kerDim.height,useHalfFloats);
438+
auto fftPipeline_SSBOInput = fft_y->getDefaultPipeline();
446439
auto fftDescriptorSet_Ker_FFT_Y = driver->createGPUDescriptorSet(core::smart_refctd_ptr<const IGPUDescriptorSetLayout>(fftPipeline_SSBOInput->getLayout()->getDescriptorSetLayout(0u)));
447440
FFTClass::updateDescriptorSet(driver, fftDescriptorSet_Ker_FFT_Y.get(), fftOutputBuffer_0, fftOutputBuffer_1);
448441

@@ -527,14 +520,15 @@ int main()
527520

528521
// Ker Image FFT X
529522
{
530-
auto fftPipeline_ImageInput = driver->createGPUComputePipeline(nullptr,core::smart_refctd_ptr(imageFirstFFTPipelineLayout),createShader(driver, paddedKerDim.width, "../image_first_fft.comp"));
523+
auto fft_x = core::make_smart_refctd_ptr<FFTClass>(driver,kerDim.height,useHalfFloats);
524+
auto fftPipeline_ImageInput = driver->createGPUComputePipeline(nullptr,core::smart_refctd_ptr(imageFirstFFTPipelineLayout),createShader(driver,fft_x.get(),"../image_first_fft.comp"));
531525
driver->bindComputePipeline(fftPipeline_ImageInput.get());
532526
driver->bindDescriptorSets(EPBP_COMPUTE, imageFirstFFTPipelineLayout.get(), 0u, 1u, &fftDescriptorSet_Ker_FFT_X.get(), nullptr);
533527
FFTClass::dispatchHelper(driver, imageFirstFFTPipelineLayout.get(), fftPushConstants[0], fftDispatchInfo[0]);
534528
}
535529

536530
// Ker Image FFT Y
537-
driver->bindComputePipeline(fftPipeline_SSBOInput.get());
531+
driver->bindComputePipeline(fftPipeline_SSBOInput);
538532
driver->bindDescriptorSets(EPBP_COMPUTE, fftPipeline_SSBOInput->getLayout(), 0u, 1u, &fftDescriptorSet_Ker_FFT_Y.get(), nullptr);
539533
FFTClass::dispatchHelper(driver, fftPipeline_SSBOInput->getLayout(), fftPushConstants[1], fftDispatchInfo[1]);
540534

@@ -567,9 +561,11 @@ int main()
567561
}
568562

569563
// pipelines
570-
auto fftPipeline_ImageInput = driver->createGPUComputePipeline(nullptr,core::smart_refctd_ptr(imageFirstFFTPipelineLayout),createShader(driver, paddedDim.width, "../image_first_fft.comp"));
571-
auto convolvePipeline = driver->createGPUComputePipeline(nullptr, std::move(convolvePipelineLayout), createShader(driver, paddedDim.height, "../fft_convolve_ifft.comp"));
572-
auto lastFFTPipeline = driver->createGPUComputePipeline(nullptr, getPipelineLayout_LastFFT(driver), createShader(driver, paddedDim.width, "../last_fft.comp"));
564+
auto fft_x = core::make_smart_refctd_ptr<FFTClass>(driver,srcDim.width,useHalfFloats);
565+
auto fft_y = core::make_smart_refctd_ptr<FFTClass>(driver,srcDim.height,useHalfFloats);
566+
auto fftPipeline_ImageInput = driver->createGPUComputePipeline(nullptr,core::smart_refctd_ptr(imageFirstFFTPipelineLayout),createShader(driver,fft_x.get(), "../image_first_fft.comp"));
567+
auto convolvePipeline = driver->createGPUComputePipeline(nullptr, std::move(convolvePipelineLayout), createShader(driver,fft_y.get(), "../fft_convolve_ifft.comp"));
568+
auto lastFFTPipeline = driver->createGPUComputePipeline(nullptr, getPipelineLayout_LastFFT(driver), createShader(driver,fft_x.get(), "../last_fft.comp"));
573569

574570
// Src FFT X
575571
auto fftDescriptorSet_Src_FFT_X = driver->createGPUDescriptorSet(core::smart_refctd_ptr<const IGPUDescriptorSetLayout>(imageFirstFFTPipelineLayout->getDescriptorSetLayout(0u)));

include/nbl/ext/FFT/FFT.h

Lines changed: 51 additions & 62 deletions
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,7 @@ struct alignas(16) uvec4 {
2626
};
2727
#include "nbl/builtin/glsl/ext/FFT/parameters_struct.glsl";
2828

29-
class FFT : public core::TotalInterface
29+
class FFT final : public core::IReferenceCounted
3030
{
3131
public:
3232
struct Parameters_t alignas(16) : nbl_glsl_ext_FFT_Parameters_t
@@ -46,6 +46,7 @@ class FFT : public core::TotalInterface
4646
};
4747

4848
_NBL_STATIC_INLINE_CONSTEXPR uint32_t DEFAULT_WORK_GROUP_SIZE = 256u;
49+
FFT(video::IDriver* driver, uint32_t maxDimensionSize, bool useHalfStorage = false);
4950

5051
// returns how many dispatches necessary for computing the FFT and fills the uniform data
5152
static inline uint32_t buildParameters(bool isInverse, uint32_t numChannels, const asset::VkExtent3D& inputDimensions, Parameters_t* outParams, DispatchInfo_t* outInfos, const PaddingType* paddingType)
@@ -54,7 +55,7 @@ class FFT : public core::TotalInterface
5455

5556
if (numChannels)
5657
{
57-
const auto paddedInputDimensions = padDimensionToNextPOT(inputDimensions);
58+
const auto paddedInputDimensions = padDimensions(inputDimensions);
5859
for (uint32_t i=0u; i<3u; i++)
5960
if ((&inputDimensions.width)[i]>1u)
6061
{
@@ -70,8 +71,7 @@ class FFT : public core::TotalInterface
7071
params.input_dimensions.y = inputDimensions.height;
7172
params.input_dimensions.z = inputDimensions.depth;
7273
{
73-
// round up to workgroup size if too small
74-
const uint32_t fftSize = core::max(DEFAULT_WORK_GROUP_SIZE,(&paddedInputDimensions.width)[i]);
74+
const uint32_t fftSize = (&paddedInputDimensions.width)[i];
7575

7676
params.input_dimensions.w = (isInverse ? 0x80000000u:0x0u)|
7777
(i<<28u)| // direction
@@ -95,78 +95,54 @@ class FFT : public core::TotalInterface
9595
return passesRequired;
9696
}
9797

98-
// TODO: remove?
99-
static inline asset::VkExtent3D padDimensionToNextPOT(asset::VkExtent3D dimension, asset::VkExtent3D const & minimum_dimension = asset::VkExtent3D{ 1, 1, 1 })
98+
static inline asset::VkExtent3D padDimensions(asset::VkExtent3D dimension)
10099
{
101-
if(dimension.width < minimum_dimension.width)
102-
dimension.width = minimum_dimension.width;
103-
if(dimension.height < minimum_dimension.height)
104-
dimension.height = minimum_dimension.height;
105-
if(dimension.depth < minimum_dimension.depth)
106-
dimension.depth = minimum_dimension.depth;
107-
108-
dimension.width = core::roundUpToPoT(dimension.width);
109-
dimension.height = core::roundUpToPoT(dimension.height);
110-
dimension.depth = core::roundUpToPoT(dimension.depth);
111-
100+
static_assert(core::isPoT(MINIMUM_FFT_SIZE),"MINIMUM_FFT_SIZE needs to be Power of Two!");
101+
for (auto i=0u; i<3u; i++)
102+
{
103+
auto& coord = (&dimension.width)[i];
104+
if (coord<=1u)
105+
continue;
106+
coord = core::max(core::roundUpToPoT(coord),MINIMUM_FFT_SIZE);
107+
}
112108
return dimension;
113109
}
114110

115111
//
116112
static core::SRange<const asset::SPushConstantRange> getDefaultPushConstantRanges();
117113

118114
//
119-
static core::smart_refctd_ptr<video::IGPUDescriptorSetLayout> getDefaultDescriptorSetLayout(video::IVideoDriver* driver);
115+
inline auto getDefaultDescriptorSetLayout() const {return m_dsLayout.get();}
120116

121117
//
122-
static core::smart_refctd_ptr<video::IGPUPipelineLayout> getDefaultPipelineLayout(video::IVideoDriver* driver);
118+
inline auto getDefaultPipelineLayout() const {return m_pplnLayout.get();}
119+
120+
//
121+
inline auto getDefaultPipeline() const {return m_ppln.get();}
122+
123+
//
124+
inline uint32_t getMaxFFTLength() const { return m_maxFFTLen; }
125+
inline bool usesHalfFloatStorage() const { return m_halfFloatStorage; }
123126

124-
// TODO: rework?
125-
static inline size_t getOutputBufferSize(asset::VkExtent3D const & paddedInputDimensions, uint32_t numChannels)
127+
//
128+
static inline size_t getOutputBufferSize(bool _halfFloatStorage, const asset::VkExtent3D& inputDimensions, uint32_t numChannels, bool realInput=false)
126129
{
127-
assert(core::isPoT(paddedInputDimensions.width) && core::isPoT(paddedInputDimensions.height) && core::isPoT(paddedInputDimensions.depth));
128-
return (paddedInputDimensions.width * paddedInputDimensions.height * paddedInputDimensions.depth * numChannels) * (sizeof(float) * 2);
130+
size_t retval = getOutputBufferSize_impl(inputDimensions,numChannels);
131+
if (!realInput)
132+
retval <<= 1u;
133+
return retval*(_halfFloatStorage ? sizeof(uint16_t):sizeof(uint32_t));
129134
}
130-
131-
static core::smart_refctd_ptr<video::IGPUComputePipeline> getDefaultPipeline(video::IVideoDriver* driver, uint32_t maxDimensionSize);
132-
133-
_NBL_STATIC_INLINE_CONSTEXPR uint32_t MAX_DESCRIPTOR_COUNT = 2u;
134-
static inline void updateDescriptorSet(
135-
video::IVideoDriver * driver,
136-
video::IGPUDescriptorSet * set,
137-
core::smart_refctd_ptr<video::IGPUBuffer> inputBufferDescriptor,
138-
core::smart_refctd_ptr<video::IGPUBuffer> outputBufferDescriptor)
135+
inline size_t getOutputBufferSize(const asset::VkExtent3D& inputDimensions, uint32_t numChannels, bool realInput = false)
139136
{
140-
video::IGPUDescriptorSet::SDescriptorInfo pInfos[MAX_DESCRIPTOR_COUNT];
141-
video::IGPUDescriptorSet::SWriteDescriptorSet pWrites[MAX_DESCRIPTOR_COUNT];
142-
143-
for (auto i=0; i< MAX_DESCRIPTOR_COUNT; i++)
144-
{
145-
pWrites[i].dstSet = set;
146-
pWrites[i].arrayElement = 0u;
147-
pWrites[i].count = 1u;
148-
pWrites[i].info = pInfos+i;
149-
}
150-
151-
// Input Buffer
152-
pWrites[0].binding = 0;
153-
pWrites[0].descriptorType = asset::EDT_STORAGE_BUFFER;
154-
pWrites[0].count = 1;
155-
pInfos[0].desc = inputBufferDescriptor;
156-
pInfos[0].buffer.size = inputBufferDescriptor->getSize();
157-
pInfos[0].buffer.offset = 0u;
158-
159-
// Output Buffer
160-
pWrites[1].binding = 1;
161-
pWrites[1].descriptorType = asset::EDT_STORAGE_BUFFER;
162-
pWrites[1].count = 1;
163-
pInfos[1].desc = outputBufferDescriptor;
164-
pInfos[1].buffer.size = outputBufferDescriptor->getSize();
165-
pInfos[1].buffer.offset = 0u;
166-
167-
driver->updateDescriptorSets(2u, pWrites, 0u, nullptr);
137+
return getOutputBufferSize(m_halfFloatStorage,inputDimensions,numChannels,realInput);
168138
}
169139

140+
static void updateDescriptorSet(
141+
video::IVideoDriver* driver,
142+
video::IGPUDescriptorSet* set,
143+
core::smart_refctd_ptr<video::IGPUBuffer> inputBufferDescriptor,
144+
core::smart_refctd_ptr<video::IGPUBuffer> outputBufferDescriptor);
145+
170146
static inline void dispatchHelper(
171147
video::IVideoDriver* driver,
172148
const video::IGPUPipelineLayout* pipelineLayout,
@@ -184,8 +160,21 @@ class FFT : public core::TotalInterface
184160
static void defaultBarrier();
185161

186162
private:
187-
FFT() = delete;
188-
//~FFT() = delete;
163+
_NBL_STATIC_INLINE_CONSTEXPR uint32_t MINIMUM_FFT_SIZE = DEFAULT_WORK_GROUP_SIZE<<1u;
164+
~FFT() {}
165+
166+
//
167+
static inline size_t getOutputBufferSize_impl(const asset::VkExtent3D& inputDimensions, uint32_t numChannels)
168+
{
169+
const auto paddedInputDimensions = padDimensions(inputDimensions);
170+
return paddedInputDimensions.width*paddedInputDimensions.height*paddedInputDimensions.depth*numChannels;
171+
}
172+
173+
core::smart_refctd_ptr<video::IGPUDescriptorSetLayout> m_dsLayout;
174+
core::smart_refctd_ptr<video::IGPUPipelineLayout> m_pplnLayout;
175+
core::smart_refctd_ptr<video::IGPUComputePipeline> m_ppln;
176+
uint32_t m_maxFFTLen;
177+
bool m_halfFloatStorage;
189178
};
190179

191180

0 commit comments

Comments
 (0)