Skip to content

Commit 708da4e

Browse files
nothing broken yet, despite radical changes
1 parent 195d3d3 commit 708da4e

File tree

4 files changed

+134
-23
lines changed

4 files changed

+134
-23
lines changed

examples_tests/39.DenoiserTonemapper/CommonPushConstants.h

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,14 @@ struct CommonPushConstants
1111
{
1212
uint inImageTexelPitch[3];
1313
uint imageWidth;
14-
15-
16-
// 1 if before denoise
17-
uint beforeDenoise;
14+
uint imageHeight;
1815

1916
// luma meter and tonemapping var but also for denoiser
2017
uint percentileRange[2];
2118
uint intensityBufferDWORDOffset;
2219
float denoiserExposureBias;
2320

24-
uint autoexposureOff;
21+
uint flags;
2522
// for the tonemapper
2623
uint tonemappingOperator;
2724
float tonemapperParams[2];

examples_tests/39.DenoiserTonemapper/ShaderCommon.glsl

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ layout(push_constant, row_major) uniform PushConstants{
1616
CommonPushConstants data;
1717
} pc;
1818
#define _NBL_GLSL_EXT_LUMA_METER_PUSH_CONSTANTS_DEFINED_
19+
#define _NBL_GLSL_EXT_FFT_PUSH_CONSTANTS_DEFINED_
20+
#define _NBL_GLSL_EXT_FFT_GET_PARAMETERS_DEFINED_
1921

2022

2123
#define SHARED_CHANNELS 3
@@ -29,7 +31,7 @@ struct f16vec3_packed
2931

3032
// luma metering stuff
3133
// these don't really influence anything, but we need to let the header know that we're using the same number of invocations as bins
32-
#define _NBL_GLSL_EXT_LUMA_METER_DISPATCH_SIZE_X_DEFINED_ 256
34+
#define _NBL_GLSL_EXT_LUMA_METER_DISPATCH_SIZE_X_DEFINED_ COMPUTE_WG_SIZE
3335
#define _NBL_GLSL_EXT_LUMA_METER_DISPATCH_SIZE_Y_DEFINED_ 1
3436

3537
#define _NBL_GLSL_EXT_LUMA_METER_MIN_LUMA_DEFINED_ 0x39800000
@@ -57,19 +59,20 @@ struct f16vec3_packed
5759
#define _NBL_GLSL_EXT_LUMA_METER_INVOCATION_COUNT (_NBL_GLSL_EXT_LUMA_METER_DISPATCH_SIZE_X_DEFINED_*_NBL_GLSL_EXT_LUMA_METER_DISPATCH_SIZE_Y_DEFINED_)
5860
#define _NBL_GLSL_EXT_LUMA_METER_BIN_COUNT _NBL_GLSL_EXT_LUMA_METER_INVOCATION_COUNT
5961
#define _NBL_GLSL_WORKGROUP_SIZE_ _NBL_GLSL_EXT_LUMA_METER_BIN_COUNT
62+
#define _NBL_GLSL_WORKGROUP_SIZE_LOG2_ 8
6063
#define _NBL_GLSL_EXT_LUMA_METER_BIN_GLOBAL_REPLICATION 4
6164
#ifdef _NBL_GLSL_EXT_LUMA_METER_FIRST_PASS_DEFINED_
6265
#include "nbl/builtin/glsl/ext/LumaMeter/impl.glsl"
6366

6467
// need to override the offset and color provision functions
6568
int nbl_glsl_ext_LumaMeter_getNextLumaOutputOffset()
6669
{
67-
return pc.data.beforeDenoise!=0u ? 1:0;
70+
return int(pc.data.flags&0x1u);
6871
}
6972

7073
int nbl_glsl_ext_LumaMeter_getCurrentLumaOutputOffset()
7174
{
72-
return pc.data.beforeDenoise!=0u ? 0:1;
75+
return int((~pc.data.flags)&0x1u);
7376
}
7477

7578
vec3 globalPixelData;

examples_tests/39.DenoiserTonemapper/main.cpp

Lines changed: 123 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,7 @@ layout(binding = 3, std430) restrict writeonly buffer IntensityBuffer
241241
242242
int nbl_glsl_ext_LumaMeter_getCurrentLumaOutputOffset()
243243
{
244-
return pc.data.beforeDenoise!=0u ? 0:1;
244+
return int((~pc.data.flags)&0x1u);
245245
}
246246
nbl_glsl_ext_LumaMeter_output_SPIRV_CROSS_is_dumb_t nbl_glsl_ext_ToneMapper_getLumaMeterOutput()
247247
{
@@ -254,18 +254,17 @@ nbl_glsl_ext_LumaMeter_output_SPIRV_CROSS_is_dumb_t nbl_glsl_ext_ToneMapper_getL
254254
void main()
255255
{
256256
const bool firstInvocation = all(equal(uvec3(0,0,0),gl_GlobalInvocationID));
257-
const bool beforeDenoise = pc.data.beforeDenoise!=0u;
258-
const bool autoexposureOn = pc.data.autoexposureOff==0u;
259257
260258
float optixIntensity = 1.0;
261-
if (beforeDenoise||autoexposureOn)
259+
if (bool(pc.data.flags&0x2u))
262260
{
263261
nbl_glsl_ext_LumaMeter_PassInfo_t lumaPassInfo;
264262
lumaPassInfo.percentileRange[0] = pc.data.percentileRange[0];
265263
lumaPassInfo.percentileRange[1] = pc.data.percentileRange[1];
266264
float measuredLumaLog2 = nbl_glsl_ext_LumaMeter_getMeasuredLumaLog2(nbl_glsl_ext_ToneMapper_getLumaMeterOutput(),lumaPassInfo);
267265
if (firstInvocation)
268266
{
267+
const bool beforeDenoise = bool(pc.data.flags&0x1u);
269268
measuredLumaLog2 += beforeDenoise ? pc.data.denoiserExposureBias:0.0;
270269
optixIntensity = nbl_glsl_ext_LumaMeter_getOptiXIntensity(measuredLumaLog2);
271270
}
@@ -284,10 +283,56 @@ layout(binding = 0, std430) restrict readonly buffer ImageInputBuffer
284283
{
285284
f16vec3_packed inBuffer[];
286285
};
286+
#define _NBL_GLSL_EXT_FFT_INPUT_DESCRIPTOR_DEFINED_
287287
layout(binding = 1, std430) restrict writeonly buffer ImageOutputBuffer
288288
{
289-
float16_t data[];
290-
} outBuffers[EII_COUNT]; // TODO: do FFT
289+
f16vec2 outBuffer[];
290+
};
291+
#define _NBL_GLSL_EXT_FFT_OUTPUT_DESCRIPTOR_DEFINED_
292+
293+
294+
295+
#include <nbl/builtin/glsl/math/complex.glsl>
296+
nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(ivec3 coordinate, in uint channel);
297+
#define _NBL_GLSL_EXT_FFT_GET_PADDED_DATA_DEFINED_
298+
299+
300+
uvec3 nbl_glsl_ext_FFT_Parameters_t_getDimensions()
301+
{
302+
return uvec3(pc.data.imageWidth,pc.data.imageHeight,1u);
303+
}
304+
bool nbl_glsl_ext_FFT_Parameters_t_getIsInverse()
305+
{
306+
return false;
307+
}
308+
uint nbl_glsl_ext_FFT_Parameters_t_getDirection()
309+
{
310+
return 0u;
311+
}
312+
uint nbl_glsl_ext_FFT_Parameters_t_getMaxChannel()
313+
{
314+
return 2u;
315+
}
316+
uint nbl_glsl_ext_FFT_Parameters_t_getLog2FFTSize()
317+
{
318+
return max(findMSB(pc.data.imageWidth-1u),_NBL_GLSL_WORKGROUP_SIZE_LOG2_)+1u;
319+
}
320+
uint nbl_glsl_ext_FFT_Parameters_t_getPaddingType()
321+
{
322+
return 3u; // _NBL_GLSL_EXT_FFT_PAD_MIRROR_;
323+
}
324+
#define _NBL_GLSL_EXT_FFT_PARAMETERS_METHODS_DECLARED_
325+
326+
327+
void nbl_glsl_ext_FFT_setData(in uvec3 coordinate, in uint channel, in nbl_glsl_complex complex_value)
328+
{
329+
const uint index = ((pc.data.imageHeight*channel+coordinate.x)<<nbl_glsl_ext_FFT_Parameters_t_getLog2FFTSize())+coordinate.y;
330+
outBuffer[index] = f16vec2(complex_value);
331+
}
332+
#define _NBL_GLSL_EXT_FFT_SET_DATA_DEFINED_
333+
334+
335+
291336
void main()
292337
{
293338
const uint dataOffset = gl_GlobalInvocationID.y*pc.data.imageWidth+gl_GlobalInvocationID.x;
@@ -296,6 +341,29 @@ void main()
296341
nbl_glsl_ext_LumaMeter(gl_GlobalInvocationID.x<pc.data.imageWidth);
297342
barrier();
298343
}
344+
345+
nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(ivec3 coordinate, in uint channel)
346+
{
347+
#if 0
348+
if (!nbl_glsl_ext_FFT_wrap_coord(coordinate))
349+
return nbl_glsl_complex(0.f,0.f);
350+
#endif
351+
const uint index = coordinate.y*pc.data.imageWidth+coordinate.x;
352+
float data;
353+
switch (channel)
354+
{
355+
case 2u:
356+
data = float(inBuffer[index].z);
357+
break;
358+
case 1u:
359+
data = float(inBuffer[index].y);
360+
break;
361+
default:
362+
data = float(inBuffer[index].x);
363+
break;
364+
}
365+
return nbl_glsl_complex(data,0.f);
366+
}
299367
)==="));
300368
auto interleaveAndLastFFTShader = driver->createGPUShader(core::make_smart_refctd_ptr<ICPUShader>(R"===(
301369
#version 450 core
@@ -306,14 +374,48 @@ layout(binding = 0, std430) restrict readonly buffer ImageInputBuffer
306374
{
307375
f16vec3_packed inBuffer[];
308376
};
377+
#define _NBL_GLSL_EXT_FFT_INPUT_DESCRIPTOR_DEFINED_
309378
layout(binding = 1, std430) restrict writeonly buffer ImageOutputBuffer
310379
{
311380
f16vec4 outBuffer[];
312381
};
382+
#define _NBL_GLSL_EXT_FFT_OUTPUT_DESCRIPTOR_DEFINED_
313383
layout(binding = 3, std430) restrict readonly buffer IntensityBuffer
314384
{
315385
float intensity[];
316386
};
387+
388+
389+
#include <nbl/builtin/glsl/math/complex.glsl>
390+
nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(ivec3 coordinate, in uint channel);
391+
#define _NBL_GLSL_EXT_FFT_GET_PADDED_DATA_DEFINED_
392+
393+
uvec3 nbl_glsl_ext_FFT_Parameters_t_getDimensions()
394+
{
395+
return uvec3(pc.data.imageWidth,pc.data.imageHeight,1u);
396+
}
397+
bool nbl_glsl_ext_FFT_Parameters_t_getIsInverse()
398+
{
399+
return true;
400+
}
401+
uint nbl_glsl_ext_FFT_Parameters_t_getDirection()
402+
{
403+
return 0u;
404+
}
405+
uint nbl_glsl_ext_FFT_Parameters_t_getMaxChannel()
406+
{
407+
return 2u;
408+
}
409+
uint nbl_glsl_ext_FFT_Parameters_t_getLog2FFTSize()
410+
{
411+
return 10u;
412+
}
413+
uint nbl_glsl_ext_FFT_Parameters_t_getPaddingType()
414+
{
415+
return 3u; // _NBL_GLSL_EXT_FFT_PAD_MIRROR_;
416+
}
417+
#define _NBL_GLSL_EXT_FFT_PARAMETERS_METHODS_DECLARED_
418+
317419
void main()
318420
{
319421
// TODO: compute iFFT of the image
@@ -352,6 +454,15 @@ void main()
352454
if (alive)
353455
outBuffer[dataOffset] = f16vec4(vec4(color,1.0));
354456
}
457+
458+
nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(ivec3 coordinate, in uint channel)
459+
{
460+
#if 0
461+
if (!nbl_glsl_ext_FFT_wrap_coord(coordinate))
462+
return nbl_glsl_complex(0.f,0.f);
463+
#endif
464+
return nbl_glsl_complex(0.f,0.f);
465+
}
355466
)==="));
356467
struct SpecializationConstants
357468
{
@@ -586,7 +697,7 @@ void main()
586697
auto* fftPushConstants = outParam.fftPushConstants;
587698
auto* fftDispatchInfo = outParam.fftDispatchInfo;
588699
const ISampler::E_TEXTURE_CLAMP fftPadding[2] = {ISampler::ETC_MIRROR,ISampler::ETC_MIRROR};
589-
const auto passes = FFTClass::buildParameters(false,colorChannelsFFT,extent,fftPushConstants,fftDispatchInfo,fftPadding,marginSrcDim);
700+
const auto passes = FFTClass::buildParameters<false>(false,colorChannelsFFT,extent,fftPushConstants,fftDispatchInfo,fftPadding,marginSrcDim);
590701
{
591702
// override for less work and storage (don't need to store the extra padding of the last axis after iFFT)
592703
fftPushConstants[1].output_strides.x = fftPushConstants[0].input_strides.x;
@@ -697,7 +808,6 @@ void main()
697808
temporaryPixelBuffer = driver->createDeviceLocalGPUBufferOnDedMem(tempBufferSize);
698809
if (check_error(!cuda::CCUDAHandler::defaultHandleResult(cuda::CCUDAHandler::registerBuffer(&temporaryPixelBuffer)),"Could not register buffer for Denoiser scratch memory!"))
699810
return error_code;
700-
// TODO: allocate scratch with Nabla again
701811
scratch = driver->createDeviceLocalGPUBufferOnDedMem(scratchBufferSize);
702812
if (check_error(!cuda::CCUDAHandler::defaultHandleResult(cuda::CCUDAHandler::registerBuffer(&scratch)), "Could not register buffer for Denoiser temporary memory with CUDA natively!"))
703813
return error_code;
@@ -717,13 +827,13 @@ void main()
717827
CommonPushConstants shaderConstants;
718828
{
719829
shaderConstants.imageWidth = param.width;
830+
shaderConstants.imageHeight = param.height;
720831
assert(intensityBufferOffset%IntensityValuesSize==0u);
721-
shaderConstants.beforeDenoise = 1u;
722832

723833
shaderConstants.intensityBufferDWORDOffset = intensityBufferOffset/IntensityValuesSize;
724834
shaderConstants.denoiserExposureBias = denoiserExposureBiasBundle[i].value();
725835

726-
shaderConstants.autoexposureOff = 0u;
836+
shaderConstants.flags = 0b11u; // (autoexposureOn<<1)|beforeDenoise
727837
switch (tonemapperBundle[i].first)
728838
{
729839
case DTEA_TONEMAPPER_REINHARD:
@@ -764,7 +874,7 @@ void main()
764874
if (core::isnan(key))
765875
{
766876
shaderConstants.tonemapperParams[0] = 0.18;
767-
shaderConstants.autoexposureOff = 1u;
877+
shaderConstants.flags &= 0b01u; // ~(autoexposureOn<<1)
768878
}
769879
else
770880
shaderConstants.tonemapperParams[0] = key;
@@ -961,8 +1071,8 @@ void main()
9611071
// compute post-processing
9621072
{
9631073
// let the shaders know we're in the second phase now
964-
shaderConstants.beforeDenoise = 0u;
965-
driver->pushConstants(sharedPipelineLayout.get(), video::IGPUSpecializedShader::ESS_COMPUTE, offsetof(CommonPushConstants,beforeDenoise), sizeof(uint32_t), &shaderConstants.beforeDenoise);
1074+
shaderConstants.flags &= 0b10u;
1075+
driver->pushConstants(sharedPipelineLayout.get(), video::IGPUSpecializedShader::ESS_COMPUTE, offsetof(CommonPushConstants,flags), sizeof(uint32_t), &shaderConstants.flags);
9661076
// Bloom
9671077
uint32_t workgroupCounts[2] = { (param.width + kComputeWGSize - 1u) / kComputeWGSize,param.height }; // TODO: change
9681078
{
@@ -989,7 +1099,6 @@ void main()
9891099
}
9901100

9911101
driver->bindComputePipeline(secondLumaMeterAndFirstFFTPipeline.get());
992-
//FFTClass::dispatchHelper(driver, imageFirstFFTPipelineLayout.get(), fftPushConstants[0], fftDispatchInfo[0]);
9931102
// dispatch
9941103
driver->dispatch(workgroupCounts[0],workgroupCounts[1],1u);
9951104
COpenGLExtensionHandler::extGlMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);

include/nbl/ext/FFT/FFT.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ class FFT final : public core::IReferenceCounted
4747
FFT(video::IDriver* driver, uint32_t maxDimensionSize, bool useHalfStorage = false);
4848

4949
// returns how many dispatches are necessary for computing the FFT and fills the uniform data
50+
template<bool unconstrainedAxisOrder=true>
5051
static inline uint32_t buildParameters(
5152
bool isInverse, uint32_t numChannels, const asset::VkExtent3D& inputDimensions,
5253
Parameters_t* outParams, DispatchInfo_t* outInfos, const asset::ISampler::E_TEXTURE_CLAMP* paddingType,
@@ -68,7 +69,8 @@ class FFT final : public core::IReferenceCounted
6869
continue;
6970
passes[passesRequired++] = {float(dim)/float((&inputDimensions.width)[i]),i,paddingType[i]};
7071
}
71-
std::sort(passes.begin(),passes.begin()+passesRequired);
72+
if (unconstrainedAxisOrder)
73+
std::sort(passes.begin(),passes.begin()+passesRequired);
7274
}
7375

7476
auto computeOutputStride = [](const uvec3& output_dimensions, const auto axis, const auto nextAxis) -> uvec4

0 commit comments

Comments
 (0)