@@ -336,22 +336,28 @@ void nbl_glsl_ext_FFT_setData(in uvec3 coordinate, in uint channel, in nbl_glsl_
336
336
#include "nbl/builtin/glsl/ext/FFT/default_compute_fft.comp"
337
337
338
338
339
+ float scaledLogLuma; // luma accumulator: zeroed in main() before each channel's load loop, added to by nbl_glsl_ext_FFT_getPaddedData on its default-channel path, then handed to nbl_glsl_ext_LumaMeter_workgroup_process
339
340
// Fetches one input sample for the FFT at a padded coordinate.
//
// `coordinate` is first passed through nbl_glsl_ext_FFT_wrap_coord and the result is used to
// address `inBuffer` as a row-major image that is `pc.data.imageWidth` texels wide.
// `channel` selects which component becomes the real part of the returned complex value:
//   2u -> .z, 1u -> .y, anything else -> .x
// The imaginary part is always returned as zero.
//
// Side effect: on the default (.x) path only, the texel's three components are also handed to
// nbl_glsl_ext_LumaMeter_local_process and the result is added to the global `scaledLogLuma`.
// The flag passed along is true only when wrapping left the coordinate unchanged
// (presumably meaning the texel lies inside the original image rather than the padding - confirm
// against nbl_glsl_ext_FFT_wrap_coord).
nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(ivec3 coordinate, in uint channel)
{
	// keep the pre-wrap coordinate so we can tell whether wrapping moved it
	ivec3 oldCoord = coordinate;
	nbl_glsl_ext_FFT_wrap_coord(coordinate);

	const uint index = coordinate.y*pc.data.imageWidth+coordinate.x;

	// rewrite this fetch at some point
	nbl_glsl_complex retval;
	// Only the real part is sourced from the buffer; without this write the imaginary part
	// would be returned uninitialized and feed undefined values into the FFT.
	retval[1] = 0.f;
	switch (channel)
	{
		case 2u:
			retval[0] = float(inBuffer[index].z);
			break;
		case 1u:
			retval[0] = float(inBuffer[index].y);
			break;
		default:
			// luma metering piggybacks on the first channel's fetch so each texel is metered once per pass
			scaledLogLuma += nbl_glsl_ext_LumaMeter_local_process(all(equal(coordinate,oldCoord)),vec3(inBuffer[index].x,inBuffer[index].y,inBuffer[index].z));
			retval[0] = float(inBuffer[index].x);
			break;
	}
	return retval;
}
@@ -363,13 +369,36 @@ void main()
363
369
#endif
364
370
nbl_glsl_ext_LumaMeter_clearFirstPassOutput();
365
371
366
- //
367
372
368
- // prevent overlap between different usages of shared memory
369
- barrier();
373
+ for(uint channel=0u; channel<3u; channel++)
374
+ {
375
+ // Virtual Threads Calculation
376
+ const uint log2FFTSize = nbl_glsl_ext_FFT_Parameters_t_getLog2FFTSize();
377
+ const uint item_per_thread_count = 0x1u<<(log2FFTSize-_NBL_GLSL_WORKGROUP_SIZE_LOG2_);
370
378
371
- for(uint ch=0u; ch<=nbl_glsl_ext_FFT_Parameters_t_getMaxChannel(); ++ch)
372
- nbl_glsl_ext_FFT(nbl_glsl_ext_FFT_Parameters_t_getIsInverse(),ch);
379
+ scaledLogLuma = 0.f;
380
+ // Load Values into local memory
381
+ for(uint t=0u; t<item_per_thread_count; t++)
382
+ {
383
+ const uint tid = (t<<_NBL_GLSL_WORKGROUP_SIZE_LOG2_)|gl_LocalInvocationIndex;
384
+ const uint trueDim = nbl_glsl_ext_FFT_Parameters_t_getDimensions()[nbl_glsl_ext_FFT_Parameters_t_getDirection()];
385
+ nbl_glsl_ext_FFT_impl_values[t] = nbl_glsl_ext_FFT_getPaddedData(nbl_glsl_ext_FFT_getPaddedCoordinates(tid,log2FFTSize,trueDim),channel);
386
+ }
387
+ if (channel==0u)
388
+ {
389
+ nbl_glsl_ext_LumaMeter_setFirstPassOutput(nbl_glsl_ext_LumaMeter_workgroup_process(scaledLogLuma));
390
+ // prevent overlap between different usages of shared memory
391
+ barrier();
392
+ }
393
+ // do FFT
394
+ nbl_glsl_ext_FFT_preloaded(false,log2FFTSize);
395
+ // write out to main memory
396
+ for(uint t=0u; t<item_per_thread_count; t++)
397
+ {
398
+ const uint tid = (t<<_NBL_GLSL_WORKGROUP_SIZE_LOG2_)|gl_LocalInvocationIndex;
399
+ nbl_glsl_ext_FFT_setData(nbl_glsl_ext_FFT_getCoordinates(tid),channel,nbl_glsl_ext_FFT_impl_values[t]);
400
+ }
401
+ }
373
402
}
374
403
)===" ));
375
404
auto interleaveAndLastFFTShader = driver->createGPUShader (core::make_smart_refctd_ptr<ICPUShader>(R"===(
@@ -970,7 +999,7 @@ nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(ivec3 coordinate, in uint channe
970
999
{
971
1000
uint64_t deinterleavedPixelBytesize = getTexelOrBlockBytesize<EF_R16G16B16A16_SFLOAT>(); // TODO do it with EF_R16G16B16_SFLOAT
972
1001
outImageByteOffset[j] = j*param.width *param.height *deinterleavedPixelBytesize;
973
- attachBufferImageRange (EII_COUNT+j,temporaryPixelBuffer.getObject (),outImageByteOffset[j],deinterleavedPixelBytesize);
1002
+ attachBufferImageRange (EII_COUNT+j,temporaryPixelBuffer.getObject (),outImageByteOffset[j],j ? deinterleavedPixelBytesize:fftScratchSize );
974
1003
}
975
1004
attachWholeBuffer (EII_COUNT*2u ,histogramBuffer.get ());
976
1005
attachWholeBuffer (EII_COUNT*2u +1u ,intensityBuffer.getObject ());
@@ -1107,7 +1136,7 @@ nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(ivec3 coordinate, in uint channe
1107
1136
1108
1137
driver->bindComputePipeline (secondLumaMeterAndFirstFFTPipeline.get ());
1109
1138
// dispatch
1110
- driver->dispatch (workgroupCounts [0 ],workgroupCounts [1 ],1u );
1139
+ driver->dispatch (param. fftDispatchInfo [0 ]. workGroupCount [ 0 ],param. fftDispatchInfo [ 0 ]. workGroupCount [1 ],1u );
1111
1140
COpenGLExtensionHandler::extGlMemoryBarrier (GL_SHADER_STORAGE_BARRIER_BIT);
1112
1141
1113
1142
// TODO: do X-axis pass of the DFFT
0 commit comments