@@ -289,13 +289,10 @@ class CBlitImageFilter : public CImageFilter<CBlitImageFilter<Normalize,Clamp,Sw
289
289
290
290
// filtering and alpha handling happens separately for every layer, so save on scratch memory size
291
291
const auto inImageType = inParams.type ;
292
- const auto window_last = [&kernelX,&kernelY,&kernelZ]() -> core::vectorSIMDi32
293
- {
294
- return core::vectorSIMDi32 (kernelX.getWindowSize ().x -1 ,kernelY.getWindowSize ().y -1 ,kernelZ.getWindowSize ().z -1 ,0 );
295
- }();
292
+ const auto window_end = getWindowEnd (inImageType,kernelX,kernelY,kernelZ);
296
293
const core::vectorSIMDi32 intermediateExtent[3 ] = {
297
- core::vectorSIMDi32 (outExtent.width ,inExtent.height +window_last [1 ],inExtent.depth +window_last [2 ]),
298
- core::vectorSIMDi32 (outExtent.width ,outExtent.height ,inExtent.depth +window_last [2 ]),
294
+ core::vectorSIMDi32 (outExtent.width ,inExtent.height +window_end [1 ],inExtent.depth +window_end [2 ]),
295
+ core::vectorSIMDi32 (outExtent.width ,outExtent.height ,inExtent.depth +window_end [2 ]),
299
296
core::vectorSIMDi32 (outExtent.width ,outExtent.height ,outExtent.depth )
300
297
};
301
298
const core::vectorSIMDi32 intermediateLastCoord[3 ] = {
@@ -465,7 +462,7 @@ class CBlitImageFilter : public CImageFilter<CBlitImageFilter<Normalize,Clamp,Sw
465
462
lineBuffer = intermediateStorage[axis-1 ]+core::dot (static_cast <const core::vectorSIMDi32&>(intermediateStrides[axis-1 ]),localTexCoord)[0 ];
466
463
else
467
464
{
468
- const auto windowEnd = inExtent.width +window_last .x ;
465
+ const auto windowEnd = inExtent.width +window_end .x ;
469
466
decode_offset = alloc_decode_scratch ();
470
467
lineBuffer = intermediateStorage[1 ]+decode_offset*MaxChannels*windowEnd;
471
468
for (auto & i=localTexCoord.x ; i<windowEnd; i++)
@@ -566,6 +563,21 @@ class CBlitImageFilter : public CImageFilter<CBlitImageFilter<Normalize,Clamp,Sw
566
563
567
564
private:
568
565
// Scale factor used by `getScratchOffset`: it multiplies `(inExtent.width+window_end[0])*std::thread::hardware_concurrency()` when sizing the second scratch region.
// NOTE(review): presumably the number of lines an AVX2-vectorized STL loop may process per thread — confirm.
static inline constexpr uint32_t VectorizationBoundSTL = /* AVX2*/ 16u ;
566
+ // Builds the per-axis kernel window size vector; callers add its components to the input extent to size padded intermediate images and scratch memory.
// @param inImageType  image type of the input; decides which axes get a non-zero entry
// @param kernelX      scaled kernel for X, its `getWindowSize().x` is always used
// @param kernelY      scaled kernel for Y, only queried when `inImageType>=IImage::ET_2D`
// @param kernelZ      scaled kernel for Z, only queried when `inImageType>=IImage::ET_3D`
// @return vector of (X window, Y window or 0, Z window or 0, 0)
// NOTE(review): all three components are read from `getWindowSize().x`, including for kernelY and kernelZ — confirm each kernel reports its size in `.x` (as opposed to `.y`/`.z`).
567
+ static inline core::vectorSIMDi32 getWindowEnd (const IImage::E_TYPE inImageType,
568
+ const CScaledImageFilterKernel<KernelX>& kernelX,
569
+ const CScaledImageFilterKernel<KernelY>& kernelY,
570
+ const CScaledImageFilterKernel<KernelZ>& kernelZ
571
+ )
572
+ {
573
+ // TODO: investigate properly whether it's supposed to be `size` or `size-1` (polyphase kinda shows the need for `size`)
574
// start with only the X axis set; Y, Z and W stay 0 unless enabled by the image type below
+ core::vectorSIMDi32 last (kernelX.getWindowSize ().x ,0 ,0 ,0 );
575
+ if (inImageType>=IImage::ET_2D)
576
+ last.y = kernelY.getWindowSize ().x ;
577
+ if (inImageType>=IImage::ET_3D)
578
+ last.z = kernelZ.getWindowSize ().x ;
579
+ return last;
580
+ }
569
581
// the blit filter will filter one axis at a time, hence necessitating "ping ponging" between two scratch buffers
570
582
static inline uint32_t getScratchOffset (const state_type* state, bool secondPong)
571
583
{
@@ -574,17 +586,14 @@ class CBlitImageFilter : public CImageFilter<CBlitImageFilter<Normalize,Clamp,Sw
574
586
const auto kernelY = state->contructScaledKernel (state->kernelY );
575
587
const auto kernelZ = state->contructScaledKernel (state->kernelZ );
576
588
577
- const auto window_last = [&kernelX,&kernelY,&kernelZ]() -> core::vectorSIMDi32
578
- {
579
- return core::vectorSIMDi32 (kernelX.getWindowSize ().x -1 ,kernelY.getWindowSize ().y -1 ,kernelZ.getWindowSize ().z -1 ,0 );
580
- }();
589
+ const auto window_end = getWindowEnd (state->inImage ->getCreationParameters ().type ,kernelX,kernelY,kernelZ);
581
590
// TODO: account for the size needed for coverage adjustment
582
591
// the first pass will be along X, so new temporary image will have the width of the output extent, but the height and depth will need to be padded
583
592
// but the last pass will be along Z and the new temporary image will have the exact dimensions of `outExtent` which is why there is a `core::max`
584
- auto texelCount = state->outExtent .width *core::max<uint32_t >((state->inExtent .height +window_last [1 ])*(state->inExtent .depth +window_last [2 ]),state->outExtent .height *state->outExtent .depth );
593
+ auto texelCount = state->outExtent .width *core::max<uint32_t >((state->inExtent .height +window_end [1 ])*(state->inExtent .depth +window_end [2 ]),state->outExtent .height *state->outExtent .depth );
585
594
// the second pass will result in an image that has the width and height equal to `outExtent`
586
595
if (secondPong)
587
- texelCount += core::max<uint32_t >(state->outExtent .width *state->outExtent .height *(state->inExtent .depth +window_last [2 ]),(state->inExtent .width +window_last [0 ])*std::thread::hardware_concurrency ()*VectorizationBoundSTL);
596
+ texelCount += core::max<uint32_t >(state->outExtent .width *state->outExtent .height *(state->inExtent .depth +window_end [2 ]),(state->inExtent .width +window_end [0 ])*std::thread::hardware_concurrency ()*VectorizationBoundSTL);
588
597
// obviously we have multiple channels and each channel has a certain type for arithmetic
589
598
return texelCount*MaxChannels*sizeof (value_type);
590
599
}
0 commit comments