Skip to content

Commit d91717e

Browse files
fix a crashbug (and speed up 1D and 2D blits by a factor of 4), also incorporate fix suggestion by @Achal
1 parent a38c437 commit d91717e

File tree

2 files changed

+23
-14
lines changed

2 files changed

+23
-14
lines changed

include/nbl/asset/filters/CBlitImageFilter.h

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -289,13 +289,10 @@ class CBlitImageFilter : public CImageFilter<CBlitImageFilter<Normalize,Clamp,Sw
289289

290290
// filtering and alpha handling happens separately for every layer, so save on scratch memory size
291291
const auto inImageType = inParams.type;
292-
const auto window_last = [&kernelX,&kernelY,&kernelZ]() -> core::vectorSIMDi32
293-
{
294-
return core::vectorSIMDi32(kernelX.getWindowSize().x-1,kernelY.getWindowSize().y-1,kernelZ.getWindowSize().z-1,0);
295-
}();
292+
const auto window_end = getWindowEnd(inImageType,kernelX,kernelY,kernelZ);
296293
const core::vectorSIMDi32 intermediateExtent[3] = {
297-
core::vectorSIMDi32(outExtent.width,inExtent.height+window_last[1],inExtent.depth+window_last[2]),
298-
core::vectorSIMDi32(outExtent.width,outExtent.height,inExtent.depth+window_last[2]),
294+
core::vectorSIMDi32(outExtent.width,inExtent.height+window_end[1],inExtent.depth+window_end[2]),
295+
core::vectorSIMDi32(outExtent.width,outExtent.height,inExtent.depth+window_end[2]),
299296
core::vectorSIMDi32(outExtent.width,outExtent.height,outExtent.depth)
300297
};
301298
const core::vectorSIMDi32 intermediateLastCoord[3] = {
@@ -465,7 +462,7 @@ class CBlitImageFilter : public CImageFilter<CBlitImageFilter<Normalize,Clamp,Sw
465462
lineBuffer = intermediateStorage[axis-1]+core::dot(static_cast<const core::vectorSIMDi32&>(intermediateStrides[axis-1]),localTexCoord)[0];
466463
else
467464
{
468-
const auto windowEnd = inExtent.width+window_last.x;
465+
const auto windowEnd = inExtent.width+window_end.x;
469466
decode_offset = alloc_decode_scratch();
470467
lineBuffer = intermediateStorage[1]+decode_offset*MaxChannels*windowEnd;
471468
for (auto& i=localTexCoord.x; i<windowEnd; i++)
@@ -566,6 +563,21 @@ class CBlitImageFilter : public CImageFilter<CBlitImageFilter<Normalize,Clamp,Sw
566563

567564
private:
568565
static inline constexpr uint32_t VectorizationBoundSTL = /*AVX2*/16u;
566+
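// per-axis window size used to pad the intermediate images (axes the input image type lacks stay 0); NOTE(review): Y/Z read `getWindowSize().x` whereas the removed code read `.y`/`.z` — confirm this is intended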
567+
static inline core::vectorSIMDi32 getWindowEnd(const IImage::E_TYPE inImageType,
568+
const CScaledImageFilterKernel<KernelX>& kernelX,
569+
const CScaledImageFilterKernel<KernelY>& kernelY,
570+
const CScaledImageFilterKernel<KernelZ>& kernelZ
571+
)
572+
{
573+
// TODO: investigate properly whether it's supposed to be `size` or `size-1` (polyphase kind of shows the need for `size`)
574+
core::vectorSIMDi32 last(kernelX.getWindowSize().x,0,0,0);
575+
if (inImageType>=IImage::ET_2D)
576+
last.y = kernelY.getWindowSize().x;
577+
if (inImageType>=IImage::ET_3D)
578+
last.z = kernelZ.getWindowSize().x;
579+
return last;
580+
}
569581
// the blit filter will filter one axis at a time, hence necessitating "ping ponging" between two scratch buffers
570582
static inline uint32_t getScratchOffset(const state_type* state, bool secondPong)
571583
{
@@ -574,17 +586,14 @@ class CBlitImageFilter : public CImageFilter<CBlitImageFilter<Normalize,Clamp,Sw
574586
const auto kernelY = state->contructScaledKernel(state->kernelY);
575587
const auto kernelZ = state->contructScaledKernel(state->kernelZ);
576588

577-
const auto window_last = [&kernelX,&kernelY,&kernelZ]() -> core::vectorSIMDi32
578-
{
579-
return core::vectorSIMDi32(kernelX.getWindowSize().x-1,kernelY.getWindowSize().y-1,kernelZ.getWindowSize().z-1,0);
580-
}();
589+
const auto window_end = getWindowEnd(state->inImage->getCreationParameters().type,kernelX,kernelY,kernelZ);
581590
// TODO: account for the size needed for coverage adjustment
582591
// the first pass will be along X, so new temporary image will have the width of the output extent, but the height and depth will need to be padded
583592
// but the last pass will be along Z and the new temporary image will have the exact dimensions of `outExtent` which is why there is a `core::max`
584-
auto texelCount = state->outExtent.width*core::max<uint32_t>((state->inExtent.height+window_last[1])*(state->inExtent.depth+window_last[2]),state->outExtent.height*state->outExtent.depth);
593+
auto texelCount = state->outExtent.width*core::max<uint32_t>((state->inExtent.height+window_end[1])*(state->inExtent.depth+window_end[2]),state->outExtent.height*state->outExtent.depth);
585594
// the second pass will result in an image that has the width and height equal to `outExtent`
586595
if (secondPong)
587-
texelCount += core::max<uint32_t>(state->outExtent.width*state->outExtent.height*(state->inExtent.depth+window_last[2]),(state->inExtent.width+window_last[0])*std::thread::hardware_concurrency()*VectorizationBoundSTL);
596+
texelCount += core::max<uint32_t>(state->outExtent.width*state->outExtent.height*(state->inExtent.depth+window_end[2]),(state->inExtent.width+window_end[0])*std::thread::hardware_concurrency()*VectorizationBoundSTL);
588597
// obviously we have multiple channels and each channel has a certain type for arithmetic
589598
return texelCount*MaxChannels*sizeof(value_type);
590599
}

src/nbl/ext/MitsubaLoader/CMitsubaLoader.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818

1919
#if defined(_NBL_DEBUG) || defined(_NBL_RELWITHDEBINFO)
20-
# define DEBUG_MITSUBA_LOADER
20+
//# define DEBUG_MITSUBA_LOADER
2121
#endif
2222

2323
namespace nbl

0 commit comments

Comments
 (0)