Skip to content

Commit 2e25575

Browse files
committed
Fixed more filter stuff, offsetDifference needs to be measured in blocks
1 parent cfd08e0 commit 2e25575

File tree

3 files changed

+21
-16
lines changed

3 files changed

+21
-16
lines changed

include/nbl/asset/filters/CCopyImageFilter.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,9 +65,8 @@ class NBL_API CCopyImageFilter : public CImageFilter<CCopyImageFilter>, public C
6565
const auto blockDims = asset::getBlockDimensions(commonExecuteData.inFormat);
6666
auto copy = [&commonExecuteData,&blockDims](uint32_t readBlockArrayOffset, core::vectorSIMDu32 readBlockPos) -> void
6767
{
68-
const auto localOutPos = readBlockPos+commonExecuteData.offsetDifference;
69-
const auto writeOffset = commonExecuteData.oit->getByteOffset(localOutPos,commonExecuteData.outByteStrides);
70-
memcpy(commonExecuteData.outData+writeOffset,commonExecuteData.inData+readBlockArrayOffset,commonExecuteData.outBlockByteSize);
68+
const auto localOutPos = readBlockPos + commonExecuteData.offsetDifference;
69+
memcpy(commonExecuteData.outData + commonExecuteData.oit->getByteOffset(localOutPos, commonExecuteData.outBlockByteStrides), commonExecuteData.inData + readBlockArrayOffset, commonExecuteData.outBlockByteSize);
7170
};
7271
CBasicImageFilterCommon::executePerRegion<ExecutionPolicy>(policy,commonExecuteData.inImg,copy,commonExecuteData.inRegions.begin(),commonExecuteData.inRegions.end(),clip);
7372

include/nbl/asset/filters/CMatchedSizeInOutImageFilterCommon.h

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,8 @@ class NBL_API CMatchedSizeInOutImageFilterCommon : public CBasicImageFilterCommo
160160
const core::SRange<const IImage::SBufferCopy> inRegions;
161161
const core::SRange<const IImage::SBufferCopy> outRegions;
162162
const IImage::SBufferCopy* oit; //!< oit is the output region currently being handled by the commonExecute lambda. Note that the lambda may call executePerRegion several times with different oit data, since regions may overlap within a given mipmap of an image!
163-
core::vectorSIMDu32 offsetDifference, outByteStrides;
163+
core::vectorSIMDu32 offsetDifference;
164+
core::vectorSIMDu32 outBlockByteStrides;
164165
};
165166
template<typename PerOutputFunctor>
166167
static inline bool commonExecute(state_type* state, PerOutputFunctor& perOutput)
@@ -190,17 +191,22 @@ class NBL_API CMatchedSizeInOutImageFilterCommon : public CBasicImageFilterCommo
190191
outRegions.begin(), {}, {}
191192
};
192193

194+
const asset::TexelBlockInfo srcImageTexelBlockInfo(inParams.format);
195+
const asset::TexelBlockInfo dstImageTexelBlockInfo(outParams.format);
196+
193197
// iterate over output regions, then input, because a read cache miss is faster
194198
for (; commonExecuteData.oit!=commonExecuteData.outRegions.end(); commonExecuteData.oit++)
195199
{
196200
IImage::SSubresourceLayers subresource = {static_cast<IImage::E_ASPECT_FLAGS>(0u),state->inMipLevel,state->inBaseLayer,state->layerCount};
197201
state_type::TexelRange range = {state->inOffset,state->extent};
198202
CBasicImageFilterCommon::clip_region_functor_t clip(subresource,range,commonExecuteData.inFormat);
199203
// setup convert state
200-
// I know my two's complement wraparound well enough to make this work
201204
const auto& outRegionOffset = commonExecuteData.oit->imageOffset;
202-
commonExecuteData.offsetDifference = state->outOffsetBaseLayer - (core::vectorSIMDu32(outRegionOffset.x, outRegionOffset.y, outRegionOffset.z, commonExecuteData.oit->imageSubresource.baseArrayLayer) + state->inOffsetBaseLayer);
203-
commonExecuteData.outByteStrides = commonExecuteData.oit->getByteStrides(TexelBlockInfo(commonExecuteData.outFormat));
205+
const auto& inOffset = (core::vectorSIMDu32(outRegionOffset.x, outRegionOffset.y, outRegionOffset.z, commonExecuteData.oit->imageSubresource.baseArrayLayer) + state->inOffsetBaseLayer);
206+
207+
// offsetDifference components are unsigned, but two's complement wraparound makes a "negative" difference work once it is added back to the read block position
208+
commonExecuteData.offsetDifference = dstImageTexelBlockInfo.convertTexelsToBlocks(state->outOffsetBaseLayer) - srcImageTexelBlockInfo.convertTexelsToBlocks(inOffset);
209+
commonExecuteData.outBlockByteStrides = commonExecuteData.oit->getByteStrides(TexelBlockInfo(commonExecuteData.outFormat));
204210
if (!perOutput(commonExecuteData,clip))
205211
return false;
206212
}

include/nbl/asset/filters/CSwizzleAndConvertImageFilter.h

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ class NBL_API CSwizzleAndConvertImageFilterBase : public CSwizzleableAndDitherab
7777
base_t::template onDecode<kInFormat,encodeBufferType>(state, srcPix, decodeBuffer, encodeBuffer, blockX, blockY);
7878
else
7979
base_t::template onDecode<encodeBufferType>(rInFormat, state, srcPix, decodeBuffer, encodeBuffer, blockX, blockY);
80-
state->normalization.prepass(encodeBuffer,readBlockPos*blockDims+commonExecuteData.offsetDifference,blockX,blockY,4u/*TODO: figure this out*/);
80+
state->normalization.prepass(encodeBuffer,readBlockPos+commonExecuteData.offsetDifference,blockX,blockY,4u/*TODO: figure this out*/);
8181
}
8282
};
8383
CBasicImageFilterCommon::executePerRegion(policy, commonExecuteData.inImg, normalizePrepass, commonExecuteData.inRegions.begin(), commonExecuteData.inRegions.end(), clip);
@@ -146,8 +146,8 @@ class NBL_API CSwizzleAndConvertImageFilter : public CImageFilter<CSwizzleAndCon
146146
for (auto blockY=0u; blockY<blockDims.y; blockY++)
147147
for (auto blockX=0u; blockX<blockDims.x; blockX++)
148148
{
149-
auto localOutPos = readBlockPos*blockDims+commonExecuteData.offsetDifference;
150-
uint8_t* dstPix = commonExecuteData.outData+commonExecuteData.oit->getByteOffset(localOutPos + core::vectorSIMDu32(blockX, blockY),commonExecuteData.outByteStrides);
149+
auto localOutPos = readBlockPos+commonExecuteData.offsetDifference;
150+
uint8_t* dstPix = commonExecuteData.outData+commonExecuteData.oit->getByteOffset(localOutPos + core::vectorSIMDu32(blockX, blockY),commonExecuteData.outBlockByteStrides);
151151

152152
constexpr auto maxChannels = 4;
153153
decodeBufferType decodeBuffer[maxChannels] = {};
@@ -212,8 +212,8 @@ class NBL_API CSwizzleAndConvertImageFilter<EF_UNKNOWN,EF_UNKNOWN,Swizzle,Dither
212212
for (auto blockY=0u; blockY<blockDims.y; blockY++)
213213
for (auto blockX=0u; blockX<blockDims.x; blockX++)
214214
{
215-
auto localOutPos = readBlockPos*blockDims+commonExecuteData.offsetDifference;
216-
uint8_t* dstPix = commonExecuteData.outData+commonExecuteData.oit->getByteOffset(localOutPos + core::vectorSIMDu32(blockX, blockY),commonExecuteData.outByteStrides);
215+
auto localOutPos = readBlockPos+commonExecuteData.offsetDifference;
216+
uint8_t* dstPix = commonExecuteData.outData+commonExecuteData.oit->getByteOffset(localOutPos + core::vectorSIMDu32(blockX, blockY),commonExecuteData.outBlockByteStrides);
217217

218218
constexpr auto maxChannels = 4;
219219
double decodeBuffer[maxChannels] = {};
@@ -287,8 +287,8 @@ class NBL_API CSwizzleAndConvertImageFilter<EF_UNKNOWN,outFormat,Swizzle,Dither,
287287
for (auto blockY = 0u; blockY < blockDims.y; blockY++)
288288
for (auto blockX = 0u; blockX < blockDims.x; blockX++)
289289
{
290-
auto localOutPos = readBlockPos * blockDims + commonExecuteData.offsetDifference;
291-
uint8_t* dstPix = commonExecuteData.outData + commonExecuteData.oit->getByteOffset(localOutPos + core::vectorSIMDu32(blockX, blockY), commonExecuteData.outByteStrides);
290+
auto localOutPos = readBlockPos + commonExecuteData.offsetDifference;
291+
uint8_t* dstPix = commonExecuteData.outData + commonExecuteData.oit->getByteOffset(localOutPos + core::vectorSIMDu32(blockX, blockY), commonExecuteData.outBlockByteStrides);
292292

293293
constexpr auto maxChannels = 4;
294294
double decodeBuffer[maxChannels] = {};
@@ -363,8 +363,8 @@ class NBL_API CSwizzleAndConvertImageFilter<inFormat,EF_UNKNOWN,Swizzle,Dither,N
363363
for (auto blockY = 0u; blockY < blockDims.y; blockY++)
364364
for (auto blockX = 0u; blockX < blockDims.x; blockX++)
365365
{
366-
auto localOutPos = readBlockPos * blockDims + commonExecuteData.offsetDifference;
367-
uint8_t* dstPix = commonExecuteData.outData + commonExecuteData.oit->getByteOffset(localOutPos + core::vectorSIMDu32(blockX, blockY), commonExecuteData.outByteStrides);
366+
auto localOutPos = readBlockPos + commonExecuteData.offsetDifference;
367+
uint8_t* dstPix = commonExecuteData.outData + commonExecuteData.oit->getByteOffset(localOutPos + core::vectorSIMDu32(blockX, blockY), commonExecuteData.outBlockByteStrides);
368368

369369
constexpr auto maxChannels = 4;
370370
decodeBufferType decodeBuffer[maxChannels] = {};

0 commit comments

Comments
 (0)