Skip to content

Commit b589f97

Browse files
author
devsh
committed
Nuke core::FloatCompressor and remove a bunch of floatutil.h stuff; make more things use hlsl::float16_t
1 parent fc416d4 commit b589f97

File tree

14 files changed

+212
-558
lines changed

14 files changed

+212
-558
lines changed

examples_tests

include/nbl/asset/ICPUSampler.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ class ICPUSampler : public ISampler, public IAsset
4545
break;
4646
case ISampler::ETC_MIRROR_CLAMP_TO_EDGE:
4747
texelCoord[i] = core::clamp<int32_t,int32_t>(texelCoord[i],-int32_t(mipExtent[i]),mipExtent[i]+mipLastCoord[i]);
48+
[[fallthrough]];
4849
case ISampler::ETC_MIRROR:
4950
{
5051
int32_t repeatID = (originalWasNegative+texelCoord[i])/int32_t(mipExtent[i]);

include/nbl/asset/IMeshBuffer.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
#ifndef _NBL_ASSET_I_MESH_BUFFER_H_INCLUDED_
55
#define _NBL_ASSET_I_MESH_BUFFER_H_INCLUDED_
66

7-
#include "nbl/core/shapes/AABB.h"
87

98
#include "nbl/asset/IRenderpassIndependentPipeline.h"
109
#include "nbl/asset/ECommonEnums.h"

include/nbl/asset/filters/CBlitImageFilter.h

Lines changed: 58 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -118,18 +118,18 @@ class CBlitImageFilter :
118118
public:
119119
CState(blit_utils_t::convolution_kernels_t&& _kernels) : kernels(std::move(_kernels))
120120
{
121-
inOffsetBaseLayer = core::vectorSIMDu32();
122-
inExtentLayerCount = core::vectorSIMDu32();
123-
outOffsetBaseLayer = core::vectorSIMDu32();
124-
outExtentLayerCount = core::vectorSIMDu32();
121+
inOffsetBaseLayer = hlsl::uint32_t4();
122+
inExtentLayerCount = hlsl::uint32_t4();
123+
outOffsetBaseLayer = hlsl::uint32_t4();
124+
outExtentLayerCount = hlsl::uint32_t4();
125125
}
126126

127127
CState(const typename blit_utils_t::convolution_kernels_t& _kernels) : kernels(_kernels)
128128
{
129-
inOffsetBaseLayer = core::vectorSIMDu32();
130-
inExtentLayerCount = core::vectorSIMDu32();
131-
outOffsetBaseLayer = core::vectorSIMDu32();
132-
outExtentLayerCount = core::vectorSIMDu32();
129+
inOffsetBaseLayer = hlsl::uint32_t4();
130+
inExtentLayerCount = hlsl::uint32_t4();
131+
outOffsetBaseLayer = hlsl::uint32_t4();
132+
outExtentLayerCount = hlsl::uint32_t4();
133133
}
134134

135135
CState(const CState& other) : IImageFilter::IState(), base_t::CStateBase{other},
@@ -149,23 +149,23 @@ class CBlitImageFilter :
149149
return false;
150150
const size_t offset = getScratchOffset(this,ESU_SCALED_KERNEL_PHASED_LUT);
151151
const auto inType = inImage->getCreationParameters().type;
152-
const size_t size = blit_utils_t::getScaledKernelPhasedLUTSize(inExtentLayerCount,outExtentLayerCount,inType,kernels);
152+
const size_t size = blit_utils_t::getScaledKernelPhasedLUTSize(inExtentLayerCount.xyz,outExtentLayerCount.xyz,inType,kernels);
153153
auto* lut = base_t::CStateBase::scratchMemory+offset;
154-
return blit_utils_t::computeScaledKernelPhasedLUT(lut,inExtentLayerCount,outExtentLayerCount,inType, kernels);
154+
return blit_utils_t::computeScaledKernelPhasedLUT(lut,inExtentLayerCount.xyz,outExtentLayerCount.xyz,inType, kernels);
155155
}
156156

157157
union
158158
{
159-
core::vectorSIMDu32 inOffsetBaseLayer;
159+
hlsl::uint32_t4 inOffsetBaseLayer;
160160
struct
161161
{
162-
VkOffset3D inOffset;
163-
uint32_t inBaseLayer;
162+
VkOffset3D inOffset;
163+
uint32_t inBaseLayer;
164164
};
165165
};
166166
union
167167
{
168-
core::vectorSIMDu32 inExtentLayerCount;
168+
hlsl::uint32_t4 inExtentLayerCount;
169169
struct
170170
{
171171
VkExtent3D inExtent;
@@ -174,7 +174,7 @@ class CBlitImageFilter :
174174
};
175175
union
176176
{
177-
core::vectorSIMDu32 outOffsetBaseLayer;
177+
hlsl::uint32_t4 outOffsetBaseLayer;
178178
struct
179179
{
180180
VkOffset3D outOffset;
@@ -183,7 +183,7 @@ class CBlitImageFilter :
183183
};
184184
union
185185
{
186-
core::vectorSIMDu32 outExtentLayerCount;
186+
hlsl::uint32_t4 outExtentLayerCount;
187187
struct
188188
{
189189
VkExtent3D outExtent;
@@ -208,8 +208,7 @@ class CBlitImageFilter :
208208
const auto windowSize = blit_utils_t::getWindowSize(inType, state->kernels);
209209
const size_t scaledKernelPhasedLUTSize = blit_utils_t::getScaledKernelPhasedLUTSize(state->inExtentLayerCount, state->outExtentLayerCount, inType, windowSize);
210210

211-
core::vectorSIMDi32 intermediateExtent[3];
212-
getIntermediateExtents(intermediateExtent, state, windowSize);
211+
const auto intermediateExtent = getIntermediateExtents(state,windowSize);
213212
assert(intermediateExtent[0].x == intermediateExtent[2].x);
214213

215214
uint32_t pingBufferElementCount = (state->inExtent.width + windowSize[0]) * m_maxParallelism; // decode
@@ -349,13 +348,7 @@ class CBlitImageFilter :
349348
// filtering and alpha handling happens separately for every layer, so save on scratch memory size
350349
const auto inImageType = inParams.type;
351350
const auto real_window_size = blit_utils_t::getWindowSize(inImageType,state->kernels);
352-
core::vectorSIMDi32 intermediateExtent[3];
353-
getIntermediateExtents(intermediateExtent, state, real_window_size);
354-
const core::vectorSIMDi32 intermediateLastCoord[3] = {
355-
intermediateExtent[0]-core::vectorSIMDi32(1,1,1,0),
356-
intermediateExtent[1]-core::vectorSIMDi32(1,1,1,0),
357-
intermediateExtent[2]-core::vectorSIMDi32(1,1,1,0)
358-
};
351+
const hlsl::int32_t3x3 intermediateExtent = getIntermediateExtents(state,real_window_size);
359352
value_t* const intermediateStorage[3] = {
360353
reinterpret_cast<value_t*>(state->scratchMemory + getScratchOffset(state, ESU_BLIT_X_AXIS_WRITE)),
361354
reinterpret_cast<value_t*>(state->scratchMemory + getScratchOffset(state, ESU_BLIT_Y_AXIS_WRITE)),
@@ -381,7 +374,7 @@ class CBlitImageFilter :
381374
};
382375
const std::span<const IImage::SBufferCopy> outRegions = outImg->getRegions(outMipLevel);
383376
auto storeToImage = [policy,coverageSemantic,needsNormalization,outExtent,intermediateStorage,&sampler,outFormat,alphaRefValue,outData,intermediateStrides,alphaChannel,storeToTexel,outMipLevel,outOffset,outRegions,outImg,state](
384-
const core::rational<int64_t>& coverage, const int axis, const core::vectorSIMDu32& outOffsetLayer
377+
const core::rational<int64_t>& coverage, const int axis, const hlsl::uint32_t4& outOffsetLayer
385378
) -> void
386379
{
387380
assert(needsNormalization);
@@ -432,15 +425,16 @@ class CBlitImageFilter :
432425
auto scaleCoverage = [outData,outOffsetLayer,intermediateStrides,axis,intermediateStorage,alphaChannel,coverageScale,storeToTexel](uint32_t writeBlockArrayOffset, core::vectorSIMDu32 writeBlockPos) -> void
433426
{
434427
void* const dstPix = outData+writeBlockArrayOffset;
435-
const core::vectorSIMDu32 localOutPos = writeBlockPos - outOffsetLayer;
428+
for (auto i=0; i<4; i++)
429+
writeBlockPos[i] -= outOffsetLayer[i];
436430

437431
value_t sample[ChannelCount];
438-
const size_t offset = IImage::SBufferCopy::getLocalByteOffset(localOutPos, intermediateStrides[axis]);
432+
const size_t offset = IImage::SBufferCopy::getLocalByteOffset(writeBlockPos, intermediateStrides[axis]);
439433
const auto* first = intermediateStorage[axis]+offset;
440434
std::copy(first,first+ChannelCount,sample);
441435

442436
sample[alphaChannel] *= coverageScale;
443-
storeToTexel(sample,dstPix,localOutPos);
437+
storeToTexel(sample,dstPix,writeBlockPos);
444438
};
445439
const ICPUImage::SSubresourceLayers subresource = {static_cast<IImage::E_ASPECT_FLAGS>(0u),outMipLevel,outOffsetLayer.w,1};
446440
const IImageFilter::IState::TexelRange range = {outOffset,outExtent};
@@ -452,32 +446,34 @@ class CBlitImageFilter :
452446

453447
// process
454448
state->normalization.template initialize<double>();
455-
const core::vectorSIMDf fInExtent(inExtentLayerCount);
456-
const core::vectorSIMDf fOutExtent(outExtentLayerCount);
457-
const auto fScale = fInExtent.preciseDivision(fOutExtent);
458-
const auto halfTexelOffset = fScale*0.5f-core::vectorSIMDf(0.f,0.f,0.f,0.5f);
459-
const auto startCoord = [&halfTexelOffset,state]() -> core::vectorSIMDi32
449+
const hlsl::float64_t3 fInExtent(inExtentLayerCount.x,inExtentLayerCount.y,inExtentLayerCount.z);
450+
const hlsl::float64_t3 fOutExtent(outExtentLayerCount.x,outExtentLayerCount.y,outExtentLayerCount.z);
451+
const auto fScale = hlsl::float32_t3(fInExtent/fOutExtent);
452+
const auto startCoord = [fScale,state]() -> hlsl::int32_t4
460453
{
461-
return core::vectorSIMDi32(
454+
const auto halfTexelOffset = fScale*0.5f;
455+
return hlsl::int32_t4(
462456
std::get<0>(state->kernels).getWindowMinCoord(halfTexelOffset.x),
463457
std::get<1>(state->kernels).getWindowMinCoord(halfTexelOffset.y),
464-
std::get<2>(state->kernels).getWindowMinCoord(halfTexelOffset.z),0);
458+
std::get<2>(state->kernels).getWindowMinCoord(halfTexelOffset.z),
459+
0
460+
);
465461
}();
466-
const auto windowMinCoordBase = inOffsetBaseLayer+startCoord;
462+
// important we are aware of signedness here
463+
const hlsl::int32_t4 windowMinCoordBase = hlsl::int32_t4(inOffsetBaseLayer)+startCoord;
467464

468-
core::vectorSIMDu32 phaseCount = IBlitUtilities::getPhaseCount(inExtentLayerCount, outExtentLayerCount, inImageType);
469-
phaseCount = core::max(phaseCount, core::vectorSIMDu32(1, 1, 1));
470-
const core::vectorSIMDu32 axisOffsets = blit_utils_t::template getScaledKernelPhasedLUTAxisOffsets(phaseCount, real_window_size);
465+
auto phaseCount = IBlitUtilities::getPhaseCount(inExtentLayerCount.xyz, outExtentLayerCount.xyz, inImageType);
466+
phaseCount = hlsl::max(phaseCount,hlsl::uint32_t3(1,1,1));
467+
const auto axisOffsets = blit_utils_t::template getScaledKernelPhasedLUTAxisOffsets(phaseCount,real_window_size);
471468
constexpr auto MaxAxisCount = 3;
472469
lut_value_t* scaledKernelPhasedLUTPixel[MaxAxisCount];
473470
for (auto i = 0; i < MaxAxisCount; ++i)
474471
scaledKernelPhasedLUTPixel[i] = reinterpret_cast<lut_value_t*>(state->scratchMemory + getScratchOffset(state, ESU_SCALED_KERNEL_PHASED_LUT) + axisOffsets[i]);
475472

476473
for (uint32_t layer=0; layer!=layerCount; layer++) // TODO: could be parallelized
477474
{
478-
const core::vectorSIMDi32 vLayer(0,0,0,layer);
479-
const auto windowMinCoord = windowMinCoordBase+vLayer;
480-
const auto outOffsetLayer = outOffsetBaseLayer+vLayer;
475+
const hlsl::int32_t4 windowMinCoord(windowMinCoordBase.xyz,windowMinCoordBase.w+layer);
476+
const hlsl::uint32_t4 outOffsetLayer(outOffsetBaseLayer.xyz,outOffsetBaseLayer.w+layer);
481477
// reset coverage counter
482478
constexpr bool is_seq_policy_v = std::is_same_v<std::remove_reference_t<ExecutionPolicy>,core::execution::sequenced_policy>;
483479
using cond_atomic_int32_t = std::conditional_t<is_seq_policy_v,int32_t,std::atomic_int32_t>;
@@ -517,19 +513,19 @@ class CBlitImageFilter :
517513
uint32_t decode_offset;
518514
// whole line plus window borders
519515
value_t* lineBuffer;
520-
core::vectorSIMDi32 localTexCoord(0);
516+
hlsl::int32_t3 localTexCoord(0,0,0);
521517
localTexCoord[loopCoordID[0]] = batchCoord[0];
522518
localTexCoord[loopCoordID[1]] = batchCoord[1];
523519
if (axis!=IImage::ET_1D)
524-
lineBuffer = intermediateStorage[axis-1]+core::dot(static_cast<const core::vectorSIMDi32&>(intermediateStrides[axis-1]),localTexCoord)[0];
520+
lineBuffer = intermediateStorage[axis-1]+hlsl::dot(reinterpret_cast<const hlsl::int32_t3&>(intermediateStrides[axis-1]),localTexCoord);
525521
else
526522
{
527523
const auto inputEnd = inExtent.width+real_window_size.x;
528524
decode_offset = scratchHelper.template alloc<is_seq_policy_v>();
529525
lineBuffer = intermediateStorage[1]+decode_offset*ChannelCount*inputEnd;
530526
for (auto& i=localTexCoord.x; i<inputEnd; i++)
531527
{
532-
core::vectorSIMDi32 globalTexelCoord(localTexCoord+windowMinCoord);
528+
core::vectorSIMDi32 globalTexelCoord(localTexCoord.x+windowMinCoord.x,localTexCoord.y+windowMinCoord.y,localTexCoord.z+windowMinCoord.z);
533529

534530
core::vectorSIMDu32 blockLocalTexelCoord(0u);
535531
const void* srcPix[] = { // multiple loads for texture boundaries aren't that bad
@@ -562,11 +558,7 @@ class CBlitImageFilter :
562558

563559
auto getWeightedSample = [scaledKernelPhasedLUTPixel, windowSize, lineBuffer, &windowMinCoord, axis](const auto& windowCoord, const auto phaseIndex, const auto windowPixel, const auto channel) -> value_t
564560
{
565-
value_t kernelWeight;
566-
if constexpr (std::is_same_v<lut_value_t, uint16_t>)
567-
kernelWeight = value_t(core::Float16Compressor::decompress(scaledKernelPhasedLUTPixel[axis][(phaseIndex * windowSize + windowPixel) * ChannelCount + channel]));
568-
else
569-
kernelWeight = scaledKernelPhasedLUTPixel[axis][(phaseIndex * windowSize + windowPixel) * ChannelCount + channel];
561+
const value_t kernelWeight = static_cast<value_t>(scaledKernelPhasedLUTPixel[axis][(phaseIndex * windowSize + windowPixel) * ChannelCount + channel]);
570562

571563
return kernelWeight * lineBuffer[(windowCoord - windowMinCoord[axis]) * ChannelCount + channel];
572564
};
@@ -576,11 +568,11 @@ class CBlitImageFilter :
576568
for (auto& i=(localTexCoord[axis]=0); i<outExtentLayerCount[axis]; i++)
577569
{
578570
// get output pixel
579-
auto* const value = intermediateStorage[axis]+core::dot(static_cast<const core::vectorSIMDi32&>(intermediateStrides[axis]),localTexCoord)[0];
571+
auto* const value = intermediateStorage[axis]+hlsl::dot(reinterpret_cast<const hlsl::int32_t3&>(intermediateStrides[axis]),localTexCoord);
580572

581573
// do the filtering
582574
float tmp = float(i)+0.5f;
583-
int32_t windowCoord = kernel.getWindowMinCoord(tmp*fScale[axis], tmp);
575+
int32_t windowCoord = kernel.getWindowMinCoord(tmp*fScale[axis],tmp);
584576

585577
for (auto ch = 0; ch < ChannelCount; ++ch)
586578
value[ch] = getWeightedSample(windowCoord, phaseIndex, 0, ch);
@@ -594,7 +586,12 @@ class CBlitImageFilter :
594586
}
595587
if (lastPass)
596588
{
597-
const core::vectorSIMDu32 localOutPos = localTexCoord+outOffsetBaseLayer+vLayer;
589+
const core::vectorSIMDu32 localOutPos(
590+
outOffsetLayer.x+localTexCoord.x,
591+
outOffsetLayer.y+localTexCoord.y,
592+
outOffsetLayer.z+localTexCoord.z,
593+
outOffsetLayer.w+layer
594+
);
598595
if (needsNormalization)
599596
state->normalization.prepass(value,localOutPos,0u,0u,ChannelCount);
600597
else // store to image, we're done
@@ -678,13 +675,14 @@ class CBlitImageFilter :
678675
std::mutex mutex;
679676
};
680677

681-
static inline void getIntermediateExtents(core::vectorSIMDi32* intermediateExtent, const state_type* state, const core::vectorSIMDi32& real_window_size)
678+
// the WxHxD extent for each blit axis output
679+
static inline hlsl::int32_t3x3 getIntermediateExtents(const state_type* state, const hlsl::int32_t3& real_window_size)
682680
{
683-
assert(intermediateExtent);
684-
685-
intermediateExtent[0] = core::vectorSIMDi32(state->outExtent.width, state->inExtent.height + real_window_size[1], state->inExtent.depth + real_window_size[2]);
686-
intermediateExtent[1] = core::vectorSIMDi32(state->outExtent.width, state->outExtent.height, state->inExtent.depth + real_window_size[2]);
687-
intermediateExtent[2] = core::vectorSIMDi32(state->outExtent.width, state->outExtent.height, state->outExtent.depth);
681+
hlsl::int32_t3x3 intermediateExtent;
682+
intermediateExtent[0] = hlsl::int32_t3(state->outExtent.width, state->inExtent.height + real_window_size[1], state->inExtent.depth + real_window_size[2]);
683+
intermediateExtent[1] = hlsl::int32_t3(state->outExtent.width, state->outExtent.height, state->inExtent.depth + real_window_size[2]);
684+
intermediateExtent[2] = hlsl::int32_t3(state->outExtent.width, state->outExtent.height, state->outExtent.depth);
685+
return intermediateExtent;
688686
}
689687
};
690688

0 commit comments

Comments
 (0)