1
+ // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
2
+ // This file is part of the "Nabla Engine".
3
+ // For conditions of distribution and use, see copyright notice in nabla.h
4
+
5
+ #ifndef __NBL_ASSET_C_BLIT_UTILITIES_H_INCLUDED__
6
+ #define __NBL_ASSET_C_BLIT_UTILITIES_H_INCLUDED__
7
+
8
+ #include " nbl/asset/filters/kernels/kernels.h"
9
+
10
+ namespace nbl ::asset
11
+ {
12
+ class IBlitUtilities
13
+ {
14
+ public:
15
+ static constexpr uint32_t MinAlphaBinCount = 256u ;
16
+ static constexpr uint32_t MaxAlphaBinCount = 4096u ;
17
+ static constexpr uint32_t DefaultAlphaBinCount = MinAlphaBinCount;
18
+
19
+ enum E_ALPHA_SEMANTIC : uint32_t
20
+ {
21
+ EAS_NONE_OR_PREMULTIPLIED = 0u , // just filter the channels independently (also works for a texture for blending equation `dstCol*(1-srcAlpha)+srcCol`)
22
+ EAS_REFERENCE_OR_COVERAGE, // try to preserve coverage (percentage of pixels above a threshold value) across mipmap levels
23
+ EAS_SEPARATE_BLEND, // compute a new alpha value for a texture to be used with the blending equation `mix(dstCol,srcCol,srcAlpha)`
24
+ EAS_COUNT
25
+ };
26
+
27
+ static inline core::vectorSIMDu32 getPhaseCount (const core::vectorSIMDu32& inExtent, const core::vectorSIMDu32& outExtent, const asset::IImage::E_TYPE inImageType)
28
+ {
29
+ core::vectorSIMDu32 result (0u );
30
+ for (uint32_t i = 0u ; i <= inImageType; ++i)
31
+ result[i] = outExtent[i] / std::gcd (inExtent[i], outExtent[i]);
32
+ return result;
33
+ }
34
+
35
+ // we'll need to rescale the kernel support to be relative to the output image but in the input image coordinate system
36
+ // (if support is 3 pixels, it needs to be 3 output texels, but measured in input texels)
37
+ template <class Kernel >
38
+ static inline auto constructScaledKernel (const Kernel& kernel, const core::vectorSIMDu32& inExtent, const core::vectorSIMDu32& outExtent)
39
+ {
40
+ const core::vectorSIMDf fInExtent (inExtent);
41
+ const core::vectorSIMDf fOutExtent (outExtent);
42
+ const auto fScale = fInExtent .preciseDivision (fOutExtent );
43
+ return CScaledImageFilterKernel<Kernel>(fScale , kernel);
44
+ }
45
+ };
46
+
47
+ template <class KernelX = CBoxImageFilterKernel, class KernelY = KernelX, class KernelZ = KernelX>
48
+ class CBlitUtilities : public IBlitUtilities
49
+ {
50
+ static_assert (std::is_same<typename KernelX::value_type, typename KernelY::value_type>::value&& std::is_same<typename KernelZ::value_type, typename KernelY::value_type>::value, " Kernel value_type need to be identical" );
51
+
52
+ public:
53
+ _NBL_STATIC_INLINE_CONSTEXPR auto MaxChannels = std::max<decltype (KernelX::MaxChannels)>(std::max<decltype (KernelX::MaxChannels)>(KernelX::MaxChannels, KernelY::MaxChannels), KernelZ::MaxChannels);
54
+
55
+ template <typename lut_value_type = KernelX::value_type>
56
+ static inline size_t getScaledKernelPhasedLUTSize (const core::vectorSIMDu32& inExtent, const core::vectorSIMDu32& outExtent, const asset::IImage::E_TYPE inImageType,
57
+ const KernelX& kernelX, const KernelY& kernelY, const KernelZ& kernelZ)
58
+ {
59
+ const auto scaledKernelX = constructScaledKernel (kernelX, inExtent, outExtent);
60
+ const auto scaledKernelY = constructScaledKernel (kernelY, inExtent, outExtent);
61
+ const auto scaledKernelZ = constructScaledKernel (kernelZ, inExtent, outExtent);
62
+
63
+ const auto phaseCount = getPhaseCount (inExtent, outExtent, inImageType);
64
+
65
+ return ((phaseCount[0 ] * scaledKernelX.getWindowSize ().x ) + (phaseCount[1 ] * scaledKernelY.getWindowSize ().y ) + (phaseCount[2 ] * scaledKernelZ.getWindowSize ().z )) * sizeof (lut_value_type) * MaxChannels;
66
+ }
67
+
68
+ template <typename lut_value_type = KernelX::value_type>
69
+ static bool computeScaledKernelPhasedLUT (void * outKernelWeights, const core::vectorSIMDu32& inExtent, const core::vectorSIMDu32& outExtent, const asset::IImage::E_TYPE inImageType,
70
+ const KernelX& kernelX, const KernelY& kernelY, const KernelZ& kernelZ)
71
+ {
72
+ const core::vectorSIMDu32 phaseCount = getPhaseCount (inExtent, outExtent, inImageType);
73
+
74
+ for (auto i = 0 ; i <= inImageType; ++i)
75
+ {
76
+ if (phaseCount[i] == 0 )
77
+ return false ;
78
+ }
79
+
80
+ const auto scaledKernelX = constructScaledKernel (kernelX, inExtent, outExtent);
81
+ const auto scaledKernelY = constructScaledKernel (kernelY, inExtent, outExtent);
82
+ const auto scaledKernelZ = constructScaledKernel (kernelZ, inExtent, outExtent);
83
+
84
+ const auto windowDims = getRealWindowSize (inImageType, scaledKernelX, scaledKernelY, scaledKernelZ);
85
+ const auto axisOffsets = getScaledKernelPhasedLUTAxisOffsets<lut_value_type>(phaseCount, windowDims);
86
+
87
+ const core::vectorSIMDf fInExtent (inExtent);
88
+ const core::vectorSIMDf fOutExtent (outExtent);
89
+ const auto fScale = fInExtent .preciseDivision (fOutExtent );
90
+
91
+ // a dummy load functor
92
+ // does nothing but fills up the `windowSample` with 1s (identity) so we can preserve the value of kernel
93
+ // weights when eventually `windowSample` gets multiplied by them later in
94
+ // `CFloatingPointSeparableImageFilterKernelBase<CRTP>::sample_functor_t<PreFilter,PostFilter>::operator()`
95
+ // this exists only because `evaluateImpl` expects a pre filtering step.
96
+ auto dummyLoad = [](double * windowSample, const core::vectorSIMDf&, const core::vectorSIMDi32&, const IImageFilterKernel::UserData*) -> void
97
+ {
98
+ for (auto h = 0 ; h < MaxChannels; h++)
99
+ windowSample[h] = 1.0 ;
100
+ };
101
+
102
+ double kernelWeight[MaxChannels];
103
+ // actually used to put values in the LUT
104
+ auto dummyEvaluate = [&kernelWeight](const double * windowSample, const core::vectorSIMDf&, const core::vectorSIMDi32&, const IImageFilterKernel::UserData*) -> void
105
+ {
106
+ for (auto h = 0 ; h < MaxChannels; h++)
107
+ kernelWeight[h] = windowSample[h];
108
+ };
109
+
110
+ auto computeForAxis = [&](const asset::IImage::E_TYPE axis, const auto & scaledKernel)
111
+ {
112
+ if (axis > inImageType)
113
+ return ;
114
+
115
+ const auto windowSize = scaledKernel.getWindowSize ()[axis];
116
+
117
+ IImageFilterKernel::ScaleFactorUserData scale (1 .f / fScale [axis]);
118
+ const IImageFilterKernel::ScaleFactorUserData* otherScale = nullptr ;
119
+ switch (axis)
120
+ {
121
+ case IImage::ET_1D:
122
+ otherScale = IImageFilterKernel::ScaleFactorUserData::cast (kernelX.getUserData ());
123
+ break ;
124
+ case IImage::ET_2D:
125
+ otherScale = IImageFilterKernel::ScaleFactorUserData::cast (kernelY.getUserData ());
126
+ break ;
127
+ case IImage::ET_3D:
128
+ otherScale = IImageFilterKernel::ScaleFactorUserData::cast (kernelZ.getUserData ());
129
+ break ;
130
+ }
131
+ if (otherScale)
132
+ {
133
+ for (auto k = 0 ; k < MaxChannels; k++)
134
+ scale.factor [k] *= otherScale->factor [k];
135
+ }
136
+
137
+ lut_value_type* outKernelWeightsPixel = reinterpret_cast <lut_value_type*>(reinterpret_cast <uint8_t *>(outKernelWeights) + axisOffsets[axis]);
138
+ for (uint32_t i = 0u ; i < phaseCount[axis]; ++i)
139
+ {
140
+ core::vectorSIMDf tmp (0 .f );
141
+ tmp[axis] = float (i) + 0 .5f ;
142
+
143
+ const int32_t windowCoord = scaledKernel.getWindowMinCoord (tmp * fScale , tmp)[axis];
144
+
145
+ float relativePos = tmp[axis] - float (windowCoord); // relative position of the last pixel in window from current (ith) output pixel having a unique phase sequence of kernel evaluation points
146
+
147
+ for (int32_t j = 0 ; j < windowSize; ++j)
148
+ {
149
+ core::vectorSIMDf tmp (relativePos, 0 .f , 0 .f );
150
+ scaledKernel.evaluateImpl (dummyLoad, dummyEvaluate, kernelWeight, tmp, core::vectorSIMDi32 (), &scale);
151
+ for (uint32_t ch = 0 ; ch < MaxChannels; ++ch)
152
+ {
153
+ if constexpr (std::is_same_v<lut_value_type, uint16_t >)
154
+ outKernelWeightsPixel[(i * windowSize + j) * MaxChannels + ch] = core::Float16Compressor::compress (float (kernelWeight[ch]));
155
+ else
156
+ outKernelWeightsPixel[(i * windowSize + j) * MaxChannels + ch] = lut_value_type (kernelWeight[ch]);
157
+
158
+ }
159
+ relativePos -= 1 .f ;
160
+ }
161
+ }
162
+ };
163
+
164
+ computeForAxis (asset::IImage::ET_1D, scaledKernelX);
165
+ computeForAxis (asset::IImage::ET_2D, scaledKernelY);
166
+ computeForAxis (asset::IImage::ET_3D, scaledKernelZ);
167
+
168
+ return true ;
169
+ }
170
+
171
+ static inline core::vectorSIMDi32 getRealWindowSize (const IImage::E_TYPE inImageType,
172
+ const CScaledImageFilterKernel<KernelX>& kernelX,
173
+ const CScaledImageFilterKernel<KernelY>& kernelY,
174
+ const CScaledImageFilterKernel<KernelZ>& kernelZ)
175
+ {
176
+ core::vectorSIMDi32 last (kernelX.getWindowSize ().x , 0 , 0 , 0 );
177
+ if (inImageType >= IImage::ET_2D)
178
+ last.y = kernelY.getWindowSize ().y ;
179
+ if (inImageType >= IImage::ET_3D)
180
+ last.z = kernelZ.getWindowSize ().z ;
181
+ return last;
182
+ }
183
+
184
+ template <typename lut_value_type = KernelX::value_type>
185
+ static inline core::vectorSIMDu32 getScaledKernelPhasedLUTAxisOffsets (const core::vectorSIMDu32& phaseCount, const core::vectorSIMDi32& real_window_size)
186
+ {
187
+ core::vectorSIMDu32 result;
188
+ result.x = 0u ;
189
+ result.y = (phaseCount[0 ] * real_window_size.x );
190
+ result.z = ((phaseCount[0 ] * real_window_size.x ) + (phaseCount[1 ] * real_window_size.y ));
191
+ return result * sizeof (lut_value_type) * MaxChannels;
192
+ }
193
+ };
194
+ }
195
+
196
+ #endif
0 commit comments