Skip to content

Commit 2cec714

Browse files
Merge pull request #343 from Devsh-Graphics-Programming/gpu_blit_filter
GPU Blit Filter
2 parents d615dde + d91694b commit 2cec714

33 files changed

+2618
-291
lines changed

include/nbl/asset/IDescriptorSet.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ class NBL_API IDescriptorSet : public virtual core::IReferenceCounted
5151
};
5252
struct SImageInfo
5353
{
54+
// This will be ignored if the DS layout already has an immutable sampler specified for the binding.
5455
core::smart_refctd_ptr<typename layout_t::sampler_type> sampler;
5556
//! Irrelevant in OpenGL backend
5657
E_IMAGE_LAYOUT imageLayout;

include/nbl/asset/IDescriptorSetLayout.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,8 @@ class NBL_API IDescriptorSetLayout : public virtual core::IReferenceCounted
8787
E_DESCRIPTOR_TYPE type;
8888
uint32_t count;
8989
IShader::E_SHADER_STAGE stageFlags;
90+
// Use this if you want an immutable sampler that is baked into the DS layout itself.
91+
// If it is `nullptr`, the sampler is mutable and can be specified when writing the image descriptor to a binding while updating the DS.
9092
const core::smart_refctd_ptr<sampler_type>* samplers;
9193

9294
bool operator<(const SBinding& rhs) const

include/nbl/asset/IImage.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,9 +104,10 @@ class NBL_API IImage : public IDescriptor
104104
};
105105
enum E_TYPE : uint32_t
106106
{
107-
ET_1D,
107+
ET_1D = 0,
108108
ET_2D,
109-
ET_3D
109+
ET_3D,
110+
ET_COUNT
110111
};
111112
enum E_SAMPLE_COUNT_FLAGS : uint32_t
112113
{

include/nbl/asset/filters/CBlitImageFilter.h

Lines changed: 256 additions & 184 deletions
Large diffs are not rendered by default.
Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
2+
// This file is part of the "Nabla Engine".
3+
// For conditions of distribution and use, see copyright notice in nabla.h
4+
5+
#ifndef __NBL_ASSET_C_BLIT_UTILITIES_H_INCLUDED__
6+
#define __NBL_ASSET_C_BLIT_UTILITIES_H_INCLUDED__
7+
8+
#include "nbl/asset/filters/kernels/kernels.h"
9+
10+
namespace nbl::asset
11+
{
12+
class IBlitUtilities
13+
{
14+
public:
15+
static constexpr uint32_t MinAlphaBinCount = 256u;
16+
static constexpr uint32_t MaxAlphaBinCount = 4096u;
17+
static constexpr uint32_t DefaultAlphaBinCount = MinAlphaBinCount;
18+
19+
enum E_ALPHA_SEMANTIC : uint32_t
20+
{
21+
EAS_NONE_OR_PREMULTIPLIED = 0u, // just filter the channels independently (also works for a texture for blending equation `dstCol*(1-srcAlpha)+srcCol`)
22+
EAS_REFERENCE_OR_COVERAGE, // try to preserve coverage (percentage of pixels above a threshold value) across mipmap levels
23+
EAS_SEPARATE_BLEND, // compute a new alpha value for a texture to be used with the blending equation `mix(dstCol,srcCol,srcAlpha)`
24+
EAS_COUNT
25+
};
26+
27+
static inline core::vectorSIMDu32 getPhaseCount(const core::vectorSIMDu32& inExtent, const core::vectorSIMDu32& outExtent, const asset::IImage::E_TYPE inImageType)
28+
{
29+
core::vectorSIMDu32 result(0u);
30+
for (uint32_t i = 0u; i <= inImageType; ++i)
31+
result[i] = outExtent[i] / std::gcd(inExtent[i], outExtent[i]);
32+
return result;
33+
}
34+
35+
// we'll need to rescale the kernel support to be relative to the output image but in the input image coordinate system
36+
// (if support is 3 pixels, it needs to be 3 output texels, but measured in input texels)
37+
template<class Kernel>
38+
static inline auto constructScaledKernel(const Kernel& kernel, const core::vectorSIMDu32& inExtent, const core::vectorSIMDu32& outExtent)
39+
{
40+
const core::vectorSIMDf fInExtent(inExtent);
41+
const core::vectorSIMDf fOutExtent(outExtent);
42+
const auto fScale = fInExtent.preciseDivision(fOutExtent);
43+
return CScaledImageFilterKernel<Kernel>(fScale, kernel);
44+
}
45+
};
46+
47+
template <class KernelX = CBoxImageFilterKernel, class KernelY = KernelX, class KernelZ = KernelX>
48+
class CBlitUtilities : public IBlitUtilities
49+
{
50+
static_assert(std::is_same<typename KernelX::value_type, typename KernelY::value_type>::value&& std::is_same<typename KernelZ::value_type, typename KernelY::value_type>::value, "Kernel value_type need to be identical");
51+
52+
public:
53+
_NBL_STATIC_INLINE_CONSTEXPR auto MaxChannels = std::max<decltype(KernelX::MaxChannels)>(std::max<decltype(KernelX::MaxChannels)>(KernelX::MaxChannels, KernelY::MaxChannels), KernelZ::MaxChannels);
54+
55+
template <typename lut_value_type = KernelX::value_type>
56+
static inline size_t getScaledKernelPhasedLUTSize(const core::vectorSIMDu32& inExtent, const core::vectorSIMDu32& outExtent, const asset::IImage::E_TYPE inImageType,
57+
const KernelX& kernelX, const KernelY& kernelY, const KernelZ& kernelZ)
58+
{
59+
const auto scaledKernelX = constructScaledKernel(kernelX, inExtent, outExtent);
60+
const auto scaledKernelY = constructScaledKernel(kernelY, inExtent, outExtent);
61+
const auto scaledKernelZ = constructScaledKernel(kernelZ, inExtent, outExtent);
62+
63+
const auto phaseCount = getPhaseCount(inExtent, outExtent, inImageType);
64+
65+
return ((phaseCount[0] * scaledKernelX.getWindowSize().x) + (phaseCount[1] * scaledKernelY.getWindowSize().y) + (phaseCount[2] * scaledKernelZ.getWindowSize().z)) * sizeof(lut_value_type) * MaxChannels;
66+
}
67+
68+
template <typename lut_value_type = KernelX::value_type>
69+
static bool computeScaledKernelPhasedLUT(void* outKernelWeights, const core::vectorSIMDu32& inExtent, const core::vectorSIMDu32& outExtent, const asset::IImage::E_TYPE inImageType,
70+
const KernelX& kernelX, const KernelY& kernelY, const KernelZ& kernelZ)
71+
{
72+
const core::vectorSIMDu32 phaseCount = getPhaseCount(inExtent, outExtent, inImageType);
73+
74+
for (auto i = 0; i <= inImageType; ++i)
75+
{
76+
if (phaseCount[i] == 0)
77+
return false;
78+
}
79+
80+
const auto scaledKernelX = constructScaledKernel(kernelX, inExtent, outExtent);
81+
const auto scaledKernelY = constructScaledKernel(kernelY, inExtent, outExtent);
82+
const auto scaledKernelZ = constructScaledKernel(kernelZ, inExtent, outExtent);
83+
84+
const auto windowDims = getRealWindowSize(inImageType, scaledKernelX, scaledKernelY, scaledKernelZ);
85+
const auto axisOffsets = getScaledKernelPhasedLUTAxisOffsets<lut_value_type>(phaseCount, windowDims);
86+
87+
const core::vectorSIMDf fInExtent(inExtent);
88+
const core::vectorSIMDf fOutExtent(outExtent);
89+
const auto fScale = fInExtent.preciseDivision(fOutExtent);
90+
91+
// a dummy load functor
92+
// does nothing but fills up the `windowSample` with 1s (identity) so we can preserve the value of kernel
93+
// weights when eventually `windowSample` gets multiplied by them later in
94+
// `CFloatingPointSeparableImageFilterKernelBase<CRTP>::sample_functor_t<PreFilter,PostFilter>::operator()`
95+
// this exists only because `evaluateImpl` expects a pre filtering step.
96+
auto dummyLoad = [](double* windowSample, const core::vectorSIMDf&, const core::vectorSIMDi32&, const IImageFilterKernel::UserData*) -> void
97+
{
98+
for (auto h = 0; h < MaxChannels; h++)
99+
windowSample[h] = 1.0;
100+
};
101+
102+
double kernelWeight[MaxChannels];
103+
// actually used to put values in the LUT
104+
auto dummyEvaluate = [&kernelWeight](const double* windowSample, const core::vectorSIMDf&, const core::vectorSIMDi32&, const IImageFilterKernel::UserData*) -> void
105+
{
106+
for (auto h = 0; h < MaxChannels; h++)
107+
kernelWeight[h] = windowSample[h];
108+
};
109+
110+
auto computeForAxis = [&](const asset::IImage::E_TYPE axis, const auto& scaledKernel)
111+
{
112+
if (axis > inImageType)
113+
return;
114+
115+
const auto windowSize = scaledKernel.getWindowSize()[axis];
116+
117+
IImageFilterKernel::ScaleFactorUserData scale(1.f / fScale[axis]);
118+
const IImageFilterKernel::ScaleFactorUserData* otherScale = nullptr;
119+
switch (axis)
120+
{
121+
case IImage::ET_1D:
122+
otherScale = IImageFilterKernel::ScaleFactorUserData::cast(kernelX.getUserData());
123+
break;
124+
case IImage::ET_2D:
125+
otherScale = IImageFilterKernel::ScaleFactorUserData::cast(kernelY.getUserData());
126+
break;
127+
case IImage::ET_3D:
128+
otherScale = IImageFilterKernel::ScaleFactorUserData::cast(kernelZ.getUserData());
129+
break;
130+
}
131+
if (otherScale)
132+
{
133+
for (auto k = 0; k < MaxChannels; k++)
134+
scale.factor[k] *= otherScale->factor[k];
135+
}
136+
137+
lut_value_type* outKernelWeightsPixel = reinterpret_cast<lut_value_type*>(reinterpret_cast<uint8_t*>(outKernelWeights) + axisOffsets[axis]);
138+
for (uint32_t i = 0u; i < phaseCount[axis]; ++i)
139+
{
140+
core::vectorSIMDf tmp(0.f);
141+
tmp[axis] = float(i) + 0.5f;
142+
143+
const int32_t windowCoord = scaledKernel.getWindowMinCoord(tmp * fScale, tmp)[axis];
144+
145+
float relativePos = tmp[axis] - float(windowCoord); // relative position of the last pixel in window from current (ith) output pixel having a unique phase sequence of kernel evaluation points
146+
147+
for (int32_t j = 0; j < windowSize; ++j)
148+
{
149+
core::vectorSIMDf tmp(relativePos, 0.f, 0.f);
150+
scaledKernel.evaluateImpl(dummyLoad, dummyEvaluate, kernelWeight, tmp, core::vectorSIMDi32(), &scale);
151+
for (uint32_t ch = 0; ch < MaxChannels; ++ch)
152+
{
153+
if constexpr (std::is_same_v<lut_value_type, uint16_t>)
154+
outKernelWeightsPixel[(i * windowSize + j) * MaxChannels + ch] = core::Float16Compressor::compress(float(kernelWeight[ch]));
155+
else
156+
outKernelWeightsPixel[(i * windowSize + j) * MaxChannels + ch] = lut_value_type(kernelWeight[ch]);
157+
158+
}
159+
relativePos -= 1.f;
160+
}
161+
}
162+
};
163+
164+
computeForAxis(asset::IImage::ET_1D, scaledKernelX);
165+
computeForAxis(asset::IImage::ET_2D, scaledKernelY);
166+
computeForAxis(asset::IImage::ET_3D, scaledKernelZ);
167+
168+
return true;
169+
}
170+
171+
static inline core::vectorSIMDi32 getRealWindowSize(const IImage::E_TYPE inImageType,
172+
const CScaledImageFilterKernel<KernelX>& kernelX,
173+
const CScaledImageFilterKernel<KernelY>& kernelY,
174+
const CScaledImageFilterKernel<KernelZ>& kernelZ)
175+
{
176+
core::vectorSIMDi32 last(kernelX.getWindowSize().x, 0, 0, 0);
177+
if (inImageType >= IImage::ET_2D)
178+
last.y = kernelY.getWindowSize().y;
179+
if (inImageType >= IImage::ET_3D)
180+
last.z = kernelZ.getWindowSize().z;
181+
return last;
182+
}
183+
184+
template <typename lut_value_type = KernelX::value_type>
185+
static inline core::vectorSIMDu32 getScaledKernelPhasedLUTAxisOffsets(const core::vectorSIMDu32& phaseCount, const core::vectorSIMDi32& real_window_size)
186+
{
187+
core::vectorSIMDu32 result;
188+
result.x = 0u;
189+
result.y = (phaseCount[0] * real_window_size.x);
190+
result.z = ((phaseCount[0] * real_window_size.x) + (phaseCount[1] * real_window_size.y));
191+
return result * sizeof(lut_value_type) * MaxChannels;
192+
}
193+
};
194+
}
195+
196+
#endif

include/nbl/asset/filters/CMipMapGenerationImageFilter.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,8 @@ class NBL_API CMipMapGenerationImageFilter : public CImageFilter<CMipMapGenerati
122122
//not all kernels are default-constructible, this is going to be a problem (i already added appropriate ctor for blit filter state class though)
123123
//blit.kernel = Kernel(); // gets default constructed, we should probably do a `static_assert` about this property
124124
static_cast<state_base_t&>(blit) = *static_cast<const state_base_t*>(state);
125+
126+
pseudo_base_t::blit_utils_t::computeScaledKernelPhasedLUT(blit.scratchMemory + pseudo_base_t::getScratchOffset(&blit, pseudo_base_t::ESU_SCALED_KERNEL_PHASED_LUT), blit.inExtentLayerCount, blit.outExtentLayerCount, blit.inImage->getCreationParameters().type, blit.kernelX, blit.kernelY, blit.kernelZ);
125127
return blit;
126128
}
127129
};

include/nbl/asset/filters/kernels/kernels.h

Lines changed: 0 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -13,36 +13,6 @@ namespace nbl
1313
{
1414
namespace asset
1515
{
16-
17-
/*
18-
// caches weights, also should we call it Polyphase?
19-
template<class Kernel>
20-
class CMultiphaseKernel : public CImageFilterKernel<CMultiphaseKernel<Kernel> >, private Kernel
21-
{
22-
public:
23-
_NBL_STATIC_INLINE_CONSTEXPR bool is_separable = Kernel;
24-
25-
CMultiphaseKernel(Kernel&& k) : Kernel(std::move(k)
26-
{
27-
}
28-
29-
protected:
30-
static inline core::vectorSIMDu32 computePhases(const core::vectorSIMDu32& from, const core::vectorSIMDu32& to)
31-
{
32-
assert(!(to>from).any()); // Convolution Kernel cannot be used for upscaling!
33-
return from/core::gcd(to,from);
34-
}
35-
static inline uint32_t computePhaseStorage(const core::vectorSIMDu32& from, const core::vectorSIMDu32& to)
36-
{
37-
auto phases = computePhases(from,to);
38-
auto samplesInSupports = ceil();
39-
if constexpr(is_separable)
40-
{
41-
42-
}
43-
}
44-
};
45-
*/
4616

4717
// to be inline this function relies on any kernel's `create_sample_functor_t` being defined
4818
template<class CRTP, typename value_type>

include/nbl/asset/utils/ICPUVirtualTexture.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,11 @@ class NBL_API ICPUVirtualTexture final : public IVirtualTexture<ICPUImageView, I
145145
blit.scratchMemoryByteSize = blit_filter_t::getRequiredScratchByteSize(&blit);
146146
blit.scratchMemory = reinterpret_cast<uint8_t*>(_NBL_ALIGNED_MALLOC(blit.scratchMemoryByteSize, _NBL_SIMD_ALIGNMENT));
147147

148+
const core::vectorSIMDu32 inExtent(blit.inExtent.width, blit.inExtent.height, blit.inExtent.depth, 1);
149+
const core::vectorSIMDu32 outExtent(blit.outExtent.width, blit.outExtent.height, blit.outExtent.depth, 1);
150+
if (!blit_filter_t::blit_utils_t::computeScaledKernelPhasedLUT(blit.scratchMemory + blit_filter_t::getScratchOffset(&blit, blit_filter_t::ESU_SCALED_KERNEL_PHASED_LUT), inExtent, outExtent, blit.inImage->getCreationParameters().type, blit.kernelX, blit.kernelY, blit.kernelZ))
151+
return nullptr;
152+
148153
const bool blit_succeeded = blit_filter_t::execute(&blit);
149154
_NBL_ALIGNED_FREE(blit.scratchMemory);
150155
if (!blit_succeeded)

include/nbl/asset/utils/IGLSLCompiler.h

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,64 @@ class NBL_API IGLSLCompiler final : public core::IReferenceCounted
193193
return nbl::core::make_smart_refctd_ptr<ICPUShader>(std::move(outBuffer), IShader::buffer_contains_glsl_t{}, original->getStage(), std::string(original->getFilepathHint()));
194194
}
195195

196+
//! Returns the GLSL image format layout qualifier (e.g. `rgba8`) matching `format`.
/**
	Covers the floating point, normalized, signed integer and unsigned integer qualifiers GLSL defines for images.

	@param format Format of the storage image.
	@return Pointer to a string literal with the qualifier.
	For a format that has no GLSL qualifier the function asserts and returns an empty string.
*/
static inline const char* getStorageImageFormatQualifier(const asset::E_FORMAT format)
{
	switch (format)
	{
		// floating point
		case asset::EF_R32G32B32A32_SFLOAT:
			return "rgba32f";
		case asset::EF_R16G16B16A16_SFLOAT:
			return "rgba16f";
		case asset::EF_R32G32_SFLOAT:
			return "rg32f";
		case asset::EF_R16G16_SFLOAT:
			return "rg16f";
		case asset::EF_B10G11R11_UFLOAT_PACK32:
			return "r11f_g11f_b10f";
		case asset::EF_R32_SFLOAT:
			return "r32f";
		case asset::EF_R16_SFLOAT:
			return "r16f";

		// unsigned normalized
		case asset::EF_R16G16B16A16_UNORM:
			return "rgba16";
		case asset::EF_A2B10G10R10_UNORM_PACK32:
			return "rgb10_a2";
		case asset::EF_R8G8B8A8_UNORM:
			return "rgba8";
		case asset::EF_R16G16_UNORM:
			return "rg16";
		case asset::EF_R8G8_UNORM:
			return "rg8";
		case asset::EF_R16_UNORM:
			return "r16";
		case asset::EF_R8_UNORM:
			return "r8";

		// signed normalized
		case asset::EF_R16G16B16A16_SNORM:
			return "rgba16_snorm";
		case asset::EF_R8G8B8A8_SNORM:
			return "rgba8_snorm";
		case asset::EF_R16G16_SNORM:
			return "rg16_snorm";
		case asset::EF_R8G8_SNORM:
			return "rg8_snorm";
		case asset::EF_R16_SNORM:
			return "r16_snorm";
		case asset::EF_R8_SNORM:
			return "r8_snorm";

		// signed integer
		case asset::EF_R32G32B32A32_SINT:
			return "rgba32i";
		case asset::EF_R16G16B16A16_SINT:
			return "rgba16i";
		case asset::EF_R8G8B8A8_SINT:
			return "rgba8i";
		case asset::EF_R32G32_SINT:
			return "rg32i";
		case asset::EF_R16G16_SINT:
			return "rg16i";
		case asset::EF_R8G8_SINT:
			return "rg8i";
		case asset::EF_R32_SINT:
			return "r32i";
		case asset::EF_R16_SINT:
			return "r16i";
		case asset::EF_R8_SINT:
			return "r8i";

		// unsigned integer
		case asset::EF_R32G32B32A32_UINT:
			return "rgba32ui";
		case asset::EF_R16G16B16A16_UINT:
			return "rgba16ui";
		case asset::EF_A2B10G10R10_UINT_PACK32:
			return "rgb10_a2ui";
		case asset::EF_R8G8B8A8_UINT:
			return "rgba8ui";
		case asset::EF_R32G32_UINT:
			return "rg32ui";
		case asset::EF_R16G16_UINT:
			return "rg16ui";
		case asset::EF_R8G8_UINT:
			return "rg8ui";
		case asset::EF_R32_UINT:
			return "r32ui";
		case asset::EF_R16_UINT:
			return "r16ui";
		case asset::EF_R8_UINT:
			return "r8ui";

		default:
			// no GLSL image format qualifier exists for this format
			assert(false);
			return "";
	}
}
253+
196254
private:
197255
core::smart_refctd_ptr<IIncludeHandler> m_inclHandler;
198256
system::ISystem* m_system;
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#ifndef _NBL_GLSL_BLIT_ALPHA_TEST_INCLUDED_
2+
#define _NBL_GLSL_BLIT_ALPHA_TEST_INCLUDED_
3+
4+
#ifndef _NBL_GLSL_BLIT_ALPHA_TEST_MAIN_DEFINED_
5+
6+
#include <nbl/builtin/glsl/blit/parameters.glsl>
7+
8+
#ifndef _NBL_GLSL_BLIT_ALPHA_TEST_PASSED_COUNTER_DESCRIPTOR_DEFINED_
9+
#error _NBL_GLSL_BLIT_ALPHA_TEST_PASSED_COUNTER_DESCRIPTOR_DEFINED_ must be defined
10+
#endif
11+
12+
nbl_glsl_blit_parameters_t nbl_glsl_blit_getParameters();
13+
14+
float nbl_glsl_blit_alpha_test_getData(in uvec3 coord, in uint layerIdx);
15+
16+
// Per-invocation alpha test: every invocation that lies inside the input image fetches one alpha value
// and, when it is greater than the reference alpha, bumps the passed-pixel counter selected by the workgroup's Z index.
void nbl_glsl_blit_alpha_test_main()
{
	const uvec3 inputExtent = nbl_glsl_blit_parameters_getInputImageDimensions();
	const nbl_glsl_blit_parameters_t blitParams = nbl_glsl_blit_getParameters();

	// invocations falling outside of the input image have nothing to test
	if (any(greaterThanEqual(gl_GlobalInvocationID, inputExtent)))
		return;

	// the workgroup's Z index is what gets passed as the layer index and what picks the counter
	const uint layer = gl_WorkGroupID.z;

	const float texelAlpha = nbl_glsl_blit_alpha_test_getData(gl_GlobalInvocationID, layer);
	const bool passedTest = texelAlpha > blitParams.referenceAlpha;
	if (passedTest)
		atomicAdd(_NBL_GLSL_BLIT_ALPHA_TEST_PASSED_COUNTER_DESCRIPTOR_DEFINED_.data[layer].passedPixelCount, 1u);
}
28+
29+
#define _NBL_GLSL_BLIT_ALPHA_TEST_MAIN_DEFINED_
30+
#endif
31+
32+
#endif
33+

0 commit comments

Comments
 (0)