Skip to content

Commit 6d743f3

Browse files
author
devsh
committed
start computing the push constants
1 parent 0fac0cb commit 6d743f3

File tree

7 files changed

+191
-251
lines changed

7 files changed

+191
-251
lines changed

include/nbl/builtin/hlsl/blit/common.hlsl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ RWTexture3D<float4> outAs3D[ConstevalParameters::output_binding_t::Count];
4444

4545
groupshared uint32_t sMem[ConstevalParameters::SharedMemoryDWORDs];
4646

47+
[[vk::push_constant]] const nbl::hlsl::blit::SPerWorkgroup pc;
48+
4749

4850
#include <nbl/builtin/hlsl/concepts.hlsl>
4951
/*

include/nbl/builtin/hlsl/blit/compute_blit.hlsl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ struct compute_blit_t
7777
uint16_t localInvocationIndex)
7878
{
7979
const float3 halfScale = scale * float3(0.5f, 0.5f, 0.5f);
80+
// bottom of the input tile
8081
const uint32_t3 minOutputPixel = workGroupID * outputTexelsPerWG;
8182
const float3 minOutputPixelCenterOfWG = float3(minOutputPixel)*scale + halfScale;
8283
// this can be negative, in which case HW sampler takes care of wrapping for us

include/nbl/builtin/hlsl/blit/parameters.hlsl

Lines changed: 70 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ namespace blit
1414

1515
struct parameters_t
1616
{
17-
float32_t3 fScale;
17+
float32_t3 fScale; //
1818
float32_t3 negativeSupport;
1919
float32_t referenceAlpha;
2020
uint32_t kernelWeightsOffsetY;
@@ -24,17 +24,15 @@ struct parameters_t
2424

2525
uint16_t3 inputDims;
2626
uint16_t3 outputDims;
27-
uint16_t3 windowDims;
27+
uint16_t3 windowDims; //
2828
uint16_t3 phaseCount;
29-
uint16_t3 preloadRegion;
29+
uint16_t3 preloadRegion; //
3030
uint16_t3 iterationRegionXPrefixProducts;
3131
uint16_t3 iterationRegionYPrefixProducts;
3232
uint16_t3 iterationRegionZPrefixProducts;
3333

34-
//! Offset into the shared memory array which tells us from where the second buffer of shared memory begins
35-
//! Given by max(memory_for_preload_region, memory_for_result_of_y_pass)
36-
uint16_t secondScratchOffset;
37-
uint16_t outputTexelsPerWGZ;
34+
uint16_t secondScratchOffset; //
35+
uint16_t outputTexelsPerWGZ; //
3836

3937
uint32_t3 getOutputTexelsPerWG()
4038
{
@@ -44,36 +42,79 @@ struct parameters_t
4442
}
4543
};
4644

47-
struct parameters2_t
45+
// We do some dumb things with bitfields here like not using `vector<uint16_t,N>`, because AMD doesn't support them in push constants
46+
struct SPerWorkgroup
4847
{
49-
float32_t3 fScale;
50-
float32_t3 negativeSupportMinusHalf;
51-
float32_t referenceAlpha;
52-
uint32_t kernelWeightsOffsetY;
53-
uint32_t kernelWeightsOffsetZ;
54-
uint32_t inPixelCount;
55-
uint32_t outPixelCount;
48+
// Builds an `SPerWorkgroup` from unpacked arguments.
// `_scale` is stored as-is, `output` and `preload` are split component-wise into
// their per-axis bitfields, and `_otherPreloadOffset` lands in `otherPreloadOffset`.
// NOTE: `unused0` is not written by this function.
static inline SPerWorkgroup create(const float32_t3 _scale, const uint16_t3 output, const uint16_t3 preload, const uint16_t _otherPreloadOffset)
{
    SPerWorkgroup packed;
    packed.scale = _scale;
    // output extent, one bitfield per axis
    packed.outputWidth = output[0];
    packed.outputHeight = output[1];
    packed.outputDepth = output[2];
    // preload extent, one bitfield per axis
    packed.preloadWidth = preload[0];
    packed.preloadHeight = preload[1];
    packed.preloadDepth = preload[2];
    packed.otherPreloadOffset = _otherPreloadOffset;
    return packed;
}
5661

57-
uint16_t3 inputDims;
58-
uint16_t3 outputDims;
59-
uint16_t3 windowDims;
60-
uint16_t3 phaseCount;
61-
uint16_t3 preloadRegion;
62-
uint16_t3 iterationRegionXPrefixProducts;
63-
uint16_t3 iterationRegionYPrefixProducts;
64-
uint16_t3 iterationRegionZPrefixProducts;
62+
// Reassembles the three output bitfields into one vector ordered (width, height, depth).
inline uint16_t3 getOutput() NBL_CONST_MEMBER_FUNC
{
    const uint16_t3 outputExtent = uint16_t3(outputWidth,outputHeight,outputDepth);
    return outputExtent;
}
66+
67+
// Computes how many workgroups are needed to cover `outExtent`, given that each
// workgroup covers `getOutput()` elements per axis.
//  - outExtent:    extent to cover; each component is expected to be non-zero, since
//                  `outExtent-1` is computed in unsigned 16-bit arithmetic.
//  - layersToBlit: when non-zero, replaces the last (Z) component of the result.
//                  Presumably layered images are dispatched with one workgroup
//                  slice per layer along Z -- confirm against the caller.
// Returns the per-axis workgroup count, i.e. ceil(outExtent/getOutput()) component-wise.
// NOTE(review): assumes every component of `getOutput()` is non-zero, otherwise
// this divides by zero.
inline uint16_t3 getWorkgroupCount(const uint16_t3 outExtent, const uint16_t layersToBlit=0) NBL_CONST_MEMBER_FUNC
{
    // ceil-division written as 1 + (extent-1)/perWorkgroup
    uint16_t3 retval = uint16_t3(1,1,1);
    retval += (outExtent-uint16_t3(1,1,1))/getOutput();
    if (layersToBlit)
        retval[2] = layersToBlit; // a 3-component vector only has indices 0..2
    return retval;
}
6575

76+
#ifndef __HLSL_VERSION
77+
inline operator bool() const
78+
{
79+
return outputWidth && outputHeight && outputDepth && preloadWidth && preloadHeight && preloadDepth;
80+
}
81+
#endif
82+
83+
// ratio of input pixels to output
84+
float32_t3 scale;
85+
// 16bit in each dimension because some GPUs actually have enough shared memory for 32k pixels
86+
uint32_t outputWidth : 16;
87+
uint32_t outputHeight : 16;
88+
uint32_t outputDepth : 16;
89+
uint32_t unused0 : 16; // channel, image type, iterationRegionPrefixSums ?
90+
uint32_t preloadWidth : 16;
91+
uint32_t preloadHeight : 16;
92+
uint32_t preloadDepth : 16;
6693
//! Offset into the shared memory array which tells us from where the second buffer of shared memory begins
6794
//! Given by max(memory_for_preload_region, memory_for_result_of_y_pass)
68-
uint16_t secondScratchOffset;
69-
uint16_t outputTexelsPerWGZ;
95+
uint32_t otherPreloadOffset : 16;
96+
};
7097

71-
uint32_t3 getOutputTexelsPerWG()
98+
struct Parameters
99+
{
100+
static Parameters create(
101+
const SPerWorkgroup perWG,
102+
const uint16_t3 inImageExtent, const uint16_t3 outImageExtent
103+
)
72104
{
73-
//! `outputTexelsPerWG.xy` just happens to be in the first components of `iterationRegionsXPrefixProducts` and `iterationRegionYPrefixProducts` --this is
74-
//! the result of how we choose to iterate, i.e. if, in the future, we decide to iterate differently, this needs to change.
75-
return uint32_t3(iterationRegionXPrefixProducts.x, iterationRegionYPrefixProducts.x, outputTexelsPerWGZ);
105+
Parameters retval;
106+
retval.perWG = perWG;
107+
return retval;
76108
}
109+
110+
SPerWorkgroup perWG;
111+
// general settings
112+
uint32_t lastChannel : 2;
113+
uint32_t coverage : 1;
114+
uint32_t unused : 29;
115+
//! coverage settings
116+
// required to compare the atomic count of passing pixels against, so we can get original coverage
117+
uint32_t inPixelCount;
77118
};
78119

79120

include/nbl/builtin/hlsl/cpp_compat/intrinsics.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,14 @@ NBL_BIT_OP_GLM_PASSTHROUGH(findLSB,findLSB)
6666

6767
NBL_BIT_OP_GLM_PASSTHROUGH(findMSB,findMSB)
6868

69+
// TODO: some of the functions in this header should move to `tgmath`
70+
// Forwards to `glm::floor`; constrained to types for which
// `::nbl::hlsl::is_floating_point_v<T>` holds.
template<typename T> requires ::nbl::hlsl::is_floating_point_v<T>
inline T floor(const T& v)
{
    const T floored = glm::floor(v);
    return floored;
}
75+
76+
6977
// inverse not defined because it's implemented via hidden friend
7078
template<typename T, uint16_t N, uint16_t M>
7179
inline matrix<T,N,M> inverse(const matrix<T,N,M>& m)

include/nbl/builtin/hlsl/type_traits.hlsl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -606,6 +606,8 @@ NBL_CONSTEXPR bool is_unsigned_v = is_unsigned<T>::value;
606606
template<class T>
607607
NBL_CONSTEXPR bool is_integral_v = is_integral<T>::value;
608608
template<class T>
609+
NBL_CONSTEXPR bool is_floating_point_v = is_floating_point<T>::value;
610+
template<class T>
609611
NBL_CONSTEXPR bool is_signed_v = is_signed<T>::value;
610612
template<class T>
611613
NBL_CONSTEXPR bool is_scalar_v = is_scalar<T>::value;

0 commit comments

Comments
 (0)