@@ -14,7 +14,7 @@ namespace blit
14
14
15
15
struct parameters_t
16
16
{
17
- float32_t3 fScale;
17
+ float32_t3 fScale; //
18
18
float32_t3 negativeSupport;
19
19
float32_t referenceAlpha;
20
20
uint32_t kernelWeightsOffsetY;
@@ -24,17 +24,15 @@ struct parameters_t
24
24
25
25
uint16_t3 inputDims;
26
26
uint16_t3 outputDims;
27
- uint16_t3 windowDims;
27
+ uint16_t3 windowDims; //
28
28
uint16_t3 phaseCount;
29
- uint16_t3 preloadRegion;
29
+ uint16_t3 preloadRegion; //
30
30
uint16_t3 iterationRegionXPrefixProducts;
31
31
uint16_t3 iterationRegionYPrefixProducts;
32
32
uint16_t3 iterationRegionZPrefixProducts;
33
33
34
- //! Offset into the shared memory array which tells us from where the second buffer of shared memory begins
35
- //! Given by max(memory_for_preload_region, memory_for_result_of_y_pass)
36
- uint16_t secondScratchOffset;
37
- uint16_t outputTexelsPerWGZ;
34
+ uint16_t secondScratchOffset; //
35
+ uint16_t outputTexelsPerWGZ; //
38
36
39
37
uint32_t3 getOutputTexelsPerWG ()
40
38
{
@@ -44,36 +42,79 @@ struct parameters_t
44
42
}
45
43
};
46
44
47
- struct parameters2_t
45
+ // We do some dumb things with bitfields here like not using `vector<uint16_t,N>`, because AMD doesn't support them in push constants
46
+ struct SPerWorkgroup
48
47
{
49
- float32_t3 fScale;
50
- float32_t3 negativeSupportMinusHalf;
51
- float32_t referenceAlpha;
52
- uint32_t kernelWeightsOffsetY;
53
- uint32_t kernelWeightsOffsetZ;
54
- uint32_t inPixelCount;
55
- uint32_t outPixelCount;
48
+ static inline SPerWorkgroup create (const float32_t3 _scale, const uint16_t3 output, const uint16_t3 preload, const uint16_t _otherPreloadOffset) // factory: packs the scale, per-workgroup output extent, preload extent and second scratch offset into the bitfield members
49
+ {
50
+ SPerWorkgroup retval;
51
+ retval.scale = _scale;
52
+ retval.preloadWidth = preload[0 ]; // preload extent is split into three 16-bit bitfields
53
+ retval.preloadHeight = preload[1 ];
54
+ retval.preloadDepth = preload[2 ];
55
+ retval.outputWidth = output[0 ]; // output extent is split the same way
56
+ retval.outputHeight = output[1 ];
57
+ retval.outputDepth = output[2 ];
58
+ retval.otherPreloadOffset = _otherPreloadOffset;
59
+ return retval; // NOTE(review): `unused0` is never assigned here, so its bits are whatever `retval` started with - confirm nobody compares or hashes the raw struct
60
+ }
56
61
57
- uint16_t3 inputDims;
58
- uint16_t3 outputDims;
59
- uint16_t3 windowDims;
60
- uint16_t3 phaseCount;
61
- uint16_t3 preloadRegion;
62
- uint16_t3 iterationRegionXPrefixProducts;
63
- uint16_t3 iterationRegionYPrefixProducts;
64
- uint16_t3 iterationRegionZPrefixProducts;
62
+ inline uint16_t3 getOutput () NBL_CONST_MEMBER_FUNC // reassembles the three output bitfields into one vector
63
+ {
64
+ return uint16_t3 (outputWidth,outputHeight,outputDepth); // component order is (width, height, depth)
65
+ }
66
+
67
+ inline uint16_t3 getWorkgroupCount (const uint16_t3 outExtent, const uint16_t layersToBlit=0 ) NBL_CONST_MEMBER_FUNC // workgroups per axis needed to cover `outExtent`; a non-zero `layersToBlit` overrides the last axis
68
+ {
69
+ uint16_t3 retval = uint16_t3 (1 ,1 ,1 );
70
+ retval += (outExtent-uint16_t3 (1 ,1 ,1 ))/getOutput (); // 1+(extent-1)/perWG is a component-wise round-up division; assumes every component of `outExtent` and of `getOutput()` is >= 1
71
+ if (layersToBlit)
72
+ retval[2 ] = layersToBlit; // was `retval[3]`, which is past the end of a 3-component vector (valid indices are 0..2)
73
+ return retval;
74
+ }
65
75
76
+ #ifndef __HLSL_VERSION // this conversion operator is only compiled when __HLSL_VERSION is not defined
77
+ inline operator bool () const // true only when all six extents are non-zero
78
+ {
79
+ return outputWidth && outputHeight && outputDepth && preloadWidth && preloadHeight && preloadDepth; // `scale`, `unused0` and `otherPreloadOffset` are not part of the check
80
+ }
81
+ #endif // __HLSL_VERSION
82
+
83
+ // ratio of input pixels to output
84
+ float32_t3 scale;
85
+ // 16bit in each dimension because some GPUs actually have enough shared memory for 32k pixels
86
+ uint32_t outputWidth : 16 ;
87
+ uint32_t outputHeight : 16 ;
88
+ uint32_t outputDepth : 16 ;
89
+ uint32_t unused0 : 16 ; // channel, image type, iterationRegionPrefixSums ?
90
+ uint32_t preloadWidth : 16 ;
91
+ uint32_t preloadHeight : 16 ;
92
+ uint32_t preloadDepth : 16 ;
66
93
//! Offset into the shared memory array which tells us from where the second buffer of shared memory begins
67
94
//! Given by max(memory_for_preload_region, memory_for_result_of_y_pass)
68
- uint16_t secondScratchOffset ;
69
- uint16_t outputTexelsPerWGZ ;
95
+ uint32_t otherPreloadOffset : 16 ;
96
+ } ;
70
97
71
- uint32_t3 getOutputTexelsPerWG ()
98
+ struct Parameters
99
+ {
100
+ static Parameters create ( // factory: currently only copies `perWG` into the result
101
+ const SPerWorkgroup perWG,
102
+ const uint16_t3 inImageExtent, const uint16_t3 outImageExtent
103
+ )
72
104
{
73
- //! `outputTexelsPerWG.xy` just happens to be in the first components of `iterationRegionsXPrefixProducts` and `iterationRegionYPrefixProducts` --this is
74
- //! the result of how we choose to iterate, i.e. if, in the future, we decide to iterate differently, this needs to change.
75
- return uint32_t3 (iterationRegionXPrefixProducts.x, iterationRegionYPrefixProducts.x, outputTexelsPerWGZ) ;
105
+ Parameters retval;
106
+ retval.perWG = perWG; // NOTE(review): `inImageExtent` and `outImageExtent` are never read in this function
107
+ return retval ; // NOTE(review): `lastChannel`, `coverage`, `unused` and `inPixelCount` are never assigned here - confirm the caller fills them in
76
108
}
109
+
110
+ SPerWorkgroup perWG;
111
+ // general settings
112
+ uint32_t lastChannel : 2 ;
113
+ uint32_t coverage : 1 ;
114
+ uint32_t unused : 29 ;
115
+ //! coverage settings
116
+ // required to compare the atomic count of passing pixels against, so we can get original coverage
117
+ uint32_t inPixelCount;
77
118
};
78
119
79
120
0 commit comments