Skip to content

Commit 7a58da4

Browse files
author
devsh
committed
get the blit to compile and write output image
1 parent 6d743f3 commit 7a58da4

File tree

7 files changed

+62
-29
lines changed

7 files changed

+62
-29
lines changed

examples_tests

include/nbl/builtin/hlsl/blit/common.hlsl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
#include <nbl/builtin/hlsl/binding_info.hlsl>
88

9+
#include <nbl/builtin/hlsl/glsl_compat/core.hlsl>
910
namespace nbl
1011
{
1112
namespace hlsl
@@ -44,7 +45,7 @@ RWTexture3D<float4> outAs3D[ConstevalParameters::output_binding_t::Count];
4445

4546
groupshared uint32_t sMem[ConstevalParameters::SharedMemoryDWORDs];
4647

47-
[[vk::push_constant]] const nbl::hlsl::blit::SPerWorkgroup pc;
48+
[[vk::push_constant]] const nbl::hlsl::blit::Parameters pc;
4849

4950

5051
#include <nbl/builtin/hlsl/concepts.hlsl>

include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,25 @@ using namespace nbl::hlsl::blit;
6262
void main()
6363
{
6464
InImgAccessor inImgA;
65+
6566
OutImgAccessor outImgA;
67+
outImgA.descIx = pc.outputDescIx;
68+
69+
const uint16_t3 wgID = _static_cast<uint16_t3>(glsl::gl_WorkGroupID());
70+
const uint16_t3 baseCoord = pc.perWG.getOutputBaseCoord(wgID);
71+
// TODO: if and when someone can be bothered, change the blit API to compile one pipeline per image dimension; it may be faster
72+
switch (pc.perWG.imageDim)
73+
{
74+
case 1:
75+
outImgA.set(uint16_t1(baseCoord.x),wgID.z,float32_t4(1,0,1,1));
76+
break;
77+
case 2:
78+
outImgA.set(baseCoord.xy,wgID.z,float32_t4(1,0,1,1));
79+
break;
80+
case 3:
81+
outImgA.set(baseCoord,0xdeadu,float32_t4(1,0,1,1));
82+
break;
83+
}
6684
/*
6785
blit::compute_blit_t<ConstevalParameters> blit = blit::compute_blit_t<ConstevalParameters>::create(params);
6886
InCSAccessor inCSA;

include/nbl/builtin/hlsl/blit/parameters.hlsl

Lines changed: 29 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,11 @@ struct parameters_t
4545
// We do some dumb things with bitfields here like not using `vector<uint16_t,N>`, because AMD doesn't support them in push constants
4646
struct SPerWorkgroup
4747
{
48-
static inline SPerWorkgroup create(const float32_t3 _scale, const uint16_t3 output, const uint16_t3 preload, const uint16_t _otherPreloadOffset)
48+
static inline SPerWorkgroup create(const float32_t3 _scale, const uint16_t _imageDim, const uint16_t3 output, const uint16_t3 preload, const uint16_t _otherPreloadOffset)
4949
{
5050
SPerWorkgroup retval;
5151
retval.scale = _scale;
52+
retval.imageDim = _imageDim;
5253
retval.preloadWidth = preload[0];
5354
retval.preloadHeight = preload[1];
5455
retval.preloadDepth = preload[2];
@@ -59,34 +60,37 @@ struct SPerWorkgroup
5960
return retval;
6061
}
6162

62-
inline uint16_t3 getOutput() NBL_CONST_MEMBER_FUNC
63+
inline uint16_t3 getOutputBaseCoord(const uint16_t3 workgroup) NBL_CONST_MEMBER_FUNC
6364
{
64-
return uint16_t3(outputWidth,outputHeight,outputDepth);
65+
return workgroup*uint16_t3(outputWidth,outputHeight,outputDepth);
6566
}
6667

6768
inline uint16_t3 getWorkgroupCount(const uint16_t3 outExtent, const uint16_t layersToBlit=0) NBL_CONST_MEMBER_FUNC
6869
{
69-
uint16_t3 retval = uint16_t3(1,1,1);
70-
retval += (outExtent-uint16_t3(1,1,1))/getOutput();
70+
const uint16_t3 unit = uint16_t3(1,1,1);
71+
uint16_t3 retval = unit;
72+
retval += (outExtent-unit)/getOutputBaseCoord(unit);
7173
if (layersToBlit)
72-
retval[3] = layersToBlit;
74+
retval[2] = layersToBlit;
7375
return retval;
7476
}
7577

7678
#ifndef __HLSL_VERSION
77-
inline operator bool() const
79+
explicit inline operator bool() const
7880
{
7981
return outputWidth && outputHeight && outputDepth && preloadWidth && preloadHeight && preloadDepth;
8082
}
8183
#endif
8284

8385
// ratio of input pixels to output
8486
float32_t3 scale;
87+
// number of image dimensions: 1 for an image1D, 2 for an image2D, 3 for an image3D
88+
uint32_t imageDim : 2;
89+
uint32_t unused0 : 14; // channel, iterationRegionPrefixSums ?
8590
// 16 bits in each dimension because some GPUs actually have enough shared memory for 32k pixels
8691
uint32_t outputWidth : 16;
8792
uint32_t outputHeight : 16;
8893
uint32_t outputDepth : 16;
89-
uint32_t unused0 : 16; // channel, image type, iterationRegionPrefixSums ?
9094
uint32_t preloadWidth : 16;
9195
uint32_t preloadHeight : 16;
9296
uint32_t preloadDepth : 16;
@@ -97,22 +101,27 @@ struct SPerWorkgroup
97101

98102
struct Parameters
99103
{
100-
static Parameters create(
101-
const SPerWorkgroup perWG,
102-
const uint16_t3 inImageExtent, const uint16_t3 outImageExtent
103-
)
104+
#ifndef __HLSL_VERSION
105+
explicit inline operator bool() const
104106
{
105-
Parameters retval;
106-
retval.perWG = perWG;
107-
return retval;
107+
return bool(perWG);
108108
}
109+
#endif
109110

110-
SPerWorkgroup perWG;
111-
// general settings
112-
uint32_t lastChannel : 2;
113-
uint32_t coverage : 1;
114-
uint32_t unused : 29;
111+
SPerWorkgroup perWG; // rename to perBlitWG?
112+
//! general settings
113+
uint32_t inputDescIx : 19;
114+
uint32_t samplerDescIx : 11;
115+
uint32_t unused0 : 2;
116+
//
117+
uint32_t outputDescIx : 19;
118+
uint32_t channelCount : 3;
119+
uint32_t unused1 : 10;
120+
//
121+
uint32_t unused2 : 12;
115122
//! coverage settings
123+
uint32_t intermAlphaDescIx : 19;
124+
uint32_t coverage : 1;
116125
// the atomic count of passing pixels is compared against this value, so that we can recover the original coverage
117126
uint32_t inPixelCount;
118127
};

include/nbl/video/IGPUCommandBuffer.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -416,6 +416,11 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject
416416

417417
//! dispatches
418418
bool dispatch(const uint32_t groupCountX, const uint32_t groupCountY=1, const uint32_t groupCountZ=1);
419+
template<typename T> requires std::is_integral_v<T>
420+
bool dispatch(const hlsl::vector<T,3> groupCount)
421+
{
422+
return dispatch(groupCount.x,groupCount.y,groupCount.z);
423+
}
419424
bool dispatchIndirect(const asset::SBufferBinding<const IGPUBuffer>& binding);
420425

421426
//! Begin/End RenderPasses

include/nbl/video/utilities/CComputeBlit.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -151,16 +151,16 @@ class CComputeBlit : public core::IReferenceCounted
151151
template <typename BlitUtilities>
152152
static inline hlsl::blit::SPerWorkgroup computePerWorkGroup(
153153
const uint16_t sharedMemorySize, const typename BlitUtilities::convolution_kernels_t& kernels, const IGPUImage::E_TYPE type,
154-
const bool halfPrecision, const hlsl::uint16_t3 inExtent, const hlsl::uint16_t3 outExtent
154+
const hlsl::uint16_t3 inExtent, const hlsl::uint16_t3 outExtent, const bool halfPrecision=false
155155
)
156156
{
157157
const hlsl::float32_t3 minSupport(std::get<0>(kernels).getMinSupport(), std::get<1>(kernels).getMinSupport(), std::get<2>(kernels).getMinSupport());
158158
const hlsl::float32_t3 maxSupport(std::get<0>(kernels).getMaxSupport(), std::get<1>(kernels).getMaxSupport(), std::get<2>(kernels).getMaxSupport());
159-
return computePerWorkGroup(sharedMemorySize,minSupport,maxSupport,type,halfPrecision);
159+
return computePerWorkGroup(sharedMemorySize,minSupport,maxSupport,type,inExtent,outExtent,halfPrecision);
160160
}
161-
static hlsl::blit::SPerWorkgroup computePerWorkGroup(
161+
NBL_API2 static hlsl::blit::SPerWorkgroup computePerWorkGroup(
162162
const uint16_t sharedMemorySize, const hlsl::float32_t3 minSupportInOutput, const hlsl::float32_t3 maxSupportInOutput, const IGPUImage::E_TYPE type,
163-
const bool halfPrecision, const hlsl::uint16_t3 inExtent, const hlsl::uint16_t3 outExtent
163+
const hlsl::uint16_t3 inExtent, const hlsl::uint16_t3 outExtent, const bool halfPrecision=false
164164
);
165165

166166
#if 0

src/nbl/video/utilities/CComputeBlit.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -117,8 +117,8 @@ struct ConstevalParameters
117117
}
118118

119119
SPerWorkgroup CComputeBlit::computePerWorkGroup(
120-
const uint16_t sharedMemorySize, const float32_t3 minSupportInOutput, const float32_t3 maxSupportInOutput,
121-
const IGPUImage::E_TYPE type, const bool halfPrecision, const uint16_t3 inExtent, const uint16_t3 outExtent
120+
const uint16_t sharedMemorySize, const float32_t3 minSupportInOutput, const float32_t3 maxSupportInOutput, const IGPUImage::E_TYPE type,
121+
const uint16_t3 inExtent, const uint16_t3 outExtent, const bool halfPrecision
122122
)
123123
{
124124
SPerWorkgroup retval;
@@ -157,7 +157,7 @@ SPerWorkgroup CComputeBlit::computePerWorkGroup(
157157
if (requiredSharedMemory>size_t(sharedMemorySize))
158158
break;
159159
// still fits, update return value
160-
retval = SPerWorkgroup::create(scale,output,preload,otherPreloadOffset);
160+
retval = SPerWorkgroup::create(scale,Dims,output,preload,otherPreloadOffset);
161161
}
162162

163163
// we want to fix the dimension that's the smallest, so that we increase the volume of the support by the smallest increment and stay close to a cube shape

0 commit comments

Comments
 (0)