@@ -18,7 +18,7 @@ namespace accessors
#define NBL_CONCEPT_PARAM_0 (accessor, T)
#define NBL_CONCEPT_PARAM_1 (val, V)
#define NBL_CONCEPT_PARAM_2 (index, I)
NBL_CONCEPT_BEGIN(3)


what did you do, this is needed!

#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0
#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1
#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2
178 changes: 178 additions & 0 deletions include/nbl/builtin/hlsl/workgroup/spd.hlsl
@@ -0,0 +1,178 @@
#include <nbl/builtin/hlsl/cpp_compat.hlsl>
#include <nbl/builtin/hlsl/concepts.hlsl>
#include <nbl/builtin/hlsl/glsl_compat/subgroup_quad.hlsl>

#ifndef _NBL_BUILTIN_HLSL_WORKGROUP_SPD_INCLUDED_
#define _NBL_BUILTIN_HLSL_WORKGROUP_SPD_INCLUDED_

// ------------------------------- COMMON -----------------------------------------

namespace nbl
{
namespace hlsl
{
namespace workgroup
{
namespace spd
{
namespace impl
{
template<typename Reducer>
float32_t4 subgroupQuadReduce(NBL_CONST_REF_ARG(Reducer) reducer, float32_t4 v)
{
const float32_t4 v0 = v;
const float32_t4 v1 = glsl::subgroupQuadSwapHorizontal(v);
const float32_t4 v2 = glsl::subgroupQuadSwapVertical(v);
const float32_t4 v3 = glsl::subgroupQuadSwapDiagonal(v);
return reducer.reduce(v0, v1, v2, v3);
}

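// Reduces a 64x64 input tile: each invocation reduces one 2x2 source quad per quadrant and writes the
// four resulting mip 0 texels, then subgroup quad reductions produce mip 1, which is also stashed in shared memory for the later mips.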
template <typename Reducer, typename SrcImageAccessor, typename DstImageAccessor, typename SharedMemoryAccessor>
void downsampleMips_0_1(uint32_t2 coord, uint32_t2 workGroupID, uint32_t localInvocationIndex, uint32_t mip, uint32_t slice, NBL_CONST_REF_ARG(Reducer) reducer, NBL_CONST_REF_ARG(SrcImageAccessor) srcImage, NBL_REF_ARG(DstImageAccessor) dstImage, NBL_REF_ARG(SharedMemoryAccessor) sharedMem)
{
float32_t4 v[4];

uint32_t x = coord.x;
uint32_t y = coord.y;

int32_t2 tex = int32_t2(workGroupID.xy * 64) + int32_t2(x * 2, y * 2);
int32_t2 pix = int32_t2(workGroupID.xy * 32) + int32_t2(x, y);
v[0] = srcImage.reduce(tex, slice);
dstImage.set(pix, v[0], 0, slice);

tex = int32_t2(workGroupID.xy * 64) + int32_t2(x * 2 + 32, y * 2);
pix = int32_t2(workGroupID.xy * 32) + int32_t2(x + 16, y);
v[1] = srcImage.reduce(tex, slice);
dstImage.set(pix, v[1], 0, slice);

tex = int32_t2(workGroupID.xy * 64) + int32_t2(x * 2, y * 2 + 32);
pix = int32_t2(workGroupID.xy * 32) + int32_t2(x, y + 16);
v[2] = srcImage.reduce(tex, slice);
dstImage.set(pix, v[2], 0, slice);

tex = int32_t2(workGroupID.xy * 64) + int32_t2(x * 2 + 32, y * 2 + 32);
pix = int32_t2(workGroupID.xy * 32) + int32_t2(x + 16, y + 16);
v[3] = srcImage.reduce(tex, slice);
dstImage.set(pix, v[3], 0, slice);

if (mip <= 1)
return;

v[0] = subgroupQuadReduce(reducer, v[0]);
v[1] = subgroupQuadReduce(reducer, v[1]);
v[2] = subgroupQuadReduce(reducer, v[2]);
v[3] = subgroupQuadReduce(reducer, v[3]);

if ((localInvocationIndex % 4) == 0)
{
dstImage.set(int32_t2(workGroupID.xy * 16) + int32_t2(x / 2, y / 2), v[0], 1, slice);
sharedMem.set(int32_t2(x / 2, y / 2), v[0]);

dstImage.set(int32_t2(workGroupID.xy * 16) + int32_t2(x / 2 + 8, y / 2), v[1], 1, slice);
sharedMem.set(int32_t2(x / 2 + 8, y / 2), v[1]);

dstImage.set(int32_t2(workGroupID.xy * 16) + int32_t2(x / 2, y / 2 + 8), v[2], 1, slice);
sharedMem.set(int32_t2(x / 2, y / 2 + 8), v[2]);

dstImage.set(int32_t2(workGroupID.xy * 16) + int32_t2(x / 2 + 8, y / 2 + 8), v[3], 1, slice);
sharedMem.set(int32_t2(x / 2 + 8, y / 2 + 8), v[3]);
}
}

template <typename Reducer, typename SrcImageAccessor, typename DstImageAccessor, typename SharedMemoryAccessor>
void downsampleMip_2(uint32_t2 coord, uint32_t2 workGroupID, uint32_t localInvocationIndex, uint32_t mip, uint32_t slice, NBL_CONST_REF_ARG(Reducer) reducer, NBL_CONST_REF_ARG(SrcImageAccessor) srcImage, NBL_REF_ARG(DstImageAccessor) dstImage, NBL_REF_ARG(SharedMemoryAccessor) sharedMem)
{
float32_t4 v = sharedMem.get(coord);
v = subgroupQuadReduce(reducer, v);
if (localInvocationIndex % 4 == 0)
{
dstImage.set(int32_t2(workGroupID.xy * 8) + int32_t2(coord.x / 2, coord.y / 2), v, mip, slice);

// store to LDS, try to reduce bank conflicts
// x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
// 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x
// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
// x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
// ...
// x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
sharedMem.set(int32_t2(coord.x + (coord.y / 2) % 2, coord.y), v);
}
}

template <typename Reducer, typename SrcImageAccessor, typename DstImageAccessor, typename SharedMemoryAccessor>
void downsampleMip_3(uint32_t2 coord, uint32_t2 workGroupID, uint32_t localInvocationIndex, uint32_t mip, uint32_t slice, NBL_CONST_REF_ARG(Reducer) reducer, NBL_CONST_REF_ARG(SrcImageAccessor) srcImage, NBL_REF_ARG(DstImageAccessor) dstImage, NBL_REF_ARG(SharedMemoryAccessor) sharedMem)
{
if (localInvocationIndex < 64)
{
float32_t4 v = sharedMem.get(int32_t2(coord.x * 2 + coord.y % 2, coord.y * 2));
v = subgroupQuadReduce(reducer, v);
if (localInvocationIndex % 4 == 0)
{
dstImage.set(int32_t2(workGroupID.xy * 4) + int32_t2(coord.x / 2, coord.y / 2), v, mip, slice);
// store to LDS
// x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0
// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
// 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0
// ...
// 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0
// ...
// 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x
// ...
sharedMem.set(int32_t2(coord.x * 2 + coord.y / 2, coord.y * 2), v);
}
}
}

template <typename Reducer, typename SrcImageAccessor, typename DstImageAccessor, typename SharedMemoryAccessor>
void downsampleMip_4(uint32_t2 coord, uint32_t2 workGroupID, uint32_t localInvocationIndex, uint32_t mip, uint32_t slice, NBL_CONST_REF_ARG(Reducer) reducer, NBL_CONST_REF_ARG(SrcImageAccessor) srcImage, NBL_REF_ARG(DstImageAccessor) dstImage, NBL_REF_ARG(SharedMemoryAccessor) sharedMem)
{
if (localInvocationIndex < 16)
{
float32_t4 v = sharedMem.get(int32_t2(coord.x * 4 + coord.y, coord.y * 4));
v = subgroupQuadReduce(reducer, v);
if (localInvocationIndex % 4 == 0)
{
dstImage.set(int32_t2(workGroupID.xy * 2) + int32_t2(coord.x / 2, coord.y / 2), v, mip, slice);
// store to LDS
// x x x x 0 ...
// 0 ...
sharedMem.set(int32_t2(coord.x / 2 + coord.y, 0), v);
}

}
}

template <typename Reducer, typename SrcImageAccessor, typename DstImageAccessor, typename SharedMemoryAccessor>
void downsampleMip_5(uint32_t2 coord, uint32_t2 workGroupID, uint32_t localInvocationIndex, uint32_t mip, uint32_t slice, NBL_CONST_REF_ARG(Reducer) reducer, NBL_CONST_REF_ARG(SrcImageAccessor) srcImage, NBL_REF_ARG(DstImageAccessor) dstImage, NBL_REF_ARG(SharedMemoryAccessor) sharedMem)
{
if (localInvocationIndex < 4)
{
float32_t4 v = sharedMem.get(int32_t2(localInvocationIndex,0));
v = subgroupQuadReduce(reducer, v);
// quad index 0 stores result
if (localInvocationIndex % 4 == 0)
{
dstImage.set(int32_t2(workGroupID.xy), v, mip, slice);
}
}
}
}

struct SPD
{
Comment on lines +163 to +165


you need to template your struct on a Config struct like the workgroup scans or BxDFs have, but yours will have:

  • arithmetic texel type (what you call your binop with)
  • binop type, e.g. nbl::hlsl::plus<arithmetic_texel_t>
  • storage texel type (what you pump into output image and scratch)
  • conversion method between arithmetic and storage texel type
  • input "tile" size (how many mip levels you can reduce with a single workgroup)
  • output mipmap count (absolute max is 15, because that's the max HW texture size)
  • how many rounds of workgroup reduction are needed to downsample the whole image (e.g. if a workgroup can do 6 or 7 at once, you simply divide the output mipmap count by this number and round up)
  • how many workgroups output to a single input in the final round
  • number of dwords (uints) reserved for the scheduler (to do "last one out closes the door" single pass downsampling)
  • subgroup size
  • workgroup size

The last two you may store indirectly because a Workgroup2 Reduction Config will be needed, so subgroup size and workgroup size come into play there. Although it might end up being that each round needs its own workgroup2 reduction config.
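A rough sketch of what such a Config could look like (every name below is a placeholder for illustration, not the final API):

// illustrative sketch only - all names are placeholders, not an existing Nabla API
template<
    typename ArithmeticTexelT,   // what the binop gets called with
    typename BinOp,              // e.g. nbl::hlsl::plus<ArithmeticTexelT>
    typename StorageTexelT,      // what gets pumped into the output image and global scratch
    typename TexelConverter,     // converts between ArithmeticTexelT and StorageTexelT
    uint32_t InputTileSizeLog2,  // e.g. 6 -> 64x64 input tile, i.e. 6 mip levels reduced per workgroup round
    uint32_t OutputMipCount,     // absolute max 15
    uint32_t WorkgroupSizeLog2,  // these two could instead live in a nested workgroup2 reduction config
    uint32_t SubgroupSizeLog2
>
struct SPDConfig
{
    NBL_CONSTEXPR_STATIC_INLINE uint32_t MipsPerRound = InputTileSizeLog2;
    // rounds of workgroup reduction needed for the whole image: divide the output mip count and round up
    NBL_CONSTEXPR_STATIC_INLINE uint32_t Rounds = (OutputMipCount + MipsPerRound - 1) / MipsPerRound;
    // workgroups whose outputs feed a single input patch of the final round
    NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupsPerLastPatch = uint32_t(1) << (2 * InputTileSizeLog2);
    // dwords (uints) reserved for the scheduler's "last one out closes the door" counters, at least one per handed-off patch
    NBL_CONSTEXPR_STATIC_INLINE uint32_t SchedulerDWORDs = 1;
};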

Then your __call needs to be templated on, and take as arguments:

  1. Input/Output Accessor (Loadable and Storable Mip Mapped Image, but also a Global/Device Scope memory barrier method)
  2. Global Scratch Accessor (has to have atomicAdd supporting Acquire/Release semantic and scope flags - can be template args instead of regular args, and a set<type_of_your_texel> method)
  3. Workgroup Scratch for the workgroup2::reduce
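Shape-wise the entry point could then look roughly like this (again just a sketch, the accessor type names and methods are assumptions, not existing API):

// sketch of the entry point's shape only
template<typename Config>
struct SPD
{
    template<
        typename InOutImageAccessor,      // loadable+storable mipmapped image, also exposes a device-scope memory barrier
        typename GlobalScratchAccessor,   // atomicAdd with acquire/release semantics & scope, plus a set<storage_texel_t> method
        typename WorkgroupScratchAccessor // shared memory backing the workgroup2::reduce
    >
    static void __call(
        NBL_REF_ARG(InOutImageAccessor) inOutImage,
        NBL_REF_ARG(GlobalScratchAccessor) globalScratch,
        NBL_REF_ARG(WorkgroupScratchAccessor) workgroupScratch)
    {
        // one __round(...) per round of workgroup reduction, see the P.S. further down
    }
};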

In the actual usage of the algo you can assume that the user will do the first mip level reduction themselves, because of cheap tricks like textureGather plus applying the binary operation manually, or tapping in-between the 2x2 pixels with a bilinear or Min/Max sampler.

This first user-space mipmapping step is not taken into account by the SPD algorithm, so if you have a 2048 input, you only launch SPD with a tile input of 1024.
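For the averaging case that one-off user-space step can be as trivial as the following sketch (the bindings and dispatch layout here are made up for illustration):

// hypothetical user-side first downsample via a bilinear tap in-between the 2x2 pixels;
// for Min/Max binops swap the sampler for a Min/Max reduction sampler, or use textureGather + the binop
Texture2D<float32_t4> inImage;
SamplerState bilinearSampler;
RWTexture2D<float32_t4> outMip0;

[numthreads(16, 16, 1)]
void firstMipReduction(uint32_t3 dtid : SV_DispatchThreadID)
{
    uint32_t2 outExtent;
    outMip0.GetDimensions(outExtent.x, outExtent.y);
    if (any(dtid.xy >= outExtent))
        return;
    // the center of output texel (i,j) maps to the corner shared by the 2x2 input pixels,
    // so a single bilinear tap applies the averaging binop for free
    const float32_t2 uv = (float32_t2(dtid.xy) + 0.5f) / float32_t2(outExtent);
    outMip0[dtid.xy] = inImage.SampleLevel(bilinearSampler, uv, 0.0f);
}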

You need to document that the Global Scratch Accessor needs to have the first Config::SchedulerDWORDs cleared to 0s, because that's needed for "last one out closes the door" single dispatch - basically all workgroups increment that counter AFTER they're done writing the output.

For example, for a 32k x 16k downsample, after the one-off userspace downsample you need to perform SPD on 16k x 8k.

This means 14 output mip-maps.

Now suppose your workgroup can do 4096 inputs at once, and reduce a 64x64 patch. That's 6 output mip levels per round.

If you use Morton codes properly for your 1D Global Virtual Invocation Index, then your first 4096 WORKGROUPS will output one texel each at mip level 6 relative to the base (which is the 16k x 8k).
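A hedged sketch of what that flat-index to 2D decode looks like (the standard bit de-interleave, names made up):

// de-interleave a Morton-ordered flat workgroup index into a 2D tile coordinate, so that
// consecutive ranges of workgroup indices tile square patches (2x2, 4x4, ... 64x64)
uint32_t compactEvenBits(uint32_t v)
{
    v &= 0x55555555u;
    v = (v ^ (v >> 1)) & 0x33333333u;
    v = (v ^ (v >> 2)) & 0x0f0f0f0fu;
    v = (v ^ (v >> 4)) & 0x00ff00ffu;
    v = (v ^ (v >> 8)) & 0x0000ffffu;
    return v;
}

uint32_t2 mortonDecode2D(const uint32_t mortonIndex)
{
    return uint32_t2(compactEvenBits(mortonIndex), compactEvenBits(mortonIndex >> 1));
}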

To run a second round of SPD, you need a patch of 64x64 workgroups to store their values to mip level 6. Now you make the LAST WORKGROUP that stores its texel to mip level 6 perform the SPD on that 64x64 patch!

How do you do this?

With an Atomic + Barrier! Everyone stores to mip level 6, issues a global memory barrier (not execution barrier) on the Input/Output accessor (1), and only then increments the atomic assigned to the 64x64 workgroup output patch with Device Scope ACQUIRE+RELEASE semantics.

The workgroup for which this atomicAdd(1)/atomicIncr returns 4095 (SPIR-V atomic always returns pre-modification value) is the last one, and can now begin to read the 64x64 values other workgroups wrote.
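Put into code, the per-patch handoff could look roughly like this (hedged sketch; all accessor methods are the hypothetical ones from above, and the counter being incremented is one of the Config::SchedulerDWORDs that must start at 0):

// "last one out closes the door": returns true only for the final workgroup of the patch
template<typename InOutImageAccessor, typename GlobalScratchAccessor, typename SharedMemoryAccessor>
bool electLastWorkgroup(
    NBL_REF_ARG(InOutImageAccessor) inOutImage,
    NBL_REF_ARG(GlobalScratchAccessor) globalScratch,
    NBL_REF_ARG(SharedMemoryAccessor) sharedMem,
    const uint32_t localInvocationIndex,
    const uint32_t patchCounterIx, const uint32_t workgroupsPerPatch)
{
    // make this workgroup's mip writes visible device-wide BEFORE touching the counter
    inOutImage.memoryBarrier(); // device-scope memory barrier, not an execution barrier
    if (localInvocationIndex == 0)
    {
        // device scope, acquire+release semantics; SPIR-V atomics return the pre-modification value
        const uint32_t previous = globalScratch.atomicAdd(patchCounterIx, 1u);
        sharedMem.set(0, previous);
    }
    sharedMem.workgroupExecutionAndMemoryBarrier();
    // the workgroup that observed workgroupsPerPatch-1 is the last one and runs the next round on the patch
    return sharedMem.get(0) == workgroupsPerPatch - 1u;
}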

P.S. This is why I'd make a __round(MortonCodeInMip, GlobalSchedulerOffset) method to build the __call out of.

P.P.S. I can see how "adjusting" the SPD size per round could be more efficient, because in the example I gave after the first round, the relative mip level 6 (real mip level 7 of the 32k x 16k) has 256x128 resolution and if done with a 64x64 round, will produce 4x2 which will severely underutilize the last workgroup in round 3 which forms the critical path. So just like the workgroup2 scans and reductions, while it makes sense to go as aggressive as possible on the first round of a 3+ round algorithm, when you only have 2 rounds remaining it pays off to split the workload more equally, e.g. in the example given, use 16x16 on rounds 2 and 3 if you have a workgroup size of 256, or 32x32 if you have a workgroup size of 512 or more.


static void __call()
{

}

};


}
}
}
}

#endif
2 changes: 2 additions & 0 deletions src/nbl/builtin/CMakeLists.txt
@@ -169,6 +169,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/raytracing.h
LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/subgroup_arithmetic.hlsl")
LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/subgroup_ballot.hlsl")
LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/subgroup_basic.hlsl")
LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/subgroup_quad.hlsl")

forgot to add files?

LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/subgroup_shuffle.hlsl")
LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/subgroup_vote.hlsl")
LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/glsl.std.450.hlsl")
@@ -187,6 +188,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/glsl_compat/core.hlsl")
LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/glsl_compat/subgroup_arithmetic.hlsl")
LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/glsl_compat/subgroup_ballot.hlsl")
LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/glsl_compat/subgroup_basic.hlsl")
LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/glsl_compat/subgroup_quad.hlsl")

forgot to add files?

LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/glsl_compat/subgroup_shuffle.hlsl")
#stdlib
LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/algorithm.hlsl")