-
Notifications
You must be signed in to change notification settings - Fork 67
Single pass downsampling #954
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: mortons
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,178 @@ | ||
| #include <nbl/builtin/hlsl/cpp_compat.hlsl> | ||
| #include <nbl/builtin/hlsl/concepts.hlsl> | ||
| #include <nbl/builtin/hlsl/glsl_compat/subgroup_quad.hlsl> | ||
|
|
||
| #ifndef _NBL_BUILTIN_HLSL_WORKGROUP_SPD_INCLUDED_ | ||
| #define _NBL_BUILTIN_HLSL_WORKGROUP_SPD_INCLUDED_ | ||
|
|
||
| // ------------------------------- COMMON ----------------------------------------- | ||
|
|
||
| namespace nbl | ||
| { | ||
| namespace hlsl | ||
| { | ||
| namespace workgroup | ||
| { | ||
| namespace spd | ||
| { | ||
| namespace impl | ||
| { | ||
| // Combines the values held by the invocations of a subgroup quad into a single value. | ||
| // Every invocation gathers its own `v` together with the values obtained from the horizontal, | ||
| // vertical and diagonal quad swaps and passes all four to `reducer.reduce`. | ||
| // Returns whatever `reducer.reduce` produces for those four inputs. | ||
| template<typename Reducer> | ||
| float32_t4 subgroupQuadReduce(NBL_CONST_REF_ARG(Reducer) reducer, float32_t4 v) | ||
| { | ||
| const float32_t4 v0 = v; | ||
| const float32_t4 v1 = glsl::subgroupQuadSwapHorizontal(v); | ||
| const float32_t4 v2 = glsl::subgroupQuadSwapVertical(v); | ||
| const float32_t4 v3 = glsl::subgroupQuadSwapDiagonal(v); | ||
| return reducer.reduce(v0, v1, v2, v3); | ||
| } | ||
|
|
||
| // Writes mip 0 and, when requested, mip 1 for the region owned by this workgroup. | ||
| // | ||
| // Each invocation issues four `srcImage.reduce` fetches addressed relative to `workGroupID * 64` | ||
| // (offsets 0 / +32 in x and y) and stores every result to mip 0 of `dstImage` relative to | ||
| // `workGroupID * 32` (offsets 0 / +16). If `mip <= 1` the function returns at that point. | ||
| // Otherwise each of the four values is reduced across the subgroup quad, and the invocations with | ||
| // `localInvocationIndex % 4 == 0` store the results to mip 1 of `dstImage` and to `sharedMem`. | ||
| // | ||
| // NOTE(review): the +16 / +8 quadrant offsets presumably assume `coord` covers a 16x16 grid of | ||
| // invocations per workgroup, and `mip` looks like the number of mip levels to produce -- confirm. | ||
| template <typename Reducer, typename SrcImageAccessor, typename DstImageAccessor, typename SharedMemoryAccessor> | ||
| void downsampleMips_0_1(uint32_t2 coord, uint32_t2 workGroupID, uint32_t localInvocationIndex, uint32_t mip, uint32_t slice, NBL_CONST_REF_ARG(Reducer) reducer, NBL_CONST_REF_ARG(SrcImageAccessor) srcImage, NBL_REF_ARG(DstImageAccessor) dstImage, NBL_REF_ARG(SharedMemoryAccessor) sharedMem) | ||
| { | ||
| float32_t4 v[4]; | ||
|
|
||
| const uint32_t x = coord.x; | ||
| const uint32_t y = coord.y; | ||
| const int32_t2 srcBase = int32_t2(workGroupID.xy * 64); | ||
| const int32_t2 mip0Base = int32_t2(workGroupID.xy * 32); | ||
|
|
||
| // quadrant (0, 0) | ||
| int32_t2 tex = srcBase + int32_t2(x * 2, y * 2); | ||
| int32_t2 pix = mip0Base + int32_t2(x, y); | ||
| v[0] = srcImage.reduce(tex, slice); | ||
| dstImage.set(pix, v[0], 0, slice); | ||
|
|
||
| // quadrant (1, 0) | ||
| tex = srcBase + int32_t2(x * 2 + 32, y * 2); | ||
| pix = mip0Base + int32_t2(x + 16, y); | ||
| v[1] = srcImage.reduce(tex, slice); | ||
| dstImage.set(pix, v[1], 0, slice); | ||
|
|
||
| // quadrant (0, 1) -- fetched with `reduce` like the other quadrants, not `set` | ||
| tex = srcBase + int32_t2(x * 2, y * 2 + 32); | ||
| pix = mip0Base + int32_t2(x, y + 16); | ||
| v[2] = srcImage.reduce(tex, slice); | ||
| dstImage.set(pix, v[2], 0, slice); | ||
|
|
||
| // quadrant (1, 1) | ||
| tex = srcBase + int32_t2(x * 2 + 32, y * 2 + 32); | ||
| pix = mip0Base + int32_t2(x + 16, y + 16); | ||
| v[3] = srcImage.reduce(tex, slice); | ||
| dstImage.set(pix, v[3], 0, slice); | ||
|
|
||
| if (mip <= 1) | ||
| return; | ||
|
|
||
| v[0] = subgroupQuadReduce(reducer, v[0]); | ||
| v[1] = subgroupQuadReduce(reducer, v[1]); | ||
| v[2] = subgroupQuadReduce(reducer, v[2]); | ||
| v[3] = subgroupQuadReduce(reducer, v[3]); | ||
|
|
||
| // only one invocation out of every four consecutive indices stores the reduced values | ||
| if ((localInvocationIndex % 4) == 0) | ||
| { | ||
| const int32_t2 mip1Base = int32_t2(workGroupID.xy * 16); | ||
|
|
||
| dstImage.set(mip1Base + int32_t2(x / 2, y / 2), v[0], 1, slice); | ||
| sharedMem.set(int32_t2(x / 2, y / 2), v[0]); | ||
|
|
||
| dstImage.set(mip1Base + int32_t2(x / 2 + 8, y / 2), v[1], 1, slice); | ||
| sharedMem.set(int32_t2(x / 2 + 8, y / 2), v[1]); | ||
|
|
||
| dstImage.set(mip1Base + int32_t2(x / 2, y / 2 + 8), v[2], 1, slice); | ||
| sharedMem.set(int32_t2(x / 2, y / 2 + 8), v[2]); | ||
|
|
||
| dstImage.set(mip1Base + int32_t2(x / 2 + 8, y / 2 + 8), v[3], 1, slice); | ||
| sharedMem.set(int32_t2(x / 2 + 8, y / 2 + 8), v[3]); | ||
| } | ||
| } | ||
|
|
||
| // Produces one further mip level from the values currently held in `sharedMem`. | ||
| // Every invocation loads the value stored at `coord` and reduces it across its subgroup quad. | ||
| // The invocations with `localInvocationIndex % 4 == 0` then write the result to level `mip` of | ||
| // `dstImage` (relative to `workGroupID * 8`) and back into `sharedMem` at a staggered column. | ||
| // `srcImage` is not read by this step. | ||
| template <typename Reducer, typename SrcImageAccessor, typename DstImageAccessor, typename SharedMemoryAccessor> | ||
| void downsampleMip_2(uint32_t2 coord, uint32_t2 workGroupID, uint32_t localInvocationIndex, uint32_t mip, uint32_t slice, NBL_CONST_REF_ARG(Reducer) reducer, NBL_CONST_REF_ARG(SrcImageAccessor) srcImage, NBL_REF_ARG(DstImageAccessor) dstImage, NBL_REF_ARG(SharedMemoryAccessor) sharedMem) | ||
| { | ||
| float32_t4 v = sharedMem.get(coord); | ||
| v = subgroupQuadReduce(reducer, v); | ||
| if (localInvocationIndex % 4 == 0) | ||
| { | ||
| dstImage.set(int32_t2(workGroupID.xy * 8) + int32_t2(coord.x / 2, coord.y / 2), v, mip, slice); | ||
|
|
||
| // store to LDS, try to reduce bank conflicts | ||
| // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 | ||
| // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | ||
| // 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x | ||
| // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | ||
| // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 | ||
| // ... | ||
| // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 | ||
| sharedMem.set(int32_t2(coord.x + (coord.y / 2) % 2, coord.y), v); | ||
| } | ||
| } | ||
|
|
||
| // Produces one further mip level from the values currently held in `sharedMem`. | ||
| // Only invocations with `localInvocationIndex < 64` take part: each loads one value from | ||
| // `sharedMem`, reduces it across its subgroup quad, and the invocations with | ||
| // `localInvocationIndex % 4 == 0` write the result to level `mip` of `dstImage` (relative to | ||
| // `workGroupID * 4`) and back into `sharedMem`. `srcImage` is not read by this step. | ||
| template <typename Reducer, typename SrcImageAccessor, typename DstImageAccessor, typename SharedMemoryAccessor> | ||
| void downsampleMip_3(uint32_t2 coord, uint32_t2 workGroupID, uint32_t localInvocationIndex, uint32_t mip, uint32_t slice, NBL_CONST_REF_ARG(Reducer) reducer, NBL_CONST_REF_ARG(SrcImageAccessor) srcImage, NBL_REF_ARG(DstImageAccessor) dstImage, NBL_REF_ARG(SharedMemoryAccessor) sharedMem) | ||
| { | ||
| if (localInvocationIndex < 64) | ||
| { | ||
| // local coordinate of this invocation, as in the other downsample steps | ||
| const uint32_t x = coord.x; | ||
| const uint32_t y = coord.y; | ||
|
|
||
| float32_t4 v = sharedMem.get(int32_t2(x * 2 + y % 2, y * 2)); | ||
| v = subgroupQuadReduce(reducer, v); | ||
| if (localInvocationIndex % 4 == 0) | ||
| { | ||
| dstImage.set(int32_t2(workGroupID.xy * 4) + int32_t2(x / 2, y / 2), v, mip, slice); | ||
| // store to LDS | ||
| // x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 | ||
| // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | ||
| // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | ||
| // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | ||
| // 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 | ||
| // ... | ||
| // 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 | ||
| // ... | ||
| // 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x | ||
| // ... | ||
| sharedMem.set(int32_t2(x * 2 + y / 2, y * 2), v); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // Produces one further mip level from the values currently held in `sharedMem`. | ||
| // Only invocations with `localInvocationIndex < 16` take part: each loads one value from | ||
| // `sharedMem`, reduces it across its subgroup quad, and the invocations with | ||
| // `localInvocationIndex % 4 == 0` write the result to level `mip` of `dstImage` (relative to | ||
| // `workGroupID * 2`) and into row 0 of `sharedMem`. `srcImage` is not read by this step. | ||
| template <typename Reducer, typename SrcImageAccessor, typename DstImageAccessor, typename SharedMemoryAccessor> | ||
| void downsampleMip_4(uint32_t2 coord, uint32_t2 workGroupID, uint32_t localInvocationIndex, uint32_t mip, uint32_t slice, NBL_CONST_REF_ARG(Reducer) reducer, NBL_CONST_REF_ARG(SrcImageAccessor) srcImage, NBL_REF_ARG(DstImageAccessor) dstImage, NBL_REF_ARG(SharedMemoryAccessor) sharedMem) | ||
| { | ||
| if (localInvocationIndex < 16) | ||
| { | ||
| // local coordinate of this invocation, as in the other downsample steps | ||
| const uint32_t x = coord.x; | ||
| const uint32_t y = coord.y; | ||
|
|
||
| float32_t4 v = sharedMem.get(int32_t2(x * 4 + y, y * 4)); | ||
| v = subgroupQuadReduce(reducer, v); | ||
| if (localInvocationIndex % 4 == 0) | ||
| { | ||
| // destination texel = workgroup base + local offset | ||
| dstImage.set(int32_t2(workGroupID.xy * 2) + int32_t2(x / 2, y / 2), v, mip, slice); | ||
| // store to LDS | ||
| // x x x x 0 ... | ||
| // 0 ... | ||
| sharedMem.set(int32_t2(x / 2 + y, 0), v); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // Produces the last mip level of this sequence from row 0 of `sharedMem`. | ||
| // Only invocations with `localInvocationIndex < 4` take part: each loads the value stored at | ||
| // (`localInvocationIndex`, 0), the four values are reduced across the subgroup quad, and the | ||
| // invocation with `localInvocationIndex % 4 == 0` writes the result to level `mip` of `dstImage` | ||
| // at `workGroupID`. `coord` and `srcImage` are not read by this step. | ||
| template <typename Reducer, typename SrcImageAccessor, typename DstImageAccessor, typename SharedMemoryAccessor> | ||
| void downsampleMip_5(uint32_t2 coord, uint32_t2 workGroupID, uint32_t localInvocationIndex, uint32_t mip, uint32_t slice, NBL_CONST_REF_ARG(Reducer) reducer, NBL_CONST_REF_ARG(SrcImageAccessor) srcImage, NBL_REF_ARG(DstImageAccessor) dstImage, NBL_REF_ARG(SharedMemoryAccessor) sharedMem) | ||
| { | ||
| if (localInvocationIndex < 4) | ||
| { | ||
| float32_t4 v = sharedMem.get(int32_t2(localInvocationIndex,0)); | ||
| v = subgroupQuadReduce(reducer, v); | ||
| // quad index 0 stores result | ||
| if (localInvocationIndex % 4 == 0) | ||
| { | ||
| dstImage.set(int32_t2(workGroupID.xy), v, mip, slice); | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| struct SPD | ||
| { | ||
|
Comment on lines
+163
to
+165
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you need to template your struct on a Config struct like workgroup scans or BxDFs have but you'll have:
The last two you may store indirectly because a Workgroup2 Reduction Config will be needed, so subgroup size and workgroup size come into play there. Although it might end up being that each round needs its own Then your
In the actual usage of the algo you can assume that user will do first mip level reduction by themselves because of cheap tricks like This first user-space mipmapping step is not taken into account by the SPD algorithm, so if you do 2048 input, then you only launch SPD with a tile input of 1024. You need to document that the Global Scratch Accessor needs to have the first For example for a 32k x 16k downsample, after the one-off userspace downsample you need to perform SPD on 16k x 8k This means 14 output mip-maps. Now suppose your workgroup can do 4096 inputs at once, and reduce a 64x64 patch. That's 6 output mip levels per round. If you use Morton codes properly for your 1D Global Virtual Invocation Index, then your first 4096 WORKGROUPS will output one texel each at mip level 6 relative to the base (which is the 16k x 8k). To run a second round of SPD, you need a patch of 64x64 workgroups to store their values to the mip level 6. Now you make the LAST WORKGROUP which stores its texel to mip level 6, perform the SPD on that 64x64 patch! How do you do this? With an Atomic + Barrier! Everyone stores to mip level 6, issues a global memory barrier (not execution barrier) on the Input/Output accessor (1), and only then increments the atomic assigned to the 64x64 workgroup output patch with Device Scope ACQUIRE+RELEASE semantics. The workgroup for which this atomicAdd(1)/atomicIncr returns 4095 (SPIR-V atomic always returns pre-modification value) is the last one, and can now begin to read the 64x64 values other workgroups wrote. P.S. This is why I'd make a P.P.S. I can see how "adjusting" the SPD size per round could be more efficient, because in the example I gave after the first round, the relative mip level 6 (real mip level 7 of the 32k x 16k) has 256x128 resolution and if done with a 64x64 round, will produce 4x2 which will severely underutilize the last workgroup in round 3 which forms the critical path. So just like the |
||
|
|
||
| // Static entry point of SPD. Its body is empty in this revision, so calling it has no effect. | ||
| static void __call() | ||
| { | ||
|
|
||
| } | ||
|
|
||
| }; | ||
|
|
||
|
|
||
| } | ||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -169,6 +169,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/raytracing.h | |
| LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/subgroup_arithmetic.hlsl") | ||
| LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/subgroup_ballot.hlsl") | ||
| LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/subgroup_basic.hlsl") | ||
| LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/subgroup_quad.hlsl") | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. forgot to add files? |
||
| LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/subgroup_shuffle.hlsl") | ||
| LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/subgroup_vote.hlsl") | ||
| LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/glsl.std.450.hlsl") | ||
|
|
@@ -187,6 +188,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/glsl_compat/core.hlsl") | |
| LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/glsl_compat/subgroup_arithmetic.hlsl") | ||
| LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/glsl_compat/subgroup_ballot.hlsl") | ||
| LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/glsl_compat/subgroup_basic.hlsl") | ||
| LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/glsl_compat/subgroup_quad.hlsl") | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. forgot to add files? |
||
| LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/glsl_compat/subgroup_shuffle.hlsl") | ||
| #stdlib | ||
| LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/algorithm.hlsl") | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
what did you do, this is needed!