-
Notifications
You must be signed in to change notification settings - Fork 14
bitonic sort sample #209
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
bitonic sort sample #209
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
include(common RESULT_VARIABLE RES) | ||
if(NOT RES) | ||
message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") | ||
endif() | ||
|
||
nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") | ||
|
||
if(NBL_EMBED_BUILTIN_RESOURCES) | ||
set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) | ||
set(RESOURCE_DIR "app_resources") | ||
|
||
get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) | ||
get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) | ||
get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) | ||
|
||
file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") | ||
foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) | ||
LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") | ||
endforeach() | ||
|
||
ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") | ||
|
||
LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) | ||
endif() |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
#include "nbl/builtin/hlsl/bda/bda_accessor.hlsl" | ||
|
||
struct BitonicPushData | ||
{ | ||
uint64_t inputKeyAddress; | ||
uint64_t inputValueAddress; | ||
uint64_t outputKeyAddress; | ||
uint64_t outputValueAddress; | ||
uint32_t dataElementCount; | ||
}; | ||
|
||
using namespace nbl::hlsl; | ||
|
||
[[vk::push_constant]] BitonicPushData pushData; | ||
|
||
using DataPtr = bda::__ptr<uint32_t>; | ||
using DataAccessor = BdaAccessor<uint32_t>; | ||
|
||
groupshared uint32_t sharedKeys[ElementCount]; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good SoA instead of AoS, no bank conflicts There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You shared memory size is fixed, which means you shouldn't have a push constant Unless it's purpose is to allow you to sort less elements than |
||
groupshared uint32_t sharedValues[ElementCount]; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it's cheaper to move the Key & Index instead of the Value, because the Value might be quite fat (also for a workgroup scan you need only There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also because the value never has to get accessed until the end. So you can initialize value_t tmp;
inputValues.get(sharedValueIndex[i],tmp);
inputValues.set(i,tmp); There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hmm actually you wouldn't initialize |
||
|
||
[numthreads(WorkgroupSize, 1, 1)] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Some static asserts about workgroup size dividing the element mCount evenly and the element count being power of two would be nice |
||
[shader("compute")] | ||
void main(uint32_t3 dispatchId : SV_DispatchThreadID, uint32_t3 localId : SV_GroupThreadID) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use |
||
{ | ||
const uint32_t threadId = localId.x; | ||
const uint32_t dataSize = pushData.dataElementCount; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd make that constant and part of the Config, because the workgroup size needs to be hardcoded and as I've mentioned optimal amount of shared memory per invocation to use is <=8 dwords (32 bytes) |
||
|
||
DataAccessor inputKeys = DataAccessor::create(DataPtr::create(pushData.inputKeyAddress)); | ||
DataAccessor inputValues = DataAccessor::create(DataPtr::create(pushData.inputValueAddress)); | ||
|
||
for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize) | ||
{ | ||
inputKeys.get(i, sharedKeys[i]); | ||
inputValues.get(i, sharedValues[i]); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Very nice but why isn't this wrapped up into a statically polymorphic struct like FFT and workgroup prefix sum? So we can actually reuse the code ? You know with shared memory accessors etc. |
||
} | ||
|
||
// Synchronize all threads after loading | ||
GroupMemoryBarrierWithGroupSync(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use glsl or Spir-V barrier, we dislike hlsl intrinsics |
||
|
||
|
||
for (uint32_t stage = 0; stage < Log2ElementCount; stage++) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Where does Log2ElementCount come from? I have a feeling that you shouldn't have a data size push constant then. |
||
{ | ||
for (uint32_t pass = 0; pass <= stage; pass++) | ||
{ | ||
const uint32_t compareDistance = 1 << (stage - pass); | ||
|
||
for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize) | ||
{ | ||
Comment on lines
+48
to
+49
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. cool use of virtual invocations There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. one slight nitpick, this won't to get a loop that unrolls you need to do for (uint32_t baseIndex=0; baseIndex<dataSize; baseIndex+=WorkgroupSize)
{
const uint32_t i = threadId+baseIndex; compilers are dumb |
||
const uint32_t partnerId = i ^ compareDistance; | ||
|
||
if (partnerId >= dataSize) | ||
continue; | ||
Comment on lines
+52
to
+53
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this causes a 50% efficiency drop when and fit up your index calculations by inserting a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. btw you might have some issues with making this work for a single subgroup, @Fletterio may have some ideas how to help you there (see his FFT blogpost, about trading halves of FFT with subgroup shuffles while only keeping one invocation active). The main problem is that with a 32-sized subgroup you need to treat 64 items. Ideally you'd want one invocation to get 2 keys and value indices, but keep them rearranged in such a way that always 32 invocations are active. Meaning that with step sizes of 1,2,4,8,... subgroupSize/2 you need to cleverly "reuse" the "partner ID" invocation to do the work of the "would be" second subgroup. |
||
|
||
const uint32_t waveSize = WaveGetLaneCount(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. that's a constant which should get hoisted out, btw if you make it a template uint supplied from a Config, the compiler will unroll certain loops (instead of counting on the SPIR-V optimizer in the driver to do it) that's why we do it in FFT code |
||
const uint32_t myWaveId = i / waveSize; | ||
const uint32_t partnerWaveId = partnerId / waveSize; | ||
const bool sameWave = (myWaveId == partnerWaveId); | ||
Comment on lines
+56
to
+58
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. integer division is expensive, it would have been better to do just |
||
|
||
uint32_t myKey, myValue, partnerKey, partnerValue; | ||
[branch] | ||
if (sameWave && compareDistance < waveSize) | ||
{ | ||
// WAVE INTRINSIC | ||
myKey = sharedKeys[i]; | ||
myValue = sharedValues[i]; | ||
Comment on lines
+64
to
+66
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. there's no need to read-write to shared during subgroup stages, you could keep the value in the register always |
||
|
||
const uint32_t partnerLane = partnerId % waveSize; | ||
partnerKey = WaveReadLaneAt(myKey, partnerLane); | ||
partnerValue = WaveReadLaneAt(myValue, partnerLane); | ||
Comment on lines
+68
to
+70
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. use our SPIR-V intrinsics from You may spot that there's a ShuffleXor which lets you simply read the lane at |
||
} | ||
else | ||
{ | ||
// SHARED MEM | ||
myKey = sharedKeys[i]; | ||
myValue = sharedValues[i]; | ||
partnerKey = sharedKeys[partnerId]; | ||
partnerValue = sharedValues[partnerId]; | ||
} | ||
Comment on lines
+61
to
+79
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the wholly subgroup stages (stage<=subgroupSizeLog2) need separation from the stages where the compareDistance can be big. Thats what I want wrapped up in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. btw the workgroup stages would still use subgroup to finish off each stage, because |
||
|
||
const uint32_t sequenceSize = 1 << (stage + 1); | ||
const uint32_t sequenceIndex = i / sequenceSize; | ||
const bool sequenceAscending = (sequenceIndex % 2) == 0; | ||
const bool ascending = true; | ||
const bool finalDirection = sequenceAscending == ascending; | ||
|
||
const bool swap = (myKey > partnerKey) == finalDirection; | ||
Comment on lines
+81
to
+87
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this can be simply expressed as |
||
|
||
// WORKGROUP COORDINATION: Only lower-indexed element writes both | ||
if (i < partnerId && swap) | ||
{ | ||
sharedKeys[i] = partnerKey; | ||
sharedKeys[partnerId] = myKey; | ||
sharedValues[i] = partnerValue; | ||
sharedValues[partnerId] = myValue; | ||
} | ||
Comment on lines
+89
to
+96
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the key is not using 2 invocations for 2 inputs, but 1 for 2, throughout the function |
||
} | ||
|
||
GroupMemoryBarrierWithGroupSync(); | ||
} | ||
} | ||
|
||
|
||
DataAccessor outputKeys = DataAccessor::create(DataPtr::create(pushData.outputKeyAddress)); | ||
DataAccessor outputValues = DataAccessor::create(DataPtr::create(pushData.outputValueAddress)); | ||
Comment on lines
+29
to
+105
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Glad it works, but now the real work begins. This needs to be split into In the Config we need a SubgroupSize, KeyAccessor, ValueAccessor, Key Comparator (for a default see our Only the workgroup config needs a ScratchKeyAccessor (what will mediate your smem accesses), ScratchValueAccessor. The reason is that this lets us choose to sort values from/to BDA, SSBO, Smem, Images, and use various memory as scratch (different offsets of shared, reuse shared mem allocations). And most importantly, this lets us choose whether to load the Keys and move them around in scratch or move their indices and load them from main memory every time (useful if the Key is super fat) without touching the Bitonic sort code, the user would just provide an appropriate scratch-key accessor and comparator which both translate the argument to/from index. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Items Per Invocation should also come from Config. Btw with a uint16_t value index and 1 invocation always processing a pair of items, this means a baseline shared memory consumption of 8 bytes per invocation if the key is also either 2 bytes or an index of a key. Realistically max virtual thread repetition is 4, in your case with an actual 32bit key getting used directly, its 3 but with optimal workgroup size of 512 on NV to keep the multiple PoT it becomes 2. As you can see the point at which one should consider indexing keys instead of using them out right is 14 byte keys. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 
btw when doing subgroup bitonics its also possible to have multiple items per invocation (more than 2), so its useful to bitonic sort in registers first (see subgroup prefix sum and FFT doing stuff locally within an invocation first). Btw barriers are expensive, so make sure that the repeated items sit at the most minor level of indexing. Key keys[2<<ElementsPerInvocationLog2]; this way the last |
||
|
||
for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize) | ||
{ | ||
outputKeys.set(i, sharedKeys[i]); | ||
outputValues.set(i, sharedValues[i]); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. | ||
// This file is part of the "Nabla Engine". | ||
// For conditions of distribution and use, see copyright notice in nabla.h | ||
#ifndef _BITONIC_SORT_COMMON_INCLUDED_ | ||
#define _BITONIC_SORT_COMMON_INCLUDED_ | ||
|
||
struct BitonicPushData | ||
{ | ||
|
||
uint64_t inputKeyAddress; | ||
uint64_t inputValueAddress; | ||
uint64_t outputKeyAddress; | ||
uint64_t outputValueAddress; | ||
uint32_t dataElementCount; | ||
}; | ||
|
||
#endif |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
{ | ||
"enableParallelBuild": true, | ||
"threadsPerBuildProcess" : 2, | ||
"isExecuted": false, | ||
"scriptPath": "", | ||
"cmake": { | ||
"configurations": [ "Release", "Debug", "RelWithDebInfo" ], | ||
"buildModes": [], | ||
"requiredOptions": [] | ||
}, | ||
"profiles": [ | ||
{ | ||
"backend": "vulkan", // should be none | ||
"platform": "windows", | ||
"buildModes": [], | ||
"runConfiguration": "Release", // we also need to run in Debug nad RWDI because foundational example | ||
"gpuArchitectures": [] | ||
} | ||
], | ||
"dependencies": [], | ||
"data": [ | ||
{ | ||
"dependencies": [], | ||
"command": [""], | ||
"outputs": [] | ||
} | ||
] | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why are you not including
common.hlsl
and having duplicate code?