bitonic sort sample #209
base: master

Conversation
Signed-off-by: CrabeExtra <[email protected]>
@Fletterio wanna try your hand at a review?
get_target_property(TYPE ${T} TYPE)
if(NOT ${TYPE} MATCHES INTERFACE)
    target_link_libraries(${T} PUBLIC ${NBL_EXAMPLES_API_TARGET})
    target_include_directories(${T} PUBLIC $<TARGET_PROPERTY:${NBL_EXAMPLES_API_TARGET},INCLUDE_DIRECTORIES>)
    set_target_properties(${T} PROPERTIES DISABLE_PRECOMPILE_HEADERS OFF)
    target_precompile_headers(${T} REUSE_FROM "${NBL_EXAMPLES_API_TARGET}")

    if(NBL_EMBED_BUILTIN_RESOURCES)
        LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsSource)
        LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsInclude)
        LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsBuild)
    endif()
endif()
target_link_libraries(${T} PUBLIC ${NBL_EXAMPLES_API_TARGET})
target_include_directories(${T} PUBLIC $<TARGET_PROPERTY:${NBL_EXAMPLES_API_TARGET},INCLUDE_DIRECTORIES>)
target_precompile_headers(${T} REUSE_FROM "${NBL_EXAMPLES_API_TARGET}")
this needs to match master
no difference
struct BitonicPushData
{
    uint64_t inputKeyAddress;
    uint64_t inputValueAddress;
    uint64_t outputKeyAddress;
    uint64_t outputValueAddress;
    uint32_t dataElementCount;
};
why are you not including common.hlsl
and having duplicate code?
using DataPtr = bda::__ptr<uint32_t>;
using DataAccessor = BdaAccessor<uint32_t>;

groupshared uint32_t sharedKeys[ElementCount];
Good SoA instead of AoS, no bank conflicts
Your shared memory size is fixed, which means you shouldn't have a dataElementCount push constant.
Unless its purpose is to allow you to sort fewer elements than ElementCount,
but then you'd still need to initialize all of the sharedKeys
with the highest possible value so you don't end up with garbage getting into the sort.
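A minimal sketch of that padding during the load loop, reusing the names from this diff (threadId, inputKeys, pushData) and assuming 32-bit keys sorted ascending:
for (uint32_t i = threadId; i < ElementCount; i += WorkgroupSize)
{
    if (i < pushData.dataElementCount)
        inputKeys.get(i, sharedKeys[i]);
    else
        sharedKeys[i] = 0xFFFFFFFFu; // sentinel: pads the tail so garbage can't leak into the sort
}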
[numthreads(WorkgroupSize, 1, 1)]
[shader("compute")]
void main(uint32_t3 dispatchId : SV_DispatchThreadID, uint32_t3 localId : SV_GroupThreadID)
Use nbl::hlsl::glsl:: functions instead of the weird HLSL : semantics, see the other examples that use gl_LocalInvocationIndex.
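For example (assuming the glsl compatibility helpers are pulled in through the usual Nabla includes; the exact header path isn't shown in this diff):
const uint32_t threadId = nbl::hlsl::glsl::gl_LocalInvocationIndex(); // equivalent to localId.x here since the workgroup is 1D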
for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize)
{
    inputKeys.get(i, sharedKeys[i]);
    inputValues.get(i, sharedValues[i]);
Very nice, but why isn't this wrapped up into a statically polymorphic struct like the FFT and workgroup prefix sum, so we can actually reuse the code? You know, with shared memory accessors etc.
}
// Synchronize all threads after loading
GroupMemoryBarrierWithGroupSync();
Use the glsl or SPIR-V barrier, we dislike HLSL intrinsics
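For reference, a sketch with the glsl compat helper (treating the exact namespace path as an assumption based on the other Nabla examples; it is a workgroup execution + memory barrier just like the HLSL intrinsic):
nbl::hlsl::glsl::barrier(); // replaces GroupMemoryBarrierWithGroupSync()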
GroupMemoryBarrierWithGroupSync();
for (uint32_t stage = 0; stage < Log2ElementCount; stage++) |
Where does Log2ElementCount come from? I have a feeling that you shouldn't have a data size push constant then.
groupshared uint32_t sharedKeys[ElementCount];
groupshared uint32_t sharedValues[ElementCount];

[numthreads(WorkgroupSize, 1, 1)]
Some static asserts about the workgroup size dividing the element count evenly and the element count being a power of two would be nice
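A sketch of those asserts (assuming _Static_assert is available in the HLSL dialect used here, as it is in recent DXC):
_Static_assert((ElementCount & (ElementCount - 1u)) == 0u, "ElementCount must be a power of two");
_Static_assert((ElementCount % WorkgroupSize) == 0u, "WorkgroupSize must evenly divide ElementCount");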
auto limits = m_physicalDevice->getLimits();
const uint32_t max_shared_memory_size = limits.maxComputeSharedMemorySize;
const uint32_t max_workgroup_size = limits.maxComputeWorkGroupInvocations; // Get actual GPU limit
there's a different limit, maxOptimallyResidentWorkgroupInvocations,
that tells you the optimal max size.
Reason: most Nvidia GPUs in the wild have space for 1536 invocations but report 1024 as the max workgroup size, so you get 33% under-utilization. If you use 512-sized workgroups, then 3 fit on the SM.
auto limits = m_physicalDevice->getLimits();
const uint32_t max_shared_memory_size = limits.maxComputeSharedMemorySize;
const uint32_t max_workgroup_size = limits.maxComputeWorkGroupInvocations; // Get actual GPU limit
const uint32_t bytes_per_elements = sizeof(uint32_t) * 2; // 2 uint32_t per element (key and value)
you should get this with sizeof()
of the structs in app_resources/common.hlsl
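Something along these lines (key_t and value_t are hypothetical typedef names standing in for whatever app_resources/common.hlsl actually declares):
const uint32_t bytes_per_element = sizeof(key_t) + sizeof(value_t); // hypothetical typedefs shared between C++ and HLSL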
const uint32_t max_shared_memory_size = limits.maxComputeSharedMemorySize;
const uint32_t max_workgroup_size = limits.maxComputeWorkGroupInvocations; // Get actual GPU limit
const uint32_t bytes_per_elements = sizeof(uint32_t) * 2; // 2 uint32_t per element (key and value)
const uint32_t max_element_in_shared_memory = max_shared_memory_size / bytes_per_elements;
if you use all the shared memory, you won't have room for as many workgroups as possible.
Btw a good undocumented rule of thumb for occupancy is to make each invocation use no more than 32 bytes of shared memory.
Also, all these calculations can be done with templates in HLSL/C++ (see the workgroup FFT and prefix sum code).
you can take maxOptimallyResidentWorkgroupInvocations * 32 / (sizeof(Key)+sizeof(uint16_t))
(don't use 32 as a literal, name it something) and round it down to PoT
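A sketch of that calculation on the C++ side (the constant name is mine; std::bit_floor from <bit> does the round-down-to-PoT):
constexpr uint32_t OccupancySMemBytesPerInvocation = 32u; // the rule of thumb from above, not a magic literal
const uint32_t budget = limits.maxOptimallyResidentWorkgroupInvocations * OccupancySMemBytesPerInvocation / (sizeof(uint32_t)/*key*/ + sizeof(uint16_t)/*value index*/);
const uint32_t element_count = std::bit_floor(budget); // round down to a power of two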
const uint32_t log2_element_count = static_cast<uint32_t>(log2(element_count));

m_logger->log("GPU Limits:", ILogger::ELL_INFO);
m_logger->log(" Max Workgroup Size: " + std::to_string(max_workgroup_size), ILogger::ELL_INFO);
m_logger->log(" Max Shared Memory: " + std::to_string(max_shared_memory_size) + " bytes", ILogger::ELL_INFO);
m_logger->log(" Max elements in shared memory: " + std::to_string(max_element_in_shared_memory), ILogger::ELL_INFO);
m_logger->log(" Using element count (power of 2): " + std::to_string(element_count), ILogger::ELL_INFO);
nice logging
auto overrideSource = CHLSLCompiler::createOverridenCopy(
    source.get(), "#define ElementCount %d\n#define Log2ElementCount %d\n#define WorkgroupSize %d\n",
    element_count, log2_element_count, max_workgroup_size
);
the device limits are available in device_capability_traits.hlsl in the jit namespace, see how the prefix sum example uses them.
The idea is that your HLSL code would have a templated version (which computes all the constants via templates) and an NBL_CONSTEXPR struct version you use on the C++ side, so that you can keep everything in sync with no need to make overridden shaders by slapping #define into them.
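Very roughly, the kind of shared constant block meant here (names and values are illustrative, and the NBL_CONSTEXPR_STATIC_INLINE macro is assumed from Nabla's cpp_compat helpers):
struct BitonicConstants
{
    NBL_CONSTEXPR_STATIC_INLINE uint32_t ElementCountLog2 = 10u;          // placeholder value
    NBL_CONSTEXPR_STATIC_INLINE uint32_t ElementCount = 0x1u << ElementCountLog2;
    NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = 512u;            // placeholder value
};
Both sides would include the same header, so the shader never needs an overridden copy.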
if (!bitonicSortShader)
    return logFail("Failed to compile bitonic sort shader!");

const nbl::asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,.offset = 0,.size = sizeof(BitonicPushData) };

smart_refctd_ptr<IGPUPipelineLayout> layout;
smart_refctd_ptr<IGPUComputePipeline> bitonicSortPipeline;
{
    layout = m_device->createPipelineLayout({ &pcRange,1 });
    IGPUComputePipeline::SCreationParams params = {};
    params.layout = layout.get();
    params.shader.shader = bitonicSortShader.get();
    params.shader.entryPoint = "main";
    params.shader.entries = nullptr;
    if (!m_device->createComputePipelines(nullptr, { &params,1 }, &bitonicSortPipeline))
        return logFail("Failed to create compute pipeline!\n");
}

nbl::video::IDeviceMemoryAllocator::SAllocation allocation[4] = {};
smart_refctd_ptr<IGPUBuffer> buffers[4];

auto build_buffer = [this](
    smart_refctd_ptr<ILogicalDevice> m_device,
    nbl::video::IDeviceMemoryAllocator::SAllocation* allocation,
    smart_refctd_ptr<IGPUBuffer>& buffer,
    size_t buffer_size,
    const char* label
) -> void {
    IGPUBuffer::SCreationParams params;
    params.size = buffer_size;
    params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
    buffer = m_device->createBuffer(std::move(params));
    if (!buffer)
        logFail("Failed to create GPU buffer of size %d!\n", buffer_size);

    buffer->setObjectDebugName(label);

    auto reqs = buffer->getMemoryReqs();
    reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits();

    *allocation = m_device->allocate(reqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
nice and clean
allocation[0].memory->map({0ull,allocation[0].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
allocation[1].memory->map({0ull,allocation[1].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
allocation[2].memory->map({0ull,allocation[2].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
allocation[3].memory->map({0ull,allocation[3].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ)
Vulkan doesn't care about the EMCAF flags, but they're kinda wrong here; I'm pretty sure you WRITE into the first two with the host, while the others you READ at the end
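A sketch of the intended split (assuming IDeviceMemoryAllocation also has an EMCAF_WRITE flag alongside EMCAF_READ; treat that enum name as an assumption):
allocation[0].memory->map({0ull,allocation[0].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_WRITE), // host writes the input keys
allocation[1].memory->map({0ull,allocation[1].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_WRITE), // host writes the input values
allocation[2].memory->map({0ull,allocation[2].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),  // host reads the sorted keys at the end
allocation[3].memory->map({0ull,allocation[3].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ)   // host reads the sorted values at the end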
if (!allocation->isValid())
    logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n");

assert(allocation->memory.get() == buffer->getBoundMemory().memory);
};

build_buffer(m_device, allocation, buffers[0], sizeof(uint32_t) * element_count, "Input Key Buffer");
build_buffer(m_device, allocation + 1, buffers[1], sizeof(uint32_t) * element_count, "Input Value Buffer");
build_buffer(m_device, allocation + 2, buffers[2], sizeof(uint32_t) * element_count, "Output Key Buffer");
build_buffer(m_device, allocation + 3, buffers[3], sizeof(uint32_t) * element_count, "Output Value Buffer");
one warning about calling logFail in a lambda: you won't actually quit the app cleanly, you'll just keep going through the app
auto bufferData = new uint32_t * [2];
for (int i = 0; i < 2; ++i) {
    bufferData[i] = new uint32_t[element_count];
use a std::array<std::unique_ptr<uint32_t[]>,2>
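A minimal sketch of that (the [] specialization matters so the array form of delete is used):
std::array<std::unique_ptr<uint32_t[]>, 2> bufferData;
for (auto& data : bufferData)
    data = std::make_unique<uint32_t[]>(element_count);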
using DataAccessor = BdaAccessor<uint32_t>;

groupshared uint32_t sharedKeys[ElementCount];
groupshared uint32_t sharedValues[ElementCount];
it's cheaper to move the Key & an index into the Values around instead of the Value itself, because the Value might be quite fat (also at workgroup scope the index only needs a uint16_t)
also because the value never has to get accessed until the end.
So you can initialize sharedValueIndex with an IOTA sequence (0,1,2,...) and not perform any value reads until the end, where you do
value_t tmp;
inputValues.get(sharedValueIndex[i],tmp);
outputValues.set(i,tmp);
hmm, actually you wouldn't initialize sharedValueIndex with IOTA but with the result of the subgroup::BitonicSort
DataAccessor inputKeys = DataAccessor::create(DataPtr::create(pushData.inputKeyAddress));
DataAccessor inputValues = DataAccessor::create(DataPtr::create(pushData.inputValueAddress));

for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize)
{
    inputKeys.get(i, sharedKeys[i]);
    inputValues.get(i, sharedValues[i]);
}

// Synchronize all threads after loading
GroupMemoryBarrierWithGroupSync();

for (uint32_t stage = 0; stage < Log2ElementCount; stage++)
{
    for (uint32_t pass = 0; pass <= stage; pass++)
    {
        const uint32_t compareDistance = 1 << (stage - pass);

        for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize)
        {
            const uint32_t partnerId = i ^ compareDistance;

            if (partnerId >= dataSize)
                continue;

            const uint32_t waveSize = WaveGetLaneCount();
            const uint32_t myWaveId = i / waveSize;
            const uint32_t partnerWaveId = partnerId / waveSize;
            const bool sameWave = (myWaveId == partnerWaveId);

            uint32_t myKey, myValue, partnerKey, partnerValue;
            [branch]
            if (sameWave && compareDistance < waveSize)
            {
                // WAVE INTRINSIC
                myKey = sharedKeys[i];
                myValue = sharedValues[i];

                const uint32_t partnerLane = partnerId % waveSize;
                partnerKey = WaveReadLaneAt(myKey, partnerLane);
                partnerValue = WaveReadLaneAt(myValue, partnerLane);
            }
            else
            {
                // SHARED MEM
                myKey = sharedKeys[i];
                myValue = sharedValues[i];
                partnerKey = sharedKeys[partnerId];
                partnerValue = sharedValues[partnerId];
            }

            const uint32_t sequenceSize = 1 << (stage + 1);
            const uint32_t sequenceIndex = i / sequenceSize;
            const bool sequenceAscending = (sequenceIndex % 2) == 0;
            const bool ascending = true;
            const bool finalDirection = sequenceAscending == ascending;

            const bool swap = (myKey > partnerKey) == finalDirection;

            // WORKGROUP COORDINATION: Only lower-indexed element writes both
            if (i < partnerId && swap)
            {
                sharedKeys[i] = partnerKey;
                sharedKeys[partnerId] = myKey;
                sharedValues[i] = partnerValue;
                sharedValues[partnerId] = myValue;
            }
        }

        GroupMemoryBarrierWithGroupSync();
    }
}

DataAccessor outputKeys = DataAccessor::create(DataPtr::create(pushData.outputKeyAddress));
DataAccessor outputValues = DataAccessor::create(DataPtr::create(pushData.outputValueAddress));
Glad it works, but now the real work begins.
This needs to be split into subgroup::BitonicSort<Config> and workgroup::BitonicSort<Config>, with the latter using the former.
See the FFT and PrefixSum/Scan (reduce, inclusive_scan) code for inspiration on how to lay this out. I think they don't use our weird C++20/HLSL concepts, but you should.
In the Config we need a SubgroupSize, KeyAccessor, ValueAccessor, a Key Comparator (for a default see our hlsl::less binop), and obviously typedefs for value_t and key_t.
Only the workgroup config needs a ScratchKeyAccessor (what will mediate your smem accesses) and a ScratchValueAccessor.
The reason is that this lets us choose to sort values from/to BDA, SSBO, Smem, Images, and use various memory as scratch (different offsets of shared, reuse of shared mem allocations).
And most importantly, this lets us choose whether to load the Keys and move them around in scratch, or move their indices and load them from main memory every time (useful if the Key is super fat), without touching the Bitonic sort code; the user would just provide an appropriate scratch-key accessor and comparator which both translate the argument to/from an index.
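Very roughly, the shape of such a Config (purely illustrative; the real thing should follow the FFT/scan conventions and concepts, and these member names are my own):
template<uint16_t _SubgroupSizeLog2, typename KeyAccessor, typename ValueAccessor, typename Comparator = nbl::hlsl::less<uint32_t> >
struct BitonicSortConfig
{
    using key_t = uint32_t;   // illustrative typedefs
    using value_t = uint32_t;
    NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = _SubgroupSizeLog2;
    NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(1) << SubgroupSizeLog2;
    // the workgroup-level Config would additionally carry ScratchKeyAccessor / ScratchValueAccessor
};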
Items Per Invocation should also come from the Config.
Btw with a uint16_t value index and 1 invocation always processing a pair of items, this means a baseline shared memory consumption of 8 bytes per invocation if the key is also either 2 bytes or an index of a key.
Realistically the max virtual thread repetition is 4; in your case, with an actual 32-bit key getting used directly, it's 3, but with the optimal workgroup size of 512 on NV, to keep the multiple a PoT it becomes 2.
As you can see, the point at which one should consider indexing keys instead of using them outright is 14-byte keys.
btw when doing subgroup bitonics it's also possible to have multiple items per invocation (more than 2), so it's useful to bitonic sort in registers first (see the subgroup prefix sum and FFT doing stuff locally within an invocation first).
Btw barriers are expensive, so make sure that the repeated items sit at the most minor level of indexing.
So that the subgroup code accesses
Key keys[2<<ElementsPerInvocationLog2];
this way the last ElementsPerInvocationLog2 passes in a stage execute without any barriers (subgroup or workgroup).
for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize)
{
cool use of virtual invocations
one slight nitpick: this won't [unroll] even if dataSize were a compile-time constant.
To get a loop that unrolls you need to do
for (uint32_t baseIndex=0; baseIndex<dataSize; baseIndex+=WorkgroupSize)
{
    const uint32_t i = threadId+baseIndex;
compilers are dumb
void main(uint32_t3 dispatchId : SV_DispatchThreadID, uint32_t3 localId : SV_GroupThreadID)
{
    const uint32_t threadId = localId.x;
    const uint32_t dataSize = pushData.dataElementCount;
I'd make that a constant and part of the Config, because the workgroup size needs to be hardcoded and, as I've mentioned, the optimal amount of shared memory per invocation to use is <= 8 dwords (32 bytes)
if (partnerId >= dataSize)
    continue;

const uint32_t waveSize = WaveGetLaneCount();
that's a constant which should get hoisted out; btw if you make it a template uint supplied from a Config, the compiler will unroll certain loops (instead of counting on the SPIR-V optimizer in the driver to do it), that's why we do it in the FFT code
[branch]
if (sameWave && compareDistance < waveSize)
{
    // WAVE INTRINSIC
    myKey = sharedKeys[i];
    myValue = sharedValues[i];

    const uint32_t partnerLane = partnerId % waveSize;
    partnerKey = WaveReadLaneAt(myKey, partnerLane);
    partnerValue = WaveReadLaneAt(myValue, partnerLane);
}
else
{
    // SHARED MEM
    myKey = sharedKeys[i];
    myValue = sharedValues[i];
    partnerKey = sharedKeys[partnerId];
    partnerValue = sharedValues[partnerId];
}
the wholly-subgroup stages (stage <= subgroupSizeLog2) need separation from the stages where compareDistance can be big. That's what I want wrapped up in subgroup::BitonicSort
btw the workgroup stages would still use the subgroup version to finish off each stage, because compareDistance eventually decreases to subgroup-sized
if (partnerId >= dataSize)
    continue;
this causes a 50% efficiency drop when compareDistance < SubgroupSize,
because only half the wave is active. Do your for loop from 0
to (dataSize>>1) (half as many iterations)
and fix up your index calculations by inserting a 0
within the i bitpattern for the lowerIndex and a 1
for the higher, see the FFT code for how that's done
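A sketch of the bit-insertion itself (my own illustration, not the FFT helper; here i runs over [0, dataSize>>1) and compareDistance is a power of two):
const uint32_t lowMask = compareDistance - 1u;
const uint32_t lowerIndex = ((i & ~lowMask) << 1) | (i & lowMask); // insert a 0 at the compareDistance bit
const uint32_t higherIndex = lowerIndex | compareDistance;         // same bitpattern with a 1 inserted instead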
btw you might have some issues with making this work for a single subgroup, @Fletterio may have some ideas on how to help you there (see his FFT blogpost, about trading halves of the FFT with subgroup shuffles while only keeping one invocation active).
The main problem is that with a 32-sized subgroup you need to treat 64 items.
Ideally you'd want one invocation to get 2 keys and value indices, but keep them rearranged in such a way that all 32 invocations are always active.
Meaning that with step sizes of 1,2,4,8,... subgroupSize/2 you need to cleverly "reuse" the "partner ID" invocation to do the work of the "would-be" second subgroup.
const uint32_t myWaveId = i / waveSize;
const uint32_t partnerWaveId = partnerId / waveSize;
const bool sameWave = (myWaveId == partnerWaveId);
integer division is expensive, it would have been better to do just bool((i^partnerId)&uint16_t(-SubgroupSize))
or just compareDistance>=SubgroupSize
const uint32_t partnerLane = partnerId % waveSize;
partnerKey = WaveReadLaneAt(myKey, partnerLane);
partnerValue = WaveReadLaneAt(myValue, partnerLane);
use our SPIR-V intrinsics from the spirv or glsl namespace.
You may spot that there's a ShuffleXor which lets you simply read the lane at gl_SubgroupInvocationIndex^mask
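A sketch of how the swap could look with that (treating the exact compat function name as an assumption):
partnerKey = nbl::hlsl::glsl::subgroupShuffleXor<uint32_t>(myKey, compareDistance);   // assumed compat wrapper over OpGroupNonUniformShuffleXor
partnerValue = nbl::hlsl::glsl::subgroupShuffleXor<uint32_t>(myValue, compareDistance);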
// WAVE INTRINSIC
myKey = sharedKeys[i];
myValue = sharedValues[i];
there's no need to read/write shared memory during the subgroup stages, you could keep the value in a register the whole time
const uint32_t sequenceSize = 1 << (stage + 1);
const uint32_t sequenceIndex = i / sequenceSize;
const bool sequenceAscending = (sequenceIndex % 2) == 0;
const bool ascending = true;
const bool finalDirection = sequenceAscending == ascending;

const bool swap = (myKey > partnerKey) == finalDirection;
this can be simply expressed as bool(i & (0x1u << (stage + 1))), all that code is just checking whether a single bit of i is set
// WORKGROUP COORDINATION: Only lower-indexed element writes both
if (i < partnerId && swap)
{
    sharedKeys[i] = partnerKey;
    sharedKeys[partnerId] = myKey;
    sharedValues[i] = partnerValue;
    sharedValues[partnerId] = myValue;
}
the key is to not use 2 invocations for 2 inputs, but 1 invocation for 2, throughout the whole function
buffer->setObjectDebugName(label);

auto reqs = buffer->getMemoryReqs();
reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits();
you wanna bitwise-AND the up-streaming memory types as well, so you're running from VRAM and not RAM
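Something like this is the intent (getUpStreamingMemoryTypeBits is a hypothetical getter name inferred from this comment, so double-check the actual IPhysicalDevice API):
auto reqs = buffer->getMemoryReqs();
reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits();
reqs.memoryTypeBits &= m_physicalDevice->getUpStreamingMemoryTypeBits(); // hypothetical name: host-visible AND device-local, so the shader reads from VRAM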
cmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
cmdBuf->beginDebugMarker("Bitonic Sort Single Dispatch", core::vectorSIMDf(0, 1, 0, 1));
cmdBuf->bindComputePipeline(bitonicSortPipeline.get());
cmdBuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc);
cmdBuf->dispatch(1, 1, 1);
cmdBuf->endDebugMarker();
cmdBuf->end();

{
    auto queue = getComputeQueue();

    IQueue::SSubmitInfo submit_infos[1];
    IQueue::SSubmitInfo::SCommandBufferInfo cmdBufs[] = {
        {
            .cmdbuf = cmdBuf.get()
        }
    };
    submit_infos[0].commandBuffers = cmdBufs;
    IQueue::SSubmitInfo::SSemaphoreInfo signals[] = {
        {
            .semaphore = progress.get(),
            .value = ++timeline,
            .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
        }
    };
    submit_infos[0].signalSemaphores = signals;

    m_api->startCapture();
    queue->submit(submit_infos);
    m_api->endCapture();
}

const ISemaphore::SWaitInfo wait_infos[] = { {
    .semaphore = progress.get(),
    .value = timeline
} };
m_device->blockForSemaphores(wait_infos);
add a separate benchmark mode that does more workgroup dispatches (but allocates the buffers as DeviceLocal without host readability) and runs them in a loop with a time-elapsed query.
If you're curious you can see if you're faster than https://github.com/AndrewBoessen/Bitonic-Merge-Sort
if (!allocation[0].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
    m_device->invalidateMappedMemoryRanges(1, &memory_range[0]);
if (!allocation[1].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
    m_device->invalidateMappedMemoryRanges(1, &memory_range[1]);
wrong, the bits you write must be flushed before you submit;
the other buffers (the outputs) are the ones you invalidate
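A sketch of the intended split (assuming ILogicalDevice::flushMappedMemoryRanges mirrors the invalidate call already used here):
// before the submit: flush what the host wrote (the inputs)
if (!allocation[0].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
    m_device->flushMappedMemoryRanges(1, &memory_range[0]);
if (!allocation[1].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
    m_device->flushMappedMemoryRanges(1, &memory_range[1]);
// after waiting on the semaphore: invalidate what the host will read (the outputs)
if (!allocation[2].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
    m_device->invalidateMappedMemoryRanges(1, &memory_range[2]);
if (!allocation[3].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
    m_device->invalidateMappedMemoryRanges(1, &memory_range[3]);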
allocation[2].memory->unmap();
allocation[3].memory->unmap();

m_device->waitIdle();
you don't need to waitIdle because you've blocked on the submit semaphore