13_BitonicSort/CMakeLists.txt (24 additions, 0 deletions)
@@ -0,0 +1,24 @@
include(common RESULT_VARIABLE RES)
if(NOT RES)
    message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
endif()

nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")

if(NBL_EMBED_BUILTIN_RESOURCES)
    set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
    set(RESOURCE_DIR "app_resources")

    get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
    get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
    get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)

    file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
    foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
        LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
    endforeach()

    ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")

    LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
endif()
13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl (112 additions, 0 deletions)
@@ -0,0 +1,112 @@
#include "nbl/builtin/hlsl/bda/bda_accessor.hlsl"

struct BitonicPushData
{
    uint64_t inputKeyAddress;
    uint64_t inputValueAddress;
    uint64_t outputKeyAddress;
    uint64_t outputValueAddress;
    uint32_t dataElementCount;
};
Comment on lines +3 to +10:
Why are you not including common.hlsl and having duplicate code?

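A minimal sketch of the deduplication being asked for, assuming the example's app_resources directory is on the shader include path (as in sibling examples):

    // reuse the shared push-constant struct instead of redefining it
    #include "common.hlsl"
    #include "nbl/builtin/hlsl/bda/bda_accessor.hlsl"

    [[vk::push_constant]] BitonicPushData pushData;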

using namespace nbl::hlsl;

[[vk::push_constant]] BitonicPushData pushData;

using DataPtr = bda::__ptr<uint32_t>;
using DataAccessor = BdaAccessor<uint32_t>;

groupshared uint32_t sharedKeys[ElementCount];

Review comment:
Good SoA instead of AoS, no bank conflicts.

@devshgraphicsprogramming (Sep 1, 2025):
Your shared memory size is fixed, which means you shouldn't have a dataElementCount push constant.
Unless its purpose is to allow you to sort fewer elements than ElementCount, but then you'd still need to initialize all of sharedKeys with the highest possible value so you don't end up with garbage getting into the sort.

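A sketch of the padding the reviewer describes, applied to the load loop further down; it assumes an ascending sort, so 0xFFFFFFFFu acts as the neutral "greater than everything" key:

    // load real keys, pad the tail so a partial workload still sorts correctly
    for (uint32_t i = threadId; i < ElementCount; i += WorkgroupSize)
    {
        uint32_t key = 0xFFFFFFFFu; // sentinel for an ascending sort
        if (i < dataSize)
            inputKeys.get(i, key);
        sharedKeys[i] = key;
    }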
groupshared uint32_t sharedValues[ElementCount];

Review comment:
It's cheaper to move the Key & Index around instead of the Value, because the Value might be quite fat (also, for a workgroup scan you only need a uint16_t).

@devshgraphicsprogramming (Sep 3, 2025):
Also because the value never has to get accessed until the end. So you can initialize sharedValueIndex with an IOTA sequence (0, 1, 2, ...) and not perform any value reads until the end, where you do:

    value_t tmp;
    inputValues.get(sharedValueIndex[i], tmp);
    outputValues.set(i, tmp);

Follow-up:
Hmm, actually you wouldn't initialize sharedValueIndex with IOTA but with the result of the subgroup::BitonicSort.

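A sketch of the shared-memory layout this implies; sharedValueIndex is a hypothetical name, not in the PR:

    groupshared uint32_t sharedKeys[ElementCount];
    groupshared uint16_t sharedValueIndex[ElementCount]; // 2 bytes per element instead of a potentially fat value

    // after the sort, gather each value exactly once:
    for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize)
    {
        uint32_t tmp;
        inputValues.get(sharedValueIndex[i], tmp);
        outputValues.set(i, tmp);
    }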

[numthreads(WorkgroupSize, 1, 1)]

Review comment:
Some static asserts about the workgroup size dividing the element count evenly, and the element count being a power of two, would be nice.

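A sketch of those asserts, assuming ElementCount and WorkgroupSize are compile-time constants and the HLSL 2021 _Static_assert is available in DXC:

    _Static_assert((ElementCount & (ElementCount - 1)) == 0, "ElementCount must be a power of two");
    _Static_assert(ElementCount % WorkgroupSize == 0, "WorkgroupSize must divide ElementCount evenly");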
[shader("compute")]
void main(uint32_t3 dispatchId : SV_DispatchThreadID, uint32_t3 localId : SV_GroupThreadID)

Review comment:
Use nbl::hlsl::glsl:: functions instead of the weird ": SV_" HLSL semantics; see the other examples using gl_LocalInvocationIndex.

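A sketch of what that looks like, assuming the glsl_compat header used by sibling examples:

    #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"

    [numthreads(WorkgroupSize, 1, 1)]
    [shader("compute")]
    void main()
    {
        const uint32_t threadId = glsl::gl_LocalInvocationIndex();
        // ...
    }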
{
    const uint32_t threadId = localId.x;
    const uint32_t dataSize = pushData.dataElementCount;
@devshgraphicsprogramming (Sep 3, 2025):
I'd make that a constant and part of the Config, because the workgroup size needs to be hardcoded anyway and, as I've mentioned, the optimal amount of shared memory to use per invocation is <= 8 dwords (32 bytes).


    DataAccessor inputKeys = DataAccessor::create(DataPtr::create(pushData.inputKeyAddress));
    DataAccessor inputValues = DataAccessor::create(DataPtr::create(pushData.inputValueAddress));

    for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize)
    {
        inputKeys.get(i, sharedKeys[i]);
        inputValues.get(i, sharedValues[i]);

Review comment:
Very nice, but why isn't this wrapped up into a statically polymorphic struct like the FFT and workgroup prefix sum, so we can actually reuse the code? You know, with shared memory accessors etc.

    }

    // Synchronize all threads after loading
    GroupMemoryBarrierWithGroupSync();
@devshgraphicsprogramming (Sep 1, 2025):
Use the glsl or SPIR-V barrier; we dislike HLSL intrinsics.

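A sketch of the swap, assuming glsl_compat exposes the workgroup barrier as in other Nabla examples:

    #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"

    // replaces GroupMemoryBarrierWithGroupSync()
    glsl::barrier();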


    for (uint32_t stage = 0; stage < Log2ElementCount; stage++)

Review comment:
Where does Log2ElementCount come from? I have a feeling that you shouldn't have a data size push constant then.

    {
        for (uint32_t pass = 0; pass <= stage; pass++)
        {
            const uint32_t compareDistance = 1 << (stage - pass);

            for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize)
            {
Comment on lines +48 to +49:
Cool use of virtual invocations.

Follow-up:
One slight nitpick: this won't [unroll] even were dataSize a compile-time constant. To get a loop that unrolls you need to do

    for (uint32_t baseIndex = 0; baseIndex < dataSize; baseIndex += WorkgroupSize)
    {
        const uint32_t i = threadId + baseIndex;
        // ... loop body as before ...
    }

Compilers are dumb.

                const uint32_t partnerId = i ^ compareDistance;

                if (partnerId >= dataSize)
                    continue;
Comment on lines +52 to +53:
This causes a 50% efficiency drop when compareDistance < SubgroupSize, because only half the wave is active. Do your for loop from 0 to (dataSize >> 1) (half as many iterations), and fix up your index calculations by inserting a 0 within the i bitpattern for the lowerIndex and a 1 for the higher; see the FFT code for that getting done.

Follow-up:
BTW, you might have some issues with making this work for a single subgroup; @Fletterio may have some ideas how to help you there (see his FFT blogpost, about trading halves of the FFT with subgroup shuffles while only keeping one invocation active).
The main problem is that with a 32-sized subgroup you need to treat 64 items.
Ideally you'd want one invocation to get 2 keys and value indices, but keep them rearranged in such a way that 32 invocations are always active.
Meaning that with step sizes of 1, 2, 4, 8, ... subgroupSize/2 you need to cleverly "reuse" the "partner ID" invocation to do the work of the "would be" second subgroup.

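A sketch of that indexing, with a hypothetical insertBit helper (the FFT code has the canonical version):

    // file scope: insert bit b at position pos of x (bits below pos stay, bits above shift up)
    uint32_t insertBit(uint32_t x, uint32_t pos, uint32_t b)
    {
        const uint32_t lowMask = (1u << pos) - 1u;
        return ((x & ~lowMask) << 1) | (b << pos) | (x & lowMask);
    }

    // inside the pass loop: half as many virtual invocations, each owning one pair
    for (uint32_t t = threadId; t < (dataSize >> 1); t += WorkgroupSize)
    {
        const uint32_t log2Dist = stage - pass; // compareDistance == 1u << log2Dist
        const uint32_t lowerIndex = insertBit(t, log2Dist, 0u);
        const uint32_t higherIndex = lowerIndex | (1u << log2Dist);
        // compare-exchange sharedKeys[lowerIndex] with sharedKeys[higherIndex]
    }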

                const uint32_t waveSize = WaveGetLaneCount();

Review comment:
That's a constant which should get hoisted out. BTW, if you make it a template uint supplied from a Config, the compiler will unroll certain loops itself (instead of counting on the SPIR-V optimizer in the driver to do it); that's why we do it in the FFT code.

                const uint32_t myWaveId = i / waveSize;
                const uint32_t partnerWaveId = partnerId / waveSize;
                const bool sameWave = (myWaveId == partnerWaveId);
Comment on lines +56 to +58:
Integer division is expensive; it would have been better to do just bool((i ^ partnerId) & uint16_t(-SubgroupSize)) or just compareDistance > SubgroupSize.

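A one-line sketch of the cheap test; since partnerId = i ^ compareDistance and both sizes are powers of two, the pair stays inside one wave exactly when the xor-ed bit lies below the wave size:

    const bool sameWave = compareDistance < waveSize; // no integer division needed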

                uint32_t myKey, myValue, partnerKey, partnerValue;
                [branch]
                if (sameWave && compareDistance < waveSize)
                {
                    // WAVE INTRINSIC
                    myKey = sharedKeys[i];
                    myValue = sharedValues[i];
Comment on lines +64 to +66:
There's no need to read-write to shared memory during the subgroup stages; you could keep the value in a register the whole time.


                    const uint32_t partnerLane = partnerId % waveSize;
                    partnerKey = WaveReadLaneAt(myKey, partnerLane);
                    partnerValue = WaveReadLaneAt(myValue, partnerLane);
Comment on lines +68 to +70:
Use our SPIR-V intrinsics from the spirv or glsl namespace.
You may spot that there's a ShuffleXor, which lets you simply read the lane at gl_SubgroupInvocationIndex ^ mask.

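A sketch of the xor-shuffle variant; the header path and glsl::subgroupShuffleXor spelling are assumptions based on other Nabla examples:

    #include "nbl/builtin/hlsl/glsl_compat/subgroup_shuffle.hlsl"

    // reads the lane at gl_SubgroupInvocationIndex ^ compareDistance
    partnerKey = glsl::subgroupShuffleXor<uint32_t>(myKey, compareDistance);
    partnerValue = glsl::subgroupShuffleXor<uint32_t>(myValue, compareDistance);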
                }
                else
                {
                    // SHARED MEM
                    myKey = sharedKeys[i];
                    myValue = sharedValues[i];
                    partnerKey = sharedKeys[partnerId];
                    partnerValue = sharedValues[partnerId];
                }
Comment on lines +61 to +79:
The wholly-subgroup stages (stage <= subgroupSizeLog2) need separation from the stages where the compareDistance can be big. That's what I want wrapped up in subgroup::BitonicSort.

@devshgraphicsprogramming (Sep 3, 2025):
BTW, the workgroup stages would still use the subgroup to finish off each stage, because the compareDistance eventually decreases to subgroup size.


                const uint32_t sequenceSize = 1 << (stage + 1);
                const uint32_t sequenceIndex = i / sequenceSize;
                const bool sequenceAscending = (sequenceIndex % 2) == 0;
                const bool ascending = true;
                const bool finalDirection = sequenceAscending == ascending;

                const bool swap = (myKey > partnerKey) == finalDirection;
Comment on lines +81 to +87:
This can be simply expressed as bool(i & (0x1u << (stage + 2))); all that code is just checking for one bit being set in i.

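A sketch of the collapsed form, derived from the code above (note the bit actually tested there is stage + 1, since sequenceSize = 1 << (stage + 1)):

    const bool descending = bool(i & (1u << (stage + 1))); // which half of the bitonic sequence i sits in
    const bool swap = (myKey > partnerKey) != descending;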

                // WORKGROUP COORDINATION: Only lower-indexed element writes both
                if (i < partnerId && swap)
                {
                    sharedKeys[i] = partnerKey;
                    sharedKeys[partnerId] = myKey;
                    sharedValues[i] = partnerValue;
                    sharedValues[partnerId] = myValue;
                }
Comment on lines +89 to +96:
The key is not using 2 invocations for 2 inputs, but 1 invocation for 2, throughout the whole function.

            }

            GroupMemoryBarrierWithGroupSync();
        }
    }

    DataAccessor outputKeys = DataAccessor::create(DataPtr::create(pushData.outputKeyAddress));
    DataAccessor outputValues = DataAccessor::create(DataPtr::create(pushData.outputValueAddress));
Comment on lines +29 to +105:
Glad it works, but now the real work begins.

This needs to be split into subgroup::BitonicSort<Config> and workgroup::BitonicSort<Config>, with the latter using the former.
See the FFT and PrefixSum/Scan (reduce, inclusive_scan) code for inspiration on how to lay this out. I think they don't use our weird C++20/HLSL concepts, but you should.

In the Config we need a SubgroupSize, KeyAccessor, ValueAccessor, and Key Comparator (for a default see our hlsl::less binop), and obviously typedefs for value_t and key_t.

Only the workgroup config needs a ScratchKeyAccessor (what will mediate your smem accesses) and a ScratchValueAccessor.

The reason is that this lets us choose to sort values from/to BDA, SSBO, Smem, or Images, and use various memory as scratch (different offsets of shared memory, reuse of shared memory allocations).

And most importantly, this lets us choose whether to load the Keys and move them around in scratch, or move their indices and load them from main memory every time (useful if the Key is super fat), without touching the bitonic sort code; the user would just provide an appropriate scratch-key accessor and comparator which both translate the argument to/from an index.

Follow-up:
Items Per Invocation should also come from the Config.

BTW, with a uint16_t value index and 1 invocation always processing a pair of items, this means a baseline shared memory consumption of 8 bytes per invocation if the key is also either 2 bytes or an index of a key.

Realistically the max virtual thread repetition is 4; in your case, with an actual 32-bit key getting used directly, it's 3, but with the optimal workgroup size of 512 on NV to keep the multiple PoT it becomes 2.

As you can see, the point at which one should consider indexing keys instead of using them outright is 14-byte keys.

@devshgraphicsprogramming (Sep 3, 2025):
BTW, when doing subgroup bitonics it's also possible to have multiple items per invocation (more than 2), so it's useful to bitonic sort in registers first (see the subgroup prefix sum and FFT doing stuff locally within an invocation first).

Also, barriers are expensive, so make sure that the repeated items sit at the most minor level of indexing, so that the subgroup version has each invocation access

    Key keys[2 << ElementsPerInvocationLog2];

this way the last ElementsPerInvocationLog2 passes of a stage execute without any barriers (subgroup or workgroup).

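A sketch of the Config shape being requested; every name below is extrapolated from the comment and the FFT/scan layout, not an existing Nabla API:

    // hypothetical Config, illustrating the requested parameterization
    template<uint16_t SubgroupSizeLog2, typename KeyAccessor, typename ValueAccessor,
        typename Comparator /* e.g. nbl::hlsl::less<KeyType> */, typename KeyType, typename ValueType>
    struct SubgroupBitonicConfig
    {
        NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(1) << SubgroupSizeLog2;
        using key_t = KeyType;
        using value_t = ValueType;
    };

    // the workgroup-level Config additionally mediates shared-memory scratch
    template<typename SubgroupConfig, typename ScratchKeyAccessor, typename ScratchValueAccessor>
    struct WorkgroupBitonicConfig : SubgroupConfig {};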

    for (uint32_t i = threadId; i < dataSize; i += WorkgroupSize)
    {
        outputKeys.set(i, sharedKeys[i]);
        outputValues.set(i, sharedValues[i]);
    }
}
13_BitonicSort/app_resources/common.hlsl (17 additions, 0 deletions)
@@ -0,0 +1,17 @@
// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
// This file is part of the "Nabla Engine".
// For conditions of distribution and use, see copyright notice in nabla.h
#ifndef _BITONIC_SORT_COMMON_INCLUDED_
#define _BITONIC_SORT_COMMON_INCLUDED_

struct BitonicPushData
{
    uint64_t inputKeyAddress;
    uint64_t inputValueAddress;
    uint64_t outputKeyAddress;
    uint64_t outputValueAddress;
    uint32_t dataElementCount;
};

#endif
13_BitonicSort/config.json.template (28 additions, 0 deletions)
@@ -0,0 +1,28 @@
{
    "enableParallelBuild": true,
    "threadsPerBuildProcess": 2,
    "isExecuted": false,
    "scriptPath": "",
    "cmake": {
        "configurations": [ "Release", "Debug", "RelWithDebInfo" ],
        "buildModes": [],
        "requiredOptions": []
    },
    "profiles": [
        {
            "backend": "vulkan", // should be none
            "platform": "windows",
            "buildModes": [],
            "runConfiguration": "Release", // we also need to run in Debug and RWDI because this is a foundational example
            "gpuArchitectures": []
        }
    ],
    "dependencies": [],
    "data": [
        {
            "dependencies": [],
            "command": [""],
            "outputs": []
        }
    ]
}