Skip to content

Commit cfd44ee

Browse files
committed
new global sort kernel
1 parent 7bf42aa commit cfd44ee

File tree

1 file changed

+23
-28
lines changed

1 file changed

+23
-28
lines changed

global_bitonic_sort.cu

Lines changed: 23 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,25 @@
99
#include "bitonic_sort.cuh"
1010

1111
/**
12-
* Swap
12+
* Global Memory Bitonic Sort Swap
1313
*
1414
* This is used for swapping elements in bitonic sorting
1515
*
1616
* @param x caller line id's value
17-
* @param mask source lane id = caller line id ^ mask
18-
* @param dir direction to swap
17+
* @param i current large step in bitonic sort sequence
18+
* @param j current small step in sequence
1919
* @param arr global memory array
2020
*
2121
*/
22-
__device__ void swap(int x, int mask, int dir, int *arr) {
22+
__global__ void globalSwap(int i, int j, int *arr) {
23+
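// thread id within grid -- NOTE(review): the expression below multiplies threadIdx.x by the block offset; a flat global index is normally threadIdx.x + blockIdx.x * blockDim.x. Confirm and fix.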
24+
int x = threadIdx.x * blockIdx.x * blockDim.x;
25+
// distance between caller and source lanes
26+
int mask = 1 << (i - j);
27+
28+
// perform compare and swap
29+
int dir = x & (1 << i);
30+
2331
// get the element corresponding to x in the butterfly diagram
2432
int y = x ^ mask;
2533
// the lower-id thread performs the swap
@@ -45,40 +53,27 @@ __device__ void swap(int x, int mask, int dir, int *arr) {
4553
/**
4654
* Global Memory Bitonic Sort
4755
*
48-
* The function uses the butterfly network pattern of bitonic sort, leveraging
49-
* CUDA's warp-level primitives for efficient sorting within a warp (32
50-
* threads). The swaps are tiled into warps of 32 threads. This is able to do
51-
* swaps without allocating extra memory for temporary variable.
52-
*
5356
* @param arr Pointer to the array of integers to be sorted
5457
* @param size Total number of elements in the array
58+
* @param block_size Number of threads in one block
59+
* @param num_blocks Total number of blocks in the grid
5560
*
56-
* @note This function assumes that the number of threads per block is at least
57-
* equal to the warp size. Elements beyond the array size are padded with
58-
* INT_MAX.
61+
* @note This function assumes that the number of elements in the array is a power
62+
* of two
5963
*
60-
* @see swap() for the element comparison and swapping logic
64+
* @see globalSwap() for the element comparison and swapping logic kernel
6165
*/
62-
__global__ void globalBitonicSort(int *arr, int size) {
63-
// local thread id in block
64-
int thread_id = threadIdx.x + blockIdx.x * blockDim.x;
65-
66-
// make bitonic sequence and sort
67-
for (int i = 0; (1 << i) <= blockDim.x; i++) {
66+
// Host-side driver: issues one globalSwap launch for every (i, j) step of the
// bitonic network over arr. Every launch goes to the default stream, so the
// steps execute in issue order without an explicit barrier between them.
void globalBitonicSort(int *arr, int size, int block_size, int num_blocks) {
  // i is the large step (stage); the loop ends once 1 << i exceeds size.
  for (int i = 0; (1 << i) <= size; i++) {
    // j is the small step within stage i; stage 0 has no small steps.
    for (int j = 1; j <= i; j++) {
      // The execution configuration is <<<grid, block>>>: the number of
      // blocks comes first and the threads per block second. The previous
      // order (<<<block_size, num_blocks>>>) used the block count as the
      // per-block thread count, which makes the launch invalid as soon as
      // num_blocks exceeds the device's threads-per-block limit.
      globalSwap<<<num_blocks, block_size>>>(i, j, arr);
    }
  }
}
79-
8074
// Chooses a launch configuration for `size` elements and runs the global
// memory bitonic sort on arr.
//
// arr  : array to sort (passed through to globalBitonicSort unchanged)
// size : number of elements; assumed to be a power of two, as documented on
//        globalBitonicSort -- TODO confirm callers pad accordingly
void launchBitonicSort(int *arr, int size) {
  // With zero or one element the sort loops issue no launches, so return
  // before computing a configuration.
  if (size <= 1) {
    return;
  }

  const int BLOCK_SIZE = 512;
  // globalSwap receives no element count, so it cannot guard against thread
  // ids past the end of arr. Never launch more threads than elements: for
  // arrays smaller than one full block, shrink the block to `size`.
  const int threads = size < BLOCK_SIZE ? size : BLOCK_SIZE;
  const int NUM_BLOCKS = (size + threads - 1) / threads;

  // call sort function
  globalBitonicSort(arr, size, threads, NUM_BLOCKS);
}

0 commit comments

Comments
 (0)