99#include " bitonic_sort.cuh"
1010
/**
 * Global Memory Bitonic Sort Swap
 *
 * This is used for swapping elements in bitonic sorting. It performs one
 * compare-and-swap step; each thread derives the element index x it is
 * responsible for from its own thread id within the grid.
 *
 * NOTE(review): the kernel body forms x as a product
 * (threadIdx.x * blockIdx.x * blockDim.x); the usual flat index is
 * blockIdx.x * blockDim.x + threadIdx.x - confirm which one is intended.
 *
 * @param i current large step in the bitonic sort sequence
 * @param j current small step within large step i
 * @param arr global memory array
 *
 */
22- __device__ void swap (int x, int mask, int dir, int *arr) {
22+ __global__ void globalSwap (int i, int j, int *arr) {
23+ // thread id within grid
24+ int x = threadIdx .x * blockIdx .x * blockDim .x ;
25+ // distance between caller and source lanes
26+ int mask = 1 << (i - j);
27+
28+ // perform compare and swap
29+ int dir = x & (1 << i);
30+
2331 // get correspondin element to x in butterfly diagram
2432 int y = x ^ mask;
2533 // lower ids thread perform swap
@@ -45,40 +53,27 @@ __device__ void swap(int x, int mask, int dir, int *arr) {
/**
 * Global Memory Bitonic Sort
 *
 * Host-side driver for the bitonic sort. It walks the sorting network one
 * step at a time and launches one globalSwap kernel per (i, j) step. The
 * launches carry no stream argument, so they are issued to the default stream
 * and run in launch order; no explicit synchronisation is done here.
 *
 * @param arr Pointer to the array of integers to be sorted; it is passed
 * straight to the kernel, so it must be device-accessible
 * @param size Total number of elements in the array
 * @param block_size Number of threads in one block
 * @param num_blocks Total number of blocks in the grid
 *
 * @note This function assumes that the number of elements in the array is a
 * power of two
 *
 * @see globalSwap() for the element comparison and swapping logic kernel
 */
void globalBitonicSort(int *arr, int size, int block_size, int num_blocks) {
  // make bitonic sequence and sort
  // (size >> i) > 0 is the overflow-free equivalent of (1 << i) <= size
  for (int i = 0; (size >> i) > 0; i++) {
    for (int j = 1; j <= i; j++) {
      // execution configuration order is <<<blocks in grid, threads per block>>>
      globalSwap<<<num_blocks, block_size>>>(i, j, arr);
    }
  }
}
79-
/**
 * Launch the global memory bitonic sort.
 *
 * Picks the launch geometry (at most 512 threads per block, enough blocks to
 * cover every element) and hands it to globalBitonicSort.
 *
 * @param arr Pointer to the array of integers to be sorted; forwarded
 * unchanged to globalBitonicSort
 * @param size Total number of elements in the array
 */
void launchBitonicSort(int *arr, int size) {
  // fewer than two elements: nothing to sort (no sorting step would be
  // launched for these sizes anyway)
  if (size < 2) {
    return;
  }

  const int MAX_BLOCK_SIZE = 512;
  // globalSwap receives no element count and so cannot bounds-check; never
  // start more threads per block than there are elements
  const int BLOCK_SIZE = size < MAX_BLOCK_SIZE ? size : MAX_BLOCK_SIZE;
  const int NUM_BLOCKS = (size + BLOCK_SIZE - 1) / BLOCK_SIZE;

  // call sort function
  globalBitonicSort(arr, size, BLOCK_SIZE, NUM_BLOCKS);
}
0 commit comments