Skip to content

Commit 7bf42aa

Browse files
committed
global memory bitonic sort
1 parent b2cc210 commit 7bf42aa

File tree

2 files changed

+92
-2
lines changed

2 files changed

+92
-2
lines changed

makefile renamed to Makefile

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,17 @@ CUDA_PATH = /opt/cuda
66
INCLUDES = -I$(CUDA_PATH)/include
77
LDFLAGS = -L$(CUDA_PATH)/lib64 -lcudart
88

9-
all: cpu_bitonic_sort warp_bitonic_sort smem_bitonic_sort
9+
all: cpu_bitonic_sort warp_bitonic_sort smem_bitonic_sort global_bitonic_sort
1010

1111
warp_bitonic_sort: main.o warp_bitonic_sort.o
1212
$(CXX) $^ -o $@ $(LDFLAGS)
1313

1414
smem_bitonic_sort: main.o smem_bitonic_sort.o
1515
$(CXX) $^ -o $@ $(LDFLAGS)
1616

17+
global_bitonic_sort: main.o global_bitonic_sort.o
18+
$(CXX) $^ -o $@ $(LDFLAGS)
19+
1720
cpu_bitonic_sort: cpu_bitonic_sort.cpp
1821
$(CXX) $^ -o $@
1922

@@ -26,5 +29,8 @@ warp_bitonic_sort.o: warp_bitonic_sort.cu bitonic_sort.cuh
2629
smem_bitonic_sort.o: smem_bitonic_sort.cu bitonic_sort.cuh
2730
$(NVCC) $(NVCCFLAGS) -c $< -o $@
2831

32+
global_bitonic_sort.o: global_bitonic_sort.cu bitonic_sort.cuh
33+
$(NVCC) $(NVCCFLAGS) -c $< -o $@
34+
2935
clean:
30-
rm -f *.o warp_bitonic_sort smem_bitonic_sort cpu_bitonic_sort
36+
rm -f *.o warp_bitonic_sort smem_bitonic_sort cpu_bitonic_sort global_bitonic_sort

global_bitonic_sort.cu

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
/**
2+
* Global Memory Bitonic Sort
3+
*
4+
* This uses GPU global memory to sort long arrays of ints
5+
*
6+
* Author: Andrew Boessen
7+
*/
8+
9+
#include "bitonic_sort.cuh"

#include <cstdio>
10+
11+
/**
 * Compare-and-exchange for one pair of a bitonic network step
 *
 * The pair is (x, x ^ mask). Only the call made with the lower index of the
 * pair does any work; the call made with the higher index returns without
 * touching arr, so each pair is processed once per step.
 *
 * @param x    index into arr handled by the caller
 * @param mask XOR distance to the partner index (partner = x ^ mask)
 * @param dir  ordering selector: zero leaves the smaller value at the lower
 *             index (ascending), nonzero leaves the larger value at the lower
 *             index (descending)
 * @param arr  array holding the elements; arr[x] and arr[x ^ mask] are
 *             accessed without any bounds check
 *
 */
__device__ void swap(int x, int mask, int dir, int *arr) {
  // partner position across the butterfly
  const int partner = x ^ mask;

  // the higher index of each pair sits this step out
  if (partner <= x) {
    return;
  }

  const int low_slot = arr[x];
  const int high_slot = arr[partner];

  // nonzero dir wants low_slot >= high_slot, zero dir wants low_slot <= high_slot
  const bool out_of_order = dir ? (low_slot < high_slot) : (low_slot > high_slot);
  if (out_of_order) {
    arr[x] = high_slot;
    arr[partner] = low_slot;
  }
}
44+
45+
/**
 * Global Memory Bitonic Sort
 *
 * Runs the bitonic butterfly network in place on arr, one thread per element,
 * with a block-wide barrier after every compare-and-swap step. No shared
 * memory or warp shuffles are used; every access goes through arr.
 *
 * The largest partner distance is blockDim.x / 2, so (for a power-of-two
 * blockDim.x) every exchange stays inside the blockDim.x-element segment that
 * belongs to one block. The kernel therefore sorts each segment independently
 * and does not merge segments across blocks. In the final stage the direction
 * bit equals the parity of blockIdx.x: even blocks end up ascending, odd
 * blocks end up descending.
 *
 * @param arr  Pointer to the array of integers to be sorted
 * @param size Total number of elements in the array
 *
 * @note Expects a 1D launch with a power-of-two blockDim.x and at least size
 * threads in total. Pairs whose upper index falls at or beyond size are
 * skipped so no out-of-bounds access occurs; no padding is performed, so a
 * trailing partial segment is not guaranteed to come out fully sorted. Pad
 * the input to a multiple of blockDim.x when that matters.
 *
 * @see swap() for the element comparison and swapping logic
 */
__global__ void globalBitonicSort(int *arr, int size) {
  // global element index handled by this thread
  int thread_id = threadIdx.x + blockIdx.x * blockDim.x;

  // stage i builds sorted runs of length 2^i
  for (int i = 0; (1u << i) <= blockDim.x; i++) {
    for (int j = 1; j <= i; j++) {
      // distance between the two elements of a pair in this step
      int mask = 1 << (i - j);

      // bit i of the index selects the ordering of the run being built
      int dir = thread_id & (1 << i);

      // swap() only acts for the lower index of a pair, so requiring both
      // indices to be in range is enough to keep every access inside arr
      int partner = thread_id ^ mask;
      if (thread_id < size && partner < size) {
        swap(thread_id, mask, dir, arr);
      }

      // kept outside the guard: every thread of the block must reach it
      __syncthreads();
    }
  }
}
79+
80+
/**
 * Launch globalBitonicSort over an array
 *
 * Uses 512 threads per block and enough blocks to give every element its own
 * thread. The launch is asynchronous; this function does not synchronize.
 *
 * @param arr  Pointer handed straight to the kernel, so it must be
 *             dereferenceable from device code
 * @param size Number of elements in arr; non-positive sizes are a no-op
 */
void launchBitonicSort(int *arr, int size) {
  // a zero-block grid is an invalid launch configuration, and there is
  // nothing to sort anyway
  if (size <= 0) {
    return;
  }

  const int BLOCK_SIZE = 512;
  // ceil-div written so that it cannot overflow for large size
  const int NUM_BLOCKS = (size - 1) / BLOCK_SIZE + 1;
  globalBitonicSort<<<NUM_BLOCKS, BLOCK_SIZE>>>(arr, size);

  // launches do not return a status; peek so the error is reported here but
  // is still visible to any check the caller performs afterwards
  const cudaError_t status = cudaPeekAtLastError();
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error after globalBitonicSort launch: %s\n",
            cudaGetErrorString(status));
  }
}

0 commit comments

Comments
 (0)