Skip to content

Commit de7da54

Browse files
Merge pull request #1 from AndrewBoessen/smem
2 parents 853f696 + 724ffd0 commit de7da54

File tree

6 files changed

+205
-96
lines changed

6 files changed

+205
-96
lines changed

bitonic_sort.cuh

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#ifndef BITONIC_SORT_CUH
2+
#define BITONIC_SORT_CUH
3+
4+
#include <climits>
5+
#include <cuda_runtime.h>
6+
7+
__device__ int swap(int x, int mask, int dir);
8+
__global__ void warpBitonicSort(int *arr, int size);
9+
__global__ void smemBitonicSort(int *arr, int size);
10+
void launchBitonicSort(int *arr, int size);
11+
12+
#endif // BITONIC_SORT_CUH

main.cpp

Lines changed: 52 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1,75 +1,74 @@
1+
#include "bitonic_sort.cuh"
2+
#include <cuda_runtime.h>
13
#include <stdio.h>
24
#include <stdlib.h>
3-
#include <algorithm>
4-
#include <cuda_runtime.h>
5-
#include "warp_bitonic_sort.cuh"
65

76
// Function to check if the array is sorted
8-
bool isSorted(int* arr, int size) {
9-
for (int i = 1; i < size; i++) {
10-
if (arr[i] < arr[i-1]) return false;
11-
}
12-
return true;
7+
bool isSorted(int *arr, int size) {
8+
for (int i = 1; i < size; i++) {
9+
if (arr[i] < arr[i - 1])
10+
return false;
11+
}
12+
return true;
1313
}
1414

1515
int main() {
16-
const int SIZE = 4096; // Must be a multiple of 32 for this example
17-
const int BLOCK_SIZE = 256;
16+
const int SIZE = 1024; // Must be a multiple of 32 for this example
1817

19-
// Allocate and initialize host array
20-
int* h_arr = new int[SIZE];
21-
for (int i = 0; i < SIZE; i++) {
22-
h_arr[i] = rand() % 1000; // Random integers between 0 and 999
23-
}
18+
// Allocate and initialize host array
19+
int *h_arr = new int[SIZE];
20+
for (int i = 0; i < SIZE; i++) {
21+
h_arr[i] = rand() % 1000; // Random integers between 0 and 999
22+
}
2423

25-
// Allocate device array
26-
int* d_arr;
27-
cudaMalloc(&d_arr, SIZE * sizeof(int));
24+
// Allocate device array
25+
int *d_arr;
26+
cudaMalloc(&d_arr, SIZE * sizeof(int));
2827

29-
// Copy host array to device
30-
cudaMemcpy(d_arr, h_arr, SIZE * sizeof(int), cudaMemcpyHostToDevice);
28+
// Copy host array to device
29+
cudaMemcpy(d_arr, h_arr, SIZE * sizeof(int), cudaMemcpyHostToDevice);
3130

32-
// Create CUDA events for timing
33-
cudaEvent_t start, stop;
34-
cudaEventCreate(&start);
35-
cudaEventCreate(&stop);
31+
// Create CUDA events for timing
32+
cudaEvent_t start, stop;
33+
cudaEventCreate(&start);
34+
cudaEventCreate(&stop);
3635

37-
// Record the start event
38-
cudaEventRecord(start, nullptr);
36+
// Record the start event
37+
cudaEventRecord(start, nullptr);
3938

40-
// Launch kernel
41-
launchWarpBitonicSort(d_arr, SIZE);
39+
// Launch kernel
40+
launchBitonicSort(d_arr, SIZE);
4241

43-
// Record the stop event
44-
cudaEventRecord(stop, nullptr);
45-
cudaEventSynchronize(stop);
42+
// Record the stop event
43+
cudaEventRecord(stop, nullptr);
44+
cudaEventSynchronize(stop);
4645

47-
// Calculate elapsed time
48-
float milliseconds = 0;
49-
cudaEventElapsedTime(&milliseconds, start, stop);
46+
// Calculate elapsed time
47+
float milliseconds = 0;
48+
cudaEventElapsedTime(&milliseconds, start, stop);
5049

51-
// Copy result back to host
52-
cudaMemcpy(h_arr, d_arr, SIZE * sizeof(int), cudaMemcpyDeviceToHost);
50+
// Copy result back to host
51+
cudaMemcpy(h_arr, d_arr, SIZE * sizeof(int), cudaMemcpyDeviceToHost);
5352

54-
// Check if sorted
55-
bool sorted = isSorted(h_arr, SIZE);
56-
printf("Array is %s\n", sorted ? "sorted" : "not sorted");
53+
// Check if sorted
54+
bool sorted = isSorted(h_arr, SIZE);
55+
printf("Array is %s\n", sorted ? "sorted" : "not sorted");
5756

58-
// Print first few elements to verify
59-
printf("First 32 elements: ");
60-
for (int i = 0; i < 32; i++) {
61-
printf("%d ", h_arr[i]);
62-
}
63-
printf("\n");
57+
// Print first few elements to verify
58+
printf("First 32 elements: ");
59+
for (int i = 0; i < 32; i++) {
60+
printf("%d ", h_arr[i]);
61+
}
62+
printf("\n");
6463

65-
// Print timing information
66-
printf("Kernel execution time: %f milliseconds\n", milliseconds);
64+
// Print timing information
65+
printf("Kernel execution time: %f milliseconds\n", milliseconds);
6766

68-
// Clean up
69-
delete[] h_arr;
70-
cudaFree(d_arr);
71-
cudaEventDestroy(start);
72-
cudaEventDestroy(stop);
67+
// Clean up
68+
delete[] h_arr;
69+
cudaFree(d_arr);
70+
cudaEventDestroy(start);
71+
cudaEventDestroy(stop);
7372

74-
return 0;
73+
return 0;
7574
}

makefile

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,30 @@
11
CXX = g++
22
NVCC = nvcc
33
CXXFLAGS = -std=c++11 -O2
4-
NVCCFLAGS = -O2
4+
NVCCFLAGS = -O2 -G -g
55
CUDA_PATH = /opt/cuda
66
INCLUDES = -I$(CUDA_PATH)/include
77
LDFLAGS = -L$(CUDA_PATH)/lib64 -lcudart
88

9-
all: warp_bitonic_sort cpu_bitonic_sort
9+
all: cpu_bitonic_sort warp_bitonic_sort smem_bitonic_sort
1010

1111
warp_bitonic_sort: main.o warp_bitonic_sort.o
1212
$(CXX) $^ -o $@ $(LDFLAGS)
1313

14+
smem_bitonic_sort: main.o smem_bitonic_sort.o
15+
$(CXX) $^ -o $@ $(LDFLAGS)
16+
1417
cpu_bitonic_sort: cpu_bitonic_sort.cpp
1518
$(CXX) $^ -o $@
1619

17-
main.o: main.cpp warp_bitonic_sort.cuh
20+
main.o: main.cpp bitonic_sort.cuh
1821
$(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@
1922

20-
warp_bitonic_sort.o: warp_bitonic_sort.cu warp_bitonic_sort.cuh
23+
warp_bitonic_sort.o: warp_bitonic_sort.cu bitonic_sort.cuh
24+
$(NVCC) $(NVCCFLAGS) -c $< -o $@
25+
26+
smem_bitonic_sort.o: smem_bitonic_sort.cu bitonic_sort.cuh
2127
$(NVCC) $(NVCCFLAGS) -c $< -o $@
2228

2329
clean:
24-
rm -f *.o warp_bitonic_sort cpu_bitonic_sort
30+
rm -f *.o warp_bitonic_sort smem_bitonic_sort cpu_bitonic_sort

smem_bitonic_sort.cu

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
/**
2+
* SMEM Bitoic Sort
3+
*
4+
* This uses shared memory to sort arrays. This uses warp shffle operator to
5+
* compare and swap
6+
*
7+
* Author: Andrew Boessen
8+
*/
9+
10+
#include "bitonic_sort.cuh"
11+
12+
/**
13+
* Swap
14+
*
15+
* This is used for swapping elements in bitonic sorting
16+
*
17+
* @param x caller line id's value
18+
* @param mask source lane id = caller line id ^ mask
19+
* @param dir direction to swap
20+
*
21+
* @return min or max of source and caller
22+
*/
23+
__device__ int swap(int x, int mask, int dir) {
24+
// get correspondin element to x in butterfly diagram
25+
int y = __shfl_xor_sync(0xffffffff, x, mask);
26+
// return smaller or larger value based on direction of swap
27+
return x < y == dir ? y : x;
28+
}
29+
30+
/**
31+
* SMEM Bitonic Sort
32+
*
33+
* This function performs a bitonic sort on integers whithin a thread blocks of
34+
* 1024 threads. This stores itermediate products in shared memory for better
35+
* efficiency.
36+
*
37+
* The function uses the butterfly network pattern of bitonic sort, leveraging
38+
* CUDA's warp-level primitives for efficient sorting within a warp (32
39+
* threads). The swaps are tiled into warps of 32 threads. This is able to do
40+
* swaps without allocating extra memory for temporary variable.
41+
*
42+
* @param arr Pointer to the array of integers to be sorted
43+
* @param size Total number of elements in the array
44+
*
45+
* @note This function assumes that the number of threads per block is at least
46+
* equal to the warp size. Elements beyond the array size are padded with
47+
* INT_MAX.
48+
*
49+
* @see swap() for the element comparison and swapping logic
50+
*/
51+
__global__ void smemBitonicSort(int *arr, int size) {
52+
// shared memory for block of 1024 threads
53+
extern __shared__ int smem[];
54+
55+
// local thread id in block
56+
int thread_id = threadIdx.x;
57+
58+
// seed shared memory array with value from global array
59+
// pad overflow threads with INT_MAX
60+
smem[thread_id] = thread_id < size ? arr[thread_id] : INT_MAX;
61+
__syncthreads();
62+
63+
// make bitonic sequence and sort
64+
for (int i = 0; (1 << i) <= size; i++) {
65+
for (int j = 0; j <= i; j++) {
66+
// distance between caller and source lanes
67+
int offset = 1 << (i - j - 1);
68+
// direction to swap caller and source lanes
69+
int dir;
70+
// only alternate direction when forming bitonic sequence
71+
if (1 << i == blockDim.x) {
72+
dir = (thread_id >> (i - j)) & 1;
73+
} else {
74+
dir = (thread_id >> (i + 1)) & 1 ^ (thread_id >> (i - j)) & 1;
75+
}
76+
if (1 << i <= warpSize) {
77+
smem[thread_id] = swap(smem[thread_id], offset, dir);
78+
} else {
79+
__syncthreads();
80+
int partner_val = smem[thread_id ^ offset];
81+
int val = smem[thread_id];
82+
// compare and swap elements
83+
smem[thread_id] = val < partner_val == dir ? val : partner_val;
84+
smem[thread_id ^ offset] = val < partner_val == dir ? partner_val : val;
85+
}
86+
}
87+
}
88+
__syncthreads();
89+
90+
// update value in array with sorted value
91+
if (thread_id < size) {
92+
arr[thread_id] = smem[thread_id];
93+
}
94+
}
95+
96+
void launchBitonicSort(int *arr, int size) {
97+
const int BLOCK_SIZE = 1024;
98+
smemBitonicSort<<<size / BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE * sizeof(int)>>>(
99+
arr, size);
100+
}

warp_bitonic_sort.cu

Lines changed: 30 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,24 @@
11
/**
2-
* Warp Bitoic Sort
3-
*
4-
* This uses warp shuffle to sort integers in a warp with bitonic sort
5-
*
6-
* Author: Andrew Boessen
7-
*/
2+
* Warp Bitoic Sort
3+
*
4+
* This uses warp shuffle to sort integers in a warp with bitonic sort
5+
*
6+
* Author: Andrew Boessen
7+
*/
88

9-
#include "warp_bitonic_sort.cuh"
9+
#include "bitonic_sort.cuh"
1010

1111
/**
12-
* Swap
13-
*
14-
* This is used for swapping elements in bitonic sorting
15-
*
16-
* @param x caller line id's value
17-
* @param mask source lane id = caller line id ^ mask
18-
* @param dir direction to swap
19-
*
20-
* @return min or max of source and caller
21-
*/
12+
* Swap
13+
*
14+
* This is used for swapping elements in bitonic sorting
15+
*
16+
* @param x caller line id's value
17+
* @param mask source lane id = caller line id ^ mask
18+
* @param dir direction to swap
19+
*
20+
* @return min or max of source and caller
21+
*/
2222
__device__ int swap(int x, int mask, int dir) {
2323
// get correspondin element to x in butterfly diagram
2424
int y = __shfl_xor_sync(0xffffffff, x, mask);
@@ -29,17 +29,20 @@ __device__ int swap(int x, int mask, int dir) {
2929
/**
3030
* Warp Bitonic Sort
3131
*
32-
* This function performs a bitonic sort on integers within a warp using warp shuffle operations.
33-
* It sorts a portion of the input array corresponding to the calling thread's warp.
32+
* This function performs a bitonic sort on integers within a warp using warp
33+
* shuffle operations. It sorts a portion of the input array corresponding to
34+
* the calling thread's warp.
3435
*
35-
* The function uses the butterfly network pattern of bitonic sort, leveraging CUDA's warp-level
36-
* primitives for efficient sorting within a warp (32 threads).
36+
* The function uses the butterfly network pattern of bitonic sort, leveraging
37+
* CUDA's warp-level primitives for efficient sorting within a warp (32
38+
* threads).
3739
*
3840
* @param arr Pointer to the array of integers to be sorted
3941
* @param size Total number of elements in the array
4042
*
41-
* @note This function assumes that the number of threads per block is at least equal to the warp size.
42-
* Elements beyond the array size are padded with INT_MAX.
43+
* @note This function assumes that the number of threads per block is at least
44+
* equal to the warp size. Elements beyond the array size are padded with
45+
* INT_MAX.
4346
*
4447
* @see swap() for the element comparison and swapping logic
4548
*/
@@ -54,7 +57,7 @@ __global__ void warpBitonicSort(int *arr, int size) {
5457
for (int i = 0; (1 << i) <= warpSize; i++) {
5558
for (int j = 0; j <= i; j++) {
5659
// distance between caller and source lanes
57-
int mask = 1 << (i-j);
60+
int mask = 1 << (i - j);
5861
// direction to swap caller and source lanes
5962
int dir;
6063
// only alternate direction when forming bitonic sequence
@@ -73,7 +76,7 @@ __global__ void warpBitonicSort(int *arr, int size) {
7376
}
7477
}
7578

76-
void launchWarpBitonicSort(int *arr, int size) {
77-
const int BLOCK_SIZE = 256;
78-
warpBitonicSort<<<size/BLOCK_SIZE, BLOCK_SIZE>>>(arr, size);
79+
void launchBitonicSort(int *arr, int size) {
80+
const int BLOCK_SIZE = 256;
81+
warpBitonicSort<<<size / BLOCK_SIZE, BLOCK_SIZE>>>(arr, size);
7982
}

warp_bitonic_sort.cuh

Lines changed: 0 additions & 11 deletions
This file was deleted.

0 commit comments

Comments
 (0)