Merge pull request #1 from AndrewBoessen/smem

AndrewBoessen · web-flow · commit de7da541f5a4 · 2025-01-19T16:45:30.000-05:00
diff --git a/bitonic_sort.cuh b/bitonic_sort.cuh
@@ -0,0 +1,12 @@
+#ifndef BITONIC_SORT_CUH
+#define BITONIC_SORT_CUH
+
+#include <climits>
+#include <cuda_runtime.h>
+
+__device__ int swap(int x, int mask, int dir);
+__global__ void warpBitonicSort(int *arr, int size);
+__global__ void smemBitonicSort(int *arr, int size);
+void launchBitonicSort(int *arr, int size);
+
+#endif // BITONIC_SORT_CUH
diff --git a/main.cpp b/main.cpp
@@ -1,75 +1,74 @@
+#include "bitonic_sort.cuh"
+#include <cuda_runtime.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <algorithm>
-#include <cuda_runtime.h>
-#include "warp_bitonic_sort.cuh"
 
 // Function to check if the array is sorted
-bool isSorted(int* arr, int size) {
-    for (int i = 1; i < size; i++) {
-        if (arr[i] < arr[i-1]) return false;
-    }
-    return true;
+bool isSorted(int *arr, int size) {
+  for (int i = 1; i < size; i++) {
+    if (arr[i] < arr[i - 1])
+      return false;
+  }
+  return true;
 }
 
 int main() {
-    const int SIZE = 4096; // Must be a multiple of 32 for this example
-    const int BLOCK_SIZE = 256;
+  const int SIZE = 1024; // Must be a multiple of 32 for this example
 
-    // Allocate and initialize host array
-    int* h_arr = new int[SIZE];
-    for (int i = 0; i < SIZE; i++) {
-        h_arr[i] = rand() % 1000; // Random integers between 0 and 999
-    }
+  // Allocate and initialize host array
+  int *h_arr = new int[SIZE];
+  for (int i = 0; i < SIZE; i++) {
+    h_arr[i] = rand() % 1000; // Random integers between 0 and 999
+  }
 
-    // Allocate device array
-    int* d_arr;
-    cudaMalloc(&d_arr, SIZE * sizeof(int));
+  // Allocate device array
+  int *d_arr;
+  cudaMalloc(&d_arr, SIZE * sizeof(int));
 
-    // Copy host array to device
-    cudaMemcpy(d_arr, h_arr, SIZE * sizeof(int), cudaMemcpyHostToDevice);
+  // Copy host array to device
+  cudaMemcpy(d_arr, h_arr, SIZE * sizeof(int), cudaMemcpyHostToDevice);
 
-    // Create CUDA events for timing
-    cudaEvent_t start, stop;
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
+  // Create CUDA events for timing
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
 
-    // Record the start event
-    cudaEventRecord(start, nullptr);
+  // Record the start event
+  cudaEventRecord(start, nullptr);
 
-    // Launch kernel
-    launchWarpBitonicSort(d_arr, SIZE);
+  // Launch kernel
+  launchBitonicSort(d_arr, SIZE);
 
-    // Record the stop event
-    cudaEventRecord(stop, nullptr);
-    cudaEventSynchronize(stop);
+  // Record the stop event
+  cudaEventRecord(stop, nullptr);
+  cudaEventSynchronize(stop);
 
-    // Calculate elapsed time
-    float milliseconds = 0;
-    cudaEventElapsedTime(&milliseconds, start, stop);
+  // Calculate elapsed time
+  float milliseconds = 0;
+  cudaEventElapsedTime(&milliseconds, start, stop);
 
-    // Copy result back to host
-    cudaMemcpy(h_arr, d_arr, SIZE * sizeof(int), cudaMemcpyDeviceToHost);
+  // Copy result back to host
+  cudaMemcpy(h_arr, d_arr, SIZE * sizeof(int), cudaMemcpyDeviceToHost);
 
-    // Check if sorted
-    bool sorted = isSorted(h_arr, SIZE);
-    printf("Array is %s\n", sorted ? "sorted" : "not sorted");
+  // Check if sorted
+  bool sorted = isSorted(h_arr, SIZE);
+  printf("Array is %s\n", sorted ? "sorted" : "not sorted");
 
-    // Print first few elements to verify
-    printf("First 32 elements: ");
-    for (int i = 0; i < 32; i++) {
-        printf("%d ", h_arr[i]);
-    }
-    printf("\n");
+  // Print first few elements to verify
+  printf("First 32 elements: ");
+  for (int i = 0; i < 32; i++) {
+    printf("%d ", h_arr[i]);
+  }
+  printf("\n");
 
-    // Print timing information
-    printf("Kernel execution time: %f milliseconds\n", milliseconds);
+  // Print timing information
+  printf("Kernel execution time: %f milliseconds\n", milliseconds);
 
-    // Clean up
-    delete[] h_arr;
-    cudaFree(d_arr);
-    cudaEventDestroy(start);
-    cudaEventDestroy(stop);
+  // Clean up
+  delete[] h_arr;
+  cudaFree(d_arr);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
 
-    return 0;
+  return 0;
 }
diff --git a/makefile b/makefile
@@ -1,24 +1,30 @@
 CXX = g++
 NVCC = nvcc
 CXXFLAGS = -std=c++11 -O2
-NVCCFLAGS = -O2
+NVCCFLAGS = -O2 -G -g
 CUDA_PATH = /opt/cuda
 INCLUDES = -I$(CUDA_PATH)/include
 LDFLAGS = -L$(CUDA_PATH)/lib64 -lcudart
 
-all: warp_bitonic_sort cpu_bitonic_sort
+all: cpu_bitonic_sort warp_bitonic_sort smem_bitonic_sort
 
 warp_bitonic_sort: main.o warp_bitonic_sort.o
 	$(CXX) $^ -o $@ $(LDFLAGS)
 
+smem_bitonic_sort: main.o smem_bitonic_sort.o
+	$(CXX) $^ -o $@ $(LDFLAGS)
+
 cpu_bitonic_sort: cpu_bitonic_sort.cpp
 	$(CXX) $^ -o $@
 
-main.o: main.cpp warp_bitonic_sort.cuh
+main.o: main.cpp bitonic_sort.cuh
 	$(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@
 
-warp_bitonic_sort.o: warp_bitonic_sort.cu warp_bitonic_sort.cuh
+warp_bitonic_sort.o: warp_bitonic_sort.cu bitonic_sort.cuh
+	$(NVCC) $(NVCCFLAGS) -c $< -o $@
+
+smem_bitonic_sort.o: smem_bitonic_sort.cu bitonic_sort.cuh
 	$(NVCC) $(NVCCFLAGS) -c $< -o $@
 
 clean:
-	rm -f *.o warp_bitonic_sort cpu_bitonic_sort
+	rm -f *.o warp_bitonic_sort smem_bitonic_sort cpu_bitonic_sort
diff --git a/smem_bitonic_sort.cu b/smem_bitonic_sort.cu
@@ -0,0 +1,100 @@
+/**
+ * SMEM Bitoic Sort
+ *
+ * This uses shared memory to sort arrays. This uses warp shffle operator to
+ * compare and swap
+ *
+ * Author: Andrew Boessen
+ */
+
+#include "bitonic_sort.cuh"
+
+/**
+ * Swap
+ *
+ * This is used for swapping elements in bitonic sorting
+ *
+ * @param x caller line id's value
+ * @param mask source lane id = caller line id ^ mask
+ * @param dir direction to swap
+ *
+ * @return min or max of source and caller
+ */
+__device__ int swap(int x, int mask, int dir) {
+  // get correspondin element to x in butterfly diagram
+  int y = __shfl_xor_sync(0xffffffff, x, mask);
+  // return smaller or larger value based on direction of swap
+  return x < y == dir ? y : x;
+}
+
+/**
+ * SMEM Bitonic Sort
+ *
+ * This function performs a bitonic sort on integers whithin a thread blocks of
+ * 1024 threads. This stores itermediate products in shared memory for better
+ * efficiency.
+ *
+ * The function uses the butterfly network pattern of bitonic sort, leveraging
+ * CUDA's warp-level primitives for efficient sorting within a warp (32
+ * threads). The swaps are tiled into warps of 32 threads. This is able to do
+ * swaps without allocating extra memory for temporary variable.
+ *
+ * @param arr Pointer to the array of integers to be sorted
+ * @param size Total number of elements in the array
+ *
+ * @note This function assumes that the number of threads per block is at least
+ * equal to the warp size. Elements beyond the array size are padded with
+ * INT_MAX.
+ *
+ * @see swap() for the element comparison and swapping logic
+ */
+__global__ void smemBitonicSort(int *arr, int size) {
+  // shared memory for block of 1024 threads
+  extern __shared__ int smem[];
+
+  // local thread id in block
+  int thread_id = threadIdx.x;
+
+  // seed shared memory array with value from global array
+  // pad overflow threads with INT_MAX
+  smem[thread_id] = thread_id < size ? arr[thread_id] : INT_MAX;
+  __syncthreads();
+
+  // make bitonic sequence and sort
+  for (int i = 0; (1 << i) <= size; i++) {
+    for (int j = 0; j <= i; j++) {
+      // distance between caller and source lanes
+      int offset = 1 << (i - j - 1);
+      // direction to swap caller and source lanes
+      int dir;
+      // only alternate direction when forming bitonic sequence
+      if (1 << i == blockDim.x) {
+        dir = (thread_id >> (i - j)) & 1;
+      } else {
+        dir = (thread_id >> (i + 1)) & 1 ^ (thread_id >> (i - j)) & 1;
+      }
+      if (1 << i <= warpSize) {
+        smem[thread_id] = swap(smem[thread_id], offset, dir);
+      } else {
+        __syncthreads();
+        int partner_val = smem[thread_id ^ offset];
+        int val = smem[thread_id];
+        // compare and swap elements
+        smem[thread_id] = val < partner_val == dir ? val : partner_val;
+        smem[thread_id ^ offset] = val < partner_val == dir ? partner_val : val;
+      }
+    }
+  }
+  __syncthreads();
+
+  // update value in array with sorted value
+  if (thread_id < size) {
+    arr[thread_id] = smem[thread_id];
+  }
+}
+
+void launchBitonicSort(int *arr, int size) {
+  const int BLOCK_SIZE = 1024;
+  smemBitonicSort<<<size / BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE * sizeof(int)>>>(
+      arr, size);
+}
diff --git a/warp_bitonic_sort.cu b/warp_bitonic_sort.cu
@@ -1,24 +1,24 @@
 /**
-* Warp Bitoic Sort
-*
-* This uses warp shuffle to sort integers in a warp with bitonic sort
-*
-* Author: Andrew Boessen
-*/
+ * Warp Bitoic Sort
+ *
+ * This uses warp shuffle to sort integers in a warp with bitonic sort
+ *
+ * Author: Andrew Boessen
+ */
 
-#include "warp_bitonic_sort.cuh"
+#include "bitonic_sort.cuh"
 
 /**
-* Swap
-* 
-* This is used for swapping elements in bitonic sorting
-*
-* @param x caller line id's value
-* @param mask source lane id = caller line id ^ mask
-* @param dir direction to swap
-*
-* @return min or max of source and caller
-*/
+ * Swap
+ *
+ * This is used for swapping elements in bitonic sorting
+ *
+ * @param x caller line id's value
+ * @param mask source lane id = caller line id ^ mask
+ * @param dir direction to swap
+ *
+ * @return min or max of source and caller
+ */
 __device__ int swap(int x, int mask, int dir) {
   // get correspondin element to x in butterfly diagram
   int y = __shfl_xor_sync(0xffffffff, x, mask);
@@ -29,17 +29,20 @@ __device__ int swap(int x, int mask, int dir) {
 /**
  * Warp Bitonic Sort
  *
- * This function performs a bitonic sort on integers within a warp using warp shuffle operations.
- * It sorts a portion of the input array corresponding to the calling thread's warp.
+ * This function performs a bitonic sort on integers within a warp using warp
+ * shuffle operations. It sorts a portion of the input array corresponding to
+ * the calling thread's warp.
  *
- * The function uses the butterfly network pattern of bitonic sort, leveraging CUDA's warp-level
- * primitives for efficient sorting within a warp (32 threads).
+ * The function uses the butterfly network pattern of bitonic sort, leveraging
+ * CUDA's warp-level primitives for efficient sorting within a warp (32
+ * threads).
  *
  * @param arr Pointer to the array of integers to be sorted
  * @param size Total number of elements in the array
  *
- * @note This function assumes that the number of threads per block is at least equal to the warp size.
- *       Elements beyond the array size are padded with INT_MAX.
+ * @note This function assumes that the number of threads per block is at least
+ * equal to the warp size. Elements beyond the array size are padded with
+ * INT_MAX.
  *
  * @see swap() for the element comparison and swapping logic
  */
@@ -54,7 +57,7 @@ __global__ void warpBitonicSort(int *arr, int size) {
   for (int i = 0; (1 << i) <= warpSize; i++) {
     for (int j = 0; j <= i; j++) {
       // distance between caller and source lanes
-      int mask = 1 << (i-j);
+      int mask = 1 << (i - j);
       // direction to swap caller and source lanes
       int dir;
       // only alternate direction when forming bitonic sequence
@@ -73,7 +76,7 @@ __global__ void warpBitonicSort(int *arr, int size) {
   }
 }
 
-void launchWarpBitonicSort(int *arr, int size) {
-    const int BLOCK_SIZE = 256;
-    warpBitonicSort<<<size/BLOCK_SIZE, BLOCK_SIZE>>>(arr, size);
+void launchBitonicSort(int *arr, int size) {
+  const int BLOCK_SIZE = 256;
+  warpBitonicSort<<<size / BLOCK_SIZE, BLOCK_SIZE>>>(arr, size);
 }
diff --git a/warp_bitonic_sort.cuh b/warp_bitonic_sort.cuh