use warp shuffle in smem sorting

AndrewBoessen · AndrewBoessen · commit 724ffd0af8d1 · 2025-01-19T16:13:26.000-05:00
diff --git a/main.cpp b/main.cpp
@@ -13,8 +13,7 @@ bool isSorted(int *arr, int size) {
 }
 
 int main() {
-  const int SIZE = 4096; // Must be a multiple of 32 for this example
-  const int BLOCK_SIZE = 256;
+  const int SIZE = 1024; // Must be a multiple of 32 for this example
 
   // Allocate and initialize host array
   int *h_arr = new int[SIZE];
diff --git a/smem_bitonic_sort.cu b/smem_bitonic_sort.cu
@@ -8,7 +8,6 @@
  */
 
 #include "bitonic_sort.cuh"
-#include <stdio.h>
 
 /**
  * Swap
@@ -51,41 +50,42 @@ __device__ int swap(int x, int mask, int dir) {
  */
 __global__ void smemBitonicSort(int *arr, int size) {
   // shared memory for block of 1024 threads
-  __shared__ int smem[1 << 10];
+  extern __shared__ int smem[];
 
   // local thread id in block
   int thread_id = threadIdx.x;
 
   // seed shared memory array with value from global array
   // pad overflow threads with INT_MAX
   smem[thread_id] = thread_id < size ? arr[thread_id] : INT_MAX;
+  __syncthreads();
 
   // make bitonic sequence and sort
-  for (int i = 0; (1 << i) <= blockDim.x; i++) {
+  for (int i = 0; (1 << i) <= size; i++) {
     for (int j = 0; j <= i; j++) {
       // distance between caller and source lanes
-      int offset = 1 << (i - j);
-      // number of elements in each sorted subset
-      int sort_size = offset << 1;
-      // id into smem array
-      int arr_id =
-          (thread_id / sort_size * sort_size) + (thread_id % sort_size / 2) ^
-          (thread_id % 2 * offset); // apply xor to odd threads
-      printf("thread %d arr %d\n", thread_id, arr_id);
+      int offset = 1 << (i - j - 1);
       // direction to swap caller and source lanes
       int dir;
       // only alternate direction when forming bitonic sequence
       if (1 << i == blockDim.x) {
-        dir = (arr_id >> (i - j)) & 1;
+        dir = (thread_id >> (i - j)) & 1;
       } else {
-        dir = (arr_id >> (i + 1)) & 1 ^ (arr_id >> (i - j)) & 1;
+        dir = (thread_id >> (i + 1)) & 1 ^ (thread_id >> (i - j)) & 1;
+      }
+      if (1 << i <= warpSize) {
+        smem[thread_id] = swap(smem[thread_id], offset, dir);
+      } else {
+        __syncthreads();
+        int partner_val = smem[thread_id ^ offset];
+        int val = smem[thread_id];
+        // compare and swap elements
+        smem[thread_id] = val < partner_val == dir ? val : partner_val;
+        smem[thread_id ^ offset] = val < partner_val == dir ? partner_val : val;
       }
-      // elements to compare and swap are directly next to eachother in warp
-      smem[arr_id] = swap(smem[arr_id], 1, dir);
-      // wait for all warps to finish swap before going to next layer
-      __syncthreads();
     }
   }
+  __syncthreads();
 
   // update value in array with sorted value
   if (thread_id < size) {
@@ -95,6 +95,6 @@ __global__ void smemBitonicSort(int *arr, int size) {
 
 void launchBitonicSort(int *arr, int size) {
   const int BLOCK_SIZE = 1024;
-  smemBitonicSort<<<(size + (BLOCK_SIZE - 1)) / BLOCK_SIZE, BLOCK_SIZE>>>(arr,
-                                                                          size);
+  smemBitonicSort<<<size / BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE * sizeof(int)>>>(
+      arr, size);
 }

Original file line number	Diff line number	Diff line change
`@@ -13,8 +13,7 @@ bool isSorted(int *arr, int size) {`
`13`	`13`	`}`
`14`	`14`
`15`	`15`	`int main() {`
`16`		`- const int SIZE = 4096; // Must be a multiple of 32 for this example`
`17`		`- const int BLOCK_SIZE = 256;`
	`16`	`+ const int SIZE = 1024; // Must be a multiple of 32 for this example`
`18`	`17`
`19`	`18`	`// Allocate and initialize host array`
`20`	`19`	`int *h_arr = new int[SIZE];`