diff --git a/README.md b/README.md index 0e38ddb..ff4137e 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,107 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Jiyu Huang + * [LinkedIn](https://www.linkedin.com/in/jiyu-huang-0123/) +* Tested on: Windows 10, i7-6700 @ 3.41GHz 16GB, Quadro P1000 4096MB (Towne 062 Lab) -### (TODO: Your README) +### Overview -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +This project involves CUDA implementation of GPU parallel algorithms such as scan (prefix sum), stream compaction, and Radix sort. Specifically, the following are implemented: +* CPU scan (serialized, used as comparison) +* CPU stream compact (serialized, used as comparison) +* CPU stream compact with CPU scan (serialized, used as comparison) +* naive version of GPU scan +* work-efficient GPU scan +* work-efficient GPU scan with shared memory and no bank conflict +* GPU stream compaction using optimized work-efficient GPU scan +* GPU Radix sort using optimized work-efficient GPU scan + +Thrust library's version of exclusive scan is also used as comparison. + +### Performance Analysis + +The performance of various implementations of scan are illustrated below. + +![performance_chart](/img/performance_chart.png) +![performance_chart_large](/img/performance_chart_large.png) + +* As can be seen from the graph, starting from array size 2^15 (32768), GPU algorithms show performance advantages towards the CPU implementation, due to the advantages of parallelism. + +* The naive version of GPU scan performs sufficiently well until the array size reaches 2^17 (131072), after which point the performance drops significantly. 
As shown in the large array graph, it becomes the slowest implementation, even slower than CPU implementation. This is due to the fact that the naive version of GPU scan is not work efficient and in total performs the most amount of computations. + +* The work-efficient version of GPU scan initially performs worse than other implementations, but catches up and ends up reducing execution time compared to CPU implementation and naive GPU implementation. The initial slowness likely results from the extra number of kernel invocations. + +* The shared memory optimization has a significant impact on improving performance and is the fastest implementation in this project, as it should be; operating on shared memory efficiently does prove to be much faster than operating on global memory. + +* Thrust library's scan function is almost always the fastest version (except for when the array size is small, or when the array size goes from 2^17 (131072) to 2^18 (262144), where Thrust's scan function has a sudden performance drop). + +I would like to delve deeper into the execution timelines for each implementation (and understand why Thrust's scan is so fast), but since I am using the lab computer with no admin access to enable Nsight tracing, I'm temporarily unable to do that. + +### Test Results + +The following test output is generated with array size of one million (2^20). Extra Radix sort tests (one for power-of-two, one for non-power-of-two) testing the sorting correctness are also included at the end. + +``` +**************** +** SCAN TESTS ** +**************** + [ 19 18 31 41 16 17 6 12 41 4 7 45 31 ... 11 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 1.6692ms (std::chrono Measured) + [ 0 19 37 68 109 125 142 148 160 201 205 212 257 ... 
25680674 25680685 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 1.6274ms (std::chrono Measured) + passed +==== naive scan, power-of-two ==== + elapsed time: 2.83338ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 2.64646ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 1.11798ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.829952ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.434304ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.463584ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 2 1 2 1 2 2 1 2 0 3 1 1 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 3.0586ms (std::chrono Measured) + [ 2 2 1 2 1 2 2 1 2 3 1 1 1 ... 1 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 2.5556ms (std::chrono Measured) + [ 2 2 1 2 1 2 2 1 2 3 1 1 1 ... 2 1 ] + passed +==== cpu compact with scan ==== + elapsed time: 5.8896ms (std::chrono Measured) + [ 2 2 1 2 1 2 2 1 2 3 1 1 1 ... 1 1 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 1.22845ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 1.25226ms (CUDA Measured) + passed + +***************************** +** RADIX SORT TESTS ** +***************************** + [ 2 6 17 6 13 14 18 13 2 16 19 9 1 ... 
4 0 ] +==== radix sort, power-of-two ==== + passed +==== radix sort, non-power-of-two ==== + passed +``` diff --git a/img/data.xlsx b/img/data.xlsx new file mode 100644 index 0000000..0da1caa Binary files /dev/null and b/img/data.xlsx differ diff --git a/img/performance_chart.png b/img/performance_chart.png new file mode 100644 index 0000000..e3092d5 Binary files /dev/null and b/img/performance_chart.png differ diff --git a/img/performance_chart_large.png b/img/performance_chart_large.png new file mode 100644 index 0000000..304b363 Binary files /dev/null and b/img/performance_chart_large.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..6c76602 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 20; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -44,7 +44,7 @@ int main(int argc, char* argv[]) { printDesc("cpu scan, non-power-of-two"); StreamCompaction::CPU::scan(NPOT, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, b, true); + //printArray(NPOT, b, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); @@ -147,6 +147,29 @@ int main(int argc, char* argv[]) { //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + printf("\n"); + printf("*****************************\n"); + printf("** RADIX SORT TESTS **\n"); + printf("*****************************\n"); + + genArray(SIZE - 1, a, 20); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + printArray(SIZE, a, true); + + zeroArray(SIZE, b); + printDesc("radix sort, power-of-two"); + StreamCompaction::Efficient::radixSort(SIZE, b, a); + std::memcpy(c, a, SIZE * sizeof(int)); + std::sort(c, c + SIZE); + printCmpResult(SIZE, b, c); + + zeroArray(NPOT, 
b); + printDesc("radix sort, non-power-of-two"); + StreamCompaction::Efficient::radixSort(NPOT, b, a); + std::memcpy(c, a, NPOT * sizeof(int)); + std::sort(c, c + NPOT); + printCmpResult(NPOT, b, c); + system("pause"); // stop Win32 console from closing on exit delete[] a; delete[] b; diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..bad955b 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,11 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) { + return; + } + bools[index] = idata[index] == 0 ? 0 : 1; } /** @@ -32,7 +36,13 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) { + return; + } + if (bools[index] == 1) { + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..bb50e41 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -13,6 +13,8 @@ #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +const int blockSize = 512; + /** * Check for CUDA errors; print and exit if there was a problem. */ diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..cb95a42 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -12,6 +12,13 @@ namespace StreamCompaction { return timer; } + void scanHelper(int n, int* odata, const int* idata) { + odata[0] = 0; + for (int i = 1; i < n; ++i) { + odata[i] = idata[i - 1] + odata[i - 1]; + } + } + /** * CPU scan (prefix sum). 
* For performance analysis, this is supposed to be a simple for loop. @@ -19,7 +26,7 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + scanHelper(n, odata, idata); timer().endCpuTimer(); } @@ -30,9 +37,14 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int j = 0; + for (int i = 0; i < n; ++i) { + if (idata[i] != 0) { + odata[j++] = idata[i]; + } + } timer().endCpuTimer(); - return -1; + return j; } /** @@ -42,9 +54,21 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int *boolArray = new int[n]; + int *scanArray = new int[n]; + for (int i = 0; i < n; ++i) { + boolArray[i] = idata[i] == 0 ? 0 : 1; + } + scanHelper(n, scanArray, boolArray); + int num = 0; + for (int i = 0; i < n; ++i) { + if (boolArray[i] != 0) { + odata[scanArray[i]] = idata[i]; + ++num; + } + } timer().endCpuTimer(); - return -1; + return num; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..1e53fdc 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,6 +3,8 @@ #include "common.h" #include "efficient.h" +#define USING_SHARED_MEMORY 1 + namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; @@ -12,13 +14,170 @@ namespace StreamCompaction { return timer; } +#if USING_SHARED_MEMORY + + const int blockSize_sharedMemory = 128; + + const int logNumBanks = 5; + + #define CONFLICT_FREE_OFFSET(n) ((n) >> logNumBanks) + + __global__ void kernScanPerBlock(int n, int *dev_data, int *dev_blockSum) { + + if (n == 1) { + dev_data[0] = 0; + return; + } + if (threadIdx.x >= n / 2) { + return; + } + + __shared__ int temp[2 * blockSize_sharedMemory + CONFLICT_FREE_OFFSET(2 * blockSize_sharedMemory)]; + + dev_data += blockDim.x * blockIdx.x * 2; + if (n > 
blockDim.x * 2) { + n = blockDim.x * 2; + } + + int i = threadIdx.x; + int j = threadIdx.x + n / 2; + int ti = i + CONFLICT_FREE_OFFSET(i); + int tj = j + CONFLICT_FREE_OFFSET(j); + temp[ti] = dev_data[i]; + temp[tj] = dev_data[j]; + + int lastElement = 0; + if (dev_blockSum && threadIdx.x == blockDim.x - 1) { + lastElement = temp[tj]; + } + + int offset = 1; + for (int d = n >> 1; d > 0; d >>= 1) { + __syncthreads(); + if (threadIdx.x < d) { + int i = offset * (2 * threadIdx.x + 1) - 1; + int j = offset * (2 * threadIdx.x + 2) - 1; + i += CONFLICT_FREE_OFFSET(i); + j += CONFLICT_FREE_OFFSET(j); + temp[j] += temp[i]; + } + offset *= 2; + } + + if (threadIdx.x == 0) { + temp[n - 1 + CONFLICT_FREE_OFFSET(n - 1)] = 0; + } + + for (int d = 1; d < n; d *= 2) { + offset >>= 1; + __syncthreads(); + if (threadIdx.x < d) { + int i = offset * (2 * threadIdx.x + 1) - 1; + int j = offset * (2 * threadIdx.x + 2) - 1; + i += CONFLICT_FREE_OFFSET(i); + j += CONFLICT_FREE_OFFSET(j); + int t = temp[i]; + temp[i] = temp[j]; + temp[j] += t; + } + } + + __syncthreads(); + dev_data[i] = temp[ti]; + dev_data[j] = temp[tj]; + + if (dev_blockSum && threadIdx.x == blockDim.x - 1) { + dev_blockSum[blockIdx.x] = lastElement + temp[tj]; + } + } + + __global__ void kernAddPerBlock(int *dev_data, int *dev_add) { + int blockSum = dev_add[blockIdx.x]; + dev_data += blockIdx.x * blockDim.x * 2; + dev_data[threadIdx.x] += blockSum; + dev_data[threadIdx.x + blockDim.x] += blockSum; + } + + void scanHelper(int size, int *dev_data) { + + if (size > 2 * blockSize_sharedMemory) { + + int blocks = size / (2 * blockSize_sharedMemory); + int *dev_blockSum; + cudaMalloc((void**) &dev_blockSum, blocks * sizeof(int)); + + kernScanPerBlock<<>>(size, dev_data, dev_blockSum); + scanHelper(blocks, dev_blockSum); + kernAddPerBlock<<>>(dev_data, dev_blockSum); + + cudaFree(dev_blockSum); + + } else { + kernScanPerBlock<<<1, blockSize_sharedMemory>>>(size, dev_data, nullptr); + } + } + +#else + + __global__ void 
kernScanUpSweepPhase(int threads, int *dev_temp, int offset) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= threads) { + return; + } + index = (index + 1) * offset * 2 - 1; + dev_temp[index] += dev_temp[index - offset]; + } + + __global__ void kernScanDownSweepPhase(int threads, int *dev_temp, int offset) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= threads) { + return; + } + index = (index + 1) * offset * 2 - 1; + int t = dev_temp[index - offset]; + dev_temp[index - offset] = dev_temp[index]; + dev_temp[index] += t; + } + + void scanHelper(int size, int *dev_temp) { + + int threads = size / 2; + int offset = 1; + for (; threads > 0; threads /= 2, offset *= 2) { + dim3 blocks((threads + blockSize - 1) / blockSize); + kernScanUpSweepPhase<<>>(threads, dev_temp, offset); + } + + cudaMemset(dev_temp + size - 1, 0, sizeof(int)); + threads = 1; + offset = size / 2; + for (; offset > 0; offset /= 2, threads *= 2) { + dim3 blocks((threads + blockSize - 1) / blockSize); + kernScanDownSweepPhase<<>>(threads, dev_temp, offset); + } + } + +#endif + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + + int size = 1 << ilog2ceil(n); + + int *dev_temp; + cudaMalloc((void**) &dev_temp, size * sizeof(int)); + cudaMemcpy(dev_temp, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemset(dev_temp + n, 0, (size - n) * sizeof(int)); + timer().startGpuTimer(); - // TODO + scanHelper(size, dev_temp); timer().endGpuTimer(); + + cudaMemcpy(odata, dev_temp, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_temp); } /** @@ -31,10 +190,94 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. 
*/ int compact(int n, int *odata, const int *idata) { + + dim3 blocks((n + blockSize - 1) / blockSize); + int size = 1 << ilog2ceil(n); + + int *dev_idata, *dev_odata, *dev_bools, *dev_indices; + cudaMalloc((void**) &dev_idata, n * sizeof(int)); + cudaMalloc((void**) &dev_odata, n * sizeof(int)); + cudaMalloc((void**) &dev_bools, n * sizeof(int)); + cudaMalloc((void**) &dev_indices, size * sizeof(int)); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemset(dev_indices + n, 0, (size - n) * sizeof(int)); + timer().startGpuTimer(); - // TODO + StreamCompaction::Common::kernMapToBoolean<<>>(n, dev_bools, dev_idata); + cudaMemcpy(dev_indices, dev_bools, n * sizeof(int), cudaMemcpyDeviceToDevice); + scanHelper(size, dev_indices); + StreamCompaction::Common::kernScatter<<>>(n, dev_odata, dev_idata, dev_bools, dev_indices); timer().endGpuTimer(); - return -1; + + int lastBool, lastIdx; + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&lastBool, dev_bools + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&lastIdx, dev_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_bools); + cudaFree(dev_indices); + + return lastBool + lastIdx; + } + + __global__ void kernBitKNegative(int n, int *dev_idata, int *dev_bools, int bitK) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) { + return; + } + dev_bools[index] = (dev_idata[index] & (1 << bitK)) != 0 ? 
0 : 1; + } + + __global__ void kernSplit(int n, int *dev_idata, int *dev_odata, int *dev_scan, int bitK, int totalFalses) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) { + return; + } + int data = dev_idata[index]; + int scanIdx = dev_scan[index]; + if ((data & (1 << bitK)) == 0) { + dev_odata[scanIdx] = data; + } else { + dev_odata[index - scanIdx + totalFalses] = data; + } + } + + void radixSort(int n, int *odata, const int *idata) { + + dim3 blocks((n + blockSize - 1) / blockSize); + int size = 1 << ilog2ceil(n); + int maxNum = 0; + for (int i = 0; i < n; ++i) { + maxNum = std::max(maxNum, idata[i]); + } + int maxBit = ilog2ceil(maxNum); + + int *dev_data1, *dev_data2, *dev_scan; + cudaMalloc((void**) &dev_data1, n * sizeof(int)); + cudaMalloc((void**) &dev_data2, n * sizeof(int)); + cudaMalloc((void**) &dev_scan, size * sizeof(int)); + + cudaMemcpy(dev_data1, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + for (int i = 0; i <= maxBit; ++i) { + int lastBool, lastScan; + kernBitKNegative<<>>(n, dev_data1, dev_scan, i); + cudaMemcpy(&lastBool, dev_scan + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemset(dev_scan + n, 0, (size - n) * sizeof(int)); + scanHelper(size, dev_scan); + cudaMemcpy(&lastScan, dev_scan + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + kernSplit<<>>(n, dev_data1, dev_data2, dev_scan, i, lastBool + lastScan); + std::swap(dev_data1, dev_data2); + } + + cudaMemcpy(odata, dev_data1, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_data1); + cudaFree(dev_data2); + cudaFree(dev_scan); } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..a2b764f 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -9,5 +9,7 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata); int compact(int n, int *odata, const int *idata); + + void radixSort(int n, int *odata, const int *idata); } } diff --git 
a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..995bf42 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -11,15 +11,48 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + + __global__ void kernScanStep(int n, const int *dev_in, int *dev_out, int offset) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) { + return; + } + int data = dev_in[index]; + if (index >= offset) { + dev_out[index] = data + dev_in[index - offset]; + } else { + dev_out[index] = data; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + + dim3 blocks((n + blockSize - 1) / blockSize); + int steps = ilog2ceil(n); + int offset = 1; + + int *dev_temp1, *dev_temp2; + cudaMalloc((void**) &dev_temp1, n * sizeof(int)); + cudaMalloc((void**) &dev_temp2, n * sizeof(int)); + + cudaMemcpy(dev_temp1, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + for (int i = 0; i < steps; ++i) { + kernScanStep<<>>(n, dev_temp1, dev_temp2, offset); + std::swap(dev_temp1, dev_temp2); + offset *= 2; + } timer().endGpuTimer(); + + odata[0] = 0; + cudaMemcpy(odata + 1, dev_temp1, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_temp1); + cudaFree(dev_temp2); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..172b221 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,15 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + + thrust::device_vector dev_idata(idata, idata + n); + thrust::device_vector dev_odata(n); + timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dev_idata.begin(), dev_idata.end(), dev_odata.begin()); timer().endGpuTimer(); + + thrust::copy(dev_odata.begin(), dev_odata.end(), odata); } } }