diff --git a/README.md b/README.md index 0e38ddb..5d004e0 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,141 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +Jiajun Li -### (TODO: Your README) +LinkedIn: [link](https://www.linkedin.com/in/jiajun-li-5063a4217/) + +Tested on: Windows 10, i7-12700 @ 2.10GHz, 32GB, RTX3080 12GB + +CUDA Compute Capability: 8.6 + +## **Overview** + +In this project, different scan methods and some scan applications are implemented: + +CPU side: + 1. CPU naive scan + 2. CPU compaction using CPU naive scan + 3. CPU naive radix sort using CPU naive scan + +GPU side: + 1. GPU naive scan + 2. GPU efficient scan with threads reduction + 3. GPU efficient stream compaction using GPU efficient scan + +Full explanations of each method can be found in [GPU Gem 3 Ch 39](https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda). + +The project also includes thrust::scan as a benchmark in performance analysis. + +## **Project Setup** + +This project includes the following changes from the original project: + +1. Add ```radix_sort.h``` and ```radix_sort.cu``` to ```stream_compaction/CMakeLists``` + +2. Add radix sort test code in ```src\main.cpp``` + +## **Output example** + +``` +**************** +** SCAN TESTS ** +**************** + [ 45 11 41 13 34 22 1 22 6 5 3 5 21 ... 9 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.0519ms (std::chrono Measured) + [ 0 45 56 97 110 144 166 167 189 195 200 203 208 ... 801487 801496 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.0519ms (std::chrono Measured) + [ 0 45 56 97 110 144 166 167 189 195 200 203 208 ... 
801379 801414 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.453632ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.403456ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.31744ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.08192ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.044032ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.045056ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 1 0 3 0 3 0 2 3 1 3 3 1 1 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.0657ms (std::chrono Measured) + [ 1 3 3 2 3 1 3 3 1 1 2 3 1 ... 3 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.0648ms (std::chrono Measured) + [ 1 3 3 2 3 1 3 3 1 1 2 3 1 ... 3 3 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.1507ms (std::chrono Measured) + [ 1 3 3 2 3 1 3 3 1 1 2 3 1 ... 3 1 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.17408ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.171008ms (CUDA Measured) + passed + +***************************** +** RADIX SORT TESTS ** +***************************** +==== cpu radix sort, power-of-two ==== + elapsed time: 0.4766ms (std::chrono Measured) + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 49 49 ] +==== cpu radix sort, non-power-of-two ==== + elapsed time: 0.4721ms (std::chrono Measured) + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 49 49 ] +``` + + +## **Performance Analysis** + +In all the following analysis, less time is better. + +### **Scan** + +![](img/Scan.png) + +* Work efficient scan outperforms the CPU scan when the number of elements is greater than 2^16. 
+ +* Work efficient scan roughly aligns with thrust scan when the number of elements is greater than 2^18. + +* Naive GPU scan is always slower than naive CPU scan. This is because the GPU method accesses data through global memory, which is considerably costly. + + +### **Stream Compaction** + +![](img/StreamCompaction.png) + +* Using scan will make it slower in the CPU implementation because scan introduces more iterations over the array. + +* Work efficient scan starts to outperform the CPU scan when the number of elements is greater than 2^18. + +* For work efficient scan, it performs slightly better when the number of elements is not a power of two. + +### **Radix Sort** + +![](img/RadixSort.png) + +### **Future Improvement** + +1. Implement parallel radix sort and compare it with naive radix sort. + +2. Make GPU scans even more efficient by using shared memory. -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) diff --git a/img/RadixSort.png b/img/RadixSort.png new file mode 100644 index 0000000..5f19786 Binary files /dev/null and b/img/RadixSort.png differ diff --git a/img/Scan.png b/img/Scan.png new file mode 100644 index 0000000..eb22bb6 Binary files /dev/null and b/img/Scan.png differ diff --git a/img/StreamCompaction.png b/img/StreamCompaction.png new file mode 100644 index 0000000..f24750f Binary files /dev/null and b/img/StreamCompaction.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..b164274 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,9 +11,10 @@ #include #include #include +#include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 20; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; int *c = new int[SIZE]; @@ -28,6 +29,7 @@ int main(int argc, char* argv[]) { printf("****************\n"); genArray(SIZE - 1, a, 50); // Leave a 0 at the end to 
test that edge case + a[SIZE - 1] = 0; printArray(SIZE, a, true); @@ -67,18 +69,32 @@ int main(int argc, char* argv[]) { //printArray(SIZE, c, true); printCmpResult(NPOT, b, c); + zeroArray(SIZE, c); + printDesc("naive scan with shared memory, power-of-two"); + StreamCompaction::Naive::scan2(SIZE, c, a); + printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, false); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("naive scan with shared memory, non-power-of-two"); + StreamCompaction::Naive::scan2(NPOT, c, a); + printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(NPOT, b, c); + zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + //printArray(SIZE, c, false); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + //printArray(NPOT, c, false); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); @@ -147,6 +163,42 @@ int main(int argc, char* argv[]) { //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + + printf("\n"); + printf("*****************************\n"); + printf("** RADIX SORT TESTS **\n"); + printf("*****************************\n"); + + genArray(SIZE - 1, a, 50); + + zeroArray(SIZE, c); + printDesc("cpu radix sort, power-of-two"); + StreamCompaction::Radix_Sort::radix_sort_cpu(SIZE, 6, c, a); + count = SIZE; + 
printElapsedTime(StreamCompaction::Radix_Sort::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(count, c, true); + + zeroArray(SIZE, c); + printDesc("cpu radix sort, non-power-of-two"); + count = NPOT; + StreamCompaction::Radix_Sort::radix_sort_cpu(NPOT, 6, c, a); + printElapsedTime(StreamCompaction::Radix_Sort::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(count, c, true); + + zeroArray(SIZE, c); + printDesc("parallel radix sort, power-of-two"); + StreamCompaction::Radix_Sort::radix_sort_parallel(SIZE, 6, c, a); + count = SIZE; + printElapsedTime(StreamCompaction::Radix_Sort::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(count, c, true); + + zeroArray(SIZE, c); + printDesc("parallel radix sort, non-power-of-two"); + count = NPOT; + StreamCompaction::Radix_Sort::radix_sort_parallel(NPOT, 6, c, a); + printElapsedTime(StreamCompaction::Radix_Sort::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(count, c, true); + system("pause"); // stop Win32 console from closing on exit delete[] a; delete[] b; diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index 567795b..7b34ba9 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -4,6 +4,7 @@ set(headers "naive.h" "efficient.h" "thrust.h" + "radix_sort.h" ) set(sources @@ -12,6 +13,7 @@ set(sources "naive.cu" "efficient.cu" "thrust.cu" + "radix_sort.cu" ) list(SORT headers) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..86597ae 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,4 +1,5 @@ #include "common.h" +#include void checkCUDAErrorFn(const char *msg, const char *file, int line) { cudaError_t err = cudaGetLastError(); @@ -24,6 +25,12 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // 
TODO + int k = (blockDim.x * blockIdx.x) + threadIdx.x; + + if (k >= n) + return; + + bools[k] = idata[k] > 0 ? 1 : 0; } /** @@ -33,6 +40,16 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + + int k = (blockDim.x * blockIdx.x) + threadIdx.x; + + if (k >= n) + return; + + if (bools[k] > 0) + { + odata[indices[k]] = idata[k]; + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..4125392 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -19,10 +19,22 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + doScan(n, odata, idata); + timer().endCpuTimer(); } + void doScan(int n, int* odata, const int* idata) + { + // Exclusive scan + odata[0] = 0; + for (int i = 1; i < n; ++i) + { + odata[i] = idata[i - 1] + odata[i - 1]; + } + } + /** * CPU stream compaction without using the scan function. * @@ -31,8 +43,18 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + + int count = 0; + for (int i = 0; i < n; ++i) + { + if (idata[i] > 0) + { + odata[count++] = idata[i]; + } + } + timer().endCpuTimer(); - return -1; + return count; } /** @@ -41,10 +63,39 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { + + int* tmpInputData = new int[n]; + int* tmpOutputData = new int[n]; + timer().startCpuTimer(); - // TODO + + // Transfer idata to 0,1 set + for (int i = 0; i < n; ++i) + { + tmpInputData[i] = idata[i] > 0 ? 
1 : 0; + } + + // Exclusive scan + doScan(n, tmpOutputData, tmpInputData); + + // Final array size + int count = tmpOutputData[n - 1] + tmpInputData[n - 1]; + + // Scatter + for (int i = 0; i < n; ++i) + { + if (tmpInputData[i] > 0) + { + odata[tmpOutputData[i]] = idata[i]; + } + } + timer().endCpuTimer(); - return -1; + + delete[] tmpInputData; + delete[] tmpOutputData; + + return count; } } } diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h index 873c047..859a092 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.h @@ -6,6 +6,8 @@ namespace StreamCompaction { namespace CPU { StreamCompaction::Common::PerformanceTimer& timer(); + void doScan(int n, int* odata, const int* idata); + void scan(int n, int *odata, const int *idata); int compactWithoutScan(int n, int *odata, const int *idata); diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..b348226 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -1,6 +1,8 @@ #include #include +#include #include "common.h" +#include #include "efficient.h" namespace StreamCompaction { @@ -12,13 +14,142 @@ namespace StreamCompaction { return timer; } + __global__ void kernUpSweep2(int n, int d, int st, int* odata) + { + // This uses less threads than kernUpSweep() + + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + + int gap = 1 << d; + + int readIndex = st + 2 * gap * index; + int writeIndex = readIndex + gap; + + if (writeIndex < n) + { + odata[writeIndex] += odata[readIndex]; + } + + __syncthreads(); + } + + __global__ void kernDownSweep2(int n, int d, int* odata) + { + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + + int gap = 1 << d; + + int rightIndex = n - 1 - 2 * gap * index; + int leftIndex = rightIndex - gap; + + if (leftIndex >= 0) + { + int tmp = odata[leftIndex]; + odata[leftIndex] = odata[rightIndex]; + odata[rightIndex] += tmp; + } + + __syncthreads(); + } + + __global__ void kernUpSweep(int n, int d, 
int* odata) + { + int k = (blockDim.x * blockIdx.x) + threadIdx.x; + + int powd = 1 << d; + int powd2 = 1 << (d + 1); + + if (k % powd2 == 0) + { + odata[k + powd2 - 1] += odata[k + powd - 1]; + } + } + + __global__ void kernDownSweep(int n, int d, int* odata) + { + int k = (blockDim.x * blockIdx.x) + threadIdx.x; + + if (k >= n) + return; + + int powd = 1 << d; + int powd2 = 1 << (d + 1); + + if (k % powd2 == 0) + { + int tmp = odata[k + powd - 1]; + odata[k + powd - 1] = odata[k + powd2 - 1]; + odata[k + powd2 - 1] += tmp; + } + } + + void doScan(int n, int* dev_odata) + { + // Exclusive scan + + int round_n = 1 << ilog2ceil(n); + + int BLOCK_SIZE = 128; + dim3 fullBlocksPerGrid((round_n + BLOCK_SIZE - 1) / BLOCK_SIZE); + + // Up sweep 2 (Halve thread count in each iteration) + int st = 0; + int round_n_up_sweep = round_n / 2; + for (int d = 0; d <= ilog2ceil(round_n) - 1; ++d) + { + dim3 fullBlocksPerGridUpSweep((round_n_up_sweep + BLOCK_SIZE - 1) / BLOCK_SIZE); + kernUpSweep2 <<>> (round_n, d, st, dev_odata); + checkCUDAError("kernUpSweep fails."); + st += (1 << d); + round_n_up_sweep /= 2; + } + // Down sweep 2 (Double thread count in each iteration) + int round_n_down_sweep = 1; + cudaMemset(dev_odata + round_n - 1, 0, sizeof(int)); + for (int d = ilog2ceil(round_n) - 1; d >= 0; --d) + { + dim3 fullBlocksPerGridDownSweep((round_n_down_sweep + BLOCK_SIZE - 1) / BLOCK_SIZE); + kernDownSweep2 <<>> (round_n, d, dev_odata); + checkCUDAError("kernDownSweep fails."); + round_n_down_sweep *= 2; + } + + //// Up sweep + //for (int d = 0; d <= ilog2ceil(round_n) - 1; ++d) + //{ + // kernUpSweep <<>> (round_n, d, dev_odata); + // checkCUDAError("kernUpSweep fails."); + //} + //// Down sweep + //cudaMemset(dev_odata + round_n - 1, 0, sizeof(int)); + //for (int d = ilog2ceil(round_n) - 1; d >= 0; --d) + //{ + // kernDownSweep << > > (round_n, d, dev_odata); + // checkCUDAError("kernDownSweep fails."); + //} + } + /** * Performs prefix-sum (aka scan) on idata, storing the 
result into odata. */ void scan(int n, int *odata, const int *idata) { + + // Round to 'balanced tree' + int round_n = 1 << ilog2ceil(n); + + // Device memory setup + int* dev_odata; + cudaMalloc((void**)&dev_odata, round_n * sizeof(int)); + cudaMemset(dev_odata, 0, round_n * sizeof(int)); + cudaMemcpy(dev_odata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + doScan(n, dev_odata); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_odata); } /** @@ -31,10 +162,58 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + + int round_n = 1 << ilog2ceil(n); + + // Config + int BLOCK_SIZE = 128; + dim3 fullBlocksPerGrid((round_n + BLOCK_SIZE - 1) / BLOCK_SIZE); + + // Device Memory setup + int* dev_idata; + int* dev_odata_bool; + int* dev_odata_scan; + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata_bool, round_n * sizeof(int)); + cudaMalloc((void**)&dev_odata_scan, round_n * sizeof(int)); + + cudaMemset(dev_odata_bool, 0, round_n * sizeof(int)); + cudaMemset(dev_odata_scan, 0, round_n * sizeof(int)); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + + // Mark 1 0 + Common::kernMapToBoolean<<>>(round_n, dev_odata_bool, dev_idata); + + // Scan + cudaMemcpy(dev_odata_scan, dev_odata_bool, n * sizeof(int), cudaMemcpyDeviceToDevice); + doScan(n, dev_odata_scan); + + // Read total number from scan result + int* tmp = new int[2]; + cudaMemcpy(tmp, dev_odata_scan + n - 1, 1 * sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(tmp + 1, dev_odata_bool + n - 1, 1 * sizeof(int), cudaMemcpyDeviceToHost); + int count = tmp[0] + tmp[1]; + delete[] tmp; + + // Scatter + int* dev_odata; + cudaMalloc((void**)&dev_odata, count * sizeof(int)); + Common::kernScatter 
<<>>(n, dev_odata, dev_idata, dev_odata_bool, dev_odata_scan); + timer().endGpuTimer(); - return -1; + + cudaMemcpy(odata, dev_odata, count * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_odata); + + cudaFree(dev_idata); + cudaFree(dev_odata_bool); + cudaFree(dev_odata_scan); + + return count; } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..5485f24 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -6,7 +6,9 @@ namespace StreamCompaction { namespace Efficient { StreamCompaction::Common::PerformanceTimer& timer(); - void scan(int n, int *odata, const int *idata); + void doScan(int n, int* dev_odata); + + void scan(int n, int* odata, const int* idata); int compact(int n, int *odata, const int *idata); } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..96807fe 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -1,5 +1,7 @@ #include #include +#include +#include #include "common.h" #include "naive.h" @@ -11,7 +13,22 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + + __global__ void kernScan(int n, int d, int* odata, int *idata) + { + // Inclusive scan + + int k = (blockDim.x * blockIdx.x) + threadIdx.x; + + if (k >= n) + return; + + int powd = 1 << (d - 1); // More efficient than pow() + if (k >= powd) + { + odata[k] = idata[k - powd] + idata[k]; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
@@ -19,6 +36,99 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startGpuTimer(); // TODO + + // Config + int BLOCK_SIZE = 512; // This out performs BLOCK_SIZE = 128 in large data sets + dim3 fullBlocksPerGrid((n + BLOCK_SIZE - 1) / BLOCK_SIZE); + + // 2 Buffers + int* dev_idata; + int* dev_odata; + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(dev_odata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + // Scan + for (int d = 1; d <= ilog2ceil(n); ++d) + { + kernScan<<>>(n, d, dev_odata, dev_idata); + checkCUDAError("kernScan fails."); + cudaMemcpy(dev_idata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToDevice); + } + + + // Inclusive to exclusive (shift right, insert identity) + cudaMemcpy(odata + 1, dev_odata, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost); + odata[0] = 0; + + cudaFree(dev_idata); + cudaFree(dev_odata); + + timer().endGpuTimer(); + } + + + __global__ void kernScan2(int n, int* odata, int* idata) + { + // This method should be only used for one block and small data sets + + extern __shared__ int temp[]; // allocated on invocation + int thid = threadIdx.x; + int pout = 0, pin = 1; + + // Load input into shared memory. + // This is exclusive scan, so shift right by one + // and set first element to 0 + temp[pout * n + thid] = (thid > 0) ? idata[thid - 1] : 0; + __syncthreads(); + + for (int offset = 1; offset < n; offset *= 2) + { + pout = 1 - pout; // swap double buffer indices + pin = 1 - pout; + + if (thid >= offset) + temp[pout * n + thid] += temp[pin * n + thid - offset]; + else + temp[pout * n + thid] = temp[pin * n + thid]; + __syncthreads(); + } + + odata[thid] = temp[pout * n + thid]; // write output + } + + /** + * Performs prefix-sum (aka scan) on idata using shared memory, storing the result into odata. 
+ */ + void scan2(int n, int* odata, const int* idata) + { + timer().startGpuTimer(); + + if (n > 1024) + { + std::cout << "n > 1024, navie scan with shared memory failed." << std::endl; + timer().endGpuTimer(); + return; + } + + int* dev_idata; + int* dev_odata; + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + kernScan2<<<1, n, 2 * n * sizeof(int)>>>(n, dev_odata, dev_idata); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); + timer().endGpuTimer(); } } diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 37dcb06..612b00e 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -6,6 +6,10 @@ namespace StreamCompaction { namespace Naive { StreamCompaction::Common::PerformanceTimer& timer(); + __global__ void kernScan(int n, int d, int* odata, int* idata); + void scan(int n, int *odata, const int *idata); + + void scan2(int n, int* odata, const int* idata); } } diff --git a/stream_compaction/radix_sort.cu b/stream_compaction/radix_sort.cu new file mode 100644 index 0000000..5204126 --- /dev/null +++ b/stream_compaction/radix_sort.cu @@ -0,0 +1,168 @@ +#include +#include +#include +#include +#include "radix_sort.h" +#include "cpu.h" +#include "efficient.h" +#include + +#include "common.h" + +namespace StreamCompaction +{ + namespace Radix_Sort + { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + __device__ __host__ int checkDigit(int num, int whichDigit) + { + return (num >> whichDigit) & 1; + } + + void radix_sort_cpu(int n, int digitMax, int* odata, const int* idata) + { + int* oArray = new int[n]; // A copy of odata for scattering + int* eArray = new int[n]; + int* fArray = new int[n]; + int* tArray = new int[n]; + + // Init odata + 
memcpy(odata, idata, n * sizeof(int)); + + timer().startCpuTimer(); + + for (int i = 0; i < digitMax; ++i) + { + // Save the orignal odata + memcpy(oArray, odata, n * sizeof(int)); + + // Build eArray + for (int j = 0; j < n; ++j) + { + eArray[j] = checkDigit(oArray[j], i) == 1 ? 0 : 1; + } + + // Build fArray by scaning eArray + CPU::doScan(n, fArray, eArray); + + // Scatter data by d + int d; + int totalFalses = fArray[n - 1] + eArray[n - 1]; + for (int j = 0; j < n; ++j) + { + if (eArray[j] == 0) // b[j] == 1 + { + d = j - fArray[j] + totalFalses; // d[j] = t[j] + } + else + { + d = fArray[j]; // d[j] = f[j] + } + odata[d] = oArray[j]; + } + } + + timer().endCpuTimer(); + + delete[] oArray; + delete[] eArray; + delete[] fArray; + delete[] tArray; + } + + + + + __global__ void kernBuildErrorArray(int n, int whichOne, int* eArray, int* idata) + { + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + + if (index >= n) + return; + + eArray[index] = 1 - checkDigit(idata[index], whichOne); + } + + __global__ void kernSplit(int n, int totalFalses, int* odata, int *idata, int *fArray, int* eArray) + { + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + + if (index >= n) + return; + + int d = -1; + if (eArray[index] == 0) + { + d = index - fArray[index] + totalFalses; + } + else + { + d = fArray[index]; + } + + odata[d] = idata[index]; + } + + void radix_sort_parallel(int n, int digitMax, int* odata, int* idata) + { + int round_n = 1 << ilog2ceil(n); + + // Configuration + int BLOCK_SIZE = 128; + dim3 fullBlocksPerGrid((n + BLOCK_SIZE - 1) / BLOCK_SIZE); + + int* dev_eArray; + int* dev_fArray; + int* dev_odata; + int* dev_odata2; + int* tmp = new int[2]; + + cudaMalloc((void**)&dev_eArray, round_n * sizeof(int)); + cudaMalloc((void**)&dev_fArray, round_n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata2, n * sizeof(int)); + + cudaMemset(dev_eArray, 0, round_n * sizeof(int)); + cudaMemcpy(dev_odata, idata, n * 
sizeof(int), cudaMemcpyHostToDevice); + + timer().startGpuTimer(); + + int totalFalses = -1; + for (int i = 0; i < digitMax; ++i) + { + cudaMemcpy(dev_odata2, dev_odata, n * sizeof(int), cudaMemcpyDeviceToDevice); + + // Build eArray + kernBuildErrorArray<<>>(n, i, dev_eArray, dev_odata2); + + // Build fArray + cudaMemcpy(dev_fArray, dev_eArray, round_n * sizeof(int), cudaMemcpyDeviceToDevice); + Efficient::doScan(round_n, dev_fArray); + + // Read totalfalses + cudaMemcpy(tmp, dev_fArray + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(tmp + 1, dev_eArray + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + totalFalses = tmp[0] + tmp[1]; + + // Scatter data + kernSplit<<>>(n, totalFalses, dev_odata, dev_odata2, dev_fArray, dev_eArray); + } + + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + + delete[] tmp; + cudaFree(dev_eArray); + cudaFree(dev_fArray); + cudaFree(dev_odata); + cudaFree(dev_odata2); + } + } +} diff --git a/stream_compaction/radix_sort.h b/stream_compaction/radix_sort.h new file mode 100644 index 0000000..eefd062 --- /dev/null +++ b/stream_compaction/radix_sort.h @@ -0,0 +1,15 @@ +#pragma once + +#include "common.h" + +namespace StreamCompaction { + namespace Radix_Sort { + StreamCompaction::Common::PerformanceTimer& timer(); + + int checkDigit(int num, int whichDigit); + + void radix_sort_parallel(int n, int digitMax, int* odata, int* idata); + + void radix_sort_cpu(int n, int digitMax, int* odata, const int* idata); + } +} diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..928de63 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,25 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + + thrust::host_vector host_in; + + for (int i = 0; i < n; ++i) + { + host_in.push_back(idata[i]); + } + + thrust::device_vector dv_in = host_in; + thrust::device_vector dv_out(n); + timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); timer().endGpuTimer(); + + for (int i = 0; i < n; ++i) + { + odata[i] = dv_out[i]; + } } } }