diff --git a/README.md b/README.md index 0e38ddb..48744cf 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,93 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Guanlin Huang + * [LinkedIn](https://www.linkedin.com/in/guanlin-huang-4406668502/), [personal website](virulentkid.github.io/personal_web/index.html) +* Tested on: Windows 11, i9-10900K @ 4.9GHz 32GB, RTX3080 10GB; Compute Capability: 8.6 -### (TODO: Your README) +## Project Description -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +Stream compaction is an algorithm to produce a smaller output array that contains the indices of the desired elements from the input array; +such an algorithm will come in handy in ray path tracing of computer graphics or huge sparse matrix compression in AI. +The performance of element selection is extremely important due to the enormous quantity of data pieces that need to be filtered. +Modern Graphics Processing Units (GPUs) have been utilized more frequently lately to speed up the processing of extremely big, concurrent data applications. +In this Project, the stream compaction algorithm is implemented in conjunction of an algorithm called prefix-sum, or, "scan." +Here is an visual representation of that algorithm: +!["Scan Algorithm"](img/figure-39-2.jpg) + +The following perfomance analysis of scan and stream compaction will show the benefit of using GPU comparing to CPU only. + + +## Performance Analysis + +!["Scan"](img/scan.png) +!["Stream Compaction"](img/comp.png) +As the charts are shown, when array size is large, both naive and work-efficient algorithm of GPU out-perform the CPU implemtation. + +### Performance Bottleneck +* GPU-based implementations are constrained by global memory reading performance at small array sizes because of no optimization of using share memory. +However, due to the advantages of parallel processing, GPU implementations see a less drastic rise in runtime comparing to the CPU one. + +### Test output + +``` +**************** +** SCAN TESTS ** +**************** + [ 0 7 11 5 35 28 17 42 37 26 38 7 26 ... 25 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 33.3664ms (std::chrono Measured) + [ 0 0 7 18 23 58 86 103 145 182 208 246 253 ... 821757286 821757311 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 25.4672ms (std::chrono Measured) + [ 0 0 7 18 23 58 86 103 145 182 208 246 253 ... 821757208 821757214 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 11.3899ms (CUDA Measured) + [ 0 0 7 18 23 58 86 103 145 182 208 246 253 ... 821757286 821757311 ] + passed +==== naive scan, non-power-of-two ==== + elapsed time: 12.0769ms (CUDA Measured) + [ 0 0 7 18 23 58 86 103 145 182 208 246 253 ... 0 0 ] + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 11.4359ms (CUDA Measured) + [ 0 0 7 18 23 58 86 103 145 182 208 246 253 ... 821757286 821757311 ] + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 10.0679ms (CUDA Measured) + [ 0 0 7 18 23 58 86 103 145 182 208 246 253 ... 821757208 821757214 ] + passed +==== thrust scan, power-of-two ==== + elapsed time: 1.05677ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.978368ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 1 3 1 0 3 2 1 2 2 2 3 3 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 59.5784ms (std::chrono Measured) + [ 2 1 3 1 3 2 1 2 2 2 3 3 3 ... 2 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 60.795ms (std::chrono Measured) + [ 2 1 3 1 3 2 1 2 2 2 3 3 3 ... 3 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 129.865ms (std::chrono Measured) + [ 2 1 3 1 3 2 1 2 2 2 3 3 3 ... 2 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 16.1052ms (CUDA Measured) + [ 2 1 3 1 3 2 1 2 2 2 3 3 3 ... 2 2 ] + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 16.0944ms (CUDA Measured) + [ 2 1 3 1 3 2 1 2 2 2 3 3 3 ... 3 2 ] + passed +Press any key to continue . . . diff --git a/img/comp.png b/img/comp.png new file mode 100644 index 0000000..fe4aaa5 Binary files /dev/null and b/img/comp.png differ diff --git a/img/scan.png b/img/scan.png new file mode 100644 index 0000000..ff85f1a Binary files /dev/null and b/img/scan.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..879c8f6 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 25; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -51,7 +51,7 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan @@ -64,21 +64,21 @@ int main(int argc, char* argv[]) { printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); @@ -137,14 +137,14 @@ int main(int argc, char* argv[]) { printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..504514b 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,4 +1,5 @@ #include "common.h" +#include void checkCUDAErrorFn(const char *msg, const char *file, int line) { cudaError_t err = cudaGetLastError(); @@ -24,6 +25,12 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) { + return; + } + + bools[index] = idata[index] == 0 ? 0 : 1; } /** @@ -33,6 +40,14 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) { + return; + } + + if (bools[index] == 1) { + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..d7a39ae 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -4,47 +4,86 @@ #include "common.h" namespace StreamCompaction { - namespace CPU { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } - - /** - * CPU scan (prefix sum). - * For performance analysis, this is supposed to be a simple for loop. - * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. - */ - void scan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - } - - /** - * CPU stream compaction without using the scan function. - * - * @returns the number of elements remaining after compaction. - */ - int compactWithoutScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - return -1; - } - - /** - * CPU stream compaction using scan and scatter, like the parallel version. - * - * @returns the number of elements remaining after compaction. - */ - int compactWithScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - return -1; - } - } + namespace CPU { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + //helper function for stream compaction to remove timer error + void cpuScan(int n, int* odata, const int* idata) { + if (n > 0) { + odata[0] = 0; + + int prevSum = 0; + for (int i = 1; i < n; i++) { + odata[i] = idata[i - 1] + prevSum; + prevSum = odata[i]; + } + } + } + + /** + * CPU scan (prefix sum). + * For performance analysis, this is supposed to be a simple for loop. + * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. + */ + void scan(int n, int* odata, const int* idata) { + timer().startCpuTimer(); + // TODO + cpuScan(n, odata, idata); + timer().endCpuTimer(); + } + + /** + * CPU stream compaction without using the scan function. + * + * @returns the number of elements remaining after compaction. + */ + int compactWithoutScan(int n, int* odata, const int* idata) { + timer().startCpuTimer(); + // TODO + int trueCounts = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[trueCounts] = idata[i]; + trueCounts++; + } + } + timer().endCpuTimer(); + return trueCounts; + } + + /** + * CPU stream compaction using scan and scatter, like the parallel version. + * + * @returns the number of elements remaining after compaction. + */ + int compactWithScan(int n, int* odata, const int* idata) { + timer().startCpuTimer(); + int* boolArr = new int[n]; + int* scanArr = new int[n]; + + for (int i = 0; i < n; i++) { + boolArr[i] = idata[i] == 0 ? 0 : 1; + } + + cpuScan(n, scanArr, boolArr); + + int trueCounts = 0; + for (int i = 0; i < n; i++) { + if (boolArr[i] == 1) { + trueCounts++; + odata[scanArr[i]] = idata[i]; + + } + } + delete[] boolArr; + delete[] scanArr; + timer().endCpuTimer(); + return trueCounts; + } + } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..523721f 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -2,39 +2,164 @@ #include #include "common.h" #include "efficient.h" +#include + +#define blockSize 128 +int* dev_idata; +int* dev_odata; +int* dev_scan_idata; +int* dev_boolArr; +int* dev_indexArr; namespace StreamCompaction { - namespace Efficient { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } - - /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - } - - /** - * Performs stream compaction on idata, storing the result into odata. - * All zeroes are discarded. - * - * @param n The number of elements in idata. - * @param odata The array into which to store elements. - * @param idata The array of elements to compact. - * @returns The number of elements remaining after compaction. - */ - int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - return -1; - } - } + namespace Efficient { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + __global__ void kernUpSweep(int n, int* idata, int d) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) { + return; + } + + if (index % (1 << (d + 1)) == 0) { + idata[index + (1 << (d + 1)) - 1] += idata[index + (1 << d) - 1]; + } + } + + __global__ void kernSetRootNode(int n, int* idata) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) { + return; + } + + idata[n - 1] = 0; + } + + __global__ void kernDownSweep(int n, int* idata, int d) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) { + return; + } + + if (index % ((1 << (d + 1))) == 0) { + int t = idata[index + (1 << d) - 1]; + idata[index + (1 << d) - 1] = idata[index + (1 << (d + 1)) - 1]; + idata[index + (1 << (d + 1)) - 1] += t; + } + } + + //helper function for stream compaction to remove timer error + void eff_scan(int n, int* odata, const int* idata) { + int log2n = ilog2ceil(n); + int sizeOfArr = 1 << log2n; + + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + cudaMalloc((void**)&dev_scan_idata, n * sizeof(int)); + cudaMemcpy(dev_scan_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + for (int d = 0; d <= log2n - 1; d++) { + kernUpSweep << > > (sizeOfArr, dev_scan_idata, d); + } + + kernSetRootNode << <1, 1 >> > (sizeOfArr, dev_scan_idata); + + for (int d = log2n - 1; d >= 0; d--) { + kernDownSweep << > > (sizeOfArr, dev_scan_idata, d); + } + + cudaMemcpy(odata, dev_scan_idata, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_scan_idata); + } + + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ + void scan(int n, int* odata, const int* idata) { + + int log2n = ilog2ceil(n); + int sizeOfArr = 1 << log2n; + + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + cudaMalloc((void**)&dev_scan_idata, n * sizeof(int)); + cudaMemcpy(dev_scan_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + timer().startGpuTimer(); + for (int d = 0; d <= log2n - 1; d++) { + kernUpSweep << > > (sizeOfArr, dev_scan_idata, d); + } + + kernSetRootNode << <1, 1 >> > (sizeOfArr, dev_scan_idata); + + for (int d = log2n - 1; d >= 0; d--) { + kernDownSweep << > > (sizeOfArr, dev_scan_idata, d); + } + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_scan_idata, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_scan_idata); + + } + + /** + * Performs stream compaction on idata, storing the result into odata. + * All zeroes are discarded. + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to compact. + * @returns The number of elements remaining after compaction. + */ + int compact(int n, int* odata, const int* idata) { + + // TODO + + int log2n = ilog2ceil(n); + int sizeOfArr = 1 << log2n; + + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + cudaMalloc((void**)&dev_boolArr, sizeOfArr * sizeof(int)); + cudaMalloc((void**)&dev_indexArr, sizeOfArr * sizeof(int)); + cudaMalloc((void**)&dev_odata, sizeof(int) * n); + + timer().startGpuTimer(); + StreamCompaction::Common::kernMapToBoolean << > > (sizeOfArr, dev_boolArr, dev_idata); + + eff_scan(n, dev_indexArr, dev_boolArr); + + StreamCompaction::Common::kernScatter << > > (n, dev_odata, dev_idata, dev_boolArr, dev_indexArr); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + + int* indices = new int[n]; + int count = 0; + cudaMemcpy(indices, dev_indexArr, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_boolArr); + cudaFree(dev_indexArr); + + + if (idata[n - 1] == 0) { + count = indices[n - 1]; + } + else { + count = indices[n - 1] + 1; + } + + delete[] indices; + return count; + } + } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..482a858 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -2,24 +2,85 @@ #include #include "common.h" #include "naive.h" +#include + +#define blockSize 128 +int* dev_bufferA; +int* dev_bufferB; + namespace StreamCompaction { - namespace Naive { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } - // TODO: __global__ - - /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - } - } + namespace Naive { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + // TODO: __global__ + __global__ void kernScanIteration(int n, int* odata, int* idata, int d) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) { + return; + } + + if (index >= (1 << (d-1))) { + odata[index] = idata[index - (1 << (d - 1))] + idata[index]; + } + else { + odata[index] = idata[index]; + } + } + + __global__ void kernCopyExclusive(int n, int* exclusive, int* inclusive) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) { + return; + } + + if (index == 0) { + exclusive[index] = 0; + } + else { + exclusive[index] = inclusive[index - 1]; + } + } + + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ + void scan(int n, int* odata, const int* idata) { + if (n == 0) { + return; + } + + cudaMalloc((void**)&dev_bufferA, n * sizeof(int)); + cudaMalloc((void**)&dev_bufferB, n * sizeof(int)); + cudaMemcpy(dev_bufferA, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + timer().startGpuTimer(); + int log2n = ilog2ceil(n); + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + for (int d = 1; d <= log2n; d++) { + kernScanIteration << > > (n, dev_bufferB, dev_bufferA, d); + + if (d < log2n) { + int* tempPtr = dev_bufferB; + dev_bufferB = dev_bufferA; + dev_bufferA = tempPtr; + } + + } + //the inclusive scan is stored in bufferB + kernCopyExclusive << > > (n, dev_bufferA, dev_bufferB); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_bufferA, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_bufferB); + cudaFree(dev_bufferA); + + + } + } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..17fb36b 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -7,22 +7,28 @@ #include "thrust.h" namespace StreamCompaction { - namespace Thrust { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } - /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); - timer().endGpuTimer(); - } - } + namespace Thrust { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ + void scan(int n, int* odata, const int* idata) { + + // TODO use `thrust::exclusive_scan` + // example: for device_vectors dv_in and dv_out: + // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::device_vector dev_odata(odata, odata + n); + thrust::device_vector dev_idata(idata, idata + n); + timer().startGpuTimer(); + thrust::exclusive_scan(dev_idata.begin(), dev_idata.end(), dev_odata.begin()); + timer().endGpuTimer(); + thrust::copy(dev_odata.begin(), dev_odata.end(), odata); + + } + } }