diff --git a/README.md b/README.md
index 0e38ddb..1437571 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,77 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* XiaoyuDu
+* Tested on: Windows 10, i9-11900KF @ 3.50GHz, RTX 3080 (Personal PC)
+
+### Description
+This project implements and compares several methods for scan (exclusive prefix sum) and stream compaction, on both the CPU and the GPU.
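+
+As a quick illustration of the two operations (the values below are made up for illustration, not taken from the measured runs):
+
+```
+input:                   [ 3 1 7 0 4 1 6 3 ]
+exclusive scan:          [ 0 3 4 11 11 15 16 22 ]
+compaction (drop zeros): [ 3 1 7 4 1 6 3 ]
+```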
+
+### Features
+I implemented all the features for parts 1-5:
+* CPU Scan & Stream Compaction
+* Naive GPU Scan Algorithm
+* Work-Efficient GPU Scan & Stream Compaction
+* Thrust's Implementation
+* GPU Work-Efficient Method Optimization
+
+### Performance Analysis
+My optimized block size is 128.
+I compared the different methods across a range of array sizes; the resulting plot is shown below. I am not sure why my Thrust scan takes so long to run; I believe my implementation is correct.
+![](./images/1.png)
 
-### (TODO: Your README)
-
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
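+
+One possible explanation, which I have not verified, is that the timed region includes one-time setup cost inside Thrust (for example, temporary-storage allocation on the first `exclusive_scan` call). A warm-up scan before the timed call, as in the sketch below, would separate that cost; the `dev_thrust_odata` vector here is a hypothetical addition for illustration and is not part of the current code.
+
+```
+// Sketch only: run one untimed warm-up scan so any one-time setup cost
+// inside Thrust is excluded from the measured region.
+thrust::device_vector<int> dev_thrust_odata(n);
+thrust::exclusive_scan(dev_thrust_idata.begin(), dev_thrust_idata.end(), dev_thrust_odata.begin()); // warm-up
+cudaDeviceSynchronize();
+
+timer().startGpuTimer();
+thrust::exclusive_scan(dev_thrust_idata.begin(), dev_thrust_idata.end(), dev_thrust_odata.begin()); // timed run
+timer().endGpuTimer();
+```
+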
+Below are the test results for a 2^20-element array.
+```
+****************
+** SCAN TESTS **
+****************
+    [ 28 9 12 41 33 49 46 3 11 27 35 5 47 ... 8 0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 1.7669ms    (std::chrono Measured)
+    [ 0 28 37 49 90 123 172 218 221 232 259 294 299 ... 25674595 25674603 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 1.7544ms    (std::chrono Measured)
+    [ 0 28 37 49 90 123 172 218 221 232 259 294 299 ... 25674502 25674539 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 0.510176ms    (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 0.695424ms    (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 0.43328ms    (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 0.631104ms    (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 28.3783ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 7.89008ms    (CUDA Measured)
+    passed
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 1 2 1 1 1 2 0 2 2 3 0 3 3 ... 3 0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 3.7818ms    (std::chrono Measured)
+    [ 1 2 1 1 1 2 2 2 3 3 3 2 3 ... 3 3 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 3.5542ms    (std::chrono Measured)
+    [ 1 2 1 1 1 2 2 2 3 3 3 2 3 ... 3 3 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 9.8808ms    (std::chrono Measured)
+    [ 1 2 1 1 1 2 2 2 3 3 3 2 3 ... 3 3 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 1.07133ms    (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 0.849984ms    (CUDA Measured)
+    passed
+```
\ No newline at end of file
diff --git a/images/1.png b/images/1.png
new file mode 100644
index 0000000..caabacb
Binary files /dev/null and b/images/1.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..f494cec 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,7 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 20; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -139,6 +139,8 @@ int main(int argc, char* argv[]) {
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
     //printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
+
+
 
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, non-power-of-two");
@@ -146,6 +148,7 @@
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
     //printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
+
     system("pause"); // stop Win32 console from closing on exit
 
     delete[] a;
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..25cb9f9 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -20,6 +20,10 @@ namespace StreamCompaction {
         void scan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            odata[0] = 0;
+            for (int i = 1; i < n; ++i) {
+                odata[i] = odata[i - 1] + idata[i - 1];
+            }
             timer().endCpuTimer();
         }
 
@@ -31,8 +35,15 @@ namespace StreamCompaction {
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            int num = 0;
+            for (int i = 0; i < n; ++i) {
+                if (idata[i] != 0) {
+                    odata[num] = idata[i];
+                    ++num;
+                }
+            }
             timer().endCpuTimer();
-            return -1;
+            return num;
         }
 
         /**
@@ -43,8 +54,40 @@ namespace StreamCompaction {
         int compactWithScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            int* boolArr = new int[n];
+            int* scanArr = new int[n];
+            int num = 0;
+
+            // build boolArr: 1 where idata is non-zero, 0 otherwise
+            for (int i = 0; i < n; ++i) {
+                if (idata[i] != 0) {
+                    boolArr[i] = 1;
+                }
+                else {
+                    boolArr[i] = 0;
+                }
+            }
+
+            // exclusive scan of boolArr gives each kept element its output index
+            scanArr[0] = 0;
+            for (int i = 1; i < n; ++i) {
+                scanArr[i] = boolArr[i - 1] + scanArr[i - 1];
+            }
+
+            // scatter the kept elements into odata
+            for (int i = 0; i < n; ++i) {
+                if (boolArr[i] == 1) {
+                    odata[scanArr[i]] = idata[i];
+                }
+            }
+
+            // number of elements remaining after compaction
+            num = scanArr[n - 1] + boolArr[n - 1];
+
+            delete[] boolArr;
+            delete[] scanArr;
             timer().endCpuTimer();
-            return -1;
+            return num;
         }
     }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..bdd4dbc 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -3,6 +3,8 @@
 #include "common.h"
 #include "efficient.h"
 
+#define blockSize 128
+
 namespace StreamCompaction {
     namespace Efficient {
         using StreamCompaction::Common::PerformanceTimer;
@@ -12,13 +14,94 @@ namespace StreamCompaction {
             return timer;
         }
 
+        __global__ void kernUpSweep(int threadNeeded, int d, int* dev_idata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            // increase1 = 2^(d+1), increase2 = 2^d
+            if (index < threadNeeded) {
+                int increase1 = 1 << (d + 1);
+                int increase2 = 1 << d;
+                int multiIdx = index * increase1;
+                dev_idata[multiIdx + increase1 - 1] += dev_idata[multiIdx + increase2 - 1];
+            }
+        }
+
+        __global__ void kernDownSweep(int threadNeeded, int d, int* dev_idata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index < threadNeeded) {
+                int increase1 = 1 << (d + 1);
+                int increase2 = 1 << d;
+                int multiIdx = index * increase1;
+                int t = dev_idata[multiIdx + increase2 - 1];
+                dev_idata[multiIdx + increase2 - 1] = dev_idata[multiIdx + increase1 - 1];
+                dev_idata[multiIdx + increase1 - 1] += t;
+            }
+        }
+
+        __global__ void kernMapToBoolean(int n, int* temp_Arr, int* dev_idata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index < n) {
+                if (dev_idata[index] != 0) {
+                    temp_Arr[index] = 1;
+                }
+            }
+        }
+
+        __global__ void kernScatter(int n, int* dev_tempArr, int* dev_finalArr, int* dev_idata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index < (n - 1)) {
+                int currScan = dev_tempArr[index];
+                int nextScan = dev_tempArr[index + 1];
+                if (currScan < nextScan) {
+                    dev_finalArr[currScan] = dev_idata[index];
+                }
+            }
+        }
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+
+            // round the array size up to the next power of two
+            int nCeil = ilog2ceil(n);
+            int n2PowCeil = 1 << nCeil;
+
+            int* dev_idata;
+            cudaMalloc((void**)&dev_idata, n2PowCeil * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata failed!");
+
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy idata to dev_idata failed!");
+            // timer start
             timer().startGpuTimer();
+            if (n2PowCeil != n) {
+                cudaMemset(&(dev_idata[n]), 0, (n2PowCeil - n) * sizeof(int));
+                checkCUDAError("cudaMemset failed!");
+            }
+
+            // one thread per active node at the current level is enough
             // TODO
+            // up-sweep
+            int depth = ilog2ceil(n2PowCeil) - 1;
+            for (int d = 0; d <= depth; ++d) {
+                int threadNeeded = 1 << (nCeil - d - 1);
+                dim3 fullBlocksPerGrid((threadNeeded + blockSize - 1) / blockSize);
+                kernUpSweep<<<fullBlocksPerGrid, blockSize>>>(threadNeeded, d, dev_idata);
+            }
+            // down-sweep: zero the root, then sweep partial sums back down
+            cudaMemset(&(dev_idata[n2PowCeil - 1]), 0, sizeof(int));
+            for (int d = depth; d >= 0; --d) {
+                int threadNeeded = 1 << (nCeil - d - 1);
+                dim3 fullBlocksPerGrid((threadNeeded + blockSize - 1) / blockSize);
+                kernDownSweep<<<fullBlocksPerGrid, blockSize>>>(threadNeeded, d, dev_idata);
+            }
             timer().endGpuTimer();
+
+
+            cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAError("cudaMemcpy dev_idata to odata failed!");
+
+            cudaFree(dev_idata);
         }
 
         /**
@@ -31,10 +114,68 @@ namespace StreamCompaction {
          * @returns      The number of elements remaining after compaction.
          */
         int compact(int n, int *odata, const int *idata) {
+
+            int nCeil = ilog2ceil(n);
+            int n2PowCeil = 1 << nCeil;
+            int* dev_idata;
+            int* dev_tempArr;
+            int* dev_finalArr;
+
+            cudaMalloc((void**)&dev_idata, n2PowCeil * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata failed!");
+
+            cudaMalloc((void**)&dev_tempArr, n2PowCeil * sizeof(int));
+            checkCUDAError("cudaMalloc dev_tempArr failed!");
+
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy from idata to dev_idata failed!");
+
+            // start
             timer().startGpuTimer();
-            // TODO
+            if (n2PowCeil != n) {
+                cudaMemset(&(dev_idata[n]), 0, (n2PowCeil - n) * sizeof(int));
+                checkCUDAError("cudaMemset dev_idata failed!");
+            }
+
+            cudaMemset(dev_tempArr, 0, n2PowCeil * sizeof(int));
+            checkCUDAError("cudaMemset dev_tempArr failed!");
+
+            dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+            // build boolean array
+            kernMapToBoolean<<<fullBlocksPerGrid, blockSize>>>(n, dev_tempArr, dev_idata);
+            int lastElement = idata[n - 1];
+
+            // up-sweep
+            int depth = ilog2ceil(n2PowCeil) - 1;
+            for (int d = 0; d <= depth; ++d) {
+                int threadNeeded = 1 << (nCeil - d - 1);
+                dim3 fullBlocksPerGrid((threadNeeded + blockSize - 1) / blockSize);
+                kernUpSweep<<<fullBlocksPerGrid, blockSize>>>(threadNeeded, d, dev_tempArr);
+            }
+            // the root of the up-sweep holds the total count, i.e. the compacted size
+            int numOfResults;
+            cudaMemcpy(&numOfResults, &(dev_tempArr[n2PowCeil - 1]), sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMalloc((void**)&dev_finalArr, numOfResults * sizeof(int));
+            // down-sweep
+            cudaMemset(&(dev_tempArr[n2PowCeil - 1]), 0, sizeof(int));
+            for (int d = depth; d >= 0; --d) {
+                int threadNeeded = 1 << (nCeil - d - 1);
+                dim3 fullBlocksPerGrid((threadNeeded + blockSize - 1) / blockSize);
+                kernDownSweep<<<fullBlocksPerGrid, blockSize>>>(threadNeeded, d, dev_tempArr);
+            }
+            // scatter
+            kernScatter<<<fullBlocksPerGrid, blockSize>>>(n, dev_tempArr, dev_finalArr, dev_idata);
+
             timer().endGpuTimer();
-            return -1;
+            // end
+
+            cudaMemcpy(odata, dev_finalArr, numOfResults * sizeof(int), cudaMemcpyDeviceToHost);
+            // kernScatter skips the last input element, so patch it in here if it was non-zero
+            if (lastElement) {
+                odata[numOfResults - 1] = lastElement;
+            }
+
+            cudaFree(dev_idata);
+            cudaFree(dev_tempArr);
+            cudaFree(dev_finalArr);
+
+            return numOfResults;
         }
     }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..cef8fce 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -3,6 +3,8 @@
 #include "common.h"
 #include "naive.h"
 
+#define blockSize 128
+
 namespace StreamCompaction {
     namespace Naive {
         using StreamCompaction::Common::PerformanceTimer;
@@ -12,14 +14,56 @@ namespace StreamCompaction {
             return timer;
         }
         // TODO: __global__
+        __global__ void kernNaive(int n, int check, const int* dev_idata, int* dev_odata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index < n) {
+                if (index >= check) {
+                    dev_odata[index] = dev_idata[index - check] + dev_idata[index];
+                }
+                else {
+                    dev_odata[index] = dev_idata[index];
+                }
+            }
+        }
 
         /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
            // TODO
+            int* dev_idata;
+            int* dev_odata;
+            dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata failed!");
+
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_odata failed!");
+
+            // shift idata right by one and prepend the identity (0) to produce an exclusive scan
+            int identity = 0;
+            cudaMemcpy(dev_idata, &identity, sizeof(int), cudaMemcpyHostToDevice);
+
+            cudaMemcpy(&dev_idata[1], idata, (n - 1) * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAError("memory copy to dev_idata failed!");
+
+            timer().startGpuTimer();
+
+            for (int d = 1; d <= ilog2ceil(n); ++d) {
+                int check = 1 << (d - 1);
+                kernNaive<<<fullBlocksPerGrid, blockSize>>>(n, check, dev_idata, dev_odata);
+                std::swap(dev_idata, dev_odata);
+            }
+
            timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAError("memory copy to odata failed!");
+
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
+
        }
    }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..b8062e9 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,11 +18,19 @@ namespace StreamCompaction {
        /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
            // TODO use `thrust::exclusive_scan`
            // example: for device_vectors dv_in and dv_out:
            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+
+            thrust::host_vector<int> host_thrust_idata(idata, idata + n);
+            thrust::device_vector<int> dev_thrust_idata(host_thrust_idata);
+            timer().startGpuTimer();
+            thrust::exclusive_scan(dev_thrust_idata.begin(), dev_thrust_idata.end(), dev_thrust_idata.begin());
            timer().endGpuTimer();
+            host_thrust_idata = dev_thrust_idata;
+
+            thrust::copy(host_thrust_idata.begin(), host_thrust_idata.end(), odata);
+
        }
    }
 }