diff --git a/README.md b/README.md index 0e38ddb..a2aee6f 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,91 @@ CUDA Stream Compaction ====================== -**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** +**University of Pennsylvania, CIS 565: GPU Programming and Architecture, +Project 2 - CUDA Stream Compaction** + +* Edward Zhang + * https://www.linkedin.com/in/edwardjczhang/ + * https://zedward23.github.io/personal_Website/ + +* Tested on: Windows 10 Home, i7-11800H @ 2.3GHz, 16.0GB, NVIDIA GeForce RTX 3060 Laptop GPU + +## Background +The project contains an implementation of the Scan and Compaction Algorithms. + +### Scan +Description: +Each index i of a scan output array is the sum of the corresponding elements in the input array at the indices that came before i. This algorithm was implemented in the following ways: + +1. CPU - Non-parallel Scan +2. Naive - Naively Parallel Scan +3. Efficient - Parallel Scan using Upsweep and Downsweep on a binary tree representation of an array +4. Thrust - Scan using Thrust API + +### Compaction +Description: +Condenses an array into just its non-zero elements without changing its order + +1. CPU - Non-parallel Compact +2. CPU with Scan - Non-parallel Compact while using Scan +3. GPU - Parallel Compaction using Efficient Parallel Scan + +## Block Size Performance Analysis + +![](img/Graph0.png) + +A blocksize of 256 seems to yield the best results since it was the first size large enough to take advantage of the parallelism offered by the GPU. + +## Scan Performance +### Powers of 2 + +![](img/Graph1.png) + +Observations: +- CPU Scan is our baseline +- Thrust Scan is the fastest; this is expected since it is a library provided to us. +- Efficient and Naive GPU scan were actually fairly inefficient; this is likely due to suboptimal thread allocation. 
+ +### Non-Powers of 2 + +![](img/Graph2.png) + +Observations: +- The same observations from running the implementations on array lengths that were powers of 2 + +## Compact + +![](img/Graph3.png) + +Observations: +- Compaction without Scan on the CPU is actually faster than with Scan +- GPU implementations are still slower than the CPU implementations + +## Why is My GPU Approach So Slow? (Extra Credit) (+5) + +If you implement your efficient scan version following the slides closely, there's a good chance +that you are getting an "efficient" gpu scan that is actually not that efficient -- it is slower than the CPU approach. + +Though it is totally acceptable for this assignment, +In addition to explaining the reason for this phenomenon, you are encouraged to try to upgrade your work-efficient gpu scan. + +Thinking about these may lead you to an aha moment: +- What is the occupancy at a deeper level in the upper/down sweep? Are most threads actually working? + + Most threads are just idling since at each level, fewer and fewer indices should be written to. + +- Are you always launching the same number of blocks throughout each level of the upper/down sweep? + + I am always launching the same number of blocks regardless of how many indices should actually be written to. + +- If some threads are being lazy, can we do an early termination on them? + + Even if we terminate them early, we cannot move onto the next iteration until the ones that need to be written to are properly finished. + +- How can I compact the threads? What should I modify to keep the remaining threads still working correctly? + + On each iteration, dynamically dispatch the optimal number of threads and blocks that operate only on the specific indices that need to be modified. -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) -### (TODO: Your README) -Include analysis, etc. 
(Remember, this is public, so don't put -anything here that you don't want to share with the world.) diff --git a/img/Graph0.png b/img/Graph0.png new file mode 100644 index 0000000..a7c5c8c Binary files /dev/null and b/img/Graph0.png differ diff --git a/img/Graph1.png b/img/Graph1.png new file mode 100644 index 0000000..fced70a Binary files /dev/null and b/img/Graph1.png differ diff --git a/img/Graph2.png b/img/Graph2.png new file mode 100644 index 0000000..cc39f4c Binary files /dev/null and b/img/Graph2.png differ diff --git a/img/Graph3.png b/img/Graph3.png new file mode 100644 index 0000000..2c54f68 Binary files /dev/null and b/img/Graph3.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..c0bc311 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 26; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -51,48 +51,49 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); + //For bug-finding only: Array of 1s to help find bugs in stream compaction or scan + /*onesArray(SIZE, c); printDesc("1s array for finding bugs"); StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ + printArray(SIZE, c, true);*/ zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); + printArray(SIZE, a, true); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA 
Measured)"); - //printArray(SIZE, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); - zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, a, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -131,20 +132,21 @@ int main(int argc, char* argv[]) { count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + 
printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..ea7c16a 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -2,6 +2,7 @@ #include "cpu.h" #include "common.h" +#define IDENTITY 0 namespace StreamCompaction { namespace CPU { @@ -20,6 +21,12 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + + odata[0] = idata[0]; + for (int i = 1; i < n; i++) { + odata[i] = idata[i] + odata[i-1]; + } + timer().endCpuTimer(); } @@ -31,8 +38,18 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int currIdx = 0; + + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[currIdx] = idata[i]; + currIdx++; + } + } + + timer().endCpuTimer(); - return -1; + return currIdx; } /** @@ -42,9 +59,45 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + int* scanned = new int[n]; + //Filter + for (int i = 0; i < n; i++) { + (idata[i] != 0) ? scanned[i] = 1 : scanned[i] = 0; + } + + //Exclusive Scan + idata[0] == 0 ? 
scanned[0] = 0 : scanned[0] = 1; + for (int i = 1; i < n; i++) { + scanned[i] = scanned[i] + scanned[i - 1]; + } + + //Scatter + int currIdx = 0; + //odata[currIdx] = idata[0]; + + + + //Scatter: copy each non-zero element to its compacted index + if (idata[0] > 0) { + odata[0] = idata[0]; + currIdx++; + } + + for (int i = 1; i < n; i++) { + if (scanned[i] > scanned[i - 1]) { + odata[scanned[i]-1] = idata[i]; + currIdx++; + } + else { + odata[i] = 0; + } + } + + delete[] scanned; + timer().endCpuTimer(); - return -1; + return currIdx; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..0ac5564 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -12,12 +12,142 @@ namespace StreamCompaction { return timer; } + #define blockSize 8 + + int* dev_idata; + int* dev_odata; + int* dev_buf; + + __global__ void upSweep(int N, int* idata, int* odata, int depth) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + if (k >= N) { + return; + } + if ((k+1)%(1 << depth) == 0) { + odata[k] = idata[k] + idata[k - (1 << (depth-1))]; + } + else { + odata[k] = idata[k]; + } + } + + __global__ void downSweep(int N, int* idata, int* odata, int depth) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + if (k >= N) { + return; + } + + if ((k + 1) % (1 << depth) == 0) { + if ((k + 1) % (1 << (depth + 1)) == 0) { + odata[k] = idata[k - (1 << depth)] + idata[k]; + } + else { + odata[k] = idata[k + (1 << depth)]; + } + } + else { + odata[k] = idata[k]; + } + } + + __global__ void toInclusive(int N, int* idata, int* odata, int* buf) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + if (k >= N) { + return; + } + if (k < N-1){ + odata[k] = buf[k + 1]; + } + else { + odata[k] = buf[k] + idata[k]; + } + } + + __global__ void scatter(int N, int* idata, int* odata, int* buf) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + if (k >= N) { + return; + } + if (k == 0 && idata[k] > 0) { + odata[k] = idata[k]; + } + else { + if (buf[k] > 
buf[k-1]) { + odata[buf[k] - 1] = idata[k]; + } + } + + //odata[k] = buf[k]; + } + + __global__ void binarize(int N, int* idata, int* odata) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + if (k >= N) { + return; + } + idata[k] == 0 ? odata[k] = 0 : odata[k] = 1; + } + + void zeroArray(int n, int* a) { + for (int i = 0; i < n; i++) { + a[i] = 0; + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int* odata, const int* idata) { timer().startGpuTimer(); - // TODO + int arrLen; + int maxDepth = ilog2ceil(n); + maxDepth > ilog2(n) ? arrLen = pow(2, maxDepth) : arrLen = n; + zeroArray(arrLen, odata); + + dim3 threadsPerBlock(arrLen / blockSize); + + int* buf = new int[arrLen]; + + for (int i = 0; i < arrLen; i++) { + if (i < n) { + buf[i] = idata[i]; + } + else { + buf[i] = 0; + } + } + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, arrLen * sizeof(int)); + cudaMalloc((void**)&dev_buf, arrLen * sizeof(int)); + + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + cudaMemcpy(dev_odata, odata, sizeof(int) * arrLen, cudaMemcpyHostToDevice); + cudaMemcpy(dev_buf, buf, sizeof(int) * arrLen, cudaMemcpyHostToDevice); + + for (int i = 1; i <= maxDepth; i++) { + upSweep << > > (arrLen, dev_buf, dev_odata, i); + cudaMemcpy(dev_buf, dev_odata, sizeof(int) * arrLen, cudaMemcpyDeviceToDevice); + } + + cudaMemset(&dev_buf[arrLen - 1], 0, sizeof(int) * 1); + + for (int i = maxDepth - 1; i >= 0; i--) { + downSweep << > > (arrLen, dev_buf, dev_odata, i); + cudaMemcpy(dev_buf, dev_odata, sizeof(int) * arrLen, cudaMemcpyDeviceToDevice); + } + + toInclusive << > > (arrLen, dev_idata, dev_odata, dev_buf); + + cudaMemcpy((void**)idata, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaMemcpy(odata, dev_odata, sizeof(int) * arrLen, cudaMemcpyDeviceToHost); + cudaMemcpy(buf, dev_buf, sizeof(int) * arrLen, 
cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_buf); + + timer().endGpuTimer(); } @@ -32,9 +162,67 @@ namespace StreamCompaction { */ int compact(int n, int *odata, const int *idata) { timer().startGpuTimer(); - // TODO + + int arrLen; + int maxDepth = ilog2ceil(n); + maxDepth > ilog2(n) ? arrLen = pow(2, maxDepth) : arrLen = n; + zeroArray(arrLen, odata); + + dim3 threadsPerBlock(arrLen / blockSize); + + int* buf = new int[arrLen]; + + for (int i = 0; i < arrLen; i++) { + if (i < n) { + buf[i] = idata[i]; + } + else { + buf[i] = 0; + } + } + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, arrLen * sizeof(int)); + cudaMalloc((void**)&dev_buf, arrLen * sizeof(int)); + + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + cudaMemcpy(dev_odata, odata, sizeof(int) * arrLen, cudaMemcpyHostToDevice); + cudaMemcpy(dev_buf, buf, sizeof(int) * arrLen, cudaMemcpyHostToDevice); + + binarize << > > (arrLen, dev_idata, dev_odata); + cudaMemcpy(dev_buf, dev_odata, sizeof(int) * arrLen, cudaMemcpyDeviceToDevice); + + for (int i = 1; i <= maxDepth; i++) { + upSweep << > > (arrLen, dev_buf, dev_odata, i); + cudaMemcpy(dev_buf, dev_odata, sizeof(int) * arrLen, cudaMemcpyDeviceToDevice); + } + + cudaMemset(&dev_buf[arrLen - 1], 0, sizeof(int) * 1); + + for (int i = maxDepth - 1; i >= 0; i--) { + downSweep << > > (arrLen, dev_buf, dev_odata, i); + cudaMemcpy(dev_buf, dev_odata, sizeof(int) * arrLen, cudaMemcpyDeviceToDevice); + } + + toInclusive << > > (arrLen, dev_idata, dev_odata, dev_buf); + + cudaMemcpy(odata, dev_odata, sizeof(int) * arrLen, cudaMemcpyDeviceToHost); + int retLen = odata[n - 1]; + + cudaMemcpy(dev_buf, dev_odata, sizeof(int) * arrLen, cudaMemcpyDeviceToDevice); + + scatter << > > (arrLen, dev_idata, dev_odata, dev_buf); + + cudaMemcpy((void**)idata, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaMemcpy(odata, dev_odata, sizeof(int) * arrLen, 
cudaMemcpyDeviceToHost); + cudaMemcpy(buf, dev_buf, sizeof(int) * arrLen, cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_buf); + timer().endGpuTimer(); - return -1; + return retLen; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..521578f 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -1,24 +1,99 @@ +#define GLM_FORCE_CUDA #include +#include +#include #include +#include +#include #include "common.h" #include "naive.h" namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; + + #define blockSize 8 + + int* dev_idata; + int* dev_odata; + int* dev_buf; + PerformanceTimer& timer() { static PerformanceTimer timer; return timer; } + // TODO: __global__ + __global__ void kernScan(int N, int* idata, int* odata, int depth) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + if (k >= N) { + return; + } + + if (k >= 1 << (depth - 1)) { + odata[k] = idata[k - (1 << (depth - 1))] + idata[k]; + } + else { + odata[k] = idata[k]; + } + + } + + void zeroArray(int n, int* a) { + for (int i = 0; i < n; i++) { + a[i] = 0; + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { timer().startGpuTimer(); + + int arrLen; + int maxDepth = ilog2ceil(n); + maxDepth > ilog2(n) ? 
arrLen = pow(2, maxDepth) : arrLen = n; + zeroArray(arrLen, odata); + + dim3 threadsPerBlock(arrLen/blockSize); + + int* buf = new int[arrLen]; + + for (int i = 0; i < arrLen; i++) { + if (i < n) { + buf[i] = idata[i]; + } + else { + buf[i] = 0; + } + } + // TODO + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, arrLen * sizeof(int)); + cudaMalloc((void**)&dev_buf, arrLen * sizeof(int)); + + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + cudaMemcpy(dev_odata, odata, sizeof(int) * arrLen, cudaMemcpyHostToDevice); + cudaMemcpy(dev_buf, buf, sizeof(int) * arrLen, cudaMemcpyHostToDevice); + + for (int i = 1; i <= maxDepth; i++) { + kernScan << > > (arrLen, dev_buf, dev_odata, i); + cudaDeviceSynchronize(); + cudaMemcpy(dev_buf, dev_odata, sizeof(int) * arrLen, cudaMemcpyDeviceToDevice); + } + + cudaMemcpy((void**)idata, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaMemcpy(odata, dev_odata, sizeof(int) * arrLen, cudaMemcpyDeviceToHost); + cudaMemcpy(buf, dev_buf, sizeof(int) * arrLen, cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_buf); + timer().endGpuTimer(); } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..62b6cbd 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -9,11 +9,30 @@ namespace StreamCompaction { namespace Thrust { using StreamCompaction::Common::PerformanceTimer; + + int* dev_idata; + int* dev_odata; + int* dev_buf; + PerformanceTimer& timer() { static PerformanceTimer timer; return timer; } + + __global__ void toInclusive(int N, int* idata, int* odata, int* buf) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + if (k >= N) { + return; + } + if (k < N - 1) { + odata[k] = buf[k + 1]; + } + else { + odata[k] = buf[k] + idata[k]; + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ @@ -22,6 +41,21 @@ namespace StreamCompaction { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + cudaMalloc((void**)&dev_buf, n * sizeof(int)); + + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + + thrust::device_ptr dev_thrust_idata = thrust::device_ptr(dev_idata); + thrust::device_ptr dev_thrust_odata = thrust::device_ptr(dev_odata); + + thrust::inclusive_scan(dev_thrust_idata, dev_thrust_idata+n, dev_thrust_odata); + //cudaMemcpy(dev_buf, dev_odata, sizeof(int) * arrLen, cudaMemcpyDeviceToDevice); + + cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + timer().endGpuTimer(); } }