diff --git a/README.md b/README.md index 0e38ddb..9c439d5 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,187 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Wayne Wu + * [LinkedIn](https://www.linkedin.com/in/wayne-wu/), [Personal Website](https://www.wuwayne.com/) +* Tested on: Windows 10, AMD Ryzen 5 5600X @ 3.70GHz 32GB, RTX 3070 8GB (personal) -### (TODO: Your README) +## Background -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +This project examines two fundamental GPU programming algorithms, **Scan** and **Stream Compaction**. +### Scan +Scanning is the process of calculating the prefix sums of all elements in an array. Five different implementations are examined in this project: + +1. **CPU**: +Non-parallelized scanning in CPU. +2. **GPU Naive**: +Naively parallelized setup. +3. **GPU Work Efficient**: +Parallelized setup that reduces the amount of work required by breaking the scanning process into two phases: reduce phase and down-sweep phase. See `kernUpSweep()` and `kernDownSweep()` kernels for implementation details. +4. **GPU Work Efficient with Shared Memory (Extra Credit)**: +Based on the concept from 3, both reduction and down-sweep phases are now done in one kernel (`kernScan()`) using shared memory. In order to support **an array of arbitrary size**, the implementation uses the block-sum method to recursively divide the array into manageable chunks by each block and aggregate divided results together using an auxiliary array (See `recursiveScan()`). +5. **GPU using Thrust API**: +Uses the `inclusive_scan()` call from the Thrust library. + +### Stream Compaction +Stream compaction is the process of only keeping the elements in an array that satisfy a specific condition, thereby compacting the array. Three different implementations are examined in this project: + +1. **CPU without Scan**: +Non-parallelized compaction without using scan functions. +2. **CPU with Scan** +Non-parallelized compaction that uses the scan and scatter paradigm of stream compaction. +3. **GPU Work Efficient** +Parallelized setup that uses the scan and scatter paradigm of stream compaction. +The scan function used is identical to Implementation 3 of scanning (i.e. `kernUpSweep()` and `kernDownSweep()`). + +### Power of Two +All implementations are made to support both an array size that is a power of two and one that isn't. + +## Optimal Block Sizes + +We begin by optimizing the block sizes for each implementation such that all implementations are analyzed and compared in the optimized state. + +### GPU Naive: **128** +![Figure 1](img/naive-blocksize.png) + +*Figure 1. GPU Naive Implementation Block Size Comparison* + +### GPU Work-Efficient: **128** +![Figure 2](img/efficient-blocksize.png) + +*Figure 2. GPU Work-Efficient (w/o Shared Memory) Implementation Block Size Comparison* + +It should be noted here that the algorithm also dynamically shrinks/grows the block size depending on the iteration depth, such that the index value does not overflow. 128 is the optimal starting block size. + + +### GPU Work-Efficient with Shared Memory: **128** +This implementation does not use a `blockSize` variable unlike the previous two methods. +Instead, a `B` variable is used which is the number of elements that will be processed by each block. +The actual number of threads required is `B/2`. + +The optimal `B` value is 128, as all other `B` values fail at `N = 2^25`. + +## Sample Output +Below is the output of the test program used for testing and timing the different implementations. `N = 2^24`. +``` +**************** +** SCAN TESTS ** +**************** + [ 3 42 32 13 26 31 45 7 20 2 5 1 10 ... 1 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 9.7722ms (std::chrono Measured) + [ 3 45 77 90 116 147 192 199 219 221 226 227 237 ... 410841500 410841500 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 9.9391ms (std::chrono Measured) + [ 3 45 77 90 116 147 192 199 219 221 226 227 237 ... 410841449 410841488 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 9.41562ms (CUDA Measured) + [ 3 45 77 90 116 147 192 199 219 221 226 227 237 ... 410841500 410841500 ] + passed +==== naive scan, non-power-of-two ==== + elapsed time: 9.46506ms (CUDA Measured) + [ 3 45 77 90 116 147 192 199 219 221 226 227 237 ... 0 0 ] + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 4.65533ms (CUDA Measured) + [ 3 45 77 90 116 147 192 199 219 221 226 227 237 ... 410841500 410841500 ] + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 4.6632ms (CUDA Measured) + [ 3 45 77 90 116 147 192 199 219 221 226 227 237 ... 410841449 410841488 ] + passed +==== work-efficient scan (shared memory, recursive), power-of-two ==== + elapsed time: 0.965856ms (CUDA Measured) + [ 3 45 77 90 116 147 192 199 219 221 226 227 237 ... 410841500 410841500 ] + passed +==== work-efficient scan (shared memory, recursive), non-power-of-two ==== + elapsed time: 1.99728ms (CUDA Measured) + [ 3 45 77 90 116 147 192 199 219 221 226 227 237 ... 410841449 410841488 ] + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.47568ms (CUDA Measured) + [ 3 45 77 90 116 147 192 199 219 221 226 227 237 ... 410841500 410841500 ] + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.976416ms (CUDA Measured) + [ 3 45 77 90 116 147 192 199 219 221 226 227 237 ... 410841449 410841488 ] + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 0 2 2 1 0 0 3 2 2 2 1 1 3 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 24.0027ms (std::chrono Measured) + [ 2 2 1 3 2 2 2 1 1 3 1 3 2 ... 3 3 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 24.8678ms (std::chrono Measured) + [ 2 2 1 3 2 2 2 1 1 3 1 3 2 ... 3 3 ] + passed +==== cpu compact with scan ==== + elapsed time: 56.7523ms (std::chrono Measured) + [ 2 2 1 3 2 2 2 1 1 3 1 3 2 ... 3 3 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 8.44557ms (CUDA Measured) + [ 2 2 1 3 2 2 2 1 1 3 1 3 2 ... 3 3 ] + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 8.28093ms (CUDA Measured) + [ 2 2 1 3 2 2 2 1 1 3 1 3 2 ... 3 3 ] + passed +Press any key to continue . . . + +``` + +## Performance Analysis + +### Scan +![Figure 3](img/scan-performance.png) + +*Figure 3. Scan Algorithm Performance Comparison (Power of 2)* + +From Figure 3, we can observe the following: +* **GPU Naive** implementation is equally inefficient as the CPU implementation +* **GPU Work-Efficient** implementation is better than the CPU implementation. +However the trajectory of the curve still implies that it is getting exponentially slower with larger elements. +* **GPU Work-Efficient with Shared Memory** is significantly more efficient than without shared memory. +* **GPU Thrust** implementation is the fastest out of all. + +#### GPU Work-Efficient +The work-efficient algorithm (without shared memory) is faster than the naive approach as there is overall less work to do for scanning. However, the performance bottlenecks are still noticeable as they mainly come from: +1. Accessing Global Memory - there are lots of read and write operations from/to the global memory. +2. Unused Threads - there are many threads that are unused, especially when the index offset is large. +3. Memory Random Access - global memory are accessed randomly with no particular order. + +To fix these, we can: +1. Use Shared Memory - this is essentially the shared memory scanning implementation as discussed in the next section. +2. Use Warp Partitioning - ensure that all the non-active threads are under the same warp so that the warp can be deactivated automatically once the warp is no longer in use. +3. Ensure Memory Coalescing - ensure that we're accessing memory contiguously by rearranging the data array in each iteration. + +#### GPU Work-Efficient with Shared Memory +By using shared memory, the need of accessing global memory is significantly reduced. Additionally, shared memory is faster with random memory access. Therefore, problem 1 and problem 3 from above are greatly alleviated. However, with the usage of shared memory, we also introduce a bottleneck from **bank conflicts**. As such, there is still space for improvement once we remove the bank conflicts. + +#### GPU Thrust +![Figure 4](img/thrust-timeline.png) + +*Figure 4. Nsight timeline for the Thrust implementation* + +If we look at the Nsight timeline for the Thrust implementation, we can see that it calls the `cudaMemcpyAsync` function instead of `cudaMemcpy` like all other implementations. This suggests that there may be an asynchronous process going on in Thrust's implementation that copies the buffer data while another process is running, thus having some performance gain. + +#### Non Power of 2 +![Figure 5](img/scan-performance-npot.png) + +*Figure 5. Scan Algorithm Performance Comparison (Non Power of 2)* + +There is no significant performance difference between power-of-two sized versus. non-power-of-two-sized arrays. + +### Stream Compaction +![Figure 6](img/compact-performance.png) + +*Figure 5. Stream Compaction Algorithm Performance Comparison (Non Power of 2)* + +As the stream compaction algorithm uses the GPU Work-Efficient (w/o Shared Memory) scan function, +it naturally follows the same performance result. Performance gain can be achieved by alleviating the problems discussed above. diff --git a/img/compact-performance.png b/img/compact-performance.png new file mode 100644 index 0000000..6d5133b Binary files /dev/null and b/img/compact-performance.png differ diff --git a/img/efficient-blocksize.png b/img/efficient-blocksize.png new file mode 100644 index 0000000..c7e3782 Binary files /dev/null and b/img/efficient-blocksize.png differ diff --git a/img/naive-blocksize.png b/img/naive-blocksize.png new file mode 100644 index 0000000..bf7c44e Binary files /dev/null and b/img/naive-blocksize.png differ diff --git a/img/scan-performance-npot.png b/img/scan-performance-npot.png new file mode 100644 index 0000000..699b844 Binary files /dev/null and b/img/scan-performance-npot.png differ diff --git a/img/scan-performance.png b/img/scan-performance.png new file mode 100644 index 0000000..05e9215 Binary files /dev/null and b/img/scan-performance.png differ diff --git a/img/thrust-timeline.png b/img/thrust-timeline.png new file mode 100644 index 0000000..f09abac Binary files /dev/null and b/img/thrust-timeline.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..bd99952 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 24; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -51,7 +51,7 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan @@ -64,35 +64,49 @@ int main(int argc, char* argv[]) { printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan (shared memory, recursive), power-of-two"); + StreamCompaction::Efficient::recursiveScan(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan (shared memory, recursive), non-power-of-two"); + StreamCompaction::Efficient::recursiveScan(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -137,14 +151,14 @@ int main(int argc, char* argv[]) { printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..a6aabf3 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,11 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) + return; + + bools[index] = (idata[index] > 0) ? 1 : 0; } /** @@ -32,7 +36,12 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) + return; + + if (bools[index] == 1) + odata[indices[index]] = idata[index]; } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..0585831 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -30,6 +30,12 @@ inline int ilog2ceil(int x) { return x == 1 ? 0 : ilog2(x - 1) + 1; } +inline int imakepower2(int x) { + while ((x & (x - 1)) != 0) + ++x; + return x; +} + namespace StreamCompaction { namespace Common { __global__ void kernMapToBoolean(int n, int *bools, const int *idata); diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..bd988f8 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -19,7 +19,12 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + odata[0] = idata[0]; + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + idata[i]; + } + timer().endCpuTimer(); } @@ -30,9 +35,16 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + int j = 0; + for (int i = 0; i < n; i++) { + if (idata[i] > 0) { + odata[j++] = idata[i]; + } + } + timer().endCpuTimer(); - return -1; + return j; } /** @@ -41,10 +53,45 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - return -1; + + // criteria array + int* tmp = new int[n * sizeof(int)]; + int* scan_out = new int[n * sizeof(int)]; + + timer().startCpuTimer(); + + for (int i = 0; i < n; i++) { + tmp[i] = (idata[i] > 0) ? 1 : 0; + } + + // inclusive scan + scan_out[0] = tmp[0]; + for (int i = 1; i < n; i++) { + scan_out[i] = scan_out[i-1] + tmp[i]; + } + int N = scan_out[n - 1]; // total number + + // make exclusive + // shift array right + for (int i = n; i > 0; i--) { + scan_out[i] = scan_out[i-1]; + } + scan_out[0] = 0; // insert identity + + // scatter + + for (int i = 0; i < n; i++) { + if (tmp[i] > 0) { + odata[scan_out[i]] = idata[i]; + } + } + + timer().endCpuTimer(); + + free(tmp); + free(scan_out); + + return N; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..89f2ef7 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -1,8 +1,12 @@ #include #include +#include #include "common.h" #include "efficient.h" +#define blockSize 128 +#define B 128 // Number of elements processed per block + namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; @@ -12,13 +16,252 @@ namespace StreamCompaction { return timer; } + __global__ void kernUpSweep(int N, int d, int* data) { + //parallel reduction + int index = threadIdx.x + (blockIdx.x * blockDim.x); + int k = index * 2 * d; + int r = k + 2 * d - 1; + int l = k + d - 1; + + if (r < N && r >= 0) { + // right node += left node + data[r] += data[l]; + } + } + + __global__ void kernUpdateElement(int i, int* arr, int val) { + arr[i] = val; + } + + __global__ void kernDownSweep(int N, int d, int* data) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + int k = index * 2 * d; + int r = k + 2 * d - 1; + int l = k + d - 1; + + if (r < N && r >= 0) { + int t = data[l]; //left child + data[l] = data[r]; //left child = right child (current) + data[r] += t; // right child += previous left child + } + } + + __global__ void kernMakeExclusive(int N, int* data, int auxOffset) { + + extern __shared__ int tmp[]; + + int tid = threadIdx.x; + int index = threadIdx.x + (blockDim.x * blockIdx.x); + if (index >= N) + return; + + tmp[tid] = data[auxOffset + index - 1]; + + __syncthreads(); + + data[auxOffset + index] = (index > 0) ? tmp[tid] : 0; + } + + __global__ void kernMakeInclusive(int N, int* odata, int* scan_data, int* idata) { + + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + if (index == N - 1) { + odata[index] = scan_data[index] + idata[index]; + } + else { + odata[index] = scan_data[index + 1]; + } + } + + __global__ void kernAddBlockSum(int N, int* data, int* aux, int dataOffset, int auxOffset) { + int index = threadIdx.x + blockDim.x * blockIdx.x; + if (index < N) + data[dataOffset + index] += aux[auxOffset + blockIdx.x]; + } + + /** + * Performs the full scan inside a kernel instead of breaking it apart + */ + __global__ void kernScan(int N, int *data, int *aux, int dataOffset, int auxOffset) { + + extern __shared__ int temp[]; // dynamic shared array + + int tid = threadIdx.x; // [0 - B/2) + int idx = tid + blockDim.x * blockIdx.x; + int n = blockDim.x * 2; // B + + // NOTE: Is it fine to read and write to data arr in the same kernel? + temp[2 * tid] = data[dataOffset + 2 * idx]; + temp[2 * tid + 1] = data[dataOffset + 2 * idx + 1]; + + for (int d = 1; d < n; d <<= 1) { + + __syncthreads(); + + int k = tid * 2 * d; + int r = k + 2 * d - 1; + int l = k + d - 1; + if (r < n) { + temp[r] += temp[l]; + } + } + + __syncthreads(); + if (tid == blockDim.x - 1) + aux[auxOffset + blockIdx.x] = temp[n - 1]; + + __syncthreads(); + if (tid == 0) + temp[n - 1] = 0; // set root to 0 + + for (int d = n >> 1; d > 0; d >>= 1) { + + __syncthreads(); + + int k = tid * 2 * d; + int r = k + 2 * d - 1; + int l = k + d - 1; + if (r < n) { + int t = temp[l]; // left child + temp[l] = temp[r]; // left child = right child (current) + temp[r] += t; // right child += previous left child + } + } + + __syncthreads(); + + // make inclusive + if (tid == blockDim.x - 1) { + data[dataOffset + 2 * idx] = temp[2 * tid + 1]; + data[dataOffset + 2 * idx + 1] += temp[2 * tid + 1]; + } + else { + data[dataOffset + 2 * idx] = temp[2 * tid + 1]; + data[dataOffset + 2 * idx + 1] = temp[2 * tid + 2]; + } + } + + void recursiveScan(int n, int* dev_data, int* dev_aux, int dataOffset, int auxOffset, int *odata) { + int M = std::max(n / B, 1); + kernScan<<>> (M, dev_data, dev_aux, dataOffset, auxOffset); + checkCUDAErrorFn("kernScan Aux failed."); + + if (M > 1) { + recursiveScan(M, dev_aux, dev_aux, auxOffset, auxOffset + M, odata); + } + + int m = std::max(M / B, 1); + kernMakeExclusive<<>>(M, dev_aux, auxOffset); + checkCUDAErrorFn("kernMakeExclusive Aux failed."); + + kernAddBlockSum<<>>(n, dev_data, dev_aux, dataOffset, auxOffset); + checkCUDAErrorFn("kernAddBlockSum failed."); + } + + void recursiveScan(int n, int* odata, const int* idata) { + int N = imakepower2(n); + + int* dev_data; + int* dev_aux; + + cudaMalloc((void**)&dev_data, sizeof(int) * N); + checkCUDAErrorFn("dev_data malloc failed."); + cudaMalloc((void**)&dev_aux, sizeof(int) * N); // allocate maximum aux size + checkCUDAErrorFn("dev_aux malloc failed."); + + cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAErrorFn("memCpy from idata to dev_data failed."); + + cudaMemset(dev_aux, 0, sizeof(int) * N); + checkCUDAErrorFn("aux memset failed."); + + timer().startGpuTimer(); + + recursiveScan(N, dev_data, dev_aux, 0, 0, odata); + + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAErrorFn("memCpy from dev_data to odata failed."); + + cudaFree(dev_data); + cudaFree(dev_aux); + checkCUDAErrorFn("cudaFree failed."); + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + + int N = imakepower2(n); + + int* dev_data; + int* dev_data2; + int* dev_aux; + + cudaMalloc((void**)&dev_data, sizeof(int) * N); + checkCUDAErrorFn("dev_data malloc failed."); + cudaMalloc((void**)&dev_data2, sizeof(int) * N); + checkCUDAErrorFn("dev_data malloc failed."); + + cudaMalloc((void**)&dev_aux, sizeof(int) * N); // allocate maximum aux size + checkCUDAErrorFn("dev_aux malloc failed."); + + cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAErrorFn("memCpy from idata to dev_data failed."); + cudaMemcpy(dev_data2, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAErrorFn("memCpy from idata to dev_data2 failed."); + + cudaMemset(dev_aux, 0, sizeof(int) * N); + checkCUDAErrorFn("aux memset failed."); + + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + int T = blockSize; + int M = N / T; + + timer().startGpuTimer(); + + for (int d = 1; d < N; d <<= 1) { + while (T * d > N && T > 1) + T >>= 1; // shrink block size + kernUpSweep<<>>(N, d, dev_data); + M = std::max(M >> 1, 1); // divide by 2 + } + //cudaDeviceSynchronize(); + checkCUDAErrorFn("kernUpSweep failed."); + + // set last element to 0 + cudaMemset(dev_data + N - 1, 0, sizeof(int)); + checkCUDAErrorFn("memset failed."); + + M = 1; + for (int d = N >> 1; d > 0; d >>= 1) { + while (T * d < N && T < blockSize) + T <<= 1; // expand block size + kernDownSweep<<>>(N, d, dev_data); + M = std::min(M << 1, N / T); + } + //cudaDeviceSynchronize(); + checkCUDAErrorFn("kernDownSweep failed."); + + kernMakeInclusive<<>>(N, dev_data2, dev_data, dev_data2); + checkCUDAErrorFn("kernMakeInclusive failed."); + + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_data2, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAErrorFn("memCpy from dev_data to odata failed."); + + cudaFree(dev_data); + cudaFree(dev_data2); + cudaFree(dev_aux); + checkCUDAErrorFn("cudaFree failed."); } /** @@ -31,10 +274,73 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + int N = imakepower2(n); + + int* dev_idata; + int* dev_indices; + int* dev_bools; + int* dev_odata; + + cudaMalloc((void**)&dev_idata, sizeof(int) * N); + checkCUDAErrorFn("dev_idata malloc failed."); + cudaMalloc((void**)&dev_indices, sizeof(int) * N); + checkCUDAErrorFn("dev_indices malloc failed."); + cudaMalloc((void**)&dev_bools, sizeof(int) * N); + checkCUDAErrorFn("dev_bools malloc failed."); + + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + int T = blockSize; + int M = N / T; + timer().startGpuTimer(); - // TODO + + StreamCompaction::Common::kernMapToBoolean<<>>(n, dev_bools, dev_idata); + checkCUDAErrorFn("kernMapToBoolean failed."); + + cudaMemcpy(dev_indices, dev_bools, sizeof(int) * n, cudaMemcpyDeviceToDevice); + + for (int d = 1; d < N; d <<= 1) { + while (T * d > N && T > 1) + T >>= 1; // shrink block size + kernUpSweep<<>>(N, d, dev_indices); + M = std::max(M >> 1, 1); // divide by 2 + } + checkCUDAErrorFn("kernUpSweep failed."); + + // grab the size + int L; + cudaMemcpy(&L, &dev_indices[N-1], sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAErrorFn("memcpy dev_indices failed."); + cudaMalloc((void**)&dev_odata, sizeof(int) * L); + + cudaMemset(&dev_indices[N - 1], 0, sizeof(int)); + + M = 1; + for (int d = N >> 1; d > 0; d >>= 1) { + while (T * d < N && T < blockSize) + T <<= 1; // expand block size + kernDownSweep<<>>(N, d, dev_indices); + M = std::min(M << 1, N / T); + } + checkCUDAErrorFn("kernDownSweep failed."); + + StreamCompaction::Common::kernScatter<<>>(N, dev_odata, dev_idata, dev_bools, dev_indices); + checkCUDAErrorFn("kernScatter failed."); + timer().endGpuTimer(); - return -1; + + cudaMemcpy(odata, dev_odata, sizeof(int) * L, cudaMemcpyDeviceToHost); + checkCUDAErrorFn("memCpy from dev_data to odata failed."); + + cudaFree(dev_bools); + cudaFree(dev_idata); + cudaFree(dev_indices); + cudaFree(dev_odata); + + return L; } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..432f0ed 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -6,6 +6,8 @@ namespace StreamCompaction { namespace Efficient { StreamCompaction::Common::PerformanceTimer& timer(); + void recursiveScan(int n, int* odata, const int* idata); + void scan(int n, int *odata, const int *idata); int compact(int n, int *odata, const int *idata); diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..2d91428 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,6 +3,8 @@ #include "common.h" #include "naive.h" +#define blockSize 128 + namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; @@ -11,15 +13,68 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + + int* dev_odata; + int* dev_idata; + + __global__ void kernPrefixSum(int n, int offset, int *out, const int *in) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + out[index] = in[index]; + if (index >= offset) + out[index] += in[index - offset]; + } + + __global__ void kernShiftRight(int n, int* odata, const int* idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) + return; + + // make exclusive + odata[index] = (index > 0) ? idata[index - 1] : 0; + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + + // TODO: see if there's better way to force power of 2 + int N = imakepower2(n); + + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + cudaMalloc((void **)&dev_odata, sizeof(int) * N); + checkCUDAErrorFn("dev_odata malloc failed."); + cudaMalloc((void **)&dev_idata, sizeof(int) * N); + checkCUDAErrorFn("dev_idata malloc failed."); + + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAErrorFn("memCpy idata to dev_idata failed."); + + timer().startGpuTimer(); + + for (int d = 1; d < N; d *= 2) { + // Copy dev_odata to dev_idata (to be used as input for the next iteration) + kernPrefixSum<<>>(N, d, dev_odata, dev_idata); + + int* tmp = dev_idata; + dev_idata = dev_odata; + dev_odata = tmp; + } + + timer().endGpuTimer(); + checkCUDAErrorFn("kernPrefixSum failed."); + + cudaMemcpy(odata, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAErrorFn("memCpy dev_odata1 to odata failed."); + + cudaFree(dev_odata); + cudaFree(dev_idata); + checkCUDAErrorFn("cudaFree failed."); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..cb33e2c 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,23 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + + int N = imakepower2(n); + + thrust::device_vector dv_in(idata, idata + N); + thrust::device_vector dv_out(N); + timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::inclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); timer().endGpuTimer(); + + thrust::host_vector h_out = dv_out; + + // TODO: not sure why I can't just do odata = &h_out[0]; + for (int i = 0; i < N; i++) + odata[i] = h_out[i]; } } }