diff --git a/.gitignore b/.gitignore index a59ec56..fe07d70 100644 --- a/.gitignore +++ b/.gitignore @@ -6,7 +6,7 @@ cis565_getting_started_generated_kernel* *.vcxproj *.xcodeproj build - +radix_sort # Created by https://www.gitignore.io/api/linux,osx,sublimetext,windows,jetbrains,vim,emacs,cmake,c++,cuda,visualstudio,webstorm,eclipse,xcode ### Linux ### @@ -25,7 +25,8 @@ build .LSOverride # Icon must end with two \r -Icon +Icon + # Thumbnails ._* diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..a126c9f --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,64 @@ +{ + "files.associations": { + "xlocale": "cpp", + "chrono": "cpp", + "algorithm": "cpp", + "array": "cpp", + "atomic": "cpp", + "bit": "cpp", + "cctype": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "compare": "cpp", + "concepts": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdint": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "exception": "cpp", + "functional": "cpp", + "initializer_list": "cpp", + "ios": "cpp", + "iosfwd": "cpp", + "iostream": "cpp", + "istream": "cpp", + "iterator": "cpp", + "limits": "cpp", + "list": "cpp", + "map": "cpp", + "memory": "cpp", + "mutex": "cpp", + "new": "cpp", + "ostream": "cpp", + "ratio": "cpp", + "set": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "stop_token": "cpp", + "streambuf": "cpp", + "string": "cpp", + "system_error": "cpp", + "thread": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "typeinfo": "cpp", + "unordered_map": "cpp", + "utility": "cpp", + "vector": "cpp", + "xfacet": "cpp", + "xhash": "cpp", + "xiosbase": "cpp", + "xlocinfo": "cpp", + "xlocnum": "cpp", + "xmemory": "cpp", + "xstddef": "cpp", + "xstring": "cpp", + "xtr1common": "cpp", + "xtree": "cpp", + "xutility": "cpp" + } +} \ No newline at end of file diff --git a/README.md b/README.md index 0e38ddb..fb21fdd 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,89 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Nuofan Xu +* Tested on: Windows 10, AMD Ryzen 3800x @ 3.9GHz, 2x16GB RAM, RTX 2080 Super 8GB -### (TODO: Your README) +**Overview** -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.)
+This project implements Scan and Stream Compaction and tests the performance of the various implementations across different array sizes and block sizes (all implementations support Non-Power-Of-Two input). The detailed list of implementations is shown below.
+* Scan
+ * cpu
+ * naive (gpu)
+ * work-efficient (gpu, optimized indexing)
+ * thrust (gpu)
+* Stream Compaction
+ * cpu without scan
+ * cpu with scan
+ * gpu with work-efficient scan
+
+### Introduction
+
+Stream compaction, also known as stream filtering or selection, produces a smaller output array that contains only the wanted elements of the input array for further processing. It is commonly used in applications such as path tracing, collision detection, sparse matrix compression, etc. Given the tremendous number of data elements to be filtered, the performance of the selection step is of great concern.
Modern Graphics Processing Units (GPUs) have been increasingly used to accelerate the execution of massively large, data-parallel applications, stream compaction included.
+
+An efficient parallelized stream compaction algorithm uses the scan algorithm as its backbone. Scanning converts an input array into an output array such that every position in the output array holds the result of a specified operation applied over every element before it in the input array.
+
+For example, given an input array x = {1, 3, 5, 9} and the addition operation, any element in the output array, y[i], is equal to x[0] + x[1] + ... + x[i]. This makes y = {1, 1+3, 1+3+5, 1+3+5+9} = {1, 4, 9, 18}.
+
+When the first element of the output array is simply a copy of the first element of the input array, as is the case here, this is called an Inclusive Scan. An Exclusive Scan is an inclusive scan shifted right by one element, with a '0' filled in where the first element was.
+
+### CPU Scan
+This is a simple loop over all N elements of the array that accumulates a running sum into successive output elements. The algorithm is very lean and runs in O(N) time, but in a serialized loop.
+
+### Naive GPU Scan
+![](/img/figure-39-2.jpg)
+
+The naive parallel implementation found in the solution file is essentially a Kogge-Stone adder. It is naive because it is not work-efficient (it does a relatively excessive amount of work): we simply traverse over the N elements log N times in parallel. On the first iteration, each pair of elements is summed, creating partial sums that are used in the following iterations. The total complexity is O(N log N).
+
+### Work-Efficient GPU Scan
+
+#### Upsweep
+![](/img/figure-39-4.jpg)
+
+In the upward sweep, threads collaborate to generate partial sums across the input array while traversing "upwards" in a tree-like fashion. By the end of this phase, we have partial sums leading up to the final element, which contains the sum of all values in the input array.
+
+#### Downsweep
+After generating partial sums, we can begin the downsweep phase. We first replace the last element in the array with a zero (in order to obtain an exclusive scan). We then traverse back down the tree, replacing the left child of each node with the node's current value and the right child with the sum of the old left child and the current value.
+
+The combination of upsweep and downsweep gives us an exclusive scan that runs log N steps for the upsweep and another log N steps for the downsweep. The total complexity is O(N).
+
+### BlockSize Optimization
+A preliminary optimization is done on the GPU block size parameter. Two different array lengths, 2^8 and 2^18, are used in this step. Testing shows that changing blockSize has almost no effect on the performance of the naive and work-efficient implementations of scan and stream compaction for small input arrays. For large arrays, block size does slightly affect performance, but there is no obvious pattern where purely increasing or decreasing the block size leads to a noticeable difference; rather, there seems to be a sweet spot around block sizes of 64 to 128. After consideration, a block size of 128 is used for all subsequent test results. Short code sketches of the CPU scan, the naive scan kernel, and the work-efficient up-/down-sweep kernels are given below, followed by the block-size graphs:
+
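To make the inclusive/exclusive distinction and the CPU scan concrete, here is a minimal CPU sketch. It mirrors the serial loop used in this project's `cpu.cu`; the helper names (`inclusiveScan`, `exclusiveScan`) are illustrative only.

```cpp
#include <cstdio>

// Inclusive scan: out[i] = in[0] + in[1] + ... + in[i]
void inclusiveScan(int n, int *out, const int *in) {
    int sum = 0;
    for (int i = 0; i < n; i++) {
        sum += in[i];
        out[i] = sum;                      // {1, 4, 9, 18} for the example above
    }
}

// Exclusive scan: shift right by one and fill in the identity (0) up front
void exclusiveScan(int n, int *out, const int *in) {
    out[0] = 0;
    for (int i = 1; i < n; i++) {
        out[i] = out[i - 1] + in[i - 1];   // {0, 1, 4, 9} for the example above
    }
}

int main() {
    int x[4] = {1, 3, 5, 9}, y[4];
    exclusiveScan(4, y, x);
    printf("%d %d %d %d\n", y[0], y[1], y[2], y[3]);  // prints: 0 1 4 9
    return 0;
}
```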
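The naive GPU scan can be sketched as one kernel launch per depth, modeled on the `kernScan` kernel in this project's `naive.cu` (the kernel name below is illustrative): at depth d, each element at index `i >= offset` (with `offset = 2^(d-1)`) adds in the element `offset` positions to its left, and the two buffers are ping-ponged between launches.

```cpp
// One naive scan step at a given offset (= 2^(d-1)).
// odata holds the result of the previous step; idata receives the new partial sums.
__global__ void kernNaiveScanStep(int offset, int n, const int *odata, int *idata) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index >= n) return;
    if (index >= offset) {
        idata[index] = odata[index - offset] + odata[index];
    } else {
        idata[index] = odata[index];   // elements before the offset are already final
    }
}

// Host-side loop (sketch): launch log2(n) steps, swapping buffers each time.
// The result is an inclusive scan; the host makes it exclusive by shifting
// right one element when copying back (odata[0] = 0).
//
// for (int d = 1; d <= ilog2ceil(n); d++) {
//     int offset = 1 << (d - 1);
//     kernNaiveScanStep<<<fullBlocksPerGrid, blockSize>>>(offset, n, dev_out, dev_in);
//     std::swap(dev_out, dev_in);
// }
```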
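The work-efficient scan is built from the two kernels below, closely following `kernUpSweep` and `kernDownSweep` in this project's `efficient.cu`. The array is padded to a power-of-two length `N`, and `offset` is the current tree depth `d`.

```cpp
// Up-sweep (reduce): at depth d, each active thread adds a left-subtree sum
// into its right sibling, building partial sums toward the root.
__global__ void kernUpSweep(int N, int offset, int *buffer) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index >= (N >> offset)) return;          // only N / 2^d threads do work
    int k = index << offset;
    buffer[k + (1 << offset) - 1] += buffer[k + (1 << (offset - 1)) - 1];
}

// Down-sweep: after the root is zeroed, each node passes its value to the left
// child and (old left child + value) to the right child, yielding an exclusive scan.
__global__ void kernDownSweep(int N, int offset, int *buffer) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index >= (N >> offset)) return;
    int k = index << offset;
    int tmp = buffer[k + (1 << offset) - 1];
    buffer[k + (1 << offset) - 1] += buffer[k + (1 << (offset - 1)) - 1];
    buffer[k + (1 << (offset - 1)) - 1] = tmp;
}

// Host side (sketch): run kernUpSweep for d = 1..log2(N), set buffer[N-1] = 0,
// then run kernDownSweep for d = log2(N) down to 1.
```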
+![](img/block_size.PNG)
+ + +
+![](img/block_size_big_array.PNG)
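For reference, changing blockSize only changes the kernel launch configuration; the grid size is always derived from it by rounding up, as in the project's `fullBlocksPerGrid` computation. A minimal sketch follows (the `gridFor` helper is hypothetical, introduced here just to show the rounding):

```cpp
#include <cuda_runtime.h>

#define blockSize 128  // threads per block (the value chosen above)

// Hypothetical helper: number of blocks needed to cover n elements, rounding up.
// Surplus threads in the last block exit early via the kernels' `if (idx >= n) return;` guard.
dim3 gridFor(int n) {
    return dim3((n + blockSize - 1) / blockSize);
}

// Example launch: kernMapToBoolean<<<gridFor(n), blockSize>>>(n, bools, dev_in);
```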
+
+
+### ArraySize Performance Analysis
+Investigations have also been done on array size to compare the performance of all implementations. The resulting plots are shown below. The CPU implementation is very fast for small arrays, as it has less memory overhead and fewer data transfers than the parallelized GPU versions. As the array size grows, the parallelized versions begin to show their power, with complexity O(N log N) for the naive implementation and O(N) for the work-efficient one. The speedup gained from processing array elements on many threads in parallel outweighs the cost of the memory overhead.
+
+
+
+![](img/array_length.PNG)
+ + +
+![](img/array_length_big.PNG)
+
+### Work-Efficient Scan Optimization
+Several optimizations were attempted to increase the GPU performance.
+* Reduce the number of steps that some threads need to go through.
+Not all threads need to go through every upsweep and downsweep step. Threads that are not involved at a given depth can be terminated early.
+
+* Adjust the blockSize.
+The GPU block size can be changed to allow a larger number of threads to run in the same block. No obvious effect is observed.
+
+* Reduce the number of threads that need to be launched.
+Not all threads actually do work at each depth. For example, if the input size is 1024, we need at most 512 threads instead of 1024 for the first depth (the number of nodes at that level of the addition tree is only half the array size). A condensed sketch of the corresponding launch loop is given in the appendix at the end of this README.
+
+Before these optimizations, the performance of the work-efficient scan and stream compaction is very low, even lower than the serialized O(N) CPU implementation. With the above steps, the parallelized implementations overtake the pure CPU approach at input array sizes of approximately 2^14 to 2^16.
+
+### Thrust Scan Library
+Scan and stream compaction are also implemented using the Thrust library (a minimal usage sketch is included in the appendix at the end of this README). However, the measured speed of the Thrust scan is quite slow. The reason, in my opinion, is that such libraries, Thrust especially, try to be as generic as possible, while optimization often requires specialization: a specialization of an algorithm can use shared memory for fundamental types (like int or float), but the generic version cannot. Thrust focuses on providing a generic template that can be easily used by all users and sacrifices speed for generality.
+
+### Sample Test Result
+
+Sample performance test result for an input array size of 2^8 with a blockSize of 128:
+![](img/raw_array_size_2_8.png)
+
+### Feedback
+Any feedback on errors in the above analysis, or anywhere else, is appreciated.
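### Appendix: Code Sketches

The "reduce the number of threads launched" optimization above is easiest to see in the host-side scan loop. This is a condensed sketch of the loop in `efficient.cu`; the intermediate variable `activeThreads` is introduced here only for illustration.

```cpp
// Host-side up-sweep loop (condensed from efficient.cu's scan):
// the grid shrinks as the tree gets shallower, so idle threads are
// never launched in the first place.
for (int d = 1; d <= D; d++) {
    int activeThreads = N >> d;  // only N / 2^d additions happen at this depth
    int fullBlocksPerGrid = (activeThreads + blockSize - 1) / blockSize;
    kernUpSweep<<<fullBlocksPerGrid, blockSize>>>(N, d, buffer);
}
```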
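For completeness, the Thrust path boils down to a couple of library calls. Below is a minimal standalone sketch of the same pattern used in this project's `thrust.cu`; the example input values are arbitrary.

```cpp
#include <cstdio>
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <thrust/copy.h>

int main() {
    const int n = 8;
    int idata[n] = {1, 3, 5, 9, 2, 4, 6, 8};
    int odata[n];

    // Copy input to the device, run an exclusive scan, copy the result back.
    thrust::device_vector<int> dv_in(idata, idata + n);
    thrust::device_vector<int> dv_out(n);
    thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
    thrust::copy(dv_out.begin(), dv_out.end(), odata);

    for (int i = 0; i < n; i++) printf("%d ", odata[i]);  // 0 1 4 9 18 20 24 30
    printf("\n");
    return 0;
}
```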
\ No newline at end of file diff --git a/img/array_length.PNG b/img/array_length.PNG new file mode 100644 index 0000000..eb37723 Binary files /dev/null and b/img/array_length.PNG differ diff --git a/img/array_length_big.PNG b/img/array_length_big.PNG new file mode 100644 index 0000000..736a4b5 Binary files /dev/null and b/img/array_length_big.PNG differ diff --git a/img/block_size.PNG b/img/block_size.PNG new file mode 100644 index 0000000..45455ad Binary files /dev/null and b/img/block_size.PNG differ diff --git a/img/block_size_big_array.PNG b/img/block_size_big_array.PNG new file mode 100644 index 0000000..6744c03 Binary files /dev/null and b/img/block_size_big_array.PNG differ diff --git a/img/compaction.PNG b/img/compaction.PNG new file mode 100644 index 0000000..d56ac1c Binary files /dev/null and b/img/compaction.PNG differ diff --git a/img/raw_array_size_2_10.PNG b/img/raw_array_size_2_10.PNG new file mode 100644 index 0000000..afc1316 Binary files /dev/null and b/img/raw_array_size_2_10.PNG differ diff --git a/img/raw_array_size_2_11.PNG b/img/raw_array_size_2_11.PNG new file mode 100644 index 0000000..6a4ac37 Binary files /dev/null and b/img/raw_array_size_2_11.PNG differ diff --git a/img/raw_array_size_2_14.PNG b/img/raw_array_size_2_14.PNG new file mode 100644 index 0000000..69d3789 Binary files /dev/null and b/img/raw_array_size_2_14.PNG differ diff --git a/img/raw_array_size_2_16.PNG b/img/raw_array_size_2_16.PNG new file mode 100644 index 0000000..e489a85 Binary files /dev/null and b/img/raw_array_size_2_16.PNG differ diff --git a/img/raw_array_size_2_18.PNG b/img/raw_array_size_2_18.PNG new file mode 100644 index 0000000..2c3b099 Binary files /dev/null and b/img/raw_array_size_2_18.PNG differ diff --git a/img/raw_array_size_2_20.PNG b/img/raw_array_size_2_20.PNG new file mode 100644 index 0000000..2f00267 Binary files /dev/null and b/img/raw_array_size_2_20.PNG differ diff --git a/img/raw_array_size_2_24.PNG b/img/raw_array_size_2_24.PNG new file mode 100644 index 0000000..71f1f7b Binary files /dev/null and b/img/raw_array_size_2_24.PNG differ diff --git a/img/raw_array_size_2_5.PNG b/img/raw_array_size_2_5.PNG new file mode 100644 index 0000000..4839829 Binary files /dev/null and b/img/raw_array_size_2_5.PNG differ diff --git a/img/raw_array_size_2_6.PNG b/img/raw_array_size_2_6.PNG new file mode 100644 index 0000000..52e6087 Binary files /dev/null and b/img/raw_array_size_2_6.PNG differ diff --git a/img/raw_array_size_2_7.PNG b/img/raw_array_size_2_7.PNG new file mode 100644 index 0000000..c7775dd Binary files /dev/null and b/img/raw_array_size_2_7.PNG differ diff --git a/img/raw_array_size_2_8.png b/img/raw_array_size_2_8.png new file mode 100644 index 0000000..5a6ebdd Binary files /dev/null and b/img/raw_array_size_2_8.png differ diff --git a/img/raw_array_size_2_9.PNG b/img/raw_array_size_2_9.PNG new file mode 100644 index 0000000..f7e8129 Binary files /dev/null and b/img/raw_array_size_2_9.PNG differ diff --git a/img/raw_blocksize_256.PNG b/img/raw_blocksize_256.PNG new file mode 100644 index 0000000..7a042f3 Binary files /dev/null and b/img/raw_blocksize_256.PNG differ diff --git a/img/raw_blocksize_32.PNG b/img/raw_blocksize_32.PNG new file mode 100644 index 0000000..1051ce7 Binary files /dev/null and b/img/raw_blocksize_32.PNG differ diff --git a/img/raw_blocksize_512.PNG b/img/raw_blocksize_512.PNG new file mode 100644 index 0000000..f3daa46 Binary files /dev/null and b/img/raw_blocksize_512.PNG differ diff --git a/img/raw_blocksize_64.PNG 
b/img/raw_blocksize_64.PNG new file mode 100644 index 0000000..5c94aa6 Binary files /dev/null and b/img/raw_blocksize_64.PNG differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..ae95635 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 18; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -51,7 +51,7 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan @@ -64,35 +64,35 @@ int main(int argc, char* argv[]) { printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -137,14 +137,14 @@ int main(int argc, char* argv[]) { printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); - zeroArray(SIZE, c); + zeroArray(SIZE, c); // 128 printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); + count = StreamCompaction::Efficient::compact(NPOT, c, a); // NPOT = 128 - 3 = 125 printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 
2ed6d63..fb79f11 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,12 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + if (idx >= n) { + return; + } + + bools[idx] = idata[idx] != 0; } /** @@ -32,7 +37,14 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + if (idx >= n) { + return; + } + + if (bools[idx]) { + odata[indices[idx]] = idata[idx]; + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..347a211 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -2,7 +2,7 @@ #include "cpu.h" #include "common.h" - +#include namespace StreamCompaction { namespace CPU { using StreamCompaction::Common::PerformanceTimer; @@ -18,21 +18,40 @@ namespace StreamCompaction { * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. */ void scan(int n, int *odata, const int *idata) { + if (n <= 0) return; timer().startCpuTimer(); - // TODO + // exclusive scan + odata[0] = 0; + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + idata[i - 1]; + } timer().endCpuTimer(); } + void scanNoTimer(int n, int *odata, const int *idata) { + if (n <= 0) return; + // exclusive scan + odata[0] = 0; + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + idata[i - 1]; + } + } + /** * CPU stream compaction without using the scan function. * * @returns the number of elements remaining after compaction. */ int compactWithoutScan(int n, int *odata, const int *idata) { + int k = 0; timer().startCpuTimer(); - // TODO + for (int i = 0; i < n; i++) { + if (idata[i]) { + odata[k++] = idata[i]; + } + } timer().endCpuTimer(); - return -1; + return k; } /** @@ -41,10 +60,27 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. 
*/ int compactWithScan(int n, int *odata, const int *idata) { + int *scanResults = new int[n]; + // timer starts after allocation timer().startCpuTimer(); - // TODO + + // mapping boolean function + for (int i = 0; i < n; i++) { + odata[i] = idata[i] != 0; + } + //scan + scanNoTimer(n, scanResults, odata); + //compaction + int k = 0; + for (int i = 0; i < n; i++) { + if (idata[i]) { + k++; + odata[scanResults[i]] = idata[i]; + } + } timer().endCpuTimer(); - return -1; + + delete[] scanResults; // free the scratch buffer + return k; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..da8d20b 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -2,6 +2,9 @@ #include #include "common.h" #include "efficient.h" +#include + +#define blockSize 32 namespace StreamCompaction { namespace Efficient { @@ -12,14 +15,77 @@ return timer; } + __global__ void kernUpSweep(int N, int offset, int *buffer){ + // offset: current depth of the tree + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= (N >> offset)) return; + int k = index << (offset); + buffer[k + (1 << (offset)) - 1] += buffer[k + (1 << (offset-1)) - 1]; + } + + __global__ void kernDownSweep(int N, int offset, int *buffer){ + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= (N >> offset)) return; + int k = index << offset; + int tmp = buffer[k + (1 << offset) - 1]; + buffer[k + (1 << offset) - 1] += buffer[k + (1 << (offset - 1)) - 1]; + buffer[k + (1 << (offset - 1)) - 1] = tmp; + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int *buffer, fullBlocksPerGrid; + // padded to the power of 2s and get the max depth D of the balanced tree + int D = ilog2ceil(n); + int N = 1 << D; + + // float time1, time2; + cudaMalloc((void**)&buffer, N * sizeof(int)); + cudaMemcpy(buffer, idata, n * sizeof(int), cudaMemcpyHostToDevice); timer().startGpuTimer(); - // TODO + + for (int d= 1; d <= D; d++) { + fullBlocksPerGrid = ((N >> d) + blockSize - 1) / blockSize; + kernUpSweep<<<fullBlocksPerGrid, blockSize>>>(N, d, buffer); + } + // timer().endGpuTimer(); + // time1 = timer().getGpuElapsedTimeForPreviousOperation(); + cudaMemset(buffer + N - 1, 0, sizeof(int)); + // timer().startGpuTimer(); + for (int d = D; d >= 1; d--) { + fullBlocksPerGrid = ((N >> d) + blockSize - 1) / blockSize; + kernDownSweep<<<fullBlocksPerGrid, blockSize>>>(N, d, buffer); + } timer().endGpuTimer(); - } + cudaMemcpy(odata, buffer, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(buffer); + + // time2 = timer().getGpuElapsedTimeForPreviousOperation(); + // printf("Work-Efficient compact(scan): %f ms\n", time1+time2); + } + + void scanNoTimer(int n, int *odata, const int *idata) { + int *buffer, fullBlocksPerGrid; + // padded to the power of 2s and get the max depth D of the balanced tree + int D = ilog2ceil(n); + int N = 1 << D; + cudaMalloc((void**)&buffer, N * sizeof(int)); + cudaMemcpy(buffer, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + for (int d= 1; d <= D; d++) { + fullBlocksPerGrid = ((N >> d) + blockSize - 1) / blockSize; + kernUpSweep<<<fullBlocksPerGrid, blockSize>>>(N, d, buffer); + } + cudaMemset(buffer + N - 1, 0, sizeof(int)); + for (int d = D; d >= 1; d--) { + fullBlocksPerGrid = ((N >> d) + blockSize - 1) / blockSize; + kernDownSweep<<<fullBlocksPerGrid, blockSize>>>(N, d, buffer); + } + cudaMemcpy(odata, buffer, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(buffer); + } /** * Performs stream compaction on idata, storing the result into odata. 
@@ -30,11 +96,51 @@ namespace StreamCompaction { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. */ + int compact(int n, int *odata, const int *idata) { + // Work-Efficient Compact + // float time = 0; + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + // init + int *bools, *indices, *in, *out; + // memory allocation + cudaMalloc((void**)&bools, n * sizeof(int)); + cudaMalloc((void**)&indices, n * sizeof(int)); + cudaMalloc((void**)&in, n * sizeof(int)); + cudaMalloc((void**)&out, n * sizeof(int)); + // copy to device + cudaMemcpy(in, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + StreamCompaction::Common::kernMapToBoolean<<<fullBlocksPerGrid, blockSize>>>(n, bools, in); + // timer().endGpuTimer(); + // time += timer().getGpuElapsedTimeForPreviousOperation(); + + // copy to host + cudaMemcpy(odata, bools, n * sizeof(int), cudaMemcpyDeviceToHost); + // work efficient scan + scanNoTimer(n, odata, odata); + + int lenCompacted = odata[n - 1]; + // std::cout << lenCompacted; + // lenCompacted = (1<> >(n, out, in, bools, indices); timer().endGpuTimer(); - return -1; + // time += timer().getGpuElapsedTimeForPreviousOperation(); + // printf("Work-Efficient compact(sweep): %f ms\n", time); + cudaMemcpy(odata, out, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(bools); + cudaFree(indices); + cudaFree(in); + cudaFree(out); + lenCompacted = ((1<= n) return; + if (index >= offset) { + dev_idata[index] = dev_odata[index - offset] + dev_odata[index]; + } + else { + dev_idata[index] = dev_odata[index]; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + int offset, *dev_odata, *dev_idata; + // malloc memory before timing + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMemcpy(dev_odata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + int D = ilog2ceil(n); timer().startGpuTimer(); - // TODO + // calling kernel function in for loop, will be executed in parallel + for (int d=1; d<=D;d++){ + offset = 1 << (d - 1); + kernScan<<<fullBlocksPerGrid, blockSize>>>(offset, n, dev_odata, dev_idata); + // ping pong buffer + std::swap(dev_odata, dev_idata); + } timer().endGpuTimer(); + // printf("Naive scan: %f ms\n", timer().getGpuElapsedTimeForPreviousOperation()); + cudaMemcpy(odata + 1, dev_odata, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost); + odata[0] = 0; + // free memory + cudaFree(dev_odata); + cudaFree(dev_idata); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..d2c06ce 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,16 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::device_vector<int> dv_in(idata, idata + n); + thrust::device_vector<int> dv_out(odata, odata + n); + timer().startGpuTimer(); + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); timer().endGpuTimer(); + printf("Thrust scan: %f ms\n",timer().getGpuElapsedTimeForPreviousOperation()); + thrust::copy(dv_out.begin(), dv_out.end(), odata); } } }