Skip to content

Commit 4580221

Browse files
author
Zixin Zhang
committed
1 parent e9cafcc commit 4580221

File tree

3 files changed

+59
-14
lines changed

3 files changed

+59
-14
lines changed

src/main.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
#include "testing_helpers.hpp"
1717

1818
// The tests default to an array of size 1 << 20 = 1048576
19-
const int SIZE = 1 << 16; // feel free to change the size of array
19+
const int SIZE = 1 << 20; // feel free to change the size of array
2020
const int NPOT = SIZE - 3; // Non-Power-Of-Two
2121
int *a = new int[SIZE];
2222
int *b = new int[SIZE];
@@ -101,7 +101,7 @@ int main(int argc, char* argv[]) {
101101
printCmpResult(NPOT, b, c);
102102

103103
printf("\n");
104-
104+
#if 0
105105
zeroArray(SIZE, c);
106106
printDesc("work-efficient scan, power-of-two");
107107
StreamCompaction::Efficient::scan(SIZE, c, a);
@@ -115,12 +115,12 @@ int main(int argc, char* argv[]) {
115115
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
116116
//printArray(NPOT, c, true);
117117
printCmpResult(NPOT, b, c);
118-
118+
#endif
119119
zeroArray(SIZE, c);
120120
printDesc("naive scan, power-of-two");
121121
StreamCompaction::Naive::scan(SIZE, c, a);
122122
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
123-
// printArray(SIZE, c, true);
123+
printArray(SIZE, c, false);
124124
printCmpResult(SIZE, b, c);
125125

126126
/* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
@@ -141,14 +141,14 @@ int main(int argc, char* argv[]) {
141141
printDesc("thrust scan, power-of-two");
142142
StreamCompaction::Thrust::scan(SIZE, c, a);
143143
printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
144-
printArray(SIZE, c, true);
144+
// printArray(SIZE, c, true);
145145
printCmpResult(SIZE, b, c);
146146

147147
zeroArray(SIZE, c);
148148
printDesc("thrust scan, non-power-of-two");
149149
StreamCompaction::Thrust::scan(NPOT, c, a);
150150
printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
151-
printArray(NPOT, c, true);
151+
// printArray(NPOT, c, true);
152152
printCmpResult(NPOT, b, c);
153153

154154

stream_compaction/common.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111
#include <stdexcept>
1212

1313
/*! Block size used for CUDA kernel launch. */
14-
#define blockSize 1024
15-
#define sectionSize 1024
14+
#define blockSize 256
15+
#define sectionSize 256
1616

1717
#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
1818
// usage: checkCUDAError("a descriptive name of this error")

stream_compaction/naive.cu

Lines changed: 51 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,8 @@ namespace StreamCompaction {
144144
}
145145
}
146146

147+
148+
147149
/**
148150
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
149151
*/
@@ -157,6 +159,7 @@ namespace StreamCompaction {
157159
int* d_OutputExclusiveData;
158160
int* d_SumArray;
159161
int* d_SumArrayOutput;
162+
int* d_SumArrayAx;
160163

161164
cudaMalloc((void**)&d_InputData, size);
162165
checkCUDAError("cudaMalloc d_InputData failed!");
@@ -173,6 +176,9 @@ namespace StreamCompaction {
173176
cudaMalloc((void**)&d_SumArrayOutput, sumArraySize);
174177
checkCUDAError("cudaMalloc d_SumArrayOutput failed!");
175178

179+
cudaMalloc((void**)&d_SumArrayAx, sumArraySize);
180+
checkCUDAError("cudaMalloc d_SumArrayOutput failed!");
181+
176182
cudaMemcpy(d_InputData, idata, size, cudaMemcpyHostToDevice);
177183

178184
dim3 dimGridArray((n + blockSize - 1) / blockSize, 1, 1);
@@ -181,20 +187,61 @@ namespace StreamCompaction {
181187
dim3 dimGridSumArray((sumArrayNumEle + blockSize - 1) / blockSize, 1, 1);
182188
dim3 dimBlockSumArray(blockSize, 1, 1);
183189

190+
// For testing only: host-side copies of the block-sum arrays, used for debug printing below
191+
int* sumArray = new int[sumArrayNumEle];
192+
int* sumArrayOutput = new int[sumArrayNumEle];
193+
184194
timer().startGpuTimer();
185195
// First step: compute the scan result for individual sections
186196
// then, store their block sum to sumArray
187197
kernNaiveGPUScanFirstStep << <dimGridArray, dimBlockArray >> > (d_InputData,
188198
d_OutputData, d_SumArray, n);
189199
checkCUDAError("kernNaiveGPUScanFirstStep failed!");
190200

191-
//();
201+
// cudaDeviceSynchronize();
202+
203+
cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost);
204+
checkCUDAError("memCpy back failed!");
205+
206+
cudaMemcpy(sumArray, d_SumArray, sumArraySize, cudaMemcpyDeviceToHost);
207+
checkCUDAError("memCpy back failed!");
208+
209+
std::cout << '\n';
210+
for (int i = 0; i < n; i++)
211+
{
212+
std::cout << odata[i] << ' ';
213+
if ((i + 1) % 8 == 0) {
214+
std::cout << std::endl;
215+
}
216+
}
217+
218+
std::cout << '\n';
219+
for (int i = 0; i < sumArrayNumEle; i++)
220+
{
221+
std::cout << sumArray[i] << ' ';
222+
}
223+
224+
std::cout << '\n';
192225

193226
// Second step: scan block sums
194-
kernNaiveGPUScanSecondStep << <dimGridSumArray, dimBlockSumArray >> > (
195-
d_SumArray, d_SumArrayOutput, sumArrayNumEle);
196-
checkCUDAError("kernNaiveGPUScanSecondStep failed!");
227+
kernNaiveGPUScanFirstStep << <dimGridSumArray, dimBlockSumArray >> > (d_SumArray,
228+
d_SumArrayOutput, d_SumArrayAx, n);
229+
230+
kernNaiveGPUScanThirdStep << <dimGridSumArray, dimBlockSumArray >> > (
231+
d_SumArrayAx, d_SumArrayOutput, n);
232+
233+
cudaMemcpy(sumArrayOutput, d_SumArrayOutput, sumArraySize,
234+
cudaMemcpyDeviceToHost);
235+
checkCUDAError("memCpy back failed!");
197236

237+
printf("\n");
238+
239+
for (int i = 0; i < sumArrayNumEle; i++)
240+
{
241+
std::cout << sumArrayOutput[i] << ' ';
242+
}
243+
244+
printf("\n");
198245
//cudaDeviceSynchronize();
199246

200247
// Third step: add scanned block sum i to all values of scanned block
@@ -203,8 +250,6 @@ namespace StreamCompaction {
203250
d_SumArrayOutput, d_OutputData, n);
204251
checkCUDAError("kernNaiveGPUScanThirdStep failed!");
205252

206-
// cudaDeviceSynchronize();
207-
208253
// Last step: convert the inclusive scan in d_OutputData into an exclusive scan in d_OutputExclusiveData
209254
convertFromInclusiveToExclusive << <dimGridArray, dimBlockArray >> > (
210255
d_OutputData, d_OutputExclusiveData, n);

0 commit comments

Comments
 (0)