66#include < iostream> // testing
77
88/* ! Block size used for CUDA kernel launch. */
9- #define blockSize 128
10- #define sectionSize 128
9+ #define blockSize 8
10+ #define sectionSize 8
1111
1212namespace StreamCompaction {
1313 namespace Naive {
@@ -18,16 +18,72 @@ namespace StreamCompaction {
1818 return timer;
1919 }
2020
21- __device__ void computeScanToOutputArray (const int * inputArray, int * outputArray,
22- int * XY, int inputSize)
21+ // write a kernel to convert from inclusive scan to exclusive scan
22+
23+ __global__ void convertFromInclusiveToExclusive (const int * inputArray,
24+ int * outputArray, int inputSize)
2325 {
2426 int i = blockIdx .x * blockDim .x + threadIdx .x ;
2527 // convert inclusive scan into exclusive scan by shifting
2628 // all elements to the right by one position and fill the frist
2729 // element and out-of-bound elements with 0.
28- if (i < inputSize && threadIdx . x != 0 )
30+ if (i < inputSize && i != 0 )
2931 {
30- XY[threadIdx .x ] = inputArray[i - 1 ];
32+
33+ outputArray[i] = inputArray[i - 1 ];
34+ }
35+ else {
36+ outputArray[i] = 0 ;
37+ }
38+ }
39+
40+ void unitTestConversion ()
41+ {
42+ // for testing
43+ int numObject = 8 ;
44+ int size = numObject * sizeof (int );
45+ int * toyExclusiveArray = new int [numObject];
46+ int * toyInclusiveArray = new int [numObject] {3 , 4 , 11 , 11 , 15 , 16 , 22 , 25 };
47+
48+ int * dev_toyExclusiveArray;
49+ int * dev_toyInclusiveArray;
50+
51+ cudaMalloc ((void **)&dev_toyExclusiveArray, size);
52+ checkCUDAError (" cudaMalloc dev_toyExclusiveArray failed!" );
53+
54+ cudaMalloc ((void **)&dev_toyInclusiveArray, size);
55+ checkCUDAError (" cudaMalloc dev_toyInclusiveArray failed!" );
56+
57+ cudaMemcpy (dev_toyInclusiveArray, toyInclusiveArray, size,
58+ cudaMemcpyHostToDevice);
59+
60+ dim3 dimGridArray ((numObject + blockSize - 1 ) / blockSize, 1 , 1 );
61+ dim3 dimBlockArray (blockSize, 1 , 1 );
62+ convertFromInclusiveToExclusive <<<dimGridArray, dimBlockArray >>> (
63+ dev_toyInclusiveArray, dev_toyExclusiveArray, numObject);
64+
65+ cudaMemcpy (toyExclusiveArray, dev_toyExclusiveArray, size,
66+ cudaMemcpyDeviceToHost);
67+ checkCUDAError (" memCpy back failed!" );
68+
69+ printf (" \n " );
70+
71+ for (int i = 0 ; i < numObject; i++)
72+ {
73+ std::cout << toyExclusiveArray[i] << ' \n ' ;
74+ }
75+
76+ printf (" \n " );
77+
78+ }
79+
80+ __device__ void computeScanToOutputArray (const int * inputArray, int * outputArray,
81+ int * XY, int inputSize)
82+ {
83+ int i = blockIdx .x * blockDim .x + threadIdx .x ;
84+ if (i < inputSize)
85+ {
86+ XY[threadIdx .x ] = inputArray[i];
3187 }
3288 else {
3389 XY[threadIdx .x ] = 0 ;
@@ -37,16 +93,16 @@ namespace StreamCompaction {
3793 {
3894 // make sure that input is in place
3995 __syncthreads ();
40- int index = threadIdx . x ;
41- int previousIndex = index - stride;
42- if (previousIndex < 0 )
96+ int previousValue = 0 ;
97+ int previousIndex = threadIdx . x - stride;
98+ if (previousIndex >= 0 )
4399 {
44- previousIndex = 0 ;
100+ previousValue = XY[previousIndex] ;
45101 }
46- int temp = XY[index ] + XY[previousIndex] ;
102+ int temp = XY[threadIdx . x ] + previousValue ;
47103 // make sure previous output has been consumed
48104 __syncthreads ();
49- XY[index ] = temp;
105+ XY[threadIdx . x ] = temp;
50106 }
51107
52108 // each thread writes its result into the output array
@@ -86,28 +142,28 @@ namespace StreamCompaction {
86142 int * outputArray, int inputSize)
87143 {
88144 int i = blockIdx .x * blockDim .x + threadIdx .x ;
89- if (i < inputSize && blockIdx . x > 0 )
145+ if (i < inputSize)
90146 {
91- outputArray[i] += inputArray[blockIdx .x - 1 ];
147+ outputArray[i] += inputArray[blockIdx .x ];
92148 }
93149 }
94150
95151 /* *
96152 * Performs prefix-sum (aka scan) on idata, storing the result into odata.
97153 */
98154 void scan (int n, int *odata, const int *idata) {
155+ // unitTestConversion();
99156 int size = n * sizeof (int );
100157 int sumArrayNumEle = (n + blockSize - 1 ) / blockSize;
101158 int sumArraySize = sumArrayNumEle * sizeof (int );
102159
103160 int * d_InputData;
104161 int * d_OutputData;
105162 int * d_SumArray;
106- int * d_SumArrayOutput;
163+ // int* d_SumArrayOutput;
107164
108165 // for testing
109166 int * sumArray = new int [sumArrayNumEle];
110- int * sumArrayOutput = new int [sumArrayNumEle];
111167
112168 cudaMalloc ((void **)&d_InputData, size);
113169 checkCUDAError (" cudaMalloc d_InputData failed!" );
@@ -118,15 +174,64 @@ namespace StreamCompaction {
118174 cudaMalloc ((void **)&d_SumArray, sumArraySize);
119175 checkCUDAError (" cudaMalloc d_SumArray failed!" );
120176
121- cudaMalloc ((void **)&d_SumArrayOutput, sumArraySize);
122- checkCUDAError (" cudaMalloc d_SumArrayOutput failed!" );
177+ // cudaMalloc((void**)&d_SumArrayOutput, sumArraySize);
178+ // checkCUDAError("cudaMalloc d_SumArrayOutput failed!");
123179
124180 cudaMemcpy (d_InputData, idata, size, cudaMemcpyHostToDevice);
125- cudaMemcpy (d_OutputData, odata, size, cudaMemcpyHostToDevice);
126181
127182 dim3 dimGridArray ((n + blockSize - 1 ) / blockSize, 1 , 1 );
128183 dim3 dimBlockArray (blockSize, 1 , 1 );
129184
185+ timer ().startGpuTimer ();
186+ // First step: compute the scan result for individual sections
187+ // then, store their block sum to sumArray
188+ kernNaiveGPUScanFirstStep << <dimGridArray, dimBlockArray >> > (d_InputData,
189+ d_OutputData, d_SumArray, n);
190+ checkCUDAError (" kernNaiveGPUScanFirstStep failed!" );
191+ timer ().endGpuTimer ();
192+
193+ cudaMemcpy (odata, d_OutputData, size, cudaMemcpyDeviceToHost);
194+ checkCUDAError (" memCpy back failed!" );
195+
196+ // testing:
197+ cudaMemcpy (sumArray, d_SumArray, sumArraySize, cudaMemcpyDeviceToHost);
198+ checkCUDAError (" memCpy back failed!" );
199+
200+ printf (" \n " );
201+ for (int i = 0 ; i < sumArrayNumEle; i++)
202+ {
203+ std::cout << sumArray[i] << ' \n ' ;
204+ }
205+
206+ std::cout << ' \n ' ;
207+ for (int i = 0 ; i < n; i++)
208+ {
209+ std::cout << odata[i] << ' \n ' ;
210+ }
211+
212+ // cleanup
213+ cudaFree (d_InputData);
214+ cudaFree (d_OutputData);
215+ cudaFree (d_SumArray);
216+ // cudaFree(d_SumArrayOutput);
217+ checkCUDAError (" cudaFree failed!" );
218+
219+ // testing clean up
220+ delete[] sumArray;
221+ // delete[] sumArrayOutput;
222+
223+ #if 0
224+
225+ int* sumArrayOutput = new int[sumArrayNumEle];
226+
227+
228+ dim3 dimGridSumArray((sumArrayNumEle + blockSize - 1) / blockSize, 1, 1);
229+ dim3 dimBlockSumArray(blockSize, 1, 1);
230+
231+
232+ cudaMemcpy(d_OutputData, odata, size, cudaMemcpyHostToDevice);
233+
234+
130235 dim3 dimGridSumArray((sumArrayNumEle + blockSize - 1) / blockSize, 1, 1);
131236 dim3 dimBlockSumArray(blockSize, 1, 1);
132237
@@ -137,42 +242,43 @@ namespace StreamCompaction {
137242 d_OutputData, d_SumArray, n);
138243 checkCUDAError("kernNaiveGPUScanFirstStep failed!");
139244
140- // cudaDeviceSynchronize();
245+ cudaDeviceSynchronize();
246+
247+ cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost);
248+ checkCUDAError("memCpy back failed!");
249+
250+
251+
141252
142- kernNaiveGPUScanSecondStep <<<dimGridSumArray, dimBlockSumArray >>> (
253+ kernNaiveGPUScanSecondStep << <dimGridSumArray, dimBlockSumArray >> > (
143254 d_SumArray, d_SumArrayOutput, sumArrayNumEle);
144255 checkCUDAError("kernNaiveGPUScanSecondStep failed!");
145- #if 0
146-
147256
148- kernNaiveGPUScanFirstStep << <dimGrid, dimBlock >> > (d_InputData,
149- d_OutputData, d_SumArray, n);
150- checkCUDAError("kernNaiveGPUScanFirstStep failed!");
257+ cudaDeviceSynchronize();
151258
152- // cudaDeviceSynchronize();
259+ kernNaiveGPUScanThirdStep <<<dimGridArray, dimBlockArray >>> (
260+ d_SumArrayOutput, d_OutputData, n);
261+ checkCUDAError("kernNaiveGPUScanThirdStep failed!");
153262
154- kernNaiveGPUScanFirstStep << <dimGrid, dimBlock >> > (d_InputData,
155- d_OutputData, d_SumArray, n);
156- checkCUDAError("kernNaiveGPUScanFirstStep failed!");
263+ cudaDeviceSynchronize();
157264
158- // cudaDeviceSynchronize();
159- #endif
160265 timer().endGpuTimer();
161266
162267 cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost);
163268 checkCUDAError("memCpy back failed!");
164269
165- #if 1
166- // testing:
167- cudaMemcpy (sumArray, d_SumArray, sumArraySize, cudaMemcpyDeviceToHost);
168- checkCUDAError (" memCpy back failed!" );
270+
271+
272+
169273 cudaMemcpy(sumArrayOutput, d_SumArrayOutput, sumArraySize,
170274 cudaMemcpyDeviceToHost);
171275 checkCUDAError("memCpy back failed!");
172276
277+ printf("\n");
278+
173279 for (int i = 0; i < sumArrayNumEle; i++)
174280 {
175- std::cout << sumArray [i] << ' \n ' ;
281+ std::cout << sumArrayOutput [i] << '\n';
176282 }
177283
178284 printf("\n");
@@ -182,23 +288,20 @@ namespace StreamCompaction {
182288 std::cout << sumArrayOutput[i] << '\n';
183289 }
184290 printf("\n");
185-
291+ for (int i = 0; i < n; i++)
292+ {
293+ std::cout << idata[i] << '\n';
294+ }
295+ std::cout << '\n';
186296 for (int i = 0; i < n; i++)
187297 {
188298 std::cout << odata[i] << '\n';
189299 }
190- #endif
191300
192- // cleanup
193- cudaFree (d_InputData);
194- cudaFree (d_OutputData);
195- cudaFree (d_SumArray);
196- cudaFree (d_SumArrayOutput);
197- checkCUDAError (" cudaFree failed!" );
198301
199- // testing clean up
200- delete[] sumArray;
201- delete[] sumArrayOutput;
302+
303+
304+ # endif
202305 }
203306 }
204307}
0 commit comments