@@ -144,6 +144,8 @@ namespace StreamCompaction {
144144 }
145145 }
146146
147+
148+
147149 /* *
148150 * Performs prefix-sum (aka scan) on idata, storing the result into odata.
149151 */
@@ -157,6 +159,7 @@ namespace StreamCompaction {
157159 int * d_OutputExclusiveData;
158160 int * d_SumArray;
159161 int * d_SumArrayOutput;
162+ int * d_SumArrayAx;
160163
161164 cudaMalloc ((void **)&d_InputData, size);
162165 checkCUDAError (" cudaMalloc d_InputData failed!" );
@@ -173,6 +176,9 @@ namespace StreamCompaction {
173176 cudaMalloc ((void **)&d_SumArrayOutput, sumArraySize);
174177 checkCUDAError (" cudaMalloc d_SumArrayOutput failed!" );
175178
179+ cudaMalloc ((void **)&d_SumArrayAx, sumArraySize);
180+ checkCUDAError (" cudaMalloc d_SumArrayOutput failed!" );
181+
176182 cudaMemcpy (d_InputData, idata, size, cudaMemcpyHostToDevice);
177183
178184 dim3 dimGridArray ((n + blockSize - 1 ) / blockSize, 1 , 1 );
@@ -181,20 +187,61 @@ namespace StreamCompaction {
181187 dim3 dimGridSumArray ((sumArrayNumEle + blockSize - 1 ) / blockSize, 1 , 1 );
182188 dim3 dimBlockSumArray (blockSize, 1 , 1 );
183189
190+ // for testing
191+ int * sumArray = new int [sumArrayNumEle];
192+ int * sumArrayOutput = new int [sumArrayNumEle];
193+
184194 timer ().startGpuTimer ();
185195 // First step: compute the scan result for individual sections
186196 // then, store their block sum to sumArray
187197 kernNaiveGPUScanFirstStep << <dimGridArray, dimBlockArray >> > (d_InputData,
188198 d_OutputData, d_SumArray, n);
189199 checkCUDAError (" kernNaiveGPUScanFirstStep failed!" );
190200
191- // ();
201+ // cudaDeviceSynchronize();
202+
203+ cudaMemcpy (odata, d_OutputData, size, cudaMemcpyDeviceToHost);
204+ checkCUDAError (" memCpy back failed!" );
205+
206+ cudaMemcpy (sumArray, d_SumArray, sumArraySize, cudaMemcpyDeviceToHost);
207+ checkCUDAError (" memCpy back failed!" );
208+
209+ std::cout << ' \n ' ;
210+ for (int i = 0 ; i < n; i++)
211+ {
212+ std::cout << odata[i] << ' ' ;
213+ if ((i + 1 ) % 8 == 0 ) {
214+ std::cout << std::endl;
215+ }
216+ }
217+
218+ std::cout << ' \n ' ;
219+ for (int i = 0 ; i < sumArrayNumEle; i++)
220+ {
221+ std::cout << sumArray[i] << ' ' ;
222+ }
223+
224+ std::cout << ' \n ' ;
192225
193226 // Second step: scan block sums
194- kernNaiveGPUScanSecondStep << <dimGridSumArray, dimBlockSumArray >> > (
195- d_SumArray, d_SumArrayOutput, sumArrayNumEle);
196- checkCUDAError (" kernNaiveGPUScanSecondStep failed!" );
227+ kernNaiveGPUScanFirstStep << <dimGridSumArray, dimBlockSumArray >> > (d_SumArray,
228+ d_SumArrayOutput, d_SumArrayAx, n);
229+
230+ kernNaiveGPUScanThirdStep << <dimGridSumArray, dimBlockSumArray >> > (
231+ d_SumArrayAx, d_SumArrayOutput, n);
232+
233+ cudaMemcpy (sumArrayOutput, d_SumArrayOutput, sumArraySize,
234+ cudaMemcpyDeviceToHost);
235+ checkCUDAError (" memCpy back failed!" );
197236
237+ printf (" \n " );
238+
239+ for (int i = 0 ; i < sumArrayNumEle; i++)
240+ {
241+ std::cout << sumArrayOutput[i] << ' ' ;
242+ }
243+
244+ printf (" \n " );
198245 // cudaDeviceSynchronize();
199246
200247 // Third step: add scanned block sum i to all values of scanned block
@@ -203,8 +250,6 @@ namespace StreamCompaction {
203250 d_SumArrayOutput, d_OutputData, n);
204251 checkCUDAError (" kernNaiveGPUScanThirdStep failed!" );
205252
206- // cudaDeviceSynchronize();
207-
208253 // Last step:
209254 convertFromInclusiveToExclusive << <dimGridArray, dimBlockArray >> > (
210255 d_OutputData, d_OutputExclusiveData, n);
0 commit comments