@@ -97,14 +97,17 @@ namespace StreamCompaction {
9797 */
9898 void scan (int n, int *odata, const int *idata) {
9999 int size = n * sizeof (int );
100- int * d_InputData;
101- int * d_OutputData;
102100 int sumArrayNumEle = (n + blockSize - 1 ) / blockSize;
103101 int sumArraySize = sumArrayNumEle * sizeof (int );
102+
103+ int * d_InputData;
104+ int * d_OutputData;
104105 int * d_SumArray;
106+ int * d_SumArrayOutput;
105107
106108 // for testing
107109 int * sumArray = new int [sumArrayNumEle];
110+ int * sumArrayOutput = new int [sumArrayNumEle];
108111
109112 cudaMalloc ((void **)&d_InputData, size);
110113 checkCUDAError (" cudaMalloc d_InputData failed!" );
@@ -115,6 +118,9 @@ namespace StreamCompaction {
115118 cudaMalloc ((void **)&d_SumArray, sumArraySize);
116119 checkCUDAError (" cudaMalloc d_SumArray failed!" );
117120
121+ cudaMalloc ((void **)&d_SumArrayOutput, sumArraySize);
122+ checkCUDAError (" cudaMalloc d_SumArrayOutput failed!" );
123+
118124 cudaMemcpy (d_InputData, idata, size, cudaMemcpyHostToDevice);
119125 cudaMemcpy (d_OutputData, odata, size, cudaMemcpyHostToDevice);
120126
@@ -124,20 +130,21 @@ namespace StreamCompaction {
124130 dim3 dimGridSumArray ((sumArrayNumEle + blockSize - 1 ) / blockSize, 1 , 1 );
125131 dim3 dimBlockSumArray (blockSize, 1 , 1 );
126132
127-
128133 timer ().startGpuTimer ();
129134 // First step: compute the scan result for individual sections
130135 // then, store their block sum to sumArray
131136 kernNaiveGPUScanFirstStep <<<dimGridArray, dimBlockArray >>> (d_InputData,
132137 d_OutputData, d_SumArray, n);
133138 checkCUDAError (" kernNaiveGPUScanFirstStep failed!" );
134139
135-
136- #if 0
137- kernNaiveGPUScanSecondStep << <dimGridSumArray, dimBlockSumArray >> > (
138- sumArray, sumArray, sumArrayNumEle);
139140 // cudaDeviceSynchronize();
140141
142+ kernNaiveGPUScanSecondStep <<<dimGridSumArray, dimBlockSumArray >>> (
143+ d_SumArray, d_SumArrayOutput, sumArrayNumEle);
144+ checkCUDAError (" kernNaiveGPUScanSecondStep failed!" );
145+ #if 0
146+
147+
141148 kernNaiveGPUScanFirstStep << <dimGrid, dimBlock >> > (d_InputData,
142149 d_OutputData, d_SumArray, n);
143150 checkCUDAError("kernNaiveGPUScanFirstStep failed!");
@@ -159,11 +166,23 @@ namespace StreamCompaction {
159166 // testing:
160167 cudaMemcpy (sumArray, d_SumArray, sumArraySize, cudaMemcpyDeviceToHost);
161168 checkCUDAError (" memCpy back failed!" );
169+ cudaMemcpy (sumArrayOutput, d_SumArrayOutput, sumArraySize,
170+ cudaMemcpyDeviceToHost);
171+ checkCUDAError (" memCpy back failed!" );
172+
162173 for (int i = 0 ; i < sumArrayNumEle; i++)
163174 {
164175 std::cout << sumArray[i] << ' \n ' ;
165176 }
177+
166178 printf (" \n " );
179+
180+ for (int i = 0 ; i < sumArrayNumEle; i++)
181+ {
182+ std::cout << sumArrayOutput[i] << ' \n ' ;
183+ }
184+ printf (" \n " );
185+
167186 for (int i = 0 ; i < n; i++)
168187 {
169188 std::cout << odata[i] << ' \n ' ;
@@ -173,10 +192,13 @@ namespace StreamCompaction {
173192 // cleanup
174193 cudaFree (d_InputData);
175194 cudaFree (d_OutputData);
195+ cudaFree (d_SumArray);
196+ cudaFree (d_SumArrayOutput);
176197 checkCUDAError (" cudaFree failed!" );
177198
178199 // testing clean up
179200 delete[] sumArray;
201+ delete[] sumArrayOutput;
180202 }
181203 }
182204}
0 commit comments