@@ -17,13 +17,10 @@ namespace StreamCompaction {
1717 static PerformanceTimer timer;
1818 return timer;
1919 }
20-
21- __global__ void kernNaiveGPUScanFirstStep ( int * inputArray, int * outputArray,
22- int * SumArray , int inputSize)
20+
21+ __device__ void computeScanToOutputArray ( const int * inputArray, int * outputArray,
22+ int * XY , int inputSize)
2323 {
24- // Each thread loads one value from the input array into shared
25- // memory array XY
26- __shared__ int XY[sectionSize];
2724 int i = blockIdx .x * blockDim .x + threadIdx .x ;
2825 // convert inclusive scan into exclusive scan by shifting
2926 // all elements to the right by one position and fill the frist
@@ -39,7 +36,7 @@ namespace StreamCompaction {
3936 for (unsigned int stride = 1 ; stride < blockDim .x ; stride *= 2 )
4037 {
4138 // make sure that input is in place
42- __syncthreads ();
39+ __syncthreads ();
4340 int index = threadIdx .x ;
4441 int previousIndex = index - stride;
4542 if (previousIndex < 0 )
@@ -54,6 +51,15 @@ namespace StreamCompaction {
5451
5552 // each thread writes its result into the output array
5653 outputArray[i] = XY[threadIdx .x ];
54+ }
55+
56+ __global__ void kernNaiveGPUScanFirstStep (const int * inputArray,
57+ int * outputArray, int * SumArray, int inputSize)
58+ {
59+ // Each thread loads one value from the input array into shared
60+ // memory array XY
61+ __shared__ int XY[sectionSize];
62+ computeScanToOutputArray (inputArray, outputArray, XY, inputSize);
5763
5864 // the last thread in the block should write the output value of
5965 // the last XY element in the block to the blockIdx.x position of
@@ -67,46 +73,17 @@ namespace StreamCompaction {
6773 }
6874 }
6975
70- __global__ void kernNaiveGPUScanSecondStep (int * inputArray, int * outputArray,
71- int inputSize)
76+ __global__ void kernNaiveGPUScanSecondStep (const int * inputArray,
77+ int * outputArray, int inputSize)
7278 {
7379 // Each thread loads one value from the input array into shared
7480 // memory array XY
7581 __shared__ int XY[sectionSize];
76- int i = blockIdx .x * blockDim .x + threadIdx .x ;
77- // convert inclusive scan into exclusive scan by shifting
78- // all elements to the right by one position and fill the frist
79- // element and out-of-bound elements with 0.
80- if (i < inputSize && threadIdx .x != 0 )
81- {
82- XY[threadIdx .x ] = inputArray[i - 1 ];
83- }
84- else {
85- XY[threadIdx .x ] = 0 ;
86- }
87- // perform naive scan
88- for (unsigned int stride = 1 ; stride < blockDim .x ; stride *= 2 )
89- {
90- // make sure that input is in place
91- __syncthreads ();
92- int index = threadIdx .x ;
93- int previousIndex = index - stride;
94- if (previousIndex < 0 )
95- {
96- previousIndex = 0 ;
97- }
98- int temp = XY[index] + XY[previousIndex];
99- // make sure previous output has been consumed
100- __syncthreads ();
101- XY[index] = temp;
102- }
103-
104- // each thread writes its result into the output array
105- outputArray[i] = XY[threadIdx .x ];
82+ computeScanToOutputArray (inputArray, outputArray, XY, inputSize);
10683 }
10784
108- __global__ void kernNaiveGPUScanThirdStep (int * inputArray, int * outputArray,
109- int inputSize)
85+ __global__ void kernNaiveGPUScanThirdStep (const int * inputArray,
86+ int * outputArray, int inputSize)
11087 {
11188 int i = blockIdx .x * blockDim .x + threadIdx .x ;
11289 if (i < inputSize && blockIdx .x > 0 )
@@ -141,16 +118,24 @@ namespace StreamCompaction {
141118 cudaMemcpy (d_InputData, idata, size, cudaMemcpyHostToDevice);
142119 cudaMemcpy (d_OutputData, odata, size, cudaMemcpyHostToDevice);
143120
144- dim3 dimGrid ((n + blockSize - 1 ) / blockSize, 1 , 1 );
145- dim3 dimBlock (blockSize, 1 , 1 );
121+ dim3 dimGridArray ((n + blockSize - 1 ) / blockSize, 1 , 1 );
122+ dim3 dimBlockArray (blockSize, 1 , 1 );
123+
124+ dim3 dimGridSumArray ((sumArrayNumEle + blockSize - 1 ) / blockSize, 1 , 1 );
125+ dim3 dimBlockSumArray (blockSize, 1 , 1 );
126+
146127
147128 timer ().startGpuTimer ();
148129 // First step: compute the scan result for individual sections
149130 // then, store their block sum to sumArray
150- kernNaiveGPUScanFirstStep <<<dimGrid, dimBlock >>> (d_InputData,
131+ kernNaiveGPUScanFirstStep <<<dimGridArray, dimBlockArray >>> (d_InputData,
151132 d_OutputData, d_SumArray, n);
152133 checkCUDAError (" kernNaiveGPUScanFirstStep failed!" );
134+
135+
153136#if 0
137+ kernNaiveGPUScanSecondStep << <dimGridSumArray, dimBlockSumArray >> > (
138+ sumArray, sumArray, sumArrayNumEle);
154139 // cudaDeviceSynchronize();
155140
156141 kernNaiveGPUScanFirstStep << <dimGrid, dimBlock >> > (d_InputData,
@@ -170,6 +155,7 @@ namespace StreamCompaction {
170155 cudaMemcpy (odata, d_OutputData, size, cudaMemcpyDeviceToHost);
171156 checkCUDAError (" memCpy back failed!" );
172157
158+ #if 1
173159 // testing:
174160 cudaMemcpy (sumArray, d_SumArray, sumArraySize, cudaMemcpyDeviceToHost);
175161 checkCUDAError (" memCpy back failed!" );
@@ -182,7 +168,7 @@ namespace StreamCompaction {
182168 {
183169 std::cout << odata[i] << ' \n ' ;
184170 }
185-
171+ # endif
186172
187173 // cleanup
188174 cudaFree (d_InputData);
0 commit comments