 #include "common.h"
 #include "naive.h"

+#include <iostream> // testing
+
 /*! Block size used for CUDA kernel launch. */
 #define blockSize 128
+/*! Elements per scanned section; each thread stages exactly one
+    element into shared memory, so this must equal blockSize. */
 #define sectionSize 128
@@ -16,8 +18,8 @@ namespace StreamCompaction {
             return timer;
         }

-        __global__ void kernNaiveGPUScan(int *inputArray, int *outputArray,
-            int inputSize)
+        __global__ void kernNaiveGPUScanFirstStep(int *inputArray, int *outputArray,
+            int *SumArray, int inputSize)
         {
             // Each thread loads one value from the input array into shared
             // memory array XY
@@ -52,6 +54,65 @@ namespace StreamCompaction {

             // each thread writes its result into the output array
-            outputArray[i] = XY[threadIdx.x];
+            if (i < inputSize) outputArray[i] = XY[threadIdx.x];
+
+            // the last thread in the block writes this block's total to
+            // the blockIdx.x position of SumArray
+
+            // make sure XY[sectionSize - 1] has the correct partial sum
+            __syncthreads();
+            if (threadIdx.x == blockDim.x - 1)
+            {
+                // XY holds an exclusive scan, so the block total is the last
+                // prefix plus this thread's own input element (when in range)
+                SumArray[blockIdx.x] = XY[sectionSize - 1]
+                    + ((i < inputSize) ? inputArray[i] : 0);
+            }
+        }
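+
+        // Worked example (hypothetical numbers): with sectionSize = 4 and
+        // input {3, 1, 7, 0, 4, 1, 6, 3}, block 0 writes the exclusive
+        // prefix sums {0, 3, 4, 11} and block 1 writes {0, 4, 5, 11};
+        // SumArray receives the block totals {11, 14}.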
+
+        __global__ void kernNaiveGPUScanSecondStep(int *inputArray, int *outputArray,
+            int inputSize)
+        {
+            // Each thread loads one value from the input array into shared
+            // memory array XY
+            __shared__ int XY[sectionSize];
+            int i = blockIdx.x * blockDim.x + threadIdx.x;
+            // convert the inclusive scan into an exclusive scan by shifting
+            // all elements to the right by one position, filling the first
+            // element and out-of-bounds elements with 0
+            if (i < inputSize && threadIdx.x != 0)
+            {
+                XY[threadIdx.x] = inputArray[i - 1];
+            }
+            else {
+                XY[threadIdx.x] = 0;
+            }
+            // perform naive scan
+            for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
+            {
+                // make sure that input is in place
+                __syncthreads();
+                int index = threadIdx.x;
+                int previousIndex = index - stride;
+                // clamping to 0 is safe because the exclusive shift keeps
+                // XY[0] at 0, so out-of-range threads just add 0
+                if (previousIndex < 0)
+                {
+                    previousIndex = 0;
+                }
+                int temp = XY[index] + XY[previousIndex];
+                // make sure previous output has been consumed
+                __syncthreads();
+                XY[index] = temp;
+            }
+
+            // each thread writes its result into the output array; guard
+            // the final partial block against an out-of-bounds write
+            if (i < inputSize)
+            {
+                outputArray[i] = XY[threadIdx.x];
+            }
+        }
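+
+        // Continuing the worked example: launched as a single block over
+        // SumArray = {11, 14}, this yields the exclusive block offsets
+        // {0, 11}. One block suffices only while the number of sections
+        // stays within sectionSize; larger inputs need another level.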
+
+        __global__ void kernNaiveGPUScanThirdStep(int *inputArray, int *outputArray,
+            int inputSize)
+        {
+            int i = blockIdx.x * blockDim.x + threadIdx.x;
+            // inputArray holds the exclusive scan of the block sums, so
+            // entry blockIdx.x is the total of all preceding sections
+            // (entry 0 holds 0, which lets block 0 skip the add entirely)
+            if (i < inputSize && blockIdx.x > 0)
+            {
+                outputArray[i] += inputArray[blockIdx.x];
+            }
         }
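+
+        // Completing the worked example: block 1 adds its offset of 11 to
+        // {0, 4, 5, 11}, producing the full exclusive scan
+        // {0, 3, 4, 11, 11, 15, 16, 22} of the original eight inputs.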

         /**
@@ -61,32 +122,75 @@ namespace StreamCompaction {
             int size = n * sizeof(int);
             int *d_InputData;
             int *d_OutputData;
+            // one partial sum per launched block
+            int sumArrayNumEle = (n + blockSize - 1) / blockSize;
+            int sumArraySize = sumArrayNumEle * sizeof(int);
+            int *d_SumArray;
+
+            // for testing
+            int *sumArray = new int[sumArrayNumEle];

             cudaMalloc((void**)&d_InputData, size);
             checkCUDAError("cudaMalloc d_InputData failed!");

             cudaMalloc((void**)&d_OutputData, size);
             checkCUDAError("cudaMalloc d_OutputData failed!");

+            cudaMalloc((void**)&d_SumArray, sumArraySize);
+            checkCUDAError("cudaMalloc d_SumArray failed!");
+
             cudaMemcpy(d_InputData, idata, size, cudaMemcpyHostToDevice);
             cudaMemcpy(d_OutputData, odata, size, cudaMemcpyHostToDevice);

             dim3 dimGrid((n + blockSize - 1) / blockSize, 1, 1);
             dim3 dimBlock(blockSize, 1, 1);
             timer().startGpuTimer();
-            kernNaiveGPUScan<<<dimGrid, dimBlock>>>(d_InputData,
-                d_OutputData, n);
-            checkCUDAError("kernNaiveGPUScan failed!");
+            // First step: compute the scan result for individual sections,
+            // then store their block sums to SumArray
+            kernNaiveGPUScanFirstStep<<<dimGrid, dimBlock>>>(d_InputData,
+                d_OutputData, d_SumArray, n);
+            checkCUDAError("kernNaiveGPUScanFirstStep failed!");
+#if 0
+            // extra launches of the first step, currently disabled
+            // cudaDeviceSynchronize();
+
+            kernNaiveGPUScanFirstStep<<<dimGrid, dimBlock>>>(d_InputData,
+                d_OutputData, d_SumArray, n);
+            checkCUDAError("kernNaiveGPUScanFirstStep failed!");
+
+            // cudaDeviceSynchronize();
+
+            kernNaiveGPUScanFirstStep<<<dimGrid, dimBlock>>>(d_InputData,
+                d_OutputData, d_SumArray, n);
+            checkCUDAError("kernNaiveGPUScanFirstStep failed!");
+
+            // cudaDeviceSynchronize();
+#endif
             timer().endGpuTimer();

             cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost);
             checkCUDAError("memCpy back failed!");

+            // testing: dump the block sums and the scanned output
+            cudaMemcpy(sumArray, d_SumArray, sumArraySize, cudaMemcpyDeviceToHost);
+            checkCUDAError("memCpy back failed!");
+            for (int i = 0; i < sumArrayNumEle; i++)
+            {
+                std::cout << sumArray[i] << '\n';
+            }
+            printf("\n");
+            for (int i = 0; i < n; i++)
+            {
+                std::cout << odata[i] << '\n';
+            }
+
             // cleanup
             cudaFree(d_InputData);
             cudaFree(d_OutputData);
+            cudaFree(d_SumArray);
             checkCUDAError("cudaFree failed!");
+
+            // testing cleanup
+            delete[] sumArray;
         }
     }
 }
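
The timed region above still launches only the first kernel; the second and
third steps are not wired into scan() yet. A minimal host-side sketch of the
full pipeline, reusing the buffers and launch dimensions from scan() and
assuming a hypothetical d_ScannedSums buffer for the scanned block sums
(plus sumArrayNumEle <= sectionSize, so a single block can scan them):

    int *d_ScannedSums; // assumed extra buffer, not part of this commit
    cudaMalloc((void**)&d_ScannedSums, sumArraySize);

    // Step 1: exclusive scan within each section; block totals -> d_SumArray
    kernNaiveGPUScanFirstStep<<<dimGrid, dimBlock>>>(d_InputData,
        d_OutputData, d_SumArray, n);

    // Step 2: exclusive scan of the block totals in one block
    kernNaiveGPUScanSecondStep<<<1, dimBlock>>>(d_SumArray,
        d_ScannedSums, sumArrayNumEle);

    // Step 3: add each section's preceding total to all of its elements
    kernNaiveGPUScanThirdStep<<<dimGrid, dimBlock>>>(d_ScannedSums,
        d_OutputData, n);
    checkCUDAError("naive scan pipeline failed!");

    cudaFree(d_ScannedSums);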