Commit a5cae91

Author: Zixin Zhang
Message: Tested kernNaiveGPUScanFirstStep with power-of-two and non-power-of-two input sizes
Parent: 37835f7

2 files changed: 113 additions, 7 deletions

src/main.cpp (4 additions, 2 deletions)

@@ -80,6 +80,7 @@ int main(int argc, char* argv[]) {
     a[SIZE - 1] = 0;
     printArray(SIZE, a, true);
 
+#if 0
     // initialize b using StreamCompaction::CPU::scan you implement
     // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct.
     // At first all cases passed because b && c are all zeroes.
@@ -110,7 +111,7 @@ int main(int argc, char* argv[]) {
 
     printf("\n");
 
-
+#endif
     zeroArray(SIZE, c);
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
@@ -162,6 +163,7 @@ int main(int argc, char* argv[]) {
 
 #endif
 
+#if 0
     printf("\n");
     printf("*****************************\n");
     printf("** STREAM COMPACTION TESTS **\n");
@@ -200,7 +202,7 @@ int main(int argc, char* argv[]) {
     printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
 
-#if 0
+
    zeroArray(SIZE, c);
    printDesc("work-efficient compact, power-of-two");
    count = StreamCompaction::Efficient::compact(SIZE, c, a);
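The block now under #if 0 is what fills b with the CPU reference scan used by the later comparisons; with it disabled, the naive-scan output has to be checked by eye, which matches the debugging prints added in naive.cu below. For reference, the expected result is an exclusive prefix sum; a minimal sketch of what StreamCompaction::CPU::scan computes (hypothetical helper, not repo code; the actual implementation may differ in details):

// Minimal sketch of an exclusive prefix sum, i.e. the reference that
// array b is normally filled with.
void cpuExclusiveScan(int n, int* odata, const int* idata) {
    int acc = 0;
    for (int i = 0; i < n; i++) {
        odata[i] = acc;      // element i gets the sum of idata[0 .. i-1]
        acc += idata[i];
    }
}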

stream_compaction/naive.cu (109 additions, 5 deletions)
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
 #include "common.h"
 #include "naive.h"
 
+#include <iostream> // testing
+
 /*! Block size used for CUDA kernel launch. */
 #define blockSize 128
 #define sectionSize 128
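A note on the two constants: blockSize configures the kernel launches while sectionSize sizes the per-block shared array XY, which the kernels index with threadIdx.x. That is only safe while sectionSize >= blockSize, so a compile-time guard is cheap insurance (editorial suggestion, not part of the commit):

// Hypothetical guard, not in this commit: XY[sectionSize] is indexed
// by threadIdx.x, which ranges over [0, blockSize).
static_assert(sectionSize >= blockSize, "shared array XY must cover a full block");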
@@ -16,8 +18,8 @@ namespace StreamCompaction {
             return timer;
         }
 
-        __global__ void kernNaiveGPUScan(int* inputArray, int* outputArray,
-            int inputSize)
+        __global__ void kernNaiveGPUScanFirstStep(int* inputArray, int* outputArray,
+            int* SumArray, int inputSize)
         {
             // Each thread loads one value from the input array into shared
             // memory array XY
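The new SumArray parameter is what turns this kernel into step one of a hierarchical scan-then-propagate scheme; steps two and three are added in the next hunk. Each block scans its own section and records its section total, the totals are scanned in turn, and the scanned totals are folded back into the sections. A CPU model of the idea (editorial sketch with a section width of 4 instead of sectionSize = 128; the kernels' exact inclusive/exclusive conventions are elided here):

#include <cstdio>

// CPU model of the three-step hierarchical scan, section width 4.
int main() {
    const int W = 4, n = 8;
    int in[n] = { 3, 1, 7, 0, 4, 1, 6, 3 };
    int out[n], sums[n / W];

    // Step 1 (cf. kernNaiveGPUScanFirstStep): scan each section and
    // record each section's total, like SumArray.
    for (int b = 0; b < n / W; b++) {
        int acc = 0;
        for (int j = 0; j < W; j++) {
            out[b * W + j] = acc;
            acc += in[b * W + j];
        }
        sums[b] = acc;
    }
    // Step 2: inclusive scan of the block sums.
    for (int b = 1; b < n / W; b++) sums[b] += sums[b - 1];
    // Step 3 (cf. kernNaiveGPUScanThirdStep): each later block adds the
    // total of everything before it.
    for (int b = 1; b < n / W; b++)
        for (int j = 0; j < W; j++) out[b * W + j] += sums[b - 1];

    for (int j = 0; j < n; j++) printf("%d ", out[j]); // 0 3 4 11 11 15 16 22
    printf("\n");
    return 0;
}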
@@ -52,6 +54,65 @@
 
             // each thread writes its result into the output array
             outputArray[i] = XY[threadIdx.x];
+
+            // the last thread in the block should write the output value of
+            // the last XY element in the block to the blockIdx.x position of
+            // SumArray
+
+            // make sure XY[sectionSize - 1] has the correct partial sum
+            __syncthreads();
+            if (threadIdx.x == blockDim.x - 1)
+            {
+                SumArray[blockIdx.x] = XY[sectionSize - 1];
+            }
+        }
+
+        __global__ void kernNaiveGPUScanSecondStep(int* inputArray, int* outputArray,
+            int inputSize)
+        {
+            // Each thread loads one value from the input array into shared
+            // memory array XY
+            __shared__ int XY[sectionSize];
+            int i = blockIdx.x * blockDim.x + threadIdx.x;
+            // convert the inclusive scan into an exclusive scan by shifting
+            // all elements one position to the right, filling the first
+            // element and out-of-bound elements with 0
+            if (i < inputSize && threadIdx.x != 0)
+            {
+                XY[threadIdx.x] = inputArray[i - 1];
+            }
+            else {
+                XY[threadIdx.x] = 0;
+            }
+            // perform naive scan
+            for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
+            {
+                // make sure that input is in place
+                __syncthreads();
+                int index = threadIdx.x;
+                int previousIndex = index - stride;
+                if (previousIndex < 0)
+                {
+                    previousIndex = 0;
+                }
+                int temp = XY[index] + XY[previousIndex];
+                // make sure previous output has been consumed
+                __syncthreads();
+                XY[index] = temp;
+            }
+
+            // each thread writes its result into the output array
+            outputArray[i] = XY[threadIdx.x];
+        }
+
+        __global__ void kernNaiveGPUScanThirdStep(int* inputArray, int* outputArray,
+            int inputSize)
+        {
+            int i = blockIdx.x * blockDim.x + threadIdx.x;
+            if (i < inputSize && blockIdx.x > 0)
+            {
+                outputArray[i] += inputArray[blockIdx.x - 1];
+            }
         }
 
         /**
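A note on how the pieces compose: kernNaiveGPUScanThirdStep adds inputArray[blockIdx.x - 1] to every element of block b > 0, so its inputArray must hold the inclusive scan of the block sums, where entry b - 1 is the total of blocks 0 through b - 1. kernNaiveGPUScanSecondStep produces an exclusive scan, so feeding its output of SumArray straight into the third step would drop one block's contribution. Also, both scan kernels write outputArray[i] without checking i < inputSize, so the last block of a non-power-of-two launch writes past the allocation; the same i < inputSize guard used on the loads would fix that. For the missing middle step, a single-block inclusive scan of the block sums suffices; a hedged sketch (hypothetical kernel, not in this commit), assuming the number of blocks fits in one section:

__global__ void kernScanBlockSums(int* SumArray, int numBlockSums)
{
    __shared__ int XY[sectionSize];
    int i = threadIdx.x;
    // load the block sums; pad the tail of the section with zeros
    XY[i] = (i < numBlockSums) ? SumArray[i] : 0;
    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
    {
        __syncthreads();   // previous round fully written
        int temp = (i >= (int)stride) ? XY[i] + XY[i - stride] : XY[i];
        __syncthreads();   // previous values fully consumed
        XY[i] = temp;
    }
    if (i < numBlockSums)
    {
        SumArray[i] = XY[i];   // in-place inclusive scan result
    }
}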
@@ -61,32 +122,75 @@
             int size = n * sizeof(int);
             int* d_InputData;
             int* d_OutputData;
+            int sumArrayNumEle = (n + blockSize - 1) / blockSize;
+            int sumArraySize = sumArrayNumEle * sizeof(int);
+            int* d_SumArray;
+
+            // for testing
+            int* sumArray = new int[sumArrayNumEle];
 
             cudaMalloc((void**)&d_InputData, size);
             checkCUDAError("cudaMalloc d_InputData failed!");
 
             cudaMalloc((void**)&d_OutputData, size);
             checkCUDAError("cudaMalloc d_OutputData failed!");
 
+            cudaMalloc((void**)&d_SumArray, sumArraySize);
+            checkCUDAError("cudaMalloc d_SumArray failed!");
+
             cudaMemcpy(d_InputData, idata, size, cudaMemcpyHostToDevice);
             cudaMemcpy(d_OutputData, odata, size, cudaMemcpyHostToDevice);
 
             dim3 dimGrid((n + blockSize - 1) / blockSize, 1, 1);
             dim3 dimBlock(blockSize, 1, 1);
 
             timer().startGpuTimer();
-            kernNaiveGPUScan <<<dimGrid, dimBlock>>> (d_InputData,
-                d_OutputData, n);
-            checkCUDAError("kernNaiveGPUScan failed!");
+            // First step: compute the scan result for the individual sections,
+            // then store each block's sum into SumArray
+            kernNaiveGPUScanFirstStep <<<dimGrid, dimBlock>>> (d_InputData,
+                d_OutputData, d_SumArray, n);
+            checkCUDAError("kernNaiveGPUScanFirstStep failed!");
+#if 0
+            // cudaDeviceSynchronize();
+
+            kernNaiveGPUScanFirstStep <<<dimGrid, dimBlock>>> (d_InputData,
+                d_OutputData, d_SumArray, n);
+            checkCUDAError("kernNaiveGPUScanFirstStep failed!");
+
+            // cudaDeviceSynchronize();
+
+            kernNaiveGPUScanFirstStep <<<dimGrid, dimBlock>>> (d_InputData,
+                d_OutputData, d_SumArray, n);
+            checkCUDAError("kernNaiveGPUScanFirstStep failed!");
+
+            // cudaDeviceSynchronize();
+#endif
             timer().endGpuTimer();
 
             cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost);
             checkCUDAError("memCpy back failed!");
 
+            // testing: print the block sums and the scanned output
+            cudaMemcpy(sumArray, d_SumArray, sumArraySize, cudaMemcpyDeviceToHost);
+            checkCUDAError("memCpy back failed!");
+            for (int i = 0; i < sumArrayNumEle; i++)
+            {
+                std::cout << sumArray[i] << '\n';
+            }
+            printf("\n");
+            for (int i = 0; i < n; i++)
+            {
+                std::cout << odata[i] << '\n';
+            }
+
+
             // cleanup
             cudaFree(d_InputData);
             cudaFree(d_OutputData);
             checkCUDAError("cudaFree failed!");
+
+            // testing clean up
+            delete[] sumArray;
         }
     }
 }
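With a middle step in place, the timed region would chain the three launches roughly as below (editorial sketch reusing the hypothetical kernScanBlockSums above; this commit intentionally stops after the first step and prints SumArray and odata to verify it). One host-side item also remains: d_SumArray is allocated but never freed, so the cleanup needs a matching cudaFree(d_SumArray).

timer().startGpuTimer();
// Step 1: per-section scans; each block's total lands in d_SumArray.
kernNaiveGPUScanFirstStep<<<dimGrid, dimBlock>>>(d_InputData,
    d_OutputData, d_SumArray, n);
// Step 2: inclusive scan of the block sums in a single block
// (assumes sumArrayNumEle <= sectionSize; otherwise recurse).
kernScanBlockSums<<<1, sectionSize>>>(d_SumArray, sumArrayNumEle);
// Step 3: fold the preceding blocks' running totals back into each section.
kernNaiveGPUScanThirdStep<<<dimGrid, dimBlock>>>(d_SumArray,
    d_OutputData, n);
timer().endGpuTimer();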
