Skip to content

Commit 5f90d77

Browse files
author
Zixin Zhang
committed
refactor and const correctness
1 parent a5cae91 commit 5f90d77

File tree

2 files changed

+37
-48
lines changed

2 files changed

+37
-48
lines changed

src/main.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
#include "testing_helpers.hpp"
1717

1818
// The tests default to an array of size 1 << 8 = 256
19-
const int SIZE = 1 << 8; // feel free to change the size of array
19+
const int SIZE = 1 << 9; // feel free to change the size of array
2020
const int NPOT = SIZE - 3; // Non-Power-Of-Two
2121
int *a = new int[SIZE];
2222
int *b = new int[SIZE];
@@ -112,14 +112,14 @@ int main(int argc, char* argv[]) {
112112
printf("\n");
113113

114114
#endif
115+
115116
zeroArray(SIZE, c);
116117
printDesc("naive scan, power-of-two");
117118
StreamCompaction::Naive::scan(SIZE, c, a);
118119
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
119120
printArray(SIZE, c, true);
120121
printCmpResult(SIZE, b, c);
121-
122-
#if 0
122+
123123
/* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
124124
onesArray(SIZE, c);
125125
printDesc("1s array for finding bugs");
@@ -133,6 +133,9 @@ int main(int argc, char* argv[]) {
133133
//printArray(SIZE, c, true);
134134
printCmpResult(NPOT, b, c);
135135

136+
#if 0
137+
138+
136139
zeroArray(SIZE, c);
137140
printDesc("work-efficient scan, power-of-two");
138141
StreamCompaction::Efficient::scan(SIZE, c, a);

stream_compaction/naive.cu

Lines changed: 31 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,10 @@ namespace StreamCompaction {
1717
static PerformanceTimer timer;
1818
return timer;
1919
}
20-
21-
__global__ void kernNaiveGPUScanFirstStep(int* inputArray, int* outputArray,
22-
int* SumArray, int inputSize)
20+
21+
__device__ void computeScanToOutputArray(const int* inputArray, int* outputArray,
22+
int* XY, int inputSize)
2323
{
24-
// Each thread loads one value from the input array into shared
25-
// memory array XY
26-
__shared__ int XY[sectionSize];
2724
int i = blockIdx.x * blockDim.x + threadIdx.x;
2825
// convert inclusive scan into exclusive scan by shifting
2926
// all elements to the right by one position and fill the frist
@@ -39,7 +36,7 @@ namespace StreamCompaction {
3936
for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
4037
{
4138
// make sure that input is in place
42-
__syncthreads();
39+
__syncthreads();
4340
int index = threadIdx.x;
4441
int previousIndex = index - stride;
4542
if (previousIndex < 0)
@@ -54,6 +51,15 @@ namespace StreamCompaction {
5451

5552
// each thread writes its result into the output array
5653
outputArray[i] = XY[threadIdx.x];
54+
}
55+
56+
__global__ void kernNaiveGPUScanFirstStep(const int* inputArray,
57+
int* outputArray, int* SumArray, int inputSize)
58+
{
59+
// Each thread loads one value from the input array into shared
60+
// memory array XY
61+
__shared__ int XY[sectionSize];
62+
computeScanToOutputArray(inputArray, outputArray, XY, inputSize);
5763

5864
// the last thread in the block should write the output value of
5965
// the last XY element in the block to the blockIdx.x position of
@@ -67,46 +73,17 @@ namespace StreamCompaction {
6773
}
6874
}
6975

70-
__global__ void kernNaiveGPUScanSecondStep(int* inputArray, int* outputArray,
71-
int inputSize)
76+
__global__ void kernNaiveGPUScanSecondStep(const int* inputArray,
77+
int* outputArray, int inputSize)
7278
{
7379
// Each thread loads one value from the input array into shared
7480
// memory array XY
7581
__shared__ int XY[sectionSize];
76-
int i = blockIdx.x * blockDim.x + threadIdx.x;
77-
// convert inclusive scan into exclusive scan by shifting
78-
// all elements to the right by one position and fill the frist
79-
// element and out-of-bound elements with 0.
80-
if (i < inputSize && threadIdx.x != 0)
81-
{
82-
XY[threadIdx.x] = inputArray[i - 1];
83-
}
84-
else {
85-
XY[threadIdx.x] = 0;
86-
}
87-
// perform naive scan
88-
for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
89-
{
90-
// make sure that input is in place
91-
__syncthreads();
92-
int index = threadIdx.x;
93-
int previousIndex = index - stride;
94-
if (previousIndex < 0)
95-
{
96-
previousIndex = 0;
97-
}
98-
int temp = XY[index] + XY[previousIndex];
99-
// make sure previous output has been consumed
100-
__syncthreads();
101-
XY[index] = temp;
102-
}
103-
104-
// each thread writes its result into the output array
105-
outputArray[i] = XY[threadIdx.x];
82+
computeScanToOutputArray(inputArray, outputArray, XY, inputSize);
10683
}
10784

108-
__global__ void kernNaiveGPUScanThirdStep(int* inputArray, int* outputArray,
109-
int inputSize)
85+
__global__ void kernNaiveGPUScanThirdStep(const int* inputArray,
86+
int* outputArray, int inputSize)
11087
{
11188
int i = blockIdx.x * blockDim.x + threadIdx.x;
11289
if (i < inputSize && blockIdx.x > 0)
@@ -141,16 +118,24 @@ namespace StreamCompaction {
141118
cudaMemcpy(d_InputData, idata, size, cudaMemcpyHostToDevice);
142119
cudaMemcpy(d_OutputData, odata, size, cudaMemcpyHostToDevice);
143120

144-
dim3 dimGrid((n + blockSize - 1) / blockSize, 1, 1);
145-
dim3 dimBlock(blockSize, 1, 1);
121+
dim3 dimGridArray((n + blockSize - 1) / blockSize, 1, 1);
122+
dim3 dimBlockArray(blockSize, 1, 1);
123+
124+
dim3 dimGridSumArray((sumArrayNumEle + blockSize - 1) / blockSize, 1, 1);
125+
dim3 dimBlockSumArray(blockSize, 1, 1);
126+
146127

147128
timer().startGpuTimer();
148129
// First step: compute the scan result for individual sections
149130
// then, store their block sum to sumArray
150-
kernNaiveGPUScanFirstStep <<<dimGrid, dimBlock>>> (d_InputData,
131+
kernNaiveGPUScanFirstStep <<<dimGridArray, dimBlockArray >>> (d_InputData,
151132
d_OutputData, d_SumArray, n);
152133
checkCUDAError("kernNaiveGPUScanFirstStep failed!");
134+
135+
153136
#if 0
137+
kernNaiveGPUScanSecondStep << <dimGridSumArray, dimBlockSumArray >> > (
138+
sumArray, sumArray, sumArrayNumEle);
154139
// cudaDeviceSynchronize();
155140

156141
kernNaiveGPUScanFirstStep << <dimGrid, dimBlock >> > (d_InputData,
@@ -170,6 +155,7 @@ namespace StreamCompaction {
170155
cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost);
171156
checkCUDAError("memCpy back failed!");
172157

158+
#if 1
173159
// testing:
174160
cudaMemcpy(sumArray, d_SumArray, sumArraySize, cudaMemcpyDeviceToHost);
175161
checkCUDAError("memCpy back failed!");
@@ -182,7 +168,7 @@ namespace StreamCompaction {
182168
{
183169
std::cout << odata[i] << '\n';
184170
}
185-
171+
#endif
186172

187173
// cleanup
188174
cudaFree(d_InputData);

0 commit comments

Comments
 (0)