Skip to content

Commit eee8f54

Browse files
author
Zixin Zhang
committed
Do inclusive first CIS565-Fall-2021#1
1 parent f90c4dd commit eee8f54

File tree

2 files changed

+169
-64
lines changed

2 files changed

+169
-64
lines changed

src/main.cpp

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
#include "testing_helpers.hpp"
1717

1818
// The tests default to an array of size 1 << 8 = 256
19-
const int SIZE = 1 << 10; // feel free to change the size of array
19+
const int SIZE = 1 << 4; // feel free to change the size of array
2020
const int NPOT = SIZE - 3; // Non-Power-Of-Two
2121
int *a = new int[SIZE];
2222
int *b = new int[SIZE];
@@ -78,9 +78,9 @@ int main(int argc, char* argv[]) {
7878

7979
genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case
8080
a[SIZE - 1] = 0;
81-
printArray(SIZE, a, true);
81+
printArray(SIZE, a, false);
82+
8283

83-
#if 0
8484
// initialize b using StreamCompaction::CPU::scan you implement
8585
// We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct.
8686
// At first all cases passed because b && c are all zeroes.
@@ -89,35 +89,37 @@ int main(int argc, char* argv[]) {
8989
printDesc("cpu scan, power-of-two");
9090
StreamCompaction::CPU::scan(SIZE, b, a);
9191
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
92-
printArray(SIZE, b, true);
92+
printArray(SIZE, b, false);
9393

9494
printf("\n");
95-
96-
printArray(BOOK_SIZE, bookArraya, false);
97-
zeroArray(BOOK_SIZE, bookArrayb);
98-
printDesc("cpu scan, power-of-two");
99-
StreamCompaction::CPU::scan(BOOK_SIZE, bookArrayb, bookArraya);
100-
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
101-
printArray(BOOK_SIZE, bookArrayb, false);
10295

103-
printf("\n");
96+
97+
#if 0
10498

10599
zeroArray(SIZE, c);
106100
printDesc("cpu scan, non-power-of-two");
107101
StreamCompaction::CPU::scan(NPOT, c, a);
108102
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
109-
printArray(NPOT, b, true);
103+
printArray(NPOT, b, false);
110104
printCmpResult(NPOT, b, c);
111105

112-
printf("\n");
106+
printArray(BOOK_SIZE, bookArraya, false);
107+
zeroArray(BOOK_SIZE, bookArrayb);
108+
printDesc("cpu scan, power-of-two");
109+
StreamCompaction::CPU::scan(BOOK_SIZE, bookArrayb, bookArraya);
110+
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
111+
printArray(BOOK_SIZE, bookArrayb, false);
113112

113+
printf("\n");
114114
#endif
115-
115+
printf("\n");
116+
117+
116118
zeroArray(SIZE, c);
117119
printDesc("naive scan, power-of-two");
118120
StreamCompaction::Naive::scan(SIZE, c, a);
119121
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
120-
printArray(SIZE, c, true);
122+
printArray(SIZE, c, false);
121123
printCmpResult(SIZE, b, c);
122124

123125
/* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan

stream_compaction/naive.cu

Lines changed: 151 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66
#include <iostream> // testing
77

88
/*! Block size used for CUDA kernel launch. */
9-
#define blockSize 128
10-
#define sectionSize 128
9+
#define blockSize 8
10+
#define sectionSize 8
1111

1212
namespace StreamCompaction {
1313
namespace Naive {
@@ -18,16 +18,72 @@ namespace StreamCompaction {
1818
return timer;
1919
}
2020

21-
__device__ void computeScanToOutputArray(const int* inputArray, int* outputArray,
22-
int* XY, int inputSize)
21+
// write a kernel to convert from inclusive scan to exclusive scan
22+
23+
__global__ void convertFromInclusiveToExclusive(const int* inputArray,
24+
int* outputArray, int inputSize)
2325
{
2426
int i = blockIdx.x * blockDim.x + threadIdx.x;
2527
// convert inclusive scan into exclusive scan by shifting
2628
// all elements to the right by one position and fill the frist
2729
// element and out-of-bound elements with 0.
28-
if (i < inputSize && threadIdx.x != 0)
30+
if (i < inputSize && i != 0)
2931
{
30-
XY[threadIdx.x] = inputArray[i - 1];
32+
33+
outputArray[i] = inputArray[i - 1];
34+
}
35+
else {
36+
outputArray[i] = 0;
37+
}
38+
}
39+
40+
void unitTestConversion()
41+
{
42+
// for testing
43+
int numObject = 8;
44+
int size = numObject * sizeof(int);
45+
int* toyExclusiveArray = new int[numObject];
46+
int* toyInclusiveArray = new int[numObject] {3, 4, 11, 11, 15, 16, 22, 25};
47+
48+
int* dev_toyExclusiveArray;
49+
int* dev_toyInclusiveArray;
50+
51+
cudaMalloc((void**)&dev_toyExclusiveArray, size);
52+
checkCUDAError("cudaMalloc dev_toyExclusiveArray failed!");
53+
54+
cudaMalloc((void**)&dev_toyInclusiveArray, size);
55+
checkCUDAError("cudaMalloc dev_toyInclusiveArray failed!");
56+
57+
cudaMemcpy(dev_toyInclusiveArray, toyInclusiveArray, size,
58+
cudaMemcpyHostToDevice);
59+
60+
dim3 dimGridArray((numObject + blockSize - 1) / blockSize, 1, 1);
61+
dim3 dimBlockArray(blockSize, 1, 1);
62+
convertFromInclusiveToExclusive <<<dimGridArray, dimBlockArray >>> (
63+
dev_toyInclusiveArray, dev_toyExclusiveArray, numObject);
64+
65+
cudaMemcpy(toyExclusiveArray, dev_toyExclusiveArray, size,
66+
cudaMemcpyDeviceToHost);
67+
checkCUDAError("memCpy back failed!");
68+
69+
printf("\n");
70+
71+
for (int i = 0; i < numObject; i++)
72+
{
73+
std::cout << toyExclusiveArray[i] << '\n';
74+
}
75+
76+
printf("\n");
77+
78+
}
79+
80+
__device__ void computeScanToOutputArray(const int* inputArray, int* outputArray,
81+
int* XY, int inputSize)
82+
{
83+
int i = blockIdx.x * blockDim.x + threadIdx.x;
84+
if (i < inputSize)
85+
{
86+
XY[threadIdx.x] = inputArray[i];
3187
}
3288
else {
3389
XY[threadIdx.x] = 0;
@@ -37,16 +93,16 @@ namespace StreamCompaction {
3793
{
3894
// make sure that input is in place
3995
__syncthreads();
40-
int index = threadIdx.x;
41-
int previousIndex = index - stride;
42-
if (previousIndex < 0)
96+
int previousValue = 0;
97+
int previousIndex = threadIdx.x - stride;
98+
if (previousIndex >= 0)
4399
{
44-
previousIndex = 0;
100+
previousValue = XY[previousIndex];
45101
}
46-
int temp = XY[index] + XY[previousIndex];
102+
int temp = XY[threadIdx.x] + previousValue;
47103
// make sure previous output has been consumed
48104
__syncthreads();
49-
XY[index] = temp;
105+
XY[threadIdx.x] = temp;
50106
}
51107

52108
// each thread writes its result into the output array
@@ -86,28 +142,28 @@ namespace StreamCompaction {
86142
int* outputArray, int inputSize)
87143
{
88144
int i = blockIdx.x * blockDim.x + threadIdx.x;
89-
if (i < inputSize && blockIdx.x > 0)
145+
if (i < inputSize)
90146
{
91-
outputArray[i] += inputArray[blockIdx.x - 1];
147+
outputArray[i] += inputArray[blockIdx.x];
92148
}
93149
}
94150

95151
/**
96152
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
97153
*/
98154
void scan(int n, int *odata, const int *idata) {
155+
// unitTestConversion();
99156
int size = n * sizeof(int);
100157
int sumArrayNumEle = (n + blockSize - 1) / blockSize;
101158
int sumArraySize = sumArrayNumEle * sizeof(int);
102159

103160
int* d_InputData;
104161
int* d_OutputData;
105162
int* d_SumArray;
106-
int* d_SumArrayOutput;
163+
// int* d_SumArrayOutput;
107164

108165
// for testing
109166
int* sumArray = new int[sumArrayNumEle];
110-
int* sumArrayOutput = new int[sumArrayNumEle];
111167

112168
cudaMalloc((void**)&d_InputData, size);
113169
checkCUDAError("cudaMalloc d_InputData failed!");
@@ -118,15 +174,64 @@ namespace StreamCompaction {
118174
cudaMalloc((void**)&d_SumArray, sumArraySize);
119175
checkCUDAError("cudaMalloc d_SumArray failed!");
120176

121-
cudaMalloc((void**)&d_SumArrayOutput, sumArraySize);
122-
checkCUDAError("cudaMalloc d_SumArrayOutput failed!");
177+
// cudaMalloc((void**)&d_SumArrayOutput, sumArraySize);
178+
// checkCUDAError("cudaMalloc d_SumArrayOutput failed!");
123179

124180
cudaMemcpy(d_InputData, idata, size, cudaMemcpyHostToDevice);
125-
cudaMemcpy(d_OutputData, odata, size, cudaMemcpyHostToDevice);
126181

127182
dim3 dimGridArray((n + blockSize - 1) / blockSize, 1, 1);
128183
dim3 dimBlockArray(blockSize, 1, 1);
129184

185+
timer().startGpuTimer();
186+
// First step: compute the scan result for individual sections
187+
// then, store their block sum to sumArray
188+
kernNaiveGPUScanFirstStep << <dimGridArray, dimBlockArray >> > (d_InputData,
189+
d_OutputData, d_SumArray, n);
190+
checkCUDAError("kernNaiveGPUScanFirstStep failed!");
191+
timer().endGpuTimer();
192+
193+
cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost);
194+
checkCUDAError("memCpy back failed!");
195+
196+
// testing:
197+
cudaMemcpy(sumArray, d_SumArray, sumArraySize, cudaMemcpyDeviceToHost);
198+
checkCUDAError("memCpy back failed!");
199+
200+
printf("\n");
201+
for (int i = 0; i < sumArrayNumEle; i++)
202+
{
203+
std::cout << sumArray[i] << '\n';
204+
}
205+
206+
std::cout << '\n';
207+
for (int i = 0; i < n; i++)
208+
{
209+
std::cout << odata[i] << '\n';
210+
}
211+
212+
// cleanup
213+
cudaFree(d_InputData);
214+
cudaFree(d_OutputData);
215+
cudaFree(d_SumArray);
216+
// cudaFree(d_SumArrayOutput);
217+
checkCUDAError("cudaFree failed!");
218+
219+
// testing clean up
220+
delete[] sumArray;
221+
// delete[] sumArrayOutput;
222+
223+
#if 0
224+
225+
int* sumArrayOutput = new int[sumArrayNumEle];
226+
227+
228+
dim3 dimGridSumArray((sumArrayNumEle + blockSize - 1) / blockSize, 1, 1);
229+
dim3 dimBlockSumArray(blockSize, 1, 1);
230+
231+
232+
cudaMemcpy(d_OutputData, odata, size, cudaMemcpyHostToDevice);
233+
234+
130235
dim3 dimGridSumArray((sumArrayNumEle + blockSize - 1) / blockSize, 1, 1);
131236
dim3 dimBlockSumArray(blockSize, 1, 1);
132237

@@ -137,42 +242,43 @@ namespace StreamCompaction {
137242
d_OutputData, d_SumArray, n);
138243
checkCUDAError("kernNaiveGPUScanFirstStep failed!");
139244

140-
// cudaDeviceSynchronize();
245+
cudaDeviceSynchronize();
246+
247+
cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost);
248+
checkCUDAError("memCpy back failed!");
249+
250+
251+
141252

142-
kernNaiveGPUScanSecondStep <<<dimGridSumArray, dimBlockSumArray >>> (
253+
kernNaiveGPUScanSecondStep << <dimGridSumArray, dimBlockSumArray >> > (
143254
d_SumArray, d_SumArrayOutput, sumArrayNumEle);
144255
checkCUDAError("kernNaiveGPUScanSecondStep failed!");
145-
#if 0
146-
147256

148-
kernNaiveGPUScanFirstStep << <dimGrid, dimBlock >> > (d_InputData,
149-
d_OutputData, d_SumArray, n);
150-
checkCUDAError("kernNaiveGPUScanFirstStep failed!");
257+
cudaDeviceSynchronize();
151258

152-
// cudaDeviceSynchronize();
259+
kernNaiveGPUScanThirdStep <<<dimGridArray, dimBlockArray >>> (
260+
d_SumArrayOutput, d_OutputData, n);
261+
checkCUDAError("kernNaiveGPUScanThirdStep failed!");
153262

154-
kernNaiveGPUScanFirstStep << <dimGrid, dimBlock >> > (d_InputData,
155-
d_OutputData, d_SumArray, n);
156-
checkCUDAError("kernNaiveGPUScanFirstStep failed!");
263+
cudaDeviceSynchronize();
157264

158-
// cudaDeviceSynchronize();
159-
#endif
160265
timer().endGpuTimer();
161266

162267
cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost);
163268
checkCUDAError("memCpy back failed!");
164269

165-
#if 1
166-
// testing:
167-
cudaMemcpy(sumArray, d_SumArray, sumArraySize, cudaMemcpyDeviceToHost);
168-
checkCUDAError("memCpy back failed!");
270+
271+
272+
169273
cudaMemcpy(sumArrayOutput, d_SumArrayOutput, sumArraySize,
170274
cudaMemcpyDeviceToHost);
171275
checkCUDAError("memCpy back failed!");
172276

277+
printf("\n");
278+
173279
for (int i = 0; i < sumArrayNumEle; i++)
174280
{
175-
std::cout << sumArray[i] << '\n';
281+
std::cout << sumArrayOutput[i] << '\n';
176282
}
177283

178284
printf("\n");
@@ -182,23 +288,20 @@ namespace StreamCompaction {
182288
std::cout << sumArrayOutput[i] << '\n';
183289
}
184290
printf("\n");
185-
291+
for (int i = 0; i < n; i++)
292+
{
293+
std::cout << idata[i] << '\n';
294+
}
295+
std::cout << '\n';
186296
for (int i = 0; i < n; i++)
187297
{
188298
std::cout << odata[i] << '\n';
189299
}
190-
#endif
191300

192-
// cleanup
193-
cudaFree(d_InputData);
194-
cudaFree(d_OutputData);
195-
cudaFree(d_SumArray);
196-
cudaFree(d_SumArrayOutput);
197-
checkCUDAError("cudaFree failed!");
198301

199-
// testing clean up
200-
delete[] sumArray;
201-
delete[] sumArrayOutput;
302+
303+
304+
#endif
202305
}
203306
}
204307
}

0 commit comments

Comments
 (0)