Skip to content

Commit f90c4dd

Browse files
author
Zixin Zhang
committed
tested kernNaiveGPUScanSecondStep
1 parent 5f90d77 commit f90c4dd

File tree

2 files changed

+30
-8
lines changed

2 files changed

+30
-8
lines changed

src/main.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
#include "testing_helpers.hpp"
1717

1818
// The tests default to an array of size 1 << 10 = 1024
19-
const int SIZE = 1 << 9; // feel free to change the size of array
19+
const int SIZE = 1 << 10; // feel free to change the size of array
2020
const int NPOT = SIZE - 3; // Non-Power-Of-Two
2121
int *a = new int[SIZE];
2222
int *b = new int[SIZE];

stream_compaction/naive.cu

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -97,14 +97,17 @@ namespace StreamCompaction {
9797
*/
9898
void scan(int n, int *odata, const int *idata) {
9999
int size = n * sizeof(int);
100-
int* d_InputData;
101-
int* d_OutputData;
102100
int sumArrayNumEle = (n + blockSize - 1) / blockSize;
103101
int sumArraySize = sumArrayNumEle * sizeof(int);
102+
103+
int* d_InputData;
104+
int* d_OutputData;
104105
int* d_SumArray;
106+
int* d_SumArrayOutput;
105107

106108
// for testing
107109
int* sumArray = new int[sumArrayNumEle];
110+
int* sumArrayOutput = new int[sumArrayNumEle];
108111

109112
cudaMalloc((void**)&d_InputData, size);
110113
checkCUDAError("cudaMalloc d_InputData failed!");
@@ -115,6 +118,9 @@ namespace StreamCompaction {
115118
cudaMalloc((void**)&d_SumArray, sumArraySize);
116119
checkCUDAError("cudaMalloc d_SumArray failed!");
117120

121+
cudaMalloc((void**)&d_SumArrayOutput, sumArraySize);
122+
checkCUDAError("cudaMalloc d_SumArrayOutput failed!");
123+
118124
cudaMemcpy(d_InputData, idata, size, cudaMemcpyHostToDevice);
119125
cudaMemcpy(d_OutputData, odata, size, cudaMemcpyHostToDevice);
120126

@@ -124,20 +130,21 @@ namespace StreamCompaction {
124130
dim3 dimGridSumArray((sumArrayNumEle + blockSize - 1) / blockSize, 1, 1);
125131
dim3 dimBlockSumArray(blockSize, 1, 1);
126132

127-
128133
timer().startGpuTimer();
129134
// First step: compute the scan result for individual sections
130135
// then, store their block sum to sumArray
131136
kernNaiveGPUScanFirstStep <<<dimGridArray, dimBlockArray >>> (d_InputData,
132137
d_OutputData, d_SumArray, n);
133138
checkCUDAError("kernNaiveGPUScanFirstStep failed!");
134139

135-
136-
#if 0
137-
kernNaiveGPUScanSecondStep << <dimGridSumArray, dimBlockSumArray >> > (
138-
sumArray, sumArray, sumArrayNumEle);
139140
// cudaDeviceSynchronize();
140141

142+
kernNaiveGPUScanSecondStep <<<dimGridSumArray, dimBlockSumArray >>> (
143+
d_SumArray, d_SumArrayOutput, sumArrayNumEle);
144+
checkCUDAError("kernNaiveGPUScanSecondStep failed!");
145+
#if 0
146+
147+
141148
kernNaiveGPUScanFirstStep << <dimGrid, dimBlock >> > (d_InputData,
142149
d_OutputData, d_SumArray, n);
143150
checkCUDAError("kernNaiveGPUScanFirstStep failed!");
@@ -159,11 +166,23 @@ namespace StreamCompaction {
159166
// testing:
160167
cudaMemcpy(sumArray, d_SumArray, sumArraySize, cudaMemcpyDeviceToHost);
161168
checkCUDAError("memCpy back failed!");
169+
cudaMemcpy(sumArrayOutput, d_SumArrayOutput, sumArraySize,
170+
cudaMemcpyDeviceToHost);
171+
checkCUDAError("memCpy back failed!");
172+
162173
for (int i = 0; i < sumArrayNumEle; i++)
163174
{
164175
std::cout << sumArray[i] << '\n';
165176
}
177+
166178
printf("\n");
179+
180+
for (int i = 0; i < sumArrayNumEle; i++)
181+
{
182+
std::cout << sumArrayOutput[i] << '\n';
183+
}
184+
printf("\n");
185+
167186
for (int i = 0; i < n; i++)
168187
{
169188
std::cout << odata[i] << '\n';
@@ -173,10 +192,13 @@ namespace StreamCompaction {
173192
// cleanup
174193
cudaFree(d_InputData);
175194
cudaFree(d_OutputData);
195+
cudaFree(d_SumArray);
196+
cudaFree(d_SumArrayOutput);
176197
checkCUDAError("cudaFree failed!");
177198

178199
// testing clean up
179200
delete[] sumArray;
201+
delete[] sumArrayOutput;
180202
}
181203
}
182204
}

0 commit comments

Comments
 (0)