Commit 152f9d2

Author: Zixin Zhang (committed)
Commit message: Bloopers 1
Parent: dbeb241

2 files changed: 118 additions & 4 deletions


src/main.cpp

Lines changed: 47 additions & 2 deletions
@@ -7,6 +7,8 @@
 */
 
 #include <cstdio>
+#include <sstream>
+#include <fstream>
 #include <stream_compaction/cpu.h>
 #include <stream_compaction/naive.h>
 #include <stream_compaction/efficient.h>
@@ -24,7 +26,49 @@ int* bookArraya = new int[8]{ 3, 1, 7, 0 ,4 ,1 ,6, 3 };
 int* bookArrayb = new int[8]{};
 const int BOOK_SIZE = 8;
 
+std::string deviceName;
+int deviceMaxThreadsPerBlock;
+int deviceSharedMemPerBlock;
+int deviceMaxThreadsPerSM;
+int deviceMaxBlocksPerSM;
+
 int main(int argc, char* argv[]) {
+    cudaDeviceProp deviceProp;
+    int gpuDevice = 0;
+    int device_count = 0;
+    cudaGetDeviceCount(&device_count);
+    if (gpuDevice > device_count) {
+        std::cout
+            << "Error: GPU device number is greater than the number of devices!"
+            << " Perhaps a CUDA-capable GPU is not installed?"
+            << std::endl;
+        return false;
+    }
+    cudaGetDeviceProperties(&deviceProp, gpuDevice);
+    int major = deviceProp.major;
+    int minor = deviceProp.minor;
+    deviceMaxThreadsPerBlock = deviceProp.maxThreadsPerBlock;
+    deviceSharedMemPerBlock = deviceProp.sharedMemPerBlock;
+    deviceMaxThreadsPerSM = deviceProp.maxThreadsPerMultiProcessor;
+    deviceMaxBlocksPerSM = deviceProp.maxBlocksPerMultiProcessor;
+
+
+
+    std::ostringstream ss;
+    ss << " [SM " << major << "." << minor << " " << deviceProp.name << "]"
+        << "\n Max threads per block: " << deviceMaxThreadsPerBlock
+        << "\n Shared memory per block: " << deviceSharedMemPerBlock << " bytes"
+        // << "\n Shared memory in each block can fit " << deviceSharedMemPerBlock / sizeof(int) << " number of integers"
+        << "\n Max threads per SM: " << deviceMaxThreadsPerSM
+        << "\n Max blocks per SM: " << deviceMaxBlocksPerSM
+        << "\n Max grid size: " << deviceProp.maxGridSize[0] << ", "
+        << deviceProp.maxGridSize[1] << ", " << deviceProp.maxGridSize[2];
+
+
+    deviceName = ss.str();
+
+    std::cout << deviceName << '\n';
+
     // Scan tests
 
     printf("\n");
@@ -66,14 +110,15 @@ int main(int argc, char* argv[]) {
 
     printf("\n");
 
-#if 0
+
     zeroArray(SIZE, c);
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
+#if 0
     /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
     onesArray(SIZE, c);
     printDesc("1s array for finding bugs");

stream_compaction/naive.cu

Lines changed: 71 additions & 2 deletions
@@ -3,6 +3,10 @@
 #include "common.h"
 #include "naive.h"
 
+/*! Block size used for CUDA kernel launch. */
+#define blockSize 128
+#define sectionSize 128
+
 namespace StreamCompaction {
     namespace Naive {
         using StreamCompaction::Common::PerformanceTimer;
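
Note: blockSize and sectionSize are both 128 here, and the kernel added in the next hunk stores one element per thread in a shared array of sectionSize entries, so the launch block size must never exceed sectionSize. A compile-time guard such as the following (an illustration, not part of the commit) would catch a mismatch if either constant changes:

// Illustration only: kernNaiveGPUScan indexes __shared__ int XY[sectionSize]
// with threadIdx.x, which ranges up to blockSize - 1.
static_assert(blockSize <= sectionSize,
    "blockSize must not exceed sectionSize for kernNaiveGPUScan");
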
@@ -11,15 +15,80 @@ namespace StreamCompaction {
             static PerformanceTimer timer;
             return timer;
         }
-        // TODO: __global__
+
+        __global__ void kernNaiveGPUScan(int* inputArray, int* outputArray,
+            int inputSize)
+        {
+            // Each thread loads one value from the input array into shared
+            // memory array XY
+            __shared__ int XY[sectionSize];
+            int i = blockIdx.x * blockDim.x + threadIdx.x;
+            // convert inclusive scan into exclusive scan by shifting
+            // all elements to the right by one position and fill the first
+            // element and out-of-bound elements with 0.
+            if (i < inputSize && threadIdx.x != 0)
+            {
+                XY[threadIdx.x] = inputArray[i - 1];
+            }
+            else {
+                XY[threadIdx.x] = 0;
+            }
+            // perform naive scan
+            for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
+            {
+                // make sure that input is in place
+                __syncthreads();
+                int index = threadIdx.x;
+                int previousIndex = index - stride;
+#if 0
+                if (previousIndex < 0)
+                {
+                    previousIndex = 0;
+                }
+#endif
+                int temp = XY[index] + XY[previousIndex];
+                // make sure previous output has been consumed
+                __syncthreads();
+                XY[index] = temp;
+            }
+
+            // each thread writes its result into the output array
+            outputArray[i] = XY[threadIdx.x];
+        }
 
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
         void scan(int n, int *odata, const int *idata) {
+            int size = n * sizeof(int);
+            int* d_InputData;
+            int* d_OutputData;
+
+            cudaMalloc((void**)&d_InputData, size);
+            checkCUDAError("cudaMalloc d_InputData failed!");
+
+            cudaMalloc((void**)&d_OutputData, size);
+            checkCUDAError("cudaMalloc d_OutputData failed!");
+
+            cudaMemcpy(d_InputData, idata, size, cudaMemcpyHostToDevice);
+            cudaMemcpy(d_OutputData, odata, size, cudaMemcpyHostToDevice);
+
+            dim3 dimGrid((n + blockSize - 1) / blockSize, 1, 1);
+            dim3 dimBlock(blockSize, 1, 1);
+
             timer().startGpuTimer();
-            // TODO
+            kernNaiveGPUScan <<<dimGrid, dimBlock>>> (d_InputData,
+                d_OutputData, n);
+            checkCUDAError("kernNaiveGPUScan failed!");
             timer().endGpuTimer();
+
+            cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost);
+            checkCUDAError("memCpy back failed!");
+
+            // cleanup
+            cudaFree(d_InputData);
+            cudaFree(d_OutputData);
+            checkCUDAError("cudaFree failed!");
         }
     }
 }
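
Note: in the committed kernel the clamping branch for previousIndex is compiled out by #if 0, so for threads with threadIdx.x < stride the line XY[index] + XY[previousIndex] reads the shared array at a negative offset, and the final write outputArray[i] = XY[threadIdx.x] is not guarded by i < inputSize. A common bounds-checked formulation of the same naive (Kogge-Stone) per-block scan step looks like the sketch below; the kernel name is hypothetical, and sectionSize is assumed to be the macro defined above.

// Sketch of a bounds-checked variant of the same naive per-block scan;
// not the commit's code, just one standard way to write the step.
__global__ void kernNaiveScanGuarded(const int* inputArray, int* outputArray,
                                     int inputSize) {
    __shared__ int XY[sectionSize];
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Shift right by one so the inclusive scan below yields an exclusive scan;
    // thread 0 and out-of-range threads load the identity value 0.
    XY[threadIdx.x] = (i < inputSize && threadIdx.x != 0) ? inputArray[i - 1] : 0;

    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) {
        __syncthreads();                       // inputs for this step are in place
        int temp = XY[threadIdx.x];
        if (threadIdx.x >= stride) {
            temp += XY[threadIdx.x - stride];  // only read a partner that exists
        }
        __syncthreads();                       // all reads done before any write
        XY[threadIdx.x] = temp;
    }

    if (i < inputSize) {                       // never write past the output array
        outputArray[i] = XY[threadIdx.x];
    }
}

Either way, this kernel only produces a correct exclusive scan within a single block of blockSize elements; arrays larger than one block need a second pass that scans the per-block sums and adds them back into each block's results.
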
