Skip to content

Commit 0ce84b9

Browse files
authored
[Code] add reduce_interleaved_addressing.cu (#36)
1 parent 8cdc4b9 commit 0ce84b9

File tree

1 file changed

+101
-0
lines changed

1 file changed

+101
-0
lines changed
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
#include <stdio.h>
2+
#include <stdlib.h>
3+
#include <cuda_runtime.h>
4+
5+
// Number of int elements in the input array that main() reduces
// (32 * 1024 * 1024 = 33,554,432 elements).
const int len = 32 * 1024 * 1024;
6+
7+
// Per-block sum reduction using interleaved addressing in shared memory.
//
// Each block loads up to blockDim.x consecutive elements of `arr` into a
// statically sized shared array, reduces them in place, and thread 0 writes
// the block's partial sum to out[blockIdx.x].
//
// Launch layout: 1D grid of 1D blocks, with gridDim.x * blockDim.x >= len.
// Shared memory: BLOCKSIZE ints (static).
// Precondition:  blockDim.x must not exceed BLOCKSIZE, otherwise sdata is
//                indexed out of bounds.
//
// Parameters:
//   arr - device pointer to `len` input ints
//   out - device pointer to at least gridDim.x ints (one partial sum per block)
//   len - number of valid elements in `arr`
template <int BLOCKSIZE>
__global__ void reduce_naive_kernel(int *arr, int *out, int len)
{
    __shared__ int sdata[BLOCKSIZE];

    int tid = threadIdx.x;     // thread index within the block
    int bid = blockIdx.x;      // block index within the grid
    int bdim = blockDim.x;     // number of threads per block
    int i = bid * bdim + tid;  // global element index

    // Stage one element per thread in shared memory. Threads that fall past
    // the end of the input store 0 (the additive identity) so that a partial
    // tail block never folds uninitialized shared memory into its sum.
    sdata[tid] = (i < len) ? arr[i] : 0;

    __syncthreads();  // every slot must be written before any slot is read

    // Interleaved addressing: at stride s, thread tid owns slot 2*s*tid and
    // accumulates the neighbour s slots away. Only the block-local bound is
    // needed here because out-of-range slots were zero-filled above.
    for (int s = 1; s < bdim; s *= 2)
    {
        int index = 2 * s * tid;
        if (index + s < bdim)
        {
            sdata[index] += sdata[index + s];
        }
        // Kept outside the branch so that all threads reach the barrier.
        __syncthreads();
    }

    // The first thread of each block publishes the block's partial sum.
    if (tid == 0)
    {
        out[bid] = sdata[0];
    }
}
41+
42+
// Print a diagnostic and terminate if a CUDA runtime call did not succeed.
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host driver: fills an array of `len` ones, reduces it on the GPU into one
// partial sum per block, finishes the sum on the CPU, and compares it with a
// CPU-only reference sum.
int main()
{
    // Launch configuration. The kernel emits exactly one partial sum per
    // block, so the output buffers only need `gridsize` elements.
    const int blocksize = 256;
    const int gridsize = (len + blocksize - 1) / blocksize;  // ceil-div

    int *arr = new int[len];
    int *out = new int[gridsize];
    int *d_arr = NULL;
    int *d_out = NULL;

    // Initialize the input.
    for (int i = 0; i < len; i++)
    {
        arr[i] = 1;
    }

    // Allocate device memory.
    check_cuda(cudaMalloc((void **)&d_arr, sizeof(int) * len), "cudaMalloc(d_arr)");
    check_cuda(cudaMalloc((void **)&d_out, sizeof(int) * gridsize), "cudaMalloc(d_out)");

    // Copy the input to the device.
    check_cuda(cudaMemcpy(d_arr, arr, sizeof(int) * len, cudaMemcpyHostToDevice),
               "cudaMemcpy(host to device)");

    // Launch the kernel. Launch-configuration errors are only visible through
    // cudaGetLastError(), not through a return value.
    reduce_naive_kernel<blocksize><<<gridsize, blocksize>>>(d_arr, d_out, len);
    check_cuda(cudaGetLastError(), "reduce_naive_kernel launch");

    // Copy the per-block partial sums back. This blocking copy also reports
    // any error raised while the kernel was executing.
    check_cuda(cudaMemcpy(out, d_out, sizeof(int) * gridsize, cudaMemcpyDeviceToHost),
               "cudaMemcpy(device to host)");

    // Finish the reduction on the host.
    long long sum = 0;
    for (int i = 0; i < gridsize; i++)
    {
        sum += out[i];
    }
    printf("sum = %lld\n", sum);  // %lld: sum is long long

    // Reference result computed entirely on the host.
    long long sum2 = 0;
    for (int i = 0; i < len; i++)
    {
        sum2 += arr[i];
    }

    if (sum == sum2)
    {
        printf("success\n");
    }
    else
    {
        printf("failed, the result is %lld\n", sum2);
    }

    // Release device and host memory.
    check_cuda(cudaFree(d_arr), "cudaFree(d_arr)");
    check_cuda(cudaFree(d_out), "cudaFree(d_out)");
    delete[] arr;
    delete[] out;
    return 0;
}

0 commit comments

Comments
 (0)