Skip to content

Commit effce44

Browse files
committed
Replace aligned_ptr by vector_ptr
1 parent 564ccb4 commit effce44

File tree

4 files changed

+206
-217
lines changed

4 files changed

+206
-217
lines changed

examples/vector_add/main.cu

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,15 @@ void cuda_check(cudaError_t code) {
1313
}
1414

1515
template<int N>
16-
__global__ void my_kernel(int length, const __half* input, double constant, float* output) {
16+
__global__ void my_kernel(
17+
int length,
18+
kf::vec_ptr<const __half, N> input,
19+
double constant,
20+
kf::vec_ptr<float, N> output) {
1721
int i = blockIdx.x * blockDim.x + threadIdx.x;
1822

1923
if (i * N < length) {
20-
auto a = kf::read_aligned<N>(input + i * N);
21-
auto b = kf::fma(a, a, kf::cast<__half>(constant));
22-
kf::write_aligned<N>(output + i * N, b);
24+
output[i] = kf::fma(input[i], input[i], kf::cast<__half>(constant));
2325
}
2426
}
2527

@@ -51,9 +53,9 @@ void run_kernel(int n) {
5153
int grid_size = (n + items_per_block - 1) / items_per_block;
5254
my_kernel<items_per_thread><<<grid_size, block_size>>>(
5355
n,
54-
kf::aligned_ptr(input_dev),
56+
kf::vector_ptr<const half, items_per_thread>(input_dev),
5557
constant,
56-
kf::aligned_ptr(output_dev));
58+
kf::vector_ptr<float, items_per_thread>(output_dev));
5759

5860
// Copy results back
5961
cuda_check(cudaMemcpy(output_dev, output_result.data(), sizeof(float) * n, cudaMemcpyDefault));

examples/vector_add_tiling/main.cu

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,7 @@ void cuda_check(cudaError_t code) {
1414
}
1515

1616
template<int N, int B>
17-
__global__ void my_kernel(
18-
int length,
19-
kf::aligned_ptr<const __half> input,
20-
double constant,
21-
kf::aligned_ptr<float> output) {
17+
__global__ void my_kernel(int length, const __half* input, double constant, float* output) {
2218
auto tiling = kf::tiling<
2319
kf::tile_factor<N>,
2420
kf::block_size<B>,
@@ -27,9 +23,9 @@ __global__ void my_kernel(
2723
auto points = int(blockIdx.x * tiling.tile_size(0)) + tiling.local_points(0);
2824
auto mask = tiling.local_mask();
2925

30-
auto a = input.read(points, mask);
26+
auto a = kf::read(input, points, mask);
3127
auto b = (a * a) * constant;
32-
output.write(points, b, mask);
28+
kf::write(output, points, b, mask);
3329
}
3430

3531
template<int items_per_thread, int block_size = 256>
@@ -57,11 +53,8 @@ void run_kernel(int n) {
5753
// Launch kernel!
5854
int items_per_block = block_size * items_per_thread;
5955
int grid_size = (n + items_per_block - 1) / items_per_block;
60-
my_kernel<items_per_thread, block_size><<<grid_size, block_size>>>(
61-
n,
62-
kf::aligned_ptr(input_dev),
63-
constant,
64-
kf::aligned_ptr(output_dev));
56+
my_kernel<items_per_thread, block_size>
57+
<<<grid_size, block_size>>>(n, input_dev, constant, output_dev);
6558

6659
// Copy results back
6760
cuda_check(cudaMemcpy(output_dev, output_result.data(), sizeof(float) * n, cudaMemcpyDefault));

0 commit comments

Comments
 (0)