@@ -14,11 +14,7 @@ void cuda_check(cudaError_t code) {
1414}
1515
1616template <int N, int B>
17- __global__ void my_kernel (
18- int length,
19- kf::aligned_ptr<const __half> input,
20- double constant,
21- kf::aligned_ptr<float > output) {
17+ __global__ void my_kernel (int length, const __half* input, double constant, float * output) {
2218 auto tiling = kf::tiling<
2319 kf::tile_factor<N>,
2420 kf::block_size<B>,
@@ -27,9 +23,9 @@ __global__ void my_kernel(
2723 auto points = int (blockIdx .x * tiling.tile_size (0 )) + tiling.local_points (0 );
2824 auto mask = tiling.local_mask ();
2925
30- auto a = input. read (points, mask);
26+ auto a = kf:: read (input, points, mask);
3127 auto b = (a * a) * constant;
32- output. write (points, b, mask);
28+ kf:: write (output, points, b, mask);
3329}
3430
3531template <int items_per_thread, int block_size = 256 >
@@ -57,11 +53,8 @@ void run_kernel(int n) {
5753 // Launch kernel!
5854 int items_per_block = block_size * items_per_thread;
5955 int grid_size = (n + items_per_block - 1 ) / items_per_block;
60- my_kernel<items_per_thread, block_size><<<grid_size, block_size>>> (
61- n,
62- kf::aligned_ptr (input_dev),
63- constant,
64- kf::aligned_ptr (output_dev));
56+ my_kernel<items_per_thread, block_size>
57+ <<<grid_size, block_size>>> (n, input_dev, constant, output_dev);
6558
6659 // Copy results back
6760 cuda_check (cudaMemcpy (output_dev, output_result.data (), sizeof (float ) * n, cudaMemcpyDefault));
0 commit comments