@@ -57,31 +57,30 @@ Notice how easy it would be to change the precision (for example, `double` to `h
 #include "kernel_float.h"
 namespace kf = kernel_float;

-__global__ void kernel(const kf::vec<half, 2>* input, float constant, kf::vec<float, 2>* output) {
+__global__ void kernel(kf::vec_ptr<const half, 2> input, int constant, kf::vec_ptr<float, 2> output) {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
-    output[i] = input[i] + kf::cast<half>(constant);
+    output[i] += input[i] * constant;
 }
6464
6565```
6666
6767Here is how the same kernel would look for CUDA without Kernel Float.
6868
6969```cpp
-__global__ void kernel(const __half* input, float constant, float* output) {
+__global__ void kernel(const half* input, double constant, float* output) {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     __half in0 = input[2 * i + 0];
     __half in1 = input[2 * i + 1];
     __half2 a = __halves2half2(in0, in1);
-    float b = float(constant);
-    __half c = __float2half(b);
-    __half2 d = __half2half2(c);
-    __half2 e = __hadd2(a, d);
-    __half f = __low2half(e);
-    __half g = __high2half(e);
-    float out0 = __half2float(f);
-    float out1 = __half2float(g);
-    output[2 * i + 0] = out0;
-    output[2 * i + 1] = out1;
+    __half b = __int2half_rn(constant);
+    __half2 c = __half2half2(b);
+    __half2 d = __hmul2(a, c);
+    __half e = __low2half(d);
+    __half f = __high2half(d);
+    float out0 = __half2float(e);
+    float out1 = __half2float(f);
+    output[2 * i + 0] += out0;
+    output[2 * i + 1] += out1;
 }
8685
8786```
0 commit comments