@@ -122,28 +122,27 @@ <h2>Example<a class="headerlink" href="#example" title="Permalink to this headin
122122< div class ="highlight-cpp notranslate "> < div class ="highlight "> < pre > < span > </ span > < span class ="cp "> #include</ span > < span class ="w "> </ span > < span class ="cpf "> "kernel_float.h"</ span >
123123< span class ="k "> namespace</ span > < span class ="w "> </ span > < span class ="nn "> kf</ span > < span class ="w "> </ span > < span class ="o "> =</ span > < span class ="w "> </ span > < span class ="nn "> kernel_float</ span > < span class ="p "> ;</ span >
124124
125- < span class ="n "> __global__</ span > < span class ="w "> </ span > < span class ="kt "> void</ span > < span class ="w "> </ span > < span class ="n "> kernel</ span > < span class ="p "> (</ span > < span class ="k "> const</ span > < span class ="w "> </ span > < span class ="n "> kf</ span > < span class ="o "> ::</ span > < span class ="n "> vec</ span > < span class ="o "> <</ span > < span class ="n "> half</ span > < span class ="p "> ,</ span > < span class ="w "> </ span > < span class ="mi "> 2</ span > < span class ="o "> >*</ span > < span class ="w "> </ span > < span class ="n "> input</ span > < span class ="p "> ,</ span > < span class ="w "> </ span > < span class ="kt "> float</ span > < span class ="w "> </ span > < span class ="n "> constant</ span > < span class ="p "> ,</ span > < span class ="w "> </ span > < span class ="n "> kf</ span > < span class ="o "> ::</ span > < span class ="n "> vec</ span > < span class ="o "> <</ span > < span class ="kt "> float</ span > < span class ="p "> ,</ span > < span class ="w "> </ span > < span class ="mi "> 2</ span > < span class ="o "> >*</ span > < span class ="w "> </ span > < span class ="n "> output</ span > < span class ="p "> )</ span > < span class ="w "> </ span > < span class ="p "> {</ span >
125+ < span class ="n "> __global__</ span > < span class ="w "> </ span > < span class ="kt "> void</ span > < span class ="w "> </ span > < span class ="n "> kernel</ span > < span class ="p "> (</ span > < span class ="n "> kf</ span > < span class ="o "> ::</ span > < span class ="n "> vec_ptr</ span > < span class ="o "> <</ span > < span class ="k "> const</ span > < span class ="w "> </ span > < span class ="n "> half</ span > < span class ="p "> ,</ span > < span class ="w "> </ span > < span class ="mi "> 2</ span > < span class ="o "> ></ span > < span class ="w "> </ span > < span class ="n "> input</ span > < span class ="p "> ,</ span > < span class ="w "> </ span > < span class ="kt "> int</ span > < span class ="w "> </ span > < span class ="n "> constant</ span > < span class ="p "> ,</ span > < span class ="w "> </ span > < span class ="n "> kf</ span > < span class ="o "> ::</ span > < span class ="n "> vec_ptr</ span > < span class ="o "> <</ span > < span class ="kt "> float</ span > < span class ="p "> ,</ span > < span class ="w "> </ span > < span class ="mi "> 2</ span > < span class ="o "> ></ span > < span class ="w "> </ span > < span class ="n "> output</ span > < span class ="p "> )</ span > < span class ="w "> </ span > < span class ="p "> {</ span >
126126< span class ="w "> </ span > < span class ="kt "> int</ span > < span class ="w "> </ span > < span class ="n "> i</ span > < span class ="w "> </ span > < span class ="o "> =</ span > < span class ="w "> </ span > < span class ="n "> blockIdx</ span > < span class ="p "> .</ span > < span class ="n "> x</ span > < span class ="w "> </ span > < span class ="o "> *</ span > < span class ="w "> </ span > < span class ="n "> blockDim</ span > < span class ="p "> .</ span > < span class ="n "> x</ span > < span class ="w "> </ span > < span class ="o "> +</ span > < span class ="w "> </ span > < span class ="n "> threadIdx</ span > < span class ="p "> .</ span > < span class ="n "> x</ span > < span class ="p "> ;</ span >
127- < span class ="w "> </ span > < span class ="n "> output</ span > < span class ="p "> [</ span > < span class ="n "> i</ span > < span class ="p "> ]</ span > < span class ="w "> </ span > < span class ="o "> =</ span > < span class ="w "> </ span > < span class ="n "> input</ span > < span class ="p "> [</ span > < span class ="n "> i</ span > < span class ="p "> ]</ span > < span class ="w "> </ span > < span class ="o "> + </ span > < span class ="w "> </ span > < span class ="n "> kf </ span > < span class =" o " > :: </ span > < span class =" n " > cast </ span > < span class =" o " > < </ span > < span class =" n " > half </ span > < span class =" o " > > </ span > < span class =" p " > ( </ span > < span class =" n " > constant</ span > < span class ="p "> ) ;</ span >
127+ < span class ="w "> </ span > < span class ="n "> output</ span > < span class ="p "> [</ span > < span class ="n "> i</ span > < span class ="p "> ]</ span > < span class ="w "> </ span > < span class ="o "> +=</ span > < span class ="w "> </ span > < span class ="n "> input</ span > < span class ="p "> [</ span > < span class ="n "> i</ span > < span class ="p "> ]</ span > < span class ="w "> </ span > < span class ="o "> *</ span > < span class ="w "> </ span > < span class ="n "> constant</ span > < span class ="p "> ;</ span >
128128< span class ="p "> }</ span >
129129</ pre > </ div >
130130</ div >
131131< p > Here is how the same kernel would look for CUDA without Kernel Float.</ p >
132- < div class ="highlight-cpp notranslate "> < div class ="highlight "> < pre > < span > </ span > < span class ="n "> __global__</ span > < span class ="w "> </ span > < span class ="kt "> void</ span > < span class ="w "> </ span > < span class ="n "> kernel</ span > < span class ="p "> (</ span > < span class ="k "> const</ span > < span class ="w "> </ span > < span class ="n "> __half </ span > < span class ="o "> *</ span > < span class ="w "> </ span > < span class ="n "> input</ span > < span class ="p "> ,</ span > < span class ="w "> </ span > < span class ="kt "> float </ span > < span class ="w "> </ span > < span class ="n "> constant</ span > < span class ="p "> ,</ span > < span class ="w "> </ span > < span class ="kt "> float</ span > < span class ="o "> *</ span > < span class ="w "> </ span > < span class ="n "> output</ span > < span class ="p "> )</ span > < span class ="w "> </ span > < span class ="p "> {</ span >
132+ < div class ="highlight-cpp notranslate "> < div class ="highlight "> < pre > < span > </ span > < span class ="n "> __global__</ span > < span class ="w "> </ span > < span class ="kt "> void</ span > < span class ="w "> </ span > < span class ="n "> kernel</ span > < span class ="p "> (</ span > < span class ="k "> const</ span > < span class ="w "> </ span > < span class ="n "> half</ span > < span class ="o "> *</ span > < span class ="w "> </ span > < span class ="n "> input</ span > < span class ="p "> ,</ span > < span class ="w "> </ span > < span class ="kt "> int</ span > < span class ="w "> </ span > < span class ="n "> constant</ span > < span class ="p "> ,</ span > < span class ="w "> </ span > < span class ="kt "> float</ span > < span class ="o "> *</ span > < span class ="w "> </ span > < span class ="n "> output</ span > < span class ="p "> )</ span > < span class ="w "> </ span > < span class ="p "> {</ span >
133133< span class ="w "> </ span > < span class ="kt "> int</ span > < span class ="w "> </ span > < span class ="n "> i</ span > < span class ="w "> </ span > < span class ="o "> =</ span > < span class ="w "> </ span > < span class ="n "> blockIdx</ span > < span class ="p "> .</ span > < span class ="n "> x</ span > < span class ="w "> </ span > < span class ="o "> *</ span > < span class ="w "> </ span > < span class ="n "> blockDim</ span > < span class ="p "> .</ span > < span class ="n "> x</ span > < span class ="w "> </ span > < span class ="o "> +</ span > < span class ="w "> </ span > < span class ="n "> threadIdx</ span > < span class ="p "> .</ span > < span class ="n "> x</ span > < span class ="p "> ;</ span >
134134< span class ="w "> </ span > < span class ="n "> __half</ span > < span class ="w "> </ span > < span class ="n "> in0</ span > < span class ="w "> </ span > < span class ="o "> =</ span > < span class ="w "> </ span > < span class ="n "> input</ span > < span class ="p "> [</ span > < span class ="mi "> 2</ span > < span class ="w "> </ span > < span class ="o "> *</ span > < span class ="w "> </ span > < span class ="n "> i</ span > < span class ="w "> </ span > < span class ="o "> +</ span > < span class ="w "> </ span > < span class ="mi "> 0</ span > < span class ="p "> ];</ span >
135135< span class ="w "> </ span > < span class ="n "> __half</ span > < span class ="w "> </ span > < span class ="n "> in1</ span > < span class ="w "> </ span > < span class ="o "> =</ span > < span class ="w "> </ span > < span class ="n "> input</ span > < span class ="p "> [</ span > < span class ="mi "> 2</ span > < span class ="w "> </ span > < span class ="o "> *</ span > < span class ="w "> </ span > < span class ="n "> i</ span > < span class ="w "> </ span > < span class ="o "> +</ span > < span class ="w "> </ span > < span class ="mi "> 1</ span > < span class ="p "> ];</ span >
136136< span class ="w "> </ span > < span class ="n "> __half2</ span > < span class ="w "> </ span > < span class ="n "> a</ span > < span class ="w "> </ span > < span class ="o "> =</ span > < span class ="w "> </ span > < span class ="n "> __halves2half2</ span > < span class ="p "> (</ span > < span class ="n "> in0</ span > < span class ="p "> ,</ span > < span class ="w "> </ span > < span class ="n "> in1</ span > < span class ="p "> );</ span >
137- < span class ="w "> </ span > < span class ="kt "> float</ span > < span class ="w "> </ span > < span class ="n "> b</ span > < span class ="w "> </ span > < span class ="o "> =</ span > < span class ="w "> </ span > < span class ="kt "> float</ span > < span class ="p "> (</ span > < span class ="n "> constant</ span > < span class ="p "> );</ span >
138- < span class ="w "> </ span > < span class ="n "> __half</ span > < span class ="w "> </ span > < span class ="n "> c</ span > < span class ="w "> </ span > < span class ="o "> =</ span > < span class ="w "> </ span > < span class ="n "> __float2half</ span > < span class ="p "> (</ span > < span class ="n "> b</ span > < span class ="p "> );</ span >
139- < span class ="w "> </ span > < span class ="n "> __half2</ span > < span class ="w "> </ span > < span class ="n "> d</ span > < span class ="w "> </ span > < span class ="o "> =</ span > < span class ="w "> </ span > < span class ="n "> __half2half2</ span > < span class ="p "> (</ span > < span class ="n "> c</ span > < span class ="p "> );</ span >
140- < span class ="w "> </ span > < span class ="n "> __half2</ span > < span class ="w "> </ span > < span class ="n "> e</ span > < span class ="w "> </ span > < span class ="o "> =</ span > < span class ="w "> </ span > < span class ="n "> __hadd2</ span > < span class ="p "> (</ span > < span class ="n "> a</ span > < span class ="p "> ,</ span > < span class ="w "> </ span > < span class ="n "> d</ span > < span class ="p "> );</ span >
141- < span class ="w "> </ span > < span class ="n "> __half</ span > < span class ="w "> </ span > < span class ="n "> f</ span > < span class ="w "> </ span > < span class ="o "> =</ span > < span class ="w "> </ span > < span class ="n "> __low2half</ span > < span class ="p "> (</ span > < span class ="n "> e</ span > < span class ="p "> );</ span >
142- < span class ="w "> </ span > < span class ="n "> __half</ span > < span class ="w "> </ span > < span class ="n "> g</ span > < span class ="w "> </ span > < span class ="o "> =</ span > < span class ="w "> </ span > < span class ="n "> __high2half</ span > < span class ="p "> (</ span > < span class ="n "> e</ span > < span class ="p "> );</ span >
143- < span class ="w "> </ span > < span class ="kt "> float</ span > < span class ="w "> </ span > < span class ="n "> out0</ span > < span class ="w "> </ span > < span class ="o "> =</ span > < span class ="w "> </ span > < span class ="n "> __half2float</ span > < span class ="p "> (</ span > < span class ="n "> f</ span > < span class ="p "> );</ span >
144- < span class ="w "> </ span > < span class ="kt "> float</ span > < span class ="w "> </ span > < span class ="n "> out1</ span > < span class ="w "> </ span > < span class ="o "> =</ span > < span class ="w "> </ span > < span class ="n "> __half2float</ span > < span class ="p "> (</ span > < span class ="n "> g</ span > < span class ="p "> );</ span >
145- < span class ="w "> </ span > < span class ="n "> output</ span > < span class ="p "> [</ span > < span class ="mi "> 2</ span > < span class ="w "> </ span > < span class ="o "> *</ span > < span class ="w "> </ span > < span class ="n "> i</ span > < span class ="w "> </ span > < span class ="o "> +</ span > < span class ="w "> </ span > < span class ="mi "> 0</ span > < span class ="p "> ]</ span > < span class ="w "> </ span > < span class ="o "> =</ span > < span class ="w "> </ span > < span class ="n "> out0</ span > < span class ="p "> ;</ span >
146- < span class ="w "> </ span > < span class ="n "> output</ span > < span class ="p "> [</ span > < span class ="mi "> 2</ span > < span class ="w "> </ span > < span class ="o "> *</ span > < span class ="w "> </ span > < span class ="n "> i</ span > < span class ="w "> </ span > < span class ="o "> +</ span > < span class ="w "> </ span > < span class ="mi "> 1</ span > < span class ="p "> ]</ span > < span class ="w "> </ span > < span class ="o "> =</ span > < span class ="w "> </ span > < span class ="n "> out1</ span > < span class ="p "> ;</ span >
137+ < span class ="w "> </ span > < span class ="n "> __half</ span > < span class ="w "> </ span > < span class ="n "> b</ span > < span class ="w "> </ span > < span class ="o "> =</ span > < span class ="w "> </ span > < span class ="n "> __int2half_rn</ span > < span class ="p "> (</ span > < span class ="n "> constant</ span > < span class ="p "> );</ span >
138+ < span class ="w "> </ span > < span class ="n "> __half2</ span > < span class ="w "> </ span > < span class ="n "> c</ span > < span class ="w "> </ span > < span class ="o "> =</ span > < span class ="w "> </ span > < span class ="n "> __half2half2</ span > < span class ="p "> (</ span > < span class ="n "> b</ span > < span class ="p "> );</ span >
139+ < span class ="w "> </ span > < span class ="n "> __half2</ span > < span class ="w "> </ span > < span class ="n "> d</ span > < span class ="w "> </ span > < span class ="o "> =</ span > < span class ="w "> </ span > < span class ="n "> __hmul2</ span > < span class ="p "> (</ span > < span class ="n "> a</ span > < span class ="p "> ,</ span > < span class ="w "> </ span > < span class ="n "> c</ span > < span class ="p "> );</ span >
140+ < span class ="w "> </ span > < span class ="n "> __half</ span > < span class ="w "> </ span > < span class ="n "> e</ span > < span class ="w "> </ span > < span class ="o "> =</ span > < span class ="w "> </ span > < span class ="n "> __low2half</ span > < span class ="p "> (</ span > < span class ="n "> d</ span > < span class ="p "> );</ span >
141+ < span class ="w "> </ span > < span class ="n "> __half</ span > < span class ="w "> </ span > < span class ="n "> f</ span > < span class ="w "> </ span > < span class ="o "> =</ span > < span class ="w "> </ span > < span class ="n "> __high2half</ span > < span class ="p "> (</ span > < span class ="n "> d</ span > < span class ="p "> );</ span >
142+ < span class ="w "> </ span > < span class ="kt "> float</ span > < span class ="w "> </ span > < span class ="n "> out0</ span > < span class ="w "> </ span > < span class ="o "> =</ span > < span class ="w "> </ span > < span class ="n "> __half2float</ span > < span class ="p "> (</ span > < span class ="n "> e</ span > < span class ="p "> );</ span >
143+ < span class ="w "> </ span > < span class ="kt "> float</ span > < span class ="w "> </ span > < span class ="n "> out1</ span > < span class ="w "> </ span > < span class ="o "> =</ span > < span class ="w "> </ span > < span class ="n "> __half2float</ span > < span class ="p "> (</ span > < span class ="n "> f</ span > < span class ="p "> );</ span >
144+ < span class ="w "> </ span > < span class ="n "> output</ span > < span class ="p "> [</ span > < span class ="mi "> 2</ span > < span class ="w "> </ span > < span class ="o "> *</ span > < span class ="w "> </ span > < span class ="n "> i</ span > < span class ="w "> </ span > < span class ="o "> +</ span > < span class ="w "> </ span > < span class ="mi "> 0</ span > < span class ="p "> ]</ span > < span class ="w "> </ span > < span class ="o "> +=</ span > < span class ="w "> </ span > < span class ="n "> out0</ span > < span class ="p "> ;</ span >
145+ < span class ="w "> </ span > < span class ="n "> output</ span > < span class ="p "> [</ span > < span class ="mi "> 2</ span > < span class ="w "> </ span > < span class ="o "> *</ span > < span class ="w "> </ span > < span class ="n "> i</ span > < span class ="w "> </ span > < span class ="o "> +</ span > < span class ="w "> </ span > < span class ="mi "> 1</ span > < span class ="p "> ]</ span > < span class ="w "> </ span > < span class ="o "> +=</ span > < span class ="w "> </ span > < span class ="n "> out1</ span > < span class ="p "> ;</ span >
147146< span class ="p "> }</ span >
148147</ pre > </ div >
149148</ div >
0 commit comments