1
- // ====---------- math-emu-bf16.cu- ---------- *- CUDA -* ------------------===//
1
+ // ====---------- math-emu-bf16.cu - ---------- *- CUDA -* ------------------===//
2
2
//
3
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
4
// See https://llvm.org/LICENSE.txt for license information.
15
15
16
16
using namespace std ;
17
17
18
+ typedef vector<__nv_bfloat16> bf16_vector;
18
19
typedef pair<__nv_bfloat16, int > bf16i_pair;
19
20
20
21
int passed = 0 ;
@@ -72,6 +73,245 @@ void testHabsCases(const vector<pair<__nv_bfloat16, bf16i_pair>> &TestCases) {
72
73
}
73
74
}
74
75
76
+ __global__ void hadd (float *const Result, __nv_bfloat16 Input1,
77
+ __nv_bfloat16 Input2) {
78
+ *Result = __hadd (Input1, Input2);
79
+ }
80
+
81
+ void testHaddCases (
82
+ const vector<pair<pair<__nv_bfloat16, __nv_bfloat16>, bf16i_pair>>
83
+ &TestCases) {
84
+ float *Result;
85
+ cudaMallocManaged (&Result, sizeof (*Result));
86
+ for (const auto &TestCase : TestCases) {
87
+ hadd<<<1 , 1 >>> (Result, TestCase.first .first , TestCase.first .second );
88
+ cudaDeviceSynchronize ();
89
+ checkResult (" __hadd" , {TestCase.first .first , TestCase.first .second },
90
+ TestCase.second .first , *Result, TestCase.second .second );
91
+ }
92
+ }
93
+
94
+ __global__ void hadd_rn (float *const Result, __nv_bfloat16 Input1,
95
+ __nv_bfloat16 Input2) {
96
+ *Result = __hadd_rn (Input1, Input2);
97
+ }
98
+
99
+ void testHadd_rnCases (
100
+ const vector<pair<pair<__nv_bfloat16, __nv_bfloat16>, bf16i_pair>>
101
+ &TestCases) {
102
+ float *Result;
103
+ cudaMallocManaged (&Result, sizeof (*Result));
104
+ for (const auto &TestCase : TestCases) {
105
+ hadd_rn<<<1 , 1 >>> (Result, TestCase.first .first , TestCase.first .second );
106
+ cudaDeviceSynchronize ();
107
+ checkResult (" __hadd_rn" , {TestCase.first .first , TestCase.first .second },
108
+ TestCase.second .first , *Result, TestCase.second .second );
109
+ }
110
+ }
111
+
112
+ __global__ void hadd_sat (float *const Result, __nv_bfloat16 Input1,
113
+ __nv_bfloat16 Input2) {
114
+ *Result = __hadd_sat (Input1, Input2);
115
+ }
116
+
117
+ void testHadd_satCases (
118
+ const vector<pair<pair<__nv_bfloat16, __nv_bfloat16>, bf16i_pair>>
119
+ &TestCases) {
120
+ float *Result;
121
+ cudaMallocManaged (&Result, sizeof (*Result));
122
+ for (const auto &TestCase : TestCases) {
123
+ hadd_sat<<<1 , 1 >>> (Result, TestCase.first .first , TestCase.first .second );
124
+ cudaDeviceSynchronize ();
125
+ checkResult (" __hadd_sat" , {TestCase.first .first , TestCase.first .second },
126
+ TestCase.second .first , *Result, TestCase.second .second );
127
+ }
128
+ }
129
+
130
+ __global__ void hdiv (float *const Result, __nv_bfloat16 Input1,
131
+ __nv_bfloat16 Input2) {
132
+ *Result = __hdiv (Input1, Input2);
133
+ }
134
+
135
+ void testHdivCases (
136
+ const vector<pair<pair<__nv_bfloat16, __nv_bfloat16>, bf16i_pair>>
137
+ &TestCases) {
138
+ float *Result;
139
+ cudaMallocManaged (&Result, sizeof (*Result));
140
+ for (const auto &TestCase : TestCases) {
141
+ hdiv<<<1 , 1 >>> (Result, TestCase.first .first , TestCase.first .second );
142
+ cudaDeviceSynchronize ();
143
+ checkResult (" __hdiv" , {TestCase.first .first , TestCase.first .second },
144
+ TestCase.second .first , *Result, TestCase.second .second );
145
+ }
146
+ }
147
+
148
+ __global__ void hfma (float *const Result, __nv_bfloat16 Input1,
149
+ __nv_bfloat16 Input2, __nv_bfloat16 Input3) {
150
+ *Result = __hfma (Input1, Input2, Input3);
151
+ }
152
+
153
+ void testHfmaCases (const vector<pair<bf16_vector, bf16i_pair>> &TestCases) {
154
+ float *Result;
155
+ cudaMallocManaged (&Result, sizeof (*Result));
156
+ for (const auto &TestCase : TestCases) {
157
+ hfma<<<1 , 1 >>> (Result, TestCase.first [0 ], TestCase.first [1 ],
158
+ TestCase.first [2 ]);
159
+ cudaDeviceSynchronize ();
160
+ checkResult (" __hfma" , TestCase.first , TestCase.second .first , *Result,
161
+ TestCase.second .second );
162
+ if (TestCase.first .size () != 3 ) {
163
+ failed++;
164
+ cout << " ---- failed" << endl;
165
+ return ;
166
+ }
167
+ }
168
+ }
169
+
170
+ __global__ void hfma_sat (float *const Result, __nv_bfloat16 Input1,
171
+ __nv_bfloat16 Input2, __nv_bfloat16 Input3) {
172
+ *Result = __hfma_sat (Input1, Input2, Input3);
173
+ }
174
+
175
+ void testHfma_satCases (const vector<pair<bf16_vector, bf16i_pair>> &TestCases) {
176
+ float *Result;
177
+ cudaMallocManaged (&Result, sizeof (*Result));
178
+ for (const auto &TestCase : TestCases) {
179
+ hfma_sat<<<1 , 1 >>> (Result, TestCase.first [0 ], TestCase.first [1 ],
180
+ TestCase.first [2 ]);
181
+ cudaDeviceSynchronize ();
182
+ checkResult (" __hfma_sat" , TestCase.first , TestCase.second .first , *Result,
183
+ TestCase.second .second );
184
+ if (TestCase.first .size () != 3 ) {
185
+ failed++;
186
+ cout << " ---- failed" << endl;
187
+ return ;
188
+ }
189
+ }
190
+ }
191
+
192
+ __global__ void hmul (float *const Result, __nv_bfloat16 Input1,
193
+ __nv_bfloat16 Input2) {
194
+ *Result = __hmul (Input1, Input2);
195
+ }
196
+
197
+ void testHmulCases (
198
+ const vector<pair<pair<__nv_bfloat16, __nv_bfloat16>, bf16i_pair>>
199
+ &TestCases) {
200
+ float *Result;
201
+ cudaMallocManaged (&Result, sizeof (*Result));
202
+ for (const auto &TestCase : TestCases) {
203
+ hmul<<<1 , 1 >>> (Result, TestCase.first .first , TestCase.first .second );
204
+ cudaDeviceSynchronize ();
205
+ checkResult (" __hmul" , {TestCase.first .first , TestCase.first .second },
206
+ TestCase.second .first , *Result, TestCase.second .second );
207
+ }
208
+ }
209
+
210
+ __global__ void hmul_rn (float *const Result, __nv_bfloat16 Input1,
211
+ __nv_bfloat16 Input2) {
212
+ *Result = __hmul_rn (Input1, Input2);
213
+ }
214
+
215
+ void testHmul_rnCases (
216
+ const vector<pair<pair<__nv_bfloat16, __nv_bfloat16>, bf16i_pair>>
217
+ &TestCases) {
218
+ float *Result;
219
+ cudaMallocManaged (&Result, sizeof (*Result));
220
+ for (const auto &TestCase : TestCases) {
221
+ hmul_rn<<<1 , 1 >>> (Result, TestCase.first .first , TestCase.first .second );
222
+ cudaDeviceSynchronize ();
223
+ checkResult (" __hmul_rn" , {TestCase.first .first , TestCase.first .second },
224
+ TestCase.second .first , *Result, TestCase.second .second );
225
+ }
226
+ }
227
+
228
+ __global__ void hmul_sat (float *const Result, __nv_bfloat16 Input1,
229
+ __nv_bfloat16 Input2) {
230
+ *Result = __hmul_sat (Input1, Input2);
231
+ }
232
+
233
+ void testHmul_satCases (
234
+ const vector<pair<pair<__nv_bfloat16, __nv_bfloat16>, bf16i_pair>>
235
+ &TestCases) {
236
+ float *Result;
237
+ cudaMallocManaged (&Result, sizeof (*Result));
238
+ for (const auto &TestCase : TestCases) {
239
+ hmul_sat<<<1 , 1 >>> (Result, TestCase.first .first , TestCase.first .second );
240
+ cudaDeviceSynchronize ();
241
+ checkResult (" __hmul_sat" , {TestCase.first .first , TestCase.first .second },
242
+ TestCase.second .first , *Result, TestCase.second .second );
243
+ }
244
+ }
245
+
246
+ __global__ void hneg (float *const Result, __nv_bfloat16 Input1) {
247
+ *Result = __hneg (Input1);
248
+ }
249
+
250
+ void testHnegCases (const vector<pair<__nv_bfloat16, bf16i_pair>> &TestCases) {
251
+ float *Result;
252
+ cudaMallocManaged (&Result, sizeof (*Result));
253
+ for (const auto &TestCase : TestCases) {
254
+ hneg<<<1 , 1 >>> (Result, TestCase.first );
255
+ cudaDeviceSynchronize ();
256
+ checkResult (" __hneg" , {TestCase.first }, TestCase.second .first , *Result,
257
+ TestCase.second .second );
258
+ }
259
+ }
260
+
261
+ __global__ void hsub (float *const Result, __nv_bfloat16 Input1,
262
+ __nv_bfloat16 Input2) {
263
+ *Result = __hsub (Input1, Input2);
264
+ }
265
+
266
+ void testHsubCases (
267
+ const vector<pair<pair<__nv_bfloat16, __nv_bfloat16>, bf16i_pair>>
268
+ &TestCases) {
269
+ float *Result;
270
+ cudaMallocManaged (&Result, sizeof (*Result));
271
+ for (const auto &TestCase : TestCases) {
272
+ hsub<<<1 , 1 >>> (Result, TestCase.first .first , TestCase.first .second );
273
+ cudaDeviceSynchronize ();
274
+ checkResult (" __hsub" , {TestCase.first .first , TestCase.first .second },
275
+ TestCase.second .first , *Result, TestCase.second .second );
276
+ }
277
+ }
278
+
279
+ __global__ void hsub_rn (float *const Result, __nv_bfloat16 Input1,
280
+ __nv_bfloat16 Input2) {
281
+ *Result = __hsub_rn (Input1, Input2);
282
+ }
283
+
284
+ void testHsub_rnCases (
285
+ const vector<pair<pair<__nv_bfloat16, __nv_bfloat16>, bf16i_pair>>
286
+ &TestCases) {
287
+ float *Result;
288
+ cudaMallocManaged (&Result, sizeof (*Result));
289
+ for (const auto &TestCase : TestCases) {
290
+ hsub_rn<<<1 , 1 >>> (Result, TestCase.first .first , TestCase.first .second );
291
+ cudaDeviceSynchronize ();
292
+ checkResult (" __hsub_rn" , {TestCase.first .first , TestCase.first .second },
293
+ TestCase.second .first , *Result, TestCase.second .second );
294
+ }
295
+ }
296
+
297
+ __global__ void hsub_sat (float *const Result, __nv_bfloat16 Input1,
298
+ __nv_bfloat16 Input2) {
299
+ *Result = __hsub_sat (Input1, Input2);
300
+ }
301
+
302
+ void testHsub_satCases (
303
+ const vector<pair<pair<__nv_bfloat16, __nv_bfloat16>, bf16i_pair>>
304
+ &TestCases) {
305
+ float *Result;
306
+ cudaMallocManaged (&Result, sizeof (*Result));
307
+ for (const auto &TestCase : TestCases) {
308
+ hsub_sat<<<1 , 1 >>> (Result, TestCase.first .first , TestCase.first .second );
309
+ cudaDeviceSynchronize ();
310
+ checkResult (" __hsub_sat" , {TestCase.first .first , TestCase.first .second },
311
+ TestCase.second .first , *Result, TestCase.second .second );
312
+ }
313
+ }
314
+
75
315
int main () {
76
316
testHabsCases ({
77
317
{{-0.3 }, {0.30078125 , 16 }},
@@ -80,6 +320,97 @@ int main() {
80
320
{{0.4 }, {0.400390625 , 16 }},
81
321
{{6 }, {6 , 15 }},
82
322
});
323
+ testHaddCases ({
324
+ {{-0.3 , -0.4 }, {-0.703125 , 16 }},
325
+ {{0.3 , -0.4 }, {-0.099609375 , 17 }},
326
+ {{0.3 , 0.4 }, {0.703125 , 16 }},
327
+ {{0.3 , 0.8 }, {1.1015625 , 15 }},
328
+ {{3 , 4 }, {7 , 15 }},
329
+ });
330
+ testHadd_rnCases ({
331
+ {{-0.3 , -0.4 }, {-0.703125 , 16 }},
332
+ {{0.3 , -0.4 }, {-0.099609375 , 17 }},
333
+ {{0.3 , 0.4 }, {0.703125 , 16 }},
334
+ {{0.3 , 0.8 }, {1.1015625 , 15 }},
335
+ {{3 , 4 }, {7 , 15 }},
336
+ });
337
+ testHadd_satCases ({
338
+ {{-0.3 , -0.4 }, {0 , 37 }},
339
+ {{0.3 , -0.4 }, {0 , 37 }},
340
+ {{0.3 , 0.4 }, {0.703125 , 16 }},
341
+ {{0.3 , 0.8 }, {1 , 15 }},
342
+ {{3 , 4 }, {1 , 15 }},
343
+ });
344
+ testHdivCases ({
345
+ {{-0.3 , -0.4 }, {0.75 , 16 }},
346
+ {{0.3 , -0.4 }, {-0.75 , 16 }},
347
+ {{0.3 , 0.4 }, {0.75 , 16 }},
348
+ {{0.3 , 0.8 }, {0.375 , 16 }},
349
+ {{3 , 4 }, {0.75 , 16 }},
350
+ });
351
+ testHfmaCases ({
352
+ {{-0.3 , -0.4 , -0.2 }, {-0.07958984375 , 17 }},
353
+ {{0.3 , -0.4 , -0.1 }, {-0.220703125 , 16 }},
354
+ {{0.3 , 0.4 , 0.1 }, {0.220703125 , 16 }},
355
+ {{0.3 , 0.4 , 0 }, {0.12060546875 , 17 }},
356
+ {{3 , 4 , 5 }, {17 , 14 }},
357
+ });
358
+ testHfma_satCases ({
359
+ {{-0.3 , -0.4 , -0.2 }, {0 , 37 }},
360
+ {{0.3 , -0.4 , -0.1 }, {0 , 37 }},
361
+ {{0.3 , 0.4 , 0.1 }, {0.220703125 , 16 }},
362
+ {{0.3 , 0.4 , 0 }, {0.12060546875 , 17 }},
363
+ {{3 , 4 , 5 }, {1 , 15 }},
364
+ });
365
+ testHmulCases ({
366
+ {{-0.3 , -0.4 }, {0.12060546875 , 17 }},
367
+ {{0.3 , -0.4 }, {-0.12060546875 , 17 }},
368
+ {{0.3 , 0.4 }, {0.12060546875 , 17 }},
369
+ {{0.3 , 0.8 }, {0.2412109375 , 16 }},
370
+ {{3 , 4 }, {12 , 15 }},
371
+ });
372
+ testHmul_rnCases ({
373
+ {{-0.3 , -0.4 }, {0.12060546875 , 17 }},
374
+ {{0.3 , -0.4 }, {-0.12060546875 , 17 }},
375
+ {{0.3 , 0.4 }, {0.12060546875 , 17 }},
376
+ {{0.3 , 0.8 }, {0.2412109375 , 16 }},
377
+ {{3 , 4 }, {12 , 15 }},
378
+ });
379
+ testHmul_satCases ({
380
+ {{-0.3 , -0.4 }, {0.12060546875 , 17 }},
381
+ {{0.3 , -0.4 }, {0 , 37 }},
382
+ {{0.3 , 0.4 }, {0.12060546875 , 17 }},
383
+ {{0.3 , 0.8 }, {0.2412109375 , 16 }},
384
+ {{3 , 4 }, {1 , 15 }},
385
+ });
386
+ testHnegCases ({
387
+ {{-0.3 }, {0.30078125 , 16 }},
388
+ {{0.3 }, {-0.30078125 , 16 }},
389
+ {{0.5 }, {-0.5 , 16 }},
390
+ {{0.4 }, {-0.400390625 , 16 }},
391
+ {{6 }, {-6 , 15 }},
392
+ });
393
+ testHsubCases ({
394
+ {{-0.3 , -0.4 }, {0.099609375 , 17 }},
395
+ {{0.3 , -0.4 }, {0.703125 , 16 }},
396
+ {{0.3 , 0.4 }, {-0.099609375 , 17 }},
397
+ {{0.3 , -0.8 }, {1.1015625 , 15 }},
398
+ {{3 , 4 }, {-1 , 15 }},
399
+ });
400
+ testHsub_rnCases ({
401
+ {{-0.3 , -0.4 }, {0.099609375 , 17 }},
402
+ {{0.3 , -0.4 }, {0.703125 , 16 }},
403
+ {{0.3 , 0.4 }, {-0.099609375 , 17 }},
404
+ {{0.3 , -0.8 }, {1.1015625 , 15 }},
405
+ {{3 , 4 }, {-1 , 15 }},
406
+ });
407
+ testHsub_satCases ({
408
+ {{-0.3 , -0.4 }, {0.099609375 , 17 }},
409
+ {{0.3 , -0.4 }, {0.703125 , 16 }},
410
+ {{0.3 , 0.4 }, {0 , 37 }},
411
+ {{0.3 , -0.8 }, {1 , 15 }},
412
+ {{3 , 4 }, {0 , 37 }},
413
+ });
83
414
cout << " passed " << passed << " /" << passed + failed << " cases!" << endl;
84
415
if (failed) {
85
416
cout << " failed!" << endl;
0 commit comments