Skip to content

Commit 97d1b13

Browse files
authored
[SYCLomatic #1107] Add 14 bf16 and 15 bf162 tests. (#411)
Signed-off-by: Tang, Jiajun [email protected]
1 parent f1f422f commit 97d1b13

File tree

6 files changed

+1001
-5
lines changed

6 files changed

+1001
-5
lines changed

features/feature_case/math/math-emu-bf16.cu

Lines changed: 332 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
//====---------- math-emu-bf16.cu- ---------- *- CUDA -* ------------------===//
1+
//====---------- math-emu-bf16.cu ----------- *- CUDA -* ------------------===//
22
//
33
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
44
// See https://llvm.org/LICENSE.txt for license information.
@@ -15,6 +15,7 @@
1515

1616
using namespace std;
1717

18+
typedef vector<__nv_bfloat16> bf16_vector;
1819
typedef pair<__nv_bfloat16, int> bf16i_pair;
1920

2021
int passed = 0;
@@ -72,6 +73,245 @@ void testHabsCases(const vector<pair<__nv_bfloat16, bf16i_pair>> &TestCases) {
7273
}
7374
}
7475

76+
__global__ void hadd(float *const Result, __nv_bfloat16 Input1,
77+
__nv_bfloat16 Input2) {
78+
*Result = __hadd(Input1, Input2);
79+
}
80+
81+
void testHaddCases(
82+
const vector<pair<pair<__nv_bfloat16, __nv_bfloat16>, bf16i_pair>>
83+
&TestCases) {
84+
float *Result;
85+
cudaMallocManaged(&Result, sizeof(*Result));
86+
for (const auto &TestCase : TestCases) {
87+
hadd<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
88+
cudaDeviceSynchronize();
89+
checkResult("__hadd", {TestCase.first.first, TestCase.first.second},
90+
TestCase.second.first, *Result, TestCase.second.second);
91+
}
92+
}
93+
94+
__global__ void hadd_rn(float *const Result, __nv_bfloat16 Input1,
95+
__nv_bfloat16 Input2) {
96+
*Result = __hadd_rn(Input1, Input2);
97+
}
98+
99+
void testHadd_rnCases(
100+
const vector<pair<pair<__nv_bfloat16, __nv_bfloat16>, bf16i_pair>>
101+
&TestCases) {
102+
float *Result;
103+
cudaMallocManaged(&Result, sizeof(*Result));
104+
for (const auto &TestCase : TestCases) {
105+
hadd_rn<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
106+
cudaDeviceSynchronize();
107+
checkResult("__hadd_rn", {TestCase.first.first, TestCase.first.second},
108+
TestCase.second.first, *Result, TestCase.second.second);
109+
}
110+
}
111+
112+
__global__ void hadd_sat(float *const Result, __nv_bfloat16 Input1,
113+
__nv_bfloat16 Input2) {
114+
*Result = __hadd_sat(Input1, Input2);
115+
}
116+
117+
void testHadd_satCases(
118+
const vector<pair<pair<__nv_bfloat16, __nv_bfloat16>, bf16i_pair>>
119+
&TestCases) {
120+
float *Result;
121+
cudaMallocManaged(&Result, sizeof(*Result));
122+
for (const auto &TestCase : TestCases) {
123+
hadd_sat<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
124+
cudaDeviceSynchronize();
125+
checkResult("__hadd_sat", {TestCase.first.first, TestCase.first.second},
126+
TestCase.second.first, *Result, TestCase.second.second);
127+
}
128+
}
129+
130+
__global__ void hdiv(float *const Result, __nv_bfloat16 Input1,
131+
__nv_bfloat16 Input2) {
132+
*Result = __hdiv(Input1, Input2);
133+
}
134+
135+
void testHdivCases(
136+
const vector<pair<pair<__nv_bfloat16, __nv_bfloat16>, bf16i_pair>>
137+
&TestCases) {
138+
float *Result;
139+
cudaMallocManaged(&Result, sizeof(*Result));
140+
for (const auto &TestCase : TestCases) {
141+
hdiv<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
142+
cudaDeviceSynchronize();
143+
checkResult("__hdiv", {TestCase.first.first, TestCase.first.second},
144+
TestCase.second.first, *Result, TestCase.second.second);
145+
}
146+
}
147+
148+
__global__ void hfma(float *const Result, __nv_bfloat16 Input1,
149+
__nv_bfloat16 Input2, __nv_bfloat16 Input3) {
150+
*Result = __hfma(Input1, Input2, Input3);
151+
}
152+
153+
void testHfmaCases(const vector<pair<bf16_vector, bf16i_pair>> &TestCases) {
154+
float *Result;
155+
cudaMallocManaged(&Result, sizeof(*Result));
156+
for (const auto &TestCase : TestCases) {
157+
hfma<<<1, 1>>>(Result, TestCase.first[0], TestCase.first[1],
158+
TestCase.first[2]);
159+
cudaDeviceSynchronize();
160+
checkResult("__hfma", TestCase.first, TestCase.second.first, *Result,
161+
TestCase.second.second);
162+
if (TestCase.first.size() != 3) {
163+
failed++;
164+
cout << " ---- failed" << endl;
165+
return;
166+
}
167+
}
168+
}
169+
170+
__global__ void hfma_sat(float *const Result, __nv_bfloat16 Input1,
171+
__nv_bfloat16 Input2, __nv_bfloat16 Input3) {
172+
*Result = __hfma_sat(Input1, Input2, Input3);
173+
}
174+
175+
void testHfma_satCases(const vector<pair<bf16_vector, bf16i_pair>> &TestCases) {
176+
float *Result;
177+
cudaMallocManaged(&Result, sizeof(*Result));
178+
for (const auto &TestCase : TestCases) {
179+
hfma_sat<<<1, 1>>>(Result, TestCase.first[0], TestCase.first[1],
180+
TestCase.first[2]);
181+
cudaDeviceSynchronize();
182+
checkResult("__hfma_sat", TestCase.first, TestCase.second.first, *Result,
183+
TestCase.second.second);
184+
if (TestCase.first.size() != 3) {
185+
failed++;
186+
cout << " ---- failed" << endl;
187+
return;
188+
}
189+
}
190+
}
191+
192+
__global__ void hmul(float *const Result, __nv_bfloat16 Input1,
193+
__nv_bfloat16 Input2) {
194+
*Result = __hmul(Input1, Input2);
195+
}
196+
197+
void testHmulCases(
198+
const vector<pair<pair<__nv_bfloat16, __nv_bfloat16>, bf16i_pair>>
199+
&TestCases) {
200+
float *Result;
201+
cudaMallocManaged(&Result, sizeof(*Result));
202+
for (const auto &TestCase : TestCases) {
203+
hmul<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
204+
cudaDeviceSynchronize();
205+
checkResult("__hmul", {TestCase.first.first, TestCase.first.second},
206+
TestCase.second.first, *Result, TestCase.second.second);
207+
}
208+
}
209+
210+
__global__ void hmul_rn(float *const Result, __nv_bfloat16 Input1,
211+
__nv_bfloat16 Input2) {
212+
*Result = __hmul_rn(Input1, Input2);
213+
}
214+
215+
void testHmul_rnCases(
216+
const vector<pair<pair<__nv_bfloat16, __nv_bfloat16>, bf16i_pair>>
217+
&TestCases) {
218+
float *Result;
219+
cudaMallocManaged(&Result, sizeof(*Result));
220+
for (const auto &TestCase : TestCases) {
221+
hmul_rn<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
222+
cudaDeviceSynchronize();
223+
checkResult("__hmul_rn", {TestCase.first.first, TestCase.first.second},
224+
TestCase.second.first, *Result, TestCase.second.second);
225+
}
226+
}
227+
228+
__global__ void hmul_sat(float *const Result, __nv_bfloat16 Input1,
229+
__nv_bfloat16 Input2) {
230+
*Result = __hmul_sat(Input1, Input2);
231+
}
232+
233+
void testHmul_satCases(
234+
const vector<pair<pair<__nv_bfloat16, __nv_bfloat16>, bf16i_pair>>
235+
&TestCases) {
236+
float *Result;
237+
cudaMallocManaged(&Result, sizeof(*Result));
238+
for (const auto &TestCase : TestCases) {
239+
hmul_sat<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
240+
cudaDeviceSynchronize();
241+
checkResult("__hmul_sat", {TestCase.first.first, TestCase.first.second},
242+
TestCase.second.first, *Result, TestCase.second.second);
243+
}
244+
}
245+
246+
__global__ void hneg(float *const Result, __nv_bfloat16 Input1) {
247+
*Result = __hneg(Input1);
248+
}
249+
250+
void testHnegCases(const vector<pair<__nv_bfloat16, bf16i_pair>> &TestCases) {
251+
float *Result;
252+
cudaMallocManaged(&Result, sizeof(*Result));
253+
for (const auto &TestCase : TestCases) {
254+
hneg<<<1, 1>>>(Result, TestCase.first);
255+
cudaDeviceSynchronize();
256+
checkResult("__hneg", {TestCase.first}, TestCase.second.first, *Result,
257+
TestCase.second.second);
258+
}
259+
}
260+
261+
__global__ void hsub(float *const Result, __nv_bfloat16 Input1,
262+
__nv_bfloat16 Input2) {
263+
*Result = __hsub(Input1, Input2);
264+
}
265+
266+
void testHsubCases(
267+
const vector<pair<pair<__nv_bfloat16, __nv_bfloat16>, bf16i_pair>>
268+
&TestCases) {
269+
float *Result;
270+
cudaMallocManaged(&Result, sizeof(*Result));
271+
for (const auto &TestCase : TestCases) {
272+
hsub<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
273+
cudaDeviceSynchronize();
274+
checkResult("__hsub", {TestCase.first.first, TestCase.first.second},
275+
TestCase.second.first, *Result, TestCase.second.second);
276+
}
277+
}
278+
279+
__global__ void hsub_rn(float *const Result, __nv_bfloat16 Input1,
280+
__nv_bfloat16 Input2) {
281+
*Result = __hsub_rn(Input1, Input2);
282+
}
283+
284+
void testHsub_rnCases(
285+
const vector<pair<pair<__nv_bfloat16, __nv_bfloat16>, bf16i_pair>>
286+
&TestCases) {
287+
float *Result;
288+
cudaMallocManaged(&Result, sizeof(*Result));
289+
for (const auto &TestCase : TestCases) {
290+
hsub_rn<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
291+
cudaDeviceSynchronize();
292+
checkResult("__hsub_rn", {TestCase.first.first, TestCase.first.second},
293+
TestCase.second.first, *Result, TestCase.second.second);
294+
}
295+
}
296+
297+
__global__ void hsub_sat(float *const Result, __nv_bfloat16 Input1,
298+
__nv_bfloat16 Input2) {
299+
*Result = __hsub_sat(Input1, Input2);
300+
}
301+
302+
void testHsub_satCases(
303+
const vector<pair<pair<__nv_bfloat16, __nv_bfloat16>, bf16i_pair>>
304+
&TestCases) {
305+
float *Result;
306+
cudaMallocManaged(&Result, sizeof(*Result));
307+
for (const auto &TestCase : TestCases) {
308+
hsub_sat<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
309+
cudaDeviceSynchronize();
310+
checkResult("__hsub_sat", {TestCase.first.first, TestCase.first.second},
311+
TestCase.second.first, *Result, TestCase.second.second);
312+
}
313+
}
314+
75315
int main() {
76316
testHabsCases({
77317
{{-0.3}, {0.30078125, 16}},
@@ -80,6 +320,97 @@ int main() {
80320
{{0.4}, {0.400390625, 16}},
81321
{{6}, {6, 15}},
82322
});
323+
testHaddCases({
324+
{{-0.3, -0.4}, {-0.703125, 16}},
325+
{{0.3, -0.4}, {-0.099609375, 17}},
326+
{{0.3, 0.4}, {0.703125, 16}},
327+
{{0.3, 0.8}, {1.1015625, 15}},
328+
{{3, 4}, {7, 15}},
329+
});
330+
testHadd_rnCases({
331+
{{-0.3, -0.4}, {-0.703125, 16}},
332+
{{0.3, -0.4}, {-0.099609375, 17}},
333+
{{0.3, 0.4}, {0.703125, 16}},
334+
{{0.3, 0.8}, {1.1015625, 15}},
335+
{{3, 4}, {7, 15}},
336+
});
337+
testHadd_satCases({
338+
{{-0.3, -0.4}, {0, 37}},
339+
{{0.3, -0.4}, {0, 37}},
340+
{{0.3, 0.4}, {0.703125, 16}},
341+
{{0.3, 0.8}, {1, 15}},
342+
{{3, 4}, {1, 15}},
343+
});
344+
testHdivCases({
345+
{{-0.3, -0.4}, {0.75, 16}},
346+
{{0.3, -0.4}, {-0.75, 16}},
347+
{{0.3, 0.4}, {0.75, 16}},
348+
{{0.3, 0.8}, {0.375, 16}},
349+
{{3, 4}, {0.75, 16}},
350+
});
351+
testHfmaCases({
352+
{{-0.3, -0.4, -0.2}, {-0.07958984375, 17}},
353+
{{0.3, -0.4, -0.1}, {-0.220703125, 16}},
354+
{{0.3, 0.4, 0.1}, {0.220703125, 16}},
355+
{{0.3, 0.4, 0}, {0.12060546875, 17}},
356+
{{3, 4, 5}, {17, 14}},
357+
});
358+
testHfma_satCases({
359+
{{-0.3, -0.4, -0.2}, {0, 37}},
360+
{{0.3, -0.4, -0.1}, {0, 37}},
361+
{{0.3, 0.4, 0.1}, {0.220703125, 16}},
362+
{{0.3, 0.4, 0}, {0.12060546875, 17}},
363+
{{3, 4, 5}, {1, 15}},
364+
});
365+
testHmulCases({
366+
{{-0.3, -0.4}, {0.12060546875, 17}},
367+
{{0.3, -0.4}, {-0.12060546875, 17}},
368+
{{0.3, 0.4}, {0.12060546875, 17}},
369+
{{0.3, 0.8}, {0.2412109375, 16}},
370+
{{3, 4}, {12, 15}},
371+
});
372+
testHmul_rnCases({
373+
{{-0.3, -0.4}, {0.12060546875, 17}},
374+
{{0.3, -0.4}, {-0.12060546875, 17}},
375+
{{0.3, 0.4}, {0.12060546875, 17}},
376+
{{0.3, 0.8}, {0.2412109375, 16}},
377+
{{3, 4}, {12, 15}},
378+
});
379+
testHmul_satCases({
380+
{{-0.3, -0.4}, {0.12060546875, 17}},
381+
{{0.3, -0.4}, {0, 37}},
382+
{{0.3, 0.4}, {0.12060546875, 17}},
383+
{{0.3, 0.8}, {0.2412109375, 16}},
384+
{{3, 4}, {1, 15}},
385+
});
386+
testHnegCases({
387+
{{-0.3}, {0.30078125, 16}},
388+
{{0.3}, {-0.30078125, 16}},
389+
{{0.5}, {-0.5, 16}},
390+
{{0.4}, {-0.400390625, 16}},
391+
{{6}, {-6, 15}},
392+
});
393+
testHsubCases({
394+
{{-0.3, -0.4}, {0.099609375, 17}},
395+
{{0.3, -0.4}, {0.703125, 16}},
396+
{{0.3, 0.4}, {-0.099609375, 17}},
397+
{{0.3, -0.8}, {1.1015625, 15}},
398+
{{3, 4}, {-1, 15}},
399+
});
400+
testHsub_rnCases({
401+
{{-0.3, -0.4}, {0.099609375, 17}},
402+
{{0.3, -0.4}, {0.703125, 16}},
403+
{{0.3, 0.4}, {-0.099609375, 17}},
404+
{{0.3, -0.8}, {1.1015625, 15}},
405+
{{3, 4}, {-1, 15}},
406+
});
407+
testHsub_satCases({
408+
{{-0.3, -0.4}, {0.099609375, 17}},
409+
{{0.3, -0.4}, {0.703125, 16}},
410+
{{0.3, 0.4}, {0, 37}},
411+
{{0.3, -0.8}, {1, 15}},
412+
{{3, 4}, {0, 37}},
413+
});
83414
cout << "passed " << passed << "/" << passed + failed << " cases!" << endl;
84415
if (failed) {
85416
cout << "failed!" << endl;

0 commit comments

Comments
 (0)