@@ -46,58 +46,52 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
4646; CHECKPTX71-LABEL: test(
4747; CHECKPTX71: {
4848; CHECKPTX71-NEXT: .reg .pred %p<5>;
49- ; CHECKPTX71-NEXT: .reg .b16 %rs<22 >;
49+ ; CHECKPTX71-NEXT: .reg .b16 %rs<26 >;
5050; CHECKPTX71-NEXT: .reg .b32 %r<4>;
51- ; CHECKPTX71-NEXT: .reg .f32 %f<12>;
5251; CHECKPTX71-EMPTY:
5352; CHECKPTX71-NEXT: // %bb.0:
5453; CHECKPTX71-NEXT: ld.param.b16 %rs13, [test_param_3];
5554; CHECKPTX71-NEXT: ld.param.u32 %r3, [test_param_2];
5655; CHECKPTX71-NEXT: ld.param.u32 %r2, [test_param_1];
5756; CHECKPTX71-NEXT: ld.param.u32 %r1, [test_param_0];
58- ; CHECKPTX71-NEXT: ld.b16 %rs18, [%r1];
59- ; CHECKPTX71-NEXT: cvt.f32.bf16 %f1, %rs13;
57+ ; CHECKPTX71-NEXT: ld.b16 %rs22, [%r1];
6058; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start14
6159; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
62- ; CHECKPTX71-NEXT: cvt.f32.bf16 %f2, %rs18;
63- ; CHECKPTX71-NEXT: add.rn.f32 %f3, %f2, %f1;
64- ; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs14, %f3;
65- ; CHECKPTX71-NEXT: atom.cas.b16 %rs3, [%r1], %rs18, %rs14;
66- ; CHECKPTX71-NEXT: setp.ne.s16 %p1, %rs3, %rs18;
67- ; CHECKPTX71-NEXT: mov.u16 %rs18, %rs3;
60+ ; CHECKPTX71-NEXT: mov.b16 %rs14, 0x3F80;
61+ ; CHECKPTX71-NEXT: fma.rn.bf16 %rs15, %rs22, %rs14, %rs13;
62+ ; CHECKPTX71-NEXT: atom.cas.b16 %rs3, [%r1], %rs22, %rs15;
63+ ; CHECKPTX71-NEXT: setp.ne.s16 %p1, %rs3, %rs22;
64+ ; CHECKPTX71-NEXT: mov.u16 %rs22, %rs3;
6865; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1;
6966; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end13
70- ; CHECKPTX71-NEXT: ld.b16 %rs19 , [%r1];
67+ ; CHECKPTX71-NEXT: ld.b16 %rs23 , [%r1];
7168; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start8
7269; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
73- ; CHECKPTX71-NEXT: cvt.f32.bf16 %f4, %rs19;
74- ; CHECKPTX71-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
75- ; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs15, %f5;
76- ; CHECKPTX71-NEXT: atom.cas.b16 %rs6, [%r1], %rs19, %rs15;
77- ; CHECKPTX71-NEXT: setp.ne.s16 %p2, %rs6, %rs19;
78- ; CHECKPTX71-NEXT: mov.u16 %rs19, %rs6;
70+ ; CHECKPTX71-NEXT: mov.b16 %rs16, 0x3F80;
71+ ; CHECKPTX71-NEXT: fma.rn.bf16 %rs17, %rs23, %rs16, %rs16;
72+ ; CHECKPTX71-NEXT: atom.cas.b16 %rs6, [%r1], %rs23, %rs17;
73+ ; CHECKPTX71-NEXT: setp.ne.s16 %p2, %rs6, %rs23;
74+ ; CHECKPTX71-NEXT: mov.u16 %rs23, %rs6;
7975; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3;
8076; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end7
81- ; CHECKPTX71-NEXT: ld.global.b16 %rs20 , [%r2];
77+ ; CHECKPTX71-NEXT: ld.global.b16 %rs24 , [%r2];
8278; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start2
8379; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
84- ; CHECKPTX71-NEXT: cvt.f32.bf16 %f7, %rs20;
85- ; CHECKPTX71-NEXT: add.rn.f32 %f8, %f7, %f1;
86- ; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs16, %f8;
87- ; CHECKPTX71-NEXT: atom.global.cas.b16 %rs9, [%r2], %rs20, %rs16;
88- ; CHECKPTX71-NEXT: setp.ne.s16 %p3, %rs9, %rs20;
89- ; CHECKPTX71-NEXT: mov.u16 %rs20, %rs9;
80+ ; CHECKPTX71-NEXT: mov.b16 %rs18, 0x3F80;
81+ ; CHECKPTX71-NEXT: fma.rn.bf16 %rs19, %rs24, %rs18, %rs13;
82+ ; CHECKPTX71-NEXT: atom.global.cas.b16 %rs9, [%r2], %rs24, %rs19;
83+ ; CHECKPTX71-NEXT: setp.ne.s16 %p3, %rs9, %rs24;
84+ ; CHECKPTX71-NEXT: mov.u16 %rs24, %rs9;
9085; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5;
9186; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end1
92- ; CHECKPTX71-NEXT: ld.shared.b16 %rs21 , [%r3];
87+ ; CHECKPTX71-NEXT: ld.shared.b16 %rs25 , [%r3];
9388; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start
9489; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
95- ; CHECKPTX71-NEXT: cvt.f32.bf16 %f10, %rs21;
96- ; CHECKPTX71-NEXT: add.rn.f32 %f11, %f10, %f1;
97- ; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs17, %f11;
98- ; CHECKPTX71-NEXT: atom.shared.cas.b16 %rs12, [%r3], %rs21, %rs17;
99- ; CHECKPTX71-NEXT: setp.ne.s16 %p4, %rs12, %rs21;
100- ; CHECKPTX71-NEXT: mov.u16 %rs21, %rs12;
90+ ; CHECKPTX71-NEXT: mov.b16 %rs20, 0x3F80;
91+ ; CHECKPTX71-NEXT: fma.rn.bf16 %rs21, %rs25, %rs20, %rs13;
92+ ; CHECKPTX71-NEXT: atom.shared.cas.b16 %rs12, [%r3], %rs25, %rs21;
93+ ; CHECKPTX71-NEXT: setp.ne.s16 %p4, %rs12, %rs25;
94+ ; CHECKPTX71-NEXT: mov.u16 %rs25, %rs12;
10195; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7;
10296; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end
10397; CHECKPTX71-NEXT: ret;
0 commit comments