@@ -45,102 +45,62 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
4545;
4646; CHECKPTX71-LABEL: test(
4747; CHECKPTX71: {
48- ; CHECKPTX71-NEXT: .reg .pred %p<5>;
49- ; CHECKPTX71-NEXT: .reg .b16 %rs<18 >;
50- ; CHECKPTX71-NEXT: .reg .b32 %r<58 >;
51- ; CHECKPTX71-NEXT: .reg .f32 %f<12>;
48+ ; CHECKPTX71-NEXT: .reg .pred %p<5>;
49+ ; CHECKPTX71-NEXT: .reg .b16 %rs<34 >;
50+ ; CHECKPTX71-NEXT: .reg .b32 %r<4 >;
51+ ; CHECKPTX71-NEXT: .reg .f32 %f<12>;
5252; CHECKPTX71-EMPTY:
5353; CHECKPTX71-NEXT: // %bb.0:
54- ; CHECKPTX71-NEXT: ld.param.b16 %rs1, [test_param_3];
55- ; CHECKPTX71-NEXT: ld.param.u32 %r23, [test_param_2];
56- ; CHECKPTX71-NEXT: ld.param.u32 %r22, [test_param_1];
57- ; CHECKPTX71-NEXT: ld.param.u32 %r24, [test_param_0];
58- ; CHECKPTX71-NEXT: and.b32 %r1, %r24, -4;
59- ; CHECKPTX71-NEXT: and.b32 %r25, %r24, 3;
60- ; CHECKPTX71-NEXT: shl.b32 %r2, %r25, 3;
61- ; CHECKPTX71-NEXT: mov.b32 %r26, 65535;
62- ; CHECKPTX71-NEXT: shl.b32 %r27, %r26, %r2;
63- ; CHECKPTX71-NEXT: not.b32 %r3, %r27;
64- ; CHECKPTX71-NEXT: ld.u32 %r54, [%r1];
65- ; CHECKPTX71-NEXT: cvt.f32.bf16 %f2, %rs1;
66- ; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start
54+ ; CHECKPTX71-NEXT: ld.param.b16 %rs13, [test_param_3];
55+ ; CHECKPTX71-NEXT: ld.param.u32 %r3, [test_param_2];
56+ ; CHECKPTX71-NEXT: ld.param.u32 %r2, [test_param_1];
57+ ; CHECKPTX71-NEXT: ld.param.u32 %r1, [test_param_0];
58+ ; CHECKPTX71-NEXT: ld.b16 %rs30, [%r1];
59+ ; CHECKPTX71-NEXT: cvt.f32.bf16 %f1, %rs13;
60+ ; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start
6761; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
68- ; CHECKPTX71-NEXT: shr.u32 %r28, %r54, %r2;
69- ; CHECKPTX71-NEXT: cvt.u16.u32 %rs2, %r28;
70- ; CHECKPTX71-NEXT: cvt.f32.bf16 %f1, %rs2;
71- ; CHECKPTX71-NEXT: add.rn.f32 %f3, %f1, %f2;
72- ; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs4, %f3;
73- ; CHECKPTX71-NEXT: cvt.u32.u16 %r29, %rs4;
74- ; CHECKPTX71-NEXT: shl.b32 %r30, %r29, %r2;
75- ; CHECKPTX71-NEXT: and.b32 %r31, %r54, %r3;
76- ; CHECKPTX71-NEXT: or.b32 %r32, %r31, %r30;
77- ; CHECKPTX71-NEXT: atom.cas.b32 %r6, [%r1], %r54, %r32;
78- ; CHECKPTX71-NEXT: setp.ne.s32 %p1, %r6, %r54;
79- ; CHECKPTX71-NEXT: mov.u32 %r54, %r6;
80- ; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1;
81- ; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end
82- ; CHECKPTX71-NEXT: ld.u32 %r55, [%r1];
83- ; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start9
62+ ; CHECKPTX71-NEXT: cvt.f32.bf16 %f2, %rs30;
63+ ; CHECKPTX71-NEXT: add.rn.f32 %f3, %f2, %f1;
64+ ; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs14, %f3;
65+ ; CHECKPTX71-NEXT: atom.cas.b16 %rs17, [%r1], %rs30, %rs14;
66+ ; CHECKPTX71-NEXT: setp.ne.s16 %p1, %rs17, %rs30;
67+ ; CHECKPTX71-NEXT: mov.u16 %rs30, %rs17;
68+ ; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1;
69+ ; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end
70+ ; CHECKPTX71-NEXT: ld.b16 %rs31, [%r1];
71+ ; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start2
8472; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
85- ; CHECKPTX71-NEXT: shr.u32 %r33, %r55, %r2;
86- ; CHECKPTX71-NEXT: cvt.u16.u32 %rs6, %r33;
87- ; CHECKPTX71-NEXT: cvt.f32.bf16 %f4, %rs6;
88- ; CHECKPTX71-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
89- ; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs8, %f5;
90- ; CHECKPTX71-NEXT: cvt.u32.u16 %r34, %rs8;
91- ; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r2;
92- ; CHECKPTX71-NEXT: and.b32 %r36, %r55, %r3;
93- ; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35;
94- ; CHECKPTX71-NEXT: atom.cas.b32 %r9, [%r1], %r55, %r37;
95- ; CHECKPTX71-NEXT: setp.ne.s32 %p2, %r9, %r55;
96- ; CHECKPTX71-NEXT: mov.u32 %r55, %r9;
97- ; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3;
98- ; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end8
99- ; CHECKPTX71-NEXT: and.b32 %r10, %r22, -4;
100- ; CHECKPTX71-NEXT: shl.b32 %r38, %r22, 3;
101- ; CHECKPTX71-NEXT: and.b32 %r11, %r38, 24;
102- ; CHECKPTX71-NEXT: shl.b32 %r40, %r26, %r11;
103- ; CHECKPTX71-NEXT: not.b32 %r12, %r40;
104- ; CHECKPTX71-NEXT: ld.global.u32 %r56, [%r10];
105- ; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start27
73+ ; CHECKPTX71-NEXT: cvt.f32.bf16 %f4, %rs31;
74+ ; CHECKPTX71-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
75+ ; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs18, %f5;
76+ ; CHECKPTX71-NEXT: atom.cas.b16 %rs21, [%r1], %rs31, %rs18;
77+ ; CHECKPTX71-NEXT: setp.ne.s16 %p2, %rs21, %rs31;
78+ ; CHECKPTX71-NEXT: mov.u16 %rs31, %rs21;
79+ ; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3;
80+ ; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end1
81+ ; CHECKPTX71-NEXT: ld.global.b16 %rs32, [%r2];
82+ ; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start8
10683; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
107- ; CHECKPTX71-NEXT: shr.u32 %r41, %r56, %r11;
108- ; CHECKPTX71-NEXT: cvt.u16.u32 %rs10, %r41;
109- ; CHECKPTX71-NEXT: cvt.f32.bf16 %f6, %rs10;
110- ; CHECKPTX71-NEXT: add.rn.f32 %f8, %f6, %f2;
111- ; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs12, %f8;
112- ; CHECKPTX71-NEXT: cvt.u32.u16 %r42, %rs12;
113- ; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11;
114- ; CHECKPTX71-NEXT: and.b32 %r44, %r56, %r12;
115- ; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43;
116- ; CHECKPTX71-NEXT: atom.global.cas.b32 %r15, [%r10], %r56, %r45;
117- ; CHECKPTX71-NEXT: setp.ne.s32 %p3, %r15, %r56;
118- ; CHECKPTX71-NEXT: mov.u32 %r56, %r15;
119- ; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5;
120- ; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end26
121- ; CHECKPTX71-NEXT: and.b32 %r16, %r23, -4;
122- ; CHECKPTX71-NEXT: shl.b32 %r46, %r23, 3;
123- ; CHECKPTX71-NEXT: and.b32 %r17, %r46, 24;
124- ; CHECKPTX71-NEXT: shl.b32 %r48, %r26, %r17;
125- ; CHECKPTX71-NEXT: not.b32 %r18, %r48;
126- ; CHECKPTX71-NEXT: ld.shared.u32 %r57, [%r16];
127- ; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start45
84+ ; CHECKPTX71-NEXT: cvt.f32.bf16 %f7, %rs32;
85+ ; CHECKPTX71-NEXT: add.rn.f32 %f8, %f7, %f1;
86+ ; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs22, %f8;
87+ ; CHECKPTX71-NEXT: atom.global.cas.b16 %rs25, [%r2], %rs32, %rs22;
88+ ; CHECKPTX71-NEXT: setp.ne.s16 %p3, %rs25, %rs32;
89+ ; CHECKPTX71-NEXT: mov.u16 %rs32, %rs25;
90+ ; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5;
91+ ; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end7
92+ ; CHECKPTX71-NEXT: ld.shared.b16 %rs33, [%r3];
93+ ; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start14
12894; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
129- ; CHECKPTX71-NEXT: shr.u32 %r49, %r57, %r17;
130- ; CHECKPTX71-NEXT: cvt.u16.u32 %rs14, %r49;
131- ; CHECKPTX71-NEXT: cvt.f32.bf16 %f9, %rs14;
132- ; CHECKPTX71-NEXT: add.rn.f32 %f11, %f9, %f2;
133- ; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs16, %f11;
134- ; CHECKPTX71-NEXT: cvt.u32.u16 %r50, %rs16;
135- ; CHECKPTX71-NEXT: shl.b32 %r51, %r50, %r17;
136- ; CHECKPTX71-NEXT: and.b32 %r52, %r57, %r18;
137- ; CHECKPTX71-NEXT: or.b32 %r53, %r52, %r51;
138- ; CHECKPTX71-NEXT: atom.shared.cas.b32 %r21, [%r16], %r57, %r53;
139- ; CHECKPTX71-NEXT: setp.ne.s32 %p4, %r21, %r57;
140- ; CHECKPTX71-NEXT: mov.u32 %r57, %r21;
141- ; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7;
142- ; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end44
143- ; CHECKPTX71-NEXT: ret;
95+ ; CHECKPTX71-NEXT: cvt.f32.bf16 %f10, %rs33;
96+ ; CHECKPTX71-NEXT: add.rn.f32 %f11, %f10, %f1;
97+ ; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs26, %f11;
98+ ; CHECKPTX71-NEXT: atom.shared.cas.b16 %rs29, [%r3], %rs33, %rs26;
99+ ; CHECKPTX71-NEXT: setp.ne.s16 %p4, %rs29, %rs33;
100+ ; CHECKPTX71-NEXT: mov.u16 %rs33, %rs29;
101+ ; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7;
102+ ; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end13
103+ ; CHECKPTX71-NEXT: ret;
144104 %r1 = atomicrmw fadd ptr %dp0 , bfloat %val seq_cst
145105 %r2 = atomicrmw fadd ptr %dp0 , bfloat 1 .0 seq_cst
146106 %r3 = atomicrmw fadd ptr addrspace (1 ) %dp1 , bfloat %val seq_cst
0 commit comments