Skip to content

Commit a2dc4e0

Browse files
authored
[AMDGPU] Enable multi-group xnack replay in hardware (GFX1250) (#169016)
This patch enables multi-group XNACK replay mode by setting the REPLAY_MODE bit (bit 25) of the hardware MODE register in the prologue of every entry function (kernels and shaders alike). This aligns the hardware behavior with the compiler's existing multi-group s_wait_xcnt insertion logic.
1 parent 49995b2 commit a2dc4e0

File tree

149 files changed

+7121
-717
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

149 files changed

+7121
-717
lines changed

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -772,6 +772,17 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
772772
PreloadedScratchRsrcReg,
773773
ScratchRsrcReg, ScratchWaveOffsetReg);
774774
}
775+
776+
if (ST.hasWaitXCnt()) {
777+
// Set REPLAY_MODE (bit 25) in MODE register to enable multi-group XNACK
778+
// replay. This aligns hardware behavior with the compiler's s_wait_xcnt
779+
// insertion logic, which assumes multi-group mode by default.
780+
unsigned RegEncoding =
781+
AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 25, 1);
782+
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
783+
.addImm(1)
784+
.addImm(RegEncoding);
785+
}
775786
}
776787

777788
// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`

llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll

Lines changed: 48 additions & 0 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll

Lines changed: 130 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ define amdgpu_cs i16 @abs_sgpr_i16(i16 inreg %arg) {
3434
;
3535
; GFX1250-LABEL: abs_sgpr_i16:
3636
; GFX1250: ; %bb.0:
37+
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
3738
; GFX1250-NEXT: s_sext_i32_i16 s0, s0
3839
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3940
; GFX1250-NEXT: s_abs_i32 s0, s0
@@ -43,10 +44,26 @@ define amdgpu_cs i16 @abs_sgpr_i16(i16 inreg %arg) {
4344
}
4445

4546
define amdgpu_cs i32 @abs_sgpr_i32(i32 inreg %arg) {
46-
; GFX-LABEL: abs_sgpr_i32:
47-
; GFX: ; %bb.0:
48-
; GFX-NEXT: s_abs_i32 s0, s0
49-
; GFX-NEXT: ; return to shader part epilog
47+
; GFX6-LABEL: abs_sgpr_i32:
48+
; GFX6: ; %bb.0:
49+
; GFX6-NEXT: s_abs_i32 s0, s0
50+
; GFX6-NEXT: ; return to shader part epilog
51+
;
52+
; GFX8-LABEL: abs_sgpr_i32:
53+
; GFX8: ; %bb.0:
54+
; GFX8-NEXT: s_abs_i32 s0, s0
55+
; GFX8-NEXT: ; return to shader part epilog
56+
;
57+
; GFX10-LABEL: abs_sgpr_i32:
58+
; GFX10: ; %bb.0:
59+
; GFX10-NEXT: s_abs_i32 s0, s0
60+
; GFX10-NEXT: ; return to shader part epilog
61+
;
62+
; GFX1250-LABEL: abs_sgpr_i32:
63+
; GFX1250: ; %bb.0:
64+
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
65+
; GFX1250-NEXT: s_abs_i32 s0, s0
66+
; GFX1250-NEXT: ; return to shader part epilog
5067
%res = call i32 @llvm.abs.i32(i32 %arg, i1 false)
5168
ret i32 %res
5269
}
@@ -81,6 +98,7 @@ define amdgpu_cs i64 @abs_sgpr_i64(i64 inreg %arg) {
8198
;
8299
; GFX1250-LABEL: abs_sgpr_i64:
83100
; GFX1250: ; %bb.0:
101+
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
84102
; GFX1250-NEXT: s_ashr_i32 s2, s1, 31
85103
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
86104
; GFX1250-NEXT: s_mov_b32 s3, s2
@@ -93,13 +111,38 @@ define amdgpu_cs i64 @abs_sgpr_i64(i64 inreg %arg) {
93111
}
94112

95113
define amdgpu_cs <4 x i32> @abs_sgpr_v4i32(<4 x i32> inreg %arg) {
96-
; GFX-LABEL: abs_sgpr_v4i32:
97-
; GFX: ; %bb.0:
98-
; GFX-NEXT: s_abs_i32 s0, s0
99-
; GFX-NEXT: s_abs_i32 s1, s1
100-
; GFX-NEXT: s_abs_i32 s2, s2
101-
; GFX-NEXT: s_abs_i32 s3, s3
102-
; GFX-NEXT: ; return to shader part epilog
114+
; GFX6-LABEL: abs_sgpr_v4i32:
115+
; GFX6: ; %bb.0:
116+
; GFX6-NEXT: s_abs_i32 s0, s0
117+
; GFX6-NEXT: s_abs_i32 s1, s1
118+
; GFX6-NEXT: s_abs_i32 s2, s2
119+
; GFX6-NEXT: s_abs_i32 s3, s3
120+
; GFX6-NEXT: ; return to shader part epilog
121+
;
122+
; GFX8-LABEL: abs_sgpr_v4i32:
123+
; GFX8: ; %bb.0:
124+
; GFX8-NEXT: s_abs_i32 s0, s0
125+
; GFX8-NEXT: s_abs_i32 s1, s1
126+
; GFX8-NEXT: s_abs_i32 s2, s2
127+
; GFX8-NEXT: s_abs_i32 s3, s3
128+
; GFX8-NEXT: ; return to shader part epilog
129+
;
130+
; GFX10-LABEL: abs_sgpr_v4i32:
131+
; GFX10: ; %bb.0:
132+
; GFX10-NEXT: s_abs_i32 s0, s0
133+
; GFX10-NEXT: s_abs_i32 s1, s1
134+
; GFX10-NEXT: s_abs_i32 s2, s2
135+
; GFX10-NEXT: s_abs_i32 s3, s3
136+
; GFX10-NEXT: ; return to shader part epilog
137+
;
138+
; GFX1250-LABEL: abs_sgpr_v4i32:
139+
; GFX1250: ; %bb.0:
140+
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
141+
; GFX1250-NEXT: s_abs_i32 s0, s0
142+
; GFX1250-NEXT: s_abs_i32 s1, s1
143+
; GFX1250-NEXT: s_abs_i32 s2, s2
144+
; GFX1250-NEXT: s_abs_i32 s3, s3
145+
; GFX1250-NEXT: ; return to shader part epilog
103146
%res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %arg, i1 false)
104147
ret <4 x i32> %res
105148
}
@@ -278,13 +321,38 @@ define <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
278321
}
279322

280323
define amdgpu_cs <2 x i8> @abs_sgpr_v2i8(<2 x i8> inreg %arg) {
281-
; GFX-LABEL: abs_sgpr_v2i8:
282-
; GFX: ; %bb.0:
283-
; GFX-NEXT: s_sext_i32_i8 s0, s0
284-
; GFX-NEXT: s_sext_i32_i8 s1, s1
285-
; GFX-NEXT: s_abs_i32 s0, s0
286-
; GFX-NEXT: s_abs_i32 s1, s1
287-
; GFX-NEXT: ; return to shader part epilog
324+
; GFX6-LABEL: abs_sgpr_v2i8:
325+
; GFX6: ; %bb.0:
326+
; GFX6-NEXT: s_sext_i32_i8 s0, s0
327+
; GFX6-NEXT: s_sext_i32_i8 s1, s1
328+
; GFX6-NEXT: s_abs_i32 s0, s0
329+
; GFX6-NEXT: s_abs_i32 s1, s1
330+
; GFX6-NEXT: ; return to shader part epilog
331+
;
332+
; GFX8-LABEL: abs_sgpr_v2i8:
333+
; GFX8: ; %bb.0:
334+
; GFX8-NEXT: s_sext_i32_i8 s0, s0
335+
; GFX8-NEXT: s_sext_i32_i8 s1, s1
336+
; GFX8-NEXT: s_abs_i32 s0, s0
337+
; GFX8-NEXT: s_abs_i32 s1, s1
338+
; GFX8-NEXT: ; return to shader part epilog
339+
;
340+
; GFX10-LABEL: abs_sgpr_v2i8:
341+
; GFX10: ; %bb.0:
342+
; GFX10-NEXT: s_sext_i32_i8 s0, s0
343+
; GFX10-NEXT: s_sext_i32_i8 s1, s1
344+
; GFX10-NEXT: s_abs_i32 s0, s0
345+
; GFX10-NEXT: s_abs_i32 s1, s1
346+
; GFX10-NEXT: ; return to shader part epilog
347+
;
348+
; GFX1250-LABEL: abs_sgpr_v2i8:
349+
; GFX1250: ; %bb.0:
350+
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
351+
; GFX1250-NEXT: s_sext_i32_i8 s0, s0
352+
; GFX1250-NEXT: s_sext_i32_i8 s1, s1
353+
; GFX1250-NEXT: s_abs_i32 s0, s0
354+
; GFX1250-NEXT: s_abs_i32 s1, s1
355+
; GFX1250-NEXT: ; return to shader part epilog
288356
%res = call <2 x i8> @llvm.abs.v2i8(<2 x i8> %arg, i1 false)
289357
ret <2 x i8> %res
290358
}
@@ -340,15 +408,46 @@ define <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
340408
}
341409

342410
define amdgpu_cs <3 x i8> @abs_sgpr_v3i8(<3 x i8> inreg %arg) {
343-
; GFX-LABEL: abs_sgpr_v3i8:
344-
; GFX: ; %bb.0:
345-
; GFX-NEXT: s_sext_i32_i8 s0, s0
346-
; GFX-NEXT: s_sext_i32_i8 s1, s1
347-
; GFX-NEXT: s_sext_i32_i8 s2, s2
348-
; GFX-NEXT: s_abs_i32 s0, s0
349-
; GFX-NEXT: s_abs_i32 s1, s1
350-
; GFX-NEXT: s_abs_i32 s2, s2
351-
; GFX-NEXT: ; return to shader part epilog
411+
; GFX6-LABEL: abs_sgpr_v3i8:
412+
; GFX6: ; %bb.0:
413+
; GFX6-NEXT: s_sext_i32_i8 s0, s0
414+
; GFX6-NEXT: s_sext_i32_i8 s1, s1
415+
; GFX6-NEXT: s_sext_i32_i8 s2, s2
416+
; GFX6-NEXT: s_abs_i32 s0, s0
417+
; GFX6-NEXT: s_abs_i32 s1, s1
418+
; GFX6-NEXT: s_abs_i32 s2, s2
419+
; GFX6-NEXT: ; return to shader part epilog
420+
;
421+
; GFX8-LABEL: abs_sgpr_v3i8:
422+
; GFX8: ; %bb.0:
423+
; GFX8-NEXT: s_sext_i32_i8 s0, s0
424+
; GFX8-NEXT: s_sext_i32_i8 s1, s1
425+
; GFX8-NEXT: s_sext_i32_i8 s2, s2
426+
; GFX8-NEXT: s_abs_i32 s0, s0
427+
; GFX8-NEXT: s_abs_i32 s1, s1
428+
; GFX8-NEXT: s_abs_i32 s2, s2
429+
; GFX8-NEXT: ; return to shader part epilog
430+
;
431+
; GFX10-LABEL: abs_sgpr_v3i8:
432+
; GFX10: ; %bb.0:
433+
; GFX10-NEXT: s_sext_i32_i8 s0, s0
434+
; GFX10-NEXT: s_sext_i32_i8 s1, s1
435+
; GFX10-NEXT: s_sext_i32_i8 s2, s2
436+
; GFX10-NEXT: s_abs_i32 s0, s0
437+
; GFX10-NEXT: s_abs_i32 s1, s1
438+
; GFX10-NEXT: s_abs_i32 s2, s2
439+
; GFX10-NEXT: ; return to shader part epilog
440+
;
441+
; GFX1250-LABEL: abs_sgpr_v3i8:
442+
; GFX1250: ; %bb.0:
443+
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
444+
; GFX1250-NEXT: s_sext_i32_i8 s0, s0
445+
; GFX1250-NEXT: s_sext_i32_i8 s1, s1
446+
; GFX1250-NEXT: s_sext_i32_i8 s2, s2
447+
; GFX1250-NEXT: s_abs_i32 s0, s0
448+
; GFX1250-NEXT: s_abs_i32 s1, s1
449+
; GFX1250-NEXT: s_abs_i32 s2, s2
450+
; GFX1250-NEXT: ; return to shader part epilog
352451
%res = call <3 x i8> @llvm.abs.v3i8(<3 x i8> %arg, i1 false)
353452
ret <3 x i8> %res
354453
}
@@ -446,6 +545,7 @@ define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) {
446545
;
447546
; GFX1250-LABEL: abs_sgpr_v2i16:
448547
; GFX1250: ; %bb.0:
548+
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
449549
; GFX1250-NEXT: s_sext_i32_i16 s1, s0
450550
; GFX1250-NEXT: s_ashr_i32 s0, s0, 16
451551
; GFX1250-NEXT: s_abs_i32 s1, s1
@@ -536,6 +636,7 @@ define amdgpu_cs <3 x i16> @abs_sgpr_v3i16(<3 x i16> inreg %arg) {
536636
;
537637
; GFX1250-LABEL: abs_sgpr_v3i16:
538638
; GFX1250: ; %bb.0:
639+
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
539640
; GFX1250-NEXT: s_sext_i32_i16 s2, s0
540641
; GFX1250-NEXT: s_ashr_i32 s0, s0, 16
541642
; GFX1250-NEXT: s_abs_i32 s2, s2
@@ -598,3 +699,5 @@ define <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
598699
%res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %arg, i1 false)
599700
ret <3 x i16> %res
600701
}
702+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
703+
; GFX: {{.*}}

llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -930,6 +930,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg
930930
;
931931
; GFX1250-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
932932
; GFX1250-UNALIGNED: ; %bb.0:
933+
; GFX1250-UNALIGNED-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
933934
; GFX1250-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0
934935
; GFX1250-UNALIGNED-NEXT: global_load_b96 v[0:2], v0, s[0:1]
935936
; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt 0x0
@@ -940,6 +941,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg
940941
;
941942
; GFX1250-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1:
942943
; GFX1250-NOUNALIGNED: ; %bb.0:
944+
; GFX1250-NOUNALIGNED-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
943945
; GFX1250-NOUNALIGNED-NEXT: s_clause 0xb
944946
; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s2, s[0:1], 0x1
945947
; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s3, s[0:1], 0x3
@@ -1208,6 +1210,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg
12081210
;
12091211
; GFX1250-UNALIGNED-LABEL: s_load_constant_v3i32_align2:
12101212
; GFX1250-UNALIGNED: ; %bb.0:
1213+
; GFX1250-UNALIGNED-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
12111214
; GFX1250-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0
12121215
; GFX1250-UNALIGNED-NEXT: global_load_b96 v[0:2], v0, s[0:1]
12131216
; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt 0x0
@@ -1218,6 +1221,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg
12181221
;
12191222
; GFX1250-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2:
12201223
; GFX1250-NOUNALIGNED: ; %bb.0:
1224+
; GFX1250-NOUNALIGNED-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
12211225
; GFX1250-NOUNALIGNED-NEXT: s_clause 0x5
12221226
; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s2, s[0:1], 0x2
12231227
; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s3, s[0:1], 0x6
@@ -1362,6 +1366,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg
13621366
;
13631367
; GFX1250-LABEL: s_load_constant_v3i32_align4:
13641368
; GFX1250: ; %bb.0:
1369+
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
13651370
; GFX1250-NEXT: s_mov_b32 s4, s0
13661371
; GFX1250-NEXT: s_mov_b32 s5, s1
13671372
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
@@ -1413,6 +1418,7 @@ define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) {
14131418
;
14141419
; GFX1250-LABEL: s_load_constant_i96_align8:
14151420
; GFX1250: ; %bb.0:
1421+
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
14161422
; GFX1250-NEXT: s_mov_b32 s4, s0
14171423
; GFX1250-NEXT: s_mov_b32 s5, s1
14181424
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
@@ -1464,6 +1470,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg
14641470
;
14651471
; GFX1250-LABEL: s_load_constant_v3i32_align8:
14661472
; GFX1250: ; %bb.0:
1473+
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
14671474
; GFX1250-NEXT: s_mov_b32 s4, s0
14681475
; GFX1250-NEXT: s_mov_b32 s5, s1
14691476
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
@@ -1515,6 +1522,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg
15151522
;
15161523
; GFX1250-LABEL: s_load_constant_v6i16_align8:
15171524
; GFX1250: ; %bb.0:
1525+
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
15181526
; GFX1250-NEXT: s_mov_b32 s4, s0
15191527
; GFX1250-NEXT: s_mov_b32 s5, s1
15201528
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
@@ -1593,6 +1601,7 @@ define amdgpu_ps <12 x i8> @s_load_constant_v12i8_align8(ptr addrspace(4) inreg
15931601
;
15941602
; GFX1250-LABEL: s_load_constant_v12i8_align8:
15951603
; GFX1250: ; %bb.0:
1604+
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
15961605
; GFX1250-NEXT: s_mov_b32 s4, s0
15971606
; GFX1250-NEXT: s_mov_b32 s5, s1
15981607
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
@@ -1670,11 +1679,24 @@ define amdgpu_ps <12 x i8> @s_load_constant_v12i8_align8(ptr addrspace(4) inreg
16701679
}
16711680

16721681
define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align16(ptr addrspace(4) inreg %ptr) {
1673-
; GFX12-LABEL: s_load_constant_v3i32_align16:
1674-
; GFX12: ; %bb.0:
1675-
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
1676-
; GFX12-NEXT: s_wait_kmcnt 0x0
1677-
; GFX12-NEXT: ; return to shader part epilog
1682+
; GFX12-UNALIGNED-LABEL: s_load_constant_v3i32_align16:
1683+
; GFX12-UNALIGNED: ; %bb.0:
1684+
; GFX12-UNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
1685+
; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
1686+
; GFX12-UNALIGNED-NEXT: ; return to shader part epilog
1687+
;
1688+
; GFX12-NOUNALIGNED-LABEL: s_load_constant_v3i32_align16:
1689+
; GFX12-NOUNALIGNED: ; %bb.0:
1690+
; GFX12-NOUNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
1691+
; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
1692+
; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog
1693+
;
1694+
; GFX1250-LABEL: s_load_constant_v3i32_align16:
1695+
; GFX1250: ; %bb.0:
1696+
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
1697+
; GFX1250-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
1698+
; GFX1250-NEXT: s_wait_kmcnt 0x0
1699+
; GFX1250-NEXT: ; return to shader part epilog
16781700
;
16791701
; GCN-LABEL: s_load_constant_v3i32_align16:
16801702
; GCN: ; %bb.0:
@@ -1684,3 +1706,5 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align16(ptr addrspace(4) inreg
16841706
%load = load <3 x i32>, ptr addrspace(4) %ptr, align 16
16851707
ret <3 x i32> %load
16861708
}
1709+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
1710+
; GFX12: {{.*}}

llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs-i64.ll

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ define i64 @test_abs_i64(i64 %a) {
132132
define amdgpu_ps i64 @test_umin_i64_s(i64 inreg %a, i64 inreg %b) {
133133
; CHECK-LABEL: test_umin_i64_s:
134134
; CHECK: ; %bb.0:
135+
; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
135136
; CHECK-NEXT: v_min_u64 v[0:1], s[0:1], s[2:3]
136137
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
137138
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -144,6 +145,7 @@ define amdgpu_ps i64 @test_umin_i64_s(i64 inreg %a, i64 inreg %b) {
144145
define amdgpu_ps i64 @test_umax_i64_s(i64 inreg %a, i64 inreg %b) {
145146
; CHECK-LABEL: test_umax_i64_s:
146147
; CHECK: ; %bb.0:
148+
; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
147149
; CHECK-NEXT: v_max_u64 v[0:1], s[0:1], s[2:3]
148150
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
149151
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -156,6 +158,7 @@ define amdgpu_ps i64 @test_umax_i64_s(i64 inreg %a, i64 inreg %b) {
156158
define amdgpu_ps i64 @test_smin_i64_s(i64 inreg %a, i64 inreg %b) {
157159
; CHECK-LABEL: test_smin_i64_s:
158160
; CHECK: ; %bb.0:
161+
; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
159162
; CHECK-NEXT: v_min_i64 v[0:1], s[0:1], s[2:3]
160163
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
161164
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -168,6 +171,7 @@ define amdgpu_ps i64 @test_smin_i64_s(i64 inreg %a, i64 inreg %b) {
168171
define amdgpu_ps i64 @test_smax_i64_s(i64 inreg %a, i64 inreg %b) {
169172
; CHECK-LABEL: test_smax_i64_s:
170173
; CHECK: ; %bb.0:
174+
; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
171175
; CHECK-NEXT: v_max_i64 v[0:1], s[0:1], s[2:3]
172176
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
173177
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -180,6 +184,7 @@ define amdgpu_ps i64 @test_smax_i64_s(i64 inreg %a, i64 inreg %b) {
180184
define amdgpu_ps i64 @test_abs_i64_s(i64 inreg %a) {
181185
; CHECK-LABEL: test_abs_i64_s:
182186
; CHECK: ; %bb.0:
187+
; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
183188
; CHECK-NEXT: s_ashr_i32 s2, s1, 31
184189
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
185190
; CHECK-NEXT: s_mov_b32 s3, s2

0 commit comments

Comments
 (0)