Skip to content

Commit 5a21128

Browse files
authored
AMDGPU: Relax legal register operand constraint (#157989)
Find a common subclass instead of directly checking for a subclass relationship. This fixes the folding logic for unaligned register defs into aligned use contexts; e.g., folding a vreg_64 def into an av_64_align2 use should be able to find the common subclass vreg_64_align2. This avoids regressions in future patches. Checking the subclass was also redundant on the subregister path; getMatchingSuperRegClass is sufficient.
1 parent 28743fa commit 5a21128

File tree

4 files changed

+51
-52
lines changed

4 files changed

+51
-52
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6125,12 +6125,10 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
61256125
const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
61266126
if (!SuperRC)
61276127
return false;
6128-
6129-
DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
6130-
if (!DRC)
6131-
return false;
6128+
return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
61326129
}
6133-
return RC->hasSuperClassEq(DRC);
6130+
6131+
return RI.getCommonSubClass(DRC, RC) != nullptr;
61346132
}
61356133

61366134
bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,

llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,33 +7,33 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
77
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
88
; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
99
; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0
10-
; GFX906-NEXT: v_mov_b32_e32 v4, 8
10+
; GFX906-NEXT: v_mov_b32_e32 v3, 8
1111
; GFX906-NEXT: v_mov_b32_e32 v5, 16
1212
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
13-
; GFX906-NEXT: global_load_dword v3, v2, s[0:1]
13+
; GFX906-NEXT: global_load_dword v4, v2, s[0:1]
1414
; GFX906-NEXT: v_mov_b32_e32 v1, 0xff
1515
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
1616
; GFX906-NEXT: s_waitcnt vmcnt(0)
17-
; GFX906-NEXT: v_and_b32_e32 v6, 0xff, v3
18-
; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
19-
; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
20-
; GFX906-NEXT: v_or3_b32 v3, v6, v7, v3
17+
; GFX906-NEXT: v_and_b32_e32 v6, 0xff, v4
18+
; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
19+
; GFX906-NEXT: v_lshlrev_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
20+
; GFX906-NEXT: v_or3_b32 v4, v6, v7, v4
2121
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
2222
; GFX906-NEXT: s_cbranch_execz .LBB0_2
2323
; GFX906-NEXT: ; %bb.1: ; %bb.1
2424
; GFX906-NEXT: global_load_dword v0, v2, s[2:3]
2525
; GFX906-NEXT: s_waitcnt vmcnt(0)
2626
; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v0
27-
; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
27+
; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
2828
; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
29-
; GFX906-NEXT: v_or3_b32 v3, v2, v3, v0
29+
; GFX906-NEXT: v_or3_b32 v4, v2, v3, v0
3030
; GFX906-NEXT: .LBB0_2: ; %bb.2
3131
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
32-
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v3
32+
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v4
3333
; GFX906-NEXT: v_and_b32_e32 v0, 0xff, v0
3434
; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
35-
; GFX906-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
36-
; GFX906-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
35+
; GFX906-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
36+
; GFX906-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3737
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
3838
; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1
3939
; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0

llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll

Lines changed: 33 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -969,37 +969,38 @@ define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 {
969969
; GFX950: ; %bb.0:
970970
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
971971
; GFX950-NEXT: s_mov_b64 s[0:1], 0x50
972-
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
972+
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1]
973973
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
974-
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
974+
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
975975
; GFX950-NEXT: ;;#ASMSTART
976-
; GFX950-NEXT: ; def v[0:1]
976+
; GFX950-NEXT: ; def v[4:5]
977977
; GFX950-NEXT: ;;#ASMEND
978-
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
978+
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
979979
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
980980
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
981981
; GFX950-NEXT: s_cbranch_execz .LBB14_2
982982
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
983983
; GFX950-NEXT: buffer_wbl2 sc0 sc1
984-
; GFX950-NEXT: flat_atomic_swap_x2 v[2:3], v[4:5], v[0:1] sc0 sc1
984+
; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1
985985
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
986986
; GFX950-NEXT: buffer_inv sc0 sc1
987+
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
987988
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
988989
; GFX950-NEXT: .LBB14_2: ; %Flow
989990
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
990991
; GFX950-NEXT: s_cbranch_execz .LBB14_4
991992
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
992-
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
993+
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
993994
; GFX950-NEXT: s_nop 1
994-
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
995-
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
995+
; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
996+
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off
996997
; GFX950-NEXT: s_nop 0
997-
; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
998+
; GFX950-NEXT: scratch_store_dwordx2 v2, v[4:5], off
998999
; GFX950-NEXT: .LBB14_4: ; %atomicrmw.phi
9991000
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
10001001
; GFX950-NEXT: s_waitcnt vmcnt(1)
10011002
; GFX950-NEXT: ;;#ASMSTART
1002-
; GFX950-NEXT: ; use v[2:3]
1003+
; GFX950-NEXT: ; use v[0:1]
10031004
; GFX950-NEXT: ;;#ASMEND
10041005
; GFX950-NEXT: s_waitcnt vmcnt(0)
10051006
; GFX950-NEXT: s_setpc_b64 s[30:31]
@@ -1058,37 +1059,38 @@ define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 {
10581059
; GFX950: ; %bb.0:
10591060
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10601061
; GFX950-NEXT: s_mov_b64 s[0:1], 0x50
1061-
; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
1062+
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1]
10621063
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
1063-
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
1064+
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
10641065
; GFX950-NEXT: ;;#ASMSTART
1065-
; GFX950-NEXT: ; def v[0:1]
1066+
; GFX950-NEXT: ; def v[4:5]
10661067
; GFX950-NEXT: ;;#ASMEND
1067-
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
1068+
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
10681069
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
10691070
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
10701071
; GFX950-NEXT: s_cbranch_execz .LBB15_2
10711072
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
10721073
; GFX950-NEXT: buffer_wbl2 sc0 sc1
1073-
; GFX950-NEXT: flat_atomic_swap_x2 v[2:3], v[4:5], v[0:1] sc0 sc1
1074+
; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1
10741075
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10751076
; GFX950-NEXT: buffer_inv sc0 sc1
1077+
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
10761078
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
10771079
; GFX950-NEXT: .LBB15_2: ; %Flow
10781080
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
10791081
; GFX950-NEXT: s_cbranch_execz .LBB15_4
10801082
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
1081-
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
1083+
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
10821084
; GFX950-NEXT: s_nop 1
1083-
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
1084-
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
1085+
; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
1086+
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off
10851087
; GFX950-NEXT: s_nop 0
1086-
; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
1088+
; GFX950-NEXT: scratch_store_dwordx2 v2, v[4:5], off
10871089
; GFX950-NEXT: .LBB15_4: ; %atomicrmw.phi
10881090
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
10891091
; GFX950-NEXT: s_waitcnt vmcnt(1)
10901092
; GFX950-NEXT: ;;#ASMSTART
1091-
; GFX950-NEXT: ; use v[2:3]
1093+
; GFX950-NEXT: ; use v[0:1]
10921094
; GFX950-NEXT: ;;#ASMEND
10931095
; GFX950-NEXT: s_waitcnt vmcnt(0)
10941096
; GFX950-NEXT: s_setpc_b64 s[30:31]
@@ -1149,34 +1151,35 @@ define void @flat_atomic_xchg_i64_ret_av_a(ptr %ptr) #0 {
11491151
; GFX950: ; %bb.0:
11501152
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11511153
; GFX950-NEXT: s_mov_b64 s[0:1], 0x50
1152-
; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1]
1154+
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
11531155
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
1154-
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
1156+
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
11551157
; GFX950-NEXT: ;;#ASMSTART
1156-
; GFX950-NEXT: ; def v[0:1]
1158+
; GFX950-NEXT: ; def v[2:3]
11571159
; GFX950-NEXT: ;;#ASMEND
11581160
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
11591161
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
11601162
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
11611163
; GFX950-NEXT: s_cbranch_execz .LBB16_2
11621164
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
11631165
; GFX950-NEXT: buffer_wbl2 sc0 sc1
1164-
; GFX950-NEXT: flat_atomic_swap_x2 v[2:3], v[2:3], v[0:1] sc0 sc1
1166+
; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 sc1
11651167
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11661168
; GFX950-NEXT: buffer_inv sc0 sc1
1167-
; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
1168-
; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
11691169
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
1170+
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
1171+
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
1172+
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
11701173
; GFX950-NEXT: .LBB16_2: ; %Flow
11711174
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
11721175
; GFX950-NEXT: s_cbranch_execz .LBB16_4
11731176
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
1174-
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
1177+
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
11751178
; GFX950-NEXT: s_nop 1
1176-
; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
1177-
; GFX950-NEXT: scratch_load_dwordx2 a[0:1], v2, off
1179+
; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
1180+
; GFX950-NEXT: scratch_load_dwordx2 a[0:1], v0, off
11781181
; GFX950-NEXT: s_nop 0
1179-
; GFX950-NEXT: scratch_store_dwordx2 v2, v[0:1], off
1182+
; GFX950-NEXT: scratch_store_dwordx2 v0, v[2:3], off
11801183
; GFX950-NEXT: .LBB16_4: ; %atomicrmw.phi
11811184
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
11821185
; GFX950-NEXT: s_waitcnt vmcnt(1)

llvm/test/CodeGen/AMDGPU/fold-sgpr-copy.mir

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,8 @@ body: |
1717
...
1818

1919
# GCN-LABEL: name: fold_sgpr_to_sgpr_copy_subreg
20-
# GCN: %0:sreg_64 = IMPLICIT_DEF
21-
# GCN-NEXT: %2:sgpr_32 = COPY %0.sub0
22-
# GCN-NEXT: S_STORE_DWORD_IMM %2, undef $sgpr10_sgpr11, 0, 0
20+
# GCN: %0:sreg_64_xexec = IMPLICIT_DEF
21+
# GCN-NEXT: S_STORE_DWORD_IMM %0.sub0, undef $sgpr10_sgpr11, 0, 0
2322

2423
name: fold_sgpr_to_sgpr_copy_subreg
2524
body: |
@@ -32,9 +31,8 @@ body: |
3231
...
3332

3433
# GCN-LABEL: name: fold_sgpr_to_sgpr_copy_subreg2
35-
# GCN: %0:sreg_64 = IMPLICIT_DEF
36-
# GCN-NEXT: %3:sreg_32_xm0_xexec = COPY %0.sub0
37-
# GCN-NEXT: S_STORE_DWORD_IMM %3, undef $sgpr10_sgpr11, 0, 0
34+
# GCN: %0:sreg_64_xexec = IMPLICIT_DEF
35+
# GCN-NEXT: S_STORE_DWORD_IMM %0.sub0, undef $sgpr10_sgpr11, 0, 0
3836

3937
name: fold_sgpr_to_sgpr_copy_subreg2
4038
body: |

0 commit comments

Comments
 (0)