Skip to content

Commit c6ee2d9

Browse files
authored
AMDGPU: Constrain readfirstlane operand to vgpr_32 (#168001)
1 parent 71eaf14 commit c6ee2d9

File tree

3 files changed

+158
-3
lines changed

3 files changed

+158
-3
lines changed

llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1122,9 +1122,20 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
11221122
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
11231123
.addReg(VReg32);
11241124
} else if (SrcSize == 32) {
1125-
auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
1126-
TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
1127-
MIB.addReg(SrcReg, 0, SubReg);
1125+
const MCInstrDesc &ReadFirstLaneDesc =
1126+
TII->get(AMDGPU::V_READFIRSTLANE_B32);
1127+
const TargetRegisterClass *OpRC = TII->getRegClass(ReadFirstLaneDesc, 1);
1128+
BuildMI(*MBB, MI, MI->getDebugLoc(), ReadFirstLaneDesc, DstReg)
1129+
.addReg(SrcReg, 0, SubReg);
1130+
1131+
const TargetRegisterClass *ConstrainRC =
1132+
SubReg == AMDGPU::NoSubRegister
1133+
? OpRC
1134+
: TRI->getMatchingSuperRegClass(MRI->getRegClass(SrcReg), OpRC,
1135+
SubReg);
1136+
1137+
if (!MRI->constrainRegClass(SrcReg, ConstrainRC))
1138+
llvm_unreachable("failed to constrain register");
11281139
} else {
11291140
auto Result = BuildMI(*MBB, MI, MI->getDebugLoc(),
11301141
TII->get(AMDGPU::REG_SEQUENCE), DstReg);
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck %s
3+
4+
; SIFixSGPRCopies will insert a readfirstlane from an AV source
5+
; register, which needs to be constrained to a VGPR class to satisfy the
6+
; operand constraint.
7+
8+
define amdgpu_kernel void @constrain_readfirstlane_av(i64 %arg, ptr addrspace(1) %ptr) {
9+
; CHECK-LABEL: constrain_readfirstlane_av:
10+
; CHECK: ; %bb.0: ; %bb
11+
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
12+
; CHECK-NEXT: v_mov_b32_e32 v0, 0
13+
; CHECK-NEXT: s_mov_b32 s5, 0
14+
; CHECK-NEXT: s_mov_b64 s[6:7], 0
15+
; CHECK-NEXT: s_and_b64 vcc, exec, -1
16+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
17+
; CHECK-NEXT: global_load_ushort v1, v0, s[2:3] glc
18+
; CHECK-NEXT: s_waitcnt vmcnt(0)
19+
; CHECK-NEXT: v_readfirstlane_b32 s4, v1
20+
; CHECK-NEXT: s_and_b32 s4, s4, 0xffff
21+
; CHECK-NEXT: .LBB0_1: ; %bb16
22+
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
23+
; CHECK-NEXT: s_mul_i32 s8, s6, s1
24+
; CHECK-NEXT: s_mul_hi_u32 s9, s6, s0
25+
; CHECK-NEXT: s_mul_i32 s7, s7, s0
26+
; CHECK-NEXT: s_add_i32 s8, s9, s8
27+
; CHECK-NEXT: s_mul_i32 s6, s6, s0
28+
; CHECK-NEXT: s_add_i32 s7, s8, s7
29+
; CHECK-NEXT: s_lshl_b64 s[6:7], s[6:7], 5
30+
; CHECK-NEXT: s_add_u32 s6, s2, s6
31+
; CHECK-NEXT: s_addc_u32 s7, s3, s7
32+
; CHECK-NEXT: global_load_dword v1, v0, s[6:7] glc
33+
; CHECK-NEXT: s_waitcnt vmcnt(0)
34+
; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5]
35+
; CHECK-NEXT: s_mov_b64 vcc, vcc
36+
; CHECK-NEXT: s_cbranch_vccnz .LBB0_1
37+
; CHECK-NEXT: ; %bb.2: ; %DummyReturnBlock
38+
; CHECK-NEXT: s_endpgm
39+
bb:
40+
%i = load volatile i16, ptr addrspace(1) %ptr, align 2
41+
%i6 = zext i16 %i to i64
42+
br label %bb16
43+
44+
bb16: ; preds = %bb16, %bb
45+
%i17 = phi i64 [ %i6, %bb16 ], [ 0, %bb ]
46+
%i23 = mul i64 %i17, %arg
47+
%i25.split = getelementptr [16 x half], ptr addrspace(1) %ptr, i64 %i23
48+
%i27 = load volatile <2 x half>, ptr addrspace(1) %i25.split, align 16
49+
br label %bb16
50+
}
51+
52+
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
2+
# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck %s
3+
4+
---
5+
name: constrain_readfirstlane_av
6+
tracksRegLiveness: true
7+
body: |
8+
; CHECK-LABEL: name: constrain_readfirstlane_av
9+
; CHECK: bb.0:
10+
; CHECK-NEXT: successors: %bb.1(0x80000000)
11+
; CHECK-NEXT: liveins: $vgpr0
12+
; CHECK-NEXT: {{ $}}
13+
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
14+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
15+
; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
16+
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_READFIRSTLANE_B32_]], [[DEF]], implicit-def dead $scc
17+
; CHECK-NEXT: {{ $}}
18+
; CHECK-NEXT: bb.1:
19+
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
20+
; CHECK-NEXT: {{ $}}
21+
; CHECK-NEXT: [[S_MUL_I32_:%[0-9]+]]:sreg_32 = S_MUL_I32 [[S_AND_B32_]], [[S_AND_B32_]]
22+
; CHECK-NEXT: [[S_MUL_HI_U32_:%[0-9]+]]:sreg_32 = S_MUL_HI_U32 [[S_AND_B32_]], [[S_MUL_I32_]]
23+
; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_MUL_HI_U32_]], [[S_MUL_I32_]], implicit-def dead $scc
24+
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc
25+
; CHECK-NEXT: S_BRANCH %bb.2
26+
; CHECK-NEXT: {{ $}}
27+
; CHECK-NEXT: bb.2:
28+
; CHECK-NEXT: S_ENDPGM 0
29+
bb.0:
30+
liveins: $vgpr0
31+
32+
%0:sreg_32 = IMPLICIT_DEF
33+
%1:av_32 = COPY $vgpr0
34+
%2:sreg_32 = COPY %1
35+
%3:sreg_32 = S_AND_B32 %2, %0, implicit-def dead $scc
36+
37+
bb.1:
38+
%4:sreg_32 = S_MUL_I32 %3, %3
39+
%5:sreg_32 = S_MUL_HI_U32 %3, %4
40+
%6:sreg_32 = S_ADD_I32 %5, %4, implicit-def dead $scc
41+
S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc
42+
S_BRANCH %bb.2
43+
44+
bb.2:
45+
S_ENDPGM 0
46+
...
47+
48+
# Need to respect subregister on copy source
49+
---
50+
name: constrain_readfirstlane_av64
51+
tracksRegLiveness: true
52+
body: |
53+
; CHECK-LABEL: name: constrain_readfirstlane_av64
54+
; CHECK: bb.0:
55+
; CHECK-NEXT: successors: %bb.1(0x80000000)
56+
; CHECK-NEXT: liveins: $vgpr0_vgpr1
57+
; CHECK-NEXT: {{ $}}
58+
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
59+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
60+
; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]].sub0, implicit $exec
61+
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_READFIRSTLANE_B32_]], [[DEF]], implicit-def dead $scc
62+
; CHECK-NEXT: {{ $}}
63+
; CHECK-NEXT: bb.1:
64+
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
65+
; CHECK-NEXT: {{ $}}
66+
; CHECK-NEXT: [[S_MUL_I32_:%[0-9]+]]:sreg_32 = S_MUL_I32 [[S_AND_B32_]], [[S_AND_B32_]]
67+
; CHECK-NEXT: [[S_MUL_HI_U32_:%[0-9]+]]:sreg_32 = S_MUL_HI_U32 [[S_AND_B32_]], [[S_MUL_I32_]]
68+
; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_MUL_HI_U32_]], [[S_MUL_I32_]], implicit-def dead $scc
69+
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc
70+
; CHECK-NEXT: S_BRANCH %bb.2
71+
; CHECK-NEXT: {{ $}}
72+
; CHECK-NEXT: bb.2:
73+
; CHECK-NEXT: S_ENDPGM 0
74+
bb.0:
75+
liveins: $vgpr0_vgpr1
76+
77+
%0:sreg_32 = IMPLICIT_DEF
78+
%1:av_64 = COPY $vgpr0_vgpr1
79+
%2:sreg_32 = COPY %1.sub0
80+
%3:sreg_32 = S_AND_B32 %2, %0, implicit-def dead $scc
81+
82+
bb.1:
83+
%4:sreg_32 = S_MUL_I32 %3, %3
84+
%5:sreg_32 = S_MUL_HI_U32 %3, %4
85+
%6:sreg_32 = S_ADD_I32 %5, %4, implicit-def dead $scc
86+
S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc
87+
S_BRANCH %bb.2
88+
89+
bb.2:
90+
S_ENDPGM 0
91+
...
92+

0 commit comments

Comments
 (0)