
Commit 4ec8908

AMDGPU: Try to constrain av registers to VGPR to enable ds_write2 formation (#156400)
In future changes we will have more AV_ virtual registers, which currently block the formation of write2. Most of the time these registers can simply be constrained to VGPR, so do that.

This also relaxes the constraint in the flat merging case. We already have the necessary code to insert copies to the original result registers, so there is no point in avoiding it.

Addresses the easy half of #155769
1 parent: 681046e
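The mechanism behind the change is MachineRegisterInfo::constrainRegClass, which replaces a virtual register's class with a common subclass of its current class and the requested class, returning nullptr when no such subclass exists (for example, when the register has already been constrained to an AGPR-only class). The sketch below is a simplified illustration of that pattern rather than the patch itself; the helper name tryConstrainDataToVGPR and its parameters are hypothetical, while the subregister handling mirrors the getMatchingSuperRegClass logic added to checkAndPrepareMerge in the diff below.

    #include "llvm/CodeGen/MachineOperand.h"
    #include "llvm/CodeGen/MachineRegisterInfo.h"
    #include "llvm/CodeGen/TargetRegisterInfo.h"

    using namespace llvm;

    // Hypothetical helper (illustration only): try to constrain the virtual
    // register feeding a ds_write data operand to the register class the
    // merged write2 instruction requires (a VGPR class in the current lowering).
    static bool tryConstrainDataToVGPR(MachineRegisterInfo &MRI,
                                       const TargetRegisterInfo &TRI,
                                       const MachineOperand &DataOp,
                                       const TargetRegisterClass *RequiredRC) {
      // If the operand reads a subregister, the full register must instead be
      // constrained to a super-class whose subregister at that index lies in
      // the required class.
      if (unsigned SubReg = DataOp.getSubReg())
        RequiredRC = TRI.getMatchingSuperRegClass(
            MRI.getRegClass(DataOp.getReg()), RequiredRC, SubReg);

      // constrainRegClass returns the narrowed class on success and nullptr on
      // failure; a failed constraint means the pair is left unmerged.
      return RequiredRC && MRI.constrainRegClass(DataOp.getReg(), RequiredRC);
    }

If either data operand of a candidate ds_write pair cannot be constrained this way, checkAndPrepareMerge now returns nullptr and the two writes stay separate.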

File tree: 5 files changed, +289 −36 lines changed

llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

Lines changed: 57 additions & 18 deletions

@@ -119,7 +119,7 @@ class SILoadStoreOptimizer {
     unsigned DMask;
     InstClassEnum InstClass;
     unsigned CPol = 0;
-    bool IsAGPR;
+    const TargetRegisterClass *DataRC;
     bool UseST64;
     int AddrIdx[MaxAddressRegs];
     const MachineOperand *AddrReg[MaxAddressRegs];
@@ -203,6 +203,7 @@ class SILoadStoreOptimizer {
   using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;

 private:
+  MachineFunction *MF = nullptr;
   const GCNSubtarget *STM = nullptr;
   const SIInstrInfo *TII = nullptr;
   const SIRegisterInfo *TRI = nullptr;
@@ -245,6 +246,8 @@ class SILoadStoreOptimizer {

   unsigned write2Opcode(unsigned EltSize) const;
   unsigned write2ST64Opcode(unsigned EltSize) const;
+  unsigned getWrite2Opcode(const CombineInfo &CI) const;
+
   MachineBasicBlock::iterator
   mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                   MachineBasicBlock::iterator InsertBefore);
@@ -846,7 +849,7 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
   if (InstClass == UNKNOWN)
     return;

-  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
+  DataRC = LSO.getDataRegClass(*MI);

   switch (InstClass) {
   case DS_READ:
@@ -1313,6 +1316,50 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
   // have already been confirmed to be mergeable.
   if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
     offsetsCanBeCombined(CI, *STM, Paired, true);
+
+  if (CI.InstClass == DS_WRITE) {
+    // Both data operands must be AGPR or VGPR, so the data registers need to
+    // be constrained to one or the other. We expect to only emit the VGPR form
+    // here for now.
+    //
+    // FIXME: There is currently a hack in getRegClass to report that the write2
+    // operands are VGPRs. In the future we should have separate agpr
+    // instruction definitions.
+    const MachineOperand *Data0 =
+        TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
+    const MachineOperand *Data1 =
+        TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
+
+    const MCInstrDesc &Write2Opc = TII->get(getWrite2Opcode(CI));
+    int Data0Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(),
+                                              AMDGPU::OpName::data0);
+    int Data1Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(),
+                                              AMDGPU::OpName::data1);
+
+    const TargetRegisterClass *DataRC0 =
+        TII->getRegClass(Write2Opc, Data0Idx, TRI, *MF);
+
+    const TargetRegisterClass *DataRC1 =
+        TII->getRegClass(Write2Opc, Data1Idx, TRI, *MF);
+
+    if (unsigned SubReg = Data0->getSubReg()) {
+      DataRC0 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data0->getReg()),
+                                              DataRC0, SubReg);
+    }
+
+    if (unsigned SubReg = Data1->getSubReg()) {
+      DataRC1 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data1->getReg()),
+                                              DataRC1, SubReg);
+    }
+
+    if (!MRI->constrainRegClass(Data0->getReg(), DataRC0) ||
+        !MRI->constrainRegClass(Data1->getReg(), DataRC1))
+      return nullptr;
+
+    // TODO: If one register can be constrained, and not the other, insert a
+    // copy.
+  }
+
   return Where;
 }

@@ -1462,6 +1509,10 @@ unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
 }

+unsigned SILoadStoreOptimizer::getWrite2Opcode(const CombineInfo &CI) const {
+  return CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
+}
+
 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
     CombineInfo &CI, CombineInfo &Paired,
     MachineBasicBlock::iterator InsertBefore) {
@@ -1478,8 +1529,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(

   unsigned NewOffset0 = CI.Offset;
   unsigned NewOffset1 = Paired.Offset;
-  unsigned Opc =
-      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
+  unsigned Opc = getWrite2Opcode(CI);

   if (NewOffset0 > NewOffset1) {
     // Canonicalize the merged instruction so the smaller offset comes first.
@@ -2032,6 +2082,8 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
     }
   }

+  // FIXME: This should compute the instruction to use, and then use the result
+  // of TII->getRegClass.
   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
   return TRI->isAGPRClass(getDataRegClass(*CI.I))
              ? TRI->getAGPRClassForBitWidth(BitWidth)
@@ -2400,7 +2452,6 @@ void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
     std::list<std::list<CombineInfo> > &MergeableInsts) const {
   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
     if (AddrList.front().InstClass == CI.InstClass &&
-        AddrList.front().IsAGPR == CI.IsAGPR &&
         AddrList.front().hasSameBaseAddress(CI)) {
       AddrList.emplace_back(CI);
       return;
@@ -2465,19 +2516,6 @@ SILoadStoreOptimizer::collectMergeableInsts(
     if (!CI.hasMergeableAddress(*MRI))
       continue;

-    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
-      LLVM_DEBUG(
-          dbgs() << "cannot merge ds writes with mixed AGPR and VGPR data\n");
-
-      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
-      //        operands. However we are reporting that ds_write2 shall have
-      //        only VGPR data so that machine copy propagation does not
-      //        create an illegal instruction with a VGPR and AGPR sources.
-      //        Consequently if we create such instruction the verifier
-      //        will complain.
-      continue;
-    }
-
     LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

     addInstToMergeableList(CI, MergeableInsts);
@@ -2650,6 +2688,7 @@ bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {
 }

 bool SILoadStoreOptimizer::run(MachineFunction &MF) {
+  this->MF = &MF;
   STM = &MF.getSubtarget<GCNSubtarget>();
   if (!STM->loadStoreOptEnabled())
     return false;

New MIR test file

Lines changed: 210 additions & 0 deletions

@@ -0,0 +1,210 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=si-load-store-opt -o - %s | FileCheck %s
+
+---
+name: ds_write_b32__av32_x2
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: ds_write_b32__av32_x2
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; CHECK-NEXT: DS_WRITE2_B32_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 10, 24, 0, implicit $exec :: (store (s32), addrspace 3)
+    %0:vgpr_32 = COPY $vgpr0
+    %1:av_32 = COPY $vgpr1
+    %2:av_32 = COPY $vgpr2
+    DS_WRITE_B32_gfx9 %0, %1, 40, 0, implicit $exec :: (store (s32), addrspace 3)
+    DS_WRITE_B32_gfx9 %0, %2, 96, 0, implicit $exec :: (store (s32), addrspace 3)
+
+...
+
+---
+name: ds_write_b32__av32_x2_subregs_different_reg
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5
+
+    ; CHECK-LABEL: name: ds_write_b32__av32_x2_subregs_different_reg
+    ; CHECK: liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+    ; CHECK-NEXT: DS_WRITE2_B32_gfx9 [[COPY]], [[COPY1]].sub0, [[COPY2]].sub1, 10, 24, 0, implicit $exec :: (store (s32), addrspace 3)
+    %0:vgpr_32 = COPY $vgpr0
+    %1:av_64_align2 = COPY $vgpr2_vgpr3
+    %2:av_64_align2 = COPY $vgpr4_vgpr5
+    DS_WRITE_B32_gfx9 %0, %1.sub0, 40, 0, implicit $exec :: (store (s32), addrspace 3)
+    DS_WRITE_B32_gfx9 %0, %2.sub1, 96, 0, implicit $exec :: (store (s32), addrspace 3)
+
+...
+
+---
+name: ds_write_b32__unaligned_av64_subregs
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4
+
+    ; CHECK-LABEL: name: ds_write_b32__unaligned_av64_subregs
+    ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4
+    ; CHECK-NEXT: DS_WRITE2_B32_gfx9 [[COPY]], [[COPY1]].sub0, [[COPY2]].sub1, 10, 24, 0, implicit $exec :: (store (s32), addrspace 3)
+    %0:vgpr_32 = COPY $vgpr0
+    %1:av_64 = COPY $vgpr1_vgpr2
+    %2:av_64 = COPY $vgpr3_vgpr4
+    DS_WRITE_B32_gfx9 %0, %1.sub0, 40, 0, implicit $exec :: (store (s32), addrspace 3)
+    DS_WRITE_B32_gfx9 %0, %2.sub1, 96, 0, implicit $exec :: (store (s32), addrspace 3)
+
+...
+
+---
+name: ds_write_b32__av32_x2_subregs_same_reg
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5
+
+    ; CHECK-LABEL: name: ds_write_b32__av32_x2_subregs_same_reg
+    ; CHECK: liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: DS_WRITE2_B32_gfx9 [[COPY]], [[COPY1]].sub0, [[COPY1]].sub1, 10, 24, 0, implicit $exec :: (store (s32), addrspace 3)
+    %0:vgpr_32 = COPY $vgpr0
+    %1:av_64_align2 = COPY $vgpr2_vgpr3
+    DS_WRITE_B32_gfx9 %0, %1.sub0, 40, 0, implicit $exec :: (store (s32), addrspace 3)
+    DS_WRITE_B32_gfx9 %0, %1.sub1, 96, 0, implicit $exec :: (store (s32), addrspace 3)
+
+...
+
+---
+name: ds_write_b32__av32__vgpr32
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: ds_write_b32__av32__vgpr32
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; CHECK-NEXT: DS_WRITE2_B32_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 10, 24, 0, implicit $exec :: (store (s32), addrspace 3)
+    %0:vgpr_32 = COPY $vgpr0
+    %1:av_32 = COPY $vgpr1
+    %2:vgpr_32 = COPY $vgpr2
+    DS_WRITE_B32_gfx9 %0, %1, 40, 0, implicit $exec :: (store (s32), addrspace 3)
+    DS_WRITE_B32_gfx9 %0, %2, 96, 0, implicit $exec :: (store (s32), addrspace 3)
+
+...
+
+---
+name: ds_write_b32__vgpr32__av32
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: ds_write_b32__vgpr32__av32
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; CHECK-NEXT: DS_WRITE2_B32_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 10, 24, 0, implicit $exec :: (store (s32), addrspace 3)
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:av_32 = COPY $vgpr2
+    DS_WRITE_B32_gfx9 %0, %1, 40, 0, implicit $exec :: (store (s32), addrspace 3)
+    DS_WRITE_B32_gfx9 %0, %2, 96, 0, implicit $exec :: (store (s32), addrspace 3)
+
+...
+
+---
+name: ds_write_b64__av64_x2
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5
+
+    ; CHECK-LABEL: name: ds_write_b64__av64_x2
+    ; CHECK: liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+    ; CHECK-NEXT: DS_WRITE2_B64_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 5, 12, 0, implicit $exec :: (store (s64), addrspace 3)
+    %0:vgpr_32 = COPY $vgpr0
+    %1:av_64_align2 = COPY $vgpr2_vgpr3
+    %2:av_64_align2 = COPY $vgpr4_vgpr5
+    DS_WRITE_B64_gfx9 %0, %1, 40, 0, implicit $exec :: (store (s64), addrspace 3)
+    DS_WRITE_B64_gfx9 %0, %2, 96, 0, implicit $exec :: (store (s64), addrspace 3)
+
+...
+
+---
+name: ds_write_b64__av64_x2_subregs
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr6_vgpr7_vgpr8_vgpr9
+
+    ; CHECK-LABEL: name: ds_write_b64__av64_x2_subregs
+    ; CHECK: liveins: $vgpr0, $vgpr2_vgpr3_vgpr4_vgpr5, $vgpr6_vgpr7_vgpr8_vgpr9
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY $vgpr6_vgpr7_vgpr8_vgpr9
+    ; CHECK-NEXT: DS_WRITE2_B64_gfx9 [[COPY]], [[COPY1]].sub2_sub3, [[COPY2]].sub2_sub3, 5, 12, 0, implicit $exec :: (store (s64), addrspace 3)
+    %0:vgpr_32 = COPY $vgpr0
+    %1:av_128_align2 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    %2:av_128_align2 = COPY $vgpr6_vgpr7_vgpr8_vgpr9
+    DS_WRITE_B64_gfx9 %0, %1.sub2_sub3, 40, 0, implicit $exec :: (store (s64), addrspace 3)
+    DS_WRITE_B64_gfx9 %0, %2.sub2_sub3, 96, 0, implicit $exec :: (store (s64), addrspace 3)
+
+...
+
+---
+name: ds_writest64_b32__av32_x2
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: ds_writest64_b32__av32_x2
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; CHECK-NEXT: DS_WRITE2ST64_B32_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 1, 3, 0, implicit $exec :: (store (s32), addrspace 3)
+    %0:vgpr_32 = COPY $vgpr0
+    %1:av_32 = COPY $vgpr1
+    %2:av_32 = COPY $vgpr2
+    DS_WRITE_B32_gfx9 %0, %1, 256, 0, implicit $exec :: (store (s32), addrspace 3)
+    DS_WRITE_B32_gfx9 %0, %2, 768, 0, implicit $exec :: (store (s32), addrspace 3)
+
+...
+
+---
+name: ds_writest64_b64__av64_x2
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5
+
+    ; CHECK-LABEL: name: ds_writest64_b64__av64_x2
+    ; CHECK: liveins: $vgpr0, $vgpr2_vgpr3, $vgpr4_vgpr5
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+    ; CHECK-NEXT: DS_WRITE2ST64_B64_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 1, 3, 0, implicit $exec :: (store (s64), addrspace 3)
+    %0:vgpr_32 = COPY $vgpr0
+    %1:av_64_align2 = COPY $vgpr2_vgpr3
+    %2:av_64_align2 = COPY $vgpr4_vgpr5
+    DS_WRITE_B64_gfx9 %0, %1, 512, 0, implicit $exec :: (store (s64), addrspace 3)
+    DS_WRITE_B64_gfx9 %0, %2, 1536, 0, implicit $exec :: (store (s64), addrspace 3)
+
+...

llvm/test/CodeGen/AMDGPU/merge-flat-load-store.mir

Lines changed: 6 additions & 5 deletions

@@ -172,9 +172,10 @@ body: |

     ; GCN-LABEL: name: no_merge_flat_load_dword_agpr_with_vgpr
     ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-    ; GCN-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr poison`)
-    ; GCN-NEXT: [[FLAT_LOAD_DWORD1:%[0-9]+]]:agpr_32 = FLAT_LOAD_DWORD [[DEF]], 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr poison`)
-    ; GCN-NEXT: S_NOP 0, implicit [[FLAT_LOAD_DWORD]], implicit [[FLAT_LOAD_DWORD1]]
+    ; GCN-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = FLAT_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `ptr poison`, align 4)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[FLAT_LOAD_DWORDX2_]].sub0
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:agpr_32 = COPY killed [[FLAT_LOAD_DWORDX2_]].sub1
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
     %0:vreg_64_align2 = IMPLICIT_DEF
     %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr poison`, align 4)
     %2:agpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr poison`, align 4)
@@ -398,8 +399,8 @@ body: |
     ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
     ; GCN-NEXT: [[DEF1:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
     ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; GCN-NEXT: FLAT_STORE_DWORD [[DEF]], killed [[DEF1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`)
-    ; GCN-NEXT: FLAT_STORE_DWORD killed [[DEF]], killed [[DEF2]], 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`)
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:areg_64_align2 = REG_SEQUENCE killed [[DEF1]], %subreg.sub0, killed [[DEF2]], %subreg.sub1
+    ; GCN-NEXT: FLAT_STORE_DWORDX2 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `ptr poison`, align 4)
     %0:vreg_64_align2 = IMPLICIT_DEF
     %1:agpr_32 = IMPLICIT_DEF
     %2:vgpr_32 = IMPLICIT_DEF
