Skip to content

Commit 640a436

Browse files
committed
[AMDGPU] Improve isBasicBlockPrologue to only add necessary instructions
1 parent df1f231 commit 640a436

File tree

3 files changed

+79
-67
lines changed

3 files changed

+79
-67
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8906,11 +8906,13 @@ bool SIInstrInfo::isPrologueOperandReload(const MachineInstr &MI) const {
89068906
if ((isSGPRSpill(MI) &&
89078907
(MI.mayLoad() || Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)) ||
89088908
(isWWMRegSpillOpcode(Opcode) && MI.mayLoad())) {
8909-
Register Reg = MI.defs().begin()->getReg();
8909+
Register Reg = MI.getOperand(0).getReg();
89108910
const MachineBasicBlock *MBB = MI.getParent();
89118911
MachineBasicBlock::const_instr_iterator I(MI), E = MBB->instr_end();
89128912
while (++I != E) {
8913-
if (I->readsRegister(Reg, &RI) && isBasicBlockPrologue(*I))
8913+
if (!isBasicBlockPrologue(*I))
8914+
return false;
8915+
if (I->readsRegister(Reg, &RI))
89148916
return true;
89158917
}
89168918
}
@@ -8933,9 +8935,10 @@ bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
89338935

89348936
uint16_t Opcode = MI.getOpcode();
89358937
return IsNullOrVectorRegister &&
8936-
(isPrologueOperandReload(MI) || Opcode == AMDGPU::IMPLICIT_DEF ||
8938+
(Opcode == AMDGPU::IMPLICIT_DEF ||
89378939
(!MI.isTerminator() && Opcode != AMDGPU::COPY &&
8938-
MI.modifiesRegister(AMDGPU::EXEC, &RI)));
8940+
MI.modifiesRegister(AMDGPU::EXEC, &RI)) ||
8941+
isPrologueOperandReload(MI));
89398942
}
89408943

89418944
MachineInstrBuilder

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1341,6 +1341,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
13411341

13421342
bool isBasicBlockPrologue(const MachineInstr &MI,
13431343
Register Reg = Register()) const override;
1344+
/// Returns "true" if \p MI defines a register that is used by
1345+
/// another prologue instruction.
1346+
bool isPrologueOperandReload(const MachineInstr &MI) const;
13441347

13451348
bool isPrologueOperandReload(const MachineInstr &MI) const;
13461349

llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll

Lines changed: 69 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -1600,14 +1600,8 @@ define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) {
16001600
; NOOPT-NEXT: ; implicit-def: $vgpr0
16011601
; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
16021602
; NOOPT-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
1603-
; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1
1604-
; NOOPT-NEXT: s_waitcnt expcnt(0)
1605-
; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload
1606-
; NOOPT-NEXT: s_mov_b64 exec, s[16:17]
1607-
; NOOPT-NEXT: s_waitcnt vmcnt(0)
1608-
; NOOPT-NEXT: v_readlane_b32 s0, v31, 6
1609-
; NOOPT-NEXT: v_readlane_b32 s1, v31, 7
16101603
; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:72 ; 4-byte Folded Reload
1604+
; NOOPT-NEXT: s_waitcnt expcnt(1)
16111605
; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
16121606
; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload
16131607
; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload
@@ -1618,14 +1612,26 @@ define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) {
16181612
; NOOPT-NEXT: buffer_load_dword v7, off, s[20:23], 0 offset:32 ; 4-byte Folded Reload
16191613
; NOOPT-NEXT: buffer_load_dword v8, off, s[20:23], 0 offset:36 ; 4-byte Folded Reload
16201614
; NOOPT-NEXT: buffer_load_dword v9, off, s[20:23], 0 offset:40 ; 4-byte Folded Reload
1615+
; NOOPT-NEXT: s_waitcnt expcnt(6)
16211616
; NOOPT-NEXT: buffer_load_dword v10, off, s[20:23], 0 offset:44 ; 4-byte Folded Reload
1617+
; NOOPT-NEXT: s_waitcnt expcnt(5)
16221618
; NOOPT-NEXT: buffer_load_dword v11, off, s[20:23], 0 offset:48 ; 4-byte Folded Reload
1619+
; NOOPT-NEXT: s_waitcnt expcnt(4)
16231620
; NOOPT-NEXT: buffer_load_dword v12, off, s[20:23], 0 offset:52 ; 4-byte Folded Reload
1621+
; NOOPT-NEXT: s_waitcnt expcnt(3)
16241622
; NOOPT-NEXT: buffer_load_dword v13, off, s[20:23], 0 offset:56 ; 4-byte Folded Reload
1623+
; NOOPT-NEXT: s_waitcnt expcnt(2)
16251624
; NOOPT-NEXT: buffer_load_dword v14, off, s[20:23], 0 offset:60 ; 4-byte Folded Reload
1625+
; NOOPT-NEXT: s_waitcnt expcnt(1)
16261626
; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload
16271627
; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload
1628+
; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1
1629+
; NOOPT-NEXT: s_waitcnt expcnt(0)
1630+
; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload
1631+
; NOOPT-NEXT: s_mov_b64 exec, s[16:17]
16281632
; NOOPT-NEXT: s_waitcnt vmcnt(0)
1633+
; NOOPT-NEXT: v_readlane_b32 s0, v31, 6
1634+
; NOOPT-NEXT: v_readlane_b32 s1, v31, 7
16291635
; NOOPT-NEXT: v_readfirstlane_b32 s2, v16
16301636
; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v16
16311637
; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
@@ -4122,13 +4128,6 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr
41224128
; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill
41234129
; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
41244130
; NOOPT-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
4125-
; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1
4126-
; NOOPT-NEXT: s_waitcnt expcnt(0)
4127-
; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload
4128-
; NOOPT-NEXT: s_mov_b64 exec, s[16:17]
4129-
; NOOPT-NEXT: s_waitcnt vmcnt(0)
4130-
; NOOPT-NEXT: v_readlane_b32 s0, v31, 6
4131-
; NOOPT-NEXT: v_readlane_b32 s1, v31, 7
41324131
; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
41334132
; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload
41344133
; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload
@@ -4154,7 +4153,12 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr
41544153
; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload
41554154
; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload
41564155
; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:136 ; 4-byte Folded Reload
4156+
; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1
4157+
; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload
4158+
; NOOPT-NEXT: s_mov_b64 exec, s[16:17]
41574159
; NOOPT-NEXT: s_waitcnt vmcnt(0)
4160+
; NOOPT-NEXT: v_readlane_b32 s0, v31, 6
4161+
; NOOPT-NEXT: v_readlane_b32 s1, v31, 7
41584162
; NOOPT-NEXT: v_readfirstlane_b32 s2, v17
41594163
; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17
41604164
; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
@@ -4607,13 +4611,6 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p
46074611
; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill
46084612
; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
46094613
; NOOPT-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
4610-
; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1
4611-
; NOOPT-NEXT: s_waitcnt expcnt(0)
4612-
; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload
4613-
; NOOPT-NEXT: s_mov_b64 exec, s[16:17]
4614-
; NOOPT-NEXT: s_waitcnt vmcnt(0)
4615-
; NOOPT-NEXT: v_readlane_b32 s0, v31, 6
4616-
; NOOPT-NEXT: v_readlane_b32 s1, v31, 7
46174614
; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
46184615
; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload
46194616
; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload
@@ -4639,7 +4636,12 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p
46394636
; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload
46404637
; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload
46414638
; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:136 ; 4-byte Folded Reload
4639+
; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1
4640+
; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload
4641+
; NOOPT-NEXT: s_mov_b64 exec, s[16:17]
46424642
; NOOPT-NEXT: s_waitcnt vmcnt(0)
4643+
; NOOPT-NEXT: v_readlane_b32 s0, v31, 6
4644+
; NOOPT-NEXT: v_readlane_b32 s1, v31, 7
46434645
; NOOPT-NEXT: v_readfirstlane_b32 s2, v17
46444646
; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17
46454647
; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
@@ -5161,14 +5163,8 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1
51615163
; NOOPT-NEXT: ; implicit-def: $vgpr0
51625164
; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
51635165
; NOOPT-NEXT: .LBB16_1: ; =>This Inner Loop Header: Depth=1
5164-
; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1
5165-
; NOOPT-NEXT: s_waitcnt expcnt(0)
5166-
; NOOPT-NEXT: buffer_load_dword v18, off, s[36:39], 0 ; 4-byte Folded Reload
5167-
; NOOPT-NEXT: s_mov_b64 exec, s[28:29]
5168-
; NOOPT-NEXT: s_waitcnt vmcnt(0)
5169-
; NOOPT-NEXT: v_readlane_b32 s0, v18, 23
5170-
; NOOPT-NEXT: v_readlane_b32 s1, v18, 24
51715166
; NOOPT-NEXT: buffer_load_dword v17, off, s[36:39], 0 offset:80 ; 4-byte Folded Reload
5167+
; NOOPT-NEXT: s_waitcnt expcnt(1)
51725168
; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4 ; 4-byte Folded Reload
51735169
; NOOPT-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8 ; 4-byte Folded Reload
51745170
; NOOPT-NEXT: buffer_load_dword v2, off, s[36:39], 0 offset:12 ; 4-byte Folded Reload
@@ -5179,14 +5175,26 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1
51795175
; NOOPT-NEXT: buffer_load_dword v7, off, s[36:39], 0 offset:32 ; 4-byte Folded Reload
51805176
; NOOPT-NEXT: buffer_load_dword v8, off, s[36:39], 0 offset:36 ; 4-byte Folded Reload
51815177
; NOOPT-NEXT: buffer_load_dword v9, off, s[36:39], 0 offset:40 ; 4-byte Folded Reload
5178+
; NOOPT-NEXT: s_waitcnt expcnt(6)
51825179
; NOOPT-NEXT: buffer_load_dword v10, off, s[36:39], 0 offset:44 ; 4-byte Folded Reload
5180+
; NOOPT-NEXT: s_waitcnt expcnt(5)
51835181
; NOOPT-NEXT: buffer_load_dword v11, off, s[36:39], 0 offset:48 ; 4-byte Folded Reload
5182+
; NOOPT-NEXT: s_waitcnt expcnt(4)
51845183
; NOOPT-NEXT: buffer_load_dword v12, off, s[36:39], 0 offset:52 ; 4-byte Folded Reload
5184+
; NOOPT-NEXT: s_waitcnt expcnt(3)
51855185
; NOOPT-NEXT: buffer_load_dword v13, off, s[36:39], 0 offset:56 ; 4-byte Folded Reload
5186+
; NOOPT-NEXT: s_waitcnt expcnt(2)
51865187
; NOOPT-NEXT: buffer_load_dword v14, off, s[36:39], 0 offset:60 ; 4-byte Folded Reload
5188+
; NOOPT-NEXT: s_waitcnt expcnt(1)
51875189
; NOOPT-NEXT: buffer_load_dword v15, off, s[36:39], 0 offset:64 ; 4-byte Folded Reload
51885190
; NOOPT-NEXT: buffer_load_dword v16, off, s[36:39], 0 offset:72 ; 4-byte Folded Reload
5191+
; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1
5192+
; NOOPT-NEXT: s_waitcnt expcnt(0)
5193+
; NOOPT-NEXT: buffer_load_dword v18, off, s[36:39], 0 ; 4-byte Folded Reload
5194+
; NOOPT-NEXT: s_mov_b64 exec, s[28:29]
51895195
; NOOPT-NEXT: s_waitcnt vmcnt(0)
5196+
; NOOPT-NEXT: v_readlane_b32 s0, v18, 23
5197+
; NOOPT-NEXT: v_readlane_b32 s1, v18, 24
51905198
; NOOPT-NEXT: v_readfirstlane_b32 s2, v16
51915199
; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v16
51925200
; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
@@ -5278,14 +5286,8 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1
52785286
; NOOPT-NEXT: ; implicit-def: $vgpr0
52795287
; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
52805288
; NOOPT-NEXT: .LBB16_4: ; =>This Inner Loop Header: Depth=1
5281-
; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1
5282-
; NOOPT-NEXT: s_waitcnt expcnt(0)
5283-
; NOOPT-NEXT: buffer_load_dword v18, off, s[36:39], 0 ; 4-byte Folded Reload
5284-
; NOOPT-NEXT: s_mov_b64 exec, s[28:29]
5285-
; NOOPT-NEXT: s_waitcnt vmcnt(0)
5286-
; NOOPT-NEXT: v_readlane_b32 s0, v18, 28
5287-
; NOOPT-NEXT: v_readlane_b32 s1, v18, 29
52885289
; NOOPT-NEXT: buffer_load_dword v17, off, s[36:39], 0 offset:152 ; 4-byte Folded Reload
5290+
; NOOPT-NEXT: s_waitcnt expcnt(1)
52895291
; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:88 ; 4-byte Folded Reload
52905292
; NOOPT-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:92 ; 4-byte Folded Reload
52915293
; NOOPT-NEXT: buffer_load_dword v2, off, s[36:39], 0 offset:96 ; 4-byte Folded Reload
@@ -5296,14 +5298,26 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1
52965298
; NOOPT-NEXT: buffer_load_dword v7, off, s[36:39], 0 offset:116 ; 4-byte Folded Reload
52975299
; NOOPT-NEXT: buffer_load_dword v8, off, s[36:39], 0 offset:120 ; 4-byte Folded Reload
52985300
; NOOPT-NEXT: buffer_load_dword v9, off, s[36:39], 0 offset:124 ; 4-byte Folded Reload
5301+
; NOOPT-NEXT: s_waitcnt expcnt(6)
52995302
; NOOPT-NEXT: buffer_load_dword v10, off, s[36:39], 0 offset:128 ; 4-byte Folded Reload
5303+
; NOOPT-NEXT: s_waitcnt expcnt(5)
53005304
; NOOPT-NEXT: buffer_load_dword v11, off, s[36:39], 0 offset:132 ; 4-byte Folded Reload
5305+
; NOOPT-NEXT: s_waitcnt expcnt(4)
53015306
; NOOPT-NEXT: buffer_load_dword v12, off, s[36:39], 0 offset:136 ; 4-byte Folded Reload
5307+
; NOOPT-NEXT: s_waitcnt expcnt(3)
53025308
; NOOPT-NEXT: buffer_load_dword v13, off, s[36:39], 0 offset:140 ; 4-byte Folded Reload
5309+
; NOOPT-NEXT: s_waitcnt expcnt(2)
53035310
; NOOPT-NEXT: buffer_load_dword v14, off, s[36:39], 0 offset:144 ; 4-byte Folded Reload
5311+
; NOOPT-NEXT: s_waitcnt expcnt(1)
53045312
; NOOPT-NEXT: buffer_load_dword v15, off, s[36:39], 0 offset:148 ; 4-byte Folded Reload
53055313
; NOOPT-NEXT: buffer_load_dword v16, off, s[36:39], 0 offset:68 ; 4-byte Folded Reload
5314+
; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1
5315+
; NOOPT-NEXT: s_waitcnt expcnt(0)
5316+
; NOOPT-NEXT: buffer_load_dword v18, off, s[36:39], 0 ; 4-byte Folded Reload
5317+
; NOOPT-NEXT: s_mov_b64 exec, s[28:29]
53065318
; NOOPT-NEXT: s_waitcnt vmcnt(0)
5319+
; NOOPT-NEXT: v_readlane_b32 s0, v18, 28
5320+
; NOOPT-NEXT: v_readlane_b32 s1, v18, 29
53075321
; NOOPT-NEXT: v_readfirstlane_b32 s2, v16
53085322
; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v16
53095323
; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
@@ -5889,13 +5903,6 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
58895903
; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill
58905904
; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
58915905
; NOOPT-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1
5892-
; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1
5893-
; NOOPT-NEXT: s_waitcnt expcnt(0)
5894-
; NOOPT-NEXT: buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload
5895-
; NOOPT-NEXT: s_mov_b64 exec, s[26:27]
5896-
; NOOPT-NEXT: s_waitcnt vmcnt(0)
5897-
; NOOPT-NEXT: v_readlane_b32 s0, v32, 7
5898-
; NOOPT-NEXT: v_readlane_b32 s1, v32, 8
58995906
; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:4 ; 4-byte Folded Reload
59005907
; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 offset:8 ; 4-byte Folded Reload
59015908
; NOOPT-NEXT: buffer_load_dword v2, off, s[28:31], 0 offset:12 ; 4-byte Folded Reload
@@ -5921,7 +5928,12 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
59215928
; NOOPT-NEXT: buffer_load_dword v15, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload
59225929
; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:72 ; 4-byte Folded Reload
59235930
; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:80 ; 4-byte Folded Reload
5931+
; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1
5932+
; NOOPT-NEXT: buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload
5933+
; NOOPT-NEXT: s_mov_b64 exec, s[26:27]
59245934
; NOOPT-NEXT: s_waitcnt vmcnt(0)
5935+
; NOOPT-NEXT: v_readlane_b32 s0, v32, 7
5936+
; NOOPT-NEXT: v_readlane_b32 s1, v32, 8
59255937
; NOOPT-NEXT: v_readfirstlane_b32 s2, v17
59265938
; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17
59275939
; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
@@ -6023,13 +6035,6 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
60236035
; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:212 ; 4-byte Folded Spill
60246036
; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
60256037
; NOOPT-NEXT: .LBB17_4: ; =>This Inner Loop Header: Depth=1
6026-
; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1
6027-
; NOOPT-NEXT: s_waitcnt expcnt(0)
6028-
; NOOPT-NEXT: buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload
6029-
; NOOPT-NEXT: s_mov_b64 exec, s[26:27]
6030-
; NOOPT-NEXT: s_waitcnt vmcnt(0)
6031-
; NOOPT-NEXT: v_readlane_b32 s0, v32, 11
6032-
; NOOPT-NEXT: v_readlane_b32 s1, v32, 12
60336038
; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:152 ; 4-byte Folded Reload
60346039
; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 offset:156 ; 4-byte Folded Reload
60356040
; NOOPT-NEXT: buffer_load_dword v2, off, s[28:31], 0 offset:160 ; 4-byte Folded Reload
@@ -6055,7 +6060,12 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
60556060
; NOOPT-NEXT: buffer_load_dword v15, off, s[28:31], 0 offset:212 ; 4-byte Folded Reload
60566061
; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:216 ; 4-byte Folded Reload
60576062
; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:76 ; 4-byte Folded Reload
6063+
; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1
6064+
; NOOPT-NEXT: buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload
6065+
; NOOPT-NEXT: s_mov_b64 exec, s[26:27]
60586066
; NOOPT-NEXT: s_waitcnt vmcnt(0)
6067+
; NOOPT-NEXT: v_readlane_b32 s0, v32, 11
6068+
; NOOPT-NEXT: v_readlane_b32 s1, v32, 12
60596069
; NOOPT-NEXT: v_readfirstlane_b32 s2, v17
60606070
; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17
60616071
; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
@@ -9146,13 +9156,6 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
91469156
; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
91479157
; NOOPT-NEXT: .LBB26_3: ; Parent Loop BB26_1 Depth=1
91489158
; NOOPT-NEXT: ; => This Inner Loop Header: Depth=2
9149-
; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1
9150-
; NOOPT-NEXT: s_waitcnt expcnt(0)
9151-
; NOOPT-NEXT: buffer_load_dword v18, off, s[24:27], 0 ; 4-byte Folded Reload
9152-
; NOOPT-NEXT: s_mov_b64 exec, s[20:21]
9153-
; NOOPT-NEXT: s_waitcnt vmcnt(0)
9154-
; NOOPT-NEXT: v_readlane_b32 s0, v18, 6
9155-
; NOOPT-NEXT: v_readlane_b32 s1, v18, 7
91569159
; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:12 ; 4-byte Folded Reload
91579160
; NOOPT-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:16 ; 4-byte Folded Reload
91589161
; NOOPT-NEXT: buffer_load_dword v2, off, s[24:27], 0 offset:20 ; 4-byte Folded Reload
@@ -9178,7 +9181,12 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
91789181
; NOOPT-NEXT: buffer_load_dword v15, off, s[24:27], 0 offset:72 ; 4-byte Folded Reload
91799182
; NOOPT-NEXT: buffer_load_dword v16, off, s[24:27], 0 offset:76 ; 4-byte Folded Reload
91809183
; NOOPT-NEXT: buffer_load_dword v17, off, s[24:27], 0 offset:80 ; 4-byte Folded Reload
9184+
; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1
9185+
; NOOPT-NEXT: buffer_load_dword v18, off, s[24:27], 0 ; 4-byte Folded Reload
9186+
; NOOPT-NEXT: s_mov_b64 exec, s[20:21]
91819187
; NOOPT-NEXT: s_waitcnt vmcnt(0)
9188+
; NOOPT-NEXT: v_readlane_b32 s0, v18, 6
9189+
; NOOPT-NEXT: v_readlane_b32 s1, v18, 7
91829190
; NOOPT-NEXT: v_readfirstlane_b32 s2, v17
91839191
; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17
91849192
; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
@@ -9637,13 +9645,6 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace
96379645
; NOOPT-NEXT: buffer_store_dword v15, off, s[16:19], 0 offset:64 ; 4-byte Folded Spill
96389646
; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
96399647
; NOOPT-NEXT: .LBB27_1: ; =>This Inner Loop Header: Depth=1
9640-
; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1
9641-
; NOOPT-NEXT: s_waitcnt expcnt(0)
9642-
; NOOPT-NEXT: buffer_load_dword v33, off, s[16:19], 0 ; 4-byte Folded Reload
9643-
; NOOPT-NEXT: s_mov_b64 exec, s[12:13]
9644-
; NOOPT-NEXT: s_waitcnt vmcnt(0)
9645-
; NOOPT-NEXT: v_readlane_b32 s0, v33, 9
9646-
; NOOPT-NEXT: v_readlane_b32 s1, v33, 10
96479648
; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload
96489649
; NOOPT-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload
96499650
; NOOPT-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload
@@ -9669,7 +9670,12 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace
96699670
; NOOPT-NEXT: buffer_load_dword v15, off, s[16:19], 0 offset:64 ; 4-byte Folded Reload
96709671
; NOOPT-NEXT: buffer_load_dword v16, off, s[16:19], 0 offset:144 ; 4-byte Folded Reload
96719672
; NOOPT-NEXT: buffer_load_dword v17, off, s[16:19], 0 offset:132 ; 4-byte Folded Reload
9673+
; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1
9674+
; NOOPT-NEXT: buffer_load_dword v33, off, s[16:19], 0 ; 4-byte Folded Reload
9675+
; NOOPT-NEXT: s_mov_b64 exec, s[12:13]
96729676
; NOOPT-NEXT: s_waitcnt vmcnt(0)
9677+
; NOOPT-NEXT: v_readlane_b32 s0, v33, 9
9678+
; NOOPT-NEXT: v_readlane_b32 s1, v33, 10
96739679
; NOOPT-NEXT: v_readfirstlane_b32 s2, v17
96749680
; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17
96759681
; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]

0 commit comments

Comments
 (0)