Skip to content

Commit e5e74e9

Browse files
authored
AMDGPU: Use getMergedLocation in SILoadStoreOptimizer (#156396)
This pass merges loads and stores, so use the merged DebugLoc of the instructions being combined. I'm not sure whether computeBase should also use the merged location from all of the involved instructions, and I'm also not sure how best to test this sort of thing.
1 parent c5ce802 commit e5e74e9

File tree

2 files changed

+130
-25
lines changed

2 files changed

+130
-25
lines changed

llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

Lines changed: 41 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -233,10 +233,11 @@ class SILoadStoreOptimizer {
233233

234234
void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
235235
MachineBasicBlock::iterator InsertBefore,
236-
AMDGPU::OpName OpName, Register DestReg) const;
236+
const DebugLoc &DL, AMDGPU::OpName OpName,
237+
Register DestReg) const;
237238
Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
238239
MachineBasicBlock::iterator InsertBefore,
239-
AMDGPU::OpName OpName) const;
240+
const DebugLoc &DL, AMDGPU::OpName OpName) const;
240241

241242
unsigned read2Opcode(unsigned EltSize) const;
242243
unsigned read2ST64Opcode(unsigned EltSize) const;
@@ -1367,10 +1368,9 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
13671368
// Paired.
13681369
void SILoadStoreOptimizer::copyToDestRegs(
13691370
CombineInfo &CI, CombineInfo &Paired,
1370-
MachineBasicBlock::iterator InsertBefore, AMDGPU::OpName OpName,
1371-
Register DestReg) const {
1371+
MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL,
1372+
AMDGPU::OpName OpName, Register DestReg) const {
13721373
MachineBasicBlock *MBB = CI.I->getParent();
1373-
DebugLoc DL = CI.I->getDebugLoc();
13741374

13751375
auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
13761376

@@ -1398,9 +1398,9 @@ void SILoadStoreOptimizer::copyToDestRegs(
13981398
Register
13991399
SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
14001400
MachineBasicBlock::iterator InsertBefore,
1401+
const DebugLoc &DL,
14011402
AMDGPU::OpName OpName) const {
14021403
MachineBasicBlock *MBB = CI.I->getParent();
1403-
DebugLoc DL = CI.I->getDebugLoc();
14041404

14051405
auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
14061406

@@ -1456,7 +1456,8 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
14561456
const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
14571457
Register DestReg = MRI->createVirtualRegister(SuperRC);
14581458

1459-
DebugLoc DL = CI.I->getDebugLoc();
1459+
DebugLoc DL =
1460+
DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
14601461

14611462
Register BaseReg = AddrReg->getReg();
14621463
unsigned BaseSubReg = AddrReg->getSubReg();
@@ -1484,7 +1485,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
14841485
.addImm(0) // gds
14851486
.cloneMergedMemRefs({&*CI.I, &*Paired.I});
14861487

1487-
copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1488+
copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg);
14881489

14891490
CI.I->eraseFromParent();
14901491
Paired.I->eraseFromParent();
@@ -1541,7 +1542,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
15411542
(NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
15421543

15431544
const MCInstrDesc &Write2Desc = TII->get(Opc);
1544-
DebugLoc DL = CI.I->getDebugLoc();
1545+
DebugLoc DL =
1546+
DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
15451547

15461548
Register BaseReg = AddrReg->getReg();
15471549
unsigned BaseSubReg = AddrReg->getSubReg();
@@ -1582,7 +1584,9 @@ MachineBasicBlock::iterator
15821584
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
15831585
MachineBasicBlock::iterator InsertBefore) {
15841586
MachineBasicBlock *MBB = CI.I->getParent();
1585-
DebugLoc DL = CI.I->getDebugLoc();
1587+
DebugLoc DL =
1588+
DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
1589+
15861590
const unsigned Opcode = getNewOpcode(CI, Paired);
15871591

15881592
const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
@@ -1607,7 +1611,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
16071611

16081612
MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
16091613

1610-
copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1614+
copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);
16111615

16121616
CI.I->eraseFromParent();
16131617
Paired.I->eraseFromParent();
@@ -1618,7 +1622,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
16181622
CombineInfo &CI, CombineInfo &Paired,
16191623
MachineBasicBlock::iterator InsertBefore) {
16201624
MachineBasicBlock *MBB = CI.I->getParent();
1621-
DebugLoc DL = CI.I->getDebugLoc();
1625+
DebugLoc DL =
1626+
DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
1627+
16221628
const unsigned Opcode = getNewOpcode(CI, Paired);
16231629

16241630
const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
@@ -1639,7 +1645,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
16391645
New.addImm(MergedOffset);
16401646
New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
16411647

1642-
copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);
1648+
copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::sdst, DestReg);
16431649

16441650
CI.I->eraseFromParent();
16451651
Paired.I->eraseFromParent();
@@ -1650,7 +1656,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
16501656
CombineInfo &CI, CombineInfo &Paired,
16511657
MachineBasicBlock::iterator InsertBefore) {
16521658
MachineBasicBlock *MBB = CI.I->getParent();
1653-
DebugLoc DL = CI.I->getDebugLoc();
1659+
1660+
DebugLoc DL =
1661+
DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
16541662

16551663
const unsigned Opcode = getNewOpcode(CI, Paired);
16561664

@@ -1680,7 +1688,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
16801688
.addImm(0) // swz
16811689
.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
16821690

1683-
copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1691+
copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);
16841692

16851693
CI.I->eraseFromParent();
16861694
Paired.I->eraseFromParent();
@@ -1691,7 +1699,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
16911699
CombineInfo &CI, CombineInfo &Paired,
16921700
MachineBasicBlock::iterator InsertBefore) {
16931701
MachineBasicBlock *MBB = CI.I->getParent();
1694-
DebugLoc DL = CI.I->getDebugLoc();
1702+
1703+
DebugLoc DL =
1704+
DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
16951705

16961706
const unsigned Opcode = getNewOpcode(CI, Paired);
16971707

@@ -1731,7 +1741,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
17311741
.addImm(0) // swz
17321742
.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
17331743

1734-
copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1744+
copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);
17351745

17361746
CI.I->eraseFromParent();
17371747
Paired.I->eraseFromParent();
@@ -1742,12 +1752,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
17421752
CombineInfo &CI, CombineInfo &Paired,
17431753
MachineBasicBlock::iterator InsertBefore) {
17441754
MachineBasicBlock *MBB = CI.I->getParent();
1745-
DebugLoc DL = CI.I->getDebugLoc();
1755+
DebugLoc DL =
1756+
DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
17461757

17471758
const unsigned Opcode = getNewOpcode(CI, Paired);
17481759

17491760
Register SrcReg =
1750-
copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1761+
copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);
17511762

17521763
auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
17531764
.addReg(SrcReg, RegState::Kill);
@@ -1789,7 +1800,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
17891800
CombineInfo &CI, CombineInfo &Paired,
17901801
MachineBasicBlock::iterator InsertBefore) {
17911802
MachineBasicBlock *MBB = CI.I->getParent();
1792-
DebugLoc DL = CI.I->getDebugLoc();
1803+
1804+
DebugLoc DL =
1805+
DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
17931806

17941807
const unsigned Opcode = getNewOpcode(CI, Paired);
17951808

@@ -1807,7 +1820,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
18071820
.addImm(CI.CPol)
18081821
.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
18091822

1810-
copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1823+
copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg);
18111824

18121825
CI.I->eraseFromParent();
18131826
Paired.I->eraseFromParent();
@@ -1818,12 +1831,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
18181831
CombineInfo &CI, CombineInfo &Paired,
18191832
MachineBasicBlock::iterator InsertBefore) {
18201833
MachineBasicBlock *MBB = CI.I->getParent();
1821-
DebugLoc DL = CI.I->getDebugLoc();
1834+
1835+
DebugLoc DL =
1836+
DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
18221837

18231838
const unsigned Opcode = getNewOpcode(CI, Paired);
18241839

18251840
Register SrcReg =
1826-
copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1841+
copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);
18271842

18281843
auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
18291844
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
@@ -2094,12 +2109,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
20942109
CombineInfo &CI, CombineInfo &Paired,
20952110
MachineBasicBlock::iterator InsertBefore) {
20962111
MachineBasicBlock *MBB = CI.I->getParent();
2097-
DebugLoc DL = CI.I->getDebugLoc();
2112+
DebugLoc DL =
2113+
DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
20982114

20992115
const unsigned Opcode = getNewOpcode(CI, Paired);
21002116

21012117
Register SrcReg =
2102-
copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
2118+
copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);
21032119

21042120
auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
21052121
.addReg(SrcReg, RegState::Kill);
Lines changed: 89 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,89 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2+
; RUN: opt -passes=debugify < %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck %s
3+
4+
@lds = addrspace(3) global [512 x float] poison, align 4
5+
6+
define amdgpu_kernel void @simple_write2_one_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
7+
; CHECK-LABEL: simple_write2_one_val_f32:
8+
; CHECK: .Lfunc_begin0:
9+
; CHECK-NEXT: .cfi_sections .debug_frame
10+
; CHECK-NEXT: .cfi_startproc
11+
; CHECK-NEXT: ; %bb.0:
12+
; CHECK-NEXT: .file 1 "/" "<stdin>"
13+
; CHECK-NEXT: .loc 1 1 1 prologue_end ; <stdin>:1:1
14+
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
15+
; CHECK-NEXT: .Ltmp0:
16+
; CHECK-NEXT: ;DEBUG_VALUE: simple_write2_one_val_f32:1 <- $vgpr0
17+
; CHECK-NEXT: ;DEBUG_VALUE: simple_write2_one_val_f32:5 <- [DW_OP_plus_uconst 8, DW_OP_stack_value] $vgpr0
18+
; CHECK-NEXT: ;DEBUG_VALUE: simple_write2_one_val_f32:3 <- undef
19+
; CHECK-NEXT: .loc 1 2 1 ; <stdin>:2:1
20+
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
21+
; CHECK-NEXT: .Ltmp1:
22+
; CHECK-NEXT: ;DEBUG_VALUE: simple_write2_one_val_f32:4 <- $vgpr0
23+
; CHECK-NEXT: ;DEBUG_VALUE: simple_write2_one_val_f32:2 <- undef
24+
; CHECK-NEXT: .loc 1 3 1 ; <stdin>:3:1
25+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
26+
; CHECK-NEXT: global_load_dword v1, v0, s[0:1]
27+
; CHECK-NEXT: .Ltmp2:
28+
; CHECK-NEXT: ;DEBUG_VALUE: simple_write2_one_val_f32:6 <- [DW_OP_plus_uconst 32, DW_OP_stack_value] $vgpr0
29+
; CHECK-NEXT: .loc 1 0 0 is_stmt 0 ; <stdin>:0
30+
; CHECK-NEXT: s_waitcnt vmcnt(0)
31+
; CHECK-NEXT: ds_write2_b32 v0, v1, v1 offset1:8
32+
; CHECK-NEXT: .loc 1 9 1 is_stmt 1 ; <stdin>:9:1
33+
; CHECK-NEXT: s_endpgm
34+
; CHECK-NEXT: .Ltmp3:
35+
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
36+
%in.gep = getelementptr float, ptr addrspace(1) %in, i32 %x.i
37+
%val = load float, ptr addrspace(1) %in.gep, align 4
38+
%arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
39+
store float %val, ptr addrspace(3) %arrayidx0, align 4
40+
%add.x = add nsw i32 %x.i, 8
41+
%arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
42+
store float %val, ptr addrspace(3) %arrayidx1, align 4
43+
ret void
44+
}
45+
46+
define amdgpu_kernel void @simple_read2_f32(ptr addrspace(1) %out) #0 {
47+
; CHECK-LABEL: simple_read2_f32:
48+
; CHECK: .Lfunc_begin1:
49+
; CHECK-NEXT: .cfi_startproc
50+
; CHECK-NEXT: ; %bb.0:
51+
; CHECK-NEXT: .loc 1 11 1 prologue_end ; <stdin>:11:1
52+
; CHECK-NEXT: v_lshlrev_b32_e32 v2, 2, v0
53+
; CHECK-NEXT: .Ltmp4:
54+
; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:8 <- $vgpr2
55+
; CHECK-NEXT: .loc 1 0 0 is_stmt 0 ; <stdin>:0
56+
; CHECK-NEXT: ds_read2_b32 v[0:1], v2 offset1:8
57+
; CHECK-NEXT: .Ltmp5:
58+
; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:9 <- undef
59+
; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:11 <- [DW_OP_plus_uconst 32, DW_OP_stack_value] $vgpr2
60+
; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:12 <- undef
61+
; CHECK-NEXT: .loc 1 10 1 is_stmt 1 ; <stdin>:10:1
62+
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
63+
; CHECK-NEXT: .Ltmp6:
64+
; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:7 <- undef
65+
; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:10 <- [DW_OP_plus_uconst 8, DW_OP_stack_value] undef
66+
; CHECK-NEXT: .loc 1 16 1 ; <stdin>:16:1
67+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
68+
; CHECK-NEXT: v_add_f32_e32 v0, v0, v1
69+
; CHECK-NEXT: .Ltmp7:
70+
; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:13 <- $vgpr0
71+
; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:14 <- undef
72+
; CHECK-NEXT: .loc 1 18 1 ; <stdin>:18:1
73+
; CHECK-NEXT: global_store_dword v2, v0, s[0:1]
74+
; CHECK-NEXT: .loc 1 19 1 ; <stdin>:19:1
75+
; CHECK-NEXT: s_endpgm
76+
; CHECK-NEXT: .Ltmp8:
77+
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
78+
%arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
79+
%val0 = load float, ptr addrspace(3) %arrayidx0, align 4
80+
%add.x = add nsw i32 %x.i, 8
81+
%arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
82+
%val1 = load float, ptr addrspace(3) %arrayidx1, align 4
83+
%sum = fadd float %val0, %val1
84+
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
85+
store float %sum, ptr addrspace(1) %out.gep, align 4
86+
ret void
87+
}
88+
89+
attributes #0 = { nounwind }

0 commit comments

Comments
 (0)