Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 41 additions & 25 deletions llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -233,10 +233,11 @@ class SILoadStoreOptimizer {

void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore,
AMDGPU::OpName OpName, Register DestReg) const;
const DebugLoc &DL, AMDGPU::OpName OpName,
Register DestReg) const;
Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore,
AMDGPU::OpName OpName) const;
const DebugLoc &DL, AMDGPU::OpName OpName) const;

unsigned read2Opcode(unsigned EltSize) const;
unsigned read2ST64Opcode(unsigned EltSize) const;
Expand Down Expand Up @@ -1367,10 +1368,9 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
// Paired.
void SILoadStoreOptimizer::copyToDestRegs(
CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore, AMDGPU::OpName OpName,
Register DestReg) const {
MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL,
AMDGPU::OpName OpName, Register DestReg) const {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();

auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

Expand Down Expand Up @@ -1398,9 +1398,9 @@ void SILoadStoreOptimizer::copyToDestRegs(
Register
SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore,
const DebugLoc &DL,
AMDGPU::OpName OpName) const {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();

auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

Expand Down Expand Up @@ -1456,7 +1456,8 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
Register DestReg = MRI->createVirtualRegister(SuperRC);

DebugLoc DL = CI.I->getDebugLoc();
DebugLoc DL =
DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());

Register BaseReg = AddrReg->getReg();
unsigned BaseSubReg = AddrReg->getSubReg();
Expand Down Expand Up @@ -1484,7 +1485,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
.addImm(0) // gds
.cloneMergedMemRefs({&*CI.I, &*Paired.I});

copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg);

CI.I->eraseFromParent();
Paired.I->eraseFromParent();
Expand Down Expand Up @@ -1541,7 +1542,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
(NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

const MCInstrDesc &Write2Desc = TII->get(Opc);
DebugLoc DL = CI.I->getDebugLoc();
DebugLoc DL =
DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());

Register BaseReg = AddrReg->getReg();
unsigned BaseSubReg = AddrReg->getSubReg();
Expand Down Expand Up @@ -1582,7 +1584,9 @@ MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
DebugLoc DL =
DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());

const unsigned Opcode = getNewOpcode(CI, Paired);

const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
Expand All @@ -1607,7 +1611,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,

MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);

CI.I->eraseFromParent();
Paired.I->eraseFromParent();
Expand All @@ -1618,7 +1622,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
DebugLoc DL =
DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());

const unsigned Opcode = getNewOpcode(CI, Paired);

const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
Expand All @@ -1639,7 +1645,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
New.addImm(MergedOffset);
New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);
copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::sdst, DestReg);

CI.I->eraseFromParent();
Paired.I->eraseFromParent();
Expand All @@ -1650,7 +1656,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();

DebugLoc DL =
DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());

const unsigned Opcode = getNewOpcode(CI, Paired);

Expand Down Expand Up @@ -1680,7 +1688,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
.addImm(0) // swz
.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);

CI.I->eraseFromParent();
Paired.I->eraseFromParent();
Expand All @@ -1691,7 +1699,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();

DebugLoc DL =
DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());

const unsigned Opcode = getNewOpcode(CI, Paired);

Expand Down Expand Up @@ -1731,7 +1741,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
.addImm(0) // swz
.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);

CI.I->eraseFromParent();
Paired.I->eraseFromParent();
Expand All @@ -1742,12 +1752,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
DebugLoc DL =
DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());

const unsigned Opcode = getNewOpcode(CI, Paired);

Register SrcReg =
copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);

auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
.addReg(SrcReg, RegState::Kill);
Expand Down Expand Up @@ -1789,7 +1800,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();

DebugLoc DL =
DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());

const unsigned Opcode = getNewOpcode(CI, Paired);

Expand All @@ -1807,7 +1820,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
.addImm(CI.CPol)
.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg);

CI.I->eraseFromParent();
Paired.I->eraseFromParent();
Expand All @@ -1818,12 +1831,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();

DebugLoc DL =
DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());

const unsigned Opcode = getNewOpcode(CI, Paired);

Register SrcReg =
copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);

auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
Expand Down Expand Up @@ -2094,12 +2109,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
CombineInfo &CI, CombineInfo &Paired,
MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
DebugLoc DL =
DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());

const unsigned Opcode = getNewOpcode(CI, Paired);

Register SrcReg =
copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);

auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
.addReg(SrcReg, RegState::Kill);
Expand Down
89 changes: 89 additions & 0 deletions llvm/test/CodeGen/AMDGPU/ds-read2-write2-debug-info.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: opt -passes=debugify < %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck %s

@lds = addrspace(3) global [512 x float] poison, align 4

; Loads one float from %in at index workitem.id.x and stores that same value
; twice into @lds: once at element %x.i and once at element %x.i + 8.
; The expected output below contains a single ds_write2_b32 (offset1:8) for
; the two stores, and that instruction is emitted under `.loc 1 0 0` rather
; than under the line of either individual store.
; NOTE(review): line 0 is presumably what merging the two stores' distinct
; debugify-assigned locations produces -- confirm against
; DebugLoc::getMergedLocation.
define amdgpu_kernel void @simple_write2_one_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
; CHECK-LABEL: simple_write2_one_val_f32:
; CHECK: .Lfunc_begin0:
; CHECK-NEXT: .cfi_sections .debug_frame
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: ; %bb.0:
; CHECK-NEXT: .file 1 "/" "<stdin>"
; CHECK-NEXT: .loc 1 1 1 prologue_end ; <stdin>:1:1
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; CHECK-NEXT: .Ltmp0:
; CHECK-NEXT: ;DEBUG_VALUE: simple_write2_one_val_f32:1 <- $vgpr0
; CHECK-NEXT: ;DEBUG_VALUE: simple_write2_one_val_f32:5 <- [DW_OP_plus_uconst 8, DW_OP_stack_value] $vgpr0
; CHECK-NEXT: ;DEBUG_VALUE: simple_write2_one_val_f32:3 <- undef
; CHECK-NEXT: .loc 1 2 1 ; <stdin>:2:1
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: .Ltmp1:
; CHECK-NEXT: ;DEBUG_VALUE: simple_write2_one_val_f32:4 <- $vgpr0
; CHECK-NEXT: ;DEBUG_VALUE: simple_write2_one_val_f32:2 <- undef
; CHECK-NEXT: .loc 1 3 1 ; <stdin>:3:1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: global_load_dword v1, v0, s[0:1]
; CHECK-NEXT: .Ltmp2:
; CHECK-NEXT: ;DEBUG_VALUE: simple_write2_one_val_f32:6 <- [DW_OP_plus_uconst 32, DW_OP_stack_value] $vgpr0
; CHECK-NEXT: .loc 1 0 0 is_stmt 0 ; <stdin>:0
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ds_write2_b32 v0, v1, v1 offset1:8
; CHECK-NEXT: .loc 1 9 1 is_stmt 1 ; <stdin>:9:1
; CHECK-NEXT: s_endpgm
; CHECK-NEXT: .Ltmp3:
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr float, ptr addrspace(1) %in, i32 %x.i
%val = load float, ptr addrspace(1) %in.gep, align 4
%arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
store float %val, ptr addrspace(3) %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 8
%arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
store float %val, ptr addrspace(3) %arrayidx1, align 4
ret void
}

; Loads two floats from @lds, at element %x.i (workitem.id.x) and at element
; %x.i + 8, adds them, and stores the sum to %out at index %x.i.
; The expected output below contains a single ds_read2_b32 (offset1:8) whose
; two result registers feed v_add_f32, and that ds_read2_b32 is emitted under
; `.loc 1 0 0` rather than under the line of either individual load.
; NOTE(review): line 0 is presumably what merging the two loads' distinct
; debugify-assigned locations produces -- confirm against
; DebugLoc::getMergedLocation.
define amdgpu_kernel void @simple_read2_f32(ptr addrspace(1) %out) #0 {
; CHECK-LABEL: simple_read2_f32:
; CHECK: .Lfunc_begin1:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: ; %bb.0:
; CHECK-NEXT: .loc 1 11 1 prologue_end ; <stdin>:11:1
; CHECK-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; CHECK-NEXT: .Ltmp4:
; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:8 <- $vgpr2
; CHECK-NEXT: .loc 1 0 0 is_stmt 0 ; <stdin>:0
; CHECK-NEXT: ds_read2_b32 v[0:1], v2 offset1:8
; CHECK-NEXT: .Ltmp5:
; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:9 <- undef
; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:11 <- [DW_OP_plus_uconst 32, DW_OP_stack_value] $vgpr2
; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:12 <- undef
; CHECK-NEXT: .loc 1 10 1 is_stmt 1 ; <stdin>:10:1
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-NEXT: .Ltmp6:
; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:7 <- undef
; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:10 <- [DW_OP_plus_uconst 8, DW_OP_stack_value] undef
; CHECK-NEXT: .loc 1 16 1 ; <stdin>:16:1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_add_f32_e32 v0, v0, v1
; CHECK-NEXT: .Ltmp7:
; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:13 <- $vgpr0
; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:14 <- undef
; CHECK-NEXT: .loc 1 18 1 ; <stdin>:18:1
; CHECK-NEXT: global_store_dword v2, v0, s[0:1]
; CHECK-NEXT: .loc 1 19 1 ; <stdin>:19:1
; CHECK-NEXT: s_endpgm
; CHECK-NEXT: .Ltmp8:
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
%val0 = load float, ptr addrspace(3) %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 8
%arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
%val1 = load float, ptr addrspace(3) %arrayidx1, align 4
%sum = fadd float %val0, %val1
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
store float %sum, ptr addrspace(1) %out.gep, align 4
ret void
}

attributes #0 = { nounwind }
Loading