Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 42 additions & 5 deletions llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1411,11 +1411,48 @@ SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);

BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
.add(*Src0)
.addImm(SubRegIdx0)
.add(*Src1)
.addImm(SubRegIdx1);
// Make sure the generated REG_SEQUENCE has sensibly aligned registers.
const TargetRegisterClass *Src0RC = TRI->findCommonRegClass(
MRI->getRegClass(Src0->getReg()), Src0->getSubReg(), SuperRC, SubRegIdx0);
const TargetRegisterClass *Src1RC = TRI->findCommonRegClass(
MRI->getRegClass(Src1->getReg()), Src1->getSubReg(), SuperRC, SubRegIdx1);
if (!Src0RC || !Src1RC) {
unsigned SuperRCWSize = TRI->getRegSizeInBits(*SuperRC) / 32;
unsigned Src1WSizeOffset = CI.Width;

auto BMI =
BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg);

unsigned Src0SubReg = Src0->getSubReg();
unsigned Src1SubReg = Src1->getSubReg();
unsigned It = 0;
for (; It < Src1WSizeOffset; ++It) {
unsigned ChOffset =
Src0SubReg ? TRI->getChannelFromSubReg(Src0SubReg) : 0;
unsigned NewSubReg = Src0SubReg ? TRI->getSubRegFromChannel(ChOffset + It)
: CI.Width == 1 ? 0
: TRI->getSubRegFromChannel(It);
BMI.addUse(Src0->getReg(), /*Flags=*/0U, NewSubReg)
.addImm(TRI->getSubRegFromChannel(It));
}
for (; It < SuperRCWSize; ++It) {
unsigned ChOffset =
Src1SubReg ? TRI->getChannelFromSubReg(Src1SubReg) : 0;
unsigned NewSubReg = Src1SubReg ? TRI->getSubRegFromChannel(ChOffset + It)
: Paired.Width == 1
? 0
: TRI->getSubRegFromChannel(It - CI.Width);
BMI.addUse(Src1->getReg(), /*Flags=*/0U, NewSubReg)
.addImm(TRI->getSubRegFromChannel(It));
}

} else {
BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
.add(*Src0)
.addImm(SubRegIdx0)
.add(*Src1)
.addImm(SubRegIdx1);
}

return SrcReg;
}
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/merge-flat-load-store.mir
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,7 @@ body: |
; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
; GCN-NEXT: [[DEF1:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:areg_64_align2 = REG_SEQUENCE killed [[DEF1]], %subreg.sub0, killed [[DEF2]], %subreg.sub1
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:areg_64_align2 = REG_SEQUENCE [[DEF1]], %subreg.sub0, [[DEF2]], %subreg.sub1
; GCN-NEXT: FLAT_STORE_DWORDX2 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `ptr poison`, align 4)
%0:vreg_64_align2 = IMPLICIT_DEF
%1:agpr_32 = IMPLICIT_DEF
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ body: |
; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF2:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[DEF1]], %subreg.sub0, [[DEF2]], %subreg.sub1_sub2
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[DEF1]], %subreg.sub0, [[DEF2]].sub0, %subreg.sub1, [[DEF2]].sub1, %subreg.sub2
; GCN-NEXT: FLAT_STORE_DWORDX3 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `ptr poison`, align 4)
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
Expand All @@ -230,7 +230,7 @@ body: |
; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF2:%[0-9]+]]:vreg_96_align2 = IMPLICIT_DEF
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[DEF1]], %subreg.sub0, [[DEF2]], %subreg.sub1_sub2_sub3
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[DEF1]], %subreg.sub0, [[DEF2]].sub0, %subreg.sub1, [[DEF2]].sub1, %subreg.sub2, [[DEF2]].sub2, %subreg.sub3
; GCN-NEXT: FLAT_STORE_DWORDX4 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 4)
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir
Original file line number Diff line number Diff line change
Expand Up @@ -605,7 +605,7 @@ body: |
; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
; GCN-NEXT: [[DEF1:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:areg_64_align2 = REG_SEQUENCE killed [[DEF1]], %subreg.sub0, killed [[DEF2]], %subreg.sub1
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:areg_64_align2 = REG_SEQUENCE [[DEF1]], %subreg.sub0, [[DEF2]], %subreg.sub1
; GCN-NEXT: GLOBAL_STORE_DWORDX2 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 4, addrspace 1)
%0:vreg_64_align2 = IMPLICIT_DEF
%1:agpr_32 = IMPLICIT_DEF
Expand Down
29 changes: 29 additions & 0 deletions llvm/test/CodeGen/AMDGPU/siloadstoreopt-misalign.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -verify-machineinstrs -start-before=si-load-store-opt %s -o - | FileCheck %s
#
# Test for si-load-store-opt on gfx942. The MIR input below contains a
# FLAT_STORE_DWORD and a FLAT_STORE_DWORDX2 that share the same address
# register (%10) and carry the immediates 0 and 4 (presumably the byte
# offsets -- confirm against the instruction definitions). The expected
# assembly has a single flat_store_dwordx3 whose data operand is v[2:4]:
# v2 is set to 0 and v3/v4 are copied from v0/v1 with 32-bit moves.
# NOTE(review): judging by the test name, the point is that the merged 96-bit
# data register built from a vgpr_32 and a vreg_64_align2 value must not end
# up misaligned; the machine verifier is enabled on the llc invocation above.

# CHECK-LABEL: misaligned_vgpr:
# CHECK: ; %bb.0:
# CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
# CHECK: v_mov_b64_e32 v[0:1], 0
# CHECK: v_mov_b32_e32 v2, 0
# CHECK: v_mov_b32_e32 v3, v0
# CHECK: v_mov_b32_e32 v4, v1
# CHECK: flat_store_dwordx3 v[0:1], v[2:4]
# CHECK: s_endpgm

--- |
define void @misaligned_vgpr() { ret void }
...

# MIR for the function above: one 32-bit store followed by one 64-bit store
# through the same address register %10.
---
name: misaligned_vgpr
tracksRegLiveness: true
body: |
bb.0:
%10:vreg_64_align2 = IMPLICIT_DEF
%11:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
FLAT_STORE_DWORD %10:vreg_64_align2, %11:vgpr_32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`)
%14:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
FLAT_STORE_DWORDX2 %10:vreg_64_align2, killed %14:vreg_64_align2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `ptr addrspace(1) undef`, align 4)
S_ENDPGM 0

---