diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index f0d1117664983..36b22e457da10 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -1411,11 +1411,48 @@ SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
   const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
   const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);
 
-  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
-      .add(*Src0)
-      .addImm(SubRegIdx0)
-      .add(*Src1)
-      .addImm(SubRegIdx1);
+  // Make sure the generated REG_SEQUENCE has sensibly aligned registers.
+  const TargetRegisterClass *Src0RC = TRI->findCommonRegClass(
+      MRI->getRegClass(Src0->getReg()), Src0->getSubReg(), SuperRC, SubRegIdx0);
+  const TargetRegisterClass *Src1RC = TRI->findCommonRegClass(
+      MRI->getRegClass(Src1->getReg()), Src1->getSubReg(), SuperRC, SubRegIdx1);
+  if (!Src0RC || !Src1RC) {
+    unsigned SuperRCWSize = TRI->getRegSizeInBits(*SuperRC) / 32;
+    unsigned Src1WSizeOffset = CI.Width;
+
+    auto BMI =
+        BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg);
+
+    unsigned Src0SubReg = Src0->getSubReg();
+    unsigned Src1SubReg = Src1->getSubReg();
+    unsigned It = 0;
+    for (; It < Src1WSizeOffset; ++It) {
+      unsigned ChOffset =
+          Src0SubReg ? TRI->getChannelFromSubReg(Src0SubReg) : 0;
+      unsigned NewSubReg = Src0SubReg ? TRI->getSubRegFromChannel(ChOffset + It)
+                           : CI.Width == 1 ? 0
+                                           : TRI->getSubRegFromChannel(It);
+      BMI.addUse(Src0->getReg(), /*Flags=*/0U, NewSubReg)
+          .addImm(TRI->getSubRegFromChannel(It));
+    }
+    for (; It < SuperRCWSize; ++It) {
+      unsigned ChOffset =
+          Src1SubReg ? TRI->getChannelFromSubReg(Src1SubReg) : 0;
+      unsigned NewSubReg = Src1SubReg ? TRI->getSubRegFromChannel(ChOffset + It)
+                           : Paired.Width == 1
+                               ? 0
+                               : TRI->getSubRegFromChannel(It - CI.Width);
+      BMI.addUse(Src1->getReg(), /*Flags=*/0U, NewSubReg)
+          .addImm(TRI->getSubRegFromChannel(It));
+    }
+
+  } else {
+    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
+        .add(*Src0)
+        .addImm(SubRegIdx0)
+        .add(*Src1)
+        .addImm(SubRegIdx1);
+  }
 
   return SrcReg;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/merge-flat-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-flat-load-store.mir
index 09aae9152c4ee..4add3cb294132 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-flat-load-store.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-flat-load-store.mir
@@ -399,7 +399,7 @@ body: |
     ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
     ; GCN-NEXT: [[DEF1:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
     ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:areg_64_align2 = REG_SEQUENCE killed [[DEF1]], %subreg.sub0, killed [[DEF2]], %subreg.sub1
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:areg_64_align2 = REG_SEQUENCE [[DEF1]], %subreg.sub0, [[DEF2]], %subreg.sub1
     ; GCN-NEXT: FLAT_STORE_DWORDX2 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `ptr poison`, align 4)
     %0:vreg_64_align2 = IMPLICIT_DEF
     %1:agpr_32 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/merge-flat-with-global-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-flat-with-global-load-store.mir
index a67cf22bdd1ce..b45b69010141e 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-flat-with-global-load-store.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-flat-with-global-load-store.mir
@@ -213,7 +213,7 @@ body: |
     ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
     ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; GCN-NEXT: [[DEF2:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[DEF1]], %subreg.sub0, [[DEF2]], %subreg.sub1_sub2
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[DEF1]], %subreg.sub0, [[DEF2]].sub0, %subreg.sub1, [[DEF2]].sub1, %subreg.sub2
     ; GCN-NEXT: FLAT_STORE_DWORDX3 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `ptr poison`, align 4)
     %0:vreg_64_align2 = IMPLICIT_DEF
     %1:vgpr_32 = IMPLICIT_DEF
@@ -230,7 +230,7 @@ body: |
     ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
     ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; GCN-NEXT: [[DEF2:%[0-9]+]]:vreg_96_align2 = IMPLICIT_DEF
-    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[DEF1]], %subreg.sub0, [[DEF2]], %subreg.sub1_sub2_sub3
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[DEF1]], %subreg.sub0, [[DEF2]].sub0, %subreg.sub1, [[DEF2]].sub1, %subreg.sub2, [[DEF2]].sub2, %subreg.sub3
     ; GCN-NEXT: FLAT_STORE_DWORDX4 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 4)
     %0:vreg_64_align2 = IMPLICIT_DEF
     %1:vgpr_32 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir
index 0817694295f86..a3de9f05d889b 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir
@@ -605,7 +605,7 @@ body: |
    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
    ; GCN-NEXT: [[DEF1:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
    ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-   ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:areg_64_align2 = REG_SEQUENCE killed [[DEF1]], %subreg.sub0, killed [[DEF2]], %subreg.sub1
+   ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:areg_64_align2 = REG_SEQUENCE [[DEF1]], %subreg.sub0, [[DEF2]], %subreg.sub1
    ; GCN-NEXT: GLOBAL_STORE_DWORDX2 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 4, addrspace 1)
    %0:vreg_64_align2 = IMPLICIT_DEF
    %1:agpr_32 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/siloadstoreopt-misalign.mir b/llvm/test/CodeGen/AMDGPU/siloadstoreopt-misalign.mir
new file mode 100644
index 0000000000000..4fb0320f1e253
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/siloadstoreopt-misalign.mir
@@ -0,0 +1,29 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -verify-machineinstrs -start-before=si-load-store-opt %s -o - | FileCheck %s
+
+# CHECK-LABEL: misaligned_vgpr:
+# CHECK: ; %bb.0:
+# CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+# CHECK: v_mov_b64_e32 v[0:1], 0
+# CHECK: v_mov_b32_e32 v2, 0
+# CHECK: v_mov_b32_e32 v3, v0
+# CHECK: v_mov_b32_e32 v4, v1
+# CHECK: flat_store_dwordx3 v[0:1], v[2:4]
+# CHECK: s_endpgm
+
+--- |
+  define void @misaligned_vgpr() { ret void }
+...
+
+---
+name: misaligned_vgpr
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %10:vreg_64_align2 = IMPLICIT_DEF
+    %11:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    FLAT_STORE_DWORD %10:vreg_64_align2, %11:vgpr_32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`)
+    %14:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
+    FLAT_STORE_DWORDX2 %10:vreg_64_align2, killed %14:vreg_64_align2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `ptr addrspace(1) undef`, align 4)
+    S_ENDPGM 0
+
+---
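
Note on the fallback path above: when findCommonRegClass rejects either source for the natural wide subregister indices (for example an agpr_32/vgpr_32 mix, or a multi-register subregister that cannot satisfy an _align2 class), the REG_SEQUENCE is rebuilt one 32-bit channel at a time, so only single-channel subregister indices are needed on the sources. The following is a minimal standalone C++ sketch of the channel arithmetic the two loops perform; it is illustrative only (decomposeChannels, ChannelCopy, Width0/Width1, Base0/Base1 are invented names, not LLVM API, and the Width == 1 no-subregister special case is folded into Base = 0):

// Standalone sketch of the per-channel decomposition; not part of the patch.
#include <cstdio>
#include <vector>

struct ChannelCopy {
  int SrcOperand; // 0 = CI's store data, 1 = Paired's store data
  int SrcChannel; // 32-bit channel read from that operand
  int DstChannel; // 32-bit channel written in the merged value
};

// Width0/Width1 play the role of CI.Width/Paired.Width (widths in 32-bit
// units); Base0/Base1 stand in for getChannelFromSubReg() of each source's
// subregister, or 0 when the operand uses its full register.
static std::vector<ChannelCopy> decomposeChannels(int Width0, int Width1,
                                                  int Base0, int Base1) {
  std::vector<ChannelCopy> Copies;
  int It = 0;
  // First loop of the patch: destination channels [0, Width0) come from Src0.
  for (; It < Width0; ++It)
    Copies.push_back({0, Base0 + It, It});
  // Second loop: destination channels [Width0, Width0 + Width1) come from
  // Src1, rebased so Src1's channel 0 lands at destination channel Width0.
  for (; It < Width0 + Width1; ++It)
    Copies.push_back({1, Base1 + (It - Width0), It});
  return Copies;
}

int main() {
  // Mirrors the new MIR test: a 32-bit store merged with a 64-bit store
  // into one 96-bit REG_SEQUENCE feeding flat_store_dwordx3.
  for (const ChannelCopy &C : decomposeChannels(1, 2, 0, 0))
    std::printf("dst.sub%d <- src%d.sub%d\n", C.DstChannel, C.SrcOperand,
                C.SrcChannel);
  return 0;
}

For the test's 1 + 2 channel merge this prints dst.sub0 <- src0.sub0, dst.sub1 <- src1.sub0, dst.sub2 <- src1.sub1, matching the vreg_96_align2 REG_SEQUENCE now checked in merge-flat-with-global-load-store.mir.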