diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 1e2c77b08b9a6..c912a580854c1 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -194,6 +194,26 @@ bool SIFoldOperandsImpl::frameIndexMayFold( return false; const unsigned Opc = UseMI.getOpcode(); + switch (Opc) { + case AMDGPU::S_ADD_I32: + case AMDGPU::V_ADD_U32_e32: + case AMDGPU::V_ADD_CO_U32_e32: + // V_ADD_U32_e64 shares the e32 (dst, src0, src1) order; its trailing + // immediate (presumably clamp) must not be taken for the other source. + case AMDGPU::V_ADD_U32_e64: + // TODO: Possibly relax hasOneUse. It matters more for mubuf, since we have + // to insert the wave size shift at every point we use the index. + // TODO: Fix depending on visit order to fold immediates into the operand + return UseMI.getOperand(OpNo == 1 ? 2 : 1).isImm() && + MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg()); + case AMDGPU::V_ADD_CO_U32_e64: + // The carry-out def shifts the two sources to operands 2 and 3. + return UseMI.getOperand(OpNo == 2 ? 3 : 2).isImm() && + MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg()); + default: + break; + } + if (TII->isMUBUF(UseMI)) return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); if (!TII->isFLATScratch(UseMI)) diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index 3e4b43d9cfcd3..c5d4ef23070eb 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -4705,8 +4705,7 @@ define amdgpu_ps void @large_offset() { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_movk_i32 s0, 0x810 -; GFX10-NEXT: s_addk_i32 s0, 0x3c0 +; GFX10-NEXT: s_movk_i32 s0, 0xbd0 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -4823,8 +4822,7 @@ define amdgpu_ps void @large_offset() { ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX10-PAL-NEXT: v_mov_b32_e32 
v0, 0 -; GFX10-PAL-NEXT: s_movk_i32 s0, 0x810 -; GFX10-PAL-NEXT: s_addk_i32 s0, 0x3c0 +; GFX10-PAL-NEXT: s_movk_i32 s0, 0xbd0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v3, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fold-fi-operand-shrink.mir b/llvm/test/CodeGen/AMDGPU/fold-fi-operand-shrink.mir index 2b5ec86244ec2..8626ac0f23ec7 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-fi-operand-shrink.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-fi-operand-shrink.mir @@ -183,8 +183,7 @@ body: | bb.0: ; GCN-LABEL: name: shrink_vgpr_imm_vgpr_fi_v_add_i32_e64_no_carry_out_use - ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; GCN-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 16, [[V_MOV_B32_e32_]], 0, implicit $exec + ; GCN: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_]] %0:vgpr_32 = V_MOV_B32_e32 16, implicit $exec %1:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.gfx10.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.gfx10.mir index 0d6511cbfceb2..d10dec6ca8289 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.gfx10.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.gfx10.mir @@ -13,8 +13,7 @@ stack: body: | bb.0: ; CHECK-LABEL: name: fold_frame_index__v_add_u32_e32__const_v_fi - ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, [[V_MOV_B32_e32_]], implicit $exec + ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, %stack.0, implicit $exec ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_U32_e32_]] ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec 
@@ -34,8 +33,7 @@ stack: body: | bb.0: ; CHECK-LABEL: name: fold_frame_index__v_add_co_u32_e64__v_fi_const - ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32 = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], 128, 0, implicit $exec + ; CHECK: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32 = V_ADD_CO_U32_e64 %stack.0, 128, 0, implicit $exec ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]] ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec @@ -57,8 +55,7 @@ stack: body: | bb.0: ; CHECK-LABEL: name: fold_frame_index__v_add_u32_e64__const_v_fi - ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 128, [[V_MOV_B32_e32_]], 0, implicit $exec + ; CHECK: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 128, %stack.0, 0, implicit $exec ; CHECK-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]] ; CHECK-NEXT: SI_RETURN implicit $sgpr4 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec @@ -78,8 +75,7 @@ stack: body: | bb.0: ; CHECK-LABEL: name: fold_frame_index__v_add_u32_e64___v_fi_const - ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], 128, 0, implicit $exec + ; CHECK: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 128, 0, implicit $exec ; CHECK-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]] ; CHECK-NEXT: SI_RETURN implicit $sgpr4 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec @@ -99,8 +95,7 @@ stack: body: | bb.0: ; CHECK-LABEL: name: fold_frame_index__v_add_co_u32_e64___fi_const_v - ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32 = 
V_ADD_CO_U32_e64 128, [[V_MOV_B32_e32_]], 0, implicit $exec + ; CHECK: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32 = V_ADD_CO_U32_e64 128, %stack.0, 0, implicit $exec ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]] ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir index aa91a4f9f988f..280126a0d7cd2 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir @@ -14,8 +14,7 @@ stack: body: | bb.0: ; CHECK-LABEL: name: fold_frame_index__s_add_i32__fi_const - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0 - ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_MOV_B32_]], 128, implicit-def $scc + ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, 128, implicit-def $scc ; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_I32_]] ; CHECK-NEXT: SI_RETURN implicit $sgpr4 %0:sreg_32 = S_MOV_B32 %stack.0 @@ -35,8 +34,7 @@ stack: body: | bb.0: ; CHECK-LABEL: name: fold_frame_index__s_add_i32__const_fi - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0 - ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 128, [[S_MOV_B32_]], implicit-def $scc + ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 128, %stack.0, implicit-def $scc ; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_I32_]] ; CHECK-NEXT: SI_RETURN implicit $sgpr4 %0:sreg_32 = S_MOV_B32 %stack.0 @@ -56,8 +54,7 @@ stack: body: | bb.0: ; CHECK-LABEL: name: fold_frame_index__s_add_i32__materializedconst_fi - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0 - ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, [[S_MOV_B32_]], implicit-def $scc + ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc ; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_I32_]] ; CHECK-NEXT: SI_RETURN implicit $sgpr4 
%0:sreg_32 = S_MOV_B32 256 @@ -101,8 +98,7 @@ stack: body: | bb.0: ; CHECK-LABEL: name: fold_frame_index__s_add_i32__fi_materializedconst_1 - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0 - ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, [[S_MOV_B32_]], implicit-def $scc + ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc ; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_I32_]] ; CHECK-NEXT: SI_RETURN implicit $sgpr4 %0:sreg_32 = S_MOV_B32 256 @@ -173,8 +169,7 @@ stack: body: | bb.0: ; CHECK-LABEL: name: fold_frame_index__v_add_u32_e32__const_v_fi - ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, [[V_MOV_B32_e32_]], implicit $exec + ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, %stack.0, implicit $exec ; CHECK-NEXT: $sgpr4 = COPY [[V_ADD_U32_e32_]] ; CHECK-NEXT: SI_RETURN implicit $sgpr4 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec @@ -215,21 +210,10 @@ stack: - { id: 0, size: 16384, alignment: 4, local-offset: 0 } body: | bb.0: - ; GFX9-LABEL: name: fold_frame_index__v_add_u32_e64__imm_v_fi - ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 64, [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]] - ; GFX9-NEXT: SI_RETURN implicit $sgpr4 - ; - ; GFX10-LABEL: name: fold_frame_index__v_add_u32_e64__imm_v_fi - ; GFX10: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 64, %stack.0, 0, implicit $exec - ; GFX10-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: SI_RETURN implicit $sgpr4 - ; - ; GFX12-LABEL: name: fold_frame_index__v_add_u32_e64__imm_v_fi - ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 64, %stack.0, 0, implicit $exec - ; GFX12-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]] - ; GFX12-NEXT: SI_RETURN implicit $sgpr4 + ; CHECK-LABEL: 
name: fold_frame_index__v_add_u32_e64__imm_v_fi + ; CHECK: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 64, %stack.0, 0, implicit $exec + ; CHECK-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]] + ; CHECK-NEXT: SI_RETURN implicit $sgpr4 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec %1:vgpr_32 = V_ADD_U32_e64 64, %0, 0, implicit $exec $sgpr4 = COPY %1 @@ -246,21 +230,10 @@ stack: - { id: 0, size: 16384, alignment: 4, local-offset: 0 } body: | bb.0: - ; GFX9-LABEL: name: fold_frame_index__v_add_u32_e64___v_fi_imm - ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], 64, 0, implicit $exec - ; GFX9-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]] - ; GFX9-NEXT: SI_RETURN implicit $sgpr4 - ; - ; GFX10-LABEL: name: fold_frame_index__v_add_u32_e64___v_fi_imm - ; GFX10: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 64, 0, implicit $exec - ; GFX10-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: SI_RETURN implicit $sgpr4 - ; - ; GFX12-LABEL: name: fold_frame_index__v_add_u32_e64___v_fi_imm - ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 64, 0, implicit $exec - ; GFX12-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]] - ; GFX12-NEXT: SI_RETURN implicit $sgpr4 + ; CHECK-LABEL: name: fold_frame_index__v_add_u32_e64___v_fi_imm + ; CHECK: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 64, 0, implicit $exec + ; CHECK-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]] + ; CHECK-NEXT: SI_RETURN implicit $sgpr4 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec %1:vgpr_32 = V_ADD_U32_e64 %0, 64, 0, implicit $exec $sgpr4 = COPY %1 @@ -278,8 +251,7 @@ stack: body: | bb.0: ; CHECK-LABEL: name: fold_frame_index__v_add_co_u32_e32__const_v_fi - ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; CHECK-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 128, [[V_MOV_B32_e32_]], implicit-def $vcc, 
implicit $exec + ; CHECK: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 128, %stack.0, implicit-def $vcc, implicit $exec ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e32_]] ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec @@ -298,21 +270,10 @@ stack: - { id: 0, size: 16384, alignment: 4, local-offset: 0 } body: | bb.0: - ; GFX9-LABEL: name: fold_frame_index__v_add_co_u32_e64__v_fi_imm - ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], 64, 0, implicit $exec - ; GFX9-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]] - ; GFX9-NEXT: SI_RETURN implicit $vgpr0 - ; - ; GFX10-LABEL: name: fold_frame_index__v_add_co_u32_e64__v_fi_imm - ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 %stack.0, 64, 0, implicit $exec - ; GFX10-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]] - ; GFX10-NEXT: SI_RETURN implicit $vgpr0 - ; - ; GFX12-LABEL: name: fold_frame_index__v_add_co_u32_e64__v_fi_imm - ; GFX12: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 %stack.0, 64, 0, implicit $exec - ; GFX12-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]] - ; GFX12-NEXT: SI_RETURN implicit $vgpr0 + ; CHECK-LABEL: name: fold_frame_index__v_add_co_u32_e64__v_fi_imm + ; CHECK: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 %stack.0, 64, 0, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec %1:vgpr_32, %2:sreg_64 = V_ADD_CO_U32_e64 %0, 64, 0, implicit $exec $vgpr0 = COPY %1 @@ -329,21 +290,10 @@ stack: - { id: 0, size: 16384, alignment: 4, local-offset: 0 } body: | bb.0: - ; GFX9-LABEL: name: fold_frame_index__v_add_co_u32_e64__imm_v_fi - ; GFX9: 
[[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 64, [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]] - ; GFX9-NEXT: SI_RETURN implicit $vgpr0 - ; - ; GFX10-LABEL: name: fold_frame_index__v_add_co_u32_e64__imm_v_fi - ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 64, %stack.0, 0, implicit $exec - ; GFX10-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]] - ; GFX10-NEXT: SI_RETURN implicit $vgpr0 - ; - ; GFX12-LABEL: name: fold_frame_index__v_add_co_u32_e64__imm_v_fi - ; GFX12: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 64, %stack.0, 0, implicit $exec - ; GFX12-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]] - ; GFX12-NEXT: SI_RETURN implicit $vgpr0 + ; CHECK-LABEL: name: fold_frame_index__v_add_co_u32_e64__imm_v_fi + ; CHECK: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 64, %stack.0, 0, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec %1:vgpr_32, %2:sreg_64 = V_ADD_CO_U32_e64 64, %0, 0, implicit $exec $vgpr0 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll index 4215ae43345fd..e3cd8028422dd 100644 --- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -64,8 +64,8 @@ define void @func_mov_fi_i32_offset() #0 { ; GFX9-MUBUF: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 ; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 4, [[SCALED]] -; GFX9-FLATSCR: v_mov_b32_e32 [[ADD:v[0-9]+]], s32 -; GFX9-FLATSCR-NEXT: v_add_u32_e32 v0, 4, [[ADD]] +; FIXME: Should commute and shrink +; GFX9-FLATSCR: v_add_u32_e64 v0, 4, 
s32 ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 @@ -164,12 +164,12 @@ define void @void_func_byval_struct_i8_i32_ptr_value(ptr addrspace(5) byval({ i8 ; GFX9-FLATSCR: scratch_load_dword v{{[0-9]+}}, off, s32 offset:4 glc{{$}} ; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6 -; CI: v_add_i32_e32 [[GEP:v[0-9]+]], vcc, 4, [[SHIFT]] +; CI: v_add_i32_e64 [[GEP:v[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 4, [[SHIFT]] -; GFX9-MUBUF: v_lshrrev_b32_e64 [[SP:v[0-9]+]], 6, s32 -; GFX9-FLATSCR: v_mov_b32_e32 [[SP:v[0-9]+]], s32 +; GFX9-MUBUF: v_lshrrev_b32_e64 [[SP:v[0-9]+]], 6, s32 +; GFX9-MUBUF: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SP]] -; GFX9: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SP]] +; GFX9-FLATSCR: v_add_u32_e64 [[GEP:v[0-9]+]], 4, s32 ; GCN: ds_write_b32 v{{[0-9]+}}, [[GEP]] define void @void_func_byval_struct_i8_i32_ptr_nonentry_block(ptr addrspace(5) byval({ i8, i32 }) %arg0, i32 %arg2) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll index e86ef52e413b6..302b140e32f3a 100644 --- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll +++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll @@ -1426,17 +1426,16 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX10_1-NEXT: buffer_store_dword v2, off, s[0:3], s5 ; 4-byte Folded Spill ; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 +; GFX10_1-NEXT: v_lshrrev_b32_e64 v3, 5, s32 ; GFX10_1-NEXT: v_writelane_b32 v2, s59, 0 -; GFX10_1-NEXT: v_lshrrev_b32_e64 v1, 5, s32 +; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo -; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0 -; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 64, v1 +; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 0x442c, v3 +; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0 ; GFX10_1-NEXT: ;;#ASMSTART -; GFX10_1-NEXT: ; use alloca0 v1 +; 
GFX10_1-NEXT: ; use alloca0 v0 ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 0x3ec, v0 -; GFX10_1-NEXT: v_readfirstlane_b32 s59, v0 +; GFX10_1-NEXT: v_readfirstlane_b32 s59, v1 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s59, scc ; GFX10_1-NEXT: ;;#ASMEND @@ -1456,17 +1455,16 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800 ; GFX10_3-NEXT: buffer_store_dword v2, off, s[0:3], s5 ; 4-byte Folded Spill ; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 +; GFX10_3-NEXT: v_lshrrev_b32_e64 v3, 5, s32 ; GFX10_3-NEXT: v_writelane_b32 v2, s59, 0 -; GFX10_3-NEXT: v_lshrrev_b32_e64 v1, 5, s32 +; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo -; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0 -; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 64, v1 +; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 0x442c, v3 +; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0 ; GFX10_3-NEXT: ;;#ASMSTART -; GFX10_3-NEXT: ; use alloca0 v1 +; GFX10_3-NEXT: ; use alloca0 v0 ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 0x3ec, v0 -; GFX10_3-NEXT: v_readfirstlane_b32 s59, v0 +; GFX10_3-NEXT: v_readfirstlane_b32 s59, v1 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s59, scc ; GFX10_3-NEXT: ;;#ASMEND @@ -1485,19 +1483,16 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX11-NEXT: s_add_i32 s1, s32, 0x8040 ; GFX11-NEXT: scratch_store_b32 off, v2, s1 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s0, s32, 0x4040 -; GFX11-NEXT: v_writelane_b32 v2, s59, 0 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_add_i32 s0, s32, 64 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: v_writelane_b32 v2, s59, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s32 ; GFX11-NEXT: 
s_and_b32 s0, 0, exec_lo -; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x3ec, v0 ; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use alloca0 v1 +; GFX11-NEXT: ; use alloca0 v0 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s59, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x442c, v3 +; GFX11-NEXT: v_readfirstlane_b32 s59, v1 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59, scc ; GFX11-NEXT: ;;#ASMEND @@ -1520,17 +1515,15 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX12-NEXT: scratch_store_b32 off, v2, s32 offset:32768 ; 4-byte Folded Spill ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000 +; GFX12-NEXT: v_dual_mov_b32 v0, s32 :: v_dual_mov_b32 v3, s32 ; GFX12-NEXT: v_writelane_b32 v2, s59, 0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s32 ; GFX12-NEXT: s_and_b32 s0, 0, exec_lo ; GFX12-NEXT: ;;#ASMSTART -; GFX12-NEXT: ; use alloca0 v1 +; GFX12-NEXT: ; use alloca0 v0 ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_nc_u32_e32 v0, 0x3ec, v0 -; GFX12-NEXT: v_readfirstlane_b32 s59, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_nc_u32_e32 v1, 0x43ec, v3 +; GFX12-NEXT: v_readfirstlane_b32 s59, v1 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s59, scc ; GFX12-NEXT: ;;#ASMEND @@ -1550,10 +1543,8 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX8-NEXT: s_add_i32 s6, s32, 0x201000 ; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32 -; GFX8-NEXT: s_movk_i32 vcc_lo, 0x4040 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, vcc_lo, v0 -; 
GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x3ec, v0 +; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s32 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x442c, v1 ; GFX8-NEXT: v_writelane_b32 v2, s59, 0 ; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s32 ; GFX8-NEXT: v_readfirstlane_b32 s59, v0 diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll index e9cd94620a6b9..308411fa225da 100644 --- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll @@ -1582,12 +1582,10 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX7-NEXT: buffer_store_dword v15, v16, s[0:3], s32 offen offset:60 ; 4-byte Folded Spill ; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX7-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane -; GFX7-NEXT: v_lshr_b32_e64 v0, s32, 6 +; GFX7-NEXT: v_lshr_b32_e64 v1, s32, 6 ; GFX7-NEXT: v_writelane_b32 v22, vcc_lo, 0 ; GFX7-NEXT: v_writelane_b32 v22, vcc_hi, 1 -; GFX7-NEXT: s_movk_i32 vcc_lo, 0x4040 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, vcc_lo, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x200, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x4240, v1 ; GFX7-NEXT: v_writelane_b32 v23, s59, 27 ; GFX7-NEXT: v_readfirstlane_b32 s59, v0 ; GFX7-NEXT: s_and_b64 vcc, 0, exec @@ -1723,12 +1721,10 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], s32 offen offset:60 ; 4-byte Folded Spill ; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX8-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane -; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32 +; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s32 ; GFX8-NEXT: v_writelane_b32 v22, vcc_lo, 0 ; GFX8-NEXT: v_writelane_b32 v22, vcc_hi, 1 -; GFX8-NEXT: s_movk_i32 vcc_lo, 0x4040 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, vcc_lo, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x200, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 
0x4240, v1 ; GFX8-NEXT: v_writelane_b32 v23, s59, 27 ; GFX8-NEXT: v_readfirstlane_b32 s59, v0 ; GFX8-NEXT: s_and_b64 vcc, 0, exec @@ -1983,17 +1979,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 ; GFX10_1-NEXT: v_writelane_b32 v23, s30, 0 -; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_1-NEXT: v_lshrrev_b32_e64 v1, 5, s32 +; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_1-NEXT: v_writelane_b32 v23, s31, 1 -; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0 -; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 64, v1 +; GFX10_1-NEXT: v_add_nc_u32_e32 v22, 0x4240, v1 +; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0 ; GFX10_1-NEXT: ;;#ASMSTART -; GFX10_1-NEXT: ; use alloca0 v1 +; GFX10_1-NEXT: ; use alloca0 v0 ; GFX10_1-NEXT: ;;#ASMEND ; GFX10_1-NEXT: v_writelane_b32 v23, s33, 2 -; GFX10_1-NEXT: v_add_nc_u32_e32 v22, 0x200, v0 ; GFX10_1-NEXT: v_writelane_b32 v23, s34, 3 ; GFX10_1-NEXT: v_writelane_b32 v23, s35, 4 ; GFX10_1-NEXT: v_writelane_b32 v23, s36, 5 @@ -2070,17 +2065,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX10_3-NEXT: buffer_store_dword v23, off, s[0:3], s5 ; 4-byte Folded Spill ; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 ; GFX10_3-NEXT: v_writelane_b32 v23, s30, 0 -; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_3-NEXT: v_lshrrev_b32_e64 v1, 5, s32 +; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_3-NEXT: v_writelane_b32 v23, s31, 1 -; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0 -; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 64, v1 +; GFX10_3-NEXT: v_add_nc_u32_e32 v22, 0x4240, v1 +; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0 ; GFX10_3-NEXT: ;;#ASMSTART -; GFX10_3-NEXT: ; use alloca0 v1 +; GFX10_3-NEXT: ; use alloca0 v0 ; GFX10_3-NEXT: ;;#ASMEND ; GFX10_3-NEXT: v_writelane_b32 v23, s33, 2 -; GFX10_3-NEXT: v_add_nc_u32_e32 v22, 
0x200, v0 ; GFX10_3-NEXT: v_writelane_b32 v23, s34, 3 ; GFX10_3-NEXT: v_writelane_b32 v23, s35, 4 ; GFX10_3-NEXT: v_writelane_b32 v23, s36, 5 @@ -2156,17 +2150,15 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX11-NEXT: scratch_store_b32 off, v23, s1 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: v_writelane_b32 v23, s30, 0 -; GFX11-NEXT: s_add_i32 s0, s32, 0x4040 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_add_i32 s0, s32, 64 -; GFX11-NEXT: v_writelane_b32 v23, s31, 1 -; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v1, s32 :: v_dual_mov_b32 v0, s0 ; GFX11-NEXT: s_and_b32 s0, 0, exec_lo -; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x200, v0 +; GFX11-NEXT: v_writelane_b32 v23, s31, 1 ; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use alloca0 v1 +; GFX11-NEXT: ; use alloca0 v0 ; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x4240, v1 ; GFX11-NEXT: v_writelane_b32 v23, s33, 2 ; GFX11-NEXT: v_writelane_b32 v23, s34, 3 ; GFX11-NEXT: v_writelane_b32 v23, s35, 4 @@ -2248,16 +2240,14 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: v_writelane_b32 v23, s30, 0 -; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v1, s32 :: v_dual_mov_b32 v0, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s32 :: v_dual_mov_b32 v1, s32 ; GFX12-NEXT: s_and_b32 s0, 0, exec_lo -; GFX12-NEXT: v_writelane_b32 v23, s31, 1 ; GFX12-NEXT: ;;#ASMSTART -; GFX12-NEXT: ; use alloca0 v1 +; GFX12-NEXT: ; use alloca0 v0 ; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: v_writelane_b32 v23, s31, 1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_add_nc_u32_e32 v22, 0x200, v0 +; GFX12-NEXT: v_add_nc_u32_e32 v22, 0x4200, 
v1 ; GFX12-NEXT: v_writelane_b32 v23, s33, 2 ; GFX12-NEXT: v_writelane_b32 v23, s34, 3 ; GFX12-NEXT: v_writelane_b32 v23, s35, 4