|
12 | 12 | #include "AMDGPU.h" |
13 | 13 | #include "GCNSubtarget.h" |
14 | 14 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
| 15 | +#include "SIInstrInfo.h" |
15 | 16 | #include "SIMachineFunctionInfo.h" |
| 17 | +#include "SIRegisterInfo.h" |
16 | 18 | #include "llvm/ADT/DepthFirstIterator.h" |
| 19 | +#include "llvm/CodeGen/MachineFunction.h" |
17 | 20 | #include "llvm/CodeGen/MachineFunctionPass.h" |
18 | 21 | #include "llvm/CodeGen/MachineOperand.h" |
19 | 22 |
|
@@ -570,6 +573,11 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const { |
570 | 573 | } |
571 | 574 |
|
572 | 575 | MachineOperand *New = Fold.OpToFold; |
| 576 | + // TODO: Temporarily allow folding from SGPRs to 16-bit VGPRs. |
| 577 | + // Rework once the VS_16 register class is updated to include proper |
| 578 | + // 16-bit SGPRs instead of 32-bit ones. |
| 579 | + if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg())) |
| 580 | + Old.setSubReg(AMDGPU::NoSubRegister); |
573 | 581 | Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI); |
574 | 582 | Old.setIsUndef(New->isUndef()); |
575 | 583 | return true; |
@@ -875,9 +883,14 @@ void SIFoldOperandsImpl::foldOperand( |
875 | 883 | return; |
876 | 884 |
|
877 | 885 | // FIXME: Fold operands with subregs. |
878 | | - if (UseOp->isReg() && OpToFold.isReg() && |
879 | | - (UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister)) |
880 | | - return; |
| 886 | + if (UseOp->isReg() && OpToFold.isReg()) { |
| 887 | + if (UseOp->isImplicit()) |
| 888 | + return; |
| 889 | + // Allow folding from SGPRs to 16-bit VGPRs. |
| 890 | + if (UseOp->getSubReg() != AMDGPU::NoSubRegister && |
| 891 | + UseOp->getSubReg() != AMDGPU::lo16) |
| 892 | + return; |
| 893 | + } |
881 | 894 |
|
882 | 895 | // Special case for REG_SEQUENCE: We can't fold literals into |
883 | 896 | // REG_SEQUENCE instructions, so we have to fold them into the |
@@ -958,14 +971,20 @@ void SIFoldOperandsImpl::foldOperand( |
958 | 971 | return; |
959 | 972 |
|
960 | 973 | const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg); |
961 | | - if (!DestReg.isPhysical()) { |
962 | | - if (DestRC == &AMDGPU::AGPR_32RegClass && |
963 | | - TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) { |
964 | | - UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64)); |
965 | | - UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm()); |
966 | | - CopiesToReplace.push_back(UseMI); |
967 | | - return; |
968 | | - } |
| 974 | + if (DestRC == &AMDGPU::AGPR_32RegClass && |
| 975 | + TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) { |
| 976 | + UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64)); |
| 977 | + UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm()); |
| 978 | + CopiesToReplace.push_back(UseMI); |
| 979 | + return; |
| 980 | + } |
| 981 | + |
| 982 | + // Allow immediates copied into sgpr_lo16 to be folded further, while |
| 983 | + // keeping the COPY legal if no further fold takes place. |
| 984 | + if (DestRC == &AMDGPU::SGPR_LO16RegClass) { |
| 985 | + assert(ST->useRealTrue16Insts()); |
| 986 | + MRI->setRegClass(DestReg, &AMDGPU::SGPR_32RegClass); |
| 987 | + DestRC = &AMDGPU::SGPR_32RegClass; |
969 | 988 | } |
970 | 989 |
|
971 | 990 | // In order to fold immediates into copies, we need to change the |
@@ -1004,7 +1023,40 @@ void SIFoldOperandsImpl::foldOperand( |
1004 | 1023 | unsigned Size = TII->getOpSize(*UseMI, 1); |
1005 | 1024 | Register UseReg = OpToFold.getReg(); |
1006 | 1025 | UseMI->getOperand(1).setReg(UseReg); |
1007 | | - UseMI->getOperand(1).setSubReg(OpToFold.getSubReg()); |
| 1026 | + unsigned SubRegIdx = OpToFold.getSubReg(); |
| 1027 | + // Hack to allow 32-bit SGPRs to be folded into True16 instructions. |
| 1028 | + // Remove this once 16-bit SGPRs (i.e. SGPR_LO16) are added to the |
| 1029 | + // VS_16RegClass. |
| 1030 | + // |
| 1031 | + // Excerpt from AMDGPUGenRegisterInfo.inc |
| 1032 | + // NoSubRegister, //0 |
| 1033 | + // hi16, // 1 |
| 1034 | + // lo16, // 2 |
| 1035 | + // sub0, // 3 |
| 1036 | + // ... |
| 1037 | + // sub1, // 11 |
| 1038 | + // sub1_hi16, // 12 |
| 1039 | + // sub1_lo16, // 13 |
| 1040 | + static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed"); |
| 1041 | + if (Size == 2 && TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) && |
| 1042 | + TRI->isSGPRReg(*MRI, UseReg)) { |
| 1043 | + // Produce the 32 bit subregister index to which the 16-bit subregister |
| 1044 | + // is aligned. |
| 1045 | + if (SubRegIdx > AMDGPU::sub1) { |
| 1046 | + LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx); |
| 1047 | + M |= M.getLane(M.getHighestLane() - 1); |
| 1048 | + SmallVector<unsigned, 4> Indexes; |
| 1049 | + TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M, |
| 1050 | + Indexes); |
| 1051 | + assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover"); |
| 1052 | + SubRegIdx = Indexes[0]; |
| 1053 | + // 32-bit registers do not have a sub0 index |
| 1054 | + } else if (TII->getOpSize(*UseMI, 1) == 4) |
| 1055 | + SubRegIdx = 0; |
| 1056 | + else |
| 1057 | + SubRegIdx = AMDGPU::sub0; |
| 1058 | + } |
| 1059 | + UseMI->getOperand(1).setSubReg(SubRegIdx); |
1008 | 1060 | UseMI->getOperand(1).setIsKill(false); |
1009 | 1061 | CopiesToReplace.push_back(UseMI); |
1010 | 1062 | OpToFold.setIsKill(false); |
@@ -1584,6 +1636,31 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy( |
1584 | 1636 | if (OpToFold.isReg() && !OpToFold.getReg().isVirtual()) |
1585 | 1637 | return false; |
1586 | 1638 |
|
| 1639 | + // True16: Fix malformed 16-bit SGPR COPY produced by peephole-opt. |
| 1640 | + // This code can be removed once proper 16-bit SGPRs are implemented. |
| 1641 | + // Example: Pre-peephole-opt |
| 1642 | + // %29:sgpr_lo16 = COPY %16.lo16:sreg_32 |
| 1643 | + // %32:sreg_32 = COPY %29:sgpr_lo16 |
| 1644 | + // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32 |
| 1645 | + // Post-peephole-opt and DCE |
| 1646 | + // %32:sreg_32 = COPY %16.lo16:sreg_32 |
| 1647 | + // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32 |
| 1648 | + // After this transform |
| 1649 | + // %32:sreg_32 = COPY %16:sreg_32 |
| 1650 | + // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32 |
| 1651 | + // After the fold operands pass |
| 1652 | + // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %16:sreg_32 |
| 1653 | + if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() && |
| 1654 | + OpToFold.getSubReg()) { |
| 1655 | + const TargetRegisterClass *DstRC = |
| 1656 | + MRI->getRegClass(MI.getOperand(0).getReg()); |
| 1657 | + if (DstRC == &AMDGPU::SReg_32RegClass && |
| 1658 | + DstRC == MRI->getRegClass(OpToFold.getReg())) { |
| 1659 | + assert(OpToFold.getSubReg() == AMDGPU::lo16); |
| 1660 | + OpToFold.setSubReg(0); |
| 1661 | + } |
| 1662 | + } |
| 1663 | + |
1587 | 1664 | // Prevent folding operands backwards in the function. For example, |
1588 | 1665 | // the COPY opcode must not be replaced by 1 in this example: |
1589 | 1666 | // |
|
0 commit comments