@@ -12,8 +12,11 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
 #include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineOperand.h"
 
@@ -576,6 +579,11 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
   }
 
   MachineOperand *New = Fold.OpToFold;
+  // TODO: Temporarily allow folding from SGPRs to 16-bit VGPRs.
+  // Rework once the VS_16 register class is updated to include proper
+  // 16-bit SGPRs instead of 32-bit ones.
+  if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
+    Old.setSubReg(AMDGPU::NoSubRegister);
   Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
   Old.setIsUndef(New->isUndef());
   return true;
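
Note: the updateOperand() change drops the lo16 subregister whenever the incoming register is an SGPR, because the SGPRs currently listed in VS_16 are full 32-bit registers and a reference like sgpr.lo16 would be malformed. A minimal standalone sketch of that decision, with toy enums standing in for MachineOperand and the TRI/MRI queries (all names and values here are illustrative, not LLVM's):

    #include <cstdio>

    enum SubRegIndex { NoSubRegister = 0, Hi16 = 1, Lo16 = 2 }; // assumed values
    enum RegBank { SGPR, VGPR };

    struct UseOperand {
      RegBank Bank;
      SubRegIndex Sub;
    };

    // Mirrors the guard above: if the use reads the lo16 half and the
    // folded-in source is an SGPR, clear the subregister, since the SGPRs
    // in VS_16 are full 32-bit registers with no lo16 subregister.
    void substReg(UseOperand &Old, RegBank NewBank) {
      if (Old.Sub == Lo16 && NewBank == SGPR)
        Old.Sub = NoSubRegister;
      Old.Bank = NewBank;
    }

    int main() {
      UseOperand Op{VGPR, Lo16};
      substReg(Op, SGPR);
      std::printf("subreg after fold: %d\n", Op.Sub); // 0 (NoSubRegister)
      return 0;
    }
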
@@ -947,9 +955,14 @@ void SIFoldOperandsImpl::foldOperand(
     return;
 
   // FIXME: Fold operands with subregs.
-  if (UseOp->isReg() && OpToFold.isReg() &&
-      (UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister))
-    return;
+  if (UseOp->isReg() && OpToFold.isReg()) {
+    if (UseOp->isImplicit())
+      return;
+    // Allow folding from SGPRs to 16-bit VGPRs.
+    if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
+        UseOp->getSubReg() != AMDGPU::lo16)
+      return;
+  }
 
   // Special case for REG_SEQUENCE: We can't fold literals into
   // REG_SEQUENCE instructions, so we have to fold them into the
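
Note: the rewritten check splits the old single bail-out so that exactly one subregister case gets through: implicit uses still block the fold, as does any subregister other than lo16. A compilable sketch of just that predicate (toy enum values, not LLVM's):

    #include <cstdio>

    enum SubRegIndex { NoSubRegister = 0, Hi16 = 1, Lo16 = 2 }; // assumed values

    // True when foldOperand() must bail out, per the hunk above: implicit
    // uses are never folded into, and a subregister on the use blocks the
    // fold unless it is lo16 (the True16 SGPR-into-VGPR case).
    bool foldBlocked(bool UseIsImplicit, SubRegIndex UseSub) {
      if (UseIsImplicit)
        return true;
      return UseSub != NoSubRegister && UseSub != Lo16;
    }

    int main() {
      std::printf("%d %d %d\n",
                  foldBlocked(false, Lo16),           // 0: fold allowed
                  foldBlocked(false, Hi16),           // 1: blocked
                  foldBlocked(true, NoSubRegister));  // 1: blocked
      return 0;
    }
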
@@ -1030,14 +1043,20 @@ void SIFoldOperandsImpl::foldOperand(
       return;
 
     const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
-    if (!DestReg.isPhysical()) {
-      if (DestRC == &AMDGPU::AGPR_32RegClass &&
-          TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
-        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
-        UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
-        CopiesToReplace.push_back(UseMI);
-        return;
-      }
+    if (DestRC == &AMDGPU::AGPR_32RegClass &&
+        TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
+      UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
+      UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+      CopiesToReplace.push_back(UseMI);
+      return;
+    }
+
+    // Allow immediates COPY'd into sgpr_lo16 to be folded further, while
+    // still remaining legal if they are not folded any further.
+    if (DestRC == &AMDGPU::SGPR_LO16RegClass) {
+      assert(ST->useRealTrue16Insts());
+      MRI->setRegClass(DestReg, &AMDGPU::SGPR_32RegClass);
+      DestRC = &AMDGPU::SGPR_32RegClass;
     }
 
     // In order to fold immediates into copies, we need to change the
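
Note: widening the COPY destination from SGPR_LO16 to SGPR_32 keeps the copy legal whether or not the immediate is folded any further, since today's "16-bit" SGPRs are physically 32-bit registers. A toy model of the re-classing step (the string class names and the map are purely illustrative):

    #include <cassert>
    #include <cstdio>
    #include <string>
    #include <unordered_map>

    // vreg number -> register class name (a stand-in for MachineRegisterInfo).
    std::unordered_map<unsigned, std::string> RegClass;

    // Mirrors the hunk above: a 16-bit SGPR COPY destination is widened to
    // the full 32-bit SGPR class before the immediate-into-copy rewrite,
    // because today's "16-bit" SGPRs are physically 32-bit registers.
    void widenSGPRLo16Dst(unsigned DstVReg, bool RealTrue16) {
      if (RegClass[DstVReg] == "sgpr_lo16") {
        assert(RealTrue16 && "only expected with real True16 instructions");
        RegClass[DstVReg] = "sgpr_32";
      }
    }

    int main() {
      RegClass[7] = "sgpr_lo16";
      widenSGPRLo16Dst(7, /*RealTrue16=*/true);
      std::printf("%s\n", RegClass[7].c_str()); // sgpr_32
      return 0;
    }
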
@@ -1073,9 +1092,43 @@ void SIFoldOperandsImpl::foldOperand(
       UseMI->getOperand(0).getReg().isVirtual() &&
       !UseMI->getOperand(1).getSubReg()) {
     LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
+    unsigned Size = TII->getOpSize(*UseMI, 1);
     Register UseReg = OpToFold.getReg();
     UseMI->getOperand(1).setReg(UseReg);
-    UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
+    unsigned SubRegIdx = OpToFold.getSubReg();
+    // Hack to allow 32-bit SGPRs to be folded into True16 instructions.
+    // Remove this once 16-bit SGPRs (i.e. SGPR_LO16) are added to the
+    // VS_16RegClass.
+    //
+    // Excerpt from AMDGPUGenRegisterInfo.inc:
+    //   NoSubRegister, // 0
+    //   hi16,          // 1
+    //   lo16,          // 2
+    //   sub0,          // 3
+    //   ...
+    //   sub1,          // 11
+    //   sub1_hi16,     // 12
+    //   sub1_lo16,     // 13
+    static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");
+    if (Size == 2 && TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
+        TRI->isSGPRReg(*MRI, UseReg)) {
+      // Produce the 32-bit subregister index to which the 16-bit
+      // subregister is aligned.
+      if (SubRegIdx > AMDGPU::sub1) {
+        LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx);
+        M |= M.getLane(M.getHighestLane() - 1);
+        SmallVector<unsigned, 4> Indexes;
+        TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M,
+                                      Indexes);
+        assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover");
+        SubRegIdx = Indexes[0];
+      } else if (TII->getOpSize(*UseMI, 1) == 4)
+        // 32-bit registers do not have a sub0 index.
+        SubRegIdx = 0;
+      else
+        SubRegIdx = AMDGPU::sub0;
+    }
+    UseMI->getOperand(1).setSubReg(SubRegIdx);
     UseMI->getOperand(1).setIsKill(false);
     CopiesToReplace.push_back(UseMI);
     OpToFold.setIsKill(false);
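
Note: the lane-mask arithmetic is the subtle part of this hunk: starting from the single lane of a 16-bit index such as sub1_hi16, OR-ing in the adjacent lane yields the two-lane mask of the covering 32-bit subregister, which getCoveringSubRegIndexes() maps back to an index such as sub1. A self-contained model of that computation under a simplified lane encoding (lane 2i = lo half of 32-bit chunk i, lane 2i+1 = hi half); the real pass uses LaneBitmask and the TRI queries instead:

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    using LaneMask = uint64_t;

    // Lane of the 16-bit half (Hi or Lo) of 32-bit chunk `Chunk`.
    LaneMask laneOf16(unsigned Chunk, bool Hi) {
      return 1ull << (2 * Chunk + (Hi ? 1 : 0));
    }

    // Stand-in for TRI->getCoveringSubRegIndexes(): find the 32-bit chunk
    // whose two lanes exactly cover M.
    unsigned coveringSub32(LaneMask M) {
      for (unsigned Chunk = 0; Chunk < 32; ++Chunk)
        if ((laneOf16(Chunk, false) | laneOf16(Chunk, true)) == M)
          return Chunk; // i.e. AMDGPU::sub<Chunk>
      assert(false && "expected one 32-bit subreg to cover");
      return ~0u;
    }

    int main() {
      // Folding an SGPR read via sub1_hi16: start from its single lane...
      LaneMask M = laneOf16(1, /*Hi=*/true);
      // ...and OR in the partner lane, as `M |= M.getLane(HighestLane - 1)`
      // does above for a hi16 lane.
      M |= M >> 1;
      std::printf("aligned 32-bit subreg: sub%u\n", coveringSub32(M)); // sub1
      return 0;
    }
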
@@ -1713,6 +1766,31 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
   if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
     return false;
 
+  // True16: Fix a malformed 16-bit SGPR COPY produced by peephole-opt.
+  // This code can be removed once proper 16-bit SGPRs are implemented.
+  // Example, before peephole-opt:
+  //   %29:sgpr_lo16 = COPY %16.lo16:sreg_32
+  //   %32:sreg_32 = COPY %29:sgpr_lo16
+  //   %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
+  // After peephole-opt and DCE:
+  //   %32:sreg_32 = COPY %16.lo16:sreg_32
+  //   %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
+  // After this transform:
+  //   %32:sreg_32 = COPY %16:sreg_32
+  //   %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
+  // After the fold-operands pass:
+  //   %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %16:sreg_32
+  if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&
+      OpToFold.getSubReg()) {
+    const TargetRegisterClass *DstRC =
+        MRI->getRegClass(MI.getOperand(0).getReg());
+    if (DstRC == &AMDGPU::SReg_32RegClass &&
+        DstRC == MRI->getRegClass(OpToFold.getReg())) {
+      assert(OpToFold.getSubReg() == AMDGPU::lo16);
+      OpToFold.setSubReg(0);
+    }
+  }
+
   // Prevent folding operands backwards in the function. For example,
   // the COPY opcode must not be replaced by 1 in this example:
   //
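
Note: in the example above the repair is safe because the eventual user, S_PACK_LL_B32_B16, reads only the low 16 bits of each source, so %16.lo16:sreg_32 and plain %16:sreg_32 feed it the same value. A toy version of the check (illustrative types; the real code compares TargetRegisterClass pointers):

    #include <cstdio>
    #include <string>

    struct CopyInst {
      std::string DstRC;  // register class of the COPY destination
      std::string SrcRC;  // register class of the COPY source
      unsigned SubReg;    // 0 = NoSubRegister, 2 = lo16 (assumed encoding)
    };

    // Mirrors the hunk above: a COPY between two sreg_32 registers whose
    // source still carries a lo16 subregister is malformed; dropping the
    // subregister leaves a plain full-register COPY that the fold can then
    // eliminate.
    void repairSGPRCopy(CopyInst &MI) {
      constexpr unsigned Lo16 = 2;
      if (MI.SubReg == Lo16 && MI.DstRC == "sreg_32" && MI.SrcRC == MI.DstRC)
        MI.SubReg = 0;
    }

    int main() {
      CopyInst MI{"sreg_32", "sreg_32", 2}; // %32 = COPY %16.lo16:sreg_32
      repairSGPRCopy(MI);
      std::printf("subreg: %u\n", MI.SubReg); // 0 -> %32 = COPY %16:sreg_32
      return 0;
    }
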