-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU][True16][MC] VOPC profile fake16 pseudo update #113175
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU][True16][MC] VOPC profile fake16 pseudo update #113175
Conversation
6326dc2 to
80c2eb6
Compare
80c2eb6 to
835593c
Compare
|
@llvm/pr-subscribers-mc @llvm/pr-subscribers-backend-amdgpu Author: Brox Chen (broxigarchen) Changes: Update VOPC profile with VOP3 pseudo:
Patch is 61.55 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/113175.diff 14 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 800bdbe04cf70d..e1c780c33e9678 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1104,10 +1104,13 @@ static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
if (Size == 16 && !ST.has16BitInsts())
return -1;
- const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc, unsigned S32Opc,
+ const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
+ unsigned FakeS16Opc, unsigned S32Opc,
unsigned S64Opc) {
if (Size == 16)
- return ST.hasTrue16BitInsts() ? TrueS16Opc : S16Opc;
+ return ST.hasTrue16BitInsts()
+ ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
+ : S16Opc;
if (Size == 32)
return S32Opc;
return S64Opc;
@@ -1118,83 +1121,109 @@ static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
llvm_unreachable("Unknown condition code!");
case CmpInst::ICMP_NE:
return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
- AMDGPU::V_CMP_NE_U32_e64, AMDGPU::V_CMP_NE_U64_e64);
+ AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
+ AMDGPU::V_CMP_NE_U64_e64);
case CmpInst::ICMP_EQ:
return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
- AMDGPU::V_CMP_EQ_U32_e64, AMDGPU::V_CMP_EQ_U64_e64);
+ AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
+ AMDGPU::V_CMP_EQ_U64_e64);
case CmpInst::ICMP_SGT:
return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
- AMDGPU::V_CMP_GT_I32_e64, AMDGPU::V_CMP_GT_I64_e64);
+ AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
+ AMDGPU::V_CMP_GT_I64_e64);
case CmpInst::ICMP_SGE:
return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
- AMDGPU::V_CMP_GE_I32_e64, AMDGPU::V_CMP_GE_I64_e64);
+ AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
+ AMDGPU::V_CMP_GE_I64_e64);
case CmpInst::ICMP_SLT:
return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
- AMDGPU::V_CMP_LT_I32_e64, AMDGPU::V_CMP_LT_I64_e64);
+ AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
+ AMDGPU::V_CMP_LT_I64_e64);
case CmpInst::ICMP_SLE:
return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
- AMDGPU::V_CMP_LE_I32_e64, AMDGPU::V_CMP_LE_I64_e64);
+ AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
+ AMDGPU::V_CMP_LE_I64_e64);
case CmpInst::ICMP_UGT:
return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
- AMDGPU::V_CMP_GT_U32_e64, AMDGPU::V_CMP_GT_U64_e64);
+ AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
+ AMDGPU::V_CMP_GT_U64_e64);
case CmpInst::ICMP_UGE:
return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
- AMDGPU::V_CMP_GE_U32_e64, AMDGPU::V_CMP_GE_U64_e64);
+ AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
+ AMDGPU::V_CMP_GE_U64_e64);
case CmpInst::ICMP_ULT:
return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
- AMDGPU::V_CMP_LT_U32_e64, AMDGPU::V_CMP_LT_U64_e64);
+ AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
+ AMDGPU::V_CMP_LT_U64_e64);
case CmpInst::ICMP_ULE:
return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
- AMDGPU::V_CMP_LE_U32_e64, AMDGPU::V_CMP_LE_U64_e64);
+ AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
+ AMDGPU::V_CMP_LE_U64_e64);
case CmpInst::FCMP_OEQ:
return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
- AMDGPU::V_CMP_EQ_F32_e64, AMDGPU::V_CMP_EQ_F64_e64);
+ AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
+ AMDGPU::V_CMP_EQ_F64_e64);
case CmpInst::FCMP_OGT:
return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
- AMDGPU::V_CMP_GT_F32_e64, AMDGPU::V_CMP_GT_F64_e64);
+ AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
+ AMDGPU::V_CMP_GT_F64_e64);
case CmpInst::FCMP_OGE:
return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
- AMDGPU::V_CMP_GE_F32_e64, AMDGPU::V_CMP_GE_F64_e64);
+ AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
+ AMDGPU::V_CMP_GE_F64_e64);
case CmpInst::FCMP_OLT:
return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
- AMDGPU::V_CMP_LT_F32_e64, AMDGPU::V_CMP_LT_F64_e64);
+ AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
+ AMDGPU::V_CMP_LT_F64_e64);
case CmpInst::FCMP_OLE:
return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
- AMDGPU::V_CMP_LE_F32_e64, AMDGPU::V_CMP_LE_F64_e64);
+ AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
+ AMDGPU::V_CMP_LE_F64_e64);
case CmpInst::FCMP_ONE:
return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
- AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
+ AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
+ AMDGPU::V_CMP_NEQ_F64_e64);
case CmpInst::FCMP_ORD:
return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
- AMDGPU::V_CMP_O_F32_e64, AMDGPU::V_CMP_O_F64_e64);
+ AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
+ AMDGPU::V_CMP_O_F64_e64);
case CmpInst::FCMP_UNO:
return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
- AMDGPU::V_CMP_U_F32_e64, AMDGPU::V_CMP_U_F64_e64);
+ AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
+ AMDGPU::V_CMP_U_F64_e64);
case CmpInst::FCMP_UEQ:
return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
- AMDGPU::V_CMP_NLG_F32_e64, AMDGPU::V_CMP_NLG_F64_e64);
+ AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
+ AMDGPU::V_CMP_NLG_F64_e64);
case CmpInst::FCMP_UGT:
return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
- AMDGPU::V_CMP_NLE_F32_e64, AMDGPU::V_CMP_NLE_F64_e64);
+ AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
+ AMDGPU::V_CMP_NLE_F64_e64);
case CmpInst::FCMP_UGE:
return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
- AMDGPU::V_CMP_NLT_F32_e64, AMDGPU::V_CMP_NLT_F64_e64);
+ AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
+ AMDGPU::V_CMP_NLT_F64_e64);
case CmpInst::FCMP_ULT:
return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
- AMDGPU::V_CMP_NGE_F32_e64, AMDGPU::V_CMP_NGE_F64_e64);
+ AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
+ AMDGPU::V_CMP_NGE_F64_e64);
case CmpInst::FCMP_ULE:
return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
- AMDGPU::V_CMP_NGT_F32_e64, AMDGPU::V_CMP_NGT_F64_e64);
+ AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
+ AMDGPU::V_CMP_NGT_F64_e64);
case CmpInst::FCMP_UNE:
return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
- AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
+ AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
+ AMDGPU::V_CMP_NEQ_F64_e64);
case CmpInst::FCMP_TRUE:
return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
- AMDGPU::V_CMP_TRU_F32_e64, AMDGPU::V_CMP_TRU_F64_e64);
+ AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
+ AMDGPU::V_CMP_TRU_F64_e64);
case CmpInst::FCMP_FALSE:
return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
- AMDGPU::V_CMP_F_F32_e64, AMDGPU::V_CMP_F_F64_e64);
+ AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
+ AMDGPU::V_CMP_F_F64_e64);
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 89a2eb4f18946b..fce88860aeea42 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -5501,20 +5501,48 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
- case AMDGPU::S_CMP_LT_F16: return AMDGPU::V_CMP_LT_F16_t16_e64;
- case AMDGPU::S_CMP_EQ_F16: return AMDGPU::V_CMP_EQ_F16_t16_e64;
- case AMDGPU::S_CMP_LE_F16: return AMDGPU::V_CMP_LE_F16_t16_e64;
- case AMDGPU::S_CMP_GT_F16: return AMDGPU::V_CMP_GT_F16_t16_e64;
- case AMDGPU::S_CMP_LG_F16: return AMDGPU::V_CMP_LG_F16_t16_e64;
- case AMDGPU::S_CMP_GE_F16: return AMDGPU::V_CMP_GE_F16_t16_e64;
- case AMDGPU::S_CMP_O_F16: return AMDGPU::V_CMP_O_F16_t16_e64;
- case AMDGPU::S_CMP_U_F16: return AMDGPU::V_CMP_U_F16_t16_e64;
- case AMDGPU::S_CMP_NGE_F16: return AMDGPU::V_CMP_NGE_F16_t16_e64;
- case AMDGPU::S_CMP_NLG_F16: return AMDGPU::V_CMP_NLG_F16_t16_e64;
- case AMDGPU::S_CMP_NGT_F16: return AMDGPU::V_CMP_NGT_F16_t16_e64;
- case AMDGPU::S_CMP_NLE_F16: return AMDGPU::V_CMP_NLE_F16_t16_e64;
- case AMDGPU::S_CMP_NEQ_F16: return AMDGPU::V_CMP_NEQ_F16_t16_e64;
- case AMDGPU::S_CMP_NLT_F16: return AMDGPU::V_CMP_NLT_F16_t16_e64;
+ case AMDGPU::S_CMP_LT_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
+ : AMDGPU::V_CMP_LT_F16_fake16_e64;
+ case AMDGPU::S_CMP_EQ_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
+ : AMDGPU::V_CMP_EQ_F16_fake16_e64;
+ case AMDGPU::S_CMP_LE_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
+ : AMDGPU::V_CMP_LE_F16_fake16_e64;
+ case AMDGPU::S_CMP_GT_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
+ : AMDGPU::V_CMP_GT_F16_fake16_e64;
+ case AMDGPU::S_CMP_LG_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
+ : AMDGPU::V_CMP_LG_F16_fake16_e64;
+ case AMDGPU::S_CMP_GE_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
+ : AMDGPU::V_CMP_GE_F16_fake16_e64;
+ case AMDGPU::S_CMP_O_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
+ : AMDGPU::V_CMP_O_F16_fake16_e64;
+ case AMDGPU::S_CMP_U_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
+ : AMDGPU::V_CMP_U_F16_fake16_e64;
+ case AMDGPU::S_CMP_NGE_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
+ : AMDGPU::V_CMP_NGE_F16_fake16_e64;
+ case AMDGPU::S_CMP_NLG_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
+ : AMDGPU::V_CMP_NLG_F16_fake16_e64;
+ case AMDGPU::S_CMP_NGT_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
+ : AMDGPU::V_CMP_NGT_F16_fake16_e64;
+ case AMDGPU::S_CMP_NLE_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
+ : AMDGPU::V_CMP_NLE_F16_fake16_e64;
+ case AMDGPU::S_CMP_NEQ_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
+ : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
+ case AMDGPU::S_CMP_NLT_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
+ : AMDGPU::V_CMP_NLT_F16_fake16_e64;
case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_fake16_e64;
case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
@@ -7343,14 +7371,16 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
auto NewInstr =
BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
.setMIFlags(Inst.getFlags());
- if (AMDGPU::getNamedOperandIdx(NewOpcode,
- AMDGPU::OpName::src0_modifiers) >= 0) {
+ if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
NewInstr
.addImm(0) // src0_modifiers
.add(Inst.getOperand(0)) // src0
.addImm(0) // src1_modifiers
.add(Inst.getOperand(1)) // src1
.addImm(0); // clamp
+
+ if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
+ NewInstr.addImm(0); // op_sel0
} else {
NewInstr
.add(Inst.getOperand(0))
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index faa0b6d6c3f506..2f3b3370c3393e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3463,7 +3463,7 @@ def : GCNPat <
SRCMODS.NONE,
(V_MOV_B64_PSEUDO (i64 0x3fefffffffffffff))),
$x,
- (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/))))
+ (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, SRCMODS.NONE, (i32 3 /*NaN*/))))
>;
} // End SubtargetPredicates = isGFX6
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index d6e08dce130ced..08881792b59847 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -192,6 +192,8 @@ class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily, string asm_name = ps.Pseudo
// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
+ let True16Predicate = ps.True16Predicate;
+ let OtherPredicates = ps.OtherPredicates;
let AsmMatchConverter = ps.AsmMatchConverter;
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
@@ -314,7 +316,7 @@ multiclass VOPC_Pseudos <string opName,
let isCommutable = 1;
}
- def _e64 : VOP3_Pseudo<opName, P, getVOPCPat64<cond, P>.ret>,
+ def _e64 : VOP3_Pseudo<opName, P, getVOPCPat64<cond, P>.ret, 0/*IsVOP3P*/, P.HasOpSel>,
Commutable_REV<revOp#"_e64", !eq(revOp, opName)>,
VCMPXNoSDstTable<1, opName#"_e64">,
VCMPVCMPXTable<opName#"_e64"> {
@@ -373,7 +375,7 @@ multiclass VOPCX_Pseudos <string opName,
let IsVCMPX = 1;
}
- def _nosdst_e64 : VOP3_Pseudo<opName#"_nosdst", P_NoSDst>,
+ def _nosdst_e64 : VOP3_Pseudo<opName#"_nosdst", P_NoSDst, [], 0/*IsVOP3P*/, P_NoSDst.HasOpSel>,
Commutable_REV<revOp#"_nosdst_e64", !eq(revOp, opName)>,
VCMPXNoSDstTable<0, opName#"_e64">,
VCMPVCMPXTable<!subst("v_cmpx", "v_cmp", opName#"_e64")> {
@@ -801,24 +803,11 @@ defm V_CMPX_T_U64 : VOPCX_I64 <"v_cmpx_t_u64">;
class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType src0VT, ValueType src1VT = i32> :
VOPC_Profile<sched, src0VT, src1VT> {
- let AsmDPP = "$src0_modifiers, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
- let AsmDPP16 = AsmDPP#"$fi";
- let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl);
- let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi));
- // DPP8 forbids modifiers and can inherit from VOPC_Profile
-
- let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
- dag InsPartVOP3DPP = (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, VCSrc_b32:$src1);
- let InsVOP3Base = !con(InsPartVOP3DPP, !if(HasOpSel, (ins op_sel0:$op_sel),
- (ins)));
- let AsmVOP3Base = "$sdst, $src0_modifiers, $src1";
-
let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
Clamp:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel);
let AsmSDWA = " vcc, $src0_modifiers, $src1_modifiers$clamp $src0_sel $src1_sel";
- let HasSrc1Mods = 0;
let HasClamp = 0;
let HasOMod = 0;
}
@@ -837,16 +826,26 @@ multiclass VOPC_Class_Profile_t16<list<SchedReadWrite> sched> {
let Src1ModDPP = getSrcModDPP_t16<Src1VT>.ret;
let Src2ModDPP = getSrcModDPP_t16<Src2VT>.ret;
}
- def _fake16 : VOPC_Class_Profile<sched, f16, i16> {
+ def _fake16 : VOPC_Class_Profile<sched, f16, f16> {
let IsTrue16 = 1;
+ let DstRC = getVALUDstForVT_fake16<DstVT>.ret;
+ let DstRC64 = getVALUDstForVT<DstVT>.ret;
+ let Src0RC32 = getVOPSrc0ForVT<Src0VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
let Src1RC32 = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
let Src1RC64 = VSrc_b32;
let Src0DPP = getVregSrcForVT<Src0VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
let Src1DPP = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
let Src2DPP = getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
- let Src0ModDPP = getSrcModDPP_t16<Src0VT>.ret;
- let Src1ModDPP = getSrcModDPP_t16<Src1VT>.ret;
- let Src2ModDPP = getSrcModDPP_t16<Src2VT>.ret;
+ let Src0ModDPP = getSrcModDPP_t16<Src0VT, 1/*IsFake16*/>.ret;
+ let Src1ModDPP = getSrcModDPP_t16<Src1VT, 1/*IsFake16*/>.ret;
+ let Src2ModDPP = getSrcModDPP_t16<Src2VT, 1/*IsFake16*/>.ret;
+ let Src0VOP3DPP = VGPRSrc_32;
+ let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 1/*IsFake16*/>.ret;
+ let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 1/*IsFake16*/>.ret;
+ let Src0ModVOP3DPP = getSrc0ModVOP3DPP<Src0VT, DstVT, 1/*IsFake16*/>.ret;
+ let Src1ModVOP3DPP = getSrcModVOP3DPP<Src1VT, 1/*IsFake16*/>.ret;
+ let Src2ModVOP3DPP = getSrcModVOP3DPP<Src2VT, 1/*IsFake16*/>.ret;
+
}
}
@@ -889,17 +888,34 @@ multiclass VOPC_Class_NoSdst_Profile_t16<list<SchedReadWrite> sched> {
}
}
-class getVOPCClassPat64 <VOPProfile P> {
- list<dag> ret =
- [(set i1:$sdst,
+multiclass VOPCClassPat64<string inst_name> {
+ defvar inst = !cast<VOP_Pseudo>(inst_name#"_e64");
+ defvar P = inst.Pfl;
+ def : GCNPat <
+ (i1:$sdst
(AMDGPUfp_class
(P.Src0VT (VOP3ModsNonCanonicalizing P.Src0VT:$src0, i32:$src0_modifiers)),
- i32:$src1))];
+ P.Src1VT:$src1)),
+ (inst i32:$src0_modifiers, P.Src0VT:$src0,
+ 0 /*src1_modifiers*/, P.Src1VT:$src1)
+ >;
+}
+multiclass VOPCClassPat64_fake16<string inst_name> {
+ defvar inst = !cast<VOP_Pseudo>(inst_name#"_fake16_e64");
+ defvar P = inst.Pfl;
+ def : GCNPat <
+ (i1:$sdst
+ (AMDGPUfp_class
+ (P.Src0VT (VOP3ModsNonCanonicalizing P.Src0VT:$src0, i32:$src0_modifiers)),
+ i32:$src1)),
+ (inst i32:$src0_modifiers, P.Src0VT:$src0,
+ 0 /*src1_modifiers*/, VGPR_32:$src1)
+ >;
}
-
-// Special case for class instructions which only have modifiers on
-// the 1st source operand.
+// cmp_class ignores the FP mode and faithfully reports the unmodified
+// source value.
+let ReadsModeReg = 0, mayRaiseFPException = 0 in {
multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec,
bit DefVcc = 1> {
def _e32 : VOPC_Pseudo <opName, p>,
@@ -910,7 +926,7 @@ multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec,
let isConvergent = DefExec;
}
- def _e64 : VOP3_Pseudo<opName, p, getVOPCClassPat64<p>.ret>,
+ def _e64 : VOP3_Pseudo<opName, p, [], 0/*IsVOP3P*/, p.HasOpSel>,
...
[truncated]
|
835593c to
6e9adfc
Compare
|
ping! |
| (V_MOV_B64_PSEUDO (i64 0x3fefffffffffffff))), | ||
| $x, | ||
| (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/)))) | ||
| (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, SRCMODS.NONE, (i32 3 /*NaN*/)))) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This shouldn't have gained a modifier operand
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi Matt. Since this patch changes the instruction format from f16, i16 to f16, f16, there is an additional modifier operand for the floating point
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why "f16, f16"? Can you avoid adding this, it is not useful here
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi Matt. The context of this change is that the v_cmp_class_f16 has f16 in the src1 type. We change this to f16 to enable inlining of both int and floating-point immediates. However, it seems we could not find a way to suppress the additional src modifier. Also adding @Sisyph to help clarify if I misunderstood anything
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can always define it as its own profile without the operand
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think I misunderstood some part of the code when I took a closer look. Let me trim this patch and repost
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Modified this patch so that 32 and 64 bit instructions are not impacted
6e9adfc to
9bb47f7
Compare
9bb47f7 to
ec34e2b
Compare
0147973 to
8d3b933
Compare
Sisyph
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It looks like part of this patch is renaming some instructions which were erroneously called _t16 to _fake16. Please note which instructions are affected in the commit message.
| let SchedRW = ps.SchedRW; | ||
| let Uses = ps.Uses; | ||
| let OtherPredicates = ps.OtherPredicates; | ||
| let True16Predicate = ps.True16Predicate; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Duplicate line
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
removed
|
There still seem to be some errors in the CodeGen that I didn't realize before. I will address those ASAP |
9a053a5 to
941a225
Compare
updated the commit message |
Added back the missing source code change on inst selection and now the CodeGen test should be fixed |
941a225 to
c89978d
Compare
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
c89978d to
ad4ebe2
Compare
Sisyph
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
kosarev
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with a nit.
b455874 to
ea3edae
Compare
f16 for fake16 format
ea3edae to
87d820e
Compare
|
Squash the commits to make it easier for downstream review |
Update VOPC profile with VOP3 pseudo:
t16, but actually using 32 bit registers. Correct it by updating the pseudo definitions with useRealTrue16/useFakeTrue16 predicates and rename these t16 instructions to fake16. t16/fake16 instructions are selected in true16/fake16 flow.