-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[AMDGPU][MC] GFX9 - allow op_sel in v_interp_p2_f16 #150712
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU][MC] GFX9 - allow op_sel in v_interp_p2_f16 #150712
Conversation
@llvm/pr-subscribers-mc Author: Jun Wang (jwanggit86) ChangesAMDGPU documentation states op_sel[3] can be used in v_interp_p2_f16 in GFX9. Full diff: https://github.com/llvm/llvm-project/pull/150712.diff 6 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 5bb0e3648d2ec..d7ce772ab1ec6 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -9164,6 +9164,26 @@ void AMDGPUAsmParser::cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands)
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::omod))
addOptionalImmOperand(Inst, Operands, OptionalIdx,
AMDGPUOperand::ImmTyOModSI);
+
+ // Some v_interp instrutions use op_sel[3] for dst.
+ if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx,
+ AMDGPUOperand::ImmTyOpSel);
+
+ int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
+ unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
+
+ // Check if op_sel[3] is set, which is meant for dst.
+ if ((OpSel & (1 << 3)) != 0) {
+ int ModIdx =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
+ uint32_t ModVal = Inst.getOperand(ModIdx).getImm();
+
+ ModVal |= SISrcMods::DST_OP_SEL;
+
+ Inst.getOperand(ModIdx).setImm(ModVal);
+ }
+ }
}
void AMDGPUAsmParser::cvtVINTERP(MCInst &Inst, const OperandVector &Operands)
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 197bb3f0b569b..b2aa407302c95 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -1293,6 +1293,14 @@ void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned,
O << " op_sel:[" << FI << ',' << BC << ']';
return;
}
+ if (Opc == AMDGPU::V_INTERP_P2_F16_opsel_gfx9) {
+ int ModIdx =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
+ uint32_t ModVal = MI->getOperand(ModIdx).getImm();
+ if (ModVal & SISrcMods::DST_OP_SEL)
+ O << " op_sel:[0,0,0,1]";
+ return;
+ }
printPackedModifier(MI, " op_sel:[", SISrcMods::OP_SEL_0, O);
}
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index aee2f2cb3d771..fcc9961b722ea 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -67,6 +67,7 @@ class VOP3Interp<string OpName, VOPProfile P, list<dag> pattern = []> :
VOP3_Pseudo<OpName, P, pattern> {
let AsmMatchConverter = "cvtVOP3Interp";
let mayRaiseFPException = 0;
+ let VOP3_OPSEL = P.HasOpSel;
}
def VOP3_INTERP : VOPProfile<[f32, f32, i32, untyped]> {
@@ -89,16 +90,17 @@ def VOP3_INTERP_MOV : VOPProfile<[f32, i32, i32, untyped]> {
let HasSrc0Mods = 0;
}
-class getInterp16Asm <bit HasSrc2, bit HasOMod> {
+class getInterp16Asm <bit HasSrc2, bit HasOMod, bit OpSel> {
string src2 = !if(HasSrc2, ", $src2_modifiers", "");
string omod = !if(HasOMod, "$omod", "");
+ string opsel = !if(OpSel, "$op_sel", "");
string ret =
- " $vdst, $src0_modifiers, $attr$attrchan"#src2#"$high$clamp"#omod;
+ " $vdst, $src0_modifiers, $attr$attrchan"#src2#"$high$clamp"#omod#opsel;
}
class getInterp16Ins <bit HasSrc2, bit HasOMod,
- Operand Src0Mod, Operand Src2Mod> {
- dag ret = !if(HasSrc2,
+ Operand Src0Mod, Operand Src2Mod, bit OpSel> {
+ dag ret1 = !if(HasSrc2,
!if(HasOMod,
(ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0,
InterpAttr:$attr, InterpAttrChan:$attrchan,
@@ -113,19 +115,22 @@ class getInterp16Ins <bit HasSrc2, bit HasOMod,
InterpAttr:$attr, InterpAttrChan:$attrchan,
highmod:$high, Clamp0:$clamp, omod0:$omod)
);
+ dag ret2 = !if(OpSel, (ins op_sel0:$op_sel), (ins));
+ dag ret = !con(ret1, ret2);
}
-class VOP3_INTERP16 <list<ValueType> ArgVT> : VOPProfile<ArgVT> {
+class VOP3_INTERP16 <list<ValueType> ArgVT, bit OpSel = 0> : VOPProfile<ArgVT> {
let IsSingle = 1;
let HasOMod = !ne(DstVT.Value, f16.Value);
let HasHigh = 1;
+ let HasOpSel = OpSel;
let Src0Mod = FPVRegInputMods;
let Src2Mod = FPVRegInputMods;
let Outs64 = (outs DstRC.RegClass:$vdst);
- let Ins64 = getInterp16Ins<HasSrc2, HasOMod, Src0Mod, Src2Mod>.ret;
- let Asm64 = getInterp16Asm<HasSrc2, HasOMod>.ret;
+ let Ins64 = getInterp16Ins<HasSrc2, HasOMod, Src0Mod, Src2Mod, OpSel>.ret;
+ let Asm64 = getInterp16Asm<HasSrc2, HasOMod, OpSel>.ret;
}
//===----------------------------------------------------------------------===//
@@ -435,7 +440,7 @@ let SubtargetPredicate = isGFX9Plus in {
defm V_MAD_U16_gfx9 : VOP3Inst_t16 <"v_mad_u16_gfx9", VOP_I16_I16_I16_I16>;
defm V_MAD_I16_gfx9 : VOP3Inst_t16 <"v_mad_i16_gfx9", VOP_I16_I16_I16_I16>;
let OtherPredicates = [isNotGFX90APlus] in
-def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>;
+def V_INTERP_P2_F16_opsel : VOP3Interp <"v_interp_p2_f16_opsel", VOP3_INTERP16<[f16, f32, i32, f32], /*OpSel*/ 1>>;
} // End SubtargetPredicate = isGFX9Plus
// This predicate should only apply to the selection pattern. The
@@ -2241,6 +2246,14 @@ multiclass VOP3Interp_F16_Real_gfx9<bits<10> op, string OpName, string AsmName>
}
}
+multiclass VOP3Interp_F16_OpSel_Real_gfx9<bits<10> op, string OpName, string AsmName> {
+ def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>,
+ VOP3Interp_OpSel_gfx9 <op, !cast<VOP3_Pseudo>(OpName).Pfl> {
+ VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName);
+ let AsmString = AsmName # ps.AsmOperands;
+ }
+}
+
multiclass VOP3_Real_gfx9<bits<10> op, string AsmName> {
def _gfx9 : VOP3_Real<!cast<VOP_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX9>,
VOP3e_vi <op, !cast<VOP_Pseudo>(NAME#"_e64").Pfl> {
@@ -2353,7 +2366,7 @@ defm V_MAD_U16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">;
defm V_MAD_I16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x205, "v_mad_i16">;
defm V_FMA_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x206, "v_fma_f16">;
defm V_DIV_FIXUP_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x207, "v_div_fixup_f16">;
-defm V_INTERP_P2_F16_gfx9 : VOP3Interp_F16_Real_gfx9 <0x277, "V_INTERP_P2_F16_gfx9", "v_interp_p2_f16">;
+defm V_INTERP_P2_F16_opsel : VOP3Interp_F16_OpSel_Real_gfx9 <0x277, "V_INTERP_P2_F16_opsel", "v_interp_p2_f16">;
defm V_ADD_I32 : VOP3_Real_vi <0x29c>;
defm V_SUB_I32 : VOP3_Real_vi <0x29d>;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index c21e2d38398fa..64f0a32d0c0a4 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -401,6 +401,10 @@ class VOP3Interp_vi <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
let Inst{49-41} = src0;
}
+class VOP3Interp_OpSel_gfx9<bits<10> op, VOPProfile p> : VOP3Interp_vi<op, p> {
+ let Inst{14} = !if(p.HasDst, src0_modifiers{3}, 0);
+}
+
class VOP3Interp_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
bits<6> attr;
bits<2> attrchan;
diff --git a/llvm/test/MC/AMDGPU/vop3-gfx9.s b/llvm/test/MC/AMDGPU/vop3-gfx9.s
index a61b0c87e199f..0a43cb9615461 100644
--- a/llvm/test/MC/AMDGPU/vop3-gfx9.s
+++ b/llvm/test/MC/AMDGPU/vop3-gfx9.s
@@ -566,6 +566,36 @@ v_interp_p2_f16 v5, v2, attr0.x, v3 clamp
// NOSICI: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
// VI: v_interp_p2_f16 v5, v2, attr0.x, v3 clamp ; encoding: [0x05,0x80,0x76,0xd2,0x00,0x04,0x0e,0x04]
+v_interp_p2_f16 v5, v2, attr0.x, v3 op_sel:[1]
+// GFX9: v_interp_p2_f16 v5, v2, attr0.x, v3 ; encoding: [0x05,0x00,0x77,0xd2,0x00,0x04,0x0e,0x04]
+// NOSICI: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-3]]:{{[0-9]+}}: error: not a valid operand.
+
+v_interp_p2_f16 v5, v2, attr0.x, v3 op_sel:[1,1]
+// GFX9: v_interp_p2_f16 v5, v2, attr0.x, v3 ; encoding: [0x05,0x00,0x77,0xd2,0x00,0x04,0x0e,0x04]
+// NOSICI: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-3]]:{{[0-9]+}}: error: not a valid operand.
+
+v_interp_p2_f16 v5, v2, attr0.x, v3 op_sel:[1,1,1]
+// GFX9: v_interp_p2_f16 v5, v2, attr0.x, v3 ; encoding: [0x05,0x00,0x77,0xd2,0x00,0x04,0x0e,0x04]
+// NOSICI: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-3]]:{{[0-9]+}}: error: not a valid operand.
+
+v_interp_p2_f16 v5, v2, attr0.x, v3 op_sel:[1,1,1,1]
+// GFX9: v_interp_p2_f16 v5, v2, attr0.x, v3 op_sel:[0,0,0,1] ; encoding: [0x05,0x40,0x77,0xd2,0x00,0x04,0x0e,0x04]
+// NOSICI: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-3]]:{{[0-9]+}}: error: not a valid operand.
+
+v_interp_p2_f16 v5, v2, attr0.x, v3 high op_sel:[0,0,0,1]
+// GFX9: v_interp_p2_f16 v5, v2, attr0.x, v3 high op_sel:[0,0,0,1] ; encoding: [0x05,0x40,0x77,0xd2,0x00,0x05,0x0e,0x04]
+// NOSICI: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-3]]:{{[0-9]+}}: error: not a valid operand.
+
+v_interp_p2_f16 v5, v2, attr0.x, v3 clamp op_sel:[0,0,0,1]
+// GFX9: v_interp_p2_f16 v5, v2, attr0.x, v3 clamp op_sel:[0,0,0,1] ; encoding: [0x05,0xc0,0x77,0xd2,0x00,0x04,0x0e,0x04]
+// NOSICI: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-3]]:{{[0-9]+}}: error: not a valid operand.
+
v_interp_p2_legacy_f16 v5, v2, attr31.x, v3
// GFX9: v_interp_p2_legacy_f16 v5, v2, attr31.x, v3 ; encoding: [0x05,0x00,0x76,0xd2,0x1f,0x04,0x0e,0x04]
// NOGCN: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt
index 802d6368507e2..35b02602e39df 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt
@@ -19311,6 +19311,15 @@
# CHECK: v_interp_p2_f16 v5, v2, attr0.x, v3 clamp ; encoding: [0x05,0x80,0x77,0xd2,0x00,0x04,0x0e,0x04]
0x05,0x80,0x77,0xd2,0x00,0x04,0x0e,0x04
+# CHECK: v_interp_p2_f16 v5, v2, attr0.x, v3 op_sel:[0,0,0,1] ; encoding: [0x05,0x40,0x77,0xd2,0x00,0x04,0x0e,0x04]
+0x05,0x40,0x77,0xd2,0x00,0x04,0x0e,0x04
+
+# CHECK: v_interp_p2_f16 v5, v2, attr0.x, v3 high op_sel:[0,0,0,1] ; encoding: [0x05,0x40,0x77,0xd2,0x00,0x05,0x0e,0x04]
+0x05,0x40,0x77,0xd2,0x00,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_f16 v5, v2, attr0.x, v3 clamp op_sel:[0,0,0,1] ; encoding: [0x05,0xc0,0x77,0xd2,0x00,0x04,0x0e,0x04]
+0x05,0xc0,0x77,0xd2,0x00,0x04,0x0e,0x04
+
# CHECK: v_add_f64 v[5:6], v[1:2], v[2:3] ; encoding: [0x05,0x00,0x80,0xd2,0x01,0x05,0x02,0x00]
0x05,0x00,0x80,0xd2,0x01,0x05,0x02,0x00
|
@llvm/pr-subscribers-backend-amdgpu Author: Jun Wang (jwanggit86) ChangesAMDGPU documentation states op_sel[3] can be used in v_interp_p2_f16 in GFX9. Full diff: https://github.com/llvm/llvm-project/pull/150712.diff 6 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 5bb0e3648d2ec..d7ce772ab1ec6 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -9164,6 +9164,26 @@ void AMDGPUAsmParser::cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands)
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::omod))
addOptionalImmOperand(Inst, Operands, OptionalIdx,
AMDGPUOperand::ImmTyOModSI);
+
+ // Some v_interp instrutions use op_sel[3] for dst.
+ if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx,
+ AMDGPUOperand::ImmTyOpSel);
+
+ int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
+ unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
+
+ // Check if op_sel[3] is set, which is meant for dst.
+ if ((OpSel & (1 << 3)) != 0) {
+ int ModIdx =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
+ uint32_t ModVal = Inst.getOperand(ModIdx).getImm();
+
+ ModVal |= SISrcMods::DST_OP_SEL;
+
+ Inst.getOperand(ModIdx).setImm(ModVal);
+ }
+ }
}
void AMDGPUAsmParser::cvtVINTERP(MCInst &Inst, const OperandVector &Operands)
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 197bb3f0b569b..b2aa407302c95 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -1293,6 +1293,14 @@ void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned,
O << " op_sel:[" << FI << ',' << BC << ']';
return;
}
+ if (Opc == AMDGPU::V_INTERP_P2_F16_opsel_gfx9) {
+ int ModIdx =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
+ uint32_t ModVal = MI->getOperand(ModIdx).getImm();
+ if (ModVal & SISrcMods::DST_OP_SEL)
+ O << " op_sel:[0,0,0,1]";
+ return;
+ }
printPackedModifier(MI, " op_sel:[", SISrcMods::OP_SEL_0, O);
}
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index aee2f2cb3d771..fcc9961b722ea 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -67,6 +67,7 @@ class VOP3Interp<string OpName, VOPProfile P, list<dag> pattern = []> :
VOP3_Pseudo<OpName, P, pattern> {
let AsmMatchConverter = "cvtVOP3Interp";
let mayRaiseFPException = 0;
+ let VOP3_OPSEL = P.HasOpSel;
}
def VOP3_INTERP : VOPProfile<[f32, f32, i32, untyped]> {
@@ -89,16 +90,17 @@ def VOP3_INTERP_MOV : VOPProfile<[f32, i32, i32, untyped]> {
let HasSrc0Mods = 0;
}
-class getInterp16Asm <bit HasSrc2, bit HasOMod> {
+class getInterp16Asm <bit HasSrc2, bit HasOMod, bit OpSel> {
string src2 = !if(HasSrc2, ", $src2_modifiers", "");
string omod = !if(HasOMod, "$omod", "");
+ string opsel = !if(OpSel, "$op_sel", "");
string ret =
- " $vdst, $src0_modifiers, $attr$attrchan"#src2#"$high$clamp"#omod;
+ " $vdst, $src0_modifiers, $attr$attrchan"#src2#"$high$clamp"#omod#opsel;
}
class getInterp16Ins <bit HasSrc2, bit HasOMod,
- Operand Src0Mod, Operand Src2Mod> {
- dag ret = !if(HasSrc2,
+ Operand Src0Mod, Operand Src2Mod, bit OpSel> {
+ dag ret1 = !if(HasSrc2,
!if(HasOMod,
(ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0,
InterpAttr:$attr, InterpAttrChan:$attrchan,
@@ -113,19 +115,22 @@ class getInterp16Ins <bit HasSrc2, bit HasOMod,
InterpAttr:$attr, InterpAttrChan:$attrchan,
highmod:$high, Clamp0:$clamp, omod0:$omod)
);
+ dag ret2 = !if(OpSel, (ins op_sel0:$op_sel), (ins));
+ dag ret = !con(ret1, ret2);
}
-class VOP3_INTERP16 <list<ValueType> ArgVT> : VOPProfile<ArgVT> {
+class VOP3_INTERP16 <list<ValueType> ArgVT, bit OpSel = 0> : VOPProfile<ArgVT> {
let IsSingle = 1;
let HasOMod = !ne(DstVT.Value, f16.Value);
let HasHigh = 1;
+ let HasOpSel = OpSel;
let Src0Mod = FPVRegInputMods;
let Src2Mod = FPVRegInputMods;
let Outs64 = (outs DstRC.RegClass:$vdst);
- let Ins64 = getInterp16Ins<HasSrc2, HasOMod, Src0Mod, Src2Mod>.ret;
- let Asm64 = getInterp16Asm<HasSrc2, HasOMod>.ret;
+ let Ins64 = getInterp16Ins<HasSrc2, HasOMod, Src0Mod, Src2Mod, OpSel>.ret;
+ let Asm64 = getInterp16Asm<HasSrc2, HasOMod, OpSel>.ret;
}
//===----------------------------------------------------------------------===//
@@ -435,7 +440,7 @@ let SubtargetPredicate = isGFX9Plus in {
defm V_MAD_U16_gfx9 : VOP3Inst_t16 <"v_mad_u16_gfx9", VOP_I16_I16_I16_I16>;
defm V_MAD_I16_gfx9 : VOP3Inst_t16 <"v_mad_i16_gfx9", VOP_I16_I16_I16_I16>;
let OtherPredicates = [isNotGFX90APlus] in
-def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>;
+def V_INTERP_P2_F16_opsel : VOP3Interp <"v_interp_p2_f16_opsel", VOP3_INTERP16<[f16, f32, i32, f32], /*OpSel*/ 1>>;
} // End SubtargetPredicate = isGFX9Plus
// This predicate should only apply to the selection pattern. The
@@ -2241,6 +2246,14 @@ multiclass VOP3Interp_F16_Real_gfx9<bits<10> op, string OpName, string AsmName>
}
}
+multiclass VOP3Interp_F16_OpSel_Real_gfx9<bits<10> op, string OpName, string AsmName> {
+ def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>,
+ VOP3Interp_OpSel_gfx9 <op, !cast<VOP3_Pseudo>(OpName).Pfl> {
+ VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName);
+ let AsmString = AsmName # ps.AsmOperands;
+ }
+}
+
multiclass VOP3_Real_gfx9<bits<10> op, string AsmName> {
def _gfx9 : VOP3_Real<!cast<VOP_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX9>,
VOP3e_vi <op, !cast<VOP_Pseudo>(NAME#"_e64").Pfl> {
@@ -2353,7 +2366,7 @@ defm V_MAD_U16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">;
defm V_MAD_I16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x205, "v_mad_i16">;
defm V_FMA_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x206, "v_fma_f16">;
defm V_DIV_FIXUP_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x207, "v_div_fixup_f16">;
-defm V_INTERP_P2_F16_gfx9 : VOP3Interp_F16_Real_gfx9 <0x277, "V_INTERP_P2_F16_gfx9", "v_interp_p2_f16">;
+defm V_INTERP_P2_F16_opsel : VOP3Interp_F16_OpSel_Real_gfx9 <0x277, "V_INTERP_P2_F16_opsel", "v_interp_p2_f16">;
defm V_ADD_I32 : VOP3_Real_vi <0x29c>;
defm V_SUB_I32 : VOP3_Real_vi <0x29d>;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index c21e2d38398fa..64f0a32d0c0a4 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -401,6 +401,10 @@ class VOP3Interp_vi <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
let Inst{49-41} = src0;
}
+class VOP3Interp_OpSel_gfx9<bits<10> op, VOPProfile p> : VOP3Interp_vi<op, p> {
+ let Inst{14} = !if(p.HasDst, src0_modifiers{3}, 0);
+}
+
class VOP3Interp_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
bits<6> attr;
bits<2> attrchan;
diff --git a/llvm/test/MC/AMDGPU/vop3-gfx9.s b/llvm/test/MC/AMDGPU/vop3-gfx9.s
index a61b0c87e199f..0a43cb9615461 100644
--- a/llvm/test/MC/AMDGPU/vop3-gfx9.s
+++ b/llvm/test/MC/AMDGPU/vop3-gfx9.s
@@ -566,6 +566,36 @@ v_interp_p2_f16 v5, v2, attr0.x, v3 clamp
// NOSICI: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
// VI: v_interp_p2_f16 v5, v2, attr0.x, v3 clamp ; encoding: [0x05,0x80,0x76,0xd2,0x00,0x04,0x0e,0x04]
+v_interp_p2_f16 v5, v2, attr0.x, v3 op_sel:[1]
+// GFX9: v_interp_p2_f16 v5, v2, attr0.x, v3 ; encoding: [0x05,0x00,0x77,0xd2,0x00,0x04,0x0e,0x04]
+// NOSICI: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-3]]:{{[0-9]+}}: error: not a valid operand.
+
+v_interp_p2_f16 v5, v2, attr0.x, v3 op_sel:[1,1]
+// GFX9: v_interp_p2_f16 v5, v2, attr0.x, v3 ; encoding: [0x05,0x00,0x77,0xd2,0x00,0x04,0x0e,0x04]
+// NOSICI: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-3]]:{{[0-9]+}}: error: not a valid operand.
+
+v_interp_p2_f16 v5, v2, attr0.x, v3 op_sel:[1,1,1]
+// GFX9: v_interp_p2_f16 v5, v2, attr0.x, v3 ; encoding: [0x05,0x00,0x77,0xd2,0x00,0x04,0x0e,0x04]
+// NOSICI: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-3]]:{{[0-9]+}}: error: not a valid operand.
+
+v_interp_p2_f16 v5, v2, attr0.x, v3 op_sel:[1,1,1,1]
+// GFX9: v_interp_p2_f16 v5, v2, attr0.x, v3 op_sel:[0,0,0,1] ; encoding: [0x05,0x40,0x77,0xd2,0x00,0x04,0x0e,0x04]
+// NOSICI: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-3]]:{{[0-9]+}}: error: not a valid operand.
+
+v_interp_p2_f16 v5, v2, attr0.x, v3 high op_sel:[0,0,0,1]
+// GFX9: v_interp_p2_f16 v5, v2, attr0.x, v3 high op_sel:[0,0,0,1] ; encoding: [0x05,0x40,0x77,0xd2,0x00,0x05,0x0e,0x04]
+// NOSICI: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-3]]:{{[0-9]+}}: error: not a valid operand.
+
+v_interp_p2_f16 v5, v2, attr0.x, v3 clamp op_sel:[0,0,0,1]
+// GFX9: v_interp_p2_f16 v5, v2, attr0.x, v3 clamp op_sel:[0,0,0,1] ; encoding: [0x05,0xc0,0x77,0xd2,0x00,0x04,0x0e,0x04]
+// NOSICI: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-3]]:{{[0-9]+}}: error: not a valid operand.
+
v_interp_p2_legacy_f16 v5, v2, attr31.x, v3
// GFX9: v_interp_p2_legacy_f16 v5, v2, attr31.x, v3 ; encoding: [0x05,0x00,0x76,0xd2,0x1f,0x04,0x0e,0x04]
// NOGCN: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt
index 802d6368507e2..35b02602e39df 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt
@@ -19311,6 +19311,15 @@
# CHECK: v_interp_p2_f16 v5, v2, attr0.x, v3 clamp ; encoding: [0x05,0x80,0x77,0xd2,0x00,0x04,0x0e,0x04]
0x05,0x80,0x77,0xd2,0x00,0x04,0x0e,0x04
+# CHECK: v_interp_p2_f16 v5, v2, attr0.x, v3 op_sel:[0,0,0,1] ; encoding: [0x05,0x40,0x77,0xd2,0x00,0x04,0x0e,0x04]
+0x05,0x40,0x77,0xd2,0x00,0x04,0x0e,0x04
+
+# CHECK: v_interp_p2_f16 v5, v2, attr0.x, v3 high op_sel:[0,0,0,1] ; encoding: [0x05,0x40,0x77,0xd2,0x00,0x05,0x0e,0x04]
+0x05,0x40,0x77,0xd2,0x00,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_f16 v5, v2, attr0.x, v3 clamp op_sel:[0,0,0,1] ; encoding: [0x05,0xc0,0x77,0xd2,0x00,0x04,0x0e,0x04]
+0x05,0xc0,0x77,0xd2,0x00,0x04,0x0e,0x04
+
# CHECK: v_add_f64 v[5:6], v[1:2], v[2:3] ; encoding: [0x05,0x00,0x80,0xd2,0x01,0x05,0x02,0x00]
0x05,0x00,0x80,0xd2,0x01,0x05,0x02,0x00
|
According to llvm/llvm-project#150712, op_sel[3] can be used in v_interp_p2_f16 in GFX9. Implement that for ACO plus make use of it via a peephole optimization in the optimizer. Noticed small perf gain in Cyberpunk.
b13b463
to
e66fea5
Compare
@@ -1299,6 +1299,14 @@ void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned, | |||
O << " op_sel:[" << FI << ',' << BC << ']'; | |||
return; | |||
} | |||
if (Opc == AMDGPU::V_INTERP_P2_F16_opsel_gfx9) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you not add this block, and let printPackedModifer handle it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is done in the latest commit. However, in printPackedModifier()
it checks for the operands src0, src1, and src2 in that order to determine the number of operands (translate to number of elements in the op_sel array to be printed). It stops as soon as an operand is missing. This particular instruction is a little unusual in that it has src0, src2, but no src1. So printPackedModifier()
would assume it only has 1 operand, src0. So in the printout, op_sel[] would have 2 elements, one for src0 and one for dst. To accommodate this instruction, I had to make a change to printPackedModifier()
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Still, because there's no src1, the printed op_sel[] only contains 3 elements, even when the input has 4 elements.
llvm/test/MC/AMDGPU/vop3-gfx9.s
Outdated
@@ -566,6 +566,36 @@ v_interp_p2_f16 v5, v2, attr0.x, v3 clamp | |||
// NOSICI: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU | |||
// VI: v_interp_p2_f16 v5, v2, attr0.x, v3 clamp ; encoding: [0x05,0x80,0x76,0xd2,0x00,0x04,0x0e,0x04] | |||
|
|||
v_interp_p2_f16 v5, v2, attr0.x, v3 op_sel:[1] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the convention here is to pass whatever op_sel bits were input to the output, not drop them. For comparison, sp3 assembler takes whatever opsel bits you specified, and puts them in the output. We do the same for v_pack_b32_f16 on gfx9. On newer targets, especially in true16 mode, we tend to give an assembler error when opsel bit is not supported on a certain. But what this does, just dropping the bits that are non-functional, is not what we usually do.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The latest commit tries to do this. That means encoding op_sel[0] in src0_modifier, op_sel[2] in src2_modifier. However, op_sel[1] has nowhere to go because the instruction has no src1. As a result, in the output op_sel[] would have 3 elements: src0, src2, and dst.
✅ With the latest revision this PR passed the C/C++ code formatter. |
1 similar comment
int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); | ||
unsigned OpSel = Inst.getOperand(OpSelIdx).getImm(); | ||
|
||
const AMDGPU::OpName Ops[] = {AMDGPU::OpName::src0, AMDGPU::OpName::src1, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks copied from cvtVINTERP. Can you outline it into a helper method?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Created a helper method, cvtOpSelHelper
, for the common code in 2 functions.
// NOSICI: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU | ||
// NOVI: :[[@LINE-3]]:{{[0-9]+}}: error: not a valid operand. | ||
|
||
v_interp_p2_f16 v5, v2, attr0.x, v3 op_sel:[1,0,1,1] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The fact that this does not round trip from assembly to disassembly will be pretty confusing for users I think. @rampitec What do you think?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, it is.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As far as I know, round-trip already is not always guaranteed for op_sel, e.g. when the input has extraneous bits. Here is an example:
$ cat z.s
v_pk_max_f16 v5, v1, v2 op_sel:[1,1,1]
v_pk_max_f16 v5, v1, v2 op_sel:[1,1,1,1]
$ ../../build/bin/llvm-mc -triple=amdgcn -mcpu=gfx908 -show-encoding z.s
v_pk_max_f16 v5, v1, v2 op_sel:[1,1] ; encoding: [0x05,0x58,0x92,0xd3,0x01,0x05,0x02,0x18]
v_pk_max_f16 v5, v1, v2 op_sel:[1,1] ; encoding: [0x05,0x58,0x92,0xd3,0x01,0x05,0x02,0x18]
$ cat z.txt
0x05,0x58,0x92,0xd3,0x01,0x05,0x02,0x18
$ ../../build/bin/llvm-mc -triple=amdgcn -mcpu=gfx908 -show-encoding -disassemble z.txt
v_pk_max_f16 v5, v1, v2 op_sel:[1,1] ; encoding: [0x05,0x58,0x92,0xd3,0x01,0x05,0x02,0x18]
Here only two bits are expected in op_sel, but the input instructions have more. The extra bits cannot be encoded. So the assembler only encodes the two valid bits, and the printer only prints two bits. The disassembler only sees two bits, and only prints two bits.
One option is for the assembler to reject the instruction when there are too many op_sel bits, which is a non-trivial change. Do you think we should do this? @rampitec @Sisyph
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We don't round trip lots of extraneous constructs, this isn't unique. I don't think it's worth putting extra effort into preserving defaulted junk bits. If they are set to non-default values, I'd almost expect them to pass through
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I find it hard to read that v_interp_p2_f16 v5, v2, attr0.x, v3 op_sel:[1,0,1,0]
&
v_interp_p2_f16 v5, v2, attr0.x, v3 op_sel:[1,1,0]
represent the same bits. For the second representation you have to know src1 does not take an opsel so the idx 1 value in the op_sel array is for src2. I don't have a huge issue if the value of junk bits change in a round trip either, but this is about which indices in the op_sel map to which source operands. I don't like the idea of doing lots of extra work to validate the number of op_sel bits either. I won't block the review over this, just pointing out it could lead to confusion.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In the given example, the input and output are different because the input is incorrect with 4 bits while the instruction only takes 3. If we don't want to do the extra work of checking the number of op_sel bits, then the extraneous bit is silently ignored, which is what happened in the given example, and, as pointed out above, is also what happens elsewhere with op_sel.
@@ -9310,6 +9310,38 @@ void AMDGPUAsmParser::cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands) | |||
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::omod)) | |||
addOptionalImmOperand(Inst, Operands, OptionalIdx, | |||
AMDGPUOperand::ImmTyOModSI); | |||
|
|||
// Some v_interp instrutions use op_sel[3] for dst. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// Some v_interp instrutions use op_sel[3] for dst. | |
// Some v_interp instructions use op_sel[3] for dst. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
// NOSICI: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU | ||
// NOVI: :[[@LINE-3]]:{{[0-9]+}}: error: not a valid operand. | ||
|
||
v_interp_p2_f16 v5, v2, attr0.x, v3 op_sel:[1,0,1,1] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We don't round trip lots of extraneous constructs, this isn't unique. I don't think it's worth putting extra effort into preserving defaulted junk bits. If they are set to non-default values, I'd almost expect them to pass through
AMDGPU documentation states op_sel[3] can be used in v_interp_p2_f16 in GFX9.
(1) In parsing, encode op_sel bits in src0_modifiers and src2_modifiers. (2) In printing, let printPackedModier() handle it, but had to modify that function a little.
e235dae
to
6aaf38e
Compare
const AMDGPU::OpName Ops[] = {AMDGPU::OpName::src0, AMDGPU::OpName::src1, | ||
AMDGPU::OpName::src2}; | ||
const AMDGPU::OpName ModOps[] = {AMDGPU::OpName::src0_modifiers, | ||
AMDGPU::OpName::src1_modifiers, | ||
AMDGPU::OpName::src2_modifiers}; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: I'd just merge these two, and then use i
and i + 3
if needed.
Also, constexpr
can be used.
@@ -1944,6 +1928,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { | |||
|
|||
void cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands); | |||
void cvtVINTERP(MCInst &Inst, const OperandVector &Operands); | |||
void cvtOpSelHelper(MCInst &Inst, unsigned OpSel); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is not clear to me what this helper does. Can you make the function name more clear, and add comment?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It converts the raw op_sel value in the asm instruction to an MCInst operand. This function was created to avoid code duplication in two functions (the two call sites).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you please add a few assembly tests with 3 values in op_sel? like '... op_sel:[1,0,1]'
With that LGTM
Good suggestion! This, however, revealed a problem in the code. Because we are now using Additionally, when we parse the instruction, the bits are interpreted differently from the printer. For example, for Is there any objection to reverting to the 1st commit? @Sisyph I'll see if I can modify it to print the original bits but I don't think we should use |
In the printer, check for this special case, i.e., instructions with src0, src2 but no src1, and ensure 4 bits for op_sel are printed. For src1 print the default value. |
Added 3 bits tests. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, LGTM
AMDGPU documentation states op_sel[3] can be used in v_interp_p2_f16 in GFX9.