AMDGPU: MC support for v_cvt_scalef32_pk_fp4_{f|bf}16 on gfx950. (#117594)

arsenm · Pravin Jagtap · web-flow · commit d727b6f7778d · 2024-11-25T19:37:04.000-08:00
These instructions have non-standard use of OPSEL bits to select
dest write byte. The src2_modifiers operand is used without having
its corresponding src2 operand by introducing dummy src2.

Co-authored-by: Pravin Jagtap &lt;Pravin.Jagtap@amd.com&gt;
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -8824,7 +8824,9 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
 
   const bool IsPacked = (Desc.TSFlags & SIInstrFlags::IsPacked) != 0;
 
-  if (Opc == AMDGPU::V_CVT_SR_BF8_F32_vi ||
+  if (Opc == AMDGPU::V_CVT_SCALEF32_PK_FP4_F16_vi ||
+      Opc == AMDGPU::V_CVT_SCALEF32_PK_FP4_BF16_vi ||
+      Opc == AMDGPU::V_CVT_SR_BF8_F32_vi ||
       Opc == AMDGPU::V_CVT_SR_FP8_F32_vi ||
       Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_gfx12 ||
       Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_gfx12) {
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -899,6 +899,23 @@ def VOP3_CVT_SCALE_FP4FP8BF8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, f3
   let HasOMod = 0;
 }
 
+def VOP3_CVT_SCALE_FP4_F16BF16_Profile : VOP3_Profile<VOPProfile<[i32, v2f16, f32, f32]>,
+                                              VOP3_OPSEL> {
+  let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0,
+                          FP32InputMods:$src1_modifiers, Src1RC64:$src1,
+                          FP32InputMods:$src2_modifiers, VGPR_32:$src2,
+                          op_sel0:$op_sel);
+  let HasClamp = 0;
+  let HasSrc2 = 0;
+  let HasSrc2Mods = 1;
+  let HasOpSel = 1;
+  let AsmVOP3OpSel = !subst(", $src2_modifiers", "",
+                            getAsmVOP3OpSel<3, HasClamp, HasOMod,
+                                            HasSrc0FloatMods, HasSrc1FloatMods,
+                                            HasSrc2FloatMods>.ret);
+  let HasExtVOP3DPP = 0;
+}
+
 class VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<ValueType DstTy> : VOP3_Profile<VOPProfile<[DstTy, i32, f32, untyped]>,
                                               VOP3_OPSEL> {
   let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0,
@@ -965,6 +982,13 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in
   defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f32", VOP3_CVT_SCALE_FP4FP8BF8_F32_Profile>;
   defm V_CVT_SCALEF32_PK_F16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f16>>;
   defm V_CVT_SCALEF32_PK_BF16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_bf16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2bf16>>;
+
+  // These instructions have non-standard use of op_sel. In particular they are
+  // using op_sel bits 2 and 3 while only having two sources.
+  let Constraints = "$vdst = $src2", DisableEncoding = "$src2" in {
+    defm V_CVT_SCALEF32_PK_FP4_F16 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f16", VOP3_CVT_SCALE_FP4_F16BF16_Profile>;
+    defm V_CVT_SCALEF32_PK_FP4_BF16 : VOP3Inst<"v_cvt_scalef32_pk_fp4_bf16", VOP3_CVT_SCALE_FP4_F16BF16_Profile>;
+  }
 }
 
 let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0 in {
@@ -1930,6 +1954,8 @@ defm V_CVT_SCALEF32_PK_F32_FP4 : VOP3OpSel_Real_gfx9 <0x23f>;
 defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3OpSel_Real_gfx9 <0x23d>;
 defm V_CVT_SCALEF32_PK_F16_FP4 : VOP3OpSel_Real_gfx9 <0x250>;
 defm V_CVT_SCALEF32_PK_BF16_FP4 : VOP3OpSel_Real_gfx9 <0x251>;
+defm V_CVT_SCALEF32_PK_FP4_F16 : VOP3OpSel_Real_gfx9_forced_opsel2 <0x24c>;
+defm V_CVT_SCALEF32_PK_FP4_BF16: VOP3OpSel_Real_gfx9_forced_opsel2 <0x24d>;
 }
 let OtherPredicates = [HasFP6BF6ConversionScaleInsts] in {
 defm V_CVT_SCALEF32_PK32_F32_FP6 : VOP3_Real_gfx9<0x256, "v_cvt_scalef32_pk32_f32_fp6">;
diff --git a/llvm/test/MC/AMDGPU/gfx950_asm_features.s b/llvm/test/MC/AMDGPU/gfx950_asm_features.s
@@ -1025,3 +1025,43 @@ v_cvt_scalef32_pk_bf16_bf8 v1, v2, s3 op_sel:[1,0,0]
 // NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
 // GFX950: v_cvt_scalef32_pk_bf16_bf8 v1, s2, 3 op_sel:[1,0,0] ; encoding: [0x01,0x08,0x6a,0xd2,0x02,0x06,0x01,0x00]
 v_cvt_scalef32_pk_bf16_bf8 v1, s2, 3 op_sel:[1,0,0]
+
+// NOT-GFX950: error: instruction not supported on this GPU
+// GFX950: v_cvt_scalef32_pk_fp4_f16 v1, v2, v3    ; encoding: [0x01,0x00,0x4c,0xd2,0x02,0x07,0x02,0x00]
+v_cvt_scalef32_pk_fp4_f16 v1, v2, v3
+
+// NOT-GFX950: error: instruction not supported on this GPU
+// GFX950: v_cvt_scalef32_pk_fp4_f16 v1, s2, 3     ; encoding: [0x01,0x00,0x4c,0xd2,0x02,0x06,0x01,0x00]
+v_cvt_scalef32_pk_fp4_f16 v1, s2, 3
+
+// NOT-GFX950: error: instruction not supported on this GPU
+// GFX950: v_cvt_scalef32_pk_fp4_f16 v1, v2, v3 op_sel:[0,0,1,1] ; encoding: [0x01,0x60,0x4c,0xd2,0x02,0x07,0x02,0x00]
+v_cvt_scalef32_pk_fp4_f16 v1, v2, v3 op_sel:[0,0,1,1]
+
+// NOT-GFX950: error: instruction not supported on this GPU
+// GFX950: v_cvt_scalef32_pk_fp4_f16 v1, v2, v3 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x4c,0xd2,0x02,0x07,0x02,0x00]
+v_cvt_scalef32_pk_fp4_f16 v1, v2, v3 op_sel:[0,0,0,1]
+
+// NOT-GFX950: error: instruction not supported on this GPU
+// GFX950: v_cvt_scalef32_pk_fp4_f16 v1, -|s2|, v3 ; encoding: [0x01,0x01,0x4c,0xd2,0x02,0x06,0x02,0x20]
+v_cvt_scalef32_pk_fp4_f16 v1, -|s2|, v3
+
+// NOT-GFX950: error: instruction not supported on this GPU
+// GFX950: v_cvt_scalef32_pk_fp4_bf16 v1, v2, v3   ; encoding: [0x01,0x00,0x4d,0xd2,0x02,0x07,0x02,0x00]
+v_cvt_scalef32_pk_fp4_bf16 v1, v2, v3
+
+// NOT-GFX950: error: instruction not supported on this GPU
+// GFX950: v_cvt_scalef32_pk_fp4_bf16 v1, s2, 3    ; encoding: [0x01,0x00,0x4d,0xd2,0x02,0x06,0x01,0x00]
+v_cvt_scalef32_pk_fp4_bf16 v1, s2, 3
+
+// NOT-GFX950: error: instruction not supported on this GPU
+// GFX950: v_cvt_scalef32_pk_fp4_bf16 v1, v2, v3 op_sel:[0,0,1,1] ; encoding: [0x01,0x60,0x4d,0xd2,0x02,0x07,0x02,0x00]
+v_cvt_scalef32_pk_fp4_bf16 v1, v2, v3 op_sel:[0,0,1,1]
+
+// NOT-GFX950: error: instruction not supported on this GPU
+// GFX950: v_cvt_scalef32_pk_fp4_bf16 v1, v2, v3 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x4d,0xd2,0x02,0x07,0x02,0x00]
+v_cvt_scalef32_pk_fp4_bf16 v1, v2, v3 op_sel:[0,0,0,1]
+
+// NOT-GFX950: error: instruction not supported on this GPU
+// GFX950: v_cvt_scalef32_pk_fp4_bf16 v1, -|s2|, v3 ; encoding: [0x01,0x01,0x4d,0xd2,0x02,0x06,0x02,0x20]
+v_cvt_scalef32_pk_fp4_bf16 v1, -|s2|, v3
diff --git a/llvm/test/MC/AMDGPU/gfx950_err.s b/llvm/test/MC/AMDGPU/gfx950_err.s
@@ -293,3 +293,27 @@ v_cvt_scalef32_pk_bf16_bf8 v[20:25], v[10:25], v8 div:2
 
 // GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
 v_cvt_scalef32_pk_bf16_bf8 v[20:25], v[10:25], v8 clamp div:2
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk_fp4_f16 v1, v2, v3 clamp
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
+v_cvt_scalef32_pk_fp4_f16 v1, v2, v3 mul:2
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
+v_cvt_scalef32_pk_fp4_f16 v1, v2, v3 div:2
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
+v_cvt_scalef32_pk_fp4_f16 v1, v2, v3 clamp div:2
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk_fp4_bf16 v1, v2, v3 clamp
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
+v_cvt_scalef32_pk_fp4_bf16 v1, v2, v3 mul:2
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
+v_cvt_scalef32_pk_fp4_bf16 v1, v2, v3 div:2
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand
+v_cvt_scalef32_pk_fp4_bf16 v1, v2, v3 clamp div:2
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt
@@ -713,3 +713,33 @@
 
 # GFX950: v_cvt_scalef32_pk_bf16_bf8 v1, s2, 3 op_sel:[1,0,0] ; encoding: [0x01,0x08,0x6a,0xd2,0x02,0x06,0x01,0x00]
 0x01,0x08,0x6a,0xd2,0x02,0x06,0x01,0x00
+
+# GFX950: v_cvt_scalef32_pk_fp4_f16 v1, v2, v3    ; encoding: [0x01,0x00,0x4c,0xd2,0x02,0x07,0x02,0x00]
+0x01,0x00,0x4c,0xd2,0x02,0x07,0x02,0x00
+
+# GFX950: v_cvt_scalef32_pk_fp4_f16 v1, s2, 3     ; encoding: [0x01,0x00,0x4c,0xd2,0x02,0x06,0x01,0x00]
+0x01,0x00,0x4c,0xd2,0x02,0x06,0x01,0x00
+
+# GFX950: v_cvt_scalef32_pk_fp4_f16 v1, v2, v3 op_sel:[0,0,1,1] ; encoding: [0x01,0x60,0x4c,0xd2,0x02,0x07,0x02,0x00]
+0x01,0x60,0x4c,0xd2,0x02,0x07,0x02,0x00
+
+# GFX950: v_cvt_scalef32_pk_fp4_f16 v1, v2, v3 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x4c,0xd2,0x02,0x07,0x02,0x00]
+0x01,0x40,0x4c,0xd2,0x02,0x07,0x02,0x00
+
+# GFX950: v_cvt_scalef32_pk_fp4_f16 v1, -|s2|, v3 ; encoding: [0x01,0x01,0x4c,0xd2,0x02,0x06,0x02,0x20]
+0x01,0x01,0x4c,0xd2,0x02,0x06,0x02,0x20
+
+# GFX950: v_cvt_scalef32_pk_fp4_bf16 v1, v2, v3   ; encoding: [0x01,0x00,0x4d,0xd2,0x02,0x07,0x02,0x00]
+0x01,0x00,0x4d,0xd2,0x02,0x07,0x02,0x00
+
+# GFX950: v_cvt_scalef32_pk_fp4_bf16 v1, s2, 3    ; encoding: [0x01,0x00,0x4d,0xd2,0x02,0x06,0x01,0x00]
+0x01,0x00,0x4d,0xd2,0x02,0x06,0x01,0x00
+
+# GFX950: v_cvt_scalef32_pk_fp4_bf16 v1, v2, v3 op_sel:[0,0,1,1] ; encoding: [0x01,0x60,0x4d,0xd2,0x02,0x07,0x02,0x00]
+0x01,0x60,0x4d,0xd2,0x02,0x07,0x02,0x00
+
+# GFX950: v_cvt_scalef32_pk_fp4_bf16 v1, v2, v3 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x4d,0xd2,0x02,0x07,0x02,0x00]
+0x01,0x40,0x4d,0xd2,0x02,0x07,0x02,0x00
+
+# GFX950: v_cvt_scalef32_pk_fp4_bf16 v1, -|s2|, v3 ; encoding: [0x01,0x01,0x4d,0xd2,0x02,0x06,0x02,0x20]
+0x01,0x01,0x4d,0xd2,0x02,0x06,0x02,0x20