From c51f487a55be8d3b98c8e260de1cdd46486e8d84 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Wed, 18 Jun 2025 11:59:39 -0700 Subject: [PATCH 1/7] [AMDGPU] Allow dpp in v_pk_fmac_f16 for GFX9 and GFX10 Allows dpp in v_pk_fmac-f16 for GFX9, and both dpp and dpp8 for GFX10. --- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 8 ++++++++ llvm/test/MC/AMDGPU/gfx10_asm_vop2.s | 12 ++++++++++++ llvm/test/MC/AMDGPU/gfx9_asm_vop2_features.s | 13 +++++++++++++ .../MC/Disassembler/AMDGPU/gfx10_vop2_dpp16.txt | 7 +++++++ .../test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp8.txt | 3 +++ .../MC/Disassembler/AMDGPU/gfx9_vop2_features.txt | 10 ++++++++++ 6 files changed, 53 insertions(+) create mode 100644 llvm/test/MC/AMDGPU/gfx9_asm_vop2_features.s create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx9_vop2_features.txt diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 0c7e20fc1ebf3..c459c4df11e9e 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -2172,6 +2172,7 @@ defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>; let IsSingle = 1 in { defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>; } +defm V_PK_FMAC_F16 : VOP2_Real_dpp_gfx10<0x03c>, VOP2_Real_dpp8_gfx10<0x03c>; // VOP2 no carry-in, carry-out. defm V_ADD_NC_U32 : @@ -2504,6 +2505,11 @@ multiclass VOP2_Real_e32e64_gfx9 op> { VOP2_DPPe(NAME#"_dpp")>; } + multiclass VOP2_Real_dpp_gfx9 op> { + if !cast(NAME#"_e32").Pfl.HasExtDPP then + def _dpp_gfx9 : VOP2_DPP16(NAME#"_dpp"), SIEncodingFamily.GFX9>; + } + } // End DecoderNamespace = "GFX9" multiclass VOP2_Real_e32e64_vi op> : @@ -2560,6 +2566,8 @@ defm V_SUBBREV_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1e, "V_SUBBREV_U32", "v_s defm V_ADD_U32 : VOP2_Real_e32e64_gfx9 <0x34>; defm V_SUB_U32 : VOP2_Real_e32e64_gfx9 <0x35>; defm V_SUBREV_U32 : VOP2_Real_e32e64_gfx9 <0x36>; + +defm V_PK_FMAC_F16 : VOP2_Real_dpp_gfx9<0x03c>; } // End AssemblerPredicate = isGFX9Only defm V_BFM_B32 : VOP2_Real_e64only_vi <0x293>; diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s b/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s index 3dcf288bbbaa5..bbd36a9140b96 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s @@ -13185,3 +13185,15 @@ v_pk_fmac_f16 v5, -4.0, v2 v_pk_fmac_f16 v5, v1, v255 // GFX10: encoding: [0x01,0xff,0x0b,0x78] + +v_pk_fmac_f16 v5, v1, v2 +// GFX10: encoding: [0x01,0x05,0x0a,0x78] + +v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] +// GFX10: encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff] + +v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 +// GFX10: encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x00] + +v_pk_fmac_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX10: encoding: [0xe9,0x04,0x0a,0x78,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx9_asm_vop2_features.s b/llvm/test/MC/AMDGPU/gfx9_asm_vop2_features.s new file mode 100644 index 0000000000000..f7dab2d0359dc --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx9_asm_vop2_features.s @@ -0,0 +1,13 @@ +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx908 -show-encoding %s | FileCheck --check-prefix=CHECK-MI %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx90a -show-encoding %s | FileCheck --check-prefix=CHECK-MI %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx942 -show-encoding %s | FileCheck --check-prefix=CHECK-MI %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefix=CHECK-MI %s + +v_pk_fmac_f16 v5, v1, v2 +// CHECK-MI: [0x01,0x05,0x0a,0x78] + +v_pk_fmac_f16 v5, v1, v2 quad_perm:[0,1,2,3] +// CHECK-MI: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff] + +v_pk_fmac_f16 v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 +// CHECK-MI: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp16.txt index 1774efa4a65c7..4a7471e6c6f98 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp16.txt @@ -2476,3 +2476,10 @@ # W32: v_cndmask_b32_dpp v5, -|v1|, -|v2|, vcc_lo quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0xf0,0xff] # W64: v_cndmask_b32_dpp v5, -|v1|, -|v2|, vcc quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0xf0,0xff] 0xfa,0x04,0x0a,0x02,0x01,0xe4,0xf0,0xff + +# GFX10: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff] +0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff + +# GFX10: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x00] +0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x00 + diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp8.txt index 40b8f24e4d72f..233f93a5b8e7d 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp8.txt @@ -222,3 +222,6 @@ # W32: v_cndmask_b32_dpp v0, v1, v2, vcc_lo dpp8:[0,1,2,3,4,5,6,7] fi:1 ; encoding: [0xea,0x04,0x00,0x02,0x01,0x88,0xc6,0xfa] # W64: v_cndmask_b32_dpp v0, v1, v2, vcc dpp8:[0,1,2,3,4,5,6,7] fi:1 ; encoding: [0xea,0x04,0x00,0x02,0x01,0x88,0xc6,0xfa] 0xea,0x04,0x00,0x02,0x01,0x88,0xc6,0xfa + +# GFX10: v_pk_fmac_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x78,0x01,0x77,0x39,0x05] +0xe9,0x04,0x0a,0x78,0x01,0x77,0x39,0x05 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop2_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop2_features.txt new file mode 100644 index 0000000000000..ac1ef4baa9aa4 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop2_features.txt @@ -0,0 +1,10 @@ +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx908 -disassemble -show-encoding < %s | FileCheck -check-prefix=CHECK-MI %s + +# CHECK-MI: v_pk_fmac_f16_e32 v5, v1, v2 +0x01,0x05,0x0a,0x78 + +# CHECK-MI: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf +0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff + +# CHECK-MI: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 +0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x00 From 76a36b2bd0b5a9f24af997caa06779047b53ea1c Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Wed, 18 Jun 2025 17:48:13 -0700 Subject: [PATCH 2/7] for gfx9, use existing VOP2_Real_e32e64_gfx9 --- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index c459c4df11e9e..8c6473a43ec4e 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -2505,11 +2505,6 @@ multiclass VOP2_Real_e32e64_gfx9 op> { VOP2_DPPe(NAME#"_dpp")>; } - multiclass VOP2_Real_dpp_gfx9 op> { - if !cast(NAME#"_e32").Pfl.HasExtDPP then - def _dpp_gfx9 : VOP2_DPP16(NAME#"_dpp"), SIEncodingFamily.GFX9>; - } - } // End DecoderNamespace = "GFX9" multiclass VOP2_Real_e32e64_vi op> : @@ -2567,7 +2562,7 @@ defm V_ADD_U32 : VOP2_Real_e32e64_gfx9 <0x34>; defm V_SUB_U32 : VOP2_Real_e32e64_gfx9 <0x35>; defm V_SUBREV_U32 : VOP2_Real_e32e64_gfx9 <0x36>; -defm V_PK_FMAC_F16 : VOP2_Real_dpp_gfx9<0x03c>; +defm V_PK_FMAC_F16 : VOP2_Real_e32e64_gfx9<0x03c>; } // End AssemblerPredicate = isGFX9Only defm V_BFM_B32 : VOP2_Real_e64only_vi <0x293>; From e16fa23f1f624321d3e49114bc439113afcaecdf Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Thu, 19 Jun 2025 09:41:31 -0700 Subject: [PATCH 3/7] for gfx10, use existing VOP2_Real_gfx10 --- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 5 +-- llvm/test/MC/AMDGPU/literalv216.s | 2 +- .../MC/Disassembler/AMDGPU/gfx10_vop2.txt | 32 +++++++++---------- .../AMDGPU/gfx10_vop3p_literalv216.txt | 2 +- 4 files changed, 19 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 8c6473a43ec4e..4ab8144b45b74 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -2169,10 +2169,7 @@ defm V_MAX_F16 : VOP2_Real_gfx10<0x039>; defm V_MIN_F16 : VOP2_Real_gfx10<0x03a>; defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>; -let IsSingle = 1 in { - defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>; -} -defm V_PK_FMAC_F16 : VOP2_Real_dpp_gfx10<0x03c>, VOP2_Real_dpp8_gfx10<0x03c>; +defm V_PK_FMAC_F16 : VOP2_Real_gfx10<0x03c>; // VOP2 no carry-in, carry-out. defm V_ADD_NC_U32 : diff --git a/llvm/test/MC/AMDGPU/literalv216.s b/llvm/test/MC/AMDGPU/literalv216.s index c695bc3600c38..f5afaa6bd6181 100644 --- a/llvm/test/MC/AMDGPU/literalv216.s +++ b/llvm/test/MC/AMDGPU/literalv216.s @@ -291,4 +291,4 @@ v_pk_add_u16 v5, v1, 123456.0 // FIXME: v_pk_fmac_f16 cannot be promoted to VOP3 so '_e32' suffix is not valid v_pk_fmac_f16 v5, 0x12345678, v2 // NOGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU -// GFX10: v_pk_fmac_f16 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12] +// GFX10: v_pk_fmac_f16_e32 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2.txt index fb1099d709940..b6cb64fad9d84 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2.txt @@ -1779,52 +1779,52 @@ # GFX10: v_or_b32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x38] 0x6a,0x04,0x0a,0x38 -# GFX10: v_pk_fmac_f16 v255, v1, v2 ; encoding: [0x01,0x05,0xfe,0x79] +# GFX10: v_pk_fmac_f16_e32 v255, v1, v2 ; encoding: [0x01,0x05,0xfe,0x79] 0x01,0x05,0xfe,0x79 -# GFX10: v_pk_fmac_f16 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x78] 0xc1,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16 v5, -4.0, v2 ; encoding: [0xf7,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, -4.0, v2 ; encoding: [0xf7,0x04,0x0a,0x78] 0xf7,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16 v5, 0, v2 ; encoding: [0x80,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, 0, v2 ; encoding: [0x80,0x04,0x0a,0x78] 0x80,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x78] 0xf0,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x78] 0x7f,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x78] 0x7e,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16 v5, m0, v2 ; encoding: [0x7c,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, m0, v2 ; encoding: [0x7c,0x04,0x0a,0x78] 0x7c,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x78] 0x01,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16 v5, s103, v2 ; encoding: [0x67,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, s103, v2 ; encoding: [0x67,0x04,0x0a,0x78] 0x67,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16 v5, ttmp11, v2 ; encoding: [0x77,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, ttmp11, v2 ; encoding: [0x77,0x04,0x0a,0x78] 0x77,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x78] 0x01,0x05,0x0a,0x78 -# GFX10: v_pk_fmac_f16 v5, v1, v255 ; encoding: [0x01,0xff,0x0b,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, v1, v255 ; encoding: [0x01,0xff,0x0b,0x78] 0x01,0xff,0x0b,0x78 -# GFX10: v_pk_fmac_f16 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x78] 0xff,0x05,0x0a,0x78 -# GFX10: v_pk_fmac_f16 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x78] 0x6b,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x78] 0x6a,0x04,0x0a,0x78 # W32: v_sub_co_ci_u32_e32 v255, vcc_lo, v1, v2, vcc_lo ; encoding: [0x01,0x05,0xfe,0x53] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt index a022c79fe97e6..97c81ed1a629a 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt @@ -144,5 +144,5 @@ # Packed VOP2 #===----------------------------------------------------------------------===// -# GFX10: v_pk_fmac_f16 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12] +# GFX10: v_pk_fmac_f16_e32 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12] 0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12 From bccf7eb5dd409e7f4b5ac3e5a7d9d507f53e412a Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Mon, 23 Jun 2025 11:49:28 -0700 Subject: [PATCH 4/7] Revert "for gfx10, use existing VOP2_Real_gfx10" This reverts commit f2d51a235dc7f85e99c2f9a54376d5530a9daddc. --- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 5 ++- llvm/test/MC/AMDGPU/literalv216.s | 2 +- .../MC/Disassembler/AMDGPU/gfx10_vop2.txt | 32 +++++++++---------- .../AMDGPU/gfx10_vop3p_literalv216.txt | 2 +- 4 files changed, 22 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 4ab8144b45b74..8c6473a43ec4e 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -2169,7 +2169,10 @@ defm V_MAX_F16 : VOP2_Real_gfx10<0x039>; defm V_MIN_F16 : VOP2_Real_gfx10<0x03a>; defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>; -defm V_PK_FMAC_F16 : VOP2_Real_gfx10<0x03c>; +let IsSingle = 1 in { + defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>; +} +defm V_PK_FMAC_F16 : VOP2_Real_dpp_gfx10<0x03c>, VOP2_Real_dpp8_gfx10<0x03c>; // VOP2 no carry-in, carry-out. defm V_ADD_NC_U32 : diff --git a/llvm/test/MC/AMDGPU/literalv216.s b/llvm/test/MC/AMDGPU/literalv216.s index f5afaa6bd6181..c695bc3600c38 100644 --- a/llvm/test/MC/AMDGPU/literalv216.s +++ b/llvm/test/MC/AMDGPU/literalv216.s @@ -291,4 +291,4 @@ v_pk_add_u16 v5, v1, 123456.0 // FIXME: v_pk_fmac_f16 cannot be promoted to VOP3 so '_e32' suffix is not valid v_pk_fmac_f16 v5, 0x12345678, v2 // NOGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU -// GFX10: v_pk_fmac_f16_e32 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12] +// GFX10: v_pk_fmac_f16 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2.txt index b6cb64fad9d84..fb1099d709940 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2.txt @@ -1779,52 +1779,52 @@ # GFX10: v_or_b32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x38] 0x6a,0x04,0x0a,0x38 -# GFX10: v_pk_fmac_f16_e32 v255, v1, v2 ; encoding: [0x01,0x05,0xfe,0x79] +# GFX10: v_pk_fmac_f16 v255, v1, v2 ; encoding: [0x01,0x05,0xfe,0x79] 0x01,0x05,0xfe,0x79 -# GFX10: v_pk_fmac_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x78] 0xc1,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16_e32 v5, -4.0, v2 ; encoding: [0xf7,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16 v5, -4.0, v2 ; encoding: [0xf7,0x04,0x0a,0x78] 0xf7,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16_e32 v5, 0, v2 ; encoding: [0x80,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16 v5, 0, v2 ; encoding: [0x80,0x04,0x0a,0x78] 0x80,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x78] 0xf0,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x78] 0x7f,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x78] 0x7e,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16_e32 v5, m0, v2 ; encoding: [0x7c,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16 v5, m0, v2 ; encoding: [0x7c,0x04,0x0a,0x78] 0x7c,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x78] 0x01,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16_e32 v5, s103, v2 ; encoding: [0x67,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16 v5, s103, v2 ; encoding: [0x67,0x04,0x0a,0x78] 0x67,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16_e32 v5, ttmp11, v2 ; encoding: [0x77,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16 v5, ttmp11, v2 ; encoding: [0x77,0x04,0x0a,0x78] 0x77,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x78] +# GFX10: v_pk_fmac_f16 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x78] 0x01,0x05,0x0a,0x78 -# GFX10: v_pk_fmac_f16_e32 v5, v1, v255 ; encoding: [0x01,0xff,0x0b,0x78] +# GFX10: v_pk_fmac_f16 v5, v1, v255 ; encoding: [0x01,0xff,0x0b,0x78] 0x01,0xff,0x0b,0x78 -# GFX10: v_pk_fmac_f16_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x78] +# GFX10: v_pk_fmac_f16 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x78] 0xff,0x05,0x0a,0x78 -# GFX10: v_pk_fmac_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x78] 0x6b,0x04,0x0a,0x78 -# GFX10: v_pk_fmac_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x78] +# GFX10: v_pk_fmac_f16 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x78] 0x6a,0x04,0x0a,0x78 # W32: v_sub_co_ci_u32_e32 v255, vcc_lo, v1, v2, vcc_lo ; encoding: [0x01,0x05,0xfe,0x53] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt index 97c81ed1a629a..a022c79fe97e6 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt @@ -144,5 +144,5 @@ # Packed VOP2 #===----------------------------------------------------------------------===// -# GFX10: v_pk_fmac_f16_e32 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12] +# GFX10: v_pk_fmac_f16 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12] 0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12 From e9acb6cfc105b68e839d3fd9c89a5300fb06b4ac Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Mon, 23 Jun 2025 11:59:14 -0700 Subject: [PATCH 5/7] GFX9 has SDWA instruction while GFX10 does not. Add tests to verify. --- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 3 +- llvm/test/MC/AMDGPU/gfx10_unsupported_sdwa.s | 3 + llvm/test/MC/AMDGPU/gfx9_asm_vop2_features.s | 78 ++++++++++++++++++++ 3 files changed, 82 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 8c6473a43ec4e..c839c4a2f07b7 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -2561,8 +2561,7 @@ defm V_SUBBREV_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1e, "V_SUBBREV_U32", "v_s defm V_ADD_U32 : VOP2_Real_e32e64_gfx9 <0x34>; defm V_SUB_U32 : VOP2_Real_e32e64_gfx9 <0x35>; defm V_SUBREV_U32 : VOP2_Real_e32e64_gfx9 <0x36>; - -defm V_PK_FMAC_F16 : VOP2_Real_e32e64_gfx9<0x03c>; +defm V_PK_FMAC_F16 : VOP2_Real_e32e64_gfx9<0x03c>; } // End AssemblerPredicate = isGFX9Only defm V_BFM_B32 : VOP2_Real_e64only_vi <0x293>; diff --git a/llvm/test/MC/AMDGPU/gfx10_unsupported_sdwa.s b/llvm/test/MC/AMDGPU/gfx10_unsupported_sdwa.s index 88db110ad9c20..681b34e4c1c56 100644 --- a/llvm/test/MC/AMDGPU/gfx10_unsupported_sdwa.s +++ b/llvm/test/MC/AMDGPU/gfx10_unsupported_sdwa.s @@ -32,6 +32,9 @@ v_min_u16_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD s v_mul_lo_u16_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: sdwa variant of this instruction is not supported +v_pk_fmac_f16_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: sdwa variant of this instruction is not supported + v_sub_co_u32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: sdwa variant of this instruction is not supported diff --git a/llvm/test/MC/AMDGPU/gfx9_asm_vop2_features.s b/llvm/test/MC/AMDGPU/gfx9_asm_vop2_features.s index f7dab2d0359dc..4b5efd00a7adf 100644 --- a/llvm/test/MC/AMDGPU/gfx9_asm_vop2_features.s +++ b/llvm/test/MC/AMDGPU/gfx9_asm_vop2_features.s @@ -11,3 +11,81 @@ v_pk_fmac_f16 v5, v1, v2 quad_perm:[0,1,2,3] v_pk_fmac_f16 v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 // CHECK-MI: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x00] + +v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x06,0x06,0x06] + +v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x06,0x06,0x06] + +v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x00,0x06,0x06] + +v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x01,0x06,0x06] + +v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x02,0x06,0x06] + +v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x03,0x06,0x06] + +v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x04,0x06,0x06] + +v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x05,0x06,0x06] + +v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD +// CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x0e,0x06,0x06] + +v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD +// CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x16,0x06,0x06] + +v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD +// CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x16,0x06,0x06] + +v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x06,0x06,0x06] + +v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +// CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x06,0x00,0x06] + +v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +// CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x06,0x01,0x06] + +v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD +// CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x06,0x02,0x06] + +v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +// CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x06,0x03,0x06] + +v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +// CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x06,0x04,0x06] + +v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +// CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x06,0x05,0x06] + +v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x06,0x06,0x06] + +v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +// CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x06,0x06,0x00] + +v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +// CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x06,0x06,0x01] + +v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +// CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x06,0x06,0x02] + +v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +// CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x06,0x06,0x03] + +v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +// CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x06,0x06,0x04] + +v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +// CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x06,0x06,0x05] + +v_pk_fmac_f16_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x06,0x06,0x16] From ff2c36883a7237675e690e8a4ac643fa2d283fed Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Mon, 23 Jun 2025 16:46:32 -0700 Subject: [PATCH 6/7] (1) A new multiclass for gfx10 (2) tests for disassembler --- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 5 +- .../AMDGPU/gfx9_vop2_features.txt | 82 +++++++++++++++++++ 2 files changed, 86 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index c839c4a2f07b7..1fb66b7c87b8a 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -1945,6 +1945,9 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { if !cast(NAME#"_e32").Pfl.HasExt32BitDPP then def _dpp8_gfx10 : VOP2_DPP8(NAME#"_e32")>; } + multiclass VOP2_Real_dpp_dpp8_gfx10 op> : + VOP2_Real_dpp_gfx10, + VOP2_Real_dpp8_gfx10; //===------------------------- VOP2 (with name) -------------------------===// multiclass VOP2_Real_e32_gfx10_with_name op, string opName, @@ -2172,7 +2175,7 @@ defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>; let IsSingle = 1 in { defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>; } -defm V_PK_FMAC_F16 : VOP2_Real_dpp_gfx10<0x03c>, VOP2_Real_dpp8_gfx10<0x03c>; +defm V_PK_FMAC_F16 : VOP2_Real_dpp_dpp8_gfx10<0x03c>; // VOP2 no carry-in, carry-out. defm V_ADD_NC_U32 : diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop2_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop2_features.txt index ac1ef4baa9aa4..2b8d58853847b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop2_features.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop2_features.txt @@ -1,4 +1,7 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx908 -disassemble -show-encoding < %s | FileCheck -check-prefix=CHECK-MI %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx90a -disassemble -show-encoding < %s | FileCheck -check-prefix=CHECK-MI %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx942 -disassemble -show-encoding < %s | FileCheck -check-prefix=CHECK-MI %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -disassemble -show-encoding < %s | FileCheck -check-prefix=CHECK-MI %s # CHECK-MI: v_pk_fmac_f16_e32 v5, v1, v2 0x01,0x05,0x0a,0x78 @@ -8,3 +11,82 @@ # CHECK-MI: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x00 + +# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +0xf9,0x04,0x0a,0x78,0x01,0x06,0x06,0x06 + +# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +0xf9,0x04,0x0a,0x78,0x01,0x06,0x06,0x06 + +# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +0xf9,0x04,0x0a,0x78,0x01,0x00,0x06,0x06 + +# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +0xf9,0x04,0x0a,0x78,0x01,0x01,0x06,0x06 + +# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +0xf9,0x04,0x0a,0x78,0x01,0x02,0x06,0x06 + +# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +0xf9,0x04,0x0a,0x78,0x01,0x03,0x06,0x06 + +# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +0xf9,0x04,0x0a,0x78,0x01,0x04,0x06,0x06 + +# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +0xf9,0x04,0x0a,0x78,0x01,0x05,0x06,0x06 + +# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD +0xf9,0x04,0x0a,0x78,0x01,0x0e,0x06,0x06 + +# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD +0xf9,0x04,0x0a,0x78,0x01,0x16,0x06,0x06 + +# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD +0xf9,0x04,0x0a,0x78,0x01,0x16,0x06,0x06 + +# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +0xf9,0x04,0x0a,0x78,0x01,0x06,0x06,0x06 + +# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +0xf9,0x04,0x0a,0x78,0x01,0x06,0x00,0x06 + +# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +0xf9,0x04,0x0a,0x78,0x01,0x06,0x01,0x06 + +# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD +0xf9,0x04,0x0a,0x78,0x01,0x06,0x02,0x06 + +# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +0xf9,0x04,0x0a,0x78,0x01,0x06,0x03,0x06 + +# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +0xf9,0x04,0x0a,0x78,0x01,0x06,0x04,0x06 + +# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +0xf9,0x04,0x0a,0x78,0x01,0x06,0x05,0x06 + +# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +0xf9,0x04,0x0a,0x78,0x01,0x06,0x06,0x06 + +# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +0xf9,0x04,0x0a,0x78,0x01,0x06,0x06,0x00 + +# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +0xf9,0x04,0x0a,0x78,0x01,0x06,0x06,0x01 + +# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +0xf9,0x04,0x0a,0x78,0x01,0x06,0x06,0x02 + +# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +0xf9,0x04,0x0a,0x78,0x01,0x06,0x06,0x03 + +# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +0xf9,0x04,0x0a,0x78,0x01,0x06,0x06,0x04 + +# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +0xf9,0x04,0x0a,0x78,0x01,0x06,0x06,0x05 + +# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +0xf9,0x04,0x0a,0x78,0x01,0x06,0x06,0x16 + From 39fcf976f0dc82efa3422240520471ea045a7986 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Tue, 24 Jun 2025 10:33:04 -0700 Subject: [PATCH 7/7] Combine two defs into one --- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 1fb66b7c87b8a..19c13804dd330 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -1945,7 +1945,12 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { if !cast(NAME#"_e32").Pfl.HasExt32BitDPP then def _dpp8_gfx10 : VOP2_DPP8(NAME#"_e32")>; } - multiclass VOP2_Real_dpp_dpp8_gfx10 op> : + multiclass VOP2Only_Real_e32_gfx10 op> { + let IsSingle = 1 in + defm NAME: VOP2_Real_e32_gfx10; + } + multiclass VOP2_Real_e32_dpp_dpp8_gfx10 op> : + VOP2Only_Real_e32_gfx10, VOP2_Real_dpp_gfx10, VOP2_Real_dpp8_gfx10; @@ -2171,11 +2176,7 @@ defm V_FMAAK_F16 : VOP2Only_Real_MADK_gfx10<0x038>; defm V_MAX_F16 : VOP2_Real_gfx10<0x039>; defm V_MIN_F16 : VOP2_Real_gfx10<0x03a>; defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>; - -let IsSingle = 1 in { - defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>; -} -defm V_PK_FMAC_F16 : VOP2_Real_dpp_dpp8_gfx10<0x03c>; +defm V_PK_FMAC_F16 : VOP2_Real_e32_dpp_dpp8_gfx10<0x03c>; // VOP2 no carry-in, carry-out. defm V_ADD_NC_U32 :