[AMDGPU] v_cvt_sr_pk_f16_f32 gfx1250 instruction #151482

rampitec · 2025-07-31T09:37:43Z

No description provided.

rampitec · 2025-07-31T09:37:59Z

[AMDGPU] v_cvt_sr_pk_f16_f32 gfx1250 instruction #151482 👈 (View in Graphite)
[AMDGPU] Add v_cvt_pk_f16_f32 instruction for gfx1250 #151469
main

This stack of pull requests is managed by Graphite. Learn more about stacking.

llvmbot · 2025-07-31T09:39:28Z

@llvm/pr-subscribers-mc
@llvm/pr-subscribers-backend-amdgpu
@llvm/pr-subscribers-clang

@llvm/pr-subscribers-llvm-ir

Author: Stanislav Mekhanoshin (rampitec)

Changes

Patch is 43.31 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151482.diff

16 Files Affected:

(modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+1)
(modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl (+27)
(modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+4)
(modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+1)
(modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+1)
(modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+3)
(added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.f16.ll (+64)
(modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s (+45)
(modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s (+45)
(modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp16-fake16.s (+48)
(modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp16.s (+48)
(modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp8-fake16.s (+40)
(modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp8.s (+40)
(modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt (+45)
(modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_dpp16.txt (+36)
(modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_dpp8.txt (+30)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 172ac467f7cad..5b1c14ec5a17e 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -698,6 +698,7 @@ TARGET_BUILTIN(__builtin_amdgcn_sin_bf16, "yy", "nc", "bf16-trans-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cos_bf16, "yy", "nc", "bf16-trans-insts")
 
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_pk_bf16_f32, "V2yffi", "nc", "bf16-cvt-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_pk_f16_f32, "V2hffi", "nc", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 1c67fc3879bff..77d56739b9eb6 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -282,6 +282,33 @@ void test_cvt_sr_pk_bf16_f32(global bfloat2* out, float a, float b, uint sr)
   *out = __builtin_amdgcn_cvt_sr_pk_bf16_f32(a, b, sr);
 }
 
+// CHECK-LABEL: @test_cvt_sr_pk_f16_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[SR_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// CHECK-NEXT:    [[SR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SR_ADDR]] to ptr
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store float [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store float [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[SR:%.*]], ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[B_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = call <2 x half> @llvm.amdgcn.cvt.sr.pk.f16.f32(float [[TMP0]], float [[TMP1]], i32 [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store <2 x half> [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
+// CHECK-NEXT:    ret void
+//
+void test_cvt_sr_pk_f16_f32(global half2* out, float a, float b, uint sr)
+{
+  *out = __builtin_amdgcn_cvt_sr_pk_f16_f32(a, b, sr);
+}
+
 // CHECK-LABEL: @test_cvt_f16_fp8(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 4a50558ca8e86..e57f9b3b44bfc 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -593,6 +593,10 @@ def int_amdgcn_tanh : DefaultAttrsIntrinsic<
   [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
 >;
 
+def int_amdgcn_cvt_sr_pk_f16_f32 : DefaultAttrsIntrinsic<
+  [llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable]
+>, ClangBuiltin<"__builtin_amdgcn_cvt_sr_pk_f16_f32">;
+
 def int_amdgcn_cvt_sr_pk_bf16_f32 : DefaultAttrsIntrinsic<
   [llvm_v2bf16_ty], [llvm_float_ty, llvm_float_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable]
 >, ClangBuiltin<"__builtin_amdgcn_cvt_sr_pk_bf16_f32">;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 6bca2fec17c6d..c3b14abb736f5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4574,6 +4574,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_cvt_pknorm_u16:
     case Intrinsic::amdgcn_cvt_pk_i16:
     case Intrinsic::amdgcn_cvt_pk_u16:
+    case Intrinsic::amdgcn_cvt_sr_pk_f16_f32:
     case Intrinsic::amdgcn_cvt_sr_pk_bf16_f32:
     case Intrinsic::amdgcn_cvt_pk_f16_fp8:
     case Intrinsic::amdgcn_cvt_pk_f16_bf8:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index efcc88e564e65..c5931fcd5d909 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2928,6 +2928,7 @@ def VOP_V32F32_V6I32_F32 : VOPProfile <[v32f32, v6i32, f32, untyped]>;
 def VOP_V32F16_V6I32_F32 : VOPProfile <[v32f16, v6i32, f32, untyped]>;
 def VOP_V32BF16_V6I32_F32 : VOPProfile <[v32bf16, v6i32, f32, untyped]>;
 def VOP_V2BF16_F32_F32_I32 : VOPProfile <[v2bf16, f32, f32, i32]>;
+def VOP_V2F16_F32_F32_I32 : VOPProfile <[v2f16, f32, f32, i32]>;
 def VOP_V6I32_V32F16_F32 : VOPProfile<[v6i32, v32f16, f32, untyped]>;
 def VOP_V6I32_V32BF16_F32 : VOPProfile<[v6i32, v32bf16, f32, untyped]>;
 def VOP_V6I32_V16F32_V16F32_F32 : VOPProfile<[v6i32, v16f32, v16f32, f32]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 96fe503c369ad..cfc5fe519a3c1 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1650,6 +1650,8 @@ def VOP3_CVT_SR_F8_F16_Fake16_Profile : VOP3_Profile_Fake16<VOP3_CVT_SR_F8_F16_P
 
 let SubtargetPredicate = isGFX1250Plus in {
   let ReadsModeReg = 0 in {
+    defm V_CVT_SR_PK_F16_F32 : VOP3Inst<"v_cvt_sr_pk_f16_f32", VOP3_Profile<VOP_V2F16_F32_F32_I32>, int_amdgcn_cvt_sr_pk_f16_f32>;
+
     // These instructions have non-standard use of op_sel. They are using bits 2 and 3 of opsel
     // to select a byte in the vdst. Bits 0 and 1 are unused.
     let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
@@ -2067,6 +2069,7 @@ defm V_ASHR_PK_U8_I32                : VOP3Only_Realtriple_gfx1250<0x291>;
 defm V_CVT_PK_BF16_F32               : VOP3Only_Realtriple_gfx1250<0x36d>;
 defm V_CVT_SR_PK_BF16_F32            : VOP3Only_Realtriple_gfx1250<0x36e>;
 defm V_CVT_PK_F16_F32                : VOP3Only_Realtriple_gfx1250<0x36f>;
+defm V_CVT_SR_PK_F16_F32             : VOP3Only_Realtriple_gfx1250<0x370>;
 defm V_CVT_PK_FP8_F16_gfx1250        : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x372, "v_cvt_pk_fp8_f16">;
 defm V_CVT_PK_BF8_F16_gfx1250        : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x373, "v_cvt_pk_bf8_f16">;
 defm V_CVT_SR_FP8_F16                : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x374>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.f16.ll
new file mode 100644
index 0000000000000..2179800f9d317
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.f16.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s
+
+declare <2 x half> @llvm.amdgcn.cvt.sr.pk.f16.f32(float, float, i32) #0
+
+define amdgpu_ps float @cvt_sr_pk_f16_f32_vvv(float %src0, float %src1, i32 %src2) #1 {
+; GCN-LABEL: cvt_sr_pk_f16_f32_vvv:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_cvt_sr_pk_f16_f32 v0, v0, v1, v2
+; GCN-NEXT:    ; return to shader part epilog
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.sr.pk.f16.f32(float %src0, float %src1, i32 %src2) #0
+  %ret = bitcast <2 x half> %cvt to float
+  ret float %ret
+}
+
+define amdgpu_ps float @cvt_sr_pk_f16_f32_sss(float inreg %src0, float inreg %src1, i32 inreg %src2) #1 {
+; GCN-LABEL: cvt_sr_pk_f16_f32_sss:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_cvt_sr_pk_f16_f32 v0, s0, s1, v0
+; GCN-NEXT:    ; return to shader part epilog
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.sr.pk.f16.f32(float %src0, float %src1, i32 %src2) #0
+  %ret = bitcast <2 x half> %cvt to float
+  ret float %ret
+}
+
+define amdgpu_ps float @cvt_sr_pk_f16_f32_vvi(float %src0, float %src1) #1 {
+; GCN-LABEL: cvt_sr_pk_f16_f32_vvi:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_cvt_sr_pk_f16_f32 v0, v0, v1, 0x10002
+; GCN-NEXT:    ; return to shader part epilog
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.sr.pk.f16.f32(float %src0, float %src1, i32 65538) #0
+  %ret = bitcast <2 x half> %cvt to float
+  ret float %ret
+}
+
+define amdgpu_ps float @cvt_sr_pk_f16_f32_vvi_mods(float %src0, float %src1) #1 {
+; GCN-LABEL: cvt_sr_pk_f16_f32_vvi_mods:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_cvt_sr_pk_f16_f32 v0, -v0, |v1|, 1
+; GCN-NEXT:    ; return to shader part epilog
+  %s0 = fneg float %src0
+  %s1 = call float @llvm.fabs.f32(float %src1) #0
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.sr.pk.f16.f32(float %s0, float %s1, i32 1) #0
+  %ret = bitcast <2 x half> %cvt to float
+  ret float %ret
+}
+
+define amdgpu_ps float @cvt_sr_pk_f16_f32_ssi(float inreg %src0, float inreg %src1) #1 {
+; GCN-LABEL: cvt_sr_pk_f16_f32_ssi:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_cvt_sr_pk_f16_f32 v0, s0, s1, 1
+; GCN-NEXT:    ; return to shader part epilog
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.sr.pk.f16.f32(float %src0, float %src1, i32 1) #0
+  %ret = bitcast <2 x half> %cvt to float
+  ret float %ret
+}
+
+declare float @llvm.fabs.f32(float) #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s
index 7d4f28f28b4c5..70090ac6bc3a4 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s
@@ -512,6 +512,51 @@ v_cvt_pk_f16_f32 v5, src_scc, vcc_lo mul:4
 v_cvt_pk_f16_f32 v255, -|0xaf123456|, vcc_hi clamp div:2
 // GFX1250: v_cvt_pk_f16_f32 v255, -|0xaf123456|, vcc_hi clamp div:2 ; encoding: [0xff,0x81,0x6f,0xd7,0xff,0xd6,0x00,0x38,0x56,0x34,0x12,0xaf]
 
+v_cvt_sr_pk_f16_f32 v5, v1, v2, s3
+// GFX1250: v_cvt_sr_pk_f16_f32 v5, v1, v2, s3      ; encoding: [0x05,0x00,0x70,0xd7,0x01,0x05,0x0e,0x00]
+
+v_cvt_sr_pk_f16_f32 v5, v255, s2, s105
+// GFX1250: v_cvt_sr_pk_f16_f32 v5, v255, s2, s105  ; encoding: [0x05,0x00,0x70,0xd7,0xff,0x05,0xa4,0x01]
+
+v_cvt_sr_pk_f16_f32 v5, s1, v255, exec_hi
+// GFX1250: v_cvt_sr_pk_f16_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x70,0xd7,0x01,0xfe,0xff,0x01]
+
+v_cvt_sr_pk_f16_f32 v5, s105, s105, exec_lo
+// GFX1250: v_cvt_sr_pk_f16_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x70,0xd7,0x69,0xd2,0xf8,0x01]
+
+v_cvt_sr_pk_f16_f32 v5, vcc_lo, ttmp15, v3
+// GFX1250: v_cvt_sr_pk_f16_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x70,0xd7,0x6a,0xf6,0x0c,0x04]
+
+v_cvt_sr_pk_f16_f32 v5, vcc_hi, 0xaf123456, v255
+// GFX1250: v_cvt_sr_pk_f16_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x70,0xd7,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf]
+
+v_cvt_sr_pk_f16_f32 v5, -|ttmp15|, -|src_scc|, ttmp15
+// GFX1250: v_cvt_sr_pk_f16_f32 v5, -|ttmp15|, -|src_scc|, ttmp15 ; encoding: [0x05,0x03,0x70,0xd7,0x7b,0xfa,0xed,0x61]
+
+v_cvt_sr_pk_f16_f32 v5, m0, 0.5, m0
+// GFX1250: v_cvt_sr_pk_f16_f32 v5, m0, 0.5, m0     ; encoding: [0x05,0x00,0x70,0xd7,0x7d,0xe0,0xf5,0x01]
+
+v_cvt_sr_pk_f16_f32 v5, |exec_lo|, -1, vcc_hi
+// GFX1250: v_cvt_sr_pk_f16_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x70,0xd7,0x7e,0x82,0xad,0x01]
+
+v_cvt_sr_pk_f16_f32 v5, -|exec_hi|, null, vcc_lo
+// GFX1250: v_cvt_sr_pk_f16_f32 v5, -|exec_hi|, null, vcc_lo ; encoding: [0x05,0x01,0x70,0xd7,0x7f,0xf8,0xa8,0x21]
+
+v_cvt_sr_pk_f16_f32 v5, null, exec_lo, 0xaf123456
+// GFX1250: v_cvt_sr_pk_f16_f32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x70,0xd7,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf]
+
+v_cvt_sr_pk_f16_f32 v5, -1, -|exec_hi|, src_scc
+// GFX1250: v_cvt_sr_pk_f16_f32 v5, -1, -|exec_hi|, src_scc ; encoding: [0x05,0x02,0x70,0xd7,0xc1,0xfe,0xf4,0x43]
+
+v_cvt_sr_pk_f16_f32 v5, 0.5, -m0, 0.5 mul:2
+// GFX1250: v_cvt_sr_pk_f16_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x70,0xd7,0xf0,0xfa,0xc0,0x4b]
+
+v_cvt_sr_pk_f16_f32 v5, -src_scc, |vcc_lo|, -1 mul:4
+// GFX1250: v_cvt_sr_pk_f16_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x70,0xd7,0xfd,0xd4,0x04,0x33]
+
+v_cvt_sr_pk_f16_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2
+// GFX1250: v_cvt_sr_pk_f16_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x70,0xd7,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf]
+
 v_cvt_sr_bf8_f16 v1, v2, v3
 // GFX1250: v_cvt_sr_bf8_f16 v1, v2, v3             ; encoding: [0x01,0x00,0x75,0xd7,0x02,0x07,0x02,0x00]
 
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s
index f910d85e3b9b1..8e6f238f48d2e 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s
@@ -512,6 +512,51 @@ v_cvt_pk_f16_f32 v5, src_scc, vcc_lo mul:4
 v_cvt_pk_f16_f32 v255, -|0xaf123456|, vcc_hi clamp div:2
 // GFX1250: v_cvt_pk_f16_f32 v255, -|0xaf123456|, vcc_hi clamp div:2 ; encoding: [0xff,0x81,0x6f,0xd7,0xff,0xd6,0x00,0x38,0x56,0x34,0x12,0xaf]
 
+v_cvt_sr_pk_f16_f32 v5, v1, v2, s3
+// GFX1250: v_cvt_sr_pk_f16_f32 v5, v1, v2, s3      ; encoding: [0x05,0x00,0x70,0xd7,0x01,0x05,0x0e,0x00]
+
+v_cvt_sr_pk_f16_f32 v5, v255, s2, s105
+// GFX1250: v_cvt_sr_pk_f16_f32 v5, v255, s2, s105  ; encoding: [0x05,0x00,0x70,0xd7,0xff,0x05,0xa4,0x01]
+
+v_cvt_sr_pk_f16_f32 v5, s1, v255, exec_hi
+// GFX1250: v_cvt_sr_pk_f16_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x70,0xd7,0x01,0xfe,0xff,0x01]
+
+v_cvt_sr_pk_f16_f32 v5, s105, s105, exec_lo
+// GFX1250: v_cvt_sr_pk_f16_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x70,0xd7,0x69,0xd2,0xf8,0x01]
+
+v_cvt_sr_pk_f16_f32 v5, vcc_lo, ttmp15, v3
+// GFX1250: v_cvt_sr_pk_f16_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x70,0xd7,0x6a,0xf6,0x0c,0x04]
+
+v_cvt_sr_pk_f16_f32 v5, vcc_hi, 0xaf123456, v255
+// GFX1250: v_cvt_sr_pk_f16_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x70,0xd7,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf]
+
+v_cvt_sr_pk_f16_f32 v5, -|ttmp15|, -|src_scc|, ttmp15
+// GFX1250: v_cvt_sr_pk_f16_f32 v5, -|ttmp15|, -|src_scc|, ttmp15 ; encoding: [0x05,0x03,0x70,0xd7,0x7b,0xfa,0xed,0x61]
+
+v_cvt_sr_pk_f16_f32 v5, m0, 0.5, m0
+// GFX1250: v_cvt_sr_pk_f16_f32 v5, m0, 0.5, m0     ; encoding: [0x05,0x00,0x70,0xd7,0x7d,0xe0,0xf5,0x01]
+
+v_cvt_sr_pk_f16_f32 v5, |exec_lo|, -1, vcc_hi
+// GFX1250: v_cvt_sr_pk_f16_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x70,0xd7,0x7e,0x82,0xad,0x01]
+
+v_cvt_sr_pk_f16_f32 v5, -|exec_hi|, null, vcc_lo
+// GFX1250: v_cvt_sr_pk_f16_f32 v5, -|exec_hi|, null, vcc_lo ; encoding: [0x05,0x01,0x70,0xd7,0x7f,0xf8,0xa8,0x21]
+
+v_cvt_sr_pk_f16_f32 v5, null, exec_lo, 0xaf123456
+// GFX1250: v_cvt_sr_pk_f16_f32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x70,0xd7,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf]
+
+v_cvt_sr_pk_f16_f32 v5, -1, -|exec_hi|, src_scc
+// GFX1250: v_cvt_sr_pk_f16_f32 v5, -1, -|exec_hi|, src_scc ; encoding: [0x05,0x02,0x70,0xd7,0xc1,0xfe,0xf4,0x43]
+
+v_cvt_sr_pk_f16_f32 v5, 0.5, -m0, 0.5 mul:2
+// GFX1250: v_cvt_sr_pk_f16_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x70,0xd7,0xf0,0xfa,0xc0,0x4b]
+
+v_cvt_sr_pk_f16_f32 v5, -src_scc, |vcc_lo|, -1 mul:4
+// GFX1250: v_cvt_sr_pk_f16_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x70,0xd7,0xfd,0xd4,0x04,0x33]
+
+v_cvt_sr_pk_f16_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2
+// GFX1250: v_cvt_sr_pk_f16_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x70,0xd7,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf]
+
 v_cvt_sr_bf8_f16 v1, v2.l, v3
 // GFX1250: v_cvt_sr_bf8_f16 v1, v2.l, v3           ; encoding: [0x01,0x00,0x75,0xd7,0x02,0x07,0x02,0x00]
 
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp16-fake16.s
index 4ffc9057acff4..64304693859dd 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp16-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp16-fake16.s
@@ -402,6 +402,54 @@ v_cvt_pk_f16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:0
 // GFX1250: v_cvt_pk_f16_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x6f,0xd7,0xfa,0xfe,0x03,0x38,0xff,0x6f,0x05,0x30]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
+v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
+// GFX1250: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
+// GFX1250: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v3 row_mirror
+// GFX1250: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v255 row_half_mirror
+// GFX1250: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, s105 row_shl:1
+// GFX1250: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15
+// GFX1250: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1
+// GFX1250: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x70,0xd7,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1
+// GFX1250: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x70,0xd7,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_sr_pk_f16_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_cvt_sr_pk_f16_f32_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x70,0xd7,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_sr_pk_f16_f32_e64_dpp v5, -|v1|, v2, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_cvt_sr_pk_f16_f32_e64_dpp v5, -|v1|, v2, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x70,0xd7,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, -|v2|, 5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_cvt_sr_pk_f16_f32_e64_dpp v5, v1, -|v2|, 5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x70,0xd7,0xfa,0x04,0x16,0x52,0x01,0x60,0x09,0x13]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cvt_sr_pk_f16_f32_e64_dpp v255, -|...
[truncated]

llvm-ci · 2025-07-31T20:12:18Z

LLVM Buildbot has detected a new failure on builder llvm-clang-aarch64-darwin running on doug-worker-5 while building clang,llvm at step 6 "test-build-unified-tree-check-all".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/190/builds/24605

Here is the relevant piece of the build log for the reference

Step 6 (test-build-unified-tree-check-all) failure: test (failure)
******************** TEST 'LLVM :: ExecutionEngine/OrcLazy/multiple-compile-threads-basic.ll' FAILED ********************
Exit Code: 2

Command Output (stderr):
--
/Users/buildbot/buildbot-root/aarch64-darwin/build/bin/lli -jit-kind=orc-lazy -compile-threads=2 -thread-entry hello /Users/buildbot/buildbot-root/aarch64-darwin/llvm-project/llvm/test/ExecutionEngine/OrcLazy/multiple-compile-threads-basic.ll | /Users/buildbot/buildbot-root/aarch64-darwin/build/bin/FileCheck /Users/buildbot/buildbot-root/aarch64-darwin/llvm-project/llvm/test/ExecutionEngine/OrcLazy/multiple-compile-threads-basic.ll # RUN: at line 1
+ /Users/buildbot/buildbot-root/aarch64-darwin/build/bin/lli -jit-kind=orc-lazy -compile-threads=2 -thread-entry hello /Users/buildbot/buildbot-root/aarch64-darwin/llvm-project/llvm/test/ExecutionEngine/OrcLazy/multiple-compile-threads-basic.ll
+ /Users/buildbot/buildbot-root/aarch64-darwin/build/bin/FileCheck /Users/buildbot/buildbot-root/aarch64-darwin/llvm-project/llvm/test/ExecutionEngine/OrcLazy/multiple-compile-threads-basic.ll
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.
 #0 0x000000010505b334 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/Users/buildbot/buildbot-root/aarch64-darwin/build/bin/lli+0x100f03334)
 #1 0x00000001050590e4 llvm::sys::RunSignalHandlers() (/Users/buildbot/buildbot-root/aarch64-darwin/build/bin/lli+0x100f010e4)
 #2 0x000000010505be34 SignalHandler(int, __siginfo*, void*) (/Users/buildbot/buildbot-root/aarch64-darwin/build/bin/lli+0x100f03e34)
 #3 0x000000019f823584 (/usr/lib/system/libsystem_platform.dylib+0x18047b584)
 #4 0x000000019f7f221c (/usr/lib/system/libsystem_pthread.dylib+0x18044a21c)
 #5 0x000000019f718ad0 (/usr/lib/libc++.1.dylib+0x180370ad0)
 #6 0x0000000104be6894 void llvm::detail::UniqueFunctionBase<void, llvm::Expected<llvm::DenseMap<llvm::orc::SymbolStringPtr, llvm::orc::ExecutorSymbolDef, llvm::DenseMapInfo<llvm::orc::SymbolStringPtr, void>, llvm::detail::DenseMapPair<llvm::orc::SymbolStringPtr, llvm::orc::ExecutorSymbolDef>>>>::CallImpl<llvm::orc::Platform::lookupInitSymbols(llvm::orc::ExecutionSession&, llvm::DenseMap<llvm::orc::JITDylib*, llvm::orc::SymbolLookupSet, llvm::DenseMapInfo<llvm::orc::JITDylib*, void>, llvm::detail::DenseMapPair<llvm::orc::JITDylib*, llvm::orc::SymbolLookupSet>> const&)::$_45>(void*, llvm::Expected<llvm::DenseMap<llvm::orc::SymbolStringPtr, llvm::orc::ExecutorSymbolDef, llvm::DenseMapInfo<llvm::orc::SymbolStringPtr, void>, llvm::detail::DenseMapPair<llvm::orc::SymbolStringPtr, llvm::orc::ExecutorSymbolDef>>>&) (/Users/buildbot/buildbot-root/aarch64-darwin/build/bin/lli+0x100a8e894)
 #7 0x0000000104be24c8 llvm::orc::AsynchronousSymbolQuery::handleComplete(llvm::orc::ExecutionSession&)::RunQueryCompleteTask::run() (/Users/buildbot/buildbot-root/aarch64-darwin/build/bin/lli+0x100a8a4c8)
 #8 0x0000000104caa3c4 void* std::__1::__thread_proxy[abi:un170006]<std::__1::tuple<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct>>, llvm::orc::DynamicThreadPoolTaskDispatcher::dispatch(std::__1::unique_ptr<llvm::orc::Task, std::__1::default_delete<llvm::orc::Task>>)::$_0>>(void*) (/Users/buildbot/buildbot-root/aarch64-darwin/build/bin/lli+0x100b523c4)
 #9 0x000000019f7f2f94 (/usr/lib/system/libsystem_pthread.dylib+0x18044af94)
#10 0x000000019f7edd34 (/usr/lib/system/libsystem_pthread.dylib+0x180445d34)
FileCheck error: '<stdin>' is empty.
FileCheck command line:  /Users/buildbot/buildbot-root/aarch64-darwin/build/bin/FileCheck /Users/buildbot/buildbot-root/aarch64-darwin/llvm-project/llvm/test/ExecutionEngine/OrcLazy/multiple-compile-threads-basic.ll

--

********************

rampitec mentioned this pull request Jul 31, 2025

[AMDGPU] Add v_cvt_pk_f16_f32 instruction for gfx1250 #151469

Merged

rampitec requested review from changpeng and shiltian July 31, 2025 09:38

rampitec marked this pull request as ready for review July 31, 2025 09:38

llvmbot added clang Clang issues not falling into any other category backend:AMDGPU clang:frontend Language frontend issues, e.g. anything involving "Sema" llvm:mc Machine (object) code llvm:ir labels Jul 31, 2025

shiltian approved these changes Jul 31, 2025

View reviewed changes

Base automatically changed from users/rampitec/07-31-_amdgpu_add_v_cvt_pk_f16_f32_instruction_for_gfx1250 to main July 31, 2025 17:45

rampitec force-pushed the users/rampitec/07-31-_amdgpu_v_cvt_sr_pk_f16_f32_gfx1250_instruction branch from fd6a565 to b7f3d6c Compare July 31, 2025 17:46

[AMDGPU] v_cvt_sr_pk_f16_f32 gfx1250 instruction

351a7b4

rampitec force-pushed the users/rampitec/07-31-_amdgpu_v_cvt_sr_pk_f16_f32_gfx1250_instruction branch from b7f3d6c to 351a7b4 Compare July 31, 2025 18:42

rampitec merged commit e46d938 into main Jul 31, 2025
9 checks passed

rampitec deleted the users/rampitec/07-31-_amdgpu_v_cvt_sr_pk_f16_f32_gfx1250_instruction branch July 31, 2025 19:25

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[AMDGPU] v_cvt_sr_pk_f16_f32 gfx1250 instruction #151482

[AMDGPU] v_cvt_sr_pk_f16_f32 gfx1250 instruction #151482

Uh oh!

rampitec commented Jul 31, 2025

Uh oh!

rampitec commented Jul 31, 2025 •

edited

Loading

Uh oh!

llvmbot commented Jul 31, 2025 •

edited

Loading

Uh oh!

Uh oh!

llvm-ci commented Jul 31, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

5 participants

[AMDGPU] v_cvt_sr_pk_f16_f32 gfx1250 instruction #151482

[AMDGPU] v_cvt_sr_pk_f16_f32 gfx1250 instruction #151482

Uh oh!

Conversation

rampitec commented Jul 31, 2025

Uh oh!

rampitec commented Jul 31, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Jul 31, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Uh oh!

llvm-ci commented Jul 31, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

5 participants

rampitec commented Jul 31, 2025 •

edited

Loading

llvmbot commented Jul 31, 2025 •

edited

Loading