Conversation

@broxigarchen (Contributor) commented Nov 18, 2024

VINTERP 16-bit instructions: codegen support in True16 format.

Currently only two tests are enabled; more will be enabled as more True16 instructions are supported.

@broxigarchen marked this pull request as ready for review November 19, 2024 00:49
llvmbot (Member) commented Nov 19, 2024

@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-llvm-globalisel

Author: Brox Chen (broxigarchen)

Changes

VINTERP 16-bit instructions: codegen support in True16 format.

Currently only two tests are enabled; more will be enabled as more True16 instructions are supported.


Patch is 36.27 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/116702.diff

3 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/VINTERPInstructions.td (+34)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll (+162-112)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll (+162-112)
diff --git a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
index fa06d96085820e..f8b717c2e794ae 100644
--- a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
@@ -181,9 +181,43 @@ multiclass VInterpF16Pat <SDPatternOperator op, Instruction inst,
   def : VInterpF16Pat<op, inst, dst_type, 1, high_pat>;
 }
 
+class VInterpF16Pat_t16 <SDPatternOperator op, Instruction inst,
+                     ValueType dstVT, bit high, bit isP2> : GCNPat <
+   (dstVT (op
+      (VINTERPMods f32:$src0, i32:$src0_modifiers),
+      (VINTERPMods f32:$src1, i32:$src1_modifiers),
+      (VINTERPMods f32:$src2, i32:$src2_modifiers),
+      !if(high, (i1 -1), (i1 0)))),
+    (inst $src0_modifiers,
+          (f16 (EXTRACT_SUBREG VGPR_32:$src0, !if(high, hi16, lo16))),
+          $src1_modifiers, VGPR_32:$src1,
+          $src2_modifiers,
+          !if(isP2, (f32 VGPR_32:$src2),
+                    (f16 (EXTRACT_SUBREG VGPR_32:$src2, !if(high, hi16, lo16)))),
+          0, /* clamp */
+          7) /* wait_exp */
+>;
+
+multiclass VInterpF16Pat_t16 <SDPatternOperator op, Instruction inst,
+                          ValueType dstVT, bit isP2> {
+  def : VInterpF16Pat_t16<op, inst, dstVT, 0, isP2>;
+  def : VInterpF16Pat_t16<op, inst, dstVT, 1, isP2>;
+}
+
 def : VInterpF32Pat<int_amdgcn_interp_inreg_p10, V_INTERP_P10_F32_inreg>;
 def : VInterpF32Pat<int_amdgcn_interp_inreg_p2, V_INTERP_P2_F32_inreg>;
 
+let True16Predicate = UseRealTrue16Insts in {
+defm : VInterpF16Pat_t16<int_amdgcn_interp_inreg_p10_f16,
+                     V_INTERP_P10_F16_F32_inreg_t16, f32, 0>;
+defm : VInterpF16Pat_t16<int_amdgcn_interp_inreg_p2_f16,
+                     V_INTERP_P2_F16_F32_inreg_t16, f16, 1>;
+defm : VInterpF16Pat_t16<int_amdgcn_interp_p10_rtz_f16,
+                     V_INTERP_P10_RTZ_F16_F32_inreg_t16, f32, 0>;
+defm : VInterpF16Pat_t16<int_amdgcn_interp_p2_rtz_f16,
+                     V_INTERP_P2_RTZ_F16_F32_inreg_t16, f16, 1>;
+}
+
 let True16Predicate = UseFakeTrue16Insts in {
 defm : VInterpF16Pat<int_amdgcn_interp_inreg_p10_f16,
                      V_INTERP_P10_F16_F32_inreg_fake16, f32,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
index de46037e96e802..2215df9cef2623 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
@@ -1,25 +1,26 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
-; GCN-LABEL: v_interp_f32:
-; GCN:       ; %bb.0: ; %main_body
-; GCN-NEXT:    s_mov_b32 s3, exec_lo
-; GCN-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GCN-NEXT:    s_mov_b32 m0, s2
-; GCN-NEXT:    lds_param_load v0, attr0.y wait_vdst:15
-; GCN-NEXT:    lds_param_load v1, attr1.x wait_vdst:15
-; GCN-NEXT:    s_mov_b32 exec_lo, s3
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    v_mov_b32_e32 v4, s1
-; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GCN-NEXT:    v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1
-; GCN-NEXT:    v_interp_p10_f32 v2, v1, v2, v1 wait_exp:0
-; GCN-NEXT:    v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7
-; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT:    v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7
-; GCN-NEXT:    exp mrt0 v3, v2, v5, v4 done
-; GCN-NEXT:    s_endpgm
+; GFX11-LABEL: v_interp_f32:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-NEXT:    s_mov_b32 m0, s2
+; GFX11-NEXT:    lds_param_load v0, attr0.y wait_vdst:15
+; GFX11-NEXT:    lds_param_load v1, attr1.x wait_vdst:15
+; GFX11-NEXT:    s_mov_b32 exec_lo, s3
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v4, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1
+; GFX11-NEXT:    v_interp_p10_f32 v2, v1, v2, v1 wait_exp:0
+; GFX11-NEXT:    v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7
+; GFX11-NEXT:    exp mrt0 v3, v2, v5, v4 done
+; GFX11-NEXT:    s_endpgm
 main_body:
   %p0 = call float @llvm.amdgcn.lds.param.load(i32 1, i32 0, i32 %m0)
   %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
@@ -32,30 +33,30 @@ main_body:
 }
 
 define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
-; GCN-LABEL: v_interp_f32_many:
-; GCN:       ; %bb.0: ; %main_body
-; GCN-NEXT:    s_mov_b32 s3, exec_lo
-; GCN-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GCN-NEXT:    s_mov_b32 m0, s2
-; GCN-NEXT:    lds_param_load v0, attr0.x wait_vdst:15
-; GCN-NEXT:    lds_param_load v1, attr1.x wait_vdst:15
-; GCN-NEXT:    lds_param_load v2, attr2.x wait_vdst:15
-; GCN-NEXT:    lds_param_load v3, attr3.x wait_vdst:15
-; GCN-NEXT:    s_mov_b32 exec_lo, s3
-; GCN-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
-; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GCN-NEXT:    v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
-; GCN-NEXT:    v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
-; GCN-NEXT:    v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
-; GCN-NEXT:    v_interp_p10_f32 v4, v3, v4, v3 wait_exp:0
-; GCN-NEXT:    v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7
-; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GCN-NEXT:    v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7
-; GCN-NEXT:    v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7
-; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GCN-NEXT:    v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7
-; GCN-NEXT:    exp mrt0 v6, v7, v8, v4 done
-; GCN-NEXT:    s_endpgm
+; GFX11-LABEL: v_interp_f32_many:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-NEXT:    s_mov_b32 m0, s2
+; GFX11-NEXT:    lds_param_load v0, attr0.x wait_vdst:15
+; GFX11-NEXT:    lds_param_load v1, attr1.x wait_vdst:15
+; GFX11-NEXT:    lds_param_load v2, attr2.x wait_vdst:15
+; GFX11-NEXT:    lds_param_load v3, attr3.x wait_vdst:15
+; GFX11-NEXT:    s_mov_b32 exec_lo, s3
+; GFX11-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
+; GFX11-NEXT:    v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
+; GFX11-NEXT:    v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
+; GFX11-NEXT:    v_interp_p10_f32 v4, v3, v4, v3 wait_exp:0
+; GFX11-NEXT:    v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7
+; GFX11-NEXT:    v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7
+; GFX11-NEXT:    exp mrt0 v6, v7, v8, v4 done
+; GFX11-NEXT:    s_endpgm
 main_body:
   %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
   %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
@@ -74,30 +75,30 @@ main_body:
 }
 
 define amdgpu_ps void @v_interp_f32_many_vm(ptr addrspace(1) %ptr, i32 inreg %m0) #0 {
-; GCN-LABEL: v_interp_f32_many_vm:
-; GCN:       ; %bb.0: ; %main_body
-; GCN-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:4
-; GCN-NEXT:    s_mov_b32 m0, s0
-; GCN-NEXT:    s_mov_b32 s0, exec_lo
-; GCN-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GCN-NEXT:    lds_param_load v2, attr0.x wait_vdst:15
-; GCN-NEXT:    lds_param_load v3, attr1.x wait_vdst:15
-; GCN-NEXT:    lds_param_load v4, attr2.x wait_vdst:15
-; GCN-NEXT:    lds_param_load v5, attr3.x wait_vdst:15
-; GCN-NEXT:    s_mov_b32 exec_lo, s0
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3
-; GCN-NEXT:    v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
-; GCN-NEXT:    v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1
-; GCN-NEXT:    v_interp_p10_f32 v0, v5, v0, v5 wait_exp:0
-; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GCN-NEXT:    v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7
-; GCN-NEXT:    v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7
-; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GCN-NEXT:    v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7
-; GCN-NEXT:    v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
-; GCN-NEXT:    exp mrt0 v6, v7, v8, v0 done
-; GCN-NEXT:    s_endpgm
+; GFX11-LABEL: v_interp_f32_many_vm:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:4
+; GFX11-NEXT:    s_mov_b32 m0, s0
+; GFX11-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-NEXT:    lds_param_load v2, attr0.x wait_vdst:15
+; GFX11-NEXT:    lds_param_load v3, attr1.x wait_vdst:15
+; GFX11-NEXT:    lds_param_load v4, attr2.x wait_vdst:15
+; GFX11-NEXT:    lds_param_load v5, attr3.x wait_vdst:15
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3
+; GFX11-NEXT:    v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
+; GFX11-NEXT:    v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1
+; GFX11-NEXT:    v_interp_p10_f32 v0, v5, v0, v5 wait_exp:0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7
+; GFX11-NEXT:    v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7
+; GFX11-NEXT:    v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
+; GFX11-NEXT:    exp mrt0 v6, v7, v8, v0 done
+; GFX11-NEXT:    s_endpgm
 main_body:
   %i.ptr = getelementptr float, ptr addrspace(1) %ptr, i32 1
   %i = load float, ptr addrspace(1) %i.ptr, align 4
@@ -120,23 +121,41 @@ main_body:
 }
 
 define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
-; GCN-LABEL: v_interp_f16:
-; GCN:       ; %bb.0: ; %main_body
-; GCN-NEXT:    s_mov_b32 s3, exec_lo
-; GCN-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GCN-NEXT:    s_mov_b32 m0, s2
-; GCN-NEXT:    lds_param_load v1, attr0.x wait_vdst:15
-; GCN-NEXT:    s_mov_b32 exec_lo, s3
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v2, s1
-; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GCN-NEXT:    v_interp_p10_f16_f32 v3, v1, v0, v1 wait_exp:0
-; GCN-NEXT:    v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
-; GCN-NEXT:    v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7
-; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GCN-NEXT:    v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
-; GCN-NEXT:    v_add_f16_e32 v0, v3, v0
-; GCN-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: v_interp_f16:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_mov_b32 m0, s2
+; GFX11-TRUE16-NEXT:    lds_param_load v1, attr0.x wait_vdst:15
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s3
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_interp_p10_f16_f32 v3, v1.l, v0, v1.l wait_exp:0
+; GFX11-TRUE16-NEXT:    v_interp_p10_f16_f32 v4, v1.h, v0, v1.h wait_exp:7
+; GFX11-TRUE16-NEXT:    v_interp_p2_f16_f32 v0.l, v1.l, v2, v3 wait_exp:7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_interp_p2_f16_f32 v0.h, v1.h, v2, v4 wait_exp:7
+; GFX11-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: v_interp_f16:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    s_mov_b32 m0, s2
+; GFX11-FAKE16-NEXT:    lds_param_load v1, attr0.x wait_vdst:15
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s3
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_interp_p10_f16_f32 v3, v1, v0, v1 wait_exp:0
+; GFX11-FAKE16-NEXT:    v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
+; GFX11-FAKE16-NEXT:    v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
+; GFX11-FAKE16-NEXT:    v_add_f16_e32 v0, v3, v0
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
   %l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 0)
@@ -148,23 +167,41 @@ main_body:
 }
 
 define amdgpu_ps half @v_interp_rtz_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
-; GCN-LABEL: v_interp_rtz_f16:
-; GCN:       ; %bb.0: ; %main_body
-; GCN-NEXT:    s_mov_b32 s3, exec_lo
-; GCN-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GCN-NEXT:    s_mov_b32 m0, s2
-; GCN-NEXT:    lds_param_load v1, attr0.x wait_vdst:15
-; GCN-NEXT:    s_mov_b32 exec_lo, s3
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v2, s1
-; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GCN-NEXT:    v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0
-; GCN-NEXT:    v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
-; GCN-NEXT:    v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7
-; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GCN-NEXT:    v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
-; GCN-NEXT:    v_add_f16_e32 v0, v3, v0
-; GCN-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: v_interp_rtz_f16:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_mov_b32 m0, s2
+; GFX11-TRUE16-NEXT:    lds_param_load v1, attr0.x wait_vdst:15
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s3
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_interp_p10_rtz_f16_f32 v3, v1.l, v0, v1.l wait_exp:0
+; GFX11-TRUE16-NEXT:    v_interp_p10_rtz_f16_f32 v4, v1.h, v0, v1.h wait_exp:7
+; GFX11-TRUE16-NEXT:    v_interp_p2_rtz_f16_f32 v0.l, v1.l, v2, v3 wait_exp:7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_interp_p2_rtz_f16_f32 v0.h, v1.h, v2, v4 wait_exp:7
+; GFX11-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: v_interp_rtz_f16:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    s_mov_b32 m0, s2
+; GFX11-FAKE16-NEXT:    lds_param_load v1, attr0.x wait_vdst:15
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s3
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0
+; GFX11-FAKE16-NEXT:    v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
+; GFX11-FAKE16-NEXT:    v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
+; GFX11-FAKE16-NEXT:    v_add_f16_e32 v0, v3, v0
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
   %l_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 0)
@@ -176,17 +213,30 @@ main_body:
 }
 
 define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #0 {
-; GCN-LABEL: v_interp_f16_imm_params:
-; GCN:       ; %bb.0: ; %main_body
-; GCN-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GCN-NEXT:    v_mov_b32_e32 v2, s1
-; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GCN-NEXT:    v_interp_p10_f16_f32 v1, v0, v1, v0 wait_exp:7
-; GCN-NEXT:    v_interp_p2_f16_f32 v0, v0, v2, v0 wait_exp:7
-; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_add_f16_e32 v0, v1, v0
-; GCN-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: v_interp_f16_imm_params:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, s1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_interp_p10_f16_f32 v1, v0.l, v1, v0.l wait_exp:7
+; GFX11-TRUE16-NEXT:    v_interp_p2_f16_f32 v0.l, v0.l, v3, v2 wait_exp:7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v1
+; GFX11-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: v_interp_f16_imm_params:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_interp_p10_f16_f32 v1, v0, v1, v0 wait_exp:7
+; GFX11-FAKE16-NEXT:    v_interp_p2_f16_f32 v0, v0, v2, v0 wait_exp:7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float 0.0, float %i, float 0.0, i1 0)
   %l_p1 = call half @llvm.amdgcn.interp.inreg.p2.f16(float 0.0, float %j, float 0.0, i1 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
index e3dd036ecc3083..bf545c82f2d568 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
@@ -1,25 +1,26 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 def...
[truncated]

(VINTERPMods f32:$src2, i32:$src2_modifiers),
!if(high, (i1 -1), (i1 0)))),
(inst $src0_modifiers,
(f16 (EXTRACT_SUBREG VGPR_32:$src0, !if(high, hi16, lo16))),
Contributor:

How does arbitrarily extracting from a float work?

Contributor Author:

Hey Matt. Is it the extract_subreg and the hi16/lo16 that you are pointing to?

Contributor:

Yes

Contributor Author:

I see. I think this is not extracting from a 32-bit float to a 16-bit float. The input is always a 16-bit float; the pattern just selects the high or low half of a 32-bit register.
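
For illustration, the tests in this patch exercise exactly this at the IR level: the trailing i1 operand of the f16 interp intrinsics selects which packed 16-bit half is interpolated, and with -mattr=+real-true16 the selected half lowers to the lo16/hi16 (.l/.h) part of the attribute VGPR, as the GFX11-TRUE16 checks above show. A minimal sketch modeled on the v_interp_f16 test (the function name here is only for illustration):

  ; Minimal sketch, modeled on the v_interp_f16 test in this patch.
  ; The final i1 picks the packed half: 0 = low 16 bits, 1 = high 16 bits.
  define amdgpu_ps half @interp_low_and_high(float inreg %i, float inreg %j, i32 inreg %m0) {
  main_body:
    %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
    ; low half: lowers to the lo16 (.l) half of the attribute VGPR with +real-true16
    %lo_p10 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 0)
    %lo = call half @llvm.amdgcn.interp.inreg.p2.f16(float %p0, float %j, float %lo_p10, i1 0)
    ; high half: lowers to the hi16 (.h) half of the attribute VGPR with +real-true16
    %hi_p10 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 1)
    %hi = call half @llvm.amdgcn.interp.inreg.p2.f16(float %p0, float %j, float %hi_p10, i1 1)
    %sum = fadd half %lo, %hi
    ret half %sum
  }

  declare float @llvm.amdgcn.lds.param.load(i32, i32, i32)
  declare float @llvm.amdgcn.interp.inreg.p10.f16(float, float, float, i1)
  declare half @llvm.amdgcn.interp.inreg.p2.f16(float, float, float, i1)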

Contributor:

So the marked types are wrong; they shouldn't be f32.

Contributor:

After looking at the documentation for these intrinsics, I do not understand why they are using float. src0 and src2 should be using <2 x half>?

Contributor Author:

It seems there is a historical reason.

The same question was discussed in https://reviews.llvm.org/D127756, and the reason might be this exchange:

Wouldn't it be more natural to declare p and p0 to be llvm_v2f16_ty?

I think so, but it would require coordinated changes in Mesa and LLPC, and we should probably also provide a v2f16 version of llvm.amdgcn.lds.param.load which currently always returns a float.
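
To make that suggestion concrete, here is a rough sketch of how the declarations could look if the intrinsics were retyped along those lines. These signatures are hypothetical and do not exist in LLVM today; the current intrinsics use the float-typed signatures shown in the tests above:

  ; Hypothetical retyped declarations; NOT the intrinsics' current signatures.
  ; src0/src2 would carry the two f16 attribute values as an explicit vector,
  ; and lds.param.load would need a v2f16-returning counterpart (name invented here).
  declare <2 x half> @llvm.amdgcn.lds.param.load.v2f16(i32, i32, i32)
  declare float @llvm.amdgcn.interp.inreg.p10.f16(<2 x half>, float, <2 x half>, i1)
  declare half @llvm.amdgcn.interp.inreg.p2.f16(<2 x half>, float, <2 x half>, i1)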

Contributor:

The historical reason is that it was just wrong to begin with.

Contributor Author:

Hi Matt. Regarding the actual implementation, I think @Sisyph knows better than I do. He is on vacation right now and will come back next week. We can discuss the next step with some input from him. Thanks!

Contributor:

Yes, I agree with @arsenm that these should have been 2 x half to begin with. However, I would be in favor of landing this patch as is, and adding a comment and filing an LLVM issue to change the intrinsic type. Since this is working for the current frontends, it does not seem urgent to change the intrinsic.

@kosarev requested a review from Sisyph November 21, 2024 19:31
@broxigarchen requested a review from arsenm November 22, 2024 17:12
@broxigarchen requested a review from arsenm November 27, 2024 19:19
@broxigarchen merged commit 85142f5 into llvm:main Dec 9, 2024
13 checks passed