[AMDGPU][True16][CodeGen] support for true16 for vinterp 16bit instructions #116702

Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel

Author: Brox Chen (broxigarchen)

Changes: vinterp 16-bit instruction CodeGen support in True16 format. Currently only two tests are enabled; more will be enabled once more True16 instructions are supported.

Patch is 36.27 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/116702.diff

3 Files Affected:
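For context, the intrinsics that gain True16 selection patterns here are exercised roughly like this in IR (a minimal sketch adapted from the updated llvm.amdgcn.interp.inreg.ll test; the function name and operand values are illustrative, not part of the patch):

; Minimal sketch, not part of the patch. lds.param.load returns a float whose
; 32 bits hold two packed f16 values; the f16 interp intrinsics interpolate
; one 16-bit half of it.
declare float @llvm.amdgcn.lds.param.load(i32, i32, i32)
declare float @llvm.amdgcn.interp.inreg.p10.f16(float, float, float, i1)
declare half @llvm.amdgcn.interp.inreg.p2.f16(float, float, float, i1)

define amdgpu_ps half @interp_f16_sketch(float inreg %i, float inreg %j, i32 inreg %m0) {
main_body:
  %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
  ; p10 produces an f32 intermediate; p2 consumes it and yields the f16 result.
  %p10 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 0)
  %p2 = call half @llvm.amdgcn.interp.inreg.p2.f16(float %p0, float %j, float %p10, i1 0)
  ret half %p2
}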
diff --git a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
index fa06d96085820e..f8b717c2e794ae 100644
--- a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
@@ -181,9 +181,43 @@ multiclass VInterpF16Pat <SDPatternOperator op, Instruction inst,
def : VInterpF16Pat<op, inst, dst_type, 1, high_pat>;
}
+class VInterpF16Pat_t16 <SDPatternOperator op, Instruction inst,
+ ValueType dstVT, bit high, bit isP2> : GCNPat <
+ (dstVT (op
+ (VINTERPMods f32:$src0, i32:$src0_modifiers),
+ (VINTERPMods f32:$src1, i32:$src1_modifiers),
+ (VINTERPMods f32:$src2, i32:$src2_modifiers),
+ !if(high, (i1 -1), (i1 0)))),
+ (inst $src0_modifiers,
+ (f16 (EXTRACT_SUBREG VGPR_32:$src0, !if(high, hi16, lo16))),
+ $src1_modifiers, VGPR_32:$src1,
+ $src2_modifiers,
+ !if(isP2, (f32 VGPR_32:$src2),
+ (f16 (EXTRACT_SUBREG VGPR_32:$src2, !if(high, hi16, lo16)))),
+ 0, /* clamp */
+ 7) /* wait_exp */
+>;
+
+multiclass VInterpF16Pat_t16 <SDPatternOperator op, Instruction inst,
+ ValueType dstVT, bit isP2> {
+ def : VInterpF16Pat_t16<op, inst, dstVT, 0, isP2>;
+ def : VInterpF16Pat_t16<op, inst, dstVT, 1, isP2>;
+}
+
def : VInterpF32Pat<int_amdgcn_interp_inreg_p10, V_INTERP_P10_F32_inreg>;
def : VInterpF32Pat<int_amdgcn_interp_inreg_p2, V_INTERP_P2_F32_inreg>;
+let True16Predicate = UseRealTrue16Insts in {
+defm : VInterpF16Pat_t16<int_amdgcn_interp_inreg_p10_f16,
+ V_INTERP_P10_F16_F32_inreg_t16, f32, 0>;
+defm : VInterpF16Pat_t16<int_amdgcn_interp_inreg_p2_f16,
+ V_INTERP_P2_F16_F32_inreg_t16, f16, 1>;
+defm : VInterpF16Pat_t16<int_amdgcn_interp_p10_rtz_f16,
+ V_INTERP_P10_RTZ_F16_F32_inreg_t16, f32, 0>;
+defm : VInterpF16Pat_t16<int_amdgcn_interp_p2_rtz_f16,
+ V_INTERP_P2_RTZ_F16_F32_inreg_t16, f16, 1>;
+}
+
let True16Predicate = UseFakeTrue16Insts in {
defm : VInterpF16Pat<int_amdgcn_interp_inreg_p10_f16,
V_INTERP_P10_F16_F32_inreg_fake16, f32,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
index de46037e96e802..2215df9cef2623 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
@@ -1,25 +1,26 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
-; GCN-LABEL: v_interp_f32:
-; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: s_mov_b32 s3, exec_lo
-; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GCN-NEXT: s_mov_b32 m0, s2
-; GCN-NEXT: lds_param_load v0, attr0.y wait_vdst:15
-; GCN-NEXT: lds_param_load v1, attr1.x wait_vdst:15
-; GCN-NEXT: s_mov_b32 exec_lo, s3
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: v_mov_b32_e32 v4, s1
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1
-; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:0
-; GCN-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7
-; GCN-NEXT: exp mrt0 v3, v2, v5, v4 done
-; GCN-NEXT: s_endpgm
+; GFX11-LABEL: v_interp_f32:
+; GFX11: ; %bb.0: ; %main_body
+; GFX11-NEXT: s_mov_b32 s3, exec_lo
+; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX11-NEXT: s_mov_b32 m0, s2
+; GFX11-NEXT: lds_param_load v0, attr0.y wait_vdst:15
+; GFX11-NEXT: lds_param_load v1, attr1.x wait_vdst:15
+; GFX11-NEXT: s_mov_b32 exec_lo, s3
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: v_mov_b32_e32 v4, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1
+; GFX11-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:0
+; GFX11-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7
+; GFX11-NEXT: exp mrt0 v3, v2, v5, v4 done
+; GFX11-NEXT: s_endpgm
main_body:
%p0 = call float @llvm.amdgcn.lds.param.load(i32 1, i32 0, i32 %m0)
%p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
@@ -32,30 +33,30 @@ main_body:
}
define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
-; GCN-LABEL: v_interp_f32_many:
-; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: s_mov_b32 s3, exec_lo
-; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GCN-NEXT: s_mov_b32 m0, s2
-; GCN-NEXT: lds_param_load v0, attr0.x wait_vdst:15
-; GCN-NEXT: lds_param_load v1, attr1.x wait_vdst:15
-; GCN-NEXT: lds_param_load v2, attr2.x wait_vdst:15
-; GCN-NEXT: lds_param_load v3, attr3.x wait_vdst:15
-; GCN-NEXT: s_mov_b32 exec_lo, s3
-; GCN-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
-; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
-; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
-; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:0
-; GCN-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GCN-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7
-; GCN-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GCN-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7
-; GCN-NEXT: exp mrt0 v6, v7, v8, v4 done
-; GCN-NEXT: s_endpgm
+; GFX11-LABEL: v_interp_f32_many:
+; GFX11: ; %bb.0: ; %main_body
+; GFX11-NEXT: s_mov_b32 s3, exec_lo
+; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX11-NEXT: s_mov_b32 m0, s2
+; GFX11-NEXT: lds_param_load v0, attr0.x wait_vdst:15
+; GFX11-NEXT: lds_param_load v1, attr1.x wait_vdst:15
+; GFX11-NEXT: lds_param_load v2, attr2.x wait_vdst:15
+; GFX11-NEXT: lds_param_load v3, attr3.x wait_vdst:15
+; GFX11-NEXT: s_mov_b32 exec_lo, s3
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
+; GFX11-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
+; GFX11-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
+; GFX11-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:0
+; GFX11-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7
+; GFX11-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7
+; GFX11-NEXT: exp mrt0 v6, v7, v8, v4 done
+; GFX11-NEXT: s_endpgm
main_body:
%p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
%p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
@@ -74,30 +75,30 @@ main_body:
}
define amdgpu_ps void @v_interp_f32_many_vm(ptr addrspace(1) %ptr, i32 inreg %m0) #0 {
-; GCN-LABEL: v_interp_f32_many_vm:
-; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4
-; GCN-NEXT: s_mov_b32 m0, s0
-; GCN-NEXT: s_mov_b32 s0, exec_lo
-; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GCN-NEXT: lds_param_load v2, attr0.x wait_vdst:15
-; GCN-NEXT: lds_param_load v3, attr1.x wait_vdst:15
-; GCN-NEXT: lds_param_load v4, attr2.x wait_vdst:15
-; GCN-NEXT: lds_param_load v5, attr3.x wait_vdst:15
-; GCN-NEXT: s_mov_b32 exec_lo, s0
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3
-; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
-; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1
-; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:0
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GCN-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7
-; GCN-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GCN-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7
-; GCN-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
-; GCN-NEXT: exp mrt0 v6, v7, v8, v0 done
-; GCN-NEXT: s_endpgm
+; GFX11-LABEL: v_interp_f32_many_vm:
+; GFX11: ; %bb.0: ; %main_body
+; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4
+; GFX11-NEXT: s_mov_b32 m0, s0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX11-NEXT: lds_param_load v2, attr0.x wait_vdst:15
+; GFX11-NEXT: lds_param_load v3, attr1.x wait_vdst:15
+; GFX11-NEXT: lds_param_load v4, attr2.x wait_vdst:15
+; GFX11-NEXT: lds_param_load v5, attr3.x wait_vdst:15
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3
+; GFX11-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
+; GFX11-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1
+; GFX11-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7
+; GFX11-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7
+; GFX11-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
+; GFX11-NEXT: exp mrt0 v6, v7, v8, v0 done
+; GFX11-NEXT: s_endpgm
main_body:
%i.ptr = getelementptr float, ptr addrspace(1) %ptr, i32 1
%i = load float, ptr addrspace(1) %i.ptr, align 4
@@ -120,23 +121,41 @@ main_body:
}
define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
-; GCN-LABEL: v_interp_f16:
-; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: s_mov_b32 s3, exec_lo
-; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GCN-NEXT: s_mov_b32 m0, s2
-; GCN-NEXT: lds_param_load v1, attr0.x wait_vdst:15
-; GCN-NEXT: s_mov_b32 exec_lo, s3
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GCN-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1 wait_exp:0
-; GCN-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
-; GCN-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GCN-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
-; GCN-NEXT: v_add_f16_e32 v0, v3, v0
-; GCN-NEXT: ; return to shader part epilog
+; GFX11-TRUE16-LABEL: v_interp_f16:
+; GFX11-TRUE16: ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, exec_lo
+; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 m0, s2
+; GFX11-TRUE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v3, v1.l, v0, v1.l wait_exp:0
+; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v4, v1.h, v0, v1.h wait_exp:7
+; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v1.l, v2, v3 wait_exp:7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.h, v1.h, v2, v4 wait_exp:7
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: v_interp_f16:
+; GFX11-FAKE16: ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, exec_lo
+; GFX11-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 m0, s2
+; GFX11-FAKE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1 wait_exp:0
+; GFX11-FAKE16-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
+; GFX11-FAKE16-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
main_body:
%p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
%l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 0)
@@ -148,23 +167,41 @@ main_body:
}
define amdgpu_ps half @v_interp_rtz_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
-; GCN-LABEL: v_interp_rtz_f16:
-; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: s_mov_b32 s3, exec_lo
-; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GCN-NEXT: s_mov_b32 m0, s2
-; GCN-NEXT: lds_param_load v1, attr0.x wait_vdst:15
-; GCN-NEXT: s_mov_b32 exec_lo, s3
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GCN-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0
-; GCN-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
-; GCN-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GCN-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
-; GCN-NEXT: v_add_f16_e32 v0, v3, v0
-; GCN-NEXT: ; return to shader part epilog
+; GFX11-TRUE16-LABEL: v_interp_rtz_f16:
+; GFX11-TRUE16: ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, exec_lo
+; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 m0, s2
+; GFX11-TRUE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_interp_p10_rtz_f16_f32 v3, v1.l, v0, v1.l wait_exp:0
+; GFX11-TRUE16-NEXT: v_interp_p10_rtz_f16_f32 v4, v1.h, v0, v1.h wait_exp:7
+; GFX11-TRUE16-NEXT: v_interp_p2_rtz_f16_f32 v0.l, v1.l, v2, v3 wait_exp:7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_interp_p2_rtz_f16_f32 v0.h, v1.h, v2, v4 wait_exp:7
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: v_interp_rtz_f16:
+; GFX11-FAKE16: ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, exec_lo
+; GFX11-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 m0, s2
+; GFX11-FAKE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0
+; GFX11-FAKE16-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
+; GFX11-FAKE16-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
main_body:
%p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
%l_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 0)
@@ -176,17 +213,30 @@ main_body:
}
define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #0 {
-; GCN-LABEL: v_interp_f16_imm_params:
-; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GCN-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GCN-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 wait_exp:7
-; GCN-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v0 wait_exp:7
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_add_f16_e32 v0, v1, v0
-; GCN-NEXT: ; return to shader part epilog
+; GFX11-TRUE16-LABEL: v_interp_f16_imm_params:
+; GFX11-TRUE16: ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v1, v0.l, v1, v0.l wait_exp:7
+; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v0.l, v3, v2 wait_exp:7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: v_interp_f16_imm_params:
+; GFX11-FAKE16: ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 wait_exp:7
+; GFX11-FAKE16-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v0 wait_exp:7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v1, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
main_body:
%l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float 0.0, float %i, float 0.0, i1 0)
%l_p1 = call half @llvm.amdgcn.interp.inreg.p2.f16(float 0.0, float %j, float 0.0, i1 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
index e3dd036ecc3083..bf545c82f2d568 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
@@ -1,25 +1,26 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
def...
[truncated]
Review thread on these lines of the new VInterpF16Pat_t16 pattern in VINTERPInstructions.td:

    (VINTERPMods f32:$src2, i32:$src2_modifiers),
    !if(high, (i1 -1), (i1 0)))),
  (inst $src0_modifiers,
    (f16 (EXTRACT_SUBREG VGPR_32:$src0, !if(high, hi16, lo16))),
How does arbitrarily extracting from a float work?
Hey Matt. Is it the extract_subreg and the hi16/lo16 that you are pointing to?
Yes
I see. I think this is not extracting a 16-bit float from a 32-bit float. The input is always a 16-bit float, and this is just selecting the high half or the low half of a 32-bit register.
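A sketch of that point (assuming the reading above is right; not part of the patch): the same packed parameter feeds both calls, and only the trailing i1 operand decides whether the True16 pattern picks the lo16 or the hi16 subregister, which is what shows up as v1.l versus v1.h in the GFX11-TRUE16 checks.

; Sketch only: both calls read the same packed parameter %p0; the i1 operand
; selects the low (0) or high (1) 16-bit half, i.e. the lo16 vs hi16 subregister.
declare float @llvm.amdgcn.lds.param.load(i32, i32, i32)
declare float @llvm.amdgcn.interp.inreg.p10.f16(float, float, float, i1)

define amdgpu_ps <2 x float> @half_select_sketch(float inreg %i, i32 inreg %m0) {
  %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
  %lo = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 0) ; low half
  %hi = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 1) ; high half
  %v0 = insertelement <2 x float> poison, float %lo, i32 0
  %v1 = insertelement <2 x float> %v0, float %hi, i32 1
  ret <2 x float> %v1
}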
So the marked types are wrong; they shouldn't be f32.
After looking at the documentation for these intrinsics, I do not understand why they are using float. src0 and src2 should be using <2 x half>?
It seems there is a historical reason. The same question was discussed in https://reviews.llvm.org/D127756, and the reason might be this exchange:

> Wouldn't it be more natural to declare p and p0 to be llvm_v2f16_ty?

> I think so, but it would require coordinated changes in Mesa and LLPC, and we should probably also provide a v2f16 version of llvm.amdgcn.lds.param.load, which currently always returns a float.
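For concreteness, a sketch of the difference being discussed (the v2f16-typed declaration below is purely hypothetical, an assumption for illustration; only the float-typed form exists today):

; Current declaration, matching the calls in llvm.amdgcn.interp.inreg.ll:
declare float @llvm.amdgcn.interp.inreg.p10.f16(float, float, float, i1)

; Hypothetical v2f16-typed variant (assumption for illustration only; no such
; intrinsic exists in tree): the packed f16 parameters in src0/src2 would be
; spelled as <2 x half> instead of being carried through a float.
declare float @llvm.amdgcn.interp.inreg.p10.v2f16(<2 x half>, float, <2 x half>, i1)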
The historical reason is that it was just wrong to begin with.
Hi Matt. Regarding the actual implementation, I think @Sisyph knows better than I do. He is on vacation right now and will come back next week. We can discuss the next step with some input from him. Thanks!
Yes, I agree with @arsenm that these should have been 2 x half to begin with. However, I would be in favor of landing this patch as-is, adding a comment, and filing an LLVM issue to change the intrinsic type. Since this is working for the current frontends, it does not seem urgent to change the intrinsic.