[AMDGPU][True16][CodeGen] support for true16 for vinterp 16bit instructions #116702

Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel

Author: Brox Chen (broxigarchen)

Changes: vinterp 16-bit instruction CodeGen support in True16 format. Currently only two tests are enabled; more will be enabled once more True16 instructions are supported.

Patch is 36.27 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/116702.diff

3 Files Affected:
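For context, the intrinsics that gain True16 selection patterns here are exercised roughly like this in IR (a minimal sketch adapted from the updated llvm.amdgcn.interp.inreg.ll test; the function name and operand values are illustrative, not part of the patch):

; Minimal sketch, not part of the patch. lds.param.load returns a float whose
; 32 bits hold two packed f16 values; the f16 interp intrinsics interpolate
; one 16-bit half of it.
declare float @llvm.amdgcn.lds.param.load(i32, i32, i32)
declare float @llvm.amdgcn.interp.inreg.p10.f16(float, float, float, i1)
declare half @llvm.amdgcn.interp.inreg.p2.f16(float, float, float, i1)

define amdgpu_ps half @interp_f16_sketch(float inreg %i, float inreg %j, i32 inreg %m0) {
main_body:
  %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
  ; p10 produces an f32 intermediate; p2 consumes it and yields the f16 result.
  %p10 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 0)
  %p2 = call half @llvm.amdgcn.interp.inreg.p2.f16(float %p0, float %j, float %p10, i1 0)
  ret half %p2
}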
diff --git a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
index fa06d96085820e..f8b717c2e794ae 100644
--- a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
@@ -181,9 +181,43 @@ multiclass VInterpF16Pat <SDPatternOperator op, Instruction inst,
def : VInterpF16Pat<op, inst, dst_type, 1, high_pat>;
}
+class VInterpF16Pat_t16 <SDPatternOperator op, Instruction inst,
+ ValueType dstVT, bit high, bit isP2> : GCNPat <
+ (dstVT (op
+ (VINTERPMods f32:$src0, i32:$src0_modifiers),
+ (VINTERPMods f32:$src1, i32:$src1_modifiers),
+ (VINTERPMods f32:$src2, i32:$src2_modifiers),
+ !if(high, (i1 -1), (i1 0)))),
+ (inst $src0_modifiers,
+ (f16 (EXTRACT_SUBREG VGPR_32:$src0, !if(high, hi16, lo16))),
+ $src1_modifiers, VGPR_32:$src1,
+ $src2_modifiers,
+ !if(isP2, (f32 VGPR_32:$src2),
+ (f16 (EXTRACT_SUBREG VGPR_32:$src2, !if(high, hi16, lo16)))),
+ 0, /* clamp */
+ 7) /* wait_exp */
+>;
+
+multiclass VInterpF16Pat_t16 <SDPatternOperator op, Instruction inst,
+ ValueType dstVT, bit isP2> {
+ def : VInterpF16Pat_t16<op, inst, dstVT, 0, isP2>;
+ def : VInterpF16Pat_t16<op, inst, dstVT, 1, isP2>;
+}
+
def : VInterpF32Pat<int_amdgcn_interp_inreg_p10, V_INTERP_P10_F32_inreg>;
def : VInterpF32Pat<int_amdgcn_interp_inreg_p2, V_INTERP_P2_F32_inreg>;
+let True16Predicate = UseRealTrue16Insts in {
+defm : VInterpF16Pat_t16<int_amdgcn_interp_inreg_p10_f16,
+ V_INTERP_P10_F16_F32_inreg_t16, f32, 0>;
+defm : VInterpF16Pat_t16<int_amdgcn_interp_inreg_p2_f16,
+ V_INTERP_P2_F16_F32_inreg_t16, f16, 1>;
+defm : VInterpF16Pat_t16<int_amdgcn_interp_p10_rtz_f16,
+ V_INTERP_P10_RTZ_F16_F32_inreg_t16, f32, 0>;
+defm : VInterpF16Pat_t16<int_amdgcn_interp_p2_rtz_f16,
+ V_INTERP_P2_RTZ_F16_F32_inreg_t16, f16, 1>;
+}
+
let True16Predicate = UseFakeTrue16Insts in {
defm : VInterpF16Pat<int_amdgcn_interp_inreg_p10_f16,
V_INTERP_P10_F16_F32_inreg_fake16, f32,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
index de46037e96e802..2215df9cef2623 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
@@ -1,25 +1,26 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
-; GCN-LABEL: v_interp_f32:
-; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: s_mov_b32 s3, exec_lo
-; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GCN-NEXT: s_mov_b32 m0, s2
-; GCN-NEXT: lds_param_load v0, attr0.y wait_vdst:15
-; GCN-NEXT: lds_param_load v1, attr1.x wait_vdst:15
-; GCN-NEXT: s_mov_b32 exec_lo, s3
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: v_mov_b32_e32 v4, s1
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1
-; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:0
-; GCN-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7
-; GCN-NEXT: exp mrt0 v3, v2, v5, v4 done
-; GCN-NEXT: s_endpgm
+; GFX11-LABEL: v_interp_f32:
+; GFX11: ; %bb.0: ; %main_body
+; GFX11-NEXT: s_mov_b32 s3, exec_lo
+; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX11-NEXT: s_mov_b32 m0, s2
+; GFX11-NEXT: lds_param_load v0, attr0.y wait_vdst:15
+; GFX11-NEXT: lds_param_load v1, attr1.x wait_vdst:15
+; GFX11-NEXT: s_mov_b32 exec_lo, s3
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: v_mov_b32_e32 v4, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1
+; GFX11-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:0
+; GFX11-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7
+; GFX11-NEXT: exp mrt0 v3, v2, v5, v4 done
+; GFX11-NEXT: s_endpgm
main_body:
%p0 = call float @llvm.amdgcn.lds.param.load(i32 1, i32 0, i32 %m0)
%p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
@@ -32,30 +33,30 @@ main_body:
}
define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
-; GCN-LABEL: v_interp_f32_many:
-; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: s_mov_b32 s3, exec_lo
-; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GCN-NEXT: s_mov_b32 m0, s2
-; GCN-NEXT: lds_param_load v0, attr0.x wait_vdst:15
-; GCN-NEXT: lds_param_load v1, attr1.x wait_vdst:15
-; GCN-NEXT: lds_param_load v2, attr2.x wait_vdst:15
-; GCN-NEXT: lds_param_load v3, attr3.x wait_vdst:15
-; GCN-NEXT: s_mov_b32 exec_lo, s3
-; GCN-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
-; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
-; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
-; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:0
-; GCN-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GCN-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7
-; GCN-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GCN-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7
-; GCN-NEXT: exp mrt0 v6, v7, v8, v4 done
-; GCN-NEXT: s_endpgm
+; GFX11-LABEL: v_interp_f32_many:
+; GFX11: ; %bb.0: ; %main_body
+; GFX11-NEXT: s_mov_b32 s3, exec_lo
+; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX11-NEXT: s_mov_b32 m0, s2
+; GFX11-NEXT: lds_param_load v0, attr0.x wait_vdst:15
+; GFX11-NEXT: lds_param_load v1, attr1.x wait_vdst:15
+; GFX11-NEXT: lds_param_load v2, attr2.x wait_vdst:15
+; GFX11-NEXT: lds_param_load v3, attr3.x wait_vdst:15
+; GFX11-NEXT: s_mov_b32 exec_lo, s3
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
+; GFX11-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
+; GFX11-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
+; GFX11-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:0
+; GFX11-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7
+; GFX11-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7
+; GFX11-NEXT: exp mrt0 v6, v7, v8, v4 done
+; GFX11-NEXT: s_endpgm
main_body:
%p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
%p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
@@ -74,30 +75,30 @@ main_body:
}
define amdgpu_ps void @v_interp_f32_many_vm(ptr addrspace(1) %ptr, i32 inreg %m0) #0 {
-; GCN-LABEL: v_interp_f32_many_vm:
-; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4
-; GCN-NEXT: s_mov_b32 m0, s0
-; GCN-NEXT: s_mov_b32 s0, exec_lo
-; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GCN-NEXT: lds_param_load v2, attr0.x wait_vdst:15
-; GCN-NEXT: lds_param_load v3, attr1.x wait_vdst:15
-; GCN-NEXT: lds_param_load v4, attr2.x wait_vdst:15
-; GCN-NEXT: lds_param_load v5, attr3.x wait_vdst:15
-; GCN-NEXT: s_mov_b32 exec_lo, s0
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3
-; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
-; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1
-; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:0
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GCN-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7
-; GCN-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GCN-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7
-; GCN-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
-; GCN-NEXT: exp mrt0 v6, v7, v8, v0 done
-; GCN-NEXT: s_endpgm
+; GFX11-LABEL: v_interp_f32_many_vm:
+; GFX11: ; %bb.0: ; %main_body
+; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4
+; GFX11-NEXT: s_mov_b32 m0, s0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX11-NEXT: lds_param_load v2, attr0.x wait_vdst:15
+; GFX11-NEXT: lds_param_load v3, attr1.x wait_vdst:15
+; GFX11-NEXT: lds_param_load v4, attr2.x wait_vdst:15
+; GFX11-NEXT: lds_param_load v5, attr3.x wait_vdst:15
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3
+; GFX11-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
+; GFX11-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1
+; GFX11-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7
+; GFX11-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7
+; GFX11-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
+; GFX11-NEXT: exp mrt0 v6, v7, v8, v0 done
+; GFX11-NEXT: s_endpgm
main_body:
%i.ptr = getelementptr float, ptr addrspace(1) %ptr, i32 1
%i = load float, ptr addrspace(1) %i.ptr, align 4
@@ -120,23 +121,41 @@ main_body:
}
define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
-; GCN-LABEL: v_interp_f16:
-; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: s_mov_b32 s3, exec_lo
-; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GCN-NEXT: s_mov_b32 m0, s2
-; GCN-NEXT: lds_param_load v1, attr0.x wait_vdst:15
-; GCN-NEXT: s_mov_b32 exec_lo, s3
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GCN-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1 wait_exp:0
-; GCN-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
-; GCN-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GCN-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
-; GCN-NEXT: v_add_f16_e32 v0, v3, v0
-; GCN-NEXT: ; return to shader part epilog
+; GFX11-TRUE16-LABEL: v_interp_f16:
+; GFX11-TRUE16: ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, exec_lo
+; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 m0, s2
+; GFX11-TRUE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v3, v1.l, v0, v1.l wait_exp:0
+; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v4, v1.h, v0, v1.h wait_exp:7
+; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v1.l, v2, v3 wait_exp:7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.h, v1.h, v2, v4 wait_exp:7
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: v_interp_f16:
+; GFX11-FAKE16: ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, exec_lo
+; GFX11-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 m0, s2
+; GFX11-FAKE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1 wait_exp:0
+; GFX11-FAKE16-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
+; GFX11-FAKE16-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
main_body:
%p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
%l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 0)
@@ -148,23 +167,41 @@ main_body:
}
define amdgpu_ps half @v_interp_rtz_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
-; GCN-LABEL: v_interp_rtz_f16:
-; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: s_mov_b32 s3, exec_lo
-; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GCN-NEXT: s_mov_b32 m0, s2
-; GCN-NEXT: lds_param_load v1, attr0.x wait_vdst:15
-; GCN-NEXT: s_mov_b32 exec_lo, s3
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GCN-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0
-; GCN-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
-; GCN-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GCN-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
-; GCN-NEXT: v_add_f16_e32 v0, v3, v0
-; GCN-NEXT: ; return to shader part epilog
+; GFX11-TRUE16-LABEL: v_interp_rtz_f16:
+; GFX11-TRUE16: ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, exec_lo
+; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 m0, s2
+; GFX11-TRUE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_interp_p10_rtz_f16_f32 v3, v1.l, v0, v1.l wait_exp:0
+; GFX11-TRUE16-NEXT: v_interp_p10_rtz_f16_f32 v4, v1.h, v0, v1.h wait_exp:7
+; GFX11-TRUE16-NEXT: v_interp_p2_rtz_f16_f32 v0.l, v1.l, v2, v3 wait_exp:7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_interp_p2_rtz_f16_f32 v0.h, v1.h, v2, v4 wait_exp:7
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: v_interp_rtz_f16:
+; GFX11-FAKE16: ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, exec_lo
+; GFX11-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 m0, s2
+; GFX11-FAKE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0
+; GFX11-FAKE16-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
+; GFX11-FAKE16-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
main_body:
%p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
%l_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 0)
@@ -176,17 +213,30 @@ main_body:
}
define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #0 {
-; GCN-LABEL: v_interp_f16_imm_params:
-; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GCN-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GCN-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 wait_exp:7
-; GCN-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v0 wait_exp:7
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_add_f16_e32 v0, v1, v0
-; GCN-NEXT: ; return to shader part epilog
+; GFX11-TRUE16-LABEL: v_interp_f16_imm_params:
+; GFX11-TRUE16: ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v1, v0.l, v1, v0.l wait_exp:7
+; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v0.l, v3, v2 wait_exp:7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: v_interp_f16_imm_params:
+; GFX11-FAKE16: ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 wait_exp:7
+; GFX11-FAKE16-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v0 wait_exp:7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v1, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
main_body:
%l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float 0.0, float %i, float 0.0, i1 0)
%l_p1 = call half @llvm.amdgcn.interp.inreg.p2.f16(float 0.0, float %j, float 0.0, i1 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
index e3dd036ecc3083..bf545c82f2d568 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
@@ -1,25 +1,26 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
def...
[truncated]
Review thread on these lines of the new VInterpF16Pat_t16 pattern in VINTERPInstructions.td:

    (VINTERPMods f32:$src2, i32:$src2_modifiers),
    !if(high, (i1 -1), (i1 0)))),
  (inst $src0_modifiers,
    (f16 (EXTRACT_SUBREG VGPR_32:$src0, !if(high, hi16, lo16))),
How does arbitrarily extracting from a float work?
Hey Matt. Is it the extract_subreg and the hi16/lo16 that you are pointing to?
Yes
I see. I think this is not extracting a 16-bit float from a 32-bit float. The input is always a 16-bit float, and this is just selecting the high half or the low half of a 32-bit register.
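A sketch of that point (assuming the reading above is right; not part of the patch): the same packed parameter feeds both calls, and only the trailing i1 operand decides whether the True16 pattern picks the lo16 or the hi16 subregister, which is what shows up as v1.l versus v1.h in the GFX11-TRUE16 checks.

; Sketch only: both calls read the same packed parameter %p0; the i1 operand
; selects the low (0) or high (1) 16-bit half, i.e. the lo16 vs hi16 subregister.
declare float @llvm.amdgcn.lds.param.load(i32, i32, i32)
declare float @llvm.amdgcn.interp.inreg.p10.f16(float, float, float, i1)

define amdgpu_ps <2 x float> @half_select_sketch(float inreg %i, i32 inreg %m0) {
  %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
  %lo = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 0) ; low half
  %hi = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 1) ; high half
  %v0 = insertelement <2 x float> poison, float %lo, i32 0
  %v1 = insertelement <2 x float> %v0, float %hi, i32 1
  ret <2 x float> %v1
}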
So the marked types are wrong; they shouldn't be f32.
After looking at the documentation for these intrinsics, I do not understand why they are using float. src0 and src2 should be using <2 x half>?
It seems there is a historical reason. The same question was discussed in https://reviews.llvm.org/D127756, and the reason might be this exchange:

> Wouldn't it be more natural to declare p and p0 to be llvm_v2f16_ty?

> I think so, but it would require coordinated changes in Mesa and LLPC, and we should probably also provide a v2f16 version of llvm.amdgcn.lds.param.load, which currently always returns a float.
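For concreteness, a sketch of the difference being discussed (the v2f16-typed declaration below is purely hypothetical, an assumption for illustration; only the float-typed form exists today):

; Current declaration, matching the calls in llvm.amdgcn.interp.inreg.ll:
declare float @llvm.amdgcn.interp.inreg.p10.f16(float, float, float, i1)

; Hypothetical v2f16-typed variant (assumption for illustration only; no such
; intrinsic exists in tree): the packed f16 parameters in src0/src2 would be
; spelled as <2 x half> instead of being carried through a float.
declare float @llvm.amdgcn.interp.inreg.p10.v2f16(<2 x half>, float, <2 x half>, i1)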
The historical reason is that it was just wrong to begin with.
Hi Matt. Regarding the actual implementation, I think @Sisyph knows better than I do. He is on vacation right now and will come back next week. We can discuss the next step with some input from him. Thanks!
Yes, I agree with @arsenm that these should have been 2 x half to begin with. However, I would be in favor of landing this patch as-is, adding a comment, and filing an LLVM issue to change the intrinsic type. Since this is working for the current frontends, it does not seem urgent to change the intrinsic.