-
Notifications
You must be signed in to change notification settings - Fork 15.4k
[AMDGPU][True16][CodeGen] update isel pattern with vgpr16 for 16 bit types #154875
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
ffd3634 to
ef99f09
Compare
ef99f09 to
bc46fe4
Compare
bc46fe4 to
dc87997
Compare
|
@llvm/pr-subscribers-backend-amdgpu Author: Brox Chen (broxigarchen) Changes: Update isel patterns with 16-bit types to use vgpr16 in true16 mode. This stops isel from generating illegal `vgpr32 = copy vgpr16`. This includes fcopysign, scalar_to_vector and i1 trunc. Updated lit tests and added a few mir tests. Patch is 245.06 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154875.diff 18 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index b6e697d34c3d3..fba5d9de28306 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2201,6 +2201,8 @@ def : GCNPat <
}
foreach fp16vt = [f16, bf16] in {
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
def : GCNPat <
(fcopysign fp16vt:$src0, fp16vt:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
@@ -2231,6 +2233,42 @@ def : GCNPat <
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
(V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
>;
+}
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat <
+ (fcopysign fp16vt:$src0, fp16vt:$src1),
+ (EXTRACT_SUBREG (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16),
+ (REG_SEQUENCE VGPR_32, $src1, lo16, (i16 (IMPLICIT_DEF)), hi16)), lo16)
+>;
+
+def : GCNPat <
+ (fcopysign f32:$src0, fp16vt:$src1),
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
+ (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src1, hi16))
+>;
+
+def : GCNPat <
+ (fcopysign f64:$src0, fp16vt:$src1),
+ (REG_SEQUENCE VReg_64,
+ (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)),
+ (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src1, hi16)), sub1)
+>;
+
+def : GCNPat <
+ (fcopysign fp16vt:$src0, f32:$src1),
+ (EXTRACT_SUBREG (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fff0000)),
+ (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src0, hi16), $src1), hi16)
+>;
+
+def : GCNPat <
+ (fcopysign fp16vt:$src0, f64:$src1),
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16),
+ (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
+>;
+}
} // End foreach fp16vt = [f16, bf16]
@@ -3154,6 +3192,11 @@ def : GCNPat<
(i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
(COPY VSrc_b16:$src)
>;
+
+def : GCNPat <
+ (i1 (DivergentUnaryFrag<trunc> i16:$a)),
+ (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
+>;
}
let True16Predicate = UseRealTrue16Insts in {
@@ -3171,6 +3214,11 @@ def : GCNPat<
(i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
(REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16)
>;
+
+def : GCNPat <
+ (i1 (DivergentUnaryFrag<trunc> i16:$a)),
+ (V_CMP_EQ_U16_t16_e64 (i32 0), (V_AND_B16_t16_e64 (i32 0), (i16 1), (i32 0), $a), (i32 0), (i16 1), (i32 0))
+>;
}
def : GCNPat <
@@ -3199,11 +3247,6 @@ def : GCNPat <
(V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
>;
-def : GCNPat <
- (i1 (DivergentUnaryFrag<trunc> i16:$a)),
- (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
->;
-
def IMMBitSelConst : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(1ULL << N->getZExtValue(), SDLoc(N),
MVT::i32);
@@ -3807,7 +3850,8 @@ def : GCNPat <
(v2i16 (S_PACK_HL_B32_B16 SReg_32:$src0, SReg_32:$src1))
>;
-
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
def : GCNPat <
(v2f16 (scalar_to_vector f16:$src0)),
(COPY $src0)
@@ -3827,6 +3871,29 @@ def : GCNPat <
(v4f16 (scalar_to_vector f16:$src0)),
(INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
>;
+}
+
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat <
+ (v2f16 (scalar_to_vector f16:$src0)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16)
+>;
+
+def : GCNPat <
+ (v2i16 (scalar_to_vector i16:$src0)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16)
+>;
+
+def : GCNPat <
+ (v4i16 (scalar_to_vector i16:$src0)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16, (i32 (IMPLICIT_DEF)), sub1)
+>;
+
+def : GCNPat <
+ (v4f16 (scalar_to_vector f16:$src0)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16, (i32 (IMPLICIT_DEF)), sub1)
+>;
+}
def : GCNPat <
(i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask,
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 49fe1eed9c514..44c719f3635c8 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -40442,11 +40442,11 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo
; GFX11TRUE16-LABEL: v_vselect_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
+; GFX11TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v1
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 1, v0.h
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v3.h, v2.h, s0
@@ -42871,16 +42871,16 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo
; GFX11TRUE16-LABEL: v_vselect_v4bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX11TRUE16-NEXT: v_and_b16 v0.h, 1, v2.l
+; GFX11TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
+; GFX11TRUE16-NEXT: v_and_b16 v1.l, 1, v1.l
+; GFX11TRUE16-NEXT: v_and_b16 v1.h, 1, v3.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 1, v0.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v1
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v3
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 1, v1.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 1, v1.h
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v5.l, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v4.l, s0
@@ -43195,28 +43195,28 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo
; GFX11TRUE16-LABEL: v_vselect_v8bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l
+; GFX11TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
+; GFX11TRUE16-NEXT: v_and_b16 v1.l, 1, v3.l
+; GFX11TRUE16-NEXT: v_and_b16 v1.h, 1, v5.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 1, v0.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v6
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v4
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v5
-; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v7
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v2
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v0
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v1
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v3
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v4
-; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v13.l, v9.l, s4
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 1, v1.l
+; GFX11TRUE16-NEXT: v_and_b16 v0.l, 1, v6.l
+; GFX11TRUE16-NEXT: v_and_b16 v0.h, 1, v4.l
+; GFX11TRUE16-NEXT: v_and_b16 v1.l, 1, v2.l
+; GFX11TRUE16-NEXT: v_and_b16 v2.l, 1, v7.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 1, v1.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 1, v0.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 1, v0.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 1, v1.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 1, v2.l
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v12.l, v8.l, s0
; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v15.l, v11.l, s2
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v14.l, v10.l, s3
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v12.l, v8.l, s0
+; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v13.l, v9.l, s4
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v12.h, v8.h, vcc_lo
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v13.h, v9.h, s1
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v14.h, v10.h, s5
@@ -43872,38 +43872,38 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: scratch_load_b32 v31, off, s32
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5
-; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7
-; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9
-; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11
-; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10
-; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13
-; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12
-; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15
-; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v2
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v5
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v7
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v6
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s7, 1, v9
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s8, 1, v8
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s9, 1, v11
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s10, 1, v12
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s11, 1, v13
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s12, 1, v10
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v15
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s14, 1, v14
+; GFX11TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l
+; GFX11TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
+; GFX11TRUE16-NEXT: v_and_b16 v1.l, 1, v3.l
+; GFX11TRUE16-NEXT: v_and_b16 v1.h, 1, v2.l
+; GFX11TRUE16-NEXT: v_and_b16 v2.l, 1, v5.l
+; GFX11TRUE16-NEXT: v_and_b16 v2.h, 1, v4.l
+; GFX11TRUE16-NEXT: v_and_b16 v3.l, 1, v7.l
+; GFX11TRUE16-NEXT: v_and_b16 v3.h, 1, v6.l
+; GFX11TRUE16-NEXT: v_and_b16 v4.l, 1, v9.l
+; GFX11TRUE16-NEXT: v_and_b16 v4.h, 1, v8.l
+; GFX11TRUE16-NEXT: v_and_b16 v5.l, 1, v11.l
+; GFX11TRUE16-NEXT: v_and_b16 v5.h, 1, v10.l
+; GFX11TRUE16-NEXT: v_and_b16 v6.l, 1, v13.l
+; GFX11TRUE16-NEXT: v_and_b16 v6.h, 1, v12.l
+; GFX11TRUE16-NEXT: v_and_b16 v7.l, 1, v15.l
+; GFX11TRUE16-NEXT: v_and_b16 v7.h, 1, v14.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 1, v0.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 1, v1.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 1, v1.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 1, v2.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 1, v2.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 1, v3.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 1, v3.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 1, v4.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 1, v4.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 1, v5.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 1, v6.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 1, v6.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 1, v5.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 1, v7.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s14, 1, v7.h
; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v30.l, v22.l, s10
; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v30.h, v22.h, s11
; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v29.l, v21.l, s12
@@ -45512,149 +45512,149 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX11TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32
; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:68
; GFX11TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:72
-; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:124
-; GFX11TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:128
-; GFX11TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:64
-; GFX11TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:60
-; GFX11TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:120
-; GFX11TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:56
-; GFX11TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:116
-; GFX11TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:52
-; GFX11TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:112
-; GFX11TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:48
-; GFX11TRUE16-NEXT: scratch_load_b32 v52, off, s32 offset:108
-; GFX11TRUE16-NEXT: scratch_load_b32 v53, off, s32 offset:44
-; GFX11TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:104
-; GFX11TRUE16-NEXT: scratch_load_b32 v55, off, s32 offset:40
-; GFX11TRUE16-NEXT: scratch_load_b32 v64, off, s32 offset:100
-; GFX11TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:36
-; GFX11TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:96
-; GFX11TRUE16-NEXT: scratch_load_b32 v67, off, s32 offset:32
-; GFX11TRUE16-NEXT: scratch_load_b32 v68, off, s32 offset:92
-; GFX11TRUE16-NEXT: scratch_load_b32 v69, off, s32 offset:28
-; GFX11TRUE16-NEXT: scratch_load_b32 v70, off, s32 offset:88
-; GFX11TRUE16-NEXT: scratch_load_b32 v71, off, s32 offset:24
-; GFX11TRUE16-NEXT: scratch_load_b32 v80, off, s32 offset:84
-; GFX11TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:20
-; GFX11TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:76
+; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:76
+; GFX11TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:124
+; GFX11TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:128
+; GFX11TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:64
+; GFX11TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:60
+; GFX11TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:120
+; GFX11TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:56
+; GFX11TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:116
+; GFX11TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:52
+; GFX11TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:112
+; GFX11TRUE16-NEXT: scratch_load_b32 v52, off, s32 offset:48
+; GFX11TRUE16-NEXT: scratch_load_b32 v53, off, s32 offset:108
+; GFX11TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:44
+; GFX11TRUE16-NEXT: scratch_load_b32 v55, off, s32 offset:104
+; GFX11TRUE16-NEXT: scratch_load_b32 v64, off, s32 offset:40
+; GFX11TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:100
+; GFX11TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:36
+; GFX11TRUE16-NEXT: scratch_load_b32 v67, off, s32 offset:96
+; GFX11TRUE16-NEXT: scratch_load_b32 v68, off, s32 offset:32
+; GFX11TRUE16-NEXT: scratch_load_b32 v69, off, s32 offset:92
+; GFX11TRUE16-NEXT: scratch_load_b32 v70, off, s32 offset:28
+; GFX11TRUE16-NEXT: scratch_load_b32 v71, off, s32 offset:88
+; GFX11TRUE16-NEXT: scratch_load_b32 v80, off, s32 offset:24
+; GFX11TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:84
+; GFX11TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:20
; GFX11TRUE16-NEXT: scratch_load_b32 v83, off, s32 offset:80
; GFX11TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:16
; GFX11TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:12
; GFX11TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:8
; GFX11TRUE16-NEXT: scratch_load_b32 v87, off, s32 offset:4
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5
-; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7
-; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9
-; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11
-; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10
-; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13
-; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12
-; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15
-; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14
-; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 1, v17
-; GFX11TRUE16-NEXT: v_and_b32_e32 v19, 1, v19
-; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 1, v18
-; GFX11TRUE16-NEXT: v_and_b32_e32 v21, 1, v21
-; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 1, v20
-; GFX11TRUE16-NEXT: v_and_b32_e32 v23, 1, v23
-; GFX11TRUE16-NEXT: v_and_b32_e32 v22, 1, v22
-; GFX11TRUE16-NEXT: v_and_b32_e32 v25, 1, v25
-; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 1, v24
-; GFX11TRUE16-NEXT: v_and_b32_e32 v27, 1, v27
-; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 1, v26
-; GFX11TRUE16-NEXT: v_and_b32_e32 v29, 1, v29
-; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 1, v28
-; GFX11TRUE16-NEXT: v_and_b32_e32 v30, 1, v30
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s16, 1, v16
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v2
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v5
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v7
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v6
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s7, 1, v9
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s8, 1, v8
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s9, 1, v11
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s10, 1, v10
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s11, 1, v13
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s12, 1, v12
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v15
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s14, 1, v14
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v17
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s17, 1, v19
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s18, 1, v18
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s19, 1, v21
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s20, 1, v20
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s21, 1, v23
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s22, 1, v22
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s23, 1, v25
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s24, 1, v24
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s25, 1, v27
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s26, 1, v30
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s27, 1, v28
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s28, 1, v29
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s29, 1, v26
+; GFX11TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l
+; GFX11TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
+; GFX11TRUE16-NEXT: v_and_b16 v1.l, 1, v3.l
+; GFX11TRUE16-NEXT: v_and_b16 v1.h, 1, v2.l
+; GFX11TRUE16-NEXT: v_and_b16 v2.l, 1, v5.l
+; GFX11TRUE16-NEXT: v_and_b16 v2.h, 1, v4.l
+; GFX11TRUE16-NEXT: v_and_b16 v3.l, 1, v7.l
+; GFX11TRUE16-NEXT: v_and_b16 v3.h...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Minor code quality issue, but can we get the folding of s0 into the bfi back?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There should be code there to do this folding in the foldoperand pass. Let me check why this is not working
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nvm. I think we don't do this folding (we know that s0 and v1 are the same i16 in ISel, but in the folding pass this seems to require some additional context analysis).
I don't think we can do much in ISel. In ISel we don't know if the 16-bit operand is a vgpr or sgpr, and we are forced to use vgpr16, which leads to this additional copy. We might be able to add this folding in the folding pass, but I'm not sure if it is worth it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, it seems like a job for SIFoldOperands.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will open another patch for this
dc87997 to
e1b2852
Compare
|
CI error not related |
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/137/builds/25151 Here is the relevant piece of the build log for the reference |
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/175/builds/24992 Here is the relevant piece of the build log for the reference |
The CI shows these two tests being broken: Those tests were added in this PR. How are the failures not related? This change also broke postcommit buildbots. Please either revert or fix forward. |
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/185/builds/25001 Here is the relevant piece of the build log for the reference |
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/33/builds/22671 Here is the relevant piece of the build log for the reference |
Oh, that's weird. Not sure how I missed it. Fix is here #157684 |
#154875 failed two lit tests
This is a temporary fix for a regression from #154875. The new pattern sets the hi part of V_BFI result and that confuses si-fix-sgpr-copies - where the proper fix is likely to be. During si-fix-sgpr-copies, an incorrect fold happens: %86:vgpr_32 = V_BFI_B32_e64 %87:sreg_32 = COPY %86.hi16:vgpr_32 %95:vgpr_32 = nofpexcept V_PACK_B32_F16_t16_e64 0, killed %87:sreg_32, 0, %63:vgpr_16, 0, 0 into %86:vgpr_32 = V_BFI_B32_e64 %95:vgpr_32 = nofpexcept V_PACK_B32_F16_t16_e64 0, %86.lo16:vgpr_32, 0, %63:vgpr_16, 0, 0 Fixes: Vulkan CTS dEQP-VK.glsl.builtin.precision_fp16_storage32b.*.
) This is a temporary fix for a regression from llvm#154875. The new pattern sets the hi part of V_BFI result and that confuses si-fix-sgpr-copies - where the proper fix is likely to be. During si-fix-sgpr-copies, an incorrect fold happens: %86:vgpr_32 = V_BFI_B32_e64 %87:sreg_32 = COPY %86.hi16:vgpr_32 %95:vgpr_32 = nofpexcept V_PACK_B32_F16_t16_e64 0, killed %87:sreg_32, 0, %63:vgpr_16, 0, 0 into %86:vgpr_32 = V_BFI_B32_e64 %95:vgpr_32 = nofpexcept V_PACK_B32_F16_t16_e64 0, %86.lo16:vgpr_32, 0, %63:vgpr_16, 0, 0 Fixes: Vulkan CTS dEQP-VK.glsl.builtin.precision_fp16_storage32b.*.
Update isel patterns with 16-bit types to use vgpr16 in true16 mode. This stops isel from generating illegal
vgpr32 = copy vgpr16. This includes fcopysign, scalar_to_vector and i1 trunc. Updated lit tests and added a few mir tests.
Stacking up these changes in one patch, as I realized that doing these separately could lead to unexpected failures in between.