Skip to content

Commit 5138b61

Browse files
authored
[AMDGPU][True16][Codegen] remove packed build_vector pattern from true16 (#148715)
Some of the packed build_vector patterns use VGPR_32 for i16/f16/bf16 operands. On GFX11, bf16 arithmetic gets promoted to f32, and this promotion is done via a v2i16 pack. In true16 mode this v2i16 pack is selected to a build_vector/v_lshlrev pattern, which only accepts VGPR_32. This causes ISel to insert an illegal copy "vgpr32 = copy vgpr16" between the def and the use. In the end, this illegal copy confuses the CSE pass and triggers incorrect code elimination. Remove the packed build_vector patterns from true16 mode. After the removal, ISel will use the vgpr16 build_vector patterns instead.
1 parent 73e4b58 commit 5138b61

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

47 files changed

+25634
-25313
lines changed

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3427,30 +3427,32 @@ def : GCNPat <
34273427
(S_LSHL_B32 SReg_32:$src1, (i16 16))
34283428
>;
34293429

3430+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
3431+
let True16Predicate = p in {
34303432
def : GCNPat <
34313433
(v2i16 (DivergentBinFrag<build_vector> (i16 0), (i16 VGPR_32:$src1))),
34323434
(v2i16 (V_LSHLREV_B32_e64 (i16 16), VGPR_32:$src1))
34333435
>;
34343436

3435-
34363437
def : GCNPat <
3437-
(v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))),
3438-
(S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
3438+
(v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src1), (i16 0))),
3439+
(v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
34393440
>;
34403441

34413442
def : GCNPat <
3442-
(v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src1), (i16 0))),
3443-
(v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
3443+
(v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 FP_ZERO))),
3444+
(v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
34443445
>;
3446+
}
34453447

34463448
def : GCNPat <
3447-
(v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))),
3449+
(v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))),
34483450
(S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
34493451
>;
34503452

34513453
def : GCNPat <
3452-
(v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 FP_ZERO))),
3453-
(v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
3454+
(v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))),
3455+
(S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
34543456
>;
34553457

34563458
foreach vecTy = [v2i16, v2f16, v2bf16] in {

llvm/test/CodeGen/AMDGPU/add.v2i16.ll

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -780,7 +780,8 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
780780
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
781781
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
782782
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
783-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
783+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
784+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
784785
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
785786
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
786787
; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
@@ -789,11 +790,12 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
789790
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
790791
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v1, v0
791792
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
792-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
793-
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
794-
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0
795-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
796-
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, 0, 16, v2
793+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
794+
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
795+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
796+
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v3
797+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
798+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1
797799
; GFX11-TRUE16-NEXT: global_store_b128 v1, v[0:3], s[0:1]
798800
; GFX11-TRUE16-NEXT: s_endpgm
799801
;

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll

Lines changed: 5976 additions & 5930 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll

Lines changed: 959 additions & 994 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -659,7 +659,8 @@ define i16 @bitcast_bf16_to_i16(bfloat %a, i32 %b) {
659659
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
660660
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_4
661661
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
662-
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2
662+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
663+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
663664
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
664665
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
665666
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
@@ -1132,7 +1133,8 @@ define half @bitcast_bf16_to_f16(bfloat %a, i32 %b) {
11321133
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
11331134
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB10_4
11341135
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
1135-
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2
1136+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
1137+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
11361138
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11371139
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
11381140
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll

Lines changed: 664 additions & 644 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll

Lines changed: 1516 additions & 1430 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll

Lines changed: 114 additions & 102 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll

Lines changed: 43 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -145,37 +145,36 @@ define <3 x half> @bitcast_v3bf16_to_v3f16(<3 x bfloat> %a, i32 %b) {
145145
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
146146
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB0_2
147147
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
148-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
149-
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
150-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
151-
; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v2, 16, v2
148+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
149+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
150+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
151+
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v2
152+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
153+
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
152154
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
153-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
155+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
156+
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
154157
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
155158
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
159+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
160+
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
156161
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
157162
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
158-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
163+
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
159164
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
165+
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
166+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
160167
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
168+
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
161169
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
162170
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v6, v1, 0x7fff
163-
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
164-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
165171
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
166-
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
167-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
168-
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
169-
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
170-
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
171-
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
172-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
173172
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v7, vcc_lo
174173
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
175174
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0x7fc0
175+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
176176
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
177177
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo
178-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
179178
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h
180179
; GFX11-TRUE16-NEXT: .LBB0_2: ; %end
181180
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
@@ -797,40 +796,40 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) {
797796
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
798797
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB4_2
799798
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
800-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
801-
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
802-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
803-
; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v2, 16, v2
804-
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
799+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
800+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
801+
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
805802
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
806-
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
807-
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
808-
; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
809-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
810-
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
811-
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
812-
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
813-
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
814-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
815-
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
816-
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
803+
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v2
804+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
805+
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v1
806+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
807+
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1
808+
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v2
809+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
810+
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
817811
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
818812
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
819-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
820-
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
821-
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
822-
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
823-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
824-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
825-
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
813+
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
814+
; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
815+
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
816+
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
817+
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
818+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
819+
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
820+
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v7, vcc_lo
826821
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
822+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
823+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
824+
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
825+
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
826+
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
827+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
828+
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
827829
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
828-
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
829830
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
830-
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v2
831-
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
832-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
833-
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v1
831+
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
832+
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v2
834833
; GFX11-TRUE16-NEXT: .LBB4_2: ; %end
835834
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
836835
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]

0 commit comments

Comments
 (0)