Skip to content

Commit 03a61c0

Browse files
committed
update codegen pattern for fake16 flow
1 parent b18ae32 commit 03a61c0

File tree

4 files changed

+371
-253
lines changed

4 files changed

+371
-253
lines changed

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2473,6 +2473,7 @@ def : AMDGPUPat <
24732473
$src1), sub1)
24742474
>;
24752475

2476+
let True16Predicate = NotHasTrue16BitInsts in {
24762477
def : ROTRPattern <V_ALIGNBIT_B32_e64>;
24772478

24782479
def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
@@ -2482,6 +2483,42 @@ def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
24822483
def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
24832484
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
24842485
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
2486+
} // end True16Predicate = NotHasTrue16BitInsts
2487+
2488+
let True16Predicate = UseFakeTrue16Insts in {
2489+
// Selects a 32-bit rotate-right to V_ALIGNBIT_B32_fake16_e64: $src0 is passed
// as both src0 and src1, and the rotate amount $src1 becomes src2. All three
// source modifiers, clamp and op_sel are 0.
def ROTRPattern_fake16 : GCNPat <
2490+
(rotr i32:$src0, i32:$src1),
2491+
(V_ALIGNBIT_B32_fake16_e64 /* src0_modifiers */ 0, $src0,
2492+
/* src1_modifiers */ 0, $src0,
2493+
/* src2_modifiers */ 0,
2494+
$src1, /* clamp */ 0, /* op_sel */ 0)
2495+
>;
2496+
2497+
// i32 truncation of a 64-bit logical right shift whose amount is ANDed with
// 31: selected as one V_ALIGNBIT_B32_fake16_e64 with src0 = sub1 of $src0,
// src1 = sub0 of $src0 and src2 = the (unmasked) $src1.
// Label convention in this pattern: each /* srcN_modifiers */ comment follows
// the 0 it names, whereas /* clamp */ and /* op_sel */ precede theirs.
def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
2498+
(V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */
2499+
(i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
2500+
0, /* src1_modifiers */
2501+
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)),
2502+
0, /* src2_modifiers */
2503+
$src1, /* clamp */ 0, /* op_sel */ 0)
2504+
>;
2505+
2506+
// Same selection as the masked-amount pattern, but for a shift amount that
// matches ShiftAmt32Imm: src0 = sub1 of $src0, src1 = sub0 of $src0,
// src2 = the immediate $src1.
// Label convention in this pattern: each /* srcN_modifiers */ comment follows
// the 0 it names, whereas /* clamp */ and /* op_sel */ precede theirs.
def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
2507+
(V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */
2508+
(i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
2509+
0, /* src1_modifiers */
2510+
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)),
2511+
0, /* src2_modifiers */
2512+
$src1, /* clamp */ 0, /* op_sel */ 0)
2513+
>;
2514+
2515+
// Selects a 32-bit fshr directly to V_ALIGNBIT_B32_fake16_e64, forwarding the
// three fshr operands unchanged as src0, src1 and src2 with every modifier,
// clamp and op_sel set to 0.
def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
2516+
(V_ALIGNBIT_B32_fake16_e64 /* src0_modifiers */ 0, $src0,
2517+
/* src1_modifiers */ 0, $src1,
2518+
/* src2_modifiers */ 0,
2519+
$src2, /* clamp */ 0, /* op_sel */ 0)
2520+
>;
2521+
} // end True16Predicate = UseFakeTrue16Insts
24852522

24862523
/********** ====================== **********/
24872524
/********** Indirect addressing **********/
@@ -2984,15 +3021,35 @@ def : GCNPat <
29843021
(i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
29853022
>;
29863023

3024+
// 32-bit bswap for subtargets matching NotHasTrue16BitInsts: V_BFI_B32 with
// the mask 0x00ff00ff merges two V_ALIGNBIT_B32 results that each take $a as
// both sources, one with amount 24 and one with amount 8.
let True16Predicate = NotHasTrue16BitInsts in
29873025
def : GCNPat <
29883026
(i32 (bswap i32:$a)),
29893027
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
29903028
(V_ALIGNBIT_B32_e64 VSrc_b32:$a, VSrc_b32:$a, (i32 24)),
29913029
(V_ALIGNBIT_B32_e64 VSrc_b32:$a, VSrc_b32:$a, (i32 8)))
29923030
>;
29933031

3032+
// 32-bit bswap under UseFakeTrue16Insts: V_BFI_B32 with the mask 0x00ff00ff
// merges two V_ALIGNBIT_B32_fake16_e64 results that each take $a as both
// sources, one with amount 24 and one with amount 8.
// Label convention in this pattern: each /* srcN_modifiers */ comment follows
// the 0 it names, whereas /* clamp */ and /* op_sel */ precede theirs.
let True16Predicate = UseFakeTrue16Insts in
3033+
def : GCNPat <
3034+
(i32 (bswap i32:$a)),
3035+
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
3036+
(V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */
3037+
VSrc_b32:$a,
3038+
0, /* src1_modifiers */
3039+
VSrc_b32:$a,
3040+
0, /* src2_modifiers */
3041+
(i32 24), /* clamp */ 0, /* op_sel */ 0),
3042+
(V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */
3043+
VSrc_b32:$a,
3044+
0, /* src1_modifiers */
3045+
VSrc_b32:$a,
3046+
0, /* src2_modifiers */
3047+
(i32 8), /* clamp */ 0, /* op_sel */ 0))
3048+
>;
3049+
29943050
// FIXME: This should have been narrowed to i32 during legalization.
29953051
// This pattern should also be skipped for GlobalISel
3052+
let True16Predicate = NotHasTrue16BitInsts in
29963053
def : GCNPat <
29973054
(i64 (bswap i64:$a)),
29983055
(REG_SEQUENCE VReg_64,
@@ -3014,6 +3071,40 @@ def : GCNPat <
30143071
sub1)
30153072
>;
30163073

3074+
// 64-bit bswap under UseFakeTrue16Insts. Swapping the bytes of an i64 swaps
// its two 32-bit halves and byte-swaps each of them, so:
//   result sub0 = bswap32(sub1 of $a)
//   result sub1 = bswap32(sub0 of $a)
// Each 32-bit swap is the same idiom as the i32 bswap pattern: V_BFI_B32 with
// mask 0x00ff00ff merging two V_ALIGNBIT_B32_fake16_e64 results of ONE value
// (passed as both src0 and src1), with amounts 24 and 8. Both ALIGNBITs that
// feed a given BFI must therefore read the same subregister.
// Label convention in this pattern: each /* srcN_modifiers */ comment follows
// the 0 it names, whereas /* clamp */ and /* op_sel */ precede theirs.
let True16Predicate = UseFakeTrue16Insts in
3075+
def : GCNPat <
3076+
(i64 (bswap i64:$a)),
3077+
(REG_SEQUENCE VReg_64,
3078+
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
3079+
(V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */
3080+
(i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
3081+
0, /* src1_modifiers */
3082+
(i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
3083+
0, /* src2_modifiers */
3084+
(i32 24), /* clamp */ 0, /* op_sel */ 0),
3085+
(V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */
3086+
(i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
3087+
0, /* src1_modifiers */
3088+
(i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
3089+
0, /* src2_modifiers */
3090+
(i32 8), /* clamp */ 0, /* op_sel */ 0)),
3091+
sub0,
3092+
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
3093+
(V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */
3094+
(i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
3095+
0, /* src1_modifiers */
3096+
(i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
3097+
0, /* src2_modifiers */
3098+
(i32 24), /* clamp */ 0, /* op_sel */ 0),
3099+
(V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */
3100+
(i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
3101+
0, /* src1_modifiers */
3102+
(i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
3103+
0, /* src2_modifiers */
3104+
(i32 8), /* clamp */ 0, /* op_sel */ 0)),
3105+
sub1)
3106+
>;
3107+
30173108
// FIXME: The AddedComplexity should not be needed, but in GlobalISel
30183109
// the BFI pattern ends up taking precedence without it.
30193110
let SubtargetPredicate = isGFX8Plus, AddedComplexity = 1 in {
@@ -3379,6 +3470,7 @@ def : GCNPat <
33793470

33803471
// Take the upper 16 bits from V[0] and the lower 16 bits from V[1]
33813472
// Special case, can use V_ALIGNBIT (always uses encoded literal)
3473+
let True16Predicate = NotHasTrue16BitInsts in
33823474
def : GCNPat <
33833475
(vecTy (DivergentBinFrag<build_vector>
33843476
(Ty !if(!eq(Ty, i16),
@@ -3388,6 +3480,16 @@ def : GCNPat <
33883480
(V_ALIGNBIT_B32_e64 VGPR_32:$b, VGPR_32:$a, (i32 16))
33893481
>;
33903482

3483+
// UseFakeTrue16Insts variant of the build_vector special case: the first
// build_vector operand is $a shifted right by 16 and truncated (bitconverted
// from i16 when Ty is not i16), the second is $b. Selected as a single
// V_ALIGNBIT_B32_fake16_e64 with src0 = $b, src1 = $a and src2 = 16.
// Operand order in the output: src0_modifiers, src0, src1_modifiers, src1,
// src2_modifiers, src2, clamp, op_sel.
// NOTE(review): the amount is written as (i16 16) here, while the fake16
// bswap patterns in this file write their amounts as (i32 24) / (i32 8) --
// confirm which immediate type is intended for src2.
let True16Predicate = UseFakeTrue16Insts in
3484+
def : GCNPat <
3485+
(vecTy (DivergentBinFrag<build_vector>
3486+
(Ty !if(!eq(Ty, i16),
3487+
(Ty (trunc (srl VGPR_32:$a, (i32 16)))),
3488+
(Ty (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))))),
3489+
(Ty VGPR_32:$b))),
3490+
(V_ALIGNBIT_B32_fake16_e64 0, VGPR_32:$b, 0, VGPR_32:$a, 0, (i16 16), 0, 0)
3491+
>;
3492+
33913493
// Take the upper 16 bits from each VGPR_32 and concat them
33923494
def : GCNPat <
33933495
(vecTy (DivergentBinFrag<build_vector>

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -211,10 +211,11 @@ defm V_CUBEMA_F32 : VOP3Inst <"v_cubema_f32", VOP3_Profile<VOP_F32_F32_F32_F32>,
211211
defm V_BFE_U32 : VOP3Inst <"v_bfe_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_u32>;
212212
defm V_BFE_I32 : VOP3Inst <"v_bfe_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_i32>;
213213
defm V_BFI_B32 : VOP3Inst <"v_bfi_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfi>;
214-
defm V_ALIGNBIT_B32 : VOP3Inst_t16_with_profiles <"v_alignbit_b32", VOP3_Profile<VOP_I32_I32_I32_I32>,
215-
VOP3_Profile_True16<VOP_I32_I32_I32_I16, VOP3_OPSEL>,
216-
VOP3_Profile_Fake16<VOP_I32_I32_I32_I16, VOP3_OPSEL>,
217-
fshr, null_frag>;
214+
// v_alignbit_b32 is instantiated with three profiles: an all-i32 VOP3 profile
// plus True16 and Fake16 profiles whose last source type is I16 and which use
// VOP3_OPSEL.
// NOTE(review): presumably `fshr` is the selection node for the all-i32
// variant and `null_frag` leaves the 16-bit variants without an automatic
// pattern -- confirm against the VOP3Inst_t16_with_profiles definition.
defm V_ALIGNBIT_B32 : VOP3Inst_t16_with_profiles <"v_alignbit_b32",
215+
VOP3_Profile<VOP_I32_I32_I32_I32>,
216+
VOP3_Profile_True16<VOP_I32_I32_I32_I16, VOP3_OPSEL>,
217+
VOP3_Profile_Fake16<VOP_I32_I32_I32_I16, VOP3_OPSEL>,
218+
fshr, null_frag>;
218219

219220
defm V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbyte>;
220221

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
44
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
55
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
6-
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
6+
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX11 %s
77

88
---
99

@@ -23,6 +23,15 @@ body: |
2323
; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
2424
; GCN-NEXT: [[V_ALIGNBIT_B32_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_e64 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec
2525
; GCN-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_e64_]]
26+
;
27+
; GFX11-LABEL: name: fshr_s32
28+
; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2
29+
; GFX11-NEXT: {{ $}}
30+
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
31+
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
32+
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
33+
; GFX11-NEXT: [[V_ALIGNBIT_B32_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
34+
; GFX11-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_fake16_e64_]]
2635
%0:vgpr(s32) = COPY $vgpr0
2736
%1:vgpr(s32) = COPY $vgpr1
2837
%2:vgpr(s32) = COPY $vgpr2

0 commit comments

Comments
 (0)