Skip to content

Commit 7ab30ef

Browse files
committed
VOP3 v_alignbit_b32 mc support true16
1 parent cfdeca3 commit 7ab30ef

17 files changed

+4235
-7302
lines changed

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 98 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2498,6 +2498,7 @@ def : AMDGPUPat <
24982498
$src1), sub1)
24992499
>;
25002500

2501+
let True16Predicate = NotHasTrue16BitInsts in {
25012502
def : ROTRPattern <V_ALIGNBIT_B32_e64>;
25022503

25032504
def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
@@ -2507,6 +2508,42 @@ def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
25072508
def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
25082509
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
25092510
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
2511+
} // end True16Predicate = NotHasTrue16BitInsts
2512+
2513+
let True16Predicate = UseFakeTrue16Insts in {
2514+
// Rotate-right selects to the fake16 V_ALIGNBIT_B32 with the rotated value
// $src0 supplied as both data operands and $src1 as the third operand.
// Every source-modifier field, clamp and op_sel is passed as literal 0.
def : GCNPat <
2515+
(rotr i32:$src0, i32:$src1),
2516+
(V_ALIGNBIT_B32_fake16_e64 /* src0_modifiers */ 0, $src0,
2517+
/* src1_modifiers */ 0, $src0,
2518+
/* src2_modifiers */ 0,
2519+
$src1, /* clamp */ 0, /* op_sel */ 0)
2520+
>;
2521+
2522+
// i32 truncation of a 64-bit logical right shift whose amount is masked with
// 31: selected to one fake16 V_ALIGNBIT_B32 that takes the sub1 half of $src0
// as src0, the sub0 half as src1, and the unmasked $src1 as src2.
// Each /* label */ below names the literal that follows it.
def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
2523+
(V_ALIGNBIT_B32_fake16_e64 /* src0_modifiers */ 0,
2524+
(i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
2525+
/* src1_modifiers */ 0,
2526+
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)),
2527+
/* src2_modifiers */ 0,
2528+
$src1, /* clamp */ 0, /* op_sel */ 0)
2529+
>;
2530+
2531+
// Same selection as the masked-shift pattern, but for a shift amount that is
// an immediate accepted by ShiftAmt32Imm: the sub1 half of $src0 is src0, the
// sub0 half is src1, and the immediate $src1 is src2.
// Each /* label */ below names the literal that follows it.
def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
2532+
(V_ALIGNBIT_B32_fake16_e64 /* src0_modifiers */ 0,
2533+
(i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
2534+
/* src1_modifiers */ 0,
2535+
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)),
2536+
/* src2_modifiers */ 0,
2537+
$src1, /* clamp */ 0, /* op_sel */ 0)
2538+
>;
2539+
2540+
// fshr selects directly to the fake16 V_ALIGNBIT_B32: $src0, $src1 and $src2
// are forwarded in order, with all modifier fields, clamp and op_sel set to 0.
def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
2541+
(V_ALIGNBIT_B32_fake16_e64 /* src0_modifiers */ 0, $src0,
2542+
/* src1_modifiers */ 0, $src1,
2543+
/* src2_modifiers */ 0,
2544+
$src2, /* clamp */ 0, /* op_sel */ 0)
2545+
>;
2546+
} // end True16Predicate = UseFakeTrue16Insts
25102547

25112548
/********** ====================== **********/
25122549
/********** Indirect addressing **********/
@@ -3014,35 +3051,69 @@ def : GCNPat <
30143051
(i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
30153052
>;
30163053

3054+
let True16Predicate = NotHasTrue16BitInsts in
30173055
def : GCNPat <
30183056
(i32 (bswap i32:$a)),
30193057
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
30203058
(V_ALIGNBIT_B32_e64 VSrc_b32:$a, VSrc_b32:$a, (i32 24)),
30213059
(V_ALIGNBIT_B32_e64 VSrc_b32:$a, VSrc_b32:$a, (i32 8)))
30223060
>;
30233061

3024-
// FIXME: This should have been narrowed to i32 during legalization.
3025-
// This pattern should also be skipped for GlobalISel
3062+
// Fake16 counterpart of the NotHasTrue16BitInsts bswap pattern: a 32-bit byte
// swap is V_BFI_B32 with the mask 0x00ff00ff applied to two V_ALIGNBIT_B32
// results, each fed $a as both data operands, with amounts 24 and 8.
// Each /* label */ below names the literal that follows it.
let True16Predicate = UseFakeTrue16Insts in
3063+
def : GCNPat <
3064+
(i32 (bswap i32:$a)),
3065+
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
3066+
(V_ALIGNBIT_B32_fake16_e64 /* src0_modifiers */ 0,
3067+
VSrc_b32:$a,
3068+
/* src1_modifiers */ 0,
3069+
VSrc_b32:$a,
3070+
/* src2_modifiers */ 0,
3071+
(i32 24), /* clamp */ 0, /* op_sel */ 0),
3072+
(V_ALIGNBIT_B32_fake16_e64 /* src0_modifiers */ 0,
3073+
VSrc_b32:$a,
3074+
/* src1_modifiers */ 0,
3075+
VSrc_b32:$a,
3076+
/* src2_modifiers */ 0,
3077+
(i32 8), /* clamp */ 0, /* op_sel */ 0))
3078+
>;
3079+
3080+
// Helper that builds the output dag for one V_ALIGNBIT_B32 use so a pattern
// can be written once for both instruction forms.
//   op1, op2, op3 - the three operand dags, in src0/src1/src2 order.
//   isTrue16      - 1 selects V_ALIGNBIT_B32_fake16_e64, 0 selects
//                   V_ALIGNBIT_B32_e64.
//   ret           - the assembled dag.
// NoMods is a one-element dag holding literal 0 in the fake16 case and an
// empty dag otherwise, so the !con below yields
//   (inst 0, op1, 0, op2, 0, op3, 0, 0)   for fake16
//   (inst op1, op2, op3)                  otherwise.
// The hand-written fake16 patterns in this file label those zeros as
// src0_modifiers, src1_modifiers, src2_modifiers, clamp and op_sel.
class AlignBit32Inst<dag op1, dag op2, dag op3, bit isTrue16> {
3081+
defvar inst = !if(isTrue16, V_ALIGNBIT_B32_fake16_e64, V_ALIGNBIT_B32_e64);
3082+
defvar NoMods = !if(isTrue16, (inst 0), (inst));
3083+
dag ret = !con(NoMods, (inst op1), NoMods, (inst op2),
3084+
NoMods, (inst op3), NoMods, NoMods);
3085+
}
3086+
3087+
// 64-bit bswap pattern, parameterised on which V_ALIGNBIT_B32 form to emit.
// The result is a REG_SEQUENCE of two 32-bit byte swaps (V_BFI_B32 with mask
// 0x00ff00ff over two alignbit results, amounts 24 and 8). The halves cross
// over: the value placed in sub0 is computed from the source's sub1, and the
// value placed in sub1 is computed from the source's sub0.
// hasTrue16 is forwarded to AlignBit32Inst as its isTrue16 argument.
multiclass bswapi64ExtPat<bit hasTrue16> {
30263088
def : GCNPat <
30273089
(i64 (bswap i64:$a)),
30283090
(REG_SEQUENCE VReg_64,
30293091
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
3030-
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
3031-
(i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
3032-
(i32 24)),
3033-
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
3034-
(i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
3035-
(i32 8))),
3092+
AlignBit32Inst<(i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
3093+
(i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
3094+
(i32 24), hasTrue16>.ret,
3095+
AlignBit32Inst<(i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
3096+
(i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
3097+
(i32 8), hasTrue16>.ret),
30363098
sub0,
30373099
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
3038-
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
3039-
(i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
3040-
(i32 24)),
3041-
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
3042-
(i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
3043-
(i32 8))),
3100+
AlignBit32Inst<(i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
3101+
(i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
3102+
(i32 24), hasTrue16>.ret,
3103+
AlignBit32Inst<(i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
3104+
(i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
3105+
(i32 8), hasTrue16>.ret),
30443106
sub1)
30453107
>;
3108+
}
3109+
3110+
// FIXME: This should have been narrowed to i32 during legalization.
3111+
// This pattern should also be skipped for GlobalISel
3112+
let True16Predicate = NotHasTrue16BitInsts in
3113+
defm : bswapi64ExtPat</*hasTrue16*/0>;
3114+
3115+
let True16Predicate = UseFakeTrue16Insts in
3116+
defm : bswapi64ExtPat</*hasTrue16*/1>;
30463117

30473118
// FIXME: The AddedComplexity should not be needed, but in GlobalISel
30483119
// the BFI pattern ends up taking precedence without it.
@@ -3455,8 +3526,7 @@ def : GCNPat <
34553526

34563527
// Take the upper 16 bits from V[0] and the lower 16 bits from V[1]
34573528
// Special case, can use V_ALIGNBIT (always uses encoded literal)
3458-
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
3459-
let True16Predicate = p in {
3529+
let True16Predicate = NotHasTrue16BitInsts in
34603530
def : GCNPat <
34613531
(vecTy (DivergentBinFrag<build_vector>
34623532
(Ty !if(!eq(Ty, i16),
@@ -3466,7 +3536,19 @@ def : GCNPat <
34663536
(V_ALIGNBIT_B32_e64 VGPR_32:$b, VGPR_32:$a, (i32 16))
34673537
>;
34683538

3539+
// Fake16 form of the NotHasTrue16BitInsts pattern above: a divergent
// build_vector whose first element is the upper 16 bits of $a (srl by 16, then
// trunc) and whose second element is $b selects to one V_ALIGNBIT_B32.
// Operand order of the output, following the labelled fake16 patterns in this
// file: src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2,
// clamp, op_sel.
// NOTE(review): the amount is written as (i16 16) here, while the fake16 bswap
// patterns pass (i32 24) / (i32 8) in the same operand slot - confirm which
// type the fake16 operand is meant to take.
let True16Predicate = UseFakeTrue16Insts in
3540+
def : GCNPat <
3541+
(vecTy (DivergentBinFrag<build_vector>
3542+
(Ty !if(!eq(Ty, i16),
3543+
(Ty (trunc (srl VGPR_32:$a, (i32 16)))),
3544+
(Ty (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))))),
3545+
(Ty VGPR_32:$b))),
3546+
(V_ALIGNBIT_B32_fake16_e64 0, VGPR_32:$b, 0, VGPR_32:$a, 0, (i16 16), 0, 0)
3547+
>;
3548+
34693549
// Take the upper 16 bits from each VGPR_32 and concat them
3550+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
3551+
let True16Predicate = p in
34703552
def : GCNPat <
34713553
(vecTy (DivergentBinFrag<build_vector>
34723554
(Ty !if(!eq(Ty, i16),
@@ -3477,7 +3559,6 @@ def : GCNPat <
34773559
(Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))),
34783560
(V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x07060302)))
34793561
>;
3480-
}
34813562

34823563
} // end foreach Ty
34833564

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -211,9 +211,12 @@ defm V_CUBEMA_F32 : VOP3Inst <"v_cubema_f32", VOP3_Profile<VOP_F32_F32_F32_F32>,
211211
defm V_BFE_U32 : VOP3Inst <"v_bfe_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_u32>;
212212
defm V_BFE_I32 : VOP3Inst <"v_bfe_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_i32>;
213213
defm V_BFI_B32 : VOP3Inst <"v_bfi_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfi>;
214-
defm V_ALIGNBIT_B32 : VOP3Inst <"v_alignbit_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, fshr>;
214+
// V_ALIGNBIT_B32 is now defined through VOP3Inst_t16_with_profiles with three
// profiles: the existing all-I32 profile, plus True16 and Fake16 profiles that
// both use VOP_I32_I32_I32_I16 with VOP3_OPSEL.
// NOTE(review): presumably fshr is the selection node for the non-t16 form and
// null_frag means no node for the t16 form - confirm against the definition of
// VOP3Inst_t16_with_profiles, which is not visible here.
defm V_ALIGNBIT_B32 : VOP3Inst_t16_with_profiles <"v_alignbit_b32",
215+
VOP3_Profile<VOP_I32_I32_I32_I32>,
216+
VOP3_Profile_True16<VOP_I32_I32_I32_I16, VOP3_OPSEL>,
217+
VOP3_Profile_Fake16<VOP_I32_I32_I32_I16, VOP3_OPSEL>,
218+
fshr, null_frag>;
215219

216-
let True16Predicate = NotHasTrue16BitInsts in
217220
defm V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbyte>;
218221
let True16Predicate = UseRealTrue16Insts in
219222
defm V_ALIGNBYTE_B32_t16 : VOP3Inst <"v_alignbyte_b32_t16", VOP3_Profile_True16<VOP_I32_I32_I32_I16, VOP3_OPSEL>>;
@@ -1726,7 +1729,7 @@ defm V_BFI_B32 : VOP3_Realtriple_gfx11_gfx12<0x212>;
17261729
defm V_FMA_F32 : VOP3_Realtriple_gfx11_gfx12<0x213>;
17271730
defm V_FMA_F64 : VOP3_Real_Base_gfx11_gfx12<0x214>;
17281731
defm V_LERP_U8 : VOP3_Realtriple_gfx11_gfx12<0x215>;
1729-
defm V_ALIGNBIT_B32 : VOP3_Realtriple_gfx11_gfx12<0x216>;
1732+
defm V_ALIGNBIT_B32 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x216, "v_alignbit_b32">;
17301733
defm V_ALIGNBYTE_B32 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x217, "v_alignbyte_b32">;
17311734
defm V_MULLIT_F32 : VOP3_Realtriple_gfx11_gfx12<0x218>;
17321735
defm V_MIN3_F32 : VOP3_Realtriple_gfx11<0x219>;

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
44
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
55
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
6-
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
6+
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX11 %s
77

88
---
99

@@ -23,6 +23,15 @@ body: |
2323
; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
2424
; GCN-NEXT: [[V_ALIGNBIT_B32_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_e64 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec
2525
; GCN-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_e64_]]
26+
;
27+
; GFX11-LABEL: name: fshr_s32
28+
; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2
29+
; GFX11-NEXT: {{ $}}
30+
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
31+
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
32+
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
33+
; GFX11-NEXT: [[V_ALIGNBIT_B32_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
34+
; GFX11-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_fake16_e64_]]
2635
%0:vgpr(s32) = COPY $vgpr0
2736
%1:vgpr(s32) = COPY $vgpr1
2837
%2:vgpr(s32) = COPY $vgpr2

0 commit comments

Comments
 (0)