Skip to content

Commit b36f89f

Browse files
authored
[AMDGPU] Make rotr illegal (#166558)
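fshr is already legal and is strictly more powerful than rotr (a rotate right is just a funnel shift right with both data operands equal: rotr(x, n) == fshr(x, x, n)), so we should only need selection patterns for fshr.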
1 parent 740a3ad commit b36f89f

File tree

10 files changed

+465
-248
lines changed

10 files changed

+465
-248
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -504,9 +504,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
504504
// The hardware supports 32-bit FSHR, but not FSHL.
505505
setOperationAction(ISD::FSHR, MVT::i32, Legal);
506506

507-
// The hardware supports 32-bit ROTR, but not ROTL.
508-
setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
509-
setOperationAction(ISD::ROTR, MVT::i64, Expand);
507+
setOperationAction({ISD::ROTL, ISD::ROTR}, {MVT::i32, MVT::i64}, Expand);
510508

511509
setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);
512510

llvm/lib/Target/AMDGPU/AMDGPUInstructions.td

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -808,12 +808,6 @@ class DwordAddrPat<ValueType vt, RegisterClass rc> : AMDGPUPat <
808808
(vt rc:$addr)
809809
>;
810810

811-
// rotr pattern
812-
class ROTRPattern <Instruction BIT_ALIGN> : AMDGPUPat <
813-
(rotr i32:$src0, i32:$src1),
814-
(BIT_ALIGN $src0, $src0, $src1)
815-
>;
816-
817811
// Special conversion patterns
818812

819813
def cvt_rpi_i32_f32 : PatFrag <

llvm/lib/Target/AMDGPU/EvergreenInstructions.td

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -505,7 +505,6 @@ def : AMDGPUPat <
505505
(fshr i32:$src0, i32:$src1, i32:$src2),
506506
(BIT_ALIGN_INT_eg $src0, $src1, $src2)
507507
>;
508-
def : ROTRPattern <BIT_ALIGN_INT_eg>;
509508
def MULADD_eg : MULADD_Common<0x14>;
510509
def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>;
511510
def FMA_eg : FMA_Common<0x7>;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14042,6 +14042,12 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
1404214042
assert(OtherOp.getValueSizeInBits() == 32);
1404314043
}
1404414044

14045+
// Check that we haven't just recreated the same FSHR node.
14046+
if (N->getOpcode() == ISD::FSHR &&
14047+
(N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
14048+
(N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
14049+
return SDValue();
14050+
1404514051
if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
1404614052

1404714053
assert(Op.getValueType().isByteSized() &&

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 0 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2685,8 +2685,6 @@ def : AMDGPUPat <
26852685

26862686
let True16Predicate = NotHasTrue16BitInsts in {
26872687
let SubtargetPredicate = isNotGFX9Plus in {
2688-
def : ROTRPattern <V_ALIGNBIT_B32_e64>;
2689-
26902688
def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))),
26912689
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
26922690
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
@@ -2697,14 +2695,6 @@ def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:
26972695
} // isNotGFX9Plus
26982696

26992697
let SubtargetPredicate = isGFX9GFX10 in {
2700-
def : GCNPat <
2701-
(rotr i32:$src0, i32:$src1),
2702-
(V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0,
2703-
/* src1_modifiers */ 0, $src0,
2704-
/* src2_modifiers */ 0,
2705-
$src1, /* clamp */ 0, /* op_sel */ 0)
2706-
>;
2707-
27082698
foreach pat = [(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))),
27092699
(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:$src1))))] in
27102700
def : GCNPat<pat,
@@ -2726,15 +2716,6 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
27262716
} // end True16Predicate = NotHasTrue16BitInsts
27272717

27282718
let True16Predicate = UseRealTrue16Insts in {
2729-
def : GCNPat <
2730-
(rotr i32:$src0, i32:$src1),
2731-
(V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
2732-
/* src1_modifiers */ 0, $src0,
2733-
/* src2_modifiers */ 0,
2734-
(EXTRACT_SUBREG $src1, lo16),
2735-
/* clamp */ 0, /* op_sel */ 0)
2736-
>;
2737-
27382719
def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
27392720
(V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */
27402721
(i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
@@ -2753,14 +2734,6 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
27532734
} // end True16Predicate = UseRealTrue16Insts
27542735

27552736
let True16Predicate = UseFakeTrue16Insts in {
2756-
def : GCNPat <
2757-
(rotr i32:$src0, i32:$src1),
2758-
(V_ALIGNBIT_B32_fake16_e64 /* src0_modifiers */ 0, $src0,
2759-
/* src1_modifiers */ 0, $src0,
2760-
/* src2_modifiers */ 0,
2761-
$src1, /* clamp */ 0, /* op_sel */ 0)
2762-
>;
2763-
27642737
def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))),
27652738
(V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */
27662739
(i32 (EXTRACT_SUBREG (i64 $src0), sub1)),

llvm/test/CodeGen/AMDGPU/packetizer.ll

Lines changed: 16 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -5,43 +5,37 @@
55
define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) {
66
; R600-LABEL: test:
77
; R600: ; %bb.0: ; %entry
8-
; R600-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
8+
; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
99
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1010
; R600-NEXT: CF_END
1111
; R600-NEXT: PAD
1212
; R600-NEXT: ALU clause starting at 4:
13-
; R600-NEXT: ADD_INT T0.Y, KC0[3].X, 1,
14-
; R600-NEXT: ADD_INT T0.Z, KC0[3].Y, 1,
15-
; R600-NEXT: ADD_INT T0.W, KC0[2].Z, 1,
16-
; R600-NEXT: ADD_INT * T1.W, KC0[2].W, 1,
17-
; R600-NEXT: BIT_ALIGN_INT T0.X, PS, PS, KC0[3].Z,
18-
; R600-NEXT: BIT_ALIGN_INT T1.Y, PV.W, PV.W, KC0[3].Z,
19-
; R600-NEXT: BIT_ALIGN_INT T0.Z, PV.Z, PV.Z, KC0[3].Z,
20-
; R600-NEXT: BIT_ALIGN_INT * T0.W, PV.Y, PV.Y, KC0[3].Z,
21-
; R600-NEXT: OR_INT T0.W, PV.W, PV.Z,
22-
; R600-NEXT: OR_INT * T1.W, PV.Y, PV.X,
23-
; R600-NEXT: OR_INT T0.X, PS, PV.W,
13+
; R600-NEXT: ADD_INT T0.Y, KC0[2].W, 1,
14+
; R600-NEXT: ADD_INT T0.Z, KC0[2].Z, 1,
15+
; R600-NEXT: ADD_INT T0.W, KC0[3].Y, 1,
16+
; R600-NEXT: ADD_INT * T1.W, KC0[3].X, 1,
17+
; R600-NEXT: OR_INT T0.W, PS, PV.W,
18+
; R600-NEXT: OR_INT * T1.W, PV.Z, PV.Y,
19+
; R600-NEXT: OR_INT * T0.W, PS, PV.W,
20+
; R600-NEXT: BIT_ALIGN_INT T0.X, PV.W, PV.W, KC0[3].Z,
2421
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
2522
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2623
;
2724
; CM-LABEL: test:
2825
; CM: ; %bb.0: ; %entry
29-
; CM-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
26+
; CM-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
3027
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
3128
; CM-NEXT: CF_END
3229
; CM-NEXT: PAD
3330
; CM-NEXT: ALU clause starting at 4:
34-
; CM-NEXT: ADD_INT T0.X, KC0[3].X, 1,
35-
; CM-NEXT: ADD_INT T0.Y, KC0[3].Y, 1,
36-
; CM-NEXT: ADD_INT T0.Z, KC0[2].Z, 1,
37-
; CM-NEXT: ADD_INT * T0.W, KC0[2].W, 1,
38-
; CM-NEXT: BIT_ALIGN_INT T1.X, PV.W, PV.W, KC0[3].Z,
39-
; CM-NEXT: BIT_ALIGN_INT T1.Y, PV.Z, PV.Z, KC0[3].Z,
40-
; CM-NEXT: BIT_ALIGN_INT T0.Z, PV.Y, PV.Y, KC0[3].Z,
41-
; CM-NEXT: BIT_ALIGN_INT * T0.W, PV.X, PV.X, KC0[3].Z,
31+
; CM-NEXT: ADD_INT T0.X, KC0[2].W, 1,
32+
; CM-NEXT: ADD_INT T0.Y, KC0[2].Z, 1,
33+
; CM-NEXT: ADD_INT T0.Z, KC0[3].Y, 1,
34+
; CM-NEXT: ADD_INT * T0.W, KC0[3].X, 1,
4235
; CM-NEXT: OR_INT T0.Z, PV.W, PV.Z,
4336
; CM-NEXT: OR_INT * T0.W, PV.Y, PV.X,
44-
; CM-NEXT: OR_INT * T0.X, PV.W, PV.Z,
37+
; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
38+
; CM-NEXT: BIT_ALIGN_INT * T0.X, PV.W, PV.W, KC0[3].Z,
4539
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
4640
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4741
entry:

llvm/test/CodeGen/AMDGPU/permute.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,14 +118,13 @@ define amdgpu_kernel void @lsh8_or_lsr24(ptr addrspace(1) nocapture %arg, i32 %a
118118
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
119119
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
120120
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
121-
; GCN-NEXT: v_mov_b32_e32 v3, 0x2010007
122121
; GCN-NEXT: s_waitcnt lgkmcnt(0)
123122
; GCN-NEXT: v_mov_b32_e32 v1, s1
124123
; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0
125124
; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
126125
; GCN-NEXT: flat_load_dword v2, v[0:1]
127126
; GCN-NEXT: s_waitcnt vmcnt(0)
128-
; GCN-NEXT: v_perm_b32 v2, s2, v2, v3
127+
; GCN-NEXT: v_alignbit_b32 v2, v2, s2, 24
129128
; GCN-NEXT: flat_store_dword v[0:1], v2
130129
; GCN-NEXT: s_endpgm
131130
bb:

0 commit comments

Comments
 (0)