Skip to content

Commit affeeab

Browse files
jrbyrnes and mahesh-attarde
authored and committed
[AMDGPU] Ensure divergence for v_alignbit (llvm#129159)
Selecting vgpr for the uniform version of this pattern may lead to unnecessary vgpr and waterfall loops.
1 parent 150dc29 commit affeeab

40 files changed

+34218
-30762
lines changed

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2660,11 +2660,11 @@ let True16Predicate = NotHasTrue16BitInsts in {
26602660
let SubtargetPredicate = isNotGFX9Plus in {
26612661
def : ROTRPattern <V_ALIGNBIT_B32_e64>;
26622662

2663-
def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
2663+
def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))),
26642664
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
26652665
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
26662666

2667-
def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
2667+
def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
26682668
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
26692669
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
26702670
} // isNotGFX9Plus
@@ -2678,8 +2678,8 @@ def : GCNPat <
26782678
$src1, /* clamp */ 0, /* op_sel */ 0)
26792679
>;
26802680

2681-
foreach pat = [(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
2682-
(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1))))] in
2681+
foreach pat = [(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))),
2682+
(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:$src1))))] in
26832683
def : GCNPat<pat,
26842684
(V_ALIGNBIT_B32_opsel_e64 0, /* src0_modifiers */
26852685
(i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
@@ -2708,7 +2708,7 @@ def : GCNPat <
27082708
/* clamp */ 0, /* op_sel */ 0)
27092709
>;
27102710

2711-
def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
2711+
def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
27122712
(V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */
27132713
(i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
27142714
0, /* src1_modifiers */
@@ -2734,7 +2734,7 @@ def : GCNPat <
27342734
$src1, /* clamp */ 0, /* op_sel */ 0)
27352735
>;
27362736

2737-
def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
2737+
def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))),
27382738
(V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */
27392739
(i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
27402740
0, /* src1_modifiers */
@@ -2743,7 +2743,7 @@ def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
27432743
$src1, /* clamp */ 0, /* op_sel */ 0)
27442744
>;
27452745

2746-
def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
2746+
def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
27472747
(V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */
27482748
(i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
27492749
0, /* src1_modifiers */

llvm/test/CodeGen/AMDGPU/alignbit-pat.ll

Lines changed: 98 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,24 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
12
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
23

3-
; GCN-LABEL: {{^}}alignbit_shr_pat:
4-
; GCN-DAG: s_load_dword s[[SHR:[0-9]+]]
5-
; GCN-DAG: load_dwordx2 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]]
6-
; GCN: v_alignbit_b32 v{{[0-9]+}}, v[[HI]], v[[LO]], s[[SHR]]
7-
84
define amdgpu_kernel void @alignbit_shr_pat(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, i32 %arg2) {
5+
; GCN-LABEL: alignbit_shr_pat:
6+
; GCN: ; %bb.0: ; %bb
7+
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
8+
; GCN-NEXT: s_load_dword s8, s[4:5], 0xd
9+
; GCN-NEXT: s_mov_b32 s7, 0xf000
10+
; GCN-NEXT: s_mov_b32 s6, -1
11+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
12+
; GCN-NEXT: s_mov_b32 s4, s0
13+
; GCN-NEXT: s_mov_b32 s5, s1
14+
; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
15+
; GCN-NEXT: s_mov_b32 s4, s2
16+
; GCN-NEXT: s_mov_b32 s5, s3
17+
; GCN-NEXT: s_and_b32 s0, s8, 31
18+
; GCN-NEXT: s_waitcnt vmcnt(0)
19+
; GCN-NEXT: v_lshr_b64 v[0:1], v[0:1], s0
20+
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
21+
; GCN-NEXT: s_endpgm
922
bb:
1023
%tmp = load i64, ptr addrspace(1) %arg, align 8
1124
%tmp3 = and i32 %arg2, 31
@@ -16,12 +29,24 @@ bb:
1629
ret void
1730
}
1831

19-
; GCN-LABEL: {{^}}alignbit_shr_pat_v:
20-
; GCN-DAG: load_dword v[[SHR:[0-9]+]],
21-
; GCN-DAG: load_dwordx2 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]]
22-
; GCN: v_alignbit_b32 v{{[0-9]+}}, v[[HI]], v[[LO]], v[[SHR]]
23-
2432
define amdgpu_kernel void @alignbit_shr_pat_v(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
33+
; GCN-LABEL: alignbit_shr_pat_v:
34+
; GCN: ; %bb.0: ; %bb
35+
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
36+
; GCN-NEXT: s_mov_b32 s7, 0xf000
37+
; GCN-NEXT: s_mov_b32 s6, 0
38+
; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
39+
; GCN-NEXT: v_mov_b32_e32 v2, 0
40+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
41+
; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
42+
; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
43+
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
44+
; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
45+
; GCN-NEXT: buffer_load_dword v0, v[1:2], s[4:7], 0 addr64
46+
; GCN-NEXT: s_waitcnt vmcnt(0)
47+
; GCN-NEXT: v_alignbit_b32 v0, v4, v3, v0
48+
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
49+
; GCN-NEXT: s_endpgm
2550
bb:
2651
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
2752
%gep1 = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tid
@@ -36,12 +61,24 @@ bb:
3661
ret void
3762
}
3863

39-
; GCN-LABEL: {{^}}alignbit_shr_pat_wrong_and30:
40-
; Negative test, wrong constant
41-
; GCN: v_lshr_b64
42-
; GCN-NOT: v_alignbit_b32
43-
4464
define amdgpu_kernel void @alignbit_shr_pat_wrong_and30(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, i32 %arg2) {
65+
; GCN-LABEL: alignbit_shr_pat_wrong_and30:
66+
; GCN: ; %bb.0: ; %bb
67+
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
68+
; GCN-NEXT: s_load_dword s8, s[4:5], 0xd
69+
; GCN-NEXT: s_mov_b32 s7, 0xf000
70+
; GCN-NEXT: s_mov_b32 s6, -1
71+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
72+
; GCN-NEXT: s_mov_b32 s4, s0
73+
; GCN-NEXT: s_mov_b32 s5, s1
74+
; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
75+
; GCN-NEXT: s_mov_b32 s4, s2
76+
; GCN-NEXT: s_mov_b32 s5, s3
77+
; GCN-NEXT: s_and_b32 s0, s8, 30
78+
; GCN-NEXT: s_waitcnt vmcnt(0)
79+
; GCN-NEXT: v_lshr_b64 v[0:1], v[0:1], s0
80+
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
81+
; GCN-NEXT: s_endpgm
4582
bb:
4683
%tmp = load i64, ptr addrspace(1) %arg, align 8
4784
%tmp3 = and i32 %arg2, 30
@@ -52,12 +89,23 @@ bb:
5289
ret void
5390
}
5491

55-
; GCN-LABEL: {{^}}alignbit_shr_pat_wrong_and63:
56-
; Negative test, wrong constant
57-
; GCN: v_lshr_b64
58-
; GCN-NOT: v_alignbit_b32
59-
6092
define amdgpu_kernel void @alignbit_shr_pat_wrong_and63(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, i32 %arg2) {
93+
; GCN-LABEL: alignbit_shr_pat_wrong_and63:
94+
; GCN: ; %bb.0: ; %bb
95+
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
96+
; GCN-NEXT: s_load_dword s8, s[4:5], 0xd
97+
; GCN-NEXT: s_mov_b32 s7, 0xf000
98+
; GCN-NEXT: s_mov_b32 s6, -1
99+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
100+
; GCN-NEXT: s_mov_b32 s4, s0
101+
; GCN-NEXT: s_mov_b32 s5, s1
102+
; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
103+
; GCN-NEXT: s_mov_b32 s4, s2
104+
; GCN-NEXT: s_mov_b32 s5, s3
105+
; GCN-NEXT: s_waitcnt vmcnt(0)
106+
; GCN-NEXT: v_lshr_b64 v[0:1], v[0:1], s8
107+
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
108+
; GCN-NEXT: s_endpgm
61109
bb:
62110
%tmp = load i64, ptr addrspace(1) %arg, align 8
63111
%tmp3 = and i32 %arg2, 63
@@ -68,11 +116,22 @@ bb:
68116
ret void
69117
}
70118

71-
; GCN-LABEL: {{^}}alignbit_shr_pat_const30:
72-
; GCN: load_dwordx2 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]]
73-
; GCN: v_alignbit_b32 v{{[0-9]+}}, v[[HI]], v[[LO]], 30
74-
75119
define amdgpu_kernel void @alignbit_shr_pat_const30(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
120+
; GCN-LABEL: alignbit_shr_pat_const30:
121+
; GCN: ; %bb.0: ; %bb
122+
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
123+
; GCN-NEXT: s_mov_b32 s7, 0xf000
124+
; GCN-NEXT: s_mov_b32 s6, -1
125+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
126+
; GCN-NEXT: s_mov_b32 s4, s0
127+
; GCN-NEXT: s_mov_b32 s5, s1
128+
; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
129+
; GCN-NEXT: s_mov_b32 s4, s2
130+
; GCN-NEXT: s_mov_b32 s5, s3
131+
; GCN-NEXT: s_waitcnt vmcnt(0)
132+
; GCN-NEXT: v_lshr_b64 v[0:1], v[0:1], 30
133+
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
134+
; GCN-NEXT: s_endpgm
76135
bb:
77136
%tmp = load i64, ptr addrspace(1) %arg, align 8
78137
%tmp5 = lshr i64 %tmp, 30
@@ -81,12 +140,22 @@ bb:
81140
ret void
82141
}
83142

84-
; GCN-LABEL: {{^}}alignbit_shr_pat_wrong_const33:
85-
; Negative test, shift amount more than 31
86-
; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
87-
; GCN-NOT: v_alignbit_b32
88-
89143
define amdgpu_kernel void @alignbit_shr_pat_wrong_const33(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
144+
; GCN-LABEL: alignbit_shr_pat_wrong_const33:
145+
; GCN: ; %bb.0: ; %bb
146+
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
147+
; GCN-NEXT: s_mov_b32 s7, 0xf000
148+
; GCN-NEXT: s_mov_b32 s6, -1
149+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
150+
; GCN-NEXT: s_mov_b32 s4, s2
151+
; GCN-NEXT: s_mov_b32 s5, s3
152+
; GCN-NEXT: s_mov_b32 s2, s6
153+
; GCN-NEXT: s_mov_b32 s3, s7
154+
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4
155+
; GCN-NEXT: s_waitcnt vmcnt(0)
156+
; GCN-NEXT: v_lshrrev_b32_e32 v0, 1, v0
157+
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
158+
; GCN-NEXT: s_endpgm
90159
bb:
91160
%tmp = load i64, ptr addrspace(1) %arg, align 8
92161
%tmp5 = lshr i64 %tmp, 33

0 commit comments

Comments (0)