
Commit 13023a4

added pass to llc pipeline
1 parent 11f6c0b commit 13023a4

3 files changed: +156 −0 lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 2 additions & 0 deletions
@@ -1222,6 +1222,8 @@ void AMDGPUPassConfig::addIRPasses() {
   if (isPassEnabled(EnableImageIntrinsicOptimizer))
     addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
 
+  if (EnableUniformIntrinsicCombine)
+    addPass(createAMDGPUUniformIntrinsicCombineLegacyPass());
   // This can be disabled by passing ::Disable here or on the command line
   // with --expand-variadics-override=disable.
   addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
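
For context: the two added lines assume a boolean toggle and a legacy-PM factory declared elsewhere in the AMDGPU target. A minimal sketch of what such declarations typically look like follows; the flag string and default value are assumptions for illustration, not taken from this commit.

// Hypothetical sketch only: a cl::opt gate plus the legacy-PM factory used by
// addIRPasses() above. Flag spelling and default are assumed, not from the diff.
#include "llvm/Support/CommandLine.h"

namespace llvm {
class FunctionPass;
FunctionPass *createAMDGPUUniformIntrinsicCombineLegacyPass(); // usually declared in a target header such as AMDGPU.h
} // namespace llvm

static llvm::cl::opt<bool> EnableUniformIntrinsicCombine(
    "amdgpu-enable-uniform-intrinsic-combine", // assumed flag name
    llvm::cl::desc("Enable AMDGPU uniform intrinsic combine pass"),
    llvm::cl::init(true), llvm::cl::Hidden);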
Lines changed: 131 additions & 0 deletions
@@ -0,0 +1,131 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck %s
+
+define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %out) {
+; CHECK-LABEL: readfirstlane_with_readfirstlane:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 5
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+  %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5)
+  %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
+  store i32 %v2, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
+; CHECK-LABEL: readfirstlane_with_readlane:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_bfe_u32 v1, v0, 10, 10
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_readfirstlane_b32 s2, v1
+; CHECK-NEXT: v_readlane_b32 s2, v0, s2
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+  %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+  %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+  %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
+  store i32 %v2, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) {
+; CHECK-LABEL: readlane_with_firstlane:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_readfirstlane_b32 s2, v0
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+  %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 %tidx)
+  %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 3)
+  store i32 %v2, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
+; CHECK-LABEL: readlane_readlane:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_bfe_u32 v1, v0, 10, 10
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_readfirstlane_b32 s2, v1
+; CHECK-NEXT: v_readlane_b32 s2, v0, s2
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+  %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+  %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+  %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 2)
+  store i32 %v2, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
+; CHECK-LABEL: permlane64_uniform:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b32 s2, s[4:5], 0x8
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+  %v = call i32 @llvm.amdgcn.permlane64(i32 %src)
+  store i32 %v, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) {
+; CHECK-LABEL: permlane64_nonuniform:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_permlane64_b32 v1, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %v = call i32 @llvm.amdgcn.permlane64(i32 %tid)
+  %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  store i32 %v, i32 addrspace(1)* %out_ptr
+  ret void
+}
+
+define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %out) {
+; CHECK-LABEL: permlane64_nonuniform_expression:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; CHECK-NEXT: v_add_nc_u32_e32 v1, 1, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT: v_permlane64_b32 v1, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid2 = add i32 %tid, 1
+  %v = call i32 @llvm.amdgcn.permlane64(i32 %tid2)
+  %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  store i32 %v, i32 addrspace(1)* %out_ptr
+  ret void
+}
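
What the checks above exercise is a uniformity-driven fold: when the source operand of readfirstlane/readlane/permlane64 is provably uniform, the broadcast is a no-op and the value can be used directly, while divergent sources (the workitem-id cases) are left untouched. A rough, non-authoritative sketch of that core idea, assuming the pass queries UniformityInfo; helper name and structure are illustrative, not this commit's implementation.

// Illustrative sketch only (not this commit's code): replace a lane-broadcast
// intrinsic whose source operand is already uniform with the operand itself.
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

static bool foldIfSourceUniform(llvm::IntrinsicInst &II,
                                const llvm::UniformityInfo &UI) {
  switch (II.getIntrinsicID()) {
  case llvm::Intrinsic::amdgcn_readfirstlane:
  case llvm::Intrinsic::amdgcn_readlane:
  case llvm::Intrinsic::amdgcn_permlane64: {
    llvm::Value *Src = II.getArgOperand(0);
    if (!UI.isUniform(Src)) // divergent source: keep the intrinsic
      return false;
    II.replaceAllUsesWith(Src); // uniform source: the broadcast is a no-op
    II.eraseFromParent();
    return true;
  }
  default:
    return false;
  }
}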

llvm/test/CodeGen/AMDGPU/llc-pipeline.ll

Lines changed: 23 additions & 0 deletions
@@ -31,6 +31,11 @@
 ; GCN-O0-NEXT: AMDGPU Remove Incompatible Functions
 ; GCN-O0-NEXT: AMDGPU Printf lowering
 ; GCN-O0-NEXT: Lower ctors and dtors for AMDGPU
+; GCN-O0-NEXT: FunctionPass Manager
+; GCN-O0-NEXT: Dominator Tree Construction
+; GCN-O0-NEXT: Cycle Info Analysis
+; GCN-O0-NEXT: Uniformity Analysis
+; GCN-O0-NEXT: AMDGPU uniformIntrinsic Combine
 ; GCN-O0-NEXT: Expand variadic functions
 ; GCN-O0-NEXT: AMDGPU Inline All Functions
 ; GCN-O0-NEXT: Inliner for always_inline functions
@@ -181,6 +186,11 @@
 ; GCN-O1-NEXT: AMDGPU Remove Incompatible Functions
 ; GCN-O1-NEXT: AMDGPU Printf lowering
 ; GCN-O1-NEXT: Lower ctors and dtors for AMDGPU
+; GCN-O1-NEXT: FunctionPass Manager
+; GCN-O1-NEXT: Dominator Tree Construction
+; GCN-O1-NEXT: Cycle Info Analysis
+; GCN-O1-NEXT: Uniformity Analysis
+; GCN-O1-NEXT: AMDGPU uniformIntrinsic Combine
 ; GCN-O1-NEXT: Expand variadic functions
 ; GCN-O1-NEXT: AMDGPU Inline All Functions
 ; GCN-O1-NEXT: Inliner for always_inline functions
@@ -466,6 +476,11 @@
 ; GCN-O1-OPTS-NEXT: AMDGPU Remove Incompatible Functions
 ; GCN-O1-OPTS-NEXT: AMDGPU Printf lowering
 ; GCN-O1-OPTS-NEXT: Lower ctors and dtors for AMDGPU
+; GCN-O1-OPTS-NEXT: FunctionPass Manager
+; GCN-O1-OPTS-NEXT: Dominator Tree Construction
+; GCN-O1-OPTS-NEXT: Cycle Info Analysis
+; GCN-O1-OPTS-NEXT: Uniformity Analysis
+; GCN-O1-OPTS-NEXT: AMDGPU uniformIntrinsic Combine
 ; GCN-O1-OPTS-NEXT: Expand variadic functions
 ; GCN-O1-OPTS-NEXT: AMDGPU Inline All Functions
 ; GCN-O1-OPTS-NEXT: Inliner for always_inline functions
@@ -781,6 +796,10 @@
 ; GCN-O2-NEXT: Lower ctors and dtors for AMDGPU
 ; GCN-O2-NEXT: FunctionPass Manager
 ; GCN-O2-NEXT: AMDGPU Image Intrinsic Optimizer
+; GCN-O2-NEXT: Dominator Tree Construction
+; GCN-O2-NEXT: Cycle Info Analysis
+; GCN-O2-NEXT: Uniformity Analysis
+; GCN-O2-NEXT: AMDGPU uniformIntrinsic Combine
 ; GCN-O2-NEXT: Expand variadic functions
 ; GCN-O2-NEXT: AMDGPU Inline All Functions
 ; GCN-O2-NEXT: Inliner for always_inline functions
@@ -1100,6 +1119,10 @@
 ; GCN-O3-NEXT: Lower ctors and dtors for AMDGPU
 ; GCN-O3-NEXT: FunctionPass Manager
 ; GCN-O3-NEXT: AMDGPU Image Intrinsic Optimizer
+; GCN-O3-NEXT: Dominator Tree Construction
+; GCN-O3-NEXT: Cycle Info Analysis
+; GCN-O3-NEXT: Uniformity Analysis
+; GCN-O3-NEXT: AMDGPU uniformIntrinsic Combine
 ; GCN-O3-NEXT: Expand variadic functions
 ; GCN-O3-NEXT: AMDGPU Inline All Functions
 ; GCN-O3-NEXT: Inliner for always_inline functions
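
The extra entries in each pipeline dump (Dominator Tree Construction, Cycle Info Analysis, Uniformity Analysis immediately before the combine) are what the legacy pass manager schedules when a pass requests uniformity information. A hedged sketch of the kind of getAnalysisUsage() that produces this shape follows; the class name and bodies are assumptions, not this commit's code.

// Illustrative legacy-PM wrapper skeleton; names and bodies are assumptions.
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/IR/Function.h"
#include "llvm/Pass.h"

namespace {
class AMDGPUUniformIntrinsicCombineLegacy : public llvm::FunctionPass {
public:
  static char ID;
  AMDGPUUniformIntrinsicCombineLegacy() : llvm::FunctionPass(ID) {}

  void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
    // Requesting UniformityInfo is what pulls Dominator Tree Construction and
    // Cycle Info Analysis into the -debug-pass=Structure output shown above.
    AU.addRequired<llvm::UniformityInfoWrapperPass>();
    AU.setPreservesCFG();
  }

  bool runOnFunction(llvm::Function &F) override {
    // The actual combine (see the sketch after the new test file) would run here.
    return false;
  }
};
} // namespace
char AMDGPUUniformIntrinsicCombineLegacy::ID = 0;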
