Commit 5924f46

Instead of optimization parameters, have our own optimization pipeline.
Disable the vectorizers in it; they aren't generally useful for GPU code.
1 parent 629d2d3

3 files changed: +102 -43 lines changed
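
In short, back-ends no longer tune a set of optimization parameters but hook
into the pipeline directly. A sketch of the change in extension points, using
only the signatures from the diffs below:

    # before: per-target knobs consumed by the shared optimizer
    optimization_params(@nospecialize(job::CompilerJob)) = GPUOptimizationParams()

    # after: a per-target hook for running extra passes once the shared,
    # vectorizer-free pipeline has finished
    optimize_module!(@nospecialize(job::CompilerJob), mod::LLVM.Module) = return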

src/interface.jl

Lines changed: 3 additions & 4 deletions
@@ -189,6 +189,9 @@ function process_entry!(@nospecialize(job::CompilerJob), mod::LLVM.Module,
     return entry
 end

+# post-Julia optimization processing of the module
+optimize_module!(@nospecialize(job::CompilerJob), mod::LLVM.Module) = return
+
 # final processing of the IR module, right before validation and machine-code generation
 finish_module!(@nospecialize(job::CompilerJob), mod::LLVM.Module) = return

@@ -216,7 +219,3 @@ function llvm_debug_info(@nospecialize(job::CompilerJob))
         LLVM.API.LLVMDebugEmissionKindFullDebug
     end
 end
-
-# optimization
-optimization_params(@nospecialize(job::CompilerJob)) = GPUOptimizationParams()
-optimize_module!(@nospecialize(job::CompilerJob), mod::LLVM.Module) = return
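
A minimal sketch of how a back-end might use the new hook. The
FooCompilerTarget type is hypothetical and the pass shown is just one this
commit already wraps; this is illustrative, not part of the commit:

    # hypothetical back-end running an extra clean-up pass after the shared pipeline
    function optimize_module!(job::CompilerJob{FooCompilerTarget}, mod::LLVM.Module)
        ModulePassManager() do pm
            cfgsimplification!(pm)
            run!(pm, mod)
        end
    end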

src/optim.jl

Lines changed: 99 additions & 31 deletions
@@ -1,49 +1,115 @@
 # LLVM IR optimization

-Base.@kwdef struct GPUOptimizationParams
-    julia::Bool = true
-    intrinsics::Bool = true
-    ipo::Bool = true
+function addTargetPasses!(pm, tm, triple)
+    add_library_info!(pm, triple)
+    add_transform_info!(pm, tm)
+end

-    optlevel::Int = Base.JLOptions().opt_level
+# Based on Julia's optimization pipeline, minus the SLP and loop vectorizers.
+function addOptimizationPasses!(pm)
+    constant_merge!(pm)
+
+    propagate_julia_addrsp!(pm)
+    scoped_no_alias_aa!(pm)
+    type_based_alias_analysis!(pm)
+    basic_alias_analysis!(pm)
+    cfgsimplification!(pm)
+    dce!(pm)
+    scalar_repl_aggregates!(pm)
+
+    #mem_cpy_opt!(pm)
+
+    always_inliner!(pm) # respect always_inline
+
+    # Running `memcpyopt` between this and `sroa` seems to give `sroa` a hard
+    # time merging the `alloca` for the unboxed data and the `alloca` created
+    # by the `alloc_opt` pass.
+
+    alloc_opt!(pm)
+    # consider AggressiveInstCombinePass at optlevel > 2
+
+    instruction_combining!(pm)
+    cfgsimplification!(pm)
+    scalar_repl_aggregates!(pm)
+    instruction_combining!(pm) # TODO: createInstSimplifyLegacy
+    jump_threading!(pm)
+
+    reassociate!(pm)
+
+    early_cse!(pm)
+
+    # Load forwarding above can expose allocations that aren't actually used;
+    # remove those before optimizing loops.
+    alloc_opt!(pm)
+    loop_rotate!(pm)
+    # moving IndVarSimplify here prevented removing the loop in perf_sumcartesian(10:-1:1)
+    loop_idiom!(pm)
+
+    # LoopRotate strips metadata from terminators, so run LowerSIMD afterwards
+    lower_simdloop!(pm) # annotate loops marked with "loopinfo" as LLVM parallel loops
+    licm!(pm)
+    julia_licm!(pm)
+    # subsequent passes don't strip metadata from terminators
+    instruction_combining!(pm) # TODO: createInstSimplifyLegacy
+    ind_var_simplify!(pm)
+    loop_deletion!(pm)
+    loop_unroll!(pm) # TODO: Julia uses createSimpleLoopUnroll here
+
+    # Run our own SROA on heap objects before LLVM's
+    alloc_opt!(pm)
+    # Re-run SROA after loop unrolling (useful for small loops that operate
+    # over the structure of an aggregate)
+    scalar_repl_aggregates!(pm)
+    instruction_combining!(pm) # TODO: createInstSimplifyLegacy
+
+    gvn!(pm)
+    mem_cpy_opt!(pm)
+    sccp!(pm)
+
+    # Run instcombine after redundancy elimination to exploit opportunities
+    # opened up by it. This needs to be InstCombine instead of InstSimplify
+    # to allow loops over Union-typed arrays to vectorize.
+    instruction_combining!(pm)
+    jump_threading!(pm)
+    dead_store_elimination!(pm)
+
+    # More dead allocation (store) deletion before loop optimization
+    # (consider removing this)
+    alloc_opt!(pm)
+
+    # See if all of the constant folding has exposed more loops to
+    # simplification and deletion; this helps significantly with cleaning up
+    # iteration.
+    cfgsimplification!(pm)
+    loop_deletion!(pm)
+    instruction_combining!(pm)
+    #loop_vectorize!(pm)
+    # TODO: createLoopLoadEliminationPass
+    cfgsimplification!(pm)
+
+    aggressive_dce!(pm)
 end

 function optimize!(@nospecialize(job::CompilerJob), mod::LLVM.Module)
     triple = llvm_triple(job.target)
     tm = llvm_machine(job.target)

-    function initialize!(pm)
-        add_library_info!(pm, triple)
-        add_transform_info!(pm, tm)
-    end
-
     global current_job
     current_job = job

-    params = optimization_params(job)
+    ModulePassManager() do pm
+        addTargetPasses!(pm, tm, triple)
+        addOptimizationPasses!(pm)
+        run!(pm, mod)
+    end

-    # Julia-specific optimizations
-    #
     # NOTE: we need to use multiple distinct pass managers to force pass ordering;
     # intrinsics should never get lowered before Julia has optimized them.
+    # XXX: why doesn't the barrier noop pass work here?

     ModulePassManager() do pm
-        initialize!(pm)
-        if params.julia
-            ccall(:jl_add_optimization_passes, Cvoid,
-                  (LLVM.API.LLVMPassManagerRef, Cint, Cint),
-                  pm, params.optlevel, #=lower_intrinsics=# 0)
-        end
-        if params.optlevel < 2
-            # Julia doesn't run the alloc optimizer on lower optimization levels,
-            # but the pass is crucial to remove possibly unsupported malloc calls.
-            alloc_opt!(pm)
-        end
-        run!(pm, mod)
-    end
-
-    params.intrinsics && ModulePassManager() do pm
-        initialize!(pm)
+        addTargetPasses!(pm, tm, triple)

         # lower intrinsics
         add!(pm, FunctionPass("LowerGCFrame", lower_gc_frame!))
@@ -62,6 +128,8 @@ function optimize!(@nospecialize(job::CompilerJob), mod::LLVM.Module)
         run!(pm, mod)
     end

+    # TODO: combine_mul_add and create_div_rem_pairs from addMachinePasses
+
     # target-specific optimizations
     optimize_module!(job, mod)

@@ -73,8 +141,8 @@ function optimize!(@nospecialize(job::CompilerJob), mod::LLVM.Module)
     # part of the LateLowerGCFrame pass) aren't collected properly.
     #
     # these might not always be safe, as Julia's IR metadata isn't designed for IPO.
-    params.ipo && ModulePassManager() do pm
-        initialize!(pm)
+    ModulePassManager() do pm
+        addTargetPasses!(pm, tm, triple)

         dead_arg_elimination!(pm) # parent doesn't use return value --> ret void
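
For reference, a sketch of how the two new helpers compose, mirroring the first
pass-manager block in optimize! above (assuming LLVM.jl is loaded; the
optimize_ir! name is made up for illustration):

    using LLVM

    function optimize_ir!(mod::LLVM.Module, tm::LLVM.TargetMachine)
        ModulePassManager() do pm
            addTargetPasses!(pm, tm, triple(mod))  # alias analyses + target library info
            addOptimizationPasses!(pm)             # Julia-style pipeline, sans vectorizers
            run!(pm, mod)
        end
    end

Keeping this separate from the intrinsics-lowering pass manager matters: as the
NOTE in optimize! says, intrinsics must never get lowered before Julia has
optimized them, and distinct pass managers are how that ordering is enforced.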

src/spirv.jl

Lines changed: 0 additions & 8 deletions
@@ -62,14 +62,6 @@ function finish_module!(job::CompilerJob{SPIRVCompilerTarget}, mod::LLVM.Module)
     end
 end

-# the LLVM to SPIRV translator does not support optimized LLVM IR
-# (KhronosGroup/SPIRV-LLVM-Translator#203). however, not optimizing at all
-# doesn't work either, as we then don't even run the alloc optimizer (resulting
-# in many calls to gpu_alloc that spirv-opt cannot remove), or we even emit
-# 'invalid' IR, like casts to addrspace-less pointers (which aren't allowed
-# in SPIR-V).
-optimization_params(@nospecialize(job::CompilerJob{SPIRVCompilerTarget})) =
-    GPUOptimizationParams(; optlevel=1)
-
 @unlocked function mcgen(job::CompilerJob{SPIRVCompilerTarget}, mod::LLVM.Module,
                          format=LLVM.API.LLVMAssemblyFile)
     # The SPIRV Tools don't handle Julia's debug info, rejecting DW_LANG_Julia...
