Skip to content

Commit 123cecb

Browse files
committed
Remove runtime dispatches in lower_and_split_loops and lower (using JETTest), remove some unused code, test size on LowDimArrays
1 parent 832c95c commit 123cecb

File tree

9 files changed

+172
-167
lines changed

9 files changed

+172
-167
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.12.43"
4+
version = "0.12.44"
55

66
[deps]
77
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"

src/codegen/lower_compute.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -375,7 +375,7 @@ function reduce_parent!(q::Expr, ls::LoopSet, op::Operation, opp::Operation, par
375375
if isvectorized(opp)
376376
oppt = opp
377377
elseif isidentityop(opp)
378-
oppt = only(parents(opp))
378+
oppt = parents(opp)[1]
379379
# @show oppt
380380
isvectorized(oppt) || return parent
381381
else
@@ -570,7 +570,7 @@ function lower_compute!(
570570
else# mask last u₁
571571
:ifelselast # ifelse only the last one
572572
end
573-
if last(instrcall.args) == varsym
573+
if last(instrcall.args) === varsym
574574
pushfirst!(instrcall.args, lv(ifelsefunc))
575575
# showexpr = true
576576
insert!(instrcall.args, 3, MASKSYMBOL)

src/codegen/lower_load.jl

Lines changed: 1 addition & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ function add_prefetches!(q::Expr, ls::LoopSet, op::Operation, td::UnrollArgs, pr
7474
if !isknown(prefetchloop_step)
7575
for i ∈ eachindex(gespinds.args)
7676
if i == prefetchind
77-
gespinds.args[i] = mulexpr(getsym(prefetchloop_step), gespinds.args[i])
77+
gespinds.args[i] = mulexpr(getsym(prefetchloop_step), (gespinds.args[i])::Union{Symbol,Expr})
7878
end
7979
# gespinds.args[i] = Expr(:call, lv(:data), gespinds.args[i])
8080
end
@@ -163,31 +163,6 @@ function lower_load_no_optranslation!(
163163
end
164164
nothing
165165
end
166-
# function lower_load_vectorized!(
167-
# q::Expr, ls::LoopSet, op::Operation, td::UnrollArgs, mask::Union{Nothing,Symbol,Unsigned} = nothing
168-
# )
169-
# @unpack u₁, u₁loopsym, u₂loopsym, vectorized, suffix = td
170-
# loopdeps = loopdependencies(op)
171-
# @assert isvectorized(op)
172-
# opu₁ = isu₁unrolled(op)
173-
# inds_calc_by_ptr_offset = indices_calculated_by_pointer_offsets(ls, op.ref)
174-
# if opu₁
175-
# umin = 0
176-
# U = u₁
177-
# else
178-
# umin = -1
179-
# U = 0
180-
# end
181-
# # Urange = unrolled ∈ loopdeps ? 0:U-1 : 0
182-
# var = variable_name(op, suffix)
183-
# for u ∈ umin:U-1
184-
# td = UnrollArgs(td, u)
185-
# pushvectorload!(q, op, var, td, U, vectorized, mask, opu₁, inds_calc_by_ptr_offset, reg_size(ls))
186-
# end
187-
# prefetchind = prefetchisagoodidea(ls, op, td)
188-
# iszero(prefetchind) || add_prefetches!(q, ls, op, td, prefetchind, umin)
189-
# nothing
190-
# end
191166
function indisvectorized(ls::LoopSet, ind::Symbol)
192167
for op ∈ operations(ls)
193168
((op.variable === ind) && isvectorized(op)) && return true
@@ -217,7 +192,6 @@ function lower_load_for_optranslation!(
217192
# abs of steps are equal
218193
equal_steps = (step₁ == step₂) ⊻ (posindicator ≠ 0x03)
219194
# @show step₁, step₂, posindicator, equal_steps
220-
# _td = UnrollArgs(u₁loop, u₂loop, vloop, total_unroll, u₂max, Core.ifelse(equal_steps, 0, u₂max - 1))
221195
_td = UnrollArgs(u₁loop, u₂loop, vloop, u₁, u₂max, Core.ifelse(equal_steps, 0, u₂max - 1))
222196
gespinds = mem_offset(op, _td, inds_by_ptroff, false, ls)
223197
ptr = vptr(op)

src/codegen/lower_memory_common.jl

Lines changed: 41 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -40,22 +40,36 @@ function addoffset!(ret::Expr, stride, ind) # 3 args
4040
end
4141
# dropping `calcbypointeroffset` has to be delayed until after multiplying the `indexstride` by `index.
4242
function addoffset!(ret::Expr, stride, ind, offset, calcbypointeroffset::Bool) # 5 -> 3 args
43-
if calcbypointeroffset
44-
addoffset!(ret, stride, offset)
45-
elseif _iszero(offset)
46-
addoffset!(ret, stride, ind)
47-
else
48-
addoffset!(ret, stride, addexpr(ind, offset))
49-
end
43+
if calcbypointeroffset
44+
addoffset!(ret, stride, offset)
45+
elseif _iszero(offset)
46+
addoffset!(ret, stride, ind)
47+
else
48+
addoffset!(ret, stride, addexpr(ind, offset))
49+
end
50+
end
51+
function _addoffset!(ret::Expr, vloopstride, indexstride::Union{Integer,MaybeKnown}, index, offset, calcbypointeroffset::Bool) # 6 -> 5 args
52+
if _isone(indexstride)
53+
addoffset!(ret, vloopstride, index, offset, calcbypointeroffset)
54+
else
55+
__addoffset!(ret, vloopstride, indexstride, index, offset, calcbypointeroffset)
56+
end
5057
end
5158
function _addoffset!(ret::Expr, vloopstride, indexstride, index, offset, calcbypointeroffset::Bool) # 6 -> 5 args
52-
if _isone(indexstride)
53-
addoffset!(ret, vloopstride, index, offset, calcbypointeroffset)
54-
elseif isknown(vloopstride) & isknown(indexstride)
55-
addoffset!(ret, gethint(vloopstride)*gethint(indexstride), index, offset, calcbypointeroffset)
56-
else
57-
addoffset!(ret, mulexpr(vloopstride,indexstride), index, offset, calcbypointeroffset)
58-
end
59+
___addoffset!(ret, vloopstride, indexstride, index, offset, calcbypointeroffset)
60+
end
61+
function __addoffset!(ret::Expr, vloopstride, indexstride, index, offset, calcbypointeroffset::Bool) # 6 -> 5 args
62+
___addoffset!(ret, vloopstride, indexstride, index, offset, calcbypointeroffset)
63+
end
64+
function __addoffset!(ret::Expr, vloopstride::Union{Integer,MaybeKnown}, indexstride::Union{Integer,MaybeKnown}, index, offset, calcbypointeroffset::Bool) # 6 -> 5 args
65+
if isknown(vloopstride) & isknown(indexstride)
66+
addoffset!(ret, gethint(vloopstride)*gethint(indexstride), index, offset, calcbypointeroffset)
67+
else
68+
___addoffset!(ret, vloopstride, indexstride, index, offset, calcbypointeroffset)
69+
end
70+
end
71+
function ___addoffset!(ret::Expr, vloopstride, indexstride, index, offset, calcbypointeroffset::Bool) # 6 -> 5 args
72+
addoffset!(ret, mulexpr(vloopstride,indexstride), index, offset, calcbypointeroffset)
5973
end
6074
# multiply `index` by `indexstride`
6175
function addoffset!(ret::Expr, vloopstride, indexstride, index, offset, calcbypointeroffset::Bool) # 6 -> (5 or 6) args
@@ -400,19 +414,19 @@ function add_memory_mask!(memopexpr::Expr, op::Operation, td::UnrollArgs, mask::
400414
nothing
401415
end
402416

403-
varassignname(var::Symbol, u::Int, isunrolled::Bool) = isunrolled ? Symbol(var, u) : var
404-
# name_memoffset only gets called when vectorized
405-
function name_memoffset(var::Symbol, op::Operation, td::UnrollArgs, u₁unrolled::Bool, inds_calc_by_ptr_offset::Vector{Bool}, ls::LoopSet)
406-
@unpack u₁, u₁loopsym, u₂loopsym, suffix = td
407-
if (suffix == -1) && u₁ < 0 # u₁ == -1 sentinel value meaning not unrolled
408-
name = var
409-
mo = mem_offset(op, td, inds_calc_by_ptr_offset, true, 0, ls)
410-
else
411-
name = u₁unrolled ? Symbol(var, u₁) : var
412-
mo = mem_offset_u(op, td, inds_calc_by_ptr_offset, true, 0, ls)
413-
end
414-
name, mo
415-
end
417+
# varassignname(var::Symbol, u::Int, isunrolled::Bool) = isunrolled ? Symbol(var, u) : var
418+
# # name_memoffset only gets called when vectorized
419+
# function name_memoffset(var::Symbol, op::Operation, td::UnrollArgs, u₁unrolled::Bool, inds_calc_by_ptr_offset::Vector{Bool}, ls::LoopSet)
420+
# @unpack u₁, u₁loopsym, u₂loopsym, suffix = td
421+
# if (suffix == -1) && u₁ < 0 # u₁ == -1 sentinel value meaning not unrolled
422+
# name = var
423+
# mo = mem_offset(op, td, inds_calc_by_ptr_offset, true, 0, ls)
424+
# else
425+
# name = u₁unrolled ? Symbol(var, u₁) : var
426+
# mo = mem_offset_u(op, td, inds_calc_by_ptr_offset, true, 0, ls)
427+
# end
428+
# name, mo
429+
# end
416430

417431
function condvarname_and_unroll(cond::Operation, u₁loop::Symbol, u₂loop::Symbol, vloop::Symbol, suffix::Int, opu₂::Bool, ls::LoopSet)
418432
condvar, condu₁, condu₂ = variable_name_and_unrolled(cond, u₁loop, u₂loop, vloop, Core.ifelse(opu₂, suffix, -1), ls)

src/codegen/lower_store.jl

Lines changed: 83 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,18 @@
1+
function opisreduced(op::Operation)
2+
for rdep ∈ reduceddependencies(op)
3+
rdep ∈ loopdependencies(op) && return true
4+
end
5+
false
6+
end
17
function storeinstr_preprend(op::Operation, vloopsym::Symbol)
28
# defaultstoreop = :vstore!
39
# defaultstoreop = :vnoaliasstore!
410
isvectorized(op) && return Symbol("")
511
vloopsym ∉ reduceddependencies(op) && return Symbol("")
612
# vectorized is not a loopdep, but is a reduced dep
7-
opp = first(parents(op))
13+
opp::Operation = first(parents(op))
814
# while vectorized ∉ loopdependencies(opp)
9-
while ((!isvectorized(opp)) || (any(rdep -> rdep ∈ loopdependencies(opp), reduceddependencies(opp))))
15+
while ((!isvectorized(opp)) || opisreduced(opp))
1016
oppold = opp
1117
for oppp ∈ parents(opp)
1218
if vloopsym ∈ reduceddependencies(oppp)
@@ -145,88 +151,92 @@ function lower_store_collection!(
145151
end
146152
gf(s::Symbol, n::Int) = Expr(:call, GlobalRef(Core,:getfield), s, n, false)
147153
function lower_store!(
148-
q::Expr, ls::LoopSet, op::Operation, ua::UnrollArgs, mask::Bool,
149-
reductfunc::Symbol = storeinstr_preprend(op, ua.vloop.itersymbol), inds_calc_by_ptr_offset = indices_calculated_by_pointer_offsets(ls, op.ref)
150-
)
151-
@unpack u₁, u₁loopsym, u₂loopsym, vloopsym, vloop, u₂max, suffix = ua
152-
omop = offsetloadcollection(ls)
153-
batchid, opind = omop.batchedcollectionmap[identifier(op)]
154-
if ((batchid ≠ 0) && isvectorized(op)) && (!rejectinterleave(op))
155-
(opind == 1) && lower_store_collection!(q, ls, op, ua, mask, inds_calc_by_ptr_offset)
156-
return
157-
end
158-
falseexpr = Expr(:call, lv(:False));
159-
aliasexpr = falseexpr;
160-
# trueexpr = Expr(:call, lv(:True));
161-
rs = staticexpr(reg_size(ls));
162-
opp = first(parents(op))
163-
if (((opp.instruction.instr === reductfunc) || (opp.instruction.instr === :identity)) && isone(length(parents(opp))))
164-
oppp = only(parents(opp))
165-
if isu₂unrolled(op) == isu₂unrolled(oppp)
166-
opp = oppp
167-
end
154+
q::Expr, ls::LoopSet, op::Operation, ua::UnrollArgs, mask::Bool,
155+
reductfunc::Symbol = storeinstr_preprend(op, ua.vloop.itersymbol), inds_calc_by_ptr_offset = indices_calculated_by_pointer_offsets(ls, op.ref)
156+
)
157+
@unpack u₁, u₁loopsym, u₂loopsym, vloopsym, vloop, u₂max, suffix = ua
158+
omop = offsetloadcollection(ls)
159+
batchid, opind = omop.batchedcollectionmap[identifier(op)]
160+
if ((batchid ≠ 0) && isvectorized(op)) && (!rejectinterleave(op))
161+
(opind == 1) && lower_store_collection!(q, ls, op, ua, mask, inds_calc_by_ptr_offset)
162+
return
163+
end
164+
falseexpr = Expr(:call, lv(:False));
165+
aliasexpr = falseexpr;
166+
# trueexpr = Expr(:call, lv(:True));
167+
rs = staticexpr(reg_size(ls));
168+
opp = first(parents(op))
169+
if ((opp.instruction.instr === reductfunc) || (opp.instruction.instr === :identity))
170+
parents_opp = parents(opp)
171+
opppstate = Base.iterate(parents_opp)
172+
if opppstate ≢ nothing
173+
oppp, state = opppstate
174+
if (Base.iterate(parents_opp, state) === nothing) && isu₂unrolled(op) == isu₂unrolled(oppp)
175+
opp = oppp
176+
end
168177
end
178+
end
169179
# __u₂max = ls.unrollspecification.u₂
170-
isu₁, isu₂ = isunrolled_sym(opp, u₁loopsym, u₂loopsym, vloopsym, ls)#, __u₂max)
171-
# @show isu₁, isu₂, u₁loopsym, u₂loopsym
172-
# @show isu₁, isu₂, opp, u₁loopsym, u₂loopsym, vloopsym
173-
u = isu₁ ? u₁ : 1
174-
mvar = Symbol(variable_name(opp, ifelse(isu₂, suffix, -1)), '_', u)
175-
if all(op.ref.loopedindex)
176-
inds = unrolledindex(op, ua, mask, inds_calc_by_ptr_offset, ls)
177-
storeexpr = if reductfunc === Symbol("")
178-
Expr(:call, lv(:_vstore!), sptr(op), mvar, inds)
179-
else
180-
Expr(:call, lv(:_vstore!), lv(reductfunc), sptr(op), mvar, inds)
181-
end
182-
add_memory_mask!(storeexpr, op, ua, mask, ls)
183-
push!(storeexpr.args, falseexpr, aliasexpr, falseexpr, rs)
184-
push!(q.args, storeexpr)
180+
isu₁, isu₂ = isunrolled_sym(opp, u₁loopsym, u₂loopsym, vloopsym, ls)#, __u₂max)
181+
# @show isu₁, isu₂, u₁loopsym, u₂loopsym
182+
# @show isu₁, isu₂, opp, u₁loopsym, u₂loopsym, vloopsym
183+
u = isu₁ ? u₁ : 1
184+
mvar = Symbol(variable_name(opp, ifelse(isu₂, suffix, -1)), '_', u)
185+
if all(op.ref.loopedindex)
186+
inds = unrolledindex(op, ua, mask, inds_calc_by_ptr_offset, ls)
187+
storeexpr = if reductfunc === Symbol("")
188+
Expr(:call, lv(:_vstore!), sptr(op), mvar, inds)
185189
else
186-
parents_op = parents(op)
187-
data_u₁ = isu₁ & (u₁ > 1)
188-
189-
indices_u₁ = data_u₁
190-
if !data_u₁ & (length(parents_op) > 1)
191-
indices_u₁ = first(isunrolled_sym(op, u₁loopsym, u₂loopsym, vloopsym, ls))
192-
end
193-
if indices_u₁
194-
mvard = Symbol(mvar, "##data##")
195-
# isu₁ &&
196-
data_u₁ && push!(q.args, Expr(:(=), mvard, Expr(:call, lv(:data), mvar)))
197-
sptrsym = sptr!(q, op)
198-
for u ∈ 1:u₁
199-
inds = mem_offset_u(op, ua, inds_calc_by_ptr_offset, true, u-1, ls)
200-
# @show isu₁unrolled(opp), opp
201-
storeexpr = if data_u₁
202-
if reductfunc === Symbol("")
203-
Expr(:call, lv(:_vstore!), sptrsym, gf(mvard,u), inds)
204-
else
205-
Expr(:call, lv(:_vstore!), lv(reductfunc), sptrsym, mvaru, inds)
206-
end
207-
elseif reductfunc === Symbol("")
208-
Expr(:call, lv(:_vstore!), sptrsym, mvar, inds)
190+
Expr(:call, lv(:_vstore!), lv(reductfunc), sptr(op), mvar, inds)
191+
end
192+
add_memory_mask!(storeexpr, op, ua, mask, ls)
193+
push!(storeexpr.args, falseexpr, aliasexpr, falseexpr, rs)
194+
push!(q.args, storeexpr)
195+
else
196+
parents_op = parents(op)
197+
data_u₁ = isu₁ & (u₁ > 1)
198+
199+
indices_u₁ = data_u₁
200+
if !data_u₁ & (length(parents_op) > 1)
201+
indices_u₁ = first(isunrolled_sym(op, u₁loopsym, u₂loopsym, vloopsym, ls))
202+
end
203+
if indices_u₁
204+
mvard = Symbol(mvar, "##data##")
205+
# isu₁ &&
206+
data_u₁ && push!(q.args, Expr(:(=), mvard, Expr(:call, lv(:data), mvar)))
207+
sptrsym = sptr!(q, op)
208+
for u ∈ 1:u₁
209+
inds = mem_offset_u(op, ua, inds_calc_by_ptr_offset, true, u-1, ls)
210+
# @show isu₁unrolled(opp), opp
211+
storeexpr = if data_u₁
212+
if reductfunc === Symbol("")
213+
Expr(:call, lv(:_vstore!), sptrsym, gf(mvard,u), inds)
209214
else
210-
Expr(:call, lv(:_vstore!), lv(reductfunc), sptrsym, mvar, inds)
215+
Expr(:call, lv(:_vstore!), lv(reductfunc), sptrsym, mvaru, inds)
211216
end
212-
domask = mask && (isvectorized(op) & ((u == u₁) | (vloopsym !== u₁loopsym)))
213-
add_memory_mask!(storeexpr, op, ua, domask, ls)# & ((u == u₁) | isvectorized(op)))
214-
push!(storeexpr.args, falseexpr, aliasexpr, falseexpr, rs)
215-
push!(q.args, storeexpr)
216-
end
217-
else
218-
inds = mem_offset_u(op, ua, inds_calc_by_ptr_offset, true, 0, ls)
219-
storeexpr = if reductfunc === Symbol("")
220-
Expr(:call, lv(:_vstore!), sptr(op), mvar, inds)
217+
elseif reductfunc === Symbol("")
218+
Expr(:call, lv(:_vstore!), sptrsym, mvar, inds)
221219
else
222-
Expr(:call, lv(:_vstore!), lv(reductfunc), sptr(op), mvar, inds)
220+
Expr(:call, lv(:_vstore!), lv(reductfunc), sptrsym, mvar, inds)
223221
end
224-
add_memory_mask!(storeexpr, op, ua, mask, ls)
222+
domask = mask && (isvectorized(op) & ((u == u₁) | (vloopsym !== u₁loopsym)))
223+
add_memory_mask!(storeexpr, op, ua, domask, ls)# & ((u == u₁) | isvectorized(op)))
225224
push!(storeexpr.args, falseexpr, aliasexpr, falseexpr, rs)
226225
push!(q.args, storeexpr)
227226
end
227+
else
228+
inds = mem_offset_u(op, ua, inds_calc_by_ptr_offset, true, 0, ls)
229+
storeexpr = if reductfunc === Symbol("")
230+
Expr(:call, lv(:_vstore!), sptr(op), mvar, inds)
231+
else
232+
Expr(:call, lv(:_vstore!), lv(reductfunc), sptr(op), mvar, inds)
233+
end
234+
add_memory_mask!(storeexpr, op, ua, mask, ls)
235+
push!(storeexpr.args, falseexpr, aliasexpr, falseexpr, rs)
236+
push!(q.args, storeexpr)
228237
end
229-
nothing
238+
end
239+
nothing
230240
end
231241

232242
function lower_tiled_store!(

src/codegen/lowering.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -677,7 +677,7 @@ function determine_eltype(ls::LoopSet, ortypdefined::Bool)::Union{Symbol,Expr}
677677
if narrays == 1
678678
return Expr(:call, lv(:eltype), first(ls.includedactualarrays))
679679
else
680-
oreducop = ls.operations[only(ls.outer_reductions)]
680+
oreducop = ls.operations[ls.outer_reductions[1]]
681681
if ortypdefined
682682
return typeof_expr(oreducop)
683683
else

src/codegen/operation_evaluation_order.jl

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -25,24 +25,28 @@
2525

2626
function isnopidentity(ls::LoopSet, op::Operation, u₁loop::Symbol, u₂loop::Symbol, vectorized::Symbol, u₂max::Int)
2727
parents_op = parents(op)
28-
if iscompute(op) && instruction(op).instr === :identity && isone(length(parents_op)) && name(first(parents_op)) === name(op)
29-
# loopistiled = u₂max ≠ -1
30-
# parents_u₁syms, parents_u₂syms = parent_unroll_status(op, u₁loop, u₂loop, u₂max)
31-
# if (u₁unrolledsym == first(parents_u₁syms)) && (isu₂unrolled(op) == parents_u₂syms[1])
32-
opp = only(parents_op)
33-
# @show op opp isu₁unrolled(op), isu₁unrolled(opp), isu₂unrolled(op), isu₂unrolled(opp)
34-
if (isu₁unrolled(op) == isu₁unrolled(opp)) & (isu₂unrolled(op) == isu₂unrolled(opp))
35-
true
36-
else
37-
# if isvectorized(opp) & (!isvectorized(op))
38-
# op.instruction = reduction_to_scalar(instruction(opp))
39-
# op.mangledvariable = gensym(op.mangledvariable)
40-
# end
41-
false
42-
end
28+
if iscompute(op) && instruction(op).instr === :identity
29+
# loopistiled = u₂max ≠ -1
30+
# parents_u₁syms, parents_u₂syms = parent_unroll_status(op, u₁loop, u₂loop, u₂max)
31+
# if (u₁unrolledsym == first(parents_u₁syms)) && (isu₂unrolled(op) == parents_u₂syms[1])
32+
oppstate = Base.iterate(parents_op)
33+
oppstate === nothing && return false
34+
opp, state = oppstate
35+
Base.iterate(parents_op, state) === nothing || return false
36+
name(opp) === name(op) || return false
37+
# @show op opp isu₁unrolled(op), isu₁unrolled(opp), isu₂unrolled(op), isu₂unrolled(opp)
38+
if (isu₁unrolled(op) == isu₁unrolled(opp)) & (isu₂unrolled(op) == isu₂unrolled(opp))
39+
true
4340
else
44-
false
41+
# if isvectorized(opp) & (!isvectorized(op))
42+
# op.instruction = reduction_to_scalar(instruction(opp))
43+
# op.mangledvariable = gensym(op.mangledvariable)
44+
# end
45+
false
4546
end
47+
else
48+
false
49+
end
4650
end
4751

4852
function set_upstream_family!(adal::Vector{T}, op::Operation, val::T, ld::Vector{Symbol}, id::Int) where {T}

0 commit comments

Comments
 (0)