Skip to content

Commit 23f5d56

Browse files
committed
Add support for storing loop induction variables directly, fixes #392.
1 parent 1fdc689 commit 23f5d56

File tree

6 files changed

+37
-30
lines changed

6 files changed

+37
-30
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.12.104"
4+
version = "0.12.105"
55

66
[deps]
77
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"

src/codegen/loopstartstopmanager.jl

Lines changed: 5 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,6 @@ function indices_calculated_by_pointer_offsets(ls::LoopSet, ar::ArrayReferenceMe
9292
ls.isbroadcast && return fill(false, length(indices))
9393
looporder = names(ls)
9494
offset = isdiscontiguous(ar)
95-
gespinds = Expr(:tuple)
9695
out = Vector{Bool}(undef, length(indices))
9796
strds = getstrides(ar)
9897
li = ar.loopedindex
@@ -310,7 +309,7 @@ function isloopvalue(
310309
if (isrooted ≢ nothing)
311310
isrooted[i] || continue
312311
end
313-
iscompute(op) || continue
312+
(iscompute(op) | isstore(op)) || continue
314313
for opp ∈ parents(op)# this is to confirm `ind` still has children
315314
# (isloopvalue(opp) && instruction(opp).instr === ind) && return true
316315
if (isloopvalue(opp) && instruction(opp).instr === ind)
@@ -331,10 +330,8 @@ function cse_constant_offsets!(
331330
# vptrar = vptr(ar)
332331
arrayref_to_name_op = arrayref_to_name_op_collection[allarrayrefsind]
333332
array_refs_with_same_name = name_to_array_map[first(first(arrayref_to_name_op))]
334-
us = ls.unrollspecification
335333
li = ar.loopedindex
336334
indices = getindices(ar)
337-
strides = getstrides(ar)
338335
offset = first(indices) === DISCONTIGUOUS
339336
# gespindoffsets = fill(Symbol(""), length(li))
340337
gespindsummary = Vector{Symbol}(undef, length(li))
@@ -814,7 +811,6 @@ function use_loop_induct_var!(
814811
q::Expr,
815812
ar::ArrayReferenceMeta,
816813
allarrayrefs::Vector{ArrayReferenceMeta},
817-
allarrayrefsind::Int,
818814
includeinlet::Bool,
819815
# array_refs_with_same_name::Vector{Int}, arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}}
820816
)::Vector{Int}
@@ -868,7 +864,7 @@ function use_loop_induct_var!(
868864
Wisz || pushgespind!(gespinds, ls, Symbol(""), 0, 1, ind, isli, true, false)
869865
else
870866
uliv[i] = findfirst(Base.Fix2(===, ind), looporder)::Int
871-
loop = getloop(ls, ind)
867+
# loop = getloop(ls, ind)
872868
push!(offsetprecalc_descript.args, max(5, us.u₁ + 1, us.u₂ + 1))
873869
use_offsetprecalc = true
874870
Wisz || pushgespind!(gespinds, ls, Symbol(""), 0, 1, ind, isli, false, false)
@@ -904,7 +900,6 @@ end
904900
# Plan here is that we increment every unique array
905901
function add_loop_start_stop_manager!(ls::LoopSet)
906902
q = Expr(:block)
907-
us = ls.unrollspecification
908903
# Presence of an explicit use of a loop induction var means we should use that, so we look for one
909904
# TODO: replace first with only once you add Compat as a dep or drop support for older Julia versions
910905
loopinductvars = Symbol[]
@@ -917,7 +912,7 @@ function add_loop_start_stop_manager!(ls::LoopSet)
917912
use_livs = Vector{Vector{Int}}(undef, length(arrayrefs))
918913
# for i ∈ eachindex(name_to_array_map)
919914
for i ∈ eachindex(arrayrefs)
920-
use_livs[i] = use_loop_induct_var!(ls, q, arrayrefs[i], arrayrefs, i, includeinlet[i])
915+
use_livs[i] = use_loop_induct_var!(ls, q, arrayrefs[i], arrayrefs, includeinlet[i])
921916
#name_to_array_map[first(first(unique_to_name_and_op_map[i]))], unique_to_name_and_op_map)
922917
end
923918
# loops, sorted from outer-most to inner-most
@@ -934,11 +929,9 @@ function add_loop_start_stop_manager!(ls::LoopSet)
934929
reached_indices = zeros(Int, length(arrayrefs))
935930
for (i, loopsym) ∈ enumerate(looporder) # iterates from outer to inner
936931
loopstartᵢ = ArrayReferenceMeta[]
937-
arⱼ = 0
938-
minrem = typemax(Int)
939932
ric = Tuple{Int,Int}[]
940933
for j ∈ eachindex(use_livs) # j is array ref number
941-
for (l, k) ∈ enumerate(use_livs[j])# l is index number, k is loop number
934+
for k ∈ use_livs[j]# l is index number, k is loop number
942935
if k == i
943936
push!(loopstartᵢ, arrayrefs[j])
944937
push!(ric, ((reached_indices[j] += 1), length(loopstartᵢ)))
@@ -1006,7 +999,6 @@ function pointermax_index(
1006999
# @unpack u₁loopnum, u₂loopnum, vloopnum, u₁, u₂ = us
10071000
loopsym = names(ls)[n]
10081001
index = Expr(:tuple)
1009-
found_loop_sym = false
10101002
ind = 0
10111003
for (j, i) ∈ enumerate(getindicesonly(ar))
10121004
if i === loopsym
@@ -1227,7 +1219,6 @@ function startloop(ls::LoopSet, us::UnrollSpecification, n::Int, staticinit::Boo
12271219
termind = lssm.terminators[n]
12281220
ptrdefs = lssm.incrementedptrs[n]
12291221
loopstart = Expr(:block)
1230-
firstloop = n == num_loops(ls)
12311222
for ar ∈ ptrdefs
12321223
ptr_offset = vptr_offset(ar)
12331224
push!(loopstart.args, Expr(:(=), ptr_offset, ptr_offset))
@@ -1288,7 +1279,7 @@ function incrementloopcounter!(
12881279
if iszero(termind) # increment liv
12891280
push!(q.args, incrementloopcounter(us, n, loopsym, UF, loop))
12901281
end
1291-
for (j, ar) ∈ enumerate(ptrdefs)
1282+
for ar ∈ ptrdefs
12921283
offsetinds = indices_calculated_by_pointer_offsets(ls, ar)
12931284
push!(q.args, offset_ptr(ar, us, loopsym, n, UF, offsetinds, loop))
12941285
end

src/codegen/lower_store.jl

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,8 @@ function lower_store_collection!(
9494
inds_calc_by_ptr_offset::Vector{Bool},
9595
)
9696
omop = offsetloadcollection(ls)
97-
batchid, bopind = omop.batchedcollectionmap[identifier(op)]
98-
collectionid, copind = omop.opidcollectionmap[identifier(op)]
97+
batchid, _ = omop.batchedcollectionmap[identifier(op)]
98+
collectionid, __ = omop.opidcollectionmap[identifier(op)]
9999
opidmap = offsetloadcollection(ls).opids[collectionid]
100100
idsformap = omop.batchedcollections[batchid]
101101

@@ -105,15 +105,18 @@ function lower_store_collection!(
105105
nouter = length(idsformap)
106106

107107
t = Expr(:tuple)
108-
for (i, (opid, _)) ∈ enumerate(idsformap)
108+
for (opid, _) ∈ idsformap
109109
opp = first(parents(ops[opidmap[opid]]))
110-
110+
111111
isu₁, isu₂ = isunrolled_sym(opp, u₁loopsym, u₂loopsym, vloopsym, ls)#, __u₂max)
112112
u = Core.ifelse(isu₁, u₁, 1)
113-
mvar = Symbol(variable_name(opp, ifelse(isu₂, suffix, -1)), '_', u)
114-
# mvar = Symbol(variable_name(_op, suffix), '_', u)
115-
# mvar = Symbol(variable_name(_op, ifelse(isu₂, suffix, -1)), '_', u)
116-
push!(t.args, mvar)
113+
if isloopvalue(opp)
114+
loopval = first(loopdependencies(opp))
115+
add_loopvalue!(t, loopval, ua, u₁, getloop(ls, loopval))
116+
else
117+
mvar = Symbol(variable_name(opp, ifelse(isu₂, suffix, -1)), '_', u)
118+
push!(t.args, mvar)
119+
end
117120
end
118121
offset_dummy_loop = Loop(
119122
first(getindices(op)),
@@ -149,7 +152,6 @@ function lower_store_collection!(
149152
false
150153
end
151154
uinds = Expr(:call, unrollcurl₂, inds)
152-
vp = vptr(op)
153155
sptrsym = sptr!(q, op)
154156
storeexpr = Expr(:call, lv(:_vstore!), sptrsym, Expr(:call, lv(:VecUnroll), t), uinds)
155157
# not using `add_memory_mask!(storeexpr, op, ua, mask, ls)` because we checked `isconditionalmemop` earlier in `lower_load_collection!`
@@ -235,6 +237,12 @@ function lower_store!(
235237
isu₁, isu₂ = isunrolled_sym(opp, u₁loopsym, u₂loopsym, vloopsym, ls)#, __u₂max)
236238
u = isu₁ ? u₁ : 1
237239
mvar = Symbol(variable_name(opp, ifelse(isu₂, suffix, -1)), '_', u)
240+
if isloopvalue(opp)
241+
def = Expr(:(=), mvar)
242+
loopval = first(loopdependencies(opp))
243+
add_loopvalue!(def, loopval, ua, u₁, getloop(ls, loopval))
244+
push!(q.args, def)
245+
end
238246
if all(op.ref.loopedindex)
239247
inds = unrolledindex(op, ua, mask, inds_calc_by_ptr_offset, ls)
240248
storeexpr = if reductfunc === Symbol("")
@@ -312,7 +320,6 @@ end
312320
function donot_tile_store(
313321
ls::LoopSet,
314322
op::Operation,
315-
vloop::Loop,
316323
reductfunc::Symbol,
317324
u₂::Int,
318325
)
@@ -323,7 +330,7 @@ function donot_tile_store(
323330
) && return true
324331
rejectcurly(op) && return true
325332
omop = offsetloadcollection(ls)
326-
batchid, opind = omop.batchedcollectionmap[identifier(op)]
333+
batchid, _ = omop.batchedcollectionmap[identifier(op)]
327334
return ((batchid ≠ 0) && isvectorized(op)) && (!rejectinterleave(op))
328335
end
329336

@@ -344,7 +351,7 @@ function lower_tiled_store!(
344351
reductfunc = storeinstr_preprend(op, vloopsym)
345352
inds_calc_by_ptr_offset = indices_calculated_by_pointer_offsets(ls, op.ref)
346353

347-
if donot_tile_store(ls, op, vloop, reductfunc, u₂)
354+
if donot_tile_store(ls, op, reductfunc, u₂)
348355
# If we have a reductfunc, we're using a reducing store instead of a contiguous or shuffle store anyway
349356
# so no benefit to being able to handle that case here, vs just calling the default `lower_store!` method
350357
@unpack u₁, u₂max = ua
@@ -368,6 +375,8 @@ function lower_tiled_store!(
368375
u = Core.ifelse(isu₁, u₁, 1)
369376
tup = Expr(:tuple)
370377
for t ∈ 0:u₂-1
378+
# tiled stores cannot be loop values, as they're necessarily
379+
# functions of at least two loops, meaning we do not need to handle them here.
371380
push!(tup.args, Symbol(variable_name(opp, ifelse(isu₂, t, -1)), '_', u))
372381
end
373382
vut = Expr(:call, lv(:VecUnroll), tup) # `VecUnroll` of `VecUnroll`s

src/condense_loopset.jl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -781,7 +781,6 @@ function generate_call_types(
781781
arrayref_descriptions = Expr(:tuple)
782782
duplicate_ref = fill(false, length(ls.refs_aliasing_syms))
783783
for (j, ref) ∈ enumerate(ls.refs_aliasing_syms)
784-
vpref = vptr(ref)
785784
# duplicate_ref[j] ≠ 0 && continue
786785
duplicate_ref[j] && continue
787786
push!(arrayref_descriptions.args, ArrayRefStruct(ls, ref, arraysymbolinds, ids))

src/reconstruct_loopset.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -980,7 +980,7 @@ Execute an `@turbo` block. The block's code is represented via the arguments:
980980
),
981981
)
982982
post = hoist_constant_memory_accesses!(ls)
983-
# return @show avx_body(ls, var"#UNROLL#")
983+
# q = @show(avx_body(ls, var"#UNROLL#")); post === ls.preamble ? q : Expr(:block, q, post)
984984
q = if (last(var"#UNROLL#") > 1) && length(var"#LPSYM#") == length(ls.loops)
985985
inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, l1, l2, l3, nt = var"#UNROLL#"
986986
# wrap in `var"#OPS#", var"#ARF#", var"#AM#", var"#LPSYM#"` in `Expr` to homogenize types

test/copy.jl

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,13 @@ using LoopVectorization, OffsetArrays, Test
183183
dest[i] = src[p, 3*p]
184184
end
185185
end
186-
186+
function collect_turbo(N, ::Type{T}) where {T}
187+
l = Vector{T}(undef, N)
188+
@turbo for i ∈ 1:length(l)
189+
l[i] = i
190+
end
191+
return l
192+
end
187193
for T ∈ (Float32, Float64, Int32, Int64)
188194
@show T, @__LINE__
189195
R = T <: Integer ? (-T(100):T(100)) : T
@@ -320,5 +326,7 @@ using LoopVectorization, OffsetArrays, Test
320326
end
321327
@. arr2[rng-ifirst] += 1
322328
@test arr1 == arr2
329+
330+
@test collect_turbo(77, T) == T.(1:77) # `collect_turbo(N, ::Type{T})` requires the element type as its second argument
323331
end
324332
end

0 commit comments

Comments
 (0)