Skip to content

Commit c96464e

Browse files
committed
Don't use negative multiplier strides for comparing loop bounds, fixes #350.
1 parent 5bccb7a commit c96464e

File tree

6 files changed

+45
-12
lines changed

6 files changed

+45
-12
lines changed

Project.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.12.91"
4+
version = "0.12.92"
55

66
[deps]
77
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
@@ -38,5 +38,5 @@ SLEEFPirates = "0.6.23"
3838
Static = "0.3.3, 0.4"
3939
ThreadingUtilities = "0.4.5"
4040
UnPack = "1"
41-
VectorizationBase = "0.21.19"
41+
VectorizationBase = "0.21.20"
4242
julia = "1.5"

src/codegen/loopstartstopmanager.jl

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,11 +61,14 @@ function uniquearrayrefs(ls::LoopSet)
6161
uniquerefs, includeinlet
6262
end
6363

64-
otherindexunrolled(loopsym::Symbol, ind::Symbol, loopdeps::Vector{Symbol}) = (loopsym !== ind) && (loopsym ∈ loopdeps)
64+
otherindexunrolled(loopsym::Symbol, ind::Symbol, loopdeps::Vector{Symbol}) = ((loopsym ≢ ind) & (loopsym ≢ Symbol("##undefined##"))) && (loopsym ∈ loopdeps)
6565
function otherindexunrolled(ls::LoopSet, ind::Symbol, ref::ArrayReferenceMeta)
6666
us = ls.unrollspecification
67-
u₁sym = names(ls)[us.u₁loopnum]
68-
u₂sym = us.u₂loopnum > 0 ? names(ls)[us.u₂loopnum] : Symbol("##undefined##")
67+
@unpack u₁loopnum, u₂loopnum, u₁, u₂ = us
68+
u₁sym = u₁ > 1 ? names(ls)[u₁loopnum] : Symbol("##undefined##")
69+
u₂sym = ((u₂ > 1) & (u₂loopnum > 0)) ? names(ls)[u₂loopnum] : Symbol("##undefined##")
70+
# u₁sym = names(ls)[u₁loopnum]
71+
# u₂sym = ((u₂loopnum > 0)) ? names(ls)[u₂loopnum] : Symbol("##undefined##")
6972
otherindexunrolled(u₁sym, ind, loopdependencies(ref)) || otherindexunrolled(u₂sym, ind, loopdependencies(ref))
7073
end
7174
function multiple_with_name(n::Symbol, v::Vector{ArrayReferenceMeta})
@@ -86,19 +89,21 @@ function indices_calculated_by_pointer_offsets(ls::LoopSet, ar::ArrayReferenceMe
8689
offset = isdiscontiguous(ar)
8790
gespinds = Expr(:tuple)
8891
out = Vector{Bool}(undef, length(indices))
92+
strds = getstrides(ar)
8993
li = ar.loopedindex
9094
for i ∈ eachindex(li)
9195
ii = i + offset
9296
ind = indices[ii]
9397
if (!li[i]) || (ind === CONSTANTZEROINDEX) || multiple_with_name(vptr(ar), ls.lssm.uniquearrayrefs) ||
94-
(iszero(ls.vector_width) && isstaticloop(getloop(ls, ind)))# ||
98+
(iszero(ls.vector_width) && isstaticloop(getloop(ls, ind))) || (strds[i] ≤ 0)
9599
out[i] = false
96100
elseif (isone(ii) && (first(looporder) === ind))
97101
out[i] = otherindexunrolled(ls, ind, ar)
98102
else
99103
out[i] = true
100104
end
101105
end
106+
# @show ar out
102107
out
103108
end
104109

@@ -677,6 +682,7 @@ function use_loop_induct_var!(
677682
looporder = reversenames(ls)
678683
uliv = Vector{Int}(undef, length(li))
679684
indices = getindices(ar)
685+
strds = getstrides(ar)
680686
offset = first(indices) === DISCONTIGUOUS
681687
if length(indices) != offset + length(li)
682688
println(ar)
@@ -711,7 +717,7 @@ function use_loop_induct_var!(
711717
elseif isbroadcast ||
712718
((isone(ii) && (last(looporder) === ind)) && !(otherindexunrolled(ls, ind, ar)) ||
713719
multiple_with_name(vptrar, allarrayrefs)) ||
714-
(iszero(ls.vector_width) && isstaticloop(getloop(ls, ind)))# ||
720+
(iszero(ls.vector_width) && isstaticloop(getloop(ls, ind))) || (strds[i] ≤ 0)
715721
# Not doing normal offset indexing
716722
uliv[i] = -findfirst(Base.Fix2(===,ind), looporder)::Int
717723
push!(offsetprecalc_descript.args, 0) # not doing offset indexing, so push 0

src/codegen/lowering.jl

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -614,10 +614,12 @@ end
614614
## This performs reduction to one `Vec`
615615
function reduce_expr!(q::Expr, ls::LoopSet, U::Int)
616616
us = ls.unrollspecification
617-
u₁f, u₂f = if us.u₂ == -1
618-
ifelse(U == -1, us.u₁, U), -1
617+
if us.u₂ == -1
618+
u₁f = ifelse(U == -1, us.u₁, U)
619+
u₂f = -1
619620
else
620-
us.u₁, U
621+
u₁f = us.u₁
622+
u₂f = U
621623
end
622624
# u₁loop, u₂loop = getunrolled(ls)
623625
u₁loop = getloop(ls, us.u₁loopnum).itersymbol

src/codegen/split_loops.jl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,8 @@ function lower_and_split_loops(ls::LoopSet, inline::Int)
109109
order_fused, unrolled_fused, tiled_fused, vectorized_fused, U_fused, T_fused, cost_fused, shouldinline_fused = choose_order_cost(ls)
110110
remaining_ops = Vector{Int}(undef, length(split_candidates) - 1); split_1 = Int[0];
111111
# for (ind,i) ∈ enumerate(split_candidates)
112+
looplenpen = 0.05
113+
ls_looplen = looplengthprod(ls)*looplenpen
112114
for (ind,i) ∈ enumerate(split_candidates)
113115
split_1[1] = i
114116
ls_1 = split_loopset(ls, split_1, false)
@@ -118,7 +120,7 @@ function lower_and_split_loops(ls::LoopSet, inline::Int)
118120
order_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2, cost_2, shouldinline_2 = choose_order_cost(ls_2)
119121
# U_1 = T_1 = U_2 = T_2 = 2
120122
# return ls_1, ls_2
121-
if cost_1 + cost_2 ≤ 0.9cost_fused
123+
if cost_1 + cost_2 + looplenpen*(looplengthprod(ls_1) + looplengthprod(ls_2)) ≤ muladd(0.9, cost_fused, ls_looplen)
122124
ls_2_lowered = if length(remaining_ops) > 1
123125
inline = iszero(inline) ? (shouldinline_1 % Int) : inline
124126
lower_and_split_loops(ls_2, inline)

src/precompile.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,4 +263,6 @@ function _precompile_()
263263
Base.precompile(Tuple{typeof(choose_order),LoopSet}) # time: 0.001016861
264264
Base.precompile(Tuple{typeof(capture_a_muladd),Expr,Nothing}) # time: 0.001010088
265265
Base.precompile(Tuple{typeof(canonicalize_range),CartesianIndices{4, NTuple{4, Base.OneTo{Int}}}}) # time: 0.001000169
266+
267+
Base.precompile(reduce_expr!, (Expr, LoopSet, Int))
266268
end

test/shuffleloadstores.jl

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -380,7 +380,26 @@ function issue348_v1!(hi, lo)
380380
end
381381
end
382382
end
383-
383+
function reverse_part(n1,n2)
384+
A = zeros(n1,n2)
385+
@turbo for i=1:n1÷2, j = 1:n2
386+
c = 1.0
387+
A[i, j] = c
388+
r = n1 + 1 - i
389+
A[r, j] = c
390+
end
391+
return A
392+
end
393+
function reverse_part_ref(n1,n2)
394+
A = zeros(n1,n2)
395+
@inbounds for i=1:n1÷2; @simd for j = 1:n2
396+
c = 1.0
397+
A[i, j] = c
398+
r = n1 + 1 - i
399+
A[r, j] = c
400+
end; end
401+
return A
402+
end
384403

385404
@testset "shuffles load/stores" begin
386405
@show @__LINE__
@@ -474,5 +493,7 @@ end
474493
@test a_hi_tmp_ref == a_hi_tmp1
475494
@turbo a_hi_tmp1 .= 0;
476495
@test all(iszero, parent(a_hi_tmp1))
496+
497+
@test reverse_part(n_hi,4) == reverse_part_ref(n_hi,4)
477498
end
478499
end

0 commit comments

Comments
 (0)