Skip to content

Commit ea5b46b

Browse files
committed
2 parents b5cf11b + 0977109 commit ea5b46b

File tree

6 files changed

+93
-46
lines changed

6 files changed

+93
-46
lines changed

src/determinestrategy.jl

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -548,8 +548,11 @@ function stride_penalty(ls::LoopSet, order::Vector{Symbol})
548548
push!(v, stride_penalty(ls, op, order, loopfreqs))
549549
end
550550
end
551-
# 1 / 1024 = 0.0009765625
552-
10.0sum(maximum, values(stridepenaltydict)) * Base.power_by_squaring(0.0009765625, length(order))
551+
if iszero(length(values(stridepenaltydict)))
552+
0.0
553+
else # 1 / 1024 = 0.0009765625
554+
10.0sum(maximum, values(stridepenaltydict)) * Base.power_by_squaring(0.0009765625, length(order))
555+
end
553556
end
554557
function isoptranslation(ls::LoopSet, op::Operation, unrollsyms::UnrollSymbols)
555558
@unpack u₁loopsym, u₂loopsym, vectorized = unrollsyms

src/loopstartstopmanager.jl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -157,10 +157,11 @@ function add_loop_start_stop_manager!(ls::LoopSet)
157157
end
158158
end
159159
loopstarts[i] = loopstartᵢ
160-
terminators[i] = if (loopsym ∈ loopinductvars) || (any(r -> any(isequal(-i), r), use_livs))
160+
terminators[i] = if (loopsym ∈ loopinductvars) || (any(r -> any(isequal(-i), r), use_livs)) || iszero(length(loopstartᵢ))
161161
0
162162
else
163-
@assert !iszero(length(loopstartᵢ))
163+
# @show i, loopsym loopdependencies.(operations(ls)) operations(ls)
164+
# @assert !iszero(length(loopstartᵢ))
164165
last(ric[argmin(first.(ric))]) # index corresponds to array ref's position in loopstart
165166
end
166167
end

src/lower_compute.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,7 @@ function lower_compute!(
251251
if last(instrcall.args) == varsym
252252
pushfirst!(instrcall.args, lv(:vifelse))
253253
insert!(instrcall.args, 3, mask)
254-
else
254+
elseif all(in(loopdependencies(op)), reduceddeps) || any(opp -> mangledvar(opp) === mangledvar(op), parents_op)
255255
push!(q.args, Expr(:(=), varsym, Expr(:call, lv(:vifelse), mask, instrcall, varsym)))
256256
continue
257257
end

src/reconstruct_loopset.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -494,7 +494,7 @@ Execute an `@avx` block. The block's code is represented via the arguments:
494494
@generated function _avx_!(::Val{UNROLL}, ::Type{OPS}, ::Type{ARF}, ::Type{AM}, ::Type{LPSYM}, lb::LB, vargs...) where {UNROLL, OPS, ARF, AM, LPSYM, LB}
495495
# 1 + 1 # Irrelevant line you can comment out/in to force recompilation...
496496
ls = _avx_loopset(OPS.parameters, ARF.parameters, AM.parameters, LPSYM.parameters, LB.parameters, vargs)
497-
# @show avx_body(ls, UNROLL)
497+
# return @show avx_body(ls, UNROLL)
498498
# @show UNROLL, OPS, ARF, AM, LPSYM, LB
499499
avx_body(ls, UNROLL)
500500
end

test/miscellaneous.jl

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -660,6 +660,38 @@ function maxavx!(R::AbstractArray{T}, Q, keep=nothing) where T
660660
end
661661
R
662662
end
663+
function splitintonoloop(U = randn(2,2), E1 = randn(2))
664+
t = 1
665+
a = 1.0
666+
_s = 0.0
667+
n, k = size(U)
668+
@avx for j = 1:k
669+
for i = 1:n
670+
u = tanh(a * U[i,j])
671+
v = a * (1 - t * t)
672+
U[i,j] = u
673+
_s += v
674+
end
675+
E1[j] = _s / n
676+
end
677+
U, E1
678+
end
679+
function splitintonoloop_reference(U = randn(2,2), E1 = randn(2))
680+
t = 1
681+
a = 1.0
682+
_s = 0.0
683+
n, k = size(U)
684+
for j = 1:k
685+
for i = 1:n
686+
u = tanh(a * U[i,j])
687+
v = a * (1 - t * t)
688+
U[i,j] = u
689+
_s += v
690+
end
691+
E1[j] = _s / n
692+
end
693+
U, E1
694+
end
663695

664696

665697

@@ -865,7 +897,12 @@ end
865897
@test maxavx!(R, Q) == vec(maximum(Q, dims=(2,3)))
866898
R .+= randn.(T); Rc = copy(R);
867899
@test maxavx!(R, Q, true) == max.(vec(maximum(Q, dims=(2,3))), Rc)
868-
900+
901+
U1 = randn(5,7); E1 = randn(7);
902+
U2, E2 = splitintonoloop_reference(copy(U1), copy(E1));
903+
splitintonoloop(U1, E1);
904+
@test U1 ≈ U2
905+
@test E1 ≈ E2
869906
end
870907
for T ∈ [Int16, Int32, Int64]
871908
n = 8sizeof(T) - 1

test/offsetarrays.jl

Lines changed: 45 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
using LoopVectorization, OffsetArrays, Test
22
using LoopVectorization.VectorizationBase: StaticUnitRange
3-
T = Float64
3+
# T = Float64
44
# T = Float32
55

66
@testset "OffsetArrays" begin
@@ -201,55 +201,61 @@ T = Float64
201201
for T ∈ (Float32, Float64)
202202
@show T, @__LINE__
203203
A = rand(T, 100, 100); At = copy(A');
204-
kern = OffsetArray(rand(T, 3, 3), -1:1, -1:1);
205-
out1 = OffsetArray(view(similar(A, size(A) .+ 32), (1:98) .+ 32, (1:98) .+ 32), 1, 1); # stay away from the edges of A
206-
# out1 = OffsetArray(similar(A, size(A).-2), 1, 1); # stay away from the edges of A
207-
out2 = similar(out1); out3 = similar(out1); out4 = similar(out1);
208-
skern = SizedOffsetMatrix{T,-1,1,-1,1}(parent(kern));
204+
for r ∈ (-1:1, -2:2)
205+
@show r
206+
fr = first(r); lr = last(r);
207+
kern = OffsetArray(rand(T, length(r), length(r)), r, r);
208+
out1 = OffsetArray(view(similar(A, size(A) .+ 32), (1+lr:100-lr) .+ 32, (1+lr:100-lr) .+ 32), lr, lr); # stay away from the edges of A
209+
# out1 = OffsetArray(similar(A, size(A).-2), 1, 1); # stay away from the edges of A
210+
out2 = similar(out1); out3 = similar(out1); out4 = similar(out1);
211+
skern = SizedOffsetMatrix{T,fr,lr,fr,lr}(parent(kern));
209212

210-
old2d!(out1, A, kern);
211-
avx2d!(out2, A, kern);
212-
@test out1 ≈ out2
213+
old2d!(out1, A, kern);
214+
avx2d!(out2, A, kern);
215+
@test out1 ≈ out2
213216

214-
avx2douter!(out3, A, kern);
215-
@test out1 ≈ out3
217+
avx2douter!(out3, A, kern);
218+
@test out1 ≈ out3
216219

217-
fill!(out2, NaN); avx2d!(out2, A, skern);
218-
@test out1 ≈ out2
220+
fill!(out2, NaN); avx2d!(out2, A, skern);
221+
@test out1 ≈ out2
219222

220-
fill!(out2, NaN); avx2douter!(out2, At', kern);
221-
@test out1 ≈ out2
223+
fill!(out2, NaN); avx2douter!(out2, At', kern);
224+
@test out1 ≈ out2
222225

223-
fill!(out2, NaN); avx2douter!(out2', A, kern);
224-
@test out1 ≈ out2'
226+
fill!(out2, NaN); avx2douter!(out2', A, kern);
227+
@test out1 ≈ out2'
225228

226-
fill!(out2, NaN); avx2douter!(out2', At', kern);
227-
@test out1 ≈ out2'
229+
fill!(out2, NaN); avx2douter!(out2', At', kern);
230+
@test out1 ≈ out2'
228231

229-
fill!(out3, NaN); avx2douter!(out3, A, skern);
230-
@test out1 ≈ out3
232+
fill!(out3, NaN); avx2douter!(out3, A, skern);
233+
@test out1 ≈ out3
231234

232-
fill!(out3, NaN); avx2dunrolled!(out3, A, skern);
233-
@test out1 ≈ out3
235+
if r == -1:1
236+
fill!(out3, NaN); avx2dunrolled!(out3, A, skern);
237+
@test out1 ≈ out3
234238

235-
fill!(out3, NaN); avx2dunrolled2x2!(out3, A, skern);
236-
@test out1 ≈ out3
239+
fill!(out3, NaN); avx2dunrolled2x2!(out3, A, skern);
240+
@test out1 ≈ out3
237241

238-
fill!(out3, NaN); avx2dunrolled3x3!(out3, A, skern);
239-
@test out1 ≈ out3
240-
241-
@test avxgeneric!(out4, A, kern) ≈ out1
242-
fill!(out4, NaN);
243-
@test avxgeneric!(out4, A, skern) ≈ out1
242+
fill!(out3, NaN); avx2dunrolled3x3!(out3, A, skern);
243+
@test out1 ≈ out3
244+
end
245+
246+
@test avxgeneric!(out4, A, kern) ≈ out1
247+
fill!(out4, NaN);
248+
@test avxgeneric!(out4, A, skern) ≈ out1
244249

245-
fill!(out4, NaN); @test avxgeneric2!(out4, A, kern) ≈ out1
246-
fill!(out4, NaN); @test avxgeneric2!(out4, A, skern) ≈ out1
247-
fill!(out4, NaN); @test avxgeneric2!(out4, At', kern) ≈ out1
248-
fill!(out4, NaN); @test avxgeneric2!(out4, At', skern) ≈ out1
249-
fill!(out4, NaN); @test avxgeneric2!(out4', A, kern) ≈ out1
250-
fill!(out4, NaN); @test avxgeneric2!(out4', A, skern) ≈ out1
251-
fill!(out4, NaN); @test avxgeneric2!(out4', At', kern) ≈ out1
252-
fill!(out4, NaN); @test avxgeneric2!(out4', At', skern) ≈ out1
250+
fill!(out4, NaN); @test avxgeneric2!(out4, A, kern) ≈ out1
251+
fill!(out4, NaN); @test avxgeneric2!(out4, A, skern) ≈ out1
252+
fill!(out4, NaN); @test avxgeneric2!(out4, At', kern) ≈ out1
253+
fill!(out4, NaN); @test avxgeneric2!(out4, At', skern) ≈ out1
254+
fill!(out4, NaN); @test avxgeneric2!(out4', A, kern) ≈ out1
255+
fill!(out4, NaN); @test avxgeneric2!(out4', A, skern) ≈ out1
256+
fill!(out4, NaN); @test avxgeneric2!(out4', At', kern) ≈ out1
257+
fill!(out4, NaN); @test avxgeneric2!(out4', At', skern) ≈ out1
258+
end
253259
end
254260

255261

0 commit comments

Comments
 (0)