Skip to content

Commit ae2e002

Browse files
committed
Fix for reductions while unrolling 5x-7x with static loops.
1 parent bda3516 commit ae2e002

File tree

6 files changed

+61
-8
lines changed

6 files changed

+61
-8
lines changed

Project.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.8.19"
4+
version = "0.8.20"
55

66
[deps]
77
DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
@@ -15,10 +15,10 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
1515
[compat]
1616
DocStringExtensions = "0.8"
1717
OffsetArrays = "1"
18-
SIMDPirates = "0.8.16"
18+
SIMDPirates = "0.8.19"
1919
SLEEFPirates = "0.5.4"
2020
UnPack = "0,1"
21-
VectorizationBase = "0.12.24"
21+
VectorizationBase = "0.12.28"
2222
julia = "1.1"
2323

2424
[extras]

src/condense_loopset.jl

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -293,7 +293,6 @@ end
293293

294294
make_fast(q) = Expr(:macrocall, Symbol("@fastmath"), LineNumberNode(@__LINE__,Symbol(@__FILE__)), q)
295295
make_crashy(q) = Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__,Symbol(@__FILE__)), q)
296-
make_fast_and_crashy(q) = q |> make_fast |> make_crashy
297296

298297
function setup_call_inline(ls::LoopSet, inline::Int8 = zero(Int8), U::Int8 = zero(Int8), T::Int8 = zero(Int8))
299298
call = generate_call(ls, (inline,U,T))
@@ -335,5 +334,5 @@ function setup_call(ls::LoopSet, q = nothing, inline::Int8 = zero(Int8), check_e
335334
call = setup_call_inline(ls, inline, u₁, u₂)
336335
call = check_empty ? check_if_empty(ls, call) : call
337336
isnothing(q) && return Expr(:block, ls.prepreamble, call)
338-
Expr(:block, ls.prepreamble, Expr(:if, check_args_call(ls), call, make_fast_and_crashy(q)))
337+
Expr(:block, ls.prepreamble, Expr(:if, check_args_call(ls), call, make_crashy(make_fast(q))))
339338
end

src/lower_store.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ function reduce_range!(q::Expr, toreduct::Symbol, instr::Instruction, Uh::Int, U
5757
push!(q.args, Expr(:(=), Symbol(toreduct, (u>>>1)), instrexpr))
5858
end
5959
else
60-
for u ∈ Uh:Uh2-1
60+
for u ∈ Uh:Uh2-2
6161
tru = Symbol(toreduct, u - Uh)
6262
instrexpr = callexpr(instr)
6363
push!(instrexpr.args, tru)

src/lowering.jl

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -652,11 +652,13 @@ function lsexpr(ls::LoopSet, q)
652652
end
653653

654654
function calc_Ureduct(ls::LoopSet, us::UnrollSpecification)
655-
@unpack u₁loopnum, u₁, u₂ = us
655+
@unpack u₁loopnum, u₁, u₂, vectorizedloopnum = us
656656
if iszero(length(ls.outer_reductions))
657657
-1
658658
elseif u₂ == -1
659-
min(u₁, 4)
659+
loopisstatic = isstaticloop(getloop(ls, names(ls)[u₁loopnum]))
660+
loopisstatic &= ((vectorizedloopnum != u₁loopnum) | (!iszero(ls.vector_width[])))
661+
loopisstatic ? u₁ : min(u₁, 4)
660662
else
661663
8#u₂#u₁
662664
# elseif num_loops(ls) == u₁loopnum

test/dot.jl

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,20 @@ using Test
210210
end
211211
s
212212
end
213+
function dot33(a,b)
214+
s = zero(eltype(a))
215+
@avx for i ∈ 1:33
216+
s += a[i] * b[i]
217+
end
218+
s
219+
end
220+
function dot17(a,b)
221+
s = zero(eltype(a))
222+
@avx for i ∈ 1:17
223+
s += a[i] * b[i]
224+
end
225+
s
226+
end
213227
# @macroexpand @_avx for i = 1:length(a_re) - 1
214228
# c_re[i] = b_re[i] * a_re[i + 1] - b_im[i] * a_im[i + 1]
215229
# c_im[i] = b_re[i] * a_im[i + 1] + b_im[i] * a_re[i + 1]
@@ -239,6 +253,9 @@ using Test
239253
@test myselfdot_avx(a) ≈ s
240254
@test myselfdotavx(a) ≈ s
241255

256+
@test dot17(a,b) ≈ @view(a[1:17])' * @view(b[1:17])
257+
@test dot33(a,b) ≈ @view(a[1:33])' * @view(b[1:33])
258+
242259
if T <: Union{Float32,Float64}
243260
πest = pi(a, b)
244261
@test πest == piavx(a, b)

test/miscellaneous.jl

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1059,3 +1059,38 @@ end
10591059
end
10601060

10611061

1062+
function mul1!(y::Vector{T}, A::Matrix{UInt8}, x::Vector{T}) where T
1063+
packedstride = size(A, 1)
1064+
m, n = size(A)
1065+
@avx for j ∈ eachindex(x)
1066+
for i ∈ eachindex(y)
1067+
k = 2 * ((i-1) & 3)
1068+
block = A[(j-1) * packedstride + ((i-1) >> 2) + 1]
1069+
Aij = (block >> k) & 3
1070+
y[i] += (((Aij >= 2) + (Aij >= 3))) * x[j]
1071+
end
1072+
end
1073+
y
1074+
end
1075+
function mul2!(y::Vector{T}, A::Matrix{UInt8}, x::Vector{T}) where T
1076+
packedstride = size(A, 1)
1077+
m, n = size(A)
1078+
for j ∈ eachindex(x)
1079+
for i ∈ eachindex(y)
1080+
k = 2 * ((i-1) & 3)
1081+
block = A[(j-1) * packedstride + ((i-1) >> 2) + 1]
1082+
Aij = (block >> k) & 3
1083+
y[i] += (((Aij >= 2) + (Aij >= 3))) * x[j]
1084+
end
1085+
end
1086+
y
1087+
end
1088+
@testset "UInt8 mul" begin
1089+
for n in 1:200
1090+
v1 = rand(n); v3 =copy(v1);
1091+
v2 = rand(n);
1092+
A = rand(UInt8, (length(v1)>>2) + (length(v1)%4 != 0), length(v2))
1093+
@test mul1!(v1, A, v2) ≈ mul2!(v3, A, v2)
1094+
end
1095+
end
1096+

0 commit comments

Comments
 (0)