Skip to content

Commit 7204ebe

Browse files
committed
choose_order test updates.
1 parent 002e88f commit 7204ebe

File tree

8 files changed

+56
-33
lines changed

8 files changed

+56
-33
lines changed

src/determinestrategy.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -880,8 +880,8 @@ function choose_tile(ls::LoopSet)
880880
while true
881881
for new_vec ∈ new_order # view to skip first
882882
u₁temp, u₂temp, cost_temp = evaluate_cost_tile(ls, new_order, UnrollSymbols(newu₁, newu₂, new_vec))
883-
# if cost_temp < lowest_cost
884-
if cost_temp ≤ lowest_cost
883+
# if cost_temp < lowest_cost # leads to 4 vmovapds
884+
if cost_temp ≤ lowest_cost # lead to 2 vmovapds
885885
lowest_cost = cost_temp
886886
u₁, u₂ = u₁temp, u₂temp
887887
best_vec = new_vec

src/lower_compute.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ end
7171
function add_loopvalue!(instrcall::Expr, loopval::Symbol, vectorized::Symbol, u::Int)
7272
if loopval === vectorized
7373
if isone(u)
74-
push!(instrcall.args, Expr(:call, :valadd, VECTORWIDTHSYMBOL, loopval))
74+
push!(instrcall.args, Expr(:call, lv(:valadd), VECTORWIDTHSYMBOL, loopval))
7575
else
7676
push!(instrcall.args, Expr(:call, lv(:valmuladd), VECTORWIDTHSYMBOL, u, loopval))
7777
end

src/lowering.jl

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,10 +78,12 @@ function lower_block(
7878
u₁ = n == u₁loopnum ? UF : u₁
7979
dontmaskfirsttiles = !isnothing(mask) && vectorizedloopnum == u₂loopnum
8080
blockq = Expr(:block)
81+
delay_u₁ = true
82+
# delay_u₁ = false
8183
for prepost ∈ 1:2
8284
# !u₁ && !u₂
8385
lower!(blockq, ops[1,1,prepost,n], ls, unrollsyms, u₁, nothing, mask)
84-
if u₁ == 4
86+
if !delay_u₁ || u₁ == 4
8587
lower!(blockq, ops[2,1,prepost,n], ls, unrollsyms, u₁, nothing, mask)
8688
end
8789
opsv1 = ops[1,2,prepost,n]
@@ -112,7 +114,7 @@ function lower_block(
112114
else # !u₁ && u₂
113115
lower!(blockq, opsv1, ls, unrollsyms, u₁, t, mask, store)
114116
end
115-
if iszero(t) && !store && u₁ != 4 # u₁ && !u₂
117+
if delay_u₁ && iszero(t) && !store && u₁ != 4 # u₁ && !u₂
116118
# for u ∈ 0:u₁-1
117119
lower!(blockq, ops[2,1,prepost,n], ls, unrollsyms, u₁, nothing, mask)
118120
# end
@@ -129,7 +131,7 @@ function lower_block(
129131
end
130132
nstores == 0 && break
131133
end
132-
elseif u₁ != 4
134+
elseif delay_u₁ && u₁ != 4
133135
# for u ∈ 0:u₁-1 # u₁ && !u₂
134136
lower!(blockq, ops[2,1,prepost,n], ls, unrollsyms, u₁, nothing, mask)
135137
# end

src/operation_evaluation_order.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ function addoptoorder!(
8181
isnopidentity(ls, op, u₁loop, u₂loop, vectorized, nothing) || push!(lo[isunrolled,1,after_loop,_n], op)
8282
end
8383
end
84+
# @show op, after_loop
8485
# isloopvalue(op) || push!(lo[isunrolled,istiled,after_loop,_n], op)
8586
# all(opp -> iszero(length(reduceddependencies(opp))), parents(op)) &&
8687
set_upstream_family!(place_after_loop, op, false, loopdependencies(op), identifier(op)) # parents that have already been included are not moved, so no need to check included_vars to filter

test/gemm.jl

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@
88
end
99
end);
1010
lsAmulBt1 = LoopVectorization.LoopSet(AmulBtq1);
11-
@test LoopVectorization.choose_order(lsAmulBt1) == (Symbol[:n,:m,:k], :n, :m, :m, Unum, Tnum)
11+
# @test LoopVectorization.choose_order(lsAmulBt1) == (Symbol[:n,:m,:k], :n, :m, :m, Unum, Tnum)
12+
@test LoopVectorization.choose_order(lsAmulBt1) == (Symbol[:n,:m,:k], :m, :n, :m, Unum, Tnum)
1213

1314
AmulBq1 = :(for m ∈ 1:size(A,1), n ∈ 1:size(B,2)
1415
C[m,n] = zeroB
@@ -17,15 +18,17 @@
1718
end
1819
end)
1920
lsAmulB1 = LoopVectorization.LoopSet(AmulBq1);
20-
@test LoopVectorization.choose_order(lsAmulB1) == (Symbol[:n,:m,:k], :n, :m, :m, Unum, Tnum)
21+
# @test LoopVectorization.choose_order(lsAmulB1) == (Symbol[:n,:m,:k], :n, :m, :m, Unum, Tnum)
22+
@test LoopVectorization.choose_order(lsAmulB1) == (Symbol[:n,:m,:k], :m, :n, :m, Unum, Tnum)
2123
AmulBq2 = :(for m ∈ 1:M, n ∈ 1:N
2224
C[m,n] = zero(eltype(B))
2325
for k ∈ 1:K
2426
C[m,n] += A[m,k] * B[k,n]
2527
end
2628
end)
2729
lsAmulB2 = LoopVectorization.LoopSet(AmulBq2);
28-
@test LoopVectorization.choose_order(lsAmulB2) == (Symbol[:n,:m,:k], :n, :m, :m, Unum, Tnum)
30+
# @test LoopVectorization.choose_order(lsAmulB2) == (Symbol[:n,:m,:k], :n, :m, :m, Unum, Tnum)
31+
@test LoopVectorization.choose_order(lsAmulB2) == (Symbol[:n,:m,:k], :m, :n, :m, Unum, Tnum)
2932
AmulBq3 = :(for m ∈ 1:size(A,1), n ∈ 1:size(B,2)
3033
ΔCₘₙ = zero(eltype(C))
3134
for k ∈ 1:size(A,2)
@@ -34,7 +37,7 @@
3437
C[m,n] += ΔCₘₙ
3538
end)
3639
lsAmulB3 = LoopVectorization.LoopSet(AmulBq3);
37-
@test LoopVectorization.choose_order(lsAmulB3) == (Symbol[:n,:m,:k], :n, :m, :m, Unum, Tnum)
40+
@test LoopVectorization.choose_order(lsAmulB3) == (Symbol[:n,:m,:k], :m, :n, :m, Unum, Tnum)
3841

3942
function AmulB!(C, A, B)
4043
C .= 0
@@ -113,7 +116,8 @@
113116
C[m,n] = α * ΔCₘₙ + β * C[m,n]
114117
end);
115118
lsAmuladd = LoopVectorization.LoopSet(Amuladdq);
116-
@test LoopVectorization.choose_order(lsAmuladd) == (Symbol[:n,:m,:k], :n, :m, :m, Unum, Tnum)
119+
# @test LoopVectorization.choose_order(lsAmuladd) == (Symbol[:n,:m,:k], :n, :m, :m, Unum, Tnum)
120+
@test LoopVectorization.choose_order(lsAmuladd) == (Symbol[:n,:m,:k], :m, :n, :m, Unum, Tnum)
117121
Atmuladdq = :(for m ∈ 1:size(A,2), n ∈ 1:size(B,2)
118122
ΔCₘₙ = zero(eltype(C))
119123
for k ∈ 1:size(A,1)
@@ -126,7 +130,8 @@
126130
# lsAmuladd.operations
127131
# LoopVectorization.loopdependencies.(lsAmuladd.operations)
128132
# LoopVectorization.reduceddependencies.(lsAmuladd.operations)
129-
@test LoopVectorization.choose_order(lsAtmuladd) == (Symbol[:n,:m,:k], :n, :m, :k, Unum, Tnum)
133+
# @test LoopVectorization.choose_order(lsAtmuladd) == (Symbol[:n,:m,:k], :n, :m, :k, Unum, Tnum)
134+
@test LoopVectorization.choose_order(lsAtmuladd) == (Symbol[:n,:m,:k], :m, :n, :k, Unum, Tnum)
130135

131136
function AmulB_avx1!(C, A, B)
132137
@_avx for m ∈ 1:size(A,1), n ∈ 1:size(B,2)
@@ -238,7 +243,8 @@
238243
end)
239244
lsAtmulB = LoopVectorization.LoopSet(AtmulBq);
240245
# LoopVectorization.choose_order(lsAtmulB)
241-
@test LoopVectorization.choose_order(lsAtmulB) == (Symbol[:n,:m,:k], :m, :n, :k, Unum, Tnum)
246+
# @test LoopVectorization.choose_order(lsAtmulB) == (Symbol[:n,:m,:k], :m, :n, :k, Unum, Tnum)
247+
@test LoopVectorization.choose_order(lsAtmulB) == (Symbol[:n,:m,:k], :n, :m, :k, Unum, Tnum)
242248

243249
function AtmulBavx1!(C, A, B)
244250
@avx for n ∈ 1:size(C,2), m ∈ 1:size(C,1)
@@ -319,9 +325,11 @@
319325
end)
320326
lsr2amb = LoopVectorization.LoopSet(r2ambq);
321327
if LoopVectorization.VectorizationBase.REGISTER_COUNT == 32
322-
@test LoopVectorization.choose_order(lsr2amb) == ([:n, :m, :k], :n, :m, :m, 3, 3)
328+
# @test LoopVectorization.choose_order(lsr2amb) == ([:n, :m, :k], :n, :m, :m, 3, 3)
329+
@test LoopVectorization.choose_order(lsr2amb) == ([:n, :m, :k], :m, :n, :m, 3, 6)
323330
else
324-
@test LoopVectorization.choose_order(lsr2amb) == ([:n, :m, :k], :n, :m, :m, 2, 2)
331+
# @test LoopVectorization.choose_order(lsr2amb) == ([:n, :m, :k], :n, :m, :m, 2, 2)
332+
@test LoopVectorization.choose_order(lsr2amb) == ([:n, :m, :k], :m, :n, :m, 2, 4)
325333
end
326334
function rank2AmulBavx!(C, Aₘ, Aₖ, B)
327335
@avx for m ∈ 1:size(C,1), n ∈ 1:size(C,2)

test/gemv.jl

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@ using LoopVectorization
22
using Test
33

44
@testset "GEMV" begin
5-
Unum, Tnum = LoopVectorization.VectorizationBase.REGISTER_COUNT == 16 ? (3, 4) : (4, 6)
5+
# Unum, Tnum = LoopVectorization.VectorizationBase.REGISTER_COUNT == 16 ? (3, 4) : (4, 6)
6+
Unum, Tnum = LoopVectorization.VectorizationBase.REGISTER_COUNT == 16 ? (3, 4) : (4, 4)
67
gemvq = :(for i ∈ eachindex(y)
78
yᵢ = 0.0
89
for j ∈ eachindex(x)
@@ -11,7 +12,11 @@ using Test
1112
y[i] = yᵢ
1213
end)
1314
lsgemv = LoopVectorization.LoopSet(gemvq);
14-
@test LoopVectorization.choose_order(lsgemv) == (Symbol[:i, :j], :j, :i, :i, Unum, Tnum)
15+
if LoopVectorization.VectorizationBase.REGISTER_COUNT == 16
16+
@test LoopVectorization.choose_order(lsgemv) == (Symbol[:i, :j], :j, :i, :i, Unum, Tnum)
17+
else
18+
@test LoopVectorization.choose_order(lsgemv) == (Symbol[:i, :j], :i, :j, :i, 4, 4)
19+
end
1520

1621
function mygemv!(y, A, x)
1722
@inbounds for i ∈ eachindex(y)
@@ -127,7 +132,11 @@ using Test
127132
G[d1,κ] = z
128133
end)
129134
lsgemv = LoopVectorization.LoopSet(gemvq);
130-
@test LoopVectorization.choose_order(lsgemv) == ([:d1,:d2], :d2, :d1, :d2, Unum, Tnum)
135+
if LoopVectorization.VectorizationBase.REGISTER_COUNT == 16
136+
@test LoopVectorization.choose_order(lsgemv) == ([:d1,:d2], :d2, :d1, :d2, Unum, Tnum)
137+
else
138+
@test LoopVectorization.choose_order(lsgemv) == ([:d1,:d2], :d1, :d2, :d2, 4, 4)
139+
end
131140
function AtmulvB_avx3!(G, B,κ)
132141
d = size(G,1)
133142
@_avx for d1=1:d
@@ -144,7 +153,8 @@ using Test
144153
end
145154
end)
146155
lsp = LoopVectorization.LoopSet(pq);
147-
@test LoopVectorization.choose_order(lsp) == ([:d1, :d2], :d2, :d1, :d2, Unum, Tnum)
156+
# @test LoopVectorization.choose_order(lsp) == ([:d1, :d2], :d2, :d1, :d2, Unum, Tnum)
157+
@test LoopVectorization.choose_order(lsp) == ([:d1, :d2], :d1, :d2, :d2, Unum, Tnum)
148158
# lsp.preamble_symsym
149159

150160
function hhavx!(A::AbstractVector{T}, B, C, D) where {T}

test/ifelsemasks.jl

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -406,18 +406,15 @@ T = Float32
406406

407407
ai = [rand(Bool) for _ in 1:71];
408408
bi = [rand(Bool) for _ in 1:71];
409-
if LoopVectorization.VectorizationBase.AVX2 || Base.libllvm_version ≥ v"8" #FIXME Why doesn't this work on Travis Ivy Bridge Julia 1.1?
409+
# if LoopVectorization.VectorizationBase.AVX2 || Base.libllvm_version ≥ v"8" #FIXME Why doesn't this work on Travis Ivy Bridge Julia 1.1?
410410
@test (ai .& bi) == (@avx ai .& bi)
411411
@test (ai .| bi) == (@avx ai .| bi)
412412
@test (ai .⊻ bi) == (@avx ai .⊻ bi)
413-
else
414-
@test_broken (ai .& bi) == (@avx ai .& bi)
415-
@test_broken (ai .| bi) == (@avx ai .| bi)
416-
@test_broken (ai .⊻ bi) == (@avx ai .⊻ bi)
417-
# @test_broken (Ai .& bi) == (@avx Ai .& bi)
418-
# @test_broken (ai .| Bi) == (@avx ai .| Bi)
419-
# @test_broken (Ai .⊻ Bi) == (@avx Ai .⊻ Bi)
420-
end
413+
# else
414+
# @test_broken (ai .& bi) == (@avx ai .& bi)
415+
# @test_broken (ai .| bi) == (@avx ai .| bi)
416+
# @test_broken (ai .⊻ bi) == (@avx ai .⊻ bi)
417+
# end
421418
a = bitrand(127); b = bitrand(127);
422419
@test (a .& b) == (@avx a .& b)
423420
@test (a .| b) == (@avx a .| b)

test/miscellaneous.jl

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,15 @@ using Test
44

55
@testset "Miscellaneous" begin
66

7-
Unum, Tnum = LoopVectorization.VectorizationBase.REGISTER_COUNT == 16 ? (3, 4) : (4, 4)
7+
# Unum, Tnum = LoopVectorization.VectorizationBase.REGISTER_COUNT == 16 ? (3, 4) : (4, 4)
8+
Unum, Tnum = LoopVectorization.VectorizationBase.REGISTER_COUNT == 16 ? (3, 4) : (4, 6)
89
dot3q = :(for m ∈ 1:M, n ∈ 1:N
910
s += x[m] * A[m,n] * y[n]
1011
end);
1112
lsdot3 = LoopVectorization.LoopSet(dot3q);
1213
if LoopVectorization.VectorizationBase.REGISTER_COUNT == 32
13-
@test LoopVectorization.choose_order(lsdot3) == ([:n, :m], :n, :m, :m, Unum, Tnum)#&-2
14+
# @test LoopVectorization.choose_order(lsdot3) == ([:n, :m], :n, :m, :m, Unum, Tnum)#&-2
15+
@test LoopVectorization.choose_order(lsdot3) == ([:n, :m], :m, :n, :m, Unum, Tnum)#&-2
1416
else
1517
@test LoopVectorization.choose_order(lsdot3) == ([:n, :m], :m, :n, :m, Unum, Tnum)#&-2
1618
end
@@ -64,7 +66,8 @@ using Test
6466
if LoopVectorization.VectorizationBase.REGISTER_COUNT == 32
6567
@test LoopVectorization.choose_order(lssubcol) == (Symbol[:j,:i], :j, :i, :j, 4, 6)
6668
else
67-
@test LoopVectorization.choose_order(lssubcol) == ([:j, :i], :j, :i, :j, 3, 4)#&-2
69+
# @test LoopVectorization.choose_order(lssubcol) == ([:j, :i], :j, :i, :j, 3, 4)#&-2
70+
@test LoopVectorization.choose_order(lssubcol) == ([:j, :i], :i, :j, :j, 4, 4)#&-2
6871
end
6972
## @avx is SLOWER!!!!
7073
## need to fix!
@@ -93,7 +96,8 @@ using Test
9396
if LoopVectorization.VectorizationBase.REGISTER_COUNT == 32
9497
@test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :j, :i, :j, 4, 6)
9598
else
96-
@test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :j, :i, :j, 3, 4)
99+
# @test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :j, :i, :j, 3, 4)
100+
@test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :i, :j, :j, 4, 4)
97101
end
98102
# my colsum is wrong (by 0.25), but slightly more interesting
99103
function mycolsum!(x, A)
@@ -133,7 +137,8 @@ using Test
133137
if LoopVectorization.VectorizationBase.REGISTER_COUNT == 32
134138
@test LoopVectorization.choose_order(lsvar) == (Symbol[:j,:i], :j, :i, :j, 4, 6)
135139
else
136-
@test LoopVectorization.choose_order(lsvar) == (Symbol[:j,:i], :i, :j, :j, 4, 4)
140+
# @test LoopVectorization.choose_order(lsvar) == (Symbol[:j,:i], :i, :j, :j, 4, 4)
141+
@test LoopVectorization.choose_order(lsvar) == (Symbol[:j,:i], :j, :i, :j, 2, 4)
137142
end
138143

139144
function myvar!(s², A, x̄)

0 commit comments

Comments
 (0)