Skip to content

Commit 30a9ddf

Browse files
committed
Try to improve test coverage slightly.
1 parent b568584 commit 30a9ddf

11 files changed

+109
-105
lines changed

src/add_compute.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,9 @@ function pushparent!(parents::Vector{Operation}, deps::Vector{Symbol}, reducedde
4444
push!(parents, parent)
4545
update_deps!(deps, reduceddeps, parent)
4646
end
47-
function pushparent!(mpref::ArrayReferenceMetaPosition, parent::Operation)
48-
pushparent!(mpref.parents, mpref.loopdependencies, mpref.reduceddeps, parent)
49-
end
47+
# function pushparent!(mpref::ArrayReferenceMetaPosition, parent::Operation)
48+
# pushparent!(mpref.parents, mpref.loopdependencies, mpref.reduceddeps, parent)
49+
# end
5050
function add_parent!(
5151
vparents::Vector{Operation}, deps::Vector{Symbol}, reduceddeps::Vector{Symbol}, ls::LoopSet, var, elementbytes::Int, position::Int
5252
)

src/broadcast.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ Base.@propagate_inbounds Base.getindex(A::LowDimArray, i...) = getindex(A.data,
131131
s = Expr(:call, smul, T, multup)
132132
f = D[1] ? :PackedStridedPointer : :SparseStridedPointer
133133
Expr(:block, Expr(:meta,:inline), Expr(:(=), :strideA, Expr(:call, :strides, Expr(:(.), :A, QuoteNode(:data)))),
134-
Expr(:call, Expr(:(.), :VectorizationBase, QuoteNode(f)), Expr(:call, :pointer, Expr(:(.), :A, QuoteNode(:data))), s))
134+
Expr(:call, Expr(:(.), :VectorizationBase, QuoteNode(f)), Expr(:call, :pointer, :A), s))
135135
end
136136
function LowDimArray{D}(data::A) where {D,T,N,A <: AbstractArray{T,N}}
137137
LowDimArray{D,T,N,A}(data)

src/condense_loopset.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ function findmatchingarray(ls::LoopSet, mref::ArrayReferenceMeta)
8181
end
8282
0x00
8383
end
84-
filled_4byte_chunks(u::UInt64) = 16 - (leading_zeros(u) >>> 2)
84+
# filled_4byte_chunks(u::UInt64) = 16 - (leading_zeros(u) >>> 2)
8585
filled_8byte_chunks(u::UInt64) = 8 - (leading_zeros(u) >>> 3)
8686

8787
# num_loop_deps(os::OperationStruct) = filled_4byte_chunks(os.loopdeps)

src/determinestrategy.jl

Lines changed: 52 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
11

2-
function indexappearences(op::Operation, s::Symbol)
3-
s ∉ loopdependencies(op) && return 0
4-
appearences = 0
5-
if isloopvalue(op)
6-
return s === first(loopdependencies(op)) ? 1 : 0
7-
elseif isload(op)
8-
return 100
9-
end
10-
newapp = 0
11-
for opp ∈ parents(op)
12-
newapp += indexappearences(opp, s)
13-
end
14-
factor = instruction(op).instr ∈ (:+, :vadd, :add_fast, :evadd) ? 1 : 10
15-
newapp * factor
16-
end
2+
# function indexappearences(op::Operation, s::Symbol)
3+
# s ∉ loopdependencies(op) && return 0
4+
# appearences = 0
5+
# if isloopvalue(op)
6+
# return s === first(loopdependencies(op)) ? 1 : 0
7+
# elseif isload(op)
8+
# return 100
9+
# end
10+
# newapp = 0
11+
# for opp ∈ parents(op)
12+
# newapp += indexappearences(opp, s)
13+
# end
14+
# factor = instruction(op).instr ∈ (:+, :vadd, :add_fast, :evadd) ? 1 : 10
15+
# newapp * factor
16+
# end
1717
function findparent(ls::LoopSet, s::Symbol)#opdict isn't filled when reconstructing
1818
id = findfirst(op -> name(op) === s, operations(ls))
1919
id === nothing && throw("$s not found")
@@ -42,13 +42,13 @@ function unitstride(ls::LoopSet, op::Operation, s::Symbol)
4242
true
4343
end
4444

45-
function register_pressure(op::Operation)
46-
if isconstant(op) || isloopvalue(op)
47-
0
48-
else
49-
instruction_cost(instruction(op)).register_pressure
50-
end
51-
end
45+
# function register_pressure(op::Operation)
46+
# if isconstant(op) || isloopvalue(op)
47+
# 0
48+
# else
49+
# instruction_cost(instruction(op)).register_pressure
50+
# end
51+
# end
5252
function cost(ls::LoopSet, op::Operation, vectorized::Symbol, Wshift::Int, size_T::Int = op.elementbytes)
5353
isconstant(op) && return 0.0, 0, Float64(length(loopdependencies(op)) > 0)
5454
isloopvalue(op) && return 0.0, 0, 0.0
@@ -82,7 +82,7 @@ function cost(ls::LoopSet, op::Operation, vectorized::Symbol, Wshift::Int, size_
8282
# this feature is common to all of them.
8383
srt += 0.5VectorizationBase.REGISTER_SIZE / VectorizationBase.CACHELINE_SIZE
8484
end
85-
elseif instr === :setindex! # broadcast or reductionstore; if store we want to penalize reduction
85+
elseif isstore(op) # broadcast or reductionstore; if store we want to penalize reduction
8686
srt *= 3
8787
sl *= 3
8888
end
@@ -95,12 +95,12 @@ end
9595
function biggest_type_size(ls::LoopSet)
9696
maximum(elsize, operations(ls))
9797
end
98-
function VectorizationBase.pick_vector_width(ls::LoopSet, u::Symbol)
99-
VectorizationBase.pick_vector_width(length(ls, u), biggest_type_size(ls))
100-
end
101-
function VectorizationBase.pick_vector_width_shift(ls::LoopSet, u::Symbol)
102-
VectorizationBase.pick_vector_width_shift(length(ls, u), biggest_type_size(ls))
103-
end
98+
# function VectorizationBase.pick_vector_width(ls::LoopSet, u::Symbol)
99+
# VectorizationBase.pick_vector_width(length(ls, u), biggest_type_size(ls))
100+
# end
101+
# function VectorizationBase.pick_vector_width_shift(ls::LoopSet, u::Symbol)
102+
# VectorizationBase.pick_vector_width_shift(length(ls, u), biggest_type_size(ls))
103+
# end
104104
function hasintersection(a, b)
105105
for aᵢ ∈ a, bᵢ ∈ b
106106
aᵢ === bᵢ && return true
@@ -208,9 +208,7 @@ function unroll_no_reductions(ls, order, vectorized)
208208
W, Wshift = lsvecwidthshift(ls, vectorized, size_T)
209209
# W, Wshift = VectorizationBase.pick_vector_width_shift(length(ls, vectorized), size_T)::Tuple{Int,Int}
210210

211-
compute_rt = 0.0
212-
load_rt = 0.0
213-
store_rt = 0.0
211+
compute_rt = load_rt = store_rt = 0.0
214212
unrolled = last(order)
215213
if unrolled === vectorized && length(order) > 1
216214
unrolled = order[end-1]
@@ -399,12 +397,12 @@ function solve_unroll_constT(R::AbstractVector, u₂::Int)
399397
iszero(denom) && return 8
400398
floor(Int, (REGISTER_COUNT - R[3] - R[4] - u₂*R[5]) / denom)
401399
end
402-
function solve_unroll_constT(ls::LoopSet, u₂::Int)
403-
R = @view ls.reg_pres[:,1]
404-
denom = u₂ * R[1] + R[2]
405-
iszero(denom) && return 8
406-
floor(Int, (REGISTER_COUNT - R[3] - R[4] - u₂*R[5]) / (u₂ * R[1] + R[2]))
407-
end
400+
# function solve_unroll_constT(ls::LoopSet, u₂::Int)
401+
# R = @view ls.reg_pres[:,1]
402+
# denom = u₂ * R[1] + R[2]
403+
# iszero(denom) && return 8
404+
# floor(Int, (REGISTER_COUNT - R[3] - R[4] - u₂*R[5]) / (u₂ * R[1] + R[2]))
405+
# end
408406
# Tiling here is about alleviating register pressure for the UxT
409407
function solve_unroll(X, R, u₁max, u₂max, u₁L, u₂L, u₁step, u₂step)
410408
# iszero(first(R)) && return -1,-1,Inf #solve_smalltilesize(X, R, u₁max, u₂max)
@@ -414,11 +412,10 @@ function solve_unroll(X, R, u₁max, u₂max, u₁L, u₂L, u₁step, u₂step)
414412
u₁_too_large = u₁ > u₁max
415413
u₂_too_large = u₂ > u₂max
416414
if u₁_too_large
415+
u₁ = u₁max
417416
if u₂_too_large
418-
u₁ = u₁max
419417
u₂ = u₂max
420418
else # u₁ too large, resolve u₂
421-
u₁ = u₁max
422419
u₂ = min(u₂max, max(1,solve_unroll_constU(R, u₁)))
423420
end
424421
cost = unroll_cost(X, u₁, u₂, u₁L, u₂L)
@@ -609,10 +606,6 @@ function maxnegativeoffset(ls::LoopSet, op::Operation, u::Symbol)
609606
# opploopi = oppmref.loopedindex
610607
mnonew = typemin(Int)
611608
for i ∈ eachindex(opinds)
612-
if opinds[i] !== oppinds[i]
613-
mnonew = 1
614-
break
615-
end
616609
if opinds[i] === u
617610
mnonew = (opoffs[i] - oppoffs[i])
618611
elseif opoffs[i] != oppoffs[i]
@@ -727,20 +720,12 @@ function add_constant_offset_load_elmination_cost!(
727720
# we treat this as the unrolled loop getting eliminated is split into 2 parts:
728721
# 1 a non-cost-reduced part, with factor udependent_reduction
729722
# 2 a cost-reduced part, with factor uindependent_increase
730-
if uid == 1 # u₁reduces was false
723+
(r, i) = if uid == 1 # u₁reduces was false
731724
@assert !u₁reduces
732-
if u₂reduces
733-
r, i = 4, 2
734-
else
735-
r, i = 3, 1
736-
end
725+
u₂reduces ? (4, 2) : (3, 1)
737726
elseif uid == 2 # u₂reduces was false
738727
@assert !u₂reduces
739-
if u₁reduces
740-
r, i = 4, 3
741-
else
742-
r, i = 2, 1
743-
end
728+
u₁reduces ? (4, 3) : (2, 1)
744729
else
745730
throw("uid somehow did not return 1 or 2, even though offset > -4.")
746731
end
@@ -1085,16 +1070,16 @@ function choose_order(ls::LoopSet)
10851070
order, unroll, tile, vec, u₁, u₂
10861071
end
10871072

1088-
function register_pressure(ls::LoopSet, u₁, u₂)
1089-
if u₂ == -1
1090-
sum(register_pressure, operations(ls))
1091-
else
1092-
rp = @view ls.reg_pres[:,1]
1093-
u₁ * u₂ * rp[1] + u₁ * rp[2] + rp[3] + rp[4]
1094-
end
1095-
end
1096-
function register_pressure(ls::LoopSet)
1097-
order, unroll, tile, vec, u₁, u₂ = choose_order(ls)
1098-
register_pressure(ls, u₁, u₂)
1099-
end
1073+
# function register_pressure(ls::LoopSet, u₁, u₂)
1074+
# if u₂ == -1
1075+
# sum(register_pressure, operations(ls))
1076+
# else
1077+
# rp = @view ls.reg_pres[:,1]
1078+
# u₁ * u₂ * rp[1] + u₁ * rp[2] + rp[3] + rp[4]
1079+
# end
1080+
# end
1081+
# function register_pressure(ls::LoopSet)
1082+
# order, unroll, tile, vec, u₁, u₂ = choose_order(ls)
1083+
# register_pressure(ls, u₁, u₂)
1084+
# end
11001085

src/lowering.jl

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -500,16 +500,19 @@ function add_upper_outer_reductions(ls::LoopSet, loopq::Expr, Ulow::Int, Uhigh::
500500
initialize_outer_reductions!(ifq, ls, Ulow, Uhigh, vectorized)
501501
push!(ifq.args, loopq)
502502
reduce_range!(ifq, ls, Ulow, Uhigh)
503+
loopbuffer = Expr(:call, lv(:valmul), VECTORWIDTHSYMBOL, Uhigh)
503504
comparison = if isstaticloop(unrolledloop)
504-
Expr(:call, lv(:scalar_less), length(unrolledloop), Expr(:call, lv(:valmul), VECTORWIDTHSYMBOL, Uhigh))
505-
elseif unrolledloop.starthint == 1
506-
Expr(:call, lv(:scalar_less), unrolledloop.stopsym, Expr(:call, lv(:valmul), VECTORWIDTHSYMBOL, Uhigh))
505+
Expr(:call, lv(:scalar_less), length(unrolledloop), loopbuffer)
507506
elseif unrolledloop.startexact
508-
Expr(:call, lv(:scalar_less), Expr(:call, lv(:vsub), unrolledloop.stopsym, unrolledloop.starthint-1), Expr(:call, lv(:valmul), VECTORWIDTHSYMBOL, Uhigh))
507+
if isone(unrolledloop.starthint)
508+
Expr(:call, lv(:scalar_less), unrolledloop.stopsym, loopbuffer)
509+
else
510+
Expr(:call, lv(:scalar_less), Expr(:call, lv(:vsub), unrolledloop.stopsym, unrolledloop.starthint-1), loopbuffer)
511+
end
509512
elseif unrolledloop.stopexact
510-
Expr(:call, lv(:scalar_less), Expr(:call, lv(:vsub), unrolledloop.stophint+1, unrolledloop.sartsym), Expr(:call, lv(:valmul), VECTORWIDTHSYMBOL, Uhigh))
513+
Expr(:call, lv(:scalar_less), Expr(:call, lv(:vsub), unrolledloop.stophint+1, unrolledloop.sartsym), loopbuffer)
511514
else# both are given by symbols
512-
Expr(:call, lv(:scalar_less), Expr(:call, lv(:vsub), unrolledloop.stopsym, Expr(:call,lv(:vsub),unrolledloop.startsym)), Expr(:call, lv(:valmul), VECTORWIDTHSYMBOL, Uhigh))
515+
Expr(:call, lv(:scalar_less), Expr(:call, lv(:vsub), unrolledloop.stopsym, Expr(:call,lv(:vsub),unrolledloop.startsym, Expr(:call,lv(:Static),1))), loopbuffer)
513516
end
514517
ncomparison = Expr(:call, :!, comparison)
515518
Expr(:if, ncomparison, ifq)
@@ -612,14 +615,14 @@ function definemask(loop::Loop)
612615
maskexpr(lexpr)
613616
end
614617
end
615-
function definemask_for_alignment_cleanup(loop::Loop)
616-
lexpr = if loop.stopexact
617-
Expr(:call, lv(:vsub), loop.stophint + 1, loop.itersym)
618-
else
619-
Expr(:call, lv(:vsub), Expr(:call, lv(:vadd), loop.stopsym, 1), loop.itersymbol)
620-
end
621-
maskexpr(lexpr)
622-
end
618+
# function definemask_for_alignment_cleanup(loop::Loop)
619+
# lexpr = if loop.stopexact
620+
# Expr(:call, lv(:vsub), loop.stophint + 1, loop.itersym)
621+
# else
622+
# Expr(:call, lv(:vsub), Expr(:call, lv(:vadd), loop.stopsym, 1), loop.itersymbol)
623+
# end
624+
# maskexpr(lexpr)
625+
# end
623626
function define_eltype_vec_width!(q::Expr, ls::LoopSet, vectorized)
624627
push!(q.args, Expr(:(=), ELTYPESYMBOL, determine_eltype(ls)))
625628
push!(q.args, Expr(:(=), VECTORWIDTHSYMBOL, determine_width(ls, vectorized)))
@@ -700,7 +703,7 @@ function lower(ls::LoopSet, u₁::Int, u₂::Int, inline::Int)
700703
lower(ls, order, u₁loop, u₂loop, vectorized, u₁, u₂, doinline)
701704
end
702705

703-
Base.convert(::Type{Expr}, ls::LoopSet) = lower(ls)
706+
# Base.convert(::Type{Expr}, ls::LoopSet) = lower(ls)
704707
Base.show(io::IO, ls::LoopSet) = println(io, lower(ls))
705708

706709

src/memory_ops_common.jl

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ function add_vptr!(ls::LoopSet, array::Symbol, vptrarray::Symbol, actualarray::B
4747
nothing
4848
end
4949

50-
@inline valsum() = Val{0}()
50+
# @inline valsum() = Val{0}()
5151
@inline valsum(::Val{M}) where {M} = Val{M}()
5252
@generated valsum(::Val{M}, ::Val{N}) where {M,N} = Val{M+N}()
5353
@inline valsum(::Val{M}, ::Val{N}, ::Val{K}, args...) where {M,N,K} = valsum(valsum(Val{M}(), Val{N}()), Val{K}(), args...)
@@ -69,15 +69,14 @@ function subset_vptr!(ls::LoopSet, vptr::Symbol, indnum::Int, ind, previndices,
6969
offset = first(previndices) === DISCONTIGUOUS
7070
valcall = Expr(:call, lv(:valsum), valcall)
7171
for i ∈ 1:indnum-1
72-
if loopindex[i]
73-
append_loop_valdims!(valcall, getloop(ls, previndices[i+offset]))
72+
loopdep = if loopindex[i]
73+
previndices[i+offset]
7474
else
7575
# assumes all valdims will be of equal length once expanded...
7676
# A[I + J, constindex], I and J may be CartesianIndices. This requires they all be of same number of dims
77-
let loopdep = first(loopdependencies(ls.opdict[previndices[i+offset]]))
78-
append_loop_valdims!(valcall, getloop(ls, loopdep))
79-
end
77+
first(loopdependencies(ls.opdict[previndices[i+offset]]))
8078
end
79+
append_loop_valdims!(valcall, getloop(ls, loopdep))
8180
end
8281
end
8382
indm1 = ind isa Integer ? ind - 1 : Expr(:call, :-, ind, 1)

src/operations.jl

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,7 @@ struct ArrayReference
2020
end
2121
ArrayReference(array, indices) = ArrayReference(array, indices, zeros(Int8, length(indices)))
2222
function sameref(x::ArrayReference, y::ArrayReference)
23-
x.array === y.array || return false
24-
xinds = x.indices
25-
yinds = y.indices
26-
nrefs = length(xinds)
27-
nrefs == length(yinds) || return false
28-
for n ∈ 1:nrefs
29-
xinds[n] === yinds[n] || return false
30-
end
31-
true
23+
(x.array === y.array) && (x.indices == y.indices)
3224
end
3325
function Base.isequal(x::ArrayReference, y::ArrayReference)
3426
sameref(x, y) || return false

test/broadcast.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@
8080
D1 = C .+ A * B;
8181
D2 = @avx C .+ A .*ˡ B;
8282
@test D1 ≈ D2
83-
fill!(D2, -999999); D2 = @avx C .+ At' .*ˡ B;
83+
fill!(D2, -999999); D2 = @avx C .+ At' *ˡ B;
8484
@test D1 ≈ D2
8585
fill!(D2, -999999); @test A * B ≈ (@avx @. D2 = A *ˡ B)
8686
D1 .= view(C, 1, :)' .+ A * B;

test/mapreduce.jl

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,16 @@
3636
end;
3737
@test vreduce(+, x7) ≈ sum(x7)
3838
@test vreduce(+, x) ≈ sum(x)
39+
if T === Int32
40+
@test vreduce(*, x7) == (prod(x7) % Int32)
41+
@test vreduce(*, x) == (prod(x) % Int32)
42+
else
43+
@test vreduce(*, x7) ≈ prod(x7)
44+
@test vreduce(*, x) ≈ prod(x)
45+
end
3946
@test vmapreduce(abs2, max, x) ≈ mapreduce(abs2, max, x)
4047
@test vmapreduce(abs2, min, x) ≈ mapreduce(abs2, min, x)
48+
@test vmapreduce(sqrt, *, x) ≈ mapreduce(sqrt, *, x)
4149
@test_throws AssertionError vmapreduce(hypot, +, x7, x)
4250
if VERSION ≥ v"1.4"
4351
@test vmapreduce(a -> 2a, *, x) ≈ mapreduce(a -> 2a, *, x)

test/miscellaneous.jl

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -722,6 +722,20 @@ function findreducedparentfornonvecstore!(U::AbstractMatrix{T}, E1::AbstractVect
722722
U,E1
723723
end
724724

725+
726+
function powcseliteral!(x)
727+
@avx for i ∈ eachindex(x)
728+
x[i] = 3^4
729+
end
730+
x
731+
end
732+
function powcsesymbol!(x, a = 3)
733+
@avx for i ∈ eachindex(x)
734+
x[i] = a^4
735+
end
736+
x
737+
end
738+
725739
@inline ninereturns(x) = (0.25x, 0.5x, 0.75, 1.0x, 1.25x, 1.5x, 1.75x, 2.0x, 2.25x)
726740
function manyreturntest(x)
727741
s = zero(eltype(x))
@@ -954,7 +968,10 @@ end
954968
U3, E3 = findreducedparentfornonvecstoreavx!(copy(U0), copy(E0));
955969
findreducedparentfornonvecstore!(U0, E0);
956970
@test U0 ≈ U3
957-
@test E0 ≈ E3
971+
@test E0 ≈ E3
972+
973+
@test all(isequal(81), powcseliteral!(E0))
974+
@test all(isequal(81), powcsesymbol!(E3))
958975
end
959976
for T ∈ [Int16, Int32, Int64]
960977
n = 8sizeof(T) - 1

0 commit comments

Comments
 (0)