Try to improve test coverage slightly.

chriselrod · chriselrod · commit 30a9ddf7877f · 2020-07-16T13:06:17.000-04:00
diff --git a/src/add_compute.jl b/src/add_compute.jl
@@ -44,9 +44,9 @@ function pushparent!(parents::Vector{Operation}, deps::Vector{Symbol}, reducedde
     push!(parents, parent)
     update_deps!(deps, reduceddeps, parent)
 end
-function pushparent!(mpref::ArrayReferenceMetaPosition, parent::Operation)
-    pushparent!(mpref.parents, mpref.loopdependencies, mpref.reduceddeps, parent)
-end
+# function pushparent!(mpref::ArrayReferenceMetaPosition, parent::Operation)
+#     pushparent!(mpref.parents, mpref.loopdependencies, mpref.reduceddeps, parent)
+# end
 function add_parent!(
     vparents::Vector{Operation}, deps::Vector{Symbol}, reduceddeps::Vector{Symbol}, ls::LoopSet, var, elementbytes::Int, position::Int
 )
diff --git a/src/broadcast.jl b/src/broadcast.jl
@@ -131,7 +131,7 @@ Base.@propagate_inbounds Base.getindex(A::LowDimArray, i...) = getindex(A.data,
     s = Expr(:call, smul, T, multup)
     f = D[1] ? :PackedStridedPointer : :SparseStridedPointer
     Expr(:block, Expr(:meta,:inline), Expr(:(=), :strideA, Expr(:call, :strides, Expr(:(.), :A, QuoteNode(:data)))),
-         Expr(:call, Expr(:(.), :VectorizationBase, QuoteNode(f)), Expr(:call, :pointer, Expr(:(.), :A, QuoteNode(:data))), s))
+         Expr(:call, Expr(:(.), :VectorizationBase, QuoteNode(f)), Expr(:call, :pointer, :A), s))
 end
 function LowDimArray{D}(data::A) where {D,T,N,A <: AbstractArray{T,N}}
     LowDimArray{D,T,N,A}(data)
diff --git a/src/condense_loopset.jl b/src/condense_loopset.jl
@@ -81,7 +81,7 @@ function findmatchingarray(ls::LoopSet, mref::ArrayReferenceMeta)
     end
     0x00
 end
-filled_4byte_chunks(u::UInt64) = 16 - (leading_zeros(u) >>> 2)
+# filled_4byte_chunks(u::UInt64) = 16 - (leading_zeros(u) >>> 2)
 filled_8byte_chunks(u::UInt64) = 8 - (leading_zeros(u) >>> 3)
 
 # num_loop_deps(os::OperationStruct) = filled_4byte_chunks(os.loopdeps)
diff --git a/src/determinestrategy.jl b/src/determinestrategy.jl
@@ -1,19 +1,19 @@
 
-function indexappearences(op::Operation, s::Symbol)
-    s ∉ loopdependencies(op) && return 0
-    appearences = 0
-    if isloopvalue(op)
-        return s === first(loopdependencies(op)) ? 1 : 0
-    elseif isload(op)
-        return 100
-    end
-    newapp = 0
-    for opp ∈ parents(op)
-        newapp += indexappearences(opp, s)
-    end
-    factor = instruction(op).instr ∈ (:+, :vadd, :add_fast, :evadd) ? 1 : 10
-    newapp * factor
-end
+# function indexappearences(op::Operation, s::Symbol)
+#     s ∉ loopdependencies(op) && return 0
+#     appearences = 0
+#     if isloopvalue(op)
+#         return s === first(loopdependencies(op)) ? 1 : 0
+#     elseif isload(op)
+#         return 100
+#     end
+#     newapp = 0
+#     for opp ∈ parents(op)
+#         newapp += indexappearences(opp, s)
+#     end
+#     factor = instruction(op).instr ∈ (:+, :vadd, :add_fast, :evadd) ? 1 : 10
+#     newapp * factor
+# end
 function findparent(ls::LoopSet, s::Symbol)#opdict isn't filled when reconstructing
     id = findfirst(op -> name(op) === s, operations(ls))
     id === nothing && throw("$s not found")
@@ -42,13 +42,13 @@ function unitstride(ls::LoopSet, op::Operation, s::Symbol)
     true
 end
 
-function register_pressure(op::Operation)
-    if isconstant(op) || isloopvalue(op)
-        0
-    else
-        instruction_cost(instruction(op)).register_pressure
-    end
-end
+# function register_pressure(op::Operation)
+#     if isconstant(op) || isloopvalue(op)
+#         0
+#     else
+#         instruction_cost(instruction(op)).register_pressure
+#     end
+# end
 function cost(ls::LoopSet, op::Operation, vectorized::Symbol, Wshift::Int, size_T::Int = op.elementbytes)
     isconstant(op) && return 0.0, 0, Float64(length(loopdependencies(op)) > 0)
     isloopvalue(op) && return 0.0, 0, 0.0
@@ -82,7 +82,7 @@ function cost(ls::LoopSet, op::Operation, vectorized::Symbol, Wshift::Int, size_
                 #       this feature is common to all of them.
                 srt += 0.5VectorizationBase.REGISTER_SIZE / VectorizationBase.CACHELINE_SIZE
             end
-        elseif instr === :setindex! # broadcast or reductionstore; if store we want to penalize reduction
+        elseif isstore(op) # broadcast or reductionstore; if store we want to penalize reduction
             srt *= 3
             sl *= 3
         end
@@ -95,12 +95,12 @@ end
 function biggest_type_size(ls::LoopSet)
     maximum(elsize, operations(ls))
 end
-function VectorizationBase.pick_vector_width(ls::LoopSet, u::Symbol)
-    VectorizationBase.pick_vector_width(length(ls, u), biggest_type_size(ls))
-end
-function VectorizationBase.pick_vector_width_shift(ls::LoopSet, u::Symbol)
-    VectorizationBase.pick_vector_width_shift(length(ls, u), biggest_type_size(ls))
-end
+# function VectorizationBase.pick_vector_width(ls::LoopSet, u::Symbol)
+#     VectorizationBase.pick_vector_width(length(ls, u), biggest_type_size(ls))
+# end
+# function VectorizationBase.pick_vector_width_shift(ls::LoopSet, u::Symbol)
+#     VectorizationBase.pick_vector_width_shift(length(ls, u), biggest_type_size(ls))
+# end
 function hasintersection(a, b)
     for aᵢ ∈ a, bᵢ ∈ b
         aᵢ === bᵢ && return true
@@ -208,9 +208,7 @@ function unroll_no_reductions(ls, order, vectorized)
     W, Wshift = lsvecwidthshift(ls, vectorized, size_T)
     # W, Wshift = VectorizationBase.pick_vector_width_shift(length(ls, vectorized), size_T)::Tuple{Int,Int}
 
-    compute_rt = 0.0
-    load_rt = 0.0
-    store_rt = 0.0
+    compute_rt = load_rt = store_rt = 0.0
     unrolled = last(order)
     if unrolled === vectorized && length(order) > 1
         unrolled = order[end-1]
@@ -399,12 +397,12 @@ function solve_unroll_constT(R::AbstractVector, u₂::Int)
     iszero(denom) && return 8
     floor(Int, (REGISTER_COUNT - R[3] - R[4] - u₂*R[5]) / denom)
 end
-function solve_unroll_constT(ls::LoopSet, u₂::Int)
-    R = @view ls.reg_pres[:,1]
-    denom = u₂ * R[1] + R[2]
-    iszero(denom) && return 8
-    floor(Int, (REGISTER_COUNT - R[3] - R[4] - u₂*R[5]) / (u₂ * R[1] + R[2]))
-end
+# function solve_unroll_constT(ls::LoopSet, u₂::Int)
+#     R = @view ls.reg_pres[:,1]
+#     denom = u₂ * R[1] + R[2]
+#     iszero(denom) && return 8
+#     floor(Int, (REGISTER_COUNT - R[3] - R[4] - u₂*R[5]) / (u₂ * R[1] + R[2]))
+# end
 # Tiling here is about alleviating register pressure for the UxT
 function solve_unroll(X, R, u₁max, u₂max, u₁L, u₂L, u₁step, u₂step)
     # iszero(first(R)) && return -1,-1,Inf #solve_smalltilesize(X, R, u₁max, u₂max)
@@ -414,11 +412,10 @@ function solve_unroll(X, R, u₁max, u₂max, u₁L, u₂L, u₁step, u₂step)
     u₁_too_large = u₁ > u₁max
     u₂_too_large = u₂ > u₂max
     if u₁_too_large
+        u₁ = u₁max
         if u₂_too_large
-            u₁ = u₁max
             u₂ = u₂max
         else # u₁ too large, resolve u₂
-            u₁ = u₁max
             u₂ = min(u₂max, max(1,solve_unroll_constU(R, u₁)))
         end
         cost = unroll_cost(X, u₁, u₂, u₁L, u₂L)
@@ -609,10 +606,6 @@ function maxnegativeoffset(ls::LoopSet, op::Operation, u::Symbol)
         # opploopi = oppmref.loopedindex
         mnonew = typemin(Int)
         for i ∈ eachindex(opinds)
-            if opinds[i] !== oppinds[i]
-                mnonew = 1
-                break
-            end
             if opinds[i] === u
                 mnonew = (opoffs[i] - oppoffs[i])
             elseif opoffs[i] != oppoffs[i]
@@ -727,20 +720,12 @@ function add_constant_offset_load_elmination_cost!(
         # we treat this as the unrolled loop getting eliminated is split into 2 parts:
         # 1 a non-cost-reduced part, with factor udependent_reduction
         # 2 a cost-reduced part, with factor uindependent_increase
-        if uid == 1 # u₁reduces was false
+        (r, i) = if uid == 1 # u₁reduces was false
             @assert !u₁reduces
-            if u₂reduces
-                r, i = 4, 2
-            else
-                r, i = 3, 1
-            end
+            u₂reduces ? (4, 2) : (3, 1)
         elseif uid == 2 # u₂reduces was false
             @assert !u₂reduces
-            if u₁reduces
-                r, i = 4, 3
-            else
-                r, i = 2, 1
-            end
+            u₁reduces ? (4, 3) : (2, 1)
         else
             throw("uid somehow did not return 1 or 2, even though offset > -4.")
         end
@@ -1085,16 +1070,16 @@ function choose_order(ls::LoopSet)
     order, unroll, tile, vec, u₁, u₂
 end
 
-function register_pressure(ls::LoopSet, u₁, u₂)
-    if u₂ == -1
-        sum(register_pressure, operations(ls))
-    else
-        rp = @view ls.reg_pres[:,1]
-        u₁ * u₂ * rp[1] + u₁ * rp[2] + rp[3] + rp[4]
-    end
-end
-function register_pressure(ls::LoopSet)
-    order, unroll, tile, vec, u₁, u₂ = choose_order(ls)
-    register_pressure(ls, u₁, u₂)
-end
+# function register_pressure(ls::LoopSet, u₁, u₂)
+#     if u₂ == -1
+#         sum(register_pressure, operations(ls))
+#     else
+#         rp = @view ls.reg_pres[:,1]
+#         u₁ * u₂ * rp[1] + u₁ * rp[2] + rp[3] + rp[4]
+#     end
+# end
+# function register_pressure(ls::LoopSet)
+#     order, unroll, tile, vec, u₁, u₂ = choose_order(ls)
+#     register_pressure(ls, u₁, u₂)
+# end
 
diff --git a/src/lowering.jl b/src/lowering.jl
@@ -500,16 +500,19 @@ function add_upper_outer_reductions(ls::LoopSet, loopq::Expr, Ulow::Int, Uhigh::
     initialize_outer_reductions!(ifq, ls, Ulow, Uhigh, vectorized)
     push!(ifq.args, loopq)
     reduce_range!(ifq, ls, Ulow, Uhigh)
+    loopbuffer = Expr(:call, lv(:valmul), VECTORWIDTHSYMBOL, Uhigh)
     comparison = if isstaticloop(unrolledloop)
-        Expr(:call, lv(:scalar_less), length(unrolledloop), Expr(:call, lv(:valmul), VECTORWIDTHSYMBOL, Uhigh))
-    elseif unrolledloop.starthint == 1
-        Expr(:call, lv(:scalar_less), unrolledloop.stopsym, Expr(:call, lv(:valmul), VECTORWIDTHSYMBOL, Uhigh))
+        Expr(:call, lv(:scalar_less), length(unrolledloop), loopbuffer)
     elseif unrolledloop.startexact
-        Expr(:call, lv(:scalar_less), Expr(:call, lv(:vsub), unrolledloop.stopsym, unrolledloop.starthint-1), Expr(:call, lv(:valmul), VECTORWIDTHSYMBOL, Uhigh))
+        if isone(unrolledloop.starthint)
+            Expr(:call, lv(:scalar_less), unrolledloop.stopsym, loopbuffer)
+        else
+            Expr(:call, lv(:scalar_less), Expr(:call, lv(:vsub), unrolledloop.stopsym, unrolledloop.starthint-1), loopbuffer)
+        end
     elseif unrolledloop.stopexact
-        Expr(:call, lv(:scalar_less), Expr(:call, lv(:vsub), unrolledloop.stophint+1, unrolledloop.sartsym), Expr(:call, lv(:valmul), VECTORWIDTHSYMBOL, Uhigh))
+        Expr(:call, lv(:scalar_less), Expr(:call, lv(:vsub), unrolledloop.stophint+1, unrolledloop.startsym), loopbuffer) # exact stop, symbolic start
     else# both are given by symbols
-        Expr(:call, lv(:scalar_less), Expr(:call, lv(:vsub), unrolledloop.stopsym, Expr(:call,lv(:vsub),unrolledloop.startsym)), Expr(:call, lv(:valmul), VECTORWIDTHSYMBOL, Uhigh))
+        Expr(:call, lv(:scalar_less), Expr(:call, lv(:vsub), unrolledloop.stopsym, Expr(:call,lv(:vsub),unrolledloop.startsym, Expr(:call,lv(:Static),1))), loopbuffer)
     end
     ncomparison = Expr(:call, :!, comparison)
     Expr(:if, ncomparison, ifq)
@@ -612,14 +615,14 @@ function definemask(loop::Loop)
         maskexpr(lexpr)
     end
 end
-function definemask_for_alignment_cleanup(loop::Loop)
-    lexpr = if loop.stopexact
-        Expr(:call, lv(:vsub), loop.stophint + 1, loop.itersym)
-    else
-        Expr(:call, lv(:vsub), Expr(:call, lv(:vadd), loop.stopsym, 1), loop.itersymbol)
-    end
-    maskexpr(lexpr)
-end
+# function definemask_for_alignment_cleanup(loop::Loop)
+#     lexpr = if loop.stopexact
+#         Expr(:call, lv(:vsub), loop.stophint + 1, loop.itersym)
+#     else
+#         Expr(:call, lv(:vsub), Expr(:call, lv(:vadd), loop.stopsym, 1), loop.itersymbol)
+#     end
+#     maskexpr(lexpr)
+# end
 function define_eltype_vec_width!(q::Expr, ls::LoopSet, vectorized)
     push!(q.args, Expr(:(=), ELTYPESYMBOL, determine_eltype(ls)))
     push!(q.args, Expr(:(=), VECTORWIDTHSYMBOL, determine_width(ls, vectorized)))
@@ -700,7 +703,7 @@ function lower(ls::LoopSet, u₁::Int, u₂::Int, inline::Int)
     lower(ls, order, u₁loop, u₂loop, vectorized, u₁, u₂, doinline)
 end
 
-Base.convert(::Type{Expr}, ls::LoopSet) = lower(ls)
+# Base.convert(::Type{Expr}, ls::LoopSet) = lower(ls)
 Base.show(io::IO, ls::LoopSet) = println(io, lower(ls))
 
 
diff --git a/src/memory_ops_common.jl b/src/memory_ops_common.jl
@@ -47,7 +47,7 @@ function add_vptr!(ls::LoopSet, array::Symbol, vptrarray::Symbol, actualarray::B
     nothing
 end
 
-@inline valsum() = Val{0}()
+# @inline valsum() = Val{0}()
 @inline valsum(::Val{M}) where {M} = Val{M}()
 @generated valsum(::Val{M}, ::Val{N}) where {M,N} = Val{M+N}()
 @inline valsum(::Val{M}, ::Val{N}, ::Val{K}, args...) where {M,N,K} = valsum(valsum(Val{M}(), Val{N}()), Val{K}(), args...)
@@ -69,15 +69,14 @@ function subset_vptr!(ls::LoopSet, vptr::Symbol, indnum::Int, ind, previndices,
         offset = first(previndices) === DISCONTIGUOUS
         valcall = Expr(:call, lv(:valsum), valcall)
         for i ∈ 1:indnum-1
-            if loopindex[i]
-                append_loop_valdims!(valcall, getloop(ls, previndices[i+offset]))
+            loopdep = if loopindex[i]
+                previndices[i+offset]
             else
                 # assumes all valdims will be of equal length once expanded...
                 # A[I + J, constindex], I and J may be CartesianIndices. This requires they all be of same number of dims
-                let loopdep = first(loopdependencies(ls.opdict[previndices[i+offset]]))
-                    append_loop_valdims!(valcall, getloop(ls, loopdep))
-                end
+                first(loopdependencies(ls.opdict[previndices[i+offset]]))
             end
+            append_loop_valdims!(valcall, getloop(ls, loopdep))
         end
     end
     indm1 = ind isa Integer ? ind - 1 : Expr(:call, :-, ind, 1)
diff --git a/src/operations.jl b/src/operations.jl
@@ -20,15 +20,7 @@ struct ArrayReference
 end
 ArrayReference(array, indices) = ArrayReference(array, indices, zeros(Int8, length(indices)))
 function sameref(x::ArrayReference, y::ArrayReference)
-    x.array === y.array || return false
-    xinds = x.indices
-    yinds = y.indices
-    nrefs = length(xinds)
-    nrefs == length(yinds) || return false
-    for n ∈ 1:nrefs
-        xinds[n] === yinds[n] || return false
-    end
-    true
+    (x.array === y.array) && (x.indices == y.indices)
 end
 function Base.isequal(x::ArrayReference, y::ArrayReference)
     sameref(x, y) || return false
diff --git a/test/broadcast.jl b/test/broadcast.jl
@@ -80,7 +80,7 @@
         D1 = C .+ A * B;
         D2 = @avx C .+ A .*ˡ B;
         @test D1 ≈ D2
-        fill!(D2, -999999); D2 = @avx C .+ At' .*ˡ B;
+        fill!(D2, -999999); D2 = @avx C .+ At' *ˡ B;
         @test D1 ≈ D2
         fill!(D2, -999999); @test A * B ≈ (@avx @. D2 = A *ˡ B)
         D1 .= view(C, 1, :)' .+ A * B;
diff --git a/test/mapreduce.jl b/test/mapreduce.jl
@@ -36,8 +36,16 @@
         end;
         @test vreduce(+, x7) ≈ sum(x7)
         @test vreduce(+, x) ≈ sum(x)
+        if T === Int32
+            @test vreduce(*, x7) == (prod(x7) % Int32)
+            @test vreduce(*, x) == (prod(x) % Int32)
+        else
+            @test vreduce(*, x7) ≈ prod(x7)
+            @test vreduce(*, x) ≈ prod(x)
+        end
         @test vmapreduce(abs2, max, x) ≈ mapreduce(abs2, max, x)
         @test vmapreduce(abs2, min, x) ≈ mapreduce(abs2, min, x)
+        @test vmapreduce(sqrt, *, x) ≈ mapreduce(sqrt, *, x)
         @test_throws AssertionError vmapreduce(hypot, +, x7, x)
         if VERSION ≥ v"1.4"
             @test vmapreduce(a -> 2a, *, x) ≈ mapreduce(a -> 2a, *, x)
diff --git a/test/miscellaneous.jl b/test/miscellaneous.jl
@@ -722,6 +722,20 @@ function findreducedparentfornonvecstore!(U::AbstractMatrix{T}, E1::AbstractVect
     U,E1
 end
 
+
+function powcseliteral!(x)
+    @avx for i ∈ eachindex(x)
+        x[i] = 3^4
+    end
+    x
+end
+function powcsesymbol!(x, a = 3)
+    @avx for i ∈ eachindex(x)
+        x[i] = a^4
+    end
+    x
+end
+
 @inline ninereturns(x) = (0.25x, 0.5x, 0.75, 1.0x, 1.25x, 1.5x, 1.75x, 2.0x, 2.25x)
 function manyreturntest(x)
     s = zero(eltype(x))
@@ -954,7 +968,10 @@ end
         U3, E3 = findreducedparentfornonvecstoreavx!(copy(U0), copy(E0));
         findreducedparentfornonvecstore!(U0, E0);
         @test U0 ≈ U3
-        @test E0 ≈ E3        
+        @test E0 ≈ E3
+
+        @test all(isequal(81), powcseliteral!(E0))
+        @test all(isequal(81), powcsesymbol!(E3))
     end
     for T ∈ [Int16, Int32, Int64]
         n = 8sizeof(T) - 1
diff --git a/test/offsetarrays.jl b/test/offsetarrays.jl
@@ -110,7 +110,7 @@ using LoopVectorization.VectorizationBase: StaticUnitRange
         # Manually unpack the OffsetArray
         @avx for j in rng2, i in rng1
             tmp_0 = zero(eltype(out))
-            Base.Cartesian.@nexprs 3 jk -> Base.Cartesian.@nexprs 3 ik -> tmp_{ik+(jk-1)*3} = A[i+(ik-2),j+(jk-2)] * kern_ik_jk + tmp_{ik+(jk-1)*3-1}
+            Base.Cartesian.@nexprs 3 jk -> Base.Cartesian.@nexprs 3 ik -> tmp_{ik+(jk-1)*3} = A[(ik-2)+i,(jk-2)+j] * kern_ik_jk + tmp_{ik+(jk-1)*3-1}
             out[i,j] = tmp_9
         end
         out