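Some progress towards 1.6: emit `:inv` instead of `:vinv`, build the `pointerforcomparison` call incrementally in `pointermax`, accept `AbstractStridedPointer` in `add_mref!`, and extend the gemv and ifelse-mask tests.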

chriselrod · chriselrod · commit 937fffe3b0b5 · 2020-11-28T09:00:33.000-05:00
diff --git a/src/add_compute.jl b/src/add_compute.jl
@@ -328,9 +328,9 @@ function add_pow!(
         return add_compute!(ls, var, :^, [xop, pop], elementbytes)
     end
     if pint == -1
-        return add_compute!(ls, var, :vinv, [xop], elementbytes)
+        return add_compute!(ls, var, :inv, [xop], elementbytes)
     elseif pint < 0
-        xop = add_compute!(ls, gensym(:inverse), :vinv, [xop], elementbytes)
+        xop = add_compute!(ls, gensym(:inverse), :inv, [xop], elementbytes)
         pint = - pint
     end
     if pint == 0
diff --git a/src/costs.jl b/src/costs.jl
@@ -183,7 +183,7 @@ const COST = Dict{Symbol,InstructionCost}(
     # Instruction(:ifelse) => InstructionCost(1, 0.5),
     :ifelse => InstructionCost(1, 0.5),
     :inv => InstructionCost(13,4.0,-2.0,1),
-    :vinv => InstructionCost(13,4.0,-2.0,1),
+    # :vinv => InstructionCost(13,4.0,-2.0,1),
     :muladd => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
     :fma => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
     # :vmuladd => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
diff --git a/src/loopstartstopmanager.jl b/src/loopstartstopmanager.jl
@@ -200,6 +200,7 @@ function pointermax(ls::LoopSet, ar::ArrayReferenceMeta, n::Int, sub::Int, isvec
     loopsym = names(ls)[n]
     index = Expr(:tuple)
     found_loop_sym = false
+    call = Expr(:call, lv(:pointerforcomparison))
     for i ∈ getindicesonly(ar)
         if i === loopsym
             found_loop_sym = true
@@ -214,14 +215,15 @@ function pointermax(ls::LoopSet, ar::ArrayReferenceMeta, n::Int, sub::Int, isvec
             else
                 push!(index.args, staticexpr(stophint - sub))
             end
-            ptr = vptr(ar)
+            push!(call.args, vptr(ar))
             # return 
         else
             push!(index.args, Expr(:call, lv(:Zero)))
         end
     end
     @assert found_loop_sym "Failed to find $loopsym"
-    Expr(:call, lv(:pointerforcomparison), ptr, index)
+    push!(call.args, index)
+    call
     # @show ar, loopsym
 end
 function pointermax(ls::LoopSet, ar::ArrayReferenceMeta, n::Int, sub::Int, isvectorized::Bool, stopsym)::Expr
diff --git a/src/reconstruct_loopset.jl b/src/reconstruct_loopset.jl
@@ -128,7 +128,7 @@ function pushvarg!(ls::LoopSet, ar::ArrayReferenceMeta, i, name)
 end
 function add_mref!(
     ls::LoopSet, ar::ArrayReferenceMeta, i::Int, @nospecialize(_::Type{S}), name
-) where {T, N, C, B, R, X, O, S <: StridedPointer{T,N,C,B,R,X,O}}
+) where {T, N, C, B, R, X, O, S <: AbstractStridedPointer{T,N,C,B,R,X,O}}
     @assert B ≤ 0 "Batched arrays not supported yet."
     sp = ArrayInterface.rank_to_sortperm(R)
     # maybe no change needed? -- optimize common case
@@ -401,7 +401,7 @@ function sizeofeltypes(v, num_arrays)::Int
         Ttemp = typeeltype(v[i])
         if !VectorizationBase.SIMD_NATIVE_INTEGERS && Ttemp <: Integer # hack
             return VectorizationBase.REGISTER_SIZE
-        end 
+        end
         T = promote_type(T, Ttemp)
     end
     sizeof(T)
diff --git a/test/gemv.jl b/test/gemv.jl
@@ -233,16 +233,21 @@ using Test
         mygemvavx_range!(y2, A, x)
         @test y1full ≈ y2full
 
-        Abit = A .> 0.5;
-        fill!(y2, -9999); mygemv_avx!(y2, Abit, x);
-        @test y2 ≈ Abit * x
-        fill!(y2, -9999); mygemvavx!(y2, Abit, x);
-        @test y2 ≈ Abit * x
-        xbit = x .> 0.5;
-        fill!(y2, -9999); mygemv_avx!(y2, A, xbit);
-        @test y2 ≈ A * xbit
-        fill!(y2, -9999); mygemvavx!(y2, A, xbit);
-        @test y2 ≈ A * xbit
+        let M = 56
+            A = view(Afull, M .+ (1:M), K .+ (1:K)); A .= rand.(Ref(R));
+            y1 = view(y1full, M .+ (1:M));
+            y2 = view(y2full, M .+ (1:M));
+            Abit = A .> 0.5;
+            fill!(y2, -9999); mygemv_avx!(y2, Abit, x);
+            @test y2 ≈ Abit * x
+            fill!(y2, -9999); mygemvavx!(y2, Abit, x);
+            @test y2 ≈ Abit * x
+            xbit = x .> 0.5;
+            fill!(y2, -9999); mygemv_avx!(y2, A, xbit);
+            @test y2 ≈ A * xbit
+            fill!(y2, -9999); mygemvavx!(y2, A, xbit);
+            @test y2 ≈ A * xbit
+        end
 
         # Check for out of bounds stores
         fill!(y1, 0); fill!(y2, 0); @test y1full ≈ y2full
diff --git a/test/ifelsemasks.jl b/test/ifelsemasks.jl
@@ -37,7 +37,7 @@ T = Float32
         z
     end
 
-    function Bernoulli_logit(y::BitVector, α::AbstractVector{T}) where {T}
+    function Bernoulli_logit(y, α::AbstractVector{T}) where {T}
         t = zero(promote_type(Float64,T))
         @inbounds for i ∈ eachindex(α)
             invOmP = 1 + exp(α[i])
@@ -47,7 +47,7 @@ T = Float32
         end
         t
     end
-    function Bernoulli_logitavx(y::BitVector, α::AbstractVector{T}) where {T}
+    function Bernoulli_logitavx(y, α::AbstractVector{T}) where {T}
         t = zero(T === Int32 ? Float32 : Float64)
         @avx for i ∈ eachindex(α)
             invOmP = 1 + exp(α[i])
@@ -57,7 +57,7 @@ T = Float32
         end
         t
     end
-    function Bernoulli_logit_avx(y::BitVector, α::AbstractVector{T}) where {T}
+    function Bernoulli_logit_avx(y, α::AbstractVector{T}) where {T}
         t = zero(T === Int32 ? Float32 : Float64)
         @_avx for i ∈ eachindex(α)
             invOmP = 1 + exp(α[i])
@@ -492,15 +492,19 @@ T = Float32
     
     
     a = rand(-10:10, 43);
-    bit = a .> 0.5;
+    bit = a .> 0.5; bool = copyto!(Vector{Bool}(undef, length(bit)), bit);
     t = Bernoulli_logit(bit, a);
     @test isapprox(t, Bernoulli_logitavx(bit, a), atol = Int === Int32 ? 0.1 : 0)
     @test isapprox(t, Bernoulli_logit_avx(bit, a), atol = Int === Int32 ? 0.1 : 0)
+    @test isapprox(t, Bernoulli_logitavx(bool, a), atol = Int === Int32 ? 0.1 : 0)
+    @test isapprox(t, Bernoulli_logit_avx(bool, a), atol = Int === Int32 ? 0.1 : 0)
     a = rand(43);
-    bit = a .> 0.5;
+    bit = a .> 0.5; bool = copyto!(Vector{Bool}(undef, length(bit)), bit);
     t = Bernoulli_logit(bit, a);
     @test t ≈ Bernoulli_logitavx(bit, a)
     @test t ≈ Bernoulli_logit_avx(bit, a)
+    @test t ≈ Bernoulli_logitavx(bool, a)
+    @test t ≈ Bernoulli_logit_avx(bool, a)
 
     ai = [rand(Bool) for _ in 1:71];
     bi = [rand(Bool) for _ in 1:71];