Couple non-AVX2 fixes/tweaks

chriselrod · chriselrod · commit 0db912b4f3c3 · 2021-03-26T02:58:37.000-04:00
diff --git a/src/modeling/determinestrategy.jl b/src/modeling/determinestrategy.jl
@@ -1,4 +1,5 @@
 
+
 # function indexappearences(op::Operation, s::Symbol)
 #     s ∉ loopdependencies(op) && return 0
 #     appearences = 0
@@ -95,14 +96,15 @@ function cost(ls::LoopSet, op::Operation, (u₁,u₂)::Tuple{Symbol,Symbol}, vlo
                     shifter = 2
                     offset = 0.5reg_size(ls) / cache_lnsze(ls)
                 end
-                if !rejectcurly(op) && (((contigind === CONSTANTZEROINDEX) && ((length(indices) > 1) && (indices[2] === u₁) || (indices[2] === u₂))) ||
-                    ((u₁ === contigind) | (u₂ === contigind)))
+                if shifter > 1 &&
+                    (!rejectcurly(op) && (((contigind === CONSTANTZEROINDEX) && ((length(indices) > 1) && (indices[2] === u₁) || (indices[2] === u₂))) ||
+                    ((u₁ === contigind) | (u₂ === contigind))))
 
                     shifter -= 1
                     offset = 0.5reg_size(ls) / cache_lnsze(ls)
                 end
                 r = 1 << shifter
-                srt *= r + offset
+                srt = srt*r + offset
                 sl *= r
             elseif isload(op) & (length(loopdependencies(op)) > 1)# vmov(a/u)pd
                 # penalize vectorized loads with more than 1 loopdep
diff --git a/src/simdfunctionals/filter.jl b/src/simdfunctionals/filter.jl
@@ -7,14 +7,15 @@ function vfilter!(f::F, x::Vector{T}, y::AbstractArray{T}) where {F,T <: NativeT
     j = 0
     st = VectorizationBase.static_sizeof(T)
     zero_index = MM(W, Static(0), st)
+    incr = W * VectorizationBase.static_sizeof(T)
     GC.@preserve x y begin
         # ptr_x = llvmptr(x); ptr_y = llvmptr(y)
         ptr_x = pointer(x); ptr_y = pointer(y)
         for _ ∈ 1:Nrep
             vy = VectorizationBase.__vload(ptr_y, zero_index, False(), register_size())
             mask = f(vy)
             VectorizationBase.compressstore!(gep(ptr_x, VectorizationBase.lazymul(st, j)), vy, mask)
-            ptr_y = gep(ptr_y, register_size())
+            ptr_y = gep(ptr_y, incr)
             j = vadd_fast(j, count_ones(mask))
         end
         rem_mask = VectorizationBase.mask(T, Nrem)
diff --git a/test/ifelsemasks.jl b/test/ifelsemasks.jl
@@ -561,10 +561,15 @@ T = Float32
     a = rand(-10:10, 43);
     bit = a .> 0.5; bool = copyto!(Vector{Bool}(undef, length(bit)), bit);
     t = Bernoulli_logit(bit, a);
-    @test isapprox(t, Bernoulli_logitavx(bit, a), atol = Int === Int32 ? 0.1 : 0)
-    @test isapprox(t, Bernoulli_logit_avx(bit, a), atol = Int === Int32 ? 0.1 : 0)
-    @test isapprox(t, Bernoulli_logitavx(bool, a), atol = Int === Int32 ? 0.1 : 0)
-    @test isapprox(t, Bernoulli_logit_avx(bool, a), atol = Int === Int32 ? 0.1 : 0)
+    @test isapprox(t, Bernoulli_logitavx(bit, a), atol = ifelse(Int === Int32, 0.1, 0.0))
+    if VectorizationBase.pick_vector_width(eltype(a)) ≥ 4
+        # `@_avx` isn't really expected to work with bit arrays unless AVX512 is available,
+        # but it happens to work with AVX2 in this case, so we may as well keep testing it.
+        # The `VectorizationBase.pick_vector_width(eltype(a)) ≥ 4` check rules out non-AVX2 CPUs.
+        @test isapprox(t, Bernoulli_logit_avx(bit, a), atol = ifelse(Int === Int32, 0.1, 0.0))
+    end
+    @test isapprox(t, Bernoulli_logitavx(bool, a), atol = ifelse(Int === Int32, 0.1, 0.0))
+    @test isapprox(t, Bernoulli_logit_avx(bool, a), atol = ifelse(Int === Int32, 0.1, 0.0))
     a = rand(43);
     bit = a .> 0.5; bool = copyto!(Vector{Bool}(undef, length(bit)), bit);
     t = Bernoulli_logit(bit, a);