Tweak dependency-chain unrolling factors; make reductions respect whether the initial value is an integer or a float.

chriselrod · chriselrod · commit 5c398b932b2c · 2021-05-16T03:27:15.000-04:00
diff --git a/src/codegen/lowering.jl b/src/codegen/lowering.jl
@@ -423,11 +423,25 @@ function pointerremcomparison(ls::LoopSet, termind::Int, UFt::Int, n::Int, nisve
   end
 end
 
-
+@generated function of_same_size(::Type{T}, ::Type{S}) where {T,S} # picks a type in T's category (float / signed / unsigned-or-Bool) sized according to S; as a generated function, the value computed below becomes the method body
+  sizeof_S = sizeof(S) # size of S in bytes
+  sizeof(T) == sizeof_S && return T # sizes already agree: keep T unchanged
+  Tfloat = T <: Union{Float32,Float64} # NOTE(review): Tfloat is never read below; the next line repeats the same test
+  if T <: Union{Float32,Float64}
+    sizeof_S ≥ 8 ? Float64 : Float32 # only Float64/Float32 are produced, so an S narrower than 4 bytes still yields Float32
+  elseif T <: Signed
+    Symbol(:Int, 8sizeof_S) # builds a name from the bit width, e.g. :Int32 when sizeof(S) == 4
+  elseif (T <: Unsigned) | (T === Bool) # Bool is grouped with the unsigned integers
+    Symbol(:UInt, 8sizeof_S) # e.g. :UInt64 when sizeof(S) == 8
+  else
+    S # any other T: fall back to S itself
+  end
+end
 function outer_reduction_zero(op::Operation, u₁u::Bool, Umax::Int, reduct_class::Float64, rs::Expr)
   reduct_zero = reduction_zero(reduct_class)
   # Tsym = outer_reduct_init_typename(op)
-  Tsym = ELTYPESYMBOL
+  # Tsym = ELTYPESYMBOL
+  Tsym = Expr(:call, lv(:of_same_size), outer_reduct_init_typename(op), ELTYPESYMBOL)
   if isvectorized(op)
     if Umax == 1 || !u₁u
       if reduct_zero === :zero
diff --git a/src/modeling/determinestrategy.jl b/src/modeling/determinestrategy.jl
@@ -70,8 +70,8 @@ function cost(ls::LoopSet, op::Operation, (u₁,u₂)::Tuple{Symbol,Symbol}, vlo
             return 0.0, 0, 0.0
         end
     elseif iscompute(op) &&
-        Base.sym_in(instruction(op).instr, (:vadd_nsw, :vsub_nsw, :(+), :(-), :add_fast, :sub_fast)) &&
-        all(opp -> (isloopvalue(opp)), parents(op))
+        (Base.sym_in(instruction(op).instr, (:vadd_nsw, :vsub_nsw, :(+), :(-), :add_fast, :sub_fast)) &&
+        all(opp -> (isloopvalue(opp)), parents(op)))# || (reg_count(ls) == 32) && (instruction(op).instr === :ifelse))
         # all(opp -> (isloopvalue(opp) | isconstant(opp)), parents(op))
         return 0.0, 0, 0.0
     end
@@ -202,14 +202,14 @@ function depchain_cost!(
     skip[identifier(op)] = true
     # depth first search
     for opp ∈ parents(op)
-        skip[identifier(opp)] && continue
+      skip[identifier(opp)] && continue
         rt, sl = depchain_cost!(ls, skip, opp, unrolled, vloopsym, Wshift, size_T, rt, sl)
     end
     # Basically assuming memory and compute don't conflict, but everything else does
     # Ie, ignoring the fact that integer and floating point operations likely don't either
     if iscompute(op)
-        rtᵢ, slᵢ = cost(ls, op, (unrolled,Symbol("")), vloopsym, Wshift, size_T)
-        rt += rtᵢ; sl += slᵢ
+      rtᵢ, slᵢ = cost(ls, op, (unrolled,Symbol("")), vloopsym, Wshift, size_T)
+      rt += rtᵢ; sl += slᵢ
     end
     rt, sl
 end
@@ -357,11 +357,11 @@ function determine_unroll_factor(ls::LoopSet, order::Vector{Symbol}, vloopsym::S
         else
             return determine_unroll_factor(ls, order, vloopsym, num_reductions)
         end
-    elseif iszero(num_reductions)
+    elseif iszero(num_reductions) # handle `BitArray` loops w/out reductions
         return 8 ÷ ls.vector_width, vloopsym
-    else
+    else # handle `BitArray` loops with reductions
         rttemp, ltemp = determine_unroll_factor(ls, order, vloopsym, vloopsym)
-        UF = min(8, VectorizationBase.nextpow2(max(1, round(Int, ltemp / (rttemp * num_reductions) ) )))
+        UF = min(8, VectorizationBase.nextpow2(max(1, round(Int, ltemp / (rttemp) ) )))
         UFfactor = 8 ÷ ls.vector_width
         cld(UF, UFfactor)*UFfactor, vloopsym
     end
@@ -383,9 +383,11 @@ function determine_unroll_factor(ls::LoopSet, order::Vector{Symbol}, vloopsym::S
         end
     end
     # min(8, roundpow2(max(1, round(Int, latency / (rt * num_reductions) ) ))), best_unrolled
-    UF = VectorizationBase.nextpow2(round(Int, clamp(latency / (rt * num_reductions), 1.0, 8.0)))
-    if UF == 1 && num_reductions > 1
-        UF = VectorizationBase.nextpow2(round(Int, clamp(latency / (rt * cld(num_reductions, 2)), 1.0, 8.0)))
+    lrtratio  = latency / rt
+    if lrtratio ≥ 7.0
+        UF = 8
+    else
+        UF = VectorizationBase.nextpow2(round(Int, clamp(lrtratio, 1.0, 4.0)))
     end
     if best_unrolled === vloopsym
         UF = demote_unroll_factor(ls, UF, vloopsym)
diff --git a/test/dot.jl b/test/dot.jl
@@ -291,11 +291,11 @@ using Test
         @test dot33(a,b) ≈ @view(a[1:33])' * @view(b[1:33])
 
         if T <: Union{Float32,Float64}
-            πest = T(mcpi(a, b))
-            @test πest == mcpiavx(a, b)
-            @test πest == mcpiavx_u4(a, b)
-            @test πest == mcpi_avx(a, b)
-            @test πest == mcpi_avx_u4(a, b)
+            πest = mcpi(a, b)
+            @test πest ≈ mcpiavx(a, b)
+            @test πest ≈ mcpiavx_u4(a, b)
+            @test πest ≈ mcpi_avx(a, b)
+            @test πest ≈ mcpi_avx_u4(a, b)
         end
 
         if !(!Bool(LoopVectorization.VectorizationBase.has_feature(Val(:x86_64_avx2))) && T === Int32)