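ifelse reduction changes: collapse unrolled `ifelse` reductions via `ifelse_reduction(:IfElseCollapser, op)` in `reduce_expr!`, and relax the unroll-solver register threshold from ≥ 32 to ≥ 31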

chriselrod · chriselrod · commit b81bbbf24b08 · 2021-09-04T17:03:25.000-04:00
diff --git a/src/codegen/lower_store.jl b/src/codegen/lower_store.jl
@@ -50,8 +50,15 @@ function reduce_expr!(q::Expr, toreduct::Symbol, op::Operation, u₁::Int, u₂:
   end
   if (u₁ == 1) | (~isu₁unrolled)
     push!(q.args, Expr(:(=), Symbol(toreduct, "##onevec##"), _toreduct))
-  else
+  elseif instruction(op).instr ≢ :ifelse
     push!(q.args, Expr(:(=), Symbol(toreduct, "##onevec##"), Expr(:call, reduction_to_single_vector(op), _toreduct)))
+  else
+    fifelse = let u₁=u₁
+      ifelse_reduction(:IfElseCollapser,op) do opv
+        Symbol(mangledvar(opv), '_', u₁), tuple()
+      end
+    end
+    push!(q.args, Expr(:(=), Symbol(toreduct, "##onevec##"), Expr(:call, fifelse, _toreduct, staticexpr(1))))
   end
   nothing
 end
diff --git a/src/modeling/determinestrategy.jl b/src/modeling/determinestrategy.jl
@@ -428,7 +428,7 @@ function solve_unroll_iter(X, R, u₁L, u₂L, u₁range, u₂range)
     u₁best, u₂best, bestcost
 end
 
-function solve_unroll_lagrange(X, R, u₁L, u₂L, u₁step::Int, u₂step::Int, atleast32registers::Bool)
+function solve_unroll_lagrange(X, R, u₁L, u₂L, u₁step::Int, u₂step::Int, atleast31registers::Bool)
     X₁, X₂, X₃, X₄ = X[1], X[2], X[3], X[4]
     # If we don't have opmask registers, masks probably occupy a vector register (e.g., on CPUs with AVX but not AVX512)
     R₁, R₂, R₃, R₄ = R[1], R[2], R[3], R[4]
@@ -443,8 +443,8 @@ function solve_unroll_lagrange(X, R, u₁L, u₂L, u₁step::Int, u₂step::Int,
     u₂float = (RR - u₁float*R₂)/(u₁float*R₁)
     if !(isfinite(u₂float) & isfinite(u₁float)) # brute force
         u₁low = u₂low = 1
-        u₁high = iszero(X₂) ? 2 : (atleast32registers ? 8 : 6)
-        u₂high = iszero(X₃) ? 2 : (atleast32registers ? 8 : 6)
+        u₁high = iszero(X₂) ? 2 : (atleast31registers ? 8 : 6)
+        u₂high = iszero(X₃) ? 2 : (atleast31registers ? 8 : 6)
         return solve_unroll_iter(X, R, u₁L, u₂L, u₁low:u₁step:u₁high, u₂low:u₂step:u₂high)
     end
     u₁low = floor(Int, u₁float)
@@ -457,7 +457,7 @@ function solve_unroll_lagrange(X, R, u₁L, u₂L, u₁step::Int, u₂step::Int,
     if u₂low ≥ u₂high
         u₂low = solve_unroll_constU(R, u₁high)
     end
-    maxunroll = atleast32registers ? (((X₂ > 0) & (X₃ > 0)) ? 10 : 8) : 6
+    maxunroll = atleast31registers ? (((X₂ > 0) & (X₃ > 0)) ? 10 : 8) : 6
     u₁low = (clamp(u₁low, 1, maxunroll) ÷ u₁step) * u₁step
     u₂low = (clamp(u₂low, 1, maxunroll) ÷ u₂step) * u₂step
     u₁high = clamp(u₁high, 1, maxunroll)
@@ -482,9 +482,9 @@ end
 #     floor(Int, (dynamic_register_count() - R[3] - R[4] - u₂*R[5]) / (u₂ * R[1] + R[2]))
 # end
 # Tiling here is about alleviating register pressure for the UxT
-function solve_unroll(X, R, u₁max, u₂max, u₁L, u₂L, u₁step, u₂step, atleast32registers::Bool)
+function solve_unroll(X, R, u₁max, u₂max, u₁L, u₂L, u₁step, u₂step, atleast31registers::Bool)
     # iszero(first(R)) && return -1,-1,Inf #solve_smalltilesize(X, R, u₁max, u₂max)
-    u₁, u₂, cost = solve_unroll_lagrange(X, R, u₁L, u₂L, u₁step, u₂step, atleast32registers)
+    u₁, u₂, cost = solve_unroll_lagrange(X, R, u₁L, u₂L, u₁step, u₂step, atleast31registers)
     # u₂ -= u₂ & 1
     # u₁ = min(u₁, u₂)
     u₁_too_large = u₁ > u₁max
@@ -539,7 +539,7 @@ function solve_unroll(
     u₁loop = getloop(ls, u₁loopsym)
     u₂loop = getloop(ls, u₂loopsym)
     solve_unroll(
-        u₁loopsym, u₂loopsym, cost_vec, reg_pressure, W, vloopsym, u₁loop, u₂loop, u₁step, u₂step, reg_count(ls) ≥ 32
+        u₁loopsym, u₂loopsym, cost_vec, reg_pressure, W, vloopsym, u₁loop, u₂loop, u₁step, u₂step, reg_count(ls) ≥ 31
     )
 end
 
@@ -550,9 +550,9 @@ function solve_unroll(
     W::Int, vloopsym::Symbol,
     u₁loop::Loop, u₂loop::Loop,
     u₁step::Int, u₂step::Int,
-    atleast32registers::Bool
+    atleast31registers::Bool
 )
-    maxu₂base = maxu₁base = atleast32registers ? 10 : 6#8
+    maxu₂base = maxu₁base = atleast31registers ? 10 : 6#8
     maxu₂ = maxu₂base#8
     maxu₁ = maxu₁base#8
     u₁L = length(u₁loop)
@@ -593,7 +593,7 @@ function solve_unroll(
     else
         u₂Lf = Float64(u₂L)
     end
-    u₁, u₂, cost = solve_unroll(cost_vec, reg_pressure, maxu₁, maxu₂, u₁Lf, u₂Lf, u₁step, u₂step, atleast32registers)
+    u₁, u₂, cost = solve_unroll(cost_vec, reg_pressure, maxu₁, maxu₂, u₁Lf, u₂Lf, u₁step, u₂step, atleast31registers)
     # heuristic to more evenly divide small numbers of iterations
     if isstaticloop(u₂loop)
         u₂ = maybedemotesize(u₂, length(u₂loop), u₁, u₁loop, maxu₂base)
diff --git a/test/gemm.jl b/test/gemm.jl
@@ -360,7 +360,7 @@
         if LoopVectorization.cache_linesize() == LoopVectorization.register_size()
             @test LoopVectorization.choose_order(lsr2amb) == ([:n, :m, :k], :m, :n, :m, 3, 7)
         else
-            @test LoopVectorization.choose_order(lsr2amb) == ([:m, :n, :k], :n, :m, :m, 5, 4)
+            @test LoopVectorization.choose_order(lsr2amb) == ([:m, :n, :k], :m, :n, :m, 3, 7)
         end
     elseif LoopVectorization.register_count() == 16
         # @test LoopVectorization.choose_order(lsr2amb) == ([:m, :n, :k], :m, :n, :m, 1, 6)
@@ -637,7 +637,7 @@
         @test LoopVectorization.choose_order(lsAtmulBt8) == ([:n, :m, :k], :m, :n, :m, 1, 8)
         # @test LoopVectorization.choose_order(lsAtmulBt8) == ([:n, :m, :k], :k, :n, :m, 1, 8)
       elseif LoopVectorization.register_size() == 16
-        @test LoopVectorization.choose_order(lsAtmulBt8) == ([:n, :m, :k], :m, :n, :m, 4, 4)
+        @test LoopVectorization.choose_order(lsAtmulBt8) == ([:n, :m, :k], :m, :n, :m, 2, 8)
       end            
     elseif LoopVectorization.register_count() == 16
       @test LoopVectorization.choose_order(lsAtmulBt8) == ([:n, :m, :k], :m, :n, :m, 2, 4)
diff --git a/test/miscellaneous.jl b/test/miscellaneous.jl
@@ -9,11 +9,12 @@ using Test
               s += x[m] * A[m,n] * y[n]
               end);
     lsdot3 = LoopVectorization.loopset(dot3q);
-    if LoopVectorization.register_count() == 32
-        # @test LoopVectorization.choose_order(lsdot3) == ([:n, :m], :m, :n, :m, Unum, Tnum)#&-2
+    if LoopVectorization.register_count() ≠ 32
+      @test LoopVectorization.choose_order(lsdot3) == ([:n, :m], :n, :m, :m, 2, 6)
+    elseif Bool(LoopVectorization.has_opmask_registers())
       @test LoopVectorization.choose_order(lsdot3) == ([:n, :m], :n, Symbol("##undefined##"), :m, 4, -1)
     else
-      @test LoopVectorization.choose_order(lsdot3) == ([:n, :m], :n, :m, :m, 2, 6)
+      @test LoopVectorization.choose_order(lsdot3) == ([:n, :m], :n, :m, :m, 2, 8)
     end
 
     @static if VERSION < v"1.4"
@@ -71,7 +72,7 @@ using Test
     lssubcol = LoopVectorization.loopset(subcolq);
     # @test LoopVectorization.choose_order(lssubcol) == (Symbol[:i,:j], :i, Symbol("##undefined##"), :j, 1, -1)
     # @test LoopVectorization.choose_order(lssubcol) == (Symbol[:i,:j], :j, :i, :j, 1, 8)
-    @test LoopVectorization.choose_order(lssubcol) == (Symbol[:i,:j], :j, :i, :j, 1, ifelse(LoopVectorization.register_count() == 32, 8, 6))
+    @test LoopVectorization.choose_order(lssubcol) == (Symbol[:i,:j], :j, :i, :j, 1, ifelse((LoopVectorization.register_count() == 32), 8, 6))
 
     # if LoopVectorization.register_count() != 8
     #     # @test LoopVectorization.choose_order(lssubcol) == (Symbol[:j,:i], :j, :i, :j, Unum, Tnum)