Add support for reducing through ? and ifelse

chriselrod · chriselrod · commit 8f0931ec2599 · 2020-12-25T15:25:15.000-05:00
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "LoopVectorization"
 uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
 authors = ["Chris Elrod <elrodc@gmail.com>"]
-version = "0.9.11"
+version = "0.9.12"
 
 [deps]
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
diff --git a/src/add_compute.jl b/src/add_compute.jl
@@ -162,7 +162,13 @@ function add_reduction_update_parent!(
 )
     var = name(parent)
     isouterreduction = parent.instruction === LOOPCONSTANT
-    instrclass = reduction_instruction_class(instr) # key allows for faster lookups
+    # @show instr, vparents, parent, reduction_ind
+    if instr.instr === :ifelse
+        @assert length(vparents) == 2
+        instrclass = reduction_instruction_class(instruction(vparents[2])) # key allows for faster lookups
+    else
+        instrclass = reduction_instruction_class(instr) # key allows for faster lookups
+    end
     reduct_zero = reduction_zero(instrclass)
     # if parent is not an outer reduction...
     # if !isouterreduction && !isreductzero(parent, ls, reduct_zero)
@@ -295,14 +301,48 @@ function add_compute!(
 end
 
 function add_compute!(
-    ls::LoopSet, LHS::Symbol, instr, vparents::Vector{Operation}, elementbytes
+    ls::LoopSet, LHS::Symbol, instr, vparents::Vector{Operation}, elementbytes::Int
 )
     deps = Symbol[]
     reduceddeps = Symbol[]
-    foreach(parent -> update_deps!(deps, reduceddeps, parent), vparents)
+    for parent ∈ vparents # visit every parent operation of the new compute op
+        update_deps!(deps, reduceddeps, parent) # presumably merges the parent's loop dependencies into deps/reduceddeps — confirm
+    end
     op = Operation(length(operations(ls)), LHS, elementbytes, instr, compute, deps, reduceddeps, vparents)
     pushop!(ls, op, LHS)
 end
+# Adds `LHS = ifelse(cond, iftrue, iffalse)`; when one branch is itself named `LHS`, the op is registered as a reduction update instead.
+function add_compute_ifelse!(
+    ls::LoopSet, LHS::Symbol, cond::Operation, iftrue::Operation, iffalse::Operation, elementbytes::Int
+)
+    deps = Symbol[]
+    reduceddeps = Symbol[]
+    update_deps!(deps, reduceddeps, cond) # presumably merges each operand's loop dependencies into deps/reduceddeps — confirm
+    update_deps!(deps, reduceddeps, iftrue)
+    update_deps!(deps, reduceddeps, iffalse)
+    if name(iftrue) === LHS # the true branch is the existing value of LHS
+        if name(iffalse) === LHS # `a = ifelse(cond, a, a)` is a no-op: push nothing and hand back the existing op
+            return iftrue
+        end
+        vparents = Operation[cond, iffalse] # iftrue is left out here and passed separately as the parent being updated
+        setdiffv!(reduceddeps, deps, loopdependencies(iftrue))
+        add_reduction_update_parent!(
+            vparents, deps, reduceddeps, ls,
+            iftrue, Instruction(:LoopVectorization,:ifelse), 2, elementbytes # NOTE(review): 2 looks like iftrue's argument position in ifelse — confirm
+        )
+    elseif name(iffalse) === LHS # the false branch is the existing value of LHS
+        vparents = Operation[cond, iftrue] # iffalse is left out here and passed separately as the parent being updated
+        setdiffv!(reduceddeps, deps, loopdependencies(iffalse))
+        add_reduction_update_parent!(
+            vparents, deps, reduceddeps, ls,
+            iffalse, Instruction(:LoopVectorization,:ifelse), 3, elementbytes # NOTE(review): 3 looks like iffalse's argument position in ifelse — confirm
+        )
+    else # neither branch is named LHS: push an ordinary three-parent :ifelse compute op
+        vparents = Operation[cond, iftrue, iffalse]
+        op = Operation(length(operations(ls)), LHS, elementbytes, :ifelse, compute, deps, reduceddeps, vparents)
+        pushop!(ls, op, LHS)
+    end
+end
 
 # adds x ^ (p::Real)
 function add_pow!(
diff --git a/src/add_ifelse.jl b/src/add_ifelse.jl
@@ -35,13 +35,13 @@ function add_if!(ls::LoopSet, LHS::Symbol, RHS::Expr, elementbytes::Int, positio
     else
         falseop = getop(ls, iffalse, elementbytes)
     end
-    add_compute!(ls, LHS, :ifelse, [condop, trueop, falseop], elementbytes)
+    add_compute_ifelse!(ls, LHS, condop, trueop, falseop, elementbytes)
 end
 
 function add_andblock!(ls::LoopSet, condop::Operation, LHS, rhsop::Operation, elementbytes::Int, position::Int)
     if LHS isa Symbol
         altop = getop(ls, LHS, elementbytes)
-        return add_compute!(ls, LHS, :ifelse, [condop, rhsop, altop], elementbytes)
+        return add_compute_ifelse!(ls, LHS, condop, rhsop, altop, elementbytes)
     elseif LHS isa Expr && LHS.head === :ref
         return add_conditional_store!(ls, LHS, condop, rhsop, elementbytes)
     else
@@ -81,7 +81,7 @@ function add_orblock!(ls::LoopSet, condop::Operation, LHS, rhsop::Operation, ele
         # return add_compute!(ls, LHS, :ifelse, [condop, altop, rhsop], elementbytes)
         # Placing altop second seems to let LLVM fuse operations; but as of LLVM 9.0.1 it will not if altop is first
         # therefore, we negate the condition and switch order so that the altop is second.
-        return add_compute!(ls, LHS, :ifelse, [negatedcondop, rhsop, altop], elementbytes)
+        return add_compute_ifelse!(ls, LHS, negatedcondop, rhsop, altop, elementbytes)
     elseif LHS isa Expr && LHS.head === :ref
         # negatedcondop = add_compute!(ls, gensym(:negated_mask), :~, [condop], elementbytes)
         return add_conditional_store!(ls, LHS, negatedcondop, rhsop, elementbytes)
diff --git a/src/lowering.jl b/src/lowering.jl
@@ -892,13 +892,16 @@ function isunrolled_sym(op::Operation, u₁loop::Symbol, u₂loop::Symbol)
             u₂ild = u₂loop ∈ reducedchildren(op)
         end
     end
+    # @show op u₁ild, u₂ild
     (u₁ild & u₂ild) || return u₁ild, u₂ild
     reductops = isconstant(op) ? reducedchildren(op) : reduceddependencies(op)
+    # @show op reductops
     iszero(length(reductops)) && return true, true
     u₁reduced = u₁loop ∈ reductops
     u₂reduced = u₂loop ∈ reductops
     # We want to only unroll one of them.
     # We prefer not to unroll a reduced loop
+    # @show u₁reduced, u₂reduced
     if u₂reduced # if both are reduced, we unroll u₁
         true, false
     elseif u₁reduced
diff --git a/test/ifelsemasks.jl b/test/ifelsemasks.jl
@@ -360,6 +360,72 @@ T = Float32
             f[j, d] = _x 
         end
     end
+    
+    function barycentric_weight0(X) # plain-loop version without @avx; the @avx variants are tested against its result
+        T = eltype(X)
+        n = length(X) - 1
+        w = zero(X)
+        @inbounds @fastmath for j in 0:n
+            tmp = one(T) # running product, restarted for every j
+            for k in 0:n
+                tmp = k==j ? tmp : tmp * (X[j+1] - X[k+1]) # skip the k == j term; otherwise multiply in the difference
+            end
+            w[j+1] = inv(tmp) # store the reciprocal of the product over k != j
+        end
+        return w
+    end
+    function barycentric_weight1(X) # @avx variant: ternary with the unchanged accumulator in the false branch
+        T = eltype(X)
+        n = length(X) - 1
+        w = zero(X)
+        @avx for j in 0:n
+            tmp = one(T) # running product, restarted for every j
+            for k in 0:n
+                tmp = k != j ? tmp * (X[j+1] - X[k+1]) : tmp # update when k != j, otherwise keep tmp
+            end
+            w[j+1] = inv(tmp) # store the reciprocal of the product over k != j
+        end
+        return w
+    end
+    function barycentric_weight2(X) # @avx inline=true variant: ternary with the unchanged accumulator in the true branch
+        T = eltype(X)
+        n = length(X) - 1
+        w = zero(X)
+        @avx inline=true for j in 0:n
+            tmp = one(T) # running product, restarted for every j
+            for k in 0:n
+                tmp = k==j ? tmp : tmp * (X[j+1] - X[k+1]) # keep tmp when k == j, otherwise update
+            end
+            w[j+1] = inv(tmp) # store the reciprocal of the product over k != j
+        end
+        return w
+    end
+    function barycentric_weight3(X) # @avx inline=true variant: explicit ifelse call with the accumulator as the third argument
+        T = eltype(X)
+        n = length(X) - 1
+        w = zero(X)
+        @avx inline=true for j in 0:n
+            tmp = one(T) # running product, restarted for every j
+            for k in 0:n
+                tmp = ifelse(k != j, tmp * (X[j+1] - X[k+1]), tmp) # update when k != j, otherwise keep tmp
+            end
+            w[j+1] = inv(tmp) # store the reciprocal of the product over k != j
+        end
+        return w
+    end
+    function barycentric_weight4(X) # @avx variant: explicit ifelse call with the accumulator as the second argument
+        T = eltype(X)
+        n = length(X) - 1
+        w = zero(X)
+        @avx for j in 0:n
+            tmp = one(T) # running product, restarted for every j
+            for k in 0:n
+                tmp = ifelse(k == j, tmp, tmp * (X[j+1] - X[k+1])) # keep tmp when k == j, otherwise update
+            end
+            w[j+1] = inv(tmp) # store the reciprocal of the product over k != j
+        end
+        return w
+    end
 
     N = 117
     for T ∈ (Float32, Float64, Int32, Int64)
@@ -529,5 +595,11 @@ T = Float32
     # fc2 = copy(f);
     testfunctionavx!(f, v, d, g, s, θ)    
     @test f ≈ fc
-    
+
+    X = rand(4, 5)
+    bX = barycentric_weight0(X);
+    @test barycentric_weight1(X) ≈ bX
+    @test barycentric_weight2(X) ≈ bX
+    @test barycentric_weight3(X) ≈ bX
+    @test barycentric_weight4(X) ≈ bX
 end