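Series of updates to better support a few more complicated loops.

Summary of the changes in this patch:
* `add_load!` now forwards loads that have no loop dependencies to a new `add_constant!` method, which emits the load in the preamble.
* `pushparent!` no longer merges reduced dependencies from parents whose instruction is `reduced_add`, `reduced_prod`, `reduce_to_add`, or `reduce_to_prod`; `add_reduction_update_parent!` no longer merges into the reduction initializer's reduced dependencies.
* The `accesses_memory` assertion in `mem_offset` is commented out.
* `lower_nest` and `lower_unrolled` build the loop-counter initialization through `startloop`.
* Tests: `toy4!` is enabled, the expected `choose_order` result for the gemv loop set changes, and the `mvp`/`mvpavx` comparison goes from `@test_broken` to `@test`.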

chriselrod · chriselrod · commit bb7569843c6b · 2020-01-16T00:20:42.000-05:00
diff --git a/src/graphs.jl b/src/graphs.jl
@@ -428,6 +428,7 @@ end
 function add_load!(
     ls::LoopSet, var::Symbol, mpref::ArrayReferenceMetaPosition, elementbytes::Int = 8
 )
+    length(mpref.loopdependencies) == 0 && return add_constant!(ls, var, mpref, elementbytes)
     ref = mpref.mref.ref
     # try to CSE
     id = findfirst(r -> r == ref, ls.refs_aliasing_syms)
@@ -518,6 +519,12 @@ function add_constant!(ls::LoopSet, var, elementbytes::Int = 8)
     pushpreamble!(ls, Expr(:(=), mangledvar(op), var))
     pushop!(ls, op, sym)
 end
+function add_constant!(ls::LoopSet, var::Symbol, mpref::ArrayReferenceMetaPosition, elementbytes::Int)
+    op = Operation(length(operations(ls)), var, elementbytes, LOOPCONSTANT, constant, NODEPENDENCY, Symbol[], NOPARENTS, mpref.mref)
+    add_vptr!(ls, op)
+    pushpreamble!(ls, Expr(:(=), mangledvar(op), Expr(:call, lv(:load), mpref.mref.ptr, mem_offset(op, TileDescription(zero(Int32), Symbol(""), Symbol(""), nothing)))))
+    pushop!(ls, op, var)
+end
 # This version has loop dependencies. var gets assigned to sym when lowering.
 function add_constant!(ls::LoopSet, var::Symbol, deps::Vector{Symbol}, sym::Symbol = gensym(:constant), f::Symbol = Symbol(""), elementbytes::Int = 8)
     # length(deps) == 0 && push!(ls.preamble.args, Expr(:(=), sym, var))
@@ -533,7 +540,7 @@ end
 function pushparent!(parents::Vector{Operation}, deps::Vector{Symbol}, reduceddeps::Vector{Symbol}, parent::Operation)
     push!(parents, parent)
     mergesetdiffv!(deps, loopdependencies(parent), reduceddependencies(parent))
-    if !(isload(parent) || isconstant(parent))
+    if !(isload(parent) || isconstant(parent)) && parent.instruction.instr ∉ (:reduced_add, :reduced_prod, :reduce_to_add, :reduce_to_prod)
         mergesetv!(reduceddeps, reduceddependencies(parent))
     end
     nothing
@@ -585,8 +592,12 @@ function add_reduction_update_parent!(
         reductcombine = Symbol("")
     end
     # mergesetv!(reduceddeps, deps)
-    setdiffv!(reduceddeps, deps, loopdependencies(reductinit))
-    mergesetv!(reduceddependencies(reductinit), reduceddeps)
+    if length(reduceddependencies(reductinit)) == 0
+        setdiffv!(reduceddeps, deps, loopdependencies(reductinit))
+    else
+        setdiffv!(reduceddeps, deps, loopdependencies(reductinit))
+    end
+    # mergesetv!(reduceddependencies(reductinit), reduceddeps)
     pushparent!(parents, deps, reduceddeps, reductinit)#parent) # deps and reduced deps will not be disjoint
     op = Operation(length(operations(ls)), reductsym, elementbytes, instr, compute, deps, reduceddeps, parents)
     parent.instruction === LOOPCONSTANT && push!(ls.outer_reductions, identifier(op))
diff --git a/src/lowering.jl b/src/lowering.jl
@@ -30,7 +30,7 @@ function symbolind(ind::Symbol, op::Operation, td::TileDescription)
     Expr(:call, :-, pvar, one(Int32))
 end
 function mem_offset(op::Operation, td::TileDescription)
-    @assert accesses_memory(op) "Computing memory offset only makes sense for operations that access memory."
+    # @assert accesses_memory(op) "Computing memory offset only makes sense for operations that access memory."
     ret = Expr(:tuple)
     indices = getindices(op)
     loopedindex = op.ref.loopedindex
@@ -146,7 +146,7 @@ function lower_load_scalar!(
     for u ∈ zero(Int32):Base.unsafe_trunc(Int32,U-1)
         varname = varassignname(var, u, isunrolled)
         td = TileDescription(u, unrolled, tiled, suffix)
-        push!(q.args, Expr(:(=), varname, Expr(:call, lv(:load),  ptr, mem_offset_u(op, td))))
+        push!(q.args, Expr(:(=), varname, Expr(:call, lv(:load), ptr, mem_offset_u(op, td))))
     end
     nothing
 end
@@ -512,11 +512,7 @@ function lower_nest(
     blockq = Expr(:block)
     if n > 1
         looptoadd = order[n-1]
-        if looptoadd === vectorized
-            push!(blockq.args, Expr(:(=), looptoadd, Expr(:call, lv(:_MM), W, loopstart)))
-        else
-            push!(blockq.args, Expr(:(=), looptoadd, loopstart))
-        end
+        push!(blockq.args, startloop(ls.loops[looptoadd], looptoadd === vectorized, W, looptoadd))
     end
     loopq = if exprtype === :block
         blockq
@@ -875,11 +871,7 @@ function lower_unrolled(ls::LoopSet, vectorized::Symbol, U::Int)
     W = ls.W
     typeT = ls.T
     setup_Wmask!(ls, W, typeT, vectorized, unrolled, last(order), U)
-    initunrolledcounter = if unrolled === vectorized
-        Expr(:(=), unrolled, Expr(:call, lv(:_MM), W, 0))
-    else
-        Expr(:(=), unrolled, 0)
-    end
+    initunrolledcounter = startloop(ls.loops[unrolled], unrolled === vectorized, W, unrolled)
     q = lower_unrolled!(Expr(:block, initunrolledcounter), ls, vectorized, U, -1, W, typeT, ls.loops[unrolled])
     lsexpr(ls, q)
 end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -40,7 +40,7 @@ using LinearAlgebra
     @test logsumexp!(r, x) ≈ 102.35216846104409
 
     @testset "GEMM" begin
-        using LoopVectorization, Test
+        using LoopVectorization, Test; T = Float64
         Unum, Tnum = LoopVectorization.VectorizationBase.REGISTER_COUNT == 16 ? (3, 4) : (4, 4)
         AmulBq1 = :(for m ∈ 1:size(A,1), n ∈ 1:size(B,2)
                     C[m,n] = zeroB
@@ -209,7 +209,7 @@ using LinearAlgebra
 	    C12 += A[k,m] * B[k,n1] 
 	    C22 += A[k,m1] * B[k,n1]
 	    end)
-lsmul2x2q = LoopVectorization.LoopSet(mul2x2q)
+        # lsmul2x2q = LoopVectorization.LoopSet(mul2x2q)
 
         function toy1!(G, B,κ)
             d = size(G,1)
@@ -220,15 +220,6 @@ lsmul2x2q = LoopVectorization.LoopSet(mul2x2q)
                 end
             end
         end
-        # function toy4!(G, B,κ)
-            # d = size(G,1)
-            # @avx for d1=1:d
-                # G[d1,κ] = B[1,d1]*B[1,κ]
-                # for d2=2:d
-                    # G[d1,κ] += B[d2,d1]*B[d2,κ]
-                # end
-            # end
-        # end
         # tq1 = :(for d1=1:d
                 # G[d1,κ] = B[1,d1]*B[1,κ]
                 # for d2=2:d
@@ -271,6 +262,15 @@ lsmul2x2q = LoopVectorization.LoopSet(mul2x2q)
                 G[d1,κ] = z
                 end);
         lst3 = LoopVectorization.LoopSet(tq3)
+        function toy4!(G, B,κ)
+            d = size(G,1)
+            @avx for d1=1:d
+                G[d1,κ] = B[1,d1]*B[1,κ]
+                for d2=2:d
+                    G[d1,κ] += B[d2,d1]*B[d2,κ]
+                end
+            end
+        end
 
         for T ∈ (Float32, Float64, Int32, Int64)
             @show T, @__LINE__
@@ -312,6 +312,8 @@ lsmul2x2q = LoopVectorization.LoopSet(mul2x2q)
             @test G1 ≈ G2
             fill!(G2, TC(NaN)), toy3!(G2,B,1);
             @test G1 ≈ G2
+            fill!(G2, TC(NaN)), toy4!(G2,B,1);
+            @test G1 ≈ G2
             # fill!(G2, TC(NaN)), toy4!(G2,B,1);
             # @test G1 ≈ G2
         end
@@ -448,7 +450,7 @@ lsmul2x2q = LoopVectorization.LoopSet(mul2x2q)
                   y[i] = yᵢ
                   end)
         lsgemv = LoopVectorization.LoopSet(gemvq);
-        @test LoopVectorization.choose_order(lsgemv) == (Symbol[:i, :j], :i, 8, -1)
+        @test LoopVectorization.choose_order(lsgemv) == (Symbol[:i, :j], :i, 4, -1)
 
         function mygemv!(y, A, x)
             @inbounds for i ∈ eachindex(y)
@@ -645,7 +647,9 @@ lsmul2x2q = LoopVectorization.LoopSet(mul2x2q)
         basis = rand(r, (dim, nbasis));
         coeffs = rand(T, nbasis);
         P = rand(T, dim, maxdeg+1);
-        @test_broken mvp(P, basis, coeffs) ≈ mvpavx(P, basis, coeffs)
+        mvp(P, basis, coeffs)
+        mvpavx(P, basis, coeffs)
+        @test mvp(P, basis, coeffs) ≈ mvpavx(P, basis, coeffs)
     end
 end