A few tweaks, most importantly use check_args for broadcasts.

chriselrod · chriselrod · commit 753aa3dfe9be · 2020-07-13T01:17:38.000-04:00
diff --git a/src/broadcast.jl b/src/broadcast.jl
@@ -66,9 +66,9 @@ function add_broadcast!(
     K = gensym(:K)
     mA = gensym(:Aₘₖ)
     mB = gensym(:Bₖₙ)
-    pushpreamble!(ls, Expr(:(=), mA, Expr(:(.), bcname, QuoteNode(:a))))
-    pushpreamble!(ls, Expr(:(=), mB, Expr(:(.), bcname, QuoteNode(:b))))
-    pushpreamble!(ls, Expr(:(=), K, Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__,Symbol(@__FILE__)), Expr(:ref, Expr(:call, :size, mB), 1))))
+    pushprepreamble!(ls, Expr(:(=), mA, Expr(:(.), bcname, QuoteNode(:a))))
+    pushprepreamble!(ls, Expr(:(=), mB, Expr(:(.), bcname, QuoteNode(:b))))
+    pushprepreamble!(ls, Expr(:(=), K, Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__,Symbol(@__FILE__)), Expr(:ref, Expr(:call, :size, mB), 1))))
     k = gensym(:k)
     add_loop!(ls, Loop(k, 1, K), k)
     m = loopsyms[1];
@@ -139,7 +139,7 @@ end
 function extract_all_1_array!(ls::LoopSet, bcname::Symbol, N::Int, elementbytes::Int)
     refextract = gensym(bcname)
     ref = Expr(:ref, bcname); append!(ref.args, [1 for n ∈ 1:N])
-    pushpreamble!(ls, Expr(:(=), refextract, ref))
+    pushprepreamble!(ls, Expr(:(=), refextract, ref))
     return add_constant!(ls, refextract, elementbytes) # or replace elementbytes with sizeof(T) ? u
 end
 function add_broadcast!(
@@ -159,7 +159,7 @@ function add_broadcast_adjoint_array!(
     ls::LoopSet, destname::Symbol, bcname::Symbol, loopsyms::Vector{Symbol}, ::Type{A}, elementbytes::Int
 ) where {T,N,A<:AbstractArray{T,N}}
     parent = gensym(:parent)
-    pushpreamble!(ls, Expr(:(=), parent, Expr(:call, :parent, bcname)))
+    pushprepreamble!(ls, Expr(:(=), parent, Expr(:call, :parent, bcname)))
     # isone(length(loopsyms)) && return extract_all_1_array!(ls, bcname, N, elementbytes)
     ref = ArrayReference(parent, Symbol[loopsyms[N + 1 - n] for n ∈ 1:N])
     add_simple_load!( ls, destname, ref, elementbytes, true, true )::Operation
@@ -198,7 +198,7 @@ function add_broadcast!(
     ls::LoopSet, ::Symbol, bcname::Symbol, loopsyms::Vector{Symbol}, ::Type{Base.RefValue{T}}, elementbytes::Int
 ) where {T}
     refextract = gensym(bcname)
-    pushpreamble!(ls, Expr(:(=), refextract, Expr(:ref, bcname)))
+    pushprepreamble!(ls, Expr(:(=), refextract, Expr(:ref, bcname)))
     add_constant!(ls, refextract, elementbytes) # or replace elementbytes with sizeof(T) ? u
 end
 function add_broadcast!(
@@ -210,7 +210,7 @@ function add_broadcast!(
     inds[2:end] .= @view(loopsyms[1:N])
     add_simple_load!(ls, destname, ArrayReference(bcname, inds), elementbytes, true, true)
 end
-BroadcastedArray{S<:Broadcast.AbstractArrayStyle,F,A} = Broadcasted{S,Nothing,F,A}
+const BroadcastedArray{S<:Broadcast.AbstractArrayStyle,F,A} = Broadcasted{S,Nothing,F,A}
 function add_broadcast!(
     ls::LoopSet, destname::Symbol, bcname::Symbol, loopsyms::Vector{Symbol},
     @nospecialize(B::Type{<:BroadcastedArray}),
@@ -219,7 +219,7 @@ function add_broadcast!(
     S,_,F,A = B.parameters
     instr = get(FUNCTIONSYMBOLS, F) do
         f = gensym(:func)
-        pushpreamble!(ls, Expr(:(=), f, Expr(:(.), bcname, QuoteNode(:f))))
+        pushprepreamble!(ls, Expr(:(=), f, Expr(:(.), bcname, QuoteNode(:f))))
         Instruction(bcname, f)
     end
     args = A.parameters
@@ -231,7 +231,7 @@ function add_broadcast!(
     # reduceddeps = Symbol[]
     for (i,arg) ∈ enumerate(args)
         argname = gensym(:arg)
-        pushpreamble!(ls, Expr(:(=), argname, Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__,Symbol(@__FILE__)), Expr(:ref, bcargs, i))))
+        pushprepreamble!(ls, Expr(:(=), argname, Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__,Symbol(@__FILE__)), Expr(:ref, bcargs, i))))
         # dynamic dispatch
         parent = add_broadcast!(ls, gensym(:temp), argname, loopsyms, arg, elementbytes)::Operation
         push!(parents, parent)
@@ -272,8 +272,10 @@ end
     # return ls
     q = lower(ls)
     push!(q.args, :dest)
-    pushfirst!(q.args, Expr(:meta,:inline))
     # @show q
+    # q
+    q = Expr(:block, ls.prepreamble, Expr(:if, check_args_call(ls), q, :(Base.Broadcast.materialize!(dest, bc))))
+    isone(N) && pushfirst!(q.args, Expr(:meta,:inline))
     q
      # ls
 end
@@ -285,7 +287,7 @@ end
     loopsyms = [gensym(:n) for n ∈ 1:N]
     ls = LoopSet(Mod)
     ls.isbroadcast[] = true
-    pushpreamble!(ls, Expr(:(=), :dest, Expr(:call, :parent, :dest′)))
+    pushprepreamble!(ls, Expr(:(=), :dest, Expr(:call, :parent, :dest′)))
     sizes = Expr(:tuple)
     for (n,itersym) ∈ enumerate(loopsyms)
         Nsym = gensym(:N)
@@ -299,7 +301,8 @@ end
     resize!(ls.loop_order, num_loops(ls)) # num_loops may be greater than N, eg Product
     q = lower(ls)
     push!(q.args, :dest′)
-    pushfirst!(q.args, Expr(:meta,:inline))
+    q = Expr(:block, ls.prepreamble, Expr(:if, check_args_call(ls), q, :(Base.Broadcast.materialize!(dest′, bc))))
+    isone(N) && pushfirst!(q.args, Expr(:meta,:inline))
     q
     # ls
 end
@@ -329,4 +332,4 @@ end
 end
 
 vmaterialize!(dest, bc, ::Val{mod}) where {mod} = Base.Broadcast.materialize!(dest, bc)
-    
+
diff --git a/src/costs.jl b/src/costs.jl
@@ -231,6 +231,8 @@ const COST = Dict{Symbol,InstructionCost}(
     :sinpi_fast => InstructionCost(18,15.0,68.0,23),
     :cospi_fast => InstructionCost(18,15.0,68.0,26),
     :sincospi_fast => InstructionCost(25,22.0,70.0,26),
+    :tanh => InstructionCost(40,40.0,40.0,26), # FIXME
+    # :tanh_fast => InstructionCost(25,22.0,70.0,26), # FIXME
     :identity => InstructionCost(0,0.0,0.0,0),
     :adjoint => InstructionCost(0,0.0,0.0,0),
     :conj => InstructionCost(0,0.0,0.0,0),
@@ -442,6 +444,8 @@ const FUNCTIONSYMBOLS = IdDict{Type{<:Function},Instruction}(
     typeof(sincos) => :sincos,
     typeof(Base.FastMath.sincos_fast) => :sincos,
     typeof(SLEEFPirates.sincos) => :sincos,
+    typeof(Base.tanh) => :tanh,
+    # typeof(SLEEFPirates.tanh_fast) => :tanh_fast,
     typeof(max) => :max,
     typeof(min) => :min,
     typeof(<<) => :<<,
diff --git a/src/determinestrategy.jl b/src/determinestrategy.jl
@@ -72,7 +72,15 @@ function cost(ls::LoopSet, op::Operation, vectorized::Symbol, Wshift::Int, size_
                 r = (1 << Wshift)
                 srt *= r
                 sl *= r
-            # else # vmov(a/u)pd
+            elseif isload(op) & length(loopdependencies(op)) > 1# vmov(a/u)pd
+                # penalize vectorized loads with more than 1 loopdep
+                # heuristic; more than 1 loopdep means that many loads will not be aligned
+                # Roughly corresponds to double-counting loads crossing cacheline boundaries
+                # TODO: apparently the new ARM A64FX CPU (with 512 bit vectors) is NOT penalized for unaligned loads
+                #       would be nice to add a check for this CPU, to see if such a penalty is still appropriate.
+                #       Also, once more SVE (scalable vector extension) CPUs are released, would be nice to know if
+                #       this feature is common to all of them.
+                srt += 0.5VectorizationBase.REGISTER_SIZE / VectorizationBase.CACHELINE_SIZE
             end
         elseif instr === :setindex! # broadcast or reductionstore; if store we want to penalize reduction
             srt *= 3
@@ -857,12 +865,14 @@ function evaluate_cost_tile(
             elseif load_elimination_cost_factor!(cost_vec, reg_pressure, choose_to_inline, ls, op, iters[id], unrollsyms, Wshift, size_T)
                 continue
             end
-        elseif isconstant(op)
+        #elseif isconstant(op)
         end
         rt, lat, rp = cost(ls, op, vectorized, Wshift, size_T)
-        if isload(op) && !iszero(prefetchisagoodidea(ls, op, UnrollArgs(4, unrollsyms, 4, 0)))
-            rt += 0.5VectorizationBase.REGISTER_SIZE / VectorizationBase.CACHELINE_SIZE
-            prefetch_good_idea = true
+        if isload(op)
+            if !iszero(prefetchisagoodidea(ls, op, UnrollArgs(4, unrollsyms, 4, 0)))
+                # rt += 0.5VectorizationBase.REGISTER_SIZE / VectorizationBase.CACHELINE_SIZE
+                prefetch_good_idea = true
+            end
         end
         # rp = (opisininnerloop && !(loadintostore(ls, op))) ? rp : zero(rp) # we only care about register pressure within the inner most loop
         rp = opisininnerloop ? rp : zero(rp) # we only care about register pressure within the inner most loop
@@ -871,9 +881,11 @@ function evaluate_cost_tile(
         if isstore(op) & (!u₁reducesrt) & (!u₂reducesrt)
             irreducible_storecosts += rt
         end
+        # @show u₁reducesrt, u₂reducesrt, op, rt, rto, rp
         update_costs!(cost_vec, rt, u₁reducesrt, u₂reducesrt)
         update_costs!(reg_pressure, rp, u₁reducesrp, u₂reducesrp)
     end
+    # reg_pressure[1] = max(reg_pressure[1], length(ls.outer_reductions))
     # @inbounds ((cost_vec[4] > 0) || ((cost_vec[2] > 0) & (cost_vec[3] > 0))) || return 0,0,Inf,false
     costpenalty = (sum(reg_pressure) > REGISTER_COUNT) ? 2 : 1
     u₁v = vectorized === u₁loopsym; u₂v = vectorized === u₂loopsym
@@ -886,10 +898,12 @@ function evaluate_cost_tile(
     end
     outer_reduct_penalty = length(ls.outer_reductions) * (u₁ + isodd(u₁))
     favor_bigger_u₂ = u₁ - u₂
-    favor_smaller_vectorized = u₁v ? ( u₁ - u₂ )  : (u₂v ?  ( u₂ - u₁ ) : 0 )
+    # favor_smaller_vectorized = (u₁v ? u₁ : -u₁) + (u₂v ?  u₂ : -u₂)
+    favor_smaller_vectorized = (u₁v ⊻ u₂v) ? (u₁v ? u₁ - u₂ : u₂ - u₁) : 0
     favor_u₁_vectorized = -0.2u₁v
     favoring_heuristics = favor_bigger_u₂ + 0.5favor_smaller_vectorized + favor_u₁_vectorized
-    u₁, u₂, costpenalty * ucost + stride_penalty(ls, order) + outer_reduct_penalty + favoring_heuristics, choose_to_inline[]
+    costpenalty = costpenalty * ucost + stride_penalty(ls, order) + outer_reduct_penalty + favoring_heuristics
+    u₁, u₂, costpenalty, choose_to_inline[]
 end
 
 
diff --git a/src/lower_compute.jl b/src/lower_compute.jl
@@ -1,5 +1,5 @@
 
-function load_constrained(op, u₁loop, u₂loop, forprefetch = false)
+function load_constrained(op, u₁loop, u₂loop, innermost_loop, forprefetch = false)
     loopdeps = loopdependencies(op)
     dependsonu₁ = u₁loop ∈ loopdeps
     if u₂loop === Symbol("##undefined##")
@@ -17,7 +17,10 @@ function load_constrained(op, u₁loop, u₂loop, forprefetch = false)
     unrolleddeps = Symbol[]
     dependsonu₁ && push!(unrolleddeps, u₁loop)
     dependsonu₂ && push!(unrolleddeps, u₂loop)
-    any(opp -> isload(opp) && all(in(loopdependencies(opp)), unrolleddeps), parents(op))
+    forprefetch && push!(unrolleddeps, innermost_loop)
+    any(parents(op)) do opp
+        isload(opp) && all(in(loopdependencies(opp)), unrolleddeps)
+    end
 end
 function check_if_remfirst(ls, ua)
     usorig = ls.unrollspecification[]
@@ -34,7 +37,7 @@ function check_if_remfirst(ls, ua)
 end
 function sub_fmas(ls::LoopSet, op::Operation, ua::UnrollArgs)
     @unpack u₁, u₁loopsym, u₂loopsym, u₂max = ua
-    !(load_constrained(op, u₁loopsym, u₂loopsym) || check_if_remfirst(ls, ua))
+    !(load_constrained(op, u₁loopsym, u₂loopsym, first(names(ls))) || check_if_remfirst(ls, ua))
 end
 
 struct FalseCollection end
@@ -212,7 +215,8 @@ function lower_compute!(
     for u ∈ 0:Uiter
         instrcall = callexpr(instr)
         varsym = if tiledouterreduction > 0 # then suffix !== nothing
-            modsuffix = ((u + suffix*(Uiter + 1)) & 3)
+            modsuffix = ((u + suffix*(Uiter + 1)) & 7)
+            # modsuffix = suffix::Int#((u + suffix*(Uiter + 1)) & 7)
             # modsuffix = u
             # modsuffix = suffix # (suffix & 3)
             Symbol(mangledvar(op), modsuffix)
diff --git a/src/lower_load.jl b/src/lower_load.jl
@@ -90,7 +90,7 @@ function prefetchisagoodidea(ls::LoopSet, op::Operation, td::UnrollArgs)
             if prod(s -> Float64(length(getloop(ls, s))), @view(indices[1:innermostloopind-1])) ≥ 120.0 && length(getloop(ls, innermostloopsym)) ≥ 120
                 if op.ref.ref.offsets[innermostloopind] < 120
                     for opp ∈ operations(ls)
-                        iscompute(opp) && (innermostloopsym ∈ loopdependencies(opp)) && load_constrained(opp, u₁loopsym, u₂loopsym, true) && return 0
+                        iscompute(opp) && (innermostloopsym ∈ loopdependencies(opp)) && load_constrained(opp, u₁loopsym, u₂loopsym, innermostloopsym, true) && return 0
                     end
                     return innermostloopind
                 end
diff --git a/src/lowering.jl b/src/lowering.jl
@@ -643,7 +643,7 @@ function calc_Ureduct(ls::LoopSet, us::UnrollSpecification)
     elseif u₂ == -1
         min(u₁, 4)
     else
-        4#u₁
+        8#u₂#u₁
     # elseif num_loops(ls) == u₁loopnum
     #     min(u₁, 4)
     # else
diff --git a/test/fallback.jl b/test/fallback.jl
@@ -36,5 +36,9 @@
     @test msdavx(FallbackArrayWrapper(x)) == 1e18
     @test msd(x) == msdavx(FallbackArrayWrapper(x))
     @test msdavx(x) != msdavx(FallbackArrayWrapper(x))
+
+    x = rand(1000); # should be long enough to make zero differences incredibly unlikely
+    @test exp.(x) != (@avx exp.(x))
+    @test exp.(x) == (@avx exp.(FallbackArrayWrapper(x)))
 end
 
diff --git a/test/gemm.jl b/test/gemm.jl
@@ -336,7 +336,7 @@
                end);
     lsr2amb = LoopVectorization.LoopSet(r2ambq);
     if LoopVectorization.REGISTER_COUNT == 32
-        @test LoopVectorization.choose_order(lsr2amb) == ([:n, :m, :k], :n, :m, :m, 7, 3)
+        @test LoopVectorization.choose_order(lsr2amb) == ([:m, :n, :k], :n, :m, :m, 7, 3)
     elseif LoopVectorization.REGISTER_COUNT == 16
         @test LoopVectorization.choose_order(lsr2amb) == ([:m, :n, :k], :n, :m, :m, 4, 2)
     end
diff --git a/test/ifelsemasks.jl b/test/ifelsemasks.jl
@@ -165,21 +165,6 @@ T = Float32
             a[i] > b[i] || (c[i] = a[i] ^ b[i])
         end
     end
-    function maybewriteor!(c, a, b)
-        @inbounds for i ∈ eachindex(c,a,b)
-            a[i] > b[i] || (c[i] = a[i] ^ b[i])
-        end
-    end
-    function maybewriteor_avx!(c, a, b)
-        @_avx for i ∈ eachindex(c,a,b)
-            a[i] > b[i] || (c[i] = a[i] ^ b[i])
-        end
-    end
-    function maybewriteoravx!(c, a, b)
-        @avx for i ∈ eachindex(c,a,b)
-            a[i] > b[i] || (c[i] = a[i] ^ b[i])
-        end
-    end
     function maybewriteor!(c::AbstractVector{<:Integer}, a, b)
         @inbounds for i ∈ eachindex(c,a,b)
             a[i] > b[i] || (c[i] = a[i] & b[i])
diff --git a/test/miscellaneous.jl b/test/miscellaneous.jl
@@ -108,7 +108,7 @@ using Test
     #     # @test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :j, :i, :j, Unum, Tnum)
     #     @test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :j, :i, :j, 1, 1)
     # end
-    @test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :j, Symbol("##undefined##"), :j, 8, -1)
+    @test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :j, Symbol("##undefined##"), :j, 4, -1)
     # my colsum is wrong (by 0.25), but slightly more interesting
     function mycolsum!(x, A)
         @. x = 0
@@ -144,7 +144,7 @@ using Test
     lsvar = LoopVectorization.LoopSet(varq);
     # LoopVectorization.choose_order(lsvar)
     # @test LoopVectorization.choose_order(lsvar) == (Symbol[:j,:i], :j, :i, :j, Unum, Tnum)
-    @test LoopVectorization.choose_order(lsvar) == (Symbol[:j,:i], :j, Symbol("##undefined##"), :j, 8, -1)
+    @test LoopVectorization.choose_order(lsvar) == (Symbol[:j,:i], :j, Symbol("##undefined##"), :j, 4, -1)
     # if LoopVectorization.REGISTER_COUNT == 32
     #     @test LoopVectorization.choose_order(lsvar) == (Symbol[:j,:i], :j, :i, :j, 2, 10)
     # elseif LoopVectorization.REGISTER_COUNT == 16