Merge branch 'master' of github.com:JuliaSIMD/LoopVectorization.jl

chriselrod · chriselrod · commit d1b9f0be9562 · 2021-07-29T07:52:53.000-04:00
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "LoopVectorization"
 uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
 authors = ["Chris Elrod <elrodc@gmail.com>"]
-version = "0.12.56"
+version = "0.12.57"
 
 [deps]
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
diff --git a/src/broadcast.jl b/src/broadcast.jl
@@ -404,8 +404,8 @@ end
   loopsyms = [gensym!(ls, "n") for n ∈ 1:N]
   add_broadcast_loops!(ls, loopsyms, :dest)
   elementbytes = sizeof(T)
-  add_broadcast!(ls, :dest, :bc, loopsyms, BC, elementbytes)
-  storeop = add_simple_store!(ls, :dest, ArrayReference(:dest, loopsyms), elementbytes)
+  add_broadcast!(ls, :destination, :bc, loopsyms, BC, elementbytes)
+  storeop = add_simple_store!(ls, :destination, ArrayReference(:dest, loopsyms), elementbytes)
   doaddref!(ls, storeop)
   resize!(ls.loop_order, num_loops(ls)) # num_loops may be greater than N, eg Product
   # return ls
@@ -426,8 +426,8 @@ end
   pushprepreamble!(ls, Expr(:(=), :dest, Expr(:call, :parent, :dest′)))
   add_broadcast_loops!(ls, loopsyms, :dest′)
   elementbytes = sizeof(T)
-  add_broadcast!(ls, :dest, :bc, loopsyms, BC, elementbytes)
-  storeop = add_simple_store!(ls, :dest, ArrayReference(:dest, reverse(loopsyms)), elementbytes)
+  add_broadcast!(ls, :destination, :bc, loopsyms, BC, elementbytes)
+  storeop = add_simple_store!(ls, :destination, ArrayReference(:dest, reverse(loopsyms)), elementbytes)
   doaddref!(ls, storeop)
   resize!(ls.loop_order, num_loops(ls)) # num_loops may be greater than N, eg Product
   Expr(:block, Expr(:meta,:inline), setup_call(ls, :(Base.Broadcast.materialize!(dest′, bc)), LineNumberNode(0), inline, false, u₁, u₂, v, threads%Int, warncheckarg), :dest′)
diff --git a/src/codegen/split_loops.jl b/src/codegen/split_loops.jl
@@ -1,79 +1,94 @@
 
 
-function add_operation!(ls_new::LoopSet, included::Vector{Int}, ls::LoopSet, op::Operation)
-    newid = included[identifier(op)]
-    iszero(newid) || return operations(ls_new)[newid]
-    vparents = Operation[]
-    for opp ∈ parents(op)
-        # TODO: get it so that
-        # a[i] = f(a[i]) will split into one loop computing and storing f(a[i]), and the other loading from that storage if it needs it.
-        # if iscompute(opp) && (!isstore(op)) # search for stores
-        #     found = false
-        #     for oppp ∈ operations(ls)
-        #         isstore(oppp) || continue
-        #         if first(parents(oppp)) === op
-        #             found = true
-                    
-        #             push!(vparents, add_operation!(ls_new, included, ls, opppp))
-        #             break
-        #         end
-        #     end
-        #     found && continue
-        # end
-        push!(vparents, add_operation!(ls_new, included, ls, opp))
+function add_operation!(ls_new::LoopSet, included::Vector{Int}, ls::LoopSet, op::Operation, ids::Vector{Int}, issecond::Bool)
+  newid = included[identifier(op)]
+  iszero(newid) || return operations(ls_new)[newid]
+  vparents = Operation[]
+  for opp ∈ parents(op)
+    # TODO: get it so that `a[i] = f(a[i])` splits into one loop that computes and stores `f(a[i])`,
+    # and a second loop that loads from that stored array if it needs the value.
+    if issecond && (iscompute(opp) & (!isstore(op)))
+      found = false
+      for opc ∈ children(opp)
+        if isstore(opc) && identifier(opc) ∉ ids
+          # @show opp opc op
+          # replace opp with a load from opc
+          parentsopc = parents(opc)
+          parentsnew = length(parentsopc) > 1 ? Operation[] : NOPARENTS
+          opnew = Operation(
+            length(operations(ls_new)), name(opp), opc.elementbytes, instruction(:getindex), memload,
+            loopdependencies(opc), reduceddependencies(opc), parentsnew, opc.ref, reducedchildren(opc)
+          )
+          addsetv!(ls_new.includedactualarrays, vptr(opc.ref))
+          push!(operations(ls_new), opnew)
+          push!(vparents, opnew)
+          for i ∈ 2:length(parentsopc)
+            push!(parentsnew, add_operation!(ls_new, included, ls, parentsopc[i], ids, issecond))
+          end
+          included[identifier(opp)] = identifier(opnew)
+          found = true
+          break
+        end
+      end
+      found && continue
     end
-    opnew = Operation(
-        length(operations(ls_new)), name(op), op.elementbytes, instruction(op), op.node_type,
-        loopdependencies(op), reduceddependencies(op), vparents, op.ref, reducedchildren(op)
-    )
-    accesses_memory(op) && addsetv!(ls_new.includedactualarrays, vptr(op.ref))
-    push!(operations(ls_new), opnew)
-    included[identifier(op)] = identifier(opnew)
-    opnew
+    push!(vparents, add_operation!(ls_new, included, ls, opp, ids, issecond))
+  end
+  opnew = Operation(
+    length(operations(ls_new)), name(op), op.elementbytes, instruction(op), op.node_type,
+    loopdependencies(op), reduceddependencies(op), vparents, op.ref, reducedchildren(op)
+  )
+  accesses_memory(op) && addsetv!(ls_new.includedactualarrays, vptr(op.ref))
+  push!(operations(ls_new), opnew)
+  included[identifier(op)] = identifier(opnew)
+  opnew
 end
 
 function append_if_included!(vnew, vold, included)
-    for (i, v) ∈ vold
-        id = included[i]
-        iszero(id) || push!(vnew, (id, v))
-    end
+  for (i, v) ∈ vold
+    id = included[i]
+    iszero(id) || push!(vnew, (id, v))
+  end
 end
 
-function split_loopset(ls::LoopSet, ids)
-    ls_new = LoopSet(:LoopVectorization)
-    included = zeros(Int, length(operations(ls)))
-    for i ∈ ids
-        add_operation!(ls_new, included, ls, operations(ls)[i])
-    end
-    for op ∈ operations(ls_new)
-        for l ∈ loopdependencies(op)
-            if l ∉ ls_new.loopsymbols
-                add_loop!(ls_new, getloop(ls, l))
-            end
-        end
-        length(ls_new.loopsymbols) == length(ls.loopsymbols) && break
-    end
-    append_if_included!(ls_new.preamble_symsym, ls.preamble_symsym, included)
-    append_if_included!(ls_new.preamble_symint, ls.preamble_symint, included)
-    append_if_included!(ls_new.preamble_symfloat, ls.preamble_symfloat, included)
-    append_if_included!(ls_new.preamble_zeros, ls.preamble_zeros, included)
-    append_if_included!(ls_new.preamble_funcofeltypes, ls.preamble_funcofeltypes, included)
-    for i ∈ ls.outer_reductions
-        id = included[i]
-        iszero(id) || push!(ls_new.outer_reductions, id)
+function split_loopset(ls::LoopSet, ids::Vector{Int}, issecond::Bool)
+  ls_new = LoopSet(:LoopVectorization)
+  included = zeros(Int, length(operations(ls)))
+  for i ∈ ids
+    add_operation!(ls_new, included, ls, operations(ls)[i], ids, issecond)
+  end
+  for op ∈ operations(ls_new)
+    for l ∈ loopdependencies(op)
+      if l ∉ ls_new.loopsymbols
+        add_loop!(ls_new, getloop(ls, l))
+      end
     end
-    # TODO: allow them to differ. E.g., non-AVX2 x86 cpus don't have efficient integer calculations
-    # Therefore, it would be profitable to split for this reason.
-    # However, currently the default assumption in vector width will be wrong, so we should calculate
-    # it correctly (like ls.vector_width); wrong (too high) value will encourage splitting when
-    # it shouldn't.
-    # Current behavior is incorrect when VECWIDTH chosen does actually differ between
-    # split loops and the loops are statically sized, because code gen will then assume it is correct...
-    l1, l2, l3 = cache_sze(ls)
-    set_hw!(ls_new, reg_size(ls), reg_count(ls), cache_lnsze(ls), l1, l2, l3)
-    ls_new.vector_width = ls.vector_width
-    fill_offset_memop_collection!(ls)
-    ls_new
+    length(ls_new.loopsymbols) == length(ls.loopsymbols) && break
+  end
+  append_if_included!(ls_new.preamble_symsym, ls.preamble_symsym, included)
+  append_if_included!(ls_new.preamble_symint, ls.preamble_symint, included)
+  append_if_included!(ls_new.preamble_symfloat, ls.preamble_symfloat, included)
+  append_if_included!(ls_new.preamble_zeros, ls.preamble_zeros, included)
+  append_if_included!(ls_new.preamble_funcofeltypes, ls.preamble_funcofeltypes, included)
+  for i ∈ ls.outer_reductions
+    id = included[i]
+    iszero(id) || push!(ls_new.outer_reductions, id)
+  end
+  # TODO: allow the vector width to differ between split loops. E.g., non-AVX2 x86 CPUs don't have
+  # efficient integer calculations; therefore, it would be profitable to split for this reason.
+  # However, the default assumption for the vector width will currently be wrong, so we should
+  # calculate it correctly (like ls.vector_width); a wrong (too high) value will encourage splitting
+  # when it shouldn't.
+  # Current behavior is incorrect when the chosen VECWIDTH does actually differ between split loops
+  # and the loops are statically sized, because code gen will then assume it is correct...
+  l1, l2, l3 = cache_sze(ls)
+  set_hw!(ls_new, reg_size(ls), reg_count(ls), cache_lnsze(ls), l1, l2, l3)
+  ls_new.vector_width = ls.vector_width
+  fill_offset_memop_collection!(ls)
+  # println("ls_new operations:")
+  # display(ls_new.operations)
+  # println()
+  ls_new
 end
 
 function returned_ops(ls::LoopSet)
@@ -96,14 +111,14 @@ function lower_and_split_loops(ls::LoopSet, inline::Int)
   # for (ind,i) ∈ enumerate(split_candidates)
   for (ind,i) ∈ enumerate(split_candidates)
     split_1[1] = i
-    ls_1 = split_loopset(ls, split_1)
+    ls_1 = split_loopset(ls, split_1, false)
     order_1, unrolled_1, tiled_1, vectorized_1, U_1, T_1, cost_1, shouldinline_1 = choose_order_cost(ls_1)
     remaining_ops[1:ind-1] .= @view(split_candidates[1:ind-1]); remaining_ops[ind:end] .= @view(split_candidates[ind+1:end])
-    ls_2 = split_loopset(ls, remaining_ops)
+    ls_2 = split_loopset(ls, remaining_ops, true)
     order_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2, cost_2, shouldinline_2 = choose_order_cost(ls_2)
     # U_1 = T_1 = U_2 = T_2 = 2
     # return ls_1, ls_2
-    # @show cost_1 + cost_2 ≤ cost_fused, cost_1, cost_2, cost_fused
+    # @show cost_1 + cost_2 ≤ 0.9cost_fused, (cost_1 + cost_2) / cost_fused, cost_1, cost_2, cost_fused
     if cost_1 + cost_2 ≤ 0.9cost_fused
       ls_2_lowered = if length(remaining_ops) > 1
         inline = iszero(inline) ? (shouldinline_1 % Int) : inline
diff --git a/test/special.jl b/test/special.jl
@@ -344,6 +344,20 @@
       end 
     end 
   end
+  function sin_sum_3loop_split!(u, x, y, z)
+    sx = similar(x); sy = similar(y); sz = similar(z);
+    @turbo for k in 1:length(z)
+      for j in 1:length(y)
+        for i in 1:length(x)
+          sxi = sin(x[i])
+          syj = sin(y[j])
+          szk = sin(z[k])
+          sx[i] = sxi; sy[j] = syj; sz[k] = szk;
+          u[i, j, k] = sxi + syj + szk
+        end 
+      end 
+    end 
+  end
 
     for T ∈ (Float32, Float64)
         @show T, @__LINE__
@@ -428,6 +442,8 @@
       u = zeros(itot+8, itot+8, itot+8);
       uv = @view u[5:5+itot-1, 5:5+itot-1, 5:5+itot-1];
       sin_sum_3loop!(uv, x, y, z);
-      @test uv ≈ (identity(sin.(x)) .+ identity((sin.(y))')) .+ identity(reshape(sin.(z), (1, 1, length(z))))
+      uv2 = @view similar(u)[5:5+itot-1, 5:5+itot-1, 5:5+itot-1];
+      sin_sum_3loop_split!(uv2, x, y, z);
+      @test uv ≈ uv2 ≈ (identity(sin.(x)) .+ identity((sin.(y))')) .+ identity(reshape(sin.(z), (1, 1, length(z))))
     end
 end