JuliaSIMD
diff --git a/‎.github/workflows/ci-julia-nightly.yml
Lines changed: 3 additions & 0 deletions b/‎.github/workflows/ci-julia-nightly.yml
Lines changed: 3 additions & 0 deletions
diff --git a/‎.github/workflows/ci.yml
Lines changed: 3 additions & 0 deletions b/‎.github/workflows/ci.yml
Lines changed: 3 additions & 0 deletions
diff --git a/‎Project.toml
Lines changed: 2 additions & 2 deletions b/‎Project.toml
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/LoopVectorization.jl
Lines changed: 1 addition & 0 deletions b/‎src/LoopVectorization.jl
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/codegen/loopstartstopmanager.jl
Lines changed: 42 additions & 30 deletions b/‎src/codegen/loopstartstopmanager.jl
Lines changed: 42 additions & 30 deletions
diff --git a/‎src/codegen/lower_constant.jl
Lines changed: 57 additions & 0 deletions b/‎src/codegen/lower_constant.jl
Lines changed: 57 additions & 0 deletions
diff --git a/‎src/codegen/lower_store.jl
Lines changed: 3 additions & 5 deletions b/‎src/codegen/lower_store.jl
Lines changed: 3 additions & 5 deletions
diff --git a/‎src/codegen/lowering.jl
Lines changed: 23 additions & 22 deletions b/‎src/codegen/lowering.jl
Lines changed: 23 additions & 22 deletions
diff --git a/‎src/condense_loopset.jl
Lines changed: 7 additions & 3 deletions b/‎src/condense_loopset.jl
Lines changed: 7 additions & 3 deletions
diff --git a/‎src/constructors.jl
Lines changed: 20 additions & 19 deletions b/‎src/constructors.jl
Lines changed: 20 additions & 19 deletions
@@ -38,6 +38,9 @@ jobs:
           - part3
           - part4
           - part5
+          - part6
+          - part7
+          - part8
     steps:
       - uses: actions/checkout@v2
       - uses: julia-actions/setup-julia@v1
 
@@ -39,6 +39,9 @@ jobs:
           - part3
           - part4
           - part5
+          - part6
+          - part7
+          - part8
     steps:
       - uses: actions/checkout@v2
       - uses: julia-actions/setup-julia@v1
 
@@ -1,7 +1,7 @@
 name = "LoopVectorization"
 uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
 authors = ["Chris Elrod <[email protected]>"]
-version = "0.12.53"
+version = "0.12.54"
 
 [deps]
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
@@ -30,5 +30,5 @@ Static = "0.2, 0.3"
 StrideArraysCore = "0.1.12"
 ThreadingUtilities = "0.4.5"
 UnPack = "1"
-VectorizationBase = "0.20.23"
+VectorizationBase = "0.20.25"
 julia = "1.5"
@@ -6,6 +6,7 @@ using VectorizationBase: register_size, register_count, cache_linesize, cache_si
   mask, pick_vector_width, MM, AbstractMask, data, grouped_strided_pointer, AbstractSIMD,
   vzero, offsetprecalc, lazymul,
   vadd_nw, vadd_nsw, vadd_nuw, vsub_nw, vsub_nsw, vsub_nuw, vmul_nw, vmul_nsw, vmul_nuw,
+  vfmaddsub, vfmsubadd, vpermilps177, vmovsldup, vmovshdup,
     maybestaticfirst, maybestaticlast, gep, gesp, NativeTypes, #llvmptr,
     vfmadd, vfmsub, vfnmadd, vfnmsub, vfmadd_fast, vfmsub_fast, vfnmadd_fast, vfnmsub_fast, vfmadd231, vfmsub231, vfnmadd231, vfnmsub231,
     vfma_fast, vmuladd_fast, vdiv_fast, vadd_fast, vsub_fast, vmul_fast,
 
@@ -439,49 +439,61 @@ end
 #   end
 #   return nothing
 # end
+
+
 function adjust_offsets!(
   ls::LoopSet, i::Int,
   array_refs_with_same_name::Vector{Int}, arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}}
 )
   ops = operations(ls)
-  @assert length(ops) ≤ 256
-  offsets::Base.RefValue{NTuple{256,Int8}} = Base.RefValue{NTuple{256,Int8}}();
-  GC.@preserve offsets begin
+  if length(ops) ≤ 256
+    offsets = Ref{NTuple{256,Int8}}()
     poffsets = Base.unsafe_convert(Ptr{Int8}, offsets)
-    minoffset = typemax(Int8)
-    maxoffset = typemin(Int8)
-    # stridesunequal = false
+    GC.@preserve offsets adjust_offsets!(ls, i, poffsets, array_refs_with_same_name, arrayref_to_name_op_collection)
+  else
+    offsetsv = similar(ops, Int8)
+    poffsets = pointer(offsetsv)
+    GC.@preserve offsetsv adjust_offsets!(ls, i, poffsets, array_refs_with_same_name, arrayref_to_name_op_collection)
+  end
+end
+function adjust_offsets!(
+  ls::LoopSet, i::Int, poffsets::Ptr{Int8},
+  array_refs_with_same_name::Vector{Int}, arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}}
+)
+  ops = operations(ls)
+  minoffset = typemax(Int8)
+  maxoffset = typemin(Int8)
+  # stridesunequal = false
+  for j ∈ array_refs_with_same_name
+    arrayref_to_name_op = arrayref_to_name_op_collection[j]
+    for (_,__,opid) ∈ arrayref_to_name_op
+      opref = ops[opid].ref
+      off = getoffsets(opref)[i]
+      minoffset = min(off, minoffset)
+      maxoffset = max(off, maxoffset)
+      unsafe_store!(poffsets, off, opid)
+      # stridesunequal |= (stride ≠ getstrides(opref)[i])
+    end
+  end
+  constoffset = Int(minoffset)
+  constoffset = Core.ifelse(Int(maxoffset) - constoffset > 127, 0, constoffset)
+  if constoffset ≠ 0
     for j ∈ array_refs_with_same_name
       arrayref_to_name_op = arrayref_to_name_op_collection[j]
       for (_,__,opid) ∈ arrayref_to_name_op
         opref = ops[opid].ref
-        off = getoffsets(opref)[i]
-        minoffset = min(off, minoffset)
-        maxoffset = max(off, maxoffset)
-        unsafe_store!(poffsets, off, opid)
-        # stridesunequal |= (stride ≠ getstrides(opref)[i])
-      end
-    end
-    constoffset = Int(minoffset)
-    constoffset = Core.ifelse(Int(maxoffset) - constoffset > 127, 0, constoffset)
-    if constoffset ≠ 0
-      for j ∈ array_refs_with_same_name
-        arrayref_to_name_op = arrayref_to_name_op_collection[j]
-        for (_,__,opid) ∈ arrayref_to_name_op
-          opref = ops[opid].ref
-          newoffset = unsafe_load(poffsets, opid) - constoffset
-          # if stridesunequal
-          #   stride = getstrides(opref)[i]
-          #   newoffsetint = Int(newoffset) + (Int(stride) - 1)
-          #   # @assert typemin(Int8) ≤ newoffsetint ≤ typemax(Int8)
-          #   newoffset = Int8(newoffsetint)
-          # end
-          getoffsets(ops[opid].ref)[i] = newoffset
-        end
+        newoffset = unsafe_load(poffsets, opid) - constoffset
+        # if stridesunequal
+        #   stride = getstrides(opref)[i]
+        #   newoffsetint = Int(newoffset) + (Int(stride) - 1)
+        #   # @assert typemin(Int8) ≤ newoffsetint ≤ typemax(Int8)
+        #   newoffset = Int8(newoffsetint)
+        # end
+        getoffsets(ops[opid].ref)[i] = newoffset
       end
     end
   end
-  constoffset#, Core.ifelse(stridesunequal, 1, Int(stride))
+  return constoffset#, Core.ifelse(stridesunequal, 1, Int(stride))
 end
 
 function calcgespinds(
 
@@ -235,3 +235,60 @@ function lower_licm_constants!(ls::LoopSet)
         setop!(ls, ops[id], Expr(:call, reduction_zero(f), ELTYPESYMBOL))
     end
 end
+
+function pushconstvalue!(v::Vector{Any}, ls::LoopSet, op::Operation)::Bool # appends the literal value of constant `op` to `v`; returns `false` when a value was pushed, `true` otherwise
+  isconstant(op) || return true # non-constant ops are never pushed
+  opid = identifier(op)
+  for (id,(intval,intsz,signed)) ∈ ls.preamble_symint # integer constants, matched on op identifier
+    id == opid || continue
+    if intsz == 1 # size 1 is emitted as a Bool
+      push!(v, intval % Bool)
+      return false
+    elseif intsz == 1 # NOTE(review): identical test to the branch above, so this Int8/UInt8 arm is unreachable — confirm the intended size value
+      signed ? push!(v, intval % Int8) : push!(v, intval % UInt8)
+    elseif intsz == 2
+      signed ? push!(v, intval % Int16) : push!(v, intval % UInt16)
+    elseif intsz == 4
+      signed ? push!(v, intval % Int32) : push!(v, intval % UInt32)
+    else # any other size: push `intval` as is when signed, else through `unsigned`
+      signed ? push!(v, intval) : push!(v, unsigned(intval))
+    end
+    return false # reached by every non-Bool arm above
+  end
+  for (id,floatval) ∈ ls.preamble_symfloat # floating point constants are pushed as stored
+    if id == opid
+      push!(v, floatval)
+      return false
+    end
+  end
+  for (id,typ) ∈ ls.preamble_zeros # zero constants: 0.0 when `typ == HardFloat`, integer 0 otherwise
+    id == opid || continue
+    if typ == HardFloat
+      push!(v, 0.0)
+    else
+      push!(v, 0)
+    end
+    return false
+  end
+  for (id,f) ∈ ls.preamble_funcofeltypes # constants selected by the result of `reduction_zero(f)`
+    id == opid || continue
+    x = reduction_zero(f)
+    if x == ADDITIVE_IN_REDUCTIONS
+      push!(v, 0)
+    elseif x == MULTIPLICATIVE_IN_REDUCTIONS
+      push!(v, 1)
+    elseif x == MAX
+      push!(v, -Inf)
+    elseif x == MIN
+      push!(v, Inf)
+    elseif x == ALL
+      push!(v, true)
+    elseif x == ANY
+      push!(v, false)
+    else
+      return true # unrecognized `reduction_zero` result: nothing is pushed
+    end
+    return false
+  end
+  return true # `opid` was not found in any of the preamble lists
+end
@@ -211,7 +211,7 @@ function lower_store!(
           if reductfunc === Symbol("")
             Expr(:call, lv(:_vstore!), sptrsym, gf(mvard,u), inds)
           else
-            Expr(:call, lv(:_vstore!), lv(reductfunc), sptrsym, mvaru, inds)
+            Expr(:call, lv(:_vstore!), lv(reductfunc), sptrsym, gf(mvard,u), inds)
           end
         elseif reductfunc === Symbol("")
           Expr(:call, lv(:_vstore!), sptrsym, mvar, inds)
@@ -282,13 +282,11 @@ function lower_tiled_store!(blockq::Expr, op::Operation, ls::LoopSet, ua::Unroll
         # opp = only(parents(opp))
     end
     isu₁, isu₂ = isunrolled_sym(opp, u₁loopsym, u₂loopsym, vloopsym, ls)#, u₂)
-    @assert isu₂
-    # It's reasonable forthis to be `!isu₁`
+    # It's reasonable for this to be `!isu₁`
     u = Core.ifelse(isu₁, u₁, 1)
     tup = Expr(:tuple)
     for t ∈ 0:u₂-1
-        mvar = Symbol(variable_name(opp, t), '_', u)
-        push!(tup.args, mvar)
+        push!(tup.args, Symbol(variable_name(opp, ifelse(isu₂, t, -1)), '_', u))
     end
     vut = Expr(:call, lv(:VecUnroll), tup) # `VecUnroll` of `VecUnroll`s
     inds = mem_offset_u(op, ua, inds_calc_by_ptr_offset, false, 0, ls)
 
@@ -873,33 +873,34 @@ function lower_unrollspec(ls::LoopSet)
 end
 
 function lower(ls::LoopSet, order, u₁loop, u₂loop, vectorized, u₁, u₂, inline::Bool)
-    cacheunrolled!(ls, u₁loop, u₂loop, vectorized)
-    fillorder!(ls, order, u₁loop, u₂loop, u₂, vectorized)
-    ls.unrollspecification = UnrollSpecification(ls, u₁loop, u₂loop, vectorized, u₁, u₂)
-    q = lower_unrollspec(ls)
-    inline && pushfirst!(q.args, Expr(:meta, :inline))
-    q
+  cacheunrolled!(ls, u₁loop, u₂loop, vectorized)
+  fillorder!(ls, order, u₁loop, u₂loop, u₂, vectorized)
+  ls.unrollspecification = UnrollSpecification(ls, u₁loop, u₂loop, vectorized, u₁, u₂)
+  q = lower_unrollspec(ls)
+  inline && pushfirst!(q.args, Expr(:meta, :inline))
+  q
 end
 
 function lower(ls::LoopSet, inline::Int = -1)
-    fill_offset_memop_collection!(ls)
-    order, u₁loop, u₂loop, vectorized, u₁, u₂, c, shouldinline = choose_order_cost(ls)
-    lower(ls, order, u₁loop, u₂loop, vectorized, u₁, u₂, inlinedecision(inline, shouldinline))
+  fill_offset_memop_collection!(ls)
+  order, u₁loop, u₂loop, vectorized, u₁, u₂, c, shouldinline = choose_order_cost(ls)
+  lower(ls, order, u₁loop, u₂loop, vectorized, u₁, u₂, inlinedecision(inline, shouldinline))
 end
 function lower(ls::LoopSet, u₁::Int, u₂::Int, inline::Int)
-    fill_offset_memop_collection!(ls)
-    if u₂ > 1
-        @assert num_loops(ls) > 1 "There is only $(num_loops(ls)) loop, but specified blocking parameter u₂ is $u₂."
-        order, u₁loop, u₂loop, vectorized, _u₁, _u₂, c, shouldinline = choose_tile(ls)
-        copyto!(ls.loop_order.bestorder, order)
-    else
-        u₂ = -1
-        order, vectorized, c = choose_unroll_order(ls, Inf)
-        u₁loop = first(order); u₂loop = Symbol("##undefined##"); shouldinline = true
-        copyto!(ls.loop_order.bestorder, order)
-    end
-    doinline = inlinedecision(inline, shouldinline)
-    lower(ls, order, u₁loop, u₂loop, vectorized, u₁, u₂, doinline)
+  fill_offset_memop_collection!(ls)
+  fill_children!(ls)
+  if u₂ > 1
+    @assert num_loops(ls) > 1 "There is only $(num_loops(ls)) loop, but specified blocking parameter u₂ is $u₂."
+    order, u₁loop, u₂loop, vectorized, _u₁, _u₂, c, shouldinline = choose_tile(ls)
+    copyto!(ls.loop_order.bestorder, order)
+  else
+    u₂ = -1
+    order, vectorized, c = choose_unroll_order(ls, Inf)
+    u₁loop = first(order); u₂loop = Symbol("##undefined##"); shouldinline = true
+    copyto!(ls.loop_order.bestorder, order)
+  end
+  doinline = inlinedecision(inline, shouldinline)
+  lower(ls, order, u₁loop, u₂loop, vectorized, u₁, u₂, doinline)
 end
 
 # Base.convert(::Type{Expr}, ls::LoopSet) = lower(ls)
 
@@ -645,7 +645,7 @@ function generate_call_types(
   for op ∈ ops
     instr::Instruction = instruction(op)
     if (isconstant(op) && (instr == LOOPCONSTANT)) && (!roots[identifier(op)])
-      instr = op.instruction = DROPPEDCONSTANT 
+      instr = op.instruction = DROPPEDCONSTANT
     end
     push!(operation_descriptions.args, QuoteNode(instr.mod))
     push!(operation_descriptions.args, QuoteNode(instr.instr))
@@ -790,7 +790,7 @@ function setup_call_debug(ls::LoopSet)
   generate_call(ls, (false,zero(Int8),zero(Int8)), zero(UInt), true)
 end
 function setup_call(
-  ls::LoopSet, q::Expr, source::LineNumberNode, inline::Bool, check_empty::Bool, u₁::Int8, u₂::Int8, thread::Int, warncheckarg::Bool
+  ls::LoopSet, q::Expr, source::LineNumberNode, inline::Bool, check_empty::Bool, u₁::Int8, u₂::Int8, thread::Int, warncheckarg::Int
 )
   # We outline/inline at the macro level by creating/not creating an anonymous function.
   # The old API instead was based on inlining or not inline the generated function, but
@@ -802,7 +802,11 @@ function setup_call(
   call = generate_call(ls, (inline, u₁, u₂), thread%UInt, false)
   call = check_empty ? check_if_empty(ls, call) : call
   argfailure = make_crashy(make_fast(q))
-  warncheckarg && (argfailure = Expr(:block, :(@warn "`LoopVectorization.check_args` on your inputs failed; running fallback `@inbounds @fastmath` loop instead."  maxlog=1), argfailure))
+  if warncheckarg ≠ 0
+    warning = :(@warn "`LoopVectorization.check_args` on your inputs failed; running fallback `@inbounds @fastmath` loop instead.")
+    warncheckarg > 0 && push!(warning.args, :(maxlog=$warncheckarg))
+    argfailure = Expr(:block,  warning, argfailure)
+  end
   pushprepreamble!(ls, Expr(:if, check_args_call(ls), call, argfailure))
   prepend_lnns!(ls.prepreamble, lnns)
   return ls.prepreamble
 
@@ -35,7 +35,7 @@ function add_ci_call!(q::Expr, @nospecialize(f), args, syms, i, valarg = nothing
     push!(q.args, Expr(:(=), syms[i], call))
 end
 
-function substitute_broadcast(q::Expr, mod::Symbol, inline, u₁, u₂, threads, warncheckarg)
+function substitute_broadcast(q::Expr, mod::Symbol, inline::Bool, u₁::Int8, u₂::Int8, threads::Int, warncheckarg::Int)
     ci = first(Meta.lower(LoopVectorization, q).args).code
     nargs = length(ci)-1
     ex = Expr(:block)
@@ -75,7 +75,7 @@ function loopset(q::Expr) # for interactive use only
     ls
 end
 
-function check_macro_kwarg(arg, inline::Bool, check_empty::Bool, u₁::Int8, u₂::Int8, threads::Int, warncheckarg::Bool)
+function check_macro_kwarg(arg, inline::Bool, check_empty::Bool, u₁::Int8, u₂::Int8, threads::Int, warncheckarg::Int)
     ((arg.head === :(=)) && (length(arg.args) == 2)) || throw(ArgumentError("macro kwarg should be of the form `argname = value`."))
     kw = (arg.args[1])::Symbol
     value = (arg.args[2])
@@ -101,29 +101,28 @@ function check_macro_kwarg(arg, inline::Bool, check_empty::Bool, u₁::Int8, u
             throw(ArgumentError("Don't know how to process argument in `thread=$value`."))
         end
     elseif kw === :warn_check_args
-        warncheckarg = value::Bool
+        warncheckarg = convert(Int, value)::Int
     else
         throw(ArgumentError("Received unrecognized keyword argument $kw. Recognized arguments include:\n`inline`, `unroll`, `check_empty`, and `thread`."))
     end
     inline, check_empty, u₁, u₂, threads, warncheckarg
 end
-function process_args(args; inline = false, check_empty = false, u₁ = zero(Int8), u₂ = zero(Int8), threads = 1, warncheckarg = false)
-    for arg ∈ args
-        inline, check_empty, u₁, u₂, threads, warncheckarg = check_macro_kwarg(arg, inline, check_empty, u₁, u₂, threads, warncheckarg)
-    end
-    inline, check_empty, u₁, u₂, threads, warncheckarg
+function process_args(args; inline::Bool = false, check_empty::Bool = false, u₁::Int8 = zero(Int8), u₂::Int8 = zero(Int8), threads::Int = 1, warncheckarg::Int = 0)
+  for arg ∈ args
+    inline, check_empty, u₁, u₂, threads, warncheckarg = check_macro_kwarg(arg, inline, check_empty, u₁, u₂, threads, warncheckarg)
+  end
+  inline, check_empty, u₁, u₂, threads, warncheckarg
 end
 function turbo_macro(mod, src, q, args...)
-    q = macroexpand(mod, q)
-    
-    if q.head === :for
-        ls = LoopSet(q, mod)
-        inline, check_empty, u₁, u₂, threads, warncheckarg = process_args(args)
-        esc(setup_call(ls, q, src, inline, check_empty, u₁, u₂, threads, warncheckarg))
-    else
-        inline, check_empty, u₁, u₂, threads, warncheckarg = process_args(args, inline=true)
-        substitute_broadcast(q, Symbol(mod), inline, u₁, u₂, threads, warncheckarg)
-    end
+  q = macroexpand(mod, q)
+  if q.head === :for
+    ls = LoopSet(q, mod)
+    inline, check_empty, u₁, u₂, threads, warncheckarg = process_args(args)
+    esc(setup_call(ls, q, src, inline, check_empty, u₁, u₂, threads, warncheckarg))
+  else
+    inline, check_empty, u₁, u₂, threads, warncheckarg = process_args(args, inline=true)
+    substitute_broadcast(q, Symbol(mod), inline, u₁, u₂, threads, warncheckarg)
+  end
 end
 """
     @turbo
@@ -215,6 +214,8 @@ use their `parent`. Triangular loops aren't yet supported.
 Setting the keyword argument `warn_check_args=true` (e.g. `@turbo warn_check_args=true for ...`) in a loop or
 broadcast statement will cause it to warn once if `LoopVectorization.check_args` fails and the fallback
 loop is executed instead of the LoopVectorization-optimized loop.
+Setting it to an integer > 0 will warn that many times, while setting it to a negative integer will warn
+an unlimited number of times. The default is `warn_check_args = 0`.
 """
 macro turbo(args...)
     turbo_macro(__module__, __source__, last(args), Base.front(args)...)
@@ -256,7 +257,7 @@ end
 macro _turbo(arg, q)
   @assert q.head === :for
   q = macroexpand(__module__, q)
-  inline, check_empty, u₁, u₂ = check_macro_kwarg(arg, false, false, zero(Int8), zero(Int8), 1, false)
+  inline, check_empty, u₁, u₂ = check_macro_kwarg(arg, false, false, zero(Int8), zero(Int8), 1, 0)
   ls = LoopSet(q, __module__)
   set_hw!(ls)
   def_outer_reduct_types!(ls)