JuliaSIMD
diff --git a/Project.toml b/Project.toml
Lines changed: 2 additions & 2 deletions
diff --git a/benchmark/looptests.jl b/benchmark/looptests.jl
Lines changed: 63 additions & 1 deletion
diff --git a/src/codegen/lower_compute.jl b/src/codegen/lower_compute.jl
Lines changed: 2 additions & 3 deletions
diff --git a/src/codegen/lower_load.jl b/src/codegen/lower_load.jl
Lines changed: 49 additions & 16 deletions
diff --git a/src/codegen/lower_memory_common.jl b/src/codegen/lower_memory_common.jl
Lines changed: 22 additions & 6 deletions
diff --git a/src/codegen/lower_store.jl b/src/codegen/lower_store.jl
Lines changed: 10 additions & 5 deletions
diff --git a/src/codegen/lowering.jl b/src/codegen/lowering.jl
Lines changed: 18 additions & 8 deletions
@@ -24,11 +24,11 @@ DocStringExtensions = "0.8"
 IfElse = "0.1"
 OffsetArrays = "1.4.1, 1.5"
 Requires = "1"
-SLEEFPirates = "0.6.11"
+SLEEFPirates = "0.6.12"
 Static = "0.2"
 ThreadingUtilities = "0.3"
 UnPack = "1"
-VectorizationBase = "0.19.3"
+VectorizationBase = "0.19.5"
 julia = "1.5"
 
 [extras]
 
@@ -64,14 +64,53 @@ function jgemm!(𝐂, 𝐀ᵀ::Adjoint, 𝐁ᵀ::Adjoint)
     end
 end
 function gemmavx!(𝐂, 𝐀, 𝐁)
-    @avx for m ∈ axes(𝐀,1), n ∈ axes(𝐁,2)
+    @avx for m ∈ indices((𝐀,𝐂),1), n ∈ indices((𝐁,𝐂),2) # m spans dim 1 of 𝐀 and 𝐂, n spans dim 2 of 𝐁 and 𝐂 (NOTE(review): `indices` presumably requires the paired axes to agree — confirm)
+        𝐂ₘₙ = zero(eltype(𝐂)) # accumulator for one output element, typed from 𝐂
+        for k ∈ indices((𝐀,𝐁),(2,1)) # contraction index: dim 2 of 𝐀 paired with dim 1 of 𝐁
+            𝐂ₘₙ += 𝐀[m,k] * 𝐁[k,n]
+        end
+        𝐂[m,n] = 𝐂ₘₙ # store the finished sum into the output
+    end
+end
+function gemmavx!(Cc::AbstractMatrix{Complex{T}}, Ac::AbstractMatrix{Complex{T}}, Bc::AbstractMatrix{Complex{T}}) where {T} # complex matrix product written into Cc, computed on real-valued views of the three arguments
+    A = reinterpret(reshape, T, Ac) # real-valued view with an extra leading dimension; below, index 1 is used as the real part and index 2 as the imaginary part
+    B = reinterpret(reshape, T, Bc)
+    C = reinterpret(reshape, T, Cc)
+    @avx for m ∈ indices((A,C),2), n ∈ indices((B,C),3) # dims 2 and 3 of the views play the role of the matrix row/column dims
+        Cre = zero(T)
+        Cim = zero(T)
+        for k ∈ indices((A,B),(3,2)) # contraction index: dim 3 of A paired with dim 2 of B
+            Cre += A[1,m,k]*B[1,k,n] - A[2,m,k]*B[2,k,n] # real part of the complex product: re*re - im*im
+            Cim += A[1,m,k]*B[2,k,n] + A[2,m,k]*B[1,k,n] # imaginary part of the complex product: re*im + im*re
+        end
+        C[1,m,n] = Cre
+        C[2,m,n] = Cim
+    end
+end
+function gemmavxt!(𝐂, 𝐀, 𝐁) # `@avxt` counterpart of `gemmavx!`: writes the matrix product of 𝐀 and 𝐁 into 𝐂
+    @avxt for m ∈ indices((𝐀,𝐂),1), n ∈ indices((𝐁,𝐂),2) # same paired-axis ranges as `gemmavx!`, so 𝐂 is tied to the loop bounds instead of being indexed unchecked
         𝐂ₘₙ = zero(eltype(𝐂))
-        for k ∈ axes(𝐀,2)
+        for k ∈ indices((𝐀,𝐁),(2,1)) # contraction index: dim 2 of 𝐀 paired with dim 1 of 𝐁, matching `gemmavx!`
             𝐂ₘₙ += 𝐀[m,k] * 𝐁[k,n]
         end
         𝐂[m,n] = 𝐂ₘₙ
     end
 end
+function gemmavxt!(Cc::AbstractMatrix{Complex{T}}, Ac::AbstractMatrix{Complex{T}}, Bc::AbstractMatrix{Complex{T}}) where {T} # `@avxt` variant of the complex matrix product: result is written into Cc via real-valued views
+    A = reinterpret(reshape, T, Ac) # real-valued view with an extra leading dimension; below, index 1 is used as the real part and index 2 as the imaginary part
+    B = reinterpret(reshape, T, Bc)
+    C = reinterpret(reshape, T, Cc)
+    @avxt for m ∈ indices((A,C),2), n ∈ indices((B,C),3) # dims 2 and 3 of the views play the role of the matrix row/column dims
+        Cre = zero(T)
+        Cim = zero(T)
+        for k ∈ indices((A,B),(3,2)) # contraction index: dim 3 of A paired with dim 2 of B
+            Cre += A[1,m,k]*B[1,k,n] - A[2,m,k]*B[2,k,n] # real part of the complex product: re*re - im*im
+            Cim += A[1,m,k]*B[2,k,n] + A[2,m,k]*B[1,k,n] # imaginary part of the complex product: re*im + im*re
+        end
+        C[1,m,n] = Cre
+        C[2,m,n] = Cim
+    end
+end
 function jdot(a, b)
     s = zero(eltype(a))
     # @inbounds @simd ivdep for i ∈ eachindex(a,b)
@@ -88,6 +127,14 @@ function jdotavx(a, b)
     end
     s
 end
+function jdotavxt(a, b) # sum of a[i] * b[i], with the loop wrapped in `@avxt`
+    s = zero(eltype(a)) # accumulator typed from a's element type
+    # @avx for i ∈ eachindex(a,b)
+    @avxt for i ∈ eachindex(a) # only a's indices drive the loop; NOTE(review): nothing visible here checks that b has matching indices
+        s += a[i] * b[i]
+    end
+    s
+end
 function jselfdot(a)
     s = zero(eltype(a))
     @inbounds @simd ivdep for i ∈ eachindex(a)
@@ -324,3 +371,18 @@ function filter2dunrolledavx!(out::AbstractMatrix, A::AbstractMatrix, kern::Size
     end
     out
 end
+
+
+# function smooth_line!(sl,nrm1,j,i1,rl,ih2,denom)
+#     @fastmath @inbounds @simd ivdep for i=i1:2:nrm1
+#         sl[i,j]=denom*(rl[i,j]+ih2*(sl[i,j-1]+sl[i-1,j]+sl[i+1,j]+sl[i,j+1]))
+#     end
+# end
+# function smooth_line_avx!(sl,nrm1,j,i1,rl,ih2,denom)
+#     @avx for i=i1:2:nrm1
+#         sl[i,j]=denom*(rl[i,j]+ih2*(sl[i,j-1]+sl[i-1,j]+sl[i+1,j]+sl[i,j+1]))
+#     end
+# end
+
+
+
@@ -1,9 +1,8 @@
 
 
 function load_constrained(op, u₁loop, u₂loop, innermost_loop_or_vloop, forprefetch = false)
-    loopdeps = loopdependencies(op)
-    dependsonu₁ = u₁loop ∈ loopdeps
-    dependsonu₂ = u₂loop ∈ loopdeps
+    dependsonu₁ = isu₁unrolled(op)
+    dependsonu₂ = isu₂unrolled(op)
     if forprefetch
         (dependsonu₁ & dependsonu₂) || return false
     end
 
@@ -117,7 +117,7 @@ function lower_load_no_optranslation!(
     u = ifelse(opu₁, u₁, 1)
     mvar = Symbol(variable_name(op, Core.ifelse(isu₂unrolled(op), suffix,-1)), '_', u)
     falseexpr = Expr(:call, lv(:False)); rs = staticexpr(reg_size(ls))
-    if all(op.ref.loopedindex)
+    if all(op.ref.loopedindex) && !rejectcurly(op)
         inds = unrolledindex(op, td, mask, inds_calc_by_ptr_offset)
         loadexpr = Expr(:call, lv(:_vload), vptr(op), inds)
         add_memory_mask!(loadexpr, op, td, mask)
@@ -325,29 +325,62 @@ end
 function _lower_load!(
     q::Expr, ls::LoopSet, op::Operation, td::UnrollArgs, mask::Bool, inds_calc_by_ptr_offset::Vector{Bool} = indices_calculated_by_pointer_offsets(ls, op.ref)
 )
-    omop = offsetloadcollection(ls)
-    batchid, opind = omop.batchedcollectionmap[identifier(op)]
-    # @show batchid == 0 (!isvectorized(op)) rejectinterleave(ls, op, td.vloop, idsformap)
-    if batchid == 0 || (!isvectorized(op)) || (rejectinterleave(ls, op, td.vloop, omop.batchedcollections[batchid]))
+    if rejectinterleave(op) # loads flagged as non-interleavable are lowered on their own
         lower_load_no_optranslation!(q, ls, op, td, mask, inds_calc_by_ptr_offset)
-    elseif opind == 1# only lower loads once
-        # I do not believe it is possible for `opind == 1` to be lowered after an  operation depending on a different opind.
-        # lower_load_collection!(q, ls, op, td, mask, collectionid)
+    else        
         omop = offsetloadcollection(ls)
-        collectionid, copind = omop.opidcollectionmap[identifier(op)]
-        opidmap = offsetloadcollection(ls).opids[collectionid]
-        idsformap = omop.batchedcollections[batchid]
-        lower_load_collection!(q, ls, opidmap, idsformap, td, mask, inds_calc_by_ptr_offset)
+        batchid, opind = omop.batchedcollectionmap[identifier(op)] # which batch this load belongs to and its position inside that batch
+        if opind == 1 # only the first member of a batch lowers the whole collection; other members emit nothing here
+            collectionid, copind = omop.opidcollectionmap[identifier(op)] # copind is not used below
+            opidmap = offsetloadcollection(ls).opids[collectionid]
+            idsformap = omop.batchedcollections[batchid]
+            lower_load_collection!(q, ls, opidmap, idsformap, td, mask, inds_calc_by_ptr_offset)
+        end
     end
 end
-function addive_loopinductvar_only(op::Operation)
+function additive_vectorized_loopinductvar_only(op::Operation) # true when `op` is non-vectorized, a loop value, or an additive compute op whose parents all satisfy this same rule
+    isvectorized(op) || return true # non-vectorized operations are accepted immediately
     isloopvalue(op) && return true
     iscompute(op) || return false
     additive_instr = (:add_fast, :(+), :vadd, :identity, :sub_fast, :(-), :vsub)
     Base.sym_in(instruction(op).instr, additive_instr) || return false
-    return all(addive_loopinductvar_only, parents(op))
+    return all(additive_vectorized_loopinductvar_only, parents(op)) # recurse: every parent must pass the same check
+end
+# Checks if we cannot use `Unroll`: returns `true` when more than one index of `op` is tied to the vectorized loop, or more than one is tied to the u₁-unrolled loop.
+function rejectcurly(ls::LoopSet, op::Operation, td::UnrollArgs) # convenience method: pulls the two loop symbols out of `td` and forwards them
+    @unpack u₁loopsym, vloopsym = td
+    rejectcurly(ls, op, u₁loopsym, vloopsym)
+end
+function rejectcurly(ls::LoopSet, op::Operation, u₁loopsym::Symbol, vloopsym::Symbol)
+    indices = getindicesonly(op)
+    li = op.ref.loopedindex
+    AV = AU = false # AV / AU: whether an index tied to the vectorized / u₁-unrolled loop has already been seen
+    for (n,ind) ∈ enumerate(indices)
+        # @show AU, op, n, ind, vloopsym, u₁loopsym
+        if li[n] # the index is marked as a looped index: compare its symbol with the loop symbols directly
+            if ind === vloopsym
+                AV && return true # second index on the vectorized loop
+                AV = true
+            end
+            if ind === u₁loopsym
+                AU && return true # second index on the u₁-unrolled loop
+                AU = true
+            end
+        else # the index is not a looped index: look up the parent operation named `ind` and inspect it instead
+            opp = findop(parents(op), ind)
+            # @show opp
+            if isvectorized(opp)
+                AV && return true
+                AV = true
+            end
+            if (u₁loopsym === CONSTANTZEROINDEX) ? (CONSTANTZEROINDEX ∈ loopdependencies(opp)) : (isu₁unrolled(opp)) # with the CONSTANTZEROINDEX sentinel as u₁ symbol, test the parent's loop dependencies rather than its unrolled flag
+                AU && return true
+                AU = true
+            end
+        end
+    end
+    false # no loop kind was hit twice
 end
-
 function rejectinterleave(ls::LoopSet, op::Operation, vloop::Loop, idsformap::SubArray{Tuple{Int,Int}, 1, Vector{Tuple{Int,Int}}, Tuple{UnitRange{Int}}, true})
     vloopsym = vloop.itersymbol; strd = step(vloop)
     isknown(strd) || return true
@@ -356,7 +389,7 @@ function rejectinterleave(ls::LoopSet, op::Operation, vloop::Loop, idsformap::Su
         li && continue
         for indop ∈ operations(ls)
             if (name(indop) === ind) && isvectorized(indop)
-                addive_loopinductvar_only(op) || return true # so that it is `MM`
+                additive_vectorized_loopinductvar_only(indop) || return true # so that it is `MM`
             end
         end
     end
 
@@ -172,18 +172,34 @@ function unrolled_curly(op::Operation, u₁::Int, u₁loop::Loop, vloop::Loop, m
     vloopsym = vloop.itersymbol
     indices = getindicesonly(op)
     vstep = step(vloop)
-    # loopedindex = op.ref.loopedindex
+    li = op.ref.loopedindex
     # @assert all(loopedindex)
     # @unpack u₁, u₁loopsym, vloopsym = td
     # @show vptr(op), inds_calc_by_ptr_offset
     # isone(u₁) && return mem_offset_u(op, td, inds_calc_by_ptr_offset, true)
     AV = AU = -1
     for (n,ind) ∈ enumerate(indices)
-        if ind === vloopsym
-            AV = n
-        end
-        if ind === u₁loopsym
-            AU = n
+        # @show AU, op, n, ind, vloopsym, u₁loopsym
+        if li[n]
+            if ind === vloopsym
+                @assert AV == -1 # FIXME: these asserts should be replaced with checks that prevent using `unrolled_curly` in these cases (also to be reflected in cost modeling, to avoid those)
+                AV = n
+            end
+            if ind === u₁loopsym
+                @assert AU == -1
+                AU = n
+            end
+        else
+            opp = findop(parents(op), ind)
+            # @show opp
+            if isvectorized(opp)
+                @assert AV == -1
+                AV = n
+            end
+            if (u₁loopsym === CONSTANTZEROINDEX) ? (CONSTANTZEROINDEX ∈ loopdependencies(opp)) : (isu₁unrolled(opp))
+                @assert AU == -1
+                AU = n
+            end
         end
     end
     # if AU == -1
 
@@ -26,8 +26,13 @@ function reduce_expr_u₂(toreduct::Symbol, instr::Instruction, u₂::Int)
     end
     Expr(:call, lv(:reduce_tup), reduce_to_onevecunroll(instr), t)
 end
-function reduce_expr!(q::Expr, toreduct::Symbol, instr::Instruction, u₁::Int, u₂::Int, isu₁unrolled::Bool)
-    if u₂ != -1
+function reduce_expr!(q::Expr, toreduct::Symbol, instr::Instruction, u₁::Int, u₂::Int, isu₁unrolled::Bool, isu₂unrolled::Bool)
+    # if u₂ == -1
+    #     u₁u, u₂u = (true, false)
+    # else
+    #     u₁u, u₂u = isunrolled_sym(op, getloop(ls, us.u₁loopnum).itersymbol, getloop(ls, us.u₂loopnum).itersymbol, _Umax)
+    # end
+    if isu₂unrolled# u₂ != -1
         _toreduct = Symbol(toreduct, 0)
         push!(q.args, Expr(:(=), _toreduct, reduce_expr_u₂(toreduct, instr, u₂)))
     else
@@ -105,7 +110,7 @@ function lower_store!(
 
     omop = offsetloadcollection(ls)
     batchid, opind = omop.batchedcollectionmap[identifier(op)]
-    if ((batchid ≠ 0) && isvectorized(op)) && (!rejectinterleave(ls, op, vloop, omop.batchedcollections[batchid]))
+    if ((batchid ≠ 0) && isvectorized(op)) && (!rejectinterleave(op))
         (opind == 1) && lower_store_collection!(q, ls, op, ua, mask, inds_calc_by_ptr_offset)
         return
     end
@@ -173,10 +178,10 @@ end
 
 function donot_tile_store(ls::LoopSet, op::Operation, vloop::Loop, reductfunc::Symbol, u₂::Int)
-    (!((reductfunc === Symbol("")) && all(op.ref.loopedindex))) || (u₂ ≤ 1) || isconditionalmemop(op) && return true
-
+    ((!((reductfunc === Symbol("")) && all(op.ref.loopedindex))) || (u₂ ≤ 1) || isconditionalmemop(op)) && return true # whole disjunction parenthesized: `&&` binds tighter than `||`, so previously only the `isconditionalmemop(op)` case could trigger this early return
+    rejectcurly(op) && return true
     omop = offsetloadcollection(ls)
     batchid, opind = omop.batchedcollectionmap[identifier(op)]
-    return ((batchid ≠ 0) && isvectorized(op)) && (!rejectinterleave(ls, op, vloop, omop.batchedcollections[batchid]))
+    return ((batchid ≠ 0) && isvectorized(op)) && (!rejectinterleave(op)) # batched, vectorized stores that can be interleaved are not tiled either
 end
 
 # VectorizationBase implements optimizations for certain grouped stores
 
@@ -73,7 +73,8 @@ function lower_block(
             end
         else
             # for u ∈ 0:u₁-1     #  u₁ && !u₂
-            lower!(blockq, ops[2,1,prepost,n], ls, unrollsyms, u₁, u₂, -1, mask, true, true)
+            lower!(blockq, ops[2,1,prepost,n], ls, unrollsyms, u₁, u₂, -1, mask, true, false)
+            lower!(blockq, ops[2,1,prepost,n], ls, unrollsyms, u₁, u₂, -1, mask, false, true)
             # end
         end
         if n > 1 && prepost == 1
@@ -434,7 +435,7 @@ end
 
 # TODO: handle tiled outer reductions; they will require a suffix arg
 function initialize_outer_reductions!(
-    q::Expr, op::Operation, _Umax::Int, vectorized::Symbol, us::UnrollSpecification, rs::Expr
+    q::Expr, ls::LoopSet, op::Operation, _Umax::Int, vectorized::Symbol, us::UnrollSpecification, rs::Expr
 )
     @unpack u₁, u₂ = us
     Umax = u₂ == -1 ? _Umax : u₁
@@ -459,12 +460,18 @@ function initialize_outer_reductions!(
         Expr(:call, reduct_zero, typeTr)
     end
     mvar = variable_name(op, -1)
+    # u1u, u2u = isunrolled_sym(op, getloop(ls, us.u₁loopnum).itersymbol, u₂loop, u₂max)
     if u₂ == -1
         push!(q.args, Expr(:(=), Symbol(mvar, '_', _Umax), z))
     else
-        for u ∈ 0:_Umax-1
-            # push!(q.args, Expr(:(=), Symbol(mvar, '_', u), z))
-            push!(q.args, Expr(:(=), Symbol(mvar, u), z))
+        u₁u, u₂u = isunrolled_sym(op, getloop(ls, us.u₁loopnum).itersymbol, getloop(ls, us.u₂loopnum).itersymbol, u₂)
+        if u₁u
+            push!(q.args, Expr(:(=), Symbol(mvar, '_', _Umax), z))
+        else            
+            for u ∈ 0:_Umax-1
+                # push!(q.args, Expr(:(=), Symbol(mvar, '_', u), z))
+                push!(q.args, Expr(:(=), Symbol(mvar, u), z))
+            end
         end
     end
     nothing
@@ -473,7 +480,7 @@ function initialize_outer_reductions!(q::Expr, ls::LoopSet, Umax::Int, vectorize
     rs = staticexpr(reg_size(ls))
     us = ls.unrollspecification[]
     for or ∈ ls.outer_reductions
-        initialize_outer_reductions!(q, ls.operations[or], Umax, vectorized, us, rs)
+        initialize_outer_reductions!(q, ls, ls.operations[or], Umax, vectorized, us, rs)
     end
 end
 initialize_outer_reductions!(ls::LoopSet, Umax::Int, vectorized::Symbol) = initialize_outer_reductions!(ls.preamble, ls, Umax, vectorized)
@@ -529,18 +536,21 @@ end
 ## This performs reduction to one `Vec`
 function reduce_expr!(q::Expr, ls::LoopSet, U::Int)
     us = ls.unrollspecification[]
-    u1f, u2f = if us.u₂ == -1 # TODO: these multiple meanings make code hard to follow. Simplify.
+    u₁f, u₂f = if us.u₂ == -1 # TODO: these multiple meanings make code hard to follow. Simplify.
         ifelse(U == -1, us.u₁, U), -1
     else
         us.u₁, U
     end
     # u₁loop, u₂loop = getunrolled(ls)
+    u₁loop = getloop(ls, us.u₁loopnum).itersymbol
+    u₂loop = getloop(ls, us.u₂loopnum).itersymbol
     for or ∈ ls.outer_reductions
         op = ls.operations[or]
         var = name(op)
         mvar = mangledvar(op)
         instr = instruction(op)
-        reduce_expr!(q, mvar, instr, u1f, u2f, isu₁unrolled(op))
+        u₁u, u₂u = isunrolled_sym(op, u₁loop, u₂loop, u₂f)
+        reduce_expr!(q, mvar, instr, u₁f, u₂f, u₁u, u₂u)#isu₁unrolled(op))
         if !iszero(length(ls.opdict))
             if (isu₁unrolled(op) | isu₂unrolled(op))
                 push!(q.args, Expr(:(=), var, Expr(:call, lv(reduction_scalar_combine(instr)), Symbol(mvar, "##onevec##"), var)))