WIP: support mixed Int/CartesianIndex indexing

timholy · timholy · commit f702c4a0df99 · 2020-04-07T05:29:18.000-05:00
ImageFiltering contains optimizations for separable filters: if the size along each dimension is `n[i]`, then per-pixel a fully-separable filter has cost `O(n[1] + ... + n[d])`, where as a non-separable filter has cost `O(n[1] * ... * n[d])`. Consequently this can be a very large savings. This PR aims to support such operations by extending support for the exponential-filter demo in https://julialang.org/blog/2016/02/iteration/.
diff --git a/src/reconstruct_loopset.jl b/src/reconstruct_loopset.jl
@@ -1,3 +1,5 @@
+const NOpsType = Union{Int,Vector{Int}}
+
 function Loop(ls::LoopSet, ex::Expr, sym::Symbol, ::Type{<:AbstractUnitRange})
     ssym = String(sym)
     start = gensym(ssym*"_loopstart"); stop = gensym(ssym*"_loopstop"); loopsym = gensym(ssym * "_loop")
@@ -54,7 +56,7 @@ end
 
 function ArrayReferenceMeta(
     ls::LoopSet, @nospecialize(ar::ArrayRefStruct), arraysymbolinds::Vector{Symbol},
-    opsymbols::Vector{Symbol}, nopsv::Vector{Int}, expandedv::Vector{Bool}
+    opsymbols::Vector{Symbol}, nopsv::Vector{NOpsType}, expandedv::Vector{Bool}
 )
     index_types = ar.index_types
     indices = ar.indices
@@ -73,7 +75,14 @@ function ArrayReferenceMeta(
         elseif index_types == ComputedIndex
             opsym = opsymbols[ind]
             if expandedv[ind]
-                for j ∈ 0:nopsv[ind]-1
+                nops = nopsv[ind]
+                if isa(nops, Vector)
+                    n = first(nops)
+                    if all(isequal(n), nops)
+                        nops = n
+                    end
+                end
+                for j ∈ 0:nops-1
                     pushfirst!(index_vec, expandedopname(opsym, j))
                     pushfirst!(loopedindex, false)
                 end
@@ -144,7 +153,7 @@ function add_mref!(ls::LoopSet, ar::ArrayReferenceMeta, i::Int, ::Type{<:Abstrac
 end
 function create_mrefs!(
     ls::LoopSet, arf::Vector{ArrayRefStruct}, as::Vector{Symbol}, os::Vector{Symbol},
-    nopsv::Vector{Int}, expanded::Vector{Bool}, vargs
+    nopsv::Vector{NOpsType}, expanded::Vector{Bool}, vargs
 )
     mrefs = Vector{ArrayReferenceMeta}(undef, length(arf))
     for i ∈ eachindex(arf)
@@ -230,14 +239,11 @@ function calcnops(ls::LoopSet, os::OperationStruct)
     offsets = ls.loopsymbol_offsets
     idxs = loopindex(ls, os.loopdeps, 0x04)  # FIXME DRY
     iszero(length(idxs)) && return 1
-    Δidxs = map(i->offsets[i+1]-offsets[i], idxs)
-    nops = first(Δidxs)
-    @assert all(isequal(nops), Δidxs)
-    nops
+    return map(i->offsets[i+1]-offsets[i], idxs)
 end
-function isexpanded(ls::LoopSet, ops::Vector{OperationStruct}, nopsv::Vector{Int}, i::Int)
+function isexpanded(ls::LoopSet, ops::Vector{OperationStruct}, nopsv::Vector{NOpsType}, i::Int)
     nops = nopsv[i]
-    isone(nops) && return false
+    (nops === 1 || nops == [1]) && return false
     os = ops[i]
     optyp = optype(os)
     if optyp == compute
@@ -250,7 +256,7 @@ function isexpanded(ls::LoopSet, ops::Vector{OperationStruct}, nopsv::Vector{Int
 end
 
 function add_op!(
-    ls::LoopSet, instr::Instruction, ops::Vector{OperationStruct}, nopsv::Vector{Int}, expandedv::Vector{Bool}, i::Int,
+    ls::LoopSet, instr::Instruction, ops::Vector{OperationStruct}, nopsv::Vector{NOpsType}, expandedv::Vector{Bool}, i::Int,
     mrefs::Vector{ArrayReferenceMeta}, opsymbol, elementbytes::Int
 )
     os = ops[i]
@@ -272,9 +278,15 @@ function add_op!(
         push!(opoffsets, opoffsets[end] + 1)
         return
     end
+    if isa(nops, Vector)
+        n = first(nops)
+        if all(isequal(n), nops)
+            nops = n
+        end
+    end
     # if expanded, optyp must be either loopvalue, or compute (with loopvalues in its ancestry, not cutoff by loads)
     for offset = 0:nops-1
-        sym = nops == 1 ? opsymbol : expandedopname(opsymbol, offset)
+        sym = nops === 1 ? opsymbol : expandedopname(opsymbol, offset)
         op = Operation(
             length(operations(ls)), sym, elementbytes, instr,
             optyp, loopdependencies(ls, os, false, offset), reduceddependencies(ls, os, false, offset),
@@ -295,7 +307,7 @@ function add_parents_to_op!(ls::LoopSet, vparents::Vector{Operation}, up::Unsign
             for j ∈ offsets[i]+1:offsets[i+1] # if parents are expanded, add them all
                 pushfirst!(vparents, ops[j])
             end
-        end        
+        end
     else#if isexpanded
         # Do we want to require that all Δidxs are equal?
         # Because `CartesianIndex((2,3)) - 1` results in a methoderorr, I think this is reasonable for now
@@ -318,15 +330,15 @@ function add_parents_to_ops!(ls::LoopSet, ops::Vector{OperationStruct}, constoff
                     pushpreamble!(ls, Expr(:(=), instr.instr, Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__, Symbol(@__FILE__)), Expr(:ref, :vargs, constoffset))))
                 end
             elseif !isloopvalue(op)
-                add_parents_to_op!(ls, parents(op), ops[i].parents, k, Δ)                
+                add_parents_to_op!(ls, parents(op), ops[i].parents, k, Δ)
             end
         end
     end
     constoffset
 end
 function add_ops!(
     ls::LoopSet, instr::Vector{Instruction}, ops::Vector{OperationStruct}, mrefs::Vector{ArrayReferenceMeta},
-    opsymbols::Vector{Symbol}, constoffset::Int, nopsv::Vector{Int}, expandedv::Vector{Bool}, elementbytes::Int
+    opsymbols::Vector{Symbol}, constoffset::Int, nopsv::Vector{NOpsType}, expandedv::Vector{Bool}, elementbytes::Int
 )
     # @show ls.loopsymbols ls.loopsymbol_offsets
     for i ∈ eachindex(ops)
@@ -378,7 +390,7 @@ function avx_loopset(instr, ops, arf, AM, LPSYM, LB, vargs)
     resize!(ls.loop_order, ls.loopsymbol_offsets[end])
     arraysymbolinds = gen_array_syminds(AM)
     opsymbols = [gensym(:op) for _ ∈ eachindex(ops)]
-    nopsv = calcnops.(Ref(ls), ops)
+    nopsv = NOpsType[calcnops(ls, op) for op in ops]
     expandedv = [isexpanded(ls, ops, nopsv, i) for i ∈ eachindex(ops)]
     mrefs = create_mrefs!(ls, arf, arraysymbolinds, opsymbols, nopsv, expandedv, vargs)
     pushpreamble!(ls, Expr(:(=), ls.T, Expr(:call, :promote_type, [Expr(:call, :eltype, vptr(mref)) for mref ∈ mrefs]...)))
@@ -417,6 +429,3 @@ end
     ls = _avx_loopset(OPS.parameters, ARF.parameters, AM.parameters, LPSYM.parameters, LB.parameters, vargs)
     avx_body(ls, UT)
 end
-
-
-    
diff --git a/test/miscellaneous.jl b/test/miscellaneous.jl
@@ -1,5 +1,9 @@
+using LoopVectorization
+using LinearAlgebra
+using Test
 
 @testset "Miscellaneous" begin
+
     Unum, Tnum = LoopVectorization.VectorizationBase.REGISTER_COUNT == 16 ? (3, 4) : (4, 6)
     dot3q = :(for m ∈ 1:M, n ∈ 1:N
               s += x[m] * A[m,n] * y[n]
@@ -48,7 +52,7 @@
         end
         s
     end
-    
+
     subcolq = :(for i ∈ 1:size(A,2), j ∈ eachindex(x)
                 B[j,i] = A[j,i] - x[j]
                 end)
@@ -220,10 +224,10 @@
 #     t = β₁ = β₂ = ρ = s = 0.0; weights = rand(1); nodes = rand(1); lomnibus(args...) = +(args...)
 # LoopVectorization.@avx_debug for i ∈ eachindex(weights, nodes)
 #         s += weights[i] * lomnibus(nodes[i], t, β₁, β₂, ρ)
-#     end    
+#     end
 # @macroexpand @avx for i ∈ eachindex(weights, nodes)
 #         s += weights[i] * lomnibus(nodes[i], t, β₁, β₂, ρ)
-#     end    
+#     end
     function softmax3_core!(lse, qq, xx, tmpmax, maxk, nk)
         for k in Base.OneTo(maxk)
             @inbounds for i in eachindex(lse)
@@ -486,7 +490,7 @@
             out[i] = 1 + i
         end
     end
-    
+
     for T ∈ (Float32, Float64)
         @show T, @__LINE__
         A = randn(T, 199, 498);
@@ -550,7 +554,7 @@
         @test view(vec(B1), r) == view(vec(B2), r)
         fill!(B2, NaN); test_for_with_different_index_avx!(B2, A, C, start_sample, num_samples)
         @test view(vec(B1), r) == view(vec(B2), r)
-        
+
         ni, nj, nk = (127, 113, 13)
         x = rand(T, ni, nj, nk);
         q1 = similar(x);
@@ -652,4 +656,54 @@
         one_plus_i_avx!(out2)
         @test out1 == out2
     end
+
+    @testset "Mixed CartesianIndex/Int indexing" begin
+        # A demo similar to the exponential filtering demo from https://julialang.org/blog/2016/02/iteration/,
+        # but with no loop-carried dependency.
+        function smoothdim!(s, x, α, Rpre, irng::AbstractUnitRange, Rpost)
+            ifirst, ilast = first(irng), last(irng)
+            ifirst > ilast && return s
+            for Ipost in Rpost
+                # Initialize the first value along the filtered dimension
+                for Ipre in Rpre
+                    s[Ipre, ifirst, Ipost] = x[Ipre, ifirst, Ipost]
+                end
+                # Handle all other entries
+                for i = ifirst+1:ilast
+                    for Ipre in Rpre
+                        s[Ipre, i, Ipost] = α*x[Ipre, i, Ipost] + (1-α)*x[Ipre, i-1, Ipost]
+                    end
+                end
+            end
+            s
+        end
+        function smoothdim_avx!(s, x, α, Rpre, irng::AbstractUnitRange, Rpost)
+            ifirst, ilast = first(irng), last(irng)
+            ifirst > ilast && return s
+            @avx for Ipost in Rpost
+                # Initialize the first value along the filtered dimension
+                for Ipre in Rpre
+                    s[Ipre, ifirst, Ipost] = x[Ipre, ifirst, Ipost]
+                end
+                # Handle all other entries
+                for i = ifirst+1:ilast
+                    for Ipre in Rpre
+                        s[Ipre, i, Ipost] = α*x[Ipre, i, Ipost] + (1-α)*x[Ipre, i-1, Ipost]
+                    end
+                end
+            end
+            s
+        end
+
+        x = rand(11,11,11) # ,11,11)
+        dest1, dest2 = similar(x), similar(x)
+        α = 0.3
+        for d = 1:ndims(x)
+            Rpre  = CartesianIndices(axes(x)[1:d-1])
+            Rpost = CartesianIndices(axes(x)[d+1:end])
+            smoothdim!(dest1, x, α, Rpre, axes(x, d), Rpost)
+            smoothdim_avx!(dest2, x, α, Rpre, axes(x, d), Rpost)
+            @test dest1 ≈ dest2
+        end
+    end
 end