Bump version to 0.7.7.

chriselrod · chriselrod · commit 8c37cae1e16b · 2020-05-07T18:10:34.000-04:00
diff --git a/.travis.yml b/.travis.yml
@@ -16,7 +16,7 @@ jobs:
   - julia: nightly
   include:
     - stage: "Documentation"
-      julia: 1.3
+      julia: 1.4
       os: linux
       script:
         - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd()));
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "LoopVectorization"
 uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
 authors = ["Chris Elrod <elrodc@gmail.com>"]
-version = "0.7.6"
+version = "0.7.7"
 
 [deps]
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
diff --git a/src/condense_loopset.jl b/src/condense_loopset.jl
@@ -43,13 +43,14 @@ function ArrayRefStruct(ls::LoopSet, mref::ArrayReferenceMeta, arraysymbolinds::
             indices |= getloopid(ls, ind)
         else
             parent = get(ls.opdict, ind, nothing)
-            if parent === nothing
-                index_types |= SymbolicIndex
-                indices |= findindoradd!(arraysymbolinds, ind)
-            else
-                index_types |= ComputedIndex
-                indices |= identifier(parent)
-            end
+            @assert !isnothing(parent) # Symbolic indices should already have been subset, so only computed indices remain here
+            # if parent === nothing
+            #     index_types |= SymbolicIndex
+            #     indices |= findindoradd!(arraysymbolinds, ind)
+            # else
+            index_types |= ComputedIndex
+            indices |= identifier(parent)
+            # end
         end
     end
     ArrayRefStruct{mref.ref.array,mref.ptr}( index_types, indices, offsets )
@@ -154,13 +155,14 @@ function argmeta_and_consts_description(ls::LoopSet, arraysymbolinds)
 end
 
 function loopset_return_value(ls::LoopSet, ::Val{extract}) where {extract}
-    if length(ls.outer_reductions) == 1
+    @assert !iszero(length(ls.outer_reductions))
+    if isone(length(ls.outer_reductions))
         if extract
             Expr(:call, :extract_data, Symbol(mangledvar(getop(ls, ls.outer_reductions[1])), 0))
         else
             Symbol(mangledvar(getop(ls, ls.outer_reductions[1])), 0)
         end
-    elseif length(ls.outer_reductions) > 1
+    else#if length(ls.outer_reductions) > 1
         ret = Expr(:tuple)
         ops = operations(ls)
         for or ∈ ls.outer_reductions
@@ -171,8 +173,6 @@ function loopset_return_value(ls::LoopSet, ::Val{extract}) where {extract}
             end
         end
         ret
-    else
-        nothing
     end
 end
 
@@ -296,8 +296,8 @@ make_fast_and_crashy(q) = q |> make_fast |> make_crashy
 
 function setup_call_inline(ls::LoopSet, inline::Int8 = zero(Int8), U::Int8 = zero(Int8), T::Int8 = zero(Int8))
     call = generate_call(ls, (inline,U,T))
-    hasouterreductions = length(ls.outer_reductions) > 0
-    if !hasouterreductions
+    noouterreductions = iszero(length(ls.outer_reductions))
+    if noouterreductions
         q = Expr(:block,gc_preserve(ls, call))
         append!(ls.preamble.args, q.args)
         return ls.preamble
@@ -315,7 +315,7 @@ function setup_call_inline(ls::LoopSet, inline::Int8 = zero(Int8), U::Int8 = zer
         push!(outer_reducts.args, out)
         push!(q.args, Expr(:(=), var, Expr(:call, lv(reduction_scalar_combine(instr)), out, var)))
     end
-    hasouterreductions && pushpreamble!(ls, outer_reducts)
+    pushpreamble!(ls, outer_reducts)
     append!(ls.preamble.args, q.args)
     ls.preamble
 end
diff --git a/src/costs.jl b/src/costs.jl
@@ -91,19 +91,19 @@ function vector_cost(ic::InstructionCost, Wshift, sizeof_T)
     srt, sl, srp
 end
 
-const OPAQUE_INSTRUCTION = InstructionCost(50, 50.0, -1.0, VectorizationBase.REGISTER_COUNT)
+const OPAQUE_INSTRUCTION = InstructionCost(-1.0, 50, 50.0, VectorizationBase.REGISTER_COUNT)
 
 instruction_cost(instruction::Instruction) = instruction.mod === :LoopVectorization ? COST[instruction.instr] : OPAQUE_INSTRUCTION
 instruction_cost(instruction::Symbol) = get(COST, instruction, OPAQUE_INSTRUCTION)
 scalar_cost(instr::Instruction) = scalar_cost(instruction_cost(instr))
 vector_cost(instr::Instruction, Wshift, sizeof_T) = vector_cost(instruction_cost(instr), Wshift, sizeof_T)
-function cost(instruction::InstructionCost, Wshift, sizeof_T)
-    Wshift == 0 ? scalar_cost(instruction) : vector_cost(instruction, Wshift, sizeof_T)
-end
+# function cost(instruction::InstructionCost, Wshift, sizeof_T)
+#     Wshift == 0 ? scalar_cost(instruction) : vector_cost(instruction, Wshift, sizeof_T)
+# end
 
-function cost(instruction::Instruction, Wshift, sizeof_T)
-    cost( instruction_cost(instruction), Wshift, sizeof_T )
-end
+# function cost(instruction::Instruction, Wshift, sizeof_T)
+#     cost( instruction_cost(instruction), Wshift, sizeof_T )
+# end
 
 
 # Just a semi-reasonable assumption; should not be that sensitive to anything other than loads
@@ -323,11 +323,11 @@ function reduction_scalar_combine(x::Float64)
     x == ADDITIVE_IN_REDUCTIONS ? :reduced_add : x == MULTIPLICATIVE_IN_REDUCTIONS ? :reduced_prod : x == MAX ? :reduced_max : x == MIN ? :reduced_min : throw("Reduction not found.")
 end
 reduction_scalar_combine(x) = reduction_scalar_combine(reduction_instruction_class(x))
-function reduction_combine_to(x::Float64)
-    # x == 1.0 ? :reduce_to_add : x == 2.0 ? :reduce_to_prod : x == 3.0 ? :reduce_to_any : x == 4.0 ? :reduce_to_all : x == 5.0 ? :reduce_to_max : x == 6.0 ? :reduce_to_min : throw("Reduction not found.")
-    x == ADDITIVE_IN_REDUCTIONS ? :reduce_to_add : x == MULTIPLICATIVE_IN_REDUCTIONS ? :reduce_to_prod : x == MAX ? :reduce_to_max : x == MIN ? :reduce_to_min : throw("Reduction not found.")
-end
-reduction_combine_to(x) = reduction_combine_to(reduction_instruction_class(x))
+# function reduction_combine_to(x::Float64)
+#     # x == 1.0 ? :reduce_to_add : x == 2.0 ? :reduce_to_prod : x == 3.0 ? :reduce_to_any : x == 4.0 ? :reduce_to_all : x == 5.0 ? :reduce_to_max : x == 6.0 ? :reduce_to_min : throw("Reduction not found.")
+#     x == ADDITIVE_IN_REDUCTIONS ? :reduce_to_add : x == MULTIPLICATIVE_IN_REDUCTIONS ? :reduce_to_prod : x == MAX ? :reduce_to_max : x == MIN ? :reduce_to_min : throw("Reduction not found.")
+# end
+# reduction_combine_to(x) = reduction_combine_to(reduction_instruction_class(x))
 function reduction_zero(x::Float64)
     # x == 1.0 ? :zero : x == 2.0 ? :one : x == 3.0 ? :false : x == 4.0 ? :true : x == 5.0 ? :typemin : x == 6.0 ? :typemax : throw("Reduction not found.")
     x == ADDITIVE_IN_REDUCTIONS ? :zero : x == MULTIPLICATIVE_IN_REDUCTIONS ? :one : x == MAX ? :typemin : x == MIN ? :typemax : throw("Reduction not found.")
diff --git a/src/precompile.jl b/src/precompile.jl
@@ -1,32 +1,32 @@
 function _precompile_()
     ccall(:jl_generating_output, Cint, ()) == 1 || return nothing
-
+    precompile(Tuple{typeof(LoopVectorization.lower),LoopVectorization.LoopSet})
     precompile(Tuple{Type{LoopVectorization.LoopSet},Expr})
-    precompile(Tuple{typeof(Base.mapreduce_impl),typeof(LoopVectorization.elsize),typeof(max),Array{LoopVectorization.Operation,1},Int64,Int64})
+    precompile(Tuple{typeof(Base.mapreduce_impl),typeof(LoopVectorization.elsize),typeof(max),Array{LoopVectorization.Operation,1},Int,Int})
     precompile(Tuple{typeof(LoopVectorization._avx_loopset),Core.SimpleVector,Core.SimpleVector,Core.SimpleVector,Core.SimpleVector,Core.SimpleVector,Any})
-    precompile(Tuple{typeof(LoopVectorization.add_broadcast!),LoopVectorization.LoopSet,Symbol,Symbol,Array{Symbol,1},Type{Array{Bool,1}},Int64})
-    precompile(Tuple{typeof(LoopVectorization.add_ci_call!),Expr,Any,Array{Any,1},Array{Symbol,1},Int64,Symbol})
-    precompile(Tuple{typeof(LoopVectorization.add_ci_call!),Expr,Any,Array{Any,1},Array{Symbol,1},Int64})
-    precompile(Tuple{typeof(LoopVectorization.add_constant!),LoopVectorization.LoopSet,Float64,Array{Symbol,1},Symbol,Int64})
-    precompile(Tuple{typeof(LoopVectorization.add_parent!),Array{LoopVectorization.Operation,1},Array{Symbol,1},Array{Symbol,1},LoopVectorization.LoopSet,Int64,Int64,Int64})
-    precompile(Tuple{typeof(LoopVectorization.avx_body),LoopVectorization.LoopSet,Tuple{Int8,Int8}})
+    precompile(Tuple{typeof(LoopVectorization.add_broadcast!),LoopVectorization.LoopSet,Symbol,Symbol,Array{Symbol,1},Type{Array{Bool,1}},Int})
+    precompile(Tuple{typeof(LoopVectorization.add_ci_call!),Expr,Any,Array{Any,1},Array{Symbol,1},Int,Symbol})
+    precompile(Tuple{typeof(LoopVectorization.add_ci_call!),Expr,Any,Array{Any,1},Array{Symbol,1},Int})
+    precompile(Tuple{typeof(LoopVectorization.add_constant!),LoopVectorization.LoopSet,Float64,Array{Symbol,1},Symbol,Int})
+    precompile(Tuple{typeof(LoopVectorization.add_parent!),Array{LoopVectorization.Operation,1},Array{Symbol,1},Array{Symbol,1},LoopVectorization.LoopSet,Int,Int,Int})
+    precompile(Tuple{typeof(LoopVectorization.avx_body),LoopVectorization.LoopSet,Tuple{Int8,Int8,Int8}})
     precompile(Tuple{typeof(LoopVectorization.avx_loopset),Array{LoopVectorization.Instruction,1},Array{LoopVectorization.OperationStruct,1},Array{LoopVectorization.ArrayRefStruct,1},Core.SimpleVector,Core.SimpleVector,Core.SimpleVector,Any})
     precompile(Tuple{typeof(LoopVectorization.cost_vec_buf),LoopVectorization.LoopSet})
     precompile(Tuple{typeof(LoopVectorization.evaluate_cost_tile),LoopVectorization.LoopSet,Array{Symbol,1},Symbol,Symbol,Symbol})
     precompile(Tuple{typeof(LoopVectorization.evaluate_cost_unroll),LoopVectorization.LoopSet,Array{Symbol,1},Symbol,Float64})
-    precompile(Tuple{typeof(LoopVectorization.lower_block),LoopVectorization.LoopSet,LoopVectorization.UnrollSpecification,Int64,Nothing,Int64})
-    precompile(Tuple{typeof(LoopVectorization.lower_block),LoopVectorization.LoopSet,LoopVectorization.UnrollSpecification,Int64,Symbol,Int64})
-    precompile(Tuple{typeof(LoopVectorization.lower_compute!),Expr,LoopVectorization.Operation,Symbol,Symbol,Symbol,Symbol,Int64,Int64,Nothing,Bool})
-    precompile(Tuple{typeof(LoopVectorization.lower_compute!),Expr,LoopVectorization.Operation,Symbol,Symbol,Symbol,Symbol,Int64,Int64,Symbol,Bool})
-    precompile(Tuple{typeof(LoopVectorization.lower_compute!),Expr,LoopVectorization.Operation,Symbol,Symbol,Symbol,Symbol,Int64,Nothing,Nothing,Bool})
-    precompile(Tuple{typeof(LoopVectorization.lower_compute!),Expr,LoopVectorization.Operation,Symbol,Symbol,Symbol,Symbol,Int64,Nothing,Symbol,Bool})
-    precompile(Tuple{typeof(LoopVectorization.lower_load!),Expr,LoopVectorization.Operation,Symbol,LoopVectorization.LoopSet,Symbol,Symbol,Int64,Int64,Nothing})
-    precompile(Tuple{typeof(LoopVectorization.lower_load_scalar!),Expr,LoopVectorization.Operation,Symbol,Symbol,Symbol,Symbol,Int64,Nothing,Int64})
+    # precompile(Tuple{typeof(LoopVectorization.lower_block),LoopVectorization.LoopSet,LoopVectorization.UnrollSpecification,Int,Nothing,Int})
+    # precompile(Tuple{typeof(LoopVectorization.lower_block),LoopVectorization.LoopSet,LoopVectorization.UnrollSpecification,Int,Symbol,Int})
+    # precompile(Tuple{typeof(LoopVectorization.lower_compute!),Expr,LoopVectorization.Operation,Symbol,Symbol,Symbol,Symbol,Int,Int,Nothing,Bool})
+    # precompile(Tuple{typeof(LoopVectorization.lower_compute!),Expr,LoopVectorization.Operation,Symbol,Symbol,Symbol,Symbol,Int,Int,Symbol,Bool})
+    # precompile(Tuple{typeof(LoopVectorization.lower_compute!),Expr,LoopVectorization.Operation,Symbol,Symbol,Symbol,Symbol,Int,Nothing,Nothing,Bool})
+    # precompile(Tuple{typeof(LoopVectorization.lower_compute!),Expr,LoopVectorization.Operation,Symbol,Symbol,Symbol,Symbol,Int,Nothing,Symbol,Bool})
+    # precompile(Tuple{typeof(LoopVectorization.lower_load!),Expr,LoopVectorization.Operation,Symbol,LoopVectorization.LoopSet,Symbol,Symbol,Int,Int,Nothing})
+    # precompile(Tuple{typeof(LoopVectorization.lower_load_scalar!),Expr,LoopVectorization.Operation,Symbol,Symbol,Symbol,Symbol,Int,Nothing,Int})
     precompile(Tuple{typeof(LoopVectorization.reg_pres_buf),LoopVectorization.LoopSet})
     precompile(Tuple{typeof(LoopVectorization.setup_call),LoopVectorization.LoopSet})
-    precompile(Tuple{typeof(LoopVectorization.solve_unroll),SubArray{Float64,1,Array{Float64,2},Tuple{Base.Slice{Base.OneTo{Int64}},Int64},true},SubArray{Float64,1,Array{Float64,2},Tuple{Base.Slice{Base.OneTo{Int64}},Int64},true},Int64,Int64})
+    precompile(Tuple{typeof(LoopVectorization.solve_unroll),SubArray{Float64,1,Array{Float64,2},Tuple{Base.Slice{Base.OneTo{Int}},Int},true},SubArray{Float64,1,Array{Float64,2},Tuple{Base.Slice{Base.OneTo{Int}},Int},true},Int,Int})
     precompile(Tuple{typeof(LoopVectorization.substitute_broadcast),Expr,Symbol})
-    precompile(Tuple{typeof(LoopVectorization.vmap_quote),Int64,Type{Float32}})
+    precompile(Tuple{typeof(LoopVectorization.vmap_quote),Int,Type{Float32}})
     precompile(Tuple{typeof(println),Base.GenericIOBuffer{Array{UInt8,1}},Array{LoopVectorization.Operation,1}})
-    precompile(Tuple{typeof(resize!),LoopVectorization.LoopOrder,Int64})
+    precompile(Tuple{typeof(resize!),LoopVectorization.LoopOrder,Int})
 end
diff --git a/src/reconstruct_loopset.jl b/src/reconstruct_loopset.jl
@@ -75,7 +75,8 @@ function ArrayReferenceMeta(
                 pushfirst!(offset_vec, offset)
                 pushfirst!(loopedindex, true)
             end
-        elseif index_types == ComputedIndex
+        else#if index_types == ComputedIndex
+            @assert index_types == ComputedIndex
             opsym = opsymbols[ind]
             if expandedv[ind]
                 nops = nopsv[ind]
@@ -89,11 +90,11 @@ function ArrayReferenceMeta(
                 pushfirst!(offset_vec, offset)
                 pushfirst!(loopedindex, false)
             end
-        else
-            @assert index_types == SymbolicIndex
-            pushfirst!(index_vec, arraysymbolinds[ind])
-            pushfirst!(offset_vec, offset)
-            pushfirst!(loopedindex, false)
+        # else
+            # @assert index_types == SymbolicIndex
+            # pushfirst!(index_vec, arraysymbolinds[ind])
+            # pushfirst!(offset_vec, offset)
+            # pushfirst!(loopedindex, false)
         end
         index_types >>>= 8
         indices >>>= 8
diff --git a/test/copy.jl b/test/copy.jl
@@ -53,8 +53,10 @@ using LoopVectorization, OffsetArrays, Test
         end
     end
     function make2point3avx!(x)
+        a = 1.742416161578685
+        b = 1.5
         @avx for i ∈ eachindex(x)
-            x[i] = 2.3
+            x[i] = a ^ b
         end
     end
     function make2point3_avx!(x)
diff --git a/test/dot.jl b/test/dot.jl
@@ -183,6 +183,27 @@ using Test
         4acc/length(x)
     end
 
+    function dotloopinductvarpow(x)
+        s = zero(eltype(x))
+        for i ∈ eachindex(x)
+            s += x[i] * i^3
+        end
+        s
+    end
+    function dotloopinductvarpowavx(x)
+        s = zero(eltype(x))
+        @avx for i ∈ eachindex(x)
+            s += x[i] * i^3
+        end
+        s
+    end
+    function dot_from_n_to_100(a, b, n)
+        s = zero(eltype(a))
+        @avx for i ∈ n:100
+            s += a[i] * b[i]
+        end
+        s
+    end
     # @macroexpand @_avx for i = 1:length(a_re) - 1
     #     c_re[i] = b_re[i] * a_re[i + 1] - b_im[i] * a_im[i + 1]
     #     c_im[i] = b_re[i] * a_im[i + 1] + b_im[i] * a_re[i + 1]
@@ -220,6 +241,9 @@ using Test
             @test πest == pi_avx_u4(a, b)
         end
 
+        @test dotloopinductvarpow(a) ≈ dotloopinductvarpowavx(a)
+        @test dot_from_n_to_100(a, b, 33) == @views mydotavx(a[33:100], b[33:100])
+
         a_re = rand(R, N); a_im = rand(R, N);
         b_re = rand(R, N); b_im = rand(R, N);
         ac = Complex.(a_re, a_im);
diff --git a/test/miscellaneous.jl b/test/miscellaneous.jl
@@ -835,6 +835,7 @@ using Test
     end
 
     @testset "Mixed CartesianIndex/Int indexing" begin
+        @show T, @__LINE__
         # A demo similar to the exponential filtering demo from https://julialang.org/blog/2016/02/iteration/,
         # but with no loop-carried dependency.
         function smoothdim!(s, x, α, Rpre, irng::AbstractUnitRange, Rpost)