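Update to track the removal of `vectorizable` from VectorizationBase (calls in src/map.jl now use `pointer`). Also add cost and function-symbol entries for SIMDPirates functions useful for defining derivatives (`vadd!`, `vsub!`, `vfmadd!`, `vfnmadd!`, `vfmaddaddone`, and the `vmullog*`/`vdivlog*` family).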

chriselrod · chriselrod · commit 45cbcdc6da6f · 2020-02-21T01:56:45.000-05:00
diff --git a/Manifest.toml b/Manifest.toml
@@ -49,9 +49,9 @@ uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 
 [[SIMDPirates]]
 deps = ["VectorizationBase"]
-git-tree-sha1 = "6d93eddeaf847073dfa36ad339d76015c59a9adb"
+git-tree-sha1 = "f62bec2edf3dc415ac06547e8c9ef07b55d46c0a"
 uuid = "21efa798-c60a-11e8-04d3-e1a92915a26a"
-version = "0.3.14"
+version = "0.3.15"
 
 [[SLEEFPirates]]
 deps = ["Libdl", "SIMDPirates", "VectorizationBase"]
@@ -71,6 +71,6 @@ uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [[VectorizationBase]]
 deps = ["CpuId", "LinearAlgebra"]
-git-tree-sha1 = "e1093ff0fc183880a6f836026309ba06672c92ec"
+git-tree-sha1 = "2a377190de71d8d3c7a65da8c6283e1d2c7f0507"
 uuid = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
-version = "0.3.1"
+version = "0.4.0"
diff --git a/src/LoopVectorization.jl b/src/LoopVectorization.jl
@@ -7,8 +7,8 @@ using VectorizationBase: REGISTER_SIZE, REGISTER_COUNT, extract_data, num_vector
     Static, StaticUnitRange, StaticLowerUnitRange, StaticUpperUnitRange,
     PackedStridedPointer, SparseStridedPointer, RowMajorStridedPointer, StaticStridedPointer, StaticStridedStruct
 using SIMDPirates: VECTOR_SYMBOLS, evadd, evmul, vrange, reduced_add, reduced_prod, reduce_to_add, reduce_to_prod,
-    sizeequivalentfloat, sizeequivalentint
-#    vmullog2, vmullog10, vdivlog2, vdivlog2add, vdivlog10, vdivlog10add, vfmaddaddone
+    sizeequivalentfloat, sizeequivalentint, vadd!, vsub!, vfmadd!, vfnmadd!,
+    vmullog2, vmullog10, vdivlog2, vdivlog10, vmullog2add!, vmullog10add!, vdivlog2add!, vdivlog10add!, vfmaddaddone
 using Base.Broadcast: Broadcasted, DefaultArrayStyle
 using LinearAlgebra: Adjoint, Transpose
 
diff --git a/src/costs.jl b/src/costs.jl
@@ -108,6 +108,8 @@ const COST = Dict{Instruction,InstructionCost}(
     Instruction(:(/)) => InstructionCost(13,4.0,-2.0),
     Instruction(:vadd) => InstructionCost(4,0.5),
     Instruction(:vsub) => InstructionCost(4,0.5),
+    Instruction(:vadd!) => InstructionCost(4,0.5),
+    Instruction(:vsub!) => InstructionCost(4,0.5),
     Instruction(:vmul) => InstructionCost(4,0.5),
     Instruction(:vfdiv) => InstructionCost(13,4.0,-2.0),
     Instruction(:evadd) => InstructionCost(4,0.5),
@@ -148,10 +150,21 @@ const COST = Dict{Instruction,InstructionCost}(
     Instruction(:vfmsub) => InstructionCost(4,0.5), # - and * will fuse into this, so much of the time they're not twice as expensive
     Instruction(:vfnmadd) => InstructionCost(4,0.5), # + and -* will fuse into this, so much of the time they're not twice as expensive
     Instruction(:vfnmsub) => InstructionCost(4,0.5), # - and -* will fuse into this, so much of the time they're not twice as expensive
+    Instruction(:vfmadd!) => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
+    Instruction(:vfnmadd!) => InstructionCost(4,0.5), # + and -* will fuse into this, so much of the time they're not twice as expensive
     Instruction(:vfmadd_fast) => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
     Instruction(:vfmsub_fast) => InstructionCost(4,0.5), # - and * will fuse into this, so much of the time they're not twice as expensive
     Instruction(:vfnmadd_fast) => InstructionCost(4,0.5), # + and -* will fuse into this, so much of the time they're not twice as expensive
     Instruction(:vfnmsub_fast) => InstructionCost(4,0.5), # - and -* will fuse into this, so much of the time they're not twice as expensive
+    Instruction(:vfmaddaddone) => InstructionCost(4,0.5), # costed like the vfmadd family above; NOTE(review): presumably a fused multiply-add with an extra +1 — confirm against SIMDPirates
+    Instruction(:vmullog2) => InstructionCost(4,0.5),
+    Instruction(:vmullog2add!) => InstructionCost(4,0.5),
+    Instruction(:vmullog10) => InstructionCost(4,0.5),
+    Instruction(:vmullog10add!) => InstructionCost(4,0.5),
+    Instruction(:vdivlog2) => InstructionCost(13,4.0,-2.0),
+    Instruction(:vdivlog2add!) =>InstructionCost(13,4.0,-2.0),
+    Instruction(:vdivlog10) => InstructionCost(13,4.0,-2.0),
+    Instruction(:vdivlog10add!) =>InstructionCost(13,4.0,-2.0),
     Instruction(:sqrt) => InstructionCost(15,4.0,-2.0),
     Instruction(:sqrt_fast) => InstructionCost(15,4.0,-2.0),
     Instruction(:log) => InstructionCost(20,20.0,40.0,20),
@@ -213,6 +226,8 @@ const REDUCTION_CLASS = Dict{Symbol,Float64}(
     :vfmsub => ADDITIVE_IN_REDUCTIONS,
     :vfnmadd => ADDITIVE_IN_REDUCTIONS,
     :vfnmsub => ADDITIVE_IN_REDUCTIONS,
+    :vfmadd! => ADDITIVE_IN_REDUCTIONS,
+    :vfnmadd! => ADDITIVE_IN_REDUCTIONS,
     :vfmadd_fast => ADDITIVE_IN_REDUCTIONS,
     :vfmsub_fast => ADDITIVE_IN_REDUCTIONS,
     :vfnmadd_fast => ADDITIVE_IN_REDUCTIONS,
@@ -260,9 +275,11 @@ isreductcombineinstr(instr::Instruction) = isreductcombineinstr(instr.instr)
 const FUNCTIONSYMBOLS = Dict{Type{<:Function},Instruction}(
     typeof(+) => :(+),
     typeof(SIMDPirates.vadd) => :(+),
+    typeof(SIMDPirates.vadd!) => :(+),
     typeof(Base.FastMath.add_fast) => :(+),
     typeof(-) => :(-),
     typeof(SIMDPirates.vsub) => :(-),
+    typeof(SIMDPirates.vsub!) => :(-),
     typeof(Base.FastMath.sub_fast) => :(-),
     typeof(*) => :(*),
     typeof(SIMDPirates.vmul) => :(*),
@@ -287,10 +304,21 @@ const FUNCTIONSYMBOLS = Dict{Type{<:Function},Instruction}(
     typeof(SIMDPirates.vfmsub) => :vfmsub,
     typeof(SIMDPirates.vfnmadd) => :vfnmadd,
     typeof(SIMDPirates.vfnmsub) => :vfnmsub,
+    typeof(SIMDPirates.vfmadd!) => :vfmadd!,
+    typeof(SIMDPirates.vfnmadd!) => :vfnmadd!,
     typeof(SIMDPirates.vfmadd_fast) => :vfmadd_fast,
     typeof(SIMDPirates.vfmsub_fast) => :vfmsub_fast,
     typeof(SIMDPirates.vfnmadd_fast) => :vfnmadd_fast,
     typeof(SIMDPirates.vfnmsub_fast) => :vfnmsub_fast,
+    typeof(vfmaddaddone) => :vfmaddaddone,
+    typeof(vmullog2) => :vmullog2,
+    typeof(vmullog2add!) => :vmullog2add!,
+    typeof(vmullog10) => :vmullog10,
+    typeof(vmullog10add!) => :vmullog10add!,
+    typeof(vdivlog2) => :vdivlog2,
+    typeof(vdivlog2add!) => :vdivlog2add!,
+    typeof(vdivlog10) => :vdivlog10,
+    typeof(vdivlog10add!) => :vdivlog10add!,
     typeof(sqrt) => :sqrt,
     typeof(Base.FastMath.sqrt_fast) => :sqrt,
     typeof(SIMDPirates.vsqrt) => :sqrt,
diff --git a/src/map.jl b/src/map.jl
@@ -3,14 +3,14 @@
 function vmap_quote(N, ::Type{T}) where {T}
     W, Wshift = VectorizationBase.pick_vector_width_shift(T)
     val = Expr(:call, Expr(:curly, :Val, W))
-    q = Expr(:block, Expr(:(=), :M, Expr(:call, :length, :dest)), Expr(:(=), :vdest, Expr(:call, :vectorizable, :dest)), Expr(:(=), :m, 0))
+    q = Expr(:block, Expr(:(=), :M, Expr(:call, :length, :dest)), Expr(:(=), :vdest, Expr(:call, :pointer, :dest)), Expr(:(=), :m, 0))
     fcall = Expr(:call, :f)
     loopbody = Expr(:block, Expr(:call, :vstore!, :vdest, fcall, :m), Expr(:(+=), :m, W))
     fcallmask = Expr(:call, :f)
     bodymask = Expr(:block, Expr(:(=), :__mask__, Expr(:call, :mask, val, Expr(:call, :&, :M, W-1))), Expr(:call, :vstore!, :vdest, fcallmask, :m, :__mask__))
     for n ∈ 1:N
         arg_n = Symbol(:varg_,n)
-        push!(q.args, Expr(:(=), arg_n, Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__,Symbol(@__FILE__)), Expr(:call, :vectorizable, Expr(:ref, :args, n)))))
+        push!(q.args, Expr(:(=), arg_n, Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__,Symbol(@__FILE__)), Expr(:call, :pointer, Expr(:ref, :args, n)))))
         push!(fcall.args, Expr(:call, :vload, val, arg_n, :m))
         push!(fcallmask.args, Expr(:call, :vload, val, arg_n, :m, :__mask__))
     end
diff --git a/src/precompile.jl b/src/precompile.jl
@@ -58,7 +58,6 @@ function _precompile_()
     precompile(Tuple{typeof(Base.Broadcast.broadcasted),Function,Array{Int64,3},LowDimArray{(false, true, true),Int64,3,Array{Int64,3}}})
     precompile(Tuple{typeof(Base.Broadcast.broadcasted),Function,Array{Int64,3},LowDimArray{(true, false, true),Int64,3,Array{Int64,3}}})
     precompile(Tuple{typeof(Base.Broadcast.broadcasted),Function,Array{Int64,3},LowDimArray{(true, true, false),Int64,3,Array{Int64,3}}})
-    precompile(Tuple{typeof(Base.Broadcast.broadcasted),typeof(*ˡ),Array{Float64,2},Array{Float64,1}})
     precompile(Tuple{typeof(Base.Broadcast.broadcasted),typeof(*ˡ),Array{Int32,2},Array{Int32,1}})
     precompile(Tuple{typeof(Base.Broadcast.broadcasted),typeof(*ˡ),Array{Int64,2},Array{Int64,1}})
     precompile(Tuple{typeof(LoopVectorization._avx_loopset),Core.SimpleVector,Core.SimpleVector,Core.SimpleVector,Core.SimpleVector,NTuple{4,DataType}})
@@ -336,7 +335,6 @@ function _precompile_()
     precompile(Tuple{typeof(foreach),typeof(empty!),Array{Array{LoopVectorization.Operation,1},1}})
     precompile(Tuple{typeof(getindex),Type{LoopVectorization.ArrayRefStruct},LoopVectorization.ArrayRefStruct,LoopVectorization.ArrayRefStruct,LoopVectorization.ArrayRefStruct,LoopVectorization.ArrayRefStruct,Vararg{LoopVectorization.ArrayRefStruct,N} where N})
     precompile(Tuple{typeof(getindex),Type{LoopVectorization.ArrayRefStruct},LoopVectorization.ArrayRefStruct,LoopVectorization.ArrayRefStruct,LoopVectorization.ArrayRefStruct,LoopVectorization.ArrayRefStruct})
-    precompile(Tuple{typeof(getindex),Type{LoopVectorization.ArrayRefStruct},LoopVectorization.ArrayRefStruct,LoopVectorization.ArrayRefStruct})
     precompile(Tuple{typeof(hash),LoopVectorization.Instruction,UInt64})
     precompile(Tuple{typeof(iterate),LoopVectorization.LoopOrders,Array{Int64,1}})
     precompile(Tuple{typeof(println),Base.GenericIOBuffer{Array{UInt8,1}},Array{LoopVectorization.Operation,1}})