diff --git a/Project.toml b/Project.toml
index bc474cbb..fe28b95b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -29,10 +29,12 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 [weakdeps]
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 
 [extensions]
 ForwardDiffExt = ["ChainRulesCore", "ForwardDiff"]
+ForwardDiffNNlibExt = ["ForwardDiff", "NNlib"]
 SpecialFunctionsExt = "SpecialFunctions"
 
 [compat]
@@ -46,6 +48,7 @@ HostCPUFeatures = "0.1.10"
 IfElse = "0.1"
 LayoutPointers = "0.1.11"
 LinearAlgebra = "1"
+NNlib = "0.9.31"
 OffsetArrays = "1.4.1"
 PolyesterWeave = "0.1.10, 0.2"
 PrecompileTools = "1"
@@ -56,5 +59,9 @@ Static = "0.8.4, 1"
 StaticArrayInterface = "1"
 ThreadingUtilities = "0.5"
 UnPack = "1"
-VectorizationBase = "0.21.67"
-julia = "1.6"
+VectorizationBase = "0.21.72"
+julia = "1.10"
+
+[extras]
+ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
diff --git a/docs/make.jl b/docs/make.jl
index b537d66f..96808c54 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -31,7 +31,8 @@ makedocs(;
   ],
   # repo="https://github.com/JuliaSIMD/LoopVectorization.jl/blob/{commit}{path}#L{line}",
   sitename = "LoopVectorization.jl",
-  authors = "Chris Elrod"
+  authors = "Chris Elrod",
+  checkdocs=:exports,
   # assets=[],
 )
 
diff --git a/docs/src/api.md b/docs/src/api.md
index 418844d1..cceae714 100644
--- a/docs/src/api.md
+++ b/docs/src/api.md
@@ -1,5 +1,9 @@
 # API reference
 
+```@docs
+LoopVectorization
+```
+
 ## Macros
 
 ```@docs
@@ -12,6 +16,8 @@
 ```@docs
 vmap
 vmap!
+vmapt
+vmapt!
 vmapnt
 vmapnt!
 vmapntt
@@ -27,7 +33,12 @@ LoopVectorization.vfilter!
 
 ## `reduce`-like constructs
 ```@docs
+vsum
 vreduce
 vmapreduce
 ```
 
+## Operators
+```@docs
+*ˡ
+```
diff --git a/docs/src/index.md b/docs/src/index.md
index d74d43fe..dd424210 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -30,5 +30,3 @@ Pages = [
 ]
 Depth = 1
 ```
-
-
diff --git a/ext/ForwardDiffExt.jl b/ext/ForwardDiffExt.jl
index 26227f69..765570ec 100644
--- a/ext/ForwardDiffExt.jl
+++ b/ext/ForwardDiffExt.jl
@@ -1,6 +1,7 @@
 module ForwardDiffExt
 import ForwardDiff, ChainRulesCore
 using LoopVectorization, VectorizationBase, SLEEFPirates, ForwardDiff
+using SLEEFPirates: tanh_fast, sigmoid_fast
 
 import IfElse: ifelse
 using VectorizationBase: AbstractSIMD, AbstractMask, zero_offsets
@@ -8,7 +9,6 @@ using VectorizationBase: AbstractSIMD, AbstractMask, zero_offsets
 using LoopVectorization:
   AbstractSIMD,
   AbstractStridedPointer,
-  relu,
   vmap,
   VectorizationBase,
   vmapt,
@@ -140,22 +140,6 @@ end
     )
   end
 end
-@generated function VectorizationBase.relu(
-  x::ForwardDiff.Dual{T,S,N}
-) where {T,S,N}
-  quote
-    $(Expr(:meta, :inline))
-    v = x.value
-    z = zero(v)
-    cmp = v < z
-    r = ifelse(cmp, z, v)
-    p = x.partials
-    ForwardDiff.Dual{T}(
-      r,
-      ForwardDiff.Partials(Base.Cartesian.@ntuple $N n -> ifelse(cmp, z, p[n]))
-    )
-  end
-end
 
 @generated function _ifelse(
   m::Union{AbstractMask,VecUnroll{<:Any,<:Any,Bit,<:AbstractMask}},
@@ -284,15 +268,6 @@ function ChainRulesCore.rrule(::typeof(sigmoid_fast), x)
   end
   s, ∂
 end
-function ChainRulesCore.rrule(::typeof(relu), v)
-  z = zero(v)
-  cmp = v < z
-  r = ifelse(cmp, z, v)
-  ∂ = let cmp = cmp
-    y -> (ChainRulesZero(), ifelse(cmp, zero(y), y))
-  end
-  r, ∂
-end
 
 function ∂vmap_singlethread!(
   f::F,
diff --git a/ext/ForwardDiffNNlibExt.jl b/ext/ForwardDiffNNlibExt.jl
new file mode 100644
index 00000000..3c07d5d7
--- /dev/null
+++ b/ext/ForwardDiffNNlibExt.jl
@@ -0,0 +1,47 @@
+module ForwardDiffNNlibExt
+import ForwardDiff
+using LoopVectorization, VectorizationBase, SLEEFPirates, ForwardDiff, NNlib
+
+# `NNlib.relu` for a `ForwardDiff.Dual` whose value is a SIMD vector: wherever the value
+# is negative, the value and every partial are replaced by zero.
+@generated function NNlib.relu(
+  x::ForwardDiff.Dual{T,<:LoopVectorization.AbstractSIMD,N}
+) where {T,N}
+  quote
+    $(Expr(:meta, :inline))
+    v = x.value
+    z = zero(v)
+    cmp = v < z # elementwise `value < 0` comparison, reused for every partial
+    r = ifelse(cmp, z, v)
+    p = x.partials
+    ForwardDiff.Dual{T}(
+      r,
+      ForwardDiff.Partials(Base.Cartesian.@ntuple $N n -> ifelse(cmp, z, p[n]))
+    )
+  end
+end
+
+# `NNlib.leakyrelu` for a `ForwardDiff.Dual` whose value is a SIMD vector: wherever the
+# value is negative, the value and every partial are scaled by `a` (default `0.01`);
+# elsewhere they pass through unchanged.
+@generated function NNlib.leakyrelu(
+  x::ForwardDiff.Dual{T,<:LoopVectorization.AbstractSIMD,N},
+  a = 0.01
+) where {T,N}
+  quote
+    $(Expr(:meta, :inline))
+    v = x.value
+    z = zero(v)
+
+    α = convert(typeof(v), a) # slope converted to the type of the value
+    cmp = v < z # elementwise `value < 0` comparison, reused for every partial
+    r = ifelse(cmp, α * v, v)
+    p = x.partials
+    ForwardDiff.Dual{T}(
+      r,
+      ForwardDiff.Partials(Base.Cartesian.@ntuple $N n -> ifelse(cmp, α * p[n], p[n]))
+    )
+  end
+end
+
+end
diff --git a/src/LoopVectorization.jl b/src/LoopVectorization.jl
index 4af0f832..c86ca2da 100644
--- a/src/LoopVectorization.jl
+++ b/src/LoopVectorization.jl
@@ -25,7 +25,7 @@ if isdefined(Base, :Experimental) &&
   @eval Base.Experimental.@max_methods 1
 end
 export LowDimArray,
-  static, stridedpointer, *ˡ, _turbo_!, tanh_fast, sigmoid_fast
+  static, stridedpointer, *ˡ, tanh_fast, sigmoid_fast
 
 using ArrayInterface: UpTri, LoTri
 using Static: StaticInt, gt, static, Zero, One, reduce_tup
diff --git a/src/constructors.jl b/src/constructors.jl
index 9bdc5758..3391ea2d 100644
--- a/src/constructors.jl
+++ b/src/constructors.jl
@@ -61,53 +61,41 @@ function substitute_broadcast(
   configarg = (inline, u₁, u₂, v, true, threads, warncheckarg, safe)
   unroll_param_tup =
     Expr(:call, lv(:avx_config_val), :(Val{$configarg}()), staticexpr(0))
+
   for n ∈ 1:nargs
     _ciₙ = ci[n]
-    if _ciₙ isa Symbol
-      syms[n] = _ciₙ::Symbol
-    else
-      syms[n] = Symbol('%', n)
-      #ciₙ::Expr = _ciₙ::Expr
-      if _ciₙ isa Expr
-        ciₙ = _ciₙ
-      elseif _ciₙ isa GlobalRef
-        ciₙ = Expr(:globalref, _ciₙ.mod, _ciₙ.name)
+    syms[n] = Symbol('%', n)
+
+    if _ciₙ isa Core.SSAValue
+      push!(lb.args, Expr(:(=), syms[n], syms[_ciₙ.id]))
+
+    elseif _ciₙ isa GlobalRef
+      if _ciₙ.mod === Base || _ciₙ.mod === Core
+        push!(lb.args, Expr(:(=), syms[n], lv(_ciₙ.name)))
       else
-        error("Unexpected type in ci: $(typeof(_ciₙ))")
+        push!(lb.args, Expr(:(=), syms[n], _ciₙ.name))
       end
-      ciₙargs = ciₙ.args
-      f = first(ciₙargs)
-      if ciₙ.head === :(=)
-        push!(lb.args, Expr(:(=), f, syms[((ciₙargs[2])::Core.SSAValue).id]))
-      elseif isglobalref(f, Base, :materialize!)
-        add_ci_call!(
-          lb,
-          lv(:vmaterialize!),
-          ciₙargs,
-          syms,
-          n,
-          unroll_param_tup,
-          mod
-        )
+
+    elseif _ciₙ isa Expr && _ciₙ.head === :call
+      f = first(_ciₙ.args)
+      if isglobalref(f, Base, :materialize!)
+        add_ci_call!(lb, lv(:vmaterialize!), _ciₙ.args, syms, n, unroll_param_tup, mod)
       elseif isglobalref(f, Base, :materialize)
-        add_ci_call!(
-          lb,
-          lv(:vmaterialize),
-          ciₙargs,
-          syms,
-          n,
-          unroll_param_tup,
-          mod
-        )
+        add_ci_call!(lb, lv(:vmaterialize), _ciₙ.args, syms, n, unroll_param_tup, mod)
       else
-        add_ci_call!(lb, f, ciₙargs, syms, n)
+        add_ci_call!(lb, f, _ciₙ.args, syms, n)
       end
+
+    else
+      push!(lb.args, Expr(:(=), syms[n], _ciₙ))
     end
   end
+
   ret::Expr = pop!(lb.args)::Expr
   if Meta.isexpr(ret, :(=), 2)
     ret = (ret.args[2])::Expr
   end
+
   esc(Expr(:let, lb, Expr(:block, ret)))
 end
 
diff --git a/test/forwarddiffext.jl b/test/forwarddiffext.jl
index b4b905c7..32f7e8c0 100644
--- a/test/forwarddiffext.jl
+++ b/test/forwarddiffext.jl
@@ -16,21 +16,6 @@ function tovec(x::ForwardDiff.Dual{T,V,N}) where {T,V,N}
   return ret
 end
 
-if LoopVectorization.ifelse !== Base.ifelse
-  @inline function NNlib.leakyrelu(
-    x::LoopVectorization.AbstractSIMD,
-    a = NNlib.oftf(x, NNlib.leakyrelu_a),
-  )
-    LoopVectorization.ifelse(x > zero(x), float(x), NNlib.oftf(x, a * x)) # max(a*x, x) is 3x slower
-  end
-  @inline function NNlib.leakyrelu(
-    x::ForwardDiff.Dual{<:Any,<:LoopVectorization.AbstractSIMD},
-    a = NNlib.oftf(x, NNlib.leakyrelu_a),
-  )
-    LoopVectorization.ifelse(x > zero(x), float(x), NNlib.oftf(x, a * x)) # max(a*x, x) is 3x slower
-  end
-end
-
 vx0 = randnvec()
 vx1 = randnvec()
 vx2 = randnvec()
@@ -50,3 +35,8 @@ vud = ForwardDiff.Dual(vu0, vu1, vu2)
       reinterpret(Float64, NNlib.leakyrelu.(tovec(vd0)))
 @test reinterpret(Float64, tovec(NNlib.leakyrelu(vud))) ≈
       reinterpret(Float64, NNlib.leakyrelu.(tovec(vud)))
+
+@test reinterpret(Float64, tovec(NNlib.relu(vd0))) ≈
+      reinterpret(Float64, NNlib.relu.(tovec(vd0)))
+@test reinterpret(Float64, tovec(NNlib.relu(vud))) ≈
+      reinterpret(Float64, NNlib.relu.(tovec(vud)))