12 changes: 9 additions & 3 deletions Project.toml
@@ -29,10 +29,11 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
[weakdeps]
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"

[extensions]
ForwardDiffExt = ["ChainRulesCore", "ForwardDiff"]
ForwardDiffExt = ["ChainRulesCore", "ForwardDiff", "NNlib"]
Member: NNlib should be a separate extension. Not only is this breaking, it also means that it won't properly trigger for most users.
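A minimal sketch of the suggested layout (hypothetical; the NNlibExt name and its single trigger are assumptions, not code from this PR). Folding NNlib into ForwardDiffExt means the overloads only load once ChainRulesCore, ForwardDiff, and NNlib are all in the session, whereas a dedicated extension fires on NNlib alone:

[extensions]
ForwardDiffExt = ["ChainRulesCore", "ForwardDiff"]
# Hypothetical separate extension: triggers whenever NNlib is loaded,
# independently of ChainRulesCore and ForwardDiff.
NNlibExt = "NNlib"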

SpecialFunctionsExt = "SpecialFunctions"

[compat]
@@ -46,6 +47,7 @@ HostCPUFeatures = "0.1.10"
IfElse = "0.1"
LayoutPointers = "0.1.11"
LinearAlgebra = "1"
NNlib = "0.9.31"
OffsetArrays = "1.4.1"
PolyesterWeave = "0.1.10, 0.2"
PrecompileTools = "1"
@@ -56,5 +58,9 @@ Static = "0.8.4, 1"
StaticArrayInterface = "1"
ThreadingUtilities = "0.5"
UnPack = "1"
VectorizationBase = "0.21.67"
julia = "1.6"
VectorizationBase = "0.21.72"
julia = "1.10"

[extras]
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
3 changes: 2 additions & 1 deletion docs/make.jl
@@ -31,7 +31,8 @@ makedocs(;
],
# repo="https://github.com/JuliaSIMD/LoopVectorization.jl/blob/{commit}{path}#L{line}",
sitename = "LoopVectorization.jl",
authors = "Chris Elrod"
authors = "Chris Elrod",
checkdocs=:exports,
# assets=[],
)

11 changes: 11 additions & 0 deletions docs/src/api.md
@@ -1,5 +1,9 @@
# API reference

```@docs
LoopVectorization
```

## Macros

```@docs
@@ -12,6 +16,8 @@
```@docs
vmap
vmap!
vmapt
vmapt!
vmapnt
vmapnt!
vmapntt
@@ -27,7 +33,12 @@ LoopVectorization.vfilter!

## `reduce`-like constructs
```@docs
vsum
vreduce
vmapreduce
```

## Operators
```@docs
```
2 changes: 0 additions & 2 deletions docs/src/index.md
@@ -30,5 +30,3 @@ Pages = [
]
Depth = 1
```


28 changes: 25 additions & 3 deletions ext/ForwardDiffExt.jl
@@ -1,14 +1,14 @@
module ForwardDiffExt
import ForwardDiff, ChainRulesCore
using LoopVectorization, VectorizationBase, SLEEFPirates, ForwardDiff
using LoopVectorization, VectorizationBase, SLEEFPirates, ForwardDiff, NNlib
using SLEEFPirates: tanh_fast, sigmoid_fast

import IfElse: ifelse
using VectorizationBase: AbstractSIMD, AbstractMask, zero_offsets

using LoopVectorization:
AbstractSIMD,
AbstractStridedPointer,
relu,
vmap,
VectorizationBase,
vmapt,
@@ -140,7 +140,8 @@ end
)
end
end
@generated function VectorizationBase.relu(

@generated function NNlib.relu(
x::ForwardDiff.Dual{T,S,N}
Member: This is type piracy.

) where {T,S,N}
quote
@@ -157,6 +158,27 @@ end
end
end

@generated function NNlib.leakyrelu(
x::ForwardDiff.Dual{T,S,N},
Member: This is type piracy. Why is this required?

Member: Those functions were once written with branches. LV only added methods for the SIMD types and Duals of SIMD types where this was problematic. Not sure why this PR dropped the constraints.

Contributor Author (@mxpoch, Oct 6, 2025): Because that's what the test suite tested for, and it was causing this error -- which occurs across pre, 1, and lts. I figured it would be cleaner to move the definition to the extension instead of defining it in the test.

Contributor Author: Although it appears I may have missed a subtlety in the original implementation's branching -- what was the original goal for these tests?

Member: The concern is that the old functions were only overloaded for AbstractSIMD:

@inline function NNlib.leakyrelu(
  x::LoopVectorization.AbstractSIMD,
  a = NNlib.oftf(x, NNlib.leakyrelu_a),
)
  LoopVectorization.ifelse(x > zero(x), float(x), NNlib.oftf(x, a * x)) # max(a*x, x) is 3x slower
end
@inline function NNlib.leakyrelu(
  x::ForwardDiff.Dual{<:Any,<:LoopVectorization.AbstractSIMD},
  a = NNlib.oftf(x, NNlib.leakyrelu_a),
)
  LoopVectorization.ifelse(x > zero(x), float(x), NNlib.oftf(x, a * x)) # max(a*x, x) is 3x slower
end

The new ones work for any S. This is why the new ones commit type piracy, while the old ones only sort of did (sort of, because LoopVectorization doesn't itself define AbstractSIMD, but has permission to add methods if needed).

Probably SLEEFPirates is a better place for this, with an NNlib extension.

Contributor Author: Ok, should I remove these tests and the new associated functions from the PR?

Contributor Author: Also, it seems strange that many functions defined in the extension aren't being tested; I can add a new set of tests if that's within scope.

Member: Just make them dispatch on AbstractSIMD again.
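For concreteness, a minimal sketch of that suggestion (hypothetical; it reuses the PR's method body, continued below, and only adds an S <: AbstractSIMD bound, so plain scalar Duals keep NNlib's own definition):

@generated function NNlib.leakyrelu(
  x::ForwardDiff.Dual{T,S,N},
  a = 0.01
) where {T,S<:LoopVectorization.AbstractSIMD,N}
  quote
    $(Expr(:meta, :inline))
    v = x.value
    cmp = v < zero(v)  # lane-wise compare; no branch on SIMD values
    α = convert(typeof(v), a)
    p = x.partials
    ForwardDiff.Dual{T}(
      ifelse(cmp, α * v, v),  # value: a*v where v < 0, else v
      ForwardDiff.Partials(Base.Cartesian.@ntuple $N n -> ifelse(cmp, α * p[n], p[n]))
    )
  end
end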

a = 0.01
) where {T,S,N}
quote
$(Expr(:meta, :inline))
v = x.value
z = zero(v)

α = convert(typeof(v), a)
cmp = v < z
r = ifelse(cmp, α * v, v)
p = x.partials
ForwardDiff.Dual{T}(
r,
ForwardDiff.Partials(Base.Cartesian.@ntuple $N n -> ifelse(cmp, α * p[n], p[n]))
)
end
end


@generated function _ifelse(
m::Union{AbstractMask,VecUnroll{<:Any,<:Any,Bit,<:AbstractMask}},
x::ForwardDiff.Dual{TAG,V,P},
2 changes: 1 addition & 1 deletion src/LoopVectorization.jl
@@ -25,7 +25,7 @@ if isdefined(Base, :Experimental) &&
@eval Base.Experimental.@max_methods 1
end
export LowDimArray,
static, stridedpointer, *ˡ, _turbo_!, tanh_fast, sigmoid_fast
static, stridedpointer, *ˡ, tanh_fast, sigmoid_fast

using ArrayInterface: UpTri, LoTri
using Static: StaticInt, gt, static, Zero, One, reduce_tup
56 changes: 22 additions & 34 deletions src/constructors.jl
@@ -61,53 +61,41 @@ function substitute_broadcast(
configarg = (inline, u₁, u₂, v, true, threads, warncheckarg, safe)
unroll_param_tup =
Expr(:call, lv(:avx_config_val), :(Val{$configarg}()), staticexpr(0))

for n ∈ 1:nargs
_ciₙ = ci[n]
if _ciₙ isa Symbol
syms[n] = _ciₙ::Symbol
else
syms[n] = Symbol('%', n)
#ciₙ::Expr = _ciₙ::Expr
if _ciₙ isa Expr
ciₙ = _ciₙ
elseif _ciₙ isa GlobalRef
ciₙ = Expr(:globalref, _ciₙ.mod, _ciₙ.name)
syms[n] = Symbol('%', n)

if _ciₙ isa Core.SSAValue
push!(lb.args, Expr(:(=), syms[n], syms[_ciₙ.id]))

elseif _ciₙ isa GlobalRef
if _ciₙ.mod === Base || _ciₙ.mod === Core
push!(lb.args, Expr(:(=), syms[n], lv(_ciₙ.name)))
else
error("Unexpected type in ci: $(typeof(_ciₙ))")
push!(lb.args, Expr(:(=), syms[n], _ciₙ.name))
end
ciₙargs = ciₙ.args
f = first(ciₙargs)
if ciₙ.head === :(=)
push!(lb.args, Expr(:(=), f, syms[((ciₙargs[2])::Core.SSAValue).id]))
elseif isglobalref(f, Base, :materialize!)
add_ci_call!(
lb,
lv(:vmaterialize!),
ciₙargs,
syms,
n,
unroll_param_tup,
mod
)

elseif _ciₙ isa Expr && _ciₙ.head === :call
f = first(_ciₙ.args)
if isglobalref(f, Base, :materialize!)
add_ci_call!(lb, lv(:vmaterialize!), _ciₙ.args, syms, n, unroll_param_tup, mod)
elseif isglobalref(f, Base, :materialize)
add_ci_call!(
lb,
lv(:vmaterialize),
ciₙargs,
syms,
n,
unroll_param_tup,
mod
)
add_ci_call!(lb, lv(:vmaterialize), _ciₙ.args, syms, n, unroll_param_tup, mod)
else
add_ci_call!(lb, f, ciₙargs, syms, n)
add_ci_call!(lb, f, _ciₙ.args, syms, n)
end

else
push!(lb.args, Expr(:(=), syms[n], _ciₙ))
end
end

ret::Expr = pop!(lb.args)::Expr
if Meta.isexpr(ret, :(=), 2)
ret = (ret.args[2])::Expr
end

esc(Expr(:let, lb, Expr(:block, ret)))
end

20 changes: 5 additions & 15 deletions test/forwarddiffext.jl
@@ -16,21 +16,6 @@ function tovec(x::ForwardDiff.Dual{T,V,N}) where {T,V,N}
return ret
end

if LoopVectorization.ifelse !== Base.ifelse
@inline function NNlib.leakyrelu(
x::LoopVectorization.AbstractSIMD,
a = NNlib.oftf(x, NNlib.leakyrelu_a),
)
LoopVectorization.ifelse(x > zero(x), float(x), NNlib.oftf(x, a * x)) # max(a*x, x) is 3x slower
end
@inline function NNlib.leakyrelu(
x::ForwardDiff.Dual{<:Any,<:LoopVectorization.AbstractSIMD},
a = NNlib.oftf(x, NNlib.leakyrelu_a),
)
LoopVectorization.ifelse(x > zero(x), float(x), NNlib.oftf(x, a * x)) # max(a*x, x) is 3x slower
end
end

vx0 = randnvec()
vx1 = randnvec()
vx2 = randnvec()
@@ -50,3 +35,8 @@ vud = ForwardDiff.Dual(vu0, vu1, vu2)
reinterpret(Float64, NNlib.leakyrelu.(tovec(vd0)))
@test reinterpret(Float64, tovec(NNlib.leakyrelu(vud))) ≈
reinterpret(Float64, NNlib.leakyrelu.(tovec(vud)))

@test reinterpret(Float64, tovec(NNlib.relu(vd0))) ≈
reinterpret(Float64, NNlib.relu.(tovec(vd0)))
@test reinterpret(Float64, tovec(NNlib.relu(vud))) ≈
reinterpret(Float64, NNlib.relu.(tovec(vud)))