12 changes: 9 additions & 3 deletions Project.toml
@@ -29,10 +29,11 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
[weakdeps]
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"

[extensions]
ForwardDiffExt = ["ChainRulesCore", "ForwardDiff"]
ForwardDiffExt = ["ChainRulesCore", "ForwardDiff", "NNlib"]
Member: NNlib should be a separate extension. Not only is this breaking, it also means that it won't properly trigger for most users.
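A minimal sketch of the suggested layout (hypothetical; the NNlibExt name and its single trigger are assumptions, not code from this PR). Folding NNlib into ForwardDiffExt means the overloads only load once ChainRulesCore, ForwardDiff, and NNlib are all in the session, whereas a dedicated extension fires on NNlib alone:

[extensions]
ForwardDiffExt = ["ChainRulesCore", "ForwardDiff"]
# Hypothetical separate extension: triggers whenever NNlib is loaded,
# independently of ChainRulesCore and ForwardDiff.
NNlibExt = "NNlib"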

SpecialFunctionsExt = "SpecialFunctions"

[compat]
@@ -46,6 +47,7 @@ HostCPUFeatures = "0.1.10"
IfElse = "0.1"
LayoutPointers = "0.1.11"
LinearAlgebra = "1"
NNlib = "0.9.31"
OffsetArrays = "1.4.1"
PolyesterWeave = "0.1.10, 0.2"
PrecompileTools = "1"
@@ -56,5 +58,9 @@ Static = "0.8.4, 1"
StaticArrayInterface = "1"
ThreadingUtilities = "0.5"
UnPack = "1"
VectorizationBase = "0.21.67"
julia = "1.6"
VectorizationBase = "0.21.72"
julia = "1.10"

[extras]
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
3 changes: 2 additions & 1 deletion docs/make.jl
@@ -31,7 +31,8 @@ makedocs(;
],
# repo="https://github.com/JuliaSIMD/LoopVectorization.jl/blob/{commit}{path}#L{line}",
sitename = "LoopVectorization.jl",
authors = "Chris Elrod"
authors = "Chris Elrod",
checkdocs=:exports,
# assets=[],
)

11 changes: 11 additions & 0 deletions docs/src/api.md
@@ -1,5 +1,9 @@
# API reference

```@docs
LoopVectorization
```

## Macros

```@docs
@@ -12,6 +16,8 @@
```@docs
vmap
vmap!
vmapt
vmapt!
vmapnt
vmapnt!
vmapntt
@@ -27,7 +33,12 @@ LoopVectorization.vfilter!

## `reduce`-like constructs
```@docs
vsum
vreduce
vmapreduce
```

## Operators
```@docs
```
2 changes: 0 additions & 2 deletions docs/src/index.md
@@ -30,5 +30,3 @@ Pages = [
]
Depth = 1
```


28 changes: 25 additions & 3 deletions ext/ForwardDiffExt.jl
@@ -1,14 +1,14 @@
module ForwardDiffExt
import ForwardDiff, ChainRulesCore
using LoopVectorization, VectorizationBase, SLEEFPirates, ForwardDiff
using LoopVectorization, VectorizationBase, SLEEFPirates, ForwardDiff, NNlib
using SLEEFPirates: tanh_fast, sigmoid_fast

import IfElse: ifelse
using VectorizationBase: AbstractSIMD, AbstractMask, zero_offsets

using LoopVectorization:
AbstractSIMD,
AbstractStridedPointer,
relu,
vmap,
VectorizationBase,
vmapt,
@@ -140,7 +140,8 @@ end
)
end
end
@generated function VectorizationBase.relu(

@generated function NNlib.relu(
x::ForwardDiff.Dual{T,S,N}
Member: This is type piracy.

) where {T,S,N}
quote
@@ -157,6 +158,27 @@ end
end
end

@generated function NNlib.leakyrelu(
x::ForwardDiff.Dual{T,S,N},
Member: This is type piracy. Why is this required?

Member: Those functions were once written with branches. LV only added methods for the SIMD types and Duals of SIMD types where this was problematic. Not sure why this PR dropped the constraints.

Contributor Author (@mxpoch, Oct 6, 2025): Because that's what the test suite tested for, and it was causing this error -- which occurs across pre, 1, and lts. I figured it would be cleaner to move the definition to the extension instead of defining it in the test.

Contributor Author: Although it appears I may have missed a subtlety in the original implementation's branching -- what was the original goal for these tests?

Member: The concern is that the old functions were only overloaded for AbstractSIMD:

@inline function NNlib.leakyrelu(
  x::LoopVectorization.AbstractSIMD,
  a = NNlib.oftf(x, NNlib.leakyrelu_a),
)
  LoopVectorization.ifelse(x > zero(x), float(x), NNlib.oftf(x, a * x)) # max(a*x, x) is 3x slower
end
@inline function NNlib.leakyrelu(
  x::ForwardDiff.Dual{<:Any,<:LoopVectorization.AbstractSIMD},
  a = NNlib.oftf(x, NNlib.leakyrelu_a),
)
  LoopVectorization.ifelse(x > zero(x), float(x), NNlib.oftf(x, a * x)) # max(a*x, x) is 3x slower
end

The new ones work for any S. This is why the new ones commit type piracy, while the old ones only sort of did (sort of, because LoopVectorization doesn't itself define AbstractSIMD, but has permission to add methods if needed).

Probably SLEEFPirates is a better place for this, with an NNlib extension.

Contributor Author: Ok, should I remove these tests and the new associated functions from the PR?

Contributor Author: Also, it seems strange that many functions defined in the extension aren't being tested; I can add a new set of tests if that's within scope.

Member: Just make them dispatch on AbstractSIMD again.
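For concreteness, a minimal sketch of that suggestion (hypothetical; it reuses the PR's method body, continued below, and only adds an S <: AbstractSIMD bound, so plain scalar Duals keep NNlib's own definition):

@generated function NNlib.leakyrelu(
  x::ForwardDiff.Dual{T,S,N},
  a = 0.01
) where {T,S<:LoopVectorization.AbstractSIMD,N}
  quote
    $(Expr(:meta, :inline))
    v = x.value
    cmp = v < zero(v)  # lane-wise compare; no branch on SIMD values
    α = convert(typeof(v), a)
    p = x.partials
    ForwardDiff.Dual{T}(
      ifelse(cmp, α * v, v),  # value: a*v where v < 0, else v
      ForwardDiff.Partials(Base.Cartesian.@ntuple $N n -> ifelse(cmp, α * p[n], p[n]))
    )
  end
end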

a = 0.01
) where {T,S,N}
quote
$(Expr(:meta, :inline))
v = x.value
z = zero(v)

α = convert(typeof(v), a)
cmp = v < z
r = ifelse(cmp, α * v, v)
p = x.partials
ForwardDiff.Dual{T}(
r,
ForwardDiff.Partials(Base.Cartesian.@ntuple $N n -> ifelse(cmp, α * p[n], p[n]))
)
end
end


@generated function _ifelse(
m::Union{AbstractMask,VecUnroll{<:Any,<:Any,Bit,<:AbstractMask}},
x::ForwardDiff.Dual{TAG,V,P},
2 changes: 1 addition & 1 deletion src/LoopVectorization.jl
@@ -25,7 +25,7 @@ if isdefined(Base, :Experimental) &&
@eval Base.Experimental.@max_methods 1
end
export LowDimArray,
static, stridedpointer, *ˡ, _turbo_!, tanh_fast, sigmoid_fast
static, stridedpointer, *ˡ, tanh_fast, sigmoid_fast

using ArrayInterface: UpTri, LoTri
using Static: StaticInt, gt, static, Zero, One, reduce_tup
56 changes: 22 additions & 34 deletions src/constructors.jl
@@ -61,53 +61,41 @@ function substitute_broadcast(
configarg = (inline, u₁, u₂, v, true, threads, warncheckarg, safe)
unroll_param_tup =
Expr(:call, lv(:avx_config_val), :(Val{$configarg}()), staticexpr(0))

for n ∈ 1:nargs
_ciₙ = ci[n]
if _ciₙ isa Symbol
syms[n] = _ciₙ::Symbol
else
syms[n] = Symbol('%', n)
#ciₙ::Expr = _ciₙ::Expr
if _ciₙ isa Expr
ciₙ = _ciₙ
elseif _ciₙ isa GlobalRef
ciₙ = Expr(:globalref, _ciₙ.mod, _ciₙ.name)
syms[n] = Symbol('%', n)

if _ciₙ isa Core.SSAValue
push!(lb.args, Expr(:(=), syms[n], syms[_ciₙ.id]))

elseif _ciₙ isa GlobalRef
if _ciₙ.mod === Base || _ciₙ.mod === Core
push!(lb.args, Expr(:(=), syms[n], lv(_ciₙ.name)))
else
error("Unexpected type in ci: $(typeof(_ciₙ))")
push!(lb.args, Expr(:(=), syms[n], _ciₙ.name))
end
ciₙargs = ciₙ.args
f = first(ciₙargs)
if ciₙ.head === :(=)
push!(lb.args, Expr(:(=), f, syms[((ciₙargs[2])::Core.SSAValue).id]))
elseif isglobalref(f, Base, :materialize!)
add_ci_call!(
lb,
lv(:vmaterialize!),
ciₙargs,
syms,
n,
unroll_param_tup,
mod
)

elseif _ciₙ isa Expr && _ciₙ.head === :call
f = first(_ciₙ.args)
if isglobalref(f, Base, :materialize!)
add_ci_call!(lb, lv(:vmaterialize!), _ciₙ.args, syms, n, unroll_param_tup, mod)
elseif isglobalref(f, Base, :materialize)
add_ci_call!(
lb,
lv(:vmaterialize),
ciₙargs,
syms,
n,
unroll_param_tup,
mod
)
add_ci_call!(lb, lv(:vmaterialize), _ciₙ.args, syms, n, unroll_param_tup, mod)
else
add_ci_call!(lb, f, ciₙargs, syms, n)
add_ci_call!(lb, f, _ciₙ.args, syms, n)
end

else
push!(lb.args, Expr(:(=), syms[n], _ciₙ))
end
end

ret::Expr = pop!(lb.args)::Expr
if Meta.isexpr(ret, :(=), 2)
ret = (ret.args[2])::Expr
end

esc(Expr(:let, lb, Expr(:block, ret)))
end

20 changes: 5 additions & 15 deletions test/forwarddiffext.jl
@@ -16,21 +16,6 @@ function tovec(x::ForwardDiff.Dual{T,V,N}) where {T,V,N}
return ret
end

if LoopVectorization.ifelse !== Base.ifelse
@inline function NNlib.leakyrelu(
x::LoopVectorization.AbstractSIMD,
a = NNlib.oftf(x, NNlib.leakyrelu_a),
)
LoopVectorization.ifelse(x > zero(x), float(x), NNlib.oftf(x, a * x)) # max(a*x, x) is 3x slower
end
@inline function NNlib.leakyrelu(
x::ForwardDiff.Dual{<:Any,<:LoopVectorization.AbstractSIMD},
a = NNlib.oftf(x, NNlib.leakyrelu_a),
)
LoopVectorization.ifelse(x > zero(x), float(x), NNlib.oftf(x, a * x)) # max(a*x, x) is 3x slower
end
end

vx0 = randnvec()
vx1 = randnvec()
vx2 = randnvec()
@@ -50,3 +35,8 @@ vud = ForwardDiff.Dual(vu0, vu1, vu2)
reinterpret(Float64, NNlib.leakyrelu.(tovec(vd0)))
@test reinterpret(Float64, tovec(NNlib.leakyrelu(vud))) ≈
reinterpret(Float64, NNlib.leakyrelu.(tovec(vud)))

@test reinterpret(Float64, tovec(NNlib.relu(vd0))) ≈
reinterpret(Float64, NNlib.relu.(tovec(vd0)))
@test reinterpret(Float64, tovec(NNlib.relu(vud))) ≈
reinterpret(Float64, NNlib.relu.(tovec(vud)))