Commit 320cf3a

merged main
2 parents df1424c + befd727 commit 320cf3a

File tree

11 files changed: +153 −44 lines changed


Project.toml

Lines changed: 2 additions & 1 deletion

@@ -1,7 +1,8 @@
 name = "LoopVectorization"
 uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
 authors = ["Chris Elrod <[email protected]>"]
-version = "0.12.160"
+version = "0.12.163"
+
 
 [deps]
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"

README.md

Lines changed: 24 additions & 0 deletions

@@ -273,6 +273,30 @@ BenchmarkTools.Trial:
   evals/sample: 6
 ```
 
+Note: `@turbo` does not support passing keyword arguments to the function calls to which it is applied, e.g.:
+```julia
+julia> @turbo round.(rand(10));
+
+julia> @turbo round.(rand(10); digits = 3)
+ERROR: TypeError: in typeassert, expected Expr, got a value of type GlobalRef
+```
+
+You can work around this by creating a callable wrapper before applying `@turbo`, as follows:
+```julia
+struct KwargCall{F,T}
+  f::F
+  x::T
+end
+@inline (f::KwargCall)(args...) = f.f(args...; f.x...)
+
+f = KwargCall(round, (digits = 3,));
+@turbo f.(rand(10))
+10-element Vector{Float64}:
+ 0.763
+ ⋮
+ 0.851
+```
+
 </p>
 </details>
 
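The `KwargCall` trick in the diff above freezes keyword arguments into a callable object so that the call site only passes positional arguments. The same pattern is language-agnostic; here is a minimal Python sketch of the idea (the `KwargCall` name and example values are illustrative, not part of any library):

```python
class KwargCall:
    """Wrap a function together with frozen keyword arguments so the
    resulting object can be called with positional arguments only."""

    def __init__(self, f, **kwargs):
        self.f = f
        self.kwargs = kwargs

    def __call__(self, *args):
        # Forward positional args, splice in the frozen keyword args.
        return self.f(*args, **self.kwargs)

# Freeze ndigits=3 into a positional-only callable, mirroring
# `KwargCall(round, (digits = 3,))` from the Julia example.
round3 = KwargCall(round, ndigits=3)
print([round3(x) for x in (0.7634, 0.8512)])  # [0.763, 0.851]
```

In Python one would normally reach for `functools.partial(round, ndigits=3)`; the explicit struct form mirrors the Julia workaround, where the wrapper type keeps the call site free of keyword syntax.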

docs/src/devdocs/evaluating_loops.md

Lines changed: 2 additions & 2 deletions

@@ -1,6 +1,6 @@
 # Determining the strategy for evaluating loops
 
-The heart of the optimizatizations performed by LoopVectorization are given in the [determinestrategy.jl](https://github.com/JuliaSIMD/LoopVectorization.jl/blob/master/src/determinestrategy.jl) file utilizing instruction costs specified in [costs.jl](https://github.com/JuliaSIMD/LoopVectorization.jl/blob/master/src/costs.jl).
+The heart of the optimizations performed by LoopVectorization are given in the [determinestrategy.jl](https://github.com/JuliaSIMD/LoopVectorization.jl/blob/master/src/modeling/determinestrategy.jl) file utilizing instruction costs specified in [costs.jl](https://github.com/JuliaSIMD/LoopVectorization.jl/blob/master/src/modeling/costs.jl).
 Essentially, it estimates the cost of different means of evaluating the loops. It iterates through the different possible loop orders, as well as considering which loops to unroll, and which to vectorize. It will consider unrolling 1 or 2 loops (but it could settle on unrolling by a factor of 1, i.e. not unrolling), and vectorizing 1.
 
 The cost estimate is based on the costs of individual instructions and the number of times each one needs to be executed for the given strategy. The instruction cost can be broken into several components:

@@ -14,7 +14,7 @@ Data on individual instructions for specific architectures can be found on [Agne
 Examples of how these come into play:
 - Vectorizing a loop will result in each instruction evaluating multiple iterations, but the costs of loads and stores will change based on the memory layouts of the accessed arrays.
 - Unrolling can help reduce the number of times an operation must be performed, for example if it can allow us to reuse memory multiple times rather than reloading it every time it is needed.
-- When there is a reduction, such as performing a sum, there is a dependency chain. Each `+` has to wait for the previous `+` to finish executing before it can begin, thus execution time is bounded by latency rather than minimum of the throughput of the `+` and load operations. By unrolling the loop, we can create multiple independent dependency chains.
+- When there is a reduction, such as performing a sum, there is a dependency chain. Each `+` has to wait for the previous `+` to finish executing before it can begin, thus execution time is bounded by latency rather than the minimum of the throughput of the `+` and load operations. By unrolling the loop, we can create multiple independent dependency chains.
 
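The dependency-chain point in the devdocs hunk above can be made concrete with a back-of-envelope cycle count: a serial reduction costs roughly `N × latency`, while unrolling into `U` independent accumulators shortens each chain by a factor of `U` until throughput becomes the bound. A toy model (this is not LoopVectorization's actual cost code, and the latency/throughput numbers are illustrative):

```python
def reduction_cycles(n, latency, recip_throughput, unroll):
    """Estimate cycles to sum n elements using `unroll` independent
    accumulator chains. Each chain serializes its own adds (latency-bound);
    total issue pressure gives a throughput lower bound; the accumulators
    must be folded together at the end."""
    chain = (n / unroll) * latency      # serialized adds in one chain
    issue = n * recip_throughput        # throughput bound across all adds
    combine = (unroll - 1) * latency    # fold the accumulators at the end
    return max(chain, issue) + combine

# Example: add latency 4 cycles, reciprocal throughput 0.5 cycles/op.
print(reduction_cycles(1024, 4, 0.5, 1))  # 4096.0 (latency-bound)
print(reduction_cycles(1024, 4, 0.5, 8))  # 540.0 (near throughput-bound)
```

The jump from 4096 to 540 estimated cycles is the payoff the devdocs describe: unrolling creates independent chains so the adder's latency stops dominating.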

docs/src/getting_started.md

Lines changed: 25 additions & 0 deletions

@@ -38,6 +38,31 @@ Aside from loops, `LoopVectorization.jl` also supports broadcasting.
 !!! danger
     Broadcasting an `Array` `A` when `size(A,1) == 1` is NOT SUPPORTED, unless this is known at compile time (e.g., broadcasting a transposed vector is fine). Otherwise, you will probably crash Julia.
 
+Note: `@turbo` does not support passing keyword arguments to the function calls to which it is applied, e.g.:
+```julia
+julia> @turbo round.(rand(10));
+
+julia> @turbo round.(rand(10); digits = 3);
+ERROR: TypeError: in typeassert, expected Expr, got a value of type GlobalRef
+```
+
+You can work around this by creating a callable wrapper before applying `@turbo`, as follows:
+```julia
+struct KwargCall{F,T}
+  f::F
+  x::T
+end
+@inline (f::KwargCall)(args...) = f.f(args...; f.x...)
+
+f = KwargCall(round, (digits = 3,));
+@turbo f.(rand(10))
+10-element Vector{Float64}:
+ 0.763
+ ⋮
+ 0.851
+```
+
+
 ```julia
 julia> using LoopVectorization, BenchmarkTools
 

docs/src/vectorized_convenience_functions.md

Lines changed: 18 additions & 0 deletions

@@ -132,4 +132,22 @@ julia> @btime mapreduce(hypot, +, $x, $y)
 96.75538300513509
 ```
 
+## vsum
+
+Vectorized version of `sum`. `vsum(f, a)` applies `f(a[i])` for `i in eachindex(a)`, then sums the results.
+
+```julia
+julia> using LoopVectorization, BenchmarkTools
+
+julia> x = rand(127);
+
+julia> @btime vsum(hypot, $x)
+  12.095 ns (0 allocations: 0 bytes)
+66.65246070098374
+
+julia> @btime sum(hypot, $x)
+  16.992 ns (0 allocations: 0 bytes)
+66.65246070098372
+```
+
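The `vsum(f, a)` contract documented above — apply `f` elementwise, then sum — can be pinned down with a plain, unvectorized reference model. A Python sketch of the semantics only (the real `vsum` is SIMD-vectorized Julia; `vsum_ref` is a made-up name for checking behavior):

```python
def vsum_ref(f, a=None):
    """Reference semantics of vsum: vsum_ref(a) sums a,
    vsum_ref(f, a) sums f applied to each element of a."""
    if a is None:
        # Single-argument form: the "function" slot actually holds the data.
        f, a = (lambda v: v), f
    return sum(f(x) for x in a)

x = [1.0, 2.0, 3.0]
print(vsum_ref(lambda v: v * v, x))  # 14.0  (like vsum(abs2, x))
print(vsum_ref(x))                   # 6.0   (like vsum(x))
```

This mirrors the Julia definitions in the diff, where `vsum(f, A)` lowers to `vmapreduce(f, +, A)` and `vsum(A)` is `vsum(identity, A)`.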

src/LoopVectorization.jl

Lines changed: 2 additions & 0 deletions

@@ -196,6 +196,7 @@ export LowDimArray,
   vfilter,
   vfilter!,
   vmapreduce,
+  vsum,
   vreduce,
   vcount
 

@@ -245,6 +246,7 @@ loop-reordering so as to improve performance:
 - [`@turbo`](@ref): transform `for`-loops and broadcasting
 - [`vmapreduce`](@ref): vectorized version of `mapreduce`
 - [`vreduce`](@ref): vectorized version of `reduce`
+- [`vsum`](@ref): vectorized version of `sum`
 - [`vmap`](@ref) and `vmap!`: vectorized version of `map` and `map!`
 - [`vmapnt`](@ref) and `vmapnt!`: non-temporal variants of `vmap` and `vmap!`
 - [`vmapntt`](@ref) and `vmapntt!`: threaded variants of `vmapnt` and `vmapnt!`

src/parse/add_compute.jl

Lines changed: 29 additions & 1 deletion

@@ -497,7 +497,7 @@ function add_compute!(
       return add_pow!(ls, var, args[1], arg2num, elementbytes, position)
     end
   elseif instr.instr === :oftype && length(args) == 2
-    return getop(ls, args[2], elementbytes)
+    return get_arg!(ls, args[2], elementbytes, position)
   end
   vparents = Operation[]
   deps = Symbol[]

@@ -760,6 +760,34 @@ function add_compute_ifelse!(
   )
   pushop!(ls, op, LHS)
 end
+function get_arg!(
+  ls::LoopSet,
+  @nospecialize(x),
+  elementbytes::Int,
+  position::Int
+)::Operation
+  if x isa Expr
+    add_operation!(
+      ls,
+      Symbol("###xpow###$(length(operations(ls)))###"),
+      x,
+      elementbytes,
+      position
+    )::Operation
+  elseif x isa Symbol
+    if x ∈ ls.loopsymbols
+      add_loopvalue!(ls, x, elementbytes)
+    else
+      xo = get(ls.opdict, x, nothing)
+      xo === nothing && return add_constant!(ls, x, elementbytes)::Operation
+      return xo
+    end
+  elseif x isa Number
+    return add_constant!(ls, x, elementbytes)::Operation
+  else
+    throw("objects of type $x not supported as arg")
+  end
+end
 
 # adds x ^ (p::Real)
 function add_pow!(
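The control flow of `get_arg!` in the hunk above is a three-way dispatch on the argument's syntactic class: nested expressions become new operations, loop variables become loop values, known symbols resolve to existing operations, and unknown symbols or literals become constants. A schematic Python rendering of that dispatch (the classes and return tags here are stand-ins, not the real `LoopSet` API):

```python
def classify_arg(x, loop_symbols, op_dict):
    """Mirror get_arg!'s branching. Tuples stand in for Julia Expr nodes,
    strings for Symbols, and the returned tag names the handler that
    get_arg! would invoke."""
    if isinstance(x, tuple):            # Expr: build a new operation
        return ("operation", x)
    if isinstance(x, str):              # Symbol
        if x in loop_symbols:           # a loop induction variable
            return ("loopvalue", x)
        if x in op_dict:                # already-known operation
            return ("existing", op_dict[x])
        return ("constant", x)          # outer-scope constant
    if isinstance(x, (int, float)):     # numeric literal
        return ("constant", x)
    raise TypeError(f"objects of type {type(x).__name__} not supported as arg")

print(classify_arg("i", {"i", "j"}, {}))  # ('loopvalue', 'i')
print(classify_arg(2.5, set(), {}))       # ('constant', 2.5)
```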

src/simdfunctionals/mapreduce.jl

Lines changed: 11 additions & 0 deletions

@@ -1,3 +1,4 @@
+import VectorizationBase: vsum
 
 @inline vreduce(::typeof(+), v::VectorizationBase.AbstractSIMDVector) = vsum(v)
 @inline vreduce(::typeof(*), v::VectorizationBase.AbstractSIMDVector) = vprod(v)

@@ -107,6 +108,16 @@ end
 end
 @inline vmapreduce(f, op, args...) = mapreduce(f, op, args...)
 
+"""
+    vsum(A::DenseArray)
+    vsum(f, A::DenseArray)
+
+Vectorized version of `sum`. Providing a function as the first argument
+will apply the function to each element of `A` before summing.
+"""
+@inline vsum(f::F, A::AbstractArray{T}) where {F,T<:NativeTypes} = vmapreduce(f, +, A)
+@inline vsum(A::AbstractArray{T}) where {T<:NativeTypes} = vsum(identity, A)
+
 length_one_axis(::Base.OneTo) = Base.OneTo(1)
 length_one_axis(::Any) = 1:1
 
src/vectorizationbase_compat/contract_pass.jl

Lines changed: 33 additions & 36 deletions

@@ -1,46 +1,42 @@
 
-mulexprcost(::Number) = 0
-mulexprcost(::Symbol) = 1
-function mulexprcost(ex::Expr)
-  base = ex.head === :call ? 10 : 1
-  base + length(ex.args)
+const ProdArg = Union{Symbol,Expr,Number}
+function mulexprcost(@nospecialize(x::ProdArg))::Int
+  if x isa Number
+    return 0
+  elseif x isa Symbol
+    return 1
+  else
+    ex = x::Expr
+    base = ex.head === :call ? 10 : 1
+    return base + length(ex.args)
+  end
 end
-function mul_fast_expr(args)
+function mul_fast_expr(
+  args::SubArray{Any,1,Vector{Any},Tuple{UnitRange{Int}},true}
+)::Expr
   b = Expr(:call, :mul_fast)
   for i ∈ 2:length(args)
     push!(b.args, args[i])
   end
   b
 end
-function mulexpr(mulexargs)
-  a = (mulexargs[1])::Union{Symbol,Expr,Number}
-  if length(mulexargs) == 2
-    return (a, mulexargs[2]::Union{Symbol,Expr,Number})
-  elseif length(mulexargs) == 3
-    # We'll calc the product between the guesstimated cheaper two args first, for better out of order execution
-    b = (mulexargs[2])::Union{Symbol,Expr,Number}
-    c = (mulexargs[3])::Union{Symbol,Expr,Number}
-    ac = mulexprcost(a)
-    bc = mulexprcost(b)
-    cc = mulexprcost(c)
-    maxc = max(ac, bc, cc)
-    if ac == maxc
-      return (a, Expr(:call, :mul_fast, b, c))
-    elseif bc == maxc
-      return (b, Expr(:call, :mul_fast, a, c))
-    else
-      return (c, Expr(:call, :mul_fast, a, b))
-    end
-  else
-    return (a, mul_fast_expr(mulexargs))
-  end
-  a = (mulexargs[1])::Union{Symbol,Expr,Number}
-  b = if length(mulexargs) == 2 # two arg mul
-    (mulexargs[2])::Union{Symbol,Expr,Number}
-  else
-    mul_fast_expr(mulexargs)
-  end
-  a, b
+function mulexpr(
+  mulexargs::SubArray{Any,1,Vector{Any},Tuple{UnitRange{Int}},true}
+)::Tuple{ProdArg,ProdArg}
+  a = (mulexargs[1])::ProdArg
+  Nexpr = length(mulexargs)
+  Nexpr == 2 && return (a, mulexargs[2]::ProdArg)
+  Nexpr != 3 && return (a, mul_fast_expr(mulexargs))
+  # We'll calc the product between the guesstimated cheaper two args first, for better out of order execution
+  b = (mulexargs[2])::ProdArg
+  c = (mulexargs[3])::ProdArg
+  ac = mulexprcost(a)
+  bc = mulexprcost(b)
+  cc = mulexprcost(c)
+  maxc = max(ac, bc, cc)
+  ac == maxc && return (a, Expr(:call, :mul_fast, b, c))
+  bc == maxc && return (b, Expr(:call, :mul_fast, c, a))
+  return (c, Expr(:call, :mul_fast, a, b))
 end
 function append_args_skip!(call, args, i, mod)
   for j ∈ eachindex(args)

@@ -222,7 +218,8 @@ function capture_a_muladd(ex::Expr, mod)
   end
   true, call
 end
-capture_muladd(ex::Expr, mod) = while true
+capture_muladd(ex::Expr, mod) =
+  while true
   ex.head === :ref && return ex
   if Meta.isexpr(ex, :call, 2)
     if (ex.args[1] === :(-))
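The refactored `mulexpr` keeps the same heuristic as before: estimate each factor's cost, multiply the two cheapest factors first, and defer the most expensive one to the final multiply so it can be evaluated in parallel with the cheap product. A small Python sketch of that selection, using the same 0 / 1 / `10 + nargs` weights as `mulexprcost` (tuples stand in for call expressions; names are illustrative):

```python
def mul_cost(x):
    """0 for numeric literals, 1 for symbols, 10 + number of call args
    for call expressions, written here as ('call', fname, *args)."""
    if isinstance(x, (int, float)):
        return 0
    if isinstance(x, str):
        return 1
    base = 10 if x[0] == "call" else 1
    return base + (len(x) - 1)  # len(x) - 1 mimics length(ex.args)

def split_mul(a, b, c):
    """Return (deferred, (first, second)): compute first*second eagerly,
    deferring the costliest factor, like mulexpr's three-argument case."""
    costs = [mul_cost(a), mul_cost(b), mul_cost(c)]
    maxc = max(costs)
    if costs[0] == maxc:
        return a, (b, c)
    if costs[1] == maxc:
        return b, (c, a)
    return c, (a, b)

# A call expression is costliest, so the two cheap factors pair up first:
print(split_mul(("call", "f", "x"), "y", 2))  # (('call', 'f', 'x'), ('y', 2))
```

The payoff is instruction-level parallelism: while the expensive factor is still being computed, the cheap product is already available, shortening the critical path of the fused multiply.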

test/mapreduce.jl

Lines changed: 3 additions & 0 deletions

@@ -60,6 +60,9 @@
 end
 @test vmapreduce(log, +, x) ≈ sum(log, x)
 @test vmapreduce(abs2, +, x) ≈ sum(abs2, x)
+@test vsum(log, x) ≈ sum(log, x)
+@test vsum(abs2, x) ≈ sum(abs2, x)
+@test vsum(x) ≈ sum(x)
 @test maximum(x) == vreduce(max, x) == maximum_avx(x)
 @test minimum(x) == vreduce(min, x) == minimum_avx(x)
 

0 commit comments
