JuliaSIMD
diff --git a/‎README.md
Lines changed: 4 additions & 34 deletions b/‎README.md
Lines changed: 4 additions & 34 deletions
diff --git a/‎src/LoopVectorization.jl
Lines changed: 0 additions & 49 deletions b/‎src/LoopVectorization.jl
Lines changed: 0 additions & 49 deletions
diff --git a/‎src/broadcast.jl
Lines changed: 5 additions & 1 deletion b/‎src/broadcast.jl
Lines changed: 5 additions & 1 deletion
@@ -137,43 +137,13 @@ d2 = @avx @. a + B * c′;
 can be optimized in a similar manner to BLAS, albeit to a much smaller degree because the naive version already benefits from vectorization (unlike the naive BLAS).
 
 
-<!-- You can also use `\ast` to for a lazy matrix multiplication that can fuse with broadcasts. `.\ast` behaves similarly, to allow it's arguments to -->
+You can also use `\ast` for lazy matrix multiplication that can fuse with broadcasts. `.\ast` behaves similarly, escaping the broadcast (it is not applied elementwise). This allows you to use `@.` and fuse all the loops, even if the arguments to `\ast` are themselves broadcasted objects. However, it will often be the case that creating an intermediary is faster. I would recommend always checking whether splitting the operation into pieces, or at least isolating the matrix multiplication, increases performance. That will often be the case, especially if the matrices are large, where a separate multiplication can leverage BLAS (and perhaps take advantage of threads).
 
-
-
-Originally, LoopVectorization only provided a simple, dumb, transform on a single loop using the `@vectorize` macro. This transformation took element type and unroll factor arguments, performing no analysis of the loop, simply applying the specified arguments.
-For backwards compatability, this macro is still currently supported. However, it may eventually be deprecated.
-
-For example,
+At small sizes, this can be fast.
 ```julia
-function sum_simd(x)
-    s = zero(eltype(x))
-    @simd for xᵢ ∈ x
-        s += xᵢ
-    end
-    s
-end
-using LoopVectorization, BenchmarkTools
-function sum_loopvec(x::AbstractVector{Float64})
-    s = 0.0
-    @vectorize 4 for i ∈ eachindex(x)
-        s += x[i]
-    end
-    s
-end
-x = rand(110);
-@btime sum($x)
-#   20.527 ns (0 allocations: 0 bytes)
-# 53.38001667116997
-
-@btime sum_simd($x)
-#   16.749 ns (0 allocations: 0 bytes)
-# 53.38001667116997
-
-@btime sum_loopvec($x)
-#   12.022 ns (0 allocations: 0 bytes)
-# 53.38001667116997
+
 ```
 
 
 
+
@@ -13,55 +13,6 @@ export LowDimArray, stridedpointer, vectorizable,
     @avx, ∗,
     vmap, vmap!
 
-const SLEEFPiratesDict = Dict{Symbol,Tuple{Symbol,Symbol}}(
-    :sin => (:SLEEFPirates, :sin_fast),
-    :sinpi => (:SLEEFPirates, :sinpi),
-    :cos => (:SLEEFPirates, :cos_fast),
-    :cospi => (:SLEEFPirates, :cospi),
-    :tan => (:SLEEFPirates, :tan_fast),
-    # :log => (:SLEEFPirates, :log_fast),
-    :log => (:SIMDPirates, :vlog),
-    :log10 => (:SLEEFPirates, :log10),
-    :log2 => (:SLEEFPirates, :log2),
-    :log1p => (:SLEEFPirates, :log1p),
-    # :exp => (:SLEEFPirates, :exp),
-    :exp => (:SIMDPirates, :vexp),
-    :exp2 => (:SLEEFPirates, :exp2),
-    :exp10 => (:SLEEFPirates, :exp10),
-    :expm1 => (:SLEEFPirates, :expm1),
-    :inv => (:SIMDPirates, :vinv), # faster than sqrt_fast
-    :sqrt => (:SIMDPirates, :sqrt), # faster than sqrt_fast
-    :rsqrt => (:SIMDPirates, :rsqrt),
-    :cbrt => (:SLEEFPirates, :cbrt_fast),
-    :asin => (:SLEEFPirates, :asin_fast),
-    :acos => (:SLEEFPirates, :acos_fast),
-    :atan => (:SLEEFPirates, :atan_fast),
-    :sinh => (:SLEEFPirates, :sinh),
-    :cosh => (:SLEEFPirates, :cosh),
-    :tanh => (:SLEEFPirates, :tanh),
-    :asinh => (:SLEEFPirates, :asinh),
-    :acosh => (:SLEEFPirates, :acosh),
-    :atanh => (:SLEEFPirates, :atanh),
-    # :erf => :(SLEEFPirates.erf),
-    # :erfc => :(SLEEFPirates.erfc),
-    # :gamma => :(SLEEFPirates.gamma),
-    # :lgamma => :(SLEEFPirates.lgamma),
-    :trunc => (:SLEEFPirates, :trunc),
-    :floor => (:SLEEFPirates, :floor),
-    :ceil => (:SIMDPirates, :ceil),
-    :abs => (:SIMDPirates, :vabs),
-    :sincos => (:SLEEFPirates, :sincos_fast),
-    # :pow => (:SLEEFPirates, :pow_fast),
-    :^ => (:SLEEFPirates, :pow_fast),
-    # :sincospi => (:SLEEFPirates, :sincospi_fast),
-    # :pow => (:SLEEFPirates, :pow),
-    # :hypot => (:SLEEFPirates, :hypot_fast),
-    :mod => (:SLEEFPirates, :mod),
-    # :copysign => :copysign
-    :one => (:SIMDPirates, :vone),
-    :zero => (:SIMDPirates, :vzero),
-    :erf => (:SIMDPirates, :verf)
-)
 
 include("costs.jl")
 include("operations.jl")
 
@@ -2,10 +2,14 @@ struct Product{A,B}
     a::A
     b::B
 end
-function Base.size(p::Product)
+@inline function Base.size(p::Product)
     M = size(p.a, 1)
     (M, Base.tail(size(p.b))...)
 end
+@inline function Base.size(p::Product, i::Integer)
+    i == 1 && return size(p.a, 1)
+    size(p.b, i)
+end
 @inline Base.length(p::Product) = prod(size(p))
 @inline Base.broadcastable(p::Product) = p
 @inline Base.ndims(p::Type{Product{A,B}}) where {A,B} = ndims(B)