Skip to content

Commit 6c2de2b

Browse files
committed
Defined Instruction type that carries module Symbol for sake of future extensibility.
1 parent 374c84d commit 6c2de2b

File tree

9 files changed

+207
-264
lines changed

9 files changed

+207
-264
lines changed

README.md

Lines changed: 4 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -137,43 +137,13 @@ d2 = @avx @. a + B * c′;
137137
can be optimized in a similar manner to BLAS, albeit to a much smaller degree because the naive version already benefits from vectorization (unlike the naive BLAS).
138138

139139

140-
<!-- You can also use `\ast` for a lazy matrix multiplication that can fuse with broadcasts. `.\ast` behaves similarly, to allow its arguments to -->
140+
You can also use `\ast` for lazy matrix multiplication that can fuse with broadcasts. `.\ast` behaves similarly, escaping the broadcast (it is not applied elementwise). This allows you to use `@.` and fuse all the loops, even if the arguments to `\ast` are themselves broadcasted objects. However, it will often be the case that creating an intermediary is faster. I would recommend always checking whether splitting the operation into pieces, or at least isolating the matrix multiplication, increases performance. That will often be the case, especially if the matrices are large, where a separate multiplication can leverage BLAS (and perhaps take advantage of threads).
141141

142-
143-
144-
Originally, LoopVectorization only provided a simple, dumb, transform on a single loop using the `@vectorize` macro. This transformation took element type and unroll factor arguments, performing no analysis of the loop, simply applying the specified arguments.
145-
For backwards compatibility, this macro is still currently supported. However, it may eventually be deprecated.
146-
147-
For example,
142+
At small sizes, this can be fast.
148143
```julia
149-
function sum_simd(x)
150-
s = zero(eltype(x))
151-
@simd for xᵢ ∈ x
152-
s += xᵢ
153-
end
154-
s
155-
end
156-
using LoopVectorization, BenchmarkTools
157-
function sum_loopvec(x::AbstractVector{Float64})
158-
s = 0.0
159-
@vectorize 4 for i ∈ eachindex(x)
160-
s += x[i]
161-
end
162-
s
163-
end
164-
x = rand(110);
165-
@btime sum($x)
166-
# 20.527 ns (0 allocations: 0 bytes)
167-
# 53.38001667116997
168-
169-
@btime sum_simd($x)
170-
# 16.749 ns (0 allocations: 0 bytes)
171-
# 53.38001667116997
172-
173-
@btime sum_loopvec($x)
174-
# 12.022 ns (0 allocations: 0 bytes)
175-
# 53.38001667116997
144+
176145
```
177146

178147

179148

149+

src/LoopVectorization.jl

Lines changed: 0 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -13,55 +13,6 @@ export LowDimArray, stridedpointer, vectorizable,
1313
@avx, ∗,
1414
vmap, vmap!
1515

16-
const SLEEFPiratesDict = Dict{Symbol,Tuple{Symbol,Symbol}}(
17-
:sin => (:SLEEFPirates, :sin_fast),
18-
:sinpi => (:SLEEFPirates, :sinpi),
19-
:cos => (:SLEEFPirates, :cos_fast),
20-
:cospi => (:SLEEFPirates, :cospi),
21-
:tan => (:SLEEFPirates, :tan_fast),
22-
# :log => (:SLEEFPirates, :log_fast),
23-
:log => (:SIMDPirates, :vlog),
24-
:log10 => (:SLEEFPirates, :log10),
25-
:log2 => (:SLEEFPirates, :log2),
26-
:log1p => (:SLEEFPirates, :log1p),
27-
# :exp => (:SLEEFPirates, :exp),
28-
:exp => (:SIMDPirates, :vexp),
29-
:exp2 => (:SLEEFPirates, :exp2),
30-
:exp10 => (:SLEEFPirates, :exp10),
31-
:expm1 => (:SLEEFPirates, :expm1),
32-
:inv => (:SIMDPirates, :vinv), # faster than sqrt_fast
33-
:sqrt => (:SIMDPirates, :sqrt), # faster than sqrt_fast
34-
:rsqrt => (:SIMDPirates, :rsqrt),
35-
:cbrt => (:SLEEFPirates, :cbrt_fast),
36-
:asin => (:SLEEFPirates, :asin_fast),
37-
:acos => (:SLEEFPirates, :acos_fast),
38-
:atan => (:SLEEFPirates, :atan_fast),
39-
:sinh => (:SLEEFPirates, :sinh),
40-
:cosh => (:SLEEFPirates, :cosh),
41-
:tanh => (:SLEEFPirates, :tanh),
42-
:asinh => (:SLEEFPirates, :asinh),
43-
:acosh => (:SLEEFPirates, :acosh),
44-
:atanh => (:SLEEFPirates, :atanh),
45-
# :erf => :(SLEEFPirates.erf),
46-
# :erfc => :(SLEEFPirates.erfc),
47-
# :gamma => :(SLEEFPirates.gamma),
48-
# :lgamma => :(SLEEFPirates.lgamma),
49-
:trunc => (:SLEEFPirates, :trunc),
50-
:floor => (:SLEEFPirates, :floor),
51-
:ceil => (:SIMDPirates, :ceil),
52-
:abs => (:SIMDPirates, :vabs),
53-
:sincos => (:SLEEFPirates, :sincos_fast),
54-
# :pow => (:SLEEFPirates, :pow_fast),
55-
:^ => (:SLEEFPirates, :pow_fast),
56-
# :sincospi => (:SLEEFPirates, :sincospi_fast),
57-
# :pow => (:SLEEFPirates, :pow),
58-
# :hypot => (:SLEEFPirates, :hypot_fast),
59-
:mod => (:SLEEFPirates, :mod),
60-
# :copysign => :copysign
61-
:one => (:SIMDPirates, :vone),
62-
:zero => (:SIMDPirates, :vzero),
63-
:erf => (:SIMDPirates, :verf)
64-
)
6516

6617
include("costs.jl")
6718
include("operations.jl")

src/broadcast.jl

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,14 @@ struct Product{A,B}
22
a::A
33
b::B
44
end
5-
function Base.size(p::Product)
5+
@inline function Base.size(p::Product)
66
M = size(p.a, 1)
77
(M, Base.tail(size(p.b))...)
88
end
9+
@inline function Base.size(p::Product, i::Integer)
10+
i == 1 && return size(p.a, 1)
11+
size(p.b, i)
12+
end
913
@inline Base.length(p::Product) = prod(size(p))
1014
@inline Base.broadcastable(p::Product) = p
1115
@inline Base.ndims(p::Type{Product{A,B}}) where {A,B} = ndims(B)

0 commit comments

Comments
 (0)