Skip to content

Commit c32809e

Browse files
committed
Resolve #13.
1 parent 80e7f22 commit c32809e

File tree

6 files changed

+23
-29
lines changed

6 files changed

+23
-29
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.3.2"
4+
version = "0.3.3"
55

66
[deps]
77
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ d2 = @avx @. a + B * c′;
157157
can be optimized in a similar manner to BLAS, albeit to a much smaller degree because the naive version already benefits from vectorization (unlike the naive BLAS).
158158

159159

160-
You can also use `∗` (which is typed `\ast` and not to be confused with `*`) for lazy matrix multiplication that can fuse with broadcasts. `.\ast` behaves similarly, escaping the broadcast (it is not applied elementwise). This allows you to use `@.` and fuse all the loops, even if the arguments to `\ast` are themselves broadcasted objects. However, it will often be the case that creating an intermediary is faster. I would recommend always checking if splitting the operation into pieces, or at least isolating the matrix multiplication, increases performance. That will often be the case, especially if the matrices are large, where a separate multiplication can leverage BLAS (and perhaps take advantage of threads).
160+
You can also use `*ˡ` (which is typed `*\^l`) for lazy matrix multiplication that can fuse with broadcasts. `.*ˡ` behaves similarly, escaping the broadcast (it is not applied elementwise). This allows you to use `@.` and fuse all the loops, even if the arguments to `*ˡ` are themselves broadcasted objects. However, it will often be the case that creating an intermediary is faster. I would recommend always checking if splitting the operation into pieces, or at least isolating the matrix multiplication, increases performance. That will often be the case, especially if the matrices are large, where a separate multiplication can leverage BLAS (and perhaps take advantage of threads).
161161

162162
At small sizes, this can be fast.
163163
```julia

src/LoopVectorization.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ using MacroTools: prewalk, postwalk
1010

1111

1212
export LowDimArray, stridedpointer, vectorizable,
13-
@avx, ∗,
13+
@avx, *ˡ, ∗,
1414
vmap, vmap!
1515

1616

src/broadcast.jl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,9 @@ end
3636
# recursive_eltype(ARGS)
3737
# end
3838

39-
@inline ∗(a::A, b::B) where {A,B} = Product{A,B}(a, b)
40-
@inline Base.Broadcast.broadcasted(::typeof(∗), a::A, b::B) where {A, B} = Product{A,B}(a, b)
39+
@inline *ˡ(a::A, b::B) where {A,B} = Product{A,B}(a, b)
40+
@inline Base.Broadcast.broadcasted(::typeof(*ˡ), a::A, b::B) where {A, B} = Product{A,B}(a, b)
41+
const ∗ = *ˡ
4142
# TODO: Need to make this handle A or B being (1 or 2)-D broadcast objects.
4243
function add_broadcast!(
4344
ls::LoopSet, mC::Symbol, bcname::Symbol, loopsyms::Vector{Symbol},

src/graphs.jl

Lines changed: 9 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -162,15 +162,6 @@ getop(ls::LoopSet, s::Symbol) = ls.opdict[s]
162162
getop(ls::LoopSet, i::Int) = ls.operations[i + 1]
163163

164164
@inline extract_val(::Val{N}) where {N} = N
165-
function determine_veced_increment(ls::LoopSet, iter::Symbol, isunrolled::Bool, W::Symbol, U::Int) # , istiled::Bool, ..., T::Int # may not be tiled
166-
if isunrolled
167-
Expr(:call, lv(:valmul), W, U)
168-
# elseif istiled
169-
# Expr(:call, lv(:valmul), W, T)
170-
else
171-
Expr(:call, lv(:extract_val), W)
172-
end
173-
end
174165
function vec_looprange(ls::LoopSet, s::Symbol, isunrolled::Bool, W::Symbol, U::Int, loop = ls.loops[s])
175166
incr = if isunrolled
176167
Expr(:call, lv(:valmuladd), W, U, -1)
@@ -191,15 +182,15 @@ function looprange(ls::LoopSet, s::Symbol, incr::Int = 1, mangledname::Symbol =
191182
Expr(:call, :<, mangledname, loop.hintexact ? loop.rangehint - incr : Expr(:call, :-, loop.rangesym, incr))
192183
end
193184
end
194-
function looprange(ls::LoopSet, s::Symbol, incr::Expr, mangledname::Symbol = s, loop = ls.loops[s])
195-
increxpr = Expr(:call, :-, incr, 1)
196-
increxpr = if loop.hintexact
197-
Expr(:call, :-, loop.rangehint, increxpr)
198-
else
199-
Expr(:call, :-, loop.rangesym, increxpr)
200-
end
201-
Expr(:call, :<, mangledname, increxpr)
202-
end
185+
# function looprange(ls::LoopSet, s::Symbol, incr::Expr, mangledname::Symbol = s, loop = ls.loops[s])
186+
# increxpr = Expr(:call, :-, incr, 1)
187+
# increxpr = if loop.hintexact
188+
# Expr(:call, :-, loop.rangehint, increxpr)
189+
# else
190+
# Expr(:call, :-, loop.rangesym, increxpr)
191+
# end
192+
# Expr(:call, :<, mangledname, increxpr)
193+
# end
203194

204195
function Base.length(ls::LoopSet, is::Symbol)
205196
ls.loops[is].rangehint

test/runtests.jl

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,13 @@ using LinearAlgebra
6262
C[m,n] = Cₘₙ
6363
end
6464
end
65-
function AmuladdBavx!(C, A, B)
65+
function AmuladdBavx!(C, A, B, factor = 1)
6666
@avx for m ∈ 1:size(A,1), n ∈ 1:size(B,2)
6767
ΔCₘₙ = zero(eltype(C))
6868
for k ∈ 1:size(A,2)
6969
ΔCₘₙ += A[m,k] * B[k,n]
7070
end
71-
C[m,n] += ΔCₘₙ
71+
C[m,n] += ΔCₘₙ * factor
7272
end
7373
end
7474

@@ -178,6 +178,8 @@ using LinearAlgebra
178178
@test C ≈ C2
179179
AmuladdBavx!(C, A, B)
180180
@test C ≈ 2C2
181+
AmuladdBavx!(C, A, B, -1)
182+
@test C ≈ C2
181183
At = copy(A');
182184
fill!(C, 9999.999); AtmulBavx!(C, At, B)
183185
@test C ≈ C2
@@ -475,25 +477,25 @@ end
475477

476478
d3 = a .+ B * c;
477479
# no method matching _similar_for(::UnitRange{Int64}, ::Type{Any}, ::Product)
478-
d4 = @avx a .+ B ∗ c;
480+
d4 = @avx a .+ B *ˡ c;
479481
@test d3 ≈ d4
480482

481483
fill!(d3, -1000.0);
482484
fill!(d4, 91000.0);
483485

484486
d3 .= a .+ B * c;
485-
@avx d4 .= a .+ B ∗ c;
487+
@avx d4 .= a .+ B *ˡ c;
486488
@test d3 ≈ d4
487489

488490
fill!(d4, 91000.0);
489-
@avx @. d4 = a + B ∗ c;
491+
@avx @. d4 = a + B *ˡ c;
490492
@test d3 ≈ d4
491493

492494
M, K, N = 77, 83, 57;
493495
A = rand(T,M,K); B = rand(T,K,N); C = rand(T,M,N);
494496

495497
D1 = C .+ A * B;
496-
D2 = @avx C .+ A ∗ B;
498+
D2 = @avx C .+ A *ˡ B;
497499
@test D1 ≈ D2
498500

499501
D3 = exp.(B');

0 commit comments

Comments
 (0)