
Commit 5272ab8

Fixed broadcasting where args may be broadcasted objects.
1 parent 0c96b40 commit 5272ab8
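
In rough terms, the case being fixed is a fused broadcast in which an argument of the lazy matrix-multiplication operator `*ˡ` is itself a broadcasted object rather than a plain array. A minimal sketch based on the test added below (array sizes here are illustrative, not from the diff):

```julia
using LoopVectorization, Test

A = rand(5, 7); b = rand(7); x = rand(4); C = rand(5, 4);
D1 = C .+ A * (b .+ x')           # eager reference: materialize (b .+ x'), then multiply
D2 = @avx @. C + A *ˡ (b + x')    # fused: (b + x') stays a lazy Broadcasted argument to *ˡ
@test D1 ≈ D2
```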

File tree: 3 files changed (+207, -97 lines)

README.md

Lines changed: 187 additions & 89 deletions
@@ -29,27 +29,45 @@ The macro assumes that loop iterations can be reordered. It also currently suppo
 
 A simple example with a single loop is the dot product:
 ```julia
-using LoopVectorization, BenchmarkTools
-function mydot(a, b)
-    s = 0.0
-    @inbounds @simd for i ∈ eachindex(a,b)
-        s += a[i]*b[i]
-    end
-    s
-end
-function mydotavx(a, b)
-    s = 0.0
-    @avx for i ∈ eachindex(a,b)
-        s += a[i]*b[i]
-    end
-    s
-end
-a = rand(256); b = rand(256);
-@btime mydot($a, $b)
-@btime mydotavx($a, $b)
-a = rand(43); b = rand(43);
-@btime mydot($a, $b)
-@btime mydotavx($a, $b)
+julia> using LoopVectorization, BenchmarkTools
+
+julia> function mydot(a, b)
+           s = 0.0
+           @inbounds @simd for i ∈ eachindex(a,b)
+               s += a[i]*b[i]
+           end
+           s
+       end
+mydot (generic function with 1 method)
+
+julia> function mydotavx(a, b)
+           s = 0.0
+           @avx for i ∈ eachindex(a,b)
+               s += a[i]*b[i]
+           end
+           s
+       end
+mydotavx (generic function with 1 method)
+
+julia> a = rand(256); b = rand(256);
+
+julia> @btime mydot($a, $b)
+  12.273 ns (0 allocations: 0 bytes)
+62.61049816874535
+
+julia> @btime mydotavx($a, $b)
+  11.618 ns (0 allocations: 0 bytes)
+62.61049816874536
+
+julia> a = rand(255); b = rand(255);
+
+julia> @btime mydot($a, $b)
+  36.539 ns (0 allocations: 0 bytes)
+62.29537331565549
+
+julia> @btime mydotavx($a, $b)
+  11.739 ns (0 allocations: 0 bytes)
+62.29537331565549
 ```
 
 On most recent CPUs, the performance of the dot product is bounded by
@@ -59,25 +77,41 @@ However, the dot product requires two loads per `fma`.
 
 A self-dot function, on the other hand, requires one load per fma:
 ```julia
-function myselfdot(a)
-    s = 0.0
-    @inbounds @simd for i ∈ eachindex(a)
-        s += a[i]*a[i]
-    end
-    s
-end
-function myselfdotavx(a)
-    s = 0.0
-    @avx for i ∈ eachindex(a)
-        s += a[i]*a[i]
-    end
-    s
-end
-a = rand(256);
-@btime myselfdotavx($a)
-@btime myselfdot($a)
-@btime myselfdotavx($b)
-@btime myselfdot($b)
+julia> function myselfdot(a)
+           s = 0.0
+           @inbounds @simd for i ∈ eachindex(a)
+               s += a[i]*a[i]
+           end
+           s
+       end
+myselfdot (generic function with 1 method)
+
+julia> function myselfdotavx(a)
+           s = 0.0
+           @avx for i ∈ eachindex(a)
+               s += a[i]*a[i]
+           end
+           s
+       end
+myselfdotavx (generic function with 1 method)
+
+julia> a = rand(256);
+
+julia> @btime myselfdot($a)
+  8.578 ns (0 allocations: 0 bytes)
+90.16636687132868
+
+julia> @btime myselfdotavx($a)
+  9.560 ns (0 allocations: 0 bytes)
+90.16636687132868
+
+julia> @btime myselfdot($b)
+  28.923 ns (0 allocations: 0 bytes)
+83.20114563267853
+
+julia> @btime myselfdotavx($b)
+  9.174 ns (0 allocations: 0 bytes)
+83.20114563267856
 ```
 For this reason, the `@avx` version is roughly twice as fast. The `@inbounds @simd` version, however, is not, because it runs into the problem of loop carried dependencies: to add `a[i]*b[i]` to `s_new = s_old + a[i-j]*b[i-j]`, we must have first finished calculating `s_new`, but -- while two `fma` instructions can be initiated per cycle -- they each take several clock cycles to complete.
 For this reason, we need to unroll the operation to run several independent instances concurrently. The `@avx` macro models this cost to try and pick an optimal unroll factor.
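
To make that unrolling concrete, here is a hand-written sketch (not part of this diff) with four independent accumulators; each accumulator advances its own fma chain, so the chains overlap instead of serializing:

```julia
# Hand-unrolled dot product with four independent accumulators (illustrative only;
# assumes axes(a) == axes(b)). @avx picks an unroll factor like this automatically.
function mydot_unrolled4(a, b)
    s1 = s2 = s3 = s4 = 0.0
    i = firstindex(a)
    @inbounds while i + 3 <= lastindex(a)
        s1 += a[i]   * b[i]
        s2 += a[i+1] * b[i+1]
        s3 += a[i+2] * b[i+2]
        s4 += a[i+3] * b[i+3]
        i += 4
    end
    @inbounds while i <= lastindex(a)   # remainder loop
        s1 += a[i] * b[i]
        i += 1
    end
    (s1 + s2) + (s3 + s4)
end
```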
@@ -94,34 +128,83 @@ Note that 14 and 12 nm Ryzen chips can only do 1 full width `fma` per clock cycl
 
 We can also vectorize fancier loops. A likely familiar example to dive into:
 ```julia
-function mygemm!(C, A, B)
-    @inbounds for i ∈ 1:size(A,1), j ∈ 1:size(B,2)
-        Cᵢⱼ = 0.0
-        @fastmath for k ∈ 1:size(A,2)
-            Cᵢⱼ += A[i,k] * B[k,j]
-        end
-        C[i,j] = Cᵢⱼ
-    end
-end
-function mygemmavx!(C, A, B)
-    @avx for i ∈ 1:size(A,1), j ∈ 1:size(B,2)
-        Cᵢⱼ = 0.0
-        for k ∈ 1:size(A,2)
-            Cᵢⱼ += A[i,k] * B[k,j]
-        end
-        C[i,j] = Cᵢⱼ
-    end
-end
-M, K, N = 72, 75, 71;
-C1 = Matrix{Float64}(undef, M, N); A = randn(M, K); B = randn(K, N);
-C2 = similar(C1); C3 = similar(C1);
-@btime mygemmavx!($C1, $A, $B)
-@btime mygemm!($C2, $A, $B)
-using LinearAlgebra, Test
-@test all(C1 .≈ C2)
-BLAS.set_num_threads(1); BLAS.vendor()
-@btime mul!($C3, $A, $B)
-@test all(C1 .≈ C3)
+julia> function mygemm!(𝐂, 𝐀, 𝐁)
+           @inbounds @fastmath for m ∈ 1:size(𝐀,1), n ∈ 1:size(𝐁,2)
+               𝐂ₘₙ = zero(eltype(𝐂))
+               for k ∈ 1:size(𝐀,2)
+                   𝐂ₘₙ += 𝐀[m,k] * 𝐁[k,n]
+               end
+               𝐂[m,n] = 𝐂ₘₙ
+           end
+       end
+mygemm! (generic function with 1 method)
+
+julia> function mygemmavx!(𝐂, 𝐀, 𝐁)
+           @avx for m ∈ 1:size(𝐀,1), n ∈ 1:size(𝐁,2)
+               𝐂ₘₙ = zero(eltype(𝐂))
+               for k ∈ 1:size(𝐀,2)
+                   𝐂ₘₙ += 𝐀[m,k] * 𝐁[k,n]
+               end
+               𝐂[m,n] = 𝐂ₘₙ
+           end
+       end
+mygemmavx! (generic function with 1 method)
+
+julia> M, K, N = 72, 75, 71;
+
+julia> C1 = Matrix{Float64}(undef, M, N); A = randn(M, K); B = randn(K, N);
+
+julia> C2 = similar(C1); C3 = similar(C1);
+
+julia> @benchmark mygemmavx!($C1, $A, $B)
+BenchmarkTools.Trial:
+  memory estimate:  0 bytes
+  allocs estimate:  0
+  --------------
+  minimum time:     7.381 μs (0.00% GC)
+  median time:      7.415 μs (0.00% GC)
+  mean time:        7.432 μs (0.00% GC)
+  maximum time:     15.444 μs (0.00% GC)
+  --------------
+  samples:          10000
+  evals/sample:     4
+
+julia> @benchmark mygemm!($C2, $A, $B)
+BenchmarkTools.Trial:
+  memory estimate:  0 bytes
+  allocs estimate:  0
+  --------------
+  minimum time:     230.790 μs (0.00% GC)
+  median time:      231.288 μs (0.00% GC)
+  mean time:        231.882 μs (0.00% GC)
+  maximum time:     275.460 μs (0.00% GC)
+  --------------
+  samples:          10000
+  evals/sample:     1
+
+julia> using LinearAlgebra, Test
+
+julia> @test all(C1 .≈ C2)
+Test Passed
+
+julia> BLAS.set_num_threads(1); BLAS.vendor()
+:mkl
+
+julia> @benchmark mul!($C3, $A, $B)
+BenchmarkTools.Trial:
+  memory estimate:  0 bytes
+  allocs estimate:  0
+  --------------
+  minimum time:     6.830 μs (0.00% GC)
+  median time:      6.861 μs (0.00% GC)
+  mean time:        6.869 μs (0.00% GC)
+  maximum time:     15.125 μs (0.00% GC)
+  --------------
+  samples:          10000
+  evals/sample:     5
+
+julia> @test all(C1 .≈ C3)
+Test Passed
 ```
 It can produce a decent macro kernel.
 In the future, I would like it to also model the cost of memory movement in the L1 and L2 cache, and use these to generate loops around the macro kernel following the work of [Low, et al. (2016)](http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
@@ -142,28 +225,34 @@ For example, what if `A` were the outer product of two vectors?
 <summary>Click me!</summary>
 <p>
 
-Another example, a straightforward operation expressed well via broadcasting:
+Another example, a straightforward operation expressed well via broadcasting and `*ˡ` (which is typed `*\^l`), the lazy matrix multiplication operator:
 ```julia
-a = rand(37); B = rand(37, 47); c = rand(47); c′ = c';
+julia> using LoopVectorization, LinearAlgebra, BenchmarkTools, Test; BLAS.set_num_threads(1)
 
-d1 = @. a + B * c′;
-d2 = @avx @. a + B * c′;
+julia> a = rand(48); B = rand(48, 51); c = rand(51); d = rand(49);
 
-@test all(d1 .≈ d2)
+julia> X1 = a .+ B * (c .+ d');
 
-@time @. $d1 = $a + $B * $c′;
-@time @avx @. $d2 = $a + $B * $c′;
-@test all(d1 .≈ d2)
-```
-can be optimized in a similar manner to BLAS, albeit to a much smaller degree because the naive version already benefits from vectorization (unlike the naive BLAS).
+julia> X2 = @avx @. a + B *ˡ (c + d');
 
+julia> @test X1 ≈ X2
+Test Passed
 
-You can also use `*ˡ` (which is typed `*\^l`) for lazy matrix multiplication that can fuse with broadcasts. `.*ˡ` behaves similarly, escaping the broadcast (it is not applied elementwise). This allows you to use `@.` and fuse all the loops, even if the arguments to `*ˡ` are themselves broadcasted objects. However, it will often be the case that creating an intermediary is faster. I would recommend always checking if splitting the operation into pieces, or at least isolating the matrix multiplication, increases performance. That will often be the case, especially if the matrices are large, where a separate multiplication can leverage BLAS (and perhaps take advantage of threads).
+julia> buf1 = Matrix{Float64}(undef, length(c), length(d));
 
-At small sizes, this can be fast.
-```julia
+julia> buf2 = similar(X1);
 
+julia> @btime $X1 .= $a .+ mul!($buf2, $B, ($buf1 .= $c .+ $d'));
+  3.499 μs (0 allocations: 0 bytes)
+
+julia> @btime @avx @. $X2 = $a + $B *ˡ ($c + $d');
+  3.289 μs (0 allocations: 0 bytes)
+
+julia> @test X1 ≈ X2
+Test Passed
 ```
+The lazy matrix multiplication operator `*ˡ` escapes broadcasts and fuses, making it easy to write code that avoids intermediates. However, I would recommend always checking if splitting the operation into pieces, or at least isolating the matrix multiplication, increases performance. That will often be the case, especially if the matrices are large, where a separate multiplication can leverage BLAS (and perhaps take advantage of threads).
+This may improve as the optimizations within LoopVectorization improve.
 
 </p>
 </details>
@@ -215,16 +304,25 @@ end
 ```
 this `mul_avx!` kernel can now accept `StructArray` matrices of complex numbers and multiply them efficiently:
 ```julia
-M, K, N = 50, 51, 52
+julia> M, K, N = 56, 57, 58
+(56, 57, 58)
+
+julia> A = StructArray(randn(ComplexF64, M, K));
+
+julia> B = StructArray(randn(ComplexF64, K, N));
+
+julia> C1 = StructArray(Matrix{ComplexF64}(undef, M, N));
+
+julia> C2 = collect(similar(C1));
+
+julia> @btime mul_avx!($C1, $A, $B)
+  13.634 μs (0 allocations: 0 bytes)
 
-A = StructArray(randn(ComplexF64, M, K));
-B = StructArray(randn(ComplexF64, K, N));
-C1 = StructArray(Matrix{ComplexF64}(undef, M, N));
-C2 = collect(similar(C1));
+julia> @btime mul!( $C2, $(collect(A)), $(collect(B))); # collect turns the StructArray into a regular Array
+  14.007 μs (0 allocations: 0 bytes)
 
-@btime mul_avx!($C1, $A, $B)
-@btime mul!( $C2, $(collect(A)), $(collect(B))) # collect turns the StructArray into a regular Array
-@test C1 ≈ C2
+julia> @test C1 ≈ C2
+Test Passed
 ```
 
 Similar approaches can be taken to make kernels that work with a variety of numeric struct types such as [dual numbers](https://github.com/JuliaDiff/DualNumbers.jl), [DoubleFloats](https://github.com/JuliaMath/DoubleFloats.jl), etc.
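
The `mul_avx!` definition referenced here sits above this hunk and is not shown in the diff. As a reminder of the general shape of such a kernel, a minimal sketch over the `re`/`im` field arrays of the `StructArray`s might look like the following (names and details are assumptions; the actual README definition may differ):

```julia
using LoopVectorization, StructArrays

# Sketch: complex matmul written against the re/im field arrays exposed by a
# StructArray of Complex, so the inner loops only ever touch plain Float64 arrays.
function mul_avx_sketch!(C::StructArray{Complex{T}}, A, B) where {T}
    @avx for m ∈ 1:size(A,1), n ∈ 1:size(B,2)
        Cre = zero(T)
        Cim = zero(T)
        for k ∈ 1:size(A,2)
            Cre += A.re[m,k] * B.re[k,n] - A.im[m,k] * B.im[k,n]
            Cim += A.re[m,k] * B.im[k,n] + A.im[m,k] * B.re[k,n]
        end
        C.re[m,n] = Cre
        C.im[m,n] = Cim
    end
end
```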

src/broadcast.jl

Lines changed: 12 additions & 8 deletions
@@ -3,16 +3,20 @@ struct Product{A,B}
     b::B
 end
 @inline function Base.size(p::Product)
-    M = size(p.a, 1)
+    M = @inbounds size(p.a)[1]
     (M, Base.tail(size(p.b))...)
 end
 @inline function Base.size(p::Product, i::Integer)
-    i == 1 && return size(p.a, 1)
-    size(p.b, i)
+    i == 1 && return @inbounds size(p.a)[1]
+    @inbounds size(p.b)[i]
 end
 @inline Base.length(p::Product) = prod(size(p))
 @inline Base.broadcastable(p::Product) = p
-@inline Base.ndims(p::Type{Product{A,B}}) where {A,B} = ndims(B)
+@inline numdims(A) = ndims(A) # fallback
+@inline numdims(::Type{Product{A,B}}) where {A,B} = numdims(B)
+@inline Base.ndims(::Type{Product{A,B}}) where {A,B} = numdims(B)
+# This numdims nonsense is a hack to avoid type piracy in defining:
+@inline numdims(::Type{B}) where {N, S <: Base.Broadcast.AbstractArrayStyle{N}, B <: Base.Broadcast.Broadcasted{S}} = N
 
 Base.Broadcast._broadcast_getindex_eltype(::Product{A,B}) where {T, A <: AbstractVecOrMat{T}, B <: AbstractVecOrMat{T}} = T
 function Base.Broadcast._broadcast_getindex_eltype(p::Product)
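
The point of the `numdims` indirection above is to read `N` off a `Broadcasted`'s `AbstractArrayStyle{N}` type parameter without pirating `Base.ndims` for a type LoopVectorization does not own. A small illustrative check, assuming the unexported helper is reachable as `LoopVectorization.numdims` after this commit:

```julia
using LoopVectorization
using Base.Broadcast: broadcasted

bc = broadcasted(+, rand(3), rand(4)')   # a lazy 3×4 Broadcasted object
LoopVectorization.numdims(typeof(bc))    # == 2, recovered from the AbstractArrayStyle{2} parameter
```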
@@ -48,17 +52,16 @@ function add_broadcast!(
     mB = gensym(:Bₖₙ)
     pushpreamble!(ls, Expr(:(=), mA, Expr(:(.), bcname, QuoteNode(:a))))
     pushpreamble!(ls, Expr(:(=), mB, Expr(:(.), bcname, QuoteNode(:b))))
-    pushpreamble!(ls, Expr(:(=), K, Expr(:call, :size, mB, 1)))
-
+    pushpreamble!(ls, Expr(:(=), K, Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__,@__FILE__), Expr(:ref, Expr(:call, :size, mB), 1))))
     k = gensym(:k)
     add_loop!(ls, Loop(k, 0, K), k)
     m = loopsyms[1];
-    if ndims(B) == 1
+    if numdims(B) == 1
         bloopsyms = Symbol[k]
         cloopsyms = Symbol[m]
         reductdeps = Symbol[m, k]
         kvec = bloopsyms
-    elseif ndims(B) == 2
+    elseif numdims(B) == 2
         n = loopsyms[2];
         bloopsyms = Symbol[k,n]
         cloopsyms = Symbol[m,n]
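
For reference, the replacement preamble assignment builds an `@inbounds size(...)[1]` expression instead of `size(..., 1)`. Constructed by hand it looks like this (the symbol names are illustrative):

```julia
# Illustrative construction mirroring the new pushpreamble! argument above:
mB, K = :Bmat, :K
ex = Expr(:(=), K,
          Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__, @__FILE__),
               Expr(:ref, Expr(:call, :size, mB), 1)))
# `ex` is the AST of:  K = @inbounds (size(Bmat))[1]
```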
@@ -202,6 +205,7 @@ end
 ) where {T <: SUPPORTED_TYPES, N, BC <: Broadcasted, Mod}
     # we have an N dimensional loop.
     # need to construct the LoopSet
+    # @show typeof(dest)
     loopsyms = [gensym(:n) for n ∈ 1:N]
     ls = LoopSet(Mod)
     sizes = Expr(:tuple)

test/broadcast.jl

Lines changed: 8 additions & 0 deletions
@@ -61,6 +61,14 @@
         @test D1 ≈ D2
         fill!(D2, -999999); D2 = @avx C .+ At' *ˡ B;
         @test D1 ≈ D2
+
+        b = rand(T,K); x = rand(R,N);
+        D1 .= C .+ A * (b .+ x');
+        @avx @. D2 = C + A *ˡ (b + x');
+        @test D1 ≈ D2
+        D2 = @avx @. C + A *ˡ (b + x');
+        @test D1 ≈ D2
+
         if T <: Union{Float32,Float64}
             D3 = cos.(B');
             D4 = @avx cos.(B');
