The macro assumes that loop iterations can be reordered.

A simple example with a single loop is the dot product:
```julia
julia> using LoopVectorization, BenchmarkTools

julia> function mydot(a, b)
           s = 0.0
           @inbounds @simd for i ∈ eachindex(a,b)
               s += a[i]*b[i]
           end
           s
       end
mydot (generic function with 1 method)

julia> function mydotavx(a, b)
           s = 0.0
           @avx for i ∈ eachindex(a,b)
               s += a[i]*b[i]
           end
           s
       end
mydotavx (generic function with 1 method)

julia> a = rand(256); b = rand(256);

julia> @btime mydot($a, $b)
  12.273 ns (0 allocations: 0 bytes)
62.61049816874535

julia> @btime mydotavx($a, $b)
  11.618 ns (0 allocations: 0 bytes)
62.61049816874536

julia> a = rand(255); b = rand(255);

julia> @btime mydot($a, $b)
  36.539 ns (0 allocations: 0 bytes)
62.29537331565549

julia> @btime mydotavx($a, $b)
  11.739 ns (0 allocations: 0 bytes)
62.29537331565549
```
On most recent CPUs, the performance of the dot product is bounded by the speed at which it can load data; recent x86_64 CPUs can perform two aligned loads and two `fma`s per clock cycle. However, the dot product requires two loads per `fma`.

A self-dot function, on the other hand, requires one load per `fma`:
```julia
julia> function myselfdot(a)
           s = 0.0
           @inbounds @simd for i ∈ eachindex(a)
               s += a[i]*a[i]
           end
           s
       end
myselfdot (generic function with 1 method)

julia> function myselfdotavx(a)
           s = 0.0
           @avx for i ∈ eachindex(a)
               s += a[i]*a[i]
           end
           s
       end
myselfdotavx (generic function with 1 method)

julia> a = rand(256);

julia> @btime myselfdot($a)
  8.578 ns (0 allocations: 0 bytes)
90.16636687132868

julia> @btime myselfdotavx($a)
  9.560 ns (0 allocations: 0 bytes)
90.16636687132868

julia> @btime myselfdot($b)
  28.923 ns (0 allocations: 0 bytes)
83.20114563267853

julia> @btime myselfdotavx($b)
  9.174 ns (0 allocations: 0 bytes)
83.20114563267856
```
For this reason, the `@avx` version is roughly twice as fast. The `@inbounds @simd` version, however, is not, because it runs into the problem of loop carried dependencies: to add `a[i]*b[i]` to `s_new = s_old + a[i-j]*b[i-j]`, we must have first finished calculating `s_new`, but -- while two `fma` instructions can be initiated per cycle -- they each take several clock cycles to complete.

To hide this latency, we need to unroll the operation so that several independent instances run concurrently. The `@avx` macro models this cost to try to pick an optimal unroll factor.
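
To make the transformation concrete, here is a hand-unrolled sketch with four independent accumulators; the function name and the unroll factor of 4 are made up for illustration, whereas `@avx` derives its factor from the cost model:

```julia
# Illustrative sketch only: four independent accumulators break the
# dependency chain on `s`, so several fmas can be in flight at once.
function mydot_unrolled4(a, b)
    s1 = s2 = s3 = s4 = 0.0
    i = firstindex(a)
    @inbounds while i + 3 <= lastindex(a)
        s1 += a[i]   * b[i]
        s2 += a[i+1] * b[i+1]
        s3 += a[i+2] * b[i+2]
        s4 += a[i+3] * b[i+3]
        i += 4
    end
    @inbounds while i <= lastindex(a) # remainder iterations
        s1 += a[i] * b[i]
        i += 1
    end
    (s1 + s2) + (s3 + s4)
end
```
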
Note that 14 and 12 nm Ryzen chips can only do one full-width `fma` per clock cycle.

We can also vectorize fancier loops. A likely familiar example to dive into:
```julia
julia> function mygemm!(𝐂, 𝐀, 𝐁)
           @inbounds @fastmath for m ∈ 1:size(𝐀,1), n ∈ 1:size(𝐁,2)
               𝐂ₘₙ = zero(eltype(𝐂))
               for k ∈ 1:size(𝐀,2)
                   𝐂ₘₙ += 𝐀[m,k] * 𝐁[k,n]
               end
               𝐂[m,n] = 𝐂ₘₙ
           end
       end
mygemm! (generic function with 1 method)

julia> function mygemmavx!(𝐂, 𝐀, 𝐁)
           @avx for m ∈ 1:size(𝐀,1), n ∈ 1:size(𝐁,2)
               𝐂ₘₙ = zero(eltype(𝐂))
               for k ∈ 1:size(𝐀,2)
                   𝐂ₘₙ += 𝐀[m,k] * 𝐁[k,n]
               end
               𝐂[m,n] = 𝐂ₘₙ
           end
       end
mygemmavx! (generic function with 1 method)

julia> M, K, N = 72, 75, 71;

julia> C1 = Matrix{Float64}(undef, M, N); A = randn(M, K); B = randn(K, N);

julia> C2 = similar(C1); C3 = similar(C1);

julia> @benchmark mygemmavx!($C1, $A, $B)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     7.381 μs (0.00% GC)
  median time:      7.415 μs (0.00% GC)
  mean time:        7.432 μs (0.00% GC)
  maximum time:     15.444 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     4

julia> @benchmark mygemm!($C2, $A, $B)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     230.790 μs (0.00% GC)
  median time:      231.288 μs (0.00% GC)
  mean time:        231.882 μs (0.00% GC)
  maximum time:     275.460 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> using LinearAlgebra, Test

julia> @test all(C1 .≈ C2)
Test Passed

julia> BLAS.set_num_threads(1); BLAS.vendor()
:mkl

julia> @benchmark mul!($C3, $A, $B)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     6.830 μs (0.00% GC)
  median time:      6.861 μs (0.00% GC)
  mean time:        6.869 μs (0.00% GC)
  maximum time:     15.125 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     5

julia> @test all(C1 .≈ C3)
Test Passed
```
It can produce a decent macro kernel.
In the future, I would like it to also model the cost of memory movement in the L1 and L2 caches, and use that model to generate loops around the macro kernel, following the work of [Low, et al. (2016)](http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
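
As a rough sketch of that direction, one could tile loops around an accumulating variant of the macro kernel above. Everything here (the helper `mygemmaddavx!`, the driver `mygemm_blocked!`, and the placeholder tile sizes) is an assumption for illustration rather than anything LoopVectorization currently generates:

```julia
using LoopVectorization

# Accumulating variant of the macro kernel: C += A * B over one tile.
function mygemmaddavx!(C, A, B)
    @avx for m ∈ 1:size(A,1), n ∈ 1:size(B,2)
        Cₘₙ = zero(eltype(C))
        for k ∈ 1:size(A,2)
            Cₘₙ += A[m,k] * B[k,n]
        end
        C[m,n] += Cₘₙ
    end
end

# Hypothetical blocking driver; mc, kc, nc would come from the cache model.
# The values here are placeholders, not tuned.
function mygemm_blocked!(C, A, B; mc = 96, kc = 128, nc = 64)
    M, N = size(C); K = size(A, 2)
    fill!(C, zero(eltype(C)))
    for n ∈ 1:nc:N, k ∈ 1:kc:K, m ∈ 1:mc:M
        mr = m:min(m + mc - 1, M)
        kr = k:min(k + kc - 1, K)
        nr = n:min(n + nc - 1, N)
        @views mygemmaddavx!(C[mr, nr], A[mr, kr], B[kr, nr])
    end
end
```
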
For example, what if `A` were the outer product of two vectors?

<details>
<summary>Click me!</summary>
<p>

Another example, a straightforward operation expressed well via broadcasting and `*ˡ` (which is typed `*\^l`), the lazy matrix multiplication operator:
```julia
julia> a = rand(48); B = rand(48, 51); c = rand(51); d = rand(49);

julia> X1 = a .+ B * (c .+ d');

julia> X2 = @avx @. a + B *ˡ (c + d');

julia> @test X1 ≈ X2
Test Passed
```
The lazy matrix multiplication operator `*ˡ` escapes the broadcast (it is not applied elementwise) while still fusing with the surrounding broadcast loops, making it easy to write code that avoids intermediates. However, I would recommend always checking whether splitting the operation into pieces, or at least isolating the matrix multiplication, increases performance. That will often be the case, especially if the matrices are large, where a separate multiplication can leverage BLAS (and perhaps take advantage of threads).

This may improve as the optimizations within LoopVectorization improve.
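
For instance, a hedged sketch of such a split for the example above (reusing `a`, `B`, `c`, and `d`; the names `tmp` and `X3` are introduced here):

```julia
# Materialize the matrix product on its own, so BLAS can handle it,
# then fuse only the cheap elementwise part.
tmp = B * (c .+ d')
X3 = @avx @. a + tmp
```
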
</p>
</details>
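
A minimal sketch of how such a kernel can be written, operating on the `re` and `im` field arrays that `StructArray` exposes so that `@avx` only ever sees plain strided `Float64` arrays (the body below illustrates the approach and is not necessarily the verbatim original definition):

```julia
using StructArrays, LoopVectorization

# Sketch: complex matmul over the real/imaginary field arrays of StructArrays.
function mul_avx!(C::StructArray{Complex{T}}, A::StructArray{Complex{T}}, B::StructArray{Complex{T}}) where {T}
    @avx for m ∈ 1:size(A,1), n ∈ 1:size(B,2)
        Cre = zero(T)
        Cim = zero(T)
        for k ∈ 1:size(A,2)
            Cre += A.re[m,k] * B.re[k,n] - A.im[m,k] * B.im[k,n]
            Cim += A.re[m,k] * B.im[k,n] + A.im[m,k] * B.re[k,n]
        end
        C.re[m,n] = Cre
        C.im[m,n] = Cim
    end
end
```
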
This `mul_avx!` kernel can now accept `StructArray` matrices of complex numbers and multiply them efficiently:
```julia
julia> M, K, N = 56, 57, 58
(56, 57, 58)

julia> A = StructArray(randn(ComplexF64, M, K));

julia> B = StructArray(randn(ComplexF64, K, N));

julia> C1 = StructArray(Matrix{ComplexF64}(undef, M, N));

julia> C2 = collect(similar(C1));

julia> @btime mul_avx!($C1, $A, $B)
  13.634 μs (0 allocations: 0 bytes)

julia> @btime mul!($C2, $(collect(A)), $(collect(B))); # collect turns the StructArray into a regular Array
  14.007 μs (0 allocations: 0 bytes)

julia> @test C1 ≈ C2
Test Passed
```
Similar approaches can be taken to write kernels that work with a variety of numeric struct types, such as [dual numbers](https://github.com/JuliaDiff/DualNumbers.jl), [DoubleFloats](https://github.com/JuliaMath/DoubleFloats.jl), etc.
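
As a hedged sketch of that pattern, the same field-array trick can carry a value and one derivative through a matmul via the product rule; the `Dual` struct and `dual_mul_avx!` below are minimal stand-ins written for this example, not an existing package API:

```julia
using StructArrays, LoopVectorization

struct Dual{T} # minimal stand-in; real packages offer richer dual types
    val::T
    der::T
end

# Matmul over duals: the derivative accumulates via the product rule.
function dual_mul_avx!(C::StructArray{Dual{T}}, A::StructArray{Dual{T}}, B::StructArray{Dual{T}}) where {T}
    @avx for m ∈ 1:size(A,1), n ∈ 1:size(B,2)
        v = zero(T)
        d = zero(T)
        for k ∈ 1:size(A,2)
            v += A.val[m,k] * B.val[k,n]
            d += A.val[m,k] * B.der[k,n] + A.der[m,k] * B.val[k,n]
        end
        C.val[m,n] = v
        C.der[m,n] = d
    end
end

# Usage: A = StructArray(Dual.(randn(4,5), randn(4,5))), etc.
```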