|
72 | 72 | end
|
73 | 73 | end
|
74 | 74 | function AmulBavx1!(C, A, B)
|
75 |
| - @avx for m ∈ 1:size(A,1), n ∈ axes(B,2) |
| 75 | + @avx unroll=(1,2) for m ∈ 1:size(A,1), n ∈ axes(B,2) |
76 | 76 | Cₘₙ = zero(eltype(C))
|
77 | 77 | for k ∈ 1:size(A,2)
|
78 | 78 | Cₘₙ += A[m,k] * B[k,n]
|
|
82 | 82 | end
|
83 | 83 | function AmulBavx2!(C, A, B)
|
84 | 84 | z = zero(eltype(C))
|
85 |
| - @avx for m ∈ axes(A,1), n ∈ axes(B,2) |
| 85 | + @avx unroll=(2,1) for m ∈ axes(A,1), n ∈ axes(B,2) |
86 | 86 | C[m,n] = z
|
87 | 87 | for k ∈ axes(A,2)
|
88 | 88 | C[m,n] += A[m,k] * B[k,n]
|
89 | 89 | end
|
90 | 90 | end
|
91 | 91 | end
|
92 | 92 | function AmulBavx3!(C, A, B)
|
93 |
| - @avx for m ∈ axes(A,1), n ∈ axes(B,2) |
| 93 | + @avx unroll=(2,2) for m ∈ axes(A,1), n ∈ axes(B,2) |
94 | 94 | C[m,n] = zero(eltype(C))
|
95 | 95 | for k ∈ axes(A,2)
|
96 | 96 | C[m,n] += A[m,k] * B[k,n]
|
|
115 | 115 | # C[m,n] += ΔCₘₙ * factor
|
116 | 116 | # end;
|
117 | 117 | function AmuladdBavx!(C, A, B, α = one(eltype(C)))
|
118 |
| - @avx for m ∈ axes(A,1), n ∈ axes(B,2) |
| 118 | + @avx unroll=(2,2) for m ∈ axes(A,1), n ∈ axes(B,2) |
119 | 119 | ΔCₘₙ = zero(eltype(C))
|
120 | 120 | for k ∈ axes(A,2)
|
121 | 121 | ΔCₘₙ += A[m,k] * B[k,n]
|
|
124 | 124 | end
|
125 | 125 | end
|
126 | 126 | function AmuladdBavx!(C, A, B, α, β)# = zero(eltype(C)))
|
127 |
| - @avx for m ∈ axes(A,1), n ∈ axes(B,2) |
| 127 | + @avx unroll=(1,1) for m ∈ axes(A,1), n ∈ axes(B,2) |
128 | 128 | ΔCₘₙ = zero(eltype(C))
|
129 | 129 | for k ∈ axes(A,2)
|
130 | 130 | ΔCₘₙ += A[m,k] * B[k,n]
|
|
160 | 160 | end
|
161 | 161 |
|
162 | 162 | function AmulB_avx1!(C, A, B)
|
163 |
| - @_avx for m ∈ 1:size(A,1), n ∈ 1:size(B,2) |
| 163 | + @_avx unroll=(2,2) for m ∈ 1:size(A,1), n ∈ 1:size(B,2) |
164 | 164 | Cₘₙ = zero(eltype(C))
|
165 | 165 | for k ∈ axes(A,2)
|
166 | 166 | Cₘₙ += A[m,k] * B[k,n]
|
|
182 | 182 | # A = rand(M, M); B = rand(M, M); C = similar(A);
|
183 | 183 | function AmulB_avx2!(C, A, B)
|
184 | 184 | z = zero(eltype(C))
|
185 |
| - @_avx for m ∈ axes(A,1), n ∈ axes(B,2) |
| 185 | + @_avx unroll=(2,2) for m ∈ axes(A,1), n ∈ axes(B,2) |
186 | 186 | C[m,n] = z
|
187 | 187 | for k ∈ axes(A,2)
|
188 | 188 | C[m,n] += A[m,k] * B[k,n]
|
|
201 | 201 | # ls.operations[1]
|
202 | 202 | function AmulB_avx3!(C, A, B)
|
203 | 203 | Kmin = firstindex(axes(A,2)); Kmax = lastindex(axes(A,2))
|
204 |
| - @_avx for m ∈ axes(A,1), n ∈ axes(B,2) |
| 204 | + @_avx unroll=(2,2) for m ∈ axes(A,1), n ∈ axes(B,2) |
205 | 205 | C[m,n] = zero(eltype(C))
|
206 | 206 | for k ∈ Kmin:Kmax
|
207 | 207 | C[m,n] += A[m,k] * B[k,n]
|
|
224 | 224 | # end)
|
225 | 225 | # ls = LoopVectorization.LoopSet(q);
|
226 | 226 | function AmuladdB_avx!(C, A, B, factor = 1)
|
227 |
| - @_avx for m ∈ axes(A,1), n ∈ axes(B,2) |
| 227 | + @_avx unroll=(2,2) for m ∈ axes(A,1), n ∈ axes(B,2) |
228 | 228 | ΔCₘₙ = zero(eltype(C))
|
229 | 229 | for k ∈ axes(A,2)
|
230 | 230 | ΔCₘₙ += A[m,k] * B[k,n]
|
|
293 | 293 | # LoopVectorization.loopdependencies.(operations(atls))
|
294 | 294 | # LoopVectorization.reduceddependencies.(operations(atls))
|
295 | 295 | function AtmulB_avx1!(C, A, B)
|
296 |
| - @_avx for n ∈ axes(C,2), m ∈ axes(C,1) |
| 296 | + @_avx unroll=(2,2) for n ∈ axes(C,2), m ∈ axes(C,1) |
297 | 297 | Cₘₙ = zero(eltype(C))
|
298 | 298 | for k ∈ axes(A,1)
|
299 | 299 | Cₘₙ += A[k,m] * B[k,n]
|
|
308 | 308 | @assert size(A, 1) == size(B, 1)
|
309 | 309 | # When the @avx macro is available, this code is faster:
|
310 | 310 | z = zero(eltype(C))
|
311 |
| - @avx for n in axes(C,2), m in axes(C,1) |
| 311 | + @avx unroll=(2,2) for n in axes(C,2), m in axes(C,1) |
312 | 312 | Cmn = z
|
313 | 313 | for k in axes(A,1)
|
314 | 314 | Cmn += A[k,m] * B[k,n]
|
|
0 commit comments