@@ -36,28 +36,27 @@ using LinearAlgebra
36
36
@test logsumexp! (r, x) ≈ 102.35216846104409
37
37
38
38
@testset " GEMM" begin
39
- gemmq = :(for i ∈ 1 : size (A,1 ), j ∈ 1 : size (B,2 )
39
+ AmulBq = :(for i ∈ 1 : size (A,1 ), j ∈ 1 : size (B,2 )
40
40
Cᵢⱼ = zero (eltype (C))
41
41
for k ∈ 1 : size (A,2 )
42
42
Cᵢⱼ += A[i,k] * B[k,j]
43
43
end
44
44
C[i,j] = Cᵢⱼ
45
45
end )
46
46
47
- lsgemm = LoopVectorization. LoopSet (gemmq );
47
+ lsAmulB = LoopVectorization. LoopSet (AmulBq );
48
48
U, T = LoopVectorization. VectorizationBase. REGISTER_COUNT == 16 ? (3 ,4 ) : (6 , 4 )
49
- @test LoopVectorization. choose_order (lsgemm ) == (Symbol[:j ,:i ,:k ], :i , U, T)
49
+ @test LoopVectorization. choose_order (lsAmulB ) == (Symbol[:j ,:i ,:k ], :i , U, T)
50
50
51
- function mygemm ! (C, A, B)
52
- @inbounds for i ∈ 1 : size (A, 1 ), j ∈ 1 : size (B, 2 )
53
- Cᵢⱼ = zero ( eltype (C) )
54
- @simd ivdep for k ∈ 1 : size (A,2 )
55
- Cᵢⱼ += A[i,k] * B[k,j]
51
+ function AmulB ! (C, A, B)
52
+ C . = 0
53
+ for k ∈ 1 : size (A, 2 ), j ∈ 1 : size (B, 2 )
54
+ @simd ivdep for i ∈ 1 : size (A,1 )
55
+ @inbounds C[i,j] += A[i,k] * B[k,j]
56
56
end
57
- C[i,j] = Cᵢⱼ
58
57
end
59
58
end
60
- function mygemmavx ! (C, A, B)
59
+ function AmulBavx ! (C, A, B)
61
60
@avx for i ∈ 1 : size (A,1 ), j ∈ 1 : size (B,2 )
62
61
Cᵢⱼ = zero (eltype (C))
63
62
for k ∈ 1 : size (A,2 )
@@ -67,13 +66,46 @@ using LinearAlgebra
67
66
end
68
67
end
69
68
70
-
69
+ # function AtmulB!(C, A, B)
70
+ # for j ∈ 1:size(C,2), i ∈ 1:size(C,1)
71
+ # Cᵢⱼ = zero(eltype(C))
72
+ # @simd ivdep for k ∈ 1:size(A,1)
73
+ # @inbounds Cᵢⱼ += A[k,i] * B[k,j]
74
+ # end
75
+ # C[i,j] = Cᵢⱼ
76
+ # end
77
+ # end
78
+ AtmulBq = :(for j ∈ 1 : size (C,2 ), i ∈ 1 : size (C,1 )
79
+ Cᵢⱼ = zero (eltype (C))
80
+ for k ∈ 1 : size (A,1 )
81
+ Cᵢⱼ += A[k,i] * B[k,j]
82
+ end
83
+ C[i,j] = Cᵢⱼ
84
+ end )
85
+ lsAtmulB = LoopVectorization. LoopSet (AtmulBq);
86
+ # LoopVectorization.choose_order(lsAtmulB)
87
+ @test LoopVectorization. choose_order (lsAtmulB) == (Symbol[:j ,:i ,:k ], :k , U, T)
88
+
89
+ function AtmulBavx! (C, A, B)
90
+ @avx for j ∈ 1 : size (C,2 ), i ∈ 1 : size (C,1 )
91
+ Cᵢⱼ = zero (eltype (C))
92
+ for k ∈ 1 : size (A,1 )
93
+ Cᵢⱼ += A[k,i] * B[k,j]
94
+ end
95
+ C[i,j] = Cᵢⱼ
96
+ end
97
+ end
98
+
71
99
for T ∈ (Float32, Float64)
72
100
M, K, N = 72 , 75 , 71 ;
73
101
C = Matrix {T} (undef, M, N); A = randn (T, M, K); B = randn (T, K, N);
74
102
C2 = similar (C);
75
- mygemmavx! (C, A, B)
76
- mygemm! (C2, A, B)
103
+ AmulBavx! (C, A, B)
104
+ AmulB! (C2, A, B)
105
+ @test C ≈ C2
106
+ At = copy (A' );
107
+ fill! (C, 9999.999 );
108
+ AtmulBavx! (C, At, B)
77
109
@test C ≈ C2
78
110
end
79
111
end
@@ -178,6 +210,7 @@ using LinearAlgebra
178
210
myvexpavx! (b2, a)
179
211
@test b1 ≈ b2
180
212
@test myvexp (a) ≈ myvexpavx (a)
213
+ @test b1 ≈ @avx exp .(a)
181
214
end
182
215
end
183
216
@@ -225,6 +258,23 @@ using LinearAlgebra
225
258
226
259
227
260
@testset " Miscellaneous" begin
261
+
262
+ dot3q = :(for m ∈ 1 : M, n ∈ 1 : N
263
+ s += x[m] * A[m,n] * y[n]
264
+ end )
265
+ lsdot3 = LoopVectorization. LoopSet (dot3q);
266
+ LoopVectorization. choose_order (lsdot3)
267
+
268
+ dot3 (x, A, y) = dot (x, A * y)
269
+ function dot3avx (x, A, y)
270
+ M, N = size (A)
271
+ s = zero (promote_type (eltype (x), eltype (A), eltype (y)))
272
+ @avx for m ∈ 1 : M, n ∈ 1 : N
273
+ s += x[m] * A[m,n] * y[n]
274
+ end
275
+ s
276
+ end
277
+
228
278
subcolq = :(for i ∈ 1 : size (A,2 ), j ∈ eachindex (x)
229
279
B[j,i] = A[j,i] - x[j]
230
280
end )
@@ -246,25 +296,25 @@ using LinearAlgebra
246
296
end
247
297
248
298
colsumq = :(for i ∈ 1 : size (A,2 ), j ∈ eachindex (x)
249
- x[j] += A[j,i]
299
+ x[j] += A[j,i] - 0.25
250
300
end )
251
301
lscolsum = LoopVectorization. LoopSet (colsumq);
252
- @test LoopVectorization. choose_order (lscolsum) == (Symbol[:j ,:i ], :j , 4 , - 1 )
302
+ @test LoopVectorization. choose_order (lscolsum) == (Symbol[:j ,:i ], :j , 8 , - 1 )
253
303
304
+ # my colsum is wrong (by 0.25), but slightly more interesting
254
305
function mycolsum! (x, A)
255
306
@. x = 0
256
307
@inbounds for i ∈ 1 : size (A,2 )
257
308
@simd for j ∈ eachindex (x)
258
- x[j] += A[j,i]
309
+ x[j] += A[j,i] - 0.25
259
310
end
260
311
end
261
312
end
262
-
263
313
function mycolsumavx! (x, A)
264
314
@avx for j ∈ eachindex (x)
265
315
xⱼ = zero (eltype (x))
266
316
for i ∈ 1 : size (A,2 )
267
- xⱼ += A[j,i]
317
+ xⱼ += A[j,i] - 0.25
268
318
end
269
319
x[j] = xⱼ
270
320
end
@@ -300,8 +350,8 @@ using LinearAlgebra
300
350
end
301
351
302
352
for T ∈ (Float32, Float64)
303
- A = randn (T, 199 , 498 )
304
- x = randn (T, size (A,1 ))
353
+ A = randn (T, 199 , 498 );
354
+ x = randn (T, size (A,1 ));
305
355
B1 = similar (A); B2 = similar (A);
306
356
307
357
mysubcol! (B1, A, x)
@@ -311,13 +361,17 @@ using LinearAlgebra
311
361
x1 = similar (x); x2 = similar (x);
312
362
mycolsum! (x1, A)
313
363
mycolsumavx! (x2, A)
314
-
315
364
@test x1 ≈ x2
316
365
317
366
x̄ = x1 ./ size (A,2 );
318
367
myvar! (x1, A, x̄)
319
368
myvaravx! (x2, A, x̄)
320
369
@test x1 ≈ x2
370
+
371
+ M, N = 47 , 73 ;
372
+ x = rand (T, M); A = rand (T, M, N); y = rand (T, N);
373
+ @test dot3avx (x, A, y) ≈ dot3 (x, A, y)
374
+
321
375
end
322
376
end
323
377
@@ -351,26 +405,17 @@ end
351
405
@avx @. d4 = a + B ∗ c;
352
406
@test d3 ≈ d4
353
407
354
- # T = Float64
355
- # T = Float32
356
408
M, K, N = 77 , 83 , 57 ;
357
409
A = rand (T,M,K); B = rand (T,K,N); C = rand (T,M,N);
358
410
359
411
D1 = C .+ A * B;
360
412
D2 = @avx C .+ A ∗ B;
361
413
@test D1 ≈ D2
362
414
363
- B = rand (T,K,N);
364
415
D3 = exp .(B' );
365
416
D4 = @avx exp .(B' );
366
417
@test D3 ≈ D4
367
418
368
- d4q = @avx exp .(B' )
369
-
370
- D3c = copy (D3); D4c = copy (D4);
371
- D3[1 : 21 ,1 : 10 ]
372
- D4[1 : 21 ,1 : 10 ]
373
-
374
419
fill! (D3, - 1e3 ); fill! (D4, 9e9 );
375
420
Bt = Transpose (B);
376
421
@. D3 = exp (Bt);
390
435
lset = @avx @. D2' = exp (Bt);
391
436
392
437
@test D1 ≈ D2
438
+
439
+ a = rand (137 );
440
+ b1 = @avx @. 3 * a + sin (a) + sqrt (a);
441
+ b2 = @. 3 * a + sin (a) + sqrt (a);
442
+ @test b1 ≈ b2
393
443
end
394
444
end
395
445
0 commit comments