|
290 | 290 | C[m,n] = Cₘₙ
|
291 | 291 | end
|
292 | 292 | end
|
# Fill `C` in place with the product of the rank-2 factorisation `Aₘ * Aₖ` and `B`:
#     C[m,n] = Σₖ (Aₘ[m,1]*Aₖ[1,k] + Aₘ[m,2]*Aₖ[2,k]) * B[k,n]
# The whole loop nest is handed to `@avx` with `inline=false`.
function rank2AmulBavx_noinline!(C, Aₘ, Aₖ, B)
    nrows = size(C,1)
    ncols = size(C,2)
    ninner = size(B,1)
    @avx inline=false for m ∈ 1:nrows, n ∈ 1:ncols
        acc = zero(eltype(C))
        for k ∈ 1:ninner
            # Entry (m,k) of the rank-2 product Aₘ*Aₖ, formed on the fly.
            acc += (Aₘ[m,1]*Aₖ[1,k]+Aₘ[m,2]*Aₖ[2,k]) * B[k,n]
        end
        C[m,n] = acc
    end
end
293 | 302 |
|
294 | 303 | function mulCAtB_2x2blockavx!(C, A, B)
|
295 | 304 | M, N = size(C); K = size(B,1)
|
|
399 | 408 | end
|
400 | 409 | return C
|
401 | 410 | end
|
# Fill `C` in place with `C[m,n] = Σₖ A[k,m] * B[k,n]` and return `C`.
#
# `C` is processed in 2×2 tiles; each tile's reduction over `k` is a separate
# `@avx inline=false` loop. When `size(C,1)` and/or `size(C,2)` is odd, the
# leftover last row / last column / corner element is handled by dedicated
# remainder loops.
#
# Requires `size(C,1) == size(A,2)`, `size(C,2) == size(B,2)` and
# `size(A,1) == size(B,1)` (checked with `@assert`).
function mulCAtB_2x2blockavx_noinline!(C, A, B)
    M, N = size(C); K = size(B,1)
    @assert size(C, 1) == size(A, 2)
    @assert size(C, 2) == size(B, 2)
    @assert size(A, 1) == size(B, 1)
    T = eltype(C)
    # `M & -2` / `N & -2` round down to the nearest even number, so these
    # loops visit only complete 2×2 tiles.
    for m ∈ 1:2:(M & -2)
        m1 = m + 1
        for n ∈ 1:2:(N & -2)
            n1 = n + 1
            C11, C21, C12, C22 = zero(T), zero(T), zero(T), zero(T)
            @avx inline=false for k ∈ 1:K
                C11 += A[k,m] * B[k,n]
                C21 += A[k,m1] * B[k,n]
                C12 += A[k,m] * B[k,n1]
                C22 += A[k,m1] * B[k,n1]
            end
            C[m,n] = C11
            C[m1,n] = C21
            C[m,n1] = C12
            C[m1,n1] = C22
        end
        if isodd(N)
            # Last column for this pair of rows. Accumulators use `zero(T)`
            # (not a Float64 literal) so they match the tiled path's type.
            C1n = zero(T)
            C2n = zero(T)
            @avx inline=false for k ∈ 1:K
                C1n += A[k,m] * B[k,N]
                C2n += A[k,m1] * B[k,N]
            end
            C[m,N] = C1n
            C[m1,N] = C2n
        end
    end
    if isodd(M)
        # Last row, two columns at a time.
        for n ∈ 1:2:(N & -2)
            n1 = n + 1
            Cm1, Cm2 = zero(T), zero(T)
            @avx inline=false for k ∈ 1:K
                Cm1 += A[k,M] * B[k,n]
                Cm2 += A[k,M] * B[k,n1]
            end
            C[M,n] = Cm1
            C[M,n1] = Cm2
        end
        if isodd(N)
            # Bottom-right corner element when both dimensions are odd.
            Cmn = zero(T)
            @avx inline=false for k ∈ 1:K
                Cmn += A[k,M] * B[k,N]
            end
            C[M,N] = Cmn
        end
    end
    return C
end
402 | 465 | # M = 77;
|
403 | 466 | # A = rand(M,M); B = rand(M,M); C = similar(A);
|
404 | 467 | # mulCAtB_2x2block_avx!(C,A,B)
|
|
481 | 544 | @test C ≈ C2
|
482 | 545 | fill!(C, 9999.999); mulCAtB_2x2blockavx!(C, A', B);
|
483 | 546 | @test C ≈ C2
|
| 547 | + fill!(C, 9999.999); mulCAtB_2x2blockavx_noinline!(C, At, B); |
| 548 | + @test C ≈ C2 |
| 549 | + fill!(C, 9999.999); mulCAtB_2x2blockavx_noinline!(C, A', B); |
| 550 | + @test C ≈ C2 |
484 | 551 | end
|
485 | 552 | @time @testset "_avx $T dynamic gemm" begin
|
486 | 553 | AmulB_avx1!(C, A, B)
|
|
549 | 616 | @test Cs ≈ C2
|
550 | 617 | fill!(Cs, 9999.999); mulCAtB_2x2blockavx!(Cs, As', Bs);
|
551 | 618 | @test Cs ≈ C2
|
| 619 | + fill!(Cs, 9999.999); mulCAtB_2x2blockavx_noinline!(Cs, Ats, Bs); |
| 620 | + @test Cs ≈ C2 |
| 621 | + fill!(Cs, 9999.999); mulCAtB_2x2blockavx_noinline!(Cs, As', Bs); |
| 622 | + @test Cs ≈ C2 |
552 | 623 | end
|
553 | 624 | @time @testset "_avx $T static gemm" begin
|
554 | 625 | AmulB_avx1!(Cs, As, Bs)
|
|
593 | 664 | @test C ≈ C2
|
594 | 665 | fill!(C, 9999.999); rank2AmulB_avx!(C, Aₘ, Aₖ, B)
|
595 | 666 | @test C ≈ C2
|
| 667 | + fill!(C, 9999.999); rank2AmulBavx_noinline!(C, Aₘ, Aₖ, B) |
| 668 | + @test C ≈ C2 |
596 | 669 | fill!(C, 9999.999); rank2AmulBavx!(C, Aₘ, Aₖ′', B)
|
597 | 670 | @test C ≈ C2
|
598 | 671 | fill!(C, 9999.999); rank2AmulB_avx!(C, Aₘ, Aₖ′', B)
|
599 | 672 | @test C ≈ C2
|
| 673 | + fill!(C, 9999.999); rank2AmulBavx_noinline!(C, Aₘ, Aₖ′', B) |
| 674 | + @test C ≈ C2 |
600 | 675 | end
|
601 | 676 |
|
602 | 677 | end
|
|
0 commit comments