|
174 | 174 | end
|
175 | 175 | end
|
176 | 176 |
|
# AmulB2x2avx!(C, A, B)
#
# Overwrite `C` with the matrix product of `A` and `B`:
# `C[m,n] = sum over k of A[m,k] * B[k,n]`, with `m` over `1:size(A,1)`,
# `n` over `1:size(B,2)` and `k` over `1:size(A,2)`.
#
# Previous contents of `C` are discarded rather than accumulated into: each
# entry is built in a local accumulator that starts at `zero(eltype(C))` and is
# then stored with a plain assignment.
#
# The whole loop nest is handed to `@avx` with the option `tile=(2,2)`.
# NOTE(review): presumably this requests a 2x2 unroll/tile of the two outer
# loops (`m`, `n`) -- confirm against the LoopVectorization version in use.
#
# The ranges are built from `size`, so 1-based indexing is assumed; there is no
# explicit dimension check between `A`, `B` and `C` in this function.
# There is no explicit `return`; callers in this file use it only for its effect on `C`.
| 177 | + function AmulB2x2avx!(C, A, B) |
| 178 | + @avx tile=(2,2) for m ∈ 1:size(A,1), n ∈ 1:size(B,2) |
# Accumulator takes the element type of the destination `C`, not of `A` or `B`.
| 179 | + ΔCₘₙ = zero(eltype(C)) |
| 180 | + for k ∈ 1:size(A,2) |
| 181 | + ΔCₘₙ += A[m,k] * B[k,n] |
| 182 | + end |
# Plain store: overwrites whatever `C[m,n]` held before the call.
| 183 | + C[m,n] = ΔCₘₙ |
| 184 | + end |
| 185 | + end |
# AmulB2x2_avx!(C, A, B)
#
# Overwrite `C` with the matrix product of `A` and `B`:
# `C[m,n] = sum over k of A[m,k] * B[k,n]`, with `m` over `1:size(A,1)`,
# `n` over `1:size(B,2)` and `k` over `1:size(A,2)`.
#
# Same loop nest as the `@avx`-based `AmulB2x2avx!`, but driven by the `@_avx`
# macro with the option `tile=(2,2)`.
# NOTE(review): `@_avx` is not defined in the visible part of the file;
# presumably it is a test-suite variant of `@avx` -- confirm where it is declared.
#
# Previous contents of `C` are discarded: each entry is built in a local
# accumulator that starts at `zero(eltype(C))` and is then stored with a plain
# assignment. The ranges are built from `size`, so 1-based indexing is assumed.
# There is no explicit `return`; callers in this file use it only for its effect on `C`.
| 186 | + function AmulB2x2_avx!(C, A, B) |
| 187 | + @_avx tile=(2,2) for m ∈ 1:size(A,1), n ∈ 1:size(B,2) |
# Accumulator takes the element type of the destination `C`, not of `A` or `B`.
| 188 | + ΔCₘₙ = zero(eltype(C)) |
| 189 | + for k ∈ 1:size(A,2) |
| 190 | + ΔCₘₙ += A[m,k] * B[k,n] |
| 191 | + end |
# Plain store: overwrites whatever `C[m,n]` held before the call.
| 192 | + C[m,n] = ΔCₘₙ |
| 193 | + end |
| 194 | + end |
| 195 | + |
177 | 196 | # function AtmulB!(C, A, B)
|
178 | 197 | # for j ∈ 1:size(C,2), i ∈ 1:size(C,1)
|
179 | 198 | # Cᵢⱼ = zero(eltype(C))
|
|
532 | 551 | @test C ≈ C2
|
533 | 552 | AmuladdBavx!(C, At', B, -2)
|
534 | 553 | @test C ≈ -C2
|
| 554 | + fill!(C, 9999.999); AmulB2x2avx!(C, A, B); |
| 555 | + @test C ≈ C2 |
| 556 | + fill!(C, 9999.999); AmulB2x2avx!(C, At', B); |
| 557 | + @test C ≈ C2 |
535 | 558 | fill!(C, 9999.999); AtmulBavx1!(C, At, B)
|
536 | 559 | @test C ≈ C2
|
537 | 560 | fill!(C, 9999.999); AtmulBavx1!(C, A', B)
|
|
570 | 593 | @test C ≈ C2
|
571 | 594 | AmuladdB_avx!(C, At', B, -2)
|
572 | 595 | @test C ≈ -C2
|
| 596 | + fill!(C, 9999.999); AmulB2x2_avx!(C, A, B); |
| 597 | + @test C ≈ C2 |
| 598 | + fill!(C, 9999.999); AmulB2x2_avx!(C, At', B); |
| 599 | + @test C ≈ C2 |
573 | 600 | fill!(C, 9999.999); AtmulB_avx1!(C, At, B)
|
574 | 601 | @test C ≈ C2
|
575 | 602 | fill!(C, 9999.999); AtmulB_avx1!(C, A', B)
|
|
604 | 631 | @test Cs ≈ C2
|
605 | 632 | AmuladdBavx!(Cs, Ats', Bs, -2)
|
606 | 633 | @test Cs ≈ -C2
|
| 634 | + fill!(Cs, 9999.999); AmulB2x2avx!(Cs, As, Bs) |
| 635 | + @test Cs ≈ C2 |
| 636 | + fill!(Cs, 9999.999); AmulB2x2avx!(Cs, Ats', Bs) |
| 637 | + @test Cs ≈ C2 |
607 | 638 | fill!(Cs, 9999.999); AtmulBavx1!(Cs, Ats, Bs)
|
608 | 639 | @test Cs ≈ C2
|
609 | 640 | fill!(Cs, 9999.999); AtmulBavx1!(Cs, As', Bs)
|
|
642 | 673 | @test Cs ≈ C2
|
643 | 674 | AmuladdB_avx!(Cs, Ats', Bs, -2)
|
644 | 675 | @test Cs ≈ -C2
|
| 676 | + fill!(Cs, 9999.999); AmulB2x2_avx!(Cs, As, Bs) |
| 677 | + @test Cs ≈ C2 |
| 678 | + fill!(Cs, 9999.999); AmulB2x2_avx!(Cs, Ats', Bs) |
| 679 | + @test Cs ≈ C2 |
645 | 680 | fill!(Cs, 9999.999); AtmulB_avx1!(Cs, Ats, Bs)
|
646 | 681 | @test Cs ≈ C2
|
647 | 682 | fill!(Cs, 9999.999); AtmulB_avx1!(Cs, As', Bs)
|
|
0 commit comments