Skip to content

Commit 0909366

Browse files
committed
Update benchmarks
1 parent ae834d2 commit 0909366

13 files changed

+303
-29
lines changed

benchmark/Manifest.toml

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@ version = "1.1.1"
1818

1919
[[ArrayInterface]]
2020
deps = ["Compat", "IfElse", "LinearAlgebra", "Requires", "SparseArrays", "Static"]
21-
git-tree-sha1 = "a8101545d6b15ff1ebc927e877e28b0ab4bc4f16"
21+
git-tree-sha1 = "d9352737cef8525944bf9ef34392d756321cbd54"
2222
uuid = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
23-
version = "3.1.36"
23+
version = "3.1.38"
2424

2525
[[Artifacts]]
2626
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
@@ -108,9 +108,9 @@ version = "0.3.0"
108108

109109
[[Compat]]
110110
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
111-
git-tree-sha1 = "31d0151f5716b655421d9d75b7fa74cc4e744df2"
111+
git-tree-sha1 = "dce3e3fea680869eaa0b774b2e8343e9ff442313"
112112
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
113-
version = "3.39.0"
113+
version = "3.40.0"
114114

115115
[[CompilerSupportLibraries_jll]]
116116
deps = ["Artifacts", "Libdl"]
@@ -145,6 +145,12 @@ git-tree-sha1 = "cc70b17275652eb47bc9e5f81635981f13cea5c8"
145145
uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
146146
version = "1.9.0"
147147

148+
[[DataFrames]]
149+
deps = ["Compat", "DataAPI", "Future", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrettyTables", "Printf", "REPL", "Reexport", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"]
150+
git-tree-sha1 = "d785f42445b63fc86caa08bb9a9351008be9b765"
151+
uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
152+
version = "1.2.2"
153+
148154
[[DataStructures]]
149155
deps = ["Compat", "InteractiveUtils", "OrderedCollections"]
150156
git-tree-sha1 = "7d9d316f04214f7efdbb6398d545446e246eff02"
@@ -178,9 +184,9 @@ version = "1.3.1"
178184

179185
[[Distances]]
180186
deps = ["LinearAlgebra", "Statistics", "StatsAPI"]
181-
git-tree-sha1 = "09d9eaef9ef719d2cd5d928a191dc95be2ec8059"
187+
git-tree-sha1 = "837c83e5574582e07662bbbba733964ff7c26b9d"
182188
uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
183-
version = "0.10.5"
189+
version = "0.10.6"
184190

185191
[[Distributed]]
186192
deps = ["Random", "Serialization", "Sockets"]
@@ -223,9 +229,9 @@ version = "3.3.10+0"
223229

224230
[[FileIO]]
225231
deps = ["Pkg", "Requires", "UUIDs"]
226-
git-tree-sha1 = "3c041d2ac0a52a12a27af2782b34900d9c3ee68c"
232+
git-tree-sha1 = "2db648b6712831ecb333eae76dbfd1c156ca13bb"
227233
uuid = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
228-
version = "1.11.1"
234+
version = "1.11.2"
229235

230236
[[FillArrays]]
231237
deps = ["LinearAlgebra", "Random", "SparseArrays", "Statistics"]
@@ -345,9 +351,9 @@ uuid = "e33a78d0-f292-5ffc-b300-72abe9b543c8"
345351
version = "2.5.0+0"
346352

347353
[[IfElse]]
348-
git-tree-sha1 = "28e837ff3e7a6c3cdb252ce49fb412c8eb3caeef"
354+
git-tree-sha1 = "debdd00ffef04665ccbb3e150747a77560e8fad1"
349355
uuid = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173"
350-
version = "0.1.0"
356+
version = "0.1.1"
351357

352358
[[IndirectArrays]]
353359
git-tree-sha1 = "012e604e1c7458645cb8b436f8fba789a51b257f"
@@ -376,6 +382,11 @@ git-tree-sha1 = "f0c6489b12d28fb4c2103073ec7452f3423bd308"
376382
uuid = "3587e190-3f89-42d0-90ee-14403ec27112"
377383
version = "0.1.1"
378384

385+
[[InvertedIndices]]
386+
git-tree-sha1 = "bee5f1ef5bf65df56bdd2e40447590b272a5471f"
387+
uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f"
388+
version = "1.1.0"
389+
379390
[[IrrationalConstants]]
380391
git-tree-sha1 = "7fd44fd4ff43fc60815f8e764c0f352b83c49151"
381392
uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
@@ -588,7 +599,7 @@ version = "1.10.7"
588599
[[OpenBLAS_jll]]
589600
deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
590601
uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
591-
version = "0.3.13+7"
602+
version = "0.3.17+2"
592603

593604
[[OpenLibm_jll]]
594605
deps = ["Artifacts", "Libdl"]
@@ -614,9 +625,9 @@ version = "8.44.0+0"
614625

615626
[[PDMats]]
616627
deps = ["LinearAlgebra", "SparseArrays", "SuiteSparse"]
617-
git-tree-sha1 = "4dd403333bcf0909341cfe57ec115152f937d7d8"
628+
git-tree-sha1 = "82041e63725d156bf61c6302dd7635ea13e3d5e7"
618629
uuid = "90014a1f-27ba-587c-ab20-58faa44d9150"
619-
version = "0.11.1"
630+
version = "0.11.2"
620631

621632
[[Pango_jll]]
622633
deps = ["Artifacts", "Cairo_jll", "Fontconfig_jll", "FreeType2_jll", "FriBidi_jll", "Glib_jll", "HarfBuzz_jll", "JLLWrappers", "Libdl", "Pkg"]
@@ -647,6 +658,12 @@ git-tree-sha1 = "a3ff99bf561183ee20386aec98ab8f4a12dc724a"
647658
uuid = "1d0040c9-8b98-4ee7-8388-3f51789ca0ad"
648659
version = "0.1.2"
649660

661+
[[PooledArrays]]
662+
deps = ["DataAPI", "Future"]
663+
git-tree-sha1 = "a193d6ad9c45ada72c14b731a318bedd3c2f00cf"
664+
uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
665+
version = "1.3.0"
666+
650667
[[Preferences]]
651668
deps = ["TOML"]
652669
git-tree-sha1 = "00cfd92944ca9c760982747e9a1d0d5d86ab1e5a"
@@ -718,6 +735,7 @@ version = "0.3.0+0"
718735

719736
[[SHA]]
720737
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
738+
version = "0.7.0"
721739

722740
[[SIMDDualNumbers]]
723741
deps = ["ForwardDiff", "IfElse", "SLEEFPirates", "VectorizationBase"]

benchmark/Project.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
33
Cairo = "159f3aea-2a34-519c-b102-8c37f9878175"
44
ColorSchemes = "35d6a980-a343-548e-a6ea-1d62b119f2f4"
55
Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
6+
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
67
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
78
Fontconfig = "186bb1d3-e1f7-5a2c-a377-96d770f13627"
89
Gadfly = "c91e804a-d5a3-530f-b6f0-dfbca275c004"

docs/src/assets/gemm_Float64_10_500_cascadelake_AVX512__multithreaded.svg

Lines changed: 255 additions & 0 deletions
Loading
12.4 KB
Loading
-7.94 KB
Loading
-24 KB
Loading

docs/src/examples/array_interface.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ df = runbenches(1:24, Float64);
7171
df |> @vlplot(:line, x = :Size, y = :GFLOPS, color = :MulType, height=640,width=960) |> save("sarraymatmul.svg")
7272
```
7373
This yields:
74-
![sarray_benchmarks](../assets/sarraymatmul.png)
74+
![sarray_benchmarks](https://raw.githubusercontent.com/JuliaSIMD/LoopVectorization.jl/docsassets/docs/src/assets/sarraymatmul.svg)
7575
Our `AmulB!` for `MMatrix`es was the fastest at all sizes except `2`x`2`, where it lost out to `AmulB` for `SMatrix`, which in turn was faster than the hundreds of lines of
7676
`StaticArray`s code at all sizes except `3`x`3`, `5`x`5`, and `6`x`6`.
7777

docs/src/examples/dot_product.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ However, there is another bottle neck: we can only perform 2 aligned loads per c
2424
Thus, in 4 clock cycles, we can do up to 8 loads. But each `fma` requires 2 loads, meaning we are limited to 4 of them per 4 clock cycles, and any unrolling beyond 4 gives us no benefit.
2525

2626
Double precision benchmarks pitting Julia's builtin dot product, and code compiled with a variety of compilers:
27-
![dot](../assets/bench_dot_v2.png)
27+
![dot](https://raw.githubusercontent.com/JuliaSIMD/LoopVectorization.jl/docsassets/docs/src/assets/bench_dot_v2.svg)
2828
What we just described is the core of the approach used by all these compilers. The variation in results is explained mostly by how they handle vectors with lengths that are not an integer multiple of `W`. I ran these on a computer with AVX512 so that `W = 8`. LLVM, the backend compiler of both Julia and Clang, shows rapid performance degradation as `N % 4W` increases, where `N` is the length of the vectors.
2929
This is because, to handle the remainder, it uses a scalar loop that runs as written: multiply and add single elements, one after the other.
3030

@@ -52,7 +52,7 @@ end
5252
```
5353
Because we only need a single load per `fma`-instruction, we can now benefit from having 8 separate accumulators.
5454
For this reason, LoopVectorization now unrolls by 8 -- it decides how much to unroll by comparing the bottlenecks on throughput with latency. The other compilers do not change their behavior, so now LoopVectorization has the advantage:
55-
![selfdot](../assets/bench_selfdot_v2.png)
55+
![selfdot](https://github.com/JuliaSIMD/LoopVectorization.jl/raw/docsassets/docs/src/assets/bench_selfdot_v2.svg)
5656
This algorithm may need refinement, because Julia (without LoopVectorization) only unrolls by 4, yet achieves roughly the same performance as LoopVectorization at multiples of `4W = 32`, although performance declines rapidly from there due to the slow scalar loop. Performance for most is much higher -- more GFLOPS -- than the normal dot product, but still under half of the CPU's potential 131.2 GFLOPS, suggesting that some other bottlenecks are preventing the core from attaining 2 fmas per clock cycle.
5757
Note also that `8W = 64`, so we don't really have enough iterations of the loop to amortize the overhead of performing the reductions of all these vectors into a single scalar.
5858
By the time the vectors are long enough to do this, we'll start running into memory bandwidth bottlenecks.

docs/src/examples/filtering.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,16 @@ function filter2davx!(out::AbstractMatrix, A::AbstractMatrix, kern)
1818
end
1919
```
2020
These are effectively four nested loops. For all the benchmarks, `kern` was 3 by 3, making it too small for vectorizing these loops to be particularly profitable. By vectorizing an outer loop instead, it can benefit from SIMD and also avoid having to do a reduction (horizontal addition) of a vector before storing in `out`, as the vectors can then be stored directly.
21-
![dynamicfilter](../assets/bench_filter2d_dynamic_v2.png)
21+
![dynamicfilter](https://github.com/JuliaSIMD/LoopVectorization.jl/raw/docsassets/docs/src/assets/bench_filter2d_dynamic_v2.svg)
2222

2323
LoopVectorization achieved much better performance than all the alternatives, which tried vectorizing the inner loops.
2424
By making the compilers aware that the inner loops are too short to be worth vectorizing, we can get them to vectorize an outer loop instead. By defining the size of `kern` as constant in `C` and `Fortran`, and using size parameters in Julia, we can inform the compilers:
25-
![staticsizefilter](../assets/bench_filter2d_3x3_v2.png)
25+
![staticsizefilter](https://github.com/JuliaSIMD/LoopVectorization.jl/raw/docsassets/docs/src/assets/bench_filter2d_3x3_v2.svg)
2626
Now all are doing much better than they were before, although still well shy of the 131.2 GFLOPS theoretical limit for the host CPU cores. While they all improved, two are lagging behind the main group:
2727
- `ifort` lags behind all the others except base Julia. I'll need to do more investigating to find out why.
2828
- Base Julia. While providing static size information was enough for it to realize vectorizing the inner loops was not worth it, base Julia was seemingly the only one that didn't decide to vectorize an outer loop instead.
2929

3030
Manually unrolling the inner loops allows base Julia to vectorize, while the performance of all non-Julia variants was unchanged:
31-
![unrolledfilter](../assets/bench_filter2d_unrolled_v2.png)
31+
![unrolledfilter](https://github.com/JuliaSIMD/LoopVectorization.jl/raw/docsassets/docs/src/assets/bench_filter2d_unrolled_v2.svg)
3232
LoopVectorization is currently limited to only unrolling two loops (but a third may be vectorized, effectively unrolling it by the length of the vectors). Manually unrolling two of the loops lets up to four loops be unrolled.
3333

docs/src/examples/matrix_multiplication.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,21 +18,21 @@ end
1818
and this can handle all transposed/not-transposed permutations. LoopVectorization will change loop orders and strategy as appropriate based on the types of the input matrices. For each of the others, I wrote separate functions to handle each case.
1919
Letting all three matrices be square and `Size` x `Size`, we attain the following benchmark results:
2020

21-
![AmulB](../assets/bench_AmulB_v2.png)
21+
![AmulB](https://raw.githubusercontent.com/JuliaSIMD/LoopVectorization.jl/docsassets/docs/src/assets/bench_AmulB_v2.svg)
2222
This is classic GEMM, `𝐂 = 𝐀 * 𝐁`. GFortran's intrinsic `matmul` function does fairly well. But all the compilers are well behind LoopVectorization here, which falls behind MKL's `gemm` beyond 70x70 or so. The problem imposed by alignment is also striking: performance is much higher when the sizes are integer multiplies of 8. Padding arrays so that each column is aligned regardless of the number of rows can thus be very profitable. [PaddedMatrices.jl](https://github.com/JuliaSIMD/PaddedMatrices.jl) offers just such arrays in Julia. I believe that is also what the [-pad](https://software.intel.com/en-us/fortran-compiler-developer-guide-and-reference-pad-qpad) compiler flag does when using Intel's compilers.
2323

24-
![AmulBt](../assets/bench_AmulBt_v2.png)
24+
![AmulBt](https://github.com/JuliaSIMD/LoopVectorization.jl/raw/docsassets/docs/src/assets/bench_AmulBt_v2.svg)
2525
The optimal pattern for `𝐂 = 𝐀 * 𝐁ᵀ` is almost identical to that for `𝐂 = 𝐀 * 𝐁`. Yet, gfortran's `matmul` intrinsic stumbles, surprisingly doing much worse than gfortran + loops, and almost certainly worse than allocating memory for `𝐁ᵀ` and creating the explicit copy.
2626

2727
ifort did equally well whether or not `𝐁` was transposed, while LoopVectorization's performance degraded slightly faster as a function of size in the transposed case, because strides between memory accesses are larger when `𝐁` is transposed. But it still performed best of all the compiled loops over this size range, losing out to MKL and eventually OpenBLAS.
2828
icc interestingly does better when it is transposed.
2929

3030
GEMM is easiest when the matrix `𝐀` is not transposed (assuming column-major memory layouts), because then you can sum up columns of `𝐀` to store into `𝐂`. If `𝐀` were transposed, then we cannot efficiently load contiguous elements from `𝐀` that can best be stored directly in `𝐂`. So for `𝐂 = 𝐀ᵀ * 𝐁`, contiguous vectors along the `k`-loop have to be reduced, adding some overhead.
31-
![AtmulB](../assets/bench_AtmulB_v2.png)
31+
![AtmulB](https://github.com/JuliaSIMD/LoopVectorization.jl/raw/docsassets/docs/src/assets/bench_AtmulB_v2.svg)
3232
Packing is critical for performance here. LoopVectorization does not pack, therefore it is well behind MKL and OpenBLAS, which do. Eigen packs, but is poorly optimized for this CPU architecture.
3333

3434
When both `𝐀` and ` 𝐁` are transposed, we now have `𝐂 = 𝐀ᵀ * 𝐁ᵀ = (𝐁 * 𝐀)ᵀ`.
35-
![AtmulBt](../assets/bench_AtmulBt_v2.png)
35+
![AtmulBt](https://github.com/JuliaSIMD/LoopVectorization.jl/raw/docsassets/docs/src/assets/bench_AtmulBt_v2.svg)
3636
Julia, Clang, and gfortran all struggled to vectorize this, because none of the matrices share a contiguous access: `M` for `𝐂`, `K` for `𝐀ᵀ`, and `N` for `𝐁ᵀ`. However, LoopVectorization and all the specialized matrix multiplication functions managed to do about as well as normal; transposing while storing the results takes negligible amounts of time relative to the matrix multiplication itself.
3737
The ifort-loop version also did fairly well.
3838

0 commit comments

Comments
 (0)