Use Refs in README microbenchmark to avoid overoptimization

c42f · c42f · commit a7e9cbaa76c8 · 2019-09-20T17:33:12.000+10:00
Julia 1.2 will optimize some of these benchmarks away, so load the benchmark
arguments by dereferencing Refs to at least prevent that particular error.

Update the README a bit with these results.
diff --git a/README.md b/README.md
@@ -27,22 +27,30 @@ Full documentation can be found [here](https://JuliaArrays.github.io/StaticArray
 ## Speed
 
 The speed of *small* `SVector`s, `SMatrix`s and `SArray`s is often > 10 × faster
-than `Base.Array`. See this simplified benchmark:
+than `Base.Array`. For example, here's a
+[microbenchmark](perf/README_benchmarks.jl) showing some common operations.
 
 ```
 ============================================
     Benchmarks for 3×3 Float64 matrices
 ============================================
-Matrix multiplication               -> 5.1x speedup
-Matrix multiplication (mutating)    -> 1.6x speedup
-Matrix addition                     -> 14.0x speedup
-Matrix addition (mutating)          -> 2.1x speedup
-Matrix determinant                  -> 119.3x speedup
-Matrix inverse                      -> 65.6x speedup
-Matrix symmetric eigendecomposition -> 24.8x speedup
-Matrix Cholesky decomposition       -> 12.1x speedup
+Matrix multiplication               -> 5.9x speedup
+Matrix multiplication (mutating)    -> 1.8x speedup
+Matrix addition                     -> 33.1x speedup
+Matrix addition (mutating)          -> 2.5x speedup
+Matrix determinant                  -> 112.9x speedup
+Matrix inverse                      -> 67.8x speedup
+Matrix symmetric eigendecomposition -> 25.0x speedup
+Matrix Cholesky decomposition       -> 8.8x speedup
+Matrix LU decomposition             -> 6.1x speedup
+Matrix QR decomposition             -> 65.0x speedup
 ```
 
+These numbers were generated on an Intel i7-7700HQ using Julia 1.2. As with all
+synthetic benchmarks, the speedups you see here should only be taken as very
+roughly indicative of the speedup you may see in real code. When in doubt,
+benchmark your real application!
+
 Note that in the current implementation, working with large `StaticArray`s puts a
 lot of stress on the compiler, and becomes slower than `Base.Array` as the size
 increases.  A very rough rule of thumb is that you should consider using a
diff --git a/perf/README_benchmarks.jl b/perf/README_benchmarks.jl
@@ -18,22 +18,42 @@ function simple_bench(N, T=Float64)
 ============================================
 """)
     ops = [
-        ("Matrix multiplication              ", *, (A, A), (SA, SA)),
-        ("Matrix multiplication (mutating)   ", mul!, (B, A, A), (MB, MA, MA)),
-        ("Matrix addition                    ", +, (A, A), (SA, SA)),
-        ("Matrix addition (mutating)         ", add!, (B, A, A), (MB, MA, MA)),
-        ("Matrix determinant                 ", det, A, SA),
-        ("Matrix inverse                     ", inv, A, SA),
-        ("Matrix symmetric eigendecomposition", eigen, A, SA),
-        ("Matrix Cholesky decomposition      ", cholesky, A, SA)
+           ("Matrix multiplication              ", *,        (A, A),     (SA, SA)),
+           ("Matrix multiplication (mutating)   ", mul!,     (B, A, A),  (MB, MA, MA)),
+           ("Matrix addition                    ", +,        (A, A),     (SA, SA)),
+           ("Matrix addition (mutating)         ", add!,     (B, A, A),  (MB, MA, MA)),
+           ("Matrix determinant                 ", det,      (A,),       (SA,)),
+           ("Matrix inverse                     ", inv,      (A,),       (SA,)),
+           ("Matrix symmetric eigendecomposition", eigen,    (A,),       (SA,)),
+           ("Matrix Cholesky decomposition      ", cholesky, (A,),       (SA,)),
+           ("Matrix LU decomposition            ", lu,       (A,),       (SA,)),
+           ("Matrix QR decomposition            ", qr,       (A,),       (SA,)),
     ]
     for (name, op, Aargs, SAargs) in ops
-        if Aargs isa Tuple && length(Aargs) == 2
-            speedup = @belapsed($op($Aargs[1], $Aargs[2])) / @belapsed($op($SAargs[1], $SAargs[2]))
-        elseif Aargs isa Tuple && length(Aargs) == 3
-            speedup = @belapsed($op($Aargs[1], $Aargs[2], $Aargs[3])) / @belapsed($op($SAargs[1], $SAargs[2], $SAargs[3]))
+        # We load from Refs here to avoid the compiler completely removing the
+        # benchmark in some cases.
+        #
+        # Like any microbenchmark, the speedups you see here should only be
+        # taken as roughly indicative of the speedup you may see in real code.
+        if length(Aargs) == 1
+            A1  = Ref(Aargs[1])
+            SA1 = Ref(SAargs[1])
+            speedup = @belapsed($op($A1[])) / @belapsed($op($SA1[]))
+        elseif length(Aargs) == 2
+            A1  = Ref(Aargs[1])
+            A2  = Ref(Aargs[2])
+            SA1 = Ref(SAargs[1])
+            SA2 = Ref(SAargs[2])
+            speedup = @belapsed($op($A1[], $A2[])) / @belapsed($op($SA1[], $SA2[]))
+        elseif length(Aargs) == 3
+            A1  = Ref(Aargs[1])
+            A2  = Ref(Aargs[2])
+            A3  = Ref(Aargs[3])
+            SA1 = Ref(SAargs[1])
+            SA2 = Ref(SAargs[2])
+            SA3 = Ref(SAargs[3])
+            speedup = @belapsed($op($A1[], $A2[], $A3[])) / @belapsed($op($SA1[], $SA2[], $SA3[]))
         else
-            speedup = @belapsed($op($Aargs)) / @belapsed($op($SAargs))
         end
         println(name*" -> $(round(speedup, digits=1))x speedup")
     end