7 changes: 5 additions & 2 deletions docs/make.jl
@@ -38,10 +38,13 @@ makedocs(;
pages = [
"Introduction" => "index.md",
"Usage" => "usage.md",
"Macros" => "macros.md",
"Examples" => [hide("..." => "examples.md"),
"examples/memcopyCellArray3D.md",
"examples/memcopyCellArray3D_ParallelStencil.md",
"examples/mock2-memorythroughput.md",
"examples/mock3-roofline.md",
"examples/mock4-recursive.md",
],
"Limitations" => "limitations.md",
"API reference" => "api.md",
],
)
56 changes: 56 additions & 0 deletions docs/src/examples/mock2-memorythroughput.md
@@ -0,0 +1,56 @@

# Effective memory throughput

This mock example explains how to set up the effective memory throughput methodology.

The memory bandwidth test is most appropriate for functions that primarily move data, where computation plays a secondary role in performance.

```julia

using Test
using PerfTest


@perftest_config "
[regression]
enabled = false
"


# TEST TARGET
function copyvec_squared(a :: Vector{Int})

b = zeros(length(a))

for i in 1:length(a)
b[i] = a[i] * a[i]
end

return b
end

@testset "Example" begin

N = 1000000

a = rand(Int, N)

# To use a variable in macro formulas, it has to be exported (e.g. N)
@export_vars N
# The ratio sets the threshold: 1.0 is the maximum empirical bandwidth, so 0.6 means 60% of that maximum
@define_eff_memory_throughput ratio=0.6 begin
# The main block of the macro holds the formula for the bandwidth (i.e. BYTES divided by SECONDS) of
# one single execution of the test target.
# In this case, per execution, N elements of 4 bytes are read from memory + written
# to cache + written to memory (copy-on-write assumption by default).
# The median time is considered an adequate measure for the denominator in this case.
# THUS:
N * 4 * 3 / :median_time
end

# Set the target
b = @perftest copyvec_squared(a)
end
```

This test fails to meet the expectation.
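
For intuition, the bandwidth formula above can be evaluated by hand. The following is a minimal sketch, assuming an illustrative median runtime (in a real run the median time is measured by PerfTest):

```julia
# Back-of-the-envelope reading of the bandwidth formula used above.
N = 1_000_000                 # number of elements (as exported in the test)
bytes_moved = N * 4 * 3       # read from memory + write to cache + write to memory
median_time = 2.0e-3          # assumed median runtime in seconds (illustrative only)
bandwidth = bytes_moved / median_time   # ≈ 6.0e9 B/s, compared against 60% of the peak
```
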
51 changes: 51 additions & 0 deletions docs/src/examples/mock3-roofline.md
@@ -0,0 +1,51 @@


# Roofline

This mock example explains how to set up the roofline methodology.


```julia
using Test
using PerfTest

# Disable regression and enable verbosity to see successful tests
@perftest_config "
[regression]
enabled = false
[general]
verbose = true
"


# TEST TARGET
function polynom(x :: Float64, coeff :: Vector{Float64})

res = 0.

for i in length(coeff):-1:1
res += coeff[i] * x ^ i
end

return res
end

@testset "Example" begin

N = 50

coeff = rand(N)
x = 1.0

# To use a variable in macro formulas, it has to be exported (e.g. N)
@export_vars N
@roofline actual_flops=:autoflop target_ratio=0.1 begin
:autoflop / ((1 + N)*4)
end

# Set the target
res = @perftest polynom(x, coeff)
end
```

This test succeeds in meeting the expectation.
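
As a rough reading of the operational intensity defined above, the arithmetic can be sketched as follows (the flop count in the numerator is supplied by `:autoflop` at run time, so the value used here is illustrative only):

```julia
# Illustrative evaluation of the operational-intensity formula used above.
N = 50
bytes = (1 + N) * 4         # = 204 bytes of assumed data traffic per call to polynom
flops = 150                 # made-up flop count; the real one comes from :autoflop
intensity = flops / bytes   # ≈ 0.74 flop/byte, which PerfTest places on the roofline
```
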
33 changes: 33 additions & 0 deletions docs/src/examples/mock4-recursive.md
@@ -0,0 +1,33 @@


# Recursive Suite Generation

This mock example explains how PerfTest can recursively integrate files into a performance test suite.


```julia
using Test
using PerfTest


# Note: the config here disables verbosity but the nested files enable it; inside a nested file, its own config takes priority
@perftest_config "
[regression]
enabled = false
[general]
verbose = false
"

@testset "A" begin
@testset "A.1" begin
# Check that the elapsed time is less than one second; this applies to the targets inside this testset
@perfcompare :median_time < 1
# Here "mock3-roofline.jl" is a file containing the roofline mock example source code.
include("mock3-roofline.jl")
end
include("mock3-roofline.jl")
end
```
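
As a reminder of why the nested file's settings win inside its own scope, the included "mock3-roofline.jl" (shown in the previous example) carries its own `@perftest_config`, which re-enables verbosity:

```julia
# Excerpt from "mock3-roofline.jl" (see the roofline mock example above):
# within the included file, this config takes priority over the outer one.
@perftest_config "
[regression]
enabled = false
[general]
verbose = true
"
```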

This executes the same roofline test under two different hierarchies; in addition, the first one carries an extra
performance metric assertion via @perfcompare. All tests are successful.
14 changes: 14 additions & 0 deletions docs/src/limitations.md
@@ -0,0 +1,14 @@
# Limitations and future work:

There are a couple of things to take into consideration when using the package:

1. GPU testing is technically supported, but it requires more setup effort from the developer since, as of now, the automatic measurements other than elapsed time do not apply to GPUs.
2. The automatic flop counting feature works exclusively for native Julia functions; it cannot measure the flops in C calls due to limitations of the underlying package [CountFlops.jl].


## Features to be expected in the next versions:

1. Easier access to performance suite results after execution
2. Simplification of the package structure, with an emphasis on making the package easy to extend for developers unfamiliar with it (hopefully by June 2025)
3. Access to performance counter values through LIKWID (hopefully by June 2025)
4. Alternative regression testing against git commits instead of the last execution, for easier testing
39 changes: 39 additions & 0 deletions docs/src/macros.md
@@ -0,0 +1,39 @@
```@meta
CurrentModule = PerfTest
```

# PerfTest macros quick reference

The following are the main macros used to define performance test suites. They should always be used
inside a testset (see the [Test] package). Combining the different macros listed in this section gives
access to the full extent of the package's features.
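
As a minimal sketch of how a few of these macros combine inside a testset (the target, formula, and threshold below are illustrative only; see the Examples for complete suites):

```julia
using Test, PerfTest

@testset "Unit under test" begin
    N = 1_000
    a = rand(N)

    @export_vars N                                  # make N usable in macro formulas
    @define_eff_memory_throughput ratio=0.5 begin   # expect at least 50% of peak bandwidth
        N * 8 * 2 / :median_time                    # assumed bytes moved per run / seconds
    end

    result = @perftest sum(a)                       # the test target
end
```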

## Declaring test targets

```@docs
@perftest
```

## Declaring metrics

```@docs
@define_metric
@auxiliary_metric
```

## Declaring methodologies

```@docs
@perfcompare
@define_eff_memory_throughput
@roofline
```

## Structure and configuration

```@docs
@perftest_config
@on_perftest_exec
@on_perftest_ignore
@export_vars
```
41 changes: 39 additions & 2 deletions docs/src/usage.MD
@@ -1,10 +1,47 @@
# Usage
Have a look at the [Examples](@ref) and see the [API reference](@ref) for details on the usage of `PerfTest`.

`PerfTest` provides a set of macros to instrument ordinary Julia test files with performance tests. The idea is to make it possible to have a functional suite and a performance suite in the same place.

The underlying idea of declaring performance tests can be boiled down to the following:

1. Have a @testset that groups tests for a software unit
2. Tell PerfTest which target is to be tested by using the @perftest macro
3. Tell PerfTest how the target shall be tested: which metrics are of interest and which metric values would be considered a failure. This can be declared using the metric and methodology macros (see Macros)

The following dummy example embodies the paradigm of the package:

```julia
using ExampleModule: innerProduct  # Importing the test target
using Test, PerfTest               # Test and performance-test libraries
@testset "Roofline Test" begin
a,b = rand(1e6),rand(1e6)

@roofline actual_flops=:autoflop target_ratio=0.5
:autoflop / (2 * 8 * 1e6)
end

@perftest innerProduct(a, b)
@test innerProduct(a,b) == sum(a .* b)
end
```

The following things can be appreciated in this example:
1. This is a combined functional and performance unit test suite (both @test and @perftest are present)
2. The target of the perftest is the innerProduct function
3. The performance test methodology is a roofline model: the developer expects innerProduct to reach at least 50% of the maximum flop performance set by the roofline. The operational intensity is defined in the main block of the macro. :autoflop is a symbol that enables the automatic flop counting feature; a rough reading of the operational intensity is sketched below.
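
The operational intensity above can be read as follows (a sketch of the arithmetic only, not PerfTest API; the exact flop count comes from :autoflop at run time):

```julia
# Operational intensity of the inner product of two Float64 vectors of 10^6 elements.
flops = 2 * 10^6           # roughly 1e6 multiplications + 1e6 additions
bytes = 2 * 8 * 10^6       # two vectors × 8 bytes per Float64 × 10^6 elements
intensity = flops / bytes  # = 0.125 flop/byte
```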


For more information, have a look at the [Examples](@ref) and see the [API reference](@ref) for details on the usage of `PerfTest`.


## Installation

`PerfTest` can be installed directly with the [Julia package manager] from the [Julia REPL]:
```julia-repl
julia> ]
pkg> add PerfTest
pkg> add https://github.com/JuliaPerf/PerfTest.jl.git
```

8 changes: 3 additions & 5 deletions src/execution/machine_benchmarking.jl
@@ -1,5 +1,4 @@

using BenchmarkTools: _run
# Memory and CPU benchmarks used by different methodologies

function getMachineInfo()::Expr
@@ -36,6 +35,7 @@ using Base.Threads
copy_kernel(C,A;kwargs...) = STREAMBenchmark.copy_nthreads(C,A;kwargs...)
add_kernel(C,A,B;kwargs...) = STREAMBenchmark.add_nthreads(C,A,B;kwargs...)

# This function is heavily based on the respective from STREAMBenchmark
function _run_kernels(copy, add;
verbose = true,
N,
@@ -77,22 +77,20 @@ function _run_kernels(copy, add;
# COPY
t_copy = @belapsed $copy($C, $A; nthreads = $nthreads, thread_indices = $thread_indices) samples=10 evals=evals_per_sample
bw_copy = f(t_copy)
verbose && println("╟─ COPY: ", round(bw_copy; digits = 1), "B/s")

# ADD
t_add = @belapsed $add($C, $A, $B; nthreads = $nthreads,
thread_indices=$thread_indices) samples = 10 evals = evals_per_sample
bw_add = g(t_add)
verbose && println("╟─ ADD: ", round(bw_add; digits=1), "B/s")

return (bw_copy,bw_add)
end


function measureMemBandwidth!(::Type{<:NormalMode}, _PRFT_GLOBAL::Dict{Symbol,Any})
bench_data = _run_kernels(copy_kernel, add_kernel; N=div(_PRFT_GLOBAL[:machine][:cache_sizes][end], 2))
# In B/s
peakbandwidth = bench_data .* 1e6
# in Bytes/sec
peakbandwidth = bench_data
_PRFT_GLOBAL[:machine][:empirical][:peakmemBW] = Dict{Symbol, Number}()
_PRFT_GLOBAL[:machine][:empirical][:peakmemBW][:COPY] = peakbandwidth[1]
_PRFT_GLOBAL[:machine][:empirical][:peakmemBW][:ADD] = peakbandwidth[2]
4 changes: 2 additions & 2 deletions src/execution/printing.jl
@@ -124,7 +124,7 @@ function printMetric(metric :: Metric_Result, test:: Metric_Test, tab::Int)
end
print(@lpad(tab) *"METRIC ")
p_blue("$(metric.name)")
print(" ["* metric.units *"]:")
print(" ["* metric.magnitude_prefix * metric.units *"]:")
# MPI info
if !(metric.mpi isa Nothing)
print(" "^10)
@@ -166,7 +166,7 @@ end
This function is used to dump metric information regarding auxiliary metrics, which are not used in testing.
"""
function auxiliarMetricPrint(metric :: Metric_Result, tab::Int)
println(" " ^ tab * "Metric: " * metric.name * " [" * metric.units * "]")
println(" "^tab * "Metric: " * metric.name * " [" * metric.magnitude_prefix * metric.units * "]")
println(" " ^ tab * " = ", metric.value)
println("")
end
9 changes: 6 additions & 3 deletions src/transform/methodologies/mem_bandwidth.jl
@@ -63,13 +63,13 @@ function buildMemTRPTMethodology(context :: Context)::Expr
aux_abs_value = newMetricResult(
$mode,
name="Attained Bandwidth",
units="GB/s",
units="B/s",
value=_PRFT_LOCAL[:metrics][:effMemTP].value
)
aux_ref_value = newMetricResult(
$mode,
name="Peak empirical bandwidth",
units="GB/s",
units="B/s",
value=_PRFT_GLOBAL[:machine][:empirical][:peakmemBW][$(QuoteNode(info[:mem_benchmark]))]
)

@@ -81,8 +81,11 @@
methodology_res.custom_elements[:abs] = magnitudeAdjust(aux_abs_value)
methodology_res.custom_elements[:abs_ref] = magnitudeAdjust(aux_ref_value)

@show _PRFT_LOCAL[:metrics][:effMemTP].value, _PRFT_GLOBAL[:machine][:empirical][:peakmemBW][$(QuoteNode(info[:mem_benchmark]))]
@show test,value,result

# Printing
if $(Configuration.CONFIG["general"]["verbose"]) || !(flop_test.succeeded)
if $(Configuration.CONFIG["general"]["verbose"]) || !(test.succeeded)
PerfTest.printMethodology(methodology_res, $(length(context._local.depth_record)), $(Configuration.CONFIG["general"]["plotting"]))
end
