Skip to content

Commit f40efc8

Browse files
committed
Added info and simplified examples
1 parent 21b699d commit f40efc8

File tree

3 files changed

+31
-26
lines changed

3 files changed

+31
-26
lines changed

examples/example-paper-implicitglobalgrid/EXP_test_halo_thr.jl

Lines changed: 23 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1-
1+
# IMPORTANT: To replicate the same results as in the original paper, divide the GB/s values by 3
2+
# this difference is due to the STREAM benchmark policy, which counts each byte transferred three times (because of write-allocate cache behavior), and the target therefore also counts write-allocate traffic
3+
# in our example the policy is to measure throughput as the number of bytes transferred during one execution, which is a different criterion
4+
# Nevertheless, the percentages remain constant and the test is valid regardless of the used criterion
25
# NOTE: All tests of this file can be run with any number of processes.
36
# Nearly all of the functionality can however be verified with one single process
47
# (thanks to the usage of periodic boundaries in most of the full halo update tests).
@@ -65,25 +68,26 @@ dz = 1.0
6568
P2 = zeros(size(P));
6669
halowidths = (1,1,1)
6770
# (dim=3)
68-
buf = zeros(size(P ,1), size(P,2), halowidths[3]);
69-
ranges = [1:size(P,1), 1:size(P,2), 1:1];
71+
buf = zeros(size(P ,1), size(P,2), halowidths[3]);
72+
ranges = [1:size(P,1), 1:size(P,2), 1:1];
7073

71-
i = 0
72-
@define_eff_memory_throughput ratio=0.9 begin
73-
(nx * ny * 8) * 3 * MPI.Comm_size(MPI.COMM_WORLD) / :median_time
74-
end
75-
@auxiliary_metric name="Time" units="s" begin
76-
:median_time
77-
end
78-
@auxiliary_metric name="MPI" units="size" begin
79-
MPI.Comm_size(MPI.COMM_WORLD);
80-
end
81-
for i in 1:1
82-
GG.write_h2h!(buf, P, ranges, 3);
83-
end
84-
@perftest samples=100 begin
85-
GG.read_h2h!(buf, P2, ranges, 3);
86-
end
74+
i = 0
75+
# For info on the x3 multiplier see the beginning of this file
76+
@define_eff_memory_throughput ratio=0.9 begin
77+
(nx * ny * 8) * 3 * MPI.Comm_size(MPI.COMM_WORLD) / :median_time
78+
end
79+
@auxiliary_metric name="Time" units="s" begin
80+
:median_time
81+
end
82+
@auxiliary_metric name="MPI" units="size" begin
83+
MPI.Comm_size(MPI.COMM_WORLD);
84+
end
85+
for i in 1:1
86+
GG.write_h2h!(buf, P, ranges, 3);
87+
end
88+
@perftest samples=100 begin
89+
GG.read_h2h!(buf, P2, ranges, 3);
90+
end
8791
finalize_global_grid(finalize_MPI=false);
8892
end;
8993
end;

examples/example-paper-implicitglobalgrid/EXP_test_halo_thr_GPU.jl

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
2-
31
push!(LOAD_PATH, "../src")
42
using Test
53
using PerfTest
@@ -143,7 +141,7 @@ dz = 1.0
143141
device_data = CUDA.zeros(Float32, length(host_data))
144142
# Benchmark
145143
b = @benchmark begin CUDA.@sync(copyto!($device_data, $host_data))end;
146-
size_bytes * 2. / (median(b.times) / 1e9)
144+
size_bytes / (median(b.times) / 1e9)
147145
else
148146
0
149147
end
@@ -157,7 +155,7 @@ dz = 1.0
157155
host_data = rand(Float32, div(size_bytes, sizeof(Float32)))
158156
device_data = AMDGPU.zeros(Float32, length(host_data))
159157
b = @benchmark begin AMDGPU.@sync(copyto!($device_data, $host_data)) end;
160-
size_bytes * 2. / (median(b.times) / 1.0e9)
158+
size_bytes / (median(b.times) / 1.0e9)
161159
else
162160
0
163161
end
@@ -185,8 +183,9 @@ dz = 1.0
185183
buf = zeros(size(P, 1), size(P, 2), halowidths[3])
186184
ranges = [1:size(P, 1), 1:size(P, 2), 1:1]
187185

186+
# See the beginning of EXP_test_halo_thr.jl for more info on the x3 multiplier
188187
@define_eff_memory_throughput begin
189-
(nx * ny * 8) * 2 * MPI.Comm_size(MPI.COMM_WORLD) / :median_time
188+
(nx * ny * 8) * 3 * MPI.Comm_size(MPI.COMM_WORLD) / :median_time
190189
end
191190

192191
i = 0
@@ -220,7 +219,7 @@ dz = 1.0
220219
nblocks = Tuple(ceil.(Int, halosize ./ nthreads))
221220
custream = stream();
222221
@define_eff_memory_throughput custom_benchmark=GPUBandwidthCUDA begin
223-
(nx * ny * 8) * 2 * MPI.Comm_size(MPI.COMM_WORLD) / :median_time
222+
(nx * ny * 8) * MPI.Comm_size(MPI.COMM_WORLD) / :median_time
224223
end
225224
@perftest begin
226225
CUDA.@sync GG.read_h2d_async!(buf, P2, ranges, custream)
@@ -236,7 +235,7 @@ dz = 1.0
236235
nblocks = Tuple(ceil.(Int, halosize ./ nthreads))
237236
rocstream = AMDGPU.HIPStream()
238237
@define_eff_memory_throughput custom_benchmark=GPUBandwidthROC begin
239-
(nx * ny * 8) * 2 * MPI.Comm_size(MPI.COMM_WORLD) / :median_time
238+
(nx * ny * 8) * MPI.Comm_size(MPI.COMM_WORLD) / :median_time
240239
end
241240
@perftest begin
242241
GG.read_h2d_async!(buf, P2, ranges, rocstream)

src/execution/macros/customs.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ auxiliary_metric_validation = define_metric_validation
8080
# Arguments
8181
- `name` : the name of the metric for identification purposes.
8282
- `units` : the unit space that the metric values will be in.
83+
- `mem_benchmark` : which STREAM kernel benchmark to use (e.g. `:MEM_STREAM_COPY` for transfer operations, `:MEM_STREAM_ADD` for transfer plus computation)
84+
- `custom_benchmark` : in case of using a custom benchmark, the symbol that identifies the chosen benchmark (must have been defined beforehand)
8385
- formula block : an expression that returns a single value, which would be the metric value. The formula can have any julia expression inside and additionally some special symbols are supported. The formula may be evaluated several times, so its applied to every target in every test set or just once, if the formula is defined inside a test set, which makes it only applicable to it.
8486
8587
# Special symbols:

0 commit comments

Comments
 (0)