Skip to content

Commit f40efc8

Browse files
committed
Added info and simplified examples
1 parent 21b699d commit f40efc8

File tree

3 files changed

+31
-26
lines changed

3 files changed

+31
-26
lines changed

examples/example-paper-implicitglobalgrid/EXP_test_halo_thr.jl

Lines changed: 23 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1-
1+
# IMPORTANT: To replicate the same results as in the original paper, divide the GB/s values by 3
2+
# this difference is due to the STREAM benchmark policy, which counts each byte transferred three times (because of write-allocate cache behavior), and the target therefore also counts write-allocate traffic
3+
# in our example the policy is to measure throughput as the number of bytes transferred during one execution, which is a different criterion
4+
# Nevertheless, the percentages remain constant and the test is valid regardless of the used criterion
25
# NOTE: All tests of this file can be run with any number of processes.
36
# Nearly all of the functionality can however be verified with one single process
47
# (thanks to the usage of periodic boundaries in most of the full halo update tests).
@@ -65,25 +68,26 @@ dz = 1.0
6568
P2 = zeros(size(P));
6669
halowidths = (1,1,1)
6770
# (dim=3)
68-
buf = zeros(size(P ,1), size(P,2), halowidths[3]);
69-
ranges = [1:size(P,1), 1:size(P,2), 1:1];
71+
buf = zeros(size(P ,1), size(P,2), halowidths[3]);
72+
ranges = [1:size(P,1), 1:size(P,2), 1:1];
7073

71-
i = 0
72-
@define_eff_memory_throughput ratio=0.9 begin
73-
(nx * ny * 8) * 3 * MPI.Comm_size(MPI.COMM_WORLD) / :median_time
74-
end
75-
@auxiliary_metric name="Time" units="s" begin
76-
:median_time
77-
end
78-
@auxiliary_metric name="MPI" units="size" begin
79-
MPI.Comm_size(MPI.COMM_WORLD);
80-
end
81-
for i in 1:1
82-
GG.write_h2h!(buf, P, ranges, 3);
83-
end
84-
@perftest samples=100 begin
85-
GG.read_h2h!(buf, P2, ranges, 3);
86-
end
74+
i = 0
75+
# For info on the x3 multiplier see the beginning of this file
76+
@define_eff_memory_throughput ratio=0.9 begin
77+
(nx * ny * 8) * 3 * MPI.Comm_size(MPI.COMM_WORLD) / :median_time
78+
end
79+
@auxiliary_metric name="Time" units="s" begin
80+
:median_time
81+
end
82+
@auxiliary_metric name="MPI" units="size" begin
83+
MPI.Comm_size(MPI.COMM_WORLD);
84+
end
85+
for i in 1:1
86+
GG.write_h2h!(buf, P, ranges, 3);
87+
end
88+
@perftest samples=100 begin
89+
GG.read_h2h!(buf, P2, ranges, 3);
90+
end
8791
finalize_global_grid(finalize_MPI=false);
8892
end;
8993
end;

examples/example-paper-implicitglobalgrid/EXP_test_halo_thr_GPU.jl

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
2-
31
push!(LOAD_PATH, "../src")
42
using Test
53
using PerfTest
@@ -143,7 +141,7 @@ dz = 1.0
143141
device_data = CUDA.zeros(Float32, length(host_data))
144142
# Benchmark
145143
b = @benchmark begin CUDA.@sync(copyto!($device_data, $host_data))end;
146-
size_bytes * 2. / (median(b.times) / 1e9)
144+
size_bytes / (median(b.times) / 1e9)
147145
else
148146
0
149147
end
@@ -157,7 +155,7 @@ dz = 1.0
157155
host_data = rand(Float32, div(size_bytes, sizeof(Float32)))
158156
device_data = AMDGPU.zeros(Float32, length(host_data))
159157
b = @benchmark begin AMDGPU.@sync(copyto!($device_data, $host_data)) end;
160-
size_bytes * 2. / (median(b.times) / 1.0e9)
158+
size_bytes / (median(b.times) / 1.0e9)
161159
else
162160
0
163161
end
@@ -185,8 +183,9 @@ dz = 1.0
185183
buf = zeros(size(P, 1), size(P, 2), halowidths[3])
186184
ranges = [1:size(P, 1), 1:size(P, 2), 1:1]
187185

186+
# See the beginning of EXP_test_halo_thr.jl for more info on the x3 multiplier
188187
@define_eff_memory_throughput begin
189-
(nx * ny * 8) * 2 * MPI.Comm_size(MPI.COMM_WORLD) / :median_time
188+
(nx * ny * 8) * 3 * MPI.Comm_size(MPI.COMM_WORLD) / :median_time
190189
end
191190

192191
i = 0
@@ -220,7 +219,7 @@ dz = 1.0
220219
nblocks = Tuple(ceil.(Int, halosize ./ nthreads))
221220
custream = stream();
222221
@define_eff_memory_throughput custom_benchmark=GPUBandwidthCUDA begin
223-
(nx * ny * 8) * 2 * MPI.Comm_size(MPI.COMM_WORLD) / :median_time
222+
(nx * ny * 8) * MPI.Comm_size(MPI.COMM_WORLD) / :median_time
224223
end
225224
@perftest begin
226225
CUDA.@sync GG.read_h2d_async!(buf, P2, ranges, custream)
@@ -236,7 +235,7 @@ dz = 1.0
236235
nblocks = Tuple(ceil.(Int, halosize ./ nthreads))
237236
rocstream = AMDGPU.HIPStream()
238237
@define_eff_memory_throughput custom_benchmark=GPUBandwidthROC begin
239-
(nx * ny * 8) * 2 * MPI.Comm_size(MPI.COMM_WORLD) / :median_time
238+
(nx * ny * 8) * MPI.Comm_size(MPI.COMM_WORLD) / :median_time
240239
end
241240
@perftest begin
242241
GG.read_h2d_async!(buf, P2, ranges, rocstream)

src/execution/macros/customs.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ auxiliary_metric_validation = define_metric_validation
8080
# Arguments
8181
- `name` : the name of the metric for identification purposes.
8282
- `units` : the unit space that the metric values will be in.
83+
- `mem_benchmark` : which STREAM kernel benchmark to use (e.g. `:MEM_STREAM_COPY` for transfer operations, `:MEM_STREAM_ADD` for transfer plus computation)
84+
- `custom_benchmark` : in case of using a custom benchmark, the symbol that identifies the chosen benchmark (must have been defined beforehand)
8385
- formula block : an expression that returns a single value, which would be the metric value. The formula can have any julia expression inside and additionally some special symbols are supported. The formula may be evaluated several times, so its applied to every target in every test set or just once, if the formula is defined inside a test set, which makes it only applicable to it.
8486
8587
# Special symbols:

0 commit comments

Comments
 (0)