# thermo_bench.jl
#=
julia --project=.buildkite
using Revise; include(joinpath("benchmarks", "scripts", "thermo_bench.jl"))
This benchmark requires Thermodynamics and ClimaParams
to be in your local environment to run.
# Benchmark results:
Clima A100:
```
[ Info: device = ClimaComms.CUDADevice()
Problem size: (4, 4, 1, 63, 5400), N reads-writes: 9, N-reps: 100, Float_type = Float32, Device_bandwidth_GBs=2039
┌──────────────────────────────────────────────────────────────┬───────────────────────────────────┬─────────┬─────────────┐
│ funcs │ time per call │ bw % │ achieved bw │
├──────────────────────────────────────────────────────────────┼───────────────────────────────────┼─────────┼─────────────┤
│ TB.thermo_func_bc!(x, thermo_params, us; nreps=100, bm) │ 586 microseconds, 353 nanoseconds │ 15.2644 │ 311.242 │
│ TB.thermo_func_sol!(x_vec, thermo_params, us; nreps=100, bm) │ 293 microseconds, 796 nanoseconds │ 30.4645 │ 621.171 │
│ TB.thermo_func_bc!(x, thermo_params, us; nreps=100, bm) │ 586 microseconds, 138 nanoseconds │ 15.27 │ 311.356 │
│ TB.thermo_func_sol!(x_vec, thermo_params, us; nreps=100, bm) │ 293 microseconds, 755 nanoseconds │ 30.4687 │ 621.258 │
└──────────────────────────────────────────────────────────────┴───────────────────────────────────┴─────────┴─────────────┘
```
=#
#! format: off
module ThermoBench
import CUDA
include("benchmark_utils.jl")
import ClimaCore
import CUDA
using ClimaComms
using Test
using StaticArrays, IntervalSets, LinearAlgebra
using JET
import ClimaCore: Spaces, Fields
import ClimaCore.Domains: Geometry
"""
    make_thermo_state(thermo_params, ρ, e_int, q_tot)

Build the pointwise thermodynamic state stored by this benchmark.

Calls `TD.saturation_adjustment` with the `TD.ρe()` formulation and `maxiter = 3`,
then `TD.air_pressure` using the adjusted temperature and the liquid/ice
specific humidities taken from the adjustment result.

# Arguments
- `thermo_params`: parameter set forwarded to the `TD` calls.
- `ρ`, `e_int`, `q_tot`: inputs forwarded to `TD.saturation_adjustment`; `ρ` and
  `q_tot` are also forwarded to `TD.air_pressure`.

# Returns
A `NamedTuple` with fields `(ρ, p, e_int, q_tot, T)`.

The field order deliberately matches the `ts` element that the benchmark driver
in this file allocates (`ρ, p, e_int, q_tot, T`). NamedTuples whose names appear
in a different order are distinct types with no `convert` between them, so
keeping the order identical lets the result be stored into `ts` without relying
on any reordering by the storage layer.
"""
@inline function make_thermo_state(thermo_params, ρ, e_int, q_tot)
    adjusted = TD.saturation_adjustment(
        thermo_params, TD.ρe(), ρ, e_int, q_tot; maxiter = 3,
    )
    T = adjusted.T
    p = TD.air_pressure(thermo_params, T, ρ, q_tot, adjusted.q_liq, adjusted.q_ice)
    return (; ρ, p, e_int, q_tot, T)
end
"""
    ts_gs(thermo_params, e_tot, q_tot, K, Φ, ρ)

Pointwise thermo state from grid-scale inputs: subtracts `K` and then `Φ` from
`e_tot` and passes the result, together with `ρ` and `q_tot`, to
`make_thermo_state`. Returns whatever `make_thermo_state` returns.
"""
@inline function ts_gs(thermo_params, e_tot, q_tot, K, Φ, ρ)
    e_int = e_tot - K - Φ
    return make_thermo_state(thermo_params, ρ, e_int, q_tot)
end
import Thermodynamics as TD
"""
    thermo_func_bc!(x, thermo_params, us; nreps = 1, bm = nothing, n_trials = 30)

Benchmark the broadcast form of the thermo-state update.

Each trial times `nreps` back-to-back evaluations of
`@. ts = ts_gs(thermo_params, e_tot, q_tot, K, Φ, ρ)` with `CUDA.@elapsed`.
The fastest trial, divided by `nreps`, is reported through `push_info`.

# Arguments
- `x`: object with properties `ts`, `e_tot`, `q_tot`, `K`, `Φ`, `ρ`; `ts` is
  overwritten in place. `x.ρ` must be accepted by `Fields.field_values`.
- `thermo_params`: forwarded to `ts_gs`.
- `us`: not used by this method; the signature parallels `thermo_func_sol!`.

# Keywords
- `nreps`: number of evaluations inside each timed trial.
- `bm`: forwarded to `push_info`.
- `n_trials`: number of timed trials; the minimum elapsed time is kept.

Returns `nothing`.
"""
function thermo_func_bc!(x, thermo_params, us; nreps = 1, bm = nothing, n_trials = 30)
    # Loop-invariant: unpack once, outside the timed region, so that only the
    # broadcast itself is measured.
    (; ts, e_tot, q_tot, K, Φ, ρ) = x
    e = Inf
    for _ in 1:n_trials
        et = CUDA.@elapsed begin
            for _ in 1:nreps
                @. ts = ts_gs(thermo_params, e_tot, q_tot, K, Φ, ρ) # 5 reads, 5 writes, many flops
            end
        end
        e = min(e, et)
    end
    s = size(Fields.field_values(x.ρ))
    # NOTE(review): the comment on the broadcast counts 5 reads + 5 writes, but
    # 5+4 is reported below — confirm which is intended.
    push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=s,n_reads_writes=5+4) # TODO: verify this
    return nothing
end
"""
    thermo_func_sol!(x, thermo_params, us::UniversalSizesStatic; nreps = 1, bm = nothing, n_trials = 30)

Benchmark the hand-written CUDA kernel `thermo_func_sol_kernel!`.

The kernel is compiled (`CUDA.@cuda launch = false`) and its launch
configuration is computed once, before any timing. Each trial then times
`nreps` back-to-back kernel launches with `CUDA.@elapsed`. The fastest trial,
divided by `nreps`, is reported through `push_info`.

# Arguments
- `x`: object with properties `ts`, `e_tot`, `q_tot`, `K`, `Φ`, `ρ`, passed to
  the kernel; the kernel writes into the arrays held by `ts`.
- `thermo_params`: forwarded to the kernel.
- `us`: size descriptor; `get_N(us)` gives the number of points to process.

# Keywords
- `nreps`: number of kernel launches inside each timed trial.
- `bm`: forwarded to `push_info`.
- `n_trials`: number of timed trials; the minimum elapsed time is kept.

Returns `nothing`.
"""
function thermo_func_sol!(
    x,
    thermo_params,
    us::UniversalSizesStatic;
    nreps = 1,
    bm = nothing,
    n_trials = 30,
)
    (; ts, e_tot, q_tot, K, Φ, ρ) = x

    # One-time setup, kept out of the timed region: compiling the kernel and
    # querying the launch configuration are host-side costs, not kernel time.
    kernel = CUDA.@cuda always_inline = true launch = false thermo_func_sol_kernel!(ts,e_tot,q_tot,K,Φ,ρ,thermo_params,us)
    N = get_N(us)
    config = CUDA.launch_configuration(kernel.fun)
    threads = min(N, config.threads)
    # Round up so every index 1:N is covered; the kernel guards the overshoot.
    blocks = cld(N, threads)

    e = Inf
    for _ in 1:n_trials
        et = CUDA.@elapsed begin
            for _ in 1:nreps
                kernel(ts,e_tot,q_tot,K,Φ,ρ,thermo_params,us; threads, blocks)
            end
        end
        e = min(e, et)
    end
    s = size(x.ρ)
    # NOTE(review): the kernel performs 5 reads + 5 writes per point, but 5+4 is
    # reported below — confirm which is intended.
    push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=s,n_reads_writes=5+4) # TODO: verify this
    return nothing
end
"""
    thermo_func_sol_kernel!(ts, e_tot, q_tot, K, Φ, ρ, thermo_params, us)

CUDA kernel: one thread per linear index `I`.

Mimics how indexing works in generalized pointwise kernels. Each in-range
thread reads `e_tot[I]`, `q_tot[I]`, `K[I]`, `Φ[I]`, `ρ[I]`, evaluates `ts_gs`,
and stores the five resulting fields by name into `ts.ρ`, `ts.p`, `ts.T`,
`ts.e_int`, `ts.q_tot` at `I`
(5 reads, 5 writes, potentially many flops; see Thermodynamics for an estimate).

Threads whose index exceeds `get_N(us)` do nothing. Returns `nothing`.
"""
function thermo_func_sol_kernel!(ts, e_tot, q_tot, K, Φ, ρ, thermo_params, us)
    # Global 1-based linear thread index.
    I = (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x
    # The launcher rounds the block count up, so trailing threads may be out of range.
    I ≤ get_N(us) || return nothing
    @inbounds begin
        ts_i = ts_gs(thermo_params, e_tot[I], q_tot[I], K[I], Φ[I], ρ[I])
        ts.ρ[I] = ts_i.ρ
        ts.p[I] = ts_i.p
        ts.T[I] = ts_i.T
        ts.e_int[I] = ts_i.e_int
        ts.q_tot[I] = ts_i.q_tot
    end
    return nothing
end
end
import ClimaParams # trigger Thermo extension
import .ThermoBench as TB
import Thermodynamics as TD
import CUDA
using ClimaComms
using ClimaCore
import ClimaCore: Spaces, Fields
import ClimaCore.Domains: Geometry
# Keep CLIMACOMMS_DEVICE if it is already set; otherwise set it to "CPU".
# NOTE(review): the test set below calls `CUDA.device()` and the benchmark
# functions use `CUDA.@elapsed`, so a CUDA device appears to be required
# regardless of this default — confirm the intended default.
ENV["CLIMACOMMS_DEVICE"] = get(ENV, "CLIMACOMMS_DEVICE", "CPU");
# Presumably loads the backend packages needed for the selected device — confirm
# against the ClimaComms documentation.
ClimaComms.@import_required_backends
using BenchmarkTools
# Load ClimaCore's TestUtilities from its test directory, unless `TU` is already
# defined in this session (e.g. when the script is re-included).
@isdefined(TU) || include(
joinpath(pkgdir(ClimaCore), "test", "TestUtilities", "TestUtilities.jl"),
);
import .TestUtilities as TU;
using Test
# Driver: builds the inputs, checks that the broadcast and hand-written-kernel
# variants produce matching output, then benchmarks both variants twice and
# prints the results table.
@testset "Thermo state" begin
FT = Float32
# Queries the active CUDA device, so this script needs a functional CUDA setup.
device_name = CUDA.name(CUDA.device())
# NOTE(review): this hard-coded problem_size is ordered (63,4,4,1,5400), whereas
# the size destructured further below is read as (_, Nij, _, Nv, Nh) — confirm
# which ordering is intended. The benchmark functions also pass their own
# problem_size to push_info.
bm = TB.Benchmark(;problem_size=(63,4,4,1,5400), device_name, float_type=FT)
device = ClimaComms.device()
context = ClimaComms.context(device)
cspace = TU.CenterExtrudedFiniteDifferenceSpace(
FT;
zelem = 63,
context,
helem = 30,
Nq = 4,
)
# NOTE(review): `fspace` is not used anywhere else in this test set.
fspace = Spaces.FaceExtrudedFiniteDifferenceSpace(cspace)
@info "device = $device"
thermo_params = TD.Parameters.ThermodynamicsParameters(FT)
# TODO: fill with non-trivial values (e.g., use Thermodynamics TestedProfiles) to verify correctness.
# Per-point input values (constant everywhere).
nt_core = (; K = FT(0), Φ = FT(1), ρ = FT(0), e_tot = FT(1), q_tot = FT(0.001))
# Zero-initialised element for the `ts` output; its field names are the ones the
# kernel variant writes to (ρ, p, T, e_int, q_tot).
nt_ts = (;
ρ = FT(0),
p = FT(0),
e_int = FT(0),
q_tot = FT(0),
T = FT(0),
)
# Two identically filled fields: `x` feeds the broadcast variant, `xv` is
# flattened into device arrays for the kernel variant.
x = fill((; ts = nt_ts, nt_core...), cspace)
xv = fill((; ts = nt_ts, nt_core...), cspace)
# assumes the underlying data size is ordered (Ni, Nj, Nf, Nv, Nh) — TODO confirm
(_, Nij, _, Nv, Nh) = size(Fields.field_values(x.ts))
us = TB.UniversalSizesStatic(Nv, Nij, Nh)
# Convert `ξ` into a NamedTuple with the same property names: the `:ts` property
# is converted recursively, every other property is flattened to a vector
# (parent array -> vec -> collect) and copied into a `CUDA.CuArray`.
function to_vec(ξ)
pns = propertynames(ξ)
dl_vals = map(pns) do pn
val = getproperty(ξ, pn)
pn == :ts ? to_vec(val) :
CUDA.CuArray(collect(vec(parent(Fields.field_values(val)))))
end
return (; zip(propertynames(ξ), dl_vals)...)
end
x_vec = to_vec(xv)
# Single untimed-for-reporting pass of each variant (no `bm` is passed, so
# nothing is recorded); the outputs are compared below.
TB.thermo_func_bc!(x, thermo_params, us; nreps=1, n_trials = 1)
TB.thermo_func_sol!(x_vec, thermo_params, us; nreps=1, n_trials = 1)
rc = Fields.rcompare(x_vec, to_vec(x))
rc || Fields.rprint_diff(x_vec, to_vec(x)) # test correctness (should print nothing)
@test rc # test correctness
# Recorded runs: each variant is benchmarked twice, alternating, with results
# accumulated in `bm`.
TB.thermo_func_bc!(x, thermo_params, us; nreps=100, bm)
TB.thermo_func_sol!(x_vec, thermo_params, us; nreps=100, bm)
TB.thermo_func_bc!(x, thermo_params, us; nreps=100, bm)
TB.thermo_func_sol!(x_vec, thermo_params, us; nreps=100, bm)
TB.tabulate_benchmark(bm)
end
#! format: on