ClimaCore.jl/benchmarks/scripts/thermo_bench.jl at b10ccbe7e66a1eccb78a3cb64fe0faac3e59d31d · CliMA/ClimaCore.jl · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#=
julia --project=.buildkite
using Revise; include(joinpath("benchmarks", "scripts", "thermo_bench.jl"))

This benchmark requires Thermodynamics and ClimaParams
to be in your local environment to run.

# Benchmark results:

Clima A100:
```
[ Info: device = ClimaComms.CUDADevice()
Problem size: (4, 4, 1, 63, 5400), N reads-writes: 9, N-reps: 100,  Float_type = Float32, Device_bandwidth_GBs=2039
┌──────────────────────────────────────────────────────────────┬───────────────────────────────────┬─────────┬─────────────┐
│ funcs                                                        │ time per call                     │ bw %    │ achieved bw │
├──────────────────────────────────────────────────────────────┼───────────────────────────────────┼─────────┼─────────────┤
│ TB.thermo_func_bc!(x, thermo_params, us; nreps=100, bm)      │ 586 microseconds, 353 nanoseconds │ 15.2644 │ 311.242     │
│ TB.thermo_func_sol!(x_vec, thermo_params, us; nreps=100, bm) │ 293 microseconds, 796 nanoseconds │ 30.4645 │ 621.171     │
│ TB.thermo_func_bc!(x, thermo_params, us; nreps=100, bm)      │ 586 microseconds, 138 nanoseconds │ 15.27   │ 311.356     │
│ TB.thermo_func_sol!(x_vec, thermo_params, us; nreps=100, bm) │ 293 microseconds, 755 nanoseconds │ 30.4687 │ 621.258     │
└──────────────────────────────────────────────────────────────┴───────────────────────────────────┴─────────┴─────────────┘
```
=#

#! format: off
module ThermoBench

import CUDA
include("benchmark_utils.jl")

import ClimaCore
import CUDA
using ClimaComms
using Test
using StaticArrays, IntervalSets, LinearAlgebra
using JET

import ClimaCore: Spaces, Fields
import ClimaCore.Domains: Geometry

"""
    make_thermo_state(thermo_params, ρ, e_int, q_tot)

Run `TD.saturation_adjustment` (with `TD.ρe()` and `maxiter = 3`) on the given
density, internal energy and total specific humidity, compute the air pressure
from the adjusted state, and return the `NamedTuple`
`(; ρ, p, T, e_int, q_tot)`.
"""
@inline function make_thermo_state(thermo_params, ρ, e_int, q_tot)
    sat = TD.saturation_adjustment(
        thermo_params, TD.ρe(), ρ, e_int, q_tot;
        maxiter = 3,
    )
    T = sat.T
    p = TD.air_pressure(thermo_params, T, ρ, q_tot, sat.q_liq, sat.q_ice)
    # Field order (ρ, p, T, e_int, q_tot) is part of the returned type; keep it.
    return (; ρ, p, T, e_int, q_tot)
end
"""
    ts_gs(thermo_params, e_tot, q_tot, K, Φ, ρ)

Subtract `K` and `Φ` from `e_tot` to obtain the internal energy, then forward
to `make_thermo_state` and return its result.
"""
@inline function ts_gs(thermo_params, e_tot, q_tot, K, Φ, ρ)
    e_int = e_tot - K - Φ
    return make_thermo_state(thermo_params, ρ, e_int, q_tot)
end

import Thermodynamics as TD

"""
    thermo_func_bc!(x, thermo_params, us; nreps = 1, bm = nothing, n_trials = 30)

Benchmark the broadcast form of the thermo-state update,
`@. ts = ts_gs(thermo_params, e_tot, q_tot, K, Φ, ρ)`, on the fields of `x`.

Each of the `n_trials` trials times `nreps` back-to-back broadcasts with
`CUDA.@elapsed`; the smallest trial time, divided by `nreps`, is handed to
`push_info` together with the size of `Fields.field_values(x.ρ)`.
`us` is accepted for signature parity with `thermo_func_sol!` but is not used
here. Returns `nothing`.
"""
function thermo_func_bc!(x, thermo_params, us; nreps = 1, bm=nothing, n_trials = 30)
    best_time = Inf
    for _ in 1:n_trials
        trial_time = CUDA.@elapsed begin
            for _ in 1:nreps
                (; ts, e_tot, q_tot, K, Φ, ρ) = x
                # 5 reads, 5 writes, many flops
                @. ts = ts_gs(thermo_params, e_tot, q_tot, K, Φ, ρ)
            end
        end
        # Keep the fastest trial as the representative timing.
        best_time = min(best_time, trial_time)
    end
    problem_size = size(Fields.field_values(x.ρ))
    push_info(
        bm;
        kernel_time_s = best_time / nreps,
        nreps = nreps,
        caller = @caller_name(@__FILE__),
        problem_size = problem_size,
        n_reads_writes = 5 + 4, # TODO: verify this
    )
    return nothing
end

"""
    thermo_func_sol!(x, thermo_params, us::UniversalSizesStatic; nreps = 1, bm = nothing, n_trials = 30)

Benchmark the hand-written kernel `thermo_func_sol_kernel!` on the arrays held
in `x` (properties `ts`, `e_tot`, `q_tot`, `K`, `Φ`, `ρ`).

The kernel is compiled once and its launch configuration is computed once,
both *outside* the timed region, so that `CUDA.@elapsed` only measures the
`nreps` kernel launches of each trial. The smallest of the `n_trials` trial
times, divided by `nreps`, is handed to `push_info` together with `size(x.ρ)`.
Returns `nothing`.
"""
function thermo_func_sol!(x, thermo_params, us::UniversalSizesStatic; nreps = 1, bm=nothing, n_trials = 30)
    (; ts, e_tot, q_tot, K, Φ, ρ) = x

    # Loop-invariant setup: compile the kernel and pick the launch shape once,
    # before any timing starts, so compilation / occupancy queries are not
    # attributed to the kernel itself.
    kernel = CUDA.@cuda always_inline = true launch = false thermo_func_sol_kernel!(ts, e_tot, q_tot, K, Φ, ρ, thermo_params, us)
    N = get_N(us)
    config = CUDA.launch_configuration(kernel.fun)
    threads = min(N, config.threads)
    blocks = cld(N, threads)

    best_time = Inf
    for _ in 1:n_trials
        trial_time = CUDA.@elapsed begin
            for _ in 1:nreps
                kernel(ts, e_tot, q_tot, K, Φ, ρ, thermo_params, us; threads, blocks)
            end
        end
        # Keep the fastest trial as the representative timing.
        best_time = min(best_time, trial_time)
    end
    problem_size = size(x.ρ)
    push_info(
        bm;
        kernel_time_s = best_time / nreps,
        nreps = nreps,
        caller = @caller_name(@__FILE__),
        problem_size = problem_size,
        n_reads_writes = 5 + 4, # TODO: verify this
    )
    return nothing
end

"""
    thermo_func_sol_kernel!(ts, e_tot, q_tot, K, Φ, ρ, thermo_params, us)

CUDA kernel that mimics how indexing works in generalized pointwise kernels.

Each thread computes one linear index `I` from its block/thread ids. Threads
with `I ≤ get_N(us)` evaluate `ts_gs` on the `I`-th entries of the inputs and
store every field of the result in the same-named array of `ts`; threads past
the end do nothing. Returns `nothing`.
"""
function thermo_func_sol_kernel!(ts, e_tot, q_tot, K, Φ, ρ, thermo_params, us)
    @inbounds begin
        I = (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x
        if I ≤ get_N(us)
            # 5 reads (e_tot, q_tot, K, Φ, ρ) and 5 writes (the fields of ts),
            # potentially many flops (see thermodynamics for estimate).
            ts_i = ts_gs(thermo_params, e_tot[I], q_tot[I], K[I], Φ[I], ρ[I])
            ts.ρ[I] = ts_i.ρ
            ts.p[I] = ts_i.p
            ts.T[I] = ts_i.T
            ts.e_int[I] = ts_i.e_int
            ts.q_tot[I] = ts_i.q_tot
        end
    end
    return nothing
end

end

import ClimaParams # trigger Thermo extension
import .ThermoBench as TB

import Thermodynamics as TD
import CUDA
using ClimaComms
using ClimaCore
import ClimaCore: Spaces, Fields
import ClimaCore.Domains: Geometry

# Keep a user-provided CLIMACOMMS_DEVICE; otherwise default it to "CPU".
# NOTE(review): the testset below calls `CUDA.name(CUDA.device())`
# unconditionally, so a CUDA device looks required regardless of this
# default — confirm.
ENV["CLIMACOMMS_DEVICE"] = get(ENV, "CLIMACOMMS_DEVICE", "CPU");
# Presumably loads the backend package(s) needed for the selected device —
# see the ClimaComms documentation.
ClimaComms.@import_required_backends
using BenchmarkTools
# Only include ClimaCore's TestUtilities when `TU` is not already defined
# (e.g. when this script is `include`d again in the same session).
@isdefined(TU) || include(
    joinpath(pkgdir(ClimaCore), "test", "TestUtilities", "TestUtilities.jl"),
);
import .TestUtilities as TU;

using Test
# Checks that the broadcast path (`thermo_func_bc!`) and the hand-written
# kernel (`thermo_func_sol!`) produce the same thermo state, then benchmarks
# both and prints a summary table.
@testset "Thermo state" begin
    FT = Float32
    # Queries the active CUDA device, so this testset needs a CUDA-capable GPU.
    device_name = CUDA.name(CUDA.device())
    # Collector for benchmark results; it is passed to the timed calls via the
    # `bm` keyword and printed by `tabulate_benchmark` at the end.
    bm = TB.Benchmark(;problem_size=(63,4,4,1,5400), device_name, float_type=FT)
    device = ClimaComms.device()
    context = ClimaComms.context(device)
    cspace = TU.CenterExtrudedFiniteDifferenceSpace(
        FT;
        zelem = 63,
        context,
        helem = 30,
        Nq = 4,
    )
    # NOTE(review): `fspace` is not referenced again in this testset.
    fspace = Spaces.FaceExtrudedFiniteDifferenceSpace(cspace)
    @info "device = $device"
    thermo_params = TD.Parameters.ThermodynamicsParameters(FT)
    # TODO: fill with non-trivial values (e.g., use Thermodynamics TestedProfiles) to verify correctness.
    # Constant per-point inputs to the thermo-state computation.
    nt_core = (; K = FT(0), Φ = FT(1), ρ = FT(0), e_tot = FT(1), q_tot = FT(0.001))
    # Zero-initialized output state.
    # NOTE(review): the field order here (ρ, p, e_int, q_tot, T) differs from
    # the order returned by `make_thermo_state` (ρ, p, T, e_int, q_tot) —
    # confirm the broadcast assignment `@. ts = ts_gs(...)` matches fields by
    # name rather than by position.
    nt_ts = (;
        ρ = FT(0),
        p = FT(0),
        e_int = FT(0),
        q_tot = FT(0),
        T = FT(0),
    )
    # Two identically-filled fields: `x` feeds the broadcast path, `xv` is
    # converted to flat CuArrays below for the hand-written kernel.
    x = fill((; ts = nt_ts, nt_core...), cspace)
    xv = fill((; ts = nt_ts, nt_core...), cspace)
    # The 2nd, 4th and 5th entries of the 5-entry size tuple are used to
    # build the sizes object consumed by the kernel.
    (_, Nij, _, Nv, Nh) = size(Fields.field_values(x.ts))
    us = TB.UniversalSizesStatic(Nv, Nij, Nh)
    # Convert every property of `ξ` into a flat `CuArray` copy of its parent
    # data, recursing into `:ts`; returns a NamedTuple with the same property
    # names as `ξ`.
    function to_vec(ξ)
        pns = propertynames(ξ)
        dl_vals = map(pns) do pn
            val = getproperty(ξ, pn)
            pn == :ts ? to_vec(val) :
            CUDA.CuArray(collect(vec(parent(Fields.field_values(val)))))
        end
        return (; zip(propertynames(ξ), dl_vals)...)
    end
    x_vec = to_vec(xv)

    # Single untimed-for-reporting pass of each implementation (no `bm`
    # passed); these produce the outputs compared below.
    TB.thermo_func_bc!(x, thermo_params, us; nreps=1, n_trials = 1)
    TB.thermo_func_sol!(x_vec, thermo_params, us; nreps=1, n_trials = 1)

    rc = Fields.rcompare(x_vec, to_vec(x))
    rc || Fields.rprint_diff(x_vec, to_vec(x)) # test correctness (should print nothing)
    @test rc # test correctness

    # Recorded benchmark runs, each implementation measured twice.
    # NOTE(review): the `funcs` column of the results table in the file header
    # matches these call lines verbatim, so `@caller_name` presumably captures
    # the call-site source text — keep each call on a single line; confirm in
    # benchmark_utils.jl.
    TB.thermo_func_bc!(x, thermo_params, us; nreps=100, bm)
    TB.thermo_func_sol!(x_vec, thermo_params, us; nreps=100, bm)

    TB.thermo_func_bc!(x, thermo_params, us; nreps=100, bm)
    TB.thermo_func_sol!(x_vec, thermo_params, us; nreps=100, bm)

    TB.tabulate_benchmark(bm)

end
#! format: on