Draft
Changes from all commits (85 commits)
26759a8
Use Adapt.jl to change storage and element type
vchuravy Dec 17, 2024
fc610f9
add docs and CUDAExt
vchuravy Apr 21, 2025
7b5d81b
Aqua set unbound_args
vchuravy Apr 21, 2025
f730ef4
lower bound CUDA to 5.2
vchuravy Apr 22, 2025
13b7f59
add initial CUDA pipeline
vchuravy Apr 21, 2025
02de7d2
add storage_type, real_type to semidiscretize
vchuravy Apr 22, 2025
671f5b1
add GPU construction test
vchuravy Apr 22, 2025
ecd09a5
don't adapt Array{MArray}
vchuravy Apr 22, 2025
312009a
add some more cuda adapt tests
vchuravy Apr 22, 2025
690efd1
use sources for dev branch
vchuravy Apr 28, 2025
15a898b
fixup! use sources for dev branch
vchuravy May 8, 2025
45d344b
use released version of CUDA
vchuravy May 14, 2025
7e72eff
Update .buildkite/pipeline.yml
vchuravy May 14, 2025
3450ddd
Use Adapt.jl to change storage and element type
vchuravy Dec 17, 2024
cf2f590
add docs and CUDAExt
vchuravy Apr 21, 2025
de96f85
Aqua set unbound_args
vchuravy Apr 21, 2025
1a7cff2
lower bound CUDA to 5.2
vchuravy Apr 22, 2025
68edf29
add initial CUDA pipeline
vchuravy Apr 21, 2025
11ff63a
add storage_type, real_type to semidiscretize
vchuravy Apr 22, 2025
4d8a31f
add GPU construction test
vchuravy Apr 22, 2025
6ca8c3d
don't adapt Array{MArray}
vchuravy Apr 22, 2025
4ef2d98
add some more cuda adapt tests
vchuravy Apr 22, 2025
77395f5
use sources for dev branch
vchuravy Apr 28, 2025
1d78f07
fixup! use sources for dev branch
vchuravy May 8, 2025
39535ee
use released version of CUDA
vchuravy May 14, 2025
b973758
Update .buildkite/pipeline.yml
vchuravy May 14, 2025
7105da7
fix test_p4est_2d
vchuravy Jun 30, 2025
1fd6fe6
fix first GPU test
vchuravy Jun 30, 2025
d8a4bc8
Merge branch 'vc/adapt' into feature-gpu-offloading
benegee Jul 1, 2025
6ceef3a
address review comments
vchuravy Jul 1, 2025
7a53362
offload compute_coefficients
benegee Jul 1, 2025
68eb905
fmt
benegee Jul 1, 2025
3d00bdf
fixup! address review comments
vchuravy Jul 1, 2025
4b32fa0
add review comments
vchuravy Jul 1, 2025
10f7593
convert fstar_* cache entries to VecOfArrays
benegee Jul 1, 2025
c83bdbd
restore elixir
benegee Jul 1, 2025
8c6c57d
Merge branch 'vc/adapt' into feature-gpu-offloading
benegee Jul 2, 2025
d3b94fc
test native version as well
benegee Jul 2, 2025
97e13ec
adapt 1D and 3D version
benegee Jul 2, 2025
44f7134
Downgrade compat with Adapt
benegee Jul 2, 2025
abbcc56
Use Adapt.jl to change storage and element type
vchuravy Dec 17, 2024
a18e5d2
restore elixir
benegee Jul 1, 2025
5c942fe
offload compute_coefficients
benegee Jul 1, 2025
47a55f2
fmt
benegee Jul 1, 2025
36b0e4a
test native version as well
benegee Jul 2, 2025
153d828
adapt 1D and 3D version
benegee Jul 2, 2025
819ba75
Downgrade compat with Adapt
benegee Jul 2, 2025
e75cac7
update requires to 1.3
vchuravy Jul 2, 2025
4b6f63e
Merge branch 'vc/adapt' into feature-gpu-offloading
benegee Jul 2, 2025
61b4da1
Merge branch 'main' into feature-gpu-offloading
benegee Sep 16, 2025
e7cde27
missed during merge
benegee Sep 16, 2025
b174d6d
mistakes during merge
benegee Sep 16, 2025
489bb24
cleanup
benegee Sep 18, 2025
b4d1535
Basis kernels for 3D P4est
benegee Sep 18, 2025
2443cf8
port stepsize computation
benegee Sep 18, 2025
fc13ea5
CPU workaround for analysis callback
benegee Sep 18, 2025
2ff2f52
tests
benegee Sep 18, 2025
bc4ad17
add benchmark
benegee Sep 19, 2025
de06c61
fix max_dt
benegee Sep 19, 2025
29298a5
profiler output
benegee Sep 25, 2025
281a540
Merge branch 'main' into feature-gpu-offloading
benegee Sep 29, 2025
962a383
fmt
benegee Sep 29, 2025
a60e27d
missed max_dt calls
benegee Sep 29, 2025
ce742a3
Merge branch 'main' into feature-gpu-offloading
benegee Sep 30, 2025
2073d7c
some fixes
benegee Sep 30, 2025
9a2f130
after merge fixes
benegee Sep 30, 2025
9a47f29
some more fixes
benegee Sep 30, 2025
94f5d90
Merge branch 'main' into feature-gpu-offloading
benegee Oct 1, 2025
6ffb69f
post merge fixes
benegee Oct 1, 2025
fb25fa2
Merge branch 'main' into feature-gpu-offloading
benegee Oct 1, 2025
307c3eb
more
benegee Oct 1, 2025
c39b4de
more
benegee Oct 1, 2025
a38cc03
Squashed commit of the following:
benegee Oct 7, 2025
013244d
Apply suggestions from code review
benegee Oct 8, 2025
5b2c0bf
Merge branch 'feature-gpu-offloading' of github.com:trixi-framework/T…
benegee Oct 8, 2025
8d5a55b
Merge branch 'main' into feature-gpu-offloading
benegee Oct 8, 2025
8a98d27
!fixup
benegee Oct 8, 2025
7de1e57
fmt
benegee Oct 8, 2025
31a65cb
pass backend through
benegee Oct 8, 2025
4064e79
fixes
benegee Oct 8, 2025
af50cda
backends here and there
benegee Oct 8, 2025
5893d4d
almost everywhere
benegee Oct 8, 2025
a1caa12
some more
benegee Oct 8, 2025
a5cded3
next round
benegee Oct 8, 2025
7c6ab4a
could this be...
benegee Oct 9, 2025
79 changes: 79 additions & 0 deletions benchmark/CUDA/elixir_euler_taylor_green_vortex.jl
@@ -0,0 +1,79 @@
using OrdinaryDiffEqLowStorageRK
using Trixi

###############################################################################
# semidiscretization of the compressible Euler equations

equations = CompressibleEulerEquations3D(1.4)

function initial_condition_taylor_green_vortex(x, t,
equations::CompressibleEulerEquations3D)
A = 1.0 # magnitude of speed
Ms = 0.1 # maximum Mach number

rho = 1.0
v1 = A * sin(x[1]) * cos(x[2]) * cos(x[3])
v2 = -A * cos(x[1]) * sin(x[2]) * cos(x[3])
v3 = 0.0
p = (A / Ms)^2 * rho / equations.gamma # scaling to get Ms
p = p +
1.0 / 16.0 * A^2 * rho *
(cos(2 * x[1]) * cos(2 * x[3]) +
2 * cos(2 * x[2]) + 2 * cos(2 * x[1]) + cos(2 * x[2]) * cos(2 * x[3]))

return prim2cons(SVector(rho, v1, v2, v3, p), equations)
end

initial_condition = initial_condition_taylor_green_vortex

# TODO Undefined external symbol "log"
#volume_flux = flux_ranocha
volume_flux = flux_lax_friedrichs
solver = DGSEM(polydeg = 5, surface_flux = volume_flux)
# TODO flux diff
#volume_integral=VolumeIntegralFluxDifferencing(volume_flux))

coordinates_min = (-1.0, -1.0, -1.0) .* pi
coordinates_max = (1.0, 1.0, 1.0) .* pi

initial_refinement_level = 1
trees_per_dimension = (4, 4, 4)

mesh = P4estMesh(trees_per_dimension, polydeg = 1,
coordinates_min = coordinates_min, coordinates_max = coordinates_max,
periodicity = true, initial_refinement_level = initial_refinement_level)

semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver)

###############################################################################
# ODE solvers, callbacks etc.

tspan = (0.0, 100.0)
ode = semidiscretize(semi, tspan; storage_type = nothing, real_type = nothing)

summary_callback = SummaryCallback()

stepsize_callback = StepsizeCallback(cfl = 0.1)

callbacks = CallbackSet(summary_callback,
stepsize_callback)

###############################################################################
# run the simulation

maxiters = 200
run_profiler = false

# disable warnings when maxiters is reached
integrator = init(ode, CarpenterKennedy2N54(williamson_condition = false),
dt = 1.0,
save_everystep = false, callback = callbacks,
maxiters = maxiters, verbose = false)
if run_profiler
prof_result = CUDA.@profile solve!(integrator)
else
solve!(integrator)
prof_result = nothing
end

finalize(mesh)
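The `storage_type = nothing, real_type = nothing` defaults in the `semidiscretize` call above keep the usual CPU `Array`/`Float64` setup; the benchmark driver below overrides them. As a hedged, minimal sketch, the GPU configuration corresponds to:

using CUDA
# Hedged sketch: request GPU storage and single precision for the ODE problem;
# `nothing` (the default used above) keeps Array/Float64 on the CPU.
ode = semidiscretize(semi, tspan; storage_type = CuArray, real_type = Float32)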
89 changes: 89 additions & 0 deletions benchmark/CUDA/run.jl
@@ -0,0 +1,89 @@
using Trixi
using CUDA
using TimerOutputs
using JSON

function main(elixir_path)

# setup
maxiters = 50
initial_refinement_level = 3
storage_type = CuArray
real_type = Float64

println("Warming up...")

# start simulation with tiny final time to trigger compilation
duration_compile = @elapsed begin
trixi_include(elixir_path,
tspan = (0.0, 1e-14),
storage_type = storage_type,
real_type = real_type)
trixi_include(elixir_path,
tspan = (0.0, 1e-14),
storage_type = storage_type,
real_type = Float32)
end

println("Finished warm-up in $duration_compile seconds\n")
println("Starting simulation...")

# start the real simulation
duration_elixir = @elapsed trixi_include(elixir_path,
maxiters = maxiters,
initial_refinement_level = initial_refinement_level,
storage_type = storage_type,
real_type = real_type)

# store metrics (on every rank!)
metrics = Dict{String, Float64}("elapsed time" => duration_elixir)

# read TimerOutputs timings
timer = Trixi.timer()
metrics["total time"] = 1.0e-9 * TimerOutputs.tottime(timer)
metrics["rhs! time"] = 1.0e-9 * TimerOutputs.time(timer["rhs!"])

# compute performance index
nrhscalls = Trixi.ncalls(semi.performance_counter)
walltime = 1.0e-9 * take!(semi.performance_counter)
metrics["PID"] = walltime * Trixi.mpi_nranks() / (Trixi.ndofsglobal(semi) * nrhscalls)

# write json file
open("metrics.out", "w") do f
indent = 2
JSON.print(f, metrics, indent)
end

# run profiler
maxiters = 5
initial_refinement_level = 1

println("Running profiler (Float64)...")
trixi_include(elixir_path,
maxiters = maxiters,
initial_refinement_level = initial_refinement_level,
storage_type = storage_type,
real_type = Float64,
run_profiler = true)

open("profile_float64.txt", "w") do io
show(io, prof_result)
end

println("Running profiler (Float32)...")
trixi_include(elixir_path,
maxiters = maxiters,
initial_refinement_level = initial_refinement_level,
storage_type = storage_type,
real_type = Float32,
run_profiler = true)

open("profile_float32.txt", "w") do io
show(io, prof_result)
end
end

# hardcoded elixir
elixir_path = joinpath(@__DIR__(), "elixir_euler_taylor_green_vortex.jl")

main(elixir_path)
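As a small companion sketch (not part of this PR), the metrics file written above can be read back with JSON.jl, e.g. to compare Float64 and Float32 runs:

using JSON

# Hedged sketch: parse the metrics.out written by run.jl and print the entries sorted by name.
metrics = JSON.parsefile("metrics.out")
for (name, value) in sort!(collect(metrics); by = first)
    println(rpad(name, 14), " => ", value)
end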
5 changes: 2 additions & 3 deletions examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl
@@ -48,9 +48,8 @@ save_solution = SaveSolutionCallback(interval = 100,
stepsize_callback = StepsizeCallback(cfl = 1.6)

# Create a CallbackSet to collect all callbacks such that they can be passed to the ODE solver
callbacks = CallbackSet(summary_callback, stepsize_callback)
# TODO: GPU. The `analysis_callback` needs to be updated for GPU support
# analysis_callback, save_solution, stepsize_callback)
callbacks = CallbackSet(summary_callback, analysis_callback,
save_solution, stepsize_callback)

###############################################################################
# run the simulation
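With the CPU fallback added to src/callbacks_step/analysis_dg2d.jl (below), the re-enabled analysis and solution-saving callbacks also work when this elixir runs with GPU storage. A hedged invocation sketch, following the pattern used in benchmark/CUDA/run.jl and assuming this elixir passes `storage_type`/`real_type` defaults to `semidiscretize` like its 3D counterpart:

using CUDA
using Trixi

# Hedged sketch: override the storage and element type when including the elixir.
trixi_include(joinpath(examples_dir(), "p4est_2d_dgsem",
                       "elixir_advection_basic_gpu.jl"),
              storage_type = CuArray, real_type = Float32)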
60 changes: 60 additions & 0 deletions examples/p4est_3d_dgsem/elixir_advection_basic_gpu.jl
@@ -0,0 +1,60 @@
# The same setup as tree_3d_dgsem/elixir_advection_basic.jl
# to verify the P4estMesh implementation against TreeMesh

using OrdinaryDiffEqLowStorageRK
using Trixi

###############################################################################
# semidiscretization of the linear advection equation

advection_velocity = (0.2, -0.7, 0.5)
equations = LinearScalarAdvectionEquation3D(advection_velocity)

# Create DG solver with polynomial degree = 3 and (local) Lax-Friedrichs/Rusanov flux as surface flux
solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs)

coordinates_min = (-1.0, -1.0, -1.0) # minimum coordinates (min(x), min(y), min(z))
coordinates_max = (1.0, 1.0, 1.0) # maximum coordinates (max(x), max(y), max(z))

# Create P4estMesh with 8 x 8 x 8 elements (note `initial_refinement_level = 1`)
trees_per_dimension = (4, 4, 4)
mesh = P4estMesh(trees_per_dimension, polydeg = 3,
coordinates_min = coordinates_min, coordinates_max = coordinates_max,
initial_refinement_level = 1)

# A semidiscretization collects data structures and functions for the spatial discretization
semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition_convergence_test,
solver)

###############################################################################
# ODE solvers, callbacks etc.

# Create ODE problem with time span from 0.0 to 1.0
tspan = (0.0, 1.0)
ode = semidiscretize(semi, tspan; real_type = nothing, storage_type = nothing)

# At the beginning of the main loop, the SummaryCallback prints a summary of the simulation setup
# and resets the timers
summary_callback = SummaryCallback()

# The AnalysisCallback allows analysing the solution at regular intervals and prints the results
analysis_callback = AnalysisCallback(semi, interval = 100)

# The SaveSolutionCallback allows saving the solution to a file at regular intervals
save_solution = SaveSolutionCallback(interval = 100,
solution_variables = cons2prim)

# The StepsizeCallback handles the re-calculation of the maximum Δt after each time step
stepsize_callback = StepsizeCallback(cfl = 1.2)

# Create a CallbackSet to collect all callbacks such that they can be passed to the ODE solver
callbacks = CallbackSet(summary_callback, analysis_callback,
save_solution, stepsize_callback)

###############################################################################
# run the simulation

# OrdinaryDiffEq's `solve` method evolves the solution in time and executes the passed callbacks
sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false);
dt = 0.05, # solve needs some value here but it will be overwritten by the stepsize_callback
ode_default_options()..., callback = callbacks);
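For reference, a quick sanity check of the problem size set up above (plain arithmetic, not code from this PR):

# 4^3 trees refined once give 8^3 = 512 elements; with polydeg = 3 each element
# carries (3 + 1)^3 = 64 nodes, i.e. 512 * 64 = 32768 degrees of freedom for the
# scalar advection equation.
n_elements = 4^3 * 2^(3 * 1)               # trees_per_dimension, refined once in 3D
n_nodes_per_element = (3 + 1)^3            # (polydeg + 1)^3
n_dofs = n_elements * n_nodes_per_element  # 32768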
3 changes: 2 additions & 1 deletion src/Trixi.jl
@@ -59,7 +59,8 @@ using DiffEqCallbacks: PeriodicCallback, PeriodicCallbackAffect
using FillArrays: Ones, Zeros
using ForwardDiff: ForwardDiff
using HDF5: HDF5, h5open, attributes, create_dataset, datatype, dataspace
using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Backend
using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Backend,
allocate
using LinearMaps: LinearMap
if _PREFERENCE_LOOPVECTORIZATION
using LoopVectorization: LoopVectorization, @turbo, indices
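The newly imported `allocate` is KernelAbstractions' backend-generic allocator. A minimal sketch of the pattern it enables (illustrative, not code from this PR):

using KernelAbstractions: allocate, get_backend

# Hedged sketch: allocate uninitialized storage on whatever backend `u` lives on
# (CPU() for Array, CUDABackend() for CuArray) instead of hard-coding the array type.
function similar_storage(u)
    backend = get_backend(u)
    return allocate(backend, eltype(u), size(u)...)
end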
41 changes: 35 additions & 6 deletions src/callbacks_step/analysis_dg2d.jl
@@ -138,17 +138,27 @@ function calc_error_norms(func, u, t, analyzer,
return l2_error, linf_error
end

function calc_error_norms(func, u, t, analyzer,
function calc_error_norms(func, _u, t, analyzer,
mesh::Union{StructuredMesh{2}, StructuredMeshView{2},
UnstructuredMesh2D,
P4estMesh{2}, P4estMeshView{2},
T8codeMesh{2}},
equations,
initial_condition, dg::DGSEM, cache, cache_analysis)
@unpack vandermonde, weights = analyzer
@unpack node_coordinates, inverse_jacobian = cache.elements
@unpack u_local, u_tmp1, x_local, x_tmp1, jacobian_local, jacobian_tmp1 = cache_analysis

# TODO GPU AnalysisCallback currently lives on CPU
backend = trixi_backend(_u)
if backend isa Nothing # TODO GPU KA CPU backend
@unpack node_coordinates, inverse_jacobian = cache.elements
u = _u
else
node_coordinates = Array(cache.elements.node_coordinates)
inverse_jacobian = Array(cache.elements.inverse_jacobian)
u = Array(_u)
end

# Set up data structures
l2_error = zero(func(get_node_vars(u, equations, dg, 1, 1, 1), equations))
linf_error = copy(l2_error)
@@ -210,13 +220,23 @@ function integrate_via_indices(func::Func, u,
return integral
end

function integrate_via_indices(func::Func, u,
function integrate_via_indices(func::Func, _u,
mesh::Union{StructuredMesh{2}, StructuredMeshView{2},
UnstructuredMesh2D, P4estMesh{2},
T8codeMesh{2}},
equations,
dg::DGSEM, cache, args...; normalize = true) where {Func}
@unpack weights = dg.basis
# TODO GPU AnalysisCallback currently lives on CPU
backend = trixi_backend(_u)
if backend isa Nothing # TODO GPU KA CPU backend
@unpack weights = dg.basis
@unpack inverse_jacobian = cache.elements
u = _u
else
weights = Array(dg.basis.weights)
inverse_jacobian = Array(cache.elements.inverse_jacobian)
u = Array(_u)
end

# Initialize integral with zeros of the right shape
integral = zero(func(u, 1, 1, 1, equations, dg, args...))
@@ -226,7 +246,7 @@ function integrate_via_indices(func::Func, u,
@batch reduction=((+, integral), (+, total_volume)) for element in eachelement(dg,
cache)
for j in eachnode(dg), i in eachnode(dg)
volume_jacobian = abs(inv(cache.elements.inverse_jacobian[i, j, element]))
volume_jacobian = abs(inv(inverse_jacobian[i, j, element]))
integral += volume_jacobian * weights[i] * weights[j] *
func(u, i, j, element, equations, dg, args...)
total_volume += volume_jacobian * weights[i] * weights[j]
@@ -271,10 +291,19 @@ function integrate(func::Func, u,
end
end

function analyze(::typeof(entropy_timederivative), du, u, t,
function analyze(::typeof(entropy_timederivative), _du, u, t,
mesh::Union{TreeMesh{2}, StructuredMesh{2}, StructuredMeshView{2},
UnstructuredMesh2D, P4estMesh{2}, T8codeMesh{2}},
equations, dg::DG, cache)
# TODO GPU AnalysisCallback currently lives on CPU
backend = trixi_backend(u)
if backend isa Nothing # TODO GPU KA CPU backend
du = _du
else
du = Array(_du)
end

# Calculate ∫(∂S/∂u ⋅ ∂u/∂t)dΩ
integrate_via_indices(u, mesh, equations, dg, cache,
du) do u, i, j, element, equations, dg, du
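The three hunks above share one pattern: `trixi_backend(u)` returns `nothing` for plain CPU arrays and a KernelAbstractions backend otherwise, and the analysis code copies device data back to host `Array`s before integrating. A condensed sketch of that fallback (the helper name `on_host` is illustrative, not something this PR introduces):

# Hedged sketch of the CPU fallback used by the analysis callback above;
# `on_host` is an illustrative helper, not a function added by this PR.
on_host(x) = Trixi.trixi_backend(x) === nothing ? x : Array(x)

# Usage mirroring calc_error_norms: move the solution and mesh data to the host first.
# u_host = on_host(u)
# node_coordinates = on_host(cache.elements.node_coordinates)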