Draft
Changes from all commits (85 commits)
26759a8
Use Adapt.jl to change storage and element type
vchuravy Dec 17, 2024
fc610f9
add docs and CUDAExt
vchuravy Apr 21, 2025
7b5d81b
Aqua set unbound_args
vchuravy Apr 21, 2025
f730ef4
lower bound CUDA to 5.2
vchuravy Apr 22, 2025
13b7f59
add initial CUDA pipeline
vchuravy Apr 21, 2025
02de7d2
add storage_type, real_type to semidiscretize
vchuravy Apr 22, 2025
671f5b1
add GPU construction test
vchuravy Apr 22, 2025
ecd09a5
don't adapt Array{MArray}
vchuravy Apr 22, 2025
312009a
add some more cuda adapt tests
vchuravy Apr 22, 2025
690efd1
use sources for dev branch
vchuravy Apr 28, 2025
15a898b
fixup! use sources for dev branch
vchuravy May 8, 2025
45d344b
use released version of CUDA
vchuravy May 14, 2025
7e72eff
Update .buildkite/pipeline.yml
vchuravy May 14, 2025
3450ddd
Use Adapt.jl to change storage and element type
vchuravy Dec 17, 2024
cf2f590
add docs and CUDAExt
vchuravy Apr 21, 2025
de96f85
Aqua set unbound_args
vchuravy Apr 21, 2025
1a7cff2
lower bound CUDA to 5.2
vchuravy Apr 22, 2025
68edf29
add initial CUDA pipeline
vchuravy Apr 21, 2025
11ff63a
add storage_type, real_type to semidiscretize
vchuravy Apr 22, 2025
4d8a31f
add GPU construction test
vchuravy Apr 22, 2025
6ca8c3d
don't adapt Array{MArray}
vchuravy Apr 22, 2025
4ef2d98
add some more cuda adapt tests
vchuravy Apr 22, 2025
77395f5
use sources for dev branch
vchuravy Apr 28, 2025
1d78f07
fixup! use sources for dev branch
vchuravy May 8, 2025
39535ee
use released version of CUDA
vchuravy May 14, 2025
b973758
Update .buildkite/pipeline.yml
vchuravy May 14, 2025
7105da7
fix test_p4est_2d
vchuravy Jun 30, 2025
1fd6fe6
fix first GPU test
vchuravy Jun 30, 2025
d8a4bc8
Merge branch 'vc/adapt' into feature-gpu-offloading
benegee Jul 1, 2025
6ceef3a
address review comments
vchuravy Jul 1, 2025
7a53362
offload compute_coefficients
benegee Jul 1, 2025
68eb905
fmt
benegee Jul 1, 2025
3d00bdf
fixup! address review comments
vchuravy Jul 1, 2025
4b32fa0
add review comments
vchuravy Jul 1, 2025
10f7593
convert fstar_* cache entries to VecOfArrays
benegee Jul 1, 2025
c83bdbd
restore elixir
benegee Jul 1, 2025
8c6c57d
Merge branch 'vc/adapt' into feature-gpu-offloading
benegee Jul 2, 2025
d3b94fc
test native version as well
benegee Jul 2, 2025
97e13ec
adapt 1D and 3D version
benegee Jul 2, 2025
44f7134
Downgrade compat with Adapt
benegee Jul 2, 2025
abbcc56
Use Adapt.jl to change storage and element type
vchuravy Dec 17, 2024
a18e5d2
restore elixir
benegee Jul 1, 2025
5c942fe
offload compute_coefficients
benegee Jul 1, 2025
47a55f2
fmt
benegee Jul 1, 2025
36b0e4a
test native version as well
benegee Jul 2, 2025
153d828
adapt 1D and 3D version
benegee Jul 2, 2025
819ba75
Downgrade compat with Adapt
benegee Jul 2, 2025
e75cac7
update requires to 1.3
vchuravy Jul 2, 2025
4b6f63e
Merge branch 'vc/adapt' into feature-gpu-offloading
benegee Jul 2, 2025
61b4da1
Merge branch 'main' into feature-gpu-offloading
benegee Sep 16, 2025
e7cde27
missed during merge
benegee Sep 16, 2025
b174d6d
mistakes during merge
benegee Sep 16, 2025
489bb24
cleanup
benegee Sep 18, 2025
b4d1535
Basis kernels for 3D P4est
benegee Sep 18, 2025
2443cf8
port stepsize computation
benegee Sep 18, 2025
fc13ea5
CPU workaround for analysis callback
benegee Sep 18, 2025
2ff2f52
tests
benegee Sep 18, 2025
bc4ad17
add benchmark
benegee Sep 19, 2025
de06c61
fix max_dt
benegee Sep 19, 2025
29298a5
profiler output
benegee Sep 25, 2025
281a540
Merge branch 'main' into feature-gpu-offloading
benegee Sep 29, 2025
962a383
fmt
benegee Sep 29, 2025
a60e27d
missed max_dt calls
benegee Sep 29, 2025
ce742a3
Merge branch 'main' into feature-gpu-offloading
benegee Sep 30, 2025
2073d7c
some fixes
benegee Sep 30, 2025
9a2f130
after merge fixes
benegee Sep 30, 2025
9a47f29
some more fixes
benegee Sep 30, 2025
94f5d90
Merge branch 'main' into feature-gpu-offloading
benegee Oct 1, 2025
6ffb69f
post merge fixes
benegee Oct 1, 2025
fb25fa2
Merge branch 'main' into feature-gpu-offloading
benegee Oct 1, 2025
307c3eb
more
benegee Oct 1, 2025
c39b4de
more
benegee Oct 1, 2025
a38cc03
Squashed commit of the following:
benegee Oct 7, 2025
013244d
Apply suggestions from code review
benegee Oct 8, 2025
5b2c0bf
Merge branch 'feature-gpu-offloading' of github.com:trixi-framework/T…
benegee Oct 8, 2025
8d5a55b
Merge branch 'main' into feature-gpu-offloading
benegee Oct 8, 2025
8a98d27
!fixup
benegee Oct 8, 2025
7de1e57
fmt
benegee Oct 8, 2025
31a65cb
pass backend through
benegee Oct 8, 2025
4064e79
fixes
benegee Oct 8, 2025
af50cda
backends here and there
benegee Oct 8, 2025
5893d4d
almost everywhere
benegee Oct 8, 2025
a1caa12
some more
benegee Oct 8, 2025
a5cded3
next round
benegee Oct 8, 2025
7c6ab4a
could this be...
benegee Oct 9, 2025
79 changes: 79 additions & 0 deletions benchmark/CUDA/elixir_euler_taylor_green_vortex.jl
@@ -0,0 +1,79 @@
using OrdinaryDiffEqLowStorageRK
using Trixi

###############################################################################
# semidiscretization of the compressible Euler equations

equations = CompressibleEulerEquations3D(1.4)

function initial_condition_taylor_green_vortex(x, t,
equations::CompressibleEulerEquations3D)
A = 1.0 # magnitude of speed
Ms = 0.1 # maximum Mach number

rho = 1.0
v1 = A * sin(x[1]) * cos(x[2]) * cos(x[3])
v2 = -A * cos(x[1]) * sin(x[2]) * cos(x[3])
v3 = 0.0
p = (A / Ms)^2 * rho / equations.gamma # scaling to get Ms
p = p +
1.0 / 16.0 * A^2 * rho *
(cos(2 * x[1]) * cos(2 * x[3]) +
2 * cos(2 * x[2]) + 2 * cos(2 * x[1]) + cos(2 * x[2]) * cos(2 * x[3]))

return prim2cons(SVector(rho, v1, v2, v3, p), equations)
end

initial_condition = initial_condition_taylor_green_vortex

# TODO Undefined external symbol "log"
#volume_flux = flux_ranocha
volume_flux = flux_lax_friedrichs
solver = DGSEM(polydeg = 5, surface_flux = volume_flux)
# TODO flux diff
#volume_integral=VolumeIntegralFluxDifferencing(volume_flux))

coordinates_min = (-1.0, -1.0, -1.0) .* pi
coordinates_max = (1.0, 1.0, 1.0) .* pi

initial_refinement_level = 1
trees_per_dimension = (4, 4, 4)

mesh = P4estMesh(trees_per_dimension, polydeg = 1,
coordinates_min = coordinates_min, coordinates_max = coordinates_max,
periodicity = true, initial_refinement_level = initial_refinement_level)

semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver)

###############################################################################
# ODE solvers, callbacks etc.

tspan = (0.0, 100.0)
ode = semidiscretize(semi, tspan; storage_type = nothing, real_type = nothing)

summary_callback = SummaryCallback()

stepsize_callback = StepsizeCallback(cfl = 0.1)

callbacks = CallbackSet(summary_callback,
stepsize_callback)

###############################################################################
# run the simulation

maxiters = 200
run_profiler = false

# disable warnings when maxiters is reached
integrator = init(ode, CarpenterKennedy2N54(williamson_condition = false),
dt = 1.0,
save_everystep = false, callback = callbacks,
maxiters = maxiters, verbose = false)
if run_profiler
prof_result = CUDA.@profile solve!(integrator)
else
solve!(integrator)
prof_result = nothing
end

finalize(mesh)
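The `storage_type = nothing, real_type = nothing` defaults in the `semidiscretize` call above keep the usual CPU `Array`/`Float64` setup; the benchmark driver below overrides them. As a hedged, minimal sketch, the GPU configuration corresponds to:

using CUDA
# Hedged sketch: request GPU storage and single precision for the ODE problem;
# `nothing` (the default used above) keeps Array/Float64 on the CPU.
ode = semidiscretize(semi, tspan; storage_type = CuArray, real_type = Float32)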
89 changes: 89 additions & 0 deletions benchmark/CUDA/run.jl
@@ -0,0 +1,89 @@
using Trixi
using CUDA
using TimerOutputs
using JSON

function main(elixir_path)

# setup
maxiters = 50
initial_refinement_level = 3
storage_type = CuArray
real_type = Float64

println("Warming up...")

# start simulation with tiny final time to trigger compilation
duration_compile = @elapsed begin
trixi_include(elixir_path,
tspan = (0.0, 1e-14),
storage_type = storage_type,
real_type = real_type)
trixi_include(elixir_path,
tspan = (0.0, 1e-14),
storage_type = storage_type,
real_type = Float32)
end

println("Finished warm-up in $duration_compile seconds\n")
println("Starting simulation...")

# start the real simulation
duration_elixir = @elapsed trixi_include(elixir_path,
maxiters = maxiters,
initial_refinement_level = initial_refinement_level,
storage_type = storage_type,
real_type = real_type)

# store metrics (on every rank!)
metrics = Dict{String, Float64}("elapsed time" => duration_elixir)

# read TimerOutputs timings
timer = Trixi.timer()
metrics["total time"] = 1.0e-9 * TimerOutputs.tottime(timer)
metrics["rhs! time"] = 1.0e-9 * TimerOutputs.time(timer["rhs!"])

# compute performance index
nrhscalls = Trixi.ncalls(semi.performance_counter)
walltime = 1.0e-9 * take!(semi.performance_counter)
metrics["PID"] = walltime * Trixi.mpi_nranks() / (Trixi.ndofsglobal(semi) * nrhscalls)

# write json file
open("metrics.out", "w") do f
indent = 2
JSON.print(f, metrics, indent)
end

# run profiler
maxiters = 5
initial_refinement_level = 1

println("Running profiler (Float64)...")
trixi_include(elixir_path,
maxiters = maxiters,
initial_refinement_level = initial_refinement_level,
storage_type = storage_type,
real_type = Float64,
run_profiler = true)

open("profile_float64.txt", "w") do io
show(io, prof_result)
end

println("Running profiler (Float32)...")
trixi_include(elixir_path,
maxiters = maxiters,
initial_refinement_level = initial_refinement_level,
storage_type = storage_type,
real_type = Float32,
run_profiler = true)

open("profile_float32.txt", "w") do io
show(io, prof_result)
end
end

# hardcoded elixir
elixir_path = joinpath(@__DIR__(), "elixir_euler_taylor_green_vortex.jl")

main(elixir_path)
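As a small companion sketch (not part of this PR), the metrics file written above can be read back with JSON.jl, e.g. to compare Float64 and Float32 runs:

using JSON

# Hedged sketch: parse the metrics.out written by run.jl and print the entries sorted by name.
metrics = JSON.parsefile("metrics.out")
for (name, value) in sort!(collect(metrics); by = first)
    println(rpad(name, 14), " => ", value)
end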
5 changes: 2 additions & 3 deletions examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl
@@ -48,9 +48,8 @@ save_solution = SaveSolutionCallback(interval = 100,
stepsize_callback = StepsizeCallback(cfl = 1.6)

# Create a CallbackSet to collect all callbacks such that they can be passed to the ODE solver
callbacks = CallbackSet(summary_callback, stepsize_callback)
# TODO: GPU. The `analysis_callback` needs to be updated for GPU support
# analysis_callback, save_solution, stepsize_callback)
callbacks = CallbackSet(summary_callback, analysis_callback,
save_solution, stepsize_callback)

###############################################################################
# run the simulation
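With the CPU fallback added to src/callbacks_step/analysis_dg2d.jl (below), the re-enabled analysis and solution-saving callbacks also work when this elixir runs with GPU storage. A hedged invocation sketch, following the pattern used in benchmark/CUDA/run.jl and assuming this elixir passes `storage_type`/`real_type` defaults to `semidiscretize` like its 3D counterpart:

using CUDA
using Trixi

# Hedged sketch: override the storage and element type when including the elixir.
trixi_include(joinpath(examples_dir(), "p4est_2d_dgsem",
                       "elixir_advection_basic_gpu.jl"),
              storage_type = CuArray, real_type = Float32)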
60 changes: 60 additions & 0 deletions examples/p4est_3d_dgsem/elixir_advection_basic_gpu.jl
@@ -0,0 +1,60 @@
# The same setup as tree_3d_dgsem/elixir_advection_basic.jl
# to verify the P4estMesh implementation against TreeMesh

using OrdinaryDiffEqLowStorageRK
using Trixi

###############################################################################
# semidiscretization of the linear advection equation

advection_velocity = (0.2, -0.7, 0.5)
equations = LinearScalarAdvectionEquation3D(advection_velocity)

# Create DG solver with polynomial degree = 3 and (local) Lax-Friedrichs/Rusanov flux as surface flux
solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs)

coordinates_min = (-1.0, -1.0, -1.0) # minimum coordinates (min(x), min(y), min(z))
coordinates_max = (1.0, 1.0, 1.0) # maximum coordinates (max(x), max(y), max(z))

# Create P4estMesh with 8 x 8 x 8 elements (note `initial_refinement_level = 1`)
trees_per_dimension = (4, 4, 4)
mesh = P4estMesh(trees_per_dimension, polydeg = 3,
coordinates_min = coordinates_min, coordinates_max = coordinates_max,
initial_refinement_level = 1)

# A semidiscretization collects data structures and functions for the spatial discretization
semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition_convergence_test,
solver)

###############################################################################
# ODE solvers, callbacks etc.

# Create ODE problem with time span from 0.0 to 1.0
tspan = (0.0, 1.0)
ode = semidiscretize(semi, tspan; real_type = nothing, storage_type = nothing)

# At the beginning of the main loop, the SummaryCallback prints a summary of the simulation setup
# and resets the timers
summary_callback = SummaryCallback()

# The AnalysisCallback allows analysing the solution at regular intervals and prints the results
analysis_callback = AnalysisCallback(semi, interval = 100)

# The SaveSolutionCallback allows saving the solution to a file at regular intervals
save_solution = SaveSolutionCallback(interval = 100,
solution_variables = cons2prim)

# The StepsizeCallback handles the re-calculation of the maximum Δt after each time step
stepsize_callback = StepsizeCallback(cfl = 1.2)

# Create a CallbackSet to collect all callbacks such that they can be passed to the ODE solver
callbacks = CallbackSet(summary_callback, analysis_callback,
save_solution, stepsize_callback)

###############################################################################
# run the simulation

# OrdinaryDiffEq's `solve` method evolves the solution in time and executes the passed callbacks
sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false);
dt = 0.05, # solve needs some value here but it will be overwritten by the stepsize_callback
ode_default_options()..., callback = callbacks);
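For reference, a quick sanity check of the problem size set up above (plain arithmetic, not code from this PR):

# 4^3 trees refined once give 8^3 = 512 elements; with polydeg = 3 each element
# carries (3 + 1)^3 = 64 nodes, i.e. 512 * 64 = 32768 degrees of freedom for the
# scalar advection equation.
n_elements = 4^3 * 2^(3 * 1)               # trees_per_dimension, refined once in 3D
n_nodes_per_element = (3 + 1)^3            # (polydeg + 1)^3
n_dofs = n_elements * n_nodes_per_element  # 32768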
3 changes: 2 additions & 1 deletion src/Trixi.jl
@@ -59,7 +59,8 @@ using DiffEqCallbacks: PeriodicCallback, PeriodicCallbackAffect
using FillArrays: Ones, Zeros
using ForwardDiff: ForwardDiff
using HDF5: HDF5, h5open, attributes, create_dataset, datatype, dataspace
using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Backend
using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Backend,
allocate
using LinearMaps: LinearMap
if _PREFERENCE_LOOPVECTORIZATION
using LoopVectorization: LoopVectorization, @turbo, indices
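The newly imported `allocate` is KernelAbstractions' backend-generic allocator. A minimal sketch of the pattern it enables (illustrative, not code from this PR):

using KernelAbstractions: allocate, get_backend

# Hedged sketch: allocate uninitialized storage on whatever backend `u` lives on
# (CPU() for Array, CUDABackend() for CuArray) instead of hard-coding the array type.
function similar_storage(u)
    backend = get_backend(u)
    return allocate(backend, eltype(u), size(u)...)
end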
41 changes: 35 additions & 6 deletions src/callbacks_step/analysis_dg2d.jl
@@ -138,17 +138,27 @@ function calc_error_norms(func, u, t, analyzer,
return l2_error, linf_error
end

function calc_error_norms(func, u, t, analyzer,
function calc_error_norms(func, _u, t, analyzer,
mesh::Union{StructuredMesh{2}, StructuredMeshView{2},
UnstructuredMesh2D,
P4estMesh{2}, P4estMeshView{2},
T8codeMesh{2}},
equations,
initial_condition, dg::DGSEM, cache, cache_analysis)
@unpack vandermonde, weights = analyzer
@unpack node_coordinates, inverse_jacobian = cache.elements
@unpack u_local, u_tmp1, x_local, x_tmp1, jacobian_local, jacobian_tmp1 = cache_analysis

# TODO GPU AnalysisCallback currently lives on CPU
backend = trixi_backend(_u)
if backend isa Nothing # TODO GPU KA CPU backend
@unpack node_coordinates, inverse_jacobian = cache.elements
u = _u
else
node_coordinates = Array(cache.elements.node_coordinates)
inverse_jacobian = Array(cache.elements.inverse_jacobian)
u = Array(_u)
end

# Set up data structures
l2_error = zero(func(get_node_vars(u, equations, dg, 1, 1, 1), equations))
linf_error = copy(l2_error)
@@ -210,13 +220,23 @@ function integrate_via_indices(func::Func, u,
return integral
end

function integrate_via_indices(func::Func, u,
function integrate_via_indices(func::Func, _u,
mesh::Union{StructuredMesh{2}, StructuredMeshView{2},
UnstructuredMesh2D, P4estMesh{2},
T8codeMesh{2}},
equations,
dg::DGSEM, cache, args...; normalize = true) where {Func}
@unpack weights = dg.basis
# TODO GPU AnalysisCallback currently lives on CPU
backend = trixi_backend(_u)
if backend isa Nothing # TODO GPU KA CPU backend
@unpack weights = dg.basis
@unpack inverse_jacobian = cache.elements
u = _u
else
weights = Array(dg.basis.weights)
inverse_jacobian = Array(cache.elements.inverse_jacobian)
u = Array(_u)
end

# Initialize integral with zeros of the right shape
integral = zero(func(u, 1, 1, 1, equations, dg, args...))
@@ -226,7 +246,7 @@ function integrate_via_indices(func::Func, u,
@batch reduction=((+, integral), (+, total_volume)) for element in eachelement(dg,
cache)
for j in eachnode(dg), i in eachnode(dg)
volume_jacobian = abs(inv(cache.elements.inverse_jacobian[i, j, element]))
volume_jacobian = abs(inv(inverse_jacobian[i, j, element]))
integral += volume_jacobian * weights[i] * weights[j] *
func(u, i, j, element, equations, dg, args...)
total_volume += volume_jacobian * weights[i] * weights[j]
@@ -271,10 +291,19 @@ function integrate(func::Func, u,
end
end

function analyze(::typeof(entropy_timederivative), du, u, t,
function analyze(::typeof(entropy_timederivative), _du, u, t,
mesh::Union{TreeMesh{2}, StructuredMesh{2}, StructuredMeshView{2},
UnstructuredMesh2D, P4estMesh{2}, T8codeMesh{2}},
equations, dg::DG, cache)
# TODO GPU AnalysisCallback currently lives on CPU
backend = trixi_backend(u)
if backend isa Nothing # TODO GPU KA CPU backend
du = _du
else
du = Array(_du)
end

# Calculate ∫(∂S/∂u ⋅ ∂u/∂t)dΩ
integrate_via_indices(u, mesh, equations, dg, cache,
du) do u, i, j, element, equations, dg, du
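The three hunks above share one pattern: `trixi_backend(u)` returns `nothing` for plain CPU arrays and a KernelAbstractions backend otherwise, and the analysis code copies device data back to host `Array`s before integrating. A condensed sketch of that fallback (the helper name `on_host` is illustrative, not something this PR introduces):

# Hedged sketch of the CPU fallback used by the analysis callback above;
# `on_host` is an illustrative helper, not a function added by this PR.
on_host(x) = Trixi.trixi_backend(x) === nothing ? x : Array(x)

# Usage mirroring calc_error_norms: move the solution and mesh data to the host first.
# u_host = on_host(u)
# node_coordinates = on_host(cache.elements.node_coordinates)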