diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 0f8ad475db8..344b8eacc3a 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,3 +1,5 @@ +env: + steps: - label: "CUDA Julia {{matrix.version}}" matrix: @@ -7,12 +9,13 @@ steps: plugins: - JuliaCI/julia#v1: version: "{{matrix.version}}" - command: | - true + - JuliaCI/julia-test#v1: ~ + env: + TRIXI_TEST: "CUDA" agents: queue: "juliagpu" cuda: "*" if: build.message !~ /\[skip ci\]/ timeout_in_minutes: 60 soft_fail: - - exit_status: 3 \ No newline at end of file + - exit_status: 3 diff --git a/.github/workflows/GPUCompat.yml b/.github/workflows/GPUCompat.yml deleted file mode 100644 index 335e1c83c4c..00000000000 --- a/.github/workflows/GPUCompat.yml +++ /dev/null @@ -1,86 +0,0 @@ -name: GPU Package Compatibility - -on: - pull_request: - paths-ignore: - - 'AUTHORS.md' - - 'CITATION.bib' - - 'CONTRIBUTING.md' - - 'LICENSE.md' - - 'NEWS.md' - - 'README.md' - - '.zenodo.json' - - '.github/workflows/benchmark.yml' - - '.github/workflows/CompatHelper.yml' - - '.github/workflows/TagBot.yml' - - 'benchmark/**' - - 'docs/**' - - 'utils/**' - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - test: - if: "!contains(github.event.head_commit.message, 'skip ci')" - name: ${{ matrix.os }} - ${{ matrix.arch }} - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - include: - - version: '1.10' - os: ubuntu-latest - arch: x64 - - version: '1.10' - os: windows-latest - arch: x64 - # CUDA.jl only supports 64-bit Linux and Windows, see https://github.com/JuliaGPU/CUDA.jl?tab=readme-ov-file#requirements - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Set up Julia - uses: julia-actions/setup-julia@v2 - with: - version: ${{ matrix.version }} - arch: ${{ matrix.arch }} - - - name: Display version info - run: julia -e 'using InteractiveUtils; versioninfo(verbose=true)' - - - name: Cache Julia packages - uses: julia-actions/cache@v2 - - - name: Build project - uses: julia-actions/julia-buildpkg@v1 - - # Only CUDA.jl is needed for GPU compatibility test now - - name: Add CUDA.jl to environment - run: | - julia --project=. -e ' - using Pkg; - Pkg.activate(temp=true); - Pkg.develop(PackageSpec(path=pwd())); - Pkg.add("CUDA"); - Pkg.update()' - - # - name: Add Metal.jl to environment - # run: | - # julia --project=. -e ' - # using Pkg; - # Pkg.activate(temp=true); - # Pkg.develop(PackageSpec(path=pwd())); - # Pkg.add("Metal"); - # Pkg.update()' - - # - name: Add AMDGPU.jl to environment - # run: | - # julia --project=. -e ' - # using Pkg; - # Pkg.activate(temp=true); - # Pkg.develop(PackageSpec(path=pwd())); - # Pkg.add("AMDGPU"); - # Pkg.update()' diff --git a/NEWS.md b/NEWS.md index 72f572a74fc..c96b522da94 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,13 @@ Trixi.jl follows the interpretation of used in the Julia ecosystem. Notable changes will be documented in this file for human readability. +## Changes in the v0.12 lifecycle + +#### Added +- Initial support for adapting data-structures between different storage arrays was added. This enables future work to support GPU with Trixi ([#2212]). 
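As a rough illustration of what the NEWS entry above enables (not part of the patch; the keywords are added to `semidiscretize` later in this diff, and GPU array types remain experimental), a semidiscretization could be adapted to GPU storage and single precision roughly like so:

```julia
# Hypothetical usage sketch; requires a CUDA-capable GPU and `using CUDA`.
# Both keywords default to `nothing`, which keeps the previous CPU behavior.
using CUDA
ode = semidiscretize(semi, (0.0, 1.0);
                     storage_type = CuArray,  # adapt container storage to CUDA arrays
                     real_type = Float32)     # convert floating-point data to Float32
```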
+ +#### Deprecated + ## Changes when updating to v0.12 from v0.11.x #### Added diff --git a/Project.toml b/Project.toml index 83bfc35f982..72457d14258 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "0.12.7-DEV" [deps] Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697" +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" CodeTracking = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2" ConstructionBase = "187b0558-2788-49d3-abe0-74a17ed4e7c9" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" @@ -16,6 +17,7 @@ EllipsisNotation = "da5c29d0-fa7d-589e-88eb-ea29b0a81949" FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LinearMaps = "7a12625a-238d-50fd-b39a-03d52299707e" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" @@ -52,24 +54,31 @@ TrixiBase = "9a0f1c46-06d5-4909-a5a3-ce25d3fa3284" UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [weakdeps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Convex = "f65535da-76fb-5f13-bab9-19810c17039a" ECOS = "e2685f51-7e38-5353-a97d-a921fd2c8199" Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a" NLsolve = "2774e3e8-f4cf-5e23-947b-6d7e65073b56" [extensions] +TrixiAMDGPUExt = "AMDGPU" +TrixiCUDAExt = "CUDA" TrixiConvexECOSExt = ["Convex", "ECOS"] TrixiMakieExt = "Makie" TrixiNLsolveExt = "NLsolve" [compat] +AMDGPU = "1.3.5" Accessors = "0.1.36" +Adapt = "4" +CUDA = "5.8" CodeTracking = "1.0.5" ConstructionBase = "1.5" Convex = "0.16" DataStructures = "0.18.15" DelimitedFiles = "1" -DiffEqBase = "6.154" +DiffEqBase = "6.155.2" DiffEqCallbacks = "2.35, 3, 4" Downloads = "1.6" ECOS = "1.1.2" @@ -77,6 +86,7 @@ EllipsisNotation = "1.0" FillArrays = "1.9" ForwardDiff = "0.10.36, 1" HDF5 = "0.16.10, 0.17" +KernelAbstractions = "0.9.36" LinearAlgebra = "1" LinearMaps = "2.7, 3.0" LoopVectorization = "0.12.171" @@ -94,7 +104,7 @@ Printf = "1" RecipesBase = "1.3.4" RecursiveArrayTools = "3.31.1" Reexport = "1.2" -Requires = "1.1" +Requires = "1.3" SciMLBase = "2.67.0" SimpleUnPack = "1.1" SparseArrays = "1" @@ -104,7 +114,7 @@ Static = "1.1.1" StaticArrayInterface = "1.5.1" StaticArrays = "1.9" StrideArrays = "0.1.29" -StructArrays = "0.6.18, 0.7" +StructArrays = "0.6.20, 0.7" SummationByPartsOperators = "0.5.52" T8code = "0.7.4" TimerOutputs = "0.5.23" diff --git a/docs/Project.toml b/docs/Project.toml index f2623d6c1ba..0b6b0cc7392 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,4 +1,5 @@ [deps] +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" Changelog = "5217a498-cd5d-4ec6-b8c2-9b85a09b6e3e" Convex = "f65535da-76fb-5f13-bab9-19810c17039a" @@ -16,9 +17,13 @@ OrdinaryDiffEqSSPRK = "669c94d9-1f4b-4b64-b377-1aa079aa2388" OrdinaryDiffEqTsit5 = "b1df2697-797e-41e3-8120-5422d3b24e4a" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +Trixi = "a7f1ee26-1774-49b1-8366-f1abc58fbfcb" Trixi2Vtk = "bc1476a1-1ca6-4cc3-950b-c312b255ff95" TrixiBase = "9a0f1c46-06d5-4909-a5a3-ce25d3fa3284" +[sources] +Trixi = {path = ".."} + [compat] CairoMakie = "0.12, 0.13, 0.14, 0.15" Changelog = "1.1" diff --git a/docs/make.jl b/docs/make.jl index 7111b66ab94..0301f5ba64e 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -163,7 +163,8 @@ makedocs( "Style guide" => "styleguide.md", "Testing" => "testing.md", 
"Performance" => "performance.md", - "Parallelization" => "parallelization.md" + "Parallelization" => "parallelization.md", + "Heterogeneous" => "heterogeneous.md" ], "Troubleshooting and FAQ" => "troubleshooting.md", "Reference" => [ diff --git a/docs/src/heterogeneous.md b/docs/src/heterogeneous.md new file mode 100644 index 00000000000..451945ab6cd --- /dev/null +++ b/docs/src/heterogeneous.md @@ -0,0 +1,163 @@ +# Heterogeneous computing + +Support for heterogeneous computing is currently being worked on. + +## The use of Adapt.jl + +[Adapt.jl](https://github.com/JuliaGPU/Adapt.jl) is a package in the +[JuliaGPU](https://github.com/JuliaGPU) family that allows for +the translation of nested data structures. The primary goal is to allow the substitution of `Array` +at the storage level with a GPU array like `CuArray` from [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl). + +To facilitate this, data structures must be parameterized, so instead of: + +```julia +struct Container <: Trixi.AbstractContainer + data::Array{Float64, 2} +end +``` + +They must be written as: + +```jldoctest adapt; output = false, setup=:(import Trixi) +struct Container{D<:AbstractArray} <: Trixi.AbstractContainer + data::D +end + +# output + +``` + +furthermore, we need to define a function that allows for the conversion of storage +of our types: + +```jldoctest adapt; output = false +using Adapt + +function Adapt.adapt_structure(to, C::Container) + return Container(adapt(to, C.data)) +end + +# output + +``` + +or simply + +```julia +Adapt.@adapt_structure(Container) +``` + +additionally, we must define `Adapt.parent_type`. + +```jldoctest adapt; output = false +function Adapt.parent_type(::Type{<:Container{D}}) where D + return D +end + +# output + +``` + +All together we can use this machinery to perform conversions of a container. + +```jldoctest adapt +julia> C = Container(zeros(3)) +Container{Vector{Float64}}([0.0, 0.0, 0.0]) + +julia> Trixi.storage_type(C) +Array +``` + + +```julia-repl +julia> using CUDA + +julia> GPU_C = adapt(CuArray, C) +Container{CuArray{Float64, 1, CUDA.DeviceMemory}}([0.0, 0.0, 0.0]) + +julia> Trixi.storage_type(C) +CuArray +``` + +## Element-type conversion with `Trixi.trixi_adapt`. + +We can use [`Trixi.trixi_adapt`](@ref) to perform both an element-type and a storage-type adoption: + +```jldoctest adapt +julia> C = Container(zeros(3)) +Container{Vector{Float64}}([0.0, 0.0, 0.0]) + +julia> Trixi.trixi_adapt(Array, Float32, C) +Container{Vector{Float32}}(Float32[0.0, 0.0, 0.0]) +``` + +```julia-repl +julia> Trixi.trixi_adapt(CuArray, Float32, C) +Container{CuArray{Float32, 1, CUDA.DeviceMemory}}(Float32[0.0, 0.0, 0.0]) +``` + +!!! note + `adapt(Array{Float32}, C)` is tempting, but it will do the wrong thing + in the presence of `SVector`s and similar arrays from StaticArrays.jl. + + +## Writing GPU kernels + +Offloading computations to the GPU is done with +[KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl), +allowing for vendor-agnostic GPU code. + +### Example + +Given the following Trixi.jl code, which would typically be called from within `rhs!`: + +```julia +function trixi_rhs_fct(mesh, equations, solver, cache, args) + @threaded for element in eachelement(solver, cache) + # code + end +end +``` + +1. Put the inner code in a new function `rhs_fct_per_element`. Besides the index + `element`, pass all required fields as arguments, but make sure to `@unpack` them from + their structs in advance. + +2. 
Where `trixi_rhs_fct` is called, get the backend, i.e., the hardware we are currently + running on via `trixi_backend(x)`. + This will, e.g., work with `u_ode`. Internally, KernelAbstractions.jl's `get_backend` + will be called, i.e., KernelAbstractions.jl has to know the type of `x`. + + ```julia + backend = trixi_backend(u_ode) + ``` + +3. Add a new argument `backend` to `trixi_rhs_fct` used for dispatch. + When `backend` is `nothing`, the legacy implementation should be used: + ```julia + function trixi_rhs_fct(backend::Nothing, mesh, equations, solver, cache, args) + @unpack unpacked_args = cache + @threaded for element in eachelement(solver, cache) + rhs_fct_per_element(element, unpacked_args, args) + end + end + ``` + +4. When `backend` is a `Backend` (a type defined by KernelAbstractions.jl), write a + KernelAbstractions.jl kernel: + ```julia + function trixi_rhs_fct(backend::Backend, mesh, equations, solver, cache, args) + nelements(solver, cache) == 0 && return nothing # return early when there are no elements + @unpack unpacked_args = cache + kernel! = rhs_fct_kernel!(backend) + kernel!(unpacked_args, args, + ndrange = nelements(solver, cache)) + return nothing + end + + @kernel function rhs_fct_kernel!(unpacked_args, args) + element = @index(Global) + rhs_fct_per_element(element, unpacked_args, args) + end + ``` \ No newline at end of file diff --git a/docs/src/styleguide.md b/docs/src/styleguide.md index 27210f951c0..07f2d90cddc 100644 --- a/docs/src/styleguide.md +++ b/docs/src/styleguide.md @@ -20,6 +20,7 @@ conventions, we apply and enforce automated source code formatting * The main modified argument comes first. For example, if the right-hand side `du` is modified, it should come first. If only the `cache` is modified, e.g., in `prolong2interfaces!` and its siblings, put the `cache` first. + * Some internal functions take a "computational backend" argument, this should always be passed as the first argument. * Otherwise, use the order `mesh, equations, solver, cache`. * If something needs to be specified in more detail for dispatch, put the additional argument before the general one that is specified in more detail. For example, we use `have_nonconservative_terms(equations), equations` diff --git a/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl new file mode 100644 index 00000000000..6f9e8e56986 --- /dev/null +++ b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl @@ -0,0 +1,61 @@ +# The same setup as tree_2d_dgsem/elixir_advection_basic.jl +# to verify GPU support and Adapt.jl support. 
+ +using OrdinaryDiffEqLowStorageRK +using Trixi + +############################################################################### +# semidiscretization of the linear advection equation + +advection_velocity = (0.2, -0.7) +equations = LinearScalarAdvectionEquation2D(advection_velocity) + +# Create DG solver with polynomial degree = 3 and (local) Lax-Friedrichs/Rusanov flux as surface flux +solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs) + +coordinates_min = (-1.0, -1.0) # minimum coordinates (min(x), min(y)) +coordinates_max = (1.0, 1.0) # maximum coordinates (max(x), max(y)) + +trees_per_dimension = (8, 8) + +# Create P4estMesh with 8 x 8 trees and 16 x 16 elements +mesh = P4estMesh(trees_per_dimension, polydeg = 3, + coordinates_min = coordinates_min, coordinates_max = coordinates_max, + initial_refinement_level = 1) + +# A semidiscretization collects data structures and functions for the spatial discretization +semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition_convergence_test, + solver) + +############################################################################### +# ODE solvers, callbacks etc. + +# Create ODE problem with time span from 0.0 to 1.0 +ode = semidiscretize(semi, (0.0, 1.0); real_type = nothing, storage_type = nothing) + +# At the beginning of the main loop, the SummaryCallback prints a summary of the simulation setup +# and resets the timers +summary_callback = SummaryCallback() + +# The AnalysisCallback allows to analyse the solution in regular intervals and prints the results +analysis_callback = AnalysisCallback(semi, interval = 100) + +# The SaveSolutionCallback allows to save the solution to a file in regular intervals +save_solution = SaveSolutionCallback(interval = 100, + solution_variables = cons2prim) + +# The StepsizeCallback handles the re-calculation of the maximum Δt after each time step +stepsize_callback = StepsizeCallback(cfl = 1.6) + +# Create a CallbackSet to collect all callbacks such that they can be passed to the ODE solver +callbacks = CallbackSet(summary_callback, stepsize_callback) +# TODO: GPU. 
The `analysis_callback` needs to be updated for GPU support +# analysis_callback, save_solution, stepsize_callback) + +############################################################################### +# run the simulation + +# OrdinaryDiffEq's `solve` method evolves the solution in time and executes the passed callbacks +sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false); + dt = 1e-2, # solve needs some value here but it will be overwritten by the stepsize_callback + ode_default_options()..., callback = callbacks); diff --git a/ext/TrixiAMDGPUExt.jl b/ext/TrixiAMDGPUExt.jl new file mode 100644 index 00000000000..f3f37769fcd --- /dev/null +++ b/ext/TrixiAMDGPUExt.jl @@ -0,0 +1,19 @@ +# Package extension for adding AMDGPU-based features to Trixi.jl +module TrixiAMDGPUExt + +import AMDGPU: ROCArray +import Trixi + +function Trixi.storage_type(::Type{<:ROCArray}) + return ROCArray +end + +function Trixi.unsafe_wrap_or_alloc(to::Type{<:ROCArray}, vector, size) + if length(vector) == 0 + return similar(vector, size) + else + return unsafe_wrap(to, pointer(vector), size, lock = false) + end +end + +end diff --git a/ext/TrixiCUDAExt.jl b/ext/TrixiCUDAExt.jl new file mode 100644 index 00000000000..681d2f53a1e --- /dev/null +++ b/ext/TrixiCUDAExt.jl @@ -0,0 +1,11 @@ +# Package extension for adding CUDA-based features to Trixi.jl +module TrixiCUDAExt + +import CUDA: CuArray +import Trixi + +function Trixi.storage_type(::Type{<:CuArray}) + return CuArray +end + +end diff --git a/src/Trixi.jl b/src/Trixi.jl index a707437655e..dda4bc021f5 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -20,6 +20,7 @@ const _PREFERENCE_SQRT = @load_preference("sqrt", "sqrt_Trixi_NaN") const _PREFERENCE_LOG = @load_preference("log", "log_Trixi_NaN") const _PREFERENCE_POLYESTER = @load_preference("polyester", true) const _PREFERENCE_LOOPVECTORIZATION = @load_preference("loop_vectorization", true) +const _PREFERENCE_USE_NATIVE_THREADING = @load_preference("native_threading", true) # Include other packages that are used in Trixi.jl # (standard library packages first, other packages next, all of them sorted alphabetically) @@ -50,6 +51,7 @@ import SciMLBase: get_du, get_tmp_cache, u_modified!, using DelimitedFiles: readdlm using Downloads: Downloads +using Adapt: Adapt, adapt using CodeTracking: CodeTracking using ConstructionBase: ConstructionBase using DiffEqBase: DiffEqBase, get_tstops, get_tstops_array @@ -58,6 +60,7 @@ using DiffEqCallbacks: PeriodicCallback, PeriodicCallbackAffect using FillArrays: Ones, Zeros using ForwardDiff: ForwardDiff using HDF5: HDF5, h5open, attributes, create_dataset, datatype, dataspace +using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Backend using LinearMaps: LinearMap if _PREFERENCE_LOOPVECTORIZATION using LoopVectorization: LoopVectorization, @turbo, indices @@ -132,6 +135,7 @@ include("basic_types.jl") # Include all top-level source files include("auxiliary/auxiliary.jl") +include("auxiliary/vector_of_arrays.jl") include("auxiliary/mpi.jl") include("auxiliary/p4est.jl") include("auxiliary/t8code.jl") diff --git a/src/auxiliary/containers.jl b/src/auxiliary/containers.jl index 90650f6abcf..aeeddcbe185 100644 --- a/src/auxiliary/containers.jl +++ b/src/auxiliary/containers.jl @@ -314,4 +314,135 @@ end function raw_copy!(c::AbstractContainer, from::Int, destination::Int) raw_copy!(c, c, from, from, destination) end + +# Trixi storage types must implement these two Adapt.jl methods +function Adapt.adapt_structure(to, c::AbstractContainer) + 
error("Interface: Must implement Adapt.adapt_structure(to, ::$(typeof(c)))") +end + +function Adapt.parent_type(C::Type{<:AbstractContainer}) + error("Interface: Must implement Adapt.parent_type(::Type{$C}") +end + +function Adapt.unwrap_type(C::Type{<:AbstractContainer}) + return Adapt.unwrap_type(Adapt.parent_type(C)) +end + +# TODO: Upstream to Adapt +""" + storage_type(x) + +Return the storage type of `x`, which is a concrete array type, such as `Array`, `CuArray`, or `ROCArray`. +""" +function storage_type(x) + return storage_type(typeof(x)) +end + +function storage_type(T::Type) + error("Interface: Must implement storage_type(::Type{$T}") +end + +function storage_type(::Type{<:Array}) + Array +end + +function storage_type(C::Type{<:AbstractContainer}) + return storage_type(Adapt.unwrap_type(C)) +end + +# backend handling +""" + trixi_backend(x) + +Return the computational backend for `x`, which is either a KernelAbstractions backend or `nothing`. +If the backend is `nothing`, the default multi-threaded CPU backend is used. +""" +function trixi_backend(x) + # TODO: https://github.com/trixi-framework/Trixi.jl/pull/2417 + if (_PREFERENCE_POLYESTER && LoopVectorization.check_args(x)) || + (_PREFERENCE_USE_NATIVE_THREADING && get_backend(x) isa KernelAbstractions.CPU) + return nothing + end + return get_backend(x) +end + +# TODO: After https://github.com/SciML/RecursiveArrayTools.jl/pull/455 we need to investigate the right way to handle StaticArray as uEltype for MultiDG. +function trixi_backend(x::VectorOfArray) + u = parent(x) + # FIXME(vchuravy): This is a workaround because KA.get_backend is ambivalent of where a SArray is residing. + if eltype(u) <: StaticArrays.StaticArray + return nothing + end + if length(u) == 0 + error("VectorOfArray is empty, cannot determine backend.") + end + # Use the backend of the first element in the parent array + return get_backend(u[1]) +end + +# For some storage backends like CUDA.jl, empty arrays do seem to simply be +# null pointers which can cause `unsafe_wrap` to fail when calling +# Adapt.adapt (ArgumentError, see +# https://github.com/JuliaGPU/CUDA.jl/blob/v5.4.2/src/array.jl#L212-L229). +# To circumvent this, on length zero arrays this allocates +# a separate empty array instead of wrapping. +# However, since zero length arrays are not used in calculations, +# it should be okay if the underlying storage vectors and wrapped arrays +# are not the same as long as they are properly wrapped when `resize!`d etc. +function unsafe_wrap_or_alloc(to, vector, size) + if length(vector) == 0 + return similar(vector, size) + else + return unsafe_wrap(to, pointer(vector), size) + end +end + +struct TrixiAdaptor{Storage, RealT} end + +""" + trixi_adapt(Storage, RealT, x) + +Adapt `x` to the storage type `Storage` and real type `RealT`. +""" +function trixi_adapt(Storage, RealT, x) + adapt(TrixiAdaptor{Storage, RealT}(), x) +end + +# Custom rules +# 1. handling of StaticArrays +function Adapt.adapt_storage(::TrixiAdaptor{<:Any, RealT}, + x::StaticArrays.StaticArray) where {RealT} + StaticArrays.similar_type(x, RealT)(x) +end + +# 2. 
Handling of Arrays +function Adapt.adapt_storage(::TrixiAdaptor{Storage, RealT}, + x::AbstractArray{T}) where {Storage, RealT, + T <: AbstractFloat} + adapt(Storage{RealT}, x) +end + +function Adapt.adapt_storage(::TrixiAdaptor{Storage, RealT}, + x::AbstractArray{T}) where {Storage, RealT, + T <: StaticArrays.StaticArray} + adapt(Storage{StaticArrays.similar_type(T, RealT)}, x) +end + +# Our threaded cache contains MArray, it is unlikely that we would want to adapt those +function Adapt.adapt_storage(::TrixiAdaptor{Storage, RealT}, + x::Array{T}) where {Storage, RealT, + T <: StaticArrays.MArray} + adapt(Array{StaticArrays.similar_type(T, RealT)}, x) +end + +function Adapt.adapt_storage(::TrixiAdaptor{Storage, RealT}, + x::AbstractArray) where {Storage, RealT} + adapt(Storage, x) +end + +# 3. TODO: Should we have a fallback? But that would imply implementing things for NamedTuple again + +function unsafe_wrap_or_alloc(::TrixiAdaptor{Storage}, vec, size) where {Storage} + return unsafe_wrap_or_alloc(Storage, vec, size) +end end # @muladd diff --git a/src/auxiliary/vector_of_arrays.jl b/src/auxiliary/vector_of_arrays.jl new file mode 100644 index 00000000000..0fa8dd7f1ec --- /dev/null +++ b/src/auxiliary/vector_of_arrays.jl @@ -0,0 +1,31 @@ +# By default, Julia/LLVM does not use fused multiply-add operations (FMAs). +# Since these FMAs can increase the performance of many numerical algorithms, +# we need to opt-in explicitly. +# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details. +@muladd begin +#! format: noindent + +# Wraps a Vector of Arrays, forwards `getindex` to the underlying Vector. +# Implements `Adapt.adapt_structure` to allow offloading to the GPU which is +# not possible for a plain Vector of Arrays. +struct VecOfArrays{T <: AbstractArray} + arrays::Vector{T} +end +Base.getindex(v::VecOfArrays, i::Int) = Base.getindex(v.arrays, i) +Base.IndexStyle(v::VecOfArrays) = Base.IndexStyle(v.arrays) +Base.size(v::VecOfArrays) = Base.size(v.arrays) +Base.length(v::VecOfArrays) = Base.length(v.arrays) +Base.eltype(v::VecOfArrays{T}) where {T} = T +function Adapt.adapt_structure(to, v::VecOfArrays) + return VecOfArrays([Adapt.adapt(to, arr) for arr in v.arrays]) +end +function Adapt.parent_type(::Type{<:VecOfArrays{T}}) where {T} + return T +end +function Adapt.unwrap_type(A::Type{<:VecOfArrays}) + Adapt.unwrap_type(Adapt.parent_type(A)) +end +function Base.convert(::Type{<:VecOfArrays}, v::Vector{<:AbstractArray}) + VecOfArrays(v) +end +end # @muladd diff --git a/src/semidiscretization/semidiscretization.jl b/src/semidiscretization/semidiscretization.jl index cc3900d42da..3f8823ea17c 100644 --- a/src/semidiscretization/semidiscretization.jl +++ b/src/semidiscretization/semidiscretization.jl @@ -82,9 +82,15 @@ end Wrap the semidiscretization `semi` as an ODE problem in the time interval `tspan` that can be passed to `solve` from the [SciML ecosystem](https://diffeq.sciml.ai/latest/). + +The optional keyword arguments `storage_type` and `real_type` configure the underlying computational +datastructures. `storage_type` changes the fundamental array type being used, allowing the +experimental use of `CuArray` or other GPU array types. `real_type` changes the computational data type being used. """ function semidiscretize(semi::AbstractSemidiscretization, tspan; - reset_threads = true) + reset_threads = true, + storage_type = nothing, + real_type = nothing) # Optionally reset Polyester.jl threads. 
See # https://github.com/trixi-framework/Trixi.jl/issues/1583 # https://github.com/JuliaSIMD/Polyester.jl/issues/30 @@ -92,6 +98,19 @@ function semidiscretize(semi::AbstractSemidiscretization, tspan; Polyester.reset_threads!() end + if !(storage_type === nothing && real_type === nothing) + if storage_type === nothing + storage_type = Array + end + if real_type === nothing + real_type = real(semi) + end + semi = trixi_adapt(storage_type, real_type, semi) + if eltype(tspan) !== real_type + tspan = convert.(real_type, tspan) + end + end + u0_ode = compute_coefficients(first(tspan), semi) # TODO: MPI, do we want to synchronize loading and print debug statements, e.g. using # mpi_isparallel() && MPI.Barrier(mpi_comm()) @@ -155,9 +174,10 @@ end Same as [`compute_coefficients`](@ref) but stores the result in `u_ode`. """ function compute_coefficients!(u_ode, func, t, semi::AbstractSemidiscretization) + backend = trixi_backend(u_ode) u = wrap_array(u_ode, semi) # Call `compute_coefficients` defined by the solver - compute_coefficients!(u, func, t, mesh_equations_solver_cache(semi)...) + compute_coefficients!(backend, u, func, t, mesh_equations_solver_cache(semi)...) end """ diff --git a/src/semidiscretization/semidiscretization_hyperbolic.jl b/src/semidiscretization/semidiscretization_hyperbolic.jl index 7496a345661..2a563c02229 100644 --- a/src/semidiscretization/semidiscretization_hyperbolic.jl +++ b/src/semidiscretization/semidiscretization_hyperbolic.jl @@ -27,25 +27,6 @@ mutable struct SemidiscretizationHyperbolic{Mesh, Equations, InitialCondition, solver::Solver cache::Cache performance_counter::PerformanceCounter - - function SemidiscretizationHyperbolic{Mesh, Equations, InitialCondition, - BoundaryConditions, SourceTerms, Solver, - Cache}(mesh::Mesh, equations::Equations, - initial_condition::InitialCondition, - boundary_conditions::BoundaryConditions, - source_terms::SourceTerms, - solver::Solver, - cache::Cache) where {Mesh, Equations, - InitialCondition, - BoundaryConditions, - SourceTerms, - Solver, - Cache} - performance_counter = PerformanceCounter() - - new(mesh, equations, initial_condition, boundary_conditions, source_terms, - solver, cache, performance_counter) - end end """ @@ -71,6 +52,8 @@ function SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver check_periodicity_mesh_boundary_conditions(mesh, _boundary_conditions) + performance_counter = PerformanceCounter() + SemidiscretizationHyperbolic{typeof(mesh), typeof(equations), typeof(initial_condition), typeof(_boundary_conditions), typeof(source_terms), @@ -78,9 +61,13 @@ function SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver initial_condition, _boundary_conditions, source_terms, solver, - cache) + cache, + performance_counter) end +# @eval due to @muladd +@eval Adapt.@adapt_structure(SemidiscretizationHyperbolic) + # Create a new semidiscretization but change some parameters compared to the input. # `Base.similar` follows a related concept but would require us to `copy` the `mesh`, # which would impact the performance. 
Instead, `SciMLBase.remake` has exactly the diff --git a/src/solvers/dg.jl b/src/solvers/dg.jl index ad211b3c003..3ed10ec2d46 100644 --- a/src/solvers/dg.jl +++ b/src/solvers/dg.jl @@ -415,6 +415,9 @@ struct DG{Basis, Mortar, SurfaceIntegral, VolumeIntegral} volume_integral::VolumeIntegral end +# @eval due to @muladd +@eval Adapt.@adapt_structure(DG) + function Base.show(io::IO, dg::DG) @nospecialize dg # reduce precompilation time @@ -639,8 +642,11 @@ include("fdsbp_unstructured/fdsbp.jl") function allocate_coefficients(mesh::AbstractMesh, equations, dg::DG, cache) # We must allocate a `Vector` in order to be able to `resize!` it (AMR). # cf. wrap_array - zeros(eltype(cache.elements), - nvariables(equations) * nnodes(dg)^ndims(mesh) * nelements(dg, cache)) + u_ode = similar(cache.elements.node_coordinates, eltype(cache.elements), + nvariables(equations) * nnodes(dg)^ndims(mesh) * + nelements(dg, cache)) + fill!(u_ode, zero(eltype(u_ode))) + return u_ode end @inline function wrap_array(u_ode::AbstractVector, mesh::AbstractMesh, equations, @@ -683,7 +689,8 @@ end # (nvariables(equations), ntuple(_ -> nnodes(dg), ndims(mesh))..., nelements(dg, cache))) else # The following version is reasonably fast and allows us to `resize!(u_ode, ...)`. - unsafe_wrap(Array{eltype(u_ode), ndims(mesh) + 2}, pointer(u_ode), + ArrayType = Trixi.storage_type(u_ode) + unsafe_wrap(ArrayType{eltype(u_ode), ndims(mesh) + 2}, pointer(u_ode), (nvariables(equations), ntuple(_ -> nnodes(dg), ndims(mesh))..., nelements(dg, cache))) end @@ -732,8 +739,8 @@ end nelements(dg, cache))) end -function compute_coefficients!(u, func, t, mesh::AbstractMesh{1}, equations, dg::DG, - cache) +function compute_coefficients!(backend::Nothing, u, func, t, mesh::AbstractMesh{1}, + equations, dg::DG, cache) @threaded for element in eachelement(dg, cache) for i in eachnode(dg) x_node = get_node_coords(cache.elements.node_coordinates, equations, dg, i, @@ -753,20 +760,43 @@ function compute_coefficients!(u, func, t, mesh::AbstractMesh{1}, equations, dg: end end -function compute_coefficients!(u, func, t, mesh::AbstractMesh{2}, equations, dg::DG, - cache) +function compute_coefficients!(backend::Nothing, u, func, t, mesh::AbstractMesh{2}, + equations, dg::DG, cache) + @unpack node_coordinates = cache.elements @threaded for element in eachelement(dg, cache) - for j in eachnode(dg), i in eachnode(dg) - x_node = get_node_coords(cache.elements.node_coordinates, equations, dg, i, - j, element) - u_node = func(x_node, t, equations) - set_node_vars!(u, u_node, equations, dg, i, j, element) - end + compute_coefficients_element!(u, func, t, equations, dg, node_coordinates, + element) + end +end + +function compute_coefficients!(backend::Backend, u, func, t, mesh::AbstractMesh{2}, + equations, dg::DG, cache) + nelements(dg, cache) == 0 && return nothing + @unpack node_coordinates = cache.elements + kernel! 
= compute_coefficients_kernel!(backend) + kernel!(u, func, t, equations, dg, node_coordinates, + ndrange = nelements(dg, cache)) + return nothing +end + +@kernel function compute_coefficients_kernel!(u, func, t, equations, + dg::DG, node_coordinates) + element = @index(Global) + compute_coefficients_element!(u, func, t, equations, dg, node_coordinates, element) +end + +function compute_coefficients_element!(u, func, t, equations, dg::DG, + node_coordinates, element) + for j in eachnode(dg), i in eachnode(dg) + x_node = get_node_coords(node_coordinates, equations, dg, i, + j, element) + u_node = func(x_node, t, equations) + set_node_vars!(u, u_node, equations, dg, i, j, element) end end -function compute_coefficients!(u, func, t, mesh::AbstractMesh{3}, equations, dg::DG, - cache) +function compute_coefficients!(backend::Nothing, u, func, t, mesh::AbstractMesh{3}, + equations, dg::DG, cache) @threaded for element in eachelement(dg, cache) for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) x_node = get_node_coords(cache.elements.node_coordinates, equations, dg, i, diff --git a/src/solvers/dgmulti/dg.jl b/src/solvers/dgmulti/dg.jl index b2a547ac64d..f5a995cba3f 100644 --- a/src/solvers/dgmulti/dg.jl +++ b/src/solvers/dgmulti/dg.jl @@ -209,7 +209,7 @@ function create_cache(mesh::DGMultiMesh{NDIMS}, equations, dg::DGMultiWeakForm, local_values_threaded, flux_threaded, rotated_flux_threaded) end -function compute_coefficients!(u, initial_condition, t, +function compute_coefficients!(::Nothing, u, initial_condition, t, mesh::DGMultiMesh, equations, dg::DGMulti, cache) md = mesh.md rd = dg.basis diff --git a/src/solvers/dgsem/basis_lobatto_legendre.jl b/src/solvers/dgsem/basis_lobatto_legendre.jl index 777348aa8ce..9647f172e20 100644 --- a/src/solvers/dgsem/basis_lobatto_legendre.jl +++ b/src/solvers/dgsem/basis_lobatto_legendre.jl @@ -34,6 +34,32 @@ struct LobattoLegendreBasis{RealT <: Real, NNODES, # negative adjoint wrt the SBP dot product end +function Adapt.adapt_structure(to, basis::LobattoLegendreBasis) + inverse_vandermonde_legendre = adapt(to, basis.inverse_vandermonde_legendre) + RealT = eltype(inverse_vandermonde_legendre) + + nodes = SVector{<:Any, RealT}(basis.nodes) + weights = SVector{<:Any, RealT}(basis.weights) + inverse_weights = SVector{<:Any, RealT}(basis.inverse_weights) + boundary_interpolation = adapt(to, basis.boundary_interpolation) + derivative_matrix = adapt(to, basis.derivative_matrix) + derivative_split = adapt(to, basis.derivative_split) + derivative_split_transpose = adapt(to, basis.derivative_split_transpose) + derivative_dhat = adapt(to, basis.derivative_dhat) + return LobattoLegendreBasis{RealT, nnodes(basis), typeof(nodes), + typeof(inverse_vandermonde_legendre), + typeof(boundary_interpolation), + typeof(derivative_matrix)}(nodes, + weights, + inverse_weights, + inverse_vandermonde_legendre, + boundary_interpolation, + derivative_matrix, + derivative_split, + derivative_split_transpose, + derivative_dhat) +end + function LobattoLegendreBasis(RealT, polydeg::Integer) nnodes_ = polydeg + 1 @@ -155,6 +181,17 @@ struct LobattoLegendreMortarL2{RealT <: Real, NNODES, reverse_lower::ReverseMatrix end +function Adapt.adapt_structure(to, mortar::LobattoLegendreMortarL2) + forward_upper = adapt(to, mortar.forward_upper) + forward_lower = adapt(to, mortar.forward_lower) + reverse_upper = adapt(to, mortar.reverse_upper) + reverse_lower = adapt(to, mortar.reverse_lower) + return LobattoLegendreMortarL2{eltype(forward_upper), nnodes(mortar), + typeof(forward_upper), + 
typeof(reverse_upper)}(forward_upper, forward_lower, + reverse_upper, reverse_lower) +end + function MortarL2(basis::LobattoLegendreBasis) RealT = real(basis) nnodes_ = nnodes(basis) diff --git a/src/solvers/dgsem_p4est/containers.jl b/src/solvers/dgsem_p4est/containers.jl index a070db6b701..59942bc49a7 100644 --- a/src/solvers/dgsem_p4est/containers.jl +++ b/src/solvers/dgsem_p4est/containers.jl @@ -6,25 +6,31 @@ #! format: noindent mutable struct P4estElementContainer{NDIMS, RealT <: Real, uEltype <: Real, NDIMSP1, - NDIMSP2, NDIMSP3} <: AbstractContainer + NDIMSP2, NDIMSP3, + ArrayNDIMSP1 <: DenseArray{RealT, NDIMSP1}, + ArrayNDIMSP2 <: DenseArray{RealT, NDIMSP2}, + ArrayNDIMSP3 <: DenseArray{RealT, NDIMSP3}, + VectorRealT <: DenseVector{RealT}, + VectoruEltype <: DenseVector{uEltype}} <: + AbstractContainer # Physical coordinates at each node - node_coordinates::Array{RealT, NDIMSP2} # [orientation, node_i, node_j, node_k, element] + node_coordinates::ArrayNDIMSP2 # [orientation, node_i, node_j, node_k, element] # Jacobian matrix of the transformation # [jacobian_i, jacobian_j, node_i, node_j, node_k, element] where jacobian_i is the first index of the Jacobian matrix,... - jacobian_matrix::Array{RealT, NDIMSP3} + jacobian_matrix::ArrayNDIMSP3 # Contravariant vectors, scaled by J, in Kopriva's blue book called Ja^i_n (i index, n dimension) - contravariant_vectors::Array{RealT, NDIMSP3} # [dimension, index, node_i, node_j, node_k, element] + contravariant_vectors::ArrayNDIMSP3 # [dimension, index, node_i, node_j, node_k, element] # 1/J where J is the Jacobian determinant (determinant of Jacobian matrix) - inverse_jacobian::Array{RealT, NDIMSP1} # [node_i, node_j, node_k, element] + inverse_jacobian::ArrayNDIMSP1 # [node_i, node_j, node_k, element] # Buffer for calculated surface flux - surface_flux_values::Array{uEltype, NDIMSP2} # [variable, i, j, direction, element] + surface_flux_values::ArrayNDIMSP2 # [variable, i, j, direction, element] # internal `resize!`able storage - _node_coordinates::Vector{RealT} - _jacobian_matrix::Vector{RealT} - _contravariant_vectors::Vector{RealT} - _inverse_jacobian::Vector{RealT} - _surface_flux_values::Vector{uEltype} + _node_coordinates::VectorRealT + _jacobian_matrix::VectorRealT + _contravariant_vectors::VectorRealT + _inverse_jacobian::VectorRealT + _surface_flux_values::VectoruEltype end @inline function nelements(elements::P4estElementContainer) @@ -36,7 +42,7 @@ end RealT, uEltype } - uEltype + return uEltype end # Only one-dimensional `Array`s are `resize!`able in Julia. 
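For reviewers of the container changes that follow: Trixi.jl containers keep flat, `resize!`-able 1-D vectors (`_u`, `_node_coordinates`, ...) and expose multi-dimensional fields that merely wrap the same memory, since only 1-D `Array`s can be `resize!`d. The sketch below (CPU-only, illustrative names, not part of the patch) shows that pattern; the `unsafe_wrap_or_alloc` helper introduced in `src/auxiliary/containers.jl` generalizes the `unsafe_wrap` call so that zero-length GPU vectors fall back to a fresh allocation instead of wrapping a null pointer.

```julia
# Minimal sketch of the flat-storage + wrapped-view pattern used by the containers.
_u = zeros(Float64, 3 * 4)                    # flat, resize!-able backing storage
u  = unsafe_wrap(Array, pointer(_u), (3, 4))  # multi-dimensional view of the same memory

resize!(_u, 3 * 6)                            # e.g., AMR created additional elements
u = unsafe_wrap(Array, pointer(_u), (3, 6))   # re-wrap: resize! may have moved the buffer
```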
@@ -51,31 +57,41 @@ function Base.resize!(elements::P4estElementContainer, capacity) n_dims = ndims(elements) n_nodes = size(elements.node_coordinates, 2) n_variables = size(elements.surface_flux_values, 1) + ArrayType = storage_type(elements) resize!(_node_coordinates, n_dims * n_nodes^n_dims * capacity) - elements.node_coordinates = unsafe_wrap(Array, pointer(_node_coordinates), - (n_dims, ntuple(_ -> n_nodes, n_dims)..., - capacity)) + elements.node_coordinates = unsafe_wrap_or_alloc(ArrayType, + _node_coordinates, + (n_dims, + ntuple(_ -> n_nodes, n_dims)..., + capacity)) resize!(_jacobian_matrix, n_dims^2 * n_nodes^n_dims * capacity) - elements.jacobian_matrix = unsafe_wrap(Array, pointer(_jacobian_matrix), - (n_dims, n_dims, - ntuple(_ -> n_nodes, n_dims)..., capacity)) + elements.jacobian_matrix = unsafe_wrap_or_alloc(ArrayType, + _jacobian_matrix, + (n_dims, n_dims, + ntuple(_ -> n_nodes, n_dims)..., + capacity)) resize!(_contravariant_vectors, length(_jacobian_matrix)) - elements.contravariant_vectors = unsafe_wrap(Array, pointer(_contravariant_vectors), - size(elements.jacobian_matrix)) + elements.contravariant_vectors = unsafe_wrap_or_alloc(ArrayType, + _contravariant_vectors, + size(elements.jacobian_matrix)) resize!(_inverse_jacobian, n_nodes^n_dims * capacity) - elements.inverse_jacobian = unsafe_wrap(Array, pointer(_inverse_jacobian), - (ntuple(_ -> n_nodes, n_dims)..., capacity)) + elements.inverse_jacobian = unsafe_wrap_or_alloc(ArrayType, + _inverse_jacobian, + (ntuple(_ -> n_nodes, n_dims)..., + capacity)) resize!(_surface_flux_values, n_variables * n_nodes^(n_dims - 1) * (n_dims * 2) * capacity) - elements.surface_flux_values = unsafe_wrap(Array, pointer(_surface_flux_values), - (n_variables, - ntuple(_ -> n_nodes, n_dims - 1)..., - n_dims * 2, capacity)) + elements.surface_flux_values = unsafe_wrap_or_alloc(ArrayType, + _surface_flux_values, + (n_variables, + ntuple(_ -> n_nodes, + n_dims - 1)..., + n_dims * 2, capacity)) return nothing end @@ -117,33 +133,104 @@ function init_elements(mesh::Union{P4estMesh{NDIMS, NDIMS, RealT}, NDIMS * 2, nelements)) elements = P4estElementContainer{NDIMS, RealT, uEltype, NDIMS + 1, NDIMS + 2, - NDIMS + 3}(node_coordinates, jacobian_matrix, - contravariant_vectors, - inverse_jacobian, surface_flux_values, - _node_coordinates, _jacobian_matrix, - _contravariant_vectors, - _inverse_jacobian, _surface_flux_values) + NDIMS + 3, Array{RealT, NDIMS + 1}, + Array{RealT, NDIMS + 2}, Array{RealT, NDIMS + 3}, + Vector{RealT}, Vector{uEltype}}(node_coordinates, + jacobian_matrix, + contravariant_vectors, + inverse_jacobian, + surface_flux_values, + _node_coordinates, + _jacobian_matrix, + _contravariant_vectors, + _inverse_jacobian, + _surface_flux_values) init_elements!(elements, mesh, basis) return elements end -mutable struct P4estInterfaceContainer{NDIMS, uEltype <: Real, NDIMSP2} <: +function Adapt.parent_type(::Type{<:P4estElementContainer{<:Any, <:Any, <:Any, <:Any, + <:Any, <:Any, ArrayT}}) where {ArrayT} + ArrayT +end + +# Manual adapt_structure since we have aliasing memory +function Adapt.adapt_structure(to, + elements::P4estElementContainer{NDIMS}) where {NDIMS} + # Adapt underlying storage + _node_coordinates = adapt(to, elements._node_coordinates) + _jacobian_matrix = adapt(to, elements._jacobian_matrix) + _contravariant_vectors = adapt(to, elements._contravariant_vectors) + _inverse_jacobian = adapt(to, elements._inverse_jacobian) + _surface_flux_values = adapt(to, elements._surface_flux_values) + + RealT = 
eltype(_inverse_jacobian) + uEltype = eltype(_surface_flux_values) + + # Wrap arrays again + node_coordinates = unsafe_wrap_or_alloc(to, _node_coordinates, + size(elements.node_coordinates)) + jacobian_matrix = unsafe_wrap_or_alloc(to, _jacobian_matrix, + size(elements.jacobian_matrix)) + contravariant_vectors = unsafe_wrap_or_alloc(to, _contravariant_vectors, + size(jacobian_matrix)) + inverse_jacobian = unsafe_wrap_or_alloc(to, _inverse_jacobian, + size(elements.inverse_jacobian)) + surface_flux_values = unsafe_wrap_or_alloc(to, _surface_flux_values, + size(elements.surface_flux_values)) + + new_type_params = (NDIMS, + RealT, + uEltype, + NDIMS + 1, + NDIMS + 2, + NDIMS + 3, + typeof(inverse_jacobian), # ArrayNDIMSP1 + typeof(node_coordinates), # ArrayNDIMSP2 + typeof(jacobian_matrix), # ArrayNDIMSP3 + typeof(_node_coordinates), # VectorRealT + typeof(_surface_flux_values)) # VectoruEltype + return P4estElementContainer{new_type_params...}(node_coordinates, + jacobian_matrix, + contravariant_vectors, + inverse_jacobian, + surface_flux_values, + _node_coordinates, + _jacobian_matrix, + _contravariant_vectors, + _inverse_jacobian, + _surface_flux_values) +end + +mutable struct P4estInterfaceContainer{NDIMS, uEltype <: Real, NDIMSP2, + uArray <: DenseArray{uEltype, NDIMSP2}, + IdsMatrix <: DenseMatrix{Int}, + IndicesMatrix <: + DenseMatrix{NTuple{NDIMS, Symbol}}, + uVector <: DenseVector{uEltype}, + IdsVector <: DenseVector{Int}, + IndicesVector <: + DenseVector{NTuple{NDIMS, Symbol}}} <: AbstractContainer - u::Array{uEltype, NDIMSP2} # [primary/secondary, variable, i, j, interface] - neighbor_ids::Matrix{Int} # [primary/secondary, interface] - node_indices::Matrix{NTuple{NDIMS, Symbol}} # [primary/secondary, interface] + u::uArray # [primary/secondary, variable, i, j, interface] + neighbor_ids::IdsMatrix # [primary/secondary, interface] + node_indices::IndicesMatrix # [primary/secondary, interface] # internal `resize!`able storage - _u::Vector{uEltype} - _neighbor_ids::Vector{Int} - _node_indices::Vector{NTuple{NDIMS, Symbol}} + _u::uVector + _neighbor_ids::IdsVector + _node_indices::IndicesVector end @inline function ninterfaces(interfaces::P4estInterfaceContainer) size(interfaces.neighbor_ids, 2) end @inline Base.ndims(::P4estInterfaceContainer{NDIMS}) where {NDIMS} = NDIMS +@inline function Base.eltype(::P4estInterfaceContainer{NDIMS, uEltype}) where {NDIMS, + uEltype} + return uEltype +end # See explanation of Base.resize! 
for the element container function Base.resize!(interfaces::P4estInterfaceContainer, capacity) @@ -152,17 +239,20 @@ function Base.resize!(interfaces::P4estInterfaceContainer, capacity) n_dims = ndims(interfaces) n_nodes = size(interfaces.u, 3) n_variables = size(interfaces.u, 2) + ArrayType = storage_type(interfaces) resize!(_u, 2 * n_variables * n_nodes^(n_dims - 1) * capacity) - interfaces.u = unsafe_wrap(Array, pointer(_u), + interfaces.u = unsafe_wrap(ArrayType, pointer(_u), (2, n_variables, ntuple(_ -> n_nodes, n_dims - 1)..., capacity)) resize!(_neighbor_ids, 2 * capacity) - interfaces.neighbor_ids = unsafe_wrap(Array, pointer(_neighbor_ids), (2, capacity)) + interfaces.neighbor_ids = unsafe_wrap(ArrayType, pointer(_neighbor_ids), + (2, capacity)) resize!(_node_indices, 2 * capacity) - interfaces.node_indices = unsafe_wrap(Array, pointer(_node_indices), (2, capacity)) + interfaces.node_indices = unsafe_wrap(ArrayType, pointer(_node_indices), + (2, capacity)) return nothing end @@ -189,10 +279,15 @@ function init_interfaces(mesh::Union{P4estMesh, P4estMeshView, T8codeMesh}, equa _node_indices = Vector{NTuple{NDIMS, Symbol}}(undef, 2 * n_interfaces) node_indices = unsafe_wrap(Array, pointer(_node_indices), (2, n_interfaces)) - interfaces = P4estInterfaceContainer{NDIMS, uEltype, NDIMS + 2}(u, neighbor_ids, - node_indices, - _u, _neighbor_ids, - _node_indices) + interfaces = P4estInterfaceContainer{NDIMS, uEltype, NDIMS + 2, + typeof(u), typeof(neighbor_ids), + typeof(node_indices), typeof(_u), + typeof(_neighbor_ids), typeof(_node_indices)}(u, + neighbor_ids, + node_indices, + _u, + _neighbor_ids, + _node_indices) init_interfaces!(interfaces, mesh) @@ -205,21 +300,58 @@ function init_interfaces!(interfaces, mesh::Union{P4estMesh, P4estMeshView}) return interfaces end -mutable struct P4estBoundaryContainer{NDIMS, uEltype <: Real, NDIMSP1} <: +function Adapt.parent_type(::Type{<:P4estInterfaceContainer{<:Any, <:Any, <:Any, + ArrayT}}) where {ArrayT} + ArrayT +end + +# Manual adapt_structure since we have aliasing memory +function Adapt.adapt_structure(to, interfaces::P4estInterfaceContainer) + # Adapt underlying storage + _u = adapt(to, interfaces._u) + _neighbor_ids = adapt(to, interfaces._neighbor_ids) + _node_indices = adapt(to, interfaces._node_indices) + # Wrap arrays again + u = unsafe_wrap_or_alloc(to, _u, size(interfaces.u)) + neighbor_ids = unsafe_wrap_or_alloc(to, _neighbor_ids, + size(interfaces.neighbor_ids)) + node_indices = unsafe_wrap_or_alloc(to, _node_indices, + size(interfaces.node_indices)) + + NDIMS = ndims(interfaces) + new_type_params = (NDIMS, + eltype(_u), + NDIMS + 2, + typeof(u), typeof(neighbor_ids), typeof(node_indices), + typeof(_u), typeof(_neighbor_ids), typeof(_node_indices)) + return P4estInterfaceContainer{new_type_params...}(u, neighbor_ids, node_indices, + _u, _neighbor_ids, _node_indices) +end + +mutable struct P4estBoundaryContainer{NDIMS, uEltype <: Real, NDIMSP1, + uArray <: DenseArray{uEltype, NDIMSP1}, + IdsVector <: DenseVector{Int}, + IndicesVector <: + DenseVector{NTuple{NDIMS, Symbol}}, + uVector <: DenseVector{uEltype}} <: AbstractContainer - u::Array{uEltype, NDIMSP1} # [variables, i, j, boundary] - neighbor_ids::Vector{Int} # [boundary] - node_indices::Vector{NTuple{NDIMS, Symbol}} # [boundary] - name::Vector{Symbol} # [boundary] + u::uArray # [variables, i, j, boundary] + neighbor_ids::IdsVector # [boundary] + node_indices::IndicesVector # [boundary] + name::Vector{Symbol} # [boundary] # internal `resize!`able storage - 
_u::Vector{uEltype} + _u::uVector end @inline function nboundaries(boundaries::P4estBoundaryContainer) length(boundaries.neighbor_ids) end @inline Base.ndims(::P4estBoundaryContainer{NDIMS}) where {NDIMS} = NDIMS +@inline function Base.eltype(::P4estBoundaryContainer{NDIMS, uEltype}) where {NDIMS, + uEltype} + uEltype +end # See explanation of Base.resize! for the element container function Base.resize!(boundaries::P4estBoundaryContainer, capacity) @@ -228,9 +360,10 @@ function Base.resize!(boundaries::P4estBoundaryContainer, capacity) n_dims = ndims(boundaries) n_nodes = size(boundaries.u, 2) n_variables = size(boundaries.u, 1) + ArrayType = storage_type(boundaries) resize!(_u, n_variables * n_nodes^(n_dims - 1) * capacity) - boundaries.u = unsafe_wrap(Array, pointer(_u), + boundaries.u = unsafe_wrap(ArrayType, pointer(_u), (n_variables, ntuple(_ -> n_nodes, n_dims - 1)..., capacity)) @@ -263,9 +396,11 @@ function init_boundaries(mesh::Union{P4estMesh, P4estMeshView, T8codeMesh}, equa node_indices = Vector{NTuple{NDIMS, Symbol}}(undef, n_boundaries) names = Vector{Symbol}(undef, n_boundaries) - boundaries = P4estBoundaryContainer{NDIMS, uEltype, NDIMS + 1}(u, neighbor_ids, - node_indices, names, - _u) + boundaries = P4estBoundaryContainer{NDIMS, uEltype, NDIMS + 1, typeof(u), + typeof(neighbor_ids), typeof(node_indices), + typeof(_u)}(u, neighbor_ids, + node_indices, names, + _u) if n_boundaries > 0 init_boundaries!(boundaries, mesh) @@ -312,6 +447,25 @@ function init_boundaries_iter_face_inner(info_pw, boundaries, boundary_id, mesh) return nothing end +function Adapt.parent_type(::Type{<:P4estBoundaryContainer{<:Any, <:Any, <:Any, ArrayT}}) where {ArrayT} + ArrayT +end + +# Manual adapt_structure since we have aliasing memory +function Adapt.adapt_structure(to, boundaries::P4estBoundaryContainer) + _u = adapt(to, boundaries._u) + u = unsafe_wrap_or_alloc(to, _u, size(boundaries.u)) + neighbor_ids = adapt(to, boundaries.neighbor_ids) + node_indices = adapt(to, boundaries.node_indices) + name = boundaries.name + + NDIMS = ndims(boundaries) + return P4estBoundaryContainer{NDIMS, eltype(_u), NDIMS + 1, typeof(u), + typeof(neighbor_ids), typeof(node_indices), + typeof(_u)}(u, neighbor_ids, node_indices, + name, _u) +end + # Container data structure (structure-of-arrays style) for DG L2 mortars # # The positions used in `neighbor_ids` are 1:3 (in 2D) or 1:5 (in 3D), where 1:2 (in 2D) @@ -337,20 +491,32 @@ end # │ └─────────────┴─────────────┘ └───────────────────────────┘ # │ # ⋅────> ξ -mutable struct P4estMortarContainer{NDIMS, uEltype <: Real, NDIMSP1, NDIMSP3} <: +mutable struct P4estMortarContainer{NDIMS, uEltype <: Real, NDIMSP1, NDIMSP3, + uArray <: DenseArray{uEltype, NDIMSP3}, + IdsMatrix <: DenseMatrix{Int}, + IndicesMatrix <: + DenseMatrix{NTuple{NDIMS, Symbol}}, + uVector <: DenseVector{uEltype}, + IdsVector <: DenseVector{Int}, + IndicesVector <: + DenseVector{NTuple{NDIMS, Symbol}}} <: AbstractContainer - u::Array{uEltype, NDIMSP3} # [small/large side, variable, position, i, j, mortar] - neighbor_ids::Matrix{Int} # [position, mortar] - node_indices::Matrix{NTuple{NDIMS, Symbol}} # [small/large, mortar] + u::uArray # [small/large side, variable, position, i, j, mortar] + neighbor_ids::IdsMatrix # [position, mortar] + node_indices::IndicesMatrix # [small/large, mortar] # internal `resize!`able storage - _u::Vector{uEltype} - _neighbor_ids::Vector{Int} - _node_indices::Vector{NTuple{NDIMS, Symbol}} + _u::uVector + _neighbor_ids::IdsVector + _node_indices::IndicesVector end @inline 
nmortars(mortars::P4estMortarContainer) = size(mortars.neighbor_ids, 2) @inline Base.ndims(::P4estMortarContainer{NDIMS}) where {NDIMS} = NDIMS +@inline function Base.eltype(::P4estMortarContainer{NDIMS, uEltype}) where {NDIMS, + uEltype} + uEltype +end # See explanation of Base.resize! for the element container function Base.resize!(mortars::P4estMortarContainer, capacity) @@ -359,18 +525,19 @@ function Base.resize!(mortars::P4estMortarContainer, capacity) n_dims = ndims(mortars) n_nodes = size(mortars.u, 4) n_variables = size(mortars.u, 2) + ArrayType = storage_type(mortars) resize!(_u, 2 * n_variables * 2^(n_dims - 1) * n_nodes^(n_dims - 1) * capacity) - mortars.u = unsafe_wrap(Array, pointer(_u), + mortars.u = unsafe_wrap(ArrayType, pointer(_u), (2, n_variables, 2^(n_dims - 1), ntuple(_ -> n_nodes, n_dims - 1)..., capacity)) resize!(_neighbor_ids, (2^(n_dims - 1) + 1) * capacity) - mortars.neighbor_ids = unsafe_wrap(Array, pointer(_neighbor_ids), + mortars.neighbor_ids = unsafe_wrap(ArrayType, pointer(_neighbor_ids), (2^(n_dims - 1) + 1, capacity)) resize!(_node_indices, 2 * capacity) - mortars.node_indices = unsafe_wrap(Array, pointer(_node_indices), (2, capacity)) + mortars.node_indices = unsafe_wrap(ArrayType, pointer(_node_indices), (2, capacity)) return nothing end @@ -398,12 +565,15 @@ function init_mortars(mesh::Union{P4estMesh, P4estMeshView, T8codeMesh}, equatio _node_indices = Vector{NTuple{NDIMS, Symbol}}(undef, 2 * n_mortars) node_indices = unsafe_wrap(Array, pointer(_node_indices), (2, n_mortars)) - mortars = P4estMortarContainer{NDIMS, uEltype, NDIMS + 1, NDIMS + 3}(u, - neighbor_ids, - node_indices, - _u, - _neighbor_ids, - _node_indices) + mortars = P4estMortarContainer{NDIMS, uEltype, NDIMS + 1, NDIMS + 3, typeof(u), + typeof(neighbor_ids), typeof(node_indices), + typeof(_u), typeof(_neighbor_ids), + typeof(_node_indices)}(u, + neighbor_ids, + node_indices, + _u, + _neighbor_ids, + _node_indices) if n_mortars > 0 init_mortars!(mortars, mesh) @@ -418,6 +588,34 @@ function init_mortars!(mortars, mesh::Union{P4estMesh, P4estMeshView}) return mortars end +function Adapt.parent_type(::Type{<:P4estMortarContainer{<:Any, <:Any, <:Any, <:Any, + ArrayT}}) where {ArrayT} + ArrayT +end + +# Manual adapt_structure since we have aliasing memory +function Adapt.adapt_structure(to, mortars::P4estMortarContainer) + # Adapt underlying storage + _u = adapt(to, mortars._u) + _neighbor_ids = adapt(to, mortars._neighbor_ids) + _node_indices = adapt(to, mortars._node_indices) + + # Wrap arrays again + u = unsafe_wrap_or_alloc(to, _u, size(mortars.u)) + neighbor_ids = unsafe_wrap_or_alloc(to, _neighbor_ids, size(mortars.neighbor_ids)) + node_indices = unsafe_wrap_or_alloc(to, _node_indices, size(mortars.node_indices)) + + NDIMS = ndims(mortars) + new_type_params = (NDIMS, + eltype(_u), + NDIMS + 1, + NDIMS + 3, + typeof(u), typeof(neighbor_ids), typeof(node_indices), + typeof(_u), typeof(_neighbor_ids), typeof(_node_indices)) + return P4estMortarContainer{new_type_params...}(u, neighbor_ids, node_indices, + _u, _neighbor_ids, _node_indices) +end + function reinitialize_containers!(mesh::P4estMesh, equations, dg::DGSEM, cache) # Re-initialize elements container @unpack elements = cache diff --git a/src/solvers/dgsem_p4est/containers_parallel.jl b/src/solvers/dgsem_p4est/containers_parallel.jl index 676b37efff3..4b6fc813703 100644 --- a/src/solvers/dgsem_p4est/containers_parallel.jl +++ b/src/solvers/dgsem_p4est/containers_parallel.jl @@ -5,15 +5,19 @@ @muladd begin #! 
format: noindent -mutable struct P4estMPIInterfaceContainer{NDIMS, uEltype <: Real, NDIMSP2} <: +mutable struct P4estMPIInterfaceContainer{NDIMS, uEltype <: Real, NDIMSP2, + uArray <: DenseArray{uEltype, NDIMSP2}, + VecInt <: DenseVector{Int}, + IndicesVector <: + DenseVector{NTuple{NDIMS, Symbol}}, + uVector <: DenseVector{uEltype}} <: AbstractContainer - u::Array{uEltype, NDIMSP2} # [primary/secondary, variable, i, j, interface] - local_neighbor_ids::Vector{Int} # [interface] - node_indices::Vector{NTuple{NDIMS, Symbol}} # [interface] - local_sides::Vector{Int} # [interface] - + u::uArray # [primary/secondary, variable, i, j, interface] + local_neighbor_ids::VecInt # [interface] + node_indices::IndicesVector # [interface] + local_sides::VecInt # [interface] # internal `resize!`able storage - _u::Vector{uEltype} + _u::uVector end @inline function nmpiinterfaces(interfaces::P4estMPIInterfaceContainer) @@ -27,9 +31,10 @@ function Base.resize!(mpi_interfaces::P4estMPIInterfaceContainer, capacity) n_dims = ndims(mpi_interfaces) n_nodes = size(mpi_interfaces.u, 3) n_variables = size(mpi_interfaces.u, 2) + ArrayType = storage_type(mpi_interfaces) resize!(_u, 2 * n_variables * n_nodes^(n_dims - 1) * capacity) - mpi_interfaces.u = unsafe_wrap(Array, pointer(_u), + mpi_interfaces.u = unsafe_wrap(ArrayType, pointer(_u), (2, n_variables, ntuple(_ -> n_nodes, n_dims - 1)..., capacity)) @@ -64,11 +69,13 @@ function init_mpi_interfaces(mesh::Union{ParallelP4estMesh, ParallelT8codeMesh}, local_sides = Vector{Int}(undef, n_mpi_interfaces) - mpi_interfaces = P4estMPIInterfaceContainer{NDIMS, uEltype, NDIMS + 2}(u, - local_neighbor_ids, - node_indices, - local_sides, - _u) + mpi_interfaces = P4estMPIInterfaceContainer{NDIMS, uEltype, NDIMS + 2, + typeof(u), typeof(local_neighbor_ids), + typeof(node_indices), typeof(_u)}(u, + local_neighbor_ids, + node_indices, + local_sides, + _u) init_mpi_interfaces!(mpi_interfaces, mesh) @@ -81,6 +88,32 @@ function init_mpi_interfaces!(mpi_interfaces, mesh::ParallelP4estMesh) return mpi_interfaces end +function Adapt.parent_type(::Type{<:Trixi.P4estMPIInterfaceContainer{<:Any, <:Any, + <:Any, A}}) where {A} + return A +end + +# Manual adapt_structure since we have aliasing memory +function Adapt.adapt_structure(to, mpi_interfaces::P4estMPIInterfaceContainer) + # Adapt Vectors and underlying storage + _u = adapt(to, mpi_interfaces._u) + local_neighbor_ids = adapt(to, mpi_interfaces.local_neighbor_ids) + node_indices = adapt(to, mpi_interfaces.node_indices) + local_sides = adapt(to, mpi_interfaces.local_sides) + + # Wrap array again + u = unsafe_wrap_or_alloc(to, _u, size(mpi_interfaces.u)) + + NDIMS = ndims(mpi_interfaces) + return P4estMPIInterfaceContainer{NDIMS, eltype(u), + NDIMS + 2, + typeof(u), typeof(local_neighbor_ids), + typeof(node_indices), typeof(_u)}(u, + local_neighbor_ids, + node_indices, + local_sides, _u) +end + # Container data structure (structure-of-arrays style) for DG L2 mortars # # Similar to `P4estMortarContainer`. The field `neighbor_ids` has been split up into @@ -88,14 +121,17 @@ end # available elements belonging to a particular MPI mortar. Furthermore, `normal_directions` holds # the normal vectors on the surface of the small elements for each mortar. 
mutable struct P4estMPIMortarContainer{NDIMS, uEltype <: Real, RealT <: Real, NDIMSP1, - NDIMSP2, NDIMSP3} <: AbstractContainer - u::Array{uEltype, NDIMSP3} # [small/large side, variable, position, i, j, mortar] - local_neighbor_ids::Vector{Vector{Int}} # [mortar][ids] - local_neighbor_positions::Vector{Vector{Int}} # [mortar][positions] - node_indices::Matrix{NTuple{NDIMS, Symbol}} # [small/large, mortar] - normal_directions::Array{RealT, NDIMSP2} # [dimension, i, j, position, mortar] + NDIMSP2, NDIMSP3, + uArray <: DenseArray{uEltype, NDIMSP3}, + uVector <: DenseVector{uEltype}} <: + AbstractContainer + u::uArray # [small/large side, variable, position, i, j, mortar] + local_neighbor_ids::Vector{Vector{Int}} # [mortar][ids] + local_neighbor_positions::Vector{Vector{Int}} # [mortar][positions] + node_indices::Matrix{NTuple{NDIMS, Symbol}} # [small/large, mortar] + normal_directions::Array{RealT, NDIMSP2} # [dimension, i, j, position, mortar] # internal `resize!`able storage - _u::Vector{uEltype} + _u::uVector _node_indices::Vector{NTuple{NDIMS, Symbol}} _normal_directions::Vector{RealT} end @@ -164,11 +200,12 @@ function init_mpi_mortars(mesh::Union{ParallelP4estMesh, ParallelT8codeMesh}, eq 2^(NDIMS - 1), n_mpi_mortars)) mpi_mortars = P4estMPIMortarContainer{NDIMS, uEltype, RealT, NDIMS + 1, NDIMS + 2, - NDIMS + 3}(u, local_neighbor_ids, - local_neighbor_positions, - node_indices, normal_directions, - _u, _node_indices, - _normal_directions) + NDIMS + 3, typeof(u), + typeof(_u)}(u, local_neighbor_ids, + local_neighbor_positions, + node_indices, normal_directions, + _u, _node_indices, + _normal_directions) if n_mpi_mortars > 0 init_mpi_mortars!(mpi_mortars, mesh, basis, elements) @@ -184,6 +221,35 @@ function init_mpi_mortars!(mpi_mortars, mesh::ParallelP4estMesh, basis, elements return mpi_mortars end +function Adapt.adapt_structure(to, mpi_mortars::P4estMPIMortarContainer) + # TODO: GPU + # Only parts of this container are adapted, since we currently don't + # use `local_neighbor_ids`, `local_neighbor_positions`, `normal_directions` + # on the GPU. If we do need them we need to redesign this to use the VecOfArrays + # approach. + + _u = adapt(to, mpi_mortars._u) + _node_indices = mpi_mortars._node_indices + _normal_directions = mpi_mortars._normal_directions + + u = unsafe_wrap_or_alloc(to, _u, size(mpi_mortars.u)) + local_neighbor_ids = mpi_mortars.local_neighbor_ids + local_neighbor_positions = mpi_mortars.local_neighbor_positions + node_indices = mpi_mortars.node_indices + normal_directions = mpi_mortars.normal_directions + + NDIMS = ndims(mpi_mortars) + return P4estMPIMortarContainer{NDIMS, eltype(_u), + eltype(_normal_directions), + NDIMS + 1, NDIMS + 2, NDIMS + 3, + typeof(u), typeof(_u)}(u, local_neighbor_ids, + local_neighbor_positions, + node_indices, + normal_directions, _u, + _node_indices, + _normal_directions) +end + # Overload init! function for regular interfaces, regular mortars and boundaries since they must # call the appropriate init_surfaces! 
function for parallel p4est meshes function init_interfaces!(interfaces, mesh::ParallelP4estMesh) diff --git a/src/solvers/dgsem_p4est/dg_3d.jl b/src/solvers/dgsem_p4est/dg_3d.jl index e59f502c86c..4c099c9fd3f 100644 --- a/src/solvers/dgsem_p4est/dg_3d.jl +++ b/src/solvers/dgsem_p4est/dg_3d.jl @@ -13,18 +13,18 @@ function create_cache(mesh::Union{P4estMesh{3}, T8codeMesh{3}}, equations, fstar_primary_threaded = [Array{uEltype, 4}(undef, nvariables(equations), nnodes(mortar_l2), nnodes(mortar_l2), 4) - for _ in 1:Threads.nthreads()] + for _ in 1:Threads.nthreads()] |> VecOfArrays fstar_secondary_threaded = [Array{uEltype, 4}(undef, nvariables(equations), nnodes(mortar_l2), nnodes(mortar_l2), 4) - for _ in 1:Threads.nthreads()] + for _ in 1:Threads.nthreads()] |> VecOfArrays fstar_tmp_threaded = [Array{uEltype, 3}(undef, nvariables(equations), nnodes(mortar_l2), nnodes(mortar_l2)) - for _ in 1:Threads.nthreads()] + for _ in 1:Threads.nthreads()] |> VecOfArrays u_threaded = [Array{uEltype, 3}(undef, nvariables(equations), nnodes(mortar_l2), nnodes(mortar_l2)) - for _ in 1:Threads.nthreads()] + for _ in 1:Threads.nthreads()] |> VecOfArrays (; fstar_primary_threaded, fstar_secondary_threaded, fstar_tmp_threaded, u_threaded) end diff --git a/src/solvers/dgsem_p4est/dg_parallel.jl b/src/solvers/dgsem_p4est/dg_parallel.jl index 2cc201dd1f0..7acddf07b4b 100644 --- a/src/solvers/dgsem_p4est/dg_parallel.jl +++ b/src/solvers/dgsem_p4est/dg_parallel.jl @@ -5,12 +5,13 @@ @muladd begin #! format: noindent -mutable struct P4estMPICache{uEltype} +mutable struct P4estMPICache{BufferType <: DenseVector, + VecInt <: DenseVector{<:Integer}} mpi_neighbor_ranks::Vector{Int} - mpi_neighbor_interfaces::Vector{Vector{Int}} - mpi_neighbor_mortars::Vector{Vector{Int}} - mpi_send_buffers::Vector{Vector{uEltype}} - mpi_recv_buffers::Vector{Vector{uEltype}} + mpi_neighbor_interfaces::VecOfArrays{VecInt} + mpi_neighbor_mortars::VecOfArrays{VecInt} + mpi_send_buffers::VecOfArrays{BufferType} + mpi_recv_buffers::VecOfArrays{BufferType} mpi_send_requests::Vector{MPI.Request} mpi_recv_requests::Vector{MPI.Request} n_elements_by_rank::OffsetArray{Int, 1, Array{Int, 1}} @@ -25,25 +26,29 @@ function P4estMPICache(uEltype) end mpi_neighbor_ranks = Vector{Int}(undef, 0) - mpi_neighbor_interfaces = Vector{Vector{Int}}(undef, 0) - mpi_neighbor_mortars = Vector{Vector{Int}}(undef, 0) - mpi_send_buffers = Vector{Vector{uEltype}}(undef, 0) - mpi_recv_buffers = Vector{Vector{uEltype}}(undef, 0) + mpi_neighbor_interfaces = Vector{Vector{Int}}(undef, 0) |> VecOfArrays + mpi_neighbor_mortars = Vector{Vector{Int}}(undef, 0) |> VecOfArrays + mpi_send_buffers = Vector{Vector{uEltype}}(undef, 0) |> VecOfArrays + mpi_recv_buffers = Vector{Vector{uEltype}}(undef, 0) |> VecOfArrays mpi_send_requests = Vector{MPI.Request}(undef, 0) mpi_recv_requests = Vector{MPI.Request}(undef, 0) n_elements_by_rank = OffsetArray(Vector{Int}(undef, 0), 0:-1) n_elements_global = 0 first_element_global_id = 0 - P4estMPICache{uEltype}(mpi_neighbor_ranks, mpi_neighbor_interfaces, - mpi_neighbor_mortars, - mpi_send_buffers, mpi_recv_buffers, - mpi_send_requests, mpi_recv_requests, - n_elements_by_rank, n_elements_global, - first_element_global_id) + P4estMPICache{Vector{uEltype}, Vector{Int}}(mpi_neighbor_ranks, + mpi_neighbor_interfaces, + mpi_neighbor_mortars, + mpi_send_buffers, mpi_recv_buffers, + mpi_send_requests, mpi_recv_requests, + n_elements_by_rank, n_elements_global, + first_element_global_id) end -@inline Base.eltype(::P4estMPICache{uEltype}) where 
{uEltype} = uEltype +@inline Base.eltype(::P4estMPICache{BufferType}) where {BufferType} = eltype(BufferType) + +# @eval due to @muladd +@eval Adapt.@adapt_structure(P4estMPICache) ## # Note that the code in `start_mpi_send`/`finish_mpi_receive!` is sensitive to inference on (at least) Julia 1.10. @@ -265,16 +270,16 @@ end function init_mpi_cache!(mpi_cache::P4estMPICache, mesh::ParallelP4estMesh, mpi_interfaces, mpi_mortars, nvars, n_nodes, uEltype) - mpi_neighbor_ranks, mpi_neighbor_interfaces, mpi_neighbor_mortars = init_mpi_neighbor_connectivity(mpi_interfaces, - mpi_mortars, - mesh) + mpi_neighbor_ranks, _mpi_neighbor_interfaces, _mpi_neighbor_mortars = init_mpi_neighbor_connectivity(mpi_interfaces, + mpi_mortars, + mesh) - mpi_send_buffers, mpi_recv_buffers, mpi_send_requests, mpi_recv_requests = init_mpi_data_structures(mpi_neighbor_interfaces, - mpi_neighbor_mortars, - ndims(mesh), - nvars, - n_nodes, - uEltype) + _mpi_send_buffers, _mpi_recv_buffers, mpi_send_requests, mpi_recv_requests = init_mpi_data_structures(_mpi_neighbor_interfaces, + _mpi_neighbor_mortars, + ndims(mesh), + nvars, + n_nodes, + uEltype) # Determine local and total number of elements n_elements_global = Int(mesh.p4est.global_num_quadrants[]) @@ -286,6 +291,11 @@ function init_mpi_cache!(mpi_cache::P4estMPICache, mesh::ParallelP4estMesh, first_element_global_id = Int(mesh.p4est.global_first_quadrant[mpi_rank() + 1]) + 1 @assert n_elements_global==sum(n_elements_by_rank) "error in total number of elements" + mpi_neighbor_interfaces = VecOfArrays(_mpi_neighbor_interfaces) + mpi_neighbor_mortars = VecOfArrays(_mpi_neighbor_mortars) + mpi_send_buffers = VecOfArrays(_mpi_send_buffers) + mpi_recv_buffers = VecOfArrays(_mpi_recv_buffers) + # TODO reuse existing structures @pack! mpi_cache = mpi_neighbor_ranks, mpi_neighbor_interfaces, mpi_neighbor_mortars, diff --git a/src/solvers/dgsem_unstructured/sort_boundary_conditions.jl b/src/solvers/dgsem_unstructured/sort_boundary_conditions.jl index 0cb3bd7f409..d6cf6e1ce6d 100644 --- a/src/solvers/dgsem_unstructured/sort_boundary_conditions.jl +++ b/src/solvers/dgsem_unstructured/sort_boundary_conditions.jl @@ -13,9 +13,10 @@ It stores a set of global indices for each boundary condition type and name to e during the call to `calc_boundary_flux!`. The original dictionary form of the boundary conditions set by the user in the elixir file is also stored for printing. """ -mutable struct UnstructuredSortedBoundaryTypes{N, BCs <: NTuple{N, Any}} +mutable struct UnstructuredSortedBoundaryTypes{N, BCs <: NTuple{N, Any}, + Vec <: AbstractVector{<:Integer}} boundary_condition_types::BCs # specific boundary condition type(s), e.g. 
BoundaryConditionDirichlet
-    boundary_indices::NTuple{N, Vector{Int}} # integer vectors containing global boundary indices
+    boundary_indices::NTuple{N, Vec} # integer vectors containing global boundary indices
     boundary_dictionary::Dict{Symbol, Any} # boundary conditions as set by the user in the elixir file
     boundary_symbol_indices::Dict{Symbol, Vector{Int}} # integer vectors containing global boundary indices per boundary identifier
 end
@@ -33,10 +34,11 @@ function UnstructuredSortedBoundaryTypes(boundary_conditions::Dict, cache)
     boundary_symbol_indices = Dict{Symbol, Vector{Int}}()
 
     container = UnstructuredSortedBoundaryTypes{n_boundary_types,
-                                                typeof(boundary_condition_types)}(boundary_condition_types,
-                                                                                  boundary_indices,
-                                                                                  boundary_conditions,
-                                                                                  boundary_symbol_indices)
+                                                typeof(boundary_condition_types),
+                                                Vector{Int}}(boundary_condition_types,
+                                                             boundary_indices,
+                                                             boundary_conditions,
+                                                             boundary_symbol_indices)
 
     initialize!(container, cache)
 end
@@ -119,4 +121,7 @@ function initialize!(boundary_types_container::UnstructuredSortedBoundaryTypes{N
 
     return boundary_types_container
 end
+
+# @eval due to @muladd
+@eval Adapt.@adapt_structure(UnstructuredSortedBoundaryTypes)
 end # @muladd
diff --git a/test/Project.toml b/test/Project.toml
index 3559f8cb6e2..d2d835746e1 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,8 +1,11 @@
 [deps]
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
+Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
 Convex = "f65535da-76fb-5f13-bab9-19810c17039a"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
 DoubleFloats = "497a8b3b-efae-58df-a0af-a86822472b78"
 Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
@@ -29,10 +32,13 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 TrixiTest = "0a316866-cbd0-4425-8bcb-08103b2c1f26"
 
 [compat]
+Adapt = "4"
 ADTypes = "1.11"
+AMDGPU = "1.3.5"
 Aqua = "0.8"
 CairoMakie = "0.12, 0.13, 0.14, 0.15"
 Convex = "0.16"
+CUDA = "5.8"
 DelimitedFiles = "1"
 DoubleFloats = "1.4.0"
 Downloads = "1"
diff --git a/test/runtests.jl b/test/runtests.jl
index db2c2e9dd88..d32eac318bb 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -109,4 +109,22 @@ const TRIXI_NTHREADS = clamp(Sys.CPU_THREADS, 2, 3)
     @time if TRIXI_TEST == "all" || TRIXI_TEST == "paper_self_gravitating_gas_dynamics"
         include("test_paper_self_gravitating_gas_dynamics.jl")
     end
+
+    @time if TRIXI_TEST == "all" || TRIXI_TEST == "CUDA"
+        import CUDA
+        if CUDA.functional()
+            include("test_cuda.jl")
+        else
+            @warn "Unable to run CUDA tests on this machine"
+        end
+    end
+
+    @time if TRIXI_TEST == "all" || TRIXI_TEST == "AMDGPU"
+        import AMDGPU
+        if AMDGPU.functional()
+            include("test_amdgpu.jl")
+        else
+            @warn "Unable to run AMDGPU tests on this machine"
+        end
+    end
 end
diff --git a/test/test_amdgpu.jl b/test/test_amdgpu.jl
new file mode 100644
index 00000000000..6bbc9e41bec
--- /dev/null
+++ b/test/test_amdgpu.jl
@@ -0,0 +1,79 @@
+module TestAMDGPU
+
+using Test
+using Trixi
+
+include("test_trixi.jl")
+
+# Start with a clean environment: remove Trixi.jl output directory if it exists
+outdir = "out"
+isdir(outdir) && rm(outdir, recursive = true)
+
+EXAMPLES_DIR = joinpath(examples_dir(), "p4est_2d_dgsem")
+
+@trixi_testset "elixir_advection_basic_gpu.jl native" begin
+    @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"),
+                        # Expected errors are exactly the same as with TreeMesh!
+                        l2=8.311947673061856e-6,
+                        linf=6.627000273229378e-5,)
+    # Ensure that we do not have excessive memory allocations
+    # (e.g., from type instabilities)
+    let
+        t = sol.t[end]
+        u_ode = sol.u[end]
+        du_ode = similar(u_ode)
+        @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000
+    end
+    @test real(ode.p.solver) == Float64
+    @test real(ode.p.solver.basis) == Float64
+    @test real(ode.p.solver.mortar) == Float64
+    # TODO: remake ignores the mesh itself as well
+    @test real(ode.p.mesh) == Float64
+
+    @test ode.u0 isa Array
+    @test ode.p.solver.basis.derivative_matrix isa Array
+
+    @test Trixi.storage_type(ode.p.cache.elements) === Array
+    @test Trixi.storage_type(ode.p.cache.interfaces) === Array
+    @test Trixi.storage_type(ode.p.cache.boundaries) === Array
+    @test Trixi.storage_type(ode.p.cache.mortars) === Array
+end
+
+@trixi_testset "elixir_advection_basic_gpu.jl Float32 / AMDGPU" begin
+    # Using AMDGPU inside the testset since otherwise the bindings are hidden by the anonymous modules
+    using AMDGPU
+    @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"),
+                        # Expected errors are exactly the same as with TreeMesh!
+                        l2=nothing, # TODO: GPU. [Float32(8.311947673061856e-6)],
+                        linf=nothing, # TODO: GPU. [Float32(6.627000273229378e-5)],
+                        RealT=Float32,
+                        real_type=Float32,
+                        storage_type=ROCArray,
+                        sol=nothing,) # TODO: GPU. Remove this once we can run the simulation on the GPU
+    # # Ensure that we do not have excessive memory allocations
+    # # (e.g., from type instabilities)
+    # let
+    #     t = sol.t[end]
+    #     u_ode = sol.u[end]
+    #     du_ode = similar(u_ode)
+    #     @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000
+    # end
+    @test real(ode.p.solver) == Float32
+    @test real(ode.p.solver.basis) == Float32
+    @test real(ode.p.solver.mortar) == Float32
+    # TODO: remake ignores the mesh itself as well
+    @test real(ode.p.mesh) == Float64
+
+    @test ode.u0 isa ROCArray
+    @test ode.p.solver.basis.derivative_matrix isa ROCArray
+
+    @test Trixi.storage_type(ode.p.cache.elements) === ROCArray
+    @test Trixi.storage_type(ode.p.cache.interfaces) === ROCArray
+    @test Trixi.storage_type(ode.p.cache.boundaries) === ROCArray
+    @test Trixi.storage_type(ode.p.cache.mortars) === ROCArray
+end
+
+# Clean up afterwards: delete Trixi.jl output directory
+@test_nowarn isdir(outdir) && rm(outdir, recursive = true)
+
+end # module
diff --git a/test/test_aqua.jl b/test/test_aqua.jl
index 9b3f2d67903..154088995ca 100644
--- a/test/test_aqua.jl
+++ b/test/test_aqua.jl
@@ -10,6 +10,7 @@ include("test_trixi.jl")
 @timed_testset "Aqua.jl" begin
     Aqua.test_all(Trixi,
                   ambiguities = false,
+                  unbound_args = false, # FIXME: UnstructuredSortedBoundaryTypes
                   # exceptions necessary for adding a new method `StartUpDG.estimate_h`
                   # in src/solvers/dgmulti/sbp.jl
                   piracies = (treat_as_own = [Trixi.StartUpDG.RefElemData,
diff --git a/test/test_cuda.jl b/test/test_cuda.jl
new file mode 100644
index 00000000000..4380ab0e111
--- /dev/null
+++ b/test/test_cuda.jl
@@ -0,0 +1,79 @@
+module TestCUDA
+
+using Test
+using Trixi
+
+include("test_trixi.jl")
+
+# Start with a clean environment: remove Trixi.jl output directory if it exists
+outdir = "out"
+isdir(outdir) && rm(outdir, recursive = true)
+
+EXAMPLES_DIR = joinpath(examples_dir(), "p4est_2d_dgsem")
+
+@trixi_testset "elixir_advection_basic_gpu.jl native" begin
+    @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"),
+                        # Expected errors are exactly the same as with TreeMesh!
+                        l2=8.311947673061856e-6,
+                        linf=6.627000273229378e-5,)
+    # Ensure that we do not have excessive memory allocations
+    # (e.g., from type instabilities)
+    let
+        t = sol.t[end]
+        u_ode = sol.u[end]
+        du_ode = similar(u_ode)
+        @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000
+    end
+    @test real(ode.p.solver) == Float64
+    @test real(ode.p.solver.basis) == Float64
+    @test real(ode.p.solver.mortar) == Float64
+    # TODO: remake ignores the mesh itself as well
+    @test real(ode.p.mesh) == Float64
+
+    @test ode.u0 isa Array
+    @test ode.p.solver.basis.derivative_matrix isa Array
+
+    @test Trixi.storage_type(ode.p.cache.elements) === Array
+    @test Trixi.storage_type(ode.p.cache.interfaces) === Array
+    @test Trixi.storage_type(ode.p.cache.boundaries) === Array
+    @test Trixi.storage_type(ode.p.cache.mortars) === Array
+end
+
+@trixi_testset "elixir_advection_basic_gpu.jl Float32 / CUDA" begin
+    # Using CUDA inside the testset since otherwise the bindings are hidden by the anonymous modules
+    using CUDA
+    @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"),
+                        # Expected errors are exactly the same as with TreeMesh!
+                        l2=nothing, # TODO: GPU. [Float32(8.311947673061856e-6)],
+                        linf=nothing, # TODO: GPU. [Float32(6.627000273229378e-5)],
+                        RealT=Float32,
+                        real_type=Float32,
+                        storage_type=CuArray,
+                        sol=nothing,) # TODO: GPU. Remove this once we can run the simulation on the GPU
+    # # Ensure that we do not have excessive memory allocations
+    # # (e.g., from type instabilities)
+    # let
+    #     t = sol.t[end]
+    #     u_ode = sol.u[end]
+    #     du_ode = similar(u_ode)
+    #     @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000
+    # end
+    @test real(ode.p.solver) == Float32
+    @test real(ode.p.solver.basis) == Float32
+    @test real(ode.p.solver.mortar) == Float32
+    # TODO: remake ignores the mesh itself as well
+    @test real(ode.p.mesh) == Float64
+
+    @test ode.u0 isa CuArray
+    @test ode.p.solver.basis.derivative_matrix isa CuArray
+
+    @test Trixi.storage_type(ode.p.cache.elements) === CuArray
+    @test Trixi.storage_type(ode.p.cache.interfaces) === CuArray
+    @test Trixi.storage_type(ode.p.cache.boundaries) === CuArray
+    @test Trixi.storage_type(ode.p.cache.mortars) === CuArray
+end
+
+# Clean up afterwards: delete Trixi.jl output directory
+@test_nowarn isdir(outdir) && rm(outdir, recursive = true)
+
+end # module
diff --git a/test/test_p4est_2d.jl b/test/test_p4est_2d.jl
index 8f903a849d2..04ebba71050 100644
--- a/test/test_p4est_2d.jl
+++ b/test/test_p4est_2d.jl
@@ -27,6 +27,34 @@ isdir(outdir) && rm(outdir, recursive = true)
         du_ode = similar(u_ode)
         @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000
     end
+    semi32 = Trixi.trixi_adapt(Array, Float32, semi)
+    @test real(semi32.solver) == Float32
+    @test real(semi32.solver.basis) == Float32
+    @test real(semi32.solver.mortar) == Float32
+    # TODO: remake ignores the mesh itself as well
+    @test real(semi32.mesh) == Float64
+end
+
+@trixi_testset "elixir_advection_basic.jl (Float32)" begin
+    @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"),
+                        # Expected errors are exactly the same as with TreeMesh!
+                        l2=[Float32(8.311947673061856e-6)],
+                        linf=[Float32(6.627000273229378e-5)],
+                        RealT=Float32,
+                        real_type=Float32)
+    # Ensure that we do not have excessive memory allocations
+    # (e.g., from type instabilities)
+    let
+        t = sol.t[end]
+        u_ode = sol.u[end]
+        du_ode = similar(u_ode)
+        @test_broken (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000
+    end
+    @test real(ode.p.solver) == Float32
+    @test real(ode.p.solver.basis) == Float32
+    @test real(ode.p.solver.mortar) == Float32
+    # TODO: remake ignores the mesh itself as well
+    @test real(ode.p.mesh) == Float64
 end
 
 @trixi_testset "elixir_advection_nonconforming_flag.jl" begin
diff --git a/test/test_unstructured_2d.jl b/test/test_unstructured_2d.jl
index d16bc96fb83..758e42b7da1 100644
--- a/test/test_unstructured_2d.jl
+++ b/test/test_unstructured_2d.jl
@@ -2,6 +2,7 @@ module TestExamplesUnstructuredMesh2D
 
 using Test
 using Trixi
+using Adapt
 
 include("test_trixi.jl")
 
@@ -32,6 +33,12 @@ isdir(outdir) && rm(outdir, recursive = true)
         du_ode = similar(u_ode)
         @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000
     end
+    semi32 = Trixi.trixi_adapt(Array, Float32, semi)
+    @test real(semi32.solver) == Float32
+    @test real(semi32.solver.basis) == Float32
+    @test real(semi32.solver.mortar) == Float32
+    # TODO: remake ignores the mesh as well
+    @test real(semi32.mesh) == Float64
 end
 
 @trixi_testset "elixir_euler_free_stream.jl" begin