trixi-framework
diff --git a/‎.buildkite/pipeline.yml‎
Lines changed: 6 additions & 3 deletions b/‎.buildkite/pipeline.yml‎
Lines changed: 6 additions & 3 deletions
diff --git a/‎.github/workflows/GPUCompat.yml‎
Lines changed: 0 additions & 86 deletions b/‎.github/workflows/GPUCompat.yml‎
Lines changed: 0 additions & 86 deletions
diff --git a/‎NEWS.md‎
Lines changed: 7 additions & 0 deletions b/‎NEWS.md‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎Project.toml‎
Lines changed: 13 additions & 3 deletions b/‎Project.toml‎
Lines changed: 13 additions & 3 deletions
diff --git a/‎docs/Project.toml‎
Lines changed: 5 additions & 0 deletions b/‎docs/Project.toml‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎docs/make.jl‎
Lines changed: 2 additions & 1 deletion b/‎docs/make.jl‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎docs/src/heterogeneous.md‎
Lines changed: 163 additions & 0 deletions b/‎docs/src/heterogeneous.md‎
Lines changed: 163 additions & 0 deletions
@@ -1,3 +1,5 @@
+env:
+
 steps:
   - label: "CUDA Julia {{matrix.version}}"
     matrix:
@@ -7,12 +9,13 @@ steps:
     plugins:
       - JuliaCI/julia#v1:
           version: "{{matrix.version}}"
-    command: |
-      true
+      - JuliaCI/julia-test#v1: ~
+    env:
+      TRIXI_TEST: "CUDA"
     agents:
       queue: "juliagpu"
       cuda: "*"
     if: build.message !~ /\[skip ci\]/
     timeout_in_minutes: 60
     soft_fail:
-      - exit_status: 3
+      - exit_status: 3
@@ -5,6 +5,13 @@ Trixi.jl follows the interpretation of
 used in the Julia ecosystem. Notable changes will be documented in this file
 for human readability.
 
+## Changes in the v0.12 lifecycle
+
+#### Added
+- Initial support for adapting data structures between different array storage types was added. This enables future work on GPU support in Trixi.jl ([#2212]).
+
+#### Deprecated
+
 ## Changes when updating to v0.12 from v0.11.x
 
 #### Added
 
@@ -5,6 +5,7 @@ version = "0.12.7-DEV"
 
 [deps]
 Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697"
+Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 CodeTracking = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2"
 ConstructionBase = "187b0558-2788-49d3-abe0-74a17ed4e7c9"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
@@ -16,6 +17,7 @@ EllipsisNotation = "da5c29d0-fa7d-589e-88eb-ea29b0a81949"
 FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LinearMaps = "7a12625a-238d-50fd-b39a-03d52299707e"
 LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
@@ -52,31 +54,39 @@ TrixiBase = "9a0f1c46-06d5-4909-a5a3-ce25d3fa3284"
 UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
 
 [weakdeps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Convex = "f65535da-76fb-5f13-bab9-19810c17039a"
 ECOS = "e2685f51-7e38-5353-a97d-a921fd2c8199"
 Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
 NLsolve = "2774e3e8-f4cf-5e23-947b-6d7e65073b56"
 
 [extensions]
+TrixiAMDGPUExt = "AMDGPU"
+TrixiCUDAExt = "CUDA"
 TrixiConvexECOSExt = ["Convex", "ECOS"]
 TrixiMakieExt = "Makie"
 TrixiNLsolveExt = "NLsolve"
 
 [compat]
+AMDGPU = "1.3.5"
 Accessors = "0.1.36"
+Adapt = "4"
+CUDA = "5.8"
 CodeTracking = "1.0.5"
 ConstructionBase = "1.5"
 Convex = "0.16"
 DataStructures = "0.18.15"
 DelimitedFiles = "1"
-DiffEqBase = "6.154"
+DiffEqBase = "6.155.2"
 DiffEqCallbacks = "2.35, 3, 4"
 Downloads = "1.6"
 ECOS = "1.1.2"
 EllipsisNotation = "1.0"
 FillArrays = "1.9"
 ForwardDiff = "0.10.36, 1"
 HDF5 = "0.16.10, 0.17"
+KernelAbstractions = "0.9.36"
 LinearAlgebra = "1"
 LinearMaps = "2.7, 3.0"
 LoopVectorization = "0.12.171"
@@ -94,7 +104,7 @@ Printf = "1"
 RecipesBase = "1.3.4"
 RecursiveArrayTools = "3.31.1"
 Reexport = "1.2"
-Requires = "1.1"
+Requires = "1.3"
 SciMLBase = "2.67.0"
 SimpleUnPack = "1.1"
 SparseArrays = "1"
@@ -104,7 +114,7 @@ Static = "1.1.1"
 StaticArrayInterface = "1.5.1"
 StaticArrays = "1.9"
 StrideArrays = "0.1.29"
-StructArrays = "0.6.18, 0.7"
+StructArrays = "0.6.20, 0.7"
 SummationByPartsOperators = "0.5.52"
 T8code = "0.7.4"
 TimerOutputs = "0.5.23"
 
@@ -1,4 +1,5 @@
 [deps]
+Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
 Changelog = "5217a498-cd5d-4ec6-b8c2-9b85a09b6e3e"
 Convex = "f65535da-76fb-5f13-bab9-19810c17039a"
@@ -16,9 +17,13 @@ OrdinaryDiffEqSSPRK = "669c94d9-1f4b-4b64-b377-1aa079aa2388"
 OrdinaryDiffEqTsit5 = "b1df2697-797e-41e3-8120-5422d3b24e4a"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+Trixi = "a7f1ee26-1774-49b1-8366-f1abc58fbfcb"
 Trixi2Vtk = "bc1476a1-1ca6-4cc3-950b-c312b255ff95"
 TrixiBase = "9a0f1c46-06d5-4909-a5a3-ce25d3fa3284"
 
+[sources]
+Trixi = {path = ".."}
+
 [compat]
 CairoMakie = "0.12, 0.13, 0.14, 0.15"
 Changelog = "1.1"
 
@@ -163,7 +163,8 @@ makedocs(
                  "Style guide" => "styleguide.md",
                  "Testing" => "testing.md",
                  "Performance" => "performance.md",
-                 "Parallelization" => "parallelization.md"
+                 "Parallelization" => "parallelization.md",
+                 "Heterogeneous" => "heterogeneous.md"
              ],
              "Troubleshooting and FAQ" => "troubleshooting.md",
              "Reference" => [
 
@@ -0,0 +1,163 @@
+# Heterogeneous computing
+
+Support for heterogeneous computing is currently being worked on.
+
+## The use of Adapt.jl
+
+[Adapt.jl](https://github.com/JuliaGPU/Adapt.jl) is a package in the
+[JuliaGPU](https://github.com/JuliaGPU) family that allows for
+the translation of nested data structures. The primary goal is to allow the substitution of `Array` 
+at the storage level with a GPU array like `CuArray` from [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl).
+
+To facilitate this, data structures must be parameterized, so instead of:
+
+```julia
+struct Container <: Trixi.AbstractContainer
+   data::Array{Float64, 2}
+end
+```
+
+They must be written as:
+
+```jldoctest adapt; output = false, setup=:(import Trixi)
+struct Container{D<:AbstractArray} <: Trixi.AbstractContainer
+   data::D
+end
+
+# output
+
+```
+
+Furthermore, we need to define a function that allows for the conversion of the storage
+of our types:
+
+```jldoctest adapt; output = false
+using Adapt
+
+function Adapt.adapt_structure(to, C::Container)
+    return Container(adapt(to, C.data))
+end
+
+# output
+
+```
+
+or simply
+
+```julia
+Adapt.@adapt_structure(Container)
+```
+
+Additionally, we must define `Adapt.parent_type`.
+
+```jldoctest adapt; output = false
+function Adapt.parent_type(::Type{<:Container{D}}) where D
+    return D
+end
+
+# output
+
+```
+
+All together we can use this machinery to perform conversions of a container.
+
+```jldoctest adapt
+julia> C = Container(zeros(3))
+Container{Vector{Float64}}([0.0, 0.0, 0.0])
+
+julia> Trixi.storage_type(C)
+Array
+```
+
+
+```julia-repl
+julia> using CUDA
+
+julia> GPU_C = adapt(CuArray, C)
+Container{CuArray{Float64, 1, CUDA.DeviceMemory}}([0.0, 0.0, 0.0])
+
+julia> Trixi.storage_type(GPU_C)
+CuArray
+```
+
+## Element-type conversion with `Trixi.trixi_adapt`
+
+We can use [`Trixi.trixi_adapt`](@ref) to perform both an element-type and a storage-type adaptation:
+
+```jldoctest adapt
+julia> C = Container(zeros(3))
+Container{Vector{Float64}}([0.0, 0.0, 0.0])
+
+julia> Trixi.trixi_adapt(Array, Float32, C)
+Container{Vector{Float32}}(Float32[0.0, 0.0, 0.0])
+```
+
+```julia-repl
+julia> Trixi.trixi_adapt(CuArray, Float32, C)
+Container{CuArray{Float32, 1, CUDA.DeviceMemory}}(Float32[0.0, 0.0, 0.0])
+```
+
+!!! note
+    `adapt(Array{Float32}, C)` is tempting, but it will do the wrong thing
+    in the presence of `SVector`s and similar arrays from StaticArrays.jl.
+
+
+## Writing GPU kernels
+
+Offloading computations to the GPU is done with
+[KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl),
+allowing for vendor-agnostic GPU code.
+
+### Example
+
+Given the following Trixi.jl code, which would typically be called from within `rhs!`:
+
+```julia
+function trixi_rhs_fct(mesh, equations, solver, cache, args)
+    @threaded for element in eachelement(solver, cache)
+        # code
+    end
+end
+```
+
+1.  Put the inner code in a new function `rhs_fct_per_element`. Besides the index
+    `element`, pass all required fields as arguments, but make sure to `@unpack` them from
+    their structs in advance.
+
+2.  Where `trixi_rhs_fct` is called, get the backend, i.e., the hardware we are currently
+    running on, via `trixi_backend(x)`.
+    This will, e.g., work with `u_ode`. Internally, KernelAbstractions.jl's `get_backend`
+    will be called, i.e., KernelAbstractions.jl has to know the type of `x`.
+
+    ```julia
+    backend = trixi_backend(u_ode)
+    ```
+
+3.  Add a new argument `backend` to `trixi_rhs_fct` used for dispatch.
+    When `backend` is `nothing`, the legacy implementation should be used:
+    ```julia
+    function trixi_rhs_fct(backend::Nothing, mesh, equations, solver, cache, args)
+        @unpack unpacked_args = cache
+        @threaded for element in eachelement(solver, cache)
+            rhs_fct_per_element(element, unpacked_args, args)
+        end
+    end
+    ```
+
+4.  When `backend` is a `Backend` (a type defined by KernelAbstractions.jl), write a
+    KernelAbstractions.jl kernel:
+    ```julia
+    function trixi_rhs_fct(backend::Backend, mesh, equations, solver, cache, args)
+        nelements(solver, cache) == 0 && return nothing  # return early when there are no elements
+        @unpack unpacked_args = cache
+        kernel! = rhs_fct_kernel!(backend)
+        kernel!(unpacked_args, args,
+                ndrange = nelements(solver, cache))
+        return nothing
+    end
+
+    @kernel function rhs_fct_kernel!(unpacked_args, args)
+        element = @index(Global)
+        rhs_fct_per_element(element, unpacked_args, args)
+    end
+    ```