diff --git a/ci/cscs-gh200.yml b/ci/cscs-gh200.yml
new file mode 100644
index 00000000..afca96d8
--- /dev/null
+++ b/ci/cscs-gh200.yml
@@ -0,0 +1,36 @@
+include:
+  - remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'
+
+unit_test:  # Chmy test suite, CUDA backend, one task with one GPU on one node
+  extends: .uenv-runner-daint-gh200
+  image: julia/25.5:v1  # same uenv that perf_test passes to srun --uenv
+  script:
+    - export MPICH_GPU_SUPPORT_ENABLED=1
+    - julia -e 'println("Instantiating project");
+                using Pkg; Pkg.activate(pwd());
+                Pkg.instantiate()'
+    - julia -e 'println("Running tests");
+                using Pkg;
+                Pkg.activate(pwd());
+                Pkg.test("Chmy"; test_args=["--backends=CUDA"])'
+  variables:
+    WITH_UENV_VIEW: 'juliaup'
+    SLURM_JOB_NUM_NODES: '1'  # quoted so the value stays a string
+    SLURM_NTASKS_PER_NODE: '1'
+    SLURM_GPUS_PER_TASK: '1'
+    SLURM_TIMELIMIT: '00:15:00'
+
+perf_test:  # multi-rank run of the stokes_3d_inc_ve_T_mpi_perf.jl example
+  extends: .baremetal-runner-daint-gh200
+  script:
+    - echo "Preparing the test environment (single rank)"
+    - export MPICH_GPU_SUPPORT_ENABLED=1
+    - srun -n 1 --uenv julia/25.5:v1 --view=juliaup julia --project=. -e 'using Pkg; Pkg.instantiate()'
+    - srun -n 1 --uenv julia/25.5:v1 --view=juliaup julia --project=. -e 'using Pkg; Pkg.add("CUDA")'
+    - echo "Running the reference test (multiple ranks)"
+    - srun --uenv julia/25.5:v1 --view=juliaup julia --project=. examples/stokes_3d_inc_ve_T_mpi_perf.jl
+  variables:
+    SLURM_JOB_NUM_NODES: '2'  # 2 nodes x 4 tasks per node, one GPU per task
+    SLURM_NTASKS_PER_NODE: '4'
+    SLURM_GPUS_PER_TASK: '1'
+    SLURM_TIMELIMIT: '00:10:00'
diff --git a/examples/stokes_3d_inc_ve_T_mpi_perf.jl b/examples/stokes_3d_inc_ve_T_mpi_perf.jl
index 7b03e5de..dbabb9e4 100644
--- a/examples/stokes_3d_inc_ve_T_mpi_perf.jl
+++ b/examples/stokes_3d_inc_ve_T_mpi_perf.jl
@@ -1,13 +1,13 @@
 using Chmy
 using KernelAbstractions
 using Printf
-using JSON
+# using JSON
 # using CairoMakie
 
-using AMDGPU
-AMDGPU.allowscalar(false)
-# using CUDA
-# CUDA.allowscalar(false)
+# using AMDGPU
+# AMDGPU.allowscalar(false)
+using CUDA
+CUDA.allowscalar(false)
 
 using MPI
 MPI.Init()
@@ -86,8 +86,8 @@ end
 end
 
 @views function main(backend=CPU(); nxyz_l=(126, 126, 126))
-    # arch = Arch(backend, MPI.COMM_WORLD, (0, 0, 0); device_id=1)
-    arch = Arch(backend, MPI.COMM_WORLD, (0, 0, 0))
+    arch = Arch(backend, MPI.COMM_WORLD, (0, 0, 0); device_id=1)
+    # arch = Arch(backend, MPI.COMM_WORLD, (0, 0, 0))
     topo = topology(arch)
     me   = global_rank(topo)
     # geometry
@@ -237,12 +237,13 @@ end
     return
 end
 
-input = open(JSON.parse, joinpath(@__DIR__, "params.json"))
-params = NamedTuple(Symbol.(keys(input)) .=> values(input))
-res = params.res
-# res = 640
+# input = open(JSON.parse, joinpath(@__DIR__, "params.json"))
+# params = NamedTuple(Symbol.(keys(input)) .=> values(input))
+# res = params.res
+res = 512  # fixed value; the params.json lookup above is commented out
 
-main(ROCBackend(); nxyz_l=(res, res, res) .- 2)
+# main(ROCBackend(); nxyz_l=(res, res, res) .- 2)
+main(CUDABackend(); nxyz_l=(res, res, res) .- 2)
 # main(; nxyz_l=(254, 254, 254))
 
 MPI.Finalize()