diff --git a/.buildkite/run_tests.yml b/.buildkite/run_tests.yml
index ce4ddfd3..c4111fbf 100644
--- a/.buildkite/run_tests.yml
+++ b/.buildkite/run_tests.yml
@@ -5,6 +5,7 @@ steps:
         version:
           - "1.10"
           - "1.11"
+          - "1.12-nightly"
     plugins:
       - JuliaCI/julia#v1:
           version: "{{matrix.version}}"
@@ -33,6 +34,7 @@ steps:
         version:
           - "1.10"
           - "1.11"
+          - "1.12-nightly"
     plugins:
       - JuliaCI/julia#v1:
           version: "{{matrix.version}}"
diff --git a/README.md b/README.md
index d527bc55..8b741169 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,7 @@
 
 [![DOI](https://zenodo.org/badge/507905159.svg)](https://zenodo.org/doi/10.5281/zenodo.10212675)
 [![CPU UnitTests](https://github.com/JuliaGeodynamics/JustPIC.jl/actions/workflows/UnitTests.yml/badge.svg)](https://github.com/JuliaGeodynamics/JustPIC.jl/actions/workflows/UnitTests.yml)
 [![GPU UnitTests](https://badge.buildkite.com/bb05ed7ef3b43f843a5ba4a976c27a724064d67955193accea.svg)](https://buildkite.com/julialang/justpic-dot-jl)
+[![CSCS UnitTests](https://gitlab.com/cscs-ci/ci-testing/webhook-ci/mirrors/6264856887055800/8444213376739374/badges/main/pipeline.svg?ignore_skipped=true)](https://gitlab.com/cscs-ci/ci-testing/webhook-ci/mirrors/6264856887055800/8444213376739374/-/pipelines)
 [![codecov](https://codecov.io/gh/JuliaGeodynamics/JustPIC.jl/graph/badge.svg?token=PN0AJZXK13)](https://codecov.io/gh/JuliaGeodynamics/JustPIC.jl)
 [![Aqua QA](https://raw.githubusercontent.com/JuliaTesting/Aqua.jl/master/badge.svg)](https://github.com/JuliaTesting/Aqua.jl)
diff --git a/ci/cscs-gh200.yml b/ci/cscs-gh200.yml
new file mode 100644
index 00000000..c53c2a98
--- /dev/null
+++ b/ci/cscs-gh200.yml
@@ -0,0 +1,40 @@
+include:
+  - remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'
+
+unit_test:
+  extends: .uenv-runner-daint-gh200
+  image: julia/25.5:v1
+  script:
+    - export MPICH_GPU_SUPPORT_ENABLED=1
+    - export IGG_CUDAAWARE_MPI=1 # IGG
+    - export JULIA_CUDA_USE_COMPAT=false # IGG
+    - julia -e 'println("Instantiating project");
+      using Pkg;
+      Pkg.activate(pwd())'
+    - julia -e 'println("Running tests");
+      using Pkg;
+      Pkg.activate(pwd());
+      Pkg.test("JustPIC"; test_args=["--backend=CUDA"])'
+  variables:
+    WITH_UENV_VIEW: 'juliaup'
+    SLURM_JOB_NUM_NODES: 1
+    SLURM_NTASKS_PER_NODE: 1
+    SLURM_GPUS_PER_TASK: 1
+    SLURM_TIMELIMIT: "00:30:00"
+
+ref_test:
+  extends: .baremetal-runner-daint-gh200
+  script:
+    - export MPICH_GPU_SUPPORT_ENABLED=1
+    - export IGG_CUDAAWARE_MPI=1 # IGG
+    - export JULIA_CUDA_USE_COMPAT=false # IGG
+    - echo "Preparing the test environment (single rank)"
+    - srun -n 1 --uenv julia/25.5:v1 --view=juliaup julia --project=. -e 'using Pkg; Pkg.instantiate()'
+    - srun -n 1 --uenv julia/25.5:v1 --view=juliaup julia --project=. -e 'using Pkg; Pkg.add("CUDA")'
+    - echo "Running the reference test (multiple ranks)"
+    - srun --uenv julia/25.5:v1 --view=juliaup julia --project=. scripts/temperature_advection_MPI_ci.jl
+  variables:
+    SLURM_JOB_NUM_NODES: 2
+    SLURM_NTASKS_PER_NODE: 4
+    SLURM_GPUS_PER_TASK: 1
+    SLURM_TIMELIMIT: "00:30:00"
diff --git a/ext/JustPICAMDGPUExt.jl b/ext/JustPICAMDGPUExt.jl
index cbd44e81..d9e8f991 100644
--- a/ext/JustPICAMDGPUExt.jl
+++ b/ext/JustPICAMDGPUExt.jl
@@ -269,8 +269,8 @@ module _2D
     end
 
     function JustPIC._2D.semilagrangian_advection!(
-            F::Union{ROCArrays, NTuple{NF, ROCArrays}},
-            F0::Union{ROCArrays, NTuple{NF, ROCArrays}},
+            F::Union{ROCArray, NTuple{NF, ROCArray}},
+            F0::Union{ROCArray, NTuple{NF, ROCArray}},
             method::AbstractAdvectionIntegrator,
             V,
             grid_vi::NTuple{N, NTuple{N, T}},
@@ -282,8 +282,8 @@ module _2D
     end
 
     function JustPIC._2D.semilagrangian_advection_LinP!(
-            F::Union{ROCArrays, NTuple{NF, ROCArrays}},
-            F0::Union{ROCArrays, NTuple{NF, ROCArrays}},
+            F::Union{ROCArray, NTuple{NF, ROCArray}},
+            F0::Union{ROCArray, NTuple{NF, ROCArray}},
             method::AbstractAdvectionIntegrator,
             V,
             grid_vi::NTuple{N, NTuple{N, T}},
@@ -295,8 +295,8 @@ module _2D
     end
 
     function JustPIC._2D.semilagrangian_advection_MQS!(
-            F::Union{ROCArrays, NTuple{NF, ROCArrays}},
-            F0::Union{ROCArrays, NTuple{NF, ROCArrays}},
+            F::Union{ROCArray, NTuple{NF, ROCArray}},
+            F0::Union{ROCArray, NTuple{NF, ROCArray}},
             method::AbstractAdvectionIntegrator,
             V,
             grid_vi::NTuple{N, NTuple{N, T}},
@@ -703,8 +703,8 @@ module _3D
     end
 
     function JustPIC._3D.semilagrangian_advection!(
-            F::Union{ROCArrays, NTuple{NF, ROCArrays}},
-            F0::Union{ROCArrays, NTuple{NF, ROCArrays}},
+            F::Union{ROCArray, NTuple{NF, ROCArray}},
+            F0::Union{ROCArray, NTuple{NF, ROCArray}},
             method::AbstractAdvectionIntegrator,
             V,
             grid_vi::NTuple{N, NTuple{N, T}},
@@ -716,8 +716,8 @@ module _3D
     end
 
     function JustPIC._3D.semilagrangian_advection_LinP!(
-            F::Union{ROCArrays, NTuple{NF, ROCArrays}},
-            F0::Union{ROCArrays, NTuple{NF, ROCArrays}},
+            F::Union{ROCArray, NTuple{NF, ROCArray}},
+            F0::Union{ROCArray, NTuple{NF, ROCArray}},
             method::AbstractAdvectionIntegrator,
             V,
             grid_vi::NTuple{N, NTuple{N, T}},
@@ -729,8 +729,8 @@ module _3D
     end
 
     function JustPIC._3D.semilagrangian_advection_MQS!(
-            F::Union{ROCArrays, NTuple{NF, ROCArrays}},
-            F0::Union{ROCArrays, NTuple{NF, ROCArrays}},
+            F::Union{ROCArray, NTuple{NF, ROCArray}},
+            F0::Union{ROCArray, NTuple{NF, ROCArray}},
             method::AbstractAdvectionIntegrator,
             V,
             grid_vi::NTuple{N, NTuple{N, T}},
diff --git a/scripts/temperature_advection_MPI.jl b/scripts/temperature_advection_MPI.jl
index 623eaae8..a64ba91d 100644
--- a/scripts/temperature_advection_MPI.jl
+++ b/scripts/temperature_advection_MPI.jl
@@ -7,7 +7,7 @@ using JustPIC, JustPIC._2D
 # to run on a CUDA GPU load CUDA.jl (i.e. "using CUDA"),
 # and to run on an AMD GPU load AMDGPU.jl (i.e. "using AMDGPU")
 # const backend = JustPIC.CPUBackend # Options: CPUBackend, CUDABackend, AMDGPUBackend
-const backend = CUDABackend # Options: CPUBackend, CUDABackend, AMDGPUBackend
+# const backend = CUDABackend # Options: CPUBackend, CUDABackend, AMDGPUBackend
 
 using GLMakie
 using ImplicitGlobalGrid
@@ -87,9 +87,9 @@ function main()
         advection!(particles, RungeKutta2(), V, (grid_vx, grid_vy), dt)
 
         # update halos
-        update_cell_halo!(particles.coords...)
-        update_cell_halo!(particle_args...)
-        update_cell_halo!(particles.index)
+        update_halo!(particles.coords...)
+        update_halo!(particle_args...)
+        update_halo!(particles.index)
         # shuffle particles
         move_particles!(particles, xvi, particle_args)
         # interpolate T from particle to grid
diff --git a/scripts/temperature_advection_MPI_ci.jl b/scripts/temperature_advection_MPI_ci.jl
new file mode 100644
index 00000000..8caa13d8
--- /dev/null
+++ b/scripts/temperature_advection_MPI_ci.jl
@@ -0,0 +1,153 @@
+using CUDA
+# import Pkg
+# Pkg.resolve(); Pkg.update()
+using JustPIC, JustPIC._2D
+
+# Threads is the default backend,
+# to run on a CUDA GPU load CUDA.jl (i.e. "using CUDA"),
+# and to run on an AMD GPU load AMDGPU.jl (i.e. "using AMDGPU")
+# const backend = JustPIC.CPUBackend # Options: CPUBackend, CUDABackend, AMDGPUBackend
+const backend = CUDABackend # Options: CPUBackend, CUDABackend, AMDGPUBackend
+
+# using GLMakie
+using ImplicitGlobalGrid
+import MPI
+
+# Analytical flow solution
+vx_stream(x, y) = 250 * sin(π * x) * cos(π * y)
+vy_stream(x, y) = -250 * cos(π * x) * sin(π * y)
+g(x) = Point2f(
+    vx_stream(x[1], x[2]),
+    vy_stream(x[1], x[2])
+)
+
+function expand_range(x::AbstractRange)
+    dx = x[2] - x[1]
+    n = length(x)
+    x1, x2 = extrema(x)
+    xI = x1 - dx
+    xF = x2 + dx
+    return LinRange(xI, xF, n + 2)
+end
+
+function main()
+    # Initialize particles -------------------------------
+    nxcell, max_xcell, min_xcell = 24, 40, 1
+    n = 64
+    nx = ny = n - 1
+    me, dims, = init_global_grid(
+        n - 1, n - 1, 1;
+        init_MPI = MPI.Initialized() ? false : true,
+        select_device = false
+    )
+    Lx = Ly = 1.0
+    dxi = dx, dy = Lx / (nx_g() - 1), Ly / (ny_g() - 1)
+    # nodal vertices
+    xvi = xv, yv = let
+        dummy = zeros(n, n)
+        xv = [x_g(i, dx, dummy) for i in axes(dummy, 1)]
+        yv = [y_g(i, dy, dummy) for i in axes(dummy, 2)]
+        LinRange(first(xv), last(xv), n), LinRange(first(yv), last(yv), n)
+    end
+    # nodal centers
+    xci = xc, yc = let
+        dummy = zeros(nx, ny)
+        xc = [x_g(i, dx, dummy) for i in axes(dummy, 1)]
+        yc = [y_g(i, dy, dummy) for i in axes(dummy, 2)]
+        LinRange(first(xc), last(xc), n), LinRange(first(yc), last(yc), n)
+    end
+
+    # staggered grid for the velocity components
+    grid_vx = xv, expand_range(yc)
+    grid_vy = expand_range(xc), yv
+
+    particles = init_particles(
+        backend, nxcell, max_xcell, min_xcell, xvi...
+    )
+
+    # Cell fields -------------------------------
+    Vx = TA(backend)([vx_stream(x, y) for x in grid_vx[1], y in grid_vx[2]])
+    Vy = TA(backend)([vy_stream(x, y) for x in grid_vy[1], y in grid_vy[2]])
+    T = TA(backend)([y for x in xv, y in yv])
+    T0 = deepcopy(T)
+    V = Vx, Vy
+
+    nx_v = (size(T, 1) - 2) * dims[1]
+    ny_v = (size(T, 2) - 2) * dims[2]
+    T_v = zeros(nx_v, ny_v)
+    T_nohalo = TA(backend)(zeros(size(T) .- 2))
+
+    dt = mapreduce(x -> x[1] / MPI.Allreduce(maximum(abs.(x[2])), MPI.MAX, MPI.COMM_WORLD), min, zip(dxi, V)) / 2
+
+    # Advection test
+    particle_args = pT, = init_cell_arrays(particles, Val(1))
+    grid2particle!(pT, xvi, T, particles)
+
+    niter = 250
+    for iter in 1:niter
+        me == 0 && @show iter
+
+        # advect particles
+        advection!(particles, RungeKutta2(), V, (grid_vx, grid_vy), dt)
+
+        # update halos
+        update_cell_halo!(particles.coords...)
+        update_cell_halo!(particle_args...)
+        update_cell_halo!(particles.index)
+        # shuffle particles
+        move_particles!(particles, xvi, particle_args)
+        # interpolate T from particle to grid
+        particle2grid!(T, pT, xvi, particles)
+
+        @views T_nohalo .= T[2:(end - 1), 2:(end - 1)]
+        gather!(Array(T_nohalo), T_v)
+
+        # if me == 0 && iter % 1 == 0
+        #     x_global = range(0, Lx, length = size(T_v, 1))
+        #     y_global = range(0, Ly, length = size(T_v, 2))
+        #     f, ax, = heatmap(x_global, y_global, T_v)
+        #     w = 0.504
+        #     offset = 0.5 - (w - 0.5)
+        #     lines!(
+        #         ax,
+        #         [0, w, w, 0, 0],
+        #         [0, 0, w, w, 0],
+        #         linewidth = 3
+        #     )
+        #     lines!(
+        #         ax,
+        #         [0, w, w, 0, 0] .+ offset,
+        #         [0, 0, w, w, 0],
+        #         linewidth = 3
+        #     )
+        #     lines!(
+        #         ax,
+        #         [0, w, w, 0, 0] .+ offset,
+        #         [0, 0, w, w, 0] .+ offset,
+        #         linewidth = 3
+        #     )
+        #     lines!(
+        #         ax,
+        #         [0, w, w, 0, 0],
+        #         [0, 0, w, w, 0] .+ offset,
+        #         linewidth = 3
+        #     )
+
+        #     save("figs/T_MPI_$iter.png", f)
+        # end
+
+        # px = particles.coords[1].data[:]
+        # py = particles.coords[2].data[:]
+        # idx = particles.index.data[:]
+        # f = scatter(px[idx], py[idx], color=:black)
+        # save("figs/particles_$(iter)_$(me).png", f)
+    end
+
+    # f, ax, = heatmap(xvi..., T, colormap=:batlow)
+    # streamplot!(ax, g, xvi...)
+    # f
+    return finalize_global_grid()
+
+end
+
+main()
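
For local debugging, a minimal sketch of launching the new reference test outside the CSCS SLURM pipeline; it assumes a working `mpiexec` launcher on the PATH and one CUDA-capable GPU visible per rank, since the script hard-codes `const backend = CUDABackend`. The rank count below is illustrative (the CI job itself uses SLURM_JOB_NUM_NODES=2 with SLURM_NTASKS_PER_NODE=4):

    # Hypothetical local launch, not part of the patch above.
    # Assumes `mpiexec` is available and each rank can see a CUDA device.
    nranks = 4  # adjust to local hardware
    run(`mpiexec -n $nranks julia --project=. scripts/temperature_advection_MPI_ci.jl`)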