diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index e7adc177e..913b7c571 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -162,8 +162,8 @@ steps:
           codecov: true
     agents:
       queue: "juliaecosystem"
-      os: linux
-      arch: x86_64
+      os: macos
+      arch: aarch64
     env:
       CI_USE_OPENCL: "1"
 
diff --git a/docs/make.jl b/docs/make.jl
index 159f7aa8d..fffad5017 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -26,7 +26,10 @@ makedocs(;
         "Scopes" => "scopes.md",
         "Processors" => "processors.md",
         "Task Queues" => "task-queues.md",
-        "Datadeps" => "datadeps.md",
+        "Datadeps" => [
+            "Basics" => "datadeps.md",
+            "Stencils" => "stencils.md",
+        ],
         "GPUs" => "gpu.md",
         "Option Propagation" => "propagation.md",
         "Logging and Visualization" => [
diff --git a/docs/src/index.md b/docs/src/index.md
index 1d59189f1..4b79aaa72 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -361,6 +361,39 @@ DA = rand(Blocks(32, 32), 256, 128)
 collect(DA) # returns a `Matrix{Float64}`
 ```
 
+-----
+
+## Quickstart: Stencil Operations
+
+Dagger's `@stencil` macro allows for easy specification of stencil operations on `DArray`s, often used in simulations and image processing. These operations typically involve updating an element based on the values of its neighbors.
+
+For more details: [Stencil Operations](@ref)
+
+### Applying a Simple Stencil
+
+Here's how to apply a stencil that averages each element with its immediate neighbors, using a `Wrap` boundary condition (where neighbor accesses at the array edges wrap around).
+
+```julia
+using Dagger
+import Dagger: @stencil, Wrap
+
+# Create a 5x5 DArray, partitioned into 2x2 blocks
+A = rand(Blocks(2, 2), 5, 5)
+B = zeros(Blocks(2,2), 5, 5)
+
+Dagger.spawn_datadeps() do
+    @stencil begin
+        # For each element in A, calculate the sum of its 3x3 neighborhood
+        # (including itself) and store the average in B.
+        # Values outside the array bounds are determined by Wrap().
+        B[idx] = sum(@neighbors(A[idx], 1, Wrap())) / 9.0
+    end
+end
+
+# B now contains the averaged values.
+```
+In this example, `idx` refers to the coordinates of each element being processed. `@neighbors(A[idx], 1, Wrap())` fetches the 3x3 neighborhood around `A[idx]`. The `1` indicates a neighborhood distance of 1 from the central element, and `Wrap()` specifies the boundary behavior.
+
 ## Quickstart: Datadeps
 
 Datadeps is a feature in Dagger.jl that facilitates parallelism control within designated regions, allowing tasks to write to their arguments while ensuring dependencies are respected.
diff --git a/docs/src/stencils.md b/docs/src/stencils.md
new file mode 100644
index 000000000..f7c0b40a5
--- /dev/null
+++ b/docs/src/stencils.md
@@ -0,0 +1,183 @@
+# Stencil Operations
+
+The `@stencil` macro in Dagger.jl provides a convenient way to perform stencil computations on `DArray`s. It operates within a `Dagger.spawn_datadeps()` block and allows you to define operations that apply to each element of a `DArray`, potentially accessing values from each element's neighbors.
+
+## Basic Usage
+
+The fundamental structure of a `@stencil` block involves iterating over an implicit index, named `idx` in the following example, which represents the coordinates of an element in the processed `DArray`s.
+
+```julia
+using Dagger
+import Dagger: @stencil, Wrap, Pad
+
+# Initialize a DArray
+A = zeros(Blocks(2, 2), Int, 4, 4)
+
+Dagger.spawn_datadeps() do
+    @stencil begin
+        A[idx] = 1 # Assign 1 to every element of A
+    end
+end
+
+@assert all(collect(A) .== 1)
+```
+
+In this example, `A[idx] = 1` is executed for each chunk of `A`. The `idx` variable corresponds to the indices within each chunk.
+
+## Neighborhood Access with `@neighbors`
+
+The true power of stencils comes from accessing neighboring elements. The `@neighbors` macro facilitates this.
+
+`@neighbors(array[idx], distance, boundary_condition)`
+
+- `array[idx]`: The array and current index from which to find neighbors.
+- `distance`: An integer specifying the extent of the neighborhood (e.g., `1` for a 3x3 neighborhood in 2D).
+- `boundary_condition`: Defines how to handle accesses beyond the array boundaries. Available conditions are:
+    - `Wrap()`: Wraps around to the other side of the array.
+    - `Pad(value)`: Pads with a specified `value`.
+
+### Example: Averaging Neighbors with `Wrap`
+
+```julia
+import Dagger: Wrap
+
+# Initialize a DArray
+A = ones(Blocks(1, 1), Int, 3, 3)
+A[2,2] = 10 # Central element has a different value
+B = zeros(Blocks(1, 1), Float64, 3, 3)
+
+Dagger.spawn_datadeps() do
+    @stencil begin
+        # Calculate the average of the 3x3 neighborhood (including the center)
+        B[idx] = sum(@neighbors(A[idx], 1, Wrap())) / 9.0
+    end
+end
+
+# Manually calculate expected B for verification
+expected_B = zeros(Float64, 3, 3)
+A_collected = collect(A)
+for r in 1:3, c in 1:3
+    local_sum = 0.0
+    for dr in -1:1, dc in -1:1
+        nr, nc = mod1(r+dr, 3), mod1(c+dc, 3)
+        local_sum += A_collected[nr, nc]
+    end
+    expected_B[r,c] = local_sum / 9.0
+end
+
+@assert collect(B) ≈ expected_B
+```
+
+### Example: Convolution with `Pad`
+
+```julia
+import Dagger: Pad
+
+# Initialize a DArray
+A = ones(Blocks(2, 2), Int, 4, 4)
+B = zeros(Blocks(2, 2), Int, 4, 4)
+
+Dagger.spawn_datadeps() do
+    @stencil begin
+        B[idx] = sum(@neighbors(A[idx], 1, Pad(0))) # Pad with 0
+    end
+end
+
+# Expected result for a 3x3 sum filter with zero padding
+expected_B_padded = [
+    4 6 6 4;
+    6 9 9 6;
+    6 9 9 6;
+    4 6 6 4
+]
+@assert collect(B) == expected_B_padded
+```
+
+## Sequential Semantics
+
+Expressions within a `@stencil` block are executed sequentially in terms of their effect on the data. This means that the result of one statement is visible to the subsequent statements, as if they were applied "all at once" across all indices before the next statement begins.
+
+```julia
+A = zeros(Blocks(2, 2), Int, 4, 4)
+B = zeros(Blocks(2, 2), Int, 4, 4)
+
+Dagger.spawn_datadeps() do
+    @stencil begin
+        A[idx] = 1  # First, A is initialized
+        B[idx] = A[idx] * 2       # Then, B is computed using the new values of A
+    end
+end
+
+expected_A = [1 for r in 1:4, c in 1:4]
+expected_B_seq = expected_A .* 2
+
+@assert collect(A) == expected_A
+@assert collect(B) == expected_B_seq
+```
+
+## Operations on Multiple `DArray`s
+
+You can read from and write to multiple `DArray`s within a single `@stencil` block, provided they have compatible chunk structures.
+
+```julia
+A = ones(Blocks(1, 1), Int, 2, 2)
+B = DArray(fill(3, 2, 2), Blocks(1, 1))
+C = zeros(Blocks(1, 1), Int, 2, 2)
+
+Dagger.spawn_datadeps() do
+    @stencil begin
+        C[idx] = A[idx] + B[idx]
+    end
+end
+@assert all(collect(C) .== 4)
+```
+
+## Example: Game of Life
+
+The following demonstrates a more complex example: Conway's Game of Life.
+
+```julia
+# Ensure Plots and other necessary packages are available for the example
+using Plots
+
+N = 27 # Size of one dimension of a tile
+nt = 3 # Number of tiles in each dimension (results in nt x nt grid of tiles)
+niters = 10 # Number of iterations for the animation
+
+tiles = zeros(Blocks(N, N), Bool, N*nt, N*nt)
+outputs = zeros(Blocks(N, N), Bool, N*nt, N*nt)
+
+# Create a fun initial state (e.g., a glider and some random noise)
+tiles[13, 14] = true
+tiles[14, 14] = true
+tiles[15, 14] = true
+tiles[15, 15] = true
+tiles[14, 16] = true
+# Add some random noise in one of the tiles
+@view(tiles[(2N+1):3N, (2N+1):3N]) .= rand(Bool, N, N)
+
+
+
+anim = @animate for _ in 1:niters
+    Dagger.spawn_datadeps() do
+        @stencil begin
+            outputs[idx] = begin
+                nhood = @neighbors(tiles[idx], 1, Wrap())
+                neighs = sum(nhood) - tiles[idx] # Sum neighborhood, but subtract own value
+                if tiles[idx] && neighs < 2
+                    0 # Dies of underpopulation
+                elseif tiles[idx] && neighs > 3
+                    0 # Dies of overpopulation
+                elseif !tiles[idx] && neighs == 3
+                    1 # Becomes alive by reproduction
+                else
+                    tiles[idx] # Keeps its prior value
+                end
+            end
+            tiles[idx] = outputs[idx] # Update tiles for the next iteration
+        end
+    end
+    heatmap(Int.(collect(outputs))) # Generate a heatmap visualization
+end
+path = mp4(anim; fps=5, show_msg=true).filename # Create an animation of the heatmaps over time
+```
diff --git a/ext/CUDAExt.jl b/ext/CUDAExt.jl
index 0e12b4ca2..6b8c61f9a 100644
--- a/ext/CUDAExt.jl
+++ b/ext/CUDAExt.jl
@@ -252,24 +252,6 @@ Dagger.move(from_proc::CPUProc, to_proc::CuArrayDeviceProc, x::Function) = x
 Dagger.move(from_proc::CPUProc, to_proc::CuArrayDeviceProc, x::Chunk{T}) where {T<:Function} =
     Dagger.move(from_proc, to_proc, fetch(x))
 
-# Adapt BLAS/LAPACK functions
-import LinearAlgebra: BLAS, LAPACK
-for lib in [BLAS, LAPACK]
-    for name in names(lib; all=true)
-        name == nameof(lib) && continue
-        startswith(string(name), '#') && continue
-        endswith(string(name), '!') || continue
-
-        for culib in [CUBLAS, CUSOLVER]
-            if name in names(culib; all=true)
-                fn = getproperty(lib, name)
-                cufn = getproperty(culib, name)
-                @eval Dagger.move(from_proc::CPUProc, to_proc::CuArrayDeviceProc, ::$(typeof(fn))) = $cufn
-            end
-        end
-    end
-end
-
 # Task execution
 function Dagger.execute!(proc::CuArrayDeviceProc, f, args...; kwargs...)
     @nospecialize f args kwargs
@@ -291,6 +273,44 @@ function Dagger.execute!(proc::CuArrayDeviceProc, f, args...; kwargs...)
     end
 end
 
+# Adapt BLAS/LAPACK functions
+import LinearAlgebra: BLAS, LAPACK
+for lib in [BLAS, LAPACK]
+    for name in names(lib; all=true)
+        name == nameof(lib) && continue
+        startswith(string(name), '#') && continue
+        endswith(string(name), '!') || continue
+
+        for culib in [CUBLAS, CUSOLVER]
+            if name in names(culib; all=true)
+                fn = getproperty(lib, name)
+                cufn = getproperty(culib, name)
+                @eval Dagger.move(from_proc::CPUProc, to_proc::CuArrayDeviceProc, ::$(typeof(fn))) = $cufn
+            end
+        end
+    end
+end
+
+CuArray(H::Dagger.HaloArray) = convert(CuArray, H)
+Base.convert(::Type{C}, H::Dagger.HaloArray) where {C<:CuArray} =
+    Dagger.HaloArray(C(H.center),
+                     C.(H.edges),
+                     C.(H.corners),
+                     H.halo_width)
+Adapt.adapt_structure(to::CUDA.KernelAdaptor, H::Dagger.HaloArray) =
+    Dagger.HaloArray(adapt(to, H.center),
+                     adapt.(Ref(to), H.edges),
+                     adapt.(Ref(to), H.corners),
+                     H.halo_width)
+function Dagger.inner_stencil_proc!(::CuArrayDeviceProc, f, output, read_vars)
+    Dagger.Kernel(_inner_stencil!)(f, output, read_vars; ndrange=size(output))
+    return
+end
+@kernel function _inner_stencil!(f, output, read_vars)
+    idx = @index(Global, Cartesian)
+    f(idx, output, read_vars)
+end
+
 Dagger.gpu_processor(::Val{:CUDA}) = CuArrayDeviceProc
 Dagger.gpu_can_compute(::Val{:CUDA}) = CUDA.has_cuda()
 Dagger.gpu_kernel_backend(::CuArrayDeviceProc) = CUDABackend()
diff --git a/ext/IntelExt.jl b/ext/IntelExt.jl
index 95df7169f..74253007d 100644
--- a/ext/IntelExt.jl
+++ b/ext/IntelExt.jl
@@ -259,6 +259,26 @@ function Dagger.execute!(proc::oneArrayDeviceProc, f, args...; kwargs...)
     end
 end
 
+oneArray(H::Dagger.HaloArray) = convert(oneArray, H)
+Base.convert(::Type{C}, H::Dagger.HaloArray) where {C<:oneArray} =
+    Dagger.HaloArray(C(H.center),
+                     C.(H.edges),
+                     C.(H.corners),
+                     H.halo_width)
+Adapt.adapt_structure(to::oneAPI.KernelAdaptor, H::Dagger.HaloArray) =
+    Dagger.HaloArray(adapt(to, H.center),
+                     adapt.(Ref(to), H.edges),
+                     adapt.(Ref(to), H.corners),
+                     H.halo_width)
+function Dagger.inner_stencil_proc!(::oneArrayDeviceProc, f, output, read_vars)
+    Dagger.Kernel(_inner_stencil!)(f, output, read_vars; ndrange=size(output))
+    return
+end
+@kernel function _inner_stencil!(f, output, read_vars)
+    idx = @index(Global, Cartesian)
+    f(idx, output, read_vars)
+end
+
 Dagger.gpu_processor(::Val{:oneAPI}) = oneArrayDeviceProc
 Dagger.gpu_can_compute(::Val{:oneAPI}) = oneAPI.functional()
 Dagger.gpu_kernel_backend(::oneArrayDeviceProc) = oneAPIBackend()
diff --git a/ext/MetalExt.jl b/ext/MetalExt.jl
index b9b28cccf..50cfc8905 100644
--- a/ext/MetalExt.jl
+++ b/ext/MetalExt.jl
@@ -274,6 +274,21 @@ function Dagger.execute!(proc::MtlArrayDeviceProc, f, args...; kwargs...)
     end
 end
 
+MtlArray(H::Dagger.HaloArray) = convert(MtlArray, H)
+Base.convert(::Type{C}, H::Dagger.HaloArray) where {C<:MtlArray} =
+    Dagger.HaloArray(C(H.center),
+                     C.(H.edges),
+                     C.(H.corners),
+                     H.halo_width)
+function Dagger.inner_stencil_proc!(::MtlArrayDeviceProc, f, output, read_vars)
+    Dagger.Kernel(_inner_stencil!)(f, output, read_vars; ndrange=size(output))
+    return
+end
+@kernel function _inner_stencil!(f, output, read_vars)
+    idx = @index(Global, Cartesian)
+    f(idx, output, read_vars)
+end
+
 function Base.show(io::IO, proc::MtlArrayDeviceProc)
     print(io, "MtlArrayDeviceProc(worker $(proc.owner), device $(something(_get_metal_device(proc)).name))")
 end
@@ -284,7 +299,7 @@ Dagger.gpu_kernel_backend(proc::MtlArrayDeviceProc) = MetalBackend()
 # TODO: Switch devices
 Dagger.gpu_with_device(f, proc::MtlArrayDeviceProc) = f()
 
-function Dagger.gpu_synchronize(proc::MtlArrayDeviceProc)q
+function Dagger.gpu_synchronize(proc::MtlArrayDeviceProc)
     with_context(proc) do
         Metal.synchronize()
     end
diff --git a/ext/OpenCLExt.jl b/ext/OpenCLExt.jl
index ce085d310..fbf73de72 100644
--- a/ext/OpenCLExt.jl
+++ b/ext/OpenCLExt.jl
@@ -242,6 +242,26 @@ function Dagger.execute!(proc::CLArrayDeviceProc, f, args...; kwargs...)
     end
 end
 
+CLArray(H::Dagger.HaloArray) = convert(CLArray, H)
+Base.convert(::Type{C}, H::Dagger.HaloArray) where {C<:CLArray} =
+    Dagger.HaloArray(C(H.center),
+                     C.(H.edges),
+                     C.(H.corners),
+                     H.halo_width)
+Adapt.adapt_structure(to::OpenCL.KernelAdaptor, H::Dagger.HaloArray) =
+    Dagger.HaloArray(adapt(to, H.center),
+                     adapt.(Ref(to), H.edges),
+                     adapt.(Ref(to), H.corners),
+                     H.halo_width)
+function Dagger.inner_stencil_proc!(::CLArrayDeviceProc, f, output, read_vars)
+    Dagger.Kernel(_inner_stencil!)(f, output, read_vars; ndrange=size(output))
+    return
+end
+@kernel function _inner_stencil!(f, output, read_vars)
+    idx = @index(Global, Cartesian)
+    f(idx, output, read_vars)
+end
+
 Dagger.gpu_processor(::Val{:OpenCL}) = CLArrayDeviceProc
 Dagger.gpu_can_compute(::Val{:OpenCL}) = length(cl.platforms()) > 0
 Dagger.gpu_kernel_backend(::CLArrayDeviceProc) = OpenCLBackend()
diff --git a/ext/ROCExt.jl b/ext/ROCExt.jl
index 6aaa5a867..288c4744f 100644
--- a/ext/ROCExt.jl
+++ b/ext/ROCExt.jl
@@ -261,6 +261,21 @@ function Dagger.execute!(proc::ROCArrayDeviceProc, f, args...; kwargs...)
     end
 end
 
+ROCArray(H::Dagger.HaloArray) = convert(ROCArray, H)
+Base.convert(::Type{C}, H::Dagger.HaloArray) where {C<:ROCArray} =
+    Dagger.HaloArray(C(H.center),
+                     C.(H.edges),
+                     C.(H.corners),
+                     H.halo_width)
+function Dagger.inner_stencil_proc!(::ROCArrayDeviceProc, f, output, read_vars)
+    Dagger.Kernel(_inner_stencil!)(f, output, read_vars; ndrange=size(output))
+    return
+end
+@kernel function _inner_stencil!(f, output, read_vars)
+    idx = @index(Global, Cartesian)
+    f(idx, output, read_vars)
+end
+
 Dagger.gpu_processor(::Val{:ROC}) = ROCArrayDeviceProc
 Dagger.gpu_can_compute(::Val{:ROC}) = AMDGPU.functional()
 Dagger.gpu_kernel_backend(proc::ROCArrayDeviceProc) = ROCBackend()
diff --git a/src/Dagger.jl b/src/Dagger.jl
index 9baa504ae..0c3761c44 100644
--- a/src/Dagger.jl
+++ b/src/Dagger.jl
@@ -50,7 +50,8 @@ include("utils/dagdebug.jl")
 include("utils/locked-object.jl")
 include("utils/tasks.jl")
 
-import MacroTools: @capture
+import MacroTools: @capture, prewalk
+
 include("options.jl")
 include("processor.jl")
 include("threadproc.jl")
@@ -76,6 +77,8 @@ include("sch/Sch.jl"); using .Sch
 
 # Data dependency task queue
 include("datadeps.jl")
+include("utils/haloarray.jl")
+include("stencil.jl")
 
 # Streaming
 include("stream.jl")
diff --git a/src/sch/util.jl b/src/sch/util.jl
index 2e090b26c..01138a052 100644
--- a/src/sch/util.jl
+++ b/src/sch/util.jl
@@ -33,6 +33,8 @@ unwrap_nested_exception(err::DTaskFailedException) =
     unwrap_nested_exception(err.ex)
 unwrap_nested_exception(err::TaskFailedException) =
     unwrap_nested_exception(err.t.exception)
+unwrap_nested_exception(err::LoadError) =
+    unwrap_nested_exception(err.error)
 unwrap_nested_exception(err) = err
 
 "Gets a `NamedTuple` of options propagated by `thunk`."
diff --git a/src/stencil.jl b/src/stencil.jl
new file mode 100644
index 000000000..b283c119e
--- /dev/null
+++ b/src/stencil.jl
@@ -0,0 +1,307 @@
+# FIXME: Remove me
+const Read = In
+const Write = Out
+const ReadWrite = InOut
+
+function validate_neigh_dist(neigh_dist)
+    if !(neigh_dist isa Integer)
+        throw(ArgumentError("Neighborhood distance ($neigh_dist) must be an Integer"))
+    end
+    if neigh_dist <= 0
+        throw(ArgumentError("Neighborhood distance ($neigh_dist) must be greater than 0"))
+    end
+end
+function validate_neigh_dist(neigh_dist, size)
+    validate_neigh_dist(neigh_dist)
+    if any(size .< neigh_dist)
+        throw(ArgumentError("Neighborhood distance ($neigh_dist) must not be larger than the chunk size ($size)"))
+    end
+end
+
+# Copy the `neigh_dist`-wide slab of `arr` along `dim` and move it to the current task's processor.
+# `dir == -1` selects the last `neigh_dist` slices along `dim`; `dir == 1` selects the first ones.
+function load_neighbor_edge(arr, dim, dir, neigh_dist)
+    validate_neigh_dist(neigh_dist, size(arr))
+    dir in (-1, 1) || throw(ArgumentError("Edge direction ($dir) must be -1 or 1"))
+    # Every dimension other than `dim` spans its full extent
+    start_idx = CartesianIndex(ntuple(i -> (i == dim && dir == -1) ? (lastindex(arr, i) - neigh_dist + 1) : firstindex(arr, i), ndims(arr)))
+    stop_idx = CartesianIndex(ntuple(i -> (i == dim && dir == 1) ? (firstindex(arr, i) + neigh_dist - 1) : lastindex(arr, i), ndims(arr)))
+    # FIXME: Don't collect
+    return move(task_processor(), collect(@view arr[start_idx:stop_idx]))
+end
+
+function load_neighbor_corner(arr, corner_side, neigh_dist)
+    validate_neigh_dist(neigh_dist, size(arr))
+    start_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? (lastindex(arr, i) - neigh_dist + 1) : firstindex(arr, i), ndims(arr)))
+    stop_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? lastindex(arr, i) : (firstindex(arr, i) + neigh_dist - 1), ndims(arr)))
+    return move(task_processor(), collect(@view arr[start_idx:stop_idx]))
+end
+function select_neighborhood_chunks(chunks, idx, neigh_dist, boundary)
+    validate_neigh_dist(neigh_dist)
+
+    # FIXME: Depends on neigh_dist and chunk size
+    chunk_dist = 1
+    # Get the center
+    accesses = Any[chunks[idx]]
+
+    # Get the edges
+    for dim in 1:ndims(chunks)
+        for dir in (-1, +1)
+            new_idx = idx + CartesianIndex(ntuple(i -> i == dim ? dir*chunk_dist : 0, ndims(chunks)))
+            if is_past_boundary(size(chunks), new_idx)
+                if boundary_has_transition(boundary)
+                    new_idx = boundary_transition(boundary, new_idx, size(chunks))
+                else
+                    new_idx = idx
+                end
+                chunk = chunks[new_idx]
+                push!(accesses, Dagger.@spawn load_boundary_edge(boundary, chunk, dim, dir, neigh_dist))
+            else
+                chunk = chunks[new_idx]
+                push!(accesses, Dagger.@spawn load_neighbor_edge(chunk, dim, dir, neigh_dist))
+            end
+        end
+    end
+
+    # Get the corners
+    for corner_num in 1:(2^ndims(chunks))
+        corner_side = CartesianIndex(reverse(ntuple(ndims(chunks)) do i
+            ((corner_num-1) >> (((ndims(chunks) - i) + 1) - 1)) & 1
+        end))
+        corner_new_idx = CartesianIndex(ntuple(ndims(chunks)) do i
+            corner_shift = iszero(corner_side[i]) ? -1 : 1
+            return idx[i] + corner_shift
+        end)
+        if is_past_boundary(size(chunks), corner_new_idx)
+            if boundary_has_transition(boundary)
+                corner_new_idx = boundary_transition(boundary, corner_new_idx, size(chunks))
+            else
+                corner_new_idx = idx
+            end
+            chunk = chunks[corner_new_idx]
+            push!(accesses, Dagger.@spawn load_boundary_corner(boundary, chunk, corner_side, neigh_dist))
+        else
+            chunk = chunks[corner_new_idx]
+            push!(accesses, Dagger.@spawn load_neighbor_corner(chunk, corner_side, neigh_dist))
+        end
+    end
+
+    @assert length(accesses) == 1+2*ndims(chunks)+2^ndims(chunks) "Accesses mismatch: $(length(accesses))"
+    return accesses
+end
+function build_halo(neigh_dist, boundary, center, all_neighbors...)
+    N = ndims(center) # `all_neighbors` holds the 2N edges first, then the 2^N corners
+    edges = all_neighbors[1:(2*N)]
+    corners = all_neighbors[((2*N)+1):end] # corners start right after the 2N edges (2N == 2^N only for N <= 2)
+    @assert length(edges) == 2*N && length(corners) == 2^N "Halo mismatch: edges=$(length(edges)) corners=$(length(corners))"
+    return HaloArray(center, (edges...,), (corners...,), ntuple(_->neigh_dist, N))
+end
+function load_neighborhood(arr::HaloArray{T,N}, idx) where {T,N}
+    @assert all(arr.halo_width .== arr.halo_width[1])
+    neigh_dist = arr.halo_width[1]
+    start_idx = idx - CartesianIndex(ntuple(_->neigh_dist, ndims(arr)))
+    stop_idx = idx + CartesianIndex(ntuple(_->neigh_dist, ndims(arr)))
+    return @view arr[start_idx:stop_idx]
+end
+function inner_stencil!(f, output, read_vars)
+    processor = task_processor()
+    inner_stencil_proc!(processor, f, output, read_vars)
+end
+# Non-KA (for CPUs)
+function inner_stencil_proc!(::ThreadProc, f, output, read_vars)
+    for idx in CartesianIndices(output)
+        f(idx, output, read_vars)
+    end
+    return
+end
+
+is_past_boundary(size, idx) = any(ntuple(i -> idx[i] < 1 || idx[i] > size[i], length(size)))
+
+struct Wrap end
+boundary_has_transition(::Wrap) = true
+boundary_transition(::Wrap, idx, size) =
+    CartesianIndex(ntuple(i -> mod1(idx[i], size[i]), length(size)))
+load_boundary_edge(::Wrap, arr, dim, dir, neigh_dist) = load_neighbor_edge(arr, dim, dir, neigh_dist)
+load_boundary_corner(::Wrap, arr, corner_side, neigh_dist) = load_neighbor_corner(arr, corner_side, neigh_dist)
+
+struct Pad{T}
+    padval::T
+end
+boundary_has_transition(::Pad) = false
+function load_boundary_edge(pad::Pad, arr, dim, dir, neigh_dist)
+    if dir == -1
+        start_idx = CartesianIndex(ntuple(i -> i == dim ? (lastindex(arr, i) - neigh_dist + 1) : firstindex(arr, i), ndims(arr)))
+        stop_idx = CartesianIndex(ntuple(i -> i == dim ? lastindex(arr, i) : lastindex(arr, i), ndims(arr)))
+    elseif dir == 1
+        start_idx = CartesianIndex(ntuple(i -> i == dim ? firstindex(arr, i) : firstindex(arr, i), ndims(arr)))
+        stop_idx = CartesianIndex(ntuple(i -> i == dim ? (firstindex(arr, i) + neigh_dist - 1) : lastindex(arr, i), ndims(arr)))
+    end
+    edge_size = ntuple(i -> length(start_idx[i]:stop_idx[i]), ndims(arr))
+    # FIXME: return Fill(pad.padval, edge_size)
+    return move(task_processor(), fill(pad.padval, edge_size))
+end
+function load_boundary_corner(pad::Pad, arr, corner_side, neigh_dist)
+    start_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? (lastindex(arr, i) - neigh_dist + 1) : firstindex(arr, i), ndims(arr)))
+    stop_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? lastindex(arr, i) : (firstindex(arr, i) + neigh_dist - 1), ndims(arr)))
+    corner_size = ntuple(i -> length(start_idx[i]:stop_idx[i]), ndims(arr))
+    # FIXME: return Fill(pad.padval, corner_size)
+    return move(task_processor(), fill(pad.padval, corner_size))
+end
+
+"""
+    @stencil begin body end
+
+Allows the execution of stencil operations within a `spawn_datadeps` region.
+The `idx` variable is used to iterate over one or more `DArray`s. An example
+usage may look like:
+
+```julia
+import Dagger: @stencil, Wrap
+
+A = zeros(Blocks(3, 3), Int, 9, 9)
+A[5, 5] = 1
+B = zeros(Blocks(3, 3), Int, 9, 9)
+Dagger.spawn_datadeps() do
+    @stencil begin
+        # Increment all values by 1
+        A[idx] = A[idx] + 1
+        # Sum values of all neighbors with self and write to B
+        B[idx] = sum(@neighbors(A[idx], 1, Wrap()))
+        # Copy B back to A
+        A[idx] = B[idx]
+    end
+end
+```
+
+Each expression within an `@stencil` region that performs an in-place indexing
+expression like `A[idx] = ...` is transformed into a set of tasks that operate
+on each chunk of `A` or any other arrays specified as `A[idx]`; within each
+task, elements of that chunk of `A` can be accessed. Elements of multiple
+`DArray`s can be accessed, such as `B[idx]`, so long as `B` has the same size,
+shape, and chunk layout as `A`.
+
+Additionally, the `@neighbors` macro can be used to access a neighborhood of
+values around `A[idx]`, at a configurable distance (in this case, 1 element
+distance) and with various kinds of boundary conditions (in this case, `Wrap()`
+specifies wrapping behavior on the boundaries). Neighborhoods are computed with
+respect to neighboring chunks as well - if a neighborhood would overflow from
+the current chunk into a neighboring chunk, values from that neighboring chunk
+will be included in the neighborhood.
+
+Note that, while `@stencil` may look like a `for` loop, it does not follow the
+same semantics; in particular, an expression within `@stencil` occurs "all at
+once" (across all indices) before the next expression occurs. This means that
+`A[idx] = A[idx] + 1` increments the values `A` by 1, which occurs before
+`B[idx] = sum(@neighbors(A[idx], 1, Wrap()))` writes the sum of neighbors for
+all `idx` values into `B[idx]`, and that occurs before any of the values are
+copied to `A` in `A[idx] = B[idx]`. Of course, pipelining and other optimizations
+may still occur, so long as they respect the sequential nature of `@stencil`
+(just like with other operations in `spawn_datadeps`). Due to this behavior,
+expressions like `A[idx] = sum(@neighbors(A[idx], 1, Wrap()))` are not valid,
+as that would currently cause race conditions and lead to undefined behavior.
+"""
+macro stencil(orig_ex)
+    if !Meta.isexpr(orig_ex, :block)
+        throw(ArgumentError("Invalid stencil block: $orig_ex"))
+    end
+
+    # Collect access pattern information
+    inners = []
+    all_accessed_vars = Set{Symbol}()
+    for inner_ex in orig_ex.args
+        inner_ex isa LineNumberNode && continue
+        if !@capture(inner_ex, write_ex_ = read_ex_)
+            throw(ArgumentError("Invalid update expression: $inner_ex"))
+        end
+        if !@capture(write_ex, write_var_[write_idx_])
+            throw(ArgumentError("Update expression requires a write: $write_ex"))
+        end
+        accessed_vars = Set{Symbol}()
+        read_vars = Set{Symbol}()
+        neighborhoods = Dict{Symbol, Tuple{Any, Any}}()
+        push!(accessed_vars, write_var)
+        prewalk(read_ex) do read_inner_ex
+            if @capture(read_inner_ex, read_var_[read_idx_]) && read_idx == write_idx
+                push!(accessed_vars, read_var)
+                push!(read_vars, read_var)
+            elseif @capture(read_inner_ex, @neighbors(read_var_[read_idx_], neigh_dist_, boundary_))
+                if read_idx != write_idx
+                    throw(ArgumentError("Neighborhood access must be at the same index as the write: $read_inner_ex"))
+                end
+                if write_var == read_var
+                    throw(ArgumentError("Cannot write to the same variable as the neighborhood access: $read_inner_ex"))
+                end
+                push!(accessed_vars, read_var)
+                push!(read_vars, read_var)
+                neighborhoods[read_var] = (neigh_dist, boundary)
+            end
+            return read_inner_ex
+        end
+        union!(all_accessed_vars, accessed_vars)
+        push!(inners, (;inner_ex, accessed_vars, write_var, write_idx, read_ex, read_vars, neighborhoods))
+    end
+
+    # Codegen update functions
+    final_ex = Expr(:block)
+    for (;inner_ex, accessed_vars, write_var, write_idx, read_ex, read_vars, neighborhoods) in inners
+        # Generate a variable for chunk access
+        @gensym chunk_idx
+
+        # Reads of the written array go through `write_var` itself, so keep it out of the read-only arguments
+        arg_read_vars = setdiff(read_vars, (write_var,))
+
+        # Generate function with transformed body
+        @gensym inner_vars inner_index_var
+        new_inner_ex_body = prewalk(inner_ex) do old_inner_ex
+            if @capture(old_inner_ex, read_var_[read_idx_]) && read_idx == write_idx
+                # Direct access
+                if read_var == write_var
+                    return :($write_var[$inner_index_var])
+                else
+                    return :($inner_vars.$read_var[$inner_index_var])
+                end
+            elseif @capture(old_inner_ex, @neighbors(read_var_[read_idx_], neigh_dist_, boundary_))
+                # Neighborhood access
+                return :($load_neighborhood($inner_vars.$read_var, $inner_index_var))
+            end
+            return old_inner_ex
+        end
+        new_inner_f = :(($inner_index_var, $write_var, $inner_vars)->$new_inner_ex_body)
+        new_inner_ex = quote
+            $inner_vars = (;$(arg_read_vars...))
+            $inner_stencil!($new_inner_f, $write_var, $inner_vars)
+        end
+        inner_fn = Expr(:->, Expr(:tuple, Expr(:parameters, write_var, arg_read_vars...)), new_inner_ex)
+
+        # Generate @spawn call with appropriate vars and deps
+        deps_ex = Any[]
+        if write_var in read_vars
+            push!(deps_ex, Expr(:kw, write_var, :($ReadWrite($chunks($write_var)[$chunk_idx]))))
+        else
+            push!(deps_ex, Expr(:kw, write_var, :($Write($chunks($write_var)[$chunk_idx]))))
+        end
+        neighbor_copy_all_ex = Expr(:block)
+        for read_var in arg_read_vars
+            if read_var in keys(neighborhoods)
+                # Generate a neighborhood copy operation
+                neigh_dist, boundary = neighborhoods[read_var]
+                @gensym neighbor_copy_var
+                push!(neighbor_copy_all_ex.args, :($neighbor_copy_var = Dagger.@spawn name="stencil_build_halo" $build_halo($neigh_dist, $boundary, map($Read, $select_neighborhood_chunks($chunks($read_var), $chunk_idx, $neigh_dist, $boundary))...)))
+                push!(deps_ex, Expr(:kw, read_var, :($Read($neighbor_copy_var))))
+            else
+                push!(deps_ex, Expr(:kw, read_var, :($Read($chunks($read_var)[$chunk_idx]))))
+            end
+        end
+        spawn_ex = :(Dagger.@spawn name="stencil_inner_fn" $inner_fn(;$(deps_ex...)))
+
+        # Generate loop
+        push!(final_ex.args, quote
+            for $chunk_idx in $CartesianIndices($chunks($write_var))
+                $neighbor_copy_all_ex
+                $spawn_ex
+            end
+        end)
+    end
+
+    return esc(final_ex)
+end
diff --git a/src/utils/haloarray.jl b/src/utils/haloarray.jl
new file mode 100644
index 000000000..c27182142
--- /dev/null
+++ b/src/utils/haloarray.jl
@@ -0,0 +1,102 @@
+# HaloArray: an N-dimensional array that stores its interior (`center`) separately from its halo (`edges` and `corners`)
+struct HaloArray{T,N,E,C,A,EAT<:Tuple,CAT<:Tuple} <: AbstractArray{T,N} # E/C: edge/corner counts, set by the outer constructor
+    center::A # interior data; an index inside 1:size(center, d) in every dimension d resolves here
+    edges::EAT # tuple of edge buffers; slot 2d-1 is the low side and slot 2d the high side of dimension d
+    corners::CAT # tuple of corner buffers; slot = 1 + sum of 2^(d-1) over the dimensions d that are on the high side
+    halo_width::NTuple{N,Int} # halo thickness per dimension; the axes extend this far beyond the center on both sides
+end
+
+# Allocate an uninitialized HaloArray: a center of size `center_size`, 2N edge buffers and 2^N corner buffers
+function HaloArray{T,N}(center_size::NTuple{N,Int}, halo_width::NTuple{N,Int}) where {T,N}
+    # Edge slots 2d-1 and 2d both belong to dimension d: same shape as the center, but only halo_width[d] thick along d
+    edges = ntuple(2N) do slot
+        d = cld(slot, 2)
+        edge_size = ntuple(k -> k == d ? halo_width[d] : center_size[k], N)
+        return Array{T,N}(undef, edge_size)
+    end
+    # Every corner buffer spans the halo width in all dimensions
+    corners = ntuple(_ -> Array{T,N}(undef, halo_width), 2^N)
+    center = Array{T,N}(undef, center_size)
+    return HaloArray(center, edges, corners, halo_width)
+end
+
+HaloArray(center::AbstractArray{T,N}, edges::Tuple, corners::Tuple, halo_width::NTuple{N,Int}) where {T,N} = # derive every type parameter from the arguments
+    HaloArray{T,N,length(edges),length(corners),typeof(center),typeof(edges),typeof(corners)}(center, edges, corners, halo_width)
+
+Base.size(tile::HaloArray) = broadcast((n, w) -> n + 2w, size(tile.center), tile.halo_width) # center extent plus one halo on each side
+function Base.axes(tile::HaloArray{T,N,H}) where {T,N,H}
+    # Extend the center's 1-based range by the halo width on both sides of every dimension
+    widths, inner = tile.halo_width, tile.center
+    return ntuple(N) do d
+        (1 - widths[d]):(size(inner, d) + widths[d])
+    end
+end
+# Allocate an uninitialized HaloArray whose center has size `dims`, reusing `tile`'s halo width (same eltype and rank only)
+function Base.similar(tile::HaloArray{T,N,H}, ::Type{T}, dims::NTuple{N,Int}) where {T,N,H}
+    # Only `HaloArray{T,N}` has an allocating method; `HaloArray{T,N,H}(dims, halo_width)` would be a MethodError
+    return HaloArray{T,N}(dims, tile.halo_width)
+end
+# Copy a HaloArray: the center, every edge buffer and every corner buffer are duplicated, so the result shares no storage
+function Base.copy(tile::HaloArray{T,N,H}) where {T,N,H}
+    edges = map(copy, tile.edges)
+    corners = map(copy, tile.corners) # corners must be carried over too; the struct has four fields
+    return HaloArray(copy(tile.center), edges, corners, tile.halo_width) # 4-arg outer constructor re-derives the type parameters
+end
+
+# Scalar read: route index `I` to the center, a corner buffer, or an edge buffer, depending on which dimensions leave the center
+function Base.getindex(tile::HaloArray{T,N}, I::Vararg{Int,N}) where {T,N}
+    checkbounds(tile, I...) # bounds are checked against the halo-extended axes
+    if all(1 .<= I .<= size(tile.center)) # Center: inside the center in every dimension
+        return tile.center[I...]
+    elseif !any(1 .<= I .<= size(tile.center))
+        # Corner: outside the center in every dimension
+        # N.B. Corner indexes are in binary, e.g. 0b01, 0b10, 0b11 (bit i is set when I[i] lies past the high side)
+        corner_idx = sum(ntuple(i->(I[i] < 1 ? 0 : 1) * (2^(i-1)), N)) + 1
+        corner_offset = CartesianIndex(I) + CartesianIndex(ntuple(i->(I[i] < 1 ? tile.halo_width[i] : -size(tile.center, i)), N)) # shift into the corner buffer's own 1-based indices
+        return tile.corners[corner_idx][corner_offset]
+    else # Edge: outside the center in some, but not all, dimensions
+        for d in 1:N # the first out-of-range dimension selects the edge buffer; NOTE(review): for N >= 3 a second out-of-range dimension keeps its raw index — confirm this is intended
+            if I[d] < 1 # low side of dimension d
+                halo_idx = ntuple(i->i == d ? I[i] + tile.halo_width[i] : I[i], N)
+                return tile.edges[(2*(d-1))+1][halo_idx...]
+            elseif I[d] > size(tile.center, d) # high side of dimension d
+                halo_idx = ntuple(i->i == d ? I[i] - size(tile.center, d) : I[i], N)
+                return tile.edges[(2*(d-1))+2][halo_idx...]
+            end
+        end
+    end
+    error("Index out of bounds") # reached only if none of the branches above returned
+end
+
+# Scalar write: route index `I` to the center, a corner buffer, or an edge buffer, mirroring the lookup done by getindex
+function Base.setindex!(tile::HaloArray{T,N}, value, I::Vararg{Int,N}) where {T,N}
+    checkbounds(tile, I...) # bounds are checked against the halo-extended axes
+    if all(1 .<= I .<= size(tile.center))
+        # Center: inside the center in every dimension
+        return tile.center[I...] = value
+    elseif !any(1 .<= I .<= size(tile.center))
+        # Corner: outside the center in every dimension
+        # N.B. Corner indexes are in binary, e.g. 0b01, 0b10, 0b11 (bit i is set when I[i] lies past the high side)
+        corner_idx = sum(ntuple(i->(I[i] < 1 ? 0 : 1) * (2^(i-1)), N)) + 1
+        corner_offset = CartesianIndex(I) + CartesianIndex(ntuple(i->(I[i] < 1 ? tile.halo_width[i] : -size(tile.center, i)), N)) # shift into the corner buffer's own 1-based indices
+        return tile.corners[corner_idx][corner_offset] = value
+    else
+        # Edge: outside the center in some, but not all, dimensions
+        for d in 1:N # the first out-of-range dimension selects the edge buffer; NOTE(review): for N >= 3 a second out-of-range dimension keeps its raw index — confirm this is intended
+            if I[d] < 1 # low side of dimension d
+                halo_idx = ntuple(i->i == d ? I[i] + tile.halo_width[i] : I[i], N)
+                return tile.edges[(2*(d-1))+1][halo_idx...] = value
+            elseif I[d] > size(tile.center, d) # high side of dimension d
+                halo_idx = ntuple(i->i == d ? I[i] - size(tile.center, d) : I[i], N)
+                return tile.edges[(2*(d-1))+2][halo_idx...] = value
+            end
+        end
+    end
+    error("Index out of bounds") # reached only if none of the branches above returned
+end
+
+function Adapt.adapt_structure(to, H::Dagger.HaloArray) # rebuild H with every buffer passed through Adapt.adapt(to, ...)
+    convert_buffer(x) = Adapt.adapt(to, x)
+    # edges and corners are tuples, so `map` adapts each buffer and yields a tuple again
+    return HaloArray(convert_buffer(H.center), map(convert_buffer, H.edges), map(convert_buffer, H.corners), H.halo_width)
+end
\ No newline at end of file
diff --git a/test/array/stencil.jl b/test/array/stencil.jl
new file mode 100644
index 000000000..69b7bb069
--- /dev/null
+++ b/test/array/stencil.jl
@@ -0,0 +1,180 @@
+import Dagger: @stencil, Wrap, Pad
+
+function test_stencil() # runs the whole @stencil suite under whatever Dagger options the caller has active
+    @testset "Simple assignment" begin
+        A = zeros(Blocks(2, 2), Int, 4, 4)
+        Dagger.spawn_datadeps() do
+            @stencil begin
+                A[idx] = 1
+            end
+        end
+        @test all(collect(A) .== 1)
+    end
+
+    @testset "Wrap boundary" begin
+        A = zeros(Int, 4, 4)
+        A[1,1] = 10
+        A = DArray(A, Blocks(2, 2))
+        B = zeros(Blocks(2, 2), Int, 4, 4)
+        Dagger.spawn_datadeps() do
+            @stencil begin
+                B[idx] = sum(@neighbors(A[idx], 1, Wrap()))
+            end
+        end
+        # Expected result of a 3x3 neighborhood sum with wrap-around boundaries.
+        # The corner element (1,1) sums itself (10), its 3 in-bounds neighbors and 5 wrapped-around neighbors.
+        # Its neighbors are A[4,4], A[4,1], A[4,2], A[1,4], A[1,2], A[2,4], A[2,1], A[2,2].
+        # Only A[1,1] is non-zero (10), so B[1,1] = 10.
+        # B[1,2] = 10 because A[1,1] is a direct neighbor
+        # B[2,1] = 10 because A[1,1] is a direct neighbor
+        # B[2,2] = 10 because A[1,1] is a direct neighbor
+        # B[4,4] = 10 because A[1,1] is reached by wrapping around in both dimensions
+        # ... and likewise for every other element whose wrapped neighborhood contains A[1,1]; the rest are 0
+        expected_B_calc = zeros(Int, 4, 4)
+        for i in 1:4, j in 1:4
+            sum_val = 0
+            for ni in -1:1, nj in -1:1
+                # Apply wrap around logic for neighbors
+                row = mod1(i+ni, 4)
+                col = mod1(j+nj, 4)
+                if row == 1 && col == 1 # Check if the wrapped neighbor is A[1,1]
+                    sum_val += 10
+                end
+            end
+            expected_B_calc[i,j] = sum_val
+        end
+        @test collect(B) == expected_B_calc
+    end
+
+    @testset "Pad boundary" begin
+        A = ones(Blocks(2, 2), Int, 4, 4)
+        B = zeros(Blocks(2, 2), Int, 4, 4)
+        Dagger.spawn_datadeps() do
+            @stencil begin
+                B[idx] = sum(@neighbors(A[idx], 1, Pad(0)))
+            end
+        end
+        # Expected result after convolution with zero padding
+        # Inner elements (e.g., B[2,2]) will sum 9 (3x3 neighborhood of 1s)
+        # Edge elements (e.g., B[1,2]) will sum 6 (2x3 neighborhood of 1s, 3 zeros from padding)
+        # Corner elements (e.g., B[1,1]) will sum 4 (2x2 neighborhood of 1s, 5 zeros from padding)
+        expected_B_pad = [
+            4 6 6 4;
+            6 9 9 6;
+            6 9 9 6;
+            4 6 6 4
+        ]
+        @test collect(B) == expected_B_pad
+    end
+
+    @testset "Multiple expressions" begin
+        A = zeros(Blocks(2, 2), Int, 4, 4)
+        B = zeros(Blocks(2, 2), Int, 4, 4)
+        Dagger.spawn_datadeps() do
+            @stencil begin
+                A[idx] = 1
+                B[idx] = A[idx] * 2
+            end
+        end
+        expected_A_multi = [1 for r in 1:4, c in 1:4]
+        expected_B_multi = expected_A_multi .* 2
+        @test collect(A) == expected_A_multi
+        @test collect(B) == expected_B_multi
+    end
+
+    @testset "Multiple DArrays" begin
+        A = ones(Blocks(2, 2), Int, 4, 4)
+        B = DArray(fill(2, 4, 4), Blocks(2, 2))
+        C = zeros(Blocks(2, 2), Int, 4, 4)
+        Dagger.spawn_datadeps() do
+            @stencil begin
+                C[idx] = A[idx] + B[idx]
+            end
+        end
+        @test all(collect(C) .== 3)
+    end
+
+    @testset "Pad boundary with non-zero value" begin
+        A = ones(Blocks(1, 1), Int, 2, 2) # Simpler 2x2 case
+        B = zeros(Blocks(1, 1), Int, 2, 2)
+        pad_value = 5
+        Dagger.spawn_datadeps() do
+            @stencil begin
+                B[idx] = sum(@neighbors(A[idx], 1, Pad(pad_value)))
+            end
+        end
+        # For A = [1 1; 1 1] and Pad(5)
+        # B[1,1] neighbors considering a 3x3 neighborhood around A[1,1]:
+        # P P P
+        # P A11 A12
+        # P A21 A22
+        # Values:
+        # 5 5 5
+        # 5 1 1
+        # 5 1 1
+        # Sum = 5*5 (for the padded values) + 1*4 (for the actual values from A) = 25 + 4 = 29.
+        # This logic applies to all elements in B because the array A is small (2x2) and the neighborhood is 1.
+        # Every element's 3x3 neighborhood will include 5 padded values and the 4 values of A.
+        expected_B_pad_val = fill(pad_value*5 + 1*4, 2, 2)
+        @test collect(B) == expected_B_pad_val
+    end
+
+    @testset "Invalid neighborhood distance" begin
+        A = ones(Blocks(1, 1), Int, 2, 2)
+        B = zeros(Blocks(1, 1), Int, 2, 2)
+        @test_throws_unwrap ArgumentError Dagger.spawn_datadeps() do
+            @stencil begin
+                B[idx] = sum(@neighbors(A[idx], 0, Wrap())) # zero distance
+            end
+        end
+        @test_throws_unwrap ArgumentError Dagger.spawn_datadeps() do
+            @stencil begin
+                B[idx] = sum(@neighbors(A[idx], -1, Wrap())) # negative distance
+            end
+        end
+        @test_throws_unwrap ArgumentError Dagger.spawn_datadeps() do
+            @stencil begin
+                B[idx] = sum(@neighbors(A[idx], 1.5, Wrap())) # non-integer distance
+            end
+        end
+        @test_throws_unwrap ArgumentError Dagger.spawn_datadeps() do
+            @stencil begin
+                B[idx] = sum(@neighbors(A[idx], 2, Wrap())) # NOTE(review): presumably rejected for exceeding the 1x1 chunk size — confirm
+            end
+        end
+    end
+
+    @testset "Invalid neighborhood access of written variable" begin
+        A = ones(Blocks(1, 1), Int, 2, 2)
+        @test_throws_unwrap ArgumentError @eval Dagger.spawn_datadeps() do # NOTE(review): @eval presumably defers @stencil's expansion so an expansion-time error is caught here — confirm
+            @stencil begin
+                A[idx] = sum(@neighbors(A[idx], 1, Wrap()))
+            end
+        end
+    end
+
+    @testset "Invalid update expression" begin
+        A = ones(Blocks(1, 1), Int, 2, 2)
+        @test_throws_unwrap ArgumentError @eval Dagger.spawn_datadeps() do
+            @stencil begin
+                A[idx] += 1 # `+=` must be rejected with an ArgumentError
+            end
+        end
+    end
+end
+
+@testset "CPU" begin # stencil suite without any scope override
+    test_stencil()
+end
+
+@testset "GPU" begin # repeat the suite once per registered GPU scope
+    for (backend, gpu_scope) in GPU_SCOPES
+        if backend != :oneAPI # FIXME: oneAPI is skipped for now
+            @testset "$backend" begin
+                Dagger.with_options(; scope=gpu_scope) do
+                    test_stencil()
+                end
+            end
+        end
+    end
+end
diff --git a/test/gpu.jl b/test/gpu.jl
index 4da5c54bd..e9c05eebe 100644
--- a/test/gpu.jl
+++ b/test/gpu.jl
@@ -1,64 +1,7 @@
 using Random
 using LinearAlgebra
 
-if USE_CUDA
-    using Pkg
-    Pkg.add("CUDA")
-end
-if USE_ROCM
-    using Pkg
-    Pkg.add("AMDGPU")
-end
-if USE_ONEAPI
-    using Pkg
-    Pkg.add("oneAPI")
-end
-if USE_METAL
-    using Pkg
-    Pkg.add("Metal")
-end
-if USE_OPENCL
-    using Pkg
-    Pkg.add("OpenCL")
-    Pkg.add("pocl_jll")
-end
-
 @everywhere begin
-    if $USE_CUDA
-        using CUDA
-    elseif !$IN_CI
-        try using CUDA
-        catch end
-    end
-
-    if $USE_ROCM
-        using AMDGPU
-    elseif !$IN_CI
-        try using AMDGPU
-        catch end
-    end
-
-    if $USE_ONEAPI
-        using oneAPI
-    elseif !$IN_CI
-        try using oneAPI
-        catch end
-    end
-
-    if $USE_METAL
-        using Metal
-    elseif !$IN_CI
-        try using Metal
-        catch end
-    end
-
-    if $USE_OPENCL
-        using pocl_jll, OpenCL
-    elseif !$IN_CI
-        try using pocl_jll, OpenCL
-        catch end
-    end
-
     using Distributed, Dagger
     import Dagger: Kernel
     using KernelAbstractions
diff --git a/test/runtests.jl b/test/runtests.jl
index 3b5d8de28..20c7eb41c 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -30,6 +30,7 @@ tests = [
     ("Array - LinearAlgebra - Cholesky", "array/linalg/cholesky.jl"),
     ("Array - LinearAlgebra - LU", "array/linalg/lu.jl"),
     ("Array - Random", "array/random.jl"),
+    ("Array - Stencils", "array/stencil.jl"),
     ("GPU", "gpu.jl"),
     ("Caching", "cache.jl"),
     ("Disk Caching", "diskcaching.jl"),
@@ -40,7 +41,10 @@ tests = [
 ]
 if USE_GPU
     # Only run GPU tests
-    tests = [("GPU", "gpu.jl")]
+    tests = [
+        ("GPU", "gpu.jl"),
+        ("Array - Stencils", "array/stencil.jl"),
+    ]
 end
 all_test_names = map(test -> replace(last(test), ".jl"=>""), tests)
 
@@ -136,6 +140,11 @@ using Dagger
 using UUIDs
 import MemPool
 
+GPU_SCOPES = Pair{Symbol, Dagger.AbstractScope}[] # (backend name => scope) pairs; stays empty unless USE_GPU is set
+if USE_GPU
+    include("setup_gpu.jl") # installs and loads the requested GPU packages, then pushes their scopes into GPU_SCOPES
+end
+
 try
     for test in to_test
         test_title = tests[findfirst(x->x[2]==test * ".jl", tests)][1]
diff --git a/test/setup_gpu.jl b/test/setup_gpu.jl
new file mode 100644
index 000000000..931161850
--- /dev/null
+++ b/test/setup_gpu.jl
@@ -0,0 +1,89 @@
+if USE_CUDA # each USE_* flag selects one GPU backend; its package is added here before it is loaded further down
+    using Pkg
+    Pkg.add("CUDA")
+end
+if USE_ROCM
+    using Pkg
+    Pkg.add("AMDGPU")
+end
+if USE_ONEAPI
+    using Pkg
+    Pkg.add("oneAPI")
+end
+if USE_METAL
+    using Pkg
+    Pkg.add("Metal")
+end
+if USE_OPENCL
+    using Pkg
+    Pkg.add("OpenCL")
+    Pkg.add("pocl_jll") # loaded together with OpenCL further down
+end
+
+@everywhere begin # load the GPU backend packages on every process
+    if $USE_CUDA # the flag value is interpolated from the process running this file
+        using CUDA
+    elseif !$IN_CI # outside CI: best-effort load, a failure to load is silently ignored
+        try using CUDA
+        catch end
+    end
+
+    if $USE_ROCM
+        using AMDGPU
+    elseif !$IN_CI # outside CI: best-effort load, a failure to load is silently ignored
+        try using AMDGPU
+        catch end
+    end
+
+    if $USE_ONEAPI
+        using oneAPI
+    elseif !$IN_CI # outside CI: best-effort load, a failure to load is silently ignored
+        try using oneAPI
+        catch end
+    end
+
+    if $USE_METAL
+        using Metal
+    elseif !$IN_CI # outside CI: best-effort load, a failure to load is silently ignored
+        try using Metal
+        catch end
+    end
+
+    if $USE_OPENCL
+        using pocl_jll, OpenCL
+    elseif !$IN_CI # outside CI: best-effort load, a failure to load is silently ignored
+        try using pocl_jll, OpenCL
+        catch end
+    end
+end
+
+if USE_CUDA
+    # The first CUDA device is always registered
+    push!(GPU_SCOPES, :CUDA => Dagger.scope(; worker=1, cuda_gpu=1))
+    # A second device is registered only when more than one is present
+    length(CUDA.devices()) > 1 && push!(GPU_SCOPES, :CUDA => Dagger.scope(; worker=1, cuda_gpu=2))
+end
+if USE_ROCM
+    # The first ROCm device is always registered
+    push!(GPU_SCOPES, :ROCm => Dagger.scope(; worker=1, rocm_gpu=1))
+    # A second device is registered only when more than one is present
+    length(AMDGPU.devices()) > 1 && push!(GPU_SCOPES, :ROCm => Dagger.scope(; worker=1, rocm_gpu=2))
+end
+if USE_ONEAPI
+    # The first oneAPI device is always registered
+    push!(GPU_SCOPES, :oneAPI => Dagger.scope(; worker=1, intel_gpu=1))
+    # A second device is registered only when more than one is present
+    length(oneAPI.devices()) > 1 && push!(GPU_SCOPES, :oneAPI => Dagger.scope(; worker=1, intel_gpu=2))
+end
+if USE_METAL
+    # The first Metal device is always registered
+    push!(GPU_SCOPES, :Metal => Dagger.scope(; worker=1, metal_gpu=1))
+    # A second device is registered only when more than one is present
+    length(Metal.devices()) > 1 && push!(GPU_SCOPES, :Metal => Dagger.scope(; worker=1, metal_gpu=2))
+end
+if USE_OPENCL
+    # The first OpenCL device is always registered
+    push!(GPU_SCOPES, :OpenCL => Dagger.scope(; worker=1, cl_device=1))
+    # A second device of the default platform is registered only when more than one is present
+    length(cl.devices(cl.default_platform())) > 1 && push!(GPU_SCOPES, :OpenCL => Dagger.scope(; worker=1, cl_device=2))
+end