diff --git a/docs/make.jl b/docs/make.jl
index 8f1f97f5c..641d3865b 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -26,7 +26,10 @@ makedocs(;
         "Scopes" => "scopes.md",
         "Processors" => "processors.md",
         "Task Queues" => "task-queues.md",
-        "Datadeps" => "datadeps.md",
+        "Datadeps" => [
+            "Basics" => "datadeps.md",
+            "Stencils" => "stencils.md",
+        ],
         "Option Propagation" => "propagation.md",
         "Logging and Visualization" => [
             "Logging: Basics" => "logging.md",
diff --git a/docs/src/index.md b/docs/src/index.md
index f98e715c1..df66a95f2 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -361,6 +361,40 @@ DA = rand(Blocks(32, 32), 256, 128)
 collect(DA) # returns a `Matrix{Float64}`
 ```
 
+-----
+
+## Quickstart: Stencil Operations
+
+Dagger's `@stencil` macro allows for easy specification of stencil operations on `DArray`s, often used in simulations and image processing. These operations typically involve updating an element based on the values of its neighbors.
+
+For more details, see [Stencil Operations](@ref).
+
+### Applying a Simple Stencil
+
+Here's how to apply a stencil that averages each element with its immediate neighbors, using a `Wrap` boundary condition (where edges wrap around).
+
+```julia
+using Dagger
+import Dagger: @stencil, Wrap
+
+# Create a 5x5 DArray, partitioned into 2x2 blocks
+A = Dagger.rand(Blocks(2, 2), Int, 5, 5)
+B = Dagger.zeros(Blocks(2,2), Float64, 5, 5)
+
+Dagger.spawn_datadeps() do
+    @stencil begin
+        # For each element in A, calculate the sum of its 3x3 neighborhood
+        # (including itself) and store the average in B.
+        # Values outside the array bounds are determined by Wrap().
+        B[idx] = sum(@neighbors(A[idx], 1, Wrap())) / 9.0
+    end
+end
+
+# B now contains the averaged values.
+# You can inspect it with collect(B)
+```
+In this example, `idx` refers to the coordinates of each element being processed. `@neighbors(A[idx], 1, Wrap())` fetches the 3x3 neighborhood around `A[idx]`. The `1` indicates a distance of 1 from the central element, and `Wrap()` specifies the boundary behavior.
+
 ## Quickstart: Datadeps
 
 Datadeps is a feature in Dagger.jl that facilitates parallelism control within designated regions, allowing tasks to write to their arguments while ensuring dependencies are respected.
@@ -412,6 +446,68 @@ Dagger.@spawn copyto!(C, X)
 
 In contrast to the previous example, here, the tasks are executed without argument annotations. As a result, there is a possibility of the `copyto!` task being executed before the `sort!` task, leading to unexpected results in the output array `C`.
 
+-----
+
+## Quickstart: Stencil Operations
+
+Dagger's `@stencil` macro allows for easy specification of stencil operations on `DArray`s, often used in simulations and image processing. These operations typically involve updating an element based on the values of its neighbors.
+
+For more details, see [Stencil Operations](@ref).
+
+### Applying a Simple Stencil
+
+Here's how to apply a stencil that averages each element with its immediate neighbors, using a `Wrap` boundary condition (where edges wrap around).
+
+```julia
+using Dagger
+import Dagger: @stencil, Wrap
+
+# Create a 5x5 DArray, partitioned into 2x2 blocks
+A = Dagger.rand(Blocks(2, 2), Int, 5, 5)
+B = Dagger.zeros(Blocks(2,2), Float64, 5, 5)
+
+Dagger.spawn_datadeps() do
+    @stencil begin
+        # For each element in A, calculate the sum of its 3x3 neighborhood
+        # (including itself) and store the average in B.
+        # Values outside the array bounds are determined by Wrap().
+        B[idx] = sum(@neighbors(A[idx], 1, Wrap())) / 9.0
+    end
+end
+
+# B now contains the averaged values.
+# You can inspect it with collect(B)
+```
+In this example, `idx` refers to the coordinates of each element being processed. `@neighbors(A[idx], 1, Wrap())` fetches the 3x3 neighborhood around `A[idx]`. The `1` indicates a distance of 1 from the central element, and `Wrap()` specifies the boundary behavior.
+
+### Using `Pad` for Boundary Conditions
+
+Alternatively, `Pad(value)` can be used to fill out-of-bounds accesses with a specific value.
+
+```julia
+import Dagger: Pad
+
+# Create a 4x4 DArray
+C = ones(Blocks(2, 2), Int, 4, 4)
+D = zeros(Blocks(2, 2), Int, 4, 4)
+
+Dagger.spawn_datadeps() do
+    @stencil begin
+        # Sum neighbors, padding with 0 for out-of-bounds accesses
+        D[idx] = sum(@neighbors(C[idx], 1, Pad(0)))
+    end
+end
+
+# D will now contain sums where boundary elements used 0 for padding.
+# For example, D[1,1] (a corner) would sum C[1,1], C[1,2], C[2,1], C[2,2]
+# and 5 zeros from padding, resulting in a sum of 4 if all C elements are 1.
+# collect(D) would be:
+#  4  6  6  4
+#  6  9  9  6
+#  6  9  9  6
+#  4  6  6  4
+```
+
 ## Quickstart: Streaming
 
 Dagger.jl provides a streaming API that allows you to process data in a streaming fashion, where data is processed as it becomes available, rather than waiting for the entire dataset to be loaded into memory.
diff --git a/docs/src/stencils.md b/docs/src/stencils.md
new file mode 100644
index 000000000..df4db912e
--- /dev/null
+++ b/docs/src/stencils.md
@@ -0,0 +1,220 @@
+# Stencil Operations
+
+The `@stencil` macro in Dagger.jl provides a convenient way to perform stencil computations on `DArray`s. It operates within a `Dagger.spawn_datadeps()` block and allows you to define operations that apply to each element of a `DArray`, potentially considering its neighbors.
+
+## Basic Usage
+
+The fundamental structure of a `@stencil` block involves iterating over an implicit index, `idx`, which represents the coordinates of an element in the processed `DArray`s.
+
+```julia
+using Dagger
+import Dagger: @stencil, Wrap, Pad
+
+# Initialize a DArray
+A = zeros(Blocks(2, 2), Int, 4, 4)
+
+Dagger.spawn_datadeps() do
+    @stencil begin
+        A[idx] = 1 # Assign 1 to every element of A
+    end
+end
+
+@assert all(collect(A) .== 1)
+```
+
+In this example, `A[idx] = 1` is executed for each chunk of `A`. The `idx` variable corresponds to the indices within each chunk.
+
+## Neighborhood Access with `@neighbors`
+
+The true power of stencils comes from accessing neighboring elements. The `@neighbors` macro facilitates this.
+
+`@neighbors(array[idx], distance, boundary_condition)`
+
+- `array[idx]`: The array and current index from which to find neighbors.
+- `distance`: An integer specifying the extent of the neighborhood (e.g., `1` for a 3x3 neighborhood in 2D).
+- `boundary_condition`: Defines how to handle accesses beyond the array boundaries. Common conditions are:
+    - `Wrap()`: Wraps around to the other side of the array.
+    - `Pad(value)`: Pads with a specified `value`.
+
+### Example: Averaging Neighbors with `Wrap`
+
+```julia
+# Initialize a DArray
+A = ones(Blocks(1, 1), Int, 3, 3)
+A[2,2] = 10 # Central element has a different value
+B = zeros(Blocks(1, 1), Float64, 3, 3)
+
+Dagger.spawn_datadeps() do
+    @stencil begin
+        # Calculate the average of the 3x3 neighborhood (including the center)
+        B[idx] = sum(@neighbors(A[idx], 1, Wrap())) / 9.0
+    end
+end
+
+# Manually calculate expected B for verification
+expected_B = zeros(Float64, 3, 3)
+A_collected = collect(A)
+for r in 1:3, c in 1:3
+    local_sum = 0.0
+    for dr in -1:1, dc in -1:1
+        nr, nc = mod1(r+dr, 3), mod1(c+dc, 3)
+        local_sum += A_collected[nr, nc]
+    end
+    expected_B[r,c] = local_sum / 9.0
+end
+
+@assert collect(B) ≈ expected_B
+```
+
+### Example: Convolution with `Pad`
+
+```julia
+# Initialize a DArray
+A = ones(Blocks(2, 2), Int, 4, 4)
+B = zeros(Blocks(2, 2), Int, 4, 4)
+
+Dagger.spawn_datadeps() do
+    @stencil begin
+        B[idx] = sum(@neighbors(A[idx], 1, Pad(0))) # Pad with 0
+    end
+end
+
+# Expected result for a 3x3 sum filter with zero padding
+expected_B_padded = [
+    4 6 6 4;
+    6 9 9 6;
+    6 9 9 6;
+    4 6 6 4
+]
+@assert collect(B) == expected_B_padded
+```
+
+## Sequential Semantics
+
+Expressions within a `@stencil` block take effect sequentially: each statement is applied "all at once" across all indices before the next statement begins, so the result of one statement is visible to every statement that follows it.
+
+```julia
+A = zeros(Blocks(2, 2), Int, 4, 4)
+B = zeros(Blocks(2, 2), Int, 4, 4)
+
+Dagger.spawn_datadeps() do
+    @stencil begin
+        A[idx] = 3                # First, every element of A is set to 3
+        B[idx] = A[idx] * 2       # Then, B is computed using the new values of A
+    end
+end
+
+expected_A = fill(3, 4, 4)        # `idx` may only appear as an index (`X[idx]`), not as a value
+expected_B_seq = expected_A .* 2
+
+@assert collect(A) == expected_A
+@assert collect(B) == expected_B_seq
+```
+
+## Operations on Multiple `DArray`s
+
+You can read from and write to multiple `DArray`s within a single `@stencil` block, provided they have compatible chunk structures.
+
+```julia
+A = ones(Blocks(1, 1), Int, 2, 2)
+B_multi = Dagger.fill(Blocks(1, 1), 2, Int, 2, 2) # A second input array, filled with 2
+C = zeros(Blocks(1, 1), Int, 2, 2)
+
+Dagger.spawn_datadeps() do
+    @stencil begin
+        C[idx] = A[idx] + B_multi[idx] # Element-wise sum of the two inputs
+    end
+end
+@assert all(collect(C) .== 3)
+```
+
+## Example: Game of Life
+
+The following demonstrates a more complex example: Conway's Game of Life.
+
+```julia
+# Ensure Plots and other necessary packages are available for the example
+# using Plots
+
+N = 27 # Size of one dimension of a tile
+nt = 3 # Number of tiles in each dimension (results in nt x nt grid of tiles)
+niters = 10 # Number of iterations for the animation
+
+tiles = zeros(Blocks(N, N), Bool, N*nt, N*nt)
+outputs = zeros(Blocks(N, N), Bool, N*nt, N*nt)
+
+# Create a fun initial state (e.g., a glider and some random noise)
+tiles[13, 14] = true
+tiles[14, 14] = true
+tiles[15, 14] = true
+tiles[15, 15] = true # Corrected glider part
+tiles[14, 16] = true
+# Add some random noise in one of the tiles
+# Make sure to use Dagger-compatible assignment if you were to modify chunks directly
+# For simplicity, direct array indexing is used here for initial setup.
+rand_tile_data = rand(Bool, N, N)
+# To assign this to a specific block, you'd typically work with chunks,
+# but for initial setup, direct indexing on the collected array or careful DArray construction is easier.
+# For this example, we'll simplify and assume direct modification is for setup.
+# A Dagger-idiomatic way for partial modification might involve map! or similar.
+# Here, we just modify the underlying array before it's heavily used by Dagger tasks if possible,
+# or use Dagger operations.
+# For collected view for setup:
+temp_tiles = collect(tiles) # This collect is fine for initial setup visualization/modification
+temp_tiles[(2N+1):3N, (2N+1):3N] .= rand_tile_data
+tiles = Dagger.distribute(temp_tiles, Blocks(N,N)) # Use distribute to create DArray from existing array
+
+
+# The animation part requires a graphical environment.
+# If running in a headless environment, you might comment out the @animate macro
+# and inspect `outputs` programmatically.
+# anim = @animate for _ in 1:niters
+#     Dagger.spawn_datadeps() do
+#         @stencil begin
+#             outputs[idx] = begin
+#                 nhood = @neighbors(tiles[idx], 1, Wrap())
+#                 live_neighbors = sum(nhood) - tiles[idx] # Subtract self if it was counted
+#                 if tiles[idx] # If current cell is alive
+#                     if live_neighbors < 2 || live_neighbors > 3
+#                         false # Dies by underpopulation or overpopulation
+#                     else
+#                         true  # Survives
+#                     end
+#                 else # If current cell is dead
+#                     if live_neighbors == 3
+#                         true  # Becomes alive by reproduction
+#                     else
+#                         false # Stays dead
+#                     end
+#                 end
+#             end
+#             tiles[idx] = outputs[idx] # Update tiles for the next iteration
+#         end
+#     end
+#     # heatmap(Int.(collect(outputs))) # Visualize (requires Plots.jl)
+# end
+# path = mp4(anim; fps=5, show_msg=true).filename # Save animation (requires Plots.jl)
+
+# For testing without animation:
+# Execute one iteration:
+Dagger.spawn_datadeps() do
+    @stencil begin
+        outputs[idx] = begin
+            nhood = @neighbors(tiles[idx], 1, Wrap())
+            live_neighbors = sum(nhood) - tiles[idx]
+            if tiles[idx]
+                if live_neighbors < 2 || live_neighbors > 3; false
+                else; true; end
+            else
+                if live_neighbors == 3; true
+                else; false; end
+            end
+        end
+        tiles[idx] = outputs[idx]
+    end
+end
+# You can inspect `collect(outputs)` or `collect(tiles)` here.
+println("Game of Life example processed one iteration.")
+```
+
+In summary, `@stencil` expresses element-wise and neighborhood-based updates on `DArray`s: statements take effect sequentially, `@neighbors` reads a neighborhood under a chosen boundary condition (`Wrap` or `Pad`), and several `DArray`s with compatible chunk structures can be combined in a single block, as the Game of Life example above demonstrates.
diff --git a/src/Dagger.jl b/src/Dagger.jl
index fd6395a4b..3a76cb3bc 100644
--- a/src/Dagger.jl
+++ b/src/Dagger.jl
@@ -50,7 +50,8 @@ include("utils/dagdebug.jl")
 include("utils/locked-object.jl")
 include("utils/tasks.jl")
 
-import MacroTools: @capture
+import MacroTools: @capture, prewalk
+
 include("options.jl")
 include("processor.jl")
 include("threadproc.jl")
@@ -76,6 +77,8 @@ include("sch/Sch.jl"); using .Sch
 
 # Data dependency task queue
 include("datadeps.jl")
+include("utils/haloarray.jl")
+include("stencil.jl")
 
 # Streaming
 include("stream.jl")
diff --git a/src/array/indexing.jl b/src/array/indexing.jl
index 69725eb7a..778e15a17 100644
--- a/src/array/indexing.jl
+++ b/src/array/indexing.jl
@@ -127,7 +127,8 @@ function Base.setindex!(A::DArray{T,N}, value, idx::NTuple{N,Int}) where {T,N}
     # Set the value
     part = A.chunks[part_idx...]
     space = memory_space(part)
-    scope = Dagger.scope(worker=root_worker_id(space))
+    # FIXME: Do this correctly w.r.t memory space of part
+    scope = Dagger.scope(worker=root_worker_id(space), threads=:)
     return fetch(Dagger.@spawn scope=scope setindex!(part, value, offset_idx...))
 end
 Base.setindex!(A::DArray, value, idx::Integer...) =
diff --git a/src/scopes.jl b/src/scopes.jl
index 834993c9f..29badbe70 100644
--- a/src/scopes.jl
+++ b/src/scopes.jl
@@ -325,13 +325,20 @@ function to_scope(sc::NamedTuple)
     else
         nothing
     end
+    all_threads = false
     threads = if haskey(sc, :thread)
         Int[sc.thread]
     elseif haskey(sc, :threads)
-        Int[sc.threads...]
+        if sc.threads == Colon()
+            all_threads = true
+            nothing
+        else
+            Int[sc.threads...]
+        end
     else
         nothing
     end
+    want_threads = all_threads || threads !== nothing
 
     # Simple cases
     if workers !== nothing && threads !== nothing
@@ -341,18 +348,22 @@ function to_scope(sc::NamedTuple)
         end
         return simplified_union_scope(subscopes)
     elseif workers !== nothing && threads === nothing
-        subscopes = AbstractScope[ProcessScope(w) for w in workers]
-        return simplified_union_scope(subscopes)
+        subscopes = simplified_union_scope(AbstractScope[ProcessScope(w) for w in workers])
+        if all_threads
+            return constrain(subscopes, ProcessorTypeScope(ThreadProc))
+        else
+            return subscopes
+        end
     end
 
     # More complex cases that require querying the cluster
     # FIXME: Use per-field scope taint
     if workers === nothing
-        workers = procs()
+        workers = map(p->p.pid, filter(p->p isa OSProc, procs(Dagger.Sch.eager_context())))
     end
     subscopes = AbstractScope[]
     for w in workers
-        if threads === nothing
+        if threads === nothing && want_threads
             threads = map(c->c.tid,
                           filter(c->c isa ThreadProc,
                                  collect(children(OSProc(w)))))
diff --git a/src/stencil.jl b/src/stencil.jl
new file mode 100644
index 000000000..7c4ab99d2
--- /dev/null
+++ b/src/stencil.jl
@@ -0,0 +1,277 @@
+# FIXME: Remove me (aliases of In/Out/InOut used by the `@stencil` codegen in this file)
+const Read = In
+const Write = Out
+const ReadWrite = InOut
+
+function load_neighbor_edge(arr, dim, dir, neigh_dist)
+    nd = ndims(arr)
+    if dir == -1 # trailing slab: the last `neigh_dist` indices along `dim`, full extent elsewhere
+        start_idx = CartesianIndex(ntuple(d -> d == dim ? lastindex(arr, d) - neigh_dist + 1 : firstindex(arr, d), nd))
+        stop_idx = CartesianIndex(ntuple(d -> lastindex(arr, d), nd))
+    elseif dir == 1 # leading slab: the first `neigh_dist` indices along `dim`, full extent elsewhere
+        start_idx = CartesianIndex(ntuple(d -> firstindex(arr, d), nd))
+        stop_idx = CartesianIndex(ntuple(d -> d == dim ? firstindex(arr, d) + neigh_dist - 1 : lastindex(arr, d), nd))
+    end
+    return move(thunk_processor(), collect(view(arr, start_idx:stop_idx))) # FIXME: Don't collect
+end
+function load_neighbor_corner(arr, corner_side, neigh_dist)
+    lo = CartesianIndex(ntuple(d -> corner_side[d] == 0 ? lastindex(arr, d) - neigh_dist + 1 : firstindex(arr, d), ndims(arr)))
+    hi = CartesianIndex(ntuple(d -> corner_side[d] == 0 ? lastindex(arr, d) : firstindex(arr, d) + neigh_dist - 1, ndims(arr)))
+    return move(thunk_processor(), collect(view(arr, lo:hi))) # side 0 => trailing `neigh_dist` indices, otherwise leading
+end
+function select_neighborhood_chunks(chunks, idx, neigh_dist, boundary)
+    @assert neigh_dist isa Integer && neigh_dist > 0 "Neighborhood distance must be an Integer greater than 0"
+    # Returns the accesses for the halo of chunk `idx`, ordered [center, 2 edges per dimension..., 2^N corners...]
+    # FIXME: Depends on neigh_dist and chunk size
+    chunk_dist = 1
+    # Get the center (the chunk itself, pushed as-is rather than as a spawned load)
+    accesses = Any[chunks[idx]]
+
+    # Get the edges: for each dimension, the low-side (-1) neighbor first, then the high-side (+1) neighbor
+    for dim in 1:ndims(chunks)
+        for dir in (-1, +1)
+            new_idx = idx + CartesianIndex(ntuple(i -> i == dim ? dir*chunk_dist : 0, ndims(chunks)))
+            if is_past_boundary(size(chunks), new_idx)
+                if boundary_has_transition(boundary)
+                    new_idx = boundary_transition(boundary, new_idx, size(chunks)) # remap to an in-range chunk index
+                else
+                    new_idx = idx # no transition: the current chunk is handed to load_boundary_edge instead
+                end
+                chunk = chunks[new_idx]
+                push!(accesses, Dagger.@spawn load_boundary_edge(boundary, chunk, dim, dir, neigh_dist))
+            else
+                chunk = chunks[new_idx]
+                push!(accesses, Dagger.@spawn load_neighbor_edge(chunk, dim, dir, neigh_dist))
+            end
+        end
+    end
+
+    # Get the corners: after the `reverse`, corner_side[i] is bit (i-1) of (corner_num-1); 0 = low side, 1 = high side
+    for corner_num in 1:(2^ndims(chunks))
+        corner_side = CartesianIndex(reverse(ntuple(ndims(chunks)) do i
+            ((corner_num-1) >> (((ndims(chunks) - i) + 1) - 1)) & 1
+        end))
+        corner_new_idx = CartesianIndex(ntuple(ndims(chunks)) do i
+            corner_shift = iszero(corner_side[i]) ? -1 : 1 # N.B. fixed shift of one chunk; `chunk_dist` is not used here
+            return idx[i] + corner_shift
+        end)
+        if is_past_boundary(size(chunks), corner_new_idx)
+            if boundary_has_transition(boundary)
+                corner_new_idx = boundary_transition(boundary, corner_new_idx, size(chunks))
+            else
+                corner_new_idx = idx # no transition: the current chunk is handed to load_boundary_corner instead
+            end
+            chunk = chunks[corner_new_idx]
+            push!(accesses, Dagger.@spawn load_boundary_corner(boundary, chunk, corner_side, neigh_dist))
+        else
+            chunk = chunks[corner_new_idx]
+            push!(accesses, Dagger.@spawn load_neighbor_corner(chunk, corner_side, neigh_dist))
+        end
+    end
+    # Sanity check: 1 center + 2 edges per dimension + 2^N corners
+    @assert length(accesses) == 1+2*ndims(chunks)+2^ndims(chunks) "Accesses mismatch: $(length(accesses))"
+    return accesses
+end
+# Assemble a `HaloArray` from `center` plus its 2N edge pieces followed by its 2^N corner pieces
+function build_halo(neigh_dist, boundary, center, all_neighbors...)
+    N = ndims(center)
+    edges, corners = all_neighbors[1:(2*N)], all_neighbors[((2*N)+1):end] # corners start right after the 2N edges
+    @assert length(edges) == 2*N && length(corners) == 2^N "Halo mismatch: edges=$(length(edges)) corners=$(length(corners))"
+    return HaloArray(center, (edges...,), (corners...,), ntuple(_->neigh_dist, N))
+end
+function load_neighborhood(arr::HaloArray{T,N}, idx) where {T,N}
+    # Only uniform halos are handled: the first width is applied in every dimension
+    @assert all(arr.halo_width .== arr.halo_width[1])
+    reach = CartesianIndex(ntuple(_ -> arr.halo_width[1], ndims(arr)))
+    # View of the window centred on `idx`, extending `reach` in both directions
+    return view(arr, (idx - reach):(idx + reach))
+end
+function inner_stencil!(f, output, read_vars)
+    # Select the kernel implementation by dispatching on the value of `thunk_processor()`
+    return inner_stencil_proc!(thunk_processor(), f, output, read_vars)
+end
+# Non-KA (for CPUs): call `f` once per index of `output`, in iteration order
+function inner_stencil_proc!(::ThreadProc, f, output, read_vars)
+    foreach(CartesianIndices(output)) do idx
+        f(idx, output, read_vars)
+    end
+    return
+end
+
+is_past_boundary(size, idx) = any(ntuple(d -> !(1 <= idx[d] <= size[d]), length(size))) # true if any coordinate leaves 1:size[d]
+
+struct Wrap end
+boundary_has_transition(::Wrap) = true
+# Map an out-of-range index back into 1:size[d] in every dimension (periodic wrap-around)
+boundary_transition(::Wrap, idx, size) = CartesianIndex(ntuple(d -> mod1(idx[d], size[d]), length(size)))
+load_boundary_edge(::Wrap, chunk, dim, dir, dist) = load_neighbor_edge(chunk, dim, dir, dist)
+load_boundary_corner(::Wrap, chunk, corner_side, dist) = load_neighbor_corner(chunk, corner_side, dist)
+
+struct Pad{T}
+    padval::T
+end
+boundary_has_transition(::Pad) = false
+function load_boundary_edge(pad::Pad, arr, dim, dir, neigh_dist)
+    nd = ndims(arr)
+    if dir == -1 # same index window as the trailing slab of `arr` along `dim`
+        start_idx = CartesianIndex(ntuple(d -> d == dim ? lastindex(arr, d) - neigh_dist + 1 : firstindex(arr, d), nd))
+        stop_idx = CartesianIndex(ntuple(d -> lastindex(arr, d), nd))
+    elseif dir == 1 # same index window as the leading slab of `arr` along `dim`
+        start_idx = CartesianIndex(ntuple(d -> firstindex(arr, d), nd))
+        stop_idx = CartesianIndex(ntuple(d -> d == dim ? firstindex(arr, d) + neigh_dist - 1 : lastindex(arr, d), nd))
+    end
+    # FIXME: return Fill(pad.padval, edge_size)
+    return move(thunk_processor(), fill(pad.padval, ntuple(d -> length(start_idx[d]:stop_idx[d]), nd)))
+end
+function load_boundary_corner(pad::Pad, arr, corner_side, neigh_dist)
+    # `arr` only determines the window's shape; the contents are all `pad.padval`
+    lo = CartesianIndex(ntuple(d -> corner_side[d] == 0 ? lastindex(arr, d) - neigh_dist + 1 : firstindex(arr, d), ndims(arr)))
+    hi = CartesianIndex(ntuple(d -> corner_side[d] == 0 ? lastindex(arr, d) : firstindex(arr, d) + neigh_dist - 1, ndims(arr)))
+    # FIXME: return Fill(pad.padval, corner_size)
+    return move(thunk_processor(), fill(pad.padval, ntuple(d -> length(lo[d]:hi[d]), ndims(arr))))
+end
+
+"""
+    @stencil begin body end
+
+Allows the specification of stencil operations within a `spawn_datadeps`
+region. The `idx` variable stands for the index of every element of the
+`DArray` being written. An example usage may look like:
+
+```julia
+import Dagger: @stencil, Wrap
+
+A = zeros(Blocks(3, 3), Int, 9, 9)
+A[5, 5] = 1
+B = zeros(Blocks(3, 3), Int, 9, 9)
+C = zeros(Blocks(3, 3), Int, 9, 9)
+Dagger.spawn_datadeps() do
+    @stencil begin
+        # Sum values of all neighbors with self
+        B[idx] = sum(@neighbors(A[idx], 1, Wrap()))
+        # Decrement all values by 1
+        C[idx] = B[idx] - 1
+    end
+end
+```
+
+Each expression within an `@stencil` region that performs an in-place indexing
+expression like `A[idx] = ...` is transformed into a set of tasks that operate
+on each chunk of `A` or any other arrays specified as `A[idx]`, and within each
+task, elements of that chunk of `A` can be accessed. Elements of multiple
+`DArray`s can be accessed, such as `B[idx]`, so long as `B` has the same size,
+shape, and chunk layout as `A`. Each statement must be a plain assignment
+of the form `X[idx] = expr`; updating operators such as `-=` are not
+accepted.
+
+Additionally, the `@neighbors` macro can be used to access a neighborhood of
+values around `A[idx]`, at a configurable distance (in this case, 1 element
+distance) and with various kinds of boundary conditions (in this case, `Wrap()`
+specifies wrapping behavior on the boundaries). Neighborhoods are computed with
+respect to neighboring chunks as well - if a neighborhood would overflow from
+the current chunk into one or more neighboring chunks, values from those
+neighboring chunks will be included in the neighborhood.
+
+Note that, while `@stencil` may look like a `for` loop, it does not follow the
+same semantics; in particular, an expression within `@stencil` occurs "all at
+once" (across all indices) before the next expression occurs. This means that
+`B[idx] = sum(@neighbors(A[idx], 1, Wrap()))` will write the sum of
+neighbors for all `idx` values into `B[idx]` before `C[idx] = B[idx] - 1`
+reads any value of `B`. Of course, pipelining and other optimizations may still
+occur, so long as they respect the sequential nature of `@stencil` (just like
+with other operations in `spawn_datadeps`).
+"""
+macro stencil(orig_ex)
+    @assert Meta.isexpr(orig_ex, :block) "Invalid stencil block: $orig_ex"
+
+    # Collect access pattern information
+    inners = []
+    for inner_ex in orig_ex.args
+        inner_ex isa LineNumberNode && continue
+        # N.B. `@capture` binds write_ex/read_ex (then write_var/write_idx) as a side effect
+        @assert @capture(inner_ex, write_ex_ = read_ex_) "Invalid update expression: $inner_ex"
+        @assert @capture(write_ex, write_var_[write_idx_]) "Update expression requires a write: $write_ex"
+        accessed_vars = Set{Symbol}()
+        read_vars = Set{Symbol}()
+        neighborhoods = Dict{Symbol, Tuple{Any, Any}}()
+        push!(accessed_vars, write_var)
+        prewalk(read_ex) do read_inner_ex
+            # Record direct `X[idx]` reads and `@neighbors(X[idx], dist, boundary)` reads
+            if @capture(read_inner_ex, read_var_[read_idx_]) && read_idx == write_idx
+                push!(accessed_vars, read_var)
+                push!(read_vars, read_var)
+            elseif @capture(read_inner_ex, @neighbors(read_var_[read_idx_], neigh_dist_, boundary_))
+                @assert read_idx == write_idx "Neighborhood access must be at the same index as the write: $read_inner_ex"
+                push!(accessed_vars, read_var)
+                push!(read_vars, read_var)
+                neighborhoods[read_var] = (neigh_dist, boundary)
+            end
+            return read_inner_ex
+        end
+        push!(inners, (;inner_ex, accessed_vars, write_var, write_idx, read_ex, read_vars, neighborhoods))
+    end
+
+    # Codegen update functions
+    final_ex = Expr(:block)
+    for (;inner_ex, accessed_vars, write_var, write_idx, read_ex, read_vars, neighborhoods) in inners
+        # Generate a variable for chunk access
+        @gensym chunk_idx
+
+        # Generate function with transformed body
+        @gensym inner_vars inner_index_var
+        new_inner_ex_body = prewalk(inner_ex) do old_inner_ex
+            if @capture(old_inner_ex, read_var_[read_idx_]) && read_idx == write_idx
+                # Direct access
+                if read_var == write_var
+                    return :($write_var[$inner_index_var])
+                else
+                    return :($inner_vars.$read_var[$inner_index_var])
+                end
+            elseif @capture(old_inner_ex, @neighbors(read_var_[read_idx_], neigh_dist_, boundary_))
+                # Neighborhood access
+                return :($load_neighborhood($inner_vars.$read_var, $inner_index_var))
+            end
+            return old_inner_ex
+        end
+        # Per-element kernel: (element index, output array, NamedTuple of input arrays)
+        new_inner_f = :(($inner_index_var, $write_var, $inner_vars)->$new_inner_ex_body)
+        new_inner_ex = quote
+            $inner_vars = (;$(read_vars...))
+            $inner_stencil!($new_inner_f, $write_var, $inner_vars)
+        end
+        # NOTE(review): if `write_var` is also in `read_vars`, it is listed twice in the
+        # keyword parameters here and in `deps_ex` below - confirm self-reads are supported
+        inner_fn = Expr(:->, Expr(:tuple, Expr(:parameters, write_var, read_vars...)), new_inner_ex)
+
+        # Generate @spawn call with appropriate vars and deps
+        deps_ex = Any[]
+        if write_var in read_vars
+            push!(deps_ex, Expr(:kw, write_var, :($ReadWrite($chunks($write_var)[$chunk_idx]))))
+        else
+            push!(deps_ex, Expr(:kw, write_var, :($Write($chunks($write_var)[$chunk_idx]))))
+        end
+        neighbor_copy_all_ex = Expr(:block)
+        for read_var in read_vars
+            if read_var in keys(neighborhoods)
+                # Generate a neighborhood copy operation
+                neigh_dist, boundary = neighborhoods[read_var]
+                @gensym neighbor_copy_var
+                push!(neighbor_copy_all_ex.args, :($neighbor_copy_var = Dagger.@spawn $build_halo($neigh_dist, $boundary, map($Read, $select_neighborhood_chunks($chunks($read_var), $chunk_idx, $neigh_dist, $boundary))...)))
+                push!(deps_ex, Expr(:kw, read_var, :($Read($neighbor_copy_var))))
+            else
+                push!(deps_ex, Expr(:kw, read_var, :($Read($chunks($read_var)[$chunk_idx]))))
+            end
+        end
+        spawn_ex = :(Dagger.@spawn $inner_fn(;$(deps_ex...)))
+
+        # Generate loop
+        push!(final_ex.args, quote
+            for $chunk_idx in $CartesianIndices($chunks($write_var))
+                $neighbor_copy_all_ex
+                $spawn_ex
+            end
+        end)
+    end
+
+    return esc(final_ex)
+end
diff --git a/src/utils/haloarray.jl b/src/utils/haloarray.jl
new file mode 100644
index 000000000..2e26ed1cf
--- /dev/null
+++ b/src/utils/haloarray.jl
@@ -0,0 +1,96 @@
+# Define the HaloArray type with minimized halo storage: a center array plus separate edge and corner pieces
+struct HaloArray{T,N,E,C,A,EAT<:Tuple,CAT<:Tuple} <: AbstractArray{T,N}
+    center::A                   # interior data, addressed by indices 1:size(center, d)
+    edges::EAT                  # edge arrays, two per dimension: low side, then high side
+    corners::CAT                # corner arrays, addressed where every coordinate is outside the center
+    halo_width::NTuple{N,Int}   # halo thickness in each dimension
+end
+
+# Allocate an uninitialized HaloArray whose center has size `center_size`
+function HaloArray{T,N}(center_size::NTuple{N,Int}, halo_width::NTuple{N,Int}) where {T,N}
+    center = Array{T,N}(undef, center_size...)
+    edges = ntuple(2N) do i
+        d = cld(i, 2) # edges 2d-1 and 2d both belong to dimension d
+        # Same shape as the center, except that dimension d is only `halo_width[d]` thick
+        return Array{T,N}(undef, ntuple(k -> k == d ? halo_width[d] : center_size[k], N))
+    end
+    corners = ntuple(2^N) do _
+        return Array{T,N}(undef, halo_width)
+    end
+    return HaloArray(center, edges, corners, halo_width)
+end
+
+HaloArray(center::CT, edges::ET, corners::KT, halo_width::NTuple{N,Int}) where {T,N,CT<:AbstractArray{T,N},ET<:Tuple,KT<:Tuple} =
+    HaloArray{T,N,length(edges),length(corners),CT,ET,KT}(center, edges, corners, halo_width) # E and C record the piece counts
+
+Base.size(tile::HaloArray) = 2 .* tile.halo_width .+ size(tile.center) # center plus a halo on both sides
+function Base.axes(tile::HaloArray{T,N,H}) where {T,N,H}
+    # Offset axes: 1:size(center, d) addresses the center, the halo lies outside that range
+    halo = tile.halo_width
+    return ntuple(N) do d
+        (1 - halo[d]):(size(tile.center, d) + halo[d])
+    end
+end
+# Uninitialized HaloArray with a `dims`-sized center and the same halo widths as `tile`
+function Base.similar(tile::HaloArray{T,N,H}, ::Type{T}, dims::NTuple{N,Int}) where {T,N,H}
+    # The allocating constructor is defined on `HaloArray{T,N}` only; `HaloArray{T,N,H}(...)` has no such method
+    return HaloArray{T,N}(dims, tile.halo_width)
+end
+# Copy of `tile` whose center, edge and corner arrays are all independent of the original
+function Base.copy(tile::HaloArray{T,N,H}) where {T,N,H}
+    # The constructor takes (center, edges, corners, halo_width): the corners must be
+    # copied and passed along too, otherwise no constructor method matches
+    return HaloArray(copy(tile.center), map(copy, tile.edges), map(copy, tile.corners), tile.halo_width)
+end
+
+# Scalar read: route the index to the center, a corner piece, or an edge piece
+function Base.getindex(tile::HaloArray{T,N}, I::Vararg{Int,N}) where {T,N}
+    checkbounds(tile, I...)
+    inside = 1 .<= I .<= size(tile.center)
+    # Center: every coordinate lies within the center's own range
+    all(inside) && return tile.center[I...]
+    if !any(inside)
+        # Corner: every coordinate lies in the halo
+        # N.B. Corner indexes are in binary (bit d-1 is set when I[d] is past the high side), plus 1
+        corner_idx = 1 + sum(ntuple(d -> I[d] < 1 ? 0 : 2^(d-1), N))
+        # Shift each coordinate into the corner array's own 1-based range
+        corner_offset = CartesianIndex(ntuple(d -> I[d] < 1 ? I[d] + tile.halo_width[d] : I[d] - size(tile.center, d), N))
+        return tile.corners[corner_idx][corner_offset]
+    end
+    # Edge: the first dimension whose coordinate leaves the center selects the edge piece
+    for d in 1:N
+        if I[d] < 1
+            return tile.edges[2d - 1][ntuple(k -> k == d ? I[k] + tile.halo_width[k] : I[k], N)...]
+        elseif I[d] > size(tile.center, d)
+            return tile.edges[2d][ntuple(k -> k == d ? I[k] - size(tile.center, d) : I[k], N)...]
+        end
+    end
+    error("Index out of bounds")
+end
+
+# Scalar write: route the index to the center, a corner piece, or an edge piece
+function Base.setindex!(tile::HaloArray{T,N}, value, I::Vararg{Int,N}) where {T,N}
+    checkbounds(tile, I...)
+    inside = 1 .<= I .<= size(tile.center)
+    # Center: every coordinate lies within the center's own range
+    all(inside) && return (tile.center[I...] = value)
+    if !any(inside)
+        # Corner: every coordinate lies in the halo
+        # N.B. Corner indexes are in binary (bit d-1 is set when I[d] is past the high side), plus 1
+        corner_idx = 1 + sum(ntuple(d -> I[d] < 1 ? 0 : 2^(d-1), N))
+        # Shift each coordinate into the corner array's own 1-based range
+        corner_offset = CartesianIndex(ntuple(d -> I[d] < 1 ? I[d] + tile.halo_width[d] : I[d] - size(tile.center, d), N))
+        return (tile.corners[corner_idx][corner_offset] = value)
+    end
+    # Edge: the first dimension whose coordinate leaves the center selects the edge piece
+    for d in 1:N
+        if I[d] < 1
+            # Low side of dimension d
+            return (tile.edges[2d - 1][ntuple(k -> k == d ? I[k] + tile.halo_width[k] : I[k], N)...] = value)
+        elseif I[d] > size(tile.center, d)
+            # High side of dimension d
+            return (tile.edges[2d][ntuple(k -> k == d ? I[k] - size(tile.center, d) : I[k], N)...] = value)
+        end
+    end
+    error("Index out of bounds")
+end
diff --git a/test/array/stencil.jl b/test/array/stencil.jl
new file mode 100644
index 000000000..7398ea305
--- /dev/null
+++ b/test/array/stencil.jl
@@ -0,0 +1,122 @@
+using Test
+using Dagger
+import Dagger: @stencil, Wrap, Pad
+
+@testset "@stencil" begin
+    @testset "Simple assignment" begin
+        filled = zeros(Blocks(2, 2), Int, 4, 4)
+        Dagger.spawn_datadeps() do
+            @stencil begin
+                filled[idx] = 1 # constant right-hand side, no neighbor access
+            end
+        end
+        @test all(==(1), collect(filled))
+    end
+
+    @testset "Wrap boundary" begin
+        # Input: all zeros except a single 10 at (1,1)
+        src = zeros(Blocks(2, 2), Int, 4, 4)
+        src[1,1] = 10
+        dst = zeros(Blocks(2, 2), Int, 4, 4)
+        Dagger.spawn_datadeps() do
+            @stencil begin
+                dst[idx] = sum(@neighbors(src[idx], 1, Wrap()))
+            end
+        end
+
+        # Reference result from plain index arithmetic.
+        #
+        # The reference models Wrap() with mod1(x, 4): a neighbor coordinate
+        # outside 1:4 re-enters from the opposite side. The 3x3 window around
+        # (i, j) then touches rows mod1.(i-1:i+1, 4) and columns
+        # mod1.(j-1:j+1, 4), i.e. three distinct rows and three distinct
+        # columns, so the lone 10 is counted at most once per window.
+        #
+        # Row 1 is in the window for i in (1, 2, 4), and column 1 for j in
+        # (1, 2, 4), which gives
+        #     10 10  0 10
+        #     10 10  0 10
+        #      0  0  0  0
+        #     10 10  0 10
+        # For instance dst[4,4] is 10 because its neighbor (5,5) wraps to (1,1).
+        offsets = -1:1
+        wraps_to_origin(r, c) = mod1(r, 4) == 1 && mod1(c, 4) == 1
+        window_hits(i, j) =
+            count(wraps_to_origin(i + di, j + dj) for di in offsets, dj in offsets)
+        expected = [10 * window_hits(i, j) for i in 1:4, j in 1:4]
+        @test collect(dst) == expected
+    end
+
+    @testset "Pad boundary" begin
+        src = Dagger.DArray(ones(Int, 4, 4), Blocks(2, 2))
+        dst = Dagger.DArray(zeros(Int, 4, 4), Blocks(2, 2))
+        Dagger.spawn_datadeps() do
+            @stencil begin
+                dst[idx] = sum(@neighbors(src[idx], 1, Pad(0)))
+            end
+        end
+        # With an all-ones input and zero padding, each output equals the number
+        # of in-bounds cells in its 3x3 window. Along one axis the window covers
+        # 2 in-bounds positions at the border (index 1 or 4) and 3 otherwise;
+        # the cell count is the product over both axes:
+        #     4 6 6 4
+        #     6 9 9 6
+        #     6 9 9 6
+        #     4 6 6 4
+        in_bounds = [2, 3, 3, 2]
+        expected = [r * c for r in in_bounds, c in in_bounds]
+        @test collect(dst) == expected
+    end
+
+    @testset "Multiple expressions" begin
+        # The expected values require the second statement to see the first's writes
+        coords = zeros(Blocks(2, 2), Int, 4, 4)
+        doubled = zeros(Blocks(2, 2), Int, 4, 4)
+        Dagger.spawn_datadeps() do
+            @stencil begin
+                coords[idx] = idx[1] + idx[2]
+                doubled[idx] = coords[idx] * 2
+            end
+        end
+        reference = [i + j for i in 1:4, j in 1:4]
+        @test collect(coords) == reference
+        @test collect(doubled) == 2 .* reference
+    end
+
+    @testset "Multiple DArrays" begin # one output computed from two inputs
+        A = ones(Blocks(2, 2), Int, 4, 4)
+        B = Dagger.DArray(fill(2, 4, 4), Blocks(2, 2)) # all-2s Matrix{Int}, then distributed
+        C = zeros(Blocks(2, 2), Int, 4, 4)
+        Dagger.spawn_datadeps() do
+            @stencil begin
+                C[idx] = A[idx] + B[idx]
+            end
+        end
+        @test all(collect(C) .== 3) # 1 + 2 everywhere
+    end
+
+    @testset "Pad boundary with non-zero value" begin
+        # A non-zero pad value must contribute to the sums; the value is taken
+        # from a local variable rather than written as a literal.
+        pad_value = 5
+        src = ones(Blocks(1, 1), Int, 2, 2) # 2x2 input, one element per block
+        dst = zeros(Blocks(1, 1), Int, 2, 2)
+        Dagger.spawn_datadeps() do
+            @stencil begin
+                dst[idx] = sum(@neighbors(src[idx], 1, Pad(pad_value)))
+            end
+        end
+        # On a 2x2 array every 3x3 window contains the whole array, and the rest
+        # of the window is padding. For dst[1,1] (P = padded cell):
+        #     P  P   P         5 5 5
+        #     P  a11 a12   =   5 1 1
+        #     P  a21 a22       5 1 1
+        # The other three outputs see the same cells with the padding on other
+        # sides, so each one sums
+        #     4 real cells (value 1) + 5 padded cells (value pad_value) = 29.
+        n_real = 4
+        n_padded = 3 * 3 - n_real
+        expected = fill(n_real * 1 + n_padded * pad_value, 2, 2)
+        @test collect(dst) == expected
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 79ba890d7..2e832d2ef 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -21,6 +21,7 @@ tests = [
     ("Array - LinearAlgebra - Cholesky", "array/linalg/cholesky.jl"),
     ("Array - LinearAlgebra - LU", "array/linalg/lu.jl"),
     ("Array - Random", "array/random.jl"),
+    ("Array - Stencils", "array/stencil.jl"),
     ("Caching", "cache.jl"),
     ("Disk Caching", "diskcaching.jl"),
     ("File IO", "file-io.jl"),