From a5db0af4526c7076ed74d3e7b76a538ea5ce1171 Mon Sep 17 00:00:00 2001
From: Julian P Samaroo <jpsamaroo@jpsamaroo.me>
Date: Fri, 9 Aug 2024 09:49:30 -0500
Subject: [PATCH 1/6] datadeps: Add at-stencil helper

---
 docs/make.jl           |   5 +-
 docs/src/stencils.jl   |  43 ++++++++
 src/Dagger.jl          |   2 +
 src/stencil.jl         | 220 +++++++++++++++++++++++++++++++++++++++++
 src/utils/haloarray.jl | 115 +++++++++++++++++++++
 5 files changed, 384 insertions(+), 1 deletion(-)
 create mode 100644 docs/src/stencils.jl
 create mode 100644 src/stencil.jl
 create mode 100644 src/utils/haloarray.jl

diff --git a/docs/make.jl b/docs/make.jl
index 8f1f97f5c..641d3865b 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -26,7 +26,10 @@ makedocs(;
         "Scopes" => "scopes.md",
         "Processors" => "processors.md",
         "Task Queues" => "task-queues.md",
-        "Datadeps" => "datadeps.md",
+        "Datadeps" => [
+            "Basics" => "datadeps.md",
+            "Stencils" => "stencils.md",
+        ],
         "Option Propagation" => "propagation.md",
         "Logging and Visualization" => [
             "Logging: Basics" => "logging.md",
diff --git a/docs/src/stencils.jl b/docs/src/stencils.jl
new file mode 100644
index 000000000..0f388d8a7
--- /dev/null
+++ b/docs/src/stencils.jl
@@ -0,0 +1,43 @@
+# Stencil Operations
+
+
+
+```julia
+N = 27
+nt = 3
+tiles = zeros(Blocks(N, N), Bool, N*nt, N*nt)
+outputs = zeros(Blocks(N, N), Bool, N*nt, N*nt)
+
+# Create fun initial state
+tiles[13, 14] = 1
+tiles[14, 14] = 1
+tiles[15, 14] = 1
+tiles[15, 15] = 1
+tiles[14, 16] = 1
+@view(tiles[(2N+1):3N, (2N+1):3N]) .= rand(Bool, N, N)
+
+import Dagger: @stencil, Wrap
+
+anim = @animate for _ in 1:niters
+    Dagger.spawn_datadeps() do
+        @stencil idx = tiles begin
+            outputs[idx] = begin
+                nhood = @neighbors(tiles[idx], 1, Wrap())
+                neighs = sum(nhood) - tiles[idx]
+                if tiles[idx] && neighs < 2
+                    0
+                elseif tiles[idx] && neighs > 3
+                    0
+                elseif !tiles[idx] && neighs == 3
+                    1
+                else
+                    tiles[idx]
+                end
+            end
+            tiles[idx] = outputs[idx]
+        end
+    end
+    heatmap(Int.(collect(outputs)))
+end
+path = mp4(anim; fps=5, show_msg=true).filename
+```
diff --git a/src/Dagger.jl b/src/Dagger.jl
index fd6395a4b..4b5cfc40b 100644
--- a/src/Dagger.jl
+++ b/src/Dagger.jl
@@ -76,6 +76,8 @@ include("sch/Sch.jl"); using .Sch
 
 # Data dependency task queue
 include("datadeps.jl")
+include("utils/haloarray.jl")
+include("stencil.jl")
 
 # Streaming
 include("stream.jl")
diff --git a/src/stencil.jl b/src/stencil.jl
new file mode 100644
index 000000000..37e085dd0
--- /dev/null
+++ b/src/stencil.jl
@@ -0,0 +1,220 @@
+# FIXME: Remove me
+const Read = In
+const Write = Out
+const ReadWrite = InOut
+
+function get_neighbor_edge(arr, dim, dir, dist)
+    if dir == -1
+        start_idx = CartesianIndex(ntuple(i -> i == dim ? (lastindex(arr, i) - dist + 1) : firstindex(arr, i), ndims(arr)))
+        stop_idx = CartesianIndex(ntuple(i -> i == dim ? lastindex(arr, i) : lastindex(arr, i), ndims(arr)))
+    elseif dir == 1
+        start_idx = CartesianIndex(ntuple(i -> i == dim ? firstindex(arr, i) : firstindex(arr, i), ndims(arr)))
+        stop_idx = CartesianIndex(ntuple(i -> i == dim ? (firstindex(arr, i) + dist - 1) : lastindex(arr, i), ndims(arr)))
+    end
+    return collect(@view arr[start_idx:stop_idx])
+end
+function get_neighbor_corner(chunk, corner_side, neigh_dist)
+    start_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? (lastindex(chunk, i) - neigh_dist + 1) : firstindex(chunk, i), ndims(chunk)))
+    stop_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? lastindex(chunk, i) : (firstindex(chunk, i) + neigh_dist - 1), ndims(chunk)))
+    return collect(@view chunk[start_idx:stop_idx])
+end
+function get_neighborhood_chunks(chunks, idx, neigh_dist, boundary)
+    chunk_dist = 1
+    # Get the center
+    accesses = Any[chunks[idx]]
+    # Get the edges
+    for dim in 1:ndims(chunks)
+        for dir in (-1, +1)
+            if dir == -1 && idx[dim] == firstindex(chunks, dim)
+                new_idx = idx + CartesianIndex(ntuple(i -> i == dim ? size(chunks, dim)-1 : 0, ndims(chunks)))
+            elseif dir == +1 && idx[dim] == lastindex(chunks, dim)
+                new_idx = idx - CartesianIndex(ntuple(i -> i == dim ? size(chunks, dim)-1 : 0, ndims(chunks)))
+            else
+                new_idx = idx + CartesianIndex(ntuple(i -> i == dim ? dir*chunk_dist : 0, ndims(chunks)))
+            end
+            chunk = chunks[new_idx]
+            push!(accesses, Dagger.@spawn get_neighbor_edge(chunk, dim, dir, neigh_dist))
+        end
+    end
+    # Get the corners
+    for corner_num in 1:(2^ndims(chunks))
+        corner_side = CartesianIndex(reverse(ntuple(ndims(chunks)) do i
+            ((corner_num-1) >> (((ndims(chunks) - i) + 1) - 1)) & 1
+        end))
+        corner_idx = CartesianIndex(ntuple(ndims(chunks)) do i
+            corner_shift = iszero(corner_side[i]) ? -1 : 1
+            return mod1(idx[i] + corner_shift, size(chunks, i))
+        end)
+        chunk = chunks[corner_idx]
+        push!(accesses, Dagger.@spawn get_neighbor_corner(chunk, corner_side, neigh_dist))
+    end
+    @assert length(accesses) == 1+2*ndims(chunks)+2^ndims(chunks) "Accesses mismatch: $(length(accesses))"
+    return accesses
+end
+function build_halo(neigh_dist, boundary, center::Array{T,N}, all_neighbors...) where {T,N}
+    # FIXME: Don't collect views
+    edges = collect.(all_neighbors[1:(2*N)])
+    corners = collect.(all_neighbors[((2^N)+1):end])
+    @assert length(edges) == 2*N && length(corners) == 2^N "Halo mismatch: edges=$(length(edges)) corners=$(length(corners))"
+    arr = HaloArray(center, (edges...,), (corners...,), ntuple(_->neigh_dist, N))
+    return arr
+end
+function load_neighborhood(arr::HaloArray{T,N}, idx, neigh_dist) where {T,N}
+    start_idx = idx - CartesianIndex(ntuple(_->neigh_dist, ndims(arr)))
+    stop_idx = idx + CartesianIndex(ntuple(_->neigh_dist, ndims(arr)))
+    # FIXME: Don't collect HaloArray view
+    return collect(@view arr[start_idx:stop_idx])
+end
+
+struct Wrap end
+boundary_init(::Wrap, arr, size) = similar(arr, eltype(arr), size)
+boundary_has_transition(::Wrap) = true
+boundary_transition(::Wrap, idx, size) = mod1(idx, size)
+
+struct Pad{T}
+    padval::T
+end
+boundary_init(::Pad{T}, arr, size) where T = Fill(padval, size)
+boundary_has_transition(::Pad) = false
+
+"""
+    @stencil idx in range begin body end
+
+Allows the specification of stencil operations within a `spawn_datadeps`
+region. The `idx` variable is used to iterate over `range`, which must be a
+`DArray`. An example usage may look like:
+
+```julia
+import Dagger: @stencil, Wrap
+
+A = zeros(Blocks(3, 3), Int, 9, 9)
+A[5, 5] = 1
+B = zeros(Blocks(3, 3), Int, 9, 9)
+Dagger.@spawn_datadeps() do
+    @stencil idx in A begin
+        # Sum values of all neighbors with self
+        A[idx] = sum(@neighbors(A[idx], 1, Wrap()))
+        # Decrement all values by 1
+        A[idx] -= 1
+        # Copy A to B
+        B[idx] = A[idx]
+    end
+end
+```
+
+Each expression within an `@stencil` region that accesses `A[idx]` is
+transformed into a set of tasks that operate on each chunk of `A`, and within
+each task, elements of that chunk of `A` can be accessed. Elements of other
+`DArray`s can also be accessed, such as `B[idx]`, so long as `B` has the same
+size, shape, and chunk layout as `A`.
+
+Additionally, the `@neighbors` macro can be used to access a neighborhood of
+values around `A[idx]`, at a configurable distance (in this case, 1 element
+distance) and with various kinds of boundary conditions (in this case, `Wrap()`
+specifies wrapping behavior on the boundaries). Neighborhoods are computed with
+respect to neighboring chunks as well - if a neighborhood would overflow from
+the current chunk into one or more neighboring chunks, values from those
+neighboring chunks will be included in the neighborhood.
+
+Note that, while `@stencil` may look like a `for` loop, it does not follow the
+same semantics; in particular, an expression within `@stencil` occurs "all at
+once" (across all indices) before the next expression occurs. This means that
+`A[idx] = sum(@neighbors(A[idx], 1, Wrap()))` will write the sum of
+neighbors for all `idx` values into `A[idx]` before `A[idx] -= 1` decrements
+the values of `A` by 1, and that occurs before any of the values are copied to `B`
+in `B[idx] = A[idx]`. Of course, pipelining and other optimizations may still
+occur, so long as they respect the sequential nature of `@stencil` (just like
+with other operations in `spawn_datadeps`).
+"""
+macro stencil(index_ex, orig_ex)
+    @assert @capture(index_ex, index_var_ = index_range_) || @capture(index_ex, index_var_ in index_range_) "Invalid indexing expression: $index_ex"
+    @assert Meta.isexpr(orig_ex, :block) "Invalid stencil block: $orig_ex"
+
+    # Collect access pattern information
+    inners = []
+    all_accessed_vars = Set{Symbol}()
+    for inner_ex in orig_ex.args
+        inner_ex isa LineNumberNode && continue
+        @assert @capture(inner_ex, write_ex_ = read_ex_) "Invalid update expression: $inner_ex"
+        @assert @capture(write_ex, write_var_[write_idx_]) "Update expression requires a write: $write_ex"
+        @assert write_idx == index_var "Can only write to $index_var: $write_ex"
+        accessed_vars = Set{Symbol}()
+        read_vars = Set{Symbol}()
+        neighborhoods = Dict{Symbol, Tuple{Any, Any}}()
+        push!(accessed_vars, write_var)
+        prewalk(read_ex) do read_inner_ex
+            if @capture(read_inner_ex, read_var_[read_idx_]) && read_idx == index_var
+                push!(accessed_vars, read_var)
+                push!(read_vars, read_var)
+            elseif @capture(read_inner_ex, @neighbors(read_var_[read_idx_], neigh_dist_, boundary_)) && read_idx == index_var
+                push!(accessed_vars, read_var)
+                push!(read_vars, read_var)
+                neighborhoods[read_var] = (neigh_dist, boundary)
+            end
+            return read_inner_ex
+        end
+        union!(all_accessed_vars, accessed_vars)
+        push!(inners, (;inner_ex, accessed_vars, write_var, write_idx, read_ex, read_vars, neighborhoods))
+    end
+
+    # Codegen update functions
+    final_ex = Expr(:block)
+    @gensym chunk_idx
+    for (;inner_ex, accessed_vars, write_var, write_idx, read_ex, read_vars, neighborhoods) in inners
+        # Generate a variable for chunk access
+        @gensym chunk_idx
+
+        # Generate function with transformed body
+        @gensym inner_index_var
+        new_inner_ex_body = prewalk(inner_ex) do old_inner_ex
+            if @capture(old_inner_ex, read_var_[read_idx_]) && read_idx == index_var
+                # Direct access
+                return :($read_var[$inner_index_var])
+            elseif @capture(old_inner_ex, @neighbors(read_var_[read_idx_], neigh_dist_, boundary_)) && read_idx == index_var
+                # Neighborhood access
+                return :($load_neighborhood($read_var, $inner_index_var, $neigh_dist))
+            end
+            return old_inner_ex
+        end
+        new_inner_ex = quote
+            for $inner_index_var in CartesianIndices($write_var)
+                $new_inner_ex_body
+            end
+        end
+        inner_fn = Expr(:->, Expr(:tuple, Expr(:parameters, write_var, read_vars...)), new_inner_ex)
+
+        # Generate @spawn call with appropriate vars and deps
+        deps_ex = Any[]
+        if write_var in read_vars
+            push!(deps_ex, Expr(:kw, write_var, :($ReadWrite($chunks($write_var)[$chunk_idx]))))
+        else
+            push!(deps_ex, Expr(:kw, write_var, :($Write($chunks($write_var)[$chunk_idx]))))
+        end
+        neighbor_copy_all_ex = Expr(:block)
+        for read_var in read_vars
+            if read_var in keys(neighborhoods)
+                # Generate a neighborhood copy operation
+                neigh_dist, boundary = neighborhoods[read_var]
+                deps_inner_ex = Expr(:block)
+                @gensym neighbor_copy_var
+                push!(neighbor_copy_all_ex.args, :($neighbor_copy_var = Dagger.@spawn $build_halo($neigh_dist, $boundary, map($Read, $get_neighborhood_chunks($chunks($read_var), $chunk_idx, $neigh_dist, $boundary))...)))
+                push!(deps_ex, Expr(:kw, read_var, :($Read($neighbor_copy_var))))
+            else
+                push!(deps_ex, Expr(:kw, read_var, :($Read($chunks($read_var)[$chunk_idx]))))
+            end
+        end
+        spawn_ex = :(Dagger.@spawn $inner_fn(;$(deps_ex...)))
+
+        # Generate loop
+        push!(final_ex.args, quote
+            for $chunk_idx in $CartesianIndices($chunks($index_range))
+                $neighbor_copy_all_ex
+                $spawn_ex
+            end
+        end)
+    end
+
+    @show final_ex
+
+    return esc(final_ex)
+end
diff --git a/src/utils/haloarray.jl b/src/utils/haloarray.jl
new file mode 100644
index 000000000..835131d7b
--- /dev/null
+++ b/src/utils/haloarray.jl
@@ -0,0 +1,115 @@
+# Define the HaloArray type with minimized halo storage
+struct HaloArray{T,N,E,C,A,EA,CA} <: AbstractArray{T,N}
+    center::A
+    edges::NTuple{E, EA}
+    corners::NTuple{C, CA}
+    halo_width::NTuple{N,Int}
+end
+
+# Helper function to create an empty HaloArray with minimized halo storage
+function HaloArray{T,N}(center_size::NTuple{N,Int}, halo_width::NTuple{N,Int}) where {T,N}
+    center = Array{T,N}(undef, center_size...)
+    edges = ntuple(2N) do i
+        prev_dims = center_size[1:(cld(i,2)-1)]
+        next_dims = center_size[(cld(i,2)+1):end]
+        return Array{T,N}(undef, prev_dims..., halo_width[cld(i,2)], next_dims...)
+    end
+    corners = ntuple(2^N) do i
+        return Array{T,N}(undef, halo_width)
+    end
+    return HaloArray{T,N,2N,2^N}(center, edges, corners, halo_width)
+end
+
+HaloArray(center::AT, edges::NTuple{E, EA}, corners::NTuple{C, CA}, halo_width::NTuple{N, Int}) where {T,N,AT<:AbstractArray{T,N},C,E,CA,EA} =
+    HaloArray{T,N,E,C,AT,EA,CA}(center, edges, corners, halo_width)
+
+Base.size(tile::HaloArray) = size(tile.center) .+ 2 .* tile.halo_width
+function Base.axes(tile::HaloArray{T,N,H}) where {T,N,H}
+    ntuple(N) do i
+        first_ind = 1 - tile.halo_width[i]
+        last_ind = size(tile.center, i) + tile.halo_width[i]
+        return first_ind:last_ind
+    end
+end
+function Base.similar(tile::HaloArray{T,N,H}, ::Type{T}, dims::NTuple{N,Int}) where {T,N,H}
+    center_size = dims
+    halo_width = tile.halo_width
+    return HaloArray{T,N,H}(center_size, halo_width)
+end
+function Base.copy(tile::HaloArray{T,N,H}) where {T,N,H}
+    center = copy(tile.center)
+    halo = ntuple(i->copy(tile.edges[i]), H)
+    halo_width = tile.halo_width
+    return HaloArray{T,N,H}(center, halo, halo_width)
+end
+
+# Define getindex for HaloArray
+function Base.getindex(tile::HaloArray{T,N}, I::Vararg{Int,N}) where {T,N}
+    checkbounds(tile, I...)
+    if all(1 .<= I .<= size(tile.center))
+        return tile.center[I...]
+    elseif !any(1 .<= I .<= size(tile.center))
+        # Corner
+        # N.B. Corner indexes are in binary, e.g. 0b01, 0b10, 0b11
+        corner_idx = sum(ntuple(i->(I[i] < 1 ? 0 : 1) * (2^(i-1)), N)) + 1
+        corner_offset = CartesianIndex(I) + CartesianIndex(ntuple(i->(I[i] < 1 ? tile.halo_width[i] : -size(tile.center, i)), N))
+        return tile.corners[corner_idx][corner_offset]
+    else
+        for d in 1:N
+            if I[d] < 1
+                halo_idx = (I[1:d-1]..., I[d] + tile.halo_width[d], I[d+1:end]...)
+                return tile.edges[(2*(d-1))+1][halo_idx...]
+            elseif I[d] > size(tile.center, d)
+                halo_idx = (I[1:d-1]..., I[d] - size(tile.center, d), I[d+1:end]...)
+                return tile.edges[(2*(d-1))+2][halo_idx...]
+            end
+        end
+    end
+    error("Index out of bounds")
+end
+
+# Define setindex! for HaloArray
+function Base.setindex!(tile::HaloArray{T,N}, value, I::Vararg{Int,N}) where {T,N}
+    checkbounds(tile, I...)
+    if all(1 .<= I .<= size(tile.center))
+        # Center
+        return tile.center[I...] = value
+    elseif !any(1 .<= I .<= size(tile.center))
+        # Corner
+        # N.B. Corner indexes are in binary, e.g. 0b01, 0b10, 0b11
+        corner_idx = sum(ntuple(i->(I[i] < 1 ? 0 : 1) * (2^(i-1)), N)) + 1
+        corner_offset = CartesianIndex(I) + CartesianIndex(ntuple(i->(I[i] < 1 ? tile.halo_width[i] : -size(tile.center, i)), N))
+        return tile.corners[corner_idx][corner_offset] = value
+    else
+        # Edge
+        for d in 1:N
+            if I[d] < 1
+                halo_idx = (I[1:d-1]..., I[d] + tile.halo_width[d], I[d+1:end]...)
+                return tile.edges[(2*(d-1))+1][halo_idx...] = value
+            elseif I[d] > size(tile.center, d)
+                halo_idx = (I[1:d-1]..., I[d] - size(tile.center, d), I[d+1:end]...)
+                return tile.edges[(2*(d-1))+2][halo_idx...] = value
+            end
+        end
+    end
+    error("Index out of bounds")
+end
+
+#=
+# Example usage
+center_size = (3, 5)
+halo_width = (1, 1)
+tile = HaloArray{Float64, 2}(center_size, halo_width)
+
+# Set values in the center and halo
+tile[2, 2] = 1.0
+tile[0, 2] = 2.0  # This should be in an edge
+tile[0, 0] = 3.0  # This should be in a corner
+tile[4, 6] = 4.0  # This should be in a corner
+
+# Get values from the center and halo
+println(tile[2, 2])  # 1.0
+println(tile[0, 2])  # 2.0
+println(tile[0, 0])  # 3.0
+println(tile[4, 6])  # 4.0
+=#

From 5b70c1788bbf8cb68cae8d37361b0fe60214acdf Mon Sep 17 00:00:00 2001
From: Julian P Samaroo <jpsamaroo@jpsamaroo.me>
Date: Mon, 12 Aug 2024 13:48:41 -0500
Subject: [PATCH 2/6] stencils: Add custom boundary conditions

---
 Project.toml   |  2 ++
 src/Dagger.jl  |  5 ++-
 src/stencil.jl | 86 ++++++++++++++++++++++++++++++++++++--------------
 3 files changed, 69 insertions(+), 24 deletions(-)

diff --git a/Project.toml b/Project.toml
index a89547522..4bb4d184e 100644
--- a/Project.toml
+++ b/Project.toml
@@ -7,6 +7,7 @@ Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
 DistributedNext = "fab6aee4-877b-4bac-a744-3eca44acbb6f"
+FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
 Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
@@ -51,6 +52,7 @@ DataFrames = "1"
 DataStructures = "0.18"
 DistributedNext = "1.0.0"
 Distributions = "0.25"
+FillArrays = "1.11.0"
 GraphViz = "0.2"
 Graphs = "1"
 JSON3 = "1"
diff --git a/src/Dagger.jl b/src/Dagger.jl
index 4b5cfc40b..d65cd4a92 100644
--- a/src/Dagger.jl
+++ b/src/Dagger.jl
@@ -32,6 +32,8 @@ import TimespanLogging: timespan_start, timespan_finish
 
 import Adapt
 
+import FillArrays: Fill
+
 # Preferences
 import Preferences: @load_preference, @set_preferences!
 
@@ -50,7 +52,8 @@ include("utils/dagdebug.jl")
 include("utils/locked-object.jl")
 include("utils/tasks.jl")
 
-import MacroTools: @capture
+import MacroTools: @capture, prewalk
+
 include("options.jl")
 include("processor.jl")
 include("threadproc.jl")
diff --git a/src/stencil.jl b/src/stencil.jl
index 37e085dd0..4e85af451 100644
--- a/src/stencil.jl
+++ b/src/stencil.jl
@@ -3,51 +3,71 @@ const Read = In
 const Write = Out
 const ReadWrite = InOut
 
-function get_neighbor_edge(arr, dim, dir, dist)
+function load_neighbor_edge(arr, dim, dir, neigh_dist)
     if dir == -1
-        start_idx = CartesianIndex(ntuple(i -> i == dim ? (lastindex(arr, i) - dist + 1) : firstindex(arr, i), ndims(arr)))
+        start_idx = CartesianIndex(ntuple(i -> i == dim ? (lastindex(arr, i) - neigh_dist + 1) : firstindex(arr, i), ndims(arr)))
         stop_idx = CartesianIndex(ntuple(i -> i == dim ? lastindex(arr, i) : lastindex(arr, i), ndims(arr)))
     elseif dir == 1
         start_idx = CartesianIndex(ntuple(i -> i == dim ? firstindex(arr, i) : firstindex(arr, i), ndims(arr)))
-        stop_idx = CartesianIndex(ntuple(i -> i == dim ? (firstindex(arr, i) + dist - 1) : lastindex(arr, i), ndims(arr)))
+        stop_idx = CartesianIndex(ntuple(i -> i == dim ? (firstindex(arr, i) + neigh_dist - 1) : lastindex(arr, i), ndims(arr)))
     end
     return collect(@view arr[start_idx:stop_idx])
 end
-function get_neighbor_corner(chunk, corner_side, neigh_dist)
-    start_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? (lastindex(chunk, i) - neigh_dist + 1) : firstindex(chunk, i), ndims(chunk)))
-    stop_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? lastindex(chunk, i) : (firstindex(chunk, i) + neigh_dist - 1), ndims(chunk)))
-    return collect(@view chunk[start_idx:stop_idx])
+function load_neighbor_corner(arr, corner_side, neigh_dist)
+    start_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? (lastindex(arr, i) - neigh_dist + 1) : firstindex(arr, i), ndims(arr)))
+    stop_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? lastindex(arr, i) : (firstindex(arr, i) + neigh_dist - 1), ndims(arr)))
+    return collect(@view arr[start_idx:stop_idx])
 end
-function get_neighborhood_chunks(chunks, idx, neigh_dist, boundary)
+function select_neighborhood_chunks(chunks, idx, neigh_dist, boundary)
+    @assert neigh_dist isa Integer && neigh_dist > 0 "Neighborhood distance must be an Integer greater than 0"
+
+    # FIXME: Depends on neigh_dist and chunk size
     chunk_dist = 1
     # Get the center
     accesses = Any[chunks[idx]]
+
     # Get the edges
     for dim in 1:ndims(chunks)
         for dir in (-1, +1)
-            if dir == -1 && idx[dim] == firstindex(chunks, dim)
-                new_idx = idx + CartesianIndex(ntuple(i -> i == dim ? size(chunks, dim)-1 : 0, ndims(chunks)))
-            elseif dir == +1 && idx[dim] == lastindex(chunks, dim)
-                new_idx = idx - CartesianIndex(ntuple(i -> i == dim ? size(chunks, dim)-1 : 0, ndims(chunks)))
+            new_idx = idx + CartesianIndex(ntuple(i -> i == dim ? dir*chunk_dist : 0, ndims(chunks)))
+            if is_past_boundary(size(chunks), new_idx)
+                if boundary_has_transition(boundary)
+                    new_idx = boundary_transition(boundary, new_idx, size(chunks))
+                else
+                    new_idx = idx
+                end
+                chunk = chunks[new_idx]
+                push!(accesses, Dagger.@spawn load_boundary_edge(boundary, chunk, dim, dir, neigh_dist))
             else
-                new_idx = idx + CartesianIndex(ntuple(i -> i == dim ? dir*chunk_dist : 0, ndims(chunks)))
+                chunk = chunks[new_idx]
+                push!(accesses, Dagger.@spawn load_neighbor_edge(chunk, dim, dir, neigh_dist))
             end
-            chunk = chunks[new_idx]
-            push!(accesses, Dagger.@spawn get_neighbor_edge(chunk, dim, dir, neigh_dist))
         end
     end
+
     # Get the corners
     for corner_num in 1:(2^ndims(chunks))
         corner_side = CartesianIndex(reverse(ntuple(ndims(chunks)) do i
             ((corner_num-1) >> (((ndims(chunks) - i) + 1) - 1)) & 1
         end))
-        corner_idx = CartesianIndex(ntuple(ndims(chunks)) do i
+        corner_new_idx = CartesianIndex(ntuple(ndims(chunks)) do i
             corner_shift = iszero(corner_side[i]) ? -1 : 1
-            return mod1(idx[i] + corner_shift, size(chunks, i))
+            return idx[i] + corner_shift
         end)
-        chunk = chunks[corner_idx]
-        push!(accesses, Dagger.@spawn get_neighbor_corner(chunk, corner_side, neigh_dist))
+        if is_past_boundary(size(chunks), corner_new_idx)
+            if boundary_has_transition(boundary)
+                corner_new_idx = boundary_transition(boundary, corner_new_idx, size(chunks))
+            else
+                corner_new_idx = idx
+            end
+            chunk = chunks[corner_new_idx]
+            push!(accesses, Dagger.@spawn load_boundary_corner(boundary, chunk, corner_side, neigh_dist))
+        else
+            chunk = chunks[corner_new_idx]
+            push!(accesses, Dagger.@spawn load_neighbor_corner(chunk, corner_side, neigh_dist))
+        end
     end
+
     @assert length(accesses) == 1+2*ndims(chunks)+2^ndims(chunks) "Accesses mismatch: $(length(accesses))"
     return accesses
 end
@@ -66,16 +86,36 @@ function load_neighborhood(arr::HaloArray{T,N}, idx, neigh_dist) where {T,N}
     return collect(@view arr[start_idx:stop_idx])
 end
 
+is_past_boundary(size, idx) = any(ntuple(i -> idx[i] < 1 || idx[i] > size[i], length(size)))
+
 struct Wrap end
-boundary_init(::Wrap, arr, size) = similar(arr, eltype(arr), size)
 boundary_has_transition(::Wrap) = true
-boundary_transition(::Wrap, idx, size) = mod1(idx, size)
+boundary_transition(::Wrap, idx, size) =
+    CartesianIndex(ntuple(i -> mod1(idx[i], size[i]), length(size)))
+load_boundary_edge(::Wrap, arr, dim, dir, neigh_dist) = load_neighbor_edge(arr, dim, dir, neigh_dist)
+load_boundary_corner(::Wrap, arr, corner_side, neigh_dist) = load_neighbor_corner(arr, corner_side, neigh_dist)
 
 struct Pad{T}
     padval::T
 end
-boundary_init(::Pad{T}, arr, size) where T = Fill(padval, size)
 boundary_has_transition(::Pad) = false
+function load_boundary_edge(pad::Pad, arr, dim, dir, neigh_dist)
+    if dir == -1
+        start_idx = CartesianIndex(ntuple(i -> i == dim ? (lastindex(arr, i) - neigh_dist + 1) : firstindex(arr, i), ndims(arr)))
+        stop_idx = CartesianIndex(ntuple(i -> i == dim ? lastindex(arr, i) : lastindex(arr, i), ndims(arr)))
+    elseif dir == 1
+        start_idx = CartesianIndex(ntuple(i -> i == dim ? firstindex(arr, i) : firstindex(arr, i), ndims(arr)))
+        stop_idx = CartesianIndex(ntuple(i -> i == dim ? (firstindex(arr, i) + neigh_dist - 1) : lastindex(arr, i), ndims(arr)))
+    end
+    edge_size = ntuple(i -> length(start_idx[i]:stop_idx[i]), ndims(arr))
+    return Fill(pad.padval, edge_size)
+end
+function load_boundary_corner(pad::Pad, arr, corner_side, neigh_dist)
+    start_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? (lastindex(arr, i) - neigh_dist + 1) : firstindex(arr, i), ndims(arr)))
+    stop_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? lastindex(arr, i) : (firstindex(arr, i) + neigh_dist - 1), ndims(arr)))
+    corner_size = ntuple(i -> length(start_idx[i]:stop_idx[i]), ndims(arr))
+    return Fill(pad.padval, corner_size)
+end
 
 """
     @stencil idx in range begin body end
@@ -197,7 +237,7 @@ macro stencil(index_ex, orig_ex)
                 neigh_dist, boundary = neighborhoods[read_var]
                 deps_inner_ex = Expr(:block)
                 @gensym neighbor_copy_var
-                push!(neighbor_copy_all_ex.args, :($neighbor_copy_var = Dagger.@spawn $build_halo($neigh_dist, $boundary, map($Read, $get_neighborhood_chunks($chunks($read_var), $chunk_idx, $neigh_dist, $boundary))...)))
+                push!(neighbor_copy_all_ex.args, :($neighbor_copy_var = Dagger.@spawn $build_halo($neigh_dist, $boundary, map($Read, $select_neighborhood_chunks($chunks($read_var), $chunk_idx, $neigh_dist, $boundary))...)))
                 push!(deps_ex, Expr(:kw, read_var, :($Read($neighbor_copy_var))))
             else
                 push!(deps_ex, Expr(:kw, read_var, :($Read($chunks($read_var)[$chunk_idx]))))

From a0b150231a78428f2e4333d919bdee6c776c899f Mon Sep 17 00:00:00 2001
From: Julian P Samaroo <jpsamaroo@jpsamaroo.me>
Date: Mon, 12 Aug 2024 14:13:58 -0500
Subject: [PATCH 3/6] stencils: Remove unnecessary index var and range

---
 docs/src/stencils.jl |  2 +-
 src/stencil.jl       | 28 ++++++++++++++--------------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/docs/src/stencils.jl b/docs/src/stencils.jl
index 0f388d8a7..3cf552dc1 100644
--- a/docs/src/stencils.jl
+++ b/docs/src/stencils.jl
@@ -20,7 +20,7 @@ import Dagger: @stencil, Wrap
 
 anim = @animate for _ in 1:niters
     Dagger.spawn_datadeps() do
-        @stencil idx = tiles begin
+        @stencil begin
             outputs[idx] = begin
                 nhood = @neighbors(tiles[idx], 1, Wrap())
                 neighs = sum(nhood) - tiles[idx]
diff --git a/src/stencil.jl b/src/stencil.jl
index 4e85af451..c292ace3f 100644
--- a/src/stencil.jl
+++ b/src/stencil.jl
@@ -131,7 +131,7 @@ A = zeros(Blocks(3, 3), Int, 9, 9)
 A[5, 5] = 1
 B = zeros(Blocks(3, 3), Int, 9, 9)
 Dagger.@spawn_datadeps() do
-    @stencil idx in A begin
+    @stencil begin
         # Sum values of all neighbors with self
         A[idx] = sum(@neighbors(A[idx], 1, Wrap()))
         # Decrement all values by 1
@@ -142,11 +142,12 @@ Dagger.@spawn_datadeps() do
 end
 ```
 
-Each expression within an `@stencil` region that accesses `A[idx]` is
-transformed into a set of tasks that operate on each chunk of `A`, and within
-each task, elements of that chunk of `A` can be accessed. Elements of other
-`DArray`s can also be accessed, such as `B[idx]`, so long as `B` has the same
-size, shape, and chunk layout as `A`.
+Each expression within an `@stencil` region that performs an in-place indexing
+expression like `A[idx] = ...` is transformed into a set of tasks that operate
+on each chunk of `A` or any other arrays specified as `A[idx]`, and within each
+task, elements of that chunk of `A` can be accessed. Elements of multiple
+`DArray`s can be accessed, such as `B[idx]`, so long as `B` has the same size,
+shape, and chunk layout as `A`.
 
 Additionally, the `@neighbors` macro can be used to access a neighborhood of
 values around `A[idx]`, at a configurable distance (in this case, 1 element
@@ -166,8 +167,7 @@ in `B[idx] = A[idx]`. Of course, pipelining and other optimizations may still
 occur, so long as they respect the sequential nature of `@stencil` (just like
 with other operations in `spawn_datadeps`).
 """
-macro stencil(index_ex, orig_ex)
-    @assert @capture(index_ex, index_var_ = index_range_) || @capture(index_ex, index_var_ in index_range_) "Invalid indexing expression: $index_ex"
+macro stencil(orig_ex)
     @assert Meta.isexpr(orig_ex, :block) "Invalid stencil block: $orig_ex"
 
     # Collect access pattern information
@@ -177,16 +177,16 @@ macro stencil(index_ex, orig_ex)
         inner_ex isa LineNumberNode && continue
         @assert @capture(inner_ex, write_ex_ = read_ex_) "Invalid update expression: $inner_ex"
         @assert @capture(write_ex, write_var_[write_idx_]) "Update expression requires a write: $write_ex"
-        @assert write_idx == index_var "Can only write to $index_var: $write_ex"
         accessed_vars = Set{Symbol}()
         read_vars = Set{Symbol}()
         neighborhoods = Dict{Symbol, Tuple{Any, Any}}()
         push!(accessed_vars, write_var)
         prewalk(read_ex) do read_inner_ex
-            if @capture(read_inner_ex, read_var_[read_idx_]) && read_idx == index_var
+            if @capture(read_inner_ex, read_var_[read_idx_]) && read_idx == write_idx
                 push!(accessed_vars, read_var)
                 push!(read_vars, read_var)
-            elseif @capture(read_inner_ex, @neighbors(read_var_[read_idx_], neigh_dist_, boundary_)) && read_idx == index_var
+            elseif @capture(read_inner_ex, @neighbors(read_var_[read_idx_], neigh_dist_, boundary_))
+                @assert read_idx == write_idx "Neighborhood access must be at the same index as the write: $read_inner_ex"
                 push!(accessed_vars, read_var)
                 push!(read_vars, read_var)
                 neighborhoods[read_var] = (neigh_dist, boundary)
@@ -207,10 +207,10 @@ macro stencil(index_ex, orig_ex)
         # Generate function with transformed body
         @gensym inner_index_var
         new_inner_ex_body = prewalk(inner_ex) do old_inner_ex
-            if @capture(old_inner_ex, read_var_[read_idx_]) && read_idx == index_var
+            if @capture(old_inner_ex, read_var_[read_idx_]) && read_idx == write_idx
                 # Direct access
                 return :($read_var[$inner_index_var])
-            elseif @capture(old_inner_ex, @neighbors(read_var_[read_idx_], neigh_dist_, boundary_)) && read_idx == index_var
+            elseif @capture(old_inner_ex, @neighbors(read_var_[read_idx_], neigh_dist_, boundary_))
                 # Neighborhood access
                 return :($load_neighborhood($read_var, $inner_index_var, $neigh_dist))
             end
@@ -247,7 +247,7 @@ macro stencil(index_ex, orig_ex)
 
         # Generate loop
         push!(final_ex.args, quote
-            for $chunk_idx in $CartesianIndices($chunks($index_range))
+            for $chunk_idx in $CartesianIndices($chunks($write_var))
                 $neighbor_copy_all_ex
                 $spawn_ex
             end

From 7db16406033c07bf034534c535c97eb2a636795c Mon Sep 17 00:00:00 2001
From: Julian P Samaroo <jpsamaroo@jpsamaroo.me>
Date: Thu, 15 Aug 2024 17:34:58 -0400
Subject: [PATCH 4/6] stencils: Add GPU support

---
 Project.toml           |  2 --
 src/Dagger.jl          |  2 --
 src/stencil.jl         | 59 +++++++++++++++++++++++++++---------------
 src/utils/haloarray.jl | 39 +++++++---------------------
 4 files changed, 48 insertions(+), 54 deletions(-)

diff --git a/Project.toml b/Project.toml
index 4bb4d184e..a89547522 100644
--- a/Project.toml
+++ b/Project.toml
@@ -7,7 +7,6 @@ Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
 DistributedNext = "fab6aee4-877b-4bac-a744-3eca44acbb6f"
-FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
 Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
@@ -52,7 +51,6 @@ DataFrames = "1"
 DataStructures = "0.18"
 DistributedNext = "1.0.0"
 Distributions = "0.25"
-FillArrays = "1.11.0"
 GraphViz = "0.2"
 Graphs = "1"
 JSON3 = "1"
diff --git a/src/Dagger.jl b/src/Dagger.jl
index d65cd4a92..3a76cb3bc 100644
--- a/src/Dagger.jl
+++ b/src/Dagger.jl
@@ -32,8 +32,6 @@ import TimespanLogging: timespan_start, timespan_finish
 
 import Adapt
 
-import FillArrays: Fill
-
 # Preferences
 import Preferences: @load_preference, @set_preferences!
 
diff --git a/src/stencil.jl b/src/stencil.jl
index c292ace3f..7c4ab99d2 100644
--- a/src/stencil.jl
+++ b/src/stencil.jl
@@ -11,12 +11,13 @@ function load_neighbor_edge(arr, dim, dir, neigh_dist)
         start_idx = CartesianIndex(ntuple(i -> i == dim ? firstindex(arr, i) : firstindex(arr, i), ndims(arr)))
         stop_idx = CartesianIndex(ntuple(i -> i == dim ? (firstindex(arr, i) + neigh_dist - 1) : lastindex(arr, i), ndims(arr)))
     end
-    return collect(@view arr[start_idx:stop_idx])
+    # FIXME: Don't collect
+    return move(thunk_processor(), collect(@view arr[start_idx:stop_idx]))
 end
 function load_neighbor_corner(arr, corner_side, neigh_dist)
     start_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? (lastindex(arr, i) - neigh_dist + 1) : firstindex(arr, i), ndims(arr)))
     stop_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? lastindex(arr, i) : (firstindex(arr, i) + neigh_dist - 1), ndims(arr)))
-    return collect(@view arr[start_idx:stop_idx])
+    return move(thunk_processor(), collect(@view arr[start_idx:stop_idx]))
 end
 function select_neighborhood_chunks(chunks, idx, neigh_dist, boundary)
     @assert neigh_dist isa Integer && neigh_dist > 0 "Neighborhood distance must be an Integer greater than 0"
@@ -71,19 +72,30 @@ function select_neighborhood_chunks(chunks, idx, neigh_dist, boundary)
     @assert length(accesses) == 1+2*ndims(chunks)+2^ndims(chunks) "Accesses mismatch: $(length(accesses))"
     return accesses
 end
-function build_halo(neigh_dist, boundary, center::Array{T,N}, all_neighbors...) where {T,N}
-    # FIXME: Don't collect views
-    edges = collect.(all_neighbors[1:(2*N)])
-    corners = collect.(all_neighbors[((2^N)+1):end])
+function build_halo(neigh_dist, boundary, center, all_neighbors...)
+    N = ndims(center)
+    edges = all_neighbors[1:(2*N)]
+    corners = all_neighbors[((2^N)+1):end]
     @assert length(edges) == 2*N && length(corners) == 2^N "Halo mismatch: edges=$(length(edges)) corners=$(length(corners))"
-    arr = HaloArray(center, (edges...,), (corners...,), ntuple(_->neigh_dist, N))
-    return arr
+    return HaloArray(center, (edges...,), (corners...,), ntuple(_->neigh_dist, N))
 end
-function load_neighborhood(arr::HaloArray{T,N}, idx, neigh_dist) where {T,N}
+function load_neighborhood(arr::HaloArray{T,N}, idx) where {T,N}
+    @assert all(arr.halo_width .== arr.halo_width[1])
+    neigh_dist = arr.halo_width[1]
     start_idx = idx - CartesianIndex(ntuple(_->neigh_dist, ndims(arr)))
     stop_idx = idx + CartesianIndex(ntuple(_->neigh_dist, ndims(arr)))
-    # FIXME: Don't collect HaloArray view
-    return collect(@view arr[start_idx:stop_idx])
+    return @view arr[start_idx:stop_idx]
+end
+function inner_stencil!(f, output, read_vars)
+    processor = thunk_processor()
+    inner_stencil_proc!(processor, f, output, read_vars)
+end
+# Non-KA (for CPUs)
+function inner_stencil_proc!(::ThreadProc, f, output, read_vars)
+    for idx in CartesianIndices(output)
+        f(idx, output, read_vars)
+    end
+    return
 end
 
 is_past_boundary(size, idx) = any(ntuple(i -> idx[i] < 1 || idx[i] > size[i], length(size)))
@@ -108,17 +120,19 @@ function load_boundary_edge(pad::Pad, arr, dim, dir, neigh_dist)
         stop_idx = CartesianIndex(ntuple(i -> i == dim ? (firstindex(arr, i) + neigh_dist - 1) : lastindex(arr, i), ndims(arr)))
     end
     edge_size = ntuple(i -> length(start_idx[i]:stop_idx[i]), ndims(arr))
-    return Fill(pad.padval, edge_size)
+    # FIXME: return Fill(pad.padval, edge_size)
+    return move(thunk_processor(), fill(pad.padval, edge_size))
 end
 function load_boundary_corner(pad::Pad, arr, corner_side, neigh_dist)
     start_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? (lastindex(arr, i) - neigh_dist + 1) : firstindex(arr, i), ndims(arr)))
     stop_idx = CartesianIndex(ntuple(i -> corner_side[i] == 0 ? lastindex(arr, i) : (firstindex(arr, i) + neigh_dist - 1), ndims(arr)))
     corner_size = ntuple(i -> length(start_idx[i]:stop_idx[i]), ndims(arr))
-    return Fill(pad.padval, corner_size)
+    # FIXME: return Fill(pad.padval, corner_size)
+    return move(thunk_processor(), fill(pad.padval, corner_size))
 end
 
 """
-    @stencil idx in range begin body end
+    @stencil begin body end
 
 Allows the specification of stencil operations within a `spawn_datadeps`
 region. The `idx` variable is used to iterate over `range`, which must be a
@@ -205,21 +219,25 @@ macro stencil(orig_ex)
         @gensym chunk_idx
 
         # Generate function with transformed body
-        @gensym inner_index_var
+        @gensym inner_vars inner_index_var
         new_inner_ex_body = prewalk(inner_ex) do old_inner_ex
             if @capture(old_inner_ex, read_var_[read_idx_]) && read_idx == write_idx
                 # Direct access
-                return :($read_var[$inner_index_var])
+                if read_var == write_var
+                    return :($write_var[$inner_index_var])
+                else
+                    return :($inner_vars.$read_var[$inner_index_var])
+                end
             elseif @capture(old_inner_ex, @neighbors(read_var_[read_idx_], neigh_dist_, boundary_))
                 # Neighborhood access
-                return :($load_neighborhood($read_var, $inner_index_var, $neigh_dist))
+                return :($load_neighborhood($inner_vars.$read_var, $inner_index_var))
             end
             return old_inner_ex
         end
+        new_inner_f = :(($inner_index_var, $write_var, $inner_vars)->$new_inner_ex_body)
         new_inner_ex = quote
-            for $inner_index_var in CartesianIndices($write_var)
-                $new_inner_ex_body
-            end
+            $inner_vars = (;$(read_vars...))
+            $inner_stencil!($new_inner_f, $write_var, $inner_vars)
         end
         inner_fn = Expr(:->, Expr(:tuple, Expr(:parameters, write_var, read_vars...)), new_inner_ex)
 
@@ -254,7 +272,6 @@ macro stencil(orig_ex)
         end)
     end
 
-    @show final_ex
 
     return esc(final_ex)
 end
diff --git a/src/utils/haloarray.jl b/src/utils/haloarray.jl
index 835131d7b..2e26ed1cf 100644
--- a/src/utils/haloarray.jl
+++ b/src/utils/haloarray.jl
@@ -1,8 +1,8 @@
 # Define the HaloArray type with minimized halo storage
-struct HaloArray{T,N,E,C,A,EA,CA} <: AbstractArray{T,N}
+struct HaloArray{T,N,E,C,A,EAT<:Tuple,CAT<:Tuple} <: AbstractArray{T,N}
     center::A
-    edges::NTuple{E, EA}
-    corners::NTuple{C, CA}
+    edges::EAT
+    corners::CAT
     halo_width::NTuple{N,Int}
 end
 
@@ -17,11 +17,11 @@ function HaloArray{T,N}(center_size::NTuple{N,Int}, halo_width::NTuple{N,Int}) w
     corners = ntuple(2^N) do i
         return Array{T,N}(undef, halo_width)
     end
-    return HaloArray{T,N,2N,2^N}(center, edges, corners, halo_width)
+    return HaloArray(center, edges, corners, halo_width)
 end
 
-HaloArray(center::AT, edges::NTuple{E, EA}, corners::NTuple{C, CA}, halo_width::NTuple{N, Int}) where {T,N,AT<:AbstractArray{T,N},C,E,CA,EA} =
-    HaloArray{T,N,E,C,AT,EA,CA}(center, edges, corners, halo_width)
+HaloArray(center::AT, edges::EAT, corners::CAT, halo_width::NTuple{N, Int}) where {T,N,AT<:AbstractArray{T,N},CAT<:Tuple,EAT<:Tuple} =
+    HaloArray{T,N,length(edges),length(corners),AT,EAT,CAT}(center, edges, corners, halo_width)
 
 Base.size(tile::HaloArray) = size(tile.center) .+ 2 .* tile.halo_width
 function Base.axes(tile::HaloArray{T,N,H}) where {T,N,H}
@@ -57,10 +57,10 @@ function Base.getindex(tile::HaloArray{T,N}, I::Vararg{Int,N}) where {T,N}
     else
         for d in 1:N
             if I[d] < 1
-                halo_idx = (I[1:d-1]..., I[d] + tile.halo_width[d], I[d+1:end]...)
+                halo_idx = ntuple(i->i == d ? I[i] + tile.halo_width[i] : I[i], N)
                 return tile.edges[(2*(d-1))+1][halo_idx...]
             elseif I[d] > size(tile.center, d)
-                halo_idx = (I[1:d-1]..., I[d] - size(tile.center, d), I[d+1:end]...)
+                halo_idx = ntuple(i->i == d ? I[i] - size(tile.center, d) : I[i], N)
                 return tile.edges[(2*(d-1))+2][halo_idx...]
             end
         end
@@ -84,32 +84,13 @@ function Base.setindex!(tile::HaloArray{T,N}, value, I::Vararg{Int,N}) where {T,
         # Edge
         for d in 1:N
             if I[d] < 1
-                halo_idx = (I[1:d-1]..., I[d] + tile.halo_width[d], I[d+1:end]...)
+                halo_idx = ntuple(i->i == d ? I[i] + tile.halo_width[i] : I[i], N)
                 return tile.edges[(2*(d-1))+1][halo_idx...] = value
             elseif I[d] > size(tile.center, d)
-                halo_idx = (I[1:d-1]..., I[d] - size(tile.center, d), I[d+1:end]...)
+                halo_idx = ntuple(i->i == d ? I[i] - size(tile.center, d) : I[i], N)
                 return tile.edges[(2*(d-1))+2][halo_idx...] = value
             end
         end
     end
     error("Index out of bounds")
 end
-
-#=
-# Example usage
-center_size = (3, 5)
-halo_width = (1, 1)
-tile = HaloArray{Float64, 2}(center_size, halo_width)
-
-# Set values in the center and halo
-tile[2, 2] = 1.0
-tile[0, 2] = 2.0  # This should be in an edge
-tile[0, 0] = 3.0  # This should be in a corner
-tile[4, 6] = 4.0  # This should be in a corner
-
-# Get values from the center and halo
-println(tile[2, 2])  # 1.0
-println(tile[0, 2])  # 2.0
-println(tile[0, 0])  # 3.0
-println(tile[4, 6])  # 4.0
-=#

From 386dfd88cb73574a401a0eb4726506b7d4bd3fe2 Mon Sep 17 00:00:00 2001
From: Julian P Samaroo <jpsamaroo@jpsamaroo.me>
Date: Mon, 19 Aug 2024 10:50:47 -0400
Subject: [PATCH 5/6] scopes/DArray: Prevent GPU running setindex!

---
 src/array/indexing.jl |  3 ++-
 src/scopes.jl         | 21 ++++++++++++++++-----
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/array/indexing.jl b/src/array/indexing.jl
index 69725eb7a..778e15a17 100644
--- a/src/array/indexing.jl
+++ b/src/array/indexing.jl
@@ -127,7 +127,8 @@ function Base.setindex!(A::DArray{T,N}, value, idx::NTuple{N,Int}) where {T,N}
     # Set the value
     part = A.chunks[part_idx...]
     space = memory_space(part)
-    scope = Dagger.scope(worker=root_worker_id(space))
+    # FIXME: Do this correctly w.r.t memory space of part
+    scope = Dagger.scope(worker=root_worker_id(space), threads=:)
     return fetch(Dagger.@spawn scope=scope setindex!(part, value, offset_idx...))
 end
 Base.setindex!(A::DArray, value, idx::Integer...) =
diff --git a/src/scopes.jl b/src/scopes.jl
index 834993c9f..29badbe70 100644
--- a/src/scopes.jl
+++ b/src/scopes.jl
@@ -325,13 +325,20 @@ function to_scope(sc::NamedTuple)
     else
         nothing
     end
+    all_threads = false
     threads = if haskey(sc, :thread)
         Int[sc.thread]
     elseif haskey(sc, :threads)
-        Int[sc.threads...]
+        if sc.threads == Colon()
+            all_threads = true
+            nothing
+        else
+            Int[sc.threads...]
+        end
     else
         nothing
     end
+    want_threads = all_threads || threads !== nothing
 
     # Simple cases
     if workers !== nothing && threads !== nothing
@@ -341,18 +348,22 @@ function to_scope(sc::NamedTuple)
         end
         return simplified_union_scope(subscopes)
     elseif workers !== nothing && threads === nothing
-        subscopes = AbstractScope[ProcessScope(w) for w in workers]
-        return simplified_union_scope(subscopes)
+        subscopes = simplified_union_scope(AbstractScope[ProcessScope(w) for w in workers])
+        if all_threads
+            return constrain(subscopes, ProcessorTypeScope(ThreadProc))
+        else
+            return subscopes
+        end
     end
 
     # More complex cases that require querying the cluster
     # FIXME: Use per-field scope taint
     if workers === nothing
-        workers = procs()
+        workers = map(p->p.pid, filter(p->p isa OSProc, procs(Dagger.Sch.eager_context())))
     end
     subscopes = AbstractScope[]
     for w in workers
-        if threads === nothing
+        if threads === nothing && want_threads
             threads = map(c->c.tid,
                           filter(c->c isa ThreadProc,
                                  collect(children(OSProc(w)))))

From eb67881a6ca5016682bc49d47fe17f068e6bfdec Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
 <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Tue, 1 Jul 2025 18:27:17 +0000
Subject: [PATCH 6/6] Add Pad example to stencil quickstart

- Added a new subsection to "Quickstart: Stencil Operations" in `docs/src/index.md` demonstrating the `Pad(value)` boundary condition.
---
 docs/src/index.md     |  96 ++++++++++++++++++
 docs/src/stencils.jl  |  43 ---------
 docs/src/stencils.md  | 220 ++++++++++++++++++++++++++++++++++++++++++
 test/array/stencil.jl | 122 +++++++++++++++++++++++
 test/runtests.jl      |   1 +
 5 files changed, 439 insertions(+), 43 deletions(-)
 delete mode 100644 docs/src/stencils.jl
 create mode 100644 docs/src/stencils.md
 create mode 100644 test/array/stencil.jl

diff --git a/docs/src/index.md b/docs/src/index.md
index f98e715c1..df66a95f2 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -361,6 +361,40 @@ DA = rand(Blocks(32, 32), 256, 128)
 collect(DA) # returns a `Matrix{Float64}`
 ```
 
+-----
+
+## Quickstart: Stencil Operations
+
+Dagger's `@stencil` macro allows for easy specification of stencil operations on `DArray`s, often used in simulations and image processing. These operations typically involve updating an element based on the values of its neighbors.
+
+For more details: [Stencil Operations](@ref)
+
+### Applying a Simple Stencil
+
+Here's how to apply a stencil that averages each element with its immediate neighbors, using a `Wrap` boundary condition (where edges wrap around).
+
+```julia
+using Dagger
+import Dagger: @stencil, Wrap
+
+# Create a 5x5 DArray, partitioned into 2x2 blocks
+A = Dagger.rand(Blocks(2, 2), Int, 5, 5)
+B = Dagger.zeros(Blocks(2,2), Float64, 5, 5)
+
+Dagger.spawn_datadeps() do
+    @stencil begin
+        # For each element in A, calculate the sum of its 3x3 neighborhood
+        # (including itself) and store the average in B.
+        # Values outside the array bounds are determined by Wrap().
+        B[idx] = sum(@neighbors(A[idx], 1, Wrap())) / 9.0
+    end
+end
+
+# B now contains the averaged values.
+# You can inspect it with collect(B)
+```
+In this example, `idx` refers to the coordinates of each element being processed. `@neighbors(A[idx], 1, Wrap())` fetches the 3x3 neighborhood around `A[idx]`. The `1` indicates a distance of 1 from the central element, and `Wrap()` specifies the boundary behavior.
+
 ## Quickstart: Datadeps
 
 Datadeps is a feature in Dagger.jl that facilitates parallelism control within designated regions, allowing tasks to write to their arguments while ensuring dependencies are respected.
@@ -412,6 +446,68 @@ Dagger.@spawn copyto!(C, X)
 
 In contrast to the previous example, here, the tasks are executed without argument annotations. As a result, there is a possibility of the `copyto!` task being executed before the `sort!` task, leading to unexpected results in the output array `C`.
 
+-----
+
+## Quickstart: Stencil Operations
+
+Dagger's `@stencil` macro allows for easy specification of stencil operations on `DArray`s, often used in simulations and image processing. These operations typically involve updating an element based on the values of its neighbors.
+
+For more details: [Stencil Operations](@ref)
+
+### Applying a Simple Stencil
+
+Here's how to apply a stencil that averages each element with its immediate neighbors, using a `Wrap` boundary condition (where edges wrap around).
+
+```julia
+using Dagger
+import Dagger: @stencil, Wrap
+
+# Create a 5x5 DArray, partitioned into 2x2 blocks
+A = Dagger.rand(Blocks(2, 2), Int, 5, 5)
+B = Dagger.zeros(Blocks(2,2), Float64, 5, 5)
+
+Dagger.spawn_datadeps() do
+    @stencil begin
+        # For each element in A, calculate the sum of its 3x3 neighborhood
+        # (including itself) and store the average in B.
+        # Values outside the array bounds are determined by Wrap().
+        B[idx] = sum(@neighbors(A[idx], 1, Wrap())) / 9.0
+    end
+end
+
+# B now contains the averaged values.
+# You can inspect it with collect(B)
+```
+In this example, `idx` refers to the coordinates of each element being processed. `@neighbors(A[idx], 1, Wrap())` fetches the 3x3 neighborhood around `A[idx]`. The `1` indicates a distance of 1 from the central element, and `Wrap()` specifies the boundary behavior.
+
+### Using `Pad` for Boundary Conditions
+
+Alternatively, `Pad(value)` can be used to fill out-of-bounds accesses with a specific value.
+
+```julia
+import Dagger: Pad
+
+# Create a 4x4 DArray
+C = ones(Blocks(2, 2), Int, 4, 4)
+D = zeros(Blocks(2, 2), Int, 4, 4)
+
+Dagger.spawn_datadeps() do
+    @stencil begin
+        # Sum neighbors, padding with 0 for out-of-bounds accesses
+        D[idx] = sum(@neighbors(C[idx], 1, Pad(0)))
+    end
+end
+
+# D will now contain sums where boundary elements used 0 for padding.
+# For example, D[1,1] (a corner) would sum C[1,1], C[1,2], C[2,1], C[2,2]
+# and 5 zeros from padding, resulting in a sum of 4 if all C elements are 1.
+# collect(D) would be:
+#  4  6  6  4
+#  6  9  9  6
+#  6  9  9  6
+#  4  6  6  4
+```
+
 ## Quickstart: Streaming
 
 Dagger.jl provides a streaming API that allows you to process data in a streaming fashion, where data is processed as it becomes available, rather than waiting for the entire dataset to be loaded into memory.
diff --git a/docs/src/stencils.jl b/docs/src/stencils.jl
deleted file mode 100644
index 3cf552dc1..000000000
--- a/docs/src/stencils.jl
+++ /dev/null
@@ -1,43 +0,0 @@
-# Stencil Operations
-
-
-
-```julia
-N = 27
-nt = 3
-tiles = zeros(Blocks(N, N), Bool, N*nt, N*nt)
-outputs = zeros(Blocks(N, N), Bool, N*nt, N*nt)
-
-# Create fun initial state
-tiles[13, 14] = 1
-tiles[14, 14] = 1
-tiles[15, 14] = 1
-tiles[15, 15] = 1
-tiles[14, 16] = 1
-@view(tiles[(2N+1):3N, (2N+1):3N]) .= rand(Bool, N, N)
-
-import Dagger: @stencil, Wrap
-
-anim = @animate for _ in 1:niters
-    Dagger.spawn_datadeps() do
-        @stencil begin
-            outputs[idx] = begin
-                nhood = @neighbors(tiles[idx], 1, Wrap())
-                neighs = sum(nhood) - tiles[idx]
-                if tiles[idx] && neighs < 2
-                    0
-                elseif tiles[idx] && neighs > 3
-                    0
-                elseif !tiles[idx] && neighs == 3
-                    1
-                else
-                    tiles[idx]
-                end
-            end
-            tiles[idx] = outputs[idx]
-        end
-    end
-    heatmap(Int.(collect(outputs)))
-end
-path = mp4(anim; fps=5, show_msg=true).filename
-```
diff --git a/docs/src/stencils.md b/docs/src/stencils.md
new file mode 100644
index 000000000..df4db912e
--- /dev/null
+++ b/docs/src/stencils.md
@@ -0,0 +1,220 @@
+# Stencil Operations
+
+The `@stencil` macro in Dagger.jl provides a convenient way to perform stencil computations on `DArray`s. It operates within a `Dagger.spawn_datadeps()` block and allows you to define operations that apply to each element of a `DArray`, potentially considering its neighbors.
+
+## Basic Usage
+
+The fundamental structure of a `@stencil` block involves iterating over an implicit index, `idx`, which represents the coordinates of an element in the processed `DArray`s.
+
+```julia
+using Dagger
+import Dagger: @stencil, Wrap, Pad
+
+# Initialize a DArray
+A = zeros(Blocks(2, 2), Int, 4, 4)
+
+Dagger.spawn_datadeps() do
+    @stencil begin
+        A[idx] = 1 # Assign 1 to every element of A
+    end
+end
+
+@assert all(collect(A) .== 1)
+```
+
+In this example, `A[idx] = 1` is executed for each chunk of `A`. The `idx` variable corresponds to the indices within each chunk.
+
+## Neighborhood Access with `@neighbors`
+
+The true power of stencils comes from accessing neighboring elements. The `@neighbors` macro facilitates this.
+
+`@neighbors(array[idx], distance, boundary_condition)`
+
+- `array[idx]`: The array and current index from which to find neighbors.
+- `distance`: An integer specifying the extent of the neighborhood (e.g., `1` for a 3x3 neighborhood in 2D).
+- `boundary_condition`: Defines how to handle accesses beyond the array boundaries. Common conditions are:
+    - `Wrap()`: Wraps around to the other side of the array.
+    - `Pad(value)`: Pads with a specified `value`.
+
+### Example: Averaging Neighbors with `Wrap`
+
+```julia
+# Initialize a DArray
+A = ones(Blocks(1, 1), Int, 3, 3)
+A[2,2] = 10 # Central element has a different value
+B = zeros(Blocks(1, 1), Float64, 3, 3)
+
+Dagger.spawn_datadeps() do
+    @stencil begin
+        # Calculate the average of the 3x3 neighborhood (including the center)
+        B[idx] = sum(@neighbors(A[idx], 1, Wrap())) / 9.0
+    end
+end
+
+# Manually calculate expected B for verification
+expected_B = zeros(Float64, 3, 3)
+A_collected = collect(A)
+for r in 1:3, c in 1:3
+    local_sum = 0.0
+    for dr in -1:1, dc in -1:1
+        nr, nc = mod1(r+dr, 3), mod1(c+dc, 3)
+        local_sum += A_collected[nr, nc]
+    end
+    expected_B[r,c] = local_sum / 9.0
+end
+
+@assert collect(B) ≈ expected_B
+```
+
+### Example: Convolution with `Pad`
+
+```julia
+# Initialize a DArray
+A = ones(Blocks(2, 2), Int, 4, 4)
+B = zeros(Blocks(2, 2), Int, 4, 4)
+
+Dagger.spawn_datadeps() do
+    @stencil begin
+        B[idx] = sum(@neighbors(A[idx], 1, Pad(0))) # Pad with 0
+    end
+end
+
+# Expected result for a 3x3 sum filter with zero padding
+expected_B_padded = [
+    4 6 6 4;
+    6 9 9 6;
+    6 9 9 6;
+    4 6 6 4
+]
+@assert collect(B) == expected_B_padded
+```
+
+## Sequential Semantics
+
+Expressions within a `@stencil` block are executed sequentially in terms of their effect on the data. This means that the result of one statement is visible to the subsequent statements, as if they were applied "all at once" across all indices before the next statement begins.
+
+```julia
+A = zeros(Blocks(2, 2), Int, 4, 4)
+B = zeros(Blocks(2, 2), Int, 4, 4)
+
+Dagger.spawn_datadeps() do
+    @stencil begin
+        A[idx] = idx[1] + idx[2]  # First, A is filled based on coordinates
+        B[idx] = A[idx] * 2       # Then, B is computed using the new values of A
+    end
+end
+
+expected_A = [(r+c) for r in 1:4, c in 1:4]
+expected_B_seq = expected_A .* 2
+
+@assert collect(A) == expected_A
+@assert collect(B) == expected_B_seq
+```
+
+## Operations on Multiple `DArray`s
+
+You can read from and write to multiple `DArray`s within a single `@stencil` block, provided they have compatible chunk structures.
+
+```julia
+A = ones(Blocks(1, 1), Int, 2, 2)
+B_multi = Dagger.distribute(fill(2, 2, 2), Blocks(1, 1)) # 2x2 DArray with every element equal to 2
+C = zeros(Blocks(1, 1), Int, 2, 2)
+
+Dagger.spawn_datadeps() do
+    @stencil begin
+        C[idx] = A[idx] + B_multi[idx] # Element-wise sum of the two input arrays
+    end
+end
+@assert all(collect(C) .== 3)
+```
+
+## Example: Game of Life
+
+The following demonstrates a more complex example: Conway's Game of Life.
+
+```julia
+# Ensure Plots and other necessary packages are available for the example
+# using Plots
+
+N = 27 # Size of one dimension of a tile
+nt = 3 # Number of tiles in each dimension (results in nt x nt grid of tiles)
+niters = 10 # Number of iterations for the animation
+
+tiles = zeros(Blocks(N, N), Bool, N*nt, N*nt)
+outputs = zeros(Blocks(N, N), Bool, N*nt, N*nt)
+
+# Create a fun initial state (e.g., a glider and some random noise)
+tiles[13, 14] = true
+tiles[14, 14] = true
+tiles[15, 14] = true
+tiles[15, 15] = true # Corrected glider part
+tiles[14, 16] = true
+# Add some random noise in one of the tiles
+# Make sure to use Dagger-compatible assignment if you were to modify chunks directly
+# For simplicity, direct array indexing is used here for initial setup.
+rand_tile_data = rand(Bool, N, N)
+# To assign this to a specific block, you'd typically work with chunks,
+# but for initial setup, direct indexing on the collected array or careful DArray construction is easier.
+# For this example, we'll simplify and assume direct modification is for setup.
+# A Dagger-idiomatic way for partial modification might involve map! or similar.
+# Here, we just modify the underlying array before it's heavily used by Dagger tasks if possible,
+# or use Dagger operations.
+# For collected view for setup:
+temp_tiles = collect(tiles) # This collect is fine for initial setup visualization/modification
+temp_tiles[(2N+1):3N, (2N+1):3N] .= rand_tile_data
+tiles = Dagger.distribute(temp_tiles, Blocks(N,N)) # Use distribute to create DArray from existing array
+
+
+# The animation part requires a graphical environment.
+# If running in a headless environment, you might comment out the @animate macro
+# and inspect `outputs` programmatically.
+# anim = @animate for _ in 1:niters
+#     Dagger.spawn_datadeps() do
+#         @stencil begin
+#             outputs[idx] = begin
+#                 nhood = @neighbors(tiles[idx], 1, Wrap())
+#                 live_neighbors = sum(nhood) - tiles[idx] # Subtract self if it was counted
+#                 if tiles[idx] # If current cell is alive
+#                     if live_neighbors < 2 || live_neighbors > 3
+#                         false # Dies by underpopulation or overpopulation
+#                     else
+#                         true  # Survives
+#                     end
+#                 else # If current cell is dead
+#                     if live_neighbors == 3
+#                         true  # Becomes alive by reproduction
+#                     else
+#                         false # Stays dead
+#                     end
+#                 end
+#             end
+#             tiles[idx] = outputs[idx] # Update tiles for the next iteration
+#         end
+#     end
+#     # heatmap(Int.(collect(outputs))) # Visualize (requires Plots.jl)
+# end
+# path = mp4(anim; fps=5, show_msg=true).filename # Save animation (requires Plots.jl)
+
+# For testing without animation:
+# Execute one iteration:
+Dagger.spawn_datadeps() do
+    @stencil begin
+        outputs[idx] = begin
+            nhood = @neighbors(tiles[idx], 1, Wrap())
+            live_neighbors = sum(nhood) - tiles[idx]
+            if tiles[idx]
+                if live_neighbors < 2 || live_neighbors > 3; false
+                else; true; end
+            else
+                if live_neighbors == 3; true
+                else; false; end
+            end
+        end
+        tiles[idx] = outputs[idx]
+    end
+end
+# You can inspect `collect(outputs)` or `collect(tiles)` here.
+println("Game of Life example processed one iteration.")
+```
+
+In summary, `@stencil` expresses element-wise and neighborhood-based updates on `DArray`s: `@neighbors` provides neighborhood access with a chosen boundary condition (`Wrap()` or `Pad(value)`), statements within a block take effect sequentially, and a single block may read from and write to multiple `DArray`s.
diff --git a/test/array/stencil.jl b/test/array/stencil.jl
new file mode 100644
index 000000000..7398ea305
--- /dev/null
+++ b/test/array/stencil.jl
@@ -0,0 +1,122 @@
+using Test
+using Dagger
+import Dagger: @stencil, Wrap, Pad
+
+@testset "@stencil" begin
+    @testset "Simple assignment" begin
+        grid = zeros(Blocks(2, 2), Int, 4, 4)
+        Dagger.spawn_datadeps() do
+            @stencil begin
+                grid[idx] = 1 # every cell is overwritten with the same constant
+            end
+        end
+        @test all(==(1), collect(grid))
+    end
+
+    @testset "Wrap boundary" begin
+        A = zeros(Blocks(2, 2), Int, 4, 4)
+        A[1,1] = 10
+        B = zeros(Blocks(2, 2), Int, 4, 4)
+        Dagger.spawn_datadeps() do
+            @stencil begin
+                B[idx] = sum(@neighbors(A[idx], 1, Wrap()))
+            end
+        end
+        # Reference result, computed on a plain Array with periodic indexing.
+        #
+        # Each output cell sums its 3x3 window (itself plus 8 neighbors), with
+        # out-of-range rows/columns wrapped to the opposite edge via `mod1`.
+        # For example, the window of cell (1,1) covers rows {4,1,2} and
+        # columns {4,1,2}.
+        #
+        # Only A[1,1] is nonzero (10), so an output cell equals 10 exactly when
+        # its wrapped window contains (1,1) -- i.e. its row and column are both
+        # in {4,1,2} -- and 0 otherwise.
+        n = 4
+        offsets = -1:1
+        A_ref = zeros(Int, n, n)
+        A_ref[1,1] = 10
+        expected_B_calc = zeros(Int, n, n)
+        for j in 1:n, i in 1:n
+            # Sum over the wrapped 3x3 window centered on (i, j)
+            expected_B_calc[i,j] = sum(
+                A_ref[mod1(i+di, n), mod1(j+dj, n)]
+                for di in offsets, dj in offsets
+            )
+        end
+        @test collect(B) == expected_B_calc
+    end
+
+    @testset "Pad boundary" begin
+        A = Dagger.DArray(ones(Int, 4, 4), Blocks(2, 2))
+        B = Dagger.DArray(zeros(Int, 4, 4), Blocks(2, 2))
+        Dagger.spawn_datadeps() do
+            @stencil begin
+                B[idx] = sum(@neighbors(A[idx], 1, Pad(0)))
+            end
+        end
+        # With zero padding on an all-ones input, each output cell counts how
+        # many cells of its 3x3 window fall inside the array:
+        #   - interior cells (e.g. B[2,2]): 3x3 = 9
+        #   - edge cells     (e.g. B[1,2]): 2x3 = 6
+        #   - corner cells   (e.g. B[1,1]): 2x2 = 4
+        # Per dimension the in-bounds extent of the window is 2 at the borders
+        # and 3 elsewhere; the 2D count is the product of the two extents.
+        # Rows and columns behave the same, so one `extent` vector serves both.
+        extent = [2, 3, 3, 2]
+        expected_B_pad = [r * c for r in extent, c in extent]
+        @test collect(B) == expected_B_pad
+    end
+
+    @testset "Multiple expressions" begin
+        A = zeros(Blocks(2, 2), Int, 4, 4)
+        B = zeros(Blocks(2, 2), Int, 4, 4)
+        Dagger.spawn_datadeps() do
+            @stencil begin
+                A[idx] = idx[1] + idx[2] # Sum of coordinates
+                B[idx] = A[idx] * 2 # Reads the value written by the line above
+            end
+        end
+        # NOTE(review): assumes `idx` is a global, not block-local, index -- TODO confirm
+        coord_sums = [r + c for r in 1:4, c in 1:4]
+        @test collect(A) == coord_sums
+        @test collect(B) == 2 .* coord_sums
+    end
+
+    @testset "Multiple DArrays" begin
+        A = ones(Blocks(2, 2), Int, 4, 4)
+        B = Dagger.DArray(fill(2, 4, 4), Blocks(2, 2)) # Base.fill is (value, dims...)
+        C = zeros(Blocks(2, 2), Int, 4, 4)
+        Dagger.spawn_datadeps() do
+            @stencil begin
+                C[idx] = A[idx] + B[idx] # reads two inputs, writes a third
+            end
+        end
+        @test all(collect(C) .== 3) # 1 (from A) + 2 (from B)
+    end
+
+    @testset "Pad boundary with non-zero value" begin
+        pad_value = 5
+        A = ones(Blocks(1, 1), Int, 2, 2) # Simpler 2x2 case
+        B = zeros(Blocks(1, 1), Int, 2, 2)
+        Dagger.spawn_datadeps() do
+            @stencil begin
+                B[idx] = sum(@neighbors(A[idx], 1, Pad(pad_value)))
+            end
+        end
+        # A is only 2x2, so every cell is a corner: its 3x3 window always
+        # contains the 4 real cells of A (each 1) plus 5 out-of-bounds cells
+        # (each `pad_value`). For B[1,1], with P marking padding:
+        #
+        #   P  P   P          5 5 5
+        #   P  A11 A12   ->   5 1 1
+        #   P  A21 A22        5 1 1
+        #
+        # so every element of B is 5*5 + 4*1 = 29.
+        n_padded = 5
+        n_real = 4
+        window_sum = pad_value * n_padded + 1 * n_real
+        expected_B_pad_val = fill(window_sum, 2, 2)
+        @test collect(B) == expected_B_pad_val
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 79ba890d7..2e832d2ef 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -21,6 +21,7 @@ tests = [
     ("Array - LinearAlgebra - Cholesky", "array/linalg/cholesky.jl"),
     ("Array - LinearAlgebra - LU", "array/linalg/lu.jl"),
     ("Array - Random", "array/random.jl"),
+    ("Array - Stencils", "array/stencil.jl"),
     ("Caching", "cache.jl"),
     ("Disk Caching", "diskcaching.jl"),
     ("File IO", "file-io.jl"),