From b90d7e98aecc592f69401fc8bc17a4cde42739b4 Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Tue, 16 Sep 2025 16:45:09 +0200
Subject: [PATCH 01/27] simplify if condition in examples

---
 examples/diffusion2D_shmem_novis.jl                           | 2 +-
 examples/diffusion3D_multigpucpu_hidecomm_parindices_novis.jl | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/diffusion2D_shmem_novis.jl b/examples/diffusion2D_shmem_novis.jl
index 27bfba8d..3ce406f9 100644
--- a/examples/diffusion2D_shmem_novis.jl
+++ b/examples/diffusion2D_shmem_novis.jl
@@ -13,7 +13,7 @@ end
     ty  = @threadIdx().y + 1
     T_l = @sharedMem(eltype(T), (@blockDim().x+2, @blockDim().y+2))
     T_l[tx,ty] = T[ix,iy]
-    if (ix>1 && ix<size(T2,1) && iy>1 && iy<size(T2,2))
+    if (1<ix<size(T2,1) && 1<iy<size(T2,2))
         if (@threadIdx().x == 1)             T_l[tx-1,ty] = T[ix-1,iy] end
         if (@threadIdx().x == @blockDim().x) T_l[tx+1,ty] = T[ix+1,iy] end
         if (@threadIdx().y == 1)             T_l[tx,ty-1] = T[ix,iy-1] end
diff --git a/examples/diffusion3D_multigpucpu_hidecomm_parindices_novis.jl b/examples/diffusion3D_multigpucpu_hidecomm_parindices_novis.jl
index 8dfd8cf4..d4ffc4c1 100644
--- a/examples/diffusion3D_multigpucpu_hidecomm_parindices_novis.jl
+++ b/examples/diffusion3D_multigpucpu_hidecomm_parindices_novis.jl
@@ -8,7 +8,7 @@ else
 end
 
 @parallel_indices (ix,iy,iz) function diffusion3D_step!(T2, T, Ci, lam, dt, _dx, _dy, _dz)
-        if (ix>1 && ix<size(T2,1) && iy>1 && iy<size(T2,2) && iz>1 && iz<size(T2,3))
+        if (1<ix<size(T2,1) && 1<iy<size(T2,2) && 1<iz<size(T2,3))
             T2[ix,iy,iz] = T[ix,iy,iz] + dt*(Ci[ix,iy,iz]*(
                             - ((-lam*(T[ix+1,iy,iz] - T[ix,iy,iz])*_dx) - (-lam*(T[ix,iy,iz] - T[ix-1,iy,iz])*_dx))*_dx
                             - ((-lam*(T[ix,iy+1,iz] - T[ix,iy,iz])*_dy) - (-lam*(T[ix,iy,iz] - T[ix,iy-1,iz])*_dy))*_dy

From d68cd72c28088db119eaaa0d83c388b150d61c35 Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Tue, 16 Sep 2025 17:27:02 +0200
Subject: [PATCH 02/27] add macro signatures and docstrings

---
 src/ParallelKernel/ParallelKernel.jl  |   1 +
 src/ParallelKernel/kernel_language.jl | 116 ++++++++++++++++++++++++++
 2 files changed, 117 insertions(+)

diff --git a/src/ParallelKernel/ParallelKernel.jl b/src/ParallelKernel/ParallelKernel.jl
index 740e1b9e..8e355ee7 100644
--- a/src/ParallelKernel/ParallelKernel.jl
+++ b/src/ParallelKernel/ParallelKernel.jl
@@ -74,6 +74,7 @@ include("FieldAllocators.jl")
 ## Exports
 export @init_parallel_kernel, @parallel, @hide_communication, @parallel_indices, @parallel_async, @synchronize, @zeros, @ones, @rand, @falses, @trues, @fill, @fill!, @CellType
 export @gridDim, @blockIdx, @blockDim, @threadIdx, @sync_threads, @sharedMem, @pk_show, @pk_println, @∀
+export @warpsize, @laneid, @active_mask, @shfl_sync, @shfl_up_sync, @shfl_down_sync, @shfl_xor_sync, @vote_any_sync, @vote_all_sync, @vote_ballot_sync
 export PKNumber
 
 end # Module ParallelKernel
diff --git a/src/ParallelKernel/kernel_language.jl b/src/ParallelKernel/kernel_language.jl
index 6470318b..dfffe66c 100644
--- a/src/ParallelKernel/kernel_language.jl
+++ b/src/ParallelKernel/kernel_language.jl
@@ -89,6 +89,110 @@ Call a macro analogue to `Base.@println`, compatible with the package for parall
 macro pk_println(args...) check_initialized(__module__); esc(pk_println(__module__, args...)); end
 
 
+##
+const WARPSIZE_DOC = """
+    @warpsize() -> Int
+
+Return the logical warp / wavefront / SIMD-group width in threads for the active backend.  CUDA returns 32. AMD GPUs return the hardware wavefront size (typically 64 or 32). Metal returns the device `threadExecutionWidth`. CPU backend returns 1.  Guaranteed constant for the lifetime of the kernel invocation. Use this value (not a hard‑coded constant) for portable intra-warp algorithms.
+"""
+@doc WARPSIZE_DOC
+macro warpsize(args...) check_initialized(__module__); checknoargs(args...); esc(warpsize(__module__, args...)); end
+
+
+##
+const LANEID_DOC = """
+    @laneid() -> Int
+
+Return the 1-based logical lane index in the current warp (range: 1:warpsize()).  CUDA.jl's `laneid()` is already 1-based; for backends with 0-based hardware lane numbering the abstraction adds 1.  CPU backend always returns 1.
+"""
+@doc LANEID_DOC
+macro laneid(args...) check_initialized(__module__); checknoargs(args...); esc(laneid(__module__, args...)); end
+
+
+##
+const ACTIVE_MASK_DOC = """
+    @active_mask() -> Unsigned
+
+Return a bit mask of currently active (non-exited, converged) lanes in the caller's warp.  Bit (laneid()-1) corresponds to that logical lane.  CUDA returns a 32-bit value; AMD returns a 64-bit value.  Not supported on Metal (the macro throws); CPU returns UInt64(0x1).
+"""
+@doc ACTIVE_MASK_DOC
+macro active_mask(args...) check_initialized(__module__); checknoargs(args...); esc(active_mask(__module__, args...)); end
+
+
+##
+const SHFL_SYNC_DOC = """
+    @shfl_sync(mask::Unsigned, val, lane::Integer)
+    @shfl_sync(mask::Unsigned, val, lane::Integer, width::Integer)
+
+Return the value of `val` from the source lane `lane` (1-based) among lanes named in `mask`.  Optional `width` (power of two, 1 <= width <= warpsize()) logically partitions the warp into independent contiguous sub-groups each behaving as a mini-warp with lanes numbered 1:width.  The source lane index is resolved modulo `width`.  All participating lanes must supply identical `mask`, `lane`, and (if present) `width`.  `val` may be any isbits type; larger composite isbits values are shuffled by decomposition into supported word sizes.  CPU backend returns `val` unchanged.
+"""
+@doc SHFL_SYNC_DOC
+macro shfl_sync(args...) check_initialized(__module__); checkargs_shfl_sync(args...); esc(shfl_sync(__module__, args...)); end
+
+
+##
+const SHFL_UP_SYNC_DOC = """
+    @shfl_up_sync(mask::Unsigned, val, delta::Integer)
+    @shfl_up_sync(mask::Unsigned, val, delta::Integer, width::Integer)
+
+Shift `val` up by `delta` lanes within each logical partition (width semantics as in `shfl_sync`).  Lanes with no valid upstream partner retain their original `val`.  `delta >= 0`.  CPU backend returns `val` unchanged.
+"""
+@doc SHFL_UP_SYNC_DOC
+macro shfl_up_sync(args...) check_initialized(__module__); checkargs_shfl_up_down_xor(args...); esc(shfl_up_sync(__module__, args...)); end
+
+
+##
+const SHFL_DOWN_SYNC_DOC = """
+    @shfl_down_sync(mask::Unsigned, val, delta::Integer)
+    @shfl_down_sync(mask::Unsigned, val, delta::Integer, width::Integer)
+
+Shift `val` down by `delta` lanes within each logical partition; lanes without a valid downstream partner retain their original `val`.  `delta >= 0`.  CPU backend returns `val` unchanged.
+"""
+@doc SHFL_DOWN_SYNC_DOC
+macro shfl_down_sync(args...) check_initialized(__module__); checkargs_shfl_up_down_xor(args...); esc(shfl_down_sync(__module__, args...)); end
+
+
+##
+const SHFL_XOR_SYNC_DOC = """
+    @shfl_xor_sync(mask::Unsigned, val, lane_mask::Integer)
+    @shfl_xor_sync(mask::Unsigned, val, lane_mask::Integer, width::Integer)
+
+Perform a butterfly (bitwise XOR) shuffle: each lane exchanges with the lane whose (laneid()-1) XOR `lane_mask` differs in the specified bits, constrained within each `width` partition if provided.  If the computed partner is outside the partition the calling lane's own `val` is returned.  CPU backend returns `val` unchanged.
+"""
+@doc SHFL_XOR_SYNC_DOC
+macro shfl_xor_sync(args...) check_initialized(__module__); checkargs_shfl_up_down_xor(args...); esc(shfl_xor_sync(__module__, args...)); end
+
+
+##
+const VOTE_ANY_SYNC_DOC = """
+    @vote_any_sync(mask::Unsigned, predicate::Bool) -> Bool
+
+Evaluate `predicate` across all active lanes named in `mask`; return true if any lane's predicate is true.  Does not imply a memory fence.  CPU backend returns `predicate`.
+"""
+@doc VOTE_ANY_SYNC_DOC
+macro vote_any_sync(args...) check_initialized(__module__); checkargs_vote(args...); esc(vote_any_sync(__module__, args...)); end
+
+
+##
+const VOTE_ALL_SYNC_DOC = """
+    @vote_all_sync(mask::Unsigned, predicate::Bool) -> Bool
+
+Evaluate `predicate` across all active lanes named in `mask`; return true only if every such lane's predicate is true.  No memory ordering implied.  CPU backend returns `predicate`.
+"""
+@doc VOTE_ALL_SYNC_DOC
+macro vote_all_sync(args...) check_initialized(__module__); checkargs_vote(args...); esc(vote_all_sync(__module__, args...)); end
+
+
+##
+const VOTE_BALLOT_SYNC_DOC = """
+    @vote_ballot_sync(mask::Unsigned, predicate::Bool) -> Unsigned
+
+Return a bit mask aggregating `predicate` values for lanes named in `mask`: bit (laneid()-1) set iff that lane's predicate is true.  Width of result equals hardware warp mask width (32 for CUDA, 64 for AMD, CPU uses 64 with only bit 0 meaningful).  Caller may safely promote to `UInt64` for uniform handling; upper bits beyond hardware width are zero.  No memory ordering implied.
+"""
+@doc VOTE_BALLOT_SYNC_DOC
+macro vote_ballot_sync(args...) check_initialized(__module__); checkargs_vote(args...); esc(vote_ballot_sync(__module__, args...)); end
+
+
 ##
 const FORALL_DOC = """
     @∀ x ∈ X statement
@@ -178,6 +282,18 @@ function checkargs_begin_end(args...)
     if !(2 <= length(args) <= 3) @ArgumentError("wrong number of arguments.") end
 end
 
+function checkargs_shfl_sync(args...)  # Argument-count check for `@shfl_sync`: accepts (mask, val, lane) or (mask, val, lane, width).
+    if !(3 <= length(args) <= 4) @ArgumentError("wrong number of arguments.") end  # only the count is checked here, not the argument types
+end
+
+function checkargs_shfl_up_down_xor(args...)  # Argument-count check shared by `@shfl_up_sync`, `@shfl_down_sync` and `@shfl_xor_sync`: 3 arguments, or 4 with `width`.
+    if !(3 <= length(args) <= 4) @ArgumentError("wrong number of arguments.") end  # only the count is checked here, not the argument types
+end
+
+function checkargs_vote(args...)  # Argument-count check shared by the `@vote_*_sync` macros: exactly (mask, predicate).
+    if !(length(args) == 2) @ArgumentError("wrong number of arguments.") end  # only the count is checked here, not the argument types
+end
+
 
 ## FUNCTIONS FOR INDEXING AND DIMENSIONS
 

From 2bc27acea6efffb2b4c1ac25e5e4716966bb53e6 Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Wed, 17 Sep 2025 10:27:46 +0200
Subject: [PATCH 03/27] add warp primitives tests

---
 test/ParallelKernel/test_kernel_language.jl | 75 +++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/test/ParallelKernel/test_kernel_language.jl b/test/ParallelKernel/test_kernel_language.jl
index 87bc6473..1a5daa62 100644
--- a/test/ParallelKernel/test_kernel_language.jl
+++ b/test/ParallelKernel/test_kernel_language.jl
@@ -79,6 +79,81 @@ eval(:(
                     @test @prettystring(1, ParallelStencil.ParallelKernel.@threads()) == "Polyester.@batch"
                 end;
             end;
+            @testset "Warp level primitives" begin
+                @testset "Parse-time direct call mapping" begin
+                    # Common test variables used in macro expansions
+                    mask      = UInt64(0xffff_ffff_ffff_ffff)
+                    mask32    = UInt32(0xffff_ffff)
+                    val       = one($FloatDefault)
+                    lane      = 1
+                    width     = 32
+                    delta     = 1
+                    lane_mask = 1
+                    predicate = true
+
+                    if $package == $PKG_CUDA
+                        @test @prettystring(1, @warpsize()) == "CUDA.warpsize()"
+                        @test @prettystring(1, @laneid())   == "CUDA.laneid()"
+                        @test @prettystring(1, @active_mask()) == "CUDA.active_mask()"
+
+                        @test @prettystring(1, @shfl_sync(mask32, val, lane)) == "CUDA.shfl_sync(mask32, val, lane)"
+                        @test @prettystring(1, @shfl_sync(mask32, val, lane, width)) == "CUDA.shfl_sync(mask32, val, lane, width)"
+                        @test @prettystring(1, @shfl_up_sync(mask32, val, delta)) == "CUDA.shfl_up_sync(mask32, val, delta)"
+                        @test @prettystring(1, @shfl_up_sync(mask32, val, delta, width)) == "CUDA.shfl_up_sync(mask32, val, delta, width)"
+                        @test @prettystring(1, @shfl_down_sync(mask32, val, delta)) == "CUDA.shfl_down_sync(mask32, val, delta)"
+                        @test @prettystring(1, @shfl_down_sync(mask32, val, delta, width)) == "CUDA.shfl_down_sync(mask32, val, delta, width)"
+                        @test @prettystring(1, @shfl_xor_sync(mask32, val, lane_mask)) == "CUDA.shfl_xor_sync(mask32, val, lane_mask)"
+                        @test @prettystring(1, @shfl_xor_sync(mask32, val, lane_mask, width)) == "CUDA.shfl_xor_sync(mask32, val, lane_mask, width)"
+
+                        @test @prettystring(1, @vote_any_sync(mask32, predicate))   == "CUDA.vote_any_sync(mask32, predicate)"
+                        @test @prettystring(1, @vote_all_sync(mask32, predicate))   == "CUDA.vote_all_sync(mask32, predicate)"
+                        @test @prettystring(1, @vote_ballot_sync(mask32, predicate)) == "CUDA.vote_ballot_sync(mask32, predicate)"
+
+                    elseif $package == $PKG_AMDGPU
+                        @test @prettystring(1, @warpsize()) == "AMDGPU.Device.wavefrontsize()"
+                        @test @prettystring(1, @laneid())   == "unsafe_trunc(Cint, AMDGPU.Device.activelane()) + Cint(1)"
+                        @test @prettystring(1, @active_mask()) == "AMDGPU.Device.activemask()"
+
+                        @test @prettystring(1, @shfl_sync(mask, val, lane)) == "AMDGPU.Device.shfl_sync(UInt64(mask), val, unsafe_trunc(Cint, lane) - Cint(1))"
+                        @test @prettystring(1, @shfl_sync(mask, val, lane, width)) == "AMDGPU.Device.shfl_sync(UInt64(mask), val, unsafe_trunc(Cint, lane) - Cint(1), unsafe_trunc(Cuint, width))"
+                        @test @prettystring(1, @shfl_up_sync(mask, val, delta)) == "AMDGPU.Device.shfl_up_sync(UInt64(mask), val, unsafe_trunc(Cint, delta))"
+                        @test @prettystring(1, @shfl_up_sync(mask, val, delta, width)) == "AMDGPU.Device.shfl_up_sync(UInt64(mask), val, unsafe_trunc(Cint, delta), unsafe_trunc(Cuint, width))"
+                        @test @prettystring(1, @shfl_down_sync(mask, val, delta)) == "AMDGPU.Device.shfl_down_sync(UInt64(mask), val, unsafe_trunc(Cint, delta))"
+                        @test @prettystring(1, @shfl_down_sync(mask, val, delta, width)) == "AMDGPU.Device.shfl_down_sync(UInt64(mask), val, unsafe_trunc(Cint, delta), unsafe_trunc(Cuint, width))"
+                        @test @prettystring(1, @shfl_xor_sync(mask, val, lane_mask)) == "AMDGPU.Device.shfl_xor_sync(UInt64(mask), val, unsafe_trunc(Cint, lane_mask) - Cint(1))"
+                        @test @prettystring(1, @shfl_xor_sync(mask, val, lane_mask, width)) == "AMDGPU.Device.shfl_xor_sync(UInt64(mask), val, unsafe_trunc(Cint, lane_mask) - Cint(1), unsafe_trunc(Cuint, width))"
+
+                        @test @prettystring(1, @vote_any_sync(mask, predicate))   == "AMDGPU.Device.any_sync(UInt64(mask), predicate)"
+                        @test @prettystring(1, @vote_all_sync(mask, predicate))   == "AMDGPU.Device.all_sync(UInt64(mask), predicate)"
+                        @test @prettystring(1, @vote_ballot_sync(mask, predicate)) == "AMDGPU.Device.ballot_sync(UInt64(mask), predicate)"
+
+                    elseif $package == $PKG_METAL
+                        @test @prettystring(1, @warpsize()) == "Metal.threads_per_simdgroup()"
+                        @test @prettystring(1, @laneid())   == "unsafe_trunc(Cint, Metal.thread_index_in_simdgroup()) + Cint(1)"
+                        @test_throws Exception @prettystring(1, @active_mask())
+                        @test_throws Exception @prettystring(1, @shfl_sync(mask, val, lane))
+                        @test_throws Exception @prettystring(1, @vote_ballot_sync(mask, predicate))
+
+                    elseif @iscpu($package)
+                        @test @prettystring(1, @warpsize())     == "ParallelStencil.ParallelKernel.warpsize_cpu()"
+                        @test @prettystring(1, @laneid())       == "ParallelStencil.ParallelKernel.laneid_cpu()"
+                        @test @prettystring(1, @active_mask())  == "ParallelStencil.ParallelKernel.active_mask_cpu()"
+
+                        @test @prettystring(1, @shfl_sync(mask, val, lane)) == "ParallelStencil.ParallelKernel.shfl_sync_cpu(mask, val, Int64(lane) - Int64(1))"
+                        @test @prettystring(1, @shfl_sync(mask, val, lane, width)) == "ParallelStencil.ParallelKernel.shfl_sync_cpu(mask, val, Int64(lane) - Int64(1), Int64(width))"
+                        @test @prettystring(1, @shfl_up_sync(mask, val, delta)) == "ParallelStencil.ParallelKernel.shfl_up_sync_cpu(mask, val, Int64(delta))"
+                        @test @prettystring(1, @shfl_up_sync(mask, val, delta, width)) == "ParallelStencil.ParallelKernel.shfl_up_sync_cpu(mask, val, Int64(delta), Int64(width))"
+                        @test @prettystring(1, @shfl_down_sync(mask, val, delta)) == "ParallelStencil.ParallelKernel.shfl_down_sync_cpu(mask, val, Int64(delta))"
+                        @test @prettystring(1, @shfl_down_sync(mask, val, delta, width)) == "ParallelStencil.ParallelKernel.shfl_down_sync_cpu(mask, val, Int64(delta), Int64(width))"
+                        @test @prettystring(1, @shfl_xor_sync(mask, val, lane_mask)) == "ParallelStencil.ParallelKernel.shfl_xor_sync_cpu(mask, val, Int64(lane_mask) - Int64(1))"
+                        @test @prettystring(1, @shfl_xor_sync(mask, val, lane_mask, width)) == "ParallelStencil.ParallelKernel.shfl_xor_sync_cpu(mask, val, Int64(lane_mask) - Int64(1), Int64(width))"
+
+                        @test @prettystring(1, @vote_any_sync(mask, predicate))   == "ParallelStencil.ParallelKernel.vote_any_sync_cpu(mask, predicate)"
+                        @test @prettystring(1, @vote_all_sync(mask, predicate))   == "ParallelStencil.ParallelKernel.vote_all_sync_cpu(mask, predicate)"
+                        @test @prettystring(1, @vote_ballot_sync(mask, predicate)) == "ParallelStencil.ParallelKernel.vote_ballot_sync_cpu(mask, predicate)"
+                    end
+                end;
+            end;
             @testset "@gridDim, @blockIdx, @blockDim, @threadIdx (1D)" begin
                 @static if $package == $PKG_THREADS
                     A  = @zeros(4)

From 2afdec3861c86145f0dafd2f7958599a69c4778d Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Wed, 17 Sep 2025 10:41:28 +0200
Subject: [PATCH 04/27] add warp primitives tests

---
 test/ParallelKernel/test_kernel_language.jl | 29 +++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/test/ParallelKernel/test_kernel_language.jl b/test/ParallelKernel/test_kernel_language.jl
index 1a5daa62..9801ec15 100644
--- a/test/ParallelKernel/test_kernel_language.jl
+++ b/test/ParallelKernel/test_kernel_language.jl
@@ -153,6 +153,35 @@ eval(:(
                         @test @prettystring(1, @vote_ballot_sync(mask, predicate)) == "ParallelStencil.ParallelKernel.vote_ballot_sync_cpu(mask, predicate)"
                     end
                 end;
+                @testset "CPU zero overhead" begin
+                    @static if @iscpu($package)
+                        # Use stable literal arguments to exercise CPU code paths
+                        mask      = UInt64(0x1)
+                        valf      = one($FloatDefault)
+                        lane      = 1
+                        width     = 1
+                        delta     = 1
+                        lanemask  = 1
+                        predicate = true
+
+                        @test @allocated(@warpsize())    == 0
+                        @test @allocated(@laneid())      == 0
+                        @test @allocated(@active_mask()) == 0
+
+                        @test @allocated(@shfl_sync(mask, valf, lane))            == 0
+                        @test @allocated(@shfl_sync(mask, valf, lane, width))     == 0
+                        @test @allocated(@shfl_up_sync(mask, valf, delta))        == 0
+                        @test @allocated(@shfl_up_sync(mask, valf, delta, width)) == 0
+                        @test @allocated(@shfl_down_sync(mask, valf, delta))      == 0
+                        @test @allocated(@shfl_down_sync(mask, valf, delta, width)) == 0
+                        @test @allocated(@shfl_xor_sync(mask, valf, lanemask))    == 0
+                        @test @allocated(@shfl_xor_sync(mask, valf, lanemask, width)) == 0
+
+                        @test @allocated(@vote_any_sync(mask, predicate))    == 0
+                        @test @allocated(@vote_all_sync(mask, predicate))    == 0
+                        @test @allocated(@vote_ballot_sync(mask, predicate)) == 0
+                    end
+                end;
             end;
             @testset "@gridDim, @blockIdx, @blockDim, @threadIdx (1D)" begin
                 @static if $package == $PKG_THREADS

From 7b594723c0ef8fae31d23c5889a5c0a53777b6b1 Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Wed, 17 Sep 2025 10:49:38 +0200
Subject: [PATCH 05/27] add warp primitives tests

---
 test/ParallelKernel/test_kernel_language.jl | 43 +++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/test/ParallelKernel/test_kernel_language.jl b/test/ParallelKernel/test_kernel_language.jl
index 9801ec15..92062964 100644
--- a/test/ParallelKernel/test_kernel_language.jl
+++ b/test/ParallelKernel/test_kernel_language.jl
@@ -182,6 +182,49 @@ eval(:(
                         @test @allocated(@vote_ballot_sync(mask, predicate)) == 0
                     end
                 end;
+                @testset "Semantic smoke tests" begin
+                    @static if @iscpu($package)
+                        N = 8
+                        A  = @rand($FloatDefault, N)
+                        P  = [isfinite(A[i]) && (A[i] > zero($FloatDefault)) for i in 1:N]  # simple predicate
+                        Bout_any    = Vector{Bool}(undef, N)
+                        Bout_all    = Vector{Bool}(undef, N)
+                        Bout_ballot = Vector{UInt64}(undef, N)
+                        Bshfl       = similar(A)
+                        Bshfl_up    = similar(A)
+                        Bshfl_down  = similar(A)
+                        Bshfl_xor   = similar(A)
+
+                        @parallel_indices (ix) function kernel_semantics!(Bout_any, Bout_all, Bout_ballot, Bshfl, Bshfl_up, Bshfl_down, Bshfl_xor, A, P)
+                            m = @active_mask()
+                            w = @warpsize()
+                            l = @laneid()
+                            # basic invariants under CPU model
+                            @test w == 1
+                            @test l == 1
+                            # shuffle identities
+                            Bshfl[ix]      = @shfl_sync(m, A[ix], l)
+                            Bshfl_up[ix]   = @shfl_up_sync(m, A[ix], 1)
+                            Bshfl_down[ix] = @shfl_down_sync(m, A[ix], 1)
+                            Bshfl_xor[ix]  = @shfl_xor_sync(m, A[ix], 1)
+                            # votes
+                            pa = P[ix]
+                            Bout_any[ix]   = @vote_any_sync(m, pa)
+                            Bout_all[ix]   = @vote_all_sync(m, pa)
+                            Bout_ballot[ix] = @vote_ballot_sync(m, pa)
+                            return
+                        end
+                        @parallel (1:N) kernel_semantics!(Bout_any, Bout_all, Bout_ballot, Bshfl, Bshfl_up, Bshfl_down, Bshfl_xor, A, P)
+
+                        @test all(Bshfl .== A)
+                        @test all(Bshfl_up .== A)
+                        @test all(Bshfl_down .== A)
+                        @test all(Bshfl_xor .== A)
+                        @test Bout_any == P
+                        @test Bout_all == P
+                        @test Bout_ballot == map(p -> p ? UInt64(0x1) : UInt64(0x0), P)
+                    end
+                end;
             end;
             @testset "@gridDim, @blockIdx, @blockDim, @threadIdx (1D)" begin
                 @static if $package == $PKG_THREADS

From d2ea2cc3d8dd92fa1578ee0f418810ec601c972f Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Wed, 17 Sep 2025 11:42:12 +0200
Subject: [PATCH 06/27] add warp level primitives

---
 src/ParallelKernel/kernel_language.jl | 204 ++++++++++++++++++++++++++
 1 file changed, 204 insertions(+)

diff --git a/src/ParallelKernel/kernel_language.jl b/src/ParallelKernel/kernel_language.jl
index dfffe66c..a88f0fbd 100644
--- a/src/ParallelKernel/kernel_language.jl
+++ b/src/ParallelKernel/kernel_language.jl
@@ -386,6 +386,152 @@ function pk_println(caller::Module, args...; package::Symbol=get_package(caller)
 end
 
 
+## FUNCTIONS FOR WARP-LEVEL PRIMITIVES (backend mapping)
+
+function warpsize(caller::Module, args...; package::Symbol=get_package(caller))  # Return the expression that `@warpsize()` expands to for `package` (default: `get_package(caller)`); `args` is accepted but not used.
+    if     (package == PKG_CUDA)    return :(CUDA.warpsize())
+    elseif (package == PKG_AMDGPU)  return :(AMDGPU.Device.wavefrontsize())
+    elseif (package == PKG_METAL)   return :(Metal.threads_per_simdgroup())
+    elseif iscpu(package)           return :(ParallelStencil.ParallelKernel.warpsize_cpu())  # fully qualified so the expansion resolves from user modules
+    else                            @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).")  # no mapping exists for any other package
+    end
+end
+
+function laneid(caller::Module, args...; package::Symbol=get_package(caller))  # Return the expression that `@laneid()` expands to for `package`: a 1-based lane index on every backend; `args` is accepted but not used.
+    if     (package == PKG_CUDA)    return :(CUDA.laneid())  # CUDA.jl's `laneid()` is already 1-based; adding 1 here would yield 2:33 instead of 1:32
+    elseif (package == PKG_AMDGPU)  return :(unsafe_trunc(Cint, AMDGPU.Device.activelane()) + Cint(1))  # shifted by one to the 1-based convention
+    elseif (package == PKG_METAL)   return :(unsafe_trunc(Cint, Metal.thread_index_in_simdgroup()) + Cint(1))  # shifted by one to the 1-based convention
+    elseif iscpu(package)           return :(ParallelStencil.ParallelKernel.laneid_cpu())
+    else                            @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).")  # no mapping exists for any other package
+    end
+end
+
+function active_mask(caller::Module, args...; package::Symbol=get_package(caller))  # Return the expression that `@active_mask()` expands to for `package`; `args` is accepted but not used.
+    if     (package == PKG_CUDA)    return :(CUDA.active_mask())
+    elseif (package == PKG_AMDGPU)  return :(AMDGPU.Device.activemask())
+    elseif (package == PKG_METAL)   @KeywordArgumentError("this functionality is not yet supported in Metal.jl.")  # raised while the macro is being expanded, not inside the kernel
+    elseif iscpu(package)           return :(ParallelStencil.ParallelKernel.active_mask_cpu())
+    else                            @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).")  # no mapping exists for any other package
+    end
+end
+
+function shfl_sync(caller::Module, args...; package::Symbol=get_package(caller))  # Return the expression that `@shfl_sync(mask, val, lane[, width])` expands to for `package`.
+    if     (package == PKG_CUDA)
+        return :(CUDA.shfl_sync($(args...)))  # all arguments are forwarded unchanged
+    elseif (package == PKG_AMDGPU)
+        if length(args) == 3
+            # (mask, val, lane): mask is converted to UInt64; lane is truncated to Cint and shifted down by one
+            return :(AMDGPU.Device.shfl_sync(UInt64($(args[1])), $(args[2]), unsafe_trunc(Cint, $(args[3])) - Cint(1)))
+        else
+            # (mask, val, lane, width): as above, with width truncated to Cuint
+            return :(AMDGPU.Device.shfl_sync(UInt64($(args[1])), $(args[2]), unsafe_trunc(Cint, $(args[3])) - Cint(1), unsafe_trunc(Cuint, $(args[4]))))
+        end
+    elseif (package == PKG_METAL)
+        @KeywordArgumentError("this functionality is not yet supported in Metal.jl.")  # raised while the macro is being expanded
+    elseif iscpu(package)
+        if length(args) == 3
+            return :(ParallelStencil.ParallelKernel.shfl_sync_cpu($(args[1]), $(args[2]), Int64($(args[3])) - Int64(1)))  # lane converted to Int64 and shifted down by one
+        else
+            return :(ParallelStencil.ParallelKernel.shfl_sync_cpu($(args[1]), $(args[2]), Int64($(args[3])) - Int64(1), Int64($(args[4]))))  # same, plus width as Int64
+        end
+    else
+        @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).")  # no mapping exists for any other package
+    end
+end
+
+function shfl_up_sync(caller::Module, args...; package::Symbol=get_package(caller))  # Return the expression that `@shfl_up_sync(mask, val, delta[, width])` expands to for `package`.
+    if     (package == PKG_CUDA)
+        return :(CUDA.shfl_up_sync($(args...)))  # all arguments are forwarded unchanged
+    elseif (package == PKG_AMDGPU)
+        if length(args) == 3
+            return :(AMDGPU.Device.shfl_up_sync(UInt64($(args[1])), $(args[2]), unsafe_trunc(Cint, $(args[3]))))  # mask as UInt64, delta truncated to Cint (no index shift: delta is a distance)
+        else
+            return :(AMDGPU.Device.shfl_up_sync(UInt64($(args[1])), $(args[2]), unsafe_trunc(Cint, $(args[3])), unsafe_trunc(Cuint, $(args[4]))))  # same, plus width truncated to Cuint
+        end
+    elseif (package == PKG_METAL)
+        @KeywordArgumentError("this functionality is not yet supported in Metal.jl.")  # raised while the macro is being expanded
+    elseif iscpu(package)
+        if length(args) == 3
+            return :(ParallelStencil.ParallelKernel.shfl_up_sync_cpu($(args[1]), $(args[2]), Int64($(args[3]))))  # delta converted to Int64
+        else
+            return :(ParallelStencil.ParallelKernel.shfl_up_sync_cpu($(args[1]), $(args[2]), Int64($(args[3])), Int64($(args[4]))))  # delta and width converted to Int64
+        end
+    else
+        @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).")  # no mapping exists for any other package
+    end
+end
+
+function shfl_down_sync(caller::Module, args...; package::Symbol=get_package(caller))  # Return the expression that `@shfl_down_sync(mask, val, delta[, width])` expands to for `package`.
+    if     (package == PKG_CUDA)
+        return :(CUDA.shfl_down_sync($(args...)))  # all arguments are forwarded unchanged
+    elseif (package == PKG_AMDGPU)
+        if length(args) == 3
+            return :(AMDGPU.Device.shfl_down_sync(UInt64($(args[1])), $(args[2]), unsafe_trunc(Cint, $(args[3]))))  # mask as UInt64, delta truncated to Cint (no index shift: delta is a distance)
+        else
+            return :(AMDGPU.Device.shfl_down_sync(UInt64($(args[1])), $(args[2]), unsafe_trunc(Cint, $(args[3])), unsafe_trunc(Cuint, $(args[4]))))  # same, plus width truncated to Cuint
+        end
+    elseif (package == PKG_METAL)
+        @KeywordArgumentError("this functionality is not yet supported in Metal.jl.")  # raised while the macro is being expanded
+    elseif iscpu(package)
+        if length(args) == 3
+            return :(ParallelStencil.ParallelKernel.shfl_down_sync_cpu($(args[1]), $(args[2]), Int64($(args[3]))))  # delta converted to Int64
+        else
+            return :(ParallelStencil.ParallelKernel.shfl_down_sync_cpu($(args[1]), $(args[2]), Int64($(args[3])), Int64($(args[4]))))  # delta and width converted to Int64
+        end
+    else
+        @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).")  # no mapping exists for any other package
+    end
+end
+
+function shfl_xor_sync(caller::Module, args...; package::Symbol=get_package(caller))  # Return the expression that `@shfl_xor_sync(mask, val, lane_mask[, width])` expands to for `package`.
+    if     (package == PKG_CUDA)
+        return :(CUDA.shfl_xor_sync($(args...)))  # all arguments are forwarded unchanged, including lane_mask
+    elseif (package == PKG_AMDGPU)
+        if length(args) == 3
+            return :(AMDGPU.Device.shfl_xor_sync(UInt64($(args[1])), $(args[2]), unsafe_trunc(Cint, $(args[3])) - Cint(1)))  # NOTE(review): lane_mask is an XOR bit mask, not a lane index; the "- Cint(1)" differs from the CUDA branch and looks unintended (lane_mask == 1 becomes 0) - confirm, and update the expansion tests together with any change
+        else
+            return :(AMDGPU.Device.shfl_xor_sync(UInt64($(args[1])), $(args[2]), unsafe_trunc(Cint, $(args[3])) - Cint(1), unsafe_trunc(Cuint, $(args[4]))))  # NOTE(review): same "- Cint(1)" concern as the 3-argument form
+        end
+    elseif (package == PKG_METAL)
+        @KeywordArgumentError("this functionality is not yet supported in Metal.jl.")  # raised while the macro is being expanded
+    elseif iscpu(package)
+        if length(args) == 3
+            return :(ParallelStencil.ParallelKernel.shfl_xor_sync_cpu($(args[1]), $(args[2]), Int64($(args[3])) - Int64(1)))  # NOTE(review): the "- Int64(1)" mirrors the AMDGPU branch; confirm it is intended for a bit mask
+        else
+            return :(ParallelStencil.ParallelKernel.shfl_xor_sync_cpu($(args[1]), $(args[2]), Int64($(args[3])) - Int64(1), Int64($(args[4]))))  # NOTE(review): same concern as the 3-argument form
+        end
+    else
+        @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).")  # no mapping exists for any other package
+    end
+end
+
+function vote_any_sync(caller::Module, args...; package::Symbol=get_package(caller))  # Return the expression that `@vote_any_sync(mask, predicate)` expands to for `package`.
+    if     (package == PKG_CUDA)    return :(CUDA.vote_any_sync($(args...)))  # arguments forwarded unchanged
+    elseif (package == PKG_AMDGPU)  return :(AMDGPU.Device.any_sync(UInt64($(args[1])), $(args[2])))  # mask converted to UInt64
+    elseif (package == PKG_METAL)   @KeywordArgumentError("this functionality is not yet supported in Metal.jl.")  # raised while the macro is being expanded
+    elseif iscpu(package)           return :(ParallelStencil.ParallelKernel.vote_any_sync_cpu($(args...)))
+    else                            @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).")  # no mapping exists for any other package
+    end
+end
+
+function vote_all_sync(caller::Module, args...; package::Symbol=get_package(caller))  # Return the expression that `@vote_all_sync(mask, predicate)` expands to for `package`.
+    if     (package == PKG_CUDA)    return :(CUDA.vote_all_sync($(args...)))  # arguments forwarded unchanged
+    elseif (package == PKG_AMDGPU)  return :(AMDGPU.Device.all_sync(UInt64($(args[1])), $(args[2])))  # mask converted to UInt64
+    elseif (package == PKG_METAL)   @KeywordArgumentError("this functionality is not yet supported in Metal.jl.")  # raised while the macro is being expanded
+    elseif iscpu(package)           return :(ParallelStencil.ParallelKernel.vote_all_sync_cpu($(args...)))
+    else                            @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).")  # no mapping exists for any other package
+    end
+end
+
+function vote_ballot_sync(caller::Module, args...; package::Symbol=get_package(caller))  # Return the expression that `@vote_ballot_sync(mask, predicate)` expands to for `package`.
+    if     (package == PKG_CUDA)    return :(CUDA.vote_ballot_sync($(args...)))  # arguments forwarded unchanged
+    elseif (package == PKG_AMDGPU)  return :(AMDGPU.Device.ballot_sync(UInt64($(args[1])), $(args[2])))  # mask converted to UInt64
+    elseif (package == PKG_METAL)   @KeywordArgumentError("this functionality is not yet supported in Metal.jl.")  # raised while the macro is being expanded
+    elseif iscpu(package)           return :(ParallelStencil.ParallelKernel.vote_ballot_sync_cpu($(args...)))
+    else                            @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).")  # no mapping exists for any other package
+    end
+end
+
 ## FUNCTIONS FOR MATH SYNTAX
 
 function ∀(caller::Module, member_expr::Expr, statement::Union{Expr, Symbol})
@@ -461,3 +607,61 @@ macro sync_threads_cpu() esc(:(begin end)) end
 macro sharedMem_cpu(T, dims) :(MArray{Tuple{$(esc(dims))...}, $(esc(T)), length($(esc(dims))), prod($(esc(dims)))}(undef)); end # Note: A macro is used instead of a function as a creating a type stable function is not really possible (dims can take any values and they become part of the MArray type...). MArray is not escaped in order not to have to import StaticArrays in the user code.
 
 macro sharedMem_cpu(T, dims, offset) esc(:(ParallelStencil.ParallelKernel.@sharedMem_cpu($T, $dims))) end
+
+## CPU BACKEND: WARP-LEVEL PRIMITIVES (zero-overhead pure functions)
+
+# The CPU backend follows a single-thread-per-block model. All warp-level
+# operations therefore degenerate to constants or identity operations.
+# These functions are intentionally small, @inline, allocation-free, and
+# operate on isbits values only. They are called by the macro dispatchers
+# for the CPU backend.
+
+@inline warpsize_cpu()::Int = 1
+
+@inline laneid_cpu()::Int = 1
+
+@inline active_mask_cpu()::UInt64 = UInt64(0x1)
+
+# Shuffle: direct, with optional width. Identity on CPU.
+@inline shfl_sync_cpu(mask::Unsigned, val, lane0::Int64)
+    val
+end
+
+@inline shfl_sync_cpu(mask::Unsigned, val, lane0::Int64, width::Int64)
+    val
+end
+
+# Shuffle up
+@inline shfl_up_sync_cpu(mask::Unsigned, val, delta::Int64)
+    val
+end
+
+@inline shfl_up_sync_cpu(mask::Unsigned, val, delta::Int64, width::Int64)
+    val
+end
+
+# Shuffle down
+@inline shfl_down_sync_cpu(mask::Unsigned, val, delta::Int64)
+    val
+end
+
+@inline shfl_down_sync_cpu(mask::Unsigned, val, delta::Int64, width::Int64)
+    val
+end
+
+# Shuffle xor (butterfly)
+@inline shfl_xor_sync_cpu(mask::Unsigned, val, lane_mask0::Int64)
+    val
+end
+
+@inline shfl_xor_sync_cpu(mask::Unsigned, val, lane_mask0::Int64, width::Int64)
+    val
+end
+
+# Vote operations
+@inline vote_any_sync_cpu(mask::Unsigned, predicate::Bool)::Bool = predicate
+
+@inline vote_all_sync_cpu(mask::Unsigned, predicate::Bool)::Bool = predicate
+
+# Ballot returns a mask with bit 0 set iff predicate is true; CPU uses 64-bit mask.
+@inline vote_ballot_sync_cpu(mask::Unsigned, predicate::Bool)::UInt64 = predicate ? UInt64(0x1) : UInt64(0x0)

From ffbe50d28e7ce85d8698b270221fa1d6ae9b454f Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Wed, 24 Sep 2025 09:59:07 +0200
Subject: [PATCH 07/27] fix CPU target functions

---
 src/ParallelKernel/kernel_language.jl | 32 +++++++--------------------
 1 file changed, 8 insertions(+), 24 deletions(-)

diff --git a/src/ParallelKernel/kernel_language.jl b/src/ParallelKernel/kernel_language.jl
index a88f0fbd..742d1aa6 100644
--- a/src/ParallelKernel/kernel_language.jl
+++ b/src/ParallelKernel/kernel_language.jl
@@ -623,40 +623,24 @@ macro sharedMem_cpu(T, dims, offset) esc(:(ParallelStencil.ParallelKernel.@share
 @inline active_mask_cpu()::UInt64 = UInt64(0x1)
 
 # Shuffle: direct, with optional width. Identity on CPU.
-@inline shfl_sync_cpu(mask::Unsigned, val, lane0::Int64)
-    val
-end
+@inline shfl_sync_cpu(mask::Unsigned, val, lane0::Int64) = val
 
-@inline shfl_sync_cpu(mask::Unsigned, val, lane0::Int64, width::Int64)
-    val
-end
+@inline shfl_sync_cpu(mask::Unsigned, val, lane0::Int64, width::Int64) = val
 
 # Shuffle up
-@inline shfl_up_sync_cpu(mask::Unsigned, val, delta::Int64)
-    val
-end
+@inline shfl_up_sync_cpu(mask::Unsigned, val, delta::Int64) = val
 
-@inline shfl_up_sync_cpu(mask::Unsigned, val, delta::Int64, width::Int64)
-    val
-end
+@inline shfl_up_sync_cpu(mask::Unsigned, val, delta::Int64, width::Int64) = val
 
 # Shuffle down
-@inline shfl_down_sync_cpu(mask::Unsigned, val, delta::Int64)
-    val
-end
+@inline shfl_down_sync_cpu(mask::Unsigned, val, delta::Int64) = val
 
-@inline shfl_down_sync_cpu(mask::Unsigned, val, delta::Int64, width::Int64)
-    val
-end
+@inline shfl_down_sync_cpu(mask::Unsigned, val, delta::Int64, width::Int64) = val
 
 # Shuffle xor (butterfly)
-@inline shfl_xor_sync_cpu(mask::Unsigned, val, lane_mask0::Int64)
-    val
-end
+@inline shfl_xor_sync_cpu(mask::Unsigned, val, lane_mask0::Int64) = val
 
-@inline shfl_xor_sync_cpu(mask::Unsigned, val, lane_mask0::Int64, width::Int64)
-    val
-end
+@inline shfl_xor_sync_cpu(mask::Unsigned, val, lane_mask0::Int64, width::Int64) = val
 
 # Vote operations
 @inline vote_any_sync_cpu(mask::Unsigned, predicate::Bool)::Bool = predicate

From 6c3a400266a2a0916f2a9739cbb9d7b6d58853c6 Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Wed, 24 Sep 2025 10:00:47 +0200
Subject: [PATCH 08/27] add test set for metal

---
 test/ParallelKernel/test_kernel_language.jl | 27 +++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/test/ParallelKernel/test_kernel_language.jl b/test/ParallelKernel/test_kernel_language.jl
index 92062964..ba536109 100644
--- a/test/ParallelKernel/test_kernel_language.jl
+++ b/test/ParallelKernel/test_kernel_language.jl
@@ -225,6 +225,33 @@ eval(:(
                         @test Bout_ballot == map(p -> p ? UInt64(0x1) : UInt64(0x0), P)
                     end
                 end;
+                @testset "Unsupported primitives" begin
+                    @static if $package == $PKG_METAL
+                        mask      = UInt64(0x1)
+                        mask32    = UInt32(0x1)
+                        valf      = one($FloatDefault)
+                        lane      = 1
+                        width     = 1
+                        delta     = 1
+                        lanemask  = 1
+                        predicate = true
+
+                        @test_throws Exception @prettystring(1, @active_mask())
+
+                        @test_throws Exception @prettystring(1, @shfl_sync(mask,  valf, lane))
+                        @test_throws Exception @prettystring(1, @shfl_sync(mask,  valf, lane, width))
+                        @test_throws Exception @prettystring(1, @shfl_up_sync(mask,  valf, delta))
+                        @test_throws Exception @prettystring(1, @shfl_up_sync(mask,  valf, delta, width))
+                        @test_throws Exception @prettystring(1, @shfl_down_sync(mask,  valf, delta))
+                        @test_throws Exception @prettystring(1, @shfl_down_sync(mask,  valf, delta, width))
+                        @test_throws Exception @prettystring(1, @shfl_xor_sync(mask,  valf, lanemask))
+                        @test_throws Exception @prettystring(1, @shfl_xor_sync(mask,  valf, lanemask, width))
+
+                        @test_throws Exception @prettystring(1, @vote_any_sync(mask32, predicate))
+                        @test_throws Exception @prettystring(1, @vote_all_sync(mask32, predicate))
+                        @test_throws Exception @prettystring(1, @vote_ballot_sync(mask32, predicate))
+                    end
+                end;
             end;
             @testset "@gridDim, @blockIdx, @blockDim, @threadIdx (1D)" begin
                 @static if $package == $PKG_THREADS

From a0069891e9b15b6e2607c143fc8f52d510580248 Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Wed, 24 Sep 2025 17:19:51 +0200
Subject: [PATCH 09/27] add missing exports

---
 src/ParallelStencil.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/ParallelStencil.jl b/src/ParallelStencil.jl
index a46433a6..fdd8e2fd 100644
--- a/src/ParallelStencil.jl
+++ b/src/ParallelStencil.jl
@@ -74,6 +74,7 @@ include("FiniteDifferences.jl")
 export @init_parallel_stencil, FiniteDifferences1D, FiniteDifferences2D, FiniteDifferences3D, AD
 export @parallel, @hide_communication, @parallel_indices, @parallel_async, @synchronize, @zeros, @ones, @rand, @falses, @trues, @fill, @fill!, @CellType
 export @gridDim, @blockIdx, @blockDim, @threadIdx, @sync_threads, @sharedMem, @ps_show, @ps_println, @∀
+export @warpsize, @laneid, @active_mask, @shfl_sync, @shfl_up_sync, @shfl_down_sync, @shfl_xor_sync, @vote_any_sync, @vote_all_sync, @vote_ballot_sync
 export PSNumber
 
 end # Module ParallelStencil

From 5af959e87b6afe562222685a15e7a319ba60c501 Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Fri, 26 Sep 2025 10:07:09 +0200
Subject: [PATCH 10/27] rename kernel language file to memopt

---
 src/ParallelStencil.jl                | 2 +-
 src/{kernel_language.jl => memopt.jl} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename src/{kernel_language.jl => memopt.jl} (100%)

diff --git a/src/ParallelStencil.jl b/src/ParallelStencil.jl
index fdd8e2fd..437d1612 100644
--- a/src/ParallelStencil.jl
+++ b/src/ParallelStencil.jl
@@ -61,7 +61,7 @@ include("shared.jl")
 
 ## Alphabetical include of function files
 include("init_parallel_stencil.jl")
-include("kernel_language.jl")
+include("memopt.jl")
 include("parallel.jl")
 include("reset_parallel_stencil.jl")
 
diff --git a/src/kernel_language.jl b/src/memopt.jl
similarity index 100%
rename from src/kernel_language.jl
rename to src/memopt.jl

From 313abed186841d556da6086e1a285165ce42bb86 Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Fri, 26 Sep 2025 10:10:53 +0200
Subject: [PATCH 11/27] move kernel language wrappers to the corresponding file

---
 src/kernel_language.jl | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100644 src/kernel_language.jl

diff --git a/src/kernel_language.jl b/src/kernel_language.jl
new file mode 100644
index 00000000..83b2a5e7
--- /dev/null
+++ b/src/kernel_language.jl
@@ -0,0 +1,9 @@
+@doc replace(ParallelKernel.GRIDDIM_DOC,            "@init_parallel_kernel" => "@init_parallel_stencil") macro gridDim(args...)            check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@gridDim($(args...)))); end
+@doc replace(ParallelKernel.BLOCKIDX_DOC,           "@init_parallel_kernel" => "@init_parallel_stencil") macro blockIdx(args...)           check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@blockIdx($(args...)))); end
+@doc replace(ParallelKernel.BLOCKDIM_DOC,           "@init_parallel_kernel" => "@init_parallel_stencil") macro blockDim(args...)           check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@blockDim($(args...)))); end
+@doc replace(ParallelKernel.THREADIDX_DOC,          "@init_parallel_kernel" => "@init_parallel_stencil") macro threadIdx(args...)          check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@threadIdx($(args...)))); end
+@doc replace(ParallelKernel.SYNCTHREADS_DOC,        "@init_parallel_kernel" => "@init_parallel_stencil") macro sync_threads(args...)       check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@sync_threads($(args...)))); end
+@doc replace(ParallelKernel.SHAREDMEM_DOC,          "@init_parallel_kernel" => "@init_parallel_stencil") macro sharedMem(args...)          check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@sharedMem($(args...)))); end
+@doc replace(ParallelKernel.FORALL_DOC,             "@init_parallel_kernel" => "@init_parallel_stencil") macro ∀(args...)                  check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@∀($(args...)))); end
+@doc replace(replace(ParallelKernel.PKSHOW_DOC,     "@init_parallel_kernel" => "@init_parallel_stencil"), "pk_show"    => "ps_show")    macro ps_show(args...)     check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@pk_show($(args...)))); end
+@doc replace(replace(ParallelKernel.PKPRINTLN_DOC,  "@init_parallel_kernel" => "@init_parallel_stencil"), "pk_println" => "ps_println") macro ps_println(args...)  check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@pk_println($(args...)))); end

From ca17bb576f72bd3a12daa2b948ed087311bb67ff Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Fri, 26 Sep 2025 10:14:41 +0200
Subject: [PATCH 12/27] move allocator wrappers to the corresponding file

---
 src/ParallelStencil.jl | 1 +
 src/allocators.jl      | 8 ++++++++
 2 files changed, 9 insertions(+)
 create mode 100644 src/allocators.jl

diff --git a/src/ParallelStencil.jl b/src/ParallelStencil.jl
index 437d1612..22dbfded 100644
--- a/src/ParallelStencil.jl
+++ b/src/ParallelStencil.jl
@@ -60,6 +60,7 @@ using .ParallelKernel.Exceptions
 include("shared.jl")
 
 ## Alphabetical include of function files
+include("allocators.jl")
 include("init_parallel_stencil.jl")
 include("memopt.jl")
 include("parallel.jl")
diff --git a/src/allocators.jl b/src/allocators.jl
new file mode 100644
index 00000000..c8ee4bc5
--- /dev/null
+++ b/src/allocators.jl
@@ -0,0 +1,8 @@
+@doc replace(ParallelKernel.ZEROS_DOC,              "@init_parallel_kernel" => "@init_parallel_stencil") macro zeros(args...)              check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@zeros($(args...)))); end
+@doc replace(ParallelKernel.ONES_DOC,               "@init_parallel_kernel" => "@init_parallel_stencil") macro ones(args...)               check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@ones($(args...)))); end
+@doc replace(ParallelKernel.RAND_DOC,               "@init_parallel_kernel" => "@init_parallel_stencil") macro rand(args...)               check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@rand($(args...)))); end
+@doc replace(ParallelKernel.FALSES_DOC,             "@init_parallel_kernel" => "@init_parallel_stencil") macro falses(args...)             check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@falses($(args...)))); end
+@doc replace(ParallelKernel.TRUES_DOC,              "@init_parallel_kernel" => "@init_parallel_stencil") macro trues(args...)              check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@trues($(args...)))); end
+@doc replace(ParallelKernel.FILL_DOC,               "@init_parallel_kernel" => "@init_parallel_stencil") macro fill(args...)               check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@fill($(args...)))); end
+@doc replace(ParallelKernel.FILL!_DOC,              "@init_parallel_kernel" => "@init_parallel_stencil") macro fill!(args...)              check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@fill!($(args...)))); end
+@doc replace(ParallelKernel.CELLTYPE_DOC,           "@init_parallel_kernel" => "@init_parallel_stencil") macro CellType(args...)           check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@CellType($(args...)))); end

From 3b7d253e6aa6ca7bacbece8aa0c91e365bc8cc34 Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Fri, 26 Sep 2025 10:15:11 +0200
Subject: [PATCH 13/27] move allocator wrappers to the corresponding file

---
 src/init_parallel_stencil.jl | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/src/init_parallel_stencil.jl b/src/init_parallel_stencil.jl
index 454ee360..27c65198 100644
--- a/src/init_parallel_stencil.jl
+++ b/src/init_parallel_stencil.jl
@@ -1,24 +1,6 @@
 # NOTE: @parallel and @parallel_indices and @parallel_async do not appear in the following as they are extended and therefore defined in parallel.jl
 @doc replace(ParallelKernel.HIDE_COMMUNICATION_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro hide_communication(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@hide_communication($(args...)))); end
-@doc replace(ParallelKernel.ZEROS_DOC,              "@init_parallel_kernel" => "@init_parallel_stencil") macro zeros(args...)              check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@zeros($(args...)))); end
-@doc replace(ParallelKernel.ONES_DOC,               "@init_parallel_kernel" => "@init_parallel_stencil") macro ones(args...)               check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@ones($(args...)))); end
-@doc replace(ParallelKernel.RAND_DOC,               "@init_parallel_kernel" => "@init_parallel_stencil") macro rand(args...)               check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@rand($(args...)))); end
-@doc replace(ParallelKernel.FALSES_DOC,             "@init_parallel_kernel" => "@init_parallel_stencil") macro falses(args...)             check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@falses($(args...)))); end
-@doc replace(ParallelKernel.TRUES_DOC,              "@init_parallel_kernel" => "@init_parallel_stencil") macro trues(args...)              check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@trues($(args...)))); end
-@doc replace(ParallelKernel.FILL_DOC,               "@init_parallel_kernel" => "@init_parallel_stencil") macro fill(args...)               check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@fill($(args...)))); end
-@doc replace(ParallelKernel.FILL!_DOC,              "@init_parallel_kernel" => "@init_parallel_stencil") macro fill!(args...)              check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@fill!($(args...)))); end
-@doc replace(ParallelKernel.CELLTYPE_DOC,           "@init_parallel_kernel" => "@init_parallel_stencil") macro CellType(args...)           check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@CellType($(args...)))); end
 @doc replace(ParallelKernel.SYNCHRONIZE_DOC,        "@init_parallel_kernel" => "@init_parallel_stencil") macro synchronize(args...)        check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@synchronize($(args...)))); end
-@doc replace(ParallelKernel.GRIDDIM_DOC,            "@init_parallel_kernel" => "@init_parallel_stencil") macro gridDim(args...)            check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@gridDim($(args...)))); end
-@doc replace(ParallelKernel.BLOCKIDX_DOC,           "@init_parallel_kernel" => "@init_parallel_stencil") macro blockIdx(args...)           check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@blockIdx($(args...)))); end
-@doc replace(ParallelKernel.BLOCKDIM_DOC,           "@init_parallel_kernel" => "@init_parallel_stencil") macro blockDim(args...)           check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@blockDim($(args...)))); end
-@doc replace(ParallelKernel.THREADIDX_DOC,          "@init_parallel_kernel" => "@init_parallel_stencil") macro threadIdx(args...)          check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@threadIdx($(args...)))); end
-@doc replace(ParallelKernel.SYNCTHREADS_DOC,        "@init_parallel_kernel" => "@init_parallel_stencil") macro sync_threads(args...)       check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@sync_threads($(args...)))); end
-@doc replace(ParallelKernel.SHAREDMEM_DOC,          "@init_parallel_kernel" => "@init_parallel_stencil") macro sharedMem(args...)          check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@sharedMem($(args...)))); end
-@doc replace(ParallelKernel.FORALL_DOC,             "@init_parallel_kernel" => "@init_parallel_stencil") macro ∀(args...)                  check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@∀($(args...)))); end
-@doc replace(replace(ParallelKernel.PKSHOW_DOC,     "@init_parallel_kernel" => "@init_parallel_stencil"), "pk_show"    => "ps_show")    macro ps_show(args...)     check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@pk_show($(args...)))); end
-@doc replace(replace(ParallelKernel.PKPRINTLN_DOC,  "@init_parallel_kernel" => "@init_parallel_stencil"), "pk_println" => "ps_println") macro ps_println(args...)  check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@pk_println($(args...)))); end
-
 
 """
     @init_parallel_stencil(package, numbertype, ndims)

From 2da9f45340e4a7df772d1ad65b60bdedf4ff8204 Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Fri, 26 Sep 2025 10:18:24 +0200
Subject: [PATCH 14/27] move hide communication to the corresponding file

---
 src/hide_communication.jl    | 1 +
 src/init_parallel_stencil.jl | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)
 create mode 100644 src/hide_communication.jl

diff --git a/src/hide_communication.jl b/src/hide_communication.jl
new file mode 100644
index 00000000..868acd32
--- /dev/null
+++ b/src/hide_communication.jl
@@ -0,0 +1 @@
+@doc replace(ParallelKernel.HIDE_COMMUNICATION_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro hide_communication(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@hide_communication($(args...)))); end
\ No newline at end of file
diff --git a/src/init_parallel_stencil.jl b/src/init_parallel_stencil.jl
index 27c65198..6bb14537 100644
--- a/src/init_parallel_stencil.jl
+++ b/src/init_parallel_stencil.jl
@@ -1,5 +1,4 @@
 # NOTE: @parallel and @parallel_indices and @parallel_async do not appear in the following as they are extended and therefore defined in parallel.jl
-@doc replace(ParallelKernel.HIDE_COMMUNICATION_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro hide_communication(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@hide_communication($(args...)))); end
 @doc replace(ParallelKernel.SYNCHRONIZE_DOC,        "@init_parallel_kernel" => "@init_parallel_stencil") macro synchronize(args...)        check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@synchronize($(args...)))); end
 
 """

From 37f5a056fa872661070a2b2d679e3ff2e2e09f4a Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Fri, 26 Sep 2025 10:18:49 +0200
Subject: [PATCH 15/27] move hide communication to the corresponding file

---
 src/ParallelStencil.jl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/ParallelStencil.jl b/src/ParallelStencil.jl
index 22dbfded..ba30d7a6 100644
--- a/src/ParallelStencil.jl
+++ b/src/ParallelStencil.jl
@@ -61,6 +61,8 @@ include("shared.jl")
 
 ## Alphabetical include of function files
 include("allocators.jl")
+include("hide_communication.jl")
+include("kernel_language.jl")
 include("init_parallel_stencil.jl")
 include("memopt.jl")
 include("parallel.jl")

From 59c581bdf4a55e77e652cc6a18f2ec5ee3c776db Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Fri, 26 Sep 2025 10:19:18 +0200
Subject: [PATCH 16/27] move hide communication to the corresponding file

---
 src/ParallelStencil.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ParallelStencil.jl b/src/ParallelStencil.jl
index ba30d7a6..1792a8c1 100644
--- a/src/ParallelStencil.jl
+++ b/src/ParallelStencil.jl
@@ -62,8 +62,8 @@ include("shared.jl")
 ## Alphabetical include of function files
 include("allocators.jl")
 include("hide_communication.jl")
-include("kernel_language.jl")
 include("init_parallel_stencil.jl")
+include("kernel_language.jl")
 include("memopt.jl")
 include("parallel.jl")
 include("reset_parallel_stencil.jl")

From 042af793d67b769f0810adfca7c6a58b990961aa Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Fri, 26 Sep 2025 10:23:01 +0200
Subject: [PATCH 17/27] moves synchronization to the corresponding file

---
 src/init_parallel_stencil.jl | 3 ---
 src/parallel.jl              | 4 ++++
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/init_parallel_stencil.jl b/src/init_parallel_stencil.jl
index 6bb14537..e27b62bd 100644
--- a/src/init_parallel_stencil.jl
+++ b/src/init_parallel_stencil.jl
@@ -1,6 +1,3 @@
-# NOTE: @parallel and @parallel_indices and @parallel_async do not appear in the following as they are extended and therefore defined in parallel.jl
-@doc replace(ParallelKernel.SYNCHRONIZE_DOC,        "@init_parallel_kernel" => "@init_parallel_stencil") macro synchronize(args...)        check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@synchronize($(args...)))); end
-
 """
     @init_parallel_stencil(package, numbertype, ndims)
     @init_parallel_stencil(package, numbertype, ndims, inbounds=...)
diff --git a/src/parallel.jl b/src/parallel.jl
index c3ca3055..574741f1 100644
--- a/src/parallel.jl
+++ b/src/parallel.jl
@@ -1,5 +1,9 @@
 import .ParallelKernel: get_name, set_name, get_body, set_body!, add_return, remove_return, extract_kwargs, split_parallel_args, extract_tuple, substitute, literaltypes, push_to_signature!, add_loop, add_threadids, promote_maxsize
 
+# NOTE: @parallel and @parallel_indices and @parallel_async do not appear in the following as they are extended and therefore re-defined here in parallel.jl
+@doc replace(ParallelKernel.SYNCHRONIZE_DOC,        "@init_parallel_kernel" => "@init_parallel_stencil") macro synchronize(args...)        check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@synchronize($(args...)))); end
+
+
 const PARALLEL_DOC = """
     @parallel kernel
     @parallel inbounds=... memopt=... ndims=... kernel

From 70dbb79d0341a96a8fb4b359753bb112a7ca8216 Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Wed, 29 Oct 2025 12:12:40 +0100
Subject: [PATCH 18/27] add warp level primitives to module docstring

---
 src/ParallelKernel/ParallelKernel.jl | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/ParallelKernel/ParallelKernel.jl b/src/ParallelKernel/ParallelKernel.jl
index 8e355ee7..78f18172 100644
--- a/src/ParallelKernel/ParallelKernel.jl
+++ b/src/ParallelKernel/ParallelKernel.jl
@@ -32,6 +32,17 @@ Enables writing parallel high-performance kernels and whole applications that ca
     - [`@threadIdx`](@ref)
     - [`@sync_threads`](@ref)
     - [`@sharedMem`](@ref)
+!!! note "Warp-level primitives"
+    - [`@warpsize`](@ref)
+    - [`@laneid`](@ref)
+    - [`@active_mask`](@ref)
+    - [`@shfl_sync`](@ref)
+    - [`@shfl_up_sync`](@ref)
+    - [`@shfl_down_sync`](@ref)
+    - [`@shfl_xor_sync`](@ref)
+    - [`@vote_any_sync`](@ref)
+    - [`@vote_all_sync`](@ref)
+    - [`@vote_ballot_sync`](@ref)
 
 # Submodules
 - [`ParallelKernel.AD`](@ref)

From 660829c6a11fbc71299082df7c93d327090616df Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Wed, 29 Oct 2025 12:13:10 +0100
Subject: [PATCH 19/27] add warp level primitives to module docstring

---
 src/ParallelStencil.jl | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/ParallelStencil.jl b/src/ParallelStencil.jl
index 1792a8c1..a890ff77 100644
--- a/src/ParallelStencil.jl
+++ b/src/ParallelStencil.jl
@@ -32,6 +32,17 @@ https://github.com/omlins/ParallelStencil.jl
     - [`@threadIdx`](@ref)
     - [`@sync_threads`](@ref)
     - [`@sharedMem`](@ref)
+!!! note "Warp-level primitives"
+    - [`@warpsize`](@ref)
+    - [`@laneid`](@ref)
+    - [`@active_mask`](@ref)
+    - [`@shfl_sync`](@ref)
+    - [`@shfl_up_sync`](@ref)
+    - [`@shfl_down_sync`](@ref)
+    - [`@shfl_xor_sync`](@ref)
+    - [`@vote_any_sync`](@ref)
+    - [`@vote_all_sync`](@ref)
+    - [`@vote_ballot_sync`](@ref)
 
 # Submodules
 - [`ParallelStencil.AD`](@ref)

From 53dabe40b84838cf00bc4d96bace1d4eff103830 Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Wed, 29 Oct 2025 12:14:11 +0100
Subject: [PATCH 20/27] add pass-through macros to ParallelKernel

---
 src/kernel_language.jl | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/kernel_language.jl b/src/kernel_language.jl
index 83b2a5e7..b161a3a5 100644
--- a/src/kernel_language.jl
+++ b/src/kernel_language.jl
@@ -7,3 +7,13 @@
 @doc replace(ParallelKernel.FORALL_DOC,             "@init_parallel_kernel" => "@init_parallel_stencil") macro ∀(args...)                  check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@∀($(args...)))); end
 @doc replace(replace(ParallelKernel.PKSHOW_DOC,     "@init_parallel_kernel" => "@init_parallel_stencil"), "pk_show"    => "ps_show")    macro ps_show(args...)     check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@pk_show($(args...)))); end
 @doc replace(replace(ParallelKernel.PKPRINTLN_DOC,  "@init_parallel_kernel" => "@init_parallel_stencil"), "pk_println" => "ps_println") macro ps_println(args...)  check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@pk_println($(args...)))); end
+@doc replace(ParallelKernel.WARPSIZE_DOC,           "@init_parallel_kernel" => "@init_parallel_stencil") macro warpsize(args...)           check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@warpsize($(args...)))); end
+@doc replace(ParallelKernel.LANEID_DOC,             "@init_parallel_kernel" => "@init_parallel_stencil") macro laneid(args...)             check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@laneid($(args...)))); end
+@doc replace(ParallelKernel.ACTIVE_MASK_DOC,        "@init_parallel_kernel" => "@init_parallel_stencil") macro active_mask(args...)        check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@active_mask($(args...)))); end
+@doc replace(ParallelKernel.SHFL_SYNC_DOC,          "@init_parallel_kernel" => "@init_parallel_stencil") macro shfl_sync(args...)          check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@shfl_sync($(args...)))); end
+@doc replace(ParallelKernel.SHFL_UP_SYNC_DOC,       "@init_parallel_kernel" => "@init_parallel_stencil") macro shfl_up_sync(args...)       check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@shfl_up_sync($(args...)))); end
+@doc replace(ParallelKernel.SHFL_DOWN_SYNC_DOC,     "@init_parallel_kernel" => "@init_parallel_stencil") macro shfl_down_sync(args...)     check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@shfl_down_sync($(args...)))); end
+@doc replace(ParallelKernel.SHFL_XOR_SYNC_DOC,      "@init_parallel_kernel" => "@init_parallel_stencil") macro shfl_xor_sync(args...)      check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@shfl_xor_sync($(args...)))); end
+@doc replace(ParallelKernel.VOTE_ANY_SYNC_DOC,      "@init_parallel_kernel" => "@init_parallel_stencil") macro vote_any_sync(args...)      check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@vote_any_sync($(args...)))); end
+@doc replace(ParallelKernel.VOTE_ALL_SYNC_DOC,      "@init_parallel_kernel" => "@init_parallel_stencil") macro vote_all_sync(args...)      check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@vote_all_sync($(args...)))); end
+@doc replace(ParallelKernel.VOTE_BALLOT_SYNC_DOC,   "@init_parallel_kernel" => "@init_parallel_stencil") macro vote_ballot_sync(args...)   check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.@vote_ballot_sync($(args...)))); end

From 49d19b7260cb4ffce363b6cc6fdb0219a1ab17f7 Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Wed, 29 Oct 2025 12:14:55 +0100
Subject: [PATCH 21/27] add ParallelStencil kernel language tests

---
 test/test_kernel_language.jl | 134 +++++++++++++++++++++++++++++++++++
 1 file changed, 134 insertions(+)
 create mode 100644 test/test_kernel_language.jl

diff --git a/test/test_kernel_language.jl b/test/test_kernel_language.jl
new file mode 100644
index 00000000..d908c410
--- /dev/null
+++ b/test/test_kernel_language.jl
@@ -0,0 +1,134 @@
+using Test
+using ParallelStencil
+import ParallelStencil: @reset_parallel_stencil, @is_initialized, SUPPORTED_PACKAGES,
+    PKG_CUDA, PKG_AMDGPU, PKG_METAL, PKG_THREADS, PKG_POLYESTER, @require, @iscpu
+
+TEST_PACKAGES = copy(SUPPORTED_PACKAGES)
+@static if PKG_CUDA in TEST_PACKAGES
+    import CUDA
+    if !CUDA.functional()
+        TEST_PACKAGES = filter!(x -> x ≠ PKG_CUDA, TEST_PACKAGES)
+    end
+end
+@static if PKG_AMDGPU in TEST_PACKAGES
+    import AMDGPU
+    if !AMDGPU.functional()
+        TEST_PACKAGES = filter!(x -> x ≠ PKG_AMDGPU, TEST_PACKAGES)
+    end
+end
+@static if PKG_METAL in TEST_PACKAGES
+    import Metal
+    if !Metal.functional()
+        TEST_PACKAGES = filter!(x -> x ≠ PKG_METAL, TEST_PACKAGES)
+    end
+end
+@static if PKG_POLYESTER in TEST_PACKAGES
+    import Polyester
+end
+Base.retry_load_extensions()
+
+strip_linenums(ex) = Base.remove_linenums!(deepcopy(ex))
+expand_once(expr) = strip_linenums(Base.macroexpand(@__MODULE__, expr, recursive=false))
+normalized_string(expr) = strip(replace(string(expand_once(expr)), r"#= .*? =#" => ""))
+
+@static for package in TEST_PACKAGES
+    FloatDefault = (package == PKG_METAL) ? Float32 : Float64
+
+    eval(:(
+        @testset "$(basename(@__FILE__)) (package: $(nameof($package)))" begin
+            @testset "Kernel language pass-through macros" begin
+                @require !@is_initialized()
+                @init_parallel_stencil($package, $FloatDefault, 3, nonconst_metadata=true)
+                @require @is_initialized()
+
+                mask64    = UInt64(0xffff_ffff_ffff_ffff)
+                mask32    = UInt32(0xffff_ffff)
+                val       = one($FloatDefault)
+                lane      = 1
+                width     = 2
+                delta     = 1
+                lanemask  = 1
+                predicate = true
+                x         = 42
+
+                @testset "Macro expansion forwards to ParallelKernel" begin
+                    @test normalized_string(:(@gridDim()))      == "ParallelStencil.ParallelKernel.@gridDim"
+                    @test normalized_string(:(@blockIdx()))     == "ParallelStencil.ParallelKernel.@blockIdx"
+                    @test normalized_string(:(@blockDim()))     == "ParallelStencil.ParallelKernel.@blockDim"
+                    @test normalized_string(:(@threadIdx()))    == "ParallelStencil.ParallelKernel.@threadIdx"
+                    @test normalized_string(:(@sync_threads())) == "ParallelStencil.ParallelKernel.@sync_threads"
+                    @test normalized_string(:(@sharedMem($FloatDefault, (2, 3)))) == "ParallelStencil.ParallelKernel.@sharedMem $(string($FloatDefault)) (2, 3)"
+                    @test normalized_string(:(@ps_show x))      == "ParallelStencil.ParallelKernel.@pk_show x"
+                    @test normalized_string(:(@ps_println "pass-through")) == "ParallelStencil.ParallelKernel.@pk_println \"pass-through\""
+                    @test occursin("ParallelStencil.ParallelKernel.@∀", normalized_string(:(@∀ i ∈ (x,) @all(C.i) = @all(A.i))))
+
+                    @test normalized_string(:(@warpsize()))     == "ParallelStencil.ParallelKernel.@warpsize"
+                    @test normalized_string(:(@laneid()))       == "ParallelStencil.ParallelKernel.@laneid"
+                    @test normalized_string(:(@active_mask()))  == "ParallelStencil.ParallelKernel.@active_mask"
+
+                    @test normalized_string(:(@shfl_sync(mask32, val, lane))) == "ParallelStencil.ParallelKernel.@shfl_sync mask32 val lane"
+                    @test normalized_string(:(@shfl_sync(mask32, val, lane, width))) == "ParallelStencil.ParallelKernel.@shfl_sync mask32 val lane width"
+                    @test normalized_string(:(@shfl_up_sync(mask32, val, delta))) == "ParallelStencil.ParallelKernel.@shfl_up_sync mask32 val delta"
+                    @test normalized_string(:(@shfl_up_sync(mask32, val, delta, width))) == "ParallelStencil.ParallelKernel.@shfl_up_sync mask32 val delta width"
+                    @test normalized_string(:(@shfl_down_sync(mask32, val, delta))) == "ParallelStencil.ParallelKernel.@shfl_down_sync mask32 val delta"
+                    @test normalized_string(:(@shfl_down_sync(mask32, val, delta, width))) == "ParallelStencil.ParallelKernel.@shfl_down_sync mask32 val delta width"
+                    @test normalized_string(:(@shfl_xor_sync(mask32, val, lanemask))) == "ParallelStencil.ParallelKernel.@shfl_xor_sync mask32 val lanemask"
+                    @test normalized_string(:(@shfl_xor_sync(mask32, val, lanemask, width))) == "ParallelStencil.ParallelKernel.@shfl_xor_sync mask32 val lanemask width"
+
+                    @test normalized_string(:(@vote_any_sync(mask32, predicate))) == "ParallelStencil.ParallelKernel.@vote_any_sync mask32 predicate"
+                    @test normalized_string(:(@vote_all_sync(mask32, predicate))) == "ParallelStencil.ParallelKernel.@vote_all_sync mask32 predicate"
+                    @test normalized_string(:(@vote_ballot_sync(mask32, predicate))) == "ParallelStencil.ParallelKernel.@vote_ballot_sync mask32 predicate"
+                end
+
+                @testset "CPU runtime smoke tests" begin
+                    @static if @iscpu($package)
+                        N = 8
+                        A = rand($FloatDefault, N)
+                        P = [A[i] > zero($FloatDefault) for i in 1:N]
+                        Bout_any    = Vector{Bool}(undef, N)
+                        Bout_all    = Vector{Bool}(undef, N)
+                        Bout_ballot = Vector{UInt64}(undef, N)
+                        Bshfl       = similar(A)
+                        Bshfl_up    = similar(A)
+                        Bshfl_down  = similar(A)
+                        Bshfl_xor   = similar(A)
+
+                        @parallel_indices (ix) function kernel_pass_through!(Bout_any, Bout_all, Bout_ballot,
+                                Bshfl, Bshfl_up, Bshfl_down, Bshfl_xor, A, P)
+                            mask = @active_mask()
+                            warp = @warpsize()
+                            lane_local = @laneid()
+                            @test warp == 1
+                            @test lane_local == 1
+
+                            Bshfl[ix]      = @shfl_sync(mask, A[ix], lane_local)
+                            Bshfl_up[ix]   = @shfl_up_sync(mask, A[ix], 1)
+                            Bshfl_down[ix] = @shfl_down_sync(mask, A[ix], 1)
+                            Bshfl_xor[ix]  = @shfl_xor_sync(mask, A[ix], 1)
+
+                            pred = P[ix]
+                            Bout_any[ix]    = @vote_any_sync(mask, pred)
+                            Bout_all[ix]    = @vote_all_sync(mask, pred)
+                            Bout_ballot[ix] = @vote_ballot_sync(mask, pred)
+                            return
+                        end
+
+                        @parallel (1:N) kernel_pass_through!(Bout_any, Bout_all, Bout_ballot,
+                            Bshfl, Bshfl_up, Bshfl_down, Bshfl_xor, A, P)
+
+                        @test all(Bshfl .== A)
+                        @test all(Bshfl_up .== A)
+                        @test all(Bshfl_down .== A)
+                        @test all(Bshfl_xor .== A)
+                        @test Bout_any == P
+                        @test Bout_all == P
+                        @test Bout_ballot == map(p -> p ? UInt64(0x1) : UInt64(0x0), P)
+                    end
+                end
+
+                @reset_parallel_stencil()
+                @require !@is_initialized()
+            end
+        end
+    ))
+end

From d6b9f8456276a5cbe6a5027a86dfee94c8062750 Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Fri, 31 Oct 2025 09:45:02 +0100
Subject: [PATCH 22/27] add parallel stencil kernel language tests

---
 test/test_kernel_language.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_kernel_language.jl b/test/test_kernel_language.jl
index d908c410..e03234ec 100644
--- a/test/test_kernel_language.jl
+++ b/test/test_kernel_language.jl
@@ -3,7 +3,7 @@ using ParallelStencil
 import ParallelStencil: @reset_parallel_stencil, @is_initialized, SUPPORTED_PACKAGES,
     PKG_CUDA, PKG_AMDGPU, PKG_METAL, PKG_THREADS, PKG_POLYESTER, @require, @iscpu
 
-TEST_PACKAGES = copy(SUPPORTED_PACKAGES)
+TEST_PACKAGES = SUPPORTED_PACKAGES
 @static if PKG_CUDA in TEST_PACKAGES
     import CUDA
     if !CUDA.functional()
@@ -131,4 +131,4 @@ normalized_string(expr) = strip(replace(string(expand_once(expr)), r"#= .*? =#"
             end
         end
     ))
-end
+end == nothing || true;

From 74020cd166b039b1f32af7ed1c52785cce380670 Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Fri, 31 Oct 2025 10:04:27 +0100
Subject: [PATCH 23/27] add parallel stencil kernel language tests

---
 test/test_kernel_language.jl | 134 -----------------------------------
 1 file changed, 134 deletions(-)
 delete mode 100644 test/test_kernel_language.jl

diff --git a/test/test_kernel_language.jl b/test/test_kernel_language.jl
deleted file mode 100644
index e03234ec..00000000
--- a/test/test_kernel_language.jl
+++ /dev/null
@@ -1,134 +0,0 @@
-using Test
-using ParallelStencil
-import ParallelStencil: @reset_parallel_stencil, @is_initialized, SUPPORTED_PACKAGES,
-    PKG_CUDA, PKG_AMDGPU, PKG_METAL, PKG_THREADS, PKG_POLYESTER, @require, @iscpu
-
-TEST_PACKAGES = SUPPORTED_PACKAGES
-@static if PKG_CUDA in TEST_PACKAGES
-    import CUDA
-    if !CUDA.functional()
-        TEST_PACKAGES = filter!(x -> x ≠ PKG_CUDA, TEST_PACKAGES)
-    end
-end
-@static if PKG_AMDGPU in TEST_PACKAGES
-    import AMDGPU
-    if !AMDGPU.functional()
-        TEST_PACKAGES = filter!(x -> x ≠ PKG_AMDGPU, TEST_PACKAGES)
-    end
-end
-@static if PKG_METAL in TEST_PACKAGES
-    import Metal
-    if !Metal.functional()
-        TEST_PACKAGES = filter!(x -> x ≠ PKG_METAL, TEST_PACKAGES)
-    end
-end
-@static if PKG_POLYESTER in TEST_PACKAGES
-    import Polyester
-end
-Base.retry_load_extensions()
-
-strip_linenums(ex) = Base.remove_linenums!(deepcopy(ex))
-expand_once(expr) = strip_linenums(Base.macroexpand(@__MODULE__, expr, recursive=false))
-normalized_string(expr) = strip(replace(string(expand_once(expr)), r"#= .*? =#" => ""))
-
-@static for package in TEST_PACKAGES
-    FloatDefault = (package == PKG_METAL) ? Float32 : Float64
-
-    eval(:(
-        @testset "$(basename(@__FILE__)) (package: $(nameof($package)))" begin
-            @testset "Kernel language pass-through macros" begin
-                @require !@is_initialized()
-                @init_parallel_stencil($package, $FloatDefault, 3, nonconst_metadata=true)
-                @require @is_initialized()
-
-                mask64    = UInt64(0xffff_ffff_ffff_ffff)
-                mask32    = UInt32(0xffff_ffff)
-                val       = one($FloatDefault)
-                lane      = 1
-                width     = 2
-                delta     = 1
-                lanemask  = 1
-                predicate = true
-                x         = 42
-
-                @testset "Macro expansion forwards to ParallelKernel" begin
-                    @test normalized_string(:(@gridDim()))      == "ParallelStencil.ParallelKernel.@gridDim"
-                    @test normalized_string(:(@blockIdx()))     == "ParallelStencil.ParallelKernel.@blockIdx"
-                    @test normalized_string(:(@blockDim()))     == "ParallelStencil.ParallelKernel.@blockDim"
-                    @test normalized_string(:(@threadIdx()))    == "ParallelStencil.ParallelKernel.@threadIdx"
-                    @test normalized_string(:(@sync_threads())) == "ParallelStencil.ParallelKernel.@sync_threads"
-                    @test normalized_string(:(@sharedMem($FloatDefault, (2, 3)))) == "ParallelStencil.ParallelKernel.@sharedMem $(string($FloatDefault)) (2, 3)"
-                    @test normalized_string(:(@ps_show x))      == "ParallelStencil.ParallelKernel.@pk_show x"
-                    @test normalized_string(:(@ps_println "pass-through")) == "ParallelStencil.ParallelKernel.@pk_println \"pass-through\""
-                    @test occursin("ParallelStencil.ParallelKernel.@∀", normalized_string(:(@∀ i ∈ (x,) @all(C.i) = @all(A.i))))
-
-                    @test normalized_string(:(@warpsize()))     == "ParallelStencil.ParallelKernel.@warpsize"
-                    @test normalized_string(:(@laneid()))       == "ParallelStencil.ParallelKernel.@laneid"
-                    @test normalized_string(:(@active_mask()))  == "ParallelStencil.ParallelKernel.@active_mask"
-
-                    @test normalized_string(:(@shfl_sync(mask32, val, lane))) == "ParallelStencil.ParallelKernel.@shfl_sync mask32 val lane"
-                    @test normalized_string(:(@shfl_sync(mask32, val, lane, width))) == "ParallelStencil.ParallelKernel.@shfl_sync mask32 val lane width"
-                    @test normalized_string(:(@shfl_up_sync(mask32, val, delta))) == "ParallelStencil.ParallelKernel.@shfl_up_sync mask32 val delta"
-                    @test normalized_string(:(@shfl_up_sync(mask32, val, delta, width))) == "ParallelStencil.ParallelKernel.@shfl_up_sync mask32 val delta width"
-                    @test normalized_string(:(@shfl_down_sync(mask32, val, delta))) == "ParallelStencil.ParallelKernel.@shfl_down_sync mask32 val delta"
-                    @test normalized_string(:(@shfl_down_sync(mask32, val, delta, width))) == "ParallelStencil.ParallelKernel.@shfl_down_sync mask32 val delta width"
-                    @test normalized_string(:(@shfl_xor_sync(mask32, val, lanemask))) == "ParallelStencil.ParallelKernel.@shfl_xor_sync mask32 val lanemask"
-                    @test normalized_string(:(@shfl_xor_sync(mask32, val, lanemask, width))) == "ParallelStencil.ParallelKernel.@shfl_xor_sync mask32 val lanemask width"
-
-                    @test normalized_string(:(@vote_any_sync(mask32, predicate))) == "ParallelStencil.ParallelKernel.@vote_any_sync mask32 predicate"
-                    @test normalized_string(:(@vote_all_sync(mask32, predicate))) == "ParallelStencil.ParallelKernel.@vote_all_sync mask32 predicate"
-                    @test normalized_string(:(@vote_ballot_sync(mask32, predicate))) == "ParallelStencil.ParallelKernel.@vote_ballot_sync mask32 predicate"
-                end
-
-                @testset "CPU runtime smoke tests" begin
-                    @static if @iscpu($package)
-                        N = 8
-                        A = rand($FloatDefault, N)
-                        P = [A[i] > zero($FloatDefault) for i in 1:N]
-                        Bout_any    = Vector{Bool}(undef, N)
-                        Bout_all    = Vector{Bool}(undef, N)
-                        Bout_ballot = Vector{UInt64}(undef, N)
-                        Bshfl       = similar(A)
-                        Bshfl_up    = similar(A)
-                        Bshfl_down  = similar(A)
-                        Bshfl_xor   = similar(A)
-
-                        @parallel_indices (ix) function kernel_pass_through!(Bout_any, Bout_all, Bout_ballot,
-                                Bshfl, Bshfl_up, Bshfl_down, Bshfl_xor, A, P)
-                            mask = @active_mask()
-                            warp = @warpsize()
-                            lane_local = @laneid()
-                            @test warp == 1
-                            @test lane_local == 1
-
-                            Bshfl[ix]      = @shfl_sync(mask, A[ix], lane_local)
-                            Bshfl_up[ix]   = @shfl_up_sync(mask, A[ix], 1)
-                            Bshfl_down[ix] = @shfl_down_sync(mask, A[ix], 1)
-                            Bshfl_xor[ix]  = @shfl_xor_sync(mask, A[ix], 1)
-
-                            pred = P[ix]
-                            Bout_any[ix]    = @vote_any_sync(mask, pred)
-                            Bout_all[ix]    = @vote_all_sync(mask, pred)
-                            Bout_ballot[ix] = @vote_ballot_sync(mask, pred)
-                            return
-                        end
-
-                        @parallel (1:N) kernel_pass_through!(Bout_any, Bout_all, Bout_ballot,
-                            Bshfl, Bshfl_up, Bshfl_down, Bshfl_xor, A, P)
-
-                        @test all(Bshfl .== A)
-                        @test all(Bshfl_up .== A)
-                        @test all(Bshfl_down .== A)
-                        @test all(Bshfl_xor .== A)
-                        @test Bout_any == P
-                        @test Bout_all == P
-                        @test Bout_ballot == map(p -> p ? UInt64(0x1) : UInt64(0x0), P)
-                    end
-                end
-
-                @reset_parallel_stencil()
-                @require !@is_initialized()
-            end
-        end
-    ))
-end == nothing || true;

From 22c0bb54d0b578a8c63d012b9f88e4baa321a27c Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Fri, 31 Oct 2025 11:53:36 +0100
Subject: [PATCH 24/27] add parallel stencil kernel language tests

---
 test/test_kernel_language.jl | 114 +++++++++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100644 test/test_kernel_language.jl

diff --git a/test/test_kernel_language.jl b/test/test_kernel_language.jl
new file mode 100644
index 00000000..65ea40cb
--- /dev/null
+++ b/test/test_kernel_language.jl
@@ -0,0 +1,114 @@
+using Test
+using ParallelStencil
+import ParallelStencil: @reset_parallel_stencil, @is_initialized, SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_METAL, PKG_THREADS, PKG_POLYESTER
+import ParallelStencil: @require, @prettystring, @iscpu
+
+TEST_PACKAGES = SUPPORTED_PACKAGES
+@static if PKG_CUDA in TEST_PACKAGES
+    import CUDA
+    if !CUDA.functional()
+        TEST_PACKAGES = filter!(x -> x ≠ PKG_CUDA, TEST_PACKAGES)
+    end
+end
+@static if PKG_AMDGPU in TEST_PACKAGES
+    import AMDGPU
+    if !AMDGPU.functional()
+        TEST_PACKAGES = filter!(x -> x ≠ PKG_AMDGPU, TEST_PACKAGES)
+    end
+end
+@static if PKG_METAL in TEST_PACKAGES
+    @static if Sys.isapple()
+        import Metal
+        if !Metal.functional()
+            TEST_PACKAGES = filter!(x -> x ≠ PKG_METAL, TEST_PACKAGES)
+        end
+    else
+        TEST_PACKAGES = filter!(x -> x ≠ PKG_METAL, TEST_PACKAGES)
+    end
+end
+@static if PKG_POLYESTER in TEST_PACKAGES
+    import Polyester
+end
+Base.retry_load_extensions()
+
+@static for package in TEST_PACKAGES
+    FloatDefault = (package == PKG_METAL) ? Float32 : Float64
+
+    eval(:(
+    @testset "$(basename(@__FILE__)) (package: $(nameof($package)))" begin
+        @require !@is_initialized()
+        @init_parallel_stencil($package, $FloatDefault, 3)
+        @require @is_initialized()
+
+        @testset "Pass-through macro mapping" begin
+            @test @prettystring(1, @gridDim()) == "ParallelStencil.ParallelKernel.@gridDim"
+            @test @prettystring(1, @blockIdx()) == "ParallelStencil.ParallelKernel.@blockIdx"
+            @test @prettystring(1, @blockDim()) == "ParallelStencil.ParallelKernel.@blockDim"
+            @test @prettystring(1, @threadIdx()) == "ParallelStencil.ParallelKernel.@threadIdx"
+            @test @prettystring(1, @sync_threads()) == "ParallelStencil.ParallelKernel.@sync_threads"
+            @test @prettystring(1, @sharedMem(T, dims)) == "ParallelStencil.ParallelKernel.@sharedMem T dims"
+            @test @prettystring(1, @ps_show args) == "ParallelStencil.ParallelKernel.@pk_show args"
+            @test @prettystring(1, @ps_println args) == "ParallelStencil.ParallelKernel.@pk_println args"
+            @test @prettystring(1, @∀ i ∈ (x, y) body) == "ParallelStencil.ParallelKernel.@∀ i ∈ (x, y) body"
+
+            @test @prettystring(1, @warpsize()) == "ParallelStencil.ParallelKernel.@warpsize"
+            @test @prettystring(1, @laneid()) == "ParallelStencil.ParallelKernel.@laneid"
+            @test @prettystring(1, @active_mask()) == "ParallelStencil.ParallelKernel.@active_mask"
+            @test @prettystring(1, @shfl_sync(mask, val, lane)) == "ParallelStencil.ParallelKernel.@shfl_sync mask val lane"
+            @test @prettystring(1, @shfl_sync(mask, val, lane, width)) == "ParallelStencil.ParallelKernel.@shfl_sync mask val lane width"
+            @test @prettystring(1, @shfl_up_sync(mask, val, delta)) == "ParallelStencil.ParallelKernel.@shfl_up_sync mask val delta"
+            @test @prettystring(1, @shfl_up_sync(mask, val, delta, width)) == "ParallelStencil.ParallelKernel.@shfl_up_sync mask val delta width"
+            @test @prettystring(1, @shfl_down_sync(mask, val, delta)) == "ParallelStencil.ParallelKernel.@shfl_down_sync mask val delta"
+            @test @prettystring(1, @shfl_down_sync(mask, val, delta, width)) == "ParallelStencil.ParallelKernel.@shfl_down_sync mask val delta width"
+            @test @prettystring(1, @shfl_xor_sync(mask, val, lanemask)) == "ParallelStencil.ParallelKernel.@shfl_xor_sync mask val lanemask"
+            @test @prettystring(1, @shfl_xor_sync(mask, val, lanemask, width)) == "ParallelStencil.ParallelKernel.@shfl_xor_sync mask val lanemask width"
+            @test @prettystring(1, @vote_any_sync(mask, predicate)) == "ParallelStencil.ParallelKernel.@vote_any_sync mask predicate"
+            @test @prettystring(1, @vote_all_sync(mask, predicate)) == "ParallelStencil.ParallelKernel.@vote_all_sync mask predicate"
+            @test @prettystring(1, @vote_ballot_sync(mask, predicate)) == "ParallelStencil.ParallelKernel.@vote_ballot_sync mask predicate"
+        end
+
+        @testset "CPU semantic smoke tests" begin
+            @static if @iscpu($package)
+                N = 8
+                A = @rand($FloatDefault, N)
+                P = [isfinite(A[i]) && (A[i] > zero($FloatDefault)) for i in 1:N]
+                Bout_any    = Vector{Bool}(undef, N)
+                Bout_all    = Vector{Bool}(undef, N)
+                Bout_ballot = Vector{UInt64}(undef, N)
+                Bshfl       = similar(A)
+                Bshfl_up    = similar(A)
+                Bshfl_down  = similar(A)
+                Bshfl_xor   = similar(A)
+
+                @parallel_indices (ix) function kernel_semantics!(Bout_any, Bout_all, Bout_ballot, Bshfl, Bshfl_up, Bshfl_down, Bshfl_xor, A, P)
+                    m = @active_mask()
+                    w = @warpsize()
+                    l = @laneid()
+                    @test w == 1
+                    @test l == 1
+                    Bshfl[ix]      = @shfl_sync(m, A[ix], l)
+                    Bshfl_up[ix]   = @shfl_up_sync(m, A[ix], 1)
+                    Bshfl_down[ix] = @shfl_down_sync(m, A[ix], 1)
+                    Bshfl_xor[ix]  = @shfl_xor_sync(m, A[ix], 1)
+                    pa = P[ix]
+                    Bout_any[ix]    = @vote_any_sync(m, pa)
+                    Bout_all[ix]    = @vote_all_sync(m, pa)
+                    Bout_ballot[ix] = @vote_ballot_sync(m, pa)
+                    return
+                end
+                @parallel (1:N) kernel_semantics!(Bout_any, Bout_all, Bout_ballot, Bshfl, Bshfl_up, Bshfl_down, Bshfl_xor, A, P)
+
+                @test all(Bshfl .== A)
+                @test all(Bshfl_up .== A)
+                @test all(Bshfl_down .== A)
+                @test all(Bshfl_xor .== A)
+                @test Bout_any == P
+                @test Bout_all == P
+                @test Bout_ballot == map(p -> p ? UInt64(0x1) : UInt64(0x0), P)
+            end
+        end
+
+        @reset_parallel_stencil()
+    end
+    ))
+end == nothing || true;
\ No newline at end of file

From fdee3df04167455d903f42725d023d823c5d83f3 Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Fri, 31 Oct 2025 12:04:42 +0100
Subject: [PATCH 25/27] add parallel stencil kernel language tests

---
 test/test_kernel_language.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_kernel_language.jl b/test/test_kernel_language.jl
index 65ea40cb..e0d090e2 100644
--- a/test/test_kernel_language.jl
+++ b/test/test_kernel_language.jl
@@ -70,7 +70,7 @@ Base.retry_load_extensions()
         @testset "CPU semantic smoke tests" begin
             @static if @iscpu($package)
                 N = 8
-                A = @rand($FloatDefault, N)
+                A = @rand(N)
                 P = [isfinite(A[i]) && (A[i] > zero($FloatDefault)) for i in 1:N]
                 Bout_any    = Vector{Bool}(undef, N)
                 Bout_all    = Vector{Bool}(undef, N)

From da3a60d57e9eee3e68de93dac61706e82275318b Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Mon, 3 Nov 2025 09:15:12 +0100
Subject: [PATCH 26/27] replace allocated with custom macro

---
 test/ParallelKernel/test_kernel_language.jl | 46 ++++++++++++++-------
 1 file changed, 30 insertions(+), 16 deletions(-)

diff --git a/test/ParallelKernel/test_kernel_language.jl b/test/ParallelKernel/test_kernel_language.jl
index ba536109..1517677f 100644
--- a/test/ParallelKernel/test_kernel_language.jl
+++ b/test/ParallelKernel/test_kernel_language.jl
@@ -24,6 +24,20 @@ end
 Base.retry_load_extensions() # Potentially needed to load the extensions after the packages have been filtered.
 
 
+macro expr_allocated(ex)
+    expanded = Base.macroexpand(__module__, ex; recursive=true)
+    quote
+        # Warm-up evaluation to exclude first-call setup allocations
+        let
+            $(esc(expanded))
+        end
+        @allocated begin
+            $(esc(expanded))
+        end
+    end
+end
+
+
 @static for package in TEST_PACKAGES
     FloatDefault = (package == PKG_METAL) ? Float32 : Float64 # Metal does not support Float64
     
@@ -93,7 +107,7 @@ eval(:(
 
                     if $package == $PKG_CUDA
                         @test @prettystring(1, @warpsize()) == "CUDA.warpsize()"
-                        @test @prettystring(1, @laneid())   == "CUDA.laneid()"
+                        @test @prettystring(1, @laneid())   == "CUDA.laneid() + 1"
                         @test @prettystring(1, @active_mask()) == "CUDA.active_mask()"
 
                         @test @prettystring(1, @shfl_sync(mask32, val, lane)) == "CUDA.shfl_sync(mask32, val, lane)"
@@ -164,28 +178,28 @@ eval(:(
                         lanemask  = 1
                         predicate = true
 
-                        @test @allocated(@warpsize())    == 0
-                        @test @allocated(@laneid())      == 0
-                        @test @allocated(@active_mask()) == 0
+                        @test @expr_allocated(@warpsize())    == 0
+                        @test @expr_allocated(@laneid())      == 0
+                        @test @expr_allocated(@active_mask()) == 0
 
-                        @test @allocated(@shfl_sync(mask, valf, lane))            == 0
-                        @test @allocated(@shfl_sync(mask, valf, lane, width))     == 0
-                        @test @allocated(@shfl_up_sync(mask, valf, delta))        == 0
-                        @test @allocated(@shfl_up_sync(mask, valf, delta, width)) == 0
-                        @test @allocated(@shfl_down_sync(mask, valf, delta))      == 0
-                        @test @allocated(@shfl_down_sync(mask, valf, delta, width)) == 0
-                        @test @allocated(@shfl_xor_sync(mask, valf, lanemask))    == 0
-                        @test @allocated(@shfl_xor_sync(mask, valf, lanemask, width)) == 0
+                        @test @expr_allocated(@shfl_sync(mask, valf, lane))            == 0
+                        @test @expr_allocated(@shfl_sync(mask, valf, lane, width))     == 0
+                        @test @expr_allocated(@shfl_up_sync(mask, valf, delta))        == 0
+                        @test @expr_allocated(@shfl_up_sync(mask, valf, delta, width)) == 0
+                        @test @expr_allocated(@shfl_down_sync(mask, valf, delta))      == 0
+                        @test @expr_allocated(@shfl_down_sync(mask, valf, delta, width)) == 0
+                        @test @expr_allocated(@shfl_xor_sync(mask, valf, lanemask))    == 0
+                        @test @expr_allocated(@shfl_xor_sync(mask, valf, lanemask, width)) == 0
 
-                        @test @allocated(@vote_any_sync(mask, predicate))    == 0
-                        @test @allocated(@vote_all_sync(mask, predicate))    == 0
-                        @test @allocated(@vote_ballot_sync(mask, predicate)) == 0
+                        @test @expr_allocated(@vote_any_sync(mask, predicate))    == 0
+                        @test @expr_allocated(@vote_all_sync(mask, predicate))    == 0
+                        @test @expr_allocated(@vote_ballot_sync(mask, predicate)) == 0
                     end
                 end;
                 @testset "Semantic smoke tests" begin
                     @static if @iscpu($package)
                         N = 8
-                        A  = @rand($FloatDefault, N)
+                        A  = @rand(N)
                         P  = [isfinite(A[i]) && (A[i] > zero($FloatDefault)) for i in 1:N]  # simple predicate
                         Bout_any    = Vector{Bool}(undef, N)
                         Bout_all    = Vector{Bool}(undef, N)

From 1d571ce4c37544d4bba574bbbca2ae04100ba25d Mon Sep 17 00:00:00 2001
From: Samuel Omlin <samuel.omlin@cscs.ch>
Date: Mon, 3 Nov 2025 09:18:52 +0100
Subject: [PATCH 27/27] improve test runner

---
 test/runtests.jl | 51 +++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 44 insertions(+), 7 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 3692d64a..9be36612 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -9,13 +9,14 @@ import ParallelStencil: SUPPORTED_PACKAGES, PKG_CUDA, PKG_AMDGPU, PKG_METAL
 
 excludedfiles = [ "test_excluded.jl", "test_incremental_compilation.jl", "test_revise.jl"]; # TODO: test_incremental_compilation has to be deactivated until Polyester support released
 
-function runtests()
+function runtests(testfiles=String[])
     exename   = joinpath(Sys.BINDIR, Base.julia_exename())
     testdir   = pwd()
     istest(f) = endswith(f, ".jl") && startswith(basename(f), "test_")
-    testfiles = sort(filter(istest, vcat([joinpath.(root, files) for (root, dirs, files) in walkdir(testdir)]...)))
+    testfiles = isempty(testfiles) ? sort(filter(istest, vcat([joinpath.(root, files) for (root, dirs, files) in walkdir(testdir)]...))) : testfiles
 
-    nfail = 0
+    nerror = 0
+    errorfiles = String[]
     printstyled("Testing package ParallelStencil.jl\n"; bold=true, color=:white)
 
     if (PKG_CUDA in SUPPORTED_PACKAGES && !CUDA.functional())
@@ -37,13 +38,49 @@ function runtests()
             println("$f")
             continue
         end
+        cmd = `$exename -O3 --startup-file=no $(joinpath(testdir, f))`
+        stdout_path = tempname()
+        stderr_path = tempname()
+        stdout_content = ""
+        stderr_content = ""
         try
-            run(`$exename -O3 --startup-file=no $(joinpath(testdir, f))`)
+            open(stdout_path, "w") do stdout_io
+                open(stderr_path, "w") do stderr_io
+                    proc = run(pipeline(Cmd(cmd; ignorestatus=true), stdout=stdout_io, stderr=stderr_io); wait=false)
+                    wait(proc)
+                end
+            end
+            stdout_content = read(stdout_path, String)
+            stderr_content = read(stderr_path, String)
+            print(stdout_content)
+            print(Base.stderr, stderr_content)
         catch ex
-            nfail += 1
+            println("Test Error: an exception occurred while running the test file $f :")
+            println(ex)
+        finally
+            if ispath(stdout_path)
+                rm(stdout_path; force=true)
+            end
+            if ispath(stderr_path)
+                rm(stderr_path; force=true)
+            end
+        end
+        if !occursin(r"(?i)test summary", stdout_content)
+            nerror += 1
+            push!(errorfiles, f)
+        end
+    end
+    println("")
+    if nerror == 0
+        printstyled("Test suite: all selected test files executed (see above for results).\n"; bold=true, color=:green)
+    else
+        printstyled("Test suite: $nerror test file(s) aborted execution due to error (see above for details); files aborting execution:\n"; bold=true, color=:red)
+        for f in errorfiles
+            println(" - $f")
         end
     end
-    return nfail
+    println("")
+    return nerror
 end
 
-exit(runtests())
+exit(runtests(ARGS))