Modify launch config and indexing for fd operators and pointwise in CUDAext

imreddyTeja · imreddyTeja · commit c01ed8fa3ea2 · 2026-02-02T14:41:52.000-08:00
diff --git a/ext/cuda/data_layouts_copyto.jl b/ext/cuda/data_layouts_copyto.jl
@@ -21,6 +21,20 @@ function knl_copyto_linear!(dest, src, us)
     return nothing
 end
 
+"""
+    knl_copyto_VIJFH_64!(dest, src, ::Val{P})
+
+Kernel for pointwise broadcasts on VIJFHStyle{63,4} and VIJFHStyle{64,4} datalayouts. Each thread copies one
+entry of `src` into `dest`. `P` is a boolean indicating if the column is padded (true for 63, false for 64).
+"""
+function knl_copyto_VIJFH_64!(dest, src, ::Val{P}) where {P}
+    # When the column is padded (P == true), thread 64 has no entry to copy and exits early.
+    P && threadIdx().x == 64 && return nothing
+    I = CartesianIndex(blockIdx().x, blockIdx().y, 1, threadIdx().x, blockIdx().z)
+    @inbounds dest[I] = src[I]
+    return nothing
+end
+
 if VERSION ≥ v"1.11.0-beta"
     # https://github.com/JuliaLang/julia/issues/56295
     # Julia 1.11's Base.Broadcast currently requires
@@ -104,6 +118,44 @@ else
     end
 end
 
+# Specialized kernel launch for VIJFHStyle{63,4} and VIJFHStyle{64,4} arrays. This uses thread and block indices
+# instead of computing cartesian indices from a linear index. The threads are launched so that
+# a set of 64 threads covers a column.
+function Base.copyto!(
+    dest::AbstractData,
+    bc::BC,
+    to::ToCUDA,
+    mask::NoMask = NoMask(),
+) where {BC <: Base.Broadcast.Broadcasted{<:ClimaCore.DataLayouts.VIJFHStyle{63, 4}}}
+    (Ni, Nj, _, Nv, Nh) = DataLayouts.universal_size(dest)
+    Nv > 0 && Nh > 0 || return dest # empty data: nothing to launch (guard copied from above)
+    args = (dest, bc, Val(true)) # Val(true): padded column, so the kernel skips thread 64
+    auto_launch!(
+        knl_copyto_VIJFH_64!,
+        args;
+        threads_s = (64, 1, 1), # 64 threads per block, one block per column
+        blocks_s = (Ni, Nj, Nh),
+    )
+    return dest
+end
+function Base.copyto!(
+    dest::AbstractData,
+    bc::BC,
+    to::ToCUDA,
+    mask::NoMask = NoMask(),
+) where {BC <: Base.Broadcast.Broadcasted{<:ClimaCore.DataLayouts.VIJFHStyle{64, 4}}}
+    (Ni, Nj, _, Nv, Nh) = DataLayouts.universal_size(dest)
+    Nv > 0 && Nh > 0 || return dest # empty data: nothing to launch (guard copied from above)
+    args = (dest, bc, Val(false)) # Val(false): unpadded column, all 64 threads do work
+    auto_launch!(
+        knl_copyto_VIJFH_64!,
+        args;
+        threads_s = (64, 1, 1), # 64 threads per block, one block per column
+        blocks_s = (Ni, Nj, Nh),
+    )
+    return dest
+end
+
 # broadcasting scalar assignment
 # Performance optimization for the common identity scalar case: dest .= val
 # And this is valid for the CPU or GPU, since the broadcasted object
diff --git a/ext/cuda/operators_finite_difference.jl b/ext/cuda/operators_finite_difference.jl
@@ -78,6 +78,25 @@ function Base.copyto!(
         )
     else
         bc′ = disable_shmem_style(bc)
+        (Ni, Nj, _, Nv, Nh) = DataLayouts.universal_size(out_fv)
+        # Specialized kernel launch for the common case. This uses thread and block indices
+        # instead of computing cartesian indices from a linear index.
+        if (Nv == 64 || Nv == 63) && mask isa NoMask && Ni == 4 && Nj == 4 && Nh >= 1500
+            args = (
+                strip_space(out, space),
+                strip_space(bc′, space),
+                axes(out),
+                bounds,
+                Val(Nv == 63),
+            )
+            auto_launch!(
+                copyto_stencil_kernel_64!,
+                args;
+                threads_s = (64, 1, 1),
+                blocks_s = (Ni, Nj, Nh),
+            )
+            return out
+        end
         @assert !any_fd_shmem_style(bc′)
         cart_inds = if mask isa NoMask
             cartesian_indices(us)
@@ -102,7 +121,6 @@ function Base.copyto!(
         else
             masked_partition(mask, n_max_threads, us)
         end
-
         auto_launch!(
             copyto_stencil_kernel!,
             args;
@@ -115,6 +133,47 @@ function Base.copyto!(
 end
 import ClimaCore.DataLayouts: get_N, get_Nv, get_Nij, get_Nij, get_Nh
 
+"""
+    copyto_stencil_kernel_64!(
+        out,
+        bc::Union{
+            StencilBroadcasted{CUDAColumnStencilStyle},
+            Broadcasted{CUDAColumnStencilStyle},
+        },
+        space,
+        bds,
+        ::Val{P},
+    )
+
+Kernel for fd operators on VIJFHStyle{63,4} and VIJFHStyle{64,4} datalayouts. Each thread evaluates `bc` at one
+index and stores it in `out`. `P` is a boolean indicating if the column is padded (true for 63, false for 64).
+"""
+function copyto_stencil_kernel_64!(
+    out,
+    bc::Union{
+        StencilBroadcasted{CUDAColumnStencilStyle},
+        Broadcasted{CUDAColumnStencilStyle},
+    },
+    space,
+    bds,
+    ::Val{P},
+) where {P}
+    @inbounds begin
+        # When the column is padded (P == true), thread 64 has no entry to compute and exits early.
+        P && threadIdx().x == 64 && return nothing
+        i = blockIdx().x
+        j = blockIdx().y
+        v = threadIdx().x # position of this thread within the column
+        h = blockIdx().z
+        hidx = (i, j, h) # the column is selected by the three block indices
+        (li, lw, rw, ri) = bds # only `li` is used below
+        idx = v - 1 + li # thread 1 maps to `li`
+        val = Operators.getidx(space, bc, idx, hidx)
+        setidx!(space, out, idx, hidx, val)
+    end
+    return nothing
+end
+
 function copyto_stencil_kernel!(
     out,
     bc::Union{