Implement a columnwise shmem operator (#2328)

charleskawczynski · web-flow · commit 507ed4e7669a · 2025-05-30T17:07:42.000-04:00
diff --git a/.buildkite/Manifest.toml b/.buildkite/Manifest.toml
@@ -2,7 +2,7 @@
 
 julia_version = "1.10.9"
 manifest_format = "2.0"
-project_hash = "6ab89829ea190189b0319a6f8e22b3515e5283c2"
+project_hash = "45a11f30c749324ab2ca9eb06366eb279b21cfa8"
 
 [[deps.ADTypes]]
 git-tree-sha1 = "e2478490447631aedba0823d4d7a80b2cc8cdb32"
@@ -331,7 +331,7 @@ weakdeps = ["CUDA", "MPI"]
 deps = ["Adapt", "BandedMatrices", "BlockArrays", "ClimaComms", "CubedSphere", "DataStructures", "ForwardDiff", "GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "KrylovKit", "LazyBroadcast", "LinearAlgebra", "MultiBroadcastFusion", "NVTX", "PkgVersion", "RecursiveArrayTools", "RootSolvers", "SparseArrays", "StaticArrays", "Statistics", "UnrolledUtilities"]
 path = ".."
 uuid = "d414da3d-4745-48bb-8d80-42e94e092884"
-version = "0.14.29"
+version = "0.14.33"
 weakdeps = ["CUDA", "Krylov"]
 
     [deps.ClimaCore.extensions]
@@ -356,6 +356,12 @@ path = "../lib/ClimaCoreVTK"
 uuid = "c8b6d40d-e815-466f-95ae-c48aefa668fa"
 version = "0.7.6"
 
+[[deps.ClimaParams]]
+deps = ["TOML"]
+git-tree-sha1 = "acf6c80c7ad59fe9dac9cc49625d52f4b8e1f4b7"
+uuid = "5c42b081-d73a-476f-9059-fd94b934656c"
+version = "0.10.30"
+
 [[deps.ClimaTimeSteppers]]
 deps = ["ClimaComms", "Colors", "DataStructures", "DiffEqBase", "KernelAbstractions", "Krylov", "LinearAlgebra", "LinearOperators", "NVTX", "SciMLBase", "StaticArrays"]
 git-tree-sha1 = "e719705cf15fec895abcb547946131ffe83de4d7"
@@ -1516,6 +1522,11 @@ version = "400.902.209+0"
 uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
 version = "1.2.0"
 
+[[deps.NullBroadcasts]]
+git-tree-sha1 = "343c7bb67d0a29ea5d7d2b3e945afe81e2862337"
+uuid = "0d71be07-595a-4f89-9529-4065a4ab43a6"
+version = "0.1.0"
+
 [[deps.OffsetArrays]]
 git-tree-sha1 = "a414039192a155fb38c4599a60110f0018c6ec82"
 uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
@@ -2186,6 +2197,16 @@ git-tree-sha1 = "43044b737fa70bc12f6105061d3da38f881a3e3c"
 uuid = "b718987f-49a8-5099-9789-dcd902bef87d"
 version = "1.0.2"
 
+[[deps.Thermodynamics]]
+deps = ["DocStringExtensions", "KernelAbstractions", "Random", "RootSolvers"]
+git-tree-sha1 = "efe74e0344fd7fb68b831316055290d80a62d9c1"
+uuid = "b60c26fb-14c3-4610-9d3e-2d17fe7ff00c"
+version = "0.12.11"
+weakdeps = ["ClimaParams"]
+
+    [deps.Thermodynamics.extensions]
+    CreateParametersExt = "ClimaParams"
+
 [[deps.ThreadingUtilities]]
 deps = ["ManualMemory"]
 git-tree-sha1 = "eda08f7e9818eb53661b3deb74e3159460dfbc27"
diff --git a/.buildkite/Project.toml b/.buildkite/Project.toml
@@ -12,6 +12,7 @@ ClimaCore = "d414da3d-4745-48bb-8d80-42e94e092884"
 ClimaCorePlots = "cf7c7e5a-b407-4c48-9047-11a94a308626"
 ClimaCoreTempestRemap = "d934ef94-cdd4-4710-83d6-720549644b70"
 ClimaCoreVTK = "c8b6d40d-e815-466f-95ae-c48aefa668fa"
+ClimaParams = "5c42b081-d73a-476f-9059-fd94b934656c"
 ClimaTimeSteppers = "595c0a79-7f3d-439a-bc5a-b232dc3bde79"
 Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
 CountFlops = "1db9610d-79e1-487a-8d40-77f3295c7593"
@@ -32,6 +33,7 @@ Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
 NCDatasets = "85f8d34a-cbdd-5861-8df4-14fed0d494ab"
 NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
+NullBroadcasts = "0d71be07-595a-4f89-9529-4065a4ab43a6"
 OrdinaryDiffEqSSPRK = "669c94d9-1f4b-4b64-b377-1aa079aa2388"
 OrdinaryDiffEqTsit5 = "b1df2697-797e-41e3-8120-5422d3b24e4a"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
@@ -52,6 +54,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 TerminalLoggers = "5d786b92-1e48-4d6f-9151-6b4477ca9bed"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+Thermodynamics = "b60c26fb-14c3-4610-9d3e-2d17fe7ff00c"
 ThreadsX = "ac1d9e8a-700a-412c-b207-f0111f4b6c0d"
 
 [compat]
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -624,6 +624,19 @@ steps:
         agents:
           slurm_gpus: 1
 
+      - label: "Unit: gpu columnwise"
+        key: unit_gpu_columnwise
+        command:
+          - "julia --color=yes --check-bounds=yes --project=.buildkite test/Operators/finitedifference/unit_columnwise.jl"
+        env:
+          CLIMACOMMS_DEVICE: "CUDA"
+        agents:
+          slurm_gpus: 1
+
+      - label: "Unit: columnwise"
+        key: unit_columnwise
+        command: "julia --color=yes --check-bounds=yes --project=.buildkite test/Operators/finitedifference/unit_columnwise.jl"
+
       - label: "Unit: column"
         key: unit_column
         command:
diff --git a/ext/ClimaCoreCUDAExt.jl b/ext/ClimaCoreCUDAExt.jl
@@ -35,6 +35,7 @@ include(joinpath("cuda", "limiters.jl"))
 include(joinpath("cuda", "operators_sem_shmem.jl"))
 include(joinpath("cuda", "operators_fd_shmem_common.jl"))
 include(joinpath("cuda", "operators_fd_shmem.jl"))
+include(joinpath("cuda", "operators_columnwise.jl"))
 include(joinpath("cuda", "matrix_fields_single_field_solve.jl"))
 include(joinpath("cuda", "matrix_fields_multiple_field_solve.jl"))
 include(joinpath("cuda", "operators_spectral_element.jl"))
diff --git a/ext/cuda/data_layouts.jl b/ext/cuda/data_layouts.jl
@@ -15,12 +15,43 @@ import CUDA
 parent_array_type(::Type{<:CUDA.CuArray{T, N, B} where {N}}) where {T, B} =
     CUDA.CuArray{T, N, B} where {N}
 
+# Device-side counterpart: lets lazy broadcast objects be used inside kernels.
+parent_array_type(
+    ::Type{<:CUDA.CuDeviceArray{T, N, A} where {N}},
+) where {T, A} = CUDA.CuDeviceArray{T, N, A} where {N}
+
 # Ensure that both parent array types have the same memory buffer type.
 promote_parent_array_type(
     ::Type{CUDA.CuArray{T1, N, B} where {N}},
     ::Type{CUDA.CuArray{T2, N, B} where {N}},
 ) where {T1, T2, B} = CUDA.CuArray{promote_type(T1, T2), N, B} where {N}
 
+# allow on-device use of lazy broadcast objects (same address space `B`)
+promote_parent_array_type(
+    ::Type{CUDA.CuDeviceArray{T1, N, B} where {N}},
+    ::Type{CUDA.CuDeviceArray{T2, N, B} where {N}},
+) where {T1, T2, B} = CUDA.CuDeviceArray{promote_type(T1, T2), N, B} where {N}
+
+# Address spaces differ: erase `B` too; `CuDeviceArray{T}` covers all `N, B`.
+promote_parent_array_type(
+    ::Type{CUDA.CuDeviceArray{T1, N, B1} where {N}},
+    ::Type{CUDA.CuDeviceArray{T2, N, B2} where {N}},
+) where {T1, T2, B1, B2} = CUDA.CuDeviceArray{promote_type(T1, T2)}
+
+# An operand was already erased by an earlier promotion (either or both sides).
+promote_parent_array_type(
+    ::Type{CUDA.CuDeviceArray{T1}},
+    ::Type{CUDA.CuDeviceArray{T2, N, B2} where {N}},
+) where {T1, T2, B2} = CUDA.CuDeviceArray{promote_type(T1, T2)}
+promote_parent_array_type(
+    ::Type{CUDA.CuDeviceArray{T1, N, B1} where {N}},
+    ::Type{CUDA.CuDeviceArray{T2}},
+) where {T1, T2, B1} = CUDA.CuDeviceArray{promote_type(T1, T2)}
+promote_parent_array_type(
+    ::Type{CUDA.CuDeviceArray{T1}},
+    ::Type{CUDA.CuDeviceArray{T2}},
+) where {T1, T2} = CUDA.CuDeviceArray{promote_type(T1, T2)}
+
 # Make `similar` accept our special `UnionAll` parent array type for CuArray.
 Base.similar(
     ::Type{CUDA.CuArray{T, N′, B} where {N′}},
diff --git a/ext/cuda/operators_columnwise.jl b/ext/cuda/operators_columnwise.jl
@@ -0,0 +1,85 @@
+import ClimaCore.Operators:
+    columnwise!,
+    device_sync_threads,
+    columnwise_kernel!,
+    universal_index_columnwise,
+    local_mem
+
+# CUDA implementations of the device hooks declared in `ClimaCore.Operators`:
+# a thread-block barrier, and a statically sized shared-memory array whose
+# element type and dimensions are lifted to the type domain (`Type{T}`, `Val`).
+device_sync_threads(::ClimaComms.CUDADevice) = CUDA.sync_threads()
+
+local_mem(::ClimaComms.CUDADevice, ::Type{T}, ::Val{dims}) where {T, dims} =
+    CUDA.CuStaticSharedArray(T, dims)
+
+# Compile and launch `columnwise_kernel!` on a CUDA device.
+#
+# Launch shape: `threads = (ᶠNv,)` and `blocks = (Nh, Ni * Nj)`, i.e. one thread
+# per face level and one block per `(h, i, j)` column. This is the layout that
+# `universal_index_columnwise` decodes back into an `(i, j, 1, v, h)` index.
+#
+# All positional arguments are forwarded to the kernel unchanged. The two
+# trailing `Val`s default to `Val(true)`; `Val(localmem_lg)` and
+# `Val(localmem_state)` are rebuilt from the type parameters for the kernel.
+# NOTE(review): the meaning of the `nothing` slot is defined by
+# `columnwise_kernel!`, which is not part of this file.
+function columnwise!(
+    device::ClimaComms.CUDADevice,
+    ᶜf::ᶜF,
+    ᶠf::ᶠF,
+    ᶜYₜ::Fields.Field,
+    ᶠYₜ::Fields.Field,
+    ᶜY::Fields.Field,
+    ᶠY::Fields.Field,
+    p,
+    t,
+    ::Val{localmem_lg} = Val(true),
+    ::Val{localmem_state} = Val(true),
+) where {ᶜF, ᶠF, localmem_lg, localmem_state}
+    ᶜspace = axes(ᶜY)
+    ᶠNv = Spaces.nlevels(Spaces.face_space(ᶜspace))
+    # Horizontal extents are read from the center coordinate data.
+    ᶜcoords = Fields.field_values(Fields.coordinate_field(ᶜspace))
+    us = DataLayouts.UniversalSize(ᶜcoords)
+    (Ni, Nj, _, _, Nh) = DataLayouts.universal_size(us)
+    # Build the argument list once so the compiled signature and the launch
+    # call cannot drift apart.
+    args = (
+        device,
+        ᶜf,
+        ᶠf,
+        ᶜYₜ,
+        ᶠYₜ,
+        ᶜY,
+        ᶠY,
+        p,
+        t,
+        nothing,
+        Val(localmem_lg),
+        Val(localmem_state),
+    )
+    # `launch = false` only compiles; the launch happens below.
+    kernel = CUDA.@cuda(
+        always_inline = true,
+        launch = false,
+        columnwise_kernel!(args...)
+    )
+    # One thread per face level; one block per column.
+    threads = (ᶠNv,)
+    blocks = (Nh, Ni * Nj)
+    return kernel(args...; threads, blocks)
+end
+
+# Map the CUDA launch coordinates to a column index `(i, j, 1, v, h)`:
+# thread x -> level `v`, block x -> `h`, block y -> linear index into `Ni × Nj`.
+# A block y beyond `Ni * Nj` yields the sentinel index `(-1, -1, 1, -1, -1)`.
+@inline function universal_index_columnwise(::ClimaComms.CUDADevice, UI, us)
+    v = CUDA.threadIdx().x
+    block = CUDA.blockIdx()
+    (Ni, Nj, _, _, _) = DataLayouts.universal_size(us)
+    ij = block.y
+    ij > Ni * Nj && return CartesianIndex((-1, -1, 1, -1, -1))
+    @inbounds (i, j) = CartesianIndices((Ni, Nj))[ij].I
+    return CartesianIndex((i, j, 1, v, block.x))
+end
diff --git a/ext/cuda/operators_finite_difference.jl b/ext/cuda/operators_finite_difference.jl
@@ -11,9 +11,8 @@ import ClimaCore.Operators: LeftBoundaryWindow, RightBoundaryWindow, Interior
 
 struct CUDAColumnStencilStyle <: AbstractStencilStyle end
 struct CUDAWithShmemColumnStencilStyle <: AbstractStencilStyle end
-AbstractStencilStyle(bc, ::ClimaComms.CUDADevice) =
-    Operators.any_fd_shmem_supported(bc) ? CUDAWithShmemColumnStencilStyle :
-    CUDAColumnStencilStyle
+
+AbstractStencilStyle(bc, ::ClimaComms.CUDADevice) = CUDAColumnStencilStyle
 
 Base.Broadcast.BroadcastStyle(
     x::Operators.ColumnStencilStyle,
@@ -150,10 +149,7 @@ end
 
 function copyto_stencil_kernel_shmem!(
     out,
-    bc′::Union{
-        StencilBroadcasted{CUDAWithShmemColumnStencilStyle},
-        Broadcasted{CUDAWithShmemColumnStencilStyle},
-    },
+    bc′::Union{StencilBroadcasted, Broadcasted},
     space,
     bds,
     us,
diff --git a/src/DataLayouts/struct.jl b/src/DataLayouts/struct.jl
@@ -116,6 +116,19 @@ promote_parent_array_type(
     ::Type{Array{T1}},
     ::Type{MArray{S, T2}},
 ) where {S, T1, T2} = MArray{S, promote_type(T1, T2)}
+# Size-mismatched MArrays promote to `MArray{S, T} where {S}` (size is unused).
+promote_parent_array_type(
+    ::Type{MArray{S1, T1}},
+    ::Type{MArray{S2, T2}},
+) where {S1, T1, S2, T2} = MArray{S, promote_type(T1, T2)} where {S}
+promote_parent_array_type(
+    ::Type{MArray{S1, T1} where {S1}},
+    ::Type{MArray{S2, T2}},
+) where {T1, S2, T2} = MArray{S, promote_type(T1, T2)} where {S}
+promote_parent_array_type(
+    ::Type{MArray{S1, T1}},
+    ::Type{MArray{S2, T2} where {S2}},
+) where {S1, T1, T2} = MArray{S, promote_type(T1, T2)} where {S}
 
 """
     StructArrays.bypass_constructor(T, args)
diff --git a/src/Fields/Fields.jl b/src/Fields/Fields.jl
@@ -369,6 +369,9 @@ local_geometry_field(space::AbstractSpace) =
     Field(Spaces.local_geometry_data(space), space)
 local_geometry_field(field::Field) = local_geometry_field(axes(field))
 
+local_geometry_field(bc::Base.Broadcast.Broadcasted) =
+    local_geometry_field(axes(bc)) # defer to the method for `axes(bc)`
+
 """
     Δz_field(field::Field)
     Δz_field(space::AbstractSpace)
diff --git a/src/Grids/extruded.jl b/src/Grids/extruded.jl
@@ -157,6 +157,10 @@ struct DeviceExtrudedFiniteDifferenceGrid{VT, Q, GG, CLG, FLG} <:
     face_local_geometry::FLG
 end
 
+# Make `ClimaComms.device` callable on-device: forward to the vertical topology.
+ClimaComms.device(grid::DeviceExtrudedFiniteDifferenceGrid) =
+    ClimaComms.device(vertical_topology(grid))
+
 local_geometry_type(
     ::Type{DeviceExtrudedFiniteDifferenceGrid{VT, Q, GG, CLG, FLG}},
 ) where {VT, Q, GG, CLG, FLG} = eltype(CLG) # calls eltype from DataLayouts
diff --git a/src/Operators/Operators.jl b/src/Operators/Operators.jl
@@ -26,5 +26,6 @@ include("numericalflux.jl")
 include("finitedifference.jl")
 include("remapping.jl")
 include("integrals.jl")
+include("columnwise.jl")
 
 end # module
diff --git a/src/Operators/columnwise.jl b/src/Operators/columnwise.jl
diff --git a/test/Operators/finitedifference/unit_columnwise.jl b/test/Operators/finitedifference/unit_columnwise.jl