Skip to content

Commit c01ed8f

Browse files
committed
Modify launch config and indexing for fd operators and pointwise in CUDAext
1 parent ad541e0 commit c01ed8f

File tree

2 files changed

+112
-1
lines changed

2 files changed

+112
-1
lines changed

ext/cuda/data_layouts_copyto.jl

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,20 @@ function knl_copyto_linear!(dest, src, us)
2121
return nothing
2222
end
2323

24+
"""
25+
knl_copyto_VIJFH_64!(dest, src, ::Val{P})
26+
27+
Kernel for pointwise broadcasts on VIJFHStyle{63,4} and VIJFHStyle{64,4} datalayouts. P is a boolean
28+
indicating if the column is padded (true for 63, false for 64).
29+
"""
30+
function knl_copyto_VIJFH_64!(dest, src, ::Val{P}) where {P}
    # One thread handles one point: the thread index along x selects the
    # fourth slot of the index, and the block indices select the rest.
    level = threadIdx().x

    # `P == true` marks a padded column (63 entries launched with 64 threads),
    # so the 64th thread exits without touching memory.
    if P && level == 64
        return nothing
    end

    # Build the index straight from the launch configuration instead of
    # decoding it from a linear index; the third slot is fixed at 1.
    point = CartesianIndex(blockIdx().x, blockIdx().y, 1, level, blockIdx().z)
    @inbounds dest[point] = src[point]
    return nothing
end
37+
2438
if VERSION ≥ v"1.11.0-beta"
2539
# https://github.com/JuliaLang/julia/issues/56295
2640
# Julia 1.11's Base.Broadcast currently requires
@@ -104,6 +118,44 @@ else
104118
end
105119
end
106120

121+
# Specialized kernel launch for VIJFHStyle{63,4} and VIJFHStyle{64,4} arrays. This uses block and grid indices
122+
# instead of computing cartesian indices from a linear index. The threads are launched so that
123+
# a set of 64 threads covers a column.
124+
function Base.copyto!(
    dest::AbstractData,
    bc::BC,
    to::ToCUDA,
    mask::NoMask = NoMask(),
) where {BC <: Base.Broadcast.Broadcasted{<:ClimaCore.DataLayouts.VIJFHStyle{63, 4}}}
    # `universal_size` yields (Ni, Nj, _, Nv, Nh); the third entry is not needed here.
    (Ni, Nj, _, Nv, Nh) = DataLayouts.universal_size(dest)

    # Nothing to launch when either extent is non-positive.
    if !(Nv > 0 && Nh > 0)
        return dest
    end

    # 64 threads per block, one block per (i, j, h) triple. `Val(true)` tells the
    # kernel that the column is padded (63 entries), so its 64th thread is idle.
    auto_launch!(
        knl_copyto_VIJFH_64!,
        (dest, bc, Val(true));
        threads_s = (64, 1, 1),
        blocks_s = (Ni, Nj, Nh),
    )
    return dest
end
141+
function Base.copyto!(
    dest::AbstractData,
    bc::BC,
    to::ToCUDA,
    mask::NoMask = NoMask(),
) where {BC <: Base.Broadcast.Broadcasted{<:ClimaCore.DataLayouts.VIJFHStyle{64, 4}}}
    # `universal_size` yields (Ni, Nj, _, Nv, Nh); the third entry is not needed here.
    (Ni, Nj, _, Nv, Nh) = DataLayouts.universal_size(dest)

    # Nothing to launch when either extent is non-positive.
    if !(Nv > 0 && Nh > 0)
        return dest
    end

    # 64 threads per block, one block per (i, j, h) triple. `Val(false)` tells the
    # kernel that the column is not padded, so all 64 threads do work.
    auto_launch!(
        knl_copyto_VIJFH_64!,
        (dest, bc, Val(false));
        threads_s = (64, 1, 1),
        blocks_s = (Ni, Nj, Nh),
    )
    return dest
end
158+
107159
# broadcasting scalar assignment
108160
# Performance optimization for the common identity scalar case: dest .= val
109161
# And this is valid for the CPU or GPU, since the broadcasted object

ext/cuda/operators_finite_difference.jl

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,25 @@ function Base.copyto!(
7878
)
7979
else
8080
bc′ = disable_shmem_style(bc)
81+
(Ni, Nj, _, Nv, Nh) = DataLayouts.universal_size(out_fv)
82+
# Specialized kernel launch for common case. This uses block and grid indices
83+
# instead of computing cartesian indices from a linear index
84+
if (Nv == 64 || Nv == 63) && mask isa NoMask && Ni == 4 && Nj == 4 && Nh >= 1500
85+
args = (
86+
strip_space(out, space),
87+
strip_space(bc′, space),
88+
axes(out),
89+
bounds,
90+
Val(Nv == 63),
91+
)
92+
auto_launch!(
93+
copyto_stencil_kernel_64!,
94+
args;
95+
threads_s = (64, 1, 1),
96+
blocks_s = (Ni, Nj, Nh),
97+
)
98+
return out
99+
end
81100
@assert !any_fd_shmem_style(bc′)
82101
cart_inds = if mask isa NoMask
83102
cartesian_indices(us)
@@ -102,7 +121,6 @@ function Base.copyto!(
102121
else
103122
masked_partition(mask, n_max_threads, us)
104123
end
105-
106124
auto_launch!(
107125
copyto_stencil_kernel!,
108126
args;
@@ -115,6 +133,47 @@ function Base.copyto!(
115133
end
116134
import ClimaCore.DataLayouts: get_N, get_Nv, get_Nij, get_Nij, get_Nh
117135

136+
"""
137+
copyto_stencil_kernel_64!(
138+
out,
139+
bc::Union{
140+
StencilBroadcasted{CUDAColumnStencilStyle},
141+
Broadcasted{CUDAColumnStencilStyle},
142+
},
143+
space,
144+
bds,
145+
::Val{P},
146+
)
147+
148+
Kernel for fd operators on VIJFHStyle{63,4} and VIJFHStyle{64,4} datalayouts. P is a boolean
149+
indicating if the column is padded (true for 63, false for 64).
150+
"""
151+
function copyto_stencil_kernel_64!(
    out,
    bc::Union{
        StencilBroadcasted{CUDAColumnStencilStyle},
        Broadcasted{CUDAColumnStencilStyle},
    },
    space,
    bds,
    ::Val{P},
) where {P}
    # The thread index along x selects the position within the column.
    v = threadIdx().x

    # `P == true` marks a padded column (63 entries launched with 64 threads),
    # so the 64th thread has no point to compute.
    if P && v == 64
        return nothing
    end

    @inbounds begin
        # The block indices supply the (i, j, h) triple passed to getidx/setidx!.
        hidx = (blockIdx().x, blockIdx().y, blockIdx().z)

        # Only the first of the four bounds is needed: it is the offset that maps
        # the 1-based thread index onto the index passed to getidx/setidx!.
        (li, _, _, _) = bds
        idx = v - 1 + li

        val = Operators.getidx(space, bc, idx, hidx)
        setidx!(space, out, idx, hidx, val)
    end
    return nothing
end
176+
118177
function copyto_stencil_kernel!(
119178
out,
120179
bc::Union{

0 commit comments

Comments
 (0)