1+ import ClimaCore. DataLayouts:
2+ to_non_extruded_broadcasted, has_uniform_datalayouts
# Select the `ToCUDA` dispatch tag whenever the argument is a `CUDA.CuArray`.
function DataLayouts._device_dispatch(x::CUDA.CuArray)
    return ToCUDA()
end
24
# CUDA kernel body: each thread copies at most one entry of `src` into `dest`.
#
# The entry is addressed by a 5-component `CartesianIndex` built from the
# thread/block coordinates:
#   1st, 2nd component: thread x / y index
#   3rd component:      always 1
#   4th component:      blockDim.z * (blockIdx.y - 1) + threadIdx.z
#   5th component:      block x index
# Threads whose 4th component exceeds `size(dest, 4)` do nothing.
function knl_copyto!(dest, src)
    tid = CUDA.threadIdx()
    bid = CUDA.blockIdx()

    # Position along dimension 4, combining the block's y offset with the
    # thread's z index inside the block.
    v = CUDA.blockDim().z * (bid.y - 1) + tid.z
    v > size(dest, 4) && return nothing

    idx = CartesianIndex((tid.x, tid.y, 1, v, bid.x))
    @inbounds dest[idx] = src[idx]
    return nothing
end
17-
# Copy `bc` into the `IJFH` layout `dest` on the `ToCUDA` device.
#
# Launches `knl_copyto!` through `auto_launch!` with `Nij × Nij` threads per
# block and `Nh × 1` blocks. Nothing is launched when `Nh == 0`.
# Returns `dest`.
function Base.copyto!(
    dest::IJFH{S, Nij, Nh},
    bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh},
    ::ToCUDA,
) where {S, Nij, Nh}
    Nh > 0 || return dest
    threads = (Nij, Nij)
    blocks = (Nh, 1)
    auto_launch!(
        knl_copyto!,
        (dest, bc),
        dest;
        threads_s = threads,
        blocks_s = blocks,
    )
    return dest
end
34-
# Copy `bc` into the `VIJFH` layout `dest` on the `ToCUDA` device.
#
# Launches `knl_copyto!` through `auto_launch!` with
# `Nij × Nij × Nv_per_block` threads per block and `Nh × Nv_blocks` blocks.
# Nothing is launched when `Nv == 0` or `Nh == 0`. Returns `dest`.
function Base.copyto!(
    dest::VIJFH{S, Nv, Nij, Nh},
    bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh},
    ::ToCUDA,
) where {S, Nv, Nij, Nh}
    if Nv > 0 && Nh > 0
        # Fit as many of the `Nv` indices into one block as a 256-thread
        # budget allows, but never fewer than one: `fld(256, Nij * Nij)` is 0
        # once `Nij > 16`, and a zero here would make the `cld` below throw a
        # `DivideError`.
        Nv_per_block = min(Nv, max(1, fld(256, Nij * Nij)))
        Nv_blocks = cld(Nv, Nv_per_block)
        auto_launch!(
            knl_copyto!,
            (dest, bc),
            dest;
            threads_s = (Nij, Nij, Nv_per_block),
            blocks_s = (Nh, Nv_blocks),
        )
    end
    return dest
end
53-
# Copy `bc` into the `VF` layout `dest` on the `ToCUDA` device.
#
# Launches `knl_copyto!` through `auto_launch!` with a single thread per block
# and `1 × Nv` blocks. Nothing is launched when `Nv == 0`. Returns `dest`.
function Base.copyto!(
    dest::VF{S, Nv},
    bc::DataLayouts.BroadcastedUnionVF{S, Nv},
    ::ToCUDA,
) where {S, Nv}
    Nv > 0 || return dest
    threads = (1, 1)
    blocks = (1, Nv)
    auto_launch!(
        knl_copyto!,
        (dest, bc),
        dest;
        threads_s = threads,
        blocks_s = blocks,
    )
    return dest
end
70-
# Copy `bc` into the `DataF` layout `dest` on the `ToCUDA` device.
#
# Always launches `knl_copyto!` through `auto_launch!` with one thread in one
# block. Returns `dest`.
function Base.copyto!(
    dest::DataF{S},
    bc::DataLayouts.BroadcastedUnionDataF{S},
    ::ToCUDA,
) where {S}
    single = (1, 1)
    auto_launch!(
        knl_copyto!,
        (dest, bc),
        dest;
        threads_s = single,
        blocks_s = single,
    )
    return dest
end
85-
865import ClimaCore. DataLayouts: isascalar
87- function knl_copyto_flat !(dest:: AbstractData , bc, us)
6+ function knl_copyto_cart !(dest:: AbstractData , bc, us)
887 @inbounds begin
898 tidx = thread_index()
909 if tidx ≤ get_N(us)
@@ -96,24 +15,43 @@ function knl_copyto_flat!(dest::AbstractData, bc, us)
9615 return nothing
9716end
9817
# Kernel body: copy one entry of `bc` into `dest`, using the value returned by
# `thread_index()` as the index into both. Threads whose index is greater than
# `get_N(us)` do nothing.
function knl_copyto_linear!(dest::AbstractData, bc, us)
    @inbounds begin
        i = thread_index()
        # Surplus threads beyond the element count have no work.
        i > get_N(us) && return nothing
        dest[i] = bc[i]
    end
    return nothing
end
27+
# Kernel body for `DataF` destinations: write `bc[tidx]` into the single slot
# `dest[]`.
#
# `tidx` is obtained from `thread_index()` inside this method (it is not
# available from any enclosing scope), and the write is guarded by
# `get_N(us)` exactly like the generic `AbstractData` method, so threads beyond
# the element count do not touch `dest`.
# NOTE(review): presumably `get_N(us) == 1` for a `DataF`, so only one thread
# writes — confirm against `UniversalSize`.
function knl_copyto_linear!(dest::DataF, bc, us)
    @inbounds begin
        tidx = thread_index()
        if tidx ≤ get_N(us)
            dest[] = bc[tidx]
        end
    end
    return nothing
end
32+
# Copy `bc` into `dest` with a kernel whose launch configuration is chosen by
# `auto_launch!` (`auto = true`).
#
# - Returns `dest` immediately, without launching anything, when either the
#   `Nv` or the `Nh` component of `DataLayouts.universal_size(dest)` is zero.
# - When `has_uniform_datalayouts(bc)` holds, `bc` is converted with
#   `to_non_extruded_broadcasted` and copied by `knl_copyto_linear!`.
# - Otherwise `bc` is passed unchanged to `knl_copyto_cart!`.
#
# Always returns `dest`.
function cuda_copyto!(dest::AbstractData, bc)
    (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
    (Nv > 0 && Nh > 0) || return dest
    us = DataLayouts.UniversalSize(dest)
    if has_uniform_datalayouts(bc)
        bc′ = to_non_extruded_broadcasted(bc)
        auto_launch!(knl_copyto_linear!, (dest, bc′, us), dest; auto = true)
    else
        auto_launch!(knl_copyto_cart!, (dest, bc, us), dest; auto = true)
    end
    return dest
end
10745
# TODO: can we use CUDA's launch configuration for all data layouts?
# Currently, it seems to have a slight performance degradation.
11048# ! format: off
# `copyto!` on the `ToCUDA` device for each data layout below simply forwards
# to `cuda_copyto!(dest, bc)`.
Base.copyto!(
    dest::IJFH{S, Nij},
    bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh},
    ::ToCUDA,
) where {S, Nij, Nh} = cuda_copyto!(dest, bc)

Base.copyto!(
    dest::IFH{S, Ni, Nh},
    bc::DataLayouts.BroadcastedUnionIFH{S, Ni, Nh},
    ::ToCUDA,
) where {S, Ni, Nh} = cuda_copyto!(dest, bc)

Base.copyto!(
    dest::IJF{S, Nij},
    bc::DataLayouts.BroadcastedUnionIJF{S, Nij},
    ::ToCUDA,
) where {S, Nij} = cuda_copyto!(dest, bc)

Base.copyto!(
    dest::IF{S, Ni},
    bc::DataLayouts.BroadcastedUnionIF{S, Ni},
    ::ToCUDA,
) where {S, Ni} = cuda_copyto!(dest, bc)

Base.copyto!(
    dest::VIFH{S, Nv, Ni, Nh},
    bc::DataLayouts.BroadcastedUnionVIFH{S, Nv, Ni, Nh},
    ::ToCUDA,
) where {S, Nv, Ni, Nh} = cuda_copyto!(dest, bc)

Base.copyto!(
    dest::VIJFH{S, Nv, Nij, Nh},
    bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh},
    ::ToCUDA,
) where {S, Nv, Nij, Nh} = cuda_copyto!(dest, bc)

Base.copyto!(
    dest::VF{S, Nv},
    bc::DataLayouts.BroadcastedUnionVF{S, Nv},
    ::ToCUDA,
) where {S, Nv} = cuda_copyto!(dest, bc)

Base.copyto!(
    dest::DataF{S},
    bc::DataLayouts.BroadcastedUnionDataF{S},
    ::ToCUDA,
) where {S} = cuda_copyto!(dest, bc)
11957# ! format: on
0 commit comments