add SHTnsCUDAExt and working wrappers

fgerick · fgerick · commit 7158ae8819b1 · 2024-11-17T15:29:47.000Z
diff --git a/Project.toml b/Project.toml
@@ -6,6 +6,12 @@ version = "0.2.0"
 [deps]
 SHTns_jll = "daf09cc5-9ab3-509e-9618-0b89086eb825"
 
+[weakdeps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+
+[extensions]
+SHTnsCUDAExt = "CUDA"
+
 [compat]
 julia = "1.6"
 
diff --git a/ext/SHTnsCUDAExt/SHTnsCUDAExt.jl b/ext/SHTnsCUDAExt/SHTnsCUDAExt.jl
@@ -0,0 +1,15 @@
+# Package extension that adds CuArray methods for `synth`, `synth!`, `analys`
+# and `analys!` on top of SHTns. `sht.jl` holds the `ccall` wrappers; `synth.jl`
+# and `analys.jl` hold the methods built on them.
+module SHTnsCUDAExt
+
+using CUDA
+using SHTns
+
+import SHTns: libshtns
+import SHTns: synth, synth!, analys, analys!
+
+# Refuse to load when CUDA is unusable. An explicit check is used instead of
+# `@assert` because assertions are a debugging aid and may be disabled.
+function __init__()
+    if !CUDA.functional()
+        error("SHTnsCUDAExt requires a functional CUDA setup, but CUDA.functional() returned false.")
+    end
+    return nothing
+end
+
+include("sht.jl")
+include("synth.jl")
+include("analys.jl")
+
+end #module
diff --git a/ext/SHTnsCUDAExt/analys.jl b/ext/SHTnsCUDAExt/analys.jl
@@ -0,0 +1,83 @@
+"""
+    analys(cfg::SHTnsCfg, v::CuMatrix{Float64})
+
+Allocate a `CuVector{ComplexF64}` of length `cfg.nlm`, fill it with
+`analys!(cfg, copy(v), coeffs)`, and return it.
+
+Asserts that `cfg.shtype.gpu` is set and that `cfg.nlat` is non-zero.
+"""
+function analys(cfg::SHTnsCfg, v::CuMatrix{Float64})
+    @assert cfg.shtype.gpu
+    @assert cfg.nlat != 0
+    coeffs = CuVector{ComplexF64}(undef, cfg.nlm)
+    # `analys!` is handed a copy, so it never sees the caller's array
+    # (presumably because the transform may overwrite its input -- confirm).
+    scratch = copy(v)
+    analys!(cfg, scratch, coeffs)
+    return coeffs
+end
+
+"""
+    analys(cfg::SHTnsCfg, v::CuMatrix{ComplexF64})
+
+Allocate a `CuVector{ComplexF64}` of length `cfg.nlm_cplx`, fill it with
+`analys!(cfg, copy(v), coeffs)`, and return it.
+
+Asserts that `cfg.shtype.gpu` is set and that `cfg.nlat` is non-zero.
+"""
+function analys(cfg::SHTnsCfg, v::CuMatrix{ComplexF64})
+    @assert cfg.shtype.gpu
+    @assert cfg.nlat != 0
+    coeffs = CuVector{ComplexF64}(undef, cfg.nlm_cplx)
+    scratch = copy(v)
+    # NOTE(review): this extension only defines `analys!` for `CuMatrix{Float64}`
+    # (the complex variants are commented out), so this call must resolve to a
+    # method defined elsewhere -- confirm that it accepts device arrays.
+    analys!(cfg, scratch, coeffs)
+    return coeffs
+end
+
+"""
+    analys(cfg::SHTnsCfg, utheta::CuMatrix{Float64}, uphi::CuMatrix{Float64})
+
+Allocate two `CuVector{ComplexF64}` of length `cfg.nlm`, fill them by calling
+`analys!` on copies of `utheta` and `uphi`, and return them as `(slm, tlm)`.
+
+Asserts that `cfg.shtype.gpu` is set and that `cfg.nlat` is non-zero.
+"""
+function analys(cfg::SHTnsCfg, utheta::CuMatrix{Float64}, uphi::CuMatrix{Float64})
+    @assert cfg.shtype.gpu
+    @assert cfg.nlat != 0
+    newcoeffs() = CuVector{ComplexF64}(undef, cfg.nlm)
+    slm = newcoeffs()
+    tlm = newcoeffs()
+    # `analys!` only ever receives copies of the two input components.
+    analys!(cfg, copy(utheta), copy(uphi), slm, tlm)
+    return (slm, tlm)
+end
+
+"""
+    analys(cfg::SHTnsCfg, utheta::CuMatrix{ComplexF64}, uphi::CuMatrix{ComplexF64})
+
+Allocate two `CuVector{ComplexF64}` of length `cfg.nlm_cplx`, fill them by
+calling `analys!` on copies of `utheta` and `uphi`, and return them as
+`(slm, tlm)`.
+
+Asserts that `cfg.shtype.gpu` is set and that `cfg.nlat` is non-zero.
+"""
+function analys(cfg::SHTnsCfg, utheta::CuMatrix{ComplexF64}, uphi::CuMatrix{ComplexF64})
+    @assert cfg.shtype.gpu
+    @assert cfg.nlat != 0
+    newcoeffs() = CuVector{ComplexF64}(undef, cfg.nlm_cplx)
+    slm = newcoeffs()
+    tlm = newcoeffs()
+    # NOTE(review): this extension only defines `analys!` for `CuMatrix{Float64}`
+    # (the complex variants are commented out), so this call must resolve to a
+    # method defined elsewhere -- confirm that it accepts device arrays.
+    analys!(cfg, copy(utheta), copy(uphi), slm, tlm)
+    return (slm, tlm)
+end
+
+"""
+    analys(cfg::SHTnsCfg, ur::CuMatrix{Float64}, utheta::CuMatrix{Float64}, uphi::CuMatrix{Float64})
+
+Allocate three `CuVector{ComplexF64}` of length `cfg.nlm`, fill them by calling
+`analys!` on copies of `ur`, `utheta` and `uphi`, and return them as
+`(qlm, slm, tlm)`.
+
+Asserts that `cfg.shtype.gpu` is set and that `cfg.nlat` is non-zero.
+"""
+function analys(cfg::SHTnsCfg, ur::CuMatrix{Float64}, utheta::CuMatrix{Float64}, uphi::CuMatrix{Float64})
+    @assert cfg.shtype.gpu
+    @assert cfg.nlat != 0
+    newcoeffs() = CuVector{ComplexF64}(undef, cfg.nlm)
+    qlm = newcoeffs()
+    slm = newcoeffs()
+    tlm = newcoeffs()
+    # `analys!` only ever receives copies of the three input components.
+    ur_tmp = copy(ur)
+    utheta_tmp = copy(utheta)
+    uphi_tmp = copy(uphi)
+    analys!(cfg, ur_tmp, utheta_tmp, uphi_tmp, qlm, slm, tlm)
+    return (qlm, slm, tlm)
+end
+
+"""
+    analys(cfg::SHTnsCfg, ur::CuMatrix{ComplexF64}, utheta::CuMatrix{ComplexF64}, uphi::CuMatrix{ComplexF64})
+
+Allocate three `CuVector{ComplexF64}` of length `cfg.nlm_cplx`, fill them by
+calling `analys!` on copies of `ur`, `utheta` and `uphi`, and return them as
+`(qlm, slm, tlm)`.
+
+Asserts that `cfg.shtype.gpu` is set and that `cfg.nlat` is non-zero.
+"""
+function analys(cfg::SHTnsCfg, ur::CuMatrix{ComplexF64}, utheta::CuMatrix{ComplexF64}, uphi::CuMatrix{ComplexF64})
+    @assert cfg.shtype.gpu
+    @assert cfg.nlat != 0
+    newcoeffs() = CuVector{ComplexF64}(undef, cfg.nlm_cplx)
+    qlm = newcoeffs()
+    slm = newcoeffs()
+    tlm = newcoeffs()
+    ur_tmp = copy(ur)
+    utheta_tmp = copy(utheta)
+    uphi_tmp = copy(uphi)
+    # NOTE(review): this extension only defines `analys!` for `CuMatrix{Float64}`
+    # (the complex variants are commented out), so this call must resolve to a
+    # method defined elsewhere -- confirm that it accepts device arrays.
+    analys!(cfg, ur_tmp, utheta_tmp, uphi_tmp, qlm, slm, tlm)
+    return (qlm, slm, tlm)
+end
+
+"""
+    analys!(cfg::SHTnsCfg, v::CuMatrix{Float64}, qlm::CuVector{ComplexF64})
+
+Forward `v` and `qlm` to `cu_spat_to_SH` together with `cfg.cfg` and `cfg.lmax`,
+and return whatever that wrapper returns. Asserts that `cfg.shtype.gpu` is set.
+"""
+function analys!(cfg::SHTnsCfg, v::CuMatrix{Float64}, qlm::CuVector{ComplexF64})
+    @assert cfg.shtype.gpu
+    handle = cfg.cfg
+    degree = cfg.lmax
+    return cu_spat_to_SH(handle, v, qlm, degree)
+end
+
+
+"""
+    analys!(cfg::SHTnsCfg, utheta, uphi, slm, tlm)
+
+Forward the two spatial components and the two coefficient vectors to
+`cu_spat_to_SHsphtor` together with `cfg.cfg` and `cfg.lmax`, and return
+whatever that wrapper returns. Asserts that `cfg.shtype.gpu` is set.
+"""
+function analys!(cfg::SHTnsCfg, utheta::T, uphi::T, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64}) where {T<:CuMatrix{Float64}}
+    @assert cfg.shtype.gpu
+    handle = cfg.cfg
+    degree = cfg.lmax
+    return cu_spat_to_SHsphtor(handle, utheta, uphi, slm, tlm, degree)
+end
+
+"""
+    analys!(cfg::SHTnsCfg, ur, utheta, uphi, qlm, slm, tlm)
+
+Forward the three spatial components and the three coefficient vectors to
+`cu_spat_to_SHqst` together with `cfg.cfg` and `cfg.lmax`, and return whatever
+that wrapper returns. Asserts that `cfg.shtype.gpu` is set.
+"""
+function analys!(cfg::SHTnsCfg, ur::T, utheta::T, uphi::T, qlm::CuVector{ComplexF64}, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64}) where {T<:CuMatrix{Float64}}
+    @assert cfg.shtype.gpu
+    handle = cfg.cfg
+    degree = cfg.lmax
+    return cu_spat_to_SHqst(handle, ur, utheta, uphi, qlm, slm, tlm, degree)
+end
+
+# Complex-to-complex transforms are not available for CUDA (as of SHTns v3.7).
+
+# function analys!(cfg::SHTnsCfg, v::CuMatrix{ComplexF64}, qlm::CuVector{ComplexF64})
+#     return cu_spat_cplx_to_SH(cfg.cfg, v, qlm)
+# end
+
+# function analys!(cfg::SHTnsCfg, utheta::T, uphi::T, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64}) where {T<:CuMatrix{ComplexF64}}
+#     return cu_spat_cplx_to_SHsphtor(cfg.cfg, utheta, uphi, slm, tlm)
+# end
+
+# function analys!(cfg::SHTnsCfg, ur::T, utheta::T, uphi::T, qlm::CuVector{ComplexF64}, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64}) where {T<:CuMatrix{ComplexF64}}
+#     return cu_spat_cplx_to_SHqst(cfg.cfg, ur, utheta, uphi, qlm, slm, tlm)
+# end
diff --git a/ext/SHTnsCUDAExt/sht.jl b/ext/SHTnsCUDAExt/sht.jl
@@ -0,0 +1,25 @@
+"""
+    cu_spat_to_SH(cfg, Vr, Qlm, lmax)
+
+Call the `cu_spat_to_SH` routine of the library referenced by `libshtns[]`,
+passing `cfg`, device pointers to `Vr` and `Qlm`, and `lmax`. Returns `nothing`.
+"""
+function cu_spat_to_SH(cfg, Vr::CuMatrix{Float64}, Qlm::CuVector{Complex{Float64}}, lmax)
+    # NOTE(review): `lmax` is passed as `Clong` -- confirm this matches the
+    # integer width of the last argument in the C prototype.
+    return ccall(
+        (:cu_spat_to_SH, libshtns[]),
+        Cvoid,
+        (shtns_cfg, CuPtr{Float64}, CuPtr{ComplexF64}, Clong),
+        cfg, Vr, Qlm, lmax
+    )
+end
+
+"""
+    cu_SH_to_spat(cfg, Qlm, Vr, lmax)
+
+Call the `cu_SH_to_spat` routine of the library referenced by `libshtns[]`,
+passing `cfg`, device pointers to `Qlm` and `Vr`, and `lmax`. Returns `nothing`.
+"""
+function cu_SH_to_spat(cfg, Qlm::CuVector{Complex{Float64}}, Vr::CuMatrix{Float64}, lmax)
+    # NOTE(review): `lmax` is passed as `Clong` -- confirm this matches the
+    # integer width of the last argument in the C prototype.
+    return ccall(
+        (:cu_SH_to_spat, libshtns[]),
+        Cvoid,
+        (shtns_cfg, CuPtr{ComplexF64}, CuPtr{Float64}, Clong),
+        cfg, Qlm, Vr, lmax
+    )
+end
+
+"""
+    cu_spat_to_SHsphtor(cfg, Vt, Vp, Slm, Tlm, lmax)
+
+Call the `cu_spat_to_SHsphtor` routine of the library referenced by
+`libshtns[]`, passing `cfg`, device pointers to `Vt`, `Vp`, `Slm` and `Tlm`,
+and `lmax`. Returns `nothing`.
+"""
+function cu_spat_to_SHsphtor(cfg, Vt::CuMatrix{Float64}, Vp::CuMatrix{Float64}, Slm::CuVector{Complex{Float64}}, Tlm::CuVector{Complex{Float64}}, lmax)
+    # NOTE(review): `lmax` is passed as `Clong` -- confirm this matches the
+    # integer width of the last argument in the C prototype.
+    return ccall(
+        (:cu_spat_to_SHsphtor, libshtns[]),
+        Cvoid,
+        (shtns_cfg, CuPtr{Float64}, CuPtr{Float64}, CuPtr{ComplexF64}, CuPtr{ComplexF64}, Clong),
+        cfg, Vt, Vp, Slm, Tlm, lmax
+    )
+end
+
+"""
+    cu_SHsphtor_to_spat(cfg, Slm, Tlm, Vt, Vp, lmax)
+
+Call the `cu_SHsphtor_to_spat` routine of the library referenced by
+`libshtns[]`, passing `cfg`, device pointers to `Slm`, `Tlm`, `Vt` and `Vp`,
+and `lmax`. Returns `nothing`.
+"""
+function cu_SHsphtor_to_spat(cfg, Slm::CuVector{Complex{Float64}}, Tlm::CuVector{Complex{Float64}}, Vt::CuMatrix{Float64}, Vp::CuMatrix{Float64}, lmax)
+    # NOTE(review): `lmax` is passed as `Clong` -- confirm this matches the
+    # integer width of the last argument in the C prototype.
+    return ccall(
+        (:cu_SHsphtor_to_spat, libshtns[]),
+        Cvoid,
+        (shtns_cfg, CuPtr{ComplexF64}, CuPtr{ComplexF64}, CuPtr{Float64}, CuPtr{Float64}, Clong),
+        cfg, Slm, Tlm, Vt, Vp, lmax
+    )
+end
+
+"""
+    cu_spat_to_SHqst(cfg, Vr, Vt, Vp, Qlm, Slm, Tlm, lmax)
+
+Call the `cu_spat_to_SHqst` routine of the library referenced by `libshtns[]`,
+passing `cfg`, device pointers to the three spatial components and the three
+coefficient vectors, and `lmax`. Returns `nothing`.
+
+The wrapper carries the `cu_` prefix, like the C symbol and the other wrappers
+in this file, because the three-component `analys!` method calls it by the name
+`cu_spat_to_SHqst`.
+"""
+function cu_spat_to_SHqst(cfg, Vr::CuMatrix{Float64}, Vt::CuMatrix{Float64}, Vp::CuMatrix{Float64}, Qlm::CuVector{Complex{Float64}}, Slm::CuVector{Complex{Float64}}, Tlm::CuVector{Complex{Float64}}, lmax)
+    # NOTE(review): `lmax` is passed as `Clong` -- confirm this matches the
+    # integer width of the last argument in the C prototype.
+    return ccall(
+        (:cu_spat_to_SHqst, libshtns[]),
+        Cvoid,
+        (shtns_cfg, CuPtr{Float64}, CuPtr{Float64}, CuPtr{Float64}, CuPtr{ComplexF64}, CuPtr{ComplexF64}, CuPtr{ComplexF64}, Clong),
+        cfg, Vr, Vt, Vp, Qlm, Slm, Tlm, lmax
+    )
+end
+
+"""
+    cu_SHqst_to_spat(cfg, Qlm, Slm, Tlm, Vr, Vt, Vp, lmax)
+
+Call the `cu_SHqst_to_spat` routine of the library referenced by `libshtns[]`,
+passing `cfg`, device pointers to the three coefficient vectors and the three
+spatial components, and `lmax`. Returns `nothing`.
+
+The wrapper carries the `cu_` prefix, like the C symbol and the other wrappers
+in this file, because the three-component `synth!` method calls it by the name
+`cu_SHqst_to_spat`.
+"""
+function cu_SHqst_to_spat(cfg, Qlm::CuVector{Complex{Float64}}, Slm::CuVector{Complex{Float64}}, Tlm::CuVector{Complex{Float64}}, Vr::CuMatrix{Float64}, Vt::CuMatrix{Float64}, Vp::CuMatrix{Float64}, lmax)
+    # NOTE(review): `lmax` is passed as `Clong` -- confirm this matches the
+    # integer width of the last argument in the C prototype.
+    return ccall(
+        (:cu_SHqst_to_spat, libshtns[]),
+        Cvoid,
+        (shtns_cfg, CuPtr{ComplexF64}, CuPtr{ComplexF64}, CuPtr{ComplexF64}, CuPtr{Float64}, CuPtr{Float64}, CuPtr{Float64}, Clong),
+        cfg, Qlm, Slm, Tlm, Vr, Vt, Vp, lmax
+    )
+end
+
+
diff --git a/ext/SHTnsCUDAExt/synth.jl b/ext/SHTnsCUDAExt/synth.jl
@@ -0,0 +1,123 @@
+"""
+    synth(cfg::SHTnsCfg, qlm::CuVector{ComplexF64})
+
+Allocate a `CuMatrix{Float64}`, fill it with `synth!(cfg, qlm, field)`, and
+return it.
+
+The matrix is `cfg.nphi × cfg.nlat_padded` when `cfg.shtype.contiguous_phi` is
+true and `cfg.nlat_padded × cfg.nphi` otherwise. Asserts that `cfg.shtype.gpu`
+is set, that `cfg.nlat` is non-zero, and that `qlm` has `cfg.nlm` entries.
+"""
+function synth(cfg::SHTnsCfg, qlm::CuVector{ComplexF64})
+    @assert cfg.shtype.gpu
+    @assert cfg.nlat != 0
+    @assert length(qlm) == cfg.nlm
+
+    # The layout flag decides which extent comes first.
+    rows, cols = if cfg.shtype.contiguous_phi
+        (cfg.nphi, cfg.nlat_padded)
+    else
+        (cfg.nlat_padded, cfg.nphi)
+    end
+
+    field = CuMatrix{Float64}(undef, rows, cols)
+    synth!(cfg, qlm, field)
+    return field
+end
+
+"""
+    synth(cfg::SHTnsCfg, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64})
+
+Allocate two `CuMatrix{Float64}`, fill them with
+`synth!(cfg, slm, tlm, utheta, uphi)`, and return them as `(utheta, uphi)`.
+
+Each matrix is `cfg.nphi × cfg.nlat_padded` when `cfg.shtype.contiguous_phi` is
+true and `cfg.nlat_padded × cfg.nphi` otherwise. Asserts that `cfg.shtype.gpu`
+is set, that `cfg.nlat` is non-zero, and that both inputs have `cfg.nlm`
+entries.
+"""
+function synth(cfg::SHTnsCfg, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64})
+    @assert cfg.shtype.gpu
+    @assert cfg.nlat != 0
+    @assert length(slm) == length(tlm) == cfg.nlm
+
+    # The layout flag decides which extent comes first.
+    rows, cols = if cfg.shtype.contiguous_phi
+        (cfg.nphi, cfg.nlat_padded)
+    else
+        (cfg.nlat_padded, cfg.nphi)
+    end
+
+    newfield() = CuMatrix{Float64}(undef, rows, cols)
+    utheta = newfield()
+    uphi = newfield()
+    synth!(cfg, slm, tlm, utheta, uphi)
+    return (utheta, uphi)
+end
+
+"""
+    synth(cfg::SHTnsCfg, qlm::CuVector{ComplexF64}, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64})
+
+Allocate three `CuMatrix{Float64}`, fill them with
+`synth!(cfg, qlm, slm, tlm, ur, utheta, uphi)`, and return them as
+`(ur, utheta, uphi)`.
+
+Each matrix is `cfg.nphi × cfg.nlat_padded` when `cfg.shtype.contiguous_phi` is
+true and `cfg.nlat_padded × cfg.nphi` otherwise. Asserts that `cfg.shtype.gpu`
+is set, that `cfg.nlat` is non-zero, and that all three inputs have `cfg.nlm`
+entries.
+"""
+function synth(cfg::SHTnsCfg, qlm::CuVector{ComplexF64}, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64})
+    @assert cfg.shtype.gpu
+    @assert cfg.nlat != 0
+    @assert length(qlm) == length(slm) == length(tlm) == cfg.nlm
+
+    # The layout flag decides which extent comes first.
+    rows, cols = if cfg.shtype.contiguous_phi
+        (cfg.nphi, cfg.nlat_padded)
+    else
+        (cfg.nlat_padded, cfg.nphi)
+    end
+
+    newfield() = CuMatrix{Float64}(undef, rows, cols)
+    ur = newfield()
+    utheta = newfield()
+    uphi = newfield()
+    synth!(cfg, qlm, slm, tlm, ur, utheta, uphi)
+    return (ur, utheta, uphi)
+end
+
+
+"""
+    synth!(cfg::SHTnsCfg, qlm::CuVector{ComplexF64}, v::CuMatrix{Float64})
+
+Forward `qlm` and `v` to `cu_SH_to_spat` together with `cfg.cfg` and
+`cfg.lmax`, then return `v`. Asserts that `cfg.shtype.gpu` is set.
+"""
+function synth!(cfg::SHTnsCfg, qlm::CuVector{ComplexF64}, v::CuMatrix{Float64})
+    @assert cfg.shtype.gpu
+    handle = cfg.cfg
+    degree = cfg.lmax
+    cu_SH_to_spat(handle, qlm, v, degree)
+    return v
+end
+
+"""
+    synth!(cfg::SHTnsCfg, slm, tlm, utheta, uphi)
+
+Forward the two coefficient vectors and the two output matrices to
+`cu_SHsphtor_to_spat` together with `cfg.cfg` and `cfg.lmax`, then return
+`(utheta, uphi)`. Asserts that `cfg.shtype.gpu` is set.
+"""
+function synth!(cfg::SHTnsCfg, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64}, utheta::T, uphi::T) where {T<:CuMatrix{Float64}}
+    @assert cfg.shtype.gpu
+    handle = cfg.cfg
+    degree = cfg.lmax
+    cu_SHsphtor_to_spat(handle, slm, tlm, utheta, uphi, degree)
+    return (utheta, uphi)
+end
+
+"""
+    synth!(cfg::SHTnsCfg, qlm, slm, tlm, ur, utheta, uphi)
+
+Forward the three coefficient vectors and the three output matrices to
+`cu_SHqst_to_spat` together with `cfg.cfg` and `cfg.lmax`, then return
+`(ur, utheta, uphi)`. Asserts that `cfg.shtype.gpu` is set.
+"""
+function synth!(cfg::SHTnsCfg, qlm::CuVector{ComplexF64}, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64}, ur::T, utheta::T, uphi::T) where {T<:CuMatrix{Float64}}
+    @assert cfg.shtype.gpu
+    handle = cfg.cfg
+    degree = cfg.lmax
+    cu_SHqst_to_spat(handle, qlm, slm, tlm, ur, utheta, uphi, degree)
+    return (ur, utheta, uphi)
+end
+
+
+# Complex-to-complex transforms are not available for CUDA (as of SHTns v3.7).
+
+# function synth_cplx(cfg::SHTnsCfg, qlm::CuVector{ComplexF64})
+#     @assert cfg.shtype.gpu
+#     @assert cfg.nlat != 0
+#     @assert length(qlm) == cfg.nlm_cplx
+#     @assert cfg.lmax == cfg.mmax
+
+#     nx = cfg.shtype.contiguous_phi ? cfg.nphi : cfg.nlat_padded
+#     ny = cfg.shtype.contiguous_phi ? cfg.nlat_padded : cfg.nphi
+     
+#     v = CuMatrix{ComplexF64}(undef, nx, ny)
+#     synth!(cfg, qlm, v)
+#     return v
+# end
+
+# function synth_cplx(cfg::SHTnsCfg, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64})
+#     @assert cfg.shtype.gpu
+#     @assert cfg.nlat != 0
+#     @assert length(slm) == length(tlm) == cfg.nlm_cplx
+#     @assert cfg.lmax == cfg.mmax
+
+#     nx = cfg.shtype.contiguous_phi ? cfg.nphi : cfg.nlat_padded
+#     ny = cfg.shtype.contiguous_phi ? cfg.nlat_padded : cfg.nphi
+     
+#     utheta = CuMatrix{ComplexF64}(undef, nx, ny)
+#     uphi = CuMatrix{ComplexF64}(undef, nx, ny)
+#     synth!(cfg, slm, tlm, utheta, uphi)
+#     return utheta, uphi
+# end
+
+# function synth_cplx(cfg::SHTnsCfg, qlm::CuVector{ComplexF64}, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64})
+#     @assert cfg.shtype.gpu
+#     @assert cfg.nlat != 0
+#     @assert length(qlm) == length(slm) == length(tlm) == cfg.nlm_cplx
+#     @assert cfg.lmax == cfg.mmax
+
+#     nx = cfg.shtype.contiguous_phi ? cfg.nphi : cfg.nlat_padded
+#     ny = cfg.shtype.contiguous_phi ? cfg.nlat_padded : cfg.nphi
+     
+#     ur = CuMatrix{ComplexF64}(undef, nx, ny)
+#     utheta = CuMatrix{ComplexF64}(undef, nx, ny)
+#     uphi = CuMatrix{ComplexF64}(undef, nx, ny)
+#     synth!(cfg, qlm, slm, tlm, ur, utheta, uphi)
+#     return ur, utheta, uphi
+# end
+
+# function synth!(cfg::SHTnsCfg, qlm::CuVector{ComplexF64}, v::CuMatrix{ComplexF64})
+#     cu_SH_to_spat_cplx(cfg.cfg, qlm, v, cfg.lmax)
+#     return v
+# end
+
+# function synth!(cfg::SHTnsCfg, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64}, utheta::T, uphi::T) where {T<:CuMatrix{ComplexF64}}
+#     cu_SHsphtor_to_spat_cplx(cfg.cfg, slm, tlm, utheta, uphi, cfg.lmax)
+#     return utheta, uphi
+# end
+
+# function synth!(cfg::SHTnsCfg, qlm::CuVector{ComplexF64}, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64}, ur::T, utheta::T, uphi::T) where {T<:CuMatrix{ComplexF64}}
+#     cu_SHqst_to_spat_cplx(cfg.cfg, qlm, slm, tlm, ur, utheta, uphi, cfg.lmax)
+#     return ur, utheta, uphi
+# end
+
diff --git a/src/analys.jl b/src/analys.jl
@@ -63,11 +63,11 @@ function analys!(cfg::SHTnsCfg, utheta::T, uphi::T, slm, tlm) where {T<:Abstract
 end
 
+# Float64 spatial components: forwards to `spat_to_SHqst` and returns its result.
 function analys!(cfg::SHTnsCfg, ur::T, utheta::T, uphi::T, qlm, slm, tlm) where {T<:AbstractMatrix{Float64}}
-    return spat_cplx_to_SHqst(cfg.cfg, ur, utheta, uphi, qlm, slm, tlm)
+    return spat_to_SHqst(cfg.cfg, ur, utheta, uphi, qlm, slm, tlm)
 end
 
+# ComplexF64 spatial components: forwards to `spat_cplx_to_SHqst` and returns its result.
 function analys!(cfg::SHTnsCfg, ur::T, utheta::T, uphi::T, qlm, slm, tlm) where {T<:AbstractMatrix{ComplexF64}}
-    return spat_to_SHqst(cfg.cfg, ur, utheta, uphi, qlm, slm, tlm)
+    return spat_cplx_to_SHqst(cfg.cfg, ur, utheta, uphi, qlm, slm, tlm)
 end