fgerick
diff --git a/‎Project.toml‎
Lines changed: 6 additions & 0 deletions b/‎Project.toml‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎ext/SHTnsCUDAExt/SHTnsCUDAExt.jl‎
Lines changed: 15 additions & 0 deletions b/‎ext/SHTnsCUDAExt/SHTnsCUDAExt.jl‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎ext/SHTnsCUDAExt/analys.jl‎
Lines changed: 83 additions & 0 deletions b/‎ext/SHTnsCUDAExt/analys.jl‎
Lines changed: 83 additions & 0 deletions
diff --git a/‎ext/SHTnsCUDAExt/sht.jl‎
Lines changed: 25 additions & 0 deletions b/‎ext/SHTnsCUDAExt/sht.jl‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎ext/SHTnsCUDAExt/synth.jl‎
Lines changed: 126 additions & 0 deletions b/‎ext/SHTnsCUDAExt/synth.jl‎
Lines changed: 126 additions & 0 deletions
diff --git a/‎src/SHTns.jl‎
Lines changed: 29 additions & 5 deletions b/‎src/SHTns.jl‎
Lines changed: 29 additions & 5 deletions
@@ -6,6 +6,12 @@ version = "0.2.0"
 [deps]
 SHTns_jll = "daf09cc5-9ab3-509e-9618-0b89086eb825"
 
+[weakdeps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+
+[extensions]
+SHTnsCUDAExt = "CUDA"
+
 [compat]
 julia = "1.6"
 
 
@@ -0,0 +1,15 @@
+module SHTnsCUDAExt
+
+using CUDA
+using SHTns
+
+import SHTns: libshtns
+import SHTns: synth, synth!, analys, analys!
+
+"""
+    __init__()
+
+Check at load time that CUDA is usable and throw an error if it is not.
+"""
+function __init__()
+    # Explicit check instead of `@assert`: assertions are a debugging aid that may be
+    # disabled, and a bare assertion gives no hint about what went wrong.
+    if !CUDA.functional()
+        error("SHTnsCUDAExt requires a functional CUDA setup, but `CUDA.functional()` returned false.")
+    end
+    return nothing
+end
+
+# Low-level `ccall` wrappers first, then the user-facing transforms built on them.
+include("sht.jl")
+include("synth.jl")
+include("analys.jl")
+
+end #module
@@ -0,0 +1,83 @@
+"""
+    analys(cfg::SHTnsCfg, v::CuArray{Float64})
+
+Out-of-place analysis of the real GPU field `v`.
+
+Allocates a `CuVector{ComplexF64}` of length `cfg.nlm * cfg.howmany`, passes it together
+with a copy of `v` to `analys!`, and returns it. Asserts that `cfg.shtype.gpu` is set and
+that `cfg.nlat` is nonzero.
+"""
+function analys(cfg::SHTnsCfg, v::CuArray{Float64})
+    @assert cfg.shtype.gpu
+    @assert cfg.nlat != 0
+    ncoef = cfg.nlm * cfg.howmany
+    coeffs = CuVector{ComplexF64}(undef, ncoef)
+    # `analys!` is handed a copy, never the caller's array.
+    work = copy(v)
+    analys!(cfg, work, coeffs)
+    return coeffs
+end
+
+"""
+    analys(cfg::SHTnsCfg, v::CuArray{ComplexF64})
+
+Out-of-place analysis of the complex GPU field `v`.
+
+Allocates a `CuVector{ComplexF64}` of length `cfg.nlm_cplx * cfg.howmany`, passes it
+together with a copy of `v` to `analys!`, and returns it. Asserts that `cfg.shtype.gpu` is
+set and that `cfg.nlat` is nonzero.
+
+NOTE(review): this extension defines no `analys!` method for `CuArray{ComplexF64}` input
+(the complex CUDA methods are commented out), so the call depends on a method defined
+elsewhere. Confirm this path is usable.
+"""
+function analys(cfg::SHTnsCfg, v::CuArray{ComplexF64})
+    @assert cfg.shtype.gpu
+    @assert cfg.nlat != 0
+    ncoef = cfg.nlm_cplx * cfg.howmany
+    coeffs = CuVector{ComplexF64}(undef, ncoef)
+    # `analys!` is handed a copy, never the caller's array.
+    work = copy(v)
+    analys!(cfg, work, coeffs)
+    return coeffs
+end
+
+"""
+    analys(cfg::SHTnsCfg, utheta::CuArray{Float64}, uphi::CuArray{Float64})
+
+Out-of-place analysis of the real two-component GPU field `(utheta, uphi)`.
+
+Allocates two `CuVector{ComplexF64}` of length `cfg.nlm * cfg.howmany`, passes them
+together with copies of both inputs to `analys!`, and returns them as a tuple. Asserts
+that `cfg.shtype.gpu` is set and that `cfg.nlat` is nonzero.
+"""
+function analys(cfg::SHTnsCfg, utheta::CuArray{Float64}, uphi::CuArray{Float64})
+    @assert cfg.shtype.gpu
+    @assert cfg.nlat != 0
+    ncoef = cfg.nlm * cfg.howmany
+    sph = CuVector{ComplexF64}(undef, ncoef)
+    tor = CuVector{ComplexF64}(undef, ncoef)
+    # `analys!` is handed copies, never the caller's arrays.
+    work_theta = copy(utheta)
+    work_phi = copy(uphi)
+    analys!(cfg, work_theta, work_phi, sph, tor)
+    return sph, tor
+end
+
+"""
+    analys(cfg::SHTnsCfg, utheta::CuArray{ComplexF64}, uphi::CuArray{ComplexF64})
+
+Out-of-place analysis of the complex two-component GPU field `(utheta, uphi)`.
+
+Allocates two `CuVector{ComplexF64}` of length `cfg.nlm_cplx * cfg.howmany`, passes them
+together with copies of both inputs to `analys!`, and returns them as a tuple. Asserts
+that `cfg.shtype.gpu` is set and that `cfg.nlat` is nonzero.
+
+NOTE(review): this extension defines no `analys!` method for `CuArray{ComplexF64}` input
+(the complex CUDA methods are commented out), so the call depends on a method defined
+elsewhere. Confirm this path is usable.
+"""
+function analys(cfg::SHTnsCfg, utheta::CuArray{ComplexF64}, uphi::CuArray{ComplexF64})
+    @assert cfg.shtype.gpu
+    @assert cfg.nlat != 0
+    ncoef = cfg.nlm_cplx * cfg.howmany
+    sph = CuVector{ComplexF64}(undef, ncoef)
+    tor = CuVector{ComplexF64}(undef, ncoef)
+    # `analys!` is handed copies, never the caller's arrays.
+    work_theta = copy(utheta)
+    work_phi = copy(uphi)
+    analys!(cfg, work_theta, work_phi, sph, tor)
+    return sph, tor
+end
+
+"""
+    analys(cfg::SHTnsCfg, ur::CuArray{Float64}, utheta::CuArray{Float64}, uphi::CuArray{Float64})
+
+Out-of-place analysis of the real three-component GPU field `(ur, utheta, uphi)`.
+
+Allocates three `CuVector{ComplexF64}` of length `cfg.nlm * cfg.howmany`, passes them
+together with copies of the three inputs to `analys!`, and returns them as a tuple.
+Asserts that `cfg.shtype.gpu` is set and that `cfg.nlat` is nonzero.
+"""
+function analys(cfg::SHTnsCfg, ur::CuArray{Float64}, utheta::CuArray{Float64}, uphi::CuArray{Float64})
+    @assert cfg.shtype.gpu
+    @assert cfg.nlat != 0
+    ncoef = cfg.nlm * cfg.howmany
+    rad = CuVector{ComplexF64}(undef, ncoef)
+    sph = CuVector{ComplexF64}(undef, ncoef)
+    tor = CuVector{ComplexF64}(undef, ncoef)
+    # `analys!` is handed copies, never the caller's arrays.
+    work_r = copy(ur)
+    work_theta = copy(utheta)
+    work_phi = copy(uphi)
+    analys!(cfg, work_r, work_theta, work_phi, rad, sph, tor)
+    return rad, sph, tor
+end
+
+"""
+    analys(cfg::SHTnsCfg, ur::CuArray{ComplexF64}, utheta::CuArray{ComplexF64}, uphi::CuArray{ComplexF64})
+
+Out-of-place analysis of the complex three-component GPU field `(ur, utheta, uphi)`.
+
+Allocates three `CuVector{ComplexF64}` of length `cfg.nlm_cplx * cfg.howmany`, passes them
+together with copies of the three inputs to `analys!`, and returns them as a tuple.
+Asserts that `cfg.shtype.gpu` is set and that `cfg.nlat` is nonzero.
+
+NOTE(review): this extension defines no `analys!` method for `CuArray{ComplexF64}` input
+(the complex CUDA methods are commented out), so the call depends on a method defined
+elsewhere. Confirm this path is usable.
+"""
+function analys(cfg::SHTnsCfg, ur::CuArray{ComplexF64}, utheta::CuArray{ComplexF64}, uphi::CuArray{ComplexF64})
+    @assert cfg.shtype.gpu
+    @assert cfg.nlat != 0
+    ncoef = cfg.nlm_cplx * cfg.howmany
+    rad = CuVector{ComplexF64}(undef, ncoef)
+    sph = CuVector{ComplexF64}(undef, ncoef)
+    tor = CuVector{ComplexF64}(undef, ncoef)
+    # `analys!` is handed copies, never the caller's arrays.
+    work_r = copy(ur)
+    work_theta = copy(utheta)
+    work_phi = copy(uphi)
+    analys!(cfg, work_r, work_theta, work_phi, rad, sph, tor)
+    return rad, sph, tor
+end
+
+"""
+    analys!(cfg::SHTnsCfg, v::CuArray{Float64}, qlm::CuVector{ComplexF64})
+
+In-place analysis on the GPU: forward `v` and `qlm` to `cu_spat_to_SH` with `cfg.lmax`
+and return that call's result. Asserts that `cfg.shtype.gpu` is set.
+
+NOTE(review): `v` is passed on unchanged, so it is presumably overwritten by the
+transform (the out-of-place `analys` copies its input first). Confirm.
+"""
+function analys!(cfg::SHTnsCfg, v::CuArray{Float64}, qlm::CuVector{ComplexF64})
+    @assert cfg.shtype.gpu
+    handle, ltr = cfg.cfg, cfg.lmax
+    return cu_spat_to_SH(handle, v, qlm, ltr)
+end
+
+
+"""
+    analys!(cfg::SHTnsCfg, utheta, uphi, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64})
+
+In-place two-component analysis on the GPU: forward the real fields `utheta`, `uphi` and
+the outputs `slm`, `tlm` to `cu_spat_to_SHsphtor` with `cfg.lmax` and return that call's
+result. Both spatial arguments must have the same `CuArray{Float64}` subtype. Asserts
+that `cfg.shtype.gpu` is set.
+"""
+function analys!(cfg::SHTnsCfg, utheta::T, uphi::T, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64}) where {T<:CuArray{Float64}}
+    @assert cfg.shtype.gpu
+    handle, ltr = cfg.cfg, cfg.lmax
+    return cu_spat_to_SHsphtor(handle, utheta, uphi, slm, tlm, ltr)
+end
+
+"""
+    analys!(cfg::SHTnsCfg, ur, utheta, uphi, qlm, slm, tlm)
+
+In-place three-component analysis on the GPU: forward the real fields `ur`, `utheta`,
+`uphi` and the outputs `qlm`, `slm`, `tlm` to `cu_spat_to_SHqst` with `cfg.lmax` and
+return that call's result. All spatial arguments must have the same `CuArray{Float64}`
+subtype. Asserts that `cfg.shtype.gpu` is set.
+"""
+function analys!(cfg::SHTnsCfg, ur::T, utheta::T, uphi::T, qlm::CuVector{ComplexF64}, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64}) where {T<:CuArray{Float64}}
+    @assert cfg.shtype.gpu
+    handle, ltr = cfg.cfg, cfg.lmax
+    return cu_spat_to_SHqst(handle, ur, utheta, uphi, qlm, slm, tlm, ltr)
+end
+
+# Complex-to-complex transforms are not available for CUDA (as of SHTns v3.7), so the methods below stay commented out.
+
+# function analys!(cfg::SHTnsCfg, v::CuArray{ComplexF64}, qlm::CuVector{ComplexF64})
+#     return cu_spat_cplx_to_SH(cfg.cfg, v, qlm)
+# end
+
+# function analys!(cfg::SHTnsCfg, utheta::T, uphi::T, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64}) where {T<:CuArray{ComplexF64}}
+#     return cu_spat_cplx_to_SHsphtor(cfg.cfg, utheta, uphi, slm, tlm)
+# end
+
+# function analys!(cfg::SHTnsCfg, ur::T, utheta::T, uphi::T, qlm::CuVector{ComplexF64}, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64}) where {T<:CuArray{ComplexF64}}
+#     return cu_spat_cplx_to_SHqst(cfg.cfg, ur, utheta, uphi, qlm, slm, tlm)
+# end
@@ -0,0 +1,25 @@
+"""
+    cu_spat_to_SH(cfg, Vr, Qlm, lmax)
+
+Thin wrapper: call the `cu_spat_to_SH` symbol of `libshtns[]`, passing `Vr` and `Qlm` as
+device pointers.
+
+NOTE(review): `lmax` is passed as `Clong`. Confirm this matches the C prototype.
+"""
+function cu_spat_to_SH(cfg, Vr::CuMatrix{Float64}, Qlm::CuVector{Complex{Float64}}, lmax)
+    return ccall(
+        (:cu_spat_to_SH, libshtns[]),
+        Cvoid,
+        (shtns_cfg, CuPtr{Float64}, CuPtr{ComplexF64}, Clong),
+        cfg, Vr, Qlm, lmax
+    )
+end
+
+"""
+    cu_SH_to_spat(cfg, Qlm, Vr, lmax)
+
+Thin wrapper: call the `cu_SH_to_spat` symbol of `libshtns[]`, passing `Qlm` and `Vr` as
+device pointers.
+
+NOTE(review): `lmax` is passed as `Clong`. Confirm this matches the C prototype.
+"""
+function cu_SH_to_spat(cfg, Qlm::CuVector{Complex{Float64}}, Vr::CuMatrix{Float64}, lmax)
+    return ccall(
+        (:cu_SH_to_spat, libshtns[]),
+        Cvoid,
+        (shtns_cfg, CuPtr{ComplexF64}, CuPtr{Float64}, Clong),
+        cfg, Qlm, Vr, lmax
+    )
+end
+
+"""
+    cu_spat_to_SHsphtor(cfg, Vt, Vp, Slm, Tlm, lmax)
+
+Thin wrapper: call the `cu_spat_to_SHsphtor` symbol of `libshtns[]`, passing the spatial
+arrays `Vt`, `Vp` and the spectral arrays `Slm`, `Tlm` as device pointers.
+
+NOTE(review): `lmax` is passed as `Clong`. Confirm this matches the C prototype.
+"""
+function cu_spat_to_SHsphtor(cfg, Vt::CuMatrix{Float64}, Vp::CuMatrix{Float64}, Slm::CuVector{Complex{Float64}}, Tlm::CuVector{Complex{Float64}}, lmax)
+    return ccall(
+        (:cu_spat_to_SHsphtor, libshtns[]),
+        Cvoid,
+        (shtns_cfg, CuPtr{Float64}, CuPtr{Float64}, CuPtr{ComplexF64}, CuPtr{ComplexF64}, Clong),
+        cfg, Vt, Vp, Slm, Tlm, lmax
+    )
+end
+
+"""
+    cu_SHsphtor_to_spat(cfg, Slm, Tlm, Vt, Vp, lmax)
+
+Thin wrapper: call the `cu_SHsphtor_to_spat` symbol of `libshtns[]`, passing the spectral
+arrays `Slm`, `Tlm` and the spatial arrays `Vt`, `Vp` as device pointers.
+
+NOTE(review): `lmax` is passed as `Clong`. Confirm this matches the C prototype.
+"""
+function cu_SHsphtor_to_spat(cfg, Slm::CuVector{Complex{Float64}}, Tlm::CuVector{Complex{Float64}}, Vt::CuMatrix{Float64}, Vp::CuMatrix{Float64}, lmax)
+    return ccall(
+        (:cu_SHsphtor_to_spat, libshtns[]),
+        Cvoid,
+        (shtns_cfg, CuPtr{ComplexF64}, CuPtr{ComplexF64}, CuPtr{Float64}, CuPtr{Float64}, Clong),
+        cfg, Slm, Tlm, Vt, Vp, lmax
+    )
+end
+
+"""
+    cu_spat_to_SHqst(cfg, Vr, Vt, Vp, Qlm, Slm, Tlm, lmax)
+
+Thin wrapper: call the `cu_spat_to_SHqst` symbol of `libshtns[]`, passing the spatial
+arrays `Vr`, `Vt`, `Vp` and the spectral arrays `Qlm`, `Slm`, `Tlm` as device pointers.
+
+The three-component `analys!` method calls this wrapper as `cu_spat_to_SHqst`, matching
+the `cu_` prefix of the other CUDA wrappers. It was previously defined only as
+`spat_to_SHqst`, which left that call unresolved.
+
+NOTE(review): `lmax` is passed as `Clong`. Confirm this matches the C prototype.
+"""
+function cu_spat_to_SHqst(cfg, Vr::CuMatrix{Float64}, Vt::CuMatrix{Float64}, Vp::CuMatrix{Float64}, Qlm::CuVector{Complex{Float64}}, Slm::CuVector{Complex{Float64}}, Tlm::CuVector{Complex{Float64}}, lmax)
+    return ccall(
+        (:cu_spat_to_SHqst, libshtns[]),
+        Nothing,
+        (shtns_cfg, CuPtr{Float64}, CuPtr{Float64}, CuPtr{Float64}, CuPtr{ComplexF64}, CuPtr{ComplexF64}, CuPtr{ComplexF64}, Clong),
+        cfg, Vr, Vt, Vp, Qlm, Slm, Tlm, lmax
+    )
+end
+
+# Former name of the wrapper above, kept as a forwarding method for backward compatibility.
+function spat_to_SHqst(cfg, Vr::CuMatrix{Float64}, Vt::CuMatrix{Float64}, Vp::CuMatrix{Float64}, Qlm::CuVector{Complex{Float64}}, Slm::CuVector{Complex{Float64}}, Tlm::CuVector{Complex{Float64}}, lmax)
+    return cu_spat_to_SHqst(cfg, Vr, Vt, Vp, Qlm, Slm, Tlm, lmax)
+end
+
+"""
+    cu_SHqst_to_spat(cfg, Qlm, Slm, Tlm, Vr, Vt, Vp, lmax)
+
+Thin wrapper: call the `cu_SHqst_to_spat` symbol of `libshtns[]`, passing the spectral
+arrays `Qlm`, `Slm`, `Tlm` and the spatial arrays `Vr`, `Vt`, `Vp` as device pointers.
+
+The three-component `synth!` method calls this wrapper as `cu_SHqst_to_spat`, matching
+the `cu_` prefix of the other CUDA wrappers. It was previously defined only as
+`SHqst_to_spat`, which left that call unresolved.
+
+NOTE(review): `lmax` is passed as `Clong`. Confirm this matches the C prototype.
+"""
+function cu_SHqst_to_spat(cfg, Qlm::CuVector{Complex{Float64}}, Slm::CuVector{Complex{Float64}}, Tlm::CuVector{Complex{Float64}}, Vr::CuMatrix{Float64}, Vt::CuMatrix{Float64}, Vp::CuMatrix{Float64}, lmax)
+    return ccall(
+        (:cu_SHqst_to_spat, libshtns[]),
+        Nothing,
+        (shtns_cfg, CuPtr{ComplexF64}, CuPtr{ComplexF64}, CuPtr{ComplexF64}, CuPtr{Float64}, CuPtr{Float64}, CuPtr{Float64}, Clong),
+        cfg, Qlm, Slm, Tlm, Vr, Vt, Vp, lmax
+    )
+end
+
+# Former name of the wrapper above, kept as a forwarding method for backward compatibility.
+function SHqst_to_spat(cfg, Qlm::CuVector{Complex{Float64}}, Slm::CuVector{Complex{Float64}}, Tlm::CuVector{Complex{Float64}}, Vr::CuMatrix{Float64}, Vt::CuMatrix{Float64}, Vp::CuMatrix{Float64}, lmax)
+    return cu_SHqst_to_spat(cfg, Qlm, Slm, Tlm, Vr, Vt, Vp, lmax)
+end
+
+
@@ -0,0 +1,126 @@
+"""
+    synth(cfg::SHTnsCfg, qlm::CuVector{ComplexF64})
+
+Out-of-place synthesis on the GPU: allocate a spatial `CuMatrix`, pass it with `qlm` to
+`synth!`, and return it.
+
+The matrix is `cfg.nphi × cfg.nlat_padded` when `cfg.shtype.contiguous_phi` is set and
+`cfg.nlat_padded × cfg.nphi` otherwise. Its element type is `Float64` for a `Real`
+transform and `ComplexF64` otherwise. Asserts that `cfg.shtype.gpu` is set, that
+`cfg.nlat` is nonzero, and that `length(qlm) == nlm(cfg)*cfg.howmany`.
+
+NOTE(review): the output size does not depend on `cfg.howmany`, although the input
+length does. Confirm the buffer is large enough for batched transforms.
+"""
+function synth(cfg::SHTnsCfg{TR,T,N}, qlm::CuVector{ComplexF64}) where {TR,T,N}
+    @assert cfg.shtype.gpu
+    @assert cfg.nlat != 0
+    @assert length(qlm) == nlm(cfg)*cfg.howmany
+
+    # The spatial element type follows the transform kind in the first type parameter.
+    Tv = TR == Real ? Float64 : ComplexF64
+    dims = if cfg.shtype.contiguous_phi
+        (cfg.nphi, cfg.nlat_padded)
+    else
+        (cfg.nlat_padded, cfg.nphi)
+    end
+
+    field = CuMatrix{Tv}(undef, dims...)
+    synth!(cfg, qlm, field)
+    return field
+end
+
+"""
+    synth(cfg::SHTnsCfg, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64})
+
+Out-of-place two-component synthesis on the GPU: allocate two spatial `CuMatrix`
+buffers, pass them with `slm` and `tlm` to `synth!`, and return `(utheta, uphi)`.
+
+Each matrix is `cfg.nphi × cfg.nlat_padded` when `cfg.shtype.contiguous_phi` is set and
+`cfg.nlat_padded × cfg.nphi` otherwise. The element type is `Float64` for a `Real`
+transform and `ComplexF64` otherwise. Asserts that `cfg.shtype.gpu` is set, that
+`cfg.nlat` is nonzero, and that both inputs have length `nlm(cfg)*cfg.howmany`.
+
+NOTE(review): the output size does not depend on `cfg.howmany`, although the input
+length does. Confirm the buffers are large enough for batched transforms.
+"""
+function synth(cfg::SHTnsCfg{TR,T,N}, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64}) where {TR,T,N}
+    @assert cfg.shtype.gpu
+    @assert cfg.nlat != 0
+    @assert length(slm) == length(tlm) == nlm(cfg)*cfg.howmany
+
+    # The spatial element type follows the transform kind in the first type parameter.
+    Tv = TR == Real ? Float64 : ComplexF64
+    dims = if cfg.shtype.contiguous_phi
+        (cfg.nphi, cfg.nlat_padded)
+    else
+        (cfg.nlat_padded, cfg.nphi)
+    end
+
+    vtheta = CuMatrix{Tv}(undef, dims...)
+    vphi = CuMatrix{Tv}(undef, dims...)
+    synth!(cfg, slm, tlm, vtheta, vphi)
+    return vtheta, vphi
+end
+
+"""
+    synth(cfg::SHTnsCfg, qlm::CuVector{ComplexF64}, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64})
+
+Out-of-place three-component synthesis on the GPU: allocate three spatial `CuMatrix`
+buffers, pass them with `qlm`, `slm` and `tlm` to `synth!`, and return
+`(ur, utheta, uphi)`.
+
+Each matrix is `cfg.nphi × cfg.nlat_padded` when `cfg.shtype.contiguous_phi` is set and
+`cfg.nlat_padded × cfg.nphi` otherwise. The element type is `Float64` for a `Real`
+transform and `ComplexF64` otherwise. Asserts that `cfg.shtype.gpu` is set, that
+`cfg.nlat` is nonzero, and that all inputs have length `nlm(cfg)*cfg.howmany`.
+
+NOTE(review): the output size does not depend on `cfg.howmany`, although the input
+length does. Confirm the buffers are large enough for batched transforms.
+"""
+function synth(cfg::SHTnsCfg{TR,T,N}, qlm::CuVector{ComplexF64}, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64}) where {TR,T,N}
+    @assert cfg.shtype.gpu
+    @assert cfg.nlat != 0
+    @assert length(qlm) == length(slm) == length(tlm) == nlm(cfg)*cfg.howmany
+
+    # The spatial element type follows the transform kind in the first type parameter.
+    Tv = TR == Real ? Float64 : ComplexF64
+    dims = if cfg.shtype.contiguous_phi
+        (cfg.nphi, cfg.nlat_padded)
+    else
+        (cfg.nlat_padded, cfg.nphi)
+    end
+
+    vr = CuMatrix{Tv}(undef, dims...)
+    vtheta = CuMatrix{Tv}(undef, dims...)
+    vphi = CuMatrix{Tv}(undef, dims...)
+    synth!(cfg, qlm, slm, tlm, vr, vtheta, vphi)
+    return vr, vtheta, vphi
+end
+
+
+"""
+    synth!(cfg::SHTnsCfg{Real}, qlm::CuVector{ComplexF64}, v::CuMatrix{Float64})
+
+In-place synthesis on the GPU for a `Real` transform: forward `qlm` and `v` to
+`cu_SH_to_spat` with `cfg.lmax`, then return `v`. Asserts that `cfg.shtype.gpu` is set.
+"""
+function synth!(cfg::SHTnsCfg{Real,T,N}, qlm::CuVector{ComplexF64}, v::CuMatrix{Float64}) where {T,N}
+    @assert cfg.shtype.gpu
+    handle, ltr = cfg.cfg, cfg.lmax
+    cu_SH_to_spat(handle, qlm, v, ltr)
+    return v
+end
+
+"""
+    synth!(cfg::SHTnsCfg{Real}, slm, tlm, utheta, uphi)
+
+In-place two-component synthesis on the GPU for a `Real` transform: forward `slm`, `tlm`
+and the outputs `utheta`, `uphi` to `cu_SHsphtor_to_spat` with `cfg.lmax`, then return
+`(utheta, uphi)`. Both outputs must have the same `CuMatrix{Float64}` subtype. Asserts
+that `cfg.shtype.gpu` is set.
+"""
+function synth!(cfg::SHTnsCfg{Real,T,N}, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64}, utheta::Tv, uphi::Tv) where {T,N,Tv<:CuMatrix{Float64}}
+    @assert cfg.shtype.gpu
+    handle, ltr = cfg.cfg, cfg.lmax
+    cu_SHsphtor_to_spat(handle, slm, tlm, utheta, uphi, ltr)
+    return utheta, uphi
+end
+
+"""
+    synth!(cfg::SHTnsCfg{Real}, qlm, slm, tlm, ur, utheta, uphi)
+
+In-place three-component synthesis on the GPU for a `Real` transform: forward `qlm`,
+`slm`, `tlm` and the outputs `ur`, `utheta`, `uphi` to `cu_SHqst_to_spat` with
+`cfg.lmax`, then return `(ur, utheta, uphi)`. All outputs must have the same
+`CuMatrix{Float64}` subtype. Asserts that `cfg.shtype.gpu` is set.
+"""
+function synth!(cfg::SHTnsCfg{Real,T,N}, qlm::CuVector{ComplexF64}, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64}, ur::Tv, utheta::Tv, uphi::Tv) where {T,N,Tv<:CuMatrix{Float64}}
+    @assert cfg.shtype.gpu
+    handle, ltr = cfg.cfg, cfg.lmax
+    cu_SHqst_to_spat(handle, qlm, slm, tlm, ur, utheta, uphi, ltr)
+    return ur, utheta, uphi
+end
+
+
+# Complex-to-complex transforms are not available for CUDA (as of SHTns v3.7), so the methods below stay commented out.
+
+# function synth_cplx(cfg::SHTnsCfg, qlm::CuVector{ComplexF64})
+#     @assert cfg.shtype.gpu
+#     @assert cfg.nlat != 0
+#     @assert length(qlm) == cfg.nlm_cplx
+#     @assert cfg.lmax == cfg.mmax
+
+#     nx = cfg.shtype.contiguous_phi ? cfg.nphi : cfg.nlat_padded
+#     ny = cfg.shtype.contiguous_phi ? cfg.nlat_padded : cfg.nphi
+     
+#     v = CuMatrix{ComplexF64}(undef, nx, ny)
+#     synth!(cfg, qlm, v)
+#     return v
+# end
+
+# function synth_cplx(cfg::SHTnsCfg, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64})
+#     @assert cfg.shtype.gpu
+#     @assert cfg.nlat != 0
+#     @assert length(slm) == length(tlm) == cfg.nlm_cplx
+#     @assert cfg.lmax == cfg.mmax
+
+#     nx = cfg.shtype.contiguous_phi ? cfg.nphi : cfg.nlat_padded
+#     ny = cfg.shtype.contiguous_phi ? cfg.nlat_padded : cfg.nphi
+     
+#     utheta = CuMatrix{ComplexF64}(undef, nx, ny)
+#     uphi = CuMatrix{ComplexF64}(undef, nx, ny)
+#     synth!(cfg, slm, tlm, utheta, uphi)
+#     return utheta, uphi
+# end
+
+# function synth_cplx(cfg::SHTnsCfg, qlm::CuVector{ComplexF64}, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64})
+#     @assert cfg.shtype.gpu
+#     @assert cfg.nlat != 0
+#     @assert length(qlm) == length(slm) == length(tlm) == cfg.nlm_cplx
+#     @assert cfg.lmax == cfg.mmax
+
+#     nx = cfg.shtype.contiguous_phi ? cfg.nphi : cfg.nlat_padded
+#     ny = cfg.shtype.contiguous_phi ? cfg.nlat_padded : cfg.nphi
+     
+#     ur = CuMatrix{ComplexF64}(undef, nx, ny)
+#     utheta = CuMatrix{ComplexF64}(undef, nx, ny)
+#     uphi = CuMatrix{ComplexF64}(undef, nx, ny)
+#     synth!(cfg, qlm, slm, tlm, ur, utheta, uphi)
+#     return ur, utheta, uphi
+# end
+
+# function synth!(cfg::SHTnsCfg, qlm::CuVector{ComplexF64}, v::CuMatrix{ComplexF64})
+#     cu_SH_to_spat_cplx(cfg.cfg, qlm, v, cfg.lmax)
+#     return v
+# end
+
+# function synth!(cfg::SHTnsCfg, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64}, utheta::T, uphi::T) where {T<:CuMatrix{ComplexF64}}
+#     cu_SHsphtor_to_spat_cplx(cfg.cfg, slm, tlm, utheta, uphi, cfg.lmax)
+#     return utheta, uphi
+# end
+
+# function synth!(cfg::SHTnsCfg, qlm::CuVector{ComplexF64}, slm::CuVector{ComplexF64}, tlm::CuVector{ComplexF64}, ur::T, utheta::T, uphi::T) where {T<:CuMatrix{ComplexF64}}
+#     cu_SHqst_to_spat_cplx(cfg.cfg, qlm, slm, tlm, ur, utheta, uphi, cfg.lmax)
+#     return ur, utheta, uphi
+# end
+
@@ -82,16 +82,22 @@ for (type, enumtype) in [(:Gauss, :sht_gauss), (:RegFast, :sht_reg_fast), (:RegD
         
         """
         Base.@kwdef struct $(type)<:SHTnsType
-            contiguous_lat::Bool=false
+            contiguous_lat::Bool=true
             contiguous_phi::Bool=false
             padding::Bool=false
+            gpu::Bool=false
+            southpolefirst::Bool=false
+            float32::Bool=false
         end
 
         function Base.convert(::Type{shtns_type}, x::$(type)) 
             shtype = $(enumtype) 
-            x.contiguous_lat && (shtype += SHT_THETA_CONTIGUOUS)
             x.contiguous_phi && (shtype += SHT_PHI_CONTIGUOUS) 
             x.padding && (shtype += SHT_ALLOW_PADDING)
+            x.gpu && (shtype += SHT_ALLOW_GPU)
+            x.contiguous_lat && (shtype += SHT_THETA_CONTIGUOUS)
+            x.southpolefirst && (shtype += SHT_SOUTH_POLE_FIRST)
+            x.float32 && (shtype += SHT_FP32)
             return shtype
         end
     end
@@ -115,7 +121,7 @@ end
 
 Configuration of spherical harmonic transform.
 """
-mutable struct SHTnsCfg{N<:SHTnsNorm, T<:SHTnsType}
+mutable struct SHTnsCfg{TR<:Union{Real,Complex}, N<:SHTnsNorm, T<:SHTnsType}
     cfg::Ptr{shtns_info}
     norm::N
     shtype::T
@@ -134,23 +140,32 @@ mutable struct SHTnsCfg{N<:SHTnsNorm, T<:SHTnsType}
     st::Vector{Float64}
     nlat_padded::Int
     nlm_cplx::Int
+    howmany::Int
     function SHTnsCfg(lmax, mmax, mres, nlat, nphi; 
                         shtype::T=QuickInit(), 
                         norm::N=Orthonormal(), 
                         eps=1e-10, 
                         robert_form=false,
+                        howmany = 1,
+                        transform::Union{Type{Real}, Type{Complex}} = Real
                         ) where {T<:SHTnsType, N<:SHTnsNorm}
 
         _init_checks(shtype, lmax, mmax, mres, nlat, nphi)
         cfg = shtns_create(lmax, mmax, mres, norm)
         robert_form && shtns_robert_form(cfg,1)
+        if howmany > 1 
+            @assert transform == Real "Only real transform is supported for batched transforms"
+            info = unsafe_load(cfg)
+            spec_dist = transform == Real ? info.nlm : info.nlm_cplx
+            shtns_set_many(cfg, howmany, spec_dist)
+        end
         shtns_set_grid(cfg, shtype, eps, nlat, nphi)
         info = unsafe_load(cfg)
         li = Vector{Int}(unsafe_wrap(Vector{Cushort},info.li,Int(info.nlm)))
         mi = Vector{Int}(unsafe_wrap(Vector{Cushort},info.mi,Int(info.nlm)))
         ct = Vector{Float64}(unsafe_wrap(Vector{Cdouble},info.ct,Int(info.nlat)))
         st = Vector{Float64}(unsafe_wrap(Vector{Cdouble},info.st,Int(info.nlat)))
-        stream = new{N,T}(cfg, norm, shtype, robert_form, info.nlm, info.lmax, info.mmax, info.mres, info.nlat_2, info.nlat, info.nphi, info.nspat, li, mi, ct, st, info.nlat_padded, info.nlm_cplx)
+        stream = new{transform,N,T}(cfg, norm, shtype, robert_form, info.nlm, info.lmax, info.mmax, info.mres, info.nlat_2, info.nlat, info.nphi, info.nspat, li, mi, ct, st, info.nlat_padded, info.nlm_cplx, howmany)
         finalizer(x->shtns_destroy(x.cfg), stream)
         return stream
     end
@@ -160,6 +175,8 @@ mutable struct SHTnsCfg{N<:SHTnsNorm, T<:SHTnsType}
         eps=1e-10, 
         robert_form=false,
         nl_order = 0,
+        howmany = 1,
+        transform::Union{Type{Real}, Type{Complex}} = Real
         ) where {T<:SHTnsType, N<:SHTnsNorm}
 
         @assert lmax > 1 
@@ -169,13 +186,19 @@ mutable struct SHTnsCfg{N<:SHTnsNorm, T<:SHTnsType}
         cfg = shtns_create(lmax, mmax, mres, norm)
         robert_form && shtns_robert_form(cfg,1)
         info = unsafe_load(cfg)
+        if howmany > 1 
+            @assert transform == Real "Only real transform is supported for batched transforms"
+            info = unsafe_load(cfg)
+            spec_dist = transform == Real ? info.nlm : info.nlm_cplx
+            shtns_set_many(cfg, howmany, spec_dist)
+        end
         shtns_set_grid_auto(cfg, shtype, eps, nl_order, Ref(info.nlat), Ref(info.nphi))
         info = unsafe_load(cfg)
         li = Vector{Int}(unsafe_wrap(Vector{Cushort},info.li,Int(info.nlm)))
         mi = Vector{Int}(unsafe_wrap(Vector{Cushort},info.mi,Int(info.nlm)))
         ct = Vector{Float64}(unsafe_wrap(Vector{Cdouble},info.ct,Int(info.nlat)))
         st = Vector{Float64}(unsafe_wrap(Vector{Cdouble},info.st,Int(info.nlat)))
-        stream = new{N,T}(cfg, norm, shtype, robert_form, info.nlm, info.lmax, info.mmax, info.mres, info.nlat_2, info.nlat, info.nphi, info.nspat, li, mi, ct, st, info.nlat_padded, info.nlm_cplx)
+        stream = new{transform,N,T}(cfg, norm, shtype, robert_form, info.nlm, info.lmax, info.mmax, info.mres, info.nlat_2, info.nlat, info.nphi, info.nspat, li, mi, ct, st, info.nlat_padded, info.nlm_cplx, howmany)
         finalizer(x->shtns_destroy(x.cfg), stream)
         return stream
     end
@@ -223,6 +246,7 @@ const SHT_SCALAR_ONLY = UInt32(256 * 16)
 const SHT_LOAD_SAVE_CFG = UInt32(256 * 64)
 const SHT_ALLOW_GPU = UInt32(256 * 128)
 const SHT_ALLOW_PADDING = UInt32(256 * 256)
+const SHT_FP32 = UInt32(256 * 1024)
 
 include("sht.jl")
 include("tools.jl")