From 7263998028d9e19a6d7d7df3635a9c5e3e141d26 Mon Sep 17 00:00:00 2001 From: marius Date: Fri, 26 Feb 2021 02:23:44 -0800 Subject: [PATCH 1/4] WIP one-GPU-per-thread --- src/flat_proj.jl | 13 +++++++++---- src/gpu.jl | 10 +++++++--- src/util_fft.jl | 22 ++++++++++++++-------- 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/src/flat_proj.jl b/src/flat_proj.jl index 0998cf86..530470a8 100644 --- a/src/flat_proj.jl +++ b/src/flat_proj.jl @@ -24,10 +24,15 @@ end real_type(T) = promote_type(real(T), Float32) @init @require Unitful="1986cc42-f94f-5a68-af5c-568840ba703d" real_type(::Type{<:Unitful.Quantity{T}}) where {T} = real_type(T) +# for CuArray-backed fields, each GPU needs to have a copy of the +# ProjLambert quantities in its own memory, so we need to memoize per "deviceid" +# gpu-specialized version of this in gpu.jl +_deviceid(::Any) = 0 + ProjLambert(;Ny, Nx, θpix=1, center=(0,0), T=Float32, storage=Array) = ProjLambert(Ny, Nx, Float64(θpix), Float64.(center), real_type(T), storage) -@memoize function ProjLambert(Ny, Nx, θpix, center, ::Type{T}, storage) where {T} +@memoize function ProjLambert(Ny, Nx, θpix, center, ::Type{T}, storage, dev=_deviceid(storage)) where {T} Δx = T(deg2rad(θpix/60)) Δℓx = T(2π/(Nx*Δx)) @@ -36,9 +41,9 @@ ProjLambert(;Ny, Nx, θpix=1, center=(0,0), T=Float32, storage=Array) = Ωpix = T(Δx^2) ℓy = adapt(storage, (ifftshift(-Ny÷2:(Ny-1)÷2) .* Δℓy)[1:Ny÷2+1]) ℓx = adapt(storage, (ifftshift(-Nx÷2:(Nx-1)÷2) .* Δℓx)) - ℓmag = @. sqrt(ℓx'^2 + ℓy^2) - ϕ = @. angle(ℓx' + im*ℓy) - sin2ϕ, cos2ϕ = @. sin(2ϕ), cos(2ϕ) + ℓmag = adapt(storage, (@. sqrt(ℓx'^2 + ℓy^2))) + ϕ = adapt(storage, (@. angle(ℓx' + im*ℓy))) + sin2ϕ, cos2ϕ = adapt(storage, (@. sin(2ϕ), cos(2ϕ))) if iseven(Ny) sin2ϕ[end, end:-1:(Nx÷2+2)] .= sin2ϕ[end, 2:Nx÷2] end diff --git a/src/gpu.jl b/src/gpu.jl index 50f8afe0..35f58dc8 100644 --- a/src/gpu.jl +++ b/src/gpu.jl @@ -27,6 +27,9 @@ gpu(x) = adapt_structure(CuArray, x) adapt_structure(::CUDA.Float32Adaptor, proj::ProjLambert) = adapt_structure(CuArray{Float32}, proj) +_deviceid(::Type{<:CuArray}) = deviceid() + + function Cℓ_to_2D(Cℓ, proj::ProjLambert{T,<:CuArray}) where {T} # todo: remove needing to go through cpu here: gpu(Complex{T}.(nan2zero.(Cℓ.(cpu(proj.ℓmag))))) @@ -77,10 +80,11 @@ Random.randn!(rng::MersenneTwister, A::CuArray{T}) where {T} = # in the C2R case we pre-allocate the memory only once (via memoization) as well # as doing the copy asynchronously import CUDA.CUFFT: unsafe_execute! -using CUDA.CUFFT: rCuFFTPlan, cufftReal, cufftComplex, CUFFT_R2C, cufftExecR2C, cufftExecC2R, CUFFT_C2R, unsafe_copyto!, CuDefaultStream, pointer +using CUDA.CUFFT: rCuFFTPlan, cufftReal, cufftComplex, CUFFT_R2C, cufftExecR2C, + cufftExecC2R, CUFFT_C2R, unsafe_copyto!, pointer, stream plan_buffer(x) = plan_buffer(eltype(x),size(x)) -@memoize plan_buffer(T,dims) = CuArray{T}(undef,dims...) +@memoize plan_buffer(T, dims, dev=deviceid()) = CuArray{T}(undef,dims...) 
function unsafe_execute!(plan::rCuFFTPlan{cufftReal,K,false,N}, x::CuArray{cufftReal,N}, y::CuArray{cufftComplex,N} @@ -92,7 +96,7 @@ function unsafe_execute!(plan::rCuFFTPlan{cufftComplex,K,false,N}, x::CuArray{cufftComplex,N}, y::CuArray{cufftReal} ) where {K,N} @assert plan.xtype == CUFFT_C2R - cufftExecC2R(plan, unsafe_copyto!(pointer(plan_buffer(x)),pointer(x),length(x),async=true,stream=CuDefaultStream()), y) + cufftExecC2R(plan, unsafe_copyto!(pointer(plan_buffer(x)),pointer(x),length(x),async=true,stream=stream()), y) end # monkey-patched version of https://github.com/JuliaGPU/CUDA.jl/pull/436 diff --git a/src/util_fft.jl b/src/util_fft.jl index 1a62ece5..c3f4a8e1 100644 --- a/src/util_fft.jl +++ b/src/util_fft.jl @@ -17,20 +17,26 @@ FFTW_TIMELIMIT = 5 # a set of wrapper FFT functions which use a @memoize'd plan _fft_arr_type(arr) = basetype(typeof(parent(arr))) -m_rfft(arr::AbstractArray{T,N}, dims) where {T,N} = m_plan_rfft(_fft_arr_type(arr){T,N}, dims, size(arr)...) * arr -m_irfft(arr::AbstractArray{Complex{T},N}, d, dims) where {T,N} = m_plan_rfft(_fft_arr_type(arr){T,N}, dims, d, size(arr)[2:end]...) \ arr -m_rfft!(dst, arr::AbstractArray{T,N}, dims) where {T,N} = mul!(dst, m_plan_rfft(_fft_arr_type(arr){T,N}, dims, size(arr)...), arr) -m_irfft!(dst, arr::AbstractArray{Complex{T},N}, dims) where {T,N} = ldiv!(dst, m_plan_rfft(_fft_arr_type(arr){T,N}, dims, size(dst)...), copy_if_fftw(arr)) -@memoize function m_plan_rfft(::Type{A}, dims::Dims, sz...) where {T, N, A<:AbstractArray{T,N}, Dims} +m_rfft(arr::AbstractArray{T,N}, dims) where {T,N} = m_plan_rfft(_fft_arr_type(arr){T,N}, dims, size(arr)) * arr +m_irfft(arr::AbstractArray{Complex{T},N}, d, dims) where {T,N} = m_plan_rfft(_fft_arr_type(arr){T,N}, dims, (d, size(arr)[2:end]...)) \ arr +m_rfft!(dst, arr::AbstractArray{T,N}, dims) where {T,N} = mul!(dst, m_plan_rfft(_fft_arr_type(arr){T,N}, dims, size(arr)), arr) +m_irfft!(dst, arr::AbstractArray{Complex{T},N}, dims) where {T,N} = ldiv!(dst, m_plan_rfft(_fft_arr_type(arr){T,N}, dims, size(dst)), copy_if_fftw(arr)) +@memoize function m_plan_rfft(::Type{A}, dims::Dims, sz) where {T, N, A<:Array{T,N}, Dims} FFTW.set_num_threads(FFTW_NUM_THREADS) - plan_rfft( - A(undef, sz...), dims; (A <: Array ? (timelimit=FFTW_TIMELIMIT,) : ())... - ) #:: FFTW.rFFTWPlan{T,-1,false,N,Dims} # type assert to help inference in @memoized function + plan_rfft(A(undef, sz...), dims; timelimit=FFTW_TIMELIMIT) end # FFTW (but not MKL) destroys the input array for inplace inverse real # FFTs, so we need a copy. see https://github.com/JuliaMath/FFTW.jl/issues/158 copy_if_fftw(x) = FFTW.fftw_vendor==:fftw ? 
copy(x) : x +@init @require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba" begin + using .CUDA + copy_if_fftw(x::CuArray) = x + @memoize function m_plan_rfft(::Type{A}, dims::Dims, sz, dev=deviceid()) where {T, N, A<:CuArray{T,N}, Dims} + plan_rfft(A(undef, sz...), dims) + end +end + """ Convert a matrix A which is the output of a real FFT to a real vector, keeping From d2fa591a6730bfc717c6177ce82919c44800dc9e Mon Sep 17 00:00:00 2001 From: marius Date: Sat, 27 Feb 2021 13:46:51 -0800 Subject: [PATCH 2/4] add function to adapt to unified GPU memory --- src/gpu.jl | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/gpu.jl b/src/gpu.jl index 35f58dc8..e5805ce1 100644 --- a/src/gpu.jl +++ b/src/gpu.jl @@ -23,6 +23,16 @@ global_rng_for(::Type{<:CuArray}) = curand_rng() gpu(x) = adapt_structure(CuArray, x) +function adapt_structure(::Type{Mem.Unified}, x::Union{Array{T,N},CuArray{T,N}}) where {T,N} + buf = Mem.alloc(Mem.Unified, sizeof(T) * prod(size(x))) + y = unsafe_wrap(CuArray{T,N}, convert(CuPtr{T}, buf), size(x); own=false) + # TODO: need to write finalizer, right now this just leaks the unified memory + copyto!(y, x) + return y +end +unified_gpu(x) = adapt(Mem.Unified, x) + + adapt_structure(::CUDA.Float32Adaptor, proj::ProjLambert) = adapt_structure(CuArray{Float32}, proj) From f6ada72b2c2937d772fddd9bdb39e728286a28e8 Mon Sep 17 00:00:00 2001 From: marius Date: Tue, 27 Apr 2021 16:07:35 -0700 Subject: [PATCH 3/4] leave CUDA's unsafe_execute be --- src/gpu.jl | 45 ++++++++++++++------------------------------- 1 file changed, 14 insertions(+), 31 deletions(-) diff --git a/src/gpu.jl b/src/gpu.jl index 165d7697..81b2d295 100644 --- a/src/gpu.jl +++ b/src/gpu.jl @@ -117,37 +117,20 @@ using CUDA.CUFFT: rCuFFTPlan, cufftReal, cufftComplex, CUFFT_R2C, cufftExecR2C, plan_buffer(x) = plan_buffer(eltype(x),size(x)) @memoize plan_buffer(T, dims, dev=deviceid()) = CuArray{T}(undef,dims...) 
-function unsafe_execute!(plan::rCuFFTPlan{cufftReal,K,false,N},
-                         x::CuArray{cufftReal,N}, y::CuArray{cufftComplex,N}
-                        ) where {K,N}
-    @assert plan.xtype == CUFFT_R2C
-    cufftExecR2C(plan, x, y)
-end
-function unsafe_execute!(plan::rCuFFTPlan{cufftComplex,K,false,N},
-                         x::CuArray{cufftComplex,N}, y::CuArray{cufftReal}
-                        ) where {K,N}
-    @assert plan.xtype == CUFFT_C2R
-    cufftExecC2R(plan, unsafe_copyto!(pointer(plan_buffer(x)),pointer(x),length(x),async=true,stream=stream()), y)
-end
-
-# monkey-patched version of https://github.com/JuliaGPU/CUDA.jl/pull/436
-# until it hits a release
-using CUDA.CURAND: curandSetPseudoRandomGeneratorSeed, curandSetGeneratorOffset,
-    CURAND_STATUS_ALLOCATION_FAILED, CURAND_STATUS_PREEXISTING_FAILURE, CURAND_STATUS_SUCCESS,
-    unsafe_curandGenerateSeeds, throw_api_error, @retry_reclaim, RNG
-
-function Random.seed!(rng::RNG, seed=Base.rand(UInt64), offset=0)
-    curandSetPseudoRandomGeneratorSeed(rng, seed)
-    curandSetGeneratorOffset(rng, offset)
-    res = @retry_reclaim err->isequal(err, CURAND_STATUS_ALLOCATION_FAILED) ||
-                              isequal(err, CURAND_STATUS_PREEXISTING_FAILURE) begin
-        unsafe_curandGenerateSeeds(rng)
-    end
-    if res != CURAND_STATUS_SUCCESS
-        throw_api_error(res)
-    end
-    return
-end
+## might want to bring this back but need to adapt to newer CUDA versions
+
+# function unsafe_execute!(plan::rCuFFTPlan{cufftReal,K,false,N},
+#                          x::CuArray{cufftReal,N}, y::CuArray{cufftComplex,N}
+#                         ) where {K,N}
+#     @assert plan.xtype == CUFFT_R2C
+#     cufftExecR2C(plan, x, y)
+# end
+# function unsafe_execute!(plan::rCuFFTPlan{cufftComplex,K,false,N},
+#                          x::CuArray{cufftComplex,N}, y::CuArray{cufftReal}
+#                         ) where {K,N}
+#     @assert plan.xtype == CUFFT_C2R
+#     cufftExecC2R(plan, unsafe_copyto!(pointer(plan_buffer(x)),pointer(x),length(x),async=true,stream=stream()), y)
+# end
 
 """

From 2e875d6345eed6d5347dd98a64f683fce5d719a9 Mon Sep 17 00:00:00 2001
From: marius
Date: Wed, 28 Apr 2021 11:47:11 -0700
Subject: [PATCH 4/4] turn off TimerOutputs when using threads

---
 src/CMBLensing.jl | 2 +-
 src/util.jl       | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/CMBLensing.jl b/src/CMBLensing.jl
index 6624ade5..65aba5b5 100644
--- a/src/CMBLensing.jl
+++ b/src/CMBLensing.jl
@@ -45,7 +45,7 @@ using StaticArrays: @SMatrix, @SVector, SMatrix, StaticArray, StaticArrayStyle,
     StaticMatrix, StaticVector, SVector, SArray, SizedArray
 using Statistics
 using StatsBase
-using TimerOutputs: @timeit, get_defaulttimer, reset_timer!
+using TimerOutputs: @timeit, get_defaulttimer, reset_timer!, disable_timer!
 using UnPack
 using Zygote
 using Zygote: unbroadcast, Numeric, @adjoint, @nograd
diff --git a/src/util.jl b/src/util.jl
index b0dc0ffb..b1c18416 100644
--- a/src/util.jl
+++ b/src/util.jl
@@ -381,6 +381,10 @@ macro show⌛(ex)
     end
 end
 
+@init if Threads.nthreads() > 1
+    disable_timer!(get_defaulttimer())
+end
+
 
 # used in a couple of places to create a Base.promote_rule-like system
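
Note on PATCH 1/4: the point of threading a device id through the @memoize'd constructors (ProjLambert, plan_buffer, m_plan_rfft) is that each GPU gets its own device-local copies rather than all devices sharing one cached entry. The sketch below shows the intended one-GPU-per-thread usage; it is illustrative only, assumes Julia was started with at least ndevices() threads, and the 256 / θpix=2 numbers are arbitrary placeholders.

# sketch: one Julia thread drives one GPU; since the memoization key now
# includes deviceid(), each thread builds and reuses its own ProjLambert
# (and FFT plan buffers) in that device's memory
using CUDA
using CMBLensing: ProjLambert

Threads.@threads for i in 1:ndevices()
    device!(i - 1)   # CUDA device ordinals are 0-based
    proj = ProjLambert(Ny=256, Nx=256, θpix=2, T=Float32, storage=CuArray)
    @assert proj.ℓmag isa CuArray   # allocated on the currently-active device
end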
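
Note on PATCH 2/4: the TODO in adapt_structure(::Type{Mem.Unified}, x) flags that the unified buffer is never freed. One possible shape of the fix is sketched below: register a finalizer on the wrapping CuArray that frees the buffer. This is a sketch, not part of the patch — whether Mem.free is safe to call at finalization time here (stream/context ordering) is left aside, and in src/gpu.jl the imports shown are already in scope.

using CUDA: CuArray, CuPtr, Mem
import Adapt: adapt_structure

function adapt_structure(::Type{Mem.Unified}, x::Union{Array{T,N},CuArray{T,N}}) where {T,N}
    buf = Mem.alloc(Mem.Unified, sizeof(T) * prod(size(x)))
    y = unsafe_wrap(CuArray{T,N}, convert(CuPtr{T}, buf), size(x); own=false)
    finalizer(_ -> Mem.free(buf), y)   # free the unified buffer once `y` is collected
    copyto!(y, x)
    return y
end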