marius311 · marius311 · Feb 26, 2021 · Feb 27, 2021 · Apr 15, 2021 · Apr 26, 2021
diff --git a/.github/workflows/runtests.yml b/.github/workflows/runtests.yml
@@ -7,7 +7,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        julia-version: ['1.5', '1.6', '1.7-nightly']
+        julia-version: ['1.6', '1.7-nightly']
         os: [ubuntu-latest]
         JULIA_FFTW_PROVIDER: ['MKL','FFTW']
       fail-fast: false

diff --git a/Project.toml b/Project.toml
@@ -53,7 +53,7 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 [compat]
 AbstractFFTs = "0.5, 1"
 Adapt = "1.0.1, 2, 3"
-CUDA = "2.1"
+CUDA = "3"
 Combinatorics = "1"
 DataStructures = "0.17.9, 0.18"
 FFTW = "1.2 - 1.3"
@@ -83,5 +83,5 @@ StaticArrays = "0.12.1, 1.0"
 StatsBase = "0.32, 0.33"
 TimerOutputs = "0.5"
 UnPack = "1"
-Zygote = "0.4.14, 0.5, 0.6 - 0.6.4"
-julia = "1.5"
+Zygote = "0.6.9"
+julia = "1.6"
diff --git a/README.md b/README.md
@@ -9,11 +9,11 @@
 CMBLensing.jl is a next-generation tool for analysis of the lensed Cosmic Microwave Background. It is written in [Julia](https://julialang.org/) and transparently callable from Python.
 
 
-At its heart, CMBLensing.jl maximizes or samples the Bayesian posterior for the CMB lensing problem. It also contains tools to quickly manipulate and process CMB maps, set up modified posteriors, and take gradients using automatic differentation.
+At its heart, CMBLensing.jl maximizes or samples the Bayesian posterior for the CMB lensing problem. It also contains tools to quickly manipulate and process CMB maps, set up modified posteriors, and take gradients using automatic differentiation.
 
 ### Highlights
-* Fully Nvidia GPU compatible (speedups over CPU are currently 3x-10x, depending on the problem size and hardware).
-* Automatic differentation (via [Zygote.jl](https://fluxml.ai/Zygote.jl/)) provides for-free gradients of your custom posteriors.
+* Fully Nvidia GPU compatible (1-2 order of magnitude speedups over CPU, depending on the problem size and hardware).
+* Automatic differentiation (via [Zygote.jl](https://fluxml.ai/Zygote.jl/)) provides for-free gradients of your custom posteriors.
 * Includes the following algorithms to lense a map:
     * `LenseFlow` ([Millea, Anderes, & Wandelt 2017](https://arxiv.org/abs/1708.06753))
     * `Taylens` ([Næss & Louis 2013](https://arxiv.org/abs/1307.0719))
@@ -29,14 +29,14 @@ The best place to get started is to read the [documentation](https://cosmicmar.c
 
 Most of the pages in the documentation are Jupyter notebooks, and you can click the "launch binder" link at the top of each page to launch a Jupyterlab server running the notebook in your browser (courtesy of [binder](https://mybinder.org/)). 
 
-You can also clone the repostiory and open the notebooks in [docs/src](https://github.com/marius311/CMBLensing.jl/tree/master/docs/src) if you want to run them locally (which will usually lead to higher performance). The notebooks are stored as `.md` files rather than `.ipynb` format. Its recommented to install [Jupytext](jupytext) (`pip install jupytext`) and then you can run these `.md` directly from Jupyterlab by right-clicking on them and selecting `Open With -> Notebook`. Otherwise, run the script `docs/make_notebooks.sh` to convert the `.md` files to `.ipynb` which you can then open as desired. 
+You can also clone the repostiory and open the notebooks in [docs/src](https://github.com/marius311/CMBLensing.jl/tree/master/docs/src) if you want to run them locally (which will usually lead to higher performance). The notebooks are stored as `.md` files rather than `.ipynb` format. Its recommended to install [Jupytext](jupytext) (`pip install jupytext`) and then you can run these `.md` directly from Jupyterlab by right-clicking on them and selecting `Open With -> Notebook`. Otherwise, run the script `docs/make_notebooks.sh` to convert the `.md` files to `.ipynb` which you can then open as desired. 
 
 
 ## Installation
 
 ### Requirements
 
-* Julia 1.5
+* Julia 1.6
 * _(recommended)_ An Nvidia GPU and [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) for GPU support
 * _(recommended)_ FFTW.jl built with [`JULIA_FFTW_PROVIDER=MKL`](https://juliamath.github.io/FFTW.jl/stable/#Installation-1) for faster CPU FFTs 
 * _(recommended)_ Python 3 + matplotlib (used for plotting)

diff --git a/src/CMBLensing.jl b/src/CMBLensing.jl
@@ -45,7 +45,7 @@ using StaticArrays: @SMatrix, @SVector, SMatrix, StaticArray, StaticArrayStyle,
     StaticMatrix, StaticVector, SVector, SArray, SizedArray
 using Statistics
 using StatsBase
-using TimerOutputs: @timeit, get_defaulttimer, reset_timer!
+using TimerOutputs: @timeit, get_defaulttimer, reset_timer!, disable_timer!
 using UnPack
 using Zygote
 using Zygote: unbroadcast, Numeric, @adjoint, @nograd

diff --git a/src/autodiff.jl b/src/autodiff.jl
@@ -66,7 +66,7 @@ Zygote.accum(f::BaseField, nt::NamedTuple{(:arr,:metadata)}) = (@assert(isnothin
 @adjoint dot(f::Field{B1}, g::Field{B2}) where {B1,B2} = dot(f,g), Δ -> (Δ*B1(g), Δ*B2(f))
 @adjoint (*)(f::Adjoint{<:Any,<:Field}, g::Field) = Zygote.pullback((f,g)->dot(f',g),f,g)
 # ℝᴺˣᴺ -> ℝ¹ 
-@adjoint logdet(L::ParamDependentOp, θ) = Zygote._pullback(θ->logdet(L(θ)), θ)
+@adjoint logdet(L::ParamDependentOp, θ) = Zygote.pullback((L,θ)->logdet(L(θ)), L, θ)
 @adjoint logdet(L::DiagOp) = logdet(L), Δ -> (Δ * Zfac(L) * pinv(L)',)
 
 # basis conversion
@@ -110,9 +110,8 @@ end
 @adjoint function *(x::FieldOrOpRowVector, y::FieldVector)
     z = x * y
     # when x is a vector of Fields
-    back(Δ) = (Δ * y', x' * Δ)
-    # when x is a vector of Diagonals. in this case, Δ * basis(Δ)(y)' create an
-    # OuterProdOp in the same basis as the Diagonals in x
+    back(Δ::Real) = ((Δ * y)', x' * Δ)
+    # when x is a vector of Diagonals. in this case, Δ * basis(Δ)(y)'
     back(Δ::Field{B}) where {B} = (Δ * basis(Δ)(y)'), (x' * Δ)
     z, back
 end

diff --git a/src/flat_proj.jl b/src/flat_proj.jl
@@ -24,10 +24,15 @@ end
 real_type(T) = promote_type(real(T), Float32)
 @init @require Unitful="1986cc42-f94f-5a68-af5c-568840ba703d" real_type(::Type{<:Unitful.Quantity{T}}) where {T} = real_type(T)
 
+# for CuArray-backed fields, each GPU needs to have a copy of the
+# ProjLambert quantities in its own memory, so we need to memoize per "deviceid"
+# gpu-specialized version of this in gpu.jl 
+_deviceid(::Any) = 0
+
 ProjLambert(;Ny, Nx, θpix=1, center=(0,0), T=Float32, storage=Array) = 
     ProjLambert(Ny, Nx, Float64(θpix), Float64.(center), real_type(T), storage)
 
-@memoize function ProjLambert(Ny, Nx, θpix, center, ::Type{T}, storage) where {T}
+@memoize function ProjLambert(Ny, Nx, θpix, center, ::Type{T}, storage, dev=_deviceid(storage)) where {T}
 
     Δx           = T(deg2rad(θpix/60))
     Δℓx          = T(2π/(Nx*Δx))
@@ -36,9 +41,9 @@ ProjLambert(;Ny, Nx, θpix=1, center=(0,0), T=Float32, storage=Array) =
     Ωpix         = T(Δx^2)
     ℓy           = adapt(storage, (ifftshift(-Ny÷2:(Ny-1)÷2) .* Δℓy)[1:Ny÷2+1])
     ℓx           = adapt(storage, (ifftshift(-Nx÷2:(Nx-1)÷2) .* Δℓx))
-    ℓmag         = @. sqrt(ℓx'^2 + ℓy^2)
-    ϕ            = @. angle(ℓx' + im*ℓy)
-    sin2ϕ, cos2ϕ = @. sin(2ϕ), cos(2ϕ)
+    ℓmag         = adapt(storage, (@. sqrt(ℓx'^2 + ℓy^2)))
+    ϕ            = adapt(storage, (@. angle(ℓx' + im*ℓy)))
+    sin2ϕ, cos2ϕ = adapt(storage, (@. sin(2ϕ), cos(2ϕ)))
     if iseven(Ny)
         sin2ϕ[end, end:-1:(Nx÷2+2)] .= sin2ϕ[end, 2:Nx÷2]
     end

diff --git a/src/gpu.jl b/src/gpu.jl
@@ -1,6 +1,6 @@
 
 using CUDA
-using CUDA: cufunc, curand_rng
+using CUDA: curand_rng
 using CUDA.CUSPARSE: CuSparseMatrix, CuSparseMatrixCSR, CuSparseMatrixCOO
 using CUDA.CUSOLVER: CuQR
 
@@ -40,6 +40,16 @@ to Float32. Equivalent to `adapt_structure(CuArray, x)`. Returns nothing.
 """
 gpu(x) = adapt_structure(CuArray, x)
 
+function adapt_structure(::Type{Mem.Unified}, x::Union{Array{T,N},CuArray{T,N}}) where {T,N}
+    buf = Mem.alloc(Mem.Unified, sizeof(T) * prod(size(x)))
+    y = unsafe_wrap(CuArray{T,N}, convert(CuPtr{T}, buf), size(x); own=false)
+    # TODO: need to write finalizer, right now this just leaks the unified memory
+    copyto!(y, x)
+    return y
+end
+unified_gpu(x) = adapt(Mem.Unified, x)
+
+
 
 @doc doc"""
 
@@ -57,6 +67,9 @@ end
 adapt_structure(::CUDA.Float32Adaptor, proj::ProjLambert) = adapt_structure(CuArray{Float32}, proj)
 
 
+_deviceid(::Type{<:CuArray}) = deviceid()
+
+
 function Cℓ_to_2D(Cℓ, proj::ProjLambert{T,<:CuArray}) where {T}
     # todo: remove needing to go through cpu here:
     gpu(Complex{T}.(nan2zero.(Cℓ.(cpu(proj.ℓmag)))))
@@ -72,17 +85,6 @@ fill!(f::CuBaseField, x) = (fill!(f.arr,x); f)
 ==(a::CuBaseField, b::CuBaseField) = (==)(promote(a.arr, b.arr)...)
 sum(f::CuBaseField; dims=:) = (dims == :) ? sum(f.arr) : (1 in dims) ? error("Sum over invalid dims of CuFlatS0.") : f
 
-
-# these only work for Reals in CUDA
-# with these definitions, they work for Complex as well
-CUDA.isfinite(x::Complex) = Base.isfinite(x)
-CUDA.sqrt(x::Complex) = CUDA.sqrt(CUDA.abs(x)) * CUDA.exp(im*CUDA.angle(x)/2)
-CUDA.culiteral_pow(::typeof(^), x::Complex, ::Val{2}) = x * x
-CUDA.pow(x::Complex, p) = x^p
-
-# until https://github.com/JuliaGPU/CUDA.jl/pull/618 (CUDA 2.5)
-CUDA.cufunc(::typeof(angle)) = CUDA.angle
-
 # adapting of SparseMatrixCSC ↔ CuSparseMatrixCSR (otherwise dense arrays created)
 adapt_structure(::Type{<:CuArray}, L::SparseMatrixCSC)   = CuSparseMatrixCSR(L)
 adapt_structure(::Type{<:Array},   L::CuSparseMatrixCSR) = SparseMatrixCSC(L)
@@ -109,42 +111,26 @@ Random.randn!(rng::MersenneTwister, A::CuArray) =
 # in the C2R case we pre-allocate the memory only once (via memoization) as well
 # as doing the copy asynchronously
 import CUDA.CUFFT: unsafe_execute!
-using CUDA.CUFFT: rCuFFTPlan, cufftReal, cufftComplex, CUFFT_R2C, cufftExecR2C, cufftExecC2R, CUFFT_C2R, unsafe_copyto!, CuDefaultStream, pointer
+using CUDA.CUFFT: rCuFFTPlan, cufftReal, cufftComplex, CUFFT_R2C, cufftExecR2C, 
+    cufftExecC2R, CUFFT_C2R, unsafe_copyto!, pointer, stream
 
 plan_buffer(x) = plan_buffer(eltype(x),size(x))
-@memoize plan_buffer(T,dims) = CuArray{T}(undef,dims...)
-
-function unsafe_execute!(plan::rCuFFTPlan{cufftReal,K,false,N},
-                            x::CuArray{cufftReal,N}, y::CuArray{cufftComplex,N}
-                            ) where {K,N}
-    @assert plan.xtype == CUFFT_R2C
-    cufftExecR2C(plan, x, y)
-end
-function unsafe_execute!(plan::rCuFFTPlan{cufftComplex,K,false,N},
-                            x::CuArray{cufftComplex,N}, y::CuArray{cufftReal}
-                            ) where {K,N}
-    @assert plan.xtype == CUFFT_C2R
-    cufftExecC2R(plan, unsafe_copyto!(pointer(plan_buffer(x)),pointer(x),length(x),async=true,stream=CuDefaultStream()), y)
-end
-
-# monkey-patched version of https://github.com/JuliaGPU/CUDA.jl/pull/436
-# until it hits a release
-using CUDA.CURAND: curandSetPseudoRandomGeneratorSeed, curandSetGeneratorOffset, 
-    CURAND_STATUS_ALLOCATION_FAILED, CURAND_STATUS_PREEXISTING_FAILURE, CURAND_STATUS_SUCCESS,
-    unsafe_curandGenerateSeeds, throw_api_error, @retry_reclaim, RNG
-
-function Random.seed!(rng::RNG, seed=Base.rand(UInt64), offset=0)
-    curandSetPseudoRandomGeneratorSeed(rng, seed)
-    curandSetGeneratorOffset(rng, offset)
-    res = @retry_reclaim err->isequal(err, CURAND_STATUS_ALLOCATION_FAILED) ||
-                              isequal(err, CURAND_STATUS_PREEXISTING_FAILURE) begin
-        unsafe_curandGenerateSeeds(rng)
-    end
-    if res != CURAND_STATUS_SUCCESS
-        throw_api_error(res)
-    end
-    return
-end
+@memoize plan_buffer(T, dims, dev=deviceid()) = CuArray{T}(undef,dims...)
+
+## might want to bring this back but need to adapt to newer CUDA verions
+
+# function unsafe_execute!(plan::rCuFFTPlan{cufftReal,K,false,N},
+#                             x::CuArray{cufftReal,N}, y::CuArray{cufftComplex,N}
+#                             ) where {K,N}
+#     @assert plan.xtype == CUFFT_R2C
+#     cufftExecR2C(plan, x, y)
+# end
+# function unsafe_execute!(plan::rCuFFTPlan{cufftComplex,K,false,N},
+#                             x::CuArray{cufftComplex,N}, y::CuArray{cufftReal}
+#                             ) where {K,N}
+#     @assert plan.xtype == CUFFT_C2R
+#     cufftExecC2R(plan, unsafe_copyto!(pointer(plan_buffer(x)),pointer(x),length(x),async=true,stream=stream()), y)
+# end
 
 
 """

diff --git a/src/util.jl b/src/util.jl
@@ -381,6 +381,10 @@ macro show⌛(ex)
     end
 end
 
+@init if Threads.nthreads() > 1
+    disable_timer!(get_defaulttimer())
+end
+
 
 
 # used in a couple of places to create a Base.promote_rule-like system

diff --git a/src/util_fft.jl b/src/util_fft.jl
@@ -17,20 +17,26 @@ FFTW_TIMELIMIT = 5
 
 # a set of wrapper FFT functions which use a @memoize'd plan
 _fft_arr_type(arr) = basetype(typeof(parent(arr)))
-m_rfft(arr::AbstractArray{T,N}, dims) where {T,N} = m_plan_rfft(_fft_arr_type(arr){T,N}, dims, size(arr)...) * arr
-m_irfft(arr::AbstractArray{Complex{T},N}, d, dims) where {T,N} = m_plan_rfft(_fft_arr_type(arr){T,N}, dims, d, size(arr)[2:end]...) \ arr
-m_rfft!(dst, arr::AbstractArray{T,N}, dims) where {T,N} = mul!(dst, m_plan_rfft(_fft_arr_type(arr){T,N}, dims, size(arr)...), arr)
-m_irfft!(dst, arr::AbstractArray{Complex{T},N}, dims) where {T,N} = ldiv!(dst, m_plan_rfft(_fft_arr_type(arr){T,N}, dims, size(dst)...), copy_if_fftw(arr))
-@memoize function m_plan_rfft(::Type{A}, dims::Dims, sz...) where {T, N, A<:AbstractArray{T,N}, Dims}
+m_rfft(arr::AbstractArray{T,N}, dims) where {T,N} = m_plan_rfft(_fft_arr_type(arr){T,N}, dims, size(arr)) * arr
+m_irfft(arr::AbstractArray{Complex{T},N}, d, dims) where {T,N} = m_plan_rfft(_fft_arr_type(arr){T,N}, dims, (d, size(arr)[2:end]...)) \ arr
+m_rfft!(dst, arr::AbstractArray{T,N}, dims) where {T,N} = mul!(dst, m_plan_rfft(_fft_arr_type(arr){T,N}, dims, size(arr)), arr)
+m_irfft!(dst, arr::AbstractArray{Complex{T},N}, dims) where {T,N} = ldiv!(dst, m_plan_rfft(_fft_arr_type(arr){T,N}, dims, size(dst)), copy_if_fftw(arr))
+@memoize function m_plan_rfft(::Type{A}, dims::Dims, sz) where {T, N, A<:Array{T,N}, Dims}
     FFTW.set_num_threads(FFTW_NUM_THREADS)
-    plan_rfft(
-        A(undef, sz...), dims; (A <: Array ? (timelimit=FFTW_TIMELIMIT,) : ())...
-    ) #:: FFTW.rFFTWPlan{T,-1,false,N,Dims} # type assert to help inference in @memoized function
+    plan_rfft(A(undef, sz...), dims; timelimit=FFTW_TIMELIMIT)
 end
 # FFTW (but not MKL) destroys the input array for inplace inverse real
 # FFTs, so we need a copy. see https://github.com/JuliaMath/FFTW.jl/issues/158
 copy_if_fftw(x) = FFTW.fftw_vendor==:fftw ? copy(x) : x
 
+@init @require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba" begin
+    using .CUDA
+    copy_if_fftw(x::CuArray) = x
+    @memoize function m_plan_rfft(::Type{A}, dims::Dims, sz, dev=deviceid()) where {T, N, A<:CuArray{T,N}, Dims}
+        plan_rfft(A(undef, sz...), dims)
+    end
+end
+
 
 """
 Convert a matrix A which is the output of a real FFT to a real vector, keeping

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -333,8 +333,8 @@ end
 
         f = maybegpu(FlatMap(rand(Nside...)))
 
-        @test                  @inferred(MidPass(100,200) .* Diagonal(Fourier(f))) isa Diagonal
-        @test_throws Exception           MidPass(100,200) .* Diagonal(        f)
+        @test                  (MidPass(100,200) .* Diagonal(Fourier(f))) isa Diagonal
+        @test_throws Exception  MidPass(100,200) .* Diagonal(        f)
 
     end
 
@@ -404,7 +404,7 @@ end;
         ]
 
             Ny,Nx = Nside
-            Ixs = collect(rand(Nside...) for i=1:Npol)
+            Ixs = collect(maybegpu(rand(Nside...)) for i=1:Npol)
             Ils = rfft.(Ixs)
             f,g,h = @repeated(maybegpu(FMap(Ixs...)),3)
             v = @SVector[f,f]
-Original file line number
+Diff line change
@@ Expand Up / @@ -381,6 +381,10 @@ macro show⌛(ex) @@
         end
     end
+    @init if Threads.nthreads() > 1
+        disable_timer!(get_defaulttimer())
+    end
     # used in a couple of places to create a Base.promote_rule-like system
@@ Expand Down @@