From 7263998028d9e19a6d7d7df3635a9c5e3e141d26 Mon Sep 17 00:00:00 2001 From: marius Date: Fri, 26 Feb 2021 02:23:44 -0800 Subject: [PATCH 1/4] WIP one-GPU-per-thread --- src/flat_proj.jl | 13 +++++++++---- src/gpu.jl | 10 +++++++--- src/util_fft.jl | 22 ++++++++++++++-------- 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/src/flat_proj.jl b/src/flat_proj.jl index 0998cf86..530470a8 100644 --- a/src/flat_proj.jl +++ b/src/flat_proj.jl @@ -24,10 +24,15 @@ end real_type(T) = promote_type(real(T), Float32) @init @require Unitful="1986cc42-f94f-5a68-af5c-568840ba703d" real_type(::Type{<:Unitful.Quantity{T}}) where {T} = real_type(T) +# for CuArray-backed fields, each GPU needs to have a copy of the +# ProjLambert quantities in its own memory, so we need to memoize per "deviceid" +# gpu-specialized version of this in gpu.jl +_deviceid(::Any) = 0 + ProjLambert(;Ny, Nx, θpix=1, center=(0,0), T=Float32, storage=Array) = ProjLambert(Ny, Nx, Float64(θpix), Float64.(center), real_type(T), storage) -@memoize function ProjLambert(Ny, Nx, θpix, center, ::Type{T}, storage) where {T} +@memoize function ProjLambert(Ny, Nx, θpix, center, ::Type{T}, storage, dev=_deviceid(storage)) where {T} Δx = T(deg2rad(θpix/60)) Δℓx = T(2π/(Nx*Δx)) @@ -36,9 +41,9 @@ ProjLambert(;Ny, Nx, θpix=1, center=(0,0), T=Float32, storage=Array) = Ωpix = T(Δx^2) ℓy = adapt(storage, (ifftshift(-Ny÷2:(Ny-1)÷2) .* Δℓy)[1:Ny÷2+1]) ℓx = adapt(storage, (ifftshift(-Nx÷2:(Nx-1)÷2) .* Δℓx)) - ℓmag = @. sqrt(ℓx'^2 + ℓy^2) - ϕ = @. angle(ℓx' + im*ℓy) - sin2ϕ, cos2ϕ = @. sin(2ϕ), cos(2ϕ) + ℓmag = adapt(storage, (@. sqrt(ℓx'^2 + ℓy^2))) + ϕ = adapt(storage, (@. angle(ℓx' + im*ℓy))) + sin2ϕ, cos2ϕ = adapt(storage, (@. sin(2ϕ), cos(2ϕ))) if iseven(Ny) sin2ϕ[end, end:-1:(Nx÷2+2)] .= sin2ϕ[end, 2:Nx÷2] end diff --git a/src/gpu.jl b/src/gpu.jl index 50f8afe0..35f58dc8 100644 --- a/src/gpu.jl +++ b/src/gpu.jl @@ -27,6 +27,9 @@ gpu(x) = adapt_structure(CuArray, x) adapt_structure(::CUDA.Float32Adaptor, proj::ProjLambert) = adapt_structure(CuArray{Float32}, proj) +_deviceid(::Type{<:CuArray}) = deviceid() + + function Cℓ_to_2D(Cℓ, proj::ProjLambert{T,<:CuArray}) where {T} # todo: remove needing to go through cpu here: gpu(Complex{T}.(nan2zero.(Cℓ.(cpu(proj.ℓmag))))) @@ -77,10 +80,11 @@ Random.randn!(rng::MersenneTwister, A::CuArray{T}) where {T} = # in the C2R case we pre-allocate the memory only once (via memoization) as well # as doing the copy asynchronously import CUDA.CUFFT: unsafe_execute! -using CUDA.CUFFT: rCuFFTPlan, cufftReal, cufftComplex, CUFFT_R2C, cufftExecR2C, cufftExecC2R, CUFFT_C2R, unsafe_copyto!, CuDefaultStream, pointer +using CUDA.CUFFT: rCuFFTPlan, cufftReal, cufftComplex, CUFFT_R2C, cufftExecR2C, + cufftExecC2R, CUFFT_C2R, unsafe_copyto!, pointer, stream plan_buffer(x) = plan_buffer(eltype(x),size(x)) -@memoize plan_buffer(T,dims) = CuArray{T}(undef,dims...) +@memoize plan_buffer(T, dims, dev=deviceid()) = CuArray{T}(undef,dims...) 
function unsafe_execute!(plan::rCuFFTPlan{cufftReal,K,false,N}, x::CuArray{cufftReal,N}, y::CuArray{cufftComplex,N} @@ -92,7 +96,7 @@ function unsafe_execute!(plan::rCuFFTPlan{cufftComplex,K,false,N}, x::CuArray{cufftComplex,N}, y::CuArray{cufftReal} ) where {K,N} @assert plan.xtype == CUFFT_C2R - cufftExecC2R(plan, unsafe_copyto!(pointer(plan_buffer(x)),pointer(x),length(x),async=true,stream=CuDefaultStream()), y) + cufftExecC2R(plan, unsafe_copyto!(pointer(plan_buffer(x)),pointer(x),length(x),async=true,stream=stream()), y) end # monkey-patched version of https://github.com/JuliaGPU/CUDA.jl/pull/436 diff --git a/src/util_fft.jl b/src/util_fft.jl index 1a62ece5..c3f4a8e1 100644 --- a/src/util_fft.jl +++ b/src/util_fft.jl @@ -17,20 +17,26 @@ FFTW_TIMELIMIT = 5 # a set of wrapper FFT functions which use a @memoize'd plan _fft_arr_type(arr) = basetype(typeof(parent(arr))) -m_rfft(arr::AbstractArray{T,N}, dims) where {T,N} = m_plan_rfft(_fft_arr_type(arr){T,N}, dims, size(arr)...) * arr -m_irfft(arr::AbstractArray{Complex{T},N}, d, dims) where {T,N} = m_plan_rfft(_fft_arr_type(arr){T,N}, dims, d, size(arr)[2:end]...) \ arr -m_rfft!(dst, arr::AbstractArray{T,N}, dims) where {T,N} = mul!(dst, m_plan_rfft(_fft_arr_type(arr){T,N}, dims, size(arr)...), arr) -m_irfft!(dst, arr::AbstractArray{Complex{T},N}, dims) where {T,N} = ldiv!(dst, m_plan_rfft(_fft_arr_type(arr){T,N}, dims, size(dst)...), copy_if_fftw(arr)) -@memoize function m_plan_rfft(::Type{A}, dims::Dims, sz...) where {T, N, A<:AbstractArray{T,N}, Dims} +m_rfft(arr::AbstractArray{T,N}, dims) where {T,N} = m_plan_rfft(_fft_arr_type(arr){T,N}, dims, size(arr)) * arr +m_irfft(arr::AbstractArray{Complex{T},N}, d, dims) where {T,N} = m_plan_rfft(_fft_arr_type(arr){T,N}, dims, (d, size(arr)[2:end]...)) \ arr +m_rfft!(dst, arr::AbstractArray{T,N}, dims) where {T,N} = mul!(dst, m_plan_rfft(_fft_arr_type(arr){T,N}, dims, size(arr)), arr) +m_irfft!(dst, arr::AbstractArray{Complex{T},N}, dims) where {T,N} = ldiv!(dst, m_plan_rfft(_fft_arr_type(arr){T,N}, dims, size(dst)), copy_if_fftw(arr)) +@memoize function m_plan_rfft(::Type{A}, dims::Dims, sz) where {T, N, A<:Array{T,N}, Dims} FFTW.set_num_threads(FFTW_NUM_THREADS) - plan_rfft( - A(undef, sz...), dims; (A <: Array ? (timelimit=FFTW_TIMELIMIT,) : ())... - ) #:: FFTW.rFFTWPlan{T,-1,false,N,Dims} # type assert to help inference in @memoized function + plan_rfft(A(undef, sz...), dims; timelimit=FFTW_TIMELIMIT) end # FFTW (but not MKL) destroys the input array for inplace inverse real # FFTs, so we need a copy. see https://github.com/JuliaMath/FFTW.jl/issues/158 copy_if_fftw(x) = FFTW.fftw_vendor==:fftw ? 
copy(x) : x +@init @require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba" begin + using .CUDA + copy_if_fftw(x::CuArray) = x + @memoize function m_plan_rfft(::Type{A}, dims::Dims, sz, dev=deviceid()) where {T, N, A<:CuArray{T,N}, Dims} + plan_rfft(A(undef, sz...), dims) + end +end + """ Convert a matrix A which is the output of a real FFT to a real vector, keeping From d2fa591a6730bfc717c6177ce82919c44800dc9e Mon Sep 17 00:00:00 2001 From: marius Date: Sat, 27 Feb 2021 13:46:51 -0800 Subject: [PATCH 2/4] add function to adapt to unified GPU memory --- src/gpu.jl | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/gpu.jl b/src/gpu.jl index 35f58dc8..e5805ce1 100644 --- a/src/gpu.jl +++ b/src/gpu.jl @@ -23,6 +23,16 @@ global_rng_for(::Type{<:CuArray}) = curand_rng() gpu(x) = adapt_structure(CuArray, x) +function adapt_structure(::Type{Mem.Unified}, x::Union{Array{T,N},CuArray{T,N}}) where {T,N} + buf = Mem.alloc(Mem.Unified, sizeof(T) * prod(size(x))) + y = unsafe_wrap(CuArray{T,N}, convert(CuPtr{T}, buf), size(x); own=false) + # TODO: need to write finalizer, right now this just leaks the unified memory + copyto!(y, x) + return y +end +unified_gpu(x) = adapt(Mem.Unified, x) + + adapt_structure(::CUDA.Float32Adaptor, proj::ProjLambert) = adapt_structure(CuArray{Float32}, proj) From f6ada72b2c2937d772fddd9bdb39e728286a28e8 Mon Sep 17 00:00:00 2001 From: marius Date: Tue, 27 Apr 2021 16:07:35 -0700 Subject: [PATCH 3/4] leave CUDA's unsafe_execute be --- src/gpu.jl | 45 ++++++++++++++------------------------------- 1 file changed, 14 insertions(+), 31 deletions(-) diff --git a/src/gpu.jl b/src/gpu.jl index 165d7697..81b2d295 100644 --- a/src/gpu.jl +++ b/src/gpu.jl @@ -117,37 +117,20 @@ using CUDA.CUFFT: rCuFFTPlan, cufftReal, cufftComplex, CUFFT_R2C, cufftExecR2C, plan_buffer(x) = plan_buffer(eltype(x),size(x)) @memoize plan_buffer(T, dims, dev=deviceid()) = CuArray{T}(undef,dims...) 
-function unsafe_execute!(plan::rCuFFTPlan{cufftReal,K,false,N},
-                         x::CuArray{cufftReal,N}, y::CuArray{cufftComplex,N}
-                        ) where {K,N}
-    @assert plan.xtype == CUFFT_R2C
-    cufftExecR2C(plan, x, y)
-end
-function unsafe_execute!(plan::rCuFFTPlan{cufftComplex,K,false,N},
-                         x::CuArray{cufftComplex,N}, y::CuArray{cufftReal}
-                        ) where {K,N}
-    @assert plan.xtype == CUFFT_C2R
-    cufftExecC2R(plan, unsafe_copyto!(pointer(plan_buffer(x)),pointer(x),length(x),async=true,stream=stream()), y)
-end
-
-# monkey-patched version of https://github.com/JuliaGPU/CUDA.jl/pull/436
-# until it hits a release
-using CUDA.CURAND: curandSetPseudoRandomGeneratorSeed, curandSetGeneratorOffset,
-    CURAND_STATUS_ALLOCATION_FAILED, CURAND_STATUS_PREEXISTING_FAILURE, CURAND_STATUS_SUCCESS,
-    unsafe_curandGenerateSeeds, throw_api_error, @retry_reclaim, RNG
-
-function Random.seed!(rng::RNG, seed=Base.rand(UInt64), offset=0)
-    curandSetPseudoRandomGeneratorSeed(rng, seed)
-    curandSetGeneratorOffset(rng, offset)
-    res = @retry_reclaim err->isequal(err, CURAND_STATUS_ALLOCATION_FAILED) ||
-                              isequal(err, CURAND_STATUS_PREEXISTING_FAILURE) begin
-        unsafe_curandGenerateSeeds(rng)
-    end
-    if res != CURAND_STATUS_SUCCESS
-        throw_api_error(res)
-    end
-    return
-end
+## might want to bring this back but need to adapt to newer CUDA versions
+
+# function unsafe_execute!(plan::rCuFFTPlan{cufftReal,K,false,N},
+#                          x::CuArray{cufftReal,N}, y::CuArray{cufftComplex,N}
+#                         ) where {K,N}
+#     @assert plan.xtype == CUFFT_R2C
+#     cufftExecR2C(plan, x, y)
+# end
+# function unsafe_execute!(plan::rCuFFTPlan{cufftComplex,K,false,N},
+#                          x::CuArray{cufftComplex,N}, y::CuArray{cufftReal}
+#                         ) where {K,N}
+#     @assert plan.xtype == CUFFT_C2R
+#     cufftExecC2R(plan, unsafe_copyto!(pointer(plan_buffer(x)),pointer(x),length(x),async=true,stream=stream()), y)
+# end
 
 """

From 2e875d6345eed6d5347dd98a64f683fce5d719a9 Mon Sep 17 00:00:00 2001
From: marius
Date: Wed, 28 Apr 2021 11:47:11 -0700
Subject: [PATCH 4/4] turn off TimerOutputs when using threads

---
 src/CMBLensing.jl | 2 +-
 src/util.jl       | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/CMBLensing.jl b/src/CMBLensing.jl
index 6624ade5..65aba5b5 100644
--- a/src/CMBLensing.jl
+++ b/src/CMBLensing.jl
@@ -45,7 +45,7 @@ using StaticArrays: @SMatrix, @SVector, SMatrix, StaticArray, StaticArrayStyle,
     StaticMatrix, StaticVector, SVector, SArray, SizedArray
 using Statistics
 using StatsBase
-using TimerOutputs: @timeit, get_defaulttimer, reset_timer!
+using TimerOutputs: @timeit, get_defaulttimer, reset_timer!, disable_timer!
 using UnPack
 using Zygote
 using Zygote: unbroadcast, Numeric, @adjoint, @nograd
diff --git a/src/util.jl b/src/util.jl
index b0dc0ffb..b1c18416 100644
--- a/src/util.jl
+++ b/src/util.jl
@@ -381,6 +381,10 @@ macro show⌛(ex)
     end
 end
 
+@init if Threads.nthreads() > 1
+    disable_timer!(get_defaulttimer())
+end
+
 
 # used in a couple of places to create a Base.promote_rule-like system
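
Note on PATCH 1/4: the point of threading a device id through the @memoize'd constructors (ProjLambert, plan_buffer, m_plan_rfft) is that each GPU gets its own device-local copies rather than all devices sharing one cached entry. The sketch below shows the intended one-GPU-per-thread usage; it is illustrative only, assumes Julia was started with at least ndevices() threads, and the 256 / θpix=2 numbers are arbitrary placeholders.

# sketch: one Julia thread drives one GPU; since the memoization key now
# includes deviceid(), each thread builds and reuses its own ProjLambert
# (and FFT plan buffers) in that device's memory
using CUDA
using CMBLensing: ProjLambert

Threads.@threads for i in 1:ndevices()
    device!(i - 1)   # CUDA device ordinals are 0-based
    proj = ProjLambert(Ny=256, Nx=256, θpix=2, T=Float32, storage=CuArray)
    @assert proj.ℓmag isa CuArray   # allocated on the currently-active device
end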
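
Note on PATCH 2/4: the TODO in adapt_structure(::Type{Mem.Unified}, x) flags that the unified buffer is never freed. One possible shape of the fix is sketched below: register a finalizer on the wrapping CuArray that frees the buffer. This is a sketch, not part of the patch — whether Mem.free is safe to call at finalization time here (stream/context ordering) is left aside, and in src/gpu.jl the imports shown are already in scope.

using CUDA: CuArray, CuPtr, Mem
import Adapt: adapt_structure

function adapt_structure(::Type{Mem.Unified}, x::Union{Array{T,N},CuArray{T,N}}) where {T,N}
    buf = Mem.alloc(Mem.Unified, sizeof(T) * prod(size(x)))
    y = unsafe_wrap(CuArray{T,N}, convert(CuPtr{T}, buf), size(x); own=false)
    finalizer(_ -> Mem.free(buf), y)   # free the unified buffer once `y` is collected
    copyto!(y, x)
    return y
end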