diff --git a/.github/workflows/runtests.yml b/.github/workflows/runtests.yml
index 97ca72fd..0f202ced 100644
--- a/.github/workflows/runtests.yml
+++ b/.github/workflows/runtests.yml
@@ -7,7 +7,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        julia-version: ['1.5', '1.6', '1.7-nightly']
+        julia-version: ['1.6', '1.7-nightly']
         os: [ubuntu-latest]
         JULIA_FFTW_PROVIDER: ['MKL','FFTW']
       fail-fast: false
diff --git a/Project.toml b/Project.toml
index a144458e..9c66bb8a 100644
--- a/Project.toml
+++ b/Project.toml
@@ -53,7 +53,7 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 [compat]
 AbstractFFTs = "0.5, 1"
 Adapt = "1.0.1, 2, 3"
-CUDA = "2.1"
+CUDA = "3"
 Combinatorics = "1"
 DataStructures = "0.17.9, 0.18"
 FFTW = "1.2 - 1.3"
@@ -83,5 +83,5 @@ StaticArrays = "0.12.1, 1.0"
 StatsBase = "0.32, 0.33"
 TimerOutputs = "0.5"
 UnPack = "1"
-Zygote = "0.4.14, 0.5, 0.6 - 0.6.4"
-julia = "1.5"
+Zygote = "0.6.9"
+julia = "1.6"
diff --git a/README.md b/README.md
index 5e67eb2e..9e75d75b 100644
--- a/README.md
+++ b/README.md
@@ -9,11 +9,11 @@
 CMBLensing.jl is a next-generation tool for analysis of the lensed Cosmic Microwave Background. It is written in [Julia](https://julialang.org/) and transparently callable from Python.
 
 
-At its heart, CMBLensing.jl maximizes or samples the Bayesian posterior for the CMB lensing problem. It also contains tools to quickly manipulate and process CMB maps, set up modified posteriors, and take gradients using automatic differentation.
+At its heart, CMBLensing.jl maximizes or samples the Bayesian posterior for the CMB lensing problem. It also contains tools to quickly manipulate and process CMB maps, set up modified posteriors, and take gradients using automatic differentiation.
 
 ### Highlights
-* Fully Nvidia GPU compatible (speedups over CPU are currently 3x-10x, depending on the problem size and hardware).
-* Automatic differentation (via [Zygote.jl](https://fluxml.ai/Zygote.jl/)) provides for-free gradients of your custom posteriors.
+* Fully Nvidia GPU compatible (1-2 order of magnitude speedups over CPU, depending on the problem size and hardware).
+* Automatic differentiation (via [Zygote.jl](https://fluxml.ai/Zygote.jl/)) provides for-free gradients of your custom posteriors.
 * Includes the following algorithms to lense a map:
     * `LenseFlow` ([Millea, Anderes, & Wandelt 2017](https://arxiv.org/abs/1708.06753))
     * `Taylens` ([Næss & Louis 2013](https://arxiv.org/abs/1307.0719))
@@ -29,14 +29,14 @@ The best place to get started is to read the [documentation](https://cosmicmar.c
 
 Most of the pages in the documentation are Jupyter notebooks, and you can click the "launch binder" link at the top of each page to launch a Jupyterlab server running the notebook in your browser (courtesy of [binder](https://mybinder.org/)). 
 
-You can also clone the repostiory and open the notebooks in [docs/src](https://github.com/marius311/CMBLensing.jl/tree/master/docs/src) if you want to run them locally (which will usually lead to higher performance). The notebooks are stored as `.md` files rather than `.ipynb` format. Its recommented to install [Jupytext](jupytext) (`pip install jupytext`) and then you can run these `.md` directly from Jupyterlab by right-clicking on them and selecting `Open With -> Notebook`. Otherwise, run the script `docs/make_notebooks.sh` to convert the `.md` files to `.ipynb` which you can then open as desired. 
+You can also clone the repostiory and open the notebooks in [docs/src](https://github.com/marius311/CMBLensing.jl/tree/master/docs/src) if you want to run them locally (which will usually lead to higher performance). The notebooks are stored as `.md` files rather than `.ipynb` format. Its recommended to install [Jupytext](jupytext) (`pip install jupytext`) and then you can run these `.md` directly from Jupyterlab by right-clicking on them and selecting `Open With -> Notebook`. Otherwise, run the script `docs/make_notebooks.sh` to convert the `.md` files to `.ipynb` which you can then open as desired. 
 
 
 ## Installation
 
 ### Requirements
 
-* Julia 1.5
+* Julia 1.6
 * _(recommended)_ An Nvidia GPU and [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) for GPU support
 * _(recommended)_ FFTW.jl built with [`JULIA_FFTW_PROVIDER=MKL`](https://juliamath.github.io/FFTW.jl/stable/#Installation-1) for faster CPU FFTs 
 * _(recommended)_ Python 3 + matplotlib (used for plotting)
diff --git a/src/CMBLensing.jl b/src/CMBLensing.jl
index 6624ade5..65aba5b5 100644
--- a/src/CMBLensing.jl
+++ b/src/CMBLensing.jl
@@ -45,7 +45,7 @@ using StaticArrays: @SMatrix, @SVector, SMatrix, StaticArray, StaticArrayStyle,
     StaticMatrix, StaticVector, SVector, SArray, SizedArray
 using Statistics
 using StatsBase
-using TimerOutputs: @timeit, get_defaulttimer, reset_timer!
+using TimerOutputs: @timeit, get_defaulttimer, reset_timer!, disable_timer!
 using UnPack
 using Zygote
 using Zygote: unbroadcast, Numeric, @adjoint, @nograd
diff --git a/src/autodiff.jl b/src/autodiff.jl
index ef16da3e..f7e52021 100644
--- a/src/autodiff.jl
+++ b/src/autodiff.jl
@@ -66,7 +66,7 @@ Zygote.accum(f::BaseField, nt::NamedTuple{(:arr,:metadata)}) = (@assert(isnothin
 @adjoint dot(f::Field{B1}, g::Field{B2}) where {B1,B2} = dot(f,g), Δ -> (Δ*B1(g), Δ*B2(f))
 @adjoint (*)(f::Adjoint{<:Any,<:Field}, g::Field) = Zygote.pullback((f,g)->dot(f',g),f,g)
 # ℝᴺˣᴺ -> ℝ¹ 
-@adjoint logdet(L::ParamDependentOp, θ) = Zygote._pullback(θ->logdet(L(θ)), θ)
+@adjoint logdet(L::ParamDependentOp, θ) = Zygote.pullback((L,θ)->logdet(L(θ)), L, θ)
 @adjoint logdet(L::DiagOp) = logdet(L), Δ -> (Δ * Zfac(L) * pinv(L)',)
 
 # basis conversion
@@ -110,9 +110,8 @@ end
 @adjoint function *(x::FieldOrOpRowVector, y::FieldVector)
     z = x * y
     # when x is a vector of Fields
-    back(Δ) = (Δ * y', x' * Δ)
-    # when x is a vector of Diagonals. in this case, Δ * basis(Δ)(y)' create an
-    # OuterProdOp in the same basis as the Diagonals in x
+    back(Δ::Real) = ((Δ * y)', x' * Δ)
+    # when x is a vector of Diagonals. in this case, Δ * basis(Δ)(y)'
     back(Δ::Field{B}) where {B} = (Δ * basis(Δ)(y)'), (x' * Δ)
     z, back
 end
diff --git a/src/flat_proj.jl b/src/flat_proj.jl
index 0998cf86..530470a8 100644
--- a/src/flat_proj.jl
+++ b/src/flat_proj.jl
@@ -24,10 +24,15 @@ end
 real_type(T) = promote_type(real(T), Float32)
 @init @require Unitful="1986cc42-f94f-5a68-af5c-568840ba703d" real_type(::Type{<:Unitful.Quantity{T}}) where {T} = real_type(T)
 
+# for CuArray-backed fields, each GPU needs to have a copy of the
+# ProjLambert quantities in its own memory, so we need to memoize per "deviceid"
+# gpu-specialized version of this in gpu.jl 
+_deviceid(::Any) = 0
+
 ProjLambert(;Ny, Nx, θpix=1, center=(0,0), T=Float32, storage=Array) = 
     ProjLambert(Ny, Nx, Float64(θpix), Float64.(center), real_type(T), storage)
 
-@memoize function ProjLambert(Ny, Nx, θpix, center, ::Type{T}, storage) where {T}
+@memoize function ProjLambert(Ny, Nx, θpix, center, ::Type{T}, storage, dev=_deviceid(storage)) where {T}
 
     Δx           = T(deg2rad(θpix/60))
     Δℓx          = T(2π/(Nx*Δx))
@@ -36,9 +41,9 @@ ProjLambert(;Ny, Nx, θpix=1, center=(0,0), T=Float32, storage=Array) =
     Ωpix         = T(Δx^2)
     ℓy           = adapt(storage, (ifftshift(-Ny÷2:(Ny-1)÷2) .* Δℓy)[1:Ny÷2+1])
     ℓx           = adapt(storage, (ifftshift(-Nx÷2:(Nx-1)÷2) .* Δℓx))
-    ℓmag         = @. sqrt(ℓx'^2 + ℓy^2)
-    ϕ            = @. angle(ℓx' + im*ℓy)
-    sin2ϕ, cos2ϕ = @. sin(2ϕ), cos(2ϕ)
+    ℓmag         = adapt(storage, (@. sqrt(ℓx'^2 + ℓy^2)))
+    ϕ            = adapt(storage, (@. angle(ℓx' + im*ℓy)))
+    sin2ϕ, cos2ϕ = adapt(storage, (@. sin(2ϕ), cos(2ϕ)))
     if iseven(Ny)
         sin2ϕ[end, end:-1:(Nx÷2+2)] .= sin2ϕ[end, 2:Nx÷2]
     end
diff --git a/src/gpu.jl b/src/gpu.jl
index 4ac71fb2..81b2d295 100644
--- a/src/gpu.jl
+++ b/src/gpu.jl
@@ -1,6 +1,6 @@
 
 using CUDA
-using CUDA: cufunc, curand_rng
+using CUDA: curand_rng
 using CUDA.CUSPARSE: CuSparseMatrix, CuSparseMatrixCSR, CuSparseMatrixCOO
 using CUDA.CUSOLVER: CuQR
 
@@ -40,6 +40,16 @@ to Float32. Equivalent to `adapt_structure(CuArray, x)`. Returns nothing.
 """
 gpu(x) = adapt_structure(CuArray, x)
 
+function adapt_structure(::Type{Mem.Unified}, x::Union{Array{T,N},CuArray{T,N}}) where {T,N}
+    buf = Mem.alloc(Mem.Unified, sizeof(T) * prod(size(x)))
+    y = unsafe_wrap(CuArray{T,N}, convert(CuPtr{T}, buf), size(x); own=false)
+    # TODO: need to write finalizer, right now this just leaks the unified memory
+    copyto!(y, x)
+    return y
+end
+unified_gpu(x) = adapt(Mem.Unified, x)
+
+
 
 @doc doc"""
 
@@ -57,6 +67,9 @@ end
 adapt_structure(::CUDA.Float32Adaptor, proj::ProjLambert) = adapt_structure(CuArray{Float32}, proj)
 
 
+_deviceid(::Type{<:CuArray}) = deviceid()
+
+
 function Cℓ_to_2D(Cℓ, proj::ProjLambert{T,<:CuArray}) where {T}
     # todo: remove needing to go through cpu here:
     gpu(Complex{T}.(nan2zero.(Cℓ.(cpu(proj.ℓmag)))))
@@ -72,17 +85,6 @@ fill!(f::CuBaseField, x) = (fill!(f.arr,x); f)
 ==(a::CuBaseField, b::CuBaseField) = (==)(promote(a.arr, b.arr)...)
 sum(f::CuBaseField; dims=:) = (dims == :) ? sum(f.arr) : (1 in dims) ? error("Sum over invalid dims of CuFlatS0.") : f
 
-
-# these only work for Reals in CUDA
-# with these definitions, they work for Complex as well
-CUDA.isfinite(x::Complex) = Base.isfinite(x)
-CUDA.sqrt(x::Complex) = CUDA.sqrt(CUDA.abs(x)) * CUDA.exp(im*CUDA.angle(x)/2)
-CUDA.culiteral_pow(::typeof(^), x::Complex, ::Val{2}) = x * x
-CUDA.pow(x::Complex, p) = x^p
-
-# until https://github.com/JuliaGPU/CUDA.jl/pull/618 (CUDA 2.5)
-CUDA.cufunc(::typeof(angle)) = CUDA.angle
-
 # adapting of SparseMatrixCSC ↔ CuSparseMatrixCSR (otherwise dense arrays created)
 adapt_structure(::Type{<:CuArray}, L::SparseMatrixCSC)   = CuSparseMatrixCSR(L)
 adapt_structure(::Type{<:Array},   L::CuSparseMatrixCSR) = SparseMatrixCSC(L)
@@ -109,42 +111,26 @@ Random.randn!(rng::MersenneTwister, A::CuArray) =
 # in the C2R case we pre-allocate the memory only once (via memoization) as well
 # as doing the copy asynchronously
 import CUDA.CUFFT: unsafe_execute!
-using CUDA.CUFFT: rCuFFTPlan, cufftReal, cufftComplex, CUFFT_R2C, cufftExecR2C, cufftExecC2R, CUFFT_C2R, unsafe_copyto!, CuDefaultStream, pointer
+using CUDA.CUFFT: rCuFFTPlan, cufftReal, cufftComplex, CUFFT_R2C, cufftExecR2C, 
+    cufftExecC2R, CUFFT_C2R, unsafe_copyto!, pointer, stream
 
 plan_buffer(x) = plan_buffer(eltype(x),size(x))
-@memoize plan_buffer(T,dims) = CuArray{T}(undef,dims...)
-
-function unsafe_execute!(plan::rCuFFTPlan{cufftReal,K,false,N},
-                            x::CuArray{cufftReal,N}, y::CuArray{cufftComplex,N}
-                            ) where {K,N}
-    @assert plan.xtype == CUFFT_R2C
-    cufftExecR2C(plan, x, y)
-end
-function unsafe_execute!(plan::rCuFFTPlan{cufftComplex,K,false,N},
-                            x::CuArray{cufftComplex,N}, y::CuArray{cufftReal}
-                            ) where {K,N}
-    @assert plan.xtype == CUFFT_C2R
-    cufftExecC2R(plan, unsafe_copyto!(pointer(plan_buffer(x)),pointer(x),length(x),async=true,stream=CuDefaultStream()), y)
-end
-
-# monkey-patched version of https://github.com/JuliaGPU/CUDA.jl/pull/436
-# until it hits a release
-using CUDA.CURAND: curandSetPseudoRandomGeneratorSeed, curandSetGeneratorOffset, 
-    CURAND_STATUS_ALLOCATION_FAILED, CURAND_STATUS_PREEXISTING_FAILURE, CURAND_STATUS_SUCCESS,
-    unsafe_curandGenerateSeeds, throw_api_error, @retry_reclaim, RNG
-
-function Random.seed!(rng::RNG, seed=Base.rand(UInt64), offset=0)
-    curandSetPseudoRandomGeneratorSeed(rng, seed)
-    curandSetGeneratorOffset(rng, offset)
-    res = @retry_reclaim err->isequal(err, CURAND_STATUS_ALLOCATION_FAILED) ||
-                              isequal(err, CURAND_STATUS_PREEXISTING_FAILURE) begin
-        unsafe_curandGenerateSeeds(rng)
-    end
-    if res != CURAND_STATUS_SUCCESS
-        throw_api_error(res)
-    end
-    return
-end
+@memoize plan_buffer(T, dims, dev=deviceid()) = CuArray{T}(undef,dims...)
+
+## might want to bring this back but need to adapt to newer CUDA verions
+
+# function unsafe_execute!(plan::rCuFFTPlan{cufftReal,K,false,N},
+#                             x::CuArray{cufftReal,N}, y::CuArray{cufftComplex,N}
+#                             ) where {K,N}
+#     @assert plan.xtype == CUFFT_R2C
+#     cufftExecR2C(plan, x, y)
+# end
+# function unsafe_execute!(plan::rCuFFTPlan{cufftComplex,K,false,N},
+#                             x::CuArray{cufftComplex,N}, y::CuArray{cufftReal}
+#                             ) where {K,N}
+#     @assert plan.xtype == CUFFT_C2R
+#     cufftExecC2R(plan, unsafe_copyto!(pointer(plan_buffer(x)),pointer(x),length(x),async=true,stream=stream()), y)
+# end
 
 
 """
diff --git a/src/util.jl b/src/util.jl
index b0dc0ffb..b1c18416 100644
--- a/src/util.jl
+++ b/src/util.jl
@@ -381,6 +381,10 @@ macro show⌛(ex)
     end
 end
 
+@init if Threads.nthreads() > 1
+    disable_timer!(get_defaulttimer())
+end
+
 
 
 # used in a couple of places to create a Base.promote_rule-like system
diff --git a/src/util_fft.jl b/src/util_fft.jl
index 1a62ece5..c3f4a8e1 100644
--- a/src/util_fft.jl
+++ b/src/util_fft.jl
@@ -17,20 +17,26 @@ FFTW_TIMELIMIT = 5
 
 # a set of wrapper FFT functions which use a @memoize'd plan
 _fft_arr_type(arr) = basetype(typeof(parent(arr)))
-m_rfft(arr::AbstractArray{T,N}, dims) where {T,N} = m_plan_rfft(_fft_arr_type(arr){T,N}, dims, size(arr)...) * arr
-m_irfft(arr::AbstractArray{Complex{T},N}, d, dims) where {T,N} = m_plan_rfft(_fft_arr_type(arr){T,N}, dims, d, size(arr)[2:end]...) \ arr
-m_rfft!(dst, arr::AbstractArray{T,N}, dims) where {T,N} = mul!(dst, m_plan_rfft(_fft_arr_type(arr){T,N}, dims, size(arr)...), arr)
-m_irfft!(dst, arr::AbstractArray{Complex{T},N}, dims) where {T,N} = ldiv!(dst, m_plan_rfft(_fft_arr_type(arr){T,N}, dims, size(dst)...), copy_if_fftw(arr))
-@memoize function m_plan_rfft(::Type{A}, dims::Dims, sz...) where {T, N, A<:AbstractArray{T,N}, Dims}
+m_rfft(arr::AbstractArray{T,N}, dims) where {T,N} = m_plan_rfft(_fft_arr_type(arr){T,N}, dims, size(arr)) * arr
+m_irfft(arr::AbstractArray{Complex{T},N}, d, dims) where {T,N} = m_plan_rfft(_fft_arr_type(arr){T,N}, dims, (d, size(arr)[2:end]...)) \ arr
+m_rfft!(dst, arr::AbstractArray{T,N}, dims) where {T,N} = mul!(dst, m_plan_rfft(_fft_arr_type(arr){T,N}, dims, size(arr)), arr)
+m_irfft!(dst, arr::AbstractArray{Complex{T},N}, dims) where {T,N} = ldiv!(dst, m_plan_rfft(_fft_arr_type(arr){T,N}, dims, size(dst)), copy_if_fftw(arr))
+@memoize function m_plan_rfft(::Type{A}, dims::Dims, sz) where {T, N, A<:Array{T,N}, Dims}
     FFTW.set_num_threads(FFTW_NUM_THREADS)
-    plan_rfft(
-        A(undef, sz...), dims; (A <: Array ? (timelimit=FFTW_TIMELIMIT,) : ())...
-    ) #:: FFTW.rFFTWPlan{T,-1,false,N,Dims} # type assert to help inference in @memoized function
+    plan_rfft(A(undef, sz...), dims; timelimit=FFTW_TIMELIMIT)
 end
 # FFTW (but not MKL) destroys the input array for inplace inverse real
 # FFTs, so we need a copy. see https://github.com/JuliaMath/FFTW.jl/issues/158
 copy_if_fftw(x) = FFTW.fftw_vendor==:fftw ? copy(x) : x
 
+@init @require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba" begin
+    using .CUDA
+    copy_if_fftw(x::CuArray) = x
+    @memoize function m_plan_rfft(::Type{A}, dims::Dims, sz, dev=deviceid()) where {T, N, A<:CuArray{T,N}, Dims}
+        plan_rfft(A(undef, sz...), dims)
+    end
+end
+
 
 """
 Convert a matrix A which is the output of a real FFT to a real vector, keeping
diff --git a/test/runtests.jl b/test/runtests.jl
index 1c999add..c845f1f7 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -333,8 +333,8 @@ end
 
         f = maybegpu(FlatMap(rand(Nside...)))
         
-        @test                  @inferred(MidPass(100,200) .* Diagonal(Fourier(f))) isa Diagonal
-        @test_throws Exception           MidPass(100,200) .* Diagonal(        f)
+        @test                  (MidPass(100,200) .* Diagonal(Fourier(f))) isa Diagonal
+        @test_throws Exception  MidPass(100,200) .* Diagonal(        f)
 
     end
     
@@ -404,7 +404,7 @@ end;
         ]
             
             Ny,Nx = Nside
-            Ixs = collect(rand(Nside...) for i=1:Npol)
+            Ixs = collect(maybegpu(rand(Nside...)) for i=1:Npol)
             Ils = rfft.(Ixs)
             f,g,h = @repeated(maybegpu(FMap(Ixs...)),3)
             v = @SVector[f,f]