From c565dfeb748c9df93cb432a31f0b45458333a0a4 Mon Sep 17 00:00:00 2001
From: Chantal Chen <juntao.chen@wustl.edu>
Date: Tue, 7 Jun 2022 16:54:24 -0500
Subject: [PATCH] Working version on 6.7.2022

---
 ImageTransformationsCUDA/Project.toml         | 30 ++++++++
 .../src/ImageTransformationsCUDA.jl           | 21 ++++++
 .../src/interpolationsCUDA.jl                 | 75 +++++++++++++++++++
 ImageTransformationsCUDA/src/warpCUDA.jl      | 36 +++++++++
 ImageTransformationsCUDA/test/runtests.jl     | 22 ++++++
 ImageTransformationsCUDA/test/warpCUDA.jl     | 72 ++++++++++++++++++
 6 files changed, 256 insertions(+)
 create mode 100644 ImageTransformationsCUDA/Project.toml
 create mode 100644 ImageTransformationsCUDA/src/ImageTransformationsCUDA.jl
 create mode 100644 ImageTransformationsCUDA/src/interpolationsCUDA.jl
 create mode 100644 ImageTransformationsCUDA/src/warpCUDA.jl
 create mode 100644 ImageTransformationsCUDA/test/runtests.jl
 create mode 100644 ImageTransformationsCUDA/test/warpCUDA.jl

diff --git a/ImageTransformationsCUDA/Project.toml b/ImageTransformationsCUDA/Project.toml
new file mode 100644
index 0000000..8107ca1
--- /dev/null
+++ b/ImageTransformationsCUDA/Project.toml
@@ -0,0 +1,30 @@
+name = "ImageTransformationsCUDA"
+uuid = "c180182a-82a0-4af3-89e8-12346a81d86b"
+authors = ["Chantal Chen <juntao.chen@wustl.edu>"]
+version = "0.0.1"
+
+[deps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+CoordinateTransformations = "150eb455-5306-5404-9cee-2592286d6298"
+ImageTransformations = "02fcd773-0e25-5acc-982a-7f6622650795"
+Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
+Rotations = "6038ab10-8711-5258-84ad-4b1120ba62dc"
+StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[compat]
+CUDA = "3"
+CoordinateTransformations = "0.6.2"
+ImageTransformations = "0.9.4"
+Interpolations = "0.13.6"
+OffsetArrays = "1"
+Rotations = "1"
+StaticArrays = "1"
+
+[extras]
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[targets]
+test = ["Test"]
diff --git a/ImageTransformationsCUDA/src/ImageTransformationsCUDA.jl b/ImageTransformationsCUDA/src/ImageTransformationsCUDA.jl
new file mode 100644
index 0000000..e33a0ae
--- /dev/null
+++ b/ImageTransformationsCUDA/src/ImageTransformationsCUDA.jl
@@ -0,0 +1,21 @@
+"""
+
+This package provides analogous ImageTransformation functions for CuArrays.
+
+    - `warp`: Transforms coordinates of a CuArray, returning an OffsetArray with a CuArray within.
+"""
+module ImageTransformationsCUDA
+
+using ImageTransformations
+
+using OffsetArrays, Rotations, StaticArrays, CoordinateTransformations
+using CUDA
+
+export 
+    
+    warp
+
+include("warpCUDA.jl")
+include("interpolationsCUDA.jl")
+
+end # module
diff --git a/ImageTransformationsCUDA/src/interpolationsCUDA.jl b/ImageTransformationsCUDA/src/interpolationsCUDA.jl
new file mode 100644
index 0000000..c717465
--- /dev/null
+++ b/ImageTransformationsCUDA/src/interpolationsCUDA.jl
@@ -0,0 +1,75 @@
+using ImageTransformations: _default_fillvalue
+
+#Julia has heuristics for deciding on when it's no longer worth specializing. "I" forces julia to specialize in the full tuple type of inds. 
+function access_value(img::C, inds::I, ::Type{T}) where {T,C, I<:Union{Tuple,SVector}} #Getting the wrong type for T because broadcasting over Ref(img) doesn't return a concrete type. 
+
+    inds_upper = ceil.(Int, Tuple(inds))
+    inds_lower = floor.(Int, Tuple(inds))
+
+    return checkbounds(Bool, img, inds_upper...) && checkbounds(Bool, img, inds_lower...) ? recursive_bspline(SVector(inds...), img, T) : _default_fillvalue(T) #this line doesn't work because of potential type instability
+end 
+
+#No need to interpolate on CartesianIndices. Seems to be type unstable?
+function access_value(img::C, inds::CartesianIndex, ::Type{T}) where {T,C} #Getting the wrong type for T because broadcasting over Ref(img) doesn't return a concrete type. 
+    return checkbounds(Bool, img, inds) ? T(img[inds]) : _default_fillvalue(T) #this line doesn't work because of potential type instability
+end 
+
+_getweights(ind::T) where {T} = (1+floor(ind)-ind, ind -floor(ind)) #this returns 0,0 for whole numbers when it needs to return 1, 0 
+
+_weightcalc(w::Tuple{T,T}, a::Tuple{S,S}) where{T,S} = sum(w.*a) 
+
+_getinds(ind::T) where {T} = (floor(ind), ceil(ind))
+
+
+
+"""
+BSpline(Linear()) interpolation for CUDA
+
+    recursive_bspline(inds::SVector, img, ::Type) = weighted interpolation value 
+
+"""
+function recursive_bspline(ind_list::SVector{N, S}, a::AbstractArray, ::Type{T}) where {N, S, T} #this only works if you cut down the array to the correct 2x2x2 window!
+
+    ind = last(ind_list)
+    a_1 = viewdim(a,floor(Int, ind))
+    
+    if ind == floor(ind)
+        return T(recursive_bspline(pop(ind_list), a_1, T))
+    else
+        a_2 = viewdim(a,(floor(Int, ind)+1))
+        weights = _getweights(ind)
+        return T(weights[1]*recursive_bspline(pop(ind_list), a_1, T) + weights[2]*recursive_bspline(pop(ind_list), a_2, T))
+    end
+end
+
+function recursive_bspline(weightlist::SVector{0, S}, a::AbstractArray, ::Type{T}) where {S, T}
+    return a[1] #type
+end
+
+viewdim(a::AbstractArray{T,3}, ind) where{T} = view(a, :, :, ind)
+viewdim(a::AbstractArray{T,2}, ind) where{T} = view(a, :, ind)
+viewdim(a::AbstractArray{T,1}, ind) where{T} = view(a, ind)
+
+
+#What if I adjusted this to do the BSpline Function without needing recursion?
+# using Interpolations
+# using Interpolations: padded_similar, copy_with_padding
+
+# function Interpolations.copy_with_padding(::Type{TC}, A::CuArray, it::Interpolations.DimSpec{Interpolations.InterpolationType}) where {TC}
+#     indsA = axes(A)
+#     indspad = Interpolations.padded_axes(indsA, it)
+#     coefs = padded_similar(TC, indspad, A)
+#     if indspad == indsA
+#         coefs = copyto!(coefs, A)
+#     else
+#         fill!(coefs, zero(TC))
+#         Interpolations.ct!(coefs, indsA, A, indsA)
+#     end
+#     coefs
+# end
+
+# Interpolations.padded_similar(::Type{TC}, inds::Tuple{Vararg{Base.OneTo{Int}}}, A::CuArray) where TC = CuArray{TC}(undef, length.(inds))
+
+#this is working out to be a lot more trouble than it's worth I have no idea why get_index doesn't work here
+
+# I don't even know where to begin here. I guess GPU indexing doesn't work?
diff --git a/ImageTransformationsCUDA/src/warpCUDA.jl b/ImageTransformationsCUDA/src/warpCUDA.jl
new file mode 100644
index 0000000..17d86bd
--- /dev/null
+++ b/ImageTransformationsCUDA/src/warpCUDA.jl
@@ -0,0 +1,36 @@
+using LinearAlgebra
+
+using ImageTransformations: warp, warp!
+# @Tim should I import warp and warp! instead, or continue using the ImageTransformation.warp syntax for clarity
+using ImageTransformations: autorange, try_static
+
+
+function ImageTransformations.warp(img::Union{CuArray{T,N},OffsetArray{T,N,<:CuArray}}, tform, inds::Tuple = autorange(img, inv(tform))) where {T, N}
+    out = OffsetArray(CuArray{T}(undef, map(length, inds)), inds); #Can't make CuArray of OffsetArray
+    warp!(out, img, try_static(tform, img))
+end 
+
+function ImageTransformations.warp!(out, img::OffsetArray{T,N,<:CuArray}, tform) where {T,N}
+    warp!(out, img.parent, tform, img.offsets)
+end
+
+function ImageTransformations.warp!(out, img::CuArray{T,N}, tform, in_offsets = ntuple(i->0, N)) where {T,N} #why is this now unstable?!
+    img_inds = map(out->out.I, CartesianIndices(axes(out.parent))) 
+    tform_offset = offset_calc(out, in_offsets, tform)
+
+    tformindex = CuArray(tform_offset.(SVector.(img_inds))) 
+    # TODO write an extra function here that converts tformindex to CartesianIndices if possible.
+    # return tformindex, out.offsets, T #for checking
+
+    return out = OffsetArray(access_value.(Ref(img), tformindex, T), out.offsets...)
+end 
+
+#calculates translation array with input and output offset stripped off. 
+#tform(xi + Δxi) = xo + Δxo
+#tform.linear(xi) + tform(Δxi) - Δxo = xo
+#tform2(xi) = (tform.linear, (tform(Δxi) - Δxo))(xi) = xo
+function offset_calc(out::OffsetArray{T,N,<:CuArray}, in_offsets, tform) where {T,N} # doesn't work for 3D!
+    in_translation = AffineMap(I, -1 .*[in_offsets...]) #diagm(ones(T, N))
+    out_translation = AffineMap(I, [out.offsets...])
+    return in_translation∘tform∘out_translation
+end 
diff --git a/ImageTransformationsCUDA/test/runtests.jl b/ImageTransformationsCUDA/test/runtests.jl
new file mode 100644
index 0000000..a13c321
--- /dev/null
+++ b/ImageTransformationsCUDA/test/runtests.jl
@@ -0,0 +1,22 @@
+using OffsetArrays, Rotations, StaticArrays, CoordinateTransformations, LinearAlgebra
+using ImageTransformations, ImageTransformationsCUDA, Interpolations
+using CUDA
+using Test
+
+# helper function from ImageTransformations/test.jl to compare NaN
+nearlysame(x, y) = x ≈ y || (isnan(x) & isnan(y))
+nearlysame(A::AbstractArray, B::AbstractArray) = all(map(nearlysame, A, B))
+
+tests = [
+    "warpCUDA.jl",
+]
+
+@testset "ImageTransformations" begin
+    for t in tests
+        @testset "$t" begin
+            include(t)
+        end
+    end
+end
+
+nothing
\ No newline at end of file
diff --git a/ImageTransformationsCUDA/test/warpCUDA.jl b/ImageTransformationsCUDA/test/warpCUDA.jl
new file mode 100644
index 0000000..85f5651
--- /dev/null
+++ b/ImageTransformationsCUDA/test/warpCUDA.jl
@@ -0,0 +1,72 @@
+import ImageTransformations: warp
+
+# TODO design more tests, esp with color. 
+
+@testset "Conditional tests for warping on GPU" begin #should break this into 2D, 3D, etc.
+    #Float32 2D
+    x = zeros(Float32, 5,5)
+    x[2,2] = 1.0
+    x[2,3] = 2.0
+    y = convert(CuArray, x)
+
+    tform = AffineMap(RotMatrix(pi/2), [6,0])
+
+    z = warp(y, tform); 
+    @test isa(z, OffsetArray{Float32, 2, <:CuArray})
+    @test nearlysame(OffsetArray(Array(z.parent), z.offsets),warp(x, tform))
+
+    #Float32 3D
+    a = zeros(3,3,3)
+    a[1,2,3] = 1.0
+    b = CuArray(a)
+
+    tform3DNull = AffineMap(RotXYZ(0,0,0), [0.0,0.0,0.0]);
+    c = warp(b, tform3DNull) #3D is broken now. 
+
+    @test nearlysame(warp(a, tform3DNull), OffsetArray(Array(c.parent), c.offsets))
+
+    #introduce offsets
+    d = zeros(6,6)
+    d[1,2] = 1.0
+    e = CuArray(d)
+
+    tform_trans = AffineMap(RotMatrix(0), [1, 2]) #just translation
+    f = warp(e, tform_trans); 
+
+    @test nearlysame(warp(d, tform_trans), OffsetArray(Array(f.parent), f.offsets)) #OffsetArrays doesn't work with CuArray
+
+    #Basic rotation
+    tform_lin = AffineMap([0 -1; 1 0], [0,0])
+    g = warp(e, tform_lin);
+
+    @test nearlysame(warp(d, tform_lin), OffsetArray(Array(g.parent), g.offsets))#OffsetArrays doesn't work with CuArray
+
+    h = OffsetArray(d, -1, 0)
+    i = OffsetArray(e, -1, 0);
+
+    #Basic translation
+    j = warp(i, tform_trans);
+    @test nearlysame(warp(h, tform_trans), OffsetArray(Array(j.parent), j.offsets)) #OffsetArrays doesn't work with CuArray
+
+    #With BSpline Interpolation
+    tform_trans_float = AffineMap(RotMatrix(0), [0.1, 0.2])
+    k = warp(e, tform_trans_float)
+
+    @test nearlysame(warp(d, tform_trans_float), OffsetArray(Array(k.parent), k.offsets))
+    
+    tform_lin_float = AffineMap(RotMatrix(pi/4), [0,0])
+    l = warp(e, tform_lin_float);
+
+    @test nearlysame(warp(d, tform_lin_float), OffsetArray(Array(l.parent), l.offsets))
+
+    m = rand(3,3,3)
+    n = CuArray(m)
+
+    tform3D_complex = AffineMap(RotXYZ(0.1, 0.1, 0.1), [0.3,0.2,0.1])
+
+    o = warp(n, tform3D_complex);
+
+    @test nearlysame(warp(m, tform3D_complex), OffsetArray(Array(o.parent), o.offsets))
+
+end
+