Skip to content

Commit 4aa430c

Browse files
authored
Merge pull request #235 from JuliaGPU/tb/linalg
Port some CuArrays linalg kernels.
2 parents 03a86d5 + 9c25ebc commit 4aa430c

File tree

9 files changed

+170
-61
lines changed

9 files changed

+170
-61
lines changed

src/device/indexing.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ So it can be used like this:
3636
3737
```julia
3838
function kernel(ctx::AbstractKernelContext, A)
39-
idx = @linear_index A ctx
39+
idx = @linearidx A ctx
4040
# from here on it's save to index into A with idx
4141
@inbounds begin
4242
A[idx] = ...
@@ -56,12 +56,12 @@ end
5656
"""
5757
cartesianidx(A, ctxsym = :ctx)
5858
59-
Like [`@linearidx(A, ctxsym = :ctx)`](@ref), but returns an N-dimensional `NTuple{ndim(A), Int}` as index
59+
Like [`@linearidx(A, ctxsym = :ctx)`](@ref), but returns a N-dimensional `CartesianIndex`.
6060
"""
6161
macro cartesianidx(A, ctxsym = :ctx)
6262
quote
6363
x = $(esc(A))
64-
i2 = @linearidx(x, $(esc(ctxsym)))
65-
gpu_ind2sub(x, i2)
64+
i = @linearidx(x, $(esc(ctxsym)))
65+
CartesianIndices(x)[i]
6666
end
6767
end

src/host/abstractarray.jl

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -139,14 +139,12 @@ end
139139
Base.copyto!(dest::AbstractGPUArray, src::AbstractGPUArray) =
140140
copyto!(dest, CartesianIndices(dest), src, CartesianIndices(src))
141141

142-
function copy_kernel!(ctx::AbstractKernelContext, dest, dest_offsets, src, src_offsets, shape, shape_dest, shape_source, length)
142+
function copy_kernel!(ctx::AbstractKernelContext, dest, dest_offsets, src, src_offsets, shape, length)
143143
i = linear_index(ctx)
144144
if i <= length
145145
# TODO can this be done faster and smarter?
146-
idx = gpu_ind2sub(shape, i)
147-
dest_idx = gpu_sub2ind(shape_dest, idx .+ dest_offsets)
148-
src_idx = gpu_sub2ind(shape_source, idx .+ src_offsets)
149-
@inbounds dest[dest_idx] = src[src_idx]
146+
idx = CartesianIndices(shape)[i]
147+
@inbounds dest[idx + dest_offsets] = src[idx + src_offsets]
150148
end
151149
return
152150
end
@@ -159,10 +157,10 @@ function Base.copyto!(dest::AbstractGPUArray{T, N}, destcrange::CartesianIndices
159157
end
160158
len = length(destcrange)
161159

162-
dest_offsets = first.(destcrange.indices) .- 1
163-
src_offsets = first.(srccrange.indices) .- 1
160+
dest_offsets = first(destcrange) - one(CartesianIndex{N})
161+
src_offsets = first(srccrange) - one(CartesianIndex{N})
164162
gpu_call(copy_kernel!,
165-
dest, dest_offsets, src, src_offsets, shape, size(dest), size(src), len;
163+
dest, dest_offsets, src, src_offsets, shape, len;
166164
total_threads=len)
167165
dest
168166
end

src/host/base.jl

Lines changed: 0 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -19,40 +19,6 @@ Base.map!(f, y::AbstractGPUArray, x::AbstractGPUArray) =
1919
Base.map!(f, y::AbstractGPUArray, x1::AbstractGPUArray, x2::AbstractGPUArray) =
2020
invoke(map!, Tuple{Any,AbstractGPUArray, Vararg{AbstractGPUArray}}, f, y, x1, x2)
2121

22-
23-
# Base functions that are sadly not fit for the the GPU yet (they only work for Int64)
24-
Base.@pure @inline function gpu_ind2sub(A::AbstractArray, ind::T) where T
25-
_ind2sub(size(A), ind - T(1))
26-
end
27-
Base.@pure @inline function gpu_ind2sub(dims::NTuple{N}, ind::T) where {N, T}
28-
_ind2sub(NTuple{N, T}(dims), ind - T(1))
29-
end
30-
Base.@pure @inline _ind2sub(::Tuple{}, ind::T) where {T} = (ind + T(1),)
31-
Base.@pure @inline function _ind2sub(indslast::NTuple{1}, ind::T) where T
32-
((ind + T(1)),)
33-
end
34-
Base.@pure @inline function _ind2sub(inds, ind::T) where T
35-
r1 = inds[1]
36-
indnext = div(ind, r1)
37-
f = T(1); l = r1
38-
(ind-l*indnext+f, _ind2sub(Base.tail(inds), indnext)...)
39-
end
40-
41-
Base.@pure function gpu_sub2ind(dims::NTuple{N}, I::NTuple{N2, T}) where {N, N2, T}
42-
Base.@_inline_meta
43-
_sub2ind(NTuple{N, T}(dims), T(1), T(1), I...)
44-
end
45-
_sub2ind(x, L, ind) = ind
46-
function _sub2ind(::Tuple{}, L, ind, i::T, I::T...) where T
47-
Base.@_inline_meta
48-
ind + (i - T(1)) * L
49-
end
50-
function _sub2ind(inds, L, ind, i::IT, I::IT...) where IT
51-
Base.@_inline_meta
52-
r1 = inds[1]
53-
_sub2ind(Base.tail(inds), L * r1, ind + (i - IT(1)) * L, I...)
54-
end
55-
5622
# This is pretty ugly, but I feel bad to add those to device arrays, since
5723
# we're never bound checking... So getindex(a::AbstractGPUVector, 10, 10) would silently go unnoticed
5824
# we need this here for easier implementation of repeat

src/host/indexing.jl

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -84,9 +84,8 @@ to_index(a, x::Base.LogicalIndex) = error("Logical indexing not implemented")
8484
@generated function index_kernel(ctx::AbstractKernelContext, dest::AbstractArray, src::AbstractArray, idims, Is)
8585
N = length(Is.parameters)
8686
quote
87-
i = linear_index(ctx)
88-
i > length(dest) && return
89-
is = gpu_ind2sub(idims, i)
87+
i = @linearidx dest
88+
is = CartesianIndices(idims)[i]
9089
@nexprs $N i -> @inbounds I_i = Is[i][is[i]]
9190
@inbounds dest[i] = @ncall $N getindex src i -> I_i
9291
return
@@ -112,7 +111,7 @@ end
112111
quote
113112
i = linear_index(ctx)
114113
i > len && return
115-
is = gpu_ind2sub(idims, i)
114+
is = CartesianIndices(idims)[i]
116115
@inbounds setindex!(dest, bgetindex(src, i), $(idx...))
117116
return
118117
end

src/host/linalg.jl

Lines changed: 111 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,15 @@ function LinearAlgebra.transpose!(At::AbstractGPUArray{T, 2}, A::AbstractGPUArra
99
At
1010
end
1111

12-
function genperm(I::NTuple{N}, perm::NTuple{N}) where N
13-
ntuple(d-> (@inbounds return I[perm[d]]), Val(N))
12+
function genperm(I::CartesianIndex{N}, perm::NTuple{N}) where N
13+
CartesianIndex(ntuple(d-> (@inbounds return I[perm[d]]), Val(N)))
1414
end
1515

1616
function LinearAlgebra.permutedims!(dest::AbstractGPUArray, src::AbstractGPUArray, perm) where N
1717
perm isa Tuple || (perm = Tuple(perm))
1818
gpu_call(dest, src, perm) do ctx, dest, src, perm
1919
I = @cartesianidx src ctx
20-
@inbounds dest[genperm(I, perm)...] = src[I...]
20+
@inbounds dest[genperm(I, perm)] = src[I]
2121
return
2222
end
2323
return dest
@@ -39,3 +39,111 @@ end
3939
function Base.copyto!(A::AbstractGPUArray, B::Adjoint{T, <: AbstractGPUArray}) where T
4040
transpose!(A, B.parent)
4141
end
42+
43+
function LinearAlgebra.tril!(A::AbstractGPUMatrix{T}, d::Integer = 0) where T
44+
function kernel!(ctx, _A, _d)
45+
I = @cartesianidx _A
46+
i, j = Tuple(I)
47+
if i < j - _d
48+
_A[i, j] = 0
49+
end
50+
return nothing
51+
end
52+
53+
gpu_call(kernel!, A, d)
54+
return A
55+
end
56+
57+
function LinearAlgebra.triu!(A::AbstractGPUMatrix{T}, d::Integer = 0) where T
58+
function kernel!(ctx, _A, _d)
59+
I = @cartesianidx _A
60+
i, j = Tuple(I)
61+
if j < i + _d
62+
_A[i, j] = 0
63+
end
64+
return nothing
65+
end
66+
67+
gpu_call(kernel!, A, d)
68+
return A
69+
end
70+
71+
function LinearAlgebra.copy_transpose!(dst::AbstractGPUArray, src::AbstractGPUArray)
72+
function kernel(ctx, dst, src)
73+
I = @cartesianidx dst
74+
dst[I...] = src[reverse(I)...]
75+
return
76+
end
77+
78+
gpu_call(kernel, dst, src)
79+
return dst
80+
end
81+
82+
83+
# matrix multiplication
84+
85+
function generic_matmatmul!(C::AbstractVecOrMat{R}, A::AbstractVecOrMat{T}, B::AbstractVecOrMat{S}) where {T,S,R}
86+
if size(A,2) != size(B,1)
87+
throw(DimensionMismatch("matrix A has dimensions $(size(A)), matrix B has dimensions $(size(B))"))
88+
end
89+
if size(C,1) != size(A,1) || size(C,2) != size(B,2)
90+
throw(DimensionMismatch("result C has dimensions $(size(C)), needs $((size(A,1),size(B,2)))"))
91+
end
92+
if isempty(A) || isempty(B)
93+
return fill!(C, zero(R))
94+
end
95+
96+
function kernel(ctx, C, A, B)
97+
idx = @linearidx C
98+
i, j = Tuple(CartesianIndices(C)[idx])
99+
100+
if i <= size(A,1) && j <= size(B,2)
101+
z2 = zero(A[i, 1]*B[1, j] + A[i, 1]*B[1, j])
102+
Ctmp = convert(promote_type(R, typeof(z2)), z2)
103+
for k in 1:size(A,2)
104+
Ctmp += A[i, k]*B[k, j]
105+
end
106+
C[i,j] = Ctmp
107+
end
108+
109+
return
110+
end
111+
112+
gpu_call(kernel, C, A, B)
113+
114+
C
115+
end
116+
117+
LinearAlgebra.mul!(C::AbstractGPUVecOrMat, A::AbstractGPUVecOrMat, B::AbstractGPUVecOrMat) = generic_matmatmul!(C, A, B)
118+
LinearAlgebra.mul!(C::AbstractGPUVecOrMat, A::AbstractGPUVecOrMat, B::LinearAlgebra.Adjoint{<:Any, <:AbstractGPUVecOrMat}) = generic_matmatmul!(C, A, B)
119+
LinearAlgebra.mul!(C::AbstractGPUVecOrMat, A::AbstractGPUVecOrMat, B::LinearAlgebra.Transpose{<:Any, <:AbstractGPUVecOrMat}) = generic_matmatmul!(C, A, B)
120+
LinearAlgebra.mul!(C::AbstractGPUVecOrMat, A::LinearAlgebra.Adjoint{<:Any, <:AbstractGPUVecOrMat}, B::AbstractGPUVecOrMat) = generic_matmatmul!(C, A, B)
121+
LinearAlgebra.mul!(C::AbstractGPUVecOrMat, A::LinearAlgebra.Transpose{<:Any, <:AbstractGPUVecOrMat}, B::AbstractGPUVecOrMat) = generic_matmatmul!(C, A, B)
122+
LinearAlgebra.mul!(C::AbstractGPUVecOrMat, A::LinearAlgebra.Transpose{<:Any, <:AbstractGPUVecOrMat}, B::LinearAlgebra.Adjoint{<:Any, <:AbstractGPUVecOrMat}) = generic_matmatmul!(C, A, B)
123+
LinearAlgebra.mul!(C::AbstractGPUVecOrMat, A::LinearAlgebra.Adjoint{<:Any, <:AbstractGPUVecOrMat}, B::LinearAlgebra.Transpose{<:Any, <:AbstractGPUVecOrMat}) = generic_matmatmul!(C, A, B)
124+
LinearAlgebra.mul!(C::AbstractGPUVecOrMat, A::LinearAlgebra.Adjoint{<:Any, <:AbstractGPUVecOrMat}, B::LinearAlgebra.Adjoint{<:Any, <:AbstractGPUVecOrMat}) = generic_matmatmul!(C, A, B)
125+
LinearAlgebra.mul!(C::AbstractGPUVecOrMat, A::LinearAlgebra.Transpose{<:Any, <:AbstractGPUVecOrMat}, B::LinearAlgebra.Transpose{<:Any, <:AbstractGPUVecOrMat}) = generic_matmatmul!(C, A, B)
126+
127+
function generic_rmul!(X::AbstractGPUArray, s::Number)
128+
function kernel(ctx, X, s)
129+
i = @linearidx X
130+
@inbounds X[i] *= s
131+
return
132+
end
133+
gpu_call(kernel, X, s)
134+
X
135+
end
136+
137+
LinearAlgebra.rmul!(A::AbstractGPUArray, b::Number) = generic_rmul!(A, b)
138+
139+
function generic_lmul!(s::Number, X::AbstractGPUArray)
140+
function kernel(ctx, X, s)
141+
i = @linearidx X
142+
@inbounds X[i] = s*X[i]
143+
return
144+
end
145+
gpu_call(kernel, X, s)
146+
X
147+
end
148+
149+
LinearAlgebra.lmul!(a::Number, B::AbstractGPUArray) = generic_lmul!(a, B)

src/host/math.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import Base.clamp!
33
function Base.clamp!(A::AbstractGPUArray, low, high)
44
function kernel(state, A, low, high)
55
I = @cartesianidx A state
6-
A[I...] = clamp(A[I...], low, high)
6+
A[I] = clamp(A[I], low, high)
77
return
88
end
99
gpu_call(kernel, A, low, high)

src/reference.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,11 +59,11 @@ function GPUArrays.gpu_call(::JLBackend, f, args...; blocks::Int, threads::Int)
5959
ctx = JLKernelContext(threads, blocks)
6060
device_args = to_device.(Ref(ctx), args)
6161
tasks = Array{Task}(undef, threads)
62-
for blockidx in 1:blocks
62+
@allowscalar for blockidx in 1:blocks
6363
ctx.blockidx = blockidx
6464
for threadidx in 1:threads
6565
thread_ctx = JLKernelContext(ctx, threadidx)
66-
tasks[threadidx] = @async @allowscalar f(thread_ctx, device_args...)
66+
tasks[threadidx] = @async f(thread_ctx, device_args...)
6767
# TODO: require 1.3 and use Base.Threads.@spawn for actual multithreading
6868
# (this would require a different synchronization mechanism)
6969
end

test/testsuite/base.jl

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
function cartesian_iter(state, res, A, Asize)
22
for i in CartesianIndices(Asize)
3-
idx = GPUArrays.gpu_sub2ind(Asize, i.I)
4-
res[idx] = A[idx]
3+
res[i] = A[i]
54
end
65
return
76
end

test/testsuite/linalg.jl

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,14 @@ function test_linalg(AT)
66
@test compare(transpose!, AT, Array{Float32}(undef, 32, 32), rand(Float32, 32, 32))
77
@test compare(transpose!, AT, Array{Float32}(undef, 128, 32), rand(Float32, 32, 128))
88
end
9-
9+
1010
@testset "copyto! for triangular" begin
11-
ga = Array{Float32}(undef, 128, 128)
11+
ga = Array{Float32}(undef, 128, 128)
1212
gb = AT{Float32}(undef, 128, 128)
1313
rand!(gb)
1414
copyto!(ga, UpperTriangular(gb))
1515
@test ga == Array(collect(UpperTriangular(gb)))
16-
ga = Array{Float32}(undef, 128, 128)
16+
ga = Array{Float32}(undef, 128, 128)
1717
gb = AT{Float32}(undef, 128, 128)
1818
rand!(gb)
1919
copyto!(ga, LowerTriangular(gb))
@@ -49,5 +49,44 @@ function test_linalg(AT)
4949
B = A + D
5050
@test collect(B) collect(A) + collect(D)
5151
end
52+
53+
@testset "$f! with diagonal $d" for (f, f!) in ((triu, triu!), (tril, tril!)),
54+
d in -2:2
55+
A = randn(10, 10)
56+
@test f(A, d) == Array(f!(AT(A), d))
57+
end
58+
59+
@testset "matrix multiplication" begin
60+
a = rand(Int8, 3, 3)
61+
b = rand(Int8, 3, 3)
62+
d_a = AT{Int8}(a)
63+
d_b = AT{Int8}(b)
64+
d_c = d_a*d_b
65+
@test collect(d_c) == a*b
66+
a = rand(Complex{Int8}, 3, 3)
67+
b = rand(Complex{Int8}, 3, 3)
68+
d_a = AT{Complex{Int8}}(a)
69+
d_b = AT{Complex{Int8}}(b)
70+
d_c = d_a'*d_b
71+
@test collect(d_c) == a'*b
72+
d_c = d_a*d_b'
73+
@test collect(d_c) == a*b'
74+
d_c = d_a'*d_b'
75+
@test collect(d_c) == a'*b'
76+
d_c = transpose(d_a)*d_b'
77+
@test collect(d_c) == transpose(a)*b'
78+
d_c = d_a'*transpose(d_b)
79+
@test collect(d_c) == a'*transpose(b)
80+
d_c = transpose(d_a)*d_b
81+
@test collect(d_c) == transpose(a)*b
82+
d_c = d_a*transpose(d_b)
83+
@test collect(d_c) == a*transpose(b)
84+
d_c = transpose(d_a)*transpose(d_b)
85+
@test collect(d_c) == transpose(a)*transpose(b)
86+
d_c = rmul!(copy(d_a), Complex{Int8}(2, 2))
87+
@test collect(d_c) == a*Complex{Int8}(2, 2)
88+
d_c = lmul!(Complex{Int8}(2, 2), copy(d_a))
89+
@test collect(d_c) == Complex{Int8}(2, 2)*a
90+
end
5291
end
5392
end

0 commit comments

Comments
 (0)