This repository was archived by the owner on Sep 27, 2021. It is now read-only.

Commit 9b85e14

get many more Knet tests working
1 parent d88c361 commit 9b85e14

File tree

7 files changed, +131 -87 lines

REQUIRE

Lines changed: 28 additions & 0 deletions
@@ -1 +1,29 @@
 julia 0.6
+CLFFT
+CLBLAS
+OpenCL
+Transpiler
+Sugar
+GPUArrays
+
+
+julia 0.6
+StaticArrays
+ColorTypes
+
+Transpiler 0.3
+Sugar 0.3
+Matcha 0.0.2
+
+CUDAnative 0.4.1 # llvm codegen fix
+CUDAdrv 0.5.1
+CUDArt 0.4.0 # for cuda c compiler support
+CUBLAS 0.2.0 # for CUDAdrv support
+CUFFT
+
+OpenCL 0.6.0 # proper packed conversion
+CLBLAS 1.1.0
+CLFFT 0.4.0 # 0.5.0
+
+Interpolations
+IterTools
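
Note: the REQUIRE format used above (Julia 0.6 era) is one package per line, optionally followed by a minimum version, with # starting a comment. A minimal illustrative snippet, not taken from this package:

julia 0.6        # minimum Julia version
GPUArrays        # any registered version is acceptable
Transpiler 0.3   # require at least v0.3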

src/CLArrays.jl

Lines changed: 1 addition & 1 deletion
@@ -11,13 +11,13 @@ function context end

 include("memory.jl")
 include("array.jl")
+include("ondevice.jl")
 include("device.jl")
 include("context.jl")
 include("intrinsics.jl")
 include("compilation.jl")
 include("mapreduce.jl")
 include("3rdparty.jl")
-include("device_funcs.jl")

 export CLArray

src/array.jl

Lines changed: 0 additions & 67 deletions
@@ -8,79 +8,12 @@ using Sugar: to_tuple
 import Base: pointer, similar, size, copy!, convert
 using Base: RefValue

-# pointer MUST be a type parameter, to make it easier to replace it with a non pointer type for host upload
 mutable struct CLArray{T, N} <: GPUArray{T, N}
     ptr::OwnedPtr{T}
     size::NTuple{N, Cuint}
 end


-struct HostPtr{T}
-    ptr::Int32
-    (::Type{HostPtr{T}})() where T = new{T}(Int32(0))
-end
-Base.eltype(::Type{HostPtr{T}}) where T = T
-
-struct DeviceArray{T, N, Ptr} <: AbstractArray{T, N}
-    ptr::Ptr
-    size::NTuple{N, Cuint}
-end
-const PreDeviceArray{T, N} = DeviceArray{T, N, HostPtr{T}}
-const OnDeviceArray{T, N} = DeviceArray{T, N, GlobalPointer{T}}
-
-kernel_convert(A::CLArray{T, N}) where {T, N} = PreDeviceArray{T, N}(HostPtr{T}(), A.size)
-predevice_type(::Type{OnDeviceArray{T, N}}) where {T, N} = PreDeviceArray{T, N}
-device_type(::CLArray{T, N}) where {T, N} = OnDeviceArray{T, N}
-reconstruct(x::PreDeviceArray{T, N}, ptr::GlobalPointer{T}) where {T, N} = OnDeviceArray{T, N}(ptr, x.size)
-
-kernel_convert(x::RefValue{T}) where T <: CLArray = RefValue(kernel_convert(x[]))
-predevice_type(::Type{RefValue{T}}) where T <: OnDeviceArray = RefValue{predevice_type(T)}
-device_type(x::RefValue{T}) where T <: CLArray = RefValue{device_type(x[])}
-reconstruct(x::RefValue{T}, ptr::GlobalPointer) where T <: PreDeviceArray = RefValue(reconstruct(x[], ptr))
-
-kernel_convert(x::Tuple) = kernel_convert.(x)
-predevice_type(::Type{T}) where T <: Tuple = Tuple{predevice_type.((T.parameters...))...}
-device_type(x::T) where T <: Tuple = Tuple{device_type.(x)...}
-
-@generated function reconstruct(x::Tuple, ptrs::GlobalPointer...)
-    ptrlist = to_tuple(ptrs)
-    tup = Expr(:tuple)
-    ptr_idx = 0
-    for (xi, T) in enumerate(to_tuple(x))
-        hasptr, fields = contains_pointer(T)
-        if hasptr
-            # consume the n pointers that T contains
-            ptr_args = ntuple(i-> :(ptrs[$(i + ptr_idx)]), length(fields))
-            ptr_idx += 1
-            push!(tup.args, :(reconstruct(x[$xi], $(ptr_args...))))
-        else
-            push!(tup.args, :(x[$xi]))
-        end
-    end
-    return tup
-end
-
-
-Base.size(x::OnDeviceArray) = x.size
-
-
-function Base.getindex(x::OnDeviceArray{T, N}, i::Vararg{Integer, N}) where {T, N}
-    ilin = gpu_sub2ind(size(x), Cuint.(i))
-    return x.ptr[ilin]
-end
-function Base.setindex!(x::OnDeviceArray{T, N}, val, i::Vararg{Integer, N}) where {T, N}
-    ilin = gpu_sub2ind(size(x), Cuint.(i))
-    x.ptr[ilin] = T(val)
-    return
-end
-function Base.setindex!(x::OnDeviceArray{T, N}, val, ilin::Integer) where {T, N}
-    x.ptr[ilin] = T(val)
-    return
-end
-
-function Base.getindex(x::OnDeviceArray, ilin::Integer)
-    return x.ptr[ilin]
-end
 # arguments are swapped to not override default constructor
 function (::Type{CLArray{T, N}})(size::NTuple{N, Integer}, ptr::OwnedPtr{T}) where {T, N}
     arr = CLArray{T, N}(ptr, size)
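
Note: the constructor kept here takes (size, ptr) rather than (ptr, size) on purpose; an outer constructor with the field order would replace the default constructor the struct definition already provides. A toy analogue of that pattern (Buf and its fields are made-up names, not CLArrays API):

# Default constructor is Buf{T}(ptr, len); the convenience method swaps the
# argument order so it adds a new method instead of shadowing the default.
mutable struct Buf{T}
    ptr::Ptr{T}
    len::Int
end

function (::Type{Buf{T}})(len::Integer, ptr::Ptr{T}) where T
    Buf{T}(ptr, Int(len))   # forwards to the default (ptr, len) constructor
end

# usage: Buf{Float32}(8, Ptr{Float32}(0)) and Buf{Float32}(Ptr{Float32}(0), 8) both work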

src/device_funcs.jl

Lines changed: 0 additions & 7 deletions
This file was deleted.

src/mapreduce.jl

Lines changed: 15 additions & 12 deletions
@@ -1,8 +1,13 @@
+import GPUArrays: acc_mapreduce
+using Transpiler.cli: get_local_id, get_global_id, barrier, CLK_LOCAL_MEM_FENCE
+using Transpiler.cli: get_local_size, get_global_size, get_group_id
+using GPUArrays: blockdim_x, blockidx_x, threadidx_x, synchronize, synchronize_threads, device
+
 for i = 0:10
     args = ntuple(x-> Symbol("arg_", x), i)
     fargs = ntuple(x-> :(broadcast_index($(args[x]), length, global_index)), i)
     @eval begin
-        function reduce_kernel(f, op, v0, A, tmp_local, length, result, $(args...))
+        function reduce_kernel(state, f, op, v0, A, tmp_local, length, result, $(args...))
             ui1 = Cuint(1)
             global_index = get_global_id(0) + ui1
             local_v0 = v0
@@ -14,21 +19,21 @@ for i = 0:10
             end

             # Perform parallel reduction
-            local_index = threadidx_x(A)
+            local_index = threadidx_x(state)
             tmp_local[local_index + ui1] = local_v0
-            synchronize_threads(A)
-            offset = blockdim_x(A) ÷ ui1
+            barrier(CLK_LOCAL_MEM_FENCE)
+            offset = blockdim_x(state) ÷ ui1
             while offset > 0
                 if (local_index < offset)
                     other = tmp_local[local_index + offset + ui1]
                     mine = tmp_local[local_index + ui1]
                     tmp_local[local_index + ui1] = op(mine, other)
                 end
-                synchronize_threads(A)
+                barrier(CLK_LOCAL_MEM_FENCE)
                 offset = offset ÷ Cuint(2)
             end
             if local_index == Cuint(0)
-                result[blockidx_x(A) + ui1] = tmp_local[1]
+                result[blockidx_x(state) + ui1] = tmp_local[1]
             end
             return
         end
@@ -38,16 +43,14 @@ end
 function acc_mapreduce{T, OT, N}(
         f, op, v0::OT, A::CLArray{T, N}, rest::Tuple
     )
-    dev = context(A).device
+    dev = device(A)
     block_size = 16
     group_size = ceil(Int, length(A) / block_size)
     out = similar(A, OT, (group_size,))
     fill!(out, v0)
     lmem = LocalMemory{OT}(block_size)
     args = (f, op, v0, A, lmem, Cuint(length(A)), out, rest...)
-
-    func = CLFunction(A, reduce_kernel, args...)
-    func(args, group_size * block_size, (block_size,))
-    x = reduce(op, Array(out))
-    x
+    gpu_call(reduce_kernel, A, args, (group_size * block_size,), (block_size,))
+    println(Array(out))
+    reduce(op, Array(out))
 end
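
Note: the kernel writes one partial result per work group into out, and acc_mapreduce finishes on the host with reduce(op, Array(out)). A plain-Julia sketch of that two-stage scheme, using illustrative names (two_stage_reduce, partials) rather than anything from the package:

# Stage 1: each "block" folds its slice into one partial result (the role of `out`).
# Stage 2: the partials are combined on the host, like reduce(op, Array(out)) above.
function two_stage_reduce(op, v0, data; block_size = 16)
    nblocks = ceil(Int, length(data) / block_size)
    partials = fill(v0, nblocks)
    for b in 1:nblocks
        lo = (b - 1) * block_size + 1
        hi = min(b * block_size, length(data))
        for i in lo:hi
            partials[b] = op(partials[b], data[i])   # block-local accumulation
        end
    end
    return reduce(op, partials)                      # final combine on the host
end

# e.g. two_stage_reduce(+, 0.0, collect(1.0:100.0)) == 5050.0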

src/ondevice.jl

Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
+import Base: setindex!, getindex, size, IndexStyle, next, done, start, sum, eltype
+using Base: IndexLinear
+"""
+Array type on the device
+"""
+struct DeviceArray{T, N, Ptr} <: AbstractArray{T, N}
+    ptr::Ptr
+    size::NTuple{N, Cuint}
+end
+# shaninagans for uploading CLArrays to OpenCL as a DeviceArray
+# (spoiler alert: they can't contain pointers while uploading, but can on the device)
+"""
+Dummy pointer type for inlining into structs that get uploaded to the GPU
+"""
+struct HostPtr{T}
+    ptr::Int32
+    (::Type{HostPtr{T}})() where T = new{T}(Int32(0))
+end
+eltype(::Type{HostPtr{T}}) where T = T
+const PreDeviceArray{T, N} = DeviceArray{T, N, HostPtr{T}} # Pointer free variant for kernel upload
+const OnDeviceArray{T, N} = DeviceArray{T, N, GlobalPointer{T}} # Variant on the device containing the correct pointer
+
+size(x::OnDeviceArray) = x.size
+IndexStyle(::OnDeviceArray) = IndexLinear()
+start(x::OnDeviceArray) = Cuint(1)
+next(x::OnDeviceArray, state::Cuint) = x[state], state + Cuint(1)
+done(x::OnDeviceArray, state::Cuint) = state > length(x)
+
+function getindex(x::OnDeviceArray{T, N}, i::Vararg{Integer, N}) where {T, N}
+    ilin = gpu_sub2ind(size(x), Cuint.(i))
+    return x.ptr[ilin]
+end
+function setindex!(x::OnDeviceArray{T, N}, val, i::Vararg{Integer, N}) where {T, N}
+    ilin = gpu_sub2ind(size(x), Cuint.(i))
+    x.ptr[ilin] = T(val)
+    return
+end
+function setindex!(x::OnDeviceArray{T, N}, val, ilin::Integer) where {T, N}
+    x.ptr[ilin] = T(val)
+    return
+end
+getindex(x::OnDeviceArray, ilin::Integer) = x.ptr[ilin]
+
+
+kernel_convert(A::CLArray{T, N}) where {T, N} = PreDeviceArray{T, N}(HostPtr{T}(), A.size)
+predevice_type(::Type{OnDeviceArray{T, N}}) where {T, N} = PreDeviceArray{T, N}
+device_type(::CLArray{T, N}) where {T, N} = OnDeviceArray{T, N}
+reconstruct(x::PreDeviceArray{T, N}, ptr::GlobalPointer{T}) where {T, N} = OnDeviceArray{T, N}(ptr, x.size)
+
+# some converts to inline CLArrays into tuples and refs
+kernel_convert(x::RefValue{T}) where T <: CLArray = RefValue(kernel_convert(x[]))
+predevice_type(::Type{RefValue{T}}) where T <: OnDeviceArray = RefValue{predevice_type(T)}
+device_type(x::RefValue{T}) where T <: CLArray = RefValue{device_type(x[])}
+reconstruct(x::RefValue{T}, ptr::GlobalPointer) where T <: PreDeviceArray = RefValue(reconstruct(x[], ptr))
+
+kernel_convert(x::Tuple) = kernel_convert.(x)
+predevice_type(::Type{T}) where T <: Tuple = Tuple{predevice_type.((T.parameters...))...}
+device_type(x::T) where T <: Tuple = Tuple{device_type.(x)...}
+
+@generated function reconstruct(x::Tuple, ptrs::GlobalPointer...)
+    ptrlist = to_tuple(ptrs)
+    tup = Expr(:tuple)
+    ptr_idx = 0
+    for (xi, T) in enumerate(to_tuple(x))
+        hasptr, fields = contains_pointer(T)
+        if hasptr
+            # consume the n pointers that T contains
+            ptr_args = ntuple(i-> :(ptrs[$(i + ptr_idx)]), length(fields))
+            ptr_idx += 1
+            push!(tup.args, :(reconstruct(x[$xi], $(ptr_args...))))
+        else
+            push!(tup.args, :(x[$xi]))
+        end
+    end
+    return tup
+end
+
+
+
+function sum(A::CLArrays.DeviceArray{T}) where T
+    acc = zero(T)
+    for elem in A
+        acc += elem
+    end
+    acc
+end
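
Note: kernel_convert, predevice_type, device_type, and reconstruct form a round trip: the host-side CLArray is turned into a pointer-free PreDeviceArray for upload, and the real device pointer is spliced back in to obtain an OnDeviceArray. A toy, CPU-only analogue of that round trip (ToyArray, ToyPreDevice, ToyOnDevice are made-up names, not CLArrays API):

struct ToyArray{T}          # host handle that owns a raw pointer
    ptr::Ptr{T}
    len::Int
end
struct ToyPreDevice{T}      # pointer-free stand-in that is safe to upload
    len::Int
end
struct ToyOnDevice{T}       # device-side view with the device pointer attached
    ptr::Ptr{T}
    len::Int
end

toy_kernel_convert(A::ToyArray{T}) where T = ToyPreDevice{T}(A.len)
toy_reconstruct(x::ToyPreDevice{T}, devptr::Ptr{T}) where T = ToyOnDevice{T}(devptr, x.len)

# strip the pointer before "upload", re-attach the device pointer afterwards
host = ToyArray{Float32}(Ptr{Float32}(0), 8)
pre  = toy_kernel_convert(host)
dev  = toy_reconstruct(pre, Ptr{Float32}(0))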

test/runtests.jl

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
 using CLArrays
 using GPUArrays.TestSuite, Base.Test
+
 TestSuite.run_tests(CLArray)

0 commit comments
