Skip to content
This repository was archived by the owner on Sep 27, 2021. It is now read-only.

Commit 5fec458

Browse files
committed
bug fixes, fixes for reduce
1 parent ec3d9ec commit 5fec458

File tree

7 files changed

+46
-71
lines changed

7 files changed

+46
-71
lines changed

src/CLArrays.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,6 @@ include("compilation.jl")
1919
include("mapreduce.jl")
2020
include("3rdparty.jl")
2121

22-
export CLArray
22+
export CLArray, gpu_call
2323

2424
end # module

src/array.jl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ function (::Type{CLArray{T, N}})(size::NTuple{N, Integer}, ctx = global_context(
3434
CLArray{clT, N}(size, ptr)
3535
end
3636

37+
raw_print(msg::AbstractString...) =
38+
ccall(:write, Cssize_t, (Cint, Cstring, Csize_t), 1, join(msg), length(join(msg)))
3739

3840
similar(::Type{<: CLArray}, ::Type{T}, size::Base.Dims{N}) where {T, N} = CLArray{T, N}(size)
3941

@@ -42,6 +44,7 @@ function unsafe_free!(a::CLArray)
4244
ctxid = context(ptr).id
4345
if cl.is_ctx_id_alive(ctxid) && ctxid != C_NULL
4446
Mem.free(ptr)
47+
Mem.current_allocated_mem[] -= sizeof(eltype(a)) * length(a)
4548
end
4649
#TODO logging that we don't free since context is not alive
4750
end

src/compilation.jl

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,15 @@ function assemble_kernel(m::CLMethod)
161161
kernel_ptrs = []
162162
body = Expr(:block)
163163
nargs = method_nargs(m)
164+
# declare rest of slots
165+
for (i, (T, name)) in enumerate(getslots!(m)[nargs+1:end])
166+
slot = TypedSlot(i + nargs, T)
167+
push!(m.decls, slot)
168+
push!(m, T)
169+
tmp = :($name::$T)
170+
tmp.typ = T
171+
push!(body.args, tmp)
172+
end
164173
st = getslots!(m)[2:nargs] # don't include self
165174
arg_idx = 1
166175
ptr_extract = []
@@ -196,6 +205,7 @@ function assemble_kernel(m::CLMethod)
196205
push!(kernel_args, :($argslot::$T))
197206
end
198207
end
208+
199209
append!(kernel_args, kernel_ptrs)
200210
real_body = _getast(m)
201211
body.typ = real_body.typ # use real type

src/intrinsics.jl

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
using Transpiler.cli: get_local_id, get_global_id, barrier, CLK_LOCAL_MEM_FENCE
2-
using Transpiler.cli: get_local_size, get_global_size, get_group_id
3-
import GPUArrays: synchronize, synchronize_threads, device
2+
using Transpiler.cli: get_local_size, get_global_size, get_group_id, get_num_groups
3+
import GPUArrays: synchronize, synchronize_threads, device, global_size, linear_index
44
#synchronize
55
function synchronize(x::CLArray)
6-
cl.finish(context(x).queue) # TODO figure out the diverse ways of synchronization
6+
cl.finish(global_queue(x)) # TODO figure out the diverse ways of synchronization
77
end
88

99

@@ -12,12 +12,14 @@ immutable KernelState
1212
KernelState() = new(Int32(0))
1313
end
1414

15-
for (f, fcl, isidx) in (
16-
(:blockidx, get_group_id, true),
17-
(:blockdim, get_local_size, false),
18-
(:threadidx, get_local_id, true)
19-
)
20-
for (i, sym) in enumerate((:x, :y, :z))
15+
for (i, sym) in enumerate((:x, :y, :z))
16+
for (f, fcl, isidx) in (
17+
(:blockidx, get_group_id, true),
18+
(:blockdim, get_local_size, false),
19+
(:threadidx, get_local_id, true),
20+
(:griddim, get_num_groups, false)
21+
)
22+
2123
fname = Symbol(string(f, '_', sym))
2224
if isidx
2325
@eval GPUArrays.$fname(::KernelState)::Cuint = $fcl($(i-1)) + Cuint(1)
@@ -27,4 +29,8 @@ for (f, fcl, isidx) in (
2729
end
2830
end
2931

32+
global_size(state::KernelState) = get_global_size(0)
33+
linear_index(state::KernelState) = get_global_id(0) + Cuint(1)
34+
35+
3036
synchronize_threads(::KernelState) = cli.barrier(CLK_LOCAL_MEM_FENCE)

src/mapreduce.jl

Lines changed: 0 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,56 +0,0 @@
1-
import GPUArrays: acc_mapreduce
2-
using Transpiler.cli: get_local_id, get_global_id, barrier, CLK_LOCAL_MEM_FENCE
3-
using Transpiler.cli: get_local_size, get_global_size, get_group_id
4-
using GPUArrays: blockdim_x, blockidx_x, threadidx_x, synchronize, synchronize_threads, device
5-
6-
for i = 0:10
7-
args = ntuple(x-> Symbol("arg_", x), i)
8-
fargs = ntuple(x-> :(broadcast_index($(args[x]), length, global_index)), i)
9-
@eval begin
10-
function reduce_kernel(state, f, op, v0, A, tmp_local, length, result, $(args...))
11-
ui1 = Cuint(1)
12-
global_index = get_global_id(0) + ui1
13-
local_v0 = v0
14-
# Loop sequentially over chunks of input vector
15-
while (global_index <= length)
16-
element = f(A[global_index], $(fargs...))
17-
local_v0 = op(local_v0, element)
18-
global_index += get_global_size(0)
19-
end
20-
21-
# Perform parallel reduction
22-
local_index = threadidx_x(state)
23-
tmp_local[local_index + ui1] = local_v0
24-
barrier(CLK_LOCAL_MEM_FENCE)
25-
offset = blockdim_x(state) ÷ ui1
26-
while offset > 0
27-
if (local_index < offset)
28-
other = tmp_local[local_index + offset + ui1]
29-
mine = tmp_local[local_index + ui1]
30-
tmp_local[local_index + ui1] = op(mine, other)
31-
end
32-
barrier(CLK_LOCAL_MEM_FENCE)
33-
offset = offset ÷ Cuint(2)
34-
end
35-
if local_index == Cuint(0)
36-
result[blockidx_x(state) + ui1] = tmp_local[1]
37-
end
38-
return
39-
end
40-
end
41-
end
42-
43-
function acc_mapreduce{T, OT, N}(
44-
f, op, v0::OT, A::CLArray{T, N}, rest::Tuple
45-
)
46-
dev = device(A)
47-
block_size = 16
48-
group_size = ceil(Int, length(A) / block_size)
49-
out = similar(A, OT, (group_size,))
50-
fill!(out, v0)
51-
lmem = LocalMemory{OT}(block_size)
52-
args = (f, op, v0, A, lmem, Cuint(length(A)), out, rest...)
53-
gpu_call(reduce_kernel, A, args, (group_size * block_size,), (block_size,))
54-
println(Array(out))
55-
reduce(op, Array(out))
56-
end

src/ondevice.jl

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,20 +26,22 @@ start(x::OnDeviceArray) = Cuint(1)
2626
next(x::OnDeviceArray, state::Cuint) = x[state], state + Cuint(1)
2727
done(x::OnDeviceArray, state::Cuint) = state > length(x)
2828

29+
getindex(x::OnDeviceArray, ilin::Integer) = x.ptr[ilin]
2930
function getindex(x::OnDeviceArray{T, N}, i::Vararg{Integer, N}) where {T, N}
3031
ilin = gpu_sub2ind(size(x), Cuint.(i))
3132
return x.ptr[ilin]
3233
end
33-
function setindex!(x::OnDeviceArray{T, N}, val, i::Vararg{Integer, N}) where {T, N}
34-
ilin = gpu_sub2ind(size(x), Cuint.(i))
34+
function setindex!(x::OnDeviceArray{T, N}, val, ilin::Integer) where {T, N}
3535
x.ptr[ilin] = T(val)
3636
return
3737
end
38-
function setindex!(x::OnDeviceArray{T, N}, val, ilin::Integer) where {T, N}
38+
39+
function setindex!(x::OnDeviceArray{T, N}, val, i::Vararg{Integer, N}) where {T, N}
40+
ilin = gpu_sub2ind(size(x), Cuint.(i))
3941
x.ptr[ilin] = T(val)
4042
return
4143
end
42-
getindex(x::OnDeviceArray, ilin::Integer) = x.ptr[ilin]
44+
4345

4446

4547
kernel_convert(A::CLArray{T, N}) where {T, N} = PreDeviceArray{T, N}(HostPtr{T}(), A.size)

test/runtests.jl

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,14 @@
11
using CLArrays
22
using GPUArrays.TestSuite, Base.Test
3-
3+
using GPUArrays: global_size
4+
using CUDAnative, CUDAdrv
45
TestSuite.run_tests(CLArray)
6+
7+
using CLArrays
8+
9+
x = CLArray(rand(Float32, 10))
10+
11+
GPUArrays.gpu_call(x, (x,)) do state, l
12+
l[1] = 1f0 ^ 1.0
13+
return
14+
end

0 commit comments

Comments (0)