Commit a0319f4 ("cosmetic changes", parent a101a4c)

14 files changed, +59/-59 lines. The commit replaces the C type alias `Cuint` with the equivalent concrete Julia type `UInt32` throughout (in Julia, `const Cuint = UInt32`, so behavior is unchanged) and fixes the recurring "lunch" -> "launch" typo in docstrings and error messages.

src/abstract_gpu_interface.jl

Lines changed: 6 additions & 6 deletions
@@ -5,7 +5,7 @@ Uses CUDA like names
 for sym in (:x, :y, :z)
     for f in (:blockidx, :blockdim, :threadidx, :griddim)
         fname = Symbol(string(f, '_', sym))
-        @eval $fname(state)::Cuint = error("Not implemented")
+        @eval $fname(state)::UInt32 = error("Not implemented")
         @eval export $fname
     end
 end
@@ -23,7 +23,7 @@ end
 linear index in a GPU kernel (equal to OpenCL.get_global_id)
 """
 @inline function linear_index(state)
-    Cuint((blockidx_x(state) - Cuint(1)) * blockdim_x(state) + threadidx_x(state))
+    UInt32((blockidx_x(state) - UInt32(1)) * blockdim_x(state) + threadidx_x(state))
 end
 @inline function global_size(state)
     griddim_x(state) * blockdim_x(state)
@@ -72,9 +72,9 @@ Calls function `f` on the GPU.
 and supplies queues and contexts.
 Calls kernel with `kernel(state, args...)`, where state is dependant on the backend
 and can be used for e.g getting an index into A with `linear_index(state)`.
-Optionally, lunch configuration can be supplied in the following way:
+Optionally, launch configuration can be supplied in the following way:
 
-1) A single integer, indicating how many work items (total number of threads) you want to lunch.
+1) A single integer, indicating how many work items (total number of threads) you want to launch.
     in this case `linear_index(state)` will be a number in the range 1:configuration
 2) Pass a tuple of integer tuples to define blocks and threads per blocks!
@@ -94,10 +94,10 @@ function gpu_call(f, A::GPUArray, args::Tuple, configuration = length(A))
         end
         map(x-> Int.(x), configuration) # make sure it all has the same int type
     else
-        error("""Please lunch a gpu kernel with a valid configuration.
+        error("""Please launch a gpu kernel with a valid configuration.
             Found: $configurations
             Configuration needs to be:
-            1) A single integer, indicating how many work items (total number of threads) you want to lunch.
+            1) A single integer, indicating how many work items (total number of threads) you want to launch.
                 in this case `linear_index(state)` will be a number in the range 1:configuration
             2) Pass a tuple of integer tuples to define blocks and threads per blocks!
             `linear_index` will be inbetween 1:prod((blocks..., threads...))
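
The docstring above fully determines the calling convention, so a usage sketch follows directly from it. The kernel name and body here are illustrative assumptions; only `gpu_call`, `linear_index`, and the argument layout come from the code above:

    # Hypothetical kernel following the convention kernel(state, args...):
    function add_one_kernel(state, a, n)
        i = linear_index(state)   # UInt32 in 1:configuration, per the docstring
        i > n && return           # guard, in case more threads run than elements
        @inbounds a[i] += 1f0
        return
    end

    # Launch with a single integer configuration (one work item per element):
    # gpu_call(add_one_kernel, A, (A, UInt32(length(A))), length(A))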

src/abstractarray.jl

Lines changed: 4 additions & 4 deletions
@@ -143,12 +143,12 @@ function copy!{T, N}(
         throw(DimensionMismatch("Ranges don't match their size. Found: $shape, $(size(srccrange))"))
     end
     len = length(destcrange)
-    dest_offsets = Cuint.(destcrange.start.I .- 1)
-    src_offsets = Cuint.(srccrange.start.I .- 1)
-    ui_shape = Cuint.(shape)
+    dest_offsets = UInt32.(destcrange.start.I .- 1)
+    src_offsets = UInt32.(srccrange.start.I .- 1)
+    ui_shape = UInt32.(shape)
     gpu_call(
         copy_kernel!, dest,
-        (dest, dest_offsets, src, src_offsets, ui_shape, Cuint.(size(dest)), Cuint.(size(src)), Cuint(len)),
+        (dest, dest_offsets, src, src_offsets, ui_shape, UInt32.(size(dest)), UInt32.(size(src)), UInt32(len)),
         len
     )
     dest
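
The body of `copy_kernel!` is not part of this diff; what the offsets encode is the usual shifted-range copy. A plain-Julia sketch for the 2-d case, with hypothetical names:

    # dest_offsets/src_offsets are the 0-based starts of the two ranges,
    # and shape is the common extent being copied.
    function copy_range_sketch!(dest, dest_offsets, src, src_offsets, shape)
        for j in 1:shape[2], i in 1:shape[1]
            dest[i + dest_offsets[1], j + dest_offsets[2]] =
                src[i + src_offsets[1], j + src_offsets[2]]
        end
        dest
    end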

src/base.jl

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ map!(f, y::GPUArray, x1::GPUArray, x2::GPUArray) =
 # end
 #
 # function _cat(dim, dest, xs...)
-#     gpu_call(dest, (Cuint(dim), dest, xs)) do state, dim, dest, xs
+#     gpu_call(dest, (UInt32(dim), dest, xs)) do state, dim, dest, xs
 #         I = @cartesianidx dest state
 #         nI = catindex(dim, I, size.(xs))
 #         n = nI[1]; I′ = nI[2]

src/broadcast.jl

Lines changed: 15 additions & 15 deletions
@@ -19,12 +19,12 @@ end
 end
 
 function broadcast!(f, A::GPUArray)
-    gpu_call(const_kernel, A, (A, f, Cuint(length(A))))
+    gpu_call(const_kernel, A, (A, f, UInt32(length(A))))
     A
 end
 function broadcast!(f::typeof(identity), A::GPUArray, val::Number)
     valconv = convert(eltype(A), val)
-    gpu_call(const_kernel2, A, (A, valconv, Cuint(length(A))))
+    gpu_call(const_kernel2, A, (A, valconv, UInt32(length(A))))
     A
 end
 @inline function broadcast_t(f, T::Type{Bool}, shape, it, A::GPUArrays.GPUArray, Bs::Vararg{Any,N}) where N
@@ -81,48 +81,48 @@ function _broadcast!(
     A::AT, Bs::BT, ::Type{Val{N}}, unused2 # we don't need those arguments
 ) where {N, K, ID, AT, BT}
 
-    shape = Cuint.(size(out))
+    shape = UInt32.(size(out))
     args = (A, Bs...)
     descriptor_tuple = ntuple(length(args)) do i
         BroadcastDescriptor(args[i], keeps[i], Idefaults[i])
     end
-    gpu_call(broadcast_kernel!, out, (func, out, shape, Cuint(length(out)), descriptor_tuple, A, deref.(Bs)...))
+    gpu_call(broadcast_kernel!, out, (func, out, shape, UInt32(length(out)), descriptor_tuple, A, deref.(Bs)...))
     out
 end
 
 
 
 function Base.foreach(func, over::GPUArray, Bs...)
-    shape = Cuint.(size(over))
+    shape = UInt32.(size(over))
     keeps, Idefaults = map_newindexer(shape, over, Bs)
     args = (over, Bs...)
     descriptor_tuple = ntuple(length(args)) do i
         BroadcastDescriptor(args[i], keeps[i], Idefaults[i])
     end
-    gpu_call(foreach_kernel, over, (func, shape, Cuint.(length(over)), descriptor_tuple, over, deref.(Bs)...))
+    gpu_call(foreach_kernel, over, (func, shape, UInt32.(length(over)), descriptor_tuple, over, deref.(Bs)...))
     return
 end
 
 
-arg_length(x::Tuple) = (Cuint(length(x)),)
-arg_length(x::GPUArray) = Cuint.(size(x))
+arg_length(x::Tuple) = (UInt32(length(x)),)
+arg_length(x::GPUArray) = UInt32.(size(x))
 arg_length(x) = ()
 
 abstract type BroadcastDescriptor{Typ} end
 
 immutable BroadcastDescriptorN{Typ, N} <: BroadcastDescriptor{Typ}
-    size::NTuple{N, Cuint}
-    keep::NTuple{N, Cuint}
-    idefault::NTuple{N, Cuint}
+    size::NTuple{N, UInt32}
+    keep::NTuple{N, UInt32}
+    idefault::NTuple{N, UInt32}
 end
 function BroadcastDescriptor(val::RefValue, keep, idefault)
-    BroadcastDescriptorN{Tuple, 1}((Cuint(1),), (Cuint(0),), (Cuint(1),))
+    BroadcastDescriptorN{Tuple, 1}((UInt32(1),), (UInt32(0),), (UInt32(1),))
 end
 
 function BroadcastDescriptor(val, keep, idefault)
     N = length(keep)
     typ = Broadcast.containertype(val)
-    BroadcastDescriptorN{typ, N}(arg_length(val), Cuint.(keep), Cuint.(idefault))
+    BroadcastDescriptorN{typ, N}(arg_length(val), UInt32.(keep), UInt32.(idefault))
 end
 
 @propagate_inbounds @inline function _broadcast_getindex(
@@ -208,11 +208,11 @@ for N = 0:10
 end
 
 function mapidx{N}(f, A::GPUArray, args::NTuple{N, Any})
-    gpu_call(mapidx_kernel, A, (f, A, Cuint(length(A)), args...))
+    gpu_call(mapidx_kernel, A, (f, A, UInt32(length(A)), args...))
 end
 
 # don't do anything for empty tuples
-@pure newindex(I, ilin, keep::Tuple{}, Idefault::Tuple{}, size::Tuple{}) = Cuint(1)
+@pure newindex(I, ilin, keep::Tuple{}, Idefault::Tuple{}, size::Tuple{}) = UInt32(1)
 
 # optimize for 1D arrays
 @pure function newindex(I::NTuple{1}, ilin, keep::NTuple{1}, Idefault, size)
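
The `keep`/`idefault` tuples stored in each descriptor drive the classic broadcast indexing rule: along dimensions where an argument has extent 1, its index is pinned to the default. A plain-Julia sketch of that rule (the general N-d `newindex` methods fall outside this hunk):

    # keep[d] == 1 passes the output index through; keep[d] == 0 pins it
    # to the default (the argument is broadcast along that dimension).
    newindex_sketch(I::Tuple, keep::Tuple, idefault::Tuple) =
        map((i, k, d) -> k == 1 ? i : d, I, keep, idefault)

    # Broadcasting a 1×4 row into a 3×4 output: dimension 1 is not kept.
    # newindex_sketch((2, 3), (0, 1), (1, 1)) == (1, 3)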

src/construction.jl

Lines changed: 3 additions & 3 deletions
@@ -11,7 +11,7 @@ end
 
 function fill!{T, N}(A::GPUArray{T, N}, val)
     valconv = T(val)
-    gpu_call(const_kernel2, A, (A, valconv, Cuint(length(A))))
+    gpu_call(const_kernel2, A, (A, valconv, UInt32(length(A))))
     A
 end
 
@@ -21,7 +21,7 @@ ones(T::Type{<: GPUArray}, dims::NTuple{N, Integer}) where N = fill(T, one(eltyp
 function eyekernel(state, res::AbstractArray{T}, stride) where T
     i = linear_index(state)
     i > stride && return
-    ilin = (stride * (i - Cuint(1))) + i
+    ilin = (stride * (i - UInt32(1))) + i
     @inbounds res[ilin] = one(T)
     return
 end
@@ -30,7 +30,7 @@ eye(T::Type{<: GPUArray}, i1::Integer) = eye(T, (i1, i1))
 eye(T::Type{<: GPUArray}, i1::Integer, i2::Integer) = eye(T, (i1, i2))
 function eye(T::Type{<: GPUArray}, dims::NTuple{2, Integer})
     res = zeros(T, dims)
-    gpu_call(eyekernel, res, (res, Cuint(size(res, 1))), minimum(dims))
+    gpu_call(eyekernel, res, (res, UInt32(size(res, 1))), minimum(dims))
     res
 end
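
The index arithmetic in `eyekernel` is worth spelling out: in a column-major matrix with leading dimension `stride`, the diagonal entry (i, i) sits at linear index stride*(i - 1) + i. For example, i = 2 in a 4×4 matrix gives 4*1 + 2 = 6, the position of (2, 2). A serial sketch of the same loop:

    # Serial equivalent of eyekernel: one iteration per launched work item.
    function eye_sketch(n, m)
        res = zeros(Float32, n, m)
        stride = size(res, 1)
        for i in 1:min(n, m)                  # gpu_call launches minimum(dims) items
            res[stride * (i - 1) + i] = 1f0   # linear index of the (i, i) entry
        end
        res
    end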

src/convolution.jl

Lines changed: 4 additions & 4 deletions
@@ -7,7 +7,7 @@
 #     ::Val{BLOCK_SIZE},
 #     ::Val{LOCAL_WIDTH}
 # ) where {T, BLOCK_SIZE, LOCAL_WIDTH}
-#     ui1 = Cuint(1); ui0 = Cuint(0)
+#     ui1 = UInt32(1); ui0 = UInt32(0)
 #     w = kernel_width
 #     wBy2 = w >> ui1 #w divided by 2
 #     #Goes up to 15x15 filters
@@ -54,8 +54,8 @@ function convolution_kernel(state, A::AbstractArray{T}, out, K, Asize, Ksize) wh
     end
     accum = zero(T)
     kw, kh = Ksize[1], Ksize[2]
-    for ix = Cuint(0):(kw - Cuint(1))
-        for jy = Cuint(0):(kh - Cuint(1))
+    for ix = UInt32(0):(kw - UInt32(1))
+        for jy = UInt32(0):(kh - UInt32(1))
             temp = A[gpu_sub2ind(Asize, idx .+ (ix, jy))]
             accum += temp * K[ix + kw*jy + 1]
         end
@@ -66,7 +66,7 @@ end
 
 
 function convolution!(a, out, k)
-    gpu_call(convolution_kernel, a, (a, out, k, Cuint.(size(a)), Cuint.(size(k))))
+    gpu_call(convolution_kernel, a, (a, out, k, UInt32.(size(a)), UInt32.(size(k))))
     GPUArrays.synchronize(out)
     out
 end
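
For reference, the accumulation in `convolution_kernel` is a "valid"-region cross-correlation: the filter is not flipped, and `K` is addressed by the flat column-major index ix + kw*jy + 1. A serial sketch covering only the interior, since the kernel's bounds handling falls outside this hunk:

    function convolve_sketch(A, K)
        kw, kh = size(K)
        out = zeros(eltype(A), size(A, 1) - kw + 1, size(A, 2) - kh + 1)
        for j in 1:size(out, 2), i in 1:size(out, 1)
            acc = zero(eltype(A))
            for jy in 0:kh - 1, ix in 0:kw - 1
                acc += A[i + ix, j + jy] * K[ix + kw * jy + 1]  # same flat index as the kernel
            end
            out[i, j] = acc
        end
        out
    end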

src/indexing.jl

Lines changed: 2 additions & 2 deletions
@@ -38,7 +38,7 @@ Base.setindex!(xs::GPUArray, v, i::Integer) = xs[i] = convert(eltype(xs), v)
 using Base.Cartesian
 to_index(a, x) = x
 to_index(::A, x::Array{ET}) where {A, ET} = copy!(similar(A, ET, size(x)), x)
-to_index(a, x::UnitRange{<: Integer}) = convert(UnitRange{Cuint}, x)
+to_index(a, x::UnitRange{<: Integer}) = convert(UnitRange{UInt32}, x)
 to_index(a, x::Base.LogicalIndex) = error("Logical indexing not implemented")
 
 @generated function index_kernel(state, dest::AbstractArray, src::AbstractArray, idims, Is)
@@ -59,6 +59,6 @@ function Base._unsafe_getindex!(dest::GPUArray, src::GPUArray, Is::Union{Real, A
         return dest
     end
     idims = map(length, Is)
-    gpu_call(index_kernel, dest, (dest, src, Cuint.(idims), map(x-> to_index(dest, x), Is)))
+    gpu_call(index_kernel, dest, (dest, src, UInt32.(idims), map(x-> to_index(dest, x), Is)))
     return dest
 end
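
`to_index` narrows integer ranges to `UnitRange{UInt32}` so the generated `index_kernel` can gather with 32-bit indices. The gather itself, sketched serially for a single range index:

    # dest[k] = src[I[k]] for every position k of the requested range.
    function gather_sketch(src::Vector, I::UnitRange)
        dest = similar(src, length(I))
        for k in 1:length(I)
            @inbounds dest[k] = src[I[k]]
        end
        dest
    end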

src/jlbackend.jl

Lines changed: 1 addition & 1 deletion
@@ -149,7 +149,7 @@ end
 for (i, sym) in enumerate((:x, :y, :z))
     for f in (:blockidx, :blockdim, :threadidx, :griddim)
         fname = Symbol(string(f, '_', sym))
-        @eval $fname(state::JLState) = Cuint(state.$f[$i])
+        @eval $fname(state::JLState) = UInt32(state.$f[$i])
     end
 end
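For reference, the first iteration of this `@eval` loop (f = :blockidx, sym = :x, i = 1) expands to:

    blockidx_x(state::JLState) = UInt32(state.blockidx[1])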

src/linalg.jl

Lines changed: 7 additions & 7 deletions
@@ -27,7 +27,7 @@ function transpose_blocks!(
     state, odata::AbstractArray{T}, idata, ::Val{SHMEM}, ::Val{TDIM}, ::Val{BLOCK_ROWS}, ::Val{NROW}
 ) where {T, SHMEM, TDIM, BLOCK_ROWS, NROW}
 
-    ui1 = Cuint(1)
+    ui1 = UInt32(1)
     tile = @LocalMemory(state, T, SHMEM)
     bidx_x = blockidx_x(state) - ui1
     bidx_y = blockidx_y(state) - ui1
@@ -38,15 +38,15 @@ function transpose_blocks!(
     y = bidx_y * TDIM + tidx_y + ui1
     dims = size(idata)
 
-    (x <= dims[2] && (y + (BLOCK_ROWS * Cuint(3))) <= dims[1]) || return
+    (x <= dims[2] && (y + (BLOCK_ROWS * UInt32(3))) <= dims[1]) || return
 
-    for j = Cuint(0):Cuint(3)
+    for j = UInt32(0):UInt32(3)
         j0 = j * BLOCK_ROWS
         tile[tidx_x + ui1, tidx_y + j0 + ui1] = idata[y + j0, x]
     end
 
     synchronize_threads(state)
-    for j = Cuint(0):Cuint(3)
+    for j = UInt32(0):UInt32(3)
         j0 = j * BLOCK_ROWS
         odata[x, y + j0] = tile[tidx_x + ui1, tidx_y + j0 + ui1]
     end
@@ -56,9 +56,9 @@ end
 function transpose!{T}(At::GPUArray{T, 2}, A::GPUArray{T, 2})
     if size(A, 1) == size(A, 2) && all(x-> x % 32 == 0, size(A))
         outsize = UInt32.(size(At))
-        TDIM = Cuint(32); BLOCK_ROWS = Cuint(8)
+        TDIM = UInt32(32); BLOCK_ROWS = UInt32(8)
         nrows = TDIM ÷ BLOCK_ROWS
-        shmemdim = (TDIM, (TDIM + Cuint(1)))
+        shmemdim = (TDIM, (TDIM + UInt32(1)))
         static_params = map(x-> Val{x}(), (shmemdim, TDIM, BLOCK_ROWS, nrows))
         args = (At, A, static_params...)
 
@@ -82,7 +82,7 @@ function genperm(I::NTuple{N}, perm::NTuple{N}) where N
 end
 
 function permutedims!(dest::GPUArray, src::GPUArray, perm)
-    perm = Cuint.((perm...,))
+    perm = UInt32.((perm...,))
     gpu_call(dest, (dest, src, perm)) do state, dest, src, perm
         I = @cartesianidx dest state
         @inbounds dest[I...] = src[genperm(I, perm)...]
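
The body of `genperm` is outside this hunk; a plain-Julia sketch of the index permutation it presumably performs, so that `dest[I...] = src[genperm(I, perm)...]` realizes `permutedims`:

    # Destination index I reads from src at the permuted index I[perm].
    genperm_sketch(I::NTuple{N}, perm::NTuple{N}) where N = ntuple(d -> I[perm[d]], N)

    # With perm = (2, 1) this is a plain transpose of the index:
    # genperm_sketch((3, 5), (2, 1)) == (5, 3)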

src/mapreduce.jl

Lines changed: 7 additions & 7 deletions
@@ -4,9 +4,9 @@ import Base: any, count, countnz
 # reduce
 # functions in base implemented with a direct loop need to be overloaded to use mapreduce
 any(pred, A::GPUArray) = Bool(mapreduce(pred, |, Cint(0), A))
-count(pred, A::GPUArray) = Int(mapreduce(pred, +, Cuint(0), A))
-countnz(A::GPUArray) = Int(mapreduce(x-> x != 0, +, Cuint(0), A))
-countnz(A::GPUArray, dim) = Int(mapreducedim(x-> x != 0, +, Cuint(0), A, dim))
+count(pred, A::GPUArray) = Int(mapreduce(pred, +, UInt32(0), A))
+countnz(A::GPUArray) = Int(mapreduce(x-> x != 0, +, UInt32(0), A))
+countnz(A::GPUArray, dim) = Int(mapreducedim(x-> x != 0, +, UInt32(0), A, dim))
 
 
 # hack to get around of fetching the first element of the GPUArray
@@ -49,10 +49,10 @@ end
 
 
 function mapreducedim_kernel(state, f, op, R::AbstractArray{T1, N}, A::AbstractArray{T, N}, slice_size, sizeA, dim) where {T1, T, N}
-    ilin = Cuint(linear_index(state))
+    ilin = UInt32(linear_index(state))
     ilin > length(R) && return
     accum = zero(T1)
-    @inbounds for i = Cuint(1):slice_size
+    @inbounds for i = UInt32(1):slice_size
         idx = N == dim ? (ilin, i) : (i, ilin)
         i2d = gpu_sub2ind(sizeA, idx)
         accum = op(accum, f(A[i2d]))
@@ -70,7 +70,7 @@ function Base._mapreducedim!(f, op, R::GPUArray, A::GPUArray)
     @assert count(x-> x == 1, sizeR) == (ndims(R) - 1) "Not implemented"
     dim = findfirst(x-> x == 1, sizeR)
     slice_size = size(A, dim)
-    gpu_call(mapreducedim_kernel, R, (f, op, R, A, Cuint(slice_size), Cuint.(size(A)), Cuint(dim)))
+    gpu_call(mapreducedim_kernel, R, (f, op, R, A, UInt32(slice_size), UInt32.(size(A)), UInt32(dim)))
     return R
 end
 
@@ -80,7 +80,7 @@ for i = 0:10
     @eval begin
         # http://developer.amd.com/resources/articles-whitepapers/opencl-optimization-case-study-simple-reductions/
         function reduce_kernel(state, f, op, v0::T, A, ::Val{LMEM}, result, $(args...)) where {T, LMEM}
-            ui0 = Cuint(0); ui1 = Cuint(1); ui2 = Cuint(2)
+            ui0 = UInt32(0); ui1 = UInt32(1); ui2 = UInt32(2)
             tmp_local = @LocalMemory(state, T, LMEM)
             global_index = linear_index(state)
             acc = v0
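
The truncated `reduce_kernel` follows the two-stage pattern from the AMD article linked in the code: each work item first accumulates a strided slice of `A`, then a shared-memory tree halves the number of active items per step. A serial sketch of that pattern, assuming a power-of-two item count:

    function reduce_sketch(op, v0, A, nitems)
        partial = fill(v0, nitems)
        for t in 1:nitems                    # stage 1: strided accumulation
            for i in t:nitems:length(A)
                partial[t] = op(partial[t], A[i])
            end
        end
        s = nitems >> 1
        while s > 0                          # stage 2: tree reduction
            for t in 1:s
                partial[t] = op(partial[t], partial[t + s])
            end
            s >>= 1
        end
        partial[1]
    end

    # reduce_sketch(+, 0, collect(1:100), 8) == 5050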
