
Commit 11e6179

Merge pull request #230 from JuliaGPU/tb/1d
Remove non-1D indexing
2 parents ffdad12 + 8e039d5

File tree

8 files changed: +44 -101 lines changed


src/array.jl (19 additions & 25 deletions)

@@ -154,25 +154,24 @@ struct JLBackend <: AbstractGPUBackend end
 
 GPUArrays.backend(::Type{<:JLArray}) = JLBackend()
 
-mutable struct JLState{N}
-    blockdim::NTuple{N, Int}
-    griddim::NTuple{N, Int}
+mutable struct JLState
+    blockdim::Int
+    griddim::Int
 
-    blockidx::NTuple{N, Int}
-    threadidx::NTuple{N, Int}
+    blockidx::Int
+    threadidx::Int
     localmem_counter::Int
     localmems::Vector{Vector{Array}}
 end
 
-function JLState(threads::NTuple{N}, blockdim::NTuple{N}) where N
-    idx = ntuple(i-> 1, Val(N))
+function JLState(threads::Int, blockdim::Int)
     blockcount = prod(blockdim)
     lmems = [Vector{Array}() for i in 1:blockcount]
-    JLState{N}(threads, blockdim, idx, idx, 0, lmems)
+    JLState(threads, blockdim, 1, 1, 0, lmems)
 end
 
-function JLState(state::JLState{N}, threadidx::NTuple{N}) where N
-    JLState{N}(
+function JLState(state::JLState, threadidx::Int)
+    JLState(
         state.blockdim,
         state.griddim,
         state.blockidx,
@@ -187,17 +186,15 @@ to_device(state, x::Tuple) = to_device.(Ref(state), x)
 to_device(state, x::Base.RefValue{<: JLArray}) = Base.RefValue(to_device(state, x[]))
 to_device(state, x) = x
 
-function GPUArrays._gpu_call(::JLBackend, f, A, args::Tuple, blocks_threads::Tuple{T, T}) where T <: NTuple{N, Integer} where N
+function GPUArrays._gpu_call(::JLBackend, f, A, args::Tuple, blocks_threads::Tuple{Int, Int})
     blocks, threads = blocks_threads
-    idx = ntuple(i-> 1, length(blocks))
-    blockdim = blocks
-    state = JLState(threads, blockdim)
+    state = JLState(threads, blocks)
     device_args = to_device.(Ref(state), args)
-    tasks = Array{Task}(undef, threads...)
-    for blockidx in CartesianIndices(blockdim)
-        state.blockidx = blockidx.I
-        for threadidx in CartesianIndices(threads)
-            thread_state = JLState(state, threadidx.I)
+    tasks = Array{Task}(undef, threads)
+    for blockidx in 1:blocks
+        state.blockidx = blockidx
+        for threadidx in 1:threads
+            thread_state = JLState(state, threadidx)
             tasks[threadidx] = @async @allowscalar f(thread_state, device_args...)
             # TODO: require 1.3 and use Base.Threads.@spawn for actual multithreading
             # (this would require a different synchronization mechanism)
@@ -246,7 +243,7 @@ end
 
 function GPUArrays.LocalMemory(state::JLState, ::Type{T}, ::Val{dims}, ::Val{id}) where {T, dims, id}
     state.localmem_counter += 1
-    lmems = state.localmems[blockidx_x(state)]
+    lmems = state.localmems[blockidx(state)]
 
     # first invocation in block
    data = if length(lmems) < state.localmem_counter
@@ -272,11 +269,8 @@ Base.size(x::JLDeviceArray) = x.dims
 @inline Base.getindex(A::JLDeviceArray, index::Integer) = getindex(A.data, index)
 @inline Base.setindex!(A::JLDeviceArray, x, index::Integer) = setindex!(A.data, x, index)
 
-for (i, sym) in enumerate((:x, :y, :z))
-    for f in (:blockidx, :blockdim, :threadidx, :griddim)
-        fname = Symbol(string(f, '_', sym))
-        @eval GPUArrays.$fname(state::JLState) = Int(state.$f[$i])
-    end
+for f in (:blockidx, :blockdim, :threadidx, :griddim)
+    @eval GPUArrays.$f(state::JLState) = state.$f
 end

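For reference, a minimal standalone sketch (plain Julia with assumed names, not the package's API) of the 1D launch loop that `_gpu_call` performs after this change: every (block, thread) pair gets its own state, and a kernel recovers its global position as `(blockidx - 1) * blockdim + threadidx`.

# Minimal sketch of the 1D launch loop; `cpu_launch` is illustrative only.
function cpu_launch(kernel, blocks::Int, threads::Int, args...)
    for block in 1:blocks, thread in 1:threads
        # per-invocation "state": just the four 1D quantities a kernel may query
        state = (blockidx = block, threadidx = thread,
                 blockdim = threads, griddim = blocks)
        kernel(state, args...)
    end
end

out = zeros(Int, 8)
cpu_launch(2, 4, out) do state, out
    i = (state.blockidx - 1) * state.blockdim + state.threadidx
    out[i] = i   # out becomes [1, 2, 3, 4, 5, 6, 7, 8]
end
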
src/device/indexing.jl (5 additions & 9 deletions)

@@ -4,12 +4,9 @@ export global_size, synchronize_threads, linear_index
 
 
 # thread indexing functions
-for sym in (:x, :y, :z)
-    for f in (:blockidx, :blockdim, :threadidx, :griddim)
-        fname = Symbol(string(f, '_', sym))
-        @eval $fname(state)::Int = error("Not implemented") # COV_EXCL_LINE
-        @eval export $fname
-    end
+for f in (:blockidx, :blockdim, :threadidx, :griddim)
+    @eval $f(state)::Int = error("Not implemented") # COV_EXCL_LINE
+    @eval export $f
 end
 
 """
@@ -18,8 +15,7 @@ end
 Global size == blockdim * griddim == total number of kernel execution
 """
 @inline function global_size(state)
-    # TODO nd version
-    griddim_x(state) * blockdim_x(state)
+    griddim(state) * blockdim(state)
 end
 
 """
@@ -29,7 +25,7 @@ linear index corresponding to each kernel launch (in OpenCL equal to get_global_
 
 """
 @inline function linear_index(state)
-    (blockidx_x(state) - 1) * blockdim_x(state) + threadidx_x(state)
+    (blockidx(state) - 1) * blockdim(state) + threadidx(state)
 end
 
 """

src/host/execution.jl (6 additions & 8 deletions)

@@ -27,14 +27,12 @@ function gpu_call(kernel, A::AbstractArray, args::Tuple, configuration = length(
     thread_blocks = if isa(configuration, Integer)
         thread_blocks_heuristic(configuration)
     elseif isa(configuration, ITuple)
-        # if a single integer ntuple, we assume it to configure the blocks
-        configuration, ntuple(x-> x == 1 ? 256 : 1, length(configuration))
+        @assert length(configuration) == 1
+        configuration[1], 1
     elseif isa(configuration, Tuple{ITuple, ITuple})
-        # 2 dim tuple of ints == blocks + threads per block
-        if any(x-> length(x) > 3 || length(x) < 1, configuration)
-            error("blocks & threads must be 1-3 dimensional. Found: $configuration")
-        end
-        map(x-> Int.(x), configuration) # make sure it all has the same int type
+        @assert length(configuration[1]) == 1
+        @assert length(configuration[2]) == 1
+        configuration[1][1], configuration[2][1]
     else
         error("""Please launch a gpu kernel with a valid configuration.
                 Found: $configurations
@@ -65,5 +63,5 @@ function thread_blocks_heuristic(len::Integer)
     # TODO better threads default
     threads = clamp(len, 1, 256)
     blocks = max(ceil(Int, len / threads), 1)
-    (blocks,), (threads,)
+    (blocks, threads)
 end
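A hedged usage sketch of the configurations `gpu_call` accepts after this change; `AT` stands in for whatever GPU array type is in use, as in the test suite, and everything beyond what the diff above shows is an assumption.

# Usage sketch only.
x = AT(zeros(Int, 1000))

# Integer / default configuration: thread_blocks_heuristic(1000) clamps the
# thread count to 256 and rounds blocks up, giving (blocks, threads) == (4, 256),
# so the kernel must guard against the 24 excess invocations.
gpu_call(x, (x,)) do state, x
    i = linear_index(state)
    i <= length(x) && (x[i] = i)
    return
end

# Explicit, now strictly 1D configuration: ((blocks,), (threads,)).
gpu_call(x, (x,), ((4,), (250,))) do state, x
    x[linear_index(state)] = blockidx(state)
    return
end
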

src/host/linalg.jl (4 additions & 49 deletions)

@@ -114,56 +114,11 @@ end
 
 ## high-level functionality
 
-function transpose_blocks!(
-        state, odata::AbstractArray{T}, idata, ::Val{SHMEM}, ::Val{TDIM}, ::Val{BLOCK_ROWS}, ::Val{NROW}
-    ) where {T, SHMEM, TDIM, BLOCK_ROWS, NROW}
-
-    tile = @LocalMemory(state, T, SHMEM)
-    bidx_x = blockidx_x(state) - 1
-    bidx_y = blockidx_y(state) - 1
-    tidx_x = threadidx_x(state) - 1
-    tidx_y = threadidx_y(state) - 1
-
-    x = bidx_x * TDIM + tidx_x + 1
-    y = bidx_y * TDIM + tidx_y + 1
-    dims = size(idata)
-
-    (x <= dims[2] && (y + (BLOCK_ROWS * 3)) <= dims[1]) || return
-
-    for j = 0:3
-        j0 = j * BLOCK_ROWS
-        @inbounds tile[tidx_x + 1, tidx_y + j0 + 1] = idata[y + j0, x]
-    end
-
-    synchronize_threads(state)
-    for j = 0:3
-        j0 = j * BLOCK_ROWS
-        @inbounds odata[x, y + j0] = tile[tidx_x + 1, tidx_y + j0 + 1]
-    end
-
-    return
-end
-
 function LinearAlgebra.transpose!(At::AbstractGPUArray{T, 2}, A::AbstractGPUArray{T, 2}) where T
-    if size(A, 1) == size(A, 2) && all(x-> x % 32 == 0, size(A))
-        outsize = size(At)
-        TDIM = 32; BLOCK_ROWS = 8
-        nrows = TDIM ÷ BLOCK_ROWS
-        shmemdim = (TDIM, (TDIM + 1))
-        static_params = map(x-> Val(x), (shmemdim, TDIM, BLOCK_ROWS, nrows))
-        args = (At, A, static_params...)
-
-        griddim = ceil.(Int, size(A) ./ (TDIM, TDIM))
-        blockdim = (TDIM, BLOCK_ROWS)
-        # optimized version for 32x & square dimensions
-        gpu_call(transpose_blocks!, At, args, (griddim, blockdim))
-    else
-        # simple fallback
-        gpu_call(At, (At, A)) do state, At, A
-            idx = @cartesianidx A state
-            @inbounds At[idx[2], idx[1]] = A[idx[1], idx[2]]
-            return
-        end
+    gpu_call(At, (At, A)) do state, At, A
+        idx = @cartesianidx A state
+        @inbounds At[idx[2], idx[1]] = A[idx[1], idx[2]]
+        return
     end
     At
 end
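A brief usage sketch of the remaining generic path; `JLArray` (the CPU reference array from src/array.jl above) is assumed here purely for illustration.

using LinearAlgebra

# Any shape works now; the removed tiled kernel only covered 32-aligned square matrices.
A  = JLArray(rand(Float32, 3, 5))
At = JLArray(zeros(Float32, 5, 3))
transpose!(At, A)
Array(At) == permutedims(Array(A))   # expected to be true
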

src/host/mapreduce.jl (3 additions & 3 deletions)

@@ -142,11 +142,11 @@ for i = 0:10
         global_index += global_size(state)
     end
     # Perform parallel reduction
-    local_index = threadidx_x(state) - 1
+    local_index = threadidx(state) - 1
     @inbounds tmp_local[local_index + 1] = acc
     synchronize_threads(state)
 
-    offset = blockdim_x(state) ÷ 2
+    offset = blockdim(state) ÷ 2
     @inbounds while offset > 0
         if (local_index < offset)
             other = tmp_local[local_index + offset + 1]
@@ -157,7 +157,7 @@ for i = 0:10
         offset = offset ÷ 2
     end
     if local_index == 0
-        @inbounds result[blockidx_x(state)] = tmp_local[1]
+        @inbounds result[blockidx(state)] = tmp_local[1]
     end
     return
 end
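To see what the offset-halving loop above computes, here is a plain-Julia sketch (no GPU state, sequential where the device is parallel) of the same block-level tree reduction: after log2(n) passes, slot 1 holds the block's sum.

# Sketch of the per-block reduction; on the device the inner loop runs as
# parallel threads with synchronize_threads between offset steps.
function block_reduce!(tmp_local::Vector{T}) where T
    offset = length(tmp_local) ÷ 2
    while offset > 0
        for local_index in 0:offset-1
            other = tmp_local[local_index + offset + 1]
            mine  = tmp_local[local_index + 1]
            tmp_local[local_index + 1] = mine + other
        end
        offset ÷= 2
    end
    tmp_local[1]
end

block_reduce!(Float32.(1:8))   # == 36.0f0
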

src/host/random.jl (1 addition & 1 deletion)

@@ -29,7 +29,7 @@ function next_rand(::Type{FT}, state::NTuple{4, T}) where {FT, T <: Unsigned}
 end
 
 function gpu_rand(::Type{T}, state, randstate::AbstractVector{NTuple{4, UInt32}}) where T
-    threadid = GPUArrays.threadidx_x(state)
+    threadid = GPUArrays.threadidx(state)
     stateful_rand = next_rand(T, randstate[threadid])
     randstate[threadid] = stateful_rand[1]
     return stateful_rand[2]
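The per-thread indexing pattern is unchanged apart from the rename; a minimal plain-Julia sketch of it follows (the stub RNG below is a stand-in, not the package's generator).

# Stand-in for next_rand: advances a 4-word state and draws a Float32 in [0, 1).
next_rand_stub(::Type{Float32}, st::NTuple{4, UInt32}) =
    (st .+ UInt32(1), Float32(st[1]) / typemax(UInt32))

randstate = [ntuple(_ -> rand(UInt32), 4) for _ in 1:4]   # one slot per thread
threadid = 2                                              # threadidx(state) on the device
newstate, value = next_rand_stub(Float32, randstate[threadid])
randstate[threadid] = newstate   # each thread only ever touches its own slot
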

test/testsuite/base.jl (2 additions & 2 deletions)

@@ -146,8 +146,8 @@ function test_base(AT)
 
     @testset "heuristics" begin
         blocks, threads = thread_blocks_heuristic(0)
-        @test blocks == (1,)
-        @test threads == (1,)
+        @test blocks == 1
+        @test threads == 1
     end
 end
 end

test/testsuite/gpuinterface.jl (4 additions & 4 deletions)

@@ -16,25 +16,25 @@ function test_gpuinterface(AT)
     @test all(x-> x == 2, Array(x))
     configuration = ((N ÷ 2,), (2,))
     gpu_call(x, (x,), configuration) do state, x
-        x[linear_index(state)] = threadidx_x(state)
+        x[linear_index(state)] = threadidx(state)
         return
     end
     @test Array(x) == [1,2,1,2,1,2,1,2,1,2]
 
     gpu_call(x, (x,), configuration) do state, x
-        x[linear_index(state)] = blockidx_x(state)
+        x[linear_index(state)] = blockidx(state)
         return
     end
     @test Array(x) == [1, 1, 2, 2, 3, 3, 4, 4, 5, 5]
     x2 = AT([0])
     gpu_call(x, (x2,), configuration) do state, x
-        x[1] = blockdim_x(state)
+        x[1] = blockdim(state)
         return
     end
     @test Array(x2) == [2]
 
     gpu_call(x, (x2,), configuration) do state, x
-        x[1] = griddim_x(state)
+        x[1] = griddim(state)
         return
     end
     @test Array(x2) == [5]
