clean up device handling

SimonDanisch · SimonDanisch · commit c16aefaa8284 · 2017-08-30T18:55:32.000+02:00
diff --git a/src/backends/backends.jl b/src/backends/backends.jl
@@ -1,4 +1,5 @@
 global current_context, make_current
+
 function default_backend()
     if is_backend_supported(:cudanative)
         CUBackend
@@ -9,22 +10,7 @@ function default_backend()
     end
 end
 
-let compute_contexts = Context[]
-    function current_context()
-        if isempty(compute_contexts)
-            default_backend().init()
-        end
-        last(compute_contexts)
-    end
-    all_contexts() = copy(compute_contexts)
-    function make_current(ctx)
-        idx = findfirst(compute_contexts, ctx)
-        if idx != 0
-            splice!(compute_contexts, idx) # remove
-        end
-        push!(compute_contexts, ctx)
-    end
-end
+
 #interface
 function create_buffer(ctx, array) end
 """
@@ -34,38 +20,61 @@ function synchronize(A::AbstractArray)
     # fallback is a noop, for backends not needing synchronization. This
     # makes it easier to write generic code that also works for AbstractArrays
 end
+
 """
 `A` must be a gpu Array and will help to dispatch to the correct GPU backend
 and can supply queues and contexts.
 Calls `f` on args on the GPU, falls back to a normal call if there is no backend.
 """
-function gpu_call(A::AbstractArray, f, args, worksize, localsize = nothing)
+function gpu_call(f, A::AbstractArray, args::Tuple, worksize = length(A), localsize = nothing)
     f(args...)
 end
 
-function free(x::AbstractArray)
+free(x::AbstractArray) = nothing
 
-end
 #=
 Functions to select contexts
 =#
 
-is_gpu(ctx) = false
-is_cpu(ctx) = false
-is_opencl(ctx) = false
-is_cudanative(ctx) = false
-is_julia(ctx) = false
-is_opengl(ctx) = false
-has_atleast(ctx, attribute, value) = error("has_atleast not implemented yet")
+threads(device) = 0
+blocks(device) = 0
+global_memory(device) = 0
+free_global_memory(device) = NaN
+local_memory(device) = 0
+name(device) = "Undefined"
+
+function device_summary(io::IO, device)
+    println(io, "Device: ", name(device))
+    for (n, f) in (:threads => threads, :blocks => blocks)
+        @printf(io, "%19s: %s\n", string(n), string(f(device)))
+    end
+    for (n, f) in (:global_memory => global_memory, :free_global_memory => free_global_memory, :local_memory => local_memory)
+        @printf(io, "%19s: %f mb\n", string(n), f(device) / 10^6)
+    end
+    return
+end
+
+################################
+# Device selection functions for e.g. devices(filterfuncs)
+is_gpu(device) = false
+is_cpu(device) = false
+has_atleast(device, attribute, value) = attribute(device) >= value # e.g. has_atleast(dev, threads, 512)
+
+"""
+Creates a new context from `device` without caching the resulting context.
+"""
+function new_context(device)
+    error("Device $device not supported")
+end
 
 # BLAS support
 hasblas(x) = false
 include("blas.jl")
 include("supported_backends.jl")
 include("shared.jl")
 
-function to_backend_module(backend::Symbol)
-    if backend in supported_backends()
+function backend_module(sym::Symbol)
+    if sym in supported_backends()
         if sym == :julia
             JLBackend
         elseif sym == :cudanative
@@ -82,17 +91,69 @@ end
 function init(sym::Symbol, args...; kw_args...)
     backend_module(sym).init(args...; kw_args...)
 end
+
 function init(filterfuncs::Function...; kw_args...)
-    init_from_device(first(devices(filterfuncs...)))
+    devices = available_devices(filterfuncs...)
+    if isempty(devices)
+        error("No device found for: $(join(string.(filterfuncs), " "))")
+    end
+    current_backend().init(first(devices))
+end
+
+active_backends() = backend_module.(supported_backends())
+
+const global_current_backend = Ref{Module}(default_backend())
+
+current_backend() = global_current_backend[]
+current_device() = current_backend().current_device()
+current_context() = current_backend().current_context()
+
+"""
+Sets the current backend to be used globally. Accepts the symbols:
+:cudanative, :opencl, :julia.
+"""
+function setbackend!(backend::Symbol)
+    setbackend!(backend_module(backend))
 end
-backend_modules() = to_backend_module.(supported_backends())
 
+function setbackend!(backend::Module)
+    global_current_backend[] = backend
+    return
+end
 
+"""
+Creates a temporary context for `device` and executes `f(context)` while this context is active.
+The context gets destroyed afterwards. Note that creating a temporary context is expensive.
+"""
+function on_device(f, device = current_device())
+    ctx = new_context(device)
+    # always destroy the temporary context, even when `f` throws, so it cannot leak
+    try f(ctx) finally destroy!(ctx) end
+    return
+end
+
+"""
+Returns all devices for the current backend.
+Can be filtered by passing `filter_funcs`, e.g. `is_gpu`, `is_cpu`, `(dev)-> has_atleast(dev, threads, 512)`
+"""
+function available_devices(filter_funcs...)
+    result = []
+    for device in current_backend().devices()
+        if all(f-> f(device), filter_funcs)
+            push!(result, device)
+        end
+    end
+    result
+end
 
 
-function devices(filter_funcs...)
+"""
+Returns all devices from `backends = active_backends()`.
+Can be filtered by passing `filter_funcs`, e.g. `is_gpu`, `is_cpu`, `dev-> has_atleast(dev, threads, 512)`
+"""
+function all_devices(filter_funcs...; backends = active_backends())
     result = []
-    for Module in backend_modules()
+    for Module in backends
         for device in Module.devices()
             if all(f-> f(device), filter_funcs)
                 push!(result, device)
@@ -113,11 +174,11 @@ function perbackend(f)
 end
 
 """
-Iterates through all available devices and calls `f` after initializing the current one!
+Iterates through all available devices and calls `f(context)` after initializing the standard context for that device.
 """
 function forall_devices(f, filterfuncs...)
-    for device in devices(filterfunc)
-        make_current(device)
-        f(device)
+    for device in all_devices(filterfuncs...) # splat the varargs parameter (was the undefined `filterfunc`)
+        ctx = init(device)
+        f(ctx)
     end
 end
diff --git a/src/backends/cudanative/cudanative.jl b/src/backends/cudanative/cudanative.jl
@@ -6,9 +6,10 @@ import CUDAdrv, CUDArt #, CUFFT
 
 import GPUArrays: buffer, create_buffer, acc_mapreduce
 import GPUArrays: Context, GPUArray, context, linear_index, gpu_call
-import GPUArrays: blas_module, blasbuffer, is_blas_supported, hasblas
+import GPUArrays: blas_module, blasbuffer, is_blas_supported, hasblas, init
 import GPUArrays: default_buffer_type, broadcast_index, is_fft_supported, unsafe_reinterpret
-
+import GPUArrays: is_gpu, name, threads, blocks, global_memory, local_memory, new_context
+using GPUArrays: device_summary
 
 using CUDAdrv: CuDefaultStream
 
@@ -23,36 +24,83 @@ immutable CUContext <: Context
     device::CUDAdrv.CuDevice
 end
 
-Base.show(io::IO, ctx::CUContext) = print(io, "CUContext")
+function Base.show(io::IO, ctx::CUContext)
+    println(io, "CUDAnative context with:")
+    device_summary(io, ctx.device)
+end
+
 
-function any_context()
-    dev = CUDAdrv.CuDevice(0)
-    ctx = CUDAdrv.CuContext(dev)
-    CUContext(ctx, dev)
+devices() = CUDAdrv.devices()
+is_gpu(dev::CUDAdrv.CuDevice) = true
+name(dev::CUDAdrv.CuDevice) = CUDAdrv.name(dev)
+threads(dev::CUDAdrv.CuDevice) = CUDAdrv.attribute(dev, CUDAdrv.MAX_THREADS_PER_BLOCK)
+
+function blocks(dev::CUDAdrv.CuDevice)
+    (
+        CUDAdrv.attribute(dev, CUDAdrv.MAX_BLOCK_DIM_X),
+        CUDAdrv.attribute(dev, CUDAdrv.MAX_BLOCK_DIM_Y),
+        CUDAdrv.attribute(dev, CUDAdrv.MAX_BLOCK_DIM_Z),
+    )
 end
 
+global_memory(dev::CUDAdrv.CuDevice) = CUDAdrv.totalmem(dev)
+local_memory(dev::CUDAdrv.CuDevice) = CUDAdrv.attribute(dev, CUDAdrv.TOTAL_CONSTANT_MEMORY)
+
+
 #const GLArrayImg{T, N} = GPUArray{T, N, gl.Texture{T, N}, GLContext}
 const CUArray{T, N, B} = GPUArray{T, N, B, CUContext} #, GLArrayImg{T, N}}
 const CUArrayBuff{T, N} = CUArray{T, N, CUDAdrv.CuArray{T, N}}
 
 
-global init, all_contexts, current_context
-let contexts = CUContext[]
-    all_contexts() = copy(contexts)::Vector{CUContext}
-    current_context() = last(contexts)::CUContext
-    function init(;ctx = nothing)
-        ctx = if ctx == nothing
-            if isempty(contexts)
-                any_context()
-            else
-                current_context()
-            end
+global init, all_contexts, current_context, current_device
+
+let contexts = Dict{CUDAdrv.CuDevice, CUContext}(), active_device = CUDAdrv.CuDevice[]
+
+    all_contexts() = values(contexts)
+    function current_device()
+        if isempty(active_device)
+            push!(active_device, CUDAnative.default_device[])
         end
-        GPUArrays.make_current(ctx)
-        push!(contexts, ctx)
+        active_device[]
+    end
+    current_context() = contexts[current_device()]
+    function init(dev::CUDAdrv.CuDevice = current_device())
+        if isempty(active_device)
+            push!(active_device, dev)
+        else
+            active_device[] = dev
+        end
+        ctx = get!(()-> new_context(dev), contexts, dev)
+        CUDAdrv.activate(ctx.ctx)
         ctx
     end
+
+    function destroy!(context::CUContext)
+        # don't destroy primary device context
+        dev = context.device
+        if haskey(contexts, dev) && contexts[dev] == context
+            error("Trying to destroy primary device context which is prohibited. Please use reset!(context)")
+        end
+        CUDAdrv.destroy!(context.ctx)
+        return
+    end
 end
+
+function reset!(context::CUContext)
+    dev = context.device
+    CUDAdrv.destroy!(context.ctx)
+    context.ctx = CUDAdrv.CuContext(dev)
+    return
+end
+
+function new_context(dev::CUDAdrv.CuDevice = current_device())
+    cuctx = CUDAdrv.CuContext(dev)
+    ctx = CUContext(cuctx, dev)
+    CUDAdrv.activate(cuctx)
+    return ctx
+end
+
+
 # synchronize
 function GPUArrays.synchronize{T, N}(x::CUArray{T, N})
     CUDAdrv.synchronize(context(x).ctx) # TODO figure out the diverse ways of synchronization
diff --git a/src/backends/julia/julia.jl b/src/backends/julia/julia.jl
@@ -7,6 +7,7 @@ import GPUArrays: buffer, create_buffer, Context, context, mapidx, unpack_buffer
 import GPUArrays: AbstractAccArray, AbstractSampler, acc_mapreduce, gpu_call
 import GPUArrays: hasblas, blas_module, blasbuffer, default_buffer_type
 import GPUArrays: unsafe_reinterpret, broadcast_index, linear_index
+import GPUArrays: is_cpu, name, threads, blocks, global_memory, free_global_memory
 
 import Base.Threads: @threads
 
@@ -26,6 +27,16 @@ let contexts = JLContext[]
     end
 end
 
+immutable JLDevice end
+
+threads(x::JLDevice) = Base.Threads.nthreads()
+global_memory(x::JLDevice) = Sys.total_memory()
+free_global_memory(x::JLDevice) = Sys.free_memory()
+name(x::JLDevice) = Sys.cpu_info()[1].model # TODO,one could have multiple CPUs ?
+is_cpu(::JLDevice) = true
+
+devices() = (JLDevice(),)
+
 
 immutable Sampler{T, N, Buffer} <: AbstractSampler{T, N}
     buffer::Buffer
diff --git a/src/backends/opencl/opencl.jl b/src/backends/opencl/opencl.jl
@@ -11,7 +11,7 @@ import GPUArrays: Context, GPUArray, context, linear_index, free
 import GPUArrays: blasbuffer, blas_module, is_blas_supported, is_fft_supported
 import GPUArrays: synchronize, hasblas, LocalMemory, AccMatrix, AccVector, gpu_call
 import GPUArrays: default_buffer_type, broadcast_index, unsafe_reinterpret
-import GPUArrays: is_opencl, is_gpu, is_cpu
+import GPUArrays: is_gpu, is_cpu, name, threads, blocks, global_memory, local_memory
 
 using Transpiler
 import Transpiler: cli, cli.get_global_id
@@ -55,12 +55,19 @@ function Base.show(io::IO, ctx::CLContext)
 end
 
 
-function devices()
-    cl.devices()
-end
-is_opencl(ctx::CLContext) = true
-is_gpu(ctx::CLContext) = cl.info(ctx.device, :device_type) == :gpu
-is_cpu(ctx::CLContext) = cl.info(ctx.device, :device_type) == :cpu
+devices() = cl.devices()
+
+is_gpu(dev::cl.Device) = cl.info(dev, :device_type) == :gpu
+is_cpu(dev::cl.Device) = cl.info(dev, :device_type) == :cpu
+
+name(dev::cl.Device) = cl.info(dev, :name)
+
+threads(dev::cl.Device) = cl.info(dev, :max_work_group_size) |> Int
+blocks(dev::cl.Device) = cl.info(dev, :max_work_item_size)
+
+global_memory(dev::cl.Device) = cl.info(dev, :global_mem_size) |> Int
+local_memory(dev::cl.Device) = cl.info(dev, :local_mem_size) |> Int
+
 
 
 global init, all_contexts, current_context
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -64,3 +64,6 @@ log_gpu_mem()
     include("fft.jl")
 end
 log_gpu_mem()
+
+
+using GPUArrays