JuliaGPU
diff --git a/‎src/abstractarray.jl
Lines changed: 0 additions & 5 deletions b/‎src/abstractarray.jl
Lines changed: 0 additions & 5 deletions
diff --git a/‎src/backends/backends.jl
Lines changed: 40 additions & 7 deletions b/‎src/backends/backends.jl
Lines changed: 40 additions & 7 deletions
diff --git a/‎src/backends/cudanative/cudanative.jl
Lines changed: 14 additions & 8 deletions b/‎src/backends/cudanative/cudanative.jl
Lines changed: 14 additions & 8 deletions
diff --git a/‎src/backends/julia/julia.jl
Lines changed: 28 additions & 14 deletions b/‎src/backends/julia/julia.jl
Lines changed: 28 additions & 14 deletions
diff --git a/‎src/backends/opencl/opencl.jl
Lines changed: 53 additions & 46 deletions b/‎src/backends/opencl/opencl.jl
Lines changed: 53 additions & 46 deletions
@@ -20,7 +20,6 @@ end
 #=
 Interface for accessing the lower level
 =#
-
 buffer(A::AbstractAccArray) = A.buffer
 context(A::AbstractAccArray) = A.context
 default_buffer_type(typ, context) = error("Found unsupported context: $context")
@@ -66,7 +65,6 @@ function Base.similar{N, ET}(x::AbstractAccArray, ::Type{ET}, sz::NTuple{N, Int}
 end
 
 
-using Compat.TypeUtils
 function Base.similar{T <: GPUArray, ET, N}(
         ::Type{T}, ::Type{ET}, sz::NTuple{N, Int};
         context::Context = current_context(), kw_args...
@@ -77,9 +75,6 @@ end
 
 
 
-
-
-
 #=
 Host to Device data transfers
 =#
 
@@ -56,22 +56,42 @@ end
 
 ################################
 # Device selection functions for e.g. devices(filterfuncs)
+# Context overloads: a context answers device queries by forwarding them to
+# the device stored in its `device` field.
+is_gpu(ctx::Context) = is_gpu(ctx.device)
+is_cpu(ctx::Context) = is_cpu(ctx.device)
+has_atleast(ctx::Context, attribute, value) = has_atleast(ctx.device, attribute, value)
+
+# Generic fallbacks for arbitrary device objects.
 is_gpu(device) = false
 is_cpu(device) = false
-has_atleast(device, attribute, value) = attribute(ctx_or_device) >= value
+# `attribute` is called on the device and its result is compared against `value`.
+# (The previous body referenced the undefined name `ctx_or_device`.)
+has_atleast(device, attribute, value) = attribute(device) >= value
 
+
+#################################
+# Context filter functions
+# Works for context objects as well but is overloaded in the backends
+# Fallbacks: any argument without a more specific method is reported as
+# "not this backend".
+is_opencl(other) = false
+is_cudanative(other) = false
+is_julia(other) = false
+is_opengl(other) = false
+
+# Symbol methods: true only when the symbol is the backend's own name.
+is_opencl(sym::Symbol) = sym === :opencl
+is_cudanative(sym::Symbol) = sym === :cudanative
+is_julia(sym::Symbol) = sym === :julia
+is_opengl(sym::Symbol) = sym === :opengl
+
+
 """
 Creates a new context from `device` without caching the resulting context.
 """
 function new_context(device)
     error("Device $device not supported")
 end
 
-# BLAS support
-hasblas(x) = false
-include("blas.jl")
-include("supported_backends.jl")
-include("shared.jl")
+"""
+Resets a context freeing all resources and creating a new context.
+"""
+function reset!(context)
+    error("Context $context not supported")
+end
 
 function backend_module(sym::Symbol)
     if sym in supported_backends()
@@ -89,17 +109,27 @@ function backend_module(sym::Symbol)
     end
 end
 function init(sym::Symbol, args...; kw_args...)
-    backend_module(sym).init(args...; kw_args...)
+    mod = backend_module(sym)
+    setbackend!(mod)
+    init(args...; kw_args...)
 end
 
 function init(filterfuncs::Function...; kw_args...)
     devices = available_devices(filterfuncs...)
     if isempty(devices)
         error("No device found for: $(join(string.(filterfuncs), " "))")
     end
-    current_backend().init(first(devices))
+    init(first(devices))
 end
 
+# BLAS support
+hasblas(x) = false
+include("blas.jl")
+include("supported_backends.jl")
+include("shared.jl")
+
+
+
 active_backends() = backend_module.(supported_backends())
 
 const global_current_backend = Ref{Module}(default_backend())
@@ -182,3 +212,6 @@ function forall_devices(f, filterfuncs...)
         f(ctx)
     end
 end
+
+
+export is_cudanative, is_julia, is_opencl
@@ -4,7 +4,7 @@ using ..GPUArrays, CUDAnative, StaticArrays
 
 import CUDAdrv, CUDArt #, CUFFT
 
-import GPUArrays: buffer, create_buffer, acc_mapreduce
+import GPUArrays: buffer, create_buffer, acc_mapreduce, is_cudanative
 import GPUArrays: Context, GPUArray, context, linear_index, gpu_call
 import GPUArrays: blas_module, blasbuffer, is_blas_supported, hasblas, init
 import GPUArrays: default_buffer_type, broadcast_index, is_fft_supported, unsafe_reinterpret
@@ -23,7 +23,7 @@ immutable CUContext <: Context
     ctx::CUDAdrv.CuContext
     device::CUDAdrv.CuDevice
 end
-
+is_cudanative(ctx::CUContext) = true
 function Base.show(io::IO, ctx::CUContext)
     println(io, "CUDAnative context with:")
     device_summary(io, ctx.device)
@@ -52,7 +52,7 @@ const CUArray{T, N, B} = GPUArray{T, N, B, CUContext} #, GLArrayImg{T, N}}
 const CUArrayBuff{T, N} = CUArray{T, N, CUDAdrv.CuArray{T, N}}
 
 
-global init, all_contexts, current_context, current_device
+global all_contexts, current_context, current_device
 
 let contexts = Dict{CUDAdrv.CuDevice, CUContext}(), active_device = CUDAdrv.CuDevice[]
 
@@ -63,8 +63,14 @@ let contexts = Dict{CUDAdrv.CuDevice, CUContext}(), active_device = CUDAdrv.CuDe
         end
         active_device[]
     end
-    current_context() = contexts[current_device()]
-    function init(dev::CUDAdrv.CuDevice = current_device())
+    function current_context()
+        dev = current_device()
+        get!(contexts, dev) do
+            new_context(dev)
+        end
+    end
+
+    function GPUArrays.init(dev::CUDAdrv.CuDevice)
         if isempty(active_device)
             push!(active_device, dev)
         else
@@ -93,7 +99,7 @@ function reset!(context::CUContext)
     return
 end
 
-function new_context(dev::CUDAdrv.CuDevice = current_device())
+function new_context(dev::CUDAdrv.CuDevice)
     cuctx = CUDAdrv.CuContext(dev)
     ctx = CUContext(cuctx, dev)
     CUDAdrv.activate(cuctx)
@@ -241,14 +247,14 @@ function (f::CUFunction{F}){F <: CUDAdrv.CuFunction, T, N}(A::CUArray{T, N}, arg
     )
 end
 
-function gpu_call{T, N}(f::Function, A::CUArray{T, N}, args, globalsize = length(A), localsize = nothing)
+function gpu_call{T, N}(f::Function, A::CUArray{T, N}, args::Tuple, globalsize = length(A), localsize = nothing)
     blocks, thread = thread_blocks_heuristic(globalsize)
     args = map(unpack_cu_array, args)
     #cu_kernel, rewritten = CUDAnative.rewrite_for_cudanative(kernel, map(typeof, args))
     #println(CUDAnative.@code_typed kernel(args...))
     @cuda (blocks, thread) f(0f0, args...)
 end
-function gpu_call{T, N}(f::Tuple{String, Symbol}, A::CUArray{T, N}, args, globalsize = size(A), localsize = nothing)
+function gpu_call{T, N}(f::Tuple{String, Symbol}, A::CUArray{T, N}, args::Tuple, globalsize = size(A), localsize = nothing)
     func = CUFunction(A, f, args...)
     # TODO cache
     func(A, args) # TODO pass through local/global size
 
@@ -8,34 +8,48 @@ import GPUArrays: AbstractAccArray, AbstractSampler, acc_mapreduce, gpu_call
 import GPUArrays: hasblas, blas_module, blasbuffer, default_buffer_type
 import GPUArrays: unsafe_reinterpret, broadcast_index, linear_index
 import GPUArrays: is_cpu, name, threads, blocks, global_memory
+import GPUArrays: new_context, init, free_global_memory
 
 import Base.Threads: @threads
 
 immutable JLContext <: Context
     nthreads::Int
 end
+# Device handle for the threaded Julia (CPU) backend; `index` identifies the device.
+# Deliberately NOT a subtype of `Context`: the generic `is_gpu(ctx::Context)` and
+# `has_atleast(ctx::Context, ...)` methods forward to `ctx.device`, a field this
+# device handle does not have, so subtyping `Context` made those queries error.
+# TODO: one could have multiple CPUs?
+immutable JLDevice
+    index::Int
+end
+
 
-global current_context, make_current, init
-let contexts = JLContext[]
-    all_contexts() = copy(contexts)::Vector{JLContext}
-    current_context() = last(contexts)::JLContext
-    function init()
-        ctx = JLContext(Base.Threads.nthreads())
-        GPUArrays.make_current(ctx)
-        push!(contexts, ctx)
+global all_contexts, current_context, current_device
+# `contexts` caches one JLContext per device; `active_device` holds at most one
+# element, the currently selected device.
+let contexts = Dict{JLDevice, JLContext}(), active_device = JLDevice[]
+    all_contexts() = values(contexts)
+    # Returns the active device, selecting `JLDevice(0)` if none was chosen yet.
+    function current_device()
+        if isempty(active_device)
+            push!(active_device, JLDevice(0))
+        end
+        active_device[]
+    end
+    # Returns the context of the active device, creating and caching it on first
+    # use so that calling this before `init` does not raise a KeyError.
+    function current_context()
+        dev = current_device()
+        get!(() -> new_context(dev), contexts, dev)
+    end
+    # Makes `dev` the active device and returns its (cached or new) context.
+    function GPUArrays.init(dev::JLDevice)
+        if isempty(active_device)
+            push!(active_device, dev)
+        else
+            active_device[] = dev
+        end
+        ctx = get!(() -> new_context(dev), contexts, dev)
         ctx
     end
 end
 
-immutable JLDevice end
-
+new_context(dev::JLDevice) = JLContext(Threads.nthreads())
 threads(x::JLDevice) = Base.Threads.nthreads()
 global_memory(x::JLDevice) = Sys.total_memory()
 free_global_memory(x::JLDevice) = Sys.free_memory()
-name(x::JLDevice) = Sys.cpu_info()[1].model # TODO,one could have multiple CPUs ?
+name(x::JLDevice) = Sys.cpu_info()[1].model
 is_cpu(::JLDevice) = true
 
-devices() = (JLDevice(),)
+devices() = (JLDevice(0),)
 
 
 immutable Sampler{T, N, Buffer} <: AbstractSampler{T, N}
@@ -109,8 +123,8 @@ Base.@propagate_inbounds Base.setindex!{T, N}(A::JLArray{T, N}, val, i::Integer)
 Base.IndexStyle{T, N}(::Type{JLArray{T, N}}) = IndexLinear()
 
 function Base.show(io::IO, ctx::JLContext)
-    cpu = Sys.cpu_info()
-    print(io, "JLContext $(cpu[1].model) with $(ctx.nthreads) threads")
+    # Write the header to `io` rather than standard output, so the whole
+    # description ends up in the stream the caller passed in.
+    println(io, "Threaded Julia Context with:")
+    GPUArrays.device_summary(io, JLDevice(0))
 end
 ##############################################
 # Implement BLAS interface
 
@@ -6,52 +6,34 @@ using OpenCL: cl
 
 using ..GPUArrays, StaticArrays
 
-import GPUArrays: buffer, create_buffer, acc_mapreduce, mapidx
-import GPUArrays: Context, GPUArray, context, linear_index, free
+import GPUArrays: buffer, create_buffer, acc_mapreduce, mapidx, is_opencl
+import GPUArrays: Context, GPUArray, context, linear_index, free, init
 import GPUArrays: blasbuffer, blas_module, is_blas_supported, is_fft_supported
 import GPUArrays: synchronize, hasblas, LocalMemory, AccMatrix, AccVector, gpu_call
-import GPUArrays: default_buffer_type, broadcast_index, unsafe_reinterpret
+import GPUArrays: default_buffer_type, broadcast_index, unsafe_reinterpret, reset!
 import GPUArrays: is_gpu, is_cpu, name, threads, blocks, global_memory, local_memory
+using GPUArrays: device_summary
 
 using Transpiler
 import Transpiler: cli, cli.get_global_id
 
 
-
-immutable CLContext <: Context
+type CLContext <: Context # mutable: reset!(::CLContext) reassigns the `context` and `queue` fields
     device::cl.Device
     context::cl.Context
     queue::cl.CmdQueue
-    function CLContext(device_type = nothing)
-        device = if device_type == nothing
-            devlist = cl.devices(:gpu)
-            dev = if isempty(devlist)
-                devlist = cl.devices(:cpu)
-                if isempty(devlist)
-                    error("no device found to be supporting opencl")
-                else
-                    first(devlist)
-                end
-            else
-                first(devlist)
-            end
-            dev
-        else
-            # if device type supplied by user, assume it's actually existant!
-            devlist = cl.devices(device_type)
-            if isempty(devlist)
-                error("Can't find OpenCL device for $device_type")
-            end
-            first(devlist)
-        end
+    function CLContext(device::cl.Device) # builds a new cl.Context and cl.CmdQueue for the given device
         ctx = cl.Context(device)
         queue = cl.CmdQueue(ctx)
         new(device, ctx, queue)
     end
 end
+
+is_opencl(ctx::CLContext) = true
+
 function Base.show(io::IO, ctx::CLContext)
-    name = replace(ctx.device[:name], r"\s+", " ")
-    print(io, "CLContext: $name")
+    println(io, "OpenCL context with:")
+    device_summary(io, ctx.device)
 end
 
 
@@ -69,27 +51,52 @@ global_memory(dev::cl.Device) = cl.info(dev, :global_mem_size) |> Int
 local_memory(dev::cl.Device) = cl.info(dev, :local_mem_size) |> Int
 
 
-
-global init, all_contexts, current_context
-let contexts = CLContext[]
-    all_contexts() = copy(contexts)::Vector{CLContext}
-    current_context() = last(contexts)::CLContext
-    function init(;device_type = nothing, ctx = nothing)
-        context = if ctx == nothing
-            if isempty(contexts)
-                CLContext(device_type)
-            else
-                current_context()
-            end
+global all_contexts, current_context, current_device
+# `contexts` caches one CLContext per OpenCL device; `active_device` holds at
+# most one element, the currently selected device.
+let contexts = Dict{cl.Device, CLContext}(), active_device = cl.Device[]
+    all_contexts() = values(contexts)
+    # Returns the active device. If none was selected yet, picks the first GPU
+    # and falls back to the first CPU; errors when OpenCL reports neither.
+    function current_device()
+        if isempty(active_device)
+            devlist = cl.devices(:gpu)
+            if isempty(devlist)
+                devlist = cl.devices(:cpu)
+            end
+            if isempty(devlist)
+                error("no device found to be supporting opencl")
+            end
+            push!(active_device, first(devlist))
+        end
+        active_device[]
+    end
+    # Returns the context of the active device, creating and caching it on first use.
+    function current_context()
+        dev = current_device()
+        get!(contexts, dev) do
+            new_context(dev)
+        end
+    end
+    # Makes `dev` the active device and returns its (cached or new) context.
+    function GPUArrays.init(dev::cl.Device)
+        if isempty(active_device)
+            push!(active_device, dev)
         else
-            ctx
+            active_device[] = dev
+        end
+        ctx = get!(()-> new_context(dev), contexts, dev)
+        ctx
+    end
+
+    # NOTE(review): `destroy!` is not named in the `global` declaration above, so
+    # it is local to this `let` block - confirm whether it should be visible at
+    # module scope.
+    function destroy!(context::CLContext)
+        # don't destroy primary device context
+        dev = context.device
+        if haskey(contexts, dev) && contexts[dev] == context
+            error("Trying to destroy primary device context which is prohibited. Please use reset!(context)")
         end
-        GPUArrays.make_current(context)
-        push!(contexts, context)
-        context
+        # The cl.Context is stored in the `context` field (CLContext has no `ctx` field).
+        finalize(context.context)
+        return
     end
 end
 
+function reset!(context::CLContext) # replaces the OpenCL context and queue in place, keeping the same device
+    device = context.device
+    finalize(context.context) # run the old cl.Context's finalizers before it is replaced
+    context.context = cl.Context(device)
+    context.queue = cl.CmdQueue(context.context) # new command queue created from the new context
+    return
+end
+
+new_context(dev::cl.Device) = CLContext(dev)
+
 const CLArray{T, N} = GPUArray{T, N, B, CLContext} where B <: cl.Buffer
 
 include("compilation.jl")
@@ -233,7 +240,7 @@ function thread_blocks_heuristic(len::Integer)
 end
 
 
-function gpu_call{T, N}(f, A::CLArray{T, N}, args, globalsize = length(A), localsize = nothing)
+function gpu_call{T, N}(f, A::CLArray{T, N}, args::Tuple, globalsize = length(A), localsize = nothing)
     ctx = GPUArrays.context(A)
     _args = if !isa(f, Tuple{String, Symbol})
         (0f0, args...)# include "state"